コード例 #1
0
ファイル: worm.py プロジェクト: kzx1025/SparkMonitor
 def __init__(self, url, stage_url, has_proxy, has_gc, property):
     """Record the caller's settings and create empty scrape state.

     ``url``, ``stage_url``, ``has_proxy`` and ``has_gc`` are stored
     unchanged on the instance.  ``property`` is stored as well and is
     handed to each of the two ``SparkData`` holders created here.
     """
     # Settings, kept exactly as supplied by the caller.
     self.url, self.stage_url = url, stage_url
     self.has_proxy, self.has_gc = has_proxy, has_gc
     self.property = property

     # Two independent SparkData holders built from the same ``property``.
     self.running_spark = SparkData(property)
     self.finish_spark = SparkData(property)

     # Stage collections all start out empty.
     self.running_stages, self.finished_stages, self.failed_stages = [], [], []
コード例 #2
0
ファイル: worm.py プロジェクト: kzx1025/SparkMonitor
class Worm(object):
    """Scrape application and stage information from web UI HTML pages.

    The page headings matched here ("Running Applications",
    "Completed Applications", "Active Stages", "Completed Stages") suggest
    a Spark master / application UI; the layout (column positions, CSS
    classes) is hard-coded and therefore tied to the UI version the author
    targeted.

    NOTE: ``running_stages`` / ``finished_stages`` / ``failed_stages`` are
    only ever extended, never reset, so calling the ``get_*_spark`` methods
    repeatedly on one instance accumulates stages from every call.
    """

    logger = MyLog.get_logger("Worm.class")

    # CSS class string of the stage listing tables on a stages page.
    _STAGE_TABLE_CLASS = "table table-bordered table-striped table-condensed sortable"
    # CSS class string of the tables on a single stage's detail page.
    _TASK_TABLE_CLASS = "table table-bordered table-condensed sortable table-striped"

    def __init__(self, url, stage_url, has_proxy, has_gc, property):
        """Store the settings and create empty result holders.

        :param url: page listing running/completed applications.
        :param stage_url: stages page of the running application.
        :param has_proxy: fetch pages through the SOCKS5 proxy only when
            this is exactly ``True``.
        :param has_gc: collect per-stage GC time only when this is exactly
            ``True``.
        :param property: stored and passed to both ``SparkData`` holders
            (the name shadows the builtin but is part of the interface).
        """
        self.property = property
        self.running_spark = SparkData(property)
        self.finish_spark = SparkData(property)
        self.url = url
        self.stage_url = stage_url
        self.running_stages = []
        self.finished_stages = []
        self.failed_stages = []
        self.has_proxy = has_proxy
        self.has_gc = has_gc

    @staticmethod
    def get_html(url, has_proxy, time_out):
        """Fetch ``url`` and return the raw response body.

        When ``has_proxy`` is exactly ``True`` the default SOCKS5 proxy is
        set to 127.0.0.1:1314 and ``socket.socket`` is replaced with the
        SOCKS socket class.  That replacement is process-wide and is not
        undone here, so later requests are proxied as well.

        :param time_out: timeout in seconds passed to ``urllib2.urlopen``.
        """
        if has_proxy is True:
            socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS5, "127.0.0.1", 1314)
            socket.socket = socks.socksocket
        page = urllib2.urlopen(url, timeout=time_out)
        try:
            return page.read()
        finally:
            # Always release the connection, even if read() fails.
            page.close()

    @staticmethod
    def _heading_text(h4):
        """Return the heading's text, or "" when it has no single string.

        ``Tag.string`` is ``None`` for a heading with nested markup; such a
        heading simply matches nothing instead of raising.
        """
        return h4.string or ""

    @staticmethod
    def _fill_app_summary(spark, row):
        """Copy id, name, total time and status from an application row."""
        tds = row.find_all("td")
        spark.set_app_id(tds[0].find_all("a")[0].string.strip())
        spark.set_app_name(tds[1].find_all("a")[0].string.strip())
        spark.set_total_time(tds[7].string)
        spark.set_status(tds[6].string)

    @staticmethod
    def _parse_stage_page(stage_html):
        """Parse a stages page into ``(headings, tables)``.

        ``headings`` are the texts of every ``h4`` inside the first
        ``container-fluid`` div; ``tables`` are the stage listing tables.
        Callers pair them by position (heading ``i`` -> table ``i``).
        """
        stage_soup = BeautifulSoup(stage_html, "html.parser")
        tables = stage_soup.find_all("table", Worm._STAGE_TABLE_CLASS)
        Worm.logger.debug(len(tables))
        divs = stage_soup.find_all("div", "container-fluid")
        headings = [Worm._heading_text(h4) for h4 in divs[0].find_all("h4")]
        return headings, tables

    def get_running_spark(self):
        """Collect the single running application and its stages.

        Only one running application is supported.

        :returns: ``self.running_spark`` filled in, or ``None`` when the
            "Running Applications" section contains no rows.
        :raises ValueError: when that section contains more than one row.
        """
        html = self.get_html(self.url, self.has_proxy, 9)
        soup = BeautifulSoup(html, "html.parser")

        for div in soup.find_all("div", "row-fluid"):
            if div.h4 is None:
                continue
            if " Running Applications " not in Worm._heading_text(div.h4):
                continue
            tr_tags = div.find_all("tr")
            if len(tr_tags) == 0:
                return None
            if len(tr_tags) != 1:
                raise ValueError("there is maybe more than  one running spark Task")
            Worm._fill_app_summary(self.running_spark, tr_tags[0])

        # Stage details of the running application.
        stage_html = self.get_html(self.stage_url, self.has_proxy, 9)
        headings, tables = Worm._parse_stage_page(stage_html)
        for i, heading in enumerate(headings):
            if "Active Stages" in heading:
                running_trs = tables[i].find_all("tr")
                self.running_stages.extend(Worm.get_stages(running_trs, self.stage_url, self.has_gc))
            elif "Completed Stages" in heading:
                finished_trs = tables[i].find_all("tr")
                self.finished_stages.extend(Worm.get_stages(finished_trs, self.stage_url, self.has_gc))

        self.running_spark.set_running_stages(self.running_stages)
        self.running_spark.set_finished_stages(self.finished_stages)
        self.running_spark.set_failed_stages(self.failed_stages)

        return self.running_spark

    def get_finish_spark(self):
        """Collect the first listed completed application and its stages.

        The stages are read from ``<url>history/<app id>/stages/``.

        :returns: ``self.finish_spark`` filled in, or ``None`` when the
            "Completed Applications" section contains no rows.
        """
        html = self.get_html(self.url, self.has_proxy, 9)
        soup = BeautifulSoup(html, "html.parser")
        for div in soup.find_all("div", "row-fluid"):
            if div.h4 is None:
                continue
            if "Completed Applications" not in Worm._heading_text(div.h4):
                continue
            tr_tags = div.find_all("tr")
            if len(tr_tags) == 0:
                print("there is no finish spark Task")
                return None
            # Only the first row is used.
            Worm._fill_app_summary(self.finish_spark, tr_tags[0])

        history_url = self.url + "history/" + self.finish_spark.get_app_id() + "/stages/"
        stage_html = self.get_html(history_url, self.has_proxy, 9)
        headings, tables = Worm._parse_stage_page(stage_html)
        for i, heading in enumerate(headings):
            if "Completed Stages" in heading:
                finished_trs = tables[i].find_all("tr")
                self.finished_stages.extend(Worm.get_stages(finished_trs, history_url, self.has_gc))

        self.finish_spark.set_running_stages(self.running_stages)
        self.finish_spark.set_finished_stages(self.finished_stages)
        self.finish_spark.set_failed_stages(self.failed_stages)
        return self.finish_spark

    @staticmethod
    def _cell_or_default(td, default="0MB"):
        """Return the cell's string, or ``default`` when the cell has none."""
        return default if td.string is None else td.string

    @staticmethod
    def _sum_gc_time(stage_url, stage_id):
        """Sum the GC-time column (cell 10) over the stage's task rows.

        Reads attempt 0 of the stage's detail page and uses the second
        table with the task-table CSS class.
        """
        detail_url = stage_url + "stage/?id=" + str(stage_id) + "&attempt=0"
        # NOTE(review): the proxy flag is hard-coded to True here rather than
        # following the caller's has_proxy setting -- confirm this is intended.
        gc_html = Worm.get_html(detail_url, True, 6)
        print(detail_url)
        gc_soup = BeautifulSoup(gc_html, "html.parser")
        task_tables = gc_soup.find_all("table", Worm._TASK_TABLE_CLASS)
        gc_total = 0.0
        for task_row in task_tables[1].find_all("tr"):
            gc_str = task_row.find_all("td")[10].string.strip()
            if gc_str != "":
                gc_total += Util.format_second(gc_str)
        return gc_total

    @staticmethod
    def get_stages(trs, stage_url, has_gc):
        """Build a ``Stage`` object for every table row in ``trs``.

        :param trs: ``tr`` elements of a stage listing table.
        :param stage_url: base URL used to reach each stage's detail page.
        :param has_gc: when exactly ``True``, fetch each stage's detail page
            and sum its GC time; any failure while doing so is printed and
            the GC time falls back to ``0.0`` (best effort).
        :returns: list of ``Stage`` objects in row order.
        """
        final_stages = []
        for tr in trs:
            tds = tr.find_all("td")
            temp_stage = Stage()
            temp_stage.set_stage_id(int(tds[0].string.strip()))
            temp_stage.set_submit_time(tds[2].string.strip())
            temp_stage.set_duration(tds[3].string.strip())
            temp_stage.set_tasks_percent(tds[4].find("span").string.strip())
            temp_stage.set_input(Worm._cell_or_default(tds[5]))
            # Read the same cell that is tested for emptiness.  The previous
            # code tested cells 7/8 but read cells 6/7, so it required at
            # least nine columns yet took the values one column too early.
            temp_stage.set_shuffle_read(Worm._cell_or_default(tds[7]))
            temp_stage.set_shuffle_write(Worm._cell_or_default(tds[8]))

            gc_total = 0.0
            if has_gc is True:
                try:
                    gc_total = Worm._sum_gc_time(stage_url, temp_stage.get_stage_id())
                except Exception as e:
                    # Best effort: report the problem and keep going.
                    print(e)
                    gc_total = 0.0
            temp_stage.set_gc_time(gc_total)

            final_stages.append(temp_stage)

        return final_stages