Example #1
    import re  # needed for the report-name pattern below

    def parse_webpage(url_base, start_idx, end_idx):
        # download the web page (IOHelper and write_information are
        # helpers defined elsewhere in the surrounding module)
        try:
            vd = IOHelper.VisualizeDownload(url_base)
            page_info = vd.go()
        except Exception:
            write_information("failed to get web page!")
            return []

        # decode to utf-8
        page_info = page_info.decode('utf-8')
        # print(page_info)

        # find all report names
        p_name = re.compile(r'>(error_report_(\d*)\.zip)<')
        id_list = p_name.findall(page_info)  # currently unsorted
        # print(id_list)
        id_list = sorted(id_list, key=lambda x: int(x[1]))
        if not id_list:  # guard against an empty listing
            write_information('no report files found on server')
            return []
        write_information(
            'found <%d> files on server, ranging from %s to %s' %
            (len(id_list), id_list[0][1], id_list[-1][1]))

        # create file list
        new_id_list = []
        for report in id_list:
            idx = int(report[1])
            if start_idx >= 0 and idx < start_idx:
                continue
            if end_idx >= 0 and idx > end_idx:
                continue
            new_id_list.append(idx)
        return new_id_list
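
To see what the name pattern extracts and how the range filter behaves, here is a small self-contained sketch; the listing fragment is an illustrative assumption, not real server output:

    import re

    # illustrative listing fragment (assumed, not captured from a server)
    sample = '>error_report_3.zip< >error_report_12.zip< >error_report_7.zip<'

    p_name = re.compile(r'>(error_report_(\d*)\.zip)<')
    id_list = sorted(p_name.findall(sample), key=lambda x: int(x[1]))
    print(id_list)
    # -> [('error_report_3.zip', '3'), ('error_report_7.zip', '7'),
    #     ('error_report_12.zip', '12')]

    # keep only IDs within [start_idx, end_idx], as parse_webpage does;
    # a negative bound leaves that end of the range open
    start_idx, end_idx = 5, 20
    ids = [int(r[1]) for r in id_list
           if (start_idx < 0 or int(r[1]) >= start_idx)
           and (end_idx < 0 or int(r[1]) <= end_idx)]
    print(ids)  # -> [7, 12]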
Example #2
    import datetime
    import re

    def parse_webpage2(url_base, start_idx, end_idx):
        # download the page (IOHelper and write_information are helpers
        # defined elsewhere in the surrounding module)
        try:
            vd = IOHelper.VisualizeDownload(url_base)
            page_info = vd.go()
        except Exception:
            write_information("failed to get web page!")
            return [], {}

        # save the raw page bytes to a local file
        with open('page', 'wb') as f:
            f.write(page_info)

        # decode to utf-8
        page_info = page_info.decode('utf-8')
        # print(page_info)

        # extract the error-report file list; each entry in the
        # directory listing looks like:
        # <br>
        # 2017/5/18 17:26 888805
        # <a href="http://222.73.55.231/BugTrap/reports/swcSelf8.9.3.4687/error_report_6.zip">
        # error_report_6.zip
        # </a>
        pat = re.compile(
            r'<br>'  # opening tag
            r'([0-9/ :]*?)'  # date, time and size (tuple index 0)
            r'<a href=".*?">'  # URL
            r'(error_report_(\d*)\.zip)'  # file name (index 1), report ID (index 2)
            r'</a>',  # closing tag
            re.IGNORECASE)

        file_info_dict = {}  # report_id -> (time, size)
        res = pat.findall(page_info)
        for item in res:
            if len(item) < 3:  # expect all three capture groups
                continue

            # (file info, file name, report ID)
            file_info, file_name, report_id = item

            # 2017/5/18     17:26   888805
            #   date        time    filesize
            f_date_str, f_time_str, f_size_str = file_info.split()

            f_date_time_str = f_date_str + " " + f_time_str  # join date and time
            f_date_time = datetime.datetime.strptime(
                f_date_time_str, "%Y/%m/%d %H:%M")  # parse the file timestamp
            # print(f_date_time)
            # print(type(item), file_info, file_name, report_id)

            report_id = int(report_id)  # report number
            f_size = int(f_size_str)  # file size in bytes
            file_info_dict[report_id] = (f_date_time, f_size)

        id_list = sorted(file_info_dict.keys())
        if not id_list:  # guard against an empty listing
            write_information('no report files found on server')
            return [], {}
        write_information(
            'found <%d> files on server, ranging from %s to %s' %
            (len(id_list), id_list[0], id_list[-1]))

        # create file list
        new_id_list = []
        for idx in id_list:  # IDs are already ints
            if start_idx >= 0 and idx < start_idx:
                continue
            if end_idx >= 0 and idx > end_idx:
                continue
            new_id_list.append(idx)
        return new_id_list, file_info_dict
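
The extraction logic above can be checked without the network or the helper modules; this sketch reuses the sample entry from the comments in parse_webpage2:

    import datetime
    import re

    # sample entry copied from the directory-listing comment above
    sample = ('<br>   2017/5/18  17:26   888805 '
              '<a href="http://222.73.55.231/BugTrap/reports/'
              'swcSelf8.9.3.4687/error_report_6.zip">'
              'error_report_6.zip</a>')

    pat = re.compile(
        r'<br>([0-9/ :]*?)<a href=".*?">(error_report_(\d*)\.zip)</a>',
        re.IGNORECASE)
    file_info, file_name, report_id = pat.findall(sample)[0]
    f_date_str, f_time_str, f_size_str = file_info.split()
    f_date_time = datetime.datetime.strptime(
        f_date_str + " " + f_time_str, "%Y/%m/%d %H:%M")
    print(file_name, int(report_id), f_date_time, int(f_size_str))
    # -> error_report_6.zip 6 2017-05-18 17:26:00 888805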