def parse_webpage(url_base, start_idx, end_idx):
    """Download the report index page and return a sorted list of report IDs.

    Parses the page at *url_base* for links of the form
    ``error_report_<N>.zip`` and returns the IDs ``N`` (as ints, ascending)
    that fall within ``[start_idx, end_idx]``.  A negative ``start_idx`` or
    ``end_idx`` disables that bound.  Returns ``[]`` when the page cannot be
    downloaded or contains no report files.
    """
    # Download the web page (raw bytes).
    try:
        vd = IOHelper.VisualizeDownload(url_base)
        page_info = vd.go()
    except Exception:
        write_information("failed to get web page!")
        return []
    # Decode to utf-8 text for regex matching.
    page_info = page_info.decode('utf-8')
    # Each match captures (file_name, id_string), e.g.
    # ('error_report_6.zip', '6').
    p_name = re.compile(r'>(error_report_([\d]*).zip)<')
    id_list = p_name.findall(page_info)  # currently unsorted
    if not id_list:
        # Guard: the summary line below indexes id_list[0]/id_list[-1] and
        # would raise IndexError on an empty page.
        write_information('totally <0> files found on server')
        return []
    # Sort numerically by the captured report ID.
    id_list = sorted(id_list, key=lambda x: int(x[1]))
    write_information(
        'totally <%d> files found on server, ranging from %s to %s'
        % (len(id_list), id_list[0][1], id_list[-1][1]))
    # Keep only the IDs inside the requested [start_idx, end_idx] window.
    new_id_list = []
    for report in id_list:
        idx = int(report[1])
        if start_idx >= 0 and idx < start_idx:
            continue
        if end_idx >= 0 and idx > end_idx:
            continue
        new_id_list.append(idx)
    return new_id_list
def parse_webpage2(url_base, start_idx, end_idx):
    """Download the report index page and return IDs plus per-file metadata.

    Like ``parse_webpage`` but also extracts each report's timestamp and
    file size from the directory-listing markup, e.g.::

        <br> 2017/5/18 17:26  888805
        <a href=".../error_report_6.zip">error_report_6.zip</a>

    Returns a tuple ``(id_list, file_info_dict)`` where *id_list* is the
    sorted list of report IDs inside ``[start_idx, end_idx]`` (negative
    bounds are ignored) and *file_info_dict* maps every found report ID to
    ``(datetime, size_in_bytes)``.  Returns ``([], {})`` on download failure
    or when no report files are found.  Also saves the raw page bytes to a
    local file named ``page``.
    """
    # Download the page (raw bytes).
    try:
        vd = IOHelper.VisualizeDownload(url_base)
        page_info = vd.go()
    except Exception:
        write_information("failed to get web page!")
        return [], {}
    # Save the raw page for offline inspection/debugging.
    with open('page', 'wb') as f:
        f.write(page_info)
    # Decode to utf-8 text for regex matching.
    page_info = page_info.decode('utf-8')
    # Extract the error-report file list.  Capture groups:
    #   1: "date time size" text, 2: file name, 3: report ID.
    pat = re.compile(
        r'<br>'                             # leading tag
        r'([0-9/ :]*?)'                     # date, time, size (group 1)
        r'<a href=".*?">'                   # link URL
        r'(error_report_([\d]*).zip)'       # file name (2), report ID (3)
        r'</a>', re.IGNORECASE)             # closing tag
    file_info_dict = {}  # report_id -> (datetime, size)
    for item in pat.findall(page_info):
        if len(item) < 3:  # expect all three capture groups
            continue
        file_info, file_name, report_id = item
        # "2017/5/18 17:26 888805" -> date, time, size.  Skip entries whose
        # info text does not split into exactly three fields instead of
        # letting one malformed row abort the whole parse.
        parts = file_info.split()
        if len(parts) != 3:
            continue
        f_date_str, f_time_str, f_size_str = parts
        f_date_time = datetime.datetime.strptime(
            f_date_str + " " + f_time_str, "%Y/%m/%d %H:%M")
        file_info_dict[int(report_id)] = (f_date_time, int(f_size_str))
    id_list = sorted(file_info_dict.keys())
    if not id_list:
        # Guard: the summary line below indexes id_list[0]/id_list[-1] and
        # would raise IndexError when nothing matched.
        write_information('totally <0> files found on server')
        return [], {}
    write_information(
        'totally <%d> files found on server, ranging from %s to %s'
        % (len(id_list), id_list[0], id_list[-1]))
    # Keep only the IDs inside the requested [start_idx, end_idx] window.
    new_id_list = []
    for report_id in id_list:
        idx = int(report_id)
        if start_idx >= 0 and idx < start_idx:
            continue
        if end_idx >= 0 and idx > end_idx:
            continue
        new_id_list.append(idx)
    return new_id_list, file_info_dict