def compare_url():
    """Compare ids embedded in txt link lines against ids stored in xls data.

    For each category directory in FILE_DIR_LIST, the txt list files
    (folder[0]) are paired positionally with the xls detail files
    (folder[1]).  For every (txt line, xls row) pair the id at the end of
    the line's URL (after the last '=') must equal the integer in xls
    column 1.  Mismatches and unparsable xls cells are counted per file.

    :return: None — results are printed only.
    """
    for directory in FILE_DIR_LIST:
        txt_files = [f for f in os.listdir(os.getcwd() + directory + folder[0])
                     if f[-3:] == 'txt']
        xls_files = [f for f in os.listdir(os.getcwd() + directory + folder[1])
                     if f[-3:] == 'xls']
        for l, d in zip(txt_files, xls_files):
            print(l + ' vs ' + d)
            error = []
            list_content = get_file_content(directory + folder[0] + l)
            detail_content = get_file_pd(directory + folder[1] + d)
            for l_line, d_line in zip(list_content,
                                      range(detail_content.shape[0])):
                l_url = str(l_line).split(',')[-1].strip()
                l_id = int(l_url.split('=')[-1])
                try:
                    d_id = int(detail_content.iat[d_line, 1])
                except ValueError:
                    # BUG FIX: the original appended `d_id` here, which is
                    # stale from the previous row (or undefined on the first
                    # one).  Record the offending xls row index instead and
                    # skip the comparison for this row.
                    error.append(d_line)
                    continue
                if l_id != d_id:
                    error.append(l_line.split('.')[0])
                    print('row ' + str(l_line.split('.')[0]) + ' : '
                          + str(l_id) + ' == ' + str(d_id))
            if len(error):
                print('with ' + str(len(error)) + ' error has found')
            print('--------------------------------')
def txt_check(directory, file_arr):
    """Check txt list files for 'failed' markers and out-of-order indexes.

    File names look like 'xxx-xxx-<startpage>-...'; the third '-' segment
    is the 1-based start page.  With 15 entries per page, each line's
    leading number is converted back to a position within the file and
    compared to its actual position.

    :param directory: category directory (relative path fragment)
    :param file_arr: txt file names to check inside directory + folder[0]
    :return: None — results are printed only.
    """
    for l in file_arr:
        f_path = directory + folder[0] + l
        start = int(re.compile('-').split(l)[2]) - 1
        print(f_path)
        content = get_file_content(f_path)
        error = []
        fail = []
        for index, item in enumerate(content):
            # Idiom fix: 'x in s' instead of s.__contains__(x).
            if 'failed' not in item:
                if item.strip().split('.')[0]:
                    # 15 entries per page: map the absolute item number to
                    # the expected 1-based position within this file.
                    number = int(item.strip().split('.')[0]) - (15 * start)
                    if number != index + 1:
                        error.append(item)
                        print(str(index) + ' == ' + item)
            else:
                fail.append(item)
        if error or fail:
            print('check ' + f_path + ' finish : with ' + str(len(fail))
                  + ' failed and ' + str(len(error))
                  + ' error index has found')
        else:
            print('check ' + f_path + ' finish : with no error')
    print('------ ' + directory + ' txt check finish -------')
def fill_xls(directory, l, d):
    """Fill missing rows of an xls detail file (insert and append).

    NOTE: may fail on overly fancy content formatting (per original author).

    :param directory: category
    :param l: list file, txt
    :param d: detail file, xls
    """
    list_content = get_file_content(directory + folder[0] + l)
    detail_content = get_file_pd(directory + folder[1] + d)
    # File name segment 2 is the 1-based start page; 15 rows per page.
    start = (int(re.compile('-').split(d)[2]) - 1) * 15
    browser = webdriver.Chrome()
    col_len = len(detail_content.columns)
    try:
        while detail_content.shape[0] < len(list_content):
            detail_content = insert_or_append_xls(browser, detail_content,
                                                  list_content, start,
                                                  col_len)
            # Checkpoint after every pass so progress survives a crash.
            detail_content.to_excel(os.getcwd() + directory + folder[1] + d,
                                    index=False)
    finally:
        print('fill_xls() end')
        # BUG FIX: the original called browser.close(), which only closes
        # the window and leaks the driver process; quit() ends the session.
        browser.quit()
        detail_content.to_excel(os.getcwd() + directory + folder[1] + d,
                                index=False)
def output_excel(file, name):
    """Crawl a detail page for every entry of a txt list file, dump to xlsx.

    Each non-blank line is expected to look like 'name,url' where the url
    carries the id after its last '='.  Rows whose detail crawl returns
    nothing are written out as '<line>,None'.

    :param file: path of the txt list file
    :param name: output file name without the '.xlsx' extension
    :return: None — the workbook is written as a side effect.
    """
    data = pd.DataFrame()
    txt_content = get_file_content(file)
    b = webdriver.Chrome()
    try:
        # read file, get url, get No.
        for line in txt_content:
            if line.strip():
                try:
                    drug_url = line.split(',')[-1]
                    # The id is whatever follows the last '=' in the line
                    # (only the URL part contains '=').
                    drug_id = line.split('=')[-1]
                    # crawl detail
                    detail = crawl_detail(b, drug_url)
                    if detail:
                        detail.insert(1, str(drug_id).strip())
                        detail_arr = arrange(detail[1:-8], drug_url)
                        title = get_title(detail[1:-8])
                        data = data.append(pd.DataFrame(columns=title,
                                                        data=[detail_arr]),
                                           ignore_index=True, sort=False)
                    else:
                        data = data.append(
                            pd.DataFrame(data=[line + ',None']),
                            ignore_index=True, sort=False)
                finally:
                    # Checkpoint after every row so a crash loses nothing.
                    data.to_excel(name + '.xlsx')
    finally:
        # BUG FIX: the original never released the Chrome driver, leaking
        # a chromedriver process per call.
        b.quit()
def txt_retry(path):
    """Replace 'page xx crawl failed' markers in a txt list file.

    Re-crawls each failed page and splices the recovered 'name,url' lines
    back into the file in place of the marker row, then rewrites the file.

    :param path: txt file path relative to the working directory
    :return: None — the file is rewritten as a side effect.
    """
    content = get_file_content(path)
    txt_file = pd.DataFrame(content)
    browser = webdriver.Chrome()
    try:
        # Walk backwards so earlier row positions stay valid after
        # drops/insertions at higher indexes.
        for row in reversed(range(txt_file.shape[0])):
            s = str(txt_file.iat[row, 0]).strip()
            if not s:
                # BUG FIX: DataFrame.drop is not in-place; the original
                # discarded the result, so blank rows were never removed.
                txt_file = txt_file.drop(row)
            if 'failed' in s:
                page = s.split(' ')[1]
                print(path + ' , ' + page)
                url_dir = get_ids(browser, util.get_list_url(page))
                if url_dir:
                    count = 0
                    txt_file = txt_file.drop([row], axis=0)
                    for k, v in url_dir.items():
                        new_str = '{name},{urls}'.format(name=k,
                                                         urls=v + '\n')
                        txt_file = pd.DataFrame(
                            pd.np.insert(txt_file.values, row + count,
                                         [new_str]))
                        count += 1
        # BUG FIX: sort_index returns a new frame; the original discarded
        # it, making the sort a no-op.
        txt_file = txt_file.sort_index(axis=0)
    except Exception as e:
        print('txt_retry() Exception : ' + str(e))
    finally:
        pd.np.savetxt(os.getcwd() + path, txt_file.values, fmt='%s',
                      encoding='utf-8', newline='')
        print('txt_retry() ' + path + ' finish.')
def compare_txt_xls(d, directory, l):
    """Compare row numbers of a txt list file against an xls detail file.

    Pairs each txt line with the xls row at the same position; the number
    before the first '.' in the txt line must equal column 0 of the xls
    row.  Missing rows on either side compare as -1 and are reported.

    :param d: detail file name (xls)
    :param directory: category directory
    :param l: list file name (txt)
    :return: None — results are printed only.
    """
    print(l + ' vs ' + d)
    list_content = get_file_content(directory + folder[0] + l)
    detail_content = get_file_pd(directory + folder[1] + d)
    error = []
    for l_line, d_line in zip_longest(list_content,
                                      range(detail_content.shape[0]),
                                      fillvalue=None):
        # BUG FIX: when the xls has more rows than the txt, zip_longest
        # yields l_line=None and the original crashed on l_line.split();
        # mirror the -1 sentinel already used for the d_line side.
        if l_line is None:
            txt_number = -1
        else:
            txt_number = int(l_line.split('.')[0])
        if d_line is None:
            xls_number = -1
        else:
            xls_number = int(detail_content.iat[d_line, 0])
        if txt_number != xls_number:
            print(txt_number, xls_number)
            error.append(txt_number)
    if len(error):
        print(l + ' vs ' + d + ' finish : with ' + str(len(error))
              + ' error found')
    else:
        print(l + ' vs ' + d + ' finish : with no error')
    print('------------------------------------')
def fill_txt(directory, l):
    """Fill gaps in a txt list file: insert missing rows, append missing pages.

    (Original docstring said "fill xls" — this function actually repairs the
    txt list file.)  The file name encodes its page range:
    '<..>-<..>-<startpage>-<endpage>.txt', 15 entries per page.  First pass
    re-crawls and inserts any row whose leading number does not match its
    position; second pass re-crawls and appends any pages missing from the
    tail.  The file is rewritten in full on exit, even after a
    WebDriverException.

    :param directory: category directory (relative path fragment)
    :param l: txt list file name
    :return: None — the file is rewritten as a side effect.
    """
    # 0-based start page and inclusive end page, parsed from the file name.
    start = int((l.split('-')[2]).split('.')[0]) - 1
    end = int((l.split('-')[3]).split('.')[0])
    # print('start:' + start.__str__() + ' , end =' + end.__str__())
    list_content = get_file_content(directory + folder[0] + l)
    # print(list_content.__len__())
    browser = webdriver.Chrome()
    # insert
    try:
        for index, line in enumerate(list_content):
            if line.strip():
                # Expected 1-based position of this entry within the file.
                number = int(line.split('.')[0]) - start * 15
                if index + 1 != number:
                    print(index + 1, number)
                    # Re-crawl the page that should contain the missing entry.
                    page = math.ceil(index / 15) + start
                    urls = get_ids(browser, get_list_url(page))
                    # print(urls)
                    if urls:
                        for k, v in urls.items():
                            # Insert only the entry whose number matches the gap.
                            if int(str(k).split('.')[0]) == (index + start * 15):
                                insert_str = '{name},{urls}'.format(name=k, urls=v + '\n')
                                print(l + ' ' + index.__str__() + ' row insert : ' + insert_str)
                                list_content.insert(index, insert_str)
                    else:
                        # Crawl returned nothing: insert a placeholder row.
                        list_content.insert(index, index.__str__() + '. crawl failed')
        print('--- ' + l + ' insert finish. ---')
        # append
        last_index = list_content.__len__()
        # Page the file currently ends on (rounded to nearest whole page).
        from_page = int(round((last_index / 15) + start, 0))
        # print(from_page, end)
        # NOTE(review): from_page == end is treated as "file complete" — the
        # append loop is reached only when pages are missing; confirm against
        # the original (un-collapsed) indentation.
        if from_page != end and from_page <= end:
            # Truncate the possibly partial last page, then re-crawl from there.
            last_content = (from_page - 1) * 15
            list_content = list_content[0:last_content]
            while from_page <= end:
                try:
                    url = util.get_list_url(from_page)
                    print(l + ' append page ' + from_page.__str__() + ' , ' + url)
                    url_list = get_ids(browser, url)
                    try:
                        for k, v in url_list.items():
                            # Any non-http value means the page content is bad:
                            # abort this page and record it as failed.
                            if not v.startswith('http'):
                                raise Exception('cannot get page content : ' + k + ',' + v)
                            append_str = k + ',' + v
                            list_content.append('\n' + append_str)
                            # print(append_str)
                    except Exception as e:
                        list_content.append('page ' + from_page.__str__() + ' crawl failed')
                        print(l + ' page ' + from_page.__str__() + ' crawl failed , with exception :' + e.__str__())
                except Exception as e:
                    list_content.append(' page ' + from_page.__str__() + ' crawl failed')
                    print(l + ' page ' + from_page.__str__() + ' crawl failed, with exception : ' + e.__str__())
                    # continue still runs the finally below, so the page
                    # counter always advances.
                    continue
                finally:
                    from_page += 1
        print('--- ' + l + ' append finish. ---')
    except WebDriverException as e:
        print(e.__str__())
    finally:
        # Always write the (possibly partially repaired) content back.
        with codecs.open(os.getcwd() + directory + folder[0] + l, 'wb', encoding='utf-8') as f:
            f.write(''.join(list_content))
            f.close()