def read_items(self):
    self.create()
    eb_list = []
    # Skip the header row: data rows start at index 1 in the worksheet.
    for row in range(self.r_sheet.nrows - 1):
        eb = name_manager.execl_bean()
        eb.row_num = row + 1
        eb.sourcename = self.r_sheet.cell(eb.row_num, self.nums[0]).value
        issn = self.r_sheet.cell(eb.row_num, self.nums[1]).value
        eissn = self.r_sheet.cell(eb.row_num, self.nums[2]).value
        # Combine print ISSN and eISSN; fall back to whichever one is present.
        if issn == "":
            eb.eissn = eissn
        elif eissn == "":
            eb.eissn = issn
        else:
            eb.eissn = issn + "-" + eissn
        eb.waibuaid = self.r_sheet.cell(eb.row_num, self.nums[3]).value
        eb.pinjie = self.r_sheet.cell(eb.row_num, self.nums[4]).value
        eb.full_url = self.r_sheet.cell(eb.row_num, self.nums[5]).value
        eb.abs_url = self.r_sheet.cell(eb.row_num, self.nums[6]).value
        eb.full_path = self.r_sheet.cell(eb.row_num, self.nums[7]).value
        # An optional extra column holds the page count.
        if len(self.list) > self.nums[7] + 1:
            page_num = self.r_sheet.cell(eb.row_num, self.nums[7] + 1).value
            if page_num:
                eb.page = int(page_num)
        eb.check()
        eb_list.append(eb)
    return eb_list
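# Illustration (not part of the project): read_items() above assumes an
# xlrd-style worksheet on self.r_sheet (nrows, cell(row, col).value).
# A minimal, self-contained sketch of that access pattern; the file name and
# column layout are placeholder assumptions, not the project's real ones.
import xlrd

def dump_rows(xls_path="journals.xls"):
    wb = xlrd.open_workbook(xls_path)
    sheet = wb.sheet_by_index(0)
    # Row 0 is the header, so data rows start at 1 (mirroring row + 1 above).
    for row in range(1, sheet.nrows):
        sourcename = sheet.cell(row, 0).value
        issn = sheet.cell(row, 1).value
        eissn = sheet.cell(row, 2).value
        print(row, sourcename, issn, eissn)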
def back_file_to_excel(self, back_file_path):
    # Restore records from the plain-text backup file and write them back into the workbook.
    with open(back_file_path, "r") as back_file:
        for line in back_file:
            eb = name_manager.execl_bean()
            eb.paser(line)
            self.excel_write(eb)
    self.wb.save(self.file_path)
def run(self):
    logger.info("URL_THREAD - " + self.name + " - " + self.sourcename + " download_url start...")
    while True:
        string = self.um.get_eb(self.url_set_name)
        if string is None:
            # Queue drained: mark this step as done for the source and stop.
            self.um.set_done(self.sourcename, self.step)
            break
        eb = nm.execl_bean()
        eb.paser(string)
        # Collect every candidate URL for this record; parser_url picks the usable one.
        url_dict = {}
        if eb.sourcename == "PMC":
            if eb.waibuaid != "":
                url_dict[EXCEL_ITEM.WAIBUAID] = "https://www.ncbi.nlm.nih.gov/pmc/articles/" + eb.waibuaid
        if eb.abs_url != "":
            url_dict[EXCEL_ITEM.ABS_URL] = eb.abs_url
        if eb.full_url != "":
            url_dict[EXCEL_ITEM.FULL_URL] = eb.full_url
        if eb.pinjie != "":
            url_dict[EXCEL_ITEM.PINJIE] = eb.pinjie
        jcb = nm.json_conf_bean(eb.sourcename, eb.eissn)
        html_ = htmls.HTML(eb, jcb, self.tm, self.sourcename, test_file=self.create_test_file_path())
        try:
            logger.info("URL_THREAD - " + self.name + " - " + self.sourcename + " get download url from: " + str(url_dict))
            url, full_url = parser_url(url_dict, html_, name=self.name + " - " + self.sourcename)
        except Exception:
            logger.error(self.sourcename + " download url has err! url candidates: " + str(url_dict), exc_info=True)
            if eb.retry < collect.DOWNLOAD_URL_RETRY:
                logger.info("retry time:" + str(eb.retry))
                eb.retry += 1
                # Push back one step so the record is retried.
                self.um.save(eb, self.step - 1)
            else:
                logger.info("retry:" + str(eb.retry) + ". Retried more than 5 times, giving up.")
                eb.err_and_step = str(self.step) + ": requesting download url failed more than five times"
                self.um.save(eb, self.err_step)
            continue
        eb.full_url = full_url
        eb.abs_url = url
        # eb.full_url = eb.pinjie
        # eb.abs_url = eb.pinjie
        self.um.save(eb, self.step)
    logger.info("URL_THREAD - " + self.name + " - " + self.sourcename + " download_url finish.")
def run(self):
    logger.info(self.sourcename + " download_url start...")
    while True:
        string = self.um.get_eb(self.url_set_name)
        if string is None:
            break
        eb = nm.execl_bean()
        eb.paser(string)
        if eb.sourcename == "PMC":
            url = "https://www.ncbi.nlm.nih.gov/pmc/articles/" + eb.waibuaid
        else:
            url = eb.pinjie
        jcb = nm.json_conf_bean(eb.sourcename, eb.eissn)
        file_path = self.creat_filename()
        try:
            html_ = htmls.HTML(eb, jcb, self.tm, self.sourcename)
            if eb.full_url == "":
                # No known full-text URL yet: resolve one from the landing page.
                full_url = html_.run(url)
            else:
                # A candidate full-text URL already exists: verify it, otherwise resolve again.
                if html_.test_full_url(eb.full_url):
                    full_url = eb.full_url
                else:
                    full_url = html_.run(eb.full_url)
            logger.info("downloading pdf: " + full_url)
            htmls.download(full_url, file_path)
            eb.page = htmls.checkpdf(file_path)
            logger.info("download succeeded")
        except NoConfError:
            logger.info(eb.eissn + " has no usable conf.")
            eb.err_and_step = str(self.url_step) + ": no usable conf"
            self.um.save(eb, self.err_step)
            continue
        except Exception:
            logger.error(self.sourcename + " download url " + url + " has err", exc_info=True)
            if eb.retry < collect.DOWNLOAD_URL_RETRY:
                logger.info("retry time:" + str(eb.retry))
                eb.retry += 1
                self.um.save(eb, self.url_step - 1)
            else:
                logger.info("retry:" + str(eb.retry) + ". Retried more than 5 times, giving up.")
                self.um.save(eb, self.err_step)
            continue
        eb.full_url = full_url
        eb.abs_url = url
        dirs = file_path.split("/")
        eb.full_path = dirs[-2] + "/" + dirs[-1]
        self.um.save(eb, self.finsh_step)
def run(self):
    logger.info(self.sourcename + " start...")
    while True:
        string = self.um.get_eb(self.url_set_name)
        if string is None:
            break
        file_path = self.creat_filename()
        eb = nm.execl_bean()
        eb.paser(string)
        # Collect every candidate URL for this record; parser_url picks the usable one.
        url_dict = {}
        if eb.abs_url != "":
            url_dict[EXCEL_ITEM.ABS_URL] = eb.abs_url
        if eb.full_url != "":
            url_dict[EXCEL_ITEM.FULL_URL] = eb.full_url
        if eb.pinjie != "":
            url_dict[EXCEL_ITEM.PINJIE] = eb.pinjie
        jcb = nm.json_conf_bean(eb.sourcename, eb.eissn)
        html_ = htmls.HTML(eb, jcb, self.tm, self.sourcename)
        try:
            logger.info("URL_THREAD - " + self.name + " - " + self.sourcename + " get download url from: " + str(url_dict))
            url, full_url = parser_url(url_dict, html_)
            htmls.download(full_url, file_path)
            eb.page = htmls.checkpdf(file_path)
        except NoConfError:
            logger.info(self.sourcename + "-" + eb.eissn + " has no usable conf.")
            continue
        except Exception:
            logger.error(self.sourcename + " download url " + str(url_dict) + " has err", exc_info=True)
            if eb.retry < collect.DOWNLOAD_URL_RETRY:
                logger.info("retry time:" + str(eb.retry))
                eb.retry += 1
                self.um.save(eb, self.url_step - 1)
            else:
                logger.info("retry:" + str(eb.retry) + ". Retried more than 5 times, giving up.")
                self.um.save(eb, self.err_step)
            continue
        logger.info("URL_THREAD - " + self.name + " - " + self.sourcename + " download succeeded!")
        eb.full_url = full_url
        eb.abs_url = url
        dirs = file_path.split("/")
        eb.full_path = dirs[-2] + "/" + dirs[-1]
        self.um.save(eb, self.finsh_step)
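# htmls.download() is used throughout but not shown here; a minimal sketch,
# assuming a plain requests-based streaming download (the function name and
# chunk size are illustrative, not the project's actual helper).
import requests

def download_pdf(url, file_path, timeout=60):
    # Stream the response to disk so large PDFs are not held fully in memory.
    with requests.get(url, stream=True, timeout=timeout) as r:
        r.raise_for_status()
        with open(file_path, "wb") as f:
            for chunk in r.iter_content(chunk_size=8192):
                if chunk:
                    f.write(chunk)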
def run(self):
    logger.info("PDF_THREAD - " + self.name + " - " + self.sourcename + " download start...")
    while True:
        string = self.um.get_eb(self.url_set_name)
        if string is None:
            # Queue empty: stop once the previous step is marked done,
            # otherwise wait for the URL threads to produce more work.
            if self.um.get_done(self.sourcename, self.step - 1) == self.um.DONE:
                self.um.set_done(self.sourcename, self.step)
                break
            else:
                logger.info("PDF_THREAD - " + self.name + " - " + self.sourcename + " wait for download...")
                time.sleep(30)
                continue
        eb = nm.execl_bean()
        eb.paser(string)
        file_path = self.creat_filename()
        try:
            logger.info("PDF_THREAD - " + self.name + " - " + self.sourcename + " : download pdf. download url:" + eb.full_url + " landing page:" + eb.pinjie)
            htmls.download(eb.full_url, file_path)
            logger.info("PDF_THREAD - " + self.name + " - " + self.sourcename + " : check pdf. pdf path:" + file_path)
            eb.page = htmls.checkpdf(file_path)
        except Exception:
            logger.error(self.sourcename + " download " + eb.full_url + " has err", exc_info=True)
            if eb.retry < collect.DOWNLOAD_RETRY:
                logger.info("retry time:" + str(eb.retry))
                eb.retry += 1
                self.um.save(eb, self.step - 1)
            else:
                logger.info("retry:" + str(eb.retry) + ". Retried more than 5 times, giving up.")
                eb.err_and_step = str(self.step) + ": pdf download failed more than five times"
                self.um.save(eb, self.err_step)
            continue
        logger.info("PDF_THREAD - " + self.name + " - " + self.sourcename + " : pdf downloaded successfully.")
        dirs = file_path.split("/")
        eb.full_path = dirs[-2] + "/" + dirs[-1]
        self.um.save(eb, self.step)
    logger.info("PDF_THREAD - " + self.name + " - " + self.sourcename + " download finish.")
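# htmls.checkpdf() is assumed to validate the downloaded file and return its
# page count; one possible sketch with PyPDF2 (an assumption -- the project's
# real implementation is not shown here).
from PyPDF2 import PdfReader

def check_pdf_pages(file_path):
    # Raises if the file is not a readable PDF; otherwise returns the page count.
    reader = PdfReader(file_path)
    return len(reader.pages)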
def write(self):
    # Write every finished record back into the workbook and keep a plain-text backup.
    with open(self.file_path.replace(".xls", "_back.txt"), "w+") as back_file:
        logger.info("writing to Excel...")
        for sn in self.um.get_sourcenames():
            while True:
                url_name = self.um.fix(sn, self.write_step)
                string = self.um.get_eb(url_name)
                if string is None:
                    break
                back_file.write(string + "\n")
                eb = name_manager.execl_bean()
                eb.paser(string)
                self.excel_write(eb)
    self.wb.save(self.file_path)
    logger.info("Excel write finished.")
def read(self, file_path, sourcename="doaj", issn="d"):
    logger.info("sourcename set to: " + sourcename + ", start reading file...")
    self.um.save_sourcenames(sourcename)
    with open(file_path, "r", encoding="utf-8") as f:
        for line_index, line in enumerate(f):
            line = line.replace("\n", "").strip()
            if line == "":
                continue
            eb = name_manager.execl_bean()
            eb.sourcename = sourcename
            # The plain-text input has no real ISSN, so build a synthetic one from the line index.
            eb.eissn = issn + str(line_index)
            eb.pinjie = line
            eb.row_num = -1
            self.um.save(eb, self.step)
def write(self, file_path):
    # Write one "pinjie##full_url##full_path" record per line, plus a raw backup file.
    with open(file_path, "w+", encoding="utf-8") as write_file, \
            open(file_path.replace(".txt", "_back.txt"), "w+", encoding="utf-8") as back_file:
        logger.info("writing to file...")
        for sn in self.um.get_sourcenames():
            while True:
                url_name = self.um.fix(sn, self.write_step)
                string = self.um.get_eb(url_name)
                if string is None:
                    break
                back_file.write(string + "\n")
                eb = name_manager.execl_bean()
                eb.paser(string)
                write_file.write(eb.pinjie + "##" + eb.full_url + "##" + eb.full_path + "\n")
def run(self):
    logger.info(self.sourcename + " download_url start...")
    while True:
        string = self.um.get_eb(self.url_set_name)
        if string is None:
            break
        eb = nm.execl_bean()
        eb.paser(string)
        url = eb.pinjie
        jcb = nm.json_conf_bean(eb.sourcename, eb.eissn)
        file_path = self.creat_filename()
        try:
            d_url = self.get_d_url(url)
            logger.info(self.sourcename + " get download url from: " + d_url)
            htmls.download(d_url, file_path)
            eb.page = htmls.checkpdf(file_path)
        except NoConfError:
            logger.info(eb.eissn + " has no usable conf.")
            eb.err_and_step = str(self.url_step) + ": no usable conf"
            self.um.save(eb, self.err_step)
            continue
        except Exception:
            logger.error(self.sourcename + " download url " + url + " has err", exc_info=True)
            if eb.retry < collect.DOWNLOAD_URL_RETRY:
                logger.info("retry time:" + str(eb.retry))
                eb.retry += 1
                self.um.save(eb, self.url_step - 1)
            else:
                logger.info("retry:" + str(eb.retry) + ". Retried more than 5 times, giving up.")
                self.um.save(eb, self.err_step)
            continue
        eb.full_url = d_url
        eb.abs_url = url
        dirs = file_path.split("/")
        eb.full_path = dirs[-2] + "/" + dirs[-1]
        self.um.save(eb, self.finsh_step)
def run(self):
    logger.info(self.sourcename + " download_url start...")
    while True:
        string = self.um.get_eb(self.url_set_name)
        if string is None:
            break
        eb = nm.execl_bean()
        eb.paser(string)
        url = eb.pinjie
        jcb = nm.json_conf_bean(eb.sourcename, eb.eissn)
        file_path = self.creat_filename()
        try:
            # time.sleep(random.random() * 3 + 1)
            logger.info(self.sourcename + " start downloading: " + url)
            r = requests.get(url)
            # The site's PDF links require its session cookies; missing ones are tolerated.
            try:
                c1 = r.cookies['BIGipServerlbapp_tc3']
                c2 = r.cookies['BIGipServerwww.osti.gov_pool']
                c3 = r.cookies['JSESSIONID']
            except KeyError:
                pass
            soup = BeautifulSoup(r.text, "html.parser")
            meta = soup.find("meta", {"name": "citation_pdf_url"})
            if meta is None:
                # No citation_pdf_url meta tag: fall back to the DOI link in the
                # bibliographic block and resolve it through the site configs.
                start_break = False
                for div1 in soup.find_all("div", class_="biblio-secondary-group"):
                    for div2 in div1.find_all("div", class_="biblio-secondary-item small"):
                        for a in div2.find_all("a"):
                            if "href" in a.attrs.keys() and "https://doi.org" in a["href"]:
                                pdf_url = a["href"]
                                cp = htmls.config_parser()
                                ht = htmls.HTML(None, None, None, None)
                                for conf in cp.get_all_conf():
                                    if ht.test(conf, pdf_url):
                                        result = ht.do_run(conf, pdf_url)
                                        r2 = requests.get(result)
                                        with open(file_path, "wb+") as file:
                                            file.write(r2.content)
                                        break
                                start_break = True
                                break
                        if start_break:
                            break
                    if start_break:
                        break
            else:
                # The meta tag gives the PDF URL directly; fetch it with the session cookies.
                pdf_url = meta["content"]
                cookies = {
                    'BIGipServerlbapp_tc3': c1,
                    'BIGipServerwww.osti.gov_pool': c2,
                    'JSESSIONID': c3,
                    '__utma': '249692800.1749221367.1564467097.1564467097.1564467097.1',
                    '__utmc': '249692800',
                    '__utmz': '249692800.1564467097.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none)',
                    '_ga': 'GA1.2.1749221367.1564467097',
                    '_gid': 'GA1.2.298248318.1564467099',
                    '__utmt': '1',
                    '__utmb': '249692800.63.10.1564467097'
                }
                r2 = requests.get(pdf_url, cookies=cookies)
                with open(file_path, "wb+") as file:
                    file.write(r2.content)
            eb.page = htmls.checkpdf(file_path)
            full_url = pdf_url
        except NoConfError:
            logger.info(eb.eissn + " has no usable conf.")
            eb.err_and_step = str(self.url_step) + ": no usable conf"
            self.um.save(eb, self.err_step)
            continue
        except Exception:
            logger.error(self.sourcename + " download url " + url + " has err", exc_info=True)
            if eb.retry < collect.DOWNLOAD_URL_RETRY:
                logger.info("retry time:" + str(eb.retry))
                eb.retry += 1
                self.um.save(eb, self.url_step - 1)
            else:
                logger.info("retry:" + str(eb.retry) + ". Retried more than 5 times, giving up.")
                self.um.save(eb, self.err_step)
            continue
        eb.full_url = full_url
        eb.abs_url = url
        dirs = file_path.split("/")
        eb.full_path = dirs[-2] + "/" + dirs[-1]
        self.um.save(eb, self.finsh_step)