def test_download():
    url_ = "http://dx.doi.org/10.1016/j.athoracsur.2019.05.024"
    section = "Elsevier_0003-4975"
    cp = htmls.config_parser()
    print(cp.get_section(section))
    d_url = htmls.HTML(None, None, None, "test").do_run(cp.get_section(section), url_)
    # d_url = "https://onlinelibrary.wiley.com/doi/epdf/10.1016/S1607-551X%2814%2900235-6"
    # d_url = "https://www.microbiologyresearch.org/deliver/fulltext/jgv/99/9/1187_vir001128.pdf?itemId=%2Fcontent%2Fjournal%2Fjgv%2F10.1099%2Fjgv.0.001128&mimeType=pdf&containerItemId=content/journal/jgv"
    print(d_url)
    htmls.download(d_url.strip(), test_file)
    print(htmls.checkpdf(test_file))
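# htmls.checkpdf above is used as a sanity check on the downloaded file (and, in the
# pipeline, records a page count). The helper below is a minimal standalone sketch of
# that kind of check, assuming only the "%PDF-" magic-byte test plus the optional
# third-party pypdf package; it is NOT the project's actual checkpdf implementation.
def checkpdf_sketch(path):
    """Return the page count of the PDF at `path`, or 0 if the file is not a valid PDF."""
    with open(path, "rb") as f:
        header = f.read(5)
    if header != b"%PDF-":              # every PDF begins with the "%PDF-" magic bytes
        return 0
    try:
        from pypdf import PdfReader     # optional dependency; an assumption of this sketch
        return len(PdfReader(path).pages)
    except ImportError:
        return 1                        # header looks valid but the page count is unknown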
import collector.collect as collect
import collector.htmls as htmls

# redis_ = redis.Redis(host="10.3.1.99", port=6379, db=1, decode_responses=True)
# print(redis_.keys("*"))

if __name__ == '__main__':
    # name = "osti_1"
    name = "osti_aps"
    # name = "hg0903"
    file_path = r"C:\temp\osti\r1119\web_xls\aps.xls"
    # file_path = r"C:\public\目次采全文\0903\化工所待补全文清单_20190903..xls"
    cp = htmls.config_parser()
    cp.paser()
    collect.run_thread(name, file_path)
    cp.backup()
    # collect.test_download()
def run(self):
    logger.info(self.sourcename + " download_url start...")
    while True:
        string = self.um.get_eb(self.url_set_name)
        if string is None:
            break
        eb = nm.execl_bean()
        eb.paser(string)
        url = eb.pinjie
        jcb = nm.json_conf_bean(eb.sourcename, eb.eissn)
        file_path = self.creat_filename()
        try:
            # time.sleep(random.random() * 3 + 1)
            logger.info(self.sourcename + " start downloading: " + url)
            r = requests.get(url)
            try:
                # Session cookies set by the OSTI landing page; replayed on the PDF request below.
                c1 = r.cookies['BIGipServerlbapp_tc3']
                c2 = r.cookies['BIGipServerwww.osti.gov_pool']
                c3 = r.cookies['JSESSIONID']
            except KeyError:
                pass
            soup = BeautifulSoup(r.text, "html.parser")
            mate = soup.find("meta", {"name": "citation_pdf_url"})
            if mate is None:
                # No citation_pdf_url meta tag: fall back to the first DOI link in the
                # biblio blocks and resolve it through the per-publisher conf rules.
                start_break = False
                for div1 in soup.find_all("div", class_="biblio-secondary-group"):
                    for div2 in div1.find_all("div", class_="biblio-secondary-item small"):
                        for a in div2.find_all("a"):
                            if "href" in a.attrs.keys():
                                if "https://doi.org" in a["href"]:
                                    pdf_url = a["href"]
                                    cp = htmls.config_parser()
                                    ht = htmls.HTML(None, None, None, None)
                                    for conf in cp.get_all_conf():
                                        print(conf)
                                        if ht.test(conf, pdf_url):
                                            result = ht.do_run(conf, pdf_url)
                                            r2 = requests.get(result)
                                            r2.encoding = 'utf-8'
                                            # print(r2.text)
                                            with open(file_path, "wb+") as file:
                                                file.write(r2.content)
                                            break
                                    start_break = True
                                    break
                        if start_break:
                            break
                    if start_break:
                        break
            else:
                pdf_url = mate["content"]
                cookies = {
                    'BIGipServerlbapp_tc3': c1,
                    'BIGipServerwww.osti.gov_pool': c2,
                    'JSESSIONID': c3,
                    '__utma': '249692800.1749221367.1564467097.1564467097.1564467097.1',
                    '__utmc': '249692800',
                    '__utmz': '249692800.1564467097.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none)',
                    '_ga': 'GA1.2.1749221367.1564467097',
                    '_gid': 'GA1.2.298248318.1564467099',
                    '__utmt': '1',
                    '__utmb': '249692800.63.10.1564467097'
                }
                r2 = requests.get(pdf_url, cookies=cookies)
                r2.encoding = 'utf-8'
                # print(r2.text)
                with open(file_path, "wb+") as file:
                    file.write(r2.content)
            eb.page = htmls.checkpdf(file_path)
            full_url = pdf_url
        except NoConfError:
            logger.info(eb.eissn + " no usable conf.")
            eb.err_and_step = str(self.url_step) + ": no usable conf"
            self.um.save(eb, self.err_step)
            continue  # park this record on the error step; skip the bookkeeping below
        except Exception as e:
            logger.error(self.sourcename + " download url " + url + " has err", exc_info=True)
            if eb.retry < collect.DOWNLOAD_URL_RETRY:
                logger.info("retry time:" + str(eb.retry))
                eb.retry += 1
                self.um.save(eb, self.url_step - 1)
            else:
                logger.info("retry:" + str(eb.retry) + ". Retry count exceeded 5; giving up.")
                self.um.save(eb, self.err_step)
            continue
        eb.full_url = full_url
        eb.abs_url = url
        dirs = file_path.split("/")
        eb.full_path = dirs[-2] + "/" + dirs[-1]
        self.um.save(eb, self.finsh_step)
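# The cookie handling in run() copies the BIGipServer* / JSESSIONID values by hand from
# the landing-page response onto the PDF request, together with a set of hard-coded
# Google-Analytics cookies. A requests.Session carries server-set cookies across requests
# automatically; the sketch below shows that alternative, under the assumption that the
# hard-coded analytics cookies are not actually required by the server. It is an
# illustration, not the code used above.
def fetch_pdf_via_session(abs_url, file_path):
    import requests                      # duplicates the module-level imports so the sketch is self-contained
    from bs4 import BeautifulSoup

    session = requests.Session()
    r = session.get(abs_url)             # any cookies set here are kept by the session
    soup = BeautifulSoup(r.text, "html.parser")
    meta = soup.find("meta", {"name": "citation_pdf_url"})
    if meta is None:
        return None
    r2 = session.get(meta["content"])    # the same session replays those cookies on the PDF request
    with open(file_path, "wb+") as f:
        f.write(r2.content)
    return meta["content"]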
def test(url, file_path=r"C:\temp\other\test.pdf"):
    r = requests.get(url)
    try:
        c1 = r.cookies['BIGipServerlbapp_tc3']
        c2 = r.cookies['BIGipServerwww.osti.gov_pool']
        c3 = r.cookies['JSESSIONID']
    except KeyError:
        pass
    soup = BeautifulSoup(r.text, "html.parser")
    mate = soup.find("meta", {"name": "citation_pdf_url"})
    if mate is None:
        # No citation_pdf_url meta tag: look for a DOI link in the biblio blocks
        # and resolve it through the per-publisher conf rules.
        start_break = False
        for div1 in soup.find_all("div", class_="biblio-secondary-group"):
            for div2 in div1.find_all("div", class_="biblio-secondary-item small"):
                for a in div2.find_all("a"):
                    if "href" in a.attrs.keys():
                        if "https://doi.org" in a["href"]:
                            turl = a["href"]
                            cp = htmls.config_parser()
                            ht = htmls.HTML(None, None, None, None)
                            for conf in cp.get_all_conf():
                                print(conf)
                                if ht.test(conf, turl):
                                    result = ht.do_run(conf, turl)
                                    r2 = requests.get(result)
                                    r2.encoding = 'utf-8'
                                    # print(r2.text)
                                    with open(file_path, "wb+") as file:
                                        file.write(r2.content)
                                    break
                            start_break = True
                            break
                if start_break:
                    break
            if start_break:
                break
    else:
        pdf_url = mate["content"]
        cookies = {
            'BIGipServerlbapp_tc3': c1,
            'BIGipServerwww.osti.gov_pool': c2,
            'JSESSIONID': c3,
            '__utma': '249692800.1749221367.1564467097.1564467097.1564467097.1',
            '__utmc': '249692800',
            '__utmz': '249692800.1564467097.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none)',
            '_ga': 'GA1.2.1749221367.1564467097',
            '_gid': 'GA1.2.298248318.1564467099',
            '__utmt': '1',
            '__utmb': '249692800.63.10.1564467097'
        }
        r2 = requests.get(pdf_url, cookies=cookies)
        r2.encoding = 'utf-8'
        # print(r2.text)
        with open(file_path, "wb+") as file:
            file.write(r2.content)
    page = checkpdf(file_path)
    print(page)
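# The start_break flags in run() and test() exist only to escape three nested loops once a
# DOI link has been found. Extracting the scan into a small helper that returns on the first
# hit removes the flags entirely; the sketch below uses the same BeautifulSoup selectors as
# the code above and is offered as an alternative structuring, not as the code actually used.
def find_doi_link(soup):
    """Return the first https://doi.org href found in the biblio-secondary blocks, or None."""
    for div1 in soup.find_all("div", class_="biblio-secondary-group"):
        for div2 in div1.find_all("div", class_="biblio-secondary-item small"):
            for a in div2.find_all("a"):
                if "https://doi.org" in a.get("href", ""):
                    return a["href"]
    return None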