def project_delete(project_name):
    for project in project_set:
        if project[0] == project_name:
            project_set.remove(project)  # remove the matching row itself, not just the loop variable
            dh.csv_write(project_set, "universal", "project", "wb")
            op.output("Project deleted", "notice", ut.time_str("full"))
            break
    else:  # the loop finished without finding a match
        op.output('No project found', "notice", ut.time_str("full"))
def project_des_update(project_name, project_description):
    for i in range(len(project_set)):
        fields = project_set[i].split(",|,")
        if fields[0] == project_name:
            fields[1] = project_description  # replace the stored description
            project_set[i] = ",|,".join(fields)
            dh.text_write(project_set, "universal", "project", "w")  # persist, mirroring project_add
            op.output("Project description updated", "notice", ut.time_str("full"))
            break
    else:
        op.output('No project found', "notice", ut.time_str("full"))
def crawl_detail(self, pmid):  # crawl the detail page of one article
    link = "https://www.ncbi.nlm.nih.gov/pubmed/" + pmid
    key_words_list = []  # collected keywords
    institues_list = []  # institution names
    full_links_list = []  # full-text links (downloadable pdfs, not the abstract)
    tries = 3  # try 3 times, then give up and report an error
    while tries > 0:
        try:
            opener = requests.Session()  # a fresh session keeps cookies between requests
            doc = opener.get(
                link,
                timeout=self.request_time_out,
                headers=agents.get_header()).text  # note: the agent is re-randomized on every request
            soup = BeautifulSoup(doc)
            abstract_raw = soup.findAll(name="abstracttext")
            abstract = ut.regexp_replace(
                str(abstract_raw), ut.re_html)[1:-1]  # clean the abstract right away
            key_words_raw = soup.findAll(name="div",
                                         attrs={"class": "keywords"})
            if key_words_raw:  # keywords are optional; many articles have none
                key_words_raw = str(key_words_raw)[45:-11].replace("; ", ";")
                key_words_list = key_words_raw.split(';')
            institues_raw = soup.findAll(name='dl')
            if institues_raw:  # institutions are optional, though most articles have them
                institues_raw = institues_raw[0]
                institues_raw = re.findall("<dd>.*?</dd>", str(institues_raw))
                for institues in institues_raw:
                    institues_list.append(institues[4:-5])
            full_content = soup.findAll(name='div',
                                        attrs={"class": "icons portlet"})
            full_links_raw = re.findall("<a href=.*?ref=", str(full_content))
            if full_links_raw:  # full-text links, when present
                for full_link in full_links_raw:
                    full_links_list.append(full_link[9:-6].replace("&amp;", "&"))
            return abstract, key_words_list, institues_list, full_links_list  # one string plus three lists
        except Exception as e:
            tries -= 1
            msg.display(
                ut.time_str("time"), "retrying record: " + str(pmid) + "; " +
                str(tries) + " tries left", "notice")
            msg.log(self.task_name, ut.time_str("full"),
                    "retry record: " + str(pmid), "notice")
            msg.log(self.task_name, ut.time_str("full"), str(e), "error")
            time.sleep(self.request_refresh_wait)  # on failure, rest a few seconds before retrying
    else:  # every try is spent
        msg.display(ut.time_str("time"),
                    "retrieve record fail: " + str(pmid), "error")
        msg.log(self.task_name, ut.time_str("full"),
                "failed record: " + str(pmid), "error")
        msg.stat("record", "fail")
        return 0
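
# A minimal standalone sketch (not part of the original module) of the retry
# pattern crawl_detail and the sum-page crawlers all share: N attempts, a
# short rest between failures, None once every attempt is spent. The helper
# name fetch_with_retries and its defaults are hypothetical.
import time

import requests


def fetch_with_retries(url, headers=None, tries=3, timeout=30, wait=3):
    while tries > 0:
        try:
            return requests.get(url, timeout=timeout, headers=headers).text
        except requests.RequestException:
            tries -= 1
            time.sleep(wait)  # rest before the next attempt, as the crawlers above do
    return None  # the caller treats None as a failed record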
def generate_record(self):  # build records from the raw crawled material
    title_start_with = "linksrc=docsum_title\">"  # marker for the start of a title
    title_end_with = "</a>"  # marker for the end of a title
    journal_start_with = 'title='  # marker for the start of a journal name
    journal_end_with = '\">'  # marker for the end of a journal name
    m = 0
    while m < len(self.pmid):  # repeat once per pmid found
        msg.stat("record", "proc")  # processed records +1
        pmid = str(self.pmid[m])[4:-5]  # find the pmid first, then decide whether to go further
        if not self.pmid_check(pmid):  # only if the article has not been seen before
            author = str(self.author[m])[16:-4]
            author_list = author.split(", ")  # author list
            title_start = str(self.title[m]).find(title_start_with) + 22
            title = str(self.title[m])[title_start:-8].replace(
                '<b>', '').replace('</b>', '')  # paper title
            issue = re.search("[1-2][09][0-9]{2}",
                              str(self.issue[m])).group(0)  # issue, i.e. the year
            journal_end = str(self.journal[m]).find(
                journal_end_with)  # end position of the journal name
            journal = str(self.journal[m])[26:journal_end].replace(
                '<b>', '').replace('</b>', '')  # journal name
            journal_detail = jn.journal_detail(
                journal)  # official journal name, impact factor and quartile
            paper_detail = self.crawl_detail(
                pmid)  # abstract, keyword list, institution list and full-text links
            if paper_detail:  # record only if a valid abstract came back; otherwise leave it for the next crawl (treated as a new paper)
                mh.add_new_content(self.project_name, self.key_words,
                                   ut.time_str("full"), "pm", pmid, title,
                                   author_list, journal, journal_detail[0],
                                   journal_detail[1], journal_detail[2], issue,
                                   str(paper_detail[0]), paper_detail[1],
                                   paper_detail[2], paper_detail[3])
                self.pmid_set.append(pmid)  # add the freshly crawled pmid to the pmid list
                # paper_detail[0] is the abstract, [1] the keywords, [2] the institution list, [3] the full-text links
                msg.stat("record", "succ")  # successful records +1
                msg.display(ut.time_str("time"),
                            "retrieved record: " + str(pmid) +
                            "; total retrieved: " + str(stats.success_record),
                            "info")  # display: record succeeded
                msg.log(self.task_name, ut.time_str("full"),
                        "retrieved record: " + str(pmid), "info")  # log: record succeeded
            else:
                msg.stat("record", "skip")  # skipped records +1
                msg.display(
                    ut.time_str("time"), "skipped record: " + str(pmid) +
                    "; total skipped: " + str(stats.skipped_record), "info")
                msg.log(self.task_name, ut.time_str("full"),
                        "skipped record: " + str(pmid), "info")
        m += 1
def project_add(project_name, project_description):
    if not dh.check_folders(project_name, "folder"):  # if there is no matching folder yet
        dh.new_project_files(project_name)  # create the files
        project_set = dh.text_read("universal", "project").split("\n")
        time.sleep(0.1)  # make sure the file is closed after reading
        new_project = ",|,".join(
            (project_name, project_description,
             ut.time_str("full")))  # same ",|," record format that project_des_update parses
        project_set.append(new_project)
        dh.text_write(project_set, "universal", "project", "w")
    else:
        op.output("Folder already exists", "warning", ut.time_str("full"))
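
# Hedged illustration (not in the original files) of the ",|," record format
# that project_add writes and project_des_update parses back: one project per
# line, three fields joined by ",|,". The helper name is hypothetical.
def _project_record_example():
    record = ",|,".join(("demo project", "a short description",
                         "2017-10-10 10:10:10"))
    name, description, created = record.split(",|,")
    return name, description, created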
def retrieve_proxy(proxy_number):
    api_url = "http://vtp.daxiangdaili.com/ip/?tid=559131754091145&num=" + \
        str(proxy_number) + "&delay=1&sortby=time"
    proxies = requests.get(api_url, timeout=10).text
    proxy_pool = []
    for proxy in proxies.split("\n"):
        proxy_record = ut.time_str("full"), proxy, 0, 0, 0  # fetch time, "host:port", three zeroed counters
        proxy_pool.append(proxy_record)
    return proxy_pool
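
# A sketch (not part of the original module) of how one record from
# retrieve_proxy might be consumed. The record layout follows the tuple built
# above: fetch time, a "host:port" string, then three zeroed counters. Routing
# it through requests' proxies argument is an assumption, since the original
# consumer is not shown here.
def get_with_proxy(url, proxy_record, timeout=10):
    proxy_address = proxy_record[1]  # the "host:port" string from the pool
    return requests.get(url, timeout=timeout,
                        proxies={"http": "http://" + proxy_address})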
def run_task(project, sstr):  # run one task end to end
    record_number, mrmins, endwith = get_task_config(project, sstr)
    endtime = ut.time_str("full", mrmins)  # deadline: mrmins minutes from now
    msg.msg("crawl pmid", project + sstr, "started", "succ", "important",
            msg.display, msg.log, msg.stat)
    pc.run_pmid_crawler(project, sstr, record_number, endwith, endtime)
    msg.msg("crawl pmid", project + sstr, "finished", "succ", "important",
            msg.display, msg.log, msg.stat)
    msg.msg("crawl detail", project + sstr, "started", "succ", "important",
            msg.display, msg.log, msg.stat)
    dc.run_detail_crawler(project, sstr, record_number)
    msg.msg("crawl detail", project + sstr, "finished", "succ", "important",
            msg.display, msg.log, msg.stat)
def task_validator(project, sstr, endwith, endtime):  # decide whether to stop early
    if stats.c_skipped_pmid >= config.pmid_max_c_skip and endwith:  # early finish enabled and the consecutive-skip threshold reached
        msg.msg("crawl pmid", project + sstr, "repeat end", "succ", "notice",
                msg.display, msg.log)
        return False  # False means: stop
    if endtime < ut.time_str("full") and endwith:  # early finish enabled and the time limit reached
        msg.msg("crawl pmid", project + sstr, "time end", "succ", "notice",
                msg.display, msg.log)
        return False
    return True  # True means: keep going
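
# Hedged usage sketch: how task_validator is presumably polled from the pmid
# crawler's main loop; the loop body below is illustrative, not the original.
#
#     while task_validator(project, sstr, endwith, endtime):
#         crawl_next_sum_page()  # hypothetical per-page step
#     # the loop ends on the consecutive-skip threshold or the time limit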
def stat(when, who, identifier, action, result, info_type):  # statistics bookkeeping
    if result == "succ":
        if who == "sum page":
            stats.success_sum_page += 1
        elif who == "record":
            stats.success_record += 1
        elif who == "pmid":
            stats.success_pmid += 1
            stats.c_skipped_pmid = 0  # a success resets the consecutive-skip counter
        elif who == "crawl pmid":
            if action == "started":  # the call sites pass "started"/"finished" as the action
                stats.crawl_pmid_start = ut.time_str("full")
            elif action == "finished":
                stats.crawl_pmid_finish = ut.time_str("full")
        elif who == "crawl detail":
            if action == "started":
                stats.crawl_detail_start = ut.time_str("full")
            elif action == "finished":
                stats.crawl_detail_finish = ut.time_str("full")
    elif result == "fail":
        if who == "sum page":
            stats.failed_sum_page += 1
        elif who == "record":
            stats.failed_record += 1
        elif who == "pmid":
            stats.failed_pmid += 1
    elif result == "proc":
        if who == "sum page":
            stats.processed_sum_page += 1
        elif who == "record":
            stats.processed_record += 1
        elif who == "pmid":
            stats.processed_pmid += 1
    elif result == "skip":
        if who == "sum page":
            stats.skipped_sum_page += 1
        elif who == "record":
            stats.skipped_record += 1
        elif who == "pmid":
            stats.skipped_pmid += 1
            stats.c_skipped_pmid += 1  # consecutive skips, read by task_validator
def crawl_direct(self):  # crawl the sum-page directly; first page only, but fast since it avoids phantomjs
    msg.stat("sum_page", "proc")  # count as processed
    tries = self.tries_request  # try 3 times
    while tries > 0:
        try:
            opener = requests.Session()
            raw = opener.get(
                self.url,
                timeout=self.request_time_out,
                headers=agents.get_header()).text  # the header can still be randomized
            soup = BeautifulSoup(raw)
            number_raw = soup.findAll(name="input",
                                      attrs={"id": "pageno"})  # the element holding the page total
            number_start = str(number_raw).find("last=") + 6  # start of the total
            number_end = str(number_raw).find("\" />")  # end of the total
            max_number = int(
                str(number_raw)[number_start:number_end])  # actual maximum, as an int
            if max_number < self.sum_page_number:  # if the actual page count is below the computed one
                self.sum_page_number = max_number  # use the actual value; otherwise keep the computed one
                msg.display(ut.time_str("time"),
                            "max sum page changed: " + str(max_number),
                            "notice")
                msg.log(self.task_name, ut.time_str("full"),
                        "changed sum page number: " + str(max_number),
                        "notice")
            msg.display(ut.time_str("time"),
                        "loaded: NO.1 sum page (requests)", "info")
            msg.log(self.task_name, ut.time_str("full"),
                    "load sum page: NO.1 (requests)", "info")
            self.author = soup.findAll(name='p', attrs={"class": "desc"})
            self.journal = soup.findAll(name="span", attrs={'class': 'jrnl'})
            self.title = soup.findAll(name='p', attrs={"class": "title"})
            self.issue = soup.findAll(name="p", attrs={'class': 'details'})
            self.pmid = soup.findAll(name="dd")
            self.generate_record()  # produce the records right away
            msg.stat("sum_page", "succ")
            break
        except Exception as e:
            tries -= 1
            msg.display(
                ut.time_str("time"),
                "load retrying: NO.1 sum page (requests); " + str(tries) +
                " tries left", "notice")
            msg.log(self.task_name, ut.time_str("full"),
                    "retry sum page: NO.1 (requests)", "notice")
            msg.log(self.task_name, ut.time_str("full"), str(e), "error")
def crawl_direct(project, sstr):  # crawl the sum-page directly; first page only, but fast since it avoids phantomjs
    url = parse_url(project, sstr)
    tries = config.request_sp_tries  # try 3 times
    while tries > 0:
        try:
            opener = requests.Session()
            content = opener.get(
                url,
                timeout=config.request_time_out,
                headers=agents.get_header()).text  # the header can still be randomized
            msg.msg("sum page", "1", "loaded", "proc", "info", msg.display)
            pmid_list = extract_new_pmid(content)  # extract pmids, then drop the known ones
            if pmid_list:
                mh.add_new_pmid_many(project, sstr, ut.time_str("full"), "pm",
                                     pmid_list)
            msg.msg("sum page", "1", "loaded", "succ", "info", msg.display,
                    msg.log)
            break
        except Exception as e:
            msg.msg("sum page", "1", "loaded", str(e), "error", msg.log)
            msg.msg("sum page", "1", "loaded", "retried", "notice",
                    msg.display)
            tries -= 1
            time.sleep(config.request_refresh_wait)
def save_png(browser):
    browser.save_screenshot(
        ut.cur_file_dir() + "/browser/" + ut.time_str("time") + ".png")
    msg.msg("screenshot", "", "saved", "succ", "debug", msg.display, msg.log)
"info", msg.display, msg.stat) WebDriverWait(pm_browser, config.phantom_time_out).until( EC.presence_of_element_located((By.ID, "footer"))) # 等待加载完毕的最好方案 pm_browser.find_elements_by_name("Display")[2].click() # 找到下拉菜单,点击 pm_browser.implicitly_wait(5) # 等0.5秒钟,让菜单下拉完成 time.sleep(2) pm_browser.find_element_by_xpath( "//*[@id=\"ps200\"]").click() # 下拉菜单找到200这个值,点击 WebDriverWait(pm_browser, config.phantom_time_out).until( EC.presence_of_element_located((By.ID, "footer"))) # 自动刷新页面, 等待刷新完毕 msg.msg("sum page", "1", "display number", "clicked", "debug", msg.display, msg.log) pm_browser.implicitly_wait(5) pmid_list = extract_new_pmid(pm_browser.page_source) if pmid_list: mh.add_new_pmid_many(project, sstr, ut.time_str( "full"), "pm", pmid_list) # 把pmid存起来 msg.msg("sum page", "1", "loaded", "succ", "info", msg.log, msg.display, msg.stat) rest_page_number -= 1 # dc.run_detail_crawler(project, sstr, 200) break except Exception as e: tries_1st_sp -= 1 # time.sleep(config.phantom_refresh_wait) pm_browser.refresh() pm_browser.implicitly_wait(config.phantom_refresh_wait) msg.msg("sum page", "1", "loaded", "retried", "notice", msg.display) msg.msg("sum page", "1", "loaded", str(e), "error", msg.log) else: msg.msg("sum page", "1", "loaded", "fail",
def count_task(project, sstr):
    number = get_db("task").count({"project": project, "sstr": sstr})
    return number


def count_project_task(project):  # count how many tasks have run under this project
    number = get_db("task").count({"project": project})
    return number


def finish_task(project, sstr):  # mark a task as finished
    data = {"status": 1}
    get_db("task").update_one({
        "project": project,
        "sstr": sstr
    }, {"$set": data})


if __name__ == "__main__":
    # add_new_project("organ on chip", "organ simulator, organ on chip", ut.time_str("full"))
    add_new_sstr("cancer", "liver,cancer", ut.time_str("full"), "key_words")
    # add_new_task("cancer", "breast,cancer", "2017-10-10 10:10:10", 5000, 6, 0, 0)
    # finish_task("cancer", "breast,cancer")
    # print count_task("cancer", "breast,cancer")
    # add_new_pmid("cancer", "lung,cancer", "2017-10-10 10:10:10", "pm", 29027110)
    # print read_empty_pmid("organ on chip", 10000)
    # print read_content("cancer", "lung,cancer", 1)
    # pass
def msg(who, identifier, action, result, info_type, *args):
    '''*args may be one or more of log, display, stat'''
    for fn in args:
        fn(ut.time_str("full"), who, identifier, action, result, info_type)
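
# Example (mirroring the call sites elsewhere in this codebase): one msg call
# fans a single event out to every sink passed in *args, so
#
#     msg("sum page", "1", "loaded", "succ", "info", display, log, stat)
#
# shows the event on screen, appends it to the log file, and updates the
# statistics counters; each sink receives
# (time string, who, identifier, action, result, info_type).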
def crawl_phantom(self):  # crawl sum-pages with phantomjs: any number of pages, but slow
    rest_page_number = self.sum_page_number  # pages left to crawl
    tries_1st_sp = self.tries_1st_sp
    tries_other_sp = self.tries_other_sp
    if self.sum_page_number > 1:  # do not start the browser for a single page
        dcap = dict(DesiredCapabilities.PHANTOMJS)  # set the userAgent
        dcap["phantomjs.page.settings.userAgent"] = (
            self.phantomjs_headers)  # the header is randomized each time phantomjs starts, then stays fixed until the browser closes
        dcap["phantomjs.page.settings.loadImages"] = False  # skip images for speed
        # browser = webdriver.PhantomJS(executable_path='C:\Python27\Scripts\phantomjs.exe', desired_capabilities=dcap)  # load the browser, on windows
        path = cur_file_dir() + "/browser/phantomjs"  # browser location
        browser = webdriver.PhantomJS(executable_path=path,
                                      desired_capabilities=dcap)  # load the browser
        browser.set_page_load_timeout(
            self.phantomjs_time_out)  # give up on pages that take too long to load
        while self.sum_page_number > 1 and tries_1st_sp > 0:
            try:
                browser.get(self.url)
                WebDriverWait(browser, self.phantomjs_time_out).until(
                    EC.presence_of_element_located((By.ID, "footer")))
                msg.display(ut.time_str("time"),
                            "loaded: NO.1 sum page (phantomjs)", "info")
                msg.log(self.task_name, ut.time_str("full"),
                        "load sum page: NO.1 (phantomjs)", "info")
                msg.stat("sum_page", "succ")
                break
            except Exception as e:
                tries_1st_sp -= 1
                msg.display(
                    ut.time_str("time"),
                    "load retrying: NO.1 sum page (phantomjs); " +
                    str(tries_1st_sp) + " tries left", "notice")
                msg.log(self.task_name, ut.time_str("full"),
                        "retry sum page: NO.1 (phantomjs)", "notice")
                msg.log(self.task_name, ut.time_str("full"), str(e), "error")
                browser.refresh()
                browser.implicitly_wait(self.phantomjs_refresh_wait)
        else:
            msg.display(ut.time_str("time"),
                        "load failed: NO.1 sum page (phantomjs)", "error")
            msg.log(self.task_name, ut.time_str("full"),
                    "fail sum page: NO.1 (phantomjs)", "error")
        while rest_page_number > 1 and tries_1st_sp > 0:  # only go past page 1 if there is one and the first page loaded; loop here until every page is crawled
            msg.stat("sum_page", "proc")
            tries_other_sp = self.tries_other_sp
            while tries_other_sp > 0:  # default: 5 attempts, then give the page up
                try:
                    browser.find_element_by_link_text(
                        "Next >").click()  # click straight through to "next page", starting from page 2
                    WebDriverWait(browser, self.phantomjs_time_out).until(
                        EC.presence_of_element_located((By.ID, "footer")))
                    msg.display(
                        ut.time_str("time"),
                        "loaded: NO." + str(stats.success_sum_page + 1) +
                        " sum page (phantomjs)", "info")
                    msg.log(
                        self.task_name, ut.time_str("full"),
                        "load sum page: NO." +
                        str(stats.success_sum_page + 1) + " (phantomjs)",
                        "info")
                    soup = BeautifulSoup(browser.page_source)
                    self.author = soup.findAll(name='p',
                                               attrs={"class": "desc"})
                    self.journal = soup.findAll(name="span",
                                                attrs={'class': 'jrnl'})
                    self.title = soup.findAll(name='p',
                                              attrs={"class": "title"})
                    self.issue = soup.findAll(name="p",
                                              attrs={'class': 'details'})
                    self.pmid = soup.findAll(name="dd")
                    self.generate_record()  # produce the records right away
                    msg.stat("sum_page", "succ")
                    rest_page_number -= 1
                    break
                except Exception as e:
                    tries_other_sp -= 1
                    msg.display(
                        ut.time_str("time"),
                        "load retrying: NO." +
                        str(stats.success_sum_page + 1) +
                        " sum page (phantomjs); " + str(tries_other_sp) +
                        " tries left", "notice")
                    msg.log(
                        self.task_name, ut.time_str("full"),
                        "retry sum page: NO." +
                        str(stats.success_sum_page + 1) + " (phantomjs)",
                        "notice")
                    msg.log(self.task_name, ut.time_str("full"), str(e),
                            "error")
                    browser.refresh()
                    browser.implicitly_wait(self.phantomjs_refresh_wait)
            else:
                msg.stat("sum_page", "fail")
                msg.display(
                    ut.time_str("time"),
                    "load failed: NO." + str(stats.success_sum_page + 1) +
                    " sum page (phantomjs)", "error")
                msg.log(
                    self.task_name, ut.time_str("full"),
                    "fail sum page: NO." +
                    str(stats.success_sum_page + 1) + " (phantomjs)",
                    "error")
                break
    if self.sum_page_number > 1:
        browser.quit()  # close the browser; if something goes wrong, remember to kill PhantomJS in the task manager
def time_box():  # lay out the time-related values
    print " ---------------------------------------------------------------"
    print " │ Current Time │ Start Time │ Elapsed Time │"
    print " │ " + ut.time_str("full") + " │ " + ut.time_str("full") + " │ " + " ?? Hr ?? min " + " │"  # start and elapsed columns are still placeholders
    print " ---------------------------------------------------------------"
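
# The start and elapsed cells above are still stubs. A minimal sketch of
# computing the "?? Hr ?? min" cell, assuming start timestamps use the
# "%Y-%m-%d %H:%M:%S" layout seen elsewhere in this codebase
# (e.g. "2017-10-10 10:10:10"):
from datetime import datetime


def elapsed_str(start_str):
    delta = datetime.now() - datetime.strptime(start_str, "%Y-%m-%d %H:%M:%S")
    hours, seconds = divmod(int(delta.total_seconds()), 3600)
    return "%2d Hr %2d min" % (hours, seconds // 60)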
def generate_tasks(project, sstr):
    task_config = get_task_config(project, sstr)  # avoid shadowing the config module used elsewhere
    mh.add_new_task(project, sstr, ut.time_str("full"), task_config[0],
                    task_config[1], task_config[2], 0)