def task_validator(project, sstr, endwith, endtime):
    # Decide whether the task should end early
    if stats.c_skipped_pmid >= config.pmid_max_c_skip and endwith:
        # early finish enabled and the skip threshold has been reached
        msg.msg("crawl pmid", project + sstr, "repeat end", "succ", "notice",
                msg.display, msg.log)
        return False  # False means the check failed
    if endtime < ut.time_str("full") and endwith:
        # early finish enabled and the time limit has been reached
        msg.msg("crawl pmid", project + sstr, "time end", "succ", "notice",
                msg.display, msg.log)
        return False
    else:
        return True  # True means the check passed
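# task_validator relies on ut.time_str("full") producing timestamps that sort
# lexicographically, so a plain string comparison doubles as a time comparison.
# A minimal sketch of that property (the "%Y-%m-%d %H:%M:%S" format is an
# assumption; the real ut.time_str may differ):
from datetime import datetime, timedelta

def time_str_sketch(offset_minutes=0):
    # hypothetical stand-in for ut.time_str("full", mrmins)
    t = datetime.now() + timedelta(minutes=offset_minutes)
    return t.strftime("%Y-%m-%d %H:%M:%S")

endtime = time_str_sketch(30)       # deadline 30 minutes from now
print(endtime < time_str_sketch())  # False until the deadline passes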
def journal_name_wash(journal_name_raw):
    # Clean the raw journal name (mainly brackets, punctuation, and trailing
    # explanations or annotations)
    re_bracket = re.compile("[\\[\\(](.*?)[\\]\\)]")  # strip bracketed notes
    re_explanation = re.compile(" ??[:=].*")  # strip explanations after a colon
    journal_name = journal_name_raw.replace("&amp;", "&").replace(
        ",", "").replace(".", "")  # "&amp;" appears in some raw names
    journal_name = re_bracket.sub("", journal_name)
    journal_name = re_explanation.sub("", journal_name)
    journal_name = journal_name.upper()  # washed names are all upper-case
    msg.msg("journal name", journal_name_raw, "washed", journal_name,
            "debug", msg.display)
    return journal_name
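# A couple of hypothetical inputs (not taken from the crawler's data) showing
# the washing rules in action:
print(journal_name_wash("J. Clin. Invest. (Electronic)"))
# periods and the bracketed note are stripped, result upper-cased
print(journal_name_wash("Cell Host &amp; Microbe: a journal"))
# "&amp;" is decoded to "&" and the colon explanation is dropped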
def parse_url(project, sstr="key_words"):
    # Turn the keywords into URL form; temporary, more to be added later
    sstr_type = mh.read_sstr_type(project, sstr)
    if sstr_type == "key_words" or sstr_type == "key_word":
        if "," in sstr:
            sstr = sstr.replace(", ", ",").replace(" ,", ",")  # guard against stray spaces
        elif " " in sstr:
            sstr = sstr.replace(" ", ",")
        sstr = sstr.replace(",", "%2C")  # URL-encode the commas
    if sstr_type == "expression":
        pass
    url = "https://www.ncbi.nlm.nih.gov/pubmed/?term=" + sstr  # the initial query URL
    msg.msg("url", sstr, "parsed", url, "debug", msg.display)
    return url
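# Hypothetical calls, assuming mh.read_sstr_type reports "key_words" for the
# entry:
#   parse_url("test", "lactobacillus, probiotics")
#     -> ".../pubmed/?term=lactobacillus%2Cprobiotics"
#   parse_url("test", "gut microbiome")  # spaces become commas first
#     -> ".../pubmed/?term=gut%2Cmicrobiome"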
def journal_detail(journal_name):
    record = mh.read_journal_detail(journal_name)  # try the raw name first
    if record:
        msg.msg("journal record", journal_name, "local retrieved", "succ",
                "debug", msg.display)
        return record
    wjournal_name = journal_name_wash(journal_name)  # try the washed name against official names
    record = mh.read_ojournal_detail(wjournal_name)
    if record:
        msg.msg("journal record", journal_name, "local retrieved", "succ",
                "debug", msg.display)
        return record
    ojournal_name = get_official_name(wjournal_name)  # fetch the official name from the web and try that
    record = mh.read_ojournal_detail(ojournal_name)
    if record:
        msg.msg("journal record", journal_name, "web retrieved", "succ",
                "debug", msg.display)
        return record
    journal_info = get_journal_info(ojournal_name)  # look the official name up on the web
    journal_if = journal_info[0]
    journal_zone = journal_info[1]
    mh.add_journal(journal_name, ojournal_name, journal_if, journal_zone)  # store the new journal
    msg.msg("journal record", journal_name, "web retrieved", "succ",
            "debug", msg.display)
    data = journal_name, ojournal_name, journal_if, journal_zone
    return data
def run_task(project, sstr):
    # Run one crawl task; mrmins minutes from now becomes the deadline
    record_number, mrmins, endwith = get_task_config(project, sstr)
    endtime = ut.time_str("full", mrmins)
    msg.msg("crawl pmid", project + sstr, "started", "succ", "important",
            msg.display, msg.log, msg.stat)
    pc.run_pmid_crawler(project, sstr, record_number, endwith, endtime)
    msg.msg("crawl pmid", project + sstr, "finished", "succ", "important",
            msg.display, msg.log, msg.stat)
    msg.msg("crawl detail", project + sstr, "started", "succ", "important",
            msg.display, msg.log, msg.stat)
    dc.run_detail_crawler(project, sstr, record_number)
    msg.msg("crawl detail", project + sstr, "finished", "succ", "important",
            msg.display, msg.log, msg.stat)
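# Hypothetical invocation, assuming get_task_config returns
# (record_number, mrmins, endwith) for this project/search-string pair:
#   run_task("test", "lactobacillus")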
def get_data(filename, drop_data=False):
    '''Get pandas.DataFrame from .csv or .csv.zip'''
    msg('Reading datafile : {}'.format(filename))
    if not os.path.exists(filename):
        raise IOError('No filename: {}'.format(filename))
    if filename.endswith('.zip'):
        z = zipfile.ZipFile(filename)
        filename = z.open(filename.replace('.zip', ''))
    data = pandas.read_csv(
        filename,
        parse_dates=['Dates'],
        infer_datetime_format=True,
        comment='#',
    )
    # data.Time = data.Dates.map(lambda x: x.time())
    if drop_data:
        data = data[(data.X < -121) & (data.Y < 40)]
        data = data.dropna()
        data = data.reset_index(drop=True)
    msg('Read datafile : {}'.format(filename), 2)
    return data
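# Example usage sketch; 'train.csv.zip' is a placeholder path, and the
# 'Dates'/X/Y columns are the ones the function above assumes:
#   df = get_data('train.csv.zip', drop_data=True)
#   print(df.Dates.min(), len(df))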
def draw(self):
    if self.perfect:
        # drawn twice with independent wiggle offsets, giving a shaking
        # double-image effect for perfect hits
        msg(self.scr, self.message, wiggle(self.pos, 5 * self.size),
            self.color, self.fontsize * wiggle(self.size, 1, -2) * 1.2,
            centered=True, **self.kwargs)
        msg(self.scr, self.message, wiggle(self.pos, 5 * self.size),
            self.color, self.fontsize * wiggle(self.size, 1, -2) * 1.2,
            centered=True, **self.kwargs)
    else:
        msg(self.scr, self.message, self.pos, self.color,
            self.fontsize * self.size, centered=True, **self.kwargs)
def get_official_name(journal_name_raw, proxy=None):
    # Look up the journal's official full name; the query is fuzzy and only
    # the best match is returned
    url = "http://www.letpub.com.cn/journalappAjax.php?querytype=autojournal&term=" + \
        journal_name_raw.replace("&", "%26").replace(" ", "+")
    tries = config.request_dp_tries
    while tries > 0:
        try:
            opener = requests.Session()
            doc = opener.get(url, timeout=20, headers=agents.get_header()).text
            results = doc.split('},{')  # split the result list; only the best match is used
            journal_name_start = results[0].find("label") + 8
            journal_name_end = results[0].find("\",\"", journal_name_start)
            journal_name = results[0][journal_name_start:journal_name_end]
            journal_name = journal_name.upper()  # fetched names are upper-cased too
            msg.msg("journal name", journal_name_raw, "web retrieved",
                    journal_name, "debug", msg.display)
            return journal_name
        except Exception as e:
            msg.msg("journal name", journal_name_raw, "web retrieved",
                    "retried", "debug", msg.display)
            msg.msg("journal name", journal_name_raw, "web retrieved",
                    str(e), "error", msg.log)
            tries -= 1
            time.sleep(config.request_refresh_wait)
    else:
        msg.msg("journal name", journal_name_raw, "web retrieved", "fail",
                "error", msg.log, msg.display)
        return ""
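# The slicing above assumes the autocomplete endpoint returns a JSON array of
# objects carrying a "label" field. Under that same assumption, json.loads is
# a sturdier way to take the best match (a sketch, not the module's current
# behavior):
import json

def best_label(doc):
    try:
        results = json.loads(doc)
        return results[0]["label"].upper() if results else ""
    except (ValueError, KeyError, IndexError):
        return ""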
def adjust_record_number(project, sstr, record_number):
    # Cap the requested record number at what the search actually returns
    url = parse_url(project, sstr)
    tries = config.request_sp_tries  # retry count from config
    while tries > 0:
        try:
            opener = requests.Session()
            content = opener.get(url, timeout=config.request_time_out,
                                 headers=agents.get_header()).text  # the header can still be randomized
            max_record_number_start = content.find(
                "<h3 class=\"result_count left\">Items:") + 37  # locate the start of the count text
            max_record_number_end = content.find(
                '</h3>', max_record_number_start)
            record_number_str = content[max_record_number_start:max_record_number_end]
            max_record_number = int(record_number_str.split(" ")[-1])
            if max_record_number < record_number:
                record_number = max_record_number
                msg.msg("record number", "", "changed", str(record_number),
                        "notice", msg.log, msg.display)
            return record_number
        except Exception as e:
            msg.msg("record number", "", "read", str(e), "error", msg.log)
            msg.msg("record number", "", "read", "retried", "notice", msg.display)
            tries -= 1
            time.sleep(config.request_refresh_wait)
    else:
        msg.msg("record number", "", "read", "fail", "error",
                msg.display, msg.log)
        return record_number  # fall back to the requested number
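# The fixed offsets above (find(...) + 37) are tied to the exact markup
# '<h3 class="result_count left">Items: ... of N</h3>'. A regex on the same
# marker is less brittle (a sketch under that markup assumption):
def max_records(content):
    m = re.search(r'<h3 class="result_count left">Items:.*?(\d+)\s*</h3>',
                  content)
    return int(m.group(1)) if m else None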
def crawl_direct(project, sstr):
    # Crawl the summary page directly: first page only, but fast since it
    # does not go through phantomjs
    url = parse_url(project, sstr)
    tries = config.request_sp_tries  # retry count from config
    while tries > 0:
        try:
            opener = requests.Session()
            content = opener.get(url, timeout=config.request_time_out,
                                 headers=agents.get_header()).text  # the header can still be randomized
            msg.msg("sum page", "1", "loaded", "proc", "info", msg.display)
            pmid_list = extract_new_pmid(content)  # extract pmids, then drop known ones
            if pmid_list:
                mh.add_new_pmid_many(
                    project, sstr, ut.time_str("full"), "pm", pmid_list)
            msg.msg("sum page", "1", "loaded", "succ", "info",
                    msg.display, msg.log)
            break
        except Exception as e:
            msg.msg("sum page", "1", "loaded", str(e), "error", msg.log)
            msg.msg("sum page", "1", "loaded", "retried", "notice", msg.display)
            tries -= 1
            time.sleep(config.request_refresh_wait)
def extract_new_pmid(content):  # generic pmid extraction from page text
    pmid_list = []
    pmid_raw = re.findall(r"<dd>\d{8}</dd>", content)
    for pmid in pmid_raw:
        pmid = str(pmid[4:-5])  # strip the <dd> and </dd> tags
        msg.msg("pmid", str(pmid), "retrieved", "proc", "info",
                msg.log, msg.display, msg.stat)
        if pmid not in existed_pmid_set:
            pmid_list.append(pmid)
            msg.msg("pmid", str(pmid), "retrieved", "succ", "info",
                    msg.log, msg.display, msg.stat)
        else:
            msg.msg("pmid", str(pmid), "skipped", "skip", "info",
                    msg.log, msg.display, msg.stat)
    return pmid_list
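# Quick illustration on synthetic markup (these pmids are made up):
#   sample = "<dd>12345678</dd><dd>87654321</dd>"
#   re.findall(r"<dd>\d{8}</dd>", sample)
#     -> ['<dd>12345678</dd>', '<dd>87654321</dd>']
# and pmid[4:-5] slices off the "<dd>" / "</dd>" wrappers.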
def get_journal_info(ojournal_name, proxy=None):
    # Look up the journal's impact factor and zone; requires the exact
    # official name
    url = "http://www.letpub.com.cn/index.php?page=journalapp&view=search"
    search_str = {
        "searchname": "",
        "searchissn": "",
        "searchfield": "",
        "searchimpactlow": "",
        "searchimpacthigh": "",
        "searchscitype": "",
        "view": "search",
        "searchcategory1": "",
        "searchcategory2": "",
        "searchjcrkind": "",
        "searchopenaccess": "",
        "searchsort": "relevance"
    }
    search_str["searchname"] = ojournal_name
    tries = config.request_dp_tries
    while tries > 0:
        try:
            opener = requests.Session()
            doc = opener.post(url, timeout=20, data=search_str).text
            selector = etree.HTML(doc.encode("utf-8"))
            journal_detail_element = selector.xpath(
                "//td[@style=\"border:1px #DDD solid; border-collapse:collapse; text-align:left; padding:8px 8px 8px 8px;\"]"
            )
            if len(journal_detail_element):
                impact_factor = journal_detail_element[2].xpath('string(.)')
                publication_zone = journal_detail_element[3].xpath(
                    'string(.)')[0]
            else:
                impact_factor = ""
                publication_zone = ""
            msg.msg("journal info", ojournal_name, "web retrieved", "succ",
                    "debug", msg.display)
            return impact_factor, publication_zone
        except Exception as e:
            msg.msg("journal info", ojournal_name, "web retrieved",
                    "retried", "debug", msg.display)
            msg.msg("journal info", ojournal_name, "web retrieved",
                    str(e), "error", msg.log)
            tries -= 1
            time.sleep(config.request_refresh_wait)
def draw(self):
    p = self.player
    d = p.pos[0] + 120 * p.scoreboardside
    ratio = 0
    try:
        ratio = float(p.hits) / p.arrows_gone_by
        per = str(format(round(ratio * 100, 2), ".2f")) + "%"
    except ZeroDivisionError:  # no arrows fired yet
        per = "%"
    message.msg(self.main.scr, str(p.hits) + "/" + str(p.arrows_gone_by),
                [d, 10], (0, 0, 255), weight=p.scoreboardside)
    if p.arrows_gone_by:
        message.msg(self.main.scr, per, [d, 30],
                    (225 * (1 - ratio), 225 * ratio, 0), size=20,
                    weight=p.scoreboardside)
    message.msg(self.main.scr, p.points, [d, 100], (0, 0, 200),
                weight=p.scoreboardside)
    message.msg(self.main.scr, p.combo, [d, 130], (0, 200, 0),
                weight=p.scoreboardside)
    message.msg(self.main.scr, p.misclicks, [d, 160], (200, 0, 0),
                weight=p.scoreboardside)
    message.msg(self.main.scr, p.misses, [d, 190], (200, 200, 0),
                weight=p.scoreboardside)
    message.msg(self.main.scr, p.totalarrows, [p.pos[0], 20], (100, 200, 230),
                weight=p.scoreboardside)
#!/usr/bin/python3
from message import sendmessage, sendmessageicon, sendmessagetitle, msg

sendmessage("hello you")
sendmessageicon("hello you", "face-wink")
sendmessagetitle("message", "title")
msg("message", "title", "face-angry")
def crawl_detail(pmid, proxy=None):  # crawl one detail page
    link = "https://www.ncbi.nlm.nih.gov/pubmed/" + str(pmid)
    tries = config.request_dp_tries  # retry count from config
    msg.msg("record", pmid, "retrieved", "proc", "info", msg.display, msg.stat)
    while tries > 0:
        try:
            authors = []
            institutes = []
            countries = []
            flinks = []
            opener = requests.Session()  # a fresh session per attempt
            content = opener.get(
                link, timeout=config.request_time_out,
                headers=agents.get_header()).text  # note: the user agent is re-randomized on every request
            selector = etree.HTML(content.encode("utf-8"))
            title_element = selector.xpath(
                "//div[@class = \"rprt abstract\"]//h1")
            if len(title_element):
                title = title_element[0].xpath('string(.)')
            author_element = selector.xpath("//div[@class = \"auths\"]//a")
            if len(author_element):
                for author in author_element:
                    authors.append(author.xpath('string(.)'))
            journal_element = selector.xpath("//a[@alsec=\"jour\"]/@title")
            if len(journal_element):
                journal = journal_element[0]
            if journal:
                journal_detail = jn.journal_detail(journal)
                ojournal = journal_detail[0]
                journal_if = journal_detail[1]
                journal_zone = journal_detail[2]
            abstract_element = selector.xpath(
                "//*[@id=\"maincontent\"]/div/div[5]/div/div[4]")
            if len(abstract_element):
                abstract = abstract_element[0].xpath('string(.)')[8:]
            key_words_element = selector.xpath(
                "//*[@id=\"maincontent\"]/div/div[5]/div/div[5]/p")
            if len(key_words_element):
                key_words = key_words_element[0].xpath('string(.)').split("; ")
            else:
                key_words = []
            issue_element = selector.xpath("//div[@class = \"cit\"]")  # year
            if len(issue_element):
                issue_raw = issue_element[0].xpath('string(.)')
                issue_start = issue_raw.find(".")
                issue = issue_raw[issue_start + 2:issue_start + 6]
            institutes_element = selector.xpath("//div[@class=\"afflist\"]//dd")
            if len(institutes_element):
                for institute in institutes_element:
                    institute = institute.xpath('string(.)')
                    institute = ut.regexp_replace(
                        institute, ut.re_email_pm)  # strip PubMed-style email sentences
                    institute = ut.regexp_replace(
                        institute, ut.re_email_general)  # strip any remaining inline emails
                    institute = institute.replace(" ,", ",")
                    institutes.append(institute)
                    institute = institute.replace(", ", ",").replace(".", "")
                    institute_strs = institute.split(",")
                    institute_strs.reverse()  # the country name usually comes last
                    i = 0
                    while i < len(institute_strs):
                        if institute_strs[i] in dictionary.country_names.keys():  # known token
                            country_name = dictionary.country_names[
                                institute_strs[i]]  # direct lookup
                            if country_name not in countries:
                                countries.append(country_name)
                            break
                        else:
                            i += 1
            flink_element = selector.xpath(
                "//div[@class=\"icons portlet\"]//a/@href")
            if len(flink_element):
                for flink in flink_element:
                    flinks.append(flink)
            mh.add_new_content(pmid, title, authors, journal, ojournal,
                               journal_if, journal_zone, issue, abstract,
                               key_words, institutes, countries, flinks)
            msg.msg("record", pmid, "retrieved", "succ", "info",
                    msg.display, msg.stat)
            break
        except Exception as e:
            msg.msg("record", pmid, "retrieved", "retried", "notice", msg.display)
            msg.msg("record", pmid, "retrieved", str(e), "error", msg.log)
            tries -= 1
            time.sleep(config.request_refresh_wait)  # back off before retrying
    else:
        msg.msg("record", pmid, "retrieved", "fail", "error",
                msg.display, msg.log)
        return 0
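# The affiliation loop above walks the comma-split string backwards because
# the country usually comes last. A standalone sketch of that lookup (the
# sample mapping is an assumption; the real one lives in
# dictionary.country_names):
def find_country(institute, country_names):
    parts = institute.replace(", ", ",").replace(".", "").split(",")
    for part in reversed(parts):
        if part in country_names:
            return country_names[part]
    return None

# find_country("Dept of Medicine, Harvard Medical School, Boston, MA, USA",
#              {"USA": "United States"})  -> "United States"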
def run_detail_crawler(project, sstr, record_number):
    pmid_list = get_pmid_list(project, sstr, record_number)
    pool = Pool(config.detail_crawler_number)  # build the process pool
    pool.map(crawl_detail, pmid_list)
    pool.close()  # no more tasks
    pool.join()   # wait for all workers to finish


if __name__ == '__main__':
    run_detail_crawler("test", "lactobacillus", 1000)
def save_png(browser):
    browser.save_screenshot(
        ut.cur_file_dir() + "/browser/" + ut.time_str("time") + ".png")
    msg.msg("screenshot", "", "saved", "succ", "debug", msg.display, msg.log)