def __init__(self, task_name, project_name, key_words, record_number):
    self.key_words = key_words  # input keywords, a comma-separated collection
    self.record_number = record_number  # how many records to crawl
    self.project_name = project_name  # project name
    self.task_name = task_name
    self.request_time_out = 30  # requests timeout
    self.phantomjs_time_out = 60  # PhantomJS timeout
    self.request_refresh_wait = 3  # wait before retrying a request
    self.phantomjs_refresh_wait = 5  # wait before refreshing the browser
    self.tries_request = 3
    self.tries_1st_sp = 3  # attempts to fetch the first summary page
    self.tries_other_sp = 5  # attempts to fetch each of the other pages
    self.sum_page_number = int(math.ceil(
        self.record_number / 20.0))  # 20 records per page; float division so ceil rounds up
    self.url = "https://www.ncbi.nlm.nih.gov/pubmed/?term=" + \
        self.key_words.replace(",", "+")  # the initial query URL
    self.phantomjs_headers = agents.get_header()  # pick one header at random for the browser
    self.pmid_set = mh.read_pmid_all()  # read only once
    # per-instance containers, all cleared whenever a new page is crawled
    self.content = []  # sc_content objects collected from the page
    self.author = []   # authors
    self.journal = []  # journals
    self.title = []    # titles and links
    self.issue = []    # years
    self.pmid = []     # PMIDs
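# A minimal standalone sketch (not part of the class above) of the page-count arithmetic:
# under Python 2, record_number / 20 would be integer division, so the divisor has to be
# a float for math.ceil to round up.
import math

def pages_needed(record_number, per_page=20):
    return int(math.ceil(record_number / float(per_page)))

# pages_needed(40) == 2, pages_needed(41) == 3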
def get_official_name(journal_name_raw, proxy=None):
    # look up the journal's full name; fuzzy matching is supported and only the best hit is returned
    url = "http://www.letpub.com.cn/journalappAjax.php?querytype=autojournal&term=" + \
        journal_name_raw.replace("&", "%26").replace(" ", "+")
    tries = config.request_dp_tries
    while tries > 0:
        try:
            opener = requests.Session()
            doc = opener.get(url, timeout=20, headers=agents.get_header()).text
            candidates = doc.split('},{')  # candidate list; only the best match is used
            journal_name_start = candidates[0].find("label") + 8
            journal_name_end = candidates[0].find("\",\"", journal_name_start)
            journal_name = candidates[0][journal_name_start:journal_name_end]
            journal_name = journal_name.upper()  # the retrieved name is also upper-cased
            msg.msg("journal name", journal_name_raw, "web retrieved",
                    journal_name, "debug", msg.display)
            return journal_name
        except Exception, e:
            msg.msg("journal name", journal_name_raw, "web retrieved",
                    "retried", "debug", msg.display)
            msg.msg("journal name", journal_name_raw, "web retrieved",
                    str(e), "error", msg.log)
            tries -= 1
            time.sleep(config.request_refresh_wait)
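# A self-contained sketch of the parsing step in get_official_name, applied to a canned
# response string; the payload below is invented for illustration, only the "label"
# offset arithmetic mirrors the function above.
sample = '[{"value":"ja","label":"JOURNAL A FULL TITLE","id":"1"},{"value":"jb","label":"JOURNAL B FULL TITLE","id":"2"}]'
best = sample.split('},{')[0]    # keep only the best match
start = best.find("label") + 8   # skip past label":" to the value itself
end = best.find("\",\"", start)  # the value ends at the next ","
print best[start:end].upper()    # -> JOURNAL A FULL TITLE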
def adjust_record_number(project, sstr, record_number):  # determine the real number of available records
    url = parse_url(project, sstr)
    tries = config.request_sp_tries  # try 3 times
    while tries > 0:
        try:
            opener = requests.Session()
            content = opener.get(url, timeout=config.request_time_out,
                                 headers=agents.get_header()).text  # the header can still be random
            max_record_number_start = content.find(
                "<h3 class=\"result_count left\">Items:") + 37  # where the result count starts
            max_record_number_end = content.find('</h3>', max_record_number_start)
            record_number_str = content[max_record_number_start:max_record_number_end]
            max_record_number = int(record_number_str.split(" ")[-1])
            if max_record_number < record_number:  # fewer records exist than requested
                record_number = max_record_number
                msg.msg("record number", "", "changed", str(record_number),
                        "notice", msg.log, msg.display)
            return record_number
        except Exception, e:
            msg.msg("record number", "", "read", str(e), "error", msg.log)
            msg.msg("record number", "", "read", "retried", "notice", msg.display)
            tries -= 1
            time.sleep(config.request_refresh_wait)
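# A self-contained sketch of the capping logic in adjust_record_number, run on an invented
# HTML fragment: the +37 offset skips the '<h3 class="result_count left">Items:' prefix
# plus the following space, leaving text like '1 to 20 of 437' whose last word is the total.
fragment = '<h3 class="result_count left">Items: 1 to 20 of 437</h3>'
start = fragment.find("<h3 class=\"result_count left\">Items:") + 37
end = fragment.find('</h3>', start)
available = int(fragment[start:end].split(" ")[-1])  # -> 437
requested = 1000
print min(requested, available)  # -> 437; never ask for more records than exist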
def crawl_detail(self, pmid):  # crawl the detail page of one record
    link = "https://www.ncbi.nlm.nih.gov/pubmed/" + pmid
    key_words_list = []   # keywords
    institues_list = []   # institution names
    full_links_list = []  # full-text links (downloadable PDFs, not the abstract)
    tries = 3  # try 3 times; report an error if still unsuccessful
    while tries > 0:
        try:
            opener = requests.Session()  # a new session keeps cookies
            doc = opener.get(link, timeout=self.request_time_out,
                             headers=agents.get_header()).text  # the agent is re-randomized on every request
            soup = BeautifulSoup(doc)
            abstract_raw = soup.findAll(name="abstracttext")
            abstract = ut.regexp_replace(str(abstract_raw), ut.re_html)[1:-1]  # clean the abstract right away
            key_words_raw = soup.findAll(name="div", attrs={"class": "keywords"})
            if key_words_raw:  # if there are keywords (many articles have none)
                key_words_raw = str(key_words_raw)[45:-11].replace("; ", ";")
                key_words_list = key_words_raw.split(';')
            institues_raw = soup.findAll(name='dl')
            if institues_raw:  # if there are institutions (most articles have them)
                institues_raw = institues_raw[0]
                institues_raw = re.findall("<dd>.*?</dd>", str(institues_raw))
                for institues in institues_raw:
                    institues_list.append(institues[4:-5])
            full_content = soup.findAll(name='div', attrs={"class": "icons portlet"})
            full_links_raw = re.findall("<a href=.*?ref=", str(full_content))
            if full_links_raw:  # if there are full-text links
                for full_link in full_links_raw:
                    full_links_list.append(full_link[9:-6].replace("&amp;", "&"))
            return abstract, key_words_list, institues_list, full_links_list  # one string and three lists
        except Exception, e:
            tries -= 1
            msg.display(ut.time_str("time"), "retrying record: " + str(pmid) +
                        "; " + str(tries) + " tries left", "notice")
            msg.log(self.task_name, ut.time_str("full"),
                    "retry record: " + str(pmid), "notice")
            msg.log(self.task_name, ut.time_str("full"), str(e), "error")
            time.sleep(self.request_refresh_wait)  # rest a few seconds before retrying
def crawl_name():  # crawl the list of U.S. state abbreviations
    link = "https://en.wikipedia.org/wiki/List_of_U.S._state_abbreviations"
    opener = requests.Session()  # a new session keeps cookies
    content = opener.get(link, timeout=config.request_time_out,
                         headers=agents.get_header()).text  # the agent is re-randomized on every request
    selector = etree.HTML(content.encode("utf-8"))
    name_element = selector.xpath("//*[@id=\"bodyContent\"]//table//tr/td//span")
    print len(name_element)
    for item in name_element:
        print item.xpath('string(.)')
def crawl_direct(self):  # fetch the first summary page with requests only (no PhantomJS), which is fast
    msg.stat("sum_page", "proc")  # mark it as being processed
    tries = self.tries_request  # try 3 times
    while tries > 0:
        try:
            opener = requests.Session()
            raw = opener.get(self.url, timeout=self.request_time_out,
                             headers=agents.get_header()).text  # the header can still be random
            soup = BeautifulSoup(raw)
            number_raw = soup.findAll(name="input", attrs={"id": "pageno"})  # the element holding the page total
            number_start = str(number_raw).find("last=") + 6  # where the total starts
            number_end = str(number_raw).find("\" />")        # where the total ends
            max_number = int(str(number_raw)[number_start:number_end])  # actual maximum, as an integer
            if max_number < self.sum_page_number:  # the real page count is smaller than the computed one
                self.sum_page_number = max_number  # so use the real value; otherwise keep the computed one
                msg.display(ut.time_str("time"),
                            "max sum page changed: " + str(max_number), "notice")
                msg.log(self.task_name, ut.time_str("full"),
                        "changed sum page number: " + str(max_number), "notice")
            msg.display(ut.time_str("time"), "loaded: NO.1 sum page (requests)", "info")
            msg.log(self.task_name, ut.time_str("full"), "load sum page: NO.1 (requests)", "info")
            self.author = soup.findAll(name='p', attrs={"class": "desc"})
            self.journal = soup.findAll(name="span", attrs={'class': 'jrnl'})
            self.title = soup.findAll(name='p', attrs={"class": "title"})
            self.issue = soup.findAll(name="p", attrs={'class': 'details'})
            self.pmid = soup.findAll(name="dd")
            self.generate_record()  # build the records right away
            msg.stat("sum_page", "succ")
            break
        except Exception, e:
            print e
            tries -= 1
            msg.display(ut.time_str("time"), "load retrying: NO.1 sum page (requests); " +
                        str(tries) + " tries left", "notice")
            msg.log(self.task_name, ut.time_str("full"),
                    "retry sum page: NO.1 (requests)", "notice")
            msg.log(self.task_name, ut.time_str("full"), str(e), "error")
def crawl_phantom(project, sstr, record_number, endwith, endtime):
    url = parse_url(project, sstr)
    sum_page_number = int(math.ceil(
        record_number / 200.0))  # pages needed to crawl everything (200 per page); float division so ceil rounds up
    rest_page_number = sum_page_number  # pages left to crawl; equal to the total at the start
    tries_1st_sp = config.phantom_1st_sp_tries
    phantomjs_headers = agents.get_header()  # pick one header at random for the browser
    dcap = dict(DesiredCapabilities.PHANTOMJS)  # set the userAgent
    dcap["phantomjs.page.settings.userAgent"] = (
        phantomjs_headers)  # random each time PhantomJS starts, fixed until the browser closes
    dcap["phantomjs.page.settings.loadImages"] = False  # skip images to speed things up
    sargs = ["--disk-cache=yes", '--ignore-ssl-errors=true']  # enable the disk cache, ignore SSL errors
    # browser = webdriver.PhantomJS(executable_path='C:\Python27\Scripts\phantomjs.exe',
    #                               desired_capabilities=dcap)  # load the browser (Windows)
    path = ut.cur_file_dir() + "/browser/phantomjs"  # path to the browser binary
    pm_browser = webdriver.PhantomJS(
        # executable_path=path, desired_capabilities=dcap, service_args=sargs)  # load the browser
        executable_path=path, desired_capabilities=dcap)  # load the browser
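# A hedged helper sketch (not part of the original module) showing the Selenium calls
# typically used to drive a browser built the way pm_browser is built above; the timeout
# and wait are passed in as plain numbers because the exact config names are assumptions
# here, and browser.quit() is expected to be called once after the last page.
import time

def fetch_summary_page(browser, page_url, load_timeout, settle_wait):
    browser.set_page_load_timeout(load_timeout)  # abort page loads that hang
    browser.get(page_url)                        # load the summary page
    time.sleep(settle_wait)                      # give dynamic content time to settle
    return browser.page_source                   # raw HTML for the usual parsers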
def crawl_direct(project, sstr):  # fetch the first summary page with requests only (no PhantomJS), which is fast
    url = parse_url(project, sstr)
    tries = config.request_sp_tries  # try 3 times
    while tries > 0:
        try:
            opener = requests.Session()
            content = opener.get(url, timeout=config.request_time_out,
                                 headers=agents.get_header()).text  # the header can still be random
            msg.msg("sum page", "1", "loaded", "proc", "info", msg.display)
            pmid_list = extract_new_pmid(content)  # extract PMIDs, then drop the ones already seen
            if pmid_list:
                mh.add_new_pmid_many(project, sstr, ut.time_str("full"), "pm", pmid_list)
            msg.msg("sum page", "1", "loaded", "succ", "info", msg.display, msg.log)
            break
        except Exception, e:
            msg.msg("sum page", "1", "loaded", str(e), "error", msg.log)
            msg.msg("sum page", "1", "loaded", "retried", "notice", msg.display)
            tries -= 1
            time.sleep(config.request_refresh_wait)
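# extract_new_pmid is called above but not defined in this section; the function below is
# a hypothetical illustration only, based on the <dd>PMID</dd> markup that crawl_direct
# scrapes elsewhere, with known_pmids standing in for the set read by mh.read_pmid_all().
import re

def extract_new_pmid_sketch(content, known_pmids):
    candidates = re.findall(r"<dd>(\d+)</dd>", content)     # PMIDs listed on the summary page
    return [p for p in candidates if p not in known_pmids]  # keep only unseen ones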
def crawl_detail(pmid, proxy=None):  # crawl the detail page of one record
    link = "https://www.ncbi.nlm.nih.gov/pubmed/" + str(pmid)
    tries = config.request_dp_tries  # retry as many times as configured
    msg.msg("record", pmid, "retrieved", "proc", "info", msg.display, msg.stat)
    while tries > 0:
        try:
            authors = []
            institues = []
            countries = []
            flinks = []
            # defaults so a field missing from the page cannot raise NameError below
            title = journal = ojournal = journal_if = journal_zone = issue = abstract = ""
            opener = requests.Session()  # a new session keeps cookies
            content = opener.get(link, timeout=config.request_time_out,
                                 headers=agents.get_header()).text  # the agent is re-randomized on every request
            selector = etree.HTML(content.encode("utf-8"))
            title_element = selector.xpath("//div[@class = \"rprt abstract\"]//h1")
            if len(title_element):
                title = title_element[0].xpath('string(.)')
            author_element = selector.xpath("//div[@class = \"auths\"]//a")
            if len(author_element):
                for author in author_element:
                    authors.append(author.xpath('string(.)'))
            journal_element = selector.xpath("//a[@alsec=\"jour\"]/@title")
            if len(journal_element):
                journal = journal_element[0]
                if journal:
                    journal_detail = jn.journal_detail(journal)
                    ojournal = journal_detail[0]
                    journal_if = journal_detail[1]
                    journal_zone = journal_detail[2]
            abstract_element = selector.xpath("//*[@id=\"maincontent\"]/div/div[5]/div/div[4]")
            if len(abstract_element):
                abstract = abstract_element[0].xpath('string(.)')[8:]
            key_words_element = selector.xpath("//*[@id=\"maincontent\"]/div/div[5]/div/div[5]/p")
            if len(key_words_element):
                key_words = key_words_element[0].xpath('string(.)').split("; ")
            else:
                key_words = []
            issue_element = selector.xpath("//div[@class = \"cit\"]")  # publication year
            if len(issue_element):
                issue_raw = issue_element[0].xpath('string(.)')
                issue_start = issue_raw.find(".")
                issue = issue_raw[issue_start + 2:issue_start + 6]
            institues_element = selector.xpath("//div[@class=\"afflist\"]//dd")
            if len(institues_element):
                for institue in institues_element:
                    institue = institue.xpath('string(.)')
                    institue = ut.regexp_replace(institue, ut.re_email_pm)       # strip PubMed e-mail sentences
                    institue = ut.regexp_replace(institue, ut.re_email_general)  # strip any remaining e-mail addresses
                    institue = institue.replace(" ,", ",")
                    institues.append(institue)
                    institue = institue.replace(", ", ",").replace(".", "")
                    institue_strs = institue.split(",")
                    institue_strs.reverse()  # the country name usually comes last
                    i = 0
                    while i < len(institue_strs):
                        if institue_strs[i] in dictionary.country_names.keys():  # a known country name
                            country_name = dictionary.country_names[institue_strs[i]]  # direct lookup
                            if not country_name in countries:
                                countries.append(country_name)
                            break
                        else:
                            i += 1
            flink_element = selector.xpath("//div[@class=\"icons portlet\"]//a/@href")
            if len(flink_element):
                for flink in flink_element:
                    flinks.append(flink)
            mh.add_new_content(pmid, title, authors, journal, ojournal, journal_if,
                               journal_zone, issue, abstract, key_words, institues,
                               countries, flinks)
            msg.msg("record", pmid, "retrieved", "succ", "info", msg.display, msg.stat)
            break
        except Exception, e:
            msg.msg("record", pmid, "retrieved", "retried", "notice", msg.display)
            msg.msg("record", pmid, "retrieved", str(e), "error", msg.log)
            tries -= 1
            time.sleep(config.request_refresh_wait)  # rest a few seconds before retrying
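# A self-contained walk-through of the country lookup in crawl_detail, with a tiny
# stand-in for dictionary.country_names (the real mapping lives in the project's
# dictionary module) and an invented affiliation string.
country_names = {"USA": "United States", "UK": "United Kingdom", "China": "China"}
affiliation = "Department of Biology, Example University, Boston, USA."
parts = affiliation.replace(", ", ",").replace(".", "").split(",")
parts.reverse()  # the country name usually comes last
for token in parts:
    if token in country_names:
        print country_names[token]  # -> United States
        break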