def run(self):
    asyncio.set_event_loop(asyncio.new_event_loop())
    self.waiting_for_app_ready()  # Only after Django is ready can the related packages be imported
    time.sleep(3)
    while True:
        try:
            self.update_once()
            # logger.log(user='******', tag='INFO', info='Spider configs have been updated automatically!', screen=True)
        except Exception as e:
            time.sleep(10)
            logger.log(user='******', tag='ERROR', info=e, screen=True)
        time.sleep(5)
def get_kw_query_str(self, kw_id): query_str = "" try: logic_str = {'none': '', '1': 'AND ', '2': 'OR ', '3': 'NOT '} kw_ = SpiderKeyWord.objects.filter(id=kw_id).values()[0] for ele in json.loads(kw_['value']): if (len(query_str) > 0): query_str = "(%s) " % query_str query_str += "%s%s" % (logic_str.get( ele['symbol']), ele['keyword']) if ele['field'] != 'All Fields': query_str += "[%s]" % ele['field'] query_str = unquote(query_str) logger.log(user=self.name, tag='INFO', info="query_str:%s !" % query_str, screen=True) return query_str except Exception as e: raise Exception('Error: unable to parse the kw_id! %s' % e)
def get_kw_query_str(self, kw_id):
    try:
        if kw_id in special_kw:
            query_str = "qs=hash"
        else:
            kw_ = SpiderKeyWord.objects.filter(id=kw_id).values()[0]
            query_str = ""
            for key, value in json.loads(kw_['value']).items():
                if value == '':
                    continue
                if len(query_str) > 0:
                    query_str += '&'
                if key == 'articleTypes':
                    value = " ".join(value.keys())
                query_str += "%s=%s" % (key, value)
        logger.log(user=self.name, tag='INFO',
                   info="query_str:%s !" % query_str, screen=True)
        return query_str
    except Exception as e:
        raise Exception('Error: unable to parse the kw_id! %s' % e)
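# A worked example of the ScienceDirect variant above (illustrative JSON, not
# from the original source): a keyword record such as
#   {"qs": "machine learning", "date": "2019", "articleTypes": {"REV": true, "FLA": true}}
# would be serialized to
#   qs=machine learning&date=2019&articleTypes=REV FLA
# Empty values are skipped, pairs are joined with '&', and for 'articleTypes'
# the dict keys are space-joined into a single value.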
def _updateSpiderInfo(self):
    retry_times = 1
    while retry_times <= spiders.ids_max_retry_times:  # maximum number of retries
        try:
            logger.log(user=self.name, tag='INFO',
                       info='Trying to get lastQueryKey...:' + str(retry_times),
                       screen=True)
            # Refresh the session
            self.ids_sessionHelper = SessionHelper(
                header_fun=HeadersHelper.pubmed_ids_headers)
            lastQueryKey, self.ids_sessionHelper = self._getLastQueryKey(
                sessionHelper=self.ids_sessionHelper)
            self.ids_sessionHelper.lastQueryKey = lastQueryKey
            return
        except Exception as e:
            logger.log(user=self.name, tag='ERROR', info=e, screen=True)
            if not isinstance(e, ProxyError):
                retry_times += 1
def auto_update_session(self, force=False):
    while True:
        try:
            # Check whether the cookies have been set
            if self.journal_cookies['main'] is None:
                self.idsP_status.value = 6  # cookies invalid; wait for new input
                self.contentP_status.value = 6  # cookies invalid; wait for new input
                time.sleep(1)  # rest for a second, then check again
                raise Exception('Cookies are None or invalid. Please set them.')
            # If this is not a forced update and the helper exists, nothing to do;
            # otherwise the helper is uninitialized or a forced update was requested
            if self.ajax_sessionHelper and not force:
                return
            # Check whether the page helper has been set
            if self.page_sessionHelper is None:
                self.page_sessionHelper = SessionHelper(
                    header_fun=HeadersHelper.jounal_headers_page,
                    try_proxy=False,
                    cookies=self.journal_cookies['main'])
            # Check whether the cookies are valid
            user_name = heplers.check_cookies_valid(self.page_sessionHelper)
            if not user_name:  # invalid: clear out the invalid cookies and helpers
                self.journal_cookies['main'] = None
                self.page_sessionHelper = None
                self.ajax_sessionHelper = None
                self.kw_name = '<span style="color:red">Invalid cookies, please re-enter</span>'
                raise Exception('These cookies cannot be used to log in.')
            self.page_sessionHelper = SessionHelper(
                header_fun=HeadersHelper.jounal_headers_page,
                try_proxy=False,
                cookies=self.journal_cookies['main'])
            self.ajax_sessionHelper = SessionHelper(
                header_fun=HeadersHelper.jounal_headers_ajax,
                try_proxy=False,
                cookies=self.journal_cookies['main'])
            self.kw_name = '<span style="color:green">' + user_name + '</span>'
            logger.log(user=self.TYPE, tag='INFO',
                       info='auto_update_session success!', screen=True)
            break
        except Exception as e:
            logger.log(tag="ERROR", user=self.TYPE,
                       info='Check the cookies failed! ' + str(e), screen=True)
            time.sleep(1)
    logger.log(user=self.TYPE, tag='INFO',
               info='auto_update_session ended', screen=True)
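# heplers.check_cookies_valid is expected to return the logged-in user name on
# success and a falsy value otherwise. A minimal sketch of that contract
# (hypothetical probe URL and banner pattern; the real helper may differ):
#
# def check_cookies_valid(session_helper):
#     rsp = session_helper.get('https://www.fenqubiao.com/')  # hypothetical probe URL
#     m = re.search(r'欢迎您[,,]\s*(\S+)', rsp.text)  # hypothetical welcome banner
#     return m.group(1) if m else None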
def get_proxy(self):
    if not self.use_proxy:
        return None
    while True:
        try:
            if self.proxy_ips and len(self.proxy_ips) > 0:
                random.shuffle(self.proxy_ips)
                ip = self.proxy_ips[0]
            elif self.proxy_pool_url and self.proxy_pool_url != '':
                rsp = requests.get(self.proxy_pool_url)
                if len(rsp.text) > 21:
                    raise Exception('Get Proxy Failed')
                else:
                    ip = re.sub(r"\s", "", rsp.text)
            else:
                return None
            proxies = {'http': 'http://' + ip, 'https': 'http://' + ip}
            return proxies
        except Exception as e:
            logger.log(user='******', tag='ERROR', info=e, screen=True)
            time.sleep(2)
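# The dict returned above follows the requests proxy format, e.g. for
# ip = '1.2.3.4:8080' (illustrative value):
#   proxies = {'http': 'http://1.2.3.4:8080', 'https': 'http://1.2.3.4:8080'}
#   requests.get(url, proxies=proxies)
# The 21-character check presumably works because the longest possible
# 'ip:port' string ('255.255.255.255:65535') is exactly 21 characters, so a
# longer body indicates an error page rather than a proxy address.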
def _updateSession(self, ids_max_retry_times=3):
    retry_times = 1
    while retry_times <= ids_max_retry_times:  # maximum number of retries
        try:
            logger.log(user=self.name, tag='INFO',
                       info='Trying to Update the session!...:' + str(retry_times),
                       screen=True)
            query_worker = _scienceIDWorker(
                kw_id=self.kw_id, name='SciContentUS-Process')._worker(
                    kw_id=self.kw_id, name='SciContentUS-Thread')
            ids_sessionHelper = SessionHelper(
                header_fun=HeadersHelper.science_headers)
            query_str = query_worker.get_kw_query_str(self.kw_id)
            offset = 0
            query_str = "%s&offset=%d&show=%d" % (
                query_str, offset, spiders.default_science_pagesize)
            response = ids_sessionHelper.get(
                'https://www.sciencedirect.com/search?' + query_str)
            if response.status_code != 200:
                raise Exception('Connection Failed')
            rsp_text = response.text.encode().decode('unicode_escape')
            if self._isBlocked(rsp_text):
                continue
            # Set the Referer header
            headers = {
                'Referer': query_str,
                'Upgrade-Insecure-Requests': '1'
            }
            ids_sessionHelper.session.headers.update(headers)
            self.sessionHelper = ids_sessionHelper
            logger.log(user=self.name, tag='INFO',
                       info='Update the session successfully.', screen=True)
            return self.sessionHelper
        except Exception as e:
            logger.log(user=self.name, tag='ERROR', info=e, screen=True)
            if not isinstance(e, ProxyError):
                retry_times += 1
            time.sleep(1.0 * random.randrange(1, 1000) / 1000)  # brief pause
    raise Exception('Update the session failed!')
def _get_page_Num(self, ids_sessionHelper=None):
    retry_times = 1
    while retry_times <= spiders.ids_max_retry_times:  # maximum number of retries
        try:
            logger.log(user=self.name, tag='INFO',
                       info='Trying to get pageNum ...:' + str(retry_times),
                       screen=True)
            if not ids_sessionHelper:
                ids_sessionHelper = SessionHelper(
                    header_fun=HeadersHelper.pubmed_ids_headers, timeout=10)
            query_str = self.get_kw_query_str(self.kw_id)
            data = {
                'term': query_str,
                'EntrezSystem2.PEntrez.PubMed.Pubmed_ResultsPanel.Pubmed_DisplayBar.PageSize': str(self.page_size)
            }
            response = ids_sessionHelper.get(
                url='https://www.ncbi.nlm.nih.gov/pubmed/', params=data)
            if response.status_code != 200:
                raise Exception('Connection Failed')
            rsp_text = response.text
            if self._isBlocked(rsp_text):
                raise Exception(
                    'This request has been recognized as Spider and blocked!')
            lastQueryKey = self._find_lastQueryKey(response.text)
            ids_list = self._findIdsList(rsp_text)
            page_num = self._findPageNum(rsp_text, ids_list)
            self.manager.page_Num.value = page_num
            ids_sessionHelper.lastQueryKey = lastQueryKey  # remember to assign lastQueryKey
            logger.log(user=self.name, tag='INFO',
                       info='Get pageNum:%d successfully.' % page_num, screen=True)
            return page_num, ids_sessionHelper
        except Exception as e:
            ids_sessionHelper = SessionHelper(
                header_fun=HeadersHelper.pubmed_ids_headers, timeout=10)
            logger.log(user=self.name, tag='ERROR', info=e, screen=True)
            if not isinstance(e, ProxyError):
                retry_times += 1
    return -1, None
def _get_page_Num(self, ids_sessionHelper=None):
    retry_times = 1
    while retry_times <= spiders.ids_max_retry_times:  # maximum number of retries
        try:
            logger.log(user=self.name, tag='INFO',
                       info='Trying to get pageNum ...:' + str(retry_times),
                       screen=True)
            if not ids_sessionHelper:
                ids_sessionHelper = SessionHelper(
                    header_fun=HeadersHelper.science_headers)
            query_str = self.get_kw_query_str(self.kw_id)
            offset = 0
            query_str = "%s&show=%d&sortBy=relevance&offset=%d" % (
                query_str, self.page_size, offset)
            response = ids_sessionHelper.get(
                'https://www.sciencedirect.com/search?' + query_str)
            if response.status_code != 200:
                raise Exception('Connection Failed')
            content = response.text.encode().decode('unicode_escape')
            page_num_p = re.compile(r'Page\s[\d]+\sof\s(\d+)</li>', re.I | re.M)
            r = re.search(page_num_p, content)
            page_num = int(r.group(1)) if r else 0
            self.manager.page_Num.value = page_num
            logger.log(user=self.name, tag='INFO',
                       info='Get pageNum:%d successfully.' % page_num, screen=True)
            return page_num
        except Exception as e:
            logger.log(user=self.name, tag='ERROR', info=e, screen=True)
            if not isinstance(e, ProxyError):
                retry_times += 1
            ids_sessionHelper = SessionHelper(
                header_fun=HeadersHelper.science_headers)
            time.sleep(1.0 * random.randrange(1, 1000) / 1000)  # brief pause
    return -1
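# A quick doctest-style check of the page-count regex above, against a
# hand-written fragment (the live ScienceDirect markup may differ):
# >>> import re
# >>> re.search(r'Page\s[\d]+\sof\s(\d+)</li>', '<li>Page 1 of 42</li>').group(1)
# '42'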
def run(self):
    asyncio.set_event_loop(asyncio.new_event_loop())
    self.manager.auto_update_session()
    self._init_data()
    while True:
        # Check whether the task has been paused
        if self.manager.idsP_status.value == 2:
            time.sleep(1)  # rest for a second, then check again
            continue
        # Check whether the task has finished
        if self.manager.idsP_status.value == 3:
            time.sleep(1)
            continue
        # Check whether the task is waiting for cookies input
        if self.manager.idsP_status.value == 6:
            time.sleep(1)
            continue
        # Check whether the task has been terminated
        if self.manager.idsP_status.value == 4:
            break
        task_info = None
        try:
            self.manager.idsP_status.value = 1
            task_info = self.cls_queen.get(timeout=1)
            cls_name = task_info['cls_name']
            retry_times = task_info['retry_times']
            if retry_times >= spiders.ids_max_retry_times:
                raise Exception(
                    "%s: retry_times=%d! This id is labeled as FAILED!"
                    % (cls_name, spiders.ids_max_retry_times))
            sub_list_file = os.path.join(
                BASE_DIR, 'cofcoAPP/heplers/journal',
                spiders.journal_year + "-" + cls_name + '.txt')
            if os.path.exists(sub_list_file) and spiders.read_cached:
                with open(sub_list_file, 'r', encoding='utf-8') as f:
                    list_content = f.read()
                sub_list = re.split('\n', list_content)[:-1]
                for id_n, target_link in enumerate(sub_list):
                    self.ids_queen.put({
                        'id_n': id_n,
                        'target_link': target_link,
                        'retry_times': 0
                    })
                    self.manager.update_ids_qsize(1)
                self.manager.update_finished_page_Num()
                continue
            currPage = 1
            page_retried = 0
            total_Num = 9999  # provisional total number of journals
            curr_num = 0  # number of journals found so far by paging
            is_get_last_journal = False  # whether the last journal has been reached
            while curr_num <= total_Num and not is_get_last_journal:
                # Check whether the task has been paused
                if self.manager.idsP_status.value == 2:
                    time.sleep(1)  # rest for a second, then check again
                    continue
                # Check whether the task has finished
                if self.manager.idsP_status.value == 3:
                    time.sleep(1)
                    continue
                # Check whether the task is waiting for cookies input
                if self.manager.idsP_status.value == 6:
                    time.sleep(1)
                    continue
                # Check whether the task has been terminated
                if self.manager.idsP_status.value == 4:
                    break
                if page_retried >= spiders.ids_max_retry_times:
                    currPage += 1
                    continue
                self.manager.idsP_status.value = 1
                try:
                    real_pre = str(currPage - 1)
                    real_curr = str(currPage)
                    if currPage == 1:
                        real_curr = ''
                        real_pre = '1'
                    self.data['ctl00$ContentPlaceHolder1$AspNetPager1_input'] = real_pre
                    self.data['__EVENTARGUMENT'] = real_curr
                    self.data['ctl00$ContentPlaceHolder1$dplCategory'] = cls_name
                    rsp = self.manager.ajax_sessionHelper.post(
                        'https://www.fenqubiao.com/Core/CategoryList.aspx',
                        data=self.data)
                    if rsp.status_code != 200:
                        raise Exception('Connection Failed!')
                    rsp_text = rsp.text
                    # Find the total journal count; this also verifies we got the right page
                    re_r = re.search(r"期刊数量共计[\s\S]*?>([\d]+)[\s\S]*?本", rsp_text)
                    if not re_r:
                        raise Exception('Cannot find the totalNum')
                    total_Num = int(re_r.group(1))
                    # Collect all detail links on this page
                    row_eles = re.findall(
                        r'<tr>\s+<td>([\d]+)</td>[\s\S]*?href="([\s\S]+?)"',
                        rsp_text)
                    id_n = -1
                    fp = open(sub_list_file, 'a+', encoding='utf-8')
                    for ele in row_eles:
                        id_n = int(ele[0])  # row number
                        target_link = ele[1]  # detail-page link
                        self.ids_queen.put({
                            'id_n': id_n,
                            'target_link': target_link,
                            'retry_times': 0
                        })
                        fp.write(target_link + '\n')
                        self.manager.update_ids_qsize(1)
                        if int(id_n) == total_Num:
                            is_get_last_journal = True
                    fp.flush()
                    fp.close()
                    logger.log(
                        user=self.name, tag='INFO',
                        info='%s-%d success! currPage:%d %d/%d'
                        % (cls_name, currPage, currPage, id_n, total_Num),
                        screen=True)
                    currPage += 1
                    page_retried = 0
                except Exception as e:
                    page_retried += 1
                    logger.log(user=self.name, tag='ERROR',
                               info='%s-%d failed! \n%s' % (cls_name, currPage, e),
                               screen=True)
                    self.manager.auto_update_session(force=True)
                    self._init_data()
            self.manager.update_finished_page_Num()
        except Exception as e:
            # Check whether the whole task has finished
            finished_page_Num = self.manager.finished_page_Num.value
            failed_page_Num = self.manager.failed_page_Num.value
            page_Num = self.manager.page_Num.value
            if finished_page_Num + failed_page_Num == page_Num:
                self.manager.idsP_status.value = 3  # mark as finished
                continue
            # Put the failed task back into the queue and retry
            if task_info:
                retry_times = task_info['retry_times']
                if retry_times < spiders.ids_max_retry_times:
                    if not isinstance(e, ProxyError):
                        task_info['retry_times'] += 1
                    self.cls_queen.put(task_info)
                else:
                    # The task has definitively failed; run the follow-up bookkeeping
                    self.manager.update_failed_page_Num()
                logger.log(user=self.name, tag='ERROR', info=e, screen=True)
                self.manager.auto_update_session(force=True)
                self._init_data()
            else:
                pass
                # logger.log(user=self.name, tag='INFO', info='Waiting...', screen=True)
        time.sleep(1.0 * random.randrange(1, 1000) / 1000)  # brief pause
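# The CategoryList.aspx pagination above drives an ASP.NET AspNetPager postback.
# A sketch of the fields sent for page N (N > 1), as assembled by the code:
#   ctl00$ContentPlaceHolder1$AspNetPager1_input = str(N - 1)  # the page navigated from
#   __EVENTARGUMENT = str(N)                                   # the page requested
#   ctl00$ContentPlaceHolder1$dplCategory = cls_name           # the subject category
# For page 1 the code instead sends AspNetPager1_input='1' with an empty
# __EVENTARGUMENT, presumably matching the control's initial state.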
def run(self):
    asyncio.set_event_loop(asyncio.new_event_loop())
    self.manager.auto_update_session()
    while True:
        # Check whether the task has been paused
        if self.manager.contentP_status.value == 2:
            time.sleep(1)  # rest for a second, then check again
            continue
        # Check whether the task has finished
        if self.manager.contentP_status.value == 3:
            time.sleep(1)
            continue
        # Check whether the task is waiting for cookies input
        if self.manager.contentP_status.value == 6:
            time.sleep(1)
            continue
        # Check whether the task has been terminated
        if self.manager.contentP_status.value == 4:
            break
        task_info = None
        try:
            self.manager.contentP_status.value = 1
            task_info = self.ids_queen.get(timeout=1)
            id_n = str(task_info['id_n'])
            target_link = str(task_info['target_link'])
            retry_times = int(task_info['retry_times'])
            if retry_times >= spiders.content_max_retry_times:
                raise Exception(
                    '%s: retry_times>=%d! This id is labeled as FAILED!'
                    % (id_n, spiders.content_max_retry_times))
            rsp = self.manager.page_sessionHelper.get(
                'https://www.fenqubiao.com/Core/' + target_link)
            if rsp.status_code != 200:
                raise Exception('Connection Failed!')
            rsp_text = rsp.text
            # The site shows a captcha prompt ("click the button to start verification")
            # when the session is no longer logged in
            r = re.search(r'点击按钮开始智能验证', rsp_text)
            if r:
                raise Exception('Login failed! Please log in again!')
            try:
                journal_model = Journal()
                journal_model.issn = re.search(
                    r'ISSN[\s\S]*?valueCss">([\s\S]*?)</td>', rsp_text).group(1)
                journal_model.full_name = re.search(
                    r'期刊全称[\s\S]*?="3">([\s\S]*?)</td>', rsp_text).group(1)
                journal_model.short_name = re.search(
                    r'期刊简称[\s\S]*?valueCss">([\s\S]*?)</td>', rsp_text).group(1)
                journal_model.subject = re.search(
                    r'大类[\s\S]*?<td>([\s\S]*?)</td>', rsp_text).group(1)
                journal_model.journal_zone = re.search(
                    r'大类[\s\S]*?<td>[\s\S]*?center">\s+([\d]+)', rsp_text).group(1)
                journal_model.impact_factor = re.findall(
                    r'<td>([\d.]+)</td>', rsp_text)[3]
                journal_model.is_survey = re.search(
                    r'综述:[\s\S]*?valueCss">([\s\S]*?)</td>', rsp_text).group(1)
                journal_model.is_top = re.search(
                    r'大类[\s\S]*?top width-10[\s\S]*?ter">(\S+?)</td>', rsp_text).group(1)
                journal_model.total_cited = re.findall(
                    r'<td>([\d.]+)</td>', rsp_text)[6]
                journal_model.save()
            except Exception as e:
                txt_path = os.path.join(BASE_DIR, 'test/faild_journal_details',
                                        target_link + '.txt')
                with open(txt_path, 'w+', encoding='utf-8') as f:
                    f.write(rsp_text)
                raise e
            # =============================================================================
            self.manager.update_finish()
            info = "%s/%s" % (self.manager.finished_num.value,
                              self.manager.ids_queen_size.value)
            logger.log(user=self.name, tag='INFO', info=info, screen=True)
            time.sleep(1.0 * random.randrange(1, 1000) / 1000)  # brief pause
        except Exception as e:
            # Check whether the whole task has finished
            finished_num = self.manager.finished_num.value
            failed_num = self.manager.failed_num.value
            ids_queen_size = self.manager.ids_queen_size.value
            idsP_status = self.manager.idsP_status.value
            # The task failed outright or completed normally
            if (idsP_status == -2) or (finished_num + failed_num == ids_queen_size
                                       and idsP_status == 3):
                self.manager.contentP_status.value = 3  # mark as finished
                continue
            # Put the failed task back into the queue and retry
            if task_info:
                retry_times = task_info['retry_times']
                if retry_times < spiders.content_max_retry_times:
                    if not isinstance(e, ProxyError):
                        task_info['retry_times'] += 1
                    self.ids_queen.put(task_info)
                else:
                    # The task has definitively failed; run the follow-up bookkeeping
                    self.manager.update_failed()
                self.manager.auto_update_session(force=True)
                logger.log(user=self.name, tag='ERROR', info="%s" % e, screen=True)
            else:
                pass
                # logger.log(user=self.name, tag='INFO', info='Waiting...', screen=True)
            time.sleep(1.0 * random.randrange(1, 1000) / 1000)  # brief pause
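# A quick doctest-style check of the field-extraction regexes above, against a
# hand-written fragment (the live fenqubiao markup may differ):
# >>> import re
# >>> frag = '<td>ISSN</td><td class="valueCss">0028-0836</td>'
# >>> re.search(r'ISSN[\s\S]*?valueCss">([\s\S]*?)</td>', frag).group(1)
# '0028-0836'
# Note that the positional lookups such as re.findall(r'<td>([\d.]+)</td>', rsp_text)[3]
# depend on the column order of the detail table and will silently pick the
# wrong value if the page layout changes.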
def run(self):
    asyncio.set_event_loop(asyncio.new_event_loop())
    while True:
        # Check whether the task has been paused
        if self.manager.contentP_status.value == 2:
            time.sleep(1)  # rest for a second, then check again
            continue
        # Check whether the task has finished
        if self.manager.contentP_status.value == 3:
            time.sleep(1)
            continue
        # Check whether the task has been terminated
        if self.manager.contentP_status.value == 4:
            break
        task_info = None
        try:
            task_info = self.ids_queen.get(timeout=1)
            article_id = str(task_info['id'])
            retry_times = int(task_info['retry_times'])
            if retry_times >= spiders.content_max_retry_times:
                raise Exception(
                    '%s: retry_times>=%d! This id is labeled as FAILED!'
                    % (article_id, spiders.content_max_retry_times))
            if ContentHelper.is_in_black_list(article_id):  # skip blacklisted ids
                continue
            if not self.sessionHelper:
                self._updateSession()  # refresh the helper
            rsp_text = self.get_raw_content(
                article_id=article_id,
                content_sessionHelper=self.sessionHelper,
                max_retry_times=1)
            if self._isBlocked(rsp_text):
                # This session has been forbidden; discard it
                self.sessionHelper = None
                raise Exception('This session has been blocked!')
            details_str = self._find_details_str(rsp_text)
            # =============================================================================
            try:
                content_model = ContentHelper.format_scicent_details(details_str)
                content_model.status = 0
                content_model.art_id = article_id
                content_model.kw_id = int(self.kw_id)
                content_model.creater = self.manager.create_user_id
                content_model.project = self.project if self.project else self.manager.TYPE
                ContentHelper.content_save(content_model)
            except Exception as e:
                txt_path = os.path.join(BASE_DIR, 'test/failed_science',
                                        article_id + '.txt')
                with open(txt_path, 'w+', encoding='utf-8') as f:
                    f.write(details_str)
                raise e
            # =============================================================================
            self.manager.update_finish()
            info = "%s/%s" % (self.manager.finished_num.value,
                              self.manager.ids_queen_size.value)
            logger.log(user=self.name, tag='INFO', info=info, screen=True)
        except Exception as e:
            # Check whether the whole task has finished
            finished_num = self.manager.finished_num.value
            failed_num = self.manager.failed_num.value
            ids_queen_size = self.manager.ids_queen_size.value
            idsP_status = self.manager.idsP_status.value
            # The task failed outright or completed normally
            if (idsP_status == -2) or (finished_num + failed_num == ids_queen_size
                                       and idsP_status == 3):
                self.manager.contentP_status.value = 3  # mark as finished
                continue
            # Put the failed task back into the queue and retry
            if task_info:
                retry_times = task_info['retry_times']
                if retry_times < spiders.content_max_retry_times:
                    if not isinstance(e, ProxyError):
                        task_info['retry_times'] += 1
                    self.ids_queen.put(task_info)
                else:
                    # The task has definitively failed; run the follow-up bookkeeping
                    self.manager.update_failed()
                    self.manager.failed_ids_queen.put(task_info)
                    # content_model = Content()
                    # content_model.status = -3
                    # content_model.art_id = str(task_info['id'])
                    # content_model.title = 'Failed to crawl this article'
                    # content_model.kw_id = int(self.kw_id)
                    # content_model.creater = self.manager.create_user_id
                    # content_model.project = self.manager.TYPE
                    # ContentHelper.content_save(content_model)
                logger.log(user=self.name, tag='ERROR', info=e, screen=True)
            else:
                pass
                # logger.log(user=self.name, tag='INFO', info='Waiting...', screen=True)
            time.sleep(1.0 * random.randrange(1, 1000) / 1000)  # brief pause
def run(self):
    asyncio.set_event_loop(asyncio.new_event_loop())
    if not self.ids_sessionHelper:
        self.ids_sessionHelper = SessionHelper(
            header_fun=HeadersHelper.science_headers)
    while True:
        # Check whether the task has been paused
        if self.manager.idsP_status.value == 2:
            time.sleep(1)  # rest for a second, then check again
            continue
        # Check whether the task has finished
        if self.manager.idsP_status.value == 3:
            time.sleep(1)
            continue
        # Check whether the task has been terminated
        if self.manager.idsP_status.value == 4:
            break
        task_info = None
        try:
            task_info = self.pages_queen.get(timeout=1)
            currPage = task_info['currPage']
            retry_times = task_info['retry_times']
            if retry_times >= spiders.ids_max_retry_times:
                raise Exception(
                    "%s: retry_times=%d! This id is labeled as FAILED!"
                    % (currPage, spiders.ids_max_retry_times))
            query_str = self.get_kw_query_str(self.kw_id)
            offset = (currPage - 1) * self.page_size
            query_str = "%s&offset=%d&show=%d" % (query_str, offset, self.page_size)
            response = self.ids_sessionHelper.get(
                'https://www.sciencedirect.com/search?' + query_str)
            if response.status_code != 200:
                raise Exception('Connection Failed')
            content = response.text.encode().decode('unicode_escape')
            pii_ids_p = re.compile(r'"pii":"([\w\d]+)"', re.I | re.M)
            results = re.findall(pii_ids_p, content)
            for art_id in results:
                self.ids_queen.put({'id': art_id, 'retry_times': 0})
                self.manager.update_ids_qsize(1)
            self.manager.update_finished_page_Num()
            logger.log(user=self.name, tag='INFO',
                       info=self.manager.ids_queen_size.value, screen=True)
        except Exception as e:
            # Check whether the whole task has finished
            finished_page_Num = self.manager.finished_page_Num.value
            failed_page_Num = self.manager.failed_page_Num.value
            page_Num = self.manager.page_Num.value
            if finished_page_Num + failed_page_Num == page_Num:
                self.manager.idsP_status.value = 3  # mark as finished
                continue
            # Put the failed task back into the queue and retry
            if task_info:
                retry_times = task_info['retry_times']
                if retry_times < spiders.ids_max_retry_times:
                    if not isinstance(e, ProxyError):
                        task_info['retry_times'] += 1
                    self.pages_queen.put(task_info)
                else:
                    # The task has definitively failed; run the follow-up bookkeeping
                    self.manager.update_failed_page_Num()
                logger.log(user=self.name, tag='ERROR', info=e, screen=True)
            else:
                pass
                # logger.log(user=self.name, tag='INFO', info='Waiting...', screen=True)
            self.ids_sessionHelper = SessionHelper(
                header_fun=HeadersHelper.science_headers)
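# A quick doctest-style check of the PII regex above (the sample id is illustrative):
# >>> import re
# >>> re.findall(r'"pii":"([\w\d]+)"', '{"pii":"S0149763419308686"}')
# ['S0149763419308686']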
def run(self):
    asyncio.set_event_loop(asyncio.new_event_loop())
    if not self.ids_sessionHelper:
        self._updateSpiderInfo()  # refresh the helper
    while True:
        # Check whether the task has been paused
        if self.manager.idsP_status.value == 2:
            time.sleep(1)  # rest for a second, then check again
            continue
        # Check whether the task has finished
        if self.manager.idsP_status.value == 3:
            time.sleep(1)
            continue
        # Check whether the task has been terminated
        if self.manager.idsP_status.value == 4:
            break
        task_info = None
        try:
            task_info = self.pages_queen.get(timeout=1)
            currPage = task_info['currPage']
            retry_times = task_info['retry_times']
            if retry_times >= spiders.ids_max_retry_times:
                raise Exception(
                    "%s: retry_times=%d! This id is labeled as FAILED!"
                    % (currPage, spiders.ids_max_retry_times))
            rsp_text = self._getPageConent(
                sessionHelper=self.ids_sessionHelper,
                lastQueryKey=self.ids_sessionHelper.lastQueryKey,
                currPage=currPage,
                page_size=self.page_size)
            ids_list = self._findIdsList(rsp_text=rsp_text)
            for pubmed_id in ids_list:
                self.ids_queen.put({'id': pubmed_id, 'retry_times': 0})
                self.manager.update_ids_qsize(1)
            self.manager.update_finished_page_Num()
            logger.log(user=self.name, tag='INFO',
                       info=self.manager.ids_queen_size.value, screen=True)
        except Exception as e:
            # traceback.print_exc(e)
            # Check whether the whole task has finished
            finished_page_Num = self.manager.finished_page_Num.value
            failed_page_Num = self.manager.failed_page_Num.value
            page_Num = self.manager.page_Num.value
            if finished_page_Num + failed_page_Num == page_Num:
                self.manager.idsP_status.value = 3  # mark as finished
                continue
            # Put the failed task back into the queue and retry
            if task_info:
                retry_times = task_info['retry_times']
                if retry_times < spiders.ids_max_retry_times:
                    if not isinstance(e, ProxyError):
                        task_info['retry_times'] += 1
                    self.pages_queen.put(task_info)
                else:
                    # The task has definitively failed; run the follow-up bookkeeping
                    self.manager.update_failed_page_Num()
                logger.log(user=self.name, tag='ERROR', info=e, screen=True)
            else:
                pass
                # logger.log(user=self.name, tag='INFO', info='Waiting...', screen=False)
            self._updateSpiderInfo()  # refresh the helper