def auto_update_session(self, force=False):
    while True:
        try:
            # Check whether the cookies have been set
            if self.journal_cookies['main'] is None:
                self.idsP_status.value = 6  # cookies invalid, waiting for re-entry
                self.contentP_status.value = 6  # cookies invalid, waiting for re-entry
                time.sleep(1)  # rest for a second, then check again
                raise Exception('Cookies are None or invalid. Please set them.')
            # If the helper already exists and no forced refresh was requested,
            # there is nothing to do; a missing helper means "not initialized",
            # and force means "update anyway".
            if self.ajax_sessionHelper and not force:
                return
            # Check whether the helper has been set
            if self.page_sessionHelper is None:
                self.page_sessionHelper = SessionHelper(
                    header_fun=HeadersHelper.jounal_headers_page,
                    try_proxy=False,
                    cookies=self.journal_cookies['main'])
            # Check whether the cookies are valid
            user_name = heplers.check_cookies_valid(self.page_sessionHelper)
            if not user_name:  # invalid: clear everything tied to the cookies
                self.journal_cookies['main'] = None
                self.page_sessionHelper = None
                self.ajax_sessionHelper = None
                self.kw_name = '<span style="color:red">Invalid cookies, please re-enter</span>'
                raise Exception("These cookies can't be used to log in.")
            self.page_sessionHelper = SessionHelper(
                header_fun=HeadersHelper.jounal_headers_page,
                try_proxy=False,
                cookies=self.journal_cookies['main'])
            self.ajax_sessionHelper = SessionHelper(
                header_fun=HeadersHelper.jounal_headers_ajax,
                try_proxy=False,
                cookies=self.journal_cookies['main'])
            self.kw_name = '<span style="color:green">' + user_name + '</span>'
            logger.log(user=self.TYPE, tag='INFO',
                       info='auto_update_session success!', screen=True)
            break
        except Exception as e:
            logger.log(tag="ERROR", user=self.TYPE,
                       info='Check the cookies failed! ' + str(e), screen=True)
            time.sleep(1)
    logger.log(user=self.TYPE, tag='INFO',
               info='auto_update_session ended', screen=True)
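# The validator called above (heplers.check_cookies_valid) is defined
# elsewhere in the repo. As a rough illustration only -- the profile URL
# and the HTML marker below are hypothetical, not the project's real
# endpoint or markup -- a cookie check of this shape fetches a
# logged-in-only page and extracts the user name, returning a falsy value
# when the login has expired:
import re

import requests


def check_cookies_valid_sketch(session: requests.Session) -> str:
    """Return the logged-in user name, or '' if the cookies are invalid.

    Hypothetical sketch: the URL and the <span class="username"> marker
    are assumptions for illustration.
    """
    rsp = session.get('https://example.com/profile', timeout=10)
    if rsp.status_code != 200:
        return ''
    m = re.search(r'<span class="username">([^<]+)</span>', rsp.text)
    return m.group(1) if m else ''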
def _updateSession(self, ids_max_retry_times=3):
    retry_times = 1
    while retry_times <= ids_max_retry_times:  # cap on the number of retries
        try:
            logger.log(user=self.name, tag='INFO',
                       info='Trying to update the session!...:' + str(retry_times),
                       screen=True)
            query_worker = _scienceIDWorker(
                kw_id=self.kw_id,
                name='SciContentUS-Process')._worker(
                    kw_id=self.kw_id, name='SciContentUS-Thread')
            ids_sessionHelper = SessionHelper(
                header_fun=HeadersHelper.science_headers)
            query_str = query_worker.get_kw_query_str(self.kw_id)
            offset = 0
            query_str = "%s&offset=%d&show=%d" % (
                query_str, offset, spiders.default_science_pagesize)
            response = ids_sessionHelper.get(
                'https://www.sciencedirect.com/search?' + query_str)
            if response.status_code != 200:
                raise Exception('Connection Failed')
            rsp_text = response.text.encode().decode('unicode_escape')
            if self._isBlocked(rsp_text):
                # A blocked page counts as a failed attempt, so the retry
                # cap still applies instead of looping forever.
                raise Exception(
                    'This request has been recognized as a spider and blocked!')
            # Set the Referer header for subsequent requests
            headers = {
                'Referer': query_str,
                'Upgrade-Insecure-Requests': '1'
            }
            ids_sessionHelper.session.headers.update(headers)
            self.sessionHelper = ids_sessionHelper
            logger.log(user=self.name, tag='INFO',
                       info='Update the session successfully.', screen=True)
            return self.sessionHelper
        except Exception as e:
            logger.log(user=self.name, tag='ERROR', info=e, screen=True)
            if not isinstance(e, ProxyError):
                retry_times += 1
            time.sleep(1.0 * random.randrange(1, 1000) / 1000)  # brief pause
    raise Exception('Update the session failed!')
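# _updateSession, _get_page_Num and get_raw_content all share the same
# retry discipline: a ProxyError does not consume an attempt (the proxy
# rotates and the call simply repeats), while any other failure does. A
# minimal, generic sketch of that pattern (retry_call and the names below
# are illustrative, not part of the codebase):
import random
import time


class ProxyError(Exception):
    """Stand-in for the ProxyError the spiders import."""


def retry_call(fn, max_retry_times=3):
    retry_times = 1
    while retry_times <= max_retry_times:
        try:
            return fn()
        except ProxyError:
            pass  # proxy failures are "free": rotate and try again
        except Exception:
            retry_times += 1  # real failures burn one attempt
        time.sleep(random.randrange(1, 1000) / 1000.0)  # jittered pause
    raise Exception('retry_call: all %d attempts failed' % max_retry_times)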
def _get_page_Num(self, ids_sessionHelper=None):
    retry_times = 1
    while retry_times <= spiders.ids_max_retry_times:  # cap on the number of retries
        try:
            logger.log(user=self.name, tag='INFO',
                       info='Trying to get pageNum ...:' + str(retry_times),
                       screen=True)
            if not ids_sessionHelper:
                ids_sessionHelper = SessionHelper(
                    header_fun=HeadersHelper.science_headers)
            query_str = self.get_kw_query_str(self.kw_id)
            offset = 0
            query_str = "%s&show=%d&sortBy=relevance&offset=%d" % (
                query_str, self.page_size, offset)
            response = ids_sessionHelper.get(
                'https://www.sciencedirect.com/search?' + query_str)
            if response.status_code != 200:
                raise Exception('Connection Failed')
            content = response.text.encode().decode('unicode_escape')
            page_num_p = re.compile(r'Page\s[\d]+\sof\s(\d+)</li>', re.I | re.M)
            r = re.search(page_num_p, content)
            page_num = int(r.group(1)) if r else 0
            self.manager.page_Num.value = page_num
            logger.log(user=self.name, tag='INFO',
                       info='Get pageNum:%d successfully.' % page_num,
                       screen=True)
            return page_num
        except Exception as e:
            logger.log(user=self.name, tag='ERROR', info=e, screen=True)
            if not isinstance(e, ProxyError):
                retry_times += 1
            ids_sessionHelper = SessionHelper(
                header_fun=HeadersHelper.science_headers)
            time.sleep(1.0 * random.randrange(1, 1000) / 1000)  # brief pause
    return -1
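# The page count comes from a fragment like "Page 1 of 42</li>" in the
# rendered search page. A quick self-contained check of the regex used
# above (the sample HTML is made up for illustration):
import re

sample = '<li class="pagination-text">Page 1 of 42</li>'
page_num_p = re.compile(r'Page\s[\d]+\sof\s(\d+)</li>', re.I | re.M)
m = re.search(page_num_p, sample)
assert m and int(m.group(1)) == 42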
def get_raw_content(self, article_id, content_sessionHelper=None,
                    max_retry_times=3):
    sessionHelper = content_sessionHelper
    retry_times = 1
    while retry_times <= max_retry_times:  # cap on the number of retries
        try:
            if not content_sessionHelper:
                sessionHelper = SessionHelper(
                    header_fun=HeadersHelper.science_headers)
            rsp = sessionHelper.get(
                'https://www.sciencedirect.com/science/article/pii/' + article_id)
            if rsp.status_code != 200:
                raise Exception('Connection Failed')
            return rsp.text
        except Exception as e:
            if not isinstance(e, ProxyError):
                retry_times += 1
            time.sleep(1.0 * random.randrange(1, 200) / 1000)  # brief pause
    raise Exception('Get %s raw content failed!' % article_id)
def get_raw_content(self, article_id, content_sessionHelper=None,
                    max_retry_times=3):
    sessionHelper = content_sessionHelper
    retry_times = 1
    while retry_times <= max_retry_times:  # cap on the number of retries
        try:
            if not content_sessionHelper:
                sessionHelper = SessionHelper(
                    header_fun=HeadersHelper.pubmed_content_headers)
            xml_rsp = sessionHelper.get(
                'https://www.ncbi.nlm.nih.gov/pubmed/' + str(article_id) +
                '?report=xml&format=text')
            if xml_rsp.status_code != 200:
                raise Exception('Connection Failed')
            return xml_rsp.text
        except Exception as e:
            if not isinstance(e, ProxyError):
                retry_times += 1
            time.sleep(1.0 * random.randrange(1, 2000) / 1000)  # brief pause
    raise Exception('Get raw content failed!')
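# The PubMed variant above boils down to a single GET against the legacy
# report=xml endpoint. A bare-requests equivalent for illustration (the
# PMID is an arbitrary example; the legacy URL has since been superseded
# by https://pubmed.ncbi.nlm.nih.gov/, to which it redirects):
import requests

pmid = 31452104  # arbitrary example PMID
url = 'https://www.ncbi.nlm.nih.gov/pubmed/%d?report=xml&format=text' % pmid
rsp = requests.get(url, timeout=10)
if rsp.status_code == 200:
    xml_str = rsp.text  # raw text handed on to ContentHelper.format_pubmed_xml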
def _get_page_Num(self, ids_sessionHelper=None):
    retry_times = 1
    while retry_times <= spiders.ids_max_retry_times:  # cap on the number of retries
        try:
            logger.log(user=self.name, tag='INFO',
                       info='Trying to get pageNum ...:' + str(retry_times),
                       screen=True)
            if not ids_sessionHelper:
                ids_sessionHelper = SessionHelper(
                    header_fun=HeadersHelper.pubmed_ids_headers, timeout=10)
            query_str = self.get_kw_query_str(self.kw_id)
            data = {
                'term': query_str,
                'EntrezSystem2.PEntrez.PubMed.Pubmed_ResultsPanel.Pubmed_DisplayBar.PageSize':
                    str(self.page_size)
            }
            response = ids_sessionHelper.get(
                url='https://www.ncbi.nlm.nih.gov/pubmed/', params=data)
            if response.status_code != 200:
                raise Exception('Connection Failed')
            rsp_text = response.text
            if self._isBlocked(rsp_text):
                raise Exception(
                    'This request has been recognized as a spider and blocked!')
            lastQueryKey = self._find_lastQueryKey(rsp_text)
            ids_list = self._findIdsList(rsp_text)
            page_num = self._findPageNum(rsp_text, ids_list)
            self.manager.page_Num.value = page_num
            # Remember to carry the lastQueryKey on the session
            ids_sessionHelper.lastQueryKey = lastQueryKey
            logger.log(user=self.name, tag='INFO',
                       info='Get pageNum:%d successfully.' % page_num,
                       screen=True)
            return page_num, ids_sessionHelper
        except Exception as e:
            # Rebuild the session with the PubMed header set before retrying
            ids_sessionHelper = SessionHelper(
                header_fun=HeadersHelper.pubmed_ids_headers, timeout=10)
            logger.log(user=self.name, tag='ERROR', info=e, screen=True)
            if not isinstance(e, ProxyError):
                retry_times += 1
    return -1, None
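# _isBlocked is defined elsewhere in the repo. As a hypothetical sketch of
# what such a check tends to look like -- the marker strings below are
# assumptions, not the project's actual ones -- it scans the response body
# for an abuse/captcha page instead of real search results:
def _isBlocked_sketch(rsp_text: str) -> bool:
    markers = ('have been blocked', 'unusual traffic', 'captcha')
    lowered = rsp_text.lower()
    return any(m in lowered for m in markers)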
def run(self):
    asyncio.set_event_loop(asyncio.new_event_loop())
    ids_sessionHelper = None
    for i in range(10):
        try:
            # Build the session used to fetch the page count
            ids_sessionHelper = SessionHelper(
                header_fun=HeadersHelper.pubmed_ids_headers)
            break
        except Exception:
            time.sleep(2)
    page_worker = self._worker(self.kw_id,
                               name="%s %s" % (self.name, 'PAGE_THREAD'),
                               ids_sessionHelper=ids_sessionHelper)
    page_Num, ids_sessionHelper = page_worker._get_page_Num()
    if page_Num == -1:
        page_worker.manager.idsP_status.value = -2  # task failed
        self.terminate()  # stop the process
        return
    if page_Num == 0:
        page_worker.manager.idsP_status.value = 3  # task finished
        return
    del page_worker
    for cur_p in range(page_Num):
        self.pages_queen.put({'currPage': cur_p + 1, 'retry_times': 0})
    for i in range(self.thread_num):
        name = "%s %s-%02d" % (self.name, 'THREAD', i + 1)
        dt = self._worker(kw_id=self.kw_id, name=name,
                          pages_queen=self.pages_queen,
                          ids_sessionHelper=ids_sessionHelper)
        dt.start()
        self.threads.append(dt)
    # Join the worker threads back into the parent process
    for t in self.threads:
        t.join()
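# The hand-off above is a plain producer/consumer: run() enqueues one
# {'currPage': n, 'retry_times': 0} task per result page, and each worker
# thread drains the queue until a get() times out. A minimal standalone
# model of that flow (no networking; the names are illustrative):
import queue
import threading

pages_queen = queue.Queue()
for cur_p in range(5):
    pages_queen.put({'currPage': cur_p + 1, 'retry_times': 0})


def drain(name):
    while True:
        try:
            task = pages_queen.get(timeout=1)
        except queue.Empty:
            return  # queue drained: the thread exits and join() returns
        print(name, 'fetched page', task['currPage'])


threads = [threading.Thread(target=drain, args=('T%d' % i,)) for i in range(2)]
for t in threads:
    t.start()
for t in threads:
    t.join()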
def _updateSpiderInfo(self):
    retry_times = 1
    while retry_times <= spiders.ids_max_retry_times:  # cap on the number of retries
        try:
            logger.log(user=self.name, tag='INFO',
                       info='Trying to get lastQueryKey...:' + str(retry_times),
                       screen=True)
            # Refresh the session
            self.ids_sessionHelper = SessionHelper(
                header_fun=HeadersHelper.pubmed_ids_headers)
            lastQueryKey, self.ids_sessionHelper = self._getLastQueryKey(
                sessionHelper=self.ids_sessionHelper)
            self.ids_sessionHelper.lastQueryKey = lastQueryKey
            return
        except Exception as e:
            logger.log(user=self.name, tag='ERROR', info=e, screen=True)
            if not isinstance(e, ProxyError):
                retry_times += 1
class _worker(threading.Thread):
    def __init__(self, kw_id, name=None, pages_queen=None,
                 ids_sessionHelper=None,
                 page_size=spiders.default_science_pagesize):
        threading.Thread.__init__(self)
        self.kw_id = kw_id
        self.manager = SPIDERS_STATUS[kw_id]
        self.ids_queen = self.manager.ids_queen
        self.name = name
        self.page_size = page_size
        self.pages_queen = pages_queen
        self.ids_sessionHelper = ids_sessionHelper

    # Build the query string for a keyword
    def get_kw_query_str(self, kw_id):
        try:
            if kw_id in special_kw:
                query_str = "qs=hash"
            else:
                kw_ = SpiderKeyWord.objects.filter(id=kw_id).values()[0]
                query_str = ""
                for key, value in json.loads(kw_['value']).items():
                    if value == '':
                        continue
                    if len(query_str) > 0:
                        query_str += '&'
                    if key == 'articleTypes':
                        value = " ".join(value.keys())
                    query_str += "%s=%s" % (key, value)
            logger.log(user=self.name, tag='INFO',
                       info="query_str:%s !" % query_str, screen=True)
            return query_str
        except Exception as e:
            raise Exception('Error: unable to parse the kw_id! %s' % e)

    def _get_page_Num(self, ids_sessionHelper=None):
        retry_times = 1
        while retry_times <= spiders.ids_max_retry_times:  # cap on the number of retries
            try:
                logger.log(user=self.name, tag='INFO',
                           info='Trying to get pageNum ...:' + str(retry_times),
                           screen=True)
                if not ids_sessionHelper:
                    ids_sessionHelper = SessionHelper(
                        header_fun=HeadersHelper.science_headers)
                query_str = self.get_kw_query_str(self.kw_id)
                offset = 0
                query_str = "%s&show=%d&sortBy=relevance&offset=%d" % (
                    query_str, self.page_size, offset)
                response = ids_sessionHelper.get(
                    'https://www.sciencedirect.com/search?' + query_str)
                if response.status_code != 200:
                    raise Exception('Connection Failed')
                content = response.text.encode().decode('unicode_escape')
                page_num_p = re.compile(r'Page\s[\d]+\sof\s(\d+)</li>',
                                        re.I | re.M)
                r = re.search(page_num_p, content)
                page_num = int(r.group(1)) if r else 0
                self.manager.page_Num.value = page_num
                logger.log(user=self.name, tag='INFO',
                           info='Get pageNum:%d successfully.' % page_num,
                           screen=True)
                return page_num
            except Exception as e:
                logger.log(user=self.name, tag='ERROR', info=e, screen=True)
                if not isinstance(e, ProxyError):
                    retry_times += 1
                ids_sessionHelper = SessionHelper(
                    header_fun=HeadersHelper.science_headers)
                time.sleep(1.0 * random.randrange(1, 1000) / 1000)  # brief pause
        return -1

    def run(self):
        asyncio.set_event_loop(asyncio.new_event_loop())
        if not self.ids_sessionHelper:
            self.ids_sessionHelper = SessionHelper(
                header_fun=HeadersHelper.science_headers)
        while True:
            # Paused: rest for a second, then check again
            if self.manager.idsP_status.value == 2:
                time.sleep(1)
                continue
            # Finished: rest for a second, then check again
            if self.manager.idsP_status.value == 3:
                time.sleep(1)
                continue
            # Terminated: stop the thread
            if self.manager.idsP_status.value == 4:
                break
            task_info = None
            try:
                task_info = self.pages_queen.get(timeout=1)
                currPage = task_info['currPage']
                retry_times = task_info['retry_times']
                if retry_times >= spiders.ids_max_retry_times:
                    raise Exception(
                        "%s: retry_times=%d! This id is labeled as FAILED!" %
                        (currPage, spiders.ids_max_retry_times))
                query_str = self.get_kw_query_str(self.kw_id)
                offset = (currPage - 1) * self.page_size
                query_str = "%s&offset=%d&show=%d" % (query_str, offset,
                                                      self.page_size)
                response = self.ids_sessionHelper.get(
                    'https://www.sciencedirect.com/search?' + query_str)
                if response.status_code != 200:
                    raise Exception('Connection Failed')
                content = response.text.encode().decode('unicode_escape')
                pii_ids_p = re.compile(r'"pii":"([\w\d]+)"', re.I | re.M)
                results = re.findall(pii_ids_p, content)
                for art_id in results:
                    self.ids_queen.put({'id': art_id, 'retry_times': 0})
                    self.manager.update_ids_qsize(1)
                self.manager.update_finished_page_Num()
                logger.log(user=self.name, tag='INFO',
                           info=self.manager.ids_queen_size.value, screen=True)
            except Exception as e:
                # Check whether the whole task is done
                finished_page_Num = self.manager.finished_page_Num.value
                failed_page_Num = self.manager.failed_page_Num.value
                page_Num = self.manager.page_Num.value
                if finished_page_Num + failed_page_Num == page_Num:
                    self.manager.idsP_status.value = 3  # mark as finished
                    continue
                # Put the failed task back into the queue and retry it
                if task_info:
                    retry_times = task_info['retry_times']
                    if retry_times < spiders.ids_max_retry_times:
                        if not isinstance(e, ProxyError):
                            task_info['retry_times'] += 1
                        self.pages_queen.put(task_info)
                    else:
                        # The task has definitely failed; do the follow-up bookkeeping
                        self.manager.update_failed_page_Num()
                    logger.log(user=self.name, tag='ERROR', info=e, screen=True)
                else:
                    pass  # logger.log(user=self.name, tag='INFO', info='Waiting...', screen=True)
                self.ids_sessionHelper = SessionHelper(
                    header_fun=HeadersHelper.science_headers)
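# The integer status values threaded through manager.idsP_status and
# manager.contentP_status are used as magic numbers throughout this file.
# Collected here for reference (an inferred summary based on the checks
# above, not an enum the project itself defines):
from enum import IntEnum


class SpiderStatus(IntEnum):
    FAILED = -2           # page count could not be fetched; process aborted
    PAUSED = 2            # workers idle-loop until resumed
    FINISHED = 3          # all pages/ids accounted for
    TERMINATED = 4        # workers break out of their loops
    INVALID_COOKIES = 6   # waiting for the user to re-enter cookies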
def run(self):
    asyncio.set_event_loop(asyncio.new_event_loop())
    while True:
        # Paused: rest for a second, then check again
        if self.manager.contentP_status.value == 2:
            time.sleep(1)
            continue
        # Finished: rest for a second, then check again
        if self.manager.contentP_status.value == 3:
            time.sleep(1)
            continue
        # Terminated: stop the thread
        if self.manager.contentP_status.value == 4:
            break
        task_info = None
        try:
            task_info = self.ids_queen.get(timeout=1)
            article_id = str(task_info['id'])
            retry_times = int(task_info['retry_times'])
            if retry_times >= spiders.content_max_retry_times:
                raise Exception(
                    '%s: retry_times>=%d! This id is labeled as FAILED!' %
                    (article_id, spiders.content_max_retry_times))
            if ContentHelper.is_in_black_list(article_id):  # skip blacklisted articles
                continue
            self.content_sessionHelper = SessionHelper(
                header_fun=HeadersHelper.pubmed_content_headers)
            details_str = self.get_raw_content(
                article_id=article_id,
                content_sessionHelper=self.content_sessionHelper,
                max_retry_times=1)
            try:
                content_model = ContentHelper.format_pubmed_xml(details_str)
                content_model.status = 0
                content_model.art_id = article_id
                content_model.kw_id = int(self.kw_id)
                content_model.creater = self.manager.create_user_id
                content_model.project = self.project if self.project else self.manager.TYPE
                ContentHelper.content_save(content_model)
            except Exception as e:
                # Dump the raw XML so a failed parse can be inspected later
                txt_path = os.path.join(BASE_DIR, 'test/failed_pub',
                                        article_id + '.xml')
                with open(txt_path, 'w+', encoding='utf-8') as f:
                    f.write(details_str)
                raise e
            self.manager.update_finish()
            info = "%s/%s" % (self.manager.finished_num.value,
                              self.manager.ids_queen_size.value)
            logger.log(user=self.name, tag='INFO', info=info, screen=True)
        except Exception as e:
            # Check whether the whole task is done
            finished_num = self.manager.finished_num.value
            failed_num = self.manager.failed_num.value
            ids_queen_size = self.manager.ids_queen_size.value
            idsP_status = self.manager.idsP_status.value
            if (idsP_status == -2) or (finished_num + failed_num == ids_queen_size
                                       and idsP_status == 3):
                self.manager.contentP_status.value = 3  # mark as finished
                continue
            # Put the failed task back into the queue and retry it
            if task_info:
                retry_times = task_info['retry_times']
                if retry_times < spiders.content_max_retry_times:
                    if not isinstance(e, ProxyError):
                        task_info['retry_times'] += 1
                    self.ids_queen.put(task_info)
                else:
                    # The task has definitely failed; do the follow-up bookkeeping
                    self.manager.update_failed()
                    self.manager.failed_ids_queen.put(task_info)
                    # content_model = Content()
                    # content_model.status = -3
                    # content_model.art_id = str(task_info['id'])
                    # content_model.title = 'Failed to crawl this article'
                    # content_model.kw_id = int(self.kw_id)
                    # content_model.creater = self.manager.create_user_id
                    # content_model.project = self.manager.TYPE
                    # ContentHelper.content_save(content_model)
                logger.log(user=self.name, tag='ERROR', info=e, screen=True)
            else:
                pass  # logger.log(user=self.name, tag='INFO', info='Waiting...', screen=True)
            time.sleep(1.0 * random.randrange(1, 1000) / 1000)  # brief pause
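# The completion check in the except-branch above encodes the pipeline's
# key invariant: the content stage is done exactly when the id stage has
# finished (idsP_status == 3) or aborted (-2), and every queued id is
# either finished or failed. A tiny arithmetic check of that condition
# (the values below are made up for illustration):
finished_num, failed_num, ids_queen_size, idsP_status = 97, 3, 100, 3
content_done = (idsP_status == -2) or (
    finished_num + failed_num == ids_queen_size and idsP_status == 3)
assert content_done  # 97 + 3 == 100 and the id stage reported finished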