Example #1
 def run(self):
     asyncio.set_event_loop(asyncio.new_event_loop())
     self.waiting_for_app_ready()  # packages can only be imported once Django is ready
     time.sleep(3)
     while True:
         try:
             self.update_once()
             # logger.log(user='******', tag='INFO', info='Spider configs have been updated automatically!', screen=True)
         except Exception as e:
             time.sleep(10)
             logger.log(user='******',
                        tag='ERROR',
                        info=e,
                        screen=True)
         time.sleep(5)
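
Example #1 runs in its own worker thread: it creates a private asyncio event loop, waits for Django to finish loading before touching app imports, then calls update_once() in an endless loop, sleeping longer after a failure. A minimal standalone sketch of that poll-with-backoff pattern, using a hypothetical update_once callable and print in place of the project's logger:

import threading
import time


def run_updater(update_once, interval=5, error_backoff=10):
    # poll update_once() forever, pausing longer after a failure
    while True:
        try:
            update_once()
        except Exception as e:
            print('ERROR:', e)
            time.sleep(error_backoff)
        time.sleep(interval)


threading.Thread(target=run_updater, args=(lambda: None,), daemon=True).start()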
Example #2
 def get_kw_query_str(self, kw_id):
     query_str = ""
     try:
         logic_str = {'none': '', '1': 'AND ', '2': 'OR ', '3': 'NOT '}
         kw_ = SpiderKeyWord.objects.filter(id=kw_id).values()[0]
         for ele in json.loads(kw_['value']):
             if query_str:
                 query_str = "(%s) " % query_str
             query_str += "%s%s" % (logic_str.get(ele['symbol'], ''),
                                    ele['keyword'])
             if ele['field'] != 'All Fields':
                 query_str += "[%s]" % ele['field']
         query_str = unquote(query_str)
         logger.log(user=self.name,
                    tag='INFO',
                    info="query_str:%s !" % query_str,
                    screen=True)
         return query_str
     except Exception as e:
         raise Exception('Error: unable to parse the kw_id! %s' % e)
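
Example #2 builds a PubMed-style boolean query: each new clause wraps everything accumulated so far in parentheses, prefixes the AND/OR/NOT operator mapped from 'symbol', and appends a [Field] suffix unless the field is 'All Fields'. A worked sketch with an assumed keyword JSON (the shape is inferred from the loop):

import json
from urllib.parse import unquote

logic_str = {'none': '', '1': 'AND ', '2': 'OR ', '3': 'NOT '}
value = json.dumps([
    {'symbol': 'none', 'keyword': 'cancer', 'field': 'All Fields'},
    {'symbol': '1', 'keyword': 'therapy', 'field': 'Title'},
])

query_str = ""
for ele in json.loads(value):
    if query_str:
        query_str = "(%s) " % query_str
    query_str += "%s%s" % (logic_str.get(ele['symbol'], ''), ele['keyword'])
    if ele['field'] != 'All Fields':
        query_str += "[%s]" % ele['field']

print(unquote(query_str))  # (cancer) AND therapy[Title]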
Example #3
 def get_kw_query_str(self, kw_id):
     try:
         if kw_id in special_kw:
             query_str = "qs=hash"
         else:
             kw_ = SpiderKeyWord.objects.filter(id=kw_id).values()[0]
             query_str = ""
             for key, value in json.loads(kw_['value']).items():
                 if value == '':
                     continue
                 if len(query_str) > 0:
                     query_str += '&'
                 if key == 'articleTypes':
                     value = " ".join(value.keys())
                 query_str += "%s=%s" % (key, value)
         logger.log(user=self.name,
                    tag='INFO',
                    info="query_str:%s !" % query_str,
                    screen=True)
         return query_str
     except Exception as e:
         raise Exception('Error: unable to parse the kw_id! %s' % e)
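
Example #3 joins non-empty key/value pairs into an "a=b&c=d" query string by hand, flattening the 'articleTypes' sub-dict into its space-joined keys first. The standard library's urlencode performs the same join with URL-encoding on top; a sketch with made-up parameters:

from urllib.parse import urlencode

params = {'qs': 'deep learning', 'articleTypes': 'REV FLA', 'show': '25'}
query_str = '&'.join('%s=%s' % (k, v) for k, v in params.items() if v != '')
print(query_str)          # qs=deep learning&articleTypes=REV FLA&show=25
print(urlencode(params))  # same pairs, URL-encoded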
Example #4
 def _updateSpiderInfo(self):
     retry_times = 1
     while retry_times <= spiders.ids_max_retry_times:  # maximum number of retries
         try:
             logger.log(user=self.name,
                        tag='INFO',
                        info='Trying to get lastQueryKey...:' +
                        str(retry_times),
                        screen=True)
             # refresh the session
             self.ids_sessionHelper = SessionHelper(
                 header_fun=HeadersHelper.pubmed_ids_headers)
             lastQueryKey, self.ids_sessionHelper = self._getLastQueryKey(
                 sessionHelper=self.ids_sessionHelper)
             self.ids_sessionHelper.lastQueryKey = lastQueryKey
             return
         except Exception as e:
             logger.log(user=self.name,
                        tag='ERROR',
                        info=e,
                        screen=True)
             if not isinstance(e, ProxyError):
                 retry_times += 1
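
Example #4 shows the retry policy used throughout this collection: proxy errors are retried for free, while any other exception consumes one of the allowed attempts. A compact sketch of that policy in isolation, with a stand-in ProxyError class (the real code catches requests' exception):

class ProxyError(Exception):
    pass


def retry_call(fn, max_retry_times=3):
    retry_times = 1
    while retry_times <= max_retry_times:
        try:
            return fn()
        except Exception as e:
            if not isinstance(e, ProxyError):
                retry_times += 1  # only non-proxy errors burn an attempt
    raise Exception('all retries exhausted')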
Example #5
    def auto_update_session(self, force=False):
        while True:
            try:
                # check whether the cookies have been set
                if self.journal_cookies['main'] is None:
                    self.idsP_status.value = 6  # cookies invalid, waiting for re-entry
                    self.contentP_status.value = 6  # cookies invalid, waiting for re-entry
                    time.sleep(1)  # sleep one second, then check again
                    raise Exception(
                        'Cookies are None or invalid. Please set them.')

                # if this is not a forced update, we are done
                if self.ajax_sessionHelper and not force:
                    return

                # not initialized yet, or a forced update was requested
                # check whether the helper has been set
                if self.page_sessionHelper is None:
                    self.page_sessionHelper = SessionHelper(
                        header_fun=HeadersHelper.jounal_headers_page,
                        try_proxy=False,
                        cookies=self.journal_cookies['main'])
                # check whether the cookies are still valid
                user_name = heplers.check_cookies_valid(
                    self.page_sessionHelper)
                if not user_name:  # invalid
                    self.journal_cookies['main'] = None  # clear the invalid cookies
                    self.page_sessionHelper = None  # drop the helper built on them
                    self.ajax_sessionHelper = None  # drop the helper built on them
                    self.kw_name = '<span style="color:red">Invalid cookies, please re-enter</span>'
                    raise Exception('These cookies cannot be used to log in.')

                self.page_sessionHelper = SessionHelper(
                    header_fun=HeadersHelper.jounal_headers_page,
                    try_proxy=False,
                    cookies=self.journal_cookies['main'])
                self.ajax_sessionHelper = SessionHelper(
                    header_fun=HeadersHelper.jounal_headers_ajax,
                    try_proxy=False,
                    cookies=self.journal_cookies['main'])

                self.kw_name = '<span style="color:green">' + user_name + '</span>'
                logger.log(user=self.TYPE,
                           tag='INFO',
                           info='auto_update_session success!',
                           screen=True)
                break
            except Exception as e:
                logger.log(tag="ERROR",
                           user=self.TYPE,
                           info='Check the cookies failed! ' + str(e),
                           screen=True)
                time.sleep(1)
        logger.log(user=self.TYPE,
                   tag='INFO',
                   info='auto_update_session ended',
                   screen=True)
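
The shared *_status.value flags in Example #5 steer every worker loop in the examples that follow. Their meanings are inferred from how the code branches on them, so this IntEnum is a reading aid rather than part of the original project:

from enum import IntEnum


class SpiderStatus(IntEnum):
    FAILED = -2        # the whole job failed (checked by the content workers)
    RUNNING = 1
    PAUSED = 2         # workers sleep one second and re-check
    FINISHED = 3
    TERMINATED = 4     # workers break out of their loops
    NEED_COOKIES = 6   # waiting for the user to enter fresh cookies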
Example #6
 def get_proxy(self):
     if not self.use_proxy:
         return None
     while True:
         try:
             if self.proxy_ips and len(self.proxy_ips) > 0:
                 random.shuffle(self.proxy_ips)
                 ip = self.proxy_ips[0]
             elif self.proxy_pool_url and self.proxy_pool_url != '':
                 rsp = requests.get(self.proxy_pool_url)
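                  # a valid "ip:port" is at most 21 characters
                  # ("255.255.255.255:65535"); longer replies are presumably error text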
                 if len(rsp.text) > 21:
                     raise Exception('Get Proxy Failed')
                 else:
                     ip = re.sub(r"\s", "", rsp.text)
             else:
                 return None
             proxies = {'http': 'http://' + ip, 'https': 'http://' + ip}
             return proxies
         except Exception as e:
             logger.log(user='******',
                        tag='ERROR',
                        info=e,
                        screen=True)
             time.sleep(2)
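
The proxies dict returned by get_proxy() maps both schemes to the same HTTP proxy, which is the format requests expects. A quick standalone check that a fetched proxy actually works, with an illustrative address and test URL:

import requests

proxies = {'http': 'http://1.2.3.4:8080', 'https': 'http://1.2.3.4:8080'}
try:
    rsp = requests.get('https://httpbin.org/ip', proxies=proxies, timeout=5)
    print(rsp.json())  # should report the proxy's address
except requests.RequestException as e:
    print('proxy unusable:', e)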
Example #7
        def _updateSession(self, ids_max_retry_times=3):
            retry_times = 1
            while retry_times <= ids_max_retry_times:  # maximum number of retries
                try:
                    logger.log(user=self.name,
                               tag='INFO',
                               info='Trying to Update the session!...:' +
                               str(retry_times),
                               screen=True)
                    query_worker = _scienceIDWorker(
                        kw_id=self.kw_id, name='SciContentUS-Process')._worker(
                            kw_id=self.kw_id, name='SciContentUS-Thread')

                    ids_sessionHelper = SessionHelper(
                        header_fun=HeadersHelper.science_headers)
                    query_str = query_worker.get_kw_query_str(self.kw_id)
                    offset = 0
                    query_str = "%s&offset=%d&show=%d" % (
                        query_str, offset, spiders.default_science_pagesize)

                    response = ids_sessionHelper.get(
                        'https://www.sciencedirect.com/search?' + query_str)
                    if response.status_code != 200:
                        raise Exception('Connection Failed')

                    rsp_text = response.text.encode().decode('unicode_escape')
                    if self._isBlocked(rsp_text):
                        continue
                    # set the Referer header
                    headers = {
                        'Referer': query_str,
                        'Upgrade-Insecure-Requests': '1'
                    }
                    ids_sessionHelper.session.headers.update(headers)
                    self.sessionHelper = ids_sessionHelper

                    logger.log(user=self.name,
                               tag='INFO',
                               info='Update the session successfully.',
                               screen=True)
                    return self.sessionHelper
                except Exception as e:
                    logger.log(user=self.name,
                               tag='ERROR',
                               info=e,
                               screen=True)
                    if not isinstance(e, ProxyError):
                        retry_times += 1
                    time.sleep(1.0 * random.randrange(1, 1000) / 1000)  # brief random pause
            raise Exception('Update the session failed!')
Example #8
 def _get_page_Num(self, ids_sessionHelper=None):
     retry_times = 1
     while retry_times <= spiders.ids_max_retry_times:  # maximum number of retries
         try:
             logger.log(user=self.name,
                        tag='INFO',
                        info='Trying to get pageNum ...:' +
                        str(retry_times),
                        screen=True)
             if not ids_sessionHelper:
                 ids_sessionHelper = SessionHelper(
                     header_fun=HeadersHelper.pubmed_ids_headers,
                     timeout=10)
             query_str = self.get_kw_query_str(self.kw_id)
             data = {
                 'term':
                 query_str,
                 'EntrezSystem2.PEntrez.PubMed.Pubmed_ResultsPanel.Pubmed_DisplayBar.PageSize':
                 str(self.page_size)
             }
             response = ids_sessionHelper.get(
                 url='https://www.ncbi.nlm.nih.gov/pubmed/',
                 params=data)
             if response.status_code != 200:
                 raise Exception('Connection Failed')
             rsp_text = response.text
             if self._isBlocked(rsp_text):
                 raise Exception(
                     'This request has been recognized as Spider and blocked!'
                 )
             lastQueryKey = self._find_lastQueryKey(response.text)
             ids_list = self._findIdsList(rsp_text)
             page_num = self._findPageNum(rsp_text, ids_list)
             self.manager.page_Num.value = page_num
             ids_sessionHelper.lastQueryKey = lastQueryKey  # remember to store lastQueryKey on the helper
             logger.log(user=self.name,
                        tag='INFO',
                        info='Get pageNum:%d successfully.' % page_num,
                        screen=True)
             return page_num, ids_sessionHelper
         except Exception as e:
             # rebuild the session before the next attempt
             ids_sessionHelper = SessionHelper(
                 header_fun=HeadersHelper.pubmed_ids_headers, timeout=10)
             logger.log(user=self.name,
                        tag='ERROR',
                        info=e,
                        screen=True)
             if not isinstance(e, ProxyError):
                 retry_times += 1
     return -1, None
Example #9
 def _get_page_Num(self, ids_sessionHelper=None):
     retry_times = 1
     while retry_times <= spiders.ids_max_retry_times:  # maximum number of retries
         try:
             logger.log(user=self.name,
                        tag='INFO',
                        info='Trying to get pageNum ...:' +
                        str(retry_times),
                        screen=True)
             if not ids_sessionHelper:
                 ids_sessionHelper = SessionHelper(
                     header_fun=HeadersHelper.science_headers)
             query_str = self.get_kw_query_str(self.kw_id)
             offset = 0
             query_str = "%s&show=%d&sortBy=relevance&offset=%d" % (
                 query_str, self.page_size, offset)
             response = ids_sessionHelper.get(
                 'https://www.sciencedirect.com/search?' + query_str)
             if response.status_code != 200:
                 raise Exception('Connection Failed')
             content = response.text.encode().decode('unicode_escape')
             page_num_p = re.compile(r'Page\s[\d]+\sof\s(\d+)</li>',
                                     re.I | re.M)
             r = re.search(page_num_p, content)
             page_num = int(r.group(1)) if r else 0
             self.manager.page_Num.value = page_num
             logger.log(user=self.name,
                        tag='INFO',
                        info='Get pageNum:%d successfully.' % page_num,
                        screen=True)
             return page_num
         except Exception as e:
             logger.log(user=self.name,
                        tag='ERROR',
                        info=e,
                        screen=True)
             if not isinstance(e, ProxyError):
                 retry_times += 1
             ids_sessionHelper = SessionHelper(
                 header_fun=HeadersHelper.science_headers)
             time.sleep(1.0 * random.randrange(1, 1000) / 1000)  # brief random pause
     return -1
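
Example #9 pulls the page count out of the ScienceDirect results page with a regex over the "Page X of Y" pagination text. A standalone check of that regex against a made-up fragment (the surrounding markup is assumed):

import re

content = '<li class="pagination-text">Page 1 of 42</li>'
page_num_p = re.compile(r'Page\s[\d]+\sof\s(\d+)</li>', re.I | re.M)
r = re.search(page_num_p, content)
page_num = int(r.group(1)) if r else 0
print(page_num)  # 42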
Example #10
        def run(self):
            asyncio.set_event_loop(asyncio.new_event_loop())
            self.manager.auto_update_session()
            self._init_data()

            while True:
                # check whether we have been paused
                if self.manager.idsP_status.value == 2:
                    time.sleep(1)  # sleep one second, then check again
                    continue

                # check whether the task has finished
                if self.manager.idsP_status.value == 3:
                    time.sleep(1)  # sleep one second, then check again
                    continue

                # check whether the task is waiting for cookies to be entered
                if self.manager.idsP_status.value == 6:
                    time.sleep(1)  # sleep one second, then check again
                    continue

                # check whether we have been terminated
                if self.manager.idsP_status.value == 4:
                    break

                task_info = None
                try:
                    self.manager.idsP_status.value = 1
                    task_info = self.cls_queen.get(timeout=1)
                    cls_name = task_info['cls_name']
                    retry_times = task_info['retry_times']
                    if (retry_times >= spiders.ids_max_retry_times):
                        raise Exception(
                            "%s: retry_times=%d! This id is labeled as FAILED!"
                            % (cls_name, spiders.ids_max_retry_times))

                    sub_list_file = os.path.join(
                        BASE_DIR, 'cofcoAPP/heplers/journal',
                        spiders.journal_year + "-" + cls_name + '.txt')
                    if os.path.exists(sub_list_file) and spiders.read_cached:
                        with open(sub_list_file, 'r', encoding='utf-8') as f:
                            list_content = f.read()
                        sub_list = re.split('\n', list_content)[:-1]

                        for id_n, target_link in enumerate(sub_list):
                            self.ids_queen.put({
                                'id_n': id_n,
                                'target_link': target_link,
                                'retry_times': 0
                            })
                            self.manager.update_ids_qsize(1)
                        self.manager.update_finished_page_Num()
                        continue

                    currPage = 1
                    page_retried = 0
                    total_Num = 9999  # provisional total number of journals
                    curr_num = 0  # number found so far by paging
                    is_get_last_journal = False  # whether the last journal has been reached
                    while curr_num <= total_Num and not is_get_last_journal:
                        # check whether we have been paused
                        if self.manager.idsP_status.value == 2:
                            time.sleep(1)  # sleep one second, then check again
                            continue
                        # check whether the task has finished
                        if self.manager.idsP_status.value == 3:
                            time.sleep(1)  # sleep one second, then check again
                            continue
                        # check whether the task is waiting for cookies to be entered
                        if self.manager.idsP_status.value == 6:
                            time.sleep(1)  # sleep one second, then check again
                            continue

                        # check whether we have been terminated
                        if self.manager.idsP_status.value == 4:
                            break

                        if (page_retried >= spiders.ids_max_retry_times):
                            currPage += 1
                            continue
                        self.manager.idsP_status.value = 1
                        try:
                            real_pre = str(currPage - 1)
                            # real_pre = '4'
                            real_curr = str(currPage)
                            if currPage == 1:
                                real_curr = ''
                                real_pre = '1'
                            self.data[
                                'ctl00$ContentPlaceHolder1$AspNetPager1_input'] = real_pre
                            self.data['__EVENTARGUMENT'] = real_curr
                            self.data[
                                'ctl00$ContentPlaceHolder1$dplCategory'] = cls_name

                            rsp = self.manager.ajax_sessionHelper.post(
                                'https://www.fenqubiao.com/Core/CategoryList.aspx',
                                data=self.data)
                            if rsp.status_code != 200:
                                raise Exception('Connection Failed!')
                            rsp_text = rsp.text

                            # find the total journal count; this also verifies we fetched the right page
                            re_r = re.search(
                                r"期刊数量共计[\s\S]*?>([\d]+)[\s\S]*?本", rsp_text)
                            if not re_r:
                                raise Exception('Cant find the totalNum')
                            total_Num = int(re_r.group(1))

                            # collect all journal links on this page
                            row_eles = re.findall(
                                r'<tr>\s+<td>([\d]+)</td>[\s\S]*?href="([\s\S]+?)"',
                                rsp_text)
                            id_n = -1
                            fp = open(sub_list_file, 'a+', encoding='utf-8')
                            for ele in row_eles:
                                id_n = int(ele[0])  # sequence number
                                target_link = ele[1]  # detail-page link
                                self.ids_queen.put({
                                    'id_n': id_n,
                                    'target_link': target_link,
                                    'retry_times': 0
                                })
                                fp.write(target_link + '\n')
                                self.manager.update_ids_qsize(1)
                                if int(id_n) == total_Num:
                                    is_get_last_journal = True
                            fp.flush()
                            fp.close()
                            logger.log(
                                user=self.name,
                                tag='INFO',
                                info='%s-%d success! currPage:%d %d/%d' %
                                (cls_name, currPage, currPage, id_n,
                                 total_Num),
                                screen=True)
                            currPage += 1
                            page_retried = 0
                        except Exception as e:
                            page_retried += 1
                            logger.log(user=self.name,
                                       tag='ERROR',
                                       info='%s-%d failed! %s' %
                                       (cls_name, currPage, e),
                                       screen=True)
                            self.manager.auto_update_session(force=True)
                            self._init_data()
                    self.manager.update_finished_page_Num()

                except Exception as e:
                    # check whether everything is done
                    finished_page_Num = self.manager.finished_page_Num.value
                    failed_page_Num = self.manager.failed_page_Num.value
                    page_Num = self.manager.page_Num.value

                    if finished_page_Num + failed_page_Num == page_Num:
                        self.manager.idsP_status.value = 3  # mark the task as finished
                        continue
                    # put the failed task back on the queue and retry it
                    if task_info:
                        retry_times = task_info['retry_times']
                        if (retry_times < spiders.ids_max_retry_times):
                            if not isinstance(e, ProxyError):
                                task_info['retry_times'] += 1
                            self.cls_queen.put(task_info)
                        else:  # the task has definitively failed; run the follow-up handling
                            self.manager.update_failed_page_Num()
                        logger.log(user=self.name,
                                   tag='ERROR',
                                   info=e,
                                   screen=True)
                        self.manager.auto_update_session(force=True)
                        self._init_data()
                    else:
                        pass
                        # logger.log(user=self.name, tag='INFO', info='Waiting...', screen=True)
                time.sleep(1.0 * random.randrange(1, 1000) / 1000)  # brief random pause
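
Example #10's caching step is worth isolating: every crawled detail link is appended to a per-category text file, and a later run with read_cached enabled replays that file into the queue instead of re-paging the site. A reduced sketch with a hypothetical path and a local queue:

import os
import re
from queue import Queue

sub_list_file = 'journal-2020-ONCOLOGY.txt'  # hypothetical cache path
ids_queen = Queue()

if os.path.exists(sub_list_file):
    # replay the cached links instead of re-crawling
    with open(sub_list_file, 'r', encoding='utf-8') as f:
        sub_list = re.split('\n', f.read())[:-1]  # drop the trailing empty entry
    for id_n, target_link in enumerate(sub_list):
        ids_queen.put({'id_n': id_n, 'target_link': target_link,
                       'retry_times': 0})
else:
    # during a crawl, append one link per line as it is found
    with open(sub_list_file, 'a+', encoding='utf-8') as fp:
        for target_link in ['Journal.aspx?id=1', 'Journal.aspx?id=2']:
            fp.write(target_link + '\n')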
Example #11
        def run(self):
            asyncio.set_event_loop(asyncio.new_event_loop())
            self.manager.auto_update_session()
            while True:
                # check whether we have been paused
                if self.manager.contentP_status.value == 2:  # task paused
                    time.sleep(1)  # sleep one second, then check again
                    continue

                # check whether the task has finished
                if self.manager.contentP_status.value == 3:
                    time.sleep(1)  # sleep one second, then check again
                    continue

                # check whether the task is waiting for cookies to be entered
                if self.manager.contentP_status.value == 6:
                    time.sleep(1)  # sleep one second, then check again
                    continue

                # check whether we have been terminated
                if self.manager.contentP_status.value == 4:
                    break

                task_info = None
                try:
                    self.manager.contentP_status.value = 1
                    task_info = self.ids_queen.get(timeout=1)
                    id_n = str(task_info['id_n'])
                    target_link = str(task_info['target_link'])
                    retry_times = int(task_info['retry_times'])

                    if (retry_times >= spiders.content_max_retry_times):
                        raise Exception(
                            '%s: retry_times>=%d! This id is labeled as FAILED!'
                            % (id_n, spiders.content_max_retry_times))

                    rsp = self.manager.page_sessionHelper.get(
                        'https://www.fenqubiao.com/Core/' + target_link)

                    if rsp.status_code != 200:
                        raise Exception('Connection Failed!')
                    rsp_text = rsp.text

                    r = re.search(r'点击按钮开始智能验证', rsp_text)
                    if r:
                        raise Exception('Login failed! Please log in again!')
                    try:
                        journal_model = Journal()
                        journal_model.issn = re.search(
                            r'ISSN[\s\S]*?valueCss">([\s\S]*?)</td>',
                            rsp_text).group(1)
                        journal_model.full_name = re.search(
                            r'期刊全称[\s\S]*?="3">([\s\S]*?)</td>',
                            rsp_text).group(1)
                        journal_model.short_name = re.search(
                            r'期刊简称[\s\S]*?valueCss">([\s\S]*?)</td>',
                            rsp_text).group(1)
                        journal_model.subject = re.search(
                            r'大类[\s\S]*?<td>([\s\S]*?)</td>',
                            rsp_text).group(1)
                        journal_model.journal_zone = re.search(
                            r'大类[\s\S]*?<td>[\s\S]*?center">\s+([\d]+)',
                            rsp_text).group(1)
                        journal_model.impact_factor = re.findall(
                            r'<td>([\d.]+)</td>', rsp_text)[3]
                        journal_model.is_survey = re.search(
                            r'综述:[\s\S]*?valueCss">([\s\S]*?)</td>',
                            rsp_text).group(1)
                        journal_model.is_top = re.search(
                            r'大类[\s\S]*?top width-10[\s\S]*?ter">(\S+?)</td>',
                            rsp_text).group(1)
                        journal_model.total_cited = re.findall(
                            r'<td>([\d.]+)</td>', rsp_text)[6]
                        journal_model.save()
                    except Exception as e:
                        txt_path = os.path.join(BASE_DIR,
                                                'test/faild_journal_details',
                                                target_link + '.txt')
                        with open(txt_path, 'w+', encoding='utf-8') as f:
                            f.write(rsp_text)
                        raise e

                    # =============================================================================================
                    self.manager.update_finish()
                    info = "%s/%s" % (self.manager.finished_num.value,
                                      self.manager.ids_queen_size.value)
                    logger.log(user=self.name,
                               tag='INFO',
                               info=info,
                               screen=True)
                    time.sleep(1.0 * random.randrange(1, 1000) / 1000)  # brief random pause
                except Exception as e:
                    # check whether everything is done
                    finished_num = self.manager.finished_num.value
                    failed_num = self.manager.failed_num.value
                    ids_queen_size = self.manager.ids_queen_size.value
                    idsP_status = self.manager.idsP_status.value
                    # the job failed outright or completed normally
                    if (idsP_status
                            == -2) or (finished_num + failed_num
                                       == ids_queen_size and idsP_status == 3):
                        self.manager.contentP_status.value = 3  # mark the task as finished
                        continue
                    # put the failed task back on the queue and retry it
                    if task_info:
                        retry_times = task_info['retry_times']
                        if (retry_times < spiders.content_max_retry_times):
                            if not isinstance(e, ProxyError):
                                task_info['retry_times'] += 1
                            self.ids_queen.put(task_info)
                        else:  # the task has definitively failed; run the follow-up handling
                            self.manager.update_failed()
                        self.manager.auto_update_session(force=True)
                        logger.log(user=self.name,
                                   tag='ERROR',
                                   info="%s" % (e),
                                   screen=True)
                    else:
                        pass
                        # logger.log(user=self.name, tag='INFO', info='Waiting...', screen=True)
                    time.sleep(1.0 * random.randrange(1, 1000) / 1000)  # brief random pause
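
Examples #11 and #12 share a debugging convention: when the field-extraction regexes fail, the raw HTML is written to disk before the exception is re-raised, so the offending page can be inspected offline. A minimal sketch with a stand-in extraction and path:

import os


def parse_or_dump(rsp_text, dump_path):
    try:
        # stand-in for the chain of re.search(...).group(1) extractions
        return rsp_text.split('<title>')[1].split('</title>')[0]
    except Exception:
        os.makedirs(os.path.dirname(dump_path) or '.', exist_ok=True)
        with open(dump_path, 'w+', encoding='utf-8') as f:
            f.write(rsp_text)  # keep the page that broke the parser
        raise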
Example #12
        def run(self):
            asyncio.set_event_loop(asyncio.new_event_loop())
            while True:
                # check whether we have been paused
                if self.manager.contentP_status.value == 2:  # task paused
                    time.sleep(1)  # sleep one second, then check again
                    continue

                # check whether the task has finished
                if self.manager.contentP_status.value == 3:
                    time.sleep(1)  # sleep one second, then check again
                    continue

                # check whether we have been terminated
                if self.manager.contentP_status.value == 4:
                    break

                task_info = None
                try:
                    task_info = self.ids_queen.get(timeout=1)
                    article_id = str(task_info['id'])
                    retry_times = int(task_info['retry_times'])
                    if (retry_times >= spiders.content_max_retry_times):
                        raise Exception(
                            '%s: retry_times>=%d! This id is labeled as FAILED!'
                            % (article_id, spiders.content_max_retry_times))

                    if ContentHelper.is_in_black_list(
                            article_id):  # skip ids that are on the blacklist
                        continue

                    if not self.sessionHelper:
                        self._updateSession()  # refresh the helper

                    rsp_text = self.get_raw_content(
                        article_id=article_id,
                        content_sessionHelper=self.sessionHelper,
                        max_retry_times=1)
                    if self._isBlocked(rsp_text):  # if forbidden, abandon the current session
                        self.sessionHelper = None
                        raise Exception('This session has been blocked!')

                    details_str = self._find_details_str(rsp_text)
                    # =============================================================================================
                    try:
                        content_model = ContentHelper.format_scicent_details(
                            details_str)
                        content_model.status = 0
                        content_model.art_id = article_id
                        content_model.kw_id = int(self.kw_id)
                        content_model.creater = self.manager.create_user_id
                        content_model.project = self.project if self.project else self.manager.TYPE
                        ContentHelper.content_save(content_model)
                    except Exception as e:
                        txt_path = os.path.join(BASE_DIR,
                                                'test/failed_science',
                                                article_id + '.txt')
                        with open(txt_path, 'w+', encoding='utf-8') as f:
                            f.write(details_str)
                        raise e
                    # =============================================================================================
                    self.manager.update_finish()
                    info = "%s/%s" % (self.manager.finished_num.value,
                                      self.manager.ids_queen_size.value)
                    logger.log(user=self.name,
                               tag='INFO',
                               info=info,
                               screen=True)
                except Exception as e:
                    # check whether everything is done
                    finished_num = self.manager.finished_num.value
                    failed_num = self.manager.failed_num.value
                    ids_queen_size = self.manager.ids_queen_size.value
                    idsP_status = self.manager.idsP_status.value
                    # the job failed outright or completed normally
                    if (idsP_status
                            == -2) or (finished_num + failed_num
                                       == ids_queen_size and idsP_status == 3):
                        self.manager.contentP_status.value = 3  # mark the task as finished
                        continue
                    # put the failed task back on the queue and retry it
                    if task_info:
                        retry_times = task_info['retry_times']
                        if (retry_times < spiders.content_max_retry_times):
                            if not isinstance(e, ProxyError):
                                task_info['retry_times'] += 1
                            self.ids_queen.put(task_info)
                        else:  # the task has definitively failed; run the follow-up handling
                            self.manager.update_failed()
                            self.manager.failed_ids_queen.put(task_info)
                            # content_model = Content()
                            # content_model.status = -3
                            # content_model.art_id = str(task_info['id'])
                            # content_model.title = 'Failed to crawl this article'
                            # content_model.kw_id = int(self.kw_id)
                            # content_model.creater = self.manager.create_user_id
                            # content_model.project = self.manager.TYPE
                            # ContentHelper.content_save(content_model)
                        logger.log(user=self.name,
                                   tag='ERROR',
                                   info=e,
                                   screen=True)
                    else:
                        pass
                        # logger.log(user=self.name, tag='INFO', info='Waiting...', screen=True)
                    time.sleep(1.0 * random.randrange(1, 1000) / 1000)  # brief random pause
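
All the content workers share one failure protocol: a failed task goes back on the queue with retry_times bumped (proxy errors are exempt from the bump), and a task that exhausts its retries is routed to a failed queue instead. A compact sketch of that protocol:

from queue import Queue

MAX_RETRIES = 3
ids_queen, failed_ids_queen = Queue(), Queue()


def handle_failure(task_info, is_proxy_error=False):
    if task_info['retry_times'] < MAX_RETRIES:
        if not is_proxy_error:
            task_info['retry_times'] += 1  # proxy errors retry for free
        ids_queen.put(task_info)           # try again later
    else:
        failed_ids_queen.put(task_info)    # give up on this task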
Example #13
        def run(self):
            asyncio.set_event_loop(asyncio.new_event_loop())
            if not self.ids_sessionHelper:
                self.ids_sessionHelper = SessionHelper(
                    header_fun=HeadersHelper.science_headers)
            while True:
                # 检查是否被暂停
                if self.manager.idsP_status.value == 2:
                    time.sleep(1)  # 歇息一秒,继续检查
                    continue

                # 检查是否任务是否完成
                if self.manager.idsP_status.value == 3:
                    time.sleep(1)  # 歇息一秒,继续检查
                    continue

                # 检查是否时被终止
                if self.manager.idsP_status.value == 4:
                    break

                task_info = None
                try:
                    task_info = self.pages_queen.get(timeout=1)
                    currPage = task_info['currPage']
                    retry_times = task_info['retry_times']
                    if (retry_times >= spiders.ids_max_retry_times):
                        raise Exception(
                            "%s: retry_times=%d! This id is labeled as FAILED!"
                            % (currPage, spiders.ids_max_retry_times))

                    query_str = self.get_kw_query_str(self.kw_id)
                    offset = (currPage - 1) * self.page_size
                    query_str = "%s&offset=%d&show=%d" % (query_str, offset,
                                                          self.page_size)
                    response = self.ids_sessionHelper.get(
                        'https://www.sciencedirect.com/search?' + query_str)
                    if response.status_code != 200:
                        raise Exception('Connection Failed')
                    content = response.text.encode().decode('unicode_escape')
                    pii_ids_p = re.compile(r'"pii":"([\w\d]+)"', re.I | re.M)
                    results = re.findall(pii_ids_p, content)
                    for art_id in results:
                        self.ids_queen.put({'id': art_id, 'retry_times': 0})
                        self.manager.update_ids_qsize(1)

                    self.manager.update_finished_page_Num()
                    logger.log(user=self.name,
                               tag='INFO',
                               info=self.manager.ids_queen_size.value,
                               screen=True)
                except Exception as e:
                    # check whether everything is done
                    finished_page_Num = self.manager.finished_page_Num.value
                    failed_page_Num = self.manager.failed_page_Num.value
                    page_Num = self.manager.page_Num.value
                    if finished_page_Num + failed_page_Num == page_Num:
                        self.manager.idsP_status.value = 3  # mark the task as finished
                        continue
                    # put the failed task back on the queue and retry it
                    if task_info:
                        retry_times = task_info['retry_times']
                        if (retry_times < spiders.ids_max_retry_times):
                            if not isinstance(e, ProxyError):
                                task_info['retry_times'] += 1
                            self.pages_queen.put(task_info)
                        else:  # the task has definitively failed; run the follow-up handling
                            self.manager.update_failed_page_Num()
                        logger.log(user=self.name,
                                   tag='ERROR',
                                   info=e,
                                   screen=True)
                    else:
                        pass
                        # logger.log(user=self.name, tag='INFO', info='Waiting...', screen=True)
                    self.ids_sessionHelper = SessionHelper(
                        header_fun=HeadersHelper.science_headers)
Example #14
        def run(self):
            asyncio.set_event_loop(asyncio.new_event_loop())
            if not self.ids_sessionHelper:
                self._updateSpiderInfo()  # refresh the helper

            while True:
                # check whether we have been paused
                if self.manager.idsP_status.value == 2:
                    time.sleep(1)  # sleep one second, then check again
                    continue

                # check whether the task has finished
                if self.manager.idsP_status.value == 3:
                    time.sleep(1)  # sleep one second, then check again
                    continue

                # check whether we have been terminated
                if self.manager.idsP_status.value == 4:
                    break

                task_info = None
                try:
                    task_info = self.pages_queen.get(timeout=1)
                    currPage = task_info['currPage']
                    retry_times = task_info['retry_times']
                    if (retry_times >= spiders.ids_max_retry_times):
                        raise Exception(
                            "%s: retry_times=%d! This id is labeled as FAILED!"
                            % (currPage, spiders.ids_max_retry_times))
                    rsp_text = self._getPageConent(
                        sessionHelper=self.ids_sessionHelper,
                        lastQueryKey=self.ids_sessionHelper.lastQueryKey,
                        currPage=currPage,
                        page_size=self.page_size)
                    ids_list = self._findIdsList(rsp_text=rsp_text)
                    for pubmed_id in ids_list:
                        self.ids_queen.put({'id': pubmed_id, 'retry_times': 0})
                        self.manager.update_ids_qsize(1)
                    self.manager.update_finished_page_Num()
                    logger.log(user=self.name,
                               tag='INFO',
                               info=self.manager.ids_queen_size.value,
                               screen=True)
                except Exception as e:
                    # traceback.print_exc(e)
                    # check whether everything is done
                    finished_page_Num = self.manager.finished_page_Num.value
                    failed_page_Num = self.manager.failed_page_Num.value
                    page_Num = self.manager.page_Num.value

                    if finished_page_Num + failed_page_Num == page_Num:
                        self.manager.idsP_status.value = 3  # mark the task as finished
                        continue
                    # put the failed task back on the queue and retry it
                    if task_info:
                        retry_times = task_info['retry_times']
                        if (retry_times < spiders.ids_max_retry_times):
                            if not isinstance(e, ProxyError):
                                task_info['retry_times'] += 1
                            self.pages_queen.put(task_info)
                        else:  # the task has definitively failed; run the follow-up handling
                            self.manager.update_failed_page_Num()
                        logger.log(user=self.name,
                                   tag='ERROR',
                                   info=e,
                                   screen=True)
                    else:
                        pass
                        # logger.log(user=self.name, tag='INFO', info='Waiting...', screen=False)
                    self._updateSpiderInfo()  # refresh the helper