Example #1
    def auto_update_session(self, force=False):
        while True:
            try:
                # Check whether the cookies have been set
                if self.journal_cookies['main'] is None:
                    self.idsP_status.value = 6  # invalid cookies, wait for re-entry
                    self.contentP_status.value = 6  # invalid cookies, wait for re-entry
                    time.sleep(1)  # sleep one second, then check again
                    raise Exception(
                        'Cookies are None or invalid. Please set them.')

                # If this is not a forced update, we are done
                if self.ajax_sessionHelper and not force:
                    return

                # Not initialized yet, or a forced update was requested
                # Check whether the helper has been set
                if self.page_sessionHelper is None:
                    self.page_sessionHelper = SessionHelper(
                        header_fun=HeadersHelper.jounal_headers_page,
                        try_proxy=False,
                        cookies=self.journal_cookies['main'])
                # Check whether the cookies are valid
                user_name = heplers.check_cookies_valid(
                    self.page_sessionHelper)
                if not user_name:  # invalid
                    self.journal_cookies['main'] = None  # clear the invalid cookies
                    self.page_sessionHelper = None  # clear the invalid cookies
                    self.ajax_sessionHelper = None  # clear the invalid cookies
                    self.kw_name = '<span style="color:red">Invalid cookies, please re-enter</span>'
                    raise Exception('These cookies cannot be used to log in.')

                self.page_sessionHelper = SessionHelper(
                    header_fun=HeadersHelper.jounal_headers_page,
                    try_proxy=False,
                    cookies=self.journal_cookies['main'])
                self.ajax_sessionHelper = SessionHelper(
                    header_fun=HeadersHelper.jounal_headers_ajax,
                    try_proxy=False,
                    cookies=self.journal_cookies['main'])

                self.kw_name = '<span style="color:green">' + user_name + '</span>'
                logger.log(user=self.TYPE,
                           tag='INFO',
                           info='auto_update_session success!',
                           screen=True)
                break
            except Exception as e:
                logger.log(tag="ERROR",
                           user=self.TYPE,
                           info='Check the cookies failed! ' + str(e),
                           screen=True)
                time.sleep(1)
        logger.log(user=self.TYPE,
                   tag='INFO',
                   info='auto_update_session ended',
                   screen=True)
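
A minimal sketch of how this refresher might be driven, assuming the class above is instantiated as `spider` (the name and the threading choice are illustrative, not part of the project):

    import threading

    # Hypothetical usage: run the cookie/session refresher off the main
    # thread so the spider keeps serving while cookies are re-validated.
    def start_session_refresher(spider):
        t = threading.Thread(target=spider.auto_update_session,
                             kwargs={'force': True},
                             daemon=True)  # don't keep the interpreter alive
        t.start()
        return t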
Example #2
        def _updateSession(self, ids_max_retry_times=3):
            retry_times = 1
            while retry_times <= ids_max_retry_times:  # maximum number of retries
                try:
                    logger.log(user=self.name,
                               tag='INFO',
                               info='Trying to Update the session!...:' +
                               str(retry_times),
                               screen=True)
                    query_worker = _scienceIDWorker(
                        kw_id=self.kw_id, name='SciContentUS-Process')._worker(
                            kw_id=self.kw_id, name='SciContentUS-Thread')

                    ids_sessionHelper = SessionHelper(
                        header_fun=HeadersHelper.science_headers)
                    query_str = query_worker.get_kw_query_str(self.kw_id)
                    offset = 0
                    query_str = "%s&offset=%d&show=%d" % (
                        query_str, offset, spiders.default_science_pagesize)

                    response = ids_sessionHelper.get(
                        'https://www.sciencedirect.com/search?' + query_str)
                    if response.status_code != 200:
                        raise Exception('Connection Failed')

                    rsp_text = response.text.encode().decode('unicode_escape')
                    if self._isBlocked(rsp_text):
                        # Raise instead of continuing: a bare `continue` would
                        # retry forever with the same blocked session and never
                        # advance retry_times.
                        raise Exception(
                            'This request has been recognized as Spider and blocked!'
                        )
                    # Set the Referer header for the follow-up requests
                    headers = {
                        'Referer': query_str,
                        'Upgrade-Insecure-Requests': '1'
                    }
                    ids_sessionHelper.session.headers.update(headers)
                    self.sessionHelper = ids_sessionHelper

                    logger.log(user=self.name,
                               tag='INFO',
                               info='Update the session successfully.',
                               screen=True)
                    return self.sessionHelper
                except Exception as e:
                    logger.log(user=self.name,
                               tag='ERROR',
                               info=e,
                               screen=True)
                    if not isinstance(e, ProxyError):
                        retry_times += 1
                    time.sleep(1.0 * random.randrange(1, 1000) / 1000)  # brief randomized pause
            raise Exception('Update the session failed!')
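
The retry loop above recurs throughout these examples: a bounded number of attempts, ProxyError exempt from the count, and a short jittered sleep between tries. A self-contained sketch of that pattern, with a stand-in ProxyError since the real one comes from requests:

    import random
    import time

    class ProxyError(Exception):
        """Stand-in for requests.exceptions.ProxyError."""

    def retry_call(fn, max_retry_times=3):
        # Call fn() until it succeeds or the retry budget is spent.
        # Proxy failures do not consume an attempt; other errors do.
        retry_times = 1
        while retry_times <= max_retry_times:
            try:
                return fn()
            except Exception as e:
                if not isinstance(e, ProxyError):
                    retry_times += 1
                time.sleep(random.randrange(1, 1000) / 1000.0)  # jittered pause
        raise Exception('All retries failed!')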
Example #3
 def _get_page_Num(self, ids_sessionHelper=None):
     retry_times = 1
     while retry_times <= spiders.ids_max_retry_times:  # maximum number of retries
         try:
             logger.log(user=self.name,
                        tag='INFO',
                        info='Trying to get pageNum ...:' +
                        str(retry_times),
                        screen=True)
             if not ids_sessionHelper:
                 ids_sessionHelper = SessionHelper(
                     header_fun=HeadersHelper.science_headers)
             query_str = self.get_kw_query_str(self.kw_id)
             offset = 0
             query_str = "%s&show=%d&sortBy=relevance&offset=%d" % (
                 query_str, self.page_size, offset)
             response = ids_sessionHelper.get(
                 'https://www.sciencedirect.com/search?' + query_str)
             if response.status_code != 200:
                 raise Exception('Connection Failed')
             content = response.text.encode().decode('unicode_escape')
             page_num_p = re.compile(r'Page\s[\d]+\sof\s(\d+)</li>',
                                     re.I | re.M)
             r = re.search(page_num_p, content)
             page_num = int(r.group(1)) if r else 0
             self.manager.page_Num.value = page_num
             logger.log(user=self.name,
                        tag='INFO',
                        info='Get pageNum:%d successfully.' % page_num,
                        screen=True)
             return page_num
         except Exception as e:
             logger.log(user=self.name,
                        tag='ERROR',
                        info=e,
                        screen=True)
             if not isinstance(e, ProxyError):
                 retry_times += 1
             ids_sessionHelper = SessionHelper(
                 header_fun=HeadersHelper.science_headers)
             time.sleep(1.0 * random.randrange(1, 1000) / 1000)  # brief randomized pause
     return -1
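
A quick self-contained check of the page-count extraction, run against a made-up fragment shaped like the markup the regex expects:

    import re

    # Hypothetical pagination fragment; only its shape matters here.
    content = '<li class="pagination-text">Page 1 of 42</li>'
    page_num_p = re.compile(r'Page\s[\d]+\sof\s(\d+)</li>', re.I | re.M)
    r = re.search(page_num_p, content)
    page_num = int(r.group(1)) if r else 0
    print(page_num)  # -> 42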
Example #4
 def get_raw_content(self,
                     article_id,
                     content_sessionHelper=None,
                     max_retry_times=3):
     sessionHelper = content_sessionHelper
     retry_times = 1
     while retry_times <= max_retry_times:  # maximum number of retries
         try:
             if not content_sessionHelper:
                 sessionHelper = SessionHelper(
                     header_fun=HeadersHelper.science_headers)
             rsp = sessionHelper.get(
                 'https://www.sciencedirect.com/science/article/pii/' +
                 article_id)
             if rsp.status_code != 200:
                 raise Exception('Connection Failed')
             return rsp.text
         except Exception as e:
             if not isinstance(e, ProxyError):
                 retry_times += 1
             time.sleep(1.0 * random.randrange(1, 200) / 1000)  # brief randomized pause
     raise Exception('Get %s raw content failed!' % (article_id))
Example #5
 def get_raw_content(self,
                     article_id,
                     content_sessionHelper=None,
                     max_retry_times=3):
     sessionHelper = content_sessionHelper
     retry_times = 1
     while retry_times <= max_retry_times:  # maximum number of retries
         try:
             if not content_sessionHelper:
                 sessionHelper = SessionHelper(
                     header_fun=HeadersHelper.pubmed_content_headers)
             xml_rsp = sessionHelper.get(
                 'https://www.ncbi.nlm.nih.gov/pubmed/' +
                 str(article_id) + '?report=xml&format=text')
             if xml_rsp.status_code != 200:
                 raise Exception('Connection Failed')
             xml_str = xml_rsp.text
             return xml_str
         except Exception as e:
             if not isinstance(e, ProxyError):
                 retry_times += 1
             time.sleep(1.0 * random.randrange(1, 2000) / 1000)  # brief randomized pause
     raise Exception('Get raw content failed!')
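
Examples #4 and #5 are the same bounded-retry fetch pointed at different hosts. A self-contained sketch of the shared shape using requests directly (the URL, timeout, and retry policy are illustrative; the project's SessionHelper is not used here):

    import random
    import time

    import requests
    from requests.exceptions import ProxyError

    def get_raw_content(url, max_retry_times=3):
        # Bounded-retry GET: non-proxy errors consume an attempt,
        # proxy errors do not, mirroring the examples above.
        retry_times = 1
        while retry_times <= max_retry_times:
            try:
                rsp = requests.get(url, timeout=10)
                if rsp.status_code != 200:
                    raise Exception('Connection Failed')
                return rsp.text
            except Exception as e:
                if not isinstance(e, ProxyError):
                    retry_times += 1
                time.sleep(random.randrange(1, 2000) / 1000.0)
        raise Exception('Get raw content failed!')

    # Hypothetical PMID, following the URL pattern of Example #5:
    # text = get_raw_content(
    #     'https://www.ncbi.nlm.nih.gov/pubmed/12345?report=xml&format=text')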
Example #6
 def _get_page_Num(self, ids_sessionHelper=None):
     retry_times = 1
     while retry_times <= spiders.ids_max_retry_times:  # maximum number of retries
         try:
             logger.log(user=self.name,
                        tag='INFO',
                        info='Trying to get pageNum ...:' +
                        str(retry_times),
                        screen=True)
             if not ids_sessionHelper:
                 ids_sessionHelper = SessionHelper(
                     header_fun=HeadersHelper.pubmed_ids_headers,
                     timeout=10)
             query_str = self.get_kw_query_str(self.kw_id)
             data = {
                 'term':
                 query_str,
                 'EntrezSystem2.PEntrez.PubMed.Pubmed_ResultsPanel.Pubmed_DisplayBar.PageSize':
                 str(self.page_size)
             }
             response = ids_sessionHelper.get(
                 url='https://www.ncbi.nlm.nih.gov/pubmed/',
                 params=data)
             if response.status_code != 200:
                 raise Exception('Connection Failed')
             rsp_text = response.text
             if self._isBlocked(rsp_text):
                 raise Exception(
                     'This request has been recognized as Spider and blocked!'
                 )
             lastQueryKey = self._find_lastQueryKey(response.text)
             ids_list = self._findIdsList(rsp_text)
             page_num = self._findPageNum(rsp_text, ids_list)
             self.manager.page_Num.value = page_num
             ids_sessionHelper.lastQueryKey = lastQueryKey  # remember to store lastQueryKey on the helper
             logger.log(user=self.name,
                        tag='INFO',
                        info='Get pageNum:%d successfully.' % page_num,
                        screen=True)
             return page_num, ids_sessionHelper
         except Exception as e:
             # Rebuild the PubMed session before the next attempt
             ids_sessionHelper = SessionHelper(
                 header_fun=HeadersHelper.pubmed_ids_headers, timeout=10)
             logger.log(user=self.name,
                        tag='ERROR',
                        info=e,
                        screen=True)
             if not isinstance(e, ProxyError):
                 retry_times += 1
     return -1, None
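
The PubMed request above sends its query via params; a self-contained look at the URL this produces, using only the standard library (the term and page size are made up):

    from urllib.parse import urlencode

    data = {
        'term': 'cancer immunotherapy',
        'EntrezSystem2.PEntrez.PubMed.Pubmed_ResultsPanel.'
        'Pubmed_DisplayBar.PageSize': '200',
    }
    print('https://www.ncbi.nlm.nih.gov/pubmed/?' + urlencode(data))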
Example #7
    def run(self):
        asyncio.set_event_loop(asyncio.new_event_loop())
        ids_sessionHelper = None
        for i in range(10):
            try:
                # Build a session for fetching the page count
                ids_sessionHelper = SessionHelper(
                    header_fun=HeadersHelper.pubmed_ids_headers)
                break
            except Exception as e:
                time.sleep(2)

        page_worker = self._worker(self.kw_id,
                                   name="%s %s" % (self.name, 'PAGE_THREAD'),
                                   ids_sessionHelper=ids_sessionHelper)
        page_Num, ids_sessionHelper = page_worker._get_page_Num()
        if page_Num == -1:
            page_worker.manager.idsP_status.value = -2  # task failed
            self.terminate()  # end the process
            return  # end the process

        if page_Num == 0:
            page_worker.manager.idsP_status.value = 3  # task finished
            return  # end the process

        del page_worker

        for cur_p in range(page_Num):
            self.pages_queen.put({'currPage': (cur_p + 1), 'retry_times': 0})

        for i in range(self.thread_num):
            name = "%s %s-%02d" % (self.name, 'THREAD', i + 1)
            dt = self._worker(kw_id=self.kw_id,
                              name=name,
                              pages_queen=self.pages_queen,
                              ids_sessionHelper=ids_sessionHelper)
            dt.start()
            self.threads.append(dt)
        # Join the worker threads back into the parent process
        for t in self.threads:
            t.join()
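
The process above is a standard producer/consumer split: page numbers are produced into a queue, worker threads drain it. A minimal self-contained rendering of that split (all names are illustrative):

    import queue
    import threading

    pages_queue = queue.Queue()
    for cur_p in range(5):  # pretend page_Num == 5
        pages_queue.put({'currPage': cur_p + 1, 'retry_times': 0})

    def worker():
        # Drain the queue; the timeout lets idle workers exit cleanly.
        while True:
            try:
                task = pages_queue.get(timeout=1)
            except queue.Empty:
                break
            print('fetching page', task['currPage'])

    threads = [threading.Thread(target=worker) for _ in range(2)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()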
Example #8
 def _updateSpiderInfo(self):
     retry_times = 1
     while retry_times <= spiders.ids_max_retry_times:  # maximum number of retries
         try:
             logger.log(user=self.name,
                        tag='INFO',
                        info='Trying to get lastQueryKey...:' +
                        str(retry_times),
                        screen=True)
             # Refresh the session
             self.ids_sessionHelper = SessionHelper(
                 header_fun=HeadersHelper.pubmed_ids_headers)
             lastQueryKey, self.ids_sessionHelper = self._getLastQueryKey(
                 sessionHelper=self.ids_sessionHelper)
             self.ids_sessionHelper.lastQueryKey = lastQueryKey
             return
         except Exception as e:
             logger.log(user=self.name,
                        tag='ERROR',
                        info=e,
                        screen=True)
             if not isinstance(e, ProxyError):
                 retry_times += 1
     raise Exception('Update the spider info failed!')  # all retries exhausted
Example #9
    class _worker(threading.Thread):
        def __init__(self,
                     kw_id,
                     name=None,
                     pages_queen=None,
                     ids_sessionHelper=None,
                     page_size=spiders.default_science_pagesize):
            threading.Thread.__init__(self)
            self.kw_id = kw_id
            self.manager = SPIDERS_STATUS[kw_id]
            self.ids_queen = self.manager.ids_queen
            self.name = name
            self.page_size = page_size
            self.pages_queen = pages_queen
            self.ids_sessionHelper = ids_sessionHelper

        # Build the query string for the keyword id
        def get_kw_query_str(self, kw_id):
            try:
                if kw_id in special_kw:
                    query_str = "qs=hash"
                else:
                    kw_ = SpiderKeyWord.objects.filter(id=kw_id).values()[0]
                    query_str = ""
                    for key, value in json.loads(kw_['value']).items():
                        if value == '':
                            continue
                        if len(query_str) > 0:
                            query_str += '&'
                        if key == 'articleTypes':
                            value = " ".join(value.keys())
                        query_str += "%s=%s" % (key, value)
                logger.log(user=self.name,
                           tag='INFO',
                           info="query_str:%s !" % query_str,
                           screen=True)
                return query_str
            except Exception as e:
                raise Exception('Error: unable to parse the kw_id! %s' % e)

        def _get_page_Num(self, ids_sessionHelper=None):
            retry_times = 1
            while retry_times <= spiders.ids_max_retry_times:  # maximum number of retries
                try:
                    logger.log(user=self.name,
                               tag='INFO',
                               info='Trying to get pageNum ...:' +
                               str(retry_times),
                               screen=True)
                    if not ids_sessionHelper:
                        ids_sessionHelper = SessionHelper(
                            header_fun=HeadersHelper.science_headers)
                    query_str = self.get_kw_query_str(self.kw_id)
                    offset = 0
                    query_str = "%s&show=%d&sortBy=relevance&offset=%d" % (
                        query_str, self.page_size, offset)
                    response = ids_sessionHelper.get(
                        'https://www.sciencedirect.com/search?' + query_str)
                    if response.status_code != 200:
                        raise Exception('Connection Failed')
                    content = response.text.encode().decode('unicode_escape')
                    page_num_p = re.compile(r'Page\s[\d]+\sof\s(\d+)</li>',
                                            re.I | re.M)
                    r = re.search(page_num_p, content)
                    page_num = int(r.group(1)) if r else 0
                    self.manager.page_Num.value = page_num
                    logger.log(user=self.name,
                               tag='INFO',
                               info='Get pageNum:%d successfully.' % page_num,
                               screen=True)
                    return page_num
                except Exception as e:
                    logger.log(user=self.name,
                               tag='ERROR',
                               info=e,
                               screen=True)
                    if not isinstance(e, ProxyError):
                        retry_times += 1
                    ids_sessionHelper = SessionHelper(
                        header_fun=HeadersHelper.science_headers)
                    time.sleep(1.0 * random.randrange(1, 1000) / 1000)  # 休息一下
            return -1

        def run(self):
            asyncio.set_event_loop(asyncio.new_event_loop())
            if not self.ids_sessionHelper:
                self.ids_sessionHelper = SessionHelper(
                    header_fun=HeadersHelper.science_headers)
            while True:
                # Check whether the task is paused
                if self.manager.idsP_status.value == 2:
                    time.sleep(1)  # sleep one second, then check again
                    continue

                # Check whether the task is finished
                if self.manager.idsP_status.value == 3:
                    time.sleep(1)  # sleep one second, then check again
                    continue

                # Check whether the task has been terminated
                if self.manager.idsP_status.value == 4:
                    break

                task_info = None
                try:
                    task_info = self.pages_queen.get(timeout=1)
                    currPage = task_info['currPage']
                    retry_times = task_info['retry_times']
                    if (retry_times >= spiders.ids_max_retry_times):
                        raise Exception(
                            "%s: retry_times=%d! This id is labeled as FAILED!"
                            % (currPage, spiders.ids_max_retry_times))

                    query_str = self.get_kw_query_str(self.kw_id)
                    offset = (currPage - 1) * self.page_size
                    query_str = "%s&offset=%d&show=%d" % (query_str, offset,
                                                          self.page_size)
                    response = self.ids_sessionHelper.get(
                        'https://www.sciencedirect.com/search?' + query_str)
                    if response.status_code != 200:
                        raise Exception('Connection Failed')
                    content = response.text.encode().decode('unicode_escape')
                    pii_ids_p = re.compile(r'"pii":"([\w\d]+)"', re.I | re.M)
                    results = re.findall(pii_ids_p, content)
                    for art_id in results:
                        self.ids_queen.put({'id': art_id, 'retry_times': 0})
                        self.manager.update_ids_qsize(1)

                    self.manager.update_finished_page_Num()
                    logger.log(user=self.name,
                               tag='INFO',
                               info=self.manager.ids_queen_size.value,
                               screen=True)
                except Exception as e:
                    # Check whether all pages are done
                    finished_page_Num = self.manager.finished_page_Num.value
                    failed_page_Num = self.manager.failed_page_Num.value
                    page_Num = self.manager.page_Num.value
                    if finished_page_Num + failed_page_Num == page_Num:
                        self.manager.idsP_status.value = 3  # mark the task as finished
                        continue
                    # Put the failed task back on the queue for another attempt
                    if task_info:
                        retry_times = task_info['retry_times']
                        if (retry_times < spiders.ids_max_retry_times):
                            if not isinstance(e, ProxyError):
                                task_info['retry_times'] += 1
                            self.pages_queen.put(task_info)
                        else:  # the task has definitively failed; update the failure counters
                            self.manager.update_failed_page_Num()
                        logger.log(user=self.name,
                                   tag='ERROR',
                                   info=e,
                                   screen=True)
                    else:
                        pass
                        # logger.log(user=self.name, tag='INFO', info='Waiting...', screen=True)
                    self.ids_sessionHelper = SessionHelper(
                        header_fun=HeadersHelper.science_headers)
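
The failure path above re-queues a task until its retry_times budget is spent; a compact self-contained version of that bookkeeping (the queue and limit are illustrative):

    import queue

    MAX_RETRY_TIMES = 3  # stand-in for spiders.ids_max_retry_times
    pages_queue = queue.Queue()

    def handle_failure(task_info):
        # Mirror of the except branch: re-queue while the retry budget
        # remains, otherwise count the task as permanently failed.
        if task_info['retry_times'] < MAX_RETRY_TIMES:
            task_info['retry_times'] += 1
            pages_queue.put(task_info)
        else:
            print('page %d failed for good' % task_info['currPage'])

    handle_failure({'currPage': 1, 'retry_times': 0})  # goes back on the queue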
Example #10
        def run(self):
            asyncio.set_event_loop(asyncio.new_event_loop())
            while True:
                # Check whether the task is paused
                if self.manager.contentP_status.value == 2:  # task paused
                    time.sleep(1)  # sleep one second, then check again
                    continue

                # Check whether the task is finished
                if self.manager.contentP_status.value == 3:
                    time.sleep(1)  # sleep one second, then check again
                    continue

                # Check whether the task has been terminated
                if self.manager.contentP_status.value == 4:
                    break
                task_info = None
                try:
                    task_info = self.ids_queen.get(timeout=1)
                    article_id = str(task_info['id'])
                    retry_times = int(task_info['retry_times'])
                    if (retry_times >= spiders.content_max_retry_times):
                        raise Exception(
                            '%s: retry_times>=%d! This id is labeled as FAILED!'
                            % (article_id, spiders.content_max_retry_times))

                    if ContentHelper.is_in_black_list(
                            article_id):  # skip articles on the blacklist
                        continue
                    self.content_sessionHelper = SessionHelper(
                        header_fun=HeadersHelper.pubmed_content_headers)
                    details_str = self.get_raw_content(
                        article_id=article_id,
                        content_sessionHelper=self.content_sessionHelper,
                        max_retry_times=1)
                    # =============================================================================================
                    try:
                        content_model = ContentHelper.format_pubmed_xml(
                            details_str)
                        content_model.status = 0
                        content_model.art_id = article_id
                        content_model.kw_id = int(self.kw_id)
                        content_model.creater = self.manager.create_user_id
                        content_model.project = self.project if self.project else self.manager.TYPE
                        ContentHelper.content_save(content_model)
                    except Exception as e:
                        txt_path = os.path.join(BASE_DIR, 'test/failed_pub',
                                                article_id + '.xml')
                        with open(txt_path, 'w+', encoding='utf-8') as f:
                            f.write(details_str)
                        raise e

                    # =============================================================================================
                    self.manager.update_finish()
                    info = "%s/%s" % (self.manager.finished_num.value,
                                      self.manager.ids_queen_size.value)
                    logger.log(user=self.name,
                               tag='INFO',
                               info=info,
                               screen=True)
                except Exception as e:
                    # Check whether everything is done
                    finished_num = self.manager.finished_num.value
                    failed_num = self.manager.failed_num.value
                    ids_queen_size = self.manager.ids_queen_size.value
                    idsP_status = self.manager.idsP_status.value
                    if (idsP_status
                            == -2) or (finished_num + failed_num
                                       == ids_queen_size and idsP_status == 3):
                        self.manager.contentP_status.value = 3  # mark the task as finished
                        continue
                    # Put the failed task back on the queue for another attempt
                    if task_info:
                        retry_times = task_info['retry_times']
                        if (retry_times < spiders.content_max_retry_times):
                            if not isinstance(e, ProxyError):
                                task_info['retry_times'] += 1
                            self.ids_queen.put(task_info)
                        else:  # the task has definitively failed; do the follow-up bookkeeping
                            self.manager.update_failed()
                            self.manager.failed_ids_queen.put(task_info)
                            # content_model = Content()
                            # content_model.status = -3
                            # content_model.art_id = str(task_info['id'])
                            # content_model.title = 'Failed to crawl this article'
                            # content_model.kw_id = int(self.kw_id)
                            # content_model.creater = self.manager.create_user_id
                            # content_model.project = self.manager.TYPE
                            # ContentHelper.content_save(content_model)
                        logger.log(user=self.name,
                                   tag='ERROR',
                                   info=e,
                                   screen=True)
                    else:
                        pass
                        # logger.log(user=self.name, tag='INFO', info='Waiting...', screen=True)
                    time.sleep(1.0 * random.randrange(1, 1000) / 1000)  # brief randomized pause
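
The content worker hands the raw report to ContentHelper.format_pubmed_xml. As a rough self-contained illustration of the kind of parsing such a formatter performs, here is a title extraction over a toy fragment of PubMed article XML (the fragment is made up for the demo):

    import xml.etree.ElementTree as ET

    # Toy stand-in for the XML report fetched in Example #5.
    details_str = """<PubmedArticle>
      <MedlineCitation>
        <Article>
          <ArticleTitle>A hypothetical article title</ArticleTitle>
        </Article>
      </MedlineCitation>
    </PubmedArticle>"""

    root = ET.fromstring(details_str)
    print(root.findtext('.//ArticleTitle'))  # -> A hypothetical article title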