Example #1
 def process_exception(self, request, exception, spider):
     ex_class = global_object_name(exception.__class__)
     ticker = request.meta["ticker"]
     self.stats.inc_value('downloader/exception_count', spider=spider)
     self.stats.inc_value('downloader/exception_type_count/%s' % ex_class,
                          spider=spider)
     self.stats.set_value(f'downloader/my_errors/{ticker}', ex_class)
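Most snippets in this listing override methods of Scrapy's RetryMiddleware or another downloader middleware, and all of them rely on global_object_name from scrapy.utils.python plus a stats object wired in from the crawler. A minimal, assumed skeleton that such overrides could be dropped into (the class name and the stats attribute are illustrative, not taken from any of the examples):

    # Assumed skeleton only; class name and stats wiring are illustrative.
    from scrapy.downloadermiddlewares.retry import RetryMiddleware
    from scrapy.utils.python import global_object_name

    class CustomRetryMiddleware(RetryMiddleware):

        @classmethod
        def from_crawler(cls, crawler):
            mw = cls(crawler.settings)
            mw.stats = crawler.stats  # exposed as self.stats in the process_exception examples
            return mw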
Example #2
    def _retry(self, request, reason, spider):

        # Increment this proxy's failure tag in the proxy list (one more failure recorded)
        self.add_proxy_tag(request.meta.get('proxy', ''))
        # Check the failure count and delete the proxy once it has failed too many times
        now_proxy = request.meta.get('proxy', '')
        # strip the scheme from "http://host:port" to recover "host:port"
        now_ip_port = now_proxy.split(':')[1][2:] + ':' + now_proxy.split(
            ':')[2]
        for each in IP_PORT_LIST:
            if each != '':
                if now_ip_port == each[0] and each[1] == 8:
                    self.delete_proxy(request.meta.get('proxy', False))
                    self.delete_list_proxy(request.meta.get('proxy', False))

        retries = request.meta.get('retry_times', 0) + 1

        retry_times = self.max_retry_times

        if 'max_retry_times' in request.meta:
            retry_times = request.meta['max_retry_times']

        stats = spider.crawler.stats
        if retries <= retry_times:
            # Delete the proxy and drop it from the list (disabled)
            # self.delete_proxy(request.meta.get('proxy', False))
            # self.delete_list_proxy(request.meta.get('proxy', False))

            logger.debug(
                "Retrying %(request)s (failed %(retries)d times): %(reason)s",
                {
                    'request': request,
                    'retries': retries,
                    'reason': reason
                },
                extra={'spider': spider})
            retryreq = request.copy()
            retryreq.meta['retry_times'] = retries
            retryreq.dont_filter = True
            retryreq.priority = request.priority + self.priority_adjust

            if isinstance(reason, Exception):
                reason = global_object_name(reason.__class__)

            stats.inc_value('retry/count')
            stats.inc_value('retry/reason_count/%s' % reason)
            return retryreq
        else:
            # Delete the proxy and drop it from the list (disabled)
            # self.delete_proxy(request.meta.get('proxy', False))
            # self.delete_list_proxy(request.meta.get('proxy', False))

            stats.inc_value('retry/max_reached')
            logger.debug(
                "Gave up retrying %(request)s (failed %(retries)d times): %(reason)s",
                {
                    'request': request,
                    'retries': retries,
                    'reason': reason
                },
                extra={'spider': spider})
Example #3
    def _retry(self, request, reason, spider):
        retries = request.meta.get('retry_times', 0) + 1

        retry_times = self.max_retry_times

        if 'max_retry_times' in request.meta:
            retry_times = request.meta['max_retry_times']

        stats = spider.crawler.stats
        if retries <= retry_times:
            logger.debug("Retrying %(request)s (failed %(retries)d times): %(reason)s",
                         {'request': request, 'retries': retries, 'reason': reason},
                         extra={'spider': spider})
            retryreq = request.copy()
            retryreq.meta['retry_times'] = retries
            retryreq.dont_filter = True
            retryreq.priority = request.priority + self.priority_adjust

            if isinstance(reason, Exception):
                reason = global_object_name(reason.__class__)

            stats.inc_value('retry/count')
            stats.inc_value('retry/reason_count/%s' % reason)
            return retryreq
        else:
            stats.inc_value('retry/max_reached')
            url = request.url
            self.server.lpush('programList:starturls', url)
            proxy = request.meta['proxy']
            self.server.srem("proxy_set", proxy)
            logger.debug(
                "Requeued %(request)s to redis after it failed %(retries)d times: %(reason)s, "
                "and removed the bad proxy %(proxy)s",
                {'request': request, 'retries': retries, 'reason': reason, 'proxy': proxy},
                extra={'spider': spider})
Example #4
    def _retry(self, request, reason, spider):
        retries = request.meta.get('retry_times', 0) + 1
        retry_times = self.max_retry_times

        if 'max_retry_times' in request.meta:
            retry_times = request.meta['max_retry_times']

        stats = spider.crawler.stats
        if retries <= retry_times:
            logger.debug(
                "Retrying %(request)s (failed %(retries)d times): %(reason)s",
                {
                    'request': request,
                    'retries': retries,
                    'reason': reason
                },
                extra={'spider': spider})
            retryreq = request.copy()
            retryreq.meta['retry_times'] = retries
            retryreq.dont_filter = True
            retryreq.priority = request.priority + self.priority_adjust

            if isinstance(reason, Exception):
                reason = global_object_name(reason.__class__)

            stats.inc_value('retry/count')
            stats.inc_value('retry/reason_count/%s' % reason)
            return retryreq
        else:
            # Once the retry limit is hit, re-issue the original request instead of giving up
            request.dont_filter = True
            request.priority = request.priority + self.priority_adjust
            return request
Example #5
    def _retry(self, request, reason, spider):
        retries = request.meta.get('retry_times', 0) + 1

        retry_times = self.max_retry_times

        if 'max_retry_times' in request.meta:
            retry_times = request.meta['max_retry_times']

        stats = spider.crawler.stats
        if retries <= retry_times:
            logger.debug("Retrying %(request)s (failed %(retries)d times): %(reason)s",
                         {'request': request, 'retries': retries, 'reason': reason},
                         extra={'spider': spider})
            retryreq = request.copy()
            retryreq.meta['retry_times'] = retries
            retryreq.dont_filter = True
            retryreq.priority = request.priority + self.priority_adjust

            if isinstance(reason, Exception):
                reason = global_object_name(reason.__class__)

            stats.inc_value('retry/count')
            stats.inc_value('retry/reason_count/%s' % reason)
            return retryreq
        else:
            stats.inc_value('retry/max_reached')
            logger.debug("Gave up retrying %(request)s (failed %(retries)d times): %(reason)s",
                         {'request': request, 'retries': retries, 'reason': reason},
                         extra={'spider': spider})
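Example #5 is essentially the stock Scrapy RetryMiddleware._retry. The max_retry_times and priority_adjust attributes it consults mirror the RETRY_TIMES and RETRY_PRIORITY_ADJUST settings; a sketch of the settings.py keys that feed these subclasses (the custom middleware path is a placeholder):

    # Sketch of the relevant settings; the custom middleware path is a placeholder.
    RETRY_ENABLED = True
    RETRY_TIMES = 5                 # read into self.max_retry_times
    RETRY_PRIORITY_ADJUST = -1      # read into self.priority_adjust
    RETRY_HTTP_CODES = [500, 502, 503, 504, 408, 429]

    DOWNLOADER_MIDDLEWARES = {
        'scrapy.downloadermiddlewares.retry.RetryMiddleware': None,  # disable the stock middleware
        'myproject.middlewares.CustomRetryMiddleware': 550,          # placeholder path
    }

A single request can also override the limit with meta={'max_retry_times': 10}, which is the request.meta['max_retry_times'] branch seen in every example.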
Example #6
    def _retry(self, request, reason, spider):
        retries = request.meta.get('retry_proxy_times', 0) + 1

        retry_times = self.max_retry_times

        if 'max_retry_times' in request.meta:
            retry_times = request.meta['max_retry_times']

        stats = spider.crawler.stats
        if retries <= retry_times:
            logger.debug("RetryWithProxy: Retrying %(request)s (failed %(retries)d times): %(reason)s",
                         {'request': request, 'retries': retries, 'reason': reason},
                         extra={'spider': spider})
            retryreq = request.copy()
            retryreq.meta['retry_proxy_times'] = retries
            retryreq.dont_filter = True
            retryreq.priority = request.priority + self.priority_adjust

            if isinstance(reason, Exception):
                reason = global_object_name(reason.__class__)

            stats.inc_value('retry/count')
            stats.inc_value('retry/reason_count/%s' % reason)
            _proxy = next(ips)[0]
            proxy = _proxy['http'] if 'http' in _proxy.keys() else _proxy['https']
            retryreq.meta['proxy'] = proxy
            logger.info(f"Retrying with proxy: {proxy}")
            return retryreq
        else:
            stats.inc_value('retry/max_reached')
            logger.error("Gave up retrying %(request)s (failed %(retries)d times): %(reason)s",
                         {'request': request, 'retries': retries, 'reason': reason},
                         extra={'spider': spider})
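Example #6 draws proxies from an ips iterator that is not shown; judging from next(ips)[0], each item is a tuple whose first element is a dict keyed by scheme. A purely illustrative way such an iterator could be built:

    # Purely illustrative; the real source of ips is not part of the example.
    import itertools

    PROXY_POOL = [
        ({'http': 'http://10.0.0.1:8080'}, 'provider-a'),
        ({'https': 'https://10.0.0.2:8443'}, 'provider-b'),
    ]
    ips = itertools.cycle(PROXY_POOL)  # next(ips)[0] yields the proxy dict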
Example #7
    def _retry(self, request, reason, spider):
        retries = request.meta.get('retry_times', 0) + 1

        retry_times = self.max_retry_times

        if 'max_retry_times' in request.meta:
            retry_times = request.meta['max_retry_times']

        stats = spider.crawler.stats
        if retries <= retry_times:
            logger.info("Retrying %(request)s (failed %(retries)d times): %(reason)s",
                         {'request': request, 'retries': retries, 'reason': reason},
                         extra={'spider': spider})
            retryreq = request.copy()
            retryreq.meta['retry_times'] = retries
            retryreq.dont_filter = True
            retryreq.priority = request.priority + self.priority_adjust

            if isinstance(reason, Exception):
                reason = global_object_name(reason.__class__)

            stats.inc_value('retry/count')
            stats.inc_value(f'retry/reason_count/{reason}')
            return retryreq
        else:
            stats.inc_value('retry/max_reached')
            logger.error("Gave up retrying %(request)s (failed %(retries)d times): %(reason)s",
                         {'request': request, 'retries': retries, 'reason': reason},
                         extra={'spider': spider})

            # Give up on the timed-out url and record it
            with open(self.errorpath,'a+') as fp:
                fp.write(str(request.url)+"\n")
Example #8
    def init_database(self, pool, mysql_config, db, table, item, taskid, spider_name):
        response, spider = self._hook(taskid, spider_name)  # an invisible hook lives in here

        # Note that some older MySQL versions do not support utf8mb4.
        # So: it is long past time to run a MySQL newer than 5.5!
        charset = mysql_config.get('charset')
        
        '''
        CREATE TABLE `student` (
          `s_id` MEDIUMTEXT NULL,
          `s_name` MEDIUMTEXT NULL,
          `s_age` MEDIUMTEXT NULL,
          `s_msg` MEDIUMTEXT NULL
        );
        '''
        try:
            conn = pool.dbapi.connect(**mysql_config)
            cursor = conn.cursor()
            table_sql = ""
            for k,v in item.items():
                # Create the db and table; every column is stored as MEDIUMTEXT
                # MEDIUMTEXT holds up to 16 MB, which is plenty for ordinary html text.
                table_sql += '`{}` MEDIUMTEXT NULL,'.format(str(k))
            cursor.execute('Create Database If Not Exists {} Character Set {}'.format(db, charset))
            cursor.execute('Create Table If Not Exists `{}`.`{}` ({})'.format(db, table, table_sql.strip(',')))
            conn.commit()
            cursor.close()
            conn.close()
        except Exception as e:
            traceback.print_exc()

            ex_class = global_object_name(e.__class__)
            self.stats.inc_value('create_db/exception_count', spider=spider)
            self.stats.inc_value('create_db/exception_type_count/%s' % ex_class, spider=spider)
Example #9
    def _retry(self, request, reason, spider):
        retries = request.meta.get('retry_times', 0) + 1

        retry_times = self.max_retry_times

        if 'max_retry_times' in request.meta:
            retry_times = request.meta['max_retry_times']

        stats = spider.crawler.stats
        if retries <= retry_times:
            logger.debug("Retrying %(request)s (failed %(retries)d times): %(reason)s",
                         {'request': request, 'retries': retries, 'reason': reason},
                         extra={'spider': spider})
            retryreq = request.copy()
            retryreq.meta['retry_times'] = retries
            retryreq.dont_filter = True
            retryreq.priority = request.priority + self.priority_adjust

            if isinstance(reason, Exception):
                reason = global_object_name(reason.__class__)

            stats.inc_value('retry/count')
            stats.inc_value('retry/reason_count/%s' % reason)
            return retryreq
        else:
            # All retries failed: record the failing url (and its params) - start
            error_request = spider.name + ":error_urls"
            self.redis_client.sadd(error_request, request.url)
            # All retries failed: record the failing url (and its params) - end
            stats.inc_value('retry/max_reached')
            logger.debug("Gave up retrying %(request)s (failed %(retries)d times): %(reason)s",
                         {'request': request, 'retries': retries, 'reason': reason},
                         extra={'spider': spider})
Example #10
 def process_exception(self, request, exception, spider):
     ex_class = global_object_name(exception.__class__)
     ## count of all downloader exceptions
     self.stats.inc_value('downloader/exception_count', spider=spider)
     ## count per exception type
     self.stats.inc_value('downloader/exception_type_count/%s' % ex_class,
                          spider=spider)
Example #11
    def _retry(self, request, reason, spider):
        retries = request.meta.get('retry_times', 0) + 1

        retry_times = self.max_retry_times

        if 'max_retry_times' in request.meta:
            retry_times = request.meta['max_retry_times']

        stats = spider.crawler.stats
        if retries <= retry_times:
            logger.debug("Retrying %(request)s (failed %(retries)d times): %(reason)s",
                         {'request': request, 'retries': retries, 'reason': reason},
                         extra={'spider': spider})
            retryreq = request.copy()
            retryreq.meta['retry_times'] = retries
            retryreq.dont_filter = True
            retryreq.priority = request.priority + self.priority_adjust
            # Fetch a fresh proxy and attach it to the retried request
            proxy_ip = "http://" + getProxy()
            retryreq.meta['proxy'] = proxy_ip

            if isinstance(reason, Exception):
                reason = global_object_name(reason.__class__)

            stats.inc_value('retry/count')
            stats.inc_value('retry/reason_count/%s' % reason)
            # print(retryreq)
            # print("*"*100)
            return retryreq
        else:
            stats.inc_value('retry/max_reached')
            logger.debug("Gave up retrying %(request)s (failed %(retries)d times): %(reason)s",
                         {'request': request, 'retries': retries, 'reason': reason},
                         extra={'spider': spider})
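Example #11 (and #15 below) call a getProxy() helper that is not shown; it evidently returns a bare host:port string that gets an http:// prefix. A hypothetical stand-in that pulls one address from a local proxy-pool service (the endpoint and response format are assumptions):

    # Hypothetical helper; the endpoint and response format are assumptions.
    import requests

    def getProxy():
        # e.g. a proxy-pool service that answers with a plain "host:port" body
        return requests.get('http://127.0.0.1:5010/get').text.strip()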
Example #12
    def _retry(self, request, reason, spider):
        retries = request.meta.get('retry_times', 0) + 1

        retry_times = self.max_retry_times

        if 'max_retry_times' in request.meta:
            retry_times = request.meta['max_retry_times']

        stats = spider.crawler.stats
        if retries <= retry_times:
            logger.debug("Retrying %(request)s (failed %(retries)d times): %(reason)s",
                         {'request': request, 'retries': retries, 'reason': reason},
                         extra={'spider': spider})
            retryreq = request.copy()
            retryreq.meta['retry_times'] = retries
            retryreq.dont_filter = True
            retryreq.priority = request.priority + self.priority_adjust

            if isinstance(reason, Exception):
                reason = global_object_name(reason.__class__)

            stats.inc_value('retry/count')
            stats.inc_value('retry/reason_count/%s' % reason)
            with open('error.txt', 'a') as f:
                f.write(f"[{datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}]-{request.url}--{reason}--MyRetry\n")
            return retryreq
        else:
            stats.inc_value('retry/max_reached')
            logger.debug("Gave up retrying %(request)s (failed %(retries)d times): %(reason)s",
                         {'request': request, 'retries': retries, 'reason': reason},
                         extra={'spider': spider})
Example #13
    def _retry(self, request, reason, spider):
        retries = request.meta.get('retry_times', 0) + 1

        retry_times = self.max_retry_times

        if 'max_retry_times' in request.meta:
            retry_times = request.meta['max_retry_times']

        stats = spider.crawler.stats
        if retries <= retry_times:
            logger.debug("Retrying %(request)s (failed %(retries)d times): %(reason)s",
                         {'request': request, 'retries': retries, 'reason': reason},
                         extra={'spider': spider})
            retryreq = request.copy()
            # This is the main function that was patched: the retry path rebuilds the request
            # with Request.copy(), which does not carry _plusmeta over, so it has to be copied
            # here explicitly or things break. The only change is the single line added below.
            retryreq._plusmeta = request._plusmeta
            retryreq.meta['retry_times'] = retries
            retryreq.dont_filter = True
            retryreq.priority = request.priority + self.priority_adjust

            if isinstance(reason, Exception):
                reason = global_object_name(reason.__class__)

            stats.inc_value('retry/count')
            stats.inc_value('retry/reason_count/%s' % reason)
            return retryreq
        else:
            stats.inc_value('retry/max_reached')
            logger.debug("Gave up retrying %(request)s (failed %(retries)d times): %(reason)s",
                         {'request': request, 'retries': retries, 'reason': reason},
                         extra={'spider': spider})
Example #14
    def _retry(self, request, reason, spider):
        retries = request.meta.get("retry_times", 0) + 1
        stats = spider.crawler.stats

        spider.logger.debug(
            "Retrying %(request)s (failed %(retries)d times): %(reason)s",
            {
                "request": request,
                "retries": retries,
                "reason": reason
            },
            extra={"spider": spider},
        )
        retryreq = request.copy()
        retryreq.meta["retry_times"] = retries
        retryreq.dont_filter = True

        if isinstance(reason, Exception):
            reason = global_object_name(reason.__class__)

        stats.inc_value("retry/count")
        stats.inc_value("retry/reason_count/%s" % reason)

        current_folio = request.meta["state"].current_folio
        spider.logger.info(
            "error: %s on folio %s, backing off %s seconds",
            reason,
            current_folio,
            retries,
        )
        time.sleep(1 * retries)  # note: this blocks the Twisted reactor for the whole back-off
        return retryreq
Example #15
    def _retry(self, request, reason, spider):
        retries = request.meta.get('retry_times', 0) + 1

        retry_times = self.max_retry_times

        if 'max_retry_times' in request.meta:
            retry_times = request.meta['max_retry_times']

        stats = spider.crawler.stats
        if retries <= retry_times:
            logger.debug("Retrying %(request)s (failed %(retries)d times): %(reason)s",
                         {'request': request, 'retries': retries, 'reason': reason},
                         extra={'spider': spider})
            # ua = random.choice(user_agent_list)
            # request.headers.setdefault('User-Agent', ua)
            retryreq = request.copy()
            retryreq.meta['retry_times'] = retries
            retryreq.dont_filter = True
            retryreq.priority = request.priority + self.priority_adjust
            # Fetch a fresh proxy and attach it to the retried request
            proxy_ip = "http://" + getProxy()
            retryreq.meta['proxy'] = proxy_ip
            # retryreq.meta['headers'] = proxy_ip
            print(proxy_ip)
            print("-" * 100)

            if isinstance(reason, Exception):
                reason = global_object_name(reason.__class__)

            stats.inc_value('retry/count')
            stats.inc_value('retry/reason_count/%s' % reason)
            return retryreq
Example #16
    def _retry(self, request, reason, spider):
        retries = request.meta.get('retry_times', 0) + 1

        retry_times = self.max_retry_times

        if 'max_retry_times' in request.meta:
            retry_times = request.meta['max_retry_times']

        stats = spider.crawler.stats
        if retries <= retry_times:
            time.sleep(3)
            retryreq = request.copy()
            retryreq.meta['retry_times'] = retries
            retryreq.dont_filter = True
            retryreq.priority = request.priority + self.priority_adjust

            if isinstance(reason, Exception):
                reason = global_object_name(reason.__class__)

            stats.inc_value('retry/count')
            stats.inc_value('retry/reason_count/%s' % reason)
            return retryreq
        else:
            stats.inc_value('retry/max_reached')
            spider.logger.warning('{0} exceeded the retry limit, giving up'.format(request.url))
Example #17
    def _retry(self, request, reason, spider):
        retries = request.meta.get('retry_times', 0) + 1

        retry_times = self.max_retry_times

        if 'max_retry_times' in request.meta:
            retry_times = request.meta['max_retry_times']

        stats = spider.crawler.stats
        if retries <= retry_times:
            spider.logger.debug(
                "Retrying %(request)s (failed %(retries)d times): %(reason)s",
                {
                    'request': request,
                    'retries': retries,
                    'reason': reason
                },
                extra={'spider': spider})
            retryreq = request.copy()
            retryreq.meta['retry_times'] = retries
            retryreq.dont_filter = True
            retryreq.priority = request.priority + self.priority_adjust

            if isinstance(reason, Exception):
                reason = global_object_name(reason.__class__)

            stats.inc_value('retry/count')
            stats.inc_value('retry/reason_count/%s' % reason)
            return retryreq
        else:
            if (isinstance(reason, (TimeoutError, TCPTimedOutError,
                                    ConnectionRefusedError))
                    and not spider.direct_connection):
                # retryreq = self.change_request_proxy(request, spider)
                retryreq = spider.change_request_proxy(request)
                return retryreq
            # elif isinstance(reason, HttpError) and reason.value.response.status == 429:
            #     if not spider.direct_connection:
            #         retryreq = self.change_request_proxy(request, spider)
            #         return retryreq
            #     else:
            #         sleep_time = F.rand_int((180, 600))
            #         spider.logger.info('Meet 429 code! Sleep for {} seconds...'.format(sleep_time))
            #         self.crawler.engine.pause()
            #         time.sleep(sleep_time)
            #         spider.logger.info('Wake up!')
            #         self.crawler.engine.unpause()
            #         return request
            else:
                stats.inc_value('retry/max_reached')
                spider.logger.debug(
                    "Gave up retrying %(request)s (failed %(retries)d times): %(reason)s",
                    {
                        'request': request,
                        'retries': retries,
                        'reason': reason
                    },
                    extra={'spider': spider})
Example #18
    def _retry(self, request, reason, spider):
        retries = request.meta.get('retry_times', 0) + 1

        retry_times = self.max_retry_times

        if 'max_retry_times' in request.meta:
            retry_times = request.meta['max_retry_times']

        stats = spider.crawler.stats
        if retries < retry_times:
            logger.debug(
                "Retrying %(request)s (failed %(retries)d times): %(reason)s",
                {
                    'request': request,
                    'retries': retries,
                    'reason': reason
                },
                extra={'spider': spider})
            # Delete this proxy
            self.delete_proxy(request.meta.get('proxy', False))
            retryreq = request.copy()
            retryreq.meta['retry_times'] = retries
            retryreq.dont_filter = True
            retryreq.priority = request.priority + self.priority_adjust

            # Pick a new proxy ip
            global IP_PORT_QUEUE
            if IP_PORT_QUEUE.empty():
                IP_PORT_QUEUE = get_ip_port_queue()
            ip_port = IP_PORT_QUEUE.get()
            # drop the current proxy from meta (disabled)
            # request.meta.pop('proxy')
            retryreq.meta['proxy'] = 'http://' + ip_port

            if isinstance(reason, Exception):
                reason = global_object_name(reason.__class__)

            stats.inc_value('retry/count')
            stats.inc_value('retry/reason_count/%s' % reason)
            return retryreq
        else:
            stats.inc_value('retry/max_reached')
            logger.debug(
                "Gave up retrying %(request)s (failed %(retries)d times): %(reason)s",
                {
                    'request': request,
                    'retries': retries,
                    'reason': reason
                },
                extra={'spider': spider})
Example #19
    def _retry(self, request, reason, spider, response):
        retries = request.meta.get('retry_times', 0) + 1

        retry_times = self.max_retry_times

        if 'max_retry_times' in request.meta:
            retry_times = request.meta['max_retry_times']

        stats = spider.crawler.stats
        if retries <= retry_times:
            logger.debug(
                "Retrying %(request)s (failed %(retries)d times): %(reason)s",
                {
                    'request': request,
                    'retries': retries,
                    'reason': reason
                },
                extra={'spider': spider})
            retryreq = request.copy()
            retryreq.meta['retry_times'] = retries
            retryreq.dont_filter = True
            retryreq.priority = request.priority + self.priority_adjust

            if isinstance(reason, Exception):
                reason = global_object_name(reason.__class__)

            stats.inc_value('retry/count')
            stats.inc_value('retry/reason_count/%s' % reason)
            return retryreq
        elif self.server is not None:
            stats.inc_value('retry/max_reached')
            logger.debug(
                "Gave up retrying %(request)s (failed %(retries)d times): %(reason)s",
                {
                    'request': request,
                    'retries': retries,
                    'reason': reason
                },
                extra={'spider': spider})
            reason = response_status_message(response.status)
            dict_response = {
                'url': request.url,
                'reason': reason,
                'retries': retries
            }
            data = self.default_serialize(dict_response)
            print('*' * 10 + 'record invalid request, url is %s' % request.url + '*' * 10)
            retry_key = '%(spider)s:invailrequest' % {'spider': spider.name}
            self.server.rpush(retry_key, data)
        return response
Example #20
    def _retry(self, request, reason, spider):
        retries = request.meta.get('retry_times', 0) + 1

        retry_times = self.max_retry_times

        if 'max_retry_times' in request.meta:
            retry_times = request.meta['max_retry_times']

        stats = spider.crawler.stats
        if retries <= retry_times:
            logger.debug(
                "Retrying %(request)s (failed %(retries)d times): %(reason)s",
                {
                    'request': request,
                    'retries': retries,
                    'reason': reason
                },
                extra={'spider': spider})
            # change #2: (disabled) drop the failing proxy
            # http_proxy = request.meta.get('http_proxy')
            # self.delete_proxy(http_proxy)

            retryreq = request.copy()
            retryreq.meta['retry_times'] = retries
            retryreq.dont_filter = True
            retryreq.priority = request.priority + self.priority_adjust

            if isinstance(reason, Exception):
                reason = global_object_name(reason.__class__)

            stats.inc_value('retry/count')
            stats.inc_value('retry/reason_count/%s' % reason)
            return retryreq
        else:
            # change #1: record the formdata that can no longer be retried
            formdata = unquote(request.body.decode('utf-8'))
            http_proxy = request.meta.get('http_proxy')
            self.cant_retry_formdata_set.add(formdata + ' ' + http_proxy)
            # print(formdata + ' ' + http_proxy)

            stats.inc_value('retry/max_reached')
            logger.debug(
                "Gave up retrying %(request)s (failed %(retries)d times): %(reason)s",
                {
                    'request': request,
                    'retries': retries,
                    'reason': reason
                },
                extra={'spider': spider})
Example #21
    def insert_item(self, conn, db, table, item, taskid, spider_name):
        response, spider = self._hook(taskid, spider_name)  # an invisible hook lives in here

        # Serialize every value with json so the type is preserved; parse it back with json.loads when reading.
        table_sql = ""
        for k,v in item.items():
            table_sql += "'{}',".format(json.dumps(v))

        try:
            conn.execute('INSERT INTO `{}`.`{}` VALUES({})'.format(db, table, table_sql.strip(',')))
            self.stats.inc_value('item_mysql/db:{}/table:{}/count'.format(db, table), spider=spider)
        except Exception as e:
            traceback.print_exc()

            ex_class = global_object_name(e.__class__)
            self.stats.inc_value('item/exception_count', spider=spider)
            self.stats.inc_value('item/exception_type_count/%s' % ex_class, spider=spider)
Example #22
    def _retry(self, request, reason, spider):
        retries = request.meta.get('retry_times', 0) + 1
        retry_times = self.max_retry_times

        if 'max_retry_times' in request.meta:
            retry_times = request.meta['max_retry_times']

        stats = spider.crawler.stats
        if retries <= retry_times:
            logger.debug(
                "Retrying %(request)s (failed %(retries)d times): %(reason)s",
                {
                    'request': request,
                    'retries': retries,
                    'reason': reason
                },
                extra={'spider': spider})
            retryreq = request.copy()
            retryreq.meta['retry_times'] = retries
            retryreq.dont_filter = True
            retryreq.priority = request.priority + self.priority_adjust
            _proxy = Proxy()
            number = random.randint(20, 50)
            proxy_id = _proxy.get_ip(server_id=number)
            proxy_id = proxy_id.decode()
            proxy = "http://" + proxy_id + ":9990"
            retryreq.meta["proxy"] = proxy

            if isinstance(reason, Exception):
                reason = global_object_name(reason.__class__)

            stats.inc_value('retry/count')
            stats.inc_value('retry/reason_count/%s' % reason)
            return retryreq
        else:
            _proxy = Proxy()
            proxy = _proxy.get()
            proxy2 = _proxy.get()
            proxy3 = _proxy.get()
            proxy4 = _proxy.get()
            request.meta["proxy"] = "http://" + proxy4
            request.dont_filter = True
            request.priority = request.priority + self.priority_adjust
            # return self.process_request(request, spider)
            return request
Example #23
    def _retry(self, request, reason, spider):
        retries = request.meta.get('retry_times', 0) + 1
        print("获取request.meta.getretry_times", request.meta.get('retry_times'))
        retry_times = self.max_retry_times
        print("retry_timesretry_times", type(retry_times), retry_times)
        if 'max_retry_times' in request.meta:
            retry_times = request.meta[
                'max_retry_times']  #settings未设置时从这里读取默认的重试次数
            print("默认重试次数,settings未设置", type(retry_times),
                  request.meta['max_retry_times'])

        stats = spider.crawler.stats
        if retries <= retry_times:
            logger.debug(
                "Retrying %(request)s (failed %(retries)d times): %(reason)s",
                {
                    'request': request,
                    'retries': retries,
                    'reason': reason
                },
                extra={'spider': spider})
            retryreq = request.copy()
            retryreq.meta['retry_times'] = retries
            retryreq.dont_filter = True
            retryreq.priority = request.priority + self.priority_adjust

            if isinstance(reason, Exception):
                reason = global_object_name(reason.__class__)

            stats.inc_value('retry/count')
            stats.inc_value('retry/reason_count/%s' % reason)
            return retryreq
        else:
            print("执行else,放弃重试将有异常的url写入redis中")
            r.sadd('jiayuan_except', request.url)  #避免重复,使用set
            r.save()
            stats.inc_value('retry/max_reached')
            logger.debug(
                "Gave up retrying %(request)s (failed %(retries)d times): %(reason)s",
                {
                    'request': request,
                    'retries': retries,
                    'reason': reason
                },
                extra={'spider': spider})
Example #24
 def process_exception(self, request, exception, spider):
     if (isinstance(exception, self.EXCEPTIONS_TO_RETRY)
             and not request.meta.get('dont_retry', False)):
         if isinstance(exception,
                       (TunnelError, defer.TimeoutError, TimeoutError)):
             if self.need_switch_proxy:
                 request.meta["need_switch_proxy"] = True
         ret = self._retry(request, exception, spider)
         if ret:
             return ret
         else:
             # retry limit reached: return a placeholder response
             response = HtmlResponse(url='', request=request)
             response._status = 1
             if isinstance(exception, Exception):
                 reason = global_object_name(exception.__class__)
             self.logger.debug(
                 "max retries had reached because of {}!".format(reason))
             return response
Example #25
    def _retry(self, request, reason, spider):
        retries = request.meta.get('retry_times', 0) + 1

        retry_times = self.max_retry_times

        if 'max_retry_times' in request.meta:
            retry_times = request.meta['max_retry_times']

        stats = spider.crawler.stats
        proxy = request.meta['proxy']

        if retries <= retry_times:
            logger.debug("Retrying %(request)s (failed %(retries)d times): %(reason)s",
                         {'request': request, 'retries': retries, 'reason': reason},
                         extra={'spider': spider})

            # logger.debug('Retrying proxy <%s> #%d: %s' % (proxy, retries, reason))

            retryreq = self._get_retry_request(request, retries)

            if isinstance(reason, Exception):
                reason = global_object_name(reason.__class__)

            stats.inc_value('retry/count')
            stats.inc_value('retry/reason_count/%s' % reason)
        else:
            stats.inc_value('retry/max_reached')
            logger.debug("Gave up retrying %(request)s (failed %(retries)d times): %(reason)s",
                         {'request': request, 'retries': retries, 'reason': reason},
                         extra={'spider': spider})

            # try:

            del self.proxies[proxy]
            retryreq = self._get_retry_request(request, 0)
            self.change_proxy(retryreq)
            logger.error('Removing proxy <%s>, %d proxies left' % (proxy, len(self.proxies)))

            # except (ValueError, KeyError):
            #     pass

        return retryreq
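Example #25 also leans on a private self._get_retry_request(request, retries) helper that is not shown; from the way it is used it presumably just clones the request and stamps the retry counter. An assumed sketch:

    # Assumed helper for Example #25; the real implementation is not shown.
    def _get_retry_request(self, request, retries):
        retryreq = request.copy()
        retryreq.meta['retry_times'] = retries
        retryreq.dont_filter = True
        retryreq.priority = request.priority + self.priority_adjust
        return retryreq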
Example #26
    def _retry(self, request, reason, spider):
        retries = request.meta.get("retry_times", 0) + 1

        retry_times = self.max_retry_times

        if "max_retry_times" in request.meta:
            retry_times = request.meta["max_retry_times"]

        stats = spider.crawler.stats
        if retries <= retry_times:
            logger.debug(
                "Retrying %(request)s (failed %(retries)d times): %(reason)s",
                {
                    "request": request,
                    "retries": retries,
                    "reason": reason
                },
                extra={"spider": spider},
            )
            retryreq = request.copy()
            retryreq.meta["retry_times"] = retries
            retryreq.meta["refresh_cache"] = True
            retryreq.dont_filter = True
            retryreq.priority = request.priority + self.priority_adjust

            if isinstance(reason, Exception):
                reason = global_object_name(reason.__class__)

            stats.inc_value("retry/count")
            stats.inc_value("retry/reason_count/%s" % reason)
            return retryreq
        else:
            stats.inc_value("retry/max_reached")
            logger.debug(
                "Gave up retrying %(request)s (failed %(retries)d times): %(reason)s",
                {
                    "request": request,
                    "retries": retries,
                    "reason": reason
                },
                extra={"spider": spider},
            )
Example #27
    def _retry(self, request, reason, spider):
        """modified logger level"""
        retries = request.meta.get('retry_times', 0) + 1

        retry_times = self.max_retry_times

        if 'max_retry_times' in request.meta:
            retry_times = request.meta['max_retry_times']

        stats = spider.crawler.stats
        if retries <= retry_times:
            logger.info(
                "Retrying %(request)s (failed %(retries)d times): %(reason)s",
                {
                    'request': request,
                    'retries': retries,
                    'reason': reason
                },
                extra={'spider': spider})
            retryreq = request.copy()
            retryreq.meta['retry_times'] = retries
            retryreq.meta['change_proxy'] = True  # if use proxy, change proxy
            retryreq.dont_filter = True
            retryreq.priority = request.priority + self.priority_adjust

            if isinstance(reason, Exception):
                reason = global_object_name(reason.__class__)

            stats.inc_value('retry/count')
            stats.inc_value('retry/reason_count/%s' % reason)
            return retryreq
        else:
            stats.inc_value('retry/max_reached')
            logger.warning(
                "Gave up retrying %(request)s (failed %(retries)d times): %(reason)s",
                {
                    'request': request,
                    'retries': retries,
                    'reason': reason
                },
                extra={'spider': spider})
Example #28
    def retry(self,
              request: ScrapflyScrapyRequest,
              reason: Union[str, Exception],
              delay: Optional[int] = None):
        logger.info('==> Retrying request for reason %s' % reason)
        stats = self.crawler.stats
        retries = request.meta.get('retry_times', 0) + 1

        if retries >= self.custom_settings.get('SCRAPFLY_MAX_API_RETRIES', 5):
            return None

        retryreq = request.replace(dont_filter=True)
        retryreq.priority += 100

        if retryreq.scrape_config.cache is True:
            retryreq.scrape_config.cache_clear = True

        retryreq.meta['retry_times'] = retries

        if isinstance(reason, ScrapflyError):
            stats.inc_value(f'scrapfly/api_retry/{reason.code}')

        if isinstance(reason, Exception):
            reason = global_object_name(reason.__class__)

        logger.warning(f"Retrying {request} for x{retries - 1}: {reason}",
                       extra={'spider': self})
        stats.inc_value('scrapfly/api_retry/count')

        if delay is None:
            deferred = Deferred()
            deferred.addCallback(self.crawler.engine.schedule,
                                 request=retryreq,
                                 spider=self)
        else:
            deferred = task.deferLater(reactor, delay,
                                       self.crawler.engine.crawl, retryreq,
                                       self)

        return deferred
Example #29
    def process_exception(self, request, exception, spider):
        if isinstance(exception, (IgnoreRequest, DropItem)):
            return
        if not self._is_enabled_for_request(request):
            return

        autoextract = request.meta.pop(AUTOEXTRACT_META_KEY)
        stop_time = time.time()
        latency = time.time() - autoextract['timing']['start_ts']
        autoextract['timing'].update({'end_ts': stop_time, 'latency': latency})

        # Make sure to log all unknown failures
        logger.warning('AutoExtract failure after %.3fs for %s: %s',
                       latency,
                       autoextract['original_url'],
                       repr(exception),
                       extra={'spider': spider})

        request.meta['autoextract'] = autoextract
        ex_class = global_object_name(exception.__class__)
        self.inc_metric('autoextract/errors/total_count', spider=spider)
        self.inc_metric('autoextract/errors/type_count/%s' % ex_class, spider=spider)
Example #30
    def _retry(self, request, reason, spider):
        retries = request.meta.get('retry_times', 0) + 1
        stats = spider.crawler.stats
        logger.debug(
            "Retrying %(request)s (failed %(retries)d times): %(reason)s", {
                'request': request,
                'retries': retries,
                'reason': reason
            },
            extra={'spider': spider})
        retryreq = request.copy()
        retryreq.meta['retry_times'] = retries
        proxy = self.get_random_proxy()
        retryreq.meta['proxy'] = proxy
        retryreq.dont_filter = True
        retryreq.priority = request.priority + 1

        if isinstance(reason, Exception):
            reason = global_object_name(reason.__class__)

        stats.inc_value('retry/count')
        stats.inc_value('retry/reason_count/%s' % reason)
        return retryreq
Example #31
	def _retry(self, request, reason, spider):
		response_retries = request.meta.get('response_retry', 0)
		exception_retries = request.meta.get('exception_retry', 0)
		print("response_retries is %s" % response_retries)
		print("exception_retries is %s" % exception_retries)
		retries = response_retries + exception_retries
		retry_times = self.max_retry_times
		
		if 'max_retry_times' in request.meta:
			retry_times = request.meta['max_retry_times']
		
		stats = spider.crawler.stats
		if retries <= retry_times:
			logger.debug("Retrying %(request)s (failed %(retries)d times): %(reason)s",
			             {'request': request, 'retries': retries, 'reason': reason},
			             extra={'spider': spider})
			retryreq = request.copy()
			retryreq.meta['retry_times'] = retries
			retryreq.dont_filter = True
			retryreq.priority = request.priority + self.priority_adjust
			
			if isinstance(reason, Exception):
				reason = global_object_name(reason.__class__)
			
			stats.inc_value('retry/count')
			stats.inc_value('retry/reason_count/%s' % reason)
			return retryreq
		else:
			stats.inc_value('retry/max_reached')
			logger.debug("Gave up retrying %(request)s (failed %(retries)d times): %(reason)s",
			             {'request': request, 'retries': retries, 'reason': reason},
			             extra={'spider': spider})
			# If the failures were mostly exceptions, this ip address is probably dead
			if exception_retries > response_retries:
				# Wrap a dummy response and hand it back to the middleware nearer the engine, i.e. the MiddlewareIpagentDownloaderMiddleware defined above
				response = HtmlResponse(url='retry_over_exception')
				return response
Example #32
 def process_exception(self, request, exception, spider):
     ex_class = global_object_name(exception.__class__)
     self.stats.inc_value('downloader/exception_count', spider=spider)
     self.stats.inc_value('downloader/exception_type_count/%s' % ex_class, spider=spider)