Example #1
 def start_requests(self):
     if not self.max_page:
         self.max_page = 1
     page = 0
     while page < self.max_page:
         page += 1
         for i in range(len(self.start_urls)):
             url = self.start_urls[i]
             url = url.format(page=page)
             log.info(f'crawling URL {i+1}/{len(self.start_urls)}, page {page}/{self.max_page}: {url}')
             yield scrapy.Request(url=url)
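The method above assumes that every entry in start_urls is a URL template containing a {page} placeholder and that max_page may be left unset. A minimal sketch of the spider attributes that would make it work; the spider name and URLs below are illustrative assumptions, not taken from the original project:

    import scrapy

    class PagedSpider(scrapy.Spider):
        # Hypothetical spider reusing the start_requests() shown in Example #1.
        name = 'paged_example'   # assumed name
        max_page = 3             # pages to request per URL template
        start_urls = [
            # each template must contain {page} for url.format(page=page)
            'https://example.com/list?page={page}',
        ]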
Example #2
 def process_request(self, request, spider):
     """处理请求"""
     if self.use_unique_proxy:
         if not self.unique_proxy:
             self.unique_proxy = proxy_manager.get()
             log.info(f'use unique proxy {self.unique_proxy}')
         proxy = self.unique_proxy
     else:
         proxy = proxy_manager.get()
         log.info(f'use random proxy {proxy}')
     request.meta["proxy"] = str(proxy)
Example #3
 def fail(self, proxy: ProxyItem):
     """失败时调用"""
     # 添加失败次数
     self._execute(self._sql_add_times, proxy, key='fail_times')
     # Fetch the failure count
     sql = self._sql_get_fail_times_by_ip.format(**proxy)
     record = asyncio.get_event_loop().run_until_complete(self.manager.conn.fetchrow(sql))
     if record:
         fail_times = record['fail_times']
         if fail_times and fail_times >= 5:
              # 5 or more failures: mark the proxy as unavailable
             log.info(f'{proxy} fail set available=0')
             self._execute(self._sql_update_fail, proxy)
         else:
             log.info(f'{proxy} fail set fail_times={fail_times}')
Example #4
    def filter_proxy_list(self, proxy_list):
        """过滤并保存代理"""
        log.info(f'抓取 {self.name} 共 {len(proxy_list)} 个代理,校验有效性')
        # 设置名字
        for proxy in proxy_list:
            proxy['source_domain'] = self.name

        # Create a new filter each time; each spider filters and saves its own proxies
        proxy_filter = self.get_proxy_filter(proxy_list)
        if proxy_filter is not None:
            available_proxy_list = proxy_filter.filter()
        else:
            available_proxy_list = proxy_list

        log.info(f'{self.name}: {len(available_proxy_list)}/{len(proxy_list)} proxies are valid')
        return available_proxy_list
Example #5
 def validate_response(self, proxy, result) -> bool:
     """从爬取代理的过程中,因为直接爬了抖音,所以解析数据"""
     status_code = result['status_code']
     if status_code == 0:
         aweme_list = result['aweme_list']
         proxy['available'] = 1
         log.info(f'{proxy} - proxy works, scraped {len(aweme_list)} items')
         for aweme in aweme_list:
             item = DouyinItem(aweme)
             self.items.append(item)
         return True
     elif status_code == 2154:
         proxy['available'] = 2
         proxy['banned_time'] = time.time()
         log.info(f"{proxy}-代理有效,但已被禁{result['status_code']}")
         return True
     else:
         print(f"{proxy}-代理无效,返回状态码{result['status_code']}")
         return False
Example #6
    def start_requests(self):
        if not self.max_page:
            self.max_page = 1

        keyword_length = len(self.keyword_list)
        for i in range(keyword_length):
            self.keyword = self.keyword_list[i]
            page = 0
            while page < self.max_page:
                page += 1
                url_length = len(self.start_urls)
                for j in range(url_length):
                    url = self.start_urls[j]
                    url = url.format(keyword=urllib.parse.quote(self.keyword),
                                     page=page)
                    log.info(
                        f'crawling keyword {i + 1}/{keyword_length}, URL {j + 1}/{url_length}, page {page}/{self.max_page}, '
                        f'{url}')
                    yield scrapy.Request(url=url)
Example #7
 def start_requests(self):
     self.statistics.start_time = time.time()
     i = 0
     self.statistics.start_craw_count = self.manager.count()
     log.info(f'item count before crawling: {self.statistics.start_craw_count}')
     while i < 1:
         # i += 1  # intentionally left commented out: loop until the break below fires
         if not ANONYMOUS:
             log.info(f'sleep {self.sleep_time}')
             time.sleep(self.sleep_time)
             self.sleep_time = 1
         # Under concurrency the timestamps are identical, so Scrapy treats the URLs as duplicates and drops them;
         # it turned out dont_filter has to be set
         anonymous = ANONYMOUS
         url = douyin.generate_feed_url('http', anonymous)
         headers = douyin.generate_headers(anonymous)
         cookies = douyin.generate_cookies(anonymous)
         self.statistics.crawled_pages += 1
         log.info(f'crawl {self.statistics.crawled_pages} page:' + url)
         yield scrapy.Request(url=url,
                              headers=headers,
                              cookies=cookies,
                              dont_filter=True)
         if self.has_more == 0 or self.exit_code == 0:
             break
Example #8
 def banned(self, proxy: ProxyItem):
     """被禁时调用"""
     log.info(f'{proxy} banned')
     self._execute(self._sql_update_banned, proxy)
Example #9
 def success(self, proxy: ProxyItem):
     """成功时调用"""
     log.info(f'{proxy} success')
     self._execute(self._sql_update_success, proxy)
Example #10
 def save_items(self):
     log.info(f'scraped {len(self.items)} items in total, saving them')
     for item in self.items:
         self.pipeline.process_item(item, None)
     self.items.clear()
     self.pipeline.close_spider(None)
Example #11
 def parse(self, response):
     try:
         body = response.body.decode()
         if body == 'error':
             print('body is "error"; the exception was already intercepted')
             return
         result = json.loads(body)
         status_code = result['status_code']
         if status_code == 0:
             self.has_more = result['has_more']
             aweme_list = result['aweme_list']
             # Item count before saving
             before_item_count = self.manager.count()
             for aweme in aweme_list:
                 item = DouyinItem(aweme)
                 yield item
             # Count again after saving
             # Item count afterwards
             current_item_count = self.manager.count()
             self.statistics.crawled_success__pages += 1
             self.statistics.crawled_items += len(aweme_list)
             minute = (time.time() - self.statistics.start_time) / 60
             available_count = current_item_count - before_item_count
             if available_count <= 2:
                 # Few new items were obtained, so wait before the next request
                 if self.statistics.few_available_result_times >= 5:
                     self.statistics.few_available_result_times = 0
                     log.info('5 consecutive crawls returned few valid results, waiting 600 s')
                     self.sleep_time = 600
                 else:
                     self.statistics.few_available_result_times += 1
                     log.info('few valid results in this crawl, waiting 60 s')
                     self.sleep_time = 60
             else:
                 self.statistics.few_available_result_times = 0
             log.info(
                 f'scraped {len(aweme_list)} items, available {available_count} items.'
             )
             speed = self.statistics.crawled_items / minute
             log.info(
                 f'scraped {self.statistics.crawled_success__pages}/{self.statistics.crawled_pages} pages, '
                 f'{current_item_count - self.statistics.start_craw_count}/{self.statistics.crawled_items} items, '
                 f'spent {self.parse_time(minute)}, speed {speed:#.2f} items/min.'
             )
         elif status_code == 2145:
              log.warning('request has expired')
             self.exit_code = 0
         elif status_code == 2151:
              log.warning('signature error')
             self.exit_code = 0
         elif status_code == 2154:
              # The ban usually lasts about an hour
              log.warning('too many requests, the device is banned')
             if ANONYMOUS:
                  # Already intercepted in the downloader middleware; this branch should not be reached
                 pass
             else:
                  # Non-anonymous requests need to be handled
                  log.warning('sleeping for 10 minutes')
                 self.sleep_time = 10 * 60
                  # Only sleep, do not exit
                 # self.exit_code = 0
         else:
              log.warning('error code %d' % status_code)
             log.warning(response.body.decode())
             self.exit_code = 0
     except Exception as e:
          # TODO: handle proxy parsing errors here, or deal with them in the middleware
          log.error('an error occurred')
         log.error(repr(e))
Example #12
    def crawl_in_loop(self, runner):
        """在循环中爬取"""
        # 遍历取出 spider
        spider_list = []
        for spider_class in iter_spider_classes(regex_proxy_spider):
            ip_count = getattr(spider_class, 'ip_count', 0)
            if ip_count > 0:
                spider_list.append(spider_class)

        all_loop = ProxyCounter()
        single_loop = ProxyCounter()
        # Start at the beginning; count after each round
        all_loop.start()
        # Infinite loop
        loop_times = 0
        while loop_times >= 0:
            loop_times += 1

            # Start at the beginning of each round; count after each spider finishes
            single_loop.start()
            while single_loop.available.start_num > 100:
                print(f'{single_loop.available.start_num} valid IPs, sleeping for 10 minutes')
                time.sleep(60 * 10)
                single_loop.start()

            # Count at the start
            log.info(f'round {loop_times} of crawling starts')

            # Crawl
            for i in range(len(spider_list)):
                spider = spider_list[i]
                log.info(
                    f'round {loop_times}, spider {i + 1}/{len(spider_list)} {spider.name} starts crawling, '
                    f'{single_loop.print_count()}')

                try:
                    yield runner.crawl(spider)
                except SystemExit:
                    pass
                sleep_time = 10
                divider = '-' * 10
                single_loop.count()
                log.info(
                    f'{divider}round {loop_times}, spider {i + 1}/{len(spider_list)} {spider.name} finished crawling, '
                    f'{single_loop.print_count()} {divider}')
                log.info(f'waiting before running the next spider, sleep {sleep_time}')
                time.sleep(sleep_time)

            # Delay before the next round
            sleep_time = 60
            log.info(f'this round of crawling finished, waiting for the next one, sleep {sleep_time}')
            all_loop.count()
            log.info(all_loop.print_count())
            time.sleep(sleep_time)
        # noinspection PyUnresolvedReferences
        reactor.stop()
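The yield runner.crawl(spider) calls in this example and in Example #14 below follow Scrapy's documented pattern for running several spiders sequentially with a CrawlerRunner inside a Twisted inlineCallbacks generator (the decorator is presumably applied outside the excerpt shown). A reduced, self-contained sketch of that pattern; the function name crawl_sequentially is an illustrative assumption:

    from scrapy.crawler import CrawlerRunner
    from scrapy.utils.log import configure_logging
    from twisted.internet import defer, reactor

    configure_logging()
    runner = CrawlerRunner()

    @defer.inlineCallbacks
    def crawl_sequentially(spider_classes):
        # Same idea as crawl_in_loop() above, without the counters and sleeps:
        # each yielded Deferred fires once that spider has finished crawling.
        for spider_class in spider_classes:
            yield runner.crawl(spider_class)
        reactor.stop()

    # Pass the spider classes collected via iter_spider_classes(...); empty here.
    reactor.callWhenRunning(crawl_sequentially, [])
    reactor.run()  # blocks until reactor.stop() is called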
Example #13
 def process_request(self, request, spider):
     proxy = proxy_manager.get()
     log.info(f'use random proxy {proxy}')
     request.meta["proxy"] = str(proxy)
Example #14
    def crawl_in_loop(self, runner):
        """在循环中爬取"""
        # 遍历取出 spider
        spider_list = []
        for spider_class in iter_spider_classes(regex_proxy_spider):
            ip_count = getattr(spider_class, 'ip_count', 0)
            if ip_count > 0:
                spider_list.append(spider_class)
        loop_times = 0
        loop_end_count = 0
        all_loop_proxy_count = 0
        """整个循环中爬取的代理总数"""
        # 无限循环
        while loop_times >= 0:
            loop_times += 1

            # Count at the start
            if loop_end_count == 0:
                # First round: read the count from the proxy manager
                loop_start_count = proxy_manager.count()
            else:
                # Otherwise reuse the count taken at the end of the previous round
                loop_start_count = loop_end_count
            log.info(f'round {loop_times} of crawling starts, currently {loop_start_count} IPs in total')

            # Crawl

            spider_end_count = 0
            for i in range(len(spider_list)):
                spider = spider_list[i]
                if spider_end_count == 0:
                    spider_start_count = loop_start_count
                else:
                    spider_start_count = spider_end_count
                log.info(
                    f'round {loop_times}, spider {i+1}/{len(spider_list)} {spider.name} starts crawling, '
                    f'currently {spider_start_count} IPs in total')

                try:
                    yield runner.crawl(spider)
                except SystemExit:
                    pass
                sleep_time = 10
                spider_end_count = proxy_manager.count()
                spider_crawled_count = spider_end_count - spider_start_count
                loop_crawled_count = spider_end_count - loop_start_count
                # Accumulate the total crawled across all rounds (per-spider increment)
                all_loop_proxy_count += spider_crawled_count
                divider = '-' * 10
                log.info(
                    f'{divider}round {loop_times}, spider {i+1}/{len(spider_list)} {spider.name} finished crawling, '
                    f'crawled {spider_crawled_count}/{loop_crawled_count}/{all_loop_proxy_count} proxies{divider}'
                )
                log.info(f'waiting before running the next spider, sleep {sleep_time}')
                log.info(f'currently {proxy_manager.available_count()} valid proxies in total')
                time.sleep(sleep_time)

            # Count at the end
            loop_end_count = proxy_manager.count()
            # Delay before the next round
            sleep_time = 60
            log.info(
                f'this round crawled {loop_end_count-loop_start_count}/{loop_end_count} proxies, waiting for the next round, sleep {sleep_time}'
            )
            log.info(f'currently {proxy_manager.available_count()} valid proxies in total')
            time.sleep(sleep_time)
        reactor.stop()