import os
import time
import requests

from headers import headers
from logger.log import crawler
from db.redis_db import Urls
from db.redis_db import Cookies
from db.login_info import freeze_account
from page_parse.basic import is_403, is_404, is_complete
from decorator.decorators import timeout_decorator, timeout
from config.conf import get_timeout, get_crawl_interal, get_excp_interal, get_max_retries

time_out = get_timeout()
interal = get_crawl_interal()
max_retries = get_max_retries()
excp_interal = get_excp_interal()


# A cookie is picked at random from redis for every fetch to lower the risk of
# the account being banned; the effect of using different IPs with the same
# account has not been verified yet.
# todo: verify whether fetching user info through a proxy IP with a cookie
# triggers a captcha
@timeout(200)
@timeout_decorator
def get_page(url, user_verify=True):
    """
    :param url: url to crawl
    :param user_verify: whether the page may show a captcha (ajax requests do
        not; requests for weibo content or user info may); False means the
        request is an ajax call for reposts
    :return: the fetched content; an empty string is returned on 404, 403 or
        any other exception
    """
    crawler.info('current url to crawl: {url}'.format(url=url))
    count = 0
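    # --- Hedged sketch: one way the retry loop could continue from here. ---
    # This is NOT the repository's verbatim implementation. The helper calls
    # Cookies.fetch_cookies(), Cookies.delete_cookies(), Urls.store_crawl_url()
    # and the freeze_account(name) signature are assumed for illustration; only
    # the imports above confirm that these objects exist.
    while count < max_retries:
        # pick a random (account_name, cookie) pair from redis for this request
        name_cookies = Cookies.fetch_cookies()

        if name_cookies is None:
            crawler.warning('no cookie available in redis, sleeping before retry')
            time.sleep(excp_interal)
            count += 1
            continue

        try:
            resp = requests.get(url, headers=headers, cookies=name_cookies[1],
                                timeout=time_out, verify=False)
            page = resp.text
        except (requests.exceptions.RequestException, AttributeError) as e:
            crawler.warning('exception {e} raised while crawling {url}'.format(e=e, url=url))
            count += 1
            time.sleep(excp_interal)
            continue

        if user_verify:
            # pages that may show a captcha also need 403/404/completeness checks;
            # a banned account is frozen and its cookie removed from the pool
            if is_403(page):
                crawler.warning('account {name} may be banned'.format(name=name_cookies[0]))
                freeze_account(name_cookies[0])
                Cookies.delete_cookies(name_cookies[0])
                count += 1
                continue
            if is_404(page):
                crawler.warning('url {url} does not exist'.format(url=url))
                return ''
            if not is_complete(page):
                count += 1
                continue

        # record the url as successfully crawled and respect the crawl interval
        Urls.store_crawl_url(url, 1)
        time.sleep(interal)
        return page

    # all retries failed: record the failure and return an empty string
    Urls.store_crawl_url(url, 0)
    return ''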
# Unit test fragment for the timeout config value; the TestCase class name and
# the unittest wrapper are assumed, only the test method itself is original.
import unittest


class TestConf(unittest.TestCase):
    def test_get_timeout(self):
        from config.conf import get_timeout
        self.assertEqual(get_timeout(), 200)