Example 1
def request_token(proxy_api, url=API_GET_TOKEN):
    # Keep hitting the token endpoint through fresh proxies until the API
    # returns a JSON payload we can read a token from.
    while True:
        proxy = get_proxy(proxy_api)
        res = requests.get(url, proxies=proxy)
        try:
            data = res.json()
        except Exception:
            print('Error:', res.content)
            continue
        if data:
            return data.get('token')
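The snippet assumes a proxy-pool helper get_proxy and the constant API_GET_TOKEN defined elsewhere in the project (the later examples call get_proxy with no arguments; this sketch follows Example 1's signature). A minimal sketch of what such a helper could look like, purely as an assumption for context; the endpoint URL and the JSON field name 'proxy' are hypothetical:

import requests

API_GET_TOKEN = 'https://example.com/api/token'  # hypothetical endpoint

def get_proxy(proxy_api):
    # Hypothetical helper: fetch one proxy address from a proxy-pool API
    # and shape it the way requests expects its `proxies` argument.
    addr = requests.get(proxy_api, timeout=5).json().get('proxy')
    if not addr:
        return None
    return {'http': f'http://{addr}', 'https': f'http://{addr}'}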
Example 2
def send_http(session,
              method,
              url,
              *,
              retries=1,
              interval=INTERVAL,
              proxy=None,
              timeout=None,
              kind=None,
              success_callback=None,
              fail_callback=None,
              headers=None,
              **kwargs):
    forbiddens = 0
    fails = 0
    _404 = 0
    code = None  # last seen status code; stays None if no request ever completed
    _ua = copy.deepcopy(USER_AGENTS)
    if method.lower() not in HTTP_METHODS:
        return
    if retries == -1:
        attempt = -1
    elif retries == 0:
        attempt = 1
    else:
        attempt = retries + 1
    logger.debug(f'Current request [{kind}]: {url}')
    while attempt != 0:
        try:
            try:
                response = getattr(session, method.lower())(url,
                                                            timeout=timeout,
                                                            proxies=proxy,
                                                            headers=headers,
                                                            **kwargs)
            except Exception as e:
                msg = f'[Request exception - proxy: {proxy}] {e.__class__.__name__}: {e}'
                logger.debug(msg)
                logger.info(msg)
                fails += 1
                try:
                    # Drop the failing proxy from the pool; ignore errors if
                    # the proxy dict is missing or malformed.
                    delete_proxy(urlparse(proxy.get('http')).netloc)
                except Exception:
                    pass
                if PROXY_ENABLE:
                    proxy = get_proxy()
                raise
            code = response.status_code
            if code == 404:
                if _404 > NOT_FOUND_MAX_TO_DROP:
                    break
                _404 += 1
            if code in FORBIDDEN_CODE or \
                    should_verify(response) or \
                    fake_detail_response(response,kind)  or \
                    fake_pages_response(response,kind) or \
                    fake_css_response(response,kind) or \
                    fake_city_response(response,kind) or \
                    fake_json_response(response,kind) or \
                    fake_map_response(response,kind) or \
                    fake_city_list_response(response, kind):
                fails += 1
                msg = (f'[Bad proxy - {code}] {proxy} request rejected. '
                       f'{headers["User-Agent"]}')
                logger.debug(msg)
                logger.info(msg)
                try:
                    # Remove the blocked proxy from the pool; ignore failures.
                    delete_proxy(urlparse(proxy.get('http')).netloc)
                except Exception:
                    pass
                if code in FORBIDDEN_CODE:
                    forbiddens += 1
                if not _ua:
                    _ua = copy.deepcopy(USER_AGENTS)
                if forbiddens > FORBIDDEN_MAX_TO_CHANGE and _ua:
                    # Rotate to a random User-Agent from the remaining pool.
                    headers['User-Agent'] = _ua.pop(random.randrange(len(_ua)))
                    # forbiddens = 0
                    logger.debug(f'Switched UA: {headers["User-Agent"]}')
                if fails > FAIL_MAX_TO_DROP:
                    break
                if PROXY_ENABLE:
                    proxy = get_proxy()
                raise ForbiddenProxy
            if success_callback:
                success_callback(response)
            logger.debug(f'Request succeeded: [proxy: {proxy}, UA: {headers["User-Agent"]}]')
            return response, proxy, headers
        except Exception:
            if RANDOM_SLEEP:
                time.sleep(random.uniform(*RANDOM_INTERVAL))
            else:
                time.sleep(interval)
        attempt -= 1
    if fail_callback:
        fail_callback()
    tries = fails if retries < 0 else retries
    logger.warning(f'[Failed - {code}] Gave up fetching {url} after {tries} retries.')
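A usage sketch for this function, assuming the project-level constants (INTERVAL, HTTP_METHODS, FORBIDDEN_CODE, FAIL_MAX_TO_DROP, PROXY_ENABLE, ...) and the should_verify / fake_*_response validators are importable; the URL, headers, and proxy values below are placeholders, not anything from the project:

import requests

session = requests.Session()
result = send_http(session, 'get', 'https://www.example.com/shop/123',
                   retries=3, kind='detail', timeout=10,
                   headers={'User-Agent': 'Mozilla/5.0'},
                   proxy={'http': 'http://127.0.0.1:8888'})
if result:                      # None means every retry was exhausted
    response, proxy, headers = result
    print(response.status_code, len(response.text))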
Example 3
def send_request(method,
                 url,
                 session=None,
                 JSON=False,
                 DATA=False,
                 retries=MAX_RETRY,
                 **kwargs):
    proxy_on = False
    if method.lower() not in HTTP_METHODS:
        raise Exception(f'Illegal HTTP method: {method}.')
    if session is None:
        session = requests.session()
    if retries == -1:
        attempt = -1
    elif retries == 0:
        attempt = 1
    else:
        attempt = retries + 1
    while attempt != 0:
        try:
            response = session.request(method, url, **kwargs)
            code = response.status_code
        except Exception as e:
            logger.error(f'[Request exception] {e.__class__.__name__}: {e}')
            kwargs['proxies'] = get_proxy()
            proxy_on = True
            attempt -= 1
            continue
        if code not in OK_CODE:
            print(response.text)
            logger.error(f'[{code}] Abnormal response page. Retrying with a proxy.')
            kwargs['proxies'] = get_proxy()
            proxy_on = True
            attempt -= 1
            continue
        if JSON:
            try:
                response = response.json()
            except Exception as e:
                logger.error(f'[Invalid JSON] {e.__class__.__name__}: {e}')
                if proxy_on:
                    kwargs['proxies'] = get_proxy()
                attempt -= 1
                continue
            else:
                if DATA:
                    data = response.get('data')
                    next_page = response.get('next')
                    cursor = response.get('cursor')
                    has_more = response.get('has_more')
                    # A cursor field (even 0) means the payload is usable as-is.
                    if cursor == 0 or cursor:
                        return response
                    if not data and not next_page:
                        if has_more == 0:
                            return response
                        logger.debug(f'[Empty data returned]: {response}')
                        if response == {}:
                            # r = requests.get(URL_HOST, headers=HEADERS)
                            # cookie = r.cookies.get('tt_webid')
                            # kwargs['cookies'] = {'tt_webid':cookie}
                            # continue
                            return response
                        if proxy_on:
                            kwargs['proxies'] = get_proxy()
                        attempt -= 1
                        continue
        time.sleep(DELAYS)
        return response
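A hedged call sketch for the JSON/DATA path: the field names (data, next, cursor, has_more) come from the checks above, while the endpoint and query parameters are assumptions for illustration only:

page = send_request('get', 'https://example.com/api/feed',
                    JSON=True, DATA=True, retries=3,
                    params={'cursor': 0, 'count': 20})
if page:  # a dict when JSON=True and the payload passed the checks above
    for item in page.get('data') or []:
        print(item.get('id'))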
Example 4
def send_http(session,
              method,
              url,
              *,
              retries=1,
              interval=INTERVAL,
              proxy=None,
              timeout=None,
              kind=None,
              success_callback=None,
              fail_callback=None,
              headers=None,
              _token=None,
              **kwargs):
    tname = 'dianping'

    # Check the response cache first
    if method.lower() == "post":
        data = json.dumps(kwargs['data'])
        _id = md5("{}{}".format(url, data))
        logger.info("url: {} method: post headers: {} data: {}".format(
            url, json.dumps(headers), data))
    else:
        _id = md5(url)
        logger.info("url: {} method: {} headers: {}".format(
            url, method, json.dumps(headers)))
    cacheDB = init_cache_db()
    cache = cacheDB.select(condition={'_id': {'=': _id}}, tname=tname)
    if cache:
        logger.info(f'Cache hit for request [{_id}]: {url}')
        return pickle.loads(cache[0]['response']), proxy, headers

    # The _token is appended only after the cache lookup so the cache key stays stable.
    if _token is not None:
        url = "{}&_token={}".format(url, _token)

    forbiddens = 0
    fails = 0
    _404 = 0
    code = None  # last seen status code; stays None if no request ever completed
    _ua = copy.deepcopy(USER_AGENTS)
    if method.lower() not in HTTP_METHODS:
        return
    if retries == -1:
        attempt = -1
    elif retries == 0:
        attempt = 1
    else:
        attempt = retries + 1

    # Make sure a User-Agent header is always present.
    if not headers.get('User-Agent'):
        headers['User-Agent'] = _ua.pop(random.randrange(len(_ua)))

    while attempt != 0:
        try:
            try:
                response = getattr(session, method.lower())(url,
                                                            timeout=timeout,
                                                            proxies=proxy,
                                                            headers=headers,
                                                            **kwargs)
            except Exception as e:
                logger.debug(f'[Request exception - proxy: {proxy}] {e.__class__.__name__}: {e}')
                fails += 1
                if PROXY_ENABLE:
                    proxy = get_proxy()
                raise e

            code = response.status_code
            if code == 404:
                if _404 > NOT_FOUND_MAX_TO_DROP:
                    break
                _404 += 1
            if code in FORBIDDEN_CODE or \
                    should_verify(response) or \
                    fake_detail_response(response, kind) or \
                    fake_pages_response(response, kind) or \
                    fake_css_response(response, kind) or \
                    fake_city_response(response, kind) or \
                    fake_json_response(response, kind) or \
                    fake_map_response(response, kind) or \
                    fake_city_list_response(response, kind):
                fails += 1
                logger.debug(
                    f'[Bad proxy - {code}] {proxy} request rejected. {headers["User-Agent"]}')
                if code in FORBIDDEN_CODE:
                    forbiddens += 1
                if not _ua:
                    _ua = copy.deepcopy(USER_AGENTS)
                if forbiddens > FORBIDDEN_MAX_TO_CHANGE and _ua:
                    # Rotate to a random User-Agent from the remaining pool.
                    headers['User-Agent'] = _ua.pop(random.randrange(len(_ua)))
                    # forbiddens = 0
                    logger.debug(f'Switched UA: {headers["User-Agent"]}')
                if fails > FAIL_MAX_TO_DROP:
                    break
                if PROXY_ENABLE:
                    proxy = get_proxy()
                raise ForbiddenProxy
            if success_callback:
                success_callback(response)
            # TODO: cache the page result in MongoDB
            logger.debug(f'Request succeeded: [proxy: {proxy}, UA: {headers["User-Agent"]}]')
            # Cache the pickled response under the request's _id.
            cacheDB.save(data={'_id': _id, 'response': pickle.dumps(response)},
                         tname=tname)
            return response, proxy, headers
        except Exception:
            if RANDOM_SLEEP:
                time.sleep(random.uniform(*RANDOM_INTERVAL))
            else:
                time.sleep(interval)
        attempt -= 1
    if fail_callback:
        fail_callback()
    tries = fails if retries < 0 else retries
    logger.warning(f'[Failed - {code}] Gave up fetching {url} after {tries} retries.')
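The cache layer in this example depends on two project helpers that are not shown: init_cache_db, whose select/save interface is specific to this repo, and md5, which turns the request URL (plus POST body) into the cache key. A minimal sketch of the latter, as an assumption about its intended behaviour:

import hashlib

def md5(text):
    # Hypothetical helper: hex MD5 digest of the request signature,
    # used as the `_id` cache key above.
    return hashlib.md5(text.encode('utf-8')).hexdigest()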