def request_token(proxy_api, url=API_GET_TOKEN):
    """Fetch an auth token from the token endpoint, retrying until one arrives.

    Each attempt acquires a fresh proxy from the proxy pool.  Loops forever
    until the endpoint returns a non-empty JSON body (no retry cap — matches
    original behavior; callers rely on it blocking until a token exists).

    Args:
        proxy_api: argument forwarded to get_proxy() to pick a proxy per attempt.
        url: token endpoint; defaults to API_GET_TOKEN.

    Returns:
        The 'token' field of the JSON response (may be None if the key is absent).
    """
    while True:
        proxy = get_proxy(proxy_api)
        res = requests.get(url, proxies=proxy)
        try:
            data = res.json()
        except Exception:
            # Body was not valid JSON (e.g. an HTML error page from a bad
            # proxy): report the raw payload and try again with a new proxy.
            print('Error:', res.content)
            continue
        if data:
            return data.get('token')
def send_http(session, method, url, *, retries=1, interval=INTERVAL,
              proxy=None, timeout=None, kind=None, success_callback=None,
              fail_callback=None, headers=None, **kwargs):
    """Issue an HTTP request through a rotating proxy pool, with anti-bot checks.

    Retries on network errors, forbidden status codes, and any of the
    ``fake_*_response``/``should_verify`` detectors.  On repeated 403s the
    User-Agent is rotated from USER_AGENTS; dead proxies are removed from the
    pool via delete_proxy().

    Args:
        session: requests.Session-like object (must expose .get/.post/...).
        method: HTTP verb; must be in HTTP_METHODS, otherwise returns None.
        url: target URL.
        retries: -1 = retry forever, 0 = single attempt, n>0 = n retries.
        interval: fixed sleep between attempts when RANDOM_SLEEP is off.
        proxy: initial proxies dict ({'http': ...}); refreshed on failure
            when PROXY_ENABLE is set.
        timeout / kind / headers / **kwargs: forwarded to the request.
        success_callback / fail_callback: hooks on success / final failure.

    Returns:
        (response, proxy, headers) on success; None on final failure or
        unsupported method.
    """
    forbiddens = 0
    fails = 0
    _404 = 0
    _ua = copy.deepcopy(USER_AGENTS)
    if method.lower() not in HTTP_METHODS:
        return
    # retries == -1 means "never stop" (attempt counts down but starts at -1,
    # so the `attempt != 0` loop condition never trips).
    if retries == -1:
        attempt = -1
    elif retries == 0:
        attempt = 1
    else:
        attempt = retries + 1
    logger.debug(f'当前请求[{kind}]:{url}')
    # BUGFIX: `code` is read by the final failure log; without this init a
    # first-attempt connection error caused a NameError there.
    code = None
    while attempt != 0:
        try:
            try:
                response = getattr(session, method.lower())(
                    url, timeout=timeout, proxies=proxy,
                    headers=headers, **kwargs)
            except Exception:
                logger.info(f'[请求异常-代理:{proxy}] {e.__class__.__name__}:{e}'
                            if False else
                            f'[请求异常-代理:{proxy}] {sys.exc_info()[0].__name__}:{sys.exc_info()[1]}')
                fails += 1
                # Best effort: evict the dead proxy from the pool.
                try:
                    delete_proxy(urlparse(proxy.get('http')).netloc)
                except Exception:
                    pass
                if PROXY_ENABLE:
                    proxy = get_proxy()
                raise
            code = response.status_code
            if code == 404:
                if _404 > NOT_FOUND_MAX_TO_DROP:
                    break
                _404 += 1
            # Any of these detectors means the proxy was served a block page,
            # captcha, or fake payload instead of real content.
            if code in FORBIDDEN_CODE or \
                    should_verify(response) or \
                    fake_detail_response(response, kind) or \
                    fake_pages_response(response, kind) or \
                    fake_css_response(response, kind) or \
                    fake_city_response(response, kind) or \
                    fake_json_response(response, kind) or \
                    fake_map_response(response, kind) or \
                    fake_city_list_response(response, kind):
                fails += 1
                logger.info(
                    f'[无效代理-{code}] {proxy} 请求无效.{headers["User-Agent"]}')
                try:
                    delete_proxy(urlparse(proxy.get('http')).netloc)
                except Exception:
                    pass
                if code in FORBIDDEN_CODE:
                    forbiddens += 1
                # Refill the UA pool once exhausted, then rotate the UA after
                # too many consecutive forbidden responses.
                if not _ua:
                    _ua = copy.deepcopy(USER_AGENTS)
                if forbiddens > FORBIDDEN_MAX_TO_CHANGE and _ua:
                    headers['User-Agent'] = _ua.pop(random.randrange(len(_ua)))
                    logger.debug(f'切换UA:{headers["User-Agent"]}')
                if fails > FAIL_MAX_TO_DROP:
                    break
                if PROXY_ENABLE:
                    proxy = get_proxy()
                raise ForbiddenProxy
            if success_callback:
                success_callback(response)
            logger.debug(f'请求成功:[代理:{proxy},UA:{headers["User-Agent"]}]')
            return response, proxy, headers
        except Exception:
            # Was a bare `except:` — narrowed so Ctrl-C / SystemExit still work.
            if RANDOM_SLEEP:
                time.sleep(random.uniform(*RANDOM_INTERVAL))
            else:
                time.sleep(interval)
            attempt -= 1
    if fail_callback:
        fail_callback()
    tries = fails if retries < 0 else retries
    logger.warning(f'[失败-{code}] 重试抓取{url} {tries} 次后失败.')
def send_request(method, url, session=None, JSON=False, DATA=False,
                 retries=MAX_RETRY, **kwargs):
    """Send an HTTP request with retry/proxy fallback and optional JSON checks.

    Starts without a proxy; switches to proxies from get_proxy() after the
    first failure and keeps rotating them on subsequent failures.

    Args:
        method: HTTP verb; must be in HTTP_METHODS, else raises Exception.
        url: target URL.
        session: requests.Session to reuse; a fresh one is created if None.
        JSON: if True, decode the body as JSON and retry on decode errors.
        DATA: if True (with JSON), validate the payload's data/cursor/next/
            has_more fields and retry on empty results.
        retries: -1 = retry forever, 0 = single attempt, n>0 = n retries.
        **kwargs: forwarded to session.request (mutated to inject 'proxies').

    Returns:
        The Response (or the decoded JSON dict when JSON=True), or None when
        all attempts are exhausted.
    """
    proxy_on = False
    if method.lower() not in HTTP_METHODS:
        raise Exception(f'非法请求操作:{method}.')
    if session is None:
        session = requests.session()
    if retries == -1:
        attempt = -1          # loop forever: `attempt != 0` never trips
    elif retries == 0:
        attempt = 1
    else:
        attempt = retries + 1
    while attempt != 0:
        try:
            response = session.request(method, url, **kwargs)
            code = response.status_code
        except Exception as e:
            logger.error(f'[请求异常]{e.__class__.__name__}:{e}')
            kwargs['proxies'] = get_proxy()
            proxy_on = True
            attempt -= 1
            continue
        if code not in OK_CODE:
            # NOTE(review): debug print left in on purpose to preserve
            # behavior — consider routing through logger instead.
            print(response.text)
            logger.error(f'[{code}]非正常请求页面.使用代理重试中.')
            kwargs['proxies'] = get_proxy()
            proxy_on = True
            attempt -= 1
            continue
        if JSON:
            try:
                response = response.json()
            except Exception as e:
                logger.error(f'[无效json格式]{e.__class__.__name__}:{e}')
                if proxy_on:
                    kwargs['proxies'] = get_proxy()
                attempt -= 1
                continue
            else:
                if DATA:
                    data = response.get('data')
                    # Renamed from `next` — was shadowing the builtin.
                    next_cursor = response.get('next')
                    cursor = response.get('cursor')
                    has_more = response.get('has_more')
                    # Any cursor value (including 0) means a valid page.
                    if cursor == 0 or cursor:
                        return response
                    if not bool(data) and not next_cursor:
                        # BUGFIX: was `has_more is 0` — identity comparison
                        # with an int literal is implementation-dependent.
                        if has_more == 0:
                            return response
                        logger.debug(f'[无数据返回]:{response}')
                        if response == {}:
                            # r = requests.get(URL_HOST, headers=HEADERS)
                            # cookie = r.cookies.get('tt_webid')
                            # kwargs['cookies'] = {'tt_webid':cookie}
                            # continue
                            return response
                        if proxy_on:
                            kwargs['proxies'] = get_proxy()
                        attempt -= 1
                        continue
        time.sleep(DELAYS)
        return response
def send_http(session, method, url, *, retries=1, interval=INTERVAL,
              proxy=None, timeout=None, kind=None, success_callback=None,
              fail_callback=None, headers=None, _token=None, **kwargs):
    """Cached variant of send_http for dianping: checks a local cache before
    hitting the network, and persists successful responses (pickled) back.

    The cache key is md5(url) — or md5(url + body) for POST — computed BEFORE
    the _token query parameter is appended, so the same logical request hits
    the cache regardless of token rotation.

    Args:
        session: requests.Session-like object (.get/.post/...).
        method: HTTP verb; must be in HTTP_METHODS, otherwise returns None.
        url: target URL (without _token; appended here when _token is given).
        retries: -1 = retry forever, 0 = single attempt, n>0 = n retries.
        interval: fixed sleep between attempts when RANDOM_SLEEP is off.
        proxy: initial proxies dict; refreshed on failure when PROXY_ENABLE.
        _token: optional anti-crawl token appended as `&_token=`.
        timeout / kind / headers / **kwargs: forwarded to the request.
        success_callback / fail_callback: hooks on success / final failure.

    Returns:
        (response, proxy, headers) on success or cache hit; None otherwise.
    """
    tname = 'dianping'
    # --- cache lookup -----------------------------------------------------
    if method.lower() == "post":
        data = json.dumps(kwargs['data'])
        _id = md5("{}{}".format(url, data))
        logger.info("url: {} method: post headers: {} data: {}".format(
            url, json.dumps(headers), data))
    else:
        _id = md5(url)
        logger.info("url: {} method: {} headers: {}".format(
            url, method, json.dumps(headers)))
    cacheDB = init_cache_db()
    cache = cacheDB.select(condition={'_id': {'=': _id}}, tname=tname)
    if cache:
        logger.info(f'当前请求[_id]: {url} 缓存命中')
        # NOTE(review): unpickling cached blobs — safe only because the
        # cache DB is local and written by this process.
        return pickle.loads(cache[0]['response']), proxy, headers
    # Token is appended only after the cache key is computed (see docstring).
    if _token is not None:
        url = "{}&_token={}".format(url, _token)
    forbiddens = 0
    fails = 0
    _404 = 0
    _ua = copy.deepcopy(USER_AGENTS)
    if method.lower() not in HTTP_METHODS:
        return
    if retries == -1:
        attempt = -1          # loop forever: `attempt != 0` never trips
    elif retries == 0:
        attempt = 1
    else:
        attempt = retries + 1
    # Guarantee a User-Agent is always present before the first request.
    if not headers.get('User-Agent'):
        headers['User-Agent'] = _ua.pop(random.randrange(len(_ua)))
    # BUGFIX: `code` is read by the final failure log; without this init a
    # first-attempt connection error caused a NameError there.
    code = None
    while attempt != 0:
        try:
            try:
                response = getattr(session, method.lower())(
                    url, timeout=timeout, proxies=proxy,
                    headers=headers, **kwargs)
            except Exception as e:
                logger.debug(f'[请求异常-代理:{proxy}] {e.__class__.__name__}:{e}')
                fails += 1
                if PROXY_ENABLE:
                    proxy = get_proxy()
                raise
            # BUGFIX: removed leftover debug lines `print(response.text)` and
            # `exit()` here — they terminated the whole process after EVERY
            # successful request, making the cache save and return unreachable.
            code = response.status_code
            if code == 404:
                if _404 > NOT_FOUND_MAX_TO_DROP:
                    break
                _404 += 1
            # Any of these detectors means the proxy was served a block page,
            # captcha, or fake payload instead of real content.
            if code in FORBIDDEN_CODE or \
                    should_verify(response) or \
                    fake_detail_response(response, kind) or \
                    fake_pages_response(response, kind) or \
                    fake_css_response(response, kind) or \
                    fake_city_response(response, kind) or \
                    fake_json_response(response, kind) or \
                    fake_map_response(response, kind) or \
                    fake_city_list_response(response, kind):
                fails += 1
                logger.debug(
                    f'[无效代理-{code}] {proxy} 请求无效.{headers["User-Agent"]}')
                if code in FORBIDDEN_CODE:
                    forbiddens += 1
                # Refill the UA pool once exhausted, then rotate the UA after
                # too many consecutive forbidden responses.
                if not _ua:
                    _ua = copy.deepcopy(USER_AGENTS)
                if forbiddens > FORBIDDEN_MAX_TO_CHANGE and _ua:
                    headers['User-Agent'] = _ua.pop(random.randrange(len(_ua)))
                    logger.debug(f'切换UA:{headers["User-Agent"]}')
                if fails > FAIL_MAX_TO_DROP:
                    break
                if PROXY_ENABLE:
                    proxy = get_proxy()
                raise ForbiddenProxy
            if success_callback:
                success_callback(response)
            logger.debug(f'请求成功:[代理:{proxy},UA:{headers["User-Agent"]}]')
            # Persist the successful response for future cache hits.
            cacheDB.save(data={
                "_id": _id,
                "response": pickle.dumps(response)
            }, tname=tname)
            return response, proxy, headers
        except Exception:
            if RANDOM_SLEEP:
                time.sleep(random.uniform(*RANDOM_INTERVAL))
            else:
                time.sleep(interval)
            attempt -= 1
    if fail_callback:
        fail_callback()
    tries = fails if retries < 0 else retries
    logger.warning(f'[失败-{code}] 重试抓取{url} {tries} 次后失败.')