def tunnel_request_data(host, port, proxy_auth_header=None):
    """Return the raw bytes of an HTTP/1.1 ``CONNECT`` request for a proxy tunnel.

    The request consists of the ``CONNECT host:port HTTP/1.1`` request line,
    a ``Host: host:port`` header, an optional ``Proxy-Authorization`` header
    and the terminating blank line, every line ending in CRLF.

    :param host: target host name; converted with ``to_bytes`` using ASCII.
    :param port: target port; stringified before conversion to bytes.
    :param proxy_auth_header: value for the ``Proxy-Authorization`` header,
        concatenated to bytes as-is; the header is omitted when it is falsy.
    :return: the complete request as a single bytes object.
    """
    authority = b':'.join((to_bytes(host, encoding='ascii'), to_bytes(str(port))))

    request_lines = [
        b'CONNECT ' + authority + b' HTTP/1.1',
        b'Host: ' + authority,
    ]
    if proxy_auth_header:
        request_lines.append(b'Proxy-Authorization: ' + proxy_auth_header)

    # The two empty items yield the CRLF that ends the last header line and
    # the blank line that terminates the header section.
    return b'\r\n'.join(request_lines + [b'', b''])
def download_request(self, request):
    """Issue ``request`` through an agent and return the Deferred tracking it.

    The agent is picked according to the redirect setting, the request is
    sent, and the latency, body and timeout callbacks are attached to the
    resulting Deferred. A ``reactor.callLater`` call is scheduled to cancel
    the Deferred once ``timeout`` seconds have elapsed; the delayed call is
    stored in ``self._timeout_cl``.

    :param request: the request to download; its ``meta``, ``url``,
        ``method``, ``headers`` and ``body`` attributes are read. It is
        also stored on ``self.request``.
    :return: the Deferred returned by ``agent.request`` with the callbacks
        attached.
    :raises Exception: whatever error occurred (after it has been logged)
        when the failure happened before the Deferred was created, since
        there is then nothing meaningful to return.
    """
    # Per-request meta values win over the instance defaults.
    # NOTE(review): because of ``or``, a falsy meta value (e.g.
    # download_redirect=False) falls back to the instance default --
    # confirm that this is intended.
    timeout = request.meta.get('download_timeout') or self._connectTimeout
    redirect = request.meta.get('download_redirect') or self._redirect
    self.request = request

    # Bound up front so the error path can tell whether a Deferred exists.
    d = None
    try:
        logger.debug(*self.lfm.crawled('Request', request,
                                       '执行download_request,超时时间:',
                                       {'time': timeout}))
        if redirect:
            agent = self._getRedirectAgent(timeout)
        else:
            agent = self._getAgent(timeout)

        # URL layout: protocol://hostname[:port]/path/[;parameters][?query]#fragment
        # urldefrag strips the fragment part.
        url = urldefrag(request.url)[0]
        method = to_bytes(request.method)
        headers = request.headers

        if request.body:
            bodyproducer = _RequestBodyProducer(request.body)
        elif method == b'POST':
            # A POST without a body still gets an (empty) body producer.
            bodyproducer = _RequestBodyProducer(b'')
        else:
            bodyproducer = None

        # time.clock() was removed in Python 3.8; keep using it where it
        # still exists and fall back to perf_counter() otherwise.
        # NOTE(review): _cb_latency receives start_time and must measure
        # with the same clock -- confirm.
        clock = getattr(time, 'clock', None) or time.perf_counter
        start_time = clock()

        d = agent.request(method, to_bytes(url), headers, bodyproducer)
        d.addCallback(self._cb_latency, request, start_time)
        # Body retrieval callbacks.
        d.addCallback(self._cb_body_get, request)
        d.addCallback(self._cb_body_done, request, url)

        # Timeout guard: cancel the Deferred if no result has arrived within
        # ``timeout`` seconds. callLater cannot fire while a callback keeps
        # the reactor busy; it only runs once control is back in the reactor
        # main loop. Receiving data does not hold the reactor continuously
        # (control returns to the loop while waiting for more data), so
        # d.cancel runs once the total receive time exceeds ``timeout``.
        self._timeout_cl = reactor.callLater(timeout, d.cancel)
        d.addBoth(self._cb_timeout, url, timeout)
    except Exception as e:
        logger.error(*self.lfm.error("Request", request, DownloadAgent,
                                     '下载过程中出现错误:'),
                     extra={
                         'exception': e,
                     }, exc_info=True)
        if d is None:
            # No Deferred was created: propagate the real error instead of
            # failing with UnboundLocalError on the return below.
            raise
    return d
def body(self, body):
    """Store ``body`` on ``self._body`` as bytes.

    ``None`` is stored as an empty bytes object; any other value is passed
    through ``to_bytes`` together with ``self.encoding``.
    """
    self._body = b'' if body is None else to_bytes(body, self.encoding)