def _connect_to_remote_server(self):
    '''
    Connect to destination.

    Note that connection_from_host has hard-coded `scheme='http'` to avoid
    internal urllib3 logic when scheme is https. We handle ssl and socks
    inside the current method.

    self._conn_pool._get_conn() will either return an existing connection
    or a new one. If it's new, it needs initialization.
    '''
    self._conn_pool = self.server.remote_connection_pool.connection_from_host(
            host=self.hostname, port=int(self.port), scheme='http',
            pool_kwargs={'maxsize': 12, 'timeout': self._socket_timeout})

    self._remote_server_conn = self._conn_pool._get_conn()
    if is_connection_dropped(self._remote_server_conn):
        if self.onion_tor_socks_proxy_host and self.hostname.endswith('.onion'):
            self.logger.info(
                    "using tor socks proxy at %s:%s to connect to %s",
                    self.onion_tor_socks_proxy_host,
                    self.onion_tor_socks_proxy_port or 1080, self.hostname)
            self._remote_server_conn.sock = socks.socksocket()
            self._remote_server_conn.sock.set_proxy(
                    socks.SOCKS5, addr=self.onion_tor_socks_proxy_host,
                    port=self.onion_tor_socks_proxy_port, rdns=True)
            self._remote_server_conn.sock.settimeout(self._socket_timeout)
            self._remote_server_conn.sock.connect((self.hostname, int(self.port)))
        else:
            self._remote_server_conn.connect()

        # Wrap socket if SSL is required
        if self.is_connect:
            try:
                context = ssl.create_default_context()
                context.check_hostname = False
                context.verify_mode = ssl.CERT_NONE
                self._remote_server_conn.sock = context.wrap_socket(
                        self._remote_server_conn.sock,
                        server_hostname=self.hostname)
            except AttributeError:
                try:
                    self._remote_server_conn.sock = ssl.wrap_socket(
                            self._remote_server_conn.sock)
                except ssl.SSLError:
                    self.logger.warning(
                            "failed to establish ssl connection to %s; "
                            "python ssl library does not support SNI, "
                            "consider upgrading to python 2.7.9+ or 3.4+",
                            self.hostname)
                    raise

    return self._remote_server_conn.sock
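# A minimal standalone sketch of the technique above: dial through a SOCKS5
# proxy with PySocks, then wrap the socket in TLS without certificate
# verification. The proxy address, default port, and function name below are
# illustrative assumptions, not values taken from warcprox.
import socks
import ssl

def open_tls_socket_via_socks5(host, port, proxy_host='127.0.0.1',
                               proxy_port=9050, timeout=60):
    sock = socks.socksocket()
    # rdns=True asks the proxy to resolve the hostname; this is required for
    # .onion addresses, which have no public DNS entry
    sock.set_proxy(socks.SOCKS5, addr=proxy_host, port=proxy_port, rdns=True)
    sock.settimeout(timeout)
    sock.connect((host, port))

    # Disable verification, as the method above does for intercepted traffic
    context = ssl.create_default_context()
    context.check_hostname = False
    context.verify_mode = ssl.CERT_NONE
    return context.wrap_socket(sock, server_hostname=host)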
def _proxy_request(self, extra_response_headers={}):
    '''
    Sends the request to the remote server, then uses a ProxyingRecorder to
    read the response and send it to the proxy client, while recording the
    bytes in transit. Returns a tuple (request, response) where request is
    the raw request bytes, and response is a ProxyingRecorder.

    :param extra_response_headers: generated on warcprox._proxy_request.
    It may contain extra HTTP headers such as ``Warcprox-Meta`` which are
    written in the WARC record for this request.
    '''
    # Build request
    req_str = '{} {} {}\r\n'.format(
            self.command, self.path, self.request_version)

    # Swallow headers that don't make sense to forward on, i.e. most
    # hop-by-hop headers, see
    # http://tools.ietf.org/html/rfc2616#section-13.5.
    # self.headers is an email.message.Message, which is case-insensitive
    # and doesn't throw KeyError in __delitem__
    for key in (
            'Connection', 'Proxy-Connection', 'Keep-Alive',
            'Proxy-Authenticate', 'Proxy-Authorization', 'Upgrade'):
        del self.headers[key]

    self.headers['Via'] = via_header_value(
            self.headers.get('Via'),
            self.request_version.replace('HTTP/', ''))

    # Add headers to the request
    # XXX in at least python3.3 str(self.headers) uses \n not \r\n :(
    req_str += '\r\n'.join(
            '{}: {}'.format(k, v) for (k, v) in self.headers.items())

    req = req_str.encode('latin1') + b'\r\n\r\n'

    # Append message body if present to the request
    if 'Content-Length' in self.headers:
        req += self.rfile.read(int(self.headers['Content-Length']))

    prox_rec_res = None
    try:
        self.logger.debug('sending to remote server req=%r', req)

        # Send it down the pipe!
        self._remote_server_conn.sock.sendall(req)

        prox_rec_res = ProxyingRecordingHTTPResponse(
                self._remote_server_conn.sock, proxy_client=self.connection,
                digest_algorithm=self.server.digest_algorithm,
                url=self.url, method=self.command,
                tmp_file_max_memory_size=self._tmp_file_max_memory_size)
        prox_rec_res.begin(extra_response_headers=extra_response_headers)

        buf = prox_rec_res.read(65536)
        while buf != b'':
            buf = prox_rec_res.read(65536)
            if (self._max_resource_size
                    and prox_rec_res.recorder.len > self._max_resource_size):
                prox_rec_res.truncated = b'length'
                self.logger.error(
                        'Max resource size %d bytes exceeded for URL %s',
                        self._max_resource_size, self.url)
                break

        self.log_request(prox_rec_res.status, prox_rec_res.recorder.len)
        # Let's close off the remote end. If remote connection is fine,
        # put it back in the pool to reuse it later.
        if not is_connection_dropped(self._remote_server_conn):
            self._conn_pool._put_conn(self._remote_server_conn)
    except:
        # Close the connection and re-raise so the caller sees the failure
        # instead of a possibly-None response
        self._remote_server_conn.sock.close()
        raise
    finally:
        if prox_rec_res:
            prox_rec_res.close()

    return req, prox_rec_res
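# `via_header_value` is called above but not defined in this section. A
# minimal sketch of what it plausibly does, per the Via header grammar in
# https://tools.ietf.org/html/rfc7230#section-5.7.1 -- an assumption, not
# necessarily the exact warcprox implementation:
def via_header_value(orig, request_version):
    # Append this proxy's entry after any existing entries, e.g. an incoming
    # 'Via: 1.0 upstream' on an HTTP/1.1 request becomes
    # 'Via: 1.0 upstream, 1.1 warcprox'
    via = orig + ', ' if orig else ''
    return via + '{} {}'.format(request_version, 'warcprox')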
def _inner_proxy_request(self, extra_response_headers={}):
    '''
    Sends the request to the remote server, then uses a ProxyingRecorder to
    read the response and send it to the proxy client, while recording the
    bytes in transit. Returns a tuple (request, response) where request is
    the raw request bytes, and response is a ProxyingRecorder.

    :param extra_response_headers: generated on warcprox._proxy_request.
    It may contain extra HTTP headers such as ``Warcprox-Meta`` which are
    written in the WARC record for this request.
    '''
    self._swallow_hop_by_hop_headers()
    self.headers['Via'] = via_header_value(
            self.headers.get('Via'),
            self.request_version.replace('HTTP/', ''))
    req = self._build_request()

    # Append message body if present to the request
    if 'Content-Length' in self.headers:
        req += self.rfile.read(int(self.headers['Content-Length']))

    prox_rec_res = None
    start = time.time()
    try:
        self.logger.debug('sending to remote server req=%r', req)

        # Send it down the pipe!
        self._remote_server_conn.sock.sendall(req)

        prox_rec_res = ProxyingRecordingHTTPResponse(
                self._remote_server_conn.sock, proxy_client=self.connection,
                digest_algorithm=self.server.digest_algorithm,
                url=self.url, method=self.command,
                tmp_file_max_memory_size=self._tmp_file_max_memory_size)
        prox_rec_res.begin(extra_response_headers=extra_response_headers)

        buf = None
        while buf != b'':
            try:
                buf = prox_rec_res.read(65536)
            except http_client.IncompleteRead as e:
                self.logger.warning('%s from %s', e, self.url)
                buf = e.partial

            if (self._max_resource_size
                    and prox_rec_res.recorder.len > self._max_resource_size):
                prox_rec_res.truncated = b'length'
                self._remote_server_conn.sock.shutdown(socket.SHUT_RDWR)
                self._remote_server_conn.sock.close()
                self.logger.info(
                        'truncating response because max resource size %d '
                        'bytes exceeded for URL %s',
                        self._max_resource_size, self.url)
                break
            elif ('content-length' not in self.headers
                    and time.time() - start > 3 * 60 * 60):
                prox_rec_res.truncated = b'time'
                self._remote_server_conn.sock.shutdown(socket.SHUT_RDWR)
                self._remote_server_conn.sock.close()
                self.logger.info(
                        'reached hard timeout of 3 hours fetching url '
                        'without content-length: %s', self.url)
                break

        self.log_request(prox_rec_res.status, prox_rec_res.recorder.len)
        # Let's close off the remote end. If remote connection is fine,
        # put it back in the pool to reuse it later.
        if not is_connection_dropped(self._remote_server_conn):
            self._conn_pool._put_conn(self._remote_server_conn)
    except Exception as e:
        # A common error is to connect to the remote server successfully
        # but raise a `RemoteDisconnected` exception when trying to begin
        # downloading. It's caused by prox_rec_res.begin(...) which calls
        # http_client._read_status(). The connection fails there.
        # https://github.com/python/cpython/blob/3.7/Lib/http/client.py#L275
        # Another case is when the connection is fine but the response
        # status is problematic, raising `BadStatusLine`.
        # https://github.com/python/cpython/blob/3.7/Lib/http/client.py#L296
        # In both cases, the host is bad and we must add it to the
        # `bad_hostnames_ports` cache.
        if isinstance(e, (http_client.RemoteDisconnected,
                          http_client.BadStatusLine)):
            host_port = self._hostname_port_cache_key()
            with self.server.bad_hostnames_ports_lock:
                self.server.bad_hostnames_ports[host_port] = 502
                self.logger.info('bad_hostnames_ports cache size: %d',
                                 len(self.server.bad_hostnames_ports))

        # Close the connection only if it's still open. If it's already
        # closed, an `OSError` "([Errno 107] Transport endpoint is not
        # connected)" would be raised.
        if not is_connection_dropped(self._remote_server_conn):
            self._remote_server_conn.sock.shutdown(socket.SHUT_RDWR)
            self._remote_server_conn.sock.close()
        raise
    finally:
        if prox_rec_res:
            prox_rec_res.close()

    return req, prox_rec_res
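# `_swallow_hop_by_hop_headers` and `_build_request` are called above but not
# defined in this section. A plausible factoring, lifted directly from the
# inline version in `_proxy_request` above -- a sketch of methods that would
# live on the same handler class, not necessarily verbatim warcprox code:
def _swallow_hop_by_hop_headers(self):
    # Swallow headers that don't make sense to forward on, i.e. most
    # hop-by-hop headers, see
    # http://tools.ietf.org/html/rfc2616#section-13.5.
    # self.headers is an email.message.Message, which is case-insensitive
    # and doesn't throw KeyError in __delitem__
    for key in (
            'Connection', 'Proxy-Connection', 'Keep-Alive',
            'Proxy-Authenticate', 'Proxy-Authorization', 'Upgrade'):
        del self.headers[key]

def _build_request(self):
    # Request line plus headers, encoded for the wire
    req_str = '{} {} {}\r\n'.format(
            self.command, self.path, self.request_version)
    # XXX in at least python3.3 str(self.headers) uses \n not \r\n :(
    req_str += '\r\n'.join(
            '{}: {}'.format(k, v) for (k, v) in self.headers.items())
    return req_str.encode('latin1') + b'\r\n\r\n'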
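# The `bad_hostnames_ports` cache consulted in the except block above is not
# defined in this section. One plausible shape is a TTL-bounded mapping of
# (hostname, port) to an HTTP status code, sketched here with cachetools; the
# size, TTL, and helper names below are illustrative assumptions:
import threading
from cachetools import TTLCache

bad_hostnames_ports = TTLCache(maxsize=1024, ttl=60)  # entries expire after 60s
bad_hostnames_ports_lock = threading.RLock()

def remember_bad_host(hostname, port, status=502):
    with bad_hostnames_ports_lock:
        bad_hostnames_ports[(hostname, port)] = status

def cached_bad_status(hostname, port):
    # Returns e.g. 502 if the host failed recently, else None; lets the proxy
    # short-circuit requests to known-bad hosts without reconnecting
    with bad_hostnames_ports_lock:
        return bad_hostnames_ports.get((hostname, port))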