def test_unmodified(writer):
    """
    Single request/response pair, no revisits.

    One request and one response record for the same URL are written through
    ``writer``, the resulting file is passed to ``mergeWarc`` and the merged
    output is compared against ``makeGolden(writer, records)``.

    :param writer: WARC writer whose ``out.name`` is the path of the file it
        writes to (presumably a pytest fixture -- confirm against conftest).
    """
    url = 'http://example.com/'
    warcHeaders = {}
    records = []

    httpHeaders = StatusAndHeaders('GET / HTTP/1.1', {}, is_http_request=True)
    records.append(writer.create_warc_record(
        url, 'request', payload=BytesIO(b'foobar'),
        warc_headers_dict=warcHeaders, http_headers=httpHeaders))

    httpHeaders = StatusAndHeaders('200 OK', {}, protocol='HTTP/1.1')
    records.append(writer.create_warc_record(
        url, 'response', payload=BytesIO(b'data'),
        warc_headers_dict=warcHeaders, http_headers=httpHeaders))

    for record in records:
        writer.write_record(record)

    # The context manager closes the temporary file even when the comparison
    # below fails, instead of leaving it to the garbage collector.
    with NamedTemporaryFile() as output:
        mergeWarc([writer.out.name], output)
        output.seek(0)
        recordsEqual(makeGolden(writer, records), ArchiveIterator(output))
def test_non_ascii_2(): st = StatusAndHeaders( '200 OK', [('Custom-Header', u'value; filename="Éxamplè"; param; other=испытание; another')]) res = st.to_ascii_bytes().decode('ascii') assert res == "\
def test_different_payload(writer):
    """
    Duplicate URL, but different payload.

    Two request/response pairs are written for the same URL; the response
    payloads differ (``data0`` and ``data1``). The merged output is compared
    against ``makeGolden(writer, records)`` built from the unchanged records.

    :param writer: WARC writer whose ``out.name`` is the path of the file it
        writes to (presumably a pytest fixture -- confirm against conftest).
    """
    url = 'http://example.com/'
    records = []
    for i in range(2):
        warcHeaders = {}

        httpHeaders = StatusAndHeaders('GET / HTTP/1.1', {}, is_http_request=True)
        records.append(writer.create_warc_record(
            url, 'request', payload=BytesIO(b'foobar'),
            warc_headers_dict=warcHeaders, http_headers=httpHeaders))

        httpHeaders = StatusAndHeaders('200 OK', {}, protocol='HTTP/1.1')
        records.append(writer.create_warc_record(
            url, 'response', payload=BytesIO(f'data{i}'.encode('utf8')),
            warc_headers_dict=warcHeaders, http_headers=httpHeaders))

    for record in records:
        writer.write_record(record)

    # The context manager closes the temporary file even when the comparison
    # below fails, instead of leaving it to the garbage collector.
    with NamedTemporaryFile() as output:
        mergeWarc([writer.out.name], output)
        output.seek(0)
        recordsEqual(makeGolden(writer, records), ArchiveIterator(output))
def load_resource(self, cdx, params):
    """
    Build a WARC ``metadata`` record from ``self.ydl.extract_info``.

    :param cdx: mapping providing ``load_url`` and ``timestamp``.
    :param params: mapping; only ``content_type`` is read here.
    :return: ``None`` when ``cdx`` has no ``load_url``, when
        ``params['content_type']`` differs from ``self.CONTENT_TYPE`` or when
        ``self.ydl`` is falsy. Otherwise the tuple
        ``(warc_headers, None, payload)`` where ``warc_headers`` is a
        ``StatusAndHeaders`` and ``payload`` is a ``BytesIO`` holding the
        UTF-8 encoded JSON dump of the extracted info.
    """
    source_url = cdx.get('load_url')
    if (not source_url
            or params.get('content_type') != self.CONTENT_TYPE
            or not self.ydl):
        return None

    payload = json.dumps(self.ydl.extract_info(source_url)).encode('utf-8')

    # Keep everything after the scheme; the record is addressed under the
    # metadata:// pseudo-scheme instead of the original one.
    _, remainder = source_url.split('://', 1)
    record_time = timestamp_to_datetime(cdx['timestamp'])

    fields = {
        'WARC-Type': 'metadata',
        'WARC-Record-ID': self._make_warc_id(),
        'WARC-Target-URI': 'metadata://' + remainder,
        'WARC-Date': datetime_to_iso_date(record_time),
        'Content-Type': self.CONTENT_TYPE,
        'Content-Length': str(len(payload)),
    }
    return StatusAndHeaders('WARC/1.0', fields.items()), None, BytesIO(payload)
def test_resp_revisit_same_url(writer):
    """
    Duplicate record for the same URL, creates a revisit.

    Two identical request/response pairs are written. For the expected
    ("golden") output the second response is replaced by
    ``makeRevisit(writer, ref, dup)`` where ``ref`` is the first response.

    :param writer: WARC writer whose ``out.name`` is the path of the file it
        writes to (presumably a pytest fixture -- confirm against conftest).
    """
    url = 'http://example.com/'
    records = []
    for _ in range(2):
        warcHeaders = {}

        httpHeaders = StatusAndHeaders('GET / HTTP/1.1', {}, is_http_request=True)
        records.append(writer.create_warc_record(
            url, 'request', payload=BytesIO(b'foobar'),
            warc_headers_dict=warcHeaders, http_headers=httpHeaders))

        httpHeaders = StatusAndHeaders('200 OK', {}, protocol='HTTP/1.1')
        records.append(writer.create_warc_record(
            url, 'response', payload=BytesIO(b'data'),
            warc_headers_dict=warcHeaders, http_headers=httpHeaders))

    for record in records:
        writer.write_record(record)

    # Expected output: the last (duplicate) response becomes a revisit that
    # refers to the first response, which sits at index 1.
    dup = records.pop()
    ref = records[1]
    records.append(makeRevisit(writer, ref, dup))

    # The context manager closes the temporary file even when the comparison
    # below fails, instead of leaving it to the garbage collector.
    with NamedTemporaryFile() as output:
        mergeWarc([writer.out.name], output)
        output.seek(0)
        recordsEqual(makeGolden(writer, records), ArchiveIterator(output))
def write_request_response_pair(self, url, ip, req_headers, resp_headers,
                                is_truncated, payload, digest=None, decompressed=False):
    """
    Write a WARC request/response record pair for ``url``.

    :param url: target URI of the response record.
    :param ip: ``None``, a string, or an indexable whose first element is
        used; stored as ``WARC-IP-Address`` when not ``None``.
    :param req_headers: headers for the request record.
    :param resp_headers: headers passed through ``self._fake_resp_headers``
        before being used for the response record.
    :param is_truncated: falsy for a complete payload, otherwise a truncation
        reason. Reasons not in ``valid_truncations`` are logged and recorded
        as ``'unspecified'``.
    :param payload: response body; must support ``len()`` and ``BytesIO()``.
    :param digest: optional value for ``WARC-Payload-Digest``.
    :param decompressed: forwarded to ``self._fake_resp_headers``.
    """
    if self.writer is None:
        self.open()

    # NOTE(review): the request record uses a fixed placeholder URL and
    # request line rather than `url` -- confirm this is intended.
    req_http_headers = StatusAndHeaders('GET / HTTP/1.1', req_headers)
    warc_headers_dict = OrderedDict()
    warc_headers_dict['WARC-Warcinfo-ID'] = self.warcinfo_id
    request = self.writer.create_warc_record(
        'http://example.com/', 'request',
        warc_headers_dict=warc_headers_dict, http_headers=req_http_headers)

    fake_resp_headers = self._fake_resp_headers(resp_headers, len(payload), decompressed=decompressed)
    resp_http_headers = StatusAndHeaders('200 OK', fake_resp_headers, protocol='HTTP/1.1')

    warc_headers_dict = OrderedDict()
    warc_headers_dict['WARC-Warcinfo-ID'] = self.warcinfo_id
    if ip is not None:
        if not isinstance(ip, str):
            ip = ip[0]
        warc_headers_dict['WARC-IP-Address'] = ip
    if digest is not None:
        warc_headers_dict['WARC-Payload-Digest'] = digest
    if is_truncated:
        if is_truncated in valid_truncations:
            warc_headers_dict['WARC-Truncated'] = is_truncated
        else:
            # Lazy %-formatting: a non-string truthy value (e.g. True) made
            # the former string concatenation raise TypeError here.
            LOGGER.error('Invalid is_truncation of %s', is_truncated)
            warc_headers_dict['WARC-Truncated'] = 'unspecified'

    response = self.writer.create_warc_record(
        url, 'response', payload=BytesIO(payload), length=len(payload),
        warc_headers_dict=warc_headers_dict, http_headers=resp_http_headers)

    self.writer.write_request_response_pair(request, response)
    self.maybe_close()
    LOGGER.debug('wrote warc request-response pair%s for url %s', p(self.prefix), url)
    stats.stats_sum('warc r/r' + p(self.prefix), 1)
def process_document(self, doc):
    """
    Archive a successfully fetched document and update the crawl limits.

    Documents whose ``status`` is not 200 are ignored. Otherwise, while
    holding ``self.concurrency_lock``, the document is written as a WARC
    ``response`` record through the module-level ``writer``,
    ``self.crawlsize`` grows by ``sys.getsizeof(doc.text) / 1000000.0`` and,
    when the size or time limit is exceeded, ``self.interrupt`` is set and
    ``self.save_status()`` is called.

    :param doc: object with ``status``, ``url``, ``text`` and a ``response``
        offering ``getheaders()``.
    """
    if doc.status != 200:
        return

    self.concurrency_lock.acquire()
    try:
        response_headers = StatusAndHeaders(
            '200 OK', doc.response.getheaders(), protocol='HTTP/1.0')
        warc_record = writer.create_warc_record(
            doc.url, 'response',
            payload=io.BytesIO(doc.text), http_headers=response_headers)
        writer.write_record(warc_record)

        self.crawlsize += sys.getsizeof(doc.text) / 1000000.0

        # Each limit is evaluated right before it is acted upon; both may
        # trigger in the same call.
        if self.sizelimit is not None and self.crawlsize > self.sizelimit:
            self.interrupt = True
            self.save_status()
        if self.timelimit is not None and time.time() - self.crawlstarts > self.timelimit:
            self.interrupt = True
            self.save_status()
    finally:
        self.concurrency_lock.release()
def create_redirect_record(self, url, redirect_url, timestamp, status='301'):
    """
    Write and return a ``response`` record that redirects ``url``.

    The record has an empty body, a ``Location`` header pointing at
    ``redirect_url`` and a status line of ``status + ' Redirect'``.

    :param url: target URI of the record.
    :param redirect_url: value of the ``Location`` header.
    :param timestamp: converted with ``timestamp_to_iso_date`` for ``WARC-Date``.
    :param status: HTTP status code as a string.
    :return: the record that was written via ``self.writer``.
    """
    record_headers = {'WARC-Date': timestamp_to_iso_date(timestamp)}

    # The redirect carries no body.
    body = b''
    size = len(body)

    response_headers = StatusAndHeaders(
        status + ' Redirect',
        [('Content-Length', str(size)), ('Location', redirect_url)],
        protocol='HTTP/1.0')

    redirect = self.writer.create_warc_record(
        url, 'response',
        payload=BytesIO(body), length=size,
        http_headers=response_headers, warc_headers_dict=record_headers)
    self.writer.write_record(redirect)
    return redirect
def run(self):
    """
    Consume ``self.out_queue`` forever and append its items to ``self.warcfile``.

    Each queue item is indexed as ``(headers, payload, url, status code,
    reason)``. The SHA-1 of the first ``BLOCK_SIZE`` bytes of the record's raw
    stream is looked up in ``self.dedup``: known digests are written as
    ``revisit`` records, unknown ones are saved and written as ``response``
    records.

    The loop has no exit condition; it ends only when an exception
    propagates. The lock is released and ``task_done()`` is called in
    ``finally`` blocks so that a failing item neither leaves ``self.lock``
    held nor blocks ``out_queue.join()``.
    """
    with open(self.warcfile, 'ab') as output:
        # The writer only wraps `output`, so one instance serves all items.
        writer = WARCWriter(output, gzip=False)
        while True:
            self.lock.acquire()
            try:
                data = self.out_queue.get()
                try:
                    url = data[2]
                    http_headers = StatusAndHeaders(
                        '{} {}'.format(data[3], data[4]), data[0], protocol='HTTP/1.0')
                    record = writer.create_warc_record(
                        url, 'response', payload=data[1], http_headers=http_headers)

                    digest = hashlib.sha1(record.raw_stream.read(BLOCK_SIZE)).hexdigest()
                    if self.dedup.lookup(digest):
                        record = writer.create_warc_record(
                            url, 'revisit', http_headers=http_headers)
                    else:
                        self.dedup.save(digest, url)
                        # Hashing consumed the start of the stream; rewind
                        # before the record is written.
                        record.raw_stream.seek(0)
                    writer.write_record(record)
                finally:
                    self.out_queue.task_done()
            finally:
                self.lock.release()
def fetch_urls_to_warc(urls, warcfile_path):
    """Fetch urls and write the responses to a gzipped WARC file.

    :param urls: iterable of urls to binary files
    :param warcfile_path: path to a WARC file; opened with mode ``'wb'``, so
        an existing file is overwritten.
    """
    with open(warcfile_path, 'wb') as output:
        writer = WARCWriter(output, gzip=True)
        for url in urls:
            print(url)
            resp = requests.get(url, headers={'Accept-Encoding': 'identity'}, stream=True)
            try:
                # raw headers from urllib3
                headers_list = resp.raw.headers.items()
                # NOTE(review): the status line is always '200 OK', whatever
                # the server answered -- confirm this is intended.
                http_headers = StatusAndHeaders('200 OK', headers_list, protocol='HTTP/1.0')
                record = writer.create_warc_record(
                    url, 'response', payload=resp.raw, http_headers=http_headers)
                writer.write_record(record)
            finally:
                # With stream=True the connection is only released once the
                # response is closed (or fully consumed).
                resp.close()
def _writeRequest(self, item):
    """
    Write the ``request`` record for ``item`` and return its WARC-Record-ID.

    When the request claims to have POST data but no body is available, an
    error is logged and the record is marked ``WARC-Truncated: unspecified``.
    Otherwise the body is wrapped in a ``BytesIO`` and ``X-Chrome-Base64Body``
    records whether it was exactly a ``Base64Body``.

    :param item: object exposing ``id``, ``url`` and ``request``.
    :return: value of the written record's ``WARC-Record-ID`` header.
    """
    logger = self.logger.bind(reqId=item.id)
    request = item.request
    target = item.url

    # Request line uses the relative URL without its fragment.
    requestPath = target.relative().with_fragment(None)
    requestLine = f'{request.method} {requestPath} HTTP/1.1'
    httpHeaders = StatusAndHeaders(requestLine, request.headers,
                                   protocol='HTTP/1.1', is_http_request=True)

    warcHeaders = {
        'X-Chrome-Initiator': json.dumps(request.initiator),
        'X-Chrome-Request-ID': item.id,
        'WARC-Date': datetime_to_iso_date(request.timestamp),
    }

    payload = request.body
    bodyMissing = request.hasPostData and payload is None
    if bodyMissing:
        # POST data was announced, but none could be obtained
        logger.error('requestBody missing',
                     uuid='ee9adc58-e723-4595-9feb-312a67ead6a0')
        warcHeaders['WARC-Truncated'] = 'unspecified'
    else:
        warcHeaders['X-Chrome-Base64Body'] = str(type(payload) is Base64Body)
        payload = BytesIO(payload)

    written = self.writeRecord(target, 'request', payload=payload,
                               http_headers=httpHeaders,
                               warc_headers_dict=warcHeaders)
    return written.rec_headers['WARC-Record-ID']
def _writeResponse(self, item, concurrentTo):
    """
    Write the ``response`` record for ``item``.

    ``Transfer-Encoding`` and ``Content-Encoding`` are removed,
    ``Content-Type`` is replaced by the response's mime type (plus
    ``; charset=utf-8`` for ``UnicodeBody`` bodies) and ``Content-Length`` is
    set to the body length. A missing body marks the record as
    ``WARC-Truncated: unspecified``. For items whose ``resourceType`` is
    ``'Document'`` the record ID is remembered in ``self.documentRecords``.

    :param item: object exposing ``id``, ``url``, ``response``,
        ``remoteIpAddress``, ``protocol`` and ``resourceType``.
    :param concurrentTo: value for the ``WARC-Concurrent-To`` header.
    """
    response = item.response

    warcHeaders = {
        'WARC-Concurrent-To': concurrentTo,
        'X-Chrome-Request-ID': item.id,
        'WARC-Date': datetime_to_iso_date(response.timestamp),
    }
    # optional WARC headers
    if item.remoteIpAddress:
        warcHeaders['WARC-IP-Address'] = item.remoteIpAddress
    if item.protocol:
        warcHeaders['X-Chrome-Protocol'] = item.protocol

    # Fall back to the standard reason phrase when none was supplied.
    statusText = response.statusText
    if not statusText:
        statusText = BaseHTTPRequestHandler.responses.get(
            response.status, ('No status text available', ))[0]
    httpHeaders = StatusAndHeaders(f'{response.status} {statusText}',
                                   response.headers, protocol='HTTP/1.1')

    # The stored content is already decompressed and decoded, so these
    # headers no longer describe it.
    for name in ('transfer-encoding', 'content-encoding'):
        httpHeaders.remove_header(name)

    # Text bodies are UTF-8; the HTTP header takes precedence over the
    # document's <meta>, so overriding the header is sufficient.
    mimeType = response.mimeType
    if mimeType:
        charset = '; charset=utf-8' if isinstance(response.body, UnicodeBody) else ''
        httpHeaders.replace_header('Content-Type', mimeType + charset)

    payload = response.body
    if payload is None:
        warcHeaders['WARC-Truncated'] = 'unspecified'
    else:
        httpHeaders.replace_header('Content-Length', str(len(payload)))
        warcHeaders['X-Chrome-Base64Body'] = str(type(payload) is Base64Body)
        payload = BytesIO(payload)

    written = self.writeRecord(item.url, 'response',
                               warc_headers_dict=warcHeaders, payload=payload,
                               http_headers=httpHeaders)

    if item.resourceType == 'Document':
        recordId = written.rec_headers.get_header('WARC-Record-ID')
        self.documentRecords[item.url] = recordId
def write_request_response_pair(self, url, req_headers, resp_headers, is_truncated, payload, digest=None):
    """
    Write a WARC request/response record pair for ``url``.

    :param url: target URI of the response record.
    :param req_headers: request headers, converted with
        ``headers_to_str_headers``.
    :param resp_headers: response headers, converted with
        ``headers_to_str_headers``.
    :param is_truncated: falsy for a complete payload, otherwise a truncation
        reason. Reasons not in ``valid_truncations`` are logged and recorded
        as ``'unspecified'``.
    :param payload: response body; must support ``len()`` and ``BytesIO()``.
    :param digest: optional value for ``WARC-Payload-Digest``.
    """
    if self.writer is None:
        self.open()

    # XXX WARC-Identified-Payload-Type set from Apache Tika? (done by Common Crawl) (how expensive?)

    # NOTE(review): the request record uses a fixed placeholder URL and
    # request line rather than `url` -- confirm this is intended.
    req_http_headers = StatusAndHeaders(
        'GET / HTTP/1.1', headers_to_str_headers(req_headers))
    request = self.writer.create_warc_record(
        'http://example.com/', 'request', http_headers=req_http_headers)

    resp_http_headers = StatusAndHeaders(
        '200 OK', headers_to_str_headers(resp_headers), protocol='HTTP/1.1')

    warc_headers_dict = {}
    if digest is not None:
        warc_headers_dict['WARC-Payload-Digest'] = digest
    if is_truncated:
        if is_truncated in valid_truncations:
            warc_headers_dict['WARC-Truncated'] = is_truncated
        else:
            # Lazy %-formatting: a non-string truthy value (e.g. True) made
            # the former string concatenation raise TypeError here.
            LOGGER.error('Invalid is_truncation of %s', is_truncated)
            warc_headers_dict['WARC-Truncated'] = 'unspecified'

    response = self.writer.create_warc_record(
        url, 'response', payload=BytesIO(payload), length=len(payload),
        warc_headers_dict=warc_headers_dict, http_headers=resp_http_headers)

    self.writer.write_request_response_pair(request, response)
    self.maybe_close()
    LOGGER.debug('wrote warc request-response pair%s for url %s', p(self.prefix), url)
    stats.stats_sum('warc r/r' + p(self.prefix), 1)
def run(url, out_path, time_limit, agent, filetypes, warcfilename, wait):
    """
    Mirror ``url`` with wget into a WARC file, then re-write it gzipped.

    A ``wget --mirror`` shell command is assembled from the arguments and run
    through ``system_check``. The uncompressed ``<basename>.warc`` it leaves
    behind is then copied record by record into ``<basename>.warc.gz`` and
    finally removed with ``rm``.

    :param url: start URL handed to wget.
    :param out_path: download directory (wget ``-P``).
    :param time_limit: if truthy, the command is prefixed with ``timeout``.
    :param agent: optional value for ``--user-agent``.
    :param filetypes: optional value for wget ``-A``.
    :param warcfilename: output name; everything before ``.warc.gz`` is used
        as the base name. It is dereferenced before the ``is not None`` check
        below, so ``None`` fails with ``AttributeError``.
    :param wait: optional value for ``--wait``; concatenated as a string.
    """
    cmd = ""
    if time_limit:
        cmd += "timeout {} ".format(time_limit)
    waitoption = ""
    if wait is not None:
        waitoption = "--wait " + wait
    agentoption = ""
    if agent is not None:
        agentoption = "--user-agent \"" + agent + "\""
    filetypesoption = ""
    if filetypes is not None:
        filetypesoption = "-A \"" + filetypes + "\""
    warcoption = ""
    warcfilebasename = warcfilename[0:warcfilename.find(".warc.gz")]
    if warcfilename is not None:
        warcoption = "--warc-file \"" + warcfilebasename + "\""
    # presumably probes whether the installed wget knows the option -- verify
    # against check_wget_compression
    if check_wget_compression("wget --help | grep 'no-warc-compression'"):
        warcoption += " --no-warc-compression"
    # NOTE(review): arguments are interpolated into a shell command line
    # unescaped; callers must not pass untrusted values.
    cmd += "wget --mirror {WAIT} {FILETYPES} -q -o /dev/null {URL} -P {DOWNLOAD_PATH} {AGENT} {WARC}".format(
        WAIT=waitoption, FILETYPES=filetypesoption, URL=url, DOWNLOAD_PATH=out_path, AGENT=agentoption, WARC=warcoption)
    # print("cmd", cmd)
    try:
        system_check(cmd)
    except subprocess.CalledProcessError as grepexc:
        # A failing wget is only reported; whatever was written to the WARC
        # is still processed below.
        sys.stderr.write(
            "Warning: Some files could not be downloaded with wget\n")
    with open(warcfilebasename + ".warc", 'rb') as f_in:
        with open(warcfilebasename + ".warc.gz", 'wb') as f_out:
            writer = WARCWriter(f_out, gzip=True)
            try:
                for record in ArchiveIterator(f_in):
                    if record.http_headers:
                        # records with chunked transfer encoding are dropped
                        if record.http_headers.get_header(
                                'Transfer-Encoding') == "chunked":
                            continue
                        try:
                            record.http_headers.to_ascii_bytes()
                        except UnicodeEncodeError:
                            # if header is non ascii, create a new header, with status code only
                            # content length and content type will be filled before writing
                            # NOTE(review): nesting of the `record.length` reset
                            # was reconstructed -- confirm it belongs here.
                            record.http_headers = StatusAndHeaders(
                                record.http_headers.get_statuscode(), [])
                            record.length = None
                    writer.write_record(record)
            except:
                # best effort: any error ends the copy silently, keeping the
                # records written so far
                pass
    system_check("rm {WARC}".format(WARC=warcfilebasename + ".warc"))
def bin_stream(stream, content_type, status='200 OK', headers=None):
    """
    Wrap ``stream`` in a ``WbResponse`` with the given content type.

    :param stream: passed to ``WbResponse`` as ``value``.
    :param content_type: value of the ``Content-Type`` header.
    :param status: HTTP status line.
    :param headers: optional extra headers, appended after ``Content-Type``.
    :return: the constructed ``WbResponse``.
    """
    all_headers = [('Content-Type', content_type)]
    if headers:
        all_headers.extend(headers)
    return WbResponse(StatusAndHeaders(status, all_headers), value=stream)
def warc_from_response(response, resolved_url):
    """
    Serialize ``response`` and the request that produced it as gzipped WARC bytes.

    A ``response`` record is written first, followed by a ``request`` record
    whose ``WARC-Concurrent-To`` header refers to the response record.

    :param response: response object exposing ``headers.to_unicode_dict()``,
        ``body`` and ``request`` (with ``headers``, ``body``, ``method`` and
        ``url``) -- presumably a Scrapy response; confirm against the caller.
    :param resolved_url: target URI used for both records.
    :return: the WARC content as ``bytes``.
    """
    # Function-scope import keeps this block self-contained.
    from urllib.parse import urlsplit

    request = response.request

    with BytesIO() as f_output:
        writer = WARCWriter(f_output, gzip=True)

        # Response
        response_header_items = list(response.headers.to_unicode_dict().items())
        response_headers = StatusAndHeaders("200 OK", response_header_items, protocol="HTTP/1.0")
        response_record = writer.create_warc_record(
            resolved_url, "response",
            payload=BytesIO(response.body), http_headers=response_headers)
        writer.write_record(response_record)

        # Request: needs a request line ("METHOD target VERSION"); it used to
        # be written with the response status line "200 OK".
        split_url = urlsplit(request.url)
        request_target = split_url.path or "/"
        if split_url.query:
            request_target += "?" + split_url.query
        request_header_items = list(request.headers.to_unicode_dict().items())
        request_headers = StatusAndHeaders(
            "{} {} HTTP/1.0".format(request.method, request_target),
            request_header_items, is_http_request=True)
        request_record = writer.create_warc_record(
            resolved_url, "request",
            payload=BytesIO(request.body), http_headers=request_headers)
        request_record.rec_headers.add_header(
            "WARC-Concurrent-To",
            response_record.rec_headers.get_header("WARC-Record-ID"))
        writer.write_record(request_record)

        return f_output.getvalue()
def parse_response(self, url, response, ip=None):
    """
    Convert a HAR-style ``response`` mapping into a WARC ``response`` record.

    :param url: target URI of the record.
    :param response: mapping with a ``content`` mapping (``text``, optional
        ``encoding``) and optional ``headers`` (list of ``name``/``value``
        mappings), ``status`` and ``statusText``.
    :param ip: optional value for ``WARC-IP-Address``.
    :return: the record created by ``self.writer.create_warc_record``; it is
        not written here.
    """
    headers = []
    payload = BytesIO()
    content = response['content'].get('text', '')
    # 'headers' may be absent; iterating response['headers'] raised KeyError
    # in exactly the case the guard below reports.
    entry_headers = response.get('headers') or []

    if not content and not entry_headers:
        self.logger.info('No headers or payload for: {0}'.format(url))
        headers.append(('Content-Length', '0'))

    if response['content'].get('encoding') == 'base64':
        payload.write(base64.b64decode(content))
    else:
        payload.write(content.encode('utf-8'))

    length = payload.tell()
    payload.seek(0)

    # The payload is stored decoded, so the encoding headers are dropped.
    SKIP_HEADERS = ('content-encoding', 'transfer-encoding')

    #TODO: http2 detection -- write as same warc header?
    for header in entry_headers:
        if header['name'].lower() not in SKIP_HEADERS:
            headers.append((header['name'], header['value']))

    status = response.get('status') or 204
    reason = response.get('statusText')
    if not reason:
        reason = http_status_names.get(status, 'No Reason')
    status_line = str(status) + ' ' + reason

    proto = self._get_http_version(response)
    http_headers = StatusAndHeaders(status_line, headers, protocol=proto)

    if not content:
        # No body was captured: a non-zero declared length would be wrong.
        content_length = http_headers.get_header('Content-Length', '0')
        if content_length != '0':
            self.logger.info('No Content for length {0} {1}'.format(content_length, url))
            http_headers.replace_header('Content-Length', '0')
    else:
        http_headers.replace_header('Content-Length', str(length))

    warc_headers_dict = {}
    if ip:
        warc_headers_dict['WARC-IP-Address'] = ip

    record = self.writer.create_warc_record(
        url, 'response', http_headers=http_headers, payload=payload,
        length=length, warc_headers_dict=warc_headers_dict)
    return record
def do_rewrite(cls, statusline, headers):
    """
    Build a payload-less ``response`` record and return ``cls.get_rwinfo`` for it.

    :param statusline: HTTP status line of the record.
    :param headers: HTTP headers of the record.
    """
    response_headers = StatusAndHeaders(statusline, headers, protocol='HTTP/1.0')
    sample = BufferWARCWriter().create_warc_record(
        'http://example.com/', 'response', http_headers=response_headers)
    return cls.get_rwinfo(sample)
def _sample_request(self, writer):
    """
    Create (without writing) a sample ``request`` record for http://example.com/.

    :param writer: object providing ``create_warc_record``.
    :return: the created record.
    """
    request_headers = StatusAndHeaders(
        'GET / HTTP/1.0',
        [('User-Agent', 'foo'), ('Host', 'example.com')])
    return writer.create_warc_record(
        'http://example.com/', 'request', http_headers=request_headers)
def text_response(text, status='200 OK', content_type='text/plain; charset=utf-8'):
    """
    Build a ``WbResponse`` whose body is ``text`` encoded as UTF-8.

    :param text: body text.
    :param status: HTTP status line.
    :param content_type: value of the ``Content-Type`` header.
    :return: ``WbResponse`` with ``Content-Type`` and ``Content-Length`` set;
        the length is that of the encoded bytes.
    """
    body = text.encode('utf-8')
    header_pairs = [
        ('Content-Type', content_type),
        ('Content-Length', str(len(body))),
    ]
    return WbResponse(StatusAndHeaders(status, header_pairs), value=[body])
def default_http_headers(self, length, content_type=None):
    """
    Return ``200 OK`` / ``HTTP/1.0`` headers for a payload.

    :param length: payload length; ``Content-Length`` is added only when it
        is not ``None`` and not negative.
    :param content_type: ``Content-Type`` is added only when truthy.
    :return: ``StatusAndHeaders`` with zero, one or two headers.
    """
    headers = [('Content-Type', content_type)] if content_type else []
    if length is not None and length >= 0:
        headers.append(('Content-Length', str(length)))
    return StatusAndHeaders('200 OK', headers=headers, protocol='HTTP/1.0')
def _test_proxy_headers(http_cache=None):
    """
    Rewrite the headers from ``_make_cache_headers()`` and return the result.

    :param http_cache: value for the ``http_cache`` rewrite option.
    :return: ``status_headers`` of the object returned by
        ``headerrewriter.rewrite``.
    """
    cache_headers = _make_cache_headers()
    url_rewriter = UrlRewriter(
        '20131010/http://example.com/', '/pywb/',
        rewrite_opts={'http_cache': http_cache})
    original = StatusAndHeaders('200 OK', cache_headers)
    result = headerrewriter.rewrite(
        original, url_rewriter, url_rewriter.get_cookie_rewriter())
    return result.status_headers
def sample_request(builder):
    """
    Create a sample ``request`` record for http://example.com/.

    :param builder: object providing ``create_warc_record``.
    :return: the created record.
    """
    request_headers = StatusAndHeaders(
        'GET / HTTP/1.0',
        [('User-Agent', 'foo'), ('Host', 'example.com')],
        is_http_request=True)
    record = builder.create_warc_record(
        'http://example.com/', 'request', http_headers=request_headers)
    return record
def write_memento(self, murl=None):
    """
    This is function to write memento in WARC format.

    The WARC file is stored below the memento directory in a
    ``handle/domain/archive/wrep+lang`` hierarchy and named after the
    memento timestamp. Nothing is fetched when ``lookup_memento`` already
    reports the memento.

    Parameters:
        murl (str): URI-M

    Returns:
        (bool): True on Success and False on Failure. Failures are reported
        on stderr, never raised.
    """
    try:
        if self.lookup_memento(murl):
            return True

        response = Utils.get_murl_info(murl, self.__thandle)
        mpath = os.path.join(
            self.__memento_dir,
            response["handle"].lower(),
            response["domain"],
            response["archive"],
            response["wrep"] + response["lang"])
        # One call creates the whole hierarchy and tolerates directories
        # that already exist or appear concurrently.
        os.makedirs(mpath, exist_ok=True)

        try:
            mpath = os.path.join(mpath, str(response["timestamp"]) + self.__constants.WARC_EXT)
            with open(mpath, "wb") as output:
                writer = WARCWriter(output, gzip=True)
                resp = requests.get(murl,
                                    headers={'Accept-Encoding': 'identity'},
                                    stream=True, timeout=120)
                try:
                    # get raw headers from urllib3
                    headers_list = resp.raw.headers.items()
                    http_headers = StatusAndHeaders('200 OK', headers_list, protocol='HTTP/1.1')
                    # NOTE(review): the record's target URI is the local file
                    # path, not `murl` -- confirm this is intended.
                    record = writer.create_warc_record(mpath, 'response',
                                                       payload=resp.raw,
                                                       http_headers=http_headers)
                    writer.write_record(record)
                finally:
                    # With stream=True the connection is only released once
                    # the response is closed.
                    resp.close()
            return True
        except requests.exceptions.TooManyRedirects:
            sys.stderr.write(murl + "Too Many redirects" + "\n")
        except requests.exceptions.ConnectTimeout:
            sys.stderr.write(murl + "Connection Timeout" + "\n")
        except Exception as e:
            sys.stderr.write("Memento Write Error: " + str(e) + "URL:" + murl + "\n")
    except Exception as e:
        sys.stderr.write("Memento Write Error: " + murl + " " + str(e) + "\n")
    return False
def create_warcinfo_record(self, filename, info):
    """
    Create a ``warcinfo`` record from the ``info`` mapping.

    The payload has one ``name: value`` line (CRLF-terminated, UTF-8) per
    entry of ``info`` whose value is truthy.

    :param filename: stored as ``WARC-Filename`` when truthy.
    :param info: mapping of field names to values.
    :return: the record returned by ``self.create_warc_record``.
    """
    warc_headers = StatusAndHeaders(self.warc_version, [])

    fields = [
        ('WARC-Type', 'warcinfo'),
        ('WARC-Record-ID', self._make_warc_id()),
    ]
    if filename:
        fields.append(('WARC-Filename', filename))
    fields.append(('WARC-Date', self._make_warc_date()))
    for field_name, field_value in fields:
        warc_headers.add_header(field_name, field_value)

    body = b''.join(
        (name + ': ' + str(value) + '\r\n').encode('utf-8')
        for name, value in six.iteritems(info)
        if value)

    return self.create_warc_record('', 'warcinfo',
                                   warc_headers=warc_headers,
                                   payload=BytesIO(body),
                                   length=len(body))
def _create_response_record(self, url, headers, payload, warc_headers):
    """
    Create a ``200 OK`` ``response`` record with a text payload.

    :param url: target URI of the record.
    :param headers: HTTP headers of the response.
    :param payload: body as ``str``; encoded as UTF-8.
    :param warc_headers: extra WARC headers; a falsy value means none.
    :return: record created by a fresh ``BufferWARCWriter`` (not written).
    """
    buffer_writer = BufferWARCWriter()
    extra_headers = warc_headers if warc_headers else {}
    body = payload.encode('utf-8')
    response_headers = StatusAndHeaders('200 OK', headers, protocol='HTTP/1.0')
    return buffer_writer.create_warc_record(
        url, 'response',
        payload=BytesIO(body), length=len(body),
        http_headers=response_headers, warc_headers_dict=extra_headers)
def __call__(self):
    """
    Apply ``self.rewrite_header`` to every header of ``self.http_headers``.

    The rule for a header is looked up in ``self.header_rules`` by its
    lower-cased name. A falsy rewrite result drops the header, a list result
    contributes all of its elements and any other result contributes itself.

    :return: new ``StatusAndHeaders`` with the original status line and
        protocol and the rewritten headers.
    """
    source = self.http_headers
    rewritten = []
    for name, value in source.headers:
        result = self.rewrite_header(
            name, value, self.header_rules.get(name.lower()))
        if not result:
            continue
        if isinstance(result, list):
            rewritten += result
        else:
            rewritten.append(result)

    return StatusAndHeaders(source.statusline,
                            headers=rewritten,
                            protocol=source.protocol)
def options_response(env):
    """Construct WbResponse for OPTIONS based on the WSGI env dictionary

    :param dict env: The WSGI environment dictionary
    :return: The WBResponse for the options request
    :rtype: WbResponse
    """
    empty_text_headers = [
        ('Content-Type', 'text/plain'),
        ('Content-Length', '0'),
    ]
    response = WbResponse(StatusAndHeaders('200 Ok', empty_text_headers))
    response.add_access_control_headers(env=env)
    return response
def redir_response(location, status='302 Redirect', headers=None):
    """Utility method for constructing redirection response.

    :param str location: The location of the resource redirecting to
    :param str status: The HTTP status line
    :param list[tuple[str, str]] headers: Additional headers for this response
    :return: WbResponse redirection response
    :rtype: WbResponse
    """
    all_headers = [('Location', location), ('Content-Length', '0')]
    if headers:
        all_headers.extend(headers)
    status_headers = StatusAndHeaders(status, all_headers)
    return WbResponse(status_headers)
def test_resp_3():
    """
    ``redir_response`` yields an empty body and 302 headers with ``Location``.
    """
    location = 'http://example.com/otherfile'
    actual = vars(WbResponse.redir_response(location))

    wanted_headers = StatusAndHeaders(
        protocol='',
        statusline='302 Redirect',
        headers=[('Location', location), ('Content-Length', '0')])

    assert actual == {'body': [], 'status_headers': wanted_headers}
def load_resource(self, cdx, params):
    """
    Fetch ``cdx['load_url']`` upstream and describe the result as a WARC response.

    :param cdx: mapping read for ``load_url``, ``timestamp``, ``url`` and the
        optional ``is_live``, ``memento_url`` and ``set_referrer`` entries.
        ``cdx['timestamp']`` is overwritten when upstream sends
        ``Memento-Datetime``; ``cdx['source']`` is set for upstream responses
        marked ``Warcserver-Type: warc``.
    :param params: mapping providing ``_input_req`` (the incoming request)
        and, optionally, ``content_type``.
    :return: ``None`` when there is no ``load_url``, when the requested
        content type is ``VideoLoader.CONTENT_TYPE``, or when ``memento_url``
        is set but upstream sent no ``Memento-Datetime``.
        ``(None, upstream_res.headers, upstream_res)`` when upstream answered
        with ``Warcserver-Type: warc``. Otherwise
        ``(warc_headers, http_headers_buff, upstream_res)`` where
        ``warc_headers`` is a ``StatusAndHeaders`` and ``http_headers_buff``
        the serialized HTTP status line and headers.
    :raises LiveResourceException: when ``load_url`` cannot be prepared as a
        request URL.
    """
    load_url = cdx.get('load_url')
    if not load_url:
        return None

    if params.get('content_type') == VideoLoader.CONTENT_TYPE:
        return None

    if self.forward_proxy_prefix and not cdx.get('is_live'):
        load_url = self.forward_proxy_prefix + load_url

    input_req = params['_input_req']

    req_headers = input_req.get_req_headers()

    dt = timestamp_to_datetime(cdx['timestamp'])

    if cdx.get('memento_url'):
        req_headers['Accept-Datetime'] = datetime_to_http_date(dt)

    method = input_req.get_req_method()
    data = input_req.get_req_body()

    # PreparedRequest is used only to normalize the URL and to derive an
    # Authorization header from credentials embedded in it.
    p = PreparedRequest()
    try:
        p.prepare_url(load_url, None)
    except:
        raise LiveResourceException(load_url)
    p.prepare_headers(None)
    p.prepare_auth(None, load_url)

    auth = p.headers.get('Authorization')
    if auth:
        req_headers['Authorization'] = auth

    load_url = p.url

    # host is set to the actual host for live loading
    # ensure it is set to the load_url host
    if not cdx.get('is_live'):
        #req_headers.pop('Host', '')
        req_headers['Host'] = urlsplit(p.url).netloc

        referrer = cdx.get('set_referrer')
        if referrer:
            req_headers['Referer'] = referrer

    upstream_res = self._do_request_with_redir_check(method, load_url,
                                                     data, req_headers,
                                                     params, cdx)

    memento_dt = upstream_res.headers.get('Memento-Datetime')
    if memento_dt:
        dt = http_date_to_datetime(memento_dt)
        cdx['timestamp'] = datetime_to_timestamp(dt)
    elif cdx.get('memento_url'):
        # if 'memento_url' set and no Memento-Datetime header present
        # then its an error
        return None

    # Upstream already serves a WARC record: pass it through unchanged.
    agg_type = upstream_res.headers.get('Warcserver-Type')
    if agg_type == 'warc':
        cdx['source'] = unquote(upstream_res.headers.get('Warcserver-Source-Coll'))
        return None, upstream_res.headers, upstream_res

    if upstream_res.version == 11:
        version = '1.1'
    else:
        version = '1.0'

    status = 'HTTP/{version} {status} {reason}\r\n'
    status = status.format(version=version,
                           status=upstream_res.status,
                           reason=upstream_res.reason)

    http_headers_buff = status

    orig_resp = upstream_res._original_response

    # The Python 3 header layout is tried first; any failure (e.g. the
    # attribute missing on Python 2) falls through to the Python 2 layout.
    try:  #pragma: no cover
        #PY 3
        resp_headers = orig_resp.headers._headers
        for n, v in resp_headers:
            nl = n.lower()
            if nl in self.SKIP_HEADERS:
                continue

            if nl in self.UNREWRITE_HEADERS:
                v = self.unrewrite_header(cdx, v)

            http_headers_buff += n + ': ' + v + '\r\n'

        http_headers_buff += '\r\n'

        try:
            # http headers could be encoded as utf-8 (though non-standard)
            # first try utf-8 encoding
            http_headers_buff = http_headers_buff.encode('utf-8')
        except:
            # then, fall back to latin-1
            http_headers_buff = http_headers_buff.encode('latin-1')

    except:  #pragma: no cover
        #PY 2
        resp_headers = orig_resp.msg.headers

        for line in resp_headers:
            n, v = line.split(':', 1)
            n = n.lower()
            v = v.strip()

            if n in self.SKIP_HEADERS:
                continue

            new_v = v
            if n in self.UNREWRITE_HEADERS:
                new_v = self.unrewrite_header(cdx, v)

            # Unchanged headers are copied as the original raw line.
            if new_v != v:
                http_headers_buff += n + ': ' + new_v + '\r\n'
            else:
                http_headers_buff += line

        # if python2, already byte headers, so leave as is
        http_headers_buff += '\r\n'

    # Best effort: the peer address is read from private attributes of the
    # response and simply omitted when that fails.
    try:
        fp = upstream_res._fp.fp
        if hasattr(fp, 'raw'):  #pragma: no cover
            fp = fp.raw
        remote_ip = fp._sock.getpeername()[0]
    except:  #pragma: no cover
        remote_ip = None

    warc_headers = {}

    warc_headers['WARC-Type'] = 'response'
    warc_headers['WARC-Record-ID'] = self._make_warc_id()
    warc_headers['WARC-Target-URI'] = cdx['url']
    warc_headers['WARC-Date'] = datetime_to_iso_date(dt)

    if not cdx.get('is_live'):
        now = datetime.datetime.utcnow()
        warc_headers['WARC-Source-URI'] = cdx.get('load_url')
        warc_headers['WARC-Creation-Date'] = datetime_to_iso_date(now)

    if remote_ip:
        warc_headers['WARC-IP-Address'] = remote_ip

    ct = upstream_res.headers.get('Content-Type')
    if ct:
        metadata = self.get_custom_metadata(ct, dt)
        if metadata:
            warc_headers['WARC-JSON-Metadata'] = json.dumps(metadata)

    warc_headers['Content-Type'] = 'application/http; msgtype=response'

    # -1 is handed to _set_content_len when upstream gave no Content-Length.
    if method == 'HEAD':
        content_len = 0
    else:
        content_len = upstream_res.headers.get('Content-Length', -1)

    self._set_content_len(content_len,
                          warc_headers,
                          len(http_headers_buff))

    warc_headers = StatusAndHeaders('WARC/1.0', warc_headers.items())
    return (warc_headers, http_headers_buff, upstream_res)