def _writeResponse(self, item, concurrentTo):
    """
    Write the WARC ``response`` record for ``item``.

    :param item: request/response pair. This method reads its ``id``,
        ``url``, ``response``, ``remoteIpAddress``, ``protocol`` and
        ``resourceType`` attributes.
    :param concurrentTo: value stored in the record's
        ``WARC-Concurrent-To`` header.

    When the item's resource type is ``Document`` the ID of the written
    record is remembered in ``self.documentRecords`` under the item's URL.
    """
    resp = item.response
    body = resp.body

    warcHeaders = {
        'WARC-Concurrent-To': concurrentTo,
        'X-Chrome-Request-ID': item.id,
        'WARC-Date': datetime_to_iso_date(resp.timestamp),
    }
    # conditional WARC headers: only emitted when a value is available
    if item.remoteIpAddress:
        warcHeaders['WARC-IP-Address'] = item.remoteIpAddress
    if item.protocol:
        warcHeaders['X-Chrome-Protocol'] = item.protocol

    # HTTP status line; fall back to the stdlib reason phrase table when
    # the response carries no status text of its own
    statusText = resp.statusText or BaseHTTPRequestHandler.responses.get(
        resp.status, ('No status text available', ))[0]
    httpHeaders = StatusAndHeaders(f'{resp.status} {statusText}',
                                   resp.headers, protocol='HTTP/1.1')

    # Content is saved decompressed and decoded, remove these headers
    for name in ('transfer-encoding', 'content-encoding'):
        httpHeaders.remove_header(name)

    # chrome sends nothing but utf8 encoded text. Fortunately HTTP
    # headers take precedence over the document's <meta>, thus we can
    # easily override those.
    contentType = resp.mimeType
    if contentType:
        if isinstance(body, UnicodeBody):
            contentType += '; charset=utf-8'
        httpHeaders.replace_header('Content-Type', contentType)

    # response body
    if body is None:
        # no body available: mark the record as truncated and write it
        # without a payload
        warcHeaders['WARC-Truncated'] = 'unspecified'
        payload = None
    else:
        # the stored body length may differ from what the server announced
        httpHeaders.replace_header('Content-Length', str(len(body)))
        warcHeaders['X-Chrome-Base64Body'] = str(isinstance(body, Base64Body))
        payload = BytesIO(body)

    record = self.writeRecord(item.url, 'response',
                              warc_headers_dict=warcHeaders, payload=payload,
                              http_headers=httpHeaders)

    if item.resourceType == 'Document':
        self.documentRecords[item.url] = record.rec_headers.get_header(
            'WARC-Record-ID')
def _writeResponse(self, item, concurrentTo):
    """
    Write the WARC ``response`` record for ``item``.

    The body is not stored (and ``WARC-Truncated`` is set instead) when the
    item is a redirect, when ``item.encodedDataLength`` exceeds
    ``self.maxBodySize``, or when reading ``item.body`` raises
    ``ValueError``. In those cases the record is written with an empty
    payload.

    :param item: request/response pair. This method reads ``id``, ``url``,
        ``isRedirect``, ``encodedDataLength``, ``body``, ``response``,
        ``chromeRequest``, ``chromeResponse``, ``statusText``,
        ``responseHeaders`` and ``resourceType``.
    :param concurrentTo: value stored in the record's
        ``WARC-Concurrent-To`` header.

    When the item's resource type is ``Document`` the ID of the written
    record is remembered in ``self.documentRecords`` under the item's URL.
    """
    # fetch the body
    reqId = item.id
    rawBody = None
    base64Encoded = False
    bodyTruncated = None
    if item.isRedirect:
        # redirects reuse the same request, thus we cannot safely retrieve
        # the body (i.e getResponseBody may return the new location's
        # body).
        bodyTruncated = 'unspecified'
    elif item.encodedDataLength > self.maxBodySize:
        # check body size first, since we're loading everything into memory
        bodyTruncated = 'length'
        self.logger.error('body for {} too large {} vs {}'.format(
            reqId, item.encodedDataLength, self.maxBodySize))
    else:
        try:
            rawBody, base64Encoded = item.body
        except ValueError:
            # oops, don't know what went wrong here
            bodyTruncated = 'unspecified'

    # now the response
    resp = item.response

    # The record date is the request's wall time shifted by the difference
    # between the response and request timestamps.
    recordDate = datetime.utcfromtimestamp(
        item.chromeRequest['wallTime'] +
        (item.chromeResponse['timestamp'] - item.chromeRequest['timestamp']))

    warcHeaders = {'WARC-Concurrent-To': concurrentTo}
    # Only emit these headers when Chrome reported a value; an empty
    # WARC-IP-Address is not a valid address and carries no information.
    remoteIp = resp.get('remoteIPAddress')
    if remoteIp:
        warcHeaders['WARC-IP-Address'] = remoteIp
    protocol = resp.get('protocol')
    if protocol:
        warcHeaders['X-Chrome-Protocol'] = protocol
    warcHeaders.update({
        'X-Chrome-FromDiskCache': str(resp.get('fromDiskCache')),
        'X-Chrome-ConnectionReused': str(resp.get('connectionReused')),
        'X-Chrome-Request-ID': reqId,
        'WARC-Date': datetime_to_iso_date(recordDate),
    })
    if bodyTruncated:
        warcHeaders['WARC-Truncated'] = bodyTruncated
    else:
        warcHeaders['X-Chrome-Base64Body'] = str(base64Encoded)

    httpHeaders = StatusAndHeaders(f"{resp['status']} {item.statusText}",
                                   item.responseHeaders, protocol='HTTP/1.1')

    # Content is saved decompressed and decoded, remove these headers
    for name in ('transfer-encoding', 'content-encoding'):
        httpHeaders.remove_header(name)

    # chrome sends nothing but utf8 encoded text. Fortunately HTTP
    # headers take precedence over the document's <meta>, thus we can
    # easily override those.
    contentType = resp.get('mimeType')
    if contentType:
        if not base64Encoded:
            contentType += '; charset=utf-8'
        httpHeaders.replace_header('content-type', contentType)

    if rawBody is not None:
        # the stored body length may differ from what the server announced
        httpHeaders.replace_header('content-length', str(len(rawBody)))
        bodyIo = BytesIO(rawBody)
    else:
        bodyIo = BytesIO()

    record = self.writeRecord(resp['url'], 'response',
                              warc_headers_dict=warcHeaders, payload=bodyIo,
                              http_headers=httpHeaders)

    if item.resourceType == 'Document':
        self.documentRecords[item.url] = record.rec_headers.get_header(
            'WARC-Record-ID')