Esempio n. 1
0
    def _writeRequest(self, item):
        """
        Write a WARC ``request`` record for ``item``.

        The HTTP request line uses the relative form of ``item.url`` with
        the fragment removed. When the request reports post data but no
        body is available, the record is marked ``WARC-Truncated:
        unspecified`` and an error is logged.

        Returns the ``WARC-Record-ID`` header of the written record.
        """
        log = self.logger.bind(reqId=item.id)

        request = item.request
        targetUrl = item.url

        # request target: relative URL, fragment dropped
        requestPath = targetUrl.relative().with_fragment(None)
        requestLine = f'{request.method} {requestPath} HTTP/1.1'
        httpHeaders = StatusAndHeaders(requestLine,
                                       request.headers,
                                       protocol='HTTP/1.1',
                                       is_http_request=True)
        warcHeaders = {
            'X-Chrome-Initiator': json.dumps(request.initiator),
            'X-Chrome-Request-ID': item.id,
            'WARC-Date': datetime_to_iso_date(request.timestamp),
        }

        payload = request.body
        bodyMissing = request.hasPostData and payload is None
        if not bodyMissing:
            # record whether the body arrived base64 encoded
            isBase64 = type(payload) is Base64Body
            warcHeaders['X-Chrome-Base64Body'] = str(isBase64)
            payload = BytesIO(payload)
        else:
            # post data was announced but never delivered
            log.error('requestBody missing',
                      uuid='ee9adc58-e723-4595-9feb-312a67ead6a0')
            warcHeaders['WARC-Truncated'] = 'unspecified'

        written = self.writeRecord(targetUrl,
                                   'request',
                                   payload=payload,
                                   http_headers=httpHeaders,
                                   warc_headers_dict=warcHeaders)
        return written.rec_headers['WARC-Record-ID']
Esempio n. 2
0
    def load_resource(self, cdx, params):
        """
        Build an in-memory WARC ``metadata`` record from extractor info.

        Returns ``None`` when ``cdx`` has no ``load_url``, when the
        requested content type is not ``self.CONTENT_TYPE``, or when no
        ``self.ydl`` extractor is configured. Otherwise returns the tuple
        ``(warc_headers, None, payload_stream)``.
        """
        source_url = cdx.get('load_url')
        if (not source_url
                or params.get('content_type') != self.CONTENT_TYPE
                or not self.ydl):
            return None

        # JSON-serialize whatever the extractor reports for this url
        payload = json.dumps(self.ydl.extract_info(source_url)).encode('utf-8')

        # swap the original scheme for 'metadata://'
        _scheme, remainder = source_url.split('://', 1)
        metadata_url = 'metadata://' + remainder

        capture_dt = timestamp_to_datetime(cdx['timestamp'])

        header_fields = {
            'WARC-Type': 'metadata',
            'WARC-Record-ID': self._make_warc_id(),
            'WARC-Target-URI': metadata_url,
            'WARC-Date': datetime_to_iso_date(capture_dt),
            'Content-Type': self.CONTENT_TYPE,
            'Content-Length': str(len(payload)),
        }

        record_headers = StatusAndHeaders('WARC/1.0', header_fields.items())
        return record_headers, None, BytesIO(payload)
Esempio n. 3
0
    def load_resource(self, cdx, params):
        """
        Build an in-memory WARC ``metadata`` record from extractor info.

        Returns ``None`` when ``cdx`` has no ``load_url``, when the
        requested content type is not ``self.CONTENT_TYPE``, or when no
        ``self.ydl`` extractor is configured. Otherwise returns the tuple
        ``(warc_headers, None, payload_stream)``.
        """
        source_url = cdx.get('load_url')
        if (not source_url
                or params.get('content_type') != self.CONTENT_TYPE
                or not self.ydl):
            return None

        # JSON-serialize whatever the extractor reports for this url
        payload = json.dumps(self.ydl.extract_info(source_url)).encode('utf-8')

        # swap the original scheme for 'metadata://'
        _scheme, remainder = source_url.split('://', 1)
        metadata_url = 'metadata://' + remainder

        capture_dt = timestamp_to_datetime(cdx['timestamp'])

        header_fields = {
            'WARC-Type': 'metadata',
            'WARC-Record-ID': self._make_warc_id(),
            'WARC-Target-URI': metadata_url,
            'WARC-Date': datetime_to_iso_date(capture_dt),
            'Content-Type': self.CONTENT_TYPE,
            'Content-Length': str(len(payload)),
        }

        record_headers = StatusAndHeaders('WARC/1.0', header_fields.items())
        return record_headers, None, BytesIO(payload)
Esempio n. 4
0
    def _writeResponse(self, item, concurrentTo):
        """
        Write a WARC ``response`` record for ``item``.

        ``concurrentTo`` is stored as ``WARC-Concurrent-To``. A missing
        body is flagged with ``WARC-Truncated: unspecified``. For items
        whose ``resourceType`` is ``'Document'`` the new record id is
        remembered in ``self.documentRecords`` under the item url.
        """
        requestId = item.id
        response = item.response

        warcHeaders = {
            'WARC-Concurrent-To': concurrentTo,
            'X-Chrome-Request-ID': requestId,
            'WARC-Date': datetime_to_iso_date(response.timestamp),
        }
        # optional WARC headers, only emitted when a value is present
        optionalHeaders = (
            ('WARC-IP-Address', item.remoteIpAddress),
            ('X-Chrome-Protocol', item.protocol),
        )
        for headerName, headerValue in optionalHeaders:
            if headerValue:
                warcHeaders[headerName] = headerValue

        # HTTP status line; fall back to the stdlib reason phrase table
        statusText = response.statusText
        if not statusText:
            fallback = ('No status text available', )
            statusText = BaseHTTPRequestHandler.responses.get(
                response.status, fallback)[0]
        httpHeaders = StatusAndHeaders(f'{response.status} {statusText}',
                                       response.headers,
                                       protocol='HTTP/1.1')

        # the stored body is already decompressed and decoded, so these
        # headers no longer describe it
        for staleHeader in ('transfer-encoding', 'content-encoding'):
            httpHeaders.remove_header(staleHeader)

        # text bodies are stored as utf-8; the HTTP header overrides any
        # charset the document itself declares
        mimeType = response.mimeType
        if mimeType:
            if isinstance(response.body, UnicodeBody):
                mimeType = mimeType + '; charset=utf-8'
            httpHeaders.replace_header('Content-Type', mimeType)

        payload = response.body
        if payload is not None:
            httpHeaders.replace_header('Content-Length', str(len(payload)))
            isBase64 = type(payload) is Base64Body
            warcHeaders['X-Chrome-Base64Body'] = str(isBase64)
            payload = BytesIO(payload)
        else:
            warcHeaders['WARC-Truncated'] = 'unspecified'

        written = self.writeRecord(item.url,
                                   'response',
                                   warc_headers_dict=warcHeaders,
                                   payload=payload,
                                   http_headers=httpHeaders)

        if item.resourceType == 'Document':
            recordId = written.rec_headers.get_header('WARC-Record-ID')
            self.documentRecords[item.url] = recordId
Esempio n. 5
0
    def _writeRequest(self, item):
        """
        Write a WARC ``request`` record for ``item``.

        The request line is built from the path and query of the response
        url. If the request body cannot be unpacked, the record is
        flagged ``WARC-Truncated: unspecified`` and written without
        payload.

        Returns the ``WARC-Record-ID`` header of the written record.
        """
        log = self.logger.bind(reqId=item.id)

        request = item.request
        response = item.response
        parsedUrl = urlsplit(response['url'])

        # request target: path plus optional query string
        requestTarget = parsedUrl.path
        if parsedUrl.query:
            requestTarget = requestTarget + '?' + parsedUrl.query
        requestLine = '{} {} HTTP/1.1'.format(request['method'], requestTarget)
        httpHeaders = StatusAndHeaders(requestLine,
                                       item.requestHeaders,
                                       protocol='HTTP/1.1',
                                       is_http_request=True)

        initiator = item.initiator
        warcHeaders = {
            'X-Chrome-Initiator': json.dumps(initiator),
            'X-Chrome-Request-ID': item.id,
            'WARC-Date': datetime_to_iso_date(
                datetime.utcfromtimestamp(item.chromeRequest['wallTime'])),
        }

        truncatedReason = None
        try:
            payload, isBase64 = item.requestBody
        except ValueError:
            # body could not be unpacked; cause unknown
            truncatedReason = 'unspecified'
            log.error('requestBody missing',
                      uuid='ee9adc58-e723-4595-9feb-312a67ead6a0')

        if truncatedReason:
            warcHeaders['WARC-Truncated'] = truncatedReason
            payload = None

        # only a non-empty body is wrapped and annotated
        if payload:
            payload = BytesIO(payload)
            warcHeaders['X-Chrome-Base64Body'] = str(isBase64)

        written = self.writeRecord(request['url'],
                                   'request',
                                   payload=payload,
                                   http_headers=httpHeaders,
                                   warc_headers_dict=warcHeaders)
        return written.rec_headers['WARC-Record-ID']
Esempio n. 6
0
    def write_warc_info(self, message):
        """
        Create and write the ``warcinfo`` record for an MHTML ``message``.

        Creator, url and title come from the message's ``From``,
        ``Snapshot-Content-Location`` and ``Subject`` headers. If the
        ``Date`` header can be parsed, the record's ``WARC-Date`` is set
        to it and the original value is kept as ``WARC-Creation-Date``.

        Returns the snapshot date as an ISO date string, or ``''`` when
        the ``Date`` header could not be parsed.
        """
        creator = message.get('From', '')
        url = message.get('Snapshot-Content-Location', '')
        title = message.get('Subject', url)

        # best effort: any failure while parsing the date leaves both empty
        try:
            snapshot_dt = http_date_to_datetime(message['Date'])
            timestamp = datetime_to_timestamp(snapshot_dt)
        except Exception:
            snapshot_dt, timestamp = '', ''

        source = 'MHTML Snapshot for: ' + url
        software = 'mhtml2warc ' + str(__version__)

        page_entry = {'title': title,
                      'url': url,
                      'timestamp': timestamp}
        metadata = {'title': source,
                    'type': 'recording',
                    'pages': [page_entry]}

        params = OrderedDict()
        params['software'] = software
        params['creator'] = creator
        params['source'] = source
        params['format'] = 'WARC File Format 1.0'
        params['subject'] = title
        params['json-metadata'] = json.dumps(metadata)

        record = self.writer.create_warcinfo_record(self.filename, params)

        result_date = snapshot_dt
        if snapshot_dt:
            result_date = datetime_to_iso_date(snapshot_dt)

            # keep the writer-generated date as the creation date and
            # report the snapshot date as the record date
            headers = record.rec_headers
            creation_date = headers.get('WARC-Date')
            headers.replace_header('WARC-Date', result_date)
            headers.replace_header('WARC-Creation-Date', creation_date)

        self.writer.write_record(record)

        return result_date
Esempio n. 7
0
    def make_record(self, writer, file_info):
        """
        Write a WARC ``resource`` record for the file described by
        ``file_info`` using ``writer``.

        The record date is ``self.fixed_dt`` when set, otherwise the
        file's modification time. If the url ends with one of
        ``self.index_files``, ``add_index_revisit`` is called as well.
        """
        warc_date = self.fixed_dt or datetime_to_iso_date(file_info.modified_dt)

        url = file_info.url
        local_path = file_info.full_filename
        source_uri = 'file://' + local_path

        # NOTE(review): other WARC-writing code spells this header
        # 'WARC-Creation-Date' -- confirm which name consumers expect.
        warc_headers = {
            'WARC-Date': warc_date,
            'WARC-Source-URI': source_uri,
            'WARC-Created-Date': writer._make_warc_date()
        }

        mime_type = self._guess_type(file_info)
        warc_content_type = mime_type + self._guess_charset(mime_type,
                                                            file_info)

        with file_info.open() as stream:
            record = writer.create_warc_record(
                url,
                'resource',
                payload=stream,
                length=file_info.size,
                warc_content_type=warc_content_type,
                warc_headers_dict=warc_headers)

            self.count += 1
            writer.write_record(record)

            message = 'Writing "{0}" ({1}) @ "{2}" from "{3}"'.format(
                url, warc_content_type, warc_date, file_info.full_filename)
            self.logger.debug(message)

        is_index = url.lower().endswith(self.index_files)
        if is_index:
            self.add_index_revisit(writer, record, url, warc_date, source_uri)
Esempio n. 8
0
    def make_record(self,
                    writer,
                    file_info,
                    record_type='resource',
                    extra_headers=None):
        """
        Write one WARC record for ``file_info`` unless it is filtered out.

        Url and timestamp may be overridden by a mapfile entry; otherwise
        the file's own url and ``self.fixed_dt`` / modification time are
        used. ``extra_headers``, when given, are merged into the WARC
        headers. A summary of the record is passed to ``write_logfile``.

        Returns ``False`` when the file is rejected by the include/exclude
        patterns, otherwise the tuple ``(url, record)``.
        """
        filename = file_info.full_filename

        # include/exclude rules: with both lists configured a file is
        # dropped only if it misses the include list and hits the exclude
        # list
        if self.include and self.exclude:
            if (not self.fnmatch_list(filename, self.include)
                    and self.fnmatch_list(filename, self.exclude)):
                return False
        elif self.include:
            if not self.fnmatch_list(filename, self.include):
                return False
        elif self.exclude:
            if self.fnmatch_list(filename, self.exclude):
                return False

        # optional helpers feeding type and encoding detection
        if self.use_tika:
            file_info.tika_results = self.tika_parser.from_file(filename)

        if self.use_mapfile:
            file_info.mapfile_results = self._match_mapfile(filename)

        mime_type = self._guess_type(file_info)
        encoding = self._guess_charset(mime_type, file_info)
        warc_content_type = mime_type + encoding

        # mapfile entry for this file; only consulted when mapfiles are on
        mapped = file_info.mapfile_results if self.use_mapfile else None

        # target url
        if mapped and 'URL' in mapped:
            url = mapped['URL']
        else:
            url = file_info.url

        # record timestamp
        if mapped and 'timestamp' in mapped:
            warc_date = self._set_fixed_dt(mapped['timestamp'])
        elif self.fixed_dt:
            warc_date = self.fixed_dt
        else:
            warc_date = datetime_to_iso_date(file_info.modified_dt)

        # where the payload came from on local disk
        source_uri = 'file://' + file_info.full_filename

        warc_headers = {
            'WARC-Date': warc_date,
            'WARC-Source-URI': source_uri,
            'WARC-Creation-Date': writer._make_warc_date()
        }
        if extra_headers:
            warc_headers.update(extra_headers)

        with file_info.open() as stream:
            record = writer.create_warc_record(
                url,
                record_type,
                payload=stream,
                length=file_info.size,
                warc_content_type=warc_content_type,
                warc_headers_dict=warc_headers)

            self.count += 1
            writer.write_record(record)

            message = 'Writing "{0}" ({1}) @ "{2}" from "{3}"'.format(
                url, warc_content_type, warc_date, file_info.full_filename)
            self.logger.debug(message)

        log_entry = {
            'file': file_info.full_filename,
            'Record-Type': record_type,
            'URL': url,
            'timestamp': warc_date,
            'Content-Type': warc_content_type,
            'mime': mime_type,
            # drop the 10-character '; charset=' prefix
            'charset': encoding[10:]
        }
        self.write_logfile(log_entry)

        return url, record
Esempio n. 9
0
 def _make_warc_date(cls):
     """Return the current UTC time formatted by ``datetime_to_iso_date``."""
     now_utc = datetime.datetime.utcnow()
     return datetime_to_iso_date(now_utc)
Esempio n. 10
0
 def _make_warc_date(cls, use_micros=False):
     """
     Return the current UTC time formatted by ``datetime_to_iso_date``.

     ``use_micros`` is forwarded unchanged to ``datetime_to_iso_date``.
     """
     now_utc = datetime.datetime.utcnow()
     return datetime_to_iso_date(now_utc, use_micros=use_micros)
Esempio n. 11
0
    def load_resource(self, cdx, params):
        """
        Fetch ``cdx['load_url']`` upstream and wrap the reply as a WARC
        ``response`` record.

        Returns ``None`` when ``cdx`` has no ``load_url``, when the requested
        content type is ``VideoLoader.CONTENT_TYPE``, or when
        ``cdx['memento_url']`` is set but the upstream reply has no
        ``Memento-Datetime`` header.

        If the upstream reply carries ``Warcserver-Type: warc`` the result is
        ``(None, upstream_res.headers, upstream_res)``. Otherwise the result
        is ``(warc_headers, http_headers_buff, upstream_res)`` where
        ``warc_headers`` is a ``StatusAndHeaders`` and ``http_headers_buff``
        holds the serialized HTTP status line and headers.

        Mutates ``cdx``: ``cdx['timestamp']`` is overwritten from
        ``Memento-Datetime`` when present and ``cdx['source']`` is set in the
        warc pass-through case. The header mapping returned by
        ``input_req.get_req_headers()`` is modified in place as well.

        Raises ``LiveResourceException`` when the url cannot be prepared.
        """
        load_url = cdx.get('load_url')
        if not load_url:
            return None

        # video info requests are not handled by this loader
        if params.get('content_type') == VideoLoader.CONTENT_TYPE:
            return None

        if self.forward_proxy_prefix and not cdx.get('is_live'):
            load_url = self.forward_proxy_prefix + load_url

        input_req = params['_input_req']

        req_headers = input_req.get_req_headers()

        dt = timestamp_to_datetime(cdx['timestamp'])

        if cdx.get('memento_url'):
            req_headers['Accept-Datetime'] = datetime_to_http_date(dt)

        method = input_req.get_req_method()
        data = input_req.get_req_body()

        # use a PreparedRequest only to normalize the url and to derive an
        # Authorization header from it
        p = PreparedRequest()
        try:
            p.prepare_url(load_url, None)
        except:
            raise LiveResourceException(load_url)
        p.prepare_headers(None)
        p.prepare_auth(None, load_url)

        auth = p.headers.get('Authorization')
        if auth:
            req_headers['Authorization'] = auth

        load_url = p.url

        # host is set to the actual host for live loading
        # ensure it is set to the load_url host
        if not cdx.get('is_live'):
            #req_headers.pop('Host', '')
            req_headers['Host'] = urlsplit(p.url).netloc

            referrer = cdx.get('set_referrer')
            if referrer:
                req_headers['Referer'] = referrer

        upstream_res = self._do_request_with_redir_check(
            method, load_url, data, req_headers, params, cdx)

        memento_dt = upstream_res.headers.get('Memento-Datetime')
        if memento_dt:
            # the upstream datetime replaces the requested one
            dt = http_date_to_datetime(memento_dt)
            cdx['timestamp'] = datetime_to_timestamp(dt)
        elif cdx.get('memento_url'):
            # if 'memento_url' set and no Memento-Datetime header present
            # then its an error
            return None

        agg_type = upstream_res.headers.get('Warcserver-Type')
        if agg_type == 'warc':
            # upstream reply is passed through without building new headers
            cdx['source'] = unquote(
                upstream_res.headers.get('Warcserver-Source-Coll'))
            return None, upstream_res.headers, upstream_res

        if upstream_res.version == 11:
            version = '1.1'
        else:
            version = '1.0'

        status = 'HTTP/{version} {status} {reason}\r\n'
        status = status.format(version=version,
                               status=upstream_res.status,
                               reason=upstream_res.reason)

        http_headers_buff = status

        orig_resp = upstream_res._original_response

        # NOTE(review): the outer bare except below also catches errors raised
        # while processing headers in the PY 3 branch (e.g. from
        # unrewrite_header), not only a missing attribute, and then runs the
        # PY 2 branch.
        try:  #pragma: no cover
            #PY 3
            resp_headers = orig_resp.headers._headers
            for n, v in resp_headers:
                nl = n.lower()
                if nl in self.SKIP_HEADERS:
                    continue

                if nl in self.UNREWRITE_HEADERS:
                    v = self.unrewrite_header(cdx, v)

                http_headers_buff += n + ': ' + v + '\r\n'

            # blank line terminates the header section
            http_headers_buff += '\r\n'

            try:
                # http headers could be encoded as utf-8 (though non-standard)
                # first try utf-8 encoding
                http_headers_buff = http_headers_buff.encode('utf-8')
            except:
                # then, fall back to latin-1
                http_headers_buff = http_headers_buff.encode('latin-1')

        except:  #pragma: no cover
            #PY 2
            resp_headers = orig_resp.msg.headers

            for line in resp_headers:
                n, v = line.split(':', 1)
                n = n.lower()
                v = v.strip()

                if n in self.SKIP_HEADERS:
                    continue

                new_v = v
                if n in self.UNREWRITE_HEADERS:
                    new_v = self.unrewrite_header(cdx, v)

                # only rebuild the line when the value changed, otherwise
                # append the original line as is
                if new_v != v:
                    http_headers_buff += n + ': ' + new_v + '\r\n'
                else:
                    http_headers_buff += line

            # if python2, already byte headers, so leave as is
            http_headers_buff += '\r\n'

        # best effort: read the peer address from the underlying socket;
        # any failure leaves remote_ip as None
        try:
            fp = upstream_res._fp.fp
            if hasattr(fp, 'raw'):  #pragma: no cover
                fp = fp.raw
            remote_ip = fp._sock.getpeername()[0]
        except:  #pragma: no cover
            remote_ip = None

        warc_headers = {}

        warc_headers['WARC-Type'] = 'response'
        warc_headers['WARC-Record-ID'] = self._make_warc_id()
        warc_headers['WARC-Target-URI'] = cdx['url']
        warc_headers['WARC-Date'] = datetime_to_iso_date(dt)

        # non-live results also record where and when they were fetched
        if not cdx.get('is_live'):
            now = datetime.datetime.utcnow()
            warc_headers['WARC-Source-URI'] = cdx.get('load_url')
            warc_headers['WARC-Creation-Date'] = datetime_to_iso_date(now)

        if remote_ip:
            warc_headers['WARC-IP-Address'] = remote_ip

        ct = upstream_res.headers.get('Content-Type')
        if ct:
            metadata = self.get_custom_metadata(ct, dt)
            if metadata:
                warc_headers['WARC-JSON-Metadata'] = json.dumps(metadata)

        warc_headers['Content-Type'] = 'application/http; msgtype=response'

        # HEAD is treated as a zero-length body; otherwise use the upstream
        # Content-Length, or -1 when the header is absent
        if method == 'HEAD':
            content_len = 0
        else:
            content_len = upstream_res.headers.get('Content-Length', -1)

        self._set_content_len(content_len, warc_headers,
                              len(http_headers_buff))

        warc_headers = StatusAndHeaders('WARC/1.0', warc_headers.items())
        return (warc_headers, http_headers_buff, upstream_res)
Esempio n. 12
0
 def __call__(self, cdx, params):
     """
     Return ``('revisit', url, iso_date)`` for ``cdx``.

     ``iso_date`` is ``cdx['timestamp']`` converted with
     ``timestamp_to_datetime`` and ``datetime_to_iso_date``. ``params``
     is not used.
     """
     capture_dt = timestamp_to_datetime(cdx['timestamp'])
     target_url = cdx['url']
     iso_date = datetime_to_iso_date(capture_dt)
     return ('revisit', target_url, iso_date)
Esempio n. 13
0
 def __call__(self, cdx, params):
     """
     Return ``('revisit', url, iso_date)`` for ``cdx``.

     ``iso_date`` is ``cdx['timestamp']`` converted with
     ``timestamp_to_datetime`` and ``datetime_to_iso_date``. ``params``
     is not used.
     """
     capture_dt = timestamp_to_datetime(cdx['timestamp'])
     target_url = cdx['url']
     iso_date = datetime_to_iso_date(capture_dt)
     return ('revisit', target_url, iso_date)
Esempio n. 14
0
    def load_resource(self, cdx, params):
        """
        Fetch ``cdx['load_url']`` upstream and wrap the reply as a WARC
        ``response`` record.

        Returns ``None`` when ``cdx`` has no ``load_url``, when the requested
        content type is ``VideoLoader.CONTENT_TYPE``, or when
        ``cdx['memento_url']`` is set but the upstream reply has no
        ``Memento-Datetime`` header.

        If the upstream reply carries ``Warcserver-Type: warc`` the result is
        ``(None, upstream_res.headers, upstream_res)``. Otherwise the result
        is ``(warc_headers, http_headers_buff, upstream_res)`` where
        ``warc_headers`` is a ``StatusAndHeaders`` and ``http_headers_buff``
        holds the serialized HTTP status line and headers.

        Mutates ``cdx``: ``cdx['timestamp']`` is overwritten from
        ``Memento-Datetime`` when present and ``cdx['source']`` is set in the
        warc pass-through case. The header mapping returned by
        ``input_req.get_req_headers()`` is modified in place as well.

        Raises ``LiveResourceException`` when the url cannot be prepared.
        """
        load_url = cdx.get('load_url')
        if not load_url:
            return None

        # video info requests are not handled by this loader
        if params.get('content_type') == VideoLoader.CONTENT_TYPE:
            return None

        if self.forward_proxy_prefix and not cdx.get('is_live'):
            load_url = self.forward_proxy_prefix + load_url

        input_req = params['_input_req']

        req_headers = input_req.get_req_headers()

        dt = timestamp_to_datetime(cdx['timestamp'])

        if cdx.get('memento_url'):
            req_headers['Accept-Datetime'] = datetime_to_http_date(dt)

        method = input_req.get_req_method()
        data = input_req.get_req_body()

        # use a PreparedRequest only to normalize the url and to derive an
        # Authorization header from it
        p = PreparedRequest()
        try:
            p.prepare_url(load_url, None)
        except:
            raise LiveResourceException(load_url)
        p.prepare_headers(None)
        p.prepare_auth(None, load_url)

        auth = p.headers.get('Authorization')
        if auth:
            req_headers['Authorization'] = auth

        load_url = p.url

        # host is set to the actual host for live loading
        # ensure it is set to the load_url host
        if not cdx.get('is_live'):
            #req_headers.pop('Host', '')
            req_headers['Host'] = urlsplit(p.url).netloc

            referrer = cdx.get('set_referrer')
            if referrer:
                req_headers['Referer'] = referrer

        upstream_res = self._do_request_with_redir_check(method, load_url,
                                                         data, req_headers,
                                                         params, cdx)

        memento_dt = upstream_res.headers.get('Memento-Datetime')
        if memento_dt:
            # the upstream datetime replaces the requested one
            dt = http_date_to_datetime(memento_dt)
            cdx['timestamp'] = datetime_to_timestamp(dt)
        elif cdx.get('memento_url'):
            # if 'memento_url' set and no Memento-Datetime header present
            # then its an error
            return None

        agg_type = upstream_res.headers.get('Warcserver-Type')
        if agg_type == 'warc':
            # upstream reply is passed through without building new headers
            cdx['source'] = unquote(upstream_res.headers.get('Warcserver-Source-Coll'))
            return None, upstream_res.headers, upstream_res

        if upstream_res.version == 11:
            version = '1.1'
        else:
            version = '1.0'

        status = 'HTTP/{version} {status} {reason}\r\n'
        status = status.format(version=version,
                               status=upstream_res.status,
                               reason=upstream_res.reason)

        http_headers_buff = status

        orig_resp = upstream_res._original_response

        # NOTE(review): the outer bare except below also catches errors raised
        # while processing headers in the PY 3 branch (e.g. from
        # unrewrite_header), not only a missing attribute, and then runs the
        # PY 2 branch.
        try:  #pragma: no cover
            #PY 3
            resp_headers = orig_resp.headers._headers
            for n, v in resp_headers:
                nl = n.lower()
                if nl in self.SKIP_HEADERS:
                    continue

                if nl in self.UNREWRITE_HEADERS:
                    v = self.unrewrite_header(cdx, v)

                http_headers_buff += n + ': ' + v + '\r\n'

            # blank line terminates the header section
            http_headers_buff += '\r\n'

            try:
                # http headers could be encoded as utf-8 (though non-standard)
                # first try utf-8 encoding
                http_headers_buff = http_headers_buff.encode('utf-8')
            except:
                # then, fall back to latin-1
                http_headers_buff = http_headers_buff.encode('latin-1')

        except:  #pragma: no cover
            #PY 2
            resp_headers = orig_resp.msg.headers

            for line in resp_headers:
                n, v = line.split(':', 1)
                n = n.lower()
                v = v.strip()

                if n in self.SKIP_HEADERS:
                    continue

                new_v = v
                if n in self.UNREWRITE_HEADERS:
                    new_v = self.unrewrite_header(cdx, v)

                # only rebuild the line when the value changed, otherwise
                # append the original line as is
                if new_v != v:
                    http_headers_buff += n + ': ' + new_v + '\r\n'
                else:
                    http_headers_buff += line

            # if python2, already byte headers, so leave as is
            http_headers_buff += '\r\n'

        # best effort: read the peer address from the underlying socket;
        # any failure leaves remote_ip as None
        try:
            fp = upstream_res._fp.fp
            if hasattr(fp, 'raw'):  #pragma: no cover
                fp = fp.raw
            remote_ip = fp._sock.getpeername()[0]
        except:  #pragma: no cover
            remote_ip = None

        warc_headers = {}

        warc_headers['WARC-Type'] = 'response'
        warc_headers['WARC-Record-ID'] = self._make_warc_id()
        warc_headers['WARC-Target-URI'] = cdx['url']
        warc_headers['WARC-Date'] = datetime_to_iso_date(dt)

        # non-live results also record where and when they were fetched
        if not cdx.get('is_live'):
            now = datetime.datetime.utcnow()
            warc_headers['WARC-Source-URI'] = cdx.get('load_url')
            warc_headers['WARC-Creation-Date'] = datetime_to_iso_date(now)

        if remote_ip:
            warc_headers['WARC-IP-Address'] = remote_ip

        ct = upstream_res.headers.get('Content-Type')
        if ct:
            metadata = self.get_custom_metadata(ct, dt)
            if metadata:
                warc_headers['WARC-JSON-Metadata'] = json.dumps(metadata)

        warc_headers['Content-Type'] = 'application/http; msgtype=response'

        # HEAD is treated as a zero-length body; otherwise use the upstream
        # Content-Length, or -1 when the header is absent
        if method == 'HEAD':
            content_len = 0
        else:
            content_len = upstream_res.headers.get('Content-Length', -1)

        self._set_content_len(content_len,
                              warc_headers,
                              len(http_headers_buff))

        warc_headers = StatusAndHeaders('WARC/1.0', warc_headers.items())
        return (warc_headers, http_headers_buff, upstream_res)
Esempio n. 15
0
    def _writeResponse(self, item, concurrentTo):
        """
        Write a WARC ``response`` record for ``item``.

        The body is omitted and the record flagged with ``WARC-Truncated``
        for redirects, for bodies larger than ``self.maxBodySize`` and
        when the body cannot be unpacked. ``concurrentTo`` is stored as
        ``WARC-Concurrent-To``. For items whose ``resourceType`` is
        ``'Document'`` the new record id is remembered in
        ``self.documentRecords`` under the item url.
        """
        requestId = item.id
        payload = None
        isBase64 = False
        truncatedReason = None

        if item.isRedirect:
            # redirects reuse the same request, so the body that could be
            # fetched may already belong to the new location
            truncatedReason = 'unspecified'
        elif item.encodedDataLength > self.maxBodySize:
            # size is checked first because the whole body is held in memory
            truncatedReason = 'length'
            self.logger.error('body for {} too large {} vs {}'.format(
                requestId, item.encodedDataLength, self.maxBodySize))
        else:
            try:
                payload, isBase64 = item.body
            except ValueError:
                # body could not be unpacked; cause unknown
                truncatedReason = 'unspecified'

        response = item.response

        # headers are inserted one by one to keep their order explicit
        warcHeaders = {}
        warcHeaders['WARC-Concurrent-To'] = concurrentTo
        warcHeaders['WARC-IP-Address'] = response.get('remoteIPAddress', '')
        warcHeaders['X-Chrome-Protocol'] = response.get('protocol', '')
        warcHeaders['X-Chrome-FromDiskCache'] = str(
            response.get('fromDiskCache'))
        warcHeaders['X-Chrome-ConnectionReused'] = str(
            response.get('connectionReused'))
        warcHeaders['X-Chrome-Request-ID'] = item.id
        # request wall time plus the request-to-response timestamp delta
        responseWallTime = item.chromeRequest['wallTime'] + (
            item.chromeResponse['timestamp'] -
            item.chromeRequest['timestamp'])
        warcHeaders['WARC-Date'] = datetime_to_iso_date(
            datetime.utcfromtimestamp(responseWallTime))

        if truncatedReason:
            warcHeaders['WARC-Truncated'] = truncatedReason
        else:
            warcHeaders['X-Chrome-Base64Body'] = str(isBase64)

        statusLine = '{} {}'.format(response['status'], item.statusText)
        httpHeaders = StatusAndHeaders(statusLine,
                                       item.responseHeaders,
                                       protocol='HTTP/1.1')

        # the stored body is already decompressed and decoded, so these
        # headers no longer describe it
        for staleHeader in ('transfer-encoding', 'content-encoding'):
            httpHeaders.remove_header(staleHeader)

        # text bodies are stored as utf-8; the HTTP header overrides any
        # charset the document itself declares
        mimeType = response.get('mimeType')
        if mimeType:
            if not isBase64:
                mimeType = mimeType + '; charset=utf-8'
            httpHeaders.replace_header('content-type', mimeType)

        if rawBodyAvailable := payload is not None:
            httpHeaders.replace_header('content-length', str(len(payload)))
        bodyStream = BytesIO(payload) if rawBodyAvailable else BytesIO()

        written = self.writeRecord(response['url'],
                                   'response',
                                   warc_headers_dict=warcHeaders,
                                   payload=bodyStream,
                                   http_headers=httpHeaders)

        if item.resourceType == 'Document':
            recordId = written.rec_headers.get_header('WARC-Record-ID')
            self.documentRecords[item.url] = recordId