def test_post_json(self):
    """POST a JSON body via capture_http and verify the captured WARC pair."""
    writer = BufferWARCWriter(gzip=False)
    with capture_http(writer):
        res = requests.post(
            'http://localhost:{0}/post'.format(self.port),
            headers={'Host': 'httpbin.org'},
            json={'some': {'data': 'posted'}})

    assert res.json()['json'] == {'some': {'data': 'posted'}}

    records = ArchiveIterator(writer.get_stream())

    # The response record is written first.
    response = next(records)
    assert response.rec_type == 'response'
    assert json.loads(response.content_stream().read().decode('utf-8')) == res.json()

    # Followed by the matching request record.
    request = next(records)
    assert request.rec_type == 'request'
    assert request.http_headers['Content-Type'] == 'application/json'
    assert request.content_stream().read().decode('utf-8') == '{"some": {"data": "posted"}}'
def test_post_stream(self):
    """POST a file-like body and check both records' URI and IP headers."""
    writer = BufferWARCWriter(gzip=False)

    def passthrough(request, response, warc_writer):
        # Records everything unchanged; just sanity-checks the arguments.
        assert request
        assert response
        return request, response

    body = BytesIO(b'somedatatopost')
    url = 'http://localhost:{0}/post'.format(self.port)

    with capture_http(writer, passthrough):
        res = requests.post(url, data=body)

    records = ArchiveIterator(writer.get_stream())

    # Response record first.
    response = next(records)
    assert response.rec_type == 'response'
    assert response.rec_headers['WARC-Target-URI'] == url
    assert response.rec_headers['WARC-IP-Address'] == '127.0.0.1'
    assert json.loads(response.content_stream().read().decode('utf-8')) == res.json()

    # Then the request record.
    request = next(records)
    assert request.rec_type == 'request'
    assert request.rec_headers['WARC-Target-URI'] == url
    assert request.rec_headers['WARC-IP-Address'] == '127.0.0.1'
    assert request.content_stream().read().decode('utf-8') == 'somedatatopost'
def do_rewrite(cls, statusline, headers):
    """Build a response record for http://example.com/ and return its rewrite info."""
    http_headers = StatusAndHeaders(statusline, headers, protocol='HTTP/1.0')
    record = BufferWARCWriter().create_warc_record(
        'http://example.com/', 'response', http_headers=http_headers)
    return cls.get_rwinfo(record)
def _get_digest(self, record, name):
    """Return digest header `name`, computing a payload digest on demand."""
    digest = record.rec_headers.get(name)
    if digest:
        return digest

    # Lazily create a writer purely for its digest helper.
    if not self.writer:
        self.writer = BufferWARCWriter()
    self.writer.ensure_digest(record, block=False, payload=True)
    return record.rec_headers.get(name)
def make_detailed_warcinfo(filename, guid, coll_title, coll_desc, rec_title, pages):
    """Build gzipped warcinfo records for a collection and its recording.

    Thank you! Rhizome/Webrecorder.io/Ilya Kreymer
    """
    writer = BufferWARCWriter(gzip=True)

    params = OrderedDict([
        ('operator', 'Perma.cc download'),
        ('Perma-GUID', guid),
        ('format', 'WARC File Format 1.0'),
        ('json-metadata', json.dumps(
            {'type': 'collection', 'title': coll_title, 'desc': coll_desc})),
    ])

    # Collection-level warcinfo record.
    writer.write_record(writer.create_warcinfo_record(filename, params))

    # Recording-level warcinfo record reuses the same params dict,
    # swapping only the embedded JSON metadata.
    params['json-metadata'] = json.dumps(
        {'type': 'recording', 'title': rec_title, 'pages': pages})
    writer.write_record(writer.create_warcinfo_record(filename, params))

    return writer.get_contents()
def test_warc_enabled():
    """WARC capture of a live request writes a response + request record pair.

    Does not use the responses mocking framework (it conflicts with warc
    captures), so this makes a real request to Yahoo and might fail.
    """
    url = 'https://groups.yahoo.com/api/v1/groups/test/'
    yga = YahooGroupsAPI('test')
    writer = BufferWARCWriter(gzip=False)
    yga.set_warc_writer(writer)
    yga.get_json('HackGroupInfo')

    recorded = [(rec.rec_headers['WARC-Target-URI'], rec.rec_type)
                for rec in ArchiveIterator(writer.get_stream())]
    assert recorded == [(url, 'response'), (url, 'request')]
def _create_response_record(self, url, headers, payload, warc_headers):
    """Create a '200 OK' response WARC record for `url`.

    :param url: target URI for the record
    :param headers: HTTP header list for the response
    :param payload: response body; text or bytes (text is UTF-8 encoded)
    :param warc_headers: optional dict of extra WARC headers (may be None)
    :return: the new 'response' record

    Fix: the original called payload.encode('utf-8') unconditionally,
    which raised AttributeError when a bytes payload was passed; the
    sibling implementation guards with an isinstance check, so do the same.
    """
    writer = BufferWARCWriter()
    warc_headers = warc_headers or {}
    if isinstance(payload, str):
        payload = payload.encode('utf-8')
    http_headers = StatusAndHeaders('200 OK', headers, protocol='HTTP/1.0')
    return writer.create_warc_record(url, 'response',
                                     payload=BytesIO(payload),
                                     length=len(payload),
                                     http_headers=http_headers,
                                     warc_headers_dict=warc_headers)
def _create_response_record(self, url, headers, payload, warc_headers):
    """Build a '200 OK' response record for url with the given payload."""
    # Text payloads are encoded so the byte length can be computed.
    if isinstance(payload, six.text_type):
        payload = payload.encode('utf-8')

    http_headers = StatusAndHeaders('200 OK', headers, protocol='HTTP/1.0')
    return BufferWARCWriter().create_warc_record(
        url, 'response',
        payload=BytesIO(payload),
        length=len(payload),
        http_headers=http_headers,
        warc_headers_dict=warc_headers or {})
def test_skip_filter(self):
    """A filter returning (None, None) suppresses writing entirely."""
    writer = BufferWARCWriter(gzip=False)

    def drop_all(request, response, warc_writer):
        assert request
        assert response
        return None, None

    with capture_http(writer, drop_all):
        res = requests.get('http://localhost:{0}/get?foo=bar'.format(self.port),
                           headers={'Host': 'httpbin.org'})

    assert res.json()['args'] == {'foo': 'bar'}

    # Both records were skipped, so nothing was written.
    assert writer.get_contents() == b''
def test_response_warc_1_1(self, is_gzip, builder_factory):
    """WARC/1.1 records carry an ISO-8601 WARC-Date with fractional seconds."""
    writer = BufferWARCWriter(gzip=is_gzip, warc_version='WARC/1.1')
    builder = builder_factory(writer, warc_version='WARC/1.1')
    writer.write_record(sample_response(builder))

    records = list(ArchiveIterator(writer.get_stream()))
    assert len(records) == 1

    headers = records[0].rec_headers
    assert headers.protocol == 'WARC/1.1'
    # 'YYYY-MM-DDTHH:MM:SS.ffffffZ' with microseconds is 27 characters.
    assert '.' in headers['WARC-Date']
    assert len(headers['WARC-Date']) == 27
def test_request_response_concur(self):
    """write_request_response_pair links request to response via Concurrent-To."""
    writer = BufferWARCWriter(gzip=False)

    resp = self._sample_response(writer)
    req = self._sample_request(writer)
    writer.write_request_response_pair(req, resp)

    # Response is written first, then the request.
    resp, req = list(ArchiveIterator(writer.get_stream()))

    resp_id = resp.rec_headers.get_header('WARC-Record-ID')
    req_id = req.rec_headers.get_header('WARC-Record-ID')
    assert resp_id != req_id
    assert req.rec_headers.get_header('WARC-Concurrent-To') == resp_id
def test_request_response_concur(self, is_gzip, builder_factory):
    """Builder-produced req/resp pairs get distinct IDs linked by Concurrent-To."""
    writer = BufferWARCWriter(gzip=is_gzip)
    builder = builder_factory(writer, builder_cls=RecordBuilder)

    resp = sample_response(builder)
    req = sample_request(builder)
    writer.write_request_response_pair(req, resp)

    # Records come back response-first.
    resp_rec, req_rec = list(ArchiveIterator(writer.get_stream()))

    resp_id = resp_rec.rec_headers.get_header('WARC-Record-ID')
    req_id = req_rec.rec_headers.get_header('WARC-Record-ID')
    assert resp_id != req_id
    assert req_rec.rec_headers.get_header('WARC-Concurrent-To') == resp_id
def create_warcinfo(self, creator, name, metadata, source, serialized, filename):
    """Assemble a warcinfo record embedding collection metadata as JSON."""
    for key, value in iteritems(serialized):
        if key in self.COPY_FIELDS:
            metadata[key] = value

    # Fall back to an auto-generated title derived from the creation date.
    if not metadata.get('title'):
        metadata['title'] = self.DEFAULT_REC_TITLE.format(
            source.to_iso_date(metadata['created_at'], no_T=True))
        metadata['auto_title'] = True

    info = OrderedDict([
        ('software', 'Webrecorder Platform v' + __version__),
        ('format', 'WARC File Format 1.0'),
        ('creator', creator.name),
        ('isPartOf', name),
        ('json-metadata', json.dumps(metadata)),
    ])

    writer = BufferWARCWriter()
    writer.write_record(writer.create_warcinfo_record(filename, info))
    return writer.get_contents()
def test_get_cache_to_file(self):
    """Bodies larger than BUFF_SIZE are captured intact."""
    writer = BufferWARCWriter(gzip=False)
    url = 'http://localhost:{0}/bytes/{1}'.format(self.port, BUFF_SIZE * 2)

    with capture_http(writer):
        res = requests.get(url, headers={'Host': 'httpbin.org'})

    assert len(res.content) == BUFF_SIZE * 2

    records = ArchiveIterator(writer.get_stream())

    response = next(records)
    assert response.rec_type == 'response'
    assert response.rec_headers['WARC-Target-URI'] == url
    assert response.rec_headers['WARC-IP-Address'] == '127.0.0.1'
    assert response.content_stream().read() == res.content

    request = next(records)
    assert request.rec_type == 'request'
    assert request.rec_headers['WARC-Target-URI'] == url
    assert request.rec_headers['WARC-IP-Address'] == '127.0.0.1'
def test_post_chunked(self):
    """Chunked POST bodies are reassembled; record_ip=False omits WARC-IP-Address.

    Fix: the request-record section previously re-asserted against
    response.rec_headers (copy-paste) instead of request.rec_headers,
    so the request record's IP header was never actually checked.
    """
    warc_writer = BufferWARCWriter(gzip=False)

    def nop_filter(request, response, recorder):
        assert request
        assert response
        return request, response

    def gen():
        # Body supplied as an iterator triggers chunked transfer encoding.
        return iter([b'some', b'data', b'to', b'post'])

    #url = 'http://localhost:{0}/post'.format(self.port)
    url = 'https://httpbin.org/post'
    with capture_http(warc_writer, nop_filter, record_ip=False):
        res = requests.post(url, data=gen(),
                            headers={'Content-Type': 'application/json'})

    # response
    ai = ArchiveIterator(warc_writer.get_stream())
    response = next(ai)
    assert response.rec_type == 'response'
    assert response.rec_headers['WARC-Target-URI'] == url
    assert 'WARC-IP-Address' not in response.rec_headers

    assert res.json() == json.loads(
        response.content_stream().read().decode('utf-8'))

    # request
    request = next(ai)
    assert request.rec_type == 'request'
    assert request.rec_headers['WARC-Target-URI'] == url
    # Check the request record, not the response record again.
    assert 'WARC-IP-Address' not in request.rec_headers

    data = request.content_stream().read().decode('utf-8')
    assert data == 'somedatatopost'
def capture_http(warc_writer=None, filter_func=None, append=True, **kwargs):
    """Context manager: route recorded HTTP traffic into a WARC writer.

    :param warc_writer: None (a BufferWARCWriter is created, uncompressed by
        default), a filename string (opened for append or exclusive create),
        or an existing writer instance
    :param filter_func: optional (request, response, writer) filter
    :param append: for filename targets, append ('ab') vs. exclusive-create ('xb')
    :yield: the active writer

    Fix: compare to None with ``is`` instead of ``==`` (PEP 8); ``==`` invokes
    the object's equality protocol and is the wrong test for identity.
    """
    out = None
    if warc_writer is None:
        # Default to an uncompressed in-memory writer unless told otherwise.
        kwargs.setdefault('gzip', False)
        warc_writer = BufferWARCWriter(**kwargs)

    if isinstance(warc_writer, str):
        # 'xb' refuses to overwrite an existing file when append is False.
        out = open(warc_writer, 'ab' if append else 'xb')
        warc_writer = WARCWriter(out, **kwargs)

    try:
        recorder = RequestRecorder(warc_writer, filter_func)
        RecordingHTTPConnection.local.recorder = recorder
        yield warc_writer
    finally:
        # Always detach the recorder and close any file we opened.
        RecordingHTTPConnection.local.recorder = None
        if out:
            out.close()
def create_warcinfo(self, creator, title, metadata, source, filename):
    """Build a warcinfo record, copying whitelisted source fields into metadata."""
    for field, value in iteritems(source):
        if field in self.COPY_FIELDS:
            metadata[field] = value

    info = OrderedDict([
        ('software', 'Webrecorder Platform v' + __version__),
        ('format', 'WARC File Format 1.0'),
        ('creator', creator),
        ('isPartOf', title),
        ('json-metadata', json.dumps(metadata)),
    ])

    writer = BufferWARCWriter()
    writer.write_record(writer.create_warcinfo_record(filename, info))
    return writer.get_contents()
def test_request_response_concur(self, is_gzip, builder_factory):
    """Explicit ensure_digest(block=True) does not break pair linkage."""
    writer = BufferWARCWriter(gzip=is_gzip)
    builder = builder_factory(writer, builder_cls=RecordBuilder)

    resp = sample_response(builder)
    req = sample_request(builder)

    # Explicitly compute block + payload digests before writing.
    writer.ensure_digest(resp, block=True, payload=True)

    writer.write_request_response_pair(req, resp)

    resp_rec, req_rec = list(ArchiveIterator(writer.get_stream()))

    resp_id = resp_rec.rec_headers.get_header('WARC-Record-ID')
    req_id = req_rec.rec_headers.get_header('WARC-Record-ID')
    assert resp_id != req_id
    assert req_rec.rec_headers.get_header('WARC-Concurrent-To') == resp_id
def test_utf8_rewrite_content_adjust(self):
    """Re-writing a parsed record with non-ASCII headers keeps lengths consistent."""
    # HTTP payload with UTF-8 header values; built with line continuations so
    # the \r\n terminators are part of the string, not the source layout.
    UTF8_PAYLOAD = u'\
HTTP/1.0 200 OK\r\n\
Content-Type: text/plain; charset="UTF-8"\r\n\
Content-Disposition: attachment; filename="испытание.txt"\r\n\
Custom-Header: somevalue\r\n\
Unicode-Header: %F0%9F%93%81%20text%20%F0%9F%97%84%EF%B8%8F\r\n\
\r\n\
some\n\
text'

    # Content-Length must count encoded bytes, not characters.
    content_length = len(UTF8_PAYLOAD.encode('utf-8'))

    UTF8_RECORD = u'\
WARC/1.0\r\n\
WARC-Type: response\r\n\
WARC-Record-ID: <urn:uuid:12345678-feb0-11e6-8f83-68a86d1772ce>\r\n\
WARC-Target-URI: http://example.com/\r\n\
WARC-Date: 2000-01-01T00:00:00Z\r\n\
WARC-Payload-Digest: sha1:B6QJ6BNJ3R4B23XXMRKZKHLPGJY2VE4O\r\n\
WARC-Block-Digest: sha1:KMUABC6URWIQ7QXCZDQ5FS6WIBBFRORR\r\n\
Content-Type: application/http; msgtype=response\r\n\
Content-Length: {0}\r\n\
\r\n\
{1}\r\n\
\r\n\
'.format(content_length, UTF8_PAYLOAD)

    # Sanity: the UTF-8 payload encodes to exactly 226 bytes.
    assert (content_length == 226)

    record = ArcWarcRecordLoader().parse_record_stream(
        BytesIO(UTF8_RECORD.encode('utf-8')))

    writer = BufferWARCWriter(gzip=False)
    writer.write_record(record)

    raw_buff = writer.get_contents()
    # Serialized output must match the expected record text exactly.
    assert raw_buff.decode('utf-8') == RESPONSE_RECORD_UNICODE_HEADERS

    # Re-reading the stream reports the adjusted record length.
    for record in ArchiveIterator(writer.get_stream()):
        assert record.length == 268
def test_identity(self):
    """read(write(record)) should yield record."""
    payload = b'foobar'
    writer = BufferWARCWriter(gzip=True)

    record = writer.create_warc_record(
        'http://example.com/', 'request',
        payload=BytesIO(payload),
        warc_headers_dict={'Foo': 'Bar'},
        http_headers=StatusAndHeaders('GET / HTTP/1.1', {},
                                      is_http_request=True))
    writer.write_record(record)

    # Every field of the round-tripped record must match the original.
    for round_tripped in ArchiveIterator(writer.get_stream()):
        assert round_tripped.rec_type == record.rec_type
        assert round_tripped.rec_headers == record.rec_headers
        assert round_tripped.content_type == record.content_type
        assert round_tripped.length == record.length
        assert round_tripped.http_headers == record.http_headers
        assert round_tripped.raw_stream.read() == payload
def create_warcinfo(self, creator, name, metadata, source, serialized, filename):
    """Serialize collection metadata into a single warcinfo WARC record."""
    # Copy over only the whitelisted serialized fields.
    metadata.update({key: value
                     for key, value in iteritems(serialized)
                     if key in self.COPY_FIELDS})

    if not metadata.get('title'):
        # No explicit title: synthesize one from the creation date.
        iso_created = source.to_iso_date(metadata['created_at'], no_T=True)
        metadata['title'] = self.DEFAULT_REC_TITLE.format(iso_created)
        metadata['auto_title'] = True

    info = OrderedDict([
        ('software', 'Webrecorder Platform v' + __version__),
        ('format', 'WARC File Format 1.0'),
        ('creator', creator.name),
        ('isPartOf', name),
        ('json-metadata', json.dumps(metadata)),
    ])

    wi_writer = BufferWARCWriter()
    wi_writer.write_record(wi_writer.create_warcinfo_record(filename, info))
    return wi_writer.get_contents()
class CDXJIndexer(Indexer):
    """Indexer that emits CDXJ lines ('<urlkey> <timestamp> <json>') for WARC records."""

    # WARC/HTTP field name -> short CDXJ JSON key.
    field_names = {
        'warc-target-uri': 'url',
        'http:status': 'status',
        'warc-payload-digest': 'digest',
        'req.http:referer': 'referrer',
        'req.http:method': 'method',
    }

    # Reverse lookup: CDXJ key -> WARC/HTTP field name.
    inv_field_names = {k: v for v, k in field_names.items()}

    DEFAULT_FIELDS = [
        'warc-target-uri',
        'mime',
        'http:status',
        'warc-payload-digest',
        'length',
        'offset',
        'filename'
    ]

    # Record types indexed when no explicit 'records' option is given.
    DEFAULT_RECORDS = ['response', 'revisit', 'resource', 'metadata']

    def __init__(self, output, inputs, opts=None):
        """Configure indexed fields, record filtering and POST handling from opts."""
        opts = opts or {}
        fields = self._parse_fields(opts)
        super(CDXJIndexer, self).__init__(fields, inputs, output)
        self.writer = None  # lazily-created BufferWARCWriter, used only for digests
        self.curr_filename = None
        self.force_filename = opts.get('filename')
        self.post_append = opts.get('post_append')
        self.write_records = opts.get('records')
        if self.write_records == 'all':
            self.write_records = None  # None means "index every record type"
        elif self.write_records:
            self.write_records = self.write_records.split(',')
        else:
            self.write_records = self.DEFAULT_RECORDS

        # Request/response pairing is needed when POST bodies are appended to
        # the url key or any req.http:* field was requested.
        self.collect_records = self.post_append or any(
            field.startswith('req.http:') for field in self.fields)
        self.record_parse = True

    def _parse_fields(self, opts):
        """Return the field list: defaults extended by 'fields' or replaced by 'replace_fields'."""
        add_fields = opts.get('replace_fields')
        if add_fields:
            fields = []
        else:
            add_fields = opts.get('fields')
            fields = copy(self.DEFAULT_FIELDS)

        if add_fields:
            add_fields = add_fields.split(',')
            for field in add_fields:
                # Accept either the short CDXJ key or the raw field name.
                fields.append(self.inv_field_names.get(field, field))

        return fields

    def get_field(self, record, name, it, filename):
        """Resolve one index field value, with CDXJ-specific handling."""
        if name == 'mime':
            if record.rec_type == 'revisit':
                return 'warc/revisit'
            elif record.rec_type in ('response', 'request'):
                name = 'http:content-type'
            else:
                name = 'content-type'

            value = super(CDXJIndexer, self).get_field(record, name, it, filename)
            if value:
                # Strip charset and other parameters from the content type.
                value = value.split(';')[0].strip()
            return value

        if name == 'filename':
            return self.curr_filename

        if self.collect_records:
            # Offsets/lengths were stashed on the record by req_resolving_iter.
            if name == 'offset':
                return str(record.file_offset)
            elif name == 'length':
                return str(record.file_length)
            elif name.startswith('req.http:'):
                value = self._get_req_field(name, record)
                if value:
                    return value

        value = super(CDXJIndexer, self).get_field(record, name, it, filename)
        if name == 'warc-payload-digest':
            value = self._get_digest(record, name)
        return value

    def _get_req_field(self, name, record):
        """Look up a req.http:* field from the joined request record, if any."""
        if hasattr(record, 'req'):
            req = record.req
        elif record.rec_type == 'request':
            req = record
        else:
            return None

        if name == 'req.http:method':
            # NOTE(review): for request records the parsed statusline's
            # 'protocol' attribute appears to hold the HTTP method -- confirm
            # against the StatusAndHeaders parser.
            return req.http_headers.protocol
        else:
            # Strip the 'req.http:' prefix (9 chars) to get the header name.
            return req.http_headers.get_header(name[9:])

    def process_one(self, input_, output, filename):
        """Index a single input file, writing one line per matching record."""
        self.curr_filename = self.force_filename or os.path.basename(filename)

        it = self._create_record_iter(input_)
        self._write_header(output, filename)

        if self.collect_records:
            wrap_it = self.req_resolving_iter(it)
        else:
            wrap_it = it

        for record in wrap_it:
            if not self.write_records or record.rec_type in self.write_records:
                self.process_index_entry(it, record, filename, output)

    def _get_digest(self, record, name):
        """Return the payload digest (sans algorithm prefix), computing it if absent."""
        value = record.rec_headers.get(name)
        if not value:
            if not self.writer:
                self.writer = BufferWARCWriter()
            self.writer.ensure_digest(record, block=False, payload=True)
            value = record.rec_headers.get(name)

        if value:
            # Drop the 'sha1:'-style algorithm prefix.
            value = value.split(':')[-1]

        return value

    def _write_line(self, out, index, record, filename):
        """Compute urlkey + timestamp for a record and emit its CDXJ line."""
        url = index.get('url')
        if not url:
            url = record.rec_headers.get('WARC-Target-URI')

        dt = record.rec_headers.get('WARC-Date')
        ts = iso_date_to_timestamp(dt)

        # The urlkey may have been precomputed (e.g. POST query appended).
        if hasattr(record, 'urlkey'):
            urlkey = record.urlkey
        else:
            urlkey = self.get_url_key(url)

        self._do_write(urlkey, ts, index, out)

    def _do_write(self, urlkey, ts, index, out):
        """Write the formatted '<urlkey> <ts> <json>' line."""
        out.write(urlkey + ' ' + ts + ' ')
        out.write(json.dumps(index) + '\n')

    def get_url_key(self, url):
        """Canonicalize a URL into its SURT key; fall back to the raw URL."""
        try:
            return surt.surt(url)
        except:  #pragma: no coverage
            return url

    def _concur_req_resp(self, rec_1, rec_2):
        """Return (request, response) if the two adjacent records form a pair, else (None, None)."""
        if not rec_1 or not rec_2:
            return None, None

        # Must target the same URI...
        if (rec_1.rec_headers.get_header('WARC-Target-URI') !=
                rec_2.rec_headers.get_header('WARC-Target-URI')):
            return None, None

        # ...and the second must declare concurrency with the first.
        if (rec_2.rec_headers.get_header('WARC-Concurrent-To') !=
                rec_1.rec_headers.get_header('WARC-Record-ID')):
            return None, None

        if rec_1.rec_type == 'response' and rec_2.rec_type == 'request':
            req = rec_2
            resp = rec_1
        elif rec_1.rec_type == 'request' and rec_2.rec_type == 'response':
            req = rec_1
            resp = rec_2
        else:
            return None, None

        return req, resp

    def req_resolving_iter(self, record_iter):
        """Yield records in order, joining each request with its adjacent response."""
        prev_record = None

        for record in record_iter:
            if record.rec_type == 'request':
                # Buffer the request body so it can be re-read when joined.
                record.buffered_stream = BytesIO(record.content_stream().read())

            record.file_offset = record_iter.get_record_offset()
            record.file_length = record_iter.get_record_length()

            req, resp = self._concur_req_resp(prev_record, record)

            if not req or not resp:
                # Not a pair: flush the held record and hold the current one.
                if prev_record:
                    yield prev_record
                prev_record = record
                continue

            self._join_req_resp(req, resp)

            yield prev_record
            yield record
            prev_record = None

        if prev_record:
            yield prev_record

    def _join_req_resp(self, req, resp):
        """Attach request to response; append POST/PUT query data to the urlkey."""
        resp.req = req

        # See _get_req_field: 'protocol' holds the method for request records.
        method = req.http_headers.protocol
        if self.post_append and method.upper() in ('POST', 'PUT'):
            post_url = append_post_query(req, resp)
            if post_url:
                resp.urlkey = self.get_url_key(post_url)
                req.urlkey = resp.urlkey
class CDXJIndexer(Indexer):
    """Indexer emitting CDXJ lines, with optional sorting and compressed output."""

    # WARC/HTTP field name -> short CDXJ JSON key.
    field_names = {
        "warc-target-uri": "url",
        "http:status": "status",
        "warc-payload-digest": "digest",
        "req.http:referer": "referrer",
        "req.http:method": "method",
    }

    # Reverse lookup: CDXJ key -> WARC/HTTP field name.
    inv_field_names = {k: v for v, k in field_names.items()}

    DEFAULT_FIELDS = [
        "warc-target-uri",
        "mime",
        "http:status",
        "warc-payload-digest",
        "length",
        "offset",
        "filename",
    ]

    # Record types indexed by default.
    DEFAULT_RECORDS = ["response", "revisit", "resource", "metadata"]

    ALLOWED_EXT = (".arc", ".arc.gz", ".warc", ".warc.gz")

    # Splits a content-type at the first ';' or whitespace.
    RE_SPACE = re.compile(r'[;\s]')

    def __init__(self, output, inputs, post_append=False, sort=False,
                 compress=None, lines=300, data_out_name=None, filename=None,
                 fields=None, replace_fields=None, records=None,
                 verify_http=False, dir_root=None, **kwargs):
        """Configure field list, record filtering, sorting and compression options."""
        if isinstance(inputs, str) or hasattr(inputs, "read"):
            inputs = [inputs]
        inputs = iter_file_or_dir(inputs)
        fields = self._parse_fields(fields, replace_fields)
        super(CDXJIndexer, self).__init__(fields, inputs, output,
                                          verify_http=verify_http)
        self.writer = None  # lazily-created BufferWARCWriter, digests only
        self.curr_filename = None
        self.force_filename = filename
        self.post_append = post_append
        self.dir_root = dir_root
        self.num_lines = lines
        self.sort = sort
        self.compress = compress
        self.data_out_name = data_out_name
        self.include_records = records
        if self.include_records == "all":
            self.include_records = None  # None means "index every record type"
        elif self.include_records:
            self.include_records = self.include_records.split(",")
        else:
            self.include_records = self.DEFAULT_RECORDS

        # Request/response pairing needed for POST-append or req.http:* fields.
        self.collect_records = self.post_append or any(
            field.startswith("req.http:") for field in self.fields)
        self.record_parse = True

    def _parse_fields(self, fields=None, replace_fields=None):
        """Return the field list: defaults extended by `fields` or replaced wholesale."""
        add_fields = replace_fields
        if add_fields:
            fields = []
        else:
            add_fields = fields
            fields = copy(self.DEFAULT_FIELDS)

        if add_fields:
            add_fields = add_fields.split(",")
            for field in add_fields:
                # Accept either the short CDXJ key or the raw field name.
                fields.append(self.inv_field_names.get(field, field))

        return fields

    def get_field(self, record, name, it, filename):
        """Resolve one index field value, with CDXJ-specific handling."""
        if name == "mime":
            if record.rec_type == "revisit":
                return "warc/revisit"
            elif record.rec_type in ("response", "request"):
                name = "http:content-type"
            else:
                name = "content-type"

            value = super(CDXJIndexer, self).get_field(record, name, it, filename)
            if value:
                # Keep only the bare media type (drop parameters).
                value = self.RE_SPACE.split(value, 1)[0].strip()
            return value

        if name == "filename":
            return self.curr_filename

        if self.collect_records:
            # Offsets/lengths were stashed on the record by req_resolving_iter.
            if name == "offset":
                return str(record.file_offset)
            elif name == "length":
                return str(record.file_length)
            elif name.startswith("req.http:"):
                value = self._get_req_field(name, record)
                if value:
                    return value

        value = super(CDXJIndexer, self).get_field(record, name, it, filename)
        if name == "warc-payload-digest":
            value = self._get_digest(record, name)
        return value

    def _get_req_field(self, name, record):
        """Look up a req.http:* field from the joined request record, if any."""
        if hasattr(record, "req"):
            req = record.req
        elif record.rec_type == "request":
            req = record
        else:
            return None

        if name == "req.http:method":
            # NOTE(review): 'protocol' appears to hold the HTTP method for
            # request records -- confirm against the StatusAndHeaders parser.
            return req.http_headers.protocol
        else:
            # Strip the 'req.http:' prefix (9 chars) to get the header name.
            return req.http_headers.get_header(name[9:])

    def process_all(self):
        """Run indexing, optionally wrapping output in compression and/or sorting."""
        data_out = None
        with open_or_default(self.output, "wt", sys.stdout) as fh:
            if self.compress:
                if isinstance(self.compress, str):
                    # String target: open the data file ourselves and default
                    # the extension to .cdxj.gz if none was given.
                    data_out = open(self.compress, "wb")
                    if os.path.splitext(self.compress)[1] == "":
                        self.compress += ".cdxj.gz"
                    fh = CompressedWriter(
                        fh,
                        data_out=data_out,
                        data_out_name=self.compress,
                        num_lines=self.num_lines,
                    )
                else:
                    # Already a writable object.
                    fh = CompressedWriter(
                        fh,
                        data_out=self.compress,
                        data_out_name=self.data_out_name,
                        num_lines=self.num_lines,
                    )
            if self.sort:
                fh = SortingWriter(fh)

            self.output = fh

            super().process_all()

            # Sorting/compression buffer lines; flush before closing.
            if self.sort or self.compress:
                fh.flush()

            if data_out:
                data_out.close()

    def _resolve_rel_path(self, filename):
        """Return filename relative to dir_root (POSIX separators), or its basename."""
        if not self.dir_root:
            return os.path.basename(filename)

        path = os.path.relpath(filename, self.dir_root)
        if os.path.sep != "/":  # pragma: no cover
            path = path.replace(os.path.sep, "/")
        return path

    def process_one(self, input_, output, filename):
        """Index a single input file, writing one line per matching record."""
        self.curr_filename = self.force_filename or self._resolve_rel_path(
            filename)

        it = self._create_record_iter(input_)
        self._write_header(output, filename)

        if self.collect_records:
            wrap_it = self.req_resolving_iter(it)
        else:
            wrap_it = it

        for record in wrap_it:
            if not self.include_records or record.rec_type in self.include_records:
                self.process_index_entry(it, record, filename, output)

    def _get_digest(self, record, name):
        """Return the payload digest (sans algorithm prefix), computing it if absent."""
        value = record.rec_headers.get(name)
        if not value:
            if not self.writer:
                self.writer = BufferWARCWriter()
            self.writer.ensure_digest(record, block=False, payload=True)
            value = record.rec_headers.get(name)

        if value:
            # Drop the 'sha1:'-style algorithm prefix.
            value = value.split(":")[-1]

        return value

    def _write_line(self, out, index, record, filename):
        """Compute urlkey + timestamp for a record and emit its CDXJ line."""
        url = index.get("url")
        if not url:
            url = record.rec_headers.get("WARC-Target-URI")

        dt = record.rec_headers.get("WARC-Date")
        ts = iso_date_to_timestamp(dt)

        # The urlkey may have been precomputed (e.g. POST query appended).
        if hasattr(record, "urlkey"):
            urlkey = record.urlkey
        else:
            urlkey = self.get_url_key(url)

        self._do_write(urlkey, ts, index, out)

    def _do_write(self, urlkey, ts, index, out):
        """Write the formatted '<urlkey> <ts> <json>' line."""
        out.write(urlkey + " " + ts + " " + json.dumps(index) + "\n")

    def get_url_key(self, url):
        """Canonicalize a URL into its SURT key; fall back to the raw URL."""
        try:
            return surt.surt(url)
        except:  # pragma: no coverage
            return url

    def _concur_req_resp(self, rec_1, rec_2):
        """Return (request, response) if the two adjacent records form a pair, else (None, None)."""
        if not rec_1 or not rec_2:
            return None, None

        # Must target the same URI...
        if rec_1.rec_headers.get_header(
                "WARC-Target-URI") != rec_2.rec_headers.get_header(
                "WARC-Target-URI"):
            return None, None

        # ...and the second must declare concurrency with the first.
        if rec_2.rec_headers.get_header(
                "WARC-Concurrent-To") != rec_1.rec_headers.get_header(
                "WARC-Record-ID"):
            return None, None

        if rec_1.rec_type == "response" and rec_2.rec_type == "request":
            req = rec_2
            resp = rec_1
        elif rec_1.rec_type == "request" and rec_2.rec_type == "response":
            req = rec_1
            resp = rec_2
        else:
            return None, None

        return req, resp

    def req_resolving_iter(self, record_iter):
        """Yield records in order, joining each request with its adjacent response."""
        prev_record = None

        for record in record_iter:
            if record.rec_type == "request":
                # Buffer the request body so it can be re-read when joined.
                record.buffered_stream = BytesIO(
                    record.content_stream().read())

            record.file_offset = record_iter.get_record_offset()
            record.file_length = record_iter.get_record_length()

            req, resp = self._concur_req_resp(prev_record, record)

            if not req or not resp:
                # Not a pair: flush the held record and hold the current one.
                if prev_record:
                    yield prev_record
                prev_record = record
                continue

            self._join_req_resp(req, resp)

            yield prev_record
            yield record
            prev_record = None

        if prev_record:
            yield prev_record

    def _join_req_resp(self, req, resp):
        """Attach request to response; append POST/PUT query data to the urlkey."""
        resp.req = req

        # See _get_req_field: 'protocol' holds the method for request records.
        method = req.http_headers.protocol
        if self.post_append and method.upper() in ("POST", "PUT"):
            # NOTE(review): unlike the sibling implementation, there is no
            # 'if post_url:' guard here -- a None return from
            # append_post_query would reach get_url_key; verify intended.
            post_url = append_post_query(req, resp)
            resp.urlkey = self.get_url_key(post_url)
            req.urlkey = resp.urlkey
class CDXJIndexer(Indexer):
    """Indexer emitting CDXJ lines with optional per-record SHA-256 digests.

    Fixes applied:
    - the "Digest block mismatch" raise passed its format args as extra
      Exception arguments (message never formatted) and referenced an
      undefined name ``buff`` (NameError whenever it fired); it now formats
      the message with the actual values in scope.
    - ``not x in y`` rewritten as the idiomatic ``x not in y``.
    """

    # WARC/HTTP field name -> short CDXJ JSON key.
    field_names = {
        "warc-target-uri": "url",
        "http:status": "status",
        "warc-payload-digest": "digest",
        "req.http:referer": "referrer",
        "req.http:method": "method",
        "record-digest": "recordDigest",
    }

    # Reverse lookup: CDXJ key -> WARC/HTTP field name.
    inv_field_names = {k: v for v, k in field_names.items()}

    DEFAULT_FIELDS = [
        "warc-target-uri",
        "mime",
        "http:status",
        "warc-payload-digest",
        "length",
        "offset",
        "filename",
    ]

    # Record types indexed by default.
    DEFAULT_RECORDS = ["response", "revisit", "resource", "metadata"]

    ALLOWED_EXT = (".arc", ".arc.gz", ".warc", ".warc.gz")

    # Splits a content-type at the first ';' or whitespace.
    RE_SPACE = re.compile(r"[;\s]")

    # Chunk size for streaming the raw record through the hash.
    BUFF_SIZE = 1024 * 64

    DEFAULT_NUM_LINES = 300

    def __init__(self, output, inputs, post_append=False, sort=False,
                 compress=None, lines=DEFAULT_NUM_LINES, data_out_name=None,
                 filename=None, fields=None, replace_fields=None, records=None,
                 verify_http=False, dir_root=None, digest_records=False,
                 **kwargs):
        """Configure fields, record filtering, sorting, compression and digests."""
        if isinstance(inputs, str) or hasattr(inputs, "read"):
            inputs = [inputs]
        inputs = iter_file_or_dir(inputs)
        self.digest_records = digest_records
        fields = self._parse_fields(fields, replace_fields)
        super(CDXJIndexer, self).__init__(fields, inputs, output,
                                          verify_http=verify_http)
        self.writer = None  # lazily-created BufferWARCWriter, digests only
        self.curr_filename = None
        self.force_filename = filename
        self.post_append = post_append
        self.dir_root = dir_root
        self.num_lines = lines
        self.sort = sort
        self.compress = compress
        self.data_out_name = data_out_name
        self.include_records = records
        if self.include_records == "all":
            self.include_records = None  # None means "index every record type"
        elif self.include_records:
            self.include_records = self.include_records.split(",")
        else:
            self.include_records = self.DEFAULT_RECORDS

        # Request/response pairing needed for POST-append or req.http:* fields.
        self.collect_records = self.post_append or any(
            field.startswith("req.http:") for field in self.fields)
        self.record_parse = True

    def _parse_fields(self, fields=None, replace_fields=None):
        """Return the field list: defaults extended by `fields` or replaced wholesale."""
        add_fields = replace_fields
        if add_fields:
            fields = []
        else:
            add_fields = fields
            fields = copy(self.DEFAULT_FIELDS)
            # Record digests get their own implicit field.
            if self.digest_records and "record-digest" not in fields:
                fields.append("record-digest")

        if add_fields:
            add_fields = add_fields.split(",")
            for field in add_fields:
                # Accept either the short CDXJ key or the raw field name.
                fields.append(self.inv_field_names.get(field, field))

        return fields

    def get_field(self, record, name, it, filename):
        """Resolve one index field value, with CDXJ-specific handling."""
        if name == "mime":
            if record.rec_type == "revisit":
                return "warc/revisit"
            elif record.rec_type in ("response", "request"):
                name = "http:content-type"
            else:
                name = "content-type"

            value = super(CDXJIndexer, self).get_field(record, name, it, filename)
            if value:
                # Keep only the bare media type (drop parameters).
                value = self.RE_SPACE.split(value, 1)[0].strip()
            return value

        if name == "filename":
            return self.curr_filename

        if self.collect_records:
            # These attributes were stashed on the record by req_resolving_iter.
            if name == "offset":
                return str(record.file_offset)
            elif name == "length":
                return str(record.file_length)
            elif name == "record-digest":
                return str(record.record_digest)
            elif name.startswith("req.http:"):
                value = self._get_req_field(name, record)
                if value:
                    return value

        value = super(CDXJIndexer, self).get_field(record, name, it, filename)
        if name == "warc-payload-digest":
            value = self._get_digest(record, name)
        return value

    def _get_req_field(self, name, record):
        """Look up a req.http:* field from the joined request record, if any."""
        if hasattr(record, "req"):
            req = record.req
        elif record.rec_type == "request":
            req = record
        else:
            return None

        if name == "req.http:method":
            # NOTE(review): 'protocol' appears to hold the HTTP method for
            # request records -- confirm against the StatusAndHeaders parser.
            return req.http_headers.protocol
        else:
            # Strip the 'req.http:' prefix (9 chars) to get the header name.
            return req.http_headers.get_header(name[9:])

    def process_all(self):
        """Run indexing, optionally wrapping output in compression and/or sorting."""
        data_out = None
        with open_or_default(self.output, "wt", sys.stdout) as fh:
            if self.compress:
                if isinstance(self.compress, str):
                    # String target: open the data file ourselves and default
                    # the extension to .cdxj.gz if none was given.
                    data_out = open(self.compress, "wb")
                    if os.path.splitext(self.compress)[1] == "":
                        self.compress += ".cdxj.gz"
                    fh = CompressedWriter(
                        fh,
                        data_out=data_out,
                        data_out_name=self.compress,
                        num_lines=self.num_lines,
                        digest_records=self.digest_records,
                    )
                else:
                    # Already a writable object.
                    fh = CompressedWriter(
                        fh,
                        data_out=self.compress,
                        data_out_name=self.data_out_name,
                        num_lines=self.num_lines,
                        digest_records=self.digest_records,
                    )
            if self.sort:
                fh = SortingWriter(fh)

            self.output = fh

            super().process_all()

            # Sorting/compression buffer lines; flush before closing.
            if self.sort or self.compress:
                fh.flush()

            if data_out:
                data_out.close()

    def _resolve_rel_path(self, filename):
        """Return filename relative to dir_root (POSIX separators), or its basename."""
        if not self.dir_root:
            return os.path.basename(filename)

        path = os.path.relpath(filename, self.dir_root)
        if os.path.sep != "/":  # pragma: no cover
            path = path.replace(os.path.sep, "/")
        return path

    def process_one(self, input_, output, filename):
        """Index a single input file, writing one line per matching record."""
        self.curr_filename = self.force_filename or self._resolve_rel_path(
            filename)

        it = self._create_record_iter(input_)
        self._write_header(output, filename)

        if self.collect_records:
            # input_ doubles as the raw-byte reader for record digests.
            wrap_it = self.req_resolving_iter(it, input_)
        else:
            wrap_it = it

        for record in wrap_it:
            if not self.include_records or self.filter_record(record):
                self.process_index_entry(it, record, filename, output)

    def filter_record(self, record):
        """Return True if the record should be indexed."""
        # Idiomatic 'not in' (was 'not record.rec_type in ...').
        if record.rec_type not in self.include_records:
            return False

        # With the default filter, skip warc-fields resource/metadata records.
        if (self.include_records == self.DEFAULT_RECORDS
                and record.rec_type in ("resource", "metadata")
                and record.rec_headers.get_header("Content-Type")
                == "application/warc-fields"):
            return False

        return True

    def _get_digest(self, record, name):
        """Return digest header `name`, computing a payload digest on demand."""
        value = record.rec_headers.get(name)
        if not value:
            if not self.writer:
                self.writer = BufferWARCWriter()
            self.writer.ensure_digest(record, block=False, payload=True)
            value = record.rec_headers.get(name)

        return value

    def _write_line(self, out, index, record, filename):
        """Compute urlkey + timestamp for a record and emit its CDXJ line."""
        url = index.get("url")
        if not url:
            url = record.rec_headers.get("WARC-Target-URI")

        dt = record.rec_headers.get("WARC-Date")
        ts = iso_date_to_timestamp(dt)

        # The urlkey may have been precomputed (e.g. POST query appended).
        if hasattr(record, "urlkey"):
            urlkey = record.urlkey
        else:
            urlkey = self.get_url_key(url)

        # Extra attributes attached by _join_req_resp for POST/PUT requests.
        if hasattr(record, "requestBody"):
            index["requestBody"] = record.requestBody
        if hasattr(record, "method"):
            index["method"] = record.method

        self._do_write(urlkey, ts, index, out)

    def _do_write(self, urlkey, ts, index, out):
        """Write the formatted '<urlkey> <ts> <json>' line."""
        out.write(urlkey + " " + ts + " " + json.dumps(index) + "\n")

    def get_url_key(self, url):
        """Canonicalize a URL into its SURT key; fall back to the raw URL."""
        try:
            return surt.surt(url)
        except:  # pragma: no coverage
            return url

    def _concur_req_resp(self, rec_1, rec_2):
        """Return (request, response) if the two adjacent records form a pair, else (None, None)."""
        if not rec_1 or not rec_2:
            return None, None

        # Must target the same URI...
        if rec_1.rec_headers.get_header(
                "WARC-Target-URI") != rec_2.rec_headers.get_header(
                "WARC-Target-URI"):
            return None, None

        # ...and the second must declare concurrency with the first.
        if rec_2.rec_headers.get_header(
                "WARC-Concurrent-To") != rec_1.rec_headers.get_header(
                "WARC-Record-ID"):
            return None, None

        if rec_1.rec_type == "response" and rec_2.rec_type == "request":
            req = rec_2
            resp = rec_1
        elif rec_1.rec_type == "request" and rec_2.rec_type == "response":
            req = rec_1
            resp = rec_2
        else:
            return None, None

        return req, resp

    def read_content(self, record):
        """Buffer a record's content into a spooled temp file for later re-reading."""
        spool = tempfile.SpooledTemporaryFile()
        shutil.copyfileobj(record.content_stream(), spool)
        spool.seek(0)
        record.buffered_stream = spool
        # record.buffered_stream = BytesIO(record.content_stream().read())

    def req_resolving_iter(self, record_iter, digest_reader):
        """Yield records in order, pairing req/resp and digesting raw record bytes.

        :param record_iter: record iterator exposing get_record_offset/length
        :param digest_reader: seekable raw-byte reader over the same input,
            used to hash each record's on-disk bytes when digest_records is set
        :raises Exception: if fewer bytes than the record length could be hashed
        """
        prev_record = None

        for record in record_iter:
            # if record.rec_type == "request":
            self.read_content(record)

            record.file_offset = record_iter.get_record_offset()
            record.file_length = record_iter.get_record_length()

            if digest_reader and self.digest_records:
                # Hash the raw record bytes, restoring the read position after.
                curr = digest_reader.tell()
                digest_reader.seek(record.file_offset)
                record_digest, digest_length = self.digest_block(
                    digest_reader, record.file_length)
                digest_reader.seek(curr)

                if digest_length != record.file_length:
                    # Fix: format the message (it was passed as bare Exception
                    # args) and report digest_length ('buff' was undefined).
                    raise Exception(
                        "Digest block mismatch, expected {0}, got {1}".format(
                            record.file_length, digest_length))

                record.record_digest = record_digest

            req, resp = self._concur_req_resp(prev_record, record)

            if not req or not resp:
                # Not a pair: flush the held record and hold the current one.
                if prev_record:
                    yield prev_record
                    prev_record.buffered_stream.close()
                prev_record = record
                continue

            self._join_req_resp(req, resp)

            yield prev_record
            prev_record.buffered_stream.close()
            yield record
            record.buffered_stream.close()
            prev_record = None

        if prev_record:
            yield prev_record
            prev_record.buffered_stream.close()

    def _join_req_resp(self, req, resp):
        """Attach request to response; record method/body and extend the urlkey."""
        resp.req = req

        # See _get_req_field: 'protocol' holds the method for request records.
        method = req.http_headers.protocol
        if self.post_append and method.upper() in ("POST", "PUT"):
            url = req.rec_headers.get_header("WARC-Target-URI")
            query, append_str = append_method_query_from_req_resp(req, resp)
            resp.method = method.upper()
            resp.requestBody = query
            resp.urlkey = self.get_url_key(url + append_str)
            req.urlkey = resp.urlkey

    def digest_block(self, reader, length):
        """SHA-256 up to `length` bytes from reader; return ('sha256:<hex>', count)."""
        count = 0
        hasher = hashlib.sha256()

        while length > 0:
            buff = reader.read(min(self.BUFF_SIZE, length))
            if not buff:
                break
            hasher.update(buff)
            length -= len(buff)
            count += len(buff)

        return "sha256:" + hasher.hexdigest(), count
class CDXJIndexer(Indexer):
    """Indexer that emits CDXJ lines (urlkey + timestamp + JSON) for WARC/ARC
    records, with optional sorting, compression, request/response pairing and
    per-record block digests.
    """

    # maps internal field names -> short CDXJ JSON keys
    field_names = {
        "warc-target-uri": "url",
        "http:status": "status",
        "warc-payload-digest": "digest",
        "req.http:referer": "referrer",
        "req.http:method": "method",
        "record-digest": "recordDigest",
    }

    # reverse mapping: CDXJ JSON key -> internal field name
    inv_field_names = {k: v for v, k in field_names.items()}

    DEFAULT_FIELDS = [
        "warc-target-uri",
        "mime",
        "http:status",
        "warc-payload-digest",
        "length",
        "offset",
        "filename",
    ]

    DEFAULT_RECORDS = ["response", "revisit", "resource", "metadata"]

    ALLOWED_EXT = (".arc", ".arc.gz", ".warc", ".warc.gz")

    RE_SPACE = re.compile(r"[;\s]")

    DEFAULT_NUM_LINES = 300

    def __init__(self, output, inputs, post_append=False, sort=False,
                 compress=None, lines=DEFAULT_NUM_LINES,
                 max_sort_buff_size=None, data_out_name=None, filename=None,
                 fields=None, replace_fields=None, records=None,
                 verify_http=False, dir_root=None, digest_records=False,
                 **kwargs):
        # Accept a single filename/stream as well as a list of inputs
        if isinstance(inputs, str) or hasattr(inputs, "read"):
            inputs = [inputs]

        inputs = iter_file_or_dir(inputs)

        self.digest_records = digest_records

        fields = self._parse_fields(fields, replace_fields)

        super(CDXJIndexer, self).__init__(fields, inputs, output,
                                          verify_http=verify_http)

        self.writer = None
        self.curr_filename = None
        self.force_filename = filename
        self.post_append = post_append
        self.dir_root = dir_root
        self.num_lines = lines
        self.max_sort_buff_size = max_sort_buff_size
        self.sort = sort
        self.compress = compress
        self.data_out_name = data_out_name

        # "all" -> no filtering; comma-separated string -> explicit list;
        # otherwise fall back to the default record types
        self.include_records = records
        if self.include_records == "all":
            self.include_records = None
        elif self.include_records:
            self.include_records = self.include_records.split(",")
        else:
            self.include_records = self.DEFAULT_RECORDS

        # request buffering is needed for post-append and any req.http:* field
        self.collect_records = self.post_append or any(
            field.startswith("req.http:") for field in self.fields)

        self.record_parse = True

    def _parse_fields(self, fields=None, replace_fields=None):
        """Build the final field list from the defaults plus user additions.

        ``replace_fields`` replaces the defaults entirely; ``fields`` is
        appended to the defaults. Short CDXJ names are mapped back to
        internal field names via inv_field_names.
        """
        add_fields = replace_fields
        if add_fields:
            fields = []
        else:
            add_fields = fields
            fields = copy(self.DEFAULT_FIELDS)

        if self.digest_records and "record-digest" not in fields:
            fields.append("record-digest")

        if add_fields:
            add_fields = add_fields.split(",")
            for field in add_fields:
                fields.append(self.inv_field_names.get(field, field))

        return fields

    def get_field(self, record, name, it, filename):
        """Resolve a single index field value for ``record``."""
        if name == "mime":
            if record.rec_type == "revisit":
                return "warc/revisit"
            elif record.rec_type in ("response", "request"):
                name = "http:content-type"
            else:
                name = "content-type"

            value = super(CDXJIndexer, self).get_field(
                record, name, it, filename)
            if value:
                # strip charset/params and surrounding whitespace
                value = self.RE_SPACE.split(value, 1)[0].strip()
            return value

        if name == "filename":
            return self.curr_filename

        if self.collect_records:
            # offsets/lengths/digests were precomputed by the buffering iter
            if name == "offset":
                return str(record.file_offset)
            elif name == "length":
                return str(record.file_length)
            elif name == "record-digest":
                return str(record.record_digest)
            elif name.startswith("req.http:"):
                value = self._get_req_field(name, record)
                if value:
                    return value

        value = super(CDXJIndexer, self).get_field(record, name, it, filename)

        if name == "warc-payload-digest":
            value = self._get_digest(record, name)

        return value

    def _get_req_field(self, name, record):
        """Return a field from the paired request record, if available."""
        if hasattr(record, "req"):
            req = record.req
        elif record.rec_type == "request":
            req = record
        else:
            return None

        if name == "req.http:method":
            # warcio stores the request method in http_headers.protocol
            return req.http_headers.protocol
        else:
            return req.http_headers.get_header(name[9:])

    def process_all(self):
        """Run indexing, wrapping the output in compression and/or sorting
        writers as configured, and flushing/closing them at the end."""
        data_out = None
        with open_or_default(self.output, "wt", sys.stdout) as fh:
            if self.compress:
                if isinstance(self.compress, str):
                    # BUGFIX: apply the default extension *before* opening,
                    # so the file created on disk matches the recorded
                    # data_out_name (previously the file was opened under
                    # the extensionless name).
                    if os.path.splitext(self.compress)[1] == "":
                        self.compress += ".cdxj.gz"
                    data_out = open(self.compress, "wb")
                    fh = CompressedWriter(
                        fh,
                        data_out=data_out,
                        data_out_name=self.compress,
                        num_lines=self.num_lines,
                        digest_records=self.digest_records,
                    )
                else:
                    # self.compress is already an open stream
                    fh = CompressedWriter(
                        fh,
                        data_out=self.compress,
                        data_out_name=self.data_out_name,
                        num_lines=self.num_lines,
                        digest_records=self.digest_records,
                    )

            if self.sort:
                fh = SortingWriter(fh, self.max_sort_buff_size)

            self.output = fh

            super().process_all()

            if self.sort or self.compress:
                fh.flush()

        if data_out:
            data_out.close()

    def _resolve_rel_path(self, filename):
        """Return ``filename`` relative to dir_root (or its basename),
        normalized to forward slashes."""
        if not self.dir_root:
            return os.path.basename(filename)

        path = os.path.relpath(filename, self.dir_root)
        if os.path.sep != "/":  # pragma: no cover
            path = path.replace(os.path.sep, "/")
        return path

    def process_one(self, input_, output, filename):
        """Index a single input file, optionally buffering request/response
        pairs when extra request fields or post-append are needed."""
        self.curr_filename = (self.force_filename
                              or self._resolve_rel_path(filename))

        it = self._create_record_iter(input_)

        self._write_header(output, filename)

        if self.collect_records:
            digest_reader = input_ if self.digest_records else None
            wrap_it = buffering_record_iter(
                it,
                post_append=self.post_append,
                digest_reader=digest_reader,
                url_key_func=self.get_url_key,
            )
        else:
            wrap_it = it

        for record in wrap_it:
            if not self.include_records or self.filter_record(record):
                self.process_index_entry(it, record, filename, output)

    def filter_record(self, record):
        """Return True if ``record`` should be indexed."""
        if record.rec_type not in self.include_records:
            return False

        # By default, skip warc-fields metadata/resource records
        if (self.include_records == self.DEFAULT_RECORDS
                and record.rec_type in ("resource", "metadata")
                and record.rec_headers.get_header("Content-Type")
                == "application/warc-fields"):
            return False

        return True

    def _get_digest(self, record, name):
        """Return the digest header ``name``, computing it on demand via a
        lazily-created BufferWARCWriter if the record lacks one."""
        value = record.rec_headers.get(name)
        if not value:
            if not self.writer:
                self.writer = BufferWARCWriter()
            self.writer.ensure_digest(record, block=False, payload=True)
            value = record.rec_headers.get(name)

        return value

    def _write_line(self, out, index, record, filename):
        """Emit one CDXJ line for ``record`` using the collected ``index``."""
        url = index.get("url")
        if not url:
            url = record.rec_headers.get("WARC-Target-URI")

        dt = record.rec_headers.get("WARC-Date")

        ts = iso_date_to_timestamp(dt)

        # urlkey may have been precomputed (e.g. post-append joined key)
        if hasattr(record, "urlkey"):
            urlkey = record.urlkey
        else:
            urlkey = self.get_url_key(url)

        if hasattr(record, "requestBody"):
            index["requestBody"] = record.requestBody
        if hasattr(record, "method"):
            index["method"] = record.method

        self._do_write(urlkey, ts, index, out)

    def _do_write(self, urlkey, ts, index, out):
        """Write a single 'urlkey timestamp {json}' line."""
        out.write(urlkey + " " + ts + " " + json.dumps(index) + "\n")

    def get_url_key(self, url):
        """Return the canonicalized SURT key for ``url``; fall back to the
        raw url if canonicalization fails."""
        try:
            return surt.surt(url)
        # BUGFIX: narrowed from a bare except, which would also swallow
        # SystemExit/KeyboardInterrupt
        except Exception:  # pragma: no coverage
            return url