def test_post_stream(self):
    """POST a file-like body under capture_http and check both captured records.

    A pass-through filter is installed so that the filter hook is exercised;
    the response record is expected first in the written stream, followed
    by the request record.
    """
    def nop_filter(request, response, warc_writer):
        # Pass-through filter: only verifies both sides are truthy.
        assert request
        assert response
        return request, response

    warc_writer = BufferWARCWriter(gzip=False)
    url = 'http://localhost:{0}/post'.format(self.port)
    postbuff = BytesIO(b'somedatatopost')

    with capture_http(warc_writer, nop_filter):
        res = requests.post(url, data=postbuff)

    records = ArchiveIterator(warc_writer.get_stream())

    # response
    response = next(records)
    response_headers = response.rec_headers
    assert response.rec_type == 'response'
    assert response_headers['WARC-Target-URI'] == url
    assert response_headers['WARC-IP-Address'] == '127.0.0.1'

    captured_body = response.content_stream().read().decode('utf-8')
    assert res.json() == json.loads(captured_body)

    # request
    request = next(records)
    request_headers = request.rec_headers
    assert request.rec_type == 'request'
    assert request_headers['WARC-Target-URI'] == url
    assert request_headers['WARC-IP-Address'] == '127.0.0.1'

    posted = request.content_stream().read().decode('utf-8')
    assert posted == 'somedatatopost'
def test_post_json(self):
    """POST a JSON document under capture_http and check the captured pair.

    Verifies that the live response echoes the posted JSON, that the
    captured response body matches the live one, and that the captured
    request carries the JSON content type and serialized body.
    """
    payload = {'some': {'data': 'posted'}}
    endpoint = 'http://localhost:{0}/post'.format(self.port)
    warc_writer = BufferWARCWriter(gzip=False)

    with capture_http(warc_writer):
        res = requests.post(endpoint,
                            headers={'Host': 'httpbin.org'},
                            json=payload)

    assert res.json()['json'] == {'some': {'data': 'posted'}}

    records = ArchiveIterator(warc_writer.get_stream())

    # response
    response = next(records)
    assert response.rec_type == 'response'
    captured_body = response.content_stream().read().decode('utf-8')
    assert res.json() == json.loads(captured_body)

    # request
    request = next(records)
    assert request.rec_type == 'request'
    assert request.http_headers['Content-Type'] == 'application/json'

    sent_body = request.content_stream().read().decode('utf-8')
    assert sent_body == '{"some": {"data": "posted"}}'
def test_warc_enabled():
    """Check that an API call made with a WARC writer attached is recorded.

    After one ``get_json`` call, the writer's stream must contain exactly a
    response record followed by a request record for the group URL.
    """
    # Note that this does not use the responses mocking framework, as it conflicts with the warc captures.
    # This makes a real request to Yahoo, so might fail.
    url = 'https://groups.yahoo.com/api/v1/groups/test/'

    yga = YahooGroupsAPI('test')
    writer = BufferWARCWriter(gzip=False)
    yga.set_warc_writer(writer)
    yga.get_json('HackGroupInfo')

    # Collect (target URI, record type) for every record that was written.
    actual = []
    for record in ArchiveIterator(writer.get_stream()):
        target_uri = record.rec_headers['WARC-Target-URI']
        actual.append((target_uri, record.rec_type))

    expected = [(url, 'response'), (url, 'request')]
    assert expected == actual
def test_response_warc_1_1(self, is_gzip, builder_factory):
    """Write one WARC/1.1 response record and check its protocol and date.

    The record read back must report the WARC/1.1 protocol line and a
    WARC-Date containing a fractional-seconds part, 27 characters long.
    """
    version = 'WARC/1.1'
    writer = BufferWARCWriter(gzip=is_gzip, warc_version=version)
    builder = builder_factory(writer, warc_version=version)

    writer.write_record(sample_response(builder))

    recs = list(ArchiveIterator(writer.get_stream()))
    assert len(recs) == 1

    headers = recs[0].rec_headers
    assert headers.protocol == 'WARC/1.1'

    # ISO 8601 date with fractional seconds (microseconds)
    warc_date = headers['WARC-Date']
    assert '.' in warc_date
    assert len(warc_date) == 27
def test_request_response_concur(self):
    """Write a request/response pair and check the concurrent-to linkage.

    The two records read back must have distinct WARC-Record-IDs, and the
    request's WARC-Concurrent-To header must equal the response's ID.
    """
    writer = BufferWARCWriter(gzip=False)

    resp = self._sample_response(writer)
    req = self._sample_request(writer)
    writer.write_request_response_pair(req, resp)

    # The stream is expected to yield the response first, then the request.
    read_resp, read_req = list(ArchiveIterator(writer.get_stream()))

    resp_id = read_resp.rec_headers.get_header('WARC-Record-ID')
    req_id = read_req.rec_headers.get_header('WARC-Record-ID')
    assert resp_id != req_id

    concurrent_to = read_req.rec_headers.get_header('WARC-Concurrent-To')
    assert resp_id == concurrent_to
def test_get_cache_to_file(self):
    """GET a body of ``BUFF_SIZE * 2`` bytes under capture and verify both records.

    NOTE(review): going by the test name, the doubled buffer size is
    presumably meant to push the capture past its in-memory buffer and onto
    a file -- confirm against the capture implementation.
    """
    body_size = BUFF_SIZE * 2
    url = 'http://localhost:{0}/bytes/{1}'.format(self.port, body_size)
    warc_writer = BufferWARCWriter(gzip=False)

    with capture_http(warc_writer):
        res = requests.get(url, headers={'Host': 'httpbin.org'})

    assert len(res.content) == BUFF_SIZE * 2

    records = ArchiveIterator(warc_writer.get_stream())

    # response record comes first and must carry the full body
    response = next(records)
    response_headers = response.rec_headers
    assert response.rec_type == 'response'
    assert response_headers['WARC-Target-URI'] == url
    assert response_headers['WARC-IP-Address'] == '127.0.0.1'
    assert res.content == response.content_stream().read()

    # request record follows
    request = next(records)
    request_headers = request.rec_headers
    assert request.rec_type == 'request'
    assert request_headers['WARC-Target-URI'] == url
    assert request_headers['WARC-IP-Address'] == '127.0.0.1'
def test_request_response_concur(self, is_gzip, builder_factory):
    """Write a builder-made request/response pair and check their linkage.

    Records come from a ``RecordBuilder`` obtained through
    ``builder_factory``. After the round trip the record IDs must differ
    and the request must be concurrent-to the response.
    """
    writer = BufferWARCWriter(gzip=is_gzip)
    builder = builder_factory(writer, builder_cls=RecordBuilder)

    resp = sample_response(builder)
    req = sample_request(builder)
    writer.write_request_response_pair(req, resp)

    records = list(ArchiveIterator(writer.get_stream()))
    # Exactly two records are expected: response first, then request.
    read_resp, read_req = records

    req_headers = read_req.rec_headers
    resp_id = read_resp.rec_headers.get_header('WARC-Record-ID')
    req_id = req_headers.get_header('WARC-Record-ID')

    assert resp_id != req_id
    assert resp_id == req_headers.get_header('WARC-Concurrent-To')
def test_utf8_rewrite_content_adjust(self):
    """Re-write a record whose HTTP headers contain non-ASCII text.

    A response record with a Cyrillic filename in its HTTP headers is
    parsed and written back out. The output must equal
    ``RESPONSE_RECORD_UNICODE_HEADERS`` and the record read back must
    report a length of 268, versus the 226 bytes declared on input.
    """
    # HTTP message exactly as stored in the record block (CRLF header lines,
    # bare LF inside the body).
    UTF8_PAYLOAD = (
        u'HTTP/1.0 200 OK\r\n'
        u'Content-Type: text/plain; charset="UTF-8"\r\n'
        u'Content-Disposition: attachment; filename="испытание.txt"\r\n'
        u'Custom-Header: somevalue\r\n'
        u'Unicode-Header: %F0%9F%93%81%20text%20%F0%9F%97%84%EF%B8%8F\r\n'
        u'\r\n'
        u'some\n'
        u'text'
    )

    content_length = len(UTF8_PAYLOAD.encode('utf-8'))

    record_template = (
        u'WARC/1.0\r\n'
        u'WARC-Type: response\r\n'
        u'WARC-Record-ID: <urn:uuid:12345678-feb0-11e6-8f83-68a86d1772ce>\r\n'
        u'WARC-Target-URI: http://example.com/\r\n'
        u'WARC-Date: 2000-01-01T00:00:00Z\r\n'
        u'WARC-Payload-Digest: sha1:B6QJ6BNJ3R4B23XXMRKZKHLPGJY2VE4O\r\n'
        u'WARC-Block-Digest: sha1:KMUABC6URWIQ7QXCZDQ5FS6WIBBFRORR\r\n'
        u'Content-Type: application/http; msgtype=response\r\n'
        u'Content-Length: {0}\r\n'
        u'\r\n'
        u'{1}\r\n'
        u'\r\n'
    )
    UTF8_RECORD = record_template.format(content_length, UTF8_PAYLOAD)

    assert content_length == 226

    source_stream = BytesIO(UTF8_RECORD.encode('utf-8'))
    record = ArcWarcRecordLoader().parse_record_stream(source_stream)

    writer = BufferWARCWriter(gzip=False)
    writer.write_record(record)

    raw_buff = writer.get_contents()
    assert raw_buff.decode('utf-8') == RESPONSE_RECORD_UNICODE_HEADERS

    for record in ArchiveIterator(writer.get_stream()):
        assert record.length == 268
def test_identity(self):
    """Round trip: a record read back after writing should match the original.

    Compares record type, WARC headers, content type, length and HTTP
    headers, and checks that the raw stream yields the original payload.
    """
    payload = b'foobar'
    writer = BufferWARCWriter(gzip=True)

    record = writer.create_warc_record(
        'http://example.com/',
        'request',
        payload=BytesIO(payload),
        warc_headers_dict={'Foo': 'Bar'},
        http_headers=StatusAndHeaders('GET / HTTP/1.1', {},
                                      is_http_request=True),
    )
    writer.write_record(record)

    for new_rec in ArchiveIterator(writer.get_stream()):
        # Metadata must survive the write/read cycle unchanged.
        assert new_rec.rec_type == record.rec_type
        assert new_rec.rec_headers == record.rec_headers
        assert new_rec.content_type == record.content_type
        assert new_rec.length == record.length
        assert new_rec.http_headers == record.http_headers
        # ...and so must the payload bytes.
        assert new_rec.raw_stream.read() == payload
def test_request_response_concur(self, is_gzip, builder_factory):
    """Write a request/response pair after an explicit ``ensure_digest`` call.

    ``ensure_digest`` is called on the response with both block and payload
    digests enabled before the pair is written. The records read back must
    have distinct IDs, with the request concurrent-to the response.
    """
    writer = BufferWARCWriter(gzip=is_gzip)
    builder = builder_factory(writer, builder_cls=RecordBuilder)

    resp = sample_response(builder)
    req = sample_request(builder)

    # test explicitly calling ensure_digest with block digest enabled on a record
    writer.ensure_digest(resp, block=True, payload=True)

    writer.write_request_response_pair(req, resp)

    read_resp, read_req = list(ArchiveIterator(writer.get_stream()))

    resp_id = read_resp.rec_headers.get_header('WARC-Record-ID')
    req_id = read_req.rec_headers.get_header('WARC-Record-ID')
    linked_id = read_req.rec_headers.get_header('WARC-Concurrent-To')

    assert resp_id != req_id
    assert resp_id == linked_id
def test_post_chunked(self):
    """POST an iterator-supplied body under capture_http with IP recording off.

    The body is handed to ``requests`` as an iterator of byte chunks, and
    ``capture_http`` is called with ``record_ip=False``, so neither captured
    record may carry a ``WARC-IP-Address`` header. The captured request body
    must be the concatenation of the chunks.
    """
    warc_writer = BufferWARCWriter(gzip=False)

    def nop_filter(request, response, recorder):
        # Pass-through filter: only verifies both sides are truthy.
        assert request
        assert response
        return request, response

    def gen():
        # Body delivered piecewise rather than as a single bytes object.
        return iter([b'some', b'data', b'to', b'post'])

    # NOTE(review): this test targets the public httpbin.org host rather than
    # the local server, so it needs network access -- confirm that is intended.
    #url = 'http://localhost:{0}/post'.format(self.port)
    url = 'https://httpbin.org/post'

    with capture_http(warc_writer, nop_filter, record_ip=False):
        res = requests.post(url, data=gen(),
                            headers={'Content-Type': 'application/json'})

    ai = ArchiveIterator(warc_writer.get_stream())

    # response
    response = next(ai)
    assert response.rec_type == 'response'
    assert response.rec_headers['WARC-Target-URI'] == url
    assert 'WARC-IP-Address' not in response.rec_headers

    assert res.json() == json.loads(
        response.content_stream().read().decode('utf-8'))

    # request
    request = next(ai)
    assert request.rec_type == 'request'
    assert request.rec_headers['WARC-Target-URI'] == url
    # Check the request record itself; the original re-checked the response
    # record here, leaving the request's headers unverified.
    assert 'WARC-IP-Address' not in request.rec_headers

    data = request.content_stream().read().decode('utf-8')
    assert data == 'somedatatopost'