def test_to_str_2():
    """Parsed request headers must stringify back to ``req_headers``.

    The check is run twice: once on ``req_headers`` as-is and once with an
    extra trailing CRLF appended to the input; the expected string is the
    same in both cases.
    """
    for raw_input in (req_headers, req_headers + '\r\n'):
        parsed = StatusAndHeadersParser(['GET']).parse(StringIO(raw_input))
        assert str(parsed) == req_headers
def __init__(self, verify_http=True, arc2warc=True):
    """Create the header parsers used by this object.

    :param verify_http: passed as the second argument to the two HTTP
        parsers (``http_parser`` and ``http_req_parser``).
    :param arc2warc: when truthy, ``arc_parser`` is an
        ``ARC2WARCHeadersParser``; otherwise an ``ARCHeadersParser``.
    """
    arc_parser_cls = ARC2WARCHeadersParser if arc2warc else ARCHeadersParser
    self.arc_parser = arc_parser_cls()

    # One parser per family of accepted status-line prefixes.
    self.warc_parser = StatusAndHeadersParser(self.WARC_TYPES)
    self.http_parser = StatusAndHeadersParser(self.HTTP_TYPES, verify_http)
    self.http_req_parser = StatusAndHeadersParser(self.HTTP_VERBS,
                                                  verify_http)
def test_agg_local_revisit(self):
    """Fetch a resource from the ``local`` source and check its headers.

    Verifies the response headers, the WARC headers parsed from the start
    of the response body, and that the body contains the expected HTTP
    status line and HTML doctype.
    """
    request_url = ('/many/resource?url=http://www.example.com/'
                   '&closest=20140127171251&sources=local')
    resp = self.testapp.get(request_url)

    assert resp.headers['Warcserver-Source-Coll'] == 'local:dupes.cdxj'

    # The body starts with WARC headers; parse them off the raw bytes.
    warc_headers = StatusAndHeadersParser(['WARC/1.0']).parse(
        BytesIO(resp.body))

    expected_warc_headers = (
        ('WARC-Target-URI', 'http://example.com'),
        ('WARC-Date', '2014-01-27T17:12:51Z'),
        ('WARC-Refers-To-Target-URI', 'http://example.com'),
        ('WARC-Refers-To-Date', '2014-01-27T17:12:00Z'),
    )
    for header_name, header_value in expected_warc_headers:
        assert warc_headers.get_header(header_name) == header_value

    original_link = MementoUtils.make_link('http://example.com', 'original')
    assert resp.headers['Link'] == original_link
    assert resp.headers['Memento-Datetime'] == 'Mon, 27 Jan 2014 17:12:51 GMT'

    for marker in (b'HTTP/1.1 200 OK', b'<!doctype html>'):
        assert marker in resp.body

    assert 'ResErrors' not in resp.headers
def __init__(self, gzip=True, *args, **kwargs):
    """Store writer settings and create the headers parser.

    :param gzip: stored unchanged on ``self.gzip``.
    :param args: accepted but not used here.
    :param kwargs: ``warc_version`` (falls back to ``self.WARC_VERSION``
        when the key is absent) and ``header_filter`` (``None`` when
        absent) are read; other keys are ignored here.
    """
    self.gzip = gzip
    self.hostname = gethostname()

    # Accept any status line (empty prefix list) without verification.
    self.parser = StatusAndHeadersParser([], verify=False)

    default_version = self.WARC_VERSION
    self.warc_version = kwargs.get('warc_version', default_version)
    self.header_filter = kwargs.get('header_filter')
def __init__(self, gzip=True, *args, **kwargs):
    """Initialise the base class, then store writer settings.

    :param gzip: stored unchanged on ``self.gzip``.
    :param args: accepted but not used here.
    :param kwargs: ``warc_version`` and ``header_filter`` are forwarded to
        the parent initialiser (``None`` when absent); other keys are
        ignored here.
    """
    warc_version = kwargs.get('warc_version')
    header_filter = kwargs.get('header_filter')
    super(BaseWARCWriter, self).__init__(warc_version=warc_version,
                                         header_filter=header_filter)

    self.gzip = gzip
    self.hostname = gethostname()

    # Accept any status line (empty prefix list) without verification.
    self.parser = StatusAndHeadersParser([], verify=False)
def test_record_param_user_coll_revisit(self):
    """Record a URL into a per-user/collection path and check the revisit.

    Calls ``_test_all_warcs`` with 1 before the write and 2 after it, then
    inspects the Redis CDXJ index, the Redis WARC path hash, and the
    WARC headers of the record that the revisit CDX entry points to.
    """
    warc_path = to_path(self.root_dir + '/warcs/{user}/{coll}/')

    dedup_index = self._get_dedup_index()

    recorder_app = RecorderApp(
        self.upstream_url,
        PerRecordWARCWriter(warc_path, dedup_index=dedup_index))

    self._test_all_warcs('/warcs/USER/COLL/', 1)

    # Extra query parameters are '&'-separated. A '¶' in this literal is
    # an HTML-entity-decoded '&para' and would corrupt both parameters.
    resp = self._test_warc_write(
        recorder_app, 'httpbin.org', '/user-agent',
        '&param.recorder.user=USER&param.recorder.coll=COLL')

    assert '"user-agent": "{0}"'.format(UA) in resp.text

    self._test_all_warcs('/warcs/USER/COLL/', 2)

    # Test Redis CDX
    r = FakeStrictRedis.from_url('redis://localhost/2')

    res = r.zrangebylex('USER:COLL:cdxj', '[org,httpbin)/', '(org,httpbin,')
    assert len(res) == 2

    # Of the two index entries, pick the one that is the revisit.
    if b'warc/revisit' in res[0]:
        cdx = CDXObject(res[0])
    else:
        cdx = CDXObject(res[1])

    assert cdx['urlkey'] == 'org,httpbin)/user-agent'
    assert cdx['mime'] == 'warc/revisit'
    assert cdx['offset'] == '0'
    assert cdx['filename'].startswith(to_path('USER/COLL/'))
    assert cdx['filename'].endswith('.warc.gz')

    fullwarc = os.path.join(self.root_dir, 'warcs', cdx['filename'])

    warcs = r.hgetall('USER:COLL:warc')
    assert len(warcs) == 2
    assert warcs[cdx['filename'].encode('utf-8')] == fullwarc.encode('utf-8')

    with open(fullwarc, 'rb') as fh:
        decomp = DecompressingBufferedReader(fh)
        # Test refers-to headers
        status_headers = StatusAndHeadersParser(['WARC/1.0']).parse(decomp)
        assert status_headers.get_header('WARC-Type') == 'revisit'
        assert status_headers.get_header(
            'WARC-Target-URI') == 'http://httpbin.org/user-agent'
        assert status_headers.get_header('WARC-Date') != ''
        assert status_headers.get_header(
            'WARC-Refers-To-Target-URI') == 'http://httpbin.org/user-agent'
        assert status_headers.get_header('WARC-Refers-To-Date') != ''
def test_to_str_with_remove():
    """After removing the ``Foo`` header, the string form omits it."""
    parsed = StatusAndHeadersParser(['GET']).parse(StringIO(req_headers))
    parsed.remove_header('Foo')

    expected = (
        "GET / HTTP/1.0\r\n"
        "Content-Length: 0\r\n"
    )
    assert str(parsed) == expected
def _check_uri_date(self, resp, uri, dt):
    """Assert the WARC-Target-URI and WARC-Date parsed from ``resp.body``.

    :param resp: response whose ``body`` bytes are wrapped in a
        ``ChunkedDataReader`` and parsed as WARC/1.0 headers.
    :param uri: expected value of the ``WARC-Target-URI`` header.
    :param dt: expected value of the ``WARC-Date`` header, or the literal
        ``True`` to only require that the header differs from ``''``.
    """
    reader = ChunkedDataReader(BytesIO(resp.body))

    status_headers = StatusAndHeadersParser(['WARC/1.0']).parse(reader)
    assert status_headers.get_header('WARC-Target-URI') == uri

    # Identity check: only the boolean True is the "any date" flag; an
    # equality test would also match values such as the integer 1.
    if dt is True:
        assert status_headers.get_header('WARC-Date') != ''
    else:
        assert status_headers.get_header('WARC-Date') == dt
def test_to_str_1():
    """The string form of the parsed ``status_headers_1`` matches ``exp``."""
    parsed = StatusAndHeadersParser(['HTTP/1.0']).parse(
        StringIO(status_headers_1))
    actual = str(parsed)

    # NOTE(review): spacing inside the 'Multi-Line' value must match what
    # the parser emits for the status_headers_1 fixture - confirm.
    expected = (
        "HTTP/1.0 200 OK\r\n"
        "Content-Type: ABC\r\n"
        "Some: Value\r\n"
        "Multi-Line: Value1 Also This\r\n"
    )
    assert actual == expected
def __init__(self, paths, cdx_source):
    """Set up resolvers, the record loader and the headers parser.

    :param paths: stored on ``self.paths`` and passed to
        ``self.make_resolvers`` to build ``self.resolvers``.
    :param cdx_source: stored unchanged on ``self.cdx_source``.
    """
    self.paths = paths
    self.resolvers = self.make_resolvers(self.paths)

    loader = ResolvingLoader(self.resolvers, no_record_parse=True)
    self.resolve_loader = loader

    # Accept any status line (empty prefix list) without verification.
    headers_parser = StatusAndHeadersParser([], verify=False)
    self.headers_parser = headers_parser

    self.cdx_source = cdx_source
def test_to_str_exclude():
    """``to_str``/``to_bytes`` omit headers for which the filter gives None.

    The filter drops the header named ``multi-line`` (case-insensitive)
    and passes every other header through unchanged.
    """
    def drop_multi_line(header):
        return None if header[0].lower() == 'multi-line' else header

    sah = StatusAndHeadersParser(['HTTP/1.0']).parse(
        StringIO(status_headers_1))
    as_text = sah.to_str(drop_multi_line)

    expected = (
        "HTTP/1.0 200 OK\r\n"
        "Content-Type: ABC\r\n"
        "Some: Value\r\n"
    )
    assert as_text == expected

    # The bytes form is the latin-1 text followed by one more CRLF.
    expected_bytes = expected.encode('latin-1') + b'\r\n'
    assert sah.to_bytes(drop_multi_line) == expected_bytes
def __init__(self, env):
    """Keep the environ and parse status/headers from its input stream.

    :param env: mapping stored on ``self.env``; its ``'wsgi.input'``
        entry is handed to the parser and the result is stored on
        ``self.status_headers``.
    """
    self.env = env

    # Accept any status line (empty prefix list) without verification.
    self.status_headers = StatusAndHeadersParser([], verify=False).parse(
        self.env['wsgi.input'])
def test_status_one_word():
    """With ``verify=False``, the input ``'A'`` stringifies to ``'A\\r\\n'``."""
    parser = StatusAndHeadersParser(['GET'], verify=False)
    parsed = parser.parse(StringIO('A'))

    assert str(parsed) == 'A\r\n'
def test_status_empty():
    """Parsing an empty stream is expected to raise ``EOFError``."""
    with pytest.raises(EOFError):
        parser = StatusAndHeadersParser([], verify=False)
        parser.parse(StringIO(''))