def test_resolveRevisits_orig_fields(self):
    """
    With ``resolveRevisits`` enabled, the three extra fields are exposed as
    ``orig.length``, ``orig.offset`` and ``orig.filename``; they can be
    selected by those names through the ``fields`` parameter.
    """
    resp = self.query(
        'http://www.iana.org/_css/2013.1/print.css',
        resolveRevisits='1',
        fields='urlkey,orig.length,orig.offset,orig.filename'
    )

    assert resp.status_code == 200
    assert resp.content_type == 'text/x-cdxj'

    lines = resp.text.splitlines()

    # The first row must carry placeholder ('-') values for every orig.*
    # field (presumably because it is the original capture itself).
    first = CDXObject(lines[0].encode('utf-8'))
    assert first['orig.offset'] == '-'
    assert first['orig.length'] == '-'
    assert first['orig.filename'] == '-'

    # Every following row must have its orig.* fields filled in.
    for line in lines[1:]:
        record = CDXObject(line.encode('utf-8'))
        assert record['orig.offset'] != '-'
        assert record['orig.length'] != '-'
        assert record['orig.filename'] == 'iana.warc.gz'
def test_resolveRevisits(self):
    """
    With ``resolveRevisits=true`` the server adds three fields pointing to
    the *original* capture.
    """
    resp = self.query(
        'http://www.iana.org/_css/2013.1/print.css',
        resolveRevisits='true'
    )

    assert resp.status_code == 200
    assert resp.content_type == 'text/x-cdxj'

    # digest -> (length, offset, filename) of the row whose orig.* fields
    # are all '-'.
    seen_originals = {}

    for line in resp.text.splitlines():
        record = CDXObject(line.encode('utf-8'))
        assert len(record) == 16

        digest = record['digest']

        if record['orig.length'] != '-':
            # orig.* fields are populated: they must equal the fields
            # recorded earlier for the row with the same digest.
            expected = seen_originals.get(digest)
            assert expected == (int(record['orig.length']),
                                int(record['orig.offset']),
                                record['orig.filename'])
            continue

        # orig.* fields are either all '-' or (int, int, filename); here
        # orig.length is '-', so the other two must be '-' as well.
        assert record['orig.offset'] == '-' and record['orig.filename'] == '-'
        seen_originals[digest] = (int(record['length']),
                                  int(record['offset']),
                                  record['filename'])
def load_from_cdx_test(cdx, revisit_func=load_orig_cdx, reraise=False,
                       failed_files=None):
    """
    Load the record described by the CDX line ``cdx`` and print a summary.

    A ``ResolvingLoader`` is built from the resolvers for ``test_warc_dir``
    and called with the parsed CDX object, ``failed_files`` and
    ``revisit_func``. On success, the formatted headers and the first two
    lines read from the returned stream are written to stdout.

    If ``ArchiveLoadFailed`` is raised it is re-raised when ``reraise`` is
    true; otherwise ``Exception: <class name>`` is printed instead.
    """
    resolvers = DefaultResolverMixin.make_resolvers(test_warc_dir)
    loader = ResolvingLoader(resolvers)
    cdx_obj = CDXObject(cdx.encode('utf-8'))

    try:
        headers, stream = loader(cdx_obj, failed_files, revisit_func)
        print(repr_format(headers))
        # Echo the first two lines of the stream.
        for _ in range(2):
            sys.stdout.write(stream.readline().decode('utf-8'))
    except ArchiveLoadFailed as exc:
        if not reraise:
            print('Exception: ' + exc.__class__.__name__)
            return
        raise
def test_fields(self):
    """
    Retrieve a subset of fields with the ``fields`` parameter.

    Queries with ``fields='urlkey,timestamp,status'`` and checks the value
    of each requested field on every returned row.
    """
    resp = self.query('http://www.iana.org/_css/2013.1/print.css',
                      fields='urlkey,timestamp,status')

    assert resp.status_code == 200

    cdxes = resp.text.splitlines()
    # Guard against a vacuous pass: with no rows the loop below would
    # check nothing at all.
    assert cdxes

    for cdx in cdxes:
        cdx = CDXObject(cdx.encode('utf-8'))
        assert cdx['urlkey'] == 'org,iana)/_css/2013.1/print.css'
        assert re.match(r'\d{14}$', cdx['timestamp'])
        # re.match only anchors at the start, so the alternation is grouped
        # and anchored with '$'; otherwise values such as '2000' or '-x'
        # would be accepted as a valid status.
        assert re.match(r'(?:\d{3}|-)$', cdx['status'])
def load_from_cdx_test(cdx, revisit_func=load_orig_cdx, reraise=False,
                       failed_files=None):
    """
    Load the record described by the CDX line ``cdx`` and print a summary.

    A ``ResolvingLoader`` is built from the resolvers for ``test_warc_dir``
    and called with the parsed CDX object, ``failed_files`` and
    ``revisit_func``. On success, the formatted headers and the first two
    lines read from the returned stream are written to stdout.

    If ``ArchiveLoadFailed`` is raised it is re-raised when ``reraise`` is
    true; otherwise ``Exception: <class name>`` is printed instead.
    """
    resolvers = DefaultResolverMixin.make_resolvers(test_warc_dir)
    loader = ResolvingLoader(resolvers)
    cdx_obj = CDXObject(cdx.encode('utf-8'))

    try:
        headers, stream = loader(cdx_obj, failed_files, revisit_func)
        print(repr_format(headers))
        # Echo the first two lines of the stream.
        for _ in range(2):
            sys.stdout.write(stream.readline().decode('utf-8'))
    except ArchiveLoadFailed as exc:
        if not reraise:
            print('Exception: ' + exc.__class__.__name__)
            return
        raise