def test_unicode_url():
    x = CDXObject(u'com,example,cafe)/ 123 {"url": "http://example.com/café/path"}'.encode('utf-8'))
    assert x['urlkey'] == 'com,example,cafe)/'
    assert x['timestamp'] == '123'
    assert x['url'] == 'http://example.com/caf%C3%A9/path'

    assert x.to_cdxj() == 'com,example,cafe)/ 123 {"url": "http://example.com/caf%C3%A9/path"}\n'

def lookup(self, digest, url, timestamp):
    start, end = calc_search_range(url, 'exact')

    results = self.redis.zrangebylex(self.key, '[' + start, '(' + end)

    for res in results:
        cdx = CDXObject(res)
        if digest == cdx.get('digest'):
            return ('revisit', cdx['url'], timestamp_to_datetime(cdx['timestamp']))

    return None

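# calc_search_range(), used above and again in add_page() and _get_url_ts()
# below, converts a URL into a lexicographic key range over the
# SURT-canonicalized URL space, which pairs naturally with Redis ZRANGEBYLEX.
# A minimal sketch of the pattern (import path assumes pywb; the Redis key
# name is hypothetical):
from pywb.utils.canonicalize import calc_search_range

start, end = calc_search_range('http://example.com/path', 'exact')

# '[' makes the start bound inclusive, '(' makes the end bound exclusive:
# results = redis.zrangebylex('coll:cdxj', '[' + start, '(' + end)
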
def test_lt_le():
    A = CDXObject(b'ca,example)/ 2016 {"url": "http://example.com/"}')
    B = CDXObject(b'com,example)/ 2015 {"url": "http://example.com/"}')
    C = CDXObject(b'com,example)/ 2016 {"url": "http://example.com/"}')

    assert A < B
    assert B < C

    assert B >= A
    assert C >= A

    assert A < C

def load_from_cdx_test(cdx, revisit_func=load_orig_cdx, reraise=False,
                       failed_files=None):
    resolve_loader = ResolvingLoader(PathResolverMapper()(test_warc_dir))
    cdx = CDXObject(cdx.encode('utf-8'))

    try:
        (headers, stream) = resolve_loader(cdx, failed_files, revisit_func)
        print(repr(headers))
        sys.stdout.write(stream.readline().decode('utf-8'))
        sys.stdout.write(stream.readline().decode('utf-8'))
    except ArchiveLoadFailed as e:
        if reraise:
            raise
        else:
            print('Exception: ' + e.__class__.__name__)

def cdx_index(self, z_key, stream, filename):
    cdxout = BytesIO()
    write_cdx_index(cdxout, stream, filename,
                    cdxj=True, append_post=True)

    cdx_list = cdxout.getvalue().rstrip().split(b'\n')

    count = 0
    min_ = max_ = None

    for cdx in cdx_list:
        if cdx and not self.dry:
            # redis-py 2.x zadd signature (score before member);
            # redis-py 3.x expects a mapping instead
            self.dst_redis.zadd(z_key, 0, cdx)

            cdxobj = CDXObject(cdx)
            ts = cdxobj['timestamp']
            min_ = min(min_, ts) if min_ else ts
            max_ = max(max_, ts) if max_ else ts
            count += 1

    if count:
        min_ = timestamp_to_sec(min_)
        max_ = timestamp_to_sec(max_)

    logging.info(' CDXJ: {0} {1} {2}'.format(count, min_, max_))
    return min_, max_

def test_anon_download_coll(self):
    res = self._get_anon('/temp/$download')
    assert res.headers['Content-Disposition'].startswith("attachment; filename*=UTF-8''temp-")

    warcin = self._get_dechunked(res.body)

    cdxout = BytesIO()
    write_cdx_index(cdxout, warcin, 'temp.warc.gz',
                    include_all=True, cdxj=True)

    #print(cdxout.getvalue().decode('utf-8'))

    cdx = [CDXObject(cdx) for cdx in cdxout.getvalue().rstrip().split(b'\n')]
    assert len(cdx) == 6

    # response
    assert cdx[0]['url'] == 'http://httpbin.org/get?food=bar'
    assert cdx[0]['mime'] == 'application/json'

    # request
    assert cdx[1]['url'] == 'http://httpbin.org/get?food=bar'
    assert cdx[1]['mime'] == '-'

    # response
    assert cdx[2]['url'] == 'http://httpbin.org/get?bood=far'
    assert cdx[2]['mime'] == 'application/json'

    # request
    assert cdx[3]['url'] == 'http://httpbin.org/get?bood=far'
    assert cdx[3]['mime'] == '-'

def add_page(self, user, coll, pagedata):
    if not self.can_write_coll(user, coll):
        print('Cannot Write')
        return False

    url = pagedata['url']

    try:
        key, end_key = calc_search_range(url, 'exact')
    except:
        print('Cannot Canon')
        return False

    if 'ts' not in pagedata:
        cdx_key = self.make_key(user, coll, self.CDX_KEY)
        result = self.redis.zrangebylex(cdx_key, '[' + key, '(' + end_key)

        if not result:
            print('NO CDX')
            return False

        last_cdx = CDXObject(result[-1])

        pagedata['ts'] = last_cdx['timestamp']

    pagedata_json = json.dumps(pagedata)

    key = self.make_key(user, coll, self.PAGE_KEY)

    self.redis.sadd(key, pagedata_json)

def index(self):
    for filename in self.sources:
        # the with block closes the file; no explicit close() is needed
        with open_file(filename) as fd:
            for line in fd:
                idx = CDXObject(line)
                idx['url_parsed'] = urlparse(idx['url'])
                yield idx

def convert_line(self, line, url):
    timestamp, mime, filename = line.split('\t')

    cdx = CDXObject()
    cdx['urlkey'] = canonicalize(url)
    cdx['timestamp'] = timestamp
    cdx['original'] = url
    cdx['mimetype'] = mime
    cdx['statuscode'] = '200'
    cdx['digest'] = '-'
    cdx['length'] = '-1'
    cdx['offset'] = '0'
    cdx['filename'] = filename
    return cdx

def convert_to_cdx(self, item, urlkey, url):
    cdx = CDXObject()
    cdx['urlkey'] = canonicalize(url)
    cdx['timestamp'] = gettext(item, 'tstamp')[:14]
    cdx['url'] = url
    cdx['mime'] = gettext(item, 'primaryType') + '/' + gettext(item, 'subType')
    cdx['status'] = '-'
    cdx['digest'] = gettext(item, 'digest')
    #cdx['length'] = gettext(item, 'contentLength')
    cdx['length'] = '-'
    cdx['offset'] = gettext(item, 'arcoffset')
    cdx['filename'] = gettext(item, 'arcname') + '.arc.gz'
    return cdx

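# Note the two field-name schemes above: convert_line() uses the older pywb
# names ('original', 'mimetype', 'statuscode'), while convert_to_cdx() uses
# the newer ones ('url', 'mime', 'status'). CDXObject is an OrderedDict
# subclass, so an empty instance simply stores whatever keys the caller sets,
# in insertion order. A quick illustration (import path assumes an older
# pywb layout):
from pywb.cdx.cdxobject import CDXObject

cdx = CDXObject()
cdx['urlkey'] = 'com,example)/'
cdx['url'] = 'http://example.com/'
print(list(cdx.keys()))  # ['urlkey', 'url']
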
def test_collapseTime_resolveRevisits_reverse(client):
    resp = query(client, 'http://www.iana.org/_css/2013.1/print.css',
                 collapseTime='11', resolveRevisits='true', reverse='true')

    cdxes = [CDXObject(l) for l in resp.body.splitlines()]
    assert len(cdxes) == 3

    # timestamp is in descending order
    for i in range(len(cdxes) - 1):
        assert cdxes[i]['timestamp'] >= cdxes[i + 1]['timestamp']

def detect_pages(self, user, coll, rec):
    key = self.cdxj_key.format(user=user, coll=coll, rec=rec)

    pages = []

    #for member, score in self.manager.redis.zscan_iter(key):
    for member in self.manager.redis.zrange(key, 0, -1):
        cdxj = CDXObject(member.encode('utf-8'))

        if len(pages) < 500 and self.is_page(cdxj):
            pages.append(dict(url=cdxj['url'],
                              timestamp=cdxj['timestamp']))

    return pages

def load_from_cdx_test(cdx, revisit_func=load_orig_cdx, reraise=False,
                       failed_files=None):
    resolve_loader = ResolvingLoader(test_warc_dir)
    cdx = CDXObject(cdx)

    try:
        (headers, stream) = resolve_loader(cdx, failed_files, revisit_func)
        print(headers)
        sys.stdout.write(stream.readline())
        sys.stdout.write(stream.readline())
    except ArchiveLoadFailed as e:
        if reraise:
            raise
        else:
            print('Exception: ' + e.__class__.__name__)

def _get_url_ts(self, user, coll, rec, url):
    try:
        key, end_key = calc_search_range(url, 'exact')
    except:
        return None

    cdx_key = self.cdx_key.format(user=user, coll=coll, rec=rec)

    result = self.redis.zrangebylex(cdx_key, '[' + key, '(' + end_key)
    if not result:
        return None

    last_cdx = CDXObject(result[-1])

    return last_cdx['timestamp']

def convert_to_cdxj(self):
    cdxj_writer = CDXJ()
    for filename in self.iter_cdx_files():
        outfile = filename + 'j'

        print('Converting {0} -> {1}'.format(filename, outfile))

        with open(outfile + '.tmp', 'w+b') as out:
            with open(filename) as fh:
                for line in fh:
                    if line.startswith(' CDX'):
                        continue

                    cdx = CDXObject(line)
                    cdx[URLKEY] = canonicalize(cdx[ORIGINAL])
                    cdxj_writer.write_cdx_line(out, cdx, cdx['filename'])

        shutil.move(outfile + '.tmp', outfile)
        os.remove(filename)

def test_convert_cdx(self):
    """ Create non-surt cdx, then convert to cdxj
    """
    migrate_dir = os.path.join(self.root_dir, '_migrate')
    os.mkdir(migrate_dir)

    cdxindexer_main(['-u', migrate_dir, self._get_sample_warc('')])

    # try one file with -9
    cdxindexer_main(['-u', '-9', migrate_dir,
                     self._get_sample_warc('example.warc.gz')])

    cdxs = os.listdir(migrate_dir)
    assert all(x.endswith('.cdx') for x in cdxs)

    @patch('pywb.manager.manager.get_input', lambda x: 'blah')
    def do_migrate_no():
        main(['cdx-convert', migrate_dir])

    do_migrate_no()

    assert os.listdir(migrate_dir) == cdxs

    @patch('pywb.manager.manager.get_input', lambda x: 'y')
    def do_migrate_yes():
        main(['cdx-convert', migrate_dir])

    do_migrate_yes()

    cdxjs = os.listdir(migrate_dir)
    assert len(cdxs) == len(cdxjs)
    assert all(x.endswith('.cdxj') for x in cdxjs)

    with open(os.path.join(migrate_dir, 'iana.cdxj'), 'rb') as fh:
        cdx = CDXObject(fh.readline())
        assert cdx['urlkey'] == 'org,iana)/'
        assert cdx['timestamp'] == '20140126200624'
        assert cdx['url'] == 'http://www.iana.org/'
        #assert fh.readline().startswith('org,iana)/ 20140126200624 {"url": "http://www.iana.org/",')

    # Nothing else to migrate
    main(['cdx-convert', migrate_dir])

def __parsed(self):
    return CDXObject(self.raw)

def load_orig_cdx(self):
    return [CDXObject(BAD_ORIG_CDX),
            CDXObject(URL_AGNOSTIC_ORIG_CDX)]

def load_orig_bad_cdx(self):
    return [CDXObject(BAD_ORIG_CDX),
            CDXObject(BAD_ORIG_CDX)]

def test_invalid_cdx_format():
    with raises(CDXException):
        x = CDXObject(b'a b c')

def test_empty_cdxobject():
    x = CDXObject(b'')
    assert len(x) == 0

def load_orig_cdx(_):
    return [
        CDXObject(BAD_ORIG_CDX),
        CDXObject(URL_AGNOSTIC_ORIG_CDX.encode('utf-8'))
    ]

def process_record(self, record, flow):
    headers = flow.response.headers
    url = flow.request.req_url
    scheme = flow.request.req_scheme

    if not self.content_rewriter:
        return record.http_headers, StreamIO(record.raw_stream)

    cookie_rewriter = None

    template_params = flow.extra_data

    environ = {'pywb_proxy_magic': self.proxy_magic,
               'webrec.template_params': template_params}

    wb_url = WbUrl(url)
    wb_prefix = ''
    host_prefix = flow.request.req_scheme + '://' + self.proxy_magic

    urlrewriter = SchemeOnlyUrlRewriter(wb_url, '')

    if flow.request.headers.get('X-Requested-With', '').lower() == 'xmlhttprequest':
        urlrewriter.rewrite_opts['is_ajax'] = True

    head_insert_func = (self.head_insert_view.create_insert_func(
        wb_url, wb_prefix, host_prefix, url, environ, False))

    urlkey = canonicalize(wb_url.url)

    cdx = CDXObject()
    cdx['urlkey'] = urlkey
    cdx['timestamp'] = http_date_to_timestamp(headers.get('Memento-Datetime'))
    cdx['url'] = wb_url.url

    if headers.get('Webagg-Source-Coll') == 'live':
        cdx['is_live'] = 'true'

    result = self.content_rewriter.rewrite_content(urlrewriter,
                                                   record.http_headers,
                                                   record.raw_stream,
                                                   head_insert_func,
                                                   urlkey,
                                                   cdx,
                                                   cookie_rewriter,
                                                   environ)

    status_headers, gen, is_rw = result

    status_headers.remove_header('Content-Security-Policy')

    # check for content-length
    res = status_headers.get_header('content-length')
    try:
        if int(res) > 0:
            return status_headers, IterIdent(gen)
    except:
        pass

    # need to either chunk or buffer to get content-length
    if flow.request.http_version == 'HTTP/1.1':
        status_headers.remove_header('content-length')
        status_headers.headers.append(('Transfer-Encoding', 'chunked'))
        #gen = chunk_encode_iter(gen)
    else:
        gen = buffer_iter(status_headers, gen)

    return status_headers, IterIdent(gen)

def _make_line(fields):
    line = ' '.join(['-'] * fields)

    x = CDXObject(line.encode('utf-8'))
    assert len(x) == fields
    assert str(x) == line

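# A minimal, self-contained round trip based on the behavior exercised in the
# tests above. The import path matches older pywb releases; newer versions
# expose the same class as pywb.warcserver.index.cdxobject.CDXObject.
from pywb.cdx.cdxobject import CDXObject

line = b'com,example)/ 20140126200624 {"url": "http://example.com/"}'
cdx = CDXObject(line)

assert cdx['urlkey'] == 'com,example)/'
assert cdx['timestamp'] == '20140126200624'
assert cdx['url'] == 'http://example.com/'

# Serializing back yields the newline-terminated CDXJ line.
assert cdx.to_cdxj() == 'com,example)/ 20140126200624 {"url": "http://example.com/"}\n'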