def cdx_index(self, z_key, stream, filename):
    """Build a cdxj index for *stream* and load each line into the redis
    sorted set *z_key* on the destination redis.

    Returns (min, max) capture timestamps converted to seconds, or
    (None, None) when nothing was indexed (e.g. dry-run mode).
    """
    buff = BytesIO()
    write_cdx_index(buff, stream, filename, cdxj=True, append_post=True)

    lines = buff.getvalue().rstrip().split(b'\n')

    total = 0
    earliest = latest = None

    for line in lines:
        if line and not self.dry:
            self.dst_redis.zadd(z_key, 0, line)

            ts = CDXObject(line)['timestamp']
            # track the timestamp range across all indexed records
            earliest = min(earliest, ts) if earliest else ts
            latest = max(latest, ts) if latest else ts
            total += 1

    if total:
        earliest = timestamp_to_sec(earliest)
        latest = timestamp_to_sec(latest)

    logging.info(' CDXJ: {0} {1} {2}'.format(total, earliest, latest))
    return earliest, latest
def cdx_index(warc, **options):
    """Index the named WARC from the test directory and return the raw
    cdx output as bytes. *options* are forwarded to write_cdx_index."""
    out = BytesIO()
    with open(TEST_WARC_DIR + warc, 'rb') as warc_fh:
        write_cdx_index(out, warc_fh, warc, **options)
    return out.getvalue()
def _load_and_index(self, warc_path):
    """Fetch a WARC from the source bucket, index it to a gzipped cdx in
    a temp file, and upload that cdx to the destination bucket.

    Honors --skip-existing by returning early when the cdx key already
    exists.
    """
    warckey = self.warc_bucket.get_key(warc_path)
    cdx_path = self._conv_warc_to_cdx_path(warc_path)

    if self.options.skip_existing:
        if self.cdx_bucket.get_key(cdx_path):
            sys.stderr.write('Already Exists\n')
            return

    with TemporaryFile(mode='w+b') as warctemp:
        # pull the whole WARC down locally before indexing
        shutil.copyfileobj(warckey, warctemp)
        warctemp.seek(0)

        with TemporaryFile(mode='w+b') as cdxtemp:
            # Index to temp; GzipFile must be closed before upload so the
            # gzip trailer is written
            with GzipFile(fileobj=cdxtemp, mode='w+b') as cdxfile:
                write_cdx_index(cdxfile, warctemp, warc_path, **self.index_options)

            # Upload temp
            cdxkey = self.cdx_bucket.new_key(cdx_path)
            cdxtemp.flush()
            cdxkey.set_contents_from_file(cdxtemp, rewind=True)
def test_anon_download_coll(self):
    """Download the anonymous temp collection as a WARC and verify the
    downloaded archive indexes to the expected cdxj records."""
    res = self._get_anon('/temp/$download')

    assert res.headers['Content-Disposition'].startswith("attachment; filename*=UTF-8''temp-")

    warcin = self._get_dechunked(res.body)

    cdxout = BytesIO()
    write_cdx_index(cdxout, warcin, 'temp.warc.gz', include_all=True, cdxj=True)

    #print(cdxout.getvalue().decode('utf-8'))

    cdx = [CDXObject(cdx) for cdx in cdxout.getvalue().rstrip().split(b'\n')]

    assert len(cdx) == 6

    # BUG FIX: these were assignments (cdx[0]['url'] = ...) which silently
    # overwrote the parsed fields instead of verifying them -- changed to
    # the assertions clearly intended.

    # response
    assert cdx[0]['url'] == 'http://httpbin.org/get?food=bar'
    assert cdx[0]['mime'] == 'application/json'

    # request
    assert cdx[1]['url'] == 'http://httpbin.org/get?food=bar'
    assert cdx[1]['mime'] == '-'

    # response
    assert cdx[2]['url'] == 'http://httpbin.org/get?bood=far'
    assert cdx[2]['mime'] == 'application/json'

    # request
    assert cdx[3]['url'] == 'http://httpbin.org/get?bood=far'
    assert cdx[3]['mime'] == '-'
def create_all_from_link(self, link):
    """Index the link's stored WARC and persist one CDXLine per cdx line."""
    warc_path = link.warc_storage_file()
    results = []
    with default_storage.open(warc_path, 'rb') as warc_file, io.BytesIO() as cdx_io:
        write_cdx_index(cdx_io, warc_file, warc_path)
        cdx_io.seek(0)
        next(cdx_io)  # first line is a header so skip it
        for raw_line in cdx_io:
            obj, _ = CDXLine.objects.get_or_create(link=link, raw=raw_line)
            results.append(obj)
    return results
def load_cdx(self, query):
    """
    This function accepts a standard CDX request, except with a GUID instead of date,
    and returns a standard CDX 11 response.
    """
    guid = query.params['guid']
    url = query.url

    # We'll first check the key-value store to see if we cached the lookup for this guid on a previous request.
    # This will be common, since each playback triggers lots of requests for the same .warc file.
    cache_key = guid + '-surts'
    url_key = guid+'-url'
    surt_lookup = django_cache.get(cache_key)
    url = url or django_cache.get(url_key)
    if surt_lookup and url:
        # cache hit: surt_lookup was stored as JSON (surt key -> list of cdx lines)
        surt_lookup = json.loads(surt_lookup)
    else:
        # nothing in cache; find requested link in database
        try:
            link = Link.objects.select_related().get(pk=guid)
        except Link.DoesNotExist:
            # unknown guid -> empty CDX response
            return []

        # cache url, which may be blank if this is the first request
        if not url:
            url = link.submitted_url
            django_cache.set(url_key, url, timeout=60*60)

        # get warc file -- use the first asset whose capture looks like a .warc
        for asset in link.assets.all():
            if '.warc' in asset.warc_capture:
                warc_path = os.path.join(asset.base_storage_path, asset.warc_capture)
                break
        else:
            return []  # no .warc file -- do something to handle this?

        # now we have to get an index of all the URLs in this .warc file
        # first try fetching it from a .cdx file on disk
        # (derives e.g. foo.warc.gz -> foo.cdx)
        cdx_path = warc_path.replace('.gz', '').replace('.warc', '.cdx')
        if not default_storage.exists(cdx_path):
            # there isn't a .cdx file on disk either -- let's create it
            with default_storage.open(warc_path, 'rb') as warc_file, default_storage.open(cdx_path, 'wb') as cdx_file:
                write_cdx_index(cdx_file, warc_file, warc_path, sort=True)

        # now load the URL index from disk and stick it in the cache
        # NOTE(review): groupby assumes the cdx file is sorted by surt key,
        # which sort=True above provides
        cdx_lines = (line.strip() for line in default_storage.open(cdx_path, 'rb'))
        surt_lookup = dict((key, list(val)) for key, val in groupby(cdx_lines, key=lambda line: line.split(' ', 1)[0]))
        django_cache.set(cache_key, json.dumps(surt_lookup), timeout=60*60)

    # find cdx lines for url (canonicalized via surt)
    sorted_url = surt(url)
    if sorted_url in surt_lookup:
        return (str(i) for i in surt_lookup[sorted_url])

    # didn't find requested url in this archive
    return []
def test_cdxj_empty():
    """Indexing an empty stream in cdxj mode must produce no output."""
    out = BytesIO()
    empty_stream = BytesIO()
    write_cdx_index(out, empty_stream, 'empty.warc.gz', cdxj=True)
    assert out.getvalue() == b''
def create_all_from_asset(self, asset):
    """Index the asset's WARC capture and persist one CDXLine per cdx line."""
    warc_path = os.path.join(asset.base_storage_path, asset.warc_capture)
    with default_storage.open(warc_path, 'rb') as warc_file, io.BytesIO() as cdx_io:
        write_cdx_index(cdx_io, warc_file, warc_path)
        cdx_io.seek(0)
        next(cdx_io)  # first line is a header so skip it
        results = [CDXLine.objects.get_or_create(asset=asset, raw=line)[0]
                   for line in cdx_io]
    return results
def create_all_from_link(self, link):
    """Index the link's stored WARC and persist one CDXLine per cdx line."""
    warc_path = link.warc_storage_file()
    created = []
    with default_storage.open(warc_path, 'rb') as warc_file, io.BytesIO() as cdx_io:
        write_cdx_index(cdx_io, warc_file, warc_path)
        cdx_io.seek(0)
        next(cdx_io)  # first line is a header so skip it
        for line in cdx_io:
            created.append(CDXLine.objects.get_or_create(link=link, raw=line)[0])
    return created
def create_all_from_link(self, link):
    """Index the link's stored WARC, persisting one CDXLine per cdx line
    and copying the link's visibility flags onto each saved line."""
    warc_path = link.warc_storage_file()
    saved = []
    with default_storage.open(warc_path, 'rb') as warc_file, io.BytesIO() as cdx_io:
        write_cdx_index(cdx_io, warc_file, warc_path)
        cdx_io.seek(0)
        next(cdx_io)  # first line is a header so skip it
        for raw in cdx_io:
            line_obj = CDXLine.objects.get_or_create(raw=raw, link_id=link.guid)[0]
            # mirror the link's visibility onto the cdx line record
            line_obj.is_unlisted = link.is_unlisted
            line_obj.is_private = link.is_private
            line_obj.save()
            saved.append(line_obj)
    return saved
def add_record(self, stream, name=None):
    """Index a single WARC record *stream* as cdxj and add each line to
    the redis sorted set at self.key.

    :param stream: seekable file-like object containing the record;
        if *name* is not given, stream.name is used as the filename.
    :returns: the last cdx line added, or None if nothing was indexed.
    """
    stream.seek(0)

    if not name:
        name = stream.name

    cdxout = BytesIO()
    write_cdx_index(cdxout, stream, name, cdxj=True, append_post=True)

    # BUG FIX: getvalue() returns bytes, so split on b'\n' -- splitting
    # bytes with a str separator raises TypeError on Python 3 (the
    # sibling indexers here already use b'\n').
    cdx = None
    for line in cdxout.getvalue().split(b'\n'):
        if line:
            self.redis.zadd(self.key, 0, line)
            cdx = line

    # BUG FIX: previously the loop variable could be unbound (empty
    # index) or hold a trailing empty chunk; return the last real line.
    return cdx
def cdx(request): """ This function handles WARC lookups by our warc server (running in warc_server). It accepts a standard CDX request, except with a GUID instead of date, and returns a standard CDX 11 response. If there's no warc for the requested GUID, or the requested URL isn't stored in that WARC, it returns a 404. """ # find requested link and url try: link = Link.objects.select_related().get(pk=request.GET.get('guid')) except Link.DoesNotExist: print "COULDN'T FIND LINK" raise Http404 url = request.GET.get('url', link.submitted_url) # get warc file for asset in link.assets.all(): if '.warc' in asset.warc_capture: warc_path = os.path.join(asset.base_storage_path, asset.warc_capture) break else: print "COULDN'T FIND WARC" raise Http404 # no .warc file -- do something to handle this? # get cdx file cdx_path = warc_path.replace('.gz', '').replace('.warc', '.cdx') if not default_storage.exists(cdx_path): # if we can't find the CDX file associated with this WARC, create it with default_storage.open(warc_path, 'rb') as warc_file, default_storage.open(cdx_path, 'wb') as cdx_file: write_cdx_index(cdx_file, warc_file, warc_path, sort=True) cdx_lines = default_storage.open(cdx_path, 'rb') # find cdx lines for url sorted_url = surt.surt(url) out = "" for line in cdx_lines: if line.startswith(sorted_url+" "): out += line elif out: # file may contain multiple matching lines in a row; we want to return all of them # if we've already found one or more matching lines, and now they're no longer matching, we're done break if out: return HttpResponse(out, content_type="text/plain") print "COULDN'T FIND URL" raise Http404 # didn't find requested url in .cdx file
def add_urls_to_index(self, stream, params, filename, length):
    """Build a cdxj index for the recorded *stream* and zadd every line
    to the redis key resolved from the key template.

    Returns the list of raw cdx lines (possibly containing one empty
    entry when the index was empty).
    """
    base_dir = res_template(self.rel_path_template, params)
    filename = os.path.relpath(filename, base_dir)

    index_buff = BytesIO()
    write_cdx_index(index_buff, stream, filename, cdxj=True, append_post=True)

    redis_key = res_template(self.redis_key_template, params)

    lines = index_buff.getvalue().rstrip().split(b'\n')
    for entry in lines:
        if entry:
            self.redis.zadd(redis_key, 0, entry)

    return lines
def test_record_multiple_writes_keep_open(self):
    """Record two requests into a single kept-open WARC, check the redis
    cdxj entries match a locally built index, then close the file handle
    and verify a third request starts a second WARC."""
    warc_path = to_path(self.root_dir + '/warcs/FOO/ABC-{hostname}-{timestamp}.warc.gz')

    rel_path = self.root_dir + '/warcs/'

    dedup_index = self._get_dedup_index(user=False)

    writer = MultiFileWARCWriter(warc_path, dedup_index=dedup_index)
    recorder_app = RecorderApp(self.upstream_url, writer)

    # BUG FIX: the query suffix had been corrupted by HTML-entity
    # mojibake ('&para' rendered as '¶') -- restored to
    # '&param.recorder.coll=FOO' so the recorder receives its coll param.

    # First Record
    resp = self._test_warc_write(recorder_app, 'httpbin.org',
                                 '/get?foo=bar', '&param.recorder.coll=FOO')

    assert b'HTTP/1.1 200 OK' in resp.body
    assert b'"foo": "bar"' in resp.body

    # Second Record
    resp = self._test_warc_write(recorder_app, 'httpbin.org',
                                 '/get?boo=far', '&param.recorder.coll=FOO')

    assert b'HTTP/1.1 200 OK' in resp.body
    assert b'"boo": "far"' in resp.body

    self._test_all_warcs('/warcs/FOO/', 1)

    # Check two records in WARC
    r = FakeStrictRedis.from_url('redis://localhost/2')
    res = r.zrangebylex('FOO:cdxj', '[org,httpbin)/', '(org,httpbin,')
    assert len(res) == 2

    files, coll_dir = self._test_all_warcs('/warcs/FOO/', 1)
    fullname = coll_dir + files[0]

    # rebuild the index locally and compare against what redis holds
    cdxout = BytesIO()
    with open(fullname, 'rb') as fh:
        filename = os.path.relpath(fullname, rel_path)
        write_cdx_index(cdxout, fh, filename,
                        cdxj=True, append_post=True, sort=True)

    res = [CDXObject(x) for x in res]

    cdxres = cdxout.getvalue().strip()
    cdxres = cdxres.split(b'\n')
    cdxres = [CDXObject(x) for x in cdxres]

    assert cdxres == res

    # file handle kept open between the two writes
    assert len(writer.fh_cache) == 1

    writer.close_key(self.root_dir + '/warcs/FOO/')

    assert len(writer.fh_cache) == 0

    writer.close()

    # a write after close should open a new WARC file
    resp = self._test_warc_write(recorder_app, 'httpbin.org',
                                 '/get?boo=far', '&param.recorder.coll=FOO')

    self._test_all_warcs('/warcs/FOO/', 2)

    warcs = r.hgetall('FOO:warc')
    assert len(warcs) == 2