def list_warcs(self, user, coll):
    """Collect WARC file info for a user's collection.

    Returns a (total_size, warc_info_list) tuple on success; each info
    dict carries 'size', 'mtime' and 'name' keys.

    NOTE(review): when the permission check fails this returns a bare []
    rather than a (size, list) tuple -- callers that unpack the result
    must handle both shapes; confirm this asymmetry is intended.
    """
    if not self.can_admin_coll(user, coll):
        return []

    archive_dir = self.path_router.get_archive_dir(user, coll)

    warcs = {}
    total_size = 0

    # WARCs currently present on disk under the collection's archive dir.
    if os.path.isdir(archive_dir):
        for fullpath, filename in iter_file_or_dir([archive_dir]):
            stats = os.stat(fullpath)
            size = long(stats.st_size)
            res = {'size': size,
                   'mtime': long(stats.st_mtime),
                   'name': filename}
            warcs[filename] = res
            total_size += size

    # Completed WARCs tracked in redis as JSON blobs. An entry with the
    # same filename replaces the on-disk dict, but its size is still
    # added to total_size -- NOTE(review): a file present both on disk
    # and in redis is double-counted; confirm that is intended.
    donewarcs = self.redis.smembers(self.make_key(user, coll, self.DONE_WARC_KEY))
    for stats in donewarcs:
        res = json.loads(stats)
        filename = res['name']
        warcs[filename] = res
        total_size += long(res['size'])

    return total_size, warcs.values()
def list_warcs(self, user, coll):
    """Return (total_size, warc_infos) for a collection's WARC files,
    combining files found on disk with completed entries stored in
    redis. Returns [] when the user may not administer the collection.
    """
    if not self.can_admin_coll(user, coll):
        return []

    archive_dir = self.path_router.get_archive_dir(user, coll)

    warcs = {}
    total_size = 0

    # Pass 1: WARC files physically present in the archive directory.
    if os.path.isdir(archive_dir):
        for fullpath, filename in iter_file_or_dir([archive_dir]):
            file_stats = os.stat(fullpath)
            file_size = long(file_stats.st_size)
            warcs[filename] = {'size': file_size,
                               'mtime': long(file_stats.st_mtime),
                               'name': filename}
            total_size += file_size

    # Pass 2: completed WARCs recorded in redis (keyed by filename, so a
    # redis entry replaces a disk entry with the same name).
    done_key = self.make_key(user, coll, self.DONE_WARC_KEY)
    for raw_entry in self.redis.smembers(done_key):
        info = json.loads(raw_entry)
        warcs[info['name']] = info
        total_size += long(info['size'])

    return total_size, warcs.values()
def update_cdx(self, output_cdx, inputs): """ Output sorted, post-query resolving cdx from 'input_' warc(s) to 'output_cdx'. Write cdx to temp and rename to output_cdx when completed to ensure atomic updates of the cdx. """ writer_cls = PageDetectSortedWriter options = dict(sort=True, surt_ordered=True, append_post=True, include_all=True) try: with open(output_cdx.name, 'wb') as outfile: with writer_cls(outfile) as writer: for fullpath, filename in iter_file_or_dir(inputs): filename = self.format_filename(filename) with open(fullpath, 'rb') as infile: entry_iter = create_index_iter(infile, **options) for entry in entry_iter: writer.write(entry, filename) output_cdx.flush() except Exception as exc: import traceback err_details = traceback.format_exc(exc) print err_details return None return writer.pages
def index_warcs(warcs, cdx, base_cdx=None):
    """Creates a sorted CDX for a series of WARC files, supplementing
    revisit records with original records.

    For each revisit entry, a matching non-revisit line with the same
    key and digest is looked up in *base_cdx* (binary search over the
    sorted base file) and copied into the output ahead of the revisit
    line. The whole output file is then re-read and rewritten in
    locale-collated sorted order.
    """
    with open(cdx, "wb") as o:
        with cdxindexer.CDXWriter(o, False) as writer:
            for fullpath, filename in cdxindexer.iter_file_or_dir(warcs):
                with open(fullpath, "rb") as infile:
                    entry_iter = cdxindexer.create_index_iter(infile,
                                                              include_all=False,
                                                              surt_ordered=False)
                    for entry in entry_iter:
                        if entry.mime == "warc/revisit":
                            if base_cdx is not None:
                                # NOTE(review): assumes a space-separated CDX
                                # layout where field 3 is the mime type and
                                # field 5 the digest (standard 11-field CDX);
                                # confirm against the base_cdx format.
                                with open(base_cdx, "rb") as c:
                                    matches = [m for m in binsearch.iter_exact(c, entry.key)
                                               if m.split()[3] != "warc/revisit"
                                               and m.split()[5] == entry.digest]
                                    # Original line is written straight to the
                                    # raw handle, bypassing the CDXWriter; the
                                    # final sort pass fixes any interleaving.
                                    if len(matches) > 0:
                                        o.write("%s\n" % matches[-1])
                        writer.write(entry, fullpath)
    # Rewrite the file fully sorted (locale-aware comparison, Python 2 cmp=).
    with open(cdx, "rb") as i:
        lines = [l.strip() for l in i]
    lines.sort(cmp=locale.strcoll)
    with open(cdx, "wb") as o:
        o.write("\n".join(lines))
def generate_path_index(warcs, index):
    """Write a path index for *warcs* to the file *index*: one
    TAB-separated "filename<TAB>fullpath" pair per line, sorted and
    de-duplicated.
    """
    lines = []
    for fullpath, filename in cdxindexer.iter_file_or_dir(warcs):
        lines.append("%s\t%s" % (filename, fullpath))

    # BUG FIX: the original called set(lines) and discarded the result,
    # so duplicate entries were never removed. Deduplicate, then sort.
    lines = sorted(set(lines))

    with open(index, "wb") as path_index:
        path_index.write("\n".join(lines))
def get_filenames(warcs):
    """Builds a URL->filename lookup from WARC files.

    Only successful (HTTP 200), non-revisit records that yield a
    filename via get_filename() contribute an entry.
    """
    lookup = {}
    iter_options = {"include_all": False, "surt_ordered": False}

    for fullpath, _ in cdxindexer.iter_file_or_dir(warcs):
        with open(fullpath, "rb") as warc_file:
            record_iter = DefaultRecordIter(**iter_options)
            for entry in record_iter(warc_file):
                statusline = entry.record.status_headers.statusline
                if not statusline.startswith("200"):
                    continue
                if entry.record.rec_type == "revisit":
                    continue
                pdf = get_filename(entry)
                if pdf is None:
                    continue
                # pop() removes 'url' from the entry while returning it.
                lookup[entry.pop("url")] = pdf

    return lookup
def get_filenames(warcs):
    """Builds a URL->filename lookup from WARC files.

    Scans every WARC in *warcs* and maps each qualifying record's URL to
    the name extracted by get_filename(). Records are skipped unless the
    status line starts with "200" and the record is not a revisit.
    """
    filenames = {}
    options = {"include_all": False, "surt_ordered": False}
    for fullpath, filename in cdxindexer.iter_file_or_dir(warcs):
        with open(fullpath, "rb") as warc:
            iter = DefaultRecordIter(**options)  # NOTE(review): shadows builtin iter()
            for entry in iter(warc):
                # Only successful responses are considered.
                if not entry.record.status_headers.statusline.startswith(
                        "200"):
                    continue
                if entry.record.rec_type == "revisit":
                    continue
                pdf = get_filename(entry)  # presumably a download/attachment name; verify
                if pdf is not None:
                    # pop() removes 'url' from the entry while returning it.
                    key = entry.pop("url")
                    filenames[key] = pdf
    return filenames
def index_warcs(warcs, cdx, base_cdx=None):
    """Creates a sorted CDX for a series of WARC files, supplementing
    revisit records with original records found in *base_cdx*.
    """
    with open(cdx, "wb") as out:
        with cdxindexer.CDXWriter(out, False) as cdx_writer:
            for warc_path, _ in cdxindexer.iter_file_or_dir(warcs):
                with open(warc_path, "rb") as warc_file:
                    for record in cdxindexer.create_index_iter(warc_file,
                                                               include_all=False,
                                                               surt_ordered=False):
                        if record.mime == "warc/revisit" and base_cdx is not None:
                            # Binary-search the base cdx for a non-revisit
                            # line with the same key and digest.
                            with open(base_cdx, "rb") as base:
                                originals = [line for line in binsearch.iter_exact(base, record.key)
                                             if line.split()[3] != "warc/revisit"
                                             and line.split()[5] == record.digest]
                            if originals:
                                out.write("%s\n" % originals[-1])
                        cdx_writer.write(record, warc_path)

    # The injected original lines landed out of order; re-read the file
    # and rewrite it fully sorted (locale-aware comparison).
    with open(cdx, "rb") as src:
        cdx_lines = [line.strip() for line in src]
    cdx_lines.sort(cmp=locale.strcoll)
    with open(cdx, "wb") as out:
        out.write("\n".join(cdx_lines))