def index_warcs(warcs, cdx, base_cdx=None):
    """Create a sorted CDX file for a series of WARC files.

    Each entry yielded by ``cdxindexer.create_index_iter`` for the files found
    under ``warcs`` is written to ``cdx``. When an entry is a ``warc/revisit``
    and ``base_cdx`` is given, ``base_cdx`` is searched for lines with the same
    key whose fourth field is not ``warc/revisit`` and whose sixth field equals
    the entry's digest; the last such line is copied into the output just
    before the revisit entry. Finally the output file is re-read, its lines
    are stripped and sorted with ``locale.strcoll``, and the file is rewritten.

    Args:
        warcs: File or directory handed to ``cdxindexer.iter_file_or_dir``.
        cdx: Path of the CDX file to create; it is overwritten.
        base_cdx: Optional path of an existing CDX file used to look up the
            original record for each revisit entry.
    """
    # Local import: the file's module-level imports are not visible here.
    import functools

    # NOTE(review): writing native strings to a file opened in "wb" mode
    # assumes Python 2 str semantics -- confirm before running on Python 3.
    with open(cdx, "wb") as o:
        with cdxindexer.CDXWriter(o, False) as writer:
            for fullpath, filename in cdxindexer.iter_file_or_dir(warcs):
                with open(fullpath, "rb") as infile:
                    entry_iter = cdxindexer.create_index_iter(
                        infile, include_all=False, surt_ordered=False)
                    for entry in entry_iter:
                        if (entry.mime == "warc/revisit"
                                and base_cdx is not None):
                            original = None
                            with open(base_cdx, "rb") as c:
                                for m in binsearch.iter_exact(c, entry.key):
                                    # Split once per candidate line.
                                    fields = m.split()
                                    if (fields[3] != "warc/revisit"
                                            and fields[5] == entry.digest):
                                        # Only the last match is used, so
                                        # keep overwriting instead of
                                        # collecting every match.
                                        original = m
                            if original is not None:
                                o.write("%s\n" % original)
                        writer.write(entry, fullpath)

    with open(cdx, "rb") as i:
        lines = [l.strip() for l in i]
    # ``sort(cmp=...)`` only exists on Python 2; ``cmp_to_key`` yields the
    # same ordering and is available on both 2.7 and 3.x.
    lines.sort(key=functools.cmp_to_key(locale.strcoll))
    with open(cdx, "wb") as o:
        o.write("\n".join(lines))
def __call__(self, filename, cdx):
    """Yield the paths recorded for ``filename`` in the path index file.

    ``self.pathindex_file`` is opened in binary mode and searched with
    ``iter_exact`` using the UTF-8 encoded ``filename`` and a tab separator.
    For every line returned, each tab-separated field after the first is
    converted with ``to_native_str`` and yielded.

    ``cdx`` is accepted but not used by this method.
    """
    with open(self.pathindex_file, 'rb') as index_file:
        lookup_key = filename.encode('utf-8')
        for matched_line in iter_exact(index_file, lookup_key, b'\t'):
            columns = matched_line.split(b'\t')
            # The first column is skipped (presumably the filename key).
            for raw_path in columns[1:]:
                yield to_native_str(raw_path, 'utf-8')
def __call__(self, filename, cdx=None):
    """Generate every path listed for ``filename`` in the path index.

    Opens ``self.pathindex_file`` as bytes, runs ``iter_exact`` over it with
    the UTF-8 encoded ``filename`` and a tab separator, and yields each field
    after the first of every returned line, converted by ``to_native_str``.

    The optional ``cdx`` argument is not used here.
    """
    tab = b'\t'
    with open(self.pathindex_file, 'rb') as handle:
        hits = iter_exact(handle, filename.encode('utf-8'), tab)
        for hit in hits:
            # Drop the leading field; the remaining fields are yielded.
            for encoded_path in hit.split(tab)[1:]:
                yield to_native_str(encoded_path, 'utf-8')
def __call__(self, filename):
    """Yield each path stored for ``filename`` in the path index file.

    ``self.pathindex_file`` is opened in binary mode and searched with
    ``iter_exact`` using ``filename`` and a tab separator. Every field after
    the first on each returned line is yielded unchanged.
    """
    separator = '\t'
    with open(self.pathindex_file, 'rb') as index_file:
        for index_line in iter_exact(index_file, filename, separator):
            fields = index_line.split(separator)
            # Skip the first field; the rest of the line holds the paths.
            for location in fields[1:]:
                yield location
def __call__(self, filename):
    """Return a generator over the paths listed for ``filename``.

    ``iter_exact`` is called immediately on ``self.reader`` with ``filename``
    and a tab separator; the returned generator then lazily yields every
    field after the first of each line that search produces.
    """
    matches = iter_exact(self.reader, filename, '\t')

    def _paths():
        # Runs only when the caller starts iterating.
        for line in matches:
            for candidate in line.split('\t')[1:]:
                yield candidate

    return _paths()
def index_warcs(warcs, cdx, base_cdx=None):
    """Create a sorted CDX file for a series of WARC files.

    Each entry yielded by ``cdxindexer.create_index_iter`` for the files found
    under ``warcs`` is written to ``cdx``. When an entry is a ``warc/revisit``
    and ``base_cdx`` is given, ``base_cdx`` is searched for lines with the same
    key whose fourth field is not ``warc/revisit`` and whose sixth field equals
    the entry's digest; the last such line is copied into the output just
    before the revisit entry. Finally the output file is re-read, its lines
    are stripped and sorted with ``locale.strcoll``, and the file is rewritten.

    Args:
        warcs: File or directory handed to ``cdxindexer.iter_file_or_dir``.
        cdx: Path of the CDX file to create; it is overwritten.
        base_cdx: Optional path of an existing CDX file used to look up the
            original record for each revisit entry.
    """
    # Local import: the file's module-level imports are not visible here.
    import functools

    # NOTE(review): writing native strings to a file opened in "wb" mode
    # assumes Python 2 str semantics -- confirm before running on Python 3.
    with open(cdx, "wb") as o:
        with cdxindexer.CDXWriter(o, False) as writer:
            for fullpath, filename in cdxindexer.iter_file_or_dir(warcs):
                with open(fullpath, "rb") as infile:
                    entry_iter = cdxindexer.create_index_iter(
                        infile, include_all=False, surt_ordered=False)
                    for entry in entry_iter:
                        if (entry.mime == "warc/revisit"
                                and base_cdx is not None):
                            original = None
                            with open(base_cdx, "rb") as c:
                                for m in binsearch.iter_exact(c, entry.key):
                                    # Split once per candidate line.
                                    fields = m.split()
                                    if (fields[3] != "warc/revisit"
                                            and fields[5] == entry.digest):
                                        # Only the last match is used, so
                                        # keep overwriting instead of
                                        # collecting every match.
                                        original = m
                            if original is not None:
                                o.write("%s\n" % original)
                        writer.write(entry, fullpath)

    with open(cdx, "rb") as i:
        lines = [l.strip() for l in i]
    # ``sort(cmp=...)`` only exists on Python 2; ``cmp_to_key`` yields the
    # same ordering and is available on both 2.7 and 3.x.
    lines.sort(key=functools.cmp_to_key(locale.strcoll))
    with open(cdx, "wb") as o:
        o.write("\n".join(lines))
def getCDXLine(surtURI):
    """Return the first line ``iter_exact`` yields for ``surtURI``.

    ``INDEX_FILE`` is opened for reading and searched with ``iter_exact``;
    the first result is fetched while the file is still open.

    Args:
        surtURI: Key passed to ``iter_exact``.

    Returns:
        The first line produced by the search.

    Raises:
        StopIteration: If the search produces no lines.
    """
    with open(INDEX_FILE, 'r') as cdxFile:
        bsResp = iter_exact(cdxFile, surtURI)
        # Builtin next() works on Python 2.6+ and 3.x; the ``.next()`` method
        # exists only on Python 2 iterators.
        cdxLine = next(bsResp)
    return cdxLine