Beispiel #1
0
def _find_original_line(base_cdx, entry):
    """Return the last non-revisit line of ``base_cdx`` matching ``entry``.

    Lines are obtained from ``binsearch.iter_exact`` for ``entry.key``.  A
    line qualifies when its fourth whitespace-separated field is not
    "warc/revisit" and its sixth field equals ``entry.digest``.  Returns
    ``None`` when no line qualifies.
    """
    original = None
    with open(base_cdx, "rb") as base:
        for line in binsearch.iter_exact(base, entry.key):
            # Split once and reuse the fields for both checks.
            fields = line.split()
            if fields[3] != "warc/revisit" and fields[5] == entry.digest:
                # Keep overwriting so the last qualifying line wins.
                original = line
    return original


def index_warcs(warcs, cdx, base_cdx=None):
    """Create a sorted CDX file for a series of WARC files.

    Every entry produced by ``cdxindexer.create_index_iter`` for the files
    found via ``cdxindexer.iter_file_or_dir(warcs)`` is written to ``cdx``.
    When ``base_cdx`` is given, each "warc/revisit" entry is supplemented:
    the last non-revisit line of ``base_cdx`` with the same key and digest
    (if any) is written to ``cdx`` just before the revisit entry itself.
    Afterwards the file is re-read, every line is stripped, the lines are
    sorted with ``locale.strcoll`` and written back joined by newlines
    (no trailing newline is emitted).

    Args:
        warcs: Passed to ``cdxindexer.iter_file_or_dir`` (presumably a WARC
            file or a directory of WARC files -- confirm against cdxindexer).
        cdx: Path of the CDX file to create; an existing file is overwritten.
        base_cdx: Optional path of an existing CDX file that is searched for
            the original records of revisit entries.
    """
    with open(cdx, "wb") as o:
        with cdxindexer.CDXWriter(o, False) as writer:
            for fullpath, _ in cdxindexer.iter_file_or_dir(warcs):
                with open(fullpath, "rb") as infile:
                    entry_iter = cdxindexer.create_index_iter(
                        infile, include_all=False, surt_ordered=False)
                    for entry in entry_iter:
                        if entry.mime == "warc/revisit" and base_cdx is not None:
                            original = _find_original_line(base_cdx, entry)
                            if original is not None:
                                o.write("%s\n" % original)
                        writer.write(entry, fullpath)

    # Re-read everything so original and revisit lines end up in one order.
    # NOTE(review): strip() also removes leading whitespace; if the writer
    # emits a header line that starts with a space it is altered here --
    # confirm this is intended.
    with open(cdx, "rb") as i:
        lines = [line.strip() for line in i]
    # NOTE: the cmp= keyword exists only on Python 2; this function as a
    # whole targets Python 2 (it writes str objects to binary-mode files).
    lines.sort(cmp=locale.strcoll)
    with open(cdx, "wb") as o:
        o.write("\n".join(lines))
Beispiel #2
0
    def __call__(self, filename, cdx):
        """Yield the entries listed for ``filename`` in the path index file.

        The file at ``self.pathindex_file`` is opened in binary mode and
        handed to ``iter_exact`` together with the UTF-8 encoded ``filename``
        and a tab byte.  Every line it returns is split on tabs; the first
        field is skipped and each remaining field is yielded after
        conversion with ``to_native_str``.  ``cdx`` is accepted but unused.
        """
        with open(self.pathindex_file, 'rb') as index_file:
            key = filename.encode('utf-8')
            for line in iter_exact(index_file, key, b'\t'):
                # Field 0 is dropped; only the fields after it are emitted.
                for raw_path in line.split(b'\t')[1:]:
                    yield to_native_str(raw_path, 'utf-8')
Beispiel #3
0
    def __call__(self, filename, cdx=None):
        """Generate the path fields stored for ``filename`` in the path index.

        Opens ``self.pathindex_file`` in binary mode, asks ``iter_exact`` for
        the lines belonging to the UTF-8 encoded ``filename`` (with a tab
        byte as third argument) and, for each line, yields every
        tab-separated field after the first one, converted with
        ``to_native_str``.  The optional ``cdx`` argument is not used here.
        """
        with open(self.pathindex_file, 'rb') as fh:
            matching_lines = iter_exact(fh, filename.encode('utf-8'), b'\t')
            for entry_line in matching_lines:
                columns = entry_line.split(b'\t')
                # columns[0] is skipped; the rest are yielded one by one.
                for column in columns[1:]:
                    yield to_native_str(column, 'utf-8')
Beispiel #4
0
    def __call__(self, filename):
        """Yield each path field recorded for ``filename`` in the path index.

        ``self.pathindex_file`` is opened in binary mode and passed to
        ``iter_exact`` along with ``filename`` and a tab character.  Each
        returned line is split on tabs and all fields except the first are
        yielded unchanged.
        """
        with open(self.pathindex_file, 'rb') as index_file:
            for line in iter_exact(index_file, filename, '\t'):
                # Skip the leading field and emit the remaining ones.
                for location in line.split('\t')[1:]:
                    yield location
Beispiel #5
0
    def __call__(self, filename):
        """Return a generator over the path fields found for ``filename``.

        ``iter_exact`` is called immediately with ``self.reader``,
        ``filename`` and a tab character; the returned generator then lazily
        splits each resulting line on tabs and yields every field after the
        first.
        """
        def _paths(lines):
            for line in lines:
                fields = line.split('\t')
                # The first field is skipped; the rest are emitted.
                for path in fields[1:]:
                    yield path

        # The lookup itself runs now; only the splitting is deferred.
        return _paths(iter_exact(self.reader, filename, '\t'))
Beispiel #6
0
    def __call__(self, filename):
        """Look up ``filename`` via ``iter_exact`` and return its path fields.

        ``iter_exact`` is invoked right away on ``self.reader`` with
        ``filename`` and a tab character.  The return value is a lazy
        generator that, for every line produced, yields each tab-separated
        field except the first.
        """
        matches = iter_exact(self.reader, filename, '\t')
        # For each matching line, drop the first field and emit the others.
        return (
            path
            for pathline in matches
            for path in pathline.split('\t')[1:]
        )
Beispiel #7
0
def index_warcs(warcs, cdx, base_cdx=None):
    """Create a sorted CDX file for a series of WARC files.

    All entries that ``cdxindexer.create_index_iter`` produces for the files
    returned by ``cdxindexer.iter_file_or_dir(warcs)`` are written to
    ``cdx``.  If ``base_cdx`` is supplied, every "warc/revisit" entry is
    supplemented with its original record: ``base_cdx`` is searched with
    ``binsearch.iter_exact`` for ``entry.key`` and the last line whose fourth
    field is not "warc/revisit" and whose sixth field equals
    ``entry.digest`` is written ahead of the revisit entry.  Finally the
    file is read back, each line is stripped, the lines are sorted with
    ``locale.strcoll`` and rewritten joined by newlines (without a trailing
    newline).

    Args:
        warcs: Passed to ``cdxindexer.iter_file_or_dir`` (presumably a WARC
            file or a directory of WARC files -- confirm against cdxindexer).
        cdx: Path of the CDX file to create; an existing file is overwritten.
        base_cdx: Optional path of an existing CDX file used to resolve the
            original records of revisit entries.
    """
    with open(cdx, "wb") as o:
        with cdxindexer.CDXWriter(o, False) as writer:
            for fullpath, _ in cdxindexer.iter_file_or_dir(warcs):
                with open(fullpath, "rb") as infile:
                    entry_iter = cdxindexer.create_index_iter(
                        infile, include_all=False, surt_ordered=False)
                    for entry in entry_iter:
                        if entry.mime == "warc/revisit" and base_cdx is not None:
                            original = None
                            with open(base_cdx, "rb") as c:
                                for candidate in binsearch.iter_exact(c, entry.key):
                                    # Split once; both checks share the fields.
                                    fields = candidate.split()
                                    if (fields[3] != "warc/revisit"
                                            and fields[5] == entry.digest):
                                        # Later matches replace earlier ones.
                                        original = candidate
                            if original is not None:
                                o.write("%s\n" % original)
                        writer.write(entry, fullpath)

    # Second pass: load, sort and rewrite the whole index.
    # NOTE(review): strip() removes leading as well as trailing whitespace,
    # so a line beginning with a space would be changed -- confirm intended.
    with open(cdx, "rb") as i:
        lines = [line.strip() for line in i]
    # NOTE: sort(cmp=...) is only available on Python 2, which this function
    # targets (it also writes str objects to files opened in binary mode).
    lines.sort(cmp=locale.strcoll)
    with open(cdx, "wb") as o:
        o.write("\n".join(lines))
Beispiel #8
0
def getCDXLine(surtURI):
    """Return the first item ``iter_exact`` yields for ``surtURI``.

    ``INDEX_FILE`` is opened in text mode and searched with ``iter_exact``.

    Args:
        surtURI: Key handed to ``iter_exact`` (presumably a SURT-form URI,
            going by the name -- confirm against callers).

    Returns:
        The first item produced by ``iter_exact`` for ``surtURI``.

    Raises:
        StopIteration: If ``iter_exact`` produces no item.
    """
    with open(INDEX_FILE, 'r') as cdxFile:
        # The builtin next() works on Python 2.6+ and Python 3, whereas the
        # .next() method exists only on Python 2 iterators.
        return next(iter_exact(cdxFile, surtURI))