Ejemplo n.º 1
0
    def list_warcs(self, user, coll):
        """List WARC files belonging to a user's collection.

        Merges WARCs found on disk under the collection's archive dir
        with completed WARCs recorded in the redis DONE_WARC_KEY set;
        a redis entry overrides an on-disk entry with the same filename.

        :param user: collection owner
        :param coll: collection id
        :returns: ``(total_size, warc_info_values)`` on success, or a
                  bare ``[]`` when the user may not admin the collection.
                  NOTE(review): callers that tuple-unpack the result must
                  special-case the ``[]`` return -- confirm call sites.
        """
        if not self.can_admin_coll(user, coll):
            return []

        archive_dir = self.path_router.get_archive_dir(user, coll)

        warcs = {}
        total_size = 0

        # Fix: this line was indented with a TAB while the rest of the
        # method uses spaces (a TabError under Python 3).
        if os.path.isdir(archive_dir):
            for fullpath, filename in iter_file_or_dir([archive_dir]):
                stats = os.stat(fullpath)
                # int() instead of the Python-2-only long(): int is
                # arbitrary precision on py3 and auto-promotes on py2.
                size = int(stats.st_size)
                res = {'size': size,
                       'mtime': int(stats.st_mtime),
                       'name': filename}
                warcs[filename] = res
                total_size += size

        donewarcs = self.redis.smembers(self.make_key(user, coll, self.DONE_WARC_KEY))
        for stats in donewarcs:
            res = json.loads(stats)
            filename = res['name']
            warcs[filename] = res

            total_size += int(res['size'])

        return total_size, warcs.values()
Ejemplo n.º 2
0
    def list_warcs(self, user, coll):
        """List WARC files belonging to a user's collection.

        Merges WARCs found on disk under the collection's archive dir
        with completed WARCs recorded in the redis DONE_WARC_KEY set;
        a redis entry overrides an on-disk entry with the same filename.

        :param user: collection owner
        :param coll: collection id
        :returns: ``(total_size, warc_info_values)`` on success, or a
                  bare ``[]`` when the user may not admin the collection.
                  NOTE(review): callers that tuple-unpack the result must
                  special-case the ``[]`` return -- confirm call sites.
        """
        if not self.can_admin_coll(user, coll):
            return []

        archive_dir = self.path_router.get_archive_dir(user, coll)

        warcs = {}
        total_size = 0

        if os.path.isdir(archive_dir):
            for fullpath, filename in iter_file_or_dir([archive_dir]):
                stats = os.stat(fullpath)
                # int() instead of the Python-2-only long(): int is
                # arbitrary precision on py3 and auto-promotes on py2.
                size = int(stats.st_size)
                res = {
                    'size': size,
                    'mtime': int(stats.st_mtime),
                    'name': filename
                }
                warcs[filename] = res
                total_size += size

        donewarcs = self.redis.smembers(
            self.make_key(user, coll, self.DONE_WARC_KEY))
        for stats in donewarcs:
            res = json.loads(stats)
            filename = res['name']
            warcs[filename] = res

            total_size += int(res['size'])

        return total_size, warcs.values()
Ejemplo n.º 3
0
    def update_cdx(self, output_cdx, inputs):
        """
        Output sorted, post-query resolving cdx from 'input_' warc(s)
        to 'output_cdx'. Write cdx to temp and rename to output_cdx
        when completed to ensure atomic updates of the cdx.

        :param output_cdx: open file-like object with a ``.name``
                           attribute; the cdx is written to that path
                           and the object is flushed on success.
        :param inputs: iterable of warc file or directory paths.
        :returns: the writer's collected ``pages`` on success, or
                  ``None`` if indexing failed for any reason.
        """

        writer_cls = PageDetectSortedWriter
        options = dict(sort=True,
                       surt_ordered=True,
                       append_post=True,
                       include_all=True)

        try:
            with open(output_cdx.name, 'wb') as outfile:
                with writer_cls(outfile) as writer:
                    for fullpath, filename in iter_file_or_dir(inputs):
                        filename = self.format_filename(filename)
                        with open(fullpath, 'rb') as infile:
                            entry_iter = create_index_iter(infile, **options)

                            for entry in entry_iter:
                                writer.write(entry, filename)

            output_cdx.flush()
        except Exception:
            import traceback
            # Fix: format_exc() takes no exception argument -- the
            # original format_exc(exc) passed the exception as the
            # 'limit' parameter (wrong, and a TypeError on py3.5+).
            err_details = traceback.format_exc()
            # print as a function call -- valid on both py2 and py3
            print(err_details)
            return None

        return writer.pages
Ejemplo n.º 4
0
def index_warcs(warcs, cdx, base_cdx=None):
    """Creates a sorted CDX for a series of WARC files,
       supplementing revisit records with original records.

    :param warcs: iterable of warc file or directory paths
    :param cdx: output cdx path (overwritten)
    :param base_cdx: optional path to a sorted cdx used to resolve
                     revisit records to their original captures
    """
    with open(cdx, "wb") as o:
        with cdxindexer.CDXWriter(o, False) as writer:
            for fullpath, filename in cdxindexer.iter_file_or_dir(warcs):
                with open(fullpath, "rb") as infile:
                    entry_iter = cdxindexer.create_index_iter(
                        infile, include_all=False, surt_ordered=False)
                    for entry in entry_iter:
                        if entry.mime == "warc/revisit":
                            if base_cdx is not None:
                                # cdx fields 3 and 5 are mime and digest:
                                # keep only original (non-revisit) lines
                                # whose payload digest matches this one
                                with open(base_cdx, "rb") as c:
                                    matches = [
                                        m for m in binsearch.iter_exact(
                                            c, entry.key)
                                        if m.split()[3] != "warc/revisit"
                                        and m.split()[5] == entry.digest
                                    ]
                                if len(matches) > 0:
                                    o.write("%s\n" % matches[-1])
                        writer.write(entry, fullpath)

    # Re-sort the combined file. Fix: list.sort(cmp=...) was removed in
    # Python 3; key=locale.strxfrm yields the same locale-aware ordering
    # as cmp=locale.strcoll.
    with open(cdx, "rb") as i:
        lines = [l.strip() for l in i]
    lines.sort(key=locale.strxfrm)
    with open(cdx, "wb") as o:
        o.write("\n".join(lines))
Ejemplo n.º 5
0
def generate_path_index(warcs, index):
    """Write a sorted, tab-separated filename->fullpath index for *warcs*.

    One ``filename<TAB>fullpath`` line per warc, deduplicated and
    sorted, written to *index*.
    """
    # Fix: the original called set(lines) but discarded the result, so
    # the evidently intended deduplication never happened. Collect into
    # a set and sort once at the end instead.
    lines = set()
    for fullpath, filename in cdxindexer.iter_file_or_dir(warcs):
        lines.add("%s\t%s" % (filename, fullpath))
    with open(index, "wb") as path_index:
        path_index.write("\n".join(sorted(lines)))
Ejemplo n.º 6
0
def generate_path_index(warcs, index):
    """Write a sorted, tab-separated filename->fullpath index for *warcs*.

    One ``filename<TAB>fullpath`` line per warc, deduplicated and
    sorted, written to *index*.
    """
    # Fix: the original called set(lines) but discarded the result, so
    # the evidently intended deduplication never happened. Collect into
    # a set and sort once at the end instead.
    lines = set()
    for fullpath, filename in cdxindexer.iter_file_or_dir(warcs):
        lines.add("%s\t%s" % (filename, fullpath))
    with open(index, "wb") as path_index:
        path_index.write("\n".join(sorted(lines)))
Ejemplo n.º 7
0
def get_filenames(warcs):
    """Builds a URL->filename lookup from WARC files.

    :param warcs: iterable of warc file or directory paths
    :returns: dict mapping record URL -> filename extracted from the
              record (entries with no extractable filename are skipped)
    """
    filenames = {}
    options = {"include_all": False, "surt_ordered": False}
    for fullpath, filename in cdxindexer.iter_file_or_dir(warcs):
        with open(fullpath, "rb") as warc:
            # renamed from 'iter', which shadowed the builtin
            record_iter = DefaultRecordIter(**options)
            for entry in record_iter(warc):
                # only successful (HTTP 200), non-revisit responses
                if not entry.record.status_headers.statusline.startswith("200"):
                    continue
                if entry.record.rec_type == "revisit":
                    continue
                pdf = get_filename(entry)
                if pdf is not None:
                    key = entry.pop("url")
                    filenames[key] = pdf
    return filenames
def get_filenames(warcs):
    """Builds a URL->filename lookup from WARC files.

    :param warcs: iterable of warc file or directory paths
    :returns: dict mapping record URL -> filename extracted from the
              record (entries with no extractable filename are skipped)
    """
    filenames = {}
    options = {"include_all": False, "surt_ordered": False}
    for fullpath, filename in cdxindexer.iter_file_or_dir(warcs):
        with open(fullpath, "rb") as warc:
            # renamed from 'iter', which shadowed the builtin
            record_iter = DefaultRecordIter(**options)
            for entry in record_iter(warc):
                # only successful (HTTP 200), non-revisit responses
                if not entry.record.status_headers.statusline.startswith(
                        "200"):
                    continue
                if entry.record.rec_type == "revisit":
                    continue
                pdf = get_filename(entry)
                if pdf is not None:
                    key = entry.pop("url")
                    filenames[key] = pdf
    return filenames
Ejemplo n.º 9
0
def index_warcs(warcs, cdx, base_cdx=None):
    """Creates a sorted CDX for a series of WARC files,
       supplementing revisit records with original records.

    :param warcs: iterable of warc file or directory paths
    :param cdx: output cdx path (overwritten)
    :param base_cdx: optional path to a sorted cdx used to resolve
                     revisit records to their original captures
    """
    with open(cdx, "wb") as o:
        with cdxindexer.CDXWriter(o, False) as writer:
            for fullpath, filename in cdxindexer.iter_file_or_dir(warcs):
                with open(fullpath, "rb") as infile:
                    entry_iter = cdxindexer.create_index_iter(
                        infile, include_all=False, surt_ordered=False)
                    for entry in entry_iter:
                        if entry.mime == "warc/revisit":
                            if base_cdx is not None:
                                # cdx fields 3 and 5 are mime and digest:
                                # keep only original (non-revisit) lines
                                # whose payload digest matches this one
                                with open(base_cdx, "rb") as c:
                                    matches = [
                                        m for m in binsearch.iter_exact(
                                            c, entry.key)
                                        if m.split()[3] != "warc/revisit"
                                        and m.split()[5] == entry.digest
                                    ]
                                if len(matches) > 0:
                                    o.write("%s\n" % matches[-1])
                        writer.write(entry, fullpath)

    # Re-sort the combined file. Fix: list.sort(cmp=...) was removed in
    # Python 3; key=locale.strxfrm yields the same locale-aware ordering
    # as cmp=locale.strcoll.
    with open(cdx, "rb") as i:
        lines = [l.strip() for l in i]
    lines.sort(key=locale.strxfrm)
    with open(cdx, "wb") as o:
        o.write("\n".join(lines))