Ejemplo n.º 1
0
    def write_page_list(self, wacz, filename, page_iter):
        """Store the lines of ``page_iter`` as one deflated entry of ``wacz``.

        Args:
            wacz: archive object whose ``open(zipinfo, "w")`` returns a
                writable binary handle usable as a context manager.
            filename: name of the entry to create inside the archive.
            page_iter: iterable of text lines; each one is encoded as UTF-8
                and written in iteration order, with nothing added between
                lines.
        """
        entry = zipfile.ZipInfo(filename, now())
        entry.compress_type = zipfile.ZIP_DEFLATED

        encoded_lines = (text.encode("utf-8") for text in page_iter)
        with wacz.open(entry, "w") as out_fh:
            for chunk in encoded_lines:
                out_fh.write(chunk)
Ejemplo n.º 2
0
    def check_indexes(self):
        """Compare the recorded hash of ``indexes/index.cdx.gz`` with the WACZ copy.

        The hash listed for ``indexes/index.cdx.gz`` in
        ``self.datapackage["resources"]`` is compared with the hash of that
        same member extracted from ``self.wacz``, computed by
        ``support_hash_file`` using ``self.hash_type``.

        Returns:
            True when the two hashes are equal. False when they differ, when
            the index file is absent under ``self.dir.name``, or when the
            datapackage has no resource entry (or a ``None`` hash) for it.
        """
        index_path = "indexes/index.cdx.gz"
        if not os.path.exists(os.path.join(self.dir.name, index_path)):
            return False

        # The last matching resource wins, as before. A missing entry used to
        # surface later as an UnboundLocalError; it is now a failed check.
        expected_hash = None
        for resource in self.datapackage["resources"]:
            if resource["path"] == index_path:
                expected_hash = resource["stats"]["hash"]
        if expected_hash is None:
            return False

        # NOTE(review): this rebuilds an index from an empty input set into a
        # scratch zip whose content is never compared with anything. It is
        # kept to preserve existing behaviour; confirm whether it was meant to
        # index the archived WARC instead.
        scratch = tempfile.NamedTemporaryFile(delete=False)
        scratch.close()  # only the path is needed; ZipFile reopens it by name
        try:
            with zipfile.ZipFile(scratch.name, "w") as scratch_zip:
                data_file = zipfile.ZipInfo(index_path, now())
                text_wrap = TextIOWrapper(BytesIO(), "utf-8", write_through=True)
                with scratch_zip.open(data_file, "w") as data:
                    wacz_indexer = WACZIndexer(
                        text_wrap,
                        {},
                        sort=True,
                        compress=data,
                        fields="referrer",
                        data_out_name="index.cdx.gz",
                        records="all",
                        main_url="",
                        detect_pages="",
                    )
                    wacz_indexer.process_all()
        finally:
            # delete=False means nothing else removes the scratch file.
            os.unlink(scratch.name)

        # Extract into a directory that is removed as soon as the hash is known.
        with tempfile.TemporaryDirectory() as extract_dir:
            with zipfile.ZipFile(self.wacz, "r") as zip_ref:
                zip_ref.extractall(extract_dir)

            with open(os.path.join(extract_dir, index_path), "rb") as fd:
                actual_hash = support_hash_file(self.hash_type, fd.read())

        return expected_hash == actual_hash
Ejemplo n.º 3
0
def create_wacz(res):
    """Build a WACZ archive at ``res.output`` from the files in ``res.inputs``.

    Entries are written in this order: ``indexes/index.cdx.gz`` (filled by
    ``WACZIndexer``), ``indexes/index.idx`` (whatever the indexer wrote to its
    text output), a copy of every input under ``archive/``, the page list(s),
    and finally ``datapackage.json``.

    When ``res.pages`` is set, that file is read as JSON lines, every line is
    checked with ``validateJSON``, and the resulting dict from
    ``construct_passed_pages_dict`` is handed to the indexer. If the first
    line contains a ``"format"`` key it is treated as a header, and its
    ``"id"`` / ``"title"`` values replace the defaults in the page list.

    Args:
        res: options object. Attributes read here are ``output``, ``inputs``,
            ``pages``, ``hash_type``, ``url``, ``ts``, ``detect_pages`` and
            ``text``; it is also passed on to ``generate_metadata``.

    Returns:
        0 on success, 1 when a line of the passed pages file fails validation.
    """
    passed_content = []

    # Opened as a context manager so the archive is always finalised and the
    # handle released, including on the early ``return 1`` below.
    with zipfile.ZipFile(res.output, "w") as wacz:
        print("Generating indexes...")

        # write index
        data_file = zipfile.ZipInfo("indexes/index.cdx.gz", now())

        index_file = zipfile.ZipInfo("indexes/index.idx", now())
        index_file.compress_type = zipfile.ZIP_DEFLATED

        index_buff = BytesIO()
        text_wrap = TextIOWrapper(index_buff, "utf-8", write_through=True)

        passed_pages_dict = {}

        # If the flag for passed pages has been passed
        if res.pages is not None:
            print("Attempt to validate passed pages.jsonl file")
            with open(res.pages, "r") as pages_fh:
                passed_content = pages_fh.read().split("\n")

            # Get rid of the blank end line that editors can sometimes add to
            # jsonl files if it's present
            if passed_content[-1] == "":
                passed_content.pop()

            # Confirm the passed jsonl file has valid json on each line
            for page_str in passed_content:
                if not validateJSON(page_str):
                    print(
                        "The passed jsonl file cannot be validated. Error found on the following line\n %s"
                        % page_str)
                    return 1

            # Create a dict of the passed pages that will be used in the
            # construction of the index
            passed_pages_dict = construct_passed_pages_dict(passed_content)

        with wacz.open(data_file, "w") as data:
            wacz_indexer = WACZIndexer(
                text_wrap,
                res.inputs,
                sort=True,
                post_append=True,
                compress=data,
                fields="referrer",
                data_out_name="index.cdx.gz",
                hash_type=res.hash_type,
                main_url=res.url,
                main_ts=res.ts,
                detect_pages=res.detect_pages,
                passed_pages_dict=passed_pages_dict,
                extract_text=res.text,
            )

            wacz_indexer.process_all()

        index_buff.seek(0)

        with wacz.open(index_file, "w") as index:
            shutil.copyfileobj(index_buff, index)

        # write archives
        print("Writing archives...")
        for _input in res.inputs:
            archive_file = zipfile.ZipInfo.from_file(
                _input, "archive/" + os.path.basename(_input))
            with wacz.open(archive_file, "w") as out_fh:
                with open(_input, "rb") as in_fh:
                    shutil.copyfileobj(in_fh, out_fh)

        # Every key still present in the indexer's dict at this point is
        # reported as a passed page without a match.
        if wacz_indexer.passed_pages_dict is not None:
            for key in wacz_indexer.passed_pages_dict:
                print(
                    "Invalid passed page. We were unable to find a match for %s" %
                    str(key))

        if len(wacz_indexer.pages) > 0:
            # Default header id and title; a header in the passed pages file
            # may override them.
            id_value = "pages"
            title_value = "All Pages"

            if res.pages is None:
                print("Generating page index...")
            else:
                print("Generating page index from passed pages...")
                # An empty passed file has no first line to inspect, so the
                # defaults are kept instead of failing on passed_content[0].
                header = json.loads(passed_content[0]) if passed_content else {}
                if "format" in header:
                    print("Header detected in the passed pages.jsonl file")
                    if "id" in header:
                        id_value = header["id"]
                    if "title" in header:
                        title_value = header["title"]

            # generate pages/text
            wacz_indexer.write_page_list(
                wacz,
                PAGE_INDEX,
                wacz_indexer.serialize_json_pages(
                    wacz_indexer.pages.values(),
                    id=id_value,
                    title=title_value,
                    has_text=wacz_indexer.has_text,
                ),
            )

        if len(wacz_indexer.extra_page_lists) > 0:
            print("Generating extra page lists...")

            for name, pagelist in wacz_indexer.extra_page_lists.items():
                # "pages" is the id of the main page list, so an extra list
                # with that name gets a generated one instead.
                if name == "pages":
                    name = shortuuid.uuid()
                filename = PAGE_INDEX_TEMPLATE.format(name)

                wacz_indexer.write_page_list(wacz, filename, pagelist)

        # generate metadata
        print("Generating metadata...")

        metadata = wacz_indexer.generate_metadata(res, wacz)
        metadata_file = zipfile.ZipInfo("datapackage.json", now())
        metadata_file.compress_type = zipfile.ZIP_DEFLATED
        wacz.writestr(metadata_file, metadata.encode("utf-8"))

    return 0
Ejemplo n.º 4
0
def create_wacz(res):
    """Build a WACZ archive at ``res.output`` from the files in ``res.inputs``.

    Entries are written in this order: ``indexes/index.cdx.gz`` (filled by
    ``WACZIndexer``), ``indexes/index.idx`` (whatever the indexer wrote to its
    text output), a copy of every input under ``archive/``, the page list(s)
    collected by the indexer, and finally ``datapackage.json``.

    Args:
        res: options object. Attributes read here are ``output``, ``inputs``,
            ``url``, ``ts``, ``detect_pages`` and ``text``; it is also passed
            on to ``generate_metadata``.

    Returns:
        0 once the archive has been written.
    """
    # Opened as a context manager so the archive is always finalised and the
    # handle released, even if indexing or copying raises.
    with zipfile.ZipFile(res.output, "w") as wacz:
        print("Generating indexes...")

        # write index
        data_file = zipfile.ZipInfo("indexes/index.cdx.gz", now())

        index_file = zipfile.ZipInfo("indexes/index.idx", now())
        index_file.compress_type = zipfile.ZIP_DEFLATED

        index_buff = BytesIO()
        text_wrap = TextIOWrapper(index_buff, "utf-8", write_through=True)

        with wacz.open(data_file, "w") as data:
            wacz_indexer = WACZIndexer(
                text_wrap,
                res.inputs,
                sort=True,
                compress=data,
                fields="referrer",
                data_out_name="index.cdx.gz",
                records="all",
                main_url=res.url,
                main_ts=res.ts,
                detect_pages=res.detect_pages,
                extract_text=res.text,
            )

            wacz_indexer.process_all()

        # Rewind so the buffered index text is copied from its beginning.
        index_buff.seek(0)

        with wacz.open(index_file, "w") as index:
            shutil.copyfileobj(index_buff, index)

        # write archives
        print("Writing archives...")
        for _input in res.inputs:
            archive_file = zipfile.ZipInfo.from_file(
                _input, "archive/" + os.path.basename(_input))
            with wacz.open(archive_file, "w") as out_fh:
                with open(_input, "rb") as in_fh:
                    shutil.copyfileobj(in_fh, out_fh)

        if len(wacz_indexer.pages) > 0:
            print("Generating page index...")
            # generate pages/text
            wacz_indexer.write_page_list(
                wacz,
                PAGE_INDEX,
                wacz_indexer.serialize_json_pages(
                    wacz_indexer.pages.values(),
                    id="pages",
                    title="All Pages",
                    has_text=wacz_indexer.has_text,
                ),
            )

        if len(wacz_indexer.extra_page_lists) > 0:
            print("Generating extra page lists...")

            for name, pagelist in wacz_indexer.extra_page_lists.items():
                # "pages" is the id of the main page list, so an extra list
                # with that name gets a generated one instead.
                if name == "pages":
                    name = shortuuid.uuid()
                filename = PAGE_INDEX_TEMPLATE.format(name)

                wacz_indexer.write_page_list(wacz, filename, pagelist)

        # generate metadata
        print("Generating metadata...")

        metadata = wacz_indexer.generate_metadata(res, wacz)
        metadata_file = zipfile.ZipInfo("datapackage.json", now())
        metadata_file.compress_type = zipfile.ZIP_DEFLATED
        wacz.writestr(metadata_file, metadata.encode("utf-8"))

    return 0