Beispiel #1
0
    def test_include_domains_favicon_and_language(self, tmp_path):
        """Convert ``single-page-test.warc`` with ``-i reseau-canope.fr``.

        Verifies that every multi-segment ``A/`` entry contains the included
        domain, that the ``Language`` metadata equals ``fra``, that the
        favicon entry and the 48x48 illustration metadata are present, and
        that the default tags were written.
        """
        zim_name = "spt.zim"
        args = [
            os.path.join(TEST_DATA_DIR, "single-page-test.warc"),
            "-i", "reseau-canope.fr",
            "--output", str(tmp_path),
            "--zim-file", zim_name,
            "--name", "spt",
        ]
        warc2zim(args)

        zim_path = tmp_path / zim_name

        for article in self.list_articles(zim_path):
            entry_path = article.path
            # replay files have only one path segment: nothing to check there
            if not entry_path.startswith("A/") or len(entry_path.split("/")) <= 2:
                continue
            assert "reseau-canope.fr/" in entry_path

        # detected language
        language = self.get_metadata(zim_path, "Language")
        assert language == b"fra"

        # detected favicon
        favicon_path = (
            "A/lesfondamentaux.reseau-canope.fr/fileadmin/template/img/favicon.ico"
        )
        assert self.get_article(zim_path, favicon_path)
        assert self.get_metadata(zim_path, "Illustration_48x48@1")

        # default tags added
        tags = self.get_metadata(zim_path, "Tags")
        assert tags == b"_ftindex:yes;_category:other;_sw:yes"
Beispiel #2
0
    def test_custom_css(self, tmp_path):
        """Inject a local CSS file via ``--custom-css`` and check it is stored.

        The main article must reference the ``custom.css`` URL and the entry
        stored under ``A/warc2zim.kiwix.app/custom.css`` must equal the bytes
        that were written to the local file.
        """
        css_bytes = b"* { background-color: red; }"
        css_file = tmp_path / "custom.css"
        css_file.write_bytes(css_bytes)

        zim_name = "test-css.zim"
        args = [
            os.path.join(TEST_DATA_DIR, "example-response.warc"),
            "--output", str(tmp_path),
            "--zim-file", zim_name,
            "--name", "test-css",
            "--custom-css", str(css_file),
        ]
        warc2zim(args)
        zim_path = tmp_path / zim_name

        # the rewritten page links to the injected stylesheet
        page = self.get_article(zim_path, "A/example.com/")
        expected_link = "https://warc2zim.kiwix.app/custom.css".encode("utf-8")
        assert expected_link in page

        # the stylesheet itself is stored unchanged
        stored_css = self.get_article(zim_path, "A/warc2zim.kiwix.app/custom.css")
        assert stored_css == css_bytes
Beispiel #3
0
    def test_all_warcs_root_dir(self, tmp_path):
        """Convert the whole test data directory into one ZIM file.

        Checks that the main URL gets a trailing slash in ``A/index.html``
        and that entries originating from several WARC files in the data
        directory are non-empty.
        """
        zim_name = "test-all.zim"
        args = [
            os.path.join(TEST_DATA_DIR),
            "--output", str(tmp_path),
            "--zim-file", zim_name,
            "--name", "test-all",
            "--url", "http://example.com",
        ]
        warc2zim(args)
        zim_path = tmp_path / zim_name

        # ensure trailing slash added
        index_page = self.get_article(zim_path, "A/index.html")
        assert b'window.mainUrl = "http://example.com/"' in index_page

        # articles from different warc records in tests/data dir
        expected_entries = (
            # from example.warc.gz
            "A/example.com/",
            # from single-page-test.warc
            "A/lesfondamentaux.reseau-canope.fr/accueil.html",
            # timestamp fuzzy match from example-with-timestamp.warc
            "H/example.com/path.txt?",
        )
        for entry_path in expected_entries:
            assert self.get_article(zim_path, entry_path) != b""
Beispiel #4
0
    def test_same_domain_only(self, tmp_path):
        """Convert with ``--include-domains example.com/`` and check the entries.

        Every ``A/`` entry with more than one path segment must live under
        ``A/example.com/``.
        """
        zim_name = "same-domain.zim"
        args = [
            os.path.join(TEST_DATA_DIR, "example-revisit.warc.gz"),
            "--favicon", "http://example.com/favicon.ico",
            "--include-domains", "example.com/",
            "--lang", "eng",
            "--zim-file", zim_name,
            "--name", "same-domain",
            "--output", str(tmp_path),
        ]
        warc2zim(args)

        zim_path = tmp_path / zim_name

        for article in self.list_articles(zim_path):
            entry_path = article.path
            # replay files have only one path segment: skip them
            if not entry_path.startswith("A/") or len(entry_path.split("/")) <= 2:
                continue
            assert entry_path.startswith("A/example.com/")
Beispiel #5
0
    def test_args_only(self):
        """Exercise ``warc2zim`` with argument lists that contain no WARC input.

        Covers three cases: a missing required name (``SystemExit`` with
        code 2), a non-existent output directory (any exception), and a
        valid argument set without inputs (return code 100).
        """
        # error, name required
        with pytest.raises(SystemExit) as excinfo:
            warc2zim([])
        # The check must live outside the ``raises`` block: statements after
        # the raising call inside the block are never executed. The exit
        # status is carried by the exception instance (``excinfo.value``).
        assert excinfo.value.code == 2

        # error, no such output directory
        with pytest.raises(Exception):
            warc2zim(["--name", "test", "--output", "/no-such-dir"])

        # success, special error code for no output files
        assert warc2zim(["--name", "test", "--output", "./"]) == 100
Beispiel #6
0
 def test_error_bad_main_page(self, tmp_path):
     """Expect an exception when ``-u`` points at a URL absent from the WARC."""
     zim_name = "zim-out-not-created.zim"
     with pytest.raises(Exception):
         warc2zim([
             "-v",
             os.path.join(TEST_DATA_DIR, "example-response.warc"),
             "-u", "https://no-such-url.example.com",
             "--output", str(tmp_path),
             "--name", "bad",
             "--zim-file", zim_name,
         ])
Beispiel #7
0
    def test_fuzzy_urls(self, tmp_path, fuzzycheck):
        """Convert the WARC named by ``fuzzycheck`` and check its entries.

        Each path listed in ``fuzzycheck["entries"]`` must resolve to
        content containing a ``Location: `` header line.
        """
        warc_name = fuzzycheck["filename"]
        zim_name = warc_name + ".zim"
        args = [
            os.path.join(TEST_DATA_DIR, warc_name),
            "--output", str(tmp_path),
            "--zim-file", zim_name,
            "--name", "test-fuzzy",
        ]
        warc2zim(args)
        zim_path = tmp_path / zim_name

        for entry_path in fuzzycheck["entries"]:
            content = self.get_article(zim_path, entry_path)
            assert b"Location: " in content
Beispiel #8
0
    def test_error_bad_replay_viewer_url(self, tmp_path):
        """Expect an exception, and no ZIM file, for an invalid ``-r`` value."""
        zim_name = "zim-out-not-created.zim"
        with pytest.raises(Exception):
            warc2zim([
                "-v",
                os.path.join(TEST_DATA_DIR, "example-response.warc"),
                "-r", "x-invalid-x",
                "--output", str(tmp_path),
                "--name", "bad",
                "--zim-file", zim_name,
            ])

        # the replay viewer could not be loaded, so nothing must have been written
        zim_path = tmp_path / zim_name
        assert not os.path.isfile(zim_path)
Beispiel #9
0
    def test_skip_self_redirect(self, tmp_path):
        """Check that a self-redirecting record is not stored as an ``H/`` entry.

        After converting ``self-redirect.warc`` the only ``H/`` entry must be
        ``H/kiwix.org/`` and its content must contain a 200 status line.
        """
        zim_name = "self-redir.zim"
        args = [
            os.path.join(TEST_DATA_DIR, "self-redirect.warc"),
            "--output", str(tmp_path),
            "--zim-file", zim_name,
            "--name", "self-redir",
        ]
        warc2zim(args)

        zim_path = tmp_path / zim_name

        for article in self.list_articles(zim_path):
            entry_path = article.path
            if not entry_path.startswith("H/"):
                continue
            # there must be a single H/ record, and it is a 200 (not 301)
            assert entry_path == "H/kiwix.org/"
            headers = self.get_article(zim_path, "H/kiwix.org/")
            assert b"HTTP/1.1 200 OK" in headers
Beispiel #10
0
    def test_warc_to_zim(self, cmdline, tmp_path):
        """Convert the WARC named by ``cmdline[0]`` and verify the resulting ZIM.

        ``cmdline`` is a list whose first item is a file name relative to the
        test data directory; the remaining items are extra ``warc2zim``
        options. When ``--progress-file`` is among them, the progress JSON
        written to ``tmp_path / "progress.json"`` is checked as well.
        """
        # input filename (first arg), relative to the test data dir
        filename = cmdline[0]
        warcfile = os.path.join(TEST_DATA_DIR, filename)

        # Build a fresh argument list rather than mutating ``cmdline`` in
        # place: the incoming list may be shared between test invocations,
        # and rewriting its first item / extending it would leak into them.
        args = [
            warcfile,
            *cmdline[1:],
            "--output", str(tmp_path),
            "--name", filename,
        ]

        warc2zim(args)

        zimfile = filename + "_" + time.strftime("%Y-%m") + ".zim"

        if "--progress-file" in args:
            with open(tmp_path / "progress.json", "r") as fh:
                progress = json.load(fh)
            assert (progress["written"] > 0 and progress["total"] > 0
                    and progress["written"] <= progress["total"])

        self.verify_warc_and_zim(warcfile, tmp_path / zimfile)
Beispiel #11
0
    def test_local_replay_viewer_url(self, tmp_path):
        """Use a locally stored ``sw.js`` as replay viewer source.

        Downloads ``sw.js`` into ``tmp_path``, passes that directory (with a
        trailing slash) as the ``-r`` value and checks that the ZIM file was
        created.
        """
        zim_local_sw = "zim-local-sw.zim"

        # NOTE(review): "[email protected]" in the URL below looks like an
        # e-mail-obfuscation artifact replacing "<package>@<version>" --
        # confirm the intended package/version against the upstream test.
        res = requests.get(
            "https://cdn.jsdelivr.net/npm/@webrecorder/[email protected]/dist/sw.js",
            timeout=30,
        )
        # Fail right here on an HTTP error instead of silently saving the
        # error body as sw.js and failing later for an unrelated reason.
        res.raise_for_status()

        # Store the served bytes verbatim; decoding and re-encoding through
        # the locale's default encoding could alter the file.
        with open(tmp_path / "sw.js", "wb") as fh:
            fh.write(res.content)

        warc2zim([
            "-v",
            os.path.join(TEST_DATA_DIR, "example-response.warc"),
            "-r",
            str(tmp_path) + "/",
            "--output",
            str(tmp_path),
            "--name",
            "local-sw",
            "--zim-file",
            zim_local_sw,
        ])

        assert os.path.isfile(tmp_path / zim_local_sw)
Beispiel #12
0
    def test_custom_css_remote(self, tmp_path):
        """Inject a remote CSS file via ``--custom-css`` and check it is stored.

        The main article must reference the ``custom.css`` URL and the stored
        stylesheet must equal the body fetched from the remote URL.
        """
        zim_name = "test-css.zim"
        # NOTE(review): "[email protected]" looks like an e-mail-obfuscation
        # artifact replacing "<package>@<version>" -- confirm the intended URL.
        css_url = (
            "https://cdn.jsdelivr.net/npm/[email protected]/dist/css/bootstrap-reboot.css"
        )

        args = [
            os.path.join(TEST_DATA_DIR, "example-response.warc"),
            "--output", str(tmp_path),
            "--zim-file", zim_name,
            "--name", "test-css",
            "--custom-css", css_url,
        ]
        warc2zim(args)
        zim_path = tmp_path / zim_name

        # the rewritten page links to the injected stylesheet
        page = self.get_article(zim_path, "A/example.com/")
        expected_link = "https://warc2zim.kiwix.app/custom.css".encode("utf-8")
        assert expected_link in page

        # the stored stylesheet matches what the remote URL serves
        stored_css = self.get_article(zim_path, "A/warc2zim.kiwix.app/custom.css")
        remote_css = requests.get(css_url).content
        assert stored_css == remote_css
Beispiel #13
0
    def test_warc_to_zim_specify_params_and_metadata(self, tmp_path):
        """Convert with explicit title, description, tags and replay source.

        Checks the exact set of entries (path -> title), the list of metadata
        keys, the presence of full-text and title indexes, and the values of
        the ``Description``, ``Tags`` and ``Title`` metadata.
        """
        zim_name = "zim-out-filename.zim"
        # NOTE(review): "[email protected]" in the "-r" value looks like an
        # e-mail-obfuscation artifact replacing "<package>@<version>" --
        # confirm the intended URL.
        args = [
            "-v",
            os.path.join(TEST_DATA_DIR, "example-response.warc"),
            "--name", "example-response",
            "--output", str(tmp_path),
            "--zim-file", zim_name,
            "-r", "https://cdn.jsdelivr.net/npm/@webrecorder/[email protected]/dist/",
            "--tags", "some",
            "--tags", "foo",
            "--desc", "test zim",
            "--tags", "bar",
            "--title", "Some Title",
        ]
        warc2zim(args)

        zim_path = tmp_path / zim_name

        assert os.path.isfile(zim_path)

        found_articles = {}
        for article in self.list_articles(zim_path):
            found_articles[article.path] = article.title

        expected_articles = {
            # entries from WARC
            "A/example.com/": "Example Domain",
            "H/example.com/": "H/example.com/",
            # replay system files
            "A/index.html": "A/index.html",
            "A/load.js": "A/load.js",
            "A/404.html": "A/404.html",
            "A/sw.js": "A/sw.js",
            "A/topFrame.html": "A/topFrame.html",
        }
        assert found_articles == expected_articles

        archive = Archive(zim_path)

        # ZIM metadata
        expected_keys = [
            "Counter",
            "Creator",
            "Date",
            "Description",
            "Language",
            "Name",
            "Publisher",
            "Scraper",
            "Source",
            "Tags",
            "Title",
        ]
        assert list(archive.metadata.keys()) == expected_keys

        assert archive.has_fulltext_index
        assert archive.has_title_index

        description = self.get_metadata(zim_path, "Description")
        assert description == b"test zim"
        tags = self.get_metadata(zim_path, "Tags")
        assert tags == b"_ftindex:yes;_category:other;_sw:yes;some;foo;bar"
        title = self.get_metadata(zim_path, "Title")
        assert title == b"Some Title"
Beispiel #14
0
def zimit(args=None):
    """Run a browser-based crawl and convert the resulting WARC files to ZIM.

    Parses the crawler options declared below; every argument this parser
    does not recognise is collected and forwarded to ``warc2zim``. The
    forwarded arguments are first handed to ``warc2zim`` on their own as a
    validation run, then the crawler command is executed with
    ``subprocess.run`` and finally ``warc2zim`` is called again with the
    crawl's archive directory appended.

    Args:
        args: Argument list passed to ``ArgumentParser.parse_known_args``;
            ``None`` lets argparse read ``sys.argv``.

    Returns:
        ``2`` when the validation run of ``warc2zim`` returns anything other
        than ``100``; otherwise the return value of the final ``warc2zim``
        call.

    Raises:
        subprocess.CalledProcessError: If the crawler command exits with a
            non-zero status (``check=True``).
    """
    wait_until_options = [
        "load", "domcontentloaded", "networkidle0", "networkidle2"
    ]
    # accepted --waitUntil values: each single option plus every unordered
    # pair of two options joined by a comma
    wait_until_all = wait_until_options + [
        f"{a},{b}" for a, b in itertools.combinations(wait_until_options, 2)
    ]
    parser = ArgumentParser(
        description=
        "Run a browser-based crawl on the specified URL and convert to ZIM")

    parser.add_argument("-u", "--url", help="The URL to start crawling from")

    parser.add_argument("-w",
                        "--workers",
                        type=int,
                        help="Number of parallel workers")

    parser.add_argument(
        "--newContext",
        help="The context for each new capture (page, session or browser).",
        choices=["page", "session", "browser"],
        default="page",
    )

    parser.add_argument(
        "--waitUntil",
        help=
        "Puppeteer page.goto() condition to wait for before continuing. One of "
        f"{wait_until_options} or a comma-separated combination of those.",
        choices=wait_until_all,
        default="load,networkidle0",
    )

    parser.add_argument("--limit",
                        help="Limit crawl to this number of pages",
                        type=int,
                        default=0)

    parser.add_argument(
        "--timeout",
        help="Timeout for each page to load (in seconds)",
        type=int,
        default=90,
    )

    parser.add_argument(
        "--scope",
        help="Regex of page URLs that should be included in the crawl "
        "(defaults to the immediate directory of the URL)",
    )

    parser.add_argument(
        "--exclude",
        help="Regex of page URLs that should be excluded from the crawl.")

    parser.add_argument(
        "--scroll",
        help="If set, will autoscroll to bottom of the page",
        action="store_true",
        default=False,
    )

    parser.add_argument(
        "--keep",
        help="If set, keep WARC files after crawl, don't delete",
        action="store_true",
    )

    parser.add_argument("--output",
                        help="Output directory for ZIM and WARC files",
                        default="/output")

    parser.add_argument("--adminEmail", help="Admin Email for Zimit crawler")

    # nargs="?" with const: a bare --mobileDevice selects "iPhone X"
    parser.add_argument("--mobileDevice",
                        help="Crawl as Mobile Device",
                        nargs="?",
                        const="iPhone X")

    parser.add_argument(
        "--useSitemap",
        help=
        "If set, use the URL as sitemap to get additional URLs for the crawl (usually /sitemap.xml)",
    )

    parser.add_argument(
        "--custom-css",
        help="[warc2zim] Custom CSS file URL/path to inject into all articles",
    )

    parser.add_argument(
        "--statsFilename",
        help="If set, output stats as JSON to this file",
    )

    # unknown arguments are not an error here: they end up in warc2zim_args
    zimit_args, warc2zim_args = parser.parse_known_args(args)

    # pass url and output to warc2zim also
    if zimit_args.output:
        warc2zim_args.append("--output")
        warc2zim_args.append(zimit_args.output)

    url = zimit_args.url

    if url:
        # check_url is defined outside this block; its result replaces the
        # URL used for both warc2zim and the crawler
        url = check_url(url, zimit_args.scope)
        warc2zim_args.append("--url")
        warc2zim_args.append(url)

    if zimit_args.custom_css:
        warc2zim_args += ["--custom-css", zimit_args.custom_css]

    # Validation run: warc2zim is called before any WARC input is appended.
    # NOTE(review): 100 is presumably warc2zim's "arguments valid, nothing
    # to convert" return code -- confirm against warc2zim.
    print("----------")
    print("Testing warc2zim args")
    print("Running: warc2zim " + " ".join(warc2zim_args), flush=True)
    res = warc2zim(warc2zim_args)
    if res != 100:
        print("Exiting, invalid warc2zim params")
        return 2

    # make temp dir for this crawl
    temp_root_dir = Path(tempfile.mkdtemp(dir=zimit_args.output,
                                          prefix=".tmp"))

    # unless --keep was given, remove the temp dir at interpreter exit
    if not zimit_args.keep:

        def cleanup():
            print("")
            print("----------")
            print(f"Cleanup, removing temp dir: {temp_root_dir}", flush=True)
            shutil.rmtree(temp_root_dir)

        atexit.register(cleanup)

    # base crawler command; get_node_cmd_line is defined outside this block
    cmd_args = get_node_cmd_line(zimit_args)
    if url:
        cmd_args.append("--url")
        cmd_args.append(url)

    # the admin e-mail, when given, is appended to the user-agent suffix
    user_agent_suffix = "+Zimit "
    if zimit_args.adminEmail:
        user_agent_suffix += zimit_args.adminEmail

    cmd_args.append("--userAgentSuffix")
    cmd_args.append(user_agent_suffix)

    cmd_args.append("--cwd")
    cmd_args.append(str(temp_root_dir))

    # setup inotify crawler progress watcher
    if zimit_args.statsFilename:
        watcher = ProgressFileWatcher(Path(zimit_args.output),
                                      Path(zimit_args.statsFilename))
        print(f"Writing progress to {watcher.stats_path}")
        # update crawler command
        cmd_args.append("--statsFilename")
        cmd_args.append(str(watcher.crawl_path))
        # update warc2zim command
        warc2zim_args.append("-v")
        warc2zim_args.append("--progress-file")
        warc2zim_args.append(str(watcher.warc2zim_path))
        watcher.watch()

    # joined form is used for display only; the list is what gets executed
    cmd_line = " ".join(cmd_args)

    print("")
    print("----------")
    print(
        f"Output to tempdir: {temp_root_dir} - {'will keep' if zimit_args.keep else 'will delete'}"
    )
    print(f"Running browsertrix-crawler crawl: {cmd_line}", flush=True)
    subprocess.run(cmd_args, check=True)

    # directory under the temp dir that is handed to warc2zim as its input
    warc_files = temp_root_dir / "collections" / "capture" / "archive"
    warc2zim_args.append(str(warc_files))

    # counts every directory entry, whatever its type
    num_files = sum(1 for e in warc_files.iterdir())

    print("")
    print("----------")
    print(f"Processing {num_files} WARC files to ZIM", flush=True)

    return warc2zim(warc2zim_args)