def test_include_domains_favicon_and_language(self, tmp_path):
    """Convert with ``-i reseau-canope.fr`` and check paths, language, favicon and tags."""
    zim_name = "spt.zim"
    source_warc = os.path.join(TEST_DATA_DIR, "single-page-test.warc")
    cli_args = [
        source_warc,
        "-i",
        "reseau-canope.fr",
        "--output",
        str(tmp_path),
        "--zim-file",
        zim_name,
        "--name",
        "spt",
    ]
    warc2zim(cli_args)

    zim_path = tmp_path / zim_name

    for path in (entry.path for entry in self.list_articles(zim_path)):
        # replay files have only one path segment, so they are not checked
        if not path.startswith("A/") or len(path.split("/")) <= 2:
            continue
        assert "reseau-canope.fr/" in path

    # the language is detected from the content
    assert self.get_metadata(zim_path, "Language") == b"fra"

    # the favicon is detected and stored both as an article and as an illustration
    favicon_path = (
        "A/lesfondamentaux.reseau-canope.fr/fileadmin/template/img/favicon.ico"
    )
    assert self.get_article(zim_path, favicon_path)
    assert self.get_metadata(zim_path, "Illustration_48x48@1")

    # the default tags are added when none are given
    tags = self.get_metadata(zim_path, "Tags")
    assert tags == b"_ftindex:yes;_category:other;_sw:yes"
def test_custom_css(self, tmp_path):
    """A local ``--custom-css`` file is referenced by the page and stored verbatim."""
    custom_css = b"* { background-color: red; }"
    css_file = tmp_path / "custom.css"
    with open(css_file, "wb") as css_fh:
        css_fh.write(custom_css)

    zim_name = "test-css.zim"
    cli_args = [
        os.path.join(TEST_DATA_DIR, "example-response.warc"),
        "--output",
        str(tmp_path),
        "--zim-file",
        zim_name,
        "--name",
        "test-css",
        "--custom-css",
        str(css_file),
    ]
    warc2zim(cli_args)
    zim_path = tmp_path / zim_name

    # the converted page must reference the injected stylesheet
    stylesheet_url = "https://warc2zim.kiwix.app/custom.css".encode("utf-8")
    page = self.get_article(zim_path, "A/example.com/")
    assert stylesheet_url in page

    # the stylesheet entry must hold exactly the bytes written above
    stored_css = self.get_article(zim_path, "A/warc2zim.kiwix.app/custom.css")
    assert custom_css == stored_css
def test_all_warcs_root_dir(self, tmp_path):
    """Convert the whole test data directory and check entries from several WARCs."""
    zim_name = "test-all.zim"
    cli_args = [
        os.path.join(TEST_DATA_DIR),
        "--output",
        str(tmp_path),
        "--zim-file",
        zim_name,
        "--name",
        "test-all",
        "--url",
        "http://example.com",
    ]
    warc2zim(cli_args)
    zim_path = tmp_path / zim_name

    # check articles from different warc records in tests/data dir

    # the main URL was given without a trailing slash; one must have been added
    index_page = self.get_article(zim_path, "A/index.html")
    assert b'window.mainUrl = "http://example.com/"' in index_page

    # from example.warc.gz
    assert self.get_article(zim_path, "A/example.com/") != b""

    # from single-page-test.warc
    accueil = self.get_article(
        zim_path, "A/lesfondamentaux.reseau-canope.fr/accueil.html"
    )
    assert accueil != b""

    # timestamp fuzzy match from example-with-timestamp.warc
    assert self.get_article(zim_path, "H/example.com/path.txt?") != b""
def test_same_domain_only(self, tmp_path):
    """With ``--include-domains example.com/`` every content entry is under that domain."""
    zim_name = "same-domain.zim"
    cli_args = [
        os.path.join(TEST_DATA_DIR, "example-revisit.warc.gz"),
        "--favicon",
        "http://example.com/favicon.ico",
        "--include-domains",
        "example.com/",
        "--lang",
        "eng",
        "--zim-file",
        zim_name,
        "--name",
        "same-domain",
        "--output",
        str(tmp_path),
    ]
    warc2zim(cli_args)
    zim_path = tmp_path / zim_name

    for path in (entry.path for entry in self.list_articles(zim_path)):
        # replay files have only one path segment, so they are not checked
        if not path.startswith("A/") or len(path.split("/")) <= 2:
            continue
        assert path.startswith("A/example.com/")
def test_args_only(self):
    """Exercise invocations that are decided by the arguments alone (no WARC input)."""
    # error, name required
    with pytest.raises(SystemExit) as exc_info:
        warc2zim([])
    # ``pytest.raises`` binds an ExceptionInfo wrapper, not the exception, so
    # the exit status lives on ``.value``. The check also has to run after the
    # ``with`` block: nothing following the raising call inside it executes.
    assert exc_info.value.code == 2

    # error, no such output directory
    with pytest.raises(Exception):
        warc2zim(["--name", "test", "--output", "/no-such-dir"])

    # success, special error code for no output files
    assert warc2zim(["--name", "test", "--output", "./"]) == 100
def test_error_bad_main_page(self, tmp_path):
    """Conversion must raise when the ``-u`` main page URL is ``https://no-such-url.example.com``."""
    zim_name = "zim-out-not-created.zim"
    cli_args = [
        "-v",
        os.path.join(TEST_DATA_DIR, "example-response.warc"),
        "-u",
        "https://no-such-url.example.com",
        "--output",
        str(tmp_path),
        "--name",
        "bad",
        "--zim-file",
        zim_name,
    ]
    with pytest.raises(Exception):
        warc2zim(cli_args)
def test_fuzzy_urls(self, tmp_path, fuzzycheck):
    """Every entry listed by the ``fuzzycheck`` fixture must contain a ``Location: `` header."""
    warc_name = fuzzycheck["filename"]
    zim_name = warc_name + ".zim"
    cli_args = [
        os.path.join(TEST_DATA_DIR, warc_name),
        "--output",
        str(tmp_path),
        "--zim-file",
        zim_name,
        "--name",
        "test-fuzzy",
    ]
    warc2zim(cli_args)
    zim_path = tmp_path / zim_name

    for entry_path in fuzzycheck["entries"]:
        content = self.get_article(zim_path, entry_path)
        assert b"Location: " in content
def test_error_bad_replay_viewer_url(self, tmp_path):
    """An invalid ``-r`` replay viewer location must raise and leave no ZIM file behind."""
    zim_name = "zim-out-not-created.zim"
    cli_args = [
        "-v",
        os.path.join(TEST_DATA_DIR, "example-response.warc"),
        "-r",
        "x-invalid-x",
        "--output",
        str(tmp_path),
        "--name",
        "bad",
        "--zim-file",
        zim_name,
    ]
    with pytest.raises(Exception):
        warc2zim(cli_args)

    # zim file should not have been created since replay viewer could not be loaded
    zim_was_created = os.path.isfile(tmp_path / zim_name)
    assert not zim_was_created
def test_skip_self_redirect(self, tmp_path):
    """The only ``H/`` entry is ``H/kiwix.org/`` and it records a 200 response."""
    zim_name = "self-redir.zim"
    cli_args = [
        os.path.join(TEST_DATA_DIR, "self-redirect.warc"),
        "--output",
        str(tmp_path),
        "--zim-file",
        zim_name,
        "--name",
        "self-redir",
    ]
    warc2zim(cli_args)
    zim_path = tmp_path / zim_name

    for path in (entry.path for entry in self.list_articles(zim_path)):
        if not path.startswith("H/"):
            continue
        # ensure there is only one H/ record, and it is a 200 (not 301)
        assert path == "H/kiwix.org/"
        headers = self.get_article(zim_path, "H/kiwix.org/")
        assert b"HTTP/1.1 200 OK" in headers
def test_warc_to_zim(self, cmdline, tmp_path):
    """Run a conversion from the ``cmdline`` fixture and verify the ZIM against the WARC."""
    # the input filename is the first argument; it is relative to the test data dir
    filename = cmdline[0]
    warcfile = os.path.join(TEST_DATA_DIR, filename)

    # the argument list is updated in place: absolute input path plus output options
    cmdline[0] = warcfile
    cmdline.extend(["--output", str(tmp_path), "--name", filename])

    warc2zim(cmdline)

    # no --zim-file is given here; the expected name is <name>_<year>-<month>.zim
    zimfile = f"{filename}_{time.strftime('%Y-%m')}.zim"

    if "--progress-file" in cmdline:
        with open(tmp_path / "progress.json", "r") as progress_fh:
            progress = json.load(progress_fh)
            written = progress["written"]
            total = progress["total"]
            assert written > 0 and total > 0 and written <= total

    self.verify_warc_and_zim(warcfile, tmp_path / zimfile)
def test_local_replay_viewer_url(self, tmp_path):
    """Download ``sw.js`` into ``tmp_path`` and use that directory as the ``-r`` location."""
    zim_name = "zim-local-sw.zim"

    # NOTE(review): the "[email protected]" segment looks like an e-mail
    # obfuscation artifact rather than a real package@version - confirm the URL.
    response = requests.get(
        "https://cdn.jsdelivr.net/npm/@webrecorder/[email protected]/dist/sw.js"
    )
    with open(tmp_path / "sw.js", "wt") as sw_fh:
        sw_fh.write(response.text)

    # the viewer location is the local directory, passed with a trailing slash
    local_viewer_dir = str(tmp_path) + "/"
    cli_args = [
        "-v",
        os.path.join(TEST_DATA_DIR, "example-response.warc"),
        "-r",
        local_viewer_dir,
        "--output",
        str(tmp_path),
        "--name",
        "local-sw",
        "--zim-file",
        zim_name,
    ]
    warc2zim(cli_args)

    assert os.path.isfile(tmp_path / zim_name)
def test_custom_css_remote(self, tmp_path):
    """A remote ``--custom-css`` URL is referenced by the page and stored with its content."""
    zim_name = "test-css.zim"
    # NOTE(review): the "[email protected]" segment looks like an e-mail
    # obfuscation artifact rather than a real package@version - confirm the URL.
    url = (
        "https://cdn.jsdelivr.net/npm/[email protected]/dist/css/bootstrap-reboot.css"
    )
    cli_args = [
        os.path.join(TEST_DATA_DIR, "example-response.warc"),
        "--output",
        str(tmp_path),
        "--zim-file",
        zim_name,
        "--name",
        "test-css",
        "--custom-css",
        url,
    ]
    warc2zim(cli_args)
    zim_path = tmp_path / zim_name

    # the converted page must reference the injected stylesheet
    stylesheet_url = "https://warc2zim.kiwix.app/custom.css".encode("utf-8")
    page = self.get_article(zim_path, "A/example.com/")
    assert stylesheet_url in page

    # the stored stylesheet must match a fresh download of the same URL
    stored_css = self.get_article(zim_path, "A/warc2zim.kiwix.app/custom.css")
    assert stored_css == requests.get(url).content
def test_warc_to_zim_specify_params_and_metadata(self, tmp_path):
    """Convert with explicit file name, viewer URL, tags, description and title."""
    zim_name = "zim-out-filename.zim"
    # NOTE(review): the "[email protected]" segment of the -r URL looks like an
    # e-mail obfuscation artifact rather than a real package@version - confirm.
    cli_args = [
        "-v",
        os.path.join(TEST_DATA_DIR, "example-response.warc"),
        "--name",
        "example-response",
        "--output",
        str(tmp_path),
        "--zim-file",
        zim_name,
        "-r",
        "https://cdn.jsdelivr.net/npm/@webrecorder/[email protected]/dist/",
        "--tags",
        "some",
        "--tags",
        "foo",
        "--desc",
        "test zim",
        "--tags",
        "bar",
        "--title",
        "Some Title",
    ]
    warc2zim(cli_args)

    zim_path = tmp_path / zim_name
    assert os.path.isfile(zim_path)

    # map every entry path to its title
    titles_by_path = {}
    for entry in self.list_articles(zim_path):
        titles_by_path[entry.path] = entry.title

    expected_titles = {
        # entries from WARC
        "A/example.com/": "Example Domain",
        "H/example.com/": "H/example.com/",
        # replay system files
        "A/index.html": "A/index.html",
        "A/load.js": "A/load.js",
        "A/404.html": "A/404.html",
        "A/sw.js": "A/sw.js",
        "A/topFrame.html": "A/topFrame.html",
    }
    assert titles_by_path == expected_titles

    archive = Archive(zim_path)

    # ZIM metadata: exactly these keys, in this order
    expected_metadata_keys = [
        "Counter",
        "Creator",
        "Date",
        "Description",
        "Language",
        "Name",
        "Publisher",
        "Scraper",
        "Source",
        "Tags",
        "Title",
    ]
    assert list(archive.metadata.keys()) == expected_metadata_keys

    assert archive.has_fulltext_index
    assert archive.has_title_index

    assert self.get_metadata(zim_path, "Description") == b"test zim"

    # the three --tags values follow the default tags, in command-line order
    tags = self.get_metadata(zim_path, "Tags")
    assert tags == b"_ftindex:yes;_category:other;_sw:yes;some;foo;bar"

    assert self.get_metadata(zim_path, "Title") == b"Some Title"
def zimit(args=None):
    """Run a crawl of the given URL and convert the resulting WARC files to a ZIM.

    Options declared here are parsed by this function; every argument the
    parser does not recognise is collected and handed to ``warc2zim``.

    Steps, in order:

    1. Parse the arguments and build the ``warc2zim`` argument list.
    2. Call ``warc2zim`` once without any WARC input to check those arguments;
       any result other than 100 aborts.
    3. Create a temporary directory under the output directory and run the
       crawler command produced by ``get_node_cmd_line`` inside it.
    4. Call ``warc2zim`` again with the crawl's archive directory appended.

    Args:
        args: Argument list to parse; ``None`` is passed through to
            ``parse_known_args``.

    Returns:
        2 when the argument check fails, otherwise whatever the final
        ``warc2zim`` call returns.

    Raises:
        subprocess.CalledProcessError: If the crawler exits with a non-zero
            status (``check=True``).
    """
    wait_until_options = [
        "load",
        "domcontentloaded",
        "networkidle0",
        "networkidle2",
    ]
    # every single condition plus every unordered pair joined by a comma
    wait_until_pairs = [
        ",".join(pair) for pair in itertools.combinations(wait_until_options, 2)
    ]
    wait_until_all = wait_until_options + wait_until_pairs

    parser = ArgumentParser(
        description="Run a browser-based crawl on the specified URL and convert to ZIM"
    )

    parser.add_argument("-u", "--url", help="The URL to start crawling from")
    parser.add_argument(
        "-w", "--workers", type=int, help="Number of parallel workers"
    )
    parser.add_argument(
        "--newContext",
        help="The context for each new capture (page, session or browser).",
        choices=["page", "session", "browser"],
        default="page",
    )
    parser.add_argument(
        "--waitUntil",
        help="Puppeteer page.goto() condition to wait for before continuing. One of "
        f"{wait_until_options} or a comma-separated combination of those.",
        choices=wait_until_all,
        default="load,networkidle0",
    )
    parser.add_argument(
        "--limit",
        help="Limit crawl to this number of pages",
        type=int,
        default=0,
    )
    parser.add_argument(
        "--timeout",
        help="Timeout for each page to load (in seconds)",
        type=int,
        default=90,
    )
    parser.add_argument(
        "--scope",
        help="Regex of page URLs that should be included in the crawl "
        "(defaults to the immediate directory of the URL)",
    )
    parser.add_argument(
        "--exclude",
        help="Regex of page URLs that should be excluded from the crawl.",
    )
    parser.add_argument(
        "--scroll",
        help="If set, will autoscroll to bottom of the page",
        action="store_true",
        default=False,
    )
    parser.add_argument(
        "--keep",
        help="If set, keep WARC files after crawl, don't delete",
        action="store_true",
    )
    parser.add_argument(
        "--output",
        help="Output directory for ZIM and WARC files",
        default="/output",
    )
    parser.add_argument("--adminEmail", help="Admin Email for Zimit crawler")
    parser.add_argument(
        "--mobileDevice",
        help="Crawl as Mobile Device",
        nargs="?",
        const="iPhone X",
    )
    parser.add_argument(
        "--useSitemap",
        help="If set, use the URL as sitemap to get additional URLs for the crawl (usually /sitemap.xml)",
    )
    parser.add_argument(
        "--custom-css",
        help="[warc2zim] Custom CSS file URL/path to inject into all articles",
    )
    parser.add_argument(
        "--statsFilename",
        help="If set, output stats as JSON to this file",
    )

    # anything this parser does not know is forwarded to warc2zim
    zimit_args, warc2zim_args = parser.parse_known_args(args)

    # output, url and custom CSS are shared with warc2zim
    if zimit_args.output:
        warc2zim_args.extend(["--output", zimit_args.output])

    url = zimit_args.url
    if url:
        url = check_url(url, zimit_args.scope)
        warc2zim_args.extend(["--url", url])

    if zimit_args.custom_css:
        warc2zim_args.extend(["--custom-css", zimit_args.custom_css])

    # dry run: without WARC input, 100 is the only result accepted here
    print("----------")
    print("Testing warc2zim args")
    print("Running: warc2zim " + " ".join(warc2zim_args), flush=True)
    if warc2zim(warc2zim_args) != 100:
        print("Exiting, invalid warc2zim params")
        return 2

    # make temp dir for this crawl
    temp_root_dir = Path(tempfile.mkdtemp(dir=zimit_args.output, prefix=".tmp"))

    if not zimit_args.keep:

        def cleanup():
            print("")
            print("----------")
            print(f"Cleanup, removing temp dir: {temp_root_dir}", flush=True)
            shutil.rmtree(temp_root_dir)

        # removal happens at interpreter exit, not when this function returns
        atexit.register(cleanup)

    cmd_args = get_node_cmd_line(zimit_args)
    if url:
        cmd_args.extend(["--url", url])

    # the admin e-mail, when given, is appended to the user agent suffix
    user_agent_suffix = "+Zimit " + (zimit_args.adminEmail or "")
    cmd_args.extend(["--userAgentSuffix", user_agent_suffix])

    cmd_args.extend(["--cwd", str(temp_root_dir)])

    # setup inotify crawler progress watcher
    if zimit_args.statsFilename:
        watcher = ProgressFileWatcher(
            Path(zimit_args.output), Path(zimit_args.statsFilename)
        )
        print(f"Writing progress to {watcher.stats_path}")
        # the crawler and warc2zim each write to their own watched file
        cmd_args.extend(["--statsFilename", str(watcher.crawl_path)])
        warc2zim_args.extend(
            ["-v", "--progress-file", str(watcher.warc2zim_path)]
        )
        watcher.watch()

    cmd_line = " ".join(cmd_args)
    keep_note = "will keep" if zimit_args.keep else "will delete"

    print("")
    print("----------")
    print(f"Output to tempdir: {temp_root_dir} - {keep_note}")
    print(f"Running browsertrix-crawler crawl: {cmd_line}", flush=True)
    subprocess.run(cmd_args, check=True)

    # the crawl's archive directory becomes the input of the real conversion
    warc_files = temp_root_dir / "collections" / "capture" / "archive"
    warc2zim_args.append(str(warc_files))

    num_files = sum(1 for _ in warc_files.iterdir())

    print("")
    print("----------")
    print(f"Processing {num_files} WARC files to ZIM", flush=True)

    return warc2zim(warc2zim_args)