Example #1
0
def test_urlitem_staticcontent(tmp_path, gzip_nonhtml_url):
    """A URLItem built with explicit ``content`` is stored with that content."""
    zim_path = tmp_path / "test.zim"
    with Creator(zim_path) as creator:
        static_item = URLItem(url=gzip_nonhtml_url, content="hello")
        creator.add_item(static_item)

    # the entry is looked up at "robots.txt"; its bytes must be the
    # content handed to the item
    stored = bytes(Archive(zim_path).get_item("robots.txt").content)
    assert stored == b"hello"
Example #2
0
def test_filelikeprovider_nosize(tmp_path, png_image_url):
    """Round-trip an in-memory download through a FileLikeProviderItem."""
    # download the image straight into memory
    buffer = io.BytesIO()
    stream_file(png_image_url, byte_stream=buffer)

    zim_path = tmp_path / "test.zim"
    with Creator(zim_path) as creator:
        item = FileLikeProviderItem(fileobj=buffer, path="one.png")
        creator.add_item(item)

    # what the ZIM holds must equal everything written to the buffer
    stored = bytes(Archive(zim_path).get_item("one.png").content)
    assert stored == buffer.getvalue()
Example #3
0
def test_urlitem_html(tmp_path, gzip_html_url):
    """A URLItem for an HTML URL stores the same bytes as a plain download."""
    # reference copy fetched with the external downloader
    reference_path = tmp_path / "file.html"
    save_large_file(gzip_html_url, reference_path)
    expected = reference_path.read_bytes()

    zim_path = tmp_path / "test.zim"
    with Creator(zim_path) as creator:
        creator.add_item(URLItem(url=gzip_html_url))

    stored = bytes(Archive(zim_path).get_item("wiki/Main_Page").content)
    assert stored == expected
Example #4
0
def test_urlitem_nonhtmlgzip(tmp_path, gzip_nonhtml_url):
    """A URLItem for a non-HTML URL stores the same bytes as a plain download."""
    # reference copy fetched with the external downloader
    reference_path = tmp_path / "file.txt"
    save_large_file(gzip_nonhtml_url, reference_path)
    expected = reference_path.read_bytes()

    zim_path = tmp_path / "test.zim"
    with Creator(zim_path) as creator:
        # same URL added twice: first with defaults, then with use_disk=True
        for extra_kwargs in ({}, {"use_disk": True}):
            creator.add_item(URLItem(url=gzip_nonhtml_url, **extra_kwargs))

    stored = bytes(Archive(zim_path).get_item("robots.txt").content)
    assert stored == expected
Example #5
0
def test_urlprovider(tmp_path, png_image_url):
    """An item served by SpecialURLProviderItem matches a plain download."""
    # reference copy fetched with the external downloader
    reference_path = tmp_path / "file.png"
    save_large_file(png_image_url, reference_path)
    expected = reference_path.read_bytes()

    zim_path = tmp_path / "test.zim"
    with Creator(zim_path) as creator:
        item = SpecialURLProviderItem(url=png_image_url, path="one.png")
        creator.add_item(item)

    stored = bytes(Archive(zim_path).get_item("one.png").content)
    assert stored == expected
Example #6
0
def test_urlitem_binary(tmp_path, png_image_url):
    """A URLItem for a binary (PNG) URL stores the same bytes as a download."""
    # reference copy fetched with the external downloader
    reference_path = tmp_path / "file.png"
    save_large_file(png_image_url, reference_path)
    expected = reference_path.read_bytes()

    zim_path = tmp_path / "test.zim"
    with Creator(zim_path) as creator:
        creator.add_item(URLItem(url=png_image_url))

    archive = Archive(zim_path)
    logo = archive.get_item("static/images/project-logos/commonswiki.png")
    assert bytes(logo.content) == expected
Example #7
0
def test_add_item_for_delete(tmp_path, html_file):
    """``add_item_for(..., delete_fpath=True)`` removes the source file."""
    zim_path = tmp_path / "test.zim"

    # work on a throw-away copy so the fixture file itself is not deleted
    source_copy = pathlib.Path(tmp_path).joinpath("somefile.html")
    shutil.copyfile(html_file, source_copy)

    with Creator(zim_path, "welcome", "") as creator:
        creator.add_item_for(
            fpath=source_copy,
            path="index",
            delete_fpath=True,
        )

    # the copy is gone once the creator has finished...
    assert not source_copy.exists()

    # ...and the entry is still readable from the ZIM
    assert Archive(zim_path).get_item("index")
Example #8
0
def test_noindexlanguage(tmp_path):
    """With language "bam", only the title index is expected in the ZIM."""
    zim_path = tmp_path / "test.zim"
    with Creator(zim_path, "welcome", "") as creator:
        welcome = StaticItem(path="welcome", content="hello")
        creator.add_item(welcome)
        creator.update_metadata(language="bam")
        creator.add_item_for("index", "Index", content="-", mimetype="text/html")

    archive = Archive(zim_path)
    stored_language = archive.get_metadata("Language").decode(UTF8)
    assert stored_language == "bam"

    # HTML content would normally feed both the title and the full-text
    # xapian indexes; with indexing disabled only the title one remains
    assert archive.has_title_index
    assert not archive.has_fulltext_index
Example #9
0
def test_add_item_for_delete_fail(tmp_path, png_image):
    """Item is still added when its source is removed before delete_callback.

    The item's own callback deletes the source file first; the
    ``(delete_callback, path)`` callback passed to ``add_item`` is then
    registered for the same path.
    """
    zim_path = tmp_path / "test.zim"

    # work on a throw-away copy so the fixture file itself is not deleted
    source_copy = pathlib.Path(tmp_path).joinpath("somefile.png")
    shutil.copyfile(png_image, source_copy)

    def remove_source(item):
        # item-level callback: delete the file backing the item
        print("##########", "remove_source")
        os.remove(item.filepath)

    with Creator(zim_path, "welcome", "") as creator:
        item = StaticItem(
            filepath=source_copy,
            path="index",
            callback=remove_source,
        )
        creator.add_item(item, callback=(delete_callback, source_copy))

    assert not source_copy.exists()

    # the entry must be readable from the ZIM regardless
    assert Archive(zim_path).get_item("index")
Example #10
0
 def get_metadata(self, zimfile, name):
     """Open *zimfile* as an Archive and return its metadata value *name*."""
     return Archive(zimfile).get_metadata(name)
Example #11
0
 def list_articles(self, zimfile):
     """Yield each entry of *zimfile*, for ids 0 up to ``entry_count - 1``."""
     archive = Archive(zimfile)
     yield from map(archive.get_entry_by_id, range(archive.entry_count))
Example #12
0
    def test_warc_to_zim_specify_params_and_metadata(self, tmp_path):
        """Run warc2zim with explicit CLI options and inspect the ZIM.

        Checks the exact set of entries (path -> title), the ordered list
        of metadata keys, the presence of both indexes, and the
        Description / Tags / Title metadata values.
        """
        zim_filename = "zim-out-filename.zim"

        # NOTE(review): the "[email protected]" segment in the -r URL below
        # looks like an e-mail-obfuscation artifact standing in for a
        # "package@version" segment -- confirm the intended URL upstream.
        cli_args = [
            "-v",
            os.path.join(TEST_DATA_DIR, "example-response.warc"),
            "--name", "example-response",
            "--output", str(tmp_path),
            "--zim-file", zim_filename,
            "-r", "https://cdn.jsdelivr.net/npm/@webrecorder/[email protected]/dist/",
            "--tags", "some",
            "--tags", "foo",
            "--desc", "test zim",
            "--tags", "bar",
            "--title", "Some Title",
        ]
        warc2zim(cli_args)

        zim_output = tmp_path / zim_filename
        assert os.path.isfile(zim_output)

        # map every entry path to its title
        all_articles = {}
        for article in self.list_articles(zim_output):
            entry_path, entry_title = article.path, article.title
            all_articles[entry_path] = entry_title

        expected_articles = {
            # entries from WARC
            "A/example.com/": "Example Domain",
            "H/example.com/": "H/example.com/",
            # replay system files
            "A/index.html": "A/index.html",
            "A/load.js": "A/load.js",
            "A/404.html": "A/404.html",
            "A/sw.js": "A/sw.js",
            "A/topFrame.html": "A/topFrame.html",
        }
        assert all_articles == expected_articles

        archive = Archive(zim_output)

        # ZIM metadata: keys are compared as an ordered list
        expected_metadata_keys = [
            "Counter",
            "Creator",
            "Date",
            "Description",
            "Language",
            "Name",
            "Publisher",
            "Scraper",
            "Source",
            "Tags",
            "Title",
        ]
        assert list(archive.metadata.keys()) == expected_metadata_keys

        assert archive.has_fulltext_index
        assert archive.has_title_index

        # values given on the command line end up in the metadata
        description = self.get_metadata(zim_output, "Description")
        assert description == b"test zim"

        tags = self.get_metadata(zim_output, "Tags")
        assert tags == b"_ftindex:yes;_category:other;_sw:yes;some;foo;bar"

        title = self.get_metadata(zim_output, "Title")
        assert title == b"Some Title"
Example #13
0
    def verify_warc_and_zim(self, warcfile, zimfile):
        """Assert that the records of *warcfile* are mirrored in *zimfile*.

        For every record of type response, resource or revisit that has a
        URL (each URL is checked once; revisits that refer to their own URL
        are skipped):

        * the content stored at ``"H/" + <url without scheme>`` is parsed as
          a WARC record whose WARC and HTTP headers must equal the
          original's;
        * the item at ``"A/" + <url without scheme>`` must be absent/falsy
          for revisit records and for records whose ``Content-Length``
          header is ``"0"``; otherwise its bytes must equal the record's
          payload, after removing the rendered ``sw_check.html`` insert when
          the item's mimetype is ``HTML_RAW``.
        """
        assert os.path.isfile(warcfile)
        assert os.path.isfile(zimfile)

        # autoescape=False to allow injecting html entities from translated text
        env = Environment(
            loader=PackageLoader("warc2zim", "templates"),
            extensions=["jinja2.ext.i18n"],
            autoescape=False,
        )

        # bytes expected to be inserted into HTML_RAW payloads (checked below)
        head_insert = env.get_template("sw_check.html").render().encode(
            "utf-8")

        # track to avoid checking duplicates, which are not written to ZIM
        warc_urls = set()

        zim_fh = Archive(zimfile)
        for record in iter_warc_records([warcfile]):
            url = get_record_url(record)
            if not url:
                continue

            if url in warc_urls:
                continue

            # only these three record types are verified
            if record.rec_type not in (("response", "resource", "revisit")):
                continue

            # ignore revisit records that are to the same url
            if (record.rec_type == "revisit" and
                    record.rec_headers["WARC-Refers-To-Target-URI"] == url):
                continue

            # parse headers as record, ensure headers match
            # NOTE(review): with maxsplit=2, a URL containing a second "//"
            # loses everything from that point on here -- confirm whether
            # maxsplit=1 was intended.
            url_no_scheme = url.split("//", 2)[1]
            print(url_no_scheme)
            parsed_record = next(
                ArchiveIterator(
                    BytesIO(zim_fh.get_content("H/" + url_no_scheme))))

            assert record.rec_headers == parsed_record.rec_headers
            assert record.http_headers == parsed_record.http_headers

            # ensure payloads match; a missing item is represented as None
            try:
                payload = zim_fh.get_item("A/" + url_no_scheme)
            except KeyError:
                payload = None

            # revisit records and zero-length responses must have no payload
            if record.rec_type == "revisit" or (
                    record.http_headers
                    and record.http_headers.get("Content-Length") == "0"):
                assert not payload
            else:
                payload_content = payload.content.tobytes()

                # if HTML_RAW, still need to account for the head insert;
                # otherwise the payload should be an exact match
                if payload.mimetype == HTML_RAW:
                    assert head_insert in payload_content
                    assert (payload_content.replace(
                        head_insert, b"") == record.buffered_stream.read())
                else:
                    assert payload_content == record.buffered_stream.read()

            # remember the URL only once it has been fully verified
            warc_urls.add(url)
Example #14
0
 def get_article_raw(self, zimfile, path):
     """Open *zimfile* as an Archive and return the item stored at *path*."""
     return Archive(zimfile).get_item(path)
Example #15
0
 def get_article(self, zimfile, path):
     """Open *zimfile* as an Archive and return the content stored at *path*."""
     return Archive(zimfile).get_content(path)
Example #16
0
def test_zim_creator(tmp_path, png_image, html_file, html_str):
    """Build a ZIM with items, redirects and an illustration, then read it.

    Covers: metadata (Title / Language / Tags), main entry, items added from
    a string and from files, redirects with and without title, argument
    validation of ``add_item_for``, title suggestions, full-text search and
    the default illustration.
    """
    fpath = tmp_path / "test.zim"
    main_path, language, title = "welcome", "fra", "My Title"
    tags = ";".join(["toto", "tata"])

    # read the image once; reused for the illustration and the final check
    with open(png_image, "rb") as fh:
        png_data = fh.read()

    with Creator(fpath, main_path, language, title=title,
                 tags=tags) as creator:
        # verbatim HTML from string
        creator.add_item_for("welcome", "wel", content=html_str, is_front=True)
        # verbatim HTML from file
        creator.add_item_for("welcome3", "wel3", fpath=html_file)
        creator.add_item_for("welcome4", "wel4", fpath=html_file)
        # single binary image
        creator.add_item_for("images/yahoo.png",
                             "Home !!",
                             fpath=png_image,
                             is_front=True)
        # redirect to our main page (no title)
        creator.add_redirect("home", "welcome")
        # redirect to our main page (with a custom title)
        creator.add_redirect("home2", "welcome", "Home !!")
        creator.add_redirect("home3", "welcome3", "Home !!", is_front=True)
        creator.add_redirect("home4", "welcome4", "Home !!", is_front=False)

        # ensure args requirement are checked
        with pytest.raises(ValueError,
                           match="One of fpath or content is required"):
            creator.add_item_for("images/yahoo.png")

        # png_data is already in memory: no need to reopen the file here
        creator.add_default_illustration(png_data)

    assert fpath.exists()

    reader = Archive(fpath)
    assert reader.get_metadata("Title").decode(UTF8) == title
    assert reader.get_metadata("Language").decode(UTF8) == language
    assert reader.get_metadata("Tags").decode(UTF8) == tags
    assert reader.main_entry.get_item().path == main_path
    # make sure we have our image
    assert reader.get_item("images/yahoo.png")
    # make sure we have our redirects
    assert reader.get_entry_by_path("home2").is_redirect
    assert reader.get_entry_by_path(
        "home2").get_redirect_entry().path == main_path
    # make sure titles were indexed (html with title for xapian)
    # see https://github.com/openzim/python-libzim/issues/125
    assert "home2" in list(reader.get_suggestions("Home !!"))
    assert "home3" in list(reader.get_suggestions("Home !!"))
    assert "home4" in list(reader.get_suggestions("Home !!"))
    assert "images/yahoo.png" not in list(reader.get_suggestions("Home !!"))
    # make sure full text was indexed
    assert reader.get_search_results_count("PDF doc") >= 1

    # ensure non-rewritten articles have not been rewritten
    assert bytes(reader.get_item("welcome").content).decode(UTF8) == html_str
    assert bytes(reader.get_item("welcome3").content).decode(UTF8) == html_str

    # ensure illustration is present and correct
    assert reader.has_illustration()
    assert bytes(reader.get_illustration_item().content) == png_data
Example #17
0
def test_urlprovider_nolength(tmp_path, png_image_url, png_image):
    """URL-backed items work against a server that sends no Content-Length.

    A small HTTP server is started in a child process; it answers every GET
    with the reference PNG and never sends a ``Content-Length`` header. Three
    items are added from it (URLItem with ``use_disk=True``, URLItem with
    defaults, SpecialURLProviderItem) and the stored bytes are compared with
    the reference download.
    """

    # save url's content locally using external tool
    # (kept in its own local so the ``png_image`` fixture is not shadowed;
    # the fixture itself is requested but not otherwise used here)
    original_png = tmp_path / "original.png"
    save_large_file(png_image_url, original_png)
    with open(original_png, "rb") as fh:
        png_image_bytes = fh.read()

    # create and start an http server without Content-Length support
    server_fpath = tmp_path / "httpd.py"
    port = random.randint(10000, 20000)
    server_code = """
from http.server import BaseHTTPRequestHandler, HTTPServer

class handler(BaseHTTPRequestHandler):

    def do_GET(self):
        self.send_response(200)
        self.send_header("Content-type", "image/png")
        if "gzip" in self.path:
            self.send_header("Content-Encoding", "gzip")
        self.end_headers()
        with open("{src}", "rb") as fh:
            self.wfile.write(fh.read())


with HTTPServer(('', {port}), handler) as server:
    server.serve_forever()

"""
    # plain str.replace (not str.format): only the two placeholders are
    # substituted in the script template
    with open(server_fpath, "w") as fh:
        fh.write(
            server_code.replace("{port}",
                                str(port)).replace("{src}", str(original_png)))

    httpd = subprocess.Popen([sys.executable, server_fpath])
    time.sleep(2)  # allow http server to start

    fpath = tmp_path / "test.zim"
    try:
        with tempfile.TemporaryDirectory() as tmp_dir, Creator(
                fpath) as creator:
            tmp_dir = pathlib.Path(tmp_dir)
            creator.add_item(
                URLItem(
                    url=f"http://localhost:{port}/hoho.png",
                    path="B",
                    tmp_dir=tmp_dir,
                    use_disk=True,
                ))
            creator.add_item(
                URLItem(url=f"http://localhost:{port}/home.png",
                        tmp_dir=tmp_dir))

            creator.add_item(
                SpecialURLProviderItem(url=f"http://localhost:{port}/home.png",
                                       mimetype="image/png"))
    finally:
        # stop the server and reap the child so no zombie process (and no
        # ResourceWarning from an un-waited Popen) is left behind
        httpd.terminate()
        httpd.wait()

    zim = Archive(fpath)
    assert bytes(zim.get_item("home.png").content) == png_image_bytes
    assert bytes(zim.get_item("B").content) == png_image_bytes