Ejemplo n.º 1
0
def test_verify_xpath_will_not_run_splash_settings_if_not_javascript(mocker):
    """Verify that no Splash setting reaches the crawler worker.

    ``webcomix.comic.CrawlerWorker`` is replaced by a mock; the first
    positional argument of its first call is treated as the settings mapping
    and must not contain any ``(key, value)`` pair from ``SPLASH_SETTINGS``.
    """
    # NOTE(review): isdir is patched presumably to avoid touching the real
    # filesystem while verifying -- confirm against Comic.verify_xpath.
    mocker.patch("os.path.isdir")
    crawler_worker_mock = mocker.patch("webcomix.comic.CrawlerWorker")

    placeholder = mocker.ANY
    # The sixth positional argument is False; judging by the test name this is
    # presumably the javascript flag -- confirm against Comic.__init__.
    comic = Comic(
        placeholder, placeholder, placeholder, placeholder, placeholder, False
    )
    comic.verify_xpath()

    first_call = crawler_worker_mock.call_args_list[0]
    positional_args = first_call[0]
    settings = positional_args[0]
    assert not any(
        splash_pair in settings.items()
        for splash_pair in SPLASH_SETTINGS.items()
    )
Ejemplo n.º 2
0
def test_verify_xpath_with_alt_text(three_webpages_alt_text_uri):
    """Check the verification result when an alt-text XPath is supplied.

    The comic is built with ``alt_text="//img/@title"``. The expected result
    lists three pages: the first two each carry one image URL and an alt
    text, the third has no image URL and ``None`` as alt text.
    """
    comic = Comic(
        "test_alt",
        three_webpages_alt_text_uri,
        "//img/@src",
        "//a/@href",
        alt_text="//img/@title",
    )

    # str.strip("1.html") would treat its argument as a *set of characters*
    # and trim them from both ends of the URI, which is only accidentally
    # correct. Slice off the trailing file name instead; the fixture URI is
    # expected to end with "1.html".
    first_page_name = "1.html"
    three_webpages_alt_text_folder = three_webpages_alt_text_uri[
        : -len(first_page_name)
    ]

    assert comic.verify_xpath() == [
        {
            "page": 1,
            "url": three_webpages_alt_text_uri,
            "image_urls": [three_webpages_alt_text_folder + "1.jpeg"],
            "alt_text": "First page",
        },
        {
            "page": 2,
            "url": three_webpages_alt_text_folder + "2.html",
            "image_urls": [three_webpages_alt_text_folder + "2.jpeg"],
            "alt_text": "Second page",
        },
        {
            "page": 3,
            "url": three_webpages_alt_text_folder + "3.html",
            "image_urls": [],
            "alt_text": None,
        },
    ]
Ejemplo n.º 3
0
def test_verify_xpath(three_webpages_uri):
    """Check the verification result for a three-page comic without alt text.

    The expected result lists three pages: the first two each carry one image
    URL, the third has none, and every page has ``None`` as alt text.
    """
    comic = Comic("test", three_webpages_uri, "//img/@src", "//a/@href")

    # str.strip("1.html") would treat its argument as a *set of characters*
    # and trim them from both ends of the URI, which is only accidentally
    # correct. Slice off the trailing file name instead; the fixture URI is
    # expected to end with "1.html".
    first_page_name = "1.html"
    three_webpages_folder = three_webpages_uri[: -len(first_page_name)]

    assert comic.verify_xpath() == [
        {
            "page": 1,
            "url": three_webpages_uri,
            "image_urls": [three_webpages_folder + "1.jpeg"],
            "alt_text": None,
        },
        {
            "page": 2,
            "url": three_webpages_folder + "2.html",
            "image_urls": [three_webpages_folder + "2.jpeg"],
            "alt_text": None,
        },
        {
            "page": 3,
            "url": three_webpages_folder + "3.html",
            "image_urls": [],
            "alt_text": None,
        },
    ]
Ejemplo n.º 4
0
def test_print_verification(capfd, three_webpages_uri):
    """Check the text ``cli.print_verification`` writes to stdout.

    The verification of a three-page comic is printed and the captured
    standard output is compared against the expected report: for each page a
    ``Page N:`` header, its URL, an ``Image URLs:`` heading followed by the
    image URL (or an empty line when there is none), and a blank separator.
    """
    comic = Comic("test", three_webpages_uri, "//img/@src", "//a/@href")
    verification = comic.verify_xpath()
    cli.print_verification(verification)
    out, _ = capfd.readouterr()

    # str.strip("1.html") would treat its argument as a *set of characters*
    # and trim them from both ends of the URI, which is only accidentally
    # correct. Slice off the trailing file name instead; the fixture URI is
    # expected to end with "1.html".
    first_page_name = "1.html"
    three_webpages_folder = three_webpages_uri[: -len(first_page_name)]

    # One entry per output line; every line is newline-terminated.
    expected_lines = [
        "Page 1:",
        "Page URL: " + three_webpages_uri,
        "Image URLs:",
        three_webpages_folder + "1.jpeg",
        "",
        "Page 2:",
        "Page URL: " + three_webpages_folder + "2.html",
        "Image URLs:",
        three_webpages_folder + "2.jpeg",
        "",
        "Page 3:",
        "Page URL: " + three_webpages_folder + "3.html",
        "Image URLs:",
        "",
        "",
    ]

    assert out == "".join(line + "\n" for line in expected_lines)
Ejemplo n.º 5
0
def test_verify_xpath_only_verifies_one_page_with_single_page(one_webpage_uri):
    """Check that ``single_page=True`` yields exactly one verified page.

    That single page is expected to expose two image URLs.
    """
    comic = Comic(
        "test",
        one_webpage_uri,
        "//img/@src",
        "//a/@href",
        single_page=True,
    )

    verified_pages = comic.verify_xpath()

    assert len(verified_pages) == 1
    image_urls = verified_pages[0]["image_urls"]
    assert len(image_urls) == 2
Ejemplo n.º 6
0
def test_print_verification_with_alt_text(capfd, three_webpages_alt_text_uri):
    """Check the text ``cli.print_verification`` writes when alt text exists.

    The comic is built with ``alt_text="//img/@title"``. The captured standard
    output is compared against the expected report: pages 1 and 2 include an
    ``Alt text:`` line after their image URL, page 3 has neither an image URL
    nor an alt-text line.
    """
    comic = Comic(
        "test_alt",
        three_webpages_alt_text_uri,
        "//img/@src",
        "//a/@href",
        alt_text="//img/@title",
    )
    verification = comic.verify_xpath()
    cli.print_verification(verification)
    out, _ = capfd.readouterr()

    # str.strip("1.html") would treat its argument as a *set of characters*
    # and trim them from both ends of the URI, which is only accidentally
    # correct. Slice off the trailing file name instead; the fixture URI is
    # expected to end with "1.html".
    first_page_name = "1.html"
    three_webpages_alt_text_folder = three_webpages_alt_text_uri[
        : -len(first_page_name)
    ]

    # One entry per output line; every line is newline-terminated.
    expected_lines = [
        "Page 1:",
        "Page URL: " + three_webpages_alt_text_uri,
        "Image URLs:",
        three_webpages_alt_text_folder + "1.jpeg",
        "Alt text: First page",
        "",
        "Page 2:",
        "Page URL: " + three_webpages_alt_text_folder + "2.html",
        "Image URLs:",
        three_webpages_alt_text_folder + "2.jpeg",
        "Alt text: Second page",
        "",
        "Page 3:",
        "Page URL: " + three_webpages_alt_text_folder + "3.html",
        "Image URLs:",
        "",
        "",
    ]

    assert out == "".join(line + "\n" for line in expected_lines)
Ejemplo n.º 7
0
def test_supported_comics(comic_name):
    """Verify the first pages of one supported comic.

    The entry stored under ``comic_name`` in ``supported_comics`` is unpacked
    as the remaining positional arguments of ``Comic``; the verification
    result is then handed to ``check_first_pages``.
    """
    comic_arguments = supported_comics[comic_name]
    comic = Comic(comic_name, *comic_arguments)
    check_first_pages(comic.verify_xpath())
Ejemplo n.º 8
0
def discovery(
    name: str,
    url: str,
    start_page: int = 1,
    alt_text: Optional[str] = None,
    single_page: bool = False,
    javascript: bool = False,
    title: bool = False,
    debug: bool = False,
) -> Tuple[Optional[Comic], Optional[List[Mapping]]]:
    """Search for a pair of XPath selectors that works for the given comic.

    Every combination of the module-level candidate keywords, tags and
    attributes is turned into an image XPath and a next-page XPath. For each
    combination a ``Comic`` is built and verified; the first combination for
    which construction, ``verify_xpath`` and ``check_first_pages`` all
    complete without raising is returned.

    Args:
        name: Passed to ``Comic`` as its first argument.
        url: Passed to ``Comic`` as its second argument.
        start_page: Forwarded to ``Comic`` unchanged.
        alt_text: Forwarded to ``Comic`` unchanged.
        single_page: Forwarded to ``Comic`` unchanged.
        javascript: Forwarded to ``Comic`` unchanged.
        title: Forwarded to ``Comic`` unchanged.
        debug: Forwarded to ``Comic`` unchanged.

    Returns:
        ``(comic, first_pages)`` for the first working combination, or
        ``(None, None)`` when every combination failed.

    Note:
        A ``KeyboardInterrupt`` during a verification attempt ends the
        process through ``sys.exit(0)``.
    """

    def to_lower_case(attribute):
        # Build an XPath translate() call that lower-cases the attribute, so
        # the contains() match below is case-insensitive.
        return (
            "translate({}, "
            "'ABCDEFGHIJKLMNOPQRSTUVWXYZ',"
            "'abcdefghijklmnopqrstuvwxyz')"
        ).format(attribute)

    click.echo("Looking for a path to the whole comic... (Ctrl-C to exit)")

    # Single source of truth for the search space: the same tuple feeds both
    # the cartesian product and the progress-bar total.
    candidate_lists = (
        possible_next_page_xpath,
        possible_image_xpath,
        possible_tags_image,
        possible_tags_next,
        possible_attributes_image,
        possible_attributes_next,
    )
    combinations = product(*candidate_lists)
    total = 1
    for candidates in candidate_lists:
        total *= len(candidates)

    for next_page, image, tag_image, tag_next, attribute_image, attribute_next in tqdm(
        combinations, total=total
    ):
        next_page_xpath = "//{}[contains({}, '{}')]//@href".format(
            tag_next, to_lower_case(attribute_next), next_page
        )
        image_xpath = "//{}[contains({}, '{}')]//@src".format(
            tag_image, to_lower_case(attribute_image), image
        )
        try:
            comic = Comic(
                name,
                url,
                image_xpath,
                next_page_xpath,
                start_page=start_page,
                alt_text=alt_text,
                single_page=single_page,
                javascript=javascript,
                title=title,
                debug=debug,
            )
            first_pages = comic.verify_xpath()
            check_first_pages(first_pages)
            return comic, first_pages
        except KeyboardInterrupt:
            sys.exit(0)
        except Exception:
            # Best-effort search: any ordinary failure just means this
            # combination does not fit. Unlike a bare ``except``, this lets
            # SystemExit and other BaseException-only signals propagate.
            continue
    click.echo("Search has failed.")
    return None, None