Example #1
def test_crawler_has_root_domain(allow_subd, subd_result):
    cr = crawler.Crawler("https://google.com", "", allow_subdomains=allow_subd)

    assert cr._has_root_url("https://google.com/search") == True
    assert cr._has_root_url("https://yandex.ru/search") == False

    assert cr._has_root_url("https://mail.google.com") == subd_result
Example #2
def test_crawler_filter_links(checkup_html, allow_subd, result):
    cr = crawler.Crawler("https://google.com", "", allow_subdomains=allow_subd)

    filter_result = list(
        cr._filter_links(checkup_html["title_nonempty_links"][1],
                         "https://google.com"))
    filter_result.sort()
    assert filter_result == checkup_html[result]
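
The checkup_html fixture and the result parameter are defined elsewhere; the tests only imply their shape. A rough, purely hypothetical sketch of such a fixture (the HTML content and the filtered_* key names are invented for illustration):

import pytest


@pytest.fixture
def checkup_html():
    # Hypothetical test document; the real fixture is not part of the source.
    html = (
        "<html><head><title>Checkup page</title></head><body>"
        '<a href="/help">Help</a>'
        '<a href="https://mail.google.com/inbox">Mail</a>'
        '<a href="https://yandex.ru/search">External</a>'
        "</body></html>"
    )
    return {
        "html": html,
        # (page title, non-empty links found on the page)
        "title_nonempty_links": ("Checkup page", [
            "/help",
            "https://mail.google.com/inbox",
            "https://yandex.ru/search",
        ]),
        # expected _filter_links output, selected via the `result` parameter
        "filtered_subdomains": [
            "https://google.com/help",
            "https://mail.google.com/inbox",
        ],
        "filtered_no_subdomains": ["https://google.com/help"],
    }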
Example #3
def test_crawler_normalize_link():
    cr = crawler.Crawler("https://google.com", "")

    assert (cr._normalize_link(
        "https://google.com/help",
        "https://google.com/") == "https://google.com/help")
    assert (cr._normalize_link(
        "/help", "https://google.com/") == "https://google.com/help")
    assert (cr._normalize_link(
        "https://mail.google.com/help#fragment",
        "https://google.com/") == "https://mail.google.com/help")
Example #4
def test_crawler_instance():
    cr = crawler.Crawler(
        "https://google.com",
        splash_address="0.0.0.0:8050",
        allow_subdomains=True,
        allow_queries=True,
        depth_by_desc=3,
        concurrency=5,
        max_pause=10.0,
    )
    assert isinstance(cr, crawler.Crawler)
Example #5
def test_crawler_is_valid_link(allow_subd, subd_result):
    cr = crawler.Crawler("https://google.com", "", allow_subdomains=allow_subd)

    assert (cr._is_valid_link("https://google.com/search",
                              "https://google.com") == True)
    assert cr._is_valid_link("/help", "https://google.com") == True
    assert cr._is_valid_link("#help", "https://google.com") == False
    assert (cr._is_valid_link("https.google.com/#help",
                              "https://google.com") == False)
    assert (cr._is_valid_link("https://yandex.ru/search",
                              "https://google.com") == False)

    assert (cr._is_valid_link("https://mail.google.com",
                              "https://google.com") == subd_result)
Example #6
def test_remove_query():
    cr = crawler.Crawler("https://google.com", "")

    assert (cr._remove_query("https://google.com/search?page=42") ==
            "https://google.com/search")
Example #7
def test_crawler_get_page_data(checkup_html):
    cr = crawler.Crawler("https://google.com", "")

    assert (cr.get_page_data(
        checkup_html["html"]) == checkup_html["title_nonempty_links"])
Example #8
def save(
    url,
    splash_address,
    subdomains,
    queries,
    depth,
    concurrency,
    max_pause,
    file_prefix,
):
    """Perform the crawling and save crawled data as JSON graph representaiont
    and TAB-formatted text file in case of success.

    EXAMPLE:
        python baby_crawler save http://scrapethissite.com -s http://localhost:8050
    """

    # Perform crawling and measure time

    cr = crawler.Crawler(
        url,
        splash_address,
        subdomains,
        queries,
        depth,
        concurrency,
        max_pause,
    )

    start = time.perf_counter()
    cr.make_site_map()
    elapsed = time.perf_counter() - start

    # Count processed links and errors

    links_found = len(cr.added_tasks)
    links_crawled = len(cr.crawled_links) - 1

    echo(
        f"Found {links_found} unique links on {url} wiht depth level {depth}.",
        color="green",
    )
    echo(f"Successfully crawled {links_crawled} links.", color="green")
    echo(
        "Elapsed time {}".format(datetime.timedelta(seconds=elapsed)),
        color="green",
    )

    if cr.error_count:
        echo("Errors:", color="red")
        for error, amount in cr.error_count.items():
            echo(f"{error}: {amount}", color="red")

    # Write files in case of successful crawling

    if links_found:
        if not file_prefix:
            file_prefix = re.sub("[^a-zA-Z0-9]", "_", url)
        file_prefix += time.strftime("_%y-%m-%d_%H-%M-%S", time.localtime())
        with open(f"{file_prefix}.json", "w+", encoding="utf-8") as json_file:
            json.dump(nx.node_link_data(cr.site_graph),
                      json_file,
                      ensure_ascii=False)
            echo(f"Written graph data to {file_prefix}.json", color="green")

        with open(f"{file_prefix}.txt", "w+") as txt_file:

            txt_file.write(cr.site_graph.nodes[1]["url"] + "\n")

            def writegraph(graph, start_node, level):
                # Depth-first walk: write each successor one tab level deeper
                # than its parent, then recurse into it.
                level += 1
                for i in graph.successors(start_node):
                    txt_file.write("\t" * level + graph.nodes[i]["url"] + "\n")
                    writegraph(graph, i, level)

            writegraph(cr.site_graph, 1, 0)

            echo(f"Written found links to {file_prefix}.txt", color="green")