Esempio n. 1
0
def test_union_with_url_updates():
    """Check ``fetch.unique_fqdn_list`` when one duplicate carries extra data.

    The first input holds ``www.example.com`` with ``fqdn_last_ipv4`` set, the
    second holds the same FQDN without it. The merged result is expected to be
    a single entry that still has the IPv4 value.
    """
    # Field values shared by the enriched input entry and the expected entry.
    enriched_fields = dict(
        fqdn="www.example.com",
        tld="com",
        fqdn_last_ipv4="123.456.78.90",
    )

    with_ipv4 = [pyd.Frontier(**enriched_fields)]
    without_ipv4 = [pyd.Frontier(fqdn="www.example.com", tld="com")]

    # Built as a separate instance so the comparison is by value, not identity.
    expected = [pyd.Frontier(**enriched_fields)]

    merged = fetch.unique_fqdn_list(with_ipv4, without_ipv4)
    assert merged == expected
Esempio n. 2
0
def test_union_with_urls():
    """Check that ``fetch.unique_fqdn_list`` de-duplicates across two lists.

    ``www.example.com`` appears twice in the first list and once in the
    second; the expected result lists each FQDN once, in the order
    com, de, fr.
    """

    def build(*pairs):
        # Turn (fqdn, tld) pairs into a list of Frontier objects.
        return [pyd.Frontier(fqdn=fqdn, tld=tld) for fqdn, tld in pairs]

    first = build(
        ("www.example.com", "com"),
        ("www.example.de", "de"),
        ("www.example.com", "com"),
    )
    second = build(
        ("www.example.com", "com"),
        ("www.example.fr", "fr"),
    )
    expected = build(
        ("www.example.com", "com"),
        ("www.example.de", "de"),
        ("www.example.fr", "fr"),
    )

    merged = fetch.unique_fqdn_list(first, second)
    assert merged == expected
Esempio n. 3
0
def test_simulate_short_term_fetch():
    """Check the shape of ``fetch.simulate_short_term_fetch`` output.

    A frontier for ``www.example.de`` with two URLs is passed in. The result
    must be a list whose length equals the number of input URLs multiplied by
    ``s.max_links_per_page + 1``, and every element must be a ``pyd.Url``
    whose ``url`` field is an ``HttpUrl``.
    """
    seed_urls = [
        pyd.Url(
            url="http://www.example.de/html/index",
            fqdn="www.example.de",
            url_discovery_date=None,
            url_last_visited="2020-01-01T06:00:00",
            url_blacklisted=False,
            url_bot_excluded=False,
        ),
        pyd.Url(
            url="http://www.example.de/html/contact",
            fqdn="www.example.de",
            url_discovery_date=None,
            url_last_visited="2020-01-01T07:00:00",
            url_blacklisted=False,
            url_bot_excluded=False,
        ),
    ]
    frontier = pyd.Frontier(
        fqdn="www.example.de",
        tld="de",
        fqdn_last_ipv4="123.456.78.91",
        fqdn_last_ipv6="2001:DB8::1234",
        fqdn_pagerank=0.00001,
        fqdn_crawl_delay=None,
        fqdn_url_count=2,
        url_list=seed_urls,
    )

    fetched = fetch.simulate_short_term_fetch(frontier)

    assert isinstance(fetched, List)

    expected_count = len(frontier.url_list) * (s.max_links_per_page + 1)
    assert len(fetched) == expected_count

    for entry in fetched:
        assert isinstance(entry, pyd.Url)
        assert isinstance(entry.url, HttpUrl)
Esempio n. 4
0
def fqdns_from_url_list(url_list: List[pyd.Url]) -> List[pyd.Frontier]:
    """Build one ``pyd.Frontier`` per URL from its ``fqdn``.

    Each resulting frontier gets the URL's ``fqdn`` and the value of
    ``get_tld(fqdn)`` as ``tld``. No de-duplication happens here, so the
    output has the same length and order as ``url_list``.
    """
    return [
        pyd.Frontier(fqdn=entry.fqdn, tld=get_tld(entry.fqdn))
        for entry in url_list
    ]
Esempio n. 5
0
def test_simulate_fetch():
    """Check the result of ``fetch.simulate_full_fetch`` for two frontiers.

    The input response holds one URL each for ``www.example.de`` and
    ``www.example.com``. The result must echo the request ``uuid``, report an
    integer ``url_count`` equal to ``urls_count * (s.max_links_per_page + 1)``,
    and contain only URLs with a valid ``HttpUrl`` and at least one datetime
    among ``url_discovery_date`` and ``url_last_visited``.
    """
    frontier_de = pyd.Frontier(
        fqdn="www.example.de",
        tld="de",
        fqdn_last_ipv4="123.456.78.90",
        fqdn_last_ipv6="2001:DB8::1234",
        fqdn_pagerank=0.00001,
        fqdn_crawl_delay=5,
        fqdn_url_count=1,
        url_list=[
            pyd.Url(
                url="http://www.example.de/html/index",
                fqdn="www.example.de",
                url_discovery_date=None,
                url_last_visited="2020-01-01T06:00:00",
                url_blacklisted=False,
                url_bot_excluded=False,
            ),
        ],
    )
    frontier_com = pyd.Frontier(
        fqdn="www.example.com",
        tld="com",
        fqdn_last_ipv4="123.456.78.90",
        fqdn_last_ipv6="2001:DB8::1234",
        fqdn_pagerank=0.00001,
        fqdn_crawl_delay=5,
        fqdn_url_count=1,
        url_list=[
            pyd.Url(
                url="http://www.example.com/html/index",
                fqdn="www.example.com",
                url_discovery_date=None,
                url_last_visited="2020-01-01T06:00:00",
                url_blacklisted=False,
                url_bot_excluded=False,
            ),
        ],
    )
    request = pyd.FrontierResponse(
        uuid="12345678-90ab-cdef-0000-000000000000",
        response_url="http://www.example.com/submit",
        latest_return="2020-10-10T23:00:00.000000",
        url_frontiers_count=2,
        urls_count=2,
        url_frontiers=[frontier_de, frontier_com],
    )

    result = fetch.simulate_full_fetch(request)

    assert result.uuid == request.uuid
    assert isinstance(result.url_count, int)

    expected_count = request.urls_count * (s.max_links_per_page + 1)
    assert result.url_count == expected_count

    for entry in result.urls:
        assert isinstance(entry.url, HttpUrl)

        # Each date field is optional on its own ...
        discovered = entry.url_discovery_date
        assert discovered is None or isinstance(discovered, datetime)

        visited = entry.url_last_visited
        assert visited is None or isinstance(visited, datetime)

        # ... but at least one of the two must actually be set.
        assert isinstance(entry.url_discovery_date, datetime) or isinstance(
            entry.url_last_visited, datetime
        )