Esempio n. 1
0
def generate_existing_internal_url(url: pyd.Url):
    """Return a copy of *url* that reuses every field of the existing URL."""
    copied_fields = dict(
        url=url.url,
        fqdn=url.fqdn,
        url_pagerank=url.url_pagerank,
        url_discovery_date=url.url_discovery_date,
    )
    return pyd.Url(**copied_fields)
Esempio n. 2
0
def generate_new_internal_url(url: pyd.Url):
    """Derive a fresh internal URL that resembles *url* on the same FQDN.

    A similar-looking URL is generated, paired with the FQDN extracted
    from *url*, and stamped with a new random pagerank and the current
    discovery time.
    """
    similar = gen.get_similar_url(url)
    host = gen.get_fqdn_from_url(url)
    return pyd.Url(
        url=similar.url,
        fqdn=host,
        url_pagerank=gen.random_pagerank(),
        url_discovery_date=datetime.now(),
    )
Esempio n. 3
0
def generate_random_url(fqdn=None) -> pyd.Url:
    """Build a random URL on *fqdn*, picking a random FQDN when omitted."""
    host = fqdn if fqdn is not None else get_random_fqdn()
    random_path = get_random_german_text()
    random_file = get_random_web_filename()
    return pyd.Url(
        url="http://{}/{}{}".format(host, random_path, random_file),
        fqdn=host,
        url_pagerank=random_pagerank(),
        url_discovery_date=datetime.now(),
    )
Esempio n. 4
0
def generate_existing_external_url(session: requests.Session,
                                   fqdn: str = None):
    """Fetch a known URL (optionally restricted to *fqdn*) and re-wrap it.

    The URL and FQDN come from the remote service; pagerank and the
    discovery timestamp are regenerated locally.
    """
    existing = gen.get_random_existing_url(session=session, fqdn=fqdn)
    refreshed = pyd.Url(
        url=existing.url,
        fqdn=existing.fqdn,
        url_pagerank=gen.random_pagerank(),
        url_discovery_date=datetime.now(),
    )
    return refreshed
def test_url_to_dict():
    """url_dict must expose every Url field, with unset ones as None."""
    sample = pyd.Url(url="https://www.example.com/abcefg",
                     fqdn="www.example.com")
    expected = {
        "url": "https://www.example.com/abcefg",
        "fqdn": "www.example.com",
        "url_discovery_date": None,
        "url_last_visited": None,
        "url_blacklisted": None,
        "url_bot_excluded": None,
        "url_pagerank": None,
    }
    assert datsav.url_dict(sample) == expected
Esempio n. 6
0
def test_simulate_short_term_fetch():
    """simulate_short_term_fetch must return a list of valid pyd.Url objects:
    one entry per fetched frontier URL plus s.max_links_per_page discovered
    links for each of them.
    """
    short_term_frontier = pyd.Frontier(
        fqdn="www.example.de",
        tld="de",
        fqdn_last_ipv4="123.456.78.91",
        fqdn_last_ipv6="2001:DB8::1234",
        fqdn_pagerank=0.00001,
        fqdn_crawl_delay=None,
        fqdn_url_count=2,
        url_list=[
            pyd.Url(
                url="http://www.example.de/html/index",
                fqdn="www.example.de",
                url_discovery_date=None,
                url_last_visited="2020-01-01T06:00:00",
                url_blacklisted=False,
                url_bot_excluded=False,
            ),
            pyd.Url(
                url="http://www.example.de/html/contact",
                fqdn="www.example.de",
                url_discovery_date=None,
                url_last_visited="2020-01-01T07:00:00",
                url_blacklisted=False,
                url_bot_excluded=False,
            ),
        ],
    )

    short_term_fetch_result = fetch.simulate_short_term_fetch(short_term_frontier)

    # Runtime type checks use the builtin list; typing.List is meant for
    # annotations, not isinstance().
    assert isinstance(short_term_fetch_result, list)
    # Each frontier URL yields itself plus max_links_per_page new links.
    assert len(short_term_fetch_result) == len(short_term_frontier.url_list) * (
        s.max_links_per_page + 1
    )
    for result_url in short_term_fetch_result:
        assert isinstance(result_url, pyd.Url)
        assert isinstance(result_url.url, HttpUrl)
Esempio n. 7
0
def get_random_existing_url(session: requests.Session,
                            fqdn: str = None) -> pyd.Url:
    """Fetch one random already-known URL from the scheduler service.

    Args:
        session: requests session used to call the scheduler API.
        fqdn: optional host filter; when given, only URLs on that FQDN
            are requested from the service.

    Returns:
        pyd.Url built from the first URL the service returns. If the
        service has no matching URL, a freshly generated random URL is
        used as a fallback so callers always get a value.
    """
    # Build the endpoint once instead of duplicating the GET/json call.
    if fqdn is None:
        endpoint = s.websch_random_urls_endpoint
    else:
        endpoint = "{}?fqdn={}".format(s.websch_random_urls_endpoint, fqdn)
    random_url = session.get(endpoint).json()

    # Fallback: the service may not know any URL for this filter yet.
    if not random_url["url_list"]:
        new_url = generate_random_url()
        random_url["url_list"].append(dict(url=new_url.url, fqdn=new_url.fqdn))

    first = random_url["url_list"][0]
    return pyd.Url(url=first["url"], fqdn=first["fqdn"])
Esempio n. 8
0
def test_simulate_parse_url():
    """simulate_parse_url must refresh the visit timestamp of the source URL
    and yield additional discovered URLs with valid HTTP addresses.
    """
    url = pyd.Url(
        url="http://www.example.de/html/index",
        fqdn="www.example.de",
        url_discovery_date=None,
        url_last_visited="2020-01-01T06:00:00",
        # Fixed typo: the field is url_blacklisted, not rl_blacklisted
        # (matches every other pyd.Url fixture in this file).
        url_blacklisted=False,
        url_bot_excluded=False,
    )

    parsed_list = fetch.simulate_parse_url(url, session)
    # First entry is the re-visited source URL with an updated timestamp...
    assert isinstance(parsed_list[0].url, HttpUrl)
    assert parsed_list[0].url_discovery_date is None
    assert parsed_list[0].url_last_visited != "2020-01-01T06:00:00"
    # ...followed by at least one newly discovered link.
    assert isinstance(parsed_list[1].url, HttpUrl)
Esempio n. 9
0
def test_simulate_fetch():
    """simulate_full_fetch must process every frontier in the partition and
    return (urls_count * (max_links_per_page + 1)) valid URLs, each carrying
    at least one real datetime timestamp.
    """
    frontier_partition = pyd.FrontierResponse(
        uuid="12345678-90ab-cdef-0000-000000000000",
        response_url="http://www.example.com/submit",
        latest_return="2020-10-10T23:00:00.000000",
        url_frontiers_count=2,
        urls_count=2,
        url_frontiers=[
            pyd.Frontier(
                fqdn="www.example.de",
                tld="de",
                fqdn_last_ipv4="123.456.78.90",
                fqdn_last_ipv6="2001:DB8::1234",
                fqdn_pagerank=0.00001,
                fqdn_crawl_delay=5,
                fqdn_url_count=1,
                url_list=[
                    pyd.Url(
                        url="http://www.example.de/html/index",
                        fqdn="www.example.de",
                        url_discovery_date=None,
                        url_last_visited="2020-01-01T06:00:00",
                        url_blacklisted=False,
                        url_bot_excluded=False,
                    ),
                ],
            ),
            pyd.Frontier(
                fqdn="www.example.com",
                tld="com",
                fqdn_last_ipv4="123.456.78.90",
                fqdn_last_ipv6="2001:DB8::1234",
                fqdn_pagerank=0.00001,
                fqdn_crawl_delay=5,
                fqdn_url_count=1,
                url_list=[
                    pyd.Url(
                        url="http://www.example.com/html/index",
                        fqdn="www.example.com",
                        url_discovery_date=None,
                        url_last_visited="2020-01-01T06:00:00",
                        url_blacklisted=False,
                        url_bot_excluded=False,
                    ),
                ],
            ),
        ],
    )

    processed_list = fetch.simulate_full_fetch(frontier_partition)

    assert processed_list.uuid == frontier_partition.uuid
    assert isinstance(processed_list.url_count, int)

    # Each input URL yields itself plus max_links_per_page new links.
    assert processed_list.url_count == frontier_partition.urls_count * (
        s.max_links_per_page + 1
    )

    # Iterate the URLs directly instead of indexing via range(len(...)).
    for processed_url in processed_list.urls:
        assert isinstance(processed_url.url, HttpUrl)
        assert (
            isinstance(processed_url.url_discovery_date, datetime)
            or processed_url.url_discovery_date is None
        )
        assert (
            isinstance(processed_url.url_last_visited, datetime)
            or processed_url.url_last_visited is None
        )
        # Every URL must carry at least one concrete timestamp.
        assert isinstance(
            processed_url.url_discovery_date, datetime
        ) or isinstance(processed_url.url_last_visited, datetime)