Example #1
def test_urlutils():
    '''Test URL manipulation tools'''
    # domain extraction
    assert extract_domain('h') is None
    assert extract_domain('https://httpbin.org/') == 'httpbin.org'
    # url parsing
    result = _parse('https://httpbin.org/')
    assert isinstance(result, ParseResult)
    newresult = _parse(result)
    assert isinstance(newresult, ParseResult)
    with pytest.raises(TypeError):
        result = _parse(float(1.23))

    assert get_base_url('https://example.org/path') == 'https://example.org'
    with pytest.raises(ValueError):
        assert get_host_and_path('123') is None
    assert get_host_and_path('https://example.org/path') == (
        'https://example.org', '/path')
    assert get_host_and_path('https://example.org/') == ('https://example.org',
                                                         '/')
    assert get_host_and_path('https://example.org') == ('https://example.org',
                                                        '/')
    assert get_hostinfo('https://httpbin.org/') == ('httpbin.org',
                                                    'https://httpbin.org')
    assert get_hostinfo('https://example.org/path') == ('example.org',
                                                        'https://example.org')
    # keeping track of known URLs
    known_links = {'https://test.org'}
    assert is_known_link('https://test.org/1', known_links) is False
    assert is_known_link('https://test.org', known_links) is True
    assert is_known_link('http://test.org', known_links) is True
    assert is_known_link('http://test.org/', known_links) is True
    assert is_known_link('https://test.org/', known_links) is True
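For orientation, here is a minimal usage sketch of the same helpers outside a test. The import path is an assumption (the snippets above do not show it); adjust it to wherever extract_domain and get_hostinfo live in your setup, e.g. the courlan package.

# Assumed import path -- not shown in the original snippet.
from courlan import extract_domain, get_hostinfo

for url in ('https://httpbin.org/', 'https://example.org/path'):
    domain = extract_domain(url)      # registrable domain, e.g. 'example.org'
    host, base = get_hostinfo(url)    # e.g. ('example.org', 'https://example.org')
    print(domain, host, base)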
Example #2
def test_urlutils():
    '''Test URL manipulation tools'''
    assert extract_domain('https://httpbin.org/') == 'httpbin.org'
    assert get_base_url('https://example.org/path') == 'https://example.org'
    assert get_host_and_path('https://example.org/path') == ('https://example.org', '/path')
    assert get_hostinfo('https://example.org/path') == ('example.org', 'https://example.org')
    assert get_hostinfo('https://httpbin.org/') == ('httpbin.org', 'https://httpbin.org')
Example #3
def store_todo_links(todo, new_links, shortform=False):
    """Store the retrieved internal links in todo-list while prioritizing
       the navigation ones."""
    # add links to deque
    if todo is None:
        todo = deque()
    # prioritize navigation links
    # use the shortest links if there are no navigation links?
    for link in new_links:
        if shortform is True:
            link = get_host_and_path(link)[1]
        if is_navigation_page(link):
            todo.appendleft(link)
        else:
            todo.append(link)
    # unique list while preserving order
    return deque(OrderedDict.fromkeys(todo))
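A minimal usage sketch for store_todo_links, assuming is_navigation_page and get_host_and_path from the snippets above are available in the same module; the URLs are illustrative only.

from collections import OrderedDict, deque  # required by store_todo_links

# Hypothetical links: one navigation-style URL and one content URL.
links = ['https://example.org/category/page/2', 'https://example.org/post/hello']
todo = store_todo_links(None, links)
# Navigation links are appended on the left, so they are visited first.
print(list(todo))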
Example #4
def add_to_compressed_dict(inputlist,
                           blacklist=None,
                           url_filter=None,
                           inputdict=None):
    '''Filter and convert input URLs, then add them to a domain-aware processing dictionary'''
    # init
    if inputdict is None:
        inputdict = defaultdict(deque)
    # deduplicate while keeping order
    inputlist = list(OrderedDict.fromkeys(inputlist))
    # filter
    if blacklist:
        inputlist = [
            u for u in inputlist
            if re.sub(r'https?://', '', u) not in blacklist
        ]
    if url_filter:
        filtered_list = []
        while inputlist:
            u = inputlist.pop()
            for f in url_filter:
                if f in u:
                    filtered_list.append(u)
                    break
        inputlist = filtered_list
    # validate and store in dict
    for url in inputlist:
        # validate URL
        if validate_url(url)[0] is False:
            continue
        # segment URL and add to domain dictionary
        try:
            hostinfo, urlpath = get_host_and_path(url)
            inputdict[hostinfo].append(urlpath)
        except ValueError:
            LOGGER.warning('Could not parse URL, discarding: %s', url)
    return inputdict
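A minimal usage sketch for add_to_compressed_dict, assuming the helpers it relies on (validate_url, get_host_and_path) and the LOGGER object are available in the same module; the URLs and filter terms are illustrative only.

from collections import OrderedDict, defaultdict, deque  # used by add_to_compressed_dict
import re

urls = [
    'https://example.org/article/1',
    'https://example.org/article/1',   # duplicate, removed by deduplication
    'https://example.org/tag/news',    # filtered out by url_filter below
    'https://httpbin.org/html',
]
inputdict = add_to_compressed_dict(urls, url_filter=['article', 'html'])
# Expected content (host order may vary): base URLs mapped to deques of paths,
# e.g. 'https://example.org' -> deque(['/article/1'])
#      'https://httpbin.org' -> deque(['/html'])
for host, paths in inputdict.items():
    print(host, list(paths))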
Example #5
def cli_crawler(args, n=30, domain_dict=None):
    '''Start a focused crawler which downloads a fixed number of URLs within a website
       and prints the links found in the process'''
    config = use_config(filename=args.config_file)
    sleep_time = config.getfloat('DEFAULT', 'SLEEP_TIME')
    counter, crawlinfo, backoff_dict = None, {}, {}
    # load input URLs
    if domain_dict is None:
        domain_dict = load_input_dict(args)
    # load crawl data
    for website in domain_dict:
        homepage = website + domain_dict[website].popleft()
        crawlinfo[website] = {}
        (domain_dict[website], crawlinfo[website]['known'],
         crawlinfo[website]['base'], crawlinfo[website]['count'],
         crawlinfo[website]['rules']) = init_crawl(
             homepage, None, set(),
             language=args.target_language, shortform=True)
        # update info
        # TODO: register changes?
        # if base_url != website:
        # ...
    # iterate until the threshold is reached
    while domain_dict:
        bufferlist, download_threads, domain_dict, backoff_dict = load_download_buffer(
            domain_dict, backoff_dict, sleep_time, threads=args.parallel)
        # start several threads
        for url, result in buffered_downloads(bufferlist,
                                              download_threads,
                                              decode=False):
            website, _ = get_host_and_path(url)
            crawlinfo[website]['count'] += 1
            # handle result
            if result is not None and result != '':
                (domain_dict[website], crawlinfo[website]['known'],
                 htmlstring) = process_response(
                     result,
                     domain_dict[website],
                     crawlinfo[website]['known'],
                     crawlinfo[website]['base'],
                     args.target_language,
                     shortform=True,
                     rules=crawlinfo[website]['rules'])
                # only store content pages, not navigation
                if not is_navigation_page(url):  # + response.geturl()
                    if args.list:
                        write_result(url, args)
                    else:
                        counter = process_result(htmlstring, args, url,
                                                 counter, config)
                # just in case a crawl delay is specified in robots.txt
                sleep(get_crawl_delay(crawlinfo[website]['rules']))
                #else:
                #    LOGGER.debug('No result for URL: %s', url)
                #    if args.archived is True:
                #        errors.append(url)
        # early exit if maximum count is reached
        if any(dictvalue['count'] >= n for dictvalue in crawlinfo.values()):
            break
    # print results
    for website in sorted(domain_dict):
        for urlpath in sorted(domain_dict[website]):
            sys.stdout.write(website + urlpath + '\n')
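Running cli_crawler requires the full CLI setup (parsed args, config file, download utilities), so instead of a call example, here is a sketch of the data structures it maintains, inferred from the code above; the concrete values are illustrative.

from collections import deque

# domain_dict maps a base URL to a deque of paths still to be visited.
domain_dict = {'https://example.org': deque(['/'])}

# crawlinfo keeps per-site bookkeeping filled in by init_crawl().
crawlinfo = {
    'https://example.org': {
        'known': set(),                 # URLs already seen
        'base': 'https://example.org',  # canonical base URL of the site
        'count': 0,                     # downloads so far, capped by n
        'rules': None,                  # robots.txt rules object, if any
    }
}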