def test_run(path, links_all, links_unvisited, httpbin, recurse):
    c = Crawler(httpbin.url)
    c.run(url=path, recurse=recurse[0], throttle=recurse[1])
    # Every expected link should have been discovered and parsed.
    assert c.links.keys() == links_all.keys()
    for k, v in c.links.items():
        assert 'soup' in v
        if recurse[0] is False:
            # Without recursion, nothing beyond the start URL is visited.
            assert 'visited' not in v
        else:
            if k.startswith('http'):
                # External (absolute) links are recorded but never visited.
                assert 'visited' not in v
            else:
                assert 'visited' in v
    if recurse[0] is False:
        assert sorted(list(c.unvisited_links)) == sorted(links_unvisited)
    else:
        # A recursive crawl should leave no internal links unvisited.
        assert list(c.unvisited_links) == []
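The test relies on fixtures (path, links_all, links_unvisited, recurse) defined elsewhere, presumably in a conftest.py. A minimal sketch of what the recurse fixture might look like, assuming it is parametrized over (recurse, throttle) pairs so the test exercises both the flat and recursive paths (the parameter values here are illustrative, not the project's actual fixtures):

# conftest.py (hypothetical sketch; the real fixtures live elsewhere)
import pytest

# Each param is a (recurse, throttle) pair: the test reads recurse[0] as the
# recursion flag and recurse[1] as the throttle passed to Crawler.run().
# A throttle of 0 keeps the test suite fast.
@pytest.fixture(params=[(False, 0), (True, 0)])
def recurse(request):
    return request.param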
import argparse

# Crawler and Sitemap are assumed to be importable from the project's own
# modules; adjust these paths to match the actual package layout.
from crawler import Crawler
from sitemap import Sitemap


def main():
    parser = argparse.ArgumentParser(
        description='Generate an XML sitemap for a domain'
    )
    parser.add_argument(
        'domain', type=str, help='domain to crawl'
    )
    parser.add_argument('-f', '--file', help='write the xml to a file')
    parser.add_argument(
        '-t', '--throttle', type=int,
        help='max time in secs to wait between requesting URLs'
    )
    parser.add_argument(
        '-l', '--limit', type=int,
        help='max number of URLs to crawl'
    )
    parser.add_argument(
        '-q', '--quiet', action='store_true'
    )
    args = parser.parse_args()

    crawler = Crawler(args.domain, args.quiet, args.throttle, args.limit)
    # Only build a Sitemap when an output file was requested; initialising
    # it to None avoids a NameError when -f/--file is omitted.
    sitemap = None
    if args.file:
        sitemap = Sitemap(args.file, args.quiet)
    crawler.run(recurse=True)
    if sitemap:
        sitemap.run(crawler.domain_links)


if __name__ == '__main__':
    main()
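Run from the command line, an invocation might look like the following (the script name here is assumed; substitute the project's actual entry point):

python sitemap.py example.com -f sitemap.xml -t 2 -l 500 -q

When -t/--throttle or -l/--limit is omitted, argparse passes None through to Crawler, which presumably treats that as "no throttle" and "no limit".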