Example #1
def worker(query, working, process, loops, page_limit, timeout=1):
    # Performs the initial crawl to build archive_dict.pkl (required by article_extractor.py).
    if not os.path.exists(path + 'archive_dict.pkl'):
        crawl(query, path, page_limit)
    while working:
        for i in range(loops):
            crawl(query, path, page_limit)
            time.sleep(timeout)
        if process:
            extract(path)
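A minimal usage sketch (not from the source): worker blocks for as long as working is truthy, so it would typically run on a background thread or process. The query string and numeric arguments below are placeholders, and path, crawl, and extract are assumed to be defined at module level as in the snippet above.

import threading

crawler_thread = threading.Thread(
    target=worker,
    args=('web scraping', True, True, 5, 10),  # query, working, process, loops, page_limit
    kwargs={'timeout': 1},
    daemon=True,
)
crawler_thread.start()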
Example #2
def perform_pagerank(url):
    dictionary, web_graph, text = webcrawler.crawl(url)
    indices, probabilities = compute_probability_matrix(web_graph)
    pagerank = compute_pagerank(indices, probabilities)
    print_stats(dictionary, web_graph)

    return (dictionary, text, pagerank, indices)
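The helpers called by perform_pagerank are not shown in this example. Below is a hedged power-iteration sketch of what compute_pagerank might do, assuming compute_probability_matrix returns an N x N column-stochastic numpy transition matrix aligned with indices; the damping factor and iteration count are assumptions, not taken from the source.

import numpy as np

def compute_pagerank(indices, probabilities, damping=0.85, iterations=50):
    # Power iteration: start from a uniform rank vector and repeatedly apply
    # the damped transition matrix for a fixed number of iterations.
    n = len(indices)
    rank = np.full(n, 1.0 / n)
    for _ in range(iterations):
        rank = (1.0 - damping) / n + damping * probabilities.dot(rank)
    return rank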
def test_parse_generic_page_with_container_classes():
    data = webcrawler.crawl(
        'https://as.com/motor/2017/07/15/formula_1/1500113537_936620.html',
        content_container_class='int-articulo',
        image_container_class='cont-img-dest-art',
    )

    assert data is not None
    assert data.get('headline') == (
        'Hamilton y Vettel se citan para la pole con Alonso 11º y Sainz 17º'
    )
    assert data.get('paragraph') == (
        'Es esa lluvia fina, como esos trabajos en los que te vas dando '
        'golpes, uno y otro y otro… Y al final acabas saltando, dejando las '
        'llaves del coche de empresa, el pórtatil en la mesa, metiendo el '
        'marco con la foto de tu familia en una caja y largándote a buscar '
        'otra vida. Una mejor, si es posible. Así es esa lluvia de Inglaterra '
        'que mojó Silverstone en la parte final de los terceros entrenamientos'
        ' libres del GP de Inglaterra. Y por eso en esos últimos minutos con '
        'los neumáticos intermedios hubo varias salidas de pista, porque esa '
        'lluvia no es torrencial, pero duele.'
    )
    assert data.get('image_url') == '//as01.epimg.net/motor/imagenes/2017/07/15/formula_1/1500113537_936620_1500113633_noticia_normal.jpg'  # noqa

    filename = '{}-{}.json'.format(
        datetime.now().strftime('%Y-%m-%d-%H:%M'),
        slugify(data.get('headline'))
    )
    os.remove('data/{}'.format(filename))
def test_persisted_data():
    data = webcrawler.crawl('https://en.wikipedia.org/wiki/Donald_Trump')
    filename = '{}-{}.json'.format(
        datetime.now().strftime('%Y-%m-%d-%H:%M'),
        slugify(data.get('headline'))
    )

    with open('data/{}'.format(filename), 'r') as file:
        persisted_data = json.loads(file.read())
        assert data.get('headline') == persisted_data.get('headline')
        assert data.get('paragraph') == persisted_data.get('paragraph')
        assert data.get('image_url') == persisted_data.get('image_url')

    os.remove('data/{}'.format(filename))
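For context, the tests above assume the crawler persists each result to data/<timestamp>-<slug>.json. A hedged sketch of that convention follows; persist is a hypothetical name (the actual writer lives inside webcrawler), and slugify is the same helper the tests already import.

import json
import os
from datetime import datetime

def persist(data, directory='data'):
    # Write the crawled result to data/<timestamp>-<slug>.json, mirroring the
    # filename the tests rebuild before removing the file.
    os.makedirs(directory, exist_ok=True)
    filename = '{}-{}.json'.format(
        datetime.now().strftime('%Y-%m-%d-%H:%M'),
        slugify(data.get('headline')),
    )
    with open(os.path.join(directory, filename), 'w') as file:
        file.write(json.dumps(data))
    return filename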
def test_parse_wikipedia_page():
    data = webcrawler.crawl('https://en.wikipedia.org/wiki/Donald_Trump')

    assert data is not None
    assert data.get('headline') == 'Donald Trump'
    assert data.get('paragraph') == (
        'Donald John Trump (born June 14, 1946) is the 45th and current '
        'President of the United States, in office since January 20, 2017. '
        'Before entering politics, he was a businessman and television '
        'personality.'
    )
    assert data.get('image_url') == '//upload.wikimedia.org/wikipedia/commons/thumb/0/0e/Donald_Trump_Pentagon_2017.jpg/220px-Donald_Trump_Pentagon_2017.jpg'  # noqa

    filename = '{}-{}.json'.format(
        datetime.now().strftime('%Y-%m-%d-%H:%M'),
        slugify(data.get('headline'))
    )
    os.remove('data/{}'.format(filename))
Example #6
    def no_mp_helper(self, page_url_list, url_count, already_processed,
                     nprocess, page_level):
        '''
        Generate all urls scraped from a given list of urls without using multiprocessing.

        The crawl function is called on each url in the given list to collect the urls linked
        from that page. Since we start from a single url and branch out to the urls linked on
        each page, the result forms a tree of urls, so recursion is used to walk each level.

        param page_url_list: list of string urls
        param url_count: running count of urls scraped from web pages so far
        param already_processed: list of urls (strings) that have already been scraped
        param nprocess: number of processes
        param page_level: number of nested page levels to recurse through (for testing purposes).
            Without this limit, the crawl only stops when the pages contain no further urls.

        return total_count: total number of urls found across the entire nested crawl
        '''
        if len(page_url_list) == 0 or page_level == 0:
            return url_count

        url_list = []
        total_count = [len(page_url_list)]

        for url in page_url_list:
            if url in already_processed:
                continue

            # Add the url to the list of processed urls.
            already_processed.append(url)

            # Returns the list of urls scraped from the current url.
            result = crawl(url)

            # Accumulate the counts returned by each recursive call.
            total_count.append(
                self.no_mp_helper(result, url_count + len(result),
                                  already_processed, nprocess, page_level - 1))

        return sum(total_count)
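A hedged invocation sketch: starting from a single seed url and recursing two levels deep. Scraper is a hypothetical name for the class that owns no_mp_helper; the seed url and level count are placeholders.

scraper = Scraper()
total = scraper.no_mp_helper(
    page_url_list=['https://en.wikipedia.org/wiki/Donald_Trump'],
    url_count=0,
    already_processed=[],
    nprocess=1,
    page_level=2,
)
print('total urls discovered:', total)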
Example #7
            args.output_dir))

if args.seed_url:
    try:
        os.mkdir(os.path.join(args.output_dir, HTML_DIRECTORY))
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise e
        pass
    print "Triggering webcrawler.\n\tCreated '{}' directory to save raw html.".format(
        HTML_DIRECTORY)
    webcrawler.crawl({
        'BASE_DIRECTORY': args.output_dir,
        'DEPTH': args.depth,
        'DEPTH_FIRST_CRAWL': args.depth_first_crawl,
        'KEYWORD': args.keyword,
        'LIMIT': args.limit,
        'POLITENESS': args.politeness,
        'SEED_URL': args.seed_url
    })

if args.pagerank:
    in_link_file_path = os.path.join(args.output_dir, args.in_link)
    if os.path.isfile(in_link_file_path):
        print "Triggering pagerank."
        pagerank.generate_pagerank({
            'BASE_DIRECTORY': args.output_dir,
            'CONVERGENCE': args.convergence,
            'IN_LINK_FILE': args.in_link,
            'ITERATIONS': args.iterations
        })
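The args object used above is not defined in this snippet. A hedged sketch of the argument parser it implies is shown below; the option names mirror the attributes used above, while the defaults are assumptions.

import argparse

parser = argparse.ArgumentParser(description='Run the web crawler and/or PageRank.')
parser.add_argument('--seed-url', dest='seed_url')
parser.add_argument('--output-dir', dest='output_dir', default='output')
parser.add_argument('--depth', type=int, default=3)
parser.add_argument('--depth-first-crawl', action='store_true')
parser.add_argument('--keyword')
parser.add_argument('--limit', type=int, default=1000)
parser.add_argument('--politeness', type=float, default=1.0)
parser.add_argument('--pagerank', action='store_true')
parser.add_argument('--in-link', dest='in_link', default='in_links.txt')
parser.add_argument('--convergence', type=float, default=0.0001)
parser.add_argument('--iterations', type=int, default=100)
args = parser.parse_args()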
    try:
        bus = float(bus)
    except ValueError:
        print('Please enter in a number.\n')
        continue 

    # Get the number of times to repeat the request
    count = raw_input('How many times to repeat: ')
    # Try to convert it to a float
    try:
        count = float(count)
    except ValueError:
        print('Please enter in a number.\n')
        continue
    count = int(count)
    for i in range(1, count+1):      
        # Run our time.sleep() command,
        # and show the before and after time
        print('\nREPEATING No. ' + str(i))
        print('Before: %s' % time.ctime())
        webcrawler.crawl(int(bus), 1)
        time.sleep(num)
        print('After: %s\n' % time.ctime())
        print('END REPEATING No. ' + str(i) + '\n')

# TO DO:
# add wait method (wait for 60 seconds) √
# while loop √
# regex to extract and format info √
# a better interface √
# more parameters to method √
def test_parse_generic_page_without_container_classes():
    data = webcrawler.crawl(
        'https://as.com/motor/2017/07/15/formula_1/1500113537_936620.html',
    )

    assert data is None
def test_url_not_found():
    try:
        webcrawler.crawl('http://www.google.de/this-page-does-not-exist')
    except Exception as e:
        assert type(e) == requests.exceptions.HTTPError
        assert e.response.status_code == 404
    else:
        # Fail explicitly: otherwise the test would silently pass if no exception is raised.
        assert False, 'expected an HTTPError for a missing page'
Example #11
def crawl_most_read_articles():
    for article in MOST_READ_ARTICLES:
        webcrawler.crawl(article)
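MOST_READ_ARTICLES is a module-level constant not shown here; presumably it is a list of article urls, for example the pages used elsewhere in these examples:

MOST_READ_ARTICLES = [
    'https://en.wikipedia.org/wiki/Donald_Trump',
    'https://as.com/motor/2017/07/15/formula_1/1500113537_936620.html',
]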