Example #1
# deque and datetime are needed by the loop below.
from collections import deque
from datetime import datetime

# crawler, url_categorizer, base_categorizer and save_links are project
# helpers defined elsewhere.


def run_crawler(url, allows_foreign_urls=False):
    deque_urls = deque([url])
    broken_urls = []
    all_links = {}
    while deque_urls:
        url = deque_urls.popleft()
        # Skip URLs already known to be broken.
        if url in broken_urls:
            continue
        crawled_data = crawler(url)
        if allows_foreign_urls:
            # Queue foreign URLs for crawling and remember the broken ones.
            deque_urls += deque(crawled_data['foreign_urls'])
            broken_urls += crawled_data['broken_urls']
        all_links[crawled_data['base_url']] = crawled_data
        data = {
            'base_url': crawled_data['base_url'],
            'processed_urls': url_categorizer(crawled_data['processed_urls']),
            'updated_at': datetime.now()
        }
        # base_categorizer is assumed to return a mapping of base URL -> URLs.
        foreign_data = base_categorizer(crawled_data['foreign_urls'])
        save_data = {
            key: url_categorizer(value)
            for key, value in foreign_data.items()
        }
        save_data.update({crawled_data['base_url']: data})

        for key, value in save_data.items():
            save_links(
                key,  # base url
                value  # all links grouped under that base url
            )
    return all_links
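
The shape of the dictionary returned by crawler() is not shown above; a minimal sketch of the structure the function appears to assume, with purely illustrative URLs, might look like this:

crawled_data = {
    'base_url': 'https://example.com',
    'processed_urls': ['https://example.com/about', 'https://example.com/blog'],
    'foreign_urls': ['https://other-site.org/'],
    'broken_urls': ['https://example.com/404'],
}

all_links = run_crawler('https://example.com', allows_foreign_urls=True)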
Example #2
def test_saving_links_in_mongo():
    # Save the sample links through the module under test.
    mongo.save_links(SAMPLE_LINKS)

    # Read every stored document back and compare against the sample data.
    cursor = mongo.LinksCollection.find()
    fetched_links = [line['url'] for line in cursor]

    for line in SAMPLE_LINKS:
        assert line in fetched_links
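
The SAMPLE_LINKS fixture is not defined in this snippet; since the assertion compares its entries against the 'url' field of each stored document, it is presumably a list of URL strings, roughly like the following (values illustrative only):

SAMPLE_LINKS = [
    'https://example.com/',
    'https://example.com/about',
]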
Example #3
    def test_saving_links_in_mongo(self):
        # if flush == True: mongo._db.links.delete_many({})
        # The function we are testing
        mongo.save_links(self.SAMPLE_LINKS)

        cursor = mongo._db.links.find()
        fetched_links = [line['url'] for line in cursor]
        for line in self.SAMPLE_LINKS:
            assert line in fetched_links
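
Both tests talk to MongoDB through the project's mongo helper. As a hedged sketch, assuming mongo._db is a pymongo Database object (as Example #3 suggests), the third-party mongomock package could stand in for a live server so the tests run entirely in memory:

import mongomock

def setup_module(module):
    # Swap the real database for an in-memory mongomock equivalent;
    # the attribute name _db is taken from Example #3 above.
    mongo._db = mongomock.MongoClient().test_crawler_db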