from collections import deque
from datetime import datetime


def run_crawler(url, allows_foreign_urls=False):
    deque_urls = deque([url])
    broken_urls = []
    all_links = {}

    while deque_urls:
        url = deque_urls.popleft()
        if url in broken_urls:
            continue

        crawled_data = crawler(url)

        if allows_foreign_urls:
            deque_urls += deque(crawled_data['foreign_urls'])

        broken_urls += crawled_data['broken_urls']
        all_links[crawled_data['base_url']] = crawled_data

        data = {
            'base_url': crawled_data['base_url'],
            'processed_urls': url_categorizer(crawled_data['processed_urls']),
            'updated_at': datetime.now()
        }

        # Assumes base_categorizer returns a mapping of base URL -> URLs
        foreign_data = base_categorizer(crawled_data['foreign_urls'])
        save_data = {
            key: url_categorizer(value)
            for key, value in foreign_data.items()
        }
        save_data.update({crawled_data['base_url']: data})

        for key, value in save_data.items():
            save_links(
                key,    # base url
                value   # all links
            )

    return all_links
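A quick, hypothetical call to run_crawler might look like the following; the seed URL is purely illustrative, and it assumes crawler, url_categorizer, base_categorizer and save_links are importable from the project. The returned dictionary maps each crawled base URL to its crawl results.

if __name__ == '__main__':
    # Crawl a single site and inspect what was collected.
    results = run_crawler('https://example.com', allows_foreign_urls=False)
    for base_url, crawl in results.items():
        print(base_url, len(crawl['processed_urls']), 'links processed')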
def test_saving_links_in_mongo():
    # The function we are testing
    mongo.save_links(SAMPLE_LINKS)

    cursor = mongo.LinksCollection.find()
    fetched_links = [line['url'] for line in cursor]

    for line in SAMPLE_LINKS:
        assert line in fetched_links
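The test above leans on a small mongo helper module and a SAMPLE_LINKS fixture that are not shown here. A minimal sketch, assuming pymongo and a local MongoDB instance, could look like this (database, collection and fixture values are illustrative; note that run_crawler above calls save_links with two arguments, so the single-argument form here only mirrors what this particular test exercises):

# mongo.py -- hypothetical helper module the test imports
from pymongo import MongoClient

_client = MongoClient('localhost', 27017)
_db = _client['crawler']
LinksCollection = _db.links

def save_links(links):
    # Store each URL as its own document so the test can read back line['url'].
    LinksCollection.insert_many([{'url': link} for link in links])

# Fixture of URLs the test feeds into save_links
SAMPLE_LINKS = [
    'https://example.com/',
    'https://example.com/about',
    'https://example.com/contact',
]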
def test_saving_links_in_mongo(self):
    # Flush the collection so earlier runs don't leak into this test
    mongo._db.links.delete_many({})

    # The function we are testing
    mongo.save_links(self.SAMPLE_LINKS)

    cursor = mongo._db.links.find()
    fetched_links = [line['url'] for line in cursor]

    for line in self.SAMPLE_LINKS:
        assert line in fetched_links
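Since this variant is a method that reads self.SAMPLE_LINKS, it presumably sits inside a test class. One way to scaffold it, with the flush moved into setUp so every test starts from an empty collection, might be the following sketch (class name and fixture values are assumptions, reusing the mongo module sketched above):

import unittest

import mongo  # the hypothetical helper module sketched above

class TestMongoStorage(unittest.TestCase):
    SAMPLE_LINKS = [
        'https://example.com/',
        'https://example.com/about',
    ]

    def setUp(self):
        # Start every test from an empty links collection.
        mongo._db.links.delete_many({})

    def test_saving_links_in_mongo(self):
        mongo.save_links(self.SAMPLE_LINKS)
        fetched_links = [line['url'] for line in mongo._db.links.find()]
        for line in self.SAMPLE_LINKS:
            assert line in fetched_links

if __name__ == '__main__':
    unittest.main()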