def test_bing_crawl_fake(self):
    """Test the Bing top-10 search crawler in fake (offline) mode.

    Every keyword set should yield the same canned URL list when
    the crawler is constructed with fake=True.
    """
    keyword_sets = [
        ['new', 'york', 'university'],
        ['Torsten', 'Suel'],
        ['Amazon', 'Full', 'Time'],
        ['Bing', 'Ann', 'Arbor'],
    ]
    expected = ['http://engineering.nyu.edu', 'http://www.nyu.edu']
    for keyword in keyword_sets:
        crawler = BingWebCrawler(keyword, fake=True)
        self.assertTrue(crawler.query() == expected)
def test_bing_crawl(self):
    """Test the Bing top-10 search crawler against the live service.

    Issues a real query (fake=False) for each keyword set and checks
    that every returned URL passes self.validate_urls.
    """
    keywords = [['new', 'york', 'university'], ['Torsten', 'Suel'],
                ['Amazon', 'Full', 'Time'], ['Bing', 'Ann', 'Arbor']]
    for key in keywords:
        bs = BingWebCrawler(key, fake=False)
        urls = bs.query()
        # The original `print urls` statement is a SyntaxError under
        # Python 3; the call form below behaves identically on Python 2
        # and matches the print(...) style used elsewhere in this file.
        print(urls)
        self.assertTrue(self.validate_urls(urls))
def test_bing_crawl_fake(self):
    '''Verify BingWebCrawler returns the canned URL list when fake=True.'''
    expected_urls = ['http://engineering.nyu.edu', 'http://www.nyu.edu']
    for terms in (['new', 'york', 'university'],
                  ['Torsten', 'Suel'],
                  ['Amazon', 'Full', 'Time'],
                  ['Bing', 'Ann', 'Arbor']):
        self.assertTrue(
            BingWebCrawler(terms, fake=True).query() == expected_urls)
def run(self):
    """Run the dispatcher.

    Seeds the URL queue from Google (falling back to Bing), then runs
    the page crawler, log writer, and progress reporter on daemon
    threads and waits for them before closing the logger.
    """
    # Try Google first for the seed URLs.
    google = GoogleWebCrawler(self.keywords, self.args.fake)
    seed_urls = google.query()
    # An error count with no results means a network-level failure,
    # not an empty result set — abort instead of falling back to Bing.
    if not seed_urls and google.error > 0:
        print('Network Error. Please check network connection.')
        return
    if not seed_urls:
        # Google gave nothing; retry the same keywords on Bing.
        bing = BingWebCrawler(self.keywords, self.args.fake)
        seed_urls = bing.query()
        if not seed_urls:
            print(
                'See crawl failed. Please check network connection or contact the author.'
            )
            return
    self.bulk_url_enqueue(seed_urls)

    # Spin up the worker threads in order: crawler, log writer,
    # progress reporter. All are daemonic so they cannot block exit.
    workers = []
    for target in (self.run_page_crawler, self.run_log_writter,
                   self.run_progress_reporter):
        worker = threading.Thread(target=target)
        worker.daemon = True
        worker.start()
        workers.append(worker)
    crawler_thread, logger_thread, reporter_thread = workers

    # Wait for crawling and logging to finish.
    crawler_thread.join()
    logger_thread.join()
    # Finalize statistical metrics before shutting down the reporter.
    self.stats.finalize()
    # Close reporter, then the log writer.
    reporter_thread.join()
    self.logger.close()
def run(self): ''' run the dispatcher ''' # crawl google web search engine gs = GoogleWebCrawler(self.keywords, self.args.fake) urls = gs.query() if not urls and gs.error > 0: print('Network Error. Please check network connection.') return if not urls: bs = BingWebCrawler(self.keywords, self.args.fake) urls = bs.query() if not urls: print('See crawl failed. Please check network connection or contact the author.') return self.bulk_url_enqueue(urls) # launch the crawler thread t_crawler = threading.Thread(target=self.run_page_crawler) t_crawler.daemon = True t_crawler.start() # launch the log writer thread t_logger = threading.Thread(target=self.run_log_writter) t_logger.daemon = True t_logger.start() # launch the progress reporter t_reporter = threading.Thread(target=self.run_progress_reporter) t_reporter.daemon = True t_reporter.start() # wait for the workers to finish t_crawler.join() t_logger.join() # finalize statistical metrics self.stats.finalize() # close reporter t_reporter.join() # close logger self.logger.close()