def test_next_url_new_domain_block(self):
    """Scheduling a URL from a new domain queues it after the seed URL."""
    seed = "https://my.example.com/"
    other = "https://www.example.com/"
    # Constructor calls upon schedule_url
    schedule = Scheduler(seed)
    schedule.schedule_url(other)
    # The seed's domain block is served first, the new domain's second.
    self.assertEqual(schedule.next_url(), seed)
    self.assertEqual(schedule.next_url(), other)
def test_schedule_multiple_url_different_subdomain(self):
    """URLs on different subdomains land in separate crawl blocks."""
    # Constructor calls upon schedule_url
    schedule = Scheduler("https://my.example.com/path/to/location")
    schedule.schedule_url("https://www.example.com/path/to")
    # Pop both blocks: they must be distinct and each must yield a URL.
    first_block = schedule.blocks_to_crawl.pop()
    second_block = schedule.blocks_to_crawl.pop()
    self.assertNotEqual(first_block, second_block)
    self.assertTrue(first_block.next_url())
    self.assertTrue(second_block.next_url())
def trade():
    """Run the trading scheduler until completion or Ctrl-C.

    Creates a dedicated event loop (``asyncio.get_event_loop()`` is
    deprecated when no loop is running) and always closes it afterwards,
    which the original version never did.
    """
    logger = get_logger()
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    scheduler = Scheduler()
    try:
        loop.run_until_complete(schedule_trading(scheduler))
    except KeyboardInterrupt:
        logger.debug('Trading interrupted...')
    finally:
        # The original left cleanup as an open question ("cleanup once
        # again?"); at minimum the loop must be closed to release its
        # resources.
        loop.close()
async def refresh_data():
    """Rebuild all cached resources; triggered from the web app."""
    logger = get_logger()
    logger.info('Refreshing initiated from webapp')

    # Build the task list from a freshly initialized factory.
    factory = Factory()
    await factory.init_cache()
    factory.load_resources()

    scheduler = Scheduler(tasks=factory.create())
    await scheduler.run_tasks()
    await scheduler.cleanup()
async def test_daily_tasks():
    """A daily task forced to report ready is executed by the scheduler."""
    scheduler = Scheduler()

    async def noop_job():
        # Minimal coroutine standing in for a real daily job.
        print('Running dummy task...')
        await asyncio.sleep(0.1)

    task = ScheduledTask(task=noop_job, scheduled_time='23:45')
    scheduler.add_daily_tasks([task])
    # Override the readiness check so the test does not wait until 23:45.
    task.is_ready = lambda: True
    await scheduler.run_daily_tasks()
    assert task.done
async def test_run_task_exceptions(resource, caplog):
    """A failing task must not propagate; its error surfaces via logging.

    The original version only printed debug markers ('asdf') and the
    captured records without asserting anything; the leftover debug
    output is removed and the log capture is actually checked.
    """
    import logging

    bad_task = BadGrabber(resource=resource)
    scheduler = Scheduler()
    scheduler.add_tasks([bad_task])

    with caplog.at_level(logging.INFO):
        # The scheduler should swallow the task's exception and log it
        # rather than letting it escape run_tasks().
        await scheduler.run_tasks()

    # The failure must leave a trace in the captured log output.
    assert caplog.records
async def test_working_time():
    """With the mocked config in place, the scheduler reports working time."""
    scheduler = Scheduler()
    # Patch config loading so update_config() sees the test configuration.
    with mock.patch('crawler.scheduler.get_config', _get_config_mock):
        await scheduler.update_config()
        assert scheduler.working_time is True
def test_next_url(self):
    """The seed URL given to the constructor comes back from next_url."""
    seed = "https://my.example.com/path/to/location"
    # Constructor calls upon schedule_url
    schedule = Scheduler(seed)
    self.assertEqual(schedule.next_url(), seed)
import datetime
from multiprocessing import Process
from urllib.parse import urlparse

from crawler.domain import Domain
from crawler.scheduler import Scheduler
# NOTE(review): PageFetcher is used below but never imported here —
# presumably from a crawler.page_fetcher module; confirm against the
# package layout.


def main():
    """Crawl the seed URLs with a pool of PageFetcher worker processes."""
    pages_fetchers_limit = 60
    inicio = datetime.datetime.now()

    arr_str_urls_seeds = [
        "http://cnn.com/",
        "https://pt.wikipedia.org/wiki/House,_M.D./",
        "https://globoesporte.globo.com/",
    ]
    # Each seed is a (parsed_url, depth) pair starting at depth 0.
    arr_urls_seeds = [(urlparse(str_url), 0) for str_url in arr_str_urls_seeds]

    scheduler = Scheduler(
        str_usr_agent="bifaroBot",
        int_page_limit=1000,
        int_depth_limit=6,
        arr_urls_seeds=arr_urls_seeds,
    )

    pages_fetchers = [PageFetcher(scheduler) for _ in range(pages_fetchers_limit)]

    proc = []
    for pages_fetcher in pages_fetchers:
        # BUG FIX: the original passed target=pages_fetcher.run(), which
        # CALLS run() in the parent process and hands Process its return
        # value. Pass the bound method itself so each child executes run().
        p = Process(target=pages_fetcher.run)
        p.start()
        proc.append(p)

    for p in proc:
        p.join()

    fim = datetime.datetime.now()
    print(f"Tempo gasto: {(fim-inicio).total_seconds()}")


if __name__ == "__main__":
    # Guard is required for multiprocessing under spawn-based start
    # methods (Windows/macOS), otherwise children re-run the module body.
    main()
def test_schedule_multiple_url_same_netloc(self):
    """Two URLs sharing a netloc accumulate in a single crawl block."""
    # Constructor calls upon schedule_url
    schedule = Scheduler("https://www.example.com/path/to/location")
    schedule.schedule_url("https://www.example.com/path/to")
    block = schedule.blocks_to_crawl.pop()
    expected = deque(["/path/to", "/path/to/location"])
    # The internal ordering of extensions is not asserted; compare sorted.
    self.assertEqual(sorted(block.extensions_to_crawl), sorted(expected))
def test_schedule_url_already_crawled(self):
    """Re-scheduling a URL that has already been crawled is rejected."""
    url = "https://my.example.com/path/to/location"
    # Constructor calls upon schedule_url
    schedule = Scheduler(url)
    # Consume the seed so it counts as crawled.
    schedule.next_url()
    self.assertFalse(schedule.schedule_url(url))
def test_schedule_same_url_twice(self):
    """Scheduling the exact seed URL a second time is rejected."""
    url = "https://my.example.com/path/to/location"
    # Constructor calls upon schedule_url
    schedule = Scheduler(url)
    self.assertFalse(schedule.schedule_url(url))
def test_schedule_url_different_domain(self):
    """A URL under another top-level domain (.org vs .com) is rejected."""
    # Constructor calls upon schedule_url
    schedule = Scheduler("https://my.example.com/path/to/location")
    accepted = schedule.schedule_url("https://my.example.org/path/to/location")
    self.assertFalse(accepted)
def test_next_url_extensions(self):
    """Query, params and fragment survive the schedule/next_url round trip."""
    # Constructor calls upon schedule_url
    schedule = Scheduler("https://my.example.com/path/to/location")
    url_with_extras = "https://my.example.com/path/to/location?key=val;word=bird#frag"
    schedule.schedule_url(url_with_extras)
    self.assertEqual(schedule.next_url(), url_with_extras)
def test_next_url_empty_queue(self):
    """Once the queue is drained, next_url returns a falsy value."""
    # Constructor calls upon schedule_url
    schedule = Scheduler("https://my.example.com/path/to/location")
    schedule.next_url()  # consume the only scheduled URL
    self.assertFalse(schedule.next_url())
def test_schedule_one_url(self):
    """The seed URL's path is stored as the block's first extension."""
    # Constructor calls upon schedule_url
    schedule = Scheduler("https://www.example.com/path/to/location")
    block = schedule.blocks_to_crawl.pop()
    self.assertEqual(block.extensions_to_crawl[0], "/path/to/location")
def test_schedule_url_variety(self):
    """Same path with query, params and fragment counts as a new URL."""
    # Constructor calls upon schedule_url
    schedule = Scheduler("https://my.example.com/path/to/location")
    accepted = schedule.schedule_url(
        "https://my.example.com/path/to/location?key1=val1;key2=val2#frag"
    )
    self.assertTrue(accepted)
def test_schedule_url_params(self):
    """Same path with extra URL params counts as a new URL."""
    # Constructor calls upon schedule_url
    schedule = Scheduler("https://my.example.com/path/to/location")
    accepted = schedule.schedule_url("https://my.example.com/path/to/location;key=val")
    self.assertTrue(accepted)
def test_schedule_url_fragment(self):
    """Same path with a fragment counts as a new URL."""
    # Constructor calls upon schedule_url
    schedule = Scheduler("https://my.example.com/path/to/location")
    accepted = schedule.schedule_url("https://my.example.com/path/to/location#maincontent")
    self.assertTrue(accepted)