Beispiel #1
0
 def test_next_url_new_domain_block(self):
     """URLs on two different hosts are returned by next_url() in scheduling order."""
     seed_url = "https://my.example.com/"
     other_url = "https://www.example.com/"
     # The constructor schedules the seed itself; the second URL is added by hand.
     scheduler = Scheduler(seed_url)
     scheduler.schedule_url(other_url)
     for expected_url in (seed_url, other_url):
         self.assertEqual(scheduler.next_url(), expected_url)
Beispiel #2
0
 def test_schedule_multiple_url_different_subdomain(self):
     """URLs on different hosts produce two unequal blocks, each with a URL to return."""
     # The constructor schedules its argument through schedule_url.
     scheduler = Scheduler("https://my.example.com/path/to/location")
     scheduler.schedule_url("https://www.example.com/path/to")
     first_popped = scheduler.blocks_to_crawl.pop()
     second_popped = scheduler.blocks_to_crawl.pop()
     self.assertNotEqual(first_popped, second_popped)
     for block in (first_popped, second_popped):
         self.assertTrue(block.next_url())
Beispiel #3
0
def trade():
    """Drive ``schedule_trading`` on the asyncio event loop until it finishes.

    A ``KeyboardInterrupt`` raised while the loop is running is caught and
    reported at debug level instead of propagating to the caller.
    """
    log = get_logger()
    event_loop = asyncio.get_event_loop()
    trading_scheduler = Scheduler()
    try:
        event_loop.run_until_complete(schedule_trading(trading_scheduler))
    except KeyboardInterrupt:
        log.debug('Trading interrupted...')
    # NOTE: no cleanup step runs after the loop stops (none is implemented).
Beispiel #4
0
async def refresh_data():
    """Build a fresh task set through ``Factory`` and run it once on a new ``Scheduler``.

    Steps, in order: log the request, initialise the factory cache, load its
    resources, create the tasks, run them, then clean the scheduler up.
    """
    get_logger().info('Refreshing initiated from webapp')

    task_factory = Factory()
    await task_factory.init_cache()
    task_factory.load_resources()

    runner = Scheduler(tasks=task_factory.create())
    await runner.run_tasks()
    await runner.cleanup()
Beispiel #5
0
async def test_daily_tasks():
    """A daily task whose ``is_ready`` reports True is marked done by ``run_daily_tasks``."""

    async def dummy_task():
        print('Running dummy task...')
        await asyncio.sleep(0.1)

    scheduler = Scheduler()
    daily_task = ScheduledTask(task=dummy_task, scheduled_time='23:45')
    scheduler.add_daily_tasks([daily_task])

    # Stub out the readiness check so it always answers True.
    daily_task.is_ready = lambda: True
    await scheduler.run_daily_tasks()

    assert daily_task.done
Beispiel #6
0
async def test_run_task_exceptions(resource, caplog):
    """Run a scheduler holding one ``BadGrabber`` task and print the captured log records.

    NOTE(review): nothing is asserted here; the test only fails if
    ``run_tasks`` lets an exception escape.
    """
    import logging

    failing_task = BadGrabber(resource=resource)
    scheduler = Scheduler()
    scheduler.add_tasks([failing_task])

    with caplog.at_level(logging.INFO):
        await scheduler.run_tasks()

    # Debug dump of everything caplog collected, framed by marker lines.
    marker = 'asdf'
    print(marker)
    for log_record in caplog.records:
        print(log_record)
    print(marker)
Beispiel #7
0
async def test_working_time():
    """With ``get_config`` patched, ``update_config`` leaves ``working_time`` set to True."""
    sched = Scheduler()
    config_patch = mock.patch('crawler.scheduler.get_config', _get_config_mock)

    with config_patch:
        await sched.update_config()
        assert sched.working_time is True
Beispiel #8
0
 def test_next_url(self):
     """The URL given to the constructor is the first one next_url() returns."""
     seed_url = "https://my.example.com/path/to/location"
     # Constructing the Scheduler already schedules the seed URL.
     scheduler = Scheduler(seed_url)
     self.assertEqual(scheduler.next_url(), seed_url)
Beispiel #9
0
from crawler.domain import Domain
from crawler.scheduler import Scheduler
from urllib.parse import urlparse
from multiprocessing import Process

# Crawl driver: seed a Scheduler, start one process per PageFetcher, wait for
# all of them and print the elapsed wall-clock time.
# NOTE(review): `datetime` and `PageFetcher` are used below but are not
# imported in the visible lines — confirm they are imported elsewhere.
pages_fetchers = []
pages_fetchers_limit = 60  # number of fetchers (and processes) to start
inicio = datetime.datetime.now()  # start timestamp

arr_str_urls_seeds = [
    "http://cnn.com/", "https://pt.wikipedia.org/wiki/House,_M.D./",
    "https://globoesporte.globo.com/"
]
# Each seed is handed to the scheduler as a (parsed URL, 0) pair.
arr_urls_seeds = [(urlparse(str_url), 0) for str_url in arr_str_urls_seeds]
scheduler = Scheduler(str_usr_agent="bifaroBot",
                      int_page_limit=1000,
                      int_depth_limit=6,
                      arr_urls_seeds=arr_urls_seeds)

for _ in range(pages_fetchers_limit):
    pages_fetchers.append(PageFetcher(scheduler))

proc = []
for pages_fetcher in pages_fetchers:
    # Pass the bound method itself. Writing `pages_fetcher.run()` here would
    # execute the crawl in this process and hand its return value to Process,
    # so the children would have nothing to run.
    # NOTE(review): every child process gets its own copy of `scheduler`
    # unless Scheduler is built on shared-state primitives — confirm that the
    # page/depth limits are meant to apply per process.
    p = Process(target=pages_fetcher.run)
    p.start()
    proc.append(p)
for p in proc:
    p.join()

fim = datetime.datetime.now()  # end timestamp
print(f"Tempo gasto: {(fim-inicio).total_seconds()}")
Beispiel #10
0
 def test_schedule_multiple_url_same_netloc(self):
     """Two URLs sharing a netloc are queued as two path extensions of the popped block."""
     # The constructor schedules its argument through schedule_url.
     scheduler = Scheduler("https://www.example.com/path/to/location")
     scheduler.schedule_url("https://www.example.com/path/to")
     popped_block = scheduler.blocks_to_crawl.pop()
     expected_paths = deque(["/path/to", "/path/to/location"])
     # Compare order-insensitively by sorting both sides.
     actual_sorted = sorted(popped_block.extensions_to_crawl)
     self.assertEqual(actual_sorted, sorted(expected_paths))
Beispiel #11
0
 def test_schedule_url_already_crawled(self):
     """Re-scheduling a URL that next_url() has already handed out gives a falsy result."""
     crawled_url = "https://my.example.com/path/to/location"
     scheduler = Scheduler(crawled_url)  # the constructor schedules the URL
     scheduler.next_url()  # consume it
     was_scheduled = scheduler.schedule_url(crawled_url)
     self.assertFalse(was_scheduled)
Beispiel #12
0
 def test_schedule_same_url_twice(self):
     """Scheduling the seed URL a second time gives a falsy result."""
     duplicate_url = "https://my.example.com/path/to/location"
     # The first scheduling happens inside the constructor.
     scheduler = Scheduler(duplicate_url)
     self.assertFalse(scheduler.schedule_url(duplicate_url))
Beispiel #13
0
 def test_schedule_url_different_domain(self):
     """A URL on example.org is not scheduled by a Scheduler seeded on example.com."""
     # The constructor schedules the seed URL through schedule_url.
     scheduler = Scheduler("https://my.example.com/path/to/location")
     foreign_url = "https://my.example.org/path/to/location"
     self.assertFalse(scheduler.schedule_url(foreign_url))
Beispiel #14
0
 def test_next_url_extensions(self):
     """After scheduling a URL with query and fragment, next_url() returns it in full."""
     extended_url = "https://my.example.com/path/to/location?key=val;word=bird#frag"
     # The constructor schedules the plain seed URL first.
     scheduler = Scheduler("https://my.example.com/path/to/location")
     scheduler.schedule_url(extended_url)
     self.assertEqual(scheduler.next_url(), extended_url)
Beispiel #15
0
 def test_next_url_empty_queue(self):
     """Once the only scheduled URL has been consumed, next_url() returns a falsy value."""
     # The constructor schedules the seed URL through schedule_url.
     scheduler = Scheduler("https://my.example.com/path/to/location")
     scheduler.next_url()  # takes the seed URL
     leftover = scheduler.next_url()
     self.assertFalse(leftover)
Beispiel #16
0
 def test_schedule_one_url(self):
     """The seed URL's path is the first extension queued in the popped block."""
     # The constructor schedules the seed URL through schedule_url.
     scheduler = Scheduler("https://www.example.com/path/to/location")
     first_extension = scheduler.blocks_to_crawl.pop().extensions_to_crawl[0]
     self.assertEqual(first_extension, "/path/to/location")
Beispiel #17
0
 def test_schedule_url_variety(self):
     """The seed URL with a query string and fragment appended is accepted for scheduling."""
     variant_url = "https://my.example.com/path/to/location?key1=val1;key2=val2#frag"
     # The constructor schedules the plain seed URL through schedule_url.
     scheduler = Scheduler("https://my.example.com/path/to/location")
     self.assertTrue(scheduler.schedule_url(variant_url))
Beispiel #18
0
 def test_schedule_url_params(self):
     """The seed URL with a ';key=val' suffix appended is accepted for scheduling."""
     params_url = "https://my.example.com/path/to/location;key=val"
     # The constructor schedules the plain seed URL through schedule_url.
     scheduler = Scheduler("https://my.example.com/path/to/location")
     self.assertTrue(scheduler.schedule_url(params_url))
Beispiel #19
0
 def test_schedule_url_fragment(self):
     """The seed URL with a '#maincontent' fragment appended is accepted for scheduling."""
     fragment_url = "https://my.example.com/path/to/location#maincontent"
     # The constructor schedules the plain seed URL through schedule_url.
     scheduler = Scheduler("https://my.example.com/path/to/location")
     self.assertTrue(scheduler.schedule_url(fragment_url))