def test_next_url_new_domain_block(self):
    """URLs on two different subdomains are handed out by next_url in the order they were scheduled."""
    first_url = "https://my.example.com/"
    second_url = "https://www.example.com/"

    # The constructor schedules its own argument (it calls schedule_url).
    scheduler = Scheduler(first_url)
    scheduler.schedule_url(second_url)

    for expected in (first_url, second_url):
        self.assertEqual(scheduler.next_url(), expected)
def test_schedule_multiple_url_different_subdomain(self):
    """Two URLs on different subdomains produce two distinct blocks, each with a URL to give."""
    # The constructor schedules its own argument (it calls schedule_url).
    scheduler = Scheduler("https://my.example.com/path/to/location")
    scheduler.schedule_url("https://www.example.com/path/to")

    pending_blocks = scheduler.blocks_to_crawl
    last_block = pending_blocks.pop()
    earlier_block = pending_blocks.pop()

    self.assertNotEqual(last_block, earlier_block)
    for block in (last_block, earlier_block):
        self.assertTrue(block.next_url())
class Driver:
    """Owns a Scheduler and starts it with a SchedulerJob registered on it."""

    def __init__(self):
        """Create the Scheduler this driver runs."""
        self.scheduler = Scheduler()

    def run(self):
        """Register a SchedulerJob with the scheduler, then start the scheduler."""
        scheduler = self.scheduler
        job = SchedulerJob(scheduler)
        # NOTE(review): the unit of 3000 is not visible from here — confirm against Scheduler.schedule.
        scheduler.schedule(job, 3000)
        scheduler.start()
async def test_daily_tasks():
    """A daily task that reports itself ready is marked done after run_daily_tasks()."""

    async def dummy_task():
        print('Running dummy task...')
        await asyncio.sleep(0.1)

    scheduler = Scheduler()
    scheduled = ScheduledTask(task=dummy_task, scheduled_time='23:45')
    scheduler.add_daily_tasks([scheduled])

    # Force readiness; presumably this keeps the '23:45' schedule from gating the run.
    scheduled.is_ready = lambda: True

    await scheduler.run_daily_tasks()
    assert scheduled.done
async def test_run_task_exceptions(resource, caplog):
    """Run a scheduler holding a single BadGrabber task and print the captured log records.

    NOTE(review): this test makes no assertions; it only dumps ``caplog.records``
    between two 'asdf' marker lines.
    """
    import logging

    failing_task = BadGrabber(resource=resource)
    scheduler = Scheduler()
    scheduler.add_tasks([failing_task])

    # Capture everything from INFO upwards while the tasks run.
    with caplog.at_level(logging.INFO):
        await scheduler.run_tasks()

    print('asdf')
    for log_record in caplog.records:
        print(log_record)
    print('asdf')
def trade():
    """Run schedule_trading() to completion on the event loop, logging a Ctrl-C instead of raising."""
    log = get_logger()
    # NOTE(review): asyncio.get_event_loop() outside a running loop is deprecated in
    # recent Python versions — confirm the supported versions before changing it.
    event_loop = asyncio.get_event_loop()
    scheduler = Scheduler()
    try:
        event_loop.run_until_complete(schedule_trading(scheduler))
    except KeyboardInterrupt:
        log.debug('Trading interrupted...')
    # The original kept an empty ``finally`` here with the open question
    # "cleanup once again?"; nothing is cleaned up at this point.
async def refresh_data():
    """Rebuild the task list through a fresh Factory and run it once.

    Steps: log the request, initialise the factory cache, load resources,
    create the tasks, run them through a new Scheduler, then clean the
    scheduler up.

    The cleanup is performed in a ``finally`` clause so it is no longer skipped
    when ``run_tasks()`` raises; the exception still propagates to the caller.
    """
    logger = get_logger()
    logger.info('Refreshing initiated from webapp')

    factory = Factory()
    await factory.init_cache()
    factory.load_resources()

    scheduler = Scheduler(tasks=factory.create())
    try:
        await scheduler.run_tasks()
    finally:
        # Always give the scheduler a chance to clean up, even after a failed run.
        await scheduler.cleanup()
def test_schedule_multiple_url_same_netloc(self):
    """Two URLs with the same netloc share one block that holds both of their paths."""
    # The constructor schedules its own argument (it calls schedule_url).
    scheduler = Scheduler("https://www.example.com/path/to/location")
    scheduler.schedule_url("https://www.example.com/path/to")

    shared_block = scheduler.blocks_to_crawl.pop()
    expected_paths = deque(["/path/to", "/path/to/location"])

    # Order inside the block is not part of the contract being checked, so compare sorted.
    actual_sorted = sorted(shared_block.extensions_to_crawl)
    self.assertEqual(actual_sorted, sorted(expected_paths))
from crawler.domain import Domain
from crawler.scheduler import Scheduler
from urllib.parse import urlparse
from multiprocessing import Process

# Script: seed a Scheduler, start one process per PageFetcher, wait for all of
# them and print the elapsed wall-clock time.
#
# NOTE(review): `datetime` and `PageFetcher` are used below but are not imported
# in the visible part of the file — presumably imported elsewhere; confirm.
# NOTE(review): the multiprocessing docs recommend guarding process creation
# with `if __name__ == "__main__":` (required for the spawn start method).
# It is left unguarded here to keep the module-level behaviour unchanged.

pages_fetchers = []
pages_fetchers_limit = 60
inicio = datetime.datetime.now()

arr_str_urls_seeds = [
    "http://cnn.com/",
    "https://pt.wikipedia.org/wiki/House,_M.D./",
    "https://globoesporte.globo.com/",
]
# Each parsed seed is paired with 0 (presumably its starting depth — confirm against Scheduler).
arr_urls_seeds = [(urlparse(str_url), 0) for str_url in arr_str_urls_seeds]

scheduler = Scheduler(
    str_usr_agent="bifaroBot",
    int_page_limit=1000,
    int_depth_limit=6,
    arr_urls_seeds=arr_urls_seeds,
)

for _ in range(pages_fetchers_limit):
    pages_fetchers.append(PageFetcher(scheduler))

proc = []
for pages_fetcher in pages_fetchers:
    # Pass the bound method itself. The previous `target=pages_fetcher.run()`
    # executed run() right here in the parent and handed its return value to
    # Process, so the child processes did no work.
    # NOTE(review): each process works on its own copy of `scheduler` unless
    # Scheduler is built to share state across processes — confirm.
    p = Process(target=pages_fetcher.run)
    p.start()
    proc.append(p)

for p in proc:
    p.join()

fim = datetime.datetime.now()
print(f"Tempo gasto: {(fim-inicio).total_seconds()}")
def _load_json_titles(directory):
    """Return the "title" value of every ``*.json`` file found directly in ``directory``."""
    titles = []
    for file_name in os.listdir(directory):
        if not file_name.endswith(".json"):
            continue
        # The context manager closes the file; no explicit close() is needed.
        with open(os.path.join(directory, file_name)) as json_file:
            titles.append(json.load(json_file)["title"])
    return titles


def index(request):
    """Render ``index.html`` and handle the five action buttons posted from it.

    On a POST each button present in ``request.POST`` ("crawl-btn",
    "index-btn", "cluster-btn", "page-rank-btn", "query-btn") is handled in
    that order. If the form belonging to a button fails validation, the page
    is re-rendered immediately with only that bound form in the context.
    Otherwise, and for any non-POST request, the page is rendered with all
    five forms.

    Fixes over the previous version:
    * the invalid "index-btn" branch passed a set literal
      ``{"index_form", index_form}`` as the template context; it is now a dict;
    * every form starts out as an unbound instance, so the final render no
      longer refers to forms that were never created on a successful POST.
    """
    template_name = "index.html"

    # Unbound defaults; a POST below replaces the ones it actually submitted.
    query_form = QueryForm()
    page_rank_form = PageRankForm()
    cluster_form = ClusterForm()
    crawler_form = CrawlerForm()
    index_form = IndexForm()

    if request.method == "POST":
        if "crawl-btn" in request.POST:
            crawler_form = CrawlerForm(request.POST)
            if not crawler_form.is_valid():
                return render(request, template_name, {"crawler_form": crawler_form})
            n_docs = crawler_form.cleaned_data["n_docs"]
            in_degree = crawler_form.cleaned_data["in_degree"]
            out_degree = crawler_form.cleaned_data["out_degree"]
            starting_url = crawler_form.cleaned_data["starting_url"]
            # The field carries a comma-separated list of start URLs.
            urls = [x.strip() for x in starting_url.split(",")]
            # NOTE(review): the keyword is spelled `in_degre`; kept unchanged because
            # Scheduler's signature is not visible here — confirm.
            crawler = Scheduler(starting_url=urls, num=n_docs, in_degre=in_degree, out_degree=out_degree)
            crawler.crawl()

        if "index-btn" in request.POST:
            index_form = IndexForm(request.POST)
            if not index_form.is_valid():
                return render(request, template_name, {"index_form": index_form})
            direction = index_form.cleaned_data["direction"]
            print(direction)
            searcher.index(es, direction)

        if "cluster-btn" in request.POST:
            cluster_form = ClusterForm(request.POST)
            if not cluster_form.is_valid():
                return render(request, template_name, {"cluster_form": cluster_form})
            titles = _load_json_titles(cluster_form.cleaned_data["direction"])
            vectorizer = CountVectorizer()
            vectors = vectorizer.fit_transform(titles).todense().tolist()
            kmeans = Kmeans(vectors)
            # One run per candidate cluster count 1..len(vectors).
            k_points = list(range(1, len(vectors) + 1))
            # NOTE(review): `kmenas` looks like a typo for `kmeans`, but the Kmeans
            # class is not visible here, so the call is left as it was — confirm.
            j_points = [kmeans.kmenas(k) for k in k_points]
            plt.plot(j_points, k_points)
            plt.show()

        if "page-rank-btn" in request.POST:
            page_rank_form = PageRankForm(request.POST)
            if not page_rank_form.is_valid():
                return render(request, template_name, {"page_rank_form": page_rank_form})
            # TODO: the values are read but page rank is not computed yet.
            alpha = page_rank_form.cleaned_data["alpha"]
            threshold = page_rank_form.cleaned_data["threshold"]

        if "query-btn" in request.POST:
            query_form = QueryForm(request.POST)
            if not query_form.is_valid():
                return render(request, template_name, {"query_form": query_form})
            # TODO: the values are read but the query is not executed yet.
            cluster = query_form.cleaned_data["cluster"]
            pagerank = query_form.cleaned_data["pagerank"]
            query = query_form.cleaned_data["query"]

    return render(
        request,
        template_name,
        {
            "query_form": query_form,
            "page_rank_form": page_rank_form,
            "cluster_form": cluster_form,
            "crawler_form": crawler_form,
            "index_form": index_form,
        },
    )
# Create a Scheduler with no arguments and keep it on the instance as `self.scheduler`.
def __init__(self): self.scheduler = Scheduler()
def test_schedule_same_url_twice(self):
    """Scheduling the exact URL the Scheduler was built with is reported as not scheduled."""
    # The constructor has already passed its argument through schedule_url.
    scheduler = Scheduler("https://my.example.com/path/to/location")
    self.assertFalse(scheduler.schedule_url("https://my.example.com/path/to/location"))
def test_schedule_url_fragment(self):
    """A URL that differs from the constructor's only by a fragment is still scheduled."""
    # The constructor has already passed its argument through schedule_url.
    scheduler = Scheduler("https://my.example.com/path/to/location")
    accepted = scheduler.schedule_url("https://my.example.com/path/to/location#maincontent")
    self.assertTrue(accepted)
def test_next_url_extensions(self):
    """next_url returns the later-scheduled URL, with its query string and fragment intact."""
    url_with_extras = "https://my.example.com/path/to/location?key=val;word=bird#frag"

    # The constructor has already passed its argument through schedule_url.
    scheduler = Scheduler("https://my.example.com/path/to/location")
    scheduler.schedule_url(url_with_extras)

    self.assertEqual(scheduler.next_url(), url_with_extras)
def test_schedule_one_url(self):
    """The constructor's URL ends up as the first path extension of the popped block."""
    # The constructor schedules its own argument (it calls schedule_url).
    scheduler = Scheduler("https://www.example.com/path/to/location")
    only_block = scheduler.blocks_to_crawl.pop()
    self.assertEqual(only_block.extensions_to_crawl[0], "/path/to/location")
def test_next_url_empty_queue(self):
    """Once the only scheduled URL has been taken, next_url returns something falsy."""
    # The constructor schedules its own argument (it calls schedule_url).
    scheduler = Scheduler("https://my.example.com/path/to/location")
    scheduler.next_url()  # consumes the URL scheduled by the constructor
    leftover = scheduler.next_url()
    self.assertFalse(leftover)
def test_next_url(self):
    """next_url hands back the URL the Scheduler was constructed with."""
    start_url = "https://my.example.com/path/to/location"
    # The constructor schedules its own argument (it calls schedule_url).
    scheduler = Scheduler(start_url)
    self.assertEqual(scheduler.next_url(), start_url)
def test_schedule_url_variety(self):
    """A URL adding a query string and a fragment to the constructor's URL is scheduled."""
    # The constructor has already passed its argument through schedule_url.
    scheduler = Scheduler("https://my.example.com/path/to/location")
    accepted = scheduler.schedule_url("https://my.example.com/path/to/location?key1=val1;key2=val2#frag")
    self.assertTrue(accepted)
def test_schedule_url_params(self):
    """A URL adding ``;key=val`` path parameters to the constructor's URL is scheduled."""
    # The constructor has already passed its argument through schedule_url.
    scheduler = Scheduler("https://my.example.com/path/to/location")
    accepted = scheduler.schedule_url("https://my.example.com/path/to/location;key=val")
    self.assertTrue(accepted)
async def test_working_time():
    """With get_config replaced by the mock, update_config() leaves working_time set to True."""
    scheduler = Scheduler()
    config_patch = mock.patch('crawler.scheduler.get_config', _get_config_mock)
    # The patch stays active for both the update and the check.
    with config_patch:
        await scheduler.update_config()
        assert scheduler.working_time is True
def test_schedule_url_already_crawled(self):
    """A URL already handed out by next_url is not scheduled a second time."""
    crawled_url = "https://my.example.com/path/to/location"
    # The constructor schedules its own argument (it calls schedule_url).
    scheduler = Scheduler(crawled_url)
    scheduler.next_url()  # take the URL out of the queue
    rescheduled = scheduler.schedule_url(crawled_url)
    self.assertFalse(rescheduled)
def test_schedule_url_different_domain(self):
    """A URL on example.org is not scheduled by a Scheduler built on example.com."""
    # The constructor has already passed its argument through schedule_url.
    scheduler = Scheduler("https://my.example.com/path/to/location")
    accepted = scheduler.schedule_url("https://my.example.org/path/to/location")
    self.assertFalse(accepted)