Example #1
def test_next_url_new_domain_block(self):
    my_example = "https://my.example.com/"
    www_example = "https://www.example.com/"
    schedule = Scheduler(my_example) # Constructor calls upon schedule_url
    schedule.schedule_url(www_example)
    self.assertEqual(schedule.next_url(), my_example)
    self.assertEqual(schedule.next_url(), www_example)
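Several of the examples in this collection are unittest methods excerpted from their test class, so the enclosing class and imports are not shown. A minimal sketch of that scaffolding for Example #1, assuming the Scheduler import path and an arbitrary test-class name, could look like this:

import unittest

from crawler.scheduler import Scheduler  # import path assumed; adjust to the actual package layout


class SchedulerTest(unittest.TestCase):
    def test_next_url_new_domain_block(self):
        # same test body as Example #1 above
        my_example = "https://my.example.com/"
        www_example = "https://www.example.com/"
        schedule = Scheduler(my_example)  # Constructor calls upon schedule_url
        schedule.schedule_url(www_example)
        self.assertEqual(schedule.next_url(), my_example)
        self.assertEqual(schedule.next_url(), www_example)


if __name__ == "__main__":
    unittest.main()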
Example #2
def test_schedule_multiple_url_different_subdomain(self):
    schedule = Scheduler("https://my.example.com/path/to/location") # Constructor calls upon schedule_url
    schedule.schedule_url("https://www.example.com/path/to")
    www_example = schedule.blocks_to_crawl.pop()
    my_example = schedule.blocks_to_crawl.pop()
    self.assertNotEqual(www_example, my_example)
    self.assertTrue(www_example.next_url())
    self.assertTrue(my_example.next_url())
Example #3
class Driver:
    def __init__(self):
        self.scheduler = Scheduler()

    def run(self):
        scheduler_job = SchedulerJob(self.scheduler)
        self.scheduler.schedule(scheduler_job, 3000)
        self.scheduler.start()
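A brief usage sketch for the Driver wrapper above; the __main__ guard is illustrative and not part of the original example:

if __name__ == "__main__":
    # construct the Driver, schedule its SchedulerJob, and start the scheduler
    Driver().run()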
Example #4
async def test_daily_tasks():
    scheduler = Scheduler()

    async def dummy_task():
        print('Running dummy task...')
        await asyncio.sleep(0.1)

    task = ScheduledTask(task=dummy_task, scheduled_time='23:45')
    scheduler.add_daily_tasks([task])

    task.is_ready = lambda: True
    await scheduler.run_daily_tasks()
    assert task.done
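Because test_daily_tasks is a coroutine, something has to drive it on an event loop; in a pytest suite that is typically the pytest-asyncio plugin. A minimal sketch for running it directly, assuming the definitions above are importable:

import asyncio

# outside a pytest-asyncio setup, the coroutine test can be driven manually
asyncio.run(test_daily_tasks())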
Example #5
async def test_run_task_exceptions(resource, caplog):
    bad_task = BadGrabber(resource=resource)
    scheduler = Scheduler()

    scheduler.add_tasks([bad_task])

    import logging
    with caplog.at_level(logging.INFO):
        await scheduler.run_tasks()

    print('asdf')
    for record in caplog.records:
        print(record)
    print('asdf')
Example #6
def trade():
    logger = get_logger()
    loop = asyncio.get_event_loop()
    scheduler = Scheduler()
    try:
        loop.run_until_complete(schedule_trading(scheduler))
    except KeyboardInterrupt:
        logger.debug('Trading interrupted...')
    finally:
        # cleanup once again?
        pass
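On Python 3.7+, the same structure is usually written with asyncio.run, which creates and closes the event loop itself. A sketch reusing get_logger, Scheduler, and schedule_trading from the example above:

import asyncio


def trade():
    logger = get_logger()
    scheduler = Scheduler()
    try:
        # asyncio.run owns the event loop: it creates it, runs the coroutine,
        # and closes the loop when the coroutine finishes or is interrupted
        asyncio.run(schedule_trading(scheduler))
    except KeyboardInterrupt:
        logger.debug('Trading interrupted...')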
Example #7
async def refresh_data():
    logger = get_logger()
    logger.info('Refreshing initiated from webapp')
    factory = Factory()
    await factory.init_cache()
    factory.load_resources()
    tasks = factory.create()
    scheduler = Scheduler(tasks=tasks)

    await scheduler.run_tasks()
    await scheduler.cleanup()
Example #8
def test_schedule_multiple_url_same_netloc(self):
    schedule = Scheduler("https://www.example.com/path/to/location") # Constructor calls upon schedule_url
    schedule.schedule_url("https://www.example.com/path/to")
    block = schedule.blocks_to_crawl.pop()
    correct_output = deque(["/path/to", "/path/to/location"])
    self.assertEqual(sorted(block.extensions_to_crawl), sorted(correct_output))
Example #9
import datetime

from crawler.domain import Domain
from crawler.page_fetcher import PageFetcher  # assumed module path: PageFetcher is used below but not imported in the original snippet
from crawler.scheduler import Scheduler
from urllib.parse import urlparse
from multiprocessing import Process

pages_fetchers = []
pages_fetchers_limit = 60
inicio = datetime.datetime.now()

arr_str_urls_seeds = [
    "http://cnn.com/", "https://pt.wikipedia.org/wiki/House,_M.D./",
    "https://globoesporte.globo.com/"
]
arr_urls_seeds = [(urlparse(str_url), 0) for str_url in arr_str_urls_seeds]
scheduler = Scheduler(str_usr_agent="bifaroBot",
                      int_page_limit=1000,
                      int_depth_limit=6,
                      arr_urls_seeds=arr_urls_seeds)

for a in range(0, pages_fetchers_limit):
    pages_fetchers.append(PageFetcher(scheduler))

proc = []
for pages_fetcher in pages_fetchers:
    # pass the method itself as the target; calling run() here would execute the
    # fetcher in the parent process before Process even starts
    p = Process(target=pages_fetcher.run)
    p.start()
    proc.append(p)
for p in proc:
    p.join()

fim = datetime.datetime.now()
print(f"Elapsed time: {(fim-inicio).total_seconds()}")
Example #10
def index(request):
    template_name = "index.html"
    if request.method == "POST":
        if "crawl-btn" in request.POST:
            crawler_form = CrawlerForm(request.POST)
            if crawler_form.is_valid():
                n_docs = crawler_form.cleaned_data["n_docs"]
                in_degree = crawler_form.cleaned_data["in_degree"]
                out_degree = crawler_form.cleaned_data["out_degree"]
                starting_url = crawler_form.cleaned_data["starting_url"]
                urls = [x.strip() for x in starting_url.split(",")]
                crawler = Scheduler(starting_url=urls, num=n_docs, in_degre=in_degree, out_degree=out_degree)
                crawler.crawl()
            else:
                return render(request, template_name, {"crawler_form": crawler_form})
        if "index-btn" in request.POST:
            index_form = IndexForm(request.POST)
            if index_form.is_valid():
                direction = index_form.cleaned_data["direction"]
                print(direction)
                searcher.index(es, direction)

            else:
                return render(request, template_name, {"index_form": index_form})
        if "cluster-btn" in request.POST:
            cluster_form = ClusterForm(request.POST)
            if cluster_form.is_valid():
                path_to_json = cluster_form.cleaned_data["direction"]
                titles = []
                json_files = [pos_json for pos_json in os.listdir(path_to_json) if pos_json.endswith(".json")]
                for js in json_files:
                    with open(os.path.join(path_to_json, js)) as json_file:
                        data = json.load(json_file)
                        titles.append(data["title"])
                        json_file.close()
                vectorizer = CountVectorizer()
                vectors = vectorizer.fit_transform(titles).todense().tolist()
                kmeans = Kmeans(vectors)
                k_points = []
                j_points = []
                for i in range(len(vectors)):
                    k_points.append(i + 1)
                    j_points.append(kmeans.kmenas(i + 1))

                plt.plot(j_points, k_points)
                plt.show()
            else:
                return render(request, template_name, {"cluster_form": cluster_form})
        if "page-rank-btn" in request.POST:
            page_rank_form = PageRankForm(request.POST)
            if page_rank_form.is_valid():
                alpha = page_rank_form.cleaned_data["alpha"]
                threshold = page_rank_form.cleaned_data["threshold"]

            else:
                return render(request, template_name, {"page_rank_form": page_rank_form})
        if "query-btn" in request.POST:
            query_form = QueryForm(request.POST)
            if query_form.is_valid():
                cluster = query_form.cleaned_data["cluster"]
                pagerank = query_form.cleaned_data["pagerank"]
                query = query_form.cleaned_data["query"]
                pass
            else:
                return render(request, template_name, {"query_form": query_form})

    else:
        query_form = QueryForm()
        page_rank_form = PageRankForm()
        cluster_form = ClusterForm()
        crawler_form = CrawlerForm()
        index_form = IndexForm()
        return render(
            request,
            template_name,
            {
                "query_form": query_form,
                "page_rank_form": page_rank_form,
                "cluster_form": cluster_form,
                "crawler_form": crawler_form,
                "index_form": index_form,
            },
        )
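The view above pulls n_docs, in_degree, out_degree, and starting_url out of CrawlerForm's cleaned_data. A minimal sketch of such a form, where only the field names come from the view and the types and constraints are assumptions:

from django import forms


class CrawlerForm(forms.Form):
    # field names taken from the view above; types and validation are assumptions
    n_docs = forms.IntegerField(min_value=1)
    in_degree = forms.IntegerField(min_value=0)
    out_degree = forms.IntegerField(min_value=0)
    starting_url = forms.CharField(help_text="comma-separated list of seed URLs")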
Example #11
def __init__(self):
    self.scheduler = Scheduler()
Example #12
def test_schedule_same_url_twice(self):
    schedule = Scheduler("https://my.example.com/path/to/location") # Constructor calls upon schedule_url
    has_been_scheduled = schedule.schedule_url("https://my.example.com/path/to/location")
    self.assertFalse(has_been_scheduled)
Example #13
def test_schedule_url_fragment(self):
    schedule = Scheduler("https://my.example.com/path/to/location") # Constructor calls upon schedule_url
    has_been_scheduled = schedule.schedule_url("https://my.example.com/path/to/location#maincontent")
    self.assertTrue(has_been_scheduled)
Example #14
def test_next_url_extensions(self):
    schedule = Scheduler("https://my.example.com/path/to/location") # Constructor calls upon schedule_url
    schedule.schedule_url("https://my.example.com/path/to/location?key=val;word=bird#frag")
    correct_output = "https://my.example.com/path/to/location?key=val;word=bird#frag"
    self.assertEqual(schedule.next_url(), correct_output)
Example #15
def test_schedule_one_url(self):
    schedule = Scheduler("https://www.example.com/path/to/location") # Constructor calls upon schedule_url
    block = schedule.blocks_to_crawl.pop()
    ext = block.extensions_to_crawl[0]
    correct_output = "/path/to/location"
    self.assertEqual(ext, correct_output)
Example #16
def test_next_url_empty_queue(self):
    schedule = Scheduler("https://my.example.com/path/to/location") # Constructor calls upon schedule_url
    schedule.next_url()
    self.assertFalse(schedule.next_url())
Example #17
def test_next_url(self):
    schedule = Scheduler("https://my.example.com/path/to/location") # Constructor calls upon schedule_url
    correct_output = "https://my.example.com/path/to/location"
    self.assertEqual(schedule.next_url(), correct_output)
Example #18
def test_schedule_url_variety(self):
    schedule = Scheduler("https://my.example.com/path/to/location") # Constructor calls upon schedule_url
    has_been_scheduled = schedule.schedule_url("https://my.example.com/path/to/location?key1=val1;key2=val2#frag")
    self.assertTrue(has_been_scheduled)
Example #19
def test_schedule_url_params(self):
    schedule = Scheduler("https://my.example.com/path/to/location") # Constructor calls upon schedule_url
    has_been_scheduled = schedule.schedule_url("https://my.example.com/path/to/location;key=val")
    self.assertTrue(has_been_scheduled)
Example #20
async def test_working_time():
    scheduler = Scheduler()

    with mock.patch('crawler.scheduler.get_config', _get_config_mock):
        await scheduler.update_config()
        assert scheduler.working_time is True
Example #21
def test_schedule_url_already_crawled(self):
    url = "https://my.example.com/path/to/location"
    schedule = Scheduler(url) # Constructor calls upon schedule_url
    schedule.next_url()
    self.assertFalse(schedule.schedule_url(url))
Example #22
def test_schedule_url_different_domain(self):
    schedule = Scheduler("https://my.example.com/path/to/location") # Constructor calls upon schedule_url
    has_been_scheduled = schedule.schedule_url("https://my.example.org/path/to/location")
    self.assertFalse(has_been_scheduled)