# Snippet of a Django management command; assumes `import os` and that
# CrawlInfo, InformationDownloader, and the crawl_author_pages Celery task
# are imported from the project's own modules.
def handle(self, *args, **options):
    # Record this crawl run; 'url' and 'branch_factor' arrive as lists
    # (argparse nargs), so take the first element of each.
    crawl_info = CrawlInfo(init_url=options['url'][0], limit=options['limit'],
                           type="author", i_limit=0, o_limit=options['branch_factor'][0])
    crawl_info.save()

    # One directory per crawl for the downloaded author pages.
    if not os.path.exists("managed_data/crawled_authors/%d" % crawl_info.id):
        os.makedirs("managed_data/crawled_authors/%d" % crawl_info.id)

    # Resolve the seed author's id and queue the crawl as a Celery task.
    author_id = InformationDownloader.get_researcher_id_from_url(crawl_info.init_url)
    crawl_author_pages.delay(crawl_info.id, author_id)
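The indexing options['url'][0] and options['branch_factor'][0] suggests those command-line arguments are declared with nargs=1, while 'limit' parses to a scalar. A minimal sketch of the surrounding command class, assuming positional arguments (an assumption; they could equally be declared as --options):

from django.core.management.base import BaseCommand

class Command(BaseCommand):
    # Hypothetical declaration matching handle() above: 'url' and
    # 'branch_factor' use nargs=1 so they surface as one-element lists,
    # while 'limit' is a plain value.
    def add_arguments(self, parser):
        parser.add_argument('url', nargs=1, type=str)
        parser.add_argument('limit', type=int)
        parser.add_argument('branch_factor', nargs=1, type=int)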
def crawl_author_page(request):
    # A GET carrying a newline-separated 'urls' parameter starts a crawl;
    # without it, fall through and render the submission form.
    if request.GET.get('urls') is not None:
        urls = request.GET.get('urls').split("\n")
        crawl_info = CrawlInfo(init_url=request.GET.get('urls'), limit=request.GET.get('limit'),
                               i_limit=0, o_limit=request.GET.get('branch_factor'), type='author')
        crawl_info.save()

        # One directory per crawl for the downloaded author pages.
        if not os.path.exists("managed_data/crawled_authors/%d" % crawl_info.id):
            os.makedirs("managed_data/crawled_authors/%d" % crawl_info.id)

        # Queue one Celery task per submitted author URL.
        for url in urls:
            author_id = InformationDownloader.get_researcher_id_from_url(url)
            crawl_author_pages.delay(crawl_info.id, author_id)

        return redirect("/crawl/status/%d/" % crawl_info.id)

    return render(request, 'crawl_authors.html')
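Both snippets enqueue the same crawl_author_pages task with .delay(crawl_info.id, author_id), so the task must accept those two arguments. A minimal skeleton of that task, assuming a standard Celery setup (the body is an illustrative placeholder, not the project's actual implementation):

from celery import shared_task

@shared_task
def crawl_author_pages(crawl_info_id, author_id):
    # Signature inferred from the .delay(crawl_info.id, author_id) calls above.
    # The real body, which fetches the author's pages and stores them under
    # managed_data/crawled_authors/<crawl_info_id>/, lives in the project.
    crawl_info = CrawlInfo.objects.get(id=crawl_info_id)
    ...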