import logging

from celery import shared_task  # may be `from celery import task` on older Celery versions

# Project-local imports; module paths are assumed and may differ in this app.
from .models import Job, Term, Site, Result
from .crawler import init_controller

logger = logging.getLogger(__name__)


@shared_task
def WebscraperTask(job_id):
    """
    Celery task that spawns a process to perform a scraping job.
    """
    logger.debug("Entering WebscraperTask with job ID: %s", job_id)

    # Obtain all required crawler arguments.
    job = Job.objects.get(pk=job_id)
    word_list = [(term.term_id, term.term)
                 for term in Term.objects.filter(job=job)]
    site_list = [(site.site_id, site.url, job.max_depth, 'English')
                 for site in Site.objects.filter(job=job)]

    # Initialize the crawling process.
    logger.debug("Configuring crawl controller with job_id: %d, terms: %s, sites: %s",
                 job_id, word_list, site_list)
    controller = init_controller()
    controller.crawl(job_id, site_list, word_list)
    logger.debug("Crawl complete for job: %d", job_id)

    # Cache the results in the database so the results are dated as close
    # to the crawl date as possible.
    results = controller.query(site_list, word_list)
    result_cache = Result()
    result_cache.job = job
    result_cache.output = results
    result_cache.save()
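For reference, a minimal sketch of how this task might be queued from a Django view, assuming the `@shared_task` decoration above (which provides Celery's `.delay()` shortcut); the `start_job` view and the `Job.objects.create()` fields are hypothetical and not part of the application shown here.

from django.http import HttpResponse


def start_job(request):
    # Hypothetical dispatch sketch: create the Job (fields assumed) and
    # queue the crawl asynchronously instead of running it in the request.
    job = Job.objects.create(max_depth=3)
    WebscraperTask.delay(job.pk)  # .delay() is Celery's shortcut for apply_async()
    return HttpResponse("Job %d queued" % job.pk)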
import csv

from django.http import Http404, HttpResponse

# Project-local imports; module paths are assumed and may differ in this app.
from .models import Job, Result
from .utils import format_row


def job_result(request, id):
    """
    Retrieve the results of the requested Word Scraper job as a CSV file.
    """
    try:
        job = Job.objects.get(result_id=id)
    except Job.DoesNotExist:
        raise Http404

    # The results were cached by the crawl task, so the view only needs to
    # read them back and serialize them to CSV.
    results = Result.objects.get(job=job).output

    response = HttpResponse(content_type='text/csv')
    response['Content-Disposition'] = ('attachment; filename="%s_results.csv"'
                                       % job.job_id)
    writer = csv.writer(response)
    writer.writerow(['url', 'accessed', 'depth', 'domain', 'site_id',
                     'status', 'term_matches'])
    for key in sorted(results,
                      key=lambda x: (results[x]['site_id'],
                                     results[x]['depth'], x)):
        writer.writerow(format_row(key, results))
    return response
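The view relies on a `format_row` helper that is not shown here. A plausible sketch follows, assuming each entry in the cached `results` mapping is a dict keyed by URL whose fields mirror the CSV header; the exact shape of that mapping is an assumption, grounded only in the `site_id`/`depth` lookups used for sorting above.

def format_row(key, results):
    """
    Build one CSV row for the page identified by `key`.

    Sketch of the helper the view expects; field names beyond `site_id`
    and `depth` are assumed from the header row written by the view.
    """
    page = results[key]
    return [
        key,                       # url (the dictionary key)
        page.get('accessed'),      # when the page was fetched
        page.get('depth'),         # crawl depth at which it was found
        page.get('domain'),
        page.get('site_id'),
        page.get('status'),        # HTTP status code
        page.get('term_matches'),  # per-term match counts
    ]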