def handle(self, **options):
    """Run the crawl daemon: close out any stale crawls, then loop forever
    crawling new websites and refreshing websites whose last crawl is old.

    Never returns; intended to run as a long-lived management command.
    """
    logging.info("starting crawl daemon")

    # A crawl left with finished=None means a previous daemon run died
    # mid-crawl; mark it finished so it isn't treated as still in progress.
    logging.info("setting any unfinished crawls to finished")
    for c in m.Crawl.objects.filter(finished=None):
        c.finished = datetime.datetime.now()
        c.save()

    while True:
        # look for websites that haven't been updated in the last 4 hrs
        # TODO: make 4 hours configurable in settings.py
        recently = datetime.datetime.now() - datetime.timedelta(hours=4)
        for website in m.Website.objects.all():
            # first look for any new websites that haven't been crawled yet
            # in the interim, since updating a bunch of sites could
            # cause new sites not to be crawled for quite a bit of time
            # (this re-queries on every iteration on purpose, so a new site
            # added mid-pass is picked up before the next refresh)
            for new_website in m.Website.objects.filter(crawls=None):
                logging.info("new website to crawl: %s" % new_website.name)
                crawl(new_website)

            # now crawl any sites that need to be updated
            last_crawl = website.last_crawl()
            if last_crawl and last_crawl.finished < recently:
                logging.info("refreshing links for %s" % website.name)
                crawl(website)

        # TODO: should be configurable in settings.py
        time.sleep(10)
def test_harvest(self):
    """Crawl a known site, then verify users are loaded from the crawl."""
    # get a website with some user pages
    site = m.Website(name='Europeana', url='http://www.europeana.eu')
    site.save()
    crawl(site)

    # the crawl should have discovered at least one wikipedia category
    self.assertTrue(m.WikipediaCategory.objects.all().count() > 0)

    # first load: everything is created, nothing is updated
    created, updated = load_users()
    self.assertTrue(created > 0)
    self.assertEqual(updated, 0)
    self.assertTrue(m.WikipediaUser.objects.all().count() > 1)

    # spot-check that edit counts were populated
    first_user = m.WikipediaUser.objects.all()[0]
    self.assertTrue(first_user.edit_count > 0)
def handle(self, **options):
    """Run a single crawl pass: close out stale crawls, crawl brand new
    websites first, then crawl the rest ordered by least-recently-crawled.
    """
    logging.info("starting crawl daemon")

    # A crawl left with finished=None means a previous run died mid-crawl;
    # mark it finished so it isn't treated as still in progress.
    logging.info("setting any unfinished crawls to finished")
    for c in m.Crawl.objects.filter(finished=None):
        c.finished = datetime.datetime.now()
        c.save()

    # crawl brand new websites first, remembering which ones we handled
    new_ids = []
    for new_website in m.Website.objects.filter(crawls=None):
        logging.info("new website to crawl: %s" % new_website.name)
        crawl(new_website)
        new_ids.append(new_website.id)

    # now crawl the rest, least-recently-crawled first; exclude the sites
    # crawled above -- the queryset is evaluated lazily, so by now they
    # have crawls attached and would otherwise be crawled twice in one run
    websites = m.Website.objects.exclude(id__in=new_ids)
    websites = websites.annotate(most_recent_crawl=Max("crawls__started"))
    websites = websites.order_by("most_recent_crawl")
    for website in websites:
        crawl(website)