Example #1
    def handle(self, **options):
        logging.info("starting crawl daemon")

        logging.info("setting any unfinished crawls to finished")
        for c in m.Crawl.objects.filter(finished=None):
            c.finished = datetime.datetime.now()
            c.save()

        while True:
           
            # look for websites that haven't been updated in the last 4 hrs 
            # TODO: make 4 hours configurable in settings.py

            recently = datetime.datetime.now() - datetime.timedelta(hours=4)
            for website in m.Website.objects.all():

                # first look for any new websites that haven't been crawled yet
                # in the interim, since updating a bunch of sites could
                # cause new sites not to be crawled for quite a bit of time
                for new_website in m.Website.objects.filter(crawls=None):
                    logging.info("new website to crawl: %s" % new_website.name)
                    crawl(new_website)

                # now crawl any sites that need to be updated 
                last_crawl = website.last_crawl()
                if last_crawl and last_crawl.finished < recently:
                    logging.info("refreshing links for %s" % website.name)
                    crawl(website)

                # TODO: should be configurable in settings.py
                time.sleep(10)
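This handle() method appears to come from a Django management command. A minimal sketch of the wrapper it would sit in, assuming a standard BaseCommand; the "myapp" import paths for the models alias m and the crawl() helper are placeholders, not taken from the example:

    import datetime
    import logging
    import time

    from django.core.management.base import BaseCommand

    from myapp import models as m        # placeholder path for the project's models module
    from myapp.crawler import crawl      # placeholder path for the project's crawl() helper


    class Command(BaseCommand):
        help = "Run the crawl daemon"

        def handle(self, **options):
            ...  # body as shown in the example above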
Example #2
    def test_harvest(self):
        # get a website with some user pages
        website = m.Website(name='Europeana', url='http://www.europeana.eu')
        website.save()
        crawl(website)

        self.assertTrue(m.WikipediaCategory.objects.all().count() > 0)

        created, updated = load_users()
        self.assertTrue(created > 0)
        self.assertEqual(updated, 0)

        self.assertTrue(m.WikipediaUser.objects.all().count() > 1)
        u = m.WikipediaUser.objects.all()[0]
        self.assertTrue(u.edit_count > 0)
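The test method above presumably lives in a Django TestCase; a minimal sketch of that wrapper under that assumption, with placeholder import paths for the project-specific names (m, crawl, load_users):

    from django.test import TestCase

    from myapp import models as m              # placeholder path
    from myapp.crawler import crawl            # placeholder path
    from myapp.wikipedia import load_users     # placeholder path


    class HarvestTests(TestCase):
        def test_harvest(self):
            ...  # body as shown in the example above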
Example #3
    def handle(self, **options):
        logging.info("starting crawl daemon")

        logging.info("setting any unfinished crawls to finished")
        for c in m.Crawl.objects.filter(finished=None):
            c.finished = datetime.datetime.now()
            c.save()

        # crawl brand new websites first
        for new_website in m.Website.objects.filter(crawls=None):
            logging.info("new website to crawl: %s" % new_website.name)
            crawl(new_website)

        # now crawl the rest
        websites = m.Website.objects.all()
        websites = websites.annotate(most_recent_crawl=Max("crawls__started"))
        websites = websites.order_by("most_recent_crawl")
        for website in websites:
            crawl(website)
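The annotate/order_by step relies on Django's Max aggregate, so the snippet also needs that import; written as a single chained queryset (same behavior, just a stylistic alternative) it reads:

    from django.db.models import Max

    # Each Website gains a most_recent_crawl attribute holding the latest
    # crawls__started timestamp; ordering by it visits the stalest sites first.
    websites = (
        m.Website.objects
        .annotate(most_recent_crawl=Max("crawls__started"))
        .order_by("most_recent_crawl")
    )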