def testCrawlJobsScheduledChecked(self):
    """Exercise the mark_job_scheduled / mark_job_checked round trip.

    Schedules a batch of feed URLs, verifies one CrawlJobModel row exists
    per URL, then simulates a crawl of each URL and confirms the etag and
    last-modified metadata are persisted on the returned record.
    """
    urls = [u"http://feeds.feedburner.com/43folders",
            u"http://advocacy.python.org/podcasts/littlebit.rss",
            u"http://friendfeed.com/alawrence?format=atom",
            u"http://feeds.feedburner.com/antiwar"]

    # indicate each url is scheduled to be crawled
    with transaction.manager:
        for url in urls:
            rec = scheduler.mark_job_scheduled(url)
            self.assertTrue(rec, "no rec for url %s" % url)

    # every scheduled url should have produced exactly one crawl-job row
    recs = meta.Session().query(scheduler.CrawlJobModel).all()
    self.assertEqual(len(recs), len(urls), (len(recs), len(urls)))

    # pretend we crawled each url and update its record; previously this
    # only exercised the last url via a leaked loop variable
    with transaction.manager:
        for url in urls:
            etag = str(uuid.uuid4())
            last_modified = datetime.datetime.now()
            rec = scheduler.mark_job_checked(url,
                                             etag=etag,
                                             last_modified=last_modified)
            self.assertTrue(rec, "no rec for url %s" % url)
            self.assertEqual(etag, rec.etag)
            self.assertEqual(last_modified, rec.last_modified)
def testAssumeChecked(self):
    """ coverage if url comes in that hasn't been scheduled, assume it was checked and just persist/return the record """
    with transaction.manager:
        # url was never scheduled; mark_job_checked should still hand
        # back a record, which we persist in the session
        record = scheduler.mark_job_checked("http://gooogle.com")
        meta.Session().add(record)
    self.assert_(record)