def test_coverage_record(self):
    edition, pool = self._edition(with_license_pool=True)
    data_source = edition.data_source

    # No preexisting coverage record.
    coverage = CoverageRecord.lookup(edition, data_source)
    eq_(None, coverage)

    last_update = datetime.datetime(2015, 1, 1)

    m = Metadata(data_source=data_source, title=u"New title",
                 data_source_last_updated=last_update)
    m.apply(edition)

    # Applying the metadata creates a CoverageRecord with the
    # metadata's update date as its timestamp.
    coverage = CoverageRecord.lookup(edition, data_source)
    eq_(last_update, coverage.timestamp)
    eq_(u"New title", edition.title)

    # Metadata with an older update date does not overwrite the
    # newer data or the CoverageRecord's timestamp.
    older_last_update = datetime.datetime(2014, 1, 1)
    m = Metadata(data_source=data_source, title=u"Another new title",
                 data_source_last_updated=older_last_update)
    m.apply(edition)
    eq_(u"New title", edition.title)

    coverage = CoverageRecord.lookup(edition, data_source)
    eq_(last_update, coverage.timestamp)

    # Unless the application is forced.
    m.apply(edition, force=True)
    eq_(u"Another new title", edition.title)
    coverage = CoverageRecord.lookup(edition, data_source)
    eq_(older_last_update, coverage.timestamp)
def test_import_one_feed(self):
    # Check that coverage records are created.
    monitor = OPDSImportMonitor(self._db, "http://url",
                                DataSource.OA_CONTENT_SERVER,
                                DoomedOPDSImporter)
    data_source = DataSource.lookup(self._db, DataSource.OA_CONTENT_SERVER)

    feed = self.content_server_mini_feed

    monitor.import_one_feed(feed, "http://root-url/")

    editions = self._db.query(Edition).all()

    # One edition has been imported.
    eq_(1, len(editions))
    [edition] = editions

    # That edition has a CoverageRecord.
    record = CoverageRecord.lookup(
        edition.primary_identifier, data_source,
        operation=CoverageRecord.IMPORT_OPERATION
    )
    eq_(CoverageRecord.SUCCESS, record.status)
    eq_(None, record.exception)

    # The edition's primary identifier has a cover link whose
    # relative URL has been resolved relative to the URL we passed
    # into import_one_feed.
    [cover] = [
        x.resource.url for x in edition.primary_identifier.links
        if x.rel == Hyperlink.IMAGE
    ]
    eq_("http://root-url/full-cover-image.png", cover)

    # The 202 status message in the feed caused a transient failure.
    # The exception caused a persistent failure.
    coverage_records = self._db.query(CoverageRecord).filter(
        CoverageRecord.operation == CoverageRecord.IMPORT_OPERATION,
        CoverageRecord.status != CoverageRecord.SUCCESS
    )
    eq_(
        sorted([CoverageRecord.TRANSIENT_FAILURE,
                CoverageRecord.PERSISTENT_FAILURE]),
        sorted([x.status for x in coverage_records])
    )

    identifier, ignore = Identifier.parse_urn(
        self._db,
        "urn:librarysimplified.org/terms/id/Gutenberg%20ID/10441"
    )
    failure = CoverageRecord.lookup(
        identifier, data_source,
        operation=CoverageRecord.IMPORT_OPERATION
    )
    assert "Utter failure!" in failure.exception
def test_items_that_need_coverage_respects_operation(self):
    # Create a CoverageRecord with no operation set.
    record, ignore = CoverageRecord.add_for(self.identifier, self.output_source)

    # Here's a provider that carries out the 'foo' operation.
    provider = AlwaysSuccessfulCoverageProvider(
        "Always successful", self.input_identifier_types,
        self.output_source, operation='foo'
    )

    # It is missing coverage for self.identifier, because the
    # CoverageRecord we created at the start of this test has no
    # operation.
    eq_([self.identifier], provider.items_that_need_coverage().all())

    # Here's a provider that has no operation set.
    provider = AlwaysSuccessfulCoverageProvider(
        "Always successful", self.input_identifier_types, self.output_source
    )

    # It is not missing coverage for self.identifier, because the
    # CoverageRecord we created at the start of the test takes
    # care of it.
    eq_([], provider.items_that_need_coverage().all())
def test_items_that_need_coverage(self):
    cutoff_time = datetime.datetime(2016, 1, 1)
    record, ignore = CoverageRecord.add_for(
        self.edition, self.output_source, timestamp=cutoff_time
    )

    # A provider with the same cutoff time considers the record
    # up-to-date, so nothing needs coverage.
    provider = AlwaysSuccessfulCoverageProvider(
        "Always successful", self.input_identifier_types,
        self.output_source, cutoff_time=cutoff_time
    )
    eq_([], provider.items_that_need_coverage().all())

    # A provider whose cutoff time is one second later considers the
    # record stale, so the identifier needs coverage again.
    one_second_after = cutoff_time + datetime.timedelta(seconds=1)
    provider = AlwaysSuccessfulCoverageProvider(
        "Always successful", self.input_identifier_types,
        self.output_source, cutoff_time=one_second_after
    )
    eq_([self.identifier], provider.items_that_need_coverage().all())

    # A provider with no cutoff time accepts the existing record.
    provider = AlwaysSuccessfulCoverageProvider(
        "Always successful", self.input_identifier_types, self.output_source
    )
    eq_([], provider.items_that_need_coverage().all())
def add_coverage_record_for(self, item):
    """Record this CoverageProvider's coverage for the given
    Edition/Identifier, as a CoverageRecord.
    """
    return CoverageRecord.add_for(
        item, data_source=self.data_source, operation=self.operation
    )
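# A minimal sketch (not from the original code) of where
# add_coverage_record_for fits in a provider's workflow. The method
# name process_and_record, the process_item call, and the
# CoverageFailure class name are assumptions for illustration.
def process_and_record(self, item):
    result = self.process_item(item)
    if isinstance(result, CoverageFailure):
        # Failures are recorded separately -- see to_coverage_record
        # below, which turns a failure into a failure CoverageRecord.
        return result
    # On success, record coverage so the item stops showing up in
    # items_that_need_coverage().
    self.add_coverage_record_for(item)
    return result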
def to_coverage_record(self, operation=None):
    """Convert this failure into a CoverageRecord."""
    if not self.data_source:
        raise Exception(
            "Cannot convert coverage failure to CoverageRecord because it has no output source."
        )
    record, ignore = CoverageRecord.add_for(
        self.obj, self.data_source, operation=operation
    )
    record.exception = self.exception
    if self.transient:
        record.status = CoverageRecord.TRANSIENT_FAILURE
    else:
        record.status = CoverageRecord.PERSISTENT_FAILURE
    return record
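# A hedged usage sketch for to_coverage_record. The CoverageFailure
# constructor arguments shown here are assumptions inferred from the
# attributes the method reads (obj, data_source, exception, transient);
# identifier and data_source are assumed to exist in scope.
failure = CoverageFailure(
    obj=identifier, data_source=data_source,
    exception="Timeout fetching metadata", transient=True
)
record = failure.to_coverage_record(operation=CoverageRecord.IMPORT_OPERATION)
assert record.status == CoverageRecord.TRANSIENT_FAILURE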
def import_one_feed(self, feed, feed_url=None):
    imported_editions, pools, works, failures = self.importer.import_from_feed(
        feed, even_if_no_author=True,
        immediately_presentation_ready=self.immediately_presentation_ready,
        feed_url=feed_url
    )

    data_source = self.importer.data_source

    # Create CoverageRecords for the successful imports.
    for edition in imported_editions:
        record, ignore = CoverageRecord.add_for(
            edition, data_source, CoverageRecord.IMPORT_OPERATION,
            status=CoverageRecord.SUCCESS
        )

    # Create CoverageRecords for the failures.
    for urn, failure in failures.items():
        failure.to_coverage_record(operation=CoverageRecord.IMPORT_OPERATION)
def test_run_on_specific_identifiers_respects_cutoff_time(self):
    last_run = datetime.datetime(2016, 1, 1)

    # Once upon a time we successfully added coverage for
    # self.identifier.
    record, ignore = CoverageRecord.add_for(self.identifier, self.output_source)
    record.timestamp = last_run

    # But now something has gone wrong, and if we ever run the
    # coverage provider again we will get a persistent failure.
    provider = NeverSuccessfulCoverageProvider(
        "Persistent failure", self.input_identifier_types,
        self.output_source, cutoff_time=last_run
    )

    # You might think this would result in a persistent failure...
    (success, transient_failure, persistent_failure), records = (
        provider.run_on_specific_identifiers([self.identifier])
    )

    # ...but we get an automatic success. We didn't even try to
    # run the coverage provider on self.identifier because the
    # coverage record was up-to-date.
    eq_(1, success)
    eq_(0, persistent_failure)
    eq_([], records)

    # But if we move the cutoff time forward, the provider will run
    # on self.identifier and fail.
    provider.cutoff_time = datetime.datetime(2016, 2, 1)
    (success, transient_failure, persistent_failure), records = (
        provider.run_on_specific_identifiers([self.identifier])
    )
    eq_(0, success)
    eq_(1, persistent_failure)

    # The formerly successful CoverageRecord will be updated to
    # reflect the failure.
    eq_(records[0], record)
    eq_("What did you expect?", record.exception)
def test_should_update(self):
    cutoff = datetime.datetime(2016, 1, 1)
    provider = AlwaysSuccessfulCoverageProvider(
        "Always successful", self.input_identifier_types,
        self.output_source, cutoff_time=cutoff
    )

    # If coverage is missing, we should update.
    eq_(True, provider.should_update(None))

    # If coverage is outdated, we should update.
    record, ignore = CoverageRecord.add_for(self.identifier, self.output_source)
    record.timestamp = datetime.datetime(2015, 1, 1)
    eq_(True, provider.should_update(record))

    # If coverage is up-to-date, we should not update.
    record.timestamp = cutoff
    eq_(False, provider.should_update(record))
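# The cutoff logic that test_should_update pins down, restated as a
# standalone sketch for reference. This is an illustration of the
# behavior the test asserts, not the production implementation.
def should_update_sketch(provider, coverage_record):
    if coverage_record is None:
        # No coverage at all: always update.
        return True
    if provider.cutoff_time and coverage_record.timestamp < provider.cutoff_time:
        # Coverage predates the cutoff: it's stale, so update.
        return True
    # Coverage is at or after the cutoff: nothing to do.
    return False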
def test_follow_one_link(self):
    monitor = OPDSImportMonitor(self._db, "http://url",
                                DataSource.OA_CONTENT_SERVER, OPDSImporter)
    feed = self.content_server_mini_feed

    # If there's new data, follow_one_link extracts the next links.
    http = DummyHTTPClient()
    http.queue_response(200, content=feed)

    next_links, content = monitor.follow_one_link("http://url", do_get=http.do_get)

    eq_(1, len(next_links))
    eq_("http://localhost:5000/?after=327&size=100", next_links[0])
    eq_(feed, content)

    # Now import the editions and add coverage records.
    monitor.importer.import_from_feed(feed)
    eq_(2, self._db.query(Edition).count())

    editions = self._db.query(Edition).all()
    data_source = DataSource.lookup(self._db, DataSource.OA_CONTENT_SERVER)

    for edition in editions:
        record, ignore = CoverageRecord.add_for(
            edition, data_source, CoverageRecord.IMPORT_OPERATION
        )
        record.timestamp = datetime.datetime(2016, 1, 1, 1, 1, 1)

    # If there's no new data, follow_one_link returns no next links
    # and no content.
    http.queue_response(200, content=feed)
    next_links, content = monitor.follow_one_link("http://url", do_get=http.do_get)
    eq_(0, len(next_links))
    eq_(None, content)
def check_for_new_data(self, feed):
    """Check if the feed contains any entries that haven't been
    imported yet. If force_reimport is set, every entry in the feed
    is treated as new.
    """

    # If force_reimport is set, we don't even need to check. Always
    # treat the feed as though it contained new data.
    if self.force_reimport:
        return True

    last_update_dates = self.importer.extract_last_update_dates(feed)

    new_data = False
    for identifier, remote_updated in last_update_dates:

        identifier, ignore = Identifier.parse_urn(self._db, identifier)
        data_source = self.importer.data_source

        record = None
        if identifier:
            record = CoverageRecord.lookup(
                identifier, data_source,
                operation=CoverageRecord.IMPORT_OPERATION
            )

        # If there was a transient failure last time we tried to
        # import this book, try again regardless of whether the
        # feed has changed.
        if record and record.status == CoverageRecord.TRANSIENT_FAILURE:
            new_data = True
            self.log.info(
                "Counting %s as new because previous attempt resulted in transient failure: %s",
                record.identifier, record.exception
            )
            break

        # If our last attempt was a success or a persistent
        # failure, we only want to import again if something
        # changed since then.
        if record and record.timestamp:
            # We've imported this entry before, so don't import it
            # again unless it's changed.

            if not remote_updated:
                # The remote isn't telling us whether the entry
                # has been updated. Import it again to be safe.
                new_data = True
                self.log.info(
                    "Counting %s as new because remote has no information about when it was updated.",
                    record.identifier
                )
                break

            if remote_updated >= record.timestamp:
                # This book has been updated since we last imported it.
                self.log.info(
                    "Counting %s as new because its coverage date is %s and remote has %s.",
                    record.identifier, record.timestamp, remote_updated
                )
                new_data = True
                break
        else:
            # There's no record of an attempt to import this book.
            self.log.info(
                "Counting %s as new because it has no CoverageRecord.",
                identifier
            )
            new_data = True
            break
    return new_data
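# A sketch of how check_for_new_data might gate an import, matching
# the behavior test_follow_one_link (above) pins down: new data yields
# the feed's next links and its content, otherwise nothing. The method
# name follow_one_link_sketch and the extract_next_links helper are
# assumptions for illustration.
def follow_one_link_sketch(self, link, do_get=None):
    status, headers, feed = (do_get or self._get)(link, {})
    if self.check_for_new_data(feed):
        # New data: hand back the content for import, plus any
        # pagination links to crawl next.
        return self.importer.extract_next_links(feed), feed
    # Nothing new: no next links to follow, no content to import.
    return [], None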
def test_check_for_new_data(self):
    feed = self.content_server_mini_feed

    class MockOPDSImportMonitor(OPDSImportMonitor):
        def _get(self, url, headers):
            return 200, {}, feed

    monitor = OPDSImportMonitor(self._db, "http://url",
                                DataSource.OA_CONTENT_SERVER, OPDSImporter)

    # Nothing has been imported yet, so all data is new.
    eq_(True, monitor.check_for_new_data(feed))

    # Now import the editions.
    monitor = MockOPDSImportMonitor(self._db, "http://url",
                                    DataSource.OA_CONTENT_SERVER, OPDSImporter)
    monitor.run_once("http://url", None)

    # Editions have been imported.
    eq_(2, self._db.query(Edition).count())

    # Note that unlike many other Monitors, OPDSImportMonitor
    # doesn't store a Timestamp.
    assert not hasattr(monitor, 'timestamp')

    editions = self._db.query(Edition).all()
    data_source = DataSource.lookup(self._db, DataSource.OA_CONTENT_SERVER)

    # If the CoverageRecords' timestamps are later than the entries'
    # updated dates, there's nothing new.
    record, ignore = CoverageRecord.add_for(
        editions[0], data_source, CoverageRecord.IMPORT_OPERATION
    )
    record.timestamp = datetime.datetime(2016, 1, 1, 1, 1, 1)

    record2, ignore = CoverageRecord.add_for(
        editions[1], data_source, CoverageRecord.IMPORT_OPERATION
    )
    record2.timestamp = datetime.datetime(2016, 1, 1, 1, 1, 1)
    eq_(False, monitor.check_for_new_data(feed))

    # If the monitor is set up to force reimport, it doesn't
    # matter that there's nothing new--we act as though there is.
    monitor.force_reimport = True
    eq_(True, monitor.check_for_new_data(feed))
    monitor.force_reimport = False

    # If an entry was updated after the date given in that entry's
    # CoverageRecord, there's new data.
    record2.timestamp = datetime.datetime(1970, 1, 1, 1, 1, 1)
    eq_(True, monitor.check_for_new_data(feed))

    # If a CoverageRecord is a transient failure, we try again
    # regardless of whether it's been updated.
    for r in [record, record2]:
        r.timestamp = datetime.datetime(2016, 1, 1, 1, 1, 1)
        r.exception = "Failure!"
        r.status = CoverageRecord.TRANSIENT_FAILURE
    eq_(True, monitor.check_for_new_data(feed))

    # If a CoverageRecord is a persistent failure, we don't try again...
    for r in [record, record2]:
        r.status = CoverageRecord.PERSISTENT_FAILURE
    eq_(False, monitor.check_for_new_data(feed))

    # ...unless the feed updates.
    record.timestamp = datetime.datetime(1970, 1, 1, 1, 1, 1)
    eq_(True, monitor.check_for_new_data(feed))