Example #1
    def test_coverage_record(self):
        edition, pool = self._edition(with_license_pool=True)
        data_source = edition.data_source

        # No preexisting coverage record
        coverage = CoverageRecord.lookup(edition, data_source)
        eq_(coverage, None)

        last_update = datetime.datetime(2015, 1, 1)

        m = Metadata(data_source=data_source,
                     title=u"New title",
                     data_source_last_updated=last_update)
        m.apply(edition)

        coverage = CoverageRecord.lookup(edition, data_source)
        eq_(last_update, coverage.timestamp)
        eq_(u"New title", edition.title)

        older_last_update = datetime.datetime(2014, 1, 1)
        m = Metadata(data_source=data_source,
                     title=u"Another new title",
                     data_source_last_updated=older_last_update)
        m.apply(edition)
        eq_(u"New title", edition.title)

        coverage = CoverageRecord.lookup(edition, data_source)
        eq_(last_update, coverage.timestamp)

        m.apply(edition, force=True)
        eq_(u"Another new title", edition.title)
        coverage = CoverageRecord.lookup(edition, data_source)
        eq_(older_last_update, coverage.timestamp)
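The assertions above pin down a timestamp guard: `Metadata.apply` ignores metadata whose `data_source_last_updated` predates the existing CoverageRecord's timestamp, unless `force=True`. A minimal sketch of that rule, written as a hypothetical standalone helper rather than the library's actual method:

    def apply_if_newer(metadata, edition, force=False):
        # Sketch of the guard Example #1 exercises: skip stale metadata
        # unless the caller forces the update.
        coverage = CoverageRecord.lookup(edition, metadata.data_source)
        if (coverage is not None and not force
                and metadata.data_source_last_updated < coverage.timestamp):
            return edition  # stale; edition and record stay unchanged
        edition.title = metadata.title
        record, ignore = CoverageRecord.add_for(edition, metadata.data_source)
        record.timestamp = metadata.data_source_last_updated
        return edition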
Example #2
    def test_import_one_feed(self):
        # Check that coverage records are created.

        monitor = OPDSImportMonitor(self._db, "http://url",
                                    DataSource.OA_CONTENT_SERVER,
                                    DoomedOPDSImporter)
        data_source = DataSource.lookup(self._db, DataSource.OA_CONTENT_SERVER)

        feed = self.content_server_mini_feed

        monitor.import_one_feed(feed, "http://root-url/")

        editions = self._db.query(Edition).all()

        # One edition has been imported
        eq_(1, len(editions))
        [edition] = editions

        # That edition has a CoverageRecord.
        record = CoverageRecord.lookup(
            edition.primary_identifier,
            data_source,
            operation=CoverageRecord.IMPORT_OPERATION)
        eq_(CoverageRecord.SUCCESS, record.status)
        eq_(None, record.exception)

        # The edition's primary identifier has a cover link whose
        # relative URL has been resolved relative to the URL we passed
        # into import_one_feed.
        [cover] = [
            x.resource.url for x in edition.primary_identifier.links
            if x.rel == Hyperlink.IMAGE
        ]
        eq_("http://root-url/full-cover-image.png", cover)

        # The 202 status message in the feed caused a transient failure.
        # The exception raised by DoomedOPDSImporter caused a persistent
        # failure.

        coverage_records = self._db.query(CoverageRecord).filter(
            CoverageRecord.operation == CoverageRecord.IMPORT_OPERATION,
            CoverageRecord.status != CoverageRecord.SUCCESS)
        eq_(
            sorted([
                CoverageRecord.TRANSIENT_FAILURE,
                CoverageRecord.PERSISTENT_FAILURE
            ]), sorted([x.status for x in coverage_records]))

        identifier, ignore = Identifier.parse_urn(
            self._db,
            "urn:librarysimplified.org/terms/id/Gutenberg%20ID/10441")
        failure = CoverageRecord.lookup(
            identifier, data_source, operation=CoverageRecord.IMPORT_OPERATION)
        assert "Utter failure!" in failure.exception
Example #3
    def test_items_that_need_coverage_respects_operation(self):

        record1 = CoverageRecord.add_for(self.identifier, self.output_source)

        # Here's a provider that carries out the 'foo' operation.
        provider = AlwaysSuccessfulCoverageProvider(
            "Always successful",
            self.input_identifier_types,
            self.output_source,
            operation='foo')

        # It is missing coverage for self.identifier, because the
        # CoverageRecord we created at the start of this test has no
        # operation.
        eq_([self.identifier], provider.items_that_need_coverage().all())

        # Here's a provider that has no operation set.
        provider = AlwaysSuccessfulCoverageProvider(
            "Always successful", self.input_identifier_types,
            self.output_source)

        # It is not missing coverage for self.identifier, because the
        # CoverageRecord we created at the start of the test takes
        # care of it.
        eq_([], provider.items_that_need_coverage().all())
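The two assertions pin down the matching rule: a CoverageRecord only satisfies a provider when its `operation` equals the provider's, and `None` matches only `None`. A pure-Python sketch of that rule; the provider attribute names mirror the constructor arguments above and are assumptions:

    def record_satisfies(record, provider):
        # A record counts as coverage only when the data source and the
        # operation both match; operation=None matches only operation=None.
        return (record.data_source == provider.output_source
                and record.operation == provider.operation)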
Example #4
    def test_items_that_need_coverage(self):
        cutoff_time = datetime.datetime(2016, 1, 1)
        record = CoverageRecord.add_for(self.edition,
                                        self.output_source,
                                        timestamp=cutoff_time)

        provider = AlwaysSuccessfulCoverageProvider(
            "Always successful",
            self.input_identifier_types,
            self.output_source,
            cutoff_time=cutoff_time)
        eq_([], provider.items_that_need_coverage().all())

        one_second_after = cutoff_time + datetime.timedelta(seconds=1)
        provider = AlwaysSuccessfulCoverageProvider(
            "Always successful",
            self.input_identifier_types,
            self.output_source,
            cutoff_time=one_second_after)
        eq_([self.identifier], provider.items_that_need_coverage().all())

        provider = AlwaysSuccessfulCoverageProvider(
            "Always successful", self.input_identifier_types,
            self.output_source)
        eq_([], provider.items_that_need_coverage().all())
Example #5
    def add_coverage_record_for(self, item):
        """Record this CoverageProvider's coverage for the given
        Edition/Identifier, as a CoverageRecord.
        """
        return CoverageRecord.add_for(
            item, data_source=self.data_source, operation=self.operation
        )
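Note that `CoverageRecord.add_for` returns a `(record, is_new)` pair, as the unpacking in Examples #6 and #8 shows, so this wrapper hands that same two-tuple back to its caller:

    # Hypothetical caller of the wrapper above; `provider` and `edition`
    # stand in for whatever objects the surrounding code holds.
    record, is_new = provider.add_coverage_record_for(edition)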
Example #6
    def to_coverage_record(self, operation=None):
        """Convert this failure into a CoverageRecord."""
        if not self.data_source:
            raise Exception(
                "Cannot convert coverage failure to CoverageRecord because it has no output source."
            )

        record, ignore = CoverageRecord.add_for(self.obj,
                                                self.data_source,
                                                operation=operation)
        record.exception = self.exception
        if self.transient:
            record.status = CoverageRecord.TRANSIENT_FAILURE
        else:
            record.status = CoverageRecord.PERSISTENT_FAILURE
        return record
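A hypothetical usage sketch; the `CoverageFailure(obj, exception, data_source=..., transient=...)` constructor shape is an assumption inferred from the attributes the method reads:

    # Assumed constructor shape; only the attributes to_coverage_record
    # reads (obj, exception, data_source, transient) actually matter.
    failure = CoverageFailure(
        identifier, "Utter failure!",
        data_source=data_source, transient=False)
    record = failure.to_coverage_record(
        operation=CoverageRecord.IMPORT_OPERATION)
    eq_(CoverageRecord.PERSISTENT_FAILURE, record.status)
    eq_("Utter failure!", record.exception)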
Example #7
    def import_one_feed(self, feed, feed_url=None):
        imported_editions, pools, works, failures = self.importer.import_from_feed(
            feed, even_if_no_author=True,
            immediately_presentation_ready=self.immediately_presentation_ready,
            feed_url=feed_url
        )

        data_source = self.importer.data_source
        
        # Create CoverageRecords for the successful imports.
        for edition in imported_editions:
            record, ignore = CoverageRecord.add_for(
                edition, data_source, CoverageRecord.IMPORT_OPERATION,
                status=CoverageRecord.SUCCESS
            )

        # Create CoverageRecords for the failures.
        for urn, failure in failures.items():
            failure.to_coverage_record(operation=CoverageRecord.IMPORT_OPERATION)
Example #8
    def test_run_on_specific_identifiers_respects_cutoff_time(self):

        last_run = datetime.datetime(2016, 1, 1)

        # Once upon a time we successfully added coverage for
        # self.identifier.
        record, ignore = CoverageRecord.add_for(self.identifier,
                                                self.output_source)
        record.timestamp = last_run

        # But now something has gone wrong, and if we ever run the
        # coverage provider again we will get a persistent failure.
        provider = NeverSuccessfulCoverageProvider("Persistent failure",
                                                   self.input_identifier_types,
                                                   self.output_source,
                                                   cutoff_time=last_run)

        # You might think this would result in a persistent failure...
        (success, transient_failure,
         persistent_failure), records = (provider.run_on_specific_identifiers(
             [self.identifier]))

        # ...but we get an automatic success. We didn't even try to
        # run the coverage provider on self.identifier because the
        # coverage record was up-to-date.
        eq_(1, success)
        eq_(0, persistent_failure)
        eq_([], records)

        # But if we move the cutoff time forward, the provider will run
        # on self.identifier and fail.
        provider.cutoff_time = datetime.datetime(2016, 2, 1)
        (success, transient_failure,
         persistent_failure), records = (provider.run_on_specific_identifiers(
             [self.identifier]))
        eq_(0, success)
        eq_(1, persistent_failure)

        # The formerly successful CoverageRecord will be updated to
        # reflect the failure.
        eq_(records[0], record)
        eq_("What did you expect?", record.exception)
Example #9
    def test_should_update(self):
        cutoff = datetime.datetime(2016, 1, 1)
        provider = AlwaysSuccessfulCoverageProvider(
            "Always successful",
            self.input_identifier_types,
            self.output_source,
            cutoff_time=cutoff)

        # If coverage is missing, we should update.
        eq_(True, provider.should_update(None))

        # If coverage is outdated, we should update.
        record, ignore = CoverageRecord.add_for(self.identifier,
                                                self.output_source)
        record.timestamp = datetime.datetime(2015, 1, 1)
        eq_(True, provider.should_update(record))

        # If coverage is up-to-date, we should not update.
        record.timestamp = cutoff
        eq_(False, provider.should_update(record))
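These three assertions determine `should_update` completely; here is a minimal reimplementation consistent with them, a sketch rather than necessarily the provider's real body:

    def should_update(self, coverage_record):
        # Missing coverage always needs an update.
        if coverage_record is None:
            return True
        # Coverage from before the cutoff is outdated.
        if (self.cutoff_time is not None
                and coverage_record.timestamp < self.cutoff_time):
            return True
        # Coverage at or after the cutoff is current.
        return False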
Example #10
    def test_follow_one_link(self):
        monitor = OPDSImportMonitor(self._db, "http://url",
                                    DataSource.OA_CONTENT_SERVER, OPDSImporter)
        feed = self.content_server_mini_feed

        # If there's new data, follow_one_link extracts the next links.

        http = DummyHTTPClient()
        http.queue_response(200, content=feed)

        next_links, content = monitor.follow_one_link("http://url",
                                                      do_get=http.do_get)

        eq_(1, len(next_links))
        eq_("http://localhost:5000/?after=327&size=100", next_links[0])

        eq_(feed, content)

        # Now import the editions and add coverage records.
        monitor.importer.import_from_feed(feed)
        eq_(2, self._db.query(Edition).count())

        editions = self._db.query(Edition).all()
        data_source = DataSource.lookup(self._db, DataSource.OA_CONTENT_SERVER)

        for edition in editions:
            record, ignore = CoverageRecord.add_for(
                edition, data_source, CoverageRecord.IMPORT_OPERATION)
            record.timestamp = datetime.datetime(2016, 1, 1, 1, 1, 1)

        # If there's no new data, follow_one_link returns no next links
        # and no content.
        http.queue_response(200, content=feed)

        next_links, content = monitor.follow_one_link("http://url",
                                                      do_get=http.do_get)

        eq_(0, len(next_links))
        eq_(None, content)
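The two branches in Example #10 suggest the overall shape of `follow_one_link`: fetch the feed, then return continuation links plus content only when `check_for_new_data` reports something new. A hedged sketch; the `extract_next_links` helper name and the three-tuple return shape of `do_get` are assumptions drawn from the mocks above:

    def follow_one_link(self, link, do_get=None):
        # Fetch the feed at `link`; the test injects do_get for mocking.
        get = do_get or self._get
        status, headers, feed = get(link, {})
        if self.check_for_new_data(feed):
            # New data: hand back the feed plus any continuation links.
            return self.importer.extract_next_links(feed), feed
        # Nothing new: stop paginating.
        return [], None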
Example #11
    def check_for_new_data(self, feed):
        """Check if the feed contains any entries that haven't been imported
        yet. If force_import is set, every entry in the feed is
        treated as new.
        """

        # If force_reimport is set, we don't even need to check. Always
        # treat the feed as though it contained new data.
        if self.force_reimport:
            return True

        last_update_dates = self.importer.extract_last_update_dates(feed)

        new_data = False
        for identifier, remote_updated in last_update_dates:

            identifier, ignore = Identifier.parse_urn(self._db, identifier)
            data_source = self.importer.data_source
            record = None

            if identifier:
                record = CoverageRecord.lookup(
                    identifier, data_source, operation=CoverageRecord.IMPORT_OPERATION
                )

            # If there was a transient failure last time we tried to
            # import this book, try again regardless of whether the
            # feed has changed.
            if record and record.status == CoverageRecord.TRANSIENT_FAILURE:
                new_data = True
                self.log.info(
                    "Counting %s as new because previous attempt resulted in transient failure: %s", 
                    record.identifier, record.exception
                )
                break

            # If our last attempt was a success or a persistent
            # failure, we only want to import again if something
            # changed since then.

            if record and record.timestamp:
                # We've imported this entry before, so don't import it
                # again unless it's changed.

                if not remote_updated:
                    # The remote isn't telling us whether the entry
                    # has been updated. Import it again to be safe.
                    new_data = True
                    self.log.info(
                        "Counting %s as new because remote has no information about when it was updated.", 
                        record.identifier
                    )
                    break

                if remote_updated >= record.timestamp:
                    # This book has been updated since we last imported it.
                    self.log.info(
                        "Counting %s as new because its coverage date is %s and remote has %s.", 
                        record.identifier, record.timestamp, remote_updated
                    )

                    new_data = True
                    break

            else:
                # There's no record of an attempt to import this book.
                self.log.info(
                    "Counting %s as new because it has no CoverageRecord.", 
                    identifier
                )
                new_data = True
                break
        return new_data
Example #12
    def test_check_for_new_data(self):
        feed = self.content_server_mini_feed

        class MockOPDSImportMonitor(OPDSImportMonitor):
            def _get(self, url, headers):
                return 200, {}, feed

        monitor = OPDSImportMonitor(self._db, "http://url",
                                    DataSource.OA_CONTENT_SERVER, OPDSImporter)

        # Nothing has been imported yet, so all data is new.
        eq_(True, monitor.check_for_new_data(feed))

        # Now import the editions.
        monitor = MockOPDSImportMonitor(self._db, "http://url",
                                        DataSource.OA_CONTENT_SERVER,
                                        OPDSImporter)
        monitor.run_once("http://url", None)

        # Editions have been imported.
        eq_(2, self._db.query(Edition).count())

        # Note that unlike many other Monitors, OPDSImportMonitor
        # doesn't store a Timestamp.
        assert not hasattr(monitor, 'timestamp')

        editions = self._db.query(Edition).all()
        data_source = DataSource.lookup(self._db, DataSource.OA_CONTENT_SERVER)

        # If the CoverageRecords' timestamps are later than the entries'
        # update dates, there's nothing new.
        record, ignore = CoverageRecord.add_for(
            editions[0], data_source, CoverageRecord.IMPORT_OPERATION)
        record.timestamp = datetime.datetime(2016, 1, 1, 1, 1, 1)

        record2, ignore = CoverageRecord.add_for(
            editions[1], data_source, CoverageRecord.IMPORT_OPERATION)
        record2.timestamp = datetime.datetime(2016, 1, 1, 1, 1, 1)

        eq_(False, monitor.check_for_new_data(feed))

        # If the monitor is set up to force reimport, it doesn't
        # matter that there's nothing new--we act as though there is.
        monitor.force_reimport = True
        eq_(True, monitor.check_for_new_data(feed))
        monitor.force_reimport = False

        # If an entry was updated after the date given in that entry's
        # CoverageRecord, there's new data.
        record2.timestamp = datetime.datetime(1970, 1, 1, 1, 1, 1)
        eq_(True, monitor.check_for_new_data(feed))

        # If a CoverageRecord is a transient failure, we try again
        # regardless of whether it's been updated.
        for r in [record, record2]:
            r.timestamp = datetime.datetime(2016, 1, 1, 1, 1, 1)
            r.exception = "Failure!"
            r.status = CoverageRecord.TRANSIENT_FAILURE
        eq_(True, monitor.check_for_new_data(feed))

        # If a CoverageRecord is a persistent failure, we don't try again...
        for r in [record, record2]:
            r.status = CoverageRecord.PERSISTENT_FAILURE
        eq_(False, monitor.check_for_new_data(feed))

        # ...unless the feed updates.
        record.timestamp = datetime.datetime(1970, 1, 1, 1, 1, 1)
        eq_(True, monitor.check_for_new_data(feed))