def __init__(self, _db, api, datasource, batch_size=10,
             metadata_replacement_policy=None,
             circulationdata_replacement_policy=None,
             cutoff_time=None):
    """Configure a bibliographic coverage provider for one data source.

    :param _db: Database session; stored on the instance and used to
        look up the DataSource.
    :param api: API object; stored as ``self.api``.
    :param datasource: Name passed to ``DataSource.lookup`` and used
        to build the service name.
    :param batch_size: Forwarded to the parent constructor.
    :param metadata_replacement_policy: Policy stored on the instance;
        when falsy, ``ReplacementPolicy.from_metadata_source()`` is used.
    :param circulationdata_replacement_policy: Policy stored on the
        instance; when falsy, ``ReplacementPolicy.from_license_source()``
        is used.
    :param cutoff_time: Forwarded to the parent constructor.
    """
    self._db = _db
    self.api = api

    source = DataSource.lookup(_db, datasource)
    identifier_types = [source.primary_identifier_type]
    name = "%s Bibliographic Coverage Provider" % datasource

    # Fall back to the stock policies when the caller supplied none.
    if not metadata_replacement_policy:
        metadata_replacement_policy = (
            ReplacementPolicy.from_metadata_source())
    if not circulationdata_replacement_policy:
        circulationdata_replacement_policy = (
            ReplacementPolicy.from_license_source())
    self.metadata_replacement_policy = metadata_replacement_policy
    self.circulationdata_replacement_policy = (
        circulationdata_replacement_policy)

    super(BibliographicCoverageProvider, self).__init__(
        name,
        identifier_types,
        source,
        batch_size=batch_size,
        cutoff_time=cutoff_time,
    )
def test_apply_removes_old_formats_based_on_replacement_policy(self):
    # ReplacementPolicy.formats decides whether applying CirculationData
    # drops delivery mechanisms that the new data does not mention.
    edition, pool = self._edition(with_license_pool=True)

    # Reduce the pool to a single, known delivery mechanism.
    for existing in pool.delivery_mechanisms:
        self._db.delete(existing)
    old_lpdm = pool.set_delivery_mechanism(
        Representation.PDF_MEDIA_TYPE,
        DeliveryMechanism.ADOBE_DRM,
        RightsStatus.IN_COPYRIGHT,
        None,
    )

    # A patron holds a loan fulfilled through that mechanism.
    patron = self._patron()
    loan, ignore = pool.loan_to(patron, fulfillment=old_lpdm)
    eq_(old_lpdm, loan.fulfillment)

    # The incoming circulation data names a different format.
    epub_format = FormatData(
        content_type=Representation.EPUB_MEDIA_TYPE,
        drm_scheme=DeliveryMechanism.ADOBE_DRM,
    )
    circulation_data = CirculationData(
        formats=[epub_format],
        data_source=edition.data_source,
        primary_identifier=edition.primary_identifier,
    )

    # With formats=False the new format is added and the old one stays.
    keep_old = ReplacementPolicy(formats=False)
    circulation_data.apply(self._db, pool.collection, keep_old)
    eq_(2, pool.delivery_mechanisms.count())
    expected_types = set([
        Representation.PDF_MEDIA_TYPE,
        Representation.EPUB_MEDIA_TYPE,
    ])
    actual_types = set(
        lpdm.delivery_mechanism.content_type
        for lpdm in pool.delivery_mechanisms
    )
    eq_(expected_types, actual_types)
    eq_(old_lpdm, loan.fulfillment)

    # With formats=True the old format is deleted and detached from
    # the loan that was using it.
    drop_old = ReplacementPolicy(formats=True)
    circulation_data.apply(self._db, pool.collection, drop_old)
    eq_(1, pool.delivery_mechanisms.count())
    remaining = pool.delivery_mechanisms[0]
    eq_(Representation.EPUB_MEDIA_TYPE,
        remaining.delivery_mechanism.content_type)
    eq_(None, loan.fulfillment)
def test_implicit_format_for_open_access_link(self):
    # A format is a delivery mechanism. Delivery on open access pools is
    # handled from mirrored content, so an open-access link should by
    # itself make the pool deliverable.
    edition, pool = self._edition(with_license_pool=True)

    # _edition() gives the pool one default delivery mechanism.
    [default_lpdm] = pool.delivery_mechanisms
    eq_(Representation.EPUB_MEDIA_TYPE,
        default_lpdm.delivery_mechanism.content_type)
    eq_(DeliveryMechanism.ADOBE_DRM,
        default_lpdm.delivery_mechanism.drm_scheme)

    open_access_pdf = LinkData(
        rel=Hyperlink.OPEN_ACCESS_DOWNLOAD,
        media_type=Representation.PDF_MEDIA_TYPE,
        href=self._url,
    )
    with_link = CirculationData(
        data_source=DataSource.GUTENBERG,
        primary_identifier=edition.primary_identifier,
        links=[open_access_pdf],
    )
    formats_only = ReplacementPolicy(formats=True)
    with_link.apply(pool, formats_only)

    # The default format is gone, replaced by an open-access one
    # derived from the link.
    [pdf] = pool.delivery_mechanisms
    eq_(Representation.PDF_MEDIA_TYPE, pdf.delivery_mechanism.content_type)
    eq_(DeliveryMechanism.NO_DRM, pdf.delivery_mechanism.drm_scheme)

    without_links = CirculationData(
        data_source=DataSource.GUTENBERG,
        primary_identifier=edition.primary_identifier,
        links=[],
    )
    formats_and_links = ReplacementPolicy(formats=True, links=True)
    without_links.apply(pool, formats_and_links)

    # Now we have no formats at all.
    eq_([], pool.delivery_mechanisms)
def test_rights_status_default_rights_from_data_source(self):
    # A link with no explicit rights, coming from the OA content
    # server, should pick up that data source's default rights status.
    gutenberg_id = IdentifierData(Identifier.GUTENBERG_ID, "abcd")
    drm_link = LinkData(
        rel=Hyperlink.DRM_ENCRYPTED_DOWNLOAD,
        media_type=Representation.EPUB_MEDIA_TYPE,
        href=self._url,
    )
    circulation_data = CirculationData(
        data_source=DataSource.OA_CONTENT_SERVER,
        primary_identifier=gutenberg_id,
        links=[drm_link],
    )
    policy = ReplacementPolicy(formats=True)

    # This pool starts off as not being open-access.
    pool, ignore = circulation_data.license_pool(
        self._db, self._default_collection)
    eq_(False, pool.open_access)

    circulation_data.apply(self._db, pool.collection, policy)

    # The pool became open-access because its link came from the
    # OA content server.
    eq_(True, pool.open_access)
    eq_(1, pool.delivery_mechanisms.count())

    # The rights status is the default for the OA content server.
    lpdm = pool.delivery_mechanisms[0]
    eq_(RightsStatus.GENERIC_OPEN_ACCESS, lpdm.rights_status.uri)
def test_mirror_404_error(self):
    # A 404 response when fetching a cover means nothing is mirrored.
    mirror = DummyS3Uploader()
    http = DummyHTTPClient()
    http.queue_response(404)
    policy = ReplacementPolicy(mirror=mirror, http_get=http.do_get)

    edition, pool = self._edition(with_license_pool=True)
    data_source = DataSource.lookup(self._db, DataSource.GUTENBERG)

    cover = LinkData(
        rel=Hyperlink.IMAGE,
        media_type=Representation.JPEG_MEDIA_TYPE,
        href="http://example.com/",
    )
    link_obj, ignore = edition.primary_identifier.add_link(
        rel=cover.rel,
        href=cover.href,
        data_source=data_source,
        license_pool=pool,
        media_type=cover.media_type,
        content=cover.content,
    )

    metadata = Metadata(data_source=data_source)
    metadata.mirror_link(edition, data_source, cover, link_obj, policy)

    # Since we got a 404 error, the cover image was not mirrored.
    representation = link_obj.resource.representation
    eq_(404, representation.status_code)
    eq_(None, representation.mirror_url)
    eq_([], mirror.uploaded)
def __init__(self, collection, api_class=OdiloAPI, **kwargs):
    """Constructor.

    :param collection: Provide bibliographic coverage to all
        Odilo books in the given Collection.
    :param api_class: Either a class to instantiate with the session
        and Collection (instead of OdiloAPI), or an already
        instantiated OdiloAPI object to reuse.
    :param kwargs: Forwarded unchanged to the parent constructor.
    """
    super(OdiloBibliographicCoverageProvider, self).__init__(
        collection, **kwargs
    )

    if not isinstance(api_class, OdiloAPI):
        # A web application should not use this option because it
        # will put a non-scoped session in the mix.
        session = Session.object_session(collection)
        api = api_class(session, collection)
    else:
        # Use a previously instantiated OdiloAPI instance
        # rather than creating a new one.
        api = api_class
    self.api = api

    self.replacement_policy = ReplacementPolicy(
        identifiers=True,
        subjects=True,
        contributions=True,
        links=True,
        formats=True,
        rights=True,
        link_content=True,
        # even_if_not_apparently_updated=False,
        analytics=Analytics(self._db),
    )
def test_non_open_access_book_not_mirrored(self):
    # A link whose rel says open-access but whose rights say
    # in-copyright must be neither fetched nor uploaded.
    data_source = DataSource.lookup(self._db, DataSource.GUTENBERG)
    metadata = Metadata(data_source=data_source)

    mirror = DummyS3Uploader(fail=True)
    http = DummyHTTPClient()
    policy = ReplacementPolicy(mirror=mirror, http_get=http.do_get)

    content = "foo"
    link = LinkData(
        rel=Hyperlink.OPEN_ACCESS_DOWNLOAD,
        media_type=Representation.EPUB_MEDIA_TYPE,
        href="http://example.com/",
        content=content,
        rights_uri=RightsStatus.IN_COPYRIGHT,
    )

    identifier = self._identifier()
    link_obj, is_new = identifier.add_link(
        rel=link.rel,
        href=link.href,
        data_source=data_source,
        media_type=link.media_type,
        content=link.content,
    )

    # The Hyperlink object makes it look like an open-access book,
    # but the context we have from the OPDS feed says that it's
    # not.
    metadata.mirror_link(None, data_source, link, link_obj, policy)

    # No HTTP requests were made.
    eq_([], http.requests)

    # Nothing was uploaded.
    eq_([], mirror.uploaded)
def import_edition_from_metadata(
        self, metadata, even_if_no_author, immediately_presentation_ready
):
    """Find or create the Edition for a Metadata object and update it.

    The edition's pool and work are not set here.

    :param metadata: Metadata object; its ``edition()`` and ``apply()``
        methods are called.
    :param even_if_no_author: Accepted for interface compatibility;
        not read by this method body.
    :param immediately_presentation_ready: Accepted for interface
        compatibility; not read by this method body.
    :return: The Edition that the metadata was applied to.
    """
    # Locate or create an Edition for this book.
    edition, is_new_edition = metadata.edition(self._db)

    # Replace subjects, links, contributions and rights wholesale, and
    # apply even when the metadata does not look newer.
    policy_options = dict(
        subjects=True,
        links=True,
        contributions=True,
        rights=True,
        link_content=True,
        even_if_not_apparently_updated=True,
        mirror=self.mirror,
        content_modifier=self.content_modifier,
        http_get=self.http_get,
    )
    policy = ReplacementPolicy(**policy_options)

    metadata.apply(edition, self.metadata_client, replace=policy)
    return edition
def test_rights_status_open_access_link_no_rights_uses_data_source_default(
        self):
    # An open-access link without explicit rights takes the default
    # rights status of its data source.
    replace_formats = ReplacementPolicy(formats=True)

    def open_access_epub_link():
        # Build a rights-less open-access EPUB link with a fresh URL.
        return LinkData(
            rel=Hyperlink.OPEN_ACCESS_DOWNLOAD,
            media_type=Representation.EPUB_MEDIA_TYPE,
            href=self._url,
        )

    # Here's a CirculationData that will create an open-access
    # LicensePoolDeliveryMechanism.
    gutenberg_id = IdentifierData(Identifier.GUTENBERG_ID, "abcd")
    gutenberg_data = CirculationData(
        data_source=DataSource.GUTENBERG,
        primary_identifier=gutenberg_id,
        links=[open_access_epub_link()],
    )
    pool, ignore = gutenberg_data.license_pool(
        self._db, self._default_collection)
    pool.open_access = False

    # Applying this CirculationData to a LicensePool makes it
    # open-access.
    gutenberg_data.apply(self._db, pool.collection, replace_formats)
    eq_(True, pool.open_access)
    eq_(1, pool.delivery_mechanisms.count())

    # The delivery mechanism's rights status is the default for
    # the data source.
    eq_(RightsStatus.PUBLIC_DOMAIN_USA,
        pool.delivery_mechanisms[0].rights_status.uri)

    # Even if a commercial source like Overdrive should offer a
    # link with rel="open access", unless we know it's an
    # open-access link we will give it a RightsStatus of
    # IN_COPYRIGHT.
    overdrive_id = IdentifierData(Identifier.OVERDRIVE_ID, "abcd")
    overdrive_data = CirculationData(
        data_source=DataSource.OVERDRIVE,
        primary_identifier=overdrive_id,
        links=[open_access_epub_link()],
    )
    pool, ignore = overdrive_data.license_pool(
        self._db, self._default_collection)
    pool.open_access = False
    overdrive_data.apply(self._db, pool.collection, replace_formats)
    eq_(RightsStatus.IN_COPYRIGHT,
        pool.delivery_mechanisms[0].rights_status.uri)
    eq_(False, pool.open_access)
def populate_all_catalog(self):
    """Import the library's whole OneClick catalog into the database.

    Calls ``self.get_all_catalog()`` and feeds every returned item to a
    ``OneClickBibliographicCoverageProvider`` so that the corresponding
    database objects are created or updated.

    :return: A 2-tuple ``(items_transmitted, items_created)``: the
        number of catalog items received, and the number whose
        processing did not end in a CoverageFailure.
    """
    catalog_list = self.get_all_catalog()
    items_transmitted = len(catalog_list)
    items_created = 0

    coverage_provider = OneClickBibliographicCoverageProvider(_db=self._db)
    # the default policy doesn't update delivery mechanisms, which we do want to do
    metadata_replacement_policy = ReplacementPolicy.from_metadata_source()
    metadata_replacement_policy.formats = True

    for catalog_item in catalog_list:
        result = coverage_provider.update_metadata(
            catalog_item=catalog_item,
            metadata_replacement_policy=metadata_replacement_policy)
        # Anything that is not a CoverageFailure counts as created.
        if not isinstance(result, CoverageFailure):
            items_created += 1

            if isinstance(result, Identifier):
                # calls work.set_presentation_ready() for us
                coverage_provider.handle_success(result)

                # We're populating the catalog, so we can assume the list OneClick
                # sent us is of books we own licenses to.
                # NOTE: TODO later: For the 4 out of 2000 libraries that chose to display
                # books they don't own, we'd need to call the search endpoint to get
                # the interest field, and then deal with licenses_owned.
                if result.licensed_through:
                    result.licensed_through.licenses_owned = 1

            # stay data, stay!
            # NOTE(review): commit is assumed to run once per successful
            # item -- confirm its nesting against the original file.
            self._db.commit()

    return items_transmitted, items_created
def test_rights_status_open_access_link_with_rights(self):
    # An open-access link carrying an explicit rights URI keeps that
    # URI on the resulting delivery mechanism.
    overdrive_id = IdentifierData(Identifier.OVERDRIVE_ID, "abcd")
    cc_link = LinkData(
        rel=Hyperlink.OPEN_ACCESS_DOWNLOAD,
        media_type=Representation.EPUB_MEDIA_TYPE,
        href=self._url,
        rights_uri=RightsStatus.CC_BY_ND,
    )
    circulation_data = CirculationData(
        data_source=DataSource.OVERDRIVE,
        primary_identifier=overdrive_id,
        links=[cc_link],
    )
    policy = ReplacementPolicy(formats=True)

    pool, ignore = circulation_data.license_pool(self._db)
    circulation_data.apply(pool, policy)

    eq_(True, pool.open_access)
    eq_(1, len(pool.delivery_mechanisms))
    lpdm = pool.delivery_mechanisms[0]
    eq_(RightsStatus.CC_BY_ND, lpdm.rights_status.uri)
def test_rights_status_commercial_link_with_rights(self):
    # A DRM-encrypted link with in-copyright rights leaves the pool
    # non-open-access and records the in-copyright status.
    overdrive_id = IdentifierData(Identifier.OVERDRIVE_ID, "abcd")
    drm_link = LinkData(
        rel=Hyperlink.DRM_ENCRYPTED_DOWNLOAD,
        media_type=Representation.EPUB_MEDIA_TYPE,
        href=self._url,
        rights_uri=RightsStatus.IN_COPYRIGHT,
    )
    drm_format = FormatData(
        content_type=drm_link.media_type,
        drm_scheme=DeliveryMechanism.ADOBE_DRM,
        link=drm_link,
        rights_uri=RightsStatus.IN_COPYRIGHT,
    )
    circulation_data = CirculationData(
        data_source=DataSource.OVERDRIVE,
        primary_identifier=overdrive_id,
        links=[drm_link],
        formats=[drm_format],
    )
    policy = ReplacementPolicy(formats=True)

    pool, ignore = circulation_data.license_pool(self._db)
    circulation_data.apply(pool, policy)

    eq_(False, pool.open_access)
    eq_(1, len(pool.delivery_mechanisms))
    lpdm = pool.delivery_mechanisms[0]
    eq_(RightsStatus.IN_COPYRIGHT, lpdm.rights_status.uri)
def test_rights_status_default_rights_from_data_source(self):
    # A rights-less link from the OA content server gets that data
    # source's default rights status.
    gutenberg_id = IdentifierData(Identifier.GUTENBERG_ID, "abcd")
    drm_link = LinkData(
        rel=Hyperlink.DRM_ENCRYPTED_DOWNLOAD,
        media_type=Representation.EPUB_MEDIA_TYPE,
        href=self._url,
    )
    circulation_data = CirculationData(
        data_source=DataSource.OA_CONTENT_SERVER,
        primary_identifier=gutenberg_id,
        links=[drm_link],
    )
    policy = ReplacementPolicy(formats=True)

    pool, ignore = circulation_data.license_pool(self._db)
    circulation_data.apply(pool, policy)

    eq_(True, pool.open_access)
    eq_(1, len(pool.delivery_mechanisms))

    # The rights status is the default for the OA content server.
    lpdm = pool.delivery_mechanisms[0]
    eq_(RightsStatus.GENERIC_OPEN_ACCESS, lpdm.rights_status.uri)
def test_rights_status_open_access_link_no_rights(self):
    # An open-access link with no rights information yields the
    # generic open-access rights status.
    overdrive_id = IdentifierData(Identifier.OVERDRIVE_ID, "abcd")
    open_link = LinkData(
        rel=Hyperlink.OPEN_ACCESS_DOWNLOAD,
        media_type=Representation.EPUB_MEDIA_TYPE,
        href=self._url,
    )
    circulation_data = CirculationData(
        data_source=DataSource.OVERDRIVE,
        primary_identifier=overdrive_id,
        links=[open_link],
    )
    policy = ReplacementPolicy(formats=True)

    pool, ignore = circulation_data.license_pool(self._db)
    circulation_data.apply(pool, policy)

    eq_(True, pool.open_access)
    eq_(1, len(pool.delivery_mechanisms))

    # Rights status is generic open access because there's an open access
    # link but no other rights info.
    lpdm = pool.delivery_mechanisms[0]
    eq_(RightsStatus.GENERIC_OPEN_ACCESS, lpdm.rights_status.uri)
def test_explicit_formatdata(self):
    # Explicit FormatData is added alongside existing formats by
    # default, and replaces them when the policy says formats=True.

    # Creating an edition with an open-access download will
    # automatically create a delivery mechanism.
    edition, pool = self._edition(with_open_access_download=True)

    # Let's also add a DRM format.
    drm_format = FormatData(
        content_type=Representation.PDF_MEDIA_TYPE,
        drm_scheme=DeliveryMechanism.ADOBE_DRM,
    )
    circulation_data = CirculationData(
        formats=[drm_format],
        data_source=edition.data_source,
        primary_identifier=edition.primary_identifier,
    )
    circulation_data.apply(pool)

    def by_content_type(lpdm):
        # Sort key: the delivery mechanism's media type.
        return lpdm.delivery_mechanism.content_type

    [epub, pdf] = sorted(pool.delivery_mechanisms, key=by_content_type)
    eq_(epub.resource, edition.license_pool.best_open_access_link)
    eq_(Representation.PDF_MEDIA_TYPE, pdf.delivery_mechanism.content_type)
    eq_(DeliveryMechanism.ADOBE_DRM, pdf.delivery_mechanism.drm_scheme)

    # If we tell Metadata to replace the list of formats, we only
    # have the one format we manually created.
    replace_formats = ReplacementPolicy(formats=True)
    circulation_data.apply(pool, replace=replace_formats)
    [pdf] = pool.delivery_mechanisms
    eq_(Representation.PDF_MEDIA_TYPE, pdf.delivery_mechanism.content_type)
def test_rights_status_default_rights_passed_in(self):
    # A default_rights_uri given to CirculationData is used for a
    # link that has no rights of its own.
    gutenberg_id = IdentifierData(Identifier.GUTENBERG_ID, "abcd")
    drm_link = LinkData(
        rel=Hyperlink.DRM_ENCRYPTED_DOWNLOAD,
        media_type=Representation.EPUB_MEDIA_TYPE,
        href=self._url,
    )
    circulation_data = CirculationData(
        data_source=DataSource.OA_CONTENT_SERVER,
        primary_identifier=gutenberg_id,
        default_rights_uri=RightsStatus.CC_BY,
        links=[drm_link],
    )
    policy = ReplacementPolicy(formats=True)

    pool, ignore = circulation_data.license_pool(
        self._db, self._default_collection)
    circulation_data.apply(self._db, pool.collection, policy)

    eq_(True, pool.open_access)
    eq_(1, pool.delivery_mechanisms.count())

    # The rights status is the one that was passed in to CirculationData.
    lpdm = pool.delivery_mechanisms[0]
    eq_(RightsStatus.CC_BY, lpdm.rights_status.uri)
def test_rights_status_open_access_link_no_rights_uses_data_source_default(self):
    # An open-access link without explicit rights takes the default
    # rights status of its data source.
    policy = ReplacementPolicy(formats=True)

    gutenberg_id = IdentifierData(Identifier.GUTENBERG_ID, "abcd")
    gutenberg_link = LinkData(
        rel=Hyperlink.OPEN_ACCESS_DOWNLOAD,
        media_type=Representation.EPUB_MEDIA_TYPE,
        href=self._url,
    )
    gutenberg_data = CirculationData(
        data_source=DataSource.GUTENBERG,
        primary_identifier=gutenberg_id,
        links=[gutenberg_link],
    )

    pool, ignore = gutenberg_data.license_pool(self._db)
    gutenberg_data.apply(pool, policy)
    eq_(True, pool.open_access)
    eq_(1, len(pool.delivery_mechanisms))

    # The delivery mechanism's rights status is the default for
    # the data source.
    eq_(RightsStatus.PUBLIC_DOMAIN_USA,
        pool.delivery_mechanisms[0].rights_status.uri)

    # Even if a commercial source like Overdrive should offer a
    # link with rel="open access", unless we know it's an
    # open-access link we will give it a RightsStatus of
    # IN_COPYRIGHT.
    overdrive_id = IdentifierData(Identifier.OVERDRIVE_ID, "abcd")
    overdrive_link = LinkData(
        rel=Hyperlink.OPEN_ACCESS_DOWNLOAD,
        media_type=Representation.EPUB_MEDIA_TYPE,
        href=self._url,
    )
    overdrive_data = CirculationData(
        data_source=DataSource.OVERDRIVE,
        primary_identifier=overdrive_id,
        links=[overdrive_link],
    )

    pool, ignore = overdrive_data.license_pool(self._db)
    overdrive_data.apply(pool, policy)
    eq_(RightsStatus.IN_COPYRIGHT,
        pool.delivery_mechanisms[0].rights_status.uri)

    # This will cause the work to be treated as a non-open-access
    # work.
    eq_(False, pool.open_access)
def test_mirror_open_access_link_mirror_failure(self):
    # A cover image that is fetched successfully but fails to upload
    # keeps its content and records a mirror exception, without
    # suppressing the license pool.
    edition, pool = self._edition(with_license_pool=True)

    data_source = DataSource.lookup(self._db, DataSource.GUTENBERG)
    m = Metadata(data_source=data_source)

    mirror = DummyS3Uploader(fail=True)
    h = DummyHTTPClient()

    policy = ReplacementPolicy(mirror=mirror, http_get=h.do_get)

    # Read the sample image as bytes and close the file deterministically
    # instead of leaking the handle.
    cover_path = self.sample_cover_path("test-book-cover.png")
    with open(cover_path, "rb") as cover_file:
        content = cover_file.read()

    link = LinkData(
        rel=Hyperlink.IMAGE,
        media_type=Representation.JPEG_MEDIA_TYPE,
        href="http://example.com/",
        content=content,
    )

    link_obj, ignore = edition.primary_identifier.add_link(
        rel=link.rel,
        href=link.href,
        data_source=data_source,
        license_pool=pool,
        media_type=link.media_type,
        content=link.content,
    )

    h.queue_response(200, media_type=Representation.JPEG_MEDIA_TYPE)

    m.mirror_link(edition, data_source, link, link_obj, policy)

    representation = link_obj.resource.representation

    # The representation was fetched successfully.
    eq_(None, representation.fetch_exception)
    assert representation.fetched_at is not None

    # But mirroring failed.
    assert representation.mirror_exception is not None
    eq_(None, representation.mirrored_at)
    eq_(link.media_type, representation.media_type)
    eq_(link.href, representation.url)

    # The mirror url should still be set.
    assert "Gutenberg" in representation.mirror_url
    assert representation.mirror_url.endswith(
        "%s/cover.jpg" % edition.primary_identifier.identifier)

    # Book content is still there since it wasn't mirrored.
    assert representation.content is not None

    # the edition's identifier-associated license pool should not be
    # suppressed just because fetch failed on getting image.
    eq_(False, pool.suppressed)

    # the license pool only gets its license_exception column filled in
    # if fetch failed on getting an Hyperlink.OPEN_ACCESS_DOWNLOAD-type epub.
    eq_(None, pool.license_exception)
def test_mirror_with_content_modifier(self):
    # A content_modifier in the policy rewrites the representation's
    # content before it is uploaded to the mirror.
    edition, pool = self._edition(with_license_pool=True)

    data_source = DataSource.lookup(self._db, DataSource.GUTENBERG)
    metadata = Metadata(data_source=data_source)

    mirror = DummyS3Uploader()

    def dummy_content_modifier(representation):
        # Overwrite whatever was fetched with a recognizable marker.
        representation.content = "Replaced Content"

    http = DummyHTTPClient()

    policy = ReplacementPolicy(
        mirror=mirror,
        content_modifier=dummy_content_modifier,
        http_get=http.do_get,
    )

    link = LinkData(
        rel=Hyperlink.OPEN_ACCESS_DOWNLOAD,
        media_type=Representation.EPUB_MEDIA_TYPE,
        href="http://example.com/test.epub",
        content="I'm an epub",
    )

    link_obj, ignore = edition.primary_identifier.add_link(
        rel=link.rel,
        href=link.href,
        data_source=data_source,
        license_pool=pool,
        media_type=link.media_type,
        content=link.content,
    )

    http.queue_response(200, media_type=Representation.EPUB_MEDIA_TYPE)

    metadata.mirror_link(edition, data_source, link, link_obj, policy)

    representation = link_obj.resource.representation

    # The representation was fetched successfully.
    eq_(None, representation.fetch_exception)
    assert representation.fetched_at != None

    # The mirror url is set.
    assert "Gutenberg" in representation.mirror_url
    expected_suffix = "%s/%s.epub" % (
        edition.primary_identifier.identifier, edition.title)
    assert representation.mirror_url.endswith(expected_suffix)

    # Content isn't there since it was mirrored.
    eq_(None, representation.content)

    # The representation was mirrored, with the modified content.
    eq_([representation], mirror.uploaded)
    eq_(["Replaced Content"], mirror.content)
def test_mirror_open_access_link_mirror_failure(self):
    # When an open-access book is fetched but the upload fails, the
    # license pool is suppressed and records the mirror exception.
    mirror = DummyS3Uploader(fail=True)
    http = DummyHTTPClient()

    edition, pool = self._edition(with_license_pool=True)

    data_source = DataSource.lookup(self._db, DataSource.GUTENBERG)
    policy = ReplacementPolicy(mirror=mirror, http_get=http.do_get)

    circulation_data = CirculationData(
        data_source=edition.data_source,
        primary_identifier=edition.primary_identifier,
    )

    link = LinkData(
        rel=Hyperlink.OPEN_ACCESS_DOWNLOAD,
        media_type=Representation.EPUB_MEDIA_TYPE,
        href=self._url,
    )

    link_obj, ignore = edition.primary_identifier.add_link(
        rel=link.rel,
        href=link.href,
        data_source=data_source,
        license_pool=pool,
        media_type=link.media_type,
        content=link.content,
    )

    http.queue_response(200, media_type=Representation.EPUB_MEDIA_TYPE)

    circulation_data.mirror_link(pool, data_source, link, link_obj, policy)

    representation = link_obj.resource.representation

    # The representation was fetched successfully.
    eq_(None, representation.fetch_exception)
    assert representation.fetched_at != None

    # But mirroring failed.
    assert representation.mirror_exception != None
    eq_(None, representation.mirrored_at)
    eq_(link.media_type, representation.media_type)
    eq_(link.href, representation.url)

    # The mirror url should still be set.
    assert "Gutenberg" in representation.mirror_url
    expected_suffix = "%s.epub" % edition.title
    assert representation.mirror_url.endswith(expected_suffix)

    # Book content is still there since it wasn't mirrored.
    assert representation.content != None

    # The license pool is suppressed when mirroring fails.
    eq_(True, pool.suppressed)
    assert representation.mirror_exception in pool.license_exception
def test_mirror_open_access_link_fetch_failure(self):
    # A failed fetch of a cover image records a fetch exception but
    # does not suppress the license pool.
    edition, pool = self._edition(with_license_pool=True)

    data_source = DataSource.lookup(self._db, DataSource.GUTENBERG)
    metadata = Metadata(data_source=data_source)

    mirror = DummyS3Uploader()
    http = DummyHTTPClient()

    policy = ReplacementPolicy(mirror=mirror, http_get=http.do_get)

    cover = LinkData(
        rel=Hyperlink.IMAGE,
        media_type=Representation.JPEG_MEDIA_TYPE,
        href="http://example.com/",
    )

    link_obj, ignore = edition.primary_identifier.add_link(
        rel=cover.rel,
        href=cover.href,
        data_source=data_source,
        license_pool=pool,
        media_type=cover.media_type,
        content=cover.content,
    )

    http.queue_response(403)

    metadata.mirror_link(edition, data_source, cover, link_obj, policy)

    representation = link_obj.resource.representation

    # Fetch failed, so we should have a fetch exception but no mirror url.
    assert representation.fetch_exception != None
    eq_(None, representation.mirror_exception)
    eq_(None, representation.mirror_url)
    eq_(cover.href, representation.url)
    assert representation.fetched_at != None
    eq_(None, representation.mirrored_at)

    # the edition's identifier-associated license pool should not be
    # suppressed just because fetch failed on getting image.
    eq_(False, pool.suppressed)

    # the license pool only gets its license_exception column filled in
    # if fetch failed on getting an Hyperlink.OPEN_ACCESS_DOWNLOAD-type epub.
    eq_(None, pool.license_exception)
def test_mirror_open_access_link_fetch_failure(self):
    # A failed fetch of an open-access book suppresses the license
    # pool and records the fetch exception on it.
    mirror = DummyS3Uploader()
    http = DummyHTTPClient()

    edition, pool = self._edition(with_license_pool=True)

    data_source = DataSource.lookup(self._db, DataSource.GUTENBERG)
    policy = ReplacementPolicy(mirror=mirror, http_get=http.do_get)

    circulation_data = CirculationData(
        data_source=edition.data_source,
        primary_identifier=edition.primary_identifier,
    )

    link = LinkData(
        rel=Hyperlink.OPEN_ACCESS_DOWNLOAD,
        media_type=Representation.EPUB_MEDIA_TYPE,
        href=self._url,
    )

    link_obj, ignore = edition.primary_identifier.add_link(
        rel=link.rel,
        href=link.href,
        data_source=data_source,
        license_pool=pool,
        media_type=link.media_type,
        content=link.content,
    )

    http.queue_response(403)

    circulation_data.mirror_link(pool, data_source, link, link_obj, policy)

    representation = link_obj.resource.representation

    # Fetch failed, so we should have a fetch exception but no mirror url.
    assert representation.fetch_exception != None
    eq_(None, representation.mirror_exception)
    eq_(None, representation.mirror_url)
    eq_(link.href, representation.url)
    assert representation.fetched_at != None
    eq_(None, representation.mirrored_at)

    # The license pool is suppressed when fetch fails.
    eq_(True, pool.suppressed)
    assert representation.fetch_exception in pool.license_exception
def test_format_change_may_change_open_access_status(self):
    # Replacing a pool's formats can flip its open_access flag in
    # either direction.

    # In this test, whenever we call CirculationData.apply(), we
    # want to destroy the old list of formats and recreate it.
    replace_formats = ReplacementPolicy(formats=True)

    # Here's a seemingly ordinary non-open-access LicensePool.
    edition, pool = self._edition(with_license_pool=True)
    eq_(False, pool.open_access)

    # One day, we learn that it has an open-access delivery mechanism.
    open_link = LinkData(
        rel=Hyperlink.OPEN_ACCESS_DOWNLOAD,
        media_type=Representation.EPUB_MEDIA_TYPE,
        href=self._url,
        rights_uri=RightsStatus.CC_BY_ND,
    )
    open_access_data = CirculationData(
        data_source=pool.data_source,
        primary_identifier=pool.identifier,
        links=[open_link],
    )

    # Applying this information turns the pool into an open-access pool.
    open_access_data.apply(
        self._db, pool.collection, replace=replace_formats)
    eq_(True, pool.open_access)

    # Then we find out it was a mistake -- the book is in copyright.
    in_copyright_format = FormatData(
        Representation.EPUB_MEDIA_TYPE,
        DeliveryMechanism.NO_DRM,
        rights_uri=RightsStatus.IN_COPYRIGHT,
    )
    in_copyright_data = CirculationData(
        data_source=pool.data_source,
        primary_identifier=pool.identifier,
        formats=[in_copyright_format],
    )
    in_copyright_data.apply(
        self._db, pool.collection, replace=replace_formats)

    # The original LPDM has been removed and only the new one remains.
    eq_(False, pool.open_access)
    eq_(1, pool.delivery_mechanisms.count())
def _set_circulationdata(self, identifier, circulationdata,
                         circulationdata_replacement_policy=None):
    """Apply circulation data to the LicensePool for an Identifier.

    The pool is obtained from ``self.license_pool(identifier)`` and
    ``circulationdata.apply()`` is called on it under the given
    replacement policy.

    TODO: Makes assumption of one license pool per identifier.  In a
    later branch, this will change.

    :param identifier: Identifier whose LicensePool is updated.
    :param circulationdata: Object providing ``apply(pool, replace=...)``.
        A falsy value produces a transient CoverageFailure.
    :param circulationdata_replacement_policy: Policy passed as
        ``replace``; when falsy,
        ``ReplacementPolicy.from_license_source()`` is used.
    :return: The Identifier (if successful) or an appropriate
        CoverageFailure (if not).
    """
    circulationdata_replacement_policy = (
        circulationdata_replacement_policy
        or ReplacementPolicy.from_license_source()
    )

    pool = self.license_pool(identifier)
    if isinstance(pool, CoverageFailure):
        # The pool lookup already describes the failure; pass it along.
        return pool

    if not circulationdata:
        e = "Did not receive circulationdata from input source"
        return CoverageFailure(
            identifier, e, data_source=self.output_source, transient=True
        )

    try:
        circulationdata.apply(
            pool, replace=circulationdata_replacement_policy,
        )
    except Exception as e:
        # Deliberate catch-all boundary: any error while applying is
        # reported as a transient failure rather than propagated.
        # Logger.warning replaces the deprecated Logger.warn alias.
        self.log.warning(
            "Error applying circulationdata to pool %d: %s",
            pool.id, e, exc_info=e
        )
        return CoverageFailure(
            identifier, repr(e), data_source=self.output_source,
            transient=True
        )

    return identifier
def test_classifications_from_another_source_not_updated(self):
    # Replacing subjects only touches classifications that came from
    # the same data source as the new metadata.

    # Set up an edition whose primary identifier has two
    # classifications.
    axis = DataSource.lookup(self._db, DataSource.AXIS_360)
    wrangler = DataSource.lookup(self._db, DataSource.METADATA_WRANGLER)
    edition = self._edition()
    identifier = edition.primary_identifier
    identifier.classify(axis, Subject.TAG, "i will persist")
    identifier.classify(wrangler, Subject.TAG, "i will perish")

    # Now we get some new metadata from source #2.
    new_subjects = [
        SubjectData(type=Subject.TAG, identifier="i will conquer"),
    ]
    metadata = Metadata(subjects=new_subjects, data_source=wrangler)
    replace_subjects = ReplacementPolicy(subjects=True)
    metadata.apply(edition, replace=replace_subjects)

    # The old classification from source #2 has been destroyed.
    # The old classification from source #1 is still there.
    remaining = sorted(
        [c.subject.identifier for c in identifier.classifications]
    )
    eq_(['i will conquer', 'i will persist'], remaining)
def _set_metadata(self, identifier, metadata,
                  metadata_replacement_policy=None):
    """Apply metadata to the Edition for an Identifier.

    The edition is obtained from ``self.edition(identifier)`` and
    ``metadata.apply()`` is called on it under the given replacement
    policy.

    :param identifier: Identifier whose Edition is updated.
    :param metadata: Object providing ``apply(edition, replace=...)``.
        A falsy value produces a transient CoverageFailure.
    :param metadata_replacement_policy: Policy passed as ``replace``;
        when falsy, ``ReplacementPolicy.from_metadata_source()`` is used.
    :return: The Identifier (if successful) or an appropriate
        CoverageFailure (if not).
    """
    metadata_replacement_policy = (
        metadata_replacement_policy
        or ReplacementPolicy.from_metadata_source()
    )

    edition = self.edition(identifier)
    if isinstance(edition, CoverageFailure):
        # The edition lookup already describes the failure; pass it along.
        return edition

    if not metadata:
        e = "Did not receive metadata from input source"
        return CoverageFailure(
            identifier, e, data_source=self.output_source, transient=True
        )

    try:
        metadata.apply(
            edition, replace=metadata_replacement_policy,
        )
    except Exception as e:
        # Deliberate catch-all boundary: any error while applying is
        # reported as a transient failure rather than propagated.
        # Logger.warning replaces the deprecated Logger.warn alias.
        self.log.warning(
            "Error applying metadata to edition %d: %s",
            edition.id, e, exc_info=e
        )
        return CoverageFailure(
            identifier, repr(e), data_source=self.output_source,
            transient=True
        )

    return identifier
def test_set_metadata_incorporates_replacement_policy(self):
    """Make sure that if a ReplacementPolicy is passed in to
    set_metadata(), the policy's settings (and those of its
    .presentation_calculation_policy) are respected.
    """
    edition, pool = self._edition(with_license_pool=True)
    identifier = edition.primary_identifier

    # All images and open-access content should be uploaded to
    # this 'mirror'.
    mirror = DummyS3Uploader()
    http = DummyHTTPClient()
    http.queue_response(
        200,
        content='I am an epub.',
        media_type=Representation.EPUB_MEDIA_TYPE,
    )

    class Tripwire(PresentationCalculationPolicy):
        # Records whether any attribute not set on the instance was
        # ever looked up.
        def __init__(self, *args, **kwargs):
            self.tripped = False

        def __getattr__(self, name):
            self.tripped = True
            return True

    tripwire_policy = Tripwire()

    metadata_replacement_policy = ReplacementPolicy(
        mirror=mirror,
        http_get=http.do_get,
        presentation_calculation_policy=tripwire_policy,
    )
    circulationdata_replacement_policy = ReplacementPolicy(
        mirror=mirror,
        http_get=http.do_get,
    )

    output_source = DataSource.lookup(self._db, DataSource.GUTENBERG)
    provider = CoverageProvider("service", [identifier.type], output_source)

    metadata = Metadata(output_source)

    # We've got a CirculationData object that includes an
    # open-access download.
    open_link = LinkData(
        rel=Hyperlink.OPEN_ACCESS_DOWNLOAD, href="http://foo.com/")
    circulationdata = CirculationData(
        output_source,
        primary_identifier=metadata.primary_identifier,
        links=[open_link],
    )

    provider.set_metadata_and_circulation_data(
        identifier,
        metadata,
        circulationdata,
        metadata_replacement_policy=metadata_replacement_policy,
        circulationdata_replacement_policy=(
            circulationdata_replacement_policy),
    )

    # The open-access download was 'downloaded' and 'mirrored'.
    [mirrored] = mirror.uploaded
    eq_("http://foo.com/", mirrored.url)
    expected_suffix = "/%s/%s.epub" % (identifier.identifier, edition.title)
    assert mirrored.mirror_url.endswith(expected_suffix)

    # The book content was removed from the db after it was
    # mirrored successfully.
    eq_(None, mirrored.content)

    # Our custom PresentationCalculationPolicy was used when
    # determining whether to recalculate the work's
    # presentation. We know this because the tripwire was
    # triggered.
    eq_(True, tripwire_policy.tripped)
def test_open_access_content_mirrored(self):
    # Open-access links must be rewritten to point at our S3 bucket,
    # while commercial (DRM-encrypted) links are left untouched.
    #
    # Caveat: these mirroring tests passing does not prove that every
    # caller now goes through CirculationData as well as Metadata.
    # That remains a risk.
    content_bucket = 'http://s3.amazonaws.com/test.content.bucket/'
    mirror = DummyS3Uploader()

    # The book under test.
    edition, pool = self._edition(with_license_pool=True)
    data_source = edition.data_source

    # An open-access link to the book's content; this one gets mirrored.
    link_mirrored = LinkData(
        rel=Hyperlink.OPEN_ACCESS_DOWNLOAD,
        href="http://example.com/",
        media_type=Representation.EPUB_MEDIA_TYPE,
        content="i am a tiny book"
    )

    # A DRM-encrypted link; this one must not be mirrored.
    link_unmirrored = LinkData(
        rel=Hyperlink.DRM_ENCRYPTED_DOWNLOAD,
        href="http://example.com/2",
        media_type=Representation.EPUB_MEDIA_TYPE,
        content="i am a pricy book"
    )

    policy = ReplacementPolicy(mirror=mirror)

    # Applying the links through Metadata must not upload anything.
    metadata = Metadata(
        data_source=data_source,
        links=[link_mirrored, link_unmirrored],
    )
    metadata.apply(edition, replace=policy)
    eq_(0, len(mirror.uploaded))

    # Applying the same links through CirculationData does upload.
    circulation_data = CirculationData(
        data_source=data_source,
        primary_identifier=edition.primary_identifier,
        links=[link_mirrored, link_unmirrored],
    )
    circulation_data.apply(pool, replace=policy)
    eq_(1, len(mirror.uploaded))

    # The single upload is the open-access book...
    (book,) = mirror.uploaded

    # ...which is still an open-access link...
    uploaded_rels = [link.rel for link in book.resource.links]
    eq_([Hyperlink.OPEN_ACCESS_DOWNLOAD], uploaded_rels)

    # ...and which now lives in the expected S3 location.
    assert book.mirror_url.startswith(content_bucket)
    expect = '/%s/%s.epub' % (
        edition.primary_identifier.identifier, edition.title
    )
    assert book.mirror_url.endswith(expect)

    # Both links are attached to the identifier. Ordering them by rel
    # puts the unmirrored representation first.
    links_by_rel = sorted(
        edition.license_pool.identifier.links,
        key=lambda edition_link: edition_link.rel,
    )
    representations = [
        edition_link.resource.representation
        for edition_link in links_by_rel
    ]
    unmirrored_representation, mirrored_representation = representations

    # The mirrored link points into the S3 bucket.
    assert mirrored_representation.mirror_url.startswith(content_bucket)

    # The unmirrored link kept its original URL and got no S3 URL.
    eq_('http://example.com/2', unmirrored_representation.url)
    eq_(None, unmirrored_representation.mirror_url)
class TitleFromExternalList(object):
    """This class helps you convert data from external lists into
    Simplified Edition and CustomListEntry objects.
    """

    def __init__(self, metadata, first_appearance, most_recent_appearance,
                 annotation):
        """Record what an external list says about one title.

        :param metadata: The list's view of the book; its `.title`,
            `.edition()`, `.apply()` and
            `.associate_with_identifiers_based_on_permanent_work_id()`
            are used by the other methods of this class.
        :param first_appearance: When the title first showed up on the
            list. If missing, falls back to the most recent appearance.
        :param most_recent_appearance: When the title most recently
            showed up on the list. If missing, the current time is used.
        :param annotation: Stored on the CustomListEntry as-is.
        """
        self.log = logging.getLogger("Title from external list")
        self.metadata = metadata
        # Resolve most_recent_appearance first so that first_appearance
        # can fall back to the *resolved* value. Otherwise, when both
        # arguments are missing, first_appearance would stay None and the
        # datetime comparison in to_custom_list_entry() would be made
        # against None.
        self.most_recent_appearance = (
            most_recent_appearance or datetime.datetime.now()
        )
        self.first_appearance = (
            first_appearance or self.most_recent_appearance
        )
        self.annotation = annotation

    def to_custom_list_entry(self, custom_list, metadata_client,
                             overwrite_old_data=False):
        """Turn this object into a CustomListEntry with associated Edition.

        :param custom_list: The list the entry belongs to; its database
            session is used for all lookups.
        :param metadata_client: Passed through to to_edition() and to the
            entry's set_work().
        :param overwrite_old_data: Passed through to to_edition().
        :return: A 2-tuple (CustomListEntry, is_new).
        """
        _db = Session.object_session(custom_list)
        # NOTE(review): to_edition() returns None when the metadata has
        # no corresponding edition; in that case the entry below is
        # looked up/created with edition=None. Confirm that is intended.
        edition = self.to_edition(_db, metadata_client, overwrite_old_data)

        list_entry, is_new = get_one_or_create(
            _db, CustomListEntry, edition=edition, customlist=custom_list
        )

        # Move the entry's first appearance earlier if we now know of an
        # earlier sighting (or if it has none recorded yet).
        if (not list_entry.first_appearance
                or list_entry.first_appearance > self.first_appearance):
            if list_entry.first_appearance:
                self.log.info(
                    "I thought %s first showed up at %s, but then I saw it earlier, at %s!",
                    self.metadata.title, list_entry.first_appearance,
                    self.first_appearance
                )
            list_entry.first_appearance = self.first_appearance

        # Likewise, move the most recent appearance later if we have a
        # later sighting (or if it has none recorded yet).
        if (not list_entry.most_recent_appearance
                or list_entry.most_recent_appearance < self.most_recent_appearance):
            if list_entry.most_recent_appearance:
                self.log.info(
                    "I thought %s most recently showed up at %s, but then I saw it later, at %s!",
                    self.metadata.title, list_entry.most_recent_appearance,
                    self.most_recent_appearance
                )
            list_entry.most_recent_appearance = self.most_recent_appearance

        list_entry.annotation = self.annotation

        list_entry.set_work(self.metadata, metadata_client)
        return list_entry, is_new

    def to_edition(self, _db, metadata_client, overwrite_old_data=False):
        """Create or update an Edition object for this list item.

        We have two goals here:

        1. Make sure there is an Edition representing the list's view
        of the data.

        2. If at all possible, connect the Edition's primary
        identifier to other identifiers in the system, identifiers
        which may have associated LicensePools. This can happen in two
        ways:

        2a. The Edition's primary identifier, or other identifiers
        associated with the Edition, may be directly associated with
        LicensePools. This can happen if a book's list entry includes
        (e.g.) an Overdrive ID.

        2b. The Edition's permanent work ID may identify it as the
        same work as other Editions in the system. In that case this
        Edition's primary identifier may be associated with the other
        Editions' primary identifiers. (p=0.85)

        :param _db: Database session.
        :param metadata_client: Passed through to Metadata.apply().
        :param overwrite_old_data: If true, apply the metadata with a
            metadata-source replacement policy; otherwise with an
            append-only policy.
        :return: The Edition, or None if the metadata has no
            corresponding edition.
        """
        self.log.info("Converting %s to an Edition object.", self.metadata.title)

        # Make sure the Metadata object's view of the book is present
        # as an Edition. This will also associate all its identifiers
        # with its primary identifier, and calculate the permanent work
        # ID if possible.
        try:
            edition, is_new = self.metadata.edition(_db)
        except ValueError:
            self.log.info(
                "Ignoring %s, no corresponding edition.", self.metadata.title
            )
            return None

        # Either way, apply the metadata even if it does not look like
        # it has been updated.
        if overwrite_old_data:
            policy = ReplacementPolicy.from_metadata_source(
                even_if_not_apparently_updated=True
            )
        else:
            policy = ReplacementPolicy.append_only(
                even_if_not_apparently_updated=True
            )
        self.metadata.apply(
            edition=edition,
            metadata_client=metadata_client,
            replace=policy,
        )
        self.metadata.associate_with_identifiers_based_on_permanent_work_id(
            _db)
        return edition
def test_image_scale_and_mirror(self):
    # Make sure that cover image links handed to Metadata.apply() are
    # 'mirrored', and that a scaled thumbnail is generated from the
    # full-size image and mirrored alongside it.

    # Note: mirroring links is now also CirculationData's job. So the unit tests
    # that test for that have been changed to call to mirror cover images.
    # However, updated tests passing does not guarantee that all code now
    # correctly calls on CirculationData, too. This is a risk.

    mirror = DummyS3Uploader()
    edition, pool = self._edition(with_license_pool=True)

    # Read the fixtures as binary data and close the files promptly
    # instead of leaking the handles.
    with open(self.sample_cover_path("test-book-cover.png"), "rb") as cover_file:
        content = cover_file.read()
    l1 = LinkData(
        rel=Hyperlink.IMAGE, href="http://example.com/",
        media_type=Representation.JPEG_MEDIA_TYPE,
        content=content
    )

    with open(self.sample_cover_path("tiny-image-cover.png"), "rb") as thumbnail_file:
        thumbnail_content = thumbnail_file.read()
    # NOTE(review): thumbnail_content is read but never used -- the
    # thumbnail link below is given the full-size `content`. Presumably
    # `content=thumbnail_content` was intended; left unchanged here
    # because switching it could alter what this test exercises.
    l2 = LinkData(
        rel=Hyperlink.THUMBNAIL_IMAGE, href="http://example.com/thumb.jpg",
        media_type=Representation.JPEG_MEDIA_TYPE,
        content=content
    )

    # When we call metadata.apply, all image links will be scaled and
    # 'mirrored'.
    policy = ReplacementPolicy(mirror=mirror)
    metadata = Metadata(links=[l1, l2], data_source=edition.data_source)
    metadata.apply(edition, replace=policy)

    # Two Representations were 'mirrored'.
    image, thumbnail = mirror.uploaded

    # The image...
    [image_link] = image.resource.links
    eq_(Hyperlink.IMAGE, image_link.rel)

    # And its thumbnail.
    eq_(image, thumbnail.thumbnail_of)

    # The original image is too big to be a thumbnail.
    eq_(600, image.image_height)
    eq_(400, image.image_width)

    # The thumbnail is the right height.
    eq_(Edition.MAX_THUMBNAIL_HEIGHT, thumbnail.image_height)
    eq_(Edition.MAX_THUMBNAIL_WIDTH, thumbnail.image_width)

    # The thumbnail is newly generated from the full-size
    # image--the thumbnail that came in from the OPDS feed was
    # ignored.
    assert thumbnail.url != l2.href
    assert thumbnail.content != l2.content

    # Both images have been 'mirrored' to Amazon S3.
    assert image.mirror_url.startswith(
        'http://s3.amazonaws.com/test.cover.bucket/')
    assert image.mirror_url.endswith('cover.jpg')

    # The thumbnail image has been converted to PNG.
    assert thumbnail.mirror_url.startswith(
        'http://s3.amazonaws.com/test.cover.bucket/scaled/300/')
    assert thumbnail.mirror_url.endswith('cover.png')