def test_replacement_policy_uses_provided_mirror(self):
    """An Overdrive coverage provider sends resources it discovers
    through the mirror configured on its ReplacementPolicy.
    """
    collection = MockOverdriveAPI.mock_collection(self._db)
    mirror = MockS3Uploader()
    replacement_policy = ReplacementPolicy.from_metadata_source(
        mirror=mirror
    )
    api = MockOverdriveAPI(self._db, collection)
    api.queue_collection_token()
    provider = OverdriveBibliographicCoverageProvider(
        collection, replacement_policy=replacement_policy, api_class=api
    )

    # Any resources discovered by Overdrive will be
    # sent through this mirror.
    eq_(mirror, provider.replacement_policy.mirror)

    http = DummyHTTPClient()
    provider.replacement_policy.http_get = http.do_get

    # Now let's try looking up a specific identifier through 'Overdrive'.
    identifier = self._identifier(
        Identifier.OVERDRIVE_ID, "3896665d-9d81-4cac-bd43-ffc5066de1f5"
    )

    body = self.data_file("overdrive/overdrive_metadata.json")
    provider.api.queue_response(200, {}, body)

    test_cover = self.data_file("covers/test-book-cover.png")
    test_small_cover = self.data_file("covers/tiny-image-cover.png")

    # Overdrive's full-sized image -- we will be creating our own
    # thumbnail from this.
    http.queue_response(200, "image/jpeg", {}, test_cover)

    # Overdrive's thumbnail image -- we will not be using this.
    http.queue_response(200, "image/jpeg", {}, test_small_cover)

    record = provider.ensure_coverage(identifier)
    eq_("success", record.status)

    # The full image and the thumbnail have been uploaded to
    # the fake S3.
    full, thumbnail = mirror.uploaded
    eq_(test_cover, full.content)

    # The URLs for the Resource objects are our S3 URLs, not Overdrive's
    # URLs.
    expect = "Overdrive/Overdrive+ID/%s" % identifier.identifier
    for url in [full.mirror_url, thumbnail.mirror_url]:
        assert expect in url
    assert "/scaled/" in thumbnail.mirror_url
    assert "/scaled/" not in full.mirror_url

    # The thumbnail is a newly created image that is not the
    # same as the full image or the test cover.
    assert thumbnail.content != test_small_cover
    assert thumbnail.content != test_cover
def test_mirror_open_access_link_mirror_failure(self):
    """If the fetch succeeds but mirroring fails, the failure is
    recorded on the Representation and the license pool is suppressed.
    """
    # A books mirror that always fails; no covers mirror at all.
    mirrors = dict(books_mirror=MockS3Uploader(fail=True), covers_mirror=None)
    h = DummyHTTPClient()

    edition, pool = self._edition(with_license_pool=True)

    data_source = DataSource.lookup(self._db, DataSource.GUTENBERG)
    policy = ReplacementPolicy(mirrors=mirrors, http_get=h.do_get)
    circulation_data = CirculationData(
        data_source=edition.data_source,
        primary_identifier=edition.primary_identifier,
    )

    link = LinkData(
        rel=Hyperlink.OPEN_ACCESS_DOWNLOAD,
        media_type=Representation.EPUB_MEDIA_TYPE,
        href=self._url,
    )

    link_obj, ignore = edition.primary_identifier.add_link(
        rel=link.rel,
        href=link.href,
        data_source=data_source,
        media_type=link.media_type,
        content=link.content,
    )

    # The fetch itself will succeed.
    h.queue_response(200, media_type=Representation.EPUB_MEDIA_TYPE)

    circulation_data.mirror_link(pool, data_source, link, link_obj, policy)

    representation = link_obj.resource.representation

    # The representation was fetched successfully.
    assert None == representation.fetch_exception
    assert representation.fetched_at != None

    # But mirroring failed.
    assert representation.mirror_exception != None
    assert None == representation.mirrored_at
    assert link.media_type == representation.media_type
    assert link.href == representation.url

    # The mirror url was never set.
    assert None == representation.mirror_url

    # Book content is still there since it wasn't mirrored.
    assert representation.content != None

    # The license pool is suppressed when mirroring fails.
    assert True == pool.suppressed
    assert representation.mirror_exception in pool.license_exception
def test_replacement_policy_uses_provided_mirror(self):
    """An Overdrive coverage provider sends resources it discovers
    through the mirror configured on its ReplacementPolicy.

    NOTE(review): this appears to duplicate an earlier test of the
    same name in this file -- confirm whether both copies are needed.
    """
    collection = MockOverdriveAPI.mock_collection(self._db)
    mirror = MockS3Uploader()
    replacement_policy = ReplacementPolicy.from_metadata_source(
        mirror=mirror)
    api = MockOverdriveAPI(self._db, collection)
    api.queue_collection_token()
    provider = OverdriveBibliographicCoverageProvider(
        collection, replacement_policy=replacement_policy, api_class=api)

    # Any resources discovered by Overdrive will be
    # sent through this mirror.
    eq_(mirror, provider.replacement_policy.mirror)

    http = DummyHTTPClient()
    provider.replacement_policy.http_get = http.do_get

    # Now let's try looking up a specific identifier through 'Overdrive'.
    identifier = self._identifier(Identifier.OVERDRIVE_ID,
                                  "3896665d-9d81-4cac-bd43-ffc5066de1f5")

    body = self.data_file("overdrive/overdrive_metadata.json")
    provider.api.queue_response(200, {}, body)

    test_cover = self.data_file("covers/test-book-cover.png")
    test_small_cover = self.data_file("covers/tiny-image-cover.png")

    # Overdrive's full-sized image -- we will be creating our own
    # thumbnail from this.
    http.queue_response(200, "image/jpeg", {}, test_cover)

    # Overdrive's thumbnail image -- we will not be using this.
    http.queue_response(200, "image/jpeg", {}, test_small_cover)

    record = provider.ensure_coverage(identifier)
    eq_("success", record.status)

    # The full image and the thumbnail have been uploaded to
    # the fake S3.
    full, thumbnail = mirror.uploaded
    eq_(test_cover, full.content)

    # The URLs for the Resource objects are our S3 URLs, not Overdrive's
    # URLs.
    expect = "Overdrive/Overdrive+ID/%s" % identifier.identifier
    for url in [full.mirror_url, thumbnail.mirror_url]:
        assert expect in url
    assert "/scaled/" in thumbnail.mirror_url
    assert "/scaled/" not in full.mirror_url

    # The thumbnail is a newly created image that is not the
    # same as the full image or the test cover.
    assert thumbnail.content != test_small_cover
    assert thumbnail.content != test_cover
def test_500_creates_uncachable_representation(self):
    """A 500 response is never served from cache; every lookup of the
    URL goes back to the server.
    """
    client = DummyHTTPClient()
    target = self._url

    # Fetch the same URL twice. Since a 500 response is not cached,
    # `cached` is False on both passes.
    for _ in range(2):
        client.queue_response(500)
        representation, cached = Representation.get(
            self._db, target, do_get=client.do_get
        )
        assert False == cached
def test_get_with_url_normalizer(self):
    """A Resource can be stored under a URL other than the exact URL
    used to make the HTTP request.
    """

    class SessionStripper(object):
        # Records the last URL it was asked to normalize.
        called_with = None

        def normalize(self, url):
            # Strip off a session ID from an outgoing URL.
            self.called_with = url
            return url[:11]

    stripper = SessionStripper()
    client = DummyHTTPClient()
    client.queue_response(200, content="yay")

    request_url = "http://url/?sid=12345"
    representation, from_cache = Representation.get(
        self._db,
        request_url,
        do_get=client.do_get,
        url_normalizer=stripper.normalize,
    )

    # The original URL was used to make the actual request.
    assert [request_url] == client.requests

    # The original URL was then passed into the normalizer.
    assert request_url == stripper.called_with

    # And the normalized URL was used as the Representation's
    # storage key.
    storage_url = "http://url/"
    assert "yay" == representation.content.decode("utf-8")
    assert storage_url == representation.url
    assert False == from_cache

    # Try again, and the Representation is retrieved from cache under
    # the normalized URL.
    #
    # Replace do_get with a dud object to prove that no second
    # request goes out 'over the wire'.
    representation2, from_cache = Representation.get(
        self._db,
        request_url,
        do_get=object(),
        url_normalizer=stripper.normalize,
    )
    assert True == from_cache
    assert representation2 == representation
    assert storage_url == representation.url
def test_response_reviewer_impacts_representation(self):
    """An exception raised by a response_reviewer is recorded as the
    Representation's fetch exception, and the result is not cached.
    """
    client = DummyHTTPClient()
    client.queue_response(200, media_type="text/html")

    def reject_html(response):
        # The reviewer receives the raw (status, headers, content) tuple.
        status, headers, content = response
        if "html" in headers["content-type"]:
            raise Exception("No. Just no.")

    representation, cached = Representation.get(
        self._db, self._url, do_get=client.do_get,
        response_reviewer=reject_html
    )
    assert "No. Just no." in representation.fetch_exception
    assert False == cached
def test_302_creates_cachable_representation(self):
    """A 302 response is cached; a second lookup of the same URL is
    served from the database rather than the network.
    """
    client = DummyHTTPClient()
    client.queue_response(302)

    target = self._url

    # The first fetch goes over the (mock) network.
    first, cached = Representation.get(self._db, target, do_get=client.do_get)
    assert False == cached

    # The second fetch is a cache hit on the same Representation.
    second, cached = Representation.get(self._db, target, do_get=client.do_get)
    assert True == cached
    assert first == second
def test_mirror_open_access_link_fetch_failure(self):
    """If the initial fetch of an open-access link fails, the failure
    is recorded on the Representation, nothing is mirrored, and the
    license pool is suppressed.
    """
    mirrors = dict(books_mirror=MockS3Uploader())
    h = DummyHTTPClient()

    edition, pool = self._edition(with_license_pool=True)

    data_source = DataSource.lookup(self._db, DataSource.GUTENBERG)
    policy = ReplacementPolicy(mirrors=mirrors, http_get=h.do_get)

    circulation_data = CirculationData(
        data_source=edition.data_source,
        primary_identifier=edition.primary_identifier,
    )

    link = LinkData(
        rel=Hyperlink.OPEN_ACCESS_DOWNLOAD,
        media_type=Representation.EPUB_MEDIA_TYPE,
        href=self._url,
    )

    link_obj, ignore = edition.primary_identifier.add_link(
        rel=link.rel,
        href=link.href,
        data_source=data_source,
        media_type=link.media_type,
        content=link.content,
    )

    # The server refuses to serve the book.
    h.queue_response(403)

    circulation_data.mirror_link(pool, data_source, link, link_obj, policy)

    representation = link_obj.resource.representation

    # Fetch failed, so we should have a fetch exception but no mirror url.
    assert representation.fetch_exception != None
    assert None == representation.mirror_exception
    assert None == representation.mirror_url
    assert link.href == representation.url
    assert representation.fetched_at != None
    assert None == representation.mirrored_at

    # The license pool is suppressed when fetch fails.
    assert True == pool.suppressed
    assert representation.fetch_exception in pool.license_exception
def test_presumed_media_type(self):
    """presumed_media_type fills in for a missing or generic
    content-type header, but never overrides a specific one.
    """
    client = DummyHTTPClient()

    cases = [
        # (content-type header, media type expected on the Representation)
        (None, "text/xml"),                        # no header at all
        ("application/octet-stream", "text/xml"),  # generic header
        ("text/plain", "text/plain"),              # specific header wins
    ]
    for header, expected in cases:
        client.queue_response(200, header, content="content")
        representation, cached = Representation.get(
            self._db,
            "http://url",
            do_get=client.do_get,
            max_age=0,
            presumed_media_type="text/xml",
        )
        assert expected == representation.media_type
class TestFeedbooksOPDSImporter(DatabaseTest):
    """Tests for FeedbooksOPDSImporter: configuration, feed parsing,
    description improvement, and mirroring behavior.
    """

    def _importer(self, **settings):
        """Create a Collection and a FeedbooksOPDSImporter for it.

        Keyword settings override the defaults; a setting whose value
        is None is left unset. The special 'language' setting becomes
        the collection's external_account_id (default 'de').
        """
        collection = self._collection(
            name=DataSource.FEEDBOOKS + self._str,
            protocol=ExternalIntegration.FEEDBOOKS,
        )
        defaults = {
            FeedbooksOPDSImporter.REALLY_IMPORT_KEY: "true",
            FeedbooksOPDSImporter.REPLACEMENT_CSS_KEY: None,
        }
        for setting, value in defaults.items():
            if setting not in settings:
                settings[setting] = value

        collection.external_account_id = settings.pop('language', 'de')
        for setting, value in settings.items():
            if value is None:
                continue
            collection.external_integration.set_setting(setting, value)

        return collection, FeedbooksOPDSImporter(
            self._db, collection,
            http_get=self.http.do_get,
            mirror=self.mirror,
            metadata_client=self.metadata,
        )

    def setup(self):
        super(TestFeedbooksOPDSImporter, self).setup()
        self.http = DummyHTTPClient()
        self.metadata = DummyMetadataClient()
        self.mirror = MockS3Uploader()
        self.data_source = DataSource.lookup(self._db, DataSource.FEEDBOOKS)

        # Create a default importer that's good enough for most tests.
        self.collection, self.importer = self._importer()

    def sample_file(self, filename):
        # Test fixtures live under the "feedbooks" sample-data directory.
        return sample_data(filename, "feedbooks")

    def test_safety_switch(self):
        """The importer won't be instantiated if REALLY_IMPORT_KEY is not
        set to true.
        """
        settings = {FeedbooksOPDSImporter.REALLY_IMPORT_KEY: "false"}
        assert_raises_regexp(
            Exception, "configured to not actually do an import",
            self._importer, **settings
        )

    def test_unique_identifier(self):
        # The unique account ID is the language of the Feedbooks
        # feed in use.
        eq_('de', self.collection.unique_account_id)

    def test_error_retrieving_replacement_css(self):
        """The importer cannot be instantiated if a replacement CSS
        is specified but the replacement CSS document cannot be
        retrieved or does not appear to be CSS.
        """
        settings = {FeedbooksOPDSImporter.REPLACEMENT_CSS_KEY: "http://foo"}

        # The CSS URL returns a server error.
        self.http.queue_response(500, content="An error message")
        assert_raises_regexp(
            IOError, "Replacement stylesheet URL returned 500 response code",
            self._importer, **settings
        )

        # The CSS URL returns something that isn't CSS.
        self.http.queue_response(
            200, content="We have many CSS offerings",
            media_type="text/html"
        )
        assert_raises_regexp(
            IOError,
            "Replacement stylesheet is 'text/html', not a CSS document.",
            self._importer, **settings
        )

    def test_extract_feed_data_improves_descriptions(self):
        feed = self.sample_file("feed.atom")
        self.http.queue_response(
            200, OPDSFeed.ENTRY_TYPE, content=self.sample_file("677.atom")
        )
        metadata, failures = self.importer.extract_feed_data(
            feed, "http://url/"
        )
        [(key, value)] = metadata.items()
        eq_(u'http://www.feedbooks.com/book/677', key)
        eq_("Discourse on the Method", value.title)

        # Instead of the short description from feed.atom, we have the
        # long description from 677.atom.
        [description] = [x for x in value.links
                         if x.rel == Hyperlink.DESCRIPTION]
        eq_(1818, len(description.content))

    def test_improve_description(self):
        # Here's a Metadata that has a bad (truncated) description.
        metadata = Metadata(self.data_source)
        bad_description = LinkData(
            rel=Hyperlink.DESCRIPTION,
            media_type="text/plain",
            content=u"The Discourse on the Method is a philosophical and mathematical treatise published by Ren\xe9 Descartes in 1637. Its full name is Discourse on the Method of Rightly Conducting the Reason, and Searching for Truth in the Sciences (French title: Discour..."
        )

        irrelevant_description = LinkData(
            rel=Hyperlink.DESCRIPTION, media_type="text/plain",
            content="Don't look at me; I'm irrelevant!"
        )

        # Sending an HTTP request to this URL is going to give a 404 error.
        alternate = LinkData(
            rel=Hyperlink.ALTERNATE, href="http://foo/",
            media_type=OPDSFeed.ENTRY_TYPE
        )

        # We're not even going to try to send an HTTP request to this URL
        # because it doesn't promise an OPDS entry.
        alternate2 = LinkData(
            rel=Hyperlink.ALTERNATE, href="http://bar/",
            media_type="text/html"
        )

        # But this URL will give us full information about this
        # entry, including a better description.
        alternate3 = LinkData(
            rel=Hyperlink.ALTERNATE, href="http://baz/",
            media_type=OPDSFeed.ENTRY_TYPE
        )

        # This URL will not be requested because the third alternate URL
        # gives us the answer we're looking for.
        alternate4 = LinkData(
            rel=Hyperlink.ALTERNATE, href="http://qux/",
            media_type=OPDSFeed.ENTRY_TYPE
        )

        # Two requests will be made. The first will result in a 404
        # error. The second will give us an OPDS entry.
        self.http.queue_response(404, content="Not found")
        self.http.queue_response(
            200, OPDSFeed.ENTRY_TYPE, content=self.sample_file("677.atom")
        )

        metadata.links = [bad_description, irrelevant_description,
                          alternate, alternate2, alternate3, alternate4]
        self.importer.improve_description("some ID", metadata)

        # The descriptions have been removed from metadata.links,
        # because 677.atom included a description we know was better.
        #
        # The incomplete description was removed even though 677.atom
        # also included a copy of it.
        assert bad_description not in metadata.links
        assert irrelevant_description not in metadata.links

        # The more complete description from 677.atom has been added.
        [good_description] = [
            x for x in metadata.links
            if x.rel == Hyperlink.DESCRIPTION
        ]

        # The four alternate links have not been touched.
        assert (alternate in metadata.links)
        assert (alternate2 in metadata.links)
        assert (alternate3 in metadata.links)
        assert (alternate4 in metadata.links)

        # Two HTTP requests were made.
        eq_(['http://foo/', 'http://baz/'], self.http.requests)

    def test_generic_acquisition_epub_link_picked_up_as_open_access(self):
        """The OPDS feed has links with generic OPDS "acquisition"
        relations. We know that the EPUB link should be open-access
        relations, and we modify its relation on the way in.

        We do not modify the link relation for links to the other
        formats, which means they don't get picked up at all.
        """
        feed = self.sample_file("feed_with_open_access_book.atom")
        imports, errors = self.importer.extract_feed_data(feed)
        [book] = imports.values()
        open_access_links = [
            x for x in book.circulation.links
            if x.rel == Hyperlink.OPEN_ACCESS_DOWNLOAD
        ]
        links = sorted(x.href for x in open_access_links)
        eq_(['http://www.feedbooks.com/book/677.epub'], links)

        generic_links = [
            x for x in book.circulation.links
            if x.rel == Hyperlink.GENERIC_OPDS_ACQUISITION
        ]
        eq_([], generic_links)

    def test_open_access_book_modified_and_mirrored(self):
        # If no replacement CSS is specified (this is the case with
        # the default importer), the OPDSImporter.content_modifier
        # method is not assigned.
        eq_(None, self.importer.new_css)
        eq_(None, self.importer.content_modifier)

        # Let's create an importer that does specify a replacement
        # CSS file.
        settings = {
            FeedbooksOPDSImporter.REPLACEMENT_CSS_KEY: "http://css/"
        }

        # The very first request made is going to be to the
        # REPLACEMENT_CSS_KEY URL.
        self.http.queue_response(
            200, content="Some new CSS",
            media_type="text/css",
        )
        ignore, importer = self._importer(**settings)

        # The replacement CSS is retrieved during the FeedbooksImporter
        # constructor.
        eq_([u'http://css/'], self.http.requests)

        # OPDSImporter.content_modifier has been set to call replace_css
        # when necessary.
        eq_("Some new CSS", importer.new_css)
        eq_(importer.replace_css, importer.content_modifier)

        # The requests to the various copies of the book will succeed,
        # and the books will be mirrored.
        self.http.queue_response(
            200, content=self.sample_file("677.epub"),
            media_type=Representation.EPUB_MEDIA_TYPE
        )

        # The request to
        # http://covers.feedbooks.net/book/677.jpg?size=large&t=1428398185'
        # will result in a 404 error, and the image will not be
        # mirrored.
        self.http.queue_response(404, media_type="text/plain")

        self.metadata.lookups = {u"René Descartes": "Descartes, Rene"}
        feed = self.sample_file("feed_with_open_access_book.atom")
        self.http.queue_response(
            200, OPDSFeed.ACQUISITION_FEED_TYPE, content=feed
        )

        [edition], [pool], [work], failures = importer.import_from_feed(
            feed, immediately_presentation_ready=True,
        )

        eq_({}, failures)

        # The work has been created and has metadata.
        eq_("Discourse on the Method", work.title)
        eq_(u'Ren\xe9 Descartes', work.author)

        # Two more mock HTTP requests have now been made.
        eq_(
            [
                u'http://css/',
                u'http://www.feedbooks.com/book/677.epub',
                u'http://covers.feedbooks.net/book/677.jpg?size=large&t=1428398185',
            ],
            self.http.requests
        )

        # The EPUB was 'uploaded' to the mock S3 service and turned
        # into a LicensePoolDeliveryMechanism. The other formats were
        # ignored.
        [mechanism] = pool.delivery_mechanisms
        eq_('https://s3.amazonaws.com/test.content.bucket/FeedBooks/URI/http%3A%2F%2Fwww.feedbooks.com%2Fbook%2F677/Discourse+on+the+Method.epub',
            mechanism.resource.representation.mirror_url
        )
        eq_(u'application/epub+zip', mechanism.delivery_mechanism.content_type)

        # From information contained in the OPDS entry we determined
        # the book's license to be CC-BY-NC.
        eq_(u'https://creativecommons.org/licenses/by-nc/4.0',
            mechanism.rights_status.uri)

        # The pool is marked as open-access, because it has an open-access
        # delivery mechanism that was mirrored.
        eq_(True, pool.open_access)

        # The mirrored content contains the modified CSS.
        content = StringIO(self.mirror.content[0])
        with ZipFile(content) as zip:
            # The zip still contains the original epub's files.
            assert "META-INF/container.xml" in zip.namelist()
            assert "OPS/css/about.css" in zip.namelist()
            assert "OPS/main0.xml" in zip.namelist()

            # The content of an old file hasn't changed.
            with zip.open("mimetype") as f:
                eq_("application/epub+zip\r\n", f.read())

            # The content of CSS files has been changed to the new value.
            with zip.open("OPS/css/about.css") as f:
                eq_("Some new CSS", f.read())

    def test_in_copyright_book_not_mirrored(self):
        self.metadata.lookups = {u"René Descartes": "Descartes, Rene"}
        feed = self.sample_file("feed_with_in_copyright_book.atom")
        self.http.queue_response(
            200, OPDSFeed.ACQUISITION_FEED_TYPE, content=feed
        )

        # FIX: the feed was previously imported twice, with the first
        # result discarded -- one import is sufficient.
        [edition], [pool], [work], failures = self.importer.import_from_feed(
            feed, immediately_presentation_ready=True,
        )

        # The work has been created and has metadata.
        eq_("Discourse on the Method", work.title)
        eq_(u'Ren\xe9 Descartes', work.author)

        # No mock HTTP requests were made.
        eq_([], self.http.requests)

        # Nothing was uploaded to the mock S3.
        eq_([], self.mirror.uploaded)

        # The LicensePool's delivery mechanism is set appropriately
        # to reflect an in-copyright work.
        [mechanism] = pool.delivery_mechanisms
        eq_(RightsStatus.IN_COPYRIGHT, mechanism.rights_status.uri)

        # The DeliveryMechanism has a Representation but the Representation
        # has not been set as mirrored, because nothing was uploaded.
        rep = mechanism.resource.representation
        eq_('http://www.feedbooks.com/book/677.epub', rep.url)
        eq_(None, rep.mirror_url)
        eq_(None, rep.mirror_exception)

        # The pool is not marked as open-access because although it
        # has open-access links, they're not licensed under terms we
        # can use.
        eq_(False, pool.open_access)
def test_lookup(self):
    # Test the lookup() method.
    h = DummyHTTPClient()
    h.queue_response(200, "text/html", content="yay")

    class Mock(NoveListAPI):
        # Override the hooks lookup() calls, recording the argument
        # each one received and returning a recognizable sentinel.
        def build_query_url(self, params):
            self.build_query_url_called_with = params
            return "http://query-url/"

        def scrubbed_url(self, params):
            self.scrubbed_url_called_with = params
            return "http://scrubbed-url/"

        def review_response(self, response):
            self.review_response_called_with = response

        def lookup_info_to_metadata(self, representation):
            self.lookup_info_to_metadata_called_with = representation
            return "some metadata"

    novelist = Mock.from_config(self._default_library)
    identifier = self._identifier(identifier_type=Identifier.ISBN)

    # Do the lookup.
    result = novelist.lookup(identifier, do_get=h.do_get)

    # A number of parameters were passed into build_query_url() to
    # get the URL of the HTTP request. The same parameters were
    # also passed into scrubbed_url(), to get the URL that should
    # be used when storing the Representation in the database.
    params1 = novelist.build_query_url_called_with
    params2 = novelist.scrubbed_url_called_with
    assert params1 == params2
    assert (
        dict(
            profile=novelist.profile,
            ClientIdentifier=identifier.urn,
            ISBN=identifier.identifier,
            password=novelist.password,
            version=novelist.version,
        )
        == params1
    )

    # The HTTP request went out to the query URL -- not the scrubbed URL.
    assert ["http://query-url/"] == h.requests

    # The HTTP response was passed into novelist.review_response().
    assert (
        200,
        {"content-type": "text/html"},
        b"yay",
    ) == novelist.review_response_called_with

    # Finally, the Representation was passed into
    # lookup_info_to_metadata, which returned a hard-coded string
    # as the final result.
    assert "some metadata" == result

    # Looking at the Representation we can see that it was stored
    # in the database under its scrubbed URL, not the URL used to
    # make the request.
    rep = novelist.lookup_info_to_metadata_called_with
    assert "http://scrubbed-url/" == rep.url
    assert b"yay" == rep.content
def test_cautious_http_get(self):
    """Exercise Representation.cautious_http_get against safe, unsafe,
    and potentially-unsafe domains.
    """
    h = DummyHTTPClient()
    h.queue_response(200, content="yay")

    # If the domain is obviously safe, the GET request goes through,
    # with no HEAD request being made. (Passing object() for the
    # clients that must not be used would raise if they were called.)
    m = Representation.cautious_http_get
    status, headers, content = m(
        "http://safe.org/",
        {},
        do_not_access=["unsafe.org"],
        do_get=h.do_get,
        cautious_head_client=object(),
    )
    assert 200 == status
    assert b"yay" == content

    # If the domain is obviously unsafe, no GET request or HEAD
    # request is made.
    status, headers, content = m(
        "http://unsafe.org/",
        {},
        do_not_access=["unsafe.org"],
        do_get=object(),
        cautious_head_client=object(),
    )
    assert 417 == status
    assert (
        "Cautiously decided not to make a GET request to http://unsafe.org/"
        == content)

    # If the domain is potentially unsafe, a HEAD request is made,
    # and the answer depends on its outcome.

    # Here, the HEAD request redirects to a prohibited site.
    def mock_redirect(*args, **kwargs):
        return MockRequestsResponse(301, dict(location="http://unsafe.org/"))

    status, headers, content = m(
        "http://caution.org/",
        {},
        do_not_access=["unsafe.org"],
        check_for_redirect=["caution.org"],
        do_get=object(),
        cautious_head_client=mock_redirect,
    )
    assert 417 == status
    assert ("application/vnd.librarysimplified-did-not-make-request"
            == headers["content-type"])
    assert (
        "Cautiously decided not to make a GET request to http://caution.org/"
        == content)

    # Here, the HEAD request redirects to an allowed site.
    h.queue_response(200, content="good content")

    # NOTE: this deliberately shadows the mock above -- the new
    # version redirects to the safe site instead.
    def mock_redirect(*args, **kwargs):
        return MockRequestsResponse(301, dict(location="http://safe.org/"))

    status, headers, content = m(
        "http://caution.org/",
        {},
        do_not_access=["unsafe.org"],
        check_for_redirect=["caution.org"],
        do_get=h.do_get,
        cautious_head_client=mock_redirect,
    )
    assert 200 == status
    assert b"good content" == content
class TestFeedbooksOPDSImporter(DatabaseTest):
    """Tests for FeedbooksOPDSImporter: configuration, feed parsing,
    description improvement, and mirroring behavior.

    NOTE(review): this class appears to duplicate an earlier class of
    the same name in this file (with minor differences in keyword
    arguments passed to import_from_feed); the later definition
    shadows the earlier one at import time -- confirm which version
    is intended.
    """

    def _importer(self, **settings):
        """Create a Collection and a FeedbooksOPDSImporter for it.

        Keyword settings override the defaults; a setting whose value
        is None is left unset. The special 'language' setting becomes
        the collection's external_account_id (default 'de').
        """
        collection = self._collection(
            name=DataSource.FEEDBOOKS + self._str,
            protocol=ExternalIntegration.FEEDBOOKS,
        )
        defaults = {
            FeedbooksOPDSImporter.REALLY_IMPORT_KEY: "true",
            FeedbooksOPDSImporter.REPLACEMENT_CSS_KEY: None,
        }
        for setting, value in defaults.items():
            if setting not in settings:
                settings[setting] = value

        collection.external_account_id = settings.pop('language', 'de')
        for setting, value in settings.items():
            if value is None:
                continue
            collection.external_integration.set_setting(setting, value)

        return collection, FeedbooksOPDSImporter(
            self._db, collection,
            http_get=self.http.do_get,
            mirror=self.mirror,
            metadata_client=self.metadata,
        )

    def setup(self):
        super(TestFeedbooksOPDSImporter, self).setup()
        self.http = DummyHTTPClient()
        self.metadata = DummyMetadataClient()
        self.mirror = MockS3Uploader()
        self.data_source = DataSource.lookup(self._db, DataSource.FEEDBOOKS)

        # Create a default importer that's good enough for most tests.
        self.collection, self.importer = self._importer()

    def sample_file(self, filename):
        # Test fixtures live under the "feedbooks" sample-data directory.
        return sample_data(filename, "feedbooks")

    def test_safety_switch(self):
        """The importer won't be instantiated if REALLY_IMPORT_KEY is not
        set to true.
        """
        settings = {FeedbooksOPDSImporter.REALLY_IMPORT_KEY: "false"}
        assert_raises_regexp(
            Exception, "configured to not actually do an import",
            self._importer, **settings
        )

    def test_unique_identifier(self):
        # The unique account ID is the language of the Feedbooks
        # feed in use.
        eq_('de', self.collection.unique_account_id)

    def test_error_retrieving_replacement_css(self):
        """The importer cannot be instantiated if a replacement CSS
        is specified but the replacement CSS document cannot be
        retrieved or does not appear to be CSS.
        """
        settings = {FeedbooksOPDSImporter.REPLACEMENT_CSS_KEY: "http://foo"}

        # The CSS URL returns a server error.
        self.http.queue_response(500, content="An error message")
        assert_raises_regexp(
            IOError, "Replacement stylesheet URL returned 500 response code",
            self._importer, **settings
        )

        # The CSS URL returns something that isn't CSS.
        self.http.queue_response(
            200, content="We have many CSS offerings",
            media_type="text/html"
        )
        assert_raises_regexp(
            IOError,
            "Replacement stylesheet is 'text/html', not a CSS document.",
            self._importer, **settings
        )

    def test_extract_feed_data_improves_descriptions(self):
        feed = self.sample_file("feed.atom")
        self.http.queue_response(
            200, OPDSFeed.ENTRY_TYPE, content=self.sample_file("677.atom")
        )
        metadata, failures = self.importer.extract_feed_data(
            feed, "http://url/"
        )
        [(key, value)] = metadata.items()
        eq_(u'http://www.feedbooks.com/book/677', key)
        eq_("Discourse on the Method", value.title)

        # Instead of the short description from feed.atom, we have the
        # long description from 677.atom.
        [description] = [x for x in value.links
                         if x.rel == Hyperlink.DESCRIPTION]
        eq_(1818, len(description.content))

    def test_improve_description(self):
        # Here's a Metadata that has a bad (truncated) description.
        metadata = Metadata(self.data_source)
        bad_description = LinkData(
            rel=Hyperlink.DESCRIPTION,
            media_type="text/plain",
            content=u"The Discourse on the Method is a philosophical and mathematical treatise published by Ren\xe9 Descartes in 1637. Its full name is Discourse on the Method of Rightly Conducting the Reason, and Searching for Truth in the Sciences (French title: Discour..."
        )

        irrelevant_description = LinkData(
            rel=Hyperlink.DESCRIPTION, media_type="text/plain",
            content="Don't look at me; I'm irrelevant!"
        )

        # Sending an HTTP request to this URL is going to give a 404 error.
        alternate = LinkData(
            rel=Hyperlink.ALTERNATE, href="http://foo/",
            media_type=OPDSFeed.ENTRY_TYPE
        )

        # We're not even going to try to send an HTTP request to this URL
        # because it doesn't promise an OPDS entry.
        alternate2 = LinkData(
            rel=Hyperlink.ALTERNATE, href="http://bar/",
            media_type="text/html"
        )

        # But this URL will give us full information about this
        # entry, including a better description.
        alternate3 = LinkData(
            rel=Hyperlink.ALTERNATE, href="http://baz/",
            media_type=OPDSFeed.ENTRY_TYPE
        )

        # This URL will not be requested because the third alternate URL
        # gives us the answer we're looking for.
        alternate4 = LinkData(
            rel=Hyperlink.ALTERNATE, href="http://qux/",
            media_type=OPDSFeed.ENTRY_TYPE
        )

        # Two requests will be made. The first will result in a 404
        # error. The second will give us an OPDS entry.
        self.http.queue_response(404, content="Not found")
        self.http.queue_response(
            200, OPDSFeed.ENTRY_TYPE, content=self.sample_file("677.atom")
        )

        metadata.links = [bad_description, irrelevant_description,
                          alternate, alternate2, alternate3, alternate4]
        self.importer.improve_description("some ID", metadata)

        # The descriptions have been removed from metadata.links,
        # because 677.atom included a description we know was better.
        #
        # The incomplete description was removed even though 677.atom
        # also included a copy of it.
        assert bad_description not in metadata.links
        assert irrelevant_description not in metadata.links

        # The more complete description from 677.atom has been added.
        [good_description] = [
            x for x in metadata.links
            if x.rel == Hyperlink.DESCRIPTION
        ]

        # The four alternate links have not been touched.
        assert (alternate in metadata.links)
        assert (alternate2 in metadata.links)
        assert (alternate3 in metadata.links)
        assert (alternate4 in metadata.links)

        # Two HTTP requests were made.
        eq_(['http://foo/', 'http://baz/'], self.http.requests)

    def test_generic_acquisition_epub_link_picked_up_as_open_access(self):
        """The OPDS feed has links with generic OPDS "acquisition"
        relations. We know that the EPUB link should be open-access
        relations, and we modify its relation on the way in.

        We do not modify the link relation for links to the other
        formats, which means they don't get picked up at all.
        """
        feed = self.sample_file("feed_with_open_access_book.atom")
        imports, errors = self.importer.extract_feed_data(feed)
        [book] = imports.values()
        open_access_links = [
            x for x in book.circulation.links
            if x.rel == Hyperlink.OPEN_ACCESS_DOWNLOAD
        ]
        links = sorted(x.href for x in open_access_links)
        eq_(['http://www.feedbooks.com/book/677.epub'], links)

        generic_links = [
            x for x in book.circulation.links
            if x.rel == Hyperlink.GENERIC_OPDS_ACQUISITION
        ]
        eq_([], generic_links)

    def test_open_access_book_modified_and_mirrored(self):
        # If no replacement CSS is specified (this is the case with
        # the default importer), the OPDSImporter.content_modifier
        # method is not assigned.
        eq_(None, self.importer.new_css)
        eq_(None, self.importer.content_modifier)

        # Let's create an importer that does specify a replacement
        # CSS file.
        settings = {
            FeedbooksOPDSImporter.REPLACEMENT_CSS_KEY: "http://css/"
        }

        # The very first request made is going to be to the
        # REPLACEMENT_CSS_KEY URL.
        self.http.queue_response(
            200, content="Some new CSS",
            media_type="text/css",
        )
        ignore, importer = self._importer(**settings)

        # The replacement CSS is retrieved during the FeedbooksImporter
        # constructor.
        eq_([u'http://css/'], self.http.requests)

        # OPDSImporter.content_modifier has been set to call replace_css
        # when necessary.
        eq_("Some new CSS", importer.new_css)
        eq_(importer.replace_css, importer.content_modifier)

        # The requests to the various copies of the book will succeed,
        # and the books will be mirrored.
        self.http.queue_response(
            200, content=self.sample_file("677.epub"),
            media_type=Representation.EPUB_MEDIA_TYPE
        )

        # The request to
        # http://covers.feedbooks.net/book/677.jpg?size=large&t=1428398185'
        # will result in a 404 error, and the image will not be
        # mirrored.
        self.http.queue_response(404, media_type="text/plain")

        self.metadata.lookups = {u"René Descartes": "Descartes, Rene"}
        feed = self.sample_file("feed_with_open_access_book.atom")
        self.http.queue_response(
            200, OPDSFeed.ACQUISITION_FEED_TYPE, content=feed
        )

        [edition], [pool], [work], failures = importer.import_from_feed(feed)

        eq_({}, failures)

        # The work has been created and has metadata.
        eq_("Discourse on the Method", work.title)
        eq_(u'Ren\xe9 Descartes', work.author)

        # Two more mock HTTP requests have now been made.
        eq_(
            [
                u'http://css/',
                u'http://www.feedbooks.com/book/677.epub',
                u'http://covers.feedbooks.net/book/677.jpg?size=large&t=1428398185',
            ],
            self.http.requests
        )

        # The EPUB was 'uploaded' to the mock S3 service and turned
        # into a LicensePoolDeliveryMechanism. The other formats were
        # ignored.
        [mechanism] = pool.delivery_mechanisms
        eq_('https://s3.amazonaws.com/test.content.bucket/FeedBooks/URI/http%3A%2F%2Fwww.feedbooks.com%2Fbook%2F677/Discourse+on+the+Method.epub',
            mechanism.resource.representation.mirror_url
        )
        eq_(u'application/epub+zip', mechanism.delivery_mechanism.content_type)

        # From information contained in the OPDS entry we determined
        # the book's license to be CC-BY-NC.
        eq_(u'https://creativecommons.org/licenses/by-nc/4.0',
            mechanism.rights_status.uri)

        # The pool is marked as open-access, because it has an open-access
        # delivery mechanism that was mirrored.
        eq_(True, pool.open_access)

        # The mirrored content contains the modified CSS.
        content = StringIO(self.mirror.content[0])
        with ZipFile(content) as zip:
            # The zip still contains the original epub's files.
            assert "META-INF/container.xml" in zip.namelist()
            assert "OPS/css/about.css" in zip.namelist()
            assert "OPS/main0.xml" in zip.namelist()

            # The content of an old file hasn't changed.
            with zip.open("mimetype") as f:
                eq_("application/epub+zip\r\n", f.read())

            # The content of CSS files has been changed to the new value.
            with zip.open("OPS/css/about.css") as f:
                eq_("Some new CSS", f.read())

    def test_in_copyright_book_not_mirrored(self):
        self.metadata.lookups = {u"René Descartes": "Descartes, Rene"}
        feed = self.sample_file("feed_with_in_copyright_book.atom")
        self.http.queue_response(
            200, OPDSFeed.ACQUISITION_FEED_TYPE, content=feed
        )

        [edition], [pool], [work], failures = self.importer.import_from_feed(feed)

        # The work has been created and has metadata.
        eq_("Discourse on the Method", work.title)
        eq_(u'Ren\xe9 Descartes', work.author)

        # No mock HTTP requests were made.
        eq_([], self.http.requests)

        # Nothing was uploaded to the mock S3.
        eq_([], self.mirror.uploaded)

        # The LicensePool's delivery mechanism is set appropriately
        # to reflect an in-copyright work.
        [mechanism] = pool.delivery_mechanisms
        eq_(RightsStatus.IN_COPYRIGHT, mechanism.rights_status.uri)

        # The DeliveryMechanism has a Representation but the Representation
        # has not been set as mirrored, because nothing was uploaded.
        rep = mechanism.resource.representation
        eq_('http://www.feedbooks.com/book/677.epub', rep.url)
        eq_(None, rep.mirror_url)
        eq_(None, rep.mirror_exception)

        # The pool is not marked as open-access because although it
        # has open-access links, they're not licensed under terms we
        # can use.
        eq_(False, pool.open_access)