def test_replacement_policy_uses_provided_mirror(self):
        collection = MockOverdriveAPI.mock_collection(self._db)
        mirror = MockS3Uploader()
        replacement_policy = ReplacementPolicy.from_metadata_source(
            mirror=mirror
        )
        api = MockOverdriveAPI(self._db, collection)
        api.queue_collection_token()
        provider = OverdriveBibliographicCoverageProvider(
            collection, replacement_policy=replacement_policy,
            api_class=api
        )
        
        # Any resources discovered by Overdrive will be
        # sent through this mirror.
        eq_(mirror, provider.replacement_policy.mirror)

        http = DummyHTTPClient()
        provider.replacement_policy.http_get = http.do_get

        # Now let's try looking up a specific identifier through 'Overdrive'.
        identifier = self._identifier(
            Identifier.OVERDRIVE_ID, "3896665d-9d81-4cac-bd43-ffc5066de1f5"
        )


        body = self.data_file("overdrive/overdrive_metadata.json")
        provider.api.queue_response(200, {}, body)

        test_cover = self.data_file("covers/test-book-cover.png")
        test_small_cover = self.data_file("covers/tiny-image-cover.png")

        # Overdrive's full-sized image -- we will be creating our own
        # thumbnail from this.
        http.queue_response(200, "image/jpeg", {}, test_cover)

        # Overdrive's thumbnail image -- we will not be using this.
        http.queue_response(200, "image/jpeg", {}, test_small_cover)

        record = provider.ensure_coverage(identifier)
        eq_("success", record.status)

        # The full image and the thumbnail have been uploaded to
        # the fake S3.
        full, thumbnail = mirror.uploaded
        eq_(test_cover, full.content)

        # The URLs for the Resource objects are our S3 URLs, not Overdrive's
        # URLs.
        expect = "Overdrive/Overdrive+ID/%s" % identifier.identifier
        for url in [full.mirror_url, thumbnail.mirror_url]:
            assert expect in url
        assert "/scaled/" in thumbnail.mirror_url
        assert "/scaled/" not in full.mirror_url

        # The thumbnail is a newly created image that is not the
        # same as the full image or the test cover.
        assert thumbnail.content != test_small_cover
        assert thumbnail.content != test_cover
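
Every example on this page drives the code under test through the same fake HTTP client, so nothing touches the network. The real DummyHTTPClient lives in the project's test harness; the following is only a minimal sketch of the contract these tests rely on: queued responses are replayed in FIFO order as (status, headers, content) tuples, and every requested URL is recorded.

    class SketchHTTPClient(object):
        """Minimal stand-in for the test harness's DummyHTTPClient."""

        def __init__(self):
            self.responses = []
            self.requests = []

        def queue_response(self, status_code, media_type=None,
                           other_headers=None, content=""):
            # Store one canned response; media_type becomes the
            # content-type header, matching how the tests read it back.
            headers = dict(other_headers or {})
            if media_type:
                headers["content-type"] = media_type
            self.responses.append((status_code, headers, content))

        def do_get(self, url, *args, **kwargs):
            # Record the URL and replay the oldest queued response.
            self.requests.append(url)
            return self.responses.pop(0)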
Example #2
    def test_mirror_open_access_link_mirror_failure(self):
        mirrors = dict(books_mirror=MockS3Uploader(fail=True),
                       covers_mirror=None)
        h = DummyHTTPClient()

        edition, pool = self._edition(with_license_pool=True)

        data_source = DataSource.lookup(self._db, DataSource.GUTENBERG)
        policy = ReplacementPolicy(mirrors=mirrors, http_get=h.do_get)

        circulation_data = CirculationData(
            data_source=edition.data_source,
            primary_identifier=edition.primary_identifier,
        )

        link = LinkData(
            rel=Hyperlink.OPEN_ACCESS_DOWNLOAD,
            media_type=Representation.EPUB_MEDIA_TYPE,
            href=self._url,
        )

        link_obj, ignore = edition.primary_identifier.add_link(
            rel=link.rel,
            href=link.href,
            data_source=data_source,
            media_type=link.media_type,
            content=link.content,
        )

        h.queue_response(200, media_type=Representation.EPUB_MEDIA_TYPE)

        circulation_data.mirror_link(pool, data_source, link, link_obj, policy)

        representation = link_obj.resource.representation

        # The representation was fetched successfully.
        assert None == representation.fetch_exception
        assert representation.fetched_at != None

        # But mirroring failed.
        assert representation.mirror_exception != None
        assert None == representation.mirrored_at
        assert link.media_type == representation.media_type
        assert link.href == representation.url

        # The mirror url was never set.
        assert None == representation.mirror_url

        # Book content is still there since it wasn't mirrored.
        assert representation.content != None

        # The license pool is suppressed when mirroring fails.
        assert True == pool.suppressed
        assert representation.mirror_exception in pool.license_exception
Example #4
    def test_500_creates_uncachable_representation(self):
        h = DummyHTTPClient()
        h.queue_response(500)
        url = self._url
        representation, cached = Representation.get(self._db,
                                                    url,
                                                    do_get=h.do_get)
        assert False == cached

        h.queue_response(500)
        representation, cached = Representation.get(self._db,
                                                    url,
                                                    do_get=h.do_get)
        assert False == cached
Example #5
    def test_get_with_url_normalizer(self):
        # Verify our ability to store a Resource under a URL other than
        # the exact URL used to make the HTTP request.

        class Normalizer(object):
            called_with = None

            def normalize(self, url):
                # Strip a session ID off an outgoing URL. The first 11
                # characters of the test URL below happen to be
                # "http://url/", so a fixed-length slice is enough here.
                self.called_with = url
                return url[:11]

        normalizer = Normalizer()

        h = DummyHTTPClient()
        h.queue_response(200, content="yay")
        original_url = "http://url/?sid=12345"

        representation, from_cache = Representation.get(
            self._db,
            original_url,
            do_get=h.do_get,
            url_normalizer=normalizer.normalize)

        # The original URL was used to make the actual request.
        assert [original_url] == h.requests

        # The original URL was then passed into Normalizer.normalize.
        assert original_url == normalizer.called_with

        # And the normalized URL was used as the Representation's
        # storage key.
        normalized_url = "http://url/"
        assert "yay" == representation.content.decode("utf-8")
        assert normalized_url == representation.url
        assert False == from_cache

        # Try again, and the Representation is retrieved from cache under
        # the normalized URL.
        #
        # Replace do_get with a dud object to prove that no second
        # request goes out 'over the wire'.
        representation2, from_cache = Representation.get(
            self._db,
            original_url,
            do_get=object(),
            url_normalizer=normalizer.normalize)
        assert True == from_cache
        assert representation2 == representation
        assert normalized_url == representation.url
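
The fixed-length slice in Normalizer.normalize only works because the test controls the exact URL. A more realistic normalizer would strip just the session-ID parameter; here is a hypothetical version using the Python 3 standard library (the surrounding code predates Python 3, so treat this purely as an illustration):

    from urllib.parse import parse_qsl, urlencode, urlparse, urlunparse

    def strip_session_id(url, param="sid"):
        # Remove one query parameter and leave the rest of the URL intact.
        parts = urlparse(url)
        query = [(key, value) for key, value in parse_qsl(parts.query)
                 if key != param]
        return urlunparse(parts._replace(query=urlencode(query)))

    # strip_session_id("http://url/?sid=12345") == "http://url/"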
Example #6
    def test_response_reviewer_impacts_representation(self):
        h = DummyHTTPClient()
        h.queue_response(200, media_type="text/html")

        def reviewer(response):
            status, headers, content = response
            if "html" in headers["content-type"]:
                raise Exception("No. Just no.")

        representation, cached = Representation.get(self._db,
                                                    self._url,
                                                    do_get=h.do_get,
                                                    response_reviewer=reviewer)
        assert "No. Just no." in representation.fetch_exception
        assert False == cached
Example #7
    def test_302_creates_cachable_representation(self):
        h = DummyHTTPClient()
        h.queue_response(302)

        url = self._url
        representation, cached = Representation.get(self._db,
                                                    url,
                                                    do_get=h.do_get)
        assert False == cached

        representation2, cached = Representation.get(self._db,
                                                     url,
                                                     do_get=h.do_get)
        assert True == cached
        assert representation == representation2
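
Together with test_500_creates_uncachable_representation above, this pins down a caching rule: a 5xx response must be re-fetched on every call, while a 302 (like a 200) may be served from cache next time. A sketch of that predicate; only the status classes exercised above are known, so the behavior for anything else is an assumption:

    def is_cachable(status_code):
        # Server errors are never cached; the statuses exercised
        # above (200, 302) are.
        return not (500 <= status_code < 600)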
Example #8
    def test_mirror_open_access_link_fetch_failure(self):
        mirrors = dict(books_mirror=MockS3Uploader())
        h = DummyHTTPClient()

        edition, pool = self._edition(with_license_pool=True)

        data_source = DataSource.lookup(self._db, DataSource.GUTENBERG)
        policy = ReplacementPolicy(mirrors=mirrors, http_get=h.do_get)
        circulation_data = CirculationData(
            data_source=edition.data_source,
            primary_identifier=edition.primary_identifier,
        )

        link = LinkData(
            rel=Hyperlink.OPEN_ACCESS_DOWNLOAD,
            media_type=Representation.EPUB_MEDIA_TYPE,
            href=self._url,
        )

        link_obj, ignore = edition.primary_identifier.add_link(
            rel=link.rel,
            href=link.href,
            data_source=data_source,
            media_type=link.media_type,
            content=link.content,
        )

        h.queue_response(403)

        circulation_data.mirror_link(pool, data_source, link, link_obj, policy)

        representation = link_obj.resource.representation

        # Fetch failed, so we should have a fetch exception but no mirror url.
        assert representation.fetch_exception != None
        assert None == representation.mirror_exception
        assert None == representation.mirror_url
        assert link.href == representation.url
        assert representation.fetched_at != None
        assert None == representation.mirrored_at

        # The license pool is suppressed when fetch fails.
        assert True == pool.suppressed
        assert representation.fetch_exception in pool.license_exception
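
This example and test_mirror_open_access_link_mirror_failure (Example #2) check the same failure policy from two sides. A condensed, hypothetical sketch of the rule they pin down -- the real mirror_link interleaves these checks with the actual fetch and upload:

    def record_mirror_outcome_sketch(representation, pool):
        """Suppress the license pool whenever fetching or mirroring an
        open-access link failed; the substring assertions above imply
        pool.license_exception is a human-readable string."""
        if representation.fetch_exception is not None:
            pool.suppressed = True
            pool.license_exception = (
                "Fetch failure: %s" % representation.fetch_exception)
        elif representation.mirror_exception is not None:
            pool.suppressed = True
            pool.license_exception = (
                "Mirror failure: %s" % representation.mirror_exception)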
Example #9
    def test_presumed_media_type(self):
        h = DummyHTTPClient()

        # In the absence of a content-type header, the presumed_media_type
        # takes over.
        h.queue_response(200, None, content="content")
        representation, cached = Representation.get(
            self._db,
            "http://url",
            do_get=h.do_get,
            max_age=0,
            presumed_media_type="text/xml",
        )
        assert "text/xml" == representation.media_type

        # In the presence of a generic content-type header, the
        # presumed_media_type takes over.
        h.queue_response(200, "application/octet-stream", content="content")
        representation, cached = Representation.get(
            self._db,
            "http://url",
            do_get=h.do_get,
            max_age=0,
            presumed_media_type="text/xml",
        )
        assert "text/xml" == representation.media_type

        # A non-generic content-type header takes precedence over
        # presumed_media_type.
        h.queue_response(200, "text/plain", content="content")
        representation, cached = Representation.get(
            self._db,
            "http://url",
            do_get=h.do_get,
            max_age=0,
            presumed_media_type="text/xml",
        )
        assert "text/plain" == representation.media_type
Example #10
class TestFeedbooksOPDSImporter(DatabaseTest):

    def _importer(self, **settings):
        collection = self._collection(
            name=DataSource.FEEDBOOKS + self._str,
            protocol=ExternalIntegration.FEEDBOOKS,
        )

        defaults = {
            FeedbooksOPDSImporter.REALLY_IMPORT_KEY: "true",
            FeedbooksOPDSImporter.REPLACEMENT_CSS_KEY: None,
        }
        for setting, value in defaults.items():
            if setting not in settings:
                settings[setting] = value

        collection.external_account_id = settings.pop('language', 'de')
        for setting, value in settings.items():
            if value is None:
                continue
            collection.external_integration.set_setting(setting, value)

        return collection, FeedbooksOPDSImporter(
            self._db, collection,
            http_get=self.http.do_get, mirror=self.mirror,
            metadata_client=self.metadata,
        )

    def setup(self):
        super(TestFeedbooksOPDSImporter, self).setup()
        self.http = DummyHTTPClient()
        self.metadata = DummyMetadataClient()
        self.mirror = MockS3Uploader()

        self.data_source = DataSource.lookup(self._db, DataSource.FEEDBOOKS)

        # Create a default importer that's good enough for most tests.
        self.collection, self.importer = self._importer()

    def sample_file(self, filename):
        return sample_data(filename, "feedbooks")

    def test_safety_switch(self):
        """The importer won't be instantiated if REALLY_IMPORT_KEY is not
        set to true.
        """
        settings = {FeedbooksOPDSImporter.REALLY_IMPORT_KEY: "false"}
        assert_raises_regexp(
            Exception, "configured to not actually do an import",
            self._importer, **settings
        )

    def test_unique_identifier(self):
        # The unique account ID is the language of the Feedbooks
        # feed in use.
        eq_('de', self.collection.unique_account_id)

    def test_error_retrieving_replacement_css(self):
        """The importer cannot be instantiated if a replacement CSS
        is specified but the replacement CSS document cannot be
        retrieved or does not appear to be CSS.
        """
        settings = {FeedbooksOPDSImporter.REPLACEMENT_CSS_KEY: "http://foo"}

        self.http.queue_response(500, content="An error message")
        assert_raises_regexp(
            IOError, "Replacement stylesheet URL returned 500 response code",
            self._importer, **settings
        )

        self.http.queue_response(
            200, content="We have many CSS offerings",
            media_type="text/html"
        )
        assert_raises_regexp(
            IOError, "Replacement stylesheet is 'text/html', not a CSS document.",
            self._importer, **settings
        )

    def test_extract_feed_data_improves_descriptions(self):
        feed = self.sample_file("feed.atom")
        self.http.queue_response(200, OPDSFeed.ENTRY_TYPE,
                                 content=self.sample_file("677.atom"))
        metadata, failures = self.importer.extract_feed_data(
            feed, "http://url/"
        )
        [(key, value)] = metadata.items()
        eq_(u'http://www.feedbooks.com/book/677', key)
        eq_("Discourse on the Method", value.title)

        # Instead of the short description from feed.atom, we have the
        # long description from 677.atom.
        [description] = [x for x in value.links if x.rel==Hyperlink.DESCRIPTION]
        eq_(1818, len(description.content))

    def test_improve_description(self):
        # Here's a Metadata that has a bad (truncated) description.
        metadata = Metadata(self.data_source)

        bad_description = LinkData(
            rel=Hyperlink.DESCRIPTION,
            media_type="text/plain",
            content=u"The Discourse on the Method is a philosophical and mathematical treatise published by Ren\xe9 Descartes in 1637. Its full name is Discourse on the Method of Rightly Conducting the Reason, and Searching for Truth in the Sciences (French title: Discour...",
        )

        irrelevant_description = LinkData(
            rel=Hyperlink.DESCRIPTION, media_type="text/plain",
            content="Don't look at me; I'm irrelevant!"
        )

        # Sending an HTTP request to this URL is going to give a 404 error.
        alternate = LinkData(rel=Hyperlink.ALTERNATE, href="http://foo/",
                             media_type=OPDSFeed.ENTRY_TYPE)

        # We're not even going to try to send an HTTP request to this URL
        # because it doesn't promise an OPDS entry.
        alternate2 = LinkData(rel=Hyperlink.ALTERNATE, href="http://bar/",
                             media_type="text/html")

        # But this URL will give us full information about this
        # entry, including a better description.
        alternate3 = LinkData(
            rel=Hyperlink.ALTERNATE, href="http://baz/",
            media_type=OPDSFeed.ENTRY_TYPE
        )

        # This URL will not be requested because the third alternate URL
        # gives us the answer we're looking for.
        alternate4 = LinkData(
            rel=Hyperlink.ALTERNATE, href="http://qux/",
            media_type=OPDSFeed.ENTRY_TYPE
        )

        # Two requests will be made. The first will result in a 404
        # error. The second will give us an OPDS entry.
        self.http.queue_response(404, content="Not found")
        self.http.queue_response(200, OPDSFeed.ENTRY_TYPE,
                                 content=self.sample_file("677.atom"))

        metadata.links = [bad_description, irrelevant_description,
                          alternate, alternate2, alternate3, alternate4]

        self.importer.improve_description("some ID", metadata)

        # The descriptions have been removed from metadata.links,
        # because 677.atom included a description we know was better.
        #
        # The incomplete description was removed even though 677.atom
        # also included a copy of it.
        assert bad_description not in metadata.links
        assert irrelevant_description not in metadata.links

        # The more complete description from 677.atom has been added.
        [good_description] = [
            x for x in metadata.links if x.rel == Hyperlink.DESCRIPTION
        ]

        # The four alternate links have not been touched.
        assert (alternate in metadata.links)
        assert (alternate2 in metadata.links)
        assert (alternate3 in metadata.links)
        assert (alternate4 in metadata.links)

        # Two HTTP requests were made.
        eq_(['http://foo/', 'http://baz/'], self.http.requests)

    def test_generic_acquisition_epub_link_picked_up_as_open_access(self):
        """The OPDS feed has links with generic OPDS "acquisition"
        relations. We know that the EPUB link should have the
        open-access relation, so we modify its relation on the way in.

        We do not modify the link relation for links to the other
        formats, which means they don't get picked up at all.
        """

        feed = self.sample_file("feed_with_open_access_book.atom")
        imports, errors = self.importer.extract_feed_data(feed)
        [book] = imports.values()
        open_access_links = [x for x in book.circulation.links
                             if x.rel==Hyperlink.OPEN_ACCESS_DOWNLOAD]
        links = sorted(x.href for x in open_access_links)
        eq_(['http://www.feedbooks.com/book/677.epub'], links)

        generic_links = [x for x in book.circulation.links
                         if x.rel==Hyperlink.GENERIC_OPDS_ACQUISITION]
        eq_([], generic_links)

    def test_open_access_book_modified_and_mirrored(self):
        # If no replacement CSS is specified (this is the case with
        # the default importer), the OPDSImporter.content_modifier
        # method is not assigned.
        eq_(None, self.importer.new_css)
        eq_(None, self.importer.content_modifier)

        # Let's create an importer that does specify a replacement
        # CSS file.
        settings = {
            FeedbooksOPDSImporter.REPLACEMENT_CSS_KEY : "http://css/"
        }

        # The very first request made is going to be to the 
        # REPLACEMENT_CSS_KEY URL.
        self.http.queue_response(
            200, content="Some new CSS", media_type="text/css",
        )
        ignore, importer = self._importer(**settings)

        # The replacement CSS is retrieved during the FeedbooksImporter
        # constructor.
        eq_([u'http://css/'], self.http.requests)

        # OPDSImporter.content_modifier has been set to call replace_css
        # when necessary.
        eq_("Some new CSS", importer.new_css)
        eq_(importer.replace_css, importer.content_modifier)

        # The requests to the various copies of the book will succeed,
        # and the books will be mirrored.
        self.http.queue_response(
            200, content=self.sample_file("677.epub"),
            media_type=Representation.EPUB_MEDIA_TYPE
        )

        # The request to
        # http://covers.feedbooks.net/book/677.jpg?size=large&t=1428398185'
        # will result in a 404 error, and the image will not be
        # mirrored.
        self.http.queue_response(404, media_type="text/plain")

        self.metadata.lookups = { u"René Descartes" : "Descartes, Rene" }
        feed = self.sample_file("feed_with_open_access_book.atom")
        self.http.queue_response(
            200, OPDSFeed.ACQUISITION_FEED_TYPE,
            content=feed
        )

        [edition], [pool], [work], failures = importer.import_from_feed(
            feed, immediately_presentation_ready=True,
        )

        eq_({}, failures)

        # The work has been created and has metadata.
        eq_("Discourse on the Method", work.title)
        eq_(u'Ren\xe9 Descartes', work.author)

        # Two more mock HTTP requests have now been made.
        eq_([
            u'http://css/',
            u'http://www.feedbooks.com/book/677.epub',
            u'http://covers.feedbooks.net/book/677.jpg?size=large&t=1428398185',
        ],
            self.http.requests
        )

        # The EPUB was 'uploaded' to the mock S3 service and turned
        # into a LicensePoolDeliveryMechanism. The other formats were
        # ignored.
        [mechanism] = pool.delivery_mechanisms
        eq_('https://s3.amazonaws.com/test.content.bucket/FeedBooks/URI/http%3A%2F%2Fwww.feedbooks.com%2Fbook%2F677/Discourse+on+the+Method.epub',
            mechanism.resource.representation.mirror_url
        )
        eq_(u'application/epub+zip', mechanism.delivery_mechanism.content_type)

        # From information contained in the OPDS entry we determined
        # the book's license to be CC-BY-NC.
        eq_(u'https://creativecommons.org/licenses/by-nc/4.0',
            mechanism.rights_status.uri)

        # The pool is marked as open-access, because it has an open-access
        # delivery mechanism that was mirrored.
        eq_(True, pool.open_access)

        # The mirrored content contains the modified CSS.
        content = StringIO(self.mirror.content[0])
        with ZipFile(content) as zip:
            # The zip still contains the original epub's files.
            assert "META-INF/container.xml" in zip.namelist()
            assert "OPS/css/about.css" in zip.namelist()
            assert "OPS/main0.xml" in zip.namelist()

            # The content of an old file hasn't changed.
            with zip.open("mimetype") as f:
                eq_("application/epub+zip\r\n", f.read())

            # The content of CSS files has been changed to the new value.
            with zip.open("OPS/css/about.css") as f:
                eq_("Some new CSS", f.read())

    def test_in_copyright_book_not_mirrored(self):

        self.metadata.lookups = { u"René Descartes" : "Descartes, Rene" }
        feed = self.sample_file("feed_with_in_copyright_book.atom")
        self.http.queue_response(
            200, OPDSFeed.ACQUISITION_FEED_TYPE,
            content=feed
        )

        [edition], [pool], [work], failures = self.importer.import_from_feed(
            feed, immediately_presentation_ready=True,
        )

        # The work has been created and has metadata.
        eq_("Discourse on the Method", work.title)
        eq_(u'Ren\xe9 Descartes', work.author)

        # No mock HTTP requests were made.
        eq_([], self.http.requests)

        # Nothing was uploaded to the mock S3.
        eq_([], self.mirror.uploaded)

        # The LicensePool's delivery mechanism is set appropriately
        # to reflect an in-copyright work.
        [mechanism] = pool.delivery_mechanisms
        eq_(RightsStatus.IN_COPYRIGHT, mechanism.rights_status.uri)

        # The DeliveryMechanism has a Representation but the Representation
        # has not been set as mirrored, because nothing was uploaded.
        rep = mechanism.resource.representation
        eq_('http://www.feedbooks.com/book/677.epub', rep.url)
        eq_(None, rep.mirror_url)
        eq_(None, rep.mirror_exception)

        # The pool is not marked as open-access because although it
        # has open-access links, they're not licensed under terms we
        # can use.
        eq_(False, pool.open_access)
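
test_open_access_book_modified_and_mirrored shows the observable effect of importer.replace_css: every CSS member of the mirrored EPUB carries the replacement stylesheet, while all other members are untouched. A hypothetical re-implementation of that transformation (the real method may differ, for example in keeping the EPUB's mimetype entry stored uncompressed):

    from io import BytesIO
    from zipfile import ZIP_DEFLATED, ZipFile

    def replace_css_sketch(epub_bytes, new_css):
        # Return a copy of the EPUB with each *.css member replaced
        # by new_css, leaving every other member byte-for-byte intact.
        output = BytesIO()
        with ZipFile(BytesIO(epub_bytes)) as source:
            with ZipFile(output, "w", ZIP_DEFLATED) as destination:
                for item in source.infolist():
                    data = source.read(item.filename)
                    if item.filename.lower().endswith(".css"):
                        data = new_css.encode("utf-8")
                    destination.writestr(item, data)
        return output.getvalue()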
Example #11
    def test_lookup(self):
        # Test the lookup() method.
        h = DummyHTTPClient()
        h.queue_response(200, "text/html", content="yay")

        class Mock(NoveListAPI):
            def build_query_url(self, params):
                self.build_query_url_called_with = params
                return "http://query-url/"

            def scrubbed_url(self, params):
                self.scrubbed_url_called_with = params
                return "http://scrubbed-url/"

            def review_response(self, response):
                self.review_response_called_with = response

            def lookup_info_to_metadata(self, representation):
                self.lookup_info_to_metadata_called_with = representation
                return "some metadata"

        novelist = Mock.from_config(self._default_library)
        identifier = self._identifier(identifier_type=Identifier.ISBN)

        # Do the lookup.
        result = novelist.lookup(identifier, do_get=h.do_get)

        # A number of parameters were passed into build_query_url() to
        # get the URL of the HTTP request. The same parameters were
        # also passed into scrubbed_url(), to get the URL that should
        # be used when storing the Representation in the database.
        params1 = novelist.build_query_url_called_with
        params2 = novelist.scrubbed_url_called_with
        assert params1 == params2

        assert (
            dict(
                profile=novelist.profile,
                ClientIdentifier=identifier.urn,
                ISBN=identifier.identifier,
                password=novelist.password,
                version=novelist.version,
            )
            == params1
        )

        # The HTTP request went out to the query URL -- not the scrubbed URL.
        assert ["http://query-url/"] == h.requests

        # The HTTP response was passed into novelist.review_response()
        assert (
            200,
            {"content-type": "text/html"},
            b"yay",
        ) == novelist.review_response_called_with

        # Finally, the Representation was passed into
        # lookup_info_to_metadata, which returned a hard-coded string
        # as the final result.
        assert "some metadata" == result

        # Looking at the Representation we can see that it was stored
        # in the database under its scrubbed URL, not the URL used to
        # make the request.
        rep = novelist.lookup_info_to_metadata_called_with
        assert "http://scrubbed-url/" == rep.url
        assert b"yay" == rep.content
Example #12
    def test_cautious_http_get(self):

        h = DummyHTTPClient()
        h.queue_response(200, content="yay")

        # If the domain is obviously safe, the GET request goes through,
        # with no HEAD request being made.
        m = Representation.cautious_http_get
        status, headers, content = m(
            "http://safe.org/",
            {},
            do_not_access=["unsafe.org"],
            do_get=h.do_get,
            cautious_head_client=object(),
        )
        assert 200 == status
        assert b"yay" == content

        # If the domain is obviously unsafe, no GET request or HEAD
        # request is made.
        status, headers, content = m(
            "http://unsafe.org/",
            {},
            do_not_access=["unsafe.org"],
            do_get=object(),
            cautious_head_client=object(),
        )
        assert 417 == status
        assert (
            "Cautiously decided not to make a GET request to http://unsafe.org/"
            == content)

        # If the domain is potentially unsafe, a HEAD request is made,
        # and the answer depends on its outcome.

        # Here, the HEAD request redirects to a prohibited site.
        def mock_redirect(*args, **kwargs):
            return MockRequestsResponse(301,
                                        dict(location="http://unsafe.org/"))

        status, headers, content = m(
            "http://caution.org/",
            {},
            do_not_access=["unsafe.org"],
            check_for_redirect=["caution.org"],
            do_get=object(),
            cautious_head_client=mock_redirect,
        )
        assert 417 == status
        assert ("application/vnd.librarysimplified-did-not-make-request" ==
                headers["content-type"])
        assert (
            "Cautiously decided not to make a GET request to http://caution.org/"
            == content)

        # Here, the HEAD request redirects to an allowed site.
        h.queue_response(200, content="good content")

        def mock_redirect(*args, **kwargs):
            return MockRequestsResponse(301, dict(location="http://safe.org/"))

        status, headers, content = m(
            "http://caution.org/",
            {},
            do_not_access=["unsafe.org"],
            check_for_redirect=["caution.org"],
            do_get=h.do_get,
            cautious_head_client=mock_redirect,
        )
        assert 200 == status
        assert b"good content" == content