def test_fetch_catalog(self):
        # Test our ability to retrieve essential information from a
        # remote registry's root catalog.
        class Mock(RemoteRegistry):
            def _extract_catalog_information(self, response):
                self.extracted_from = response
                return "Essential information"

        # The behavior of fetch_catalog() depends on what comes back
        # when we ask the remote registry for its root catalog.
        client = DummyHTTPClient()

        # If the result is a problem detail document, that document is
        # the return value of fetch_catalog().
        problem = REMOTE_INTEGRATION_FAILED.detailed("oops")
        client.responses.append(problem)
        registry = Mock(self.integration)
        result = registry.fetch_catalog(do_get=client.do_get)
        assert self.integration.url == client.requests.pop()
        assert problem == result

        # If the response looks good, it's passed into
        # _extract_catalog_information(), and the result of _that_
        # method is the return value of fetch_catalog.
        client.queue_requests_response(200, content="A root catalog")
        [queued] = client.responses
        assert "Essential information" == registry.fetch_catalog(
            "custom catalog URL", do_get=client.do_get)
        assert "custom catalog URL" == client.requests.pop()
    def test_replacement_policy_uses_provided_mirror(self):
        collection = MockOverdriveAPI.mock_collection(self._db)
        mirror = MockS3Uploader()
        replacement_policy = ReplacementPolicy.from_metadata_source(
            mirror=mirror
        )
        api = MockOverdriveAPI(self._db, collection)
        api.queue_collection_token()
        provider = OverdriveBibliographicCoverageProvider(
            collection, replacement_policy=replacement_policy,
            api_class=api
        )
        
        # Any resources discovered by Overdrive will be
        # sent through this mirror.
        eq_(mirror, provider.replacement_policy.mirror)

        http = DummyHTTPClient()
        provider.replacement_policy.http_get = http.do_get

        # Now let's try looking up a specific identifier through 'Overdrive'.
        identifier = self._identifier(
            Identifier.OVERDRIVE_ID, "3896665d-9d81-4cac-bd43-ffc5066de1f5"
        )

        body = self.data_file("overdrive/overdrive_metadata.json")
        provider.api.queue_response(200, {}, body)

        test_cover = self.data_file("covers/test-book-cover.png")
        test_small_cover = self.data_file("covers/tiny-image-cover.png")

        # Overdrive's full-sized image -- we will be creating our own
        # thumbnail from this.
        http.queue_response(200, "image/jpeg", {}, test_cover)

        # Overdrive's thumbnail image -- we will not be using this
        http.queue_response(200, "image/jpeg", {}, test_small_cover)

        record = provider.ensure_coverage(identifier)
        eq_("success", record.status)

        # The full image and the thumbnail have been uploaded to
        # the fake S3.
        full, thumbnail = mirror.uploaded
        eq_(test_cover, full.content)

        # The URLs for the Resource objects are our S3 URLs, not Overdrive's
        # URLs.
        expect = "Overdrive/Overdrive+ID/%s" % identifier.identifier
        for url in [full.mirror_url, thumbnail.mirror_url]:
            assert expect in url
        assert "/scaled/" in thumbnail.mirror_url
        assert "/scaled/" not in full.mirror_url

        # The thumbnail is a newly created image that is not the
        # same as the full image or the test cover.
        assert thumbnail.content != test_small_cover
        assert thumbnail.content != test_cover
    def test_mirror_open_access_link_mirror_failure(self):
        mirrors = dict(books_mirror=MockS3Uploader(fail=True),
                       covers_mirror=None)
        h = DummyHTTPClient()

        edition, pool = self._edition(with_license_pool=True)

        data_source = DataSource.lookup(self._db, DataSource.GUTENBERG)
        policy = ReplacementPolicy(mirrors=mirrors, http_get=h.do_get)

        circulation_data = CirculationData(
            data_source=edition.data_source,
            primary_identifier=edition.primary_identifier,
        )

        link = LinkData(
            rel=Hyperlink.OPEN_ACCESS_DOWNLOAD,
            media_type=Representation.EPUB_MEDIA_TYPE,
            href=self._url,
        )

        link_obj, ignore = edition.primary_identifier.add_link(
            rel=link.rel,
            href=link.href,
            data_source=data_source,
            media_type=link.media_type,
            content=link.content,
        )

        h.queue_response(200, media_type=Representation.EPUB_MEDIA_TYPE)

        circulation_data.mirror_link(pool, data_source, link, link_obj, policy)

        representation = link_obj.resource.representation

        # The representation was fetched successfully.
        assert None == representation.fetch_exception
        assert representation.fetched_at != None

        # But mirroring failed.
        assert representation.mirror_exception != None
        assert None == representation.mirrored_at
        assert link.media_type == representation.media_type
        assert link.href == representation.url

        # The mirror url was never set.
        assert None == representation.mirror_url

        # Book content is still there since it wasn't mirrored.
        assert representation.content != None

        # The license pool is suppressed when mirroring fails.
        assert True == pool.suppressed
        assert representation.mirror_exception in pool.license_exception
    def test_get_with_url_normalizer(self):
        # Verify our ability to store a Resource under a URL other than
        # the exact URL used to make the HTTP request.

        class Normalizer(object):
            called_with = None

            def normalize(self, url):
                # Strip the session ID from an outgoing URL.
                self.called_with = url
                return url[:11]

        normalizer = Normalizer()

        h = DummyHTTPClient()
        h.queue_response(200, content="yay")
        original_url = "http://url/?sid=12345"

        representation, from_cache = Representation.get(
            self._db,
            original_url,
            do_get=h.do_get,
            url_normalizer=normalizer.normalize)

        # The original URL was used to make the actual request.
        assert [original_url] == h.requests

        # The original URL was then passed into Normalizer.normalize
        assert original_url == normalizer.called_with

        # And the normalized URL was used as the Representation's
        # storage key.
        normalized_url = "http://url/"
        assert "yay" == representation.content.decode("utf-8")
        assert normalized_url == representation.url
        assert False == from_cache

        # Try again, and the Representation is retrieved from cache under
        # the normalized URL.
        #
        # Replace do_get with a dud object to prove that no second
        # request goes out 'over the wire'.
        representation2, from_cache = Representation.get(
            self._db,
            original_url,
            do_get=object(),
            url_normalizer=normalizer.normalize)
        assert True == from_cache
        assert representation2 == representation
        assert normalized_url == representation.url
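
# A sketch of the role url_normalizer plays in Representation.get, matching
# the assertions above: the original URL goes over the wire, the normalized
# URL becomes the storage key. The function and cache here are illustrative
# assumptions, not the real implementation.
def get_with_normalizer(cache, url, do_get, url_normalizer=None):
    key = url_normalizer(url) if url_normalizer else url
    if key in cache:
        # Cache hit: do_get is never called, so it can be a dud object.
        return cache[key], True
    status, headers, content = do_get(url)  # request uses the original URL
    cache[key] = content                    # storage uses the normalized key
    return content, False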
    def test_response_reviewer_impacts_representation(self):
        h = DummyHTTPClient()
        h.queue_response(200, media_type="text/html")

        def reviewer(response):
            status, headers, content = response
            if "html" in headers["content-type"]:
                raise Exception("No. Just no.")

        representation, cached = Representation.get(self._db,
                                                    self._url,
                                                    do_get=h.do_get,
                                                    response_reviewer=reviewer)
        assert "No. Just no." in representation.fetch_exception
        assert False == cached
    def test_302_creates_cachable_representation(self):
        h = DummyHTTPClient()
        h.queue_response(302)

        url = self._url
        representation, cached = Representation.get(self._db,
                                                    url,
                                                    do_get=h.do_get)
        assert False == cached

        representation2, cached = Representation.get(self._db,
                                                     url,
                                                     do_get=h.do_get)
        assert True == cached
        assert representation == representation2
    def test_mirror_open_access_link_fetch_failure(self):
        mirrors = dict(books_mirror=MockS3Uploader())
        h = DummyHTTPClient()

        edition, pool = self._edition(with_license_pool=True)

        data_source = DataSource.lookup(self._db, DataSource.GUTENBERG)
        policy = ReplacementPolicy(mirrors=mirrors, http_get=h.do_get)
        circulation_data = CirculationData(
            data_source=edition.data_source,
            primary_identifier=edition.primary_identifier,
        )

        link = LinkData(
            rel=Hyperlink.OPEN_ACCESS_DOWNLOAD,
            media_type=Representation.EPUB_MEDIA_TYPE,
            href=self._url,
        )

        link_obj, ignore = edition.primary_identifier.add_link(
            rel=link.rel,
            href=link.href,
            data_source=data_source,
            media_type=link.media_type,
            content=link.content,
        )

        h.queue_response(403)

        circulation_data.mirror_link(pool, data_source, link, link_obj, policy)

        representation = link_obj.resource.representation

        # Fetch failed, so we should have a fetch exception but no mirror url.
        assert representation.fetch_exception != None
        assert None == representation.mirror_exception
        assert None == representation.mirror_url
        assert link.href == representation.url
        assert representation.fetched_at != None
        assert None == representation.mirrored_at

        # The license pool is suppressed when fetch fails.
        assert True == pool.suppressed
        assert representation.fetch_exception in pool.license_exception
    def test_500_creates_uncachable_representation(self):
        h = DummyHTTPClient()
        h.queue_response(500)
        url = self._url
        representation, cached = Representation.get(self._db,
                                                    url,
                                                    do_get=h.do_get)
        assert False == cached

        h.queue_response(500)
        representation, cached = Representation.get(self._db,
                                                    url,
                                                    do_get=h.do_get)
        assert False == cached
    def test_presumed_media_type(self):
        h = DummyHTTPClient()

        # In the absence of a content-type header, the presumed_media_type
        # takes over.
        h.queue_response(200, None, content="content")
        representation, cached = Representation.get(
            self._db,
            "http://url",
            do_get=h.do_get,
            max_age=0,
            presumed_media_type="text/xml",
        )
        assert "text/xml" == representation.media_type

        # In the presence of a generic content-type header, the
        # presumed_media_type takes over.
        h.queue_response(200, "application/octet-stream", content="content")
        representation, cached = Representation.get(
            self._db,
            "http://url",
            do_get=h.do_get,
            max_age=0,
            presumed_media_type="text/xml",
        )
        assert "text/xml" == representation.media_type

        # A non-generic content-type header takes precedence over
        # presumed_media_type.
        h.queue_response(200, "text/plain", content="content")
        representation, cached = Representation.get(
            self._db,
            "http://url",
            do_get=h.do_get,
            max_age=0,
            presumed_media_type="text/xml",
        )
        assert "text/plain" == representation.media_type
class TestContentCafeAPI(DatabaseTest):

    base_path = os.path.split(__file__)[0]
    resource_path = os.path.join(base_path, "files", "content_cafe")

    def data_file(self, path):
        """Return the contents of a test data file."""
        return open(os.path.join(self.resource_path, path)).read()

    def setup(self):
        super(TestContentCafeAPI, self).setup()
        self.http = DummyHTTPClient()
        self.soap = MockSOAPClient(popularity_value=5)
        self.api = ContentCafeAPI(self._db, 'uid', 'pw', self.soap,
                                  self.http.do_get)
        self.identifier = self._identifier(identifier_type=Identifier.ISBN)
        self.args = dict(userid=self.api.user_id,
                         password=self.api.password,
                         isbn=self.identifier.identifier)

    def test_from_config(self):
        # Without an integration, an error is raised.
        assert_raises(CannotLoadConfiguration, ContentCafeAPI.from_config,
                      self._db)

        # With incomplete integrations, an error is raised.
        integration = self._external_integration(
            ExternalIntegration.CONTENT_CAFE,
            goal=ExternalIntegration.METADATA_GOAL,
            username=u'yup')
        assert_raises(CannotLoadConfiguration, ContentCafeAPI.from_config,
                      self._db)

        integration.username = None
        integration.password = u'yurp'
        assert_raises(CannotLoadConfiguration, ContentCafeAPI.from_config,
                      self._db)

        integration.username = u'yup'
        result = ContentCafeAPI.from_config(self._db, soap_client=object())
        eq_(True, isinstance(result, ContentCafeAPI))

        # NOTE: We can't test the case where soap_client is not
        # mocked, because the ContentCafeSOAPClient constructor makes
        # a real HTTP request to load its WSDL file. We might be able
        # to improve this by seeing how mockable SudsClient is, or by
        # mocking ContentCafeAPISOAPClient.WSDL_URL as a file:// URL.

    def test_data_source(self):
        eq_(DataSource.CONTENT_CAFE, self.api.data_source.name)

    def test_create_metadata(self):
        class Mock(ContentCafeAPI):

            popularity_measurement = "a popularity measurement"
            annotate_calls = []

            def add_reviews(self, *args):
                self.add_reviews_called_with = args

            def add_descriptions(self, *args):
                self.add_descriptions_called_with = args

            def add_author_notes(self, *args):
                self.add_author_notes_called_with = args

            def add_excerpt(self, *args):
                self.add_excerpt_called_with = args

            def measure_popularity(self, *args):
                self.measure_popularity_called_with = args
                return self.popularity_measurement

            def is_suitable_image(self, image):
                self.is_suitable_image_called_with = image
                return True

        api = Mock(self._db, 'uid', 'pw', self.soap, self.http.do_get)
        m = api.create_metadata

        # First we will make a request for a cover image. If that
        # gives a 404 error, we return nothing and don't bother making
        # any more requests.
        self.http.queue_requests_response(404)
        eq_(None, m(self.identifier))
        request_url = self.http.requests.pop()
        image_url = api.image_url % self.args
        eq_(image_url, request_url)
        eq_([], self.http.requests)

        # If the cover image request succeeds, we turn it into a LinkData
        # and add it to a new Metadata object. We then pass the
        # Metadata object a number of other methods to get additional
        # information from Content Cafe.
        #
        # We then call measure_popularity, and add its return value
        # to Metadata.measurements.
        self.http.queue_requests_response(200,
                                          'image/png',
                                          content='an image!')

        # Here's the result.
        metadata = m(self.identifier)

        # Here's the image LinkData.
        [image] = metadata.links
        eq_(Hyperlink.IMAGE, image.rel)
        eq_(image_url, image.href)
        eq_('image/png', image.media_type)
        eq_('an image!', image.content)

        # We ran the image through our mocked version of is_suitable_image,
        # and it said it was fine.
        eq_(image.content, api.is_suitable_image_called_with)

        # Here's the popularity measurement.
        eq_([api.popularity_measurement], metadata.measurements)

        # Confirm that the mock methods were called with the right
        # arguments -- their functionality is tested individually
        # below.
        expected_args = (metadata, self.identifier, self.args)
        for called_with in (
                api.add_reviews_called_with,
                api.add_descriptions_called_with,
                api.add_author_notes_called_with,
                api.add_excerpt_called_with,
        ):
            eq_(expected_args, called_with)
        eq_((self.identifier, api.ONE_YEAR_AGO),
            api.measure_popularity_called_with)

        # If measure_popularity returns nothing, metadata.measurements
        # will be left empty.
        api.popularity_measurement = None
        self.http.queue_requests_response(200,
                                          'image/png',
                                          content='an image!')
        metadata = m(self.identifier)
        eq_([], metadata.measurements)

    def test_annotate_with_web_resources(self):
        metadata = Metadata(DataSource.CONTENT_CAFE)
        rel = self._str

        # We're going to be grabbing this URL and
        # scraping it.
        url_template = "http://url/%(arg1)s"
        args = dict(arg1='value')

        # A couple of useful functions for scraping.
        class MockScrapers(object):
            scrape_called = False
            explode_called = False

            def scrape(self, soup):
                self.scrape_called = True
                return [soup.find('content').string]

            def explode(self, soup):
                self.explode_called = True
                raise Exception("I'll never be called")

        scrapers = MockScrapers()

        # When the result of the HTTP request contains a certain phrase,
        # we don't even bother scraping.
        m = self.api.annotate_with_web_resources
        http = self.http
        http.queue_requests_response(200,
                                     'text/html',
                                     content='There is no data!')
        m(metadata, self.identifier, args, url_template, "no data!", rel,
          scrapers.explode)
        # We made the request but nothing happened.
        expect_url = url_template % args
        eq_(expect_url, self.http.requests.pop())
        eq_(False, scrapers.explode_called)
        eq_(None, metadata.title)
        eq_([], metadata.links)

        # Otherwise, we try to scrape.
        good_content = '<html><span class="PageHeader2">Book title</span><content>Here you go</content>'
        http.queue_requests_response(200, 'text/html', content=good_content)
        m(metadata, self.identifier, args, url_template, "no data!", rel,
          scrapers.scrape)
        eq_(True, scrapers.scrape_called)

        # We called _extract_title and took a Content Cafe title out
        # for the Metadata object.
        eq_("Book title", metadata.title)

        # Then we called our mock scrape method, which gave us the
        # content for one LinkData.
        [link] = metadata.links
        eq_(rel, link.rel)
        eq_(None, link.href)
        eq_("text/html", link.media_type)
        eq_("Here you go", link.content)

    def test__extract_title(self):
        # Standalone test of the _extract_title helper method.

        def assert_title(title, expect):
            markup = '<html><span class="PageHeader2">%s</span><content>Description</content>' % title
            soup = BeautifulSoup(markup, 'lxml')
            eq_(expect, ContentCafeAPI._extract_title(soup))

        # A normal book title is successfully extracted.
        assert_title("A great book", "A great book")

        # A supposed title that's in KNOWN_BAD_TITLES is ignored.
        assert_title("No content currently exists for this item", None)

    def test_add_reviews(self):
        """Verify that add_reviews works in a real case."""
        metadata = Metadata(DataSource.CONTENT_CAFE)
        content = self.data_file("reviews.html")
        self.http.queue_requests_response(200, 'text/html', content=content)
        self.api.add_reviews(metadata, self.identifier, self.args)

        # We extracted six reviews from the sample file.
        reviews = metadata.links
        eq_(6, len(reviews))
        assert all([x.rel == Hyperlink.REVIEW for x in reviews])
        assert "isn't a myth!" in reviews[0].content

        # We incidentally figured out the book's title.
        eq_("Shadow Thieves", metadata.title)

    def test_add_author_notes(self):
        """Verify that add_author_notes works in a real case."""
        metadata = Metadata(DataSource.CONTENT_CAFE)
        content = self.data_file("author_notes.html")
        self.http.queue_requests_response(200, 'text/html', content=content)
        self.api.add_author_notes(metadata, self.identifier, self.args)

        [notes] = metadata.links
        eq_(Hyperlink.AUTHOR, notes.rel)
        assert 'Brenda researched turtles' in notes.content

        # We incidentally figured out the book's title.
        eq_("Franklin's Christmas Gift", metadata.title)

    def test_add_excerpt(self):
        """Verify that add_excerpt works in a real case."""
        metadata = Metadata(DataSource.CONTENT_CAFE)
        content = self.data_file("excerpt.html")
        self.http.queue_requests_response(200, 'text/html', content=content)
        self.api.add_excerpt(metadata, self.identifier, self.args)

        [excerpt] = metadata.links
        eq_(Hyperlink.SAMPLE, excerpt.rel)
        assert 'Franklin loved his marbles.' in excerpt.content

        # We incidentally figured out the book's title.
        eq_("Franklin's Christmas Gift", metadata.title)

    def test_measure_popularity(self):
        """Verify that measure_popularity turns the output of
        a SOAP request into a MeasurementData.
        """
        cutoff = object()

        # Call it.
        result = self.api.measure_popularity(self.identifier, cutoff)

        # The SOAP client's estimated_popularity method was called.
        expect = (self.identifier.identifier, cutoff)
        eq_(expect, self.soap.estimated_popularity_calls.pop())

        # The result was turned into a MeasurementData.
        assert isinstance(result, MeasurementData)
        eq_(Measurement.POPULARITY, result.quantity_measured)
        eq_(self.soap.popularity_value, result.value)

        # If the SOAP API doesn't return a popularity value, no
        # MeasurementData is created.
        self.soap.popularity_value = None
        result = self.api.measure_popularity(self.identifier, cutoff)
        eq_(expect, self.soap.estimated_popularity_calls.pop())
        eq_(None, result)

    def test_is_suitable_image(self):
        # Images are rejected if we can tell they are Content Cafe's
        # stand-in images.
        m = ContentCafeAPI.is_suitable_image

        content = self.data_file("stand-in-image.png")
        eq_(False, m(content))

        # Otherwise, it's fine. We don't check that the image is
        # valid, only that it's not a stand-in image.
        eq_(True, m("I'm not a stand-in image."))
class TestFeedbooksOPDSImporter(DatabaseTest):

    def _importer(self, **settings):
        collection = self._collection(
            name=DataSource.FEEDBOOKS + self._str,
            protocol=ExternalIntegration.FEEDBOOKS,
        )

        defaults = {
            FeedbooksOPDSImporter.REALLY_IMPORT_KEY: "true",
            FeedbooksOPDSImporter.REPLACEMENT_CSS_KEY: None,
        }
        for setting, value in defaults.items():
            if setting not in settings:
                settings[setting] = value

        collection.external_account_id = settings.pop('language', 'de')
        for setting, value in settings.items():
            if value is None:
                continue
            collection.external_integration.set_setting(setting, value)

        return collection, FeedbooksOPDSImporter(
            self._db, collection,
            http_get=self.http.do_get, mirror=self.mirror,
            metadata_client=self.metadata,
        )

    def setup(self):
        super(TestFeedbooksOPDSImporter, self).setup()
        self.http = DummyHTTPClient()
        self.metadata = DummyMetadataClient()
        self.mirror = MockS3Uploader()

        self.data_source = DataSource.lookup(self._db, DataSource.FEEDBOOKS)

        # Create a default importer that's good enough for most tests.
        self.collection, self.importer = self._importer()

    def sample_file(self, filename):
        return sample_data(filename, "feedbooks")

    def test_safety_switch(self):
        """The importer won't be instantiated if REALLY_IMPORT_KEY is not
        set to true.
        """
        settings = {FeedbooksOPDSImporter.REALLY_IMPORT_KEY: "false"}
        assert_raises_regexp(
            Exception, "configured to not actually do an import",
            self._importer, **settings
        )

    def test_unique_identifier(self):
        # The unique account ID is the language of the Feedbooks
        # feed in use.
        eq_('de', self.collection.unique_account_id)

    def test_error_retrieving_replacement_css(self):
        """The importer cannot be instantiated if a replacement CSS
        is specified but the replacement CSS document cannot be
        retrieved or does not appear to be CSS.
        """
        settings = {FeedbooksOPDSImporter.REPLACEMENT_CSS_KEY: "http://foo"}

        self.http.queue_response(500, content="An error message")
        assert_raises_regexp(
            IOError, "Replacement stylesheet URL returned 500 response code",
            self._importer, **settings
        )

        self.http.queue_response(
            200, content="We have many CSS offerings",
            media_type="text/html"
        )
        assert_raises_regexp(
            IOError, "Replacement stylesheet is 'text/html', not a CSS document.",
            self._importer, **settings
        )

    def test_extract_feed_data_improves_descriptions(self):
        feed = self.sample_file("feed.atom")
        self.http.queue_response(200, OPDSFeed.ENTRY_TYPE,
                                 content=self.sample_file("677.atom"))
        metadata, failures = self.importer.extract_feed_data(
            feed, "http://url/"
        )
        [(key, value)] = metadata.items()
        eq_(u'http://www.feedbooks.com/book/677', key)
        eq_("Discourse on the Method", value.title)

        # Instead of the short description from feed.atom, we have the
        # long description from 677.atom.
        [description] = [x for x in value.links if x.rel==Hyperlink.DESCRIPTION]
        eq_(1818, len(description.content))

    def test_improve_description(self):
        # Here's a Metadata that has a bad (truncated) description.
        metadata = Metadata(self.data_source)

        bad_description = LinkData(rel=Hyperlink.DESCRIPTION, media_type="text/plain", content=u"The Discourse on the Method is a philosophical and mathematical treatise published by Ren\xe9 Descartes in 1637. Its full name is Discourse on the Method of Rightly Conducting the Reason, and Searching for Truth in the Sciences (French title: Discour...")

        irrelevant_description = LinkData(
            rel=Hyperlink.DESCRIPTION, media_type="text/plain",
            content="Don't look at me; I'm irrelevant!"
        )

        # Sending an HTTP request to this URL is going to give a 404 error.
        alternate = LinkData(rel=Hyperlink.ALTERNATE, href="http://foo/",
                             media_type=OPDSFeed.ENTRY_TYPE)

        # We're not even going to try to send an HTTP request to this URL
        # because it doesn't promise an OPDS entry.
        alternate2 = LinkData(rel=Hyperlink.ALTERNATE, href="http://bar/",
                             media_type="text/html")

        # But this URL will give us full information about this
        # entry, including a better description.
        alternate3 = LinkData(
            rel=Hyperlink.ALTERNATE, href="http://baz/",
            media_type=OPDSFeed.ENTRY_TYPE
        )

        # This URL will not be requested because the third alternate URL
        # gives us the answer we're looking for.
        alternate4 = LinkData(
            rel=Hyperlink.ALTERNATE, href="http://qux/",
            media_type=OPDSFeed.ENTRY_TYPE
        )

        # Two requests will be made. The first will result in a 404
        # error. The second will give us an OPDS entry.
        self.http.queue_response(404, content="Not found")
        self.http.queue_response(200, OPDSFeed.ENTRY_TYPE,
                                 content=self.sample_file("677.atom"))

        metadata.links = [bad_description, irrelevant_description,
                          alternate, alternate2, alternate3, alternate4]

        self.importer.improve_description("some ID", metadata)

        # The descriptions have been removed from metadata.links,
        # because 677.atom included a description we know was better.
        #
        # The incomplete description was removed even though 677.atom
        # also included a copy of it.
        assert bad_description not in metadata.links
        assert irrelevant_description not in metadata.links

        # The more complete description from 677.atom has been added.
        [good_description] = [
            x for x in metadata.links if x.rel == Hyperlink.DESCRIPTION
        ]

        # The four alternate links have not been touched.
        assert (alternate in metadata.links)
        assert (alternate2 in metadata.links)
        assert (alternate3 in metadata.links)
        assert (alternate4 in metadata.links)

        # Two HTTP requests were made.
        eq_(['http://foo/', 'http://baz/'], self.http.requests)

    def test_generic_acquisition_epub_link_picked_up_as_open_access(self):
        """The OPDS feed has links with generic OPDS "acquisition"
        relations. We know that the EPUB link should have the open-access
        relation, so we modify its relation on the way in.

        We do not modify the link relation for links to the other
        formats, which means they don't get picked up at all.
        """

        feed = self.sample_file("feed_with_open_access_book.atom")
        imports, errors = self.importer.extract_feed_data(feed)
        [book] = imports.values()
        open_access_links = [x for x in book.circulation.links
                             if x.rel==Hyperlink.OPEN_ACCESS_DOWNLOAD]
        links = sorted(x.href for x in open_access_links)
        eq_(['http://www.feedbooks.com/book/677.epub'], links)

        generic_links = [x for x in book.circulation.links
                         if x.rel==Hyperlink.GENERIC_OPDS_ACQUISITION]
        eq_([], generic_links)
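
# The relation rewrite described above, in miniature: a generic OPDS
# acquisition link pointing at an EPUB is promoted to open-access, while
# other formats are left alone (and so never imported). Hypothetical
# helper; the rel URIs follow the OPDS spec.
def adjust_link_rel(rel, media_type):
    if (rel == "http://opds-spec.org/acquisition"
            and media_type == "application/epub+zip"):
        return "http://opds-spec.org/acquisition/open-access"
    return rel

assert ("http://opds-spec.org/acquisition/open-access" ==
        adjust_link_rel("http://opds-spec.org/acquisition",
                        "application/epub+zip"))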

    def test_open_access_book_modified_and_mirrored(self):
        # If no replacement CSS is specified (this is the case with
        # the default importer), the OPDSImporter.content_modifier
        # method is not assigned.
        eq_(None, self.importer.new_css)
        eq_(None, self.importer.content_modifier)

        # Let's create an importer that does specify a replacement
        # CSS file.
        settings = {
            FeedbooksOPDSImporter.REPLACEMENT_CSS_KEY : "http://css/"
        }

        # The very first request made is going to be to the 
        # REPLACEMENT_CSS_KEY URL.
        self.http.queue_response(
            200, content="Some new CSS", media_type="text/css",
        )
        ignore, importer = self._importer(**settings)

        # The replacement CSS is retrieved during the FeedbooksImporter
        # constructor.
        eq_([u'http://css/'], self.http.requests)

        # OPDSImporter.content_modifier has been set to call replace_css
        # when necessary.
        eq_("Some new CSS", importer.new_css)
        eq_(importer.replace_css, importer.content_modifier)

        # The requests to the various copies of the book will succeed,
        # and the books will be mirrored.
        self.http.queue_response(
            200, content=self.sample_file("677.epub"),
            media_type=Representation.EPUB_MEDIA_TYPE
        )

        # The request to
        # http://covers.feedbooks.net/book/677.jpg?size=large&t=1428398185
        # will result in a 404 error, and the image will not be
        # mirrored.
        self.http.queue_response(404, media_type="text/plain")

        self.metadata.lookups = { u"René Descartes" : "Descartes, Rene" }
        feed = self.sample_file("feed_with_open_access_book.atom")
        self.http.queue_response(
            200, OPDSFeed.ACQUISITION_FEED_TYPE,
            content=feed
        )

        [edition], [pool], [work], failures = importer.import_from_feed(
            feed, immediately_presentation_ready=True,
        )

        eq_({}, failures)

        # The work has been created and has metadata.
        eq_("Discourse on the Method", work.title)
        eq_(u'Ren\xe9 Descartes', work.author)

        # Two more mock HTTP requests have now been made.
        eq_([
            u'http://css/',
            u'http://www.feedbooks.com/book/677.epub',
            u'http://covers.feedbooks.net/book/677.jpg?size=large&t=1428398185',
        ],
            self.http.requests
        )

        # The EPUB was 'uploaded' to the mock S3 service and turned
        # into a LicensePoolDeliveryMechanism. The other formats were
        # ignored.
        [mechanism] = pool.delivery_mechanisms
        eq_('https://s3.amazonaws.com/test.content.bucket/FeedBooks/URI/http%3A%2F%2Fwww.feedbooks.com%2Fbook%2F677/Discourse+on+the+Method.epub',
            mechanism.resource.representation.mirror_url
        )
        eq_(u'application/epub+zip', mechanism.delivery_mechanism.content_type)

        # From information contained in the OPDS entry we determined
        # the book's license to be CC-BY-NC.
        eq_(u'https://creativecommons.org/licenses/by-nc/4.0',
            mechanism.rights_status.uri)

        # The pool is marked as open-access, because it has an open-access
        # delivery mechanism that was mirrored.
        eq_(True, pool.open_access)

        # The mirrored content contains the modified CSS.
        content = StringIO(self.mirror.content[0])
        with ZipFile(content) as zip:
            # The zip still contains the original epub's files.
            assert "META-INF/container.xml" in zip.namelist()
            assert "OPS/css/about.css" in zip.namelist()
            assert "OPS/main0.xml" in zip.namelist()

            # The content of an old file hasn't changed.
            with zip.open("mimetype") as f:
                eq_("application/epub+zip\r\n", f.read())

            # The content of CSS files has been changed to the new value.
            with zip.open("OPS/css/about.css") as f:
                eq_("Some new CSS", f.read())

    def test_in_copyright_book_not_mirrored(self):

        self.metadata.lookups = { u"René Descartes" : "Descartes, Rene" }
        feed = self.sample_file("feed_with_in_copyright_book.atom")
        self.http.queue_response(
            200, OPDSFeed.ACQUISITION_FEED_TYPE,
            content=feed
        )

        [edition], [pool], [work], failures = self.importer.import_from_feed(
            feed, immediately_presentation_ready=True,
        )

        # The work has been created and has metadata.
        eq_("Discourse on the Method", work.title)
        eq_(u'Ren\xe9 Descartes', work.author)

        # No mock HTTP requests were made.
        eq_([], self.http.requests)

        # Nothing was uploaded to the mock S3.
        eq_([], self.mirror.uploaded)

        # The LicensePool's delivery mechanism is set appropriately
        # to reflect an in-copyright work.
        [mechanism] = pool.delivery_mechanisms
        eq_(RightsStatus.IN_COPYRIGHT, mechanism.rights_status.uri)

        # The DeliveryMechanism has a Representation but the Representation
        # has not been set as mirrored, because nothing was uploaded.
        rep = mechanism.resource.representation
        eq_('http://www.feedbooks.com/book/677.epub', rep.url)
        eq_(None, rep.mirror_url)
        eq_(None, rep.mirror_exception)

        # The pool is not marked as open-access because although it
        # has open-access links, they're not licensed under terms we
        # can use.
        eq_(False, pool.open_access)
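
# The rights logic these two Feedbooks tests pin down, in miniature: a
# book is mirrored (and its pool marked open-access) only when its license
# permits free redistribution. The constants and helper are illustrative
# assumptions, not the importer's actual code.
MIRRORABLE_RIGHTS = set(["https://creativecommons.org/licenses/by-nc/4.0"])

def should_mirror(rights_uri):
    return rights_uri in MIRRORABLE_RIGHTS

assert should_mirror("https://creativecommons.org/licenses/by-nc/4.0")
assert not should_mirror("in-copyright")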
    def test_discovery_service_library_registrations_get(self):
        # Here's a discovery service.
        discovery_service, ignore = create(
            self._db,
            ExternalIntegration,
            protocol=ExternalIntegration.OPDS_REGISTRATION,
            goal=ExternalIntegration.DISCOVERY_GOAL,
        )

        # We'll be making a mock request to this URL later.
        discovery_service.setting(
            ExternalIntegration.URL).value = "http://service-url/"

        # We successfully registered this library with the service.
        succeeded, ignore = create(
            self._db,
            Library,
            name="Library 1",
            short_name="L1",
        )
        config = ConfigurationSetting.for_library_and_externalintegration
        config(self._db, "library-registration-status", succeeded,
               discovery_service).value = "success"

        # We tried to register this library with the service but were
        # unsuccessful.
        config(self._db, "library-registration-stage", succeeded,
               discovery_service).value = "production"
        failed, ignore = create(
            self._db,
            Library,
            name="Library 2",
            short_name="L2",
        )
        config(
            self._db,
            "library-registration-status",
            failed,
            discovery_service,
        ).value = "failure"
        config(
            self._db,
            "library-registration-stage",
            failed,
            discovery_service,
        ).value = "testing"

        # We've never tried to register this library with the service.
        unregistered, ignore = create(
            self._db,
            Library,
            name="Library 3",
            short_name="L3",
        )
        discovery_service.libraries = [succeeded, failed]

        # When a client sends a GET request to the controller, the
        # controller is going to call
        # RemoteRegistry.fetch_registration_document() to try and find
        # the discovery services' terms of service. That's going to
        # make one or two HTTP requests.

        # First, let's try the scenario where the discovery service is
        # working and has a terms-of-service.
        client = DummyHTTPClient()

        # In this case we'll make two requests. The first request will
        # ask for the root catalog, where we'll look for a
        # registration link.
        root_catalog = dict(
            links=[dict(href="http://register-here/", rel="register")])
        client.queue_requests_response(200,
                                       RemoteRegistry.OPDS_2_TYPE,
                                       content=json.dumps(root_catalog))

        # The second request will fetch that registration link -- then
        # we'll look for TOS data inside.
        registration_document = dict(links=[
            dict(rel="terms-of-service", type="text/html", href="http://tos/"),
            dict(
                rel="terms-of-service",
                type="text/html",
                href="data:text/html;charset=utf-8;base64,PHA+SG93IGFib3V0IHRoYXQgVE9TPC9wPg==",
            ),
        ])
        client.queue_requests_response(
            200,
            RemoteRegistry.OPDS_2_TYPE,
            content=json.dumps(registration_document))

        controller = (self.manager.
                      admin_discovery_service_library_registrations_controller)
        m = controller.process_discovery_service_library_registrations
        with self.request_context_with_admin("/", method="GET"):
            response = m(do_get=client.do_get)
            # The document we get back from the controller is a
            # dictionary with useful information on all known
            # discovery integrations -- just one, in this case.
            [service] = response["library_registrations"]
            assert discovery_service.id == service["id"]

            # The two mock HTTP requests we predicted actually
            # happened.  The target of the first request is the URL to
            # the discovery service's main catalog. The second request
            # is to the "register" link found in that catalog.
            assert ["http://service-url/",
                    "http://register-here/"] == client.requests

            # The TOS link and TOS HTML snippet were recovered from
            # the registration document served in response to the
            # second HTTP request, and included in the dictionary.
            assert "http://tos/" == service["terms_of_service_link"]
            assert "<p>How about that TOS</p>" == service[
                "terms_of_service_html"]
            assert None == service["access_problem"]

            # The dictionary includes a 'libraries' object, a list of
            # dictionaries with information about the relationships
            # between this discovery integration and every library
            # that's tried to register with it.
            info1, info2 = service["libraries"]

            # Here's the library that successfully registered.
            assert info1 == dict(short_name=succeeded.short_name,
                                 status="success",
                                 stage="production")

            # And here's the library that tried to register but
            # failed.
            assert info2 == dict(short_name=failed.short_name,
                                 status="failure",
                                 stage="testing")

            # Note that `unregistered`, the library that never tried
            # to register with this discovery service, is not included.

            # Now let's try the controller method again, except this
            # time the discovery service's web server is down. The
            # first request will return a ProblemDetail document, and
            # there will be no second request.
            client.requests = []
            client.queue_requests_response(
                502,
                content=REMOTE_INTEGRATION_FAILED,
            )
            response = m(do_get=client.do_get)

            # Everything looks good, except that there's no TOS data
            # available.
            [service] = response["library_registrations"]
            assert discovery_service.id == service["id"]
            assert 2 == len(service["libraries"])
            assert None == service["terms_of_service_link"]
            assert None == service["terms_of_service_html"]

            # The problem detail document that prevented the TOS data
            # from showing up has been converted to a dictionary and
            # included in the dictionary of information for this
            # discovery service.
            assert REMOTE_INTEGRATION_FAILED.uri == service["access_problem"][
                "type"]

            # When the user lacks the SYSTEM_ADMIN role, the
            # controller won't even start processing their GET
            # request.
            self.admin.remove_role(AdminRole.SYSTEM_ADMIN)
            self._db.flush()
            pytest.raises(AdminNotAuthorized, m)
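
# The second terms-of-service link above is a data: URI. A small sketch of
# recovering the HTML snippet the controller reports; the parsing below is
# illustrative, not the controller's actual code.
import base64

def decode_data_uri(uri):
    header, _, payload = uri.partition(",")
    if header.endswith(";base64"):
        return base64.b64decode(payload).decode("utf-8")
    return payload

assert "<p>How about that TOS</p>" == decode_data_uri(
    "data:text/html;charset=utf-8;base64,PHA+SG93IGFib3V0IHRoYXQgVE9TPC9wPg==")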
    def test_perform_early_return(self):
        class Mock(MockOverdriveAPI):

            EARLY_RETURN_URL = "http://early-return/"

            def get_fulfillment_link(self, *args):
                self.get_fulfillment_link_call = args
                return ("http://fulfillment/", "content/type")

            def _extract_early_return_url(self, *args):
                self._extract_early_return_url_call = args
                return self.EARLY_RETURN_URL

        overdrive = Mock(self._db, self.collection)

        # This patron has a loan.
        pool = self._licensepool(None)
        patron = self._patron()
        pin = object()
        loan, ignore = pool.loan_to(patron)

        # The loan has been fulfilled and now the patron wants to
        # do early return.
        loan.fulfillment = pool.delivery_mechanisms[0]

        # Our mocked perform_early_return will make two HTTP requests.
        # The first will be to the fulfill link returned by our mock
        # get_fulfillment_link. The response to this request is a
        # redirect that includes an early return link.
        http = DummyHTTPClient()
        http.responses.append(
            MockRequestsResponse(
                302,
                dict(location="http://fulfill-this-book/?or=return-early")))

        # The second HTTP request made will be to the early return
        # link 'extracted' from that link by our mock
        # _extract_early_return_url. The response here is a copy of
        # the actual response Overdrive sends in this situation.
        http.responses.append(MockRequestsResponse(200, content="Success"))

        # Do the thing.
        success = overdrive.perform_early_return(patron, pin, loan,
                                                 http.do_get)

        # The title was 'returned'.
        eq_(True, success)

        # It worked like this:
        #
        # get_fulfillment_link was called with appropriate arguments.
        eq_((patron, pin, pool.identifier.identifier, 'ebook-epub-adobe'),
            overdrive.get_fulfillment_link_call)

        # The URL returned by that method was 'requested'.
        eq_('http://fulfillment/', http.requests.pop(0))

        # The resulting URL was passed into _extract_early_return_url.
        eq_(('http://fulfill-this-book/?or=return-early', ),
            overdrive._extract_early_return_url_call)

        # Then the URL returned by _that_ method was 'requested'.
        eq_('http://early-return/', http.requests.pop(0))

        # If no early return URL can be extracted from the fulfillment URL,
        # perform_early_return has no effect.
        #
        overdrive._extract_early_return_url_call = None
        overdrive.EARLY_RETURN_URL = None
        http.responses.append(
            MockRequestsResponse(302,
                                 dict(location="http://fulfill-this-book/")))
        success = overdrive.perform_early_return(patron, pin, loan,
                                                 http.do_get)
        eq_(False, success)

        # _extract_early_return_url was called, but since it returned
        # None, no second HTTP request was made.
        eq_('http://fulfillment/', http.requests.pop(0))
        eq_(("http://fulfill-this-book/", ),
            overdrive._extract_early_return_url_call)
        eq_([], http.requests)

        # If we can't map the delivery mechanism to one of Overdrive's
        # internal formats, perform_early_return has no effect.
        #
        loan.fulfillment.delivery_mechanism.content_type = "not-in/overdrive"
        success = overdrive.perform_early_return(patron, pin, loan,
                                                 http.do_get)
        eq_(False, success)

        # In this case, no HTTP requests were made at all, since we
        # couldn't figure out which arguments to pass into
        # get_fulfillment_link.
        eq_([], http.requests)

        # If the final attempt to hit the return URL doesn't result
        # in a 200 status code, perform_early_return has no effect.
        http.responses.append(
            MockRequestsResponse(
                302,
                dict(location="http://fulfill-this-book/?or=return-early")))
        http.responses.append(
            MockRequestsResponse(401, content="Unauthorized!"))
        success = overdrive.perform_early_return(patron, pin, loan,
                                                 http.do_get)
        eq_(False, success)
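
# The control flow pinned down by the assertions above, as a standalone
# sketch; the names and the requests-style response shape are assumptions,
# not the real OverdriveAPI method.
def perform_early_return_flow(do_get, fulfillment_url,
                              extract_early_return_url):
    # First request: follow the fulfillment link and read the redirect.
    response = do_get(fulfillment_url)
    location = response.headers.get("location")
    early_return_url = extract_early_return_url(location)
    if not early_return_url:
        # No early-return link could be extracted; stop after one request.
        return False
    # Second request: hit the early-return link; only a 200 counts.
    return do_get(early_return_url).status_code == 200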
class TestContentCafeAPI(DatabaseTest):

    base_path = os.path.split(__file__)[0]
    resource_path = os.path.join(base_path, "files", "content_cafe")

    def data_file(self, path):
        """Return the contents of a test data file."""
        return open(os.path.join(self.resource_path, path)).read()

    def setup(self):
        super(TestContentCafeAPI, self).setup()
        self.http = DummyHTTPClient()
        self.soap = MockSOAPClient(popularity_value=5)
        self.api = ContentCafeAPI(
            self._db, 'uid', 'pw', self.soap, self.http.do_get
        )
        self.identifier = self._identifier(identifier_type=Identifier.ISBN)
        self.args = dict(userid=self.api.user_id, password=self.api.password,
                         isbn=self.identifier.identifier)

    def test_from_config(self):
        # Without an integration, an error is raised.
        assert_raises(
            CannotLoadConfiguration, ContentCafeAPI.from_config, self._db
        )

        # With incomplete integrations, an error is raised.
        integration = self._external_integration(
            ExternalIntegration.CONTENT_CAFE,
            goal=ExternalIntegration.METADATA_GOAL,
            username=u'yup'
        )
        assert_raises(
            CannotLoadConfiguration, ContentCafeAPI.from_config, self._db
        )

        integration.username = None
        integration.password = u'yurp'
        assert_raises(
            CannotLoadConfiguration, ContentCafeAPI.from_config, self._db
        )

        integration.username = u'yup'
        result = ContentCafeAPI.from_config(
            self._db, soap_client=object()
        )
        eq_(True, isinstance(result, ContentCafeAPI))

        # NOTE: We can't test the case where soap_client is not
        # mocked, because the ContentCafeSOAPClient constructor makes
        # a real HTTP request to load its WSDL file. We might be able
        # to improve this by seeing how mockable SudsClient is, or by
        # mocking ContentCafeAPISOAPClient.WSDL_URL as a file:// URL.

    def test_data_source(self):
        eq_(DataSource.CONTENT_CAFE, self.api.data_source.name)

    def test_create_metadata(self):

        class Mock(ContentCafeAPI):

            popularity_measurement = "a popularity measurement"
            annotate_calls = []

            def add_reviews(self, *args):
                self.add_reviews_called_with = args

            def add_descriptions(self, *args):
                self.add_descriptions_called_with = args

            def add_author_notes(self, *args):
                self.add_author_notes_called_with = args

            def add_excerpt(self, *args):
                self.add_excerpt_called_with = args

            def measure_popularity(self, *args):
                self.measure_popularity_called_with = args
                return self.popularity_measurement

            def is_suitable_image(self, image):
                self.is_suitable_image_called_with = image
                return True

        api = Mock(self._db, 'uid', 'pw', self.soap, self.http.do_get)
        m = api.create_metadata

        # First we will make a request for a cover image. If that
        # gives a 404 error, we return nothing and don't bother making
        # any more requests.
        self.http.queue_requests_response(404)
        eq_(None, m(self.identifier))
        request_url = self.http.requests.pop()
        image_url = api.image_url % self.args
        eq_(image_url, request_url)
        eq_([], self.http.requests)

        # If the cover image request succeeds, we turn it into a LinkData
        # and add it to a new Metadata object. We then pass the
        # Metadata object into a number of other methods that gather
        # additional information from Content Cafe.
        #
        # Finally, we call measure_popularity and add its return value
        # to Metadata.measurements.
        self.http.queue_requests_response(200, 'image/png', content='an image!')

        # Here's the result.
        metadata = m(self.identifier)

        # Here's the image LinkData.
        [image] = metadata.links
        eq_(Hyperlink.IMAGE, image.rel)
        eq_(image_url, image.href)
        eq_('image/png', image.media_type)
        eq_('an image!', image.content)

        # We ran the image through our mocked version of is_suitable_image,
        # and it said it was fine.
        eq_(image.content, api.is_suitable_image_called_with)

        # Here's the popularity measurement.
        eq_([api.popularity_measurement], metadata.measurements)

        # Confirm that the mock methods were called with the right
        # arguments -- their functionality is tested individually
        # below.
        expected_args = (metadata, self.identifier, self.args)
        for called_with in (
            api.add_reviews_called_with, api.add_descriptions_called_with,
            api.add_author_notes_called_with, api.add_excerpt_called_with,
        ):
            eq_(expected_args, called_with)
        eq_((self.identifier, api.ONE_YEAR_AGO),
            api.measure_popularity_called_with)

        # If measure_popularity returns nothing, metadata.measurements
        # will be left empty.
        api.popularity_measurement = None
        self.http.queue_requests_response(200, 'image/png', content='an image!')
        metadata = m(self.identifier)
        eq_([], metadata.measurements)
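
        # Pieced together from the behavior verified above,
        # create_metadata presumably follows this outline (a hedged
        # sketch, not the real implementation):
        #
        #     response = self.do_get(self.image_url % args)
        #     if response.status_code == 404:
        #         return None
        #     metadata = Metadata(self.data_source)
        #     if self.is_suitable_image(response.content):
        #         metadata.links.append(LinkData(
        #             rel=Hyperlink.IMAGE, href=image_url,
        #             media_type=..., content=response.content))
        #     self.add_reviews(metadata, identifier, args)
        #     # ...likewise add_descriptions, add_author_notes, add_excerpt...
        #     measurement = self.measure_popularity(
        #         identifier, self.ONE_YEAR_AGO)
        #     if measurement:
        #         metadata.measurements.append(measurement)
        #     return metadata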

    def test_annotate_with_web_resources(self):
        metadata = Metadata(DataSource.CONTENT_CAFE)
        rel = self._str

        # We're going to be grabbing this URL and
        # scraping it.
        url_template = "http://url/%(arg1)s"
        args = dict(arg1='value')

        # A couple of useful functions for scraping.
        class MockScrapers(object):
            scrape_called = False
            explode_called = False
            def scrape(self, soup):
                self.scrape_called = True
                return [soup.find('content').string]

            def explode(self, soup):
                self.explode_called = True
                raise Exception("I'll never be called")
        scrapers = MockScrapers()

        # When the result of the HTTP request contains a certain phrase,
        # we don't even bother scraping.
        m = self.api.annotate_with_web_resources
        http = self.http
        http.queue_requests_response(
            200, 'text/html', content='There is no data!'
        )
        m(metadata, self.identifier, args, url_template, "no data!", rel,
          scrapers.explode)
        # We made the request but nothing happened.
        expect_url = url_template % args
        eq_(expect_url, self.http.requests.pop())
        eq_(False, scrapers.explode_called)
        eq_(None, metadata.title)
        eq_([], metadata.links)

        # Otherwise, we try to scrape.
        good_content = '<html><span class="PageHeader2">Book title</span><content>Here you go</content>'
        http.queue_requests_response(200, 'text/html', content=good_content)
        m(metadata, self.identifier, args, url_template, "no data!", rel,
          scrapers.scrape)
        eq_(True, scrapers.scrape_called)

        # We called _extract_title and used the Content Cafe title it
        # found as the title of the Metadata object.
        eq_("Book title", metadata.title)

        # Then we called scrapers.scrape, which gave us the content
        # for one LinkData.
        [link] = metadata.links
        eq_(rel, link.rel)
        eq_(None, link.href)
        eq_("text/html", link.media_type)
        eq_("Here you go", link.content)

    def test__extract_title(self):
        # Standalone test of the _extract_title helper method.

        def assert_title(title, expect):
            markup = '<html><span class="PageHeader2">%s</span><content>Description</content>' % title
            soup = BeautifulSoup(markup, 'lxml')
            eq_(expect, ContentCafeAPI._extract_title(soup))

        # A normal book title is successfully extracted.
        assert_title("A great book", "A great book")

        # A supposed title that's in KNOWN_BAD_TITLES is ignored.
        assert_title("No content currently exists for this item", None)

    def test_add_reviews(self):
        """Verify that add_reviews works in a real case."""
        metadata = Metadata(DataSource.CONTENT_CAFE)
        content = self.data_file("reviews.html")
        self.http.queue_requests_response(200, 'text/html', content=content)
        self.api.add_reviews(metadata, self.identifier, self.args)

        # We extracted six reviews from the sample file.
        reviews = metadata.links
        eq_(6, len(reviews))
        assert all(x.rel == Hyperlink.REVIEW for x in reviews)
        assert "isn't a myth!" in reviews[0].content

        # We incidentally figured out the book's title.
        eq_("Shadow Thieves", metadata.title)

    def test_add_author_notes(self):
        """Verify that add_author_notes works in a real case."""
        metadata = Metadata(DataSource.CONTENT_CAFE)
        content = self.data_file("author_notes.html")
        self.http.queue_requests_response(200, 'text/html', content=content)
        self.api.add_author_notes(metadata, self.identifier, self.args)

        [notes] = metadata.links
        eq_(Hyperlink.AUTHOR, notes.rel)
        assert 'Brenda researched turtles' in notes.content

        # We incidentally figured out the book's title.
        eq_("Franklin's Christmas Gift", metadata.title)

    def test_add_excerpt(self):
        """Verify that add_excerpt works in a real case."""
        metadata = Metadata(DataSource.CONTENT_CAFE)
        content = self.data_file("excerpt.html")
        self.http.queue_requests_response(200, 'text/html', content=content)
        self.api.add_excerpt(metadata, self.identifier, self.args)

        [excerpt] = metadata.links
        eq_(Hyperlink.SAMPLE, excerpt.rel)
        assert 'Franklin loved his marbles.' in excerpt.content

        # We incidentally figured out the book's title.
        eq_("Franklin's Christmas Gift", metadata.title)

    def test_measure_popularity(self):
        """Verify that measure_popularity turns the output of
        a SOAP request into a MeasurementData.
        """
        cutoff = object()

        # Call it.
        result = self.api.measure_popularity(self.identifier, cutoff)

        # The SOAP client's estimated_popularity method was called.
        expect = (self.identifier.identifier, cutoff)
        eq_(expect, self.soap.estimated_popularity_calls.pop())

        # The result was turned into a MeasurementData.
        assert isinstance(result, MeasurementData)
        eq_(Measurement.POPULARITY, result.quantity_measured)
        eq_(self.soap.popularity_value, result.value)

        # If the SOAP API doesn't return a popularity value, no
        # MeasurementData is created.
        self.soap.popularity_value = None
        result = self.api.measure_popularity(self.identifier, cutoff)
        eq_(expect, self.soap.estimated_popularity_calls.pop())
        eq_(None, result)
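
        # For context, measure_popularity itself presumably amounts to
        # this hedged sketch (the SOAP client attribute name is an
        # assumption):
        #
        #     value = self.soap_client.estimated_popularity(
        #         identifier.identifier, cutoff)
        #     if value is None:
        #         return None
        #     return MeasurementData(
        #         quantity_measured=Measurement.POPULARITY, value=value)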

    def test_is_suitable_image(self):
        # Images are rejected if we can tell they are Content Cafe's
        # stand-in images.
        m = ContentCafeAPI.is_suitable_image

        content = self.data_file("stand-in-image.png")
        eq_(False, m(content))

        # Otherwise, it's fine. We don't check that the image is
        # valid, only that it's not a stand-in image.
        eq_(True, m("I'm not a stand-in image."))

    def test_lookup(self):
        # Test the lookup() method.
        h = DummyHTTPClient()
        h.queue_response(200, "text/html", content="yay")

        class Mock(NoveListAPI):
            def build_query_url(self, params):
                self.build_query_url_called_with = params
                return "http://query-url/"

            def scrubbed_url(self, params):
                self.scrubbed_url_called_with = params
                return "http://scrubbed-url/"

            def review_response(self, response):
                self.review_response_called_with = response

            def lookup_info_to_metadata(self, representation):
                self.lookup_info_to_metadata_called_with = representation
                return "some metadata"

        novelist = Mock.from_config(self._default_library)
        identifier = self._identifier(identifier_type=Identifier.ISBN)

        # Do the lookup.
        result = novelist.lookup(identifier, do_get=h.do_get)

        # A number of parameters were passed into build_query_url() to
        # get the URL of the HTTP request. The same parameters were
        # also passed into scrubbed_url(), to get the URL that should
        # be used when storing the Representation in the database.
        params1 = novelist.build_query_url_called_with
        params2 = novelist.scrubbed_url_called_with
        assert params1 == params2

        assert (
            dict(
                profile=novelist.profile,
                ClientIdentifier=identifier.urn,
                ISBN=identifier.identifier,
                password=novelist.password,
                version=novelist.version,
            )
            == params1
        )

        # The HTTP request went out to the query URL -- not the scrubbed URL.
        assert ["http://query-url/"] == h.requests

        # The HTTP response was passed into novelist.review_response()
        assert (
            200,
            {"content-type": "text/html"},
            b"yay",
        ) == novelist.review_response_called_with

        # Finally, the Representation was passed into
        # lookup_info_to_metadata, which returned a hard-coded string
        # as the final result.
        assert "some metadata" == result

        # Looking at the Representation we can see that it was stored
        # in the database under its scrubbed URL, not the URL used to
        # make the request.
        rep = novelist.lookup_info_to_metadata_called_with
        assert "http://scrubbed-url/" == rep.url
        assert b"yay" == rep.content
    def test_fetch_registration_document(self):
        # Test our ability to retrieve terms-of-service information
        # from a remote registry, assuming the registry makes that
        # information available.

        # First, test the case where we can't even get the catalog
        # document.
        class Mock(RemoteRegistry):
            def fetch_catalog(self, do_get):
                self.fetch_catalog_called_with = do_get
                return REMOTE_INTEGRATION_FAILED

        registry = Mock(object())
        result = registry.fetch_registration_document()

        # Our mock fetch_catalog was called with a method that would
        # have made a real HTTP request.
        assert HTTP.debuggable_get == registry.fetch_catalog_called_with

        # But the fetch_catalog method returned a problem detail,
        # which became the return value of
        # fetch_registration_document.
        assert REMOTE_INTEGRATION_FAILED == result

        # Test the case where we get the catalog document but we can't
        # get the registration document.
        client = DummyHTTPClient()
        client.responses.append(REMOTE_INTEGRATION_FAILED)

        class Mock(RemoteRegistry):
            def fetch_catalog(self, do_get):
                return "http://register-here/", "vendor id"

            def _extract_registration_information(self, response):
                self._extract_registration_information_called_with = response
                return "TOS link", "TOS HTML data"

        registry = Mock(object())
        result = registry.fetch_registration_document(client.do_get)
        # A request was made to the registration URL mentioned in the catalog.
        assert "http://register-here/" == client.requests.pop()
        assert [] == client.requests

        # But the request returned a problem detail, which became the
        # return value of the method.
        assert REMOTE_INTEGRATION_FAILED == result

        # Finally, test the case where we can get both documents.

        client.queue_requests_response(200, content="a registration document")
        result = registry.fetch_registration_document(client.do_get)

        # Another request was made to the registration URL.
        assert "http://register-here/" == client.requests.pop()
        assert [] == client.requests

        # Our mock of _extract_registration_information was called
        # with the mock response to that request.
        response = registry._extract_registration_information_called_with
        assert b"a registration document" == response.content

        # The return value of _extract_registration_information was
        # propagated as the return value of
        # fetch_registration_document.
        assert ("TOS link", "TOS HTML data") == result
    def test_cautious_http_get(self):

        h = DummyHTTPClient()
        h.queue_response(200, content="yay")

        # If the domain is obviously safe, the GET request goes through,
        # with no HEAD request being made.
        m = Representation.cautious_http_get
        status, headers, content = m(
            "http://safe.org/",
            {},
            do_not_access=["unsafe.org"],
            do_get=h.do_get,
            cautious_head_client=object(),
        )
        assert 200 == status
        assert b"yay" == content

        # If the domain is obviously unsafe, no GET request or HEAD
        # request is made.
        status, headers, content = m(
            "http://unsafe.org/",
            {},
            do_not_access=["unsafe.org"],
            do_get=object(),
            cautious_head_client=object(),
        )
        assert 417 == status
        assert (
            "Cautiously decided not to make a GET request to http://unsafe.org/"
            == content)

        # If the domain is potentially unsafe, a HEAD request is made,
        # and the answer depends on its outcome.

        # Here, the HEAD request redirects to a prohibited site.
        def mock_redirect(*args, **kwargs):
            return MockRequestsResponse(301,
                                        dict(location="http://unsafe.org/"))

        status, headers, content = m(
            "http://caution.org/",
            {},
            do_not_access=["unsafe.org"],
            check_for_redirect=["caution.org"],
            do_get=object(),
            cautious_head_client=mock_redirect,
        )
        assert 417 == status
        assert ("application/vnd.librarysimplified-did-not-make-request" ==
                headers["content-type"])
        assert (
            "Cautiously decided not to make a GET request to http://caution.org/"
            == content)

        # Here, the HEAD request redirects to an allowed site.
        h.queue_response(200, content="good content")

        def mock_redirect(*args, **kwargs):
            return MockRequestsResponse(301, dict(location="http://safe.org/"))

        status, headers, content = m(
            "http://caution.org/",
            {},
            do_not_access=["unsafe.org"],
            check_for_redirect=["caution.org"],
            do_get=h.do_get,
            cautious_head_client=mock_redirect,
        )
        assert 200 == status
        assert b"good content" == content
class TestFeedbooksOPDSImporter(DatabaseTest):

    def _importer(self, **settings):
        collection = self._collection(
            name=DataSource.FEEDBOOKS + self._str,
            protocol=ExternalIntegration.FEEDBOOKS,
        )

        defaults = {
            FeedbooksOPDSImporter.REALLY_IMPORT_KEY: "true",
            FeedbooksOPDSImporter.REPLACEMENT_CSS_KEY: None,
        }
        for setting, value in defaults.items():
            if setting not in settings:
                settings[setting] = value

        collection.external_account_id = settings.pop('language', 'de')
        for setting, value in settings.items():
            if value is None:
                continue
            collection.external_integration.set_setting(setting, value)

        return collection, FeedbooksOPDSImporter(
            self._db, collection,
            http_get=self.http.do_get, mirror=self.mirror,
            metadata_client=self.metadata,
        )

    def setup(self):
        super(TestFeedbooksOPDSImporter, self).setup()
        self.http = DummyHTTPClient()
        self.metadata = DummyMetadataClient()
        self.mirror = MockS3Uploader()

        self.data_source = DataSource.lookup(self._db, DataSource.FEEDBOOKS)

        # Create a default importer that's good enough for most tests.
        self.collection, self.importer = self._importer()

    def sample_file(self, filename):
        return sample_data(filename, "feedbooks")

    def test_safety_switch(self):
        """The importer won't be instantiated if REALLY_IMPORT_KEY is not
        set to true.
        """
        settings = {FeedbooksOPDSImporter.REALLY_IMPORT_KEY: "false"}
        assert_raises_regexp(
            Exception, "configured to not actually do an import",
            self._importer, **settings
        )

    def test_unique_identifier(self):
        # The unique account ID is the language of the Feedbooks
        # feed in use.
        eq_('de', self.collection.unique_account_id)

    def test_error_retrieving_replacement_css(self):
        """The importer cannot be instantiated if a replacement CSS
        is specified but the replacement CSS document cannot be
        retrieved or does not appear to be CSS.
        """
        settings = {FeedbooksOPDSImporter.REPLACEMENT_CSS_KEY: "http://foo"}

        self.http.queue_response(500, content="An error message")
        assert_raises_regexp(
            IOError, "Replacement stylesheet URL returned 500 response code",
            self._importer, **settings
        )

        self.http.queue_response(
            200, content="We have many CSS offerings",
            media_type="text/html"
        )
        assert_raises_regexp(
            IOError, "Replacement stylesheet is 'text/html', not a CSS document.",
            self._importer, **settings
        )
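
        # The validation exercised here presumably happens in the
        # importer's constructor, along these lines (hedged sketch;
        # the error messages are taken from the assertions above):
        #
        #     status, headers, content = http_get(css_url, {})
        #     if status != 200:
        #         raise IOError("Replacement stylesheet URL returned "
        #                       "%s response code" % status)
        #     media_type = headers.get('content-type')
        #     if not media_type.startswith('text/css'):
        #         raise IOError("Replacement stylesheet is %r, "
        #                       "not a CSS document." % media_type)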

    def test_extract_feed_data_improves_descriptions(self):
        feed = self.sample_file("feed.atom")
        self.http.queue_response(200, OPDSFeed.ENTRY_TYPE,
                                 content=self.sample_file("677.atom"))
        metadata, failures = self.importer.extract_feed_data(
            feed, "http://url/"
        )
        [(key, value)] = metadata.items()
        eq_(u'http://www.feedbooks.com/book/677', key)
        eq_("Discourse on the Method", value.title)

        # Instead of the short description from feed.atom, we have the
        # long description from 677.atom.
        [description] = [x for x in value.links if x.rel == Hyperlink.DESCRIPTION]
        eq_(1818, len(description.content))

    def test_improve_description(self):
        # Here's a Metadata that has a bad (truncated) description.
        metadata = Metadata(self.data_source)

        bad_description = LinkData(rel=Hyperlink.DESCRIPTION, media_type="text/plain", content=u"The Discourse on the Method is a philosophical and mathematical treatise published by Ren\xe9 Descartes in 1637. Its full name is Discourse on the Method of Rightly Conducting the Reason, and Searching for Truth in the Sciences (French title: Discour...")

        irrelevant_description = LinkData(
            rel=Hyperlink.DESCRIPTION, media_type="text/plain",
            content="Don't look at me; I'm irrelevant!"
        )

        # Sending an HTTP request to this URL is going to give a 404 error.
        alternate = LinkData(rel=Hyperlink.ALTERNATE, href="http://foo/",
                             media_type=OPDSFeed.ENTRY_TYPE)

        # We're not even going to try to send an HTTP request to this URL
        # because it doesn't promise an OPDS entry.
        alternate2 = LinkData(rel=Hyperlink.ALTERNATE, href="http://bar/",
                             media_type="text/html")

        # But this URL will give us full information about this
        # entry, including a better description.
        alternate3 = LinkData(
            rel=Hyperlink.ALTERNATE, href="http://baz/",
            media_type=OPDSFeed.ENTRY_TYPE
        )

        # This URL will not be requested because the third alternate URL
        # gives us the answer we're looking for.
        alternate4 = LinkData(
            rel=Hyperlink.ALTERNATE, href="http://qux/",
            media_type=OPDSFeed.ENTRY_TYPE
        )

        # Two requests will be made. The first will result in a 404
        # error. The second will give us an OPDS entry.
        self.http.queue_response(404, content="Not found")
        self.http.queue_response(200, OPDSFeed.ENTRY_TYPE,
                                 content=self.sample_file("677.atom"))

        metadata.links = [bad_description, irrelevant_description,
                          alternate, alternate2, alternate3, alternate4]

        self.importer.improve_description("some ID", metadata)

        # The descriptions have been removed from metadata.links,
        # because 677.atom included a description we know was better.
        #
        # The incomplete description was removed even though 677.atom
        # also included a copy of it.
        assert bad_description not in metadata.links
        assert irrelevant_description not in metadata.links

        # The more complete description from 677.atom has been added.
        [good_description] = [
            x for x in metadata.links if x.rel == Hyperlink.DESCRIPTION
        ]

        # The four alternate links have not been touched.
        assert (alternate in metadata.links)
        assert (alternate2 in metadata.links)
        assert (alternate3 in metadata.links)
        assert (alternate4 in metadata.links)

        # Two HTTP requests were made.
        eq_(['http://foo/', 'http://baz/'], self.http.requests)
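
        # Judging from this test, improve_description presumably works
        # like this hedged outline: for each ALTERNATE link whose
        # media_type is OPDSFeed.ENTRY_TYPE, fetch the href; if the
        # response parses as an OPDS entry containing a description,
        # replace every existing DESCRIPTION link with that description
        # and stop making requests.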

    def test_generic_acquisition_epub_link_picked_up_as_open_access(self):
        """The OPDS feed has links with generic OPDS "acquisition"
        relations. We know that the EPUB link should be open-access
        relations, and we modify its relation on the way in.

        We do not modify the link relation for links to the other
        formats, which means they don't get picked up at all.
        """

        feed = self.sample_file("feed_with_open_access_book.atom")
        imports, errors = self.importer.extract_feed_data(feed)
        [book] = imports.values()
        open_access_links = [x for x in book.circulation.links
                             if x.rel == Hyperlink.OPEN_ACCESS_DOWNLOAD]
        links = sorted(x.href for x in open_access_links)
        eq_(['http://www.feedbooks.com/book/677.epub'], links)

        generic_links = [x for x in book.circulation.links
                         if x.rel == Hyperlink.GENERIC_OPDS_ACQUISITION]
        eq_([], generic_links)

    def test_open_access_book_modified_and_mirrored(self):
        # If no replacement CSS is specified (as is the case with
        # the default importer), the OPDSImporter.content_modifier
        # hook is not set.
        eq_(None, self.importer.new_css)
        eq_(None, self.importer.content_modifier)

        # Let's create an importer that does specify a replacement
        # CSS file.
        settings = {
            FeedbooksOPDSImporter.REPLACEMENT_CSS_KEY : "http://css/"
        }

        # The very first request made is going to be to the
        # REPLACEMENT_CSS_KEY URL.
        self.http.queue_response(
            200, content="Some new CSS", media_type="text/css",
        )
        ignore, importer = self._importer(**settings)

        # The replacement CSS is retrieved in the FeedbooksOPDSImporter
        # constructor.
        eq_([u'http://css/'], self.http.requests)

        # OPDSImporter.content_modifier has been set to call replace_css
        # when necessary.
        eq_("Some new CSS", importer.new_css)
        eq_(importer.replace_css, importer.content_modifier)

        # The requests to the various copies of the book will succeed,
        # and the books will be mirrored.
        self.http.queue_response(
            200, content=self.sample_file("677.epub"),
            media_type=Representation.EPUB_MEDIA_TYPE
        )

        # The request to
        # http://covers.feedbooks.net/book/677.jpg?size=large&t=1428398185
        # will result in a 404 error, and the image will not be
        # mirrored.
        self.http.queue_response(404, media_type="text/plain")

        self.metadata.lookups = { u"René Descartes" : "Descartes, Rene" }
        feed = self.sample_file("feed_with_open_access_book.atom")
        self.http.queue_response(
            200, OPDSFeed.ACQUISITION_FEED_TYPE,
            content=feed
        )

        [edition], [pool], [work], failures = importer.import_from_feed(feed)

        eq_({}, failures)

        # The work has been created and has metadata.
        eq_("Discourse on the Method", work.title)
        eq_(u'Ren\xe9 Descartes', work.author)

        # Two more mock HTTP requests have now been made.
        eq_([
            u'http://css/',
            u'http://www.feedbooks.com/book/677.epub',
            u'http://covers.feedbooks.net/book/677.jpg?size=large&t=1428398185',
        ],
            self.http.requests
        )

        # The EPUB was 'uploaded' to the mock S3 service and turned
        # into a LicensePoolDeliveryMechanism. The other formats were
        # ignored.
        [mechanism] = pool.delivery_mechanisms
        eq_('https://s3.amazonaws.com/test.content.bucket/FeedBooks/URI/http%3A%2F%2Fwww.feedbooks.com%2Fbook%2F677/Discourse+on+the+Method.epub',
            mechanism.resource.representation.mirror_url
        )
        eq_(u'application/epub+zip', mechanism.delivery_mechanism.content_type)

        # From information contained in the OPDS entry we determined
        # the book's license to be CC-BY-NC.
        eq_(u'https://creativecommons.org/licenses/by-nc/4.0',
            mechanism.rights_status.uri)

        # The pool is marked as open-access, because it has an open-access
        # delivery mechanism that was mirrored.
        eq_(True, pool.open_access)

        # The mirrored content contains the modified CSS.
        content = StringIO(self.mirror.content[0])
        with ZipFile(content) as zip:
            # The zip still contains the original epub's files.
            assert "META-INF/container.xml" in zip.namelist()
            assert "OPS/css/about.css" in zip.namelist()
            assert "OPS/main0.xml" in zip.namelist()

            # The content of a non-CSS file hasn't changed.
            with zip.open("mimetype") as f:
                eq_("application/epub+zip\r\n", f.read())

            # The content of CSS files has been changed to the new value.
            with zip.open("OPS/css/about.css") as f:
                eq_("Some new CSS", f.read())

    def test_in_copyright_book_not_mirrored(self):

        self.metadata.lookups = { u"René Descartes" : "Descartes, Rene" }
        feed = self.sample_file("feed_with_in_copyright_book.atom")
        self.http.queue_response(
            200, OPDSFeed.ACQUISITION_FEED_TYPE,
            content=feed
        )

        [edition], [pool], [work], failures = self.importer.import_from_feed(feed)

        # The work has been created and has metadata.
        eq_("Discourse on the Method", work.title)
        eq_(u'Ren\xe9 Descartes', work.author)

        # No mock HTTP requests were made.
        eq_([], self.http.requests)

        # Nothing was uploaded to the mock S3.
        eq_([], self.mirror.uploaded)

        # The LicensePool's delivery mechanism is set appropriately
        # to reflect an in-copyright work.
        [mechanism] = pool.delivery_mechanisms
        eq_(RightsStatus.IN_COPYRIGHT, mechanism.rights_status.uri)

        # The DeliveryMechanism has a Representation but the Representation
        # has not been set as mirrored, because nothing was uploaded.
        rep = mechanism.resource.representation
        eq_('http://www.feedbooks.com/book/677.epub', rep.url)
        eq_(None, rep.mirror_url)
        eq_(None, rep.mirror_exception)

        # The pool is not marked as open-access because although it
        # has open-access links, they're not licensed under terms we
        # can use.
        eq_(False, pool.open_access)