Esempio n. 1
0
    def test_non_open_access_book_not_mirrored(self):
        data_source = DataSource.lookup(self._db, DataSource.GUTENBERG)
        m = Metadata(data_source=data_source)

        mirror = DummyS3Uploader(fail=True)
        h = DummyHTTPClient()

        policy = ReplacementPolicy(mirror=mirror, http_get=h.do_get)

        content = "foo"
        link = LinkData(rel=Hyperlink.OPEN_ACCESS_DOWNLOAD,
                        media_type=Representation.EPUB_MEDIA_TYPE,
                        href="http://example.com/",
                        content=content,
                        rights_uri=RightsStatus.IN_COPYRIGHT)

        identifier = self._identifier()
        link_obj, is_new = identifier.add_link(
            rel=link.rel,
            href=link.href,
            data_source=data_source,
            media_type=link.media_type,
            content=link.content,
        )

        # The Hyperlink object makes it look like an open-access book,
        # but the context we have from the OPDS feed says that it's
        # not.
        m.mirror_link(None, data_source, link, link_obj, policy)

        # No HTTP requests were made.
        eq_([], h.requests)

        # Nothing was uploaded.
        eq_([], mirror.uploaded)
Esempio n. 2
0
    def test_mirror_404_error(self):
        mirror = DummyS3Uploader()
        h = DummyHTTPClient()
        h.queue_response(404)
        policy = ReplacementPolicy(mirror=mirror, http_get=h.do_get)

        edition, pool = self._edition(with_license_pool=True)

        data_source = DataSource.lookup(self._db, DataSource.GUTENBERG)

        link = LinkData(
            rel=Hyperlink.IMAGE,
            media_type=Representation.JPEG_MEDIA_TYPE,
            href="http://example.com/",
        )

        link_obj, ignore = edition.primary_identifier.add_link(
            rel=link.rel,
            href=link.href,
            data_source=data_source,
            license_pool=pool,
            media_type=link.media_type,
            content=link.content,
        )

        m = Metadata(data_source=data_source)

        m.mirror_link(edition, data_source, link, link_obj, policy)

        # Since we got a 404 error, the cover image was not mirrored.
        eq_(404, link_obj.resource.representation.status_code)
        eq_(None, link_obj.resource.representation.mirror_url)
        eq_([], mirror.uploaded)
Esempio n. 3
0
    def test_mirror_open_access_link_mirror_failure(self):
        edition, pool = self._edition(with_license_pool=True)

        data_source = DataSource.lookup(self._db, DataSource.GUTENBERG)
        m = Metadata(data_source=data_source)

        mirror = DummyS3Uploader(fail=True)
        h = DummyHTTPClient()

        policy = ReplacementPolicy(mirror=mirror, http_get=h.do_get)

        content = open(self.sample_cover_path("test-book-cover.png")).read()
        link = LinkData(rel=Hyperlink.IMAGE,
                        media_type=Representation.JPEG_MEDIA_TYPE,
                        href="http://example.com/",
                        content=content)

        link_obj, ignore = edition.primary_identifier.add_link(
            rel=link.rel,
            href=link.href,
            data_source=data_source,
            license_pool=pool,
            media_type=link.media_type,
            content=link.content,
        )

        h.queue_response(200, media_type=Representation.JPEG_MEDIA_TYPE)

        m.mirror_link(edition, data_source, link, link_obj, policy)

        representation = link_obj.resource.representation

        # The representation was fetched successfully.
        eq_(None, representation.fetch_exception)
        assert representation.fetched_at != None

        # But mirroing failed.
        assert representation.mirror_exception != None
        eq_(None, representation.mirrored_at)
        eq_(link.media_type, representation.media_type)
        eq_(link.href, representation.url)

        # The mirror url should still be set.
        assert "Gutenberg" in representation.mirror_url
        assert representation.mirror_url.endswith(
            "%s/cover.jpg" % edition.primary_identifier.identifier)

        # Book content is still there since it wasn't mirrored.
        assert representation.content != None

        # the edition's identifier-associated license pool should not be
        # suppressed just because fetch failed on getting image.
        eq_(False, pool.suppressed)

        # the license pool only gets its license_exception column filled in
        # if fetch failed on getting an Hyperlink.OPEN_ACCESS_DOWNLOAD-type epub.
        eq_(None, pool.license_exception)
Esempio n. 4
0
    def test_mirror_with_content_modifier(self):
        edition, pool = self._edition(with_license_pool=True)

        data_source = DataSource.lookup(self._db, DataSource.GUTENBERG)
        m = Metadata(data_source=data_source)

        mirror = DummyS3Uploader()

        def dummy_content_modifier(representation):
            representation.content = "Replaced Content"

        h = DummyHTTPClient()

        policy = ReplacementPolicy(mirror=mirror,
                                   content_modifier=dummy_content_modifier,
                                   http_get=h.do_get)

        link = LinkData(
            rel=Hyperlink.OPEN_ACCESS_DOWNLOAD,
            media_type=Representation.EPUB_MEDIA_TYPE,
            href="http://example.com/test.epub",
            content="I'm an epub",
        )

        link_obj, ignore = edition.primary_identifier.add_link(
            rel=link.rel,
            href=link.href,
            data_source=data_source,
            license_pool=pool,
            media_type=link.media_type,
            content=link.content,
        )

        h.queue_response(200, media_type=Representation.EPUB_MEDIA_TYPE)

        m.mirror_link(edition, data_source, link, link_obj, policy)

        representation = link_obj.resource.representation

        # The representation was fetched successfully.
        eq_(None, representation.fetch_exception)
        assert representation.fetched_at != None

        # The mirror url is set.
        assert "Gutenberg" in representation.mirror_url
        assert representation.mirror_url.endswith(
            "%s/%s.epub" %
            (edition.primary_identifier.identifier, edition.title))

        # Content isn't there since it was mirrored.
        eq_(None, representation.content)

        # The representation was mirrored, with the modified content.
        eq_([representation], mirror.uploaded)
        eq_(["Replaced Content"], mirror.content)
Esempio n. 5
0
    def test_mirror_open_access_link_mirror_failure(self):
        mirror = DummyS3Uploader(fail=True)
        h = DummyHTTPClient()

        edition, pool = self._edition(with_license_pool=True)

        data_source = DataSource.lookup(self._db, DataSource.GUTENBERG)
        policy = ReplacementPolicy(mirror=mirror, http_get=h.do_get)

        circulation_data = CirculationData(
            data_source=edition.data_source,
            primary_identifier=edition.primary_identifier,
        )

        link = LinkData(
            rel=Hyperlink.OPEN_ACCESS_DOWNLOAD,
            media_type=Representation.EPUB_MEDIA_TYPE,
            href=self._url,
        )

        link_obj, ignore = edition.primary_identifier.add_link(
            rel=link.rel,
            href=link.href,
            data_source=data_source,
            license_pool=pool,
            media_type=link.media_type,
            content=link.content,
        )

        h.queue_response(200, media_type=Representation.EPUB_MEDIA_TYPE)

        circulation_data.mirror_link(pool, data_source, link, link_obj, policy)

        representation = link_obj.resource.representation

        # The representation was fetched successfully.
        eq_(None, representation.fetch_exception)
        assert representation.fetched_at != None

        # But mirroing failed.
        assert representation.mirror_exception != None
        eq_(None, representation.mirrored_at)
        eq_(link.media_type, representation.media_type)
        eq_(link.href, representation.url)

        # The mirror url should still be set.
        assert "Gutenberg" in representation.mirror_url
        assert representation.mirror_url.endswith("%s.epub" % edition.title)

        # Book content is still there since it wasn't mirrored.
        assert representation.content != None

        # The license pool is suppressed when mirroring fails.
        eq_(True, pool.suppressed)
        assert representation.mirror_exception in pool.license_exception
Esempio n. 6
0
    def test_mirror_open_access_link_fetch_failure(self):
        edition, pool = self._edition(with_license_pool=True)

        data_source = DataSource.lookup(self._db, DataSource.GUTENBERG)
        m = Metadata(data_source=data_source)

        mirror = DummyS3Uploader()
        h = DummyHTTPClient()

        policy = ReplacementPolicy(mirror=mirror, http_get=h.do_get)

        link = LinkData(
            rel=Hyperlink.IMAGE,
            media_type=Representation.JPEG_MEDIA_TYPE,
            href="http://example.com/",
        )

        link_obj, ignore = edition.primary_identifier.add_link(
            rel=link.rel,
            href=link.href,
            data_source=data_source,
            license_pool=pool,
            media_type=link.media_type,
            content=link.content,
        )
        h.queue_response(403)

        m.mirror_link(edition, data_source, link, link_obj, policy)

        representation = link_obj.resource.representation

        # Fetch failed, so we should have a fetch exception but no mirror url.
        assert representation.fetch_exception != None
        eq_(None, representation.mirror_exception)
        eq_(None, representation.mirror_url)
        eq_(link.href, representation.url)
        assert representation.fetched_at != None
        eq_(None, representation.mirrored_at)

        # the edition's identifier-associated license pool should not be
        # suppressed just because fetch failed on getting image.
        eq_(False, pool.suppressed)

        # the license pool only gets its license_exception column filled in
        # if fetch failed on getting an Hyperlink.OPEN_ACCESS_DOWNLOAD-type epub.
        eq_(None, pool.license_exception)
Esempio n. 7
0
    def test_mirror_open_access_link_fetch_failure(self):
        mirror = DummyS3Uploader()
        h = DummyHTTPClient()

        edition, pool = self._edition(with_license_pool=True)

        data_source = DataSource.lookup(self._db, DataSource.GUTENBERG)
        policy = ReplacementPolicy(mirror=mirror, http_get=h.do_get)
        circulation_data = CirculationData(
            data_source=edition.data_source,
            primary_identifier=edition.primary_identifier,
        )

        link = LinkData(
            rel=Hyperlink.OPEN_ACCESS_DOWNLOAD,
            media_type=Representation.EPUB_MEDIA_TYPE,
            href=self._url,
        )

        link_obj, ignore = edition.primary_identifier.add_link(
            rel=link.rel,
            href=link.href,
            data_source=data_source,
            license_pool=pool,
            media_type=link.media_type,
            content=link.content,
        )

        h.queue_response(403)

        circulation_data.mirror_link(pool, data_source, link, link_obj, policy)

        representation = link_obj.resource.representation

        # Fetch failed, so we should have a fetch exception but no mirror url.
        assert representation.fetch_exception != None
        eq_(None, representation.mirror_exception)
        eq_(None, representation.mirror_url)
        eq_(link.href, representation.url)
        assert representation.fetched_at != None
        eq_(None, representation.mirrored_at)

        # The license pool is suppressed when fetch fails.
        eq_(True, pool.suppressed)
        assert representation.fetch_exception in pool.license_exception
Esempio n. 8
0
    def test_image_scale_and_mirror(self):
        # Make sure that open access material links are translated to our S3 buckets, and that
        # commercial material links are left as is.
        # Note: mirroring links is now also CirculationData's job.  So the unit tests
        # that test for that have been changed to call to mirror cover images.
        # However, updated tests passing does not guarantee that all code now
        # correctly calls on CirculationData, too.  This is a risk.

        mirror = DummyS3Uploader()
        edition, pool = self._edition(with_license_pool=True)
        content = open(self.sample_cover_path("test-book-cover.png")).read()
        l1 = LinkData(rel=Hyperlink.IMAGE,
                      href="http://example.com/",
                      media_type=Representation.JPEG_MEDIA_TYPE,
                      content=content)
        thumbnail_content = open(
            self.sample_cover_path("tiny-image-cover.png")).read()
        l2 = LinkData(rel=Hyperlink.THUMBNAIL_IMAGE,
                      href="http://example.com/thumb.jpg",
                      media_type=Representation.JPEG_MEDIA_TYPE,
                      content=content)

        # When we call metadata.apply, all image links will be scaled and
        # 'mirrored'.
        policy = ReplacementPolicy(mirror=mirror)
        metadata = Metadata(links=[l1, l2], data_source=edition.data_source)
        metadata.apply(edition, replace=policy)

        # Two Representations were 'mirrored'.
        image, thumbnail = mirror.uploaded

        # The image...
        [image_link] = image.resource.links
        eq_(Hyperlink.IMAGE, image_link.rel)

        # And its thumbnail.
        eq_(image, thumbnail.thumbnail_of)

        # The original image is too big to be a thumbnail.
        eq_(600, image.image_height)
        eq_(400, image.image_width)

        # The thumbnail is the right height.
        eq_(Edition.MAX_THUMBNAIL_HEIGHT, thumbnail.image_height)
        eq_(Edition.MAX_THUMBNAIL_WIDTH, thumbnail.image_width)

        # The thumbnail is newly generated from the full-size
        # image--the thumbnail that came in from the OPDS feed was
        # ignored.
        assert thumbnail.url != l2.href
        assert thumbnail.content != l2.content

        # Both images have been 'mirrored' to Amazon S3.
        assert image.mirror_url.startswith(
            'http://s3.amazonaws.com/test.cover.bucket/')
        assert image.mirror_url.endswith('cover.jpg')

        # The thumbnail image has been converted to PNG.
        assert thumbnail.mirror_url.startswith(
            'http://s3.amazonaws.com/test.cover.bucket/scaled/300/')
        assert thumbnail.mirror_url.endswith('cover.png')
Esempio n. 9
0
    def test_open_access_content_mirrored(self):
        # Make sure that open access material links are translated to our S3 buckets, and that 
        # commercial material links are left as is.
        # Note: Mirroring tests passing does not guarantee that all code now 
        # correctly calls on CirculationData, as well as Metadata.  This is a risk.

        mirror = DummyS3Uploader()
        # Here's a book.
        edition, pool = self._edition(with_license_pool=True)

        # Here's a link to the content of the book, which will be mirrored.
        link_mirrored = LinkData(
            rel=Hyperlink.OPEN_ACCESS_DOWNLOAD, href="http://example.com/",
            media_type=Representation.EPUB_MEDIA_TYPE,
            content="i am a tiny book"
        )

        # This link will not be mirrored.
        link_unmirrored = LinkData(
            rel=Hyperlink.DRM_ENCRYPTED_DOWNLOAD, href="http://example.com/2",
            media_type=Representation.EPUB_MEDIA_TYPE,
            content="i am a pricy book"
        )

        # Apply the metadata.
        policy = ReplacementPolicy(mirror=mirror)

        metadata = Metadata(data_source=edition.data_source, 
        	links=[link_mirrored, link_unmirrored],
    	)
        metadata.apply(edition, replace=policy)
        # make sure the refactor is done right, and metadata does not upload
        eq_(0, len(mirror.uploaded))


        circulation_data = CirculationData(
        	data_source=edition.data_source, 
        	primary_identifier=edition.primary_identifier,
        	links=[link_mirrored, link_unmirrored],
        )
        circulation_data.apply(pool, replace=policy)
        
        # make sure the refactor is done right, and circulation does upload 
        eq_(1, len(mirror.uploaded))

        # Only the open-access link has been 'mirrored'.
        [book] = mirror.uploaded

        # It's remained an open-access link.
        eq_(
            [Hyperlink.OPEN_ACCESS_DOWNLOAD], 
            [x.rel for x in book.resource.links]
        )


        # It's been 'mirrored' to the appropriate S3 bucket.
        assert book.mirror_url.startswith('http://s3.amazonaws.com/test.content.bucket/')
        expect = '/%s/%s.epub' % (
            edition.primary_identifier.identifier,
            edition.title
        )
        assert book.mirror_url.endswith(expect)

        # make sure the mirrored link is safely on edition
        sorted_edition_links = sorted(edition.license_pool.identifier.links, key=lambda x: x.rel)
        unmirrored_representation, mirrored_representation = [edlink.resource.representation for edlink in sorted_edition_links]
        assert mirrored_representation.mirror_url.startswith('http://s3.amazonaws.com/test.content.bucket/')

        # make sure the unmirrored link is safely on edition
        eq_('http://example.com/2', unmirrored_representation.url)
        # make sure the unmirrored link has not been translated to an S3 URL
        eq_(None, unmirrored_representation.mirror_url)
    def test_set_metadata_incorporates_replacement_policy(self):
        """Make sure that if a ReplacementPolicy is passed in to
        set_metadata(), the policy's settings (and those of its
        .presentation_calculation_policy) are respected.
        """

        edition, pool = self._edition(with_license_pool=True)
        identifier = edition.primary_identifier

        # All images and open-access content should be uploaded to
        # this 'mirror'.
        mirror = DummyS3Uploader()
        http = DummyHTTPClient()
        http.queue_response(
            200,
            content='I am an epub.',
            media_type=Representation.EPUB_MEDIA_TYPE,
        )

        class Tripwire(PresentationCalculationPolicy):
            # This class sets a variable if one of its properties is
            # accessed.
            def __init__(self, *args, **kwargs):
                self.tripped = False

            def __getattr__(self, name):
                self.tripped = True
                return True

        presentation_calculation_policy = Tripwire()

        metadata_replacement_policy = ReplacementPolicy(
            mirror=mirror,
            http_get=http.do_get,
            presentation_calculation_policy=presentation_calculation_policy)

        circulationdata_replacement_policy = ReplacementPolicy(
            mirror=mirror,
            http_get=http.do_get,
        )

        output_source = DataSource.lookup(self._db, DataSource.GUTENBERG)
        provider = CoverageProvider("service", [identifier.type],
                                    output_source)

        metadata = Metadata(output_source)
        # We've got a CirculationData object that includes an open-access download.
        link = LinkData(rel=Hyperlink.OPEN_ACCESS_DOWNLOAD,
                        href="http://foo.com/")
        circulationdata = CirculationData(
            output_source,
            primary_identifier=metadata.primary_identifier,
            links=[link])

        provider.set_metadata_and_circulation_data(
            identifier,
            metadata,
            circulationdata,
            metadata_replacement_policy=metadata_replacement_policy,
            circulationdata_replacement_policy=
            circulationdata_replacement_policy,
        )

        # The open-access download was 'downloaded' and 'mirrored'.
        [mirrored] = mirror.uploaded
        eq_("http://foo.com/", mirrored.url)
        assert mirrored.mirror_url.endswith(
            "/%s/%s.epub" % (identifier.identifier, edition.title))

        # The book content was removed from the db after it was
        # mirrored successfully.
        eq_(None, mirrored.content)

        # Our custom PresentationCalculationPolicy was used when
        # determining whether to recalculate the work's
        # presentation. We know this because the tripwire was
        # triggered.
        eq_(True, presentation_calculation_policy.tripped)
Esempio n. 11
0
    def test_resources_are_mirrored_on_import(self):

        svg = """<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN"
  "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">

<svg xmlns="http://www.w3.org/2000/svg" width="1000" height="500">
    <ellipse cx="50" cy="25" rx="50" ry="25" style="fill:blue;"/>
</svg>"""

        http = DummyHTTPClient()
        # The request to http://root/full-cover-image.png
        # will result in a 404 error, and the image will not be mirrored.
        http.queue_response(404, media_type="text/plain")
        http.queue_response(
            200,
            content='I am 10557.epub.images',
            media_type=Representation.EPUB_MEDIA_TYPE,
        )
        http.queue_response(200,
                            content=svg,
                            media_type=Representation.SVG_MEDIA_TYPE)
        http.queue_response(200,
                            content='I am 10441.epub.images',
                            media_type=Representation.EPUB_MEDIA_TYPE)

        s3 = DummyS3Uploader()

        importer = OPDSImporter(self._db,
                                data_source_name=DataSource.OA_CONTENT_SERVER,
                                mirror=s3,
                                http_get=http.do_get)

        imported_editions, pools, works, failures = (importer.import_from_feed(
            self.content_server_mini_feed, feed_url='http://root'))
        e1 = imported_editions[0]
        e2 = imported_editions[1]

        # The import process requested each remote resource in the
        # order they appeared in the OPDS feed. The thumbnail
        # image was not requested, since we were going to make our own
        # thumbnail anyway.
        eq_(http.requests, [
            'http://www.gutenberg.org/ebooks/10441.epub.images',
            'https://s3.amazonaws.com/book-covers.nypl.org/Gutenberg-Illustrated/10441/cover_10441_9.png',
            'http://www.gutenberg.org/ebooks/10557.epub.images',
            'http://root/full-cover-image.png',
        ])

        [e1_oa_link, e1_image_link,
         e1_description_link] = sorted(e1.primary_identifier.links,
                                       key=lambda x: x.rel)
        [e2_image_link, e2_oa_link] = e2.primary_identifier.links

        # The two open-access links were mirrored to S3, as was the
        # original SVG image and its PNG thumbnail. The PNG image was
        # not mirrored because our attempt to download it resulted in
        # a 404 error.
        imported_representations = [
            e1_oa_link.resource.representation,
            e1_image_link.resource.representation,
            e1_image_link.resource.representation.thumbnails[0],
            e2_oa_link.resource.representation,
        ]
        eq_(imported_representations, s3.uploaded)

        eq_(4, len(s3.uploaded))
        eq_("I am 10441.epub.images", s3.content[0])
        eq_(svg, s3.content[1])
        eq_("I am 10557.epub.images", s3.content[3])

        # Each resource was 'mirrored' to an Amazon S3 bucket.
        #
        # The "mouse" book was mirrored to a bucket corresponding to
        # Project Gutenberg, its data source.
        #
        # The images were mirrored to a bucket corresponding to the
        # open-access content server, _their_ data source.
        #
        # The "crow" book was mirrored to a bucket corresponding to
        # the open-access content source, the default data source used
        # when no distributor was specified for a book.
        url0 = 'http://s3.amazonaws.com/test.content.bucket/Gutenberg/Gutenberg%20ID/10441/The%20Green%20Mouse.epub.images'
        url1 = u'http://s3.amazonaws.com/test.cover.bucket/Library%20Simplified%20Open%20Access%20Content%20Server/Gutenberg%20ID/10441/cover_10441_9.png'
        url2 = u'http://s3.amazonaws.com/test.cover.bucket/scaled/300/Library%20Simplified%20Open%20Access%20Content%20Server/Gutenberg%20ID/10441/cover_10441_9.png'
        url3 = 'http://s3.amazonaws.com/test.content.bucket/Library%20Simplified%20Open%20Access%20Content%20Server/Gutenberg%20ID/10557/Johnny%20Crow%27s%20Party.epub.images'
        uploaded_urls = [x.mirror_url for x in s3.uploaded]
        eq_([url0, url1, url2, url3], uploaded_urls)

        # If we fetch the feed again, and the entries have been updated since the
        # cutoff, but the content of the open access links hasn't changed, we won't mirror
        # them again.
        cutoff = datetime.datetime(2013, 1, 2, 16, 56, 40)

        http.queue_response(304, media_type=Representation.EPUB_MEDIA_TYPE)

        http.queue_response(304, media_type=Representation.SVG_MEDIA_TYPE)

        http.queue_response(304, media_type=Representation.EPUB_MEDIA_TYPE)

        imported_editions, pools, works, failures = (importer.import_from_feed(
            self.content_server_mini_feed))

        eq_([e1, e2], imported_editions)

        # Nothing new has been uploaded
        eq_(4, len(s3.uploaded))

        # If the content has changed, it will be mirrored again.
        http.queue_response(200,
                            content="I am a new version of 10557.epub.images",
                            media_type=Representation.EPUB_MEDIA_TYPE)

        http.queue_response(200,
                            content=svg,
                            media_type=Representation.SVG_MEDIA_TYPE)

        http.queue_response(200,
                            content="I am a new version of 10441.epub.images",
                            media_type=Representation.EPUB_MEDIA_TYPE)

        imported_editions, pools, works, failures = (importer.import_from_feed(
            self.content_server_mini_feed))

        eq_([e1, e2], imported_editions)
        eq_(8, len(s3.uploaded))
        eq_("I am a new version of 10441.epub.images", s3.content[4])
        eq_(svg, s3.content[5])
        eq_("I am a new version of 10557.epub.images", s3.content[7])