def setup(self):
        """Prepare the fixtures shared by the TestFeedbooksOPDSImporter tests.

        Installs a dummy HTTP client, a dummy metadata client, a pair of
        mock S3 uploaders (covers and books), the Feedbooks data source,
        and a default collection/importer pair.
        """
        super(TestFeedbooksOPDSImporter, self).setup()

        self.http = DummyHTTPClient()
        self.metadata = DummyMetadataClient()
        self.mirrors = {
            "covers_mirror": MockS3Uploader(),
            "books_mirror": MockS3Uploader(),
        }

        self.data_source = DataSource.lookup(self._db, DataSource.FEEDBOOKS)

        # A default importer that is good enough for most tests.
        default_collection, default_importer = self._importer()
        self.collection = default_collection
        self.importer = default_importer
    def test_load_cover_link(self):
        """Exercise load_cover_link with an empty, then a populated, mock filesystem."""
        # Start with a directory import script whose mock filesystem is empty.
        empty_script = MockDirectoryImportScript(self._db, {})

        book_id = self._identifier(Identifier.GUTENBERG_ID, "2345")
        source = DataSource.lookup(self._db, DataSource.GUTENBERG)
        uploader = MockS3Uploader()
        call_args = (book_id, source, "covers", uploader)

        # Nothing is on the mock filesystem, so load_cover_link
        # has nothing to return.
        eq_(None, empty_script.load_cover_link(*call_args))

        # The script still went looking for a cover image.
        expected_lookup = (
            '2345', 'covers', Representation.COMMON_IMAGE_EXTENSIONS,
            'cover image',
        )
        eq_(expected_lookup, empty_script._locate_file_args)

        # Repeat with a script whose mock filesystem contains a cover.
        cover_entry = (
            'acover.jpeg', Representation.JPEG_MEDIA_TYPE, "I'm an image."
        )
        populated_script = MockDirectoryImportScript(
            self._db, {'covers': cover_entry}
        )
        cover_link = populated_script.load_cover_link(*call_args)

        eq_(Hyperlink.IMAGE, cover_link.rel)
        expected_suffix = (
            '/test.cover.bucket/Gutenberg/Gutenberg+ID/2345/2345.jpg'
        )
        assert cover_link.href.endswith(expected_suffix)
        eq_(Representation.JPEG_MEDIA_TYPE, cover_link.media_type)
        eq_("I'm an image.", cover_link.content)
    def setup(self):
        """Create the collection and coverage provider shared by these tests."""
        super(TestIntegrationClientCoverImageCoverageProvider, self).setup()

        # Replacement policy backed by a mock S3 uploader.
        policy = ReplacementPolicy.from_metadata_source(
            mirror=MockS3Uploader()
        )
        self.collection = self._collection(
            protocol=ExternalIntegration.OPDS_FOR_DISTRIBUTORS
        )
        self.provider = IntegrationClientCoverImageCoverageProvider(
            replacement_policy=policy, collection=self.collection
        )
Example #4
0
    def test_mirror_open_access_link_mirror_failure(self):
        """Check the state left behind when the fetch succeeds but the mirror fails.

        The representation should record a mirror exception (and no
        mirror URL), keep its content, and the license pool should be
        suppressed with the mirror exception in its license_exception.
        """
        # The books mirror is constructed with fail=True; there is no
        # covers mirror at all.
        mirrors = dict(books_mirror=MockS3Uploader(fail=True),
                       covers_mirror=None)
        h = DummyHTTPClient()

        edition, pool = self._edition(with_license_pool=True)

        data_source = DataSource.lookup(self._db, DataSource.GUTENBERG)
        policy = ReplacementPolicy(mirrors=mirrors, http_get=h.do_get)

        circulation_data = CirculationData(
            data_source=edition.data_source,
            primary_identifier=edition.primary_identifier,
        )

        link = LinkData(
            rel=Hyperlink.OPEN_ACCESS_DOWNLOAD,
            media_type=Representation.EPUB_MEDIA_TYPE,
            href=self._url,
        )

        link_obj, ignore = edition.primary_identifier.add_link(
            rel=link.rel,
            href=link.href,
            data_source=data_source,
            media_type=link.media_type,
            content=link.content,
        )

        # The HTTP fetch itself will succeed.
        h.queue_response(200, media_type=Representation.EPUB_MEDIA_TYPE)

        circulation_data.mirror_link(pool, data_source, link, link_obj, policy)

        representation = link_obj.resource.representation

        # The representation was fetched successfully.
        assert representation.fetch_exception is None
        assert representation.fetched_at is not None

        # But mirroring failed.
        assert representation.mirror_exception is not None
        assert representation.mirrored_at is None
        assert link.media_type == representation.media_type
        assert link.href == representation.url

        # The mirror url was never set.
        assert representation.mirror_url is None

        # Book content is still there since it wasn't mirrored.
        assert representation.content is not None

        # The license pool is suppressed when mirroring fails.
        assert True == pool.suppressed
        assert representation.mirror_exception in pool.license_exception
    def test_replacement_policy_uses_provided_mirror(self):
        """Covers found through Overdrive are uploaded via the supplied mirror."""
        overdrive_collection = MockOverdriveAPI.mock_collection(self._db)
        uploader = MockS3Uploader()
        policy = ReplacementPolicy.from_metadata_source(mirror=uploader)
        mock_api = MockOverdriveAPI(self._db, overdrive_collection)
        mock_api.queue_collection_token()
        provider = OverdriveBibliographicCoverageProvider(
            overdrive_collection,
            replacement_policy=policy,
            api_class=mock_api,
        )

        # The provider's policy points at the mirror we handed in, so
        # any resources discovered by Overdrive go through it.
        eq_(uploader, provider.replacement_policy.mirror)

        fake_http = DummyHTTPClient()
        provider.replacement_policy.http_get = fake_http.do_get

        # Look up one specific identifier through 'Overdrive'.
        overdrive_id = self._identifier(
            Identifier.OVERDRIVE_ID, "3896665d-9d81-4cac-bd43-ffc5066de1f5"
        )

        metadata_body = self.data_file("overdrive/overdrive_metadata.json")
        provider.api.queue_response(200, {}, metadata_body)

        full_cover_data = self.data_file("covers/test-book-cover.png")
        small_cover_data = self.data_file("covers/tiny-image-cover.png")

        # Queued first: Overdrive's full-sized image. Our own thumbnail
        # will be generated from it.
        fake_http.queue_response(200, "image/jpeg", {}, full_cover_data)

        # Queued second: Overdrive's thumbnail image, which we will
        # not be using.
        fake_http.queue_response(200, "image/jpeg", {}, small_cover_data)

        coverage_record = provider.ensure_coverage(overdrive_id)
        eq_("success", coverage_record.status)

        # Both the full image and the thumbnail landed on the fake S3.
        full_upload, thumbnail_upload = uploader.uploaded
        eq_(full_cover_data, full_upload.content)

        # The Resource URLs are our S3 URLs rather than Overdrive's.
        expected_path = "Overdrive/Overdrive+ID/%s" % overdrive_id.identifier
        for mirrored_url in (full_upload.mirror_url,
                             thumbnail_upload.mirror_url):
            assert expected_path in mirrored_url
        assert "/scaled/" in thumbnail_upload.mirror_url
        assert "/scaled/" not in full_upload.mirror_url

        # The thumbnail is a freshly generated image: it matches neither
        # Overdrive's own thumbnail nor the full-sized cover.
        assert thumbnail_upload.content != small_cover_data
        assert thumbnail_upload.content != full_cover_data
    def test_load_circulation_data(self):
        """Exercise load_circulation_data with an empty, then a populated, mock filesystem."""
        # Start with a directory import script whose mock filesystem is empty.
        empty_script = MockDirectoryImportScript(self._db, {})

        book_id = self._identifier(Identifier.GUTENBERG_ID, "2345")
        source = DataSource.lookup(self._db, DataSource.GUTENBERG)
        uploader = MockS3Uploader()
        call_args = (
            book_id, source, "ebooks", uploader, "Name of book", "rights URI",
        )

        # Nothing is on the mock filesystem, so load_circulation_data
        # has nothing to return.
        eq_(None, empty_script.load_circulation_data(*call_args))

        # The script still went looking for an ebook file.
        expected_lookup = (
            '2345', 'ebooks', Representation.COMMON_EBOOK_EXTENSIONS,
            'ebook file',
        )
        eq_(expected_lookup, empty_script._locate_file_args)

        # Repeat with a script whose mock filesystem contains an ebook.
        ebook_entry = (
            'book.epub', Representation.EPUB_MEDIA_TYPE, "I'm an EPUB."
        )
        populated_script = MockDirectoryImportScript(
            self._db, {'ebooks': ebook_entry}
        )

        # This time _locate_file finds something, and the result is a
        # fully populated CirculationData object.
        circulation = populated_script.load_circulation_data(*call_args)
        eq_(book_id, circulation.primary_identifier(self._db))
        eq_(source, circulation.data_source(self._db))
        eq_("rights URI", circulation.default_rights_uri)

        # Exactly one link is attached, and it is an open-access download.
        (download_link,) = circulation.links
        eq_(Hyperlink.OPEN_ACCESS_DOWNLOAD, download_link.rel)
        expected_suffix = (
            '/test.content.bucket/Gutenberg/Gutenberg+ID/2345/Name+of+book.epub'
        )
        assert download_link.href.endswith(expected_suffix)
        eq_(Representation.EPUB_MEDIA_TYPE, download_link.media_type)
        eq_("I'm an EPUB.", download_link.content)

        # Exactly one FormatData describes the delivery mechanism that
        # makes the open-access link available.
        (format_data,) = circulation.formats
        eq_(download_link, format_data.link)
        eq_(download_link.media_type, format_data.content_type)
        eq_(DeliveryMechanism.NO_DRM, format_data.drm_scheme)
Example #7
0
    def test_mirror_open_access_link_fetch_failure(self):
        """Check the state left behind when fetching the open-access link fails.

        The representation should record a fetch exception and no mirror
        information, and the license pool should be suppressed with the
        fetch exception in its license_exception.
        """
        mirrors = dict(books_mirror=MockS3Uploader())
        h = DummyHTTPClient()

        edition, pool = self._edition(with_license_pool=True)

        data_source = DataSource.lookup(self._db, DataSource.GUTENBERG)
        policy = ReplacementPolicy(mirrors=mirrors, http_get=h.do_get)
        circulation_data = CirculationData(
            data_source=edition.data_source,
            primary_identifier=edition.primary_identifier,
        )

        link = LinkData(
            rel=Hyperlink.OPEN_ACCESS_DOWNLOAD,
            media_type=Representation.EPUB_MEDIA_TYPE,
            href=self._url,
        )

        link_obj, ignore = edition.primary_identifier.add_link(
            rel=link.rel,
            href=link.href,
            data_source=data_source,
            media_type=link.media_type,
            content=link.content,
        )

        # The HTTP request for the book will come back 403.
        h.queue_response(403)

        circulation_data.mirror_link(pool, data_source, link, link_obj, policy)

        representation = link_obj.resource.representation

        # Fetch failed, so we should have a fetch exception but no mirror url.
        assert representation.fetch_exception is not None
        assert representation.mirror_exception is None
        assert representation.mirror_url is None
        assert link.href == representation.url
        assert representation.fetched_at is not None
        assert representation.mirrored_at is None

        # The license pool is suppressed when fetch fails.
        assert True == pool.suppressed
        assert representation.fetch_exception in pool.license_exception
Example #8
0
    def test_open_access_content_mirrored(self):
        """Open-access links are mirrored by CirculationData; other links are not.

        Applying Metadata alone must not upload anything; applying
        CirculationData uploads only the open-access link.
        """
        # Make sure that open access material links are translated to our S3 buckets, and that
        # commercial material links are left as is.
        # Note: Mirroring tests passing does not guarantee that all code now
        # correctly calls on CirculationData, as well as Metadata.  This is a risk.

        mirrors = dict(books_mirror=MockS3Uploader(), covers_mirror=None)
        mirror_type = ExternalIntegrationLink.OPEN_ACCESS_BOOKS
        # Here's a book.
        edition, pool = self._edition(with_license_pool=True)

        # Here's a link to the content of the book, which will be mirrored.
        link_mirrored = LinkData(
            rel=Hyperlink.OPEN_ACCESS_DOWNLOAD,
            href="http://example.com/",
            media_type=Representation.EPUB_MEDIA_TYPE,
            content="i am a tiny book",
        )

        # This link will not be mirrored.
        link_unmirrored = LinkData(
            rel=Hyperlink.DRM_ENCRYPTED_DOWNLOAD,
            href="http://example.com/2",
            media_type=Representation.EPUB_MEDIA_TYPE,
            content="i am a pricy book",
        )

        # Apply the metadata.
        policy = ReplacementPolicy(mirrors=mirrors)

        metadata = Metadata(
            data_source=edition.data_source,
            links=[link_mirrored, link_unmirrored],
        )
        metadata.apply(edition, pool.collection, replace=policy)
        # make sure the refactor is done right, and metadata does not upload
        assert 0 == len(mirrors[mirror_type].uploaded)

        circulation_data = CirculationData(
            data_source=edition.data_source,
            primary_identifier=edition.primary_identifier,
            links=[link_mirrored, link_unmirrored],
        )
        circulation_data.apply(self._db, pool.collection, replace=policy)

        # make sure the refactor is done right, and circulation does upload
        assert 1 == len(mirrors[mirror_type].uploaded)

        # Only the open-access link has been 'mirrored'.
        [book] = mirrors[mirror_type].uploaded

        # It's remained an open-access link.
        assert [Hyperlink.OPEN_ACCESS_DOWNLOAD
                ] == [x.rel for x in book.resource.links]

        # It's been 'mirrored' to the appropriate S3 bucket.
        assert book.mirror_url.startswith(
            "https://test-content-bucket.s3.amazonaws.com/")
        expect = "/%s/%s.epub" % (edition.primary_identifier.identifier,
                                  edition.title)
        assert book.mirror_url.endswith(expect)

        # make sure the mirrored link is safely on edition.
        # Sorting by rel fixes the order in which the two
        # representations are unpacked below.
        sorted_edition_links = sorted(pool.identifier.links,
                                      key=lambda x: x.rel)
        unmirrored_representation, mirrored_representation = [
            edlink.resource.representation for edlink in sorted_edition_links
        ]
        assert mirrored_representation.mirror_url.startswith(
            "https://test-content-bucket.s3.amazonaws.com/")

        # make sure the unmirrored link is safely on edition
        assert "http://example.com/2" == unmirrored_representation.url
        # make sure the unmirrored link has not been translated to an S3 URL
        assert unmirrored_representation.mirror_url is None
Example #9
0
    def test_records(self):
        """Exercise MARCExporter.records against a mock mirror and search index.

        Covers: the missing-storage-integration error, a Lane, a
        WorkList, an explicit start time, and an empty search index.
        Each scenario deletes the CachedMARCFile it created so the next
        one can assert that exactly one exists.
        """
        # The return value is unused here; presumably the call registers
        # the integration that from_config() looks up -- TODO confirm.
        integration = self._integration()
        now = utc_now()
        exporter = MARCExporter.from_config(self._default_library)
        annotator = Annotator()
        lane = self._lane("Test Lane", genres=["Mystery"])
        w1 = self._work(genre="Mystery", with_open_access_download=True)
        w2 = self._work(genre="Mystery", with_open_access_download=True)

        search_engine = MockExternalSearchIndex()
        search_engine.bulk_update([w1, w2])

        # If there's a storage protocol but not corresponding storage integration,
        # it raises an exception.
        pytest.raises(Exception, exporter.records, lane, annotator)

        # If there is a storage integration, the output file is mirrored.
        mirror_integration = self._external_integration(
            ExternalIntegration.S3,
            ExternalIntegration.STORAGE_GOAL,
            username="******",
            password="******",
        )

        mirror = MockS3Uploader()

        exporter.records(
            lane,
            annotator,
            mirror_integration,
            mirror=mirror,
            query_batch_size=1,
            upload_batch_size=1,
            search_engine=search_engine,
        )

        # The file was mirrored and a CachedMARCFile was created to track the mirrored file.
        assert 1 == len(mirror.uploaded)
        [cache] = self._db.query(CachedMARCFile).all()
        assert self._default_library == cache.library
        assert lane == cache.lane
        assert mirror.uploaded[0] == cache.representation
        assert cache.representation.content is None
        assert ("https://test-marc-bucket.s3.amazonaws.com/%s/%s/%s.mrc" % (
            self._default_library.short_name,
            quote(str(cache.representation.fetched_at)),
            quote(lane.display_name),
        ) == mirror.uploaded[0].mirror_url)
        assert cache.start_time is None
        assert cache.end_time > now

        # The content was uploaded in two parts.
        assert 2 == len(mirror.content[0])
        complete_file = b"".join(mirror.content[0])
        records = list(MARCReader(complete_file))
        assert 2 == len(records)

        title_fields = [record.get_fields("245") for record in records]
        titles = [fields[0].get_subfields("a")[0] for fields in title_fields]
        assert {w1.title, w2.title} == set(titles)

        assert w1.title in w1.marc_record
        assert w2.title in w2.marc_record

        self._db.delete(cache)

        # It also works with a WorkList instead of a Lane, in which case
        # there will be no lane in the CachedMARCFile.
        worklist = WorkList()
        worklist.initialize(self._default_library, display_name="All Books")

        mirror = MockS3Uploader()
        exporter.records(
            worklist,
            annotator,
            mirror_integration,
            mirror=mirror,
            query_batch_size=1,
            upload_batch_size=1,
            search_engine=search_engine,
        )

        assert 1 == len(mirror.uploaded)
        [cache] = self._db.query(CachedMARCFile).all()
        assert self._default_library == cache.library
        assert cache.lane is None
        assert mirror.uploaded[0] == cache.representation
        assert cache.representation.content is None
        assert ("https://test-marc-bucket.s3.amazonaws.com/%s/%s/%s.mrc" % (
            self._default_library.short_name,
            quote(str(cache.representation.fetched_at)),
            quote(worklist.display_name),
        ) == mirror.uploaded[0].mirror_url)
        assert cache.start_time is None
        assert cache.end_time > now

        assert 2 == len(mirror.content[0])
        complete_file = b"".join(mirror.content[0])
        records = list(MARCReader(complete_file))
        assert 2 == len(records)

        self._db.delete(cache)

        # If a start time is set, it's used in the mirror url.
        #
        # (Our mock search engine returns everything in its 'index',
        # so this doesn't test that the start time is actually used to
        # find works -- that's in the search index tests and the
        # tests of MARCExporterFacets.)
        start_time = now - datetime.timedelta(days=3)

        mirror = MockS3Uploader()
        exporter.records(
            lane,
            annotator,
            mirror_integration,
            start_time=start_time,
            mirror=mirror,
            query_batch_size=2,
            upload_batch_size=2,
            search_engine=search_engine,
        )
        [cache] = self._db.query(CachedMARCFile).all()

        assert self._default_library == cache.library
        assert lane == cache.lane
        assert mirror.uploaded[0] == cache.representation
        assert cache.representation.content is None
        assert ("https://test-marc-bucket.s3.amazonaws.com/%s/%s-%s/%s.mrc" % (
            self._default_library.short_name,
            quote(str(start_time)),
            quote(str(cache.representation.fetched_at)),
            quote(lane.display_name),
        ) == mirror.uploaded[0].mirror_url)
        assert start_time == cache.start_time
        assert cache.end_time > now
        self._db.delete(cache)

        # If the search engine returns no contents for the lane,
        # nothing will be mirrored, but a CachedMARCFile is still
        # created to track that we checked for updates.
        empty_search_engine = MockExternalSearchIndex()

        mirror = MockS3Uploader()
        exporter.records(
            lane,
            annotator,
            mirror_integration,
            mirror=mirror,
            search_engine=empty_search_engine,
        )

        assert [] == mirror.content[0]
        [cache] = self._db.query(CachedMARCFile).all()
        assert cache.representation == mirror.uploaded[0]
        assert self._default_library == cache.library
        assert lane == cache.lane
        assert cache.representation.content is None
        assert cache.start_time is None
        assert cache.end_time > now

        self._db.delete(cache)
    def test_work_from_metadata(self):
        """Validate the ability to create a new Work from appropriate metadata.

        First runs work_from_metadata with no files 'on disk' (nothing
        is imported, but annotate_metadata is still called), then with a
        mock filesystem holding a cover and an EPUB.
        """

        class Mock(MockDirectoryImportScript):
            """In this test we need to verify that annotate_metadata
            was called but did nothing.
            """
            def annotate_metadata(self, metadata, *args, **kwargs):
                # Leave a marker on the metadata so the test can tell
                # this method ran, then defer to the real implementation.
                metadata.annotated = True
                return super(Mock, self).annotate_metadata(
                    metadata, *args, **kwargs
                )

        identifier = IdentifierData(Identifier.GUTENBERG_ID, "1003")
        identifier_obj, ignore = identifier.load(self._db)
        metadata = Metadata(
            DataSource.GUTENBERG,
            primary_identifier=identifier,
            title=u"A book"
        )
        metadata.annotated = False
        datasource = DataSource.lookup(self._db, DataSource.GUTENBERG)
        policy = ReplacementPolicy.from_license_source(self._db)
        mirror = MockS3Uploader()
        policy.mirror = mirror

        # Here, work_from_metadata calls annotate_metadata, but does
        # not actually import anything because there are no files 'on
        # disk' and thus no way to actually get the book.
        collection = self._default_collection
        args = (collection, metadata, policy, "cover directory",
                "ebook directory", RightsStatus.CC0)
        script = Mock(self._db)
        eq_(None, script.work_from_metadata(*args))
        eq_(True, metadata.annotated)

        # Now let's try it with some files 'on disk'.
        #
        # The cover is a PNG, so it must be read in binary mode: text
        # mode raises UnicodeDecodeError on Python 3 and may mangle
        # the bytes through newline translation on Windows.
        with open(self.sample_cover_path('test-book-cover.png'), 'rb') as fh:
            image = fh.read()
        mock_filesystem = {
            'cover directory' : (
                'cover.jpg', Representation.JPEG_MEDIA_TYPE, image
            ),
            'ebook directory' : (
                'book.epub', Representation.EPUB_MEDIA_TYPE, "I'm an EPUB."
            )
        }
        script = MockDirectoryImportScript(
            self._db, mock_filesystem=mock_filesystem
        )
        work = script.work_from_metadata(*args)

        # We have created a book. It has a cover image, which has a
        # thumbnail.
        eq_("A book", work.title)
        assert work.cover_full_url.endswith(
            '/test.cover.bucket/Gutenberg/Gutenberg+ID/1003/1003.jpg'
        )
        assert work.cover_thumbnail_url.endswith(
            '/test.cover.bucket/scaled/300/Gutenberg/Gutenberg+ID/1003/1003.png'
        )
        [pool] = work.license_pools
        assert pool.open_access_download_url.endswith(
            '/test.content.bucket/Gutenberg/Gutenberg+ID/1003/A+book.epub'
        )

        eq_(RightsStatus.CC0,
            pool.delivery_mechanisms[0].rights_status.uri)

        # The mock S3Uploader has a record of 'uploading' all these files
        # to S3.
        epub, full, thumbnail = mirror.uploaded
        eq_(epub.url, pool.open_access_download_url)
        eq_(full.url, work.cover_full_url)
        eq_(thumbnail.url, work.cover_thumbnail_url)

        # The EPUB Representation was cleared out after the upload, to
        # save database space.
        eq_("I'm an EPUB.", mirror.content[0])
        eq_(None, epub.content)