Example #1
0
    def test_import_with_unrecognized_distributor_creates_distributor(self):
        """We get a book from the open-access content server but the license
        comes from an unrecognized data source. The book is imported and
        we create a DataSource to record its provenance accurately.
        """
        # Read the fixture via a context manager so the file handle is
        # closed promptly (the previous open(...).read() leaked it).
        path = os.path.join(self.resource_path,
                            "unrecognized_distributor.opds")
        with open(path) as fh:
            feed = fh.read()
        importer = OPDSImporter(self._db,
                                data_source_name=DataSource.OA_CONTENT_SERVER)
        imported_editions, pools, works, failures = (
            importer.import_from_feed(feed))
        # The whole feed imported without errors.
        eq_({}, failures)

        # We imported an Edition because there was metadata.
        [edition] = imported_editions
        new_data_source = edition.data_source
        eq_(DataSource.OA_CONTENT_SERVER, new_data_source.name)

        # We imported a LicensePool because there was an open-access
        # link, even though the ultimate source of the link was one
        # we'd never seen before.
        [pool] = pools
        eq_("Unknown Source", pool.data_source.name)

        # From an Edition and a LicensePool we created a Work.
        eq_(1, len(works))
Example #2
0
    def test_import_with_lendability(self):
        """Editions, pools, and works are created only when appropriate
        for the importer's data source.

        A metadata-wrangler source produces editions but no license
        pools or works; a lendable source (Gutenberg) produces all
        three. The count of failure messages is also checked.
        """
        feed = self.content_server_mini_feed

        # First pass: the metadata wrangler is not a lendable source,
        # so we expect editions only.
        wrangler = OPDSImporter(
            self._db, data_source_name=DataSource.METADATA_WRANGLER)
        editions_mw, pools_mw, works_mw, failures_mw = (
            wrangler.import_from_feed(feed))

        # Both books were new, so both editions were imported.
        eq_(2, len(editions_mw))

        # No pools or works, because the source isn't lendable. The
        # single failure comes from the stub entry at the end of the
        # sample feed -- pool creation was never even attempted, so no
        # messages originate there.
        eq_(1, len(failures_mw))
        eq_(0, len(pools_mw))
        eq_(0, len(works_mw))

        # Second pass: Gutenberg is an acceptable license-pool source.
        gutenberg = OPDSImporter(self._db,
                                 data_source_name=DataSource.GUTENBERG)
        editions_g, pools_g, works_g, failures_g = (
            gutenberg.import_from_feed(feed))

        # Fresh editions were created per data source instead of
        # overwriting the wrangler's editions.
        eq_(2, len(editions_g))
        # TODO: and we also created presentation editions, with author
        # and title set.

        # This time pools and works showed up as well.
        eq_(1, len(failures_g))
        eq_(2, len(pools_g))
        eq_(2, len(works_g))

        # The bibframe data source named inside the feed was overridden
        # by the data source passed into the importer.
        for pool in pools_g:
            eq_(pool.data_source.name, DataSource.GUTENBERG)
Example #3
0
    def test_import_book_that_offers_no_license(self):
        """A feed entry with no license link yields an Edition but no
        LicensePool and no Work.
        """
        path = os.path.join(self.resource_path, "book_without_license.opds")
        # Use a context manager so the fixture file is closed promptly
        # (the previous open(path).read() leaked the handle).
        with open(path) as fh:
            feed = fh.read()
        importer = OPDSImporter(self._db, DataSource.OA_CONTENT_SERVER)
        imported_editions, imported_pools, imported_works, failures = (
            importer.import_from_feed(feed))

        # We got an Edition for this book, but no LicensePool and no Work.
        [edition] = imported_editions
        eq_("Howards End", edition.title)
        eq_([], imported_pools)
        eq_([], imported_works)
Example #4
0
    def test_import_from_license_source(self):
        """Import the mini feed as though it came from the open-access
        content server (rather than the metadata wrangler) and verify
        the data sources recorded on pools and presentation editions.
        """
        feed = self.content_server_mini_feed
        importer = OPDSImporter(
            self._db, data_source_name=DataSource.OA_CONTENT_SERVER
        )

        editions, pools, works, failures = importer.import_from_feed(feed)

        # The content server actually tells you how to get copies of
        # these books, so two Works were created.
        crow, mouse = sorted(works, key=lambda work: work.title)

        # Each work carries exactly one license pool.
        [crow_pool] = crow.license_pools
        [mouse_pool] = mouse.license_pools

        # The license pool's data source is Project Gutenberg -- the
        # authority that grants access to the book.
        eq_(DataSource.GUTENBERG, mouse_pool.data_source.name)

        # Its presentation edition, though, is attributed to the
        # Library Simplified open-access content server, because that
        # is where the metadata came from.
        eq_(DataSource.OA_CONTENT_SERVER,
            mouse_pool.presentation_edition.data_source.name
        )

        # The 'mouse' entry carried an open-access link, so its pool
        # and delivery mechanism were marked as open access.
        eq_(True, mouse_pool.open_access)
        eq_(RightsStatus.GENERIC_OPEN_ACCESS,
            mouse_pool.delivery_mechanisms[0].rights_status.uri)

        # We did not ask for immediate presentation-readiness, so the
        # 'mouse' work is not presentation-ready.
        eq_(False, mouse_pool.work.presentation_ready)

        # The feed never said where the 'crow' book comes from, so the
        # importer's configured data source -- the open-access content
        # server, not Project Gutenberg -- was used for both its Work
        # and its LicensePool.
        eq_(DataSource.OA_CONTENT_SERVER, crow_pool.data_source.name)
Example #5
0
    def test_import_and_make_presentation_ready(self):
        """With immediately_presentation_ready=True, imported works are
        marked presentation-ready as soon as they are created.
        """
        feed = self.content_server_mini_feed
        importer = OPDSImporter(self._db,
                                data_source_name=DataSource.OA_CONTENT_SERVER)
        editions, pools, works, failures = importer.import_from_feed(
            feed, immediately_presentation_ready=True)

        crow, mouse = sorted(works, key=lambda work: work.title)

        # Both the 'crow' and 'mouse' works came out presentation-ready.
        eq_(True, crow.presentation_ready)
        eq_(True, mouse.presentation_ready)
Example #6
0
 def test_import_with_unrecognized_distributor_fails(self):
     """We get a book from the open-access content server but the license
     comes from an unrecognized data source. We can't import the book
     because we can't record its provenance accurately.
     """
     # Read the fixture via a context manager so the file handle is
     # closed promptly (the previous open(...).read() leaked it).
     path = os.path.join(self.resource_path, "unrecognized_distributor.opds")
     with open(path) as fh:
         feed = fh.read()
     importer = OPDSImporter(
         self._db,
         data_source_name=DataSource.OA_CONTENT_SERVER
     )
     imported_editions, pools, works, failures = (
         importer.import_from_feed(feed)
     )
     # No editions, licensepools, or works were imported.
     eq_([], imported_editions)
     eq_([], pools)
     eq_([], works)
     # The one failure is transient and names the unknown distributor.
     [failure] = failures.values()
     eq_(True, failure.transient)
     assert "Unrecognized circulation data source: Unknown Source" in failure.exception
Example #7
0
    def test_resources_are_mirrored_on_import(self):

        svg = """<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN"
  "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">

<svg xmlns="http://www.w3.org/2000/svg" width="1000" height="500">
    <ellipse cx="50" cy="25" rx="50" ry="25" style="fill:blue;"/>
</svg>"""

        http = DummyHTTPClient()
        # The request to http://root/full-cover-image.png
        # will result in a 404 error, and the image will not be mirrored.
        http.queue_response(404, media_type="text/plain")
        http.queue_response(
            200,
            content='I am 10557.epub.images',
            media_type=Representation.EPUB_MEDIA_TYPE,
        )
        http.queue_response(200,
                            content=svg,
                            media_type=Representation.SVG_MEDIA_TYPE)
        http.queue_response(200,
                            content='I am 10441.epub.images',
                            media_type=Representation.EPUB_MEDIA_TYPE)

        s3 = DummyS3Uploader()

        importer = OPDSImporter(self._db,
                                data_source_name=DataSource.OA_CONTENT_SERVER,
                                mirror=s3,
                                http_get=http.do_get)

        imported_editions, pools, works, failures = (importer.import_from_feed(
            self.content_server_mini_feed, feed_url='http://root'))
        e1 = imported_editions[0]
        e2 = imported_editions[1]

        # The import process requested each remote resource in the
        # order they appeared in the OPDS feed. The thumbnail
        # image was not requested, since we were going to make our own
        # thumbnail anyway.
        eq_(http.requests, [
            'http://www.gutenberg.org/ebooks/10441.epub.images',
            'https://s3.amazonaws.com/book-covers.nypl.org/Gutenberg-Illustrated/10441/cover_10441_9.png',
            'http://www.gutenberg.org/ebooks/10557.epub.images',
            'http://root/full-cover-image.png',
        ])

        [e1_oa_link, e1_image_link,
         e1_description_link] = sorted(e1.primary_identifier.links,
                                       key=lambda x: x.rel)
        [e2_image_link, e2_oa_link] = e2.primary_identifier.links

        # The two open-access links were mirrored to S3, as was the
        # original SVG image and its PNG thumbnail. The PNG image was
        # not mirrored because our attempt to download it resulted in
        # a 404 error.
        imported_representations = [
            e1_oa_link.resource.representation,
            e1_image_link.resource.representation,
            e1_image_link.resource.representation.thumbnails[0],
            e2_oa_link.resource.representation,
        ]
        eq_(imported_representations, s3.uploaded)

        eq_(4, len(s3.uploaded))
        eq_("I am 10441.epub.images", s3.content[0])
        eq_(svg, s3.content[1])
        eq_("I am 10557.epub.images", s3.content[3])

        # Each resource was 'mirrored' to an Amazon S3 bucket.
        #
        # The "mouse" book was mirrored to a bucket corresponding to
        # Project Gutenberg, its data source.
        #
        # The images were mirrored to a bucket corresponding to the
        # open-access content server, _their_ data source.
        #
        # The "crow" book was mirrored to a bucket corresponding to
        # the open-access content source, the default data source used
        # when no distributor was specified for a book.
        url0 = 'http://s3.amazonaws.com/test.content.bucket/Gutenberg/Gutenberg%20ID/10441/The%20Green%20Mouse.epub.images'
        url1 = u'http://s3.amazonaws.com/test.cover.bucket/Library%20Simplified%20Open%20Access%20Content%20Server/Gutenberg%20ID/10441/cover_10441_9.png'
        url2 = u'http://s3.amazonaws.com/test.cover.bucket/scaled/300/Library%20Simplified%20Open%20Access%20Content%20Server/Gutenberg%20ID/10441/cover_10441_9.png'
        url3 = 'http://s3.amazonaws.com/test.content.bucket/Library%20Simplified%20Open%20Access%20Content%20Server/Gutenberg%20ID/10557/Johnny%20Crow%27s%20Party.epub.images'
        uploaded_urls = [x.mirror_url for x in s3.uploaded]
        eq_([url0, url1, url2, url3], uploaded_urls)

        # If we fetch the feed again, and the entries have been updated since the
        # cutoff, but the content of the open access links hasn't changed, we won't mirror
        # them again.
        cutoff = datetime.datetime(2013, 1, 2, 16, 56, 40)

        http.queue_response(304, media_type=Representation.EPUB_MEDIA_TYPE)

        http.queue_response(304, media_type=Representation.SVG_MEDIA_TYPE)

        http.queue_response(304, media_type=Representation.EPUB_MEDIA_TYPE)

        imported_editions, pools, works, failures = (importer.import_from_feed(
            self.content_server_mini_feed))

        eq_([e1, e2], imported_editions)

        # Nothing new has been uploaded
        eq_(4, len(s3.uploaded))

        # If the content has changed, it will be mirrored again.
        http.queue_response(200,
                            content="I am a new version of 10557.epub.images",
                            media_type=Representation.EPUB_MEDIA_TYPE)

        http.queue_response(200,
                            content=svg,
                            media_type=Representation.SVG_MEDIA_TYPE)

        http.queue_response(200,
                            content="I am a new version of 10441.epub.images",
                            media_type=Representation.EPUB_MEDIA_TYPE)

        imported_editions, pools, works, failures = (importer.import_from_feed(
            self.content_server_mini_feed))

        eq_([e1, e2], imported_editions)
        eq_(8, len(s3.uploaded))
        eq_("I am a new version of 10441.epub.images", s3.content[4])
        eq_(svg, s3.content[5])
        eq_("I am a new version of 10557.epub.images", s3.content[7])