Example #1
    def test_import_with_unrecognized_distributor_creates_distributor(self):
        """We get a book from the open-access content server but the license
        comes from an unrecognized data source. The book is imported and
        we create a DataSource to record its provenance accurately.
        """
        feed = open(
            os.path.join(self.resource_path,
                         "unrecognized_distributor.opds")).read()
        importer = OPDSImporter(self._db,
                                data_source_name=DataSource.OA_CONTENT_SERVER)
        imported_editions, pools, works, failures = (
            importer.import_from_feed(feed))
        eq_({}, failures)

        # We imported an Edition because there was metadata.
        [edition] = imported_editions
        new_data_source = edition.data_source
        eq_(DataSource.OA_CONTENT_SERVER, new_data_source.name)

        # We imported a LicensePool because there was an open-access
        # link, even though the ultimate source of the link was one
        # we'd never seen before.
        [pool] = pools
        eq_("Unknown Source", pool.data_source.name)

        # From an Edition and a LicensePool we created a Work.
        eq_(1, len(works))
Example #2
    def test_extract_next_links(self):
        importer = OPDSImporter(self._db, DataSource.NYT)
        next_links = importer.extract_next_links(
            self.content_server_mini_feed
        )

        eq_(1, len(next_links))
        eq_("http://localhost:5000/?after=327&size=100", next_links[0])
Example #3
    def test_import_book_that_offers_no_license(self):
        path = os.path.join(self.resource_path, "book_without_license.opds")
        feed = open(path).read()
        importer = OPDSImporter(self._db, DataSource.OA_CONTENT_SERVER)
        imported_editions, imported_pools, imported_works, failures = (
            importer.import_from_feed(feed))

        # We got an Edition for this book, but no LicensePool and no Work.
        [edition] = imported_editions
        eq_("Howards End", edition.title)
        eq_([], imported_pools)
        eq_([], imported_works)
Example #4
    def test_import_from_license_source(self):
        # Instead of importing this data as though it came from the
        # metadata wrangler, let's import it as though it came from the
        # open-access content server.
        feed = self.content_server_mini_feed
        importer = OPDSImporter(
            self._db, data_source_name=DataSource.OA_CONTENT_SERVER
        )

        imported_editions, imported_pools, imported_works, failures = (
            importer.import_from_feed(feed)
        )

        # Two works have been created, because the content server
        # actually tells you how to get copies of these books.
        [crow, mouse] = sorted(imported_works, key=lambda x: x.title)

        # Each work has one license pool.
        [crow_pool] = crow.license_pools
        [mouse_pool] = mouse.license_pools

        # The OPDS importer sets the data source of the license pool
        # to Project Gutenberg, since that's the authority that grants
        # access to the book.
        eq_(DataSource.GUTENBERG, mouse_pool.data_source.name)

        # But the license pool's presentation edition has a data
        # source associated with the Library Simplified open-access
        # content server, since that's where the metadata comes from.
        eq_(DataSource.OA_CONTENT_SERVER,
            mouse_pool.presentation_edition.data_source.name)

        # Since the 'mouse' book came with an open-access link, the license
        # pool delivery mechanism has been marked as open access.
        eq_(True, mouse_pool.open_access)
        eq_(RightsStatus.GENERIC_OPEN_ACCESS, 
            mouse_pool.delivery_mechanisms[0].rights_status.uri)

        # The 'mouse' work has not been marked presentation-ready,
        # because the OPDS importer was not told to make works
        # presentation-ready as they're imported.
        eq_(False, mouse_pool.work.presentation_ready)

        # The OPDS feed didn't actually say where the 'crow' book
        # comes from, but we did tell the importer to use the open access 
        # content server as the data source, so both a Work and a LicensePool 
        # were created, and their data source is the open access content server,
        # not Project Gutenberg.
        eq_(DataSource.OA_CONTENT_SERVER, crow_pool.data_source.name)
Example #5
    def test_extract_link(self):
        no_rel = AtomFeed.E.link(href="http://foo/")
        eq_(None, OPDSImporter.extract_link(no_rel))

        no_href = AtomFeed.E.link(href="", rel="foo")
        eq_(None, OPDSImporter.extract_link(no_href))

        good = AtomFeed.E.link(href="http://foo", rel="bar")
        link = OPDSImporter.extract_link(good)
        eq_("http://foo", link.href)
        eq_("bar", link.rel)

        relative = AtomFeed.E.link(href="/foo/bar", rel="self")
        link = OPDSImporter.extract_link(relative, "http://server")
        eq_("http://server/foo/bar", link.href)
Example #6
    def test_import_and_make_presentation_ready(self):
        # Now let's tell the OPDS importer to make works presentation-ready
        # as soon as they're imported.
        feed = self.content_server_mini_feed
        importer = OPDSImporter(self._db,
                                data_source_name=DataSource.OA_CONTENT_SERVER)
        imported_editions, imported_pools, imported_works, failures = (
            importer.import_from_feed(feed,
                                      immediately_presentation_ready=True))

        [crow, mouse] = sorted(imported_works, key=lambda x: x.title)

        # Both the 'crow' and the 'mouse' book had presentation-ready works created.
        eq_(True, crow.presentation_ready)
        eq_(True, mouse.presentation_ready)
Example #7
    def test_extract_link(self):
        E = builder.ElementMaker()
        no_rel = E.link(href="http://foo/")
        eq_(None, OPDSImporter.extract_link(no_rel))

        no_href = E.link(href="", rel="foo")
        eq_(None, OPDSImporter.extract_link(no_href))

        good = E.link(href="http://foo", rel="bar")
        link = OPDSImporter.extract_link(good)
        eq_("http://foo", link.href)
        eq_("bar", link.rel)

        relative = E.link(href="/foo/bar", rel="self")
        link = OPDSImporter.extract_link(relative, "http://server")
        eq_("http://server/foo/bar", link.href)
Example #8
    def test_extract_link_rights_uri(self):

        # Most of the time, a link's rights URI is inherited from the entry.
        entry_rights = RightsStatus.PUBLIC_DOMAIN_USA

        link_tag = AtomFeed.E.link(href="http://foo", rel="bar")
        link = OPDSImporter.extract_link(link_tag,
                                         entry_rights_uri=entry_rights)
        eq_(RightsStatus.PUBLIC_DOMAIN_USA, link.rights_uri)

        # But a dcterms:rights tag beneath the link can override this.
        rights_attr = "{%s}rights" % AtomFeed.DCTERMS_NS
        link_tag.attrib[rights_attr] = RightsStatus.IN_COPYRIGHT
        link = OPDSImporter.extract_link(link_tag,
                                         entry_rights_uri=entry_rights)
        eq_(RightsStatus.IN_COPYRIGHT, link.rights_uri)
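
The precedence rule being tested reduces to a single attribute lookup: a dcterms:rights attribute on the <link> tag itself wins over the rights URI inherited from the enclosing <entry>. A hedged sketch:

DCTERMS_NS = 'http://purl.org/dc/terms/'

def link_rights_uri_sketch(link_tag, entry_rights_uri=None):
    # The link's own dcterms:rights attribute overrides the entry-level URI.
    return link_tag.attrib.get('{%s}rights' % DCTERMS_NS, entry_rights_uri)
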
Example #9
    def test_combine_present_value_extends_list(self):
        """When both dictionaries define a list, the combined value
        is a combined list.
        """
        a_is_true = dict(a=[True])
        a_is_false = dict(a=[False])
        eq_(dict(a=[True, False]), OPDSImporter.combine(a_is_true, a_is_false))
Example #10
    def test_import_updates_metadata(self):

        path = os.path.join(self.resource_path,
                            "metadata_wrangler_overdrive.opds")
        feed = open(path).read()

        edition, is_new = self._edition(DataSource.OVERDRIVE,
                                        Identifier.OVERDRIVE_ID,
                                        with_license_pool=True)
        edition.license_pool.calculate_work()
        work = edition.license_pool.work

        old_license_pool = edition.license_pool
        feed = feed.replace("{OVERDRIVE ID}",
                            edition.primary_identifier.identifier)

        imported_editions, imported_pools, imported_works, failures = (
            OPDSImporter(
                self._db,
                data_source_name=DataSource.OVERDRIVE).import_from_feed(feed))

        # The edition we created has had its metadata updated.
        eq_(imported_editions[0], edition)
        eq_("The Green Mouse", imported_editions[0].title)

        # But the license pools have not changed.
        eq_(edition.license_pool, old_license_pool)
        eq_(work.license_pools, [old_license_pool])
Example #11
    def test_combine(self):
        """An overall test that duplicates a lot of functionality
        in the more specific tests.
        """
        d1 = dict(a_list=[1],
                  a_scalar="old value",
                  a_dict=dict(key1=None, key2=[2], key3="value3"))

        d2 = dict(a_list=[2],
                  a_scalar="new value",
                  a_dict=dict(key1="finally a value",
                              key4="value4",
                              key2=[200]))

        combined = OPDSImporter.combine(d1, d2)

        # Dictionaries get combined recursively.
        d = combined['a_dict']

        # Normal scalar values can be overridden once set.
        eq_("new value", combined['a_scalar'])

        # Missing values are filled in.
        eq_('finally a value', d["key1"])
        eq_('value3', d['key3'])
        eq_('value4', d['key4'])

        # Lists get extended.
        eq_([1, 2], combined['a_list'])
        eq_([2, 200], d['key2'])
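
Taken together, the combine() tests pin down a small merging algorithm: lists are concatenated, nested dictionaries are merged recursively, None never overwrites a present value, and any other scalar is replaced. A minimal sketch consistent with those assertions, not the library's actual code:

def combine_sketch(d1, d2):
    # Start from a shallow copy of d1 and merge d2 into it.
    result = dict(d1)
    for key, new_value in d2.items():
        old_value = result.get(key)
        if isinstance(old_value, list) and isinstance(new_value, list):
            # Lists get extended.
            result[key] = old_value + new_value
        elif isinstance(old_value, dict) and isinstance(new_value, dict):
            # Dictionaries get combined recursively.
            result[key] = combine_sketch(old_value, new_value)
        elif new_value is not None:
            # Missing values are filled in, scalars are overridden,
            # but a present value is never replaced with None.
            result[key] = new_value
    return result
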
Example #12
    def test_consolidate_links(self):

        # If a link turns out to be a dud, consolidate_links()
        # gets rid of it.
        links = [None, None]
        eq_([], OPDSImporter.consolidate_links(links))

        links = [
            LinkData(href=self._url, rel=rel, media_type="image/jpeg")
            for rel in [
                Hyperlink.OPEN_ACCESS_DOWNLOAD, Hyperlink.IMAGE,
                Hyperlink.THUMBNAIL_IMAGE, Hyperlink.OPEN_ACCESS_DOWNLOAD
            ]
        ]
        old_link = links[2]
        links = OPDSImporter.consolidate_links(links)
        eq_([
            Hyperlink.OPEN_ACCESS_DOWNLOAD, Hyperlink.IMAGE,
            Hyperlink.OPEN_ACCESS_DOWNLOAD
        ], [x.rel for x in links])
        link = links[1]
        eq_(old_link, link.thumbnail)

        links = [
            LinkData(href=self._url, rel=rel, media_type="image/jpeg")
            for rel in [
                Hyperlink.THUMBNAIL_IMAGE, Hyperlink.IMAGE,
                Hyperlink.THUMBNAIL_IMAGE, Hyperlink.IMAGE
            ]
        ]
        t1, i1, t2, i2 = links
        links = OPDSImporter.consolidate_links(links)
        eq_([Hyperlink.IMAGE, Hyperlink.IMAGE], [x.rel for x in links])
        eq_(t1, i1.thumbnail)
        eq_(t2, i2.thumbnail)

        links = [
            LinkData(href=self._url, rel=rel, media_type="image/jpeg") for rel
            in [Hyperlink.THUMBNAIL_IMAGE, Hyperlink.IMAGE, Hyperlink.IMAGE]
        ]
        t1, i1, i2 = links
        links = OPDSImporter.consolidate_links(links)
        eq_([Hyperlink.IMAGE, Hyperlink.IMAGE], [x.rel for x in links])
        eq_(t1, i1.thumbnail)
        eq_(None, i2.thumbnail)
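
The pairing rule asserted above can be sketched compactly: dud (None) links are dropped, then each thumbnail link is attached to an adjacent image link (preferring the image that follows it, otherwise the one before it) and removed from the list. LinkSketch is a hypothetical stand-in for LinkData, and the rel URIs are the standard OPDS ones:

IMAGE = "http://opds-spec.org/image"
THUMBNAIL = "http://opds-spec.org/image/thumbnail"

class LinkSketch(object):
    def __init__(self, rel):
        self.rel = rel
        self.thumbnail = None

def consolidate_links_sketch(links):
    # Get rid of dud links first.
    links = [x for x in links if x is not None]
    consolidated = []
    for i, link in enumerate(links):
        if link.rel == THUMBNAIL:
            if i + 1 < len(links) and links[i + 1].rel == IMAGE:
                # Attach the thumbnail to the image that follows it.
                links[i + 1].thumbnail = link
                continue
            if consolidated and consolidated[-1].rel == IMAGE:
                # Otherwise attach it to the image just before it.
                consolidated[-1].thumbnail = link
                continue
        consolidated.append(link)
    return consolidated
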
Example #13
    def test_combine_present_value_not_replaced_with_none(self):
        """When a dictionary with a key present is combined with a
        dictionary where that key is set to None, the value is left
        alone.
        """
        a_is_present = dict(a=True)
        a_is_none = dict(a=None, b=True)
        expect = dict(a=True, b=True)
        eq_(expect, OPDSImporter.combine(a_is_present, a_is_none))
Example #14
    def test_combine_present_value_extends_dictionary(self):
        """When both dictionaries define a dictionary, the combined value is
        the result of combining the two dictionaries with a recursive
        combine() call.
        """
        a_is_true = dict(a=dict(b=[True]))
        a_is_false = dict(a=dict(b=[False]))
        eq_(dict(a=dict(b=[True, False])),
            OPDSImporter.combine(a_is_true, a_is_false))
Example #15
    def test_data_source_autocreated(self):
        name = "New data source " + self._str
        importer = OPDSImporter(self._db, name)
        source1 = importer.data_source
        eq_(name, source1.name)

        # By default, DataSources created through this mechanism do
        # not offer licenses.
        eq_(False, source1.offers_licenses)

        # But we can create a DataSource that does offer licenses.
        name = "New data source " + self._str
        importer = OPDSImporter(self._db,
                                name,
                                data_source_offers_licenses=True)
        source2 = importer.data_source
        eq_(name, source2.name)
        eq_(True, source2.offers_licenses)
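
The autocreation this test exercises boils down to a get-or-create lookup keyed on the data source name. A hedged sketch of the idea, using a hypothetical SQLAlchemy query rather than the real DataSource.lookup internals:

def autocreate_data_source_sketch(_db, name, offers_licenses=False):
    # Reuse an existing DataSource with this name if there is one...
    source = _db.query(DataSource).filter_by(name=name).first()
    if source is None:
        # ...otherwise create it; by default it does not offer licenses.
        source = DataSource(name=name, offers_licenses=offers_licenses)
        _db.add(source)
    return source
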
Example #16
    def test_import_from_feed_treats_message_as_failure(self):
        path = os.path.join(self.resource_path, "unrecognized_identifier.opds")
        feed = open(path).read()
        imported_editions, imported_pools, imported_works, failures = (
            OPDSImporter(self._db).import_from_feed(feed))

        [failure] = failures.values()
        assert isinstance(failure, CoverageFailure)
        eq_(True, failure.transient)
        eq_("404: I've never heard of this work.", failure.exception)
Example #17
    def test_extract_messages(self):
        parser = OPDSXMLParser()
        feed = open(
            os.path.join(self.resource_path,
                         "unrecognized_identifier.opds")).read()
        root = etree.parse(StringIO(feed))
        [message] = OPDSImporter.extract_messages(parser, root)
        eq_('urn:librarysimplified.org/terms/id/Gutenberg ID/100', message.urn)
        eq_(404, message.status_code)
        eq_("I've never heard of this work.", message.message)
Example #18
    def test_extract_last_update_dates(self):
        importer = OPDSImporter(self._db, DataSource.NYT)

        # This file has two <entry> tags and one <simplified:message> tag.
        # The <entry> tags have their last update dates extracted,
        # the message is ignored.
        last_update_dates = importer.extract_last_update_dates(
            self.content_server_mini_feed
        )

        eq_(2, len(last_update_dates))

        identifier1, updated1 = last_update_dates[0]
        identifier2, updated2 = last_update_dates[1]

        eq_("urn:librarysimplified.org/terms/id/Gutenberg%20ID/10441", identifier1)
        eq_(datetime.datetime(2015, 1, 2, 16, 56, 40), updated1)

        eq_("urn:librarysimplified.org/terms/id/Gutenberg%20ID/10557", identifier2)
        eq_(datetime.datetime(2015, 1, 2, 16, 56, 40), updated2)
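
A sketch of how (identifier, updated) pairs like these can be read from the <entry> tags with feedparser; the <simplified:message> tag is skipped implicitly, since feedparser only yields entries. Illustrative only, not the importer's real implementation:

import datetime
import feedparser

def extract_last_update_dates_sketch(feed):
    parsed = feedparser.parse(feed)
    # Each <entry> contributes its <id> and its parsed <updated> timestamp.
    return [(entry.get('id'),
             datetime.datetime(*entry['updated_parsed'][:6]))
            for entry in parsed['entries']]
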
Example #19
    def test_import_with_unrecognized_distributor_fails(self):
        """We get a book from the open-access content server but the license
        comes from an unrecognized data source. We can't import the book
        because we can't record its provenance accurately.
        """
        feed = open(
            os.path.join(self.resource_path,
                         "unrecognized_distributor.opds")).read()
        importer = OPDSImporter(
            self._db,
            data_source_name=DataSource.OA_CONTENT_SERVER
        )
        imported_editions, pools, works, failures = (
            importer.import_from_feed(feed)
        )
        # No editions, license pools, or works were imported.
        eq_([], imported_editions)
        eq_([], pools)
        eq_([], works)
        [failure] = failures.values()
        eq_(True, failure.transient)
        assert "Unrecognized circulation data source: Unknown Source" in failure.exception
Example #20
    def test_extract_metadata(self):
        importer = OPDSImporter(self._db, DataSource.NYT)
        metadata, failures = importer.extract_feed_data(
            self.content_server_mini_feed
        )

        m1 = metadata['http://www.gutenberg.org/ebooks/10441']
        m2 = metadata['http://www.gutenberg.org/ebooks/10557']
        c1 = metadata['http://www.gutenberg.org/ebooks/10441']
        c2 = metadata['http://www.gutenberg.org/ebooks/10557']

        eq_("The Green Mouse", m1.title)
        eq_("A Tale of Mousy Terror", m1.subtitle)

        eq_(DataSource.NYT, m1._data_source)
        eq_(DataSource.NYT, m2._data_source)
        eq_(DataSource.NYT, c1._data_source)
        eq_(DataSource.NYT, c2._data_source)

        [failure] = failures.values()
        eq_(u"202: I'm working to locate a source for this identifier.", failure.exception)
Example #21
    def test_extract_metadata_from_elementtree_treats_message_as_failure(self):
        data_source = DataSource.lookup(self._db, DataSource.OA_CONTENT_SERVER)

        feed = open(
            os.path.join(self.resource_path,
                         "unrecognized_identifier.opds")).read()
        values, failures = OPDSImporter.extract_metadata_from_elementtree(
            feed, data_source)

        # We have no Metadata objects and one CoverageFailure.
        eq_({}, values)

        # The CoverageFailure contains the information that was in a
        # <simplified:message> tag in unrecognized_identifier.opds.
        key = 'http://www.gutenberg.org/ebooks/100'
        eq_([key], failures.keys())
        failure = failures[key]
        eq_("404: I've never heard of this work.", failure.exception)
        eq_(key, failure.obj.urn)
Example #22
    def test_extract_data_from_feedparser(self):

        data_source = DataSource.lookup(self._db, DataSource.OA_CONTENT_SERVER)
        values, failures = OPDSImporter.extract_data_from_feedparser(
            self.content_server_mini_feed, data_source)

        # The <entry> tag became a Metadata object.
        metadata = values[
            'urn:librarysimplified.org/terms/id/Gutenberg%20ID/10441']
        eq_("The Green Mouse", metadata['title'])
        eq_("A Tale of Mousy Terror", metadata['subtitle'])
        eq_('en', metadata['language'])
        eq_('Project Gutenberg', metadata['publisher'])

        circulation = metadata['circulation']
        eq_(DataSource.GUTENBERG, circulation['data_source'])

        # The <simplified:message> tag did not become a
        # CoverageFailure -- that's handled by
        # extract_metadata_from_elementtree.
        eq_({}, failures)
Example #23
    def test_import_with_lendability(self):
        # Tests that Edition, LicensePool, and Work objects are created when
        # appropriate. For example, with a metadata wrangler data source it is
        # only appropriate to create editions, but not pools or works. With a
        # lendable data source, pools and works should be created as well as
        # editions. Also tests that the number and contents of error messages
        # are appropriate to the task.

        # This import will create editions, but not license pools or works,
        # because the metadata wrangler data source is not lendable.
        feed = self.content_server_mini_feed

        importer_mw = OPDSImporter(
            self._db, data_source_name=DataSource.METADATA_WRANGLER)
        imported_editions_mw, pools_mw, works_mw, failures_mw = (
            importer_mw.import_from_feed(feed))

        # Both books were imported, because they were new.
        eq_(2, len(imported_editions_mw))

        # But pools and works weren't created, because the data source isn't
        # lendable. There is one error message: we correctly never got as far
        # as trying to create pools, but the entry stub at the end of the
        # sample XML file fails with a message.
        eq_(1, len(failures_mw))
        eq_(0, len(pools_mw))
        eq_(0, len(works_mw))

        # Try again with a data source that can hold license pools.
        importer_g = OPDSImporter(self._db,
                                  data_source_name=DataSource.GUTENBERG)
        imported_editions_g, pools_g, works_g, failures_g = (
            importer_g.import_from_feed(feed))

        # We made new editions, because we now create one edition per data
        # source rather than overwriting.
        eq_(2, len(imported_editions_g))
        # TODO: and we also created presentation editions, with author and title set

        # Now pools and works are created, too.
        eq_(1, len(failures_g))
        eq_(2, len(pools_g))
        eq_(2, len(works_g))

        # Assert that the bibframe data source from the feed was correctly
        # overwritten with the data source passed into the importer.
        for pool in pools_g:
            eq_(pool.data_source.name, DataSource.GUTENBERG)
Example #24
    def f(*args):
        message = OPDSMessage(*args)
        return OPDSImporter.coveragefailure_from_message(
            data_source, message)
Example #25
    def test_import(self):
        feed = self.content_server_mini_feed

        imported_editions, pools, works, failures = (
            OPDSImporter(self._db).import_from_feed(feed))

        [crow, mouse] = sorted(imported_editions, key=lambda x: x.title)

        # By default, this feed is treated as though it came from the
        # metadata wrangler. No Work has been created.
        eq_(DataSource.METADATA_WRANGLER, crow.data_source.name)
        eq_(None, crow.work)
        eq_(None, crow.license_pool)
        eq_(Edition.BOOK_MEDIUM, crow.medium)

        # not even the 'mouse'
        eq_(None, mouse.work)
        eq_(Edition.PERIODICAL_MEDIUM, mouse.medium)

        popularity, quality, rating = sorted(
            [x for x in mouse.primary_identifier.measurements
             if x.is_most_recent],
            key=lambda x: x.quantity_measured)

        eq_(DataSource.METADATA_WRANGLER, popularity.data_source.name)
        eq_(Measurement.POPULARITY, popularity.quantity_measured)
        eq_(0.25, popularity.value)

        eq_(DataSource.METADATA_WRANGLER, quality.data_source.name)
        eq_(Measurement.QUALITY, quality.quantity_measured)
        eq_(0.3333, quality.value)

        eq_(DataSource.METADATA_WRANGLER, rating.data_source.name)
        eq_(Measurement.RATING, rating.quantity_measured)
        eq_(0.6, rating.value)

        seven, children, courtship, fantasy, pz, magic, new_york = sorted(
            mouse.primary_identifier.classifications,
            key=lambda x: x.subject.name)

        pz_s = pz.subject
        eq_("Juvenile Fiction", pz_s.name)
        eq_("PZ", pz_s.identifier)

        new_york_s = new_york.subject
        eq_("New York (N.Y.) -- Fiction", new_york_s.name)
        eq_("sh2008108377", new_york_s.identifier)

        eq_('7', seven.subject.identifier)
        eq_(100, seven.weight)
        eq_(Subject.AGE_RANGE, seven.subject.type)
        from classifier import Classifier
        classifier = Classifier.classifiers.get(seven.subject.type, None)
        classifier.classify(seven.subject)

        # If we import the same file again, we get the same list of Editions.
        imported_editions_2, pools_2, works_2, failures_2 = (
            OPDSImporter(self._db).import_from_feed(feed))
        eq_(imported_editions_2, imported_editions)

        # Importing with a lendable data source creates license pools and works.
        imported_editions, pools, works, failures = (
            OPDSImporter(
                self._db,
                data_source_name=DataSource.OA_CONTENT_SERVER
            ).import_from_feed(feed))

        [crow_pool, mouse_pool] = sorted(
            pools, key=lambda x: x.presentation_edition.title)

        # Work was created for both books.
        assert crow_pool.work is not None
        eq_(Edition.BOOK_MEDIUM, crow_pool.presentation_edition.medium)

        assert mouse_pool.work is not None
        eq_(Edition.PERIODICAL_MEDIUM, mouse_pool.presentation_edition.medium)

        work = mouse_pool.work
        work.calculate_presentation()
        eq_(0.4142, round(work.quality, 4))
        eq_(Classifier.AUDIENCE_CHILDREN, work.audience)
        eq_(NumericRange(7, 7, '[]'), work.target_age)

        # Bonus: make sure that delivery mechanisms are set appropriately.
        [mech] = mouse_pool.delivery_mechanisms
        eq_(Representation.EPUB_MEDIA_TYPE,
            mech.delivery_mechanism.content_type)
        eq_(DeliveryMechanism.NO_DRM, mech.delivery_mechanism.drm_scheme)
        eq_('http://www.gutenberg.org/ebooks/10441.epub.images',
            mech.resource.url)
Example #26
    def test_extract_metadata_from_elementtree(self):

        data_source = DataSource.lookup(self._db, DataSource.OA_CONTENT_SERVER)

        data, failures = OPDSImporter.extract_metadata_from_elementtree(
            self.content_server_feed, data_source)

        # There are 76 entries in the feed, and we got metadata for
        # every one of them.
        eq_(76, len(data))
        eq_(0, len(failures))

        # We're going to do spot checks on a book and a periodical.

        # First, the book.
        book_id = 'urn:librarysimplified.org/terms/id/Gutenberg%20ID/1022'
        book = data[book_id]
        eq_(Edition.BOOK_MEDIUM, book['medium'])

        [contributor] = book['contributors']
        eq_("Thoreau, Henry David", contributor.sort_name)
        eq_([Contributor.AUTHOR_ROLE], contributor.roles)

        subjects = book['subjects']
        eq_(['LCSH', 'LCSH', 'LCSH', 'LCC'], [x.type for x in subjects])
        eq_(['Essays', 'Nature', 'Walking', 'PS'],
            [x.identifier for x in subjects])
        eq_([None, None, None, 'American Literature'],
            [x.name for x in book['subjects']])
        eq_([1, 1, 1, 10], [x.weight for x in book['subjects']])

        eq_([], book['measurements'])

        [link] = book['links']
        eq_(Hyperlink.OPEN_ACCESS_DOWNLOAD, link.rel)
        eq_("http://www.gutenberg.org/ebooks/1022.epub.noimages", link.href)
        eq_(Representation.EPUB_MEDIA_TYPE, link.media_type)

        # And now, the periodical.
        periodical_id = 'urn:librarysimplified.org/terms/id/Gutenberg%20ID/10441'
        periodical = data[periodical_id]
        eq_(Edition.PERIODICAL_MEDIUM, periodical['medium'])

        subjects = periodical['subjects']
        eq_([
            'LCSH', 'LCSH', 'LCSH', 'LCSH', 'LCC', 'schema:audience',
            'schema:typicalAgeRange'
        ], [x.type for x in subjects])
        eq_([
            'Courtship -- Fiction', 'New York (N.Y.) -- Fiction',
            'Fantasy fiction', 'Magic -- Fiction', 'PZ', 'Children', '7'
        ], [x.identifier for x in subjects])
        eq_([1, 1, 1, 1, 1, 100, 100], [x.weight for x in subjects])

        r1, r2, r3 = periodical['measurements']

        eq_(Measurement.QUALITY, r1.quantity_measured)
        eq_(0.3333, r1.value)
        eq_(1, r1.weight)

        eq_(Measurement.RATING, r2.quantity_measured)
        eq_(0.6, r2.value)
        eq_(1, r2.weight)

        eq_(Measurement.POPULARITY, r3.quantity_measured)
        eq_(0.25, r3.value)
        eq_(1, r3.weight)
Example #27
    def test_resources_are_mirrored_on_import(self):

        svg = """<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN"
  "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">

<svg xmlns="http://www.w3.org/2000/svg" width="1000" height="500">
    <ellipse cx="50" cy="25" rx="50" ry="25" style="fill:blue;"/>
</svg>"""

        http = DummyHTTPClient()
        # The request to http://root/full-cover-image.png
        # will result in a 404 error, and the image will not be mirrored.
        http.queue_response(404, media_type="text/plain")
        http.queue_response(
            200,
            content='I am 10557.epub.images',
            media_type=Representation.EPUB_MEDIA_TYPE,
        )
        http.queue_response(200,
                            content=svg,
                            media_type=Representation.SVG_MEDIA_TYPE)
        http.queue_response(200,
                            content='I am 10441.epub.images',
                            media_type=Representation.EPUB_MEDIA_TYPE)

        s3 = DummyS3Uploader()

        importer = OPDSImporter(self._db,
                                data_source_name=DataSource.OA_CONTENT_SERVER,
                                mirror=s3,
                                http_get=http.do_get)

        imported_editions, pools, works, failures = (
            importer.import_from_feed(
                self.content_server_mini_feed, feed_url='http://root'))
        e1 = imported_editions[0]
        e2 = imported_editions[1]

        # The import process requested each remote resource in the
        # order they appeared in the OPDS feed. The thumbnail
        # image was not requested, since we were going to make our own
        # thumbnail anyway.
        eq_(http.requests, [
            'http://www.gutenberg.org/ebooks/10441.epub.images',
            'https://s3.amazonaws.com/book-covers.nypl.org/Gutenberg-Illustrated/10441/cover_10441_9.png',
            'http://www.gutenberg.org/ebooks/10557.epub.images',
            'http://root/full-cover-image.png',
        ])

        [e1_oa_link, e1_image_link,
         e1_description_link] = sorted(e1.primary_identifier.links,
                                       key=lambda x: x.rel)
        [e2_image_link, e2_oa_link] = e2.primary_identifier.links

        # The two open-access links were mirrored to S3, as was the
        # original SVG image and its PNG thumbnail. The PNG image was
        # not mirrored because our attempt to download it resulted in
        # a 404 error.
        imported_representations = [
            e1_oa_link.resource.representation,
            e1_image_link.resource.representation,
            e1_image_link.resource.representation.thumbnails[0],
            e2_oa_link.resource.representation,
        ]
        eq_(imported_representations, s3.uploaded)

        eq_(4, len(s3.uploaded))
        eq_("I am 10441.epub.images", s3.content[0])
        eq_(svg, s3.content[1])
        eq_("I am 10557.epub.images", s3.content[3])

        # Each resource was 'mirrored' to an Amazon S3 bucket.
        #
        # The "mouse" book was mirrored to a bucket corresponding to
        # Project Gutenberg, its data source.
        #
        # The images were mirrored to a bucket corresponding to the
        # open-access content server, _their_ data source.
        #
        # The "crow" book was mirrored to a bucket corresponding to
        # the open-access content source, the default data source used
        # when no distributor was specified for a book.
        url0 = 'http://s3.amazonaws.com/test.content.bucket/Gutenberg/Gutenberg%20ID/10441/The%20Green%20Mouse.epub.images'
        url1 = u'http://s3.amazonaws.com/test.cover.bucket/Library%20Simplified%20Open%20Access%20Content%20Server/Gutenberg%20ID/10441/cover_10441_9.png'
        url2 = u'http://s3.amazonaws.com/test.cover.bucket/scaled/300/Library%20Simplified%20Open%20Access%20Content%20Server/Gutenberg%20ID/10441/cover_10441_9.png'
        url3 = 'http://s3.amazonaws.com/test.content.bucket/Library%20Simplified%20Open%20Access%20Content%20Server/Gutenberg%20ID/10557/Johnny%20Crow%27s%20Party.epub.images'
        uploaded_urls = [x.mirror_url for x in s3.uploaded]
        eq_([url0, url1, url2, url3], uploaded_urls)

        # If we fetch the feed again, and the entries have been updated since the
        # cutoff, but the content of the open access links hasn't changed, we won't mirror
        # them again.
        cutoff = datetime.datetime(2013, 1, 2, 16, 56, 40)

        http.queue_response(304, media_type=Representation.EPUB_MEDIA_TYPE)
        http.queue_response(304, media_type=Representation.SVG_MEDIA_TYPE)
        http.queue_response(304, media_type=Representation.EPUB_MEDIA_TYPE)

        imported_editions, pools, works, failures = (
            importer.import_from_feed(self.content_server_mini_feed))

        eq_([e1, e2], imported_editions)

        # Nothing new has been uploaded
        eq_(4, len(s3.uploaded))

        # If the content has changed, it will be mirrored again.
        http.queue_response(200,
                            content="I am a new version of 10557.epub.images",
                            media_type=Representation.EPUB_MEDIA_TYPE)

        http.queue_response(200,
                            content=svg,
                            media_type=Representation.SVG_MEDIA_TYPE)

        http.queue_response(200,
                            content="I am a new version of 10441.epub.images",
                            media_type=Representation.EPUB_MEDIA_TYPE)

        imported_editions, pools, works, failures = (
            importer.import_from_feed(self.content_server_mini_feed))

        eq_([e1, e2], imported_editions)
        eq_(8, len(s3.uploaded))
        eq_("I am a new version of 10441.epub.images", s3.content[4])
        eq_(svg, s3.content[5])
        eq_("I am a new version of 10557.epub.images", s3.content[7])
Example #28
    def test_import_exception_if_unable_to_parse_feed(self):
        feed = "I am not a feed."
        importer = OPDSImporter(self._db)

        assert_raises(etree.XMLSyntaxError, importer.import_from_feed, feed)
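
For reference, the failure mode asserted here comes straight from lxml: feeding non-XML text to the parser raises etree.XMLSyntaxError before the importer does any real work. A small standalone sketch:

from StringIO import StringIO  # io.StringIO on Python 3
from lxml import etree

try:
    etree.parse(StringIO("I am not a feed."))
except etree.XMLSyntaxError as error:
    # lxml reports something like "Start tag expected, '<' not found".
    print("parse failed: %s" % error)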