def add_with_metadata(self, collection_details):
        """Add identifiers, along with their metadata, to a Collection's catalog.

        The request body is parsed with ``feedparser`` and every entry is
        keyed by its ``id``, which is then handed to ``Identifier.parse_urns``.
        Each URN that parses is cataloged in the collection (if it is not
        there already) and given a minimal Edition holding the entry's
        title, author, language and cover links.

        :param collection_details: Passed to ``collection_from_details``
            together with the authenticated client to find the Collection.
        :return: The ProblemDetail from authentication if the client could
            not be authenticated; otherwise the ``feed_response`` of an
            AcquisitionFeed whose precomposed entries are one OPDSMessage
            per submitted URN.
        """
        client = authenticated_client_from_request(self._db)
        if isinstance(client, ProblemDetail):
            return client

        collection = collection_from_details(
            self._db, client, collection_details
        )

        data_source = DataSource.lookup(
            self._db, collection.name, autocreate=True
        )

        feed = feedparser.parse(request.data)
        entries_by_urn = {
            entry.get('id'): entry for entry in feed.get("entries", [])
        }

        identifiers_by_urn, invalid_urns = Identifier.parse_urns(
            self._db, entries_by_urn.keys()
        )

        # URNs that could not be parsed are reported back, not cataloged.
        messages = [
            OPDSMessage(urn, INVALID_URN.status_code, INVALID_URN.detail)
            for urn in invalid_urns
        ]

        # None of the following depends on the individual entry, so build
        # it once instead of once per identifier.
        image_types = {Hyperlink.IMAGE, Hyperlink.THUMBNAIL_IMAGE}
        presentation = PresentationCalculationPolicy(
            choose_edition=False,
            set_edition_metadata=False,
            classify=False,
            choose_summary=False,
            calculate_quality=False,
            choose_cover=False,
            regenerate_opds_entries=False,
        )
        replace = ReplacementPolicy(presentation_calculation_policy=presentation)

        for urn, identifier in identifiers_by_urn.items():
            entry = entries_by_urn[urn]

            if identifier in collection.catalog:
                status = HTTP_OK
                description = "Already in catalog"
            else:
                collection.catalog_identifier(identifier)
                status = HTTP_CREATED
                description = "Successfully added"

            # Get a cover if it exists.
            links = [
                LinkData(link.get("rel"), link.get("href"))
                for link in entry.get("links", [])
                if link.get("rel") in image_types
            ]

            # Create an edition to hold the title and author. LicensePool.calculate_work
            # refuses to create a Work when there's no title, and if we have a title, author
            # and language we can attempt to look up the edition in OCLC.
            title = entry.get("title") or "Unknown Title"
            author = ContributorData(
                sort_name=(entry.get("author") or Edition.UNKNOWN_AUTHOR),
                roles=[Contributor.PRIMARY_AUTHOR_ROLE]
            )
            language = entry.get("dcterms_language")

            metadata = Metadata(
                data_source,
                primary_identifier=IdentifierData(identifier.type, identifier.identifier),
                title=title,
                language=language,
                contributors=[author],
                links=links,
            )

            edition, ignore = metadata.edition(self._db)
            metadata.apply(edition, collection, replace=replace)

            # Only report on the URN once its metadata has been applied.
            messages.append(OPDSMessage(urn, status, description))

        title = "%s Catalog Item Additions for %s" % (collection.protocol, client.url)
        url = self.collection_feed_url("add_with_metadata", collection)
        addition_feed = AcquisitionFeed(
            self._db, title, url, [], VerboseAnnotator,
            precomposed_entries=messages
        )

        return feed_response(addition_feed)
Exemple #2
0
    def test_open_access_content_mirrored(self):
        """Open-access links get mirrored to S3; commercial links do not.

        Also verifies the division of labour between the two metadata
        classes: applying ``Metadata`` must not upload anything, while
        applying ``CirculationData`` uploads exactly the open-access link.
        """
        # Note: Mirroring tests passing does not guarantee that all code now
        # correctly calls on CirculationData, as well as Metadata.  This is a risk.

        mirror_type = ExternalIntegrationLink.OPEN_ACCESS_BOOKS
        mirrors = dict(books_mirror=MockS3Uploader(), covers_mirror=None)
        # The mock uploader records what it was asked to upload.
        uploader = mirrors[mirror_type]

        # Here's a book.
        edition, pool = self._edition(with_license_pool=True)

        # Here's a link to the content of the book, which will be mirrored.
        link_mirrored = LinkData(
            rel=Hyperlink.OPEN_ACCESS_DOWNLOAD,
            href="http://example.com/",
            media_type=Representation.EPUB_MEDIA_TYPE,
            content="i am a tiny book",
        )

        # This link will not be mirrored.
        link_unmirrored = LinkData(
            rel=Hyperlink.DRM_ENCRYPTED_DOWNLOAD,
            href="http://example.com/2",
            media_type=Representation.EPUB_MEDIA_TYPE,
            content="i am a pricy book",
        )

        # Apply the metadata.
        policy = ReplacementPolicy(mirrors=mirrors)

        metadata = Metadata(
            data_source=edition.data_source,
            links=[link_mirrored, link_unmirrored],
        )
        metadata.apply(edition, pool.collection, replace=policy)
        # make sure the refactor is done right, and metadata does not upload
        assert len(uploader.uploaded) == 0

        circulation_data = CirculationData(
            data_source=edition.data_source,
            primary_identifier=edition.primary_identifier,
            links=[link_mirrored, link_unmirrored],
        )
        circulation_data.apply(self._db, pool.collection, replace=policy)

        # make sure the refactor is done right, and circulation does upload
        assert len(uploader.uploaded) == 1

        # Only the open-access link has been 'mirrored'.
        [book] = uploader.uploaded

        # It's remained an open-access link.
        assert [x.rel for x in book.resource.links] == [
            Hyperlink.OPEN_ACCESS_DOWNLOAD
        ]

        # It's been 'mirrored' to the appropriate S3 bucket.
        assert book.mirror_url.startswith(
            "https://test-content-bucket.s3.amazonaws.com/")
        expect = "/%s/%s.epub" % (edition.primary_identifier.identifier,
                                  edition.title)
        assert book.mirror_url.endswith(expect)

        # make sure the mirrored link is safely on edition.
        # Sorting by rel puts the unmirrored link's representation first,
        # which is what the unpacking below relies on.
        sorted_edition_links = sorted(pool.identifier.links,
                                      key=lambda x: x.rel)
        unmirrored_representation, mirrored_representation = [
            edlink.resource.representation for edlink in sorted_edition_links
        ]
        assert mirrored_representation.mirror_url.startswith(
            "https://test-content-bucket.s3.amazonaws.com/")

        # make sure the unmirrored link is safely on edition
        assert unmirrored_representation.url == "http://example.com/2"
        # make sure the unmirrored link has not been translated to an S3 URL
        assert unmirrored_representation.mirror_url is None
    def change_book_cover(self, identifier_type, identifier, mirrors=None):
        """Save a new book cover based on the submitted form.

        The image's rights information is read from the ``rights_status``
        and ``rights_explanation`` form fields. The image itself comes from
        ``self.generate_cover_image``, is re-encoded as PNG, and is applied
        to the work's presentation edition as a ``Hyperlink.IMAGE`` link.

        :param identifier_type: Identifier type used to load the work and
            its collection.
        :param identifier: Identifier used to load the work and its
            collection.
        :param mirrors: Optional dict of mirrors to use. When falsy, a
            covers mirror is looked up for the collection with
            ``MirrorUploader.for_collection``.
        :return: A ProblemDetail if the work, collection or image cannot be
            loaded, if no rights status was submitted, or if no covers
            mirror is available; otherwise a 200 ``Response``.
        """
        # PNG output is binary. io.BytesIO accepts it on both Python 2 and
        # Python 3, whereas a text StringIO rejects bytes on Python 3.
        from io import BytesIO

        self.require_librarian(flask.request.library)

        data_source = DataSource.lookup(self._db, DataSource.LIBRARY_STAFF)

        work = self.load_work(flask.request.library, identifier_type,
                              identifier)
        if isinstance(work, ProblemDetail):
            return work

        rights_uri = flask.request.form.get("rights_status")
        rights_explanation = flask.request.form.get("rights_explanation")

        if not rights_uri:
            return INVALID_IMAGE.detailed(
                _("You must specify the image's license."))

        collection = self._get_collection_from_pools(identifier_type,
                                                     identifier)
        if isinstance(collection, ProblemDetail):
            return collection

        # Look for an appropriate mirror to store this cover image. Since the
        # mirror should be used for covers, we don't need a mirror for books.
        # NOTE(review): the default dict is keyed by the literal
        # "covers_mirror" but read back with ExternalIntegrationLink.COVERS;
        # presumably the two are the same string -- confirm.
        mirrors = mirrors or dict(covers_mirror=MirrorUploader.for_collection(
            collection, ExternalIntegrationLink.COVERS),
                                  books_mirror=None)
        if not mirrors.get(ExternalIntegrationLink.COVERS):
            return INVALID_CONFIGURATION_OPTION.detailed(
                _("Could not find a storage integration for uploading the cover."
                  ))

        image = self.generate_cover_image(work, identifier_type, identifier)
        if isinstance(image, ProblemDetail):
            return image

        original, derivation_settings, cover_href, cover_rights_explanation = self._original_cover_info(
            image, work, data_source, rights_uri, rights_explanation)

        # Serialize the image to PNG bytes in memory.
        png_buffer = BytesIO()
        image.save(png_buffer, format="PNG")
        content = png_buffer.getvalue()

        # No href came back with the original cover info, so generate one
        # from the image content.
        if not cover_href:
            cover_href = Hyperlink.generic_uri(
                data_source,
                work.presentation_edition.primary_identifier,
                Hyperlink.IMAGE,
                content=content)

        cover_data = LinkData(
            Hyperlink.IMAGE,
            href=cover_href,
            media_type=Representation.PNG_MEDIA_TYPE,
            content=content,
            rights_uri=rights_uri,
            rights_explanation=cover_rights_explanation,
            original=original,
            transformation_settings=derivation_settings,
        )

        presentation_policy = PresentationCalculationPolicy(
            choose_edition=False,
            set_edition_metadata=False,
            classify=False,
            choose_summary=False,
            calculate_quality=False,
            choose_cover=True,
            regenerate_opds_entries=True,
            regenerate_marc_record=True,
            update_search_index=False,
        )

        replacement_policy = ReplacementPolicy(
            links=True,
            # link_content is false because we already have the content.
            # We don't want the metadata layer to try to fetch it again.
            link_content=False,
            mirrors=mirrors,
            presentation_calculation_policy=presentation_policy,
        )

        metadata = Metadata(data_source, links=[cover_data])
        metadata.apply(work.presentation_edition,
                       collection,
                       replace=replacement_policy)

        # metadata.apply only updates the edition, so we also need
        # to update the work.
        work.calculate_presentation(policy=presentation_policy)

        return Response(_("Success"), 200)
Exemple #4
0
    def add_with_metadata(self, collection_details):
        """Add identifiers, along with their metadata, to a Collection's catalog.

        The request body is parsed with ``feedparser`` and every entry is
        keyed by its ``id``, which is then handed to ``Identifier.parse_urns``.
        Each URN that parses is cataloged in the collection (if it is not
        there already) and given a minimal Edition holding the entry's
        title, author, language and cover links.

        :param collection_details: Passed to ``collection_from_details``
            together with the authenticated client to find the Collection.
        :return: The ProblemDetail from authentication if the client could
            not be authenticated; otherwise the ``feed_response`` of an
            AcquisitionFeed whose precomposed entries are one OPDSMessage
            per submitted URN.
        """
        client = authenticated_client_from_request(self._db)
        if isinstance(client, ProblemDetail):
            return client

        collection = collection_from_details(self._db, client,
                                             collection_details)

        data_source = DataSource.lookup(self._db,
                                        collection.name,
                                        autocreate=True)

        feed = feedparser.parse(request.data)
        entries_by_urn = {
            entry.get('id'): entry
            for entry in feed.get("entries", [])
        }

        identifiers_by_urn, invalid_urns = Identifier.parse_urns(
            self._db, entries_by_urn.keys())

        # URNs that could not be parsed are reported back, not cataloged.
        messages = [
            OPDSMessage(urn, INVALID_URN.status_code, INVALID_URN.detail)
            for urn in invalid_urns
        ]

        # None of the following depends on the individual entry, so build
        # it once instead of once per identifier.
        image_types = {Hyperlink.IMAGE, Hyperlink.THUMBNAIL_IMAGE}
        presentation = PresentationCalculationPolicy(
            choose_edition=False,
            set_edition_metadata=False,
            classify=False,
            choose_summary=False,
            calculate_quality=False,
            choose_cover=False,
            regenerate_opds_entries=False,
        )
        replace = ReplacementPolicy(
            presentation_calculation_policy=presentation)

        for urn, identifier in identifiers_by_urn.items():
            entry = entries_by_urn[urn]

            if identifier in collection.catalog:
                status = HTTP_OK
                description = "Already in catalog"
            else:
                collection.catalog_identifier(identifier)
                status = HTTP_CREATED
                description = "Successfully added"

            # Get a cover if it exists.
            links = [
                LinkData(link.get("rel"), link.get("href"))
                for link in entry.get("links", [])
                if link.get("rel") in image_types
            ]

            # Create an edition to hold the title and author. LicensePool.calculate_work
            # refuses to create a Work when there's no title, and if we have a title, author
            # and language we can attempt to look up the edition in OCLC.
            title = entry.get("title") or "Unknown Title"
            author = ContributorData(sort_name=(entry.get("author")
                                                or Edition.UNKNOWN_AUTHOR),
                                     roles=[Contributor.PRIMARY_AUTHOR_ROLE])
            language = entry.get("dcterms_language")

            metadata = Metadata(
                data_source,
                primary_identifier=IdentifierData(identifier.type,
                                                  identifier.identifier),
                title=title,
                language=language,
                contributors=[author],
                links=links,
            )

            edition, ignore = metadata.edition(self._db)
            metadata.apply(edition, collection, replace=replace)

            # Only report on the URN once its metadata has been applied.
            messages.append(OPDSMessage(urn, status, description))

        title = "%s Catalog Item Additions for %s" % (collection.protocol,
                                                      client.url)
        url = self.collection_feed_url("add_with_metadata", collection)
        addition_feed = AcquisitionFeed(self._db,
                                        title,
                                        url, [],
                                        VerboseAnnotator,
                                        precomposed_entries=messages)

        return feed_response(addition_feed)
    def parse_book(cls, collection, g, uri, title):
        """Turn an RDF graph into an Edition for the given `uri` and `title`.

        Metadata for the book is read out of the graph `g`, applied to an
        Edition, and then an open-access LicensePool is set up for it in
        `collection` and asked to calculate its Work.

        :param collection: Collection the Edition and LicensePool belong to;
            its database session is used for all lookups.
        :param g: The RDF graph to read from.
        :param uri: URI of the book within the graph. The Gutenberg ID is
            extracted from it with `cls.ID_IN_URI`.
        :param title: Title of the book. Anything after the first line
            break becomes the subtitle.
        :return: A 3-tuple `(edition, license_pool, new)`, where `new` is
            the second value returned by `Metadata.edition`.
        """
        id_match = cls.ID_IN_URI.search(uri)
        gutenberg_id = unicode(id_match.groups()[0])
        primary_identifier = IdentifierData(
            Identifier.GUTENBERG_ID, gutenberg_id
        )

        # The first line of the title is the real title; the remaining
        # lines, rejoined with "\n", are the subtitle. "\r\n" is tried
        # before "\n" so Windows line endings are split cleanly.
        title = unicode(title)
        subtitle = None
        for line_break in ("\r\n", "\n"):
            if line_break not in title:
                continue
            pieces = title.split(line_break)
            title = pieces[0]
            subtitle = "\n".join(pieces[1:])
            break

        issued_text = cls._value(g, (uri, cls.dcterms.issued, None))
        issued = datetime.datetime.strptime(
            issued_text, cls.DATE_FORMAT
        ).date()

        rights = cls._value(g, (uri, cls.dcterms.rights, None))
        rights = str(rights) if rights else ''
        rights_uri = RightsStatus.rights_uri_from_string(rights)

        # As far as I can tell, Gutenberg descriptions are 100%
        # useless for our purposes. They should not be used, even if
        # no other description is available.

        publisher = cls._value(g, (uri, cls.dcterms.publisher, None))

        # Collect every language the lookup table can translate into a
        # three-letter code.
        languages = []
        language_triples = g.triples((uri, cls.dcterms.language, None))
        for _subj, _pred, language_uri in language_triples:
            two_letter = str(
                cls._value(g, (language_uri, cls.rdf.value, None))
            )
            three_letter = LanguageCodes.two_to_three[two_letter]
            if three_letter:
                languages.append(three_letter)

        # English wins if present; otherwise take the first language found.
        language = None
        if languages:
            language = 'eng' if 'eng' in languages else languages[0]

        creator_triples = g.triples((uri, cls.dcterms.creator, None))
        contributors = [
            ContributorData(
                sort_name=cls._value(
                    g, (author_uri, cls.gutenberg.name, None)
                ),
                aliases=cls._values(
                    g, (author_uri, cls.gutenberg.alias, None)
                ),
                roles=[Contributor.AUTHOR_ROLE],
            )
            for _subj, _pred, author_uri in creator_triples
        ]

        subjects = []
        for subject_node in cls._values(g, (uri, cls.dcterms.subject, None)):
            subject_value = cls._value(
                g, (subject_node, cls.rdf.value, None)
            )
            vocabulary_uri = cls._value(
                g, (subject_node, cls.dcam.memberOf, None)
            )
            vocabulary = Subject.by_uri[str(vocabulary_uri)]
            subjects.append(SubjectData(vocabulary, subject_value))

        medium = Edition.BOOK_MEDIUM

        # Turn the Gutenberg download links into Hyperlinks associated
        # with the new Edition. They will serve either as open access
        # downloads or cover images.
        download_links = cls._values(g, (uri, cls.dcterms.hasFormat, None))
        canonical_link = LinkData(
            rel=Hyperlink.CANONICAL,
            href=str(uri),
        )
        links = [canonical_link]

        # Gutenberg won't allow us to use any of the download or image
        # links--we have to make our own from an rsynced mirror--but
        # we can look through the links to determine which medium to
        # assign to this book.
        formats = []
        for href in download_links:
            format_uris = cls._values(g, (href, cls.dcterms['format'], None))
            for format_uri in format_uris:
                media_type = unicode(
                    cls._value(g, (format_uri, cls.rdf.value, None))
                )
                if media_type.startswith('audio/'):
                    medium = Edition.AUDIO_MEDIUM
                    audio_format = FormatData(
                        content_type=Representation.MP3_MEDIA_TYPE,
                        drm_scheme=DeliveryMechanism.NO_DRM,
                    )
                    formats.append(audio_format)
                elif media_type.startswith('video/'):
                    # Video changes the medium but contributes no format.
                    medium = Edition.VIDEO_MEDIUM
                else:
                    # Everything else is recorded as an EPUB format and
                    # leaves the medium as it was.
                    epub_format = FormatData(
                        content_type=Representation.EPUB_MEDIA_TYPE,
                        drm_scheme=DeliveryMechanism.NO_DRM,
                        rights_uri=rights_uri,
                    )
                    formats.append(epub_format)

        _db = Session.object_session(collection)
        metadata = Metadata(
            data_source=DataSource.GUTENBERG,
            title=title,
            subtitle=subtitle,
            language=language,
            publisher=publisher,
            issued=issued,
            medium=medium,
            primary_identifier=primary_identifier,
            subjects=subjects,
            contributors=contributors,
            links=links,
        )
        edition, new = metadata.edition(_db)
        metadata.apply(edition, collection)

        # Ensure that an open-access LicensePool exists for this book.
        circulation_data = CirculationData(
            data_source=DataSource.GUTENBERG,
            primary_identifier=primary_identifier,
            formats=formats,
            default_rights_uri=rights_uri,
            links=links,
        )

        license_pool, _pool_is_new = circulation_data.license_pool(
            _db, collection
        )
        format_replacement = ReplacementPolicy(formats=True)
        circulation_data.apply(_db, collection, replace=format_replacement)
        license_pool.calculate_work()
        return edition, license_pool, new