def add_with_metadata(self, collection_details):
    """Adds identifiers with their metadata to a Collection's catalog.

    Parses the request body as an OPDS feed; each entry's URN is
    cataloged in the Collection (if not already present) and a
    minimal Edition is created/updated from the entry's title,
    author, language, and cover links.

    :param collection_details: Opaque value identifying the Collection,
        resolved via `collection_from_details`.
    :return: A ProblemDetail on authentication failure, otherwise an
        acquisition-feed response whose OPDSMessages report per-URN status.
    """
    client = authenticated_client_from_request(self._db)
    if isinstance(client, ProblemDetail):
        return client

    collection = collection_from_details(
        self._db, client, collection_details
    )

    # Each collection gets its own DataSource, created on first use.
    data_source = DataSource.lookup(
        self._db, collection.name, autocreate=True
    )

    # Fix: `messages` was previously initialized twice ([] then list());
    # a single initialization is sufficient.
    messages = []

    feed = feedparser.parse(request.data)
    entries = feed.get("entries", [])
    entries_by_urn = {entry.get('id'): entry for entry in entries}

    identifiers_by_urn, invalid_urns = Identifier.parse_urns(
        self._db, entries_by_urn.keys()
    )

    # Unparseable URNs are reported but otherwise ignored.
    for urn in invalid_urns:
        messages.append(OPDSMessage(
            urn, INVALID_URN.status_code, INVALID_URN.detail
        ))

    for urn, identifier in identifiers_by_urn.items():
        entry = entries_by_urn[urn]
        status = HTTP_OK
        description = "Already in catalog"

        if identifier not in collection.catalog:
            collection.catalog_identifier(identifier)
            status = HTTP_CREATED
            description = "Successfully added"

        message = OPDSMessage(urn, status, description)

        # Get a cover if it exists.
        image_types = {Hyperlink.IMAGE, Hyperlink.THUMBNAIL_IMAGE}
        images = [
            link for link in entry.get("links", [])
            if link.get("rel") in image_types
        ]
        links = [
            LinkData(image.get("rel"), image.get("href"))
            for image in images
        ]

        # Create an edition to hold the title and author. LicensePool.calculate_work
        # refuses to create a Work when there's no title, and if we have a title, author
        # and language we can attempt to look up the edition in OCLC.
        title = entry.get("title") or "Unknown Title"
        author = ContributorData(
            sort_name=(entry.get("author") or Edition.UNKNOWN_AUTHOR),
            roles=[Contributor.PRIMARY_AUTHOR_ROLE]
        )
        language = entry.get("dcterms_language")

        # Defer all presentation work; we only want to record metadata here.
        presentation = PresentationCalculationPolicy(
            choose_edition=False,
            set_edition_metadata=False,
            classify=False,
            choose_summary=False,
            calculate_quality=False,
            choose_cover=False,
            regenerate_opds_entries=False,
        )
        replace = ReplacementPolicy(
            presentation_calculation_policy=presentation
        )
        metadata = Metadata(
            data_source,
            primary_identifier=IdentifierData(
                identifier.type, identifier.identifier
            ),
            title=title,
            language=language,
            contributors=[author],
            links=links,
        )

        edition, ignore = metadata.edition(self._db)
        metadata.apply(edition, collection, replace=replace)

        messages.append(message)

    title = "%s Catalog Item Additions for %s" % (
        collection.protocol, client.url
    )
    url = self.collection_feed_url("add_with_metadata", collection)
    addition_feed = AcquisitionFeed(
        self._db, title, url, [], VerboseAnnotator,
        precomposed_entries=messages
    )

    return feed_response(addition_feed)
def test_open_access_content_mirrored(self):
    # Open-access download links must be rewritten to point at our S3
    # bucket; commercial (DRM-encrypted) links must be left untouched.
    #
    # Note: Mirroring tests passing does not guarantee that all code now
    # correctly calls on CirculationData, as well as Metadata. This is a risk.
    uploaders = dict(books_mirror=MockS3Uploader(), covers_mirror=None)
    bucket_key = ExternalIntegrationLink.OPEN_ACCESS_BOOKS

    # A book with a license pool.
    edition, pool = self._edition(with_license_pool=True)

    # An open-access link to the book's content -- this one gets mirrored.
    open_access_link = LinkData(
        rel=Hyperlink.OPEN_ACCESS_DOWNLOAD,
        href="http://example.com/",
        media_type=Representation.EPUB_MEDIA_TYPE,
        content="i am a tiny book",
    )

    # A commercial link -- this one does not.
    encrypted_link = LinkData(
        rel=Hyperlink.DRM_ENCRYPTED_DOWNLOAD,
        href="http://example.com/2",
        media_type=Representation.EPUB_MEDIA_TYPE,
        content="i am a pricy book",
    )

    policy = ReplacementPolicy(mirrors=uploaders)

    # Applying Metadata must NOT upload anything -- after the refactor,
    # mirroring is CirculationData's job.
    metadata = Metadata(
        data_source=edition.data_source,
        links=[open_access_link, encrypted_link],
    )
    metadata.apply(edition, pool.collection, replace=policy)
    assert 0 == len(uploaders[bucket_key].uploaded)

    # Applying CirculationData DOES upload.
    circulation_data = CirculationData(
        data_source=edition.data_source,
        primary_identifier=edition.primary_identifier,
        links=[open_access_link, encrypted_link],
    )
    circulation_data.apply(self._db, pool.collection, replace=policy)
    assert 1 == len(uploaders[bucket_key].uploaded)

    # Exactly one item -- the open-access one -- was 'mirrored', and it
    # kept its open-access rel.
    [book] = uploaders[bucket_key].uploaded
    assert [Hyperlink.OPEN_ACCESS_DOWNLOAD
            ] == [x.rel for x in book.resource.links]

    # Its mirror URL lands in the expected S3 bucket and is derived from
    # the identifier and title.
    assert book.mirror_url.startswith(
        "https://test-content-bucket.s3.amazonaws.com/")
    expect = "/%s/%s.epub" % (
        edition.primary_identifier.identifier, edition.title)
    assert book.mirror_url.endswith(expect)

    # Both links ended up on the identifier; sort by rel so the
    # encrypted (unmirrored) one comes first.
    ordered = sorted(pool.identifier.links, key=lambda x: x.rel)
    unmirrored_rep, mirrored_rep = [
        item.resource.representation for item in ordered
    ]
    assert mirrored_rep.mirror_url.startswith(
        "https://test-content-bucket.s3.amazonaws.com/")

    # The unmirrored link keeps its original URL and has no mirror URL.
    assert "http://example.com/2" == unmirrored_rep.url
    assert None == unmirrored_rep.mirror_url
def change_book_cover(self, identifier_type, identifier, mirrors=None):
    """Save a new book cover based on the submitted form.

    Requires librarian privileges on the current flask request's library.
    Reads `rights_status`/`rights_explanation` from the submitted form,
    renders the new cover, mirrors it to the covers storage integration,
    and recalculates the work's presentation.

    :param identifier_type: Type of the work's primary identifier.
    :param identifier: The identifier itself.
    :param mirrors: Optional dict of uploaders (keyed like
        `covers_mirror`/`books_mirror`); when omitted one is looked up
        from the collection's storage integration.
    :return: A ProblemDetail on any failure, otherwise a 200 Response.
    """
    self.require_librarian(flask.request.library)

    data_source = DataSource.lookup(self._db, DataSource.LIBRARY_STAFF)

    work = self.load_work(flask.request.library, identifier_type, identifier)
    if isinstance(work, ProblemDetail):
        return work

    rights_uri = flask.request.form.get("rights_status")
    rights_explanation = flask.request.form.get("rights_explanation")

    if not rights_uri:
        return INVALID_IMAGE.detailed(
            _("You must specify the image's license."))

    collection = self._get_collection_from_pools(identifier_type, identifier)
    if isinstance(collection, ProblemDetail):
        return collection

    # Look for an appropriate mirror to store this cover image. Since the
    # mirror should be used for covers, we don't need a mirror for books.
    mirrors = mirrors or dict(
        covers_mirror=MirrorUploader.for_collection(
            collection, ExternalIntegrationLink.COVERS),
        books_mirror=None)
    if not mirrors.get(ExternalIntegrationLink.COVERS):
        return INVALID_CONFIGURATION_OPTION.detailed(
            _("Could not find a storage integration for uploading the cover."
              ))

    image = self.generate_cover_image(work, identifier_type, identifier)
    if isinstance(image, ProblemDetail):
        return image

    original, derivation_settings, cover_href, cover_rights_explanation = \
        self._original_cover_info(
            image, work, data_source, rights_uri, rights_explanation)

    # Serialize the rendered image to PNG bytes. (Renamed from `buffer`,
    # which shadowed the builtin of the same name.)
    image_buffer = StringIO()
    image.save(image_buffer, format="PNG")
    content = image_buffer.getvalue()

    if not cover_href:
        cover_href = Hyperlink.generic_uri(
            data_source, work.presentation_edition.primary_identifier,
            Hyperlink.IMAGE, content=content)

    cover_data = LinkData(
        Hyperlink.IMAGE,
        href=cover_href,
        media_type=Representation.PNG_MEDIA_TYPE,
        content=content,
        rights_uri=rights_uri,
        rights_explanation=cover_rights_explanation,
        original=original,
        transformation_settings=derivation_settings,
    )

    # Only the cover (and the derived records/feeds) should change;
    # leave every other presentation decision alone.
    presentation_policy = PresentationCalculationPolicy(
        choose_edition=False,
        set_edition_metadata=False,
        classify=False,
        choose_summary=False,
        calculate_quality=False,
        choose_cover=True,
        regenerate_opds_entries=True,
        regenerate_marc_record=True,
        update_search_index=False,
    )

    replacement_policy = ReplacementPolicy(
        links=True,
        # link_content is false because we already have the content.
        # We don't want the metadata layer to try to fetch it again.
        link_content=False,
        mirrors=mirrors,
        presentation_calculation_policy=presentation_policy,
    )

    metadata = Metadata(data_source, links=[cover_data])
    metadata.apply(work.presentation_edition, collection,
                   replace=replacement_policy)

    # metadata.apply only updates the edition, so we also need
    # to update the work.
    work.calculate_presentation(policy=presentation_policy)

    return Response(_("Success"), 200)
def add_with_metadata(self, collection_details):
    """Adds identifiers with their metadata to a Collection's catalog.

    The request body is parsed as an OPDS feed. For each entry: the
    entry's URN is cataloged in the Collection (if it was not already),
    and a bare Edition is created/updated from the entry's title,
    author, language and cover links.

    :param collection_details: Opaque value resolved to a Collection
        via `collection_from_details`.
    :return: A ProblemDetail if the client cannot be authenticated,
        otherwise an acquisition-feed response of per-URN OPDSMessages.
    """
    client = authenticated_client_from_request(self._db)
    if isinstance(client, ProblemDetail):
        return client

    collection = collection_from_details(
        self._db, client, collection_details)

    # Each collection has its own DataSource, autocreated on first use.
    data_source = DataSource.lookup(
        self._db, collection.name, autocreate=True)

    # Fix: previously initialized twice (messages = [] and then
    # messages = list()); once is enough.
    messages = []

    feed = feedparser.parse(request.data)
    entries = feed.get("entries", [])
    entries_by_urn = {entry.get('id'): entry for entry in entries}

    identifiers_by_urn, invalid_urns = Identifier.parse_urns(
        self._db, entries_by_urn.keys())

    # Report unparseable URNs; they are otherwise skipped.
    for urn in invalid_urns:
        messages.append(
            OPDSMessage(urn, INVALID_URN.status_code, INVALID_URN.detail))

    for urn, identifier in identifiers_by_urn.items():
        entry = entries_by_urn[urn]
        status = HTTP_OK
        description = "Already in catalog"

        if identifier not in collection.catalog:
            collection.catalog_identifier(identifier)
            status = HTTP_CREATED
            description = "Successfully added"

        message = OPDSMessage(urn, status, description)

        # Get a cover if it exists.
        image_types = {Hyperlink.IMAGE, Hyperlink.THUMBNAIL_IMAGE}
        images = [
            link for link in entry.get("links", [])
            if link.get("rel") in image_types
        ]
        links = [
            LinkData(image.get("rel"), image.get("href"))
            for image in images
        ]

        # Create an edition to hold the title and author. LicensePool.calculate_work
        # refuses to create a Work when there's no title, and if we have a title, author
        # and language we can attempt to look up the edition in OCLC.
        title = entry.get("title") or "Unknown Title"
        author = ContributorData(
            sort_name=(entry.get("author") or Edition.UNKNOWN_AUTHOR),
            roles=[Contributor.PRIMARY_AUTHOR_ROLE])
        language = entry.get("dcterms_language")

        # Record metadata only; skip all presentation recalculation.
        presentation = PresentationCalculationPolicy(
            choose_edition=False,
            set_edition_metadata=False,
            classify=False,
            choose_summary=False,
            calculate_quality=False,
            choose_cover=False,
            regenerate_opds_entries=False,
        )
        replace = ReplacementPolicy(
            presentation_calculation_policy=presentation)
        metadata = Metadata(
            data_source,
            primary_identifier=IdentifierData(
                identifier.type, identifier.identifier),
            title=title,
            language=language,
            contributors=[author],
            links=links,
        )

        edition, ignore = metadata.edition(self._db)
        metadata.apply(edition, collection, replace=replace)

        messages.append(message)

    title = "%s Catalog Item Additions for %s" % (collection.protocol,
                                                  client.url)
    url = self.collection_feed_url("add_with_metadata", collection)
    addition_feed = AcquisitionFeed(self._db,
                                    title,
                                    url, [],
                                    VerboseAnnotator,
                                    precomposed_entries=messages)

    return feed_response(addition_feed)
def parse_book(cls, collection, g, uri, title):
    """Turn an RDF graph into a Edition for the given `uri` and `title`.

    Extracts identifier, title/subtitle, issue date, rights, language,
    contributors, subjects and formats from the Gutenberg RDF graph `g`,
    then creates/updates an Edition and an open-access LicensePool.

    :param collection: The Collection the resulting LicensePool belongs to
        (also used to obtain the database session).
    :param g: An RDF graph (presumably rdflib -- supports `triples()`).
    :param uri: The RDF subject URI for this book.
    :param title: The raw title value from the graph.
    :return: A 3-tuple (edition, license_pool, new) where `new` is True
        if the Edition was newly created.
    """
    # The Gutenberg numeric ID is embedded in the URI.
    source_id = unicode(cls.ID_IN_URI.search(uri).groups()[0])
    primary_identifier = IdentifierData(
        Identifier.GUTENBERG_ID, source_id
    )

    # Split a subtitle out from the main title.
    title = unicode(title)
    subtitle = None
    for separator in "\r\n", "\n":
        if separator in title:
            parts = title.split(separator)
            title = parts[0]
            subtitle = "\n".join(parts[1:])
            break

    # NOTE(review): strptime will raise if dcterms.issued is missing or
    # malformed -- presumably Gutenberg always supplies it; confirm.
    issued = cls._value(g, (uri, cls.dcterms.issued, None))
    issued = datetime.datetime.strptime(issued, cls.DATE_FORMAT).date()

    rights = cls._value(g, (uri, cls.dcterms.rights, None))
    if rights:
        rights = str(rights)
    else:
        rights = ''
    rights_uri = RightsStatus.rights_uri_from_string(rights)

    # As far as I can tell, Gutenberg descriptions are 100%
    # useless for our purposes. They should not be used, even if
    # no other description is available.

    publisher = cls._value(g, (uri, cls.dcterms.publisher, None))

    # Collect all language codes, converting 2-letter to 3-letter codes.
    languages = []
    for ignore, ignore, language_uri in g.triples(
            (uri, cls.dcterms.language, None)):
        code = str(cls._value(g, (language_uri, cls.rdf.value, None)))
        code = LanguageCodes.two_to_three[code]
        if code:
            languages.append(code)

    # Prefer English when present; otherwise take the first language found.
    if 'eng' in languages:
        language = 'eng'
    elif languages:
        language = languages[0]
    else:
        language = None

    contributors = []
    for ignore, ignore, author_uri in g.triples((uri, cls.dcterms.creator, None)):
        name = cls._value(g, (author_uri, cls.gutenberg.name, None))
        aliases = cls._values(g, (author_uri, cls.gutenberg.alias, None))
        contributors.append(ContributorData(
            sort_name=name,
            aliases=aliases,
            roles=[Contributor.AUTHOR_ROLE],
        ))

    # Map each subject to its vocabulary (e.g. LCSH/LCC) via dcam:memberOf.
    subjects = []
    subject_links = cls._values(g, (uri, cls.dcterms.subject, None))
    for subject in subject_links:
        value = cls._value(g, (subject, cls.rdf.value, None))
        vocabulary = cls._value(g, (subject, cls.dcam.memberOf, None))
        vocabulary = Subject.by_uri[str(vocabulary)]
        subjects.append(SubjectData(vocabulary, value))

    # Default medium; may be overridden below based on download media types.
    medium = Edition.BOOK_MEDIUM

    # Turn the Gutenberg download links into Hyperlinks associated
    # with the new Edition. They will serve either as open access
    # downloads or cover images.
    download_links = cls._values(g, (uri, cls.dcterms.hasFormat, None))
    links = [LinkData(
        rel=Hyperlink.CANONICAL,
        href=str(uri),
    )]

    # Gutenberg won't allow us to use any of the download or image
    # links--we have to make our own from an rsynced mirror--but
    # we can look through the links to determine which medium to
    # assign to this book.
    formats = []
    for href in download_links:
        for format_uri in cls._values(
                g, (href, cls.dcterms['format'], None)):
            media_type = unicode(
                cls._value(g, (format_uri, cls.rdf.value, None)))
            if media_type.startswith('audio/'):
                # Any audio format flips the whole book to an audiobook.
                medium = Edition.AUDIO_MEDIUM
                formats.append(FormatData(
                    content_type=Representation.MP3_MEDIA_TYPE,
                    drm_scheme=DeliveryMechanism.NO_DRM,
                ))
            elif media_type.startswith('video/'):
                # Videos get a medium but no FormatData.
                medium = Edition.VIDEO_MEDIUM
            else:
                # Everything else is treated as a DRM-free EPUB.
                formats.append(FormatData(
                    content_type=Representation.EPUB_MEDIA_TYPE,
                    drm_scheme=DeliveryMechanism.NO_DRM,
                    rights_uri=rights_uri,
                ))

    _db = Session.object_session(collection)
    metadata = Metadata(
        data_source=DataSource.GUTENBERG,
        title=title,
        subtitle=subtitle,
        language=language,
        publisher=publisher,
        issued=issued,
        medium=medium,
        primary_identifier=primary_identifier,
        subjects=subjects,
        contributors=contributors,
        links=links,
    )

    edition, new = metadata.edition(_db)
    metadata.apply(edition, collection)

    # Ensure that an open-access LicensePool exists for this book.
    circulation_data = CirculationData(
        data_source=DataSource.GUTENBERG,
        primary_identifier=primary_identifier,
        formats=formats,
        default_rights_uri=rights_uri,
        links=links,
    )

    license_pool, new_license_pool = circulation_data.license_pool(
        _db, collection
    )
    # replace formats so stale FormatData doesn't linger across re-parses.
    replace = ReplacementPolicy(formats=True)
    circulation_data.apply(_db, collection, replace=replace)

    license_pool.calculate_work()
    return edition, license_pool, new