def test_explicit_formatdata(self):
    # Creating an edition with an open-access download will
    # automatically create a delivery mechanism.
    edition, pool = self._edition(with_open_access_download=True)

    # Let's also add a DRM format.
    drm_format = FormatData(
        content_type=Representation.PDF_MEDIA_TYPE,
        drm_scheme=DeliveryMechanism.ADOBE_DRM,
    )

    circulation_data = CirculationData(
        formats=[drm_format],
        data_source=edition.data_source,
        primary_identifier=edition.primary_identifier,
    )
    circulation_data.apply(pool)

    [epub, pdf] = sorted(
        pool.delivery_mechanisms,
        key=lambda x: x.delivery_mechanism.content_type
    )
    eq_(epub.resource, edition.license_pool.best_open_access_link)
    eq_(Representation.PDF_MEDIA_TYPE, pdf.delivery_mechanism.content_type)
    eq_(DeliveryMechanism.ADOBE_DRM, pdf.delivery_mechanism.drm_scheme)

    # If we tell the CirculationData to replace the list of formats, we
    # only have the one format we manually created.
    replace = ReplacementPolicy(
        formats=True,
    )
    circulation_data.apply(pool, replace=replace)
    [pdf] = pool.delivery_mechanisms
    eq_(Representation.PDF_MEDIA_TYPE, pdf.delivery_mechanism.content_type)
def test_rights_status_commercial_link_with_rights(self):
    identifier = IdentifierData(
        Identifier.OVERDRIVE_ID,
        "abcd",
    )
    link = LinkData(
        rel=Hyperlink.DRM_ENCRYPTED_DOWNLOAD,
        media_type=Representation.EPUB_MEDIA_TYPE,
        href=self._url,
        rights_uri=RightsStatus.IN_COPYRIGHT,
    )
    format = FormatData(
        content_type=link.media_type,
        drm_scheme=DeliveryMechanism.ADOBE_DRM,
        link=link,
        rights_uri=RightsStatus.IN_COPYRIGHT,
    )
    circulation_data = CirculationData(
        data_source=DataSource.OVERDRIVE,
        primary_identifier=identifier,
        links=[link],
        formats=[format],
    )
    replace = ReplacementPolicy(
        formats=True,
    )

    pool, ignore = circulation_data.license_pool(self._db)
    circulation_data.apply(pool, replace)

    eq_(False, pool.open_access)
    eq_(1, len(pool.delivery_mechanisms))
    eq_(RightsStatus.IN_COPYRIGHT,
        pool.delivery_mechanisms[0].rights_status.uri)
def test_circulationdata_can_be_deepcopied(self):
    # Check that we didn't put something in the CirculationData that
    # will prevent it from being copied. (e.g., self.log)
    subject = SubjectData(Subject.TAG, "subject")
    contributor = ContributorData()
    identifier = IdentifierData(Identifier.GUTENBERG_ID, "1")
    link = LinkData(Hyperlink.OPEN_ACCESS_DOWNLOAD, "example.epub")
    format = FormatData(Representation.EPUB_MEDIA_TYPE, DeliveryMechanism.NO_DRM)
    rights_uri = RightsStatus.GENERIC_OPEN_ACCESS

    circulation_data = CirculationData(
        DataSource.GUTENBERG,
        primary_identifier=identifier,
        links=[link],
        licenses_owned=5,
        licenses_available=5,
        licenses_reserved=None,
        patrons_in_hold_queue=None,
        formats=[format],
        default_rights_uri=rights_uri,
    )

    circulation_data_copy = deepcopy(circulation_data)

    # If deepcopy didn't throw an exception we're ok.
    assert circulation_data_copy is not None
def test_license_pool_sets_default_license_values(self):
    """We have no information about how many copies of the book we've
    actually licensed, but a LicensePool can be created anyway, so we
    can store format information.
    """
    identifier = IdentifierData(Identifier.OVERDRIVE_ID, "1")
    drm_format = FormatData(
        content_type=Representation.PDF_MEDIA_TYPE,
        drm_scheme=DeliveryMechanism.ADOBE_DRM,
    )
    circulation = CirculationData(
        data_source=DataSource.OVERDRIVE,
        primary_identifier=identifier,
        formats=[drm_format],
    )
    pool, is_new = circulation.license_pool(
        self._db,
    )
    eq_(True, is_new)

    # We start with the conservative assumption that we own no
    # licenses for the book.
    eq_(0, pool.licenses_owned)
    eq_(0, pool.licenses_available)
    eq_(0, pool.licenses_reserved)
    eq_(0, pool.patrons_in_hold_queue)
def test_apply_removes_old_formats_based_on_replacement_policy(self):
    edition, pool = self._edition(with_license_pool=True)

    # Start with one delivery mechanism for this pool.
    for lpdm in pool.delivery_mechanisms:
        self._db.delete(lpdm)

    old_lpdm = pool.set_delivery_mechanism(
        Representation.PDF_MEDIA_TYPE,
        DeliveryMechanism.ADOBE_DRM,
        RightsStatus.IN_COPYRIGHT,
        None,
    )

    # And it has been loaned.
    patron = self._patron()
    loan, ignore = pool.loan_to(patron, fulfillment=old_lpdm)
    eq_(old_lpdm, loan.fulfillment)

    # We have new circulation data that has a different format.
    format = FormatData(
        content_type=Representation.EPUB_MEDIA_TYPE,
        drm_scheme=DeliveryMechanism.ADOBE_DRM,
    )
    circulation_data = CirculationData(
        formats=[format],
        data_source=edition.data_source,
        primary_identifier=edition.primary_identifier,
    )

    # If we apply the new CirculationData with formats false in the policy,
    # we'll add the new format, but keep the old one as well.
    replacement_policy = ReplacementPolicy(formats=False)
    circulation_data.apply(self._db, pool.collection, replacement_policy)

    eq_(2, pool.delivery_mechanisms.count())
    eq_(
        set([Representation.PDF_MEDIA_TYPE, Representation.EPUB_MEDIA_TYPE]),
        set([lpdm.delivery_mechanism.content_type
             for lpdm in pool.delivery_mechanisms])
    )
    eq_(old_lpdm, loan.fulfillment)

    # But if we make formats true in the policy, we'll delete the old format
    # and remove it from its loan.
    replacement_policy = ReplacementPolicy(formats=True)
    circulation_data.apply(self._db, pool.collection, replacement_policy)

    eq_(1, pool.delivery_mechanisms.count())
    eq_(Representation.EPUB_MEDIA_TYPE,
        pool.delivery_mechanisms[0].delivery_mechanism.content_type)
    eq_(None, loan.fulfillment)
def test_format_change_may_change_open_access_status(self):
    # In this test, whenever we call CirculationData.apply(), we
    # want to destroy the old list of formats and recreate it.
    replace_formats = ReplacementPolicy(formats=True)

    # Here's a seemingly ordinary non-open-access LicensePool.
    edition, pool = self._edition(with_license_pool=True)
    eq_(False, pool.open_access)

    # One day, we learn that it has an open-access delivery mechanism.
    link = LinkData(
        rel=Hyperlink.OPEN_ACCESS_DOWNLOAD,
        media_type=Representation.EPUB_MEDIA_TYPE,
        href=self._url,
        rights_uri=RightsStatus.CC_BY_ND,
    )
    circulation_data = CirculationData(
        data_source=pool.data_source,
        primary_identifier=pool.identifier,
        links=[link],
    )

    # Applying this information turns the pool into an open-access pool.
    circulation_data.apply(self._db, pool.collection, replace=replace_formats)
    eq_(True, pool.open_access)

    # Then we find out it was a mistake -- the book is in copyright.
    format = FormatData(
        Representation.EPUB_MEDIA_TYPE,
        DeliveryMechanism.NO_DRM,
        rights_uri=RightsStatus.IN_COPYRIGHT,
    )
    circulation_data = CirculationData(
        data_source=pool.data_source,
        primary_identifier=pool.identifier,
        formats=[format],
    )
    circulation_data.apply(self._db, pool.collection, replace=replace_formats)

    # The original LPDM has been removed and only the new one remains.
    eq_(False, pool.open_access)
    eq_(1, pool.delivery_mechanisms.count())
def internal_formats(cls, book_format):
    """Convert the term Bibliotheca uses to refer to a book
    format into a (medium, [formats]) 2-tuple.
    """
    medium = Edition.BOOK_MEDIUM
    format = None
    if book_format not in cls.format_data_for_bibliotheca_format:
        logging.error("Unrecognized BookFormat: %s", book_format)
        return medium, []

    content_type, drm_scheme = cls.format_data_for_bibliotheca_format[
        book_format]
    format = FormatData(content_type=content_type, drm_scheme=drm_scheme)
    if book_format == 'MP3':
        medium = Edition.AUDIO_MEDIUM
    else:
        medium = Edition.BOOK_MEDIUM
    return medium, [format]
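# A minimal sketch, not from the source: a helper showing how the
# (medium, [formats]) 2-tuple above can feed a CirculationData, the way the
# other extractors in this section do. parser_cls stands in for whichever
# class defines internal_formats, and primary_identifier is an IdentifierData
# built elsewhere; both are assumptions, not names from the original code.
def bibliotheca_circulation_sketch(parser_cls, primary_identifier, book_format):
    medium, formats = parser_cls.internal_formats(book_format)
    # An unrecognized format name comes back as (Edition.BOOK_MEDIUM, []),
    # so formats can be passed along unconditionally.
    return medium, CirculationData(
        data_source=DataSource.THREEM,
        primary_identifier=primary_identifier,
        formats=formats,
    )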
def extract_bibliographic(self, element, ns):
    identifiers = []
    contributors = []
    identifiers.append(IdentifierData(Identifier.ISBN, element["isbn"]))

    sort_name = element["author"]
    if not sort_name:
        sort_name = "Unknown"
    contributors.append(ContributorData(sort_name=sort_name))

    primary_identifier = IdentifierData(Identifier.ENKI_ID, element["id"])

    metadata = Metadata(
        data_source=DataSource.ENKI,
        title=element["title"],
        language="ENGLISH",
        medium=Edition.BOOK_MEDIUM,
        #series=series,
        publisher=element["publisher"],
        #imprint=imprint,
        #published=publication_date,
        primary_identifier=primary_identifier,
        identifiers=identifiers,
        #subjects=subjects,
        contributors=contributors,
    )

    # TODO: This should parse the content type and look it up in the Enki
    # Delivery Data above. Currently, we assume everything is an ePub that
    # uses Adobe DRM, which is a safe assumption only for now.
    formats = []
    formats.append(
        FormatData(content_type=Representation.EPUB_MEDIA_TYPE,
                   drm_scheme=DeliveryMechanism.ADOBE_DRM))

    circulationdata = CirculationData(
        data_source=DataSource.ENKI,
        primary_identifier=primary_identifier,
        formats=formats,
    )

    metadata.circulation = circulationdata
    return metadata
def test_circulationdata_may_require_collection(self):
    """Depending on the information provided in a CirculationData
    object, it might or might not be possible to call apply()
    without providing a Collection.
    """
    identifier = IdentifierData(Identifier.OVERDRIVE_ID, "1")
    format = FormatData(
        Representation.EPUB_MEDIA_TYPE,
        DeliveryMechanism.NO_DRM,
        rights_uri=RightsStatus.IN_COPYRIGHT,
    )
    circdata = CirculationData(
        DataSource.OVERDRIVE,
        primary_identifier=identifier,
        formats=[format],
    )
    circdata.apply(self._db, collection=None)

    # apply() has created a LicensePoolDeliveryMechanism for this
    # title, even though there are no LicensePools for it.
    identifier_obj, ignore = identifier.load(self._db)
    eq_([], identifier_obj.licensed_through)
    [lpdm] = identifier_obj.delivery_mechanisms
    eq_(DataSource.OVERDRIVE, lpdm.data_source.name)
    eq_(RightsStatus.IN_COPYRIGHT, lpdm.rights_status.uri)

    mechanism = lpdm.delivery_mechanism
    eq_(Representation.EPUB_MEDIA_TYPE, mechanism.content_type)
    eq_(DeliveryMechanism.NO_DRM, mechanism.drm_scheme)

    # But if we put some information in the CirculationData
    # that can only be stored in a LicensePool, there's trouble.
    circdata.licenses_owned = 0
    assert_raises_regexp(
        ValueError,
        'Cannot store circulation information because no Collection was provided.',
        circdata.apply,
        self._db,
        collection=None
    )
def book_info_to_metadata(cls, book, include_bibliographic=True,
                          include_formats=True):
    """Turn Overdrive's JSON representation of a book into a Metadata
    object.

    Note: The json data passed into this method is from a different
    file/stream from the json data that goes into the
    book_info_to_circulation() method.
    """
    if 'id' not in book:
        return None
    overdrive_id = book['id']
    primary_identifier = IdentifierData(Identifier.OVERDRIVE_ID, overdrive_id)

    if include_bibliographic:
        title = book.get('title', None)
        sort_title = book.get('sortTitle')
        subtitle = book.get('subtitle', None)
        series = book.get('series', None)
        publisher = book.get('publisher', None)
        imprint = book.get('imprint', None)

        if 'publishDate' in book:
            published = datetime.datetime.strptime(
                book['publishDate'][:10], cls.DATE_FORMAT)
        else:
            published = None

        languages = [l['code'] for l in book.get('languages', [])]
        if 'eng' in languages or not languages:
            language = 'eng'
        else:
            language = sorted(languages)[0]

        contributors = []
        for creator in book.get('creators', []):
            sort_name = creator['fileAs']
            display_name = creator['name']
            role = creator['role']
            roles = cls.parse_roles(overdrive_id, role) or [Contributor.UNKNOWN_ROLE]
            contributor = ContributorData(
                sort_name=sort_name, display_name=display_name,
                roles=roles, biography=creator.get('bioText', None))
            contributors.append(contributor)

        subjects = []
        for sub in book.get('subjects', []):
            subject = SubjectData(type=Subject.OVERDRIVE,
                                  identifier=sub['value'],
                                  weight=100)
            subjects.append(subject)

        for sub in book.get('keywords', []):
            subject = SubjectData(type=Subject.TAG,
                                  identifier=sub['value'],
                                  weight=1)
            subjects.append(subject)

        extra = dict()
        if 'grade_levels' in book:
            # n.b. Grade levels are measurements of reading level, not
            # age appropriateness. We can use them as a measure of age
            # appropriateness in a pinch, but we weight them less
            # heavily than other information from Overdrive.
            for i in book['grade_levels']:
                subject = SubjectData(type=Subject.GRADE_LEVEL,
                                      identifier=i['value'],
                                      weight=10)
                subjects.append(subject)

        overdrive_medium = book.get('mediaType', None)
        if overdrive_medium and overdrive_medium not in cls.overdrive_medium_to_simplified_medium:
            cls.log.error("Could not process medium %s for %s",
                          overdrive_medium, overdrive_id)

        medium = cls.overdrive_medium_to_simplified_medium.get(
            overdrive_medium, Edition.BOOK_MEDIUM)

        measurements = []
        if 'awards' in book:
            extra['awards'] = book.get('awards', [])
            num_awards = len(extra['awards'])
            measurements.append(
                MeasurementData(Measurement.AWARDS, str(num_awards)))

        for name, subject_type in (
                ('ATOS', Subject.ATOS_SCORE),
                ('lexileScore', Subject.LEXILE_SCORE),
                ('interestLevel', Subject.INTEREST_LEVEL)):
            if name not in book:
                continue
            identifier = str(book[name])
            subjects.append(
                SubjectData(type=subject_type, identifier=identifier,
                            weight=100))

        for grade_level_info in book.get('gradeLevels', []):
            grade_level = grade_level_info.get('value')
            subjects.append(
                SubjectData(type=Subject.GRADE_LEVEL,
                            identifier=grade_level,
                            weight=100))

        identifiers = []
        links = []
        for format in book.get('formats', []):
            for new_id in format.get('identifiers', []):
                t = new_id['type']
                v = new_id['value']
                orig_v = v
                type_key = None
                if t == 'ASIN':
                    type_key = Identifier.ASIN
                elif t == 'ISBN':
                    type_key = Identifier.ISBN
                    if len(v) == 10:
                        v = isbnlib.to_isbn13(v)
                    if v is None or not isbnlib.is_isbn13(v):
                        # Overdrive sometimes uses invalid values like
                        # "n/a" as placeholders. Ignore such values to
                        # avoid a situation where hundreds of books
                        # appear to have the same ISBN. ISBNs that fail
                        # the check-digit test or are otherwise invalid
                        # also occur; log them for review.
                        cls.log.info("Bad ISBN value provided: %s", orig_v)
                        continue
                elif t == 'DOI':
                    type_key = Identifier.DOI
                elif t == 'UPC':
                    type_key = Identifier.UPC
                elif t == 'PublisherCatalogNumber':
                    continue
                if type_key and v:
                    identifiers.append(IdentifierData(type_key, v, 1))

            # Samples become links.
            if 'samples' in format:
                if format['id'] not in cls.format_data_for_overdrive_format:
                    # Useless to us.
                    continue
                content_type, drm_scheme = cls.format_data_for_overdrive_format.get(
                    format['id'])
                if Representation.is_media_type(content_type):
                    for sample_info in format['samples']:
                        href = sample_info['url']
                        links.append(
                            LinkData(rel=Hyperlink.SAMPLE,
                                     href=href,
                                     media_type=content_type))

        # A cover and its thumbnail become a single LinkData.
        if 'images' in book:
            images = book['images']
            image_data = cls.image_link_to_linkdata(
                images.get('cover'), Hyperlink.IMAGE)
            for name in ['cover300Wide', 'cover150Wide', 'thumbnail']:
                # Try to get a thumbnail that's as close as possible
                # to the size we use.
                image = images.get(name)
                thumbnail_data = cls.image_link_to_linkdata(
                    image, Hyperlink.THUMBNAIL_IMAGE)
                if not image_data:
                    image_data = cls.image_link_to_linkdata(
                        image, Hyperlink.IMAGE)
                if thumbnail_data:
                    break

            if image_data:
                if thumbnail_data:
                    image_data.thumbnail = thumbnail_data
                links.append(image_data)

        # Descriptions become links.
        short = book.get('shortDescription')
        full = book.get('fullDescription')
        if full:
            links.append(
                LinkData(
                    rel=Hyperlink.DESCRIPTION,
                    content=full,
                    media_type="text/html",
                ))

        if short and (not full or not full.startswith(short)):
            links.append(
                LinkData(
                    rel=Hyperlink.SHORT_DESCRIPTION,
                    content=short,
                    media_type="text/html",
                ))

        # Add measurements: rating and popularity.
        if book.get('starRating') is not None and book['starRating'] > 0:
            measurements.append(
                MeasurementData(quantity_measured=Measurement.RATING,
                                value=book['starRating']))

        if book.get('popularity'):
            measurements.append(
                MeasurementData(quantity_measured=Measurement.POPULARITY,
                                value=book['popularity']))

        metadata = Metadata(
            data_source=DataSource.OVERDRIVE,
            title=title,
            subtitle=subtitle,
            sort_title=sort_title,
            language=language,
            medium=medium,
            series=series,
            publisher=publisher,
            imprint=imprint,
            published=published,
            primary_identifier=primary_identifier,
            identifiers=identifiers,
            subjects=subjects,
            contributors=contributors,
            measurements=measurements,
            links=links,
        )
    else:
        metadata = Metadata(
            data_source=DataSource.OVERDRIVE,
            primary_identifier=primary_identifier,
        )

    if include_formats:
        formats = []
        for format in book.get('formats', []):
            format_id = format['id']
            if format_id in cls.format_data_for_overdrive_format:
                content_type, drm_scheme = cls.format_data_for_overdrive_format.get(
                    format_id)
                formats.append(FormatData(content_type, drm_scheme))
            elif format_id not in cls.ignorable_overdrive_formats:
                cls.log.error(
                    "Could not process Overdrive format %s for %s",
                    format_id, overdrive_id)

        # Also make a CirculationData so we can write the formats.
        circulationdata = CirculationData(
            data_source=DataSource.OVERDRIVE,
            primary_identifier=primary_identifier,
            formats=formats,
        )

        metadata.circulation = circulationdata

    return metadata
class ItemListParser(XMLParser):

    DATE_FORMAT = "%Y-%m-%d"
    YEAR_FORMAT = "%Y"

    NAMESPACES = {}

    def parse(self, xml):
        for i in self.process_all(xml, "//Item"):
            yield i

    parenthetical = re.compile(r" \([^)]+\)$")

    @classmethod
    def contributors_from_string(cls, string):
        contributors = []
        if not string:
            return contributors

        for sort_name in string.split(';'):
            sort_name = cls.parenthetical.sub("", sort_name.strip())
            contributors.append(
                ContributorData(
                    sort_name=sort_name.strip(),
                    roles=[Contributor.AUTHOR_ROLE]
                )
            )
        return contributors

    @classmethod
    def parse_genre_string(cls, s):
        genres = []
        if not s:
            return genres
        for i in s.split(","):
            i = i.strip()
            if not i:
                continue
            # Undo the HTML entity escaping in the genre string.
            i = i.replace("&amp;amp;", "&amp;").replace("&amp;", "&").replace("&#39;", "'")
            genres.append(SubjectData(Subject.THREEM, i, weight=15))
        return genres

    def process_one(self, tag, namespaces):
        """Turn an <Item> tag into a Metadata object with an encompassed
        CirculationData object, and return the Metadata."""

        def value(threem_key):
            return self.text_of_optional_subtag(tag, threem_key)

        links = dict()
        identifiers = dict()
        subjects = []

        primary_identifier = IdentifierData(
            Identifier.THREEM_ID, value("ItemId")
        )

        identifiers = []
        for key in ('ISBN13', 'PhysicalISBN'):
            v = value(key)
            if v:
                identifiers.append(
                    IdentifierData(Identifier.ISBN, v)
                )

        subjects = self.parse_genre_string(value("Genre"))

        title = value("Title")
        subtitle = value("SubTitle")
        publisher = value("Publisher")
        language = value("Language")

        contributors = list(self.contributors_from_string(value('Authors')))

        published_date = None
        published = value("PubDate")
        if published:
            formats = [self.DATE_FORMAT, self.YEAR_FORMAT]
        else:
            published = value("PubYear")
            formats = [self.YEAR_FORMAT]

        for format in formats:
            try:
                published_date = datetime.strptime(published, format)
            except ValueError:
                pass

        links = []
        description = value("Description")
        if description:
            links.append(
                LinkData(rel=Hyperlink.DESCRIPTION, content=description)
            )

        cover_url = value("CoverLinkURL").replace("&amp;", "&")
        links.append(LinkData(rel=Hyperlink.IMAGE, href=cover_url))

        alternate_url = value("BookLinkURL").replace("&amp;", "&")
        links.append(LinkData(rel='alternate', href=alternate_url))

        measurements = []
        pages = value("NumberOfPages")
        if pages:
            pages = int(pages)
            measurements.append(
                MeasurementData(quantity_measured=Measurement.PAGE_COUNT,
                                value=pages)
            )

        medium = Edition.BOOK_MEDIUM

        book_format = value("BookFormat")
        format = None
        if book_format == 'EPUB':
            format = FormatData(
                content_type=Representation.EPUB_MEDIA_TYPE,
                drm_scheme=DeliveryMechanism.ADOBE_DRM
            )
        elif book_format == 'PDF':
            format = FormatData(
                content_type=Representation.PDF_MEDIA_TYPE,
                drm_scheme=DeliveryMechanism.ADOBE_DRM
            )
        elif book_format == 'MP3':
            format = FormatData(
                content_type=Representation.MP3_MEDIA_TYPE,
                drm_scheme=DeliveryMechanism.ADOBE_DRM
            )
            medium = Edition.AUDIO_MEDIUM

        formats = [format]

        metadata = Metadata(
            data_source=DataSource.THREEM,
            title=title,
            subtitle=subtitle,
            language=language,
            medium=medium,
            publisher=publisher,
            published=published_date,
            primary_identifier=primary_identifier,
            identifiers=identifiers,
            subjects=subjects,
            contributors=contributors,
            measurements=measurements,
            links=links,
        )

        # Also make a CirculationData so we can write the formats.
        circulationdata = CirculationData(
            data_source=DataSource.THREEM,
            primary_identifier=primary_identifier,
            formats=formats,
            links=links,
        )

        metadata.circulation = circulationdata
        return metadata
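# A minimal sketch, not from the source, exercising the two string helpers
# above. ItemListParser is the class defined above; eq_ is the same
# nose-style assertion used by the tests earlier in this section.
def test_itemlistparser_string_helpers_sketch():
    # The trailing parenthetical role marker is stripped and every name
    # becomes an AUTHOR_ROLE ContributorData.
    contributors = ItemListParser.contributors_from_string(
        "Smith, John (cor); Doe, Jane")
    eq_(["Smith, John", "Doe, Jane"], [c.sort_name for c in contributors])
    eq_([Contributor.AUTHOR_ROLE], contributors[0].roles)

    # Genre strings are comma-separated; each genre becomes a THREEM
    # SubjectData with weight 15.
    [genre] = ItemListParser.parse_genre_string("Science Fiction")
    eq_(Subject.THREEM, genre.type)
    eq_("Science Fiction", genre.identifier)
    eq_(15, genre.weight)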
def extract_bibliographic(self, element, ns):
    """Turn bibliographic metadata into a Metadata object with an
    attached CirculationData object, and return the Metadata."""

    # TODO: These are consistently empty (some are clearly for
    # audiobooks) so I don't know what they do and/or what format
    # they're in.
    #
    # annotation
    # edition
    # narrator
    # runtime

    identifier = self.text_of_subtag(element, 'axis:titleId', ns)
    isbn = self.text_of_optional_subtag(element, 'axis:isbn', ns)

    title = self.text_of_subtag(element, 'axis:productTitle', ns)

    contributor = self.text_of_optional_subtag(element, 'axis:contributor', ns)
    contributors = []
    found_primary_author = False
    if contributor:
        for c in self.parse_list(contributor):
            contributor = self.parse_contributor(c, found_primary_author)
            if Contributor.PRIMARY_AUTHOR_ROLE in contributor.roles:
                found_primary_author = True
            contributors.append(contributor)

    subject = self.text_of_optional_subtag(element, 'axis:subject', ns)
    subjects = []
    if subject:
        for subject_identifier in self.parse_list(subject):
            subjects.append(
                SubjectData(type=Subject.BISAC,
                            identifier=subject_identifier,
                            weight=1))

    publication_date = self.text_of_optional_subtag(
        element, 'axis:publicationDate', ns)
    if publication_date:
        publication_date = datetime.datetime.strptime(
            publication_date, self.SHORT_DATE_FORMAT)

    series = self.text_of_optional_subtag(element, 'axis:series', ns)
    publisher = self.text_of_optional_subtag(element, 'axis:publisher', ns)
    imprint = self.text_of_optional_subtag(element, 'axis:imprint', ns)

    audience = self.text_of_optional_subtag(element, 'axis:audience', ns)
    if audience:
        subjects.append(
            SubjectData(
                type=Subject.THETA_AUDIENCE,
                identifier=audience,
                weight=1,
            ))

    language = self.text_of_subtag(element, 'axis:language', ns)

    # We don't use this for anything.
    # file_size = self.int_of_optional_subtag(element, 'theta:fileSize', ns)

    primary_identifier = IdentifierData(Identifier.THETA_ID, identifier)
    identifiers = []
    if isbn:
        identifiers.append(IdentifierData(Identifier.ISBN, isbn))

    formats = []
    acceptable = False
    seen_formats = []
    for format_tag in self._xpath(
            element,
            'axis:availability/axis:availableFormats/axis:formatName',
            ns):
        informal_name = format_tag.text
        seen_formats.append(informal_name)
        if informal_name not in self.DELIVERY_DATA_FOR_THETA_FORMAT:
            self.log.warn("Unrecognized Theta format name for %s: %s" % (
                identifier, informal_name))
        elif self.DELIVERY_DATA_FOR_THETA_FORMAT.get(informal_name):
            content_type, drm_scheme = self.DELIVERY_DATA_FOR_THETA_FORMAT[
                informal_name]
            formats.append(
                FormatData(content_type=content_type, drm_scheme=drm_scheme))

    if not formats:
        self.log.error("No supported format for %s (%s)! Saw: %s",
                       identifier, title, ", ".join(seen_formats))

    metadata = Metadata(
        data_source=DataSource.THETA,
        title=title,
        language=language,
        medium=Edition.BOOK_MEDIUM,
        series=series,
        publisher=publisher,
        imprint=imprint,
        published=publication_date,
        primary_identifier=primary_identifier,
        identifiers=identifiers,
        subjects=subjects,
        contributors=contributors,
    )

    circulationdata = CirculationData(
        data_source=DataSource.THETA,
        primary_identifier=primary_identifier,
        formats=formats,
    )

    metadata.circulation = circulationdata
    return metadata
def set_format(cls, format_received, formats):
    content_type, drm_scheme = cls.format_data_for_odilo_format.get(
        format_received)
    formats.append(FormatData(content_type, drm_scheme))
    return cls.odilo_medium_to_simplified_medium.get(format_received)
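# A minimal sketch, not from the source, of a guarded call to set_format:
# .get() returns None for an unrecognized format, which would make the tuple
# unpack above raise a TypeError, so a caller may want to check membership
# first. extractor_cls is assumed to be whichever class defines set_format
# and the two lookup dictionaries; it is not a name from the original code.
def odilo_formats_sketch(extractor_cls, format_received):
    formats = []
    medium = Edition.BOOK_MEDIUM
    if format_received in extractor_cls.format_data_for_odilo_format:
        medium = extractor_cls.set_format(format_received, formats)
    return medium, formats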
def isbn_info_to_metadata(cls, book, include_bibliographic=True,
                          include_formats=True):
    """Turn OneClick's JSON representation of a book into a Metadata object.

    Assumes the JSON is in the format that comes from the media/{isbn} endpoint.

    TODO: Use the seriesTotal field.

    :param book: a json response-derived dictionary of book attributes
    """
    if 'isbn' not in book:
        return None
    oneclick_id = book['isbn']
    primary_identifier = IdentifierData(Identifier.ONECLICK_ID, oneclick_id)

    metadata = Metadata(
        data_source=DataSource.ONECLICK,
        primary_identifier=primary_identifier,
    )

    if include_bibliographic:
        title = book.get('title', None)
        # NOTE: An item that's part of a series will have the seriesName
        # field, and will have its seriesPosition and seriesTotal fields
        # set to >0. An item not part of a series will have the
        # seriesPosition and seriesTotal fields set to 0, and will not
        # have a seriesName at all.
        # Sometimes, series position and total == 0 for many series items
        # (ex: "seriesName": "EngLits").
        # Sometimes, seriesName is set to "Default Blank", meaning "not
        # actually a series".
        series_name = book.get('seriesName', None)

        series_position = book.get('seriesPosition', None)
        if series_position:
            try:
                series_position = int(series_position)
            except ValueError:
                # Not a big enough deal to stop the whole process.
                series_position = None

        # ignored for now
        series_total = book.get('seriesTotal', None)

        # ignored for now
        has_digital_rights = book.get('hasDigitalRights', None)

        publisher = book.get('publisher', None)

        if 'publicationDate' in book:
            published = datetime.datetime.strptime(
                book['publicationDate'][:10], cls.DATE_FORMAT)
        else:
            published = None

        if 'language' in book:
            language = LanguageCodes.string_to_alpha_3(book['language'])
        else:
            language = 'eng'

        contributors = []
        if 'authors' in book:
            authors = book['authors']
            for author in authors.split(";"):
                sort_name = author.strip()
                if sort_name:
                    sort_name = name_tidy(sort_name)
                    display_name = sort_name_to_display_name(sort_name)
                    roles = [Contributor.AUTHOR_ROLE]
                    contributor = ContributorData(
                        sort_name=sort_name,
                        display_name=display_name,
                        roles=roles)
                    contributors.append(contributor)

        if 'narrators' in book:
            narrators = book['narrators']
            for narrator in narrators.split(";"):
                sort_name = narrator.strip()
                if sort_name:
                    sort_name = name_tidy(sort_name)
                    display_name = sort_name_to_display_name(sort_name)
                    roles = [Contributor.NARRATOR_ROLE]
                    contributor = ContributorData(
                        sort_name=sort_name,
                        display_name=display_name,
                        roles=roles)
                    contributors.append(contributor)

        subjects = []
        if 'genres' in book:
            # example: "FICTION / Humorous / General"
            genres = book['genres']
            subject = SubjectData(type=Subject.BISAC,
                                  identifier=genres,
                                  weight=100)
            subjects.append(subject)

        if 'primaryGenre' in book:
            # example: "humorous-fiction,mystery,womens-fiction"
            genres = book['primaryGenre']
            for genre in genres.split(","):
                subject = SubjectData(type=Subject.ONECLICK,
                                      identifier=genre.strip(),
                                      weight=100)
                subjects.append(subject)

        # Audience options are: adult, beginning-reader, childrens,
        # young-adult.
        # NOTE: In OneClick metadata, audience can be set to "Adult" while
        # the publisher is "HarperTeen".
        audience = book.get('audience', None)
        if audience:
            subject = SubjectData(type=Subject.ONECLICK_AUDIENCE,
                                  identifier=audience.strip().lower(),
                                  weight=10)
            subjects.append(subject)

        # options are: "eBook", "eAudio"
        oneclick_medium = book.get('mediaType', None)
        if oneclick_medium and oneclick_medium not in cls.oneclick_medium_to_simplified_medium:
            cls.log.error("Could not process medium %s for %s",
                          oneclick_medium, oneclick_id)

        medium = cls.oneclick_medium_to_simplified_medium.get(
            oneclick_medium, Edition.BOOK_MEDIUM)

        # Passed to metadata.apply, the isbn_identifier will create an
        # equivalency between the OneClick-labeled and the ISBN-labeled
        # identifier rows, which will in turn allow us to ask the
        # MetadataWrangler for more info about the book.
        isbn_identifier = IdentifierData(Identifier.ISBN, oneclick_id)
        identifiers = [primary_identifier, isbn_identifier]

        links = []
        # A cover and its thumbnail become a single LinkData.
        # Images come in small (ex: 71x108px), medium (ex: 95x140px),
        # and large (ex: 128x192px) sizes.
        if 'images' in book:
            images = book['images']
            # Guard against any of the sizes being missing from the feed.
            image_data = thumbnail_data = thumbnail_data_backup = None
            for image in images:
                if image['name'] == "large":
                    image_data = cls.image_link_to_linkdata(
                        image['url'], Hyperlink.IMAGE)
                if image['name'] == "medium":
                    thumbnail_data = cls.image_link_to_linkdata(
                        image['url'], Hyperlink.THUMBNAIL_IMAGE)
                if image['name'] == "small":
                    thumbnail_data_backup = cls.image_link_to_linkdata(
                        image['url'], Hyperlink.THUMBNAIL_IMAGE)

            if not thumbnail_data and thumbnail_data_backup:
                thumbnail_data = thumbnail_data_backup

            if image_data:
                if thumbnail_data:
                    image_data.thumbnail = thumbnail_data
                links.append(image_data)

        # Descriptions become links.
        description = book.get('description', None)
        if description:
            links.append(
                LinkData(
                    # There can be fuller descriptions in the search
                    # endpoint output.
                    rel=Hyperlink.SHORT_DESCRIPTION,
                    content=description,
                    media_type="text/html",
                ))

        metadata.title = title
        metadata.language = language
        metadata.medium = medium
        metadata.series = series_name
        metadata.series_position = series_position
        metadata.publisher = publisher
        metadata.published = published
        metadata.identifiers = identifiers
        metadata.subjects = subjects
        metadata.contributors = contributors
        metadata.links = links

    if include_formats:
        formats = []
        if metadata.medium == Edition.BOOK_MEDIUM:
            content_type, drm_scheme = cls.oneclick_formats.get(
                "ebook-epub-oneclick")
            formats.append(FormatData(content_type, drm_scheme))
        elif metadata.medium == Edition.AUDIO_MEDIUM:
            content_type, drm_scheme = cls.oneclick_formats.get(
                "audiobook-mp3-oneclick")
            formats.append(FormatData(content_type, drm_scheme))
        else:
            cls.log.warn("Unfamiliar medium %s for %s; no formats added.",
                         metadata.medium, oneclick_id)

        # Make a CirculationData so we can write the formats.
        circulationdata = CirculationData(
            data_source=DataSource.ONECLICK,
            primary_identifier=primary_identifier,
            formats=formats,
        )

        metadata.circulation = circulationdata

    return metadata