def test_measurements(self):
    # A Metadata carrying a single popularity measurement, once applied
    # to an edition, should leave exactly that measurement on the
    # edition's primary identifier.
    edition = self._edition()
    popularity = MeasurementData(
        quantity_measured=Measurement.POPULARITY,
        value=100,
    )
    Metadata(
        measurements=[popularity],
        data_source=edition.data_source,
    ).apply(edition)

    # Tuple-unpacking fails unless there is exactly one measurement.
    (recorded,) = edition.primary_identifier.measurements
    eq_(Measurement.POPULARITY, recorded.quantity_measured)
    eq_(100, recorded.value)
def extract_measurement(cls, rating_tag):
    """Build a MeasurementData from a schema.org rating tag.

    :param rating_tag: an element-like object whose attributes are read
        through ``.get()`` and ``.attrib.get()``.
    :return: a MeasurementData whose ``quantity_measured`` is the tag's
        ``additionalType`` (``Measurement.RATING`` when absent or empty)
        and whose ``value`` is the rating as a float; None when the tag
        carries no rating value or the value is not numeric.
    """
    value = rating_tag.get('{http://schema.org/}ratingValue')
    if not value:
        # Fall back to the attribute spelled with the namespace lacking
        # its trailing slash.
        value = rating_tag.attrib.get('{http://schema.org}ratingValue')

    try:
        value = float(value)
    except (TypeError, ValueError):
        # TypeError: the attribute is missing altogether (float(None)).
        # ValueError: the attribute is present but not a number.
        return None

    quantity_measured = (
        rating_tag.get('{http://schema.org/}additionalType')
        or Measurement.RATING
    )
    try:
        return MeasurementData(
            quantity_measured=quantity_measured,
            value=value,
        )
    except ValueError:
        # Kept from the original contract: a ValueError raised while
        # building the MeasurementData also results in None.
        return None
def test_metadata_can_be_deepcopied(self):
    # Make sure nothing stored on a Metadata (e.g., self.log) stops it
    # from being deep-copied. Every kind of field is populated so the
    # copy has to walk all of them.
    gutenberg_identifier = IdentifierData(Identifier.GUTENBERG_ID, "1")
    same_identifier_again = IdentifierData(
        type=gutenberg_identifier.type,
        identifier=gutenberg_identifier.identifier,
    )
    unrelated_identifier = IdentifierData(type=u"abc", identifier=u"def")

    circulation = CirculationData(
        data_source=DataSource.GUTENBERG,
        primary_identifier=gutenberg_identifier,
        licenses_owned=0,
        licenses_available=0,
        licenses_reserved=0,
        patrons_in_hold_queue=0,
    )

    original = Metadata(
        DataSource.GUTENBERG,
        subjects=[SubjectData(Subject.TAG, "subject")],
        contributors=[ContributorData()],
        primary_identifier=gutenberg_identifier,
        links=[LinkData(Hyperlink.OPEN_ACCESS_DOWNLOAD, "example.epub")],
        measurements=[MeasurementData(Measurement.RATING, 5)],
        circulation=circulation,
        title="Hello Title",
        subtitle="Subtle Hello",
        sort_title="Sorting Howdy",
        language="US English",
        medium=Edition.BOOK_MEDIUM,
        series="1",
        series_position=1,
        publisher="Hello World Publishing House",
        imprint=u"Follywood",
        issued=datetime.datetime.utcnow(),
        published=datetime.datetime.utcnow(),
        identifiers=[same_identifier_again, unrelated_identifier],
        data_source_last_updated=datetime.datetime.utcnow(),
    )

    duplicate = deepcopy(original)

    # Reaching this point without an exception is the real check.
    assert duplicate is not None
def book_info_to_metadata(cls, book, include_bibliographic=True, include_formats=True):
    """Convert Overdrive's JSON description of a book into a Metadata.

    Note: the JSON handed to this method comes from a different
    file/stream than the JSON that goes into the
    book_info_to_circulation() method.

    :param book: dict-like parsed JSON describing one title.
    :param include_bibliographic: when true, fill in title, contributors,
        subjects, identifiers, links and measurements; otherwise the
        Metadata carries only its data source and primary identifier.
    :param include_formats: when true, attach a CirculationData listing
        the formats this class knows how to handle.
    :return: a Metadata, or None when `book` has no 'id'.
    """
    if 'id' not in book:
        return None

    overdrive_id = book['id']
    primary_identifier = IdentifierData(Identifier.OVERDRIVE_ID, overdrive_id)

    if not include_bibliographic:
        metadata = Metadata(
            data_source=DataSource.OVERDRIVE,
            primary_identifier=primary_identifier,
        )
    else:
        title = book.get('title', None)
        sort_title = book.get('sortTitle')
        subtitle = book.get('subtitle', None)
        series = book.get('series', None)
        publisher = book.get('publisher', None)
        imprint = book.get('imprint', None)

        # Only the leading YYYY-MM-DD part of the publish date is parsed.
        published = None
        if 'publishDate' in book:
            published = datetime.datetime.strptime(
                book['publishDate'][:10], cls.DATE_FORMAT)

        # Prefer English; otherwise take the alphabetically first code.
        language_codes = [entry['code'] for entry in book.get('languages', [])]
        if not language_codes or 'eng' in language_codes:
            language = 'eng'
        else:
            language = sorted(language_codes)[0]

        contributors = []
        for creator in book.get('creators', []):
            contributors.append(
                ContributorData(
                    sort_name=creator['fileAs'],
                    display_name=creator['name'],
                    roles=(cls.parse_roles(overdrive_id, creator['role'])
                           or [Contributor.UNKNOWN_ROLE]),
                    biography=creator.get('bioText', None),
                )
            )

        subjects = [
            SubjectData(type=Subject.OVERDRIVE, identifier=entry['value'],
                        weight=100)
            for entry in book.get('subjects', [])
        ]
        subjects.extend(
            SubjectData(type=Subject.TAG, identifier=entry['value'], weight=1)
            for entry in book.get('keywords', [])
        )

        # n.b. Grade levels are measurements of reading level, not
        # age appropriateness. We can use them as a measure of age
        # appropriateness in a pinch, but we weight them less
        # heavily than other information from Overdrive.
        subjects.extend(
            SubjectData(type=Subject.GRADE_LEVEL, identifier=entry['value'],
                        weight=10)
            for entry in book.get('grade_levels', [])
        )

        overdrive_medium = book.get('mediaType', None)
        known_media = cls.overdrive_medium_to_simplified_medium
        if overdrive_medium and overdrive_medium not in known_media:
            cls.log.error("Could not process medium %s for %s",
                          overdrive_medium, overdrive_id)
        medium = known_media.get(overdrive_medium, Edition.BOOK_MEDIUM)

        measurements = []
        if 'awards' in book:
            awards = book.get('awards', [])
            measurements.append(
                MeasurementData(Measurement.AWARDS, str(len(awards))))

        score_subject_types = (
            ('ATOS', Subject.ATOS_SCORE),
            ('lexileScore', Subject.LEXILE_SCORE),
            ('interestLevel', Subject.INTEREST_LEVEL),
        )
        for book_key, subject_type in score_subject_types:
            if book_key in book:
                subjects.append(
                    SubjectData(type=subject_type,
                                identifier=str(book[book_key]),
                                weight=100))

        subjects.extend(
            SubjectData(type=Subject.GRADE_LEVEL,
                        identifier=level_info.get('value'),
                        weight=100)
            for level_info in book.get('gradeLevels', [])
        )

        identifiers = []
        links = []
        for overdrive_format in book.get('formats', []):
            for candidate in overdrive_format.get('identifiers', []):
                id_type = candidate['type']
                id_value = candidate['value']
                raw_value = id_value
                type_key = None
                if id_type == 'ISBN':
                    type_key = Identifier.ISBN
                    if len(id_value) == 10:
                        id_value = isbnlib.to_isbn13(id_value)
                    if id_value is None or not isbnlib.is_isbn13(id_value):
                        # Overdrive sometimes uses invalid values
                        # like "n/a" as placeholders. Ignore such
                        # values to avoid a situation where hundreds of
                        # books appear to have the same ISBN. ISBNs
                        # which fail check digit checks or are invalid
                        # also can occur. Log them for review.
                        cls.log.info("Bad ISBN value provided: %s", raw_value)
                        continue
                elif id_type == 'ASIN':
                    type_key = Identifier.ASIN
                elif id_type == 'DOI':
                    type_key = Identifier.DOI
                elif id_type == 'UPC':
                    type_key = Identifier.UPC
                elif id_type == 'PublisherCatalogNumber':
                    continue
                if type_key and id_value:
                    identifiers.append(IdentifierData(type_key, id_value, 1))

            # Samples become links.
            if 'samples' not in overdrive_format:
                continue
            sample_format_id = overdrive_format['id']
            if sample_format_id not in cls.format_data_for_overdrive_format:
                # Useless to us.
                continue
            content_type, drm_scheme = cls.format_data_for_overdrive_format.get(
                sample_format_id)
            if Representation.is_media_type(content_type):
                links.extend(
                    LinkData(rel=Hyperlink.SAMPLE, href=sample['url'],
                             media_type=content_type)
                    for sample in overdrive_format['samples']
                )

        # A cover and its thumbnail become a single LinkData.
        if 'images' in book:
            images = book['images']
            cover_link = cls.image_link_to_linkdata(
                images.get('cover'), Hyperlink.IMAGE)
            for size_key in ('cover300Wide', 'cover150Wide', 'thumbnail'):
                # Try to get a thumbnail that's as close as possible
                # to the size we use.
                sized_image = images.get(size_key)
                thumbnail_link = cls.image_link_to_linkdata(
                    sized_image, Hyperlink.THUMBNAIL_IMAGE)
                if not cover_link:
                    # No full-size cover yet: let this image stand in.
                    cover_link = cls.image_link_to_linkdata(
                        sized_image, Hyperlink.IMAGE)
                if thumbnail_link:
                    break

            if cover_link:
                if thumbnail_link:
                    cover_link.thumbnail = thumbnail_link
                links.append(cover_link)

        # Descriptions become links.
        short_description = book.get('shortDescription')
        full_description = book.get('fullDescription')
        if full_description:
            links.append(
                LinkData(
                    rel=Hyperlink.DESCRIPTION,
                    content=full_description,
                    media_type="text/html",
                ))

        # Skip the short description when it is merely the opening of
        # the full one.
        short_is_prefix = (full_description
                           and full_description.startswith(short_description or ''))
        if short_description and not short_is_prefix:
            links.append(
                LinkData(
                    rel=Hyperlink.SHORT_DESCRIPTION,
                    content=short_description,
                    media_type="text/html",
                ))

        # Add measurements: rating and popularity
        star_rating = book.get('starRating')
        if star_rating is not None and star_rating > 0:
            measurements.append(
                MeasurementData(quantity_measured=Measurement.RATING,
                                value=star_rating))

        popularity = book.get('popularity')
        if popularity:
            measurements.append(
                MeasurementData(quantity_measured=Measurement.POPULARITY,
                                value=popularity))

        metadata = Metadata(
            data_source=DataSource.OVERDRIVE,
            title=title,
            subtitle=subtitle,
            sort_title=sort_title,
            language=language,
            medium=medium,
            series=series,
            publisher=publisher,
            imprint=imprint,
            published=published,
            primary_identifier=primary_identifier,
            identifiers=identifiers,
            subjects=subjects,
            contributors=contributors,
            measurements=measurements,
            links=links,
        )

    if include_formats:
        known_formats = cls.format_data_for_overdrive_format
        formats = []
        for overdrive_format in book.get('formats', []):
            format_id = overdrive_format['id']
            if format_id in known_formats:
                content_type, drm_scheme = known_formats.get(format_id)
                formats.append(FormatData(content_type, drm_scheme))
            elif format_id not in cls.ignorable_overdrive_formats:
                cls.log.error(
                    "Could not process Overdrive format %s for %s",
                    format_id, overdrive_id)

        # Also make a CirculationData so we can write the formats,
        metadata.circulation = CirculationData(
            data_source=DataSource.OVERDRIVE,
            primary_identifier=primary_identifier,
            formats=formats,
        )

    return metadata
class ItemListParser(XMLParser):
    """Turn the <Item> tags of an item-list XML document into Metadata
    objects (data source DataSource.THREEM), each carrying a
    CirculationData that describes the item's format.
    """

    DATE_FORMAT = "%Y-%m-%d"
    YEAR_FORMAT = "%Y"

    NAMESPACES = {}

    # A trailing parenthetical such as " (Editor)" at the end of a
    # contributor's name. Raw string so `\(` is a regex escape rather
    # than an (invalid) string escape.
    parenthetical = re.compile(r" \([^)]+\)$")

    def parse(self, xml):
        """Yield the result of processing every //Item tag in `xml`."""
        for i in self.process_all(xml, "//Item"):
            yield i

    @classmethod
    def contributors_from_string(cls, string):
        """Split a semicolon-separated author string into ContributorData.

        Each name has surrounding whitespace and one trailing
        parenthetical removed, and is given the author role.

        :param string: e.g. "Doe, Jane; Roe, Richard (Editor)"; may be
            None or empty.
        :return: a list of ContributorData (empty for empty input).
        """
        if not string:
            return []
        return [
            ContributorData(
                sort_name=cls.parenthetical.sub("", name.strip()).strip(),
                roles=[Contributor.AUTHOR_ROLE],
            )
            for name in string.split(';')
        ]

    @classmethod
    def parse_genre_string(cls, s):
        """Split a comma-separated genre string into SubjectData objects.

        :param s: the genre string; may be None or empty.
        :return: a list of SubjectData of type Subject.THREEM with
            weight 15, one per non-blank genre.
        """
        genres = []
        if not s:
            return genres
        for genre in s.split(","):
            genre = genre.strip()
            if not genre:
                continue
            # Undo entity escaping left in genre names, including
            # double-escaped ampersands. The second and third
            # replacements were no-ops as previously written.
            genre = (
                genre.replace("&amp;amp;", "&amp;")
                .replace("&amp;", "&")
                .replace("&#39;", "'")
            )
            genres.append(SubjectData(Subject.THREEM, genre, weight=15))
        return genres

    def process_one(self, tag, namespaces):
        """Turn an <item> tag into a Metadata and an encompassed
        CirculationData objects, and return the Metadata.

        :param tag: the <Item> element to read subtags from.
        :param namespaces: unused here; accepted for interface
            compatibility with the parser that calls this method.
        :return: a Metadata whose `circulation` is a CirculationData.
        """
        def value(threem_key):
            # presumably None when the subtag is absent -- the guards
            # below rely on that; verify against
            # text_of_optional_subtag.
            return self.text_of_optional_subtag(tag, threem_key)

        primary_identifier = IdentifierData(
            Identifier.THREEM_ID, value("ItemId")
        )

        identifiers = []
        for key in ('ISBN13', 'PhysicalISBN'):
            isbn = value(key)
            if isbn:
                identifiers.append(IdentifierData(Identifier.ISBN, isbn))

        subjects = self.parse_genre_string(value("Genre"))

        title = value("Title")
        subtitle = value("SubTitle")
        publisher = value("Publisher")
        language = value("Language")

        contributors = list(self.contributors_from_string(value('Authors')))

        # Prefer the full publication date; fall back to the bare year.
        published = value("PubDate")
        if published:
            date_formats = [self.DATE_FORMAT, self.YEAR_FORMAT]
        else:
            published = value("PubYear")
            date_formats = [self.YEAR_FORMAT]

        published_date = None
        if published:
            # Without this guard a missing PubDate *and* PubYear made
            # strptime() raise TypeError on None.
            for date_format in date_formats:
                try:
                    published_date = datetime.strptime(published, date_format)
                except ValueError:
                    continue
                break

        links = []
        description = value("Description")
        if description:
            links.append(
                LinkData(rel=Hyperlink.DESCRIPTION, content=description)
            )

        # A missing URL subtag is skipped rather than crashing on
        # None.replace().
        cover_url = value("CoverLinkURL")
        if cover_url is not None:
            links.append(
                LinkData(rel=Hyperlink.IMAGE,
                         href=cover_url.replace("&amp;", "&"))
            )

        alternate_url = value("BookLinkURL")
        if alternate_url is not None:
            links.append(
                LinkData(rel='alternate',
                         href=alternate_url.replace("&amp;", "&"))
            )

        measurements = []
        pages = value("NumberOfPages")
        if pages:
            measurements.append(
                MeasurementData(quantity_measured=Measurement.PAGE_COUNT,
                                value=int(pages))
            )

        medium = Edition.BOOK_MEDIUM
        book_format = value("BookFormat")
        format_data = None
        if book_format == 'EPUB':
            format_data = FormatData(
                content_type=Representation.EPUB_MEDIA_TYPE,
                drm_scheme=DeliveryMechanism.ADOBE_DRM
            )
        elif book_format == 'PDF':
            format_data = FormatData(
                content_type=Representation.PDF_MEDIA_TYPE,
                drm_scheme=DeliveryMechanism.ADOBE_DRM
            )
        elif book_format == 'MP3':
            format_data = FormatData(
                content_type=Representation.MP3_MEDIA_TYPE,
                drm_scheme=DeliveryMechanism.ADOBE_DRM
            )
            medium = Edition.AUDIO_MEDIUM

        # NOTE(review): for an unrecognized BookFormat this is [None],
        # exactly as before -- confirm CirculationData tolerates that.
        formats = [format_data]

        metadata = Metadata(
            data_source=DataSource.THREEM,
            title=title,
            subtitle=subtitle,
            language=language,
            medium=medium,
            publisher=publisher,
            published=published_date,
            primary_identifier=primary_identifier,
            identifiers=identifiers,
            subjects=subjects,
            contributors=contributors,
            measurements=measurements,
            links=links,
        )

        # Also make a CirculationData so we can write the formats,
        circulationdata = CirculationData(
            data_source=DataSource.THREEM,
            primary_identifier=primary_identifier,
            formats=formats,
            links=links,
        )

        metadata.circulation = circulationdata
        return metadata