def test_circulationdata_can_be_deepcopied(self):
    """Verify that a fully-populated CirculationData survives deepcopy.

    If something uncopyable (e.g. self.log) were stored on the object,
    deepcopy would raise.
    """
    # Ancillary objects of every kind that might end up referenced
    # by the CirculationData.
    subject = SubjectData(Subject.TAG, "subject")
    contributor = ContributorData()
    primary = IdentifierData(Identifier.GUTENBERG_ID, "1")
    download_link = LinkData(Hyperlink.OPEN_ACCESS_DOWNLOAD, "example.epub")
    epub_format = FormatData(
        Representation.EPUB_MEDIA_TYPE, DeliveryMechanism.NO_DRM
    )

    data = CirculationData(
        DataSource.GUTENBERG,
        primary_identifier=primary,
        links=[download_link],
        licenses_owned=5,
        licenses_available=5,
        licenses_reserved=None,
        patrons_in_hold_queue=None,
        formats=[epub_format],
        default_rights_uri=RightsStatus.GENERIC_OPEN_ACCESS,
    )

    # deepcopy raising would fail the test; a non-None copy means success.
    assert deepcopy(data) is not None
def test_update_contributions(self):
    """Metadata.update_contributions(replace=True) swaps out an
    edition's existing contributors for the ones in the Metadata."""
    edition = self._edition()

    # The test edition starts out with a single stand-in contributor,
    # who is about to be destroyed and replaced by new data.
    [original] = edition.contributors

    replacement = ContributorData(
        display_name="Robert Jordan",
        sort_name="Jordan, Robert",
        wikipedia_name="Robert_Jordan",
        viaf="79096089",
        lc="123",
        roles=[Contributor.PRIMARY_AUTHOR_ROLE],
    )
    metadata = Metadata(DataSource.OVERDRIVE, contributors=[replacement])
    metadata.update_contributions(self._db, edition, replace=True)

    # The edition now has exactly one contributor, and it is not the
    # one it started with.
    [current] = edition.contributors
    assert current != original

    # Every field supplied by the Metadata made it onto the new
    # Contributor record.
    expected = dict(
        sort_name="Jordan, Robert",
        display_name="Robert Jordan",
        viaf="79096089",
        lc="123",
        wikipedia_name="Robert_Jordan",
    )
    for field, value in expected.items():
        eq_(value, getattr(current, field))
def test_from_contribution(self):
    """ContributorData.from_contribution copies every field from the
    underlying Contributor."""
    # Make an edition with a single author, then flesh out that
    # author's record.
    edition, pool = self._edition(
        with_license_pool=True, authors=["PrimaryAuthor"]
    )
    contribution = edition.contributions[0]
    source = contribution.contributor
    source.lc = "1234567"
    source.viaf = "ABC123"
    source.aliases = ["Primo"]
    source.display_name = "Test Author For The Win"
    source.family_name = "TestAuttie"
    source.wikipedia_name = "TestWikiAuth"
    source.biography = "He was born on Main Street."

    data = ContributorData.from_contribution(contribution)

    # Every field on the ContributorData mirrors the Contributor.
    for field in ('lc', 'viaf', 'aliases', 'display_name',
                  'family_name', 'wikipedia_name', 'biography'):
        eq_(getattr(data, field), getattr(source, field))
def extract_contributor(cls, parser, author_tag):
    """Turn an <atom:author> tag into a ContributorData object.

    Returns None (after logging) when the tag carries no identifying
    name at all.
    """
    subtag = parser.text_of_optional_subtag
    sort_name = subtag(author_tag, 'simplified:sort_name')
    display_name = subtag(author_tag, 'atom:name')
    family_name = subtag(author_tag, "simplified:family_name")
    wikipedia_name = subtag(author_tag, "simplified:wikipedia_name")

    # TODO: we need a way of conveying roles. I believe Bibframe
    # has the answer.

    # TODO: Also collect VIAF and LC numbers if present. This
    # requires parsing the URIs. Only the metadata wrangler will
    # provide this information.
    viaf = None

    # Without at least one identifying name there is nothing useful
    # to record.
    if not (sort_name or display_name or viaf):
        logging.info("Refusing to create ContributorData for contributor with no sort name, display name, or VIAF.")
        return None

    return ContributorData(
        sort_name=sort_name,
        display_name=display_name,
        family_name=family_name,
        wikipedia_name=wikipedia_name,
        roles=None,
    )
def contributors_from_string(cls, string, role=Contributor.AUTHOR_ROLE):
    """Parse a semicolon-separated list of names into ContributorData
    objects, each assigned the given role.

    Parenthetical annotations (per cls.parenthetical) are stripped
    from each name. An empty or missing string yields an empty list.
    """
    if not string:
        return []
    cleaned = (
        cls.parenthetical.sub("", raw.strip()).strip()
        for raw in string.split(';')
    )
    return [
        ContributorData(sort_name=name, roles=[role]) for name in cleaned
    ]
def test_apply(self):
    """ContributorData.apply copies every field onto an existing
    Contributor and reports whether anything actually changed."""
    target, made_new = self._contributor(
        sort_name="Doe, John", viaf="viaf12345"
    )

    extra = {Contributor.BIRTH_DATE: '2001-01-01'}
    data = ContributorData(
        sort_name="Doerr, John",
        lc="1234567",
        viaf="ABC123",
        aliases=["Primo"],
        display_name="Test Author For The Win",
        family_name="TestAuttie",
        wikipedia_name="TestWikiAuth",
        biography="He was born on Main Street.",
        extra=extra,
    )

    updated, changed = data.apply(target)

    # The first application reports a change and copies every field.
    eq_(changed, True)
    eq_(updated.sort_name, u"Doerr, John")
    eq_(updated.lc, u"1234567")
    eq_(updated.viaf, u"ABC123")
    eq_(updated.aliases, [u"Primo"])
    eq_(updated.display_name, u"Test Author For The Win")
    eq_(updated.family_name, u"TestAuttie")
    eq_(updated.wikipedia_name, u"TestWikiAuth")
    eq_(updated.biography, u"He was born on Main Street.")
    eq_(updated.extra[Contributor.BIRTH_DATE], u"2001-01-01")

    # Applying the same data a second time is a no-op.
    updated, changed = data.apply(updated)
    eq_(changed, False)
def parse_contributor(cls, author, primary_author_found=False):
    """Parse a contributor string such as "Walt Disney Pictures (COR)".

    The optional trailing three-letter code names the contributor's
    role; without one, the first contributor seen is treated as the
    primary author.

    :param author: The string to parse.
    :param primary_author_found: If False, a contributor with no
        three-letter code becomes the primary author; if True, a
        regular author.
    :return: A ContributorData with the parsed name and role.
    """
    if primary_author_found:
        default_author_role = Contributor.AUTHOR_ROLE
    else:
        default_author_role = Contributor.PRIMARY_AUTHOR_ROLE
    role = default_author_role
    match = cls.role_abbreviation.search(author)
    if match:
        role_type = match.groups()[0]
        role = cls.role_abbreviation_to_role.get(
            role_type, Contributor.UNKNOWN_ROLE)
        if role is cls.generic_author:
            role = default_author_role
        # Strip the " (XXX)" role code from the name.
        author = author[:-5].strip()
    # BUG FIX: roles must be a list of roles, not a bare string --
    # every other ContributorData caller (and the sibling
    # parse_contributor implementation) passes [role].
    return ContributorData(sort_name=author, roles=[role])
def test_metadata_can_be_deepcopied(self):
    """A Metadata populated with every kind of constituent object can
    be deepcopied -- i.e. nothing uncopyable (e.g. self.log) ended up
    stored on it."""
    primary = IdentifierData(Identifier.GUTENBERG_ID, "1")

    circulation = CirculationData(
        data_source=DataSource.GUTENBERG,
        primary_identifier=primary,
        licenses_owned=0,
        licenses_available=0,
        licenses_reserved=0,
        patrons_in_hold_queue=0,
    )

    primary_as_data = IdentifierData(
        type=primary.type, identifier=primary.identifier
    )
    other_data = IdentifierData(type=u"abc", identifier=u"def")

    metadata = Metadata(
        DataSource.GUTENBERG,
        subjects=[SubjectData(Subject.TAG, "subject")],
        contributors=[ContributorData()],
        primary_identifier=primary,
        links=[LinkData(Hyperlink.OPEN_ACCESS_DOWNLOAD, "example.epub")],
        measurements=[MeasurementData(Measurement.RATING, 5)],
        circulation=circulation,
        title="Hello Title",
        subtitle="Subtle Hello",
        sort_title="Sorting Howdy",
        language="US English",
        medium=Edition.BOOK_MEDIUM,
        series="1",
        series_position=1,
        publisher="Hello World Publishing House",
        imprint=u"Follywood",
        issued=datetime.datetime.utcnow(),
        published=datetime.datetime.utcnow(),
        identifiers=[primary_as_data, other_data],
        data_source_last_updated=datetime.datetime.utcnow(),
    )

    # If deepcopy didn't throw an exception we're ok.
    assert deepcopy(metadata) is not None
def parse_contributor(cls, author, primary_author_found=False, force_role=None):
    """Parse an Axis 360 contributor string.

    The contributor string looks like "Butler, Octavia" or "Walt
    Disney Pictures (COR)" or "Rex, Adam (ILT)". The optional
    three-letter code describes the contributor's role in the book.

    :param author: The string to parse.
    :param primary_author_found: If False, a contributor with no
        three-letter code is treated as the primary author; if True,
        as just a regular author.
    :param force_role: If set, this role is assigned unconditionally,
        overriding whatever primary_author_found implies.
    """
    default_author_role = (
        Contributor.AUTHOR_ROLE if primary_author_found
        else Contributor.PRIMARY_AUTHOR_ROLE
    )
    role = default_author_role

    match = cls.role_abbreviation.search(author)
    if match:
        role = cls.role_abbreviation_to_role.get(
            match.groups()[0], Contributor.UNKNOWN_ROLE
        )
        # A generic "author" code falls back to the positional default.
        if role is cls.generic_author:
            role = default_author_role
        # Drop the " (XXX)" suffix from the name.
        author = author[:-5].strip()

    if force_role:
        role = force_role

    return ContributorData(sort_name=author, roles=[role])
def extract_bibliographic(self, element, ns):
    """Turn one Enki bibliographic record into a Metadata object.

    :param element: A dictionary of book attributes from the Enki feed
        (keys used here: "isbn", "author", "id", "title", "publisher").
    :param ns: Unused in this method; presumably a namespace map
        required by the caller's interface -- TODO confirm.
    :return: A Metadata whose .circulation is a CirculationData
        carrying a single EPUB/Adobe-DRM format.
    """
    identifiers = []
    contributors = []
    identifiers.append(IdentifierData(Identifier.ISBN, element["isbn"]))
    sort_name = element["author"]
    # Fall back to a placeholder so every record gets a contributor.
    if not sort_name:
        sort_name = "Unknown"
    contributors.append(ContributorData(sort_name=sort_name))
    primary_identifier = IdentifierData(Identifier.ENKI_ID, element["id"])
    # NOTE(review): "ENGLISH" is not an ISO language code -- confirm
    # downstream consumers expect this literal value.
    metadata = Metadata(
        data_source=DataSource.ENKI,
        title=element["title"],
        language="ENGLISH",
        medium=Edition.BOOK_MEDIUM,
        #series=series,
        publisher=element["publisher"],
        #imprint=imprint,
        #published=publication_date,
        primary_identifier=primary_identifier,
        identifiers=identifiers,
        #subjects=subjects,
        contributors=contributors,
    )
    #TODO: This should parse the content type and look it up in the Enki Delivery Data above. Currently,
    # we assume everything is an ePub that uses Adobe DRM, which is a safe assumption only for now.
    formats = []
    formats.append(
        FormatData(content_type=Representation.EPUB_MEDIA_TYPE,
                   drm_scheme=DeliveryMechanism.ADOBE_DRM))
    circulationdata = CirculationData(
        data_source=DataSource.ENKI,
        primary_identifier=primary_identifier,
        formats=formats,
    )
    metadata.circulation = circulationdata
    return metadata
def book_info_to_metadata(cls, book, include_bibliographic=True, include_formats=True):
    """Turn Overdrive's JSON representation of a book into a Metadata
    object.

    Note: The json data passed into this method is from a different
    file/stream from the json data that goes into the
    book_info_to_circulation() method.

    :param book: Parsed JSON dictionary for one Overdrive title.
    :param include_bibliographic: When True, extract title, language,
        contributors, subjects, identifiers, links and measurements.
    :param include_formats: When True, also build a CirculationData
        carrying the delivery formats.
    :return: A Metadata, or None if the record has no 'id'.
    """
    if not 'id' in book:
        return None
    overdrive_id = book['id']
    primary_identifier = IdentifierData(Identifier.OVERDRIVE_ID, overdrive_id)

    if include_bibliographic:
        title = book.get('title', None)
        sort_title = book.get('sortTitle')
        subtitle = book.get('subtitle', None)
        series = book.get('series', None)
        publisher = book.get('publisher', None)
        imprint = book.get('imprint', None)

        # Only the date portion (first 10 characters) of publishDate
        # is parsed.
        if 'publishDate' in book:
            published = datetime.datetime.strptime(
                book['publishDate'][:10], cls.DATE_FORMAT)
        else:
            published = None

        # Prefer English when present; otherwise pick the
        # alphabetically first language code for determinism.
        languages = [l['code'] for l in book.get('languages', [])]
        if 'eng' in languages or not languages:
            language = 'eng'
        else:
            language = sorted(languages)[0]

        contributors = []
        for creator in book.get('creators', []):
            sort_name = creator['fileAs']
            display_name = creator['name']
            role = creator['role']
            # An unparseable role becomes UNKNOWN_ROLE rather than
            # being dropped.
            roles = cls.parse_roles(overdrive_id, role) or [Contributor.UNKNOWN_ROLE]
            contributor = ContributorData(
                sort_name=sort_name, display_name=display_name,
                roles=roles,
                biography=creator.get('bioText', None))
            contributors.append(contributor)

        subjects = []
        # Overdrive's own subject vocabulary is trusted more (weight
        # 100) than free-form keywords (weight 1).
        for sub in book.get('subjects', []):
            subject = SubjectData(type=Subject.OVERDRIVE,
                                  identifier=sub['value'],
                                  weight=100)
            subjects.append(subject)

        for sub in book.get('keywords', []):
            subject = SubjectData(type=Subject.TAG,
                                  identifier=sub['value'],
                                  weight=1)
            subjects.append(subject)

        extra = dict()
        if 'grade_levels' in book:
            # n.b. Grade levels are measurements of reading level, not
            # age appropriateness. We can use them as a measure of age
            # appropriateness in a pinch, but we weight them less
            # heavily than other information from Overdrive.
            for i in book['grade_levels']:
                subject = SubjectData(type=Subject.GRADE_LEVEL,
                                      identifier=i['value'],
                                      weight=10)
                subjects.append(subject)

        # An unrecognized medium is logged but still mapped to the
        # default (BOOK_MEDIUM) below.
        overdrive_medium = book.get('mediaType', None)
        if overdrive_medium and overdrive_medium not in cls.overdrive_medium_to_simplified_medium:
            cls.log.error("Could not process medium %s for %s",
                          overdrive_medium, overdrive_id)

        medium = cls.overdrive_medium_to_simplified_medium.get(
            overdrive_medium, Edition.BOOK_MEDIUM)

        measurements = []
        if 'awards' in book:
            extra['awards'] = book.get('awards', [])
            num_awards = len(extra['awards'])
            measurements.append(
                MeasurementData(Measurement.AWARDS, str(num_awards)))

        for name, subject_type in (
                ('ATOS', Subject.ATOS_SCORE),
                ('lexileScore', Subject.LEXILE_SCORE),
                ('interestLevel', Subject.INTEREST_LEVEL)):
            if not name in book:
                continue
            identifier = str(book[name])
            subjects.append(
                SubjectData(type=subject_type, identifier=identifier,
                            weight=100))

        for grade_level_info in book.get('gradeLevels', []):
            grade_level = grade_level_info.get('value')
            subjects.append(
                SubjectData(type=Subject.GRADE_LEVEL,
                            identifier=grade_level,
                            weight=100))

        identifiers = []
        links = []
        for format in book.get('formats', []):
            for new_id in format.get('identifiers', []):
                t = new_id['type']
                v = new_id['value']
                orig_v = v
                type_key = None
                if t == 'ASIN':
                    type_key = Identifier.ASIN
                elif t == 'ISBN':
                    type_key = Identifier.ISBN
                    if len(v) == 10:
                        v = isbnlib.to_isbn13(v)
                    if v is None or not isbnlib.is_isbn13(v):
                        # Overdrive sometimes uses invalid values
                        # like "n/a" as placeholders. Ignore such
                        # values to avoid a situation where hundreds of
                        # books appear to have the same ISBN. ISBNs
                        # which fail check digit checks or are invalid
                        # also can occur. Log them for review.
                        cls.log.info("Bad ISBN value provided: %s", orig_v)
                        continue
                elif t == 'DOI':
                    type_key = Identifier.DOI
                elif t == 'UPC':
                    type_key = Identifier.UPC
                elif t == 'PublisherCatalogNumber':
                    continue
                if type_key and v:
                    identifiers.append(IdentifierData(type_key, v, 1))

            # Samples become links.
            if 'samples' in format:
                if not format['id'] in cls.format_data_for_overdrive_format:
                    # Useless to us.
                    continue
                content_type, drm_scheme = cls.format_data_for_overdrive_format.get(
                    format['id'])
                # Only link to samples whose content type is a real
                # media type.
                if Representation.is_media_type(content_type):
                    for sample_info in format['samples']:
                        href = sample_info['url']
                        links.append(
                            LinkData(rel=Hyperlink.SAMPLE,
                                     href=href,
                                     media_type=content_type))

        # A cover and its thumbnail become a single LinkData.
        if 'images' in book:
            images = book['images']
            image_data = cls.image_link_to_linkdata(
                images.get('cover'), Hyperlink.IMAGE)
            for name in ['cover300Wide', 'cover150Wide', 'thumbnail']:
                # Try to get a thumbnail that's as close as possible
                # to the size we use.
                image = images.get(name)
                thumbnail_data = cls.image_link_to_linkdata(
                    image, Hyperlink.THUMBNAIL_IMAGE)
                # A named size also doubles as the full cover if no
                # 'cover' image was found above.
                if not image_data:
                    image_data = cls.image_link_to_linkdata(
                        image, Hyperlink.IMAGE)
                if thumbnail_data:
                    break

            if image_data:
                if thumbnail_data:
                    image_data.thumbnail = thumbnail_data
                links.append(image_data)

        # Descriptions become links.
        short = book.get('shortDescription')
        full = book.get('fullDescription')
        if full:
            links.append(
                LinkData(
                    rel=Hyperlink.DESCRIPTION,
                    content=full,
                    media_type="text/html",
                ))

        # Skip the short description when it's just a prefix of the
        # full one.
        if short and (not full or not full.startswith(short)):
            links.append(
                LinkData(
                    rel=Hyperlink.SHORT_DESCRIPTION,
                    content=short,
                    media_type="text/html",
                ))

        # Add measurements: rating and popularity
        if book.get('starRating') is not None and book['starRating'] > 0:
            measurements.append(
                MeasurementData(quantity_measured=Measurement.RATING,
                                value=book['starRating']))

        if book.get('popularity'):
            measurements.append(
                MeasurementData(quantity_measured=Measurement.POPULARITY,
                                value=book['popularity']))

        metadata = Metadata(
            data_source=DataSource.OVERDRIVE,
            title=title,
            subtitle=subtitle,
            sort_title=sort_title,
            language=language,
            medium=medium,
            series=series,
            publisher=publisher,
            imprint=imprint,
            published=published,
            primary_identifier=primary_identifier,
            identifiers=identifiers,
            subjects=subjects,
            contributors=contributors,
            measurements=measurements,
            links=links,
        )
    else:
        metadata = Metadata(
            data_source=DataSource.OVERDRIVE,
            primary_identifier=primary_identifier,
        )

    if include_formats:
        formats = []
        for format in book.get('formats', []):
            format_id = format['id']
            if format_id in cls.format_data_for_overdrive_format:
                content_type, drm_scheme = cls.format_data_for_overdrive_format.get(
                    format_id)
                formats.append(FormatData(content_type, drm_scheme))
            elif format_id not in cls.ignorable_overdrive_formats:
                cls.log.error(
                    "Could not process Overdrive format %s for %s",
                    format_id, overdrive_id)

        # Also make a CirculationData so we can write the formats,
        circulationdata = CirculationData(
            data_source=DataSource.OVERDRIVE,
            primary_identifier=primary_identifier,
            formats=formats,
        )
        metadata.circulation = circulationdata

    return metadata
class TestBibliographicCoverageProvider(DatabaseTest):
    """Tests for BibliographicCoverageProvider behavior (edition/work
    lookup, metadata application, license pool autocreation, and the
    presentation-ready flag) using mock providers."""

    # Canned Overdrive bibliographic data shared by the tests below.
    BIBLIOGRAPHIC_DATA = Metadata(
        DataSource.OVERDRIVE,
        publisher=u'Perfection Learning',
        language='eng',
        title=u'A Girl Named Disaster',
        published=datetime.datetime(1998, 3, 1, 0, 0),
        primary_identifier=IdentifierData(
            type=Identifier.OVERDRIVE_ID,
            identifier=u'ba9b3419-b0bd-4ca7-a24f-26c4246b6b44'),
        identifiers=[
            IdentifierData(type=Identifier.OVERDRIVE_ID,
                           identifier=u'ba9b3419-b0bd-4ca7-a24f-26c4246b6b44'),
            IdentifierData(type=Identifier.ISBN,
                           identifier=u'9781402550805')
        ],
        contributors=[
            ContributorData(sort_name=u"Nancy Farmer",
                            roles=[Contributor.PRIMARY_AUTHOR_ROLE])
        ],
        subjects=[
            SubjectData(type=Subject.TOPIC, identifier=u'Action & Adventure'),
            SubjectData(type=Subject.FREEFORM_AUDIENCE, identifier=u'Young Adult'),
            SubjectData(type=Subject.PLACE, identifier=u'Africa')
        ],
    )

    # Minimal circulation data tied to the same primary identifier.
    CIRCULATION_DATA = CirculationData(
        DataSource.OVERDRIVE,
        primary_identifier=BIBLIOGRAPHIC_DATA.primary_identifier,
    )

    def test_edition(self):
        provider = MockBibliographicCoverageProvider(self._db)
        provider.CAN_CREATE_LICENSE_POOLS = False
        identifier = self._identifier(identifier_type=Identifier.OVERDRIVE_ID)
        test_metadata = self.BIBLIOGRAPHIC_DATA

        # Returns a CoverageFailure if the identifier doesn't have a
        # license pool and none can be created.
        result = provider.work(identifier)
        assert isinstance(result, CoverageFailure)
        eq_("No license pool available", result.exception)

        # Returns an Edition otherwise, creating it if necessary.
        edition, lp = self._edition(with_license_pool=True)
        identifier = edition.primary_identifier
        eq_(edition, provider.edition(identifier))

        # The Edition will be created if necessary.
        lp.identifier.primarily_identifies = []
        e2 = provider.edition(identifier)
        assert edition != e2
        assert isinstance(e2, Edition)

    def test_work(self):
        provider = MockBibliographicCoverageProvider(self._db)
        identifier = self._identifier(identifier_type=Identifier.OVERDRIVE_ID)
        test_metadata = self.BIBLIOGRAPHIC_DATA
        provider.CAN_CREATE_LICENSE_POOLS = False

        # Returns a CoverageFailure if the identifier doesn't have a
        # license pool.
        result = provider.work(identifier)
        assert isinstance(result, CoverageFailure)
        eq_("No license pool available", result.exception)

        # Returns a CoverageFailure if there's no work available.
        edition, lp = self._edition(with_license_pool=True)
        # Remove edition so that the work won't be calculated
        lp.identifier.primarily_identifies = []
        result = provider.work(lp.identifier)
        assert isinstance(result, CoverageFailure)
        eq_("Work could not be calculated", result.exception)

        # Returns the work if it can be created or found.
        ed, lp = self._edition(with_license_pool=True)
        result = provider.work(lp.identifier)
        eq_(result, lp.work)

    def test_set_metadata(self):
        provider = MockBibliographicCoverageProvider(self._db)
        provider.CAN_CREATE_LICENSE_POOLS = False
        identifier = self._identifier(identifier_type=Identifier.OVERDRIVE_ID)
        test_metadata = self.BIBLIOGRAPHIC_DATA
        test_circulationdata = self.CIRCULATION_DATA

        # If there is no LicensePool and it can't be autocreated, a
        # CoverageRecord results.
        result = provider.work(identifier)
        assert isinstance(result, CoverageFailure)
        eq_("No license pool available", result.exception)

        edition, lp = self._edition(data_source_name=DataSource.OVERDRIVE,
                                    identifier_type=Identifier.OVERDRIVE_ID,
                                    identifier_id=self.BIBLIOGRAPHIC_DATA.primary_identifier.identifier,
                                    with_license_pool=True)

        # If no metadata is passed in, a CoverageFailure results.
        result = provider.set_metadata_and_circulation_data(
            edition.primary_identifier, None, None)
        assert isinstance(result, CoverageFailure)
        eq_("Received neither metadata nor circulation data from input source",
            result.exception)

        # If no work can be created (in this case, because there's no title),
        # a CoverageFailure results.
        edition.title = None
        old_title = test_metadata.title
        test_metadata.title = None
        result = provider.set_metadata_and_circulation_data(
            edition.primary_identifier, test_metadata, test_circulationdata)
        assert isinstance(result, CoverageFailure)
        eq_("Work could not be calculated", result.exception)
        # Restore the shared class-level fixture for later tests.
        test_metadata.title = old_title

        # Test success
        result = provider.set_metadata_and_circulation_data(
            edition.primary_identifier, test_metadata, test_circulationdata)
        eq_(result, edition.primary_identifier)

        # If there's an exception setting the metadata, a
        # CoverageRecord results. This call raises a ValueError
        # because the primary identifier & the edition's primary
        # identifier don't match.
        test_metadata.primary_identifier = self._identifier(
            identifier_type=Identifier.OVERDRIVE_ID)
        result = provider.set_metadata_and_circulation_data(
            lp.identifier, test_metadata, test_circulationdata)
        assert isinstance(result, CoverageFailure)
        assert "ValueError" in result.exception

    def test_autocreate_licensepool(self):
        provider = MockBibliographicCoverageProvider(self._db)
        identifier = self._identifier(identifier_type=Identifier.OVERDRIVE_ID)

        # If this constant is set to False, the coverage provider cannot
        # autocreate LicensePools for identifiers.
        provider.CAN_CREATE_LICENSE_POOLS = False
        eq_(None, provider.license_pool(identifier))

        # If it's set to True, the coverage provider can autocreate
        # LicensePools for identifiers.
        provider.CAN_CREATE_LICENSE_POOLS = True
        pool = provider.license_pool(identifier)
        eq_(pool.data_source, provider.output_source)
        eq_(pool.identifier, identifier)

    def test_set_presentation_ready(self):
        provider = MockBibliographicCoverageProvider(self._db)
        identifier = self._identifier(identifier_type=Identifier.OVERDRIVE_ID)
        test_metadata = self.BIBLIOGRAPHIC_DATA

        # If the work can't be found, it can't be made presentation ready.
        provider.CAN_CREATE_LICENSE_POOLS = False
        result = provider.set_presentation_ready(identifier)
        assert isinstance(result, CoverageFailure)
        eq_("No license pool available", result.exception)

        # Test success.
        ed, lp = self._edition(with_license_pool=True)
        result = provider.set_presentation_ready(ed.primary_identifier)
        eq_(result, ed.primary_identifier)

    def test_process_batch_sets_work_presentation_ready(self):
        work = self._work(with_license_pool=True,
                          with_open_access_download=True)
        identifier = work.license_pools[0].identifier
        work.presentation_ready = False
        provider = MockBibliographicCoverageProvider(self._db)
        [result] = provider.process_batch([identifier])
        eq_(result, identifier)
        eq_(True, work.presentation_ready)

        # ensure_coverage does the same thing.
        work.presentation_ready = False
        result = provider.ensure_coverage(identifier)
        assert isinstance(result, CoverageRecord)
        eq_(result.identifier, identifier)
        eq_(True, work.presentation_ready)

    def test_failure_does_not_set_work_presentation_ready(self):
        work = self._work(with_license_pool=True,
                          with_open_access_download=True)
        identifier = work.license_pools[0].identifier
        work.presentation_ready = False
        provider = MockFailureBibliographicCoverageProvider(self._db)
        [result] = provider.process_batch([identifier])
        assert isinstance(result, CoverageFailure)
        eq_(False, work.presentation_ready)
def record_info_to_metadata(cls, book, availability):
    """Turn Odilo's JSON representation of a book into a Metadata object.

    Note: The json data passed into this method is from a different
    file/stream from the json data that goes into the
    book_info_to_circulation() method.

    :param book: Parsed JSON dictionary for one Odilo record.
    :param availability: Availability data handed to
        record_info_to_circulation().
    :return: A (Metadata, active) 2-tuple, or None if the record has
        no 'id'. `active` False means the book exists but is no longer
        in the collection.
    """
    if 'id' not in book:
        return None

    odilo_id = book['id']
    primary_identifier = IdentifierData(Identifier.ODILO_ID, odilo_id)
    active = book.get('active')

    title = book.get('title')
    subtitle = book.get('subtitle')
    series = book.get('series')
    series_position = book.get('seriesPosition')

    contributors = []
    sort_author = book.get('author')
    if sort_author:
        roles = [Contributor.AUTHOR_ROLE]
        display_author = sort_name_to_display_name(sort_author)
        contributor = ContributorData(sort_name=sort_author,
                                      display_name=display_author,
                                      roles=roles,
                                      biography=None)
        contributors.append(contributor)

    publisher = book.get('publisher')

    # Metadata --> Marc21 260$c
    published = book.get('publicationDate')
    if not published:
        # yyyyMMdd --> record creation date
        published = book.get('releaseDate')

    if published:
        try:
            published = datetime.datetime.strptime(published, "%Y%m%d")
        except ValueError as e:
            # BUG FIX: e.message is Python-2-only (removed in Python 3);
            # str(e) works on both.
            cls.log.warn('Cannot parse publication date from: ' + published +
                         ', message: ' + str(e))

    # yyyyMMdd --> record last modification date
    last_update = book.get('modificationDate')
    if last_update:
        try:
            last_update = datetime.datetime.strptime(last_update, "%Y%m%d")
        except ValueError as e:
            cls.log.warn('Cannot parse last update date from: ' + last_update +
                         ', message: ' + str(e))

    # Odilo serves mostly Spanish-language content, so 'spa' is the
    # fallback language.
    language = book.get('language', 'spa')

    subjects = []
    for subject in book.get('subjects', []):
        subjects.append(
            SubjectData(type=Subject.TAG, identifier=subject, weight=100))

    for subjectBisacCode in book.get('subjectsBisacCodes', []):
        subjects.append(
            SubjectData(type=Subject.BISAC, identifier=subjectBisacCode,
                        weight=100))

    grade_level = book.get('gradeLevel')
    if grade_level:
        subject = SubjectData(type=Subject.GRADE_LEVEL,
                              identifier=grade_level,
                              weight=10)
        subjects.append(subject)

    # The medium is derived from the delivery formats; BOOK_MEDIUM is
    # the fallback when nothing recognizable was received.
    medium = None
    file_format = book.get('fileFormat')
    formats = []
    for format_received in book.get('formats', []):
        if format_received in cls.format_data_for_odilo_format:
            medium = cls.set_format(format_received, formats)
        elif format_received == OdiloAPI.ACSM and file_format:
            medium = cls.set_format(
                format_received + '_' + file_format.upper(), formats)
        else:
            cls.log.warn('Unrecognized format received: ' + format_received)

    if not medium:
        medium = Edition.BOOK_MEDIUM

    identifiers = []
    isbn = book.get('isbn')
    if isbn:
        # Normalize 10-digit ISBNs to ISBN-13.
        if isbnlib.is_isbn10(isbn):
            isbn = isbnlib.to_isbn13(isbn)
        identifiers.append(IdentifierData(Identifier.ISBN, isbn, 1))

    # A cover
    links = []
    cover_image_url = book.get('coverImageUrl')
    if cover_image_url:
        image_data = cls.image_link_to_linkdata(cover_image_url,
                                                Hyperlink.THUMBNAIL_IMAGE)
        if image_data:
            links.append(image_data)

    original_image_url = book.get('originalImageUrl')
    if original_image_url:
        image_data = cls.image_link_to_linkdata(original_image_url,
                                                Hyperlink.IMAGE)
        if image_data:
            links.append(image_data)

    # Descriptions become links.
    description = book.get('description')
    if description:
        links.append(
            LinkData(rel=Hyperlink.DESCRIPTION, content=description,
                     media_type="text/html"))

    metadata = Metadata(data_source=DataSource.ODILO,
                        title=title,
                        subtitle=subtitle,
                        language=language,
                        medium=medium,
                        series=series,
                        series_position=series_position,
                        publisher=publisher,
                        published=published,
                        primary_identifier=primary_identifier,
                        identifiers=identifiers,
                        subjects=subjects,
                        contributors=contributors,
                        links=links,
                        data_source_last_updated=last_update)

    metadata.circulation = OdiloRepresentationExtractor.record_info_to_circulation(
        availability)
    # 'active' --> means that the book exists but it's no longer in the collection
    # (it could be available again in the future)
    if not active:
        metadata.circulation.licenses_owned = 0
    metadata.circulation.formats = formats

    return metadata, active
# other books in the same series, as well as ISBNs that # are just wrong. Assign these equivalencies at a low # level of confidence. for isbn in d.get('isbns', []): isbn13 = isbn.get('isbn13', None) if isbn13: other_isbns.append( IdentifierData(Identifier.ISBN, isbn13, 0.50)) primary_isbn = primary_isbn13 or primary_isbn10 if primary_isbn: primary_isbn = IdentifierData(Identifier.ISBN, primary_isbn, 0.90) contributors = [] if display_author: contributors.append(ContributorData(display_name=display_author)) metadata = Metadata( data_source=DataSource.NYT, title=title, language='eng', published=published_date, publisher=publisher, contributors=contributors, primary_identifier=primary_isbn, identifiers=other_isbns, ) super(NYTBestSellerListTitle, self).__init__(metadata, first_appearance, most_recent_appearance, annotation)
def isbn_info_to_metadata(cls, book, include_bibliographic=True, include_formats=True):
    """Turn OneClick's JSON representation of a book into a Metadata
    object. Assumes the JSON is in the format that comes from the
    media/{isbn} endpoint.

    TODO: Use the seriesTotal field.

    :param book a json response-derived dictionary of book attributes
    :param include_bibliographic: When True, extract title, language,
        contributors, subjects, identifiers and links.
    :param include_formats: When True, also build a CirculationData
        carrying the delivery formats.
    :return: A Metadata, or None if the record has no 'isbn'.
    """
    if not 'isbn' in book:
        return None
    oneclick_id = book['isbn']
    primary_identifier = IdentifierData(Identifier.ONECLICK_ID, oneclick_id)

    metadata = Metadata(
        data_source=DataSource.ONECLICK,
        primary_identifier=primary_identifier,
    )

    if include_bibliographic:
        title = book.get('title', None)
        # NOTE: An item that's part of a series, will have the seriesName field, and
        # will have its seriesPosition and seriesTotal fields set to >0.
        # An item not part of a series will have the seriesPosition and seriesTotal fields
        # set to 0, and will not have a seriesName at all.
        # Sometimes, series position and total == 0, for many series items (ex: "seriesName": "EngLits").
        # Sometimes, seriesName is set to "Default Blank", meaning "not actually a series".
        series_name = book.get('seriesName', None)

        series_position = book.get('seriesPosition', None)
        if series_position:
            try:
                series_position = int(series_position)
            except ValueError:
                # not big enough deal to stop the whole process
                series_position = None

        # ignored for now
        series_total = book.get('seriesTotal', None)

        # ignored for now
        has_digital_rights = book.get('hasDigitalRights', None)

        publisher = book.get('publisher', None)

        if 'publicationDate' in book:
            published = datetime.datetime.strptime(
                book['publicationDate'][:10], cls.DATE_FORMAT)
        else:
            published = None

        if 'language' in book:
            language = LanguageCodes.string_to_alpha_3(book['language'])
        else:
            language = 'eng'

        contributors = []
        if 'authors' in book:
            authors = book['authors']
            for author in authors.split(";"):
                sort_name = author.strip()
                if sort_name:
                    sort_name = name_tidy(sort_name)
                    display_name = sort_name_to_display_name(sort_name)
                    roles = [Contributor.AUTHOR_ROLE]
                    contributor = ContributorData(sort_name=sort_name,
                                                  display_name=display_name,
                                                  roles=roles)
                    contributors.append(contributor)

        if 'narrators' in book:
            narrators = book['narrators']
            for narrator in narrators.split(";"):
                sort_name = narrator.strip()
                if sort_name:
                    sort_name = name_tidy(sort_name)
                    display_name = sort_name_to_display_name(sort_name)
                    roles = [Contributor.NARRATOR_ROLE]
                    contributor = ContributorData(sort_name=sort_name,
                                                  display_name=display_name,
                                                  roles=roles)
                    contributors.append(contributor)

        subjects = []
        if 'genres' in book:
            # example: "FICTION / Humorous / General"
            genres = book['genres']
            subject = SubjectData(type=Subject.BISAC, identifier=genres,
                                  weight=100)
            subjects.append(subject)

        if 'primaryGenre' in book:
            # example: "humorous-fiction,mystery,womens-fiction"
            genres = book['primaryGenre']
            for genre in genres.split(","):
                subject = SubjectData(type=Subject.ONECLICK,
                                      identifier=genre.strip(), weight=100)
                subjects.append(subject)

        # audience options are: adult, beginning-reader, childrens, young-adult
        # NOTE: In OneClick metadata, audience can be set to "Adult" while
        # publisher is "HarperTeen".
        audience = book.get('audience', None)
        if audience:
            subject = SubjectData(type=Subject.ONECLICK_AUDIENCE,
                                  identifier=audience.strip().lower(),
                                  weight=10)
            subjects.append(subject)

        # options are: "eBook", "eAudio"
        oneclick_medium = book.get('mediaType', None)
        if oneclick_medium and oneclick_medium not in cls.oneclick_medium_to_simplified_medium:
            cls.log.error("Could not process medium %s for %s",
                          oneclick_medium, oneclick_id)

        medium = cls.oneclick_medium_to_simplified_medium.get(
            oneclick_medium, Edition.BOOK_MEDIUM)

        # passed to metadata.apply, the isbn_identifier will create an equivalency
        # between the OneClick-labeled and the ISBN-labeled identifier rows, which
        # will in turn allow us to ask the MetadataWrangler for more info about the book.
        isbn_identifier = IdentifierData(Identifier.ISBN, oneclick_id)
        identifiers = [primary_identifier, isbn_identifier]

        links = []
        # A cover and its thumbnail become a single LinkData.
        # images come in small (ex: 71x108px), medium (ex: 95x140px),
        # and large (ex: 128x192px) sizes
        #
        # BUG FIX: initialize all three so a feed that omits one of the
        # expected sizes doesn't raise NameError below.
        image_data = None
        thumbnail_data = None
        thumbnail_data_backup = None
        if 'images' in book:
            images = book['images']
            for image in images:
                if image['name'] == "large":
                    image_data = cls.image_link_to_linkdata(
                        image['url'], Hyperlink.IMAGE)
                if image['name'] == "medium":
                    thumbnail_data = cls.image_link_to_linkdata(
                        image['url'], Hyperlink.THUMBNAIL_IMAGE)
                if image['name'] == "small":
                    thumbnail_data_backup = cls.image_link_to_linkdata(
                        image['url'], Hyperlink.THUMBNAIL_IMAGE)

            # Fall back to the small image if there was no medium one.
            if not thumbnail_data and thumbnail_data_backup:
                thumbnail_data = thumbnail_data_backup

            if image_data:
                if thumbnail_data:
                    image_data.thumbnail = thumbnail_data
                links.append(image_data)

        # Descriptions become links.
        description = book.get('description', None)
        if description:
            links.append(
                LinkData(
                    # there can be fuller descriptions in the search endpoint output
                    rel=Hyperlink.SHORT_DESCRIPTION,
                    content=description,
                    media_type="text/html",
                ))

        metadata.title = title
        metadata.language = language
        metadata.medium = medium
        metadata.series = series_name
        metadata.series_position = series_position
        metadata.publisher = publisher
        metadata.published = published
        metadata.identifiers = identifiers
        metadata.subjects = subjects
        metadata.contributors = contributors
        metadata.links = links

    if include_formats:
        formats = []
        if metadata.medium == Edition.BOOK_MEDIUM:
            content_type, drm_scheme = cls.oneclick_formats.get(
                "ebook-epub-oneclick")
            formats.append(FormatData(content_type, drm_scheme))
        elif metadata.medium == Edition.AUDIO_MEDIUM:
            content_type, drm_scheme = cls.oneclick_formats.get(
                "audiobook-mp3-oneclick")
            formats.append(FormatData(content_type, drm_scheme))
        else:
            # BUG FIX: `format_id` was never defined in this scope and
            # raised NameError; log the unrecognized medium instead.
            cls.log.warn("Unfamiliar format: %s", metadata.medium)

        # Make a CirculationData so we can write the formats,
        circulationdata = CirculationData(
            data_source=DataSource.ONECLICK,
            primary_identifier=primary_identifier,
            formats=formats,
        )

        metadata.circulation = circulationdata

    return metadata