def extract_bibliographic(self, element):
    """Build a Metadata object, with circulation data attached, from
    one Enki record.

    :param element: A dictionary describing a single title. The keys
        "isbn", "id", "large_image", "title", "publisher" and
        "availability" are read with plain indexing, so a record that
        lacks any of them raises KeyError. "author" is optional.
    :return: A Metadata whose `circulation` attribute is a
        CirculationData built from element["availability"].
    """
    identifiers = [IdentifierData(Identifier.ISBN, element["isbn"])]

    # A record with a missing or empty author is credited to the
    # "unknown author" placeholder instead of failing the whole record.
    sort_name = element.get("author") or Edition.UNKNOWN_AUTHOR
    contributors = [ContributorData(sort_name=sort_name)]

    primary_identifier = IdentifierData(EnkiAPI.ENKI_ID, element["id"])

    # NOTE(review): the thumbnail and the full-size image are both taken
    # from "large_image" -- confirm that this is intentional.
    image_url = element["large_image"]
    thumbnail_url = element["large_image"]
    images = [
        LinkData(
            rel=Hyperlink.THUMBNAIL_IMAGE,
            href=thumbnail_url,
            media_type=Representation.PNG_MEDIA_TYPE,
        ),
        LinkData(
            rel=Hyperlink.IMAGE,
            href=image_url,
            media_type=Representation.PNG_MEDIA_TYPE,
        ),
    ]

    metadata = Metadata(
        data_source=DataSource.ENKI,
        title=element["title"],
        language="eng",
        medium=Edition.BOOK_MEDIUM,
        publisher=element["publisher"],
        primary_identifier=primary_identifier,
        identifiers=identifiers,
        contributors=contributors,
        links=images,
    )

    availability = element["availability"]
    licenses_owned = availability["totalCopies"]
    licenses_available = availability["availableCopies"]
    hold = availability["onHold"]

    # An access type of 'acs' selects the Adobe DRM scheme; any other
    # value is treated as DRM-free.
    if availability["accessType"] == 'acs':
        drm_type = EnkiAPI.adobe_drm
    else:
        drm_type = EnkiAPI.no_drm
    formats = [
        FormatData(
            content_type=Representation.EPUB_MEDIA_TYPE,
            drm_scheme=drm_type,
        )
    ]

    # The three counts are passed through int() before being stored.
    circulationdata = CirculationData(
        data_source=DataSource.ENKI,
        primary_identifier=primary_identifier,
        formats=formats,
        licenses_owned=int(licenses_owned),
        licenses_available=int(licenses_available),
        patrons_in_hold_queue=int(hold),
    )

    metadata.circulation = circulationdata
    return metadata
def extract_bibliographic(self, element):
    """Convert one Enki title record into a Metadata object.

    :param element: A dictionary of information about a single title,
        as obtained from Enki.
    :return: A Metadata whose `circulation` attribute holds whatever
        extract_circulation() returns for this title.
    """
    # TODO: Enki also sends the fields below; it's not clear what they
    # are or whether we'd find them useful:
    #   dateSaved
    #   length
    #   publishDate
    primary_identifier = IdentifierData(EnkiAPI.ENKI_ID, element["id"])
    identifiers = [IdentifierData(Identifier.ISBN, element["isbn"])]

    author = element.get("author", None) or Edition.UNKNOWN_AUTHOR
    contributors = [ContributorData(sort_name=author)]

    links = []
    summary = element.get("description")
    if summary:
        links.append(
            LinkData(
                rel=Hyperlink.DESCRIPTION,
                content=summary,
                media_type="text/html",
            )
        )

    # NOTE: Callers such as updated_titles() supply separate large and
    # small images. get_item() supplies a single image under 'cover';
    # it asks for the 'large' size, so that image is filed as a
    # normal-sized image.
    image_sources = (
        ("cover", Hyperlink.IMAGE),
        ("small_image", Hyperlink.THUMBNAIL_IMAGE),
        ("large_image", Hyperlink.IMAGE),
    )
    primary_image = None
    thumbnail = None
    for field, rel in image_sources:
        href = element.get(field)
        if not href:
            continue
        image = LinkData(
            rel=rel, href=href, media_type=Representation.PNG_MEDIA_TYPE
        )
        if rel == Hyperlink.THUMBNAIL_IMAGE:
            # Hold the thumbnail back; once every image has been seen
            # it is attached to the primary image instead of being
            # listed on its own.
            thumbnail = image
            continue
        if rel == Hyperlink.IMAGE:
            primary_image = image
        links.append(image)

    if thumbnail:
        if not primary_image:
            # No full-size image at all: promote the thumbnail.
            thumbnail.rel = Hyperlink.IMAGE
            links.append(thumbnail)
        else:
            # Record the thumbnail as the thumbnail _of_ the primary
            # image.
            primary_image.thumbnail = thumbnail

    # 'subject', 'topic', and 'genre' are treated as interchangeable
    # sets of tags. The data is based on BISAC but it's not reliably
    # presented in a form that can be parsed as BISAC.
    subjects = []
    already_tagged = set()
    for field in ("subject", "topic", "genre"):
        for tag in element.get(field, []):
            if tag and tag not in already_tagged:
                subjects.append(
                    SubjectData(
                        Subject.TAG,
                        tag,
                        weight=Classification.TRUSTED_DISTRIBUTOR_WEIGHT,
                    )
                )
                already_tagged.add(tag)

    # Language names missing from LANGUAGE_CODES map to "eng".
    language = self.LANGUAGE_CODES.get(
        element.get("language", "English"), "eng"
    )

    metadata = Metadata(
        data_source=DataSource.ENKI,
        title=element.get("title"),
        language=language,
        medium=Edition.BOOK_MEDIUM,
        publisher=element.get("publisher"),
        primary_identifier=primary_identifier,
        identifiers=identifiers,
        contributors=contributors,
        links=links,
        subjects=subjects,
    )
    metadata.circulation = self.extract_circulation(
        primary_identifier,
        element.get("availability", {}),
        element.get("formattype", None),
    )
    return metadata
def record_info_to_metadata(cls, book, availability):
    """Turn Odilo's JSON representation of a book into a Metadata object.

    Note: The json data passed into this method is from a different
    file/stream from the json data that goes into the
    book_info_to_circulation() method.

    :param book: A dictionary describing one Odilo record.
    :param availability: Passed unchanged to
        OdiloRepresentationExtractor.record_info_to_circulation().
    :return: None if `book` has no 'id' key; otherwise a 2-tuple
        (metadata, active), where `active` is the record's 'active'
        value.
    """
    # NOTE(review): this early exit returns a bare None while the
    # normal path returns a 2-tuple; callers must handle both shapes.
    if 'id' not in book:
        return None

    odilo_id = book['id']
    primary_identifier = IdentifierData(Identifier.ODILO_ID, odilo_id)
    active = book.get('active')

    title = book.get('title')
    subtitle = book.get('subtitle')
    series = book.get('series')
    series_position = book.get('seriesPosition')

    contributors = []
    sort_author = book.get('author')
    if sort_author:
        roles = [Contributor.AUTHOR_ROLE]
        display_author = sort_name_to_display_name(sort_author)
        contributor = ContributorData(
            sort_name=sort_author,
            display_name=display_author,
            roles=roles,
            biography=None,
        )
        contributors.append(contributor)

    publisher = book.get('publisher')

    # Metadata --> Marc21 260$c
    published = book.get('publicationDate')
    if not published:
        # yyyyMMdd --> record creation date
        published = book.get('releaseDate')

    # NOTE(review): when parsing fails, the raw string is left in place
    # and handed to Metadata below -- confirm that is acceptable.
    if published:
        try:
            published = datetime.datetime.strptime(published, "%Y%m%d")
        except ValueError as e:
            # The exception is formatted through %s because Python 3
            # exceptions have no `.message` attribute.
            cls.log.warning(
                'Cannot parse publication date from: %s, message: %s',
                published, e
            )

    # yyyyMMdd --> record last modification date
    last_update = book.get('modificationDate')
    if last_update:
        try:
            last_update = datetime.datetime.strptime(last_update, "%Y%m%d")
        except ValueError as e:
            cls.log.warning(
                'Cannot parse last update date from: %s, message: %s',
                last_update, e
            )

    language = book.get('language', 'spa')

    subjects = []
    for subject in book.get('subjects', []):
        subjects.append(
            SubjectData(type=Subject.TAG, identifier=subject, weight=100)
        )

    for subjectBisacCode in book.get('subjectsBisacCodes', []):
        subjects.append(
            SubjectData(
                type=Subject.BISAC, identifier=subjectBisacCode, weight=100
            )
        )

    grade_level = book.get('gradeLevel')
    if grade_level:
        subject = SubjectData(
            type=Subject.GRADE_LEVEL, identifier=grade_level, weight=10
        )
        subjects.append(subject)

    # The medium comes from the last format that set_format() accepted;
    # set_format() is also handed the `formats` list on each call.
    medium = None
    file_format = book.get('fileFormat')
    formats = []
    for format_received in book.get('formats', []):
        if format_received in cls.format_data_for_odilo_format:
            medium = cls.set_format(format_received, formats)
        elif format_received == cls.ACSM and file_format:
            # ACSM entries are qualified with the upper-cased file format.
            medium = cls.set_format(
                format_received + '_' + file_format.upper(), formats
            )
        else:
            cls.log.warning(
                'Unrecognized format received: %s', format_received
            )

    if not medium:
        medium = Edition.BOOK_MEDIUM

    identifiers = []
    isbn = book.get('isbn')
    if isbn:
        # ISBN-10s are converted to ISBN-13 before being recorded.
        if isbnlib.is_isbn10(isbn):
            isbn = isbnlib.to_isbn13(isbn)
        identifiers.append(IdentifierData(Identifier.ISBN, isbn, 1))

    # A cover
    links = []
    cover_image_url = book.get('coverImageUrl')
    if cover_image_url:
        image_data = cls.image_link_to_linkdata(
            cover_image_url, Hyperlink.THUMBNAIL_IMAGE
        )
        if image_data:
            links.append(image_data)

    original_image_url = book.get('originalImageUrl')
    if original_image_url:
        image_data = cls.image_link_to_linkdata(
            original_image_url, Hyperlink.IMAGE
        )
        if image_data:
            links.append(image_data)

    # Descriptions become links.
    description = book.get('description')
    if description:
        links.append(
            LinkData(
                rel=Hyperlink.DESCRIPTION,
                content=description,
                media_type="text/html",
            )
        )

    metadata = Metadata(
        data_source=DataSource.ODILO,
        title=title,
        subtitle=subtitle,
        language=language,
        medium=medium,
        series=series,
        series_position=series_position,
        publisher=publisher,
        published=published,
        primary_identifier=primary_identifier,
        identifiers=identifiers,
        subjects=subjects,
        contributors=contributors,
        links=links,
        data_source_last_updated=last_update,
    )

    metadata.circulation = (
        OdiloRepresentationExtractor.record_info_to_circulation(availability)
    )

    # A record that is not 'active' still exists but is no longer in
    # the collection (it could be available again in the future), so
    # it is reported as having no owned licenses.
    if not active:
        metadata.circulation.licenses_owned = 0
    metadata.circulation.formats = formats

    return metadata, active
def test_annotate_metadata(self):
    """Check that annotate_metadata invokes load_circulation_data and
    load_cover_link at the right times and uses what they return.
    """

    # Scenario 1: no circulation data is available.
    class MockNoCirculationData(DirectoryImportScript):
        """Record the load_circulation_data arguments and return
        nothing. Raise if load_cover_link is ever reached.
        """

        def load_circulation_data(self, *args):
            self.load_circulation_data_args = args
            return None

        def load_cover_link(self, *args):
            raise Exception("Explode!")

    data_source = DataSource.lookup(self._db, DataSource.GUTENBERG)
    identifier_data = IdentifierData(Identifier.GUTENBERG_ID, "11111")
    identifier_obj, _ = identifier_data.load(self._db)
    metadata = Metadata(
        title=self._str,
        data_source=data_source,
        primary_identifier=identifier_data,
    )

    # Opaque placeholders: the test only checks that they are handed
    # through to the loader methods.
    mirror = object()
    policy = ReplacementPolicy(mirror=mirror)
    cover_directory = object()
    ebook_directory = object()
    rights_uri = object()
    call_args = (
        metadata, policy, cover_directory, ebook_directory, rights_uri
    )

    script = MockNoCirculationData(self._db)
    script.annotate_metadata(*call_args)

    # load_circulation_data received the expected arguments.
    expected_circulation_args = (
        identifier_obj, data_source, ebook_directory, mirror,
        metadata.title, rights_uri,
    )
    eq_(expected_circulation_args, script.load_circulation_data_args)

    # It returned None, so metadata.circulation was left alone and
    # load_cover_link was never called (it would have raised).
    eq_(None, metadata.circulation)

    # Scenario 2: circulation data is found, but there is no cover.
    class MockNoCoverLink(DirectoryImportScript):
        """Supply circulation data. Record the load_cover_link
        arguments and return nothing.
        """

        def load_circulation_data(self, *args):
            return "Some circulation data"

        def load_cover_link(self, *args):
            self.load_cover_link_args = args
            return None

    script = MockNoCoverLink(self._db)
    script.annotate_metadata(*call_args)

    # The return value of load_circulation_data ended up on the
    # Metadata object.
    eq_("Some circulation data", metadata.circulation)

    # load_cover_link received the expected arguments.
    expected_cover_args = (
        identifier_obj, data_source, cover_directory, mirror
    )
    eq_(expected_cover_args, script.load_cover_link_args)

    # No cover link came back, so metadata.links is still empty.
    eq_([], metadata.links)

    # Scenario 3: both circulation data and a cover link are found.
    class MockWithCoverLink(DirectoryImportScript):
        """Succeed in both load_circulation_data and load_cover_link."""

        def load_circulation_data(self, *args):
            return "Some circulation data"

        def load_cover_link(self, *args):
            return "A cover link"

    # Clear the value left behind by the previous scenario.
    metadata.circulation = None
    script = MockWithCoverLink(self._db)
    script.annotate_metadata(*call_args)

    eq_("Some circulation data", metadata.circulation)
    eq_(['A cover link'], metadata.links)
def extract_bibliographic(self, element):
    """Turn a dictionary of Enki title information into a Metadata.

    :param element: A dictionary of information about one title, as
        obtained from Enki.
    :return: A Metadata object; its `circulation` attribute is set to
        the result of extract_circulation() for the same title.
    """
    # TODO: the following Enki fields are ignored because it's not
    # clear what they are or whether we'd find them useful:
    #   dateSaved
    #   length
    #   publishDate
    primary_identifier = IdentifierData(EnkiAPI.ENKI_ID, element["id"])
    identifiers = [IdentifierData(Identifier.ISBN, element["isbn"])]

    author_name = element.get("author", None) or Edition.UNKNOWN_AUTHOR
    contributors = [ContributorData(sort_name=author_name)]

    links = []
    blurb = element.get('description')
    if blurb:
        description_link = LinkData(
            rel=Hyperlink.DESCRIPTION,
            content=blurb,
            media_type="text/html",
        )
        links.append(description_link)

    # NOTE: updated_titles() and similar callers provide the large and
    # small images separately, while get_item() provides just one image
    # in 'cover'. get_item() requests the 'large' size, so the cover is
    # filed as a normal-sized image.
    image_keys = (
        ('cover', Hyperlink.IMAGE),
        ('small_image', Hyperlink.THUMBNAIL_IMAGE),
        ('large_image', Hyperlink.IMAGE),
    )
    main_image = None
    small_image = None
    for key, rel in image_keys:
        location = element.get(key)
        if not location:
            continue
        candidate = LinkData(
            rel=rel,
            href=location,
            media_type=Representation.PNG_MEDIA_TYPE,
        )
        if rel != Hyperlink.THUMBNAIL_IMAGE:
            if rel == Hyperlink.IMAGE:
                main_image = candidate
            links.append(candidate)
        else:
            # Thumbnails are not listed directly; the decision about
            # where to put this one is made after the loop.
            small_image = candidate

    if small_image:
        if main_image:
            # Attach the thumbnail to the full-size image it belongs to.
            main_image.thumbnail = small_image
        else:
            # With no full-size image, the thumbnail stands in for it.
            small_image.rel = Hyperlink.IMAGE
            links.append(small_image)

    # 'subject', 'topic', and 'genre' all feed one de-duplicated list
    # of tags. The data is based on BISAC but it's not reliably
    # presented in a form that can be parsed as BISAC.
    subjects = []
    used_tags = set()
    tag_keys = ('subject', 'topic', 'genre')
    for tag_key in tag_keys:
        for tag in element.get(tag_key, []):
            if tag and tag not in used_tags:
                subjects.append(SubjectData(Subject.TAG, tag))
                used_tags.add(tag)

    # A language name with no entry in LANGUAGE_CODES becomes "eng".
    language_name = element.get("language", "English")
    language = self.LANGUAGE_CODES.get(language_name, "eng")

    metadata = Metadata(
        data_source=DataSource.ENKI,
        title=element.get("title"),
        language=language,
        medium=Edition.BOOK_MEDIUM,
        publisher=element.get("publisher"),
        primary_identifier=primary_identifier,
        identifiers=identifiers,
        contributors=contributors,
        links=links,
        subjects=subjects,
    )

    availability = element.get('availability', {})
    format_type = element.get('formattype', None)
    metadata.circulation = self.extract_circulation(
        primary_identifier, availability, format_type
    )
    return metadata