def extract_bibliographic(self, element):
    """Build a Metadata (with attached CirculationData) from one Enki
    title dictionary.

    :param element: A dictionary of data about a single Enki title.
    :return: A Metadata with its .circulation set to a CirculationData.
    """
    enki_id = IdentifierData(EnkiAPI.ENKI_ID, element["id"])
    isbn_identifiers = [IdentifierData(Identifier.ISBN, element["isbn"])]

    # Fall back to the standard unknown-author marker when the feed
    # gives us no author.
    author = element["author"] or Edition.UNKNOWN_AUTHOR

    # NOTE(review): both the full-size image and the thumbnail point at
    # 'large_image' -- presumably no separate small image is available
    # on this code path; confirm against the Enki API.
    cover = element["large_image"]
    image_links = [
        LinkData(
            rel=Hyperlink.THUMBNAIL_IMAGE,
            href=cover,
            media_type=Representation.PNG_MEDIA_TYPE,
        ),
        LinkData(
            rel=Hyperlink.IMAGE,
            href=cover,
            media_type=Representation.PNG_MEDIA_TYPE,
        ),
    ]

    metadata = Metadata(
        data_source=DataSource.ENKI,
        title=element["title"],
        language="eng",
        medium=Edition.BOOK_MEDIUM,
        publisher=element["publisher"],
        primary_identifier=enki_id,
        identifiers=isbn_identifiers,
        contributors=[ContributorData(sort_name=author)],
        links=image_links,
    )

    availability = element["availability"]
    # 'acs' means the book is protected with Adobe DRM; anything else
    # is treated as DRM-free.
    if availability["accessType"] == 'acs':
        drm_scheme = EnkiAPI.adobe_drm
    else:
        drm_scheme = EnkiAPI.no_drm

    metadata.circulation = CirculationData(
        data_source=DataSource.ENKI,
        primary_identifier=enki_id,
        formats=[
            FormatData(
                content_type=Representation.EPUB_MEDIA_TYPE,
                drm_scheme=drm_scheme,
            )
        ],
        licenses_owned=int(availability["totalCopies"]),
        licenses_available=int(availability["availableCopies"]),
        patrons_in_hold_queue=int(availability["onHold"]),
    )
    return metadata
def test_new_isbns(self):
    # Of the three identifiers below, two are new to the database
    # (the OCLC work ID and the "kadabra" ISBN); the middle one
    # already exists.
    existing = self._identifier()
    source = DataSource.lookup(self._db, DataSource.GUTENBERG)
    metadata = Metadata(
        source,
        identifiers=[
            IdentifierData(type=Identifier.OCLC_WORK, identifier="abra"),
            IdentifierData(
                type=existing.type, identifier=existing.identifier),
            IdentifierData(type=Identifier.ISBN, identifier="kadabra"),
        ],
    )

    eq_(2, self.provider.new_isbns(metadata))
def test_links_filtered(self):
    # CirculationData only keeps links relevant to circulation;
    # everything else passed in is filtered out on construction.
    oa_link = LinkData(Hyperlink.OPEN_ACCESS_DOWNLOAD, "example.epub")
    image = LinkData(rel=Hyperlink.IMAGE, href="http://example.com/")
    description = LinkData(rel=Hyperlink.DESCRIPTION, content="foo")
    thumbnail = LinkData(
        rel=Hyperlink.THUMBNAIL_IMAGE,
        href="http://thumbnail.com/",
        media_type=Representation.JPEG_MEDIA_TYPE,
    )
    image_with_thumbnail = LinkData(
        rel=Hyperlink.IMAGE,
        href="http://example.com/",
        thumbnail=thumbnail,
        media_type=Representation.JPEG_MEDIA_TYPE,
    )

    circulation_data = CirculationData(
        DataSource.GUTENBERG,
        primary_identifier=IdentifierData(Identifier.GUTENBERG_ID, "1"),
        links=[oa_link, image, description, thumbnail,
               image_with_thumbnail],
    )

    # Only the open-access download link survives the filtering.
    filtered_links = sorted(circulation_data.links, key=lambda x: x.rel)
    assert [oa_link] == filtered_links
def test_rights_status_default_rights_from_data_source(self):
    # A DRM-encrypted link with no explicit rights information...
    link = LinkData(
        rel=Hyperlink.DRM_ENCRYPTED_DOWNLOAD,
        media_type=Representation.EPUB_MEDIA_TYPE,
        href=self._url,
    )
    # ...provided by the open-access content server.
    circulation_data = CirculationData(
        data_source=DataSource.OA_CONTENT_SERVER,
        primary_identifier=IdentifierData(Identifier.GUTENBERG_ID, "abcd"),
        links=[link],
    )

    pool, ignore = circulation_data.license_pool(
        self._db, self._default_collection)

    # This pool starts off as not being open-access.
    assert False == pool.open_access

    circulation_data.apply(
        self._db, pool.collection, ReplacementPolicy(formats=True))

    # The pool became open-access because it was given a
    # link that came from the OS content server.
    assert True == pool.open_access
    assert 1 == len(pool.delivery_mechanisms)

    # The rights status is the default for the OA content server.
    assert (RightsStatus.GENERIC_OPEN_ACCESS
            == pool.delivery_mechanisms[0].rights_status.uri)
def test_rights_status_open_access_link_with_rights(self):
    # An open-access link that carries its own rights URI.
    link = LinkData(
        rel=Hyperlink.OPEN_ACCESS_DOWNLOAD,
        media_type=Representation.EPUB_MEDIA_TYPE,
        href=self._url,
        rights_uri=RightsStatus.CC_BY_ND,
    )
    circulation_data = CirculationData(
        data_source=DataSource.OVERDRIVE,
        primary_identifier=IdentifierData(Identifier.OVERDRIVE_ID, "abcd"),
        links=[link],
    )

    pool, ignore = circulation_data.license_pool(
        self._db, self._default_collection)
    circulation_data.apply(
        self._db, pool.collection, ReplacementPolicy(formats=True))

    assert True == pool.open_access
    assert 1 == len(pool.delivery_mechanisms)
    # The rights URI from the link, not a default, ends up on the
    # delivery mechanism.
    assert (RightsStatus.CC_BY_ND
            == pool.delivery_mechanisms[0].rights_status.uri)
def test_rights_status_commercial_link_with_rights(self):
    # A DRM-encrypted download whose link and format both declare an
    # in-copyright rights status.
    link = LinkData(
        rel=Hyperlink.DRM_ENCRYPTED_DOWNLOAD,
        media_type=Representation.EPUB_MEDIA_TYPE,
        href=self._url,
        rights_uri=RightsStatus.IN_COPYRIGHT,
    )
    format = FormatData(
        content_type=link.media_type,
        drm_scheme=DeliveryMechanism.ADOBE_DRM,
        link=link,
        rights_uri=RightsStatus.IN_COPYRIGHT,
    )
    circulation_data = CirculationData(
        data_source=DataSource.OVERDRIVE,
        primary_identifier=IdentifierData(Identifier.OVERDRIVE_ID, "abcd"),
        links=[link],
        formats=[format],
    )

    pool, ignore = circulation_data.license_pool(
        self._db, self._default_collection)
    circulation_data.apply(
        self._db, pool.collection, ReplacementPolicy(formats=True))

    # A commercial link never makes the pool open-access.
    assert False == pool.open_access
    assert 1 == len(pool.delivery_mechanisms)
    assert (RightsStatus.IN_COPYRIGHT
            == pool.delivery_mechanisms[0].rights_status.uri)
def record_info_to_circulation(cls, availability):
    """Turn an Odilo availability dictionary into a CirculationData.

    Note: The json data passed into this method is from a different
    file/stream from the json data that goes into the
    record_info_to_metadata() method.

    :param availability: A dictionary of availability information.
    :return: A CirculationData, or None if there is no record ID.
    """
    if 'recordId' not in availability:
        return None

    primary_identifier = IdentifierData(
        Identifier.ODILO_ID, availability['recordId'])

    # 'licenses_reserved' counts patrons who put the book on hold
    # earlier, but who are now at the front of the queue and could get
    # the book right now if they wanted to.
    #
    # 'patrons_in_hold_queue' counts patrons currently waiting for a
    # copy of the book.
    return CirculationData(
        data_source=DataSource.ODILO,
        primary_identifier=primary_identifier,
        licenses_owned=int(availability['totalCopies']),
        licenses_available=int(availability['availableCopies']),
        licenses_reserved=int(availability.get('notifiedHolds', 0)),
        patrons_in_hold_queue=int(availability.get('holdsQueueSize', 0)),
    )
def test_circulationdata_can_be_deepcopied(self):
    # Check that we didn't put something in the CirculationData that
    # will prevent it from being copied. (e.g., self.log)
    #
    # (An unused SubjectData and ContributorData that were built here
    # but never passed to the CirculationData have been removed.)
    identifier = IdentifierData(Identifier.GUTENBERG_ID, "1")
    link = LinkData(Hyperlink.OPEN_ACCESS_DOWNLOAD, "example.epub")
    format = FormatData(Representation.EPUB_MEDIA_TYPE,
                        DeliveryMechanism.NO_DRM)
    rights_uri = RightsStatus.GENERIC_OPEN_ACCESS

    circulation_data = CirculationData(
        DataSource.GUTENBERG,
        primary_identifier=identifier,
        links=[link],
        licenses_owned=5,
        licenses_available=5,
        licenses_reserved=None,
        patrons_in_hold_queue=None,
        formats=[format],
        default_rights_uri=rights_uri,
    )

    circulation_data_copy = deepcopy(circulation_data)

    # If deepcopy didn't throw an exception we're ok.
    assert circulation_data_copy is not None
def test_has_open_access_link(self):
    circulationdata = CirculationData(
        DataSource.GUTENBERG,
        IdentifierData(Identifier.GUTENBERG_ID, "1"),
    )

    # With no links at all there can be no open-access link.
    assert False == circulationdata.has_open_access_link

    linkdata = LinkData(
        rel=Hyperlink.OPEN_ACCESS_DOWNLOAD,
        href=self._url,
    )
    circulationdata.links = [linkdata]

    # An open-access link with no explicit rights URI counts...
    assert True == circulationdata.has_open_access_link

    # ...but not when its rights URI contradicts open access...
    linkdata.rights_uri = RightsStatus.IN_COPYRIGHT
    assert False == circulationdata.has_open_access_link

    # ...and a consistent rights URI makes it count again.
    linkdata.rights_uri = RightsStatus.GENERIC_OPEN_ACCESS
    assert True == circulationdata.has_open_access_link
def test_availability_needs_update(self):
    """Test the logic that controls whether a LicensePool's availability
    information should actually be updated.
    """
    identifier = IdentifierData(Identifier.GUTENBERG_ID, "1")
    now = utc_now()
    yesterday = now - datetime.timedelta(days=1)

    fresh = CirculationData(DataSource.GUTENBERG, identifier)
    # CirculationData.last_checked defaults to the current time.
    assert (fresh.last_checked - now).total_seconds() < 10
    stale = CirculationData(
        DataSource.GUTENBERG, identifier, last_checked=yesterday)

    edition, pool = self._edition(with_license_pool=True)

    # A pool that has never been checked always needs to be updated.
    pool.last_checked = None
    assert True == fresh._availability_needs_update(pool)
    assert True == stale._availability_needs_update(pool)

    # A pool that has been checked before only needs to be updated
    # if the information is at least as new as what we had before.
    pool.last_checked = now
    assert True == fresh._availability_needs_update(pool)
    assert False == stale._availability_needs_update(pool)
def test_process_book_updates_old_licensepool(self):
    """If the LicensePool already exists, the circulation monitor
    updates it.
    """
    edition, licensepool = self._edition(
        with_license_pool=True,
        identifier_type=Identifier.AXIS_360_ID,
        identifier_id=u'0003642860')

    # We start off with availability information based on the
    # default for test data.
    eq_(1, licensepool.licenses_owned)

    identifier_data = IdentifierData(
        type=licensepool.identifier.type,
        identifier=licensepool.identifier.identifier)
    bare_metadata = Metadata(
        DataSource.AXIS_360, primary_identifier=identifier_data)

    monitor = Axis360CirculationMonitor(
        self._db, self.collection, api_class=MockAxis360API,
        metadata_client=MockMetadataWranglerOPDSLookup('url'))
    edition, licensepool = monitor.process_book(
        bare_metadata, self.AVAILABILITY_DATA)

    # Now we have information based on the CirculationData.
    eq_(9, licensepool.licenses_owned)
def book_info_to_metadata(self, subgraph, book_info): """Filters raw book information to exclude irrelevant or unhelpful data. :returns: None if information is unhelpful; metadata object otherwise. """ if not self._has_relevant_types(book_info): # This book is not available in any format we're # interested in from a metadata perspective. return None (oclc_id_type, oclc_id, titles, descriptions, subjects, creator_uris, publisher_names, publication_dates, example_uris) = self.extract_useful_data(subgraph, book_info) if not oclc_id_type or not oclc_id: return None self.log.info("Processing edition %s: %r", oclc_id, titles) metadata = Metadata(self.source) metadata.primary_identifier = IdentifierData(type=oclc_id_type, identifier=oclc_id) if titles: metadata.title = titles[0] for d in publication_dates: try: metadata.published = datetime.datetime.strptime(d[:4], "%Y") except Exception, e: pass
def recent_activity(self, start, end):
    """Find circulation events from a certain timeframe that affected
    loans or holds.

    :param start: A DateTime
    :param end: A DateTime
    :yield: A sequence of CirculationData objects.
    """
    # The Enki endpoint wants start/end as integer Unix timestamps.
    epoch = datetime.datetime.utcfromtimestamp(0)
    stime = int((start - epoch).total_seconds())
    etime = int((end - epoch).total_seconds())

    response = self.request(
        self.base_url + self.item_endpoint,
        params=dict(
            method='getRecentActivityTime',
            stime=str(stime),
            etime=str(etime),
        ),
    )
    data = json.loads(response.content)

    parser = BibliographicParser()
    for element in data['result']['recentactivity']:
        identifier = IdentifierData(Identifier.ENKI_ID, element['id'])
        yield parser.extract_circulation(
            identifier,
            element['availability'],
            None  # The recent activity API does not include format info
        )
def test_rights_status_default_rights_passed_in(self):
    # A link with no rights information of its own...
    link = LinkData(
        rel=Hyperlink.DRM_ENCRYPTED_DOWNLOAD,
        media_type=Representation.EPUB_MEDIA_TYPE,
        href=self._url,
    )
    # ...inside a CirculationData with an explicit default rights URI.
    circulation_data = CirculationData(
        data_source=DataSource.OA_CONTENT_SERVER,
        primary_identifier=IdentifierData(Identifier.GUTENBERG_ID, "abcd"),
        default_rights_uri=RightsStatus.CC_BY,
        links=[link],
    )

    pool, ignore = circulation_data.license_pool(
        self._db, self._default_collection)
    circulation_data.apply(
        self._db, pool.collection, ReplacementPolicy(formats=True))

    assert True == pool.open_access
    assert 1 == len(pool.delivery_mechanisms)
    # The rights status is the one that was passed in to CirculationData.
    assert (RightsStatus.CC_BY
            == pool.delivery_mechanisms[0].rights_status.uri)
def test_license_pool_sets_default_license_values(self):
    """We have no information about how many copies of the book we've
    actually licensed, but a LicensePool can be created anyway, so we
    can store format information.
    """
    drm_format = FormatData(
        content_type=Representation.PDF_MEDIA_TYPE,
        drm_scheme=DeliveryMechanism.ADOBE_DRM,
    )
    circulation = CirculationData(
        data_source=DataSource.OVERDRIVE,
        primary_identifier=IdentifierData(Identifier.OVERDRIVE_ID, "1"),
        formats=[drm_format],
    )

    pool, is_new = circulation.license_pool(
        self._db, self._default_collection)
    assert True == is_new
    assert self._default_collection == pool.collection

    # We start with the conservative assumption that we own no
    # licenses for the book.
    assert 0 == pool.licenses_owned
    assert 0 == pool.licenses_available
    assert 0 == pool.licenses_reserved
    assert 0 == pool.patrons_in_hold_queue
def _extract_isbns(self, book_info):
    """Collect IdentifierData for every ISBN among this book's
    synonymous manifestations.

    :param book_info: A dictionary of book information, which may
        contain a 'manifestations' list.
    :return: A list of IdentifierData objects of type ISBN.
    """
    # BUGFIX: default to an empty list. The original called
    # .get('manifestations') with no default, so a book with no
    # 'manifestations' key made the for-loop iterate over None,
    # raising TypeError.
    isbns = []
    for synonymous_id in book_info.get('manifestations', []):
        isbn = synonymous_id.get('ISBN')
        if isbn:
            isbns.append(IdentifierData(Identifier.ISBN, isbn))
    return isbns
def test_set_equivalence(self):
    edition = self._edition()
    edition.title = "The House on Mango Street"
    edition.add_contributor(
        Contributor(viaf="112460612"), Contributor.AUTHOR_ROLE)
    identifier = edition.primary_identifier

    # Metadata that matches the edition's title and contributor.
    i1 = self._identifier()
    good_metadata = Metadata(
        DataSource.lookup(self._db, DataSource.GUTENBERG),
        primary_identifier=IdentifierData(
            type=i1.type, identifier=i1.identifier),
        title="The House on Mango Street",
        contributors=[Contributor(viaf="112460612")])

    # Metadata that matches neither.
    i2 = self._identifier()
    bad_metadata = Metadata(
        DataSource.lookup(self._db, DataSource.GUTENBERG),
        primary_identifier=IdentifierData(
            type=i2.type, identifier=i2.identifier),
        title="Calvin & Hobbes",
        contributors=[Contributor(viaf="101010")])

    self.provider.set_equivalence(identifier, good_metadata)
    self.provider.set_equivalence(identifier, bad_metadata)
    equivalencies = Equivalency.for_identifiers(
        self._db, [identifier]).all()

    # The identifier for the bad metadata isn't made equivalent
    eq_([i1], [x.output for x in equivalencies])
    eq_([1], [x.strength for x in equivalencies])

    # But if the existing identifier has no editions, they're made
    # equivalent.
    identifier = self._identifier()
    self.provider.set_equivalence(identifier, bad_metadata)
    equivalencies = Equivalency.for_identifiers(
        self._db, [identifier]).all()
    eq_([i2], [x.output for x in equivalencies])
    eq_([1], [x.strength for x in equivalencies])
def test_circulationdata_may_require_collection(self):
    """Depending on the information provided in a CirculationData
    object, it might or might not be possible to call apply()
    without providing a Collection.
    """
    identifier = IdentifierData(Identifier.OVERDRIVE_ID, "1")
    format = FormatData(
        Representation.EPUB_MEDIA_TYPE,
        DeliveryMechanism.NO_DRM,
        rights_uri=RightsStatus.IN_COPYRIGHT,
    )
    circdata = CirculationData(
        DataSource.OVERDRIVE,
        primary_identifier=identifier,
        formats=[format],
    )
    circdata.apply(self._db, collection=None)

    # apply() has created a LicensePoolDeliveryMechanism for this
    # title, even though there are no LicensePools for it.
    identifier_obj, ignore = identifier.load(self._db)
    assert [] == identifier_obj.licensed_through
    [lpdm] = identifier_obj.delivery_mechanisms
    assert DataSource.OVERDRIVE == lpdm.data_source.name
    assert RightsStatus.IN_COPYRIGHT == lpdm.rights_status.uri

    mechanism = lpdm.delivery_mechanism
    assert Representation.EPUB_MEDIA_TYPE == mechanism.content_type
    assert DeliveryMechanism.NO_DRM == mechanism.drm_scheme

    # But if we put some information in the CirculationData
    # that can only be stored in a LicensePool, there's trouble.
    circdata.licenses_owned = 0
    with pytest.raises(ValueError) as excinfo:
        circdata.apply(self._db, collection=None)
    assert ("Cannot store circulation information because no Collection was provided." in str(excinfo.value))
def _fetch_remote_availability(self, identifiers):
    # Mock: only the first identifier in the list is still available;
    # Axis 360 has 'forgotten' the rest, so we stop after one yield.
    for identifier in identifiers:
        identifier_data = IdentifierData(
            type=identifier.type, identifier=identifier.identifier)
        metadata = Metadata(
            data_source=DataSource.AXIS_360,
            primary_identifier=identifier_data)
        availability = CirculationData(
            data_source=DataSource.AXIS_360,
            primary_identifier=identifier_data,
            licenses_owned=7,
            licenses_available=6)
        yield metadata, availability
        break
def process_item(self, identifier):
    """Check whether Enki still offers this title and, if it doesn't,
    reap its LicensePool by zeroing out its availability.

    :return: The CirculationData used for the reaping, or None if the
        title is still in the collection or was already reaped.
    """
    self.log.debug(
        "Seeing if %s needs reaping", identifier.identifier
    )
    metadata = self.api.get_item(identifier.identifier)
    if metadata:
        # This title is still in the collection. Do nothing.
        return

    # Get this collection's license pool for this identifier.
    # We'll reap it by setting its licenses_owned to 0.
    pool = identifier.licensed_through_collection(self.collection)
    if not pool or pool.licenses_owned == 0:
        # It's already been reaped.
        return

    if pool.presentation_edition:
        self.log.warn(
            "Removing %r from circulation", pool.presentation_edition
        )
    else:
        self.log.warn(
            "Removing unknown title %s from circulation.",
            identifier.identifier
        )

    circulationdata = CirculationData(
        data_source=DataSource.ENKI,
        primary_identifier=IdentifierData(
            identifier.type, identifier.identifier
        ),
        licenses_owned=0,
        licenses_available=0,
        patrons_in_hold_queue=0,
        last_checked=datetime.datetime.utcnow(),
    )
    circulationdata.apply(
        self._db,
        self.collection,
        replace=ReplacementPolicy.from_license_source(self._db)
    )
    return circulationdata
def reaper_request(self, identifier):
    """Ask Enki whether a book still exists and, if it doesn't, reap
    its LicensePool by zeroing out its availability.

    :param identifier: An Identifier carrying an Enki record ID.
    :return: The CirculationData used for the reaping, or None if the
        book still exists (or was already reaped).
    """
    self.log.debug(
        "Checking availability for " + str(identifier.identifier))
    now = datetime.datetime.utcnow()
    url = str(self.base_url) + str(self.item_endpoint)
    args = dict()
    args['method'] = "getItem"
    args['recordid'] = identifier.identifier
    args['size'] = "small"
    args['lib'] = self.library_id
    response = self.request(url, method='get', params=args)

    try:
        # If a book doesn't exist in Enki, we'll just get an HTML page
        # saying we did something wrong.
        json.loads(response.content)
        self.log.debug("Keeping existing book: " + str(identifier))
        return None
    except ValueError:
        # BUGFIX: this was a bare `except:`, which also swallowed
        # KeyboardInterrupt/SystemExit and genuine programming errors.
        # Only a JSON parse failure (ValueError, of which
        # JSONDecodeError is a subclass) means Enki no longer has the
        # book; fall through and reap it.
        pass

    # Get this collection's license pool for this identifier.
    pool = identifier.licensed_through_collection(self.collection)
    if not (pool and pool.licenses_owned > 0):
        return None

    if pool.presentation_edition:
        self.log.warn(
            "Removing %s (%s) from circulation",
            pool.presentation_edition.title,
            pool.presentation_edition.author)
    else:
        self.log.warn(
            "Removing unknown work %s from circulation.",
            identifier.identifier
        )

    circulationdata = CirculationData(
        data_source=DataSource.ENKI,
        primary_identifier=IdentifierData(
            EnkiAPI.ENKI_ID, identifier.identifier),
        licenses_owned=0,
        licenses_available=0,
        patrons_in_hold_queue=0,
        last_checked=now,
    )
    circulationdata.apply(
        self._db,
        self.collection,
        replace=ReplacementPolicy.from_license_source(self._db)
    )
    return circulationdata
def recent_activity(self, since):
    """Find recent circulation events that affected loans or holds.

    :param since: A DateTime
    :yield: A sequence of CirculationData objects.
    """
    response = self.request(
        self.base_url + self.item_endpoint,
        params=dict(
            method='getRecentActivity',
            minutes=self._minutes_since(since),
            lib=self.library_id,
        ),
    )
    data = json.loads(response.content)

    parser = BibliographicParser()
    for element in data['result']['recentactivity']:
        yield parser.extract_circulation(
            IdentifierData(Identifier.ENKI_ID, element['id']),
            element['availability'],
            None  # The recent activity API does not include format info
        )
def update_consolidated_copy(self, _db, copy_info, analytics=None):
    """Process information about the current status of a consolidated
    copy from the consolidated copies feed.
    """
    identifier_data = IdentifierData(
        Identifier.URI, copy_info.get("identifier"))
    circulation_data = CirculationData(
        data_source=self.data_source_name,
        primary_identifier=identifier_data,
        licenses_owned=copy_info.get("licenses"),
        licenses_available=copy_info.get("available"),
    )
    pool, ignore = circulation_data.apply(
        _db, self.collection(_db), ReplacementPolicy(analytics=analytics))

    # Update licenses reserved if there are holds.
    if len(pool.holds) > 0 and pool.licenses_available > 0:
        self.update_hold_queue(pool)
title = d.get('title', None) display_author = d.get('author', None) publisher = d.get('publisher', None) annotation = d.get('description', None) primary_isbn10 = d.get('primary_isbn10', None) primary_isbn13 = d.get('primary_isbn13', None) # The list of other ISBNs frequently contains ISBNs for # other books in the same series, as well as ISBNs that # are just wrong. Assign these equivalencies at a low # level of confidence. for isbn in d.get('isbns', []): isbn13 = isbn.get('isbn13', None) if isbn13: other_isbns.append( IdentifierData(Identifier.ISBN, isbn13, 0.50)) primary_isbn = primary_isbn13 or primary_isbn10 if primary_isbn: primary_isbn = IdentifierData(Identifier.ISBN, primary_isbn, 0.90) contributors = [] if display_author: contributors.append(ContributorData(display_name=display_author)) metadata = Metadata( data_source=DataSource.NYT, title=title, medium=medium, language='eng', published=published_date,
class TestCirculationMonitor(Axis360Test):

    # Fully-populated bibliographic information for one Axis 360 title,
    # used as process_book() input below.
    BIBLIOGRAPHIC_DATA = Metadata(
        DataSource.AXIS_360,
        publisher=u'Random House Inc',
        language='eng',
        title=u'Faith of My Fathers : A Family Memoir',
        imprint=u'Random House Inc2',
        published=datetime.datetime(2000, 3, 7, 0, 0),
        primary_identifier=IdentifierData(type=Identifier.AXIS_360_ID,
                                          identifier=u'0003642860'),
        identifiers=[
            IdentifierData(type=Identifier.ISBN,
                           identifier=u'9780375504587')
        ],
        contributors=[
            ContributorData(sort_name=u"McCain, John",
                            roles=[Contributor.PRIMARY_AUTHOR_ROLE]),
            ContributorData(sort_name=u"Salter, Mark",
                            roles=[Contributor.AUTHOR_ROLE]),
        ],
        subjects=[
            SubjectData(type=Subject.BISAC,
                        identifier=u'BIOGRAPHY & AUTOBIOGRAPHY / Political'),
            SubjectData(type=Subject.FREEFORM_AUDIENCE,
                        identifier=u'Adult'),
        ],
    )

    # Availability information for the same title.
    AVAILABILITY_DATA = CirculationData(
        data_source=DataSource.AXIS_360,
        primary_identifier=BIBLIOGRAPHIC_DATA.primary_identifier,
        licenses_owned=9,
        licenses_available=8,
        licenses_reserved=0,
        patrons_in_hold_queue=0,
        last_checked=datetime.datetime(2015, 5, 20, 2, 9, 8),
    )

    def test_process_book(self):
        # End-to-end test: process_book() turns the Metadata and
        # CirculationData above into an Edition and a LicensePool.
        integration, ignore = create(
            self._db, ExternalIntegration,
            goal=ExternalIntegration.ANALYTICS_GOAL,
            protocol="core.local_analytics_provider",
        )

        monitor = Axis360CirculationMonitor(
            self._db, self.collection, api_class=MockAxis360API,
            metadata_client=MockMetadataWranglerOPDSLookup('url'))
        edition, license_pool = monitor.process_book(
            self.BIBLIOGRAPHIC_DATA, self.AVAILABILITY_DATA)

        # The Edition reflects the bibliographic data.
        eq_(u'Faith of My Fathers : A Family Memoir', edition.title)
        eq_(u'eng', edition.language)
        eq_(u'Random House Inc', edition.publisher)
        eq_(u'Random House Inc2', edition.imprint)

        eq_(Identifier.AXIS_360_ID, edition.primary_identifier.type)
        eq_(u'0003642860', edition.primary_identifier.identifier)

        # The secondary ISBN identifier was made equivalent to the
        # primary identifier.
        [isbn] = [
            x for x in edition.equivalent_identifiers()
            if x is not edition.primary_identifier
        ]
        eq_(Identifier.ISBN, isbn.type)
        eq_(u'9780375504587', isbn.identifier)

        eq_(
            ["McCain, John", "Salter, Mark"],
            sorted([x.sort_name for x in edition.contributors]),
        )

        subs = sorted(
            (x.subject.type, x.subject.identifier)
            for x in edition.primary_identifier.classifications)
        eq_([(Subject.BISAC, u'BIOGRAPHY & AUTOBIOGRAPHY / Political'),
             (Subject.FREEFORM_AUDIENCE, u'Adult')], subs)

        # The LicensePool reflects the availability data.
        eq_(9, license_pool.licenses_owned)
        eq_(8, license_pool.licenses_available)
        eq_(0, license_pool.patrons_in_hold_queue)
        eq_(datetime.datetime(2015, 5, 20, 2, 9, 8),
            license_pool.last_checked)

        # Three circulation events were created, backdated to the
        # last_checked date of the license pool.
        events = license_pool.circulation_events
        eq_([
            u'distributor_title_add', u'distributor_check_in',
            u'distributor_license_add'
        ], [x.type for x in events])
        for e in events:
            eq_(e.start, license_pool.last_checked)

        # A presentation-ready work has been created for the
        # LicensePool.
        work = license_pool.work
        eq_(True, work.presentation_ready)
        eq_("Faith of My Fathers : A Family Memoir", work.title)

        # A CoverageRecord has been provided for this book in the Axis
        # 360 bibliographic coverage provider, so that in the future
        # it doesn't have to make a separate API request to ask about
        # this book.
        records = [
            x for x in license_pool.identifier.coverage_records
            if x.data_source.name == DataSource.AXIS_360
            and x.operation is None
        ]
        eq_(1, len(records))

    def test_process_book_updates_old_licensepool(self):
        """If the LicensePool already exists, the circulation monitor
        updates it.
        """
        edition, licensepool = self._edition(
            with_license_pool=True,
            identifier_type=Identifier.AXIS_360_ID,
            identifier_id=u'0003642860')

        # We start off with availability information based on the
        # default for test data.
        eq_(1, licensepool.licenses_owned)

        identifier = IdentifierData(
            type=licensepool.identifier.type,
            identifier=licensepool.identifier.identifier)
        metadata = Metadata(DataSource.AXIS_360,
                            primary_identifier=identifier)
        monitor = Axis360CirculationMonitor(
            self._db, self.collection, api_class=MockAxis360API,
            metadata_client=MockMetadataWranglerOPDSLookup('url'))
        edition, licensepool = monitor.process_book(
            metadata, self.AVAILABILITY_DATA)

        # Now we have information based on the CirculationData.
        eq_(9, licensepool.licenses_owned)
def extract_bibliographic(self, element):
    """Extract Metadata and CirculationData from a dictionary
    of information from Enki.

    :param element: A dictionary of data about one Enki title.
    :return: A Metadata with attached CirculationData.
    """
    # TODO: it's not clear what these are or whether we'd find them
    # useful:
    #  dateSaved
    #  length
    #  publishDate
    primary_identifier = IdentifierData(EnkiAPI.ENKI_ID, element["id"])

    identifiers = []
    identifiers.append(IdentifierData(Identifier.ISBN, element["isbn"]))

    contributors = []
    # Use the standard unknown-author marker when no author is given.
    sort_name = element.get("author", None) or Edition.UNKNOWN_AUTHOR
    contributors.append(ContributorData(sort_name=sort_name))

    links = []
    description = element.get("description")
    if description:
        links.append(
            LinkData(
                rel=Hyperlink.DESCRIPTION,
                content=description,
                media_type="text/html",
            )
        )

    # NOTE: When this method is called by, e.g. updated_titles(),
    # the large and small images are available separately. When
    # this method is called by get_item(), we only get a single
    # image, in 'cover'. In get_item() we ask that that image be
    # 'large', which means we'll be filing it as a normal-sized
    # image.
    full_image = None
    thumbnail_image = None
    for key, rel in (
            ("cover", Hyperlink.IMAGE),
            ("small_image", Hyperlink.THUMBNAIL_IMAGE),
            ("large_image", Hyperlink.IMAGE),
    ):
        url = element.get(key)
        if not url:
            continue
        link = LinkData(rel=rel, href=url,
                        media_type=Representation.PNG_MEDIA_TYPE)
        if rel == Hyperlink.THUMBNAIL_IMAGE:
            # Don't add a thumbnail to the list of links -- wait
            # until the end and then make it a thumbnail of the
            # primary image.
            thumbnail_image = link
        else:
            if rel == Hyperlink.IMAGE:
                full_image = link
            links.append(link)

    if thumbnail_image:
        if full_image:
            # Set the thumbnail as the thumbnail _of_ the full image.
            full_image.thumbnail = thumbnail_image
        else:
            # Treat the thumbnail as the full image.
            thumbnail_image.rel = Hyperlink.IMAGE
            links.append(thumbnail_image)

    # We treat 'subject', 'topic', and 'genre' as interchangeable
    # sets of tags. This data is based on BISAC but it's not reliably
    # presented in a form that can be parsed as BISAC.
    subjects = []
    seen_topics = set()
    for key in ("subject", "topic", "genre"):
        for topic in element.get(key, []):
            # Skip empty values and topics we've already recorded
            # under another key.
            if not topic or topic in seen_topics:
                continue
            subjects.append(
                SubjectData(
                    Subject.TAG,
                    topic,
                    weight=Classification.TRUSTED_DISTRIBUTOR_WEIGHT,
                )
            )
            seen_topics.add(topic)

    # Enki gives language as an English name (e.g. "English");
    # LANGUAGE_CODES maps it to a code, defaulting to "eng".
    language_code = element.get("language", "English")
    language = self.LANGUAGE_CODES.get(language_code, "eng")

    metadata = Metadata(
        data_source=DataSource.ENKI,
        title=element.get("title"),
        language=language,
        medium=Edition.BOOK_MEDIUM,
        publisher=element.get("publisher"),
        primary_identifier=primary_identifier,
        identifiers=identifiers,
        contributors=contributors,
        links=links,
        subjects=subjects,
    )
    circulationdata = self.extract_circulation(
        primary_identifier,
        element.get("availability", {}),
        element.get("formattype", None),
    )
    metadata.circulation = circulationdata
    return metadata
def record_info_to_metadata(cls, book, availability):
    """Turn Odilo's JSON representation of a book into a Metadata
    object.

    Note: The json data passed into this method is from a different
    file/stream from the json data that goes into the
    book_info_to_circulation() method.

    :param book: A dictionary of bibliographic information.
    :param availability: A dictionary of availability information,
        passed through to record_info_to_circulation().
    :return: A 2-tuple (Metadata, active), or None if the book has
        no 'id'.
    """
    if 'id' not in book:
        return None

    odilo_id = book['id']
    primary_identifier = IdentifierData(Identifier.ODILO_ID, odilo_id)
    # 'active' indicates whether the book is still in the collection;
    # it is used at the bottom to zero out availability.
    active = book.get('active')

    title = book.get('title')
    subtitle = book.get('subtitle')
    series = book.get('series')
    series_position = book.get('seriesPosition')

    contributors = []
    sort_author = book.get('author')
    if sort_author:
        roles = [Contributor.AUTHOR_ROLE]
        display_author = sort_name_to_display_name(sort_author)
        contributor = ContributorData(sort_name=sort_author,
                                      display_name=display_author,
                                      roles=roles,
                                      biography=None)
        contributors.append(contributor)

    publisher = book.get('publisher')

    # Metadata --> Marc21 260$c
    published = book.get('publicationDate')
    if not published:
        # yyyyMMdd --> record creation date
        published = book.get('releaseDate')

    if published:
        try:
            published = datetime.datetime.strptime(published, "%Y%m%d")
        except ValueError as e:
            # NOTE(review): e.message is Python 2 only.
            cls.log.warn('Cannot parse publication date from: ' +
                         published + ', message: ' + e.message)

    # yyyyMMdd --> record last modification date
    last_update = book.get('modificationDate')
    if last_update:
        try:
            last_update = datetime.datetime.strptime(last_update,
                                                     "%Y%m%d")
        except ValueError as e:
            cls.log.warn('Cannot parse last update date from: ' +
                         last_update + ', message: ' + e.message)

    # Odilo's catalog is primarily Spanish-language.
    language = book.get('language', 'spa')

    subjects = []
    for subject in book.get('subjects', []):
        subjects.append(
            SubjectData(type=Subject.TAG, identifier=subject,
                        weight=100))

    for subjectBisacCode in book.get('subjectsBisacCodes', []):
        subjects.append(
            SubjectData(type=Subject.BISAC,
                        identifier=subjectBisacCode, weight=100))

    grade_level = book.get('gradeLevel')
    if grade_level:
        subject = SubjectData(type=Subject.GRADE_LEVEL,
                              identifier=grade_level, weight=10)
        subjects.append(subject)

    # Work out the medium from the list of delivery formats.
    medium = None
    file_format = book.get('fileFormat')
    formats = []
    for format_received in book.get('formats', []):
        if format_received in cls.format_data_for_odilo_format:
            medium = cls.set_format(format_received, formats)
        elif format_received == cls.ACSM and file_format:
            # ACSM formats are further qualified by the file format,
            # e.g. ACSM_EPUB.
            medium = cls.set_format(
                format_received + '_' + file_format.upper(), formats)
        else:
            cls.log.warn('Unrecognized format received: ' +
                         format_received)

    if not medium:
        medium = Edition.BOOK_MEDIUM

    identifiers = []
    isbn = book.get('isbn')
    if isbn:
        # Normalize ISBN-10 to ISBN-13 before recording it.
        if isbnlib.is_isbn10(isbn):
            isbn = isbnlib.to_isbn13(isbn)
        identifiers.append(IdentifierData(Identifier.ISBN, isbn, 1))

    # A cover
    links = []
    cover_image_url = book.get('coverImageUrl')
    if cover_image_url:
        image_data = cls.image_link_to_linkdata(
            cover_image_url, Hyperlink.THUMBNAIL_IMAGE)
        if image_data:
            links.append(image_data)

    original_image_url = book.get('originalImageUrl')
    if original_image_url:
        image_data = cls.image_link_to_linkdata(
            original_image_url, Hyperlink.IMAGE)
        if image_data:
            links.append(image_data)

    # Descriptions become links.
    description = book.get('description')
    if description:
        links.append(
            LinkData(rel=Hyperlink.DESCRIPTION, content=description,
                     media_type="text/html"))

    metadata = Metadata(data_source=DataSource.ODILO,
                        title=title,
                        subtitle=subtitle,
                        language=language,
                        medium=medium,
                        series=series,
                        series_position=series_position,
                        publisher=publisher,
                        published=published,
                        primary_identifier=primary_identifier,
                        identifiers=identifiers,
                        subjects=subjects,
                        contributors=contributors,
                        links=links,
                        data_source_last_updated=last_update)

    metadata.circulation = OdiloRepresentationExtractor.record_info_to_circulation(
        availability)
    # 'active' --> means that the book exists but it's no longer in
    # the collection (it could be available again in the future)
    if not active:
        metadata.circulation.licenses_owned = 0
    metadata.circulation.formats = formats

    return metadata, active
def test_annotate_metadata(self):
    """Verify that annotate_metadata calls load_circulation_data and
    load_cover_link appropriately.
    """

    # Phase 1: annotation fails because no circulation data is found.
    class NoCirculationData(DirectoryImportScript):
        """Record the arguments to load_circulation_data and return
        nothing. Blow up if load_cover_link is ever reached.
        """
        def load_circulation_data(self, *args):
            self.load_circulation_data_args = args
            return None

        def load_cover_link(self, *args):
            raise Exception("Explode!")

    source = DataSource.lookup(self._db, DataSource.GUTENBERG)
    identifier = IdentifierData(Identifier.GUTENBERG_ID, "11111")
    identifier_obj, ignore = identifier.load(self._db)
    metadata = Metadata(
        title=self._str,
        data_source=source,
        primary_identifier=identifier
    )
    mirror = object()
    policy = ReplacementPolicy(mirror=mirror)
    cover_directory = object()
    ebook_directory = object()
    rights_uri = object()

    args = (metadata, policy, cover_directory, ebook_directory,
            rights_uri)
    script = NoCirculationData(self._db)
    script.annotate_metadata(*args)

    # load_circulation_data saw the expected arguments...
    expected = (identifier_obj, source, ebook_directory, mirror,
                metadata.title, rights_uri)
    eq_(expected, script.load_circulation_data_args)

    # ...but it returned None, so metadata.circulation was left
    # untouched and load_cover_link was never invoked (it would have
    # raised an exception).
    eq_(None, metadata.circulation)

    # Phase 2: circulation data is found but there is no cover image.
    class NoCoverLink(DirectoryImportScript):
        """Return circulation data; record the arguments to
        load_cover_link and return nothing.
        """
        def load_circulation_data(self, *args):
            return "Some circulation data"

        def load_cover_link(self, *args):
            self.load_cover_link_args = args
            return None

    script = NoCoverLink(self._db)
    script.annotate_metadata(*args)

    # The return value of load_circulation_data was stored on the
    # Metadata object.
    eq_("Some circulation data", metadata.circulation)

    # load_cover_link saw the expected arguments.
    eq_((identifier_obj, source, cover_directory, mirror),
        script.load_cover_link_args)

    # No cover link was provided, so metadata.links stayed empty.
    eq_([], metadata.links)

    # Phase 3: both circulation data and a cover link are found.
    class FullAnnotation(DirectoryImportScript):
        """Succeed at both load_circulation_data and load_cover_link."""
        def load_circulation_data(self, *args):
            return "Some circulation data"

        def load_cover_link(self, *args):
            return "A cover link"

    metadata.circulation = None
    script = FullAnnotation(self._db)
    script.annotate_metadata(*args)

    eq_("Some circulation data", metadata.circulation)
    eq_(['A cover link'], metadata.links)
def test_work_from_metadata(self):
    """Validate the ability to create a new Work from appropriate
    metadata.
    """

    class SpyScript(MockDirectoryImportScript):
        """Note that annotate_metadata was called, then defer to the
        real implementation (which, in this test, does nothing).
        """
        def annotate_metadata(self, metadata, *args, **kwargs):
            metadata.annotated = True
            return super(SpyScript, self).annotate_metadata(
                metadata, *args, **kwargs
            )

    identifier = IdentifierData(Identifier.GUTENBERG_ID, "1003")
    identifier_obj, ignore = identifier.load(self._db)
    metadata = Metadata(
        DataSource.GUTENBERG,
        primary_identifier=identifier,
        title=u"A book"
    )
    metadata.annotated = False
    datasource = DataSource.lookup(self._db, DataSource.GUTENBERG)
    policy = ReplacementPolicy.from_license_source(self._db)
    mirror = MockS3Uploader()
    policy.mirror = mirror

    # work_from_metadata calls annotate_metadata, but imports nothing,
    # because there are no files 'on disk' and thus no way to actually
    # get the book.
    collection = self._default_collection
    args = (collection, metadata, policy, "cover directory",
            "ebook directory", RightsStatus.CC0)
    script = SpyScript(self._db)
    eq_(None, script.work_from_metadata(*args))
    eq_(True, metadata.annotated)

    # Now try again with some files 'on disk'.
    with open(self.sample_cover_path('test-book-cover.png')) as fh:
        image = fh.read()
    mock_filesystem = {
        'cover directory': (
            'cover.jpg', Representation.JPEG_MEDIA_TYPE, image
        ),
        'ebook directory': (
            'book.epub', Representation.EPUB_MEDIA_TYPE, "I'm an EPUB."
        )
    }
    script = MockDirectoryImportScript(
        self._db, mock_filesystem=mock_filesystem
    )
    work = script.work_from_metadata(*args)

    # A book was created, with a cover image that has a thumbnail.
    eq_("A book", work.title)
    assert work.cover_full_url.endswith(
        '/test.cover.bucket/Gutenberg/Gutenberg+ID/1003/1003.jpg'
    )
    assert work.cover_thumbnail_url.endswith(
        '/test.cover.bucket/scaled/300/Gutenberg/Gutenberg+ID/1003/1003.png'
    )
    [pool] = work.license_pools
    assert pool.open_access_download_url.endswith(
        '/test.content.bucket/Gutenberg/Gutenberg+ID/1003/A+book.epub'
    )
    eq_(RightsStatus.CC0, pool.delivery_mechanisms[0].rights_status.uri)

    # The mock S3Uploader has a record of 'uploading' all these files
    # to S3.
    epub, full, thumbnail = mirror.uploaded
    eq_(epub.url, pool.open_access_download_url)
    eq_(full.url, work.cover_full_url)
    eq_(thumbnail.url, work.cover_thumbnail_url)

    # The EPUB Representation was cleared out after the upload, to
    # save database space.
    eq_("I'm an EPUB.", mirror.content[0])
    eq_(None, epub.content)
def test_viaf_authors_get_viaf_lookup(self):
    # TODO: The code this calls could be refactored quite a bit --
    # we don't really need to test all of process_item() here.
    # But ATM it does seem to be our only test of process_item().

    oclc = MockOCLCLinkedDataAPI()
    viaf = MockVIAFClient()
    provider = LinkedDataCoverageProvider(
        self._db, api=oclc, viaf_api=viaf
    )

    # Start from an edition stripped of all its contributions -- a
    # placeholder to be filled in from OCLC Linked Data.
    edition = self._edition()
    for contribution in edition.contributions:
        self._db.delete(contribution)
    self._db.commit()
    identifier = edition.primary_identifier

    # OCLC Linked Data is going to mention two authors -- one with
    # a sort name + VIAF, and one with a VIAF but no sort name.
    viaf_only = ContributorData(viaf="1")
    viaf_and_name = ContributorData(viaf="2", sort_name="Jordan, Robert")
    name_only = ContributorData(sort_name="Rice, Anne",
                                display_name="Anne Rice")
    idata = IdentifierData(type=identifier.type,
                           identifier=identifier.identifier)
    metadata = Metadata(
        DataSource.OCLC_LINKED_DATA,
        contributors=[viaf_only, viaf_and_name, name_only],
        primary_identifier=idata,
        title=u"foo")
    oclc.queue_info_for(metadata)

    # The OCLC Linked Data client will ask VIAF about the contributors
    # that carry VIAF data -- and only those.
    lookup1 = (ContributorData(
        viaf="1", display_name="Display Name",
        family_name="Family", sort_name="Name, Sort",
        wikipedia_name="Wikipedia_Name"), None, None)
    lookup2 = (ContributorData(
        viaf="2", wikipedia_name="Robert_Jordan_(Author)",
        biography="That guy."), None, None)
    viaf.queue_lookup(lookup1, lookup2, "Unrequested lookup")

    provider.process_item(identifier)

    # Both VIAF-identified authors have had their information updated
    # with the VIAF results.
    filled_in = sorted(
        (x.sort_name, x.display_name, x.viaf, x.wikipedia_name,
         x.biography)
        for x in edition.contributors
    )
    eq_([(u'Jordan, Robert', None, u'2', u'Robert_Jordan_(Author)',
          u'That guy.'),
         (u'Name, Sort', u'Display Name', u'1', u'Wikipedia_Name', None),
         (u'Rice, Anne', u'Anne Rice', None, None, None)],
        filled_in)

    # The author without VIAF data triggered no VIAF lookup; that
    # result is still sitting in the mock VIAF queue.
    eq_(viaf.results, ["Unrequested lookup"])
def test_rights_status_open_access_link_no_rights_uses_data_source_default(
        self):
    # An open-access link with no explicit rights information takes on
    # the default rights status of its data source.
    gutenberg_identifier = IdentifierData(
        Identifier.GUTENBERG_ID,
        "abcd",
    )

    # Here's a CirculationData that will create an open-access
    # LicensePoolDeliveryMechanism.
    gutenberg_link = LinkData(
        rel=Hyperlink.OPEN_ACCESS_DOWNLOAD,
        media_type=Representation.EPUB_MEDIA_TYPE,
        href=self._url,
    )
    gutenberg_data = CirculationData(
        data_source=DataSource.GUTENBERG,
        primary_identifier=gutenberg_identifier,
        links=[gutenberg_link],
    )

    replace_formats = ReplacementPolicy(formats=True)

    pool, ignore = gutenberg_data.license_pool(
        self._db, self._default_collection)
    pool.open_access = False

    # Applying this CirculationData to a LicensePool makes it
    # open-access.
    gutenberg_data.apply(self._db, pool.collection, replace_formats)
    assert pool.open_access == True
    assert len(pool.delivery_mechanisms) == 1

    # The delivery mechanism's rights status is the default for
    # the data source.
    assert (pool.delivery_mechanisms[0].rights_status.uri
            == RightsStatus.PUBLIC_DOMAIN_USA)

    # Even if a commercial source like Overdrive should offer a
    # link with rel="open access", unless we know it's an
    # open-access link we will give it a RightsStatus of
    # IN_COPYRIGHT.
    overdrive_identifier = IdentifierData(
        Identifier.OVERDRIVE_ID,
        "abcd",
    )
    overdrive_link = LinkData(
        rel=Hyperlink.OPEN_ACCESS_DOWNLOAD,
        media_type=Representation.EPUB_MEDIA_TYPE,
        href=self._url,
    )
    overdrive_data = CirculationData(
        data_source=DataSource.OVERDRIVE,
        primary_identifier=overdrive_identifier,
        links=[overdrive_link],
    )

    pool, ignore = overdrive_data.license_pool(
        self._db, self._default_collection)
    pool.open_access = False
    overdrive_data.apply(self._db, pool.collection, replace_formats)
    assert (pool.delivery_mechanisms[0].rights_status.uri
            == RightsStatus.IN_COPYRIGHT)
    assert pool.open_access == False
def add_with_metadata(self, collection_details):
    """Adds identifiers with their metadata to a Collection's catalog.

    Parses the request body as an OPDS feed; each entry's URN is
    either rejected (invalid URN), added to the collection's catalog,
    or reported as already present. Valid entries also get an Edition
    created from the entry's title/author/language/cover links.

    :param collection_details: Used to look up the Collection whose
        catalog is being updated.
    :return: A feed response of OPDSMessages, one per submitted URN,
        or a ProblemDetail if the client is not authenticated.
    """
    client = authenticated_client_from_request(self._db)
    if isinstance(client, ProblemDetail):
        return client
    collection = collection_from_details(
        self._db, client, collection_details)

    data_source = DataSource.lookup(
        self._db, collection.name, autocreate=True)

    feed = feedparser.parse(request.data)
    entries = feed.get("entries", [])
    entries_by_urn = {entry.get('id'): entry for entry in entries}

    identifiers_by_urn, invalid_urns = Identifier.parse_urns(
        self._db, entries_by_urn.keys())

    # Fix: `messages` was previously initialized twice (a dead
    # `messages = []` followed by `messages = list()`).
    messages = [
        OPDSMessage(urn, INVALID_URN.status_code, INVALID_URN.detail)
        for urn in invalid_urns
    ]

    # The relevant cover rels never change, so build the set once
    # instead of once per entry.
    image_types = {Hyperlink.IMAGE, Hyperlink.THUMBNAIL_IMAGE}

    for urn, identifier in identifiers_by_urn.items():
        entry = entries_by_urn[urn]
        status = HTTP_OK
        description = "Already in catalog"
        if identifier not in collection.catalog:
            collection.catalog_identifier(identifier)
            status = HTTP_CREATED
            description = "Successfully added"
        message = OPDSMessage(urn, status, description)

        # Get a cover if it exists.
        images = [
            l for l in entry.get("links", [])
            if l.get("rel") in image_types
        ]
        links = [
            LinkData(image.get("rel"), image.get("href"))
            for image in images
        ]

        # Create an edition to hold the title and author.
        # LicensePool.calculate_work refuses to create a Work when
        # there's no title, and if we have a title, author and
        # language we can attempt to look up the edition in OCLC.
        title = entry.get("title") or "Unknown Title"
        author = ContributorData(
            sort_name=(entry.get("author") or Edition.UNKNOWN_AUTHOR),
            roles=[Contributor.PRIMARY_AUTHOR_ROLE]
        )
        language = entry.get("dcterms_language")

        # Apply the metadata without triggering any expensive
        # presentation recalculation.
        presentation = PresentationCalculationPolicy(
            choose_edition=False,
            set_edition_metadata=False,
            classify=False,
            choose_summary=False,
            calculate_quality=False,
            choose_cover=False,
            regenerate_opds_entries=False,
        )
        replace = ReplacementPolicy(
            presentation_calculation_policy=presentation)
        metadata = Metadata(
            data_source,
            primary_identifier=IdentifierData(
                identifier.type, identifier.identifier),
            title=title,
            language=language,
            contributors=[author],
            links=links,
        )

        edition, ignore = metadata.edition(self._db)
        metadata.apply(edition, collection, replace=replace)

        messages.append(message)

    title = "%s Catalog Item Additions for %s" % (
        collection.protocol, client.url)
    url = self.collection_feed_url("add_with_metadata", collection)
    addition_feed = AcquisitionFeed(
        self._db, title, url, [], VerboseAnnotator,
        precomposed_entries=messages)

    return feed_response(addition_feed)