def process_work(self, work):
    primary_identifier_ids = [
        x.primary_identifier.id for x in work.editions
    ]
    data = Identifier.recursively_equivalent_identifier_ids(
        self._db, primary_identifier_ids, 5, threshold=0.5
    )
    flattened_data = Identifier.flatten_identifier_ids(data)
    workgenres, work.fiction, work.audience, target_age = work.assign_genres(
        flattened_data
    )
    old_target_age = work.target_age
    work.target_age = NumericRange(*target_age)
    if work.target_age != old_target_age and work.target_age.lower is not None:
        print "%r: %r->%r" % (work.title, old_target_age, work.target_age)
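The snippet relies on NumericRange equality and on its .lower attribute. A minimal sketch of that comparison, assuming NumericRange is psycopg2.extras.NumericRange (the import is not shown in this excerpt):

from psycopg2.extras import NumericRange

old_target_age = NumericRange(9, 12, '[]')   # ages 9-12, inclusive bounds
new_target_age = NumericRange(10, 12, '[]')

# Equality is value-based, and .lower is None for an unbounded range,
# which is why the snippet checks it before logging a change.
if new_target_age != old_target_age and new_target_age.lower is not None:
    print("%r -> %r" % (old_target_age, new_target_age))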
def test_confirm_same_identifier(self):
    source = DataSource.lookup(self._db, DataSource.NOVELIST)
    identifier, ignore = Identifier.for_foreign_id(
        self._db, Identifier.NOVELIST_ID, '84752928'
    )
    unmatched_identifier, ignore = Identifier.for_foreign_id(
        self._db, Identifier.NOVELIST_ID, '23781947'
    )
    metadata = Metadata(source, primary_identifier=identifier)
    match = Metadata(source, primary_identifier=identifier)
    mistake = Metadata(source, primary_identifier=unmatched_identifier)

    eq_(False, self.novelist._confirm_same_identifier([metadata, mistake]))
    eq_(True, self.novelist._confirm_same_identifier([metadata, match]))
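The test above exercises _confirm_same_identifier only through the NoveList API object. Its apparent contract -- every Metadata object in a batch must share one primary identifier -- can be sketched without a database; the helper below is a hypothetical stand-in, not the NoveList implementation:

def confirm_same_identifier(metadata_objects):
    # True only if every object in the batch has the same primary identifier.
    return len(set(m.primary_identifier for m in metadata_objects)) == 1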
def page(cls, _db, title, url, annotator=None, use_materialized_works=True):
    """Create a feed of content to preload on devices."""
    configured_content = Configuration.policy(Configuration.PRELOADED_CONTENT)

    identifiers = [Identifier.parse_urn(_db, urn)[0] for urn in configured_content]
    identifier_ids = [identifier.id for identifier in identifiers]

    if use_materialized_works:
        from core.model import MaterializedWork
        q = _db.query(MaterializedWork)
        q = q.filter(MaterializedWork.primary_identifier_id.in_(identifier_ids))

        # Avoid eager loading of objects that are contained in the
        # materialized view.
        q = q.options(
            lazyload(MaterializedWork.license_pool, LicensePool.data_source),
            lazyload(MaterializedWork.license_pool, LicensePool.identifier),
            lazyload(MaterializedWork.license_pool, LicensePool.edition),
        )
    else:
        q = _db.query(Work).join(Work.primary_edition)
        q = q.filter(Edition.primary_identifier_id.in_(identifier_ids))

    works = q.all()
    feed = cls(_db, title, url, works, annotator)

    annotator.annotate_feed(feed, None)
    content = unicode(feed)
    return content
def get_identifiers(self, url=None):
    """Pulls mapped identifiers from a feed of SimplifiedOPDSMessages."""
    response = self.get_response(url=url)
    feed = response.text

    etree_feed = etree.parse(StringIO(response.text))
    messages = self.importer.extract_messages(self.parser, etree_feed)

    urns = [m.urn for m in messages]
    identifiers_by_urn, _failures = Identifier.parse_urns(
        self._db, urns, autocreate=False
    )
    urns = identifiers_by_urn.keys()
    identifiers = identifiers_by_urn.values()
    self.importer.build_identifier_mapping(urns)

    mapped_identifiers = list()
    for identifier in identifiers:
        mapped_identifier = self.importer.identifier_mapping.get(
            identifier, identifier
        )
        mapped_identifiers.append(mapped_identifier)

    parsed_feed = feedparser.parse(feed)
    next_links = self.importer.extract_next_links(parsed_feed)
    return mapped_identifiers, next_links
def process_urn(self, urn, collection=None, **kwargs):
    """Turn a URN into a Work suitable for use in an OPDS feed."""
    try:
        identifier, is_new = Identifier.parse_urn(self._db, urn)
    except ValueError as e:
        identifier = None
def oclc_number_for_isbn(self, isbn):
    """Turn an ISBN identifier into an OCLC Number identifier."""
    # Let's pretend any id can be an oclc id.
    oclc_number = isbn.identifier
    oclc_identifier, made_new = Identifier.for_foreign_id(
        self._db, Identifier.OCLC_NUMBER, oclc_number, autocreate=True)
    return oclc_identifier
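Both implementations of oclc_number_for_isbn in this collection depend on Identifier.for_foreign_id returning an (identifier, is_new) pair. A dictionary-backed sketch of that get-or-create convention, purely illustrative rather than the SQLAlchemy-backed original:

_registry = {}

def for_foreign_id(id_type, foreign_id, autocreate=True):
    # Return (identifier, is_new), mirroring the convention used above.
    key = (id_type, foreign_id)
    if key in _registry:
        return _registry[key], False
    if not autocreate:
        return None, False
    _registry[key] = {"type": id_type, "identifier": foreign_id}
    return _registry[key], True

identifier, made_new = for_foreign_id("OCLC Number", "1862341597")
assert made_new is True
identifier, made_new = for_foreign_id("OCLC Number", "1862341597")
assert made_new is False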
def test_run_once(self):
    # Setup authentication and Metadata Wrangler details.
    lp = self._licensepool(
        None, data_source_name=DataSource.BIBLIOTHECA,
        collection=self.collection
    )
    lp.identifier.type = Identifier.BIBLIOTHECA_ID
    isbn = Identifier.parse_urn(self._db, u'urn:isbn:9781594632556')[0]
    lp.identifier.equivalent_to(
        DataSource.lookup(self._db, DataSource.BIBLIOTHECA), isbn, 1
    )
    eq_([], lp.identifier.links)
    eq_([], lp.identifier.measurements)

    # Queue some data to be found.
    responses = (
        'metadata_updates_response.opds',
        'metadata_updates_empty_response.opds',
    )
    for filename in responses:
        data = sample_data(filename, 'opds')
        self.lookup.queue_response(
            200, {'content-type' : OPDSFeed.ACQUISITION_FEED_TYPE}, data
        )

    timestamp = self.ts
    new_timestamp = self.monitor.run_once(timestamp)

    # We have a new value to use for the Monitor's timestamp -- the
    # earliest date seen in the last OPDS feed that contained
    # any entries.
    eq_(datetime.datetime(2016, 9, 20, 19, 37, 2), new_timestamp.finish)
    eq_("Editions processed: 1", new_timestamp.achievements)

    # Normally run_once() doesn't update the monitor's timestamp,
    # but this implementation does, so that work isn't redone if
    # run_once() crashes or the monitor is killed.
    eq_(new_timestamp.finish, self.monitor.timestamp().finish)

    # The original Identifier has information from the
    # mock Metadata Wrangler.
    mw_source = DataSource.lookup(self._db, DataSource.METADATA_WRANGLER)
    eq_(3, len(lp.identifier.links))
    [quality] = lp.identifier.measurements
    eq_(mw_source, quality.data_source)

    # Check the URLs we processed.
    url1, url2 = [x[0] for x in self.lookup.requests]

    # The first URL processed was the default one for the
    # MetadataWranglerOPDSLookup.
    eq_(self.lookup.get_collection_url(self.lookup.UPDATES_ENDPOINT), url1)

    # The second URL processed was whatever we saw in the 'next' link.
    eq_("http://next-link/", url2)
def remove_items(self, collection_details):
    """Removes identifiers from a Collection's catalog"""
    client = authenticated_client_from_request(self._db)
    if isinstance(client, ProblemDetail):
        return client

    collection = collection_from_details(
        self._db, client, collection_details
    )

    urns = request.args.getlist('urn')
    messages = []
    identifiers_by_urn, failures = Identifier.parse_urns(self._db, urns)
    for urn in failures:
        message = OPDSMessage(
            urn, INVALID_URN.status_code, INVALID_URN.detail
        )
        messages.append(message)

    # Find the IDs of the subset of provided identifiers that are
    # in the catalog, so we know which ones to delete and give a
    # 200 message. Also get a SQLAlchemy clause that selects only
    # those IDs.
    matching_ids, identifier_match_clause = self._in_catalog_subset(
        collection, identifiers_by_urn
    )

    # Use that clause to delete all of the relevant catalog
    # entries.
    delete_stmt = collections_identifiers.delete().where(
        identifier_match_clause
    )
    self._db.execute(delete_stmt)

    # IDs that matched get a 200 message; all others get a 404
    # message.
    for urn, identifier in identifiers_by_urn.items():
        if identifier.id in matching_ids:
            status = HTTP_OK
            description = "Successfully removed"
        else:
            status = HTTP_NOT_FOUND
            description = "Not in catalog"
        message = OPDSMessage(urn, status, description)
        messages.append(message)

    title = "%s Catalog Item Removal for %s" % (collection.protocol, client.url)
    url = self.collection_feed_url("remove", collection, urn=urns)
    removal_feed = AcquisitionFeed(
        self._db, title, url, [], VerboseAnnotator,
        precomposed_entries=messages
    )
    return feed_response(removal_feed)
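remove_items issues one SQLAlchemy Core DELETE instead of loading and deleting ORM objects one at a time. A self-contained sketch of that pattern against an in-memory SQLite table (the table and column names are illustrative, not the real schema):

from sqlalchemy import Column, Integer, MetaData, Table, create_engine

engine = create_engine("sqlite://")
metadata = MetaData()
catalog = Table(
    "collections_identifiers", metadata,
    Column("collection_id", Integer),
    Column("identifier_id", Integer),
)
metadata.create_all(engine)

with engine.connect() as conn:
    conn.execute(catalog.insert(), [
        {"collection_id": 1, "identifier_id": 10},
        {"collection_id": 1, "identifier_id": 11},
    ])
    # A single DELETE removes every matching row; no per-row round trips.
    match_clause = catalog.c.identifier_id.in_([10, 11])
    conn.execute(catalog.delete().where(match_clause))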
def parse_identifier(self, urn):
    """Try to parse a URN into an identifier.

    :return: An Identifier if possible; otherwise None.
    """
    if not urn:
        return None
    try:
        identifier, is_new = Identifier.parse_urn(self._db, urn, False)
    except ValueError as e:
        # The identifier is parseable but invalid, e.g. an
        # ASIN used as an ISBN. Ignore it.
        return None
    return identifier
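parse_urn raises ValueError for a URN that is well-formed but carries an invalid identifier, such as a bad ISBN. For reference, the ISBN-13 check-digit math involved in that kind of validation (a standalone sketch, not the library's validator):

def is_valid_isbn13(isbn):
    # ISBN-13 checksum: digits weighted 1,3,1,3,... must sum to 0 mod 10.
    digits = [int(c) for c in isbn if c.isdigit()]
    if len(digits) != 13:
        return False
    total = sum(d * (3 if i % 2 else 1) for i, d in enumerate(digits))
    return total % 10 == 0

assert is_valid_isbn13("9780804171335")      # the ISBN used in tests below
assert not is_valid_isbn13("9780804171336")  # bad check digit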
def add_items(self, collection_details):
    """Adds identifiers to a Collection's catalog"""
    client = authenticated_client_from_request(self._db)
    if isinstance(client, ProblemDetail):
        return client

    collection = collection_from_details(
        self._db, client, collection_details
    )

    urns = request.args.getlist('urn')
    messages = []
    identifiers_by_urn, failures = Identifier.parse_urns(self._db, urns)
    for urn in failures:
        message = OPDSMessage(
            urn, INVALID_URN.status_code, INVALID_URN.detail
        )
        messages.append(message)

    # Find the subset of incoming identifiers that are already
    # in the catalog.
    already_in_catalog, ignore = self._in_catalog_subset(
        collection, identifiers_by_urn
    )

    # Everything else needs to be added to the catalog.
    needs_to_be_added = [
        x for x in identifiers_by_urn.values()
        if x.id not in already_in_catalog
    ]
    collection.catalog_identifiers(needs_to_be_added)

    for urn, identifier in identifiers_by_urn.items():
        if identifier.id in already_in_catalog:
            status = HTTP_OK
            description = "Already in catalog"
        else:
            status = HTTP_CREATED
            description = "Successfully added"
        messages.append(OPDSMessage(urn, status, description))

    title = "%s Catalog Item Additions for %s" % (collection.protocol, client.url)
    url = self.collection_feed_url('add', collection, urn=urns)
    addition_feed = AcquisitionFeed(
        self._db, title, url, [], VerboseAnnotator,
        precomposed_entries=messages
    )
    return feed_response(addition_feed)
def handle_event(self, threem_id, isbn, foreign_patron_id,
                 start_time, end_time, internal_event_type):
    # Find or lookup the LicensePool for this event.
    license_pool, is_new = LicensePool.for_foreign_id(
        self._db, self.api.source, Identifier.THREEM_ID, threem_id)

    if is_new:
        # Immediately acquire bibliographic coverage for this book.
        # This will set the DistributionMechanisms and make the
        # book presentation-ready. However, its circulation information
        # might not be up to date until we process some more events.
        record = self.bibliographic_coverage_provider.ensure_coverage(
            license_pool.identifier, force=True
        )

    threem_identifier = license_pool.identifier
    isbn, ignore = Identifier.for_foreign_id(
        self._db, Identifier.ISBN, isbn)

    edition, ignore = Edition.for_foreign_id(
        self._db, self.api.source, Identifier.THREEM_ID, threem_id)

    # The ISBN and the 3M identifier are exactly equivalent.
    threem_identifier.equivalent_to(self.api.source, isbn, strength=1)

    # Log the event.
    event, was_new = get_one_or_create(
        self._db, CirculationEvent,
        license_pool=license_pool,
        type=internal_event_type,
        start=start_time,
        foreign_patron_id=foreign_patron_id,
        create_method_kwargs=dict(delta=1, end=end_time)
    )

    # If this is our first time seeing this LicensePool, log its
    # occurrence as a separate event.
    if is_new:
        event, ignore = get_one_or_create(
            self._db, CirculationEvent,
            type=CirculationEvent.TITLE_ADD,
            license_pool=license_pool,
            create_method_kwargs=dict(
                start=license_pool.last_checked or start_time,
                delta=1,
                end=license_pool.last_checked or end_time,
            )
        )

    title = edition.title or "[no title]"
    self.log.info("%r %s: %s", start_time, title, internal_event_type)
    return start_time
def cover_links(cls, work):
    """The content server sends out _all_ cover links for the work.

    For books covered by Gutenberg Illustrated, this can be over
    a hundred cover links.
    """
    _db = Session.object_session(work)
    ids = work.all_identifier_ids()
    image_resources = Identifier.resources_for_identifier_ids(
        _db, ids, Resource.IMAGE)
    thumbnails = []
    full = []
    for cover in image_resources:
        if cover.mirrored_path:
            full.append(cover.mirrored_path)
        if cover.scaled_path:
            thumbnails.append(cover.scaled_path)
    return thumbnails, full
def oclc_number_for_isbn(self, isbn):
    """Turn an ISBN identifier into an OCLC Number identifier."""
    url = self.ISBN_BASE_URL % dict(id=isbn.identifier)
    representation, cached = Representation.get(
        self._db, url, Representation.http_get_no_redirect)
    if not representation.location:
        raise IOError(
            "Expected %s to redirect, but couldn't find location." % url
        )

    location = representation.location
    match = self.URI_WITH_OCLC_NUMBER.match(location)
    if not match:
        raise IOError(
            "OCLC redirected ISBN lookup, but I couldn't make sense "
            "of the destination, %s" % location)
    oclc_number = match.groups()[0]
    return Identifier.for_foreign_id(
        self._db, Identifier.OCLC_NUMBER, oclc_number)[0]
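URI_WITH_OCLC_NUMBER itself is not shown in this collection. A plausible stand-in that would satisfy the match above -- the exact pattern is an assumption -- along with a quick check:

import re

# Hypothetical version of the class's URI_WITH_OCLC_NUMBER pattern.
URI_WITH_OCLC_NUMBER = re.compile(r"^https?://(?:www\.)?worldcat\.org/oclc/(\d+)")

location = "http://www.worldcat.org/oclc/1862341597"
match = URI_WITH_OCLC_NUMBER.match(location)
assert match is not None
assert match.groups()[0] == "1862341597"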
def handle_import_messages(self, messages_by_id):
    """Turn import messages from the OPDS importer into CoverageFailure
    objects.
    """
    for identifier, message in messages_by_id.items():
        # If the message indicates success but we didn't actually
        # get the data, treat it as a transient error.
        #
        # If the message does not indicate success, create a
        # CoverageRecord with the error so we stop trying this
        # book.
        if not message.success:
            exception = str(message.status_code)
            if message.message:
                exception += ": %s" % message.message
            transient = message.transient
            identifier_obj, ignore = Identifier.parse_urn(self._db, identifier)
            yield CoverageFailure(self, identifier_obj, exception, transient)
def process_item(self, work):
    try:
        content_item = self.content_item_from_work(work)
        result = self.api.create_content_item(content_item)
    except Exception as e:
        return CoverageFailure(
            work, str(e), data_source=self.data_source, transient=True
        )

    content_item_id = result.get('contentItemId')
    bibblio_identifier, _is_new = Identifier.for_foreign_id(
        self._db, Identifier.BIBBLIO_CONTENT_ITEM_ID, content_item_id
    )

    identifier = work.presentation_edition.primary_identifier
    identifier.equivalent_to(self.data_source, bibblio_identifier, 1)

    return work
def test_process_urn_isbn(self):
    # Create a new ISBN identifier.
    # Ask online providers for metadata to turn into an OPDS feed about
    # this identifier.
    # Make sure a coverage record was created, and a 201 status obtained
    # from the provider.
    # Ask the online provider again, and make sure we're now getting a
    # 202 "working on it" status.
    # Ask again, this time getting a result. Make sure we know we got
    # a result.

    isbn, ignore = Identifier.for_foreign_id(
        self._db, Identifier.ISBN, self._isbn
    )

    # The first time we look up an ISBN a CoverageRecord is created
    # representing the work to be done.
    self.controller.process_urn(isbn.urn)
    self.assert_one_message(
        isbn.urn, HTTP_CREATED, self.controller.IDENTIFIER_REGISTERED
    )

    [record] = isbn.coverage_records
    eq_(record.exception, self.controller.NO_WORK_DONE_EXCEPTION)
    eq_(record.status, CoverageRecord.TRANSIENT_FAILURE)

    # So long as the necessary coverage is not provided,
    # future lookups will not provide useful information.
    self.controller.precomposed_entries = []
    self.controller.process_urn(isbn.urn)
    self.assert_one_message(
        isbn.urn, HTTP_ACCEPTED, self.controller.WORKING_TO_RESOLVE_IDENTIFIER
    )

    # Let's provide the coverage.
    metadata_sources = DataSource.metadata_sources_for(
        self._db, isbn
    )
    for source in metadata_sources:
        CoverageRecord.add_for(isbn, source)

    # Process the ISBN again, and we get an <entry> tag with the
    # information.
    self.controller.precomposed_entries = []
    self.controller.process_urn(isbn.urn)
    expect = isbn.opds_entry()
    [actual] = self.controller.precomposed_entries
    eq_(etree.tostring(expect), etree.tostring(actual))
def lookup(self, identifier_or_uri, processed_uris=set()):
    """Perform an OCLC Open Data lookup for the given identifier."""
    type = None
    identifier = None
    if isinstance(identifier_or_uri, basestring):
        # e.g. http://experiment.worldcat.org/oclc/1862341597.json
        match = self.URI_WITH_OCLC_NUMBER.search(identifier_or_uri)
        if match:
            type = Identifier.OCLC_NUMBER
            id = match.groups()[0]
        if not type or not id:
            return None, None
        identifier, is_new = Identifier.for_foreign_id(
            self._db, type, id)
    else:
        identifier = identifier_or_uri
        type = identifier.type
    if not type or not identifier:
        return None, None
    return self.lookup_by_identifier(identifier, processed_uris)
def remove_items(self):
    collection = self.authenticated_collection_from_request()
    if isinstance(collection, ProblemDetail):
        return collection

    urns = request.args.getlist('urn')
    messages = []
    for urn in urns:
        message = None
        identifier = None
        try:
            identifier, ignore = Identifier.parse_urn(self._db, urn)
        except Exception as e:
            identifier = None

        if not identifier:
            message = OPDSMessage(
                urn, INVALID_URN.status_code, INVALID_URN.detail
            )
        else:
            if identifier in collection.catalog:
                collection.catalog.remove(identifier)
                message = OPDSMessage(
                    urn, HTTP_OK, "Successfully removed"
                )
            else:
                message = OPDSMessage(
                    urn, HTTP_NOT_FOUND, "Not in collection catalog"
                )
        if message:
            messages.append(message)

    title = "%s Catalog Item Removal" % collection.name
    url = cdn_url_for("remove", urn=urns)
    removal_feed = AcquisitionFeed(
        self._db, title, url, [], VerboseAnnotator,
        precomposed_entries=messages
    )
    return feed_response(removal_feed)
def _process_batch(self, client_method, success_codes, batch):
    results = list()
    id_mapping = self.create_identifier_mapping(batch)
    mapped_batch = id_mapping.keys()

    try:
        response = client_method(mapped_batch)
        self.lookup_client.check_content_type(response)
    except RemoteIntegrationException as e:
        return [
            self.failure(id_mapping[obj], e.debug_message)
            for obj in mapped_batch
        ]

    for message in self.process_feed_response(response, id_mapping):
        try:
            identifier, _new = Identifier.parse_urn(self._db, message.urn)
            mapped_batch.remove(identifier)
        except ValueError as e:
            # For some reason this URN can't be parsed. This
            # shouldn't happen.
            continue

        if message.status_code in success_codes:
            result = id_mapping[identifier]
            results.append(result)
        elif message.status_code == 400:
            # The URN couldn't be recognized. (This shouldn't happen,
            # since if we can parse it here, we can parse it on MW, too.)
            exception = "%s: %s" % (message.status_code, message.message)
            failure = self.failure(identifier, exception)
            results.append(failure)
        else:
            exception = "Unknown OPDSMessage status: %s" % message.status_code
            failure = self.failure(identifier, exception)
            results.append(failure)

    return results
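The status-code triage in _process_batch is easier to see in isolation. A stripped-down sketch of the same branching, with a plain dict standing in for an OPDSMessage:

def triage(message, success_codes):
    # Mirrors the branches above: success, unrecognized URN, or unknown status.
    if message["status_code"] in success_codes:
        return "success"
    if message["status_code"] == 400:
        return "unrecognized-urn"
    return "unknown-status"

assert triage({"status_code": 200}, {200, 201}) == "success"
assert triage({"status_code": 400}, {200, 201}) == "unrecognized-urn"
assert triage({"status_code": 500}, {200, 201}) == "unknown-status"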
def test_lookup_info_to_metadata(self):
    # Basic book information is returned.
    identifier, ignore = Identifier.for_foreign_id(
        self._db, Identifier.ISBN, "9780804171335"
    )
    bad_character = self.sample_representation("a_bad_character.json")
    metadata = self.novelist.lookup_info_to_metadata(bad_character)

    assert True == isinstance(metadata, Metadata)
    assert Identifier.NOVELIST_ID == metadata.primary_identifier.type
    assert "10392078" == metadata.primary_identifier.identifier
    assert "A bad character" == metadata.title
    assert None == metadata.subtitle
    assert 1 == len(metadata.contributors)
    [contributor] = metadata.contributors
    assert "Kapoor, Deepti" == contributor.sort_name
    assert 4 == len(metadata.identifiers)
    assert 4 == len(metadata.subjects)
    assert 2 == len(metadata.measurements)
    ratings = sorted(metadata.measurements, key=lambda m: m.value)
    assert 2 == ratings[0].value
    assert 3.27 == ratings[1].value
    assert 625 == len(metadata.recommendations)

    # Confirm that Lexile and series data is extracted with a
    # different sample.
    vampire = self.sample_representation("vampire_kisses.json")
    metadata = self.novelist.lookup_info_to_metadata(vampire)

    [lexile] = filter(lambda s: s.type == "Lexile", metadata.subjects)
    assert "630" == lexile.identifier
    assert "Vampire kisses manga" == metadata.series
    # The full title should be selected, since every volume
    # has the same main title: 'Vampire kisses'
    assert "Vampire kisses: blood relatives. Volume 1" == metadata.title
    assert 1 == metadata.series_position
    assert 5 == len(metadata.recommendations)
def oclc_works_for_isbn(self, isbn, processed_uris=set()):
    """Yield every OCLC Work graph for the given ISBN."""
    # Find the OCLC Number for this ISBN.
    oclc_number = self.oclc_number_for_isbn(isbn)

    # Retrieve the OCLC Linked Data document for that OCLC Number.
    oclc_number_data, was_new = self.lookup_by_identifier(
        oclc_number, processed_uris)
    if not oclc_number_data:
        return

    # Look up every work referenced in that document and yield its data.
    graph = OCLCLinkedData.graph(oclc_number_data)
    works = OCLCLinkedData.extract_works(graph)
    for work_uri in works:
        m = self.URI_WITH_OCLC_WORK_ID.match(work_uri)
        if m:
            work_id = m.groups()[0]
            identifier, was_new = Identifier.for_foreign_id(
                self._db, Identifier.OCLC_WORK, work_id)
            oclc_work_data, cached = self.lookup_by_identifier(
                identifier, processed_uris)
            yield oclc_work_data
    which won't be necessary for this migration.
    """

    def __init__(self, collection):
        super(IdentifierResolutionCoverageProvider, self).__init__(
            collection, registered_only=True
        )

try:
    _db = production_session()
    registrar = IdentifierResolutionRegistrar(_db)

    log.info('Finding unresolved identifiers')
    data_source = DataSource.lookup(_db, DataSource.INTERNAL_PROCESSING)
    unresolved_qu = Identifier.missing_coverage_from(
        _db, [], data_source,
        operation=CoverageRecord.RESOLVE_IDENTIFIER_OPERATION,
        count_as_covered=CoverageRecord.SUCCESS
    ).filter(CoverageRecord.id != None)

    log.info('Finding unaffiliated identifiers without a collection')
    unresolved_and_unaffiliated = unresolved_qu.outerjoin(Identifier.collections)\
        .group_by(Identifier.id).having(func.count(Collection.id)==0)\
        .options(lazyload(Identifier.licensed_through)).distinct()

    if unresolved_and_unaffiliated.count() > 1:
        # Use a bulk insert to add them all to the unaffiliated_collection.
        log.info('Giving all unaffiliated identifiers a collection')
        unaffiliated_collection, ignore = MockResolver.unaffiliated_collection(_db)
        _db.execute(
            collections_identifiers.insert(),
            [
# go through patron's checkouts and generate LoanInfo objects,
# with FulfillmentInfo objects included
media_type = item.get('mediaType', 'eBook')
isbn = item.get('isbn', None)
can_renew = item.get('canRenew', None)
title = item.get('title', None)
authors = item.get('authors', None)

# refers to checkout expiration date, not the downloadUrl's
expires = item.get('expiration', None)
if expires:
    expires = datetime.datetime.strptime(
        expires, self.EXPIRATION_DATE_FORMAT).date()

identifier, made_new = Identifier.for_foreign_id(
    self._db, foreign_identifier_type=Identifier.ONECLICK_ID,
    foreign_id=isbn, autocreate=False)

# Note: if OneClick knows about a patron's checked-out item that wasn't
# checked out through us, we ignore it.
if not identifier:
    continue

files = item.get('files', None)
for file in files:
    filename = file.get('filename', None)
    # Assume fileFormat is the same for all files associated with this
    # checkout and use the last one mentioned. Ex: "fileFormat": "EPUB".
    # Note: audiobooks don't list the fileFormat field, just the filename
    # and the mediaType.
    file_format = file.get('fileFormat', None)
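The expiration handling above depends on self.EXPIRATION_DATE_FORMAT, which this excerpt does not define. Assuming a plain ISO date format purely for illustration:

import datetime

# Hypothetical format string; the real EXPIRATION_DATE_FORMAT is not shown here.
EXPIRATION_DATE_FORMAT = "%Y-%m-%d"

expires = "2024-05-01"
if expires:
    expires = datetime.datetime.strptime(expires, EXPIRATION_DATE_FORMAT).date()
assert expires == datetime.date(2024, 5, 1)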
def process_urns(self, urns, collection_details=None, **kwargs):
    """Processes URNs submitted via lookup request.

    An authenticated request can process up to 30 URNs at once,
    but must specify a collection under which to catalog the URNs.
    This is used when initially recording the fact that certain URNs
    are in a collection, to get a baseline set of metadata. Updates
    on the books should be obtained through the CatalogController.

    An unauthenticated request is used for testing. Such a request
    does not have to specify a collection (the "Unaffiliated"
    collection is used), but can only process one URN at a time.

    :return: None or ProblemDetail
    """
    client = authenticated_client_from_request(self._db, required=False)
    if isinstance(client, ProblemDetail):
        return client

    resolve_now = request.args.get('resolve_now', None) is not None
    collection = collection_from_details(
        self._db, client, collection_details
    )

    if client:
        # Authenticated access.
        if not collection:
            return INVALID_INPUT.detailed(_("No collection provided."))
        limit = 30
    else:
        # Anonymous access.
        collection = self.default_collection
        limit = 1

    if resolve_now:
        # You can't force-resolve more than one Identifier at a time.
        limit = 1

    if len(urns) > limit:
        return INVALID_INPUT.detailed(
            _("The maximum number of URNs you can provide at once is %d. (You sent %d)") % (limit, len(urns))
        )

    identifiers_by_urn, failures = Identifier.parse_urns(
        self._db, urns, allowed_types=self.VALID_TYPES
    )
    self.add_urn_failure_messages(failures)

    # Catalog all identifiers.
    collection.catalog_identifiers(identifiers_by_urn.values())

    # Load all coverage records in a single query to speed up the
    # code that reports on the status of Identifiers that aren't
    # ready.
    self.bulk_load_coverage_records(identifiers_by_urn.values())

    resolver = IdentifierResolutionCoverageProvider(
        collection, provide_coverage_immediately=resolve_now,
        **self.coverage_provider_kwargs
    )
    for urn, identifier in identifiers_by_urn.items():
        self.process_identifier(
            identifier, urn, resolver=resolver
        )
def do_run(self, _db):
    identifier = Identifier(type="Keep It", identifier="100")
    _db.add(identifier)
def add_with_metadata(self, collection_details):
    """Adds identifiers with their metadata to a Collection's catalog"""
    client = authenticated_client_from_request(self._db)
    if isinstance(client, ProblemDetail):
        return client

    collection = collection_from_details(
        self._db, client, collection_details
    )

    data_source = DataSource.lookup(
        self._db, collection.name, autocreate=True
    )

    messages = []

    feed = feedparser.parse(request.data)
    entries = feed.get("entries", [])
    entries_by_urn = { entry.get('id') : entry for entry in entries }

    identifiers_by_urn, invalid_urns = Identifier.parse_urns(
        self._db, entries_by_urn.keys()
    )

    for urn in invalid_urns:
        messages.append(OPDSMessage(
            urn, INVALID_URN.status_code, INVALID_URN.detail
        ))

    for urn, identifier in identifiers_by_urn.items():
        entry = entries_by_urn[urn]
        status = HTTP_OK
        description = "Already in catalog"

        if identifier not in collection.catalog:
            collection.catalog_identifier(identifier)
            status = HTTP_CREATED
            description = "Successfully added"

        message = OPDSMessage(urn, status, description)

        # Get a cover if it exists.
        image_types = set([Hyperlink.IMAGE, Hyperlink.THUMBNAIL_IMAGE])
        images = [l for l in entry.get("links", [])
                  if l.get("rel") in image_types]
        links = [LinkData(image.get("rel"), image.get("href"))
                 for image in images]

        # Create an edition to hold the title and author. LicensePool.calculate_work
        # refuses to create a Work when there's no title, and if we have a title, author
        # and language we can attempt to look up the edition in OCLC.
        title = entry.get("title") or "Unknown Title"
        author = ContributorData(
            sort_name=(entry.get("author") or Edition.UNKNOWN_AUTHOR),
            roles=[Contributor.PRIMARY_AUTHOR_ROLE]
        )
        language = entry.get("dcterms_language")

        presentation = PresentationCalculationPolicy(
            choose_edition=False,
            set_edition_metadata=False,
            classify=False,
            choose_summary=False,
            calculate_quality=False,
            choose_cover=False,
            regenerate_opds_entries=False,
        )
        replace = ReplacementPolicy(presentation_calculation_policy=presentation)
        metadata = Metadata(
            data_source,
            primary_identifier=IdentifierData(identifier.type, identifier.identifier),
            title=title,
            language=language,
            contributors=[author],
            links=links,
        )

        edition, ignore = metadata.edition(self._db)
        metadata.apply(edition, collection, replace=replace)

        messages.append(message)

    title = "%s Catalog Item Additions for %s" % (collection.protocol, client.url)
    url = self.collection_feed_url("add_with_metadata", collection)
    addition_feed = AcquisitionFeed(
        self._db, title, url, [], VerboseAnnotator,
        precomposed_entries=messages
    )
    return feed_response(addition_feed)
def lookup_info_to_metadata(self, lookup_representation):
    """Transforms a NoveList JSON representation into a Metadata object"""
    if not lookup_representation.content:
        return None

    lookup_info = json.loads(lookup_representation.content)
    book_info = lookup_info['TitleInfo']
    if book_info:
        novelist_identifier = book_info.get('ui')
    if not book_info or not novelist_identifier:
        # NoveList didn't know the ISBN.
        return None

    primary_identifier, ignore = Identifier.for_foreign_id(
        self._db, Identifier.NOVELIST_ID, novelist_identifier)
    metadata = Metadata(self.source, primary_identifier=primary_identifier)

    # Get the equivalent ISBN identifiers.
    metadata.identifiers += self._extract_isbns(book_info)

    author = book_info.get('author')
    if author:
        metadata.contributors.append(ContributorData(sort_name=author))

    description = book_info.get('description')
    if description:
        metadata.links.append(LinkData(
            rel=Hyperlink.DESCRIPTION, content=description,
            media_type=Representation.TEXT_PLAIN
        ))

    audience_level = book_info.get('audience_level')
    if audience_level:
        metadata.subjects.append(SubjectData(
            Subject.FREEFORM_AUDIENCE, audience_level
        ))

    novelist_rating = book_info.get('rating')
    if novelist_rating:
        metadata.measurements.append(MeasurementData(
            Measurement.RATING, novelist_rating
        ))

    # Extract feature content if it is available.
    series_info = None
    appeals_info = None
    lexile_info = None
    goodreads_info = None
    recommendations_info = None
    feature_content = lookup_info.get('FeatureContent')
    if feature_content:
        series_info = feature_content.get('SeriesInfo')
        appeals_info = feature_content.get('Appeals')
        lexile_info = feature_content.get('LexileInfo')
        goodreads_info = feature_content.get('GoodReads')
        recommendations_info = feature_content.get('SimilarTitles')

    metadata, title_key = self.get_series_information(
        metadata, series_info, book_info
    )
    metadata.title = book_info.get(title_key)
    subtitle = TitleProcessor.extract_subtitle(
        metadata.title, book_info.get('full_title')
    )
    metadata.subtitle = self._scrub_subtitle(subtitle)

    # TODO: How well do we trust this data? We could conceivably bump up
    # the weight here.
    if appeals_info:
        extracted_genres = False
        for appeal in appeals_info:
            genres = appeal.get('genres')
            if genres:
                for genre in genres:
                    metadata.subjects.append(SubjectData(
                        Subject.TAG, genre['Name']
                    ))
                    extracted_genres = True
            if extracted_genres:
                break

    if lexile_info:
        metadata.subjects.append(SubjectData(
            Subject.LEXILE_SCORE, lexile_info['Lexile']
        ))

    if goodreads_info:
        metadata.measurements.append(MeasurementData(
            Measurement.RATING, goodreads_info['average_rating']
        ))

    metadata = self.get_recommendations(metadata, recommendations_info)

    # If nothing interesting comes from the API, ignore it.
    if not (metadata.measurements or metadata.series_position or
            metadata.series or metadata.subjects or metadata.links or
            metadata.subtitle or metadata.recommendations):
        metadata = None
    return metadata
)
from threem import ThreeMAPI
from overdrive import OverdriveAPI
from axis import Axis360API
from circulation import CirculationAPI
from circulation_exceptions import *

barcode, pin, borrow_urn, hold_urn = sys.argv[1:5]
email = os.environ.get(
    'DEFAULT_NOTIFICATION_EMAIL_ADDRESS', '*****@*****.**')

_db = production_session()
patron, ignore = get_one_or_create(
    _db, Patron, authorization_identifier=barcode)

borrow_identifier = Identifier.parse_urn(_db, borrow_urn, True)[0]
hold_identifier = Identifier.parse_urn(_db, hold_urn, True)[0]
borrow_pool = borrow_identifier.licensed_through
hold_pool = hold_identifier.licensed_through

if any(x.type == Identifier.THREEM_ID
       for x in [borrow_identifier, hold_identifier]):
    threem = ThreeMAPI(_db)
else:
    threem = None

if any(x.type == Identifier.OVERDRIVE_ID
       for x in [borrow_identifier, hold_identifier]):
    overdrive = OverdriveAPI(_db)
else:
    overdrive = None

if any(x.type == Identifier.AXIS_360_ID
       for x in [borrow_identifier, hold_identifier]):
def run(self):
    id_type, identifier = sys.argv[1:]
    identifier, ignore = Identifier.for_foreign_id(
        self._db, id_type, identifier
    )
    self.fix_identifier(identifier)
class OCLCXMLParser(XMLParser):

    # OCLC in-representation 'status codes'
    SINGLE_WORK_SUMMARY_STATUS = 0
    SINGLE_WORK_DETAIL_STATUS = 2
    MULTI_WORK_STATUS = 4
    NO_INPUT_STATUS = 100
    INVALID_INPUT_STATUS = 101
    NOT_FOUND_STATUS = 102
    UNEXPECTED_ERROR_STATUS = 200

    INTS = set([OCLC.HOLDING_COUNT, OCLC.EDITION_COUNT])

    NAMESPACES = {'oclc': 'http://classify.oclc.org'}

    LIST_TYPE = "works"
    log = logging.getLogger("OCLC XML Parser")

    @classmethod
    def parse(cls, _db, xml, **restrictions):
        """Turn XML data from the OCLC lookup service into a list of
        SWIDs (for a multi-work response) or a list of Edition
        objects (for a single-work response).
        """
        tree = etree.fromstring(xml, parser=etree.XMLParser(recover=True))
        response = cls._xpath1(tree, "oclc:response")
        representation_type = int(response.get('code'))

        workset_record = None
        editions = []
        edition_records = []

        if representation_type == cls.UNEXPECTED_ERROR_STATUS:
            raise IOError("Unexpected error from OCLC API: %s" % xml)
        elif representation_type in (
                cls.NO_INPUT_STATUS, cls.INVALID_INPUT_STATUS):
            return representation_type, []
        elif representation_type == cls.SINGLE_WORK_SUMMARY_STATUS:
            raise IOError(
                "Got single-work summary from OCLC despite requesting detail: %s" % xml)

        # The real action happens here.
        if representation_type == cls.SINGLE_WORK_DETAIL_STATUS:
            authors_tag = cls._xpath1(tree, "//oclc:authors")

            work_tag = cls._xpath1(tree, "//oclc:work")
            if work_tag is not None:
                author_string = work_tag.get('author')
                primary_author = cls.primary_author_from_author_string(
                    _db, author_string)

            existing_authors = cls.extract_authors(
                _db, authors_tag, primary_author=primary_author)

            # The representation lists a single work, its authors, its
            # editions, plus summary classification information for the work.
            edition, ignore = cls.extract_edition(
                _db, work_tag, existing_authors, **restrictions)
            if edition:
                cls.log.info("EXTRACTED %r", edition)
            records = []
            if edition:
                records.append(edition)
            else:
                # The work record itself failed one of the
                # restrictions. None of its editions are likely to
                # succeed either.
                return representation_type, records

        elif representation_type == cls.MULTI_WORK_STATUS:
            # The representation lists a set of works that match the
            # search query.
            cls.log.debug("Extracting SWIDs from search results.")
            records = cls.extract_swids(_db, tree, **restrictions)

        elif representation_type == cls.NOT_FOUND_STATUS:
            # No problem; OCLC just doesn't have any data.
            records = []

        else:
            raise IOError("Unrecognized status code from OCLC API: %s (%s)" % (
                representation_type, xml))

        return representation_type, records

    @classmethod
    def extract_swids(cls, _db, tree, **restrictions):
        """Turn a multi-work response into a list of SWIDs."""
        swids = []
        for work_tag in cls._xpath(tree, "//oclc:work"):
            # We're not calling extract_basic_info because we care about
            # the info, we're calling it to make sure this work meets
            # the restriction. If this work meets the restriction,
            # we'll store its info when we look up the SWID.
            response = cls._extract_basic_info(_db, work_tag, **restrictions)
            if response:
                title, author_names, language = response
                # TODO: 'swid' is what it's called in older representations.
                # That code can be removed once we replace all representations.
                work_identifier = work_tag.get('wi') or work_tag.get('swid')
                cls.log.debug(
                    "WORK ID %s (%s, %r, %s)",
                    work_identifier, title, author_names, language)
                swids.append(work_identifier)
        return swids

    ROLES = re.compile("\[([^]]+)\]$")
    LIFESPAN = re.compile("([0-9]+)-([0-9]*)[.;]?$")

    @classmethod
    def extract_authors(cls, _db, authors_tag, primary_author=None):
        results = []
        if authors_tag is not None:
            for author_tag in cls._xpath(authors_tag, "//oclc:author"):
                lc = author_tag.get('lc', None)
                viaf = author_tag.get('viaf', None)
                contributor, roles, default_role_used = cls._parse_single_author(
                    _db, author_tag.text, lc=lc, viaf=viaf,
                    primary_author=primary_author)
                if contributor:
                    results.append(contributor)
        return results

    @classmethod
    def _contributor_match(cls, contributor, name, lc, viaf):
        return (
            contributor.sort_name == name
            and (lc is None or contributor.lc == lc)
            and (viaf is None or contributor.viaf == viaf)
        )

    @classmethod
    def _parse_single_author(cls, _db, author,
                             lc=None, viaf=None,
                             existing_authors=[],
                             default_role=Contributor.AUTHOR_ROLE,
                             primary_author=None):
        default_role_used = False

        # First find roles if present
        # "Giles, Lionel, 1875-1958 [Writer of added commentary; Translator]"
        author = author.strip()
        m = cls.ROLES.search(author)
        if m:
            author = author[:m.start()].strip()
            role_string = m.groups()[0]
            roles = [x.strip() for x in role_string.split(";")]
        elif default_role:
            roles = [default_role]
            default_role_used = True
        else:
            roles = []

        # Author string now looks like
        # "Giles, Lionel, 1875-1958"
        m = cls.LIFESPAN.search(author)
        kwargs = dict()
        if m:
            author = author[:m.start()].strip()
            birth, death = m.groups()
            if birth:
                kwargs[Contributor.BIRTH_DATE] = birth
            if death:
                kwargs[Contributor.DEATH_DATE] = death

        # Author string now looks like
        # "Giles, Lionel,"
        if author.endswith(","):
            author = author[:-1]

        contributor = None
        if not author:
            # No name was given for the author.
            return None, roles, default_role_used

        if primary_author and author == primary_author.sort_name:
            if Contributor.AUTHOR_ROLE in roles:
                roles.remove(Contributor.AUTHOR_ROLE)
            if Contributor.UNKNOWN_ROLE in roles:
                roles.remove(Contributor.UNKNOWN_ROLE)
            roles.insert(0, Contributor.PRIMARY_AUTHOR_ROLE)

        if existing_authors:
            # Calling Contributor.lookup will result in a database
            # hit, and looking up a contributor based on name may
            # result in multiple results (see below). We'll have no
            # way of distinguishing between those results. If
            # possible, it's much more reliable to look through
            # existing_authors (the authors derived from an entry's
            # <authors> tag).
            for x in existing_authors:
                if cls._contributor_match(x, author, lc, viaf):
                    contributor = x
                    break
            if contributor:
                was_new = False

        if not contributor:
            contributor, was_new = Contributor.lookup(
                _db, author, viaf, lc, extra=kwargs)
        if isinstance(contributor, list):
            # We asked for an author based solely on the name, which makes
            # Contributor.lookup() return a list.
            if len(contributor) == 1:
                # Fortunately, either the database knows about only
                # one author with that name, or it didn't know about
                # any authors with that name and it just created one,
                # so we can unambiguously use it.
                contributor = contributor[0]
            else:
                # Uh-oh. The database knows about multiple authors
                # with that name. We have no basis for deciding which
                # author we mean. But we would prefer to identify with
                # an author who has a known LC or VIAF number.
                #
                # This should happen very rarely because of our check
                # against existing_authors above. But it will happen
                # for authors that have a work in Project Gutenberg.
                with_id = [
                    x for x in contributor
                    if x.lc is not None or x.viaf is not None
                ]
                if with_id:
                    contributor = with_id[0]
                else:
                    contributor = contributor[0]
        return contributor, roles, default_role_used

    @classmethod
    def primary_author_from_author_string(cls, _db, author_string):
        # If the first author mentioned in the author string
        # does not have an explicit role set, treat them as the primary
        # author.
        if not author_string:
            return None
        authors = author_string.split("|")
        if not authors:
            return None
        author, roles, default_role_used = cls._parse_single_author(
            _db, authors[0], default_role=Contributor.PRIMARY_AUTHOR_ROLE)
        if roles == [Contributor.PRIMARY_AUTHOR_ROLE]:
            return author
        return None

    @classmethod
    def parse_author_string(cls, _db, author_string, existing_authors=[],
                            primary_author=None):
        default_role = Contributor.PRIMARY_AUTHOR_ROLE
        authors = []
        if not author_string:
            return authors
        for author in author_string.split("|"):
            author, roles, default_role_used = cls._parse_single_author(
                _db, author, existing_authors=existing_authors,
                default_role=default_role,
                primary_author=primary_author)
            if roles:
                if Contributor.PRIMARY_AUTHOR_ROLE in roles:
                    # That was the primary author. If we see someone
                    # with no explicit role after this point, they're
                    # just a regular author.
                    default_role = Contributor.AUTHOR_ROLE
                elif not default_role_used:
                    # We're dealing with someone whose role was
                    # explicitly specified. If we see someone with no
                    # explicit role after this point, it's probably
                    # because their role is so minor as to not be
                    # worth mentioning, not because it's so major that
                    # we can assume they're an author.
                    default_role = Contributor.UNKNOWN_ROLE
            roles = roles or [default_role]
            if author:
                authors.append((author, roles))
        return authors

    @classmethod
    def _extract_basic_info(cls, _db, tag, existing_authors=None,
                            **restrictions):
        """Extract information common to work tag and edition tag."""
        title = tag.get('title')
        author_string = tag.get('author')
        authors_and_roles = cls.parse_author_string(
            _db, author_string, existing_authors)

        if 'language' in tag.keys():
            language = tag.get('language')
        else:
            language = None

        if title and 'title' in restrictions:
            must_resemble_title = restrictions['title']
            threshold = restrictions.get('title_similarity', 0.25)
            similarity = MetadataSimilarity.title_similarity(
                must_resemble_title, title)
            if similarity < threshold:
                # The title of the book under consideration is not
                # similar enough to the given title.
                cls.log.debug(
                    "FAILURE TO RESEMBLE: %s vs %s (%.2f)",
                    title, must_resemble_title, similarity)
                return None

            # The semicolon is frequently used to separate multiple
            # works in an anthology. If there is no semicolon in the
            # original title, do not consider titles that contain
            # semicolons.
            if (not ' ; ' in must_resemble_title
                    and ' ; ' in title and threshold > 0):
                cls.log.debug("SEMICOLON DISQUALIFICATION: %s", title)
                return None

        # Apply restrictions. If they're not met, return None.
        if 'language' in restrictions and language:
            # We know which language this record is for. Match it
            # against the language used in the Edition we're
            # matching against.
            restrict_to_language = set(restrictions['language'])
            if language not in restrict_to_language:
                # This record is for a book in a different language.
                cls.log.debug("WRONG LANGUAGE: %s", language)
                return None

        if 'authors' in restrictions:
            restrict_to_authors = restrictions['authors']
            if restrict_to_authors and isinstance(restrict_to_authors[0], Contributor):
                restrict_to_authors = [x.sort_name for x in restrict_to_authors]
            primary_author = None

            for a, roles in authors_and_roles:
                if Contributor.PRIMARY_AUTHOR_ROLE in roles:
                    primary_author = a
                    break
            if (not primary_author
                or (primary_author not in restrict_to_authors
                    and primary_author.sort_name not in restrict_to_authors)):
                # None of the given authors showed up as the
                # primary author of this book. They may have had
                # some other role in it, or the book may be about
                # them, or incorporate their work, but this book
                # is not *by* them.
                return None

        author_names = ", ".join([x.sort_name for x, y in authors_and_roles])

        return title, authors_and_roles, language

    UNUSED_MEDIA = set([
        "itemtype-intmm",
        "itemtype-msscr",
        "itemtype-artchap-artcl",
        "itemtype-jrnl",
        "itemtype-map",
        "itemtype-vis",
        "itemtype-jrnl-digital",
        "itemtype-image-2d",
        "itemtype-artchap-digital",
        "itemtype-intmm-digital",
        "itemtype-archv",
        "itemtype-msscr-digital",
        "itemtype-game",
        "itemtype-web-digital",
        "itemtype-map-digital",
    ])

    @classmethod
    def extract_edition(cls, _db, work_tag, existing_authors, **restrictions):
        """Create a new Edition object with information about a
        work (identified by OCLC Work ID).
        """
        # TODO: 'pswid' is what it's called in older representations.
        # That code can be removed once we replace all representations.
        oclc_work_id = unicode(work_tag.get('owi') or work_tag.get('pswid'))

        # if oclc_work_id:
        #     print " owi: %s" % oclc_work_id
        # else:
        #     print " No owi in %s" % etree.tostring(work_tag)

        if not oclc_work_id:
            raise ValueError("Work has no owi")

        item_type = work_tag.get("itemtype")
        if (item_type.startswith('itemtype-book')
                or item_type.startswith('itemtype-compfile')):
            medium = Edition.BOOK_MEDIUM
        elif (item_type.startswith('itemtype-audiobook')
              or item_type.startswith('itemtype-music')):
            # Pretty much all Gutenberg texts, even the audio texts,
            # are based on a book, and the ones that aren't
            # (recordings of individual songs) probably aren't in OCLC
            # anyway. So we just want to get the books.
            #medium = Edition.AUDIO_MEDIUM
            medium = None
        elif item_type.startswith('itemtype-video'):
            #medium = Edition.VIDEO_MEDIUM
            medium = None
        elif item_type in cls.UNUSED_MEDIA:
            medium = None
        else:
            medium = None

        # Only create Editions for books with a recognized medium.
        if medium is None:
            return None, False

        result = cls._extract_basic_info(
            _db, work_tag, existing_authors, **restrictions)
        if not result:
            # This record did not meet one of the restrictions.
            return None, False

        title, authors_and_roles, language = result

        # Record some extra OCLC-specific information.
        editions = work_tag.get('editions')
        holdings = work_tag.get('holdings')

        # Get an identifier for this work.
        identifier, ignore = Identifier.for_foreign_id(
            _db, Identifier.OCLC_WORK, oclc_work_id)

        data_source = DataSource.lookup(_db, DataSource.OCLC)
        identifier.add_measurement(
            data_source, Measurement.HOLDINGS, holdings)
        identifier.add_measurement(
            data_source, Measurement.PUBLISHED_EDITIONS, editions)

        # Create an Edition for source + identifier.
        edition, new = get_one_or_create(
            _db, Edition,
            data_source=data_source,
            primary_identifier=identifier,
            create_method_kwargs=dict(
                title=title,
                language=language,
            )
        )

        # Get the most popular Dewey and LCC classification for this
        # work.
        for tag_name, subject_type in (
                ("ddc", Subject.DDC),
                ("lcc", Subject.LCC)):
            tag = cls._xpath1(
                work_tag, "//oclc:%s/oclc:mostPopular" % tag_name)
            if tag is not None:
                id = tag.get('nsfa') or tag.get('sfa')
                weight = int(tag.get('holdings'))
                identifier.classify(
                    data_source, subject_type, id, weight=weight)

        # Find FAST subjects for the work.
        for heading in cls._xpath(work_tag, "//oclc:fast//oclc:heading"):
            id = heading.get('ident')
            weight = int(heading.get('heldby'))
            value = heading.text
            identifier.classify(data_source, Subject.FAST, id, value, weight)

        # Associate the authors with the Edition.
        for contributor, roles in authors_and_roles:
            edition.add_contributor(contributor, roles)

        return edition, new

    @classmethod
    def extract_edition_record(cls, _db, edition_tag, existing_authors,
                               **restrictions):
        """Create a new Edition object with information about an
        edition of a book (identified by OCLC Number).
        """
        oclc_number = unicode(edition_tag.get('oclc'))
        try:
            int(oclc_number)
        except ValueError as e:
            # This record does not have a valid OCLC number.
            return None, False

        # Fill in some basic information about this new record.
        result = cls._extract_basic_info(
            _db, edition_tag, existing_authors, **restrictions)
        if not result:
            # This record did not meet one of the restrictions.
            return None, False

        title, authors_and_roles, language = result

        # Add a couple extra bits of OCLC-specific information.
        extra = {
            OCLC.HOLDING_COUNT: edition_tag.get('holdings'),
            OCLC.FORMAT: edition_tag.get('itemtype'),
        }

        # Get an identifier for this edition.
        identifier, ignore = Identifier.for_foreign_id(
            _db, Identifier.OCLC_NUMBER, oclc_number)

        # Create an Edition for source + identifier.
        data_source = DataSource.lookup(_db, DataSource.OCLC)
        subjects = {}
        edition_record, new = get_one_or_create(
            _db, Edition,
            data_source=data_source,
            primary_identifier=identifier,
            create_method_kwargs=dict(
                title=title,
                language=language,
                subjects=subjects,
                extra=extra,
            )
        )

        for subject_type, oclc_code in (
                (Subject.LCC, "050"),
                (Subject.DDC, "082")):
            classification = cls._xpath1(
                edition_tag,
                "oclc:classifications/oclc:class[@tag=%s]" % oclc_code)
            if classification is not None:
                value = classification.get("nsfa") or classification.get('sfa')
                identifier.classify(data_source, subject_type, value)

        # Associate each contributor with the new record.
        for author, roles in authors_and_roles:
            edition_record.add_contributor(author, roles)

        return edition_record, new
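The ROLES and LIFESPAN patterns above are easiest to verify against the example given in the comments. A standalone check using the same regexes:

import re

ROLES = re.compile("\[([^]]+)\]$")
LIFESPAN = re.compile("([0-9]+)-([0-9]*)[.;]?$")

author = "Giles, Lionel, 1875-1958 [Writer of added commentary; Translator]"
m = ROLES.search(author)
assert m.groups()[0] == "Writer of added commentary; Translator"
author = author[:m.start()].strip()          # "Giles, Lionel, 1875-1958"

m = LIFESPAN.search(author)
assert m.groups() == ("1875", "1958")
author = author[:m.start()].strip().rstrip(",")
assert author == "Giles, Lionel"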
class AnnotationParser(object):

    @classmethod
    def parse(cls, _db, data, patron):
        if patron.synchronize_annotations != True:
            return PATRON_NOT_OPTED_IN_TO_ANNOTATION_SYNC

        try:
            data = json.loads(data)
            data = jsonld.expand(data)
        except ValueError as e:
            return INVALID_ANNOTATION_FORMAT

        if not data or not len(data) == 1:
            return INVALID_ANNOTATION_TARGET
        data = data[0]

        target = data.get("http://www.w3.org/ns/oa#hasTarget")
        if not target or not len(target) == 1:
            return INVALID_ANNOTATION_TARGET
        target = target[0]
        source = target.get("http://www.w3.org/ns/oa#hasSource")
        if not source or not len(source) == 1:
            return INVALID_ANNOTATION_TARGET
        source = source[0].get('@id')
        identifier, ignore = Identifier.parse_urn(_db, source)

        motivation = data.get("http://www.w3.org/ns/oa#motivatedBy")
        if not motivation or not len(motivation) == 1:
            return INVALID_ANNOTATION_MOTIVATION
        motivation = motivation[0].get('@id')
        if motivation not in Annotation.MOTIVATIONS:
            return INVALID_ANNOTATION_MOTIVATION

        loans = patron.loans
        loan_identifiers = [loan.license_pool.identifier for loan in loans]
        if identifier not in loan_identifiers:
            return INVALID_ANNOTATION_TARGET

        content = data.get("http://www.w3.org/ns/oa#hasBody")
        if content and len(content) == 1:
            content = content[0]
        else:
            content = None

        target = json.dumps(target)
        extra_kwargs = {}
        if motivation == Annotation.IDLING:
            # A given book can only have one 'idling' annotation.
            pass
        elif motivation == Annotation.BOOKMARKING:
            # A given book can only have one 'bookmarking' annotation
            # per target.
            extra_kwargs['target'] = target

        annotation, ignore = Annotation.get_one_or_create(
            _db, patron=patron, identifier=identifier,
            motivation=motivation, **extra_kwargs
        )

        annotation.target = target
        if content:
            annotation.content = json.dumps(content)
        annotation.active = True
        annotation.timestamp = datetime.now()

        return annotation
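AnnotationParser walks an already-expanded JSON-LD document, in which every property value is a list of nodes. A self-contained sketch of the same extraction against a hand-expanded Web Annotation (no network access or pyld required):

expanded = [{
    "http://www.w3.org/ns/oa#hasTarget": [{
        "http://www.w3.org/ns/oa#hasSource": [
            {"@id": "urn:isbn:9780804171335"}
        ]
    }],
    "http://www.w3.org/ns/oa#motivatedBy": [
        {"@id": "http://www.w3.org/ns/oa#bookmarking"}
    ],
}]

data = expanded[0]
target = data["http://www.w3.org/ns/oa#hasTarget"][0]
source = target["http://www.w3.org/ns/oa#hasSource"][0]["@id"]
motivation = data["http://www.w3.org/ns/oa#motivatedBy"][0]["@id"]
assert source == "urn:isbn:9780804171335"
assert motivation.endswith("bookmarking")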
def process_item(self, identifier):
    # Books are not looked up in OCLC Linked Data directly, since
    # there is no Collection that identifies a book by its OCLC Number.
    # However, when a book is looked up through OCLC Classify, some
    # OCLC Numbers may be associated with it, and _those_ numbers
    # can be run through OCLC Linked Data.
    try:
        new_info_counter = Counter()
        self.log.info("Processing identifier %r", identifier)
        metadatas = [m for m in self.api.info_for(identifier)]

        if identifier.type == Identifier.ISBN:
            # info_for() currently returns results for OCLC Work IDs
            # only. This segment also gets the metadata of any
            # equivalent OCLC Numbers.
            equivalents = Identifier.recursively_equivalent_identifier_ids(
                self._db, [identifier.id]
            )
            oclc_numbers = self._db.query(Identifier).\
                filter(Identifier.id.in_(equivalents)).\
                filter(Identifier.type == Identifier.OCLC_NUMBER).all()
            for oclc_number in oclc_numbers:
                more_metadata = [m for m in self.api.info_for(oclc_number)]
                metadatas += more_metadata

        metadatas = [m for m in metadatas if m]

        for metadata in metadatas:
            other_identifier, ignore = metadata.primary_identifier.load(self._db)
            oclc_editions = other_identifier.primarily_identifies

            # Keep track of the number of editions OCLC associates
            # with this identifier.
            other_identifier.add_measurement(
                self.data_source, Measurement.PUBLISHED_EDITIONS,
                len(oclc_editions)
            )

            # Clean up contributor information.
            self.apply_viaf_to_contributor_data(metadata)
            # Remove any empty ContributorData objects that may have
            # been created.
            metadata.contributors = filter(
                lambda c: c.sort_name or c.display_name,
                metadata.contributors
            )

            # When metadata is applied, it must be given a client that
            # can respond to 'canonicalize_author_name'. Usually this is
            # an OPDSImporter that reaches out to the Metadata Wrangler,
            # but in the case of being _on_ the Metadata Wrangler...:
            from canonicalize import AuthorNameCanonicalizer
            metadata_client = AuthorNameCanonicalizer(
                self._db, oclcld=self.api, viaf=self.viaf
            )

            num_new_isbns = self.new_isbns(metadata)
            new_info_counter['isbns'] += num_new_isbns
            if oclc_editions:
                # There are existing OCLC editions. Apply any new
                # information to them.
                for edition in oclc_editions:
                    metadata, new_info_counter = self.apply_metadata_to_edition(
                        edition, metadata, metadata_client, new_info_counter
                    )
            else:
                # Create a new OCLC edition to hold the information.
                edition, ignore = get_one_or_create(
                    self._db, Edition, data_source=self.data_source,
                    primary_identifier=other_identifier
                )
                metadata, new_info_counter = self.apply_metadata_to_edition(
                    edition, metadata, metadata_client, new_info_counter
                )
                # Set the new OCLC edition's identifier equivalent to
                # this identifier so we know they're related.
                self.set_equivalence(identifier, metadata)

        self.log.info(
            "Total: %(editions)d editions, %(isbns)d ISBNs, "
            "%(descriptions)d descriptions, %(subjects)d classifications.",
            new_info_counter
        )
    except IOError as e:
        if ", but couldn't find location" in e.message:
            exception = "OCLC doesn't know about this ISBN: %r" % e
            transient = False
        else:
            exception = "OCLC raised an error: %r" % e
            transient = True
        return self.failure(identifier, exception, transient=transient)

    # Try to calculate or recalculate a work for ISBNs.
    #
    # We won't do this for other Identifier types because we don't want
    # to overwrite the high-quality metadata direct from the source.
    # With ISBNs, that higher-quality metadata is not available, so we
    # depend on OCLC for title and author information.
if identifier.type == Identifier.ISBN: self.calculate_work_for_isbn(identifier) return identifier
def process_urns(self, urns, collection_details=None, **kwargs):
    """Process URNs submitted via a lookup request.

    An authenticated request can process up to 30 URNs at once, but
    must specify a collection under which to catalog the URNs. This is
    used when initially recording the fact that certain URNs are in a
    collection, to get a baseline set of metadata. Updates on the books
    should be obtained through the CatalogController.

    An unauthenticated request is used for testing. Such a request does
    not have to specify a collection (the "Unaffiliated" collection is
    used), but it can only process one URN at a time.

    :return: None or ProblemDetail
    """
    client = authenticated_client_from_request(self._db, required=False)
    if isinstance(client, ProblemDetail):
        return client

    resolve_now = request.args.get('resolve_now', None) is not None
    collection = collection_from_details(
        self._db, client, collection_details)

    if client:
        # Authenticated access.
        if not collection:
            return INVALID_INPUT.detailed(_("No collection provided."))
        limit = 30
    else:
        # Anonymous access.
        collection = self.default_collection
        limit = 1

    if resolve_now:
        # You can't force-resolve more than one Identifier at a time.
        limit = 1

    if len(urns) > limit:
        return INVALID_INPUT.detailed(
            _("The maximum number of URNs you can provide at once is %d. (You sent %d)"
            ) % (limit, len(urns)))

    identifiers_by_urn, failures = Identifier.parse_urns(
        self._db, urns, allowed_types=self.VALID_TYPES)
    self.add_urn_failure_messages(failures)

    # Catalog all identifiers.
    collection.catalog_identifiers(identifiers_by_urn.values())

    # Load all coverage records in a single query to speed up the
    # code that reports on the status of Identifiers that aren't
    # ready.
    self.bulk_load_coverage_records(identifiers_by_urn.values())

    resolver = IdentifierResolutionCoverageProvider(
        collection, provide_coverage_immediately=resolve_now,
        **self.coverage_provider_kwargs)

    for urn, identifier in identifiers_by_urn.items():
        self.process_identifier(identifier, urn, resolver=resolver)
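# A minimal sketch (not from the source) of the Identifier.parse_urns call
# used above, showing the shape of its two return values. The URNs are
# hypothetical.
identifiers_by_urn, failures = Identifier.parse_urns(
    _db, ["urn:isbn:9780123456789", "not-a-urn"]
)
# identifiers_by_urn -> {"urn:isbn:9780123456789": <Identifier>}
# failures           -> ["not-a-urn"]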
def extract_edition(cls, _db, work_tag, existing_authors, **restrictions):
    """Create a new Edition object with information about a
    work (identified by OCLC Work ID).
    """
    # TODO: 'pswid' is what it's called in older representations.
    # That code can be removed once we replace all representations.
    oclc_work_id = unicode(work_tag.get('owi') or work_tag.get('pswid'))

    if not oclc_work_id:
        raise ValueError("Work has no owi")

    item_type = work_tag.get("itemtype")
    if not item_type:
        # Without an item type we can't tell whether this is a book.
        return None, False

    if (item_type.startswith('itemtype-book')
            or item_type.startswith('itemtype-compfile')):
        medium = Edition.BOOK_MEDIUM
    elif (item_type.startswith('itemtype-audiobook')
          or item_type.startswith('itemtype-music')):
        # Pretty much all Gutenberg texts, even the audio texts,
        # are based on a book, and the ones that aren't
        # (recordings of individual songs) probably aren't in OCLC
        # anyway. So we just want to get the books.
        #medium = Edition.AUDIO_MEDIUM
        medium = None
    elif item_type.startswith('itemtype-video'):
        #medium = Edition.VIDEO_MEDIUM
        medium = None
    elif item_type in cls.UNUSED_MEDIA:
        medium = None
    else:
        medium = None

    # Only create Editions for books with a recognized medium.
    if medium is None:
        return None, False

    result = cls._extract_basic_info(
        _db, work_tag, existing_authors, **restrictions)
    if not result:
        # This record did not meet one of the restrictions.
        return None, False

    title, authors_and_roles, language = result

    # Record some extra OCLC-specific information.
    editions = work_tag.get('editions')
    holdings = work_tag.get('holdings')

    # Get an identifier for this work.
    identifier, ignore = Identifier.for_foreign_id(
        _db, Identifier.OCLC_WORK, oclc_work_id
    )
    data_source = DataSource.lookup(_db, DataSource.OCLC)
    identifier.add_measurement(data_source, Measurement.HOLDINGS, holdings)
    identifier.add_measurement(
        data_source, Measurement.PUBLISHED_EDITIONS, editions)

    # Create an Edition for source + identifier.
    edition, new = get_one_or_create(
        _db, Edition,
        data_source=data_source,
        primary_identifier=identifier,
        create_method_kwargs=dict(
            title=title,
            language=language,
        )
    )

    # Get the most popular Dewey and LCC classifications for this
    # work.
    for tag_name, subject_type in (
            ("ddc", Subject.DDC), ("lcc", Subject.LCC)):
        tag = cls._xpath1(
            work_tag, "//oclc:%s/oclc:mostPopular" % tag_name)
        if tag is not None:
            id = tag.get('nsfa') or tag.get('sfa')
            weight = int(tag.get('holdings'))
            identifier.classify(
                data_source, subject_type, id, weight=weight)

    # Find FAST subjects for the work.
    for heading in cls._xpath(
            work_tag, "//oclc:fast//oclc:heading"):
        id = heading.get('ident')
        weight = int(heading.get('heldby'))
        value = heading.text
        identifier.classify(
            data_source, Subject.FAST, id, value, weight)

    # Associate the authors with the Edition.
    for contributor, roles in authors_and_roles:
        edition.add_contributor(contributor, roles)
    return edition, new
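# A minimal sketch (not from the source) of the work XML extract_edition
# expects, reconstructed from the attributes read above. The namespace URI
# and all values are hypothetical; the real OCLC Classify response may
# differ.
SAMPLE_WORK_XML = """
<work xmlns:oclc="http://classify.oclc.org" owi="47186005"
      itemtype="itemtype-book" editions="120" holdings="3000">
  <oclc:ddc><oclc:mostPopular nsfa="813.52" holdings="2500"/></oclc:ddc>
  <oclc:lcc><oclc:mostPopular nsfa="PS3515" holdings="2400"/></oclc:lcc>
  <oclc:fast>
    <oclc:heading ident="fst01168328" heldby="1800">Whales</oclc:heading>
  </oclc:fast>
</work>
"""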
def test_recursively_equivalent_identifiers(self):
    # We start with a Gutenberg book.
    gutenberg = DataSource.lookup(self._db, DataSource.GUTENBERG)
    record, ignore = Edition.for_foreign_id(
        self._db, gutenberg, Identifier.GUTENBERG_ID, "100")
    gutenberg_id = record.primary_identifier

    # We use OCLC Classify to do a title/author lookup.
    oclc = DataSource.lookup(self._db, DataSource.OCLC)
    search_id, ignore = Identifier.for_foreign_id(
        self._db, Identifier.OCLC_WORK, "60010")
    gutenberg_id.equivalent_to(oclc, search_id, 1)

    # The title/author lookup associates the search term with two
    # different OCLC Numbers.
    oclc_id, ignore = Identifier.for_foreign_id(
        self._db, Identifier.OCLC_NUMBER, "9999")
    oclc_id_2, ignore = Identifier.for_foreign_id(
        self._db, Identifier.OCLC_NUMBER, "1000")
    search_id.equivalent_to(oclc, oclc_id, 1)
    search_id.equivalent_to(oclc, oclc_id_2, 1)

    # We then use OCLC Linked Data to connect one of the OCLC
    # Numbers with an ISBN.
    linked_data = DataSource.lookup(self._db, DataSource.OCLC_LINKED_DATA)
    isbn_id, ignore = Identifier.for_foreign_id(
        self._db, Identifier.ISBN, "900100434X")
    oclc_id.equivalent_to(linked_data, isbn_id, 1)

    # As it turns out, we have an Overdrive work record...
    overdrive = DataSource.lookup(self._db, DataSource.OVERDRIVE)
    overdrive_record, ignore = Edition.for_foreign_id(
        self._db, overdrive, Identifier.OVERDRIVE_ID, "{111-222}")
    overdrive_id = overdrive_record.primary_identifier

    # ...which is tied (by Overdrive) to the same ISBN.
    overdrive_id.equivalent_to(overdrive, isbn_id, 1)

    # Finally, here's a completely unrelated Edition, which should
    # not show up at any level.
    gutenberg2, ignore = Edition.for_foreign_id(
        self._db, gutenberg, Identifier.GUTENBERG_ID, "200")
    gutenberg2.title = "Unrelated Gutenberg record."

    levels = [
        record.equivalent_identifiers(
            policy=PresentationCalculationPolicy(
                equivalent_identifier_levels=i))
        for i in range(0, 5)
    ]

    # At level 0, the only identifier found is the Gutenberg ID.
    assert set([gutenberg_id]) == set(levels[0])

    # At level 1, we pick up the title/author lookup.
    assert set([gutenberg_id, search_id]) == set(levels[1])

    # At level 2, we pick up the title/author lookup and the two
    # OCLC Numbers.
    assert set(
        [gutenberg_id, search_id, oclc_id, oclc_id_2]) == set(levels[2])

    # At level 3, we also pick up the ISBN.
    assert set(
        [gutenberg_id, search_id, oclc_id, oclc_id_2, isbn_id]
    ) == set(levels[3])

    # At level 4, the recursion starts to go in the other
    # direction: we pick up the Overdrive ID that's equivalent to
    # the same ISBN as the OCLC Number.
    assert set([
        gutenberg_id, search_id, oclc_id, oclc_id_2, isbn_id, overdrive_id
    ]) == set(levels[4])
def do_run(self, _db): identifier = Identifier(type="You Can", identifier="Keep It") _db.add(identifier) raise RuntimeError
            message=message, action=action)

    # By now we can assume the response is either empty or a list.
    for item in resp_obj:
        # Go through the patron's holds, building HoldInfo objects.
        media_type = item.get('mediaType', 'eBook')
        isbn = item.get('isbn', None)
        title = item.get('title', None)
        authors = item.get('authors', None)
        expires = item.get('expiration', None)
        if expires:
            expires = datetime.datetime.strptime(
                expires, self.EXPIRATION_DATE_FORMAT).date()

        identifier = Identifier.from_asin(self._db, isbn, autocreate=False)
        # Note: if OneClick knows about a patron's hold that wasn't
        # placed through us, we ignore it.
        if not identifier:
            continue

        hold = HoldInfo(self.collection, DataSource.RB_DIGITAL,
                        Identifier.RB_DIGITAL_ID, isbn,
                        start_date=None, end_date=expires,
                        hold_position=None)
        holds.append(hold)
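# A minimal sketch (not from the source): EXPIRATION_DATE_FORMAT is defined
# elsewhere in the class. Assuming an ISO-style "%Y-%m-%d" format, the
# conversion above behaves like this; the date value is hypothetical.
expires = datetime.datetime.strptime("2016-10-15", "%Y-%m-%d").date()
# -> datetime.date(2016, 10, 15)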
def lookup_info_to_metadata(self, lookup_representation):
    """Transforms a NoveList JSON representation into a Metadata object"""

    if not lookup_representation.content:
        return None

    lookup_info = json.loads(lookup_representation.content)
    book_info = lookup_info.get('TitleInfo')
    novelist_identifier = None
    if book_info:
        novelist_identifier = book_info.get('ui')
    if not novelist_identifier:
        # NoveList didn't know the ISBN.
        return None

    primary_identifier, ignore = Identifier.for_foreign_id(
        self._db, Identifier.NOVELIST_ID, novelist_identifier
    )
    metadata = Metadata(self.source, primary_identifier=primary_identifier)

    # Get the equivalent ISBN identifiers.
    metadata.identifiers += self._extract_isbns(book_info)

    author = book_info.get('author')
    if author:
        metadata.contributors.append(ContributorData(sort_name=author))

    description = book_info.get('description')
    if description:
        metadata.links.append(LinkData(
            rel=Hyperlink.DESCRIPTION, content=description,
            media_type=Representation.TEXT_PLAIN
        ))

    audience_level = book_info.get('audience_level')
    if audience_level:
        metadata.subjects.append(SubjectData(
            Subject.FREEFORM_AUDIENCE, audience_level
        ))

    novelist_rating = book_info.get('rating')
    if novelist_rating:
        metadata.measurements.append(MeasurementData(
            Measurement.RATING, novelist_rating
        ))

    # Extract feature content if it is available.
    series_info = None
    appeals_info = None
    lexile_info = None
    goodreads_info = None
    recommendations_info = None
    feature_content = lookup_info.get('FeatureContent')
    if feature_content:
        series_info = feature_content.get('SeriesInfo')
        appeals_info = feature_content.get('Appeals')
        lexile_info = feature_content.get('LexileInfo')
        goodreads_info = feature_content.get('GoodReads')
        recommendations_info = feature_content.get('SimilarTitles')

    metadata, title_key = self.get_series_information(
        metadata, series_info, book_info
    )
    metadata.title = book_info.get(title_key)
    subtitle = TitleProcessor.extract_subtitle(
        metadata.title, book_info.get('full_title')
    )
    metadata.subtitle = self._scrub_subtitle(subtitle)

    if appeals_info:
        extracted_genres = False
        for appeal in appeals_info:
            genres = appeal.get('genres')
            if genres:
                for genre in genres:
                    metadata.subjects.append(SubjectData(
                        Subject.TAG, genre['Name']
                    ))
                    extracted_genres = True
            if extracted_genres:
                break

    if lexile_info:
        metadata.subjects.append(SubjectData(
            Subject.LEXILE_SCORE, lexile_info['Lexile']
        ))

    if goodreads_info:
        metadata.measurements.append(MeasurementData(
            Measurement.RATING, goodreads_info['average_rating']
        ))

    metadata = self.get_recommendations(metadata, recommendations_info)

    # If nothing interesting comes from the API, ignore it.
    if not (metadata.measurements or metadata.series_position
            or metadata.series or metadata.subjects or metadata.links
            or metadata.subtitle or metadata.recommendations):
        metadata = None
    return metadata
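# A minimal sketch (not from the source) of the NoveList lookup JSON that
# lookup_info_to_metadata consumes, limited to the keys read above. All
# values are hypothetical placeholders.
SAMPLE_NOVELIST_LOOKUP = {
    "TitleInfo": {
        "ui": "9780123456789",
        "title": "Example Title",
        "full_title": "Example Title: A Subtitle",
        "author": "Melville, Herman",
        "description": "An example description.",
        "audience_level": "Adult",
        "rating": 3.5,
    },
    "FeatureContent": {
        "SeriesInfo": {},
        "Appeals": [{"genres": [{"Name": "Literary fiction"}]}],
        "LexileInfo": {"Lexile": "1030"},
        "GoodReads": {"average_rating": 3.5},
        "SimilarTitles": {},
    },
}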
import os
import sys

from overdrive import OverdriveAPI
from threem import ThreeMAPI
from circulation import CirculationAPI
from core.model import (
    Identifier,
    Patron,
    get_one_or_create,
    production_session,
)

barcode, pin, borrow_urn, hold_urn = sys.argv[1:5]
email = os.environ.get(
    "DEFAULT_NOTIFICATION_EMAIL_ADDRESS", "*****@*****.**")

_db = production_session()
patron, ignore = get_one_or_create(
    _db, Patron, authorization_identifier=barcode)

borrow_identifier = Identifier.parse_urn(_db, borrow_urn, True)[0]
hold_identifier = Identifier.parse_urn(_db, hold_urn, True)[0]
borrow_pool = borrow_identifier.licensed_through
hold_pool = hold_identifier.licensed_through

if any(x.type == Identifier.THREEM_ID
       for x in [borrow_identifier, hold_identifier]):
    threem = ThreeMAPI(_db)
else:
    threem = None

if any(x.type == Identifier.OVERDRIVE_ID
       for x in [borrow_identifier, hold_identifier]):
    overdrive = OverdriveAPI(_db)
else:
    overdrive = None
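# Usage sketch (not from the source), inferred from the sys.argv[1:5]
# unpacking above. The script name is hypothetical:
#
#   python borrow_and_hold.py <barcode> <pin> <borrow_urn> <hold_urn>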