class TestVIAFClient(DatabaseTest):
    """Tests for VIAFClient lookups and contributor processing, driven
    by canned VIAF XML responses instead of live HTTP requests.
    """

    def setup(self):
        super(TestVIAFClient, self).setup()
        self.client = VIAFClient(self._db)
        self.log = logging.getLogger("VIAF Client Test")

    def sample_data(self, filename):
        """Load a sample file from the "viaf" sample-data directory."""
        return sample_data(filename, "viaf")

    def queue_file_in_mock_http(self, filename):
        """Return a DummyHTTPClient with one 200 text/xml response queued,
        whose body is the named sample file.
        """
        payload = self.sample_data(filename)
        http_client = DummyHTTPClient()
        http_client.queue_response(
            200, media_type='text/xml', content=payload
        )
        return http_client

    def test_process_contributor(self):
        mock_client = MockVIAFClientLookup(self._db, self.log)
        original = self._contributor()[0]

        # An empty lookup result (as VIAFParser#parse_multiple can
        # produce) leaves the contributor untouched.
        mock_client.queue_lookup([])
        mock_client.process_contributor(original)
        eq_(original.sort_name, '2001')
        eq_(original.display_name, None)

        def queue_lookup_result():
            # Run a real parse of the canned Mindy Kaling record and
            # make it the mock client's only pending result.
            mock_http = self.queue_file_in_mock_http("mindy_kaling.xml")
            parsed = self.client.lookup_by_viaf(
                viaf="9581122", do_get=mock_http.do_get
            )
            mock_client.results = [parsed]

        # A successful lookup updates the contributor.
        queue_lookup_result()
        mock_client.process_contributor(original)
        eq_(original.sort_name, "Kaling, Mindy")
        eq_(original.display_name, "Mindy Kaling")

        # When a contributor with the same VIAF number already exists,
        # the existing one receives the VIAF data and the contributor
        # being processed is merged into it.
        #
        # Blank out the existing contributor's sort name so the data
        # update can be observed.
        original.sort_name = None

        # A fresh contributor with a contribution lets us observe the merge.
        duplicate = self._contributor()[0]
        edition = self._edition(authors=duplicate.sort_name)
        eq_(edition.contributors, {duplicate})

        queue_lookup_result()
        mock_client.process_contributor(duplicate)
        eq_(original.sort_name, "Kaling, Mindy")
        eq_(edition.contributors, {original})
        # The fresh contributor has been deleted.
        assert duplicate not in self._db

        # When the existing contributor's display name differs
        # suspiciously from the VIAF display name, the new contributor
        # is updated but NOT merged.
        original.display_name = "Mindy L. Kaling"
        original.sort_name = None
        unmerged = self._contributor()[0]
        edition = self._edition(authors=unmerged.sort_name)
        queue_lookup_result()
        mock_client.process_contributor(unmerged)
        eq_(unmerged.viaf, "9581122")
        eq_(unmerged.sort_name, "Kaling, Mindy")
        # The earlier contributor was neither updated nor merged.
        eq_(original.sort_name, None)
        assert original not in edition.contributors

    def test_lookup_by_viaf(self):
        # there can be one and only one Mindy
        mock_http = self.queue_file_in_mock_http("mindy_kaling.xml")

        result = self.client.lookup_by_viaf(
            viaf="9581122", do_get=mock_http.do_get
        )
        selected_candidate, match_confidences, contributor_titles = result
        eq_(selected_candidate.viaf, "9581122")
        eq_(selected_candidate.sort_name, "Kaling, Mindy")

    def test_lookup_by_name(self):
        # there can be one and only one Mindy
        mock_http = self.queue_file_in_mock_http("mindy_kaling.xml")

        result = self.client.lookup_by_name(
            sort_name="Mindy Kaling", do_get=mock_http.do_get
        )
        selected_candidate, match_confidences, contributor_titles = result
        eq_(selected_candidate.viaf, "9581122")
        eq_(selected_candidate.sort_name, "Kaling, Mindy")
class TestVIAFClient(DatabaseTest):
    """Tests for VIAFClient lookups and contributor processing, driven
    by canned VIAF XML responses instead of live HTTP requests.
    """

    def setup(self):
        super(TestVIAFClient, self).setup()
        self.client = VIAFClient(self._db)
        self.log = logging.getLogger("VIAF Client Test")

    def sample_data(self, filename):
        """Load a sample file from the "viaf" sample-data directory."""
        return sample_data(filename, "viaf")

    def queue_file_in_mock_http(self, filename):
        """Return a DummyHTTPClient with one 200 text/xml response queued,
        whose body is the named sample file.
        """
        payload = self.sample_data(filename)
        http_client = DummyHTTPClient()
        http_client.queue_response(
            200, media_type='text/xml', content=payload
        )
        return http_client

    def test_process_contributor(self):
        mock_client = MockVIAFClientLookup(self._db, self.log)
        original = self._contributor()[0]

        # An empty lookup result (as VIAFParser#parse_multiple can
        # produce) leaves the contributor untouched.
        mock_client.queue_lookup([])
        mock_client.process_contributor(original)
        eq_(original.sort_name, '2001')
        eq_(original.display_name, None)

        def queue_lookup_result():
            # Run a real parse of the canned Mindy Kaling record and
            # make it the mock client's only pending result.
            mock_http = self.queue_file_in_mock_http("mindy_kaling.xml")
            parsed = self.client.lookup_by_viaf(
                viaf="9581122", do_get=mock_http.do_get
            )
            mock_client.results = [parsed]

        # A successful lookup updates the contributor.
        queue_lookup_result()
        mock_client.process_contributor(original)
        eq_(original.sort_name, "Kaling, Mindy")
        eq_(original.display_name, "Mindy Kaling")

        # When a contributor with the same VIAF number already exists,
        # the existing one receives the VIAF data and the contributor
        # being processed is merged into it.
        #
        # Blank out the existing contributor's sort name so the data
        # update can be observed.
        original.sort_name = None

        # A fresh contributor with a contribution lets us observe the merge.
        duplicate = self._contributor()[0]
        edition = self._edition(authors=duplicate.sort_name)
        eq_(edition.contributors, {duplicate})

        queue_lookup_result()
        mock_client.process_contributor(duplicate)
        eq_(original.sort_name, "Kaling, Mindy")
        eq_(edition.contributors, {original})
        # The fresh contributor has been deleted.
        assert duplicate not in self._db

        # When the existing contributor's display name differs
        # suspiciously from the VIAF display name, the new contributor
        # is updated but NOT merged.
        original.display_name = "Mindy L. Kaling"
        original.sort_name = None
        unmerged = self._contributor()[0]
        edition = self._edition(authors=unmerged.sort_name)
        queue_lookup_result()
        mock_client.process_contributor(unmerged)
        eq_(unmerged.viaf, "9581122")
        eq_(unmerged.sort_name, "Kaling, Mindy")
        # The earlier contributor was neither updated nor merged.
        eq_(original.sort_name, None)
        assert original not in edition.contributors

    def test_lookup_by_viaf(self):
        # there can be one and only one Mindy
        mock_http = self.queue_file_in_mock_http("mindy_kaling.xml")

        result = self.client.lookup_by_viaf(
            viaf="9581122", do_get=mock_http.do_get
        )
        selected_candidate, match_confidences, contributor_titles = result
        eq_(selected_candidate.viaf, "9581122")
        eq_(selected_candidate.sort_name, "Kaling, Mindy")

    def test_lookup_by_name(self):
        # there can be one and only one Mindy
        mock_http = self.queue_file_in_mock_http("mindy_kaling.xml")

        result = self.client.lookup_by_name(
            sort_name="Mindy Kaling", do_get=mock_http.do_get
        )
        selected_candidate, match_confidences, contributor_titles = result
        eq_(selected_candidate.viaf, "9581122")
        eq_(selected_candidate.sort_name, "Kaling, Mindy")
class LinkedDataCoverageProvider(IdentifierCoverageProvider):
    """Runs Editions obtained from OCLC Lookup through OCLC Linked Data.

    This (maybe) associates an edition with a (potentially) large
    number of ISBNs, which can be used as input into other services.
    """

    SERVICE_NAME = "OCLC Linked Data Coverage Provider"
    DEFAULT_BATCH_SIZE = 10

    DATA_SOURCE_NAME = DataSource.OCLC_LINKED_DATA
    INPUT_IDENTIFIER_TYPES = [
        Identifier.OCLC_WORK,
        Identifier.OCLC_NUMBER,
        Identifier.ISBN,
        Identifier.OVERDRIVE_ID,
    ]

    def __init__(self, _db, *args, **kwargs):
        """Set up the OCLC Linked Data and VIAF clients.

        :param _db: Database session; also handed to any default client
            created here and to the parent constructor.
        :param api: Optional keyword argument. Used as ``self.api``
            instead of a new ``OCLCLinkedData(_db)``.
        :param viaf_api: Optional keyword argument. Used as ``self.viaf``
            instead of a new ``VIAFClient(_db)``.

        Both keywords are removed from ``kwargs`` before the remaining
        arguments are passed to the parent constructor.
        """
        if 'api' in kwargs:
            self.api = kwargs.pop('api')
        else:
            self.api = OCLCLinkedData(_db)

        if 'viaf_api' in kwargs:
            self.viaf = kwargs.pop('viaf_api')
        else:
            self.viaf = VIAFClient(_db)

        super(LinkedDataCoverageProvider, self).__init__(_db, *args, **kwargs)

    def process_item(self, identifier):
        """Apply OCLC Linked Data metadata for one identifier.

        :param identifier: The Identifier to look up.
        :return: ``identifier`` on success, or the result of
            ``self.failure(...)`` if an IOError was raised while
            processing. The failure is marked non-transient when the
            error message says the location couldn't be found, and
            transient otherwise.
        """
        try:
            new_info_counter = Counter()
            self.log.info("Processing identifier %r", identifier)
            metadatas = [m for m in self.api.info_for(identifier)]

            if identifier.type == Identifier.ISBN:
                # Currently info_for seeks the results of OCLC Work IDs only.
                # This segment will get the metadata of any equivalent
                # OCLC Numbers as well.
                equivalents = Identifier.recursively_equivalent_identifier_ids(
                    self._db, [identifier.id]
                )
                oclc_numbers = self._db.query(Identifier).\
                    filter(Identifier.id.in_(equivalents)).\
                    filter(Identifier.type==Identifier.OCLC_NUMBER).all()
                for oclc_number in oclc_numbers:
                    more_metadata = [m for m in self.api.info_for(oclc_number)]
                    metadatas += more_metadata

            # Drop any empty/None results.
            metadatas = [m for m in metadatas if m]
            for metadata in metadatas:
                other_identifier, ignore = metadata.primary_identifier.load(
                    self._db
                )
                oclc_editions = other_identifier.primarily_identifies

                # Keep track of the number of editions OCLC associates
                # with this identifier.
                other_identifier.add_measurement(
                    self.data_source, Measurement.PUBLISHED_EDITIONS,
                    len(oclc_editions)
                )

                # Clean up contributor information.
                self.apply_viaf_to_contributor_data(metadata)
                # Remove any empty ContributorData objects that may have
                # been created. A real list (not a lazy filter object) is
                # required because the contributors are iterated again
                # later, e.g. in set_equivalence.
                metadata.contributors = [
                    c for c in metadata.contributors
                    if c.sort_name or c.display_name
                ]

                # When metadata is applied, it must be given a client that
                # can respond to 'canonicalize_author_name'. Usually this
                # is an OPDSImporter that reaches out to the Metadata
                # Wrangler, but in the case of being _on_ the Metadata
                # Wrangler...:
                from canonicalize import AuthorNameCanonicalizer
                metadata_client = AuthorNameCanonicalizer(
                    self._db, oclcld=self.api, viaf=self.viaf
                )

                num_new_isbns = self.new_isbns(metadata)
                new_info_counter['isbns'] += num_new_isbns
                if oclc_editions:
                    # There are existing OCLC editions. Apply any new
                    # information to them.
                    for edition in oclc_editions:
                        metadata, new_info_counter = self.apply_metadata_to_edition(
                            edition, metadata, metadata_client, new_info_counter
                        )
                else:
                    # Create a new OCLC edition to hold the information.
                    edition, ignore = get_one_or_create(
                        self._db, Edition, data_source=self.data_source,
                        primary_identifier=other_identifier
                    )
                    metadata, new_info_counter = self.apply_metadata_to_edition(
                        edition, metadata, metadata_client, new_info_counter
                    )
                    # Set the new OCLC edition's identifier equivalent to
                    # this identifier so we know they're related.
                    self.set_equivalence(identifier, metadata)

            # Pass the counts positionally. Handing the Counter itself to
            # the logger as a mapping breaks when it is empty (no metadata
            # found): an empty mapping is falsy, so logging does not use it
            # for %(name)d substitution and the format fails.
            self.log.info(
                "Total: %d editions, %d ISBNs, "
                "%d descriptions, %d classifications.",
                new_info_counter['editions'],
                new_info_counter['isbns'],
                new_info_counter['descriptions'],
                new_info_counter['subjects'],
            )
        except IOError as e:
            # `e.message` only exists on Python 2 (and is empty for
            # IOError(errno, strerror)); fall back to str(e).
            message = getattr(e, 'message', None) or str(e)
            if ", but couldn't find location" in message:
                exception = "OCLC doesn't know about this ISBN: %r" % e
                transient = False
            else:
                exception = "OCLC raised an error: %r" % e
                transient = True
            return self.failure(identifier, exception, transient=transient)
        return identifier

    def apply_viaf_to_contributor_data(self, metadata):
        """Looks up VIAF information for contributors identified by OCLC

        This is particularly crucial for contributors identified solely
        by VIAF IDs (and no sort_name), as it raises errors later in the
        process.

        :param metadata: Metadata whose ``contributors`` are updated in
            place; only contributors with a ``viaf`` value are looked up.
        """
        for contributor_data in metadata.contributors:
            if not contributor_data.viaf:
                continue
            # The first element of the lookup result is the selected
            # candidate; the remaining elements are not needed here.
            viaf_contributor_data = self.viaf.lookup_by_viaf(
                contributor_data.viaf,
                working_sort_name=contributor_data.sort_name,
                working_display_name=contributor_data.display_name
            )[0]
            if viaf_contributor_data:
                viaf_contributor_data.apply(contributor_data)

    def apply_metadata_to_edition(self, edition, metadata, metadata_client, counter):
        """Applies metadata and increments counters

        :return: A 2-tuple ``(metadata, counter)`` with 'editions',
            'descriptions' and 'subjects' incremented on ``counter``.
        """
        metadata.apply(
            edition, collection=None, metadata_client=metadata_client
        )
        counter['editions'] += 1
        counter['descriptions'] += len(metadata.links)
        counter['subjects'] += len(metadata.subjects)

        return metadata, counter

    def new_isbns(self, metadata):
        """Returns the number of new isbns on a metadata object

        Every entry in ``metadata.identifiers`` is loaded via
        ``load(self._db)``; entries reported as new are counted.
        """
        new_isbns = 0
        for identifier_data in metadata.identifiers:
            identifier, new = identifier_data.load(self._db)
            if new:
                new_isbns += 1
        return new_isbns

    def set_equivalence(self, identifier, metadata):
        """Identify the OCLC Number with the OCLC Work

        The equivalence strength is the best score across the editions
        primarily identified by ``identifier``, where each score is
        80% title similarity plus 20% overlap of contributor VIAF ids.
        If ``identifier`` has no such editions, the strength is 1.
        Nothing is recorded when the resulting strength is 0.
        """
        primary_editions = identifier.primarily_identifies
        if primary_editions:
            strength = 0
            for primary_edition in primary_editions:
                if metadata.title:
                    title_strength = MetadataSimilarity.title_similarity(
                        metadata.title, primary_edition.title
                    )
                else:
                    title_strength = 0
                edition_viafs = set(
                    c.viaf for c in primary_edition.contributors if c.viaf
                )
                metadata_viafs = set(
                    c.viaf for c in metadata.contributors if c.viaf
                )
                author_strength = MetadataSimilarity._proportion(
                    edition_viafs, metadata_viafs
                )
                edition_strength = (title_strength * 0.8) + (author_strength * 0.2)
                if edition_strength > strength:
                    strength = edition_strength
        else:
            strength = 1

        if strength > 0:
            primary_identifier, ignore = metadata.primary_identifier.load(
                self._db
            )
            identifier.equivalent_to(
                self.data_source, primary_identifier, strength
            )
import re

from sqlalchemy.sql import text

from core.model import (
    production_session,
    Contributor,
)
from viaf import VIAFClient

# Maintenance script: refresh the names of contributors whose display
# name matches the regular expression ^Q[0-9] (a "Q" followed by a
# digit), using data looked up from VIAF, then recalculate the
# presentation of every work those contributors contributed to.
# NOTE(review): such display names presumably are identifiers that were
# stored in place of a real name -- confirm.

_db = production_session()
viaf_client = VIAFClient(_db)

contributors = _db.query(Contributor).filter(
    text("contributors.display_name ~ '^Q[0-9]'")
).order_by(Contributor.id)

# Parenthesized single-argument print behaves the same on Python 2 and 3.
print(contributors.count())

for contributor in contributors:
    # Prefer the VIAF id when we have one; otherwise search by name.
    # NOTE(review): assumes both lookups return the 5-tuple unpacked
    # below -- confirm against the current VIAFClient.
    if contributor.viaf:
        lookup = viaf_client.lookup_by_viaf(contributor.viaf)
    else:
        lookup = viaf_client.lookup_by_name(contributor.name)
    viaf, display_name, family_name, sort_name, wikipedia_name = lookup

    print("%s: %s => %s, %s => %s" % (
        contributor.id, contributor.display_name, display_name,
        contributor.wikipedia_name, wikipedia_name
    ))
    contributor.display_name = display_name
    contributor.wikipedia_name = wikipedia_name
    contributor.family_name = family_name

    # The lookup is intentionally NOT repeated here: a second
    # lookup_by_viaf(contributor.viaf) produced values that were never
    # used, and was called with viaf=None for contributors that had
    # been looked up by name.

    # Propagate the corrected names to every affected work.
    for contribution in contributor.contributions:
        edition = contribution.edition
        if edition.work:
            edition.work.calculate_presentation()
from sqlalchemy.sql import text

from core.model import (
    production_session,
    Contributor,
)
from viaf import VIAFClient

# Maintenance script: refresh the names of contributors whose display
# name matches the regular expression ^Q[0-9] (a "Q" followed by a
# digit), using data looked up from VIAF, then recalculate the
# presentation of every work those contributors contributed to.
# NOTE(review): such display names presumably are identifiers that were
# stored in place of a real name -- confirm.

_db = production_session()
viaf_client = VIAFClient(_db)

contributors = _db.query(Contributor).filter(
    text("contributors.display_name ~ '^Q[0-9]'")
).order_by(Contributor.id)

# Parenthesized single-argument print behaves the same on Python 2 and 3.
print(contributors.count())

for contributor in contributors:
    # Prefer the VIAF id when we have one; otherwise search by name.
    # NOTE(review): assumes both lookups return the 5-tuple unpacked
    # below -- confirm against the current VIAFClient.
    if contributor.viaf:
        lookup = viaf_client.lookup_by_viaf(contributor.viaf)
    else:
        lookup = viaf_client.lookup_by_name(contributor.name)
    viaf, display_name, family_name, sort_name, wikipedia_name = lookup

    print("%s: %s => %s, %s => %s" % (
        contributor.id, contributor.display_name, display_name,
        contributor.wikipedia_name, wikipedia_name
    ))
    contributor.display_name = display_name
    contributor.wikipedia_name = wikipedia_name
    contributor.family_name = family_name

    # The lookup is intentionally NOT repeated here: a second
    # lookup_by_viaf(contributor.viaf) produced values that were never
    # used, and was called with viaf=None for contributors that had
    # been looked up by name.

    # Propagate the corrected names to every affected work.
    for contribution in contributor.contributions:
        edition = contribution.edition
        if edition.work:
            edition.work.calculate_presentation()