def __init__(self, _db=None, cmd_args=None):
    """Initialize the script and pick a live or a mock name canonicalizer.

    :param _db: Database session, forwarded to the parent constructor.
    :param cmd_args: Command-line arguments handed to parse_command_line.
    """
    super(CheckContributorNamesOnWeb, self).__init__(_db=_db)

    options = self.parse_command_line(_db=self._db, cmd_args=cmd_args)
    self.mock_mode = options.mock

    if not self.mock_mode:
        # Normal run: use the real canonicalizer.
        self.canonicalizer = AuthorNameCanonicalizer(self._db)
        return

    self.log.debug(
        "This is mocked run, with metadata coming from test files, rather than live OneClick connection."
    )
    # Mocked runs record the "tests" directory that sits next to this file.
    self.base_path = os.path.join(os.path.dirname(__file__), "tests")
    self.canonicalizer = MockAuthorNameCanonicalizer(self._db)
class CanonicalizationController(object):
    """Request handler that asks an AuthorNameCanonicalizer for an author name.

    Reads the 'urn' and 'display_name' query arguments from the current
    request and answers with the canonicalizer's result as plain text.
    """

    log = logging.getLogger("Canonicalization Controller")

    def __init__(self, _db):
        """Keep the database session and build the canonicalizer on it."""
        self._db = _db
        self.canonicalizer = AuthorNameCanonicalizer(self._db)

    def canonicalize_author_name(self):
        """Canonicalize the author named by the request's query arguments.

        :return: INVALID_URN when a 'urn' argument is given but does not
            parse to an Identifier; an empty 404 response when the
            canonicalizer produces nothing; otherwise a 200 text/plain
            response carrying the canonicalized name.
        """
        query_args = request.args
        urn = query_args.get('urn')
        display_name = query_args.get('display_name')

        # The identifier is optional; without a URN we canonicalize on the
        # display name alone.
        identifier = None
        if urn:
            identifier = URNLookupController.parse_urn(self._db, urn, False)
            if not isinstance(identifier, Identifier):
                # parse_urn handed back something other than an Identifier.
                return INVALID_URN

        author_name = self.canonicalizer.canonicalize_author_name(
            identifier, display_name
        )
        self.log.info(
            "Incoming display name/identifier: %r/%s. Canonicalizer said: %s",
            display_name, identifier, author_name
        )

        if author_name:
            return make_response(
                author_name, 200, {"Content-Type": "text/plain"}
            )
        return make_response("", 404)
class CheckContributorNamesOnWeb(CheckContributorNamesInDB):
    """Check contributor sort names, asking web services when they look wrong.

    Inherits process_contribution_local from the parent.  Overrides
    process_local_mismatch so that a locally detected mismatch is handed
    to process_contribution_web, which asks OCLC Linked Data and VIAF for
    a sort_name before a Complaint is registered for a human to resolve.
    """

    COMPLAINT_SOURCE = "CheckContributorNamesOnWeb"

    def __init__(self, _db=None, cmd_args=None):
        """Initialize the script and pick a live or a mock canonicalizer.

        :param _db: Database session, forwarded to the parent constructor.
        :param cmd_args: Command-line arguments handed to parse_command_line.
        """
        super(CheckContributorNamesOnWeb, self).__init__(_db=_db)

        parsed_args = self.parse_command_line(_db=self._db, cmd_args=cmd_args)
        self.mock_mode = parsed_args.mock

        if self.mock_mode:
            self.log.debug(
                "This is mocked run, with metadata coming from test files, rather than live OneClick connection."
            )
            self.base_path = os.path.split(__file__)[0]
            self.base_path = os.path.join(self.base_path, "tests")
            self.canonicalizer = MockAuthorNameCanonicalizer(self._db)
        else:
            self.canonicalizer = AuthorNameCanonicalizer(self._db)

    def run(self, batch_size=10):
        """Page through the selected editions and check every contribution.

        Each contribution is passed to the inherited
        process_contribution_local.  The database session is committed
        after every batch and once more when the loop ends.

        TODO: first run the local-database check so its complaints are
        generated, then run the web search only on the contributions that
        have complaints lodged about them.

        :param batch_size: Number of editions fetched per query page.
        """
        # NOTE(review): the command line is parsed again here without the
        # cmd_args that were given to __init__ -- confirm this is intended.
        param_args = self.parse_command_line(self._db)

        self.query = self.make_query(
            self._db, param_args.identifier_type, param_args.identifiers, self.log
        )

        editions = True
        offset = 0
        output = "ContributorID|\tSortName|\tDisplayName|\tComputedSortName|\tResolution|\tComplaintSource"
        # Parenthesised single-argument form: same output as the bare
        # print statement under Python 2.
        print(output.encode("utf8"))

        # An empty page ends the loop.
        while editions:
            my_query = self.query.offset(offset).limit(batch_size)
            editions = my_query.all()

            for edition in editions:
                if edition.contributions:
                    for contribution in edition.contributions:
                        self.process_contribution_local(
                            self._db, contribution, self.log
                        )
            offset += batch_size

            self._db.commit()
        self._db.commit()

    @classmethod
    def arg_parser(cls):
        """Return the parent's argument parser with a --mock flag added."""
        parser = super(CheckContributorNamesOnWeb, cls).arg_parser()

        parser.add_argument(
            '--mock',
            help='If turned on, will use the MockCheckContributorNamesOnWeb client.',
            action='store_true'
        )
        return parser

    def process_local_mismatch(self, _db, contribution, computed_sort_name,
                               error_message_detail, log=None):
        """Send a locally detected sort_name mismatch to the web services.

        Overrides the parent method to allow further resolution of
        sort_name problems by calling process_contribution_web, which asks
        OCLC and VIAF for info.  That call decides whether the problem is
        fixed automatically or recorded as a Complaint for a human.

        computed_sort_name and error_message_detail are accepted for
        signature compatibility and are not used here.
        """
        self.process_contribution_web(
            _db=_db, contribution=contribution, redo_complaints=False, log=log
        )

    def _use_sort_name_if_ok(self, sort_name, contribution):
        """Adopt sort_name for the contribution if verify_sort_name accepts it.

        :return: True when the name was non-empty, passed verification and
            was stored (after resolving the parent script's complaints);
            False otherwise.
        """
        if not sort_name:
            return False
        # See that it's not too far off from what we already have.
        if not self.verify_sort_name(sort_name, contribution.contributor):
            return False
        self.resolve_local_complaints(contribution)
        self.set_contributor_sort_name(sort_name, contribution)
        return True

    def process_contribution_web(self, _db, contribution, redo_complaints=False, log=None):
        """Try to fix a contributor's sort_name using OCLC Linked Data and VIAF.

        If the sort_name we get from the web is not too far off from the
        sort_name we already have, then use it (auto-fix).  If it is far
        off, then it's possible we did not match the author very well.
        Make a complaint, and ask a human to fix it.

        Searches VIAF by the contributor's display_name and the edition's
        title.  If the contributor already has a viaf_id stored in our
        database, it is ignored.  It's possible that id was produced by an
        older, less precise matching algorithm and might want replacing.

        :param _db: Database session used to look up existing Complaints.
        :param contribution: The Contribution whose contributor is checked.
        :param redo_complaints: Should we try OCLC/VIAF on the names that
            already have Complaint objects lodged against them?  The
            alternative is to require human review of all Complaints.
        :param log: Logger passed on to register_problem.
        """
        if not contribution or not contribution.edition:
            return

        contributor = contribution.contributor
        if not contributor.display_name:
            return

        identifier = contribution.edition.primary_identifier
        if not identifier:
            return

        known_titles = []
        if contribution.edition.title:
            known_titles.append(contribution.edition.title)

        # Searching viaf can be resource-expensive, so only do it if
        # specifically asked.  See if there are any complaints already
        # lodged by a previous run of this script.
        pool = contribution.edition.is_presentation_for
        complaint = get_one(
            _db, Complaint, on_multiple='interchangeable',
            license_pool=pool,
            source=self.COMPLAINT_SOURCE,
            type=self.COMPLAINT_TYPE,
        )

        if not redo_complaints and complaint:
            # We already did some work on this contributor, and determined
            # to ask a human for help.  This method was called with the
            # time-saving redo_complaints=False flag.  Skip calling OCLC
            # and VIAF.
            return

        # Can we find an ISBN-type Identifier for this Contribution to
        # send a request to OCLC with?
        isbn_identifier = None
        if identifier.type == Identifier.ISBN:
            isbn_identifier = identifier
        else:
            for equivalency in identifier.equivalencies:
                if equivalency.output.type == Identifier.ISBN:
                    isbn_identifier = equivalency.output
                    break

        if isbn_identifier:
            # We can ask OCLC Linked Data about this ISBN.
            sort_name, uris = self.canonicalizer.sort_name_from_oclc_linked_data(
                isbn_identifier, contributor.display_name)
            if sort_name:
                if self._use_sort_name_if_ok(sort_name, contribution):
                    return
            else:
                # Nope.  If OCLC Linked Data gave us any VIAF IDs, look
                # them up and see if we can get a sort name out of them.
                for uri in uris or []:
                    match_found = self.canonicalizer.VIAF_ID.search(uri)
                    if not match_found:
                        continue
                    viaf_id = match_found.groups()[0]
                    contributor_data = self.canonicalizer.viaf.lookup_by_viaf(
                        viaf_id, working_display_name=contributor.display_name
                    )[0]
                    # Judge and store the name VIAF returned for this ID,
                    # not the (empty) name left over from the OCLC call.
                    if self._use_sort_name_if_ok(
                            contributor_data.sort_name, contribution):
                        return

        # Nope.  Ask VIAF about the display name and see what it says.
        sort_name = self.canonicalizer.sort_name_from_viaf(
            contributor.display_name, known_titles
        )
        if self._use_sort_name_if_ok(sort_name, contribution):
            return

        # If we got to this point, we have not gotten a satisfying enough
        # answer from either OCLC or VIAF.  Now is the time to generate a
        # Complaint, ask a human to come fix this.
        error_message_detail = "Contributor[id=%s].sort_name cannot be resolved from outside web services, human intervention required." % contributor.id
        self.register_problem(
            source=self.COMPLAINT_SOURCE, contribution=contribution,
            computed_sort_name=sort_name,
            error_message_detail=error_message_detail, log=log
        )

    @classmethod
    def verify_sort_name(cls, sort_name, contributor):
        """Decide whether a newly computed sort_name may replace the old one.

        Compares the new sort_name with the contributor's existing
        sort_name.  Too far off is an unexpected result and is a problem.
        Does not check for proper formatting, like "Last, First".

        :param sort_name: The candidate sort name obtained from the web.
        :param contributor: The Contributor whose sort_name is compared.
        :return name_ok: Boolean answer to "is this computed name good enough?"
        """
        if not contributor.sort_name:
            # Any port in a storm is an acceptable sort name.
            return True

        computed_sort_name = unicodedata.normalize("NFKD", unicode(sort_name))

        if (contributor.sort_name.strip().lower() == computed_sort_name.strip().lower()):
            # No change is good change.
            return True

        # Computed names don't match.  By how much?  If it's a matter of a
        # comma or a misplaced suffix, we can fix without asking for human
        # intervention.  If the names are very different, there's a chance
        # the sort and display names are different on purpose, s.a. when
        # foreign names are passed as translated into only one of the
        # fields, or when the author has a popular pseudonym.
        # Best ask a human.

        # If the relative lengths are off by more than a stray space or
        # comma, ask a human.  It probably means that a human metadata
        # professional had added an explanation/expansion to the
        # sort_name, s.a. "Bob A. Jones" --> "Bob A. (Allan) Jones", and
        # we'd rather not replace this data with the "Jones, Bob A." that
        # the auto-algorithm would generate.
        length_difference = len(contributor.sort_name.strip()) - len(computed_sort_name.strip())
        if abs(length_difference) > 3:
            return False

        match_ratio = contributor_name_match_ratio(
            contributor.sort_name, computed_sort_name, normalize_names=False
        )

        if (match_ratio < 40):
            # Ask a human.  This kind of score can happen when the
            # sort_name is a transliteration of the display_name, and is
            # non-trivial to fix.
            return False
        else:
            # We can fix it!
            return True

    def resolve_local_complaints(self, contribution):
        """Mark the parent script's open complaints for this pool as resolved.

        Resolves any complaints that the parent script may have made about
        this contributor's sort_name, because we've now asked the Web, and
        it gave us the answer.
        """
        pool = contribution.edition.is_presentation_for
        parent_source = super(CheckContributorNamesOnWeb, self).COMPLAINT_SOURCE
        parent_type = super(CheckContributorNamesOnWeb, self).COMPLAINT_TYPE

        query = self._db.query(Complaint)
        query = query.filter(Complaint.license_pool_id == pool.id)
        query = query.filter(Complaint.source == parent_source)
        query = query.filter(Complaint.type == parent_type)
        # "== None" is deliberate: it is a SQL expression (IS NULL), so it
        # must not be rewritten as "is None".
        query = query.filter(Complaint.resolved == None)  # noqa: E711

        complaints = query.all()
        for complaint in complaints:
            # Say that we fixed it.
            complaint.resolved = datetime.datetime.utcnow()
def __init__(self, _db):
    """Keep the database session and build an AuthorNameCanonicalizer on it.

    :param _db: Database session stored on the instance and handed to the
        canonicalizer.
    """
    self._db = _db
    self.canonicalizer = AuthorNameCanonicalizer(_db)
def setup(self):
    """Build a canonicalizer whose VIAF client is replaced by a mock."""
    super(TestAuthorNameCanonicalizer, self).setup()
    self.log = logging.getLogger("Author Name Canonicalizer Test")

    canonicalizer = AuthorNameCanonicalizer(self._db)
    mock_viaf = MockVIAFClientLookup(self._db, self.log)
    # Swap the canonicalizer's VIAF client for the mock one.
    canonicalizer.viaf = mock_viaf

    self.canonicalizer = canonicalizer
    self.viaf_client = mock_viaf
class TestAuthorNameCanonicalizer(DatabaseTest):
    """Tests for AuthorNameCanonicalizer, run against a mock VIAF client."""

    def setup(self):
        super(TestAuthorNameCanonicalizer, self).setup()
        self.log = logging.getLogger("Author Name Canonicalizer Test")

        canonicalizer = AuthorNameCanonicalizer(self._db)
        mock_viaf = MockVIAFClientLookup(self._db, self.log)
        # Swap the canonicalizer's VIAF client for the mock one.
        canonicalizer.viaf = mock_viaf
        self.canonicalizer = canonicalizer
        self.viaf_client = mock_viaf
        # The OCLC Linked Data client is not mocked yet:
        #self.oclc_client = MockOCLCLinkedData()
        #self.canonicalizer.oclcld = self.oclc_client

    def sample_data(self, filename):
        # Load a fixture from the "viaf" sample data directory.
        return sample_data(filename=filename, sample_data_dir="viaf")

    def queue_file_in_mock_http(self, filename):
        # Return a dummy HTTP client with one queued 200 text/xml response
        # whose body is the named sample file.
        content = self.sample_data(filename)
        http = DummyHTTPClient()
        http.queue_response(200, media_type='text/xml', content=content)
        return http

    #def queue_viaf_lookup_result():
    #    http = self.queue_file_in_mock_http("mindy_kaling.xml")
    #    lookup = self.viaf_client.lookup_by_viaf(viaf="9581122", do_get=http.do_get)
    #    client.results = [lookup]

    def test_primary_author_name(self):
        # Test our ability to turn a freeform string that identifies
        # one or more people into the likely name of one person.
        primary = self.canonicalizer.primary_author_name

        # Test the simplest case.
        eq_("Mindy Kaling", primary("Mindy Kaling"))

        # Make sure only the first human's name is used.
        first_human_cases = [
            ("Mindy Kaling", "Mindy Kaling, Bob Saget and Co"),
            ("Bill O'Reilly", "Bill O'Reilly with Martin Dugard"),
            ("Clare Verbeek",
             "Clare Verbeek, Thembani Dladla, Zanele Buthelezi"),
        ]
        for expected, freeform in first_human_cases:
            eq_(expected, primary(freeform))

        # In most cases, when a sort name is passed in as a display
        # name, the situation is correctly diagnosed and the name is
        # returned as-is.
        already_sort_names = [
            'Kaling, Mindy',
            'Tolkien, J. R. R.',
            'van Damme, Jean-Claude',
        ]
        for name in already_sort_names:
            eq_(name, primary(name))

        # Similarly when there is no distinction between display
        # and sort name.
        single_names = ['Cher', 'Various', 'Anonymous']
        for name in single_names:
            eq_(name, primary(name))

        # These are not likely to show up in real usage, but we can
        # handle them.
        unlikely_inputs = [
            'Rand, Ayn, and Cher',
            'Rand, Ayn, and Kaling, Mindy',
        ]
        for freeform in unlikely_inputs:
            eq_("Rand, Ayn", primary(freeform))

    def test__canonicalize_single_name(self):
        # For single-named entities, the sort name and display name
        # are identical. We don't need to ask VIAF.
        viaf = self.canonicalizer.viaf
        viaf.queue_lookup("bad data")

        for single_name in ('Various', 'Anonymous', 'Cher'):
            canonicalized = self.canonicalizer._canonicalize(
                identifier=None, display_name=single_name
            )
            eq_(single_name, canonicalized)

        # We didn't ask the mock VIAF about anything.
        eq_(["bad data"], self.canonicalizer.viaf.results)

    def test_found_contributor(self):
        # If we find a matching contributor already in our database,
        # then don't bother looking at OCLC or VIAF.
        contributor_1, _ = self._contributor(sort_name="Zebra, Ant")
        contributor_1.display_name = "Ant Zebra"
        contributor_2, _ = self._contributor(sort_name="Yarrow, Bloom")
        contributor_2.display_name = "Bloom Yarrow"

        # _canonicalize shouldn't try to contact viaf or oclc, but in case
        # it does, make sure the contact brings wrong results.
        self.canonicalizer.viaf.queue_lookup([])
        #self.canonicalizer.oclcld.queue_lookup([])

        canonicalized_author = self.canonicalizer._canonicalize(
            identifier=None, display_name="Ant Zebra"
        )
        eq_(canonicalized_author, contributor_1.sort_name)

    def test_oclc_contributor(self):
        # TODO: make sure isbn ids get directed to OCLC
        pass

    def test_non_isbn_identifier(self):
        # TODO: make sure non-isbn ids get directed to VIAF
        pass
class TestAuthorNameCanonicalizer(DatabaseTest):
    """Exercises AuthorNameCanonicalizer with its VIAF client mocked out."""

    def setup(self):
        super(TestAuthorNameCanonicalizer, self).setup()
        self.log = logging.getLogger("Author Name Canonicalizer Test")

        canonicalizer = AuthorNameCanonicalizer(self._db)
        viaf_stub = MockVIAFClientLookup(self._db, self.log)
        # The canonicalizer talks to the mock instead of its own client.
        canonicalizer.viaf = viaf_stub
        self.canonicalizer = canonicalizer
        self.viaf_client = viaf_stub
        # No mock exists for the OCLC Linked Data client yet:
        #self.oclc_client = MockOCLCLinkedData()
        #self.canonicalizer.oclcld = self.oclc_client

    def sample_data(self, filename):
        # Read a fixture out of the "viaf" sample data directory.
        return sample_data(filename=filename, sample_data_dir="viaf")

    def queue_file_in_mock_http(self, filename):
        # Build a dummy HTTP client that will answer once, with status 200
        # and the named sample file as a text/xml body.
        body = self.sample_data(filename)
        client = DummyHTTPClient()
        client.queue_response(200, media_type='text/xml', content=body)
        return client

    #def queue_viaf_lookup_result():
    #    http = self.queue_file_in_mock_http("mindy_kaling.xml")
    #    lookup = self.viaf_client.lookup_by_viaf(viaf="9581122", do_get=http.do_get)
    #    client.results = [lookup]

    def test_primary_author_name(self):
        # Test our ability to turn a freeform string that identifies
        # one or more people into the likely name of one person.
        extract = self.canonicalizer.primary_author_name

        # Test the simplest case.
        eq_("Mindy Kaling", extract("Mindy Kaling"))

        # Make sure only the first human's name is used.
        for wanted, raw in (
            ("Mindy Kaling", "Mindy Kaling, Bob Saget and Co"),
            ("Bill O'Reilly", "Bill O'Reilly with Martin Dugard"),
            ("Clare Verbeek",
             "Clare Verbeek, Thembani Dladla, Zanele Buthelezi"),
        ):
            eq_(wanted, extract(raw))

        # In most cases, when a sort name is passed in as a display
        # name, the situation is correctly diagnosed and the name is
        # returned as-is.
        for unchanged in (
            'Kaling, Mindy',
            'Tolkien, J. R. R.',
            'van Damme, Jean-Claude',
        ):
            eq_(unchanged, extract(unchanged))

        # Similarly when there is no distinction between display
        # and sort name.
        for unchanged in ('Cher', 'Various', 'Anonymous'):
            eq_(unchanged, extract(unchanged))

        # These are not likely to show up in real usage, but we can
        # handle them.
        for raw in ('Rand, Ayn, and Cher', 'Rand, Ayn, and Kaling, Mindy'):
            eq_("Rand, Ayn", extract(raw))

    def test__canonicalize_single_name(self):
        # For single-named entities, the sort name and display name
        # are identical. We don't need to ask VIAF.
        self.canonicalizer.viaf.queue_lookup("bad data")

        mononyms = ('Various', 'Anonymous', 'Cher')
        for mononym in mononyms:
            result = self.canonicalizer._canonicalize(
                identifier=None, display_name=mononym
            )
            eq_(mononym, result)

        # We didn't ask the mock VIAF about anything.
        eq_(["bad data"], self.canonicalizer.viaf.results)

    def test_found_contributor(self):
        # If we find a matching contributor already in our database,
        # then don't bother looking at OCLC or VIAF.
        contributor_1, _ = self._contributor(sort_name="Zebra, Ant")
        contributor_1.display_name = "Ant Zebra"
        contributor_2, _ = self._contributor(sort_name="Yarrow, Bloom")
        contributor_2.display_name = "Bloom Yarrow"

        # _canonicalize shouldn't try to contact viaf or oclc, but in case
        # it does, make sure the contact brings wrong results.
        self.canonicalizer.viaf.queue_lookup([])
        #self.canonicalizer.oclcld.queue_lookup([])

        canonicalized_author = self.canonicalizer._canonicalize(
            identifier=None, display_name="Ant Zebra"
        )
        eq_(canonicalized_author, contributor_1.sort_name)

    def test_oclc_contributor(self):
        # TODO: make sure isbn ids get directed to OCLC
        pass

    def test_non_isbn_identifier(self):
        # TODO: make sure non-isbn ids get directed to VIAF
        pass