def lookup_by_name(self, sort_name, display_name=None, do_get=None, known_titles=None):
    """Ask VIAF for author clusters matching a name and pick the best one.

    Pages through the VIAF search results ten records at a time, collects
    every candidate the parser returns, and hands the combined list to
    ``select_best_match``.

    :param sort_name: Author name in Last, First format.
    :param display_name: Author name in First Last format. It is used as
        the search term only when ``sort_name`` is empty.
    :param do_get: Passed through unchanged to ``Representation.get``.
    :param known_titles: A list of titles we know this author wrote;
        forwarded to ``select_best_match``.
    :return: The result of ``select_best_match`` -- per the original
        documentation, (selected_candidate, match_confidences,
        contributor_titles) for the selected ContributorData.
    """
    author_name = sort_name or display_name

    # From OCLC tech support: VIAF's SRU endpoint can only return a
    # maximum of 10 records when the recordSchema is
    # http://viaf.org/VIAFCluster. Larger values are ignored.
    maximum_records = 10

    # Neither the search scope nor the encoded search term changes from
    # page to page, so work them out once instead of on every request.
    if is_corporate_name(author_name):
        scope = 'local.corporateNames'
    else:
        scope = 'local.personalNames'
    encoded_name = author_name.encode("utf8")

    contributor_candidates = []

    # Limit ourselves to reading the first 500 VIAF clusters (50 pages of
    # 10), on the assumption that search match quality is unlikely to be
    # usable after that.
    for page in range(1, 51):
        start_record = 1 + maximum_records * (page - 1)
        url = self.SEARCH_URL.format(
            scope=scope,
            author_name=encoded_name,
            maximum_records=maximum_records,
            start_record=start_record
        )
        representation, _cached = Representation.get(
            self._db, url, do_get=do_get,
            max_age=self.REPRESENTATION_MAX_AGE
        )
        candidates = self.parser.parse_multiple(
            representation.content, sort_name, display_name, page
        )
        if not any(candidates):
            # Delete the representation so the empty page is not cached.
            self._db.query(Representation).filter(
                Representation.id == representation.id
            ).delete()
            # We ran out of clusters, so we can stop paging and move on
            # to ordering the results gathered so far.
            break
        contributor_candidates.extend(candidates)

    return self.select_best_match(
        candidates=contributor_candidates,
        working_sort_name=author_name,
        known_titles=known_titles
    )
def primary_author_name(self, author_name):
    """Reduce an 'author' string that may name several people to the first.

    For example, "Bill O'Reilly with Martin Dugard" is meant to come back
    as "Bill O'Reilly". Empty or missing input yields None, and any name
    that ``is_corporate_name`` accepts is returned untouched.

    TODO: There is no sensible answer yet for an author such as
    "Ryan and Josh Shook".
    """
    if not author_name:
        return None
    if is_corporate_name(author_name):
        return author_name

    # Cut the string at the first occurrence of each joiner in turn.
    # Splitting on a joiner that is absent leaves the text unchanged, so
    # no membership test is needed beforehand.
    primary = author_name
    for joiner in (" with ", " and ", ", "):
        primary = primary.split(joiner)[0]
    return primary
def primary_author_name(self, author_name):
    """Pick the first person out of a possibly multi-person author string.

    The intent is to turn something like "Bill O'Reilly with Martin
    Dugard" into "Bill O'Reilly". A falsy name gives None; a corporate
    name (as judged by ``is_corporate_name``) is handed back as-is.

    TODO: A name like "Ryan and Josh Shook" is not handled sensibly.
    """
    if not author_name:
        return None

    if not is_corporate_name(author_name):
        # Keep only the text before the first ' with ', then before the
        # first ' and ', then before the first ', '. partition() returns
        # the whole string as its head when the separator is missing.
        for separator in (' with ', ' and ', ', '):
            author_name = author_name.partition(separator)[0]

    return author_name
def primary_author_name(self, author_name):
    """Extract the first person from an 'author' string naming several.

    The goal is to get e.g. "Bill O'Reilly" out of
    "Bill O'Reilly with Martin Dugard". A falsy name gives None, and a
    name that ``is_corporate_name`` accepts is returned unchanged.

    TODO: Cases we can't handle:
        van Damme, Jean Claude
        Madonna, Cher
        Ryan and Josh Shook
    """
    if not author_name:
        return None
    if is_corporate_name(author_name):
        return author_name

    # Discard any co-author introduced by ' with ' or ' and '. Splitting
    # on an absent joiner returns the text unchanged.
    for joiner in (' with ', ' and '):
        author_name = author_name.split(joiner)[0]

    pieces = author_name.split(", ")

    # Exactly two comma-separated pieces, at least one of which contains
    # no space, most likely means a single person's sort name
    # (e.g. "Tolkien, J. R. R.") rather than two display names. The best
    # we can do then is leave the name as it is.
    looks_like_sort_name = (
        len(pieces) == 2 and not all(' ' in piece for piece in pieces)
    )
    if not looks_like_sort_name:
        # Either there was no comma, or the comma really does seem to
        # separate several people's names. Keep the first one.
        author_name = pieces[0]

    # A sort name may legitimately end in a period, but a trailing comma
    # is just leftover punctuation; drop a single one.
    if author_name.endswith(','):
        author_name = author_name[:-1]
    return author_name
def lookup_by_name(self, sort_name, display_name=None, do_get=None, best_match=False):
    """Search VIAF for clusters matching a name and select the best one.

    Pages through the VIAF search results ten records at a time, collects
    every candidate the parser returns, and passes the combined list to
    ``select_best_match``.

    :param sort_name: Name to search for. When it is empty,
        ``display_name`` is used in its place, both as the search term
        and as the sort name given to the parser and the selector.
    :param display_name: Alternative form of the name; also passed to
        the parser.
    :param do_get: Passed through unchanged to ``Representation.get``.
    :param best_match: Not read anywhere in this method; kept so that
        existing callers passing it continue to work.
    :return: Whatever ``select_best_match`` returns for the collected
        candidates and the working sort name.
    """
    sort_name = sort_name or display_name

    # From OCLC tech support: VIAF's SRU endpoint can only return a
    # maximum of 10 records when the recordSchema is
    # http://viaf.org/VIAFCluster. Larger values are ignored.
    maximum_records = 10

    # The search scope and the encoded search term are the same for
    # every page, so compute them once up front.
    if is_corporate_name(sort_name):
        scope = 'local.corporateNames'
    else:
        scope = 'local.personalNames'
    encoded_name = sort_name.encode("utf8")

    contributor_candidates = []

    # Limit ourselves to reading the first 500 VIAF clusters (50 pages of
    # 10), on the assumption that search match quality is unlikely to be
    # usable after that.
    for page in range(1, 51):
        start_record = 1 + maximum_records * (page - 1)
        url = self.SEARCH_URL.format(
            scope=scope,
            sort_name=encoded_name,
            maximum_records=maximum_records,
            start_record=start_record
        )
        representation, _cached = Representation.get(
            self._db, url, do_get=do_get,
            max_age=self.REPRESENTATION_MAX_AGE
        )
        candidates = self.parser.parse_multiple(
            representation.content, sort_name, display_name, page
        )
        if not any(candidates):
            # Delete the representation so the empty page is not cached.
            self._db.query(Representation).filter(
                Representation.id == representation.id
            ).delete()
            # We ran out of clusters, so we can stop paging and move on
            # to ordering the results gathered so far.
            break
        contributor_candidates.extend(candidates)

    return self.select_best_match(contributor_candidates, sort_name)
def lookup_by_name(self, sort_name, display_name=None, do_get=None, known_titles=None):
    """Ask VIAF for author clusters matching the given name.

    Fetches the VIAF search results page by page, gathers the candidates
    parsed from each page, and selects the cluster deemed the best match
    for the author we mean via ``select_best_match``.

    :param sort_name: Author name in Last, First format.
    :param display_name: Author name in First Last format. It is used as
        the search term only when ``sort_name`` is empty.
    :param do_get: Passed through unchanged to ``Representation.get``.
    :param known_titles: A list of titles we know this author wrote;
        forwarded to ``select_best_match``.
    :return: The result of ``select_best_match`` -- per the original
        documentation, (selected_candidate, match_confidences,
        contributor_titles) for the selected ContributorData.
    """
    author_name = sort_name or display_name

    # From OCLC tech support: VIAF's SRU endpoint can only return a
    # maximum of 10 records when the recordSchema is
    # http://viaf.org/VIAFCluster. Larger values are ignored.
    maximum_records = 10

    # Scope and encoded search term do not depend on the page number, so
    # they are computed once rather than inside the paging loop.
    if is_corporate_name(author_name):
        scope = 'local.corporateNames'
    else:
        scope = 'local.personalNames'
    encoded_name = author_name.encode("utf8")

    contributor_candidates = []

    # Limit ourselves to reading the first 500 VIAF clusters (50 pages of
    # 10), on the assumption that search match quality is unlikely to be
    # usable after that.
    for page in range(1, 51):
        start_record = 1 + maximum_records * (page - 1)
        url = self.SEARCH_URL.format(
            scope=scope,
            author_name=encoded_name,
            maximum_records=maximum_records,
            start_record=start_record
        )
        representation, _cached = Representation.get(
            self._db, url, do_get=do_get,
            max_age=self.REPRESENTATION_MAX_AGE
        )
        candidates = self.parser.parse_multiple(
            representation.content, sort_name, display_name, page
        )
        if not any(candidates):
            # Delete the representation so the empty page is not cached.
            self._db.query(Representation).filter(
                Representation.id == representation.id
            ).delete()
            # We ran out of clusters, so we can stop paging and move on
            # to ordering the results gathered so far.
            break
        contributor_candidates.extend(candidates)

    return self.select_best_match(
        candidates=contributor_candidates,
        working_sort_name=author_name,
        known_titles=known_titles
    )