def test_mismatch(self):
    """Incompatible names are expected to get a similarity of zero."""
    # The middle initials disagree (K. versus J.).
    initials_score = shallower_name_similarity(
        ('Robin K.', 'Ryder'), ('Robin J.', 'Ryder'))
    self.assertAlmostEqual(initials_score, 0)

    # The last names are spelled differently.
    last_name_score = shallower_name_similarity(
        ('Robin', 'Ryder'), ('Robin', 'Rider'))
    self.assertAlmostEqual(last_name_score, 0)
def test_symmetric(self):
    """Swapping the two names is expected to leave the similarity unchanged."""
    name_pairs = (
        (('Robin', 'Ryder'), ('Robin', 'Ryder')),
        (('Robin', 'Ryder'), ('R.', 'Ryder')),
        (('R.', 'Ryder'), ('R.', 'Ryder')),
        (('Robin J.', 'Ryder'), ('R.', 'Ryder')),
        (('Robin J.', 'Ryder'), ('R. J.', 'Ryder')),
        (('R. J.', 'Ryder'), ('J.', 'Ryder')),
        (('Robin', 'Ryder'), ('Robin J.', 'Ryder')),
        (('W. Timothy', 'Gowers'), ('Timothy', 'Gowers')),
        (('Robin K.', 'Ryder'), ('Robin J.', 'Ryder')),
        (('Claire', 'Mathieu'), ('Claire', 'Kenyon-Mathieu')),
    )
    for left, right in name_pairs:
        forward = shallower_name_similarity(left, right)
        backward = shallower_name_similarity(right, left)
        self.assertAlmostEqual(forward, backward)
def test_symmetric(self):
    """The similarity of (a, b) is expected to equal that of (b, a)."""
    samples = (
        (('Robin', 'Ryder'), ('Robin', 'Ryder')),
        (('Robin', 'Ryder'), ('R.', 'Ryder')),
        (('R.', 'Ryder'), ('R.', 'Ryder')),
        (('Robin J.', 'Ryder'), ('R.', 'Ryder')),
        (('Robin J.', 'Ryder'), ('R. J.', 'Ryder')),
        (('R. J.', 'Ryder'), ('J.', 'Ryder')),
        (('Robin', 'Ryder'), ('Robin J.', 'Ryder')),
        (('W. Timothy', 'Gowers'), ('Timothy', 'Gowers')),
        (('Robin K.', 'Ryder'), ('Robin J.', 'Ryder')),
        (('Claire', 'Mathieu'), ('Claire', 'Kenyon-Mathieu')),
    )
    for name_a, name_b in samples:
        score_ab = shallower_name_similarity(name_a, name_b)
        score_ba = shallower_name_similarity(name_b, name_a)
        self.assertAlmostEqual(score_ab, score_ba)
def affiliate_author_with_orcid(ref_name, orcid, authors, initial_affiliations=None):
    """
    Given a reference name and an ORCiD for a researcher, find out which
    author in the list is the most likely to be that author.

    This function is run on author lists of papers listed in the ORCiD
    record, so we expect that one of the authors should be the same person
    as the ORCiD holder. It just finds the most similar name and puts the
    ORCiD at that position of the affiliations list.

    :param ref_name: the name of the ORCiD holder, compared against each
        author with shallower_name_similarity
    :param orcid: the value stored at the position of the best match
    :param authors: the list of author names of the paper
    :param initial_affiliations: optional existing affiliations; they are
        used as the starting point only when there is exactly one per
        author, otherwise the result starts as None everywhere
    :returns: a new list with one entry per author. The input list is
        never modified.
    """
    best_idx = None
    best_sim = 0.
    for idx, name in enumerate(authors):
        cur_similarity = shallower_name_similarity(name, ref_name)
        # Strict comparison: a similarity of 0 is never selected, and the
        # first author wins when several share the highest similarity.
        if cur_similarity > best_sim:
            best_idx = idx
            best_sim = cur_similarity

    if initial_affiliations and len(initial_affiliations) == len(authors):
        # Work on a copy so that the caller's list is not altered in place
        # (this also accepts immutable sequences such as tuples).
        affiliations = list(initial_affiliations)
    else:
        affiliations = [None] * len(authors)

    if best_idx is not None:
        affiliations[best_idx] = orcid
    return affiliations
def test_malformed(self):
    """Blank or malformed names are expected to compare equal to False."""
    malformed_cases = (
        ((' ', ' '), ('John', 'Doe')),
        (('Alfred', 'Kastler'), (' ', ' ')),
        ('', (None, '')),
    )
    for left, right in malformed_cases:
        score = shallower_name_similarity(left, right)
        self.assertEqual(score, False)
def test_matching(self):
    """Compatible names are expected to get a strictly positive similarity."""
    # Two identical full names are expected to score exactly 1.
    identical_score = shallower_name_similarity(
        ('Robin', 'Ryder'), ('Robin', 'Ryder'))
    self.assertAlmostEqual(identical_score, 1.0)

    # Pairs involving initials, extra middle names or an empty first name.
    compatible_pairs = (
        (('Robin', 'Ryder'), ('R.', 'Ryder')),
        (('R.', 'Ryder'), ('R.', 'Ryder')),
        (('Robin J.', 'Ryder'), ('R.', 'Ryder')),
        (('Robin J.', 'Ryder'), ('R. J.', 'Ryder')),
        (('Robin', 'Ryder'), ('Robin J.', 'Ryder')),
        (('Robin', 'Ryder'), ('', 'Ryder')),
    )
    for left, right in compatible_pairs:
        self.assertGreater(shallower_name_similarity(left, right), 0)
def affiliate_author_with_orcid(ref_name, orcid, authors, initial_affiliations=None):
    """
    Given a reference name and an ORCiD for a researcher, find out which
    author in the list is the most likely to be that author.

    This function is run on author lists of papers listed in the ORCiD
    record, so we expect that one of the authors should be the same person
    as the ORCiD holder. It just finds the most similar name and puts the
    ORCiD at that position of the affiliations list.

    :param ref_name: the name of the ORCiD holder, compared against each
        author with shallower_name_similarity
    :param orcid: the value stored at the position of the best match
    :param authors: the list of author names of the paper
    :param initial_affiliations: optional existing affiliations; they are
        used as the starting point only when there is exactly one per
        author, otherwise the result starts as None everywhere
    :returns: a new list with one entry per author. The input list is
        never modified.
    """
    top_idx = None
    top_similarity = 0.
    for position, author_name in enumerate(authors):
        similarity = shallower_name_similarity(author_name, ref_name)
        # Strict comparison: a similarity of 0 is never selected, and the
        # first author wins when several share the highest similarity.
        if similarity > top_similarity:
            top_idx = position
            top_similarity = similarity

    reuse_initial = (initial_affiliations and
                     len(initial_affiliations) == len(authors))
    if reuse_initial:
        # Work on a copy so that the caller's list is not altered in place
        # (this also accepts immutable sequences such as tuples).
        affiliations = list(initial_affiliations)
    else:
        affiliations = [None] * len(authors)

    if top_idx is not None:
        affiliations[top_idx] = orcid
    return affiliations
def test_multiple(self):
    """A compound first name and its partly abbreviated form are expected to score 1."""
    full_form = ('Juan Pablo', 'Corella')
    abbreviated_form = ('J. Pablo', 'Corella')
    score = shallower_name_similarity(full_form, abbreviated_form)
    self.assertAlmostEqual(score, 1.0)
def test_unicode(self):
    """Names containing non-ASCII characters are expected to score above zero."""
    escaped_form = ('Cl\u0102\u0160ment', 'Pit-Claudel')
    accented_form = ('Clément', 'Pit-Claudel')
    score = shallower_name_similarity(escaped_form, accented_form)
    self.assertGreater(score, 0)
def test_reverse(self):
    """A name with a leading extra initial is expected to match the name without it."""
    with_initial = ('W. Timothy', 'Gowers')
    without_initial = ('Timothy', 'Gowers')
    score = shallower_name_similarity(with_initial, without_initial)
    self.assertGreater(score, 0)
def test_hyphen(self):
    """A last name written with a space is expected to match its hyphenated form."""
    spaced_form = ('Clement F.', 'Pit Claudel')
    hyphenated_form = ('Clément', 'Pit-Claudel')
    score = shallower_name_similarity(spaced_form, hyphenated_form)
    self.assertGreater(score, 0)
def search_by_name(first, last, instance=settings.ORCID_BASE_DOMAIN):
    """
    Searches for ORCID profiles matching this (first, last) name.

    This is a generator: it yields one dict per compatible profile, with
    the keys 'first', 'last', 'orcid', 'homepage' and 'keywords'.
    Nothing is yielded when `last` is empty.

    :param first: the first name to search for
    :param last: the last name to search for
    :param instance: the ORCID domain to query; "https://pub." is
        prepended to it to build the URL of the public API

    Network errors and XML parsing errors are printed and end the
    iteration; they are not propagated to the caller.
    """
    # Validate arguments
    if not last:
        return

    # Perform query
    base_base_pub = "https://pub." + instance + "/"
    baseurl = base_base_pub + 'v1.2/search/orcid-bio/'
    dct = {
        'rows': 10,
        'start': 0,
        'q': 'family-name:%s given-names:%s' % (last, first),
    }
    url = baseurl + '?' + urlencode(dct)
    try:
        # NOTE(review): no timeout is passed here, so a stalled connection
        # can block the caller; consider adding one.
        r = requests.get(url)
        # the namespace is the same for both the production and the
        # sandbox versions.
        ns = {'ns': 'http://www.orcid.org/ns/orcid'}
        xml = etree.fromstring(r.text.encode('utf-8'))
        for elem in xml.xpath('//ns:orcid-search-result', namespaces=ns):
            candidateFirst = None
            candidateLast = None

            # Get name
            pers_details = elem.xpath('.//ns:personal-details', namespaces=ns)
            if not pers_details:
                continue
            for item in pers_details[0]:
                if item.tag.endswith('given-names'):
                    candidateFirst = item.text
                elif item.tag.endswith('family-name'):
                    candidateLast = item.text
            if not candidateFirst or not candidateLast:
                continue

            # Check that the names are compatible
            if shallower_name_similarity(
                    (first, last), (candidateFirst, candidateLast)) == 0:
                continue

            # Get ORCID iD
            orcid_elem = elem.xpath(
                './ns:orcid-profile/ns:orcid-identifier/ns:path',
                namespaces=ns)
            if not orcid_elem:
                continue
            orcid = orcid_elem[0].text

            # Add other things: only the first researcher URL is kept
            lst = elem.xpath(
                './ns:orcid-profile/ns:orcid-bio/ns:researcher-urls/ns:researcher-url/ns:url/text()',
                namespaces=ns)
            homepage = urlize(lst[0]) if lst else None

            keywords = elem.xpath(
                './ns:orcid-profile/ns:orcid-bio/ns:keywords/ns:keyword/text()',
                namespaces=ns)

            yield {
                'first': candidateFirst,
                'last': candidateLast,
                'orcid': orcid,
                'homepage': homepage,
                'keywords': keywords,
            }
    # print(e) with a single argument behaves the same on Python 2 and 3.
    except etree.XMLSyntaxError as e:
        print(e)
    except requests.exceptions.RequestException as e:
        print(e)
def search_by_name(first, last):
    """
    Searches for ORCID profiles matching this (first, last) name.

    This is a generator: it yields one dict per compatible profile, with
    the keys 'first', 'last', 'orcid', 'homepage' and 'keywords'.
    Nothing is yielded when `last` is empty.

    :param first: the first name to search for
    :param last: the last name to search for

    Network errors and XML parsing errors are printed and end the
    iteration; they are not propagated to the caller.
    """
    # Validate arguments
    if not last:
        return

    # Perform query
    baseurl = 'http://pub.orcid.org/v1.2/search/orcid-bio/'
    dct = {
        'rows':10,
        'start':0,
        'q':'family-name:%s given-names:%s' % (last,first),
    }
    url = baseurl+'?'+urlencode(dct)
    try:
        # NOTE(review): no timeout is passed here, so a stalled connection
        # can block the caller; consider adding one.
        r = requests.get(url)
        ns = {'ns':'http://www.orcid.org/ns/orcid' }
        xml = etree.fromstring(r.text.encode('utf-8'))
        for elem in xml.xpath('//ns:orcid-search-result', namespaces=ns):
            candidateFirst = None
            candidateLast = None

            # Get name
            pers_details = elem.xpath('.//ns:personal-details', namespaces=ns)
            if not pers_details:
                continue
            for item in pers_details[0]:
                if item.tag.endswith('given-names'):
                    candidateFirst = item.text
                elif item.tag.endswith('family-name'):
                    candidateLast = item.text
            if not candidateFirst or not candidateLast:
                continue

            # Check that the names are compatible
            if shallower_name_similarity((first,last),(candidateFirst,candidateLast)) == 0:
                continue

            # Get ORCID iD
            orcid_elem = elem.xpath('./ns:orcid-profile/ns:orcid-identifier/ns:path', namespaces=ns)
            if not orcid_elem:
                continue
            orcid = orcid_elem[0].text

            # Add other things: only the first researcher URL is kept
            lst = elem.xpath('./ns:orcid-profile/ns:orcid-bio/ns:researcher-urls/ns:researcher-url/ns:url/text()', namespaces=ns)
            homepage = urlize(lst[0]) if lst else None

            keywords = elem.xpath('./ns:orcid-profile/ns:orcid-bio/ns:keywords/ns:keyword/text()', namespaces=ns)

            yield {
                'first':candidateFirst,
                'last':candidateLast,
                'orcid':orcid,
                'homepage':homepage,
                'keywords':keywords,
            }
    # print(e) with a single argument behaves the same on Python 2 and 3.
    except etree.XMLSyntaxError as e:
        print(e)
    except requests.exceptions.RequestException as e:
        print(e)