def test_mismatch(self):
    """Name pairs listed here are expected to have a similarity of zero."""
    mismatched_pairs = (
        # Same first name and last name, but conflicting second initials.
        (('Robin K.', 'Ryder'), ('Robin J.', 'Ryder')),
        # Same first name, last names are not equal.
        (('Claire', 'Mathieu'), ('Claire', 'Kenyon-Mathieu')),
        # Same last name, first names differ.
        (('Amanda P.', 'Brown'), ('Patrick', 'Brown')),
    )
    for left, right in mismatched_pairs:
        self.assertAlmostEqual(name_similarity(left, right), 0)
def test_symmetric(self):
    """name_similarity must return the same score whichever name comes first."""
    name_pairs = (
        (('Robin', 'Ryder'), ('Robin', 'Ryder')),
        (('Robin', 'Ryder'), ('R.', 'Ryder')),
        (('R.', 'Ryder'), ('R.', 'Ryder')),
        (('Robin J.', 'Ryder'), ('R.', 'Ryder')),
        (('Robin J.', 'Ryder'), ('R. J.', 'Ryder')),
        (('R. J.', 'Ryder'), ('J.', 'Ryder')),
        (('Robin', 'Ryder'), ('Robin J.', 'Ryder')),
        (('W. Timothy', 'Gowers'), ('Timothy', 'Gowers')),
        (('Robin K.', 'Ryder'), ('Robin J.', 'Ryder')),
        (('Claire', 'Mathieu'), ('Claire', 'Kenyon-Mathieu')),
    )
    for left, right in name_pairs:
        forward = name_similarity(left, right)
        backward = name_similarity(right, left)
        self.assertAlmostEqual(forward, backward)
def score(self, dataA, dataB):
    """
    Score two names against each other with name_similarity.

    :param dataA: a (first, last) pair
    :param dataB: a (first, last) pair
    :returns: whatever name_similarity returns for the two pairs
    """
    # TODO: this score function is far from optimal
    # refine it so that 'Claire Mathieu' and 'Claire Mathieu-Kenyon' gets
    # a decent score

    # The unpacked parts are not used below; the unpacking is kept because
    # it raises before name_similarity is called when an argument is not
    # a two-element pair.
    _first_a, _last_a = dataA
    _first_b, _last_b = dataB

    similarity = name_similarity(dataA, dataB)
    return similarity
def get_or_create_by_orcid(cls, orcid, profile=None, user=None):
    """
    Return the researcher registered under ``orcid``, creating it if needed.

    :param orcid: the ORCID id to look up
    :param profile: data handed to ``OrcidProfile(json=...)``; when it is
        None the profile is built with ``OrcidProfile(id=orcid)`` instead
    :param user: forwarded to ``Researcher.create_by_name``
    :returns: the existing or newly created :class:`Researcher`
    """
    researcher = None
    try:
        researcher = Researcher.objects.get(orcid=orcid)
    except Researcher.DoesNotExist:
        # No researcher with this ORCID yet: build one from the profile.
        if profile is None:
            orcid_profile = OrcidProfile(id=orcid)
        else:
            orcid_profile = OrcidProfile(json=profile)

        full_name = orcid_profile.name
        homepage = orcid_profile.homepage
        email = orcid_profile.email
        researcher = Researcher.create_by_name(
            full_name[0], full_name[1],
            orcid=orcid, user=user, homepage=homepage, email=email)

        # Ensure that extra info is added: fill in every field that is
        # still empty on the researcher and for which we have a value.
        needs_save = False
        extra_info = (('homepage', homepage), ('orcid', orcid), ('email', email))
        for field, value in extra_info:
            if researcher.__dict__[field] or not value:
                continue
            researcher.__dict__[field] = value
            needs_save = True
        if needs_save:
            researcher.save()

        for variant in orcid_profile.other_names:
            # NOTE(review): the variant is scored against itself, not
            # against the researcher's main name -- presumably intended
            # as a measure of how specific the variant is; confirm.
            confidence = name_similarity(variant, variant)
            variant_name = Name.lookup_name(variant)
            researcher.add_name_variant(variant_name, confidence)

    return researcher
def update_variants(self, reset=False):
    """
    Sets the variants of this name to the candidates returned by
    variants_queryset and which have a positive name similarity
    with the reference name.

    :param reset: when true, every existing name variant is deleted
        first (calling ``update_best_confidence()`` on its name), and
        matching candidates are added with ``force_update=True``.
        Otherwise, candidates whose id is already among the current
        name variants are skipped.

    .. todo:: This should rather rely on the name variants with confidence 1.0
    """
    existing_variants = self.namevariant_set.all()
    if reset:
        for nv in existing_variants:
            # Keep a handle on the name before the variant is deleted.
            name = nv.name
            nv.delete()
            name.update_best_confidence()
        known_name_ids = set()
    else:
        known_name_ids = {nv.name_id for nv in existing_variants}

    # The reference name does not change during the loop: build it once.
    reference = (self.name.first, self.name.last)
    for candidate in self.variants_queryset():
        sim = name_similarity((candidate.first, candidate.last), reference)
        if sim > 0 and candidate.id not in known_name_ids:
            self.add_name_variant(candidate, sim, force_update=reset)
def get_or_create_by_orcid(cls, orcid, profile=None, user=None):
    """
    Fetch the researcher holding ``orcid``, or create one from the profile.

    :param orcid: the ORCID id to look up
    :param profile: data handed to ``OrcidProfile(json=...)``; when it is
        None the profile is built with ``OrcidProfile(id=orcid)`` instead
    :param user: forwarded to ``Researcher.create_by_name``
    :returns: the existing or newly created :class:`Researcher`
    """
    researcher = None
    try:
        researcher = Researcher.objects.get(orcid=orcid)
    except Researcher.DoesNotExist:
        # Unknown ORCID: everything below builds the researcher from
        # the ORCID profile.
        if profile is None:
            orcid_profile = OrcidProfile(id=orcid)
        else:
            orcid_profile = OrcidProfile(json=profile)

        full_name = orcid_profile.name
        homepage = orcid_profile.homepage
        email = orcid_profile.email
        researcher = Researcher.create_by_name(
            full_name[0], full_name[1],
            orcid=orcid, user=user, homepage=homepage, email=email)

        # Ensure that extra info is added: only fields that are empty on
        # the researcher and for which the profile has a value are set.
        needs_save = False
        extra_info = (('homepage', homepage), ('orcid', orcid), ('email', email))
        for field, value in extra_info:
            if researcher.__dict__[field] or not value:
                continue
            researcher.__dict__[field] = value
            needs_save = True
        if needs_save:
            researcher.save()

        for variant in orcid_profile.other_names:
            # NOTE(review): the variant is scored against itself, not
            # against the researcher's main name -- presumably intended
            # as a measure of how specific the variant is; confirm.
            confidence = name_similarity(variant, variant)
            variant_name = Name.lookup_name(variant)
            researcher.add_name_variant(variant_name, confidence)

    return researcher
def update_variants(self, reset=False):
    """
    Sets the variants of this name to the candidates returned by
    variants_queryset and which have a positive name similarity
    with the reference name.

    :param reset: when true, every existing name variant is deleted
        first (calling ``update_best_confidence()`` on its name), and
        matching candidates are added with ``force_update=True``.
        Otherwise, candidates whose id is already among the current
        name variants are skipped.

    .. todo:: This should rather rely on the name variants with confidence 1.0
    """
    existing_variants = self.namevariant_set.all()
    if reset:
        for nv in existing_variants:
            # Keep a handle on the name before the variant is deleted.
            name = nv.name
            nv.delete()
            name.update_best_confidence()
        known_name_ids = set()
    else:
        known_name_ids = {nv.name_id for nv in existing_variants}

    # The reference name does not change during the loop: build it once.
    reference = (self.name.first, self.name.last)
    for candidate in self.variants_queryset():
        sim = name_similarity((candidate.first, candidate.last), reference)
        if sim > 0 and candidate.id not in known_name_ids:
            self.add_name_variant(candidate, sim, force_update=reset)
def update_variants(self):
    """
    Sets the variants of this name to the candidates returned by variants_queryset

    For every researcher returned by ``variants_queryset`` whose name has a
    positive similarity with this name, a :class:`NameVariant` linking the
    two is fetched or created (with that similarity as its confidence when
    created).

    ``best_confidence`` is only ever raised here: a candidate with a lower
    similarity than the current best leaves it untouched. This name is
    saved whenever the best confidence increases, and also when it has no
    primary key yet.
    """
    own_name = (self.first, self.last)
    for researcher in self.variants_queryset():
        sim = name_similarity(
            (researcher.name.first, researcher.name.last), own_name)
        if sim > 0:
            current_best = self.best_confidence
            improved = current_best is None or current_best < sim
            if improved:
                # Never overwrite a better score with a worse one: doing
                # so would let a later, middling score be saved as "best".
                self.best_confidence = sim
            if self.pk is None or improved:
                self.save()
            NameVariant.objects.get_or_create(
                name=self, researcher=researcher,
                defaults={'confidence': sim})
def score(self, dataA, dataB):
    """
    Sum name_similarity over every pair of names drawn from two lists.

    :param dataA: an iterable of (first, last) pairs, or None
    :param dataB: an iterable of (first, last) pairs, or None
    :returns: 0. if either argument is None, otherwise the sum of
        name_similarity(a, b) for each a in dataA and each b in dataB
    """
    if dataA is None or dataB is None:
        return 0.

    total = 0.
    for name_a in dataA:
        for name_b in dataB:
            # The unpacked parts are not used; the unpacking is kept
            # because it raises when an item is not a two-element pair.
            _first_a, _last_a = name_a
            _first_b, _last_b = name_b
            # Earlier versions scored with name_tools.match on the joined
            # names, and before that added 1. per match_names(a, b) hit.
            total += name_similarity(name_a, name_b)
    return total
def update_variants(self):
    """
    Sets the variants of this name to the candidates returned by variants_queryset

    For every researcher returned by ``variants_queryset`` whose name has a
    positive similarity with this name, a :class:`NameVariant` linking the
    two is fetched or created (with that similarity as its confidence when
    created).

    ``best_confidence`` is only ever raised here: a candidate with a lower
    similarity than the current best leaves it untouched. This name is
    saved whenever the best confidence increases, and also when it has no
    primary key yet.
    """
    own_name = (self.first, self.last)
    for researcher in self.variants_queryset():
        sim = name_similarity(
            (researcher.name.first, researcher.name.last), own_name)
        if sim > 0:
            current_best = self.best_confidence
            improved = current_best is None or current_best < sim
            if improved:
                # Never overwrite a better score with a worse one: doing
                # so would let a later, middling score be saved as "best".
                self.best_confidence = sim
            if self.pk is None or improved:
                self.save()
            NameVariant.objects.get_or_create(
                name=self, researcher=researcher,
                defaults={'confidence': sim})
def test_matching(self):
    """Check the similarity score of name pairs expected to match."""
    expected_scores = (
        (('Robin', 'Ryder'), ('Robin', 'Ryder'), 0.8),
        (('Robin', 'Ryder'), ('R.', 'Ryder'), 0.4),
        (('R.', 'Ryder'), ('R.', 'Ryder'), 0.4),
        (('Robin J.', 'Ryder'), ('R.', 'Ryder'), 0.3),
        (('Robin J.', 'Ryder'), ('R. J.', 'Ryder'), 0.8),
        (('R. J.', 'Ryder'), ('J.', 'Ryder'), 0.3),
        (('Robin', 'Ryder'), ('Robin J.', 'Ryder'), 0.7),
    )
    for left, right, expected in expected_scores:
        self.assertAlmostEqual(name_similarity(left, right), expected)
def get_or_create_by_orcid(cls, orcid, profile=None, user=None):
    """
    Creates (or returns an existing) researcher from its ORCID id.

    :param orcid: the ORCID id to look up; must not be None
    :param profile: data handed to ``OrcidProfile(json=...)`` if the
        profile has already been fetched; when it is None the profile
        is built with ``OrcidProfile(id=orcid)`` instead
    :param user: an user to associate with the profile (forwarded to
        ``Researcher.create_by_name``)
    :returns: the existing or newly created :class:`Researcher`
    :raises MetadataSourceException: if ``orcid`` is None
    """
    if orcid is None:
        raise MetadataSourceException('Invalid ORCID id')

    researcher = None
    try:
        researcher = Researcher.objects.get(orcid=orcid)
    except Researcher.DoesNotExist:
        # No researcher with this ORCID yet: build one from the profile.
        if profile is None:
            orcid_profile = OrcidProfile(id=orcid)
        else:
            orcid_profile = OrcidProfile(json=profile)

        full_name = orcid_profile.name
        homepage = orcid_profile.homepage
        email = orcid_profile.email
        researcher = Researcher.create_by_name(
            full_name[0], full_name[1],
            orcid=orcid, user=user, homepage=homepage, email=email)

        # Ensure that extra info is added: fill in every field that is
        # still empty on the researcher and for which we have a value.
        needs_save = False
        extra_info = (('homepage', homepage), ('orcid', orcid), ('email', email))
        for field, value in extra_info:
            if researcher.__dict__[field] or not value:
                continue
            researcher.__dict__[field] = value
            needs_save = True
        if needs_save:
            researcher.save()

        for variant in orcid_profile.other_names:
            # NOTE(review): the variant is scored against itself, not
            # against the researcher's main name -- presumably intended
            # as a measure of how specific the variant is; confirm.
            confidence = name_similarity(variant, variant)
            variant_name = Name.lookup_name(variant)
            researcher.add_name_variant(variant_name, confidence)

    return researcher
def get_or_create_by_orcid(cls, orcid, profile=None, user=None):
    """
    Creates (or returns an existing) researcher from its ORCID id.

    :param orcid: the ORCID id to look up; must not be None
    :param profile: data handed to ``OrcidProfile(json=...)`` if the
        profile has already been fetched; when it is None the profile
        is built with ``OrcidProfile(id=orcid)`` instead
    :param user: an user to associate with the profile (forwarded to
        ``Researcher.create_by_name``)
    :returns: the existing or newly created :class:`Researcher`
    :raises MetadataSourceException: if ``orcid`` is None
    """
    if orcid is None:
        raise MetadataSourceException('Invalid ORCID id')

    researcher = None
    try:
        researcher = Researcher.objects.get(orcid=orcid)
    except Researcher.DoesNotExist:
        # Unknown ORCID: everything below builds the researcher from
        # the ORCID profile.
        if profile is None:
            orcid_profile = OrcidProfile(id=orcid)
        else:
            orcid_profile = OrcidProfile(json=profile)

        full_name = orcid_profile.name
        homepage = orcid_profile.homepage
        email = orcid_profile.email
        researcher = Researcher.create_by_name(
            full_name[0], full_name[1],
            orcid=orcid, user=user, homepage=homepage, email=email)

        # Ensure that extra info is added: only fields that are empty on
        # the researcher and for which the profile has a value are set.
        needs_save = False
        extra_info = (('homepage', homepage), ('orcid', orcid), ('email', email))
        for field, value in extra_info:
            if researcher.__dict__[field] or not value:
                continue
            researcher.__dict__[field] = value
            needs_save = True
        if needs_save:
            researcher.save()

        for variant in orcid_profile.other_names:
            # NOTE(review): the variant is scored against itself, not
            # against the researcher's main name -- presumably intended
            # as a measure of how specific the variant is; confirm.
            confidence = name_similarity(variant, variant)
            variant_name = Name.lookup_name(variant)
            researcher.add_name_variant(variant_name, confidence)

    return researcher
def test_multiple(self):
    """A two-word first name and its partly abbreviated form score 1.0."""
    spelled_out = ('Juan Pablo', 'Corella')
    partly_abbreviated = ('J. Pablo', 'Corella')
    similarity = name_similarity(spelled_out, partly_abbreviated)
    self.assertAlmostEqual(similarity, 1.0)
def test_reverse(self):
    """A first name with an extra leading initial still scores 0.7."""
    with_leading_initial = ('W. Timothy', 'Gowers')
    without_initial = ('Timothy', 'Gowers')
    similarity = name_similarity(with_leading_initial, without_initial)
    self.assertAlmostEqual(similarity, 0.7)