예제 #1
0
def names_are_substrings(name1, name2):
    '''
    Tells whether the first names of one name are a prefix of the first
    names of the other; e.g. "Christoph" vs. "Ch".
    Only the beginning of the names is compared.

    Each argument may be a raw name string (it is then passed through
    split_name_parts) or an already split name list. Element 2 of the
    split name is joined into one string, lower-cased and cleaned before
    the prefix test.

    @param name1: Name string of the first name (w/ last name)
    @type name1: string
    @param name2: Name string of the second name (w/ last name)
    @type name2: string

    @return: True if one cleaned first-name string starts with the other
    @rtype: boolean
    '''
    parts1 = name1 if isinstance(name1, list) else split_name_parts(name1)
    parts2 = name2 if isinstance(name2, list) else split_name_parts(name2)

    # Glue the first names together, then normalise case and characters.
    first = clean_name_string("".join(parts1[2]).lower(), "", False, True)
    second = clean_name_string("".join(parts2[2]).lower(), "", False, True)

    return first.startswith(second) or second.startswith(first)
예제 #2
0
def names_are_equal_composites(name1, name2):
    '''
    Checks if names are equal composites; e.g. "guangsheng" vs. "guang sheng"

    Every variation returned by create_name_tuples for the first names of
    name1 is lower-cased, cleaned and looked up among the lower-cased,
    cleaned variations of name2. A single equal pair is sufficient.

    Each argument may be a raw name string (it is then passed through
    split_name_parts) or an already split name list.

    @param name1: Name string of the first name (w/ last name)
    @type name1: string
    @param name2: Name string of the second name (w/ last name)
    @type name2: string

    @return: Are the names equal composites?
    @rtype: boolean
    '''
    if not isinstance(name1, list):
        name1 = split_name_parts(name1)

    if not isinstance(name2, list):
        name2 = split_name_parts(name2)

    oname_variations = create_name_tuples(name1[2])
    tname_variations = create_name_tuples(name2[2])

    # Clean every target variation exactly once instead of once per
    # (origin, target) pair.
    cleaned_tnames = set(
        clean_name_string(tname_variation.lower(), "", False, True)
        for tname_variation in tname_variations)

    for oname_variation in oname_variations:
        oname = clean_name_string(oname_variation.lower(), "", False, True)

        # Stop at the first hit; nothing found later can change the answer.
        if oname in cleaned_tnames:
            return True

    return False
def names_are_substrings(name1, name2):
    '''
    Tells whether the first names of one name are a prefix of the first
    names of the other; e.g. "Christoph" vs. "Ch".
    Only the beginning of the names is compared.

    Each argument may be a raw name string (it is then passed through
    split_name_parts) or an already split name list. Element 2 of the
    split name is joined into one string, lower-cased and cleaned before
    the prefix test.

    @param name1: Name string of the first name (w/ last name)
    @type name1: string
    @param name2: Name string of the second name (w/ last name)
    @type name2: string

    @return: True if one cleaned first-name string starts with the other
    @rtype: boolean
    '''
    parts1 = name1 if isinstance(name1, list) else split_name_parts(name1)
    parts2 = name2 if isinstance(name2, list) else split_name_parts(name2)

    # Glue the first names together, then normalise case and characters.
    first = clean_name_string("".join(parts1[2]).lower(), "", False, True)
    second = clean_name_string("".join(parts2[2]).lower(), "", False, True)

    return first.startswith(second) or second.startswith(first)
예제 #4
0
def soft_compare_names(origin_name, target_name):
    '''
    Soft comparison of names, to use in search engines and similar.
    Base results:
    If surname is equal in [0.6,1.0]
    If surname similar in [0.4,0.8]
    If surname differs in [0.0,0.4]
    all depending on average compatibility of names and initials.

    Both names are lower-cased and split with split_name_parts. The
    surname part is 0.6 for equal cleaned surnames, 0.4 when the
    Jaro-Winkler similarity is at least .95 and both surnames are longer
    than four characters, and 0.0 otherwise. When both names have
    initials, (matching initials + matching first names) * 0.4 divided by
    (max number of initials + max number of first names) is added.

    @param origin_name: the first name string
    @param target_name: the second name string

    @return: the soft comparison score
    @rtype: float
    '''
    # Prefer the Levenshtein package; fall back to the local implementation.
    try:
        from Levenshtein import jaro_winkler as similarity
    except ImportError:
        similarity = jaro_winkler_str_similarity

    origin = split_name_parts(deepcopy(origin_name).lower())
    target = split_name_parts(deepcopy(target_name).lower())

    origin[0] = clean_name_string(origin[0], replacement="",
                                  keep_whitespace=False)
    target[0] = clean_name_string(target[0], replacement="",
                                  keep_whitespace=False)
    origin_surname, target_surname = origin[0], target[0]

    if origin_surname == target_surname:
        score = 0.6
    elif (similarity(origin_surname.lower(), target_surname.lower()) >= .95
          and min(len(origin_surname), len(target_surname)) > 4):
        score = 0.4
    else:
        score = 0.0

    origin_initials, target_initials = origin[1], target[1]

    if origin_initials and target_initials:
        origin_names, target_names = origin[2], target[2]
        shared_initials = sum(1 for initial in origin_initials
                              if initial in target_initials)
        shared_names = 0

        if min(len(origin_names), len(target_names)) >= 1:
            cleaned_targets = [
                clean_name_string(entry, replacement="",
                                  keep_whitespace=False)
                for entry in target_names
            ]
            for entry in origin_names:
                cleaned = clean_name_string(entry, replacement="",
                                            keep_whitespace=False)
                if cleaned in cleaned_targets:
                    shared_names += 1

        most_names = max(len(origin_names), len(target_names))
        most_initials = max(len(origin_initials), len(target_initials))
        score += ((shared_initials + shared_names) * 0.4 /
                  (most_names + most_initials))

    return score
def names_are_equal_composites(name1, name2):
    '''
    Checks if names are equal composites; e.g. "guangsheng" vs. "guang sheng"

    Every variation returned by create_name_tuples for the first names of
    name1 is lower-cased, cleaned and looked up among the lower-cased,
    cleaned variations of name2. A single equal pair is sufficient.

    Each argument may be a raw name string (it is then passed through
    split_name_parts) or an already split name list.

    @param name1: Name string of the first name (w/ last name)
    @type name1: string
    @param name2: Name string of the second name (w/ last name)
    @type name2: string

    @return: Are the names equal composites?
    @rtype: boolean
    '''
    if not isinstance(name1, list):
        name1 = split_name_parts(name1)

    if not isinstance(name2, list):
        name2 = split_name_parts(name2)

    oname_variations = create_name_tuples(name1[2])
    tname_variations = create_name_tuples(name2[2])

    # Clean every target variation exactly once instead of once per
    # (origin, target) pair.
    cleaned_tnames = set(
        clean_name_string(tname_variation.lower(), "", False, True)
        for tname_variation in tname_variations)

    for oname_variation in oname_variations:
        oname = clean_name_string(oname_variation.lower(), "", False, True)

        # Stop at the first hit; nothing found later can change the answer.
        if oname in cleaned_tnames:
            return True

    return False
예제 #6
0
    def test_clean_name_string(self):
        """bibauthorid - test cleaning of name strings"""
        # (expected result, raw input) pairs: an already clean name, a name
        # containing special characters, and the empty string.
        cases = (
            ('this is a full name', 'this is a full name'),
            ('this is a full ,. pz', 'this is a full ;,.$&[{{}}(=*)+]pz'),
            ('', ''),
        )

        for expected, raw in cases:
            self.assertEqual(expected, baidu.clean_name_string(raw))
def soft_compare_names(origin_name, target_name):
    '''
    Soft comparison of names, to use in search engines and similar.
    Base results:
    If surname is equal in [0.6,1.0]
    If surname similar in [0.4,0.8]
    If surname differs in [0.0,0.4]
    all depending on average compatibility of names and initials.

    Both names are lower-cased and split with split_name_parts. The
    surname part is 0.6 for equal cleaned surnames, 0.4 when the
    Jaro-Winkler similarity is at least .95 and both surnames are longer
    than four characters, and 0.0 otherwise. When both names have
    initials, (matching initials + matching first names) * 0.4 divided by
    (max number of initials + max number of first names) is added.

    @param origin_name: the first name string
    @param target_name: the second name string

    @return: the soft comparison score
    @rtype: float
    '''
    # Prefer the Levenshtein package; fall back to the local implementation.
    try:
        from Levenshtein import jaro_winkler as similarity
    except ImportError:
        similarity = jaro_winkler_str_similarity

    origin = split_name_parts(deepcopy(origin_name).lower())
    target = split_name_parts(deepcopy(target_name).lower())

    origin[0] = clean_name_string(origin[0], replacement="",
                                  keep_whitespace=False)
    target[0] = clean_name_string(target[0], replacement="",
                                  keep_whitespace=False)
    origin_surname, target_surname = origin[0], target[0]

    if origin_surname == target_surname:
        score = 0.6
    elif (similarity(origin_surname.lower(), target_surname.lower()) >= .95
          and min(len(origin_surname), len(target_surname)) > 4):
        score = 0.4
    else:
        score = 0.0

    origin_initials, target_initials = origin[1], target[1]

    if origin_initials and target_initials:
        origin_names, target_names = origin[2], target[2]
        shared_initials = sum(1 for initial in origin_initials
                              if initial in target_initials)
        shared_names = 0

        if min(len(origin_names), len(target_names)) >= 1:
            cleaned_targets = [
                clean_name_string(entry, replacement="",
                                  keep_whitespace=False)
                for entry in target_names
            ]
            for entry in origin_names:
                cleaned = clean_name_string(entry, replacement="",
                                            keep_whitespace=False)
                if cleaned in cleaned_targets:
                    shared_names += 1

        most_names = max(len(origin_names), len(target_names))
        most_initials = max(len(origin_initials), len(target_initials))
        score += ((shared_initials + shared_names) * 0.4 /
                  (most_names + most_initials))

    return score
예제 #8
0
def names_are_equal_gender(name1, name2, gendernames):
    '''
    Checks on gender equality of two names based on a word list

    Only the first entry of each name's first-name list is looked up, after
    lower-casing and cleaning, in gendernames['boys'] and
    gendernames['girls']. The names count as gender-equal unless both
    genders could be determined and they differ. A name without any first
    name cannot be looked up and is treated like an unknown gender.

    Each name may be a raw name string (it is then passed through
    split_name_parts) or an already split name list.

    @param name1: Name string of the first name (w/ last name)
    @type name1: string
    @param name2: Name string of the second name (w/ last name)
    @type name2: string
    @param gendernames: dictionary of male/female names; the keys 'boys'
        and 'girls' are read
    @type gendernames: dict

    @return: Are names gender-equal?
    @rtype: boolean
    '''
    if not isinstance(name1, list):
        name1 = split_name_parts(name1)

    if not isinstance(name2, list):
        name2 = split_name_parts(name2)

    # Without a first name there is nothing to look up; indexing [0] on an
    # empty first-name list would raise IndexError.
    if not name1[2] or not name2[2]:
        return True

    print_debug = False
    ogender = None
    tgender = None
    oname = clean_name_string(name1[2][0].lower(), "", False, True)
    tname = clean_name_string(name2[2][0].lower(), "", False, True)

    if oname in gendernames['boys']:
        ogender = 'Male'
    elif oname in gendernames['girls']:
        ogender = 'Female'

    if tname in gendernames['boys']:
        tgender = 'Male'
    elif tname in gendernames['girls']:
        tgender = 'Female'

    if print_debug:
        # Single pre-formatted string: same output with the Python 2 print
        # statement and the Python 3 print function.
        print('     Gender check:  %s  is a  %s' % (oname, ogender))
        print('     Gender check:  %s  is a  %s' % (tname, tgender))

    if ogender and tgender and ogender != tgender:
        if print_debug:
            print('    Gender differs, force split!')

        return False

    return True
def names_are_equal_gender(name1, name2, gendernames):
    '''
    Checks on gender equality of two names based on a word list

    Only the first entry of each name's first-name list is looked up, after
    lower-casing and cleaning, in gendernames['boys'] and
    gendernames['girls']. The names count as gender-equal unless both
    genders could be determined and they differ. A name without any first
    name cannot be looked up and is treated like an unknown gender.

    Each name may be a raw name string (it is then passed through
    split_name_parts) or an already split name list.

    @param name1: Name string of the first name (w/ last name)
    @type name1: string
    @param name2: Name string of the second name (w/ last name)
    @type name2: string
    @param gendernames: dictionary of male/female names; the keys 'boys'
        and 'girls' are read
    @type gendernames: dict

    @return: Are names gender-equal?
    @rtype: boolean
    '''
    if not isinstance(name1, list):
        name1 = split_name_parts(name1)

    if not isinstance(name2, list):
        name2 = split_name_parts(name2)

    # Without a first name there is nothing to look up; indexing [0] on an
    # empty first-name list would raise IndexError.
    if not name1[2] or not name2[2]:
        return True

    print_debug = False
    ogender = None
    tgender = None
    oname = clean_name_string(name1[2][0].lower(), "", False, True)
    tname = clean_name_string(name2[2][0].lower(), "", False, True)

    if oname in gendernames['boys']:
        ogender = 'Male'
    elif oname in gendernames['girls']:
        ogender = 'Female'

    if tname in gendernames['boys']:
        tgender = 'Male'
    elif tname in gendernames['girls']:
        tgender = 'Female'

    if print_debug:
        # Single pre-formatted string: same output with the Python 2 print
        # statement and the Python 3 print function.
        print('     Gender check:  %s  is a  %s' % (oname, ogender))
        print('     Gender check:  %s  is a  %s' % (tname, tgender))

    if ogender and tgender and ogender != tgender:
        if print_debug:
            print('    Gender differs, force split!')

        return False

    return True
def update_virtualauthor_cluster(va_id):
    """
    Computes the clusterID for the virtualauthor.
    The clustering at this level is done by accumulating all the compatible
    surnames, but there is still space for a smarter choice.

    The cleaned 'orig_name_string' of the virtual author is handed to
    get_clusterids_from_name; the matching cluster ID it returns is stored
    on every row of dat.VIRTUALAUTHORS that belongs to va_id, and the
    virtual author's 'updated' record is set to the string 'True'.

    @note: The clustering number is used as a speed up tool, and it is meant to
        accumulate all the possible compatible virtualauthors so we do not have
        to search anywhere else. This means that a worst case cluster is no
        clustering at all (only one cluster for the whole virtualauthors set)
        and a best case cluster is a clusterID for the minimum set of
        virtualauthors representing a real author. It is important
        _NOT to lose_ virtualauthors! It is better to have a bigger cluster
        than losing possibly precious information later.

    @param va_id: ID of the virtual author the cluster shall be updated for
    @type va_id: int
    """
    name_records = get_virtualauthor_records(va_id, 'orig_name_string')
    cleaned_name = clean_name_string(name_records[0]['value'])

    # With the second argument True the result is
    # [all fitting cluster IDs, cluster ID matching the name].
    cluster_info = get_clusterids_from_name(cleaned_name, True)

    own_rows = [entry for entry in dat.VIRTUALAUTHORS
                if entry['virtualauthorid'] == va_id]

    for entry in own_rows:
        entry['clusterid'] = cluster_info[1]

    message = ("| Found %s cluster for %s. Now set to %s." %
               (len(cluster_info[0]), cleaned_name, cluster_info[1]))
    bconfig.LOGGER.debug(message)

    update_virtualauthor_record(va_id, 'updated', 'True')
예제 #11
0
def update_virtualauthor_cluster(va_id):
    """
    Computes the clusterID for the virtualauthor.
    The clustering at this level is done by accumulating all the compatible
    surnames, but there is still space for a smarter choice.

    The cleaned 'orig_name_string' of the virtual author is handed to
    get_clusterids_from_name; the matching cluster ID it returns is stored
    on every row of dat.VIRTUALAUTHORS that belongs to va_id, and the
    virtual author's 'updated' record is set to the string 'True'.

    @note: The clustering number is used as a speed up tool, and it is meant to
        accumulate all the possible compatible virtualauthors so we do not have
        to search anywhere else. This means that a worst case cluster is no
        clustering at all (only one cluster for the whole virtualauthors set)
        and a best case cluster is a clusterID for the minimum set of
        virtualauthors representing a real author. It is important
        _NOT to lose_ virtualauthors! It is better to have a bigger cluster
        than losing possibly precious information later.

    @param va_id: ID of the virtual author the cluster shall be updated for
    @type va_id: int
    """
    name_records = get_virtualauthor_records(va_id, 'orig_name_string')
    cleaned_name = clean_name_string(name_records[0]['value'])

    # With the second argument True the result is
    # [all fitting cluster IDs, cluster ID matching the name].
    cluster_info = get_clusterids_from_name(cleaned_name, True)

    own_rows = [entry for entry in dat.VIRTUALAUTHORS
                if entry['virtualauthorid'] == va_id]

    for entry in own_rows:
        entry['clusterid'] = cluster_info[1]

    message = ("| Found %s cluster for %s. Now set to %s." %
               (len(cluster_info[0]), cleaned_name, cluster_info[1]))
    bconfig.LOGGER.debug(message)

    update_virtualauthor_record(va_id, 'updated', 'True')
def names_are_synonymous(name1, name2, name_variations):
    '''
    Checks if two names are synonymous; e.g. "Robert" vs. "Bob"

    First names are compared position by position, up to the length of the
    shorter first-name list. A position counts as matched once some
    variation list contains both lower-cased, cleaned first names of that
    position; matches accumulate over the variation lists. The names are
    synonymous as soon as every compared position is matched.

    Each name may be a raw name string (it is then passed through
    split_name_parts) or an already split name list.

    @note: When one of the names has no first names there is no position to
        compare; the result is then True if name_variations holds at least
        one entry and False if it is empty.

    @param name1: Name string of the first name (w/ last name)
    @type name1: string
    @param name2: Name string of the second name (w/ last name)
    @type name2: string
    @param name_variations: name variations list
    @type name_variations: list of lists

    @return: are names synonymous
    @rtype: boolean
    '''
    if not isinstance(name1, list):
        name1 = split_name_parts(name1)

    if not isinstance(name2, list):
        name2 = split_name_parts(name2)

    print_debug = False

    # zip() stops at the shorter first-name list. The cleaned pairs do not
    # depend on the variation list, so they are computed only once.
    name_pairs = [(clean_name_string(oname.lower(), "", False, True),
                   clean_name_string(tname.lower(), "", False, True))
                  for oname, tname in zip(name1[2], name2[2])]
    matches = [False] * len(name_pairs)

    for nvar in name_variations:
        for i, (oname, tname) in enumerate(name_pairs):
            if oname in nvar and tname in nvar:
                if print_debug:
                    # Single pre-formatted string: same output with the
                    # Python 2 print statement and the Python 3 function.
                    print('       %s  and  %s  are synonyms! Not splitting!'
                          % (oname, tname))

                matches[i] = True

        if all(matches):
            return True

    return False
예제 #13
0
def names_are_synonymous(name1, name2, name_variations):
    '''
    Checks if two names are synonymous; e.g. "Robert" vs. "Bob"

    First names are compared position by position, up to the length of the
    shorter first-name list. A position counts as matched once some
    variation list contains both lower-cased, cleaned first names of that
    position; matches accumulate over the variation lists. The names are
    synonymous as soon as every compared position is matched.

    Each name may be a raw name string (it is then passed through
    split_name_parts) or an already split name list.

    @note: When one of the names has no first names there is no position to
        compare; the result is then True if name_variations holds at least
        one entry and False if it is empty.

    @param name1: Name string of the first name (w/ last name)
    @type name1: string
    @param name2: Name string of the second name (w/ last name)
    @type name2: string
    @param name_variations: name variations list
    @type name_variations: list of lists

    @return: are names synonymous
    @rtype: boolean
    '''
    if not isinstance(name1, list):
        name1 = split_name_parts(name1)

    if not isinstance(name2, list):
        name2 = split_name_parts(name2)

    print_debug = False

    # zip() stops at the shorter first-name list. The cleaned pairs do not
    # depend on the variation list, so they are computed only once.
    name_pairs = [(clean_name_string(oname.lower(), "", False, True),
                   clean_name_string(tname.lower(), "", False, True))
                  for oname, tname in zip(name1[2], name2[2])]
    matches = [False] * len(name_pairs)

    for nvar in name_variations:
        for i, (oname, tname) in enumerate(name_pairs):
            if oname in nvar and tname in nvar:
                if print_debug:
                    # Single pre-formatted string: same output with the
                    # Python 2 print statement and the Python 3 function.
                    print('       %s  and  %s  are synonyms! Not splitting!'
                          % (oname, tname))

                matches[i] = True

        if all(matches):
            return True

    return False
def names_minimum_levenshtein_distance(name1, name2):
    '''
    Determines the Levenshtein distance D between the first names of two
    names.

    For each name, all first names are joined into a single string, which
    is lower-cased and cleaned; the distance between the two resulting
    strings is returned.

    Each name may be a raw name string (it is then passed through
    split_name_parts) or an already split name list.

    NOTE(review): an earlier description said the comparison is based on
    the minimum number of first names and listed
    D("guang", "guang sheng") = 0. That does not match joining all first
    names as done here -- confirm the intended contract.

    @precondition: Names have been checked for composition equality.
    @param name1: Name string of the first name (w/ last name)
    @type name1: string
    @param name2: Name string of the second name (w/ last name)
    @type name2: string

    @return: the Levenshtein distance between the joined first names, or
        -1 if the Levenshtein module cannot be imported
    @rtype: int
    '''
    try:
        from Levenshtein import distance as edit_distance
    except ImportError:
        bconfig.LOGGER.exception("Levenshtein Module not available!")
        return -1

    split_names = []

    for raw_name in (name1, name2):
        if isinstance(raw_name, list):
            split_names.append(raw_name)
        else:
            split_names.append(split_name_parts(raw_name))

    # Element 2 of a split name holds the first names.
    joined = [clean_name_string("".join(parts[2]).lower(), "", False, True)
              for parts in split_names]

    return edit_distance(joined[0], joined[1])
예제 #15
0
def names_minimum_levenshtein_distance(name1, name2):
    '''
    Determines the Levenshtein distance D between the first names of two
    names.

    For each name, all first names are joined into a single string, which
    is lower-cased and cleaned; the distance between the two resulting
    strings is returned.

    Each name may be a raw name string (it is then passed through
    split_name_parts) or an already split name list.

    NOTE(review): an earlier description said the comparison is based on
    the minimum number of first names and listed
    D("guang", "guang sheng") = 0. That does not match joining all first
    names as done here -- confirm the intended contract.

    @precondition: Names have been checked for composition equality.
    @param name1: Name string of the first name (w/ last name)
    @type name1: string
    @param name2: Name string of the second name (w/ last name)
    @type name2: string

    @return: the Levenshtein distance between the joined first names, or
        -1 if the Levenshtein module cannot be imported
    @rtype: int
    '''
    try:
        from Levenshtein import distance as edit_distance
    except ImportError:
        bconfig.LOGGER.exception("Levenshtein Module not available!")
        return -1

    split_names = []

    for raw_name in (name1, name2):
        if isinstance(raw_name, list):
            split_names.append(raw_name)
        else:
            split_names.append(split_name_parts(raw_name))

    # Element 2 of a split name holds the first names.
    joined = [clean_name_string("".join(parts[2]).lower(), "", False, True)
              for parts in split_names]

    return edit_distance(joined[0], joined[1])
def get_clusterids_from_name(name, return_matching=False):
    '''
    Returns the cluster IDs which are fitting for the parameter 'name'.

    The name is unified, cleaned and cut to at most 150 characters. If the
    part after the first comma splits on '.' into three to five pieces,
    the lookup is repeated for every ordering of the pieces after the first
    one and the results are merged into a set. Otherwise a single lookup
    with the search string is done and its result is passed on as is.

    The lookups themselves are delegated to _get_clusterids_from_name.

    @param name: The name to be on the lookout for.
    @type name: string
    @param return_matching: also return the reference name's matching cluster
    @type return_matching: boolean

    @return:
        if return_matching: list of 1) cluster IDs 2) the cluster ID
            matching the name (from the last lookup performed)
        if not return_matching: the cluster IDs
    @rtype:
        if return_matching: list of (collection of int, int)
        if not return_matching: collection of int
    '''
    search_string = clean_name_string(create_unified_name(name))[:150]

    # The last character is ignored while looking for the initials.
    trimmed = search_string[:-1]
    initials = ""

    if "," in trimmed:
        after_comma = trimmed.replace(' ', '').split(',')[1]

        if after_comma:
            initials = after_comma.split('.')

    if not 2 < len(initials) <= 5:
        # Nothing to permute: one plain lookup.
        clusters = _get_clusterids_from_name(search_string, return_matching)

        if return_matching:
            return [clusters[0], clusters[1]]

        return clusters

    # The first initial stays in place; only the remaining ones are permuted.
    permutation_base = ("%s, %s." %
                        (search_string.split(',')[0], initials[0]))
    collected_ids = set()
    matching_cluster = -1

    for ordering in permutations(initials[1:]):
        name_string = "%s %s." % (permutation_base, ". ".join(ordering))
        clusters = _get_clusterids_from_name(name_string, return_matching)

        if return_matching:
            matching_cluster = clusters[1]
            collected_ids.update(clusters[0])
        else:
            collected_ids.update(clusters)

    if return_matching:
        return [collected_ids, matching_cluster]

    return collected_ids
def compare_names(origin_name, target_name):
    """
    Compute an index of confidence that would like to indicate whether two
    names might represent the same person. The computation is based on
    similarities of name structure, in particular:
        Initials:
            We assign a high score if all the initials matches are in the
            right order, much lower if they are in the wrong order
        Names:
            We assign a lower score for mismatching names and higher score for
            fully matching names
    If there is nothing to compare we are forced to assume a high score.

    Example for splitting names:
        In : bibauthorid.split_name_parts("Ellis, John R")
        Out: ['Ellis', ['J', 'R'], ['John']]

        Ellis, R. Keith        => [ [Ellis], [R, K], [Keith] ]
        Ellis, Richard Keith   => [ [Ellis], [R, K], [Richard, Keith] ]

    Since the initials are computed both from the real initials present in
    the name string and from the full names, if there is no initials match
    we are 100% confident that:
        1. we have no names/initials at all, or
        2. we have completely different names; hence if there is no initial
            match we skip this step.

    Surname handling, as implemented below: the result is 0.0 when the
    cleaned surnames differ and are either not similar enough
    (Jaro-Winkler below .95) or at most four characters long. Otherwise a
    penalty is subtracted from the matching score: 0.24 for similar but
    unequal surnames, 0.02 for surnames that are equal only after cleaning,
    and nothing for surnames that were equal from the start.

    When both names have first names and at least one of them has more than
    one, every pair of variations from create_name_tuples is matched and the
    best score is used.

    @param origin_name: The first author's name string
    @type origin_name: string
    @param target_name: The second author's name string
    @type target_name: string

    @return: a value that describes the likelihood of the names being the same
    @rtype: float
    """
    # Prefer the Levenshtein package; fall back to the local implementation.
    try:
        from Levenshtein import jaro_winkler
        jaro_fctn = jaro_winkler
    except ImportError:
        jaro_fctn = jaro_winkler_str_similarity

    oname = deepcopy(origin_name)
    tname = deepcopy(target_name)

    orig_name = split_name_parts(oname.lower())
    targ_name = split_name_parts(tname.lower())

    bconfig.LOGGER.info("|--> Comparing Names: \"%s\" and \"%s\"" %
                        (origin_name, target_name))

    lastname_modifier = 0.0

    if orig_name[0] != targ_name[0]:
        # last names are not equal before cleaning them. Assign entry penalty.
        lastname_modifier = 0.15

    orig_name[0] = clean_name_string(orig_name[0],
                                     replacement="",
                                     keep_whitespace=False)
    targ_name[0] = clean_name_string(targ_name[0],
                                     replacement="",
                                     keep_whitespace=False)

    if orig_name[0] != targ_name[0]:
        if ((jaro_fctn(orig_name[0].lower(), targ_name[0].lower()) < .95)
                or min(len(orig_name[0]), len(targ_name[0])) <= 4):
            # Logger.warn is a deprecated alias of Logger.warning.
            bconfig.LOGGER.warning(
                "Unequal lastnames(%s vs. %s).Skipping Comparison"
                % (orig_name[0], targ_name[0]))
            return 0.0

        bconfig.LOGGER.log(25, "Last names are not equal; "
                               "but similar enough to continue the comparison")
        # Let it go through...however, reduce the final result a little.
        lastname_modifier = 0.24
    elif lastname_modifier == 0.15:
        # last names are equal after cleaning them. Reduce penalty.
        lastname_modifier = 0.02

    if (orig_name[2] and targ_name[2]
            and (len(orig_name[2]) > 1 or len(targ_name[2]) > 1)):
        variation_ps = []
        oname_variations = create_name_tuples(orig_name[2])
        tname_variations = create_name_tuples(targ_name[2])

        for oname_variation in oname_variations:
            for tname_variation in tname_variations:
                oname_var = split_name_parts(
                    "%s, %s" % (orig_name[0], oname_variation))
                tname_var = split_name_parts(
                    "%s, %s" % (targ_name[0], tname_variation))
                variation_ps.append(_perform_matching(oname_var, tname_var))

        # max() of an empty list raises ValueError; if no variation pair
        # was produced, fall through to the plain comparison below.
        if variation_ps:
            return max(variation_ps) - lastname_modifier

    return _perform_matching(orig_name, targ_name) - lastname_modifier
예제 #18
0
def get_clusterids_from_name(name, return_matching=False):
    '''
    Returns the cluster IDs which are fitting for the parameter 'name'.

    The name is unified, cleaned and cut to at most 150 characters. If the
    part after the first comma splits on '.' into three to five pieces,
    the lookup is repeated for every ordering of the pieces after the first
    one and the results are merged into a set. Otherwise a single lookup
    with the search string is done and its result is passed on as is.

    The lookups themselves are delegated to _get_clusterids_from_name.

    @param name: The name to be on the lookout for.
    @type name: string
    @param return_matching: also return the reference name's matching cluster
    @type return_matching: boolean

    @return:
        if return_matching: list of 1) cluster IDs 2) the cluster ID
            matching the name (from the last lookup performed)
        if not return_matching: the cluster IDs
    @rtype:
        if return_matching: list of (collection of int, int)
        if not return_matching: collection of int
    '''
    search_string = clean_name_string(create_unified_name(name))[:150]

    # The last character is ignored while looking for the initials.
    trimmed = search_string[:-1]
    initials = ""

    if "," in trimmed:
        after_comma = trimmed.replace(' ', '').split(',')[1]

        if after_comma:
            initials = after_comma.split('.')

    if not 2 < len(initials) <= 5:
        # Nothing to permute: one plain lookup.
        clusters = _get_clusterids_from_name(search_string, return_matching)

        if return_matching:
            return [clusters[0], clusters[1]]

        return clusters

    # The first initial stays in place; only the remaining ones are permuted.
    permutation_base = ("%s, %s." %
                        (search_string.split(',')[0], initials[0]))
    collected_ids = set()
    matching_cluster = -1

    for ordering in permutations(initials[1:]):
        name_string = "%s %s." % (permutation_base, ". ".join(ordering))
        clusters = _get_clusterids_from_name(name_string, return_matching)

        if return_matching:
            matching_cluster = clusters[1]
            collected_ids.update(clusters[0])
        else:
            collected_ids.update(clusters)

    if return_matching:
        return [collected_ids, matching_cluster]

    return collected_ids
예제 #19
0
def compare_names(origin_name, target_name):
    """
    Compute an index of confidence that would like to indicate whether two
    names might represent the same person. The computation is based on
    similarities of name structure, in particular:
        Initials:
            We assign a high score if all the initials matches are in the
            right order, much lower if they are in the wrong order
        Names:
            We assign a lower score for mismatching names and higher score for
            fully matching names
    If there is nothing to compare we are forced to assume a high score.

    Example for splitting names:
        In : bibauthorid.split_name_parts("Ellis, John R")
        Out: ['Ellis', ['J', 'R'], ['John']]

        Ellis, R. Keith        => [ [Ellis], [R, K], [Keith] ]
        Ellis, Richard Keith   => [ [Ellis], [R, K], [Richard, Keith] ]

    Since the initials are computed both from the real initials present in
    the name string and from the full names, if there is no initials match
    we are 100% confident that:
        1. we have no names/initials at all, or
        2. we have completely different names; hence if there is no initial
            match we skip this step.

    Surname handling, as implemented below: the result is 0.0 when the
    cleaned surnames differ and are either not similar enough
    (Jaro-Winkler below .95) or at most four characters long. Otherwise a
    penalty is subtracted from the matching score: 0.24 for similar but
    unequal surnames, 0.02 for surnames that are equal only after cleaning,
    and nothing for surnames that were equal from the start.

    When both names have first names and at least one of them has more than
    one, every pair of variations from create_name_tuples is matched and the
    best score is used.

    @param origin_name: The first author's name string
    @type origin_name: string
    @param target_name: The second author's name string
    @type target_name: string

    @return: a value that describes the likelihood of the names being the same
    @rtype: float
    """
    # Prefer the Levenshtein package; fall back to the local implementation.
    try:
        from Levenshtein import jaro_winkler
        jaro_fctn = jaro_winkler
    except ImportError:
        jaro_fctn = jaro_winkler_str_similarity

    oname = deepcopy(origin_name)
    tname = deepcopy(target_name)

    orig_name = split_name_parts(oname.lower())
    targ_name = split_name_parts(tname.lower())

    bconfig.LOGGER.info("|--> Comparing Names: \"%s\" and \"%s\"" %
                        (origin_name, target_name))

    lastname_modifier = 0.0

    if orig_name[0] != targ_name[0]:
        # last names are not equal before cleaning them. Assign entry penalty.
        lastname_modifier = 0.15

    orig_name[0] = clean_name_string(orig_name[0],
                                     replacement="",
                                     keep_whitespace=False)
    targ_name[0] = clean_name_string(targ_name[0],
                                     replacement="",
                                     keep_whitespace=False)

    if orig_name[0] != targ_name[0]:
        if ((jaro_fctn(orig_name[0].lower(), targ_name[0].lower()) < .95)
                or min(len(orig_name[0]), len(targ_name[0])) <= 4):
            # Logger.warn is a deprecated alias of Logger.warning.
            bconfig.LOGGER.warning(
                "Unequal lastnames(%s vs. %s).Skipping Comparison"
                % (orig_name[0], targ_name[0]))
            return 0.0

        bconfig.LOGGER.log(25, "Last names are not equal; "
                               "but similar enough to continue the comparison")
        # Let it go through...however, reduce the final result a little.
        lastname_modifier = 0.24
    elif lastname_modifier == 0.15:
        # last names are equal after cleaning them. Reduce penalty.
        lastname_modifier = 0.02

    if (orig_name[2] and targ_name[2]
            and (len(orig_name[2]) > 1 or len(targ_name[2]) > 1)):
        variation_ps = []
        oname_variations = create_name_tuples(orig_name[2])
        tname_variations = create_name_tuples(targ_name[2])

        for oname_variation in oname_variations:
            for tname_variation in tname_variations:
                oname_var = split_name_parts(
                    "%s, %s" % (orig_name[0], oname_variation))
                tname_var = split_name_parts(
                    "%s, %s" % (targ_name[0], tname_variation))
                variation_ps.append(_perform_matching(oname_var, tname_var))

        # max() of an empty list raises ValueError; if no variation pair
        # was produced, fall through to the plain comparison below.
        if variation_ps:
            return max(variation_ps) - lastname_modifier

    return _perform_matching(orig_name, targ_name) - lastname_modifier