def full_names_are_synonymous(name1, name2, name_variations):
    '''
    Checks if the first names of two full names are pairwise synonymous;
    e.g. "Robert" vs. "Bob"

    The first names (element 2 of the split name parts) are compared
    position by position, up to the length of the shorter list. A pair
    matches when the two lowercased, cleaned names are equal, or when both
    occur in the same entry of name_variations. Different pairs may be
    matched by different entries. Identical pairs match regardless of the
    content of name_variations, and names without any comparable pair are
    reported as synonymous.

    @param name1: Full Name string of the first name (w/ last name), or the
        already split name parts
    @type name1: string or list
    @param name2: Full Name string of the second name (w/ last name), or the
        already split name parts
    @type name2: string or list
    @param name_variations: name variations list
    @type name_variations: list of lists

    @return: are names synonymous
    @rtype: boolean
    '''
    if not isinstance(name1, list):
        name1 = split_name_parts(name1)

    if not isinstance(name2, list):
        name2 = split_name_parts(name2)

    # Normalise every positional pair exactly once; zip() stops at the
    # shorter of the two first-name lists.
    name_pairs = [(clean_name_string(oname.lower(), "", False, True),
                   clean_name_string(tname.lower(), "", False, True))
                  for oname, tname in zip(name1[2], name2[2])]

    # Identical names are synonymous by definition, so they must not depend
    # on name_variations containing at least one entry.
    unmatched = []
    for oname, tname in name_pairs:
        if oname == tname:
            name_comparison_print('      ', oname, ' and ', tname, ' are synonyms!')
        else:
            unmatched.append((oname, tname))

    # Only the pairs that are still open have to be looked up in the
    # variation lists; stop as soon as every pair has found a match.
    for nvar in name_variations:
        if not unmatched:
            break

        still_unmatched = []
        for oname, tname in unmatched:
            if oname in nvar and tname in nvar:
                name_comparison_print('      ', oname, ' and ', tname, ' are synonyms!')
            else:
                still_unmatched.append((oname, tname))
        unmatched = still_unmatched

    return not unmatched
def compare_names(origin_name, target_name, initials_penalty=False):
    '''
    Compare two names and compute a similarity score; higher values mean
    a closer match.

    Both names are split with split_name_parts(). Element 0 of the result is
    compared as the surname, element 1 as the list of initials and element 2
    as the list of full (non-initial) names. The score is seeded by the
    surname comparison and then adjusted, in this order, by:
      1. a proportional penalty for mismatching initials,
      2. a proportional penalty for the edit distance between full names,
      3. a boost for every pair of synonymous names,
      4. a boost for every pair of names that are substrings,
      5. a boost if the names are equal composites,
      6. a division by 3 if the genders differ,
      7. a reset to 0.0 if the surname distance exceeds
         MAX_ALLOWED_SURNAME_DISTANCE,
      8. a factor of 0.9 if at least one name consists of initials only
         (see initials_penalty).
    Every step is traced through name_comparison_print().

    @param origin_name: first name to compare, handed to split_name_parts
    @type origin_name: string
    @param target_name: second name to compare, handed to split_name_parts
    @type target_name: string
    @param initials_penalty: if True, the "initials only" factor of 0.9 is
        also applied when neither name has any full names; by default it is
        applied only when exactly one of the two names lacks full names
    @type initials_penalty: boolean

    @return: similarity score of the two names
    @rtype: float
    '''
    MAX_ALLOWED_SURNAME_DISTANCE = 2
    name_comparison_print("\nComparing: " , origin_name, ' ', target_name)
    # Module-level lookup tables; they are defined outside of this function.
    gendernames = GLOBAL_gendernames
    name_variations = GLOBAL_name_variations
    # no = split parts of the origin name, nt = split parts of the target name
    no = split_name_parts(origin_name, True, "", True)
    nt = split_name_parts(target_name, True, "", True)

    name_comparison_print("|- splitted no: ", no)
    name_comparison_print("|- splitted nt: ", nt)

    score = 0.0

    # --- Stage 0: surname ---------------------------------------------------
    surname_dist = distance(no[0], nt[0])
    name_comparison_print("|- surname distance: ", surname_dist)

    if surname_dist > 0:
        # Surnames that differ only in characters outside [a-zA-Z0-9]
        # (punctuation, spaces, ...) are treated as identical.
        artifact_removal = re.compile("[^a-zA-Z0-9]")
        fn1 = artifact_removal.sub("", no[0])
        fn2 = artifact_removal.sub("", nt[0])

        if fn1 == fn2:
            score = 1.0
        else:
            # NOTE(review): assuming distance() returns whole numbers, a
            # distance >= 1 makes 0.5 - dist / 2 <= 0, so this branch always
            # yields 0.0 with MAX_ALLOWED_SURNAME_DISTANCE = 2 -- confirm
            # that this is intended.
            score = max(0.0, 0.5 - (float(surname_dist) / float(MAX_ALLOWED_SURNAME_DISTANCE)))
    else:
        score = 1.0
    name_comparison_print('||- surname score: ', score)

    # initials_only: at least one of the two names has no full names.
    # only_initials_available: neither of the two names has full names
    # (equal lengths and a minimum of 0 means both lists are empty).
    initials_only = ((min(len(no[2]), len(nt[2]))) == 0)
    only_initials_available = False
    if len(no[2]) == len(nt[2]) and initials_only:
        only_initials_available = True

    name_comparison_print('|- initials only: ', initials_only)
    name_comparison_print('|- only initials available: ', only_initials_available)

    # NOTE(review): this flag is computed from the raw name strings and is
    # only used in the trace line below; the score uses composits_eq, which
    # is computed further down from the split parts.
    names_are_equal_composites = False
    if not initials_only:
        names_are_equal_composites = full_names_are_equal_composites(origin_name, target_name)
    name_comparison_print("|- equal composites: ", names_are_equal_composites)

    # --- Stage 1: initials --------------------------------------------------
    max_n_initials = max(len(no[1]), len(nt[1]))
    initials_intersection = set(no[1]).intersection(set(nt[1]))
    n_initials_intersection = len(initials_intersection)
    initials_union = set(no[1]).union(set(nt[1]))
    n_initials_union = len(initials_union)


    # Distance between the two initials lists, each joined into one string.
    initials_distance = distance("".join(no[1]), "".join(nt[1]))
    # initials_c: share of distinct initials common to both names
    # (intersection size / union size); 1 when neither name has initials.
    if n_initials_union > 0:
        initials_c = float(n_initials_intersection) / float(n_initials_union)
    else:
        initials_c = 1

    # alo is the longer list of initials, alt the other one; when both have
    # the same length alo is the target's list.
    if len(no[1]) > len(nt[1]):
        alo = no[1]
        alt = nt[1]
    else:
        alo = nt[1]
        alt = no[1]
    lo = len(alo)
    lt = len(alt)
    if max_n_initials > 0:
        # Position-weighted mismatch of the initials. The longer list is
        # walked backwards, so element i of the reversed list sits at
        # position lo - 1 - i; only positions that also exist in the shorter
        # list are compared. A mismatch costs i + 1, i.e. the earlier the
        # position, the higher the cost. The sum is normalised by
        # 1 + 2 + ... + max_n_initials.
        initials_screwup = sum([i + 1 for i, k in enumerate(reversed(alo))
                            if lo - 1 - i < lt and k != alt[lo - 1 - i] ]) / \
                            float(float(max_n_initials * (max_n_initials + 1)) / 2)
        # Normalise the distance by the number of initials of the longer list.
        initials_distance = initials_distance / max_n_initials
    else:
        initials_screwup = 0
        initials_distance = 0

    # Remove a fraction of the current score: 75% weight on the positional
    # mismatch, 10% on the set dissimilarity and 15% on the distance.
    score = score - (0.75 * initials_screwup + 0.10 * (1 - initials_c)\
            + 0.15 * initials_distance) * (score)
    name_comparison_print("|- initials sets: ", no[1], " ", nt[1])
    name_comparison_print("|- initials distance: ", initials_distance)
    name_comparison_print("|- initials c: ", initials_c)
    name_comparison_print("|- initials screwup: ", initials_screwup)
    name_comparison_print("||- initials score: ", score)

    # Boolean relations between the two names, evaluated on the split parts
    # by helpers defined outside of this function. The gender check is
    # skipped (treated as equal) unless both names have full names.
    composits_eq = full_names_are_equal_composites(no, nt)
    if len(no[2]) > 0 and len(nt[2]) > 0:
        gender_eq = full_names_are_equal_gender(no, nt, gendernames)
    else:
        gender_eq = True
    vars_eq = full_names_are_synonymous(no, nt, name_variations)
    substr_eq = full_names_are_substrings(no, nt)

    # --- Stage 2: full names ------------------------------------------------
    if not initials_only:
        # nalo is the longer list of full names, nalt the other one; when
        # both have the same length nalo is the target's list.
        if len(no[2]) > len(nt[2]):
            nalo = no[2]
            nalt = nt[2]
        else:
            nalo = nt[2]
            nalt = no[2]
        nlo = len(nalo)
        nlt = len(nalt)
        # One (distance, length of the longer name) tuple for every position
        # that exists in both lists. The list cannot be empty here because
        # both names have at least one full name.
        names_screwup_list = [(distance(k, nalt[nlo - 1 - i]), max(len(k), len(nalt[nlo - 1 - i])))
                             for i, k in enumerate(reversed(nalo)) \
                             if nlo - 1 - i < nlt]
        # Worst and mean per-position distance, each relative to the length
        # of the longer name of the pair.
        max_names_screwup = max([float(i[0]) / i[1] for i in names_screwup_list])
        avg_names_screwup = sum([float(i[0]) / i[1] for i in names_screwup_list])\
                            / len(names_screwup_list)

    else:
        max_names_screwup = 0
        avg_names_screwup = 0

    # Remove a fraction of the current score: 75% weight on the worst pair,
    # 25% on the average.
    score = score - score * 0.75 * max_names_screwup - score * 0.25 * avg_names_screwup
    name_comparison_print("|- max names screwup: ", max_names_screwup)
    name_comparison_print("|- avg screwup: ", avg_names_screwup)
    name_comparison_print("||- names score: ", score)
    name_comparison_print("|- names composites: ", composits_eq)
    name_comparison_print("|- same gender: ", gender_eq)
    name_comparison_print("|- synonims: ", vars_eq)
    name_comparison_print("|- substrings: ", substr_eq)

    # --- Stage 3: synonym boost ---------------------------------------------
    if vars_eq:
        # Every (origin name, target name) combination that
        # names_are_synonymous() flags as synonymous.
        synmap = [[i, j, names_are_synonymous(i, j, name_variations)] for i in no[2] for j in nt[2]]
        synmap = [i for i in synmap if i[2] == True]
        name_comparison_print("|-- synmap: ", synmap)
        for i in synmap:
            # Close 50% of the remaining gap to 1 when both names sit at the
            # same position in their lists, 15% otherwise.
            # NOTE(review): list.index() returns the first occurrence, so a
            # name that appears more than once is always located at its
            # first position.
            if no[2].index(i[0]) == nt[2].index(i[1]):
                score = score + (1 - score) * 0.5
            else:
                score = score + (1 - score) * 0.15
    else:
        name_comparison_print("|-- synmap: empty")
    name_comparison_print("|-- synmap score: ", score)

    # --- Stage 4: substring boost -------------------------------------------
    if substr_eq and not initials_only:
        # Every (origin name, target name) combination that
        # names_are_substrings() flags.
        ssmap = [[i, j, names_are_substrings(i, j)] for i in no[2] for j in nt[2]]
        ssmap = [i for i in ssmap if i[2] == True]
        name_comparison_print("|-- substr map: ", ssmap)
        for i in ssmap:
            # Close 20% of the remaining gap to 1 for pairs at the same
            # position, 5% otherwise.
            if no[2].index(i[0]) == nt[2].index(i[1]):
                score = score + (1 - score) * 0.2
            else:
                score = score + (1 - score) * 0.05
    else:
        name_comparison_print("|-- substr map: empty")

    name_comparison_print("|-- substring score: ", score)

    # --- Stage 5: composite boost -------------------------------------------
    if composits_eq and not initials_only:
        name_comparison_print("|-- composite names")
        # Close 20% of the remaining gap to 1.
        score = score + (1 - score) * 0.2
    else:
        name_comparison_print("|-- not composite names")
    name_comparison_print("|-- composite score: ", score)

    # --- Stage 6: gender penalty --------------------------------------------
    if not gender_eq:
        score = score / 3.
        name_comparison_print("|-- apply gender penalty")
    else:
        name_comparison_print("|--   no  gender penalty")

    name_comparison_print("|-- gender score: ", score)

    # --- Stage 7: surname cut-off -------------------------------------------
    # The boosts above can raise a score that the surname stage set to 0.0;
    # surnames that are too far apart are forced back to 0.0 here.
    if surname_dist > MAX_ALLOWED_SURNAME_DISTANCE:
        score = 0.0
        name_comparison_print("|- surname trim: ", score)
    else:
        name_comparison_print("|- no surname trim: ", score)

    # --- Stage 8: initials-only penalty -------------------------------------
    # Applied when exactly one of the names lacks full names, or, if the
    # caller set initials_penalty, whenever at least one of them does.
    if initials_only and (not only_initials_available or initials_penalty):
        score = score * .9
        name_comparison_print("|- initials only penalty: ", score, initials_only, only_initials_available)
    else:
        name_comparison_print("|- no initials only penalty", initials_only, only_initials_available)

    name_comparison_print("||- final score:  ", score)


    return score
from bibauthorid_general_utils import name_comparison_print

try:
    from invenio.config import CFG_ETCDIR
    NO_CFG_ETCDIR = False
except ImportError:
    NO_CFG_ETCDIR = True

try:
    from editdist import distance
except ImportError:
    try:
        from Levenshtein import distance
    except ImportError:
        name_comparison_print("Levenshtein Module not available!")
        def distance(s1, s2):
            d = {}
            lenstr1 = len(s1)
            lenstr2 = len(s2)
            for i in xrange(-1, lenstr1 + 1):
                d[(i, -1)] = i + 1
            for j in xrange(-1, lenstr2 + 1):
                d[(-1, j)] = j + 1

            for i in xrange(0, lenstr1):
                for j in xrange(0, lenstr2):
                    if s1[i] == s2[j]:
                        cost = 0
                    else:
                        cost = 1