Ejemplo n.º 1
0
def soft_compare_names(origin_name, target_name):
    '''
    Soft comparison of names, to use in search engine and similar.
    Base results:
    If surname is equal in [0.6,1.0]
    If surname similar in [0.4,0.8]
    If surname differs in [0.0,0.4]
    all depending on average compatibility of names and initials.
    '''
    # NOTE: Levenshtein.jaro_winkler was used here in the past; plain
    # `distance` is the currently active comparison function.
    jaro_fctn = distance

    score = 0.0
    # Work on asciified copies so the caller's strings are never mutated.
    oname = translate_to_ascii(deepcopy(origin_name))[0]
    tname = translate_to_ascii(deepcopy(target_name))[0]

    # split_name_parts -> [surname, initials, first names, ...]
    orig_parts = split_name_parts(oname.lower())
    targ_parts = split_name_parts(tname.lower())
    orig_parts[0] = clean_name_string(orig_parts[0],
                                      replacement="",
                                      keep_whitespace=False)
    targ_parts[0] = clean_name_string(targ_parts[0],
                                      replacement="",
                                      keep_whitespace=False)

    orig_surname = orig_parts[0].lower()
    targ_surname = targ_parts[0].lower()
    if orig_surname == targ_surname:
        score += 0.6
    elif (jaro_fctn(orig_surname, targ_surname) >= .95
          and min(len(orig_parts[0]), len(targ_parts[0])) > 4):
        # Similar (but not equal) surnames earn a reduced base score;
        # very short surnames are too noisy to trust a fuzzy match.
        score += 0.4

    if orig_parts[1] and targ_parts[1]:
        # Both sides carry initials: add up to 0.4 depending on how many
        # initials and first names agree.
        max_initials = max(len(orig_parts[1]), len(targ_parts[1]))
        matching_i = sum(1 for initial in orig_parts[1]
                         if initial in targ_parts[1])

        max_names = max(len(orig_parts[2]), len(targ_parts[2]))
        matching_n = 0
        if orig_parts[2] and targ_parts[2]:
            cleaned_targ = [clean_name_string(fn, replacement="", keep_whitespace=False)
                            for fn in targ_parts[2]]
            matching_n = sum(
                1 for fn in orig_parts[2]
                if clean_name_string(fn, replacement="", keep_whitespace=False) in cleaned_targ)

        score += (matching_i + matching_n) * 0.4 / (max_names + max_initials)
    return score
Ejemplo n.º 2
0
def soft_compare_names(origin_name, target_name):
    '''
    Soft comparison of names, to use in search engine and similar.
    Base results:
    If surname is equal in [0.6,1.0]
    If surname similar in [0.4,0.8]
    If surname differs in [0.0,0.4]
    all depending on average compatibility of names and initials.
    '''
    # `distance` is the active string-comparison function
    # (Levenshtein.jaro_winkler was used historically).
    jaro_fctn = distance

    total = 0.0
    # Copy before asciifying so the inputs are left untouched.
    source = translate_to_ascii(deepcopy(origin_name))[0]
    dest = translate_to_ascii(deepcopy(target_name))[0]

    # split_name_parts -> [surname, initials, first names, ...]
    src_parts = split_name_parts(source.lower())
    dst_parts = split_name_parts(dest.lower())
    src_parts[0] = clean_name_string(src_parts[0], replacement="",
                                     keep_whitespace=False)
    dst_parts[0] = clean_name_string(dst_parts[0], replacement="",
                                     keep_whitespace=False)

    src_sur = src_parts[0].lower()
    dst_sur = dst_parts[0].lower()
    if src_sur == dst_sur:
        total += 0.6
    else:
        # Fuzzy surname credit only for close matches on surnames
        # long enough to make the similarity meaningful.
        similar = jaro_fctn(src_sur, dst_sur) >= .95
        long_enough = min(len(src_parts[0]), len(dst_parts[0])) > 4
        if similar and long_enough:
            total += 0.4

    if src_parts[1] and dst_parts[1]:
        max_initials = max(len(src_parts[1]), len(dst_parts[1]))
        matching_i = 0
        for initial in src_parts[1]:
            if initial in dst_parts[1]:
                matching_i += 1

        max_names = max(len(src_parts[2]), len(dst_parts[2]))
        matching_n = 0
        if src_parts[2] and dst_parts[2]:
            cleaned = [clean_name_string(fn, replacement="", keep_whitespace=False)
                       for fn in dst_parts[2]]
            for fn in src_parts[2]:
                if clean_name_string(fn, replacement="", keep_whitespace=False) in cleaned:
                    matching_n += 1

        # Up to 0.4 extra for agreeing initials/first names.
        total += (matching_i + matching_n) * 0.4 / (max_names + max_initials)
    return total
Ejemplo n.º 3
0
 def test_text_to_ascii(self):
     """textutils - translate_to_ascii"""
     # (input, expected asciified output) pairs, including the
     # None / empty-list edge cases.
     cases = (
         (["á í Ú", "H\xc3\xb6hne", "Åge Øst Vær", "normal"],
          ["a i U", "Hohne", "Age Ost Vaer", "normal"]),
         ("àèéìòù", ["aeeiou"]),
         (None, None),
         ([], []),
         ([None], [None]),
     )
     for given, expected in cases:
         self.assertEqual(translate_to_ascii(given), expected)
Ejemplo n.º 4
0
 def test_text_to_ascii(self):
     """textutils - transliterate to ascii using unidecode"""
     # A list input is asciified element-wise.
     mixed_input = ["á í Ú", "H\xc3\xb6hne", "Åge Øst Vær", "normal"]
     self.assertEqual(translate_to_ascii(mixed_input),
                      ["a i U", "Hohne", "Age Ost Vaer", "normal"])
     # A plain string comes back wrapped in a one-element list.
     self.assertEqual(translate_to_ascii("àèéìòù"), ["aeeiou"])
     # Degenerate inputs pass through unchanged.
     self.assertEqual(translate_to_ascii(None), None)
     self.assertEqual(translate_to_ascii([]), [])
     self.assertEqual(translate_to_ascii([None]), [None])
Ejemplo n.º 5
0
def compare_names(origin_name, target_name, initials_penalty=False):
    ''' Compare two names '''

    def _combined(a, b):
        # Combination of two [0, 1] scores, rescaled by SQRT2 so that
        # two perfect scores combine to 1.0; 0.0 when both are zero.
        try:
            return (a * b) / sqrt(a ** 2 + b ** 2) * SQRT2
        except ZeroDivisionError:
            return 0.0

    name_comparison_print("\nComparing: ", origin_name, ' ', target_name)

    origin_name = translate_to_ascii(origin_name)[0]
    target_name = translate_to_ascii(target_name)[0]

    # split_name_parts -> [surname, initials, first names, ...]
    no = split_name_parts(origin_name, True, "", True)
    nt = split_name_parts(target_name, True, "", True)

    name_comparison_print("|- splitted no: %s" % no)
    name_comparison_print("|- splitted nt: %s" % nt)

    FS_surname_score = surname_compatibility(no[0], nt[0])
    assert 0 <= FS_surname_score <= 1, "Compare_names: Surname score out of range"
    name_comparison_print("|- surname score: %s" % FS_surname_score)

    # True when at least one side has no spelled-out first names.
    FS_initials_only = min(len(no[2]), len(nt[2])) == 0
    FS_initials_score = initials_compatibility(no[1], nt[1])
    assert 0 <= FS_initials_score <= 1, "Compare_names: initials score out of range"
    name_comparison_print('|- initials only %s' % FS_initials_only)
    name_comparison_print('|- initials score %s' % FS_initials_score)

    FS_first_names_score = compare_first_names(no[2], nt[2])
    assert 0 <= FS_first_names_score <= 1, "Compare_names: firstname score out of range"
    name_comparison_print('|- names score %s' % FS_first_names_score)

    if FS_initials_only:
        # Without first names to compare, damp the initials score.
        FS_ns = FS_initials_score * 0.6
    else:
        FS_ns = _combined(FS_initials_score, FS_first_names_score)

    name_comparison_print('|- final scores %s %s' % (FS_surname_score, FS_ns))

    final_score = _combined(FS_surname_score, FS_ns)

    name_comparison_print("|- final score is... %s" % final_score)
    return final_score
Ejemplo n.º 6
0
def compare_names(origin_name, target_name, initials_penalty=False):
    ''' Compare two names '''

    name_comparison_print("\nComparing: ", origin_name, ' ', target_name)

    origin_name = translate_to_ascii(origin_name)[0]
    target_name = translate_to_ascii(target_name)[0]

    # split_name_parts -> [surname, initials, first names, ...]
    no = split_name_parts(origin_name, True, "", True)
    nt = split_name_parts(target_name, True, "", True)

    name_comparison_print("|- splitted no: %s" % no)
    name_comparison_print("|- splitted nt: %s" % nt)

    FS_surname_score = surname_compatibility(no[0], nt[0])
    assert 0 <= FS_surname_score <= 1, "Compare_names: Surname score out of range"
    name_comparison_print("|- surname score: %s" % FS_surname_score)

    # At least one side has no spelled-out first names.
    FS_initials_only = min(len(no[2]), len(nt[2])) == 0
    FS_initials_score = initials_compatibility(no[1], nt[1])
    assert 0 <= FS_initials_score <= 1, "Compare_names: initials score out of range"
    name_comparison_print('|- initials only %s' % FS_initials_only)
    name_comparison_print('|- initials score %s' % FS_initials_score)

    FS_first_names_score = compare_first_names(no[2], nt[2])
    assert 0 <= FS_first_names_score <= 1, "Compare_names: firstname score out of range"
    name_comparison_print('|- names score %s' % FS_first_names_score)

    if FS_initials_only:
        # Only initials available: damp the initials score.
        FS_ns = FS_initials_score * 0.6
    else:
        # Combine initials and first-name scores; SQRT2 rescales so two
        # perfect scores combine to 1.0. Zero denominator means both
        # scores are zero.
        denom = sqrt(FS_initials_score ** 2 + FS_first_names_score ** 2)
        FS_ns = ((FS_initials_score * FS_first_names_score) / denom * SQRT2
                 if denom else 0.0)

    name_comparison_print('|- final scores %s %s' % (FS_surname_score, FS_ns))

    # Same combination of surname score and names score.
    denom = sqrt(FS_surname_score ** 2 + FS_ns ** 2)
    final_score = ((FS_surname_score * FS_ns) / denom * SQRT2
                   if denom else 0.0)

    name_comparison_print("|- final score is... %s" % final_score)
    return final_score
def create_bibauthorid_indexer():
    '''
    It constructs the disk-based indexer. It consists of the dense index (which maps a name
    to the set of personids who withhold that name) and the inverted lists (which map a qgram
    to the set of name ids that share that qgram).
    '''
    name_pids_dict = get_confirmed_name_to_authors_mapping()
    if not name_pids_dict:
        # Nothing to index.
        return

    # indexable form -> (asciified name, set of personids carrying it)
    indexable_name_pids_dict = dict()

    for name in name_pids_dict.keys():
        # Index the full name in its asciified, indexable form.
        asciified_name = translate_to_ascii(name)[0]
        indexable_name = create_indexable_name(asciified_name)
        if indexable_name:
            try:
                # Several raw names may collapse to the same indexable
                # form: merge their personid sets.
                asciified_name, pids = indexable_name_pids_dict[indexable_name]
                updated_pids = pids | name_pids_dict[name]
                indexable_name_pids_dict[indexable_name] = (asciified_name, updated_pids)
            except KeyError:
                indexable_name_pids_dict[indexable_name] = (asciified_name, name_pids_dict[name])

        # Also index the surname alone, so surname-only queries can match.
        surname = split_name_parts(name)[0]
        asciified_surname = translate_to_ascii(surname)[0]
        indexable_surname = create_indexable_name(asciified_surname)
        if indexable_surname:
            try:
                asciified_surname, pids = indexable_name_pids_dict[indexable_surname]
                updated_pids = pids | name_pids_dict[name]
                indexable_name_pids_dict[indexable_surname] = (asciified_surname, updated_pids)
            except KeyError:
                indexable_name_pids_dict[indexable_surname] = (asciified_surname, name_pids_dict[name])

    indexable_names_list = indexable_name_pids_dict.keys()

    # If an exception/error occurs in any of the threads it is not detectable
    # so inter-thread communication is necessary to make it visible.
    q = Queue()
    threads = list()
    threads.append(Thread(target=create_dense_index, args=(indexable_name_pids_dict, indexable_names_list, q)))
    threads.append(Thread(target=create_inverted_lists, args=(indexable_names_list, q)))

    for t in threads:
        t.start()

    # Each worker is expected to report (all_ok, error) once through the
    # queue; the first reported failure is re-raised in this thread.
    for t in threads:
        all_ok, error = q.get(block=True)
        if not all_ok:
            raise error
        q.task_done()

    for t in threads:
        t.join()
Ejemplo n.º 8
0
def _split_and_index(el):
    """Derive the indexable forms of one (name, pids) mapping entry.

    Returns (name, pids, indexable full name, indexable surname).
    """
    name, pids = el
    ascii_form = translate_to_ascii(name)[0]
    # Strip non-name characters before splitting into
    # [surname, initials, first names, ...].
    parts = split_name_parts(indexable_name_re.sub(' ', ascii_form))
    full_index_name = create_indexable_name(parts)
    # Surname-only variant: keep the trailing comma the splitter expects.
    surname_key = create_indexable_name([parts[0] + ',', [], [], []])
    return (name, pids, full_index_name, surname_key)
Ejemplo n.º 9
0
def fallback_find_personids_by_name_string(target):
    '''
    Search engine to find persons matching the given string
    The matching is done on the surname first, and names if present.
    An ordered list (per compatibility) of pids and found names is returned.

    @param target: string name, 'surname, names I.'
    @type target: string
    @return: pid list of lists
    [pid,[[name string, occur count, compatibility]]]
    '''
    family = get_surname(target)
    ascii_family = get_surname(translate_to_ascii(target)[0])
    clean_family = get_surname(clean_string(target))

    #SANITY: avoid empty queries
    if not family:
        return list()

    # Wildcard ('%') patterns ordered from most to least restrictive.
    levels = (  # target + '%', #this introduces a weird problem: different results for mele, salvatore and salvatore mele
        family + '%', '%' + family + ',%', '%' + family[1:-1] + '%')

    if len(family) <= 4:
        # Short surnames: drop the middle pattern (too permissive).
        levels = [levels[0], levels[2]]

    # First try exact 'surname,' prefixes on the raw, asciified and
    # cleaned surname forms, merged into one de-duplicated list.
    names = list(set().union(
        *map(get_authors_by_name_regexp, (family + ',%', ascii_family + ',%',
                                          clean_family + ',%'))))

    if not names:
        # Fall back to progressively fuzzier patterns, stopping at the
        # first one that yields any match.
        for lev in levels:
            names = dbinter.get_authors_by_name_regexp(lev)
            if names:
                break

    is_canonical = False
    if not names:
        # Last resort: interpret the query as a canonical name.
        names = dbinter.get_authors_by_canonical_name_regexp(target)
        is_canonical = True

    # Collapse duplicates: each (pid, name) key gets its occurrence count
    # and a soft-comparison compatibility score against the query.
    names = groupby(sorted(names))
    names = [(key[0], key[1], len(list(data)),
              soft_compare_names(target, key[1])) for key, data in names]
    # Regroup by pid; keep only sufficiently compatible names
    # (threshold waived for canonical-name matches), best score first.
    names = groupby(names, itemgetter(0))
    names = [(key,
              sorted([(d[1], d[2], d[3])
                      for d in data if (d[3] > 0.5 or is_canonical)],
                     key=itemgetter(2),
                     reverse=True)) for key, data in names]
    names = [name for name in names if name[1]]
    # Order persons by their best name's (compatibility, name, count).
    names = sorted(names,
                   key=lambda x: (x[1][0][2], x[1][0][0], x[1][0][1]),
                   reverse=True)

    return names
Ejemplo n.º 10
0
def create_canonical_name(name):
    """Build a dot-separated canonical identifier from a raw name string."""
    try:
        name = translate_to_ascii(name)[0]
    except IndexError:
        # translate_to_ascii returned an empty list for this input.
        name = 'invalid'

    unified = create_unified_name(name, reverse=True)
    # Strip artifacts and collapse runs of whitespace to single spaces.
    unified = create_canonical_name_artifact_removal_re.sub(" ", unified)
    unified = create_canonical_name_whitespace_removal.sub(" ", unified)
    return unified.strip().replace(" ", ".")
Ejemplo n.º 11
0
def format_element(bfo):
    """
    ensure correct utf-8 handling of first initial and
    return asciified string <lastname>_<first initial>
    """
    # Python 2: decode the raw 100__a field before slicing.
    full = str(bfo.field('100__a')).decode('utf-8').replace(', ', '_')
    sep = full.find('_')
    if sep >= 0:
        # Keep everything up to and including the first initial.
        full = full[:sep + 2]
    return translate_to_ascii([full]).pop()
Ejemplo n.º 12
0
def create_canonical_name(name):
    """Turn a raw name into its canonical 'First.Last'-style form."""
    try:
        name = translate_to_ascii(name)[0]
    except IndexError:
        # Empty asciification result: fall back to a placeholder.
        name = 'invalid'

    result = create_unified_name(name, reverse=True)
    result = create_canonical_name_artifact_removal_re.sub(" ", result)
    result = create_canonical_name_whitespace_removal.sub(" ", result)
    result = result.strip().replace(" ", ".")
    return result
Ejemplo n.º 13
0
 def _sort_alphanumerically_remove_leading_articles_strip_accents(self, val):
     """
     Convert:
     'The title' => 'title'
     'A title' => 'title'
     'Title' => 'title'
     """
     if not val:
         return ''
     normalized = translate_to_ascii(val).pop().lower()
     # Split into (leading word, rest of phrase) on the first space only.
     parts = normalized.split(" ", 1)
     if len(parts) == 2 and parts[0].strip() in LEADING_ARTICLES:
         # Drop the leading article ('the', 'a', ...).
         return parts[1].strip()
     return normalized.strip()
Ejemplo n.º 14
0
 def test_text_to_ascii(self):
     """textutils - transliterate to ascii using unidecode"""
     # unidecode changed its transliteration of 'ö' in 0.04.13,
     # so both outputs are accepted.
     accepted = (["a i U", "Hohne", "Age Ost Vaer", "normal"],
                 ['a i U', 'Hoehne', 'Age Ost Vaer', 'normal'])
     self.assert_(translate_to_ascii(
         ["á í Ú", "H\xc3\xb6hne", "Åge Øst Vær", "normal"]) in accepted)
     self.assertEqual(translate_to_ascii("àèéìòù"), ["aeeiou"])
     self.assertEqual(translate_to_ascii("ß"), ["ss"])
     self.assertEqual(translate_to_ascii(None), None)
     self.assertEqual(translate_to_ascii([]), [])
     self.assertEqual(translate_to_ascii([None]), [None])
     self.assertEqual(translate_to_ascii("√"), [""])
Ejemplo n.º 15
0
def find_personids_by_name1(query_string):
    '''
    It finds a collection of personids who own a signature that is similar to the given query string.
    Its approach is by solving a 'T-occurrence problem' and then it applies some filters to the candidate
    answers so it can remove the false positives. In the end it sorts the result set based on the score
    they obtained.

    @param query_string: name to search for
    @type query_string: str

    @return: personids which own a signature similar to the query string
    @rtype: list
    '''
    # Bail out early when the search-engine tables are unavailable.
    search_engine_is_functioning = search_engine_is_operating()
    if not search_engine_is_functioning:
        return list()

    asciified_query_string = translate_to_ascii(query_string)[0]
    indexable_query_string = create_indexable_name(asciified_query_string)
    if not indexable_query_string:
        return list()

    # Fix: the original code solved the T-occurrence problem twice with
    # the same argument and discarded the first result (`s1` was dead).
    # One call is sufficient.
    nameids = solve_T_occurence_problem(indexable_query_string)
    if not nameids:
        return list()

    name_score_list = calculate_name_score(asciified_query_string, nameids)

    return name_score_list
def find_personids_by_name1(query_string):
    '''
    It finds a collection of personids who own a signature that is similar to the given query string.
    Its approach is by solving a 'T-occurrence problem' and then it applies some filters to the candidate
    answers so it can remove the false positives. In the end it sorts the result set based on the score
    they obtained.

    @param query_string: name to search for
    @type query_string: str

    @return: personids which own a signature similar to the query string
    @rtype: list
    '''
    # Bail out early when the search-engine tables are unavailable.
    search_engine_is_functioning = search_engine_is_operating()
    if not search_engine_is_functioning:
        return list()

    asciified_query_string = translate_to_ascii(query_string)[0]
    indexable_query_string = create_indexable_name(asciified_query_string)
    if not indexable_query_string:
        return list()

    # Fix: the original code solved the T-occurrence problem twice with
    # the same argument and discarded the first result (`s1` was dead).
    # One call is sufficient.
    nameids = solve_T_occurence_problem(indexable_query_string)
    if not nameids:
        return list()

    name_score_list = calculate_name_score(asciified_query_string, nameids)

    return name_score_list
Ejemplo n.º 17
0
 def test_text_to_ascii(self):
     """textutils - transliterate to ascii using unidecode"""
     result = translate_to_ascii(
         ["á í Ú", "H\xc3\xb6hne", "Åge Øst Vær", "normal"])
     # Accept both unidecode < 0.04.13 and >= 0.04.13 transliterations
     # (the handling of 'ö' changed between those versions).
     self.assert_(result in (["a i U", "Hohne", "Age Ost Vaer", "normal"],
                             ['a i U', 'Hoehne', 'Age Ost Vaer', 'normal']))
     # Remaining cases have a single expected output.
     for given, expected in (("àèéìòù", ["aeeiou"]),
                             ("ß", ["ss"]),
                             (None, None),
                             ([], []),
                             ([None], [None]),
                             ("√", [""])):
         self.assertEqual(translate_to_ascii(given), expected)
Ejemplo n.º 18
0
def cache_name_variants_of_authors(author_to_name_and_occurrence_mapping):
    """Store each author's name-variant occurrence counts, keyed by the
    indexable form of the name, into the aidDENSEINDEX table."""
    rows = list()
    for author, names_and_occurrence in author_to_name_and_occurrence_mapping.iteritems():
        indexable_names_and_occurrence = dict()
        for name, occurrences in names_and_occurrence.iteritems():
            ascii_name = translate_to_ascii(name)[0]
            indexable_name = create_indexable_name(
                split_name_parts(indexable_name_re.sub(' ', ascii_name)))
            # Distinct raw names may collapse to one indexable form;
            # accumulate their occurrence counts.
            try:
                indexable_names_and_occurrence[indexable_name] += occurrences
            except KeyError:
                indexable_names_and_occurrence[indexable_name] = occurrences

        rows.extend([author, serialize(indexable_names_and_occurrence), 1])

    populate_table('aidDENSEINDEX', ['id', 'personids', 'flag'],
                   rows,
                   empty_table_first=False)
Ejemplo n.º 19
0
def validate_match(org_record,
                   matched_record,
                   ruleset,
                   verbose=0,
                   ascii_mode=False):
    """
    This function will try to match the original record with matched record.
    This comparison uses various methods defined in configuration and/or
    determined from the source record.

    These methods can be derived from each rule-set defined, which contains a
    mapping of a certain pattern to a list of rules defining the "match-strategy".

    For example:

    ('260__', [{ 'tags' : '260__c',
                 'threshold' : 0.8,
                 'compare_mode' : 'lazy',
                 'match_mode' : 'date',
                 'result_mode' : 'normal' }])

    Quick run-down of possible values:
      Compare mode:
        'strict'    : all (sub-)fields are compared, and all must match. Order is significant.
        'normal'    : all (sub-)fields are compared, and all must match. Order is ignored.
        'lazy'      : all (sub-)fields are compared with each other and at least one must match
        'ignored'   : the tag is ignored in the match. Used to disable previously defined rules.

      Match mode:
        'title'     : uses a method specialized for comparing titles, e.g. looking for subtitles
        'author'    : uses a special authorname comparison. Will take initials into account.
        'identifier': special matching for identifiers, stripping away punctuation
        'date'      : matches dates by extracting and comparing the year
        'normal'    : normal string comparison.

      Result mode:
        'normal'    : a failed match will cause the validation to continue on other rules (if any)
                      a successful match will cause the validation to continue on other rules (if any)
        'final'     : a failed match will cause the validation to immediately exit as a failure.
                      a successful match will cause validation to immediately exit as a success.
        'joker'     : a failed match will cause the validation to continue on other rules (if any).
                      a successful match will cause validation to immediately exit as a success.

    Fields are considered matching when all its subfields or values match. ALL matching strategy
    must return successfully for a match to be validated (except for 'joker' mode).

    @param org_record: bibrec structure of original record
    @type org_record: dict

    @param matched_record: bibrec structure of matched record
    @type matched_record: dict

    @param ruleset: the default rule-set {tag: strategy,..} used when validating
    @type ruleset: dict

    @param verbose: be loud
    @type verbose: int

    @param ascii_mode: True to transform values to its ascii representation
    @type ascii_mode: bool

    @return: Number of matches succeeded divided by number of comparisons done. At least two
        successful matches must be done unless a joker or final match is found
    @rtype: float
    """
    total_number_of_matches = 0
    total_number_of_comparisons = 0
    for field_tags, threshold, compare_mode, match_mode, result_mode in ruleset:
        field_tag_list = field_tags.split(',')
        if verbose > 8:
            sys.stderr.write("\nValidating tags: %s in parsing mode '%s' and comparison\
 mode '%s' as '%s' result with threshold %0.2f\n" \
                             % (field_tag_list, compare_mode, match_mode, \
                                result_mode, threshold))
        current_matching_status = False

        ## 1. COMPARE MODE
        # Fetch defined fields from both records
        original_record_values = []
        matched_record_values = []
        for field_tag in field_tag_list:
            tag_structure = validate_tag(field_tag)
            # Fix: identity test (`is not None`) instead of `!= None`.
            if tag_structure is not None:
                tag, ind1, ind2, code = tag_structure
                # Fetch all field instances to match
                original_values = record_get_field_values(
                    org_record, tag, ind1, ind2, code)
                original_record_values.extend(
                    [value for value in original_values if value])
                matched_values = record_get_field_values(
                    matched_record, tag, ind1, ind2, code)
                matched_record_values.extend(
                    [value for value in matched_values if value])

        # Fix: the old comment/message claimed "both" records lacked the
        # field, but this triggers when EITHER record has no values.
        if not original_record_values or not matched_record_values:
            # At least one record has no values for this field, ignore.
            if verbose > 8:
                sys.stderr.write(
                    "\nNot both records have this field. Continue.\n")
            continue

        if result_mode != 'joker':
            # Since joker is a special beast (should have no impact on failure),
            # We first check if it is the current mode before incrementing number
            # of matching comparisons / attempts
            total_number_of_comparisons += 1

        if ascii_mode:
            original_record_values = translate_to_ascii(original_record_values)
            matched_record_values = translate_to_ascii(matched_record_values)

        ignore_order = True
        matches_needed = 0
        # How many field-value matches are needed for successful validation of this record
        if compare_mode == 'lazy':
            # 'lazy' : all fields are matched with each other, if any match = success
            matches_needed = 1
        elif compare_mode == 'normal':
            # 'normal' : all fields are compared, and all must match.
            # Order is ignored. The number of matches needed is equal
            # to the value count of original record
            matches_needed = len(original_record_values)
        elif compare_mode == 'strict':
            # 'strict' : all fields are compared, and all must match. Order matters.
            if len(original_record_values) != len(matched_record_values):
                # Not the same number of fields, not a valid match
                # Unless this is a joker, we return indicating failure
                if result_mode != 'joker':
                    return 0.0
                continue
            matches_needed = len(original_record_values)
            ignore_order = False
        if verbose > 8:
            sys.stderr.write("Total matches needed: %d -> " %
                             (matches_needed, ))

        ## 2. MATCH MODE
        comparison_function = None
        if match_mode == 'title':
            # Special title mode
            comparison_function = compare_fieldvalues_title
        elif match_mode == 'author':
            # Special author mode
            comparison_function = compare_fieldvalues_authorname
        elif match_mode == 'identifier':
            # Special identifier mode
            comparison_function = compare_fieldvalues_identifier
        elif match_mode == 'date':
            # Special date mode
            comparison_function = compare_fieldvalues_date
        else:
            # Normal mode
            comparison_function = compare_fieldvalues_normal

        # Get list of comparisons to perform containing extracted values
        field_comparisons = get_paired_comparisons(original_record_values, \
                                                   matched_record_values, \
                                                   ignore_order)

        if verbose > 8:
            sys.stderr.write("Field comparison values:\n%s\n" %
                             (field_comparisons, ))

        # Run comparisons according to match_mode
        current_matching_status, matches = comparison_function(field_comparisons, \
                                                               threshold, \
                                                               matches_needed)
        CFG_BIBMATCH_LOGGER.info("-- Comparing fields %s with %s = %d matches of %d" % \
                                 (str(original_record_values), \
                                  str(matched_record_values), \
                                  matches, matches_needed))

        ## 3. RESULT MODE
        if current_matching_status:
            if verbose > 8:
                sys.stderr.write("Fields matched successfully.\n")
            if result_mode in ['final', 'joker']:
                # Matching success. Return 1.0 indicating exact-match when final or joker.
                return 1.0
            total_number_of_matches += 1
        else:
            # Matching failed. Not a valid match
            if result_mode == 'final':
                # Final does not allow failure
                return 0.0
            elif result_mode == 'joker':
                if verbose > 8:
                    sys.stderr.write("Fields not matching. (Joker)\n")
            else:
                if verbose > 8:
                    sys.stderr.write("Fields not matching. \n")
    # NOTE(review): the match count is compared against a constant named
    # ..._COMPARISONS — looks intentional (minimum successful matches),
    # but worth confirming against the config's documentation.
    if total_number_of_matches < CFG_BIBMATCH_MIN_VALIDATION_COMPARISONS \
        or total_number_of_comparisons == 0:
        return 0.0
    return total_number_of_matches / float(total_number_of_comparisons)
Ejemplo n.º 20
0
 def _sort_case_insensitive_strip_accents(self, val):
     """Remove accents and convert to lower case"""
     if not val:
         return ''
     asciified = translate_to_ascii(val)
     return asciified.pop().lower()
Ejemplo n.º 21
0
 def _sort_nosymbols_case_insensitive_strip_accents(self, val):
     """Remove accents, remove symbols, and convert to lower case"""
     if not val:
         return ''
     lowered = translate_to_ascii(val).pop().lower()
     # Keep only the character runs matched by _RE_NOSYMBOLS.
     return ''.join(_RE_NOSYMBOLS.findall(lowered))
Ejemplo n.º 22
0
def compare_names(origin_name, target_name, initials_penalty=False):
    '''
    Compare two person names and return a similarity score in [0.0, 1.0].

    The score is seeded by a surname comparison, scaled down by mismatches
    in initials and first names, boosted when first names are synonymous,
    substrings of each other or equal composites, and finally penalised for
    gender disagreement, excessive surname edit distance and initials-only
    comparisons.

    @param origin_name: first name string
    @param target_name: second name string
    @param initials_penalty: if True, apply the 0.9 initials-only penalty
        even when both names consist of initials only
    @return: similarity score, 0.0 (different) to 1.0 (identical)
    '''
    MAX_ALLOWED_SURNAME_DISTANCE = 2
    name_comparison_print("\nComparing: ", origin_name, ' ', target_name)
    gendernames = GLOBAL_gendernames
    name_variations = GLOBAL_name_variations

    origin_name = translate_to_ascii(origin_name)[0]
    target_name = translate_to_ascii(target_name)[0]

    # no/nt layout as used below: [0] surname, [1] list of initials,
    # [2] list of first names -- presumably what split_name_parts
    # returns; verify against its definition.
    no = split_name_parts(origin_name, True, "", True)
    nt = split_name_parts(target_name, True, "", True)

    name_comparison_print("|- splitted no: ", no)
    name_comparison_print("|- splitted nt: ", nt)

    score = 0.0

    surname_dist = distance(no[0], nt[0])
    name_comparison_print("|- surname distance: ", surname_dist)

    # Surname score: 1.0 for identical surnames (also after stripping
    # non-alphanumeric artifacts), otherwise decreasing with edit distance.
    if surname_dist > 0:
        l_artifact_removal = re.compile("[^a-zA-Z0-9]")
        fn1 = l_artifact_removal.sub("", no[0])
        fn2 = l_artifact_removal.sub("", nt[0])

        if fn1 == fn2:
            score = 1.0
        else:
            score = max(
                0.0, 0.5 -
                (float(surname_dist) / float(MAX_ALLOWED_SURNAME_DISTANCE)))
    else:
        score = 1.0
    name_comparison_print('||- surname score: ', score)

    # initials_only: at least one name has no spelled-out first names.
    initials_only = ((min(len(no[2]), len(nt[2]))) == 0)
    only_initials_available = False
    if len(no[2]) == len(nt[2]) and initials_only:
        only_initials_available = True

    name_comparison_print('|- initials only: ', initials_only)
    name_comparison_print('|- only initials available: ',
                          only_initials_available)

    names_are_equal_composites = False
    if not initials_only:
        names_are_equal_composites = full_names_are_equal_composites(
            origin_name, target_name)
    name_comparison_print("|- equal composites: ", names_are_equal_composites)

    # Jaccard-style overlap of the two initials sets.
    max_n_initials = max(len(no[1]), len(nt[1]))
    initials_intersection = set(no[1]).intersection(set(nt[1]))
    n_initials_intersection = len(initials_intersection)
    initials_union = set(no[1]).union(set(nt[1]))
    n_initials_union = len(initials_union)

    initials_distance = distance("".join(no[1]), "".join(nt[1]))
    if n_initials_union > 0:
        initials_c = float(n_initials_intersection) / float(n_initials_union)
    else:
        initials_c = 1

    # alo is the longer initials list, alt the shorter one.
    if len(no[1]) > len(nt[1]):
        alo = no[1]
        alt = nt[1]
    else:
        alo = nt[1]
        alt = no[1]
    lo = len(alo)
    lt = len(alt)
    if max_n_initials > 0:
        # Position-weighted count of mismatching initials (paired from the
        # right), normalised by the triangular number of max_n_initials.
        initials_screwup = sum([i + 1 for i, k in enumerate(reversed(alo))
                            if lo - 1 - i < lt and k != alt[lo - 1 - i] ]) / \
                            float(float(max_n_initials * (max_n_initials + 1)) / 2)
        initials_distance = initials_distance / max_n_initials
    else:
        initials_screwup = 0
        initials_distance = 0

    # Reduce the surname score by the combined initials mismatch penalty.
    score = max((score - ((0.75 * initials_screwup + 0.10 * (1. - initials_c)\
            + 0.15 * initials_distance) * score)), 0.0)
    name_comparison_print("|- initials sets: ", no[1], " ", nt[1])
    name_comparison_print("|- initials distance: ", initials_distance)
    name_comparison_print("|- initials c: ", initials_c)
    name_comparison_print("|- initials screwup: ", initials_screwup)
    name_comparison_print("||- initials score: ", score)

    composits_eq = full_names_are_equal_composites(no, nt)
    if len(no[2]) > 0 and len(nt[2]) > 0:
        gender_eq = full_names_are_equal_gender(no, nt, gendernames)
    else:
        # Without first names on both sides gender cannot disagree.
        gender_eq = True
    vars_eq = full_names_are_synonymous(no, nt, name_variations)
    substr_eq = full_names_are_substrings(no, nt)

    if not initials_only:
        # nalo is the longer first-names list, nalt the shorter one.
        if len(no[2]) > len(nt[2]):
            nalo = no[2]
            nalt = nt[2]
        else:
            nalo = nt[2]
            nalt = no[2]
        nlo = len(nalo)
        nlt = len(nalt)
        # Pair first names from the right and record
        # (edit distance, longer length) for each pair.
        names_screwup_list = [(distance(k, nalt[nlo - 1 - i]), max(len(k), len(nalt[nlo - 1 - i])))
                             for i, k in enumerate(reversed(nalo)) \
                             if nlo - 1 - i < nlt]
        max_names_screwup = max(
            [float(i[0]) / i[1] for i in names_screwup_list])
        avg_names_screwup = sum([float(i[0]) / i[1] for i in names_screwup_list])\
                            / len(names_screwup_list)

    else:
        max_names_screwup = 0
        avg_names_screwup = 0

    # Reduce the score by the first-name mismatch penalty.
    score = max(
        score - score * (0.75 * max_names_screwup + 0.25 * avg_names_screwup),
        0.0)
    name_comparison_print("|- max names screwup: ", max_names_screwup)
    name_comparison_print("|- avg screwup: ", avg_names_screwup)
    name_comparison_print("||- names score: ", score)
    name_comparison_print("|- names composites: ", composits_eq)
    name_comparison_print("|- same gender: ", gender_eq)
    name_comparison_print("|- synonims: ", vars_eq)
    name_comparison_print("|- substrings: ", substr_eq)

    if vars_eq:
        # Boost for synonymous first-name pairs: stronger when the pair
        # occupies the same position in both names.
        synmap = [[i, j, names_are_synonymous(i, j, name_variations)]
                  for i in no[2] for j in nt[2]]
        synmap = [i for i in synmap if i[2] == True]
        name_comparison_print("|-- synmap: ", synmap)
        for i in synmap:
            if no[2].index(i[0]) == nt[2].index(i[1]):
                score = score + (1 - score) * 0.5
            else:
                score = score + (1 - score) * 0.15
    else:
        name_comparison_print("|-- synmap: empty")
    name_comparison_print("|-- synmap score: ", score)

    if substr_eq and not initials_only:
        # Smaller boost for first names that are substrings of each other.
        ssmap = [[i, j, names_are_substrings(i, j)] for i in no[2]
                 for j in nt[2]]
        ssmap = [i for i in ssmap if i[2] == True]
        name_comparison_print("|-- substr map: ", ssmap)
        for i in ssmap:
            if no[2].index(i[0]) == nt[2].index(i[1]):
                score = score + (1 - score) * 0.2
            else:
                score = score + (1 - score) * 0.05
    else:
        name_comparison_print("|-- substr map: empty")

    name_comparison_print("|-- substring score: ", score)

    if composits_eq and not initials_only:
        name_comparison_print("|-- composite names")
        score = score + (1 - score) * 0.2
    else:
        name_comparison_print("|-- not composite names")
    name_comparison_print("|-- composite score: ", score)

    if not gender_eq:
        # Disagreeing genders make a match far less likely.
        score = score / 3.
        name_comparison_print("|-- apply gender penalty")
    else:
        name_comparison_print("|--   no  gender penalty")

    name_comparison_print("|-- gender score: ", score)

    # Surnames too far apart void the match entirely.
    if surname_dist > MAX_ALLOWED_SURNAME_DISTANCE:
        score = 0.0
        name_comparison_print("|- surname trim: ", score)
    else:
        name_comparison_print("|- no surname trim: ", score)

    if initials_only and (not only_initials_available or initials_penalty):
        score = score * .9
        name_comparison_print("|- initials only penalty: ", score,
                              initials_only, only_initials_available)
    else:
        name_comparison_print("|- no initials only penalty", initials_only,
                              only_initials_available)

    name_comparison_print("||- final score:  ", score)

    return score
Ejemplo n.º 23
0
def find_personids_by_name(query_string, trust_is_operating=False):
    '''
    Return the identifiers of all authors matching the query string,
    ordered by compatibility.

    WARNING: this only queries the search engine; for a proper person
    search query use person_search_engine_query in bibauthorid_dbinterface.

    @param query_string: the query string
    @type query_string: str

    @return: author identifiers
    @rtype: list [int,]
    '''
    if not trust_is_operating:
        if not search_engine_is_operating():
            return None

    # Full query, asciified and normalised into indexable form.
    ascii_query = translate_to_ascii(query_string)[0]
    idx_query = create_indexable_name(
        split_name_parts(indexable_name_re.sub(' ', ascii_query)))

    # Surname-only variant of the query (surname plus trailing comma).
    ascii_surname = translate_to_ascii(
        split_name_parts(query_string)[0] + ',')[0]
    idx_surname = create_indexable_name(
        split_name_parts(indexable_name_re.sub(' ', ascii_surname)))

    # First-name tokens are whatever follows the surname tokens.
    first_name_tokens = idx_query.split(' ')[len(idx_surname.split(' ')):]

    string_ids = (solve_T_occurence_problem(idx_query) |
                  solve_T_occurence_problem(idx_surname))
    if not string_ids:
        return list()

    strings_to_ids = get_indexed_strings(string_ids)

    passing_ids, surname_score_cache = remove_false_positives(
        idx_surname, strings_to_ids)
    if not passing_ids:
        return list()

    # Collect every author appearing in any matching author group.
    authors = set()
    for group in get_author_groups_from_string_ids(passing_ids):
        authors |= set(deserialize(group[0]))

    author_names = get_name_variants_for_authors(authors)

    score_clusters = create_surname_score_clusters(
        idx_surname, author_names, surname_score_cache, strings_to_ids)

    return sort_authors(idx_query, first_name_tokens, score_clusters,
                        author_names, strings_to_ids)
Ejemplo n.º 24
0
def create_bibauthorid_indexer():
    '''
    Build the disk-based indexer: the dense index (mapping a name to the
    set of personids who hold that name) and the inverted lists (mapping a
    qgram to the set of name ids that share that qgram).
    '''
    name_pids_dict = get_confirmed_name_to_authors_mapping()
    if not name_pids_dict:
        return

    indexable_name_pids_dict = dict()

    def _register(indexable, asciified, pids):
        # Merge *pids* into the entry for *indexable*, creating it if new.
        if indexable in indexable_name_pids_dict:
            known_ascii, known_pids = indexable_name_pids_dict[indexable]
            indexable_name_pids_dict[indexable] = (known_ascii,
                                                   known_pids | pids)
        else:
            indexable_name_pids_dict[indexable] = (asciified, pids)

    for name, pids in name_pids_dict.items():
        # Index the full name...
        asciified_name = translate_to_ascii(name)[0]
        indexable_name = create_indexable_name(asciified_name)
        if indexable_name:
            _register(indexable_name, asciified_name, pids)

        # ...and the bare surname as well.
        asciified_surname = translate_to_ascii(split_name_parts(name)[0])[0]
        indexable_surname = create_indexable_name(asciified_surname)
        if indexable_surname:
            _register(indexable_surname, asciified_surname, pids)

    indexable_names_list = indexable_name_pids_dict.keys()

    # Exceptions raised inside a thread are invisible to the caller, so
    # the workers report success/failure back through the queue.
    q = Queue()
    workers = [
        Thread(target=create_dense_index,
               args=(indexable_name_pids_dict, indexable_names_list, q)),
        Thread(target=create_inverted_lists,
               args=(indexable_names_list, q)),
    ]

    for worker in workers:
        worker.start()

    # Collect one status report per worker; re-raise the first failure.
    for _ in workers:
        all_ok, error = q.get(block=True)
        if not all_ok:
            raise error
        q.task_done()

    for worker in workers:
        worker.join()
Ejemplo n.º 25
0
def validate_match(org_record, matched_record, ruleset, verbose=0, ascii_mode=False):
    """
    This function will try to match the original record with matched record.
    This comparison uses various methods defined in configuration and/or
    determined from the source record.

    These methods can be derived from each rule-set defined, which contains a
    mapping of a certain pattern to a list of rules defining the "match-strategy".

    For example:

    ('260__', [{ 'tags' : '260__c',
                 'threshold' : 0.8,
                 'compare_mode' : 'lazy',
                 'match_mode' : 'date',
                 'result_mode' : 'normal' }])

    Quick run-down of possible values:
      Compare mode:
        'strict'    : all (sub-)fields are compared, and all must match. Order is significant.
        'normal'    : all (sub-)fields are compared, and all must match. Order is ignored.
        'lazy'      : all (sub-)fields are compared with each other and at least one must match
        'ignored'   : the tag is ignored in the match. Used to disable previously defined rules.

      Match mode:
        'title'     : uses a method specialized for comparing titles, e.g. looking for subtitles
        'author'    : uses a special authorname comparison. Will take initials into account.
        'identifier': special matching for identifiers, stripping away punctuation
        'date'      : matches dates by extracting and comparing the year
        'normal'    : normal string comparison.

      Result mode:
        'normal'    : a failed match will cause the validation to continue on other rules (if any)
                      a successful match will cause the validation to continue on other rules (if any)
        'final'     : a failed match will cause the validation to immediately exit as a failure.
                      a successful match will cause validation to immediately exit as a success.
        'joker'     : a failed match will cause the validation to continue on other rules (if any).
                      a successful match will cause validation to immediately exit as a success.

    Fields are considered matching when all its subfields or values match. ALL matching strategy
    must return successfully for a match to be validated (except for 'joker' mode).

    @param org_record: bibrec structure of original record
    @type org_record: dict

    @param matched_record: bibrec structure of matched record
    @type matched_record: dict

    @param ruleset: the default rule-set {tag: strategy,..} used when validating
    @type ruleset: dict

    @param verbose: be loud
    @type verbose: int

    @param ascii_mode: True to transform values to its ascii representation
    @type ascii_mode: bool

    @return: Number of matches succeeded divided by number of comparisons done. At least two
        successful matches must be done unless a joker or final match is found
    @rtype: float
    """
    total_number_of_matches = 0
    total_number_of_comparisons = 0
    # Each non-joker rule counts as one comparison; the final score is
    # matches / comparisons unless a 'final' or 'joker' rule short-circuits.
    for field_tags, threshold, compare_mode, match_mode, result_mode in ruleset:
        field_tag_list = field_tags.split(',')
        if verbose > 8:
            sys.stderr.write("\nValidating tags: %s in parsing mode '%s' and comparison\
 mode '%s' as '%s' result with threshold %0.2f\n" \
                             % (field_tag_list, compare_mode, match_mode, \
                                result_mode, threshold))
        current_matching_status = False

        ## 1. COMPARE MODE
        # Fetch defined fields from both records
        original_record_values = []
        matched_record_values = []
        for field_tag in field_tag_list:
            tag_structure = validate_tag(field_tag)
            if tag_structure != None:
                tag, ind1, ind2, code = tag_structure
                # Fetch all field instances to match, dropping empty values
                original_values = record_get_field_values(org_record, tag, ind1, ind2, code)
                original_record_values.extend([value for value in original_values if value])
                matched_values = record_get_field_values(matched_record, tag, ind1, ind2, code)
                matched_record_values.extend([value for value in matched_values if value])

        if (len(original_record_values) == 0 or len(matched_record_values) == 0):
            # At least one of the records has no values for these tags:
            # nothing to compare, so this rule is skipped.
            if verbose > 8:
                sys.stderr.write("\nBoth records do not have this field. Continue.\n")
            continue

        if result_mode != 'joker':
            # Since joker is a special beast (should have no impact on failure),
            # We first check if it is the current mode before incrementing number
            # of matching comparisons / attempts
            total_number_of_comparisons += 1

        if ascii_mode:
            original_record_values = translate_to_ascii(original_record_values)
            matched_record_values = translate_to_ascii(matched_record_values)

        ignore_order = True
        matches_needed = 0
        # How many field-value matches are needed for successful validation of this record
        if compare_mode == 'lazy':
            # 'lazy' : all fields are matched with each other, if any match = success
            matches_needed = 1
        elif compare_mode == 'normal':
            # 'normal' : all fields are compared, and all must match.
            # Order is ignored. The number of matches needed is equal
            # to the value count of original record
            matches_needed = len(original_record_values)
        elif compare_mode == 'strict':
            # 'strict' : all fields are compared, and all must match. Order matters.
            if len(original_record_values) != len(matched_record_values):
                # Not the same number of fields, not a valid match
                # Unless this is a joker, we return indicating failure
                if result_mode != 'joker':
                    return 0.0
                continue
            matches_needed = len(original_record_values)
            ignore_order = False
        if verbose > 8:
            sys.stderr.write("Total matches needed: %d -> " % (matches_needed,))

        ## 2. MATCH MODE
        comparison_function = None
        if match_mode == 'title':
            # Special title mode
            comparison_function = compare_fieldvalues_title
        elif match_mode == 'author':
            # Special author mode
            comparison_function = compare_fieldvalues_authorname
        elif match_mode == 'identifier':
            # Special identifier mode
            comparison_function = compare_fieldvalues_identifier
        elif match_mode == 'date':
            # Special date mode (year extraction and comparison)
            comparison_function = compare_fieldvalues_date
        else:
            # Normal mode
            comparison_function = compare_fieldvalues_normal

        # Get list of comparisons to perform containing extracted values
        field_comparisons = get_paired_comparisons(original_record_values, \
                                                   matched_record_values, \
                                                   ignore_order)

        if verbose > 8:
            sys.stderr.write("Field comparison values:\n%s\n" % (field_comparisons,))

        # Run comparisons according to match_mode
        current_matching_status, matches = comparison_function(field_comparisons, \
                                                               threshold, \
                                                               matches_needed)
        CFG_BIBMATCH_LOGGER.info("-- Comparing fields %s with %s = %d matches of %d" % \
                                 (str(original_record_values), \
                                  str(matched_record_values), \
                                  matches, matches_needed))

        ## 3. RESULT MODE
        if current_matching_status:
            if verbose > 8:
                sys.stderr.write("Fields matched successfully.\n")
            if result_mode in ['final', 'joker']:
                # Matching success: 'final' and 'joker' rules short-circuit
                # immediately with a perfect score.
                return 1.0
            total_number_of_matches += 1
        else:
            # Matching failed. Not a valid match
            if result_mode == 'final':
                # Final does not allow failure
                return 0.0
            elif result_mode == 'joker':
                if verbose > 8:
                    sys.stderr.write("Fields not matching. (Joker)\n")
            else:
                if verbose > 8:
                    sys.stderr.write("Fields not matching. \n")
    # Require the configured minimum number of successful matches and at
    # least one non-joker comparison before reporting a ratio.
    if total_number_of_matches < CFG_BIBMATCH_MIN_VALIDATION_COMPARISONS \
        or total_number_of_comparisons == 0:
        return 0.0
    return total_number_of_matches / float(total_number_of_comparisons)
Ejemplo n.º 26
0
def compare_names(origin_name, target_name, initials_penalty=False):
    '''
    Compare two person names and return a similarity score in [0.0, 1.0].

    The score is seeded by a surname comparison, scaled down by mismatches
    in initials and first names, boosted when first names are synonymous,
    substrings of each other or equal composites, and finally penalised for
    gender disagreement, excessive surname edit distance and initials-only
    comparisons.

    @param origin_name: first name string
    @param target_name: second name string
    @param initials_penalty: if True, apply the 0.9 initials-only penalty
        even when both names consist of initials only
    @return: similarity score, 0.0 (different) to 1.0 (identical)
    '''
    MAX_ALLOWED_SURNAME_DISTANCE = 2
    name_comparison_print("\nComparing: " , origin_name, ' ', target_name)
    gendernames = GLOBAL_gendernames
    name_variations = GLOBAL_name_variations

    origin_name = translate_to_ascii(origin_name)[0]
    target_name = translate_to_ascii(target_name)[0]

    # no/nt layout as used below: [0] surname, [1] list of initials,
    # [2] list of first names -- presumably what split_name_parts
    # returns; verify against its definition.
    no = split_name_parts(origin_name, True, "", True)
    nt = split_name_parts(target_name, True, "", True)

    name_comparison_print("|- splitted no: ", no)
    name_comparison_print("|- splitted nt: ", nt)

    score = 0.0

    surname_dist = distance(no[0], nt[0])
    name_comparison_print("|- surname distance: ", surname_dist)

    # Surname score: 1.0 for identical surnames (also after stripping
    # non-alphanumeric artifacts), otherwise decreasing with edit distance.
    if surname_dist > 0:
        l_artifact_removal = re.compile("[^a-zA-Z0-9]")
        fn1 = l_artifact_removal.sub("", no[0])
        fn2 = l_artifact_removal.sub("", nt[0])

        if fn1 == fn2:
            score = 1.0
        else:
            score = max(0.0, 0.5 - (float(surname_dist) / float(MAX_ALLOWED_SURNAME_DISTANCE)))
    else:
        score = 1.0
    name_comparison_print('||- surname score: ', score)

    # initials_only: at least one name has no spelled-out first names.
    initials_only = ((min(len(no[2]), len(nt[2]))) == 0)
    only_initials_available = False
    if len(no[2]) == len(nt[2]) and initials_only:
        only_initials_available = True

    name_comparison_print('|- initials only: ', initials_only)
    name_comparison_print('|- only initials available: ', only_initials_available)

    names_are_equal_composites = False
    if not initials_only:
        names_are_equal_composites = full_names_are_equal_composites(origin_name, target_name)
    name_comparison_print("|- equal composites: ", names_are_equal_composites)

    # Jaccard-style overlap of the two initials sets.
    max_n_initials = max(len(no[1]), len(nt[1]))
    initials_intersection = set(no[1]).intersection(set(nt[1]))
    n_initials_intersection = len(initials_intersection)
    initials_union = set(no[1]).union(set(nt[1]))
    n_initials_union = len(initials_union)


    initials_distance = distance("".join(no[1]), "".join(nt[1]))
    if n_initials_union > 0:
        initials_c = float(n_initials_intersection) / float(n_initials_union)
    else:
        initials_c = 1

    # alo is the longer initials list, alt the shorter one.
    if len(no[1]) > len(nt[1]):
        alo = no[1]
        alt = nt[1]
    else:
        alo = nt[1]
        alt = no[1]
    lo = len(alo)
    lt = len(alt)
    if max_n_initials > 0:
        # Position-weighted count of mismatching initials (paired from the
        # right), normalised by the triangular number of max_n_initials.
        initials_screwup = sum([i + 1 for i, k in enumerate(reversed(alo))
                            if lo - 1 - i < lt and k != alt[lo - 1 - i] ]) / \
                            float(float(max_n_initials * (max_n_initials + 1)) / 2)
        initials_distance = initials_distance / max_n_initials
    else:
        initials_screwup = 0
        initials_distance = 0

    # Reduce the surname score by the combined initials mismatch penalty.
    score = max((score - ((0.75 * initials_screwup + 0.10 * (1. - initials_c)\
            + 0.15 * initials_distance) * score)), 0.0)
    name_comparison_print("|- initials sets: ", no[1], " ", nt[1])
    name_comparison_print("|- initials distance: ", initials_distance)
    name_comparison_print("|- initials c: ", initials_c)
    name_comparison_print("|- initials screwup: ", initials_screwup)
    name_comparison_print("||- initials score: ", score)

    composits_eq = full_names_are_equal_composites(no, nt)
    if len(no[2]) > 0 and len(nt[2]) > 0:
        gender_eq = full_names_are_equal_gender(no, nt, gendernames)
    else:
        # Without first names on both sides gender cannot disagree.
        gender_eq = True
    vars_eq = full_names_are_synonymous(no, nt, name_variations)
    substr_eq = full_names_are_substrings(no, nt)

    if not initials_only:
        # nalo is the longer first-names list, nalt the shorter one.
        if len(no[2]) > len(nt[2]):
            nalo = no[2]
            nalt = nt[2]
        else:
            nalo = nt[2]
            nalt = no[2]
        nlo = len(nalo)
        nlt = len(nalt)
        # Pair first names from the right and record
        # (edit distance, longer length) for each pair.
        names_screwup_list = [(distance(k, nalt[nlo - 1 - i]), max(len(k), len(nalt[nlo - 1 - i])))
                             for i, k in enumerate(reversed(nalo)) \
                             if nlo - 1 - i < nlt]
        max_names_screwup = max([float(i[0]) / i[1] for i in names_screwup_list])
        avg_names_screwup = sum([float(i[0]) / i[1] for i in names_screwup_list])\
                            / len(names_screwup_list)

    else:
        max_names_screwup = 0
        avg_names_screwup = 0

    # Reduce the score by the first-name mismatch penalty.
    score = max(score - score * ( 0.75 * max_names_screwup + 0.25 * avg_names_screwup), 0.0)
    name_comparison_print("|- max names screwup: ", max_names_screwup)
    name_comparison_print("|- avg screwup: ", avg_names_screwup)
    name_comparison_print("||- names score: ", score)
    name_comparison_print("|- names composites: ", composits_eq)
    name_comparison_print("|- same gender: ", gender_eq)
    name_comparison_print("|- synonims: ", vars_eq)
    name_comparison_print("|- substrings: ", substr_eq)

    if vars_eq:
        # Boost for synonymous first-name pairs: stronger when the pair
        # occupies the same position in both names.
        synmap = [[i, j, names_are_synonymous(i, j, name_variations)] for i in no[2] for j in nt[2]]
        synmap = [i for i in synmap if i[2] == True]
        name_comparison_print("|-- synmap: ", synmap)
        for i in synmap:
            if no[2].index(i[0]) == nt[2].index(i[1]):
                score = score + (1 - score) * 0.5
            else:
                score = score + (1 - score) * 0.15
    else:
        name_comparison_print("|-- synmap: empty")
    name_comparison_print("|-- synmap score: ", score)

    if substr_eq and not initials_only:
        # Smaller boost for first names that are substrings of each other.
        ssmap = [[i, j, names_are_substrings(i, j)] for i in no[2] for j in nt[2]]
        ssmap = [i for i in ssmap if i[2] == True]
        name_comparison_print("|-- substr map: ", ssmap)
        for i in ssmap:
            if no[2].index(i[0]) == nt[2].index(i[1]):
                score = score + (1 - score) * 0.2
            else:
                score = score + (1 - score) * 0.05
    else:
        name_comparison_print("|-- substr map: empty")

    name_comparison_print("|-- substring score: ", score)

    if composits_eq and not initials_only:
        name_comparison_print("|-- composite names")
        score = score + (1 - score) * 0.2
    else:
        name_comparison_print("|-- not composite names")
    name_comparison_print("|-- composite score: ", score)

    if not gender_eq:
        # Disagreeing genders make a match far less likely.
        score = score / 3.
        name_comparison_print("|-- apply gender penalty")
    else:
        name_comparison_print("|--   no  gender penalty")

    name_comparison_print("|-- gender score: ", score)

    # Surnames too far apart void the match entirely.
    if surname_dist > MAX_ALLOWED_SURNAME_DISTANCE:
        score = 0.0
        name_comparison_print("|- surname trim: ", score)
    else:
        name_comparison_print("|- no surname trim: ", score)

    if initials_only and (not only_initials_available or initials_penalty):
        score = score * .9
        name_comparison_print("|- initials only penalty: ", score, initials_only, only_initials_available)
    else:
        name_comparison_print("|- no initials only penalty", initials_only, only_initials_available)

    name_comparison_print("||- final score:  ", score)

    return score