コード例 #1
0
ファイル: name.py プロジェクト: Lysxia/dissemin
def name_signature(first, last):
    """
    Build a short signature string from a first and a last name.

    The last name is stripped, passed through ``iunaccent`` and then run
    through three module-level regular expressions.  When ``first`` is not
    empty, its first character (also passed through ``iunaccent``) is
    prepended, followed by a hyphen.

    :param first: the first name (may be an empty string)
    :param last: the last name
    :returns: the signature string
    """
    signature = iunaccent(last.strip())
    # Order matters: matches of the first two patterns are removed before
    # the remaining matches of nn_nontext_re are replaced by hyphens.
    substitutions = (
        (nn_escaping_chars_re, ''),
        (nn_final_nontext_re, ''),
        (nn_nontext_re, '-'),
    )
    for pattern, replacement in substitutions:
        signature = pattern.sub(replacement, signature)
    if len(first) == 0:
        return signature
    return iunaccent(first[0]) + '-' + signature
コード例 #2
0
def name_signature(first, last):
    """
    Compute the signature of a name: optional first initial + last name.

    :param first: the first name; only its first character is used, and
        nothing is prepended when it is empty
    :param last: the last name; it is stripped, passed through
        ``iunaccent`` and cleaned with the ``nn_*`` regular expressions
    :returns: ``<initial>-<cleaned last name>``, or just the cleaned last
        name when ``first`` is empty
    """
    cleaned = nn_escaping_chars_re.sub('', iunaccent(last.strip()))
    # The final-non-text removal must run before the generic replacement
    # by hyphens, hence the nesting order.
    cleaned = nn_nontext_re.sub('-', nn_final_nontext_re.sub('', cleaned))
    prefix = (iunaccent(first[0]) + '-') if len(first) else ''
    return prefix + cleaned
コード例 #3
0
def name_similarity(a, b):
    """
    Score how similar two names are.

    :param a: a ``(first, last)`` pair
    :param b: a ``(first, last)`` pair
    :returns: a number clamped to the range [0, 1].  ``False`` is returned
        when either argument is empty or is not of length 2.
    """
    if not (a and b and len(a) == 2 and len(b) == 2):
        return False

    first_a, last_a = a
    first_b, last_b = b
    first_a, first_b = iunaccent(first_a), iunaccent(first_b)
    last_a, last_b = iunaccent(last_a), iunaccent(last_b)

    # Last names have to be identical once normalised.
    if last_a != last_b:
        return 0.

    words_a, _ = split_name_words(first_a)
    words_b, _ = split_name_words(first_b)

    paired = list(zip(words_a, words_b))
    if not all(match_first_names(pair) for pair in paired):
        # Second chance: align the first-name words from the end.
        words_a.reverse()
        words_b.reverse()
        paired = list(zip(words_a, words_b))
        if not all(match_first_names(pair) for pair in paired):
            return 0.

    score = 0
    # For every position, record whether each side has a word longer than
    # one character (i.e. not a bare initial).
    expanded = []
    for word_a, word_b in paired:
        score += weight_first_names((word_a, word_b))
        expanded.append((len(word_a) > 1, len(word_b) > 1))

    # At most one of the two lists has words left beyond the paired ones;
    # each leftover word costs a quarter of its weight.
    for leftover in words_a[len(paired):]:
        score -= 0.25*weight_first_name(leftover)
        expanded.append((len(leftover) > 1, False))
    for leftover in words_b[len(paired):]:
        score -= 0.25*weight_first_name(leftover)
        expanded.append((False, len(leftover) > 1))

    # The expanded first names of one side must be included in those of
    # the other.  This prevents ('Amanda P.', 'Brown') and
    # ('A. Patrick', 'Brown') from matching.
    only_b_expanded = any(in_b and not in_a for in_a, in_b in expanded)
    only_a_expanded = any(in_a and not in_b for in_a, in_b in expanded)
    if only_b_expanded and only_a_expanded:
        return 0.

    return max(min(score, 1), 0)
コード例 #4
0
def name_similarity(a, b):
    """
    Returns a float: how similar are these two names?

    Each name is a ``(first, last)`` pair.  Last names must be equal after
    ``iunaccent``; the first names are split into words which are matched
    pairwise, from the start or, failing that, from the end.

    Examples given by the original documentation:
    name_similarity(('Robin', 'Ryder'),('Robin', 'Ryder')) == 0.8
    name_similarity(('Robin', 'Ryder'),('R.', 'Ryder')) == 0.4
    name_similarity(('R.', 'Ryder'),('R.', 'Ryder')) == 0.4
    name_similarity(('Robin J.', 'Ryder'),('R.', 'Ryder')) == 0.3
    name_similarity(('Robin J.', 'Ryder'),('R. J.', 'Ryder')) == 0.8
    name_similarity(('R. J.', 'Ryder'),('J.', 'Ryder')) == 0.3
    name_similarity(('Robin', 'Ryder'),('Robin J.', 'Ryder')) == 0.7
    name_similarity(('W. Timothy','Gowers'), ('Timothy','Gowers')) == 0.7
    name_similarity(('Robin K.','Ryder'), ('Robin J.', 'Ryder')) == 0
    name_similarity(('Claire', 'Mathieu'),('Claire', 'Kenyon-Mathieu')) == 0

    :param a: a ``(first, last)`` pair
    :param b: a ``(first, last)`` pair
    :returns: a score clamped to [0, 1]; ``False`` (which compares equal
        to 0) when either argument is empty
    """

    if not a or not b:
        return False
    (firstA, lastA) = a
    (firstB, lastB) = b
    firstA = iunaccent(firstA)
    firstB = iunaccent(firstB)
    lastA = iunaccent(lastA)
    lastB = iunaccent(lastB)
    if lastA != lastB:
        return 0.
    partsA, _ = split_name_words(firstA)
    partsB, _ = split_name_words(firstB)
    # The pairs are iterated, measured with len() and indexed below, so
    # they must be a real list: on Python 3 a bare zip() is a one-shot
    # iterator that supports none of this.
    parts = list(zip(partsA, partsB))
    if not all(map(match_first_names, parts)):
        # Try to match in reverse
        partsA.reverse()
        partsB.reverse()
        parts = list(zip(partsA, partsB))
        if not all(map(match_first_names, parts)):
            return 0.
    maxlen = max(len(partsA), len(partsB))
    sumscores = 0
    for i in range(maxlen):
        if i < len(parts):
            # Word present on both sides: add the weight of the pair.
            sumscores += weight_first_names(parts[i])
        elif i < len(partsA):
            # Extra word only in A: small penalty.
            sumscores -= 0.25 * weight_first_name(partsA[i])
        else:
            # Extra word only in B: small penalty.
            sumscores -= 0.25 * weight_first_name(partsB[i])
    sumscores = max(min(sumscores, 1), 0)
    return sumscores
コード例 #5
0
ファイル: name.py プロジェクト: jilljenn/dissemin
def name_similarity(a,b):
    """
    Returns a float: how similar are these two names?

    Both arguments are ``(first, last)`` pairs.  The score is 0 unless the
    last names are equal after ``iunaccent`` and the first-name words can
    be matched pairwise, either from the start or from the end.

    Examples given by the original documentation:
    name_similarity(('Robin', 'Ryder'),('Robin', 'Ryder')) == 0.8
    name_similarity(('Robin', 'Ryder'),('R.', 'Ryder')) == 0.4
    name_similarity(('R.', 'Ryder'),('R.', 'Ryder')) == 0.4
    name_similarity(('Robin J.', 'Ryder'),('R.', 'Ryder')) == 0.3
    name_similarity(('Robin J.', 'Ryder'),('R. J.', 'Ryder')) == 0.8
    name_similarity(('R. J.', 'Ryder'),('J.', 'Ryder')) == 0.3
    name_similarity(('Robin', 'Ryder'),('Robin J.', 'Ryder')) == 0.7
    name_similarity(('W. Timothy','Gowers'), ('Timothy','Gowers')) == 0.7
    name_similarity(('Robin K.','Ryder'), ('Robin J.', 'Ryder')) == 0
    name_similarity(('Claire', 'Mathieu'),('Claire', 'Kenyon-Mathieu')) == 0

    :param a: a ``(first, last)`` pair
    :param b: a ``(first, last)`` pair
    :returns: a score clamped to [0, 1]; ``False`` (which compares equal
        to 0) when either argument is empty
    """

    if not a or not b:
        return False
    (firstA,lastA) = a
    (firstB,lastB) = b
    firstA = iunaccent(firstA)
    firstB = iunaccent(firstB)
    lastA = iunaccent(lastA)
    lastB = iunaccent(lastB)
    if lastA != lastB:
        return 0.
    partsA, _ = split_name_words(firstA)
    partsB, _ = split_name_words(firstB)
    # Materialise the pairs: they are traversed more than once and their
    # length is needed, which a Python 3 zip() iterator cannot provide.
    parts = list(zip(partsA, partsB))
    if not all(map(match_first_names, parts)):
        # Try to match in reverse
        partsA.reverse()
        partsB.reverse()
        parts = list(zip(partsA, partsB))
        if not all(map(match_first_names, parts)):
            return 0.

    sumscores = 0
    # Words present on both sides contribute the weight of their pair.
    for pair in parts:
        sumscores += weight_first_names(pair)
    # Words present on one side only (at most one of these two slices is
    # non-empty) cost a quarter of their weight.
    paired = len(parts)
    for extra in partsA[paired:]:
        sumscores -= 0.25*weight_first_name(extra)
    for extra in partsB[paired:]:
        sumscores -= 0.25*weight_first_name(extra)
    return max(min(sumscores, 1), 0)
コード例 #6
0
def populate_identifiers(apps, se):
    """
    Fill in the ``identifiers`` field of every ``papers.Institution``.

    Each institution gets a list holding its existing ``identifier`` and,
    when both ``country`` and ``name`` are set, an additional
    ``<country>:<iunaccent(name)>`` entry.  Only the ``identifiers`` field
    is saved.

    :param apps: registry used to look up the ``Institution`` model
        (presumably the historical app registry of a Django migration)
    :param se: unused here
    """
    institution_model = apps.get_model('papers', 'Institution')
    for institution in institution_model.objects.all():
        identifiers = [institution.identifier]
        if institution.country and institution.name:
            identifiers.append(
                institution.country + ':' + iunaccent(institution.name))
        institution.identifiers = identifiers
        institution.save(update_fields=['identifiers'])
コード例 #7
0
def populate_identifiers(apps, se):
    """
    Populate ``Institution.identifiers`` for all stored institutions.

    The list always starts with the institution's ``identifier``; a second
    entry of the form ``country:name`` (name passed through ``iunaccent``)
    is added when both values are set.

    :param apps: object whose ``get_model`` returns the ``Institution``
        model of the ``papers`` app
    :param se: not used by this function
    """
    model = apps.get_model('papers', 'Institution')
    for inst in model.objects.all():
        has_country_and_name = inst.country and inst.name
        inst.identifiers = (
            [inst.identifier, inst.country+':'+iunaccent(inst.name)]
            if has_country_and_name
            else [inst.identifier]
        )
        # Write back only the column that was changed.
        inst.save(update_fields=['identifiers'])
コード例 #8
0
 def create(cls, first, last):
     """
     Build a Name-like instance of ``cls`` without saving it.

     Handy for name lookups, where it is not yet known whether the name
     should be kept in the model.

     :param first: first name; truncated to MAX_NAME_LENGTH, stripped and
         passed through ``sanitize_html``
     :param last: last name; processed the same way
     :returns: the new, unsaved instance, with ``full`` set to
         ``iunaccent(first + ' ' + last)``
     """
     obj = cls()
     for attribute, raw in (('first', first), ('last', last)):
         # Truncation happens before stripping, as in the stored fields.
         setattr(obj, attribute, sanitize_html(raw[:MAX_NAME_LENGTH].strip()))
     obj.full = iunaccent(obj.first+' '+obj.last)
     return obj
コード例 #9
0
ファイル: baremodels.py プロジェクト: Phyks/dissemin
 def create(cls, first, last):
     """
     Return a fresh, unsaved instance of ``cls`` for the given name.

     Nothing is written to storage, which makes this suitable for lookups
     where the name may be discarded afterwards.

     :param first: raw first name
     :param last: raw last name
     :returns: an instance whose ``first`` and ``last`` are the truncated,
         stripped and HTML-sanitized inputs and whose ``full`` is the
         ``iunaccent`` form of ``first + ' ' + last``
     """
     instance = cls()

     trimmed_first = first[:MAX_NAME_LENGTH].strip()
     instance.first = sanitize_html(trimmed_first)

     trimmed_last = last[:MAX_NAME_LENGTH].strip()
     instance.last = sanitize_html(trimmed_last)

     full_name = instance.first+' '+instance.last
     instance.full = iunaccent(full_name)
     return instance
コード例 #10
0
def shallower_name_similarity(a, b):
    """
    Same as name_similarity, but accepts differences in the last names.
    This heuristic is more costly but is only used to attribute an ORCID
    affiliation to the right author in papers fetched from ORCID.

    The last names are compared as sets of words (size of the intersection
    over size of the union); the first names are reduced to the first
    character of each word and matched pairwise, from the start or,
    failing that, from the end.

    :param a: a ``(first, last)`` pair
    :param b: a ``(first, last)`` pair
    :returns: a similarity score; ``False`` when an argument is empty, is
        not of length 2, or has a last name without any word; ``0.`` when
        the initials cannot be matched
    """
    if not a or not b or len(a) != 2 or len(b) != 2:
        return False
    firstA, lastA = a
    firstB, lastB = b

    # Matching last names
    lastA = iunaccent(lastA)
    lastB = iunaccent(lastB)
    wordsA, _ = split_name_words(lastA)
    wordsB, _ = split_name_words(lastB)
    wordsA = set(wordsA)
    wordsB = set(wordsB)
    if not wordsA or not wordsB:
        return False
    ratio = float(len(wordsA & wordsB)) / len(wordsA | wordsB)

    partsA, _ = split_name_words(firstA)
    partsB, _ = split_name_words(firstB)
    # Keep only the initial of every first-name word.
    partsA = [p[0] for p in partsA]
    partsB = [p[0] for p in partsB]

    # list() is required: len(parts) is taken below, and a Python 3 zip()
    # iterator has no length and would already be exhausted by all().
    parts = list(zip(partsA, partsB))
    if not all(map(match_first_names, parts)):
        # Try to match in reverse
        partsA.reverse()
        partsB.reverse()
        parts = list(zip(partsA, partsB))
        if not all(map(match_first_names, parts)):
            return 0.

    maxlen = max(len(partsA), len(partsB))
    # The +1 on both sides keeps the result defined when there are no
    # first-name words at all.
    return ratio * (len(parts) + 1) / (maxlen + 1)
コード例 #11
0
ファイル: name.py プロジェクト: Phyks/dissemin
def shallower_name_similarity(a, b):
    """
    Looser variant of name_similarity that tolerates different last names.

    More costly, but only used to attribute an ORCID affiliation to the
    right author in papers fetched from ORCID.

    :param a: a ``(first, last)`` pair
    :param b: a ``(first, last)`` pair
    :returns: ``False`` for unusable input (empty argument, wrong length,
        or a last name without words), ``0.`` when the first-name initials
        do not match, otherwise the last-name word overlap scaled by the
        share of first-name words that could be paired
    """
    if not (a and b and len(a) == 2 and len(b) == 2):
        return False
    first_a, last_a = a
    first_b, last_b = b

    # Last names: overlap of their word sets.
    last_a, last_b = iunaccent(last_a), iunaccent(last_b)
    words_a, _ = split_name_words(last_a)
    words_b, _ = split_name_words(last_b)
    words_a, words_b = set(words_a), set(words_b)
    if not (words_a and words_b):
        return False
    shared = len(words_a & words_b)
    total = len(words_a | words_b)
    ratio = float(shared) / total

    # First names: compare initials only.
    first_words_a, _ = split_name_words(first_a)
    first_words_b, _ = split_name_words(first_b)
    initials_a = [word[0] for word in first_words_a]
    initials_b = [word[0] for word in first_words_b]

    pairs = list(zip(initials_a, initials_b))
    if not all(match_first_names(pair) for pair in pairs):
        # Second attempt: align the initials from the end.
        pairs = list(zip(reversed(initials_a), reversed(initials_b)))
        if not all(match_first_names(pair) for pair in pairs):
            return 0.

    longest = max(len(initials_a), len(initials_b))
    return ratio*(len(pairs)+1)/(longest+1)
コード例 #12
0
def shallower_name_similarity(a, b):
    """
    Same as name_similarity, but accepts differences in the last names.
    This heuristic is more costly but is only used to attribute an ORCID
    affiliation to the right author in papers fetched from ORCID.

    The last names are compared as sets of words (size of the intersection
    over size of the union); the first names are reduced to the first
    character of each word and matched pairwise, from the start or,
    failing that, from the end.

    :param a: a ``(first, last)`` pair
    :param b: a ``(first, last)`` pair
    :returns: a similarity score; ``False`` when an argument is empty;
        ``0.`` when neither last name contains a word, when the initials
        cannot be matched, or when there are no first-name words
    """
    if not a or not b:
        return False
    firstA, lastA = a
    firstB, lastB = b

    # Matching last names
    lastA = iunaccent(lastA)
    lastB = iunaccent(lastB)
    wordsA, _ = split_name_words(lastA)
    wordsB, _ = split_name_words(lastB)
    wordsA = set(wordsA)
    wordsB = set(wordsB)
    all_words = wordsA | wordsB
    if not all_words:
        # Neither last name has any word: nothing to compare, and the
        # division below would raise ZeroDivisionError.
        return 0.
    ratio = float(len(wordsA & wordsB)) / len(all_words)

    partsA, _ = split_name_words(firstA)
    partsB, _ = split_name_words(firstB)
    # Real lists are needed (not map() iterators): they are reversed in
    # place and measured with len() below.
    partsA = [p[0] for p in partsA]
    partsB = [p[0] for p in partsB]

    # Likewise, the pairs are both iterated and measured.
    parts = list(zip(partsA, partsB))
    if not all(map(match_first_names, parts)):
        # Try to match in reverse
        partsA.reverse()
        partsB.reverse()
        parts = list(zip(partsA, partsB))
        if not all(map(match_first_names, parts)):
            return 0.

    maxlen = max(len(partsA), len(partsB))
    if maxlen > 0:
        return ratio * len(parts) / maxlen
    return 0.
コード例 #13
0
ファイル: name.py プロジェクト: jilljenn/dissemin
def shallower_name_similarity(a, b):
    """
    Same as name_similarity, but accepts differences in the last names.
    This heuristic is more costly but is only used to attribute an ORCID
    affiliation to the right author in papers fetched from ORCID.

    :param a: a ``(first, last)`` pair
    :param b: a ``(first, last)`` pair
    :returns: the word overlap of the last names (intersection over
        union) multiplied by the fraction of first-name words that could
        be paired; ``False`` when an argument is empty; ``0.`` when
        neither last name contains a word, when the first-name initials
        do not match, or when there are no first-name words
    """
    if not a or not b:
        return False
    firstA, lastA = a
    firstB, lastB = b

    # Matching last names
    lastA = iunaccent(lastA)
    lastB = iunaccent(lastB)
    wordsA, _ = split_name_words(lastA)
    wordsB, _ = split_name_words(lastB)
    wordsA = set(wordsA)
    wordsB = set(wordsB)
    union = wordsA | wordsB
    if not union:
        # Guard against ZeroDivisionError when both last names are
        # empty of words.
        return 0.
    ratio = float(len(wordsA & wordsB)) / len(union)

    # Matching first names on their initials.  List comprehensions are
    # used instead of map() because the results are reversed in place and
    # passed to len(), which Python 3 map objects do not support.
    partsA, _ = split_name_words(firstA)
    partsB, _ = split_name_words(firstB)
    partsA = [p[0] for p in partsA]
    partsB = [p[0] for p in partsB]

    # Same reason for list(): len(parts) is needed after the iteration.
    parts = list(zip(partsA, partsB))
    if not all(map(match_first_names, parts)):
        # Try to match in reverse
        partsA.reverse()
        partsB.reverse()
        parts = list(zip(partsA, partsB))
        if not all(map(match_first_names, parts)):
            return 0.

    maxlen = max(len(partsA), len(partsB))
    if maxlen > 0:
        return ratio*len(parts)/maxlen
    return 0.
コード例 #14
0
 def test_iunaccent(self):
         """iunaccent must lower-case its input and drop the accent."""
         normalized = iunaccent('BÉPO forever')
         self.assertEqual(normalized, 'bepo forever')
コード例 #15
0
def name_similarity(a, b):
    """
    Returns a float: how similar are these two names?

    Both arguments are ``(first, last)`` pairs.  ``False`` (which compares
    equal to 0) is returned when either argument is empty.

    Examples:

    >>> int(10*name_similarity(('Robin', 'Ryder'),('Robin', 'Ryder')))
    8
    >>> int(10*name_similarity(('Robin', 'Ryder'),('R.', 'Ryder')))
    4
    >>> int(10*name_similarity(('R.', 'Ryder'),('R.', 'Ryder')))
    4
    >>> int(10*name_similarity(('Robin J.', 'Ryder'),('R.', 'Ryder')))
    3
    >>> int(10*name_similarity(('Robin J.', 'Ryder'),('R. J.', 'Ryder')))
    8
    >>> int(10*name_similarity(('R. J.', 'Ryder'),('J.', 'Ryder')))
    3
    >>> int(10*name_similarity(('Robin', 'Ryder'),('Robin J.', 'Ryder')))
    7
    >>> int(10*name_similarity(('W. Timothy','Gowers'), ('Timothy','Gowers')))
    7
    >>> int(10*name_similarity(('Robin K.','Ryder'), ('Robin J.', 'Ryder')))
    0
    >>> int(10*name_similarity(('Claire', 'Mathieu'),('Claire', 'Kenyon-Mathieu')))
    0
    >>> int(10*name_similarity(('Amanda P.','Brown'),('Patrick','Brown')))
    0
    """

    if not a or not b:
        return False
    (firstA, lastA) = a
    (firstB, lastB) = b
    firstA = iunaccent(firstA)
    firstB = iunaccent(firstB)
    lastA = iunaccent(lastA)
    lastB = iunaccent(lastB)
    if lastA != lastB:
        return 0.
    partsA, _ = split_name_words(firstA)
    partsB, _ = split_name_words(firstB)
    # The pairs are iterated, measured with len() and indexed below, so
    # they must be a real list: on Python 3 a bare zip() is a one-shot
    # iterator that supports none of this.
    parts = list(zip(partsA, partsB))
    if not all(map(match_first_names, parts)):
        # Try to match in reverse
        partsA.reverse()
        partsB.reverse()
        parts = list(zip(partsA, partsB))
        if not all(map(match_first_names, parts)):
            return 0.

    maxlen = max(len(partsA), len(partsB))
    sumscores = 0
    # For every position: (A has a full word, B has a full word), where a
    # "full word" is one longer than a single character.
    expanded = []
    for i in range(maxlen):
        if i < len(parts):
            sumscores += weight_first_names(parts[i])
            expanded.append((len(partsA[i]) > 1, len(partsB[i]) > 1))
        elif i < len(partsA):
            sumscores -= 0.25 * weight_first_name(partsA[i])
            expanded.append((len(partsA[i]) > 1, False))
        else:
            sumscores -= 0.25 * weight_first_name(partsB[i])
            expanded.append((False, len(partsB[i]) > 1))

    # Make sure expanded first names of A are included in that of B
    # or that of B are included in that of A.
    # This prevents ('Amanda P.','Brown') and ('A. Patrick','Brown')
    # from matching.
    # (The loop variables are deliberately not named a/b, so that the
    # function parameters are never shadowed or rebound.)
    if not (all([wa or not wb for wa, wb in expanded])
            or all([wb or not wa for wa, wb in expanded])):
        return 0.

    sumscores = max(min(sumscores, 1), 0)
    return sumscores
コード例 #16
0
ファイル: name.py プロジェクト: Lysxia/dissemin
def name_similarity(a,b):
    """
    Returns a float: how similar are these two names?

    Both arguments are ``(first, last)`` pairs.  ``False`` (which compares
    equal to 0) is returned when either argument is empty.

    Examples:

    >>> int(10*name_similarity(('Robin', 'Ryder'),('Robin', 'Ryder')))
    8
    >>> int(10*name_similarity(('Robin', 'Ryder'),('R.', 'Ryder')))
    4
    >>> int(10*name_similarity(('R.', 'Ryder'),('R.', 'Ryder')))
    4
    >>> int(10*name_similarity(('Robin J.', 'Ryder'),('R.', 'Ryder')))
    3
    >>> int(10*name_similarity(('Robin J.', 'Ryder'),('R. J.', 'Ryder')))
    8
    >>> int(10*name_similarity(('R. J.', 'Ryder'),('J.', 'Ryder')))
    3
    >>> int(10*name_similarity(('Robin', 'Ryder'),('Robin J.', 'Ryder')))
    7
    >>> int(10*name_similarity(('W. Timothy','Gowers'), ('Timothy','Gowers')))
    7
    >>> int(10*name_similarity(('Robin K.','Ryder'), ('Robin J.', 'Ryder')))
    0
    >>> int(10*name_similarity(('Claire', 'Mathieu'),('Claire', 'Kenyon-Mathieu')))
    0
    >>> int(10*name_similarity(('Amanda P.','Brown'),('Patrick','Brown')))
    0
    """

    if not a or not b:
        return False
    (firstA,lastA) = a
    (firstB,lastB) = b
    firstA = iunaccent(firstA)
    firstB = iunaccent(firstB)
    lastA = iunaccent(lastA)
    lastB = iunaccent(lastB)
    if lastA != lastB:
        return 0.
    partsA, _ = split_name_words(firstA)
    partsB, _ = split_name_words(firstB)
    # Materialise the pairs: they are traversed more than once and their
    # length is needed, which a Python 3 zip() iterator cannot provide.
    parts = list(zip(partsA, partsB))
    if not all(map(match_first_names, parts)):
        # Try to match in reverse
        partsA.reverse()
        partsB.reverse()
        parts = list(zip(partsA, partsB))
        if not all(map(match_first_names, parts)):
            return 0.

    sumscores = 0
    # For every position: (A has a full word, B has a full word), where a
    # "full word" is one longer than a single character.
    expanded = []
    for wordA, wordB in parts:
        sumscores += weight_first_names((wordA, wordB))
        expanded.append((len(wordA)>1, len(wordB)>1))
    # Words left on one side only (at most one of these slices is
    # non-empty) cost a quarter of their weight.
    paired = len(parts)
    for extra in partsA[paired:]:
        sumscores -= 0.25*weight_first_name(extra)
        expanded.append((len(extra)>1, False))
    for extra in partsB[paired:]:
        sumscores -= 0.25*weight_first_name(extra)
        expanded.append((False, len(extra)>1))

    # Make sure expanded first names of A are included in that of B
    # or that of B are included in that of A.
    # This prevents ('Amanda P.','Brown') and ('A. Patrick','Brown')
    # from matching.
    # (The loop variables are deliberately not named a/b, so that the
    # function parameters are never shadowed or rebound.)
    if not (all([wa or not wb for wa, wb in expanded]) or
            all([wb or not wa for wa, wb in expanded])):
        return 0.

    return max(min(sumscores, 1), 0)