Python iunaccentの例、papers.utils.iunaccent Pythonの例

コード例 #1

0

ファイルを表示

ファイル: name.py プロジェクト: Lysxia/dissemin

def name_signature(first, last):
    """
    Build the signature string for a (first, last) name pair.

    The last name is stripped and passed through iunaccent, then three
    module-level patterns are applied in order: matches of
    nn_escaping_chars_re and nn_final_nontext_re are removed, and
    matches of nn_nontext_re are replaced by '-'. When `first` is not
    empty, the result is prefixed with iunaccent of its first character
    followed by '-'.
    """
    signature = iunaccent(last.strip())
    # (pattern, replacement) pairs, applied in this exact order.
    substitutions = (
        (nn_escaping_chars_re, ''),
        (nn_final_nontext_re, ''),
        (nn_nontext_re, '-'),
    )
    for pattern, replacement in substitutions:
        signature = pattern.sub(replacement, signature)
    if len(first) == 0:
        # No first name: the signature is the processed last name alone.
        return signature
    initial = iunaccent(first[0])
    return initial + '-' + signature

コード例 #2

0

ファイルを表示

def name_signature(first, last):
    """
    Compute the signature of a name from its first and last parts.

    `last` is stripped and run through iunaccent; matches of
    nn_escaping_chars_re and then nn_final_nontext_re are deleted, and
    every match of nn_nontext_re becomes '-'. If `first` has at least
    one character, iunaccent of that first character plus '-' is put in
    front of the result.
    """
    cleaned = nn_escaping_chars_re.sub('', iunaccent(last.strip()))
    cleaned = nn_final_nontext_re.sub('', cleaned)
    hyphenated = nn_nontext_re.sub('-', cleaned)
    if len(first) > 0:
        return iunaccent(first[0]) + '-' + hyphenated
    return hyphenated

コード例 #3

0

ファイルを表示

def name_similarity(a, b):
    """
    Score how similar two names are.

    Both arguments are expected to be (first, last) pairs. False is
    returned when either one is empty or does not hold exactly two
    items. Otherwise the result is a number clamped to [0, 1]; it is 0
    when the last names differ after iunaccent, when the first-name
    words cannot be matched pairwise (in order, then reversed), or when
    neither name's expanded words cover the other's.
    """
    if not (a and b):
        return False
    if len(a) != 2 or len(b) != 2:
        return False

    raw_first_a, raw_last_a = a
    raw_first_b, raw_last_b = b
    first_a, first_b, last_a, last_b = map(
        iunaccent, (raw_first_a, raw_first_b, raw_last_a, raw_last_b))
    if last_a != last_b:
        return 0.

    words_a, _ = split_name_words(first_a)
    words_b, _ = split_name_words(first_b)

    def pairs_match(candidates):
        # True when every (word of A, word of B) pair is accepted.
        return all(match_first_names(pair) for pair in candidates)

    pairs = list(zip(words_a, words_b))
    if not pairs_match(pairs):
        # Second chance: align the words starting from the end.
        words_a.reverse()
        words_b.reverse()
        pairs = list(zip(words_a, words_b))
        if not pairs_match(pairs):
            return 0.

    score = 0
    # One (A is expanded, B is expanded) flag pair per word position,
    # where "expanded" means the word is longer than one character.
    expanded = []
    for pair in pairs:
        word_a, word_b = pair
        score += weight_first_names(pair)
        expanded.append((len(word_a) > 1, len(word_b) > 1))

    # Words without a counterpart cost a quarter of their weight.
    # At most one of the two tails below is non-empty.
    paired = len(pairs)
    for leftover in words_a[paired:]:
        score -= 0.25*weight_first_name(leftover)
        expanded.append((len(leftover) > 1, False))
    for leftover in words_b[paired:]:
        score -= 0.25*weight_first_name(leftover)
        expanded.append((False, len(leftover) > 1))

    # The expanded first names of one side must be included in those of
    # the other side. This prevents ('Amanda P.','Brown') and
    # ('A. Patrick','Brown') from matching.
    a_covers_b = all(in_a or not in_b for in_a, in_b in expanded)
    b_covers_a = all(in_b or not in_a for in_a, in_b in expanded)
    if not a_covers_b and not b_covers_a:
        return 0.

    return max(min(score, 1), 0)

コード例 #4

0

ファイルを表示

def name_similarity(a, b):
    """
    Returns a float: how similar are these two names?

    Each name is a (first, last) pair. False is returned when either
    name is empty or is not a pair. Otherwise the score is clamped to
    the range [0, 1]; it is 0 when the last names differ after
    iunaccent or when the first name words cannot be matched pairwise,
    either in order or reversed.

    Examples:
    name_similarity(('Robin', 'Ryder'),('Robin', 'Ryder')) == 0.8
    name_similarity(('Robin', 'Ryder'),('R.', 'Ryder')) == 0.4
    name_similarity(('R.', 'Ryder'),('R.', 'Ryder')) == 0.4
    name_similarity(('Robin J.', 'Ryder'),('R.', 'Ryder')) == 0.3
    name_similarity(('Robin J.', 'Ryder'),('R. J.', 'Ryder')) == 0.8
    name_similarity(('R. J.', 'Ryder'),('J.', 'Ryder')) == 0.3
    name_similarity(('Robin', 'Ryder'),('Robin J.', 'Ryder')) == 0.7
    name_similarity(('W. Timothy','Gowers'), ('Timothy','Gowers')) == 0.7
    name_similarity(('Robin K.','Ryder'), ('Robin J.', 'Ryder')) == 0
    name_similarity(('Claire', 'Mathieu'),('Claire', 'Kenyon-Mathieu')) == 0
    """

    # Reject missing or malformed names up front instead of failing on
    # the tuple unpacking below.
    if not a or not b or len(a) != 2 or len(b) != 2:
        return False
    (firstA, lastA) = a
    (firstB, lastB) = b
    firstA = iunaccent(firstA)
    firstB = iunaccent(firstB)
    lastA = iunaccent(lastA)
    lastB = iunaccent(lastB)
    if lastA != lastB:
        return 0.
    partsA, _ = split_name_words(firstA)
    partsB, _ = split_name_words(firstB)
    # The pairs are iterated by all(), then measured and indexed in the
    # scoring loop, so they must be a real list: on Python 3 a bare
    # zip() is a one-shot iterator without len() or indexing.
    parts = list(zip(partsA, partsB))
    if not all(map(match_first_names, parts)):
        # Try to match in reverse
        partsA.reverse()
        partsB.reverse()
        parts = list(zip(partsA, partsB))
        if not all(map(match_first_names, parts)):
            return 0.
    maxlen = max(len(partsA), len(partsB))
    sumscores = 0
    for i in range(maxlen):
        if i < len(parts):
            # Position present in both names: add the pair's weight.
            sumscores += weight_first_names(parts[i])
        elif i < len(partsA):
            # Extra word only in A: penalize by a quarter of its weight.
            sumscores -= 0.25 * weight_first_name(partsA[i])
        else:
            # Extra word only in B: same penalty.
            sumscores -= 0.25 * weight_first_name(partsB[i])
    sumscores = max(min(sumscores, 1), 0)
    return sumscores

コード例 #5

0

ファイルを表示

ファイル: name.py プロジェクト: jilljenn/dissemin

def name_similarity(a, b):
    """
    Returns a float: how similar are these two names?

    `a` and `b` are (first, last) pairs. When either one is empty or
    does not contain exactly two items, False is returned. Otherwise
    the result lies in [0, 1]: 0 if the last names differ once passed
    through iunaccent, or if the words of the first names cannot be
    matched pairwise in order or in reverse order.

    Examples:
    name_similarity(('Robin', 'Ryder'),('Robin', 'Ryder')) == 0.8
    name_similarity(('Robin', 'Ryder'),('R.', 'Ryder')) == 0.4
    name_similarity(('R.', 'Ryder'),('R.', 'Ryder')) == 0.4
    name_similarity(('Robin J.', 'Ryder'),('R.', 'Ryder')) == 0.3
    name_similarity(('Robin J.', 'Ryder'),('R. J.', 'Ryder')) == 0.8
    name_similarity(('R. J.', 'Ryder'),('J.', 'Ryder')) == 0.3
    name_similarity(('Robin', 'Ryder'),('Robin J.', 'Ryder')) == 0.7
    name_similarity(('W. Timothy','Gowers'), ('Timothy','Gowers')) == 0.7
    name_similarity(('Robin K.','Ryder'), ('Robin J.', 'Ryder')) == 0
    name_similarity(('Claire', 'Mathieu'),('Claire', 'Kenyon-Mathieu')) == 0
    """

    # Guard against missing names and against sequences that are not
    # pairs, which would otherwise break the unpacking below.
    if not a or not b or len(a) != 2 or len(b) != 2:
        return False
    (firstA, lastA) = a
    (firstB, lastB) = b
    firstA = iunaccent(firstA)
    firstB = iunaccent(firstB)
    lastA = iunaccent(lastA)
    lastB = iunaccent(lastB)
    if lastA != lastB:
        return 0.
    partsA, _ = split_name_words(firstA)
    partsB, _ = split_name_words(firstB)
    # list() is required: the pairs are traversed by all() and later
    # passed to len() and indexed, which a Python 3 zip iterator cannot
    # do (and it would already be exhausted).
    parts = list(zip(partsA, partsB))
    if not all(map(match_first_names, parts)):
        # Try to match in reverse
        partsA.reverse()
        partsB.reverse()
        parts = list(zip(partsA, partsB))
        if not all(map(match_first_names, parts)):
            return 0.
    maxlen = max(len(partsA), len(partsB))
    sumscores = 0
    for i in range(maxlen):
        if i < len(parts):
            # Word present on both sides.
            sumscores += weight_first_names(parts[i])
        elif i < len(partsA):
            # Unpaired word of A: subtract a quarter of its weight.
            sumscores -= 0.25*weight_first_name(partsA[i])
        else:
            # Unpaired word of B: subtract a quarter of its weight.
            sumscores -= 0.25*weight_first_name(partsB[i])
    sumscores = max(min(sumscores, 1), 0)
    return sumscores

コード例 #6

0

ファイルを表示

def populate_identifiers(apps, se):
    """
    Fill the `identifiers` field of every Institution.

    The 'papers.Institution' model is obtained from `apps`. Each
    institution gets a list starting with its existing `identifier`;
    when both `country` and `name` are set, a second entry made of the
    country, ':' and iunaccent(name) is added. Only the `identifiers`
    field is saved. `se` is not used.
    """
    institution_model = apps.get_model('papers', 'Institution')
    for institution in institution_model.objects.all():
        identifiers = [institution.identifier]
        if institution.country and institution.name:
            identifiers.append(
                institution.country + ':' + iunaccent(institution.name))
        institution.identifiers = identifiers
        institution.save(update_fields=['identifiers'])

コード例 #7

0

ファイルを表示

ファイル: 0043_institutions_multiple_identifiers.py プロジェクト: Phyks/dissemin

def populate_identifiers(apps, se):
    """
    Populate `identifiers` on all Institution rows.

    Looks up the 'Institution' model of the 'papers' app through
    `apps`, then stores on each object a list holding its `identifier`
    and, if it has both a `country` and a `name`, the string
    country + ':' + iunaccent(name). Each object is saved with
    update_fields restricted to 'identifiers'. The `se` argument is
    ignored.
    """
    model = apps.get_model('papers', 'Institution')
    for inst in model.objects.all():
        has_country_and_name = inst.country and inst.name
        if not has_country_and_name:
            inst.identifiers = [inst.identifier]
        else:
            composite = inst.country+':'+iunaccent(inst.name)
            inst.identifiers = [inst.identifier, composite]
        inst.save(update_fields=['identifiers'])

コード例 #8

0

ファイルを表示

 def create(cls, first, last):
     """
     Build an unsaved instance of `cls` from a first and a last name.

     Both names are cut to MAX_NAME_LENGTH characters, stripped and
     passed through sanitize_html; `full` is iunaccent applied to the
     stored first and last names joined by a single space. Nothing is
     saved here, which suits lookups where the name may not be kept.
     """
     name = cls()
     truncated_first = first[:MAX_NAME_LENGTH]
     name.first = sanitize_html(truncated_first.strip())
     truncated_last = last[:MAX_NAME_LENGTH]
     name.last = sanitize_html(truncated_last.strip())
     # Read the values back from the instance, as stored.
     full_name = name.first+' '+name.last
     name.full = iunaccent(full_name)
     return name

コード例 #9

0

ファイルを表示

ファイル: baremodels.py プロジェクト: Phyks/dissemin

 def create(cls, first, last):
     """
     Return a new, unsaved `cls` object for the given name.

     `first` and `last` are each truncated to MAX_NAME_LENGTH, stripped
     and cleaned with sanitize_html before being stored. The `full`
     attribute is iunaccent of "<first> <last>" built from the stored
     values. The object is only constructed, never saved, so it can be
     used for lookups without committing the name.
     """
     obj = cls()
     cleaned_first = sanitize_html(first[:MAX_NAME_LENGTH].strip())
     obj.first = cleaned_first
     cleaned_last = sanitize_html(last[:MAX_NAME_LENGTH].strip())
     obj.last = cleaned_last
     obj.full = iunaccent(obj.first+' '+obj.last)
     return obj

コード例 #10

0

ファイルを表示

def shallower_name_similarity(a, b):
    """
    Same as name_similarity, but accepts differences in the last names.
    This heuristics is more costly but is only used to attribute an ORCID
    affiliation to the right author in papers fetched from ORCID.

    `a` and `b` are (first, last) pairs. False is returned when either
    is empty, is not a pair, or has a last name without any word.
    0. is returned when the initials of the first names cannot be
    matched pairwise, in order or reversed. Otherwise the result is the
    overlap ratio of the last-name words multiplied by
    (matched initials + 1) / (longest initials list + 1).
    """
    if not a or not b or len(a) != 2 or len(b) != 2:
        return False
    firstA, lastA = a
    firstB, lastB = b

    # Matching last names
    lastA = iunaccent(lastA)
    lastB = iunaccent(lastB)
    wordsA, _ = split_name_words(lastA)
    wordsB, _ = split_name_words(lastB)
    wordsA = set(wordsA)
    wordsB = set(wordsB)
    if not wordsA or not wordsB:
        return False
    # Shared words over all distinct words of both last names.
    ratio = float(len(wordsA & wordsB)) / len(wordsA | wordsB)

    # Only the first character of each first-name word is compared.
    partsA, _ = split_name_words(firstA)
    partsB, _ = split_name_words(firstB)
    partsA = [p[0] for p in partsA]
    partsB = [p[0] for p in partsB]

    # list() is required: the pairs are iterated by all() and then
    # measured with len(), which a Python 3 zip iterator does not
    # support.
    parts = list(zip(partsA, partsB))
    if not all(map(match_first_names, parts)):
        # Try to match in reverse
        partsA.reverse()
        partsB.reverse()
        parts = list(zip(partsA, partsB))
        if not all(map(match_first_names, parts)):
            return 0.

    maxlen = max(len(partsA), len(partsB))
    return ratio * (len(parts) + 1) / (maxlen + 1)

コード例 #11

0

ファイルを表示

ファイル: name.py プロジェクト: Phyks/dissemin

def shallower_name_similarity(a, b):
    """
    Variant of name_similarity that tolerates differing last names.

    This heuristic is more costly, but it is only used to attribute an
    ORCID affiliation to the right author in papers fetched from ORCID.

    Returns False when either argument is empty, is not a two-item
    sequence, or has a last name that yields no words; returns 0. when
    the first-name initials cannot be matched pairwise in either
    direction; otherwise returns the last-name word overlap ratio scaled
    by (matched initials + 1) / (longest initials list + 1).
    """
    if not (a and b):
        return False
    if len(a) != 2 or len(b) != 2:
        return False
    first_a, last_a = a
    first_b, last_b = b

    # Last names: compare the sets of words they contain.
    last_a = iunaccent(last_a)
    last_b = iunaccent(last_b)
    last_words_a, _ = split_name_words(last_a)
    last_words_b, _ = split_name_words(last_b)
    last_words_a = set(last_words_a)
    last_words_b = set(last_words_b)
    if not (last_words_a and last_words_b):
        return False
    shared = last_words_a.intersection(last_words_b)
    combined = last_words_a.union(last_words_b)
    ratio = float(len(shared)) / len(combined)

    # First names: keep only the first character of every word.
    words_a, _ = split_name_words(first_a)
    words_b, _ = split_name_words(first_b)
    initials_a = [word[0] for word in words_a]
    initials_b = [word[0] for word in words_b]

    def pairs_match(candidates):
        # True when every pair of initials is accepted.
        return all(match_first_names(pair) for pair in candidates)

    pairs = list(zip(initials_a, initials_b))
    if not pairs_match(pairs):
        # Second attempt with both lists of initials reversed.
        initials_a = initials_a[::-1]
        initials_b = initials_b[::-1]
        pairs = list(zip(initials_a, initials_b))
        if not pairs_match(pairs):
            return 0.

    longest = max(len(initials_a), len(initials_b))
    return ratio*(len(pairs)+1)/(longest+1)

コード例 #12

0

ファイルを表示

def shallower_name_similarity(a, b):
    """
    Same as name_similarity, but accepts differences in the last names.
    This heuristics is more costly but is only used to attribute an ORCID
    affiliation to the right author in papers fetched from ORCID.

    `a` and `b` are (first, last) pairs; False is returned when either
    is empty or is not a pair. 0. is returned when neither last name
    contains any word, when the first-name initials cannot be matched
    pairwise (in order or reversed), or when there are no initials at
    all. Otherwise the result is the overlap ratio of the last-name
    words multiplied by (matched initials) / (longest initials list).
    """
    # Reject missing or malformed names instead of failing on unpacking.
    if not a or not b or len(a) != 2 or len(b) != 2:
        return False
    firstA, lastA = a
    firstB, lastB = b

    # Matching last names
    lastA = iunaccent(lastA)
    lastB = iunaccent(lastB)
    wordsA, _ = split_name_words(lastA)
    wordsB, _ = split_name_words(lastB)
    wordsA = set(wordsA)
    wordsB = set(wordsB)
    allWords = wordsA | wordsB
    if not allWords:
        # Neither last name has a word: avoid dividing by zero.
        return 0.
    ratio = float(len(wordsA & wordsB)) / len(allWords)

    partsA, _ = split_name_words(firstA)
    partsB, _ = split_name_words(firstB)
    # Real lists are needed here: they are reversed in place and
    # measured below, which Python 3 map/zip iterators do not support.
    partsA = [p[0] for p in partsA]
    partsB = [p[0] for p in partsB]

    parts = list(zip(partsA, partsB))
    if not all(map(match_first_names, parts)):
        # Try to match in reverse
        partsA.reverse()
        partsB.reverse()
        parts = list(zip(partsA, partsB))
        if not all(map(match_first_names, parts)):
            return 0.

    maxlen = max(len(partsA), len(partsB))
    if maxlen > 0:
        return ratio * len(parts) / maxlen
    return 0.

コード例 #13

0

ファイルを表示

ファイル: name.py プロジェクト: jilljenn/dissemin

def shallower_name_similarity(a, b):
    """
    Same as name_similarity, but accepts differences in the last names.
    This heuristics is more costly but is only used to attribute an ORCID
    affiliation to the right author in papers fetched from ORCID.

    Both arguments are (first, last) pairs. Returns False when either
    one is empty or does not have exactly two items. Returns 0. when
    no word is found in either last name, when the initials of the
    first names cannot be matched pairwise in order or reversed, or
    when no initials exist. Otherwise returns the last-name word
    overlap ratio times (matched initials) / (longest initials list).
    """
    # Missing names and non-pairs are rejected before unpacking.
    if not a or not b or len(a) != 2 or len(b) != 2:
        return False
    firstA, lastA = a
    firstB, lastB = b

    # Matching last names
    lastA = iunaccent(lastA)
    lastB = iunaccent(lastB)
    wordsA, _ = split_name_words(lastA)
    wordsB, _ = split_name_words(lastB)
    wordsA = set(wordsA)
    wordsB = set(wordsB)
    allWords = wordsA | wordsB
    if not allWords:
        # Both word sets are empty: the ratio would divide by zero.
        return 0.
    ratio = float(len(wordsA & wordsB)) / len(allWords)

    partsA, _ = split_name_words(firstA)
    partsB, _ = split_name_words(firstB)
    # Build lists of initials explicitly: on Python 3, map() and zip()
    # return iterators that cannot be reversed in place or measured.
    partsA = [p[0] for p in partsA]
    partsB = [p[0] for p in partsB]

    parts = list(zip(partsA, partsB))
    if not all(map(match_first_names, parts)):
        # Try to match in reverse
        partsA.reverse()
        partsB.reverse()
        parts = list(zip(partsA, partsB))
        if not all(map(match_first_names, parts)):
            return 0.

    maxlen = max(len(partsA), len(partsB))
    if maxlen > 0:
        return ratio*len(parts)/maxlen
    return 0.

コード例 #14

0

ファイルを表示

 def test_iunaccent(self):
         # iunaccent is expected to turn 'BÉPO forever' into 'bepo forever'.
         actual = iunaccent('BÉPO forever')
         expected = 'bepo forever'
         self.assertEqual(actual, expected)

コード例 #15

0

ファイルを表示

def name_similarity(a, b):
    """
    Returns a float: how similar are these two names?

    Each name is a (first, last) pair. False is returned when either
    name is empty or is not a pair. Otherwise the score is clamped to
    [0, 1]; it is 0 when the last names differ after iunaccent, when
    the first name words cannot be matched pairwise (in order or
    reversed), or when the expanded words of neither name cover those
    of the other.

    Examples:

    >>> int(10*name_similarity(('Robin', 'Ryder'),('Robin', 'Ryder')))
    8
    >>> int(10*name_similarity(('Robin', 'Ryder'),('R.', 'Ryder')))
    4
    >>> int(10*name_similarity(('R.', 'Ryder'),('R.', 'Ryder')))
    4
    >>> int(10*name_similarity(('Robin J.', 'Ryder'),('R.', 'Ryder')))
    3
    >>> int(10*name_similarity(('Robin J.', 'Ryder'),('R. J.', 'Ryder')))
    8
    >>> int(10*name_similarity(('R. J.', 'Ryder'),('J.', 'Ryder')))
    3
    >>> int(10*name_similarity(('Robin', 'Ryder'),('Robin J.', 'Ryder')))
    7
    >>> int(10*name_similarity(('W. Timothy','Gowers'), ('Timothy','Gowers')))
    7
    >>> int(10*name_similarity(('Robin K.','Ryder'), ('Robin J.', 'Ryder')))
    0
    >>> int(10*name_similarity(('Claire', 'Mathieu'),('Claire', 'Kenyon-Mathieu')))
    0
    >>> int(10*name_similarity(('Amanda P.','Brown'),('Patrick','Brown')))
    0
    """

    # Reject missing or malformed names up front instead of failing on
    # the tuple unpacking below.
    if not a or not b or len(a) != 2 or len(b) != 2:
        return False
    (firstA, lastA) = a
    (firstB, lastB) = b
    firstA = iunaccent(firstA)
    firstB = iunaccent(firstB)
    lastA = iunaccent(lastA)
    lastB = iunaccent(lastB)
    if lastA != lastB:
        return 0.
    partsA, _ = split_name_words(firstA)
    partsB, _ = split_name_words(firstB)
    # The pairs are iterated by all(), then measured and indexed in the
    # scoring loop, so they must be a real list: on Python 3 a bare
    # zip() is a one-shot iterator without len() or indexing.
    parts = list(zip(partsA, partsB))
    if not all(map(match_first_names, parts)):
        # Try to match in reverse
        partsA.reverse()
        partsB.reverse()
        parts = list(zip(partsA, partsB))
        if not all(map(match_first_names, parts)):
            return 0.

    maxlen = max(len(partsA), len(partsB))
    sumscores = 0
    # One (A expanded, B expanded) pair per word position; a word is
    # "expanded" when it is longer than one character.
    expanded = []
    for i in range(maxlen):
        if i < len(parts):
            sumscores += weight_first_names(parts[i])
            expanded.append((len(partsA[i]) > 1, len(partsB[i]) > 1))
        elif i < len(partsA):
            # Extra word only in A: penalize by a quarter of its weight.
            sumscores -= 0.25 * weight_first_name(partsA[i])
            expanded.append((len(partsA[i]) > 1, False))
        else:
            # Extra word only in B: same penalty.
            sumscores -= 0.25 * weight_first_name(partsB[i])
            expanded.append((False, len(partsB[i]) > 1))

    # Make sure the expanded first names of A are included in those of B,
    # or that those of B are included in those of A.
    # This prevents ('Amanda P.','Brown') and ('A. Patrick','Brown')
    # from matching.
    # (wa, wb avoid shadowing the parameters a and b.)
    if not (all(wa or not wb for wa, wb in expanded)
            or all(wb or not wa for wa, wb in expanded)):
        return 0.

    sumscores = max(min(sumscores, 1), 0)
    return sumscores

コード例 #16

0

ファイルを表示

ファイル: name.py プロジェクト: Lysxia/dissemin

def name_similarity(a, b):
    """
    Returns a float: how similar are these two names?

    `a` and `b` are (first, last) pairs. When either one is empty or
    does not contain exactly two items, False is returned. Otherwise
    the result lies in [0, 1]: 0 if the last names differ once passed
    through iunaccent, if the words of the first names cannot be
    matched pairwise in order or in reverse order, or if neither
    name's expanded words include the other's.

    Examples:

    >>> int(10*name_similarity(('Robin', 'Ryder'),('Robin', 'Ryder')))
    8
    >>> int(10*name_similarity(('Robin', 'Ryder'),('R.', 'Ryder')))
    4
    >>> int(10*name_similarity(('R.', 'Ryder'),('R.', 'Ryder')))
    4
    >>> int(10*name_similarity(('Robin J.', 'Ryder'),('R.', 'Ryder')))
    3
    >>> int(10*name_similarity(('Robin J.', 'Ryder'),('R. J.', 'Ryder')))
    8
    >>> int(10*name_similarity(('R. J.', 'Ryder'),('J.', 'Ryder')))
    3
    >>> int(10*name_similarity(('Robin', 'Ryder'),('Robin J.', 'Ryder')))
    7
    >>> int(10*name_similarity(('W. Timothy','Gowers'), ('Timothy','Gowers')))
    7
    >>> int(10*name_similarity(('Robin K.','Ryder'), ('Robin J.', 'Ryder')))
    0
    >>> int(10*name_similarity(('Claire', 'Mathieu'),('Claire', 'Kenyon-Mathieu')))
    0
    >>> int(10*name_similarity(('Amanda P.','Brown'),('Patrick','Brown')))
    0
    """

    # Guard against missing names and against sequences that are not
    # pairs, which would otherwise break the unpacking below.
    if not a or not b or len(a) != 2 or len(b) != 2:
        return False
    (firstA, lastA) = a
    (firstB, lastB) = b
    firstA = iunaccent(firstA)
    firstB = iunaccent(firstB)
    lastA = iunaccent(lastA)
    lastB = iunaccent(lastB)
    if lastA != lastB:
        return 0.
    partsA, _ = split_name_words(firstA)
    partsB, _ = split_name_words(firstB)
    # list() is required: the pairs are traversed by all() and later
    # passed to len() and indexed, which a Python 3 zip iterator cannot
    # do (and it would already be exhausted).
    parts = list(zip(partsA, partsB))
    if not all(map(match_first_names, parts)):
        # Try to match in reverse
        partsA.reverse()
        partsB.reverse()
        parts = list(zip(partsA, partsB))
        if not all(map(match_first_names, parts)):
            return 0.

    maxlen = max(len(partsA), len(partsB))
    sumscores = 0
    # For each word position: (A's word is expanded, B's word is
    # expanded), "expanded" meaning longer than one character.
    expanded = []
    for i in range(maxlen):
        if i < len(parts):
            sumscores += weight_first_names(parts[i])
            expanded.append((len(partsA[i]) > 1, len(partsB[i]) > 1))
        elif i < len(partsA):
            # Unpaired word of A: subtract a quarter of its weight.
            sumscores -= 0.25*weight_first_name(partsA[i])
            expanded.append((len(partsA[i]) > 1, False))
        else:
            # Unpaired word of B: subtract a quarter of its weight.
            sumscores -= 0.25*weight_first_name(partsB[i])
            expanded.append((False, len(partsB[i]) > 1))

    # Make sure the expanded first names of A are included in those of B,
    # or that those of B are included in those of A.
    # This prevents ('Amanda P.','Brown') and ('A. Patrick','Brown')
    # from matching.
    # (wa, wb are used so the parameters a and b are not shadowed.)
    if not (all(wa or not wb for wa, wb in expanded) or
            all(wb or not wa for wa, wb in expanded)):
        return 0.

    sumscores = max(min(sumscores, 1), 0)
    return sumscores