コード例 #1
0
 def test_identity(self, text_pairs):
     """Every text compared with itself has levenshtein similarity ~1.0."""
     for first, second in text_pairs:
         for text in (first, second):
             score = similarity.levenshtein(text, text)
             assert score == pytest.approx(1.0, rel=1e-3)
コード例 #2
0
def quality_metric(lines, gold):
    """
    Compare a list of text lines to the ``gold standard`` (ground truth)
    for a given receipt. This metric takes into account both the character
    level OCR quality and the recall of lines.

    The key method here is that of the ``fuzzy_compare_iter`` function with
    a levenshtein (edit) distance and 2 lists of strings.

    By using ``fuzzy_compare_iter`` and then filtering values with a placeholder
    we can then check how many rows were recalled 'close enough' (using the default
    threshold value in ``fuzzy_compare_iter``). TODO: Add parametrisation of this fn.

    Once we have the lines we recalled, we then use levenshtein distance to
    compare them and use that as our precision metric.

    Finally the number of lines recalled 'close enough' divided by the number of
    lines in the gold standard is our ``recall`` approximation, and precision is
    measured by how well the characters in each of the recalled lines match
    using levenshtein distance across the whole string.

    To get the final metric we use a harmonic mean of precision and recall.

    Parameters
    ----------
    lines : list[str]
        list of text strings

    gold : list[str]
        list of text strings representing ground truth

    Returns
    -------
    float
        an approximation of a makeshift F1-score

    Raises
    ------
    ZeroDivisionError
        when ``gold`` is empty, no line was recalled, or precision is 0 —
        the harmonic mean is undefined in all of these cases.
    """
    # Pair each input line with its closest gold line; rows with no match
    # close enough get the "N/A" placeholder and are filtered out.
    matched = [
        (line, best)
        for line, best in zip(
            lines,
            fuzzy_compare_iter(lines, gold, function=levenshtein, fill_val="N/A"),
        )
        if best != "N/A"
    ]

    # Guard before np.mean: np.mean([]) would emit a RuntimeWarning and
    # return NaN (which is truthy, silently defeating the check below).
    if not gold or not matched:
        raise ZeroDivisionError(
            "recall or precision is 0, can't compute the final score")

    precision = np.mean([levenshtein(line, best) for line, best in matched])
    recall = len(matched) / len(gold)
    if precision and recall:
        # Harmonic mean of precision and recall (F1).
        return 2 / ((1 / precision) + (1 / recall))

    else:
        raise ZeroDivisionError(
            "recall or precision is 0, can't compute the final score")
コード例 #3
0
def are_nouns_similar(noun1, noun2):
    """Return True when the levenshtein similarity of the nouns exceeds 0.42.

    Only the levenshtein score participates in the decision; the other
    metrics the original computed (jaccard, jaro_winkler, hamming,
    token_sort_ratio, dice_coefficient) were never used, so the unused
    locals are dropped. Returns an explicit bool instead of falling
    through to an implicit ``None`` (same truthiness for callers).
    """
    # jaccard, jaro_winkler, hamming, token_sort_ratio
    return levenshtein(noun1, noun2) > 0.42
コード例 #4
0
def editMatrix(xs):
    """
    Create edit distance matrix

    One row per element of ``strip(xs)``, one column per element of ``xs``.

    Parameters
    ----------
    xs : list[str]
        text to create edit distance from

    Returns
    -------
    numpy.ndarray
        the distance array
    """
    rows = []
    for stripped in strip(xs):
        rows.append([levenshtein(stripped, raw) for raw in xs])
    return np.array(rows)
コード例 #5
0
def condition(xs):
    """
    Average edit distance on a cluster

    Parameters
    ----------
    xs : list[str]
         list of texts to calculate the edit distances from

    Returns
    -------
    float
    """
    cleaned = strip(xs)
    if len(cleaned) <= 1:
        # levenshtein returns 1 if they are identical
        return 0.0
    pair_scores = [levenshtein(a, b)
                   for a, b in itertools.combinations(cleaned, 2)]
    return np.mean(pair_scores)
コード例 #6
0
def editMatrix(xs: List[str]) -> Array:
    "pairwise edit-distance matrix: one row per stripped text, one column per raw text"
    rows = [[levenshtein(left, right) for right in xs] for left in strip(xs)]
    return np.array(rows)
コード例 #7
0
def condition(xs: List[str]) -> float:
    "average edit distance on cluster"
    cleaned = strip(xs)
    if len(cleaned) <= 1:
        return 0.0  # levenshtein returns 1 if they are identical
    return np.mean(
        [levenshtein(a, b) for a, b in itertools.combinations(cleaned, 2)])
コード例 #8
0
ファイル: test_similarity.py プロジェクト: zf109/textacy
def test_levenshtein(text1, text2):
    """The fixture pair's levenshtein similarity matches the known value."""
    expected = 0.3589743589743589
    assert similarity.levenshtein(text1, text2) == expected
コード例 #9
0
def are_nouns_similar(noun1, noun2):
    """Return True when the levenshtein similarity of the nouns exceeds 0.72.

    Returns an explicit bool; the original fell through and returned an
    implicit ``None`` for the dissimilar case (same truthiness for callers).
    """
    # jaccard, jaro_winkler, hamming, token_sort_ratio
    return levenshtein(noun1, noun2) > 0.72
コード例 #10
0
 def test_empty(self, text_pairs):
     """Comparing any text against the empty string yields exactly 0.0."""
     for text, _ in text_pairs:
         assert similarity.levenshtein(text, "") == 0.0
コード例 #11
0
 def test_default(self, text_pairs):
     """Levenshtein similarity always lies in the closed interval [0, 1]."""
     for left, right in text_pairs:
         score = similarity.levenshtein(left, right)
         assert 0.0 <= score <= 1.0
コード例 #12
0
ファイル: select.py プロジェクト: finnkauski/woffle
def edCond(xs: List[str]) -> float:
    """Average pairwise edit distance on a cluster.

    Returns 0 for clusters of zero or one element, where no pair exists.

    Fixes over the original:
    - docstring typo ("aver1age");
    - empty input raised ZeroDivisionError (the guard was ``== 1``, not ``<= 1``);
    - the sum of pair distances was divided by ``len(xs)`` rather than the
      number of pairs, so the result was not the average the docstring
      promised (the sibling ``condition`` helpers take the mean over
      ``itertools.combinations``, confirming the pair-mean intent).
    """
    if len(xs) <= 1:
        return 0
    pairs = list(itertools.combinations(xs, 2))
    return sum(levenshtein(a, b) for a, b in pairs) / len(pairs)
コード例 #13
0
def similar_resumo(resumo_referencia, lista_resumos_escritorio):
    """Yield ``(index, similarity)`` for each summary against the reference.

    Uses ``enumerate`` instead of ``list.index``: ``.index()`` is an O(n)
    scan per element and always returns the FIRST occurrence, producing
    wrong indices when the list contains duplicate summaries.
    """
    # referencia = nlp(resumo_referencia)
    for idx, resumo in enumerate(lista_resumos_escritorio):
        yield (idx, similarity.levenshtein(resumo_referencia, resumo))