Example 1
from collections import defaultdict

import pandas as pd
from textacy.similarity import token_sort_ratio  # assumed import; returns scores in [0.0, 1.0]


def main(filepath):
    # Map each inventory number to the (dataset, record, actType) tuples
    # whose owner names are a close fuzzy match.
    linkset = defaultdict(set)

    df = pd.read_csv(filepath)

    for r in df.to_dict(orient='records'):
        dataset = r['dataset']
        a = r['owner_a']
        b = r['owner_b']

        # token_sort_ratio compares the tokens after sorting them, so the
        # score is insensitive to word order in the two names.
        p = token_sort_ratio(a, b)
        if p > .8:
            print(round(p, 2), a, b, sep='\t')

            # Keep only the notarial act types of interest.
            if r['actType'] not in [
                    "Boedelinventaris", "Boedelscheiding", "Testament",
                    "Overig", "Huwelijkse voorwaarden", "Kwitantie"
            ]:
                continue

            linkset[r['inventory']].add(
                (r['dataset'], r['record'], r['actType']))

    return linkset
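The function expects a CSV whose rows contain at least the columns read above (dataset, record, inventory, actType, owner_a, owner_b). A minimal sketch of how it might be called; the file name and column contents are hypothetical:

# Hypothetical usage; 'deeds.csv' and its contents are illustrative only.
links = main('deeds.csv')
for inventory, records in links.items():
    print(inventory, len(records), 'linked acts')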
Example 2
def are_nouns_similar(noun1, noun2):
    # jaccard, jaro_winkler, levenshtein, hamming, token_sort_ratio and
    # dice_coefficient are assumed to be imported from a string-similarity
    # library returning normalized scores in [0.0, 1.0].
    jaccardD = jaccard(noun1, noun2)
    jaro = jaro_winkler(noun1, noun2)
    lev = levenshtein(noun1, noun2)
    hammingD = hamming(noun1, noun2)
    tsr = token_sort_ratio(noun1, noun2)
    dice = dice_coefficient(noun1, noun2)
    # Only the Levenshtein similarity drives the decision; the other metrics
    # are computed but never used here.
    if lev > 0.42:
        return True
    return False
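Since only the Levenshtein score decides the outcome, the same behavior can be written as a single comparison. A minimal sketch, assuming the same normalized levenshtein helper in [0.0, 1.0] (the function name is made up for illustration):

def are_nouns_similar_minimal(noun1, noun2):
    # Equivalent decision using only the metric the original actually checks.
    return levenshtein(noun1, noun2) > 0.42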
Example 3
# token_sort_ratio is assumed to be imported from a string-similarity helper
# (e.g. textacy.similarity) that returns scores in [0.0, 1.0].
def aggregate_term_variants(terms,
                            acro_defs=None,
                            fuzzy_dedupe=True):
    """
    Take a set of unique terms and aggregate terms that are symbolic, lexical,
    and ordering variants of each other, as well as acronyms and fuzzy string matches.

    Args:
        terms (Set[str]): set of unique terms with potential duplicates
        acro_defs (dict): if not None, terms that are acronyms will be
            aggregated with their definitions and terms that are definitions will
            be aggregated with their acronyms
        fuzzy_dedupe (bool): if True, fuzzy string matching will be used
            to aggregate similar terms of a sufficient length

    Returns:
        List[Set[str]]: each item is a set of aggregated terms

    Notes:
        Partly inspired by aggregation of variants discussed in
        Park, Youngja, Roy J. Byrd, and Branimir K. Boguraev.
        "Automatic glossary extraction: beyond terminology identification."
        Proceedings of the 19th international conference on Computational linguistics-Volume 1.
        Association for Computational Linguistics, 2002.
    """
    agg_terms = []
    seen_terms = set()
    for term in sorted(terms, key=len, reverse=True):

        if term in seen_terms:
            continue

        variants = set([term])
        seen_terms.add(term)

        # symbolic variations
        if '-' in term:
            variant = term.replace('-', ' ').strip()
            if variant in terms.difference(seen_terms):
                variants.add(variant)
                seen_terms.add(variant)
        if '/' in term:
            variant = term.replace('/', ' ').strip()
            if variant in terms.difference(seen_terms):
                variants.add(variant)
                seen_terms.add(variant)

        # lexical variations
        term_words = term.split()
        # last_word = term_words[-1]
        # # assume last word is a noun
        # last_word_lemmatized = lemmatizer.lemmatize(last_word, 'n')
        # # if the same, either already a lemmatized noun OR a verb; try verb
        # if last_word_lemmatized == last_word:
        #     last_word_lemmatized = lemmatizer.lemmatize(last_word, 'v')
        # # if at least we have a new term... add it
        # if last_word_lemmatized != last_word:
        #     term_lemmatized = ' '.join(term_words[:-1] + [last_word_lemmatized])
        #     if term_lemmatized in terms.difference(seen_terms):
        #         variants.add(term_lemmatized)
        #         seen_terms.add(term_lemmatized)

        # if term is an acronym, add its definition
        # if term is a definition, add its acronym
        if acro_defs:
            for acro, def_ in acro_defs.items():
                if acro.lower() == term.lower():
                    variants.add(def_.lower())
                    seen_terms.add(def_.lower())
                    break
                elif def_.lower() == term.lower():
                    variants.add(acro.lower())
                    seen_terms.add(acro.lower())
                    break

        # if a term of 3+ words differs by one word at the start or the end
        # of a longer phrase, aggregate
        if len(term_words) > 2:
            term_minus_first_word = ' '.join(term_words[1:])
            term_minus_last_word = ' '.join(term_words[:-1])
            if term_minus_first_word in terms.difference(seen_terms):
                variants.add(term_minus_first_word)
                seen_terms.add(term_minus_first_word)
            if term_minus_last_word in terms.difference(seen_terms):
                variants.add(term_minus_last_word)
                seen_terms.add(term_minus_last_word)
            # check for "X of Y" <=> "Y X" term variants
            if ' of ' in term:
                split_term = term.split(' of ')
                variant = split_term[1] + ' ' + split_term[0]
                if variant in terms.difference(seen_terms):
                    variants.add(variant)
                    seen_terms.add(variant)

        # intense de-duping for sufficiently long terms
        if fuzzy_dedupe is True and len(term) >= 13:
            for other_term in sorted(terms.difference(seen_terms), key=len, reverse=True):
                if len(other_term) < 13:
                    break
                tsr = token_sort_ratio(term, other_term)
                if tsr > 0.93:
                    variants.add(other_term)
                    seen_terms.add(other_term)
                    break

        agg_terms.append(variants)

    return agg_terms
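A quick, hypothetical illustration of the aggregation: the one-word-longer phrase should be grouped with its shorter form, and the hyphenated term with its space-separated variant. The input terms below are made up:

# Hypothetical input; exact grouping depends on which variants co-occur in the set.
terms = {
    "glossary extraction",
    "automatic glossary extraction",   # minus its first word, equals the term above
    "term-variant aggregation",        # symbolic '-' variant of the term below
    "term variant aggregation",
}
for group in aggregate_term_variants(terms):
    print(sorted(group))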
Example 4
def test_empty(self, text_pairs):
    for text1, text2 in text_pairs:
        assert similarity.token_sort_ratio(text1, "") == 0.0
Example 5
def test_identity(self, text_pairs):
    for text1, text2 in text_pairs:
        assert similarity.token_sort_ratio(text1, text1) == pytest.approx(1.0, rel=1e-3)
        assert similarity.token_sort_ratio(text2, text2) == pytest.approx(1.0, rel=1e-3)
Example 6
def test_default(self, text_pairs):
    for text1, text2 in text_pairs:
        assert 0.0 <= similarity.token_sort_ratio(text1, text2) <= 1.0
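The three tests above rely on a text_pairs fixture and a similarity module that the snippets do not show. A minimal sketch of the missing pieces, assuming textacy's similarity module and made-up sentence pairs:

import pytest
from textacy import similarity  # assumption: any module whose token_sort_ratio returns scores in [0.0, 1.0] works


@pytest.fixture
def text_pairs():
    # Made-up pairs for illustration; the real fixture is not shown above.
    return [
        ("the quick brown fox jumps", "jumps the brown quick fox"),
        ("token sort ratio", "ratio sort token"),
    ]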