Example #1
0
def test_jaro_distance(jf, s1, s2, value):
    """Check ``jf.jaro_distance(s1, s2)`` against the expected ``value``.

    ``value`` is passed through ``float`` before comparison, and the
    comparison is delegated to ``assertAlmostEqual`` with ``places=3``.
    """
    expected = float(value)
    actual = jf.jaro_distance(s1, s2)
    assertAlmostEqual(actual, expected, places=3)
Example #2
0
def test_jaro_distance_type(jf):
    """Text input is scored; bytes input must raise ``TypeError``."""
    # Two identical text strings must compare equal to 1.
    identical_score = jf.jaro_distance(u'abc', u'abc')
    assert identical_score == 1

    # Bytes arguments must be rejected, and the error message has to
    # contain the word 'expected'.
    with pytest.raises(TypeError) as raised:
        jf.jaro_distance(b'abc', b'abc')
    message = str(raised.value)
    assert 'expected' in message
Example #3
0
def test_jaro_distance_type(jf):
    """Verify the argument-type contract of ``jf.jaro_distance``.

    Identical text strings must yield 1, while bytes strings must raise a
    ``TypeError`` whose message contains 'expected'.
    """
    text_result = jf.jaro_distance(u'abc', u'abc')
    assert text_result == 1

    with pytest.raises(TypeError) as err_info:
        jf.jaro_distance(b'abc', b'abc')

    error_text = str(err_info.value)
    assert 'expected' in error_text
Example #4
0
def test_jaro_distance(jf, s1, s2, value):
    """Compare the Jaro result for ``s1``/``s2`` with ``value``.

    The expected ``value`` is converted with ``float`` first; the check
    itself is done by ``assertAlmostEqual`` using ``places=3``.
    """
    wanted = float(value)
    got = jf.jaro_distance(s1, s2)
    assertAlmostEqual(got, wanted, places=3)
Example #5
0
def _matching_run(tokens1, tokens2):
    """Count and weigh the leading run of pairwise-matching tokens.

    Walks both iterables in lockstep and stops at the first pair that is
    neither equal nor accepted by ``match_rating_comparison``.

    Returns:
        A ``(count, weight)`` tuple: the number of matched pairs and the sum
        of ``weights.get(token, 0)`` over the matched tokens of ``tokens1``.
    """
    count = 0
    weight = 0.
    for tok1, tok2 in zip(tokens1, tokens2):
        if not (tok1 == tok2 or match_rating_comparison(tok1, tok2)):
            break
        count += 1
        weight += weights.get(tok1, 0)
    return count, weight


def _lower_tokens(doc, attr, tag):
    """Return the set of ``lower_`` forms of tokens whose ``attr`` equals ``tag``."""
    return {tok.lower_ for tok in doc if getattr(tok, attr) == tag}


def _lemmatized_svos(doc):
    """Return the lower-cased, lemmatized triples produced by ``findSVOs(doc)``.

    Only the first three items of every result are used, which the caller
    treats as subject, verb and object.
    """
    lowered = {(svo[0].lower(), svo[1].lower(), svo[2].lower())
               for svo in findSVOs(doc)}
    return {(wnl.lemmatize(subj), wnl.lemmatize(verb), wnl.lemmatize(obj))
            for subj, verb, obj in lowered}


def load_data(row):
    """Build the pairwise feature vector for the two texts in ``row``.

    ``row[0]`` and ``row[1]`` are converted with ``str``; the string
    distances are computed on their lower-cased forms, and the remaining
    features on the documents returned by ``parser`` (presumably a spaCy
    pipeline -- TODO confirm).  Every set-based feature comes from the
    external ``feat`` helper, whose three results are used here as the
    "num", "val" and "rate" of the two compared sets.

    Args:
        row: An indexable object whose first two items are the texts to
            compare.

    Returns:
        A flat list of 51 values in three consecutive groups:

        * 19 "num" values: entity labels, entity texts, matching-suffix
          length, matching-prefix length, the Levenshtein, Jaro and
          Damerau-Levenshtein results, then nsubj, ROOT, advmod, advcl,
          aux, NOUN, VERB, ADV, SVO triples, subjects, verbs, objects;
        * 17 "val" values: entity labels, entity texts, suffix, prefix,
          nsubj, ROOT, advmod, advcl, aux, dobj, NOUN, VERB, ADV, SVO,
          subjects, verbs, objects;
        * 15 "rate" values: the same as the "val" group without the
          suffix and prefix entries.
    """
    text1 = str(row[0])
    text2 = str(row[1])
    lower1 = text1.lower()
    lower2 = text2.lower()

    # Character-level string distances on the lower-cased texts.
    lev_dist = Levenshtein.distance(lower1, lower2)
    jar_dist = jaro_distance(lower1, lower2)
    dam_dist = damerau_levenshtein_distance(lower1, lower2)

    q1 = parser(text1)
    q2 = parser(text2)

    # Named entities: first compared by label, then by surface text.
    num_ent, val_ent, rate_ent = feat(
        {ent.label_.lower() for ent in q1.ents},
        {ent.label_.lower() for ent in q2.ents})
    num_ent2, val_ent2, rate_ent2 = feat(
        {' '.join(tok.orth_ for tok in ent) for ent in q1.ents},
        {' '.join(tok.orth_ for tok in ent) for ent in q2.ents})

    # Length/weight of the common run of non-punctuation tokens, measured
    # from the start ("for") and from the end ("clean2_rev") of both texts.
    words1 = [tok.lower_ for tok in q1 if tok.pos_ != 'PUNCT']
    words2 = [tok.lower_ for tok in q2 if tok.pos_ != 'PUNCT']
    num_for, val_for = _matching_run(words1, words2)
    num_clean2_rev, val_clean2_rev = _matching_run(
        reversed(words1), reversed(words2))

    # Tokens grouped by dependency label.
    num_sub, val_sub, rate_sub = feat(
        _lower_tokens(q1, 'dep_', 'nsubj'),
        _lower_tokens(q2, 'dep_', 'nsubj'))
    num_root, val_root, rate_root = feat(
        _lower_tokens(q1, 'dep_', 'ROOT'),
        _lower_tokens(q2, 'dep_', 'ROOT'))
    num_advmod, val_advmod, rate_advmod = feat(
        _lower_tokens(q1, 'dep_', 'advmod'),
        _lower_tokens(q2, 'dep_', 'advmod'))
    num_advcl, val_advcl, rate_advcl = feat(
        _lower_tokens(q1, 'dep_', 'advcl'),
        _lower_tokens(q2, 'dep_', 'advcl'))
    num_aux, val_aux, rate_aux = feat(
        _lower_tokens(q1, 'dep_', 'aux'),
        _lower_tokens(q2, 'dep_', 'aux'))
    # NOTE(review): the dobj count has never been part of the "num" group,
    # although its "val" and "rate" are returned.  This looks unintentional,
    # but adding it would change the length of the returned vector, so it is
    # deliberately left out -- confirm with downstream consumers first.
    _, val_dobj, rate_dobj = feat(
        _lower_tokens(q1, 'dep_', 'dobj'),
        _lower_tokens(q2, 'dep_', 'dobj'))

    # Tokens grouped by part of speech.
    num_noun, val_noun, rate_noun = feat(
        _lower_tokens(q1, 'pos_', 'NOUN'),
        _lower_tokens(q2, 'pos_', 'NOUN'))
    num_verb, val_verb, rate_verb = feat(
        _lower_tokens(q1, 'pos_', 'VERB'),
        _lower_tokens(q2, 'pos_', 'VERB'))
    num_adv, val_adv, rate_adv = feat(
        _lower_tokens(q1, 'pos_', 'ADV'),
        _lower_tokens(q2, 'pos_', 'ADV'))

    # Subject-verb-object triples, compared whole and per component.
    set_svo1 = _lemmatized_svos(q1)
    set_svo2 = _lemmatized_svos(q2)
    num_svo, val_svo, rate_svo = feat(set_svo1, set_svo2)

    num_s, val_s, rate_s = feat(
        {svo[0] for svo in set_svo1}, {svo[0] for svo in set_svo2})
    num_v, val_v, rate_v = feat(
        {svo[1] for svo in set_svo1}, {svo[1] for svo in set_svo2})
    num_o, val_o, rate_o = feat(
        {svo[2] for svo in set_svo1}, {svo[2] for svo in set_svo2})

    # The order below is the layout of the returned vector; keep it stable.
    nums = [
        num_ent, num_ent2, num_clean2_rev, num_for,
        lev_dist, jar_dist, dam_dist,
        num_sub, num_root, num_advmod, num_advcl, num_aux,
        num_noun, num_verb, num_adv,
        num_svo, num_s, num_v, num_o,
    ]
    vals = [
        val_ent, val_ent2, val_clean2_rev, val_for,
        val_sub, val_root, val_advmod, val_advcl, val_aux, val_dobj,
        val_noun, val_verb, val_adv,
        val_svo, val_s, val_v, val_o,
    ]
    rates = [
        rate_ent, rate_ent2,
        rate_sub, rate_root, rate_advmod, rate_advcl, rate_aux, rate_dobj,
        rate_noun, rate_verb, rate_adv,
        rate_svo, rate_s, rate_v, rate_o,
    ]
    return nums + vals + rates