Beispiel #1
0
def similarity(s1, s2):
    """Log and return a WordNet-smoothed cosine similarity of two sentences.

    Both sentences are passed through ``process_line_msrp`` and split on
    single spaces.  Each sentence becomes a binary bag-of-words vector over
    the union vocabulary, and the score is::

        (v1 . M . v2) / (|v1| * |v2|)

    where ``M[i][j]`` is 1 for words that are equal ignoring case, the
    Wu-Palmer similarity of the words' first WordNet synsets when both words
    are alphabetic, have synsets with the same part of speech and score at
    least 0.8, and 0 otherwise.

    The result is written through ``mylog`` as ``"1#<value>"`` followed by a
    newline, and is also returned.

    :param s1: first sentence (space-separated tokens).
    :param s2: second sentence (space-separated tokens).
    :return: the similarity score.
    """
    s1_word = process_line_msrp(s1).split(" ")
    s2_word = process_line_msrp(s2).split(" ")

    union_s = list(set(s2_word).union(set(s1_word)))
    n = len(union_s)

    # Binary membership vectors over the union vocabulary; sets keep the
    # membership test O(1) per word.
    s1_set = set(s1_word)
    s2_set = set(s2_word)
    s1_vector = np.array([1.0 if word in s1_set else 0.0 for word in union_s])
    s2_vector = np.array([1.0 if word in s2_set else 0.0 for word in union_s])

    # Words are compared case-insensitively.  The first synset of every
    # alphabetic word is looked up once here instead of once per word pair;
    # None marks "not alphabetic" or "unknown to WordNet".
    lowered = [word.lower() for word in union_s]
    first_synset = []
    for word in lowered:
        synsets = wn.synsets(word) if word.isalpha() else []
        first_synset.append(synsets[0] if synsets else None)

    matrix_wup = np.zeros((n, n))
    for i, w1 in enumerate(lowered):
        for j, w2 in enumerate(lowered):
            # A positive entry was already filled in (mirrored) from the
            # symmetric pair, so there is nothing left to compute.
            if matrix_wup[i][j] > 0:
                continue

            # Covers the diagonal as well as case-only differences.
            if w1 == w2:
                matrix_wup[i][j] = 1
                continue

            synset1 = first_synset[i]
            synset2 = first_synset[j]
            if synset1 is None or synset2 is None:
                continue  # entry stays 0
            if synset1.pos() != synset2.pos():
                continue  # entry stays 0

            wup_sim = synset1.wup_similarity(synset2)
            # wup_similarity may return None when no common ancestor is
            # found; treat that like any score below the 0.8 threshold.
            if wup_sim is None or wup_sim < 0.8:
                wup_sim = 0.0
            elif math.isnan(wup_sim):
                # Diagnostic only: report the offending word pair.
                print("%s %s" % (w1, w2))

            matrix_wup[i][j] = wup_sim
            matrix_wup[j][i] = wup_sim

    similarity_value = np.dot(np.dot(s1_vector, matrix_wup), s2_vector) / (
        np.linalg.norm(s1_vector) * np.linalg.norm(s2_vector))

    mylog("1#" + str(similarity_value))
    mylog("\n")
    return similarity_value
Beispiel #2
0
    mylog("\n")


if __name__ == '__main__':
    # Quick manual check (kept for reference):
    #   s1 = s2 = "dog eat cat"
    #   s1 = CorpusUtil.pre_process_msrp(" ".join(nltk.word_tokenize(s1)))
    #   s2 = CorpusUtil.pre_process_msrp(" ".join(nltk.word_tokenize(s2)))

    # Each row is tab-separated: column 0 is an integer flag and columns 3
    # and 4 hold the two sentences to compare.
    # The context manager closes the file even if a row fails to parse, and
    # iterating the handle avoids loading the whole file into memory.
    with open(msrp_train_file) as train_file:
        for line_no, line in enumerate(train_file, 1):
            line = line.strip()
            if not line:
                # A blank line has no fields to parse; skip it instead of
                # crashing on int('').
                continue

            split_line = line.split("\t")
            flag = int(split_line[0])
            s1 = split_line[3].strip()
            s2 = split_line[4].strip()

            # Re-join the NLTK tokens with single spaces, which is the
            # separator similarity() splits on.
            s1 = " ".join(nltk.word_tokenize(s1))
            s2 = " ".join(nltk.word_tokenize(s2))

            # The flag is logged first; similarity() then logs its score.
            mylog(str(flag) + " ")
            similarity(s1, s2)
            # Progress output; formatted so it prints identically under
            # Python 2 and Python 3.
            print("%d 行......" % line_no)