Ejemplo n.º 1
0
import similarity_overlap
import similarity_overlap_idf
import similarity_utils

from pprint import pprint


# Demo script: loads two sentence lists, scores one sentence pair with the
# overlap similarity, then builds pairwise score tables with
# similarity_utils.iterate_combination_2d_sim.


def _report(label, value):
    """Print ``label`` and ``value`` separated by a single space.

    Produces the same text as the Python 2 statement ``print label, value``,
    but passes one pre-formatted string to ``print(...)`` so the line parses
    and prints identically under both Python 2 and Python 3.
    """
    print("%s %s" % (label, value))


# log file
# NOTE(review): opened in binary write mode but neither written to nor closed
# in the visible part of this script -- presumably used further down; confirm,
# and make sure it is closed once logging is done.
logFile = open('mylogfile.txt', 'wb')

list1 = similarity_utils.load_sentences('data_not_sell')
list2 = similarity_utils.load_sentences('data_sell_share')
_report("len(list1):", len(list1))
_report("len(list2):", len(list2))

# Fixed sample pair; requires list1 to hold at least 2 entries and list2 at
# least 3, otherwise the indexing below raises IndexError.
sentence1 = list1[1]
sentence2 = list2[2]

# test similarity from sentence
score1 = similarity_overlap.sim_overlap(sentence1, sentence2)
_report("sim(list1[1], list2[2])  :", score1)
# test iterate_combination_2d_sim
# NOTE(review): presumably score_array[i][j] is sim_overlap(list1[i], list2[j]),
# so the value printed below should equal score1 -- confirm against
# similarity_utils.
score_array = similarity_utils.iterate_combination_2d_sim(list1, list2, similarity_overlap.sim_overlap)
_report("score_array[1][2]        :", score_array[1][2])


# Test for combined_list
combined_list = list1+list2

# Sim_overlap
# Scores the combined list against itself using the same overlap similarity.
score_array_overlap = similarity_utils.iterate_combination_2d_sim(combined_list, combined_list, similarity_overlap.sim_overlap)
Ejemplo n.º 2
0
import re
from nltk import word_tokenize as wt
from nltk.text import TextCollection

from similarity_utils import load_sentences

# Create the TextCollection used for the IDF calculation.
# These statements run at import time: the 'train_all' sentences are loaded
# via load_sentences and wrapped in an nltk TextCollection, which
# sim_overlap_idf queries through text_collection.idf(...).
# NOTE(review): the collection is built directly from whatever load_sentences
# returns -- presumably each entry should be a tokenized text for idf() to
# count documents per word; confirm the return shape of load_sentences.
list_all_sentences = load_sentences('train_all')
text_collection = TextCollection(list_all_sentences)


# Calculate sentence similarity based on overlap_idf, i.e.
# Sim = ( |Q intersect R| / |Q| ) * Sum(idf_w) for w in (Q intersect R)
def sim_overlap_idf(sentence1, sentence2): 
    # remove punctuation
    nopunct_sentence1 = ''.join([c for c in sentence1 
                                        if re.match("[a-z\-\' \n\t]", c)])
    nopunct_sentence2 = ''.join([c for c in sentence2 
                                        if re.match("[a-z\-\' \n\t]", c)])                                         
    # tokenize
    line1 = wt(nopunct_sentence1)
    line2 = wt(nopunct_sentence2)
    
    # intersection: (Q intersect R)
    intersection = set(line1) & set(line2)
    # calculate sum of idfs: Sum(idf_w) for w in (Q intersect R)
    sum_idf = 0.0
    for item in intersection:
        idf = text_collection.idf(item)
        sum_idf += idf