# Ad-hoc driver script that exercises the sentence-similarity helpers on two
# sentence collections and prints a few spot-check values.
# Written with Python 2 print-statement syntax.
import similarity_overlap
import similarity_overlap_idf
import similarity_utils
from pprint import pprint

# log file
# NOTE(review): opened for binary writing, but neither written to nor closed
# in the portion of the script visible here -- confirm it is used and closed
# further down.
logFile = open('mylogfile.txt', 'wb')

# Load the two sentence collections to compare.
# NOTE(review): presumably each call returns an indexable sequence of
# sentences for the named data set -- verify against
# similarity_utils.load_sentences.
list1 = similarity_utils.load_sentences('data_not_sell')
list2 = similarity_utils.load_sentences('data_sell_share')
print "len(list1):", len(list1)
print "len(list2):", len(list2)

# Pick one sentence from each collection for a single-pair spot check.
sentence1 = list1[1]
sentence2 = list2[2]

# test similarity from sentence
score1 = similarity_overlap.sim_overlap(sentence1, sentence2)
print "sim(list1[1], list2[2]) :", score1

# test iterate_combination_2d_sim
# The same similarity function is handed to the helper together with both
# lists; entry [1][2] is printed so it can be compared by eye with score1.
# NOTE(review): presumably score_array[i][j] is sim(list1[i], list2[j]), so
# the two printed values should match -- TODO confirm.
score_array = similarity_utils.iterate_combination_2d_sim(list1, list2, similarity_overlap.sim_overlap)
print "score_array[1][2] :", score_array[1][2]

# Test for combined_list
# Concatenate both collections into one list.
combined_list = list1+list2

# Sim_overlap
# The combined list is passed as both arguments.
# NOTE(review): presumably this yields scores for every pair of sentences in
# combined_list -- TODO confirm against iterate_combination_2d_sim.
score_array_overlap = similarity_utils.iterate_combination_2d_sim(combined_list, combined_list, similarity_overlap.sim_overlap)
import re from nltk import word_tokenize as wt from nltk.text import TextCollection from similarity_utils import load_sentences # create the textcollection for calculation of IDF list_all_sentences = load_sentences('train_all') text_collection = TextCollection(list_all_sentences) # Calculate sentence similarity base on overlap_idf, i.e. # Sim = ( |Q intersect R| / |Q| ) * Sum(idf_w) for w in (Q intersect R) def sim_overlap_idf(sentence1, sentence2): # remove punctuation nopunct_sentence1 = ''.join([c for c in sentence1 if re.match("[a-z\-\' \n\t]", c)]) nopunct_sentence2 = ''.join([c for c in sentence2 if re.match("[a-z\-\' \n\t]", c)]) # tokenize line1 = wt(nopunct_sentence1) line2 = wt(nopunct_sentence2) # intersection: (Q intersect R) intersection = set(line1) & set(line2) # calculate sum of idfs: Sum(idf_w) for w in (Q intersect R) sum_idf = 0.0 for item in intersection: idf = text_collection.idf(item) sum_idf += idf