import math
from collections import Counter, defaultdict

import ngram
import process
import utility


def get_dict_vectors_of_documents(documents, justTokens=False, scores=None, headers=None):
    """Build a feature dict (or a token pair, if justTokens) for each consecutive document pair."""
    init_doc_count = len(documents) // 2
    operated_doc_count = 0
    doc_dict_vectors_list = []
    for i in range(len(documents) // 2):
        # Report progress every 400 processed pairs.
        if operated_doc_count == 400:
            init_doc_count -= 400
            operated_doc_count = 0
            print(str(init_doc_count) + " sets remaining")
        operated_doc_count += 1
        document1, document2 = documents[2 * i], documents[2 * i + 1]
        sent_1_tokens = process.tokens(document1)
        sent_2_tokens = process.tokens(document2)
        if justTokens:
            doc_dict_vectors_list.append((sent_1_tokens, sent_2_tokens))
        else:
            dictionary_v = {}
            if headers is not None:
                # Attach the externally supplied per-pair scores under their header names.
                val = {}
                for idx, header in enumerate(headers):
                    val[header] = scores[i][idx]
                dictionary_v.update(val)
            dictionary_v.update(get_dict_vector_of_2_sentences(sent_1_tokens, sent_2_tokens))
            doc_dict_vectors_list.append(dictionary_v)
    return doc_dict_vectors_list
def jaccard_and_containment_coefficient_evaluate(analyze_type, documents, answers,
                                                 ngram_size=1, ngram_weighing=False,
                                                 IDFScores=None, f=None):
    """Score each document pair with Jaccard and containment coefficients and report the error."""
    containment_coefficient_predicted_answers = []
    jaccard_coefficient_predicted_answers = []
    init_doc_count = len(documents) // 2
    operated_doc_count = 0
    for i in range(len(documents) // 2):
        # Report progress every 400 processed pairs.
        if operated_doc_count == 400:
            init_doc_count -= 400
            operated_doc_count = 0
            print(str(init_doc_count) + " sets remaining")
        operated_doc_count += 1
        document1, document2 = documents[2 * i], documents[2 * i + 1]
        sent_1_tokens = process.tokens(document1)
        sent_2_tokens = process.tokens(document2)
        if analyze_type == "pos":
            jaccard_coefficient, containment_coefficient = ngram.POSTags_JaccardCoefficient_and_containment_coefficienct(
                sent_1_tokens, sent_2_tokens, ngram_size, ngram_weighing, IDFScores)
        elif analyze_type == "lemma":
            jaccard_coefficient, containment_coefficient = ngram.Lemma_JaccardCoefficient_and_containment_coefficienct(
                sent_1_tokens, sent_2_tokens, ngram_size, ngram_weighing, IDFScores)
        elif analyze_type == "character":
            jaccard_coefficient, containment_coefficient = ngram.character_ngram_JaccardCoefficient_and_containment_coefficienct(
                sent_1_tokens, sent_2_tokens, ngram_size, True, ngram_weighing, IDFScores)
        else:
            raise ValueError("unknown analyze_type: " + analyze_type)
        # Scale the [0, 1] coefficients onto the 0-5 similarity scale of the gold answers.
        jaccard_coefficient_predicted_answers.append(5 * jaccard_coefficient)
        containment_coefficient_predicted_answers.append(5 * containment_coefficient)
    jaccard_error = utility.evaluate(answers, jaccard_coefficient_predicted_answers)
    containment_error = utility.evaluate(answers, containment_coefficient_predicted_answers)
    print("Error in Estimate For Jaccard Coefficient is " + str(jaccard_error))
    print("Error in Estimate For Containment Coefficient is " + str(containment_error))
    if f is not None:
        f.write("Error in Estimate For Jaccard Coefficient is " + str(jaccard_error))
        f.write("Error in Estimate For Containment Coefficient is " + str(containment_error))
    return jaccard_coefficient_predicted_answers, containment_coefficient_predicted_answers
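# The two coefficients computed above differ only in their denominator:
# Jaccard = |A ∩ B| / |A ∪ B|, containment = |A ∩ B| / |A|. A minimal
# self-contained sketch of the unigram case, independent of the project's
# `ngram` module (the helper name and the use of plain token sets are
# illustration-only assumptions, not the module's weighted implementation):
def _jaccard_and_containment_sketch(tokens1, tokens2):
    a, b = set(tokens1), set(tokens2)
    overlap = len(a & b)
    return overlap / float(len(a | b)), overlap / float(len(a))

# Example: ["a", "b", "c"] vs ["b", "c", "d"]
#   -> Jaccard 2/4 = 0.5, containment 2/3 ≈ 0.67.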
def cosinesimilarity_without_TFIDF(document1, document2):
    """Cosine similarity between two documents using raw term counts."""
    vector1 = Counter(process.tokens(document1))
    vector2 = Counter(process.tokens(document2))
    # Euclidean norm of each count vector.
    len_vector_1 = math.sqrt(sum(v ** 2 for v in vector1.values()))
    len_vector_2 = math.sqrt(sum(v ** 2 for v in vector2.values()))
    return utility.dict_dotprod(vector1, vector2) / float(len_vector_1 * len_vector_2)
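# A self-contained sketch of the same computation, inlining a whitespace
# tokenizer and a dict dot product as stand-ins for the project's
# process.tokens / utility.dict_dotprod helpers (both stand-ins are
# assumptions for illustration, not the project's actual implementations):
def _cosine_sketch(doc1, doc2):
    v1, v2 = Counter(doc1.split()), Counter(doc2.split())
    dot = sum(v1[k] * v2[k] for k in v1 if k in v2)
    norm1 = math.sqrt(sum(v ** 2 for v in v1.values()))
    norm2 = math.sqrt(sum(v ** 2 for v in v2.values()))
    return dot / (norm1 * norm2)

# _cosine_sketch("big data", "big big data")
#   = (1*2 + 1*1) / (sqrt(2) * sqrt(5)) = 3 / sqrt(10) ≈ 0.949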
def cosinesimilarity(document1, document2, TFIDFScores):
    """Cosine similarity between two documents using TF-IDF-weighted vectors."""
    tokens1 = set(process.tokens(document1))
    tokens2 = set(process.tokens(document2))
    vector1 = DocvectorTFIDF(TFIDFScores, tokens1)
    vector2 = DocvectorTFIDF(TFIDFScores, tokens2)
    # Euclidean norm of each TF-IDF vector.
    len_vector_1 = math.sqrt(sum(v ** 2 for v in vector1.values()))
    len_vector_2 = math.sqrt(sum(v ** 2 for v in vector2.values()))
    return utility.dict_dotprod(vector1, vector2) / float(len_vector_1 * len_vector_2)
def CharacterIDFVector(documents, ngram_size=2):
    """Inverse document frequency of character n-grams across the corpus."""
    No_of_Documents = float(len(documents))
    IDFVector = Counter()
    for document in documents:
        tokens = process.tokens(document)
        # Count each n-gram at most once per document (document frequency).
        IDFVector += Counter(set(utility.character_ngram_vector_keys(tokens, ngram_size)))
    for key in IDFVector.keys():
        # Smoothed IDF: log(N / (1 + df)).
        IDFVector[key] = math.log(No_of_Documents / (1 + IDFVector[key]))
    return IDFVector
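# A worked example of the IDF weighting above (corpus size chosen for
# illustration): with N = 4 documents, a character bigram appearing in one
# document gets log(4/2) ≈ 0.69, while one appearing in all four gets
# log(4/5) ≈ -0.22. The 1 + df smoothing thus pushes ubiquitous n-grams
# slightly negative instead of dividing by df alone.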
def TFIDF(documents):
    """Compute corpus-wide TF-IDF scores, plus the vocabulary, per-document TF vectors, and IDF vector."""
    Vocabulary = Counter()
    DocVectors = []
    IDFVector = Counter()
    No_of_Documents = float(len(documents))
    n = len(documents)
    for document in documents:
        print(str(n) + " Documents remaining to process")
        n -= 1
        tf_single_doc_count = Counter(process.tokens(document))
        Vocabulary += tf_single_doc_count
        DocVectors.append(tf_single_doc_count)
        # Each term counts once per document toward document frequency.
        IDFVector += Counter(tf_single_doc_count.keys())
    for key in IDFVector.keys():
        # Smoothed IDF: log(N / (1 + df)).
        IDFVector[key] = math.log(No_of_Documents / (1 + IDFVector[key]))
    # TF-IDF score of a term = corpus-wide term frequency * IDF.
    TFIDFScores = defaultdict(lambda: 0.0,
                              {key: Vocabulary[key] * IDFVector[key] for key in Vocabulary})
    return TFIDFScores, Vocabulary, DocVectors, IDFVector
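# A hedged end-to-end usage sketch of the functions above. It assumes
# process.tokens accepts raw strings and that DocvectorTFIDF (referenced by
# cosinesimilarity but defined elsewhere in the project) is importable; the
# three-document corpus is made up for illustration:
if __name__ == "__main__":
    corpus = ["the cat sat", "the dog sat", "the cat ran"]
    tfidf_scores, vocabulary, doc_vectors, idf_vector = TFIDF(corpus)
    print(cosinesimilarity_without_TFIDF(corpus[0], corpus[1]))
    print(cosinesimilarity(corpus[0], corpus[1], tfidf_scores))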