def jaccard_and_containment_coefficient_evaluate(analyze_type, documents, answers, ngram_size=1, ngram_weighing=False, IDFScores=None, f=None):
    """Score consecutive document pairs with Jaccard and containment coefficients.

    ``documents`` is read as a flat sequence of pairs: items ``2*i`` and
    ``2*i + 1`` are compared with each other, and a trailing unpaired item is
    ignored.  Both documents are tokenised with ``process.tokens`` and handed
    to the ``ngram`` scorer selected by ``analyze_type``.  Each coefficient is
    multiplied by 5 before it is stored.

    Args:
        analyze_type: ``"pos"``, ``"lemma"`` or ``"character"``; selects the
            ``ngram`` scoring function.
        documents: Flat sequence of documents, two per pair.
        answers: Reference scores passed to ``utility.evaluate``.
        ngram_size: Forwarded unchanged to the scorer.
        ngram_weighing: Forwarded unchanged to the scorer.
        IDFScores: Forwarded unchanged to the scorer.
        f: Optional object with a ``write`` method.  When given, it receives
            the same two report strings that are printed.

    Returns:
        Tuple ``(jaccard_predictions, containment_predictions)``, one entry
        per document pair.

    Raises:
        ValueError: If ``analyze_type`` is not one of the supported values
            and there is at least one pair to score.
    """
    pair_total = len(documents) // 2
    jaccard_predictions = []
    containment_predictions = []

    for i in range(pair_total):
        # Progress report every 400 pairs.
        if i and i % 400 == 0:
            print(str(pair_total - i) + " sets remaining")

        sent_1_tokens = process.tokens(documents[2 * i])
        sent_2_tokens = process.tokens(documents[2 * i + 1])

        if analyze_type == "pos":
            jaccard, containment = ngram.POSTags_JaccardCoefficient_and_containment_coefficienct(
                sent_1_tokens, sent_2_tokens, ngram_size, ngram_weighing, IDFScores)
        elif analyze_type == "lemma":
            jaccard, containment = ngram.Lemma_JaccardCoefficient_and_containment_coefficienct(
                sent_1_tokens, sent_2_tokens, ngram_size, ngram_weighing, IDFScores)
        elif analyze_type == "character":
            # The character scorer takes one extra positional flag (always
            # True here); its meaning is defined in the ngram module.
            jaccard, containment = ngram.character_ngram_JaccardCoefficient_and_containment_coefficienct(
                sent_1_tokens, sent_2_tokens, ngram_size, True, ngram_weighing, IDFScores)
        else:
            # Without this, an unknown type hit an unbound local (or silently
            # reused the previous pair's coefficients).
            raise ValueError("Unknown analyze_type: %r" % (analyze_type,))

        jaccard_predictions.append(5 * jaccard)
        containment_predictions.append(5 * containment)

    jaccard_report = "Error in Estimate For Jaccard Coefficient is " + str(
        utility.evaluate(answers, jaccard_predictions))
    print(jaccard_report)
    # f defaults to None, so only write when a sink was actually supplied.
    # NOTE(review): no newline separates the two reports in the sink; kept
    # as-is because consumers of that output are not visible here.
    if f is not None:
        f.write(jaccard_report)

    containment_report = "Error in Estimate For Containment Coefficient is " + str(
        utility.evaluate(answers, containment_predictions))
    print(containment_report)
    if f is not None:
        f.write(containment_report)

    return jaccard_predictions, containment_predictions
def cosine_similarity_without_tfidf(documents):
    """Predict a score for every entry using plain (non-TF-IDF) cosine similarity.

    Each entry of ``documents`` is indexed as ``entry[0]`` and ``entry[1]``
    (the two texts passed to ``ngram.cosinesimilarity_without_TFIDF``) and
    ``entry[2]`` (the reference score).  The similarity is multiplied by 5,
    the error reported by ``utility.evaluate`` is printed, and the list of
    predictions is returned.
    """
    # Build (reference, prediction) per entry in a single pass so the
    # reference is read before the similarity is computed for that entry.
    gold_and_predicted = [
        (entry[2], 5 * ngram.cosinesimilarity_without_TFIDF(entry[0], entry[1]))
        for entry in documents
    ]
    answers = [pair[0] for pair in gold_and_predicted]
    predicted_answers = [pair[1] for pair in gold_and_predicted]

    error = utility.evaluate(answers, predicted_answers)
    print("Error in Estimate is " + str(error))
    return predicted_answers
def cosinesimilarity_evaluate_TFIDF(documents, TFIDFScores, answers):
    """Predict a score for each consecutive document pair via TF-IDF cosine similarity.

    ``documents`` is read as a flat sequence of pairs (items ``2*k`` and
    ``2*k + 1``); a trailing unpaired item is ignored.  Each similarity from
    ``ngram.cosinesimilarity`` is multiplied by 5.  The error reported by
    ``utility.evaluate(answers, predictions)`` is printed and the predictions
    are returned.
    """
    pair_count = len(documents) // 2
    predicted_answers = [
        5 * ngram.cosinesimilarity(documents[2 * k], documents[2 * k + 1], TFIDFScores)
        for k in range(pair_count)
    ]

    error = utility.evaluate(answers, predicted_answers)
    print("Error in Estimate is " + str(error))
    return predicted_answers
def w2vec_similarity_measure_unsupervised(documents, answers):
    """Predict pair similarity from sentence vectors and print the error.

    The model comes from ``w2vec_model()`` and the token pairs from
    ``utility.get_dict_vectors_of_documents(documents, justTokens=True)``.
    For every pair, both token lists are turned into vectors with
    ``computation_vec_for_sentence`` and compared with ``cossim_dense``; the
    result is multiplied by 5.

    Args:
        documents: Documents forwarded to ``utility.get_dict_vectors_of_documents``.
        answers: Reference scores passed to ``utility.evaluate``.

    Returns:
        The list of predicted scores, one per token pair.
    """
    model = w2vec_model()
    documents_tokens = utility.get_dict_vectors_of_documents(documents, justTokens=True)

    predicted_answers = []
    # Index-based access is kept on purpose: the container type returned by
    # the utility helper is not visible here, only that it supports len() and
    # integer indexing.
    for i in range(len(documents_tokens)):
        s1_tokens, s2_tokens = documents_tokens[i]
        s1_vector = computation_vec_for_sentence(s1_tokens, model)
        s2_vector = computation_vec_for_sentence(s2_tokens, model)
        predicted_answers.append(5 * cossim_dense(s1_vector, s2_vector))

    # Argument order (reference first, predictions second) now matches every
    # other utility.evaluate call in this file.
    error = utility.evaluate(answers, predicted_answers)
    print("Error in Estimation of Word2Vec similarity: " + str(error))
    return predicted_answers
def support_vector_machines(training_documents, test_documents, training_answers, test_answers, load, w2vec_model, use_w2_vec_model):
    """Fit an SVR on pair feature vectors and print training and test error.

    Args:
        training_documents: Training documents used to build feature dicts.
        test_documents: Test documents used to build feature dicts.
        training_answers: Reference scores for the training set.
        test_answers: Reference scores for the test set.
        load: When truthy, read the already vectorised matrices from
            ``weights/Train_X.dat`` and ``weights/Test_X.dat`` instead of
            rebuilding them (the documents are then not used).
        w2vec_model: Model forwarded to ``w2vec.w2vec_for_pair_of_docs``.
        use_w2_vec_model: When truthy (and ``load`` is falsy), word
            embeddings are appended to both training and test feature dicts.

    Returns:
        None.  Results are reported with ``print``.
    """
    if load:
        # Only the vectorised matrices are needed downstream.
        X = utility.load_weights("weights/Train_X.dat")
        pX = utility.load_weights("weights/Test_X.dat")
    else:
        scores = utility.load_weights("weights/jc_cc_scores.dat")
        headers = utility.load_weights("weights/headers_scores_jc_cc.dat")
        # The score table holds training rows first, then test rows.
        split_at = len(training_answers)
        tr_scores = scores[:split_at]
        te_scores = scores[split_at:]

        Doc_Dict_Vectors = utility.get_dict_vectors_of_documents(
            training_documents, None, tr_scores, headers)
        Test_doc_dict_vectors = utility.get_dict_vectors_of_documents(
            test_documents, None, te_scores, headers)

        if use_w2_vec_model:
            TR_S1, TR_S2 = w2vec.w2vec_for_pair_of_docs(
                training_documents, w2vec_model)
            TE_S1, TE_S2 = w2vec.w2vec_for_pair_of_docs(
                test_documents, w2vec_model)
            Doc_Dict_Vectors = utility.appendWordEmbeddings(
                Doc_Dict_Vectors, TR_S1, TR_S2)
            Test_doc_dict_vectors = utility.appendWordEmbeddings(
                Test_doc_dict_vectors, TE_S1, TE_S2)

        # NOTE(review): saved without the "weights/" prefix used when
        # loading; confirm whether utility.save_weights adds it.
        utility.save_weights("Feature_Vector.dat", Doc_Dict_Vectors)
        utility.save_weights("Test_Feature_Vector.dat", Test_doc_dict_vectors)

        v = DictVectorizer(sparse=True)
        X = v.fit_transform(Doc_Dict_Vectors)
        pX = v.transform(Test_doc_dict_vectors)
        utility.save_weights("Train_X.dat", X)
        utility.save_weights("Test_X.dat", pX)

    # with_mean=False: scale by variance only, no centring.
    scaler = preprocessing.StandardScaler(with_mean=False)
    X_train_scaled = scaler.fit_transform(X)

    C = 10000000000.0
    gamma = 1000
    print("Trying C = %s, Gamma = %s" % (str(C), str(gamma)))
    svm_model = svm.SVR(C=C, gamma=gamma)
    svm_model.fit(X_train_scaled, training_answers)

    # Predictions are clamped to the [0, 5] range before evaluation.
    answers = [0 if p < 0 else (5 if p > 5 else p)
               for p in svm_model.predict(X_train_scaled)]
    print("Error in Estimation of SVM - Training : " + str(
        utility.evaluate(training_answers, answers)))

    # Reuse the scaler fitted on the training data; re-fitting on the test
    # matrix would scale test features differently from what the model saw.
    pX_test_scaled = scaler.transform(pX)
    answers = [0 if p < 0 else (5 if p > 5 else p)
               for p in svm_model.predict(pX_test_scaled)]
    print("Error in Estimation of SVM - Testing : " + str(
        utility.evaluate(test_answers, answers)))
def mlp_network(training_documents, test_documents, training_answers, test_answers, load, w2vec_model, use_w2_vec_model):
    """Fit an MLP regressor on pair feature vectors and print training and test error.

    Args:
        training_documents: Training documents used to build feature dicts.
        test_documents: Test documents used to build feature dicts.
        training_answers: Reference scores for the training set.
        test_answers: Reference scores for the test set.
        load: When truthy, read the already vectorised matrices from
            ``weights/Train_X.dat`` and ``weights/Test_X.dat`` instead of
            rebuilding them (the documents are then not used).
        w2vec_model: Model forwarded to ``w2vec.w2vec_for_pair_of_docs``.
        use_w2_vec_model: When truthy (and ``load`` is falsy), word
            embeddings are appended to both training and test feature dicts.

    Returns:
        None.  Results are reported with ``print``.
    """
    if load:
        # Only the vectorised matrices are needed downstream.
        X = utility.load_weights("weights/Train_X.dat")
        pX = utility.load_weights("weights/Test_X.dat")
    else:
        scores = utility.load_weights("weights/jc_cc_scores.dat")
        headers = utility.load_weights("weights/headers_scores_jc_cc.dat")
        # The score table holds training rows first, then test rows.
        split_at = len(training_answers)
        tr_scores = scores[:split_at]
        te_scores = scores[split_at:]

        Doc_Dict_Vectors = utility.get_dict_vectors_of_documents(
            training_documents, None, tr_scores, headers)
        Test_doc_dict_vectors = utility.get_dict_vectors_of_documents(
            test_documents, None, te_scores, headers)

        if use_w2_vec_model:
            TR_S1, TR_S2 = w2vec.w2vec_for_pair_of_docs(training_documents, w2vec_model)
            TE_S1, TE_S2 = w2vec.w2vec_for_pair_of_docs(test_documents, w2vec_model)
            print(len(Doc_Dict_Vectors[0]))
            Doc_Dict_Vectors = utility.appendWordEmbeddings(Doc_Dict_Vectors, TR_S1, TR_S2)
            # The test vectors must receive the same embedding features as
            # the training vectors; previously TE_S1/TE_S2 were computed but
            # never appended, so train and test features differed.
            Test_doc_dict_vectors = utility.appendWordEmbeddings(
                Test_doc_dict_vectors, TE_S1, TE_S2)
            print(len(Doc_Dict_Vectors[0]))

        # NOTE(review): saved without the "weights/" prefix used when
        # loading; confirm whether utility.save_weights adds it.
        utility.save_weights("Feature_Vector.dat", Doc_Dict_Vectors)
        utility.save_weights("Test_Feature_Vector.dat", Test_doc_dict_vectors)

        v = DictVectorizer(sparse=True)
        X = v.fit_transform(Doc_Dict_Vectors)
        pX = v.transform(Test_doc_dict_vectors)
        utility.save_weights("Train_X.dat", X)
        utility.save_weights("Test_X.dat", pX)

    # with_mean=False: scale by variance only, no centring.
    scaler = preprocessing.StandardScaler(with_mean=False)
    X_train_scaled = scaler.fit_transform(X)

    # NOTE(review): max_iter=1 stops after a single iteration; confirm this
    # is intended and not a debugging leftover.
    mlp = neural_network.MLPRegressor(verbose=True, max_iter=1)
    mlp.fit(X_train_scaled, training_answers)

    # Predictions are clamped to the [0, 5] range before evaluation.
    answers = [0 if p < 0 else (5 if p > 5 else p)
               for p in mlp.predict(X_train_scaled)]
    print("Error in Estimation of MLP - Training : " + str(
        utility.evaluate(training_answers, answers)))

    # Reuse the scaler fitted on the training data; re-fitting on the test
    # matrix would scale test features differently from what the model saw.
    pX_test_scaled = scaler.transform(pX)
    answers = [0 if p < 0 else (5 if p > 5 else p)
               for p in mlp.predict(pX_test_scaled)]
    print("Error in Estimation of MLP - Testing : " + str(
        utility.evaluate(test_answers, answers)))
def linear_regression(training_documents, test_documents, training_answers, test_answers, load, w2vec_model, use_w2_vec_model):
    """Fit a linear regression on pair feature vectors and print training and test error.

    Args:
        training_documents: Training documents used to build feature dicts.
        test_documents: Test documents used to build feature dicts.
        training_answers: Reference scores for the training set.
        test_answers: Reference scores for the test set.
        load: When truthy, read the already vectorised matrices from
            ``weights/Train_X.dat`` and ``weights/Test_X.dat`` instead of
            rebuilding them (the documents are then not used).
        w2vec_model: Model forwarded to ``w2vec.w2vec_for_pair_of_docs``.
        use_w2_vec_model: When truthy (and ``load`` is falsy), word
            embeddings are appended to both training and test feature dicts.

    Returns:
        None.  Results are reported with ``print``.
    """
    if load:
        # Only the vectorised matrices are needed downstream.
        X = utility.load_weights("weights/Train_X.dat")
        pX = utility.load_weights("weights/Test_X.dat")
    else:
        scores = utility.load_weights("weights/jc_cc_scores.dat")
        headers = utility.load_weights("weights/headers_scores_jc_cc.dat")
        # The score table holds training rows first, then test rows.
        split_at = len(training_answers)
        tr_scores = scores[:split_at]
        te_scores = scores[split_at:]

        Doc_Dict_Vectors = utility.get_dict_vectors_of_documents(
            training_documents, None, tr_scores, headers)
        Test_doc_dict_vectors = utility.get_dict_vectors_of_documents(
            test_documents, None, te_scores, headers)

        if use_w2_vec_model:
            TR_S1, TR_S2 = w2vec.w2vec_for_pair_of_docs(
                training_documents, w2vec_model)
            TE_S1, TE_S2 = w2vec.w2vec_for_pair_of_docs(
                test_documents, w2vec_model)
            Doc_Dict_Vectors = utility.appendWordEmbeddings(
                Doc_Dict_Vectors, TR_S1, TR_S2)
            Test_doc_dict_vectors = utility.appendWordEmbeddings(
                Test_doc_dict_vectors, TE_S1, TE_S2)

        # NOTE(review): saved without the "weights/" prefix used when
        # loading; confirm whether utility.save_weights adds it.
        utility.save_weights("Feature_Vector.dat", Doc_Dict_Vectors)
        utility.save_weights("Test_Feature_Vector.dat", Test_doc_dict_vectors)

        v = DictVectorizer(sparse=True)
        X = v.fit_transform(Doc_Dict_Vectors)
        pX = v.transform(Test_doc_dict_vectors)
        utility.save_weights("Train_X.dat", X)
        utility.save_weights("Test_X.dat", pX)

    scaler = preprocessing.MaxAbsScaler()
    X_train_scaled = scaler.fit_transform(X)

    lm = LinearRegression()
    lm.fit(X_train_scaled, training_answers)

    # Predictions are clamped to the [0, 5] range before evaluation.
    answers = [0 if p < 0 else (5 if p > 5 else p)
               for p in lm.predict(X_train_scaled)]
    print("Error in Estimation of Linear Regression - Training : " + str(
        utility.evaluate(training_answers, answers)))

    # Reuse the scaler fitted on the training data; re-fitting on the test
    # matrix would scale test features differently from what the model saw.
    pX_test_scaled = scaler.transform(pX)
    answers = [0 if p < 0 else (5 if p > 5 else p)
               for p in lm.predict(pX_test_scaled)]
    print("Error in Estimation of Linear Regression - Testing : " + str(
        utility.evaluate(test_answers, answers)))