# --- Example 1 ---
def jaccard_and_containment_coefficient_evaluate(analyze_type, documents, answers, ngram_size=1, ngram_weighing=False, IDFScores=None,f=None):
  ind = 0
  containment_coefficient_predicted_answers = []
  jaccard_coefficient_predicted_answers = []
  init_doc_count = len(documents)/2
  operated_doc_count = 0
  for i in range(len(documents)/2):
    if(operated_doc_count == 400):
      init_doc_count-=400
      operated_doc_count=0
      print str(init_doc_count) + " sets remaining"
    operated_doc_count+=1
    document1, document2 = documents[(2*i)], documents[(2*i)+1]
    #print document1, document2
    sent_1_tokens = process.tokens(document1)
    sent_2_tokens = process.tokens(document2)
    if(analyze_type=="pos"):
      jaccard_coefficient, containment_coefficient = ngram.POSTags_JaccardCoefficient_and_containment_coefficienct(sent_1_tokens, sent_2_tokens, ngram_size, ngram_weighing, IDFScores)
    elif(analyze_type=="lemma"):
      jaccard_coefficient, containment_coefficient = ngram.Lemma_JaccardCoefficient_and_containment_coefficienct(sent_1_tokens, sent_2_tokens, ngram_size, ngram_weighing, IDFScores)
    elif(analyze_type=="character"):
      jaccard_coefficient, containment_coefficient = ngram.character_ngram_JaccardCoefficient_and_containment_coefficienct(sent_1_tokens, sent_2_tokens, ngram_size, True, ngram_weighing, IDFScores)
    jaccard_coefficient_predicted_answers.append(5*jaccard_coefficient)
    containment_coefficient_predicted_answers.append(5*containment_coefficient)
    #print answers[ind],   "   |   ", jaccard_coefficient_predicted_answers[ind]
    #print answers[ind],   "   |   ", containment_coefficient_predicted_answers[ind]
    ind+=1
  print "Error in Estimate For Jaccard Coefficient is " + str(utility.evaluate(answers, jaccard_coefficient_predicted_answers))
  f.write("Error in Estimate For Jaccard Coefficient is " + str(utility.evaluate(answers, jaccard_coefficient_predicted_answers)))
  print "Error in Estimate For Containment Coefficient is " + str(utility.evaluate(answers, containment_coefficient_predicted_answers))
  f.write("Error in Estimate For Containment Coefficient is " + str(utility.evaluate(answers, containment_coefficient_predicted_answers)))
  return jaccard_coefficient_predicted_answers, containment_coefficient_predicted_answers
# --- Example 2 ---
def cosine_similarity_without_tfidf(documents):
  """Predict similarity for each document triple via plain cosine similarity.

  Unlike the paired-list functions in this file, each element of `documents`
  here is a triple: (sentence1, sentence2, gold_score).

  Prints the evaluation error against the gold scores and returns the list
  of predicted scores (cosine similarity scaled to the 0-5 range).
  """
  answers = []
  predicted_answers = []
  for document in documents:
    # document[2] is the gold similarity score.
    answers.append(document[2])
    predicted_answers.append(5 * ngram.cosinesimilarity_without_TFIDF(document[0], document[1]))
  print "Error in Estimate is " + str(utility.evaluate(answers, predicted_answers))
  return predicted_answers
# --- Example 3 ---
def cosinesimilarity_evaluate_TFIDF(documents, TFIDFScores, answers):
  """Predict similarity for flat document pairs via TF-IDF cosine similarity.

  documents holds pairs stored flat: (documents[2*i], documents[2*i+1]) is
  the i-th pair. Predictions are cosine similarity scaled to the 0-5 range.

  Prints the evaluation error against `answers` and returns the predictions.
  """
  predicted_answers = []
  for i in range(len(documents) / 2):
    document1, document2 = documents[2 * i], documents[(2 * i) + 1]
    predicted_answers.append(5 * ngram.cosinesimilarity(document1, document2, TFIDFScores))
  print "Error in Estimate is " + str(utility.evaluate(answers, predicted_answers))
  return predicted_answers
# --- Example 4 ---
def w2vec_similarity_measure_unsupervised(documents, answers):
    """Predict pair similarity with unsupervised word2vec sentence vectors.

    Each pair of documents is tokenized, embedded via the word2vec model,
    and scored with dense cosine similarity scaled to the 0-5 range.

    Prints the evaluation error against `answers` and returns the list of
    predictions (a return value the original lacked; siblings in this file
    all return their predictions).
    """
    model = w2vec_model()
    documents_tokens = utility.get_dict_vectors_of_documents(documents,
                                                             justTokens=True)
    predicted_answers = []
    for s1_tokens, s2_tokens in documents_tokens:
        s1_vector = computation_vec_for_sentence(s1_tokens, model)
        s2_vector = computation_vec_for_sentence(s2_tokens, model)
        predicted_answers.append(5 * cossim_dense(s1_vector, s2_vector))
    # Argument order aligned with the other evaluate() calls in this file:
    # gold answers first, predictions second.
    print "Error in Estimation of Word2Vec similarity: " + str(
        utility.evaluate(answers, predicted_answers))
    return predicted_answers
# --- Example 5 ---
def support_vector_machines(training_documents, test_documents,
                            training_answers, test_answers, load, w2vec_model,
                            use_w2_vec_model):

    # doc_dict_vectors_list = []
    # Corpus = []
    # vectorizer = CountVectorizer(min_df=1)
    # for i in range(len(documents)/2):
    #   Corpus.append(documents[(2*i)] + " " + documents[(2*i)+1])
    # X = vectorizer.fit_transform(Corpus)
    # vectorizer.transform(['Something completely new.']).toarray()
    # pX = v.fit_transform(D2)
    v = DictVectorizer(sparse=True)
    if (load):
        Doc_Dict_Vectors = utility.load_weights("weights/Feature_Vector.dat")
        Test_doc_dict_vectors = utility.load_weights(
            "weights/Test_Feature_Vector.dat")
        X = utility.load_weights("weights/Train_X.dat")
        pX = utility.load_weights("weights/Test_X.dat")
    else:
        scores = utility.load_weights("weights/jc_cc_scores.dat")
        headers = utility.load_weights("weights/headers_scores_jc_cc.dat")
        tr_scores = scores[:len(training_answers)]
        te_scores = scores[len(training_answers):]
        Doc_Dict_Vectors = utility.get_dict_vectors_of_documents(
            training_documents, None, tr_scores, headers)
        Test_doc_dict_vectors = utility.get_dict_vectors_of_documents(
            test_documents, None, te_scores, headers)
        if (use_w2_vec_model):
            TR_S1, TR_S2 = w2vec.w2vec_for_pair_of_docs(
                training_documents, w2vec_model)
            TE_S1, TE_S2 = w2vec.w2vec_for_pair_of_docs(
                test_documents, w2vec_model)
            # print len(Doc_Dict_Vectors[0])
            Doc_Dict_Vectors = utility.appendWordEmbeddings(
                Doc_Dict_Vectors, TR_S1, TR_S2)
            Test_doc_dict_vectors = utility.appendWordEmbeddings(
                Test_doc_dict_vectors, TE_S1, TE_S2)
            # print len(Doc_Dict_Vectors[0])
        utility.save_weights("Feature_Vector.dat", Doc_Dict_Vectors)
        utility.save_weights("Test_Feature_Vector.dat", Test_doc_dict_vectors)
        X = v.fit_transform(Doc_Dict_Vectors)
        pX = v.transform(Test_doc_dict_vectors)
        # if(use_w2_vec_model):
        #   print X
        #   print TR_S1
        #   print TR_S2
        #   X = hstack([X,TR_S1,TR_S2]).toarray()
        #   print X
        #   pX = hstack([pX,TE_S1,TE_S2]).toarray()
        utility.save_weights("Train_X.dat", X)
        utility.save_weights("Test_X.dat", pX)

    ######### CODE FOR TESTING HYPER PARAMETERS ##################
    # Total_Doc_Dict_Vectors = utility.get_dict_vectors_of_documents(training_documents+test_documents)
    # X_total = v.fit_transform(Total_Doc_Dict_Vectors)
    # y_total = training_answers+test_answers
    # test_hyper_parameters(X_total,y_total)
    ##############################################################

    # print X
    # print training_answers
    min_max_scaler = preprocessing.StandardScaler(with_mean=False)
    X_train_minmax = min_max_scaler.fit_transform(X)
    X_normalized = preprocessing.normalize(X, norm='l2')
    C = 10000000000.0
    gamma = 1000
    print "Trying C = %s,  Gamma = %s" % (str(C), str(gamma))
    svm_model = svm.SVR(C=C, gamma=gamma)
    svm_model.fit(X_train_minmax, training_answers)
    predicted_answers = svm_model.predict(X_train_minmax)
    answers = []
    for i in predicted_answers:
        if (i < 0):
            # print "came in"
            answers.append(0)
        elif (i > 5):
            # print "came in *** "
            answers.append(5)
        else:
            answers.append(i)
    # print answers
    print "Error in Estimation of SVM - Training : " + str(
        utility.evaluate(training_answers, answers))
    # pX = v.transform(Test_doc_dict_vectors)
    pX_normalized = preprocessing.normalize(pX, norm='l2')
    pX_test_minmax = min_max_scaler.fit_transform(pX)
    predicted_answers = svm_model.predict(pX_test_minmax)
    answers = []
    for i in predicted_answers:
        if (i < 0):
            # print "came in"
            answers.append(0)
        elif (i > 5):
            # print "came in *** "
            answers.append(5)
        else:
            answers.append(i)
    # print answers
    print "Error in Estimation of SVM - Testing : " + str(
        utility.evaluate(test_answers, answers))
# --- Example 6 ---
def mlp_network(training_documents, test_documents, training_answers,  test_answers, load, w2vec_model, use_w2_vec_model):
  v = DictVectorizer(sparse=True)
  if(load):
    Doc_Dict_Vectors = utility.load_weights("weights/Feature_Vector.dat")
    Test_doc_dict_vectors = utility.load_weights("weights/Test_Feature_Vector.dat")
    X = utility.load_weights("weights/Train_X.dat")
    pX = utility.load_weights("weights/Test_X.dat")
  else:
    scores = utility.load_weights("weights/jc_cc_scores.dat")
    headers = utility.load_weights("weights/headers_scores_jc_cc.dat")
    tr_scores = scores[:len(training_answers)]
    te_scores = scores[len(training_answers):]
    Doc_Dict_Vectors = utility.get_dict_vectors_of_documents(training_documents,None,tr_scores,headers)
    Test_doc_dict_vectors = utility.get_dict_vectors_of_documents(test_documents,None,te_scores,headers)
    if(use_w2_vec_model):
      TR_S1,TR_S2 = w2vec.w2vec_for_pair_of_docs(training_documents, w2vec_model)
      TE_S1,TE_S2 = w2vec.w2vec_for_pair_of_docs(test_documents, w2vec_model)
      print len(Doc_Dict_Vectors[0])
      Doc_Dict_Vectors = utility.appendWordEmbeddings(Doc_Dict_Vectors,TR_S1,TR_S2)
      print len(Doc_Dict_Vectors[0])
    utility.save_weights("Feature_Vector.dat",Doc_Dict_Vectors)
    utility.save_weights("Test_Feature_Vector.dat",Test_doc_dict_vectors)
    X = v.fit_transform(Doc_Dict_Vectors)
    pX = v.transform(Test_doc_dict_vectors)
    # if(use_w2_vec_model):
    #   print X
    #   print TR_S1
    #   print TR_S2
    #   X = hstack([X,TR_S1,TR_S2]).toarray()
    #   print X
    #   pX = hstack([pX,TE_S1,TE_S2]).toarray()
    utility.save_weights("Train_X.dat",X)
    utility.save_weights("Test_X.dat",pX)

  ######### CODE FOR TESTING HYPER PARAMETERS ##################
  # Total_Doc_Dict_Vectors = utility.get_dict_vectors_of_documents(training_documents+test_documents)
  # X_total = v.fit_transform(Total_Doc_Dict_Vectors)
  # y_total = training_answers+test_answers
  # test_hyper_parameters(X_total,y_total)
  ##############################################################

  # print X
  # print training_answers
  min_max_scaler = preprocessing.StandardScaler(with_mean=False)
  X_train_minmax = min_max_scaler.fit_transform(X)
  X_normalized = preprocessing.normalize(X, norm='l2')
  mlp = neural_network.MLPRegressor(verbose=True,max_iter=1)
  mlp.fit(X_train_minmax, training_answers)
  predicted_answers = mlp.predict(X_train_minmax)
  answers = []
  for i in predicted_answers:
    if(i<0):
      # print "came in"
      answers.append(0)
    elif(i>5):
      # print "came in *** "
      answers.append(5)
    else:
      answers.append(i)
  # print answers
  print "Error in Estimation of MLP - Training : "+str(utility.evaluate(training_answers,answers))
  
  pX_normalized = preprocessing.normalize(pX, norm='l2')
  pX_test_minmax = min_max_scaler.fit_transform(pX)
  predicted_answers = mlp.predict(pX_test_minmax)
  answers = []
  for i in predicted_answers:
    if(i<0):
      # print "came in"
      answers.append(0)
    elif(i>5):
      # print "came in *** "
      answers.append(5)
    else:
      answers.append(i)
  # print answers
  print "Error in Estimation of MLP - Testing : "+str(utility.evaluate(test_answers,answers))
# --- Example 7 ---
def linear_regression(training_documents, test_documents, training_answers,
                      test_answers, load, w2vec_model, use_w2_vec_model):
    """Train a linear regression on dict feature vectors and report errors.

    load: when True, reuse feature vectors and matrices cached under
      weights/; otherwise rebuild them from the jc/cc score cache (and,
      optionally, word2vec embeddings) and save them.
    use_w2_vec_model: append word2vec sentence embeddings to both the
      training and test feature dicts before vectorizing.

    Predictions are clamped to the valid 0-5 similarity range before
    evaluation. Prints training and testing error; returns None.
    """

    def _clip_scores(predictions):
        # Clamp regression output to the valid 0-5 similarity range.
        return [min(max(p, 0), 5) for p in predictions]

    lm = LinearRegression()
    if load:
        Doc_Dict_Vectors = utility.load_weights("weights/Feature_Vector.dat")
        Test_doc_dict_vectors = utility.load_weights(
            "weights/Test_Feature_Vector.dat")
        X = utility.load_weights("weights/Train_X.dat")
        pX = utility.load_weights("weights/Test_X.dat")
    else:
        # Precomputed Jaccard/containment scores, split train/test by the
        # number of training answers.
        scores = utility.load_weights("weights/jc_cc_scores.dat")
        headers = utility.load_weights("weights/headers_scores_jc_cc.dat")
        tr_scores = scores[:len(training_answers)]
        te_scores = scores[len(training_answers):]
        Doc_Dict_Vectors = utility.get_dict_vectors_of_documents(
            training_documents, None, tr_scores, headers)
        Test_doc_dict_vectors = utility.get_dict_vectors_of_documents(
            test_documents, None, te_scores, headers)
        if use_w2_vec_model:
            TR_S1, TR_S2 = w2vec.w2vec_for_pair_of_docs(
                training_documents, w2vec_model)
            TE_S1, TE_S2 = w2vec.w2vec_for_pair_of_docs(
                test_documents, w2vec_model)
            Doc_Dict_Vectors = utility.appendWordEmbeddings(
                Doc_Dict_Vectors, TR_S1, TR_S2)
            Test_doc_dict_vectors = utility.appendWordEmbeddings(
                Test_doc_dict_vectors, TE_S1, TE_S2)
        utility.save_weights("Feature_Vector.dat", Doc_Dict_Vectors)
        utility.save_weights("Test_Feature_Vector.dat", Test_doc_dict_vectors)
        v = DictVectorizer(sparse=True)
        X = v.fit_transform(Doc_Dict_Vectors)
        # transform (not fit_transform): test features must use the
        # vocabulary learned from the training data.
        pX = v.transform(Test_doc_dict_vectors)
        utility.save_weights("Train_X.dat", X)
        utility.save_weights("Test_X.dat", pX)

    scaler = preprocessing.MaxAbsScaler()
    X_train_scaled = scaler.fit_transform(X)
    lm.fit(X_train_scaled, training_answers)
    answers = _clip_scores(lm.predict(X_train_scaled))
    print "Error in Estimation of Linear Regression - Training : " + str(
        utility.evaluate(training_answers, answers))

    # BUG FIX: scale the test matrix with transform(), not fit_transform();
    # refitting the scaler on test data leaked test statistics and scaled
    # the test set inconsistently with the training set.
    pX_test_scaled = scaler.transform(pX)
    answers = _clip_scores(lm.predict(pX_test_scaled))
    print "Error in Estimation of Linear Regression - Testing : " + str(
        utility.evaluate(test_answers, answers))