def _read_count_csv(path):
    """Read a comma-separated integer table whose first line holds column names.

    Returns ``(names, rows)`` where ``names`` is the header split on commas
    (line terminator removed) and ``rows`` is a list of lists of ``int``.
    Blank lines are skipped. Raises ``ValueError`` when the file is empty.
    """
    with open(path, 'r') as handle:
        header = handle.readline()
        if not header:
            raise ValueError("Feature file is empty: %s" % path)
        # Strip the terminator so the last column name does not carry "\n".
        names = header.rstrip("\r\n").split(",")
        rows = [[int(cell) for cell in line.split(',')]
                for line in handle if line.strip()]
    return names, rows


def _append_columns(matrix, rows):
    """Return ``rows`` as a dense matrix, appended to the right of ``matrix`` if given."""
    block = np.matrix(rows)
    if matrix is None:
        return block
    return np.concatenate([matrix, block], axis=1)


def _prepend_sparse(block, matrix):
    """Return ``block`` stacked to the left of ``matrix`` (or alone if ``matrix`` is None)."""
    if matrix is None:
        return block
    return hstack((block, matrix))


def _vocabulary_names(vectorizer):
    """Return the vectorizer's feature names as a plain list.

    Prefers ``get_feature_names_out`` and falls back to the older
    ``get_feature_names`` so both old and new scikit-learn releases work.
    """
    getter = getattr(vectorizer, "get_feature_names_out", None)
    if getter is None:
        getter = vectorizer.get_feature_names
    return list(getter())


def _print_top_features(weights, column_means, names, largest_first=False,
                        count=20):
    """Print the ``count`` lowest and highest features by weight * column mean.

    Each printed row pairs one feature from the low end of the ranking with
    one from the high end; ``largest_first`` selects which of the two is
    printed in the left-hand columns.
    """
    scored = sorted(
        zip([w * m for w, m in zip(weights, column_means)], names))
    lowest = scored[:count]
    highest = scored[:-(count + 1):-1]
    for (low_score, low_name), (high_score, high_name) in zip(lowest, highest):
        if largest_first:
            row = (high_score, high_name, low_score, low_name)
        else:
            row = (low_score, low_name, high_score, high_name)
        print("\t%.4f\t%-15s\t\t%.4f\t%-15s" % row)


def TestModel(new_train,
              new_test,
              texts_train_path,
              texts_test_path,
              train_pos,
              test_pos,
              new_train_ngrams,
              new_test_ngrams,
              nFeaturesList,
              subsample=False,
              removeCenter=True,
              BoW=True,
              charNgrams=False,
              POS=False,
              features=False,
              POSgrams=False,
              tfidf=False,
              binary=False,
              statistics=False,
              thresholdPos=0.2,
              thresholdNeg=-0.2):
    """Train Naive Bayes and Logistic Regression on several feature-set sizes.

    The feature matrix is assembled from the enabled sources, columns are
    ranked by the absolute difference between the column means of the two
    training classes, and for every entry of ``nFeaturesList`` the
    highest-ranked columns are kept and both classifiers are trained and
    evaluated on the test set.

    Args:
        new_train, new_test: passed to ``FeaturesReader.readFeatures`` when
            ``features`` is true.
        texts_train_path, texts_test_path: passed to ``Eval.readDataSet``,
            which yields labels, texts and nominate scores.
        train_pos, test_pos: CSV files (header + integer rows) read when
            ``POS`` is true.
        new_train_ngrams, new_test_ngrams: CSV files (header + integer rows)
            read when ``charNgrams`` is true.
        nFeaturesList: iterable of feature counts to evaluate. A count larger
            than the number of available columns keeps every column.
        subsample: accepted for compatibility; not used.
        removeCenter: drop training rows whose nominate score is neither
            above ``thresholdPos`` nor below ``thresholdNeg``. The test set
            is left untouched.
        BoW, tfidf: add ``CountVectorizer`` / ``TfidfVectorizer`` columns
            built from the texts.
        charNgrams, POS, features, POSgrams: enable the matching file-based
            feature source.
        binary: binarize both matrices with ``Binarizer``.
        statistics: also draw ``Eval.histogram`` plots and print the most
            influential features for each classifier.
        thresholdPos, thresholdNeg: cut-offs used by ``removeCenter``.

    Returns:
        ``(ac_nb_list, ac_log_list)``: Naive Bayes and Logistic Regression
        accuracies as floats, one per entry of ``nFeaturesList``.

    Raises:
        ValueError: if no feature source is enabled, or a feature CSV file
            is empty.
    """
    # Fail before any I/O: without a source there is no matrix to train on.
    if not (features or POS or charNgrams or POSgrams or BoW or tfidf):
        raise ValueError("At least one feature source must be enabled")

    names = []
    # None means "no columns yet"; the first enabled source creates the matrix.
    train_matrix = None
    test_matrix = None

    labels_train, texts_train, nominates_train = Eval.readDataSet(
        texts_train_path, 0)
    labels_test, texts_test, nominates_test = Eval.readDataSet(
        texts_test_path, 0)

    if features:
        print("Reading feature files")
        train, test, _, _, feature_names = FeaturesReader.readFeatures(
            new_train, new_test)
        train_matrix = np.matrix(train)
        test_matrix = np.matrix(test)
        names = names + feature_names

    if POS:
        print("Reading POS files")
        pos_names, pos_train_rows = _read_count_csv(train_pos)
        _, pos_test_rows = _read_count_csv(test_pos)
        train_matrix = _append_columns(train_matrix, pos_train_rows)
        test_matrix = _append_columns(test_matrix, pos_test_rows)
        names = names + pos_names

    if charNgrams:
        print("Reading ngram files")
        ngram_names, ngram_train_rows = _read_count_csv(new_train_ngrams)
        _, ngram_test_rows = _read_count_csv(new_test_ngrams)
        train_matrix = _append_columns(train_matrix, ngram_train_rows)
        test_matrix = _append_columns(test_matrix, ngram_test_rows)
        names = names + ngram_names

    if POSgrams:
        print("Reading POS n gram files")
        # NOTE(review): train_pos_gram / test_pos_gram are not parameters of
        # this function; they must exist at module level or this branch
        # raises NameError. Confirm where they are meant to come from.
        pos_gram_names, pos_gram_train_rows = _read_count_csv(train_pos_gram)
        _, pos_gram_test_rows = _read_count_csv(test_pos_gram)
        train_matrix = _append_columns(train_matrix, pos_gram_train_rows)
        test_matrix = _append_columns(test_matrix, pos_gram_test_rows)
        names = names + pos_gram_names

    if BoW:
        print("Generating Bag of Words")
        vectorizer = CountVectorizer(token_pattern='[a-zA-Z]+',
                                     stop_words='english')
        bow_train = vectorizer.fit_transform(texts_train)
        bow_test = vectorizer.transform(texts_test)
        # Text columns go on the left, so their names are prepended as well.
        train_matrix = _prepend_sparse(bow_train, train_matrix)
        test_matrix = _prepend_sparse(bow_test, test_matrix)
        names = _vocabulary_names(vectorizer) + names

    if tfidf:
        print("Generating TFIDF")
        vectorizer = TfidfVectorizer()
        tfidf_train = vectorizer.fit_transform(texts_train)
        tfidf_test = vectorizer.transform(texts_test)
        train_matrix = _prepend_sparse(tfidf_train, train_matrix)
        test_matrix = _prepend_sparse(tfidf_test, test_matrix)
        names = _vocabulary_names(vectorizer) + names

    # One row-sliceable sparse format for everything that follows.
    train_matrix = sparse.csr_matrix(train_matrix)
    test_matrix = sparse.csr_matrix(test_matrix)

    if binary:
        binarizer = Binarizer()
        train_matrix = binarizer.fit_transform(train_matrix)
        test_matrix = binarizer.transform(test_matrix)

    if removeCenter:
        # Keep only training samples with a clearly positive/negative score.
        extreme_indexes = [
            i for i, score in enumerate(nominates_train)
            if score > thresholdPos or score < thresholdNeg
        ]
        train_matrix = train_matrix[extreme_indexes, :]
        labels_train = [labels_train[i] for i in extreme_indexes]

    neg_train = [i for i, label in enumerate(labels_train) if label == -1.0]
    pos_train = [i for i, label in enumerate(labels_train) if label != -1.0]

    pos_means = train_matrix[pos_train, :].mean(axis=0).tolist()[0]
    neg_means = train_matrix[neg_train, :].mean(axis=0).tolist()[0]
    diff = [abs(x - y) for x, y in zip(pos_means, neg_means)]

    # Ascending by class-mean difference: most discriminative columns last.
    indexes_sorted = sorted(range(len(diff)), key=diff.__getitem__)
    names_sorted = [names[i] for i in indexes_sorted]

    ac_nb_list = []
    ac_log_list = []

    for nFeatures in nFeaturesList:
        # Clamp at 0: a negative start would wrap around and silently keep
        # fewer columns than are available.
        start = max(len(indexes_sorted) - nFeatures, 0)
        indexes = indexes_sorted[start:]
        selected_names = names_sorted[start:]
        train_selected = train_matrix[:, indexes]
        test_selected = test_matrix[:, indexes]

        print("Training the Naive Bayes classifier")
        clf = MultinomialNB()
        clf.fit(train_selected, labels_train)
        pred_list = clf.predict(test_selected).tolist()
        ac_nb = Eval.Accuracy(labels_test, pred_list)

        print("Naive Bayes")
        print("Accuracy:  " + str(ac_nb))
        print("Precision: " + str(Eval.Precision(labels_test, pred_list)))
        print("Recall: " + str(Eval.Recall(labels_test, pred_list)))
        ac_nb_list.append(float(ac_nb))

        if statistics:
            column_means = train_selected.mean(axis=0).tolist()[0]
            Eval.histogram(nominates_test, labels_test, pred_list, 10,
                           'Naive Bayes', 'blue')
            log_prob_gap = clf.feature_log_prob_[0] - clf.feature_log_prob_[1]
            _print_top_features(log_prob_gap, column_means, selected_names,
                                largest_first=True)

        clf = LogisticRegression(solver='saga', max_iter=2000)
        clf.fit(train_selected, labels_train)
        pred_list = clf.predict(test_selected).tolist()
        ac_log = Eval.Accuracy(labels_test, pred_list)

        print("Logistic Regression")
        print("Accuracy: " + str(ac_log))
        print("Precision: " + str(Eval.Precision(labels_test, pred_list)))
        print("Recall: " + str(Eval.Recall(labels_test, pred_list)))
        ac_log_list.append(float(ac_log))

        if statistics:
            Eval.histogram(nominates_test, labels_test, pred_list, 10,
                           'Logistic Regression', 'orange')

            plt.legend(bbox_to_anchor=(0., 1.02, 1., .102),
                       loc=3,
                       ncol=2,
                       mode="expand",
                       borderaxespad=0.)

            _print_top_features(clf.coef_[0], column_means, selected_names)

    return ac_nb_list, ac_log_list
# Example #2
# 0
def TestModel(new_train, new_test, texts_train_path, texts_test_path, train_pos, test_pos, new_train_ngrams, new_test_ngrams, train_lda, test_lda, thresholdPos = 0.2, thresholdNeg = -0.2, thresholdPosTest = 0.2, thresholdNegTest = -0.2, subsample=False, removeCenter=True, BoW = True, charNgrams = False, POS = False, features = False, POSgrams = False, tfidf = False, binary = False, lda = False, addToTrain = None):
    
    names = []
    
    labels_train, texts_train, nominates_train = Eval.readDataSet(texts_train_path, 0)
    labels_test, texts_test, nominates_test = Eval.readDataSet(texts_test_path, 0)
    
    if addToTrain:
        for i in addToTrain:
            texts_train_path2 = "C:/Users/Eduardo/Desktop/2 cuatri IIT/TFM/Datasets/hein-daily/hein-daily/longTexts/speeches_"+str(i)+"_dwnominate_nonames.txt"
            labels_train2, texts_train2, nominates_train2 = Eval.readDataSet(texts_train_path2, 0)
            labels_train = labels_train + labels_train2
            texts_train = texts_train + texts_train2
            nominates_train = nominates_train + nominates_train2
    
    """
    train_pos_3gram_file = open(train_pos_3gram, 'r')
    train_pos_3gram_file_list = train_pos_3gram_file.readlines() 
    pos_3gram_names = train_pos_3gram_file_list[0].split(",")
    train_pos_3gram_file_list.pop(0)
    test_pos_3gram_file = open(test_pos_3gram,'r')
    test_pos_3gram_file_list = test_pos_3gram_file.readlines()
    test_pos_3gram_file_list.pop(0)
    """    
      
    if features:
        print("Reading feature files")
        train, test, train_labels, test_labels, feature_names = FeaturesReader.readFeatures(new_train, new_test) 

        train_matrix = np.matrix(train)
        test_matrix = np.matrix(test)
        
        names = names + feature_names
    
    if POS:
        print("Reading POS files")
        
        train_pos_file = open(train_pos, 'r')
        train_pos_file_list = train_pos_file.readlines() 
        pos_names = train_pos_file_list[0].split(",")
        train_pos_file_list.pop(0)
        test_pos_file = open(test_pos,'r')
        test_pos_file_list = test_pos_file.readlines()
        test_pos_file_list.pop(0)
        
        pos_train_rows = []
        for line in train_pos_file_list:
            line = line.replace("\n",'')
            pos_train_rows.append([int(r) for r in line.split(',')])
        train_pos_file.close()
        
        
        pos_test_rows = []
        for line in test_pos_file_list:
            line = line.replace("\n",'')
            pos_test_rows.append([int(r) for r in line.split(',')])
        test_pos_file.close()
            
        if features:
            train_matrix = np.concatenate([train_matrix, np.matrix(pos_train_rows)], axis = 1)
            test_matrix = np.concatenate([test_matrix, np.matrix(pos_test_rows)], axis = 1)
        else:
            train_matrix = np.matrix(pos_train_rows)
            test_matrix = np.matrix(pos_test_rows)
        names = names + pos_names   
        
    
    if charNgrams:
        print("Reading ngram files")
        train_ngram_file = open(new_train_ngrams, 'r')
        train_ngram_file_list = train_ngram_file.readlines()
        ngram_names = train_ngram_file_list[0].split(",")
        train_ngram_file_list.pop(0)
        test_ngram_file = open(new_test_ngrams, 'r')
        test_ngram_file_list = test_ngram_file.readlines()
        test_ngram_file_list.pop(0)
        
        lines = train_ngram_file_list
        ngram_train_rows = []
        for line in lines:
            line = line.replace("\n",'')
            ngram_train_rows.append([int(r) for r in line.split(',')])
        train_ngram_file.close()
        
        
        lines = test_ngram_file_list
        ngram_test_rows = []
        for line in lines:
            line = line.replace("\n",'')
            
            ngram_test_rows.append([int(r) for r in line.split(',')])
        test_ngram_file.close()
        
        if (features or POS):
            train_matrix = np.concatenate([train_matrix, np.matrix(ngram_train_rows)], axis = 1)
            test_matrix = np.concatenate([test_matrix, np.matrix(ngram_test_rows)], axis = 1)
        else:
            train_matrix = np.matrix(ngram_train_rows)
            test_matrix = np.matrix(ngram_test_rows)
        names = names + ngram_names
            
    if POSgrams:
        print("Reading POS n gram files")
        
        train_pos_gram_file = open(train_pos_gram, 'r')
        train_pos_gram_file_list = train_pos_gram_file.readlines() 
        pos_gram_names = train_pos_gram_file_list[0].split(",")
        train_pos_gram_file_list.pop(0)
        test_pos_gram_file = open(test_pos_gram,'r')
        test_pos_gram_file_list = test_pos_gram_file.readlines()
        test_pos_gram_file_list.pop(0)
        
        pos_gram_train_rows = []
        for line in train_pos_gram_file_list:
            line = line.replace("\n",'')
            pos_gram_train_rows.append([int(r) for r in line.split(',')])
        train_pos_gram_file.close()
        
        
        pos_gram_test_rows = []
        for line in test_pos_gram_file_list:
            line = line.replace("\n",'')
            pos_gram_test_rows.append([int(r) for r in line.split(',')])
        test_pos_gram_file.close()
            
        if (features or POS or charNgrams):
            train_matrix = np.concatenate([train_matrix, np.matrix(pos_gram_train_rows)], axis = 1)
            test_matrix = np.concatenate([test_matrix, np.matrix(pos_gram_test_rows)], axis = 1)
        else:
            train_matrix = np.matrix(pos_gram_train_rows)
            test_matrix = np.matrix(pos_gram_test_rows)
        names = names + pos_gram_names
        
        """
        pos_3gram_train_rows = []
        for line in train_pos_3gram_file_list:
            line = line.replace("\n",'')
            pos_3gram_train_rows.append([int(r) for r in line.split(',')])
        train_pos_3gram_file.close()
        
        
        pos_3gram_test_rows = []
        for line in test_pos_3gram_file_list:
            line = line.replace("\n",'')
            pos_3gram_test_rows.append([int(r) for r in line.split(',')])
        test_pos_3gram_file.close()
            

        train_matrix = np.concatenate([train_matrix, np.matrix(pos_gram_train_rows)], axis = 1)
        test_matrix = np.concatenate([test_matrix, np.matrix(pos_gram_test_rows)], axis = 1)
        
        names = names + pos_3gram_names
        """
        
    if lda:
        print("Reading lda files")
        
        train_lda_file = open(train_lda, 'r')
        train_lda_file_list = train_lda_file.readlines() 
        lda_names = train_lda_file_list[0].split(",")
        train_lda_file_list.pop(0)
        test_lda_file = open(test_lda,'r')
        test_lda_file_list = test_lda_file.readlines()
        test_lda_file_list.pop(0)
        
        lda_train_rows = []
        for line in train_lda_file_list:
            line = line.replace("\n",'')
            lda_train_rows.append([float(r) for r in line.split(',')])
        train_lda_file.close()
        
        
        lda_test_rows = []
        for line in test_lda_file_list:
            line = line.replace("\n",'')
            lda_test_rows.append([float(r) for r in line.split(',')])
        test_lda_file.close()
            
        if (features or POS or charNgrams or POSgrams):
            train_matrix = np.concatenate([train_matrix, np.matrix(lda_train_rows)], axis = 1)
            test_matrix = np.concatenate([test_matrix, np.matrix(lda_test_rows)], axis = 1)
        else:
            train_matrix = np.matrix(lda_train_rows)
            test_matrix = np.matrix(lda_test_rows)
        names = names + lda_names
    
    if removeCenter:
        
        extreme_indexes = []
        for i in range(0,len(texts_train)):
            if (nominates_train[i] > thresholdPos or nominates_train[i]<thresholdNeg):
                extreme_indexes.append(i)
        if (features or POS or charNgrams or POSgrams or lda):        
            train_matrix = train_matrix[extreme_indexes,:]
        labels_train = [labels_train[i] for i in extreme_indexes]
        texts_train = [texts_train[i] for i in extreme_indexes]
        """
        extreme_indexes = []
        for i in range(0,len(texts_test)):
            if (nominates_test[i] > thresholdPosTest or nominates_test[i]<thresholdNegTest):
                extreme_indexes.append(i)
        if (features or POS or charNgrams or POSgrams or lda):       
            test_matrix = test_matrix[extreme_indexes,:]
        
        texts_test = [texts_test[i] for i in extreme_indexes]
        labels_test = [labels_test[i] for i in extreme_indexes]
        nominates_test = [nominates_test[i] for i in extreme_indexes]
        """
    if BoW:
        print("Generating Bag of Words")
        
        #vocab_f = open(vocab_path, 'r')
        #vocab = vocab_f.readline().split(',')
        vectorizer = CountVectorizer(token_pattern = '[a-zA-Z]+', stop_words='english')
        bow_train = vectorizer.fit_transform(texts_train)
        bow_test = vectorizer.transform(texts_test)
        if (features or POS or charNgrams or POSgrams or lda):
            train_matrix = hstack((bow_train,train_matrix))
            test_matrix = hstack((bow_test,test_matrix))
        else:
            train_matrix = bow_train
            test_matrix = bow_test
        bow_names = vectorizer.get_feature_names()
        names = bow_names + names
    
    if tfidf:
        print("Generating TFIDF")
        
        #vocab_f = open(vocab_path, 'r')
        #vocab = vocab_f.readline().split(',')
        vectorizer = TfidfVectorizer(token_pattern = '[a-zA-Z]+', stop_words='english')
        bow_train = vectorizer.fit_transform(texts_train)
        bow_test = vectorizer.transform(texts_test)
        if (features or POS or charNgrams or POSgrams or BoW):
            train_matrix = hstack((bow_train,train_matrix))
            test_matrix = hstack((bow_test,test_matrix))
        else:
            train_matrix = bow_train
            test_matrix = bow_test
        bow_names = vectorizer.get_feature_names()
        names = bow_names + names
    
    if not BoW or not tfidf:
        train_matrix = sparse.csc_matrix(train_matrix)
        test_matrix = sparse.csc_matrix(test_matrix)
    
    if binary:
        transformer = Binarizer().fit(train_matrix)
        train_matrix = transformer.transform(train_matrix)
        
        transformer = Binarizer().fit(test_matrix)
        test_matrix = transformer.transform(test_matrix)
    

        
        
    print("Training the Naive Bayes classifier")
    clf = MultinomialNB()
    clf.fit(train_matrix, labels_train)
    pred = clf.predict(test_matrix)

    print("Naive Bayes")
    print("Accuracy:  "+str(Eval.Accuracy(labels_test, pred.tolist())))
    print("Precision: "+str(Eval.Precision(labels_test, pred.tolist())))
    print("Recall: "+str(+Eval.Recall(labels_test, pred.tolist())))
    cm = confusion_matrix(labels_test, pred)
    print(cm)
    #print("Speaker accuracy: " + str(Eval.SpeakerAccuracy(112, pred)))
    
    nb_ac = Eval.Accuracy(labels_test, pred.tolist())
    
    Eval.histogram(nominates_test,labels_test,pred.tolist(),10, 'Naive Bayes', 'c')
    
    
    a = clf.feature_log_prob_[0] - clf.feature_log_prob_[1]
    b = [x*y for x,y in  zip(a, train_matrix.mean(axis=0).tolist()[0])]
    coefs_with_fns = sorted(zip(b, names)) 
    top = zip(coefs_with_fns[:20], coefs_with_fns[:-(20 + 1):-1])
    for (coef_1, fn_1), (coef_2, fn_2) in top:
        print("\t%.4f\t%-15s\t\t%.4f\t%-15s" % (coef_2, fn_2, coef_1, fn_1))
    
    clf = LogisticRegression(solver='lbfgs')
    clf.fit(train_matrix, labels_train)
    pred = clf.predict(test_matrix)
    
    print("Logistic Regression")
    print("Accuracy: "+str(Eval.Accuracy(labels_test, pred.tolist())))
    print("Precision: "+str(Eval.Precision(labels_test, pred.tolist())))
    print("Recall: "+str(Eval.Recall(labels_test, pred.tolist())))
    cm = confusion_matrix(labels_test, pred)
    print(cm)
    #print("Speaker accuracy: " + str(Eval.SpeakerAccuracy(112, pred)))

    Eval.histogram(nominates_test,labels_test,pred.tolist(),10, 'Logistic Regression', 'b')
    
    plt.legend(loc=1, ncol=1)
    
    b = [x*y for x,y in  zip(clf.coef_[0], train_matrix.mean(axis=0).tolist()[0])]
    coefs_with_fns = sorted(zip(b, names))
    top = zip(coefs_with_fns[:20], coefs_with_fns[:-(20 + 1):-1])
    for (coef_1, fn_1), (coef_2, fn_2) in top:
            print("\t%.4f\t%-15s\t\t%.4f\t%-15s" % (coef_1, fn_1, coef_2, fn_2))
    
    log_ac = Eval.Accuracy(labels_test, pred.tolist())