Example #1
def baseline(tweets_train, train_labels, tweets_test, test_labels):
    # Import the subjectivity lexicon
    subj_dict = data_proc.get_subj_lexicon()

    types_of_features = ['1', '2', '3', 'ngrams']
    for t in types_of_features:
        start = time.time()
        utils.print_model_title("Classification using feature type " + t)
        if t == '1':
            x_train_features = extract_baseline_features.get_features1(
                tweets_train, subj_dict)
            x_test_features = extract_baseline_features.get_features1(
                tweets_test, subj_dict)

        elif t == '2':
            x_train_features = extract_baseline_features.get_features2(
                tweets_train, subj_dict)
            x_test_features = extract_baseline_features.get_features2(
                tweets_test, subj_dict)

        elif t == '3':
            x_train_features = extract_baseline_features.get_features3(
                tweets_train, subj_dict)
            x_test_features = extract_baseline_features.get_features3(
                tweets_test, subj_dict)

        elif t == 'ngrams':
            ngram_map, x_train_features = extract_baseline_features.get_ngram_features(
                tweets_train, n=1)
            x_test_features = extract_baseline_features.get_ngram_features_from_map(
                tweets_test, ngram_map, n=1)

        # Get the class ratio
        class_ratio = utils.get_classes_ratio_as_dict(train_labels)

        # Train on a Linear Support Vector Classifier
        print("\nEvaluating a linear SVM model...")
        classifiers.linear_svm(x_train_features, train_labels, x_test_features,
                               test_labels, class_ratio)

        # Train on a Logistic Regression Classifier
        print("\nEvaluating a logistic regression model...")
        classifiers.logistic_regression(x_train_features, train_labels,
                                        x_test_features, test_labels,
                                        class_ratio)
        end = time.time()
        print(
            "Completion time of the baseline model with features type %s: %.3f s = %.3f min"
            % (t, (end - start), (end - start) / 60.0))
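The `classifiers.linear_svm` and `classifiers.logistic_regression` helpers are defined elsewhere in this repository and are not shown. A minimal sketch of the logistic-regression helper, assuming scikit-learn and inferring the signature from the call site above (an illustration, not the repository's actual code):

from sklearn.linear_model import LogisticRegression
from sklearn import metrics

def logistic_regression(x_train, y_train, x_test, y_test, class_weights=None):
    # class_weights is assumed to be the {label: weight} dict produced by
    # utils.get_classes_ratio_as_dict; scikit-learn accepts such a dict directly
    model = LogisticRegression(class_weight=class_weights, max_iter=1000)
    model.fit(x_train, y_train)
    predictions = model.predict(x_test)
    print(metrics.classification_report(y_test, predictions, digits=3))
    return model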
Example #2
def run_classifiers_with_doc2vec(reviews, scores, lang='en'):
    '''Corpus should be an array of TaggedDocument objects.'''
    corpus = list(get_corpus(reviews, scores, lang))[:20000]
    train_corpus, test_corpus = train_test_split(corpus, test_size=0.25, random_state=42)

    doc2vec_model = create_doc2vec_model(train_corpus)
    train_targets, train_regressors = zip(*[(doc.words, doc.tags[0]) for doc in train_corpus])
    test_targets, test_regressors = zip(*[(doc.words, doc.tags[0]) for doc in test_corpus])

    # For every review, we apply doc2vec_model.infer_vector(review). This
    # creates a feature vector for every document (in our case, a review) in
    # the corpus.
    train_x, train_y = get_train_lists(doc2vec_model, train_targets, train_regressors)
    test_x,  test_y  = get_test_lists(doc2vec_model, test_targets, test_regressors)

    classifier = classifiers.logistic_regression(train_x, train_y)
    train_predictions = classifier.predict(train_x)  # Training
    train_accuracy = metrics.accuracy_score(train_y, train_predictions)
    class_probabilities_train = classifier.predict_proba(train_x)
    train_auc_score = metrics.roc_auc_score(train_y, class_probabilities_train[:, 1])
    print('\nTraining:')
    print(' accuracy:', format(100 * train_accuracy, '.2f'))
    print(' AUC value:', format(100 * train_auc_score, '.2f'))

    test_predictions = classifier.predict(test_x)  # Test
    test_accuracy = metrics.accuracy_score(test_y, test_predictions)
    class_probabilities_test = classifier.predict_proba(test_x)
    test_auc_score = metrics.roc_auc_score(test_y, class_probabilities_test[:, 1])
    print('\nTesting:')
    print(' accuracy:', format(100 * test_accuracy, '.2f'))
    print(' AUC value:', format(100 * test_auc_score, '.2f'))
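The `get_train_lists` and `get_test_lists` helpers are not shown. Given the comment above, a plausible sketch (an assumption inferred from the call sites, not the original code) infers one feature vector per document:

def get_train_lists(doc2vec_model, targets, regressors):
    # `targets` holds each document's token list, `regressors` its tag/label;
    # infer_vector maps the tokens to a fixed-size Doc2Vec feature vector
    xs = [doc2vec_model.infer_vector(words) for words in targets]
    return xs, list(regressors)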
Example #3
def perform_cross_validation_split(data, num_splits, filename):
    y = data['defect_status']
    del data["defect_status"]  # drop the label so it cannot leak into the features
    X = data.to_numpy()

    # performing cross-validation
    kf = KFold(n_splits=num_splits)

    for train, test in kf.split(X):
        X_train, X_test = X[train], X[test]
        y_train, y_test = y.iloc[train], y.iloc[test]
        actual = y_test.tolist()

        # Classification and Regression Tree (CART)
        prediction = classifiers.cart(X_train, y_train, X_test)
        calculate_metrics(prediction, actual, "cart")

        # KNearestNeighbor(KNN)
        prediction = classifiers.knn(X_train, y_train, X_test)
        calculate_metrics(prediction, actual, "knn")

        # Logistic Regression (LR)
        prediction = classifiers.logistic_regression(X_train, y_train, X_test)
        calculate_metrics(prediction, actual, "lr")

        # Naive Bayes (NB)
        prediction = classifiers.naive_bayes(X_train, y_train, X_test)
        calculate_metrics(prediction, actual, "nb")

        # Random Forest (RF)
        prediction = classifiers.random_forest(X_train, y_train, X_test)
        calculate_metrics(prediction, actual, "rf")

    print("\nCalculate mean metrics...")
    filename = filename.split("IST_")[1].split(".csv")[0]  # "IST_<name>.csv" -> "<name>"

    results_by_model = {
        "CART": (precision_results_cart, recall_results_cart,
                 f_score_results_cart, auc_results_cart),
        "KNN": (precision_results_knn, recall_results_knn,
                f_score_results_knn, auc_results_knn),
        "LR": (precision_results_lr, recall_results_lr,
               f_score_results_lr, auc_results_lr),
        "NB": (precision_results_nb, recall_results_nb,
               f_score_results_nb, auc_results_nb),
        "RF": (precision_results_rf, recall_results_rf,
               f_score_results_rf, auc_results_rf),
    }
    for model_name, (p, r, f, a) in results_by_model.items():
        precision, recall, f_score, auc = calculate_mean_metrics(p, r, f, a)
        save_results(precision, recall, f_score, auc, filename, model_name)
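Each `classifiers.*` helper above follows the same pattern: fit on the training fold, return predictions for the test fold. A minimal sketch of one of them, assuming scikit-learn (the repository's wrappers may set hyperparameters differently):

from sklearn.tree import DecisionTreeClassifier

def cart(X_train, y_train, X_test):
    # CART-style decision tree: fit on the training fold, predict the test fold
    model = DecisionTreeClassifier()
    model.fit(X_train, y_train)
    return model.predict(X_test)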
Example #4
def baseline(tweets_train, train_labels, tweets_test, test_labels):
    subj_dict = dproc.get_subj_lexicon('hindi_lexicon.tff')
    types_of_features = ['1', '2', 'ngrams']  # '3' is removed

    for t in types_of_features:

        start = time.time()
        utils.print_model_title("Classification using features type " + t)
        if t == '1':
            x_train_features = extract_baseline_features.get_features1(
                tweets_train, subj_dict)
            x_test_features = extract_baseline_features.get_features1(
                tweets_test, subj_dict)

        elif t == '2':
            x_train_features = extract_baseline_features.get_features2(
                tweets_train, subj_dict)
            x_test_features = extract_baseline_features.get_features2(
                tweets_test, subj_dict)

        # if t == '3':
        #     x_train_features = extract_baseline_features.get_feature3(tweets_train, subj_dict)
        #     x_test_features = extract_baseline_features.get_feature3(tweets_test, subj_dict)

        elif t == 'ngrams':
            ngram_map, x_train_features = extract_baseline_features.get_ngram_features(
                tweets_train, n=1)
            x_test_features = extract_baseline_features.get_ngram_features_from_map(
                tweets_test, ngram_map, n=1)

        # Get the class ratio
        class_ratio = utils.get_classes_ratio_as_dict(train_labels)

        # Train on a linear Support Vector Classifier
        print('\nEvaluating a linear SVM model...')
        classifiers.linear_svm(x_train_features, train_labels, x_test_features,
                               test_labels, class_ratio)

        # Train on a Logistic Regression classifier
        print('\nEvaluating a logistic regression model...')
        classifiers.logistic_regression(x_train_features, train_labels,
                                        x_test_features, test_labels,
                                        class_ratio)
        end = time.time()

        print(
            "Completion time of the baseline model with features type %s: %.3f s = %.3f min"
            % (t, (end - start), (end - start) / 60.0))
Example #5
def supperclassify(train_set, train_label, test_set, test_label):
    '''Compare plain voting, weighted voting, and several trained classifiers.'''
    train_voted = voting(train_set)
    aux = train_voted == train_label
    correct = sum(aux.astype(int))
    _accuracy = (correct * 100) / len(train_label)
    _precision, _recall, _f1score, _support = ut.get_measures_for_each_class(
        train_label, train_voted)
    print('Estimator VOTING')
    print('Average Accuracy:\t', _accuracy)
    print('Average Precision:\t', _precision)
    print('Average Recall:\t', _recall)
    print('Average F1 Measure:\t', _f1score)
    print('\n')

    lambdas = weighted_voting_getlambdas(train_set, train_label)
    results = weighted_voting(test_set, lambdas)

    aux = results == test_label
    correct = sum(aux.astype(int))
    _accuracy = (correct * 100) / len(test_label)
    _precision, _recall, _f1score, _support = ut.get_measures_for_each_class(
        test_label, results)
    print('Estimator W_VOTING')
    print('Average Accuracy:\t', _accuracy)
    print('Average Precision:\t', _precision)
    print('Average Recall:\t', _recall)
    print('Average F1 Measure:\t', _f1score)

    rf = clf.classifier_randomForest(train_set, train_label)
    results = clf.evaluateResults(rf,
                                  test_set,
                                  test_label,
                                  estimator_name='RF')

    lr = clf.logistic_regression(train_set, train_label)
    results = clf.evaluateResults(lr,
                                  test_set,
                                  test_label,
                                  estimator_name='LR')

    svm = clf.classifier_svm(train_set, train_label)
    results = clf.evaluateResults(svm,
                                  test_set,
                                  test_label,
                                  estimator_name='SVM')

    rbf = clf.rbf_classifier(train_set, train_label)
    results = clf.evaluateResults(rbf,
                                  test_set,
                                  test_label,
                                  estimator_name='RBF')
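The `voting` and `weighted_voting` helpers are not shown. A minimal sketch, assuming the input is an (n_samples, n_estimators) array in which each column holds one base classifier's predicted labels and `lambdas` weights those columns (both assumptions inferred from the call sites):

import numpy as np

def voting(predictions):
    # Plain majority vote across base classifiers for each sample
    predictions = np.asarray(predictions, dtype=int)
    return np.array([np.bincount(row).argmax() for row in predictions])

def weighted_voting(predictions, lambdas):
    # Each classifier's vote counts with its lambda weight
    predictions = np.asarray(predictions, dtype=int)
    votes = np.zeros((predictions.shape[0], predictions.max() + 1))
    for j, weight in enumerate(lambdas):
        votes[np.arange(predictions.shape[0]), predictions[:, j]] += weight
    return votes.argmax(axis=1)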
Example #6
print('shuffling train')
random.shuffle(train_set)
Y_train = np.array([row[0] for row in train_set])
X_train = np.array([row[1] for row in train_set])

test_set = []
for key, value in test.items():
    for file_id, words in value.items():
        bow = np.zeros(len(vocab))
        for word in words:
            if word in vocab:
                bow[vocab[word]] += 1
            else:
                bow[vocab['</unk>']] += 1
        tf = bow / len(words)
        tfidf = tf * idf_test
        avg_glove = np.zeros(300)
        for i in range(len(vocab)):
            avg_glove += tfidf[i]*glove[i]
        test_set.append([key, avg_glove])

print('shuffling test')
random.shuffle(test_set)
Y_test = np.array([row[0] for row in test_set])
X_test = np.array([row[1] for row in test_set])

print("Calling Classifiers\n")
print("Accuracy for Naive Bayes is : ", naive_bayes(X_train, Y_train, X_test, Y_test))
print("Accuracy for Logistic Regression is : ", logistic_regression(X_train, Y_train, X_test, Y_test))
print("Accuracy for SVM is : ", svm(X_train, Y_train, X_test, Y_test))
print("Accuracy for FF Neural Net is : ", fnn(X_train, Y_train, X_test, Y_test))
# print("Accuracy for Recurrent Neural Net is : ", rnn(X_train, Y_train, X_test, Y_t))
Example #7
        # Extract the tweet partition for this fold
        train_tweets, test_tweets, train_labels, test_labels = \
            tweets[train], tweets[test], labels[train], labels[test]

        print(len(test_tweets))
        print(len(train_tweets))

        train_tweets = np.hstack(train_tweets)
        dictionary, tweets_features, vectorizer = bow.bow(train_tweets, vec="tfidf")
        # Train the different classifiers.
        svm = clf.classifier_svm(tweets_features, train_labels)
        rf = clf.classifier_randomForest(tweets_features, train_labels)
        ada = clf.adaboost(tweets_features, train_labels)
        lr = clf.logistic_regression(tweets_features, train_labels)

        # Test the different classifiers on the test tweets.

        test_features = vectorizer.transform(test_tweets).toarray()

        _results, _accuracyLR, _precisionLR, _recallLR, _f_measureLR = clf.evaluateResults(
            lr, test_features, test_labels, estimator_name='Logistic regression',
            file_name=results_folder)
        _results, _accuracyRF, _precisionRF, _recallRF, _f_measureRF = clf.evaluateResults(
            rf, test_features, test_labels, estimator_name='RF',
            file_name=results_folder)
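`bow.bow` returns the fitted vocabulary, the training feature matrix, and the vectorizer itself. A sketch of how such a helper might be built on scikit-learn (an assumption; the repository's version may differ):

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

def bow(documents, vec="tfidf"):
    # Fit either a TF-IDF or a raw-count vectorizer on the training documents
    vectorizer = TfidfVectorizer() if vec == "tfidf" else CountVectorizer()
    features = vectorizer.fit_transform(documents).toarray()
    return vectorizer.vocabulary_, features, vectorizer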
Example #8
        reviews.append(review)
        # length_of_reviews.append(len(review))

    return reviews, scores #, length_of_reviews

if __name__ == '__main__':
    print("OP_SPAM.py")
    reviews, scores = parse_op_spam()
    print(len(reviews), len(scores))
    bow_features, vec = bow.bag_of_words(reviews)  # avoid shadowing the bow module

    train_x, test_x, train_y, test_y = train_test_split(
        bow_features, scores, test_size=0.25, random_state=42)

    classifier = classifiers.logistic_regression(train_x, train_y)

    train_predictions = classifier.predict(train_x)  # Training
    train_accuracy = metrics.accuracy_score(train_y, train_predictions)
    class_probabilities_train = classifier.predict_proba(train_x)
    train_auc_score = metrics.roc_auc_score(train_y, class_probabilities_train[:, 1])
    print('\nTraining:')
    print(' accuracy:', format(100 * train_accuracy, '.2f'))
    print(' AUC value:', format(100 * train_auc_score, '.2f'))

    test_predictions = classifier.predict(test_x)  # Test
    test_accuracy = metrics.accuracy_score(test_y, test_predictions)
    class_probabilities_test = classifier.predict_proba(test_x)
    test_auc_score = metrics.roc_auc_score(test_y, class_probabilities_test[:, 1])
    print('\nTesting:')
    print(' accuracy:', format(100 * test_accuracy, '.2f'))
    print(' AUC value:', format(100 * test_auc_score, '.2f'))
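`bow.bag_of_words` is unpacked into a feature matrix and a fitted vectorizer, so a plausible sketch (again an assumption, mirroring the TF-IDF helper sketched earlier) is:

from sklearn.feature_extraction.text import CountVectorizer

def bag_of_words(documents):
    # Fit a raw-count vectorizer; return the dense matrix and the vectorizer
    vec = CountVectorizer()
    return vec.fit_transform(documents).toarray(), vec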
Example #9
def main():
    # Decision Tree
    dtree_scores = decision_tree(5)
    X, Y = extract_scores(dtree_scores)
    tree_max = max(Y)
    plt.plot(X, Y, 'r')
    plt.axis([8, 22, 0.94, 0.98])
    plt.title('Decision Tree Classifier Accuracy')
    plt.xlabel('max_depth')
    plt.ylabel('accuracy')
    plt.savefig('dtree.png')
    plt.clf()

    # Logistic Regression
    log_scores = logistic_regression(5)
    X, Y = extract_scores(log_scores)
    log_max = max(Y)
    plt.plot(X, Y, 'r')
    plt.axis([0, 1, 0.65, 0.8])
    plt.title('Logistic Regression Accuracy')
    plt.xlabel('alpha')
    plt.ylabel('accuracy')
    plt.savefig('log.png')
    plt.clf()

    # kNN
    k_scores = kNN(5)
    X, Y = extract_scores(k_scores)
    k_max = max(Y)
    plt.plot(X, Y, 'r')
    plt.axis([0, 15, 0.95, 1.0])
    plt.title('kNN Accuracy')
    plt.xlabel('neighbors')
    plt.ylabel('accuracy')
    plt.savefig('knn.png')
    plt.clf()

    # neural network
    net_scores = neural_network(5)
    X, Y = extract_scores(net_scores)
    net_max = max(Y)
    plt.plot(X, Y, 'r')
    plt.axis([0, 1.5, 0.95, 1.0])
    plt.title('Neural Network Accuracy')
    plt.xlabel('alpha')
    plt.ylabel('accuracy')
    plt.savefig('net.png')
    plt.clf()

    # Final results: logistic regression is omitted from the bar chart since
    # its best accuracy (around 0.8, see above) falls below the 0.95-1.0 range
    best = [tree_max, k_max, net_max]
    models = ['DecisionTree', 'K-NN', 'NeuralNetwork']
    x_pos = [i for i, _ in enumerate(models)]

    plt.bar(x_pos, best, color='green')
    plt.ylim(0.95, 1.0)
    plt.xlabel("Classifiers")
    plt.ylabel("Accuracy")
    plt.title("Best Accuracy from Models")

    plt.xticks(x_pos, models)

    plt.savefig('best.png')
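`extract_scores` is not shown. Judging from how `X` is plotted against `Y` above, it plausibly splits (hyperparameter value, accuracy) pairs into two lists; a hedged sketch:

def extract_scores(scores):
    # Assumes each entry is a (hyperparameter_value, accuracy) pair
    X = [param for param, _ in scores]
    Y = [accuracy for _, accuracy in scores]
    return X, Y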
Example #10
print('shuffling train')
random.shuffle(train_set)
Y_train = np.array([row[0] for row in train_set])
X_train = np.array([row[1] for row in train_set])

test_set = []
for key, value in test.items():
    for file_id, words in value.items():
        bow = np.zeros(len(vocab))
        for word in words:
            if word in vocab:
                bow[vocab[word]] += 1
            else:
                bow[vocab['</unk>']] += 1
        bow = bow / len(words)
        test_set.append([key, bow])

print('shuffling test')
random.shuffle(test_set)
Y_test = np.array([row[0] for row in test_set])
X_test = np.array([row[1] for row in test_set])

print("Calling Classifiers\n")
print("Accuracy for Naive Bayes is : ",
      naive_bayes(X_train, Y_train, X_test, Y_test))
print("Accuracy for Logistic Regression is : ",
      logistic_regression(X_train, Y_train, X_test, Y_test))
print("Accuracy for SVM is : ", svm(X_train, Y_train, X_test, Y_test))
print("Accuracy for FF Neural Net is : ", fnn(X_train, Y_train, X_test,
                                              Y_test))
# print("Accuracy for Recurrent Neural Net is : ", rnn(X_train, Y_train, X_test, Y_t))
Example #11
    # print('Random Forest:')
    # tree_count = clf.test_random_forest(analysis, 50, 10, True)
    # print('Optimal Tree Count: {}.'.format(tree_count))

    results = []
    for i in np.round(np.linspace(0, 0.9, 10), 1).tolist():
        print('Noise Amount: {}'.format(i))
        # Preprocess
        analysis = pr.Analyser(blur, oasis, i)
        # analysis.get_summaries()
        analysis.train_test()
        analysis.get_tfidf()

        # Fit logistic regression
        print('Logistic Regression:')
        logit_acc, logit_prec, logit_rec = clf.logistic_regression(
            analysis, 10)
        print('Accuracy: {}% +/- {}\nPrecision: {}% +/- {}\nRecall: {}% +/- {}'.
              format(np.round(logit_acc[0] * 100, 2),
                     np.round(100 * logit_acc[1], 2),
                     np.round(100 * logit_prec[0], 2),
                     np.round(100 * logit_prec[1], 2),
                     np.round(100 * logit_rec[0], 2),
                     np.round(100 * logit_rec[1], 2)))
        result = ['logistic', i]
        result.extend(logit_acc + logit_prec + logit_rec)
        results.append(result)
        print('{}\n'.format('-' * 80))

        # Fit Random Forest
        print('Random Forest:')
        rf_acc, rf_prec, rf_rec = clf.random_forest(analysis, 100, 10)