Example no. 1
def grid_search_bow_custom_fold(data_h, target, ids, questionmark_features, folds=10, do_custom_folds=True):
    ngram_range = [(1, 1), (1, 2), (2, 2), (1, 3), (2, 3), (3, 3)]
    max_features = range(80, 95)
    custom_folds = cv_fold_generator(ids, folds)
    res = []
    count = 0
    for i in ngram_range:
        for j in max_features:
            print(count / (len(max_features) * len(ngram_range)))
            count += 1
            bow = BoW(ngram_range=i, max_features=j, stop_words=None)
            x = bow.fit(data_h)
            if i == (1, 2) and j == 90:
                plot_2D_data(x, target)

            combined = add_question_mark_feature(x, questionmark_features)
            regularization = 'l2'
            if do_custom_folds:
                res.append([logistic_regression(combined, target, custom_folds, regularization), i, j])
            else:
                res.append([logistic_regression(combined, target, folds, regularization), i, j])

    print(sorted(res, key=lambda x: x[0], reverse=True))
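Nearly every example in this listing builds its cross-validation splits with cv_fold_generator(ids, folds), which is never shown. A minimal sketch, assuming its job is to keep all samples that share a claim id inside the same fold (so related claims never leak between train and test); the round-robin grouping below is an assumption, not the project's actual code:

import numpy as np

def cv_fold_generator(ids, folds):
    # Hypothetical sketch: returns a list of (train_indices, test_indices)
    # pairs, which scikit-learn accepts directly as a `cv` argument.
    ids = np.asarray(ids)
    unique_ids = np.unique(ids)
    # Assign each unique id to one of `folds` buckets, round-robin.
    id_to_fold = {uid: k % folds for k, uid in enumerate(unique_ids)}
    sample_fold = np.array([id_to_fold[i] for i in ids])
    return [(np.where(sample_fold != k)[0], np.where(sample_fold == k)[0])
            for k in range(folds)]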
Example no. 2
def combined_crossval(claim_ids, target, rootdist_matrix, tf_matrix, questionmark, folds=7, do_custom_folds=True):
    custom_folds = cv_fold_generator(claim_ids, folds)
    rootdist_feature = sparse.csr_matrix(rootdist_matrix)
    questionmark_feature = questionmark
    ppdb_alignment_feature = sparse.csr_matrix(get_ppdb_alignment_feature())

    combined_all = sparse.hstack((
        rootdist_feature,
        questionmark_feature,
        ppdb_alignment_feature,
        tf_matrix
    ))
    plot_2D_data(combined_all, target)

    if do_custom_folds:
        folds = custom_folds

    print("Classifier: ", '[accuracy,', 'f1_macro,', 'recall_macro,', 'precision_macro]')
    print("Logistic regression ovr L1: ", logistic_regression(combined_all, target, folds, 'l1', 1000000, 'ovr'))
    print("Logistic regression ovr L2: ", logistic_regression(combined_all, target, folds, 'l2', 1000000, 'ovr'))
    print("Logistic regression multiclass L1: ", logistic_regression(combined_all, target, folds, 'l1', 1000000, 'multinomial'))
    print("Logistic regression multiclass L2: ", logistic_regression(combined_all, target, folds, 'l2', 1000000, 'multinomial'))
    print("SVM Cross-validation")
    svm_crossval_grid(combined_all, target, folds)
    print("Naive Bayes: ", naive_bayes(combined_all.toarray(), target, folds))
Example no. 3
def questionmark_only(claim_ids, target, questionmark, folds=5, do_custom_folds=True, regularization='l2'):
    custom_folds = cv_fold_generator(claim_ids, folds)
    print('accuracy', 'f1_macro', 'recall_macro', 'precision_macro')
    if do_custom_folds:
        print(logistic_regression(questionmark, target, custom_folds, regularization, 1000000))
    else:
        print(logistic_regression(questionmark, target, folds, regularization, 1000000))
Example no. 4
def bow_rootdist(claim_ids, target, rootdist_matrix, tf_matrix, folds=5, do_custom_folds=True, regularization='l2'):
    custom_folds = cv_fold_generator(claim_ids, folds)
    data_sparse = sparse.csr_matrix(rootdist_matrix)
    combined_all = sparse.hstack((data_sparse, tf_matrix))
    plot_2D_data(combined_all, target)

    print('accuracy', 'f1_macro', 'recall_macro', 'precision_macro')
    if do_custom_folds:
        print(logistic_regression(combined_all, target, custom_folds, regularization, 1000000))
    else:
        print(logistic_regression(combined_all, target, folds, regularization, 1000000))
Example no. 5
def crossval_rootdist(data,
                      target,
                      ids,
                      questionmark_features=None,
                      folds=10,
                      do_custom_folds=True):
    custom_folds = cv_fold_generator(ids, folds)
    data = sparse.csr_matrix(data)
    if questionmark_features is not None:
        combined = add_question_mark_feature(data, questionmark_features)
    else:
        combined = data
    print('accuracy', 'f1_macro', 'recall_macro', 'precision_macro')
    if do_custom_folds:
        print(logistic_regression(combined, target, custom_folds))
    else:
        print(logistic_regression(combined, target, folds))
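add_question_mark_feature is also not shown in these snippets. Given how it is called, it most plausibly just appends the question-mark column(s) to an existing sparse matrix; a one-line sketch offered as an assumption, not the project's definition:

from scipy import sparse

def add_question_mark_feature(data, questionmark_features):
    # Hypothetical sketch: append the question-mark column(s) to `data`.
    return sparse.hstack((data, questionmark_features)).tocsr()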
Example no. 6
def getModel():
    print('getting model')
    reviews, scores = yelp_parser.get_chi_hotel_review_score_list()
    reviews, vec = functions.create_bow_from_reviews(reviews)
    print('first reviews')
    print(reviews.shape)
    logistic = classification.logistic_regression(reviews, scores)
    return logistic, vec
Example no. 7
def run_classifiers_with_doc2vec(reviews,
                                 scores,
                                 review_lengths,
                                 with_features=False):
    '''Corpus should be an array of TaggedDocument objects.'''
    corpus = list(embeddings.get_corpus(reviews, scores))[:20000]
    train_corpus, test_corpus = train_test_split(corpus,
                                                 test_size=0.25,
                                                 random_state=42)

    doc2vec_model = embeddings.create_doc2vec_model(train_corpus)
    # Note: despite the names, 'targets' ends up holding the token lists and
    # 'regressors' the labels (doc.tags[0]).
    train_targets, train_regressors = zip(*[(doc.words, doc.tags[0])
                                            for doc in train_corpus])
    test_targets, test_regressors = zip(*[(doc.words, doc.tags[0])
                                          for doc in test_corpus])
    '''
    For every review, we apply doc2vec_model.infer_vector(review). This creates
    a feature vector for every document (in our case, review) in the corpus.
    '''
    train_x, train_y = get_train_lists(doc2vec_model, train_targets,
                                       train_regressors, review_lengths)
    test_x, test_y = get_test_lists(doc2vec_model, test_targets,
                                    test_regressors)
    '''
    When the 'with_features' parameter=True, we add our extra features to the
    existing feature matrix.
    '''
    if with_features:
        prp_list = functions.create_pos_features(reviews)
        train_x = functions.add_pos_feature(train_x, prp_list)
        train_x = add_length_review_feature(train_x, review_lengths)
        test_x = functions.add_pos_feature(test_x, prp_list)
        test_x = add_length_review_feature(test_x, review_lengths)

    logistic_reg = classification.logistic_regression(train_x, train_y)
    k_nearest_n = classification.knearest_neighbors(train_x, train_y)
    decision_trees = classification.decision_trees(train_x, train_y)
    random_forest = classification.random_forest(train_x, train_y)

    classifiers = [("Logistic Regression", logistic_reg),
                   ("K Nearest Neighbors", k_nearest_n),
                   ("Decision Trees", decision_trees),
                   ("Random Forest", random_forest)]

    for name, clf in classifiers:
        print("-------------------------------------------------")
        print(name + "\n")
        '''Train and predict on the classifier for both training and testing data.'''
        functions.train_classifier_and_evaluate_accuracy_on_training_data(
            clf, train_x, train_y)
        functions.train_classifier_and_evaluate_accuracy_on_testing_data(
            clf, test_x, test_y)
        print('\n\n')
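get_train_lists and get_test_lists are not shown either. Based on the docstring above, they presumably call doc2vec_model.infer_vector on each token list to turn documents into fixed-size feature vectors; a minimal sketch under that assumption (ignoring the extra review_lengths argument the real train helper also receives):

def get_test_lists(doc2vec_model, targets, regressors):
    # Hypothetical sketch: one inferred Doc2Vec vector per document.
    # `targets` holds token lists and `regressors` the labels, mirroring
    # the naming in the example above.
    features = [doc2vec_model.infer_vector(words) for words in targets]
    return features, list(regressors)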
Example no. 8
def crossval_grid_search(target,
                         ids,
                         min_rootdist=1,
                         max_rootdist=200,
                         step=1,
                         ppdb=None,
                         questionmark_features=None,
                         bow=None,
                         folds=10):
    default_score = range(min_rootdist, max_rootdist + 1, step)
    res = []
    count = 0
    custom_folds = cv_fold_generator(ids, folds)
    for i in default_score:
        data = sparse.csc_matrix(get_rootdist_matrix(i))
        print("At ", round((count * 100.0) / (len(default_score)), 2), "%")
        count += 1
        combined = sparse.hstack((data, questionmark_features, bow, ppdb))

        regularization = 'l2'
        res.append([
            logistic_regression(combined, target, custom_folds,
                                regularization), i
        ])

    acc = np.asarray([[a[0][0], a[1]] for a in res])
    f1 = np.asarray([[a[0][1], a[1]] for a in res])
    recall = np.asarray([[a[0][2], a[1]] for a in res])
    precision = np.asarray([[a[0][3], a[1]] for a in res])
    print("Max acc without question at default_dist: ",
          acc[np.argmax(acc[:, 0]), 1], " ", np.max(acc[:, 0]))
    print("Max f1 without question at default_dist: ", f1[np.argmax(f1[:, 0]),
                                                          1], " ",
          np.max(f1[:, 0]))
    print("Max recall without question at default_dist: ",
          recall[np.argmax(recall[:, 0]), 1], " ", np.max(recall[:, 0]))
    print("Max precision without question at default_dist: ",
          precision[np.argmax(precision[:, 0]), 1], " ", np.max(precision[:,
                                                                          0]))
    plt.plot(acc[:, 1], acc[:, 0], label='Accuracy')
    plt.plot(f1[:, 1], f1[:, 0], label='F1-Score')
    plt.plot(recall[:, 1], recall[:, 0], label='Recall')
    plt.plot(precision[:, 1], precision[:, 0], label='Precision')
    plt.legend()
    plt.xlabel("Default rootdist score")
    plt.ylabel("Accuracy")
    plt.show()

    return res
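For orientation, a hypothetical invocation of the sweep above; target, claim_ids, and the three feature blocks are assumed to come from the same preprocessing as the earlier examples:

res = crossval_grid_search(target, claim_ids,
                           min_rootdist=1, max_rootdist=200, step=1,
                           ppdb=ppdb_feature,
                           questionmark_features=questionmark,
                           bow=tf_matrix,
                           folds=10)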
Example no. 9
def rmlr(train_path, test_path, top_k, token_size, model_type):
    # read the training file
    review_token, review_star, review_rating = fr.json_reader(train_path, top_k, "train")
    # collect all unigrams and bigrams in the reviews and keep the top 1000
    token_list, df_dict = fr.get_dict(review_token, top_k)
    # get the unigram and bigram data matrix
    train_mtx = fe.feature_matrix(review_token, token_list, df_dict, token_size, model_type)
    # perform gradient ascent on training set, stochastic or batched
    k_size = 250
    model_mtx = cf.logistic_regression(train_mtx, review_star, review_rating, k_size, token_size)
    # read the test file
    test_token = fr.json_reader(test_path, top_k, "test")
    test_list, test_df = fr.get_dict(test_token, top_k)
    test_mtx = fe.feature_matrix(test_token, token_list, test_df, token_size, model_type)
    # predict the result
    predict(model_mtx, test_mtx)
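The fr/fe modules are project-specific, but the commented pipeline (build a top-k unigram-and-bigram vocabulary on the training reviews, then vectorize both splits with it) maps directly onto scikit-learn. A rough equivalent for orientation, not the author's implementation:

from sklearn.feature_extraction.text import CountVectorizer

train_reviews = ["great food and great service", "terrible wait, cold food"]
test_reviews = ["great service"]

# Top-1000 unigrams and bigrams, learned on the training reviews only.
vectorizer = CountVectorizer(ngram_range=(1, 2), max_features=1000)
train_mtx = vectorizer.fit_transform(train_reviews)
test_mtx = vectorizer.transform(test_reviews)  # reuse the training vocabulary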
Example no. 10
def run_classifiers_with_bow(reviews,
                             scores,
                             review_lengths,
                             with_features=False):
    X, vectorizer = functions.create_bow_from_reviews(reviews)
    train_x, test_x, train_y, test_y = train_test_split(X,
                                                        scores,
                                                        test_size=0.25,
                                                        random_state=42)
    '''
    When the 'with_features' parameter=True, we add our extra features to the
    existing feature matrix.
    '''
    if with_features:
        '''Create 'Part of Speech' feature vector for each review'''
        prp_list = functions.create_pos_features(reviews)
        '''Add both the POS and Review Length vectors to features'''
        train_x = functions.add_pos_feature(train_x, prp_list)
        train_x = add_length_review_feature(train_x, review_lengths)
        '''Do the same for the testing features'''
        test_x = functions.add_pos_feature(test_x, prp_list)
        test_x = add_length_review_feature(test_x, review_lengths)
    '''Create each classifier with Training Features and Training Labels.'''
    logistic_reg = classification.logistic_regression(train_x, train_y)
    k_nearest_n = classification.knearest_neighbors(train_x, train_y)
    decision_trees = classification.decision_trees(train_x, train_y)
    random_forest = classification.random_forest(train_x, train_y)

    classifiers = [("Logistic Regression", logistic_reg),
                   ("K Nearest Neighbors", k_nearest_n),
                   ("Decision Trees", decision_trees),
                   ("Random Forest", random_forest)]

    for name, clf in classifiers:
        print("-------------------------------------------------")
        print(name + "\n")
        '''Train and predict on the classifier for both training and testing data.'''
        functions.train_classifier_and_evaluate_accuracy_on_training_data(
            clf, train_x, train_y)
        functions.train_classifier_and_evaluate_accuracy_on_testing_data(
            clf, test_x, test_y)
        print('\n\n')
Example no. 11
def hyperparam_bow(data, target):
    max_features = range(80, 120)
    res = []

    for i in max_features:
        bow = BoW(ngram_range=(1, 2), max_features=i)

        d = bow.fit(data)

        r = logistic_regression(d, target, 10)
        res.append([r, i])

    plot_hyperparam_bow(res, max_features)

    print(sorted(res, key=lambda x: x[0], reverse=True))
Example no. 12
def test_logistic_regression():
    """
    Test set "stolen" from scikit learn
    """
    # This is our test set: a straight line with Gaussian noise.
    xmin, xmax = -5, 5
    n_samples = 100
    X = np.array([[i] for i in np.linspace(xmin, xmax, n_samples)])
    Y = (2 + 0.5 * np.linspace(xmin, xmax, n_samples)
         + np.random.randn(n_samples))

    beta, u = logistic_regression(X, Y)

    plt.scatter(X, Y, color='black')
    plt.plot(X, np.dot(X, beta) + u, linewidth=1, color='#FF9C34')
    plt.show()
Example no. 13
def grid_search_bow(data, target):
    ngram_range = [(1, 1), (1, 2), (1, 3), (2, 2), (2, 3), (3, 3)]
    max_features = [5, 10, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000]
    res = []

    for i in ngram_range:
        for j in max_features:
            bow = BoW(ngram_range=i, max_features=j)

            d = bow.fit(data)

            r = logistic_regression(d, target, 10)
            res.append([r, i, j])

    plot_grid_search_bow(res, ngram_range, max_features)

    print(sorted(res, key=lambda x: x[0], reverse=True))
Example no. 14
File: plot.py Project: NelleV/MGRPR
def plot(X, Y, XtA, title="ClassificationA.png"):
    fig = plt.figure()
    colors = ['#4EACC5', '#FF9C34', '#aaaaaa', '#4E9A06', '#00465F', "#7E2007"]
    my_members = Y == 0
    my_members.shape = (my_members.shape[0])
    ax = fig.add_subplot(1, 1, 1)

    ax.plot(X[my_members, 0], X[my_members, 1],
            'w', markerfacecolor=colors[0], marker='.')

    my_members = Y == 1
    my_members.shape = (my_members.shape[0])
    ax.plot(X[my_members, 0], X[my_members, 1],
            'w', markerfacecolor=colors[1], marker='.')


    beta, u = classification.LDA(X, Y)
    YtcA = classification.logistic_regression_predict(XtA, beta, u)
    x_beta = [[i] for i in np.linspace(X.min(), X.max(), 100)]
    y_beta = (-u - beta[0] * np.linspace(X.min(), X.max(), 100)) / beta[1]
    ax.plot(x_beta, y_beta, color=colors[3], linewidth=1)

    beta, u = classification.logistic_regression(X, Y, verbose=False)
    YtcA = classification.logistic_regression_predict(XtA, beta, u)
    x_beta = [[i] for i in np.linspace(X.min(), X.max(), 100)]
    y_beta = (-u - beta[0] * np.linspace(X.min(), X.max(), 100)) / beta[1]
    ax.plot(x_beta, y_beta, color=colors[4], linewidth=1)

    beta, u = classification.linear_regression(X, Y)
    YtcA = classification.linear_regression_predict(XtA, beta, u)
    x_beta = [[i] for i in np.linspace(X.min(), X.max(), 100)]
    y_beta = (0.5 - u - beta[0] * np.linspace(X.min(), X.max(), 100)) / beta[1]
    ax.plot(x_beta, y_beta, color=colors[5], linewidth=1)

    labels = ('unknown', 'label 0', 'label 1', 'LDA model', 'logistic regression', 'linear regression')
    legend = plt.legend(labels, loc=(0.9, .95), labelspacing=0.1)
    plt.setp(legend.get_texts(), fontsize='small')

    plt.savefig(title)  # save before show(), which would otherwise leave a blank figure
    plt.show()
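The repeated y_beta expression in this project's plotting code comes straight from the decision-boundary equation: the fitted classifier scores a point as beta[0]*x1 + beta[1]*x2 + u and separates the classes by the sign of that score, so the boundary is the set where the score equals zero:

    beta[0]*x1 + beta[1]*x2 + u = 0   =>   x2 = (-u - beta[0]*x1) / beta[1]

(the linear-regression line adds 0.5 because it thresholds the regression output at 0.5 rather than at 0). A tiny helper capturing the same algebra:

import numpy as np

def boundary_x2(x1, beta, u):
    # x2 such that beta[0]*x1 + beta[1]*x2 + u == 0
    return (-u - beta[0] * np.asarray(x1)) / beta[1]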
Example no. 15
File: ex_4.py Project: NelleV/MGRPR
XtB, YtB = load_data('classificationB.test')
XtC, YtC = load_data('classificationC.test')

# Dataset A

print("Dataset A")
print("****************")
print()

beta, u = classification.LDA(XA, YA)
YtcA = classification.logistic_regression_predict(XtA, beta, u)
erreur = classification.error(YtcA, YtA)

print("Test set A - LDA model: error %s" % erreur)

beta, u = classification.logistic_regression(XA, YA, verbose=False)
YtcA = classification.logistic_regression_predict(XtA, beta, u)
erreur = classification.error(YtcA, YtA)

print("Test set A - Logistic regression: error %s" % erreur)

beta, u = classification.linear_regression(XA, YA)
YtcA = classification.linear_regression_predict(XtA, beta, u)
erreur = classification.error(YtcA, YtA)

print("Test set A - Linear regression: error %s" % erreur)

# Dataset B
print()
print()
print("Dataset B")
Example no. 16
    X, vectorizer = functions.create_bow_from_reviews(reviews, scores)

    # Add the length of each review as a feature
    print("After adding review length feature")
    X = functions.add_length_review_feature(X, length_of_reviews)
    print(X)

    # Adding Part of Speech Tag Feature
    print("After adding Part of Speech Tag feature")
    prp_list = functions.create_pos_features(reviews)
    X = functions.add_pos_feature(X, prp_list)
    print(X)

    # Logistic Regression
    # --------------------------------------------
    classifier = classification.logistic_regression(X, scores)

    # Naive Bayes
    # --------------------------------------------
    classifier = classification.naive_bayes(X, scores)

    # K Nearest Neighbors
    # --------------------------------------------
    classifier = classification.knearest_neighbors(X, scores)

    # Decision Trees
    # --------------------------------------------
    classifier = classification.decision_trees(X, scores)

    # Random Forests
    # --------------------------------------------
Example no. 17
def logistic_regression(input_dict):
    '''Logistic regression classifier.'''
    output_dict = {}
    output_dict['LR_out'] = c.logistic_regression(input_dict["pen_in"], input_dict["c_in"])
    return output_dict
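This reads like a workflow-widget wrapper: inputs arrive in a dict and the configured classifier comes back under the 'LR_out' key. A hypothetical call, assuming 'pen_in' carries the penalty name and 'c_in' the regularization strength:

outputs = logistic_regression({"pen_in": "l2", "c_in": 1.0})
model = outputs["LR_out"]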
Example no. 18
File: ex_2.py Project: NelleV/MGRPR
#
# IRLS
#

import numpy as np
import pylab as pl

from classification import logistic_regression
from utils import load_data

verbose = True
max_iter = 500
X, Y = load_data('classificationA.train')
beta, u = logistic_regression(X, Y)

# Plot
fig = pl.figure(1)
colors = ['#4EACC5', '#FF9C34', '#4E9A06']
my_members = Y == 0
my_members.shape = (my_members.shape[0])
ax = fig.add_subplot(1, 1, 1)
ax.plot(X[my_members, 0], X[my_members, 1],
        'w', markerfacecolor=colors[0], marker='.')

my_members = Y == 1
my_members.shape = (my_members.shape[0])
ax.plot(X[my_members, 0], X[my_members, 1],
        'w', markerfacecolor=colors[1], marker='.')

x_beta = [[i] for i in np.linspace(X.min(), X.max(), 100)]
y_beta = (-u - beta[0] * np.linspace(X.min(), X.max(), 100)) / beta[1]
Example no. 19
import chinese as dianping

if __name__ == '__main__':
    print('Running Classifiers for dianping dataset')
    print("Does not include extra features")
    print("Using Bag of Words")
    print('------------------------------------------')

    stop = dianping.gather_stopwords()
    labels, reviews = dianping.read_chinese()

    BOW, vec = dianping.chinese_BOW(reviews, stop)

    # Logistic Regression
    # --------------------------------------------
    classifier = classification.logistic_regression(BOW, labels)

    # Naive Bayes
    # --------------------------------------------
    classifier = classification.naive_bayes(BOW, labels)

    # K Nearest Neighbors
    # --------------------------------------------
    classifier = classification.knearest_neighbors(BOW, labels)

    # Decision Trees
    # --------------------------------------------
    classifier = classification.decision_trees(BOW, labels)

    # Random Forests
    # --------------------------------------------
Example no. 20
    data_sframe['intercept'] = 1
    features = ['intercept'] + features
    features_sframe = data_sframe[features]
    feature_matrix = features_sframe.to_numpy()
    label_sarray = data_sframe[label]
    label_array = label_sarray.to_numpy()
    return feature_matrix, label_array

feature_matrix, sentiment = get_numpy_data(products, important_words, 'sentiment')

arrays = np.load('module-3-assignment-numpy-arrays.npz')
feature_matrix, sentiment = arrays['feature_matrix'], arrays['sentiment']

print "Q3: How many features: "

coefficients = logistic_regression(feature_matrix, sentiment, initial_coefficients=np.zeros(194),
                                   step_size=1e-7, max_iter=301)

scores = np.dot(feature_matrix, coefficients)

class_prediction = []
positive = 0
for score in scores:
    if score > 0:
        class_prediction.append(1)
        positive = positive + 1
    else:
        class_prediction.append(-1)

print "Q6: Positive reviews: ", positive

num_mistakes = 0