def grid_search_bow_custom_fold(data_h, target, ids, questionmark_features, folds=10, do_custom_folds=True):
    """Grid-search BoW n-gram range and vocabulary size with logistic regression.

    For every pair of n-gram range and ``max_features`` (80..94) a ``BoW``
    model is fitted on ``data_h``, the question-mark feature is appended with
    ``add_question_mark_feature`` and the combined matrix is scored by
    ``logistic_regression`` using L2 regularization.  The fraction of
    completed runs is printed before each run, and at the end all
    ``[score, ngram_range, max_features]`` entries are printed, sorted by
    score in descending order.

    Args:
        data_h: Input handed to ``BoW.fit`` (presumably raw texts -- confirm
            against the caller).
        target: Labels forwarded to ``logistic_regression`` and the 2D plot.
        ids: Passed to ``cv_fold_generator`` to build the custom folds.
        questionmark_features: Extra feature appended to every BoW matrix.
        folds: Fold count for ``cv_fold_generator``; also passed straight to
            ``logistic_regression`` when ``do_custom_folds`` is false.
        do_custom_folds: Use the generated custom folds instead of ``folds``.
    """
    ngram_options = [(1, 1), (1, 2), (2, 2), (1, 3), (2, 3), (3, 3)]
    feature_caps = range(80, 95)

    # The custom folds are always generated, exactly once, before the search.
    custom_folds = cv_fold_generator(ids, folds)
    cv_spec = custom_folds if do_custom_folds else folds

    penalty = 'l2'
    results = []
    finished = 0
    for ngrams in ngram_options:
        for cap in feature_caps:
            # Progress as a fraction of all (ngram, max_features) combinations.
            print(finished / (len(feature_caps) * len(ngram_options)))
            finished += 1

            vectors = BoW(ngram_range=ngrams, max_features=cap, stop_words=None).fit(data_h)

            # Only this single configuration is visualised.
            if ngrams == (1, 2) and cap == 90:
                plot_2D_data(vectors, target)

            combined = add_question_mark_feature(vectors, questionmark_features)
            score = logistic_regression(combined, target, cv_spec, penalty)
            results.append([score, ngrams, cap])

    print(sorted(results, key=lambda entry: entry[0], reverse=True))
def combined_crossval(claim_ids, target, rootdist_matrix, tf_matrix, questionmark, folds=7, do_custom_folds=True):
    """Cross-validate several classifiers on the full stacked feature matrix.

    The root-distance matrix, the question-mark feature, the PPDB alignment
    feature (from ``get_ppdb_alignment_feature``) and ``tf_matrix`` are
    stacked column-wise, plotted with ``plot_2D_data`` and then evaluated
    with logistic regression (OvR / multinomial, L1 / L2), the SVM grid
    search and naive Bayes.  Results are printed; nothing is returned.

    Args:
        claim_ids: Passed to ``cv_fold_generator`` to build custom folds.
        target: Labels for every classifier.
        rootdist_matrix: Converted to CSR before stacking.
        tf_matrix: Stacked last, unchanged.
        questionmark: Stacked as given (no conversion is applied here).
        folds: Fold count; replaced by the custom folds when
            ``do_custom_folds`` is true.
        do_custom_folds: Evaluate on the generated custom folds.
    """
    custom_folds = cv_fold_generator(claim_ids, folds)

    feature_blocks = (
        sparse.csr_matrix(rootdist_matrix),
        questionmark,
        sparse.csr_matrix(get_ppdb_alignment_feature()),
        tf_matrix,
    )
    combined_all = sparse.hstack(feature_blocks)
    plot_2D_data(combined_all, target)

    cv_spec = custom_folds if do_custom_folds else folds

    print("Classifier: ", '[accuracy,', 'f1_macro,', 'recall_macro,', 'precision_macro]')

    # (caption, penalty, multi-class scheme), evaluated in this order.
    logistic_runs = (
        ("Logistic regression ovr L1: ", 'l1', 'ovr'),
        ("Logistic regression ovr L2: ", 'l2', 'ovr'),
        ("Logistic regression multiclass L1: ", 'l1', 'multinomial'),
        ("Logistic regression multiclass L2: ", 'l2', 'multinomial'),
    )
    for caption, penalty, scheme in logistic_runs:
        print(caption, logistic_regression(combined_all, target, cv_spec, penalty, 1000000, scheme))

    print("SVM Cross-validation")
    svm_crossval_grid(combined_all, target, cv_spec)

    # Naive Bayes is given a dense copy of the features.
    print("Naive Bayes: ", naive_bayes(combined_all.toarray(), target, cv_spec))
def questionmark_only(claim_ids, target, questionmark, folds=5, do_custom_folds=True, regularization='l2'):
    """Cross-validate logistic regression on the question-mark feature alone.

    Prints a header naming the four metrics followed by whatever
    ``logistic_regression`` returns.

    Args:
        claim_ids: Passed to ``cv_fold_generator`` to build custom folds.
        target: Labels.
        questionmark: Feature matrix used as the only input.
        folds: Fold count; used directly when ``do_custom_folds`` is false.
        do_custom_folds: Evaluate on the generated custom folds.
        regularization: Penalty name forwarded to ``logistic_regression``.
    """
    custom_folds = cv_fold_generator(claim_ids, folds)
    print('accuracy', 'f1_macro', 'recall_macro', 'precision_macro')

    cv_spec = custom_folds if do_custom_folds else folds
    print(logistic_regression(questionmark, target, cv_spec, regularization, 1000000))
def bow_rootdist(claim_ids, target, rootdist_matrix, tf_matrix, folds=5, do_custom_folds=True, regularization='l2'):
    """Cross-validate logistic regression on root-distance + term features.

    The root-distance matrix (converted to CSR) and ``tf_matrix`` are stacked
    column-wise, plotted with ``plot_2D_data`` and scored.  A metric header
    and the result of ``logistic_regression`` are printed.

    Args:
        claim_ids: Passed to ``cv_fold_generator`` to build custom folds.
        target: Labels.
        rootdist_matrix: Root-distance features; converted to CSR.
        tf_matrix: Term features stacked to the right of the root distances.
        folds: Fold count; used directly when ``do_custom_folds`` is false.
        do_custom_folds: Evaluate on the generated custom folds.
        regularization: Penalty name forwarded to ``logistic_regression``.
    """
    custom_folds = cv_fold_generator(claim_ids, folds)

    combined_all = sparse.hstack((sparse.csr_matrix(rootdist_matrix), tf_matrix))
    plot_2D_data(combined_all, target)

    print('accuracy', 'f1_macro', 'recall_macro', 'precision_macro')
    cv_spec = custom_folds if do_custom_folds else folds
    print(logistic_regression(combined_all, target, cv_spec, regularization, 1000000))
def crossval_rootdist(data, target, ids, questionmark_features=None, folds=10, do_custom_folds=True):
    """Cross-validate logistic regression on root-distance features.

    ``data`` is converted to CSR; when ``questionmark_features`` is supplied
    it is appended through ``add_question_mark_feature``.  A metric header and
    the result of ``logistic_regression`` are printed.

    Args:
        data: Root-distance features (anything ``sparse.csr_matrix`` accepts).
        target: Labels.
        ids: Passed to ``cv_fold_generator`` to build custom folds.
        questionmark_features: Optional extra feature; skipped when ``None``.
        folds: Fold count; used directly when ``do_custom_folds`` is false.
        do_custom_folds: Evaluate on the generated custom folds.
    """
    custom_folds = cv_fold_generator(ids, folds)

    combined = sparse.csr_matrix(data)
    if questionmark_features is not None:
        combined = add_question_mark_feature(combined, questionmark_features)

    print('accuracy', 'f1_macro', 'recall_macro', 'precision_macro')
    cv_spec = custom_folds if do_custom_folds else folds
    print(logistic_regression(combined, target, cv_spec))
def getModel():
    """Build a logistic-regression model from the Chicago hotel reviews.

    Loads reviews and scores through ``yelp_parser``, turns the reviews into
    a bag-of-words matrix and hands matrix and scores to
    ``classification.logistic_regression``.

    Returns:
        A ``(model, vectorizer)`` pair: the value returned by
        ``classification.logistic_regression`` and the second value returned
        by ``functions.create_bow_from_reviews``.
    """
    print('getting model')
    raw_reviews, scores = yelp_parser.get_chi_hotel_review_score_list()
    bow_matrix, vec = functions.create_bow_from_reviews(raw_reviews)

    print('first reviews')
    print(bow_matrix.shape)

    return classification.logistic_regression(bow_matrix, scores), vec
def run_classifiers_with_doc2vec(reviews, scores, review_lengths, with_features=False):
    """Train and evaluate four classifiers on doc2vec features.

    The corpus built by ``embeddings.get_corpus`` (expected to yield
    TaggedDocument objects) is truncated to its first 20000 entries, split
    75/25 with a fixed seed, and a doc2vec model is created from the training
    part.  Feature/label lists are produced by ``get_train_lists`` and
    ``get_test_lists``; each classifier is then run through the
    train-and-evaluate helpers in ``functions`` for both splits.

    Args:
        reviews: Review texts.
        scores: Review scores, passed to ``embeddings.get_corpus``.
        review_lengths: Passed to ``get_train_lists`` and, when
            ``with_features`` is true, to ``add_length_review_feature``.
        with_features: Also append the POS and review-length features to the
            training and testing matrices.
    """
    corpus = list(embeddings.get_corpus(reviews, scores))[:20000]
    train_corpus, test_corpus = train_test_split(corpus, test_size=0.25, random_state=42)
    doc2vec_model = embeddings.create_doc2vec_model(train_corpus)

    # Note the naming: the "targets" hold each document's words and the
    # "regressors" hold its first tag.
    train_targets, train_regressors = zip(*[(doc.words, doc.tags[0]) for doc in train_corpus])
    test_targets, test_regressors = zip(*[(doc.words, doc.tags[0]) for doc in test_corpus])

    # Per the original notes, a feature vector is inferred for every review
    # with doc2vec_model.infer_vector (done inside the helpers below).
    train_x, train_y = get_train_lists(doc2vec_model, train_targets, train_regressors, review_lengths)
    test_x, test_y = get_test_lists(doc2vec_model, test_targets, test_regressors)

    # Optionally extend both feature matrices with the extra features.
    if with_features:
        prp_list = functions.create_pos_features(reviews)
        train_x = add_length_review_feature(functions.add_pos_feature(train_x, prp_list), review_lengths)
        test_x = add_length_review_feature(functions.add_pos_feature(test_x, prp_list), review_lengths)

    named_classifiers = [
        ("Logistic Regression\n", classification.logistic_regression(train_x, train_y)),
        ("K Nearest Neighbors\n", classification.knearest_neighbors(train_x, train_y)),
        ("Decision Trees\n", classification.decision_trees(train_x, train_y)),
        ("Random Forest\n", classification.random_forest(train_x, train_y)),
    ]

    for heading, classifier in named_classifiers:
        print("-------------------------------------------------")
        print(heading)
        # Train and predict on this classifier for both training and testing data.
        functions.train_classifier_and_evaluate_accuracy_on_training_data(
            classifier, train_x, train_y)
        functions.train_classifier_and_evaluate_accuracy_on_testing_data(
            classifier, test_x, test_y)
        print('\n\n')
def crossval_grid_search(target, ids, min_rootdist=1, max_rootdist=200, step=1, ppdb=None, questionmark_features=None, bow=None, folds=10):
    """Grid-search the default root-distance score with cross-validated logistic regression.

    For every score in ``range(min_rootdist, max_rootdist + 1, step)`` the
    root-distance matrix from ``get_rootdist_matrix`` is stacked with the
    optional feature blocks and evaluated by ``logistic_regression`` (L2) on
    the custom folds.  The best value of each metric is printed and all four
    metric curves are plotted against the score.

    Args:
        target: Labels.
        ids: Passed to ``cv_fold_generator`` to build the custom folds.
        min_rootdist: First score tried.
        max_rootdist: Last score tried (inclusive).
        step: Increment between scores.
        ppdb: Optional feature block; skipped when ``None``.
        questionmark_features: Optional feature block; skipped when ``None``.
        bow: Optional feature block; skipped when ``None``.
        folds: Fold count handed to ``cv_fold_generator``.

    Returns:
        A list of ``[metrics, score]`` entries in search order, where
        ``metrics`` is the value returned by ``logistic_regression`` (indexed
        here as accuracy, f1, recall, precision).  The list is empty when the
        search range is empty.
    """
    default_score = range(min_rootdist, max_rootdist + 1, step)
    custom_folds = cv_fold_generator(ids, folds)

    # sparse.hstack raises on a column that is entirely None, so the optional
    # blocks (which default to None) must be dropped rather than passed on.
    # Stacking order is unchanged: rootdist, question mark, bow, ppdb.
    optional_blocks = [
        block for block in (questionmark_features, bow, ppdb) if block is not None
    ]

    regularization = 'l2'
    res = []
    for count, i in enumerate(default_score):
        data = sparse.csc_matrix(get_rootdist_matrix(i))
        print("At ", round((count * 100.0) / (len(default_score)), 2), "%")
        combined = sparse.hstack([data] + optional_blocks)
        res.append([
            logistic_regression(combined, target, custom_folds, regularization),
            i
        ])

    if not res:
        # Empty search range: there is nothing to summarise or plot.
        return res

    # (short name used in the printout, legend label, index into the metrics)
    metric_specs = (
        ("acc", 'Accuracy', 0),
        ("f1", 'F1-Score', 1),
        ("recall", 'Recall', 2),
        ("precision", 'Precision', 3),
    )
    # One (n, 2) array per metric: column 0 is the metric, column 1 the score.
    curves = [
        (short_name, legend_label, np.asarray([[a[0][column], a[1]] for a in res]))
        for short_name, legend_label, column in metric_specs
    ]

    for short_name, _, values in curves:
        print("Max %s without question at default_dist: " % short_name,
              values[np.argmax(values[:, 0]), 1], " ", np.max(values[:, 0]))

    for _, legend_label, values in curves:
        plt.plot(values[:, 1], values[:, 0], label=legend_label)
    plt.legend()
    plt.xlabel("Default rootdist score")
    plt.ylabel("Accuracy")
    plt.show()
    return res
def rmlr(train_path, test_path, top_k, token_size, model_type):
    """Train on the reviews in ``train_path`` and predict for ``test_path``.

    Args:
        train_path: Training file read through ``fr.json_reader``.
        test_path: Test file read through ``fr.json_reader``.
        top_k: Forwarded to ``fr.json_reader`` and ``fr.get_dict``
            (presumably the number of top tokens kept -- confirm).
        token_size: Forwarded to ``fe.feature_matrix`` and the trainer.
        model_type: Forwarded to ``fe.feature_matrix``.
    """
    # --- training data -> feature matrix ---------------------------------
    train_tokens, train_stars, train_ratings = fr.json_reader(train_path, top_k, "train")
    token_list, train_df = fr.get_dict(train_tokens, top_k)
    train_mtx = fe.feature_matrix(train_tokens, token_list, train_df, token_size, model_type)

    # --- fit the model ----------------------------------------------------
    k_size = 250
    model_mtx = cf.logistic_regression(train_mtx, train_stars, train_ratings, k_size, token_size)

    # --- test data -> feature matrix ---------------------------------------
    test_tokens = fr.json_reader(test_path, top_k, "test")
    # Only the test document frequencies are used; the token list stays the
    # one derived from the training set.
    _, test_df = fr.get_dict(test_tokens, top_k)
    test_mtx = fe.feature_matrix(test_tokens, token_list, test_df, token_size, model_type)

    # --- predict ------------------------------------------------------------
    predict(model_mtx, test_mtx)
def run_classifiers_with_bow(reviews, scores, review_lengths, with_features=False):
    """Train and evaluate four classifiers on bag-of-words features.

    The reviews are vectorised with ``functions.create_bow_from_reviews`` and
    split 75/25 with a fixed seed.  Each classifier is then run through the
    train-and-evaluate helpers in ``functions`` for both splits.

    Args:
        reviews: Review texts.
        scores: Labels, split together with the features.
        review_lengths: Used by ``add_length_review_feature`` when
            ``with_features`` is true.
        with_features: Also append the part-of-speech and review-length
            features to the training and testing matrices.
    """
    X, vectorizer = functions.create_bow_from_reviews(reviews)
    train_x, test_x, train_y, test_y = train_test_split(X, scores, test_size=0.25, random_state=42)

    # Optionally extend both feature matrices with the extra features.
    if with_features:
        # Part-of-speech feature vector for each review.
        prp_list = functions.create_pos_features(reviews)
        # POS and review-length features for the training matrix ...
        train_x = add_length_review_feature(functions.add_pos_feature(train_x, prp_list), review_lengths)
        # ... and the same for the testing matrix.
        test_x = add_length_review_feature(functions.add_pos_feature(test_x, prp_list), review_lengths)

    # Every classifier is created from the training features and labels.
    named_classifiers = [
        ("Logistic Regression\n", classification.logistic_regression(train_x, train_y)),
        ("K Nearest Neighbors\n", classification.knearest_neighbors(train_x, train_y)),
        ("Decision Trees\n", classification.decision_trees(train_x, train_y)),
        ("Random Forest\n", classification.random_forest(train_x, train_y)),
    ]

    for heading, classifier in named_classifiers:
        print("-------------------------------------------------")
        print(heading)
        # Train and predict on this classifier for both training and testing data.
        functions.train_classifier_and_evaluate_accuracy_on_training_data(
            classifier, train_x, train_y)
        functions.train_classifier_and_evaluate_accuracy_on_testing_data(
            classifier, test_x, test_y)
        print('\n\n')
def hyperparam_bow(data, target):
    """Sweep the BoW vocabulary size (80..119) with unigram+bigram features.

    Each setting is fitted with ``BoW`` and scored by ``logistic_regression``
    with ``10`` as its third argument.  The results are plotted with
    ``plot_hyperparam_bow`` and printed sorted by score, highest first.

    Args:
        data: Input handed to ``BoW.fit``.
        target: Labels.
    """
    max_features = range(80, 120)

    # One [score, max_features] entry per vocabulary size, in sweep order.
    res = [
        [logistic_regression(BoW(ngram_range=(1, 2), max_features=cap).fit(data), target, 10), cap]
        for cap in max_features
    ]

    plot_hyperparam_bow(res, max_features)
    print(sorted(res, key=lambda entry: entry[0], reverse=True))
def test_logistic_regression():
    """Fit ``logistic_regression`` on a noisy straight line and plot the result.

    The data set follows an example taken from scikit-learn: 100 evenly
    spaced points on [-5, 5] with ``y = 2 + 0.5 * x`` plus standard Gaussian
    noise.  The samples are drawn as a scatter plot and ``X . beta + u`` is
    drawn on top of them.
    """
    xmin, xmax = -5, 5
    n_samples = 100

    grid = np.linspace(xmin, xmax, n_samples)
    # Column vector of inputs: one single-feature row per sample.
    X = np.array([[value] for value in grid])
    noise = np.random.randn(n_samples, 1).ravel()
    Y = np.array(2 + 0.5 * grid + noise)

    beta, u = logistic_regression(X, Y)

    plt.scatter(X, Y, color='black')
    fitted = np.dot(X, beta) + u
    plt.plot(X, fitted, linewidth=1, color='#FF9C34')
    plt.show()
def grid_search_bow(data, target):
    """Grid-search BoW n-gram range and vocabulary size.

    Every combination is fitted with ``BoW`` and scored by
    ``logistic_regression`` with ``10`` as its third argument.  The results
    are plotted with ``plot_grid_search_bow`` and printed sorted by score,
    highest first.

    Args:
        data: Input handed to ``BoW.fit``.
        target: Labels.
    """
    ngram_range = [(1, 1), (1, 2), (1, 3), (2, 2), (2, 3), (3, 3)]
    max_features = [5, 10, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000]

    # One [score, ngram_range, max_features] entry per combination; the
    # n-gram range is the outer loop, the vocabulary size the inner one.
    res = [
        [logistic_regression(BoW(ngram_range=ngrams, max_features=cap).fit(data), target, 10), ngrams, cap]
        for ngrams in ngram_range
        for cap in max_features
    ]

    plot_grid_search_bow(res, ngram_range, max_features)
    print(sorted(res, key=lambda entry: entry[0], reverse=True))
def plot(X, Y, XtA, title="ClassificationA.png"):
    """Plot a two-class data set with three linear decision boundaries.

    The samples of class 0 and class 1 are drawn from the first two columns
    of ``X``; the boundaries of the LDA, logistic-regression and
    linear-regression models fitted on ``(X, Y)`` are drawn on top.  The
    figure is written to ``title`` and then shown.

    Args:
        X: Samples; columns 0 and 1 are plotted.
        Y: Labels (0 / 1), either flat or as a single column.
        XtA: Test samples passed to the ``*_predict`` helpers (the
            predictions are not used by this function).
        title: Output path for the saved figure.
    """
    fig = plt.figure()
    colors = ['#4EACC5', '#FF9C34', '#aaaaaa', '#4E9A06', '#00465F', "#7E2007"]
    ax = fig.add_subplot(1, 1, 1)

    # Each artist carries its own label.  The previous positional label list
    # had one entry more ('unknown') than there are artists, which shifted
    # every legend entry by one and dropped 'linear regression'.
    for cls in (0, 1):
        members = Y == cls
        # Flatten an (n, 1) mask to (n,) so it can index the rows of X.
        members = members.reshape(members.shape[0])
        ax.plot(X[members, 0], X[members, 1], 'w',
                markerfacecolor=colors[cls], marker='.',
                label='label %d' % cls)

    xs = np.linspace(X.min(), X.max(), 100)

    def _draw_boundary(beta, u, level, color, label):
        # Line where beta[0] * x + beta[1] * y + u == level.
        ys = (level - u - beta[0] * xs) * 1 / beta[1]
        ax.plot(xs, ys, color=color, linewidth=1, label=label)

    # The predict calls are kept exactly as before although their results
    # are unused here -- NOTE(review): drop them if they have no side effects.
    beta, u = classification.LDA(X, Y)
    classification.logistic_regression_predict(XtA, beta, u)
    _draw_boundary(beta, u, 0, colors[3], 'LDA model')

    beta, u = classification.logistic_regression(X, Y, verbose=False)
    _draw_boundary(beta, u, 0, colors[4], 'logistic regression')
    classification.logistic_regression_predict(XtA, beta, u)

    beta, u = classification.linear_regression(X, Y)
    classification.linear_regression_predict(XtA, beta, u)
    # The linear-regression boundary sits at the 0.5 level instead of 0.
    _draw_boundary(beta, u, 0.5, colors[5], 'linear regression')

    legend = ax.legend(loc=(0.9, .95), labelspacing=0.1)
    plt.setp(legend.get_texts(), fontsize='small')

    # Save before showing: once show() returns the figure may already have
    # been torn down, which leaves savefig() writing an empty canvas.
    fig.savefig(title)
    plt.show()
# NOTE(review): Python 2 script fragment (print statements).  XA, YA, XtA and
# YtA are defined earlier in the script, outside this view.

# Load the test sets for data sets B and C.
XtB, YtB = load_data('classificationB.test')
XtC, YtC = load_data('classificationC.test')

# Data set A
print "Jeu de données A"
print "****************"
print

# LDA: fit on the training set, predict on the test set, report the error.
# The LDA parameters are passed to the logistic-regression predict helper.
beta, u = classification.LDA(XA, YA)
YtcA = classification.logistic_regression_predict(XtA, beta, u)
erreur = classification.error(YtcA, YtA)
print "Jeu de test A - Modèle LDA: erreur %s" % erreur

# Logistic regression
beta, u = classification.logistic_regression(XA, YA, verbose=False)
YtcA = classification.logistic_regression_predict(XtA, beta, u)
erreur = classification.error(YtcA, YtA)
print "Jeu de test A - Regression logisitique: erreur %s" % erreur

# Linear regression
beta, u = classification.linear_regression(XA, YA)
YtcA = classification.linear_regression_predict(XtA, beta, u)
erreur = classification.error(YtcA, YtA)
print "Jeu de test A - Regression linéaire: erreur %s" % erreur

# Data set B
print
print
print "Jeu de données B"
X, vectorizer = functions.create_bow_from_reviews(reviews, scores) # Adding length of a each review feature print("After adding length review feature") X = functions.add_length_review_feature(X, length_of_reviews) print(X) # Adding Part of Speech Tag Feature print("After adding Part of Speech Tag feature") prp_list = functions.create_pos_features(reviews) X = functions.add_pos_feature(X, prp_list) print(X) # Logistic Regression # -------------------------------------------- classifier = classification.logistic_regression(X, scores) # Naive Bayes # -------------------------------------------- classifier = classification.naive_bayes(X, scores) # K Nearest Neighbors # -------------------------------------------- classifier = classification.knearest_neighbors(X, scores) # Decision Trees # -------------------------------------------- classifier = classification.decision_trees(X, scores) # Random Forests # --------------------------------------------
def logistic_regression(input_dict):
    '''Build a logistic regression classifier from a widget-style input dict.

    Reads the ``"pen_in"`` and ``"c_in"`` entries of ``input_dict``, passes
    them (in that order) to ``c.logistic_regression`` and returns the result
    wrapped in a dict under the key ``'LR_out'``.
    '''
    penalty = input_dict["pen_in"]
    c_value = input_dict["c_in"]
    return {'LR_out': c.logistic_regression(penalty, c_value)}
# # IRLS # import numpy as np import pylab as pl from classification import logistic_regression from utils import load_data verbose = True max_iter = 500 X, Y = load_data('classificationA.train') beta, u = logistic_regression(X, Y) # Plot fig = pl.figure(1) colors = ['#4EACC5', '#FF9C34', '#4E9A06'] my_members = Y == 0 my_members.shape = (my_members.shape[0]) ax = fig.add_subplot(1, 1, 1) ax.plot(X[my_members, 0], X[my_members, 1], 'w', markerfacecolor=colors[0], marker = '.') my_members = Y == 1 my_members.shape = (my_members.shape[0]) ax.plot(X[my_members, 0], X[my_members, 1], 'w', markerfacecolor=colors[1], marker = '.') x_beta = [[i] for i in np.linspace(X.min(), X.max(), 100)] y_beta = (- u - beta[0] * np.linspace(X.min(), X.max(), 100)) * 1 / beta[1]
import chinese as dianping

# Script entry point: build a bag-of-words matrix for the dianping reviews
# and hand it to each classifier in turn.
# NOTE(review): `classification` is not imported in the visible part of the
# file -- confirm it is imported above this fragment.
if __name__ == '__main__':
    print('Running Classifiers for dianping dataset')
    print("Does not include extra features")
    print("Using Bag of Words")
    print('------------------------------------------')

    # Stop words, labelled reviews and the resulting bag-of-words matrix.
    stop = dianping.gather_stopwords()
    labels, reviews = dianping.read_chinese()
    BOW, vec = dianping.chinese_BOW(reviews, stop)

    # Each call below rebinds `classifier`; only the last assignment survives.

    # Logistic Regression
    # --------------------------------------------
    classifier = classification.logistic_regression(BOW, labels)

    # Naive Bayes
    # --------------------------------------------
    classifier = classification.naive_bayes(BOW, labels)

    # K Nearest Neighbors
    # --------------------------------------------
    classifier = classification.knearest_neighbors(BOW, labels)

    # Decision Trees
    # --------------------------------------------
    classifier = classification.decision_trees(BOW, labels)

    # Random Forests
    # --------------------------------------------
# NOTE(review): Python 2 fragment.  The indented statements below are the tail
# of a function whose `def` line is outside this view (presumably
# get_numpy_data, given the call that follows -- confirm).
    # Prepend a constant 'intercept' column to the selected features.
    data_sframe['intercept'] = 1
    features = ['intercept'] + features
    # Convert the selected feature columns and the label column to NumPy.
    features_sframe = data_sframe[features]
    feature_matrix = features_sframe.to_numpy()
    label_sarray = data_sframe[label]
    label_array = label_sarray.to_numpy()
    return(feature_matrix, label_array)

feature_matrix, sentiment = get_numpy_data(products, important_words, 'sentiment')

# The values computed above are immediately replaced by the arrays stored in
# the .npz file.
arrays = np.load('module-3-assignment-numpy-arrays.npz')
feature_matrix, sentiment = arrays['feature_matrix'], arrays['sentiment']

# NOTE(review): prints only the caption; no feature count follows it.
print "Q3: How many features: "

# Fit the coefficients, starting from 194 zeros.
coefficients = logistic_regression(feature_matrix, sentiment, initial_coefficients=np.zeros(194), step_size=1e-7, max_iter=301)

# Score = dot product of each feature row with the coefficients; a strictly
# positive score is classified +1, anything else -1.
scores = np.dot(feature_matrix, coefficients)
class_prediction = []
positive = 0
for score in scores:
    if score > 0:
        class_prediction.append(1)
        positive = positive + 1
    else:
        class_prediction.append(-1)
print "Q6: Positive reviews: ", positive

num_mistakes = 0