def main(argv):
    (X, Y) = read_Data(datafile)

    ### implement cross validation ###
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(
        X, Y, test_size=0.2, random_state=0)

    # num_class_one = sum(y_train)
    sample_weight_class_one = np.ones(len(y_train))
    for i in range(len(y_train)):
        if y_train[i, 0] == 1:
            sample_weight_class_one[i] = 1

    ### generate poly SVM model ###
    # clf = svm.SVC(kernel='poly', degree=5)
    # clf.fit(X_train, y_train[:, 0], sample_weight_class_one)
    # mkdir(outputfile)
    # joblib.dump(clf, outputfile + 'svm_poly_4.pkl')  # save model to disc

    ### load existing model ###
    clf = joblib.load(outputfile + 'svm_poly_4.pkl')

    y_pred = clf.predict(X_test)  # calculate prediction result
    cm = confusion_matrix(y_test, y_pred)
    # plot_confusion_matrix(cm)
    print '\n', 'Confusion matrix:', '\n', cm
    err_FP, err_FN, err_ALL = err_rate(cm)
    print '\n', 'Overall accuracy: ', clf.score(X_test, y_test) * 100, '%'
    print '\n', 'False positive rate: ', err_FP * 100, '%'
    print 'False negative rate: ', err_FN * 100, '%'
    print 'Overall error rate: ', err_ALL * 100, '%'
    return 0
def runSGDPipeline(entries, langs):
    t0 = time()
    sgd_pipeline = Pipeline([('vect', CountVectorizer(ngram_range=(1, 1), max_features=n_features)),
                             ('tfidf', TfidfTransformer(use_idf=True)),
                             ('clf', SGDClassifier(loss='squared_hinge', penalty='l2',
                                                   alpha=0.001, n_iter=5, random_state=42))])

    vect = CountVectorizer(ngram_range=(1, 1), max_features=n_features)
    X_train_counts = vect.fit_transform(entries)

    tfidf = TfidfTransformer(use_idf=True).fit(X_train_counts)
    X_train_tfidf = tfidf.fit_transform(X_train_counts)

    clf = SGDClassifier(loss='squared_hinge', penalty='l2', alpha=0.001, n_iter=5, random_state=42)
    clf.fit(X_train_tfidf, langs)

    X_new_counts = vect.transform(entries)
    X_new_tfidf = tfidf.transform(X_new_counts)
    predicted = clf.predict(X_new_tfidf.toarray())

    print(np.mean(predicted == langs))
    print(metrics.classification_report(langs, predicted, target_names=langs))
    print(metrics.confusion_matrix(langs, predicted))
    print("Took %s seconds." % (time() - t0))
    print("n_samples: %d, n_features: %d" % X_train_tfidf.shape)
    return sgd_pipeline
def runRFPipeline(entries, langs):
    t0 = time()
    rf_pipeline = Pipeline([('vect', CountVectorizer(ngram_range=(1, 1), max_features=n_features)),
                            ('tfidf', TfidfTransformer(use_idf=True)),
                            ('clf', RandomForestClassifier(n_estimators=10))])

    vect = CountVectorizer(ngram_range=(1, 1), max_features=n_features)
    X_train_counts = vect.fit_transform(entries)

    tfidf = TfidfTransformer(use_idf=True).fit(X_train_counts)
    X_train_tfidf = tfidf.fit_transform(X_train_counts)

    clf = RandomForestClassifier(n_estimators=10)
    clf.fit(X_train_tfidf, langs)

    X_new_counts = vect.transform(entries)
    X_new_tfidf = tfidf.transform(X_new_counts)
    predicted = clf.predict(X_new_tfidf.toarray())

    print(np.mean(predicted == langs))
    print(metrics.classification_report(langs, predicted, target_names=langs))
    print(metrics.confusion_matrix(langs, predicted))
    print("Took %s seconds." % (time() - t0))
    print("n_samples: %d, n_features: %d" % X_train_tfidf.shape)
    return rf_pipeline
def runSVCPipeline(entries, langs):
    t0 = time()
    svc_pipeline = Pipeline([('vect', CountVectorizer(ngram_range=(1, 1), max_features=n_features)),
                             ('tfidf', TfidfTransformer(use_idf=True)),
                             ('clf', LinearSVC(dual=False, loss='squared_hinge',
                                               max_iter=100, random_state=42))])

    vect = CountVectorizer(ngram_range=(1, 1), max_features=n_features)
    X_train_counts = vect.fit_transform(entries)

    tfidf = TfidfTransformer(use_idf=True).fit(X_train_counts)
    X_train_tfidf = tfidf.transform(X_train_counts)

    clf = LinearSVC(dual=False, loss='squared_hinge', max_iter=100, random_state=42)
    clf.fit(X_train_tfidf, langs)

    X_new_counts = vect.transform(entries)
    X_new_tfidf = tfidf.transform(X_new_counts)
    # dec = clf.decision_function([[1]])
    predicted = clf.predict(X_new_tfidf.toarray())

    print(np.mean(predicted == langs))
    print(metrics.classification_report(langs, predicted, target_names=langs))
    print(metrics.confusion_matrix(langs, predicted))
    print("Took %s seconds." % (time() - t0))
    print("n_samples: %d, n_features: %d" % X_train_tfidf.shape)
    return svc_pipeline
def print_confusion_matrix(y_test_report, y_predicted_report, algo_name):
    # y_predicted_report is a single array of length 100 that collects the
    # predicted values from all folds
    cm = confusion_matrix(y_test_report, y_predicted_report)
    np.set_printoptions(precision=2)
    print("Confusion matrix for: " + algo_name)
    print(cm)
    return None
def main(argv=None):
    if argv is None:
        argv = sys.argv[1:]

    args = parser.parse_args(argv)
    log.info('start parameters: ' + str(args))

    log.info('loading feature and label data')
    labels = np.asarray(map(int, list(file_line_generator(args.label_file))))

    log.info('performing cross validation')
    single_predictions, classification_result = do_cross_validation(labels)

    log.info('storing results')
    header = 'fold_no;instance_index;true_label;pred_label'
    np.savetxt(os.path.join(args.output_dir, 'predictions.csv'),
               single_predictions, '%d', ';', '\n', header=header)

    all_true_labels = single_predictions[:, 2]
    all_pred_labels = single_predictions[:, 3]
    confusion = confusion_matrix(all_true_labels, all_pred_labels)
    np.savetxt(os.path.join(args.output_dir, 'confusion_matrix.csv'),
               confusion, '%d', ';', '\n')

    classification_result[NO_OF_FOLDS, :] = get_classification_result(
        -1, all_true_labels, all_pred_labels)
    header = 'fold_no;accuracy;precision;recall;f1'
    np.savetxt(os.path.join(args.output_dir, 'metrics.csv'),
               classification_result, '%f', ';', '\n', header=header)
    log.info(classification_result)
    log.info('finished')
def main():
    pipeline = Pipeline([
        ('vect', TfidfVectorizer(stop_words='english')),
        ('clf', LogisticRegression())
    ])
    parameters = {
        'vect__max_df': (0.25, 0.5),
        'vect__ngram_range': ((1, 1), (1, 2)),
        'vect__use_idf': (True, False),
        'clf__C': (0.1, 1, 10),
    }
    df = pd.read_csv('data/train.tsv', header=0, delimiter='\t')
    X, y = df['Phrase'], df['Sentiment'].as_matrix()
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.5)

    grid_search = GridSearchCV(pipeline, parameters, n_jobs=3, verbose=1, scoring='accuracy')
    grid_search.fit(X_train, y_train)

    print 'Best score: %0.3f' % grid_search.best_score_
    print 'Best parameters set:'
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print '\t%s: %r' % (param_name, best_parameters[param_name])

    predictions = grid_search.predict(X_test)
    print 'Accuracy:', accuracy_score(y_test, predictions)
    print 'Confusion Matrix:', confusion_matrix(y_test, predictions)
    print 'Classification Report:', classification_report(y_test, predictions)
def _run_cv_iter((ensemble, selection_strategy, inp, y, train_indices, test_indices, seed, it)):
    Logger.get().write("!Running", (it + 1), "iteration...")
    train_inp, train_y = inp[train_indices], y[train_indices]
    test_inp, test_y = inp[test_indices], y[test_indices]

    ensemble.set_params(
        random_state=seed,
        selection_strategy=selection_strategy
    )
    ensemble.fit(train_inp, train_y)

    threshold_range = selection_strategy.get_threshold_range(
        ensemble.n_estimators
    )
    confusion_matrices = numpy.zeros(
        (len(threshold_range), ensemble.n_classes_, ensemble.n_classes_)
    )
    for i, threshold in enumerate(threshold_range):
        ensemble.selection_strategy.threshold = threshold
        Logger.get().write("!Testing using threshold: {:.3f}".format(threshold))
        confusion_matrices[i] = confusion_matrix(
            test_y, ensemble.predict(test_inp), labels=ensemble.classes_
        )
    return confusion_matrices
def testDataset(model, test, test_labels, n_slice=1):
    test_sample = test[::n_slice]
    test_labels_sample = test_labels[::n_slice]
    # print test_sample[1]
    # extracted_test_sample = extractFeatures(test_sample, [74,745,361,445,164,681,258,230,277,719,509,637,738,709,529,557,473,175,664,133,305,75,333,585,501,222,105,612,342,250,286,746,202,481,68,621,94,638,257,314,165,639,278,147,229,692,120,453,720,285,680,650,76,737,708,174,564,194,613,134,536,592,425,640,747,201,67,313,135,397,558,256,665,284,693,106,93,530,77,649,508,736,721,146,341,586,369,136,306,251,173,228,707,666,502,667,620,679,641,748,119,279,223,166,107,66,92,772,771,312,668,480,735,474,694,78,145,614,770,137,200,722,446,678,283,749,648,255,773,418,42,340,195,108,227,311,334,362,41,669,307,563,774,390,40,706,591,65,91,769,768,43,452,750,695,44,535,723,172,677,775,642,79,619,396,167,339,118,39,507,751,254,503,559,531,199,696,335,734,587,766,767,64,45,109,776,479,282,226,80,47,11,13,16,8,12,14,9,15,10,116,81,7,90,88,6,5,4,3,2,89,87,114,18,115,86,117,82,83,84,85,17,111,19,51,50,30,52,53,54,55,56,49,48,46,36,38,37,35,32,34,33,31,29,20,62,61,28,63,110,112,113,21,60,59,58,25,27,26,24,57,23,22,784,392,138,676,697,675,699,674,698,700,139,704,705,703,701,702,673,672,671,615,616,590,670,589,617,618,643,644,647,646,645,724,725,726,763,764,762,759,761,765,777,778,779,782,781,780,760,758,727,730,731,729,757,728,732,733,752,753,756,755,754,588,562,561,281,308,280,225,253,309,310,336,337,364,363,338,252,224,366,142,143,141,198,140,144,168,169,170,197,196,171,365,367,560,476,477,475,449,451,478,504,505,506,534,533,532,450,448,368,393,394,783,447,391,395,419,420,421,424,423,422,1])
    extracted_test_sample = test_sample

    preds = model.predict(extracted_test_sample)
    accuracy = model.score(extracted_test_sample, test_labels_sample)

    cm = metrics.confusion_matrix(test_labels_sample, preds)
    print("Confusion matrix:\n%s" % cm)
    plot_confusion_matrix(cm)

    digitsDict = digitDict.initData()
    print 'true \t\t predicted'
    errors = 0
    for i in xrange(0, len(test_sample)):
        if preds[i] != test_labels_sample[i]:
            errors += 1
            trueClass = str(int(test_labels_sample[i]))
            prediction = str(int(preds[i]))
            digitDict.updateErrors(digitsDict, trueClass, prediction)
            print test_labels_sample[i], '\t\t', preds[i]
    digitDict.save(digitsDict)
    print 'Accuracy: ', accuracy, '\nErrors: ', errors
def main(argv=None):
    if argv is None:
        argv = sys.argv[1:]

    args = parser.parse_args(argv)
    log.info('start parameters: ' + str(args))

    log.info('loading feature and label data')
    train_labels = np.asarray(
        map(int, list(file_line_generator(args.train_labels))))
    train_features = np.loadtxt(args.train_data)
    if train_features.ndim == 1:
        train_features = train_features.reshape((train_features.shape[0], 1))

    test_labels = np.asarray(
        map(int, list(file_line_generator(args.test_labels))))
    test_features = np.loadtxt(args.test_data)
    if test_features.ndim == 1:
        test_features = test_features.reshape((test_features.shape[0], 1))

    log.info('performing classification')
    single_predictions, classification_result, weight_vectors, model = \
        calc_results(train_features, train_labels, test_features, test_labels,
                     args.normalize, args.mode == True)

    log.info('storing results')
    save_object_to_file(model, os.path.join(args.output_dir, 'svm'))
    np.savetxt(os.path.join(args.output_dir, 'weights.csv'),
               weight_vectors, '%f', ';', '\n')

    header = 'instance_index;true_label;pred_label'
    np.savetxt(os.path.join(args.output_dir, 'predictions.csv'),
               single_predictions, '%d', ';', '\n', header=header)

    all_true_labels = single_predictions[:, 1]
    all_pred_labels = single_predictions[:, 2]
    confusion = confusion_matrix(all_true_labels, all_pred_labels)
    np.savetxt(os.path.join(args.output_dir, 'confusion_matrix.csv'),
               confusion, '%d', ';', '\n')

    header = 'accuracy;precision;recall;f1'
    np.savetxt(os.path.join(args.output_dir, 'metrics.csv'),
               classification_result, '%f', ';', '\n', header=header)
    log.info(classification_result)
    log.info('finished')
def show_report(self, y_predicted):
    y_true = []
    y_predicted_new = []
    for i in range(len(self.__labels)):
        if self.__labels[i] == 'P':
            y_true.append(1)
        if y_predicted[i] == 'positivo':
            y_predicted_new.append(1)
        if self.__labels[i] == 'N':
            y_true.append(-1)
        if y_predicted[i] == 'negativo':
            y_predicted_new.append(-1)
        if self.__labels[i] == 'NEU':
            y_true.append(0)
        if y_predicted[i] == 'neutral':
            y_predicted_new.append(0)
    print classification_report(y_true, y_predicted_new)
    print confusion_matrix(y_true, y_predicted_new)
def show_confusion_matrix(y_true, y_pred, title=''):
    """Plot (and print) a confusion matrix from y_true and y_predicted."""
    # TODO: show confusion matrix plot
    cm = confusion_matrix(y_true, y_pred)
    pl.matshow(cm)
    pl.title(title)
    pl.colorbar()
    pl.ylabel('True label')
    pl.xlabel('Predicted label')
    pl.show()
def main(argv=None):
    if argv is None:
        argv = sys.argv[1:]

    args = parser.parse_args(argv)
    log.info('start parameters: ' + str(args))

    log.info('loading embeddings')
    vocab = read_vocabulary_id_file(args.vocabulary)
    embs = np.loadtxt(args.embeddings)

    log.info('loading documents')
    features, labels = load_data(args.corpus_dir, vocab, embs)

    log.info('performing cross validation')
    single_predictions, classification_result, weight_vectors = \
        do_cross_validation(features, labels)

    log.info('storing results')
    np.savetxt(os.path.join(args.output_dir, 'svm-weights.csv'),
               weight_vectors, '%f', ';', '\n')

    with utf8_file_open(os.path.join(args.output_dir, 'predictions.csv'), 'w') \
            as pred_file:
        pred_file.write(u'fold_no;doc;true_label;pred_label\n')
        for sp in single_predictions:
            pred_file.write(u';'.join(map(unicode, sp)) + u'\n')

    all_true_labels = [sp[2] for sp in single_predictions]
    all_pred_labels = [sp[3] for sp in single_predictions]
    confusion = confusion_matrix(all_true_labels, all_pred_labels)
    np.savetxt(os.path.join(args.output_dir, 'confusion_matrix.csv'),
               confusion, '%d', ';', '\n')

    classification_result[NO_OF_FOLDS, :] = get_classification_result(
        -1, all_true_labels, all_pred_labels)
    header = u'fold_no;accuracy;precision;recall;f1'
    np.savetxt(os.path.join(args.output_dir, 'metrics.csv'),
               classification_result, '%f', u';', u'\n', header=header)
    log.info(classification_result)
    log.info('finished')
def show_confusion_matrix(y_true, y_predicted, title=''):
    # compute confusion matrix
    cm = confusion_matrix(y_true, y_predicted)
    print cm
    # configure window
    pl.matshow(cm)
    pl.title(title)
    pl.colorbar()
    pl.ylabel('True label')
    pl.xlabel('Predicted label')
    pl.jet()
    # show confusion matrix plot
    pl.show()
def main():
    pipeline = Pipeline([
        ('vect', TfidfVectorizer()),
        ('clf', LogisticRegression())
    ])
    parameters = {
        # 'vect__max_df': (0.25, 0.5, 0.75),
        'vect__stop_words': ('english', None),
        # 'vect__max_features': (5000, 10000, None),
        # 'vect__ngram_range': ((1, 1), (1, 2)),
        # 'vect__use_idf': (True, False),
        # 'vect__norm': ('l1', 'l2'),
        # 'clf__penalty': ('l1', 'l2'),
        # 'clf__C': (0.1, 1, 10),
    }
    df = pd.read_csv('movie-reviews/train.tsv', header=0, delimiter='\t')
    X, y = df['Phrase'], df['Sentiment'].as_matrix()
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.5)

    grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, scoring='accuracy')
    grid_search.fit(X_train, y_train)

    print 'Best score: %0.3f' % grid_search.best_score_
    print 'Best parameters set:'
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print '\t%s: %r' % (param_name, best_parameters[param_name])

    predictions = grid_search.predict(X_test)
    print 'Accuracy:', accuracy_score(y_test, predictions)
    print 'Classification Report:', classification_report(y_test, predictions)

    from sklearn.metrics import confusion_matrix
    cm = confusion_matrix(y_test, predictions)
    print cm
    plt.matshow(cm)
    plt.title('Confusion matrix')
    plt.colorbar()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.show()

    predictions = np.ones(len(predictions)) * 2
    print 'Accuracy:', accuracy_score(y_test, predictions)
    print 'Degenerate Classification Report:', classification_report(y_test, predictions)
def runTreePipeline(entries, langs):
    t0 = time()
    tree_pipeline = Pipeline([('vect', CountVectorizer(ngram_range=(1, 1), max_features=n_features)),
                              ('tfidf', TfidfTransformer(use_idf=True)),
                              ('clf', DecisionTreeClassifier(max_features=n_features))])

    vect = CountVectorizer(ngram_range=(1, 1), max_features=n_features)
    X_train_counts = vect.fit_transform(entries)

    tfidf = TfidfTransformer(use_idf=True).fit(X_train_counts)
    X_train_tfidf = tfidf.transform(X_train_counts)

    clf = DecisionTreeClassifier(max_features=n_features)
    clf.fit(X_train_tfidf, langs)

    X_new_counts = vect.transform(entries)
    X_new_tfidf = tfidf.transform(X_new_counts)
    predicted = clf.predict(X_new_tfidf.toarray())

    print(np.mean(predicted == langs))
    print(metrics.classification_report(langs, predicted, target_names=langs))
    print(metrics.confusion_matrix(langs, predicted))
    print("Took %s seconds." % (time() - t0))
    print("n_samples: %d, n_features: %d" % X_train_tfidf.shape)
    return tree_pipeline
    lc.pop(0)
    lc = [float(i) for i in lc]
    x.append(lc)
f.close()

pipeline = Pipeline([
    ('clf', LogisticRegression())
])
parameters = {
    'clf__C': (0.1, 1, 10),
}
X_train, X_test, y_train, y_test = train_test_split(x, y, train_size=0.5)

grid_search = GridSearchCV(pipeline, parameters, n_jobs=3, verbose=1, scoring='accuracy')
grid_search.fit(X_train, y_train)

print 'Best score: %0.3f' % grid_search.best_score_
print 'Best parameters set:'
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print '\t%s: %r' % (param_name, best_parameters[param_name])

predictions = grid_search.predict(X_test)
print 'Accuracy:', accuracy_score(y_test, predictions)
print 'Confusion Matrix:'
print confusion_matrix(y_test, predictions)
print 'Classification Report:'
print classification_report(y_test, predictions)
indices = np.arange(n_total_samples)
unlabeled_set = indices[n_labeled_points:]

# shuffle everything around
y_train = np.copy(y)
y_train[unlabeled_set] = -1

###############################################################################
# Learn with LabelSpreading
lp_model = label_propagation.LabelSpreading(gamma=0.25, max_iter=5)
lp_model.fit(X, y_train)
predicted_labels = lp_model.transduction_[unlabeled_set]
true_labels = y[unlabeled_set]

cm = confusion_matrix(true_labels, predicted_labels, labels=lp_model.classes_)

print "Label Spreading model: %d labeled & %d unlabeled points (%d total)" % \
    (n_labeled_points, n_total_samples - n_labeled_points, n_total_samples)
print metrics.classification_report(true_labels, predicted_labels)
print "Confusion matrix"
print cm

# calculate uncertainty values for each transduced distribution
pred_entropies = stats.distributions.entropy(lp_model.label_distributions_.T)

# pick the top 10 most uncertain labels
uncertainty_index = np.argsort(pred_entropies)[-10:]
y_.extend(y_test)
prediction_.extend(prediction)

verbose('----------\n')
verbose("Evaluation")
if opts.mode in ['age', 'gender']:
    from sklearn.metrics.metrics import precision_score, recall_score, confusion_matrix, classification_report, accuracy_score, f1_score
    # Computing performance
    print('Accuracy :', accuracy_score(y_, prediction_))
    print('Precision :', precision_score(y_, prediction_))
    print('Recall :', recall_score(y_, prediction_))
    print('F-score :', f1_score(y_, prediction_))
    print('\nClassification report:\n', classification_report(y_, prediction_))
    print('\nConfusion matrix :\n', confusion_matrix(y_, prediction_))
else:
    from sklearn.metrics.metrics import mean_absolute_error, mean_squared_error, r2_score
    print('Mean Abs Error :', mean_absolute_error(y_, prediction_))
    print('Mean Sqr Error :', mean_squared_error(y_, prediction_))
    print('R2 Error :', r2_score(y_, prediction_))

# plots:
# import matplotlib.pyplot as plt
# confusion_matrix_plot = confusion_matrix(y_test, prediction)
# plt.title('matriz de confusion')
# plt.colorbar()
# plt.xlabel()
# plt.xlabel('categoria de verdad')
# plt.ylabel('categoria predecida')
# plt.show()
def plotConfusionMatrix(trueLabels, testPredLabels, saveFilename, normalization=False):
    """
    Plot confusion matrix using true labels and prediction labels
    normalization: True, accuracy of each class
                   False, number of results
    """
    # Calculate confusion matrix
    cm = confusion_matrix(trueLabels, testPredLabels)
    if normalization:  # If normalization
        cm = cm.astype(float) / LA.norm(cm, ord=1, axis=1)

    labels = []
    for item in trueLabels:
        if item not in labels:
            labels.append(item)

    # Plot confusion matrix
    font = {'size': 12}
    mplib.rc('font', **font)
    fig, ax = plt.subplots()
    fig.subplots_adjust(bottom=0.2)
    ax.set_aspect('equal', adjustable='box')

    height, width = cm.shape
    for x in xrange(width):
        for y in xrange(height):
            if normalization:  # If normalization
                floatNum = cm[x, y]
                annotation = "%.2f" % floatNum
                ax.annotate(annotation, xy=(y, x),
                            horizontalalignment='center',
                            verticalalignment='center')
            else:
                intNum = cm[x, y]
                annotation = str(intNum)
                ax.annotate(annotation, xy=(y, x),
                            horizontalalignment='center',
                            verticalalignment='center')

    heatmap = ax.imshow(np.array(cm), cmap=plt.cm.jet, interpolation='nearest')
    fig.colorbar(heatmap)
    ax.set_xticks(np.arange(height), minor=False)
    ax.set_yticks(np.arange(width), minor=False)
    ax.set_xticklabels(labels, minor=False, rotation=45)
    ax.set_yticklabels(labels, minor=False)
    ax.set_xlabel('Predicted Labels', fontsize=18)
    ax.set_ylabel('True Labels', fontsize=18)
    plt.show()
    fig.savefig(saveFilename)
# Task 6
# Map five to 1 and 1 to 0
y_test[y_test == 1] = 0
y_test[y_test == 5] = 1
y_pred_prob = nb.predict_proba(test_dtm)[:, 1]
print metrics.roc_auc_score(y_test, y_pred_prob)

# Task 7
import matplotlib.pyplot as plt
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred_prob)
plt.plot(fpr, tpr)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('False Positive Rate (1 - Specificity)')
plt.ylabel('True Positive Rate (Sensitivity)')

# Task 8
print metrics.confusion_matrix(y_test, y_pred)
sensitivity = 126 / float(25 + 126)
specificity = 813 / float(813 + 58)

# Task 9
false_positives = X_test[y_test < y_pred]  # false positives
false_negatives = X_test[y_test > y_pred]  # false negatives
# One theory I have for false positives is that the more descriptive language you use,
# the more the model thinks that it will be rated a 5.

# Task 10
# From the ROC curve I would say a threshold of .18 would maximize the true positive rate.
def run_nested_cross_validation(self, data, labels, k=3, columns=None,
                                draw_roc=True, draw_decision_boundaries=True,
                                classifier_type='logistic', title_suffix='',
                                save_path_prefix='', balance=True, Cs=None,
                                error_f=None, higher_is_better=False,
                                threshold_p=.5):
    '''
    Run nested CV on data with the passed k folds. If columns is passed, it
    should be a list of columns to subset from data; otherwise, all of data is
    used. Data is assumed to be a pandas DataFrame or similar. Cs can also be
    passed as a list of C parameter values to use. error_f is a function for
    measuring error that takes predicted probabilities and true labels and
    defaults to MSE. Uses the scikit-learn LogisticRegression library.
    '''
    if columns is not None:
        data_chosen = data[columns]
    else:
        data_chosen = data  # use all columns, as the docstring describes
    Cs = Cs or self.Cs
    error_f = error_f or self.mse
    best_c = 0
    try:
        cv_outer = StratifiedKFold(labels.values, k=k)
    except AttributeError:
        # labels needs to be a pandas Series
        labels = Series(labels)
        cv_outer = StratifiedKFold(labels.values, k=k)
    outer_metric, for_roc = [], []
    for train_outer, test_outer in cv_outer:
        mod = self.get_model(classifier_type=classifier_type)
        c_metric = []
        for c in Cs:
            cv_inner = StratifiedKFold(labels.ix[train_outer].values, k=k)
            mod.set_params(C=c)
            inner_metric = []
            for train_inner, test_inner in cv_inner:
                # Balance rare classes if necessary:
                if balance:
                    data_balanced, labels_balanced = self.balance_classes(
                        data_chosen.ix[train_inner], labels.ix[train_inner])
                else:
                    data_balanced, labels_balanced = \
                        data_chosen.ix[train_inner], labels.ix[train_inner]
                fitted = mod.fit(data_balanced, labels_balanced)
                # Predict probabilities
                predicted_probs = fitted.predict_proba(data_chosen.ix[test_inner])
                err = error_f(predicted_probs, labels.ix[test_inner].values)
                inner_metric.append(err)
            error_for_this_c = sum(inner_metric) / len(inner_metric)
            print "Average Error: ", error_for_this_c, ", for C: ", c
            c_metric.append(error_for_this_c)
        best_c = self.get_best_c(Cs, c_metric, higher_is_better=higher_is_better)
        # Now that we have selected the best parameter, apply it to the outer set.
        mod.set_params(C=best_c)
        # Balance rare classes if necessary:
        if balance:
            data_balanced, labels_balanced = self.balance_classes(
                data_chosen.ix[train_outer], labels.ix[train_outer])
        else:
            data_balanced, labels_balanced = \
                data_chosen.ix[train_outer], labels.ix[train_outer]
        # refit on the (balanced) outer training fold before evaluating
        fitted = mod.fit(data_balanced, labels_balanced)
        predicted_probs = fitted.predict_proba(data_chosen.ix[test_outer])
        err = error_f(predicted_probs, labels.ix[test_outer].values)
        print confusion_matrix(labels[test_outer].values,
                               predicted_probs[:, 1] > threshold_p)
        for_roc.append((labels[test_outer].values, predicted_probs))
        outer_metric.append(err)
    mean_metric = sum(outer_metric) / len(outer_metric)
    print 'Mean Nested CV Error for best c: ', mean_metric, ', C: ', best_c
    print 'Final intercept: ', fitted.intercept_[0]
    try:
        print 'Final columns, coefficients: '
        print zip(columns, fitted.coef_[0])
    except NotImplementedError:
        pass
    num_features = len(data_chosen.columns)
    if draw_decision_boundaries:
        self.draw_decision_boundaries(
            mod, data_chosen.columns,
            data_chosen.ix[train_outer].as_matrix(),
            labels.ix[train_outer].values,
            title='Decision Boundaries: ' + title_suffix,
            save_path=save_path_prefix +
            '_{0}_features_decision_boundaries.png'.format(num_features))
    if draw_roc:
        self.draw_roc(
            for_roc,
            title='ROC for {1} features, c = {2}: {0}'.format(
                title_suffix, num_features, best_c),
            save_path=save_path_prefix + '_{0}_features_roc.png'.format(num_features))
    return mean_metric, best_c, mod
ids_ = np.load(opts.IDS)
le = preprocessing.LabelEncoder()
le.fit(ids_)
verbose("Total classes", le.classes_.shape[0])
ids = le.transform(ids_)

X_train, X_test, y_train, y_test = \
    train_test_split(feats, ids, test_size=0.20, random_state=42)

verbose("Training")
classifier = RandomForestClassifier(n_estimators=opts.estimators,
                                    n_jobs=opts.nprocessors,
                                    max_depth=20, verbose=True)
# Learning
classifier.fit(X_train, y_train)

# Predicting
verbose("Prediction")
prediction = classifier.predict(X_test)

print('Accuracy :', accuracy_score(y_test, prediction))
print('Precision :', precision_score(y_test, prediction))
print('Recall :', recall_score(y_test, prediction))
print('F-score :', f1_score(y_test, prediction))
print('\nClassification report:\n', classification_report(y_test, prediction))
print('\nConfusion matrix :\n', confusion_matrix(y_test, prediction))
# it stalled with 1000000
# try with more parameters
classifier = RandomForestClassifier(n_estimators=100)
classifier.fit(X_train, y_train)
prediction = classifier.predict(X_test)
# print X_train.shape

from sklearn.metrics.metrics import precision_score, recall_score, confusion_matrix, classification_report, accuracy_score

print '\nAccuracy:', accuracy_score(y_test, prediction)
print '\nscore:', classifier.score(X_train, y_train)
print '\nrecall:', recall_score(y_test, prediction)
print '\nprecision:', precision_score(y_test, prediction)
print '\n classification report:\n', classification_report(y_test, prediction)
print '\n confusion matrix:\n', confusion_matrix(y_test, prediction)

# plots:
import matplotlib.pyplot as plt
confusion_matrix_plot = confusion_matrix(y_test, prediction)
plt.matshow(confusion_matrix_plot)  # draw the matrix so the colorbar has an image to attach to
plt.title('matriz de confusion')
plt.colorbar()
plt.xlabel('categoria de verdad')
plt.ylabel('categoria predecida')
plt.show()

# as an array
# import numpy as np
# scores = cross_val_score(classifier, X_train, y_train, cv=5)
    best_chosen = chosen
    best_mod = mod
    if force_choice:
        break

print "Best number of features: ", len(best_chosen)
print "Best features: ", best_chosen
print "Best C, MSE: ", best_c, best_err

if force_choice:
    # mod = learner.get_model(classifier_type=classifier_type, C=best_c)
    # fitted = mod.fit(training_data, training_labels)
    test_vectors = test_vectors[best_chosen]
    predicted_probs = best_mod.predict_proba(test_vectors)
    err = learner.mse(predicted_probs, test_labels.values)
    print err
    print confusion_matrix(test_labels.values, predicted_probs[:, 1] > .5)
    learner.draw_roc(
        label_sets=[(test_labels.values, predicted_probs)],
        save_path=learner.get_filename(
            subdir, 'check_nontrivial_{0}group'.format(rep_str)))
    learner.draw_decision_boundaries(
        best_mod, best_chosen, test_vectors.as_matrix(), test_labels.values,
        title='Decision Boundaries: ' + (replicate_id and 'Group {0}'.format(replicate_id) or 'Overall'),
        force_lim=[-3, 3, -3, 3],
        save_path=learner.get_filename(subdir, 'plot_{0}group'.format(rep_str))
        + '_check_non_trivial_decision_boundaries.png')
for tweet in reader[0:2 * (numironicos / 3)]:
    tweets_train.append(tweet["text"])
    labels_train.append("noironia")
for tweet in reader[2 * (numironicos / 3):]:
    tweets_test.append(tweet["text"])
    labels_test.append("noironia")

stop_words = []
f = open("spanish.txt")
for line in f:
    stop_words.append(line.strip())
f.close()

y_train = np.array(labels_train, dtype=object)
y_test = np.array(labels_test, dtype=object)

vectorizer = TfidfVectorizer(input='content', max_df=0.5, stop_words=stop_words)
X_train = vectorizer.fit_transform(np.array(tweets_train, dtype=object))
X_test = vectorizer.transform(np.array(tweets_test, dtype=object))

classifier = RandomForestClassifier(n_estimators=10)
classifier.fit(X_train.toarray(), y_train)
prediction = classifier.predict(X_test.toarray())

print '\nAccuracy :', accuracy_score(y_test, prediction)
print '\nPrecision :', precision_score(y_test, prediction)
print '\nRecall :', recall_score(y_test, prediction)
print '\nF-score :', f1_score(y_test, prediction)
print '\nClassification report:\n', classification_report(y_test, prediction)
print '\nConfusion matrix :\n', confusion_matrix(y_test, prediction)
import gzip
import cPickle

from sklearn import linear_model
from sklearn.metrics import confusion_matrix, classification_report


def load_data(dataset):
    f = gzip.open(dataset, 'rb')
    train_set, valid_set, test_set = cPickle.load(f)
    f.close()
    train_set_x, train_set_y = train_set
    valid_set_x, valid_set_y = valid_set
    test_set_x, test_set_y = test_set
    rval = [(train_set_x, train_set_y), (valid_set_x, valid_set_y),
            (test_set_x, test_set_y)]
    return rval


if __name__ == "__main__":
    datasets = load_data('mnist.pkl.gz')
    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    print train_set_x.shape
    print train_set_y.shape

    logreg = linear_model.LogisticRegression()
    logreg.fit(train_set_x, train_set_y)
    predictions = logreg.predict(test_set_x)

    print confusion_matrix(test_set_y, predictions)
    print classification_report(test_set_y, predictions)
    vect__norm: 'l2'
    vect__use_idf: True
"""
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model.logistic import LogisticRegression
from sklearn.cross_validation import train_test_split
from sklearn.metrics.metrics import precision_score, recall_score, confusion_matrix

__author__ = 'gavin'
import pandas as pd

df = pd.read_csv('sms/sms.csv')
X_train_r, X_test_r, y_train, y_test = train_test_split(
    df['message'], df['label'])

vectorizer = TfidfVectorizer(max_df=0.5, max_features=None,
                             ngram_range=(1, 1), norm='l2', use_idf=True)
X_train = vectorizer.fit_transform(X_train_r)
X_test = vectorizer.transform(X_test_r)

classifier = LogisticRegression(penalty='l2', C=7)
classifier.fit(X_train, y_train)
predictions = classifier.predict(X_test)

print 'score', classifier.score(X_test, y_test)
print 'precision', precision_score(y_test, predictions)
print 'recall', recall_score(y_test, predictions)
print confusion_matrix(y_test, predictions)