def test_density():
    rng = np.random.RandomState(0)
    X = rng.randint(10, size=(10, 5))
    X[1, 2] = 0
    X[5, 3] = 0
    X_csr = sparse.csr_matrix(X)
    X_csc = sparse.csc_matrix(X)
    X_coo = sparse.coo_matrix(X)
    X_lil = sparse.lil_matrix(X)
    for X_ in (X_csr, X_csc, X_coo, X_lil):
        assert_equal(density(X_), density(X))
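These snippets all omit their import headers. The following is a minimal sketch of what `test_density` above assumes: NumPy, SciPy sparse, and scikit-learn's `density` helper; `assert_equal` is taken here from `numpy.testing` (an assumption, since the original header is not shown).

# Minimal import sketch (an assumption; the originals omit their headers).
import numpy as np
from scipy import sparse
from numpy.testing import assert_equal
from sklearn.utils.extmath import density  # fraction of nonzero weights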
def gridsearch_svm(Xtrain, Ytrain, Xval, Yval):
    # ---------------------------------- Scaling
    X1, scaler = scale_data(Xtrain)
    X2 = scale_data(Xval, scaler)
    # ---------------------------------- Factor analysis
    fa = FactorAnalysis()
    X1 = fa.fit_transform(X1)
    # Transform the validation set with the factor analysis fitted on the
    # training set. The original called fa.fit(X2), which refits on the
    # validation data and returns the estimator rather than transformed data.
    X2 = fa.transform(X2)
    # ---------------------------------- Cross validation and grid search
    cv = ShuffleSplit(len(Xtrain), n_iter=1, train_size=0.25, test_size=.03,
                      random_state=0)
    params = {'C': [1, 10], 'kernel': ['rbf', 'linear']}
    svr = svm.SVC(verbose=True, shrinking=False)
    classifier = grid_search.GridSearchCV(svr, params, verbose=3, cv=cv)
    t0 = time()
    classifier.fit(X1, Ytrain)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)
    # ---------------------------------- Prediction on validation set:
    t0 = time()
    pred = list(classifier.predict(X2))
    test_time = time() - t0
    print("test time: %0.3fs" % test_time)
    if hasattr(classifier, 'coef_'):
        print("dimensionality: %d" % classifier.coef_.shape[1])
        print("density: %f" % density(classifier.coef_))
    print('F1-score : ', f1_score(Yval, pred, average='binary'))
    print("classification report:")
    print(classification_report(Yval, pred, target_names=['0', '1'], digits=4))
    print("confusion matrix:")
    print(confusion_matrix(Yval, pred))
    return classifier, scaler
def train_svm(Xtrain, Ytrain, Xval=None, Yval=None):
    X1, scaler = scale_data(Xtrain)
    if Xval is not None:
        X2 = scale_data(Xval, scaler)
    # initialize basic SVM
    classifier = svm.SVC(verbose=True, shrinking=False, C=10, kernel='rbf')
    # train
    t0 = time()
    classifier.fit(X1, Ytrain)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)
    if Xval is not None:
        # prediction on validation set:
        t0 = time()
        pred = list(classifier.predict(X2))
        test_time = time() - t0
        print("test time: %0.3fs" % test_time)
        if hasattr(classifier, 'coef_'):
            print("dimensionality: %d" % classifier.coef_.shape[1])
            print("density: %f" % density(classifier.coef_))
        print('F1-score : ', f1_score(Yval, pred, average='binary'))
        print("classification report:")
        print(classification_report(Yval, pred, target_names=['0', '1'],
                                     digits=4))
        print("confusion matrix:")
        print(confusion_matrix(Yval, pred))
    return classifier, scaler
def benchmark(clf):
    print('_' * 80)
    print("Training: ")
    print(clf)
    t0 = time()
    clf.fit(X_train, y_train)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)

    t0 = time()
    pred = clf.predict(X_test)
    test_time = time() - t0
    print("test time: %0.3fs" % test_time)

    score = metrics.f1_score(y_test, pred)
    print("f1-score: %0.3f" % score)

    if hasattr(clf, 'coef_'):
        print("dimensionality: %d" % clf.coef_.shape[1])
        print("density: %f" % density(clf.coef_))
        if feature_names is not None:
            print("top 10 keywords per class:")
            # (the per-class keyword loop was left out of the original)
        print()

    print("confusion matrix:")
    # The original computed the matrix but then printed a blank line;
    # print the matrix itself.
    cm = metrics.confusion_matrix(y_test, pred)
    print(cm)
    print()

    clf_descr = str(clf).split('(')[0]
    return clf_descr, score, train_time, test_time
def benchmark(clf, X_train, y_train, X_test, y_test):
    print('_' * 80)
    print("Training: ")
    print(clf)
    t0 = time()
    clf.fit(X_train, y_train)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)

    t0 = time()
    pred = clf.predict(X_test)
    test_time = time() - t0
    print("test time: %0.3fs" % test_time)

    score = metrics.accuracy_score(y_test, pred)
    print("accuracy: %0.3f" % score)

    if hasattr(clf, 'coef_'):
        print("dimensionality: %d" % clf.coef_.shape[1])
        print("density: %f" % density(clf.coef_))
        print()

    print("confusion matrix:")
    print(metrics.confusion_matrix(y_test, pred))
    print()

    clf_descr = str(clf).split('(')[0]
    return clf_descr, score, train_time, test_time
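Most of these variants descend from scikit-learn's 20newsgroups document-classification example, which drives `benchmark` in a loop over several estimators and collects the results. A hedged usage sketch for the variant above; the estimator list and the surrounding data loading are assumptions, not part of the original:

# Hypothetical driver loop, in the style of the sklearn text-classification
# example; X_train/X_test/y_train/y_test are assumed to already exist.
from sklearn.linear_model import RidgeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

results = []
for clf in (RidgeClassifier(tol=1e-2, solver="lsqr"),
            MultinomialNB(alpha=.01),
            LinearSVC(C=1.0)):
    results.append(benchmark(clf, X_train, y_train, X_test, y_test))

for name, score, t_train, t_test in results:
    print("%-20s acc=%.3f train=%.2fs test=%.2fs"
          % (name, score, t_train, t_test))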
def benchmark(clf):
    print("_" * 80)
    print("Training: ")
    print(clf)
    t0 = time()
    clf.fit(X, Y)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)

    t0 = time()
    pred = clf.predict(X_dev)
    test_time = time() - t0
    print("test time: %0.3fs" % test_time)

    score = metrics.accuracy_score(Y_dev, pred)
    print("accuracy: %0.3f" % score)

    if hasattr(clf, "coef_"):
        print("dimensionality: %d" % clf.coef_.shape[1])
        print("density: %f" % density(clf.coef_))

    if opts.print_cm:
        print("confusion matrix:")
        print(metrics.confusion_matrix(Y_dev, pred))
    print()
    clf_descr = str(clf).split("(")[0]
    return clf_descr, score, train_time, test_time
def report_accuracy(model, categories, test_target, predicted):
    score = metrics.f1_score(test_target, predicted)
    print("f1-score: {:.3f}".format(score))

    clf = model.named_steps['clf']
    if hasattr(clf, 'coef_'):
        coef = model.named_steps['clf'].coef_
        print("dimensionality: {}".format(coef.shape[1]))
        print("density: {}".format(density(coef)))
        print("top 15 keywords per class:")
        feature_names = np.asarray(model.named_steps['vect'].get_feature_names())
        for i, category in enumerate(categories):
            topkw = np.argsort(coef[i])[-15:]
            keywords = '\n\t'.join(textwrap.wrap(", ".join(feature_names[topkw])))
            print("{}: {}".format(category, keywords))
        print()

    print("classification report:")
    print(metrics.classification_report(test_target, predicted,
                                        target_names=categories))
    print("confusion matrix:")
    print(metrics.confusion_matrix(test_target, predicted))
    print()
def train_rf(Xtrain, Ytrain, Xval=None, Yval=None):
    X1, scaler = scale_data(Xtrain)
    classifier = RandomForestClassifier(n_estimators=100, verbose=True)
    t0 = time()
    classifier.fit(X1, np.ravel(Ytrain))
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)
    if Xval is not None:
        X2 = scale_data(Xval, scaler)
        t0 = time()
        # The original called X2.as_matrix(), a long-deprecated pandas method;
        # predicting on X2 directly matches the sibling train_svm/train_logit.
        pred = classifier.predict(X2)
        test_time = time() - t0
        print("test time: %0.3fs" % test_time)
        if hasattr(classifier, 'coef_'):
            # RandomForestClassifier has no coef_, so this block never runs.
            print("dimensionality: %d" % classifier.coef_.shape[1])
            print("density: %f" % density(classifier.coef_))
        print('F1-score : ', f1_score(Yval, pred, average='binary'))
        print("classification report:")
        print(classification_report(Yval, pred, target_names=['0', '1'],
                                     digits=4))
        print("confusion matrix:")
        print(confusion_matrix(Yval, pred))
    return classifier, scaler
def benchmark(clf):
    print('in benchmark')
    logger.info('_' * 80)
    logger.info("Training: ")
    logger.info(clf)
    t0 = time.time()
    clf.fit(X_train, y_train)
    train_time = time.time() - t0
    logger.info("train time: %0.3fs" % train_time)

    t0 = time.time()
    pred = clf.predict(X_test)
    test_time = time.time() - t0
    logger.info("test time: %0.3fs" % test_time)

    precision = metrics.precision_score(y_test, pred)
    recall = metrics.recall_score(y_test, pred)
    funo = metrics.f1_score(y_test, pred)
    # The original logged the recall under the label "f1-score";
    # log each metric under its own name.
    logger.info("recall: %0.3f" % recall)
    logger.info("f1-score: %0.3f" % funo)

    if hasattr(clf, 'coef_'):
        logger.info("dimensionality: %d" % clf.coef_.shape[1])
        logger.info("density: %f" % density(clf.coef_))

    logger.info("Classification report:")
    logger.info(metrics.classification_report(y_test, pred))
    logger.info("Confusion matrix:")
    logger.info(metrics.confusion_matrix(y_test, pred))

    clf_descr = str(clf).split('(')[0]
    return precision, recall, funo, train_time, test_time
def benchmarkDoc2Vec(clf):
    """Benchmark classifiers."""
    print('_' * 80)
    print("Training Doc2Vec: ")
    print(clf)
    t0 = time()
    clf.fit(x_train1, y_train)  # train on the Doc2Vec features and targets
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)

    t0 = time()
    pred1 = clf.predict(x_test1)  # predict on the held-out features
    test_time = time() - t0
    print("test time: %0.3fs" % test_time)

    score = metrics.accuracy_score(y_test, pred1)  # accuracy on test targets
    print("accuracy: %0.3f" % score)

    if hasattr(clf, 'coef_'):
        print("dimensionality: %d" % clf.coef_.shape[1])
        print("density: %f" % density(clf.coef_))

    print("classification report:")
    print(metrics.classification_report(y_test, pred1, target_names=class_names))
    print("confusion matrix:")
    print(metrics.confusion_matrix(y_test, pred1))
    print()
    clf_descr = str(clf).split('(')[0]
    return clf_descr, score, train_time, test_time
def benchmarkWithCV(clf, X, y, n_folds):
    print('-' * 80)
    print("Training: ")
    print(clf)
    cv = StratifiedKFold(y, n_folds)
    cv_scores = []
    for train, test in cv:
        # Reset the clock inside the loop; the original started it once before
        # the loop, so later folds' train times included earlier folds' work.
        t0 = time()
        clf.fit(X[train], y[train])
        train_time = time() - t0
        print("train time: %0.3fs" % train_time)

        t0 = time()
        pred = clf.predict(X[test])
        test_time = time() - t0
        print("test time: %0.3fs" % test_time)

        # score = np.sum(pred == y_test) / float(np.size(y_test))
        score = metrics.accuracy_score(y[test], pred)
        cv_scores.append(score)
        print("accuracy: %0.3f" % score)

        if hasattr(clf, 'coef_'):
            print("dimensionality: %d" % clf.coef_.shape[1])
            print("density: %f" % density(clf.coef_))

    clf_descr = str(clf).split('(')[0]
    mean_score = np.mean(cv_scores)
    return clf_descr, mean_score, train_time, test_time
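`StratifiedKFold(y, n_folds)` is the pre-0.18 scikit-learn API, in which the splitter took the labels directly and was itself iterable. A hedged usage sketch under that assumption; the estimator and fold count are illustrative:

# Assumes the old (pre-0.18) cross_validation-era API used above.
from sklearn.svm import LinearSVC

name, mean_acc, last_train_t, last_test_t = benchmarkWithCV(
    LinearSVC(C=1.0), X, y, n_folds=5)
print("%s: mean CV accuracy %.3f" % (name, mean_acc))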
def benchmark(clf):
    print(80 * '_')
    print("Training: ")
    print(clf)
    t0 = time()
    clf.fit(X_train, y_train)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)

    t0 = time()
    pred = clf.predict(X_test)
    test_time = time() - t0
    print("test time: %0.3fs" % test_time)

    score = metric(y_test, pred)
    print("MAE: %0.3f" % score)
    if hasattr(clf, 'alpha_'):
        print("Alpha", clf.alpha_)
    try:
        if hasattr(clf, 'coef_'):
            print("density: %f" % density(clf.coef_))
            print("dimensionality: %d" % clf.coef_.shape[0])
            print()
    except Exception as ex:
        print(ex)
    print()
    clf_descr = str(clf).split('(')[0]
    return clf_descr, score, train_time, test_time
def _classify(clf, cluster_data, X_train, y_train, X_test, feature_names,
              categories, c_params):
    print('_' * 80)
    print("Training: ")
    print(clf)
    t0 = time()
    clf.fit(X_train, y_train)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)

    t0 = time()
    cluster_data.cluster_of_posts = clf.predict(X_test)
    test_time = time() - t0
    print("test time: %0.3fs" % test_time)

    if hasattr(clf, 'coef_'):
        print("dimensionality: %d" % clf.coef_.shape[1])
        print("density: %f" % density(clf.coef_))
        if feature_names is not None:
            print("top 10 keywords per class:")
            for i, category in enumerate(categories):
                top10 = np.argsort(clf.coef_[i])[-10:]
                print(trim("%s: %s" % (category,
                                       " ".join(feature_names[top10]))))
        print()

    if c_params.is_report_printed:
        print("classification report:")
        # (the report body was not emitted in the original)
    print()
    clf_descr = str(clf).split('(')[0]
    return clf_descr, train_time, test_time
def benchmark_features_selection(clf, name):
    print('_' * 80)
    print("Training: ")
    print(clf)
    t0 = time()
    rfecv = RFECV(estimator=clf, step=1, cv=StratifiedKFold(y_train, 2),
                  scoring='accuracy')
    rfecv.fit(X_train, y_train)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)
    print(name + " Optimal number of features : %d" % rfecv.n_features_)

    # Plot number of features VS. cross-validation scores
    plt.figure()
    plt.xlabel("Number of features selected")
    plt.ylabel("Cross validation score (nb of correct classifications)")
    plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
    plt.show()

    t0 = time()
    pred = rfecv.predict(X_test)
    test_time = time() - t0
    print("test time: %0.3fs" % test_time)

    if hasattr(clf, 'coef_'):
        print("dimensionality: %d" % clf.coef_.shape[1])
        print("density: %f" % density(clf.coef_))

    print("Saving data to database:")
    save_results_data(cursor, name, testing_identifiant_produit_list, pred)
    print()
    clf_descr = str(clf).split('(')[0]
    return clf_descr, train_time, test_time
def benchmark(clf):
    print('_' * 80)
    print("Training: ")
    print(clf)
    t0 = time()
    clf.fit(X_train, y_train)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)

    t0 = time()
    pred = clf.predict(X_test)
    test_time = time() - t0
    print("test time: %0.3fs" % test_time)

    score = metrics.accuracy_score(y_test, pred)
    print("accuracy: %0.3f" % score)

    if hasattr(clf, 'coef_'):
        print("dimensionality: %d" % clf.coef_.shape[1])
        print("density: %f" % density(clf.coef_))
        print("top 10 keywords per class:")
        for i, category in enumerate(categories):
            top10 = np.argsort(clf.coef_[i])[-10:]
            print(trim("%s: %s" % (category,
                                   " ".join(feature_names[top10]).encode("utf-8"))))
        print()

    print("classification report:")
    print(metrics.classification_report(y_test, pred, target_names=categories))
    print("confusion matrix:")
    print(metrics.confusion_matrix(y_test, pred))
    print()
    clf_descr = str(clf).split('(')[0]
    return clf_descr, score, train_time, test_time
def benchmark(clf, clf_name):
    print('_' * 80)
    print("Training: ")
    print(clf)
    t0 = time()
    clf.fit(x_train_std, y_train)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)

    t0 = time()
    pred = clf.predict(x_test_std)
    test_time = time() - t0
    print("test time: %0.3fs" % test_time)

    score = metrics.accuracy_score(y_test, pred)
    print("accuracy: %0.3f" % score)

    if hasattr(clf, 'coef_'):
        print("dimensionality: %d" % clf.coef_.shape[1])
        print("density: %f" % density(clf.coef_))

    print("classification report:")
    print(metrics.classification_report(y_test, pred,
                                        target_names=["not helpful", "helpful"]))
    print("confusion matrix:")
    print(metrics.confusion_matrix(y_test, pred))
    print()
    clf_descr = str(clf).split('(')[0]
    save_confusion_matrix(confusion_matrix(y_test, pred), pred, clf_name)
    return clf_descr, score, train_time, test_time
def getReport(self, save=1, get_top_words=0):
    # returns report
    report = ""
    if get_top_words == 1:
        if hasattr(self.mlModel, 'coef_'):
            report += "Dimensionality: " + str(self.mlModel.coef_.shape[1])
            report += "\nDensity: " + str(density(self.mlModel.coef_))
            rank = np.argsort(self.mlModel.coef_[0])
            # The original labelled these "Top 10" but sliced 20 entries;
            # the names and label now match the slice width.
            top20 = rank[-20:]
            bottom20 = rank[:20]
            report += "\n\nTop 20 keywords: "
            report += "\nPositive: " + (" ".join(self.feature_names[top20]))
            report += "\nNegative: " + (" ".join(self.feature_names[bottom20]))
    score = metrics.accuracy_score(self.y_test, self.y_pred)
    report += "\n\nAccuracy: " + str(score)
    report += "\nClassification report: "
    report += "\n\n" + str(metrics.classification_report(
        self.y_test, self.y_pred, target_names=["Negative", "Positive"]))
    report += "\nConfusion matrix: "
    report += "\n\n" + str(metrics.confusion_matrix(self.y_test,
                                                    self.y_pred)) + "\n\n"
    if save == 1:
        with open(self.model_path + "report.txt", "w") as text_file:
            text_file.write(report)
    return report
def benchmark(self, clf):
    print('_' * 80)
    print("Training: ")
    print(clf)
    t0 = time.time()
    clf.fit(self.X_train, self.y_train)
    train_time = time.time() - t0
    print("train time: %0.3fs" % train_time)

    t0 = time.time()
    pred = clf.predict(self.X_test)
    test_time = time.time() - t0
    print("test time: %0.3fs" % test_time)

    score = metrics.accuracy_score(self.y_test, pred)
    print("accuracy: %0.3f" % score)

    if hasattr(clf, 'coef_'):
        print("dimensionality: %d" % clf.coef_.shape[1])
        print("density: %f" % density(clf.coef_))
        print("top 10 keywords per class:")
        for i, label in enumerate(self.target_names):
            top10 = np.argsort(clf.coef_[i])[-10:]
            print(self.trim("%s: %s" % (label,
                                        " ".join(self.feature_names[top10]))))
        print()

    print("confusion matrix:")
    print(metrics.confusion_matrix(self.y_test, pred))
    print()
    clf_descr = str(clf).split('(')[0]
    return clf_descr, score, train_time, test_time
def benchmark(clf):
    print('_' * 80)
    print("Training: ")
    print(clf)
    t0 = time()
    clf.fit(X_train, y_train)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)

    t0 = time()
    pred = clf.predict(X_test)
    test_time = time() - t0
    print("test time: %0.3fs" % test_time)

    score = metrics.f1_score(y_test, pred)
    print("f1-score: %0.3f" % score)

    if hasattr(clf, 'coef_'):
        print("dimensionality: %d" % clf.coef_.shape[1])
        print("density: %f" % density(clf.coef_))

    print("classification report:")
    print(metrics.classification_report(y_test, pred, target_names=categories))
    print("confusion matrix:")
    print(metrics.confusion_matrix(y_test, pred))
    print()
    clf_descr = str(clf).split('(')[0]
    return clf_descr, score, train_time, test_time
def benchmark(clf):
    ## Benchmark classifiers
    ## Modified after: http://scikit-learn.org/stable/auto_examples/text/document_classification_20newsgroups.html#sphx-glr-auto-examples-text-document-classification-20newsgroups-py
    print('_' * 80)
    print("Training: ")
    print(clf)
    t0 = time()
    clf.fit(X_train, Y_train)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)

    t0 = time()
    Y_pred = clf.predict(X_test)
    test_time = time() - t0
    print("test time: %0.3fs" % test_time)

    # Note: this scores on the training set (hence the label below), even
    # though the report and confusion matrix use the test predictions.
    score = clf.score(X_train, Y_train)
    print("Training accuracy: %0.3f" % score)

    if hasattr(clf, 'coef_'):
        print("dimensionality: %d" % clf.coef_.shape[1])
        print("density: %f" % density(clf.coef_))

    print("classification report:")
    print(metrics.classification_report(Y_test, Y_pred,
                                        target_names=target_names))
    print("confusion matrix:")
    print(metrics.confusion_matrix(Y_test, Y_pred))
    print()
    clf_descr = str(clf).split('(')[0]
    return clf_descr, score, train_time, test_time
def test(classifier):
    print('\n\n')
    print("Training: ")
    print(classifier)
    t0 = time()
    classifier.fit(train_matrix, train_y)
    train_time = time() - t0
    print("train time: %0.4fs" % train_time)

    t0 = time()
    pred = classifier.predict(test_matrix)
    test_time = time() - t0
    print("test time: %0.4fs" % test_time)

    score = metrics.accuracy_score(test_y, pred)
    print("accuracy: %0.4f" % score)

    if hasattr(classifier, 'coef_'):
        print("dimensionality: %d" % classifier.coef_.shape[1])
        print("density: %f" % density(classifier.coef_))
        print("top 50 keywords per rating:")
        for i in stars:
            top50 = np.argsort(classifier.coef_[i - 1])[-50:]
            print(trim("%d: %s" % (i, " ".join(feature_list[top50]))))
        print()

    print("Classification report:")
    print(metrics.classification_report(test_y, pred))
    print("Confusion matrix:")
    print(metrics.confusion_matrix(test_y, pred))

    classifier_name = str(classifier).split('(')[0]
    return classifier_name, score, train_time, test_time
def benchmark(clf):
    print('_' * 80)
    print(clf.steps[1][0])
    print('_' * 80)
    print("Training: ")
    # print(clf)
    t0 = time()
    clf.fit(X_train_flip, y_train)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)

    t0 = time()
    pred = clf.predict(X_test_flip)
    test_time = time() - t0
    print("test time: %0.3fs" % test_time)

    score = metrics.accuracy_score(y_test, pred)
    print("accuracy: %0.3f" % score)
    # f1score = metrics.f1_score(y_test, pred, average='micro')
    # print("f1-score: %0.3f" % f1score)

    if hasattr(clf, 'coef_'):
        print("dimensionality: %d" % clf.coef_.shape[1])
        print("density: %f" % density(clf.coef_))
        print()

    print("classification report:")
    print(metrics.classification_report(y_test, pred, target_names=settings[4]))

    # print("confusion matrix:")
    cm = metrics.confusion_matrix(y_test, pred)
    # print(cm)
    np.set_printoptions(precision=2)

    # Plot non-normalized confusion matrix
    plt.figure()
    plot_confusion_matrix(cm, classes=settings[4], title='Confusion matrix')
    # Plot normalized confusion matrix
    plt.figure()
    plot_confusion_matrix(cm, classes=settings[4], normalize=True,
                          title='Normalized confusion matrix')
    plt.show()

    # print()
    # print("roc auc score:")
    # roc_auc_score = metrics.roc_auc_score(y_test, pred, average='micro')
    # print(roc_auc_score)
    # print("roc curve:")
    # roc_curve = metrics.roc_curve(y_test, pred)
    # print(roc_curve)

    print()
    clf_descr = str(clf).split('(')[0]
    return clf, clf_descr, score, train_time, test_time
def benchmark(clf):
    print('_' * 80)
    print('training')
    print(clf)
    t0 = time()
    clf.fit(X_train, y_train)
    train_time = time() - t0
    print('train time %0.3f' % train_time)

    # The original assigned `to = time()` here, a typo that left t0 stale and
    # folded the training time into the reported test time.
    t0 = time()
    pred = clf.predict(X_test)
    test_time = time() - t0
    print('test time %0.3f' % test_time)

    score = metrics.accuracy_score(y_test, pred)
    print('accuracy: %0.3f' % score)

    if hasattr(clf, 'coef_'):
        print('dimensionality: %d' % clf.coef_.shape[1])
        print('density: %f' % density(clf.coef_))
        if opts.print_top10 and feature_names is not None:
            print('top 10 keywords per class:')
            for i, label in enumerate(target_names):
                top10 = np.argsort(clf.coef_[i])[-10:]
                print(trim('%s: %s' % (label, ' '.join(feature_names[top10]))))
        print()

    if opts.print_report:
        print('classification report:')
        print(metrics.classification_report(y_test, pred,
                                            target_names=target_names))

    if opts.print_cm:
        print('confusion matrix')
        print(metrics.confusion_matrix(y_test, pred))

    print()
    # Use a local variable; the original set `clf.descr` on the estimator.
    clf_descr = str(clf).split('(')[0]
    return clf_descr, score, train_time, test_time
def train_logit(Xtrain, Ytrain, Xval=None, Yval=None):
    X1, scaler = scale_data(Xtrain)
    classifier = LogisticRegression(C=1, penalty='l1', tol=1e-4, verbose=True)
    t0 = time()
    classifier.fit(X1, Ytrain)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)
    if Xval is not None:
        X2 = scale_data(Xval, scaler)
        t0 = time()
        pred = classifier.predict(X2)
        test_time = time() - t0
        print("test time: %0.3fs" % test_time)
        if hasattr(classifier, 'coef_'):
            print("dimensionality: %d" % classifier.coef_.shape[1])
            print("density: %f" % density(classifier.coef_))
        print('F1-score : ', f1_score(Yval, pred, average='binary'))
        print("classification report:")
        print(classification_report(Yval, pred, target_names=['0', '1'],
                                     digits=4))
        print("confusion matrix:")
        print(confusion_matrix(Yval, pred))
    return classifier, scaler
def benchmark(clf):
    print('_' * 80)
    print("Training: ")
    print(len(X_train))
    clf_descr = str(clf).split('(')[0]
    print("model name:" + clf_descr)
    a = datetime.now()
    if 'tensorflow' in clf_descr:
        history = clf.fit(X_train, y_train, epochs=epochs, callbacks=callbacks,
                          validation_data=(X_test, y_test), verbose=2,
                          batch_size=batch_size)
    else:
        clf.fit(X_train, y_train)
    b = datetime.now()
    # The original computed a - b (a negative timedelta) and reported only
    # its .microseconds component; use the elapsed seconds instead.
    train_time = (b - a).total_seconds()
    print("train time: %0.3fs" % train_time)

    pred = clf.predict(X_test)
    pred_train = clf.predict(X_train)
    if 'tensorflow' in clf_descr:
        # threshold the predicted probabilities at 0.3
        for i in range(len(pred)):
            pred[i] = 1 if pred[i] >= 0.3 else 0
        for i in range(len(pred_train)):
            pred_train[i] = 1 if pred_train[i] >= 0.3 else 0

    f1_score = metrics.f1_score(y_test, pred)
    print("f1_score: %0.3f" % f1_score)
    f1_score_train = metrics.f1_score(y_train, pred_train)
    print("f1_score_train: %0.3f" % f1_score_train)
    print("classification report:")
    print(classification_report(y_test, pred))
    print("confusion matrix:")
    print(metrics.confusion_matrix(y_test, pred))

    if hasattr(clf, 'coef_'):
        print("dimensionality: %d" % clf.coef_.shape[1])
        print("density: %f" % density(clf.coef_))
    return clf_descr, f1_score_train, f1_score, train_time
def benchmark(clf):
    print('_' * 80)
    print("Training: ")
    print(clf)
    cumScore = 0
    cumTrainTime = 0
    cumTestTime = 0
    for classC in categories:
        print("*** One class model for : ", classC, " ***")
        t0 = time()
        new_y_train = y_train.tolist()
        new_y_test = y_test.tolist()
        new_y_train = [x if x == classC else 0 for x in new_y_train]
        new_y_test = [x if x == classC else 0 for x in new_y_test]
        new_y_train = np.array(new_y_train)
        new_y_test = np.array(new_y_test)
        clf.fit(X_train, new_y_train)
        train_time = time() - t0
        print("train time: %0.3fs" % train_time)
        cumTrainTime += train_time

        t0 = time()
        pred = clf.predict(X_test)
        test_time = time() - t0
        print("test time: %0.3fs" % test_time)
        cumTestTime += test_time

        score = metrics.accuracy_score(new_y_test, pred)
        cumScore += score
        print("accuracy: %0.3f" % score)

        if hasattr(clf, 'coef_'):
            print("dimensionality: %d" % clf.coef_.shape[1])
            print("density: %f" % density(clf.coef_))

        if False:  # or opts.print_top10 and feature_names is not None:
            print("top 10 keywords per class:")
            for i, category in enumerate(categories):
                top10 = np.argsort(clf.coef_[i])[-10:]
                print(trim("%s: %s" % (category,
                                       " ".join(feature_names[top10]))))
            print()

        if False:  # or opts.print_report:
            print("classification report:")
            print(metrics.classification_report(new_y_test, pred,
                                                target_names=categories))

        if False:  # opts.print_cm:
            print("confusion matrix:")
            print(metrics.confusion_matrix(new_y_test, pred))

        print()
    clf_descr = str(clf).split('(')[0]
    return clf_descr, cumScore / len(categories), cumTrainTime, cumTestTime
def benchmark(clf, X_train, y_train, X_test, y_test, target_names,
              print_report=True, feature_names=None, print_top10=False,
              print_cm=True):
    print('_' * 80)
    print("Training: ")
    print(clf)
    t0 = time()
    clf.fit(X_train, y_train)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)

    t0 = time()
    pred = clf.predict(X_test)
    test_time = time() - t0
    print("test time: %0.3fs" % test_time)

    score = metrics.accuracy_score(y_test, pred)
    print("accuracy: %0.3f" % score)
    # print("Accuracy: %0.3f (+/- %0.3f)" % (score.mean(), score.std() * 2))

    if hasattr(clf, 'coef_'):
        print("dimensionality: %d" % clf.coef_.shape[1])
        print("density: %f" % density(clf.coef_))
        if print_top10 and feature_names is not None:
            print("top 10 keywords per class:")
            for i, label in enumerate(["Make Update", "Setup Printer",
                                       "Shutdown Computer",
                                       "Software Recommendation", "None"]):
                top10 = np.argsort(clf.coef_[i])[-10:]
                # use a distinct index name; the original comprehension
                # reused `i` and shadowed the loop variable
                print(trim("%s: %s" % (label,
                                       " ".join([feature_names[idx]
                                                 for idx in top10]))))
        print()

    if print_report:
        print("classification report:")
        print(metrics.classification_report(y_test, pred,
                                            target_names=target_names))
    if print_cm:
        print("confusion matrix:")
        print(metrics.confusion_matrix(y_test, pred))

    print()
    clf_descr = str(clf).split('(')[0]
    return clf_descr, score, train_time, test_time
def benchmark(clf):
    print('_' * 80)
    print("Training: ")
    print(clf)
    t0 = time()
    clf.fit(X_train, y_train)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)

    t0 = time()
    pred = clf.predict(X_test)
    test_time = time() - t0
    print("test time: %0.3fs" % test_time)

    score = metrics.f1_score(y_test, pred)
    accscore = metrics.accuracy_score(y_test, pred)
    print("pred count is %d" % len(pred))
    print('accuracy score: %0.3f' % accscore)
    print("f1-score: %0.3f" % score)

    if hasattr(clf, 'coef_'):
        print("dimensionality: %d" % clf.coef_.shape[1])
        print("density: %f" % density(clf.coef_))

    print("classification report:")
    print(metrics.classification_report(y_test, pred, target_names=categories))
    print("confusion matrix:")
    print(metrics.confusion_matrix(y_test, pred))

    print("confidence for unlabeled data:")
    # compute absolute confidence for each unlabeled sample in each class
    confidences = np.abs(clf.decision_function(X_unlabeled))
    # average abs(confidence) over all classes for each unlabeled sample
    # (if there are more than 2 classes); the original passed axix=1, a typo
    if len(categories) > 2:
        confidences = np.average(confidences, axis=1)
    print(confidences)

    sorted_confidences = np.argsort(confidences)
    question_samples = []
    # select top k low confidence unlabeled samples
    low_confidence_samples = sorted_confidences[0:NUM_QUESTIONS]
    # select top k high confidence unlabeled samples
    high_confidence_samples = sorted_confidences[-NUM_QUESTIONS:]
    question_samples.extend(low_confidence_samples.tolist())
    question_samples.extend(high_confidence_samples.tolist())

    print()
    clf_descr = str(clf).split('(')[0]
    return clf_descr, score, train_time, test_time, question_samples
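The returned `question_samples` indices look like an active-learning hook: the least- and most-confident unlabeled samples are presumably shown to a human for labeling. A hedged sketch of that step; `ask_oracle` and the refit loop are assumptions, not part of the original:

# Hypothetical active-learning step built on the returned indices.
name, f1, t_train, t_test, question_samples = benchmark(clf)
for idx in question_samples:
    label = ask_oracle(X_unlabeled[idx])  # hypothetical labeling call
    # ...append (X_unlabeled[idx], label) to the training set and refit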
def benchmark(clf, section):
    print('_' * 80)
    print("Training: ")
    print(clf)
    t0 = time()
    clf.fit(X_train, y_train)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)

    t0 = time()
    pred = clf.predict(X_test)
    test_time = time() - t0
    print("test time: %0.3fs" % test_time)

    score = metrics.accuracy_score(y_test, pred)
    print("accuracy: %0.4f" % score)
    print("classification report:")
    class_matrix = metrics.classification_report(
        y_test, pred, target_names=target_names).split()
    print(metrics.classification_report(y_test, pred,
                                        target_names=target_names))
    print("confusion matrix:")
    print(metrics.confusion_matrix(y_test, pred))
    print()
    clf_descr = str(clf).split('(')[0]

    if hasattr(clf, 'coef_'):
        with open("svm_top_features.tsv", "a", encoding="utf-8") as tsv2:
            print("dimensionality: %d" % clf.coef_.shape[1])
            print("density: %f" % density(clf.coef_))
            coefs_with_fns = sorted(zip(clf.coef_[0], feature_names))
            top = zip(coefs_with_fns[:50], coefs_with_fns[:-(50 + 1):-1])
            for (coef_1, fn_1), (coef_2, fn_2) in top:
                print("\t%.4f\t%-15s\t\t%.4f\t%-15s"
                      % (coef_1, fn_1, coef_2, fn_2))
                # writes coef_d feat_d coef_r feat_r section model acc_d
                # acc_r acc_all recall f1 support
                tsv2.write("\n%.5f\t%s\t%.5f\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s"
                           % (coef_1, fn_1, coef_2, fn_2, section,
                              str(clf).split("(")[0], class_matrix[5],
                              class_matrix[10], class_matrix[17],
                              class_matrix[18], class_matrix[19],
                              class_matrix[20]))
        # the original's explicit tsv2.close() is redundant inside `with`
    print()
    return clf_descr, score, train_time, test_time
def benchmark(clf):
    global train_duration, test_duration
    print('_' * 80)
    print("Training: ")
    print(clf)
    t0 = time()
    if isinstance(clf, (GensimFastText, FastText)):
        clf.fit(train_text, y_train)
        train_time = time() - t0
    else:
        clf.fit(X_train, y_train)
        train_time = train_duration + (time() - t0)
    print("train time: %0.3fs" % train_time)

    t0 = time()
    if isinstance(clf, (GensimFastText, FastText)):
        pred = clf.predict(test_text)
        test_time = time() - t0
        # fix unknown predictions
        pred = [most_freq if p is None else p for p in pred]
    else:
        pred = clf.predict(X_test)
        test_time = test_duration + (time() - t0)
    print("test time: %0.3fs" % test_time)

    score = metrics.f1_score(y_test, pred, average='macro')
    print("macro F1: %0.3f" % score)

    if hasattr(clf, 'coef_'):
        print("dimensionality: %d" % clf.coef_.shape[1])
        print("density: %f" % density(clf.coef_))
        if opts.print_top10 and feature_names is not None:
            print("top 10 keywords per class:")
            for i, category in enumerate(categories):
                top10 = np.argsort(clf.coef_[i])[-10:]
                print(trim("%s: %s" % (category,
                                       " ".join(feature_names[top10]))))
        print()

    if opts.print_report:
        print("classification report:")
        print(metrics.classification_report(y_test, pred,
                                            target_names=categories))
    if opts.print_cm:
        print("confusion matrix:")
        print(metrics.confusion_matrix(y_test, pred))

    print()
    clf_descr = str(clf).split('(')[0]
    return clf_descr, score, train_time, test_time
def benchmark(clf, name):
    print('_' * 80)
    # needed by both the save and restore paths; the original assigned it
    # only on the training path, breaking restore-only runs
    clf_path = "models/%s/" % name
    if not opts.restore:
        print("Training: ")
        print(clf)
        t0 = time()
        clf.fit(X_train, y_train)
        train_time = time() - t0
        print("train time: %0.3fs" % train_time)
        if opts.save:
            if not os.path.exists(clf_path):
                os.makedirs(clf_path)
            joblib.dump(clf, clf_path + "model.pkl")
    if opts.restore:
        clf = joblib.load(clf_path + "model.pkl")
        train_time = 0

    t0 = time()
    pred = clf.predict(X_test)
    test_time = time() - t0
    print("test time: %0.3fs" % test_time)

    score = metrics.accuracy_score(y_test, pred)
    print("accuracy: %0.3f" % score)

    if hasattr(clf, 'coef_'):
        print("dimensionality: %d" % clf.coef_.shape[1])
        print("density: %f" % density(clf.coef_))
        if opts.print_top10 and feature_names is not None:
            print("top 10 keywords per class:")
            for i, label in enumerate(target_names):
                top10 = np.argsort(clf.coef_[i])[-10:]
                print(trim("%s: %s" % (label, " ".join(feature_names[top10]))))
        print()

    if opts.print_report:
        print("classification report:")
        print(metrics.classification_report(y_test, pred,
                                            target_names=target_names))
    if opts.print_cm:
        print("confusion matrix:")
        print(metrics.confusion_matrix(y_test, pred))

    print()
    clf_descr = str(clf).split('(')[0]
    return clf_descr, score, train_time, test_time
def benchmark(clf):
    print('_' * 80)
    print("Training: ")
    print(clf)
    t0 = time()
    clf.fit(X_train, y_train)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)

    t0 = time()
    pred = clf.predict(X_test)
    test_time = time() - t0
    print("test time: %0.3fs" % test_time)

    score = metrics.accuracy_score(y_test, pred)
    print("accuracy: %0.3f" % score)

    if hasattr(clf, 'coef_'):
        print("dimensionality: %d" % clf.coef_.shape[1])
        print("density: %f" % density(clf.coef_))
        print_top10 = True
        if print_top10 and feature_names is not None:
            print("top 10 keywords per class:")
            for i, label in enumerate(target_names):
                if i >= len(clf.coef_):
                    print("%s: Missing data???" % label)
                    continue
                top10 = np.argsort(clf.coef_[i])[-10:]
                try:
                    print(trim("%s: \"%r\"" % (label,
                                               '" "'.join(feature_names[top10]))))
                except UnicodeEncodeError as e:
                    print(e)
        print()

    print_report = True
    if print_report:
        print("classification report:")
        print(target_names)
        print(metrics.classification_report(y_test, pred,
                                            target_names=target_names))

    print_cm = True
    if print_cm:
        print("confusion matrix:")
        print(metrics.confusion_matrix(y_test, pred))

    print()
    clf_descr = str(clf).split('(')[0]
    return clf_descr, score, train_time, test_time
def benchmark(clf):
    print('_' * 80)
    print("Training: ")
    print(clf)
    t0 = time()
    clf.fit(X_train, y_train)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)

    t0 = time()
    pred = clf.predict(X_test)
    # The original predicted a second time into `ccm` inside the timed
    # block, inflating the reported test time; reuse the first prediction.
    print(pred)
    # pred_list = clf.predict_proba(X_test)
    # print(clf.classes_, pred_list)
    test_time = time() - t0
    print("test time: %0.3fs" % test_time)

    # Accuracy plus macro/micro F1 scores
    score_acc = metrics.accuracy_score(y_test, pred)
    score_macro_f1 = metrics.f1_score(y_test, pred, average='macro')
    score_micro_f1 = metrics.f1_score(y_test, pred, average='micro')
    print("Accuracy: %0.3f" % score_acc)  # originally mislabelled "Precision"
    print("Macro F1: %0.3f" % score_macro_f1)
    print("Micro F1: %0.3f" % score_micro_f1)
    # target_names must be passed by keyword; positionally it would be
    # interpreted as the `labels` argument
    print(metrics.classification_report(y_test, pred,
                                        target_names=target_names))

    if hasattr(clf, 'coef_'):
        print("dimensionality: %d" % clf.coef_.shape[1])
        print("density: %f" % density(clf.coef_))
        if opts.print_top10 and feature_names is not None:
            print("top 10 keywords per class:")
            for i, label in enumerate(target_names):
                top10 = np.argsort(clf.coef_[i])[-10:]
                print(trim("%s: %s" % (label, " ".join(feature_names[top10]))))
        print()

    if opts.print_report:
        print("classification report:")
        print(metrics.classification_report(y_test, pred,
                                            target_names=target_names))
    if opts.print_cm:
        print("confusion matrix:")
        print(metrics.confusion_matrix(y_test, pred))

    print()
    clf_descr = str(clf).split('(')[0]
    return clf_descr, score_acc, train_time, test_time
def benchmark(clf):
    print('_' * 80)
    print("Training: ")
    print(clf)
    t0 = time()
    clf.fit(X_train, y_train)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)

    t0 = time()
    pred = clf.predict(X_test)
    predictions = clf.predict_proba(X_test)
    fin_predict = []
    for i in range(len(predictions)):  # xrange in the original (Python 2)
        # indices of the five highest-probability classes for sample i
        x = np.argpartition(predictions[i], -5)[-5:]
        x = clf.classes_[x]
        fin_predict.append([bunch.target_names[e] for e in x])
    our_accuracies.append(final_accuracy(fin_predict))
    print(our_accuracies[-1])
    # print("------------predictions------------")
    # print(pred)
    # print("-------------------------")
    test_time = time() - t0
    print("test time: %0.3fs" % test_time)

    score = metrics.accuracy_score(y_test, pred)
    print("accuracy: %0.3f" % score)

    if hasattr(clf, 'coef_'):
        print("dimensionality: %d" % clf.coef_.shape[1])
        print("density: %f" % density(clf.coef_))
        if opts.print_top10 and feature_names is not None:
            print("top 10 keywords per class:")
            for i, category in enumerate(categories):
                top10 = np.argsort(clf.coef_[i])[-10:]
                print(trim("%s: %s" % (category,
                                       " ".join(feature_names[top10]))))
        print()

    if opts.print_report:
        print("classification report:")
        print(metrics.classification_report(y_test, pred,
                                            target_names=categories))
    if opts.print_cm:
        print("confusion matrix:")
        print(metrics.confusion_matrix(y_test, pred))

    print()
    clf_descr = str(clf).split('(')[0]
    return clf_descr, score, train_time, test_time
def benchmark(clf):
    print(80 * '_')
    print("Training: ")
    print(clf)
    t0 = time()
    clf.fit(X_train, y_train)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)

    t0 = time()
    pred = clf.predict(X_test)
    test_time = time() - t0
    print("test time: %0.3fs" % test_time)

    score = metrics.f1_score(y_test, pred)
    print("f1-score: %0.3f" % score)

    if hasattr(clf, 'coef_'):
        print("dimensionality: %d" % max(clf.coef_.shape))
        print("density: %f" % density(clf.coef_))
        if opts.print_top10:
            print("top 10 keywords per class:")
            for i, category in enumerate(categories):
                # (a leftover pdb.set_trace() breakpoint was removed here)
                # The original indexed clf.coef_[i] in the 1-D case and
                # clf.coef_[0][i] in the 2-D case, which looks inverted;
                # take the whole vector when 1-D, row i when 2-D.
                if len(clf.coef_.shape) == 1:
                    top10 = np.argsort(clf.coef_)[-10:]
                else:
                    top10 = np.argsort(clf.coef_[i])[-10:]
                print(trim("%s: %s"
                           % (category,
                              " ".join(np.array(feature_names)[top10]))))
        print()

    pos_hits = []
    for i in range(len(pred)):
        if pred[i] == 1:
            pos_hits.append(y_test[i])
    # print(float(sum(pos_hits)) / len(pos_hits))
    # print(len(pos_hits))

    if opts.print_report:
        print("classification report:")
        print(metrics.classification_report(y_test, pred,
                                            target_names=[str(c) for c in categories]))
    if opts.print_cm:
        print("confusion matrix:")
        print(metrics.confusion_matrix(y_test, pred))

    print()
    clf_descr = str(clf).split('(')[0]
    return clf_descr, score, train_time, test_time
def benchmark(clf):
    print('_' * 80)
    print("training: ")
    print(clf)
    t0 = time()
    clf.fit(x_train, y_train)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)

    t0 = time()
    pred = clf.predict(x_test)
    test_time = time() - t0
    print("test time: %0.3f" % test_time)

    score = metrics.accuracy_score(y_test, pred)
    print("accuracy: %0.3f" % score)

    if hasattr(clf, 'coef_'):
        """
        `coef_` : array, shape (n_features, ) or (n_targets, n_features)
            Estimated coefficients for the linear regression problem. If
            multiple targets are passed during the fit (y 2D), this is a
            2D array of shape (n_targets, n_features), while if only one
            target is passed, this is a 1D array of length n_features.
        """
        print("dimensionality: %d" % clf.coef_.shape[1])
        print("density: %f" % density(clf.coef_))
        if opts.print_top10 and feature_names is not None:
            print("top 10 keywords per class: ")
            for i, category in enumerate(categories):
                top10 = np.argsort(clf.coef_[i])[-10:]
                print(trim("%s: %s" % (category,
                                       " ".join(feature_names[top10]))))
        print()

    if opts.print_report:
        print("classification report: ")
        print(metrics.classification_report(y_test, pred,
                                            target_names=categories))
    if opts.print_cm:
        print("confusion matrix: ")
        print(metrics.confusion_matrix(y_test, pred))

    print()
    clf_descr = str(clf).split('(')[0]
    return clf_descr, score, train_time, test_time
def benchmark(clf):
    print('_' * 80)
    print("Training: ")
    print(clf)
    t0 = time()
    # FIXME: use X_train.toarray() instead. if it didn't work use
    # y_train.toarray() too :D
    # y_train.toarray()
    # X_train.toarray()
    # clf.fit(X_train.toarray(), y_train)
    # clf.fit(X_train, y_train.toarray())
    clf.fit(X_train, y_train)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)

    t0 = time()
    pred = clf.predict(X_test)
    test_time = time() - t0
    print("test time: %0.3fs" % test_time)

    score = metrics.accuracy_score(y_test, pred)
    print("accuracy: %0.3f" % score)
    score = metrics.precision_score(y_test, pred, average='weighted',
                                    pos_label=None)
    print("precision: %0.3f" % score)
    score = metrics.recall_score(y_test, pred, average='weighted',
                                 pos_label=None)
    print("recall: %0.3f" % score)
    score = metrics.f1_score(y_test, pred, average='weighted', pos_label=None)
    print("f1: %0.3f" % score)

    if hasattr(clf, 'coef_'):
        print("dimensionality: %d" % clf.coef_.shape[1])
        print("density: %f" % density(clf.coef_))
        if opts.print_top10 and feature_names is not None:
            print("top 10 keywords per class:")
            # for i, category in enumerate(categories):
            #     top10 = np.argsort(clf.coef_[i])[-10:]
            #     print(trim("%s: %s"
            #                % (category, " ".join(feature_names[top10]))))
        print()

    if opts.print_report:
        print("classification report:")
        # print(metrics.classification_report(y_test, pred,
        #                                     target_names=categories))
    if opts.print_cm:
        print("confusion matrix:")
        print(metrics.confusion_matrix(y_test, pred))

    print()
    clf_descr = str(clf).split('(')[0]
    # NOTE: `score` was reassigned above, so this returns the weighted F1,
    # not the accuracy printed first.
    return clf_descr, score, train_time, test_time
def benchmark(self, clf):
    print_topX = self.print_topX
    print_report = self.print_report
    print_cm = self.print_cm
    X_train = self.X_train
    y_train = self.y_train
    X_test = self.X_test
    y_test = self.y_test
    feature_names = self.feature_names
    categories = ["1"]
    print("_" * 80)
    print("Training: ")
    print(clf)
    t0 = time()
    clf.fit(X_train, y_train)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)

    t0 = time()
    pred = clf.predict(X_test)
    test_time = time() - t0
    print("test time: %0.3fs" % test_time)

    score = metrics.f1_score(y_test, pred)
    print("f1-score: %0.3f" % score)

    if hasattr(clf, "coef_"):
        print("dimensionality: %d" % clf.coef_.shape[1])
        print("density: %f" % density(clf.coef_))
        if print_topX:
            # print_topX is a count here, not a flag; the original header
            # hard-coded "top 10" regardless of the value
            print("top %d keywords per class:" % print_topX)
            for i, category in enumerate(categories):
                topX = np.argsort(clf.coef_[i])[-print_topX:]
                print(trim("%s: %s" % (category,
                                       " ".join(feature_names[topX]))))
        print()

    if print_report:
        print("classification report:")
        print(classification_report(y_test, pred))  # target_names=categories))

    if print_cm:
        print("confusion matrix:")
        print(metrics.confusion_matrix(y_test, pred))

    print()
    clf_descr = str(clf).split("(")[0]
    return clf_descr, score, train_time, test_time, clf, pred
def benchmark(clf):
    needsDense = [RandomForestClassifier, AdaBoostClassifier, Pipeline]
    print('_' * 80)
    print("Training: ")
    print(clf)
    t0 = time()
    if type(clf) in needsDense:
        clf.fit(X_train.todense(), y_train)
    else:
        clf.fit(X_train, y_train)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)

    t0 = time()
    if type(clf) in needsDense:
        pred = clf.predict(X_test.todense())
    else:
        pred = clf.predict(X_test)
    test_time = time() - t0
    print("test time: %0.3fs" % test_time)

    score = metrics.f1_score(y_test, pred)
    print("f1-score: %0.3f" % score)

    if hasattr(clf, 'coef_'):
        print("dimensionality: %d" % clf.coef_.shape[1])
        print("density: %f" % density(clf.coef_))
        if print_topX:
            print("top feature per class:")
            for i, category in enumerate(categories):
                # topX = np.min(clf.coef_.shape[1], print_topX)
                topX = np.argsort(clf.coef_[i])[-print_topX:][::-1]
                print(trim("%s: %s" % (category,
                                       " | ".join(feature_names[topX]))))
        print()

    if print_report:
        print("classification report:")
        print(metrics.classification_report(y_test, pred))  # target_names=categories))

    if print_cm:
        print("confusion matrix:")
        print(metrics.confusion_matrix(y_test, pred))

    print()
    clf_descr = str(clf).split('(')[0]
    return clf_descr, score, train_time, test_time, clf, pred
def benchmark(clf):
    print('_' * 80)
    print("Training: ")
    print(clf)
    t0 = time()
    try:
        clf.fit(X_train, y_train)
    except:
        # some estimators reject sparse input; retry with a dense array
        clf.fit(X_train.toarray(), y_train)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)

    t0 = time()
    try:
        pred = clf.predict(X_test)
    except:
        pred = clf.predict(X_test.toarray())
    test_time = time() - t0
    print("test time: %0.3fs" % test_time)

    score = metrics.f1_score(y_test, pred)
    print("f1-score: %0.3f" % score)

    if hasattr(clf, 'coef_'):
        print("dimensionality: %d" % clf.coef_.shape[1])
        print("density: %f" % density(clf.coef_))
        if opts.print_top10 and feature_names is not None:
            print("top 10 keywords per class:")
            for i, category in enumerate(categories):
                top10 = np.argsort(clf.coef_[i])[-10:]
                print(trim("%s: %s" % (category,
                                       " ".join(feature_names[top10]))))
        print()

    if opts.print_report:
        print("classification report:")
        print(metrics.classification_report(y_test, pred,
                                            target_names=categories))
    if opts.print_cm:
        print("confusion matrix:")
        print(metrics.confusion_matrix(y_test, pred))

    print()
    clf_descr = str(clf).split('(')[0]
    return clf_descr, score, train_time, test_time
def benchmark(clf, clf_descr, X_train, X_test, y_train, y_test, feature_names,
              categories, silent, print_top10):
    """Benchmark a classifier."""
    if not silent:
        print('_' * 80)
        print("Training: ")
    if (not silent) or print_top10:
        print(clf)
    t0 = time()
    clf.fit(X_train, y_train)
    train_time = time() - t0
    if not silent:
        print("train time: %0.3fs" % train_time)

    t0 = time()
    pred = clf.predict(X_test)
    test_time = time() - t0
    if not silent:
        print("test time: %0.3fs" % test_time)

    # score = metrics.f1_score(y_test, pred)
    score = np.mean(pred == y_test)
    if not silent:
        print("accuracy: %0.3f" % score)

    if hasattr(clf, 'coef_'):
        if not silent:
            print("dimensionality: %d" % clf.coef_.shape[1])
            print("density: %f" % density(clf.coef_))
        if print_top10 and feature_names is not None:
            print("top 10 keywords per class:")
            # print(categories)
            if len(categories) > 2:  # multi-class
                for i, category in enumerate(categories):
                    top10 = np.argsort(clf.coef_[i])[-10:]
                    print("%s: %s" % (category,
                                      " ".join(feature_names[top10])))
            else:  # binary
                top10 = np.argsort(clf.coef_[0])[-10:]
                print("%s" % (" ".join(feature_names[top10])))
            print()

    # clf_descr = str(clf).split('(')[0]
    return clf_descr, score, train_time, test_time, pred
def benchmark(clf, name):
    print('_' * 80)
    print("Training: ")
    print(clf)
    t0 = time()
    clf.fit(X_train, y_train)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)

    t0 = time()
    pred = clf.predict(X_test)
    test_time = time() - t0
    print("test time: %0.3fs" % test_time)

    score = metrics.accuracy_score(y_test, pred)
    print("accuracy: %0.3f" % score)

    if hasattr(clf, 'coef_'):
        print("dimensionality: %d" % clf.coef_.shape[1])
        print("density: %f" % density(clf.coef_))
        if opts.print_top10 and feature_names is not None:
            print("top 10 keywords per class:")
            # for i, category in enumerate(categories):
            #     top10 = np.argsort(clf.coef_[i])[-10:]
            #     print(trim("%s: %s"
            #                % (category, " ".join(feature_names[top10]))))
        print()

    if opts.print_report:
        print("classification report:")
        print(metrics.classification_report(y_test, pred))
        # print(metrics.classification_report(y_test, pred,
        #                                     target_names=categories))
    if opts.print_cm:
        print("confusion matrix:")
        print(metrics.confusion_matrix(y_test, pred))

    print("Saving data to database:")
    save_my_data(cursor, name, testing_identifiant_produit_list, y_test, pred)
    print()
    clf_descr = str(clf).split('(')[0]
    return clf_descr, score, train_time, test_time
def benchmark(clf):
    print('_' * 80)
    print("Training: ")
    print(clf)
    t0 = time()
    if type(clf) is RandomForestClassifier:
        clf.fit(X_train.todense(), y_train)
    else:
        clf.fit(X_train, y_train)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)

    t0 = time()
    if type(clf) is RandomForestClassifier:
        pred = clf.predict(X_test.todense())
    else:
        pred = clf.predict(X_test)
    test_time = time() - t0
    print("test time: %0.3fs" % test_time)

    score = metrics.f1_score(y_test, pred)
    accscore = metrics.accuracy_score(y_test, pred)
    print("pred count is %d" % len(pred))
    print('accuracy score: %0.3f' % accscore)
    print("f1-score: %0.3f" % score)

    if hasattr(clf, 'coef_'):
        print("dimensionality: %d" % clf.coef_.shape[1])
        print("density: %f" % density(clf.coef_))

    print("classification report:")
    print(metrics.classification_report(y_test, pred, target_names=categories))
    print("confusion matrix:")
    print(metrics.confusion_matrix(y_test, pred))
    print()
    clf_descr = str(clf).split('(')[0]
    return clf_descr, score, train_time, test_time
def benchmark(clf, name):
    print('_' * 80)
    print("Training: ")
    print(clf)
    t0 = time()
    clf.fit(X_train, y_train)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)

    t0 = time()
    pred = clf.predict(X_test)
    test_time = time() - t0
    print("test time: %0.3fs" % test_time)

    score = metrics.accuracy_score(y_test, pred)
    print("accuracy: %0.3f" % score)

    if hasattr(clf, 'coef_'):
        print("dimensionality: %d" % clf.coef_.shape[1])
        print("density: %f" % density(clf.coef_))
        # if opts.print_top10 and feature_names is not None:
        print("top 10 keywords per class:")
        for i, category in enumerate(categories):
            top10 = np.argsort(clf.coef_[i])[-10:]
            print(trim("%s: %s" % (category,
                                   " ".join(feature_names[top10]))))

    # if opts.print_report:
    print("classification report:")
    print(metrics.classification_report(y_test, pred, target_names=categories))

    # if opts.print_cm:
    print("confusion matrix:")
    print(metrics.confusion_matrix(y_test, pred))

    clf_descr = str(clf).split('(')[0]
    # predicted_labels = le.inverse_transform(clf.predict(dtM_whole))
    # np.savetxt(name + '.csv', predicted_labels, delimiter=",")
    return clf_descr, score, train_time, test_time
def benchmark(clf):
    print('_' * 80)
    print("Training: ")
    print(clf)
    clf.fit(X_train, y_train)
    pred = clf.predict(X_test)

    score = metrics.f1_score(y_test, pred)
    print("f1-score: %0.3f" % score)

    if hasattr(clf, 'coef_'):
        print("dimensionality: %d" % clf.coef_.shape[1])
        print("density: %f" % density(clf.coef_))
        if opts.print_top10 and feature_names is not None:
            print("top 10 keywords per class:")
            for i, category in enumerate(categories):
                top10 = np.argsort(clf.coef_[i])[-10:]
                print(trim("%s: %s" % (category,
                                       " ".join(feature_names[top10]))))
        print()

    if opts.print_report:
        print("classification report:")
        print(metrics.classification_report(y_test, pred,
                                            target_names=categories))

    if opts.print_cm:
        print("confusion matrix:")
        # print(metrics.confusion_matrix(y_test, pred))
        cm = confusion_matrix(y_test, pred)
        plt.matshow(cm)
        plt.title('Confusion matrix')
        plt.colorbar()
        plt.ylabel('True label')
        plt.xlabel('Predicted label')
        plt.show()

    print()
    clf_descr = str(clf).split('(')[0]
    return clf_descr, score
def benchmark(self, opts, classifier, dataTrain, labelTrain, dataTest,
              labelTest):
    print('_' * 80)
    print('Training: ')
    print(classifier)

    # start training and measure the time frame
    t0 = time()
    classifier.fit(dataTrain, labelTrain)
    trainTime = time() - t0
    print('train time: %0.3fs' % trainTime)

    # start prediction and measure the time frame
    t0 = time()
    predictor = classifier.predict(dataTest)
    testTime = time() - t0
    print('test time: %0.3fs' % testTime)

    # accuracy
    score = metrics.f1_score(labelTest, predictor)
    print('f1 score: %0.3f' % score)

    if hasattr(classifier, 'coef_'):
        print('dimensionality: %d' % classifier.coef_.shape[1])
        print('density: %f' % density(classifier.coef_))
        # the original tested `feature_name` but indexed `featureNames`;
        # use one name consistently
        if opts.print_top10 and featureNames is not None:
            print('top 10 keywords per class:')
            for i, category in enumerate(categories):
                top10 = np.argsort(classifier.coef_[i])[-10:]
                print(trim('%s: %s' % (category,
                                       ' '.join(featureNames[top10]))))
        print()

    if opts.print_report:
        print('classification report:')
        print(metrics.classification_report(labelTest, predictor,
                                            target_names=categories))
    if opts.print_cm:
        print('confusion matrix:')
        print(metrics.confusion_matrix(labelTest, predictor))

    print()
    classifierDescription = str(classifier).split('(')[0]
    return classifierDescription, score, trainTime, testTime
def benchmark(clf):
    print(80 * '_')
    print("Training: ")
    print(clf)
    t0 = time()
    clf.fit(X_train, y_train)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)

    t0 = time()
    pred = clf.predict(X_test)
    test_time = time() - t0
    print("test time: %0.3fs" % test_time)

    score = metrics.f1_score(y_test, pred)
    print("f1-score: %0.3f" % score)

    if hasattr(clf, 'coef_'):
        print("dimensionality: %d" % clf.coef_.shape[1])
        print("density: %f" % density(clf.coef_))
        if opts.print_top10:
            print("top 10 keywords per class:")
            for i, category in enumerate(categories):
                top10 = np.argsort(clf.coef_[i])[-10:]
                print(trim("%s: %s"
                           % (category,
                              " ".join(np.array(feature_names)[top10]))))
                print(clf.coef_[i][top10])
        print()

    if opts.print_report:
        print("classification report:")
        print(metrics.classification_report(y_test, pred,
                                            target_names=categories))
    if opts.print_cm:
        print("confusion matrix:")
        print(metrics.confusion_matrix(y_test, pred))

    print()
    clf_descr = str(clf).split('(')[0]
    return clf_descr, score, train_time, test_time
def benchmark(clf, name):
    print('_' * 80)  # separator line for presentation
    print("Training: ")
    print(clf)
    t0 = time()  # read the current time
    clf.fit(X_train, y_train)  # train the model
    train_time = time() - t0  # measure the time spent training
    print("train time: %0.3fs" % train_time)

    t0 = time()
    pred = clf.predict(X_test)  # prediction on the test set
    test_time = time() - t0
    print("test time: %0.3fs" % test_time)

    score = metrics.accuracy_score(y_test, pred)  # accuracy score
    print("accuracy: %0.3f" % score)

    if hasattr(clf, 'coef_'):
        # .shape gives the dimensions of the coefficient array
        print("dimensionality: %d" % clf.coef_.shape[1])
        print("density: %f" % density(clf.coef_))
        # Find the top keywords at each end of the single coefficient
        # vector: C (Conservative, label -1) and L (Liberal, label +1)
        print("top 10 keywords per class:")
        top10C = np.argsort(clf.coef_[0])[-10:]  # Conservative -1
        top10L = np.argsort(clf.coef_[0])[:10]   # Liberal +1
        print(trim("C: %s" % (" ".join([feature_names[word_idx]
                                        for word_idx in top10C]))))
        print(trim("L: %s" % (" ".join([feature_names[word_idx]
                                        for word_idx in top10L]))))

    print("classification report:")
    print(metrics.classification_report(y_test, pred, target_names=categories))
    print("confusion matrix:")
    print(metrics.confusion_matrix(y_test, pred))

    clf_descr = str(clf).split('(')[0]
    return clf_descr, score, train_time, test_time
def benchmark(clf, X_train, X_test, y_train, y_test, feature_names):
    print('_' * 80)
    print("Training: ")
    print(clf)
    t0 = time.time()
    clf.fit(X_train, y_train)
    train_time = time.time() - t0
    print("train time: %0.3fs" % train_time)

    t0 = time.time()
    pred = clf.predict(X_test)
    test_time = time.time() - t0
    print("test time: %0.3fs" % test_time)

    score = metrics.f1_score(y_test, pred)
    print("f1-score: %0.3f" % score)

    if hasattr(clf, 'coef_'):
        print("density: %f" % density(clf.coef_))
        if feature_names is not None:
            print("top 10 keywords per class:")
            if clf.coef_.shape[0] == 1:
                top10female = np.argsort(clf.coef_[0])[-10:]
                top10male = np.argsort(clf.coef_[0])[:10]
            else:
                top10female = np.argsort(clf.coef_)[-10:]
                top10male = np.argsort(clf.coef_)[:10]
            print("%s: %s" % ("Female", ", ".join(feature_names[top10female])))
            print("%s: %s" % ("Male", ", ".join(feature_names[top10male])))
            print("")

    print("classification report:")
    print(metrics.classification_report(y_test, pred,
                                        target_names=['Female', 'Male']))
    print("confusion matrix:")
    print(metrics.confusion_matrix(y_test, pred))
    print("")

    clf_descr = str(clf).split('(')[0]
    return clf_descr, score, train_time, test_time
def benchmark(clf):
    print('_' * 80)
    print('Training: ')
    print(clf)
    t0 = time()
    clf.fit(X_train, y_train)
    train_time = time() - t0
    print('train time: %0.3fs' % train_time)

    t0 = time()
    pred = clf.predict(X_test)
    test_time = time() - t0
    print('test time: %0.3fs' % test_time)

    score = metrics.f1_score(y_test, pred)
    print('f1-score: %0.3f' % score)

    if hasattr(clf, 'coef_'):
        print('dimensionality: %d' % clf.coef_.shape[1])
        print('density: %f' % density(clf.coef_))
        if opts.print_top10 and feature_names is not None:
            print('top 10 keywords per class:')
            for (i, category) in enumerate(categories):
                top10 = np.argsort(clf.coef_[i])[-10:]
                print(trim('%s: %s' % (category,
                                       ' '.join(feature_names[top10]))))
        print()

    if opts.print_report:
        print('classification report:')
        print(metrics.classification_report(y_test, pred,
                                            target_names=categories))

    if opts.print_cm:
        print('confusion matrix:')
        print(metrics.confusion_matrix(y_test, pred))

    print()
    clf_descr = str(clf).split('(')[0]
    return (clf_descr, score, train_time, test_time)
def benchmark(clf):
    print('_' * 80)
    print("Training: ")
    print(clf)
    t0 = time()
    # the original printed clf.__name__, which fails on estimator instances;
    # use the class name instead
    print(clf.__class__.__name__)
    clf.fit(X_train, y_train)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)

    t0 = time()
    pred = clf.predict(X_test)
    test_time = time() - t0
    print("test time: %0.3fs" % test_time)

    score = metrics.accuracy_score(y_test, pred)
    print("accuracy: %0.3f" % score)

    if hasattr(clf, 'coef_'):
        print("dimensionality: %d" % clf.coef_.shape[1])
        print("density: %f" % density(clf.coef_))
        print("top 10 keywords per class:")
        rank = np.argsort(clf.coef_[0])
        top10 = rank[-10:]
        bottom10 = rank[:10]
        print(trim("%s: %s" % ("Funny: ",
                               " ".join(feature_names[top10]).encode("utf-8"))))
        print(trim("%s: %s" % ("Not Funny: ",
                               " ".join(feature_names[bottom10]).encode("utf-8"))))
        print()

    print("classification report:")
    print(metrics.classification_report(y_test, pred, target_names=categories))
    print("confusion matrix:")
    print(metrics.confusion_matrix(y_test, pred))
    print()
    clf_descr = str(clf).split('(')[0]
    return clf_descr, score, train_time, test_time
def benchmark(clf):
    print('_' * 80)
    print("Training: ")
    print(clf)
    t0 = time()
    try:
        score = cross_validation.cross_val_score(clf, X, y, cv=5)
    except:
        # fall back to a dense matrix for estimators that reject sparse input
        score = cross_validation.cross_val_score(clf, X.toarray(), y, cv=5)
    test_time = time() - t0
    print("CV time: %0.3fs" % test_time)

    # score = metrics.f1_score(y_test, pred)
    print("CV-score: %s" % str(score))
    print("Mean CV-score: %f" % np.mean(score))

    if hasattr(clf, 'coef_'):
        print("dimensionality: %d" % clf.coef_.shape[1])
        print("density: %f" % density(clf.coef_))
        if opts.print_top10 and feature_names is not None:
            print("top 10 keywords per class:")
            for i, category in enumerate(categories):
                top10 = np.argsort(clf.coef_[i])[-10:]
                print(trim("%s: %s" % (category,
                                       " ".join(feature_names[top10]))))
        print()

    # NOTE: these two blocks still reference y_test/pred from the original
    # train/test variant; this CV version never computes them, so enabling
    # the flags would raise a NameError.
    if opts.print_report:
        print("classification report:")
        print(metrics.classification_report(y_test, pred,
                                            target_names=categories))
    if opts.print_cm:
        print("confusion matrix:")
        print(metrics.confusion_matrix(y_test, pred))

    print()
    clf_descr = str(clf).split('(')[0]
    return clf_descr, np.mean(score)
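`cross_validation.cross_val_score` is the pre-0.18 import path; in modern scikit-learn the same call lives in `model_selection`. A hedged equivalent of the scoring step above, under that assumption:

# Modern import path for the same CV scoring (scikit-learn >= 0.18).
from sklearn.model_selection import cross_val_score

scores = cross_val_score(clf, X, y, cv=5)
print("Mean CV-score: %f" % scores.mean())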