def stochasticGD(input_file, Output):
    lvltrace.lvltrace("LVLEntree dans stochasticGD")
    ncol = tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1))
    X = data[:, 1:]
    y = data[:, 0]
    n_samples, n_features = X.shape
    clf = SGDClassifier(loss="hinge", penalty="l2")
    clf.fit(X, y)
    y_pred = clf.predict(X)
    print "#########################################################################################################\n"
    print "Stochastic Gradient Descent "
    print "classification accuracy:", metrics.accuracy_score(y, y_pred)
    print "precision:", metrics.precision_score(y, y_pred)
    print "recall:", metrics.recall_score(y, y_pred)
    print "f1 score:", metrics.f1_score(y, y_pred)
    print "\n"
    print "#########################################################################################################\n"
    results = Output + "Stochastic_GD_metrics.txt"
    file = open(results, "w")
    file.write("Stochastic Gradient Descent estimator accuracy\n")
    file.write("Classification Accuracy Score: %f\n" % metrics.accuracy_score(y, y_pred))
    file.write("Precision Score: %f\n" % metrics.precision_score(y, y_pred))
    file.write("Recall Score: %f\n" % metrics.recall_score(y, y_pred))
    file.write("F1 Score: %f\n" % metrics.f1_score(y, y_pred))
    file.write("\n")
    file.write("True Value, Predicted Value, Iteration\n")
    for n in xrange(len(y)):
        file.write("%f,%f,%i\n" % (y[n], y_pred[n], (n+1)))
    file.close()
    title = "Stochastic Gradient Descent"
    save = Output + "Stochastic_GD_confusion_matrix.png"
    plot_confusion_matrix(y, y_pred, title, save)
    lvltrace.lvltrace("LVLSortie dans stochasticGD")
def SVC_linear(input_file, Output):
    lvltrace.lvltrace("LVLEntree dans SVC_linear")
    ncol = tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1))
    X = data[:, 1:]
    y = data[:, 0]
    n_samples, n_features = X.shape
    clf = svm.SVC(kernel='linear')
    clf.fit(X, y)
    y_pred = clf.predict(X)
    print "#########################################################################################################\n"
    print "C-Support Vector Classification (with linear kernel) "
    print "classification accuracy:", metrics.accuracy_score(y, y_pred)
    print "precision:", metrics.precision_score(y, y_pred)
    print "recall:", metrics.recall_score(y, y_pred)
    print "f1 score:", metrics.f1_score(y, y_pred)
    print "\n"
    print "#########################################################################################################\n"
    results = Output + "SVM_Linear_Kernel_metrics.txt"
    file = open(results, "w")
    file.write("Support Vector Machine with Linear Kernel estimator accuracy\n")
    file.write("Classification Accuracy Score: %f\n" % metrics.accuracy_score(y, y_pred))
    file.write("Precision Score: %f\n" % metrics.precision_score(y, y_pred))
    file.write("Recall Score: %f\n" % metrics.recall_score(y, y_pred))
    file.write("F1 Score: %f\n" % metrics.f1_score(y, y_pred))
    file.write("\n")
    file.write("True Value, Predicted Value, Iteration\n")
    for n in xrange(len(y)):
        file.write("%f,%f,%i\n" % (y[n], y_pred[n], (n+1)))
    file.close()
    title = "SVC - linear Kernel"
    save = Output + "SVC_linear_confusion_matrix.png"
    plot_confusion_matrix(y, y_pred, title, save)
    lvltrace.lvltrace("LVLSortie dans SVC_linear")
def randomforest(input_file, Output):
    lvltrace.lvltrace("LVLEntree dans randomforest")
    ncol = tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1))
    X = data[:, 1:]
    y = data[:, 0]
    n_samples, n_features = X.shape
    clf = RandomForestClassifier(n_estimators=10)
    clf.fit(X, y)
    y_pred = clf.predict(X)
    print "#########################################################################################################\n"
    print "The Random forest algo "
    print "classification accuracy:", metrics.accuracy_score(y, y_pred)
    print "precision:", metrics.precision_score(y, y_pred)
    print "recall:", metrics.recall_score(y, y_pred)
    print "f1 score:", metrics.f1_score(y, y_pred)
    print "\n"
    print "#########################################################################################################\n"
    results = Output + "Random_Forest_metrics.txt"
    file = open(results, "w")
    file.write("Random Forest Classifier estimator accuracy\n")
    file.write("Classification Accuracy Score: %f\n" % metrics.accuracy_score(y, y_pred))
    file.write("Precision Score: %f\n" % metrics.precision_score(y, y_pred))
    file.write("Recall Score: %f\n" % metrics.recall_score(y, y_pred))
    file.write("F1 Score: %f\n" % metrics.f1_score(y, y_pred))
    file.write("\n")
    file.write("True Value, Predicted Value, Iteration\n")
    for n in xrange(len(y)):
        file.write("%f,%f,%i\n" % (y[n], y_pred[n], (n+1)))
    file.close()
    title = "The Random forest"
    save = Output + "Random_Forest_confusion_matrix.png"
    plot_confusion_matrix(y, y_pred, title, save)
    lvltrace.lvltrace("LVLSortie dans randomforest")
def run_model(X_test, X_train, y_test, y_train, prob_threshold=20, layers=5, nodes=64, dropout=50):
    print "run_model RUNNING"
    # Grab the model
    model = get_model(X_test, layers=layers, dropout=dropout)
    model.fit(X_train, y_train, nb_epoch=20, batch_size=16, verbose=0)
    # Get the training and test predictions from our model fit.
    train_predictions = model.predict_proba(X_train)
    test_predictions = model.predict_proba(X_test)
    # Set these to either 0 or 1 based off the probability threshold we
    # passed in (divide by 100 because we passed in integers).
    train_preds = (train_predictions) >= prob_threshold / 100.0
    test_preds = (test_predictions) >= prob_threshold / 100.0
    # Calculate the precision, recall and accuracy on the training and test sets.
    precision_score_train = precision_score(y_train, train_preds)
    precision_score_test = precision_score(y_test, test_preds)
    acc_train = accuracy_score(y_train, train_preds)
    acc_test = accuracy_score(y_test, test_preds)
    recall_score_train = recall_score(y_train, train_preds)
    recall_score_test = recall_score(y_test, test_preds)
    return precision_score_train, precision_score_test, recall_score_train, recall_score_test, acc_train, acc_test, model
def on_epoch_end(self, batch, logs={}):
    # losses
    self.losses_train.append(self.model.evaluate(X_train, Y_train, batch_size=128, verbose=0))
    self.losses_val.append(self.model.evaluate(X_val, Y_val, batch_size=128, verbose=0))
    # Roc train
    train_preds = self.model.predict_proba(X_train, verbose=0)
    train_preds = train_preds[:, 1]
    roc_train = metrics.roc_auc_score(y_train, train_preds)
    self.roc_train.append(roc_train)
    # Roc val
    val_preds = self.model.predict_proba(X_val, verbose=0)
    val_preds = val_preds[:, 1]
    roc_val = metrics.roc_auc_score(y_val, val_preds)
    self.roc_val.append(roc_val)
    # Metrics train
    y_preds = self.model.predict_classes(X_train, verbose=0)
    self.f1_train.append(metrics.f1_score(y_train, y_preds))
    self.recal_train.append(metrics.recall_score(y_train, y_preds))
    self.preci_train.append(metrics.precision_score(y_train, y_preds))
    # Metrics val
    y_preds = self.model.predict_classes(X_val, verbose=0)
    self.f1_val.append(metrics.f1_score(y_val, y_preds))
    self.recal_val.append(metrics.recall_score(y_val, y_preds))
    self.preci_val.append(metrics.precision_score(y_val, y_preds))
def learning_curve_mod(data, labels, clf, percents, d=100, avg=3, test_size=.2):
    """
    This method calculates the performance of the training and cross validation test set as the
    training set size increases and returns the performance at each percent

    Args:
        :param data: (nd.array) The raw data to use for training and cross validation testing
        :param labels: (nd.array) the labels associated with the data
        :param clf: (sklearn classifier) the classifier to be used for training
        :param percents: (nd.array) a list of percent of training data to use
        :param d: (int) The number of principle components to calculate
        :param avg: (int) The number of iterations to average when calculating performance
        :param test_size: (double [0,1]) The size of the testing set

    Return:
        :return: train_accuracies (list) performance on the training set
        :return: test_accuracies (list) performance on the testing set
    """
    # split into train and testing dataset
    x_train, x_test, y_train, y_test = train_test_split(data.T, labels, test_size=test_size, random_state=0)
    x_test = x_test.T
    train_accuracies = []
    test_accuracies = []
    for percent in percents:
        temp_train_accuracies = []
        temp_test_accuracies = []
        print percent
        for i in range(0, avg):
            x_train_2, x_test_2, y_train_2, y_test_2 = train_test_split(x_train, y_train, test_size=percent)
            x_train_2 = x_train_2.T
            # Subtract off the mean
            mean_face = np.mean(x_train_2, axis=1)
            x_train_2 = x_train_2 - mean_face
            # Find low dimensional subspace using PCA
            pca = PCA(n_components=d)
            pca.fit(x_train_2)
            model = pca.transform(x_train_2)
            # Project the known faces onto the face space
            label_map = np.dot(x_train_2.T, model)
            # Train a KNN classifier
            clf.fit(label_map, y_train_2)
            # Project the unknown faces onto face space
            W_train = np.dot(x_train_2.T - mean_face.T, model)
            W_test = np.dot(x_test.T - mean_face.T, model)
            test_prediction = clf.predict(W_test)
            temp_test_accuracies.append(metrics.precision_score(y_test, test_prediction))
            train_prediction = clf.predict(W_train)
            temp_train_accuracies.append(metrics.precision_score(y_train_2, train_prediction))
        train_accuracies.append(np.mean(temp_train_accuracies))
        test_accuracies.append(np.mean(temp_test_accuracies))
    return train_accuracies, test_accuracies
def main():
    f = open("me.stdout", "r").read()
    print f
    (confusionMatrix, labels, ytrue, ypred, trueCount) = readConfusionMatrix.readText(f)
    for row in confusionMatrix:
        print row
    precisionMicro = np.float(metrics.precision_score(ytrue, ypred, average="micro"))
    recallMicro = np.float(metrics.recall_score(ytrue, ypred, average="micro"))
    f1Micro = np.float(metrics.f1_score(ytrue, ypred, average="micro"))
    f1Macro = np.float(metrics.f1_score(ytrue, ypred, pos_label=1, average="macro"))
    precisionMacro = np.float(metrics.precision_score(ytrue, ypred, average="macro"))
    recallMacro = np.float(metrics.recall_score(ytrue, ypred, average="macro"))
    mConf = metrics.confusion_matrix(ytrue, ypred)
    print mConf
    print labels
    print len(ytrue)
    print len(ypred)
    print trueCount
    print metrics.accuracy_score(ytrue, ypred)
    print precisionMicro
    print recallMicro
    print f1Micro
    print f1Macro
    print precisionMacro
    print recallMacro
def calculate_f1_metrics(all_predicted, all_targets):
    first_class = first_meaningful_entity
    class_count = len(set(all_targets))
    filtered_true, filtered_predicted = [], []
    for i in range(len(all_targets)):
        if all_targets[i] > 0:
            filtered_true.append(all_targets[i])
            filtered_predicted.append(all_predicted[i])
    precision_separate_scores = metrics.precision_score(filtered_true, filtered_predicted,
                                                        labels=[i for i in range(first_class, class_count)], average=None)
    precision_score = metrics.precision_score(filtered_true, filtered_predicted,
                                              labels=[i for i in range(first_class, class_count)], average='micro')
    recall_separate_scores = metrics.recall_score(filtered_true, filtered_predicted,
                                                  labels=[i for i in range(first_class, class_count)], average=None)
    recall_score = metrics.recall_score(filtered_true, filtered_predicted,
                                        labels=[i for i in range(first_class, class_count)], average='micro')
    f1_separate_scores = metrics.f1_score(filtered_true, filtered_predicted,
                                          labels=[i for i in range(first_class, class_count)], average=None)
    f1_score = metrics.f1_score(filtered_true, filtered_predicted,
                                labels=[i for i in range(first_class, class_count)], average='micro')
    return f1_separate_scores, f1_score, precision_separate_scores, precision_score, recall_separate_scores, recall_score
def stratified_k_fold(clf, features, labels):
    skf = StratifiedKFold(labels, n_folds=3)
    precisions = []
    recalls = []
    for train_idx, test_idx in skf:
        features_train = []
        features_test = []
        labels_train = []
        labels_test = []
        for ii in train_idx:
            features_train.append(features[ii])
            labels_train.append(labels[ii])
        for jj in test_idx:
            features_test.append(features[jj])
            labels_test.append(labels[jj])

        ### fit the classifier using training set, and test on test set
        clf.fit(features_train, labels_train)
        pred = clf.predict(features_test)

        ### for each fold, print some metrics
        print
        print "precision score: ", precision_score(labels_test, pred)
        print "recall score: ", recall_score(labels_test, pred)

        precisions.append(precision_score(labels_test, pred))
        recalls.append(recall_score(labels_test, pred))

    ### aggregate precision and recall over all folds
    print "average precision: ", sum(precisions)/len(precisions)
    print "average recall: ", sum(recalls)/len(recalls)
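### For reference, a roughly equivalent (illustrative) sketch using sklearn's built-in scorers
### instead of the manual fold loop above. This is an assumption on my part, not part of the
### original exercise code; it presumes `features` and `labels` are array-like and that the
### same old sklearn.cross_validation API used above is available:
# from sklearn.cross_validation import cross_val_score
# print "average precision: ", cross_val_score(clf, features, labels, cv=3, scoring="precision").mean()
# print "average recall: ", cross_val_score(clf, features, labels, cv=3, scoring="recall").mean()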
def evaluate(ytest, ypred, filename='metrics.txt'):
    true_result = [1 if item > 0.5 else 0 for item in ytest]
    pred_result = [1 if item > 0.5 else 0 for item in ypred]

    cm = confusion_matrix(true_result, pred_result)
    print('\nConfusion matrix:')
    print(cm)
    print("\nLoss classified as loss", cm[0][0])
    print("Wins classified as wins", cm[1][1])
    print("Wins classified as loss", cm[1][0])
    print("Loss classified as wins", cm[0][1])
    print('\nAccuracy:\t', accuracy_score(true_result, pred_result))
    print('Precision:\t', precision_score(true_result, pred_result))
    print('Recall: \t', recall_score(true_result, pred_result))
    print('F1 score:\t', f1_score(true_result, pred_result))
    print('Mean absolute error:\t', mean_absolute_error(ytest, ypred))

    # print to file
    print("Loss classified as loss", cm[0][0], file=open(filename, "a"))
    print("Wins classified as wins", cm[1][1], file=open(filename, "a"))
    print("Wins classified as loss", cm[1][0], file=open(filename, "a"))
    print("Loss classified as wins", cm[0][1], file=open(filename, "a"))
    print('\nAccuracy:\t', accuracy_score(true_result, pred_result), file=open(filename, "a"))
    print('Precision:\t', precision_score(true_result, pred_result), file=open(filename, "a"))
    print('Recall: \t', recall_score(true_result, pred_result), file=open(filename, "a"))
    print('F1 score:\t', f1_score(true_result, pred_result), file=open(filename, "a"))
    print('Mean absolute error:\t', mean_absolute_error(ytest, ypred), file=open(filename, "a"))
def do_cv(self):
    from sklearn.metrics import precision_score
    data = util.get_x_y_for_cv()
    X = data['x']
    y = data['y']['Survived']
    skf = StratifiedKFold(y, n_folds=5)
    e_test = []
    e_train = []
    for train_idx, test_idx in skf:
        train_x = X.iloc[train_idx]
        train_y = y.iloc[train_idx]
        test_x = X.iloc[test_idx]
        test_y = y.iloc[test_idx]
        self.model.fit(train_x, train_y)
        yhat = self.model.predict(test_x)
        e_test.append(precision_score(test_y, yhat))
        yhat_train = self.model.predict(train_x)
        e_train.append(precision_score(train_y, yhat_train))
    print np.mean(e_train)
    print np.mean(e_test)
def predictSVD(svd, row, column, d):
    # start = timeit.default_timer()
    u = svd[0]   # clf.components_
    s = svd[1]   # clf.explained_variance_
    vt = svd[2]  # clf.fit_transform(X)
    # print " fitting done."
    # stop = timeit.default_timer()
    # print " runtime: " + str(stop - start)
    # print "d:"
    # print d

    # matrixY = clf.components_
    probsY = []
    # print "dot products:"
    for i in range(len(row)):
        # print np.dot(u[:,column[i]], v[row[i],:])
        prob = np.sum(u[column[i], :] * s * vt[:, row[i]])
        if prob < 0:
            prob = 0
        if prob > 1:
            prob = 1
        probsY.append(prob)

    probsY = np.array(probsY)
    preds = np.zeros(shape=len(probsY))
    preds[probsY >= 0.5] = 1

    print "Precision"
    print precision_score(d, preds)
    print "Recall"
    print recall_score(d, preds)
    print "F-Score"
    print f1_score(d, preds)
    return probsY, preds
def trainModel(self, folds):
    kf = cross_validation.StratifiedKFold(self.y_total, n_folds=folds, shuffle=True,
                                          random_state=random.randint(1, 100))
    for (train_index, test_index) in kf:
        self.X_train = [self.X_total[i] for i in train_index]
        self.X_test = [self.X_total[i] for i in test_index]
        self.y_train = [self.y_total[i] for i in train_index]
        self.y_test = [self.y_total[i] for i in test_index]
        print "################"
        print "Original"
        print np.array(self.y_test)
        print "################"
        self.clf = self.clf.fit(self.X_train, self.y_train)
        print "Predicted"
        y_pred = self.clf.predict(self.X_test)
        print y_pred
        print "################"
        print "Evaluation\n"
        cm = confusion_matrix(self.y_test, y_pred)
        print cm
        print "Precision Score:"
        print precision_score(self.y_test, y_pred, average="macro")
        print "Recall Score:"
        print recall_score(self.y_test, y_pred, average="macro")
        print "Accuracy Score:"
        print accuracy_score(self.y_test, y_pred)
def nearest_centroid(input_file, Output, test_size):
    ncol = tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1))
    X = data[:, 1:]
    y = data[:, 0]
    n_samples, n_features = X.shape
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)
    print X_train.shape, X_test.shape
    clf = NearestCentroid()
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print "Nearest Centroid Classifier "
    print "classification accuracy:", metrics.accuracy_score(y_test, y_pred)
    print "precision:", metrics.precision_score(y_test, y_pred)
    print "recall:", metrics.recall_score(y_test, y_pred)
    print "f1 score:", metrics.f1_score(y_test, y_pred)
    print "\n"
    results = Output + "Nearest_Centroid_metrics_test.txt"
    file = open(results, "w")
    file.write("Nearest Centroid Classifier estimator accuracy\n")
    file.write("Classification Accuracy Score: %f\n" % metrics.accuracy_score(y_test, y_pred))
    file.write("Precision Score: %f\n" % metrics.precision_score(y_test, y_pred))
    file.write("Recall Score: %f\n" % metrics.recall_score(y_test, y_pred))
    file.write("F1 Score: %f\n" % metrics.f1_score(y_test, y_pred))
    file.write("\n")
    file.write("True Value, Predicted Value, Iteration\n")
    for n in xrange(len(y_test)):
        file.write("%f,%f,%i\n" % (y_test[n], y_pred[n], (n+1)))
    file.close()
    title = "Nearest Centroid %f" % test_size
    save = Output + "Nearest_Centroid_confusion_matrix" + "_%s.png" % test_size
    plot_confusion_matrix(y_test, y_pred, title, save)
    lvltrace.lvltrace("LVLSortie dans nearest_centroid split_test")
def SVC_linear(input_file, Output, test_size):
    lvltrace.lvltrace("LVLEntree dans SVC_linear split_test")
    ncol = tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1))
    X = data[:, 1:]
    y = data[:, 0]
    n_samples, n_features = X.shape
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)
    print X_train.shape, X_test.shape
    clf = svm.SVC(kernel='linear')
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print "C-Support Vector Classification (with linear kernel) "
    print "y_test, y_pred, iteration"
    print "classification accuracy:", metrics.accuracy_score(y_test, y_pred)
    print "precision:", metrics.precision_score(y_test, y_pred)
    print "recall:", metrics.recall_score(y_test, y_pred)
    print "f1 score:", metrics.f1_score(y_test, y_pred)
    print "\n"
    results = Output + "SVM_Linear_Kernel_metrics_test.txt"
    file = open(results, "w")
    file.write("Support Vector Machine with Linear Kernel estimator accuracy\n")
    file.write("Classification Accuracy Score: %f\n" % metrics.accuracy_score(y_test, y_pred))
    file.write("Precision Score: %f\n" % metrics.precision_score(y_test, y_pred))
    file.write("Recall Score: %f\n" % metrics.recall_score(y_test, y_pred))
    file.write("F1 Score: %f\n" % metrics.f1_score(y_test, y_pred))
    file.write("\n")
    file.write("True Value, Predicted Value, Iteration\n")
    for n in xrange(len(y_test)):
        file.write("%f,%f,%i\n" % (y_test[n], y_pred[n], (n+1)))
    file.close()
    title = "SVC linear %f" % test_size
    save = Output + "SVC_linear_confusion_matrix" + "_%s.png" % test_size
    plot_confusion_matrix(y_test, y_pred, title, save)
    lvltrace.lvltrace("LVLsortie dans SVC_linear split_test")
def extratreeclassifier(input_file, Output, test_size):
    lvltrace.lvltrace("LVLEntree dans extratreeclassifier split_test")
    ncol = tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1))
    X = data[:, 1:]
    y = data[:, 0]
    n_samples, n_features = X.shape
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)
    print X_train.shape, X_test.shape
    clf = ExtraTreesClassifier(n_estimators=10)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print "Extremely Randomized Trees"
    print "classification accuracy:", metrics.accuracy_score(y_test, y_pred)
    print "precision:", metrics.precision_score(y_test, y_pred)
    print "recall:", metrics.recall_score(y_test, y_pred)
    print "f1 score:", metrics.f1_score(y_test, y_pred)
    print "\n"
    results = Output + "_Extremely_Random_Forest_metrics_test.txt"
    file = open(results, "w")
    file.write("Extremely Random Forest Classifier estimator accuracy\n")
    file.write("Classification Accuracy Score: %f\n" % metrics.accuracy_score(y_test, y_pred))
    file.write("Precision Score: %f\n" % metrics.precision_score(y_test, y_pred))
    file.write("Recall Score: %f\n" % metrics.recall_score(y_test, y_pred))
    file.write("F1 Score: %f\n" % metrics.f1_score(y_test, y_pred))
    file.write("\n")
    file.write("True Value, Predicted Value, Iteration\n")
    for n in xrange(len(y_test)):
        file.write("%f,%f,%i\n" % (y_test[n], y_pred[n], (n+1)))
    file.close()
    title = "Extremely Randomized Trees %f" % test_size
    save = Output + "Extremely_Randomized_Trees_confusion_matrix" + "_%s.png" % test_size
    plot_confusion_matrix(y_test, y_pred, title, save)
    lvltrace.lvltrace("LVLSortie dans extratreeclassifier split_test")
def gaussianNB(input_file, Output, test_size):
    lvltrace.lvltrace("LVLEntree dans gaussianNB split_test")
    ncol = tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1))
    X = data[:, 1:]
    y = data[:, 0]
    n_samples, n_features = X.shape
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)
    print X_train.shape, X_test.shape
    # Instantiate the estimator
    clf = GaussianNB()
    # Fit the estimator to the data
    clf.fit(X_train, y_train)
    # Use the model to predict the last several labels
    y_pred = clf.predict(X_test)
    print "Gaussian Naive Bayes estimator accuracy "
    print "classification accuracy:", metrics.accuracy_score(y_test, y_pred)
    print "precision:", metrics.precision_score(y_test, y_pred)
    print "recall:", metrics.recall_score(y_test, y_pred)
    print "f1 score:", metrics.f1_score(y_test, y_pred)
    results = Output + "GaussianNB_metrics_test.txt"
    file = open(results, "w")
    file.write("Gaussian Naive Bayes estimator accuracy\n")
    file.write("Classification Accuracy Score: %f\n" % metrics.accuracy_score(y_test, y_pred))
    file.write("Precision Score: %f\n" % metrics.precision_score(y_test, y_pred))
    file.write("Recall Score: %f\n" % metrics.recall_score(y_test, y_pred))
    file.write("F1 Score: %f\n" % metrics.f1_score(y_test, y_pred))
    file.write("True Value, Predicted Value, Iteration\n")
    for n in xrange(len(y_test)):
        file.write("%f,%f,%i\n" % (y_test[n], y_pred[n], (n+1)))
    file.close()
    title = "Gaussian Naive Bayes %f" % test_size
    save = Output + "Gaussian_NB_confusion_matrix" + "_%s.png" % test_size
    plot_confusion_matrix(y_test, y_pred, title, save)
    lvltrace.lvltrace("LVLsortie dans gaussianNB split_test")
def main():
    resize_shape = 64
    print "data is loading..."
    train_X, train_Y, test_X, test_Y = load_data(resize_shape)
    print "data is loaded"
    print "feature engineering..."
    learning_rate = 0.01
    training_iters = 100000
    batch_size = 128
    display_step = 10

    # Network Parameters
    n_input = resize_shape*resize_shape  # data input (img shape: 64*64)
    n_classes = 62  # total number of classes
    dropout = 0.5  # Dropout, probability to keep units

    with tf.Session() as sess:
        cnn = CNN(sess, learning_rate, training_iters, batch_size, display_step,
                  n_input, n_classes, dropout, resize_shape)
        train_X = cnn.inference(train_X)
        test_X = cnn.inference(test_X)
        print "feature engineering is complete"

        print 'training phase'
        clf = svm.LinearSVC().fit(train_X, train_Y)
        print 'test phase'
        predicts = clf.predict(test_X)

        # measure function
        print 'measure phase'
        print confusion_matrix(test_Y, predicts)
        print f1_score(test_Y, predicts, average=None)
        print precision_score(test_Y, predicts, average=None)
        print recall_score(test_Y, predicts, average=None)
        print accuracy_score(test_Y, predicts)
def applyClassifier(self, clf, name, training_set, testing_set, y_train, y_test):
    print("\nMODEL " + name)
    t0 = time()
    classifier = clf.fit(training_set, y_train)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)
    t0 = time()
    y_nb_predicted = classifier.predict(testing_set)
    test_time = time() - t0
    print("test time: %0.3fs" % test_time)
    precision = metrics.precision_score(y_test, y_nb_predicted)
    recall = metrics.recall_score(y_test, y_nb_predicted)
    f1_score = metrics.f1_score(y_test, y_nb_predicted)
    accuracy = metrics.accuracy_score(y_test, y_nb_predicted)
    micro_recall = metrics.recall_score(y_test, y_nb_predicted, average="micro")
    macro_recall = metrics.recall_score(y_test, y_nb_predicted, average="macro")
    micro_precision = metrics.precision_score(y_test, y_nb_predicted, average="micro")
    macro_precision = metrics.precision_score(y_test, y_nb_predicted, average="macro")
    print 'The precision for this classifier is ' + str(precision)
    print 'The micro averaged precision for this classifier is ' + str(micro_precision)
    print 'The macro averaged precision for this classifier is ' + str(macro_precision)
    print 'The recall for this classifier is ' + str(recall)
    print 'The micro averaged recall for this classifier is ' + str(micro_recall)
    print 'The macro averaged recall for this classifier is ' + str(macro_recall)
    print 'The f1 for this classifier is ' + str(f1_score)
    print 'The accuracy for this classifier is ' + str(accuracy)
    return name, accuracy, precision, recall, micro_precision, micro_recall, macro_precision, macro_recall, train_time, test_time
def cross_val(data_x, data_y, classifier, kFold, b_cost=1, h_cost=1, w=0.5):
    e_h, e_b = 0, 0
    y_tests, pred_probas = [], []
    for train_index, test_index in kFold:
        data_x_, data_y_ = np.array(data_x), np.array(data_y)
        X_train, X_test = list(data_x_[train_index]), list(data_x_[test_index])
        y_train, y_test = list(data_y_[train_index]), list(data_y_[test_index])
        classifier.fit(X_train, y_train)
        pred_proba = [r[0] for r in classifier.predict_proba(X_test)]
        y_tests += y_test
        pred_probas += pred_proba

    predictions = [0 if p*b_cost > (1-p)*h_cost else 1 for p in pred_probas]
    roc_auc = roc_auc_score(y_tests, pred_probas)
    total_acc = accuracy_score(y_tests, predictions)
    precision, recall, thresholds = precision_recall_curve(y_tests, pred_probas, pos_label=0)
    fpr, tpr, thresholds = roc_curve(y_tests, pred_probas, pos_label=0)
    precision_bots = precision_score(y_tests, predictions, pos_label=0)
    precision_humans = precision_score(y_tests, predictions, pos_label=1)
    recall_bots = recall_score(y_tests, predictions, pos_label=0)
    recall_humans = recall_score(y_tests, predictions, pos_label=1)
    f1_bots = f1_score(y_tests, predictions, pos_label=0)
    f1_humans = f1_score(y_tests, predictions, pos_label=1)
    conf_matrix = np.matrix(list(confusion_matrix(y_tests, predictions)))

    # plot_curve(fpr, tpr, 'ROC', w)
    # plot_curve(recall, precision, 'PR', w)

    return [total_acc, precision_bots, precision_humans, recall_bots, recall_humans,
            f1_bots, f1_humans, roc_auc, conf_matrix]
def score(y_true, y_pred):
    precision_weighted = metrics.precision_score(y_true, y_pred, average='weighted')
    precision_ave = np.mean(metrics.precision_score(y_true, y_pred, average=None)[::12])
    recall_weighted = metrics.recall_score(y_true, y_pred, average='weighted')
    recall_ave = np.mean(metrics.recall_score(y_true, y_pred, average=None)[::12])
    f1_weighted = metrics.f1_score(y_true, y_pred, average='weighted')
    f1_ave = np.mean(metrics.f1_score(y_true, y_pred, average=None)[::12])
    stat_line = " Precision: %0.4f\t Recall: %0.4f\tf1: %0.4f"
    res1 = "Weighted: " + stat_line % (100*precision_weighted, 100*recall_weighted, 100*f1_weighted)
    res2 = "Averaged: " + stat_line % (100*precision_ave, 100*recall_ave, 100*f1_ave)
    res3 = "-"*72
    outputs = [res3, res1, res2, res3]
    return "\n".join(outputs)
def confusion_matrix(true_y, pred_y, labels):
    c_matrix = metrics.confusion_matrix(true_y, pred_y)
    confusion_table = []
    first_row = ["C.Matrix"] + labels + ["ACTUAL"] + ["RECALL"]
    confusion_table.append(first_row)

    recall = metrics.recall_score(true_y, pred_y, average=None)
    for r, row in enumerate(c_matrix):
        new_row = [labels[r]]
        new_row.extend(row)
        new_row.append(sum(row))
        new_row.append(recall[r])
        confusion_table.append(new_row)

    new_row = ["PREDICTED"]
    for l in labels:
        new_row.append(len([t for t in pred_y if t == l]))
    new_row.append(len(true_y))
    new_row.append(metrics.recall_score(true_y, pred_y, average='macro'))
    confusion_table.append(new_row)

    new_row = ["PRECISION"]
    new_row.extend(metrics.precision_score(true_y, pred_y, average=None))
    new_row.append(metrics.precision_score(true_y, pred_y, average='macro'))
    new_row.append(metrics.f1_score(true_y, pred_y, average='macro'))
    confusion_table.append(new_row)

    confusion_table = pd.DataFrame(confusion_table)
    return confusion_table
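# Illustrative usage sketch for the table builder above (the label values are made up and not
# part of the original code; `labels` must list the classes in the ascending order that
# metrics.confusion_matrix uses):
# table = confusion_matrix(y_true, y_hat, labels=[0, 1, 2])
# print(table.to_string(index=False, header=False))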
def _clf_mlp(trX, teX, trY, teY):
    print "MLP"
    print trX.shape, "trX shape"
    print "Enter Layer for MLP"
    layer = input()
    # print "enter delIdx"
    # delIdx=input()
    # while(delIdx):
    #     trX=np.delete(trX,-1,axis=0)
    #     trY=np.delete(trY,-1,axis=0)
    #     delIdx=delIdx-1
    print "factors", factors(trX.shape[0])
    teY = teY.astype(np.int32)
    trY = trY.astype(np.int32)
    print trX.shape, "trX shape"
    print "enter no of mini batch"
    mini_batch = int(input())
    mlp = TfMultiLayerPerceptron(eta=0.01,
                                 epochs=100,
                                 hidden_layers=layer,
                                 activations=['relu' for i in range(len(layer))],
                                 print_progress=3,
                                 minibatches=mini_batch,
                                 optimizer='adam',
                                 random_seed=1)
    mlp.fit(trX, trY)
    pred = mlp.predict(teX)
    print _f_count(teY), "test f count"
    pred = pred.astype(np.int32)
    print _f_count(pred), "pred f count"
    conf_mat = confusion_matrix(teY, pred)
    process_cm(conf_mat, to_print=True)
    print precision_score(teY, pred), "Precision Score"
    print recall_score(teY, pred), "Recall Score"
    print roc_auc_score(teY, pred), "ROC_AUC"
def get_scores(self, **kwargs):
    """
    Calculate scores.
    """
    data = kwargs['data']
    true_values = np.array(data['targets'])
    predicted_values = kwargs['predicted_values']
    le = kwargs["data_args"]["LabelEncoder"]
    out_args = {}
    scores = []

    sc = accuracy_score(true_values, predicted_values)
    score = {}
    score['name'] = 'Accuracy'
    score['value'] = sc
    scores.append(score)

    sc = f1_score(true_values, predicted_values, average='weighted')
    score_by_class = f1_score(true_values, predicted_values, average=None)
    score = {}
    score['name'] = 'F1 score'
    score['summary_name'] = 'Weighted average F1 score'
    score['summary_value'] = sc
    score['class_wise'] = {}
    score['class_wise']['names'] = list(le.classes_)
    score['class_wise']['values'] = list(score_by_class)
    scores.append(score)

    sc = precision_score(true_values, predicted_values, average='weighted')
    score_by_class = precision_score(true_values, predicted_values, average=None)
    score = {}
    score['name'] = 'Precision'
    score['summary_name'] = 'Weighted average precision score'
    score['summary_value'] = sc
    score['class_wise'] = {}
    score['class_wise']['names'] = list(le.classes_)
    score['class_wise']['values'] = list(score_by_class)
    scores.append(score)

    sc = recall_score(true_values, predicted_values, average='weighted')
    score_by_class = recall_score(true_values, predicted_values, average=None)
    score = {}
    score['name'] = 'Recall'
    score['summary_name'] = 'Weighted average recall score'
    score['summary_value'] = sc
    score['class_wise'] = {}
    score['class_wise']['names'] = list(le.classes_)
    score['class_wise']['values'] = list(score_by_class)
    scores.append(score)

    scores_out = {}
    scores_out["scores"] = scores
    scores_out["schema_version"] = "0.02"
    return scores_out, out_args
def gradient_boosting(X, y, nf=2, lr=.1, ne=100):
    col_names = X.columns
    y = y.astype(float)
    Xs = X.astype(float)
    Xs_t, Xs_holdout, y_t, y_holdout = train_test_split(Xs, y, train_size=.8)
    Xs_t = Xs_t.set_index([range(len(Xs_t))])
    Xs_holdout = Xs_holdout.set_index([range(len(Xs_holdout))])
    y_t = pd.DataFrame(y_t).set_index([range(len(y_t))])
    y_holdout = pd.DataFrame(y_holdout).set_index([range(len(y_holdout))])
    kf = KFold(len(Xs_t), nf)
    output_table = []
    precisions = []
    accuracies = []
    F1s = []
    fold_count = 1
    for train_index, test_index in kf:
        results = []
        Xs_train, Xs_test = Xs_t.iloc[train_index, :], Xs_t.iloc[test_index, :]
        y_train, y_test = y_t.iloc[train_index, :], y_t.iloc[test_index, :]
        y_train = np.array(y_train)
        y_test = np.array(y_test)
        Gboost = GradientBoostingRegressor(learning_rate=lr, loss='ls', n_estimators=ne)
        Gboost.fit(Xs_train, y_train)
        pred = Gboost.predict(Xs_test)
        pred = np.array(pred)
        pred = pred.round()
        output_table.append(' ')
        output_table.append("Fold " + str(fold_count) + ':')
        output_table.append("Precision Score: " + str(precision_score(y_test, pred)))
        output_table.append("Accuracy Score: " + str(accuracy_score(y_test, pred)))
        output_table.append("F1 Score: " + str(f1_score(y_test, pred)))
        precisions.append(precision_score(y_test, pred))
        accuracies.append(accuracy_score(y_test, pred))
        F1s.append(f1_score(y_test, pred))
        fold_count += 1
    pred_holdout = Gboost.predict(Xs_holdout)
    pred_holdout = np.array(pred_holdout)
    pred_holdout = pred_holdout.round()
    cm = confusion_matrix(y_holdout, pred_holdout)
    TN = cm[0][0]
    FP = cm[0][1]
    FN = cm[1][0]
    TP = cm[1][1]
    print "Mean Precision: ", np.mean(precisions)
    print "Mean F1s: ", np.mean(F1s)
    print "True Positive Rate (Sensitivity): ", TP*1./(TP+FN)  # cm[1][1]*1./(cm[1][1]+cm[1][0])
    print "True Negative Rate (Specificity): ", TN*1./(TN+FP)  # cm[0][0]*1./(cm[0][0]+cm[0][1])
    print "Precision: ", TP*1./(TP+FP),  # precision_score(y_holdout, pred_holdout)
    print "Accuracy: ", (TP+TN)*1./(TP+TN+FP+FN),  # accuracy_score(y_holdout, pred_holdout)
    indices = np.argsort(Gboost.feature_importances_)
    figure = plt.figure(figsize=(10, 7))
    plt.barh(np.arange(len(col_names)), Gboost.feature_importances_[indices], align='center', alpha=.5)
    plt.yticks(np.arange(len(col_names)), np.array(col_names)[indices], fontsize=14)
    plt.xticks(fontsize=14)
    _ = plt.xlabel('Relative importance', fontsize=18)
    return Gboost
def DTree(X, Y, XTest, YTest):
    print '-----------------------------------------------------'
    # dot_data = StringIO()
    # tree.export_graphviz(dtree_model, out_file=dot_data)
    # graph = pydot.graph_from_dot_data(dot_data.getvalue())
    # graph.write_pdf("../dtree.pdf")
    # param_grid = {'max_depth': np.arange(1, 15)}
    # tree_grid = GridSearchCV(DecisionTreeClassifier(), param_grid)
    tree_grid = DecisionTreeClassifier(max_depth=3)
    tree_grid.fit(X, Y)
    dot_data = StringIO()  # buffer needed by export_graphviz below
    export_graphviz(tree_grid, out_file=dot_data)
    graph = pydot.graph_from_dot_data(dot_data.getvalue())
    graph.write_pdf("dtreevis.pdf")
    # print("The best parameters are %s with a score of %0.2f"
    #       % (tree_grid.best_params_, tree_grid.best_score_))

    print "Computing training statistics"
    dtree_predict_time_training = time.time()
    Ypred_dtree_training = tree_grid.predict(X)
    dtree_predict_time_training = time.time() - dtree_predict_time_training
    dtree_accuracy_training = metrics.accuracy_score(Y, Ypred_dtree_training)
    dt_precision_training = metrics.precision_score(Y, Ypred_dtree_training, average='binary')
    dtree_recall_training = metrics.recall_score(Y, Ypred_dtree_training, average='binary')
    print "DT training prediction time: " + str(dtree_predict_time_training)
    print "DT training accuracy Score: " + str(dtree_accuracy_training)
    print "DT training precision Score: " + str(dt_precision_training)
    print "DT training recall Score: " + str(dtree_recall_training)

    print "Computing testing statistics"
    dtree_predict_time_test = time.time()
    Ypred_dtree_test = tree_grid.predict(XTest)
    dtree_predict_time_test = time.time() - dtree_predict_time_test
    dtree_accuracy_test = metrics.accuracy_score(YTest, Ypred_dtree_test)
    dt_precision_test = metrics.precision_score(YTest, Ypred_dtree_test, average='binary')
    dtree_recall_test = metrics.recall_score(YTest, Ypred_dtree_test, average='binary')
    print "DT test prediction time: " + str(dtree_predict_time_test)
    print "DT test accuracy Score: " + str(dtree_accuracy_test)
    print "DT test precision Score: " + str(dt_precision_test)
    print "DT test recall Score: " + str(dtree_recall_test)

    print "Creating ROC curve"
    y_true = YTest
    y_score = tree_grid.predict_proba(XTest)
    fprSVM, trpSVM, _ = metrics.roc_curve(y_true=y_true, y_score=y_score[:, 0], pos_label=0)
    plt.plot(fprSVM, trpSVM, 'r-', label='DT')
def cv(self, X, y, eval_size=.33, nfold=3):
    metrics = ['roc_auc', 'f1', 'recall', 'precision']
    Xtrain, Xeval, ytrain, yeval = train_test_split(X, y, test_size=eval_size)
    Xtrain.reset_index(drop=True)
    Xeval.reset_index(drop=True)
    ytrain.reset_index(drop=True)
    yeval.reset_index(drop=True)
    self.fit(Xtrain, ytrain)
    ypred = self.predict(Xeval)
    yprob = self.predict_proba(Xeval)
    eroc = roc_auc_score(yeval, yprob)
    ef1 = f1_score(yeval, ypred)
    erecall = recall_score(yeval, ypred)
    eprecision = precision_score(yeval, ypred)
    # print confusion_matrix(yeval, ypred, labels=[0,1])
    # eroc = roc_auc_score(yeval, yprob, sample_weight=sw)
    # ef1 = f1_score(yeval, ypred, sample_weight=sw)
    # erecall = recall_score(yeval, ypred, sample_weight=sw)
    # eprecision = precision_score(yeval, ypred, sample_weight=sw)
    escores = [eroc, ef1, erecall, eprecision]
    skfscores = []

    skf = StratifiedKFold(ytrain, n_folds=nfold, random_state=2016)
    for trainIndex, testIndex in skf:
        skfxtrain, skfxtest = X.loc[trainIndex, :], X.loc[testIndex, :]
        skfytrain, skfytest = y.values[trainIndex], y.values[testIndex]
        self.fit(skfxtrain, skfytrain)
        ypred = self.predict(skfxtest)
        yprob = self.predict_proba(skfxtest)
        # roc = roc_auc_score(skfytest, yprob, sample_weight=sw)
        # f1 = f1_score(skfytest, ypred, sample_weight=sw)
        # recall = recall_score(skfytest, ypred, sample_weight=sw)
        # precision = precision_score(skfytest, ypred, sample_weight=sw)
        roc = roc_auc_score(skfytest, yprob)
        f1 = f1_score(skfytest, ypred)
        recall = recall_score(skfytest, ypred)
        precision = precision_score(skfytest, ypred)
        # print confusion_matrix(skfytest, ypred, labels=[0,1])
        scores = [roc, f1, recall, precision]
        print 'cv scores:'
        print scores
        skfscores.append(scores)

    skfscores = np.array(skfscores)
    skfscores = skfscores.mean(0)

    report = pd.DataFrame({'eval': escores, 'train': skfscores}, index=metrics)
    return report
def print_scores(model, X_train, y_train, X_test, y_test):
    """
    Compute scores for given model with training and test sets

    Input:
        model (sklearn.linear_model): the model with which to calculate scores
        X_train (numpy_array): training design matrix X
        y_train (numpy_array): training labels y
        X_test (numpy_array): test design matrix X
        y_test (numpy_array): test labels y
    Output:
        F1-score in test set
    Side Effects:
        prints the scores
    Comments:
        model must be fitted before calling this function
    """
    y_train_predicted = model.predict(X_train)
    y_test_predicted = model.predict(X_test)

    # accuracy scores
    print("Accuracy")
    print("Train: ", model.score(X_train, y_train))
    print("Test: ", model.score(X_test, y_test))
    print("\n")

    # use precision and recall metrics
    from sklearn.metrics import precision_score, recall_score
    precision_train = precision_score(y_train, y_train_predicted)
    recall_train = recall_score(y_train, y_train_predicted)
    precision_test = precision_score(y_test, y_test_predicted)
    recall_test = recall_score(y_test, y_test_predicted)
    print("Precision and Recall")
    print("Train: ", precision_train, recall_train)
    print("Test: ", precision_test, recall_test)
    print("\n")

    # F1 score
    from tilestools import F1score
    f1_train = F1score(y_train, y_train_predicted)
    f1_test = F1score(y_test, y_test_predicted)
    print("F1 score")
    print("Train: ", f1_train)
    print("Test: ", f1_test)

    return f1_test
def helpfulPrediction(y_actual, X, grid_search_best, model_name):
    # use grid_search.best_estimator_ (the best parameters found) to predict
    X_train, X_test, y_actual_train, y_actual_test = train_test_split(X, y_actual, test_size=0.15, random_state=0)
    y_true, y_pred = y_actual_test['is_helpful'].values, grid_search_best.predict(X_test.iloc[:, 0:len(X.columns)-2])
    print confusion_matrix(y_true, y_pred)
    print "Training precision score:" + str(precision_score(y_actual_train,
                                                             grid_search_best.predict(X_train.iloc[:, 0:len(X.columns)-2])))
    print "Testing precision score:" + str(precision_score(y_true, y_pred))
    pd.DataFrame({'comment_id': X_test['id_x'].values,
                  'users_count': X_test['users_count'].values,
                  'comments_count': X_test['comments_count'].values,
                  'contains_image': X_test['contains_image'].values,
                  'thanking_in_reply': X_test['gratitude_in_reply'].values,
                  'is_response_to_question': X_test['is_response_to_question'].values,
                  'is_response_to_image': X_test['is_response_to_image'].values,
                  'contains_link': X_test['contains_link'].values,
                  'contains_question': X_test['contains_question'].values,
                  'contains_hashtag': X_test['contains_hashtag'].values,
                  'comment_order': X_test['comment_order'].values,
                  'word_count': X_test['word_count'].values,
                  'polarity': X_test['polarity'].values,
                  'subjectivity': X_test['subjectivity'].values,
                  'body': X_test['body'].values,
                  'y_true': y_true,
                  'y_pred': y_pred}).to_csv(model_name + "_pred_true.csv", index=False)
def test_precision_score(self):
    result = self.df.metrics.precision_score(average='weighted')
    expected = metrics.precision_score(self.target, self.pred, average='weighted')
    self.assertEqual(result, expected)

    result = self.df.metrics.precision_score(average=None)
    expected = metrics.precision_score(self.target, self.pred, average=None)
    self.assertTrue(isinstance(result, pdml.ModelSeries))
    self.assert_numpy_array_almost_equal(result.values, expected)
                          num_round,
                          valid_sets=[trn_data, val_data],
                          verbose_eval=500,
                          early_stopping_rounds=200,
                          categorical_feature=cat_cols,
                          )
    feat_imp_df['imp'] += clf.feature_importance() / 5
    oof_lgb[val_idx] = clf.predict(X_train.iloc[val_idx][features], num_iteration=clf.best_iteration)
    predictions_lgb[:] += clf.predict(X_test[features], num_iteration=clf.best_iteration)

print("AUC score: {}".format(roc_auc_score(y, oof_lgb)))
print("F1 score: {}".format(
    f1_score(y, [1 if i >= 0.5 else 0 for i in oof_lgb])))
print("Precision score: {}".format(
    precision_score(y, [1 if i >= 0.5 else 0 for i in oof_lgb])))
print("Recall score: {}".format(
    recall_score(y, [1 if i >= 0.5 else 0 for i in oof_lgb])))

drop_f = feat_imp_df[feat_imp_df['imp'] < 0.2]['feat']
for f in drop_f:
    if f in cat_cols:
        cat_cols.remove(f)
features = features.drop(drop_f)
print("The final feature set has {} dimensions".format(len(features)))

print("Starting training and prediction for model 1")
# Model training and prediction
KF = StratifiedKFold(n_splits=5, random_state=2020, shuffle=True)
X_train[cat_cols] = X_train[cat_cols].astype('category')
# As the output shows, the model used (Random Forest) has an average accuracy of 94% with a standard
# deviation of 0.06%, which tells us how precise the estimates are. This means that the accuracy of our
# model can vary by +/- 0.06%. So, after this check, the accuracy is still good, and in the later stages
# we will try to improve the Random Forest's performance further.

# Confusion matrix for the Random Forest
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
predictions = cross_val_predict(random_forest, X_train, Y_train, cv=3)
confusion_matrix(Y_train, predictions)

# PRECISION AND RECALL (for the Random Forest)
from sklearn.metrics import precision_score, recall_score
print("Precision: ", precision_score(Y_train, predictions))
print("Recall: ", recall_score(Y_train, predictions))

"""
# Now CROSS VALIDATION for the Decision Tree
from sklearn.model_selection import cross_val_score
rf = DecisionTreeClassifier()
scores = cross_val_score(rf, X_train, Y_train, cv=10, scoring="accuracy")
print("Scores: ", scores)
print("Mean: ", scores.mean())
print("Standard deviation: ", scores.std())

# We get roughly the same results as the Random Forest for the standard deviation (0.04%), while the
# average accuracy is 96%, so it is much more accurate than the Random Forest.
sorted_positive_features = np.argsort(positive_features)[::-1]

print("Top 20 Important Features and their log probabilities For Negative Class :\n\n")
for i in list(sorted_negative_features[0:20]):
    print("%s\t -->\t%f " % (feature_names[i], negative_features[i]))

print("\n\nTop 20 Important Features and their log probabilities For Positive Class :\n\n")
for i in list(sorted_positive_features[0:20]):
    print("%s\t -->\t%f " % (feature_names[i], positive_features[i]))

# evaluate accuracy
acc = accuracy_score(Y_test, predictions) * 100
print('\nThe Test Accuracy of the Bernoulli naive Bayes classifier for alpha = %.3f is %f%%' % (optimal_alpha, acc))

# evaluate precision
acc = precision_score(Y_test, predictions, pos_label='positive')
print('\nThe Test Precision of the Bernoulli naive Bayes classifier for alpha = %.3f is %f' % (optimal_alpha, acc))

# evaluate recall
acc = recall_score(Y_test, predictions, pos_label='positive')
print('\nThe Test Recall of the Bernoulli naive Bayes classifier for alpha = %.3f is %f' % (optimal_alpha, acc))

# evaluate f1-score
acc = f1_score(Y_test, predictions, pos_label='positive')
print('\nThe Test F1-Score of the Bernoulli naive Bayes classifier for alpha = %.3f is %f' % (optimal_alpha, acc))

# Evaluate TPR, FPR, TNR, FNR
# Note: sklearn's confusion_matrix(...).ravel() returns (tn, fp, fn, tp) for binary labels
TrueNeg, FalsePos, FalseNeg, TruePos = confusion_matrix(Y_test, predictions).ravel()

# Evaluate TPR (TPR = TP/(FN+TP))
TPR = TruePos/(FalseNeg + TruePos)
res = confusion_matrix(y_train_5, y_train_pred)
print(res)
'''
# p.87
print(res)
[[53124  1455]   # images other than 5
 [  949  4472]]  # images of 5

[[TN, FP]
 [FN, TP]]
'''

# Precision and recall rate (p.88)
# F-value: the harmonic mean of precision and recall
from sklearn.metrics import precision_score, recall_score, f1_score
precision_rate = precision_score(y_train_5, y_train_pred)
recall_rate = recall_score(y_train_5, y_train_pred)
f_value = f1_score(y_train_5, y_train_pred)
# precision: 0.754513244474439  recall: 0.8249400479616307  f-value: 0.7881565033486078
print('precision:', precision_rate, ' recall:', recall_rate, ' f-value:', f_value)

# Pick a decision threshold to reach the precision (or recall) the project needs
# (precision and recall trade off against each other).
# For example, if you want to raise precision:
# y_score = cross_val_predict(sgd_clf, X_train, y_train_5, cv=3, method="decision_function")  # decision threshold
'''
y_score_Threshold = 10000
y_train_pred_90 = (y_score > y_score_Threshold)
print(y_score, max(y_score), min(y_score))
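# (Hedged, illustrative sketch -- not part of the original notebook.) Assuming y_score has been
# computed with cross_val_predict(..., method="decision_function") as in the commented line above,
# a threshold hitting roughly 90% precision could be located with precision_recall_curve:
# from sklearn.metrics import precision_recall_curve
# precisions, recalls, thresholds = precision_recall_curve(y_train_5, y_score)
# threshold_90_precision = thresholds[(precisions[:-1] >= 0.90).argmax()]
# y_train_pred_90 = (y_score >= threshold_90_precision)
# print('precision:', precision_score(y_train_5, y_train_pred_90),
#       ' recall:', recall_score(y_train_5, y_train_pred_90))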
def modelselection(X, y, modelnames, featureExtractionVector):
    dictionarylist = {}
    accuracyscores = []
    precisionscores = []
    recallscores = []
    TP = []
    FP = []
    TN = []
    FN = []
    f1scores = []
    for modelname in modelnames:
        if modelname == 'Naive Bayes':
            model = naive_bayes.MultinomialNB()
        if modelname == 'Logistic Regression':
            model = LogisticRegression(C=1., solver='lbfgs')
        if modelname == 'SVM':
            model = svm.LinearSVC()
        if modelname == 'K-NN':
            model = KNeighborsClassifier()
        if modelname == 'AdaBoost':
            model = AdaBoostClassifier()
        if modelname == 'Random Forest Classifier':
            model = RandomForestClassifier(n_estimators=100)
        if modelname == 'Gradient Boosting Classifier':
            model = GradientBoostingClassifier()
        kf = KFold(n_splits=10)
        for train_index, test_index in kf.split(X, y):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            model.fit(X_train, y_train)
            predict = model.predict(X_test)
            accuracyscore = accuracy_score(y_test, predict)
            accuracyscores.append(accuracyscore)
            precisionscores.append(precision_score(y_test, predict))
            recallscores.append(recall_score(y_test, predict))
            TP.append(confusion_matrix(y_test, predict)[1][1])
            FP.append(confusion_matrix(y_test, predict)[0][1])
            TN.append(confusion_matrix(y_test, predict)[0][0])
            FN.append(confusion_matrix(y_test, predict)[1][0])
            f1score = f1_score(y_test, predict)
            f1scores.append(f1score)
            if modelname != 'SVM':
                y_pred_prob = model.predict_proba(X_test)[:, 1]
                fpr, tpr, threshold = roc_curve(y_test, y_pred_prob)
                roc_auc = roc_auc_score(y_test, y_pred_prob)
                # plot roc curves
                plotroccurves(modelname, y_pred_prob, fpr, tpr, roc_auc, predict)
        # Convert the lists to arrays and store the averages
        accuracyscores = np.asarray(accuracyscores)
        TP = np.asarray(TP)
        FP = np.asarray(FP)
        TN = np.asarray(TN)
        FN = np.asarray(FN)
        f1scores = np.asarray(f1scores)
        precisionscores = np.asarray(precisionscores)
        recallscores = np.asarray(recallscores)
        print(f"The average accuracy score for training dataset length: {len(train_index)} for {modelname}:")
        print("%0.6f (+/- %0.6f)" % (accuracyscores.mean(), accuracyscores.std() * 2))
        # accuracyscores = []
        print('TP = ', int(TP.mean()))
        print('FP = ', int(FP.mean()))
        print('TN = ', int(TN.mean()))
        print('FN = ', int(FN.mean()))
        print('F1 Score =', f1scores.mean())
        print('The precision score = ', precisionscores.mean())
        print('The recall score = ', recallscores.mean())
        dictionary = ({modelname: {'Accuracy score': accuracyscores.mean(),
                                   'f1score': f1scores.mean(),
                                   'precision': precisionscores.mean(),
                                   'recall': recallscores.mean(),
                                   'true positive': int(TP.mean()),
                                   'true negative': int(TN.mean()),
                                   'false positive': int(FP.mean()),
                                   'false negative': int(FN.mean())}})
        dictionarylist.update(dictionary)
        accuracyscores = []
        TP = []
        FP = []
        TN = []
        FN = []
        f1scores = []
        precisionscores = []
        recallscores = []

    # The code below is used to enter a query to check the sentiment. Query is a sentence.
    '''
    if modelname == 'Naive Bayes':
        model = MultinomialNB()
        model.fit(X, y)
        while(1):
            inputtext = input('Enter a string; "exit" to exit.\n')
            if inputtext == 'exit':
                exit(0)
            inputtext = preprocessdata(inputtext)
            inputtext = stem_sentences(inputtext)
            inputtext = [inputtext]
            inputvector = featureExtractionTest(inputtext, featureExtractionVector)
            predict = model.predict(inputvector)
            # Converting input string to Array for the vectorizer.
            # for i in range(len(inputtext)):
            #     inputtext[i] = preprocessdata(inputtext[i])
            #     inputtext[i] = stem_sentences(inputtext[i])
            #     inputvector[i] = featureExtractionTest(inputtext[i], featureExtractionVector)
            #     predict = model.predict(inputvector[i])
            #     # inputvector.clear()
            #     print(predict)
            if predict == 1:
                print('The sentiment score is positive')
            elif predict == 0:
                print('The sentiment score is negative')
            else:
                print('Error: Predict is: ', predict)
    '''
    return dictionarylist
class_names = [0, 1]  # name of classes
fig, ax = plt.subplots()
tick_marks = np.arange(len(class_names))
plt.xticks(tick_marks, class_names)
plt.yticks(tick_marks, class_names)

# create heatmap
sns.heatmap(pd.DataFrame(cnf_matrix), annot=True, cmap="YlGnBu", fmt='g')
ax.xaxis.set_label_position("top")
plt.tight_layout()
plt.title('Confusion matrix', y=1.1)
plt.ylabel('Actual label')
plt.xlabel('Predicted label')

print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
print("Precision:", metrics.precision_score(y_test, y_pred))
print("Recall:", metrics.recall_score(y_test, y_pred))

############################################################################################################################
# Regression plot

y_pred_proba = logreg.predict_proba(X_test)[::, 1]
fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_proba)
def crossvalidation(filepath):
    raw_df = pd.read_csv(filepath)
    Options_df = make_clean_Options_df(raw_df)
    print(Options_df.head())
    y = Options_df.pop('Options').values
    X = Options_df.values
    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.33)
    # After the split
    # X_train, y_train = oversample0(X_train, y_train)
    kfold = StratifiedKFold(5, shuffle=False)
    accuracies = []
    precisions = []
    recalls = []
    sumofy = []
    sumofy2 = []
    F1s = []
    for train_index, test_index in kfold.split(X_train, y_train):
        model = LR(solver='liblinear')
        model.fit(X_train[train_index], y_train[train_index])
        # y_predict = model.predict(X_train[test_index])
        y_proba = model.predict_proba(X_train[test_index])[:, 1]
        # Above, the model uses predict_proba, which returns the 'probability' of a 1.
        # The code below applies a different threshold to that probability to get the actual prediction.
        y_predict = np.array([(lambda z: 1 if z > 0.45177236 else 0)(z) for z in y_proba])
        y_true = y_train[test_index]
        # print(y_proba)
        # print('Predict', y_predict)
        # print('True   ', y_true)
        accuracies.append(accuracy_score(y_true, y_predict))
        precisions.append(precision_score(y_true, y_predict))
        recalls.append(recall_score(y_true, y_predict))
        sumofy.append(len(y_predict) - y_predict.sum())
        sumofy2.append(len(y_true) - y_true.sum())
        F1s.append(f1_score(y_true, y_predict))
    # print ("accuracy:", np.average(accuracies))
    # print ("precision:", np.average(precisions))
    # print ("recall:", np.average(recalls))
    # print ("sumofyPridict:", np.average(sumofy))
    # print ("sumofyTrue:", np.average(sumofy2))
    # print ('F1', np.average(F1s))

    y_proba = model.predict_proba(X_train)[:, 1]
    y_predict = np.array([(lambda z: 1 if z > 0.6443742 else 0)(z) for z in y_proba])
    # print(y_train, y_predict)
    plotROC(y_train, y_proba, "images/plotROC_Training.png", "ROC Curve Training")

    y_proba = model.predict_proba(X_test)[:, 1]
    y_predict = np.array([(lambda z: 1 if z > 0.6443742 else 0)(z) for z in y_proba])
    print("__________________test")
    # print(y_proba, y_test, y_predict)
    print("Threshold 0.6443742")
    plotROC(y_test, y_proba, 'images/ROC_Test.png', "ROC Curve Test")
    print('Test accuracy_score', accuracy_score(y_test, y_predict))
    print('Test precision_score', precision_score(y_test, y_predict))
    print('Test recall_score', recall_score(y_test, y_predict))
    print('Test f1_score', f1_score(y_test, y_predict))

    print('---------------------summary')
    print(raw_df.Options.value_counts())
    print('train sum', sum(y_train), 'train len', len(y_train))
    zipped = set(zip(y_proba, y_test, y_predict))
    cw = csv.writer(open("zipped.csv", 'w'))
    cw.writerow(zipped)

    y_proba = model.predict_proba(X_train)[:, 1]
    y_predict = np.array([(lambda z: 1 if z > 0.0 else 0)(z) for z in y_proba])
    print('------All ones')
    print(len(y_proba), len(y_train), len(y_predict))
    # print(y_proba, y_train, y_predict)
    # print("Threshold 0.6443742")
    plotROC(y_train, y_proba, 'images/plotAll1.png', 'ROC Curve All 1')
    print('All1 accuracy_score', accuracy_score(y_train, y_predict))
    print('All1 precision_score', precision_score(y_train, y_predict))
    print('All1 recall_score', recall_score(y_train, y_predict))
    print('All1 f1_score', f1_score(y_train, y_predict))
def fun(in_road):  # ok
    start = time.time()
    index = []
    # Get the number of columns in the csv file
    col_num = get_col.getCol(in_road)
    data_dimension = col_num - 1
    # Load the dataset
    dataset = loadtxt(in_road, delimiter=",", skiprows=1)
    print(type(dataset))
    # split data into x and y
    x = dataset[:, 0:data_dimension]  # x[:, m:n] takes columns m to n-1 of every row (left-inclusive, right-exclusive)
    y = dataset[:, data_dimension]
    random_s = [8, 20, 40, 100, 200, 1000]
    # Run several times with different random seeds, then vote on the selected features to keep reducing them
    for rs in random_s:
        # Split the dataset into training and test sets
        x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=rs)

        print("-----------------XGBoost-----------------")
        # Fit the XGBoost model
        model1 = XGBClassifier(
            learning_rate=0.1,
            n_estimators=1000,         # number of trees: build the model with 1000 trees
            max_depth=5,               # tree depth
            min_child_weight=1,        # minimum leaf weight
            gamma=0.,                  # coefficient on the number of leaves in the penalty term
            subsample=0.8,             # randomly sample 80% of the rows for each tree
            colsample_bytree=0.8,      # randomly sample 80% of the features for each tree
            objective='reg:logistic',  # loss function
            scale_pos_weight=1,        # handle class imbalance
            random_state=27            # random seed
        )
        model1.fit(x_train, y_train)
        # Rank the strongest features
        importance = model1.feature_importances_
        top = pd.Series(importance).sort_values(ascending=False)
        # Print the indices of the top features
        print(list(top.index)[:top_num])
        index.extend(list(top.index)[:top_num])
        # Predict on the test set
        y_pred = model1.predict(x_test)
        predictions = [round(value) for value in y_pred]
        accuracy = accuracy_score(y_test, predictions)
        print("Accuracy: %.2f%%" % (accuracy * 100.0))
        precision = precision_score(y_test, predictions)
        print("precision: %.2f%%" % (precision * 100.0))

        print("-----------------LightGBM-----------------")
        params = {
            'task': 'train',
            'boosting_type': 'gbdt',         # based on the GBDT algorithm
            'objective': 'binary',
            'metric': 'auc',                 # evaluation metric
            'max_bin': 255,                  # larger is more accurate but slower
            'learning_rate': 0.1,            # learning rate
            'num_leaves': 64,                # larger is more accurate but may overfit
            # 'max_depth': -1,               # limiting depth can prevent overfitting on small datasets; < 0 means no limit
            'feature_fraction': 0.8,         # prevents overfitting
            'bagging_freq': 5,               # prevents overfitting
            'bagging_fraction': 0.8,         # prevents overfitting
            'min_data_in_leaf': 10,          # prevents overfitting
            'min_sum_hessian_in_leaf': 3.0,  # prevents overfitting
            # 'header': True                 # whether the dataset has a header row
            'verbose': -1                    # silence the warning: No further splits with positive gain, best gain: -inf
        }
        lgb_train = lgb.Dataset(x_train, label=y_train)
        model2 = lgb.train(params, train_set=lgb_train)
        importance = model2.feature_importance()
        top = pd.Series(importance).sort_values(ascending=False)
        print(list(top.index)[:top_num])
        index.extend(list(top.index)[:top_num])
        y_pred = model2.predict(x_test)
        predictions = [round(value) for value in y_pred]
        accuracy = accuracy_score(y_test, predictions)
        print("Accuracy: %.2f%%" % (accuracy * 100.0))
        precision = precision_score(y_test, predictions)
        print("precision: %.2f%%" % (precision * 100.0))

        print("-----------------ExtraTrees (a variant of Random Forest)-----------------")
        model4 = ExtraTreesClassifier(n_estimators=10, max_depth=None, min_samples_split=2, random_state=0)
        model4.fit(x_train, y_train)
        importance = model4.feature_importances_
        top = pd.Series(importance).sort_values(ascending=False)
        print(list(top.index)[:top_num])
        index.extend(list(top.index)[:top_num])
        y_pred = model4.predict(x_test)
        predictions = [round(value) for value in y_pred]
        accuracy = accuracy_score(y_test, predictions)
        print("Accuracy: %.2f%%" % (accuracy * 100.0))
        precision = precision_score(y_test, predictions)
        print("precision: %.2f%%" % (precision * 100.0))

    end = time.time()
    running_time = end - start
    print('-----------time--------')
    print(running_time)
    print(index)
    # Sort the collected indices and keep the most frequent ones
    sort = get_count_by_counter(index)
    top_index = sort.most_common(top_num)
    return top_index
def scores(y_test, y_pred, th=0.5): y_predlabel = [(0. if item < th else 1.) for item in y_pred] tn, fp, fn, tp = confusion_matrix(y_test, y_predlabel).flatten() SPE = tn*1./(tn+fp) MCC = matthews_corrcoef(y_test, y_predlabel) fpr,tpr,threshold = roc_curve(y_test, y_predlabel) sen, spe, pre, f1, mcc, acc, auc, tn, fp, fn, tp = np.array([recall_score(y_test, y_predlabel), SPE, precision_score(y_test, y_predlabel,average='macro'), f1_score(y_test, y_predlabel), MCC, accuracy_score(y_test, y_predlabel), roc_auc_score(y_test, y_pred), tn, fp, fn, tp]) return sen, spe, pre, f1, mcc, acc,auc,tn,fp,fn,tp
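# Quick smoke test of the scores() helper above; the toy arrays are made up.
# It expects scores/probabilities in y_pred and thresholds them at th.
import numpy as np
y_true = np.array([0, 0, 1, 1, 1, 0])
y_prob = np.array([0.2, 0.6, 0.8, 0.4, 0.9, 0.1])
sen, spe, pre, f1, mcc, acc, auc, tn, fp, fn, tp = scores(y_true, y_prob, th=0.5)
print('SEN %.2f SPE %.2f ACC %.2f AUC %.2f' % (sen, spe, acc, auc))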
def run(): train_e = getParticleSet('/home/drozd/analysis/data_train_elecs.npy') train_p = getParticleSet('/home/drozd/analysis/data_train_prots.npy') train = np.concatenate((train_e, train_p)) np.random.shuffle(train) X_train = train[:, 0:-1] Y_train = train[:, -1] del train_e, train_p, train val_e = np.concatenate( (getParticleSet( '/home/drozd/analysis/fraction1/data_validate_elecs_1.npy'), getParticleSet('/home/drozd/analysis/fraction1/data_test_elecs_1.npy') )) val_p = np.concatenate( (getParticleSet( '/home/drozd/analysis/fraction1/data_validate_prots_1.npy'), getParticleSet('/home/drozd/analysis/fraction1/data_test_prots_1.npy') )) val = np.concatenate((val_e, val_p)) np.random.shuffle(val) X_val = val[:, 0:-1] Y_val = val[:, -1] val_imba = np.concatenate((val_e[0:int(val_p.shape[0] / 100)], val_p)) np.random.shuffle(val_imba) X_val_imba = val_imba[:, 0:-1] Y_val_imba = val_imba[:, -1] del val_e, val_p, val, val_imba model = Sequential() model.add( Dense(300, input_shape=(X_train.shape[1], ), kernel_initializer='he_uniform', activation='relu')) model.add(Dropout(0.1)) model.add(Dense(150, kernel_initializer='he_uniform', activation='relu')) model.add(Dropout(0.1)) model.add(Dense(70, kernel_initializer='he_uniform', activation='relu')) model.add(Dropout(0.1)) model.add(Dense(1, kernel_initializer='he_uniform', activation='sigmoid')) model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['binary_accuracy']) rdlronplt = ReduceLROnPlateau(monitor='loss', patience=3, min_lr=0.001) earl = EarlyStopping(monitor='loss', min_delta=0.0001, patience=5) callbacks = [rdlronplt, earl] history = model.fit(X_train, Y_train, batch_size=150, epochs=40, verbose=0, callbacks=callbacks, validation_data=(X_val, Y_val)) # -------------------------------- predictions_balanced = model.predict(X_val) predictions_imba = model.predict(X_val_imba) predictions_train = model.predict(X_train) del X_val, X_val_imba, X_train sk_l_precision_b, sk_l_recall_b, sk_l_thresholds_b = precision_recall_curve( Y_val, predictions_balanced) sk_l_precision_i, sk_l_recall_i, sk_l_thresholds_i = precision_recall_curve( Y_val_imba, predictions_imba) sk_l_precision_t, sk_l_recall_t, sk_l_thresholds_t = precision_recall_curve( Y_train, predictions_train) sk_l_fpr_b, sk_l_tpr_b, sk_l_roc_thresholds_b = roc_curve( Y_val, predictions_balanced) sk_l_fpr_i, sk_l_tpr_i, sk_l_roc_thresholds_i = roc_curve( Y_val_imba, predictions_imba) sk_l_fpr_t, sk_l_tpr_t, sk_l_roc_thresholds_t = roc_curve( Y_train, predictions_train) man_l_precision_b, man_l_recall_b, man_l_thresholds_b = getPR( Y_val, predictions_balanced, 100) man_l_precision_i, man_l_recall_i, man_l_thresholds_i = getPR( Y_val_imba, predictions_imba, 100) man_l_fpr_b, man_l_tpr_b, man_l_roc_thresholds_b = getROC( Y_val, predictions_balanced, 100) man_l_fpr_i, man_l_tpr_i, man_l_roc_thresholds_i = getROC( Y_val_imba, predictions_imba, 100) print("----- AUC -----") print("Train:", average_precision_score(Y_train, predictions_train)) print("Validate:", average_precision_score(Y_val, predictions_balanced)) print("----- F1 -----") print("Train:", f1_score(Y_train, np.around(predictions_train))) print("Validate:", f1_score(Y_val, np.around(predictions_balanced))) print("----- Precision/Recall -----") print("Train:", precision_score(Y_train, np.around(predictions_train)), " / ", recall_score(Y_train, np.around(predictions_train))) print("Validate:", precision_score(Y_val, np.around(predictions_balanced)), " / ", recall_score(Y_val, np.around(predictions_balanced))) fig1 = 
plt.figure() plt.plot(sk_l_precision_b, sk_l_recall_b, label='balanced') plt.plot(sk_l_precision_i, sk_l_recall_i, label='imbalanced') #~ plt.plot(sk_l_precision_t, sk_l_recall_t,label='training set') #~ plt.plot(man_l_precision_b, man_l_recall_b,'o',label='balanced, hand') #~ plt.plot(man_l_precision_i, man_l_recall_i,'o',label='imbalanced, hand') plt.xlabel('Precision') plt.ylabel('Recall') plt.legend(loc='best') plt.savefig('PR') fig1b = plt.figure() plt.plot(sk_l_precision_b, sk_l_recall_b, label='validation set') plt.plot(sk_l_precision_t, sk_l_recall_t, label='training set') plt.xlabel('Precision') plt.ylabel('Recall') plt.legend(loc='best') plt.savefig('PRb') fig2 = plt.figure() plt.plot(sk_l_fpr_b, sk_l_tpr_b, label='balanced') plt.plot(sk_l_fpr_i, sk_l_tpr_i, label='imbalanced') #~ plt.plot(man_l_fpr_b, man_l_tpr_b,'o',label='balanced, hand') #~ plt.plot(man_l_fpr_i, man_l_tpr_i,'o',label='imbalanced, hand') plt.xlabel('False Positive') plt.ylabel('True Positive') plt.legend(loc='best') plt.savefig('ROC') fig2b = plt.figure() plt.plot(sk_l_fpr_b, sk_l_tpr_b, label='validation set') plt.plot(sk_l_fpr_t, sk_l_tpr_t, label='training set') plt.xlabel('False Positive') plt.ylabel('True Positive') plt.legend(loc='best') plt.savefig('ROCb') Nbins = 50 binList = [x / Nbins for x in range(0, Nbins + 1)] elecs_t, prots_t = getClassifierScore(Y_train, predictions_train) fig3 = plt.figure() plt.hist(elecs_t, bins=binList, label='e', alpha=0.7, histtype='step', color='green') plt.hist(prots_t, bins=binList, label='p', alpha=0.7, histtype='step', color='red') plt.xlabel('Classifier score') plt.ylabel('Number of events') plt.title('Training set') plt.legend(loc='best') plt.yscale('log') plt.savefig('predHisto_train') fig3b = plt.figure() plt.hist(elecs_t, bins=binList, label='e', alpha=0.7, histtype='step', color='green', normed=True) plt.hist(prots_t, bins=binList, label='p', alpha=0.7, histtype='step', color='red', normed=True) plt.xlabel('Classifier score') plt.ylabel('Fraction of events') plt.title('Training set - normalised') plt.legend(loc='best') plt.yscale('log') plt.savefig('predHisto_train_n') del elecs_t, prots_t, Y_train, predictions_train elecs_b, prots_b = getClassifierScore(Y_val, predictions_balanced) fig4 = plt.figure() plt.hist(elecs_b, bins=binList, label='e', alpha=0.7, histtype='step', color='green') plt.hist(prots_b, bins=binList, label='p', alpha=0.7, histtype='step', color='red') plt.xlabel('Classifier score') plt.ylabel('Number of events') plt.title('Balanced validation set') plt.legend(loc='best') plt.yscale('log') plt.savefig('predHisto_bal') fig4b = plt.figure() plt.hist(elecs_b, bins=binList, label='e', alpha=0.7, histtype='step', color='green', normed=True) plt.hist(prots_b, bins=binList, label='p', alpha=0.7, histtype='step', color='red', normed=True) plt.xlabel('Classifier score') plt.ylabel('Fraction of events') plt.title('Balanced validation set - normalised') plt.legend(loc='best') plt.yscale('log') plt.savefig('predHisto_bal_n') del elecs_b, prots_b, Y_val, predictions_balanced elecs_i, prots_i = getClassifierScore(Y_val_imba, predictions_imba) fig5 = plt.figure() plt.hist(elecs_i, bins=binList, label='e', alpha=0.7, histtype='step', color='green') plt.hist(prots_i, bins=binList, label='p', alpha=0.7, histtype='step', color='red') plt.xlabel('Classifier score') plt.ylabel('Number of events') plt.legend(loc='best') plt.title('Imbalanced validation set') plt.yscale('log') plt.savefig('predHisto_imba') fig5b = plt.figure() plt.hist(elecs_i, bins=binList, 
label='e', alpha=0.7, histtype='step', color='green', normed=True) plt.hist(prots_i, bins=binList, label='p', alpha=0.7, histtype='step', color='red', normed=True) plt.xlabel('Classifier score') plt.ylabel('Fraction of events') plt.title('Imbalanced validation set - normalised') plt.legend(loc='best') plt.yscale('log') plt.savefig('predHisto_imba_n')
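# Version note (assumption): the `normed` keyword used in the histograms above
# was deprecated and later removed from Matplotlib; on current releases the
# normalised plots are written with `density=True` instead. Self-contained
# illustration with made-up scores:
import numpy as np
import matplotlib.pyplot as plt
demo_scores = np.random.rand(1000)
plt.figure()
plt.hist(demo_scores, bins=50, histtype='step', density=True)
plt.savefig('density_demo')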
def test_fasttext(): """Test FASTTEXT model.""" # Print parameters used for the model dh.tab_printer(args, logger) # Load word2vec model word2idx, embedding_matrix = dh.load_word2vec_matrix(args.word2vec_file) # Load data logger.info("Loading data...") logger.info("Data processing...") test_data = dh.load_data_and_labels(args, args.test_file, word2idx) # Load fasttext model OPTION = dh._option(pattern=1) if OPTION == 'B': logger.info("Loading best model...") checkpoint_file = cm.get_best_checkpoint(BEST_CPT_DIR, select_maximum_value=True) else: logger.info("Loading latest model...") checkpoint_file = tf.train.latest_checkpoint(CPT_DIR) logger.info(checkpoint_file) graph = tf.Graph() with graph.as_default(): session_conf = tf.ConfigProto( allow_soft_placement=args.allow_soft_placement, log_device_placement=args.log_device_placement) session_conf.gpu_options.allow_growth = args.gpu_options_allow_growth sess = tf.Session(config=session_conf) with sess.as_default(): # Load the saved meta graph and restore variables saver = tf.train.import_meta_graph( "{0}.meta".format(checkpoint_file)) saver.restore(sess, checkpoint_file) # Get the placeholders from the graph by name input_x = graph.get_operation_by_name("input_x").outputs[0] input_y = graph.get_operation_by_name("input_y").outputs[0] dropout_keep_prob = graph.get_operation_by_name( "dropout_keep_prob").outputs[0] is_training = graph.get_operation_by_name("is_training").outputs[0] # Tensors we want to evaluate scores = graph.get_operation_by_name("output/scores").outputs[0] loss = graph.get_operation_by_name("loss/loss").outputs[0] # Split the output nodes name by '|' if you have several output nodes output_node_names = "output/scores" # Save the .pb model file output_graph_def = tf.graph_util.convert_variables_to_constants( sess, sess.graph_def, output_node_names.split("|")) tf.train.write_graph(output_graph_def, "graph", "graph-fasttext-{0}.pb".format(MODEL), as_text=False) # Generate batches for one epoch batches = dh.batch_iter(list(create_input_data(test_data)), args.batch_size, 1, shuffle=False) # Collect the predictions here test_counter, test_loss = 0, 0.0 test_pre_tk = [0.0] * args.topK test_rec_tk = [0.0] * args.topK test_F1_tk = [0.0] * args.topK # Collect the predictions here true_labels = [] predicted_labels = [] predicted_scores = [] # Collect for calculating metrics true_onehot_labels = [] predicted_onehot_scores = [] predicted_onehot_labels_ts = [] predicted_onehot_labels_tk = [[] for _ in range(args.topK)] for batch_test in batches: x, y_onehot, y = zip(*batch_test) feed_dict = { input_x: x, input_y: y_onehot, dropout_keep_prob: 1.0, is_training: False } batch_scores, cur_loss = sess.run([scores, loss], feed_dict) # Prepare for calculating metrics for i in y_onehot: true_onehot_labels.append(i) for j in batch_scores: predicted_onehot_scores.append(j) # Get the predicted labels by threshold batch_predicted_labels_ts, batch_predicted_scores_ts = \ dh.get_label_threshold(scores=batch_scores, threshold=args.threshold) # Add results to collection for i in y: true_labels.append(i) for j in batch_predicted_labels_ts: predicted_labels.append(j) for k in batch_predicted_scores_ts: predicted_scores.append(k) # Get onehot predictions by threshold batch_predicted_onehot_labels_ts = \ dh.get_onehot_label_threshold(scores=batch_scores, threshold=args.threshold) for i in batch_predicted_onehot_labels_ts: predicted_onehot_labels_ts.append(i) # Get onehot predictions by topK for top_num in range(args.topK): batch_predicted_onehot_labels_tk = 
dh.get_onehot_label_topk( scores=batch_scores, top_num=top_num + 1) for i in batch_predicted_onehot_labels_tk: predicted_onehot_labels_tk[top_num].append(i) test_loss = test_loss + cur_loss test_counter = test_counter + 1 # Calculate Precision & Recall & F1 test_pre_ts = precision_score( y_true=np.array(true_onehot_labels), y_pred=np.array(predicted_onehot_labels_ts), average='micro') test_rec_ts = recall_score( y_true=np.array(true_onehot_labels), y_pred=np.array(predicted_onehot_labels_ts), average='micro') test_F1_ts = f1_score(y_true=np.array(true_onehot_labels), y_pred=np.array(predicted_onehot_labels_ts), average='micro') for top_num in range(args.topK): test_pre_tk[top_num] = precision_score( y_true=np.array(true_onehot_labels), y_pred=np.array(predicted_onehot_labels_tk[top_num]), average='micro') test_rec_tk[top_num] = recall_score( y_true=np.array(true_onehot_labels), y_pred=np.array(predicted_onehot_labels_tk[top_num]), average='micro') test_F1_tk[top_num] = f1_score( y_true=np.array(true_onehot_labels), y_pred=np.array(predicted_onehot_labels_tk[top_num]), average='micro') # Calculate the average AUC test_auc = roc_auc_score(y_true=np.array(true_onehot_labels), y_score=np.array(predicted_onehot_scores), average='micro') # Calculate the average PR test_prc = average_precision_score( y_true=np.array(true_onehot_labels), y_score=np.array(predicted_onehot_scores), average="micro") test_loss = float(test_loss / test_counter) logger.info( "All Test Dataset: Loss {0:g} | AUC {1:g} | AUPRC {2:g}". format(test_loss, test_auc, test_prc)) # Predict by threshold logger.info( "Predict by threshold: Precision {0:g}, Recall {1:g}, F1 {2:g}" .format(test_pre_ts, test_rec_ts, test_F1_ts)) # Predict by topK logger.info("Predict by topK:") for top_num in range(args.topK): logger.info( "Top{0}: Precision {1:g}, Recall {2:g}, F1 {3:g}".format( top_num + 1, test_pre_tk[top_num], test_rec_tk[top_num], test_F1_tk[top_num])) # Save the prediction result if not os.path.exists(SAVE_DIR): os.makedirs(SAVE_DIR) dh.create_prediction_file(output_file=SAVE_DIR + "/predictions.json", data_id=test_data['id'], true_labels=true_labels, predict_labels=predicted_labels, predict_scores=predicted_scores) logger.info("All Done.")
# Calculate precision and recall for both classes (0 and 1) r1 = recall_score(y_vall, y_pred, labels=None, pos_label=1, average='binary', sample_weight=None) r0 = recall_score(y_vall, y_pred, labels=None, pos_label=0, average='binary', sample_weight=None) p1 = precision_score(y_vall, y_pred, labels=None, pos_label=1, average='binary', sample_weight=None) p0 = precision_score(y_vall, y_pred, labels=None, pos_label=0, average='binary', sample_weight=None) print('Recall for class 1 is {}'.format(r1)) print('Recall for class 0 is {}'.format(r0)) print('Precision for class 1 is {}'.format(p1)) print('Precision for class 0 is {}'.format(p0))
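# The same per-class numbers can be obtained in one call with sklearn's
# classification_report; minimal sketch (y_vall and y_pred here are placeholder
# arrays, not the real validation data).
from sklearn.metrics import classification_report
y_vall = [0, 1, 1, 0, 1, 0]
y_pred = [0, 1, 0, 0, 1, 1]
print(classification_report(y_vall, y_pred, digits=3))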
def train_and_eval_DNN(df, X_train, X_test, y_train, y_test, y_names, feature_set, metrics_manager, fold): """ Train and Evaluate Deep Neural Networks Args: df => pandas dataframe fold => n-fold cross validation Classifier names used as key in metrics_manager Keras-TensorFlow => keras Fast.ai => fastai Returns: None """ # Keras-TensorFlow DNN Model print('Training and Evaluating Keras-TensorFlow...') dnn_keras = Sequential(layers=[ Dense(128, kernel_regularizer=l2(0.001), activation='relu', input_shape=(len(X_train.columns), )), BatchNormalization(), Dense(64, activation='relu', kernel_regularizer=l2(0.001)), BatchNormalization(), Dense(y_train.nunique(), activation='softmax') ]) dnn_keras.compile(optimizer='adam', loss='binary_crossentropy') dnn_keras.fit(X_train, pd.get_dummies(y_train), epochs=100, verbose=0, batch_size=512) #loss, acc = dnn_keras.evaluate(X_test, pd.get_dummies(y_test), verbose=0) y_pred = dnn_keras.predict_classes(X_test) acc = accuracy_score(y_test, y_pred) bal_acc = balanced_accuracy_score(y_test, y_pred) rec = recall_score(y_test, y_pred, average='weighted') prec = precision_score(y_test, y_pred, average='weighted') auc = roc_auc_score(y_test, y_pred) f1 = f1_score(y_test, y_pred) m = Metric('Keras-Tensorflow', fold=fold) m.addValue('acc', 100 * acc) m.addValue('bal-acc', 100 * bal_acc) m.addValue('rec', 100 * rec) m.addValue('prec', 100 * prec) m.addValue('auc', 100 * auc) m.addValue('f1', 100 * f1) metrics_manager.addMetric(m) metrics_manager.printMeasures() # Fast.ai DNN Model, v.2 print('Training and Evaluating Fast.ai...') splits = RandomSplitter(valid_pct=0.2)(range_of(X_train)) #print(feature_set) #print(df[:5]) tp = TabularPandas(df, procs=[], cat_names=[], cont_names=list(feature_set), y_names=y_names, splits=splits) dls = tp.dataloaders(bs=64) #dls.show_batch() #return dnn_fastai = tabular_learner(dls, metrics=accuracy) dnn_fastai.fit_one_cycle(5) # acquire predictions y_pred = [] #print('Length of test set: {}'.format(len(y_test))) for j in range(len(y_test)): row, clas, probs = dnn_fastai.predict(X_test.iloc[j]) #print(clas) pred = 0 if clas >= tensor(0.5): pred = 1 y_pred.append(pred) acc = accuracy_score(y_test, y_pred) bal_acc = balanced_accuracy_score(y_test, y_pred) rec = recall_score(y_test, y_pred, average='weighted') prec = precision_score(y_test, y_pred, average='weighted') auc = roc_auc_score(y_test, y_pred) f1 = f1_score(y_test, y_pred) m = Metric('fastai', fold=fold) m.addValue('acc', 100 * acc) m.addValue('bal-acc', 100 * bal_acc) m.addValue('rec', 100 * rec) m.addValue('prec', 100 * prec) m.addValue('auc', 100 * auc) m.addValue('f1', 100 * f1) metrics_manager.addMetric(m)
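# Version note (assumption): Sequential.predict_classes, used above, was removed
# in newer Keras releases; with a softmax output the usual replacement is an
# argmax over model.predict. Tiny illustration with made-up class probabilities:
import numpy as np
probs = np.array([[0.9, 0.1], [0.3, 0.7]])   # stand-in for dnn_keras.predict(X_test)
y_pred = np.argmax(probs, axis=-1)
print(y_pred)                                # [0 1]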
def run(k, j, filename, seednum=10, threshold = 0.5, resultdir=None): # classes = ["P1a1" , "P1a2" , "P2b" , "P2c" ] classes = ["P1a1" , "P1a2", "P2b", "P2c", "H1" ] # H1 H2 O (1) P1a1 (4) P1a2 (6) P2b P2c S1a (0) S1c S2 S3 joind = gp.read_file(filename, layer = layers[j]) print(f'\n------\n------{layers[j]}----\n-----\n') df1 = pd.DataFrame(joind.drop(columns='geometry')) df1 = df1.replace([np.inf, -np.inf], np.nan).dropna() Pcl = df1.loc[df1['geocode_2'].isin(classes)] # filter only classes of interest print(Pcl['geocode_2'].value_counts()) # regroup, geocode_2 from here on becomes binary! Pcl['geocode_2'] = np.where(Pcl['geocode_2'].str.contains(classes[k]),classes[k],'Others') print(Pcl['geocode_2'].value_counts()) minc = min(Pcl['geocode_2'].value_counts() ) # skip if less than 20 objects if minc< 20 or minc==len(Pcl): print("minimum class less than 20") return (-1, -1) # -1 -1 if not calculated else: print(f'total {len(df1)}, P_H1_classes: {len(Pcl)}, minimum class: {minc}') # bootstrap and get averaged accuracy avepre = np.zeros(1) # store all the precisions in each CV averec = np.zeros(1) for seeds in range(seednum): np.random.seed(seeds) # use groupby to sample the same amount for each group. # use 70% of data for training, get the index train = Pcl.groupby('geocode_2').sample(n = int(minc*0.7)).index test = Pcl[~Pcl.index.isin(train)].index #len(train)+len(test) df_covar = Pcl X_train = df_covar.loc [train ].drop(columns=["geocode_2","layer","OBJECTID","path"]) X_test = df_covar.loc [test ].drop(columns=["geocode_2","layer","OBJECTID","path"]) Y_train =Pcl.filter(regex='geocode_2').loc[train].values Y_test =Pcl.filter(regex='geocode_2').loc[test].values # relabel label_all = [classes[k], "Others"] #classtype = [(j, "float32") for j in classes] #Pcl.geocode_2.unique() i = 0 idx2class = {} class2idx = {} for tp in label_all: idx2class[i] = tp class2idx[tp] = i i+= 1 Y_trainnum = cl2idx(Y_train, class2idx).astype(int) Y_testnum = cl2idx(Y_test, class2idx).astype(int) np.unique(Y_trainnum) # could consider using scikit-learn or h2o instead of the xgb API. # note the number of estimators can only be specified in xgb.train, not in the params. dtrain = xgb.DMatrix(X_train, label=Y_trainnum) dtest = xgb.DMatrix(X_test, label=Y_testnum) params = {'max_depth': 6, 'eta': 0.002, 'objective':'binary:logistic', 'num_class': 1, 'eval_metric':['merror', 'mlogloss', 'auc' ] } # Fit #print("Train and test shapes, dividing number of classes for the sample size (i.e. 2 for binary case)") #print(X_train.shape, Y_trainnum.shape, X_test.shape, Y_testnum.shape) model = xgb.train(params, dtrain, 500) #num_round = 500 yhat = model.predict(dtest) # threshold 0.5, probability higher than 0.5 -> positive.
yhat_labels = yhat>threshold yhat_labels = yhat_labels.astype(int) #get accuracy score accuracy = accuracy_score(Y_testnum, yhat_labels) # get precision and recall # print("precision: tp / (tp + fp)") # print(label_all) recall=np.round(recall_score(Y_testnum, yhat_labels, average = None),2)[0] # only get the recall and precision for the class of interest, therefore "[0]" precision = np.round(precision_score(Y_testnum, yhat_labels, average = None),2)[0] averec = np.append(averec, recall) #store all of them avepre= np.append(avepre, precision) recall = averec.sum()/seednum #get the mean but exclude the first one (0) precision = avepre.sum()/seednum print(averec, recall) if resultdir is not None: Y_testnum = Y_testnum.astype(int) plt.rcParams.update({'font.size': 8}) ax = xgb.plot_importance(model, grid=False, importance_type='gain', title='Feature importance') ax.set_title(f'xgboost importance {layers[j]} {classes[k]}') fname = f"{resultdir}P_{layers[j]}_{classes[k]}_imp" plt.savefig(fname, dpi=1200) return (recall, precision)
def crossvalidationThr(filepath): raw_df = pd.read_csv(filepath) Options_df = make_clean_Options_df(raw_df) print (Options_df.head()) y = Options_df.pop('Options').values X = Options_df.values X_train, X_test, y_train, y_test = train_test_split(X, y,stratify=y,test_size=0.33) # After the split #X_train,y_train = oversample0(X_train,y_train) kfold = StratifiedKFold(5,shuffle=False) accuracies = [] precisions = [] recalls = [] sumofy = [] sumofy2 = [] F1s = [] thr = [0.27411032 ,0.45177236 ,0.43848543 ,0.57502369 ,0.52727048, 0.26294993, 0.3734106, 0.55862627 ,0.35298962, 0.55236403, 0.86459271, 0.47550826, 0.36730173, 0.69804784 ,0.42914183, 0.56947256, 0.59538206, 0.58619825, 0.73952725, 0.76668073 ,0.7893238 , 0.65972535, 0.77526139, 0.43201526, 0.73858921, 0.60181424 ,0.43968636, 0.75375776, 0.78417738, 0.53074894, 0.75953991, 0.62555268 ,0.48809615, 0.67150554, 0.52648925] bestthr = 0 bestF1 = 0 for i in thr: for train_index, test_index in kfold.split(X_train,y_train): model = LR(solver='liblinear') model.fit(X_train[train_index], y_train[train_index]) #y_predict = model.predict(X_train[test_index]) y_proba = model.predict_proba(X_train[test_index])[:,1] # Above the model is using predict_proba; this returns the 'probability' of a 1 # the code below uses this with a different threshold to get the actual prediction y_predict = np.array([(lambda z: 1 if z >i else 0)(z) for z in y_proba]) y_true = y_train[test_index] print(y_proba) print('Predict',y_predict) print('True ',y_true) accuracies.append(accuracy_score(y_true, y_predict)) precisions.append(precision_score(y_true, y_predict)) recalls.append(recall_score(y_true, y_predict)) sumofy.append(len(y_predict)-y_predict.sum()) sumofy2.append(len(y_true)-y_true.sum()) F1s.append(f1_score(y_true, y_predict)) print ("accuracy:", np.average(accuracies)) print ("precision:", np.average(precisions)) print ("recall:", np.average(recalls)) print ("sumofyPredict:", np.average(sumofy)) print ("sumofyTrue:", np.average(sumofy2)) print ('F1', np.average(F1s)) if(np.average(F1s) > bestF1): bestF1 = np.average(F1s) bestthr = i accuracies = [] precisions = [] recalls = [] sumofy = [] sumofy2 = [] F1s = [] print('BestF1 and threshold',bestF1,bestthr)
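# A common alternative to sweeping the hard-coded `thr` list above: score every
# candidate threshold returned by precision_recall_curve and keep the best F1.
# Sketch only; y_true / y_proba stand in for one fold's labels and probabilities.
import numpy as np
from sklearn.metrics import precision_recall_curve
y_true = np.array([0, 0, 1, 1, 1, 0, 1])
y_proba = np.array([0.1, 0.4, 0.35, 0.8, 0.65, 0.2, 0.9])
precisions, recalls, thresholds = precision_recall_curve(y_true, y_proba)
f1s = 2 * precisions[:-1] * recalls[:-1] / (precisions[:-1] + recalls[:-1] + 1e-12)
print('best threshold:', thresholds[np.argmax(f1s)])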
def FULL_onehot_chem(Dropout1=0,Epochs= 20,Batch_size=64): # The optimizer is Adam. # The loss could be sparse_categorical_crossentropy or categorical_crossentropy; the difference is the form of the true labels: # sparse_categorical takes integer labels, e.g. [1, 2, 3, 4], while categorical takes one-hot encoded labels. Feature_test = np.load("../../data_all/TCRB_train_feature_array.npy") Label_array = np.load("../../data_all/TCRB_train_label_array.npy") X = Feature_test#[:,0:29,:] # take the one-hot features #print(X[0]) Y = Label_array[:,1] X = X.reshape(len(X),-1) #loo = LeaveOneOut() kf = KFold(n_splits=5,shuffle=True,random_state=0) kf.get_n_splits(X) TN = FP = FN = TP = 0 aa = 1 for train_index, test_index in kf.split(X): np.random.shuffle(train_index) np.random.shuffle(test_index) X_train, X_test = X[train_index], X[test_index] Y_train, Y_test = Y[train_index], Y[test_index] X_train= X_train.reshape([len(X_train),29,20,2]) X_test = X_test.reshape([len(X_test),29,20,2]) X_test=tf.cast(X_test, tf.float32) model = tf.keras.models.Sequential([ tf.keras.layers.Flatten(input_shape=(29,20,2)), tf.keras.layers.Dense(256,activation='relu'),# kernel_regularizer=regularizers.l2(0.01)),# activation='relu', tf.keras.layers.Dense(512,activation='relu'),# kernel_regularizer=regularizers.l2(0.01)),# activation='relu', tf.keras.layers.Dense(256,activation='relu'),# kernel_regularizer=regularizers.l2(0.01)),# activation='relu', #tf.keras.layers.LeakyReLU(alpha=0.05), tf.keras.layers.Dense(128,activation='relu'), #tf.keras.layers.LeakyReLU(alpha=0.05), tf.keras.layers.Dense(64,activation='relu'), #tf.keras.layers.LeakyReLU(alpha=0.05), tf.keras.layers.Dropout(Dropout1),# Dropout: a float between 0 and 1, the fraction of inputs to drop tf.keras.layers.Dense(1, activation='sigmoid') ]) model.compile(optimizer="Adam", loss=keras.losses.binary_crossentropy, metrics=['accuracy']) model.fit(X_train, Y_train, epochs= Epochs , batch_size= Batch_size, verbose=0,) Y_pred = model.predict_classes(X_test) #print(Y_pred) confusion_matrix1 =confusion_matrix(Y_test,Y_pred) # sklearn's confusion_matrix is [[TN, FP], [FN, TP]] for labels (0, 1) TN += confusion_matrix1[0,0] FP += confusion_matrix1[0,1] FN += confusion_matrix1[1,0] TP += confusion_matrix1[1,1] # accuracy = accuracy_score(Y_test,Y_pred) # accuracy # precision = precision_score(Y_test,Y_pred) # precision # recall = recall_score(Y_test,Y_pred) # recall # f1= f1_score(Y_test,Y_pred) #F1 # print('confusion matrix\n',confusion_matrix1, # '\naccuracy ACC:',accuracy, # '\nprecision:',precision, # '\nrecall:',recall, # '\nF1:',f1, # ) # y_predict = model.predict(X_test) # y_probs = model.predict_proba(X_test) # the model's predicted scores # #print(y_probs) # fpr, tpr, thresholds = metrics.roc_curve(Y_test,y_probs) # roc_auc = auc(fpr, tpr) # auc is the area under the ROC curve # # start plotting the ROC curve # plt.plot(fpr, tpr, 'b',label='AUC = %0.2f'% roc_auc) # plt.legend(loc='lower right') # plt.plot([0,1],[0,1],'r--') # plt.xlim([-0.1,1.1]) # plt.ylim([-0.1,1.1]) # plt.xlabel('False Positive Rate') # the x-axis is fpr # plt.ylabel('True Positive Rate') # the y-axis is tpr # plt.title('Receiver operating characteristic example') # plt.show() #model.save('./data_625/model_'+str(aa)+'.h5') #print(aa) if aa == 1: Y_test_all = Y_test Y_pred_all = Y_pred else: Y_test_all = np.append(Y_test_all, Y_test, axis=0) Y_pred_all = np.append(Y_pred_all, Y_pred, axis=0) aa += 1 del model print('\n\nOverall confusion matrix') print(TP,FN) print(FP,TN) #print(Y_test_all[0]) accuracy = accuracy_score(Y_test_all,Y_pred_all) # accuracy precision = precision_score(Y_test_all,Y_pred_all) # precision recall = recall_score(Y_test_all,Y_pred_all) # recall f1= f1_score(Y_test_all,Y_pred_all) #F1 MCC = matthews_corrcoef(Y_test_all,Y_pred_all) #MCC print('accuracy ACC:',accuracy, '\nprecision:',precision, '\nrecall:',recall, '\nF1:',f1, '\nMCC:',MCC )
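# Version note (assumption): predict_classes, used above, is gone from newer
# Keras releases; for the single sigmoid output here the usual replacement is to
# threshold model.predict at 0.5. Illustration with stand-in probabilities:
import numpy as np
probs = np.array([[0.8], [0.3], [0.6]])   # stand-in for model.predict(X_test)
Y_pred = (probs > 0.5).astype(int).ravel()
print(Y_pred)                             # [1 0 1]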
def main(): np.random.seed(1) random.seed(1) feat_data, labels, adj_lists, train, test, edge_map = load_cora() num_nodes = feat_data.shape[0] feat_dim = feat_data.shape[1] hidden_dim = 15 features = nn.Embedding(num_nodes, feat_dim) features.weight = nn.Parameter(torch.FloatTensor(feat_data), requires_grad=False) #features.cuda() agg1 = MeanAggregator(features, cuda=True) enc1 = Encoder(features, feat_dim, hidden_dim, adj_lists, agg1, gcn=False, cuda=False) agg2 = MeanAggregator(lambda nodes: enc1(nodes).t(), cuda=False) enc2 = Encoder(lambda nodes: enc1(nodes).t(), enc1.embed_dim, hidden_dim, adj_lists, agg2, base_model=enc1, gcn=False, cuda=False) enc1.num_samples = 5 enc2.num_samples = 5 graphsage = SupervisedGraphSage(1, enc2, edge_map) #graphsage.cuda() optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, graphsage.parameters()), lr=0.001, weight_decay=1e-5) times = [] epoch = 10 batch_size = 512 num_batch = len(train) // batch_size best = 1e9 cnt_wait = 0 patience = 20 best_t = 0 for e in range(epoch): for i in range(num_batch): if i < num_batch - 1: batch_nodes = train[i * batch_size:i * batch_size + batch_size] else: batch_nodes = train[i * batch_size:len(train)] start_time = time.time() optimizer.zero_grad() loss = graphsage.loss(batch_nodes,\ Variable(torch.FloatTensor(labels[np.array(batch_nodes)]))) loss.backward() optimizer.step() end_time = time.time() times.append(end_time - start_time) print("The {}-th epoch ".format(e), "{}-th batch".format(i), "Loss: ", loss.item()) if loss.item() < best: best = loss.item() cnt_wait = 0 best_t = e torch.save(graphsage.state_dict(), 'best_model.pkl') else: cnt_wait += 1 if cnt_wait == patience: print("early stopping!") break print('Loading {}th epoch'.format(best_t)) graphsage.load_state_dict(torch.load('best_model.pkl')) if len(test) < 100000: test_output = torch.sigmoid(graphsage.forward(test)) pred = (np.where(test_output.data.numpy() < 0.5, 0, 1)) print("Test F1:", f1_score(labels[test], pred, labels=[1], average="micro")) print("Test Recall:", recall_score(labels[test], pred, labels=[1], average="micro")) print("Test Precision:", precision_score(labels[test], pred, labels=[1], average="micro")) cm = plot_confusion_matrix( labels[test], pred, np.array([0, 1]), title='Confusion matrix, without normalization') #recall = cm[1][1]/(cm[1][0]+cm[1][1]) #precision = cm[1][1]/(cm[1][1]+cm[0][1]) #f1 = 2*recall*precision/(recall+precision) #print("Test F1 micro:", f1) #print("Test Recall micro:", recall) #print("Test Precision micro:", precision) ### Inference on large graph, avoid out of memory else: chunk_size = 5120 pred = [] for j in range(len(test) // chunk_size): if j < (len(test) // chunk_size - 1): test_output = torch.sigmoid( graphsage.forward(test[j * chunk_size:(j + 1) * chunk_size])) else: test_output = torch.sigmoid( graphsage.forward(test[j * chunk_size:len(test)])) pred += (np.where(test_output.data.numpy() < 0.5, 0, 1)).tolist() print("Inference on the {}-th chunk".format(j)) cm = plot_confusion_matrix( labels[test], np.asarray(pred), np.array([0, 1]), title='Confusion matrix, without normalization') print( "Test F1:", f1_score(labels[test], np.asarray(pred), labels=[1], average="micro")) print( "Test Recall:", recall_score(labels[test], np.asarray(pred), labels=[1], average="micro")) print( "Test Precision:", precision_score(labels[test], np.asarray(pred), labels=[1], average="micro")) print("Average batch time:", np.mean(times))
#====================== # SCORE CALCULATION #====================== score = clf.score(X_test, Y_test) f1_weighted = metrics.f1_score( Y_test, Y_pred, labels=None, pos_label=1, average='weighted', sample_weight=None) # takes the label imbalance into account precision_score = metrics.precision_score(Y_test, Y_pred, labels=None, pos_label=1, average='weighted', sample_weight=None) recall_score = metrics.recall_score(Y_test, Y_pred, labels=None, pos_label=1, average='weighted', sample_weight=None) G = math.sqrt(recall_score * precision_score) specifity = sensitivity_specifity.specificity_score(Y_test, Y_pred, average='weighted') #=============================== # CONFUSION MATRIX CALCULATION #===============================
def score(pred, labels): pre = precision_score(labels, pred, average='macro') recall = recall_score(labels, pred, average='macro') f1 = f1_score(labels, pred, average='macro') return pre, recall, f1
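# Example call of the score() helper defined above with dummy multi-class
# labels; macro averaging weights each class equally regardless of its support.
labels = [0, 1, 2, 2, 1, 0]
pred = [0, 2, 2, 2, 1, 0]
pre, recall, f1 = score(pred, labels)
print('macro P=%.3f R=%.3f F1=%.3f' % (pre, recall, f1))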
features_test = scaler.transform(features_test) def classify_NB(features_train, labels_train): ### your code goes here--should return a trained Naive Bayes classifier clf = GaussianNB() clf.fit(features_train, labels_train) return clf clf = classify_NB(features_train, labels_train) pred = clf.predict(features_test) print accuracy_score(labels_test, pred) print precision_score(labels_test, pred) print recall_score(labels_test, pred) print f1_score(labels_test, pred) def classify_DTC(features_train, labels_train): ### your code goes here--should return a trained decision tree classifier clf = DecisionTreeClassifier() clf.fit(features_train, labels_train) return clf clf = classify_DTC(features_train, labels_train) pred = clf.predict(features_test)
#CLF = DecisionTreeClassifier(random_state=0) #CLF = GaussianNB() #CLF = LogisticRegression() CLF = SVC(C=25) # Load dataset corpus, y = parse_dataset(trn_dataset) #3802 in total Xtrn = featurize(corpus) print(np.asarray(Xtrn).shape) class_counts = np.asarray(np.unique(y, return_counts=True)).T.tolist() print(class_counts) # Returns an array of the same size as 'y' where each entry is a prediction obtained by cross-validation predicted = cross_val_predict(CLF, Xtrn, y, cv=K_FOLDS) #predicted = libsvm.cross_validation(Z, np.asarray(y,'float64'), 5, kernel = 'rbf') score = metrics.f1_score(y, predicted, pos_label=1) acc = metrics.accuracy_score(y, predicted) preci = metrics.precision_score(y, predicted) recall = metrics.recall_score(y, predicted) print("F1-score:", score) print("Accuracy:", acc) print("Precision:", preci) print("Recall:", recall) testing(Xtrn, y, True) for p in predicted: PREDICTIONSFILE.write("{}\n".format(p)) PREDICTIONSFILE.close()
print "\nPrecision Score" print precision_score(y_test, y_pred) print "\nRecall Score" print recall_score(y_test, y_pred) print "\nF1 Score" print f1_score(y_test, y_pred)""" print("################### SVM Classifier ###############") from sklearn.svm import LinearSVC clf = LinearSVC(random_state=20, tol=1e-5) clf = clf.fit(x_train, y_train) print "\nAccuracy on Training Set :" print clf.score(x_train, y_train) print "Checking on Test Set" print "\nAccuracy on Testing Set :" print clf.score(x_test, y_test) y_pred = clf.predict(x_test) print "\nPrecision Score" print precision_score(y_test, y_pred) print "\nRecall Score" print recall_score(y_test, y_pred) print "\nF1 Score" print f1_score(y_test, y_pred)
all_terms_list.extend(term_prob[i].keys()) all_terms_list = set(all_terms_list) #""" #-------------------------- Classification -------------------------- classifier = CopulaClassifier(corcoeff, vocab_choice, priors) predictions = classifier.predict_multilabelBR(test_docs, all_terms=all_terms_list) print "The Classification is complete and it took", print_time(start_time) #print "Avg time taken per doc: ", (print_time(start_time)/float(len(test_docs))) start_time = time.time() #-------------------------- Evaluation ---------------------- precision = precision_score(test_labels, predictions, average='micro') recall = recall_score(test_labels, predictions, average='micro') f1 = f1_score(test_labels, predictions, average='micro') print("Micro-average quality numbers") print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format( precision, recall, f1)) precision = precision_score(test_labels, predictions, average='macro') recall = recall_score(test_labels, predictions, average='macro') f1 = f1_score(test_labels, predictions, average='macro') print("Macro-average quality numbers") print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format( precision, recall, f1))
# # For classification problems that are skewed in their class distributions, like in our case, for example if we had 100 text messages and only 2 were spam and the rest 98 weren't, accuracy by itself is not a very good metric. We could classify 90 messages as not spam (including the 2 that were spam but we classify them as not spam, hence they would be false negatives) and 10 as spam (all 10 false positives) and still get a reasonably good accuracy score. For such cases, precision and recall come in very handy. These two metrics can be combined to get the F1 score, which is the harmonic mean of the precision and recall scores (a short worked example follows after this cell). This score can range from 0 to 1, with 1 being the best possible F1 score. # We will be using all 4 metrics to make sure our model does well. For all 4 metrics whose values can range from 0 to 1, having a score as close to 1 as possible is a good indicator of how well our model is doing. # In[ ]: ''' Instructions: Compute the accuracy, precision, recall and F1 scores of your model using your test data 'y_test' and the predictions you made earlier stored in the 'predictions' variable. ''' # In[51]: ''' Solution ''' from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score print('Accuracy score: ', format(accuracy_score(y_test, predictions))) print('Precision score: ', format(precision_score(y_test, predictions))) print('Recall score: ', format(recall_score(y_test, predictions))) print('F1 score: ', format(f1_score(y_test, predictions))) # ### Step 7: Conclusion ### # # One of the major advantages that Naive Bayes has over other classification algorithms is its ability to handle an extremely large number of features. In our case, each word is treated as a feature and there are thousands of different words. Also, it performs well even with the presence of irrelevant features and is relatively unaffected by them. The other major advantage it has is its relative simplicity. Naive Bayes works well right out of the box and tuning its parameters is rarely ever necessary, except usually in cases where the distribution of the data is known. # It rarely ever overfits the data. Another important advantage is that its model training and prediction times are very fast for the amount of data it can handle. All in all, Naive Bayes really is a gem of an algorithm! # # Congratulations! You have successfully designed a model that can efficiently predict if an SMS message is spam or not! # # Thank you for learning with us!
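# Worked example referenced above: F1 is the harmonic mean of precision and
# recall, so it is pulled towards whichever of the two is lower.
precision, recall = 0.75, 0.60
f1 = 2 * precision * recall / (precision + recall)
print(round(f1, 3))   # 0.667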
#print('five fold:') print(mean_sensitivity) print(mean_SP) print(mean_ACC) print(mean_MCC) print(mean_AUC) clf.fit(gram_train, y_train) y_score = clf.predict_proba(gram_test) y_score = get_y_score(y_score) precision, recall, thresholds = metrics.precision_recall_curve(y_test, y_score) y_pred = clf.predict(gram_test) ACC = metrics.accuracy_score(y_test, y_pred) precision = metrics.precision_score(y_test, y_pred) sensitivity = metrics.recall_score(y_test, y_pred) specificity = specificity_score(y_test, y_pred) AUC = metrics.roc_auc_score(y_test, y_score) MCC = metrics.matthews_corrcoef(y_test, y_pred) AUPR = get_AUPR(y_test, y_score) #print("===========================") #print('testing:') print(sensitivity) print(specificity) print(ACC) print(MCC) print(AUC) #print('AUPR =', AUPR)
np.set_printoptions(precision=2) plt.figure(figsize=(10, 6), dpi=220) plot_confusion_matrix(cmatrix, title='Confusion matrix - No Normalisation') plt.figure(figsize=(10, 6), dpi=220) plot_confusion_matrix(cmatrix, normalize=True, title='Confusion Matrix - Normalised') plt.show() # In[9]: print('Recall score: %0.2f' % recall_score(y_test, y_pred)) print('Accuracy score: %0.2f' % accuracy_score(y_test, y_pred)) print('Precision score: %0.2f' % precision_score(y_test, y_pred)) # In[10]: model = RandomForestClassifier(n_estimators=64) scores = cross_val_score(model, X_train, y_train, cv=20) print('Accuracy: %0.2f (+/- %0.2f)' % (scores.mean(), scores.std() * 2)) # In[28]: model = DummyClassifier(strategy="most_frequent") model.fit(X_train, y_train) y_pred = model.predict(X_test)
ensemble_scores_pos = ensemble_scores_pos + scores[:, 1][labelzz == 1].tolist() print('Makes a prediction for', (len(positive_pred) + len(negative_pred)), 'domains') print('Would predict', np.sum(predictions), 'domains malicious') else: total_manual += len(labelzz.index) ensemble_predictions = ensemble_predictions + labelzz.values.tolist() ensemble_labels = ensemble_labels + labelzz.values.tolist() print('Total work reduced', (total_amount_of_domains-total_manual)/total_amount_of_domains) print('Total FNR', total_fn/total_amount_positive) print('Total FPR', total_fp/total_amount_negative) print('Accuracy', accuracy_score(ensemble_labels, ensemble_predictions)) print('F1', f1_score(ensemble_labels, ensemble_predictions)) print('Precision', precision_score(ensemble_labels, ensemble_predictions)) print('Recall', recall_score(ensemble_labels, ensemble_predictions)) print('Little check', total_amount_positive+total_amount_negative == total_amount_of_domains) print('Little check', total_pred+total_manual == total_amount_of_domains) print('Little check', len(ensemble_scores_pos) + len(ensemble_scores_neg) == total_amount_of_domains) print('Little check', len(ensemble_scores_pos) == total_amount_positive) print('Little check', len(ensemble_scores_neg) == total_amount_negative) results_posteriori['work_reduction_metric'].append((total_amount_of_domains - total_manual) / total_amount_of_domains) results_posteriori['fnr_metric'].append(total_fn / total_amount_positive) results_posteriori['fpr_metric'].append(total_fp / total_amount_negative) results_posteriori['accuracy_metric'].append(accuracy_score(ensemble_labels, ensemble_predictions)) results_posteriori['f1_metric'].append(f1_score(ensemble_labels, ensemble_predictions)) results_posteriori['precision_metric'].append(precision_score(ensemble_labels, ensemble_predictions)) results_posteriori['recall_metric'].append(recall_score(ensemble_labels, ensemble_predictions))
# In[26]: #applying on training dataset y_train = [] pred = [] for row in train_dataset: prediction = predict(network, row) y_train.append(int(row[-1])) pred.append(prediction) # In[27]: print("Accuracy: ", accuracy_score(y_train, pred)) print("Confusion Matrix: ", confusion_matrix(y_train, pred)) print("Precision: ", precision_score(y_train, pred)) print("recall: ", recall_score(y_train, pred)) # In[28]: #applying on testing dataset y_test = [] pred = [] for row in test_dataset: prediction = predict(network, row) y_test.append(row[-1]) pred.append(prediction) # In[29]: print("Accuracy: ", accuracy_score(y_test, pred))
def eval_once(cdir, saver, top_k_op, labels_np, logits, predict): """Run Eval once. Args: saver: Saver. summary_writer: Summary writer. top_k_op: Top K op. summary_op: Summary op. """ # gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.1) gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=1) with tf.Session(config=tf.ConfigProto( log_device_placement=False, # gpu_options=gpu_options)) as sess: ckpt = tf.train.get_checkpoint_state(checkpoint_dir=cdir) if ckpt and ckpt.model_checkpoint_path: # Restores from checkpoint saver.restore(sess, ckpt.model_checkpoint_path) # Assuming model_checkpoint_path looks something like: # /my-favorite-path/cifar10_train/model.ckpt-0, # extract global_step from it. # global_step = ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1] else: print('No checkpoint file found') return # Start the queue runners. try: # num_iter = int(math.ceil(FLAGS.num_examples / FLAGS.batch_size)) # true_count = 0 # Counts the number of correct predictions. # total_sample_count = num_iter * FLAGS.batch_size # step = 0 # while step < num_iter and not coord.should_stop(): # predictions = sess.run([top_k_op]) # true_count += np.sum(predictions) # step += 1 # predictions = sess.run([top_k_op]) # num_iter = int(math.ceil(FLAGS.num_examples / FLAGS.batch_size_test)) # true_count = 0 # Counts the number of correct predictions. # total_sample_count = num_iter * FLAGS.batch_size_test # step = 0 # while step < num_iter and not coord.should_stop(): _, logits, pred = sess.run([top_k_op, logits, predict]) # with open("testresultss_LSTM.txt", "a") as myfile: # myfile.write(np.array2string(logits) + '\n') # print(pred) # logits_max = np.argmax(logits,axis=1) # print(logits_max) # print(labels_np) precision = precision_score(labels_np, pred) recall = recall_score(labels_np, pred) acc = accuracy_score(labels_np, pred) cm = confusion_matrix(labels_np, pred) # recall = sess.run(rec_op) # acc = sess.run(acc_op)#accuracy # true_count = np.sum(predictions) # false_count = FLAGS.num_examples - np.count_nonzero(predictions) # Compute precision @ 1. # precision = true_count / FLAGS.num_examples # recall = tf.metrics.recall(labels=labels,predictions=predictions) print('precision @ 1 = %.3f recall @ 1 = %.3f acc @ 1 = %.3f' % (precision, recall, acc)) with open("testresultss_LSTM.txt", "a") as myfile: # myfile.write(cdir + '\n') myfile.write(cdir + ',%.3f,%.3f,%.3f \n' % (precision, recall, acc)) myfile.write(np.array2string(cm) + '\n') # summary = tf.Summary() # summary.ParseFromString(sess.run(summary_op)) # summary.value.add(tag='Precision @ 1', simple_value=precision) # summary.value.add(tag='Recall @ 1', simple_value=recall) # summary.value.add(tag='Accuracy @ 1', simple_value=acc) # summary_writer.add_summary(summary, global_step) except Exception as e: # pylint: disable=broad-except print('error in ' + cdir)
from sklearn.neighbors import KNeighborsClassifier neigh = KNeighborsClassifier(n_neighbors=3) neigh.fit(variables,labels) pred = neigh.predict(variables) accuracy=sklearn.metrics.accuracy_score(labels, pred) print(accuracy) from sklearn.metrics import f1_score knn_f1_score = f1_score(labels, pred, average='macro') print(knn_f1_score) from sklearn.metrics import precision_score knn_precision_score = precision_score(labels, pred, average='macro') print(knn_precision_score) from sklearn.metrics import recall_score knn_recall_score = recall_score(labels, pred, average='macro') print(knn_recall_score) ################################################################################ variables_train, variables_test, labels_train, labels_test=train_test_split( variables, labels, test_size=.9, random_state=1) neigh = KNeighborsClassifier(n_neighbors=3) neigh.fit(variables_train,labels_train) pred = neigh.predict(variables_test) accuracy=sklearn.metrics.accuracy_score(labels_test, pred) print(accuracy)