def on_epoch_end(self, epoch, logs=None):
     # losses
     self.losses_train.append(self.model.evaluate(X_train, Y_train, batch_size=128,verbose =0))
     self.losses_val.append(self.model.evaluate(X_val, Y_val, batch_size=128,verbose = 0))
     
     # Roc train
     train_preds = self.model.predict_proba(X_train, verbose=0)
     train_preds = train_preds[:, 1]
     roc_train = metrics.roc_auc_score(y_train, train_preds)
     self.roc_train.append(roc_train)
     
     # Roc val
     val_preds = self.model.predict_proba(X_val, verbose=0)
     val_preds = val_preds[:, 1]
     roc_val = metrics.roc_auc_score(y_val, val_preds)
     self.roc_val.append(roc_val)
     
     # Metrics train
     y_preds = self.model.predict_classes(X_train,verbose = 0)
     self.f1_train.append(metrics.f1_score(y_train,y_preds))
     self.recal_train.append(metrics.recall_score(y_train,y_preds))
     self.preci_train.append(metrics.precision_score(y_train,y_preds))
     
     # Metrics val
     y_preds = self.model.predict_classes(X_val,verbose =0)
     self.f1_val.append(metrics.f1_score(y_val,y_preds))
     self.recal_val.append(metrics.recall_score(y_val,y_preds))
     self.preci_val.append(metrics.precision_score(y_val,y_preds))
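For context, a minimal sketch of how a callback like this could be wired up. The class name and the list attributes initialised in __init__ are assumptions (they are not shown in the original), and the snippet presumes the older Keras Sequential API (predict_proba / predict_classes) plus X_train/Y_train/y_train and friends being in scope.

# minimal sketch (assumed names); the on_epoch_end above would be a method of this class
from tensorflow import keras

class MetricsHistory(keras.callbacks.Callback):
    def __init__(self):
        super().__init__()
        self.losses_train, self.losses_val = [], []
        self.roc_train, self.roc_val = [], []
        self.f1_train, self.recal_train, self.preci_train = [], [], []
        self.f1_val, self.recal_val, self.preci_val = [], [], []

# history = MetricsHistory()
# model.fit(X_train, Y_train, epochs=10, batch_size=128, callbacks=[history])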
Example #2
def single_test(feature, attribute):
    from sklearn.metrics import f1_score
    from sklearn.metrics import recall_score
    from sklearn.metrics import accuracy_score
    from sklearn.linear_model import LogisticRegression
    from data_generator import load_vector_from_text
    import random
    data=merge_different_vectors([feature],attribute)
    none_attribute_uids=load_vector_from_text('uids_none_attributes.vector',feature,'list')
    none_attribute_uids=filter(lambda x:x in data[0],none_attribute_uids)
    alpha=0.2*len(data[0])/len(none_attribute_uids)
    train_data=[[],[]]
    test_data=[[],[]]
    for index,uid in enumerate(data[0]):
        if uid in none_attribute_uids and random.random()<alpha:
        #if random.random()<0.2:
            test_data[0].append(data[1][index])
            test_data[1].append(data[2][index])
        else:
            train_data[0].append(data[1][index])
            train_data[1].append(data[2][index])
    print len(test_data[1]),sum(test_data[1]),len(train_data[1]),sum(train_data[1])
    clf=LogisticRegression()
    clf.fit(train_data[0], train_data[1])
    predicted_y=clf.predict(test_data[0])
    test_accuracy=accuracy_score(test_data[1],predicted_y)
    test_recall=recall_score(test_data[1],predicted_y)
    test_f1=f1_score(test_data[1],predicted_y)
    print 'F1 of test data (%d %d): %0.2f'%(sum(test_data[1]),len(test_data[1])-sum(test_data[1]),test_f1)
    print 'Accuracy of test data (%d %d): %0.2f'%(sum(test_data[1]),len(test_data[1])-sum(test_data[1]),test_accuracy)
    predicted_y=clf.predict(train_data[0])
    train_accuracy=accuracy_score(train_data[1],predicted_y)
    train_recall=recall_score(train_data[1],predicted_y)
    train_f1=f1_score(train_data[1],predicted_y)
    print 'F1 of train data (%d %d): %0.2f'%(sum(train_data[1]),len(train_data[1])-sum(train_data[1]),train_f1)
    return [test_accuracy,test_recall,test_f1,train_accuracy,train_recall,train_f1]
Example #3
def confusion_matrix(true_y, pred_y, labels):
    c_matrix = metrics.confusion_matrix(true_y, pred_y)

    confusion_table = []
    first_row = ["C.Matrix"] + labels + ["ACTUAL"] + ["RECALL"]
    confusion_table.append(first_row)

    recall = metrics.recall_score(true_y, pred_y, average=None)
    for r, row in enumerate(c_matrix):
        new_row = [labels[r]]
        new_row.extend(row)
        new_row.append(sum(row))
        new_row.append(recall[r])
        confusion_table.append(new_row)

    new_row = ["PREDICTED"]
    for l in labels:
        new_row.append(len([t for t in pred_y if t == l]))
    new_row.append(len(true_y))
    new_row.append(metrics.recall_score(true_y, pred_y, average='macro'))
    confusion_table.append(new_row)

    new_row = ["PRECISION"]
    new_row.extend(metrics.precision_score(true_y, pred_y, average=None))
    new_row.append(metrics.precision_score(true_y, pred_y, average='macro'))
    new_row.append(metrics.f1_score(true_y, pred_y, average='macro'))
    confusion_table.append(new_row)

    confusion_table = pd.DataFrame(confusion_table)
    return confusion_table
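A toy usage sketch (assumed labels and data; note that metrics.confusion_matrix without a labels argument orders rows by sorted label values, so the labels list passed here follows that same order):

# toy usage sketch; assumes `from sklearn import metrics` and `import pandas as pd`
true_y = ['cat', 'cat', 'dog', 'dog', 'dog']
pred_y = ['cat', 'dog', 'dog', 'dog', 'cat']
print(confusion_matrix(true_y, pred_y, labels=['cat', 'dog']))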
 def applyClassifier(self, clf, name, training_set, testing_set, y_train, y_test):
     print("\nMODEL " + name)
     
     t0 = time()
     classifier = clf.fit(training_set, y_train)
     train_time = time() - t0
     print("train time: %0.3fs" % train_time)
     
     t0 = time()       
     y_nb_predicted = classifier.predict(testing_set)
     test_time = time() - t0
     print("test time:  %0.3fs" % test_time)
     
     precision = metrics.precision_score(y_test, y_nb_predicted)
     recall = metrics.recall_score(y_test, y_nb_predicted)
     f1_score = metrics.f1_score(y_test, y_nb_predicted)
     accuracy = metrics.accuracy_score(y_test, y_nb_predicted)
     micro_recall = metrics.recall_score(y_test, y_nb_predicted, average="micro")
     macro_recall = metrics.recall_score(y_test, y_nb_predicted, average="macro")
     micro_precision = metrics.precision_score(y_test, y_nb_predicted, average="micro")
     macro_precision = metrics.precision_score(y_test, y_nb_predicted, average="macro")        
     print 'The precision for this classifier is ' + str(precision)
     print 'The micro averaged precision for this classifier is ' + str(micro_precision)
     print 'The macro averaged precision for this classifier is ' + str(macro_precision)
     print 'The recall for this classifier is ' + str(recall)
     print 'The micro averaged recall for this classifier is ' + str(micro_recall)
     print 'The macro averaged recall for this classifier is ' + str(macro_recall)        
     print 'The f1 for this classifier is ' + str(f1_score)
     print 'The accuracy for this classifier is ' + str(accuracy) 
     
     return name, accuracy, precision, recall, micro_precision, micro_recall, macro_precision, macro_recall, train_time, test_time
def main():
	f = open("me.stdout", "r").read()

	print f
	
	(confusionMatrix, labels, ytrue, ypred, trueCount) = readConfusionMatrix.readText(f)
	for row in confusionMatrix:
		print row

	precisionMicro = np.float(metrics.precision_score(ytrue, ypred, average="micro"))
	recallMicro = np.float(metrics.recall_score(ytrue, ypred, average="micro"))
	f1Micro = np.float(metrics.f1_score(ytrue, ypred, average="micro"))
	f1Macro = np.float(metrics.f1_score(ytrue, ypred, pos_label=1, average="macro"))
	precisionMacro = np.float(metrics.precision_score(ytrue, ypred, average="macro"))
	recallMacro = np.float(metrics.recall_score(ytrue, ypred, average="macro"))

	mConf = metrics.confusion_matrix(ytrue, ypred)
	print mConf

	print labels
	print len(ytrue)
	print len(ypred)
	print trueCount

	print metrics.accuracy_score(ytrue, ypred)

	print precisionMicro
	print recallMicro
	print f1Micro
	print f1Macro
	print precisionMacro
	print recallMacro
Example #6
    def on_epoch_end(self, epoch, logs={}):
        print logs

        corr=0
        tot=0
        preds = self.model.predict(self.dev_data, verbose=1)
        preds_text=[]
        for l in preds:
            preds_text.append(self.index2label[np.argmax(l)])

        print "Micro f-score:", f1_score(self.dev_labels_text,preds_text,average=u"micro")
        print "Macro f-score:", f1_score(self.dev_labels_text,preds_text,average=u"macro")
        print "Macro recall:", recall_score(self.dev_labels_text,preds_text,average=u"macro")

        if self.best_mr < recall_score(self.dev_labels_text,preds_text,average=u"macro"):
            self.best_mr = recall_score(self.dev_labels_text,preds_text,average=u"macro")
            self.model.save_weights(self.model_name + '_full_' + str(epoch) + '_MR_' + str(self.best_mr) + '.hdf5')
            print 'Saved Weights!'


        print classification_report(self.dev_labels_text, preds_text)
        for i in xrange(len(self.dev_labels)):

        #    next_index = sample(preds[i])
            next_index = np.argmax(preds[i])
            # print preds[i],next_index,index2label[next_index]

            l = self.index2label[next_index]

            # print "correct:", index2label[np.argmax(dev_labels[i])], "predicted:",l
            if self.index2label[np.argmax(self.dev_labels[i])]==l:
                corr+=1
            tot+=1
        print corr,"/",tot
Example #7
def stratified_k_fold(clf,features,labels):
    skf = StratifiedKFold( labels, n_folds=3 )
    precisions = []
    recalls = []
    for train_idx, test_idx in skf:
        features_train = []
        features_test  = []
        labels_train   = []
        labels_test    = []
        for ii in train_idx:
            features_train.append( features[ii] )
            labels_train.append( labels[ii] )
        for jj in test_idx:
            features_test.append( features[jj] )
            labels_test.append( labels[jj] )

        ### fit the classifier using training set, and test on test set
        clf.fit(features_train, labels_train)
        pred = clf.predict(features_test)


        ### for each fold, print some metrics
        print
        print "precision score: ", precision_score( labels_test, pred )
        print "recall score: ", recall_score( labels_test, pred )

        precisions.append( precision_score(labels_test, pred) )
        recalls.append( recall_score(labels_test, pred) )

    ### aggregate precision and recall over all folds
    print "average precision: ", sum(precisions)/len(precisions)
    print "average recall: ", sum(recalls)/len(recalls)
Example #8
def create_all_eval_results(y_true,y_pred,key,system_features,sampling,replacement,num_of_samples):
    # precision = metrics.precision_score(y_true, y_pred, average='weighted')
    # recall = metrics.recall_score(y_true, y_pred, average='weighted')
    # F2 = calculateF2(precision, recall)
    name = data_names[key]

    y_true_bugs, y_pred_bugs = zip(*[[y_true[i], y_pred[i]] for i in range(len(y_true)) if y_true[i] == 1])
    # precision_bug, recall_bug, F_measure_bug ,_ = metrics.precision_recall_fscore_support(y_true_bugs,
    #                                                                                                  y_pred_bugs,
    #                                                                                                  average='micro')
    precision_bug =metrics.precision_score(y_true_bugs,y_pred_bugs,average='micro')
    recall_bug =metrics.recall_score(y_true_bugs,y_pred_bugs,average='micro')
    F2_bug = calculateF2(precision_bug,recall_bug)
    precision_bug_all, recall_bug_all,_ = metrics.precision_recall_curve(y_true_bugs, y_pred_bugs)
    prc_area_bug = metrics.auc(recall_bug_all, precision_bug_all)

    # precision, recall, F_measure,_ = metrics.precision_recall_fscore_support(y_true,
    #                                                                                                 y_pred,
    #                                                                                                 average='micro')
    precision = metrics.average_precision_score(y_true, y_pred, average='micro')
    recall = metrics.recall_score(y_true, y_pred, average='micro')
    F2 = calculateF2(precision, recall)
    precision_all, recall_all, _ = metrics.precision_recall_curve(y_true, y_pred)
    prc_area = metrics.auc(recall_all, precision_all)

    global results
    results.loc[len(results)] = [name,precision_bug,recall_bug,F2_bug,prc_area_bug, precision, recall,F2,prc_area,str(system_features),str(sampling),str(replacement),str(num_of_samples)]
Example #9
def cross_val(data_x, data_y, classifier, kFold, b_cost=1, h_cost=1, w=0.5):
    e_h, e_b = 0, 0
    y_tests, pred_probas = [], []
    
    for train_index, test_index in kFold:
        data_x_, data_y_ = np.array(data_x), np.array(data_y)
        X_train, X_test = list(data_x_[train_index]), list(data_x_[test_index])
        y_train, y_test = list(data_y_[train_index]), list(data_y_[test_index])
        classifier.fit(X_train, y_train)
        pred_proba = [r[0] for r in classifier.predict_proba(X_test)]
        y_tests += y_test
        pred_probas += pred_proba
    
    predictions = [0 if p*b_cost > (1-p)*h_cost else 1 for p in pred_probas]
    roc_auc = roc_auc_score(y_tests, pred_probas)
    total_acc = accuracy_score(y_tests, predictions)
    precision, recall, thresholds = precision_recall_curve(y_tests, pred_probas, pos_label=0)
    fpr, tpr, thresholds = roc_curve(y_tests, pred_probas, pos_label=0)
    precision_bots = precision_score(y_tests, predictions, pos_label = 0)
    precision_humans = precision_score(y_tests, predictions, pos_label = 1)
    recall_bots = recall_score(y_tests, predictions, pos_label = 0)
    recall_humans = recall_score(y_tests, predictions, pos_label = 1)
    f1_bots = f1_score(y_tests, predictions, pos_label = 0)
    f1_humans = f1_score(y_tests, predictions, pos_label = 1)
    conf_matrix = np.matrix(list(confusion_matrix(y_tests, predictions)))
    
    #plot_curve(fpr, tpr, 'ROC', w)
    #plot_curve(recall, precision, 'PR', w)
    
    return [total_acc, precision_bots, precision_humans, recall_bots, recall_humans, f1_bots, f1_humans, roc_auc, conf_matrix]
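The prediction step above encodes a simple expected-cost rule: with p = P(bot), predict the bot class (label 0) whenever p*b_cost exceeds (1-p)*h_cost. A standalone toy illustration of just that rule:

# toy illustration of the cost-based threshold used in cross_val (assumed values)
b_cost, h_cost = 1, 1            # misclassification costs for bots / humans
pred_probas = [0.9, 0.4, 0.2]    # p = P(class 0) for three examples
predictions = [0 if p * b_cost > (1 - p) * h_cost else 1 for p in pred_probas]
print(predictions)               # [0, 1, 1]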
Example #10
def calculate_f1_metrics(all_predicted, all_targets):
    first_class = first_meaningful_entity
    class_count = len(set(all_targets))
    filtered_true, filtered_predicted = [], []

    for i in range(len(all_targets)):
        if all_targets[i] > 0:
            filtered_true.append(all_targets[i])
            filtered_predicted.append(all_predicted[i])

    precision_separate_scores = metrics.precision_score(filtered_true, filtered_predicted,
                                                        labels=[i for i in range(first_class, class_count)],
                                                        average=None)
    precision_score = metrics.precision_score(filtered_true, filtered_predicted,
                                              labels=[i for i in range(first_class, class_count)], average='micro')
    recall_separate_scores = metrics.recall_score(filtered_true, filtered_predicted,
                                                  labels=[i for i in range(first_class, class_count)], average=None)
    recall_score = metrics.recall_score(filtered_true, filtered_predicted,
                                        labels=[i for i in range(first_class, class_count)], average='micro')
    f1_separate_scores = metrics.f1_score(filtered_true, filtered_predicted,
                                          labels=[i for i in range(first_class, class_count)], average=None)
    f1_score = metrics.f1_score(filtered_true, filtered_predicted,
                                labels=[i for i in range(first_class, class_count)], average='micro')

    return f1_separate_scores, f1_score, precision_separate_scores, precision_score, recall_separate_scores, recall_score
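A toy call of calculate_f1_metrics, assuming integer class ids where 0 marks "no entity" and first_meaningful_entity is a module-level constant equal to 1:

# toy sketch; `first_meaningful_entity` is assumed to be a module-level constant
first_meaningful_entity = 1
all_targets   = [0, 1, 2, 2, 1, 0, 2]
all_predicted = [0, 1, 2, 1, 1, 2, 2]
print(calculate_f1_metrics(all_predicted, all_targets))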
def run_model(X_test, X_train, y_test, y_train, prob_threshold = 20, layers = 5, nodes = 64, dropout = 50):
    
    print "run_model RUNNING"
    # Grab the model 
    model = get_model(X_test, layers =layers, dropout = dropout)
    model.fit(X_train, y_train, nb_epoch=20, batch_size=16, verbose = 0)

    # Get the training and test predictions from our model fit. 
    train_predictions  = model.predict_proba(X_train)
    test_predictions = model.predict_proba(X_test)
    # Set these to either 0 or 1 based on the probability threshold we
    # passed in (divide by 100 because we passed in integers).
    train_preds = (train_predictions) >= prob_threshold / 100.0
    test_preds = (test_predictions) >= prob_threshold / 100.0

    # Calculate the precision, recall, and accuracy scores.
    precision_score_train = precision_score(y_train, train_preds)
    precision_score_test = precision_score(y_test, test_preds)
    acc_train = accuracy_score(y_train, train_preds)
    acc_test = accuracy_score(y_test, test_preds)

    recall_score_train = recall_score(y_train, train_preds)
    recall_score_test = recall_score(y_test, test_preds)

    return precision_score_train, precision_score_test, recall_score_train, recall_score_test, acc_train, acc_test, model
def randomforest(input_file,Output):
    lvltrace.lvltrace("LVLEntree dans randomforest")
    ncol=tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1))
    X = data[:,1:]
    y = data[:,0]
    n_samples, n_features = X.shape
    clf = RandomForestClassifier(n_estimators=10)
    clf.fit(X,y)
    y_pred = clf.predict(X)
    print "#########################################################################################################\n"
    print "The Random forest algo "
    print "classification accuracy:", metrics.accuracy_score(y, y_pred)
    print "precision:", metrics.precision_score(y, y_pred)
    print "recall:", metrics.recall_score(y, y_pred)
    print "f1 score:", metrics.f1_score(y, y_pred)
    print "\n"
    print "#########################################################################################################\n"
    results = Output+"Random_Forest_metrics.txt"
    file = open(results, "w")
    file.write("Random Forest Classifier estimator accuracy\n")
    file.write("Classification Accuracy Score: %f\n"%metrics.accuracy_score(y, y_pred))
    file.write("Precision Score: %f\n"%metrics.precision_score(y, y_pred))
    file.write("Recall Score: %f\n"%metrics.recall_score(y, y_pred))
    file.write("F1 Score: %f\n"%metrics.f1_score(y, y_pred))
    file.write("\n")
    file.write("True Value, Predicted Value, Iteration\n")
    for n in xrange(len(y)):
        file.write("%f,%f,%i\n"%(y[n],y_pred[n],(n+1)))
    file.close()
    title = "The Random forest"
    save = Output + "Random_Forest_confusion_matrix.png"
    plot_confusion_matrix(y, y_pred,title,save)
    lvltrace.lvltrace("LVLSortie dans randomforest")
def stochasticGD(input_file,Output):
    lvltrace.lvltrace("LVLEntree dans stochasticGD")
    ncol=tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1))
    X = data[:,1:]
    y = data[:,0]
    n_samples, n_features = X.shape
    clf = SGDClassifier(loss="hinge", penalty="l2")
    clf.fit(X,y)
    y_pred = clf.predict(X)
    print "#########################################################################################################\n"
    print "Stochastic Gradient Descent "
    print "classification accuracy:", metrics.accuracy_score(y, y_pred)
    print "precision:", metrics.precision_score(y, y_pred)
    print "recall:", metrics.recall_score(y, y_pred)
    print "f1 score:", metrics.f1_score(y, y_pred)
    print "\n"
    print "#########################################################################################################\n"
    results = Output+"Stochastic_GD_metrics.txt"
    file = open(results, "w")
    file.write("Stochastic Gradient Descent estimator accuracy\n")
    file.write("Classification Accuracy Score: %f\n"%metrics.accuracy_score(y, y_pred))
    file.write("Precision Score: %f\n"%metrics.precision_score(y, y_pred))
    file.write("Recall Score: %f\n"%metrics.recall_score(y, y_pred))
    file.write("F1 Score: %f\n"%metrics.f1_score(y, y_pred))
    file.write("\n")
    file.write("True Value, Predicted Value, Iteration\n")
    for n in xrange(len(y)):
        file.write("%f,%f,%i\n"%(y[n],y_pred[n],(n+1)))
    file.close()
    title = "Stochastic Gradient Descent"
    save = Output + "Stochastic_GD_confusion_matrix.png"
    plot_confusion_matrix(y, y_pred,title,save)
    lvltrace.lvltrace("LVLSortie dans stochasticGD")
Example #14
def evaluate(ytest, ypred, filename='metrics.txt'):
    true_result = [1 if item > 0.5 else 0 for item in ytest]
    pred_result = [1 if item > 0.5 else 0 for item in ypred]
    
    cm = confusion_matrix(true_result, pred_result)
    print('\nConfusion matrix:')
    print(cm)
    print("\nLoss classified as loss", cm[0][0])
    print("Wins classified as wins", cm[1][1])
    print("Wins classified as loss", cm[1][0])
    print("Loss classified as wins", cm[0][1])
    print('\nAccuracy:\t', accuracy_score(true_result, pred_result))
    print('Precision:\t', precision_score(true_result, pred_result))
    print('Recall: \t', recall_score(true_result, pred_result))
    print('F1 score:\t', f1_score(true_result, pred_result))
    print('Mean absolute error:\t', mean_absolute_error(ytest, ypred))
    
    # print to file (open the file once and reuse the handle)
    with open(filename, "a") as f:
        print("Loss classified as loss", cm[0][0], file=f)
        print("Wins classified as wins", cm[1][1], file=f)
        print("Wins classified as loss", cm[1][0], file=f)
        print("Loss classified as wins", cm[0][1], file=f)
        print('\nAccuracy:\t', accuracy_score(true_result, pred_result), file=f)
        print('Precision:\t', precision_score(true_result, pred_result), file=f)
        print('Recall: \t', recall_score(true_result, pred_result), file=f)
        print('F1 score:\t', f1_score(true_result, pred_result), file=f)
        print('Mean absolute error:\t', mean_absolute_error(ytest, ypred), file=f)
Example #15
def _clf_mlp(trX,teX,trY,teY):
	print "MLP"
	print trX.shape,"trX shape"
	print "Enter Layer for MLP"
	layer=input()
	# print "enter delIdx"
	# delIdx=input()
	# while(delIdx):
	# 	trX=np.delete(trX,-1,axis=0)
	# 	trY=np.delete(trY,-1,axis=0)
	# 	delIdx=delIdx-1
	print "factors",factors(trX.shape[0])	
	teY=teY.astype(np.int32)
	trY=trY.astype(np.int32)
	print trX.shape,"trX shape"
	print "enter no of mini batch"
	mini_batch=int(input())
	mlp = TfMultiLayerPerceptron(eta=0.01, 
                             epochs=100, 
                             hidden_layers=layer,
                             activations=['relu' for i in range(len(layer))],
                             print_progress=3, 
                             minibatches=mini_batch, 
                             optimizer='adam',
                             random_seed=1)
	mlp.fit(trX,trY)
	pred=mlp.predict(teX)
	print _f_count(teY),"test f count"
	pred=pred.astype(np.int32)
	print _f_count(pred),"pred f count"
	conf_mat=confusion_matrix(teY, pred)
	process_cm(conf_mat, to_print=True)
	print precision_score(teY,pred),"Precision Score"
	print recall_score(teY,pred),"Recall Score"
	print roc_auc_score(teY,pred), "ROC_AUC"
Example #16
def predictSVD(svd, row, column, d):
    # start = timeit.default_timer()
    u = svd[0] #clf.components_ 
    s = svd[1] #clf.explained_variance_
    vt = svd[2] #clf.fit_transform(X)
    # print "   fitting done.";
    # stop = timeit.default_timer()
    # print "   runtime: " + str(stop - start)
    # print "d:"
    # print d

    # matrixY = clf.components_ 
    probsY = []
    # print "dot products:"
    for i in range(len(row)):
        # print np.dot(u[:,column[i]], v[row[i],:])
        prob = np.sum(u[column[i],:]*s*vt[:,row[i]])
        if(prob < 0): prob = 0
        if(prob > 1): prob = 1
        probsY.append(prob)

    probsY = np.array(probsY)
    preds = np.zeros(shape=len(probsY))
    preds[probsY >= 0.5] = 1

    print "Precision"
    print precision_score(d, preds)
    print "Recall"
    print recall_score(d, preds)
    print "F-Score"
    print f1_score(d, preds)

    return probsY, preds
    def trainModel(self,folds):
        
        kf = cross_validation.StratifiedKFold(self.y_total,n_folds=folds,shuffle=True,random_state=random.randint(1,100))

        for (train_index,test_index) in (kf):
          
            self.X_train = [self.X_total[i] for i in train_index]
            self.X_test = [self.X_total[i] for i in test_index] 
            self.y_train = [self.y_total[i] for i in train_index]
            self.y_test = [self.y_total[i] for i in test_index] 

            print "################"
            print "Original"
            print np.array(self.y_test)
            print "################"
            self.clf = self.clf.fit(self.X_train,self.y_train)
            print "Predicted"
            y_pred = self.clf.predict(self.X_test)
            print y_pred
            print "################"
            print "Evaluation\n"           
            cm = confusion_matrix(self.y_test,y_pred)            
            print cm
            print "Precision Score:"
            print precision_score(self.y_test,y_pred,average="macro")
            print "Recall Score:"
            print recall_score(self.y_test,y_pred,average="macro") 
            print "Accuracy Score:"
            print accuracy_score(self.y_test,y_pred)
def nearest_centroid(input_file,Output,test_size):
    ncol=tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1))
    X = data[:,1:]
    y = data[:,0]
    n_samples, n_features = X.shape
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)
    print X_train.shape, X_test.shape
    clf = NearestCentroid()
    clf.fit(X_train,y_train)
    y_pred = clf.predict(X_test)
    print "Nearest Centroid Classifier "
    print "classification accuracy:", metrics.accuracy_score(y_test, y_pred)
    print "precision:", metrics.precision_score(y_test, y_pred)
    print "recall:", metrics.recall_score(y_test, y_pred)
    print "f1 score:", metrics.f1_score(y_test, y_pred)
    print "\n"
    results = Output+"Nearest_Centroid_metrics_test.txt"
    file = open(results, "w")
    file.write("Nearest Centroid Classifier estimator accuracy\n")
    file.write("Classification Accuracy Score: %f\n"%metrics.accuracy_score(y_test, y_pred))
    file.write("Precision Score: %f\n"%metrics.precision_score(y_test, y_pred))
    file.write("Recall Score: %f\n"%metrics.recall_score(y_test, y_pred))
    file.write("F1 Score: %f\n"%metrics.f1_score(y_test, y_pred))
    file.write("\n")
    file.write("True Value, Predicted Value, Iteration\n")
    for n in xrange(len(y_test)):
        file.write("%f,%f,%i\n"%(y_test[n],y_pred[n],(n+1)))
    file.close()
    title = "Nearest Centroid %f"%test_size
    save = Output + "Nearest_Centroid_confusion_matrix"+"_%s.png"%test_size
    plot_confusion_matrix(y_test, y_pred,title,save)
    lvltrace.lvltrace("LVLSortie dans nearest_centroid split_test")
def extratreeclassifier(input_file,Output,test_size):
    lvltrace.lvltrace("LVLEntree dans extratreeclassifier split_test")
    ncol=tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1))
    X = data[:,1:]
    y = data[:,0]
    n_samples, n_features = X.shape
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)
    print X_train.shape, X_test.shape
    clf = ExtraTreesClassifier(n_estimators=10)
    clf.fit(X_train,y_train)
    y_pred = clf.predict(X_test)
    print "Extremely Randomized Trees"
    print "classification accuracy:", metrics.accuracy_score(y_test, y_pred)
    print "precision:", metrics.precision_score(y_test, y_pred)
    print "recall:", metrics.recall_score(y_test, y_pred)
    print "f1 score:", metrics.f1_score(y_test, y_pred)
    print "\n"
    results = Output+"_Extremely_Random_Forest_metrics_test.txt"
    file = open(results, "w")
    file.write("Extremely Random Forest Classifier estimator accuracy\n")
    file.write("Classification Accuracy Score: %f\n"%metrics.accuracy_score(y_test, y_pred))
    file.write("Precision Score: %f\n"%metrics.precision_score(y_test, y_pred))
    file.write("Recall Score: %f\n"%metrics.recall_score(y_test, y_pred))
    file.write("F1 Score: %f\n"%metrics.f1_score(y_test, y_pred))
    file.write("\n")
    file.write("True Value, Predicted Value, Iteration\n")
    for n in xrange(len(y_test)):
        file.write("%f,%f,%i\n"%(y_test[n],y_pred[n],(n+1)))
    file.close()
    title = "Extremely Randomized Trees %f"%test_size
    save = Output + "Extremely_Randomized_Trees_confusion_matrix"+"_%s.png"%test_size
    plot_confusion_matrix(y_test, y_pred,title,save)
    lvltrace.lvltrace("LVLSortie dans extratreeclassifier split_test")
def gaussianNB(input_file,Output,test_size):
    lvltrace.lvltrace("LVLEntree dans gaussianNB split_test")
    ncol=tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1))
    X = data[:,1:]
    y = data[:,0]
    n_samples, n_features = X.shape
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)
    print X_train.shape, X_test.shape
    # Instantiate the estimator
    clf = GaussianNB()
    # Fit the estimator to the data
    clf.fit(X_train, y_train)
    # Use the model to predict the last several labels
    y_pred = clf.predict(X_test)
    print "Gaussian Naive Bayes estimator accuracy "
    print "classification accuracy:", metrics.accuracy_score(y_test, y_pred)
    print "precision:", metrics.precision_score(y_test, y_pred)
    print "recall:", metrics.recall_score(y_test, y_pred)
    print "f1 score:", metrics.f1_score(y_test, y_pred)
    results = Output+"GaussianNB_metrics_test.txt"
    file = open(results, "w")
    file.write("Gaussian Naive Bayes estimator accuracy\n")
    file.write("Classification Accuracy Score: %f\n"%metrics.accuracy_score(y_test, y_pred))
    file.write("Precision Score: %f\n"%metrics.precision_score(y_test, y_pred))
    file.write("Recall Score: %f\n"%metrics.recall_score(y_test, y_pred))
    file.write("F1 Score: %f\n"%metrics.f1_score(y_test, y_pred))
    file.write("True Value, Predicted Value, Iteration\n")
    for n in xrange(len(y_test)):
        file.write("%f,%f,%i\n"%(y_test[n],y_pred[n],(n+1)))
    file.close()
    title = "Gaussian Naive Bayes %f"%test_size
    save = Output + "Gaussian_NB_confusion_matrix"+"_%s.png"%test_size
    plot_confusion_matrix(y_test, y_pred,title,save)
    lvltrace.lvltrace("LVLsortie dans gaussianNB split_test")
 def evaluate(self, feats, tag_set):
     """
     Tag the held-out labeled corpus (`feats`, `tag_set`) and return
     token-level accuracy and whole-sentence accuracy.
     """
     corect=0
     incorect=0        
     #nul = 0
     #ful = 0
     per_sen = 0
     all_pre = []
     all_tru = []
     
     for (tokens, tags) in zip(feats, tag_set):
         yyhat= self.tag(tokens)
         
         all_pre.extend(yyhat)
         all_tru.extend(tags)            
         
         cor_sen = 0
         for pre, tag in zip(yyhat, tags):
             if pre==tag:
                 corect+=1
                 cor_sen+=1
             else:
                 incorect+=1
         if cor_sen == len(yyhat):
             per_sen+=1
             
     print metrics.recall_score(all_tru, all_pre, average=None)
     
     return float(corect)/(corect+incorect), float(per_sen)/len(tag_set) #nul/corect, ful/corect
Example #22
def score(y_true, y_pred):
    precision_weighted = metrics.precision_score(
        y_true, y_pred, average='weighted')
    precision_ave = np.mean(metrics.precision_score(
        y_true, y_pred, average=None)[::12])

    recall_weighted = metrics.recall_score(
        y_true, y_pred, average='weighted')
    recall_ave = np.mean(metrics.recall_score(
        y_true, y_pred, average=None)[::12])

    f1_weighted = metrics.f1_score(
        y_true, y_pred, average='weighted')
    f1_ave = np.mean(metrics.f1_score(
        y_true, y_pred, average=None)[::12])

    stat_line = "  Precision: %0.4f\t Recall: %0.4f\tf1: %0.4f"
    res1 = "Weighted: " + stat_line % (100*precision_weighted,
                                       100*recall_weighted,
                                       100*f1_weighted)

    res2 = "Averaged: " + stat_line % (100*precision_ave,
                                       100*recall_ave,
                                       100*f1_ave)
    res3 = "-"*72
    outputs = [res3, res1, res2, res3]
    return "\n".join(outputs)
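A toy call of score (assumed labels; the [::12] stride suggests the label space is organised in groups of twelve, e.g. one chord quality per root, so the "Averaged" row samples one class per group):

# toy usage sketch; assumes `from sklearn import metrics` and `import numpy as np`
y_true = [0, 1, 2, 1, 0, 2, 2, 1]
y_pred = [0, 2, 2, 1, 0, 1, 2, 1]
print(score(y_true, y_pred))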
def main():
    resize_shape = 64
    print "data is loading..."
    train_X, train_Y, test_X, test_Y = load_data(resize_shape)
    print "data is loaded"
    print "feature engineering..."
    learning_rate = 0.01
    training_iters = 100000
    batch_size = 128
    display_step = 10

    # Network Parameters
    n_input = resize_shape*resize_shape # MNIST data input (img shape: 28*28)
    n_classes = 62 # MNIST total classes (0-9 digits)
    dropout = 0.5 # Dropout, probability to keep units

    with tf.Session() as sess:
        cnn = CNN(sess, learning_rate, training_iters, batch_size, display_step, n_input, n_classes, dropout,resize_shape)
        train_X = cnn.inference(train_X)
        test_X = cnn.inference(test_X)

    print "feature engineering is complete"

    print 'training phase'
    clf = svm.LinearSVC().fit(train_X, train_Y)
    print 'test phase'
    predicts = clf.predict(test_X)

    # measure function
    print 'measure phase'
    print confusion_matrix(test_Y, predicts)
    print f1_score(test_Y, predicts, average=None)
    print precision_score(test_Y, predicts, average=None)
    print recall_score(test_Y, predicts, average=None)
    print accuracy_score(test_Y, predicts)
def SVC_linear(input_file,Output):
    lvltrace.lvltrace("LVLEntree dans SVC_linear")
    ncol=tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1))
    X = data[:,1:]
    y = data[:,0]
    n_samples, n_features = X.shape
    clf=svm.SVC(kernel='linear')
    clf.fit(X,y)
    y_pred = clf.predict(X)
    print "#########################################################################################################\n"
    print "C-Support Vector Classification (with linear kernel) "
    print "classification accuracy:", metrics.accuracy_score(y, y_pred)
    print "precision:", metrics.precision_score(y, y_pred)
    print "recall:", metrics.recall_score(y, y_pred)
    print "f1 score:", metrics.f1_score(y, y_pred)
    print "\n"
    print "#########################################################################################################\n"
    results = Output+"SVM_Linear_Kernel_metrics.txt"
    file = open(results, "w")
    file.write("Support Vector Machine with Linear Kernel estimator accuracy\n")
    file.write("Classification Accuracy Score: %f\n"%metrics.accuracy_score(y, y_pred))
    file.write("Precision Score: %f\n"%metrics.precision_score(y, y_pred))
    file.write("Recall Score: %f\n"%metrics.recall_score(y, y_pred))
    file.write("F1 Score: %f\n"%metrics.f1_score(y, y_pred))
    file.write("\n")
    file.write("True Value, Predicted Value, Iteration\n")
    for n in xrange(len(y)):
        file.write("%f,%f,%i\n"%(y[n],y_pred[n],(n+1)))
    file.close()
    title = "SVC - linear Kernel"
    save = Output + "SVC_linear_confusion_matrix.png"
    plot_confusion_matrix(y, y_pred,title,save)
    lvltrace.lvltrace("LVLSortie dans SVC_linear")
def SVC_linear(input_file,Output,test_size):
    lvltrace.lvltrace("LVLEntree dans SVC_linear split_test")
    ncol=tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1))
    X = data[:,1:]
    y = data[:,0]
    n_samples, n_features = X.shape
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)
    print X_train.shape, X_test.shape
    clf=svm.SVC(kernel='linear')
    clf.fit(X_train,y_train)
    y_pred = clf.predict(X_test)
    print "C-Support Vector Classification (with linear kernel) "
    print "y_test, y_pred, iteration"
    print "classification accuracy:", metrics.accuracy_score(y_test, y_pred)
    print "precision:", metrics.precision_score(y_test, y_pred)
    print "recall:", metrics.recall_score(y_test, y_pred)
    print "f1 score:", metrics.f1_score(y_test, y_pred)
    print "\n"
    results = Output+"SVM_Linear_Kernel_metrics_test.txt"
    file = open(results, "w")
    file.write("Support Vector Machine with Linear Kernel estimator accuracy\n")
    file.write("Classification Accuracy Score: %f\n"%metrics.accuracy_score(y_test, y_pred))
    file.write("Precision Score: %f\n"%metrics.precision_score(y_test, y_pred))
    file.write("Recall Score: %f\n"%metrics.recall_score(y_test, y_pred))
    file.write("F1 Score: %f\n"%metrics.f1_score(y_test, y_pred))
    file.write("\n")
    file.write("True Value, Predicted Value, Iteration\n")
    for n in xrange(len(y_test)):
        file.write("%f,%f,%i\n"%(y_test[n],y_pred[n],(n+1)))
    file.close()
    title = "SVC linear %f"%test_size
    save = Output + "SVC_linear_confusion_matrix"+"_%s.png"%test_size
    plot_confusion_matrix(y_test, y_pred,title,save)
    lvltrace.lvltrace("LVLsortie dans SVC_linear split_test")
Example #26
	def cv(self, X, y, eval_size=.33, nfold=3):
		metrics=['roc_auc','f1','recall','precision']
		Xtrain, Xeval , ytrain, yeval = train_test_split(X,y,test_size=eval_size)
		Xtrain = Xtrain.reset_index(drop=True)
		Xeval = Xeval.reset_index(drop=True)
		ytrain = ytrain.reset_index(drop=True)
		yeval = yeval.reset_index(drop=True)
		self.fit(Xtrain, ytrain)

		ypred = self.predict(Xeval)
		yprob = self.predict_proba(Xeval)

		eroc = roc_auc_score(yeval, yprob)
		ef1 = f1_score(yeval, ypred)
		erecall = recall_score(yeval, ypred)
		eprecision = precision_score(yeval, ypred)

		# print confusion_matrix(yeval, ypred, labels = [0,1])
		# eroc = roc_auc_score(yeval, yprob, sample_weight=sw)
		# ef1 = f1_score(yeval, ypred,sample_weight=sw)
		# erecall = recall_score(yeval, ypred, sample_weight=sw)
		# eprecision = precision_score(yeval, ypred, sample_weight=sw)
		escores = [eroc, ef1, erecall, eprecision]

		
		skfscores = []

		skf = StratifiedKFold(ytrain,n_folds=nfold, random_state=2016)
		for trainIndex, testIndex in skf:
			skfxtrain, skfxtest = X.loc[trainIndex,:], X.loc[testIndex,:]
			skfytrain, skfytest = y.values[trainIndex], y.values[testIndex]
			self.fit(skfxtrain, skfytrain)
			ypred = self.predict(skfxtest)
			yprob = self.predict_proba(skfxtest)

			# roc = roc_auc_score(skfytest, yprob, sample_weight=sw)
			# f1 = f1_score(skfytest, ypred, sample_weight=sw)
			# recall = recall_score(skfytest, ypred,sample_weight=sw)
			# precision = precision_score(skfytest, ypred, sample_weight=sw)

			roc = roc_auc_score(skfytest, yprob)
			f1 = f1_score(skfytest, ypred)
			recall = recall_score(skfytest, ypred)
			precision = precision_score(skfytest, ypred)

			# print confusion_matrix(skfytest, ypred, labels=[0,1])

			scores = [roc, f1, recall, precision]
			print 'cv scores:'
			print scores
			skfscores.append(scores)

		skfscores = np.array(skfscores)	
		skfscores = skfscores.mean(0)		
		
		report = pd.DataFrame({'eval': escores, 'train': skfscores}, index=metrics)

		return report
def DTree(X, Y, XTest, YTest):
    print '-----------------------------------------------------'
    # dot_data = StringIO()
    # tree.export_graphviz(dtree_model, out_file=dot_data)
    # graph = pydot.graph_from_dot_data(dot_data.getvalue())
    # graph.write_pdf("../dtree.pdf")

    # param_grid = {'max_depth': np.arange(1, 15)}

    # tree_grid = GridSearchCV(DecisionTreeClassifier(), param_grid)
    tree_grid = DecisionTreeClassifier(max_depth=3)
    tree_grid.fit(X, Y)
    dot_data = StringIO()  # assumes StringIO is imported at module level
    export_graphviz(tree_grid, out_file=dot_data)
    graph = pydot.graph_from_dot_data(dot_data.getvalue())
    graph.write_pdf("dtreevis.pdf")

    # print("The best parameters are %s with a score of %0.2f"
    #       % (tree_grid.best_params_, tree_grid.best_score_))

    print "Computing training statistics"
    dtree_predict_time_training = time.time()
    Ypred_dtree_training = tree_grid.predict(X)
    dtree_predict_time_training = time.time() - dtree_predict_time_training

    dtree_accuracy_training = metrics.accuracy_score(Y, Ypred_dtree_training)
    dt_precision_training = metrics.precision_score(Y, Ypred_dtree_training,
                                                    average='binary')
    dtree_recall_training = metrics.recall_score(Y, Ypred_dtree_training,
                                                 average='binary')

    print "DT training prediction time: " + str(dtree_predict_time_training)
    print "DT training accuracy Score: " + str(dtree_accuracy_training)
    print "DT training precision Score: " + str(dt_precision_training)
    print "DT training recall Score: " + str(dtree_recall_training)

    print "Computing testing statistics"
    dtree_predict_time_test = time.time()
    Ypred_dtree_test = tree_grid.predict(XTest)
    dtree_predict_time_test = time.time() - dtree_predict_time_test

    dtree_accuracy_test = metrics.accuracy_score(YTest, Ypred_dtree_test)
    dt_precision_test = metrics.precision_score(YTest, Ypred_dtree_test,
                                                average='binary')
    dtree_recall_test = metrics.recall_score(YTest, Ypred_dtree_test,
                                             average='binary')

    print "DT test prediction time: " + str(dtree_predict_time_test)
    print "DT test accuracy Score: " + str(dtree_accuracy_test)
    print "DT test precision Score: " + str(dt_precision_test)
    print "DT test recall Score: " + str(dtree_recall_test)

    print "Creating ROC curve"
    y_true = YTest
    y_score = tree_grid.predict_proba(XTest)
    fprSVM, trpSVM, _ = metrics.roc_curve(y_true=y_true,
                                          y_score=y_score[:, 0],
                                          pos_label=0)
    plt.plot(fprSVM, trpSVM, 'r-', label='DT')
Example #28
def print_scores(model, X_train, y_train, X_test, y_test):
    """
    Compute scores for given model with training and test sets
    
    Input:
        model (sklearn.linear_model): the model with which to calculate scores
        X_train (numpy_array): training design matrix X
        y_train (numpy_array): training labels y
        X_test (numpy_array): test design matrix X
        y_test (numpy_array): test labels y
        
    Output:
        F1-score in test set
    
    Side Effects:
        prints the scores
        
    Comments:
        model must be fitted before calling this function
    
    """
    
    y_train_predicted = model.predict(X_train)
    y_test_predicted = model.predict(X_test)

    
    # accuracy scores
    print("Accuracy")
    print("Train: ", model.score(X_train,y_train))
    print("Test: ", model.score(X_test, y_test))
    print("\n")

    # use precision and recall metrics
    from sklearn.metrics import precision_score, recall_score

    precision_train = precision_score(y_train, y_train_predicted)
    recall_train = recall_score(y_train, y_train_predicted)

    precision_test = precision_score(y_test, y_test_predicted)
    recall_test = recall_score(y_test, y_test_predicted)

    print("Precision and Recall")
    print ("Train: ", precision_train, recall_train)
    print ("Test: ", precision_test, recall_test)
    print("\n")


    # F1 score
    from tilestools import F1score
    f1_train = F1score(y_train, y_train_predicted)
    f1_test = F1score(y_test, y_test_predicted)

    print("F1 score")
    print ("Train: ", f1_train)
    print ("Test: ", f1_test)

    
    return f1_test
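A hypothetical usage sketch for print_scores with toy data and a scikit-learn classifier; the tilestools.F1score import inside the function must resolve for the F1 part to run:

# hypothetical usage; assumes the tilestools module used above is importable
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=200, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
model = LogisticRegression().fit(X_train, y_train)
f1_test = print_scores(model, X_train, y_train, X_test, y_test)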
Example #29
def train_folds(fit_predict, X, Y, args, n_combs=1, random_fold=False, get_train_acc=False):

	# Splits the data into k folds, calls the trainer function (fit_predict)
	# and displays results

	Y_test_true_all = []
	Y_test_pred_all = []
	if get_train_acc:
		Y_train_true_all = []
		Y_train_pred_all = []

	for i in range(n_combs):

		print('Cross-validating combination n ' + str(i+1))
		sys.stdout.flush()

		if random_fold:
			state = None
		else:
			state = i

		skf = StratifiedKFold(Y, n_folds=5, random_state=state, shuffle=True)

		for train_index, test_index in skf:

			X_train = sub_list(X, train_index)
			X_test = sub_list(X, test_index)
			Y_train = sub_list(Y, train_index)
			Y_test = sub_list(Y, test_index)

			if get_train_acc:
				(Y_train_pred, Y_test_pred) = fit_predict(X_train, Y_train, X_test, args)
			else:
				Y_test_pred = fit_predict(X_train, Y_train, X_test, args)

			if get_train_acc:
				Y_train_true_all.append(Y_train)
				Y_train_pred_all.append(Y_train_pred)

			Y_test_true_all.append(Y_test)
			Y_test_pred_all.append(Y_test_pred)

	if get_train_acc:
		Y_train_true_all = concatenate(Y_train_true_all)
		Y_train_pred_all = concatenate(Y_train_pred_all)

	Y_test_true_all = concatenate(Y_test_true_all)
	Y_test_pred_all = concatenate(Y_test_pred_all)

	print('\n')
	print(classification_report(Y_test_true_all, Y_test_pred_all, target_names=target_names))

	if get_train_acc:
		return (1 - recall_score(Y_train_true_all, Y_train_pred_all, average='micro'),
				1 - recall_score(Y_test_true_all, Y_test_pred_all, average='micro'))
	else:
		return  1 - recall_score(Y_test_true_all, Y_test_pred_all, average='micro')
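train_folds only assumes that fit_predict trains on (X_train, Y_train, args) and returns test predictions (or a (train, test) prediction pair when get_train_acc is set). A minimal compatible fit_predict sketch, with a scikit-learn classifier as a stand-in:

# hypothetical fit_predict compatible with train_folds (sketch only)
from sklearn.linear_model import LogisticRegression

def fit_predict(X_train, Y_train, X_test, args):
    clf = LogisticRegression(**args)   # `args` assumed to be a dict of keyword params
    clf.fit(X_train, Y_train)
    return clf.predict(X_test)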
Example #30
    def _calc_pos_prob(self):
        y_pred = self.predict(self.validation_X)
        y_true = self.validation_y

        # obtaining recall scores for each label (assuming the labels are binary)
        pos_acc = recall_score(y_true, y_pred, average='binary', pos_label=self.positive_label)
        neg_acc = recall_score(y_true, y_pred, average='binary', pos_label=int(not self.positive_label))

        return neg_acc / (pos_acc + neg_acc)
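A standalone sketch of the per-label recall computation this method relies on (toy binary labels, positive label assumed to be 1):

# toy sketch of per-label recall via pos_label (assumes the positive label is 1)
from sklearn.metrics import recall_score

y_true = [1, 0, 1, 1, 0, 0]
y_pred = [1, 0, 0, 1, 0, 1]
pos_acc = recall_score(y_true, y_pred, average='binary', pos_label=1)
neg_acc = recall_score(y_true, y_pred, average='binary', pos_label=0)
print(pos_acc, neg_acc, neg_acc / (pos_acc + neg_acc))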
def train_model(datasetvar, dataset):
    x = datasetvar
    y = dataset['Churn'].values
    sss = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
    print(sss)
    print('Number of train/test splits:', sss.get_n_splits(x, y))

    # Build the training and test sets
    for train_index, test_index in sss.split(x, y):
        print('train:', train_index, 'test:', test_index)
        x_train, x_test = x.iloc[train_index], x.iloc[test_index]
        y_train, y_test = y[train_index], y[test_index]

        print('Original feature shape:', x.shape, 'training features:', x_train.shape,
              'test features:', x_test.shape)

        print('Original label shape:', y.shape, 'training labels:', y_train.shape,
              'test labels:', y_test.shape)

    # Use classification algorithms; ten classifiers are compared here
    Classifier = [['Random Forest', RandomForestClassifier()],
                  ['Support Vector Machine', SVC()],
                  ['LogisticRegression', LogisticRegression()],
                  ['KNN', KNeighborsClassifier(n_neighbors=5)],
                  ['Naive Bayes', GaussianNB()],
                  ['Decision Tree', DecisionTreeClassifier()],
                  ['AdaBoostClassifier', AdaBoostClassifier()],
                  ['GradientBoostingClassifier', GradientBoostingClassifier()],
                  ['XGB', XGBClassifier()],
                  ['CatBoost', CatBoostClassifier(logging_level='Silent')]]

    # Train and evaluate each classifier
    Classify_result = []
    names = []
    prediction = []
    for name, classifier in Classifier:
        classifier.fit(x_train, y_train)
        y_pred = classifier.predict(x_test)
        recall = recall_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        class_eva = pd.DataFrame([recall, precision])
        Classify_result.append(class_eva)
        name = pd.Series(name)
        names.append(name)
        y_pred = pd.Series(y_pred)
        prediction.append(y_pred)

    # Aggregate the evaluation results
    names = pd.DataFrame(names)
    names = names[0].tolist()
    result = pd.concat(Classify_result, axis=1)
    result.columns = names
    result.index = ['recall', 'precision']
    print(result)

    # Apply the model to new data
    pred_x = datasetvar.tail(10)

    # Extract customerID
    pred_id = telcom_id.tail(10)

    # Use Gaussian Naive Bayes to predict churn for the prediction set
    model = GaussianNB()
    model.fit(x_train, y_train)
    pred_y = model.predict(pred_x)

    # Prediction results
    predDf = pd.DataFrame({'customerID': pred_id, 'Churn': pred_y})
    print(predDf)
Example #32
maozi = targets['maozi']
yanjing = targets['yanjing']

RANDOM_STATE = 500
X_train, X_test, y_db_train, y_db_test = train_test_split(
    bottlenecks, kouzhao, test_size=0.15, random_state=RANDOM_STATE)
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

t1 = time.time()
# clf = DecisionTreeClassifier().fit(X_train, y_db_train)
clfb = BaggingClassifier(base_estimator=DecisionTreeClassifier(),
                         max_samples=0.5,
                         max_features=0.5).fit(X_train, y_db_train)

# predict = clf.predict(X_test)
predict = clfb.predict(X_test)

# print(clf.score(X_test, y_db_test))
print('****************metrics***************')
print(classification_report(y_db_test, predict))
print('-------precision_score:')
print(precision_score(y_db_test, predict))
print('-------recall_score:')
print(recall_score(y_db_test, predict))
print('-------F1_score:')
print(f1_score(y_db_test, predict))
print('------------time:')
print(time.time() - t1)
Example #33
def main(argv=None):
    import os
    os.environ['CUDA_VISIBLE_DEVICES'] = FLAGS.gpu_list
    try:
        os.makedirs(FLAGS.output_dir)
    except OSError as e:
        if e.errno != 17:
            raise
    """
    if FLAGS.use_vacab and os.path.exists("./vocab.txt"):
        bk_tree = BKTree(levenshtein, list_words('./vocab.txt'))
        # bk_tree = bktree.Tree()
    """
    with tf.get_default_graph().as_default() as graph:

        # define the placeholder
        input_images = tf.placeholder(tf.float32,
                                      shape=[None, None, None, 3],
                                      name='input_images')
        input_feature_map = tf.placeholder(tf.float32,
                                           shape=[None, None, None, 32],
                                           name='input_feature_map')
        input_transform_matrix = tf.placeholder(tf.float32,
                                                shape=[None, 6],
                                                name='input_transform_matrix')
        input_box_mask = []
        input_box_mask.append(
            tf.placeholder(tf.int32, shape=[None], name='input_box_masks_0'))
        input_box_widths = tf.placeholder(tf.int32,
                                          shape=[None],
                                          name='input_box_widths')

        # define the model
        # input_seq_len = input_box_widths[tf.argmax(input_box_widths, 0)] * tf.ones_like(input_box_widths)
        global_step = tf.get_variable('global_step', [],
                                      initializer=tf.constant_initializer(0),
                                      trainable=False)

        shared_feature, f_score, f_geometry = detect_part.model(input_images)
        pad_rois = roi_rotate_part.roi_rotate_tensor_pad(
            input_feature_map, input_transform_matrix, input_box_mask,
            input_box_widths)
        recognition_logits = recognize_part.build_graph(
            pad_rois, input_box_widths, class_num=FLAGS.class_num)

        variable_averages = tf.train.ExponentialMovingAverage(
            0.997, global_step)
        saver = tf.train.Saver(variable_averages.variables_to_restore())
        stats_graph(graph)

        with tf.Session(config=tf.ConfigProto(
                allow_soft_placement=True)) as sess:
            ckpt_state = tf.train.get_checkpoint_state(FLAGS.checkpoint_path)
            weight_len = len(ckpt_state.all_model_checkpoint_paths)
            # print(weight_len)

            for w in range(0, weight_len):
                model_path = os.path.join(
                    FLAGS.checkpoint_path,
                    os.path.basename(ckpt_state.all_model_checkpoint_paths[w]))
                print('Restore from {}'.format(model_path))
                saver.restore(sess, model_path)

                im_fn_list = get_images()
                all_pred = []
                all_gt = []
                iou_list = []
                during = []
                img_null = 0
                img_per_cls = len(im_fn_list) / FLAGS.class_num
                # img_per_cls = 20
                print("Img_per_cls:", img_per_cls)
                for im_fn in im_fn_list:
                    start_time = time.time()
                    im = cv2.imread(im_fn)[:, :, ::-1]

                    im_resized, (ratio_h, ratio_w) = resize_image(im)
                    # im_resized_d, (ratio_h_d, ratio_w_d) = resize_image_detection(im)

                    timer = {'detect': 0, 'restore': 0, 'nms': 0, 'recog': 0}
                    start = time.time()
                    shared_feature_map, score, geometry = sess.run(
                        [shared_feature, f_score, f_geometry],
                        feed_dict={input_images: [im_resized]})

                    boxes, timer = detect(score_map=score,
                                          geo_map=geometry,
                                          timer=timer)
                    try:
                        if boxes.shape[0] >= 1:
                            # if boxes[0][1] > boxes[1][1]:
                            #     boxes[[0, 1], :] = boxes[[1, 0], :]

                            new_boxes = []
                            for i, b in enumerate(boxes):
                                b = b[:8].reshape((4, 2))
                                b = polygon_sort(b)
                                # b = b[(1,2,3,0), :]
                                if FLAGS.back_side == 0:
                                    b = b[(2, 3, 0, 1), :]
                                new_boxes.append(b)
                            boxes = np.asarray(new_boxes, dtype=np.float32)
                    except AttributeError:
                        # print("shape not found")
                        img_null += 1

                    # read the ground-truth as four points
                    # txt_dir = "{}gt_img{}".format(FLAGS.test_gt_path, (im_fn.split("img")[1]).split(".")[0]+".txt")
                    # boxes = []
                    # with open(txt_dir, 'r')as fp:
                    #     all_lines = fp.readlines()
                    #     # print(all_lines[1])
                    #     for i, b in enumerate(all_lines):
                    #     # for b in all_lines:
                    #         box = b.split(",")[:8]
                    #         box = np.asarray(box, dtype=np.float32)
                    #         box = box.reshape((4, 2))
                    #         box = polygon_sort(box)
                    #         if i == 1:
                    #             box = box[(2, 3, 0, 1), :]
                    #         # box = sort_poly(box)
                    #         # box = box[[2, 3, 0, 1]]
                    #         box = box.reshape((-1, 8, 1))
                    #         boxes.append(box)
                    # boxes = np.asarray(boxes, dtype=np.float32)
                    # boxes = boxes*0.53333333

                    timer['detect'] = time.time() - start
                    # print(im_fn)
                    # im_num = int((im_fn.split(".")[0]).split("/")[-1])
                    im_num = int((im_fn.split(".")[0]).split("img_")[-1])
                    if (im_num % img_per_cls == 0):
                        im_gt = int((im_num / img_per_cls) - 1)
                    else:
                        im_gt = int(im_num / img_per_cls)

                    # print(im_num)
                    predict_area = np.zeros(im[:, :, ::-1].shape[:2], np.uint8)
                    if boxes is not None and boxes.shape[0] != 0:
                        res_file_path = os.path.join(
                            FLAGS.output_dir, 'res_' + '{}.txt'.format(
                                os.path.basename(im_fn).split('.')[0]))
                        input_roi_boxes = boxes[:, :8].reshape(-1, 8)
                        if input_roi_boxes.shape[0] == 1:
                            tmp_roi_boxes = input_roi_boxes[0:2]
                            boxes_masks = [0] * tmp_roi_boxes.shape[0]
                            transform_matrixes, box_widths = get_project_matrix_and_width(
                                tmp_roi_boxes)

                            # run the recognition part
                            recog_logits = sess.run(recognition_logits,
                                                    feed_dict={
                                                        input_feature_map:
                                                        shared_feature_map,
                                                        input_transform_matrix:
                                                        transform_matrixes,
                                                        input_box_mask[0]:
                                                        boxes_masks,
                                                        input_box_widths:
                                                        box_widths
                                                    })
                            # part level
                            np_pred = np.asarray(recog_logits)
                            mean_pred = np.mean(np_pred, axis=0)
                            # mean_pred = np.average(np_pred, axis=0, weights=[1, 1, 0, 1])
                            softmax_x = np.asarray(mean_pred).reshape(
                                -1).tolist()

                            softmax_x = softmax(softmax_x)
                            softmax_x = softmax_x.reshape(-1, 1)
                            im_pred = np.argmax(softmax_x, 0)
                            all_pred.append(im_pred)
                            all_gt.append(im_gt)
                            # print("ground-truth:[{}] predict:{}".format(im_gt, im_pred))

                        timer['recog'] = time.time() - start
                        duration = time.time() - start_time
                        during.append(duration)
                        # print('[timing] {}'.format(duration))

                        # Preparing for draw boxes
                        boxes = boxes[:, :8].reshape((-1, 4, 2))
                        boxes[:, :, 0] /= ratio_w
                        boxes[:, :, 1] /= ratio_h

                        # with open(res_file_path, 'w') as f:
                        for i, box in enumerate(boxes):
                            # to avoid submitting errors
                            box = sort_poly(box.astype(np.int32))
                            if np.linalg.norm(box[0] -
                                              box[1]) < 5 or np.linalg.norm(
                                                  box[3] - box[0]) < 5:
                                continue
                            """
                            if FLAGS.use_vacab:
                                fix_result = bktree_search(bk_tree, recognition_result.upper())
                                if len(fix_result) != 0:
                                    recognition_result = fix_result[0][1]
                """
                            # f.write('{},{},{},{},{},{},{},{}\r\n'.format(
                            #     box[0, 0], box[0, 1], box[1, 0], box[1, 1], box[2, 0], box[2, 1], box[3, 0],
                            #     box[3, 1]
                            # ))
                            box = box * (640 / 512)

                            # Draw bounding box
                            cv2.polylines(
                                im[:, :, ::-1],
                                [box.astype(np.int32).reshape((-1, 1, 2))],
                                True,
                                color=(0, 0, 255),
                                thickness=3)
                            cv2.fillPoly(
                                predict_area,
                                [box.astype(np.int32).reshape((-1, 1, 2))],
                                color=(255, 255, 255))
                            # Draw recognition results area
                            text_area = box.copy()
                            text_area[2, 1] = text_area[1, 1]
                            text_area[3, 1] = text_area[0, 1]
                            text_area[0, 1] = text_area[0, 1] - 15
                            text_area[1, 1] = text_area[1, 1] - 15
                            # cv2.fillPoly(im[:, :, ::-1], [text_area.astype(np.int32).reshape((-1, 1, 2))], color=(255, 255, 0))
                            im_txt = im[:, :, ::-1]
                    else:
                        res_file = os.path.join(
                            FLAGS.output_dir, 'res_' + '{}.txt'.format(
                                os.path.basename(im_fn).split('.')[0]))
                        f = open(res_file, "w")
                        im_txt = None
                        f.close()

                    # calculate the intersection of union
                    # gt_file = os.path.join(FLAGS.test_gt_path,
                    #                        '{}.txt'.format(os.path.basename(im_fn).split('.')[0]))
                    gt_file = os.path.join(
                        FLAGS.test_gt_path, 'gt_' +
                        '{}.txt'.format(os.path.basename(im_fn).split('.')[0]))
                    gt_area = draw_gt_box(gt_file)
                    # cv2.imshow('gt', gt_area)
                    # cv2.imshow('pred', predict_area)
                    # cv2.waitKey(0)
                    # cv2.destroyAllWindows()
                    iou = iou_cal(predict_area, gt_area)
                    # print("IOU: ", iou)
                    iou_list.append(iou)

                    # print('{} : detect {:.0f}ms, restore {:.0f}ms, nms {:.0f}ms'.format(
                    #     im_fn, timer['detect'] * 1000, timer['restore'] * 1000, timer['nms'] * 1000))

                    if not FLAGS.no_write_images:
                        img_path = os.path.join(FLAGS.output_dir,
                                                os.path.basename(im_fn))
                        # cv2.imwrite(img_path, im[:, :, ::-1])
                        if im_txt is not None:
                            cv2.imwrite(img_path, im_txt)

                # print(confusion_matrix(all_gt, all_pred))
                # np.savetxt("confusion_matrix.csv", confusion_matrix(all_gt, all_pred), delimiter=" ")
                average_iou = mean(iou_list)
                accuracy = accuracy_score(all_gt, all_pred)
                precision = precision_score(all_gt, all_pred, average='macro')
                recall = recall_score(all_gt, all_pred, average='macro')
                F1_score = f1_score(all_gt, all_pred, average='macro')
                average_time = mean(during)

                print(
                    "weight: {} || accuracy:{:.4f}, precision:{:.4f}, recall:{:.4f}, f1_score:{:.4f}, IoU:{:.4f}, FPS:{:.6f} \n"
                    .format((2500 * (w + 1)), accuracy, precision, recall,
                            F1_score, average_iou, 1 / average_time))
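
The helper iou_cal used above is not defined in this excerpt. A minimal mask-based sketch of what it could look like, assuming both arguments are single-channel uint8 masks of the same shape (an inferred implementation, not the source's):

import numpy as np

def iou_cal(predict_area, gt_area):
    # Treat any non-zero pixel as foreground.
    pred = predict_area > 0
    gt = gt_area > 0
    union = np.logical_or(pred, gt).sum()
    if union == 0:
        return 0.0
    intersection = np.logical_and(pred, gt).sum()
    return intersection / union
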
def train_model(model, criterion, optimizer, scheduler, num_epochs=25):
    since = time.time()
    # all_results = open(satellite + '_results.csv', 'w')
    best_model_wts = model.state_dict()
    best_acc = 0.0
    best_train_acc = 0.0
    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                scheduler.step()
                model.train(True)  # Set model to training mode
                dataloders = dataloaders_train
                current_dataset = dataset_train
            else:
                model.train(False)  # Set model to evaluate mode
                dataloders = dataloaders_test
                current_dataset = dataset_test

            dataset_size = len(current_dataset)

            running_loss = 0.0
            running_corrects = torch.zeros(len(columns))
            running_preds = None
            running_labels = None
            running_scores = None

            # Iterate over data.
            for data in dataloders:
                # get the inputs
                inputs = data['image']
                if continuous:
                    labels = data['labels'].type(torch.FloatTensor)
                else:
                    labels = data['labels'].type(torch.LongTensor)

                # wrap them in Variable
                if use_gpu:
                    inputs = Variable(inputs.cuda())
                    labels = Variable(labels.cuda())
                else:
                    inputs, labels = Variable(inputs), Variable(labels)

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward
                if use_five_bands:
                    convolved = convolver(inputs)
                    outputs = model(convolved)
                else:
                    outputs = model(inputs)
                scores = sigmoider(outputs)
                preds = torch.round(scores).data
                scores = scores.data
                # outputs = outputs.type(torch.cuda.LongTensor)
                loss = criterion(outputs.squeeze(),
                                 labels.type(torch.cuda.FloatTensor).squeeze())

                # backward + optimize only if in training phase
                if phase == 'train':
                    loss.backward()
                    optimizer.step()

                # statistics
                running_loss += loss.data[0]
                if not continuous:
                    running_corrects += torch.sum((preds == labels.data.type(
                        torch.cuda.FloatTensor)).type(torch.FloatTensor), 0)
                    if not continuous and epoch >= num_epochs - detailed_metrics_for:
                        if running_preds is None:
                            running_preds = preds.cpu().numpy()
                            running_labels = labels.data.cpu().numpy()
                            running_scores = scores.cpu().numpy()
                        else:
                            running_preds = np.vstack(
                                (running_preds, preds.cpu().numpy()))
                            running_labels = np.vstack(
                                (running_labels, labels.data.cpu().numpy()))
                            running_scores = np.vstack(
                                (running_scores, scores.cpu().numpy()))

                # print (preds == labels.data)

            epoch_loss = running_loss / dataset_size
            epoch_acc = running_corrects.numpy() / dataset_size
            if not continuous and epoch >= num_epochs - detailed_metrics_for:
                print('%s Loss: %.4f' % (phase, epoch_loss))
                for i, column in enumerate(columns):
                    column_labels = running_labels[:, i]
                    column_preds = running_preds[:, i]
                    column_scores = running_scores[:, i]

                    epoch_f1 = f1_score(column_labels, column_preds)
                    epoch_precision = precision_score(column_labels,
                                                      column_preds)
                    epoch_recall = recall_score(column_labels, column_preds)
                    roc_score = roc_auc_score(column_labels, column_scores)
                    print(
                        '%s Acc: %.4f F1: %.4f Precision: %.4f Recall: %.4f ROC_score: %.4f'
                        % (column, epoch_acc[i], epoch_f1, epoch_precision,
                           epoch_recall, roc_score))
                    print('Balance: %.4f' % current_dataset.balance[i])
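                    # (label - score) is most negative for a confident false positive (label 0, score near 1)
                    # and most positive for a confident false negative (label 1, score near 0).
                    # The +/- 2*label terms below push the wrong class out of range so the
                    # true-positive / true-negative picks come from the correct label subset.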
                    false_positive_index = np.argmin(column_labels -
                                                     column_scores)
                    false_negative_index = np.argmax(column_labels -
                                                     column_scores)
                    true_positive_index = np.argmin((column_labels -
                                                     column_scores) + 2 *
                                                    (1 - column_labels))
                    true_negative_index = np.argmax((column_labels -
                                                     column_scores) -
                                                    2 * column_labels)

                    false_positive_sat_index = current_dataset.indices[
                        false_positive_index] + 1, column_scores[
                            false_positive_index], column_labels[
                                false_positive_index]
                    false_negative_sat_index = current_dataset.indices[
                        false_negative_index] + 1, column_scores[
                            false_negative_index], column_labels[
                                false_negative_index]
                    true_positive_sat_index = current_dataset.indices[
                        true_positive_index] + 1, column_scores[
                            true_positive_index], column_labels[
                                true_positive_index]
                    true_negative_sat_index = current_dataset.indices[
                        true_negative_index] + 1, column_scores[
                            true_negative_index], column_labels[
                                true_negative_index]

                    print "False positive id, score, label: %d, %.4f, %d" % false_positive_sat_index
                    print "False negative id, score, label: %d, %.4f, %d" % false_negative_sat_index
                    print "True positive id, score, label: %d, %.4f, %d" % true_positive_sat_index
                    print "True negative id, score, label: %d, %.4f, %d" % true_negative_sat_index
                    print ""

                # print('{} Loss: {:.4f} Acc: {:.4f} F1: {:.4f}'.format(
                #             phase, epoch_loss, epoch_acc, epoch_f1))
            else:
                # print('{} Loss: {:.4f} Acc: {:.4f}'.format(
                #     phase, epoch_loss, epoch_acc))
                print('%s Loss: %.4f' % (phase, epoch_loss))
                for i, column in enumerate(columns):
                    if running_labels is not None:
                        epoch_f1 = f1_score(running_labels[:, i],
                                            running_preds[:, i])
                        print('%s Acc: %.4f F1: %.4f' %
                              (column, epoch_acc[i], epoch_f1))
                    else:
                        print('%s Acc: %.4f' % (column, epoch_acc[i]))

            # all_results.write(','.join([str(epoch), phase, str(epoch_loss), str(epoch_acc)]) + '\n')
            # deep copy the model
            if phase == 'val' and np.mean(epoch_acc) > best_acc:
                best_acc = np.mean(epoch_acc)
                best_model_wts = model.state_dict()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:4f}'.format(best_acc))

    # load best model weights
    model.load_state_dict(best_model_wts)
    return model, best_train_acc, best_acc
Example #35
0
    def compute_score(self, conf, hy):
        conf['_all_f1'] = M = {str(self.le.inverse_transform([klass])[0]): f1 for klass, f1 in enumerate(f1_score(self.test_y, hy, average=None))}
        conf['_all_recall'] = {str(self.le.inverse_transform([klass])[0]): f1 for klass, f1 in enumerate(recall_score(self.test_y, hy, average=None))}
        conf['_all_precision'] = {str(self.le.inverse_transform([klass])[0]): f1 for klass, f1 in enumerate(precision_score(self.test_y, hy, average=None))}

        if len(self.le.classes_) == 2:
            conf['_macrof1'] = np.mean(np.array([v for v in conf['_all_f1'].values()]))
            conf['_weightedf1'] = conf['_microf1'] = f1_score(self.test_y, hy, average='binary')
        else:
            conf['_macrof1'] = f1_score(self.test_y, hy, average='macro')
            conf['_microf1'] = f1_score(self.test_y, hy, average='micro')
            conf['_weightedf1'] = f1_score(self.test_y, hy, average='weighted')

        conf['_accuracy'] = accuracy_score(self.test_y, hy)
        if self.score.startswith('avgf1:'):
            klist = [M[x] for x in self.score.replace('avgf1:', '').split(':')]
            conf['_' + self.score] = sum(klist) / len(klist)

        conf['_score'] = conf['_' + self.score]
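
For reference, a small standalone illustration of the per-class dictionaries built in compute_score above; the label encoder and the toy labels are made up for this sketch:

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score

le = LabelEncoder().fit(['neg', 'neu', 'pos'])
test_y = le.transform(['neg', 'pos', 'neu', 'pos'])
hy = le.transform(['neg', 'neu', 'neu', 'pos'])
per_class_f1 = {str(le.inverse_transform([klass])[0]): score
                for klass, score in enumerate(f1_score(test_y, hy, average=None))}
print(per_class_f1)  # {'neg': 1.0, 'neu': 0.667, 'pos': 0.667} (rounded)
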
Example #36
0
def train_model(X, y,  mtype, cv,  
                epochs, cv_models_path, train, X_test=None, nfolds=None,
                y_test=None, rs=42, max_features=40000, maxlen=400, 
                dropout_rate=0.25, rec_units=150, embed_dim=50, 
                batch_size=256, max_sen_len=100, max_sent_amount=4,
                threshold=0.3):
    if cv:
        kf = StratifiedKFold(n_splits=nfolds, random_state=rs)
        auc = []
        roc = []
        fscore_ = [] 

        for c, (train_index, val_index) in enumerate(kf.split(X, y)):
            
            print(f' fold {c}')
            
            X_train, X_val = X[train_index], X[val_index]
            y_train, y_val = y[train_index], y[val_index] 
            
            tokenizer = keras.preprocessing.text.Tokenizer(num_words=max_features)
            tokenizer.fit_on_texts(X_train)
            
            if mtype == 'HAN':
                def clean_str(string):
                    #string = string.replace(",", ".").replace(";", ".").replace(":", ".").replace("-", ".")
                    return string.strip().lower()
                
                def tok_sentence(s):
                    temp = tokenizer.texts_to_sequences(s)
                    if len(temp)==0:
                        return np.array([0])
                    return temp
                    
                    
                train_posts = []
                train_labels = []
                train_texts = []
                
                #TRAIN
                for i, value in enumerate(X_train):
                    if(i%10000==0):
                        print(i)
                    text = clean_str(value)
                    train_texts.append(text)
                    sentences = tokenize.sent_tokenize(text)
                    sentences = tok_sentence(sentences)
                    x = len(sentences)<max_sent_amount
                    while x:
                        sentences.append(np.array([0])) 
                        x = len(sentences)<max_sent_amount
            
                    if len(sentences)>max_sent_amount:
                        sentences = sentences[0:max_sent_amount]
                    sentences = sequence.pad_sequences(sentences, maxlen=max_sen_len)
            
                    train_posts.append(sentences)
                
                val_posts = []
                val_labels = []
                val_texts = []
            
                #VAL
                for i, value in enumerate(X_val):
                    if(i%10000==0):
                        print(i)
                    text = clean_str(value)
                    val_texts.append(text)
                    sentences = tokenize.sent_tokenize(text)
                    sentences = tok_sentence(sentences)
            
            
                    x = len(sentences)<max_sent_amount
                    while x:
                        sentences.append(np.array([0])) 
                        x = len(sentences)<max_sent_amount
            
                    if len(sentences)>max_sent_amount:
                        sentences = sentences[0:max_sent_amount]
                    sentences = sequence.pad_sequences(sentences, maxlen=max_sen_len)
                    val_posts.append(sentences)
                
                X_train = np.array(train_posts)
                y_train = np.array(y_train)
                X_val =  np.array(val_posts)
                y_val = np.array(y_val)
                
                del train_posts
                del val_posts
            elif mtype =='psHAN':
                X_train = sequence.pad_sequences(tokenizer.texts_to_sequences(X_train), maxlen=max_sen_len*max_sent_amount)
                X_val = sequence.pad_sequences(tokenizer.texts_to_sequences(X_val), maxlen=max_sen_len*max_sent_amount)
                X_train = np.array([line.reshape(max_sent_amount,max_sen_len) for line in X_train])
                X_val = np.array([line.reshape(max_sent_amount,max_sen_len) for line in X_val])
            else:
                list_tokenized_train = tokenizer.texts_to_sequences(X_train)
                list_tokenized_val   = tokenizer.texts_to_sequences(X_val)
                
                X_train = sequence.pad_sequences(list_tokenized_train, maxlen=maxlen)
                X_val   = sequence.pad_sequences(list_tokenized_val, maxlen=maxlen)
            
            model = dl_model(model_type=mtype, max_features=max_features, maxlen=maxlen, 
                            dropout_rate=dropout_rate, embed_dim=embed_dim, rec_units=rec_units,
                            max_sent_len=max_sen_len, max_sent_amount=max_sent_amount)
            
            print('Fitting')
            if train:
                model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, shuffle=True, verbose=1)
                model.save_weights(f'{cv_models_path}/{mtype}_fold_{c}.h5')
            else: 
                model.load_weights(f'{cv_models_path}/{mtype}_fold_{c}.h5')
            
            probs = model.predict(X_val, batch_size=batch_size, verbose=1)
            
            #for threshold in [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]:
            threshold = threshold
            probs_class = probs.copy()
            probs_class[probs_class >= threshold] = 1 
            probs_class[probs_class < threshold] = 0
            precision = precision_score(y_val, probs_class) 
            recall    = recall_score(y_val, probs_class)
            fscore    = f1_score(y_val, probs_class)
            print(f' {threshold} fold {c} precision {round(precision, 3)} recall {round(recall, 3)} fscore {round(fscore,3)}')
            
            auc_f = average_precision_score(y_val, probs)
            
            auc.append(auc_f)
            roc_f = roc_auc_score(y_val, probs)
            roc.append(roc_f)
            fscore_.append(fscore)
            print(f'fold {c} average precision {round(auc_f, 3)}')
            print(f'fold {c} roc auc {round(roc_f, 3)}')
            
            del model
            K.clear_session()
        
        print(f'PR-C {round(np.array(auc).mean(), 3)}')
        print(f'ROC AUC {round(np.array(roc).mean(), 3)}')
        print(f'FScore {round(np.array(fscore_).mean(), 3)}')
        
        print(f'PR-C std {round(np.array(auc).std(), 3)}')
        print(f'ROC AUC std {round(np.array(roc).std(), 3)}')
        print(f'FScore std {round(np.array(fscore_).std(), 3)}')
    else:
            X_train   = X
            y_train   = y
            tokenizer = keras.preprocessing.text.Tokenizer(num_words=max_features, oov_token='unknown')
            tokenizer.fit_on_texts(X_train)
            
            
            if mtype == 'HAN':
                
                def clean_str(string):
                    #string = string.replace(",", ".").replace(";", ".").replace(":", ".").replace("-", ".")
                    return string.strip().lower()
                
                def tok_sentence(s):
                    temp = tokenizer.texts_to_sequences(s)
                    if len(temp)==0:
                        return np.array([0])
                    return temp
                
                train_posts = []
                train_labels = []
                train_texts = []
                
                # FULL TRAIN
                for i, value in enumerate(X):
                    if(i%10000==0):
                        print(i)
                    text = clean_str(value)
                    train_texts.append(text)
                    sentences = tokenize.sent_tokenize(text)
                    sentences = tok_sentence(sentences)
                    x = len(sentences)<max_sent_amount
                    while x:
                        sentences.append(np.array([0])) 
                        x = len(sentences)<max_sent_amount
                
                    if len(sentences)>max_sent_amount:
                        sentences = sentences[0:max_sent_amount]
                    sentences = sequence.pad_sequences(sentences, maxlen=max_sen_len)
                
                    train_posts.append(sentences)
                
                    
                test_posts = []
                test_labels = []
                test_texts = []
                    
                    
                #Test
                for i, value in enumerate(X_test):
                    if(i%10000==0):
                        print(i)
                    text = clean_str(value)
                    test_texts.append(text)
                    sentences = tokenize.sent_tokenize(text)
                    sentences = tok_sentence(sentences)
                    x = len(sentences)<max_sent_amount
                    while x:
                        sentences.append(np.array([0])) 
                        x = len(sentences)<max_sent_amount
                
                    if len(sentences)>max_sent_amount:
                        sentences = sentences[0:max_sent_amount]
                    sentences = sequence.pad_sequences(sentences, maxlen=max_sen_len)
                
                    test_posts.append(sentences)
                    
                    
                X_train = np.array(train_posts)
                y_train = np.array(y)
                X_test =  np.array(test_posts)
                y_test = np.array(y_test)
                
                del train_posts
                del test_posts
            elif mtype =='psHAN':
                X_train = sequence.pad_sequences(tokenizer.texts_to_sequences(X_train), maxlen=max_sen_len*max_sent_amount, padding='post')
                X_test  = sequence.pad_sequences(tokenizer.texts_to_sequences(X_test), maxlen=max_sen_len*max_sent_amount, padding='post')
                X_train = np.array([line.reshape(max_sent_amount, max_sen_len) for line in X_train])
                X_test  = np.array([line.reshape(max_sent_amount, max_sen_len) for line in X_test])
            else:
                list_tokenized_train = tokenizer.texts_to_sequences(X_train)
                list_tokenized_test  = tokenizer.texts_to_sequences(X_test)
                X_train = sequence.pad_sequences(list_tokenized_train, maxlen=maxlen)
                X_test  = sequence.pad_sequences(list_tokenized_test, maxlen=maxlen)
                
            y_train = np.array(y_train)
            y_test  = np.array(y_test)

            model = dl_model(model_type=mtype, max_features=max_features, 
                            maxlen=maxlen, dropout_rate=dropout_rate, embed_dim=embed_dim, 
                            rec_units=rec_units, max_sent_len=max_sen_len, max_sent_amount=max_sent_amount)
            
            print('Fitting')

            if train:
                model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, shuffle=True, verbose=1)
                model.save_weights(f'{cv_models_path}/{mtype}.h5')
            else: 
                model.load_weights(f'{cv_models_path}/{mtype}.h5')
            probs = model.predict(X_test, batch_size=batch_size, verbose=1)
            auc_f = average_precision_score(y_test, probs)
            roc_f = roc_auc_score(y_test, probs)
            
            
            threshold = threshold
            probs_class = probs.copy()
            probs_class[probs_class >= threshold] = 1 
            probs_class[probs_class < threshold] = 0
            precision = precision_score(y_test, probs_class) 
            recall    = recall_score(y_test, probs_class)
            fscore    = f1_score(y_test, probs_class)
            
            print('_________________________________')
            print(f'PR-C is {round(auc_f,3)}')
            print('_________________________________\n')
            
            print('_________________________________')
            print(f'ROC AUC is {round(roc_f,3)}')
            print('_________________________________')
            
            print('_________________________________')
            print(f'FScore is {round(fscore,3)}')
            print('_________________________________\n')
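
Both branches above turn probabilities into hard labels with the same thresholding pattern; a tiny standalone sketch with made-up arrays:

import numpy as np
from sklearn.metrics import precision_score, recall_score, f1_score

probs = np.array([0.05, 0.40, 0.85, 0.60])
y_true = np.array([0, 1, 1, 0])
threshold = 0.3
preds = (probs >= threshold).astype(int)
print(precision_score(y_true, preds),
      recall_score(y_true, preds),
      f1_score(y_true, preds))  # 0.667 1.0 0.8
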
def train_and_test(train_data, train_label, train_seq_length, model,
                   valid_data, valid_labels, valid_seq_length, test_file,
                   result_save_path):
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        valid_acc = []
        train_loss = []
        # accumulate test metrics across epochs (written out after training)
        epoch_acc = []
        epoch_precision = []
        epoch_recall = []
        epoch_f1 = []
        for epoch_idx in range(model.config.num_epochs):
            batch_idx = 0
            for batch_x, batch_y, batch_length in generate_batch(
                    train_data, train_label, train_seq_length,
                    model.config.batch_size):

                _states, _loss, _ = sess.run(
                    [model._states, model.loss, model.optim],
                    feed_dict={
                        model.input_x: batch_x,
                        model.input_y: batch_y,
                        model.keep_prob: model.config.dropout_keep_prob,
                        model.seq_length: batch_length
                    })

                _predict, _acc = sess.run(
                    [model.y_pred_cls, model.acc],
                    feed_dict={
                        model.input_x: batch_x,
                        model.input_y: batch_y,
                        model.keep_prob: model.config.dropout_keep_prob,
                        model.seq_length: batch_length
                    })

                valid_acc_batch = sess.run(model.acc,
                                           feed_dict={
                                               model.input_x: valid_data,
                                               model.input_y: valid_labels,
                                               model.keep_prob:
                                               model.config.dropout_keep_prob,
                                               model.seq_length:
                                               valid_seq_length
                                           })
                valid_acc.append(str(round(valid_acc_batch, 5)))
                train_loss.append(str(round(_loss, 5)))
                batch_idx += 1
                print('epoch={} | batch={} | valid_acc={} | loss={}'.format(
                    epoch_idx, batch_idx, round(valid_acc_batch, 5),
                    round(_loss, 5)))

            each_epoch_each_batch_valid_acc = result_save_path + '_each_epoch_each_batch_valid_acc.txt'
            file_1 = open(each_epoch_each_batch_valid_acc,
                          'w',
                          encoding='utf-8')
            for ii_idx, ii in enumerate(valid_acc):
                file_1.write(
                    'epoch={} | batch={} | valid_acc={} | loss={}\n'.format(
                        epoch_idx, ii_idx, ii, train_loss[ii_idx]))

            # End of this epoch: evaluate on the held-out test set #
            tmp_data, tmp_labels, tmp_seq = load_test_data(
                test_file, max_length)
            predict = sess.run(
                model.y_pred_cls,
                feed_dict={
                    model.input_x: tmp_data,
                    model.input_y: tmp_labels,
                    model.keep_prob: model.config.dropout_keep_prob,
                    model.seq_length: tmp_seq  # FIXME: a dimension mismatch raises an error here!
                })
            tmp_labels = [0 if item[0] == 1 else 1 for item in tmp_labels]
            acc = metrics.accuracy_score(tmp_labels, predict)
            precision = metrics.precision_score(tmp_labels, predict)
            recall = metrics.recall_score(tmp_labels, predict)
            f1 = metrics.f1_score(tmp_labels, predict)

            epoch_acc.append(round(acc, 5))
            epoch_precision.append(round(precision, 5))
            epoch_recall.append(round(recall, 5))
            epoch_f1.append(round(f1, 5))

        each_epoch_metrics = result_save_path + '_each_epoch_metrics.txt'
        file_2 = open(each_epoch_metrics, 'w', encoding='utf-8')
        for idx in range(len(epoch_acc)):
            file_2.write(
                'epoch={} | acc={} | precision={} | recall={} | f1={}\n'.
                format(idx, epoch_acc[idx], epoch_precision[idx],
                       epoch_recall[idx], epoch_f1[idx]))
        print('epoch={} | acc={} | precision={} | recall={} | f1={}'.format(
            epoch_idx, round(acc, 5), round(precision, 5), round(recall, 5),
            round(f1, 5)))

        file_1.close()
        file_2.close()
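
The list comprehension that flattens the one-hot tmp_labels above can also be written with numpy; a sketch assuming a two-column one-hot array:

import numpy as np

tmp_labels = np.array([[1, 0], [0, 1], [0, 1]])
flat = np.argmax(tmp_labels, axis=1)
print(flat)  # [0 1 1]
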
Example #38
0
    features_test = []
    labels_train = []
    labels_test = []
    for i in train_index:
        features_train.append(features[i])
        labels_train.append(labels[i])
    for j in test_index:
        features_test.append(features[j])
        labels_test.append(labels[j])
    #grid_search.fit(features_train, labels_train)
    #pred = grid_search.predict(features_test)
    clf.fit(features_train, labels_train)
    pred = clf.predict(features_test)
    
    # sklearn metric functions expect (y_true, y_pred) in that order
    precision_list.append(precision_score(labels_test, pred))
    recall_list.append(recall_score(labels_test, pred))
    accuracy_list.append(accuracy_score(labels_test, pred))
    f1_score_list.append(f1_score(labels_test, pred))
    
precision = (sum(precision_list))/float(len(precision_list))
recall = (sum(recall_list))/float(len(recall_list))
accuracy = (sum(accuracy_list))/float(len(accuracy_list))
f1_score = (sum(f1_score_list))/float(len(f1_score_list))

print "Precision Score :", precision
print "Recall Score :", recall
print "Accuracy Score :", accuracy
print "F1 Score :", f1_score

print clf.named_steps['pca'].explained_variance_ratio_
print clf.named_steps['pca'].components_
Example #39
0
                                                    y,
                                                    test_size=0.25,
                                                    random_state=0)

shuffle_index = np.random.permutation(len(X_train))
X_train, y_train = X_train[shuffle_index], y_train[shuffle_index]

# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)

from sklearn.svm import SVC
clf = SVC(kernel='linear', random_state=0)
clf.fit(X_train, y_train)

# Cross Validation
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_predict
cross_val_score(clf, X_train, y_train, cv=3, scoring='accuracy')
y_train_pred = cross_val_predict(clf, X_train, y_train, cv=3)
cm = confusion_matrix(y_train, y_train_pred)
print(cm)

from sklearn.metrics import precision_score, recall_score
print("precision score = {0:.4f}".format(precision_score(
    y_train, y_train_pred)))
print("recall score =  {0:.4f}".format(recall_score(y_train, y_train_pred)))
Example #40
0
                  ('form', LogisticRegression())])

# evaluate model
scores = {
    'model': str(model),
    'name': str(model),
    'accuracy': [],
    'precision': [],
    'recall': [],
    'f1': [],
    'time': [],
}

for X_train, X_test, y_train, y_test in loader:
    start = time.time()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    scores['time'].append(time.time() - start)
    scores['accuracy'].append(accuracy_score(y_test, y_pred))
    scores['precision'].append(
        precision_score(y_test, y_pred, average='weighted'))
    scores['recall'].append(recall_score(y_test, y_pred, average='weighted'))
    scores['f1'].append(f1_score(y_test, y_pred, average='weighted'))

    print('Time: {:3.3f} Accuracy: {:0.3f}'.format(
        time.time() - start, accuracy_score(y_test, y_pred)))
    print(list(X_train))
    print(y_pred)

print('Final Accuracy: {:0.3f}'.format(np.mean(scores['accuracy'])))
Example #41
0
def Recall(y_true, y_pred):
    return "The Recall is %f%%." % metrics.recall_score(y_true, y_pred) * 100
Example #42
0
    y_train_valid_proba_vals = lgb_clf.predict_proba(x_train_valid_transformed)
    unique_probas = np.unique(y_train_valid_proba_vals)
    thr_grid = np.linspace(np.percentile(unique_probas, 1),
                           np.percentile(unique_probas, 99), 100)

    precision_scores_G, recall_scores_G = [
        np.zeros(thr_grid.size),
        np.zeros(thr_grid.size)
    ]
    #     y_train_valid_pred_probas = lgb_clf.predict_proba(x_train_valid_transformed)
    for gg, thr in enumerate(thr_grid):
        curr_thr_y_preds = y_train_valid_proba_vals[:, 1] >= thr_grid[gg]
        precision_scores_G[gg] = precision_score(y_train_valid,
                                                 curr_thr_y_preds)
        recall_scores_G[gg] = recall_score(y_train_valid, curr_thr_y_preds)

    keep_inds = precision_scores_G >= fixed_precision
    if keep_inds.sum() > 0:
        precision_scores_G = precision_scores_G[keep_inds]
        recall_scores_G = recall_scores_G[keep_inds]
        thr_grid = thr_grid[keep_inds]
        best_ind = np.argmax(recall_scores_G)
        best_thr = thr_grid[best_ind]
        thr_list.append(best_thr)

        thr_perf_df = pd.DataFrame(
            np.vstack([
                thr_grid[np.newaxis, :], precision_scores_G[np.newaxis, :],
                recall_scores_G[np.newaxis, :]
            ]).T,
# Train test split control
splits = 5
kfold = KFold(n_splits=splits)
for i, (train, test) in enumerate(kfold.split(X=X, y=y)):
    print(f'Currently in Run {i + 1}')
    X_train, X_test, y_train, y_test = X.iloc[train, :], X.iloc[
        test, :], y.iloc[train], y.iloc[test]

    for model in models:
        grid_cv = GridSearchCV(estimator=models[model]['model'],
                               param_grid=models[model]['params'],
                               n_jobs=-1,
                               verbose=True)
        grid_cv.fit(X=X_train, y=y_train)

        preds = grid_cv.predict(X=X_test)
        results[model]['metrics']['acc'].append(
            accuracy_score(y_true=y_test, y_pred=preds))
        results[model]['metrics']['precision'].append(
            precision_score(y_true=y_test, y_pred=preds, pos_label='yes'))
        results[model]['metrics']['recall'].append(
            recall_score(y_true=y_test, y_pred=preds, pos_label='yes'))
        results[model]['probs'].append(grid_cv.predict_proba(X=X_test))
        results[model]['preds'].append(preds)
        results[model]['parameter_rank'].append(
            grid_cv.cv_results_['rank_test_score'])

# select the best parameter set across all runs (by lowest cumulative rank)

grid_cv.cv_results_['params'][np.argmin(sum(results[model]['parameter_rank']))]
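
The last line selects the parameter set with the lowest rank summed over all runs; a tiny made-up illustration of that selection:

import numpy as np

ranks_run1 = np.array([1, 3, 2])  # rank_test_score from run 1
ranks_run2 = np.array([2, 3, 1])  # rank_test_score from run 2
best_idx = np.argmin(ranks_run1 + ranks_run2)
print(best_idx)  # 0 (ties resolve to the first index)
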
Example #44
0
def test(args):

    label_name = ['false', 'real']

    prefix = args['MODEL'] + '_' + args['BERT_CONFIG']

    bert_size = args['BERT_CONFIG'].split('-')[1]

    device = torch.device("cuda:0" if args['CUDA'] == 'gpu' else "cpu")

    print('load best model...')

    if args['MODEL'] == 'cnn':
        model = CustomBertConvModel.load(prefix + '_model.bin', device)
    elif args['MODEL'] == 'lstm':
        model = CustomBertLSTMModel.load(prefix + '_model.bin', device)

    model.to(device)

    model.eval()

    df_test = pd.read_csv(args['--test'], index_col=0)

    df_test = df_test.sort_values(by='preprocessed_text_bert' + bert_size +
                                  '_length',
                                  ascending=False)

    test_batch_size = 32

    n_batch = int(np.ceil(df_test.shape[0] / test_batch_size))

    cn_loss = torch.load('loss_func',
                         map_location=lambda storage, loc: storage).to(device)

    preprocessed_text_bert = list(df_test.preprocessed_text_bert)
    information_label = list(df_test.information_label)

    test_loss = 0.
    prediction = []
    prob = []

    softmax = torch.nn.Softmax(dim=1)

    with torch.no_grad():
        for i in range(n_batch):
            sents = preprocessed_text_bert[i * test_batch_size:(i + 1) *
                                           test_batch_size]
            targets = torch.tensor(
                information_label[i * test_batch_size:(i + 1) *
                                  test_batch_size],
                dtype=torch.long,
                device=device)
            batch_size = len(sents)

            pre_softmax = model(sents).double()

            batch_loss = cn_loss(pre_softmax, targets)
            test_loss += batch_loss.item() * batch_size
            prob_batch = softmax(pre_softmax)
            prob.append(prob_batch)

            prediction.extend(
                [t.item() for t in list(torch.argmax(prob_batch, dim=1))])

    prob = torch.cat(tuple(prob), dim=0)
    loss = test_loss / df_test.shape[0]

    pickle.dump([label_name[i] for i in prediction],
                open(prefix + '_test_prediction', 'wb'))
    pickle.dump(prob.data.cpu().numpy(),
                open(prefix + '_test_prediction_prob', 'wb'))

    accuracy = accuracy_score(df_test.information_label.values, prediction)
    matthews = matthews_corrcoef(df_test.information_label.values, prediction)

    print(
        f'F score on the test set: {f1_score(df_test.information_label.values, prediction)}'
    )
    print(f'Accuracy on the test set: {accuracy}')
    print(
        'For more information look at the metrics_csv.csv file we created for you'
    )

    precisions = {}
    recalls = {}
    f1s = {}
    aucrocs = {}

    for i in range(len(label_name)):
        prediction_ = [1 if pred == i else 0 for pred in prediction]
        true_ = [
            1 if label == i else 0
            for label in df_test.information_label.values
        ]
        f1s.update({label_name[i]: f1_score(true_, prediction_)})
        precisions.update({label_name[i]: precision_score(true_, prediction_)})
        recalls.update({label_name[i]: recall_score(true_, prediction_)})
        aucrocs.update({
            label_name[i]:
            roc_auc_score(true_, list(t.item() for t in prob[:, i]))
        })

    metrics_dict = {
        'loss': loss,
        'accuracy': accuracy,
        'matthews coef': matthews,
        'precision': precisions,
        'recall': recalls,
        'f1': f1s,
        'aucroc': aucrocs
    }

    metrics_dataframe = pd.DataFrame.from_dict(metrics_dict)
    metrics_dataframe.to_csv("metrics_csv.csv")

    pickle.dump(metrics_dict, open(prefix + '_evaluation_metrics', 'wb'))
Example #45
0
def getROCCurveMetrics():
    import matplotlib.pyplot as plt
    lab = [7, 2, 4, 0, 3, 3]
    pred = [4, 4, 4, 0, 5, 3]
    classes = np.arange(0, 8)
    n_classes = len(classes)
    print(recall_score(lab, pred, labels=classes, average='micro'))
    print(recall_score(lab, pred, labels=classes, average='macro'))
    print(precision_score(lab, pred, labels=classes, average='micro'))
    print(precision_score(lab, pred, labels=classes, average='macro'))
    print(f1_score(lab, pred, labels=classes, average='micro'))
    print(f1_score(lab, pred, labels=classes, average='macro'))
    print(roc_curve(lab, pred, pos_label=0))

    y_truth = label_binarize(lab, classes=classes)
    y_score = label_binarize(pred, classes=classes)

    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    for i in range(len(classes)):
        fpr[i], tpr[i], _ = roc_curve(y_truth[:, i], y_score[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])

    # Compute micro-average ROC curve and ROC area
    fpr["micro"], tpr["micro"], _ = roc_curve(y_truth.ravel(), y_score.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

    all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))

    # Then interpolate all ROC curves at this points
    mean_tpr = np.zeros_like(all_fpr)
    for i in range(n_classes):
        mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])

    # Finally average it and compute AUC
    mean_tpr /= n_classes

    fpr["macro"] = all_fpr
    tpr["macro"] = mean_tpr
    roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

    # Plot all ROC curves
    plt.figure()
    plt.plot(fpr["micro"],
             tpr["micro"],
             label='micro-average ROC curve (area = {0:0.2f})'
             ''.format(roc_auc["micro"]),
             color='deeppink',
             linestyle=':',
             linewidth=4)

    plt.plot(fpr["macro"],
             tpr["macro"],
             label='macro-average ROC curve (area = {0:0.2f})'
             ''.format(roc_auc["macro"]),
             color='navy',
             linestyle=':',
             linewidth=4)

    lw = 2
    #colors = cycle(['aqua', 'darkorange', 'cornflowerblue'])
    #for i, color in zip(range(n_classes), colors):
    #    plt.plot(fpr[i], tpr[i], color=color, lw=lw,
    #             label='ROC curve of class {0} (area = {1:0.2f})'
    #             ''.format(i, roc_auc[i]))

    plt.plot([0, 1], [0, 1], 'k--', lw=lw)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(
        'Some extension of Receiver operating characteristic to multi-class')
    plt.legend(loc="lower right")
    plt.show()
def create_analysis_report(model_output,
                           model_output_rounded,
                           groundtruth,
                           output_path,
                           LABELS_LIST,
                           validation_output=None,
                           validation_groundtruth=None):
    """
    Create a report of all the different evaluation metrics, including optimizing the threshold with the validation set
    if it is passed in the parameters
    """
    # Create a dataframe where we keep all the evaluations, starting by prediction accuracy
    accuracies_perclass = sum(
        model_output_rounded == groundtruth) / len(groundtruth)
    results_df = pd.DataFrame(columns=LABELS_LIST)
    results_df.index.astype(str, copy=False)
    percentage_of_positives_perclass = sum(groundtruth) / len(groundtruth)
    results_df.loc[0] = percentage_of_positives_perclass
    results_df.loc[1] = accuracies_perclass
    results_df.index = ['Ratio of positive samples', 'Model accuracy']

    # plot the accuracies per class
    results_df.T.plot.bar(figsize=(22, 12), fontsize=18)
    plt.title('Model accuracy vs the ratio of positive samples per class')
    plt.xticks(rotation=45)
    plt.savefig(os.path.join(output_path, "accuracies_vs_positiveRate.pdf"),
                format="pdf")
    plt.savefig(os.path.join(output_path, "accuracies_vs_positiveRate.png"))

    # Getting the true positive rate perclass
    true_positives_ratio_perclass = sum((model_output_rounded == groundtruth) *
                                        (groundtruth == 1)) / sum(groundtruth)
    results_df.loc[2] = true_positives_ratio_perclass
    # Get true negative ratio
    true_negative_ratio_perclass = sum(
        (model_output_rounded == groundtruth) *
        (groundtruth == 0)) / (len(groundtruth) - sum(groundtruth))
    results_df.loc[3] = true_negative_ratio_perclass
    # compute additional metrics (AUC,f1,recall,precision)
    auc_roc_per_label = roc_auc_score(groundtruth, model_output, average=None)
    precision_perlabel = precision_score(groundtruth,
                                         model_output_rounded,
                                         average=None)
    recall_perlabel = recall_score(groundtruth,
                                   model_output_rounded,
                                   average=None)
    f1_perlabel = f1_score(groundtruth, model_output_rounded, average=None)
    kappa_perlabel = [
        cohen_kappa_score(groundtruth[:, x], model_output_rounded[:, x])
        for x in range(len(LABELS_LIST))
    ]
    results_df = results_df.append(
        pd.DataFrame([
            auc_roc_per_label, recall_perlabel, precision_perlabel,
            f1_perlabel, kappa_perlabel
        ],
                     columns=LABELS_LIST))
    results_df.index = [
        'Ratio of positive samples', 'Model accuracy', 'True positives ratio',
        'True negatives ratio', "AUC", "Recall", "Precision", "f1-score",
        "Kappa score"
    ]

    # Creating evaluation plots
    plot_true_poisitve_vs_all_positives(
        model_output_rounded, groundtruth,
        os.path.join(output_path, 'TruePositive_vs_allPositives'), LABELS_LIST)
    plot_output_coocurances(model_output_rounded,
                            os.path.join(output_path, 'output_coocurances'),
                            LABELS_LIST)
    plot_false_netgatives_confusion_matrix(
        model_output_rounded, groundtruth,
        os.path.join(output_path, 'false_negative_coocurances'), LABELS_LIST)
    results_df['average'] = results_df.mean(numeric_only=True, axis=1)
    results_df.T.to_csv(os.path.join(output_path, "results_report.csv"),
                        float_format="%.2f")
    return results_df
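
The docstring mentions optimizing the decision threshold on the validation set, but that part is not shown in this excerpt. One common per-label approach, sketched under the assumption that validation_output holds probabilities and validation_groundtruth binary labels (not necessarily the author's method):

import numpy as np
from sklearn.metrics import f1_score

def tune_thresholds(validation_output, validation_groundtruth,
                    grid=np.linspace(0.05, 0.95, 19)):
    # Pick, per label, the threshold that maximizes F1 on the validation set.
    n_labels = validation_groundtruth.shape[1]
    thresholds = np.zeros(n_labels)
    for k in range(n_labels):
        f1s = [f1_score(validation_groundtruth[:, k],
                        (validation_output[:, k] >= t).astype(int))
               for t in grid]
        thresholds[k] = grid[int(np.argmax(f1s))]
    return thresholds
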
Example #47
0
    # clustering is opposite of original classification
    reassignflag = True
kmeans_predicted_test_labels = kmeans.predict(test_features)
if reassignflag:
    kmeans_predicted_test_labels = 1 - kmeans_predicted_test_labels

#calculating confusion matrix for kmeans
tn, fp, fn, tp = confusion_matrix(test_labels,
                                  kmeans_predicted_test_labels).ravel()

#scoring kmeans
kmeans_accuracy_score = accuracy_score(test_labels,
                                       kmeans_predicted_test_labels)
kmeans_precison_score = precision_score(test_labels,
                                        kmeans_predicted_test_labels)
kmeans_recall_score = recall_score(test_labels, kmeans_predicted_test_labels)
kmeans_f1_score = f1_score(test_labels, kmeans_predicted_test_labels)

#printing
print("")
print("K-Means")
print("Confusion Matrix")
print("tn =", tn, "fp =", fp)
print("fn =", fn, "tp =", tp)
print("Scores")
print("Accuracy -->", kmeans_accuracy_score)
print("Precison -->", kmeans_precison_score)
print("Recall -->", kmeans_recall_score)
print("F1 -->", kmeans_f1_score)

#k_nearest_neighbours_classification:
Example #48
0
                           d2_tr[rang], T_tr[rang], Y_tr[rang])

        if (j % N) == 0:
            pred = test_step(W_te, P_te, C_te, d1_te, d2_te, T_te, Y_te)
            print "test data size ", len(pred)
            y_true = np.argmax(Y_te, 1)
            y_pred = pred
            y_true_list.append(y_true)
            y_pred_list.append(y_pred)

    for y_true, y_pred in zip(y_true_list, y_pred_list):
        fp.write(
            str(
                precision_score(y_true,
                                y_pred, [1, 2, 3, 4, 5],
                                average='weighted')))
        fp.write('\t')
        fp.write(
            str(
                recall_score(y_true,
                             y_pred, [1, 2, 3, 4, 5],
                             average='weighted')))
        fp.write('\t')
        fp.write(
            str(f1_score(y_true, y_pred, [1, 2, 3, 4, 5], average='weighted')))
        fp.write('\t')
        fp.write('\n')

    fp.write('\n')
    fp.write('\n')
lr_param = {'C': [0.01, 0.1, 0.2, 0.5, 1, 1.5, 2],
                'class_weight': [{1: 1, 0: 1},  {1: 2, 0: 1}, {1: 3, 0: 1}, {1: 5, 0: 1}]}
lr_gsearch = GridSearchCV(
        estimator=LogisticRegression(random_state=0, fit_intercept=True, penalty='l2', solver='saga'),
        param_grid=lr_param, cv=3, scoring='f1', n_jobs=-1, verbose=2)
lr_gsearch.fit(x_train,y_train)

LR_model_2=LogisticRegression(C=lr_gsearch.best_params_['C'],penalty='l2',solver='saga',class_weight=lr_gsearch.best_params_['class_weight'])
LR_model=LR_model_2.fit(x_train,y_train)

'''Model evaluation: use ROC to assess model performance'''
from sklearn.metrics import roc_curve, auc,confusion_matrix,recall_score,precision_score,accuracy_score
y_pred=LR_model.predict(x_test)
cnf_matrix = confusion_matrix(y_test, y_pred)
recall_value = recall_score(y_test, y_pred)
precision_value = precision_score(y_test, y_pred)
acc = accuracy_score(y_test, y_pred)
print(cnf_matrix)

'''Plot the ROC curve; the resulting AUC is 0.66'''
y_score_test = LR_model.predict_proba(x_test)[:, 1]

fpr, tpr, thresholds = roc_curve(y_test, y_score_test)
roc_auc = auc(fpr, tpr)
ks = max(tpr - fpr)
ar = 2*roc_auc-1
print('test set: model AR is {0}, ks is {1}, auc = {2}'.format(ar, ks, roc_auc))
    
import matplotlib.pyplot as plt
Example #50
0
        model = train_loop(dataloaders, dataset_sizes, num_classes, config=config, epochs=15)
        

        y_pred = np.array([])

        for i in tqdm(range(len(test_set))):
            inputs = torch.Tensor([test_set[i][0]]).to(device)
            model.eval()
            outputs = model(inputs)
            preds = torch.max(outputs, 1)[1]
            y_pred = np.append(y_pred, preds.cpu().numpy())

        acc_arr.append(accuracy_score(metric_test, y_pred))
        acc_cum += acc_arr[fold_number-1]
        rec_arr.append(recall_score(metric_test, y_pred, average='macro'))
        rec_cum += rec_arr[fold_number-1]
        pre_arr.append(precision_score(metric_test, y_pred, average='macro'))
        pre_cum += pre_arr[fold_number-1]
        f1_arr.append(f1_score(metric_test, y_pred, average='macro'))
        f1_cum  += f1_arr[fold_number-1]
        f1_arr_mic.append(f1_score(metric_test, y_pred, average='micro'))
        f1_cum_mic  += f1_arr_mic[fold_number-1]
        fold_number+=1

    print("Accuracy: ", acc_cum/5)
    print("Recall: ", rec_cum/5)
    print("Precision: ", pre_cum/5)
    print("F1 score: ", f1_cum/5)
    print("F1 score Micro: ", f1_cum_mic/5)
Example #51
0
def main():
    import time
    import prettytable
    from collections import Counter
    from sklearn import tree
    from sklearn import metrics
    from sklearn import preprocessing
    from imblearn.datasets import fetch_datasets
    from imblearn.metrics import geometric_mean_score
    from sklearn.model_selection import StratifiedKFold, RepeatedStratifiedKFold

    start_time = time.time()
    dataset = fetch_datasets()['satimage']
    X = dataset.data
    y = dataset.target
    print(Counter(y))

    cv = StratifiedKFold(n_splits=10, random_state=42, shuffle=True)
    # cv = RepeatedStratifiedKFold(n_repeats=5, n_splits=10, random_state=42)
    dic = {'recall': [], 'precision': [], 'f1': [], 'auc': [], 'gmean': []}
    results = prettytable.PrettyTable(
        ["Classifier", "Precision", 'Recall', 'F-measure', 'AUC', 'G-mean'])
    for train, test in cv.split(X, y):
        # preprocessing
        scaler = preprocessing.MinMaxScaler().fit(X[train])
        X_train_minmax = scaler.transform(X[train])
        X_test_minmax = scaler.transform(X[test])
        # training
        sb = CGMOS(ratio=0.5, sigmafactor=1, random_state=42)
        # testing
        X_res, y_res = sb.fit_sample(X_train_minmax, y[train])

        model = tree.DecisionTreeClassifier(max_depth=8,
                                            min_samples_split=10,
                                            random_state=42)
        model.fit(X_res, y_res)
        predict = model.predict(X_test_minmax)
        probability = model.predict_proba(X_test_minmax)[:, 1]

        precision = metrics.precision_score(y[test], predict)
        recall = metrics.recall_score(y[test], predict)
        if precision == 0:
            f1 = 0
        else:
            f1 = 2 * (precision * recall) / (precision + recall)
        auc = metrics.roc_auc_score(y[test], probability)
        gmean = geometric_mean_score(y[test], predict)
        dic['precision'].append(precision)
        dic['recall'].append(recall)
        dic['f1'].append(f1)
        dic['auc'].append(auc)
        dic['gmean'].append(gmean)

    results.add_row([
        'CGMOS',
        np.mean(np.array(dic['precision'])),
        np.mean(np.array(dic['recall'])),
        np.mean(np.array(dic['f1'])),
        np.mean(np.array(dic['auc'])),
        np.mean(np.array(dic['gmean']))
    ])
    print(results)
    print('CGMOS building id transforming took %fs!' %
          (time.time() - start_time))
 def calculate(self):
     return GenericEvaluatorResults(metrics.recall_score(
         y_pred=np.array(self._outputs).argmax(axis=-1),
         y_true=np.array(self._targets),
         average=self._average
     ), self._average + '-recall', '%5.4f', is_max_better=True)
Example #53
0
    print("Best Selected Parameters:")
    print(tuned_model.best_params_)

    #Predict on test data. y_pred_test contains predictions for each sample
    y_pred_test = tuned_model.best_estimator_.predict(X_data_test)

    # print the prediction on test sets (if needed)
    #print( y_pred_test)

    #get class wise precision score and store the results in a list
    #Set 'average=None' to get class wise results
    precisionResult = precision_score(y_data_test, y_pred_test, average=None)
    precScoreList.append(precisionResult)

    #get class wise recall score  and store the results in a list
    recallResult = recall_score(y_data_test, y_pred_test, average=None)
    recallScoreList.append(recallResult)

    #get class wise f-measure score and store the results in a list
    fScoreResult = f1_score(y_data_test, y_pred_test, average=None)
    fScoreList.append(fScoreResult)

    print()
    print()
    print(
        "***Scikit learn will set a metric (e.g. recall) value to zero and display a warning message "
    )
    print("when no samples present for a particular class in the test set***")
#For loop ends here
#Print the results of the list that contains the results
#print(precScoreList)
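
As the printed note above says, per-class scores are reported as zero (with a warning) for classes that never occur in the test set; a small illustration (recent scikit-learn releases also accept a zero_division argument to control this):

from sklearn.metrics import recall_score

y_true = [0, 0, 1]   # class 2 never appears
y_pred = [0, 1, 1]
print(recall_score(y_true, y_pred, labels=[0, 1, 2], average=None))
# [0.5 1.  0. ]  plus an UndefinedMetricWarning for class 2
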
checkpointer = callbacks.ModelCheckpoint(filepath="kddresults/dnn4layer/checkpoint-{epoch:02d}.hdf5", verbose=1, save_best_only=True, monitor='loss')
csv_logger = CSVLogger('kddresults/dnn4layer/training_set_dnnanalysis.csv',separator=',', append=False)
model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=100, callbacks=[checkpointer,csv_logger])
model.save("kddresults/dnn4layer/dnn4layer_model.hdf5")
'''

score = []
name = []
from sklearn.metrics import confusion_matrix
import os
for file in os.listdir("kddresults/dnn4layer/"):
  model.load_weights("kddresults/dnn4layer/"+file)
  y_train1 = y_test
  y_pred = model.predict_classes(X_test)
  accuracy = accuracy_score(y_train1, y_pred)
  recall = recall_score(y_train1, y_pred , average="binary")
  precision = precision_score(y_train1, y_pred , average="binary")
  f1 = f1_score(y_train1, y_pred, average="binary")
  print("----------------------------------------------")
  print("accuracy")
  print("%.3f" %accuracy)
  print("recall")
  print("%.3f" %recall)
  print("precision")
  print("%.3f" %precision)
  print("f1score")
  print("%.3f" %f1)
  score.append(accuracy)
  name.append(file)
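
# The loop above only collects `score` and `name`; presumably the checkpoint with the highest test
# accuracy is the one to keep. A hedged sketch of that selection (the reload line is an assumption,
# it is not part of the original snippet):
best_idx = score.index(max(score))
print("best checkpoint:", name[best_idx], "accuracy: %.3f" % score[best_idx])
# model.load_weights("kddresults/dnn4layer/" + name[best_idx])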

Example #55
0
# First classifier.
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn import grid_search
from sklearn.pipeline import Pipeline
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

clf_nb = GaussianNB()
clf_nb.fit(features_train, labels_train)
pred = clf_nb.predict(features_test)
accuracy_nb = accuracy_score(pred, labels_test)

print 'NB Precision:', precision_score(labels_test, pred)
print 'NB Recall:', recall_score(labels_test, pred)
print 'NB Accuracy:', accuracy_nb

# Support Vector Machines (SVM) supervised classification algorithm and
# accuracy testing.
# Second classifier.
from sklearn.svm import SVC

clf_svm = SVC()
clf_svm.fit(features_train, labels_train)
pred = clf_svm.predict(features_test)
accuracy_svm = accuracy_score(pred, labels_test)

print 'SVM Precision:', precision_score(labels_test, pred)
print 'SVM Recall:', recall_score(labels_test, pred)
print 'SVM Accuracy:', accuracy_svm
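
# Note that accuracy_score(pred, labels_test) only gives the right value because accuracy is
# symmetric in its arguments; precision and recall are not, so the (y_true, y_pred) order used in
# the print statements matters. A small illustration with made-up labels:
from sklearn.metrics import accuracy_score, precision_score, recall_score

labels_demo = [1, 1, 0, 0, 0]
pred_demo = [1, 0, 0, 0, 0]

print(accuracy_score(labels_demo, pred_demo) == accuracy_score(pred_demo, labels_demo))  # True
print(precision_score(labels_demo, pred_demo), precision_score(pred_demo, labels_demo))  # 1.0 0.5
print(recall_score(labels_demo, pred_demo), recall_score(pred_demo, labels_demo))        # 0.5 1.0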
Example #56
0
def calculate_sample_metrics(nclasses,
                             agg,
                             gt,
                             probs,
                             doc_starts,
                             print_per_class_results=False):
    result = -np.ones(len(SCORE_NAMES) - 3)

    gt = gt.astype(int)

    probs[np.isnan(probs)] = 0
    probs[np.isinf(probs)] = 0

    # token-level metrics
    result[0] = skm.accuracy_score(gt, agg)

    # the results are undefined if some classes are not present in the gold labels
    prec_by_class = skm.precision_score(gt,
                                        agg,
                                        average=None,
                                        labels=range(nclasses))
    rec_by_class = skm.recall_score(gt,
                                    agg,
                                    average=None,
                                    labels=range(nclasses))
    f1_by_class = skm.f1_score(gt, agg, average=None, labels=range(nclasses))

    if print_per_class_results:
        print('Token Precision:')
        print(prec_by_class)

        print('Token Recall:')
        print(rec_by_class)

        print('Token F1:')
        print(f1_by_class)

    result[1] = np.mean(prec_by_class[np.unique(gt)])
    result[2] = np.mean(rec_by_class[np.unique(gt)])
    result[3] = np.mean(f1_by_class[np.unique(gt)])

    # span-level metrics - strict
    p, r, f = strict_span_metrics_2(agg, gt, doc_starts)
    result[6] = p  # precision(agg, gt, True, doc_starts)
    result[7] = r  # recall(agg, gt, True, doc_starts)
    result[8] = f  # f1(agg, gt, True, doc_starts)

    # span-level metrics -- relaxed
    result[9] = precision(agg, gt, False, doc_starts)
    result[10] = recall(agg, gt, False, doc_starts)
    result[11] = f1(agg, gt, False, doc_starts)

    auc_score = 0
    total_weights = 0
    for i in range(probs.shape[1]):

        if not np.any(gt == i) or np.all(gt == i) or np.any(
                np.isnan(probs[:, i])) or np.any(np.isinf(probs[:, i])):
            print(
                'Could not evaluate AUC for class %i -- all data points have same value.'
                % i)
            continue

        auc_i = skm.roc_auc_score(gt == i, probs[:, i])
        # print 'AUC for class %i: %f' % (i, auc_i)
        auc_score += auc_i * np.sum(gt == i)
        total_weights += np.sum(gt == i)

        if print_per_class_results:
            print('AUC for class %i = %f' % (i, auc_i))

    result[4] = auc_score / float(total_weights) if total_weights > 0 else 0

    result[5] = skm.log_loss(gt, probs, eps=1e-100, labels=np.arange(nclasses))

    return result
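
# The np.unique(gt) indexing above is what keeps classes that never occur in the gold labels from
# dragging the macro averages down. A small standalone illustration of the same idea (the labels
# below are made up; an UndefinedMetricWarning may be printed for the absent classes):
import numpy as np
import sklearn.metrics as skm

nclasses = 4
gt = np.array([0, 0, 1, 1, 1])    # classes 2 and 3 never occur in the gold labels
agg = np.array([0, 1, 1, 1, 2])

rec_by_class = skm.recall_score(gt, agg, average=None, labels=range(nclasses))
print(rec_by_class)                           # entries for classes 2 and 3 default to 0
print(np.mean(rec_by_class))                  # diluted by the absent classes
print(np.mean(rec_by_class[np.unique(gt)]))   # mean over the classes actually present, as in result[2]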
Example #57
0
    print(X_train)
    y_train, y_test = target[train_index], target[test_index]
    print(y_train)
for i in range(5, 12):
    clf = tree.DecisionTreeClassifier(max_depth=i)
    print('--------------------{}---------------------------------'.format(i))
    print('cross_val_predict')
    predicted = cross_val_predict(
        clf,
        train,
        target,
        cv=10,
    )  # predict y values for the test fold

    print('recall per class:')
    print(met.recall_score(target, predicted, average=None))

    print('precision per class:')
    print(met.precision_score(target, predicted, average=None))

    print('overall accuracy:')
    print(met.accuracy_score(target, predicted))
    print('----------------------------------------')

    fig, ax = plt.subplots()
    ax.scatter(target, predicted, edgecolors=(0, 0, 0))
    ax.plot([target.min(), target.max()],
            [target.min(), target.max()],
            'k--',
            lw=4)
    ax.set_xlabel('Measured k {}'.format(i))
Example #58
0
        def conf_mat(y_test, y_train, y_pred, y_train_pred, directory):
            # IMPORTANT: first argument is true values, second argument is predicted values
            # this produces a 2x2 numpy array (matrix)
            conf_mat_test = metrics.confusion_matrix(y_test, y_pred)
            conf_mat_3CV = metrics.confusion_matrix(y_train, y_train_pred)

            def draw_conf_mat(matrix, directory):
                datestring = datetime.strftime(datetime.now(), '%Y%m%d_%H%M')
                labels = ['0', '1']
                ax = plt.subplot()
                sns.set(font_scale=1.5)
                sns.heatmap(matrix,
                            annot=True,
                            ax=ax,
                            annot_kws={'size': 18},
                            vmin=0,
                            vmax=12)
                # plt.title('Confusion matrix of the classifier')
                ax.set_xticklabels(labels, fontdict={'fontsize': 18})
                ax.set_yticklabels(labels, fontdict={'fontsize': 18})
                plt.xlabel('Predicted')
                plt.ylabel('True')
                plt.tight_layout()
                plt.savefig(os.path.join(
                    directory,
                    'confusion_matrix_tree_rand_ada_' + datestring + '.png'),
                            dpi=600)
                plt.close()

            draw_conf_mat(conf_mat_test, directory)
            #draw_conf_mat(conf_mat_3CV, directory, 'train_CV_')

            TP = conf_mat_test[1, 1]
            TN = conf_mat_test[0, 0]
            FP = conf_mat_test[0, 1]
            FN = conf_mat_test[1, 0]

            TP_CV = conf_mat_3CV[1, 1]
            TN_CV = conf_mat_3CV[0, 0]
            FP_CV = conf_mat_3CV[0, 1]
            FN_CV = conf_mat_3CV[1, 0]

            with open(
                    os.path.join(directory,
                                 'decisiontree_ada_randomsearch.txt'),
                    'a') as text_file:
                text_file.write('confusion matrix using test set: %s \n' %
                                conf_mat_test)
                text_file.write('confusion matrix using 3-fold CV: %s \n' %
                                conf_mat_3CV)
                text_file.write(
                    'Slicing confusion matrix for test set into: TP, TN, FP, FN \n'
                )
                text_file.write(
                    'Slicing confusion matrix for 3-fold CV into: TP_CV, TN_CV, FP_CV, FN_CV \n'
                )

            #calculate accuracy
            acc_score_man_test = (TP + TN) / float(TP + TN + FP + FN)
            acc_score_sklearn_test = metrics.accuracy_score(y_test, y_pred)
            acc_score_man_CV = (TP_CV + TN_CV) / float(TP_CV + TN_CV + FP_CV +
                                                       FN_CV)
            acc_score_sklearn_CV = metrics.accuracy_score(
                y_train, y_train_pred)
            with open(
                    os.path.join(directory,
                                 'decisiontree_ada_randomsearch.txt'),
                    'a') as text_file:
                text_file.write('Accuracy score: \n')
                text_file.write('accuracy score manual test: %s \n' %
                                acc_score_man_test)
                text_file.write('accuracy score sklearn test: %s \n' %
                                acc_score_sklearn_test)
                text_file.write('accuracy score manual CV: %s \n' %
                                acc_score_man_CV)
                text_file.write('accuracy score sklearn CV: %s \n' %
                                acc_score_sklearn_CV)

            #classification error
            class_err_man_test = (FP + FN) / float(TP + TN + FP + FN)
            class_err_sklearn_test = 1 - metrics.accuracy_score(y_test, y_pred)
            class_err_man_CV = (FP_CV + FN_CV) / float(TP_CV + TN_CV + FP_CV +
                                                       FN_CV)
            class_err_sklearn_CV = 1 - metrics.accuracy_score(
                y_train, y_train_pred)
            with open(
                    os.path.join(directory,
                                 'decisiontree_ada_randomsearch.txt'),
                    'a') as text_file:
                text_file.write('Classification error: \n')
                text_file.write('classification error manual test: %s \n' %
                                class_err_man_test)
                text_file.write('classification error sklearn test: %s \n' %
                                class_err_sklearn_test)
                text_file.write('classification error manual CV: %s \n' %
                                class_err_man_CV)
                text_file.write('classification error sklearn CV: %s \n' %
                                class_err_sklearn_CV)

            #sensitivity/recall/true positive rate; correctly placed positive cases
            sensitivity_man_test = TP / float(FN + TP)
            sensitivity_sklearn_test = metrics.recall_score(y_test, y_pred)
            sensitivity_man_CV = TP_CV / float(FN_CV + TP_CV)
            sensitivity_sklearn_CV = metrics.recall_score(
                y_train, y_train_pred)
            with open(
                    os.path.join(directory,
                                 'decisiontree_ada_randomsearch.txt'),
                    'a') as text_file:
                text_file.write('Sensitivity/Recall/True positives: \n')
                text_file.write('sensitivity manual test: %s \n' %
                                sensitivity_man_test)
                text_file.write('sensitivity sklearn test: %s \n' %
                                sensitivity_sklearn_test)
                text_file.write('sensitivity manual CV: %s \n' %
                                sensitivity_man_CV)
                text_file.write('sensitivity sklearn CV: %s \n' %
                                sensitivity_sklearn_CV)

            #specificity
            specificity_man_test = TN / (TN + FP)
            specificity_man_CV = TN_CV / (TN_CV + FP_CV)
            with open(
                    os.path.join(directory,
                                 'decisiontree_ada_randomsearch.txt'),
                    'a') as text_file:
                text_file.write('Specificity: \n')
                text_file.write('specificity manual test: %s \n' %
                                specificity_man_test)
                text_file.write('specificity manual CV: %s \n' %
                                specificity_man_CV)

            #false positive rate
            false_positive_rate_man_test = FP / float(TN + FP)
            false_positive_rate_man_CV = FP_CV / float(TN_CV + FP_CV)
            with open(
                    os.path.join(directory,
                                 'decisiontree_ada_randomsearch.txt'),
                    'a') as text_file:
                text_file.write('False positive rate or 1-specificity: \n')
                text_file.write('false positive rate manual test: %s \n' %
                                false_positive_rate_man_test)
                text_file.write('1 - specificity test: %s \n' %
                                (1 - specificity_man_test))
                text_file.write('false positive rate manual CV: %s \n' %
                                false_positive_rate_man_CV)
                text_file.write('1 - specificity CV: %s \n' %
                                (1 - specificity_man_CV))

            #precision/confidence of placement
            precision_man_test = TP / float(TP + FP)
            precision_sklearn_test = metrics.precision_score(y_test, y_pred)
            precision_man_CV = TP_CV / float(TP_CV + FP_CV)
            precision_sklearn_CV = metrics.precision_score(
                y_train, y_train_pred)
            with open(
                    os.path.join(directory,
                                 'decisiontree_ada_randomsearch.txt'),
                    'a') as text_file:
                text_file.write(
                    'Precision or confidence of classification: \n')
                text_file.write('precision manual: %s \n' % precision_man_test)
                text_file.write('precision sklearn: %s \n' %
                                precision_sklearn_test)
                text_file.write('precision manual CV: %s \n' %
                                precision_man_CV)
                text_file.write('precision sklearn CV: %s \n' %
                                precision_sklearn_CV)

            #F1 score; uses precision and recall
            f1_score_sklearn_test = f1_score(y_test, y_pred)
            f1_score_sklearn_CV = f1_score(y_train, y_train_pred)
            with open(
                    os.path.join(directory,
                                 'decisiontree_ada_randomsearch.txt'),
                    'a') as text_file:
                text_file.write('F1 score: \n')
                text_file.write('F1 score sklearn test: %s \n' %
                                f1_score_sklearn_test)
                text_file.write('F1 score sklearn CV: %s \n' %
                                f1_score_sklearn_CV)
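
# scikit-learn has no dedicated specificity function, which is why it is derived from the confusion
# matrix above. If useful, the same number can be obtained by reusing recall_score with the negative
# class as the positive label. A minimal sketch with made-up labels:
from sklearn import metrics

y_true_demo = [0, 0, 0, 1, 1, 1, 1]
y_pred_demo = [0, 1, 0, 1, 1, 0, 1]

# specificity is the recall of the negative class: TN / (TN + FP)
specificity = metrics.recall_score(y_true_demo, y_pred_demo, pos_label=0)
print(specificity)  # 2 / (2 + 1) ≈ 0.667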
Example #59
0
from prepare import readbunchobj
data = readbunchobj('dataset_woe.data')
X_train = pd.DataFrame(data.X_train)
X_test = data.X_test
y_train = data.y_train
y_test = data.y_test

# # Missing-value imputation
# imp = SimpleImputer(strategy='mean')  # univariate imputation with the mean
# X_train = imp.fit_transform(X_train)  # impute the training set
# X_test = imp.transform(X_test)  # impute the test set
#
# # Standardization
# prep = StandardScaler()
# X_train = prep.fit_transform(X_train)
# X_test = prep.transform(X_test)

if0 = IsolationForest(bootstrap=True, n_jobs=-1, random_state=10)
if0.fit(X_train)
y_pred = if0.predict(X_test)
y_pred[y_pred == 1] = 0
y_pred[y_pred == -1] = 1

c_m = metrics.confusion_matrix(y_test, y_pred)
print('True negatives: {0}\nFalse negatives: {1}\nTrue positives: {2}\nFalse positives: {3}\n'.format(
    c_m[0][0], c_m[1][0], c_m[1][1], c_m[0][1]))
print("Recall: %.4f" % metrics.recall_score(y_test, y_pred))
print("Precision: %.4f" % metrics.precision_score(y_test, y_pred))
print("F1: %.4f" % metrics.f1_score(y_test, y_pred))
print("roc_auc: %.4f" % metrics.roc_auc_score(y_test, y_pred))
Example #60
0
]

y_pred_test_6_with_update = [
    -1, 22, 11, 15, 11, 22, 11, 22, 22, 22, 22, 22, 11, 22, 19, 22, 22, 22, 22,
    22, 22, 22, 11, 22, 22
]

y_pred_test_6 = [
    15, -1, 15, 15, 15, 22, 22, 22, 22, 11, 19, 19, 19, 22, 19, 22, 11, 22, 22,
    22, 22, 22, 22, 11, 11
]

if __name__ == '__main__':
    accuracy = accuracy_score(y_true, y_pred_test_1)
    precision = precision_score(y_true, y_pred_test_1, average="weighted")
    recall = recall_score(y_true, y_pred_test_1, average="weighted")

    print("Reconhecedor estático")
    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")

    accuracy = accuracy_score(y_true, y_pred_test_2)
    precision = precision_score(y_true, y_pred_test_2, average='weighted')
    recall = recall_score(y_true, y_pred_test_2, average='weighted')

    print("Reconhecedor estático apontando para uma foto no computador")
    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")