def random_forest():
    print("------------------------RANDOM FOREST-----------------------")
    df = pd.read_csv(var.get(), low_memory=False)
    df = df.sample(frac=1).reset_index(drop=True)  # shuffle rows
    frauds = df.loc[df['Class'] == 1]
    non_frauds = df.loc[df['Class'] == 0]
    print("\nWe have", len(frauds), "fraud data points and", len(non_frauds),
          "nonfraudulent data points.")
    X = df.iloc[:, :-1]
    y = df['Class']
    print("X and y sizes, respectively:", len(X), len(y))
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.35)
    print("Train and test sizes, respectively:", len(X_train), len(y_train),
          "|", len(X_test), len(y_test))
    # count frauds on each split directly (a full-df mask would misalign
    # with the split indices)
    print("Total number of frauds:", (y == 1).sum())
    print("Number of frauds on y_test:", (y_test == 1).sum())
    print("Number of frauds on y_train:", (y_train == 1).sum())
    clf = RandomForestClassifier()
    clf.fit(X_train, y_train)
    y_predicted1 = np.array(clf.predict(X_test))
    y_right1 = np.array(y_test)
    confusion_matrix1 = ConfusionMatrix(y_right1, y_predicted1)
    print("\n\nConfusion matrix:\n%s" % confusion_matrix1)
    # confusion_matrix1.plot(normalized=True)
    T = Text(root, height=60, width=60)
    T.pack(pady=20, side=BOTTOM, fill=Y)
    stats = confusion_matrix1.stats()  # compute once instead of per iteration
    for key in stats:
        T.insert(END, [key, stats[key]])
        T.insert(END, "\n")
    d['ACC'].append(stats['ACC'] * 100)
    d['TPR'].append(stats['TPR'] * 100)
    fpr, tpr, thresholds = roc_curve(y_right1, y_predicted1)
    aucarr['auc'].append(auc(fpr, tpr))
def logistic_regression():
    print("------------------------LOGISTIC REGRESSION-----------------------")
    df = pd.read_csv(var.get(), low_memory=False)
    df = df.sample(frac=1).reset_index(drop=True)  # shuffle rows
    frauds = df.loc[df['Class'] == 1]
    non_frauds = df.loc[df['Class'] == 0]
    print("\n")
    print("We have", len(frauds), "fraud data points and", len(non_frauds),
          "nonfraudulent data points.\n")
    X = df.iloc[:, :-1]
    y = df['Class']
    print("X and y sizes, respectively:", len(X), len(y))
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.35)
    # print("\nTrain and test sizes, respectively:", len(X_train), len(y_train),
    #       "|", len(X_test), len(y_test))
    # print("Total number of frauds:", len(y.loc[df['Class'] == 1]))
    # print("Number of frauds on y_test:", len(y_test.loc[df['Class'] == 1]))
    # print("Number of frauds on y_train:", len(y_train.loc[df['Class'] == 1]))
    logistic = linear_model.LogisticRegression(C=1e5)
    logistic.fit(X_train, y_train)
    print("\nScore: ", logistic.score(X_test, y_test))
    y_predicted = np.array(logistic.predict(X_test))
    y_right = np.array(y_test)
    confusion_matrix = ConfusionMatrix(y_right, y_predicted)
    print("\n\nConfusion matrix:\n%s" % confusion_matrix)
    # confusion_matrix.plot(normalized=True)
    T = Text(root, height=60, width=60)
    T.pack(pady=20, side=BOTTOM, fill=Y)
    stats = confusion_matrix.stats()  # compute once instead of per iteration
    for key in stats:
        T.insert(END, [key, stats[key]])
        T.insert(END, "\n")
    d['ACC'].append(stats['ACC'] * 100)
    d['TPR'].append(stats['TPR'] * 100)
    fpr, tpr, thresholds = roc_curve(y_right, y_predicted)
    aucarr['auc'].append(auc(fpr, tpr))
def plot_confusion_matrix_with_accuracy(classes, y_true, y_pred, title,
                                        sum_overall_accuracy, total_predictions):
    cm = ConfusionMatrix(y_true, y_pred)
    print('Current Overall accuracy: ' + str(cm.stats()['overall']['Accuracy']))
    if total_predictions != 0:
        print('Total Overall Accuracy: ' + str(sum_overall_accuracy / total_predictions))
    else:
        print('Total Overall Accuracy: ' + str(cm.stats()['overall']['Accuracy']))
    conf_matrix = confusion_matrix(y_true, y_pred)
    plt.figure()
    plot_confusion_matrix(conf_matrix, classes=classes, title=title)
    plt.show()
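# The snippet above calls a plot_confusion_matrix helper that is not defined
# in this collection. Below is a minimal sketch of such a helper, modeled on
# the widely used matplotlib recipe; the body is an assumption and the real
# helper used by the author may differ.
import itertools

import matplotlib.pyplot as plt
import numpy as np


def plot_confusion_matrix(cm, classes, title='Confusion matrix', cmap=None):
    """Render a confusion matrix (a 2-D integer array) as an annotated image."""
    cmap = cmap or plt.cm.Blues
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)
    # annotate each cell, switching text color for readability
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], 'd'),
                 horizontalalignment='center',
                 color='white' if cm[i, j] > thresh else 'black')
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()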
def logistic_reg_smote():
    print("------------------------LOGISTIC REGRESSION WITH SMOTE-----------------------")
    df = pd.read_csv(var.get(), low_memory=False)
    df = df.sample(frac=1).reset_index(drop=True)  # shuffle rows
    frauds = df.loc[df['Class'] == 1]
    non_frauds = df.loc[df['Class'] == 0]
    print("\nWe have", len(frauds), "fraud data points and", len(non_frauds),
          "nonfraudulent data points.")
    X = df.iloc[:, :-1]
    y = df['Class']
    print("X and y sizes, respectively:", len(X), len(y))
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.35)
    print("Train and test sizes, respectively:", len(X_train), len(y_train),
          "|", len(X_test), len(y_test))
    # count frauds on each split directly (a full-df mask would misalign
    # with the split indices)
    print("Total number of frauds:", (y == 1).sum())
    print("Number of frauds on y_test:", (y_test == 1).sum())
    print("Number of frauds on y_train:", (y_train == 1).sum())
    # oversample the minority class on the training set only
    df2 = pdml.ModelFrame(X_train, target=y_train)
    sampler = df2.imbalance.over_sampling.SMOTE()
    sampled = df2.fit_sample(sampler)
    print("\nSize of training set after over sampling:", len(sampled))
    X_train_sampled = sampled.iloc[:, 1:]
    y_train_sampled = sampled['Class']
    logistic = linear_model.LogisticRegression(C=1e5)
    logistic.fit(X_train_sampled, y_train_sampled)
    print("Score: ", logistic.score(X_test, y_test))
    y_predicted1 = np.array(logistic.predict(X_test))
    y_right1 = np.array(y_test)
    confusion_matrix1 = ConfusionMatrix(y_right1, y_predicted1)
    print("\n\nConfusion matrix:\n%s" % confusion_matrix1)
    # confusion_matrix1.plot(normalized=True)
    T = Text(root, height=60, width=60)
    T.pack(pady=20, side=BOTTOM, fill=Y)
    stats = confusion_matrix1.stats()  # compute once instead of per iteration
    for key in stats:
        T.insert(END, [key, stats[key]])
        T.insert(END, "\n")
    d['ACC'].append(stats['ACC'] * 100)
    d['TPR'].append(stats['TPR'] * 100)
    fpr, tpr, thresholds = roc_curve(y_right1, y_predicted1)
    aucarr['auc'].append(auc(fpr, tpr))
def test_pandas_confusion_cm_stats_animals(self):
    y_true = ['rabbit', 'cat', 'rabbit', 'rabbit', 'cat', 'dog', 'dog',
              'rabbit', 'rabbit', 'cat', 'dog', 'rabbit']
    y_pred = ['cat', 'cat', 'rabbit', 'dog', 'cat', 'rabbit', 'dog', 'cat',
              'rabbit', 'cat', 'rabbit', 'rabbit']
    print("y_true: %s" % y_true)
    print("y_pred: %s" % y_pred)
    cm = ConfusionMatrix(y_true, y_pred)
    assert isinstance(cm, pdml.confusion_matrix.LabeledConfusionMatrix)
    assert isinstance(cm.stats(), OrderedDict)
    assert cm.population == len(y_true)  # 12
    cm.print_stats()
    cm_stats = cm.stats()  # noqa
    assert cm.binarize("cat").TP == cm.get("cat")  # cm.get("cat", "cat")
    assert cm.binarize("cat").TP == 3
    assert cm.binarize("dog").TP == cm.get("dog")  # 1
    assert cm.binarize("rabbit").TP == cm.get("rabbit")  # 3
def test_pandas_confusion_cm_stats_integers(self):
    y_true = [600, 200, 200, 200, 200, 200, 200, 200, 500, 500,
              500, 200, 200, 200, 200, 200, 200, 200, 200, 200]
    y_pred = [100, 200, 200, 100, 100, 200, 200, 200, 100, 200,
              500, 100, 100, 100, 100, 100, 100, 100, 500, 200]
    print("y_true: %s" % y_true)
    print("y_pred: %s" % y_pred)
    cm = ConfusionMatrix(y_true, y_pred)
    assert isinstance(cm, pdml.confusion_matrix.LabeledConfusionMatrix)
    assert isinstance(cm.stats(), OrderedDict)
    cm.print_stats()
def calc_general_stats(rows):
    y_true = []
    y_pred = []
    for i in range(len(rows)):
        row = rows[i]
        className_true = row.split('/')[4]
        numBoxes = int(row.split(' ')[1])
        className_pred = 'none'
        if numBoxes > 0:
            className_pred = row.split(' ')[1 + numBoxes].split(',')[4]
        y_true.append(className_true)
        y_pred.append(className_pred)
    # stats
    cm = ConfusionMatrix(y_true, y_pred)
    cm.print_stats()
    cm.stats()
    # other report...
    target_names = ['arrabida', 'camara', 'clerigos', 'musica', 'none', 'serralves']
    print(classification_report(y_true, y_pred, target_names=target_names))
    # plot
    cm = confusion_matrix(y_true, y_pred)
    classes = ['arrabida', 'camara', 'clerigos', 'musica', 'none', 'serralves']
    df_cm = pd.DataFrame(cm, index=classes, columns=classes)
    plt.figure(figsize=(10, 7))
    sn.set(font_scale=1.4)
    ax = sn.heatmap(cm, annot=True, annot_kws={"size": 16},
                    yticklabels=classes, xticklabels=classes,
                    cmap='Blues', fmt='g')
    plt.show()
print(encoder.classes_)
y_train = onehot(encoder.transform(y_train1))
y_valid = onehot(encoder.transform(y_valid1))

# In[113]:

# training process
print("Evaluation on Training Dataset")
Predictions = model.predict(x_train)
print(Predictions.shape)
Prediction = np.argmax(Predictions, axis=1)
print(Prediction.shape)
print(np.sum(Prediction == encoder.transform(y_train1)))
# ConfusionMatrix expects (y_true, y_pred); also avoid rebinding the class
# name itself, which would break the second ConfusionMatrix(...) call below
cm = ConfusionMatrix(encoder.transform(y_train1), Prediction)
ConfusionMatrixTable = cm.stats()['cm']
ClassStatistics = cm.stats()['class']
OverallStatistics = cm.stats()['overall']
ClassStatistics.to_csv("trainStatsResnet.csv")
print(OverallStatistics)
print(ConfusionMatrixTable)

print("Evaluation on Test Dataset")
Predictions = model.predict(x_test)
print(Predictions.shape)
Prediction = np.argmax(Predictions, axis=1)
print(Prediction.shape)
print(np.sum(Prediction == encoder.transform(y_test1)))
cm = ConfusionMatrix(encoder.transform(y_test1), Prediction)
ConfusionMatrixTable = cm.stats()['cm']
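# The onehot helper used above is not shown in this snippet. Here is a
# minimal sketch under the assumption that it maps integer class indices to
# one-hot row vectors; the num_classes parameter is this editor's guess at
# the signature.
import numpy as np


def onehot(labels, num_classes=None):
    """Convert an array of integer labels to a one-hot encoded 2-D array."""
    labels = np.asarray(labels, dtype=int)
    n = num_classes if num_classes is not None else labels.max() + 1
    out = np.zeros((labels.shape[0], n), dtype=np.float32)
    out[np.arange(labels.shape[0]), labels] = 1.0
    return out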
most_accurate_percentile, stats = best_model_threshold_by_roc(
    df, percentiles=np.arange(99., -1, -1))

# bin actual and predicted ROME values into deciles
df['actual_class'], r_bins = pd.qcut(df['rome'], 10,
                                     labels=list(range(1, 11)), retbins=True)
df['predic_class'] = pd.qcut(df['predicted'], 10, labels=list(range(1, 11)))
# df['predic_class'] = pd.cut(df['predicted'], bins=r_bins, labels=list(range(1, 11)))

cm = ConfusionMatrix(df['actual_class'].to_list(), df['predic_class'].to_list())
cm.print_stats()
statdict = cm.stats()
cm_stats = statdict['class']

matrix = cm.to_dataframe()
matrix.index.rename('ROME decile', inplace=True)
matrix.columns.rename(r'ROME$_\mathrm{NN}$ decile', inplace=True)  # raw string for the LaTeX escape

plt.close()
# sns.heatmap(matrix / (len(predicted)//10), cmap='Greys', annot=matrix, fmt='d')
ax = sns.heatmap(matrix / (len(predicted) // 10), annot=matrix, fmt='d',
                 cmap='gray_r', vmin=0.0, vmax=0.56)
cbar = ax.collections[0].colorbar
accuracy = macro_accuracy
confusion_matrix = ConfusionMatrix(actual_classes, predicted_classes)
print()
print('Macro-accuracy:', str(accuracy) + '%. Details (considering MICRO-accuracy):')
confusion_matrix.print_stats()

# time
print()
end_time = time.time()
elapsed_time = time_format(end_time - start_time)
print('Testing elapsed time:', elapsed_time)

os.makedirs(os.path.join('results', 'test'), exist_ok=True)
with open(os.path.join(
        'results', 'test',
        test_data_path.replace(os.sep, '_').replace('.', '_') + '--' +
        model_file.replace('.pth', '.txt')), 'w') as results_txt:
    results_txt.write('Macro-accuracy: ' + str(accuracy) +
                      '%. Details (considering MICRO-accuracy):\n\n')
    results_txt.write(str(confusion_matrix.stats()))
    results_txt.write('\n\nWRONG PREDICTIONS:\n\n')
    for wrong_prediction in wrong_predictions:
        path, label, prediction = wrong_prediction
        results_txt.write(path + ' is ' + label + ' and was predicted as ' +
                          prediction + '\n')
    results_txt.write('\n\nTime: ' + elapsed_time)
# to keep time short, set epochs = 1
model.fit(train_data, train_labels, epochs=1, workers=4, use_multiprocessing=True)

# Making predictions
predictions = model.predict(eval_data)

# Extracting the class with the highest probability for each sample
predictions_number = np.array([])
for row_num in range(predictions.shape[0]):
    predictions_number = np.append(predictions_number,
                                   np.argmax(predictions[row_num]))
# Just a precaution
predictions_number = predictions_number.astype(int)

confusion_matrix = ConfusionMatrix(eval_labels, predictions_number)
cms = confusion_matrix.stats()
print("Overall Accuracy is ", round(cms['overall']['Accuracy'], 2),
      ", Kappa is ", round(cms['overall']['Kappa'], 2))
# none: Overall Accuracy is 0.99 , Kappa is 0.99

#%% Weight regularization: L2 (weight decay) -> the cost added is proportional
# to the square of the weight coefficients
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Conv2D(64, (5, 5), activation='relu',
                                 input_shape=(im_wh, im_wh, 1),
                                 kernel_regularizer=tf.keras.regularizers.l2(0.001)))
model.add(tf.keras.layers.MaxPooling2D((2, 2)))
model.add(tf.keras.layers.Conv2D(64, (5, 5), activation='relu',
                                 kernel_regularizer=tf.keras.regularizers.l2(0.001)))
model.add(tf.keras.layers.Flatten())
model.add(tf.keras.layers.Dense(64, activation='relu',
                                kernel_regularizer=tf.keras.regularizers.l2(0.001)))
model.add(tf.keras.layers.Dense(10, activation='softmax'))
model.summary()
# Note the image shape shrinks layer by layer; it would not with padding="same"
predicted = model.predict(testdata)
cm = ConfusionMatrix(expected, predicted)
print(expected.shape)
print(predicted.shape)
expected = np.array(expected)
predicted = np.array(predicted)
cm.print_stats()
np.savetxt('expected.txt', expected, fmt='%01d')
np.savetxt('predicted.txt', predicted, fmt='%01d')
print(cm)
print(expected.shape)
print(predicted.shape)
cm.stats()
print("***************************************************************")

# fit a k-nearest neighbor model to the data
model = KNeighborsClassifier()
model.fit(traindata, trainlabel)
print(model)
# make predictions
expected = testlabel
predicted = model.predict(testdata)
# summarize the fit of the model
cm = metrics.confusion_matrix(expected, predicted)
print(cm)
# Note: with rows indexed by the true class, cm[0][0]/sum(cm[0]) is the rate
# of correct predictions for class 0 and cm[1][1]/sum(cm[1]) for class 1;
# the variable names below follow the original snippet and are not the
# textbook TPR/FPR definitions
tpr = float(cm[0][0]) / np.sum(cm[0])
fpr = float(cm[1][1]) / np.sum(cm[1])
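# For a binary problem, sklearn's ravel() exposes the four confusion-matrix
# cells directly, which avoids the naming ambiguity above. A minimal sketch,
# assuming the positive class is labeled 1 (so it lands in the second
# row/column of the matrix):
from sklearn.metrics import confusion_matrix

tn, fp, fn, tp = confusion_matrix(expected, predicted).ravel()
true_positive_rate = tp / (tp + fn)   # recall / sensitivity
false_positive_rate = fp / (fp + tn)  # fall-out
print("TPR = %.3f, FPR = %.3f" % (true_positive_rate, false_positive_rate))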
        else:
            final_predict = item[best_index]
        final_result.append(final_predict)

all_results_matrix = confusion_matrix(y_actual_set, np.array(final_result))
# sum the diagonal to count correct ensemble predictions over the 27 classes
correct_result = 0
for i in range(27):
    correct_result = correct_result + all_results_matrix[i][i]
accuracy_ensemble = correct_result / len(monitorList)
target_names = [
    'Swipe Left', 'Swipe Right', 'Wave', 'Clap', 'Throw', 'Arm Cross',
    'Basketball Shoot', 'Draw X', 'Draw Circle CW', 'Draw Circle CCW',
    'Draw Triangle', 'Bowling', 'Boxing', 'Baseball Swing', 'Tennis Swing',
    'Arm Curl', 'Tennis Serve', 'Push', 'Knock', 'Catch', 'Pickup Throw',
    'Jog', 'Walk', 'Sit to Stand', 'Stand to Sit', 'Lunge', 'Squat'
]
print(classification_report(y_actual_set, np.array(final_result),
                            target_names=target_names))
cm = ConfusionMatrix(y_actual_set, np.array(final_result))
cm.plot()
stats = cm.stats()
cmstats = dict(stats)
cmstats2 = cmstats['class']
cmstats2.to_csv('ensemble.csv', sep=',')
def performance_eval(results, test_data, risk_pref, dfb):
    """Conduct model evaluation with pseudo timestamp labels."""
    prediction = []
    label = []
    confidence = []
    threshold = dfb
    # print(test_data)
    if test_data == "1.10train" or test_data == "0.10train":
        # benign-only data: every sample is either a false positive or a true negative
        for i in results:
            if risk_pref == 'TPR':
                if i[1] >= threshold:  # FP
                    prediction.append(1)
                    label.append(0)
                    confidence.append(round(i[1], 3))
                elif i[1] < threshold:  # TN
                    prediction.append(0)
                    label.append(0)
                    confidence.append(round(i[1], 3))
                else:
                    print("Error")
            else:
                if i[1] <= threshold:  # FP
                    prediction.append(1)
                    label.append(0)
                    confidence.append(round(i[1], 3))
                elif i[1] > threshold:  # TN
                    prediction.append(0)
                    label.append(0)
                    confidence.append(round(i[1], 3))
                else:
                    print("Error")
    else:
        attack_timestamps = filepath + test_data + '.csv'
        with open(attack_timestamps, 'r') as f:
            reader = csv.reader(f)
            attack_times = list(reader)
        timestamp = [int(attack_times[0][0]), int(attack_times[0][1])]
        for i in results:
            if risk_pref == 'TPR':
                if timestamp[0] <= i[0] <= timestamp[1] and i[1] >= threshold:  # TP
                    prediction.append(1)
                    label.append(1)
                    confidence.append(round(i[1], 3))
                elif timestamp[0] <= i[0] <= timestamp[1] and i[1] < threshold:  # FN
                    prediction.append(0)
                    label.append(1)
                    confidence.append(round(i[1], 3))
                elif (i[0] < timestamp[0] or i[0] > timestamp[1]) and i[1] >= threshold:  # FP
                    prediction.append(1)
                    label.append(0)
                    confidence.append(round(i[1], 3))
                elif (i[0] < timestamp[0] or i[0] > timestamp[1]) and i[1] < threshold:  # TN
                    prediction.append(0)
                    label.append(0)
                    confidence.append(round(i[1], 3))
                else:
                    print("Error")
                    print(timestamp, i)
                    # print(timestamp, i, test_pred, test_label)
            else:
                if timestamp[0] <= i[0] <= timestamp[1] and i[1] <= threshold:  # TP
                    prediction.append(1)
                    label.append(1)
                    confidence.append(round(i[1], 3))
                elif timestamp[0] <= i[0] <= timestamp[1] and i[1] > threshold:  # FN
                    prediction.append(0)
                    label.append(1)
                    confidence.append(round(i[1], 3))
                elif (i[0] < timestamp[0] or i[0] > timestamp[1]) and i[1] <= threshold:  # FP
                    prediction.append(1)
                    label.append(0)
                    confidence.append(round(i[1], 3))
                elif (i[0] < timestamp[0] or i[0] > timestamp[1]) and i[1] > threshold:  # TN
                    prediction.append(0)
                    label.append(0)
                    confidence.append(round(i[1], 3))
                else:
                    print("Error")
                    print(timestamp, i)
                    # print(timestamp, i, test_pred, test_label)

    # accuracy of a purely naive classifier (always predict the majority class)
    naive = sum(label) / len(label)
    naive1 = 1 - naive
    nc = naive1 if naive1 > naive else naive

    prediction = np.asarray(prediction)
    label = np.asarray(label)
    ACC = accuracy_score(label, prediction)

    # cube the confidences so high-confidence samples dominate the aggregate
    cubed = [x ** 3 for x in confidence]
    cubes = sum(cubed)
    conf = sum(confidence)
    if conf != 0:
        test = cubes / conf
        # reward = test ** (1. / 3.)
        conf = math.sqrt(test)
    # conf = np.mean(confidence)

    print("Variance: " + str(round(np.var(confidence), 3)))
    reward = round(np.var(confidence), 3)

    # print('Subset Accuracy (J): ' + str(ACC))
    if test_data == "1.10train" or test_data == "0.10train":
        TPR = 'N/A'
        TNR = 'N/A'
        PREC = 'N/A'
        F1 = 'N/A'
    else:
        # ConfusionMatrix expects (y_true, y_pred)
        cm = ConfusionMatrix(label, prediction)
        stats = cm.stats()
        print(stats)
        TPR = stats['TPR']
        TNR = stats['TNR']
        PREC = stats['PPV']
        F1 = stats['F1_score']
    print('Reward: ' + str(reward))
    return reward, ACC, F1, TPR, TNR, PREC, nc, conf
print(Y_test)

# evaluate train set error
# train_error = classifier.score(X_train, Y_train)
# test_error = classifier.score(X_test, Y_test)
# print(train_error, test_error)

# -------------> EVALUATION
evaluation_file.write('DT_EVALUATION:')
evaluation_file.write('\n\nConfusion_matrix:\n')
evaluation_file.write(str(confusion_matrix(Y_test, Y_pred)))
evaluation_file.write('\n\nClassification report:\n')
evaluation_file.write(str(classification_report(Y_test, Y_pred)))
# evaluation_file.write("\nAccuracy is " + str(accuracy_score(Y_test, Y_pred) * 100) + '\n')
evaluation_file.write('\n\n')
cm = ConfusionMatrix(Y_test, Y_pred)
evaluation_file.write(str(cm.stats()))
sn.heatmap(confusion_matrix(Y_test, Y_pred), annot=True, cmap='YlGnBu')
plt.savefig(path_results + title + '/' + 'Confusion_Matrix.pdf')
# plt.show()
plt.close()
print("\nAccuracy is " + str(accuracy_score(Y_test, Y_pred) * 100) + '\n')

# X_test_plot = scaling.inverse_transform(X_test)
value = 1.5
width = 100
plot_decision_regions(
    X=X_test,
    y=Y_test,
def evaluate(textFile, valueFile=None, varStatusBar=None, varCmOutput=None,
             varOutput=None):
    timestamp = strftime("%Y-%m-%d:%H-%M-%S")
    reportFile = "./reports/" + timestamp + ".txt"
    outputFile = "./evaluations/" + timestamp + ".csv"
    statsFile = "./statistics/" + timestamp + ".txt"
    wf = WordFilter()
    totalReal = []
    totalPred = []
    with open("./data/Priors.csv", "r") as priorFile:
        print(priorFile)
        priors = priorFile.readline().strip().split(',')[1:]
        priors = [log10(float(x)) for x in priors]
    testSize = 0
    lst = []
    lst.append(("Real Emotion", "Predicted Emotion", "Tweet"))
    for line in tqdm(textFile):
        testSize += 1
        lineID = line.split(',')[0]
        words = wf.filterWords(line)
        predValues = []
        unfound = []
        for word in words:
            try:
                values = evaluateWord(word)
            except IOError:
                varStatusBar.set("WordMap not found. Please train system first.")
                raise
            if values is not None:
                predValues.append(values)
            else:
                unfound.append(word)
        # list comprehensions rather than map() so the sums can be iterated
        # more than once below
        predValues = [sum(v) for v in zip(*predValues)]
        predProb = [sum(v) for v in zip(priors, predValues)]
        predEmotion = guessEmotion(predProb)
        valueFormat = ",".join("%.2f" % n for n in predValues)
        if valueFile:
            realValues = [float(i) for i in
                          valueFile.readline().strip().split(',')[1:]]
            realEmotion = guessEmotion(realValues)
            if predEmotion != "No Words Found":
                totalReal.append(realEmotion)
                totalPred.append(predEmotion)
                if realEmotion != predEmotion:
                    lst.append((realEmotion, predEmotion, line))
        with open(outputFile, "a+") as output:
            output.write("{},{},{}\n".format(lineID, predEmotion, valueFormat))
        with open(reportFile, "a+") as report:
            report.write("{}\n".format(line))
            report.write("Filtered: {}\n".format(words))
            report.write("Words not found:{}\n".format(unfound))
            report.write("Emotion probabilities: {}\n".format(valueFormat))
            report.write("Predicted emotion: {}\n".format(predEmotion))
            if valueFile:
                report.write("Correct emotion: {}\n".format(realEmotion))
            report.write("-" * 70)
            report.write("\n")
    if valueFile:
        varStatusBar.set("Evaluation Complete.")
        with open('./data/RealPred.csv', 'w') as realpredFile:
            writer = csv.writer(realpredFile, delimiter=',')
            writer.writerows(lst)
        cm = ConfusionMatrix(totalReal, totalPred)
        viewPlot = tkMessageBox.askyesno("Confusion Matrix",
                                         "View confusion matrix plot?")
        if viewPlot:
            normaliseData = tkMessageBox.askyesno("Confusion Matrix",
                                                  "Normalise plot?")
            varOutput.set("Accuracy: " + str(cm.stats()['overall']['Accuracy']))
            varCmOutput.set("Confusion Matrix: \n" + str(cm.stats()['cm']))
            data = cm.stats()
            for key, value in data.items():
                print(key, value)
            cm.plot(normalized=normaliseData)
            plt.show()
        with open(statsFile, "w+") as report:
            report.seek(0)
            report.write(str(cm))
            report.write("\n")
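# guessEmotion is called above but not defined in this snippet. Below is a
# minimal sketch, assuming it returns the emotion whose (log-)probability is
# highest and that the label order matches the Priors.csv header; both the
# EMOTIONS list and the empty-input sentinel are this editor's assumptions.
EMOTIONS = ["anger", "fear", "joy", "sadness", "surprise"]


def guessEmotion(values):
    """Return the emotion label with the highest score, or a sentinel when
    no words contributed any scores."""
    values = list(values)
    if not values:
        return "No Words Found"
    return EMOTIONS[values.index(max(values))]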
for i in range(0, len(dataset['Rapor Aciklamasi'])):
    # replace non-letters with spaces so the words stay separable
    text = re.sub('[^a-zA-Z]', ' ', dataset['Rapor Aciklamasi'][i])
    text = text.lower()
    text = text.split()
    ps = PorterStemmer()
    text = [ps.stem(word) for word in text]  # stem each token
    text = ' '.join(text)  # re-join with spaces so CountVectorizer can tokenize
    corpus.append(text)

cv = CountVectorizer(max_features=4000)
X = cv.fit_transform(corpus).toarray()
y = dataset.iloc[:, 0].values
X_train, X_validation, Y_train, Y_validation = train_test_split(
    X, y, test_size=0.2, random_state=7)
clf_entropy = tarin_using_entropy(X_train, X_validation, Y_train)
print("Results Using Entropy:")
# Prediction using entropy
y_pred_entropy = prediction(X_validation, clf_entropy)

from pandas_ml import ConfusionMatrix
import pandas

pandas.set_option('display.max_colwidth', 15)
pandas.set_option('display.max_columns', 5)
cm = ConfusionMatrix(Y_validation, y_pred_entropy)
cm.print_stats()
print(cm.stats())
def best_model_threshold_by_roc(df, percentiles):
    # true and false positive rates
    tpr, fpr = [], []
    # markedness, accuracy, f1_score = [], [], []
    stats_list = []
    assert 0 <= percentiles.min()
    assert percentiles.max() <= 100
    for perc in percentiles:
        # highest decile of ROME and predictions serve as targets
        df['actual_target'] = df['rome'].gt(df['rome'].quantile(90 / 100))
        df['predic_target'] = df['predicted'].gt(df['predicted'].quantile(perc / 100))
        # confusion_matrix = pd.crosstab(df['actual_target'], df['predic_target'],
        #                                rownames=['ROME'], colnames=['Predicted'])
        # print(confusion_matrix)
        # sns.heatmap(confusion_matrix, annot=True)
        # plt.show()

        # ConfusionMatrix has nice stats, but throws an error if one array
        # consists exclusively of True or of False
        if (len(df['actual_target'].unique()) > 1) and \
                (len(df['predic_target'].unique()) > 1):
            cm = ConfusionMatrix(df['actual_target'].to_list(),
                                 df['predic_target'].to_list())
            tpr.append(cm.stats()['TPR'])
            fpr.append(cm.stats()['FPR'])
            # stats = [cm.stats()[key] for key in cm.stats()]
            # stats_list.append(stats)
            stats_list.append(cm.stats().values())
        else:
            df['actual_target'] = df['actual_target'].astype(int)
            df['predic_target'] = df['predic_target'].astype(int)
            cm = pd.crosstab(df['actual_target'], df['predic_target'],
                             rownames=['ROME'], colnames=['Predicted'])
            # only True predictions
            if cm.columns.get_values() == 1:
                tpr.append(1)
                # cm.loc[0] are the False targets which are predicted True
                fpr.append(1)  # (cm.loc[0] / cm[1].sum())
            # only False predictions
            elif cm.columns.get_values() == 0:
                tpr.append(0)
                fpr.append(0)
            else:
                raise ValueError('Targets consist of only True or only False.')

    plt.plot([0] + fpr, [0] + tpr, c='r', ls='', marker='o', ms=0.5)
    # loop through each x,y pair
    for i, xy in enumerate(zip(fpr, tpr)):
        corr = 0.  # -0.05  # small correction to centre the annotation on the marker
        plt.annotate(str(percentiles[i].astype(int)),
                     xy=(xy[0] + corr, xy[1] + corr), fontsize=2)
    plt.xlabel('False positive rate (False positives / Target negative)')
    plt.ylabel('True positive rate (True positives / Target positive)')
    plt.title('Predictions of ROME by KitchenSink-NN and varying threshold '
              'for high organisation.')
    plt.axes().set_aspect('equal')
    plt.savefig(home + '/Desktop/ROC', dpi=400, bbox_inches='tight')

    # distance to the ideal point (0, 1)
    dist = np.sqrt((0 - np.array(fpr)) ** 2 + (1 - np.array(tpr)) ** 2)
    best_threshold = percentiles[dist.argmin()]

    stats = pd.DataFrame(stats_list)
    stats.rename(dict(zip(list(range(26)), cm.stats().keys())),
                 axis='columns', inplace=True)
    stats.index.rename('Threshold', inplace=True)
    stats.columns.rename('Statistics', inplace=True)
    stats = stats[::-1].reset_index(drop=True)
    return best_threshold, stats
def model_func(alg, X_train, X_test, Y_train, Y_test, target, predictors, filename):
    # Fit the algorithm on the data
    algorithm = alg.fit(X_train, Y_train)
    print(algorithm)

    # Predict training set:
    dtrain_predictions = alg.predict(X_train)

    # Perform cross-validation:
    cv_score = model_selection.cross_val_score(alg, X_train, Y_train, cv=10,
                                               scoring='neg_mean_squared_error')
    # scoring='accuracy', cv=kfold
    # kfold = model_selection.KFold(n_splits=10, random_state=seed)
    cv_score = np.sqrt(np.abs(cv_score))

    # Print model report:
    print("\nModel Report")
    print("RMSE : %.4g" % np.sqrt(metrics.mean_squared_error(Y_train, dtrain_predictions)))
    print("CV Score : Mean - %.4g | Std - %.4g | Min - %.4g | Max - %.4g" %
          (np.mean(cv_score), np.std(cv_score), np.min(cv_score), np.max(cv_score)))

    # Predict on testing data and compute the error rate:
    # dtest[target] = alg.predict(dtest[predictors])
    predictions = alg.predict(X_test)
    errors = predictions != Y_test
    number_of_errors = errors.sum()
    error_rate = errors.sum() / len(predictions)

    # Print R-squared
    sse = ((Y_test - predictions) ** 2).sum(axis=0)
    tse = ((Y_test - np.average(Y_test, axis=0)) ** 2).sum(axis=0)
    print("R-squared : %.4g" % (1 - (sse / tse)))
    print("RMSE Test : %.4g" % np.sqrt(metrics.mean_squared_error(Y_test, predictions)))
    print("Score:%s" % (alg.score(X_train, Y_train)))
    print("Decision Function:\n%s" % (alg.decision_function(X_test)))
    print("Intercept:%s" % (alg.intercept_))
    print("Coefficients:\n%s" % (alg.coef_))
    print("Number of errors=%i, error rate=%.2f" % (number_of_errors, error_rate))
    print("Classification Report:\n%s" %
          (classification_report(Y_test, predictions, labels=np.unique(predictions))))
    CMat = ConfusionMatrix(Y_test, predictions)
    print("Statistics regarding classification model =%s" % (CMat.stats()))

    # ROC-AUC score: the closer to 1 the better. This metric doesn't work in
    # multinomial cases.
    # k_fold = model_selection.KFold(n_splits=10, random_state=7)
    # results = model_selection.cross_val_score(alg, predictors, target, cv=k_fold, scoring='roc_auc')
    # print("AUC: %.3f (%.3f)" % (results.mean(), results.std()))

    # Plot the confusion matrix
    mat = confusion_matrix(Y_test, predictions)
    sns.heatmap(mat, annot=True, fmt='d', cbar=True, xticklabels=label,
                yticklabels=label, linewidths=.5)
    plt.xlabel('true label')
    plt.ylabel('predicted label')

    # Graph on testing data: line/model
    plt.scatter(Y_test, predictions, marker='+', color='r')
    plt.xlabel("True Values")
    plt.ylabel("Predictions")
    plt.show()
def evaluate_model(trainX, trainy, testX, testy):
    verbose, epochs, batch_size = 2, 10, 6
    n_timesteps, n_features, n_outputs = trainX.shape[1], trainX.shape[2], trainy.shape[1]
    # reshape output into [samples, timesteps, features]
    trainy = trainy.reshape((trainy.shape[0], trainy.shape[1]))
    testy = testy.reshape((testy.shape[0], testy.shape[1]))
    # define model
    model = Sequential()
    model.add(Conv1D(filters=512, kernel_size=15, padding='same', activation='relu',
                     input_shape=(n_timesteps, n_features)))
    model.add(MaxPooling1D(pool_size=8, padding='same'))
    model.add(Conv1D(filters=512, kernel_size=15, padding='same', activation='relu'))
    model.add(MaxPooling1D(pool_size=8, padding='same'))
    model.add(Conv1D(filters=1024, kernel_size=15, padding='same', activation='relu'))
    model.add(MaxPooling1D(pool_size=8, padding='same'))
    model.add(Flatten())
    model.add(RepeatVector(n_outputs))
    model.add(Bidirectional(LSTM(200, activation='relu', return_sequences=True)))
    model.add(Dropout(0.5))
    model.add(Bidirectional(LSTM(200, activation='relu', return_sequences=True)))
    model.add(Dropout(0.5))
    model.add(Bidirectional(LSTM(200, activation='relu', return_sequences=True)))
    model.add(Dropout(0.5))
    model.add(Bidirectional(LSTM(200, activation='relu', return_sequences=False)))
    # model.add(Dense(200, activation='relu'))
    model.add(Dense(n_outputs, activation='sigmoid'))
    opt = optimizers.Adam(lr=1e-6)
    model.compile(loss='binary_crossentropy', optimizer=opt,
                  metrics=['binary_accuracy'])
    class_weight = {0: 1., 1: 3.}
    # fit network
    history = model.fit(trainX, trainy, shuffle=False, epochs=epochs,
                        validation_split=0.1, class_weight=class_weight,
                        batch_size=batch_size, verbose=verbose)
    # evaluate model
    _, accuracy = model.evaluate(testX, testy, batch_size=batch_size, verbose=0)
    # list all data in history
    # print(history.history.keys())
    # make predictions
    # trainPredict = model.predict(trainX)
    testPredict = model.predict(testX)
    # compute confusion matrix
    y_pred = argmax(testPredict, axis=1, out=None)
    # save prediction dataset
    # DataFrame(trainPredict).to_csv('train_predictLSTM.csv')
    DataFrame(testPredict).to_csv('./test_predictLSTM.csv')
    y_actu = argmax(testy, axis=1, out=None)
    y_pred = y_pred.reshape(y_pred.shape[0],)
    y_actu = y_actu.reshape(y_actu.shape[0],)
    df = stack((y_pred, y_actu))
    df = df.transpose()
    df = df.reshape(df.shape[0], 2)
    DataFrame(df).to_csv('./classification.csv')
    # y_pred = np.delete(y_pred, 1)
    print(y_actu.shape, y_pred.shape)
    cm = ConfusionMatrix(y_actu, y_pred)
    cm.print_stats()
    d = cm.stats()
    f1 = list(d.items())[17]  # F1 entry by position; d['F1_score'] is more robust
    f1 = f1[1]
    print(f1)
    return accuracy
def test_model(test_model, test_dataloader):
    print("Testing started..")
    test_model.eval()
    correct = 0
    total = 0
    all_labels_d = torch.tensor([], dtype=torch.long).to(device)
    all_predictions_d = torch.tensor([], dtype=torch.long).to(device)
    all_predictions_probabilities_d = torch.tensor([], dtype=torch.float).to(device)
    if batch_size == 1:
        all_timePerFrame_host = []
    else:
        print("Please set batch size to 1....")
        exit(0)
    with torch.no_grad():
        for inputs, labels in test_dataloader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            frame_time_start = datetime.datetime.now()  # frame start time
            outputs = test_model(inputs)
            outputs = F.softmax(outputs, 1)
            # print(outputs)
            predicted_probability, predicted = torch.max(outputs.data, 1)
            frame_time_end = datetime.datetime.now()  # frame end time
            time_per_image = (frame_time_end - frame_time_start).total_seconds()
            # print((predicted == labels).sum())
            total += labels.size(0)
            correct += (predicted == labels).sum()
            all_labels_d = torch.cat((all_labels_d, labels), 0)
            all_predictions_d = torch.cat((all_predictions_d, predicted), 0)
            all_predictions_probabilities_d = torch.cat(
                (all_predictions_probabilities_d, predicted_probability), 0)
            all_timePerFrame_host = all_timePerFrame_host + [time_per_image]

    print('copying some data back to cpu for generating confusion matrix...')
    y_true = all_labels_d.cpu()
    y_predicted = all_predictions_d.cpu()  # to('cpu')
    testset_predicted_probabilites = all_predictions_probabilities_d.cpu()  # to('cpu')

    class_names = test_datasets.classes  # class names for plotting the confusion matrix
    cm = confusion_matrix(y_true, y_predicted, labels=target_number_labels)
    print('Accuracy of the network on the %d test images: %f %%' %
          (total, (100.0 * correct / total)))
    print(cm)
    print("taking class names to plot CM")
    print("Generating confusion matrix")
    plot_confusion_matrix(cm, classes=class_names, title='my confusion matrix')
    # plot_confusion_matrix(cm, classes=target_number_labels, title='my confusion matrix')
    # print('confusion matrix saved to ', plot_dir)

    ##################################################################
    # classification report
    ##################################################################
    # print(classification_report(y_true, y_predicted, target_names=target_number_labels))

    ##################################################################
    # Standard metrics for the Medico task
    ##################################################################
    print("Printing standard metric for medico task")
    print("1. Recall score (REC) =",
          mtc.recall_score(y_true, y_predicted, average="weighted"))
    print("2. Precision score (PREC) =",
          mtc.precision_score(y_true, y_predicted, average="weighted"))
    print("3. Specificity (SPEC) =")
    print("4. Accuracy (ACC) =", mtc.accuracy_score(y_true, y_predicted))
    print("5. Matthews correlation coefficient (MCC) =",
          mtc.matthews_corrcoef(y_true, y_predicted))
    print("6. F1 score (F1) =",
          mtc.f1_score(y_true, y_predicted, average="weighted"))

    panda_cm_data = ConfusionMatrix(y_true, y_predicted)
    panda_cm_data.print_stats()
    cm_dictionary = panda_cm_data.stats()
    print("cm dictionary saving")
    with open(os.path.join(history_dir, "20_5_cm_dictionary.pkl"), "wb") as f:
        pickle.dump(cm_dictionary['class'], f)
    print('Finished.. ')
    return y_predicted, testset_predicted_probabilites, all_timePerFrame_host
confusion_matrix.plot(normalized=True)
plt.show()

# In[176]:

confusion_matrix.print_stats()

# ### Percentage of fraud transactions which the model detected incorrectly

# In[177]:

print("FNR is {0}".format(confusion_matrix.stats()['FNR']))

# #### Plotting ROC curve

# In[178]:

logit_roc_auc = roc_auc_score(y_test, lr.predict(X_test))
# roc_curve expects the true labels first, not the predicted ones
fpr, tpr, thresholds = roc_curve(y_test, lr.predict_proba(X_test)[:, 1])
plt.figure()
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
def test_model(test_model, test_dataloader):
    print("Testing started..")
    test_model.eval()
    correct = 0
    total = 0
    all_labels_d = torch.tensor([], dtype=torch.long).to(device)
    all_predictions_d = torch.tensor([], dtype=torch.long).to(device)
    with torch.no_grad():
        for inputs, labels in test_dataloader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            outputs = test_model(inputs)
            # outputs = (outputs1*0.6 + outputs2*0.4)/2
            _, predicted = torch.max(outputs.data, 1)
            print((predicted == labels).sum())
            total += labels.size(0)
            correct += (predicted == labels).sum()
            all_labels_d = torch.cat((all_labels_d, labels), 0)
            all_predictions_d = torch.cat((all_predictions_d, predicted), 0)

    print('copying some data back to cpu for generating confusion matrix...')
    testset_labels = all_labels_d.cpu()
    testset_predicted_labels = all_predictions_d.cpu()  # to('cpu')

    cm = confusion_matrix(testset_labels, testset_predicted_labels)  # confusion matrix
    print('Accuracy of the network on the %d test images: %f %%' %
          (total, (100.0 * correct / total)))
    print(cm)
    print("taking class names to plot CM")
    class_names = test_datasets.classes  # class names for plotting the confusion matrix
    print("Generating confusion matrix")
    plot_confusion_matrix(cm, classes=class_names, title='my confusion matrix')
    print('confusion matrix saved to ', plot_dir)

    ##################################################################
    # classification report
    ##################################################################
    print(classification_report(testset_labels, testset_predicted_labels,
                                target_names=class_names))

    ##################################################################
    # Standard metrics for the Medico task
    ##################################################################
    print("Printing standard metric for medico task")
    weights = [1 / 53, 1 / 81, 1 / 138, 1 / 125, 1 / 134, 1 / 11, 1 / 125, 1 / 132,
               1 / 132, 1 / 4, 1 / 184, 1 / 72, 1 / 120, 1 / 39, 1 / 110, 1 / 138]
    print("1. Recall score (REC) =",
          mtc.recall_score(testset_labels, testset_predicted_labels, average="weighted"))
    print("2. Precision score (PREC) =",
          mtc.precision_score(testset_labels, testset_predicted_labels, average="weighted"))
    print("3. Specificity (SPEC) =")
    # accuracy_score's third positional argument is `normalize`, not a weight
    # vector; the per-class weights above would have to be expanded into a
    # per-sample sample_weight array to be used here
    print("4. Accuracy (ACC) =",
          mtc.accuracy_score(testset_labels, testset_predicted_labels))
    print("5. Matthews correlation coefficient (MCC) =",
          mtc.matthews_corrcoef(testset_labels, testset_predicted_labels))
    print("6. F1 score (F1) =",
          mtc.f1_score(testset_labels, testset_predicted_labels, average="weighted"))

    panda_cm_data = ConfusionMatrix(testset_labels, testset_predicted_labels)
    panda_cm_data.print_stats()
    cm_dictionary = panda_cm_data.stats()
    print("cm dictionary saving")
    with open(os.path.join(history_dir, "24_3_cm_dictionary.pkl"), "wb") as f:
        pickle.dump(cm_dictionary['class'], f)
    print('Finished.. ')
scores = cross_val_score(clf, X, y, cv=5)
print(scores.mean(), scores)

from sklearn.metrics import classification_report

le = preprocessing.LabelEncoder()
le.fit(y.values)  # .as_matrix() was removed from pandas; .values is equivalent here
target_names = le.classes_
print(classification_report(y_test, y_predictions, target_names=target_names))
print(collections.Counter(y_test.factorize()[0]))

'''
This will plot the correlation between the attributes

from pandas.plotting import scatter_matrix
scatter_matrix(df, alpha=0.2, figsize=(6, 6), diagonal='kde')
plt.show()
'''

'''
-----------Printing the stats--------------
cm.stats()
'''

from pandas.plotting import scatter_matrix  # pandas.tools.plotting is the old, removed path

scatter_matrix(df, alpha=0.2, figsize=(6, 6), diagonal='kde')
plt.show()
# show plots
# plt.show()