def version2():
    """Train a logistic-regression model on a cleaned and stemmed corpus.

    Every review in column 1 of the module-level ``df`` is reduced to
    letters only, lower-cased, stripped of English stop words and stemmed.
    The cleaned corpus is split into train and test parts, vectorized with
    a single vectorizer (fitted on the training part only) and used to fit
    the model.

    Returns:
        The fitted model returned by ``lr().fit(...)``.
    """
    # Build the stop-word set and the stemmer once; the original rebuilt
    # the set for every single word.
    stop_words = set(sw.words('english'))
    stemmer = ps()

    corpus = []
    # Iterate over every row so that the corpus stays aligned with `labels`
    # (the original hard-coded 527383 rows).
    for i in range(len(df)):
        # Keep letters only, then normalise case and tokenise on whitespace.
        review = re.sub('[^a-zA-Z]', ' ', df.iloc[i, 1])
        words = review.lower().split()
        corpus.append(
            " ".join(stemmer.stem(word) for word in words
                     if word not in stop_words))

    labels = df.iloc[:, -1]

    # NOTE(review): the original passed a bare positional `100` and threw the
    # result away; it is interpreted here as random_state=100 -- confirm.
    features_train, features_test, labels_train, labels_test = \
        train_test_split(corpus, labels, random_state=100)

    # One vectorizer: fit on the training text, reuse it for the test text.
    vectorizer = cv()
    features_train_vectorized = vectorizer.fit_transform(features_train)
    features_test_vectorized = vectorizer.transform(features_test)

    model = lr().fit(features_train_vectorized, labels_train)
    predictions = model.predict(features_test_vectorized)

    # NOTE(review): as in the original, the results of these two calls are
    # not stored or printed here.
    ras(labels_test, predictions)
    cm(labels_test, predictions)
    return model
def loop_through_csv(df):
    """Report confusion matrices for columns 1, 4, ..., 28 of ``df``.

    For each selected index the predicted and true values are fetched via
    ``set_pred`` / ``set_true``, the confusion matrix and classification
    report are printed, and the rates returned by ``return_rates`` are
    collected.

    Returns:
        A tuple ``(FPR, TPR)`` of lists with one entry per run.
    """
    false_positive_rates = []
    true_positive_rates = []

    # `run_number` is the 1-based counter shown in the output.
    for run_number, position in enumerate(range(1, 29, 3), start=1):
        y_pred = set_pred(df, position).copy()
        y_true = set_true(df, position).copy()

        print("Run " + str(run_number))

        # Library-computed matrix and summary, printed as a cross-check.
        print("Confusion Matrix with summary using Sklearn to Check")
        print(cm(y_true, y_pred))
        cfm = cm(y_true, y_pred)
        print(classification_report(y_true, y_pred))

        # The same table built with a pandas cross-tabulation.
        print("Calculated Confusion Matrix with Summary that is Calculated")
        print(pd.crosstab(y_true, y_pred))
        labelled_table = pd.crosstab(y_true,
                                     y_pred,
                                     rownames=['True'],
                                     colnames=['Predicted'],
                                     margins=True)
        print(labelled_table)

        TP, FP, TN, FN = set_plot(cfm, run_number)
        print_metrics(TP, FP, TN, FN)

        run_fpr, run_tpr = return_rates(TP, FP, TN, FN)
        false_positive_rates.append(run_fpr)
        true_positive_rates.append(run_tpr)

    return false_positive_rates, true_positive_rates
def train_predict(classifier, sample_size, X_train, X_test, y_train, y_test, typ):
    """Fit ``classifier`` on a slice of the training data and evaluate it.

    Args:
        classifier: the learning algorithm to be trained and used to predict.
        sample_size: number of leading training samples used for fitting.
        X_train: features of the training set (indexed as ``X[rows, :]``).
        X_test: features of the test set.
        y_train: training labels.
        y_test: test labels.
        typ: selects the confusion-matrix layout: 1 uses labels 1..6,
            2 uses labels 1..12, 3 uses labels 1..7.

    Returns:
        A tuple ``(results, confusion_matrix_df)``. ``results`` holds
        ``train_time``, ``pred_time``, ``acc_train`` (on the first 3000
        training samples) and ``acc_test``. ``confusion_matrix_df`` is the
        labelled confusion matrix, passed through ``full_confusion_matrix``
        when the whole training set was used.

    Raises:
        ValueError: if ``typ`` is not 1, 2 or 3.
    """
    base_names = ['WK', 'WU', 'WD', 'SI', 'ST', 'LD']
    names_by_typ = {
        1: base_names,
        2: base_names + ['St-Si', 'Si-St', 'Si-Li', 'Li-Si', 'St-Li', 'Li-St'],
        3: base_names + ['PT'],
    }
    # Fail before training: the original only failed with a NameError once
    # the confusion matrix was about to be built.
    if typ not in names_by_typ:
        raise ValueError("typ must be 1, 2 or 3, got {!r}".format(typ))
    names = names_by_typ[typ]
    labels = list(range(1, len(names) + 1))

    results = {}

    # Time the fit on the first `sample_size` training samples.
    start = timer()
    classifier = classifier.fit(X_train[0:sample_size, :], y_train[0:sample_size])
    end = timer()
    results['train_time'] = end - start

    # Time prediction on the test set plus the first 3000 training samples.
    start = timer()
    predictions_test = classifier.predict(X_test)
    predictions_train = classifier.predict(X_train[:3000, :])
    end = timer()
    results['pred_time'] = end - start

    results['acc_train'] = accuracy(y_train[:3000], predictions_train)
    results['acc_test'] = accuracy(y_test, predictions_test)

    matrix = cm(y_test, predictions_test, labels=labels, sample_weight=None)
    confusion_matrix_df = pd.DataFrame(data=matrix, columns=names, index=names)
    if sample_size == len(X_train):
        # 100% of the training data was used: extend the table.
        confusion_matrix_df = confusion_matrix_df.pipe(full_confusion_matrix)

    return (results, confusion_matrix_df)
def classifier(self):
    """Fit a logistic regression on bucketed wine quality and print diagnostics.

    ``quality`` is bucketed into three ranges (<=4 -> 0, <=7 -> 1, else 2)
    and ``type`` is encoded as 0 for ``'white'`` and 1 otherwise. The model
    is trained on an 80/20 split; train/test accuracy, a heat map of the
    test confusion matrix and a few of its cells are reported.
    """
    db = self.db_prepared.copy()
    db['quality_range'] = db.quality.apply(
        lambda q: 0 if q <= 4 else 1 if q <= 7 else 2)
    db['type'] = db.type.apply(lambda q: 0 if q == 'white' else 1)

    feature_columns = [
        'type', 'alcohol', 'density', 'volatile acidity', 'chlorides',
        'citric acid', 'fixed acidity', 'free sulfur dioxide',
        'total sulfur dioxide', 'sulphates', 'residual sugar', 'pH'
    ]
    X = db[feature_columns]
    y = db.quality_range
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=40)

    lr = LogisticRegression(random_state=40)
    lr.fit(X_train, y_train)
    train_accuracy = lr.score(X_train, y_train)
    test_accuracy = lr.score(X_test, y_test)
    print('One-vs-rest',
          '-' * 35,
          'Accuracy in Train Group : {:.2f}'.format(train_accuracy),
          'Accuracy in Test Group : {:.2f}'.format(test_accuracy),
          sep='\n')

    # Predict once; the original predicted the test set twice and fitted an
    # identical second model only to score it.
    predictions = lr.predict(X_test)
    exact_accuracy = accuracy_score(y_test, predictions)
    score = round(exact_accuracy, 3)
    confusion_matrix_test = cm(y_test, predictions)

    sns.heatmap(confusion_matrix_test, annot=True, fmt=".0f")
    plt.xlabel('Predicted Values')
    plt.ylabel('Actual Values')
    plt.title('Accuracy Score: {0}'.format(score), size=15)
    plt.show()

    # NOTE(review): the target has up to three classes, so these four cells
    # only describe the first two of them -- confirm this is intended.
    TN = confusion_matrix_test[0][0]
    TP = confusion_matrix_test[1][1]
    FP = confusion_matrix_test[0][1]
    FN = confusion_matrix_test[1][0]
    print("(Total) True Negative :", TN)
    print("(Total) True Positive :", TP)
    # The two labels below used to read "Negative Positive" and
    # "Negative Negative".
    print("(Total) False Positive :", FP)
    print("(Total) False Negative :", FN)

    print("Accuracy Score of Our Model : ", test_accuracy)
    Error_Rate = 1 - exact_accuracy
    print("Error rate: ", Error_Rate)
def version1():
    """Train a bag-of-words logistic-regression model on the raw reviews.

    Uses the ``reviewText`` and ``Positivity`` columns of the module-level
    ``df``.

    Returns:
        The fitted model returned by ``lr().fit(...)``.
    """
    # NOTE(review): the original passed a bare positional `100` and threw the
    # result away; it is interpreted here as random_state=100 -- confirm.
    features_train, features_test, labels_train, labels_test = \
        train_test_split(df["reviewText"], df["Positivity"], random_state=100)

    # The same fitted vectorizer must transform the test set; the original
    # created a second, unfitted instance for it.
    vectorizer = cv()
    features_train_vectorized = vectorizer.fit_transform(features_train)
    features_test_vectorized = vectorizer.transform(features_test)

    # Model creation for logistic regression.
    model = lr().fit(features_train_vectorized, labels_train)
    predictions = model.predict(features_test_vectorized)

    # NOTE(review): as in the original, the results of these two calls are
    # not stored or printed here.
    ras(labels_test, predictions)
    cm(labels_test, predictions)
    return model
def version3():
    """Train a TF-IDF based logistic-regression model on the raw reviews.

    Uses the ``reviewText`` and ``Positivity`` columns of the module-level
    ``df`` and stores the fitted vectorizer in the global ``vect``.

    Returns:
        The fitted model returned by ``lr().fit(...)``.
    """
    global vect

    # NOTE(review): the original passed a bare positional `100` and threw the
    # result away; it is interpreted here as random_state=100 -- confirm.
    features_train, features_test, labels_train, labels_test = \
        train_test_split(df["reviewText"], df["Positivity"], random_state=100)

    vect = TfidfVectorizer(min_df=5)
    features_train_vectorized = vect.fit_transform(features_train)
    features_test_vectorized = vect.transform(features_test)

    model = lr().fit(features_train_vectorized, labels_train)
    predictions = model.predict(features_test_vectorized)

    # NOTE(review): as in the original, the results of these two calls are
    # not stored or printed here.
    ras(labels_test, predictions)
    cm(labels_test, predictions)
    return model
def confusion_matrix_printed(self, actual_y, y_hat):
    """Print the error and accuracy percentages of binary predictions.

    Args:
        actual_y: true labels.
        y_hat: predicted labels.
    """
    # A flattened 2x2 confusion matrix from sklearn is ordered
    # (tn, fp, fn, tp); the original unpacked it as (tn, fn, tp, fp), which
    # counted true positives as errors.
    # Assumes `cm` is sklearn.metrics.confusion_matrix -- TODO confirm.
    tn, fp, fn, tp = cm(actual_y, y_hat).ravel()
    error = fn + fp
    correct = tn + tp
    total = error + correct
    print(f'Error: {round(error / total, 4) * 100}%')
    print(f'Accuracy: {round(correct / total, 4) * 100}%')
def RandomForest_Classifier(self):
    """Classify ``veri["sex"]`` from age and BMI with a random forest.

    Trains on a 67/33 split of the module-level ``veri`` frame, plots the
    confusion matrix, prints it, and prints the number of correct and
    incorrect classifications.
    """
    xtrain, xtest, ytrain, ytest = tts(veri[["age", "bmi"]],
                                       veri["sex"],
                                       test_size=0.33)
    rfc = RFC(n_estimators=10)
    rfc.fit(xtrain, ytrain)
    tahmin = rfc.predict(xtest)

    # True labels go first so that rows are actual classes and columns are
    # predictions, which is what the index/column names below say. The
    # original passed the predictions first, transposing the table.
    # NOTE(review): which class lands in row 0 depends on the label
    # encoding of veri["sex"] -- confirm it matches the names used here.
    ConfMatris = cm(ytest, tahmin)
    ConfMatris = p.DataFrame(data=ConfMatris,
                             index=["Erkek Sayısı", "Kadın Sayısı"],
                             columns=["Erkek Tahmini", " Kadın Tahmin"])

    plt.title(
        "RANDOM FOREST ALGORİTMASINA GÖRE SINIFLANDIRMANIN GÖRSELLEŞTİRİLMESİ\n"
    )
    plt.pcolormesh(ConfMatris)
    plt.show()

    print(
        "RANDOM FOREST ALGORİTMASINA GÖRE SINIFLANDIRMA İÇİN KARMAŞIKLIK MATRİSİ"
    )
    print(ConfMatris)

    # Diagonal cells are correct classifications, off-diagonal are errors.
    dogru = ConfMatris.iloc[0, 0] + ConfMatris.iloc[1, 1]
    yanlis = ConfMatris.iloc[1, 0] + ConfMatris.iloc[0, 1]
    print(
        "\nDoğru Sınıflandırma Sayısı: {}\nYanlış Sınıflandırma Sayısı: {}"
        .format(dogru, yanlis))
def cm_labeled(clf, Xtest, ytest, threshold = 0.5):
    '''Return the confusion matrix of ``clf`` on the test data as a labeled frame.

    A sample is predicted positive when the second column of
    ``clf.predict_proba(Xtest)`` is at least ``threshold``. Rows are the
    actual class and columns the predicted class, positive first.
    '''
    second_column_scores = clf.predict_proba(Xtest)[:, 1]
    predicted_positive = second_column_scores >= threshold
    counts = cm(ytest, predicted_positive, labels=[1, 0])
    return pd.DataFrame(
        counts,
        columns=['Predicted positive', 'Predicted negative'],
        index=['Actually positive', 'Actually negative'],
    )
def SupportVectorMachine(self):
    """Classify ``veri["sex"]`` from age and BMI with a linear SVM.

    Trains on a 67/33 split of the module-level ``veri`` frame, plots the
    confusion matrix, prints it, and prints the number of correct and
    incorrect classifications.
    """
    xtrain, xtest, ytrain, ytest = tts(veri[["age", "bmi"]],
                                       veri["sex"],
                                       test_size=0.33)
    supportvector = SVC(kernel="linear")
    supportvector.fit(xtrain, ytrain)
    tahmin = supportvector.predict(xtest)

    # True labels go first so that rows are actual classes and columns are
    # predictions, which is what the index/column names below say. The
    # original passed the predictions first, transposing the table.
    # NOTE(review): which class lands in row 0 depends on the label
    # encoding of veri["sex"] -- confirm it matches the names used here.
    ConfMatris = cm(ytest, tahmin)
    ConfMatris = p.DataFrame(data=ConfMatris,
                             index=["Erkek Sayısı", "Kadın Sayısı"],
                             columns=["Erkek Tahmini", " Kadın Tahmin"])

    plt.title(
        "DESTEK VEKTÖR MAKİNESİNE GÖRE SINIFLANDIRMANIN GÖRSELLEŞTİRİLMESİ\n"
    )
    plt.pcolormesh(ConfMatris)
    plt.show()

    print("DESTEK VEKTÖR MAKİNESİ İÇİN KARMAŞIKLIK MATRİSİ")
    print(ConfMatris)

    # Diagonal cells are correct classifications, off-diagonal are errors.
    dogru = ConfMatris.iloc[0, 0] + ConfMatris.iloc[1, 1]
    yanlis = ConfMatris.iloc[1, 0] + ConfMatris.iloc[0, 1]
    print(
        "\nDoğru Sınıflandırma Sayısı: {}\nYanlış Sınıflandırma Sayısı: {}"
        .format(dogru, yanlis))
def acc(loader):
    """Evaluate the module-level ``bid_lstm_cnn`` model on every batch of ``loader``.

    Prints the accuracy, the F1 score and the four confusion-matrix cells.

    Args:
        loader: iterable of batches exposing ``question_text`` and
            ``target``.

    Returns:
        A tuple ``(accuracy, f1_score)``.

    Raises:
        ZeroDivisionError: if ``loader`` yields no samples.
    """
    correct = 0
    act = np.array([])
    pred = np.array([])

    for batch in loader:
        inputs = batch.question_text.to(device).long()
        scores = bid_lstm_cnn(inputs).cpu().detach().numpy()
        target = batch.target.numpy()
        # Predicted class = position of the highest score in each row.
        preds = np.array([np.argmax(row) for row in scores])

        correct += int(np.sum(target == preds))
        act = np.concatenate((act, target))
        pred = np.concatenate((pred, preds))

    # Divide by the number of samples actually seen. The original divided by
    # num_batches * batch_size, which under-reports accuracy whenever the
    # last batch is smaller than batch_size.
    ass = correct / len(act)
    print(ass)

    formula1 = f1(act, pred)
    print(formula1)

    tn, fp, fn, tp = cm(act, pred).ravel()
    print(
        'True positives -> {}\nFalse positives -> {}\nTrue negatives -> {}\nFalse negatives -> {}\n'
        .format(tp, fp, tn, fn))
    return ass, formula1
def error_display(result, num=1):
    """Print the confusion matrix and per-class precision/recall for ``result``.

    Args:
        result: table-like object with ``gt_class`` (true class),
            ``pre_class`` (predicted class) and ``ap`` entries.
        num: unused; kept for interface compatibility.

    Returns:
        None. Everything is reported through ``print``.
    """
    y_true = result['gt_class']
    y_pred = result['pre_class']
    # Assumes `cm` is sklearn.metrics.confusion_matrix, i.e. rows are true
    # classes and columns are predicted classes -- TODO confirm.
    cmatrix = cm(y_true, y_pred)

    print('confusion matrix:\n')
    print(cmatrix)

    pres = []
    recs = []
    # NOTE(review): the class count is hard-coded to 4, as in the original.
    for i in range(4):
        # Precision divides by everything *predicted* as class i (column
        # sum) and recall by everything that *is* class i (row sum). The
        # original had the two denominators swapped. The +1 keeps the
        # original guard against an empty row/column.
        precison = cmatrix[i, i] / (sum(cmatrix[:, i]) + 1) * 100
        recall = cmatrix[i, i] / (sum(cmatrix[i]) + 1) * 100
        pres.append(precison)
        recs.append(recall)
        print('precision and recall on class%d : %d%%  %d%% \n' %
              (i, precison, recall))

    print('total validation samples num : %d' % len(result))
    print('mean presicion : %d%%' % (np.mean(pres)))
    print('mean recall : %d%%' % (np.mean(recs)))
    print('mean ap : %d%% ' % (np.mean(result['ap']) * 100))
    return
def cm_f1_test(model, test_data, test_labels):
    """Return the test confusion matrix and the two lowest per-class F1 scores.

    Returns:
        ``(matrix, (indices, scores))`` where ``indices`` are the positions
        of the two smallest per-class F1 scores and ``scores`` their values.
    """
    predicted = model.predict(test_data)
    per_class_f1 = f1(test_labels, predicted, average=None)

    # Ascending order, so the weakest classes come first.
    ascending = per_class_f1.argsort()
    sorted_f1 = per_class_f1[ascending]

    weakest = (ascending[:2], sorted_f1[:2])
    return cm(test_labels, predicted), weakest
def train(ctx, vocab_size, num_classes, filter_num, batch_size,
          word_embed_size, training_steps, learning_rate, print_loss_every,
          confusion_matrix, keep_proba, filter_sizes, save_model):
    """Train a ``TextCNN`` and optionally save it and plot a confusion matrix.

    Args:
        ctx: object exposing ``train_path`` and ``test_path``.
        vocab_size, num_classes, filter_num, word_embed_size, filter_sizes:
            forwarded to ``TextCNN``.
        batch_size: number of training rows fed per step.
        training_steps: number of optimisation steps.
        learning_rate: learning rate of the Adam optimiser.
        print_loss_every: report cost/accuracy every this many steps.
        confusion_matrix: when truthy, print and plot the test confusion
            matrix after training.
        keep_proba: value fed to ``cnn.keep_proba`` during training.
        save_model: when truthy, save the session to ``SESS_PATH``.
    """
    # Load dataset
    (x_train, y_train), (x_test, y_test) = get_dataset(ctx.train_path,
                                                       ctx.test_path)
    sequence_length = x_train.shape[1]
    dataset_size = x_train.shape[0]

    tf.reset_default_graph()
    with tf.Graph().as_default():
        cnn = TextCNN(sequence_length, vocab_size, word_embed_size,
                      filter_sizes, filter_num, num_classes)

        # Evaluation feeds use keep_proba 1.0.
        train_feed_dict = {
            cnn.input_x: x_train,
            cnn.input_y: y_train,
            cnn.keep_proba: 1.0
        }
        test_feed_dict = {
            cnn.input_x: x_test,
            cnn.input_y: y_test,
            cnn.keep_proba: 1.0
        }

        saver = tf.train.Saver()
        train_step = tf.train.AdamOptimizer(learning_rate).minimize(cnn.loss)

        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())

            for i in range(training_steps):
                # Walk through the training set in wrapping windows.
                start = (i * batch_size) % dataset_size
                end = min(start + batch_size, dataset_size)
                feed_dict = {
                    cnn.input_x: x_train[start:end],
                    cnn.input_y: y_train[start:end],
                    cnn.keep_proba: keep_proba
                }
                sess.run(train_step, feed_dict=feed_dict)

                if i % print_loss_every == 0:
                    avg_cost = cnn.loss.eval(feed_dict=feed_dict)
                    train_acc = cnn.accuracy.eval(feed_dict=train_feed_dict)
                    test_acc = cnn.accuracy.eval(feed_dict=test_feed_dict)
                    print(f"Epoch: {i:04d} | AvgCost: {avg_cost:7.4f}", end="")
                    print(f" | Train/Test ACC: {train_acc:.3f}/{test_acc:.3f}")

            # After training, save the session.
            if save_model:
                saver.save(sess, SESS_PATH)
                print('Model state has been saved!')

            if confusion_matrix:
                # Predict with the final weights. The original reused the
                # predictions of the last reporting step, which were stale
                # (or undefined when no reporting step ran).
                test_pred = cnn.pred.eval(feed_dict=test_feed_dict)
                binary = cm(y_true=y_test, y_pred=test_pred)
                print('\n', 'Confusion Matrix: ')
                print(binary)
                plot_confusion_matrix(binary)
                plt.show()
def KNN_Classification(self):
    """Classify ``veri["sex"]`` from age and BMI with k-nearest neighbours (k=3).

    Trains on a 67/33 split of the module-level ``veri`` frame, plots the
    confusion matrix, prints it, and prints the number of correct and
    incorrect classifications.
    """
    xtrain, xtest, ytrain, ytest = tts(veri[["age", "bmi"]],
                                       veri["sex"],
                                       test_size=0.33)
    knn = KNN(n_neighbors=3)
    knn.fit(xtrain, ytrain)
    tahmin = knn.predict(xtest)

    # True labels go first so that rows are actual classes and columns are
    # predictions, which is what the index/column names below say. The
    # original passed the predictions first, transposing the table.
    # NOTE(review): which class lands in row 0 depends on the label
    # encoding of veri["sex"] -- confirm it matches the names used here.
    ConfMatris = cm(ytest, tahmin)
    ConfMatris = p.DataFrame(data=ConfMatris,
                             index=["Erkek Sayısı", "Kadın Sayısı"],
                             columns=["Erkek Tahmini", " Kadın Tahmin"])

    plt.title(
        "K-NN SINIFLANDIRMASINA GÖRE SINIFLANDIRMANIN GÖRSELLEŞTİRİLMESİ\n"
    )
    plt.pcolormesh(ConfMatris)
    plt.show()

    print("K-NN SINIFLANDIRMASI İÇİN KARMAŞIKLIK MATRİSİ")
    print(ConfMatris)

    # Diagonal cells are correct classifications, off-diagonal are errors.
    dogru = ConfMatris.iloc[0, 0] + ConfMatris.iloc[1, 1]
    yanlis = ConfMatris.iloc[1, 0] + ConfMatris.iloc[0, 1]
    print(
        "\nDoğru Sınıflandırma Sayısı: {}\nYanlış Sınıflandırma Sayısı: {}"
        .format(dogru, yanlis))
def update(self, model, trnx, trny, tstx, tsty, prediction, count, con, score):
    """Fit ``model`` and record its results in column ``count`` of the outputs.

    ``prediction[:, count]`` first receives the raw test predictions, whose
    confusion matrix is appended to ``con``; the column is then scaled by
    the test accuracy. ``score[:, count]`` is filled with that accuracy.
    All three containers are modified in place.
    """
    model.fit(trnx, trny)
    test_accuracy = model.score(tstx, tsty)

    # Store the raw predictions and build the matrix from the stored column.
    prediction[:, count] = model.predict(tstx)
    stored_column = prediction[:, count]
    con.append(cm(tsty, stored_column))

    # Weight the stored predictions by the model's accuracy.
    prediction[:, count] = prediction[:, count] * test_accuracy
    score[:, count] = test_accuracy
def confusion_matrix(self, X, y):
    """Predict labels for ``X`` and return a confusion matrix with format:

        -----------
        | TP | FP |
        -----------
        | FN | TN |
        -----------

    The layout legend above is also printed.

    Parameters
    ----------
    X : raw samples, transformed with ``self._vectorizer``
    y : ndarray - 1D, true labels

    Returns
    -------
    ndarray - 2D
    """
    x_vectorized = self._vectorizer.transform(X)
    y_pred = self._classifier.predict(x_vectorized)
    [[tn, fp], [fn, tp]] = cm(y, y_pred)

    # Single-argument print calls behave the same on Python 2 and Python 3;
    # the original print statements were Python-2 only.
    print('-----------')
    print('| TP | FP |')
    print('-----------')
    print('| FN | TN |')
    print('-----------')
    return np.array([[tp, fp], [fn, tn]])
def cosine_similar_measure(test_firingtime, y_test, a, b, c, avg_class_dist):
    """Assign each test vector to the most cosine-similar class vector.

    Prints accuracy (as a percentage), macro precision, recall and F1, and
    the confusion matrix. ``a``, ``b`` and ``c`` are not used.

    Returns:
        The list of predicted class indices.
    """
    y_pred_val = []
    for sample in test_firingtime:
        sample_row = sample.reshape(1, len(sample))
        similarities = [
            cosine_similarity(sample_row, centre.reshape(1, len(centre)))
            for centre in avg_class_dist
        ]
        # Index of the class vector with the highest similarity.
        y_pred_val.append(np.argmax(similarities))

    from sklearn.metrics import (precision_score, recall_score, f1_score,
                                 accuracy_score, mean_squared_error,
                                 mean_absolute_error)
    accuracy = accuracy_score(y_test, y_pred_val) * 100
    recall = recall_score(y_test, y_pred_val, average="macro")
    precision = precision_score(y_test, y_pred_val, average="macro")
    f1 = f1_score(y_test, y_pred_val, average="macro")

    for heading, value in (("accuracy", accuracy),
                           ("precision", precision),
                           ("recall", recall),
                           ("f1score", f1)):
        print(heading)
        print("%.3f" % value)

    from sklearn.metrics import confusion_matrix as cm
    matrix = cm(y_test, y_pred_val)
    print("Confusion matrix\n", matrix)
    return y_pred_val
def print_metrics(self, predicted_output):
    """
    Print some MVP metrics.

    Prints the accuracy, the four confusion-matrix cells (true positive,
    false negative, false positive and true negative) and the
    classification report for ``self.y_test`` against the given
    predictions.

    :param predicted_output: Predicted labels
    """
    res = cm(self.y_test, predicted_output)
    # Assuming `cm` is sklearn.metrics.confusion_matrix, rows are true
    # classes and columns are predictions with labels sorted ascending, so
    # the negative class sits at index 0: [[tn, fp], [fn, tp]]. The original
    # read TP and TN from each other's cells.
    tn = res[0][0]
    fp = res[0][1]
    fn = res[1][0]
    tp = res[1][1]
    print("Accuracy: ", acs(self.y_test, predicted_output))
    print("TP: ", tp, ", FN: ", fn, ", FP: ", fp, "TN: ", tn)
    print(cr(self.y_test, predicted_output))
def eval(Y_true, Y_pred):
    """Return the element-wise mean and standard deviation of several confusion matrices.

    One confusion matrix is computed against ``Y_true`` for every item of
    ``Y_pred``.

    Returns:
        A tuple ``(cm_mean, cm_std)``.
    """
    stacked = np.array([cm(Y_true, single_run) for single_run in Y_pred])
    return np.mean(stacked, axis=0), np.std(stacked, axis=0)
def main():
    """Split the data, predict the test labels and print the evaluation metrics."""
    train_data, train_labels, test_data, test_labels = test_train_split()

    # Only the first 57 columns of the test data are passed to predict().
    test_features = test_data[:, :57]
    predicted_output = predict(train_data, test_features)

    matrix = cm(test_labels, predicted_output)
    print("confusion matrix : \n", matrix)

    recall_value = recall(test_labels, predicted_output)
    print("Recall : ", recall_value)

    accuracy_percent = accuracy_score(test_labels, predicted_output) * 100
    print("Accuracy:", accuracy_percent, "%")

    precision_value = precision_score(test_labels, predicted_output)
    print("precision : ", precision_value)
def lrw():
    """Write the classifier, its report and confusion matrix to the log, then reopen it.

    Uses the module-level ``clfr``, ``y_test``, ``y_``, ``lw``, ``log`` and
    ``log_file``. Closing and reopening the log in append mode forces the
    written text to disk.
    """
    # `log` is rebound below, so it must be declared global; without this
    # declaration `log.close()` raised UnboundLocalError.
    global log

    lw(str(clfr) + '\n')
    lw(cr(y_test, y_))
    lw('\n\n')
    lw(str(cm(y_test, y_)))
    lw('\n\n')

    log.close()
    log = open(log_file, "a")
    # NOTE(review): if `lw` is bound to the old file object's write method
    # it must be rebound after reopening -- confirm against its definition.
def self_cm(X, y):
    """Print the confusion matrix of the module-level ``gs3`` on a held-out 30% split.

    Args:
        X: feature matrix.
        y: labels.
    """
    # `sklearn.cross_validation` was removed from scikit-learn; prefer its
    # replacement and fall back only on old installations.
    try:
        from sklearn.model_selection import train_test_split
    except ImportError:
        from sklearn.cross_validation import train_test_split
    from sklearn.metrics import confusion_matrix as cm

    # Only the test part of the split is needed here.
    _, X_test, _, y_test = train_test_split(X,
                                            y,
                                            test_size=0.3,
                                            random_state=0)
    print(cm(y_test, gs3.predict(X_test)))
def confusion_matrix(truth, predictions):
    """Return the confusion matrix as a DataFrame labeled with the class names.

    Rows and columns both follow the fixed class order defined below.
    """
    classes = ['art', 'eve', 'geo', 'gpe', 'nat', 'org', 'per', 'tim']
    counts = cm(truth, predictions, labels=classes)
    return pd.DataFrame(counts, index=classes, columns=classes)
def compAccu(X_dev, y_dev_df, pred_y, vocab, k=1):
    """Compute tagging accuracy and write error diagnostics to disk.

    Walks the words of ``X_dev['word']`` together with the predicted tags
    ``pred_y`` and the true tags ``y_dev_df['tag']``. Entries whose true tag
    is ``'*'`` or ``'<STOP>'`` are skipped. Besides the accuracy, it prints
    the share of sentences with at least one error (per
    ``correct_sentence``), writes ``unk_incorrect.csv`` and saves a
    confusion matrix over four noun tags to the file ``conf``.

    ``k`` is not used.

    Returns:
        ``correct / total`` over the tokens that were not skipped.
    """
    X_def_list = list(X_dev['word'])
    y_dev = list(y_dev_df['tag'])
    correct = 0
    total = 0
    unk_count = 0
    incorrect = []
    unk_incorrect = []
    unk_incorrect_label = []
    known_incorrect = []
    # print (pred_y)
    sentence_labels = []
    sentence_preds = []
    optimal = 0
    suboptimal = 0
    for word, pred, truth in zip(X_def_list, pred_y, y_dev):
        # print (pred, dev)
        if truth == '*' or truth == '<STOP>':
            continue
        # NOTE(review): the counter named `unk_count` is incremented when
        # is_unk() is False, and the branch below files is_unk() == True
        # under `known_incorrect`. The two uses are consistent with each
        # other, so is_unk() may be named opposite to what it returns --
        # confirm against its definition.
        if not is_unk(word, vocab):
            unk_count += 1
        if pred == truth:
            correct += 1
        else:
            incorrect.append([word, pred, truth])
            if is_unk(word, vocab):
                known_incorrect.append([word, pred, truth])
            else:
                unk_incorrect.append([word, pred, truth])
                unk_incorrect_label.append(pred)
                unk_incorrect_label.append(truth)
        total += 1
        sentence_labels.append(truth)
        sentence_preds.append(pred)
        # Sentence-final punctuation closes the current sentence.
        if word == '.' or word == '!' or word == '?':
            if not correct_sentence(sentence_preds, sentence_labels):
                suboptimal += 1
            else:
                optimal += 1
            sentence_labels = []
            sentence_preds = []
    # NOTE(review): the divisions below raise ZeroDivisionError when no
    # token was counted, no sentence was closed, or unk_count is 0.
    accu = correct/total
    print(
        f"Suboptimal: {suboptimal} / {optimal + suboptimal} = {suboptimal / (optimal + suboptimal)}")
    incorrect_df = pd.DataFrame(incorrect, columns=['X', 'pred', 'truth'])
    unk_incorrect_df = pd.DataFrame(
        unk_incorrect, columns=['X', 'pred', 'truth'])
    unk_incorrect_df.to_csv('unk_incorrect.csv')
    #print (incorrect_df)
    #print (unk_incorrect_df)
    #print ('known_accu: ', 1-len(known_incorrect)/(total-unk_count))
    print('unk_accu: ', 1-len(unk_incorrect)/unk_count)
    # Confusion matrix restricted to the four noun tags, saved as text.
    conf = cm(unk_incorrect_df['truth'], unk_incorrect_df['pred'], labels=[
        'NN', 'NNS', 'NNP', 'NNPS'])
    np.savetxt("conf", conf, delimiter=",", fmt='%3.0f')
    return accu
def confusion_matrix(self, y_true, y_pred, labels=None):
    """Compute the confusion matrix by delegating to ``cm``.

    :param y_true: numpy.array
    :param y_pred: numpy.array
    :param labels: list[str] | list[int]
    :rtype: numpy.array
    """
    matrix = cm(y_true, y_pred, labels=labels)
    return matrix
def ACC(gt, pred):
    """Return the confusion matrix and the mean per-class accuracy.

    Args:
        gt: ground-truth labels (lists are converted to arrays).
        pred: predicted labels (lists are converted to arrays).

    Returns:
        ``(matrix, mean_accuracy)`` where ``mean_accuracy`` is the mean of
        the diagonal of the row-normalised matrix.
    """
    from sklearn.metrics import confusion_matrix as cm

    gt, pred = (np.array(seq) if type(seq) == list else seq
                for seq in (gt, pred))

    counts = cm(gt, pred)
    # Normalise each row by the number of samples of that true class.
    row_totals = counts.sum(axis=1)[:, np.newaxis]
    normalised = counts.astype('float') / row_totals
    mean_accuracy = np.diag(normalised).mean()
    return counts, mean_accuracy
def evaluate(input_dir, corpus, metrics):
    """Evaluate a fine-tuned BERT classifier on a JSON-lines corpus.

    Args:
        input_dir: directory holding the saved model and tokenizer.
        corpus: path to a file with one JSON object per line, each with a
            ``text`` and a ``custom_label`` field.
        metrics: dict of lists with the keys ``f1-score``, ``recall``,
            ``precision`` and ``accuracy``; one value is appended to each.

    Returns:
        The updated ``metrics`` dict.

    Raises:
        ValueError: if the corpus contains no records.
    """
    classes = [
        "NOT", "PART-OF", "INTERACTOR", "REGULATOR-POSITIVE",
        "REGULATOR-NEGATIVE"
    ]

    # Close the file deterministically, skip blank lines (a trailing newline
    # used to crash json.loads) and parse each record only once.
    with open(corpus, "r") as data_file:
        records = [json.loads(line) for line in data_file if line.strip()]
    if not records:
        raise ValueError("corpus {!r} contains no records".format(corpus))

    # Fall back to the CPU when CUDA is unavailable.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Load model
    model = BertForSequenceClassification.from_pretrained(
        input_dir, local_files_only=True, cache_dir=None)
    tokenizer = BertTokenizer.from_pretrained(input_dir)
    model.to(device)

    predictions = []
    true_classes = []
    correct_predictions = 0

    # Predict classes; gradients are not needed for inference.
    with torch.no_grad():
        for record in records:
            true_class = record["custom_label"]
            input_ids = torch.tensor(tokenizer.encode(
                record["text"])).unsqueeze(0).to(device)
            outputs = model(input_ids)
            best = torch.softmax(outputs.logits, dim=1).argmax().item()
            pred_class = classes[best]
            if pred_class == true_class:
                correct_predictions += 1
            predictions.append(pred_class)
            true_classes.append(true_class)

    precision, recall, fscore, _ = score(true_classes,
                                         predictions,
                                         average='macro')

    # Print the classification report and confusion matrix
    print(classification_report(true_classes, predictions))
    print(
        cm(true_classes,
           predictions,
           labels=[
               "INTERACTOR", "NOT", "PART-OF", "REGULATOR-NEGATIVE",
               "REGULATOR-POSITIVE"
           ]))

    metrics["f1-score"].append(fscore)
    metrics["recall"].append(recall)
    metrics["precision"].append(precision)
    metrics["accuracy"].append(correct_predictions / len(records))
    return metrics
def conf_matrix(pred, test, tlist):
    '''Computes the confusion matrix over the predictions from the model.

    Note that ``test`` is modified in place: column 1 receives the
    predictions and both columns 0 and 1 are replaced by class names.

    -pred: set of predictions (indices into tlist)
    -test: DataFrame whose column 0 holds the ground truth (indices into tlist)
    -tlist: list of classes.

    Returns the confusion matrix and the array of distinct true classes
    that define its row/column order.'''
    test.loc[:,1] = pred
    # Map class indices to class names for both columns.
    test.loc[:,0] = [tlist[i] for i in test.loc[:,0]]
    test.loc[:,1] = [tlist[i] for i in test.loc[:,1]]
    classes = np.unique(test.loc[:,0])
    # `labels` must be passed by keyword: it is keyword-only in current
    # scikit-learn releases.
    conf_mat = cm(test.loc[:,0], test.loc[:,1], labels=classes)
    return conf_mat, classes
def confusion_matrix(y_true,
                     y_pred,
                     classes,
                     normalize=False,
                     title='Confusion matrix',
                     cmap=plt.cm.Blues):
    """Compute the confusion matrix for ``classes`` and hand it to ``plot_confusion_matrix``.

    ``y_true`` and ``y_pred`` must provide ``tolist()``; the remaining
    arguments are forwarded to the plotting helper unchanged.
    """
    true_values = y_true.tolist()
    predicted_values = y_pred.tolist()
    counts = cm(true_values, predicted_values, labels=classes)
    plot_confusion_matrix(counts,
                          classes,
                          normalize=normalize,
                          title=title,
                          cmap=cmap)
def plot_cm(y_true, y_pred):
    """Plot the confusion matrix of ``y_pred`` against ``y_true`` with cell counts.

    Assumes ``cm`` is sklearn's ``confusion_matrix`` (rows are true
    classes, columns are predicted classes) -- TODO confirm.
    """
    confmat = cm(y_true, y_pred)
    n_rows, n_cols = confmat.shape

    fig, ax = plt.subplots(figsize=(5, 5))
    ax.matshow(confmat, cmap=plt.cm.Blues, alpha=0.3)
    for i in range(n_rows):
        for j in range(n_cols):
            ax.text(x=j, y=i, s=confmat[i, j], va='center', ha='center')

    # One tick per class; the original hard-coded five ticks per axis.
    plt.xticks(np.arange(0, n_cols, 1))
    plt.yticks(np.arange(0, n_rows, 1))

    # matshow draws rows along the y axis and columns along the x axis, so
    # x is the predicted label and y the true label. The original had the
    # two captions swapped.
    plt.xlabel('predicted label')
    plt.ylabel('true label')
    plt.show()
def objective_recall(W, X1, X2, X1_label, X2_label):
    """
    Fairness objective based on recall

    Both groups are scored with ``prepare_normalized_weighted_sum`` and
    thresholded at 0.5. The recall of each group is computed and the
    objective is ``-(1 - ratio)``, where ``ratio`` comes from
    ``get_ratio(recall1, recall2)``.
    """
    weighted_sum_m_norm, weighted_sum_f_norm = prepare_normalized_weighted_sum(
        W, X1, X2)

    # A flattened 2x2 confusion matrix from sklearn is ordered
    # (tn, fp, fn, tp). The original unpacked it as (TP, FN, FP, TN), so its
    # "recall" was really tn / (tn + fp), i.e. the specificity.
    # Assumes `cm` is sklearn.metrics.confusion_matrix -- TODO confirm.
    predictions1 = [1 if w >= 0.5 else 0 for w in weighted_sum_m_norm]
    TN1, FP1, FN1, TP1 = cm(X1_label, predictions1).ravel()
    recall1 = TP1 / (TP1 + FN1)

    predictions2 = [1 if w >= 0.5 else 0 for w in weighted_sum_f_norm]
    TN2, FP2, FN2, TP2 = cm(X2_label, predictions2).ravel()
    recall2 = TP2 / (TP2 + FN2)

    ratio = get_ratio(recall1, recall2)
    return -(1 - ratio)
def test(self):
    """Evaluate the model on the test set and print its accuracy and confusion matrix.

    The predicted class indices are stored in ``self.pred_te``.
    """
    separator = "*" * 20

    score = self.model.evaluate(self.X_te,
                                self.y_te,
                                batch_size=self.bs,
                                verbose=1)
    print(separator)
    print("Test acc of ", score[1])

    # Reduce both the model outputs and the targets with argmax over axis 1.
    self.pred_te = self.model.predict(self.X_te).argmax(1)
    true_classes = self.y_te.argmax(1)
    matrix = cm(true_classes, self.pred_te)

    print('Test cm ==> ')
    print(matrix)
    print(separator)
def kappa(y_true, y_pred):
    """Return the quadratic weighted kappa between two integer ratings.

    Args:
        y_true: non-negative integer ratings (ground truth).
        y_pred: non-negative integer ratings (predictions).

    Returns:
        ``1 - sum(W * O) / sum(W * E)`` with quadratic weights ``W``, the
        observed confusion matrix ``O`` and the expected matrix ``E`` built
        from the two rating histograms.
    """
    N = max(max(y_true), max(y_pred)) + 1

    # Force an N x N matrix. Without explicit labels the matrix only covers
    # ratings that actually occur, which breaks the element-wise product
    # with W whenever a rating in 0..N-1 is missing.
    O = cm(y_true, y_pred, labels=np.arange(N))

    # Quadratic disagreement weights, scaled to [0, 1].
    ratings = np.arange(N)
    W = ((ratings[:, None] - ratings[None, :]) ** 2).astype("float32")
    W /= (N - 1) ** 2

    hist_true = np.bincount(y_true, minlength=N)
    hist_pred = np.bincount(y_pred, minlength=N)
    # Expected counts if the two ratings were independent.
    E = np.outer(hist_true, hist_pred).astype("float32") / len(y_true)

    return 1 - (np.sum(W * O) / np.sum(W * E))
def classifier(file_name):
    """Fit a linear SVC on the bag-of-words data of ``file_name`` and print metrics.

    The metrics are computed on 10-fold cross-validated predictions.
    """
    review_sparse_vect, rating_sparse_vect = bag_of_words(file_name)

    # Support vector classifier, one vs all.
    svc_options = dict(C=1,
                       kernel='linear',
                       gamma=1,
                       verbose=False,
                       probability=False,
                       decision_function_shape='ovr')
    clf = SVC(**svc_options)
    clf.fit(review_sparse_vect, rating_sparse_vect)

    predicted = cv.cross_val_predict(clf,
                                     review_sparse_vect,
                                     rating_sparse_vect,
                                     cv=10)

    # Calculation of metrics.
    truth = rating_sparse_vect
    print("accuracy_score\t", acc_score(truth, predicted))
    print("precision_score\t", pre_score(truth, predicted))
    print("recall_score\t", rc_score(truth, predicted))
    print("\nclassification_report:\n\n", cr(truth, predicted))
    print("\nconfusion_matrix:\n", cm(truth, predicted))
def benchmark(self, clf, X_train, y_train, X_test, y_test):
    """Fit ``clf``, time training and prediction, and report its scores.

    Args:
        clf: classifier with ``fit`` and ``predict``; ``predict_proba`` and
            ``predict_log_proba`` are used when they work.
        X_train, y_train: training data.
        X_test, y_test: test data.

    Returns:
        ``(clf_descr, p_score_avg, r_score_avg, f_measure_avg, train_time,
        test_time, proba)``. ``proba`` is ``None`` when ``predict_proba``
        failed.
    """
    output(80 * '_')

    # fit
    output("Training:")
    t0 = time()
    clf.fit(X_train, y_train)
    train_time = time() - t0
    output("train time: %0.3fs" % train_time)

    # predict
    t0 = time()
    pred = clf.predict(X_test)
    # Probability estimates are optional. `except Exception` keeps the
    # best-effort behaviour without swallowing KeyboardInterrupt/SystemExit
    # as the original bare `except:` did.
    try:
        proba = clf.predict_proba(X_test)
    except Exception:
        proba = None
    # The log-probabilities are not used afterwards; the call is kept so
    # that the measured test time stays comparable with earlier runs.
    try:
        clf.predict_log_proba(X_test)
    except Exception:
        pass
    test_time = time() - t0
    output("test time: %0.3fs" % test_time)

    # get metrics for the positive class only (heavy class imbalance)
    # p_score = mlu.get_pos_precision(cm(y_test, pred))
    # r_score = mlu.get_pos_recall(cm(y_test, pred))
    # f_measure = mlu.get_f_measure(p_score, r_score)

    # get metrics
    p_scores, r_scores, f_measures, support = get_scores(
        y_test, pred, self.beta)
    p_score_avg = p_scores.mean()
    r_score_avg = r_scores.mean()
    f_measure_avg = f_measures.mean()
    output("precision: %0.3f \trecall: %0.3f" % (p_score_avg, r_score_avg))

    # output results
    output("Classification results:")
    output(cr(y_test, pred))
    output(cm(y_test, pred))

    # get the name of the classifier from its repr()
    clf_descr = str(clf).split('(')[0]
    return clf_descr, p_score_avg, r_score_avg, f_measure_avg, train_time, test_time, proba
print 'Fold [%s]'%(i) X_train = train_pca[train_index] Y_train = tr_labels_tr[train_index] X_cv = train_pca[cv_index] Y_cv = tr_labels_tr[cv_index] clf.fit(X_train, Y_train) blend_train_pca[cv_index,j] = clf.predict(X_cv) blend_test_pca[:,j] = clf.predict(test_pca) blend_test_pca[:,j] = blend_test_pca.mean(1) blend_test_train= np.append(blend_train,blend_train_pca,axis=1) blend_test_test = np.append(blend_test,blend_test_pca,axis=1) bclf = LogisticRegression() pred = bclf.fit(blend_test_train,tr_labels_tr) cm(pred,te_labels) ## class "OTHER" adaboost: base_clf = DecisionTreeClassifier(criterion='entropy', splitter='best', max_depth=None,class_weight='auto') n_estimators = [10, 20, 30, 40, 50] for n in n_estimators: clf = AdaBoostClassifier(base_estimator=base_clf, n_estimators=n, learning_rate=1.0, algorithm='SAMME.R', random_state=None) clf.fit(train_,tr_labels_tr) pred = clf.predict(valid_) print("Validation data:") cm(pred, tr_labels_va) print("with {n} estimators:".format(n=n)) pred_test = clf.predict(test_) print("Test data:") cm(pred_test, te_labels)
# log.act("creating full random sample") # randData=np.genfromtxt(randFilename,delimiter=" ") # botsData=np.genfromtxt(botsFilename,delimiter=" ") # print(randData.shape) # print(botsData.shape) # fullRandData=np.vstack((randData,botsData)) # log.sum(fullRandData.shape,"data set size") # log.sum(np.sum(fullRandData[:,1]),"number of bots") # log.sum(fullRandData.shape[0]-np.sum(fullRandData[:,1]),"number of people") # np.savetxt(fullRandFilename,fullRandData,delimiter="\t",fmt="%.2f") log.act("building classifier") data=np.genfromtxt(fullRandFilename,delimiter="\t") lr=LogisticRegression(penalty="l1",C=0.8) log.prevLine() X=np.c_[data[:,2:27],data[:,28]] print(X.shape) y=data[:,1] ids=np.argwhere(np.isnan(X)) print(ids.shape) lr.fit(X,y) # log.params("LogisticRegression(penalty=\"l1\",C=0.1)") log.sum(f1_score(lr.predict(X),y),"f1 score") log.sum(cm(y,lr.predict(X))," confusion matrix") print(lr.get_params()) print(lr.coef_) log.act("end")
def confusion_matrix(actual, prediction):
    """Print the confusion matrix of ``prediction`` against ``actual``."""
    matrix = cm(actual, prediction)
    # A single-argument print call behaves the same on Python 2 and
    # Python 3; the original print statement was Python-2 only.
    print(matrix)
from sklearn.ensemble import RandomForestClassifier


def test_logit_classifier(X_train, X_valid, X_test, y_train, y_valid, y_test,
                          param_c, pca=False):
    """Fit an L1 logistic regression for every ``C`` in ``param_c`` and report it.

    Args:
        X_train, X_valid, X_test: feature sets.
        y_train, y_valid, y_test: matching labels.
        param_c: iterable of regularisation strengths to try.
        pca: when true, all three feature sets are first projected with a
            PCA fitted on the training set.
    """
    # The original `if pca = True:` was a syntax error.
    if pca:
        pca_transform = PCA()
        X_train = pca_transform.fit_transform(X_train)
        X_valid = pca_transform.transform(X_valid)
        X_test = pca_transform.transform(X_test)
        print("--PCA-transformed data--")

    for c in param_c:
        # NOTE(review): class_weight='auto' is not accepted by newer
        # scikit-learn releases ('balanced' replaced it) -- confirm the
        # pinned version before changing it.
        clf = LogisticRegression(C=c, class_weight='auto', penalty='l1',
                                 dual=False)
        clf.fit(X_train, y_train)

        pred = clf.predict(X_valid)
        print("Validation data:")
        print("   parameter C = {num}".format(num=c))
        # NOTE(review): the result of cm() is not printed here; presumably
        # `cm` in this file reports the matrix itself and takes the
        # predictions first -- confirm against its definition.
        cm(pred, y_valid)

        pred_test = clf.predict(X_test)
        # The original string literal was missing its closing quote.
        print("Test data:")
        print("   parameter C = {num}".format(num=c))
        cm(pred_test, y_test)


def test_rf_classifier(X_train, X_valid, X_test, y_train, y_valid, y_test,
                       param_n, pca=False):
    """Fit a random forest for every tree count in ``param_n``.

    Args:
        X_train, X_valid, X_test: feature sets.
        y_train, y_valid, y_test: matching labels.
        param_n: iterable of ``n_estimators`` values to try.
        pca: when true, all three feature sets are first projected with a
            PCA fitted on the training set.
    """
    # The original `if pca = True:` was a syntax error.
    if pca:
        pca_transform = PCA()
        X_train = pca_transform.fit_transform(X_train)
        X_valid = pca_transform.transform(X_valid)
        X_test = pca_transform.transform(X_test)
        print("--PCA-transformed data--")

    for n_estimators in param_n:
        clf = RandomForestClassifier(n_estimators=n_estimators)
        # NOTE(review): the fitted classifier is neither evaluated nor
        # returned in the code visible here.
        clf.fit(X_train, y_train)
def start_split_data(data_list):
    """Run a 10-fold evaluation of the decision tree and plot its ROC curve.

    A shuffled deep copy of ``data_list`` is cut into ten consecutive test
    slices. For each fold a tree is built from the remaining rows via
    ``Node`` / ``build_tree`` and the test rows are classified with
    ``class_finder``. Afterwards the mean fold accuracy, the confusion
    matrix and the classification report over all predictions are printed
    and the ROC curve is drawn with matplotlib.

    NOTE(review): the nesting below was reconstructed from a source without
    usable indentation -- confirm it against the original file.
    """
    random_list = dc(data_list)
    random.shuffle(random_list)
    predicted_list = []
    # `mark` is the end of the current test slice; it persists across folds.
    mark = 0
    acc_list = []
    act_class_list = []
    for i in range(10):  # fold range
        test_list = []
        training_list = []
        # The `break` at the end of this body means it runs at most once
        # per fold; the `while` acts as a guard on `mark`.
        while (mark < int(len(random_list))):
            # Rows before the test slice go to the training set.
            for train_ele in range(0, mark):
                training_list.append(random_list[train_ele])
            else:
                # The for-loop has no break, so this branch always runs:
                # advance `mark` by a tenth of the data, take
                # [index, mark) as the test slice and the rest as training.
                index = mark
                mark = int(len(random_list) / 10) + index
                for test_element in range(index, mark):
                    test_list.append(random_list[test_element])
                for training_element in range(mark, int(len(random_list))):
                    training_list.append(random_list[training_element])
            # print(training_list)
            # fold completion
            # Reset the class-level state of Node before building a new tree.
            Node.children = []
            Node.leaf_children = []
            Node.temp_children = []
            Node.new_children = []
            Node.len_training_list = len(training_list)
            Node.old_pessi_err = (node_err_cal(training_list, max_class(
                training_list, class_column), class_column) + 1) / \
                Node.len_training_list
            root = Node(training_list)
            # print(root.data)
            root.node_type = 'root'
            build_tree(root)
            predicted_temp_list = []
            actual_list = []
            temp_root = dc(root)
            for test_element in test_list:
                actual_list.append(int(test_element[class_column]))
                found = int(class_finder(test_element, temp_root))
                predicted_temp_list.append(found)
                predicted_list.append(found)
            acc_list.append(
                accuracy(actual_list, predicted_temp_list, class_column))
            break
    print(mean(acc_list))
    act_class_list = class_list_gen(random_list)
    # print(len(act_class_list),len(predicted_list))
    # Drop trailing rows that never fell into a test slice so that both
    # lists have the same length.
    while (len(act_class_list) > len(predicted_list)):
        del act_class_list[-1]
    c_matrix = cm(act_class_list, predicted_list)
    print('Confusion matrix\n', c_matrix)
    c_report = cr(act_class_list, predicted_list)
    print("All Measures required for this data set \n", c_report)
    fpr, tpr, thd = rc(act_class_list, predicted_list)
    roc_auc = auc(fpr, tpr)
    if formula_input == 2:
        plt.title('ROC for %s with information gain(red) and gini(blue)' % file_name[0])
        plt.plot(fpr, tpr, label='%s AUC = %0.2f' % (formula_measure, roc_auc))
        plt.legend(loc='lower right')
    else:
        # NOTE(review): it is unclear from the source how many of the
        # following statements belong to this branch -- confirm.
        plt.title('ROC for %s ' % file_name[0])
        plt.plot(fpr, tpr, label='%s AUC = %0.2f' % (formula_measure, roc_auc))
        plt.plot(fpr, tpr, label='AUC = %0.2f' % roc_auc)
        plt.legend(loc='lower right')
        plt.plot([0, 1], [0, 1], 'r--')
        plt.xlim([-0.1, 1.2])
        plt.ylim([-0.1, 1.2])
        plt.ylabel('True Positive Rate')
        plt.xlabel('False Positive Rate')
def update(self, model, trnx, trny, tstx, tsty):
    """Fit ``model`` and store its test accuracy and confusion matrix on ``self``.

    Sets ``self.acc`` to ``model.score(tstx, tsty)`` and ``self.con`` to the
    confusion matrix of the test predictions.
    """
    model.fit(trnx, trny)
    self.acc = model.score(tstx, tsty)

    predicted_labels = model.predict(tstx)
    self.con = cm(tsty, predicted_labels)
def main(train_file, test_file, load_method="csv", opti_method=None, maxiter=100,
         batch_size=-1, units=None, lmbda=0, alpha=100, beta=1000):
    """
    Manages files and operations for the neural network model creation, training, and testing.

    The training and test sets are loaded and scaled, a NeuralNetClassifier is fit on the
    training set, the test-set predictions and accuracy are printed, and the model is saved
    to "NNCModel.p".

    @parameters:
        train_file - the training dataset file (loaded and shuffled)
        test_file - the test dataset file
        load_method - the dataset file format, either "csv" or "hdf"
        opti_method - specifies the optimization method to use, "l-bfgs", "cg", or
                      None (defaults to SGD)
        maxiter - the maximum number of iterations allowed for training
        batch_size - the number of instance for each mini-batch, -1 implies batch processing
        units - a sequence of integers separated by '.' such that each integer represents
                the number of units in a sequence of hidden layers.
        lmbda - the regularization term
        alpha - the numerator for the learning rate schedule (relevant for SGD only)
        beta - the denominator for the learning rate schedule (relevant for SGD only)

    @raises:
        ValueError - if load_method is neither "csv" nor "hdf"
    """
    # Every print below takes one pre-formatted string, so the output is the
    # same under the Python 2 print statement and the Python 3 function.

    # open and load the dataset files; only the training set is shuffled
    if load_method == "csv":
        X_train, y_train = mlu.load_csv(train_file, True)
        X_test, y_test = mlu.load_csv(test_file)
    elif load_method == "hdf":
        X_train, y_train = mlu.loadh(train_file, True)
        X_test, y_test = mlu.loadh(test_file)
    else:
        raise ValueError(
            "Dataset file type not recognized: acceptable formats are 'csv' and 'hdf'.")

    # perform feature scaling
    # NOTE(review): the test set is scaled independently of the training
    # set -- confirm this is intended.
    X_train = mlu.scale_features(X_train, 0.0, 1.0)
    X_test = mlu.scale_features(X_test, 0.0, 1.0)

    # create the neural network classifier
    NNC = NeuralNetClassifier(opti_method, maxiter, batch_size, units, lmbda, alpha, beta)
    print("\nCreated a neural network classifier\n\t%s" % (NNC,))

    # fit the model to the loaded training data
    print("\nFitting the training data...")
    NNC.fit(X_train, y_train)

    # predict the results for the test data
    print("\nGenerating probability prediction for the test data...")
    y_pred = NNC.predict(X_test)

    ### output classification results ###
    # output class prediction probability for each instance in the test set
    print("\nThe probabilities for each instance in the test set are:\n")
    for prob in NNC.predict_proba(X_test):
        print(prob)

    # output accuracy
    print("Accuracy:  %s" % (mlu.compute_accuracy(y_test, y_pred),))

    # output sklearn style results if the module is available
    try:
        from sklearn.metrics import classification_report as cr
        from sklearn.metrics import confusion_matrix as cm
    except ImportError:
        # scikit-learn is optional: without it the extra report is skipped
        cr = cm = None
    if cr is not None:
        try:
            report = cr(y_test, y_pred)
            matrix = cm(y_test, y_pred)
        except Exception as err:
            # The report is best-effort, but a failure is no longer silent;
            # the model is still saved below.
            print("Could not produce the sklearn report: %s" % (err,))
        else:
            print("")
            print("Classification results:")
            print(report)
            print(matrix)

    # save model parameters as a pickle
    NNC.save_model("NNCModel.p")
def report_confusion_matrix(actual, pred, return_metrics=True):
    """Return a dataframe with the confusion matrix, and a series with the
    classification performance metrics.

    Parameters
    ----------
    actual : np.ndarray, shape=(n_samples,)
        The array of actual values

    pred : np.ndarray, shape=(n_samples,)
        The array of predicted values

    return_metrics : bool, optional (default=True)
        Whether to return the metrics in a pd.Series. If False,
        index 1 of the returned tuple will be None.

    Returns
    -------
    conf : pd.DataFrame, shape=(2, 2)
        The confusion matrix, rows = actual, columns = predicted, both
        labelled ``['Neg', 'Pos']``

    ser : pd.Series or None
        The metrics if ``return_metrics`` else None

    Raises
    ------
    ValueError
        If ``actual`` and ``pred`` together do not contain exactly two
        distinct classes.
    """
    # The confusion matrix has one row/column per label found in EITHER
    # array (assuming ``cm`` is sklearn's confusion_matrix), so the class
    # count must be taken over the union. Counting each array on its own
    # lets e.g. {0, 1} vs {1, 2} through and yields a 3x3 matrix.
    n_classes = len(set(actual) | set(pred))
    if n_classes > 2:
        raise ValueError('max classes is 2, but got %i' % n_classes)
    if n_classes < 2:
        # a 2x2 layout cannot be formed from fewer than two labels
        raise ValueError('need exactly 2 classes, but got %i' % n_classes)

    cf = cm(actual, pred)

    # format: (col = pred, index = act)
    # array([[TN, FP],
    #        [FN, TP]])

    ser = None
    if return_metrics:
        # alias the elements in the matrix
        (tn, fp), (fn, tp) = cf

        total_pop = np.sum(cf)
        condition_pos = tp + fn  # actual positives (row 1)
        condition_neg = tn + fp  # actual negatives (row 0)

        # sums of the prediction cols
        pred_pos = tp + fp
        pred_neg = tn + fn

        acc = (tp + tn) / total_pop  # accuracy
        tpr = tp / condition_pos  # sensitivity, recall
        fpr = fp / condition_neg  # fall-out
        fnr = fn / condition_pos  # miss rate
        tnr = tn / condition_neg  # specificity
        prev = condition_pos / total_pop  # prevalence
        plr = tpr / fpr  # positive likelihood ratio, LR+
        nlr = fnr / tnr  # negative likelihood ratio, LR-
        dor = plr / nlr  # diagnostic odds ratio
        prc = tp / pred_pos  # precision, positive predictive value
        fdr = fp / pred_pos  # false discovery rate
        fomr = fn / pred_neg  # false omission rate
        npv = tn / pred_neg  # negative predictive value

        # define the series; several metrics are listed under each of
        # their common names
        d = {
            'Accuracy': acc,
            'Diagnostic odds ratio': dor,
            'Fall-out': fpr,
            'False discovery rate': fdr,
            'False Neg. Rate': fnr,
            'False omission rate': fomr,
            'False Pos. Rate': fpr,
            'Miss rate': fnr,
            'Neg. likelihood ratio': nlr,
            'Neg. predictive value': npv,
            'Pos. likelihood ratio': plr,
            'Pos. predictive value': prc,
            'Precision': prc,
            'Prevalence': prev,
            'Recall': tpr,
            'Sensitivity': tpr,
            'Specificity': tnr,
            'True Pos. Rate': tpr,
            'True Neg. Rate': tnr,
        }

        ser = pd.Series(data=d)
        ser.name = 'Metrics'

    # create the DF
    conf = pd.DataFrame.from_records(data=cf, columns=['Neg', 'Pos'])
    conf.index = ['Neg', 'Pos']

    return conf, ser
def confusion(y_true, y_pred):
    """Return the result of ``cm`` for the true and the predicted labels.

    Thin convenience wrapper: both arguments are forwarded unchanged, true
    labels first.
    """
    matrix = cm(y_true, y_pred)
    return matrix