def evaluate(model, iterator_function, _batch_count, cuda_device, output_buffer=sys.stderr):
    if output_buffer is not None:
        print(_batch_count, file=output_buffer)
    model.eval()
    with torch.no_grad():
        predictions = []
        expectations = []
        batch_generator = range(_batch_count)
        if output_buffer is not None:
            batch_generator = tqdm(batch_generator)
        for _ in batch_generator:
            features, targets = iterator_function()
            if cuda_device != -1:
                features = features.cuda(device=cuda_device)
            probs, _, _ = model(example_batch=features)
            batch_pred = np.argmax(probs.detach().cpu().numpy(), axis=-1).tolist()
            batch_tgt = targets.detach().cpu().numpy().tolist()
            predictions.extend(batch_pred)
            expectations.extend(batch_tgt)
    model.train()
    # Return accuracy, precision, recall, and F1 as percentages
    return acc(expectations, predictions) * 100, \
        pr(expectations, predictions) * 100, \
        rc(expectations, predictions) * 100, \
        f1(expectations, predictions) * 100
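# The snippets here rely on short aliases (acc, pr, rc, f1, auc, cm, cr, tqdm, ...)
# whose imports are not shown. A plausible setup, assuming scikit-learn, is sketched
# below; note that in the classification-metric snippets rc appears to stand for
# recall_score, while in the ROC snippets further down it stands for roc_curve.
import sys

import numpy as np
import torch
from tqdm import tqdm
from sklearn.metrics import accuracy_score as acc
from sklearn.metrics import precision_score as pr
from sklearn.metrics import recall_score as rc  # or: from sklearn.metrics import roc_curve as rc
from sklearn.metrics import f1_score as f1
from sklearn.metrics import auc
from sklearn.metrics import confusion_matrix as cm
from sklearn.metrics import classification_report as cr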
def build_classifier_and_test(train_X, train_y, test_X, test_y, clf,
                              print_train_result=True):
    clf.fit(train_X, train_y)
    if print_train_result:
        p_tr = clf.predict(train_X)
        print("Train Accuracy:\t", acc(train_y, p_tr))
        print("Train Precision:\t", pr(train_y, p_tr))
        print("Train Recall:\t", rc(train_y, p_tr))
        print("Train F-score:\t", f1(train_y, p_tr))
    predicted = clf.predict(test_X)
    print("Accuracy:\t", acc(test_y, predicted))
    print("Precision:\t", pr(test_y, predicted))
    print("Recall:\t", rc(test_y, predicted))
    print("F-score:\t", f1(test_y, predicted))
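# A minimal usage sketch for build_classifier_and_test, assuming the metric aliases
# above and any scikit-learn classifier; the toy data below is illustrative only.
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

X, y = make_classification(n_samples=500, n_features=20, random_state=0)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=0)
build_classifier_and_test(X_tr, y_tr, X_te, y_te, RandomForestClassifier(),
                          print_train_result=False)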
def clone_analysis(data_paths):
    code = []
    labels = []
    positives = 0
    for file_name in data_paths:
        data = json.load(open(file_name))
        for example in data:
            code.append(example['tokenized'])
            l = 0
            # the label key is spelled inconsistently across files
            if 'label' in example.keys():
                l = int(example['label'])
            elif 'lebel' in example.keys():
                l = int(example['lebel'])
            elif 'leble' in example.keys():
                l = int(example['leble'])
            elif 'lable' in example.keys():
                l = int(example['lable'])
            if l > 1:
                l = 1
            positives += l
            labels.append(l)
    print(len(code), len(labels), positives, len(labels) - positives)

    # TF-IDF over the tokenized code strings; the documents are passed to
    # fit_transform, so the default input='content' applies
    vectorizer = TfidfVectorizer(lowercase=False, ngram_range=(1, 3))
    X = vectorizer.fit_transform(code)

    # Cluster the vectors and count positives/negatives per cluster
    model = KMeans(n_clusters=10, max_iter=100)
    model.fit(X)
    y = model.predict(X)
    cluster_to_positive = [0] * 10
    cluster_to_negative = [0] * 10
    for pred, label in zip(y, labels):
        if label == 1:
            cluster_to_positive[pred] += 1
        else:
            cluster_to_negative[pred] += 1
    print(cluster_to_positive)
    print(cluster_to_negative)
    percentages = [
        float(p) / (p + n)
        for p, n in zip(cluster_to_positive, cluster_to_negative)
    ]
    for p in percentages:
        print(p)

    # Five random train/test splits with a random forest baseline
    for _ in range(5):
        XTrain, XTest, YTrain, YTest = train_test_split(X, labels, test_size=0.2)
        model = RandomForestClassifier()
        model.fit(XTrain, YTrain)
        predicted = model.predict(XTest)
        print('%.3f\t%.3f\t%.3f\t%.3f' % (
            acc(YTest, predicted) * 100, pr(YTest, predicted) * 100,
            rc(YTest, predicted) * 100, f1(YTest, predicted) * 100))
def fit_test(clf, train_tuple, test_tuple):
    '''
    Fits a classifier on train_tuple and reports the ROC AUC on test_tuple.
    The tuples should be given as (data, labels).
    '''
    data_train, labels_train = train_tuple
    data_test, labels_test = test_tuple

    # Scale features based on the training set only
    scaler = StandardScaler()
    scaler.fit(data_train)
    data_train = scaler.transform(data_train)
    data_test = scaler.transform(data_test)

    clf.fit(data_train, labels_train)
    # rc is roc_curve; score with the positive-class probability
    fpr, tpr, _ = rc(labels_test, clf.predict_proba(data_test)[:, 1])
    return auc(fpr, tpr)
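# A minimal usage sketch for fit_test, assuming rc is sklearn.metrics.roc_curve,
# auc is sklearn.metrics.auc, and a classifier that implements predict_proba.
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=400, n_features=10, random_state=1)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.25, random_state=1)
print(fit_test(LogisticRegression(max_iter=1000), (X_tr, y_tr), (X_te, y_te)))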
def roc_curve(output, target):
    try:
        from sklearn.metrics import roc_curve as rc
    except ImportError:
        raise RuntimeError("ROC Curve requires scikit-learn to be installed.")
    with torch.no_grad():
        pred = torch.argmax(output, dim=1)
        assert pred.shape[0] == len(target)
        # Use the positive-class score (column 1) for the ROC curve
        fpr, tpr, _ = rc(target.cpu().numpy(), output[:, 1].cpu().numpy())

        # Render the curve to an RGB image tensor of shape (C, H, W)
        fig = plt.figure()
        plt.plot(fpr, tpr)
        fig.canvas.draw()
        buf = np.asarray(fig.canvas.buffer_rgba(), dtype=np.uint8)[:, :, :3]
        image = torch.from_numpy(buf).permute(2, 0, 1)
        plt.close(fig)
    return image
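# A sketch of how the returned image tensor might be consumed, assuming a
# torch.utils.tensorboard SummaryWriter; 'logs/' and the tag are illustrative names,
# and output/target come from the surrounding evaluation code, not this snippet.
from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter('logs/')
roc_image = roc_curve(output, target)  # (3, H, W) uint8 tensor
writer.add_image('val/roc_curve', roc_image, global_step=0)
writer.close()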
# ROC curves
ind_val = random.sample(range(12000), 2000)
xvalid = xtrain.iloc[ind_val, :]
yvalid = ytrain.iloc[ind_val]
xtrain = xtrain.drop(ind_val)
ytrain = ytrain.drop(ind_val)
xtrain = xtrain.to_numpy()

svc = svm.SVC(C=10, probability=True)
svc.fit(xtrain, ytrain)

# ROC on the held-out split and on the training split
y_score = svc.predict_proba(xvalid)
fpr_svm, tpr_svm, thresholds_svm = rc(yvalid, y_score[:, 1])
y_score = svc.predict_proba(xtrain)
fpr_svm_tr, tpr_svm_tr, thresholds_svm = rc(ytrain, y_score[:, 1])

plt.figure(0)
plt.title('ROC Curves, Held-out Set')
plt.plot(fpr_svm, tpr_svm, label='Support Vector Machine')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.legend()
plt.savefig('ROC_Curves.jpg')

###########################################################################################
# Print out a few examples of photos that were misclassified by the model, to learn from them
    Returns
    -------
    WRITEME
    """
    assert classifier is not None, "Why would you pass no classifier?"

    # Data scaling based on training set
    scaler = StandardScaler()
    scaler.fit(data[train_idx])
    data_train = scaler.transform(data[train_idx])
    data_test = scaler.transform(data[test_idx])

    classifier.fit(data_train, labels[train_idx])
    fpr, tpr, thresholds = rc(labels[test_idx],
                              classifier.predict_proba(data_test)[:, 1])
    return auc(fpr, tpr)


def load_data(source_dir, data_pattern):
    """
    Loads the data from multiple sources if provided.

    Parameters
    ----------
    source_dir: str
    data_pattern: str

    Returns
    -------
# test model
features_t = features[test_index]
test_op = label[test_index]
# true_op.append(test_op)
pred_hsv = model_hsv.predict(features_t)
score_hsv = model_hsv.score(features_t, test_op)
pred_ssv = model_ssv.predict(features_t)
score_ssv = model_ssv.score(features_t, test_op)
acc_h.append(score_hsv)
acc_s.append(score_ssv)

print('Time spent in each fold:')
print(time.time() - start_time)

# plot ROC using the decision-function scores
y_score_hsv = model_hsv.decision_function(features_t)
fpr_h, tpr_h, _ = rc(test_op, y_score_hsv)
y_score_ssv = model_ssv.decision_function(features_t)
fpr_s, tpr_s, _ = rc(test_op, y_score_ssv)

fig1 = plt.figure()
lw = 1
plt.plot(fpr_h, tpr_h, color='darkorange', lw=lw, label='ROC curve (Soft SVM)')
plt.plot(fpr_s, tpr_s, color='deeppink', lw=lw, label='ROC curve (Hard SVM)')
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
    negative_batch=x_n)
repr = representation.detach().cpu().numpy()
prediction_classes = np.argmax(prediction_prob.detach().cpu().numpy(), axis=-1)
# print(
#     "Epoch %3d, Loss: %10.4f, Accuracy: %5.2f, Precision: %5.2f, Recall: %5.2f, F1: %5.2f" % (
#         epoch, batch_loss.detach().cpu().item(),
#         acc(targets, prediction_classes), pr(targets, prediction_classes),
#         rc(targets, prediction_classes), f1(targets, prediction_classes)
#     )
# )
if epoch % 1 == 0:
    prediction_prob, representation, batch_loss = model(
        example_batch=test_x, targets=test_y)
    repr = representation.detach().cpu().numpy()
    prediction_classes = np.argmax(
        prediction_prob.detach().cpu().numpy(), axis=-1)
    print('=' * 100)
    print(
        "Test %3d, Loss: %10.4f, Accuracy: %5.2f, Precision: %5.2f, Recall: %5.2f, F1: %5.2f"
        % (epoch, batch_loss.detach().cpu().item(),
           acc(test_y, prediction_classes), pr(test_y, prediction_classes),
           rc(test_y, prediction_classes), f1(test_y, prediction_classes)))
    print('=' * 100)
    plot_embedding(repr, test_y, title='Epoch %d' % epoch)
batch_loss.backward()
optimizer.step()
# it's double the original size, 2125350, which is what we want

# generate score: has 2 modes. By default it returns class probabilities;
# passing any other second argument returns hard class predictions.
def gs(x, y="prob"):
    # 70/30 train test split
    x_train, x_test, y_train, y_test = tts(x, x.label, test_size=0.3)
    data = x_train.iloc[:, :32]
    test_data = x_test.iloc[:, :32]
    # train model
    classifier = lr(random_state=0).fit(data, y_train)
    if y == "prob":
        pred = classifier.predict_proba(test_data)
    else:
        pred = classifier.predict(test_data)
    return pred, y_test.values


print(gs(finalSet))

# AUC / ROC curve: roc_curve needs the positive-class probabilities,
# so use the default "prob" mode and take column 1 of predict_proba
probs, y_true = gs(finalSet)
fpr, tpr, thresholds = rc(y_true, probs[:, 1])
print(auc(fpr, tpr))

# Plot it
plt.plot(fpr, tpr)
plt.show()
def start_split_data(data_list):
    random_list = dc(data_list)
    random.shuffle(random_list)
    predicted_list = []
    mark = 0
    acc_list = []
    act_class_list = []
    for i in range(10):  # fold range
        test_list = []
        training_list = []
        while mark < int(len(random_list)):
            for train_ele in range(0, mark):
                training_list.append(random_list[train_ele])
            else:
                index = mark
                mark = int(len(random_list) / 10) + index
                for test_element in range(index, mark):
                    test_list.append(random_list[test_element])
                for training_element in range(mark, int(len(random_list))):
                    training_list.append(random_list[training_element])
                # print(training_list)
                # fold completion
                Node.children = []
                Node.leaf_children = []
                Node.temp_children = []
                Node.new_children = []
                Node.len_training_list = len(training_list)
                Node.old_pessi_err = (node_err_cal(training_list, max_class(
                    training_list, class_column), class_column) + 1) / \
                    Node.len_training_list
                root = Node(training_list)
                # print(root.data)
                root.node_type = 'root'
                build_tree(root)
                predicted_temp_list = []
                actual_list = []
                temp_root = dc(root)
                for test_element in test_list:
                    actual_list.append(int(test_element[class_column]))
                    found = int(class_finder(test_element, temp_root))
                    predicted_temp_list.append(found)
                    predicted_list.append(found)
                acc_list.append(
                    accuracy(actual_list, predicted_temp_list, class_column))
                break
    print(mean(acc_list))
    act_class_list = class_list_gen(random_list)
    # print(len(act_class_list), len(predicted_list))
    while len(act_class_list) > len(predicted_list):
        del act_class_list[-1]
    c_matrix = cm(act_class_list, predicted_list)
    print('Confusion matrix\n', c_matrix)
    c_report = cr(act_class_list, predicted_list)
    print("All Measures required for this data set \n", c_report)
    fpr, tpr, thd = rc(act_class_list, predicted_list)
    roc_auc = auc(fpr, tpr)
    if formula_input == 2:
        plt.title('ROC for %s with information gain(red) and gini(blue)' % file_name[0])
        plt.plot(fpr, tpr, label='%s AUC = %0.2f' % (formula_measure, roc_auc))
        plt.legend(loc='lower right')
    else:
        plt.title('ROC for %s ' % file_name[0])
        plt.plot(fpr, tpr, label='%s AUC = %0.2f' % (formula_measure, roc_auc))
        plt.plot(fpr, tpr, label='AUC = %0.2f' % roc_auc)
        plt.legend(loc='lower right')
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlim([-0.1, 1.2])
    plt.ylim([-0.1, 1.2])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')