def tune_threshold(y, y_prob, eta=0.1, plev=0.5, max_iter=100, output=True):
    """
    Tunes the threshold of the decision rule to improve accuracy.

    Keyword Arguments:
    y        - the ground truth
    y_prob   - the model predictions
    eta      - learning rate
    plev     - the level of precision we are trying to maintain
    max_iter - maximum number of tuning iterations
    output   - whether to print progress at each iteration
    """
    threshold = 0.5
    yhat = decide(y_prob, threshold)
    p = precision(y, yhat)
    r = recall(y, yhat)
    initial_loss = directional_loss(y, yhat)
    if output:
        print(f"Precision = {p}, Recall = {r}, Threshold = {threshold}")
    for i in range(1, max_iter):
        # Lower the threshold with a step size that decays as 1/i.
        threshold -= eta / i * threshold
        yhat = decide(y_prob, threshold)
        p = precision(y, yhat)
        r = recall(y, yhat)
        if output:
            print(f"Precision = {p}, Recall = {r}, Threshold = {threshold}")
        # Stop as soon as precision drops to the level we want to maintain.
        if p <= plev:
            return threshold
    return threshold
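# A minimal usage sketch for tune_threshold, assuming decide() maps scores
# to 0/1 labels at a threshold and that precision()/recall() are sklearn
# aliases; decide(), directional_loss(), and the synthetic data below are
# illustrative stand-ins, not part of the original code.
import numpy as np
from sklearn.metrics import precision_score as precision
from sklearn.metrics import recall_score as recall

def decide(y_prob, threshold):
    # Predict positive when the score clears the threshold.
    return (np.asarray(y_prob) >= threshold).astype(int)

def directional_loss(y, yhat):
    # Stand-in: fraction of misclassified points.
    return float(np.mean(np.asarray(y) != np.asarray(yhat)))

rng = np.random.default_rng(0)
y_true = rng.integers(0, 2, size=1000)
y_scores = np.clip(0.5 * y_true + rng.normal(0.25, 0.2, size=1000), 0, 1)
t = tune_threshold(y_true, y_scores, plev=0.8, output=False)
print(f"tuned threshold: {t:.3f}")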
def eval_kaggle_score(df_pred, Num):
    # Precision when scoring only the top-ranked cluster.
    metrics = MulticlassMetrics(
        df_pred.rdd.map(lambda ar: (float(np.argsort(ar.probability)[-1:]),
                                    float(ar.hotel_cluster))))
    NumCluster = Num
    avg_precision = metrics.precision()
    # Add the precision of each of the remaining top-NumCluster guesses.
    for i in range(1, NumCluster):
        metrics = MulticlassMetrics(
            df_pred.rdd.map(
                lambda ar: (float(np.argsort(ar.probability)[-(i + 1):-i]),
                            float(ar.hotel_cluster))))
        avg_precision += metrics.precision()
    return avg_precision
def pred_precision_kaggle(prediction, NumCluster):
    # Precision of the single most probable cluster.
    pred_label = prediction.rdd.map(
        lambda x: (float(np.argsort(-1 * x.probability)[:1]),
                   float(x.hotel_cluster)))
    metrics = MulticlassMetrics(pred_label)
    avg_precision = metrics.precision()
    # Add the precision of the 2nd through NumCluster-th ranked clusters.
    for i in range(1, NumCluster):
        pred_label = prediction.rdd.map(
            lambda x: (float(np.argsort(-1 * x.probability)[i:(i + 1)]),
                       float(x.hotel_cluster)))
        metrics = MulticlassMetrics(pred_label)
        avg_precision += metrics.precision()
    return avg_precision
def tune_model_width(build_fn, x_train, y_train, x_val, y_val, max_width=50):
    """
    Takes a 3-layer neural network and expands the width of the hidden
    layer to see if there are tangible benefits to increasing it.

    Parameters:
    build_fn - function that returns a Keras NN model with the specified
               parameters
    x_train  - the data matrix
    y_train  - the response vector
    x_val    - validation data
    y_val    - validation response
    """
    acc = []
    pre = []
    rec = []
    for width in range(15, max_width):
        # Build a fresh model at each candidate hidden-layer width.
        model = build_fn(x_train, y_train, width=width, suppress=True)
        model.fit(x_train, y_train, epochs=100, verbose=0)
        y_val_prob = model.predict(x_val)[:, 0]
        y_val_hat = decide(y_val_prob, 0.5)
        acc.append(accuracy(y_val, y_val_hat))
        pre.append(precision(y_val, y_val_hat))
        rec.append(recall(y_val, y_val_hat))
    return acc, pre, rec
def present_results_simp(y_test, predictions):
    results_list = []
    for k, v in predictions.items():
        inter_list = [
            k,
            accuracy(v, y_test),
            precision(v, y_test),
            precision_top(v, y_test, 0.01),
            precision_top(v, y_test, 0.02),
            precision_top(v, y_test, 0.05),
            precision_top(v, y_test, 0.1),
            precision_top(v, y_test, 0.2),
            precision_top(v, y_test, 0.3),
            precision_top(v, y_test, 0.5),
            recall(v, y_test),
            recall_top(v, y_test, 0.01),
            recall_top(v, y_test, 0.02),
            recall_top(v, y_test, 0.05),
            recall_top(v, y_test, 0.1),
            recall_top(v, y_test, 0.2),
            recall_top(v, y_test, 0.3),
            recall_top(v, y_test, 0.5),
            f1(v, y_test)
        ]
        results_list.append(inter_list)
    df = pd.DataFrame(results_list)
    df.columns = [
        'Model', 'Accuracy', 'Precision', 'Precision top 1%',
        'Precision top 2%', 'Precision top 5%', 'Precision top 10%',
        'Precision top 20%', 'Precision top 30%', 'Precision top 50%',
        'Recall', 'Recall top 1%', 'Recall top 2%', 'Recall top 5%',
        'Recall top 10%', 'Recall top 20%', 'Recall top 30%',
        'Recall top 50%', 'F 1'
    ]
    return df
def evaluate_rf(x_train, y_train, x_test, y_test, thresh=thresh,
                ntrees=[25, 100, 500], maxfeats=[1, .5, 4]):
    rd = {
        'predicted': [], 'ntrees': [], 'nfeats': [], 'threshold': [],
        'precision': [], 'recall': [], 'accuracy': [], 'class': []
    }
    for size in ntrees:
        for f in maxfeats:
            scores = random_forest_classifier(size, f, x_train, y_train,
                                              x_test)
            for t in thresh:
                # Convert scores to percentile ranks before thresholding.
                scores = list(stats.rankdata(scores, 'average') / len(scores))
                preds = [compare_to_threshold(x, t) for x in scores]
                rd['predicted'].append(preds)
                rd['ntrees'].append(size)
                rd['nfeats'].append(f)
                rd['threshold'].append(t)
                rd['precision'].append(precision(y_test, preds))
                rd['recall'].append(recall(y_test, preds))
                rd['accuracy'].append(accuracy(y_test, preds))
                rd['class'].append('rf')
    return pd.DataFrame(rd)
def evaluate_knn(x_train, y_train, x_test, y_test, kays=[3, 5, 7, 9, 11],
                 thresh=thresh):
    '''
    generates df of predictions, k values, thresholds, precision, recall,
    and accuracy to help find the best model
    '''
    rd = {
        'predicted': [], 'k': [], 'threshold': [], 'precision': [],
        'recall': [], 'accuracy': [], 'class': []
    }
    for k in kays:
        scores = knn_classifier(x_train, y_train, x_test, k)[:, 1]
        for t in thresh:
            scores = list(stats.rankdata(scores, 'average') / len(scores))
            preds = [compare_to_threshold(x, t) for x in scores]
            rd['predicted'].append(preds)
            rd['k'].append(k)
            rd['threshold'].append(t)
            rd['precision'].append(precision(y_test, preds))
            rd['recall'].append(recall(y_test, preds))
            rd['accuracy'].append(accuracy(y_test, preds))
            rd['class'].append('knn')
    return pd.DataFrame(rd)
def evaluate_logreg(x_train, y_train, x_test, y_test,
                    c_values=[.01, .1, 1, 10, 100], thresh=thresh):
    '''
    generates df of predictions, penalties, c_values, thresholds, precision,
    recall, and accuracy of logistic regression
    '''
    penalties = ['l2']
    rd = {
        'predicted': [], 'penalty': [], 'C': [], 'threshold': [],
        'precision': [], 'recall': [], 'accuracy': [], 'class': []
    }
    for p in penalties:
        for c in c_values:
            scores = logreg_classifier(x_train, y_train, x_test, c, p)
            for t in thresh:
                scores = list(stats.rankdata(scores, 'average') / len(scores))
                preds = [compare_to_threshold(x, t) for x in scores]
                rd['predicted'].append(preds)
                rd['penalty'].append(p)
                rd['C'].append(c)
                rd['threshold'].append(t)
                rd['precision'].append(precision(y_test, preds))
                rd['recall'].append(recall(y_test, preds))
                rd['accuracy'].append(accuracy(y_test, preds))
                rd['class'].append('logreg')
    return pd.DataFrame(rd)
def get_metrics(prediction, y_test):
    '''
    Computes accuracy, precision, recall, ROC-AUC and F1 metrics by
    comparing the predictions produced by an ML model with the actual
    values of the dependent variable.

    Inputs:
        - prediction: an array with predictions.
        - y_test: an array with actual values.

    Returns a dictionary with metrics of an ML model.
    '''
    Accuracy = accuracy(prediction, y_test)
    Precision = precision(prediction, y_test)
    Recall = recall(prediction, y_test)
    try:
        AUC = roc_auc(prediction, y_test)
    except ValueError:
        # ROC-AUC is undefined when only one class is present.
        AUC = 0
    F1 = f1(prediction, y_test)
    metrics_dict = {
        'Accuracy': Accuracy,
        'Precision': Precision,
        'Recall': Recall,
        'AUC': AUC,
        'F1': F1
    }
    return metrics_dict
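# Hedged usage sketch for get_metrics. It assumes the bare metric names are
# aliases for their sklearn counterparts, as below; note the helper passes
# (prediction, y_test), so with sklearn's (y_true, y_pred) signature the
# precision and recall roles are effectively swapped -- treat the exact
# aliasing as an assumption of this sketch.
from sklearn.metrics import accuracy_score as accuracy
from sklearn.metrics import precision_score as precision
from sklearn.metrics import recall_score as recall
from sklearn.metrics import roc_auc_score as roc_auc
from sklearn.metrics import f1_score as f1

print(get_metrics([0, 1, 0, 0, 1], [0, 1, 1, 0, 1]))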
def evaluate_dectree(x_train, y_train, x_test, y_test, thresh=thresh):
    '''
    same pattern as the other evaluate_* helpers, applied to decision
    trees over both split criteria
    '''
    criterion = ['entropy', 'gini']
    rd = {
        'predicted': [], 'crit': [], 'threshold': [], 'precision': [],
        'recall': [], 'accuracy': [], 'class': []
    }
    for c in criterion:
        scores = dectree_classifier(x_train, y_train, x_test, c)
        for t in thresh:
            scores = list(stats.rankdata(scores, 'average') / len(scores))
            preds = [compare_to_threshold(x, t) for x in scores]
            rd['predicted'].append(preds)
            rd['crit'].append(c)
            rd['threshold'].append(t)
            rd['precision'].append(precision(y_test, preds))
            rd['recall'].append(recall(y_test, preds))
            rd['accuracy'].append(accuracy(y_test, preds))
            rd['class'].append('dectree')
    return pd.DataFrame(rd)
def pred_precision(prediction):
    pred_label = prediction.rdd.map(
        lambda x: (float(x.prediction), float(x.hotel_cluster)))
    metrics = MulticlassMetrics(pred_label)
    precision = metrics.precision()
    return round(precision * 100, 2)
def print_prediction_metrics(clf, x, y, k):
    pred = cross_val_predict(clf, x, y,
                             cv=StratifiedKFold(n_splits=k, shuffle=True))
    print("Accuracy: ", round(accuracy(y, pred), 2))
    print("Precision on spam: ", round(precision(y, pred, average=None)[1], 3))
    print("Recall on spam: ", round(recall(y, pred, average=None)[1], 3))
def update_metrics(gt, pre, f1_m, p_m, r_m, acc_m):
    f1_value = f1(gt, pre, average="micro")
    f1_m.update(f1_value)
    p_value = precision(gt, pre, average="micro", zero_division=0)
    p_m.update(p_value)
    r_value = recall(gt, pre, average="micro")
    r_m.update(r_value)
    acc_value = accuracy(gt, pre)
    acc_m.update(acc_value)
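# update_metrics expects meter objects exposing an update() method, in the
# style of the AverageMeter commonly used in PyTorch training loops. The
# original class is not shown; this is a minimal stand-in:
class AverageMeter:
    def __init__(self):
        self.sum = 0.0
        self.count = 0

    def update(self, value, n=1):
        # Track a running average of the metric across batches.
        self.sum += value * n
        self.count += n

    @property
    def avg(self):
        return self.sum / max(self.count, 1)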
def __init__(self, model, parameters, name, threshold, x_train, y_train,
             x_test, y_test):
    self.params = parameters
    self.model = model.set_params(**parameters)
    self.scores = classify(x_train, y_train, x_test, self.model)
    self.truth = y_test
    self.predictions = predict(self.scores, threshold)
    self.accuracy = accuracy(self.truth, self.predictions)
    self.precision = precision(self.truth, self.predictions)
    self.recall = recall(self.truth, self.predictions)
    # F1 is the harmonic mean of precision and recall.
    self.f1 = 2 * (self.precision * self.recall) / (self.precision +
                                                    self.recall)
    self.name = name
    ClassifierAnalyzer.identifier += 1
def evaluate_bagging(x_train, y_train, x_test, y_test, thresh=thresh):
    rd = {
        'predicted': [], 'threshold': [], 'precision': [], 'recall': [],
        'accuracy': [], 'class': []
    }
    scores = bagging_classifier(x_train, y_train, x_test)
    for t in thresh:
        scores = list(stats.rankdata(scores, 'average') / len(scores))
        preds = [compare_to_threshold(x, t) for x in scores]
        rd['predicted'].append(preds)
        rd['threshold'].append(t)
        rd['precision'].append(precision(y_test, preds))
        rd['recall'].append(recall(y_test, preds))
        rd['accuracy'].append(accuracy(y_test, preds))
        rd['class'].append('bagging')
    return pd.DataFrame(rd)
def calculate_precision(predicted_scores, y_test, threshold):
    '''
    Calculate the precision of the trained model.

    Inputs:
        predicted_scores (numpy array) - probabilities that data points
            belong to class 1
        y_test (pandas dataframe) - label testing data
        threshold (float) - if a predicted score is above this threshold,
            consider it to be class 1

    Outputs:
        test_precision (float)
    '''
    predictions = get_predictions_with_threshold(predicted_scores, threshold)
    test_precision = precision(y_test, predictions)
    return test_precision
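# calculate_precision depends on a get_predictions_with_threshold helper
# that is not shown here. A plausible one-line version, assuming
# predicted_scores is an array of class-1 probabilities:
import numpy as np

def get_predictions_with_threshold(predicted_scores, threshold):
    # Class 1 when the score clears the threshold, class 0 otherwise.
    return np.where(np.asarray(predicted_scores) > threshold, 1, 0)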
def __init__(self, model, parameters, name, threshold, x_train, y_train,
             x_test, y_test):
    self.params = parameters
    self.t = threshold
    self.model = model.set_params(**parameters)
    self.scores = classify(x_train, y_train, x_test, self.model)
    self.truth = y_test
    self.predictions = predict(self.scores, self.t)
    self.predicted_for_pct = sum(self.predictions) / len(self.predictions)
    self.accuracy = accuracy(self.truth, self.predictions)
    self.precision = precision(self.truth, self.predictions)
    self.recall = recall(self.truth, self.predictions)
    # F1 is the harmonic mean of precision and recall.
    self.f1 = (2 * self.precision * self.recall) / (self.precision +
                                                    self.recall)
    self.name = ClassifierAnalyzer.identifier_counter
    self.roc_auc = None
    ClassifierAnalyzer.identifier_counter += 1
def simp_run_through(evals, facs, features, year_col, start, split, end,
                     classes, parameters, thresholds):
    rv = {
        'class': [], 'DT': [], 'threshold': [], 'precision': [],
        'recall': [], 'preds': [], 'top_features': []
    }
    df = make_df(evals, facs)
    train, test = simp_windows(df, year_col, start, split, end, features)
    trx, tr_y, tex, te_y = simp_x_y_split(train, test)
    train_dates = trx['EVALUATION_START_DATE']
    test_dates = tex['EVALUATION_START_DATE']
    trx.drop(['EVALUATION_START_DATE', 'FACILITY_NAME', 'MostRecentEval'],
             inplace=True, axis=1)
    tex.drop(['EVALUATION_START_DATE', 'FACILITY_NAME', 'MostRecentEval'],
             inplace=True, axis=1)
    for c in classes:
        for p in parameters[c]:
            if c == 'DT':
                scores, imps = dectree_classifier(trx, tr_y, tex, p)
            for t in thresholds:
                preds = [compare_to_threshold(x, t) for x in list(scores)]
                rv[c].append(p)
                rv['class'].append(c)
                rv['threshold'].append(t)
                rv['precision'].append(precision(te_y, preds))
                rv['recall'].append(recall(te_y, preds))
                rv['preds'].append(preds)
                rv['top_features'].append(
                    list(zip(list(tex.columns), list(imps))))
    final = pd.DataFrame(rv)
    final.to_csv('results.csv')
    print(final)
    return final
def make_prediction_matrix(self):
    rv_dic = {}
    predictions_df = pd.DataFrame()
    for thresh in self.t:
        x = round((1 - thresh), 2)
        a = 'precision_{}pct'.format(x)
        b = 'recall_{}pct'.format(x)
        c = 'f1_{}pct'.format(x)
        predictions = predict(self.scores, thresh)
        predictions = [int(x) for x in predictions]
        d = '{}_at_{}pct'.format(self.name, x)
        predictions_df[d] = predictions
        prec = precision(self.truth, predictions)
        rec = recall(self.truth, predictions)
        rv_dic[a] = [prec]
        rv_dic[b] = [rec]
        # F1 as the harmonic mean of precision and recall.
        rv_dic[c] = [(prec * rec * 2) / (prec + rec)]
    rv_dic['model'] = [self.name]
    return pd.DataFrame(rv_dic), predictions_df
def test_classifiers(X, y, n=7, rname="results.txt"):
    # Each value holds [estimator, accuracies, f1s, recalls, precisions,
    # kappas]; commented-out entries are alternative models to try.
    clfs = {
        # "Bagging KNN": [BaggingClassifier(KNeighborsClassifier(),
        #                                   max_samples=0.5,
        #                                   max_features=0.5),
        #                 [], [], [], []],
        "NN (kNN k=1)": [KNeighborsClassifier(n_neighbors=1),
                         [], [], [], [], []],
        # "NN (kNN k=3)": [KNeighborsClassifier(n_neighbors=3),
        #                  [], [], [], [], []],
        "NN (kNN k=3 w)": [KNeighborsClassifier(n_neighbors=3,
                                                weights='distance'),
                           [], [], [], [], []],
        "NN (kNN k=5 w)": [KNeighborsClassifier(n_neighbors=5,
                                                weights='distance'),
                           [], [], [], [], []],
        # "NN (kNN k=7 w)": [KNeighborsClassifier(n_neighbors=7,
        #                                         weights='distance'),
        #                    [], [], [], []],
        # "SVM - Linear kernel": [svm.SVC(kernel="rbf", probability=True),
        #                         [], [], [], []],
        # "Naive Bayes": [GaussianNB(), [], [], [], []],
        # "SVM Sigmoide": [svm.SVC(kernel="sigmoid"), [], [], [], []],
        # "ANN": [MLPClassifier(solver='lbfgs', alpha=1e-5,
        #                       hidden_layer_sizes=(5, 2), random_state=1),
        #         [], [], [], []],
    }
    # V = ["Voting KNN", [None, [], [], [], []]]
    skf = kfold(y, n_iter=n, random_state=None, train_size=0.7)
    output = open(rname, "w")
    for train, test in skf:
        Xt, Yt = X[train], y[train]
        Xv, Yv = X[test], y[test]
        votes = []
        for k, v in clfs.items():
            v[0].fit(Xt, Yt)
            Yr = v[0].predict(Xv)
            v[1].append(accs(Yv, Yr))
            v[2].append(f1(Yv, Yr, average="macro"))
            v[3].append(recall(Yv, Yr, average="macro"))
            v[4].append(precision(Yv, Yr, average="macro"))
            v[5].append(kappa(Yv, Yr))
            # votes.append(Yr)
        # Yp = predict(votes)
    for k, v in clfs.items():
        fm = "%s | %s| %s | %s | %s\n"
        output.write(fm % (k, "Accuracy", np.mean(v[1]), min(v[1]), max(v[1])))
        # output.write(fm % (k, "Kappa", np.mean(v[5]), min(v[5]), max(v[5])))
        output.write(fm % (k, "F1", np.mean(v[2]), min(v[2]), max(v[2])))
        output.write(fm % (k, "Recall", np.mean(v[3]), min(v[3]), max(v[3])))
        output.write(fm % (k, "Precision", np.mean(v[4]), min(v[4]),
                           max(v[4])))
    output.close()
def crossValidate(X, y, nfold):
    kf = KFold(n_splits=nfold, shuffle=True)
    kf.get_n_splits(X)
    sorted_indices = np.loadtxt('final_sorted_indices.txt', dtype=int)
    r = 16  # number of top-ranked features to keep
    print("K-fold: K=", nfold)
    f1 = 0
    acc = 0
    prec = 0
    rec = 0
    spec = 0
    for train_index, test_index in kf.split(X):
        # print("TRAIN:", train_index, "TEST:", test_index)
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        selected_feature_indices = sorted_indices[:r]
        X_train_selected_features = X_train[:, selected_feature_indices]
        X_test_selected_features = X_test[:, selected_feature_indices]
        clf = GaussianNB()
        y_pred = clf.fit(X_train_selected_features,
                         y_train).predict(X_test_selected_features)
        f1 += fscore(y_test, y_pred)
        acc += accuracy(y_test, y_pred)
        prec += precision(y_test, y_pred)
        rec += recall(y_test, y_pred)
        spec += specificity(y_test, y_pred)
    print('fscore', f1 / nfold)
    print('accuracy', acc / nfold)
    print('precision', prec / nfold)
    print('recall', rec / nfold)
    print('specificity', spec / nfold)
def present_results(y_test, predictions):
    results_list = []
    for k, v in predictions.items():
        inter_list = [
            k,
            accuracy(v, y_test),
            precision(v, y_test),
            precision_top(v, y_test, 0.01),
            precision_top(v, y_test, 0.02),
            precision_top(v, y_test, 0.05),
            precision_top(v, y_test, 0.1),
            precision_top(v, y_test, 0.2),
            precision_top(v, y_test, 0.3),
            precision_top(v, y_test, 0.5),
            recall(v, y_test),
            recall_top(v, y_test, 0.01),
            recall_top(v, y_test, 0.02),
            recall_top(v, y_test, 0.05),
            recall_top(v, y_test, 0.1),
            recall_top(v, y_test, 0.2),
            recall_top(v, y_test, 0.3),
            recall_top(v, y_test, 0.5),
            f1(v, y_test)
        ]
        # ROC AUC is only reported for non-decision-tree models.
        if k[:6] != 'd_tree':
            inter_list.append(roc_auc(v, y_test))
        else:
            inter_list.append('ND')
        results_list.append(inter_list)
    df = pd.DataFrame(results_list)
    df.columns = [
        'Model', 'Accuracy', 'Precision', 'Precision top 1%',
        'Precision top 2%', 'Precision top 5%', 'Precision top 10%',
        'Precision top 20%', 'Precision top 30%', 'Precision top 50%',
        'Recall', 'Recall top 1%', 'Recall top 2%', 'Recall top 5%',
        'Recall top 10%', 'Recall top 20%', 'Recall top 30%',
        'Recall top 50%', 'F 1', 'ROC AUC'
    ]
    return df
def compute_eval_stats(classifier, y_data, rankings, threshold):
    '''
    Takes:   classifier object, true target data, predicted score rankings,
             ranking threshold cutoff
    Returns: accuracy, precision, recall of predictions of classifier on x
             for y
    '''
    # A point is predicted positive when its rank beats the cutoff.
    predicted_test = np.where(rankings < threshold, 1, 0)
    stats = [
        accuracy(y_data, predicted_test),
        precision(y_data, predicted_test),
        recall(y_data, predicted_test),
        f1(y_data, predicted_test),
        roc(y_data, predicted_test)
    ]
    return stats
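# Illustrative call for compute_eval_stats: rank the scores so that rank 1
# is the highest score, then flag everything ranked above the cutoff as
# positive. The metric aliases below are assumed sklearn wrappers; the
# classifier argument is only documentation, so None is passed here.
import numpy as np
from scipy.stats import rankdata
from sklearn.metrics import accuracy_score as accuracy
from sklearn.metrics import precision_score as precision
from sklearn.metrics import recall_score as recall
from sklearn.metrics import f1_score as f1
from sklearn.metrics import roc_auc_score as roc

scores = np.array([0.9, 0.2, 0.7, 0.4])
rankings = rankdata(-scores, method='ordinal')  # 1 = highest score
print(compute_eval_stats(None, np.array([1, 0, 1, 0]), rankings, threshold=3))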
def get_trigger_identification_f1(gold_Y, pred_Y):
    """
    Compute P/R/F1 scores for trigger identification
    :param gold_Y: gold output
    :param pred_Y: predicted output
    :return: precision, recall, and F1, each as a percentage
    """
    # Collapse labels to binary: 1 for any trigger type, 0 otherwise.
    gold_ti = []
    pred_ti = []
    for i in range(len(gold_Y)):
        gold_ti.append(1 if gold_Y[i] != 0 else 0)
        pred_ti.append(1 if pred_Y[i] != 0 else 0)
    return 100 * precision(gold_ti, pred_ti), \
        100 * recall(gold_ti, pred_ti), \
        100 * f1(gold_ti, pred_ti)
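# Example call, assuming precision/recall/f1 alias the sklearn scorers:
# labels are collapsed to trigger vs. no-trigger (non-zero vs. zero)
# before scoring, so any multiclass trigger labels work as input.
from sklearn.metrics import precision_score as precision
from sklearn.metrics import recall_score as recall
from sklearn.metrics import f1_score as f1

p, r, f = get_trigger_identification_f1([0, 2, 1, 0, 3], [0, 2, 0, 0, 1])
print(f"P={p:.1f} R={r:.1f} F1={f:.1f}")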
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import recall_score as recall
from sklearn.metrics import precision_score as precision
from sklearn.naive_bayes import GaussianNB

# TODO: split the data into training and testing sets,
# using the standard settings for train_test_split.
# Then, train and test the classifiers with your newly split data
# instead of X and y.
from sklearn.model_selection import train_test_split

features = X
labels = y
features_train, features_test, labels_train, labels_test = train_test_split(
    features, labels, test_size=0.5, random_state=0)

clf1 = DecisionTreeClassifier()
clf1.fit(features_train, labels_train)
decision_tree_recall = recall(labels_test, clf1.predict(features_test))
decision_tree_precision = precision(labels_test, clf1.predict(features_test))
print("Decision Tree recall: {:.2f} and precision: {:.2f}".format(
    decision_tree_recall, decision_tree_precision))

clf2 = GaussianNB()
clf2.fit(features_train, labels_train)
nb_recall = recall(labels_test, clf2.predict(features_test))
nb_precision = precision(labels_test, clf2.predict(features_test))
print("GaussianNB recall: {:.2f} and precision: {:.2f}".format(
    nb_recall, nb_precision))

# clf2.fit(X, y)
# print("GaussianNB recall: {:.2f} and precision: {:.2f}".format(
#     recall(y, clf2.predict(X)), precision(y, clf2.predict(X))))

results = {
    "Naive Bayes Recall": nb_recall,
    "Naive Bayes Precision": nb_precision,
    "Decision Tree Recall": decision_tree_recall,
    "Decision Tree Precision": decision_tree_precision
}
# Count the non-majority examples and binarize the labels.
print(len(y) - Counter(y)[4])
bi_y = list(map(binary_y, y))
print(Counter(bi_y))

precisions = []
lams = []
recalls = []
f1s = []
for i, lam in enumerate(lam_list):
    S = np.load(folder + "\\" + "lam" + lam + "\\" + r"l21S.npk",
                allow_pickle=True)
    # Flag a row as an outlier ("o") based on the L2 norm of its row in S.
    predictions = list(map(binary_error, np.linalg.norm(S, axis=1)))
    print("lambda:", lam)
    print("precision",
          precision(bi_y, predictions, labels=["o", "m"], pos_label="o"))
    print("recall",
          recall(bi_y, predictions, labels=["o", "m"], pos_label="o"))
    print("f1", f1_score(bi_y, predictions, labels=["o", "m"], pos_label="o"))
    lams.append(lam)
    precisions.append(
        precision(bi_y, predictions, labels=["o", "m"], pos_label="o"))
    recalls.append(
        recall(bi_y, predictions, labels=["o", "m"], pos_label="o"))
    f1s.append(f1_score(bi_y, predictions, labels=["o", "m"], pos_label="o"))
    print(CM(bi_y, predictions))
    print("------------")

print(len(lams), len(recalls), len(f1s), len(precisions))
d = {
    "lambda": list(map(float, lams)),
    "precision": precisions,
    "recall": recalls,
    "f1": f1s,
}
X = X._get_numeric_data()
y = X['Survived']
del X['Age'], X['Survived']

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score as recall
from sklearn.metrics import precision_score as precision
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split

# using the standard settings for train_test_split.
# Then, train and test the classifiers with your newly split data
# instead of X and y.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

clf1 = DecisionTreeClassifier()
clf1.fit(X_train, y_train)
y_test_pred = clf1.predict(X_test)
recall_cf1 = recall(y_test, y_test_pred)
precision_cf1 = precision(y_test, y_test_pred)
score_1 = f1_score(y_test, y_test_pred)
print("Decision Tree recall: {:.2f} and precision: {:.2f}".format(
    recall_cf1, precision_cf1))
print("Decision Tree F1 score: {:.2f}".format(score_1))

clf2 = GaussianNB()
clf2.fit(X_train, y_train)
y_test_pred = clf2.predict(X_test)
recall_cf2 = recall(y_test, y_test_pred)
precision_cf2 = precision(y_test, y_test_pred)
score_2 = f1_score(y_test, y_test_pred)
print("GaussianNB recall: {:.2f} and precision: {:.2f}".format(
    recall_cf2, precision_cf2))
print("GaussianNB F1 score: {:.2f}".format(score_2))
X = X._get_numeric_data()
y = X['Survived']
del X['Age'], X['Survived']

from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import recall_score as recall
from sklearn.metrics import precision_score as precision
from sklearn.model_selection import train_test_split

# TODO: split the data into training and testing sets,
# using the standard settings for train_test_split.
# Then, train and test the classifiers with your newly split data
# instead of X and y.
X_train, X_test, y_train, y_test = train_test_split(X, y)

clf1 = DecisionTreeClassifier()
clf1.fit(X_train, y_train)
clf2 = GaussianNB()
clf2.fit(X_train, y_train)

tree_recall, tree_precision = recall(y_test, clf1.predict(X_test)), precision(
    y_test, clf1.predict(X_test))
nb_recall, nb_precision = recall(y_test, clf2.predict(X_test)), precision(
    y_test, clf2.predict(X_test))

print("Decision Tree recall: {:.2f} and precision: {:.2f}".format(
    tree_recall, tree_precision))
print("GaussianNB recall: {:.2f} and precision: {:.2f}".format(
    nb_recall, nb_precision))

results = {
    "Naive Bayes Recall": nb_recall,
    "Naive Bayes Precision": nb_precision,
    "Decision Tree Recall": tree_recall,
    "Decision Tree Precision": tree_precision
}
def analyze_campus_policies(model_size):
    """
    runs tests with the trained Random Forest model, with each pair of
    intents in the campi dataset
    """
    print("MODEL TEST USING CAMPI ALL")
    campi_by_uni_dset = dataset.read('conflicts', 'campi', 'all')
    results = []
    # One confusion-count record overall, plus one per conflict type.
    empty_counts = {'tp': 0, 'tn': 0, 'fp': 0, 'fn': 0,
                    'precision': 0, 'recall': 0, 'f1': 0}
    summary = dict(empty_counts)
    summary_by_type = {
        conflict_type: dict(empty_counts)
        for conflict_type in ('qos', 'negation', 'path', 'time',
                              'synonym', 'hierarchy')
    }
    model = ClassificationModel('forest')
    if model.load(model_size):
        for case in campi_by_uni_dset:
            features_vector = get_features(case['sentence']['nile'],
                                           case['hypothesis']['nile'])
            prediction = model.predict([features_vector])[0]
            if prediction == case['conflict']:
                summary['tp' if prediction == 1 else 'tn'] += 1
                summary_by_type[case['type']][
                    'tp' if prediction == 1 else 'tn'] += 1
            else:
                print(case['sentence']['nile'], case['hypothesis']['nile'])
                summary['fp' if prediction == 1 else 'fn'] += 1
                summary_by_type[case['type']][
                    'fp' if prediction == 1 else 'fn'] += 1
                print(features_vector, prediction, case['conflict'])
            results.append(
                (case['sentence']['university'],
                 case['hypothesis']['university'],
                 case['sentence']['text'], case['hypothesis']['text'],
                 case['sentence']['nile'], case['hypothesis']['nile'],
                 case['type'], case['conflict'], features_vector,
                 prediction))

        with open(config.CONFLICTS_RESULTS_PATH.format('campi', 'all'),
                  'w') as csvfile:
            csv_writer = csv.writer(csvfile, delimiter=',')
            csv_writer.writerow([
                'sentence university', 'hypothesis university',
                'sentence text', 'hypothesis text', 'sentence nile',
                'hypothesis nile', 'type', 'conflict', 'features',
                'prediction'
            ])
            for (stn_uni, hyp_uni, stn_text, hyp_text, stn_nile, hyp_nile,
                 conflict_type, conflict, features, prediction) in results:
                csv_writer.writerow([
                    stn_uni, hyp_uni, stn_text, hyp_text, stn_nile,
                    hyp_nile, conflict_type, conflict, features, prediction
                ])

        summary['precision'] = metrics.precision(summary['tp'],
                                                 summary['fp'])
        summary['recall'] = metrics.recall(summary['tp'], summary['fn'])
        summary['f1'] = metrics.f1_score(summary['precision'],
                                         summary['recall'])

        with open(config.CONFLICTS_RESULTS_PATH.format('campi',
                                                       'all_summary'),
                  'w') as csvfile:
            csv_writer = csv.writer(csvfile, delimiter=',')
            csv_writer.writerow(
                ['type', 'tp', 'tn', 'fp', 'fn', 'precision', 'recall',
                 'f1'])
            for conflict_type, result in summary_by_type.items():
                result['precision'] = metrics.precision(result['tp'],
                                                        result['fp'])
                result['recall'] = metrics.recall(result['tp'],
                                                  result['fn'])
                result['f1'] = metrics.f1_score(result['precision'],
                                                result['recall'])
                csv_writer.writerow([
                    conflict_type, result['tp'], result['tn'],
                    result['fp'], result['fn'], result['precision'],
                    result['recall'], result['f1']
                ])
            csv_writer.writerow([
                'total', summary['tp'], summary['tn'], summary['fp'],
                summary['fn'], summary['precision'], summary['recall'],
                summary['f1']
            ])
        print(summary)
    else:
        print("Problem loading model")
features_train, features_test, labels_train, labels_test = train_test_split(
    features, labels, test_size=0.4, random_state=0)

# The decision tree classifier
# clf1 = DecisionTreeClassifier()
# clf1.fit(features, labels)

# create the decision tree classifier, clf1
clf1 = DecisionTreeClassifier()

# Train the decision tree classifier with labels_train and features_train
# (you 'train' with the 'trains')
clf1.fit(features_train, labels_train)

# Use the precision and recall evaluation metrics on the 'test' data,
# i.e. features_test and labels_test
print("Decision Tree recall: {:.2f} and precision: {:.2f}".format(
    recall(labels_test, clf1.predict(features_test)),
    precision(labels_test, clf1.predict(features_test))))

# As seen in the line above, get the decision tree recall 'dt_recall' by
# applying the recall function to the test-set data
dt_recall = recall(labels_test, clf1.predict(features_test))

# Likewise, get the decision tree precision 'dt_precision' by applying the
# precision function to the test-set data
dt_precision = precision(labels_test, clf1.predict(features_test))

# The naive Bayes classifier
# clf2 = GaussianNB()
# clf2.fit(features, labels)

# First, as usual, create the classifier, clf2
clf2 = GaussianNB()
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import recall_score as recall
from sklearn.metrics import precision_score as precision
from sklearn.naive_bayes import GaussianNB

# TODO: split the data into training and testing sets,
# using the standard settings for train_test_split.
# Then, train and test the classifiers with your newly split data
# instead of X and y.
from sklearn.model_selection import train_test_split

X_train, x, Y_train, y = train_test_split(X, Y)

clf1 = DecisionTreeClassifier()
clf1.fit(X_train, Y_train)
recall1 = recall(y, clf1.predict(x))
precision1 = precision(y, clf1.predict(x))
print("Decision Tree recall: {:.2f} and precision: {:.2f}".format(
    recall1, precision1))

clf2 = GaussianNB()
clf2.fit(X_train, Y_train)
recall2 = recall(y, clf2.predict(x))
precision2 = precision(y, clf2.predict(x))
print("GaussianNB recall: {:.2f} and precision: {:.2f}".format(
    recall2, precision2))

results = {
    "Naive Bayes Recall": recall2,
    "Naive Bayes Precision": precision2,
    "Decision Tree Recall": recall1,
    "Decision Tree Precision": precision1
}
def lgb_precision_macro(pred, real):
    '''
    sklearn.metrics.precision_score wrapper for LGB
    '''
    is_higher_better = True
    score = precision(real.label, pred > 0.5, average='macro')
    return 'lgb_precision_macro', score, is_higher_better
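# Hedged sketch of wiring the custom metric into LightGBM via feval; the
# dataset variables X_train / y_train are illustrative. LightGBM calls
# feval with (preds, eval_data) and expects the (name, value,
# is_higher_better) triple the wrapper returns.
import lightgbm as lgb
from sklearn.metrics import precision_score as precision

train_set = lgb.Dataset(X_train, label=y_train)
booster = lgb.train({'objective': 'binary', 'verbose': -1},
                    train_set,
                    valid_sets=[train_set],
                    feval=lgb_precision_macro)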
X = X._get_numeric_data()
y = X['Survived']
del X['Age'], X['Survived']

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import recall_score as recall
from sklearn.metrics import precision_score as precision
from sklearn.naive_bayes import GaussianNB

# TODO: split the data into training and testing sets,
# using the standard settings for train_test_split.
# Then, train and test the classifiers with your newly split data
# instead of X and y.
from sklearn.model_selection import train_test_split

features_train, features_test, labels_train, labels_test = train_test_split(
    X, y, test_size=0.4, random_state=0)

clf1 = DecisionTreeClassifier()
clf1.fit(features_train, labels_train)
print("Decision Tree recall: {:.2f} and precision: {:.2f}".format(
    recall(labels_test, clf1.predict(features_test)),
    precision(labels_test, clf1.predict(features_test))))

clf2 = GaussianNB()
clf2.fit(features_train, labels_train)
print("GaussianNB recall: {:.2f} and precision: {:.2f}".format(
    recall(labels_test, clf2.predict(features_test)),
    precision(labels_test, clf2.predict(features_test))))

results = {
    "Naive Bayes Recall": recall(labels_test, clf2.predict(features_test)),
    "Naive Bayes Precision": precision(labels_test,
                                       clf2.predict(features_test)),
    "Decision Tree Recall": recall(labels_test, clf1.predict(features_test)),
    "Decision Tree Precision": precision(labels_test,
                                         clf1.predict(features_test))
}
from sklearn.model_selection import train_test_split

features_train, features_test, labels_train, labels_test = train_test_split(
    features, labels, test_size=0.4, random_state=0)

# The decision tree classifier
# clf1 = DecisionTreeClassifier()
# clf1.fit(features, labels)

# create the decision tree classifier, clf1
clf1 = DecisionTreeClassifier()

# Train the decision tree classifier with labels_train and features_train
# (you 'train' with the 'trains')
clf1.fit(features_train, labels_train)

# Use the precision and recall evaluation metrics on the 'test' data,
# i.e. features_test and labels_test
print("Decision Tree recall: {:.2f} and precision: {:.2f}".format(
    recall(labels_test, clf1.predict(features_test)),
    precision(labels_test, clf1.predict(features_test))))

# As seen in the line above, get the decision tree recall 'dt_recall' by
# applying the recall function to the test-set data
dt_recall = recall(labels_test, clf1.predict(features_test))

# Likewise, get the decision tree precision 'dt_precision' by applying the
# precision function to the test-set data
dt_precision = precision(labels_test, clf1.predict(features_test))

# The naive Bayes classifier
# clf2 = GaussianNB()
# clf2.fit(features, labels)
"Decision Tree Score": accuracy_score(clf1.predict(feature_test),label_test) } #Consufion matrix from sklearn.metrics import confusion_matrix confusions = { "Naive Bayes": confusion_matrix(clf2.predict(feature_test), label_test), "Decision Tree": confusion_matrix(clf1.predict(feature_test), label_test) } print confusions # Precision and recall from sklearn.metrics import recall_score as recall from sklearn.metrics import precision_score as precision results = { "Naive Bayes Recall": recall(clf2.predict(feature_test),label_test), "Naive Bayes Precision": precision(clf2.predict(feature_test),label_test), "Decision Tree Recall": recall(clf1.predict(feature_test),label_test), "Decision Tree Precision": precision(clf1.predict(feature_test),label_test) } print results # Naive Bayes from sklearn.metrics import f1_score F1_scores = { "Naive Bayes": f1_score(clf2.predict(feature_test),label_test), "Decision Tree": f1_score(clf1.predict(feature_test),label_test) } print F1_scores
y_file = sys.argv[1]
p_file = sys.argv[2]

print("loading p...")
p = np.loadtxt(p_file)

# Map raw margin scores to {-1, +1} labels.
y_predicted = np.ones((p.shape[0]))
y_predicted[p < 0] = -1

print("loading y...")
y = np.loadtxt(y_file, usecols=[0])

print("accuracy:", accuracy(y, y_predicted))
print("precision:", precision(y, y_predicted, average='binary'))
print("recall:", recall(y, y_predicted, average='binary'))
print("AUC:", AUC(y, p))
print()
print("confusion matrix:")
print(confusion_matrix(y, y_predicted))

"""
run score.py data/test_v.txt vw/p_v_logistic.txt

accuracy: 0.994675826535
confusion matrix:
[[27444   136]