def tune_threshold(y, y_prob, eta=0.1, plev=0.5, max_iter=100, output=True):
    """
    Tunes the threshold of the decision rule to improve accuracy.
    
    Keyword Arguments:
    y - the ground truth labels
    y_prob - the model's predicted probabilities
    eta - learning rate for the threshold update
    plev - the level of precision we are trying to maintain
    max_iter - maximum number of update iterations
    output - if True, print precision/recall at each step
    """
    threshold = 0.5
    yhat = decide(y_prob, threshold)
    p = precision(y, yhat)
    r = recall(y, yhat)
    initial_loss = directional_loss(y, yhat)
    if output:
        print(f"Precision = {p}, Recall = {r}, Threshold = {threshold}")

    for i in range(1, max_iter):
        threshold -= eta / i * threshold
        yhat = decide(y_prob, threshold)

        p = precision(y, yhat)
        r = recall(y, yhat)

        if output:
            print(f"Precision = {p}, Recall = {r}, Threshold = {threshold}")

        if (p <= plev):
            return threshold

    return threshold
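A minimal usage sketch follows. decide, precision, recall, and directional_loss are defined elsewhere in the source module, so the stand-ins below are assumptions rather than the original helpers.

import numpy as np
from sklearn.metrics import precision_score as precision, recall_score as recall

def decide(y_prob, threshold):
    # assumed stand-in: binarize predicted probabilities at the given threshold
    return (np.asarray(y_prob) >= threshold).astype(int)

def directional_loss(y, yhat):
    # hypothetical placeholder; the original loss function is not shown
    return float(np.mean(np.asarray(y) != np.asarray(yhat)))

rng = np.random.default_rng(0)
y = rng.integers(0, 2, size=200)
y_prob = np.clip(y * 0.6 + rng.normal(0.2, 0.25, size=200), 0, 1)

chosen = tune_threshold(y, y_prob, eta=0.1, plev=0.9, max_iter=50)
print("chosen threshold:", chosen)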
Example #2
def eval_kaggle_score(df_pred, Num):
    metrics = MulticlassMetrics(
        df_pred.rdd.map(lambda ar: (float(np.argsort(ar.probability)[-1:]),
                                    float(ar.hotel_cluster))))
    NumCluster = Num
    avg_precision = metrics.precision()
    for i in range(1, NumCluster):
        metrics = MulticlassMetrics(
            df_pred.rdd.map(lambda ar:
                            (float(np.argsort(ar.probability)[-(i + 1):-i]),
                             float(ar.hotel_cluster))))
        avg_precision += metrics.precision()
    return avg_precision
Example #3
def pred_precision_kaggle(prediction, NumCluster):

    pred_label = prediction.rdd.map(lambda x: (float(
        np.argsort(-1 * x.probability)[:1]), float((x.hotel_cluster))))
    metrics = MulticlassMetrics(pred_label)
    avg_precision = metrics.precision()

    for i in range(1, NumCluster):
        pred_label = prediction.rdd.map(
            lambda x: (float(np.argsort(-1 * x.probability)[i:(i + 1)]),
                       float(x.hotel_cluster)))
        metrics = MulticlassMetrics(pred_label)
        avg_precision += metrics.precision()

    return avg_precision
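In older Spark MLlib, MulticlassMetrics.precision() with no argument is documented as equal to overall accuracy, so the two functions above effectively sum the per-rank hit rates of the top NumCluster predicted classes. The plain NumPy sketch below captures that idea; it is an interpretation of the intent, not a drop-in replacement for the Spark code.

import numpy as np

def topk_hit_rate_sum(prob, y_true, k):
    # prob: (n_samples, n_classes) predicted probabilities, y_true: (n_samples,) true class ids
    order = np.argsort(-np.asarray(prob), axis=1)[:, :k]   # top-k predicted classes, best first
    hits = order == np.asarray(y_true).reshape(-1, 1)      # does rank i hold the true class?
    return hits.mean(axis=0).sum()                         # per-rank hit rates summed over ranks 1..k

prob = np.array([[0.1, 0.7, 0.2], [0.5, 0.3, 0.2]])
print(topk_hit_rate_sum(prob, [1, 1], k=2))   # 0.5 (rank 1) + 0.5 (rank 2) = 1.0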
def tune_model_width(build_fn, x_train, y_train, x_val, y_val, max_width=50):
    """
    Takes a 3-Layer nueral network and expands width to see if there 
    are tangible benefits to increasing the width of the hidden layer 
    in the model. 
    
    Parameters: 
    build_fn - function that returns a keras nn model with the specified parameters 
    x_train - the data matrix 
    y_train - the response function
    x_val - validation data
    y_val - validation data function
    """

    acc = []
    pre = []
    rec = []

    for i in range(15, max_width):
        width = i
        model = feed_forward.build_model(x_train,
                                         y_train,
                                         width=width,
                                         suppress=True)
        model.fit(x_train, y_train, epochs=100, verbose=0)

        y_val_prob = model.predict(x_val)[:, 0]
        y_val_hat = decide(y_val_prob, 0.5)

        acc.append(accuracy(y_val, y_val_hat))
        pre.append(precision(y_val, y_val_hat))
        rec.append(recall(y_val, y_val_hat))

    return acc, pre, rec
Example #5
def present_results_simp(y_test, predictions):
    results_list = []
    for k, v in predictions.items():
        inter_list = [
            k,
            accuracy(v, y_test),
            precision(v, y_test),
            precision_top(v, y_test, 0.01),
            precision_top(v, y_test, 0.02),
            precision_top(v, y_test, 0.05),
            precision_top(v, y_test, 0.1),
            precision_top(v, y_test, 0.2),
            precision_top(v, y_test, 0.3),
            precision_top(v, y_test, 0.5),
            recall(v, y_test),
            recall_top(v, y_test, 0.01),
            recall_top(v, y_test, 0.02),
            recall_top(v, y_test, 0.05),
            recall_top(v, y_test, 0.1),
            recall_top(v, y_test, 0.2),
            recall_top(v, y_test, 0.3),
            recall_top(v, y_test, 0.5),
            f1(v, y_test)
        ]
        results_list.append(inter_list)
    df = pd.DataFrame(results_list)
    df.columns = [
        'Model', 'Accuracy', 'Precision', 'Precision top 1%',
        'Precision top 2%', 'Precision top 5%', 'Precision top 10%',
        'Precision top 20%', 'Precision top 30%', 'Precision top 50%',
        'Recall', 'Recall top 1%', 'Recall top 2%', 'Recall top 5%',
        'Recall top 10%', 'Recall top 20%', 'Recall top 30%', 'Recall top 50%',
        'F 1'
    ]
    return df
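precision_top and recall_top are not defined on this page. A plausible reading (an assumption, not the author's implementation) is that they flag the top pct fraction of examples by predicted score and then compute ordinary precision or recall on those hard labels:

import numpy as np
from sklearn.metrics import precision_score, recall_score

def _top_pct_preds(y_score, pct):
    # mark the top `pct` fraction of examples (by predicted score) as positive
    y_score = np.asarray(y_score, dtype=float)
    k = max(int(round(pct * len(y_score))), 1)
    y_pred = np.zeros(len(y_score), dtype=int)
    y_pred[np.argsort(-y_score)[:k]] = 1
    return y_pred

def precision_top(y_score, y_true, pct):
    return precision_score(np.asarray(y_true), _top_pct_preds(y_score, pct))

def recall_top(y_score, y_true, pct):
    return recall_score(np.asarray(y_true), _top_pct_preds(y_score, pct))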
def evaluate_rf(x_train,
                y_train,
                x_test,
                y_test,
                thresh=thresh,
                ntrees=[25, 100, 500],
                maxfeats=[1, .5, 4]):
    rd = {
        'predicted': [],
        'ntrees': [],
        'nfeats': [],
        'threshold': [],
        'precision': [],
        'recall': [],
        'accuracy': [],
        'class': []
    }
    for size in ntrees:
        for f in maxfeats:
            scores = random_forest_classifier(size, f, x_train, y_train,
                                              x_test)
            for t in thresh:
                scores = list(stats.rankdata(scores, 'average') / len(scores))
                preds = [compare_to_threshold(x, t) for x in scores]
                rd['predicted'].append(preds)
                rd['ntrees'].append(size)
                rd['nfeats'].append(f)
                rd['threshold'].append(t)
                rd['precision'].append(precision(y_test, preds))
                rd['recall'].append(recall(y_test, preds))
                rd['accuracy'].append(accuracy(y_test, preds))
                rd['class'].append('rf')

    return pd.DataFrame(rd)
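compare_to_threshold is defined elsewhere in the source; a plausible stand-in (an assumption) and the rank-to-percentile conversion used by the evaluate_* helpers are sketched below. Because scores are converted to percentile ranks first, a threshold t roughly corresponds to flagging the top (1 - t) fraction of examples.

from scipy import stats

def compare_to_threshold(score, threshold):
    # assumed behaviour: positive when the percentile-ranked score clears the threshold
    return 1 if score >= threshold else 0

raw_scores = [0.1, 0.9, 0.4, 0.7]
pct_scores = list(stats.rankdata(raw_scores, 'average') / len(raw_scores))   # [0.25, 1.0, 0.5, 0.75]
preds = [compare_to_threshold(s, 0.5) for s in pct_scores]                   # [0, 1, 1, 1]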
def evaluate_knn(x_train,
                 y_train,
                 x_test,
                 y_test,
                 kays=[3, 5, 7, 9, 11],
                 thresh=thresh):
    '''
    generates df of predictions, k values, thresholds, precision,
    recall, and accuracy to help find best model
    '''
    rd = {
        'predicted': [],
        'k': [],
        'threshold': [],
        'precision': [],
        'recall': [],
        'accuracy': [],
        'class': []
    }
    for k in kays:
        scores = knn_classifier(x_train, y_train, x_test, k)[:, 1]
        for t in thresh:
            scores = list(stats.rankdata(scores, 'average') / len(scores))
            preds = [compare_to_threshold(x, t) for x in scores]
            rd['predicted'].append(preds)
            rd['k'].append(k)
            rd['threshold'].append(t)
            rd['precision'].append(precision(y_test, preds))
            rd['recall'].append(recall(y_test, preds))
            rd['accuracy'].append(accuracy(y_test, preds))
            rd['class'].append('knn')

    return pd.DataFrame(rd)
Example #8
def evaluate_logreg(x_train, y_train, x_test, y_test,
                    c_values=[.01,.1,1,10,100], thresh=thresh):
    '''
    generates df of predictions, penalties, c_values, thresholds, precision, recall, and
    accuracy of logistic regression
    '''
    penalties = ['l2']
    rd = {'predicted': [], 'penalty': [], 'C': [], 'threshold': [],
          'precision': [], 'recall': [], 'accuracy':[], 'class': []}
    
    for p in penalties:
        for c in c_values:
            scores = logreg_classifier(x_train, y_train, x_test, c, p)
            for t in thresh:
                scores = list(stats.rankdata(scores, 'average') / len(scores))
                preds = [compare_to_threshold(x, t) for x in scores]
                rd['predicted'].append(preds)
                rd['penalty'].append(p)
                rd['C'].append(c)
                rd['threshold'].append(t)
                rd['precision'].append(precision(y_test, preds))
                rd['recall'].append(recall(y_test, preds))
                rd['accuracy'].append(accuracy(y_test, preds))
                rd['class'].append('logreg')

    return pd.DataFrame(rd)
Example #9
def get_metrics(prediction, y_test):
    '''
    Computes accuracy, precision, recall, ROC-AUC and F1 metrics for
    predictions produced by an ML model and the actual values of the
    dependent variable.
    Inputs:
        - prediction: an array with predictions.
        - y_test: an array with actual values.
    Returns a dictionary with metrics of a ML model.
    '''
    Accuracy = accuracy(prediction, y_test)
    Precision = precision(prediction, y_test)
    Recall = recall(prediction, y_test)
    try:
        AUC = roc_auc(prediction, y_test)
    except ValueError:
        AUC = 0
    F1 = f1(prediction, y_test)

    metrics_dict = {
        'Accuracy': Accuracy,
        'Precision': Precision,
        'Recall': Recall,
        'AUC': AUC,
        'F1': F1
    }
    return metrics_dict
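The metric names used throughout these examples are presumably sklearn aliases; the imports are not shown here, so the mapping below is an assumption. Note that get_metrics passes (prediction, y_test), the reverse of sklearn's usual (y_true, y_pred) order.

import numpy as np
from sklearn.metrics import accuracy_score as accuracy
from sklearn.metrics import precision_score as precision
from sklearn.metrics import recall_score as recall
from sklearn.metrics import roc_auc_score as roc_auc
from sklearn.metrics import f1_score as f1

y_test = np.array([1, 0, 1, 1, 0, 0, 1, 0])
prediction = np.array([1, 0, 0, 1, 0, 1, 1, 0])
print(get_metrics(prediction, y_test))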
def evaluate_dectree(x_train, y_train, x_test, y_test, thresh=thresh):
    '''
    generates df of predictions, criteria, thresholds, precision,
    recall, and accuracy for decision tree models
    '''
    criterion = ['entropy', 'gini']
    rd = {
        'predicted': [],
        'crit': [],
        'threshold': [],
        'precision': [],
        'recall': [],
        'accuracy': [],
        'class': []
    }

    for c in criterion:
        scores = dectree_classifier(x_train, y_train, x_test, c)
        for t in thresh:
            scores = list(stats.rankdata(scores, 'average') / len(scores))
            preds = [compare_to_threshold(x, t) for x in list(scores)]
            rd['predicted'].append(preds)
            rd['crit'].append(c)
            rd['threshold'].append(t)
            rd['precision'].append(precision(y_test, preds))
            rd['recall'].append(recall(y_test, preds))
            rd['accuracy'].append(accuracy(y_test, preds))
            rd['class'].append('dectree')

    return pd.DataFrame(rd)
Example #11
def pred_precision(prediction):

    pred_label = prediction.rdd.map(
        lambda x: (float(x.prediction), float(x.hotel_cluster)))
    metrics = MulticlassMetrics(pred_label)
    precision = metrics.precision()

    return round(precision * 100, 2)
def print_prediction_metrics(clf, x, y, k):
    pred = cross_val_predict(clf,
                             x,
                             y,
                             cv=StratifiedKFold(n_splits=k, shuffle=True))
    print("Accuracy: ", round(accuracy(y, pred), 2))
    print("Precision on spam: ", round(precision(y, pred, average=None)[1], 3))
    print("Recall on spam: ", round(recall(y, pred, average=None)[1], 3))
    return
Example #13
 def update_metrics(gt, pre, f1_m, p_m, r_m, acc_m):
     f1_value = f1(gt, pre, average="micro")
     f1_m.update(f1_value)
     p_value = precision(gt, pre, average="micro", zero_division=0)
     p_m.update(p_value)
     r_value = recall(gt, pre, average="micro")
     r_m.update(r_value)
     acc_value = accuracy(gt, pre)
     acc_m.update(acc_value)
Example #14
 def __init__(self, model, parameters, name, threshold, x_train, y_train, x_test,
              y_test):
     self.params = parameters
     self.model = model.set_params(**parameters)
     self.scores = classify(x_train, y_train, x_test, self.model)
     self.truth = y_test
     self.predictions = predict(self.scores, threshold)
     self.accuracy = accuracy(self.truth, self.predictions)
     self.precision = precision(self.truth, self.predictions)
     self.recall = recall(self.truth, self.predictions)
     self.f1 = 2 * (self.precision * self.recall) / (self.precision + self.recall)
     self.name = None
     ClassifierAnalyzer.identifier += 1
Example #15
def evaluate_bagging(x_train, y_train, x_test, y_test, thresh=thresh):
    rd = {'predicted': [], 'threshold': [], 'precision': [], 'recall': [],
          'accuracy':[], 'class': []}
    scores = bagging_classifier(x_train, y_train, x_test)
    for t in thresh:
        scores = list(stats.rankdata(scores, 'average')/len(scores))
        preds = [compare_to_threshold(x, t) for x in list(scores)]
        rd['predicted'].append(preds)
        rd['threshold'].append(t)
        rd['precision'].append(precision(y_test, preds))
        rd['recall'].append(recall(y_test, preds))
        rd['accuracy'].append(accuracy(y_test, preds))
        rd['class'].append('bagging')
        
    return pd.DataFrame(rd)
Example #16
def calculate_precision(predicted_scores, y_test, threshold):
	'''
	Calculate the precision of the trained model.

	Inputs:
		predicted_scores (numpy array) - probabilities that data points belong
			to class 1
		y_test (pandas dataframe) - label testing data
		threshold (float) - if predicted score is above this threshold,
			consider it to be class 1
	Outputs:
		test_precision (float)
	'''
	predictions = get_predictions_with_threshold(predicted_scores, threshold)
	test_precision = precision(y_test, predictions)

	return test_precision
Example #17
 def __init__(self, model, parameters, name, threshold, x_train, y_train,
              x_test, y_test):
     self.params = parameters
     self.t = threshold
     self.model = model.set_params(**parameters)
     self.scores = classify(x_train, y_train, x_test, self.model)
     self.truth = y_test
     self.predictions = predict(self.scores, self.t)
     self.predicted_for_pct = sum(self.predictions) / len(self.predictions)
     self.accuracy = accuracy(self.truth, self.predictions)
     self.precision = precision(self.truth, self.predictions)
     self.recall = recall(self.truth, self.predictions)
     self.f1 = 2 * (self.precision * self.recall) / (self.precision +
                                                      self.recall)
     self.name = ClassifierAnalyzer.identifier_counter
     self.roc_auc = None
     ClassifierAnalyzer.identifier_counter += 1
Example #18
def simp_run_through(evals, facs, features, year_col, start, split, end,
                     classes, parameters, thresholds):
    rv = {
        'class': [],
        'DT': [],
        'threshold': [],
        'precision': [],
        'recall': [],
        'preds': [],
        'top_features': []
    }
    df = make_df(evals, facs)
    train, test = simp_windows(df, year_col, start, split, end, features)
    trx, tr_y, tex, te_y = simp_x_y_split(train, test)
    train_dates = trx['EVALUATION_START_DATE']
    test_dates = tex['EVALUATION_START_DATE']
    trx.drop(['EVALUATION_START_DATE', 'FACILITY_NAME', 'MostRecentEval'],
             inplace=True,
             axis=1)
    tex.drop(['EVALUATION_START_DATE', 'FACILITY_NAME', 'MostRecentEval'],
             inplace=True,
             axis=1)
    for c in classes:
        for p in parameters[c]:
            if c == 'DT':
                scores, imps = dectree_classifier(trx, tr_y, tex, p)
            for t in thresholds:
                preds = [compare_to_threshold(x, t) for x in list(scores)]
                rv[c].append(p)
                rv['class'].append(c)
                rv['threshold'].append(t)
                rv['precision'].append(precision(te_y, preds))
                rv['recall'].append(recall(te_y, preds))
                rv['preds'].append(preds)
                rv['top_features'].append(
                    list(zip(list(tex.columns), list(imps))))

    final = pd.DataFrame(rv)
    final.to_csv('results.csv')

    print(final)
    return final
    def make_prediction_matrix(self):
        rv_dic = {}
        predictions_df = pd.DataFrame()
        for thresh in self.t:
            x = round((1 - thresh), 2)
            preds = 'predictions_{}pct'.format(x)
            a = 'precision_{}pct'.format(x)
            b = 'recall_{}pct'.format(x)
            c = 'f1_{}pct'.format(x)
            predictions = predict(self.scores, thresh)
            predictions = [int(x) for x in predictions]
            d = '{}_at_{}pct'.format(self.name, x)
            predictions_df[d] = predictions
            prec = precision(self.truth, predictions)
            rec = recall(self.truth, predictions)
            rv_dic[a] = [prec]
            rv_dic[b] = [rec]
            rv_dic[c] = [(prec * rec * 2) / (prec + rec)]
            rv_dic['model'] = [self.name]

        return pd.DataFrame(rv_dic), predictions_df
Example #20
def test_classifiers(X, y, n=7, rname="results.txt"):
    clfs = {
        #"Bagging KNN": [BaggingClassifier(KNeighborsClassifier(), max_samples=0.5, max_features=0.5), [], [], [], []],
        "NN (kNN k=1)": [KNeighborsClassifier(n_neighbors=1), [], [], [], [], []],
        #"NN (kNN k=3)": [KNeighborsClassifier(n_neighbors=3), [], [], [], [], []],
        "NN (kNN k=3 w)": [KNeighborsClassifier(n_neighbors=3, weights='distance'), [], [], [], [], []],
        "NN (kNN k=5 w)": [KNeighborsClassifier(n_neighbors=5, weights='distance'), [], [], [], [], []],
        #"NN (kNN k=7 w)": [KNeighborsClassifier(n_neighbors=7, weights='distance'), [], [], [], []],
        #"SVM - Linear kernel": [svm.SVC(kernel="rbf", probability=True), [], [], [], []],
        #"Naive Bayes": [GaussianNB(), [], [], [], []],
        #"SVM Sigmoide": [svm.SVC(kernel="sigmoid"), [], [], [], []],
        #"ANN": [MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1), [], [], [], []],
    }
    #V = ["Voting KNN", [None, [], [], [], []]]
    skf = kfold(y, n_iter=n, random_state=None, train_size=0.7)
    output = open(rname, "w")
    for train, test in skf:
        Xt, Yt = X[train], y[train]
        Xv, Yv = X[test], y[test]
        votes = []
        for (k, v) in clfs.items():
            v[0].fit(Xt, Yt)
            #print(clfs[k])
            Yr = v[0].predict(Xv)
            #print(accs(Yv, Yr))
            v[1].append(accs(Yv, Yr))
            v[2].append(f1(Yv, Yr, average="macro"))
            v[3].append(recall(Yv, Yr, average="macro"))
            v[4].append(precision(Yv, Yr))
            v[5].append(kappa(Yv, Yr))
            #votes.append(Yr)
        #Yp = predict(votes)
    for k, v in clfs.items():
        fm = "%s | %s| %s | %s | %s\n"
        output.write(fm % (k, "Accuracy", np.mean(v[1]), min(v[1]), max(v[1])))
        #output.write(fm % (k, "Kappa", np.mean(v[5]), min(v[5]), max(v[5])))
        output.write(fm % (k, "F1", np.mean(v[2]), min(v[2]), max(v[2])))
        output.write(fm % (k, "Recall", np.mean(v[3]), min(v[3]), max(v[3])))
        output.write(fm % (k, "Precision", np.mean(v[4]), min(v[4]), max(v[4])))
def crossValidate(X, y, nfold):
    kf = KFold(n_splits=nfold, shuffle=True)
    kf.get_n_splits(X)

    sorted_indices = np.loadtxt('final_sorted_indices.txt', dtype=int)

    r = 16

    print("K-fold: K=", nfold)
    f1 = 0
    acc = 0
    prec = 0
    rec = 0
    spec = 0
    for train_index, test_index in kf.split(X):
        # print("TRAIN:", train_index, "TEST:", test_index)

        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        selected_feature_indices = sorted_indices[:r]
        X_train_selected_features = X_train[:, selected_feature_indices]
        X_test_selected_features = X_test[:, selected_feature_indices]

        clf = GaussianNB()
        y_pred = clf.fit(X_train_selected_features,
                         y_train).predict(X_test_selected_features)

        f1 += fscore(y_test, y_pred)
        acc += accuracy(y_test, y_pred)
        prec += precision(y_test, y_pred)
        rec += recall(y_test, y_pred)
        spec += specificity(y_test, y_pred)

    print('fscore', f1 / nfold)
    print('accuracy', acc / nfold)
    print('precision', prec / nfold)
    print('recall', rec / nfold)
    print('specificity', spec / nfold)
Example #22
def present_results(y_test, predictions):
    results_list = []
    for k, v in predictions.items():
        inter_list = [
            k,
            accuracy(v, y_test),
            precision(v, y_test),
            precision_top(v, y_test, 0.01),
            precision_top(v, y_test, 0.02),
            precision_top(v, y_test, 0.05),
            precision_top(v, y_test, 0.1),
            precision_top(v, y_test, 0.2),
            precision_top(v, y_test, 0.3),
            precision_top(v, y_test, 0.5),
            recall(v, y_test),
            recall_top(v, y_test, 0.01),
            recall_top(v, y_test, 0.02),
            recall_top(v, y_test, 0.05),
            recall_top(v, y_test, 0.1),
            recall_top(v, y_test, 0.2),
            recall_top(v, y_test, 0.3),
            recall_top(v, y_test, 0.5),
            f1(v, y_test)
        ]
        if k[:6] != 'd_tree':
            inter_list.append(roc_auc(v, y_test))
        else:
            inter_list.append('ND')
        results_list.append(inter_list)
    df = pd.DataFrame(results_list)
    df.columns = [
        'Model', 'Accuracy', 'Precision', 'Precision top 1%',
        'Precision top 2%', 'Precision top 5%', 'Precision top 10%',
        'Precision top 20%', 'Precision top 30%', 'Precision top 50%',
        'Recall', 'Recall top 1%', 'Recall top 2%', 'Recall top 5%',
        'Recall top 10%', 'Recall top 20%', 'Recall top 30%', 'Recall top 50%',
        'F 1', 'ROC AUC'
    ]
    return df
Example #23
def compute_eval_stats(classifier, y_data, rankings, threshold):
    ''' Takes: classifier object, true target data, predicted score rankings, 
                ranking threshold cutoff
        Returns: accuracy, precision, recall of predictions of classifier on x for y
    '''

    predicted_test = np.where(rankings < threshold, 1, 0)

    # print(threshold)
    # print(predicted_test.sum())
    # print(predicted_test[0:10])
    # print("num unique ranks: ", pd.DataFrame(pred_scores)[0].unique().shape)
    # print("eval stats rankings are: ", rankings[0:10])

    stats = [
        accuracy(y_data, predicted_test),
        precision(y_data, predicted_test),
        recall(y_data, predicted_test),
        f1(y_data, predicted_test),
        roc(y_data, predicted_test)
    ]

    return stats
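A small usage sketch, assuming roc aliases sklearn's roc_auc_score and the other names alias the usual sklearn metrics; the classifier argument is unused by the function, so None is passed here.

import numpy as np
from sklearn.metrics import accuracy_score as accuracy
from sklearn.metrics import precision_score as precision
from sklearn.metrics import recall_score as recall
from sklearn.metrics import f1_score as f1
from sklearn.metrics import roc_auc_score as roc

rng = np.random.default_rng(1)
y_data = rng.integers(0, 2, size=100)
rankings = rng.permutation(100)   # lower rank = stronger positive prediction
print(compute_eval_stats(None, y_data, rankings, threshold=20))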
Example #24
def get_trigger_identification_f1(gold_Y, pred_Y):
    """
    Print out P/R/F1 scores for trigger identification

    :param gold_Y: gold output
    :param pred_Y: predicted output
    :return:
    """
    gold_ti = []
    pred_ti = []

    for i in range(len(gold_Y)):
        if gold_Y[i] != 0:
            gold_ti.append(1)
        else:
            gold_ti.append(0)
        if pred_Y[i] != 0:
            pred_ti.append(1)
        else:
            pred_ti.append(0)

    return 100*precision(gold_ti, pred_ti), \
           100*recall(gold_ti, pred_ti), \
           100*f1(gold_ti, pred_ti)
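A small usage sketch, assuming precision, recall, and f1 alias the corresponding sklearn scorers:

from sklearn.metrics import precision_score as precision
from sklearn.metrics import recall_score as recall
from sklearn.metrics import f1_score as f1

gold_Y = [0, 3, 0, 1, 2, 0]   # 0 = no trigger, non-zero = some trigger class
pred_Y = [0, 1, 2, 1, 0, 0]
p, r, f = get_trigger_identification_f1(gold_Y, pred_Y)
print("P=%.1f R=%.1f F1=%.1f" % (p, r, f))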
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import recall_score as recall
from sklearn.metrics import precision_score as precision
from sklearn.naive_bayes import GaussianNB


# TODO: split the data into training and testing sets,
# using the standard settings for train_test_split.
# Then, train and test the classifiers with your newly split data instead of X and y.
from sklearn import cross_validation
features = X
labels = y
features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(features, labels, test_size=0.5, random_state=0)

clf1 = DecisionTreeClassifier()
clf1.fit(features_train, labels_train)
decision_tree_recall = recall(labels_test, clf1.predict(features_test))
decision_tree_precision = precision(labels_test, clf1.predict(features_test))
print "Decision Tree recall: {:.2f} and precision: {:.2f}".format(decision_tree_recall, decision_tree_precision)

clf2 = GaussianNB()
clf2.fit(features_train, labels_train)
nb_recall = recall(labels_test, clf2.predict(features_test))
nb_precision = precision(labels_test, clf2.predict(features_test))
print "GaussianNB recall: {:.2f} and precision: {:.2f}".format(nb_recall, nb_precision)
# clf2.fit(X, y)
# print "GaussianNB recall: {:.2f} and precision: {:.2f}".format(recall(y,clf2.predict(X)),precision(y,clf2.predict(X)))

results = {
  "Naive Bayes Recall": nb_recall,
  "Naive Bayes Precision": nb_precision,
  "Decision Tree Recall": decision_tree_recall,
  "Decision Tree Precision": decision_tree_precision
print(len(y) - Counter(y)[4])

bi_y = list(map(binary_y, y))
print(Counter(bi_y))

precisions = []
lams = []
recalls = []
f1s = []
for i, lam in enumerate(lam_list):
    S = np.load(folder + "\\" + "lam" + lam + "\\" + r"l21S.npk",
                allow_pickle=True)
    predictions = list(map(binary_error, np.linalg.norm(S, axis=1)))
    print("lambda:", lam)
    print("precision",
          precision(bi_y, predictions, labels=["o", "m"], pos_label="o"))
    print("recall", recall(bi_y, predictions, labels=["o", "m"],
                           pos_label="o"))
    print("f1", f1_score(bi_y, predictions, labels=["o", "m"], pos_label="o"))
    lams.append(lam)
    precisions.append(
        precision(bi_y, predictions, labels=["o", "m"], pos_label="o"))
    recalls.append(recall(bi_y, predictions, labels=["o", "m"], pos_label="o"))
    f1s.append(f1_score(bi_y, predictions, labels=["o", "m"], pos_label="o"))
    print(CM(bi_y, predictions))
    print("------------")
print(len(lams), len(recalls), len(f1s), len(precisions))

d = {
    "lambda": list(map(float, lams)),
    "precision": precisions,
    "recall": recalls,
    "f1": f1s
}
X = X._get_numeric_data()
y = X['Survived']
del X['Age'], X['Survived']

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score as recall
from sklearn.metrics import precision_score as precision
from sklearn.naive_bayes import GaussianNB
from sklearn import cross_validation
# using the standard settings for train_test_split.
# Then, train and test the classifiers with your newly split data instead of X and y.
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.25)


clf1 = DecisionTreeClassifier()
clf1.fit(X_train, y_train)
y_test_pred = clf1.predict(X_test)
precision_cf1 = precision(y_test, y_test_pred)
recall_cf1 = recall(y_test, y_test_pred)
score_1 = f1_score(y_test, y_test_pred)
print "Decision Tree recall: {:.2f} and precision: {:.2f}".format(recall_cf1, precision_cf1)
print "Decision Tree F1 score: {:.2f}".format(score_1)

clf2 = GaussianNB()
clf2.fit(X_train, y_train)
y_test_pred = clf2.predict(X_test)
precision_cf2 = precision(y_test, y_test_pred)
recall_cf2 = recall(y_test, y_test_pred)
score_2 = f1_score(y_test, y_test_pred)
print "GaussianNB recall: {:.2f} and precision: {:.2f}".format(recall_cf2, precision_cf2)
print "GaussianNB F1 score: {:.2f}".format(score_2)
Example #28
X = X._get_numeric_data()
y = X['Survived']
del X['Age'], X['Survived']

# TODO: split the data into training and testing sets,
# using the standard settings for train_test_split.
# Then, train and test the classifiers with your newly split data instead of X and y.
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y)

clf1 = DecisionTreeClassifier()
clf1.fit(X_train, y_train)

clf2 = GaussianNB()
clf2.fit(X_train, y_train)

tree_recall, tree_precision = recall(y_test, clf1.predict(X_test)), precision(
    y_test, clf1.predict(X_test))
nb_recall, nb_precision = recall(y_test, clf2.predict(X_test)), precision(
    y_test, clf2.predict(X_test))

print "Decision Tree recall: {:.2f} and precision: {:.2f}".format(
    tree_recall, tree_precision)
print "GaussianNB recall: {:.2f} and precision: {:.2f}".format(
    nb_recall, nb_precision)

results = {
    "Naive Bayes Recall": nb_recall,
    "Naive Bayes Precision": nb_precision,
    "Decision Tree Recall": tree_recall,
    "Decision Tree Precision": tree_precision
}
Example #29
def analyze_campus_policies(model_size):
    """ runs tests with the trained Random Forest model, with each pair of intents in the campi dataset """
    print("MODEL TEST USING CAMPI ALL")

    campi_by_uni_dset = dataset.read('conflicts', 'campi', 'all')
    results = []
    summary = {
        'tp': 0,
        'tn': 0,
        'fp': 0,
        'fn': 0,
        'precision': 0,
        'recall': 0,
        'f1': 0
    }
    summary_by_type = {
        conflict_type: {
            'tp': 0,
            'tn': 0,
            'fp': 0,
            'fn': 0,
            'precision': 0,
            'recall': 0,
            'f1': 0
        }
        for conflict_type in
        ('qos', 'negation', 'path', 'time', 'synonym', 'hierarchy')
    }

    model = ClassificationModel('forest')
    if model.load(model_size):
        for case in campi_by_uni_dset:
            features_vector = get_features(case['sentence']['nile'],
                                           case['hypothesis']['nile'])
            prediction = model.predict([features_vector])[0]
            if prediction == case['conflict']:
                summary['tp' if prediction == 1 else 'tn'] += 1
                summary_by_type[case['type']]['tp' if prediction ==
                                              1 else 'tn'] += 1
            else:
                print(case['sentence']['nile'], case['hypothesis']['nile'])
                summary['fp' if prediction == 1 else 'fn'] += 1
                summary_by_type[case['type']]['fp' if prediction ==
                                              1 else 'fn'] += 1

            print(features_vector, prediction, case['conflict'])
            results.append(
                (case['sentence']['university'],
                 case['hypothesis']['university'], case['sentence']['text'],
                 case['hypothesis']['text'], case['sentence']['nile'],
                 case['hypothesis']['nile'], case['type'], case['conflict'],
                 features_vector, prediction))

        with open(config.CONFLICTS_RESULTS_PATH.format('campi', 'all'),
                  'w') as csvfile:
            csv_writer = csv.writer(csvfile, delimiter=',')
            csv_writer.writerow([
                'sentence university', 'hypothesis university',
                'sentence text', 'hypothesis text', 'sentence nile',
                'hypothesis nile', 'type', 'conflict', 'features', 'prediction'
            ])
            for (stn_uni, hyp_uni, stn_text, hyp_text, stn_nile, hyp_nile,
                 type, conflict, features, prediction) in results:
                csv_writer.writerow([
                    stn_uni, hyp_uni, stn_text, hyp_text, stn_nile, hyp_nile,
                    type, conflict, features, prediction
                ])

        summary['precision'] = metrics.precision(summary['tp'], summary['fp'])
        summary['recall'] = metrics.recall(summary['tp'], summary['fn'])
        summary['f1'] = metrics.f1_score(summary['precision'],
                                         summary['recall'])

        with open(config.CONFLICTS_RESULTS_PATH.format('campi', 'all_summary'),
                  'w') as csvfile:
            csv_writer = csv.writer(csvfile, delimiter=',')
            csv_writer.writerow(
                ['type', 'tp', 'tn', 'fp', 'fn', 'precision', 'recall', 'f1'])
            for type, result in summary_by_type.items():
                result['precision'] = metrics.precision(
                    result['tp'], result['fp'])
                result['recall'] = metrics.recall(result['tp'], result['fn'])
                result['f1'] = metrics.f1_score(result['precision'],
                                                result['recall'])

                csv_writer.writerow([
                    type, result['tp'], result['tn'], result['fp'],
                    result['fn'], result['precision'], result['recall'],
                    result['f1']
                ])

            csv_writer.writerow([
                'total', summary['tp'], summary['tn'], summary['fp'],
                summary['fn'], summary['precision'], summary['recall'],
                summary['f1']
            ])

        print(summary)
    else:
        print("Problem loading model")
Example #30
features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(
    features, labels, test_size=0.4, random_state=0)

# The decision tree classifier
# clf1 = DecisionTreeClassifier()
# clf1.fit(features,labels)

# create the decision tree classifier, clf1
clf1 = DecisionTreeClassifier()

# Train the decision tree classifier with labels_train and features_train ( you 'train' with the 'trains')
clf1.fit(features_train, labels_train)

#Use precision and recall evaluation metric to test the 'test' data  ie features_test and label_test
print "Decision Tree recall: {:.2f} and precision: {:.2f}".format(
    recall(labels_test, clf1.predict(features_test)),
    precision(labels_test, clf1.predict(features_test)))

# As seen in above line
# Get the decision tree recall 'dt_recall by applying recall function on 'test set' data of features and labels ie features_test & labels_test
dt_recall = recall(labels_test, clf1.predict(features_test))

# Also
# Get the decision tree precision 'dt_precision by applying precision function on 'test set' data of features and labels ie features_test & labels_test
dt_precision = precision(labels_test, clf1.predict(features_test))

# The naive Bayes classifier
# clf2 = GaussianNB()
# clf2.fit(features,labels)

# First, as usual create the classifier, clf2
clf2 = GaussianNB()
Example #31

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import recall_score as recall
from sklearn.metrics import precision_score as precision
from sklearn.naive_bayes import GaussianNB

# TODO: split the data into training and testing sets,
# using the standard settings for train_test_split.
# Then, train and test the classifiers with your newly split data instead of X and y.
from sklearn import cross_validation
X_train, x, Y_train, y = cross_validation.train_test_split(X, Y)

clf1 = DecisionTreeClassifier()
clf1.fit(X_train, Y_train)
recall1 = recall(y,clf1.predict(x))
precision1 = precision(y,clf1.predict(x))
print "Decision Tree recall: {:.2f} and precision: {:.2f}".format(recall1, precision1)

clf2 = GaussianNB()
clf2.fit(X_train, Y_train)
recall2 = recall(y,clf2.predict(x))
precision2 = precision(y,clf2.predict(x))
print "GaussianNB recall: {:.2f} and precision: {:.2f}".format(recall2, precision2)

results = {
  "Naive Bayes Recall": recall2,
  "Naive Bayes Precision": precision2,
  "Decision Tree Recall": recall1,
  "Decision Tree Precision": precision1
}
Example #32
def lgb_precision_macro(pred, real):
    ''' sklearn.metrics.precision_score wrapper for LGB '''
    is_higher_better = True
    score = precision(real.label, pred > 0.5, average='macro')
    return 'lgb_precision_macro', score, is_higher_better
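A hedged usage sketch: the surrounding training code is not shown, and precision is assumed to alias sklearn.metrics.precision_score. A custom metric like this is normally handed to LightGBM through the feval argument of lgb.train.

import numpy as np
import lightgbm as lgb
from sklearn.metrics import precision_score as precision

X = np.random.rand(200, 5)
y = (X[:, 0] > 0.5).astype(int)   # synthetic toy labels
dtrain = lgb.Dataset(X, label=y)

booster = lgb.train({'objective': 'binary', 'verbose': -1},
                    dtrain,
                    num_boost_round=10,
                    valid_sets=[dtrain],
                    feval=lgb_precision_macro)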
X = X._get_numeric_data()
y = X['Survived']
del X['Age'], X['Survived']


from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import recall_score as recall
from sklearn.metrics import precision_score as precision
from sklearn.naive_bayes import GaussianNB

# TODO: split the data into training and testing sets,
# using the standard settings for train_test_split.
# Then, train and test the classifiers with your newly split data instead of X and y.
from sklearn import cross_validation
features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(X, y, test_size=0.4, random_state=0)

clf1 = DecisionTreeClassifier()
clf1.fit(features_train, labels_train)
print "Decision Tree recall: {:.2f} and precision: {:.2f}".format(recall(labels_test, clf1.predict(features_test)), precision(labels_test, clf1.predict(features_test)))

clf2 = GaussianNB()
clf2.fit(features_train, labels_train)
print "GaussianNB recall: {:.2f} and precision: {:.2f}".format(recall(labels_test, clf2.predict(features_test)), precision(labels_test, clf2.predict(features_test)))

results = {
  "Naive Bayes Recall": recall(labels_test, clf2.predict(features_test)),
  "Naive Bayes Precision": precision(labels_test, clf2.predict(features_test)),
  "Decision Tree Recall": recall(labels_test, clf1.predict(features_test)),
  "Decision Tree Precision": precision(labels_test, clf1.predict(features_test))
}
from sklearn import cross_validation
features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(features, labels, test_size=0.4, random_state=0)

# The decision tree classifier
# clf1 = DecisionTreeClassifier()
# clf1.fit(features,labels)

# create the decision tree classifier, clf1
clf1 = DecisionTreeClassifier()

# Train the decision tree classifier with labels_train and features_train ( you 'train' with the 'trains')
clf1.fit(features_train, labels_train)

#Use precision and recall evaluation metric to test the 'test' data  ie features_test and label_test
print "Decision Tree recall: {:.2f} and precision: {:.2f}".format(recall(labels_test, clf1.predict(features_test)), precision(labels_test, clf1.predict(features_test)))

# As seen in above line
# Get the decision tree recall 'dt_recall by applying recall function on 'test set' data of features and labels ie features_test & labels_test
dt_recall = recall(labels_test, clf1.predict(features_test))

# Also
# Get the decision tree precision 'dt_precision by applying precision function on 'test set' data of features and labels ie features_test & labels_test
dt_precision = precision(labels_test, clf1.predict(features_test))




# The naive Bayes classifier
# clf2 = GaussianNB()
# clf2.fit(features,labels)
 "Decision Tree Score": accuracy_score(clf1.predict(feature_test),label_test)
}

#Consufion matrix
from sklearn.metrics import confusion_matrix
confusions = {
 "Naive Bayes": confusion_matrix(clf2.predict(feature_test), label_test),
 "Decision Tree": confusion_matrix(clf1.predict(feature_test), label_test)
}

print confusions

# Precision and recall
from sklearn.metrics import recall_score as recall
from sklearn.metrics import precision_score as precision
results = {
  "Naive Bayes Recall": recall(clf2.predict(feature_test),label_test),
  "Naive Bayes Precision": precision(clf2.predict(feature_test),label_test),
  "Decision Tree Recall": recall(clf1.predict(feature_test),label_test),
  "Decision Tree Precision": precision(clf1.predict(feature_test),label_test)
}

print results

# Naive Bayes
from sklearn.metrics import f1_score
F1_scores = {
 "Naive Bayes": f1_score(clf2.predict(feature_test),label_test),
 "Decision Tree": f1_score(clf1.predict(feature_test),label_test)
}
print F1_scores
# imports assumed; the original header of score.py is not shown here
import sys
import numpy as np
from sklearn.metrics import accuracy_score as accuracy
from sklearn.metrics import precision_score as precision
from sklearn.metrics import recall_score as recall
from sklearn.metrics import roc_auc_score as AUC
from sklearn.metrics import confusion_matrix

y_file = sys.argv[1]
p_file = sys.argv[2]

print "loading p..."

p = np.loadtxt( p_file )

y_predicted = np.ones(( p.shape[0] ))
y_predicted[p < 0] = -1

print "loading y..."

y = np.loadtxt( y_file, usecols= [0] )

print "accuracy:", accuracy( y, y_predicted )
print "precision:", precision( y, y_predicted, average='binary' )
print "recall:", recall( y, y_predicted, average='binary' )
print "AUC:", AUC( y, p )

print
print "confusion matrix:"
print confusion_matrix( y, y_predicted )


"""
run score.py data/test_v.txt vw/p_v_logistic.txt

accuracy: 0.994675826535

confusion matrix:
[[27444   136]