def tune_threshold(y, y_prob, eta=0.1, plev=0.5, max_iter=100, output=True):
    """
    Tunes the threshold of the decision rule to improve accuracy.
    
    Keyword Arguments:
    y - the ground truth labels
    y_prob - the model's predicted probabilities
    eta - learning rate
    plev - the level of precision we are trying to maintain
    max_iter - maximum number of threshold updates
    output - if True, print precision/recall at each step
    """
    threshold = 0.5
    yhat = decide(y_prob, threshold)
    p = precision(y, yhat)
    r = recall(y, yhat)
    initial_loss = directional_loss(y, yhat)
    if output:
        print(f"Precision = {p}, Recall = {r}, Threshold = {threshold}")

    for i in range(1, max_iter):
        threshold -= eta / i * threshold
        yhat = decide(y_prob, threshold)

        p = precision(y, yhat)
        r = recall(y, yhat)

        if output:
            print(f"Precision = {p}, Recall = {r}, Threshold = {threshold}")

        if (p <= plev):
            return threshold

    return threshold
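A minimal way to exercise tune_threshold, assuming decide() simply binarizes probabilities at the given threshold, directional_loss() is a plain misclassification rate, and precision/recall are the scikit-learn scorers (these helpers are assumptions for illustration; the originals are not shown here):

# Hypothetical wiring for tune_threshold; decide() and directional_loss() are
# stand-ins, and precision/recall are the usual scikit-learn aliases.
import numpy as np
from sklearn.metrics import precision_score as precision, recall_score as recall

def decide(y_prob, threshold):
    # label an observation positive when its predicted probability clears the threshold
    return (np.asarray(y_prob) >= threshold).astype(int)

def directional_loss(y, yhat):
    # stand-in loss: fraction of misclassified observations
    return float(np.mean(np.asarray(y) != np.asarray(yhat)))

rng = np.random.default_rng(0)
y_true = rng.integers(0, 2, size=200)
y_prob = np.clip(0.6 * y_true + rng.normal(0.2, 0.25, size=200), 0, 1)
best_t = tune_threshold(y_true, y_prob, eta=0.1, plev=0.9, output=False)
print("chosen threshold:", best_t)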
Example #2
def present_results_simp(y_test, predictions):
    results_list = []
    for k, v in predictions.items():
        inter_list = [
            k,
            accuracy(v, y_test),
            precision(v, y_test),
            precision_top(v, y_test, 0.01),
            precision_top(v, y_test, 0.02),
            precision_top(v, y_test, 0.05),
            precision_top(v, y_test, 0.1),
            precision_top(v, y_test, 0.2),
            precision_top(v, y_test, 0.3),
            precision_top(v, y_test, 0.5),
            recall(v, y_test),
            recall_top(v, y_test, 0.01),
            recall_top(v, y_test, 0.02),
            recall_top(v, y_test, 0.05),
            recall_top(v, y_test, 0.1),
            recall_top(v, y_test, 0.2),
            recall_top(v, y_test, 0.3),
            recall_top(v, y_test, 0.5),
            f1(v, y_test)
        ]
        results_list.append(inter_list)
    df = pd.DataFrame(results_list)
    df.columns = [
        'Model', 'Accuracy', 'Precision', 'Precision top 1%',
        'Precision top 2%', 'Precision top 5%', 'Precision top 10%',
        'Precision top 20%', 'Precision top 30%', 'Precision top 50%',
        'Recall', 'Recall top 1%', 'Recall top 2%', 'Recall top 5%',
        'Recall top 10%', 'Recall top 20%', 'Recall top 30%', 'Recall top 50%',
        'F1'
    ]
    return df
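The precision_top / recall_top helpers used above are not shown; a plausible reading, assuming the first argument holds predicted scores, the second the true labels, and the third the fraction of highest-scored rows to keep (all assumptions for illustration):

# Sketch of the top-k% metrics assumed above: precision/recall restricted to the
# k% of observations with the highest predicted scores.
import numpy as np

def precision_top(scores, y_true, k):
    scores, y_true = np.asarray(scores), np.asarray(y_true)
    n_top = max(1, int(np.ceil(k * len(scores))))
    top_idx = np.argsort(scores)[::-1][:n_top]        # highest-scored k% of rows
    return float(np.mean(y_true[top_idx] == 1))       # share of true positives among them

def recall_top(scores, y_true, k):
    scores, y_true = np.asarray(scores), np.asarray(y_true)
    n_top = max(1, int(np.ceil(k * len(scores))))
    top_idx = np.argsort(scores)[::-1][:n_top]
    n_pos = max(1, int((y_true == 1).sum()))          # guard against an all-negative y_true
    return float((y_true[top_idx] == 1).sum() / n_pos)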
def evaluate_dectree(x_train, y_train, x_test, y_test, thresh=thresh):
    '''
    generates df of predictions, split criteria, thresholds, precision, recall,
    and accuracy of decision tree classifiers
    '''
    criterion = ['entropy', 'gini']
    rd = {
        'predicted': [],
        'crit': [],
        'threshold': [],
        'precision': [],
        'recall': [],
        'accuracy': [],
        'class': []
    }

    for c in criterion:
        scores = dectree_classifier(x_train, y_train, x_test, c)
        for t in thresh:
            scores = list(stats.rankdata(scores, 'average') / len(scores))
            preds = [compare_to_threshold(x, t) for x in list(scores)]
            rd['predicted'].append(preds)
            rd['crit'].append(c)
            rd['threshold'].append(t)
            rd['precision'].append(precision(y_test, preds))
            rd['recall'].append(recall(y_test, preds))
            rd['accuracy'].append(accuracy(y_test, preds))
            rd['class'].append('dectree')

    return pd.DataFrame(rd)
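The rank-normalization step above (stats.rankdata divided by the sample size) turns raw classifier scores into percentile ranks in (0, 1], so a threshold of, say, 0.9 labels roughly the top 10% of observations positive. A minimal compare_to_threshold consistent with that usage (an assumption; the original helper is not shown):

# Assumed helper: scores arriving here are percentile ranks in (0, 1].
def compare_to_threshold(score, threshold):
    # positive when the observation's rank clears the threshold,
    # i.e. it falls in the top (1 - threshold) share of the ranking
    return 1 if score >= threshold else 0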
def evaluate_rf(x_train,
                y_train,
                x_test,
                y_test,
                thresh=thresh,
                ntrees=[25, 100, 500],
                maxfeats=[1, .5, 4]):
    rd = {
        'predicted': [],
        'ntrees': [],
        'nfeats': [],
        'threshold': [],
        'precision': [],
        'recall': [],
        'accuracy': [],
        'class': []
    }
    for size in ntrees:
        for f in maxfeats:
            scores = random_forest_classifier(size, f, x_train, y_train,
                                              x_test)
            for t in thresh:
                scores = list(stats.rankdata(scores, 'average') / len(scores))
                preds = [compare_to_threshold(x, t) for x in scores]
                rd['predicted'].append(preds)
                rd['ntrees'].append(size)
                rd['nfeats'].append(f)
                rd['threshold'].append(t)
                rd['precision'].append(precision(y_test, preds))
                rd['recall'].append(recall(y_test, preds))
                rd['accuracy'].append(accuracy(y_test, preds))
                rd['class'].append('rf')

    return pd.DataFrame(rd)
Example #5
def evaluate_logreg(x_train, y_train, x_test, y_test,
                    c_values=[.01,.1,1,10,100], thresh=thresh):
    '''
    generates df of predictions, penalties, c_values, thresholds, precision, recall, and
    accuracy of logistic regression
    '''
    penalties = ['l2']
    rd = {'predicted': [], 'penalty': [], 'C': [], 'threshold': [],
          'precision': [], 'recall': [], 'accuracy':[], 'class': []}
    
    for p in penalties:
        for c in c_values:
            scores = logreg_classifier(x_train, y_train, x_test, c, p)
            for t in thresh:
                scores = list(stats.rankdata(scores, 'average')/len(scores))
                preds = [compare_to_threshold(x, t) for x in scores]
                rd['predicted'].append(preds)
                rd['penalty'].append(p)
                rd['C'].append(c)
                rd['threshold'].append(t)
                rd['precision'].append(precision(y_test, preds))
                rd['recall'].append(recall(y_test, preds))
                rd['accuracy'].append(accuracy(y_test, preds))
                rd['class'].append('logreg')

    return pd.DataFrame(rd)
def evaluate_knn(x_train,
                 y_train,
                 x_test,
                 y_test,
                 kays=[3, 5, 7, 9, 11],
                 thresh=thresh):
    '''
    generates df of predictions, penalties, k values, thresholds, precision,
    recall, and accuracy to help find best model
    '''
    rd = {
        'predicted': [],
        'k': [],
        'threshold': [],
        'precision': [],
        'recall': [],
        'accuracy': [],
        'class': []
    }
    for k in kays:
        scores = knn_classifier(x_train, y_train, x_test, k)[:, 1]
        for t in thresh:
            scores = list(stats.rankdata(scores, 'average') / len(scores))
            preds = [compare_to_threshold(x, t) for x in scores]
            rd['predicted'].append(preds)
            rd['k'].append(k)
            rd['threshold'].append(t)
            rd['precision'].append(precision(y_test, preds))
            rd['recall'].append(recall(y_test, preds))
            rd['accuracy'].append(accuracy(y_test, preds))
            rd['class'].append('knn')

    return pd.DataFrame(rd)
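The [:, 1] slice above suggests knn_classifier returns a predict_proba matrix; a sketch under that assumption, using scikit-learn's KNeighborsClassifier:

# Hypothetical knn_classifier consistent with the [:, 1] slicing above.
from sklearn.neighbors import KNeighborsClassifier

def knn_classifier(x_train, y_train, x_test, k):
    model = KNeighborsClassifier(n_neighbors=k)
    model.fit(x_train, y_train)
    return model.predict_proba(x_test)    # column 1 holds P(class == 1)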
Example #7
    def perform_SER(self, input, target, device):

        LOAD_PATH = '/home/zhoukun/SER/Speech-Emotion-Recognition-main/checkpoint/best_model_esd.pth'
        model_SER = acrnn().to('cpu')
        model_SER.load_state_dict(torch.load(LOAD_PATH, map_location='cpu'))
        criterion = torch.nn.CrossEntropyLoss()
        best_valid_uw = 0

        model_SER.eval()
        size = input.shape[0]
        with torch.no_grad():
            inputs = torch.tensor(input).to('cpu')
            targets = torch.tensor(target, dtype=torch.long).to('cpu')
            outputs, emotion_embedding_low, emotion_embedding_high = model_SER(
                inputs)
            loss = criterion(outputs, targets).cpu().detach().numpy()

        cost_valid = np.sum(loss) / size
        valid_acc_uw = recall(targets.cpu().numpy(),
                              np.argmax(outputs.cpu().detach().numpy(), 1),
                              average='macro')
        valid_conf = confusion(targets.cpu().numpy(),
                               np.argmax(outputs.cpu().detach().numpy(), 1))

        if valid_acc_uw > best_valid_uw:
            best_valid_uw = valid_acc_uw
            best_valid_conf = valid_conf

        cost_valid = torch.tensor(cost_valid).to(device)
        emotion_embedding_high = emotion_embedding_high.detach().to(device)
        best_valid_uw = torch.tensor(best_valid_uw).to(device)
        return cost_valid, emotion_embedding_high, best_valid_uw
Example #8
def get_metrics(prediction, y_test):
    '''
	Computes accuracy, precision, recall, ROC-AUC and F1 metrics by
	comparing predictions produced by an ML model against actual values
	of the dependent variable.
	Inputs:
		- prediction: an array with predictions.
		- y_test: an array with actual values.
	Returns a dictionary with metrics of a ML model.
	'''
    Accuracy = accuracy(prediction, y_test)
    Precision = precision(prediction, y_test)
    Recall = recall(prediction, y_test)
    try:
        AUC = roc_auc(prediction, y_test)
    except ValueError:
        AUC = 0
    F1 = f1(prediction, y_test)

    metrics_dict = {
        'Accuracy': Accuracy,
        'Precision': Precision,
        'Recall': Recall,
        'AUC': AUC,
        'F1': F1
    }
    return metrics_dict
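get_metrics leans on aliased scikit-learn scorers; a minimal, self-contained call, assuming the aliases below (they mirror imports that appear elsewhere in this collection):

# Example wiring for get_metrics with assumed scikit-learn aliases.
from sklearn.metrics import (accuracy_score as accuracy,
                             precision_score as precision,
                             recall_score as recall,
                             roc_auc_score as roc_auc,
                             f1_score as f1)

y_true = [0, 1, 1, 0, 1, 0, 1, 1]
y_pred = [0, 1, 0, 0, 1, 1, 1, 1]
print(get_metrics(y_pred, y_true))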
def tune_model_width(build_fn, x_train, y_train, x_val, y_val, max_width=50):
    """
    Takes a 3-layer neural network and expands the width to see if there
    are tangible benefits to increasing the width of the hidden layer
    in the model.
    
    Parameters: 
    build_fn - function that returns a keras nn model with the specified parameters
    x_train - the training data matrix
    y_train - the training response vector
    x_val - the validation data matrix
    y_val - the validation response vector
    max_width - largest hidden-layer width to try (exclusive)
    """

    acc = []
    pre = []
    rec = []

    for i in range(15, max_width):
        width = i
        model = feed_forward.build_model(x_train,
                                         y_train,
                                         width=width,
                                         suppress=True)
        model.fit(x_train, y_train, epochs=100, verbose=0)

        y_val_prob = model.predict(x_val)[:, 0]
        y_val_hat = decide(y_val_prob, 0.5)

        acc.append(accuracy(y_val, y_val_hat))
        pre.append(precision(y_val, y_val_hat))
        rec.append(recall(y_val, y_val_hat))

    return acc, pre, rec
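The three lists returned by tune_model_width line up with widths 15 through max_width - 1, so picking the width that maximizes a validation metric is a one-liner; a sketch with illustrative precision values:

# Selecting a hidden-layer width from the returned metric lists (widths start at 15).
import numpy as np

# acc, pre, rec = tune_model_width(build_fn, x_train, y_train, x_val, y_val)
pre = [0.71, 0.74, 0.73, 0.78, 0.77]           # illustrative validation precision values
best_width = 15 + int(np.argmax(pre))          # widths start at 15 in the loop above
print("best width by validation precision:", best_width)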
def main():
    train_data, train_labels, test_data, test_labels = test_train_split()
    predicted_output = predict(train_data, test_data[:, :57])
    print("confusion matrix : \n", cm(test_labels, predicted_output))
    print("Recall : ", recall(test_labels, predicted_output))
    print("Accuracy:",
          accuracy_score(test_labels, predicted_output) * 100, "%")
    print("precision : ", precision_score(test_labels, predicted_output))
def print_prediction_metrics(clf, x, y, k):
    pred = cross_val_predict(clf,
                             x,
                             y,
                             cv=StratifiedKFold(n_splits=k, shuffle=True))
    print("Accuracy: ", round(accuracy(y, pred), 2))
    print("Precision on spam: ", round(precision(y, pred, average=None)[1], 3))
    print("Recall on spam: ", round(recall(y, pred, average=None)[1], 3))
    return
Example #12
 def update_metrics(gt, pre, f1_m, p_m, r_m, acc_m):
     f1_value = f1(gt, pre, average="micro")
     f1_m.update(f1_value)
     p_value = precision(gt, pre, average="micro", zero_division=0)
     p_m.update(p_value)
     r_value = recall(gt, pre, average="micro")
     r_m.update(r_value)
     acc_value = accuracy(gt, pre)
     acc_m.update(acc_value)
Example #13
 def __init__(self, model, parameters, name, threshold, x_train, y_train, x_test,
              y_test):
     self.params = parameters
     self.model = model.set_params(**parameters)
     self.scores = classify(x_train, y_train, x_test, self.model)
     self.truth = y_test
     self.predictions = predict(self.scores, threshold)
     self.accuracy = accuracy(self.truth, self.predictions)
     self.precision = precision(self.truth, self.predictions)
     self.recall = recall(self.truth, self.predictions)
     self.f1 = 2 * (self.precision * self.recall) / (self.precision + self.recall)
     self.name = None
     ClassifierAnalyzer.identifier += 1
Example #14
def evaluate_bagging(x_train, y_train, x_test, y_test, thresh=thresh):
    rd = {'predicted': [], 'threshold': [], 'precision': [], 'recall': [],
          'accuracy':[], 'class': []}
    scores = bagging_classifier(x_train, y_train, x_test)
    for t in thresh:
        scores = list(stats.rankdata(scores, 'average')/len(scores))
        preds = [compare_to_threshold(x, t) for x in list(scores)]
        rd['predicted'].append(preds)
        rd['threshold'].append(t)
        rd['precision'].append(precision(y_test, preds))
        rd['recall'].append(recall(y_test, preds))
        rd['accuracy'].append(accuracy(y_test, preds))
        rd['class'].append('bagging')
        
    return pd.DataFrame(rd)
Example #15
    def recall(self, predictions):
        """
        Calculate recall score given the predictions.

        Parameters
        ----------
        predictions : ndarray
            Model's predictions.

        Returns
        -------
        float
            Recall score (using the `macro` averaging method).

        """
        return recall(self.target, predictions, average="macro")
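average="macro" here means the unweighted mean of per-class recalls, which is what the speech-emotion examples later in this collection report as unweighted accuracy (UA). A quick sanity check of that reading against scikit-learn:

# Macro recall = unweighted mean of per-class recalls.
import numpy as np
from sklearn.metrics import recall_score

y_true = np.array([0, 0, 1, 1, 2, 2, 2])
y_pred = np.array([0, 1, 1, 1, 2, 0, 2])
per_class = [np.mean(y_pred[y_true == c] == c) for c in np.unique(y_true)]
assert np.isclose(np.mean(per_class), recall_score(y_true, y_pred, average="macro"))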
Example #16
def calculate_recall(predicted_scores, y_test, threshold):
	'''
	Calculate the recall of the trained model.

	Inputs:
		predicted_scores (numpy array) - probabilities that data points belong
			to class 1
		y_test (pandas dataframe) - label testing data
		threshold (float) - if predicted score is above this threshold,
			consider it to be class 1
	Outputs:
		test_recall (float)
	'''
	predictions = get_predictions_with_threshold(predicted_scores, threshold)
	test_recall = recall(y_test, predictions)

	return test_recall
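The get_predictions_with_threshold helper it relies on is not shown; a plausible version, assuming predicted_scores is a 1-D array of class-1 probabilities:

# Assumed helper: label class 1 when the predicted score reaches the threshold.
import numpy as np

def get_predictions_with_threshold(predicted_scores, threshold):
    return (np.asarray(predicted_scores) >= threshold).astype(int)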
Example #17
 def __init__(self, model, parameters, name, threshold, x_train, y_train,
              x_test, y_test):
     self.params = parameters
     self.t = threshold
     self.model = model.set_params(**parameters)
     self.scores = classify(x_train, y_train, x_test, self.model)
     self.truth = y_test
     self.predictions = predict(self.scores, self.t)
     self.predicted_for_pct = sum(self.predictions) / len(self.predictions)
     self.accuracy = accuracy(self.truth, self.predictions)
     self.precision = precision(self.truth, self.predictions)
     self.recall = recall(self.truth, self.predictions)
     self.f1 = (2 * self.precision * self.recall) / (self.precision +
                                                     self.recall)
     self.name = ClassifierAnalyzer.identifier_counter
     self.roc_auc = None
     ClassifierAnalyzer.identifier_counter += 1
Example #18
    def recall(self, utt_preds):
        """
        Calculate recall score given the predictions.

        Parameters
        ----------
        utt_preds : ndarray
            Processed predictions.

        Returns
        -------
        float
            Recall score (using the `macro` averaging method).

        """
        ur = recall(self.actual_target, utt_preds, average="macro")
        return ur
Example #19
def simp_run_through(evals, facs, features, year_col, start, split, end,
                     classes, parameters, thresholds):
    rv = {
        'class': [],
        'DT': [],
        'threshold': [],
        'precision': [],
        'recall': [],
        'preds': [],
        'top_features': []
    }
    df = make_df(evals, facs)
    train, test = simp_windows(df, year_col, start, split, end, features)
    trx, tr_y, tex, te_y = simp_x_y_split(train, test)
    train_dates = trx['EVALUATION_START_DATE']
    test_dates = tex['EVALUATION_START_DATE']
    trx.drop(['EVALUATION_START_DATE', 'FACILITY_NAME', 'MostRecentEval'],
             inplace=True,
             axis=1)
    tex.drop(['EVALUATION_START_DATE', 'FACILITY_NAME', 'MostRecentEval'],
             inplace=True,
             axis=1)
    for c in classes:
        for p in parameters[c]:
            if c == 'DT':
                scores, imps = dectree_classifier(trx, tr_y, tex, p)
            for t in thresholds:
                preds = [compare_to_threshold(x, t) for x in list(scores)]
                rv[c].append(p)
                rv['class'].append(c)
                rv['threshold'].append(t)
                rv['precision'].append(precision(te_y, preds))
                rv['recall'].append(recall(te_y, preds))
                rv['preds'].append(preds)
                rv['top_features'].append(
                    list(zip(list(tex.columns), list(imps))))

    final = pd.DataFrame(rv)
    final.to_csv('results.csv')

    print(final)

    return final

    def make_prediction_matrix(self):
        rv_dic = {}
        predictions_df = pd.DataFrame()
        for thresh in self.t:
            x = round((1 - thresh), 2)
            preds = 'predictions_{}pct'.format(x)
            a = 'precision_{}pct'.format(x)
            b = 'recall_{}pct'.format(x)
            c = 'f1_{}pct'.format(x)
            predictions = predict(self.scores, thresh)
            predictions = [int(x) for x in predictions]
            d = '{}_at_{}pct'.format(self.name, x)
            predictions_df[d] = predictions
            prec = precision(self.truth, predictions)
            rec = recall(self.truth, predictions)
            rv_dic[a] = [prec]
            rv_dic[b] = [rec]
            rv_dic[c] = [(prec * rec * 2) / (prec + rec)]
            rv_dic['model'] = [self.name]

        return pd.DataFrame(rv_dic), predictions_df
def crossValidate(X, y, nfold):
    kf = KFold(n_splits=nfold, shuffle=True)
    kf.get_n_splits(X)

    sorted_indices = np.loadtxt('final_sorted_indices.txt', dtype=int)

    r = 16

    print("K-fold: K=", nfold)
    f1 = 0
    acc = 0
    prec = 0
    rec = 0
    spec = 0
    for train_index, test_index in kf.split(X):
        # print("TRAIN:", train_index, "TEST:", test_index)

        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        selected_feature_indices = sorted_indices[:r]
        X_train_selected_features = X_train[:, selected_feature_indices]
        X_test_selected_features = X_test[:, selected_feature_indices]

        clf = GaussianNB()
        y_pred = clf.fit(X_train_selected_features,
                         y_train).predict(X_test_selected_features)

        f1 += fscore(y_test, y_pred)
        acc += accuracy(y_test, y_pred)
        prec += precision(y_test, y_pred)
        rec += recall(y_test, y_pred)
        spec += specificity(y_test, y_pred)

    print('fscore', f1 / nfold)
    print('accuracy', acc / nfold)
    print('precision', prec / nfold)
    print('recall', rec / nfold)
    print('specificity', spec / nfold)
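specificity (the true-negative rate) is not a built-in scikit-learn scorer; one way the helper used above could be implemented, assuming binary {0, 1} labels:

# Specificity = TN / (TN + FP), derived from the confusion matrix (assumed helper).
from sklearn.metrics import confusion_matrix

def specificity(y_true, y_pred):
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    return tn / (tn + fp) if (tn + fp) else 0.0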
Example #22
def present_results(y_test, predictions):
    results_list = []
    for k, v in predictions.items():
        inter_list = [
            k,
            accuracy(v, y_test),
            precision(v, y_test),
            precision_top(v, y_test, 0.01),
            precision_top(v, y_test, 0.02),
            precision_top(v, y_test, 0.05),
            precision_top(v, y_test, 0.1),
            precision_top(v, y_test, 0.2),
            precision_top(v, y_test, 0.3),
            precision_top(v, y_test, 0.5),
            recall(v, y_test),
            recall_top(v, y_test, 0.01),
            recall_top(v, y_test, 0.02),
            recall_top(v, y_test, 0.05),
            recall_top(v, y_test, 0.1),
            recall_top(v, y_test, 0.2),
            recall_top(v, y_test, 0.3),
            recall_top(v, y_test, 0.5),
            f1(v, y_test)
        ]
        if k[:6] != 'd_tree':
            inter_list.append(roc_auc(v, y_test))
        else:
            inter_list.append('ND')
        results_list.append(inter_list)
    df = pd.DataFrame(results_list)
    df.columns = [
        'Model', 'Accuracy', 'Precision', 'Precision top 1%',
        'Precision top 2%', 'Precision top 5%', 'Precision top 10%',
        'Precision top 20%', 'Precision top 30%', 'Precision top 50%',
        'Recall', 'Recall top 1%', 'Recall top 2%', 'Recall top 5%',
        'Recall top 10%', 'Recall top 20%', 'Recall top 30%', 'Recall top 50%',
        'F1', 'ROC AUC'
    ]
    return df
Example #23
def test_classifiers(X, y, n=7, rname="results.txt"):
    clfs = {
        #"Bagging KNN": [BaggingClassifier(KNeighborsClassifier(), max_samples=0.5, max_features=0.5), [], [], [], []],
        "NN (kNN k=1)": [KNeighborsClassifier(n_neighbors=1), [], [], [], [], []],
        #"NN (kNN k=3)": [KNeighborsClassifier(n_neighbors=3), [], [], [], [], []],
        "NN (kNN k=3 w)": [KNeighborsClassifier(n_neighbors=3, weights='distance'), [], [], [], [], []],
        "NN (kNN k=5 w)": [KNeighborsClassifier(n_neighbors=5, weights='distance'), [], [], [], [], []],
        #"NN (kNN k=7 w)": [KNeighborsClassifier(n_neighbors=7, weights='distance'), [], [], [], []],
        #"SVM - Linear kernel": [svm.SVC(kernel="rbf", probability=True), [], [], [], []],
        #"Naive Bayes": [GaussianNB(), [], [], [], []],
        #"SVM Sigmoide": [svm.SVC(kernel="sigmoid"), [], [], [], []],
        #"ANN": [MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1), [], [], [], []],
    }
    #V = ["Voting KNN", [None, [], [], [], []]]
    skf = kfold(y, n_iter=n, random_state=None, train_size=0.7)
    output = open(rname, "w")
    for train, test in skf:
        Xt, Yt = X[train], y[train]
        Xv, Yv = X[test], y[test]
        votes = []
        for (k, v) in clfs.items():
            v[0].fit(Xt, Yt)
            #print(clfs[k])
            Yr = v[0].predict(Xv)
            #print(accs(Yv, Yr))
            v[1].append(accs(Yv, Yr))
            v[2].append(f1(Yv, Yr, average="macro"))
            v[3].append(recall(Yv, Yr, average="macro"))
            v[4].append(precision(Yv, Yr))
            v[5].append(kappa(Yv, Yr))
            #votes.append(Yr)
        #Yp = predict(votes)
    for k, v in clfs.items():
        fm = "%s | %s | %s | %s | %s\n"
        output.write(fm % (k, "Accuracy", np.mean(v[1]), min(v[1]), max(v[1])))
        #output.write(fm % (k, "Kappa", np.mean(v[5]), min(v[5]), max(v[5])))
        output.write(fm % (k, "F1", np.mean(v[2]), min(v[2]), max(v[2])))
        output.write(fm % (k, "Recall", np.mean(v[3]), min(v[3]), max(v[3])))
        output.write(fm % (k, "Precision", np.mean(v[4]), min(v[4]), max(v[4])))
    output.close()
Example #24
def compute_eval_stats(classifier, y_data, rankings, threshold):
    ''' Takes: classifier object, true target data, predicted score rankings,
                ranking threshold cutoff
        Returns: accuracy, precision, recall, F1 and ROC-AUC of the threshold-based
                predictions against the true targets
    '''

    predicted_test = np.where(rankings < threshold, 1, 0)

    # print(threshold)
    # print(predicted_test.sum())
    # print(predicted_test[0:10])
    # print("num unique ranks: ", pd.DataFrame(pred_scores)[0].unique().shape)
    # print("eval stats rankings are: ", rankings[0:10])

    stats = [
        accuracy(y_data, predicted_test),
        precision(y_data, predicted_test),
        recall(y_data, predicted_test),
        f1(y_data, predicted_test),
        roc(y_data, predicted_test)
    ]

    return stats
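The comparison rankings < threshold implies that smaller ranks mark higher-scored observations; an illustrative call under that assumption (the classifier argument is unused by the function, so None is passed, and the metric aliases below are assumed):

# Illustrative call with percentile ranks where a smaller rank means a higher score.
import numpy as np
from scipy import stats
from sklearn.metrics import (accuracy_score as accuracy, precision_score as precision,
                             recall_score as recall, f1_score as f1,
                             roc_auc_score as roc)

scores = np.array([0.9, 0.2, 0.75, 0.4, 0.1])
rankings = stats.rankdata(-scores) / len(scores)   # 0.2 = best-scored 20% of rows
y_data = np.array([1, 0, 1, 0, 0])
print(compute_eval_stats(None, y_data, rankings, threshold=0.5))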
Example #25
def get_trigger_identification_f1(gold_Y, pred_Y):
    """
    Compute P/R/F1 scores for trigger identification

    :param gold_Y: gold output
    :param pred_Y: predicted output
    :return: precision, recall and F1 for trigger identification, each scaled by 100
    """
    gold_ti = []
    pred_ti = []

    for i in range(len(gold_Y)):
        if gold_Y[i] != 0:
            gold_ti.append(1)
        else:
            gold_ti.append(0)
        if pred_Y[i] != 0:
            pred_ti.append(1)
        else:
            pred_ti.append(0)

    return 100*precision(gold_ti, pred_ti), \
           100*recall(gold_ti, pred_ti), \
           100*f1(gold_ti, pred_ti)
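A quick call, treating label 0 as "no trigger" and any non-zero label as a trigger, with the usual scikit-learn aliases assumed:

# Usage sketch for get_trigger_identification_f1 (aliases assumed).
from sklearn.metrics import (precision_score as precision,
                             recall_score as recall, f1_score as f1)

gold_Y = [0, 2, 1, 0, 3, 0]
pred_Y = [0, 2, 0, 1, 3, 0]
p, r, f = get_trigger_identification_f1(gold_Y, pred_Y)
print("trigger identification  P=%.1f  R=%.1f  F1=%.1f" % (p, r, f))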
def evaluate():
    with tf.Graph().as_default() as g:
        model = crnn.CRNN('test')
        model._build_model()

        #load training data
        test_data, test_label, valid_data, valid_label, Valid_label, Test_label, pernums_test, pernums_valid = load_data(
        )
        # test, valid segment size
        test_size = test_data.shape[0]
        valid_size = valid_data.shape[0]
        # for whole-sentence labels
        test_label = dense_to_one_hot(test_label, 4)
        valid_label = dense_to_one_hot(valid_label, 4)
        # for segment labels
        Test_label = dense_to_one_hot(Test_label, 4)
        Valid_label = dense_to_one_hot(Valid_label, 4)

        # number of whole sentences (pernums_* holds the segment count per sentence)
        tnum = pernums_test.shape[0]
        vnum = pernums_valid.shape[0]

        pred_test_uw = np.empty((tnum, 4), dtype=np.float32)
        pred_test_w = np.empty((tnum, 4), dtype=np.float32)

        valid_iter = divmod((valid_size), FLAGS.valid_batch_size)[0]
        test_iter = divmod((test_size), FLAGS.test_batch_size)[0]
        y_pred_valid = np.empty((valid_size, 4), dtype=np.float32)
        y_pred_test = np.empty((test_size, 4), dtype=np.float32)

        y_test = np.empty((tnum, 4), dtype=np.float32)
        y_valid = np.empty((vnum, 4), dtype=np.float32)

        cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
            labels=model.labels, logits=model.logits)
        variable_averages = tf.train.ExponentialMovingAverage(FLAGS.momentum)
        variable_to_restore = variable_averages.variables_to_restore()
        saver = tf.train.Saver(variable_to_restore)
        #saver = tf.train.Saver()

        flag = False
        best_valid_uw = 0
        best_valid_w = 0
        for i in range(5):
            with tf.Session() as sess:
                ckpt = tf.train.get_checkpoint_state(FLAGS.checkpoint)
                if ckpt and ckpt.model_checkpoint_path:
                    saver.restore(sess, ckpt.model_checkpoint_path)
                    global_step = ckpt.model_checkpoint_path.split(
                        '/')[-1].split('-')[-1]

                    #for validation data
                    index = 0
                    cost_valid = 0
                    if (valid_size < FLAGS.valid_batch_size):
                        validate_feed = {
                            model.inputs: valid_data,
                            model.labels: Valid_label
                        }
                        y_pred_valid, loss = sess.run(
                            [model.softmax, cross_entropy],
                            feed_dict=validate_feed)
                        cost_valid = cost_valid + np.sum(loss)
                    else:
                        for v in range(valid_iter):
                            v_begin = v * FLAGS.valid_batch_size
                            v_end = (v + 1) * FLAGS.valid_batch_size
                            if (v == valid_iter - 1):
                                if (v_end < valid_size):
                                    v_end = valid_size
                            validate_feed = {
                                model.inputs: valid_data[v_begin:v_end],
                                model.labels: Valid_label[v_begin:v_end]
                            }
                            loss, y_pred_valid[v_begin:v_end, :] = sess.run(
                                [cross_entropy, model.softmax],
                                feed_dict=validate_feed)
                            cost_valid = cost_valid + np.sum(loss)
                    cost_valid = cost_valid / valid_size

                    print(y_pred_valid)
                    valid_acc_uw = recall(np.argmax(Valid_label, 1),
                                          np.argmax(y_pred_valid, 1),
                                          average='macro')
                    valid_acc_w = recall(np.argmax(Valid_label, 1),
                                         np.argmax(y_pred_valid, 1),
                                         average='weighted')
                    valid_conf = confusion(np.argmax(Valid_label, 1),
                                           np.argmax(y_pred_valid, 1))

                    print('----------segment metrics---------------')
                    print("Best valid_UA: %3.4g" % best_valid_uw)
                    print("Best valid_WA: %3.4g" % best_valid_w)
                    print('Valid Confusion Matrix:["ang","sad","hap","neu"]')
                    print(valid_conf)
                    print('----------segment metrics---------------')

                    for s in range(vnum):
                        y_valid[s, :] = np.max(
                            y_pred_valid[index:index + pernums_valid[s], :], 0)
                        index += pernums_valid[s]
                    valid_acc_uw = recall(np.argmax(valid_label, 1),
                                          np.argmax(y_valid, 1),
                                          average='macro')
                    valid_acc_w = recall(np.argmax(valid_label, 1),
                                         np.argmax(y_valid, 1),
                                         average='weighted')
                    valid_conf = confusion(np.argmax(valid_label, 1),
                                           np.argmax(y_valid, 1))

                    #for test set
                    index = 0
                    for t in range(test_iter):
                        t_begin = t * FLAGS.test_batch_size
                        t_end = (t + 1) * FLAGS.test_batch_size
                        if (t == test_iter - 1):
                            if (t_end < test_size):
                                t_end = test_size
                        #print t_begin,t_end,t,test_iter
                        test_feed = {
                            model.inputs: test_data[t_begin:t_end],
                            model.labels: Test_label[t_begin:t_end]
                        }
                        y_pred_test[t_begin:t_end, :] = sess.run(
                            model.logits, feed_dict=test_feed)

                    for s in range(tnum):
                        y_test[s, :] = np.max(
                            y_pred_test[index:index + pernums_test[s], :], 0)
                        index = index + pernums_test[s]

                    if valid_acc_uw > best_valid_uw:
                        best_valid_uw = valid_acc_uw
                        pred_test_uw = y_test
                        test_acc_uw = recall(np.argmax(test_label, 1),
                                             np.argmax(y_test, 1),
                                             average='macro')
                        test_conf = confusion(np.argmax(test_label, 1),
                                              np.argmax(y_test, 1))
                        confusion_uw = test_conf
                        flag = True

                    if valid_acc_w > best_valid_w:
                        best_valid_w = valid_acc_w
                        pred_test_w = y_test
                        test_acc_w = recall(np.argmax(test_label, 1),
                                            np.argmax(y_test, 1),
                                            average='weighted')
                        test_conf = confusion(np.argmax(test_label, 1),
                                              np.argmax(y_test, 1))
                        confusion_w = test_conf
                        flag = True
                    print(
                        "*****************************************************************"
                    )
                    print(global_step)
                    print("Epoch: %s" % global_step)
                    print("Valid cost: %2.3g" % cost_valid)
                    print("Valid_UA: %3.4g" % valid_acc_uw)
                    print("Valid_WA: %3.4g" % valid_acc_w)
                    print("Best valid_UA: %3.4g" % best_valid_uw)
                    print("Best valid_WA: %3.4g" % best_valid_w)
                    print('Valid Confusion Matrix:["ang","sad","hap","neu"]')
                    print(valid_conf)
                    print("Test_UA: %3.4g" % test_acc_uw)
                    print("Test_WA: %3.4g" % test_acc_w)
                    print('Test Confusion Matrix:["ang","sad","hap","neu"]')
                    print(confusion_uw)
                    print(
                        "*****************************************************************"
                    )
                    if (flag):
                        f = open(FLAGS.pred_name, 'wb')
                        pickle.dump((
                            best_valid_uw,
                            best_valid_w,
                            pred_test_w,
                            test_acc_w,
                            confusion_w,
                            pred_test_uw,
                            test_acc_uw,
                            confusion_uw,
                        ), f)
                        f.close()
                        flag = False
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import recall_score as recall
from sklearn.metrics import precision_score as precision
from sklearn.naive_bayes import GaussianNB


# TODO: split the data into training and testing sets,
# using the standard settings for train_test_split.
# Then, train and test the classifiers with your newly split data instead of X and y.
from sklearn import cross_validation
features = X
labels = y
features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(features, labels, test_size=0.5, random_state=0)

clf1 = DecisionTreeClassifier()
clf1.fit(features_train, labels_train)
decision_tree_recall = recall(labels_test, clf1.predict(features_test))
decision_tree_precision = precision(labels_test, clf1.predict(features_test))
print "Decision Tree recall: {:.2f} and precision: {:.2f}".format(decision_tree_recall, decision_tree_precision)

clf2 = GaussianNB()
clf2.fit(features_train, labels_train)
nb_recall = recall(labels_test, clf2.predict(features_test))
nb_precision = precision(labels_test, clf2.predict(features_test))
print "GaussianNB recall: {:.2f} and precision: {:.2f}".format(nb_recall, nb_precision)
# clf2.fit(X, y)
# print "GaussianNB recall: {:.2f} and precision: {:.2f}".format(recall(y,clf2.predict(X)),precision(y,clf2.predict(X)))

results = {
  "Naive Bayes Recall": nb_recall,
  "Naive Bayes Precision": nb_precision,
  "Decision Tree Recall": decision_tree_recall,
  "Decision Tree Precision": decision_tree_precision
}
Example #28
def train():
    #####load data##########
    
    train_data,train_label,test_data,test_label,valid_data,valid_label,Valid_label,Test_label,pernums_test,pernums_valid = load_data(FLAGS.traindata_path)
    train_label = dense_to_one_hot(train_label,FLAGS.num_classes)
    valid_label = dense_to_one_hot(valid_label,FLAGS.num_classes)
    Valid_label = dense_to_one_hot(Valid_label,FLAGS.num_classes)
    valid_size = valid_data.shape[0]
    dataset_size = train_data.shape[0]
    vnum = pernums_valid.shape[0]
    best_valid_uw = 0
    
    
    ##########train model###########
    X = tf.placeholder(tf.float32, shape=[None, FLAGS.image_height,FLAGS.image_width,FLAGS.image_channel])
    Y = tf.placeholder(tf.int32, shape=[None, FLAGS.num_classes])
    is_training = tf.placeholder(tf.bool)
    lr = tf.placeholder(tf.float32)
    keep_prob = tf.placeholder(tf.float32)
    Ylogits = acrnn(X, is_training=is_training, dropout_keep_prob=keep_prob)
    tf.summary.histogram("predict y ", Ylogits)
    cross_entropy = tf.nn.softmax_cross_entropy_with_logits(labels =  Y, logits =  Ylogits)
    cost = tf.reduce_mean(cross_entropy)
    tf.summary.scalar("loss_function", cost)
    var_trainable_op = tf.trainable_variables()
    if FLAGS.is_adam:
        # not apply gradient clipping
        train_op = tf.train.AdamOptimizer(lr).minimize(cost)            
    else:
        # apply gradient clipping
        grads, _ = tf.clip_by_global_norm(tf.gradients(cost, var_trainable_op), 5)
        opti = tf.train.AdamOptimizer(lr)
        train_op = opti.apply_gradients(zip(grads, var_trainable_op))
    correct_pred = tf.equal(tf.argmax(Ylogits, 1), tf.argmax(Y,1))
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
    saver=tf.train.Saver(tf.global_variables())
    init = tf.global_variables_initializer()
    with tf.Session() as sess:
        sess.run(init)
        merged_summary_op = tf.summary.merge_all()
        summary_writer = tf.summary.FileWriter(FLAGS.logdir, sess.graph)
        if FLAGS.restore:
            # Restore saved model if the user requested it, default = True
            try:
                checkpoint_state = tf.train.get_checkpoint_state(FLAGS.checkpoint)
                if (checkpoint_state and checkpoint_state.model_checkpoint_path):
                    # log('Loading checkpoint {}'.format(checkpoint_state.model_checkpoint_path), slack=True)
                    print('Loading checkpoint %s' % checkpoint_state.model_checkpoint_path)
                    saver.restore(sess, checkpoint_state.model_checkpoint_path)
                else:
                    print('No model to load at %s' % FLAGS.checkpoint)
                    #log('No model to load at {}'.format(FLAGS.checkpoint), slack=True)
                    #saver.save(sess, FLAGS.model_name)
            except tf.errors.OutOfRangeError as e:
                print('Cannot restore checkpoint: %s' % e)
                #log('Cannot restore checkpoint: {}'.format(e), slack=True)
        else:
            print('Starting new training!')
                #log('Starting new training!', slack=True)
            saver.save(sess, FLAGS.model_name)

        for i in range(FLAGS.num_epoch):
            #learning_rate = FLAGS.learning_rate            
            start = (i * FLAGS.batch_size) % dataset_size
            end = min(start+FLAGS.batch_size, dataset_size)
            [_,tcost,tracc] = sess.run([train_op,cost,accuracy], feed_dict={X:train_data[start:end,:,:,:], Y:train_label[start:end,:],
                                            is_training:True, keep_prob:FLAGS.dropout_keep_prob, lr:FLAGS.learning_rate})
            tf.summary.scalar("train_loss", tcost)
            tf.summary.scalar("train_acc", tracc)
            if i % 5 == 0:
                #for valid data
                valid_iter = divmod((valid_size),FLAGS.batch_size)[0]
                y_pred_valid = np.empty((valid_size,FLAGS.num_classes),dtype=np.float32)
                y_valid = np.empty((vnum,4),dtype=np.float32)
                index = 0
                cost_valid = 0
                if(valid_size < FLAGS.batch_size):
                    loss, y_pred_valid = sess.run([cross_entropy,Ylogits],feed_dict = {X:valid_data, Y:Valid_label,is_training:False, keep_prob:1})
                    cost_valid = cost_valid + np.sum(loss)
                for v in range(valid_iter):
                    v_begin = v*FLAGS.batch_size
                    v_end = (v+1)*FLAGS.batch_size
                    if(v == valid_iter-1):
                        if(v_end < valid_size):
                            v_end = valid_size
                    loss, y_pred_valid[v_begin:v_end,:] = sess.run([cross_entropy,Ylogits],feed_dict = {X:valid_data[v_begin:v_end],Y:Valid_label[v_begin:v_end],is_training:False, keep_prob:1})
                    cost_valid = cost_valid + np.sum(loss)
                cost_valid = cost_valid/valid_size
                tf.summary.scalar("valid cost", cost_valid)
                for s in range(vnum):
                    y_valid[s,:] = np.max(y_pred_valid[index:index+pernums_valid[s],:],0)
                    index = index + pernums_valid[s]
    
                valid_acc_uw = recall(np.argmax(valid_label,1),np.argmax(y_valid,1),average='macro')
                tf.summary.scalar("valid acc", valid_acc_uw)
                valid_conf = confusion(np.argmax(valid_label, 1),np.argmax(y_valid,1))
                if valid_acc_uw > best_valid_uw:
                    best_valid_uw = valid_acc_uw
                    best_valid_conf = valid_conf
                    saver.save(sess, os.path.join(FLAGS.checkpoint, FLAGS.model_name), global_step = i+1)
                print ("*****************************************************************")
                print ("Epoch: %05d" %(i+1))
                print ("Training cost: %2.3g" %tcost)   
                print ("Training accuracy: %3.4g" %tracc) 
                print ("Valid cost: %2.3g" %cost_valid)
                print ("Valid_UA: %3.4g" %valid_acc_uw)    
                print ("Best valid_UA: %3.4g" %best_valid_uw) 
                print ('Valid Confusion Matrix:["ang","sad","hap","neu"]')
                print (valid_conf)
                print ('Best Valid Confusion Matrix:["ang","sad","hap","neu"]')
                print (best_valid_conf)
                print ("*****************************************************************" )
            summary_str=sess.run(merged_summary_op, feed_dict = {X:train_data[start:end,:,:,:], Y:train_label[start:end,:],
                                            is_training:True, keep_prob:FLAGS.dropout_keep_prob, lr:FLAGS.learning_rate})
            summary_writer.add_summary(summary_str, i)
bi_y = list(map(binary_y, y))
print(Counter(bi_y))

precisions = []
lams = []
recalls = []
f1s = []
for i, lam in enumerate(lam_list):
    S = np.load(folder + "\\" + "lam" + lam + "\\" + r"l21S.npk",
                allow_pickle=True)
    predictions = list(map(binary_error, np.linalg.norm(S, axis=1)))
    print("lambda:", lam)
    print("precision",
          precision(bi_y, predictions, labels=["o", "m"], pos_label="o"))
    print("recall", recall(bi_y, predictions, labels=["o", "m"],
                           pos_label="o"))
    print("f1", f1_score(bi_y, predictions, labels=["o", "m"], pos_label="o"))
    lams.append(lam)
    precisions.append(
        precision(bi_y, predictions, labels=["o", "m"], pos_label="o"))
    recalls.append(recall(bi_y, predictions, labels=["o", "m"], pos_label="o"))
    f1s.append(f1_score(bi_y, predictions, labels=["o", "m"], pos_label="o"))
    print(CM(bi_y, predictions))
    print("------------")
print(len(lams), len(recalls), len(f1s), len(precisions))

d = {
    "lambda": list(map(float, lams)),
    "precision": precisions,
    "recall": recalls,
    "f1": f1s
X = X._get_numeric_data()
y = X['Survived']
del X['Age'], X['Survived']

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score as recall
from sklearn.metrics import precision_score as precision
from sklearn.naive_bayes import GaussianNB
from sklearn import cross_validation
# using the standard settings for train_test_split.
# Then, train and test the classifiers with your newly split data instead of X and y.
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.25)


clf1 = DecisionTreeClassifier()
clf1.fit(X_train, y_train)
y_test_pred = clf1.predict(X_test)
precision_cf1 = precision(y_test, y_test_pred)
recall_cf1 = recall(y_test, y_test_pred)
score_1 = f1_score(y_test, y_test_pred)
print "Decision Tree recall: {:.2f} and precision: {:.2f}".format(recall_cf1, precision_cf1)
print "Decision Tree F1 score: {:.2f}".format(score_1)

clf2 = GaussianNB()
clf2.fit(X_train, y_train)
y_test_pred = clf2.predict(X_test)
precision_cf2 = precision(y_test, y_test_pred)
recall_cf2 = recall(y_test, y_test_pred)
score_2 = f1_score(y_test, y_test_pred)
print "GaussianNB recall: {:.2f} and precision: {:.2f}".format(recall_cf2, precision_cf2)
print "GaussianNB F1 score: {:.2f}".format(score_2)
def train():
    #####load data##########

    train_data, train_label, test_data, test_label, valid_data, valid_label, Valid_label, Test_label, pernums_test, pernums_valid = load_data(
        FLAGS.traindata_path)
    train_label = dense_to_one_hot(train_label, FLAGS.num_classes)
    valid_label = dense_to_one_hot(valid_label, FLAGS.num_classes)
    Valid_label = dense_to_one_hot(Valid_label, FLAGS.num_classes)
    valid_size = valid_data.shape[0]
    dataset_size = train_data.shape[0]
    vnum = pernums_valid.shape[0]
    best_valid_uw = 0

    ##########train model###########
    X = tf.placeholder(tf.float32,
                       shape=[
                           None, FLAGS.image_height, FLAGS.image_width,
                           FLAGS.image_channel
                       ])
    Y = tf.placeholder(tf.int32, shape=[None, FLAGS.num_classes])
    is_training = tf.placeholder(tf.bool)
    lr = tf.placeholder(tf.float32)
    keep_prob = tf.placeholder(tf.float32)
    Ylogits = acrnn(X, is_training=is_training, dropout_keep_prob=keep_prob)
    cross_entropy = tf.nn.softmax_cross_entropy_with_logits(labels=Y,
                                                            logits=Ylogits)
    cost = tf.reduce_mean(cross_entropy)
    var_trainable_op = tf.trainable_variables()
    if FLAGS.is_adam:
        # not apply gradient clipping
        train_op = tf.train.AdamOptimizer(lr).minimize(cost)
    else:
        # apply gradient clipping
        grads, _ = tf.clip_by_global_norm(tf.gradients(cost, var_trainable_op),
                                          5)
        opti = tf.train.AdamOptimizer(lr)
        train_op = opti.apply_gradients(zip(grads, var_trainable_op))
    correct_pred = tf.equal(tf.argmax(Ylogits, 1), tf.argmax(Y, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
    saver = tf.train.Saver(tf.global_variables())
    init = tf.global_variables_initializer()
    with tf.Session() as sess:
        sess.run(init)
        for i in range(FLAGS.num_epoch):
            #learning_rate = FLAGS.learning_rate
            start = (i * FLAGS.batch_size) % dataset_size
            end = min(start + FLAGS.batch_size, dataset_size)
            [_, tcost, tracc] = sess.run(
                [train_op, cost, accuracy],
                feed_dict={
                    X: train_data[start:end, :, :, :],
                    Y: train_label[start:end, :],
                    is_training: True,
                    keep_prob: FLAGS.dropout_keep_prob,
                    lr: FLAGS.learning_rate
                })
            if i % 5 == 0:
                # for valid data
                valid_iter = divmod((valid_size), FLAGS.batch_size)[0]
                y_pred_valid = np.empty((valid_size, FLAGS.num_classes),
                                        dtype=np.float32)
                y_valid = np.empty((vnum, 4), dtype=np.float32)
                index = 0
                cost_valid = 0
                if (valid_size < FLAGS.batch_size):
                    loss, y_pred_valid = sess.run(
                        [cross_entropy, Ylogits],
                        feed_dict={
                            X: valid_data,
                            Y: Valid_label,
                            is_training: False,
                            keep_prob: 1
                        })
                    cost_valid = cost_valid + np.sum(loss)
                for v in range(valid_iter):
                    v_begin = v * FLAGS.batch_size
                    v_end = (v + 1) * FLAGS.batch_size
                    if (v == valid_iter - 1):
                        if (v_end < valid_size):
                            v_end = valid_size
                    loss, y_pred_valid[v_begin:v_end, :] = sess.run(
                        [cross_entropy, Ylogits],
                        feed_dict={
                            X: valid_data[v_begin:v_end],
                            Y: Valid_label[v_begin:v_end],
                            is_training: False,
                            keep_prob: 1
                        })
                    cost_valid = cost_valid + np.sum(loss)
                cost_valid = cost_valid / valid_size
                for s in range(vnum):
                    y_valid[s, :] = np.max(
                        y_pred_valid[index:index + pernums_valid[s], :], 0)
                    index = index + pernums_valid[s]

                valid_acc_uw = recall(np.argmax(valid_label, 1),
                                      np.argmax(y_valid, 1),
                                      average='macro')
                valid_conf = confusion(np.argmax(valid_label, 1),
                                       np.argmax(y_valid, 1))
                if valid_acc_uw > best_valid_uw:
                    best_valid_uw = valid_acc_uw
                    best_valid_conf = valid_conf
                    saver.save(sess,
                               os.path.join(FLAGS.checkpoint,
                                            FLAGS.model_name),
                               global_step=i + 1)
                print(
                    "*****************************************************************"
                )
                print("Epoch: %05d" % (i + 1))
                print("Training cost: %2.3g" % tcost)
                print("Training accuracy: %3.4g" % tracc)
                print("Valid cost: %2.3g" % cost_valid)
                print("Valid_UA: %3.4g" % valid_acc_uw)
                print("Best valid_UA: %3.4g" % best_valid_uw)
                print('Valid Confusion Matrix:["ang","sad","hap","neu"]')
                print(valid_conf)
                print('Best Valid Confusion Matrix:["ang","sad","hap","neu"]')
                print(best_valid_conf)
                print(
                    "*****************************************************************"
                )
Example #32

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import recall_score as recall
from sklearn.metrics import precision_score as precision
from sklearn.naive_bayes import GaussianNB

# TODO: split the data into training and testing sets,
# using the standard settings for train_test_split.
# Then, train and test the classifiers with your newly split data instead of X and y.
from sklearn import cross_validation
X_train, x, Y_train, y = cross_validation.train_test_split(X, Y)

clf1 = DecisionTreeClassifier()
clf1.fit(X_train, Y_train)
recall1 = recall(y,clf1.predict(x))
precision1 = precision(y,clf1.predict(x))
print "Decision Tree recall: {:.2f} and precision: {:.2f}".format(recall1, precision1)

clf2 = GaussianNB()
clf2.fit(X_train, Y_train)
recall2 = recall(y,clf2.predict(x))
precision2 = precision(y,clf2.predict(x))
print "GaussianNB recall: {:.2f} and precision: {:.2f}".format(recall2, precision2)

results = {
  "Naive Bayes Recall": recall2,
  "Naive Bayes Precision": precision2,
  "Decision Tree Recall": recall1,
  "Decision Tree Precision": precision1
}
X = X._get_numeric_data()
y = X['Survived']
del X['Age'], X['Survived']


from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import recall_score as recall
from sklearn.metrics import precision_score as precision
from sklearn.naive_bayes import GaussianNB

# TODO: split the data into training and testing sets,
# using the standard settings for train_test_split.
# Then, train and test the classifiers with your newly split data instead of X and y.
from sklearn import cross_validation
features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(X, y, test_size=0.4, random_state=0)

clf1 = DecisionTreeClassifier()
clf1.fit(features_train, labels_train)
print "Decision Tree recall: {:.2f} and precision: {:.2f}".format(recall(labels_test, clf1.predict(features_test)), precision(labels_test, clf1.predict(features_test)))

clf2 = GaussianNB()
clf2.fit(features_train, labels_train)
print "GaussianNB recall: {:.2f} and precision: {:.2f}".format(recall(labels_test, clf2.predict(features_test)), precision(labels_test, clf2.predict(features_test)))

results = {
  "Naive Bayes Recall": recall(labels_test, clf2.predict(features_test)),
  "Naive Bayes Precision": precision(labels_test, clf2.predict(features_test)),
  "Decision Tree Recall": recall(labels_test, clf1.predict(features_test)),
  "Decision Tree Precision": precision(labels_test, clf1.predict(features_test))
}
from sklearn import cross_validation
features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(features, labels, test_size=0.4, random_state=0)

# The decision tree classifier
# clf1 = DecisionTreeClassifier()
# clf1.fit(features,labels)

# create the decision tree classifier, clf1
clf1 = DecisionTreeClassifier()

# Train the decision tree classifier with labels_train and features_train ( you 'train' with the 'trains')
clf1.fit(features_train, labels_train)

#Use precision and recall evaluation metric to test the 'test' data  ie features_test and label_test
print "Decision Tree recall: {:.2f} and precision: {:.2f}".format(recall(labels_test, clf1.predict(features_test)), precision(labels_test, clf1.predict(features_test)))

# As seen in above line
# Get the decision tree recall 'dt_recall by applying recall function on 'test set' data of features and labels ie features_test & labels_test
dt_recall = recall(labels_test, clf1.predict(features_test))

# Also
# Get the decision tree precision 'dt_precision by applying precision function on 'test set' data of features and labels ie features_test & labels_test
dt_precision = precision(labels_test, clf1.predict(features_test))




# The naive Bayes classifier
# clf2 = GaussianNB()
# clf2.fit(features,labels)
 "Decision Tree Score": accuracy_score(clf1.predict(feature_test),label_test)
}

#Confusion matrix
from sklearn.metrics import confusion_matrix
confusions = {
 "Naive Bayes": confusion_matrix(clf2.predict(feature_test), label_test),
 "Decision Tree": confusion_matrix(clf1.predict(feature_test), label_test)
}

print confusions

# Precision and recall
from sklearn.metrics import recall_score as recall
from sklearn.metrics import precision_score as precision
results = {
  "Naive Bayes Recall": recall(clf2.predict(feature_test),label_test),
  "Naive Bayes Precision": precision(clf2.predict(feature_test),label_test),
  "Decision Tree Recall": recall(clf1.predict(feature_test),label_test),
  "Decision Tree Precision": precision(clf1.predict(feature_test),label_test)
}

print results

# Naive Bayes
from sklearn.metrics import f1_score
F1_scores = {
 "Naive Bayes": f1_score(clf2.predict(feature_test),label_test),
 "Decision Tree": f1_score(clf1.predict(feature_test),label_test)
}
print F1_scores
p_file = sys.argv[2]

print "loading p..."

p = np.loadtxt( p_file )

y_predicted = np.ones(( p.shape[0] ))
y_predicted[p < 0] = -1

print "loading y..."

y = np.loadtxt( y_file, usecols= [0] )

print "accuracy:", accuracy( y, y_predicted )
print "precision:", precision( y, y_predicted, average='binary' )
print "recall:", recall( y, y_predicted, average='binary' )
print "AUC:", AUC( y, p )

print
print "confusion matrix:"
print confusion_matrix( y, y_predicted )


"""
run score.py data/test_v.txt vw/p_v_logistic.txt

accuracy: 0.994675826535

confusion matrix:
[[27444   136]
 [  236 42054]]