def best_c_logistic_regression(self,
                                   train_X,
                                   train_y,
                                   test_X,
                                   test_y,
                                   c_list=np.arange(0.1, 1, 0.1),
                                   penalty='l2'):

        auc = []
        for c in c_list:
            print(c)
            # all_solvers = ['liblinear', 'newton-cg', 'lbfgs', 'sag', 'saga']
            LR = LogisticRegression(C=c, penalty=penalty,
                                    solver='liblinear').fit(
                                        np.mat(train_X), np.ravel(train_y))
            pred = LR.predict_proba(np.mat(test_X))[:, 1]
            test_auc = roc_auc_score(test_y, pred)
            auc.append(test_auc)
        position = np.argmax(auc)
        c_best = c_list[position]
        print('max auc: ', max(auc))
        LR = LogisticRegression(C=c_best, penalty=penalty,
                                solver='liblinear').fit(
                                    np.mat(train_X), np.ravel(train_y))
        # parameters = {'C': c_list}
        # lr = GridSearchCV(n_jobs=-1, estimator=LogisticRegression(penalty=penalty), param_grid=parameters, scoring='f1', cv=5,)
        # LR.fit(train_X, train_y)
        # best_c,

        return LR
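# A hedged sketch of the cross-validated alternative hinted at by the
# commented-out block above, using GridSearchCV instead of the manual loop over
# c_list; scoring='roc_auc' and cv=5 are illustrative assumptions.
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

param_grid = {'C': np.arange(0.1, 1, 0.1)}
search = GridSearchCV(LogisticRegression(penalty='l2', solver='liblinear'),
                      param_grid=param_grid, scoring='roc_auc', cv=5, n_jobs=-1)
# search.fit(np.asarray(train_X), np.ravel(train_y))  # train_X / train_y as above
# best_LR = search.best_estimator_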
def get_acc_auc_kfold(X,Y,k=5):
	#TODO:First get the train indices and test indices for each iteration
	#Then train the classifier accordingly
	#Report the mean accuracy and mean auc of all the folds
	a=KFold(len(Y),k)
	acc=[]
	auc=[]
	for train_index, test_index in a:
		X_train, X_test = X[train_index], X[test_index]
		Y_train, Y_test = Y[train_index], Y[test_index]
		Y_pred=models.logistic_regression_pred(X_train, Y_train, X_test)
		'''
		false_positive_rate, true_positive_rate, thresholds = roc_curve(Y_test, Y_pred)
		roc_auc = sklearn.metrics.roc_auc_score(Y_test, Y_pred)
		plt.title('Receiver Operating Characteristic')
		plt.plot(false_positive_rate, true_positive_rate, 'b',label='AUC = %0.2f'% roc_auc)
		plt.legend(loc='lower right')
		plt.plot([0,1],[0,1],'r--')
		plt.xlim([-0.1,1.2])
		plt.ylim([-0.1,1.2])
		plt.ylabel('True Positive Rate')
		plt.xlabel('False Positive Rate')
		plt.show()
		'''
		
		acc_1=sklearn.metrics.accuracy_score(Y_test, Y_pred)
		auc_1=sklearn.metrics.roc_auc_score(Y_test, Y_pred)
		acc.append(acc_1)
		auc.append(auc_1)
	acc_mean=mean(acc)
	auc_mean=mean(auc)
	
	return acc_mean,auc_mean
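# For reference, a minimal sketch of the same k-fold accuracy/AUC computation
# against the current sklearn.model_selection.KFold API (the constructor used
# above is the older cross_validation-style KFold(n, n_folds)). The plain
# LogisticRegression below is a stand-in assumption for
# models.logistic_regression_pred, which is not shown on this page.
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import KFold

def get_acc_auc_kfold_modern(X, Y, k=5):
    acc, auc = [], []
    for train_index, test_index in KFold(n_splits=k).split(X):
        X_train, X_test = X[train_index], X[test_index]
        Y_train, Y_test = Y[train_index], Y[test_index]
        Y_pred = LogisticRegression().fit(X_train, Y_train).predict(X_test)
        acc.append(accuracy_score(Y_test, Y_pred))
        auc.append(roc_auc_score(Y_test, Y_pred))
    return np.mean(acc), np.mean(auc)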
Example #3
def get_pairwise_score(model, user_name: str, print_score: bool = False):
    global preprocessingFunction
    auc, accuracy, frr, far = list(), list(), list(), list()
    legal_features = get_legal_test_features_for_user(user_name)
    all_illegal_features = get_all_illegal_test_features_for_user(user_name)
    all_illegal_features = shuffle(all_illegal_features)

    n_fold = all_illegal_features.shape[0] // legal_features.shape[0]
    for illegal_features in np.array_split(all_illegal_features, n_fold):
        pairwise_features = np.vstack((legal_features, illegal_features))
        if preprocessingFunction is not None:
            pairwise_features = preprocessingFunction.transform(
                pairwise_features)
        y_true = np.ones(pairwise_features.shape[0])
        y_true[legal_features.shape[0]:] = -1
        y_score = model.decision_function(pairwise_features)
        # ^-- Signed distance is positive for an inlier and negative for an outlier.
        auc.append(roc_auc_score(y_true, y_score))
        y_score = model.predict(pairwise_features)
        accuracy.append(np.mean(y_score == y_true))
        frr.append(
            np.sum(y_score[:legal_features.shape[0]] == -1) /
            pairwise_features.shape[0])
        far.append(
            np.sum(y_score[-illegal_features.shape[0]:] == 1) /
            pairwise_features.shape[0])
        if print_score:
            print(f"    AUC     = {auc[-1]:.2f}\n"
                  f"    ACC     = {accuracy[-1]:.2f}\n"
                  f"    FRR(I)  = {frr[-1]:.2f}%\n"
                  f"    FAR(II) = {far[-1]:.2f}%\n")
    return np.mean(auc), np.mean(accuracy), np.mean(frr), np.mean(far)
Example #4
def train(model, cv_data, intMat, drugMat, targetMat):
    aupr, auc = [], []
    for seed in cv_data.keys():
        for W, test_data, test_label in cv_data[seed]:
            model.fix_model(W, intMat, drugMat, targetMat, seed)
            aupr_val, auc_val = model.evaluation(test_data, test_label)
            aupr.append(aupr_val)
            auc.append(auc_val)
    return np.array(aupr, dtype=np.float64), np.array(auc, dtype=np.float64)
def make_auc(files):
    cols = ['GE-GE','GE-MIX','GE-TOSHIBA','MIX-GE','MIX-MIX','MIX-TOSHIBA','TOSHIBA-GE','TOSHIBA-MIX','TOSHIBA-TOSHIBA']
    auc = []
    for file in files:
        # eval() resolves the name in `file` to an object exposing an `auc` column
        auc.append(eval(file).auc.values)

    df = pd.DataFrame(auc).T
    df.columns = cols
    return df
Example #6
def deep_learning(posfile, negfile, predfile, fileout):
    acc, sep, sen, mcc, f1, auc, prauc = [], [], [], [], [], [], []
    best = {
        'batch_size': 8.0,
        'drop_out': 0.10680339747442835,
        'hdim': 48.0,
        'l2_reg': 0.00024102301670176588,
        'learning_rate': 0.0012709235952012008,
        'sdim': 32.0,
        'tdim': 11.0
    }
    model = get_DNN_model(best)
    earlystopper = EarlyStopping(monitor='val_loss', patience=5, verbose=1)
    for i in range(10):
        X_train, Y_train, X_val, Y_val, X_test, Y_test = get_DNN_data(
            posfile, negfile)
        model.fit(X_train,
                  Y_train,
                  batch_size=2**int(best['batch_size']),
                  epochs=100,
                  shuffle=True,
                  validation_data=(X_val, Y_val),
                  callbacks=[earlystopper])
        predictions = model.predict(X_test)
        rounded = [round(x[0]) for x in predictions]
        pred_train_prob = [x[0] for x in model.predict_proba(X_test)]
        accuracy, specificity, sensitivity, mccvalue, f1value, aucvalue, praucvalue = metrics(
            Y_test, rounded, pred_train_prob, fileout)
        acc.append(accuracy)
        sep.append(specificity)
        sen.append(sensitivity)
        mcc.append(mccvalue)
        f1.append(f1value)
        auc.append(aucvalue)
        prauc.append(praucvalue)
    fileout.write("DNN\n" + "Accuracy_mean: " + str(np.mean(acc)) + "\n" +
                  "Sepcificity_mean: " + str(np.mean(sep)) + "\n" +
                  "Sensitivity_mean: " + str(np.mean(sen)) + "\n" +
                  "MCC_mean: " + str(np.mean(mcc)) + "\n" + "Fscore_mean: " +
                  str(np.mean(f1)) + "\n" + "AUC_mean: " + str(np.mean(auc)) +
                  "\n" + "PRAUC_mean: " + str(np.mean(prauc)) + "\n")
    X_train, Y_train, X_val, Y_val, X_pred, Info_pred = get_DNN_pred_data(
        posfile, negfile, predfile)
    model.fit(X_train,
              Y_train,
              batch_size=2**int(best['batch_size']),
              epochs=100,
              shuffle=True,
              validation_data=(X_val, Y_val),
              callbacks=[earlystopper])
    predictions = model.predict(X_pred)
    rounded = [round(x[0]) for x in predictions]
    pred_train_prob = [x[0] for x in model.predict_proba(X_pred)]
    return rounded, pred_train_prob
Example #7
def new_metric(tdf, seqnum_columns, y_label):
    true_label = list(tdf[y_label])
    auc = []
    for column_name in seqnum_columns:
        pred_label = list(tdf[column_name])
        fpr, tpr, thresholds = metrics.roc_curve(true_label,
                                                 pred_label,
                                                 pos_label=1)
        value = metrics.auc(fpr, tpr)
        auc.append(value)
    return pd.DataFrame({"method": seqnum_columns, "metric_auc": auc})
Example #8
def train(model, cv_data, intMat, drugMat, targetMat, N=5):
    aupr, auc = [], []
    for seed in cv_data.keys():
        for W, test_data, test_label in cv_data[seed]:
            model.fix_model(W, intMat, drugMat, targetMat, seed)
            # model.fix_model(W*intMat, drugMat, targetMat, seed)
            scores = model.predict_scores(test_data)
            aupr_val, auc_val = evaluation(scores, test_label.astype(int))
            aupr.append(aupr_val)
            auc.append(auc_val)
    return np.array(aupr, dtype=np.float64), np.array(auc, dtype=np.float64)
Example #9
def train_cv_model(X, Y):
    '''
    train cv xgb model and return auc vectors for test and train
    
    '''
    skf = StratifiedKFold(n_splits=3, shuffle=True)

    auc = []
    auc_train = []

    for train_index, test_index in skf.split(X, Y):
        print(train_index)

        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = Y.iloc[train_index], Y.iloc[test_index]

        model = xgb.XGBClassifier(
            objective='binary:logistic',
            colsample_bytree=1,
            learning_rate=0.03,
            max_depth=6,
            subsample=0.8,
            n_estimators=500,
            base_score=0.22,
            seed=2,
        )

        eval_result = {}
        eval_set = [(X_train, y_train, 'train'), (X_test, y_test, 'test')]

        model.fit(X_train,
                  y_train,
                  verbose=True,
                  eval_set=eval_set,
                  eval_metric="auc",
                  callbacks=[
                      xgb.callback.record_evaluation(eval_result),
                      xgb.callback.early_stop(15)
                  ])

        preds_test = model.predict_proba(X_test)[:, 1]
        preds_train = model.predict_proba(X_train)[:, 1]

        auc.append(roc_auc_score(y_test, preds_test))
        auc_train.append(roc_auc_score(y_train, preds_train))

    #print("test mean: {0}  train mean: {1}".format(np.mean(auc),np.mean(auc_train)))
    return {"test_auc": auc, "train_auc": auc_train, "model": model}
Example #10
def train(model, cv_data, intMat, drugMat, targetMat):
    aupr, auc, ndcg, ndcg_inv, results = [], [], [], [], []
    for seed in cv_data.keys():
        for W, test_data, test_label in cv_data[seed]:
            t = time.perf_counter()
            model.fix_model(W, intMat, drugMat, targetMat, seed)
            aupr_val, auc_val, ndcg_val, ndcg_inv_val = model.evaluation(
                test_data, test_label)
            results = results + [("", "", "", "")] + list(zip(
                test_data[:, 0], test_data[:, 1], test_label, model.scores))

            print(aupr_val, auc_val, ndcg_val, ndcg_inv_val, time.perf_counter() - t)
            aupr.append(aupr_val)
            auc.append(auc_val)
            ndcg.append(ndcg_val)
            ndcg_inv.append(ndcg_inv_val)
    return np.array(aupr, dtype=np.float64), np.array(
        auc, dtype=np.float64), np.array(ndcg, dtype=np.float64), np.array(
            ndcg_inv, dtype=np.float64), results
def get_acc_auc_randomisedCV(X,Y,iterNo=5,test_percent=0.2):
	#TODO: First get the train indices and test indices for each iteration
	#Then train the classifier accordingly
	#Report the mean accuracy and mean auc of all the iterations
	a=ShuffleSplit(len(Y),iterNo,test_percent)
	acc=[]
	auc=[]
	for train_index, test_index in a:
		X_train, X_test = X[train_index], X[test_index]
		Y_train, Y_test = Y[train_index], Y[test_index]
		Y_pred=models.logistic_regression_pred(X_train, Y_train, X_test)
		acc_1=sklearn.metrics.accuracy_score(Y_test, Y_pred)
		auc_1=sklearn.metrics.roc_auc_score(Y_test, Y_pred)
		acc.append(acc_1)
		auc.append(auc_1)
	acc_mean=mean(acc)
	auc_mean=mean(auc)
	
	return acc_mean,auc_mean
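# For reference (hedged sketch): the current sklearn.model_selection.ShuffleSplit
# API takes n_splits/test_size at construction and yields indices from .split(X),
# replacing the positional ShuffleSplit(n, n_iter, test_size) call used above.
from sklearn.model_selection import ShuffleSplit

splitter = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
# for train_index, test_index in splitter.split(X):
#     ...same accuracy/AUC loop as above...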
Example #12
def check_parameters(parameters, values, fixed={}, features=None):
    scores = []
    f1 = []
    auc = []
    for p in values:
        print(f'Fitting with {parameters}={p}')
        fts = X_train.columns if features is None else features
        kw = {parameters: p, **fixed}
        model = RandomForestClassifier(**kw)
        model.fit(X_train[fts], y_train)
        s = roc_auc_score(y_valid, model.predict_proba(X_valid)[:, 1])
        rf_f1, rf_auc = auc_score(model, X_valid[fts], y_valid)

        print('ROC AUC Score', s)
        print('F1', rf_f1)
        print('Auc', rf_auc)
        print('')
        scores.append(rf_auc)
        f1.append(rf_f1)
        auc.append(rf_auc)
    plt.title(parameters)
    plt.plot(values, scores)
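# Hedged scaffolding so check_parameters can run stand-alone. The real module
# defines X_train/X_valid/y_train/y_valid and an auc_score helper elsewhere;
# the versions below (and the parameter values in the final call) are
# assumptions that only mirror how they are used in the snippet above.
import matplotlib.pyplot as plt  # also relied on by the snippet above
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier  # also relied on by the snippet above
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.model_selection import train_test_split

X_arr, y_arr = make_classification(n_samples=400, n_features=8, random_state=0)
X_df = pd.DataFrame(X_arr, columns=[f"f{i}" for i in range(8)])
X_train, X_valid, y_train, y_valid = train_test_split(X_df, y_arr, random_state=0)

def auc_score(model, X, y):
    # Assumed signature: returns (F1 at a 0.5 threshold, ROC AUC).
    proba = model.predict_proba(X)[:, 1]
    return f1_score(y, (proba > 0.5).astype(int)), roc_auc_score(y, proba)

check_parameters('max_depth', [2, 4, 8], fixed={'n_estimators': 100, 'random_state': 0})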
Example #13
def reporting(dat, task, m, f, dct, multi=False):
    data = {task: {}}
    #pheno25
    #task = '25 ddx'
    data[task][m + '_' + f] = {}

    if multi:
        f1 = []
        auc = []
        sen = {}
        spec = {}
        for loop in list(dat[f].keys()):
            sen[loop] = []
            spec[loop] = []
            f1.append(list(dat[f][loop]['f1_score'][0].values()))
            auc.append(list(dat[f][loop]['te_auc'][0].values()))
            for mat in list(dat[f][loop]['te_matrix'][0].values()):
                tn, fp, fn, tp = mat.ravel()
                sen[loop].append(1.0 * (tp / (tp + fn)))
                spec[loop].append(1.0 * (tn / (tn + fp)))
        f1 = (np.mean(f1, axis=0), np.std(f1, axis=0))
        auc = (np.mean(auc, axis=0), np.std(auc, axis=0))
        sen = (np.mean(list(sen.values()),
                       axis=0), np.std(list(sen.values()), axis=0))
        spec = (np.mean(list(spec.values()),
                        axis=0), np.std(list(spec.values()), axis=0))

        for n in range(len(f1[0])):
            data[task][m + '_' + f]['f1_' + dct[n]] = '{0:.3}'.format(
                f1[0][n]) + '({0:.3})'.format(f1[1][n])
            data[task][m + '_' + f]['auc_' + dct[n]] = '{0:.3}'.format(
                auc[0][n]) + '({0:.3})'.format(auc[1][n])
            try:
                data[task][m + '_' + f]['sen_' + dct[n]] = '{0:.3}'.format(
                    sen[0][n]) + '({0:.3})'.format(sen[1][n])
                data[task][m + '_' + f]['spec_' + dct[n]] = '{0:.3}'.format(
                    spec[0][n]) + '({0:.3})'.format(spec[1][n])
            except:
                pass
    else:
        f1 = []
        auc = []
        sen = []
        spec = []
        for loop in list(dat[f].keys()):
            f1.append(dat[f][loop]['f1_score'][0])
            auc.append(dat[f][loop]['te_auc'][0])
            tn, fp, fn, tp = dat[f][loop]['te_matrix'][0].ravel()
            sen.append(1.0 * (tp / (tp + fn)))
            spec.append(1.0 * (tn / (tn + fp)))
        f1 = (np.mean(f1, axis=0), np.std(f1, axis=0))
        auc = (np.mean(auc, axis=0), np.std(auc, axis=0))
        sen = (np.mean(sen, axis=0), np.std(sen, axis=0))
        spec = (np.mean(spec, axis=0), np.std(spec, axis=0))

        data[task][m + '_' + f]['f1'] = '{0:.3}'.format(
            f1[0]) + ' ({0:.3})'.format(f1[1])
        data[task][m + '_' + f]['auc'] = '{0:.3}'.format(
            auc[0]) + ' ({0:.3})'.format(auc[1])
        data[task][m + '_' + f]['sen'] = '{0:.3}'.format(
            sen[0]) + ' ({0:.3})'.format(sen[1])
        data[task][m + '_' + f]['spec'] = '{0:.3}'.format(
            spec[0]) + ' ({0:.3})'.format(spec[1])

    return data
Example #14
end = 0
print(len(list_exc))
steps = range(0, len(list_exc), 20)
print(steps)
feature_ids = []
auc = []
for i in range(0, len(steps) - 1):
    begin = steps[i]
    end = steps[i + 1]
    feature_ids.extend(list_exc[range(begin, end)])
    x_train = train_data[:, feature_ids]
    x_valid = valid_data[:, feature_ids]
    x_test = test_data[:, feature_ids]
    clf.fit(x_train, y_train)
    dec_val_test = clf.decision_function(x_test)
    auc.append(roc_auc_score(y_test, dec_val_test))
    #print feature_ids
feature_ids.extend(list_exc[range(begin, end)])
#print feature_ids
x_train = train_data[:, feature_ids]
x_valid = valid_data[:, feature_ids]
x_test = test_data[:, feature_ids]
clf.fit(x_train, y_train)
dec_val_test = clf.decision_function(x_test)
auc.append(roc_auc_score(y_test, dec_val_test))
clf2 = linear_model.LogisticRegression(C=0.01, penalty='l1')
clf2.fit(x_train, y_train)
dec_val_test = clf2.decision_function(x_test)
auc_lasso = roc_auc_score(y_test, dec_val_test)
lasso = [auc_lasso] * len(list_exc)
#plt.title(",fontsize=18)
Example #15
                rec[0].append(numpy.average(yr, weights=ys))
                fone[0].append(numpy.average(yf1, weights=ys))
                sup[0].append(numpy.sum(ys))
                cnf_matrix=confusion_matrix(Y_test_temp, predictionMlp)
                cm_name_path="./Oberservations/Iter%s/pass%s/CM-MLP-fold%s.png" % (topiter, iter1,pltctr)
                plot_confusion_matrix(cnf_matrix, classes=['Class-1','Class-2','Class-3','Class-4','Class-5'],
                                      title='confusion matrix',pltname=cm_name_path)

                prob=mlp.predict_proba(X_test)
                prob=prob.transpose()
                auc = []
                tp = []
                fp = []
                for cx in range(0,5):
                    fpr, tpr, threshold = sklearn.metrics.roc_curve(ohc[cx], prob[cx], pos_label=1)
                    auc.append(sklearn.metrics.auc(fpr, tpr))
                    tp.append(tpr)
                    fp.append(fpr)
                plotROC(fpr=fp, tpr=tp,
                        path="./Oberservations/Iter%s/pass%s/ROC-pass-%s-MLP.png" % (topiter, iter1, pltctr), auc=auc)
                # ------------
                # -------------------------------------------------------------
                # logistic regression
                reports.append("----------------------Logistic Regression-------------------------------")
                print "-------------------------Logistic Regression----------------------------------"

                modelName.append("Logistic Regresion")
                logreg = LogisticRegression(multi_class='multinomial', solver='newton-cg', max_iter=5,
                                            class_weight='balanced', C=1)
                mLogreg = logreg.fit(X_train, Y_train_temp)
                pred = mLogreg.predict(X_test)
Example #16
        estimator = CatBoostClassifier(iterations = 1000,depth = 10,learning_rate = 0.1,logging_level = None,scale_pos_weight = 45)
        #estimator = svm.SVC(kernel = 'rbf',C = 10, gamma = 0.012)
        #estimator = lgb.LGBMClassifier(is_unbalance = True, learning_rate = 0.012)
        model = estimator.fit(X_train[train,:], y_train[train])

        y_pred = estimator.predict(X_train[test,:])
        y_proba_pred = estimator.predict_proba(X_train[test,:])[:,1]
        TP = numpy.sum(numpy.logical_and(numpy.equal(y_train[test],1),numpy.equal(y_pred,1)))
        FP = numpy.sum(numpy.logical_and(numpy.equal(y_train[test],0),numpy.equal(y_pred,1)))
        TN = numpy.sum(numpy.logical_and(numpy.equal(y_train[test],0),numpy.equal(y_pred,0)))
        FN = numpy.sum(numpy.logical_and(numpy.equal(y_train[test],1),numpy.equal(y_pred,0)))

        accuracy = (TP+TN)/(TP+FP+TN+FN)
        acc.append(accuracy)
        fpr, tpr, th = metrics.roc_curve(y_train[test],y_proba_pred ,pos_label=1)
        auc.append(metrics.auc(fpr, tpr))
        plot_AUROC(fpr,tpr)
        aupr.append(metrics.average_precision_score(y_train[test],y_proba_pred))
        if metrics.average_precision_score(y_train[test],y_proba_pred) > max_num:
            max_num = metrics.average_precision_score(y_train[test],y_proba_pred)
            model.save_model(cmd+cellline_dir+'best_model{}'.format(kvalue))
        prec, rec, thres = metrics.precision_recall_curve(y_train[test],y_proba_pred ,pos_label=1)
        auprc.append(metrics.auc(rec, prec))

        plot_AUPRC(rec,prec)
        recall.append(metrics.recall_score(y_train[test],y_pred))
        precision.append(metrics.precision_score(y_train[test],y_pred))
        f1.append(metrics.f1_score(y_train[test],y_pred))
        m_c_c = (TP*TN - FP*FN)/(math.sqrt((TP+FN)*(TP+FP)*(TN+FN)*(TN+FP)))
        mcc.append(m_c_c)
Example #17
def main():

    infofile = open(modelDir.replace('.h5', '_infofile.txt'))
    infos = infofile.readlines()
    analysis = infos[0].replace('Used analysis method: ', '').replace('\n', '')
    dataset = DatasetDir + infos[3].replace('Used dataset: ', '').replace(
        '\n', '')
    nvar = infos[5].replace('Used variables for training: ',
                            '').replace('\n', '')
    nvar = nvar.split()

    model = load_model(modelDir)

    scaler = joblib.load(SCALING)

    recurrent = False
    if analysis.lower() == 'rnn':
        recurrent = True

    h5f = h5py.File(dataset + '.h5', 'r')
    X_train = h5f['X_train'][:]
    y = h5f['y_train'][:]

    y_train = deepcopy(y)
    y_train[y != 0] = 0.
    y_train[y == 0] = 1.

    collection = []
    if recurrent:
        for col in COLLECTION:
            collection.append(h5f['X_train_' + col][:])

    h5f.close()

    where_nan = np.isnan(X_train)
    X_train[where_nan] = -999.
    X_train = scaler.transform(
        X_train)  # collection already standardized in training

    print '#----MODEL----#'
    print modelDir
    print model.summary()

    ######################################
    # Read in trained and tested dataset #
    ######################################

    if recurrent:
        y_hat = model.predict(collection + [X_train])
    else:
        y_hat = model.predict(X_train)

    importanceBySquaredWeight = getImportanceBySquaredWeight(
        model, nvar, recurrent)
    importanceByWeight = getImportanceByWeight(model, nvar, recurrent)
    importanceByGrad = getImportanceByGradient(model, nvar, X_train, collection,
                                               recurrent)

    # Re-shuffle for re-evaluate
    X_train_reshuffled = []
    for idx, var in enumerate(nvar):
        X = np.copy(X_train)
        print X[:1]
        np.random.shuffle(X[:, idx])
        print X[:1], '\n'
        X_train_reshuffled.append(X)

    roc = []
    auc = []

    for i in xrange(len(X_train_reshuffled)):
        print type(X_train_reshuffled[i])
        if recurrent:
            y_predict = model.predict(collection + [X_train_reshuffled[i]])
        else:
            y_predict = model.predict(X_train_reshuffled[i])

        roc.append(roc_curve(y_train, y_predict[:, 0]))
        auc.append(roc_auc_score(y_train, y_predict[:, 0]))
        del y_predict

    roc.append(roc_curve(y_train, y_hat[:, 0]))
    auc.append(roc_auc_score(y_train, y_hat[:, 0]))
    print auc, '\n', importanceBySquaredWeight, '\n', importanceByWeight, '\n', importanceByGrad, '\n'

    print 100 * '#'
    print '\n\t\t\tVariable ranking'
    print '\n sum of squared weights \t sum of absolute weights \t gradients \t AUC (after shuffle)'
    print 100 * '-'
    for i in xrange(len(nvar)):
        print '{}: {}\t{}: {}\t{}: {}\t{}: {}'.format(
            importanceBySquaredWeight[i][0], importanceBySquaredWeight[i][1],
            importanceByWeight[i][0], importanceByWeight[i][1],
            importanceByGrad[i][0], importanceByGrad[i][1], nvar[i], auc[i])
    print 100 * '-'
    print 100 * '#'

    print('Plotting the ROC curves ...')
    fig = plt.figure(figsize=(8, 6))
    ax1 = plt.subplot2grid((4, 4), (0, 0), colspan=4, rowspan=4)
    ax1.set_xlim((0, 1))
    ax1.set_ylim((0, 1))
    ax1.set_xlabel('$\epsilon_{Sig.}$', horizontalalignment='right', x=1.0)
    ax1.set_ylabel("$r_{Bkg.}$", horizontalalignment='right', y=1.0)

    for i in xrange(len(roc)):
        try:
            plt.plot(roc[i][1],
                     1 - roc[i][0],
                     '-',
                     label='w/o %s (AUC = %0.4f)' % (nvar[i], auc[i]))
        except IndexError:
            plt.plot(roc[i][1],
                     1 - roc[i][0],
                     '-',
                     label='Default (AUC = %0.4f)' % (auc[i]))

    plt.plot([0, 1], [1, 0], '--', color=(0.6, 0.6, 0.6), label='Luck')
    leg = plt.legend(loc="lower left", frameon=False)

    AtlasStyle_mpl.ATLASLabel(ax1, 0.13, 0.9, 'Work in progress')
    #AtlasStyle_mpl.LumiLabel(ax1, 0.02, 0.3, lumi=LUMI*0.001)

    plt.savefig("plots/" + modelfile + "_ROC_n-1.pdf")
    plt.savefig("plots/" + modelfile + "_ROC_n-1.png")
    plt.close()
Example #18
def renew(f,t,a):
	fpr.append(f)
	tpr.append(t)
	auc.append(a)
import h5py
import numpy as np
import scipy.io
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import average_precision_score
import statistics as st

y_pred_cnn = h5py.File('prediction', 'r')
pred = np.array(y_pred_cnn['pred'])
testmat = scipy.io.loadmat('test_add_1.mat')
y_test = testmat['testdata']

auc = []
for i in range(0, 919):
    auc.append(roc_auc_score(y_test[:, i], pred[:, i]))
print(auc)
print(sum(auc) / 919)

y = range(0, 125)
plt.figure()
plt.plot(y, sorted(auc[0:125]))
plt.show()
y = range(0, 690)
plt.figure()
plt.plot(y, sorted(auc[125:815]))
plt.show()
y3 = range(0, 104)
plt.figure()
plt.plot(y3, sorted(auc[815:919]))
plt.show()
Example #20
    def test_model(self, params=None):
        """
            Function to perform the core machine learning analysis. 
            Metrics are calculated in Cross Validation and stored as a dictionary
            with average values and std. 

            Arguements:
                params: A dictionary of parameters, "model_instance" is required. 
                        Should be of the form:
                        params={'model_instance': <desired model>,
                                'scaler_instance': <optional scaler>,
                                'imputer_instance': <optional imputer>}
                

            Returns: A dictionary of tested models with corresponding metrics
        """

######################### Scale, CV, Imputation ################################   

        self.params = params
        
        if 'model_instance' not in self.params:
            raise Exception('No model defined in params. '
                            'Use form {"model_instance": <model>}')
        if 'scaler_instance' in self.params:
            scaler = self.params['scaler_instance']
            scaled_x = scaler.fit_transform(X=self.df)
            X = scaled_x
            y = self.target.values
        else:
            X = self.df.values
            y= self.target.values
            
        accuracies = []
        balanced_accuracies = []
        recalls = []
        precisions = []
        specificities = []
        f1_scores = []
        auc = []

        y_hat_probs = []
        y_tests = []

        model_instance = self.params['model_instance']
        k_fold = KFold(n_splits=self.cv_folds, 
                       random_state=self.random_seed,
                       shuffle=True)
        
        if 'imputer_instance' in self.params:
            med_imp = self.params['imputer_instance']

        kf = k_fold.split(X, y)
        for train_index, test_index in kf:
            X_train, y_train = X[train_index], y[train_index]
            X_test, y_test = X[test_index], y[test_index]
        
            if 'imputer_instance' in self.params:
                X_train = med_imp.fit_transform(X_train)
                X_test = med_imp.fit_transform(X_test)

            trained_model = model_instance.fit(X=X_train, y=y_train)
            y_hat= trained_model.predict(X_test)
            y_hat_prob = [p[1] for p in trained_model.predict_proba(X_test)]
            accuracies.append(np.mean(y_hat == y_test)) 
            if self.include_auc:
                auc.append(roc_auc_score(y_test, y_hat_prob)) 

            recall, precision, specificity, balanced_accuracy, f1_score =\
                self.calculate_accuracies(y_hat, y_test)

            recalls.append(recall)
            precisions.append(precision)
            specificities.append(specificity)
            balanced_accuracies.append(balanced_accuracy)
            f1_scores.append(f1_score)

            y_hat_probs += y_hat_prob
            y_tests += y_test.tolist()
            
        
        model_id = self._make_model_id()
        if self.include_auc:
            self.results[model_id] = { 
                'model_id': model_id,
                'model': model_instance,
                'f1_score': self._make_result(f1_scores),
                'recall': self._make_result(recalls),
                'precision': self._make_result(precisions),
                'specificity': self._make_result(specificities),
                'balanced_accuracy': self._make_result(balanced_accuracies),
                'accuracy': self._make_result(accuracies),
                'auc':self._make_result(auc)
            }
        else:
            self.results[model_id] = { 
                'model_id': model_id,
                'model': model_instance,
                'f1_score': self._make_result(f1_scores),
                'recall': self._make_result(recalls),
                'precision': self._make_result(precisions),
                'specificity': self._make_result(specificities),
                'balanced_accuracy': self._make_result(balanced_accuracies),
                'accuracy': self._make_result(accuracies)
            }
            
        self.predictions[model_id] = {
            'prediction_probabilities': y_hat_probs,
            'y_test': y_tests,
        }
        return self.results
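# Hedged usage sketch of the params dictionary described in the docstring above;
# the concrete model/scaler/imputer choices are illustrative assumptions.
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

params = {
    'model_instance': RandomForestClassifier(n_estimators=100, random_state=0),
    'scaler_instance': StandardScaler(),                    # optional
    'imputer_instance': SimpleImputer(strategy='median'),   # optional
}
# results = analysis.test_model(params=params)  # `analysis`: an instance of the class above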
Example #21
def plot_multi_SVM(prediction,
                   mutation_data,
                   label_type,
                   show_plots=False,
                   key=None,
                   n_classifiers=[1],
                   outputfolder=None):
    if type(prediction) is not pd.core.frame.DataFrame:
        if os.path.isfile(prediction):
            prediction = pd.read_hdf(prediction)

    keys = prediction.keys()
    SVMs = list()
    if key is None:
        label = keys[0]
    else:
        label = key
    SVMs = prediction[label]['classifiers']

    Y_test = prediction[label]['Y_test']
    X_test = prediction[label]['X_test']
    X_train = prediction[label]['X_train']
    Y_train = prediction[label]['Y_train']
    test_patient_IDs = prediction[label]['patient_ID_test']
    train_patient_IDs = prediction[label]['patient_ID_train']
    feature_labels = prediction[label]['feature_labels']

    # print(len(X_test[0][0]))
    # print(config)
    # X_train = data2['19q']['X_train']
    # Y_train = data2['19q']['Y_train']
    # mutation_data = gp.load_mutation_status(patientinfo, [[label]])
    if type(mutation_data) is not dict:
        if os.path.isfile(mutation_data):
            label_data = gp.load_mutation_status(mutation_data, [[label_type]])

    patient_IDs = label_data['patient_IDs']
    mutation_label = label_data['mutation_label']

    # print(len(SVMs))
    N_iterations = float(len(SVMs))

    # mutation_label = np.asarray(mutation_label)

    for n_class in n_classifiers:
        # output_json = os.path.join(outputfolder, ('performance_{}.json').format(str(n_class)))

        sensitivity = list()
        specificity = list()
        precision = list()
        accuracy = list()
        auc = list()
        # auc_train = list()
        f1_score_list = list()

        patient_classification_list = dict()

        trained_classifiers = list()

        y_score = list()
        y_test = list()
        pid_test = list()
        y_predict = list()

        # csvfile = os.path.join(outputfolder, ('scores_{}.csv').format(str(n_class)))
        # towrite = list()
        #
        # csvfile_plain = os.path.join(outputfolder, ('scores_plain_{}.csv').format(str(n_class)))
        # towrite_plain = list()

        empty_scores = {k: '' for k in natsort.natsorted(patient_IDs)}
        empty_scores = collections.OrderedDict(sorted(empty_scores.items()))
        # towrite.append(["Patient"] + empty_scores.keys())
        params = dict()
        for num, s in enumerate(SVMs):
            scores = empty_scores.copy()
            print("Processing {} / {}.").format(str(num + 1), str(len(SVMs)))
            trained_classifiers.append(s)

            # Extract test info
            test_patient_IDs_temp = test_patient_IDs[num]
            train_patient_IDs_temp = train_patient_IDs[num]
            X_train_temp = X_train[num]
            Y_train_temp = Y_train[num]
            X_test_temp = X_test[num]
            Y_test_temp = Y_test[num]

            # Extract sample size
            N_1 = float(len(train_patient_IDs_temp))
            N_2 = float(len(test_patient_IDs_temp))

            test_indices = list()
            for i_ID in test_patient_IDs_temp:
                test_indices.append(np.where(patient_IDs == i_ID)[0][0])

                if i_ID not in patient_classification_list:
                    patient_classification_list[i_ID] = dict()
                    patient_classification_list[i_ID]['N_test'] = 0
                    patient_classification_list[i_ID]['N_correct'] = 0
                    patient_classification_list[i_ID]['N_wrong'] = 0

                patient_classification_list[i_ID]['N_test'] += 1

            # y_truth = [mutation_label[0][k] for k in test_indices]
            # FIXME: order can be switched, need to find a smart fix
            # 1 for normal, 0 for KM
            # y_truth = [mutation_label[0][k][0] for k in test_indices]
            y_truth = Y_test_temp

            # Predict using the top N classifiers
            results = s.cv_results_['rank_test_score']
            indices = range(0, len(results))
            sortedindices = [x for _, x in sorted(zip(results, indices))]
            sortedindices = sortedindices[0:n_class]
            y_prediction = np.zeros([n_class, len(y_truth)])
            y_score = np.zeros([n_class, len(y_truth)])

            # Get some base objects required
            base_estimator = s.estimator
            y_train = Y_train_temp
            y_train_prediction = np.zeros([n_class, len(y_train)])
            scorer = s.scorer_
            train = np.asarray(range(0, len(y_train)))
            test = train  # This is in order to use the full training dataset to train the model

            # Remove the NaN features
            X_notnan = X_train_temp[:]
            for pnum, (pid, x) in enumerate(
                    zip(train_patient_IDs_temp, X_train_temp)):
                for fnum, (f, fid) in enumerate(zip(x, feature_labels)):
                    if np.isnan(f):
                        print(
                            "[PREDICT WARNING] NaN found, patient {}, label {}. Replacing with zero."
                        ).format(pid, fid)
                        # Note: X is a list of lists, hence we cannot index the element directly
                        features_notnan = x[:]
                        features_notnan[fnum] = 0
                        X_notnan[pnum] = features_notnan

            X_train_temp = X_notnan[:]
            X_train_temp = [(x, feature_labels) for x in X_train_temp]

            X_notnan = X_test_temp[:]
            for pnum, (pid,
                       x) in enumerate(zip(test_patient_IDs_temp,
                                           X_test_temp)):
                for fnum, (f, fid) in enumerate(zip(x, feature_labels)):
                    if np.isnan(f):
                        print(
                            "[PREDICT WARNING] NaN found, patient {}, label {}. Replacing with zero."
                        ).format(pid, fid)
                        # Note: X is a list of lists, hence we cannot index the element directly
                        features_notnan = x[:]
                        features_notnan[fnum] = 0
                        X_notnan[pnum] = features_notnan

            X_test_temp = X_notnan[:]
            # X_test_temp = [(x, feature_labels) for x in X_test_temp]

            # NOTE: need to build this in the SearchCVFastr Object
            for i, index in enumerate(sortedindices):
                print("Processing number {} of {} classifiers.").format(
                    str(i + 1), str(n_class))
                X_testtemp = X_test_temp[:]

                # Get the parameters from the index
                parameters_est = s.cv_results_['params'][index]
                parameters_all = s.cv_results_['params_all'][index]

                print parameters_all
                print s.cv_results_['mean_test_score'][index]

                # NOTE: kernel parameter can be unicode
                kernel = str(parameters_est[u'kernel'])
                del parameters_est[u'kernel']
                del parameters_all[u'kernel']
                parameters_est['kernel'] = kernel
                parameters_all['kernel'] = kernel

                # Refit a classifier using the settings given
                print("Refitting classifier with best settings.")
                # Only when using fastr this is an entry
                if 'Number' in parameters_est.keys():
                    del parameters_est['Number']

                best_estimator = clone(base_estimator).set_params(
                    **parameters_est)

                # ret, GroupSel, VarSel, SelectModel, feature_labels[0], scaler =\
                #     fit_and_score(best_estimator, X_train, y_train, scorer,
                #                   train, test, True, parameters_all,
                #                   t.fit_params,
                #                   t.return_train_score,
                #                   True, True, True,
                #                   t.error_score)

                ret, GroupSel, VarSel, SelectModel, _, scaler =\
                    fit_and_score(estimator=best_estimator,
                                  X=X_train_temp,
                                  y=y_train,
                                  scorer=scorer,
                                  train=train, test=test,
                                  verbose=True,
                                  para=parameters_all,
                                  fit_params=s.fit_params,
                                  return_train_score=s.return_train_score,
                                  return_n_test_samples=True,
                                  return_times=True,
                                  return_parameters=True,
                                  error_score=s.error_score)

                X = [x[0] for x in X_train_temp]
                if GroupSel is not None:
                    X = GroupSel.transform(X)
                    X_testtemp = GroupSel.transform(X_testtemp)

                if SelectModel is not None:
                    X = SelectModel.transform(X)
                    X_testtemp = SelectModel.transform(X_testtemp)

                if VarSel is not None:
                    X = VarSel.transform(X)
                    X_testtemp = VarSel.transform(X_testtemp)

                if scaler is not None:
                    X = scaler.transform(X)
                    X_testtemp = scaler.transform(X_testtemp)

                try:
                    if y_train is not None:
                        best_estimator.fit(X, y_train, **s.fit_params)
                    else:
                        best_estimator.fit(X, **s.fit_params)

                    # Predict the posterios using the fitted classifier for the training set
                    print("Evaluating performance on training set.")
                    if hasattr(best_estimator, 'predict_proba'):
                        probabilities = best_estimator.predict_proba(X)
                        y_train_prediction[i, :] = probabilities[:, 1]
                    else:
                        # Regression has no probabilities
                        probabilities = best_estimator.predict(X)
                        y_train_prediction[i, :] = probabilities[:]

                    # Predict the posterios using the fitted classifier for the test set
                    print("Evaluating performance on test set.")
                    if hasattr(best_estimator, 'predict_proba'):
                        probabilities = best_estimator.predict_proba(
                            X_testtemp)
                        y_prediction[i, :] = probabilities[:, 1]
                    else:
                        # Regression has no probabilities
                        probabilities = best_estimator.predict(X_testtemp)
                        y_prediction[i, :] = probabilities[:]

                    if type(s.estimator) == sklearn.svm.classes.SVC:
                        y_score[i, :] = best_estimator.decision_function(
                            X_testtemp)
                    else:
                        y_score[i, :] = best_estimator.decision_function(
                            X_testtemp)[:, 0]

                except ValueError:
                    # R2 score was set to zero previously
                    y_train_prediction[i, :] = np.asarray([0.5] * len(X))
                    y_prediction[i, :] = np.asarray([0.5] * len(X_testtemp))
                    y_score[i, :] = np.asarray([0.5] * len(X_testtemp))
                    probabilities = []

                # Add number parameter settings
                for k in parameters_all.keys():
                    if k not in params.keys():
                        params[k] = list()
                    params[k].append(parameters_all[k])

                # Save some memory
                del best_estimator, X, X_testtemp, ret, GroupSel, VarSel, SelectModel, scaler, parameters_est, parameters_all, probabilities

            # Take mean over posteriors of top n
            y_train_prediction_m = np.mean(y_train_prediction, axis=0)
            y_prediction_m = np.mean(y_prediction, axis=0)

            # NOTE: Not sure if this is best way to compute AUC
            y_score = y_prediction_m

            if type(s.estimator) == sklearn.svm.classes.SVC:
                # Look for optimal F1 performance on training set
                thresholds = np.arange(0, 1, 0.01)
                f1_scores = list()
                y_train_prediction = np.zeros(y_train_prediction_m.shape)
                for t in thresholds:
                    for ip, y in enumerate(y_train_prediction_m):
                        if y > t:
                            y_train_prediction[ip] = 1
                        else:
                            y_train_prediction[ip] = 0

                    f1_scores.append(
                        f1_score(y_train_prediction,
                                 y_train,
                                 average='weighted'))

                # Use best threshold to determine test score
                best_index = np.argmax(f1_scores)
                best_thresh = thresholds[best_index]
                best_thresh = 0.5
                y_prediction = np.zeros(y_prediction_m.shape)
                for ip, y in enumerate(y_prediction_m):
                    if y > best_thresh:
                        y_prediction[ip] = 1
                    else:
                        y_prediction[ip] = 0

                # y_prediction = t.predict(X_temp)

                y_prediction = [min(max(y, 0), 1) for y in y_prediction]
            else:
                y_prediction = y_prediction_m
                y_prediction = [min(max(y, 0), 1) for y in y_prediction]

            # NOTE: start of old function part

            print "Truth: ", y_truth
            print "Prediction: ", y_prediction

            for i_truth, i_predict, i_test_ID in zip(y_truth, y_prediction,
                                                     test_patient_IDs_temp):
                if i_truth == i_predict:
                    patient_classification_list[i_test_ID]['N_correct'] += 1
                else:
                    patient_classification_list[i_test_ID]['N_wrong'] += 1

            # print('bla')
            # print(y_truth)
            # print(y_prediction)

            c_mat = confusion_matrix(y_truth, y_prediction)
            TN = c_mat[0, 0]
            FN = c_mat[1, 0]
            TP = c_mat[1, 1]
            FP = c_mat[0, 1]

            if FN == 0 and TP == 0:
                sensitivity.append(0)
            else:
                sensitivity.append(float(TP) / (TP + FN))
            if FP == 0 and TN == 0:
                specificity.append(0)
            else:
                specificity.append(float(TN) / (FP + TN))
            if TP == 0 and FP == 0:
                precision.append(0)
            else:
                precision.append(float(TP) / (TP + FP))
            accuracy.append(accuracy_score(y_truth, y_prediction))
            auc.append(roc_auc_score(y_truth, y_score))
            f1_score_list.append(
                f1_score(y_truth, y_prediction, average='weighted'))

            # Adjusted according to "Inference for the Generelization error"

            accuracy_mean = np.mean(accuracy)
            S_uj = 1.0 / max((N_iterations - 1), 1) * np.sum(
                (accuracy_mean - accuracy)**2.0)

            print Y_test
            N_1 = float(len(Y_train[0]))
            N_2 = float(len(Y_test[0]))

            print(N_1)
            print(N_2)

            accuracy_var = np.sqrt((1.0 / N_iterations + N_2 / N_1) * S_uj)
            print(accuracy_var)
            print(np.sqrt(1 / N_iterations * S_uj))
            print(st.sem(accuracy))

        stats = dict()
        stats["Accuracy 95%:"] = str(
            compute_CI.compute_confidence(accuracy, N_1, N_2, 0.95))

        stats["AUC 95%:"] = str(
            compute_CI.compute_confidence(auc, N_1, N_2, 0.95))

        stats["F1-score 95%:"] = str(
            compute_CI.compute_confidence(f1_score_list, N_1, N_2, 0.95))

        stats["Precision 95%:"] = str(
            compute_CI.compute_confidence(precision, N_1, N_2, 0.95))

        stats["Sensitivity 95%: "] = str(
            compute_CI.compute_confidence(sensitivity, N_1, N_2, 0.95))

        stats["Specificity 95%:"] = str(
            compute_CI.compute_confidence(specificity, N_1, N_2, 0.95))

        print("Accuracy 95%:" +
              str(compute_CI.compute_confidence(accuracy, N_1, N_2, 0.95)))

        print("AUC 95%:" +
              str(compute_CI.compute_confidence(auc, N_1, N_2, 0.95)))

        print(
            "F1-score 95%:" +
            str(compute_CI.compute_confidence(f1_score_list, N_1, N_2, 0.95)))

        print("Precision 95%:" +
              str(compute_CI.compute_confidence(precision, N_1, N_2, 0.95)))

        print("Sensitivity 95%: " +
              str(compute_CI.compute_confidence(sensitivity, N_1, N_2, 0.95)))

        print("Specificity 95%:" +
              str(compute_CI.compute_confidence(specificity, N_1, N_2, 0.95)))

        alwaysright = dict()
        alwayswrong = dict()
        for i_ID in patient_classification_list:
            percentage_right = patient_classification_list[i_ID][
                'N_correct'] / float(
                    patient_classification_list[i_ID]['N_test'])

            # print(i_ID + ' , ' + str(patient_classification_list[i_ID]['N_test']) + ' : ' + str(percentage_right) + '\n')
            if percentage_right == 1.0:
                label = mutation_label[0][np.where(i_ID == patient_IDs)]
                label = label[0][0]
                alwaysright[i_ID] = label
                # alwaysright.append(('{} ({})').format(i_ID, label))
                print(("Always Right: {}, label {}").format(i_ID, label))

            if percentage_right == 0:
                label = mutation_label[0][np.where(
                    i_ID == patient_IDs)].tolist()
                label = label[0][0]
                alwayswrong[i_ID] = label
                # alwayswrong.append(('{} ({})').format(i_ID, label))
                print(("Always Wrong: {}, label {}").format(i_ID, label))

        stats["Always right"] = alwaysright
        stats["Always wrong"] = alwayswrong

        if show_plots:
            import matplotlib.pyplot as plt

            plt.figure()
            plt.boxplot(accuracy)
            plt.ylim([-0.05, 1.05])
            plt.ylabel('Accuracy')
            plt.tick_params(
                axis='x',  # changes apply to the x-axis
                which='both',  # both major and minor ticks are affected
                bottom='off',  # ticks along the bottom edge are off
                top='off',  # ticks along the top edge are off
                labelbottom='off')  # labels along the bottom edge are off
            plt.tight_layout()
            plt.show()

            plt.figure()
            plt.boxplot(auc)
            plt.ylim([-0.05, 1.05])
            plt.ylabel('AUC')
            plt.tick_params(
                axis='x',  # changes apply to the x-axis
                which='both',  # both major and minor ticks are affected
                bottom='off',  # ticks along the bottom edge are off
                top='off',  # ticks along the top edge are off
                labelbottom='off')  # labels along the bottom edge are off
            plt.tight_layout()
            plt.show()

            plt.figure()
            plt.boxplot(precision)
            plt.ylim([-0.05, 1.05])
            plt.ylabel('Precision')
            plt.tick_params(
                axis='x',  # changes apply to the x-axis
                which='both',  # both major and minor ticks are affected
                bottom='off',  # ticks along the bottom edge are off
                top='off',  # ticks along the top edge are off
                labelbottom='off')  # labels along the bottom edge are off
            plt.tight_layout()
            plt.show()

            plt.figure()
            plt.boxplot(sensitivity)
            plt.ylim([-0.05, 1.05])
            plt.ylabel('Sensitivity')
            plt.tick_params(
                axis='x',  # changes apply to the x-axis
                which='both',  # both major and minor ticks are affected
                bottom='off',  # ticks along the bottom edge are off
                top='off',  # ticks along the top edge are off
                labelbottom='off')  # labels along the bottom edge are off
            plt.tight_layout()
            plt.show()

            plt.figure()
            plt.boxplot(specificity)
            plt.ylim([-0.05, 1.05])
            plt.ylabel('Specificity')
            plt.tick_params(
                axis='x',  # changes apply to the x-axis
                which='both',  # both major and minor ticks are affected
                bottom='off',  # ticks along the bottom edge are off
                top='off',  # ticks along the top edge are off
                labelbottom='off')  # labels along the bottom edge are off
            plt.tight_layout()
            plt.show()

    return stats
Example #22
def plot_single_SVM(prediction,
                    mutation_data,
                    label_type,
                    show_plots=False,
                    show_ROC=False):
    if type(prediction) is not pd.core.frame.DataFrame:
        if os.path.isfile(prediction):
            prediction = pd.read_hdf(prediction)

    keys = prediction.keys()
    SVMs = list()
    label = keys[0]
    SVMs = prediction[label]['classifiers']

    Y_test = prediction[label]['Y_test']
    X_test = prediction[label]['X_test']
    Y_train = prediction[label]['Y_train']
    Y_score = list()

    # print(len(X_test[0][0]))
    # print(config)
    # X_train = data2['19q']['X_train']
    # Y_train = data2['19q']['Y_train']
    # mutation_data = gp.load_mutation_status(patientinfo, [[label]])
    if type(mutation_data) is not dict:
        if os.path.isfile(mutation_data):
            mutation_data = gp.load_mutation_status(mutation_data,
                                                    [[label_type]])

    patient_IDs = mutation_data['patient_IDs']
    mutation_label = mutation_data['mutation_label']
    # mutation_name = mutation_data['mutation_name']

    # print(len(SVMs))
    N_iterations = float(len(SVMs))

    # mutation_label = np.asarray(mutation_label)

    sensitivity = list()
    specificity = list()
    precision = list()
    accuracy = list()
    auc = list()
    # auc_train = list()
    f1_score_list = list()

    patient_classification_list = dict()

    for i in range(0, len(Y_test)):
        # print(Y_test[i])
        # if Y_test[i].shape[1] > 1:
        #     # print(Y_test[i])
        #     y_truth = np.prod(Y_test[i][:, 0:2], axis=1)
        # else:
        #     y_truth_test = Y_test[i]
        test_patient_IDs = prediction[label]['patient_ID_test'][i]

        if 'LGG-Radiogenomics-046' in test_patient_IDs:
            wrong_index = np.where(test_patient_IDs == 'LGG-Radiogenomics-046')
            test_patient_IDs = np.delete(test_patient_IDs, wrong_index)
            X_temp = X_test[i]
            print(X_temp.shape)
            X_temp = np.delete(X_test[i], wrong_index, axis=0)
            print(X_temp.shape)

            # X_test.pop(wrong_index[0])

            # print(len(X_test))
        else:
            X_temp = X_test[i]

        test_indices = list()
        for i_ID in test_patient_IDs:
            test_indices.append(np.where(patient_IDs == i_ID)[0][0])

            if i_ID not in patient_classification_list:
                patient_classification_list[i_ID] = dict()
                patient_classification_list[i_ID]['N_test'] = 0
                patient_classification_list[i_ID]['N_correct'] = 0
                patient_classification_list[i_ID]['N_wrong'] = 0

            patient_classification_list[i_ID]['N_test'] += 1

        y_truth = [mutation_label[0][k] for k in test_indices]
        # print(y_truth)
        # print(y_truth_test)
        # print(test_patient_IDs)

        y_predict_1 = SVMs[i].predict(X_temp)

        # print(y_predict_1).shape

        y_prediction = y_predict_1
        # y_prediction = np.prod(y_prediction, axis=0)

        print "Truth: ", y_truth
        print "Prediction: ", y_prediction

        for i_truth, i_predict, i_test_ID in zip(y_truth, y_prediction,
                                                 test_patient_IDs):
            if i_truth == i_predict:
                patient_classification_list[i_test_ID]['N_correct'] += 1
            else:
                patient_classification_list[i_test_ID]['N_wrong'] += 1

        # print('bla')
        # print(y_truth)
        # print(y_prediction)

        c_mat = confusion_matrix(y_truth, y_prediction)
        TN = c_mat[0, 0]
        FN = c_mat[1, 0]
        TP = c_mat[1, 1]
        FP = c_mat[0, 1]

        if FN == 0 and TP == 0:
            sensitivity.append(0)
        else:
            sensitivity.append(float(TP) / (TP + FN))
        if FP == 0 and TN == 0:
            specificity.append(0)
        else:
            specificity.append(float(TN) / (FP + TN))
        if TP == 0 and FP == 0:
            precision.append(0)
        else:
            precision.append(float(TP) / (TP + FP))
        accuracy.append(accuracy_score(y_truth, y_prediction))
        y_score = SVMs[i].decision_function(X_temp)
        Y_score.append(y_score)
        auc.append(roc_auc_score(y_truth, y_score))
        f1_score_list.append(
            f1_score(y_truth, y_prediction, average='weighted'))

        # if show_ROC:
        #     ROC_target_folder = '/archive/wkessels/output/ROC_temp/'
        #     if not os.path.exists(ROC_target_folder):
        #         os.makedirs(ROC_target_folder)
        #
        #     luck = [0, 1]
        #
        #     fpr, tpr, _ = roc_curve(y_truth, y_score)
        #     plt.figure()
        #     plt.plot(fpr, tpr, color='blue', label='ROC (AUC = {})'.format(auc[-1]))
        #     plt.plot(luck, luck, '--', color='red', label='luck')
        #     plt.xlabel('1-specificity')
        #     plt.ylabel('sensitivity')
        #     plt.axis([0, 1, 0, 1])
        #     plt.legend()
        #     plt.savefig(ROC_target_folder + 'ROC_cv{}.png'.format(i))
        #     print('Saved ROC figure in {}!'.format(ROC_target_folder))

    # Adjusted according to "Inference for the Generelization error"

    accuracy_mean = np.mean(accuracy)
    S_uj = 1.0 / max((N_iterations - 1), 1) * np.sum(
        (accuracy_mean - accuracy)**2.0)

    print Y_test
    N_1 = float(len(Y_train[0]))
    N_2 = float(len(Y_test[0]))

    print(N_1)
    print(N_2)

    accuracy_var = np.sqrt((1.0 / N_iterations + N_2 / N_1) * S_uj)
    print(accuracy_var)
    print(np.sqrt(1 / N_iterations * S_uj))
    print(st.sem(accuracy))

    stats = dict()
    stats["Accuracy 95%:"] = str(
        compute_CI.compute_confidence(accuracy, N_1, N_2, 0.95))

    stats["AUC 95%:"] = str(compute_CI.compute_confidence(auc, N_1, N_2, 0.95))

    stats["F1-score 95%:"] = str(
        compute_CI.compute_confidence(f1_score_list, N_1, N_2, 0.95))

    stats["Precision 95%:"] = str(
        compute_CI.compute_confidence(precision, N_1, N_2, 0.95))

    stats["Sensitivity 95%: "] = str(
        compute_CI.compute_confidence(sensitivity, N_1, N_2, 0.95))

    stats["Specificity 95%:"] = str(
        compute_CI.compute_confidence(specificity, N_1, N_2, 0.95))

    print("Accuracy 95%:" +
          str(compute_CI.compute_confidence(accuracy, N_1, N_2, 0.95)))

    print("AUC 95%:" + str(compute_CI.compute_confidence(auc, N_1, N_2, 0.95)))

    print("F1-score 95%:" +
          str(compute_CI.compute_confidence(f1_score_list, N_1, N_2, 0.95)))

    print("Precision 95%:" +
          str(compute_CI.compute_confidence(precision, N_1, N_2, 0.95)))

    print("Sensitivity 95%: " +
          str(compute_CI.compute_confidence(sensitivity, N_1, N_2, 0.95)))

    print("Specificity 95%:" +
          str(compute_CI.compute_confidence(specificity, N_1, N_2, 0.95)))

    what_to_print = ['always', 'mostly']
    for what in what_to_print:
        if what == 'always':
            alwaysright = dict()
            alwayswrong = dict()
            for i_ID in patient_classification_list:
                percentage_right = patient_classification_list[i_ID][
                    'N_correct'] / float(
                        patient_classification_list[i_ID]['N_test'])

                # print(i_ID + ' , ' + str(patient_classification_list[i_ID]['N_test']) + ' : ' + str(percentage_right) + '\n')
                if percentage_right == 1.0:
                    label = mutation_label[0][np.where(i_ID == patient_IDs)]
                    label = label[0][0]
                    alwaysright[i_ID] = label
                    # alwaysright.append(('{} ({})').format(i_ID, label))
                    print(("Always Right: {}, label {}").format(i_ID, label))

                if percentage_right == 0:
                    label = mutation_label[0][np.where(
                        i_ID == patient_IDs)].tolist()
                    label = label[0][0]
                    alwayswrong[i_ID] = label
                    # alwayswrong.append(('{} ({})').format(i_ID, label))
                    print(("Always Wrong: {}, label {}").format(i_ID, label))

            stats["Always right"] = alwaysright
            stats["Always wrong"] = alwayswrong
        elif what == 'mostly':
            margin = 0.2
            min_right = 1 - margin  # threshold for "mostly right"
            max_right = margin  # threshold for "mostly wrong"
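            # With margin = 0.2, "mostly right" means a patient was classified
            # correctly in more than 80% of the cross-validation iterations in
            # which it appeared in the test set, and "mostly wrong" in fewer
            # than 20%.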
            mostlyright = dict()
            mostlywrong = dict()

            for i_ID in patient_classification_list:
                percentage_right = patient_classification_list[i_ID][
                    'N_correct'] / float(
                        patient_classification_list[i_ID]['N_test'])

                if percentage_right > min_right:
                    label = mutation_label[0][np.where(i_ID == patient_IDs)]
                    label = label[0][0]
                    mostlyright[i_ID] = [
                        label, "{}%".format(100 * percentage_right)
                    ]
                    print((
                        "Mostly Right: {}, label {}, percentage: {}%").format(
                            i_ID, label, 100 * percentage_right))

                if percentage_right < max_right:
                    label = mutation_label[0][np.where(
                        i_ID == patient_IDs)].tolist()
                    label = label[0][0]
                    mostlywrong[i_ID] = [
                        label, "{}%".format(100 * percentage_right)
                    ]
                    print((
                        "Mostly Wrong: {}, label {}, percentage: {}%").format(
                            i_ID, label, 100 * percentage_right))

            stats["Mostly right"] = mostlyright
            stats["Mostly wrong"] = mostlywrong
        else:
            raise ValueError('Unknown print option: {}'.format(what))

    if show_plots:
        import matplotlib.pyplot as plt

        plt.figure()
        plt.boxplot(accuracy)
        plt.ylim([-0.05, 1.05])
        plt.ylabel('Accuracy')
        plt.tick_params(
            axis='x',  # changes apply to the x-axis
            which='both',  # both major and minor ticks are affected
            bottom='off',  # ticks along the bottom edge are off
            top='off',  # ticks along the top edge are off
            labelbottom='off')  # labels along the bottom edge are off
        plt.tight_layout()
        plt.show()

        plt.figure()
        plt.boxplot(auc)
        plt.ylim([-0.05, 1.05])
        plt.ylabel('AUC')
        plt.tick_params(
            axis='x',  # changes apply to the x-axis
            which='both',  # both major and minor ticks are affected
            bottom='off',  # ticks along the bottom edge are off
            top='off',  # ticks along the top edge are off
            labelbottom='off')  # labels along the bottom edge are off
        plt.tight_layout()
        plt.show()

        plt.figure()
        plt.boxplot(precision)
        plt.ylim([-0.05, 1.05])
        plt.ylabel('Precision')
        plt.tick_params(
            axis='x',  # changes apply to the x-axis
            which='both',  # both major and minor ticks are affected
            bottom='off',  # ticks along the bottom edge are off
            top='off',  # ticks along the top edge are off
            labelbottom='off')  # labels along the bottom edge are off
        plt.tight_layout()
        plt.show()

        plt.figure()
        plt.boxplot(sensitivity)
        plt.ylim([-0.05, 1.05])
        plt.ylabel('Sensitivity')
        plt.tick_params(
            axis='x',  # changes apply to the x-axis
            which='both',  # both major and minor ticks are affected
            bottom='off',  # ticks along the bottom edge are off
            top='off',  # ticks along the top edge are off
            labelbottom='off')  # labels along the bottom edge are off
        plt.tight_layout()
        plt.show()

        plt.figure()
        plt.boxplot(specificity)
        plt.ylim([-0.05, 1.05])
        plt.ylabel('Specificity')
        plt.tick_params(
            axis='x',  # changes apply to the x-axis
            which='both',  # both major and minor ticks are affected
            bottom='off',  # ticks along the bottom edge are off
            top='off',  # ticks along the top edge are off
            labelbottom='off')  # labels along the bottom edge are off
        plt.tight_layout()
        plt.show()

    # Save Y_score values
    Y_score_dict = dict()
    for j in range(len(Y_score)):
        Y_score_dict['CV_{}'.format(j)] = Y_score[j]
    Y_score = pd.DataFrame(Y_score_dict)
    Y_score.to_hdf('/archive/wkessels/output/Lipo_SVM/Y_score.hdf5', 'Y_score')

    # write_to_txt('Y_test', Y_test, ROC_data_folder)
    # write_to_txt('X_test', X_test, ROC_data_folder)
    # write_to_txt('Y_train', Y_train, ROC_data_folder)
    # write_to_txt('mutation_data', mutation_data, ROC_data_folder)
    # write_to_txt('patient_IDs', patient_IDs, ROC_data_folder)
    # write_to_txt('mutation_label',mutation_label, ROC_data_folder)
    # write_to_txt('y_truth', y_truth, ROC_data_folder)
    # write_to_txt('y_prediction', y_prediction, ROC_data_folder)
    # write_to_txt('y_score', y_score, ROC_data_folder)
    # write_to_txt('N_1', N_1, ROC_data_folder)
    # write_to_txt('N_2', N_2, ROC_data_folder)
    # write_to_txt('stats', stats, ROC_data_folder)

    return stats
Example #23
        y_pred = pool_classifiers.predict(Xi_test)
        acc_subsample[i // 10] = np.mean(y_pred == yi_test)
        f1_subsample[i // 10] = f1_score(yi_test, y_pred, average='weighted')
        #gmean_subsample[i/10]= geometric_mean_score(yi_test, y_pred, average='weighted')
        auc_subsample[i // 10] = roc_auc_score(yi_test, y_pred,
                                               average='weighted')
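        # Note: roc_auc_score is given hard class predictions here (the output
        # of predict), which collapses the ROC curve to a single operating
        # point; continuous scores from predict_proba or decision_function
        # would usually give a more informative AUC.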

        #fpr, tpr, thresholds = roc_curve(yi_test, y_pred)
        #auc_subsample[i/10]= auc(fpr, tpr)

    accuracy.append(acc_subsample)
    f1.append(f1_subsample)
    #    gmean.append(gmean_subsample)
    auc.append(auc_subsample)

##np.save(dataset+"Perceptron_bagging_acc.py", accuracy)
##np.save(dataset+"Perceptron_bagging_f1.py", f1)
##np.save(dataset+"Perceptron_bagging_gmean.py", gmean)
##np.save(dataset+"Perceptron_bagging_auc.py", auc)

##accuracy = []
##fold=0
##kfold = KFold(n_splits=10, shuffle=False, random_state=1)
##for train, test in kfold.split(X):
##    Xi_train = X[train]
##    yi_train = y[train]
##
##    Xi_test = X[test]
##    yi_test = y[test]
Example #24
end = 0
print(len(list_exc))
steps = range(0, len(list_exc), 20)
print(steps)
feature_ids = []
auc = []
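# Forward feature addition: indices from list_exc (presumably ranked by an
# earlier selection step) are added in blocks of 20; after each block the
# previously defined classifier clf is refit and the test-set AUC is recorded.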
for i in range(0, len(steps) - 1):
    begin = steps[i]
    end = steps[i + 1]
    feature_ids.extend(list_exc[range(begin, end)])
    x_train = train_data[:, feature_ids]
    x_valid = valid_data[:, feature_ids]
    x_test = test_data[:, feature_ids]
    clf.fit(x_train, y_train)
    dec_val_test = clf.decision_function(x_test)
    auc.append(roc_auc_score(y_test, dec_val_test))
    #print feature_ids
# Add the remaining features that fall beyond the last full step of 20
feature_ids.extend(list_exc[range(end, len(list_exc))])
#print feature_ids
x_train = train_data[:, feature_ids]
x_valid = valid_data[:, feature_ids]
x_test = test_data[:, feature_ids]
clf.fit(x_train, y_train)
dec_val_test = clf.decision_function(x_test)
auc.append(roc_auc_score(y_test, dec_val_test))
clf2 = linear_model.LogisticRegression(C=0.01, penalty='l1', solver='liblinear')  # liblinear supports the l1 penalty
clf2.fit(x_train, y_train)
dec_val_test = clf2.decision_function(x_test)
auc_lasso = roc_auc_score(y_test, dec_val_test)
lasso = [auc_lasso] * len(list_exc)
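# auc_lasso is repeated len(list_exc) times, presumably so it can be drawn as a
# flat reference line against the incremental-AUC curve in a later plot.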
#plt.title(",fontsize=18)
Example #25
def main():
	from pandas import read_csv
	# Read in the data

	# # NORMAL DATA
	# DATA = read_csv("norm_data__non_log.txt",sep='\t').T
	# DATA = DATA.apply(np.log).values # Retain the log due to the maximising values
	
	# MIN MAX DATA
	DATA = read_csv("norm_data__non_log.txt",sep='\t').T
	label = read_csv("sample_list.csv",sep=';')
	DATA = DATA.apply(np.log).values # Retain the log due to the maximising values

	# Conversion of string to bool
	mapping = {'Non-LCa':0,'LCa':1}
	TARGET = label.Disease.map(mapping).values

	print(DATA.shape)

	DATA = boost_select(DATA,TARGET)

	kf = KFold(n_splits=5, random_state=seed, shuffle=True)
	acc = []
	prec = []
	recall = []
	auc = []
	with open('results_deep.txt', 'w') as f:
		for train_index, test_index in kf.split(DATA):
			X_train, X_test, y_train, y_test = DATA[train_index],DATA[test_index],TARGET[train_index],TARGET[test_index]
			
			from sklearn.preprocessing import MinMaxScaler
			scaler = MinMaxScaler().fit(X_train)
			X_train = scaler.transform(X_train)
			X_test = scaler.transform(X_test)

			ada = ADASYN()
			X_train, y_train = ada.fit_resample(X_train,y_train)

			nb_epoch = 80
			batch_size = 64
			input_dim = DATA.shape[1] 
			learning_rate = 1e-7
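			# NOTE: despite its name, learning_rate is only used below as the L2
			# activity-regularization factor on the first Dense layer; the model
			# is compiled with the default 'adam' optimizer settings.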

			input_layer = Input(shape=(input_dim, ))

			net = Dense(200,activation="relu",activity_regularizer=regularizers.l2(learning_rate))(input_layer)
			net = Dense(400, activation="relu")(net)
			net = Dense(600, activation="relu")(net)
			net = Dense(800, activation="relu")(net)
			net = Dense(1000,activation="relu")(net)
			net = Dense(800,activation="relu")(net)
			net = Dense(600, activation="relu")(net)
			net = Dense(400, activation="relu")(net)
			net = Dense(200, activation="relu")(net)



			output_layer = Dense(1, activation='sigmoid')(net)

			
			model = Model(inputs=input_layer, outputs=output_layer)
			model.compile(metrics=['accuracy',precision_m,recall_m,f1_m],
		                    loss='binary_crossentropy',
		                    optimizer='adam')


			cp = ModelCheckpoint(filepath="NeuralNetworkModel.h5",
			                               save_best_only=True,
			                               verbose=0)
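			# ModelCheckpoint with save_best_only=True monitors val_loss by
			# default, so the weights reloaded after training correspond to the
			# epoch with the lowest validation loss.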

			tb = TensorBoard(log_dir='./logs',
			                histogram_freq=0,
			                write_graph=True,
			                write_images=True)

			history = model.fit(X_train, y_train,
			                    epochs=nb_epoch,
			                    batch_size=batch_size,
			                    shuffle=True,
			                    validation_data=(X_test, y_test),
			                    verbose=1,
			                    callbacks=[cp, tb]).history


			# This is to figure out the correct number of epochs before training
			# becomes redundant. Here, it is discovered 80 epochs satisfies this problem
			# Uncomment the code to visualise the test v train loss plot. 
			# plt.plot(history['loss'], linewidth=2, label='Train')
			# plt.plot(history['val_loss'], linewidth=2, label='Test')
			# plt.legend(loc='upper right')
			# plt.title('Model loss')
			# plt.ylabel('Loss')
			# plt.xlabel('Epoch')
			# #plt.ylim(ymin=0.70,ymax=1)
			# plt.show()

			# load weights
			model.load_weights("NeuralNetworkModel.h5")
			# Compile model (required to make predictions)
			model.compile(metrics=['accuracy',precision_m,recall_m,f1_m],
		                    loss='binary_crossentropy',
		                    optimizer='adam')
			y_pred = model.predict(X_test)

			auc.append(roc_auc_score(y_test, y_pred))
			recall.append(recall_score(y_test,np.round(y_pred,0)))
			prec.append(precision_score(y_test,np.round(y_pred,0)))
			acc.append(accuracy_score(y_test,np.round(y_pred,0)))

		print(np.mean(auc),np.mean(recall),np.mean(prec),np.mean(acc))
		print("MODEL 9 Hidden, Hidden Nodes [200,400,600,800,1000,800,600,400,200], \n L2 Regulariser Layer 1, Epoch: 80, LearnRate: 1e-7, Loss: Binary Cross, Opt: ADAM \n CV: 5 \n\n\n",file=f)
		print('N_FEATURES: {}, AUC: {}, RECALL: {}, PRECISION: {}, ACCURACY: {}, '.format(42,auc,recall,prec,acc),file=f)
####  5. Build a Ridge-based (L2-regularized) logistic regression model  #####
########################################
all_features = list(train_data.columns)
all_features.remove('ID')
all_features.remove('flag')

# For the choice of parameter C, use cross-validation to pick the best value
C_list = np.arange(0.01, 1, 0.01)
auc = []
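# Note: C in sklearn's LogisticRegression is the inverse regularization strength
# and the default penalty is 'l2' (hence "Ridge"), so smaller C means stronger
# shrinkage. Each candidate C below is scored on a fresh random 80/20 hold-out
# split rather than k-fold CV, so the selection is somewhat noisy.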
for c in C_list:
    train2, validation = model_selection.train_test_split(train_data,
                                                          test_size=0.2)
    LR = LogisticRegression(C=c).fit(train2[all_features], train2['flag'])
    pred = LR.predict_proba(validation[all_features])[:, 1]
    test_auc = metrics.roc_auc_score(validation['flag'], pred)
    auc.append(test_auc)

position = auc.index(max(auc))
C_best = C_list[position]
print(max(auc))

LR = LogisticRegression(C=C_best).fit(train_data[all_features],
                                      train_data['flag'])
pred = LR.predict_proba(train_data[all_features])

#### Plot the ROC curve
mean_tpr = 0.0
mean_fpr = np.linspace(0, 1, 100)
all_tpr = []

fpr, tpr, thresholds = roc_curve(train_data['flag'], pred[:, 1])
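# The header above announces an ROC plot, but this example stops after computing
# fpr/tpr. A minimal sketch of the missing plot follows (note that pred was
# computed on the training data, so this is an in-sample curve; matplotlib and
# the sklearn metrics module imported earlier in the example are assumed):
import matplotlib.pyplot as plt

plt.plot(fpr, tpr, label='ROC (AUC = %.3f)' % metrics.auc(fpr, tpr))
plt.plot([0, 1], [0, 1], 'r--', label='chance')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.legend(loc='lower right')
plt.show()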
Example #27
    tmp_auc=[]
    tmp_acc=[]
    tmp_f1=[]
    initial_time=time.time()
    for i in range(5):
        X_train, X_test, y_train, y_test = model_selection.train_test_split(
            data, mark, test_size=0.05, random_state=i)
        # sklearn.cross_validation was removed; model_selection (used in the
        # other examples) is assumed to be imported here.
        clf.fit(X_train, y_train)
        y_predict = clf.predict_proba(X_test)[:, 1]
        test_auc = metrics.roc_auc_score(y_test, y_predict)  # AUC on the validation split
        #print('AUC:', test_auc)
        tmp_auc.append(test_auc)
        y_pred = clf.predict(X_test)
        #print ('ACC: %.4f' % metrics.accuracy_score(y_test,y_pred))
        tmp_acc.append(metrics.accuracy_score(y_test,y_pred))
        #print ('F1-score: %.4f' %metrics.f1_score(y_test,y_predict))
        tmp_f1.append(metrics.f1_score(y_test, y_pred))
    over_time=time.time()
    auc.append(round(sum(tmp_auc)/len(tmp_auc),3))
    acc.append(round(sum(tmp_acc)/len(tmp_acc),3))
    f1.append(round(sum(tmp_f1)/len(tmp_f1),3))
    used_time.append(round(over_time-initial_time,3))
index=['AUC','ACC','F1','time']
out=[]
out.append(auc)
out.append(acc)
out.append(f1)
out.append(used_time)
data = pd.DataFrame(out,index=index)
data.to_csv('/Users/hhy/Desktop/node_model_information.csv',encoding='utf-8-sig',header=header)
def main():
    from pandas import read_csv
    # Read in the data

    # # NORMAL DATA
    # DATA = read_csv("norm_data__non_log.txt",sep='\t').T
    # DATA = DATA.apply(np.log).values # Retain the log due to the maximising values

    # MIN MAX DATA
    DATA = read_csv("norm_data__non_log.txt", sep='\t').T

    print(DATA.shape)

    label = read_csv("sample_list.csv", sep=';')
    DATA = DATA.apply(
        np.log).values  # Retain the log due to the maximising values

    # Conversion of string to bool
    mapping = {'Non-LCa': 0, 'LCa': 1}
    TARGET = label.Disease.map(mapping).values

    class_weight = {1: 2, 0: 1}
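    # A weight of 2 on class 1 (LCa) makes positive samples count twice as much
    # in the loss, partially compensating for class imbalance when passed to
    # model.fit below.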

    DATA = feature_select(DATA, TARGET, 55)
    print(DATA.shape)

    kf = KFold(n_splits=2, random_state=seed, shuffle=True)
    acc = []
    prec = []
    recall = []
    auc = []
    with open('results_total_best_cv2_lr.txt', 'w') as f:
        for train_index, test_index in kf.split(DATA):
            X_train, X_test, y_train, y_test = DATA[train_index], DATA[
                test_index], TARGET[train_index], TARGET[test_index]

            from sklearn.preprocessing import MinMaxScaler
            scaler = MinMaxScaler().fit(X_train)
            X_train = scaler.transform(X_train)
            X_test = scaler.transform(X_test)

            nb_epoch = 80
            batch_size = 64
            input_dim = DATA.shape[1]
            # learning_rate = 1
            # decay = learning_rate/nb_epoch

            input_layer = Input(shape=(input_dim, ))

            net = Dense(
                200,
                activation="relu",
                activity_regularizer=regularizers.l2(1e-7))(input_layer)
            net = Dense(400, activation="relu")(net)
            net = Dense(600, activation="relu")(net)
            net = Dense(800, activation="relu")(net)
            net = Dense(1000, activation="relu")(net)
            net = Dense(800, activation="relu")(net)
            net = Dense(600, activation="relu")(net)
            net = Dense(400, activation="relu")(net)
            net = Dense(200, activation="relu")(net)

            output_layer = Dense(1, activation='sigmoid')(net)

            adam = optimizers.Adam(lr=1e-5)

            model = Model(inputs=input_layer, outputs=output_layer)
            model.compile(metrics=['accuracy', precision_m, recall_m, f1_m],
                          loss='binary_crossentropy',
                          optimizer=adam)

            cp = ModelCheckpoint(filepath="NeuralNetworkModel.h5",
                                 save_best_only=True,
                                 verbose=0)

            tb = TensorBoard(log_dir='./logs',
                             histogram_freq=0,
                             write_graph=True,
                             write_images=True)

            history = model.fit(X_train,
                                y_train,
                                epochs=nb_epoch,
                                batch_size=batch_size,
                                shuffle=True,
                                validation_data=(X_test, y_test),
                                verbose=1,
                                class_weight=class_weight,
                                callbacks=[cp, tb]).history

            # This is to figure out the correct number of epochs before training
            # becomes redundant. Here, it is discovered 80 epochs satisfies this problem
            # Uncomment the code to visualise the test v train loss plot.
            plt.plot(history['loss'], linewidth=2, label='Train')
            plt.plot(history['val_loss'], linewidth=2, label='Test')
            plt.legend(loc='upper right')
            plt.title('Model loss')
            plt.ylabel('Loss')
            plt.xlabel('Epoch')
            plt.savefig("lossvepoch.png", dpi=300)
            #plt.ylim(ymin=0.70,ymax=1)
            plt.show()

            # load weights
            model.load_weights("NeuralNetworkModel.h5")
            # Compile model (required to make predictions)
            model.compile(metrics=['accuracy', precision_m, recall_m, f1_m],
                          loss='binary_crossentropy',
                          optimizer=adam)
            y_pred = model.predict(X_test)

            # plt.style.use(['seaborn-colorblind'])
            # fpr, tpr, _ = roc_curve(y_test, y_pred)
            # plt.figure(1)
            # plt.plot([0, 1], [0, 1], 'k--')
            # plt.plot(fpr, tpr, label='Deep Net',alpha=0.6,color='r')
            # plt.xlabel('False positive rate')
            # plt.ylabel('True positive rate')
            # plt.title('ROC curve AUC = {}'.format(roc_auc_score(y_test, y_pred)))
            # plt.legend(loc='best')
            # plt.show()

            auc.append(roc_auc_score(y_test, y_pred))
            recall.append(recall_score(y_test, np.round(y_pred, 0)))
            prec.append(precision_score(y_test, np.round(y_pred, 0)))
            acc.append(accuracy_score(y_test, np.round(y_pred, 0)))

            # AVERAGE PRECISION

            # from sklearn.metrics import average_precision_score
            # average_precision = average_precision_score(y_test, y_pred)

            # print('Average precision-recall score: {0:0.2f}'.format(
            #       average_precision))

            # # PRECISION RECALL CURVE
            # from sklearn.metrics import precision_recall_curve

            # from inspect import signature

            # precision, recall1, _ = precision_recall_curve(y_test, y_pred)

            # # In matplotlib < 1.5, plt.fill_between does not have a 'step' argument
            # step_kwargs = ({'step': 'post'}
            #                if 'step' in signature(plt.fill_between).parameters
            #                else {})
            # plt.step(recall1, precision, color='b', alpha=0.2,
            #          where='post')
            # plt.fill_between(recall1, precision, alpha=0.2, color='b', **step_kwargs)

            # plt.xlabel('Recall')
            # plt.ylabel('Precision')
            # plt.ylim([0.0, 1.05])
            # plt.xlim([0.0, 1.0])
            # plt.title('2-class Precision-Recall curve: AP={0:0.2f}'.format(
            #           average_precision))
            # plt.show()

        print(np.mean(auc), np.mean(recall), np.mean(prec), np.mean(acc))
        print(
            "MODEL 9 Hidden, Hidden Nodes [200,400,600,800,1000,800,600,400,200], \n L2 Regulariser Layer 1, Epoch: 80, LearnRate: 1e-5, Loss: Binary Cross, Opt: ADAM \n CV: 2 \n\n\n",
            file=f)
        print(
            'N_FEATURES: {}, AUC: {}, RECALL: {}, PRECISION: {}, ACCURACY: {} \n\n\n'
            .format(DATA.shape[1], auc, recall, prec, acc),
            file=f)
        print(
            'AUC MEAN: {}\n RECALL MEAN: {}\n PRECISON MEAN: {}\n ACCURACY MEAN: {}'
            .format(np.mean(auc), np.mean(recall), np.mean(prec),
                    np.mean(acc)))
Example #29
def plotScores(scores, title, xLabel, yLabel='Score'): 

    x = []

    accuracy = []
    f1 = []
    prescion = []
    recall = []
    auc = []
    
    trainaccuracy = []
    trainf1 = []
    trainprescion = []
    trainrecall = []
    trainauc = []

    for score in scores:
        x.append(score.HyperParam)
        accuracy.append(score.Accuracy)
        prescion.append(score.Precision)
        recall.append(score.Recall)
        f1.append(score.F1)
        auc.append(score.AUC)
        
        trainaccuracy.append(score.TrainAccuracy)
        trainprescion.append(score.TrainPrecision)
        trainrecall.append(score.TrainRecall)
        trainf1.append(score.TrainF1)
        trainauc.append(score.TrainAUC)
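        # The train* lists mirror the test-set metrics so train and test curves
        # can be overlaid below, which helps diagnose over- or under-fitting
        # across the hyperparameter sweep.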
     
    plotAccuracy(x, accuracy, trainaccuracy, title, xLabel, yLabel)
    plotF1(x, f1, trainf1, title, xLabel, yLabel)
    plotPrecision(x, prescion, trainprescion, title, xLabel, yLabel)
    plotRecall(x, recall, trainrecall, title, xLabel, yLabel) 

    plt.clf()
    plt.title(title)
    plt.xlabel(xLabel)
    plt.ylabel(yLabel) 

    # plt.plot(x, accuracy, label='Accuracy', color='r', lw=1.0)   
    # # plt.plot(x, recall, label='Recall', color='g', lw=2.0)   
    # plt.plot(x, f1, label='F1', color='b', lw=1.0)  
    # # plt.plot(x, prescion, label='Precision', color='k', lw=2.0) 
    plt.plot(x, auc, label='AUC')  
    
    # plt.plot(x, trainaccuracy, label='Train Accuracy', color='r',  lw=5.0,  alpha=0.25)   
    # # plt.plot(x, trainprescion, label='Train Precision', color='g', marker='o', lw=1.0, ls='--')   
    # plt.plot(x, trainf1, label='Train F1', color='b',  lw=5.0,  alpha=0.25)  
    # # plt.plot(x, trainrecall, label='Train Recall', color='k', marker='o', lw=1.0, ls='--') 
    plt.plot(x, trainauc, label='Train AUC')  

    plt.legend()
    plt.grid() 
    # plt.ylim(0, 1)
    # plt.show()

    if showPlot:
        plt.show()
    else:
        name = baseGraphPath + title + ' AUC' + '.png'
        plt.savefig(name)