Ejemplo n.º 1
0
def _fold_accuracy(model, X, y):
    """Return the fraction of rows of ``X`` whose predicted label equals ``y``.

    The predicted label is the class whose ``predict_proba`` column is
    largest.  When the model exposes ``classes_`` the winning column index is
    translated to the actual label, so label sets other than 0..k-1 are scored
    correctly; a model without ``classes_`` is scored on the raw column index.
    """
    probas = model.predict_proba(X)
    best = np.argmax(probas, axis=1)
    classes = getattr(model, 'classes_', None)
    y_hat = best if classes is None else np.asarray(classes)[best]
    # np.mean of the boolean match vector is always a float ratio.
    return float(np.mean(y_hat == np.asarray(y)))


def acc(classifier,
        fname,
        labelF,
        indF,
        splits=10,
        fselect='None',
        nfeat=100,
        featmin=3,
        a=.05,
        header=True,
        hmap=False,
        f=True,
        chance=755.0 / 2126):
    """Cross-validate ``classifier`` on CSV data and summarise its accuracy.

    Parameters
    ----------
    classifier : object
        Estimator providing ``fit(X, y)`` (returning the fitted model) and
        ``predict_proba(X)``.  The same object is refitted on every fold.
    fname : str
        Path, without the ``.csv`` suffix, of the feature table.
    labelF : str
        Path, without ``.csv``, of the label table (header row, first column
        used as index, first remaining column used as the label).
    indF : str
        Path, without ``.csv``, of the header-less fold table.  Rows with a
        value > 0 are kept; the first column gives the fold number (1-based)
        in which the row is held out.
    splits : int
        Number of folds; fold numbers ``1..splits`` are evaluated.
    fselect : str
        Feature-selection flags, matched as substrings: ``'min'`` keeps columns
        that are non-zero in more than ``featmin`` training rows; then at most
        one of ``'MI'`` (SelectKBest / mutual information), ``'PCA'`` or
        ``'reg'`` (SelectFpr / f_classif) is applied.
    nfeat : int
        ``k`` for ``'MI'`` or ``n_components`` for ``'PCA'``.
    featmin : int
        Threshold used by the ``'min'`` filter.
    a : float
        ``alpha`` for the ``'reg'`` selector.
    header : bool
        Whether the feature table has a header row and an index column.
    hmap : bool
        If true, call ``heatmap`` on the train and test matrices of each fold.
    f : bool
        If true, install a global filter ignoring ``ConvergenceWarning``.
    chance : float
        Reference accuracy for the one-sample t-tests.  The default keeps the
        previously hard-coded 755 / 2126 (presumably a class proportion of the
        original data set -- confirm before reusing on other data).

    Returns
    -------
    tuple
        ``(mean_test_acc, std_test_acc, p_test, mean_train_acc,
        std_train_acc, p_train)``.
    """
    if f:
        filterwarnings("ignore", category=ConvergenceWarning)
    test_scores = []
    train_scores = []

    # load data
    if header:
        cts = pd.read_csv(fname + '.csv',
                          header=0,
                          index_col=0,
                          dtype={0: str})
    else:
        cts = pd.read_csv(fname + '.csv', header=None, index_col=None)
    ind = pd.read_csv(indF + '.csv', header=None)
    label = pd.read_csv(labelF + '.csv', header=0, index_col=0)

    keep = np.where(ind > 0)[0]
    # A feature table that already has exactly one row per kept sample is
    # used as is; otherwise it is subset to the kept rows.
    phi = cts if cts.shape[0] == len(keep) else cts.iloc[keep]
    cancer = label.iloc[keep, 0]
    folds = np.asarray(ind.iloc[keep, 0])

    for i in range(splits):
        held_out = i + 1
        train_rows = np.flatnonzero(folds != held_out)
        test_rows = np.flatnonzero(folds == held_out)

        # phi is a 2-D DataFrame, so no reshaping is needed; keeping the
        # pandas objects lets the 'min' filter below use .iloc on any
        # pandas/numpy version.
        X = phi.iloc[train_rows]
        y = cancer.iloc[train_rows]
        X_test = phi.iloc[test_rows]
        y_test = cancer.iloc[test_rows]

        # subset features
        if 'min' in fselect:
            cols = np.where(X.astype(bool).sum(axis=0) > featmin)[0]
            X = X.iloc[:, cols]
            X_test = X_test.iloc[:, cols]

        # Selectors are fitted on the training fold only and then applied to
        # both partitions.
        selector = None
        if 'MI' in fselect:
            selector = SelectKBest(mutual_info_classif, k=nfeat).fit(X, y)
        elif 'PCA' in fselect:
            selector = PCA(n_components=nfeat).fit(X)
        elif 'reg' in fselect:
            selector = SelectFpr(f_classif, alpha=a).fit(X, y)
        if selector is not None:
            X = selector.transform(X)
            X_test = selector.transform(X_test)

        if hmap:
            heatmap(X, y, tail='_{0}_train'.format(i))
            heatmap(X_test, y_test, tail='_{0}_test'.format(i))

        # fit model
        model = classifier.fit(X, y)

        # accuracy on the held-out fold, then on the training data
        test_scores.append(_fold_accuracy(model, X_test, y_test))
        train_scores.append(_fold_accuracy(model, X, y))

    p_val = stats.ttest_1samp(test_scores, popmean=chance)[1]
    p_val_tr = stats.ttest_1samp(train_scores, popmean=chance)[1]

    return (np.mean(test_scores), np.std(test_scores), p_val,
            np.mean(train_scores), np.std(train_scores), p_val_tr)
Ejemplo n.º 2
0
def roc(classifier, mdict, pname, splits=10, fselect='None', nfeat=100):
    """Plot per-fold and mean ROC curves and summarise the AUCs.

    Parameters
    ----------
    classifier : object
        Estimator providing ``fit(X, y)`` (returning the fitted model) and
        ``predict_proba(X)``; column 1 of the probabilities is used as the
        score passed to ``roc_curve``.  The same object is refitted per fold.
    mdict : mapping
        Must provide ``'phi'``, ``'testPhi'``, ``'cvTrainASD'`` and
        ``'cvTestASD'``; each is indexed with ``(fold, 0)`` to obtain that
        fold's training features, test features, training labels and test
        labels.  (Presumably cell arrays loaded from a MATLAB file -- confirm
        against the caller.)
    pname : str
        Base name of the figure, written to ``'plots/<pname>.png'``.
    splits : int
        Number of folds to evaluate (fold indices ``0..splits-1``).
    fselect : str
        ``'MI'`` for SelectKBest with mutual information, ``'PCA'`` for PCA;
        any other value disables feature reduction.
    nfeat : int
        ``k`` for ``'MI'`` or ``n_components`` for ``'PCA'``.

    Returns
    -------
    tuple
        ``(mean_auc, std_auc, p_val, mean_train_auc, std_train_auc,
        p_val_tr)`` where ``mean_auc`` is the AUC of the averaged test ROC
        curve, ``std_auc`` the standard deviation of the per-fold test AUCs,
        and the p-values come from one-sample t-tests of the per-fold AUCs
        against 0.5.
    """
    tprs = []
    aucs = []
    aucs_tr = []
    mean_fpr = np.linspace(0, 1, 100)

    # load data
    phi = mdict.get('phi')
    testPhi = mdict.get('testPhi')
    asd = mdict.get('cvTrainASD')
    testASD = mdict.get('cvTestASD')

    for i in range(splits):
        # Flatten every sample to one feature vector; all trailing axes are
        # merged, so 3-D input behaves as before and 2-D input is accepted.
        X = phi[(i, 0)]
        s = X.shape
        X = np.reshape(X, [s[0], int(np.prod(s[1:]))])
        y = np.reshape(asd[(i, 0)], s[0])

        X_test = testPhi[(i, 0)]
        s = X_test.shape
        X_test = np.reshape(X_test, [s[0], int(np.prod(s[1:]))])
        y_test = np.reshape(testASD[(i, 0)], s[0])

        # Feature reduction is fitted on the training fold only and then
        # applied to both partitions.
        reducer = None
        if fselect == 'MI':
            reducer = SelectKBest(mutual_info_classif, k=nfeat).fit(X, y)
        elif fselect == 'PCA':
            reducer = PCA(n_components=nfeat).fit(X)
        if reducer is not None:
            X = reducer.transform(X)
            X_test = reducer.transform(X_test)

        # fit model
        model = classifier.fit(X, y)

        # ROC curve and AUC on the held-out fold
        probas_ = model.predict_proba(X_test)
        fpr, tpr, thresholds = roc_curve(y_test, probas_[:, 1])
        # Resample the curve on a common FPR grid so folds can be averaged.
        tprs.append(np.interp(mean_fpr, fpr, tpr))
        tprs[-1][0] = 0.0
        roc_auc = auc(fpr, tpr)
        aucs.append(roc_auc)
        plt.plot(fpr, tpr, lw=1, alpha=0.3,
                 label='ROC fold %d (AUC = %0.2f)' % (i, roc_auc))

        # AUC on the training set
        probas_ = model.predict_proba(X)
        fpr, tpr, thresholds = roc_curve(y, probas_[:, 1])
        aucs_tr.append(auc(fpr, tpr))

    plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r',
             label='Luck', alpha=.8)

    mean_tpr = np.mean(tprs, axis=0)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    std_auc = np.std(aucs)
    plt.plot(mean_fpr, mean_tpr, color='b',
             label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),
             lw=2, alpha=.8)

    # Shade one standard deviation around the mean curve, clipped to [0, 1].
    std_tpr = np.std(tprs, axis=0)
    tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
    tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
    plt.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2,
                     label=r'$\pm$ 1 std. dev.')

    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc="lower right", prop={'size': 6})
    plt.savefig('plots/' + pname + '.png')
    plt.clf()

    p_val = stats.ttest_1samp(aucs, popmean=0.5)[1]
    p_val_tr = stats.ttest_1samp(aucs_tr, popmean=0.5)[1]

    return mean_auc, std_auc, p_val, np.mean(aucs_tr), np.std(aucs_tr), p_val_tr