Example 1
from sklearn.kernel_ridge import KernelRidge
from sklearn.model_selection import LeavePOut


def lpo_sklearn(X, y, regparam):
    lpo = LeavePOut(p=2)
    preda = []
    predb = []
    for train, test in lpo.split(X):
        # pass the regularization parameter through to KernelRidge
        rls = KernelRidge(alpha=regparam, kernel="rbf", gamma=0.01)
        rls.fit(X[train], y[train])
        p = rls.predict(X[test])
        preda.append(p[0])
        predb.append(p[1])
    return preda, predb
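

# Added usage sketch (assumptions: random toy data; regparam is passed through as
# KernelRidge's alpha regularization parameter):
if __name__ == "__main__":
    import numpy as np

    rng = np.random.RandomState(0)
    X_demo = rng.randn(8, 3)
    y_demo = rng.randn(8)
    preda, predb = lpo_sklearn(X_demo, y_demo, regparam=1.0)
    print(len(preda))  # C(8, 2) = 28 leave-pair-out prediction pairs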
Example 2
def main():
    path_boy = "F:\\study in school\\machine learning\\forstudent\\实验数据\\boynew.txt"
    path_girl = "F:\\study in school\\machine learning\\forstudent\\实验数据\\girlnew.txt"
    # height = []
    # weight = []
    # feetsize = []
    x_boy = []
    x_girl = []
    label_boy = []  # 1 = male, 0 = female
    label_girl = []
    readdata1(path_boy, x_boy, label_boy, 1)
    readdata1(path_girl, x_girl, label_girl, 0)
    x_boy = np.mat(x_boy)
    x_girl = np.mat(x_girl)
    m1 = x_boy.mean(0)
    m0 = x_girl.mean(0)
    S1 = (x_boy - m1[0]).T * (x_boy - m1[0])
    S0 = (x_girl - m0[0]).T * (x_girl - m0[0])
    Sw = S1 + S0
    S_inverse = Sw.I
    W = S_inverse * (m1 - m0).T
    M1 = float(W.T * m1.T)
    M0 = float(W.T * m0.T)
    w_decision0 = (M0 + M1) / 2
    path_boy_test = "F:\\study in school\\machine learning\\forstudent\\实验数据\\boy.txt"
    path_girl_test = "F:\\study in school\\machine learning\\forstudent\\实验数据\\girl.txt"
    x = []
    label = []
    readdata1(path_boy_test, x, label, 1)
    readdata1(path_girl_test, x, label, 0)
    label_test = []
    y = x * W
    errorcount = 0
    for i in range(len(label)):
        if float(y[i]) > w_decision0:
            label_test.append(1)
            if label[i] != 1:
                errorcount = errorcount + 1
        else:
            label_test.append(0)
            if label[i] != 0:
                errorcount = errorcount + 1

    e_percentage = errorcount / len(label_test)
    print('Fisher test set error rate: %f' % e_percentage)

    # Leave-one-out cross-validation
    loo = LeavePOut(p=1)
    error = 0
    for train, test in loo.split(x, label):
        x_boy = []
        x_girl = []
        label_boy = []  # 1 = male, 0 = female
        label_girl = []
        for i in train:
            if label[i] == 1:
                x_boy.append(x[i])
                label_boy.append(1)
            else:
                x_girl.append(x[i])
                label_girl.append(0)
        x_boy = np.mat(x_boy)
        x_girl = np.mat(x_girl)
        m1 = x_boy.mean(0)
        m0 = x_girl.mean(0)
        S1 = (x_boy - m1[0]).T * (x_boy - m1[0])
        S0 = (x_girl - m0[0]).T * (x_girl - m0[0])
        Sw = S1 + S0
        S_inverse = Sw.I
        W = S_inverse * (m1 - m0).T
        M1 = float(W.T * m1.T)
        M0 = float(W.T * m0.T)
        w_decision0 = (M0 + M1) / 2

        for j in test:
            if float(x[j] * W) > w_decision0:
                if label[j] != 1:
                    error = error + 1
            else:
                if label[j] != 0:
                    error = error + 1

    print('Fisher leave-one-out error rate: %f' % (error / len(label)))

    figure(3)
    FPR, TPR = get_roc_fisher(W, w_decision0, x, label)
    plot(FPR, TPR, label='fisher')

    figure(5)
    x1 = np.arange(130, 190, 0.01)
    y1 = (w_decision0 - W[0] * x1) / W[1]
    plot(x1, array(y1)[0])
    plot(x1, x1 * float(W[1]) / float(W[0]))
    for i in range(len(label)):
        if label[i] == 1:
            plot(float(x[i][0]), float(x[i][1]), 'o', color='r')
        else:
            plot(float(x[i][0]), float(x[i][1]), 'o', color='g')
        a=(float(x[i][1])+float(x[i][0])*float(W[0])/float(W[1]))/\
            (float(W[1])/float(W[0])+float(W[0])/float(W[1]))
        b = a * float(W[1]) / float(W[0])
        plot([float(x[i][0]), a], [float(x[i][1]), b], '--', color='0.75')

    axis([140, 190, 35, 85])

    Bayes()
Example 3
import numpy as np

from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, LeavePOut

# Set random seed for reproducibility
np.random.seed(1000)

if __name__ == '__main__':
    # Load the dataset
    data = load_iris()

    p = 3
    lr = LogisticRegression()

    # Perform Leave-P-Out Cross Validation
    lpo_scores = cross_val_score(lr,
                                 data['data'],
                                 data['target'],
                                 cv=LeavePOut(p))
    print('LPO scores (100): {}'.format(lpo_scores[0:100]))
    print('Average LPO score: {}'.format(lpo_scores.mean()))
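
    # Added note (sketch): LeavePOut(p=3) on the 150-sample iris dataset generates
    # C(150, 3) splits, so the cross_val_score call above fits the model once per split.
    from math import comb
    print('Number of Leave-3-Out splits: {}'.format(comb(len(data['data']), p)))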
Example 4
    assert tokenize(cls(n_splits=3, random_state=0)) != tokenize(
        cls(n_splits=3, random_state=2))

    assert tokenize(cls(n_splits=3, random_state=0)) != tokenize(
        cls(n_splits=4, random_state=0))

    cv = cls(n_splits=3)
    assert compute_n_splits(cv, np_X, np_y, np_groups) == 3

    with assert_dask_compute(False):
        assert compute_n_splits(cv, da_X, da_y, da_groups) == 3


@pytest.mark.parametrize("cvs", [(LeaveOneOut(), ),
                                 (LeavePOut(2), LeavePOut(3))])
def test_leave_out(cvs):
    tokens = []
    for cv in cvs:
        assert tokenize(cv) == tokenize(cv)
        tokens.append(tokenize(cv))
    assert len(set(tokens)) == len(tokens)

    cv = cvs[0]
    sol = cv.get_n_splits(np_X, np_y, np_groups)
    assert compute_n_splits(cv, np_X, np_y, np_groups) == sol

    with assert_dask_compute(True):
        assert compute_n_splits(cv, da_X, da_y, da_groups) == sol

    with assert_dask_compute(False):
Example 5
#!/usr/bin/python
# -*- coding: utf-8 -*-

# Copyright (C) 2018  David Arroyo Menéndez

# Author: David Arroyo Menéndez <*****@*****.**>
# Maintainer: David Arroyo Menéndez <*****@*****.**>

# This file is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3, or (at your option)
# any later version.

# This file is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with GNU Emacs; see the file COPYING.  If not, write to
# the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
# Boston, MA 02110-1301 USA,

from sklearn.model_selection import LeavePOut
import numpy as np

X = np.ones(4)
lpo = LeavePOut(p=2)
for train, test in lpo.split(X):
    print("train: %s, test: %s" % (train, test))
Example 6
        for i in test:
            bar[i] = "T"
            output_test = "{}({}: {}) ".format(output_test, i, data[i])

        print("[ {} ]".format(" ".join(bar)))
        print("Train: {}".format(output_train))
        print("Test:  {}\n".format(output_test))


# Create some data to split with
data = numpy.array([[1, 2], [3, 4], [5, 6], [7, 8]])

# Our two methods
loocv = LeaveOneOut()
lpocv = LeavePOut(p=P_VAL)

split_loocv = loocv.split(data)
split_lpocv = lpocv.split(data)

print("""\
The Leave-P-Out method works by using every combination of P points as test data.

The following output shows the result of splitting some sample data by Leave-One-Out and Leave-P-Out methods.
A bar displaying the current train-test split as well as the actual data points are displayed for each split.
In the bar, "-" is a training point and "T" is a test point.
""")

print("Data:\n{}\n".format(data))

print("Leave-One-Out:\n")
Example 7
min_list = []
if c == 'i':
    min_list = [0, 8, 9, 12]
else:
    min_list = [3, 4, 7, 10, 11, 13]
data = pd.read_csv('input_' + c + '_2_hrv_c.csv', header=None)
decisionTree = DecisionTreeClassifier()
knnClf = KNeighborsClassifier(
    n_neighbors=3
)  # default k = 5; set explicitly, e.g. KNeighborsClassifier(n_neighbors=10)
svc = svm.SVC(
    kernel='linear',
    C=1)  # (kernel='linear', C=1)   #(kernel='rbf') #(kernel='poly', degree=5)
naive_bayes = GaussianNB()
rand_forrest = RandomForestClassifier(n_estimators=25)
lpo = LeavePOut(p=3)
kf = KFold(n_splits=5)
X_raw = data.iloc[:, :data.shape[1] - 1]
y = data.iloc[:, data.shape[1] - 1]


def my_validation(model, X_f, y_f):
    score = np.array([])
    if c == 'i':
        X_train = X_f.iloc[8:, :]
        y_train = y_f.iloc[8:]
        X_test = X_f.iloc[:8, :]
        y_test = y_f.iloc[:8]
    else:
        X_train = X_f.iloc[:8, :]
        y_train = y_f.iloc[:8]
Example 8
import numpy as np
from sklearn.model_selection import LeavePOut
'''
Leaves a user-specified number of samples out for testing and uses the rest for training.

'''
X = np.array([[1, 11], [2, 12], [3, 13], [4, 14], [5, 15], [6, 16], [7, 17],
              [8, 18], [9, 19], [10, 20]])

y = np.array([[1], [0], [1], [1], [0], [1], [1], [0], [0], [1]])

lpo = LeavePOut(4)
print('number of splits = ', str(lpo.get_n_splits(X)))
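
# Added check (sketch): the number of Leave-P-Out splits is C(n, p); with n = 10 and
# p = 4 this is comb(10, 4) = 210, matching get_n_splits above.
from math import comb
assert lpo.get_n_splits(X) == comb(len(X), 4) == 210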

print("----------------------------------------------------------")

folds = lpo.split(X)

for train_index, test_index in folds:
    print('train : ', train_index, ' test : ', test_index)
    print('X_train \n ', X[train_index])
    print('X_test  \n ', X[test_index])
    print('y_train \n ', y[train_index])
    print('y_test  \n ', y[test_index])
    print("----------------------------------------------------------")
Example 9
def train(X,
          y,
          k_cross_validation_ratio,
          testing_size,
          optimal_k=True,
          min_range_k=0,
          max_range_k=0):

    X0_train, X_test, y0_train, y_test = train_test_split(
        X, y, test_size=testing_size, random_state=7)
    #Scaler is needed to scale all the inputs to a similar range
    scaler = StandardScaler()
    scaler = scaler.fit(X0_train)
    X0_train = scaler.transform(X0_train)
    X_test = scaler.transform(X_test)
    #X_train, X_eval, y_train, y_eval = train_test_split(X0_train, y0_train, test_size= 100/k_cross_validation_ratio, random_state=7)

    #finding the range for the optimal value of k either within the specified range (user input)
    # or by our default range
    if optimal_k and min_range_k > 0 and max_range_k > min_range_k:
        k_range = range(min_range_k, max_range_k)
    else:
        k_range = range(1, 50)

    scores = {}
    scores_list = []

    #finding the optimal nb of neighbors
    for k in tqdm(k_range):
        knn = KNeighborsClassifier(n_neighbors=k)
        knn.fit(X0_train, y0_train)
        y_pred = knn.predict(X_test)
        scores[k] = metrics.accuracy_score(y_test, y_pred)
        scores_list.append(metrics.accuracy_score(y_test, y_pred))

    # map the best score's index back to the actual k value in k_range
    k_optimal = k_range[scores_list.index(max(scores_list))]
    model = KNeighborsClassifier(n_neighbors=k_optimal)

    eval_score_list = []
    #Evaluation using cross validation (StratifiedKFold below; the LeavePOut object is defined but unused)
    from sklearn.model_selection import StratifiedKFold
    lpo = LeavePOut(p=1)
    accuracys = []

    skf = StratifiedKFold(n_splits=10, random_state=None)
    skf.get_n_splits(X0_train, y0_train)
    for train_index, test_index in skf.split(X0_train, y0_train):

        # print("TRAIN:", train_index, "Validation:", test_index)
        X_train, X_eval = pd.DataFrame(X0_train).iloc[
            train_index], pd.DataFrame(X0_train).iloc[test_index]
        y_train, y_eval = pd.DataFrame(y0_train).iloc[
            train_index], pd.DataFrame(y0_train).iloc[test_index]

        model.fit(X_train, y_train)  # fit on the fold's training split, not the full training set
        predictions = model.predict(X_eval)
        score = accuracy_score(predictions, y_eval)
        accuracys.append(score)
        #scores = cross_val_score(knn, X, y, cv=5, scoring='accuracy')
        #eval_score_list.append(scores.mean())

    #eval_accuracy = np.mean(eval_score_list)
    eval_accuracy = np.mean(accuracys)

    #save the pretrained model:
    model_name = 'pretrained_knn_model'
    pickle.dump(model, open(model_name, 'wb'))

    return eval_accuracy, model, X0_train, y0_train, X_test, y_test
Example 10
        4: weights[4]
    }
    over = SMOTE(sampling_strategy=ratio_over, random_state=314)
    X_train, y_train = over.fit_resample(X_train, y_train)

    # undersample samples > average
    ratio_under = {
        0: average_samples,
        1: average_samples,
        2: average_samples,
        3: average_samples,
        4: average_samples
    }
    under = RandomUnderSampler(sampling_strategy=ratio_under, random_state=314)
    X_train, y_train = under.fit_resample(X_train, y_train)
    cv_inner = LeavePOut(2)
    model = KerasClassifier(build_fn=create_model, verbose=1)

    batch_size = [8, 16, 32]
    neurons = [30, 40, 50]
    hidden_layers = [1, 2, 3]
    epochs = [10, 50, 100]
    activation = ['softmax', 'relu', 'tanh']
    param_grid = dict(batch_size=batch_size,
                      neurons=neurons,
                      hidden_layers=hidden_layers,
                      epochs=epochs,
                      activation=activation)
    grid = GridSearchCV(estimator=model,
                        param_grid=param_grid,
                        n_jobs=-2,
Example 11
def nestedCVClassifier(df,
                       outcomeVar,
                       predVars,
                       model,
                       params={},
                       nFolds=10,
                       LPO=None,
                       scorer='log_loss',
                       n_jobs=1):
    """Apply model to df in nested cross-validation framework
    with inner folds to optimize hyperparameters.
    and outer test folds to evaluate performance.
        
    Parameters
    ----------
    df : pd.DataFrame
        Must contain outcome and predictor variables.
    outcomeVar : str
    predVars : ndarray or list
        Predictor variables in the model.
    model : sklearn model
    nFolds : int
        N-fold stratified cross-validation
    LPO : int or None
        Use Leave-P-Out cross-validation instead of StratifiedNFoldCV
    params : dict
        Keys of model hyperparameters with values to try in
        a grid search.

    Returns
    -------
    results : dict
        Contains results as keys below:
        fpr:            (100, ) average FPR for ROC
        tpr:            (100, ) average TPR for ROC
        AUC:            (outerFolds, ) AUC of ROC for each outer test fold
        mAUC:           (1, ) AUC of the average ROC
        ACC:            (outerFolds, ) accuracy across outer test folds
        mACC:           (1, ) mean accuracy across outer folds
        CVres:          (outerFolds, ) GridSearchCV.cv_results_ from each outer fold
        optimalScores:  (outerFolds, ) best inner-CV score from each outer fold
        optimalParams:  (outerFolds, ) optimal hyperparameters from each set of inner CV
        finalParams:    hyperparameters (averaged over outer folds) used for the final refit
        finalResult:    final fitted model with predict() exposed
        prob:           (N,) pd.Series of predicted probabilities avg over outer folds
        params:         pre-specified hyperparameter grid
        Xvars:          list of all vars in X
        Yvar:           name of outcome variable
        N:              total number of rows/instances in the model"""

    if not isinstance(predVars, list):
        predVars = list(predVars)

    tmp = df[[outcomeVar] + predVars].dropna()
    X, y = tmp[predVars].astype(float), tmp[outcomeVar].astype(float)

    if LPO is None:
        innerCV = StratifiedKFold(n_splits=nFolds, shuffle=True)
        outerCV = StratifiedKFold(n_splits=nFolds, shuffle=True)
    else:
        innerCV = LeavePOut(LPO)
        outerCV = LeavePOut(LPO)

    if scorer == 'log_loss':
        scorerFunc = sklearn.metrics.make_scorer(sklearn.metrics.log_loss,
                                                 greater_is_better=False,
                                                 needs_proba=True,
                                                 needs_threshold=False,
                                                 labels=[0, 1])
    elif scorer == 'accuracy':
        scorerFunc = sklearn.metrics.make_scorer(
            sklearn.metrics.accuracy_score,
            greater_is_better=True,
            needs_proba=False,
            needs_threshold=False)

    fpr = np.linspace(0, 1, 100)
    tpr = np.nan * np.zeros((fpr.shape[0], nFolds))
    acc = np.nan * np.zeros(nFolds)
    auc = np.nan * np.zeros(nFolds)
    probs = []
    optimalParams = []
    optimalScores = []
    cvResults = []

    for outi, (trainInd, testInd) in enumerate(outerCV.split(X=X, y=y)):
        Xtrain, Xtest = X.iloc[trainInd], X.iloc[testInd]
        ytrain, ytest = y.iloc[trainInd], y.iloc[testInd]

        clf = GridSearchCV(estimator=model,
                           param_grid=params,
                           cv=innerCV,
                           refit=True,
                           scoring=scorerFunc,
                           n_jobs=n_jobs)
        clf.fit(Xtrain, ytrain)
        cvResults.append(clf.cv_results_)
        optimalParams.append(clf.best_params_)
        optimalScores.append(clf.best_score_)

        prob = clf.predict_proba(Xtest)
        fprTest, tprTest, _ = sklearn.metrics.roc_curve(ytest, prob[:, 1])
        tpr[:, outi] = np.interp(fpr, fprTest, tprTest)
        auc[outi] = sklearn.metrics.auc(fprTest, tprTest)
        acc[outi] = sklearn.metrics.accuracy_score(ytest,
                                                   np.round(prob[:, 1]),
                                                   normalize=True)

        probs.append(pd.Series(prob[:, 1], index=Xtest.index))

    meanTPR = np.mean(tpr, axis=1)
    meanTPR[0], meanTPR[-1] = 0, 1
    meanACC = np.mean(acc)
    meanAUC = sklearn.metrics.auc(fpr, meanTPR)
    """Compute mean probability over test predictions in CV"""
    probS = pd.concat(probs).groupby(level=0).agg(np.mean)
    probS.name = 'Prob'
    """Select "outer" optimal param for final model"""
    avgFunc = lambda v: 10**np.mean(np.log10(v))
    # avgFunc = lambda v: np.mean(v)
    optP = {
        k: avgFunc([o[k] for o in optimalParams])
        for k in optimalParams[0].keys()
    }

    for k, v in optP.items():
        setattr(model, k, v)
    result = model.fit(X=X, y=y)

    rocRes = rocStats(y, np.round(probS))

    outD = {
        'fpr': fpr,
        'tpr': meanTPR,
        'AUC': auc,
        'mAUC': meanAUC,
        'mACC': np.mean(acc),
        'ACC': acc,
        'CVres': cvResults,
        'optimalScores': np.array(optimalScores),
        'optimalParams': optimalParams,
        'finalParams': optP,
        'finalResult': result,  # final fitted model with predict() exposed
        'prob':
        probS,  # (N,) pd.Series of predicted probabilities avg over outer folds
        'Xvars': predVars,
        'Yvar': outcomeVar,
        'N': tmp.shape[0],
        'params': params
    }
    outD.update(rocRes[['Sensitivity', 'Specificity']].to_dict())
    return outD
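

# Added illustrative sketch (not from the original source): the nested-CV pattern the
# function above implements, reduced to plain scikit-learn. An inner GridSearchCV picks
# hyperparameters within each outer fold, and the outer folds score generalization.
# The toy data and parameter grid here are assumptions for demonstration only.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_score

X_demo, y_demo = make_classification(n_samples=120, n_features=5, random_state=0)
inner_cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=0)
outer_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
search = GridSearchCV(LogisticRegression(solver='liblinear'),
                      param_grid={'C': [0.1, 1.0, 10.0]},
                      cv=inner_cv)
outer_scores = cross_val_score(search, X_demo, y_demo, cv=outer_cv)
print('Nested CV accuracy: %0.3f +/- %0.3f' % (outer_scores.mean(), outer_scores.std()))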
Example 12
def logisticL1NestedCV(df,
                       outcomeVar,
                       predVars,
                       nFolds=10,
                       LPO=None,
                       Cs=10,
                       n_jobs=1):
    """Apply logistic regression with L1-regularization (LASSO) to df.
    Uses nested cross-validation framework with inner folds to optimize C
    and outer test folds to evaluate performance.
        
    Parameters
    ----------
    df : pd.DataFrame
        Must contain outcome and predictor variables.
    outcomeVar : str
    predVars : ndarray or list
        Predictor variables in the model.
    nFolds : int
        N-fold stratified cross-validation
    LPO : int or None
        Use Leave-P-Out cross-validation instead of StratifiedNFoldCV
    Cs : int or list
        Each of the values in Cs describes the inverse of regularization strength.
        If Cs is as an int, then a grid of Cs values are chosen in a logarithmic
        scale between 1e-4 and 1e4. Smaller values specify stronger regularization.

    Returns
    -------
    results : dict
        Contains results as keys below:
        fpr:            (100, ) average FPR for ROC
        tpr:            (100, ) average TPR for ROC
        AUC:            (outerFolds, ) AUC of ROC for each outer test fold
        meanAUC:        (1, ) AUC of the average ROC
        ACC:            (outerFolds, ) accuracy across outer test folds
        scores:         (outerFolds, innerFolds, Cs) log-likelihood for each C across inner and outer CV folds
        optimalCs:      (outerFolds, ) optimal C from each set of inner CV
        finalResult:    final fitted model with predict() exposed
        prob:           (N,) pd.Series of predicted probabilities avg over outer folds
        varList:        (Nvars, ) list of vars with non-zero coef in final model
        Cs:             (Cs, ) pre-specified grid of Cs
        coefs:          (outerFolds, predVars) refit with optimalC in each fold
        paths:          (outerFolds, Cs, predVars + intercept) avg across inner folds
        XVars:          list of all vars in X
        yVar:           name of outcome variable
        N:              total number of rows/instances in the model"""

    if not isinstance(predVars, list):
        predVars = list(predVars)

    tmp = df[[outcomeVar] + predVars].dropna()
    X, y = tmp[predVars].astype(float), tmp[outcomeVar].astype(float)

    if LPO is None:
        innerCV = StratifiedKFold(n_splits=nFolds, shuffle=True)
        outerCV = StratifiedKFold(n_splits=nFolds, shuffle=True)
    else:
        innerCV = LeavePOut(LPO)
        outerCV = LeavePOut(LPO)

    scorerFunc = sklearn.metrics.make_scorer(sklearn.metrics.log_loss,
                                             greater_is_better=False,
                                             needs_proba=True,
                                             needs_threshold=False,
                                             labels=[0, 1])

    fpr = np.linspace(0, 1, 100)
    tpr = np.nan * np.zeros((fpr.shape[0], nFolds))
    acc = np.nan * np.zeros(nFolds)
    auc = np.nan * np.zeros(nFolds)
    paths = []
    coefs = []
    probs = []
    optimalCs = np.nan * np.zeros(nFolds)
    scores = []

    for outi, (trainInd, testInd) in enumerate(outerCV.split(X=X, y=y)):
        Xtrain, Xtest = X.iloc[trainInd], X.iloc[testInd]
        ytrain, ytest = y.iloc[trainInd], y.iloc[testInd]

        model = sklearn.linear_model.LogisticRegressionCV(Cs=Cs,
                                                          cv=innerCV,
                                                          penalty='l1',
                                                          solver='liblinear',
                                                          scoring=scorerFunc,
                                                          refit=True,
                                                          n_jobs=n_jobs)
        """With refit = True, the scores are averaged across all folds,
        and the coefs and the C that corresponds to the best score is taken,
        and a final refit is done using these parameters."""

        results = model.fit(X=Xtrain, y=ytrain)
        prob = results.predict_proba(Xtest)

        class1Ind = np.nonzero(results.classes_ == 1)[0][0]
        fprTest, tprTest, _ = sklearn.metrics.roc_curve(
            ytest, prob[:, class1Ind])

        tpr[:, outi] = np.interp(fpr, fprTest, tprTest)
        auc[outi] = sklearn.metrics.auc(fprTest, tprTest)
        acc[outi] = sklearn.metrics.accuracy_score(ytest,
                                                   np.round(prob[:,
                                                                 class1Ind]),
                                                   normalize=True)
        optimalCs[outi] = results.C_[0]
        scores.append(results.scores_[1])
        paths.append(results.coefs_paths_[1])
        coefs.append(results.coef_)
        probs.append(pd.Series(prob[:, class1Ind], index=Xtest.index))

    meanTPR = np.mean(tpr, axis=1)
    meanTPR[0], meanTPR[-1] = 0, 1
    meanACC = np.mean(acc)
    meanAUC = sklearn.metrics.auc(fpr, meanTPR)
    meanC = 10**np.mean(np.log10(optimalCs))
    paths = np.concatenate([p.mean(axis=0, keepdims=True) for p in paths],
                           axis=0)
    scores = np.concatenate([s[None, :, :] for s in scores], axis=0)
    """Compute mean probability over test predictions in CV"""
    probS = pd.concat(probs).groupby(level=0).agg(np.mean)
    probS.name = 'Prob'
    """Refit all the data with the optimal C for variable selection and 
    classification of holdout data"""
    model = sklearn.linear_model.LogisticRegression(C=meanC,
                                                    penalty='l1',
                                                    solver='liblinear')
    result = model.fit(X=X, y=y)
    varList = np.array(predVars)[result.coef_.ravel() != 0].tolist()

    rocRes = rocStats(y, np.round(probS))

    outD = {
        'fpr': fpr,  # (100, ) average FPR for ROC
        'tpr': meanTPR,  # (100, ) average TPR for ROC
        'AUC': auc,  # (outerFolds, ) AUC of ROC for each outer test fold
        'mAUC': meanAUC,  # (1, ) AUC of the average ROC
        'ACC': acc,  # (outerFolds, ) accuracy across outer test folds
        'mACC': np.mean(acc),
        'scores':
        scores,  # (outerFolds, innerFolds, Cs) score for each C across inner and outer CV folds
        'optimalCs':
        optimalCs,  # (outerFolds, ) optimal C from each set of inner CV
        'C': meanC,
        'finalResult': result,  # final fitted model with predict() exposed
        'prob':
        probS,  # (N,) pd.Series of predicted probabilities avg over outer folds
        'varList': varList,  # list of vars with non-zero coef in final model
        'Cs': Cs,  # pre-specified grid of Cs
        'coefs': np.concatenate(
            coefs),  # (outerFolds, predVars) refit with optimalC in each fold
        'paths':
        paths,  # (outerFolds, Cs, predVars + intercept) avg across inner folds 
        'Xvars': predVars,
        'Yvar': outcomeVar,
        'N': tmp.shape[0]
    }
    outD.update(rocRes[['Sensitivity', 'Specificity']].to_dict())
    return outD
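

# Added illustrative sketch (assumption: standalone toy data, not from the original
# source): the inner model selection above relies on LogisticRegressionCV with an L1
# penalty to pick C; a minimal standalone version of that step looks like this.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import StratifiedKFold

X_demo, y_demo = make_classification(n_samples=100, n_features=8, n_informative=3,
                                     random_state=1)
lasso_cv = LogisticRegressionCV(Cs=10,
                                cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=1),
                                penalty='l1', solver='liblinear')
lasso_cv.fit(X_demo, y_demo)
print('Chosen C: %s, non-zero coefficients: %d'
      % (lasso_cv.C_[0], np.count_nonzero(lasso_cv.coef_)))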
Example 13
def palatability_identity_calculations(rec_dir, pal_ranks=None,
                                       params=None, shell=False):
    warnings.filterwarnings('ignore', category=UserWarning)
    warnings.filterwarnings('ignore', category=RuntimeWarning)
    dat = load_dataset(rec_dir)
    dim = dat.dig_in_mapping
    if 'palatability_rank' in dim.columns:
        pass
    elif pal_ranks is None:
        dim = get_palatability_ranks(dim, shell=shell)
    else:
        dim['palatability_rank'] = dim['name'].map(pal_ranks)

    dim = dim.dropna(subset=['palatability_rank'])
    dim = dim[dim['palatability_rank'] > 0]
    dim = dim.reset_index(drop=True)
    num_tastes = len(dim)
    taste_names = dim.name.to_list()

    trial_list = dat.dig_in_trials.copy()
    trial_list = trial_list[[True if x in taste_names else False
                             for x in trial_list.name]]
    num_trials = trial_list.groupby('channel').count()['name'].unique()
    if len(num_trials) > 1:
        raise ValueError('Unequal number of trials across the tastes being used')
    else:
        num_trials = num_trials[0]

    dim['num_trials'] = num_trials

    # Get which units to use
    unit_table = h5io.get_unit_table(rec_dir)
    unit_types = ['Single', 'Multi', 'All', 'Custom']
    unit_type = params.get('unit_type')
    if unit_type is None:
        q = userIO.ask_user('Which units do you want to use for taste '
                            'discrimination and  palatability analysis?',
                            choices=unit_types,
                            shell=shell)
        unit_type = unit_types[q]

    if unit_type == 'Single':
        chosen_units = unit_table.loc[unit_table['single_unit'],
                                      'unit_num'].to_list()
    elif unit_type == 'Multi':
        chosen_units = unit_table.loc[unit_table['single_unit'] == False,
                                      'unit_num'].to_list()
    elif unit_type == 'All':
        chosen_units = unit_table['unit_num'].to_list()
    else:
        selection = userIO.select_from_list('Select units to use:',
                                            unit_table['unit_num'],
                                            'Select Units',
                                            multi_select=True)
        chosen_units = list(map(int, selection))

    num_units = len(chosen_units)
    unit_table = unit_table.loc[chosen_units]

    # Enter Parameters
    if params is None or params.keys() != default_pal_id_params.keys():
        params = default_pal_id_params.copy()
        params = userIO.confirm_parameter_dict(params,
                                               ('Palatability/Identity '
                                                'Calculation Parameters'
                                                '\nTimes in ms'), shell=shell)

    win_size = params['window_size']
    win_step = params['window_step']
    print('Running palatability/identity calculations with parameters:\n%s' %
          pt.print_dict(params))

    with tables.open_file(dat.h5_file, 'r+') as hf5:
        trains_dig_in = hf5.list_nodes('/spike_trains')
        time = trains_dig_in[0].array_time[:]
        bin_times = np.arange(time[0], time[-1] - win_size + win_step,
                             win_step)
        num_bins = len(bin_times)

        palatability = np.empty((num_bins, num_units, num_tastes*num_trials),
                                dtype=int)
        identity = np.empty((num_bins, num_units, num_tastes*num_trials),
                            dtype=int)
        unscaled_response = np.empty((num_bins, num_units, num_tastes*num_trials),
                                     dtype=np.dtype('float64'))
        response  = np.empty((num_bins, num_units, num_tastes*num_trials),
                             dtype=np.dtype('float64'))
        laser = np.empty((num_bins, num_units, num_tastes*num_trials, 2),
                         dtype=float)

        # Fill arrays with data
        print('Filling data arrays...')
        onesies = np.ones((num_bins, num_units, num_trials))
        for i, row in dim.iterrows():
            idx = range(num_trials*i, num_trials*(i+1))
            palatability[:, :, idx] = row.palatability_rank * onesies
            identity[:, :, idx] = row.channel * onesies
            for j, u in enumerate(chosen_units):
                for k,t in enumerate(bin_times):
                    t_idx = np.where((time >= t) & (time <= t+win_size))[0]
                    unscaled_response[k, j, idx] = \
                            np.mean(trains_dig_in[i].spike_array[:, u, t_idx],
                                    axis=1)
                    try:
                        # stack (laser durations, onset lags) as (num_trials, 2) rows
                        laser[k, j, idx] = \
                            np.vstack((trains_dig_in[i].laser_durations[:],
                                       trains_dig_in[i].laser_onset_lag[:])).T
                    except Exception:
                        laser[k, j, idx] = np.zeros((num_trials, 2))

        # Scaling was not done, so:
        response = unscaled_response.copy()

        # Make ancillary_analysis node and put in arrays
        if '/ancillary_analysis' in hf5:
            hf5.remove_node('/ancillary_analysis', recursive=True)

        hf5.create_group('/', 'ancillary_analysis')
        hf5.create_array('/ancillary_analysis', 'palatability', palatability)
        hf5.create_array('/ancillary_analysis', 'identity', identity)
        hf5.create_array('/ancillary_analysis', 'laser', laser)
        hf5.create_array('/ancillary_analysis', 'scaled_neural_response',
                         response)
        hf5.create_array('/ancillary_analysis', 'window_params',
                         np.array([win_size, win_step]))
        hf5.create_array('/ancillary_analysis', 'bin_times', bin_times)
        hf5.create_array('/ancillary_analysis', 'unscaled_neural_response',
                         unscaled_response)

        # for backwards compatibility
        hf5.create_array('/ancillary_analysis', 'params',
                        np.array([win_size, win_step]))
        hf5.create_array('/ancillary_analysis', 'pre_stim', np.array(time[0]))
        hf5.flush()

        # Get unique laser (duration, lag) combinations
        print('Organizing trial data...')
        unique_lasers = np.vstack(list({tuple(row) for row in laser[0, 0, :, :]}))
        unique_lasers = unique_lasers[unique_lasers[:, 1].argsort(), :]
        num_conditions = unique_lasers.shape[0]
        trials = []
        for row in unique_lasers:
            tmp_trials = [j for j in range(num_trials * num_tastes)
                          if np.array_equal(laser[0, 0, j, :], row)]
            trials.append(tmp_trials)

        trials_per_condition = [len(x) for x in trials]
        if not all(x == trials_per_condition[0] for x in trials_per_condition):
            raise ValueError('Different number of trials for each laser condition')

        trials_per_condition = int(trials_per_condition[0] / num_tastes)  #assumes same number of trials per taste per condition
        print('Detected:\n    %i tastes\n    %i laser conditions\n'
              '    %i trials per condition per taste' %
              (num_tastes, num_conditions, trials_per_condition))
        trials = np.array(trials)

        # Store laser conditions and indices of trials per condition in trial x
        # taste space
        hf5.create_array('/ancillary_analysis', 'trials', trials)
        hf5.create_array('/ancillary_analysis', 'laser_combination_d_l',
                         unique_lasers)
        hf5.flush()

        # Taste Similarity Calculation
        neural_response_laser = np.empty((num_conditions, num_bins,
                                          num_tastes, num_units,
                                          trials_per_condition),
                                         dtype=np.dtype('float64'))
        taste_cosine_similarity = np.empty((num_conditions, num_bins,
                                            num_tastes, num_tastes),
                                           dtype=np.dtype('float64'))
        taste_euclidean_distance = np.empty((num_conditions, num_bins,
                                             num_tastes, num_tastes),
                                            dtype=np.dtype('float64'))

        # Re-format neural responses from bin x unit x (trial*taste) to
        # laser_condition x bin x taste x unit x trial
        print('Reformatting data arrays...')
        for i, trial in enumerate(trials):
            for j, _ in enumerate(bin_times):
                for k, _ in dim.iterrows():
                    idx = np.where((trial >= num_trials*k) &
                                   (trial < num_trials*(k+1)))[0]
                    neural_response_laser[i, j, k, :, :] = \
                            response[j, :, trial[idx]].T

        # Compute taste cosine similarity and euclidean distances
        print('Computing taste cosine similarity and euclidean distances...')
        for i, _ in enumerate(trials):
            for j, _ in enumerate(bin_times):
                for k, _ in dim.iterrows():
                    for l, _ in dim.iterrows():
                        taste_cosine_similarity[i, j, k, l] = \
                                np.mean(cosine_similarity(
                                    neural_response_laser[i, j, k, :, :].T,
                                    neural_response_laser[i, j, l, :, :].T))
                        taste_euclidean_distance[i, j, k, l] = \
                                np.mean(cdist(
                                    neural_response_laser[i, j, k, :, :].T,
                                    neural_response_laser[i, j, l, :, :].T,
                                    metric='euclidean'))

        hf5.create_array('/ancillary_analysis', 'taste_cosine_similarity',
                         taste_cosine_similarity)
        hf5.create_array('/ancillary_analysis', 'taste_euclidean_distance',
                         taste_euclidean_distance)
        hf5.flush()

        # Taste Responsiveness calculations
        bin_params = [params['num_comparison_bins'],
                      params['comparison_bin_size']]
        discrim_p = params['discrim_p']

        responsive_neurons = []
        discriminating_neurons = []
        taste_responsiveness = np.zeros((bin_params[0], num_units, 2))
        new_bin_times = np.arange(0, np.prod(bin_params), bin_params[1])
        baseline = np.where(bin_times < 0)[0]
        print('Computing taste responsiveness and taste discrimination...')
        for i, t in enumerate(new_bin_times):
            places = np.where((bin_times >= t) &
                              (bin_times <= t+bin_params[1]))[0]
            for j, u in enumerate(chosen_units):
                # Check taste responsiveness
                f, p = f_oneway(np.mean(response[places, j, :], axis=0),
                                np.mean(response[baseline, j, :], axis=0))
                if np.isnan(f):
                    f = 0.0
                    p = 1.0

                if p <= discrim_p and u not in responsive_neurons:
                    responsive_neurons.append(u)
                    taste_responsiveness[i, j, 0] = 1

                # Check taste discrimination
                taste_idx = [np.arange(num_trials*k, num_trials*(k+1))
                             for k in range(num_tastes)]
                taste_responses = [np.mean(response[places, j, :][:, k], axis=0)
                                   for k in taste_idx]
                f, p = f_oneway(*taste_responses)
                if np.isnan(f):
                    f = 0.0
                    p = 1.0

                if p <= discrim_p and u not in discriminating_neurons:
                    discriminating_neurons.append(u)

        responsive_neurons = np.sort(responsive_neurons)
        discriminating_neurons = np.sort(discriminating_neurons)

        # Write taste responsive and taste discriminating units to text file
        save_file = os.path.join(rec_dir, 'discriminative_responsive_neurons.txt')
        with open(save_file, 'w') as f:
            print('Taste discriminative neurons', file=f)
            for u in discriminating_neurons:
                print(u, file=f)

            print('Taste responsive neurons', file=f)
            for u in responsive_neurons:
                print(u, file=f)

        hf5.create_array('/ancillary_analysis', 'taste_disciminating_neurons',
                         discriminating_neurons)
        hf5.create_array('/ancillary_analysis', 'taste_responsive_neurons',
                         responsive_neurons)
        hf5.create_array('/ancillary_analysis', 'taste_responsiveness',
                         taste_responsiveness)
        hf5.flush()

        # Get time course of taste discriminability
        print('Getting taste discrimination time course...')
        p_discrim = np.empty((num_conditions, num_bins, num_tastes, num_tastes,
                              num_units), dtype=np.dtype('float64'))
        for i in range(num_conditions):
            for j, t in enumerate(bin_times):
                for k in range(num_tastes):
                    for l in range(num_tastes):
                        for m in range(num_units):
                            _, p = ttest_ind(neural_response_laser[i, j, k, m, :],
                                             neural_response_laser[i, j, l, m, :],
                                             equal_var = False)
                            if np.isnan(p):
                                p = 1.0

                            p_discrim[i, j, k, l, m] = p

        hf5.create_array('/ancillary_analysis', 'p_discriminability',
                          p_discrim)
        hf5.flush()

        # Palatability Rank Order calculation (if > 2 tastes)
        t_start = params['pal_deduce_start_time']
        t_end = params['pal_deduce_end_time']
        if num_tastes > 2:
            print('Deducing palatability rank order...')
            palatability_rank_order_deduction(rec_dir, neural_response_laser,
                                              unique_lasers,
                                              bin_times, [t_start, t_end])

        # Palatability calculation
        r_spearman = np.zeros((num_conditions, num_bins, num_units))
        p_spearman = np.ones((num_conditions, num_bins, num_units))
        r_pearson = np.zeros((num_conditions, num_bins, num_units))
        p_pearson = np.ones((num_conditions, num_bins, num_units))
        f_identity = np.ones((num_conditions, num_bins, num_units))
        p_identity = np.ones((num_conditions, num_bins, num_units))
        lda_palatability = np.zeros((num_conditions, num_bins))
        lda_identity = np.zeros((num_conditions, num_bins))
        r_isotonic = np.zeros((num_conditions, num_bins, num_units))
        id_pal_regress = np.zeros((num_conditions, num_bins, num_units, 2))
        pairwise_identity = np.zeros((num_conditions, num_bins, num_tastes, num_tastes))
        print('Computing palatability metrics...')

        for i, t in enumerate(trials):
            for j in range(num_bins):
                for k in range(num_units):
                    ranks = rankdata(response[j, k, t])
                    r_spearman[i, j, k], p_spearman[i, j, k] = \
                            spearmanr(ranks, palatability[j, k, t])
                    r_pearson[i, j, k], p_pearson[i, j, k] = \
                            pearsonr(response[j, k, t], palatability[j, k, t])
                    if np.isnan(r_spearman[i, j, k]):
                        r_spearman[i, j, k] = 0.0
                        p_spearman[i, j, k] = 1.0

                    if np.isnan(r_pearson[i, j, k]):
                        r_pearson[i, j, k] = 0.0
                        p_pearson[i, j, k] = 1.0

                    # Isotonic regression of firing against palatability
                    model = IsotonicRegression(increasing = 'auto')
                    model.fit(palatability[j, k, t], response[j, k, t])
                    r_isotonic[i, j, k] = model.score(palatability[j, k, t],
                                                      response[j, k, t])

                    # Multiple Regression of firing rate against palatability and identity
                    # Regress palatability on identity
                    tmp_id = identity[j, k, t].reshape(-1, 1)
                    tmp_pal = palatability[j, k, t].reshape(-1, 1)
                    tmp_resp = response[j, k, t].reshape(-1, 1)
                    model_pi = LinearRegression()
                    model_pi.fit(tmp_id, tmp_pal)
                    pi_residuals = tmp_pal - model_pi.predict(tmp_id)

                    # Regress identity on palatability
                    model_ip = LinearRegression()
                    model_ip.fit(tmp_pal, tmp_id)
                    ip_residuals = tmp_id - model_ip.predict(tmp_pal)

                    # Regress firing on identity
                    model_fi = LinearRegression()
                    model_fi.fit(tmp_id, tmp_resp)
                    fi_residuals = tmp_resp - model_fi.predict(tmp_id)

                    # Regress firing on palatability
                    model_fp = LinearRegression()
                    model_fp.fit(tmp_pal, tmp_resp)
                    fp_residuals = tmp_resp - model_fp.predict(tmp_pal)

                    # Get partial correlation coefficient of response with identity
                    idp_reg0, p = pearsonr(fp_residuals, ip_residuals)
                    if np.isnan(idp_reg0):
                        idp_reg0 = 0.0

                    idp_reg1, p = pearsonr(fi_residuals, pi_residuals)
                    if np.isnan(idp_reg1):
                        idp_reg1 = 0.0

                    id_pal_regress[i, j, k, 0] = idp_reg0
                    id_pal_regress[i, j, k, 1] = idp_reg1

                    # Identity Calculation
                    samples = []
                    for _, row in dim.iterrows():
                        taste = row.channel
                        samples.append([trial for trial in t
                                        if identity[j, k, trial] == taste])

                    tmp_resp = [response[j, k, sample] for sample in samples]
                    f_identity[i, j, k], p_identity[i, j, k] = f_oneway(*tmp_resp)
                    if np.isnan(f_identity[i, j, k]):
                        f_identity[i, j, k] = 0.0
                        p_identity[i, j, k] = 1.0


                # Linear Discriminant analysis for palatability
                X = response[j, :, t]
                Y = palatability[j, 0, t]
                test_results = []
                c_validator = LeavePOut(1)
                for train, test in c_validator.split(X, Y):
                    model = LDA()
                    model.fit(X[train, :], Y[train])
                    tmp = np.mean(model.predict(X[test]) == Y[test])
                    test_results.append(tmp)

                lda_palatability[i, j] = np.mean(test_results)

                # Linear Discriminant analysis for identity
                Y = identity[j, 0, t]
                test_results = []
                c_validator = LeavePOut(1)
                for train, test in c_validator.split(X, Y):
                    model = LDA()
                    model.fit(X[train, :], Y[train])
                    tmp = np.mean(model.predict(X[test]) == Y[test])
                    test_results.append(tmp)

                lda_identity[i, j] = np.mean(test_results)

                # Pairwise Identity Calculation
                for ti1, r1 in dim.iterrows():
                    for ti2, r2 in dim.iterrows():
                        t1 = r1.channel
                        t2 = r2.channel
                        tmp_trials = np.where((identity[j, 0, :] == t1) |
                                              (identity[j, 0, :] == t2))[0]
                        idx = [trial for trial in t if trial in tmp_trials]
                        X = response[j, :, idx]
                        Y = identity[j, 0, idx]
                        test_results = []
                        c_validator = StratifiedShuffleSplit(n_splits=10,
                                                             test_size=0.25,
                                                             random_state=0)
                        for train, test in c_validator.split(X, Y):
                            model = GaussianNB()
                            model.fit(X[train, :], Y[train])
                            tmp_score = model.score(X[test, :], Y[test])
                            test_results.append(tmp_score)

                        pairwise_identity[i, j, ti1, ti2] = np.mean(test_results)

        hf5.create_array('/ancillary_analysis', 'r_pearson', r_pearson)
        hf5.create_array('/ancillary_analysis', 'r_spearman', r_spearman)
        hf5.create_array('/ancillary_analysis', 'p_pearson', p_pearson)
        hf5.create_array('/ancillary_analysis', 'p_spearman', p_spearman)
        hf5.create_array('/ancillary_analysis', 'lda_palatability', lda_palatability)
        hf5.create_array('/ancillary_analysis', 'lda_identity', lda_identity)
        hf5.create_array('/ancillary_analysis', 'r_isotonic', r_isotonic)
        hf5.create_array('/ancillary_analysis', 'id_pal_regress', id_pal_regress)
        hf5.create_array('/ancillary_analysis', 'f_identity', f_identity)
        hf5.create_array('/ancillary_analysis', 'p_identity', p_identity)
        hf5.create_array('/ancillary_analysis', 'pairwise_NB_identity', pairwise_identity)
        hf5.flush()

    warnings.filterwarnings('default', category=UserWarning)
    warnings.filterwarnings('default', category=RuntimeWarning)
Example 14
X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]])
random_state = 12883823
rkf = RepeatedKFold(n_splits=2, n_repeats=2, random_state=random_state)
for train, test in rkf.split(X): print("%s %s" % (train, test))

# Leave One Out (LOO)
from sklearn.model_selection import LeaveOneOut
X = [1, 2, 3, 4]
loo = LeaveOneOut()
for train, test in loo.split(X): print("%s %s" % (train, test))

# Leave P out (LPO)
# Example of Leave-2-Out on a dataset with 4 samples:
from sklearn.model_selection import LeavePOut
X = np.ones(4)
lpo = LeavePOut(p=2)
for train, test in lpo.split(X): print("%s %s" % (train, test))

## Cross validation of time series data
# Example of 3-split time series cross-validation on a dataset with 6 samples:
from sklearn.model_selection import TimeSeriesSplit
X = np.array([[1, 2], [3, 4], [1, 2], [3, 4], [1, 2], [3, 4]])
y = np.array([1, 2, 3, 4, 5, 6])
tscv = TimeSeriesSplit(n_splits=3)
print(tscv)
# expected output: TimeSeriesSplit(max_train_size=None, n_splits=3)
for train, test in tscv.split(X): print("%s %s" % (train, test))

#### Cross validation and model selection

### Model evaluation: Quantifying the quality of prediction
Example 15
import numpy as np
from sklearn.model_selection import LeavePOut
# ----------------------------------------------------
'''

class sklearn.model_selection.LeavePOut(p)

'''
# ----------------------------------------------------

X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
y = np.array([1, 2, 3, 4])

#lpo = LeavePOut(1)
#lpo = LeavePOut(2)
lpo = LeavePOut(3)
print(lpo.get_n_splits(X))

print(lpo)
lpo = LeavePOut(p=2)

for train_index, test_index in lpo.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    print('X_train \n', X_train)
    print('X_test \n', X_test)
    print('y_train \n', y_train)
    print('y_test \n', y_test)
    print('*********************')
Example 16
    permutations_path = op.join(permutations_dir,subject+'_permutations.jl')
    subj_permuts = joblib.load(permutations_path)
    if subj_ind == 0:
        allsubj_permuts = subj_permuts
    else:
        shift = allsubj_permuts.shape[1]
        allsubj_permuts = np.hstack([allsubj_permuts,shift+subj_permuts])

print(allsubj_permuts.shape)

n_permuts = allsubj_permuts.shape[0]
"""

modality_list = ['A', 'V']

lnso_cv = LeavePOut(n_leftout_subjects)
n_splits = lnso_cv.get_n_splits(subjects_list, subjects_list, subjects_list)

print(n_splits)

allsplits_xval_inds = []

for split_ind, (trainsubj_inds, testsubj_inds) in enumerate(
        lnso_cv.split(subjects_list, subjects_list, subjects_list)):
    # initialize struct for storing all train and test inds for this split
    xval_inds = dict()
    for modality in modality_list:
        xval_inds['train_{}'.format(modality)] = []
        xval_inds['test_{}'.format(modality)] = []

    shift_ind = 0
Example 17
df_hi = df[df['Conc'] == 'Hi']

working_dir = '../results'
working_data = glob(os.path.join(working_dir, '*all_words.csv'))

label_map = {'Living': 0, 'Nonliving': 1}

for lan in ['en', 'es', 'eu']:
    f = [item for item in working_data if (f'{lan}_all_words' in item)][0]
    df_words = pd.read_csv(f, encoding='latin-1')
    word_vecs = np.array([df_words[word].values for word in df_hi[lan]])
    clf = make_pipeline(
        StandardScaler(),
        LogisticRegression(C=1, solver='liblinear', multi_class='auto'))
    labels = np.array([label_map[item] for item in df_hi['Living']])
    cv = LeavePOut(p=2)
    results = dict(
        fold=[],
        score=[],
        test_word1=[],
        test_word2=[],
    )
    groups = df_hi[lan].values
    for fold, (idx_train, idx_test) in enumerate(
            cv.split(word_vecs, labels, groups=groups)):
        X_train, y_train = word_vecs[idx_train], labels[idx_train]
        X_test, y_test = word_vecs[idx_test], labels[idx_test]
        X_train, y_train = shuffle(X_train, y_train)
        test_pairs = groups[idx_test]
        clf = make_pipeline(
            StandardScaler(),
Example 18
    min_list = [0, 8, 9, 12]
else:
    min_list = [3, 4, 7, 10, 11, 13]

# fit a CART model to the data
data = pd.read_csv('input_' + c + '_2_hrv_c.csv', header=None)
decisionTree = DecisionTreeClassifier()
knnClf = KNeighborsClassifier(
    n_neighbors=3
)  # default k = 5; set explicitly, e.g. KNeighborsClassifier(n_neighbors=10)
svc = svm.SVC(
    kernel='linear',
    C=1)  #(kernel='linear', C=1)   #(kernel='rbf') #(kernel='poly', degree=5)
naive_bayes = GaussianNB()
rand_forrest = RandomForestClassifier(n_estimators=25)
lpo = LeavePOut(p=3)
X_raw = data.iloc[:, :data.shape[1] - 1]
y = data.iloc[:, data.shape[1] - 1]
# X = X_raw.iloc[:, best_features_list]

# lsvc = LinearSVC(C=0.7, penalty="l1", dual=False).fit(X_old, y)
# model = SelectFromModel(lsvc, prefit=True)
# X = model.transform(X_old)
# print X.shape

# model_name_list = ['decision tree', 'knn', 'svm', 'naive bayes'] #, 'random forrest']
# model_list = [decisionTree, knnClf, svc, naive_bayes]# , rand_forrest]
model_name_list = ['knn']  #, 'random forrest']
model_list = [svc]  # , rand_forrest]

from sklearn.model_selection import ShuffleSplit
Example 19
    lower_bound = lower_bound - (lower_bound % 100)

df_animal = df_animal[df_animal['picked']]
df_object = df_object[df_object['picked']]
df_animal = df_animal.nlargest(lower_bound,'Mean\nFamiliarity')
df_object = df_object.nlargest(lower_bound,'Mean\nFamiliarity')
df_final = pd.concat([df_animal,df_object])
df_final = df_final.sort_values(['Category','Word'])

base_clf = make_pipeline(StandardScaler(),
                    LogisticRegression(C=1, solver='liblinear',
                                       multi_class='auto'))
word_vecs = np.array([model_word2vec[word] for word in df_final['Word']])
labels = np.array([label_map[item] for item in df_final['Category']])
cv = LeavePOut(p = 2)
groups = df_final['Word'].values

results = dict(
                fold = [],
                score = [],
                test_word1 = [],
                test_word2 = [],
                )

for fold, (idx_train,idx_test) in tqdm(enumerate(cv.split(word_vecs,labels,groups = groups))): 
    X_train,y_train = word_vecs[idx_train],labels[idx_train]
    X_test,y_test = word_vecs[idx_test],labels[idx_test]
    X_train,y_train = shuffle(X_train,y_train)
    test_pairs = groups[idx_test]
    clf = clone(base_clf)
Example 20
X = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12]])
y = np.array([1, 1, 1, 2, 2, 2])
loo = LeaveOneOut()
print(loo)
for train_index, test_index in loo.split(X):
    print("Train Index:", train_index, ",Test Index:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    # print(X_train,X_test,y_train,y_test)

#LeavePOut
import numpy as np
from sklearn.model_selection import LeavePOut
X = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12]])
y = np.array([1, 1, 1, 2, 2, 2])
lpo = LeavePOut(p=2)
print(lpo)
for train_index, test_index in lpo.split(X):
    print("Train Index:", train_index, ",Test Index:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    # print(X_train,X_test,y_train,y_test)

#Random permutation (shuffle) splitting
#ShuffleSplit
import numpy as np
from sklearn.model_selection import ShuffleSplit
X = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12]])
y = np.array([1, 2, 1, 2, 1, 2])
rs = ShuffleSplit(n_splits=3, test_size=.25, random_state=0)
print(rs)
Example 21
# ================================== K-fold, leave-one-out, leave-p-out, and shuffle-split cross-validation ==========================================
# K-fold split
kf = KFold(n_splits=2)
for train, test in kf.split(iris.data):
    print("K-fold split: %s %s" % (train.shape, test.shape))
    break

# Leave-one-out split
loo = LeaveOneOut()
for train, test in loo.split(iris.data):
    print("Leave-one-out split: %s %s" % (train.shape, test.shape))
    break

# Leave-p-out split
lpo = LeavePOut(p=2)
for train, test in lpo.split(iris.data):
    print("Leave-p-out split: %s %s" % (train.shape, test.shape))
    break

# Shuffle-split (random permutation) split
ss = ShuffleSplit(n_splits=3, test_size=0.25, random_state=0)
for train_index, test_index in ss.split(iris.data):
    print("Shuffle split: %s %s" % (train_index.shape, test_index.shape))
    break

# ================================== Stratified K-fold and stratified shuffle-split cross-validation ==========================================
skf = StratifiedKFold(n_splits=3)  # class proportions in each fold roughly match the full dataset
for train, test in skf.split(iris.data, iris.target):
    print("Stratified K-fold split: %s %s" % (train.shape, test.shape))
    break
Example 22
X = data.data
y = data.target

clf = linear_model.LogisticRegression()

loocv = LeaveOneOut()

train_index, test_index = next(loocv.split(X, y))  # just the first split

y.size, train_index.size, test_index.size  # inspect the sizes

scores = cross_val_score(clf, X, y, cv=loocv)  # LeaveOneOut

scores.mean() * 100, scores.std() * 100, scores.size

loocv = LeavePOut(2)
# scores = cross_val_score(clf, X, y, cv=loocv) # LeavePOut does not finish in reasonable time: C(n, 2) splits
# scores.mean(), scores.std(), scores.size
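
# Added sketch of why the commented-out LeavePOut(2) run above is infeasible:
# cross_val_score would need one model fit per split, i.e. C(n, 2) fits.
from math import comb
print('LeavePOut(2) would require %d fits' % comb(y.size, 2))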

group = np.array(list(range(50)) * 12)
group = np.sort(group[:y.size])
group.size

group

loocv = LeaveOneGroupOut()

for train_index, test_index in loocv.split(X, y, group):

    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
Esempio n. 23
0
from sklearn.pipeline import Pipeline

from sktime.classification.interval_based import CanonicalIntervalForest
from sktime.transformations.panel.pca import PCATransformer
from sktime.utils._testing.estimator_checks import _make_args

DATA_ARGS = [
    {"return_numpy": True, "n_columns": 2},
    {"return_numpy": False, "n_columns": 2},
]
# StratifiedGroupKFold(n_splits=2) removed, not available in sklearn 0.24
CROSS_VALIDATION_METHODS = [
    KFold(n_splits=2),
    RepeatedKFold(n_splits=2, n_repeats=2),
    LeaveOneOut(),
    LeavePOut(p=5),
    ShuffleSplit(n_splits=2, test_size=0.25),
    StratifiedKFold(n_splits=2),
    StratifiedShuffleSplit(n_splits=2, test_size=0.25),
    GroupKFold(n_splits=2),
    LeavePGroupsOut(n_groups=5),
    GroupShuffleSplit(n_splits=2, test_size=0.25),
    TimeSeriesSplit(n_splits=2),
]
PARAMETER_TUNING_METHODS = [
    GridSearchCV,
    RandomizedSearchCV,
    HalvingGridSearchCV,
    HalvingRandomSearchCV,
]
COMPOSITE_ESTIMATORS = [
Esempio n. 24
0
# ### Leave-p-out
# 
# This is a type of validation in which no fixed percentage is set aside for the validation set; instead, a number $p$ of samples is used for validation and the remaining $n-p$ are used for training. The number of repetitions is then given by the number of possible combinations.

# In[19]:


X=np.random.randn(10,2)


# In[20]:


from sklearn.model_selection import LeavePOut
lpo = LeavePOut(2)
lpo.get_n_splits(X)


# This corresponds to the number of possible combinations, $N$ choose 2.

# In[21]:


from itertools import combinations 
len(list(combinations(range(X.shape[0]), 2)))

# LeavePOut(p=1) is equivalent to LeaveOneOut()
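# A quick check of that equivalence, as a sketch on the same X defined above:
from sklearn.model_selection import LeaveOneOut
pairs_lpo = [(list(tr), list(te)) for tr, te in LeavePOut(p=1).split(X)]
pairs_loo = [(list(tr), list(te)) for tr, te in LeaveOneOut().split(X)]
assert pairs_lpo == pairs_loo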
# ## Validation methodology for imbalanced problems
# 
# 
			if np.isnan(r_spearman[i, j, k]):
				r_spearman[i, j, k] = 0.0
				p_spearman[i, j, k] = 1.0
			if np.isnan(r_pearson[i, j, k]):
				r_pearson[i, j, k] = 0.0
				p_pearson[i, j, k] = 1.0

# Move to linear discriminant analysis
lda_palatability = np.zeros((unique_lasers.shape[0], identity.shape[0]))
for i in range(unique_lasers.shape[0]):
	for j in range(identity.shape[0]):
		X = response[j, :, trials[i]] 
		Y = palatability[j, 0, trials[i]]
		# Leave-one-out cross-validation (LeavePOut with p = 1): each trial is held out once
		test_results = []
		c_validator = LeavePOut(1)
		for train, test in c_validator.split(X, Y):
			model = LDA()
			model.fit(X[train, :], Y[train])
			# And test on the left out kth trial - compare to the actual class of the kth trial and store in test results
			test_results.append(np.mean(model.predict(X[test]) == Y[test]))
		lda_palatability[i, j] = np.mean(test_results)

# Save these arrays to file
hf5.create_array('/ancillary_analysis', 'r_pearson', r_pearson)
hf5.create_array('/ancillary_analysis', 'p_pearson', p_pearson)
hf5.create_array('/ancillary_analysis', 'r_spearman', r_spearman)
hf5.create_array('/ancillary_analysis', 'p_spearman', p_spearman)
hf5.create_array('/ancillary_analysis', 'lda_palatability', lda_palatability)
hf5.flush()
Esempio n. 26
0
    assert tokenize(cls(n_splits=3, random_state=0)) != tokenize(
        cls(n_splits=3, random_state=2)
    )

    assert tokenize(cls(n_splits=3, random_state=0)) != tokenize(
        cls(n_splits=4, random_state=0)
    )

    cv = cls(n_splits=3)
    assert compute_n_splits(cv, np_X, np_y, np_groups) == 3

    with assert_dask_compute(False):
        assert compute_n_splits(cv, da_X, da_y, da_groups) == 3


@pytest.mark.parametrize("cvs", [(LeaveOneOut(),), (LeavePOut(2), LeavePOut(3))])
def test_leave_out(cvs):
    tokens = []
    for cv in cvs:
        assert tokenize(cv) == tokenize(cv)
        tokens.append(cv)
    assert len(set(tokens)) == len(tokens)

    cv = cvs[0]
    sol = cv.get_n_splits(np_X, np_y, np_groups)
    assert compute_n_splits(cv, np_X, np_y, np_groups) == sol

    with assert_dask_compute(True):
        assert compute_n_splits(cv, da_X, da_y, da_groups) == sol

    with assert_dask_compute(False):
Esempio n. 27
0
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

data = list(range(1, 11))
print(data)

print(train_test_split(data, train_size=.8))

kf = KFold(n_splits=5)
for train, validate in kf.split(data):
    print(train, validate)

kf = KFold(n_splits=5, shuffle=True, random_state=42)
for train, validate in kf.split(data):
    print(train, validate)

loo = LeaveOneOut()
for train, validate in loo.split(data):
    print(train, validate)

lpo = LeavePOut(p=2)
for train, validate in lpo.split(data):
    print(train, validate)

ss = ShuffleSplit(n_splits=3, test_size=2, random_state=0)
for train, validate in ss.split(data):
    print(train, validate)

tscv = TimeSeriesSplit(n_splits=5)
for train, validate in tscv.split(data):
    print(train, validate)
        for i in test:
            bar[i] = "T"
            output_test = "{}({}: {}) ".format(output_test, i, data[i])
            
        print("[ {} ]".format(" ".join(bar)))
        print("Train: {}".format(output_train))
        print("Test:  {}\n".format(output_test))


# Create some data to split with
data = numpy.array([[1, 2], [3, 4], [5, 6], [7, 8]])

# Our two methods
loocv = LeaveOneOut()
lpocv = LeavePOut(p=P_VAL)

split_loocv = loocv.split(data)
split_lpocv = lpocv.split(data)

print("""\
The Leave-P-Out method works by using every combination of P points as test data.

The following output shows the result of splitting some sample data by Leave-One-Out and Leave-P-Out methods.
A bar displaying the current train-test split as well as the actual data points are displayed for each split.
In the bar, "-" is a training point and "T" is a test point.
""")

print("Data:\n{}\n".format(data))

print("Leave-One-Out:\n")
Esempio n. 29
0
    3, 25, 31, 45, 80, 94, 95, 98
], [3, 38, 43, 45, 49, 67, 80, 81, 86, 87, 98, 99, 107, 109],
                      [45, 49, 53, 64, 65, 81, 87, 89, 90]]

# fit a CART model to the data
data = pd.read_csv('input_i_2_hrv_c.csv', header=None)
decisionTree = DecisionTreeClassifier()
knnClf = KNeighborsClassifier(
    n_neighbors=3
)  # default is k = 5; set your own, e.g. KNeighborsClassifier(n_neighbors=10)
svc = svm.SVC(
    kernel='linear',
    C=1)  # alternatives: kernel='rbf', or kernel='poly', degree=5
naive_bayes = GaussianNB()
rand_forrest = RandomForestClassifier(n_estimators=25)
lpo = LeavePOut(p=3)
X_raw = data.iloc[:, :data.shape[1] - 1]
y = data.iloc[:, data.shape[1] - 1]
# X = X_raw.iloc[:, best_features_list]

# lsvc = LinearSVC(C=0.7, penalty="l1", dual=False).fit(X_old, y)
# model = SelectFromModel(lsvc, prefit=True)
# X = model.transform(X_old)
# print X.shape

model_name_list = ['decision tree', 'knn', 'svm',
                   'naive bayes']  # , 'random forrest']
model_list = [decisionTree, knnClf, svc, naive_bayes]  # , rand_forrest]

# # ----------------------------------------------------------------------------------------------------------------------
# # this part is for selecting best features, every iteration select the feature combination with highest score
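# # A rough sketch of that idea (an assumption about the truncated code, not the
# # original loop): greedy forward selection, where each candidate feature set is
# # scored with cross-validation; a cheaper splitter than LeavePOut(p=3), e.g.
# # KFold, keeps the search tractable on larger data.
from sklearn.model_selection import cross_val_score

def greedy_forward_selection(clf, X_features, y_labels, cv, max_features=5):
    selected = []
    remaining = list(range(X_features.shape[1]))
    while remaining and len(selected) < max_features:
        # score every remaining feature added to the current selection
        fold_scores = {}
        for feat in remaining:
            cols = selected + [feat]
            fold_scores[feat] = cross_val_score(
                clf, X_features.iloc[:, cols], y_labels, cv=cv).mean()
        # keep the feature whose combination scored highest this iteration
        best = max(fold_scores, key=fold_scores.get)
        selected.append(best)
        remaining.remove(best)
    return selected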
Esempio n. 30
0
# example with KFold cross validation
from sklearn.model_selection import KFold
crossval_method = KFold(n_splits=3)
crossvalidated = cross_validate(classifier,
                                 donnee.loc[:, donnee.columns != "target"],
                                 donnee.target,                                
                                 cv = crossval_method)
crossvalidated.get("test_score").mean()

# run one of these and then run crossvalidated at the end
from sklearn.model_selection import RepeatedKFold
crossval_method = RepeatedKFold(n_splits=2, n_repeats=2)
from sklearn.model_selection import LeaveOneOut
crossval_method = LeaveOneOut()
from sklearn.model_selection import LeavePOut
crossval_method = LeavePOut(p = 1)
from sklearn.model_selection import ShuffleSplit
crossval_method = ShuffleSplit(n_splits=3, test_size=0.3)
from sklearn.model_selection import StratifiedKFold
crossval_method = StratifiedKFold(n_splits=3)

crossvalidated = cross_validate(classifier,
                                 donnee.loc[:, donnee.columns != "target"],
                                 donnee.target,                                
                                 cv = crossval_method)
crossvalidated.get("test_score").mean()

# see also
# from sklearn.model_selection import GroupKFold, LeaveOneGroupOut, LeavePGroupsOut, GroupShuffleSplit, TimeSeriesSplit

# From now on, with cross-validation, the train and test folds are built from X_train and y_train; X_test and y_test then serve as the final validation set.
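# A minimal sketch of that workflow, assuming X_train, y_train, X_test, y_test
# come from an earlier train_test_split on donnee: cross-validate on the
# training portion only, then score the fitted model once on the held-out part.
crossval_method = StratifiedKFold(n_splits=3)
crossvalidated = cross_validate(classifier,
                                 X_train, y_train,
                                 cv = crossval_method)
print(crossvalidated.get("test_score").mean())
classifier.fit(X_train, y_train)
print(classifier.score(X_test, y_test))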