Code Example #1
File: lgo.py  Project: aatapa/RLScore
def lgo_core(X, y, groups, regparam):
    logo = LeaveOneGroupOut()
    rls = RLS(X, y, regparam=regparam, kernel="GaussianKernel", gamma=0.01)
    errors = []
    for train, test in logo.split(X, y, groups=groups):
        # RLScore computes the hold-out predictions in closed form, without re-training
        p = rls.holdout(test)
        e = sqerror(y[test], p)
        errors.append(e)
    return np.mean(errors)
Code Example #2
File: lgo.py  Project: aatapa/RLScore
def lgo_sklearn(X, y, groups, regparam):
    logo = LeaveOneGroupOut()
    errors = []
    for train, test in logo.split(X, y, groups=groups):
        # scikit-learn re-fits the model from scratch for every fold;
        # alpha plays the same role as RLScore's regparam
        rls = KernelRidge(alpha=regparam, kernel="rbf", gamma=0.01)
        rls.fit(X[train], y[train])
        p = rls.predict(X[test])
        e = sqerror(y[test], p)
        errors.append(e)
    return np.mean(errors)
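A minimal usage sketch for the two functions above, on synthetic data (the RLScore import paths follow that project's documentation; the data and regparam value are hypothetical):

import numpy as np
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.kernel_ridge import KernelRidge
from rlscore.learner import RLS
from rlscore.measure import sqerror

np.random.seed(0)
X = np.random.randn(60, 5)
y = X @ np.random.randn(5) + 0.1 * np.random.randn(60)
groups = np.repeat(np.arange(6), 10)  # six groups of ten samples each

# compare RLScore's closed-form hold-out against sklearn's explicit re-fitting
print(lgo_core(X, y, groups, regparam=1.0))
print(lgo_sklearn(X, y, groups, regparam=1.0))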
Code Example #3
File: notebook008.py  Project: cemsbr/phd_notebook
def get_pred_cv(lm, x, y_true, groups, use_logs):
    cv = LeaveOneGroupOut()
    y_pred = pd.Series(dtype=float)
    for train_ix, test_ix in cv.split(x, y_true, groups):
        x_train = x.iloc[train_ix]
        y_train = y_true.iloc[train_ix]
        x_test = x.iloc[test_ix]
        if use_logs:
            # fit and predict in log space, then transform back
            lm.fit(np.log(x_train), np.log(y_train))
            arr = np.exp(lm.predict(np.log(x_test)))
        else:
            lm.fit(x_train, y_train)
            arr = lm.predict(x_test)
        s = pd.Series(arr, index=x_test.index)
        # Series.append was removed in pandas 2.0; pd.concat([y_pred, s]) is the modern equivalent
        y_pred = y_pred.append(s, verify_integrity=True)
    return y_pred.sort_index()
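A minimal usage sketch for get_pred_cv (hypothetical data and model; assumes pandas, numpy, and scikit-learn):

import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression

x = pd.DataFrame({'workers': [1, 2, 4, 8] * 2,
                  'input_mb': [100] * 4 + [200] * 4})
y_true = pd.Series([10.0, 6.0, 3.5, 2.0, 19.0, 11.0, 6.5, 3.8])
groups = [0, 0, 0, 0, 1, 1, 1, 1]  # one group per experiment run

y_pred = get_pred_cv(LinearRegression(), x, y_true, groups, use_logs=True)
print(y_pred)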
Code Example #4
File: test_time_gen.py  Project: nfoti/mne-python
def test_generalization_across_time():
    """Test time generalization decoding."""
    from sklearn.svm import SVC
    # KernelRidge is used for testing 1) regression analyses 2) n-dimensional
    # predictions.
    from sklearn.kernel_ridge import KernelRidge
    from sklearn.preprocessing import LabelEncoder
    from sklearn.metrics import roc_auc_score, mean_squared_error

    epochs = make_epochs()
    y_4classes = np.hstack((epochs.events[:7, 2], epochs.events[7:, 2] + 1))
    if check_version('sklearn', '0.18'):
        from sklearn.model_selection import (KFold, StratifiedKFold,
                                             ShuffleSplit, LeaveOneGroupOut)
        cv = LeaveOneGroupOut()
        cv_shuffle = ShuffleSplit()
        # XXX we cannot pass any other parameters than X and y to cv.split
        # so we have to build it beforehand
        cv_lolo = [(train, test) for train, test in cv.split(
                   y_4classes, y_4classes, y_4classes)]

        # With sklearn >= 0.17, `clf` can be identified as a regressor, and
        # the scoring metrics can therefore be automatically assigned.
        scorer_regress = None
    else:
        from sklearn.cross_validation import (KFold, StratifiedKFold,
                                              ShuffleSplit, LeaveOneLabelOut)
        cv_shuffle = ShuffleSplit(len(epochs))
        cv_lolo = LeaveOneLabelOut(y_4classes)

        # With sklearn < 0.17, `clf` cannot be identified as a regressor, and
        # therefore the scoring metrics cannot be automatically assigned.
        scorer_regress = mean_squared_error
    # Test default running
    with warnings.catch_warnings(record=True):  # dep
        gat = GeneralizationAcrossTime(picks='foo')
    assert_equal("<GAT | no fit, no prediction, no score>", "%s" % gat)
    assert_raises(ValueError, gat.fit, epochs)
    with warnings.catch_warnings(record=True):
        # check classic fit + check manual picks
        gat.picks = [0]
        gat.fit(epochs)
        # check optional y as array
        gat.picks = None
        gat.fit(epochs, y=epochs.events[:, 2])
        # check optional y as list
        gat.fit(epochs, y=epochs.events[:, 2].tolist())
    assert_equal(len(gat.picks_), len(gat.ch_names), 1)
    assert_equal("<GAT | fitted, start : -0.200 (s), stop : 0.499 (s), no "
                 "prediction, no score>", '%s' % gat)
    assert_equal(gat.ch_names, epochs.ch_names)
    # test different predict function:
    with warnings.catch_warnings(record=True):  # dep
        gat = GeneralizationAcrossTime(predict_method='decision_function')
    gat.fit(epochs)
    # With classifier, the default cv is StratifiedKFold
    assert_true(gat.cv_.__class__ == StratifiedKFold)
    gat.predict(epochs)
    assert_array_equal(np.shape(gat.y_pred_), (15, 15, 14, 1))
    gat.predict_method = 'predict_proba'
    gat.predict(epochs)
    assert_array_equal(np.shape(gat.y_pred_), (15, 15, 14, 2))
    gat.predict_method = 'foo'
    assert_raises(NotImplementedError, gat.predict, epochs)
    gat.predict_method = 'predict'
    gat.predict(epochs)
    assert_array_equal(np.shape(gat.y_pred_), (15, 15, 14, 1))
    assert_equal("<GAT | fitted, start : -0.200 (s), stop : 0.499 (s), "
                 "predicted 14 epochs, no score>",
                 "%s" % gat)
    gat.score(epochs)
    assert_true(gat.scorer_.__name__ == 'accuracy_score')
    # check clf / predict_method combinations for which the scoring metrics
    # cannot be inferred.
    gat.scorer = None
    gat.predict_method = 'decision_function'
    assert_raises(ValueError, gat.score, epochs)
    # Check specifying y manually
    gat.predict_method = 'predict'
    gat.score(epochs, y=epochs.events[:, 2])
    gat.score(epochs, y=epochs.events[:, 2].tolist())
    assert_equal("<GAT | fitted, start : -0.200 (s), stop : 0.499 (s), "
                 "predicted 14 epochs,\n scored "
                 "(accuracy_score)>", "%s" % gat)
    with warnings.catch_warnings(record=True):
        gat.fit(epochs, y=epochs.events[:, 2])

    old_mode = gat.predict_mode
    gat.predict_mode = 'super-foo-mode'
    assert_raises(ValueError, gat.predict, epochs)
    gat.predict_mode = old_mode

    gat.score(epochs, y=epochs.events[:, 2])
    assert_true("accuracy_score" in '%s' % gat.scorer_)
    epochs2 = epochs.copy()

    # check _DecodingTime class
    assert_equal("<DecodingTime | start: -0.200 (s), stop: 0.499 (s), step: "
                 "0.050 (s), length: 0.050 (s), n_time_windows: 15>",
                 "%s" % gat.train_times_)
    assert_equal("<DecodingTime | start: -0.200 (s), stop: 0.499 (s), step: "
                 "0.050 (s), length: 0.050 (s), n_time_windows: 15 x 15>",
                 "%s" % gat.test_times_)

    # the y-check
    gat.predict_mode = 'mean-prediction'
    epochs2.events[:, 2] += 10
    gat_ = copy.deepcopy(gat)
    with use_log_level('error'):
        assert_raises(ValueError, gat_.score, epochs2)
    gat.predict_mode = 'cross-validation'

    # Test basics
    # --- number of trials
    assert_true(gat.y_train_.shape[0] ==
                gat.y_true_.shape[0] ==
                len(gat.y_pred_[0][0]) == 14)
    # ---  number of folds
    assert_true(np.shape(gat.estimators_)[1] == gat.cv)
    # ---  length training size
    assert_true(len(gat.train_times_['slices']) == 15 ==
                np.shape(gat.estimators_)[0])
    # ---  length testing sizes
    assert_true(len(gat.test_times_['slices']) == 15 ==
                np.shape(gat.scores_)[0])
    assert_true(len(gat.test_times_['slices'][0]) == 15 ==
                np.shape(gat.scores_)[1])

    # Test score_mode
    gat.score_mode = 'foo'
    assert_raises(ValueError, gat.score, epochs)
    gat.score_mode = 'fold-wise'
    scores = gat.score(epochs)
    assert_array_equal(np.shape(scores), [15, 15, 5])
    gat.score_mode = 'mean-sample-wise'
    scores = gat.score(epochs)
    assert_array_equal(np.shape(scores), [15, 15])
    gat.score_mode = 'mean-fold-wise'
    scores = gat.score(epochs)
    assert_array_equal(np.shape(scores), [15, 15])
    gat.predict_mode = 'mean-prediction'
    with warnings.catch_warnings(record=True) as w:
        gat.score(epochs)
        assert_true(any("score_mode changed from " in str(ww.message)
                        for ww in w))

    # Test longer time window
    with warnings.catch_warnings(record=True):  # dep
        gat = GeneralizationAcrossTime(train_times={'length': .100})
    with warnings.catch_warnings(record=True):
        gat2 = gat.fit(epochs)
    assert_true(gat is gat2)  # return self
    assert_true(hasattr(gat2, 'cv_'))
    assert_true(gat2.cv_ != gat.cv)
    with warnings.catch_warnings(record=True):  # not vectorizing
        scores = gat.score(epochs)
    assert_true(isinstance(scores, np.ndarray))  # type check
    assert_equal(len(scores[0]), len(scores))  # shape check
    assert_equal(len(gat.test_times_['slices'][0][0]), 2)
    # Decim training steps
    with warnings.catch_warnings(record=True):  # dep
        gat = GeneralizationAcrossTime(train_times={'step': .100})
    with warnings.catch_warnings(record=True):
        gat.fit(epochs)
    gat.score(epochs)
    assert_true(len(gat.scores_) == len(gat.estimators_) == 8)  # training time
    assert_equal(len(gat.scores_[0]), 15)  # testing time

    # Test start stop training & test cv without n_fold params
    y_4classes = np.hstack((epochs.events[:7, 2], epochs.events[7:, 2] + 1))
    train_times = dict(start=0.090, stop=0.250)
    with warnings.catch_warnings(record=True):  # dep
        gat = GeneralizationAcrossTime(cv=cv_lolo, train_times=train_times)
    # predict without fit
    assert_raises(RuntimeError, gat.predict, epochs)
    with warnings.catch_warnings(record=True):
        gat.fit(epochs, y=y_4classes)
    gat.score(epochs)
    assert_equal(len(gat.scores_), 4)
    assert_equal(gat.train_times_['times'][0], epochs.times[6])
    assert_equal(gat.train_times_['times'][-1], epochs.times[9])

    # Test score without passing epochs & Test diagonal decoding
    with warnings.catch_warnings(record=True):  # dep
        gat = GeneralizationAcrossTime(test_times='diagonal')
    with warnings.catch_warnings(record=True):  # not vectorizing
        gat.fit(epochs)
    assert_raises(RuntimeError, gat.score)
    with warnings.catch_warnings(record=True):  # not vectorizing
        gat.predict(epochs)
    scores = gat.score()
    assert_true(scores is gat.scores_)
    assert_equal(np.shape(gat.scores_), (15, 1))
    assert_array_equal([tim for ttime in gat.test_times_['times']
                        for tim in ttime], gat.train_times_['times'])
    # Test generalization across conditions
    with warnings.catch_warnings(record=True):  # dep
        gat = GeneralizationAcrossTime(predict_mode='mean-prediction', cv=2)
    with warnings.catch_warnings(record=True):
        gat.fit(epochs[0:6])
    with warnings.catch_warnings(record=True):
        # There are some empty test folds because of n_trials
        gat.predict(epochs[7:])
        gat.score(epochs[7:])

    # Test training time parameters
    gat_ = copy.deepcopy(gat)
    # --- start stop outside time range
    gat_.train_times = dict(start=-999.)
    with use_log_level('error'):
        assert_raises(ValueError, gat_.fit, epochs)
    gat_.train_times = dict(start=999.)
    assert_raises(ValueError, gat_.fit, epochs)
    # --- impossible slices
    gat_.train_times = dict(step=.000001)
    assert_raises(ValueError, gat_.fit, epochs)
    gat_.train_times = dict(length=.000001)
    assert_raises(ValueError, gat_.fit, epochs)
    gat_.train_times = dict(length=999.)
    assert_raises(ValueError, gat_.fit, epochs)

    # Test testing time parameters
    # --- outside time range
    gat.test_times = dict(start=-999.)
    with warnings.catch_warnings(record=True):  # no epochs in fold
        assert_raises(ValueError, gat.predict, epochs)
    gat.test_times = dict(start=999.)
    with warnings.catch_warnings(record=True):  # no test epochs
        assert_raises(ValueError, gat.predict, epochs)
    # --- impossible slices
    gat.test_times = dict(step=.000001)
    with warnings.catch_warnings(record=True):  # no test epochs
        assert_raises(ValueError, gat.predict, epochs)
    gat_ = copy.deepcopy(gat)
    gat_.train_times_['length'] = .000001
    gat_.test_times = dict(length=.000001)
    with warnings.catch_warnings(record=True):  # no test epochs
        assert_raises(ValueError, gat_.predict, epochs)
    # --- test time region of interest
    gat.test_times = dict(step=.150)
    with warnings.catch_warnings(record=True):  # not vectorizing
        gat.predict(epochs)
    assert_array_equal(np.shape(gat.y_pred_), (15, 5, 14, 1))
    # --- silly value
    gat.test_times = 'foo'
    with warnings.catch_warnings(record=True):  # no test epochs
        assert_raises(ValueError, gat.predict, epochs)
    assert_raises(RuntimeError, gat.score)
    # --- unmatched length between training and testing time
    gat.test_times = dict(length=.150)
    assert_raises(ValueError, gat.predict, epochs)
    # --- irregular length training and testing times
    # 2 estimators, the first one is trained on two successive time samples
    # whereas the second one is trained on a single time sample.
    train_times = dict(slices=[[0, 1], [1]])
    # The first estimator is tested once, the second estimator is tested on
    # two successive time samples.
    test_times = dict(slices=[[[0, 1]], [[0], [1]]])
    with warnings.catch_warnings(record=True):  # dep
        gat = GeneralizationAcrossTime(train_times=train_times,
                                       test_times=test_times)
    gat.fit(epochs)
    with warnings.catch_warnings(record=True):  # not vectorizing
        gat.score(epochs)
    assert_array_equal(np.shape(gat.y_pred_[0]), [1, len(epochs), 1])
    assert_array_equal(np.shape(gat.y_pred_[1]), [2, len(epochs), 1])
    # check that testing times cannot be automatically inferred for ad hoc training times
    gat.test_times = None
    assert_raises(ValueError, gat.predict, epochs)

    svc = SVC(C=1, kernel='linear', probability=True)
    with warnings.catch_warnings(record=True):  # dep
        gat = GeneralizationAcrossTime(clf=svc, predict_mode='mean-prediction')
    with warnings.catch_warnings(record=True):
        gat.fit(epochs)

    # sklearn needs it: c.f.
    # https://github.com/scikit-learn/scikit-learn/issues/2723
    # and http://bit.ly/1u7t8UT
    with use_log_level('error'):
        assert_raises(ValueError, gat.score, epochs2)
        gat.score(epochs)
    assert_true(0.0 <= np.min(scores) <= 1.0)
    assert_true(0.0 <= np.max(scores) <= 1.0)

    # Test that an error is raised if cv is not a partition
    with warnings.catch_warnings(record=True):  # dep
        gat = GeneralizationAcrossTime(cv=cv_shuffle,
                                       predict_mode='cross-validation')
    gat.fit(epochs)
    assert_raises(ValueError, gat.predict, epochs)
    with warnings.catch_warnings(record=True):  # dep
        gat = GeneralizationAcrossTime(cv=cv_shuffle,
                                       predict_mode='mean-prediction')
    gat.fit(epochs)
    gat.predict(epochs)

    # Test that an error is raised if we train on one dataset, test on another,
    # and don't specify an appropriate cv:
    with warnings.catch_warnings(record=True):  # dep
        gat = GeneralizationAcrossTime()
    gat.fit(epochs)
    with warnings.catch_warnings(record=True):
        gat.fit(epochs)

    gat.predict(epochs)
    assert_raises(ValueError, gat.predict, epochs[:10])

    # Make CV with some empty train and test folds:
    # --- empty test fold(s) should warn when gat.predict()
    gat._cv_splits[0] = [gat._cv_splits[0][0], np.empty(0)]
    with warnings.catch_warnings(record=True) as w:
        gat.predict(epochs)
        assert_true(len(w) > 0)
        assert_true(any('do not have any test epochs' in str(ww.message)
                        for ww in w))
    # --- empty train fold(s) should raise when gat.fit()
    with warnings.catch_warnings(record=True):  # dep
        gat = GeneralizationAcrossTime(cv=[([0], [1]), ([], [0])])
    assert_raises(ValueError, gat.fit, epochs[:2])

    # Check that it still works with a classifier that outputs y_pred with
    # shape = (n_trials, 1) instead of (n_trials,)
    if check_version('sklearn', '0.17'):  # no is_regressor before v0.17
        with warnings.catch_warnings(record=True):  # dep
            gat = GeneralizationAcrossTime(clf=KernelRidge(), cv=2)
        epochs.crop(None, epochs.times[2])
        gat.fit(epochs)
        # With regression the default cv is KFold and not StratifiedKFold
        assert_true(gat.cv_.__class__ == KFold)
        gat.score(epochs)
        # with regression the default scoring metrics is mean squared error
        assert_true(gat.scorer_.__name__ == 'mean_squared_error')

    # Test combinations of complex scenarios
    # 2 or more distinct classes
    n_classes = [2, 4]  # 4 tested
    # nicely ordered labels or not
    le = LabelEncoder()
    y = le.fit_transform(epochs.events[:, 2])
    y[len(y) // 2:] += 2
    ys = (y, y + 1000)
    # Univariate and multivariate prediction
    svc = SVC(C=1, kernel='linear', probability=True)
    reg = KernelRidge()

    def scorer_proba(y_true, y_pred):
        return roc_auc_score(y_true, y_pred[:, 0])

    # We are testing 3 scenarios: default, classifier + predict_proba, regressor
    scorers = [None, scorer_proba, scorer_regress]
    predict_methods = [None, 'predict_proba', None]
    clfs = [svc, svc, reg]
    # Test all combinations
    for clf, predict_method, scorer in zip(clfs, predict_methods, scorers):
        for y in ys:
            for n_class in n_classes:
                for predict_mode in ['cross-validation', 'mean-prediction']:
                    # Cannot use AUC for n_class > 2
                    if (predict_method == 'predict_proba' and n_class != 2):
                        continue

                    y_ = y % n_class

                    with warnings.catch_warnings(record=True):
                        gat = GeneralizationAcrossTime(
                            cv=2, clf=clf, scorer=scorer,
                            predict_mode=predict_mode)
                        gat.fit(epochs, y=y_)
                        gat.score(epochs, y=y_)

                    # Check that scorer is correctly defined manually and
                    # automatically.
                    scorer_name = gat.scorer_.__name__
                    if scorer is None:
                        if is_classifier(clf):
                            assert_equal(scorer_name, 'accuracy_score')
                        else:
                            assert_equal(scorer_name, 'mean_squared_error')
                    else:
                        assert_equal(scorer_name, scorer.__name__)
Code Example #5
P_I_pairs = []
for eachLine in dat_groups_obj:
    ret = re.findall(r'\d+', eachLine)
    if ret:  # findall returns a (possibly empty) list, never None
        P_I_pairs.append((int(ret[0]), int(ret[1]), int(ret[1])))
P_I_pairs = np.asarray(P_I_pairs)
groups = [a[0] for a in P_I_pairs]

X_sparse, y = load_svmlight_file(dat_file_name)
X_dense = X_sparse.todense()

# remove the feature in column 6 (commented out)
#x1, x2, x3 = np.hsplit(X_dense, [5, 6])
#X = np.hstack((x1, x3))
X = X_dense

logo = LeaveOneGroupOut()

#regr_dct = MLPRegressor(hidden_layer_sizes=(80,60, 30, 30, 20, 20, 20), activation='logistic', solver='adam', learning_rate_init=0.0001, max_iter=500,random_state=1)
regr_dct = MLPRegressor(hidden_layer_sizes=(4,),
                        activation='tanh',
                        solver='adam')
#regr_dct = MLPRegressor(hidden_layer_sizes=(100, 80,60), activation='logistic', solver='adam')
#regr_dct = tree.DecisionTreeRegressor()
#regr_dct = ensemble.RandomForestRegressor(n_estimators=20)
#regr_dct = gaussian_process.GaussianProcessRegressor(kernel=None)
#regr_dct = neighbors.RadiusNeighborsRegressor(radius=1.0)
#regr_dct = svm.SVR(kernel='rbf')

scores_dct = list()

P_scores = list()
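The excerpt stops before the cross-validation loop; a hypothetical continuation, assuming X, y, groups, logo, and regr_dct as defined above (the R2 metric and the contents of P_scores are guesses, not the original author's code):

from sklearn.metrics import r2_score

for train, test in logo.split(X, y, groups=groups):
    # np.asarray: MLPRegressor does not accept np.matrix input
    regr_dct.fit(np.asarray(X[train]), y[train])
    pred = regr_dct.predict(np.asarray(X[test]))
    scores_dct.append(r2_score(y[test], pred))
    P_scores.append((groups[test[0]], scores_dct[-1]))  # (group id, score)

print('mean R2 over held-out groups:', np.mean(scores_dct))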
Code Example #6
svc = SVC(kernel='linear')

# FEATURE SELECTION
from sklearn.feature_selection import SelectPercentile, f_classif

feature_selection = SelectPercentile(f_classif, percentile=10)

from sklearn.pipeline import Pipeline

anova_svc = Pipeline([('anova', feature_selection), ('svc', svc)])
anova_svc.fit(X, y)
y_pred = anova_svc.predict(X)

from sklearn.model_selection import LeaveOneGroupOut, cross_val_score

cv = LeaveOneGroupOut()

# Compute the prediction accuracy for the different folds (i.e. session)

#session = behavioral[condition_mask].to_records(index=False)
#print(session.dtype.names)

# NESTED CROSS VALIDATION
# note: `grid` is not defined in this excerpt (presumably a GridSearchCV built around anova_svc earlier in the script)
nested_cv_scores = cross_val_score(grid, X, y, cv=5)
#cv_scores = cross_val_score(anova_svc, X, conditions,)

# Print the results
print("Nested CV score: %.4f" % np.mean(nested_cv_scores))

# Here is the image
coef = svc.coef_
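The cv object defined above is never used in this excerpt; a minimal sketch of how it could be combined with cross_val_score (session_labels is a hypothetical array with one run/session label per sample):

cv_scores = cross_val_score(anova_svc, X, y, cv=cv, groups=session_labels)
print("LeaveOneGroupOut CV score: %.4f" % np.mean(cv_scores))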
Code Example #7
import numpy
from keras.models import Sequential
from keras.layers import Dense
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
#import matplotlib.pyplot as plt
from sklearn.model_selection import LeaveOneGroupOut
from keras.utils import np_utils
import keras    
from sklearn.metrics import confusion_matrix
from keras.callbacks import EarlyStopping
from keras.layers import Dropout



logo = LeaveOneGroupOut()
X = numpy.loadtxt("data_for_keras/FC_forecast.csv", delimiter=",")
Y_int = numpy.loadtxt("data_for_keras/label_forecast.csv", delimiter=",")
grp = numpy.loadtxt("data_for_keras/speaker_group.csv", delimiter=",")

F2con = {}   # save the confusion matrix for each speaker case
F2ACC = {}   # save the accuracy for each speaker case
F2WR = {}    # save the weighted accuracy for each speaker case
F2UWR = {}   # save the unweighted accuracy for each speaker case
F=0
for train, test in logo.split(X, Y_int, grp):
    
    callbacks = [EarlyStopping(monitor='val_loss', patience=10)]
    # create model
    F2model = Sequential()
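The example is cut off right after the model is created; a hypothetical continuation of the loop body (layer sizes, epochs, and the evaluation details are illustrative assumptions, not the original author's settings):

    # one-hot encode the labels for a softmax output (could be done once outside the loop)
    Y = np_utils.to_categorical(Y_int)

    F2model.add(Dense(64, input_dim=X.shape[1], activation='relu'))
    F2model.add(Dropout(0.5))
    F2model.add(Dense(Y.shape[1], activation='softmax'))
    F2model.compile(loss='categorical_crossentropy', optimizer='adam',
                    metrics=['accuracy'])

    F2model.fit(X[train], Y[train], epochs=100, batch_size=32,
                validation_split=0.1, callbacks=callbacks, verbose=0)

    y_true = Y_int[test].astype(int)
    y_hat = numpy.argmax(F2model.predict(X[test]), axis=1)

    F2con[F] = confusion_matrix(y_true, y_hat)
    F2ACC[F] = accuracy_score(y_true, y_hat)
    F2WR[F] = recall_score(y_true, y_hat, average='weighted')  # weighted recall
    F2UWR[F] = recall_score(y_true, y_hat, average='macro')    # unweighted recall
    F = F + 1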
Code Example #8
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--C',
                        type=float,
                        help='C parameter of SVM',
                        required=True)
    parser.add_argument('--save_estimator',
                        help='Save the estimator',
                        action='store_true',
                        default=False)

    args = parser.parse_args()
    C = args.C
    save_estimator = args.save_estimator

    np.random.seed(21)

    task_name = __file__.split('/')[-1].split('.')[0]
    print('TASK: {}'.format(task_name))

    os.makedirs(os.path.join(results_root, task_name), exist_ok=True)

    name = 'C_{}.npy'.format(C)
    if os.path.exists(os.path.join(results_root, task_name, name)):
        print('{} already exists, skipping...'.format(task_name))
        return 1

    ff_list = []
    y_list = []
    y_logo_list = []
    # Loading Features
    for dataset_name, _ in dataset_ext.items():
        y_same_param = []
        y_logo_same_param = []
        feature_path = glob.glob(
            os.path.join(cooccurrences_root, '{}.npy'.format(dataset_name)))[0]
        dataset_logo_label = dataset_label[dataset_name]

        ff = np.load(feature_path)

        if '_orig' in dataset_name:
            y_same_param += [0] * len(ff)
            y_logo_same_param += [dataset_logo_label] * len(ff)
        elif '_gan' in dataset_name:
            y_same_param += [1] * len(ff)
            y_logo_same_param += [dataset_logo_label] * len(ff)

        ff_list += [ff]
        y_list += y_same_param
        y_logo_list += y_logo_same_param

    ff_list = np.concatenate(ff_list, axis=0)
    y_list = np.array(y_list)
    y_logo_list = np.array(y_logo_list)

    # Subsampling
    sub_idx = np.random.choice(np.arange(len(ff_list)), int(len(ff_list) // 3))

    X = ff_list[sub_idx]
    y = y_list[sub_idx]
    y_logo = y_logo_list[sub_idx]

    # Shuffling training set
    shuffle_idx = np.arange(len(y))
    np.random.shuffle(shuffle_idx)
    X = X[shuffle_idx]
    y = y[shuffle_idx]
    y_logo = y_logo[shuffle_idx]

    # Normalize samples
    scaler = StandardScaler()
    X = scaler.fit_transform(X)

    print('C: {}, Total {} samples, Leave-One-Group-Out cv. Feature size: {}'.
          format(C, X.shape[0], X.shape[1]))

    # Create model
    model = LinearSVC(dual=False, C=C)

    # LOGO cv policy
    logo = LeaveOneGroupOut()

    cv = cross_validate(estimator=model,
                        X=X,
                        y=y,
                        groups=y_logo,
                        scoring='balanced_accuracy',
                        cv=logo,
                        verbose=2,
                        return_estimator=save_estimator,
                        n_jobs=-1)

    if save_estimator:
        result_data = {
            'acc': cv['test_score'],
            'estimator': cv['estimator'],
            'X': X,
            'y': y,
            'y_logo': y_logo
        }
    else:
        result_data = {'acc': cv['test_score']}

    np.save(os.path.join(results_root, task_name, name), result_data)

    del X, y, y_logo, model, cv

    return 0
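Since np.save is used above to store a Python dict, reading the results back needs allow_pickle; a minimal sketch (the path is a placeholder):

import numpy as np

result_data = np.load('results_root/task_name/C_0.1.npy', allow_pickle=True).item()
print('mean balanced accuracy:', np.mean(result_data['acc']))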
Code Example #9
    def fit(self, X, y, groups=None):
        """Fit the decoder (learner).

        Parameters
        ----------
        X: list of Niimg-like objects
            See http://nilearn.github.io/manipulating_images/input_output.html
            Data on which model is to be fitted. If this is a list,
            the affine is considered the same for all.

        y: numpy.ndarray of shape=(n_samples) or list of length n_samples
            The dependent variable (age, sex, IQ, yes/no, etc.).
            Target variable to predict. Must have exactly as many elements as
            3D images in niimg.

        groups: None
            Group labels for the samples used while splitting the dataset into
            train/test set. Default None.

            Note that this parameter must be specified in some scikit-learn
            cross-validation generators to calculate the number of splits, e.g.
            sklearn.model_selection.LeaveOneGroupOut or
            sklearn.model_selection.LeavePGroupsOut.

            For more details see
            https://scikit-learn.org/stable/modules/cross_validation.html#cross-validation-iterators-for-grouped-data

        Attributes
        ----------
        `masker_`: instance of NiftiMasker or MultiNiftiMasker
            The NiftiMasker used to mask the data.

        `mask_img_`: Nifti1Image
            Mask computed by the masker object.

        `classes_`: numpy.ndarray
            Classes to predict. For classification only.

        `screening_percentile_`: float
            Screening percentile corrected according to volume of mask,
            relative to the volume of standard brain.

        `coef_`: numpy.ndarray, shape=(n_classes, n_features)
            Contains the mean of the model's weight vector across
            folds for each class.

        `coef_img_`: dict of Nifti1Image
            Dictionary containing `coef_` with class names as keys,
            and `coef_` transformed in Nifti1Images as values. In the case of
            a regression, it contains a single Nifti1Image at the key 'beta'.

        `intercept_`: ndarray, shape (n_classes,)
            Intercept (a.k.a. bias) added to the decision function.

        `cv_`: list of pairs of lists
            List of the (n_folds,) folds. For the corresponding fold,
            each pair is composed of two lists of indices,
            one for the train samples and one for the test samples.

        `std_coef_`: numpy.ndarray, shape=(n_classes, n_features)
            Contains the standard deviation of the model's weight vector across
            folds for each class. Note that folds are not independent, see
            https://scikit-learn.org/stable/modules/cross_validation.html#cross-validation-iterators-for-grouped-data

        `std_coef_img_`: dict of Nifti1Image
            Dictionary containing `std_coef_` with class names as keys,
            and `std_coef_` transformed in Nifti1Images as values. In the case of
            a regression, it contains a single Nifti1Image at the key 'beta'.

        `cv_params_`: dict of lists
            Best point in the parameter grid for each tested fold
            in the inner cross validation loop.

        `cv_scores_`: dict, (classes, n_folds)
            Scores (misclassification) for each parameter, and on each fold
        """
        self.estimator = _check_estimator(self.estimator)
        self.memory_ = _check_memory(self.memory, self.verbose)

        X = self._apply_mask(X)
        X, y = check_X_y(X, y, dtype=np.float64, multi_output=True)

        # Setup scorer
        scorer = check_scoring(self.estimator, self.scoring)

        # Setup cross-validation object. Default is StratifiedKFold when groups
        # is None. If groups is specified but self.cv is not set to custom CV
        # splitter, default is LeaveOneGroupOut. If self.cv is manually set to
        # a CV splitter object do check_cv regardless of groups parameter.
        cv = self.cv
        if (isinstance(cv, int) or cv is None) and groups is not None:
            warnings.warn('groups parameter is specified but '
                          'cv parameter is not set to custom CV splitter. '
                          'Using default object LeaveOneGroupOut().')
            cv_object = LeaveOneGroupOut()
        else:
            cv_object = check_cv(cv, y=y, classifier=self.is_classification)

        self.cv_ = list(cv_object.split(X, y, groups=groups))

        # Define the number of problems to solve. In case of classification this
        # number corresponds to the number of binary problems to solve
        if self.is_classification:
            y = self._binarize_y(y)
        else:
            y = y[:, np.newaxis]
        if self.is_classification and self.n_classes_ > 2:
            n_problems = self.n_classes_
        else:
            n_problems = 1

        # Return a suitable screening percentile according to the mask image
        self.screening_percentile_ = _adjust_screening_percentile(
            self.screening_percentile, self.mask_img_, verbose=self.verbose)

        parallel = Parallel(n_jobs=self.n_jobs, verbose=2 * self.verbose)

        parallel_fit_outputs = parallel(
            delayed(self._cache(_parallel_fit))
            (self.estimator, X, y[:, c], train, test, self.param_grid,
             self.is_classification, scorer, self.mask_img_, c,
             self.screening_percentile_)
            for c, (train,
                    test) in itertools.product(range(n_problems), self.cv_))

        coefs, intercepts = self._fetch_parallel_fit_outputs(
            parallel_fit_outputs, y, n_problems)

        # Build the final model (the aggregated one)
        self.coef_ = np.vstack([
            np.mean(coefs[class_index], axis=0)
            for class_index in self.classes_
        ])
        self.std_coef_ = np.vstack([
            np.std(coefs[class_index], axis=0) for class_index in self.classes_
        ])
        self.intercept_ = np.hstack([
            np.mean(intercepts[class_index], axis=0)
            for class_index in self.classes_
        ])

        self.coef_img_, self.std_coef_img_ = self._output_image(
            self.classes_, self.coef_, self.std_coef_)

        if self.is_classification and (self.n_classes_ == 2):
            self.coef_ = self.coef_[0, :][np.newaxis, :]
            self.intercept_ = self.intercept_[0]
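This fit method appears to come from nilearn's Decoder; a minimal, hypothetical usage sketch (mask_img, func_imgs, conditions, and session_labels are placeholders for real data):

from nilearn.decoding import Decoder
from sklearn.model_selection import LeaveOneGroupOut

decoder = Decoder(estimator='svc', mask=mask_img, cv=LeaveOneGroupOut(),
                  standardize=True)
# one session label per 3D image, so each fold leaves out a whole session
decoder.fit(func_imgs, conditions, groups=session_labels)
print(decoder.cv_scores_)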
Code Example #10
X_vect = vectorizer.fit_transform(transcript_lem_list)

#TF IDF
tfidf = TfidfTransformer()

X_tfidf = tfidf.fit_transform(X_vect)

# Regressors
# Note: rebinding Ridge and SVR shadows the scikit-learn classes of the same names;
# criterion='mse' was renamed to 'squared_error' in scikit-learn 1.0
Ridge = Ridge(alpha=10)
RF = RandomForestRegressor(criterion='mse',
                           max_features='sqrt',
                           random_state=42)
SVR = LinearSVR(C=0.5)

#Leave-One-Interviewer-Out cross-validation
LOGO = LeaveOneGroupOut()
groups = df_labels.Group.values

X_cv = X_tfidf.toarray()

#np_gender = df_labels['Gender'].to_numpy().reshape(-1,1)
#X_cv_gender = np.concatenate((X_cv,np_gender),axis=1)

y_cv = df_labels['Label'].values

RMSE_list_Rid = []
RMSE_list_RF = []
RMSE_list_SVR = []

for train_index, test_index in LOGO.split(X_cv, y_cv, groups):
    X_train, X_test = X_cv[train_index], X_cv[test_index]
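The excerpt ends inside the cross-validation loop; a hypothetical continuation (assumes numpy as np and sklearn.metrics.mean_squared_error are imported, and that each model's RMSE is what the three lists are meant to collect):

    y_train, y_test = y_cv[train_index], y_cv[test_index]

    for model, rmse_list in [(Ridge, RMSE_list_Rid),
                             (RF, RMSE_list_RF),
                             (SVR, RMSE_list_SVR)]:
        model.fit(X_train, y_train)
        pred = model.predict(X_test)
        rmse_list.append(np.sqrt(mean_squared_error(y_test, pred)))

print('Ridge RMSE: %.3f' % np.mean(RMSE_list_Rid))
print('RF    RMSE: %.3f' % np.mean(RMSE_list_RF))
print('SVR   RMSE: %.3f' % np.mean(RMSE_list_SVR))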
Code Example #11
def main():
    parser = argparse.ArgumentParser()
    # Names, paths, logs
    parser.add_argument(
        '--dataset_path',
        default=f'/home/ICT2000/jondras/datasets/mimicry/segmented_datasets',
        help=
        'path prefix to dataset directory (excludes the suffix with dataset version e.g. "_v0")'
    )
    parser.add_argument('--dataset_version',
                        default=f'v3',
                        help='version of the dataset (v0|v1|v2|v3)')
    parser.add_argument(
        '--logger_path',
        default=
        '/home/ICT2000/jondras/deep-virtual-rapport-agent/rapport_model/logs/1569294550_multimodal_multimodal-base-classifier_nod',
        help='path to logging directory containing the final model')
    # Data parameters
    parser.add_argument(
        '--sequence_length',
        default=32,
        type=int,
        help='maximum length of feature sequences (i.e. window size)')
    # Training and optimization
    parser.add_argument(
        '--loso_cross_validation',
        default=False,
        help=
        'load model from subject-independent (leave-one-subject-out (LOSO)) cross-validation'
    )
    parser.add_argument(
        '--fold_num',
        default=10,
        type=int,
        help=
        'number of folds, relevant only for subject-dependent cross-validation'
    )
    parser.add_argument('--gpu_id',
                        default=0,
                        type=int,
                        help='ID of a GPU to use (0|1|2|3)')
    opt = parser.parse_args()

    # Add derived/additional options
    opt.dataset_path = f'{opt.dataset_path}_{opt.dataset_version}'
    # Path to the dataset file with metadata and labels for each sequence
    opt.dataset_file_path = os.path.join(
        opt.dataset_path, f'metadata_labels_{opt.sequence_length}ws.csv')

    # Use the specified GPU
    # os.environ["CUDA_VISIBLE_DEVICES"] = str(opt.gpu_id)
    torch.cuda.set_device(int(opt.gpu_id))

    # Set up stdout logger (for stdout and stderr)
    os.makedirs(opt.logger_path, exist_ok=True)
    logging.basicConfig(filename=os.path.join(opt.logger_path, f'test.log'),
                        level=logging.INFO,
                        format='%(asctime)s %(levelname)s ==> %(message)s')
    sys.stdout = DualLogger('stdout')
    sys.stderr = DualLogger('stderr')

    # Print all args/options/settings
    print(
        f'{ConsoleColors.CC_GREY}\nTraining and validating models{ConsoleColors.CC_END}'
    )
    for arg in vars(opt):
        print(
            f'{arg}={ConsoleColors.CC_YELLOW}{str(getattr(opt, arg))}{ConsoleColors.CC_END}'
        )

    # Read metadata+labels file
    metadata_labels = pd.read_csv(opt.dataset_file_path)  #[:100]
    sequence_ids = metadata_labels['sequence_id'].tolist()

    ids = {}
    fold = 0

    # Load model from subject-independent (leave-one-subject-out (LOSO)) cross-validation
    if opt.loso_cross_validation:
        # Use listener subject id for subject-independent (leave-one-subject-out (LOSO)) cross-validation, since the
        # listener is the target subject
        subject_ids = metadata_labels['listener_sid'].tolist()
        opt.fold_num = len(np.unique(subject_ids))
        cross_validator = LeaveOneGroupOut()
        cross_validator_splits = cross_validator.split(sequence_ids,
                                                       groups=subject_ids)
    # Otherwise, load model from subject-dependent k-fold cross-validation
    else:
        cross_validator = KFold(n_splits=opt.fold_num, shuffle=True)
        cross_validator_splits = cross_validator.split(sequence_ids)

    metrics = dict()

    # Cross-validation loop: test on each fold
    for _, test_idx in cross_validator_splits:
        fold_start_time = time.time()
        print(
            f'\n{ConsoleColors.CC_BOLD}{ConsoleColors.CC_GREEN}fold:{ConsoleColors.CC_END} {fold + 1}/{opt.fold_num}'
        )

        ids['test'] = [sequence_ids[x] for x in test_idx]

        # Checkpoint
        checkpoint = torch.load(os.path.join(opt.logger_path,
                                             f'fold_{fold + 1:02d}',
                                             'model_best.pth.tar'),
                                map_location=device)
        print(
            f'\t{ConsoleColors.CC_BOLD}best_epoch:{ConsoleColors.CC_END} {checkpoint["epoch"]} \t '
            f'{ConsoleColors.CC_BOLD}best_monitored_metric:{ConsoleColors.CC_END} '
            f'{checkpoint["best_monitored_metric"]}')
        checkpoint_opt = checkpoint['opt']
        # Dataset partitions that will be generated
        checkpoint_opt.dataset_partitions = ['test']

        # Check whether checkpoint options agree with the current (specified) options
        assert checkpoint_opt.sequence_length == opt.sequence_length, f'The specified sequence length does not match the checkpoint one ({checkpoint_opt.sequence_length})!'
        assert checkpoint_opt.loso_cross_validation == opt.loso_cross_validation, f'The specified type of crossvalidation does not match the checkpoint one ({checkpoint_opt.loso_cross_validation})!'
        assert checkpoint_opt.fold_num == opt.fold_num, f'The specified number of folds does not match the checkpoint one ({checkpoint_opt.fold_num})!'

        # Data loaders
        if checkpoint_opt.modality == 'speech' or checkpoint_opt.modality == 'vision':
            if checkpoint_opt.model_type == 'unimodal-base-classifier' \
                    or checkpoint_opt.model_type == 'unimodal-tcn-classifier':
                loaders = get_unimodal_base_dataset_loaders(
                    metadata_labels=metadata_labels,
                    ids=ids,
                    opt=checkpoint_opt)
            else:
                print(
                    f'Data loader is not implemented for {checkpoint_opt.modality} -> {checkpoint_opt.model_type}'
                )

        elif checkpoint_opt.modality == 'multimodal':
            if checkpoint_opt.model_type == 'multimodal-base-classifier' \
                    or checkpoint_opt.model_type == 'multimodal-tcn-classifier':
                loaders = get_multimodal_base_dataset_loaders(
                    metadata_labels=metadata_labels,
                    ids=ids,
                    opt=checkpoint_opt)
            else:
                print(
                    f'Data loader is not implemented for {checkpoint_opt.modality} -> {checkpoint_opt.model_type}'
                )

        else:
            print(
                f'Data loader is not implemented for {checkpoint_opt.modality}'
            )

        # Model
        if checkpoint_opt.modality == 'speech' or checkpoint_opt.modality == 'vision':
            if checkpoint_opt.model_type == 'unimodal-base-classifier':
                model = UnimodalBaseClassifier(opt=checkpoint_opt).cuda()
            elif checkpoint_opt.model_type == 'unimodal-tcn-classifier':
                model = UnimodalTCNClassifier(opt=checkpoint_opt).cuda()
            else:
                print(
                    f'Model is not implemented for {checkpoint_opt.modality} -> {checkpoint_opt.model_type}'
                )

        elif checkpoint_opt.modality == 'multimodal':
            if checkpoint_opt.model_type == 'multimodal-base-classifier':
                model = MultimodalBaseClassifier(opt=checkpoint_opt).cuda()
            elif checkpoint_opt.model_type == 'multimodal-tcn-classifier':
                model = MultimodalTCNClassifier(opt=checkpoint_opt).cuda()
            else:
                print(
                    f'Model is not implemented for {checkpoint_opt.modality} -> {checkpoint_opt.model_type}'
                )

        else:
            print(f'Model is not implemented for {checkpoint_opt.modality}')
        model.load_state_dict(checkpoint['model'])
        model.eval()

        # Get test metrics for this fold
        fold_metrics = test_classifier(loaders['test'], model,
                                       checkpoint_opt.labels_names)
        for label_name in fold_metrics.keys():
            if label_name not in metrics:
                metrics[label_name] = defaultdict(list)
            for metric_name in fold_metrics[label_name].keys():
                metrics[label_name][metric_name].append(
                    fold_metrics[label_name][metric_name])

        fold += 1
        print(
            f'    fold time: {ConsoleColors.CC_YELLOW2}{time_diff(fold_start_time)}{ConsoleColors.CC_END}'
        )

    # Calculate metrics over all folds
    for label_name in metrics.keys():
        print()
        for metric_name in metrics[label_name].keys():
            metrics[label_name][metric_name] = {
                'mean': np.mean(metrics[label_name][metric_name]),
                'std': np.std(metrics[label_name][metric_name])
            }
            print(
                f'- {label_name}\t- {metric_name}:\t {metrics[label_name][metric_name]["mean"]:.4f} '
                f'+/- {metrics[label_name][metric_name]["std"]:.4f}')
Code Example #12
def decode(spike_times, spike_clusters, event_times, event_groups, pre_time=0, post_time=0.5,
           classifier='bayes', cross_validation='kfold', num_splits=5, prob_left=None,
           custom_validation=None, n_neurons='all', iterations=1):
    """
    Use decoding to classify groups of trials (e.g. stim left/right). Classification is done using
    the population vector of summed spike counts from the specified time window. Cross-validation
    is achieved using n-fold cross validation or leave-one-out cross validation. Decoders can
    decode any number of groups. When providing the classifier with an imbalanced dataset (not
    the same number of trials in each group) the chance level will not be 1/groups. In that case,
    to compare the classification performance against chance one has to either determine chance
    level by decoding a shuffled dataset or use the 'auroc' metric as readout (this metric is
    robust against imbalanced datasets).

    Parameters
    ----------
    spike_times : 1D array
        spike times (in seconds)
    spike_clusters : 1D array
        cluster ids corresponding to each event in `spikes`
    event_times : 1D array
        times (in seconds) of the events from the two groups
    event_groups : 1D array
        group identities of the events, can be any number of groups, accepts integers and strings
    pre_time : float
        time (in seconds) preceding the event times
    post_time : float
        time (in seconds) following the event times
    classifier : string
        which decoder to use, options are:
            'bayes'         Naive Bayes
            'forest'        Random forest (with 100 trees)
            'regression'    Logistic regression
            'lda'           Linear Discriminant Analysis
    cross_validation : string
        which cross-validation method to use, options are:
            'none'              No cross-validation
            'kfold'             K-fold cross-validation
            'leave-one-out'     Leave out the trial that is being decoded
            'block'             Leave out the block the to-be-decoded trial is in
            'custom'            Any custom cross-validation provided by the user
    num_splits : integer
        ** only for 'kfold' cross-validation **
        Number of splits to use for k-fold cross validation, a value of 5 means that the decoder
        will be trained on 4/5th of the data and used to predict the remaining 1/5th. This process
        is repeated five times so that all data has been used as both training and test set.
    prob_left : 1D array
        ** only for 'block' cross-validation **
        the probability of the stimulus appearing on the left for each trial in event_times
    custom_validation : generator
        ** only for 'custom' cross-validation **
        a generator object with the splits to be used for cross validation using this format:
            (
                (split1_train_idxs, split1_test_idxs),
                (split2_train_idxs, split2_test_idxs),
                (split3_train_idxs, split3_test_idxs),
             ...)

    Returns
    -------
    results : dict
        dictionary with decoding results

        accuracy : float
            accuracy of the classifier in percentage correct
        f1 : float
            F1 score of the classifier
        auroc : float
            the area under the ROC curve of the classification performance
        confusion_matrix : 2D array
            normalized confusion matrix
        predictions : 2D array with dimensions iterations x trials
            predicted group label for all trials in every iteration
        probabilities : 2D array with dimensions iterations x trials
            classification probability for all trials in every iteration
    """

    # Check input
    assert classifier in ['bayes', 'forest', 'regression', 'lda']
    assert cross_validation in ['none', 'kfold', 'leave-one-out', 'block', 'custom']
    assert event_times.shape[0] == event_groups.shape[0]
    if cross_validation == 'block':
        assert event_times.shape[0] == prob_left.shape[0]
    if cross_validation == 'custom':
        assert isinstance(custom_validation, types.GeneratorType)

    # Get matrix of all neuronal responses
    times = np.column_stack(((event_times - pre_time), (event_times + post_time)))
    pop_vector, cluster_ids = _get_spike_counts_in_bins(spike_times, spike_clusters, times)
    pop_vector = np.rot90(pop_vector)

    # Initialize classifier
    if classifier == 'forest':
        clf = RandomForestClassifier(n_estimators=100)
    elif classifier == 'bayes':
        clf = GaussianNB()
    elif classifier == 'regression':
        clf = LogisticRegression(solver='liblinear', multi_class='auto')
    elif classifier == 'lda':
        clf = LinearDiscriminantAnalysis()

    # Pre-allocate variables
    acc = np.zeros(iterations)
    f1 = np.zeros(iterations)
    auroc = np.zeros(iterations)
    conf_matrix_norm = np.zeros((np.shape(np.unique(event_groups))[0],
                                 np.shape(np.unique(event_groups))[0],
                                 iterations))
    pred = np.zeros([iterations, pop_vector.shape[0]])
    prob = np.zeros([iterations, pop_vector.shape[0]])

    for i in range(iterations):

        # Pre-allocate variables for this iteration
        y_pred = np.zeros(event_groups.shape)
        y_probs = np.zeros(event_groups.shape)

        # Get neurons to use for this iteration
        if n_neurons == 'all':
            sub_pop_vector = pop_vector
        else:
            use_neurons = np.random.choice(pop_vector.shape[1], n_neurons, replace=False)
            sub_pop_vector = pop_vector[:, use_neurons]

        if cross_validation == 'none':

            # Fit the model on all the data and predict
            clf.fit(sub_pop_vector, event_groups)
            y_pred = clf.predict(sub_pop_vector)

            #  Get the probability of the prediction for ROC analysis
            probs = clf.predict_proba(sub_pop_vector)
            y_probs = probs[:, 1]  # keep positive only

        else:
            # Perform cross-validation
            if cross_validation == 'leave-one-out':
                cv = LeaveOneOut().split(pop_vector)
            elif cross_validation == 'kfold':
                cv = KFold(n_splits=num_splits).split(pop_vector)
            elif cross_validation == 'block':
                block_lengths = [sum(1 for i in g) for k, g in groupby(prob_left)]
                blocks = np.repeat(np.arange(len(block_lengths)), block_lengths)
                cv = LeaveOneGroupOut().split(pop_vector, groups=blocks)
            elif cross_validation == 'custom':
                cv = custom_validation

            # Loop over the splits into train and test
            for train_index, test_index in cv:

                # Fit the model to the training data
                clf.fit(sub_pop_vector[train_index], event_groups[train_index])

                # Predict the test data
                y_pred[test_index] = clf.predict(sub_pop_vector[test_index])

                # Get the probability of the prediction for ROC analysis
                probs = clf.predict_proba(sub_pop_vector[test_index])
                y_probs[test_index] = probs[:, 1]  # keep positive only

        # Calculate performance metrics and confusion matrix
        # (note: f1_score and roc_auc_score as called here assume exactly two groups)
        acc[i] = accuracy_score(event_groups, y_pred)
        f1[i] = f1_score(event_groups, y_pred)
        auroc[i] = roc_auc_score(event_groups, y_probs)
        conf_matrix = confusion_matrix(event_groups, y_pred)
        conf_matrix_norm[:, :, i] = conf_matrix / conf_matrix.sum(axis=1)[:, np.newaxis]

        # Add prediction and probability to matrix
        pred[i, :] = y_pred
        prob[i, :] = y_probs

    # Unpack the arrays into single values when there's only one iteration
    if iterations == 1:
        acc = acc[0]
        f1 = f1[0]
        auroc = auroc[0]

    # Add to results dictionary
    if cross_validation == 'kfold':
        results = dict({'accuracy': acc, 'f1': f1, 'auroc': auroc,
                        'predictions': pred, 'probabilities': prob,
                        'confusion_matrix': conf_matrix_norm,
                        'n_groups': np.shape(np.unique(event_groups))[0],
                        'classifier': classifier, 'cross_validation': '%d-fold' % num_splits,
                        'iterations': iterations})
    else:
        results = dict({'accuracy': acc, 'f1': f1, 'auroc': auroc,
                        'predictions': pred, 'probabilities': prob,
                        'confusion_matrix': conf_matrix_norm,
                        'n_groups': np.shape(np.unique(event_groups))[0],
                        'classifier': classifier, 'cross_validation': cross_validation,
                        'iterations': iterations})

    return results
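A minimal usage sketch for decode() with the 'block' cross-validation option (all arrays are hypothetical stand-ins for real recordings):

import numpy as np

spike_times = np.sort(np.random.uniform(0, 100, size=5000))   # 5000 spikes over 100 s
spike_clusters = np.random.randint(0, 20, size=5000)          # 20 clusters
event_times = np.linspace(1, 99, 100)                         # 100 trials
event_groups = np.random.randint(0, 2, size=100)              # e.g. stim left/right
prob_left = np.repeat([0.2, 0.5, 0.8, 0.5], 25)               # block structure

results = decode(spike_times, spike_clusters, event_times, event_groups,
                 pre_time=0, post_time=0.5, classifier='bayes',
                 cross_validation='block', prob_left=prob_left)
print(results['accuracy'], results['auroc'])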
Code Example #13
dataPath = 'AlanWalksWales/'
# Comment out one of the two data names to choose which portion of the data to use
dataName = 'AWW_rest'
#dataName = 'AWW_walk'
nJobs = 12  # Number of cores to use

# Load feature matrices, labels, and groups (denoting which labeled time
# segment each row of the feature matrix comes from)
featuresAll = np.loadtxt(dataPath + dataName + '_all.csv', delimiter=',')
featuresAcc = np.loadtxt(dataPath + dataName + '_acc.csv', delimiter=',')
featuresEda = np.loadtxt(dataPath + dataName + '_eda.csv', delimiter=',')
labels = np.loadtxt(dataPath + dataName + '_label.csv')
groups = np.loadtxt(dataPath + dataName + '_groups.csv')

# Leave-one-group-out cross-validation
cv = LeaveOneGroupOut()

# Parameter tuning by grid search
solver = 'lbfgs'
activation = 'relu'
regParam = 10.0**np.arange(-3, 5)

# Comment out one of the choices below (either 1 or 2 hidden layers)

# 1 hidden layer
hiddenLayerSizes = 2**np.arange(0, 8)
"""
# 2 hidden layers
hidden1,hidden2 = np.meshgrid(2**np.arange(0,8),2**np.arange(0,8))
hiddenLayerSizes = np.reshape(np.stack([hidden1,hidden2]),
                                       (2,np.size(hidden1))).T.tolist()
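The excerpt stops before the grid search itself; a hypothetical sketch of how the pieces above could be combined (the param_grid mapping and max_iter are assumptions):

from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPRegressor

paramGrid = {'alpha': regParam,
             'hidden_layer_sizes': hiddenLayerSizes}
grid = GridSearchCV(MLPRegressor(solver=solver, activation=activation, max_iter=1000),
                    param_grid=paramGrid, cv=cv, n_jobs=nJobs)
grid.fit(featuresAll, labels, groups=groups)
print(grid.best_params_, grid.best_score_)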
Code Example #14
File: model_fitting.py  Project: awoo769/Level_8_Lab
def learn(X: (dict, pd.DataFrame),
          y: (dict, pd.Series),
          data_folder: str,
          groups: list = None,
          test_split: float = None,
          name: str = None):
    '''
	This function trains either a classification or a regression random forest model. It can handle
	either a single pandas DataFrame or a dictionary of pandas DataFrames. If the input is a single
	pandas DataFrame, the rows will be split into training and testing datasets using test_split (0 - 1).
	If the input is a dictionary of pandas DataFrames, a leave-one-group-out method will be used to verify
	the model's accuracy.

	Inputs:

	X: a dictionary of pandas DataFrames or a single pandas DataFrame

	y: a dictionary of pandas Series or a single pandas Series

	data_folder: the location of where to save the output

	groups: a list of the trial names
			NOTE: this is only required if the X/y input is a dictionary

	test_split: the decimal percentage to split the training and testing datasets
				NOTE: this is only required if the X/y input is not a dictionary

	name: the name of the trial
		  NOTE: this is only required if the X/y input is not a dictionary

	Alex Woodall

	Auckland Bioengineering Institute

	08/04/2020

	'''

    if 'force' in data_folder or 'time' in data_folder:
        mode = 'regression'

    elif 'binary' in data_folder:
        mode = 'classification'

    if type(X) is pd.DataFrame:
        # Learning using one trial (or a combination into a DataFrame rather than a dictionary of DataFrames)

        if mode == 'classification':
            # Split into training and testing
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=test_split)

            # Create classifier and train
            cl = RandomForestClassifier(n_estimators=128, n_jobs=-1)
            cl.fit(X_train, y_train)

            # Predict on classifier and convert to a pandas series, save output
            y_predict = cl.predict(X_test)
            y_predict = pd.Series(y_predict, index=X_test.index)

            y_predict.to_csv("{}y_predict.csv".format(data_folder),
                             index=True,
                             header=True)
            y_test.to_csv("{}y_test.csv".format(data_folder),
                          index=True,
                          header=True)

            # Print score and confusion matrix
            score = roc_auc_score(y_test, y_predict)
            conf_mat = confusion_matrix(y_test, y_predict)

            print("Roc auc = {}\n".format(score))
            print(conf_mat)

        elif mode == 'regression':
            # Split into training and testing
            split_int = int(len(X) * (1 - test_split))

            X_train = X.head(split_int)
            y_train = y.head(split_int)
            X_test = X.tail(len(X) - split_int)
            y_test = y.tail(len(X) - split_int)

            # Create regressor and train
            rg = RandomForestRegressor(n_estimators=100, n_jobs=-1)
            rg.fit(X_train, y_train)

            # Predict
            y_predict = rg.predict(X_test)

            # Filter force array
            ''' Filter force plate data at 60 Hz '''
            analog_frequency = 1000
            cut_off = 60  # Derie (2017), Robberechts et al (2019)
            order = 2  # Weyand (2017), Robberechts et al (2019)
            b_f, a_f = signal.butter(N=order,
                                     Wn=cut_off / (analog_frequency / 2),
                                     btype='low')

            new_F = signal.filtfilt(b_f, a_f, y_predict)
            ''' Rezero filtered forces'''
            threshold = 50  # force threshold (N) for re-zeroing
            filter_plate = rezero_filter(original_fz=new_F,
                                         threshold=threshold)

            y_predict = filter_plate * new_F

            # Convert output into a pandas series and save
            y_predict = pd.Series(y_predict, index=X_test.index)
            y_predict.to_csv("{}y_predict.csv".format(data_folder),
                             index=True,
                             header=True)
            y_test.to_csv("{}y_test.csv".format(data_folder),
                          index=True,
                          header=True)

            # Calculate R2 score and print
            score = r2_score(y_test, y_predict)
            print("R2 = {}\n".format(score))

            # Plot result
            plt.plot(y_test.tail(1000), 'k', label='True data')
            plt.plot(y_predict.tail(1000), 'r', label='Estimate data')
            plt.legend()
            plt.ylabel('Force (N)')
            plt.xlabel('Time (ms)')
            plt.title('Estimated data for {}'.format(name))

            # Save figure
            score = round(score, 4)

            plt.savefig('{}{}_{}.png'.format(data_folder, name,
                                             '_'.join(str(score).split('.'))))
            plt.show()

    elif type(X) is dict:

        # Create leave one group out split
        group_num = np.arange(len(groups))
        logo = LeaveOneGroupOut()
        logo.get_n_splits(groups=group_num)

        if mode == 'classification':
            # Create results text file
            f = open("{}results.txt".format(data_folder), "w")
            f.write("Results for classification\n\n")
            f.close()

            roc = []
            # Train on n - 1 groups, test on 1. Repeat for all
            for train_index, test_index in logo.split(X=X, groups=group_num):
                cl = RandomForestClassifier(n_estimators=128, n_jobs=-1)

                # Training data
                print('Hold out trial: {}'.format(groups[test_index[0]]))

                for index in train_index:
                    try:
                        # DataFrame.append was removed in pandas 2.0; pd.concat is the modern equivalent
                        X_train = X_train.append(X[groups[index]],
                                                 ignore_index=True)
                        y_train = y_train.append(y[groups[index]],
                                                 ignore_index=True)

                    except NameError:
                        X_train = X[groups[index]]
                        y_train = y[groups[index]]

                cl.fit(X_train, y_train)

                # Testing data
                X_test = X[groups[test_index[0]]]
                y_test = y[groups[test_index[0]]]

                # Predict
                y_estimate_test = cl.predict(X_test)
                y_estimate_test = pd.Series(y_estimate_test,
                                            index=X_test.index)

                roc.append(roc_auc_score(y_test, y_estimate_test))

                conf = confusion_matrix(y_test, y_estimate_test)

                np.savetxt("{}y_estimate_conf_{}.txt".format(
                    data_folder, groups[test_index[0]]),
                           conf,
                           delimiter='\t',
                           fmt='%i')

                f = open("{}results.txt".format(data_folder), "a")
                f.write("Predicting on {}: {}\n".format(
                    groups[test_index[0]], round(roc[-1], 4)))
                f.close()

                # Save estimate
                y_estimate_test.to_csv("{}y_estimate_test_{}.csv".format(
                    data_folder, groups[test_index[0]]),
                                       index=True,
                                       header=True)

                # Remove datasets
                del X_train
                del X_test
                del y_train
                del y_test

                # Save model
                f = open(
                    "{}{}_cl.pkl".format(data_folder, groups[test_index[0]]),
                    "wb")
                pickle.dump(cl, f)
                f.close()

            f = open("{}results.txt".format(data_folder), "a")
            f.write("\nAverage roc auc score: {}".format(
                round(statistics.mean(roc), 4)))
            f.close()

        elif mode == 'regression':

            # Number of estimators (currently the same for the force and
            # non-force tasks)
            n_estimators = 10

            # Create results text file
            f = open("{}results.txt".format(data_folder), "w")
            f.write("Results for regression\n\n")
            f.close()

            r2 = []

            for train_index, test_index in logo.split(X=X, groups=group_num):
                rg = RandomForestRegressor(n_estimators=n_estimators,
                                           n_jobs=-1)

                # Training data
                print('Hold out trial: {}'.format(groups[test_index[0]]))

                for index in train_index:
                    try:
                        X_train = X_train.append(X[groups[index]],
                                                 ignore_index=True)
                        y_train = y_train.append(y[groups[index]],
                                                 ignore_index=True)

                    except NameError:
                        X_train = X[groups[index]]
                        y_train = y[groups[index]]

                rg.fit(X_train, y_train)

                # Testing data
                X_test = X[groups[test_index[0]]]
                y_test = y[groups[test_index[0]]]

                # Predict
                y_estimate_test = rg.predict(X_test)

                # Round estimate to a whole number
                y_estimate_test = np.around(y_estimate_test)

                # Any negative number = -1
                y_estimate_test[y_estimate_test < 0] = -1

                y_estimate_test = pd.Series(y_estimate_test,
                                            index=X_test.index)

                r2.append(r2_score(y_test, y_estimate_test))

                f = open("{}results.txt".format(data_folder), "a")
                f.write("Predicting on {}: {}\n".format(
                    groups[test_index[0]], round(r2[-1], 4)))
                f.close()

                # Save estimate
                y_estimate_test.to_csv("{}y_estimate_test_{}.csv".format(
                    data_folder, groups[test_index[0]]),
                                       index=True,
                                       header=True)

                # Remove datasets
                del X_train
                del X_test
                del y_train
                del y_test

                # Save model
                f = open(
                    "{}{}_rg.pkl".format(data_folder, groups[test_index[0]]),
                    "wb")
                pickle.dump(rg, f)
                f.close()

            f = open("{}results.txt".format(data_folder), "a")
            f.write("\nAverage R^2 score: {}".format(
                round(statistics.mean(r2), 4)))
            f.close()

    else:
        print("X should be of type dict or pd.DataFrame")

        return

    return
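A minimal sketch of the same per-group train/test assembly done with pd.concat instead of the try/except NameError pattern used above, assuming (as in the function above) that X and y are dicts of DataFrames/Series keyed by the names in groups; the helper name iter_group_splits is made up for illustration.

import numpy as np
import pandas as pd
from sklearn.model_selection import LeaveOneGroupOut


def iter_group_splits(X, y, groups):
    # Yield (X_train, y_train, X_test, y_test) with one group held out per fold
    group_num = np.arange(len(groups))
    logo = LeaveOneGroupOut()
    for train_index, test_index in logo.split(group_num, groups=group_num):
        train_keys = [groups[i] for i in train_index]
        test_key = groups[test_index[0]]
        X_train = pd.concat([X[k] for k in train_keys], ignore_index=True)
        y_train = pd.concat([y[k] for k in train_keys], ignore_index=True)
        yield X_train, y_train, X[test_key], y[test_key]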
コード例 #15
0
data = pd.read_csv('./data-standard.csv')
features = [
    'RMSSD', 'SDNN', 'SDANN', 'SDANNi', 'SDSD', 'pNN50', 'AutoCorrelation'
]

data = shuffle(data)
X, Y = np.array(data[features]), np.array(data['sleep'])

grp = data['id'].values
logo = LeaveOneGroupOut()
scores = []

for train, test in logo.split(X, Y, grp):
    model = RandomForestClassifier(n_estimators=148,
                                   criterion='entropy',
                                   max_depth=12,
                                   min_samples_split=4,
                                   min_samples_leaf=7)

    x_train, x_test = X[train], X[test]
    y_train, y_test = Y[train], Y[test]
    model.fit(x_train, y_train.ravel())

    scores.append(metrics.accuracy_score(y_test, model.predict(x_test)))
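The loop above keeps one accuracy per held-out participant; an equivalent, more compact form is to pass the splitter and the group labels straight to cross_val_score. A sketch reusing the X, Y, grp and hyperparameters defined above:

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import LeaveOneGroupOut, cross_val_score

model = RandomForestClassifier(n_estimators=148,
                               criterion='entropy',
                               max_depth=12,
                               min_samples_split=4,
                               min_samples_leaf=7)

# One accuracy value per held-out participant ('id' group)
scores = cross_val_score(model, X, Y, groups=grp,
                         cv=LeaveOneGroupOut(), scoring='accuracy')
print(scores.mean(), scores.std())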
コード例 #16
0
                p_pair_set += [set_dict[key]['pair']]

        instance_set_array = np.array(instance_set)

        # define pair sets
        final_neg_pairs_list = set()
        for pair in n_pair_set:
            final_neg_pairs_list.add(tuple(pair))
        for pair in p_pair_set:
            final_neg_pairs_list.add(tuple(pair))

        ################### create cross-validation groups (same order with instances)
        instance_groups = stratified_gene_pair_groups(set_dict, instance_keys)

        # create data split
        logo = LeaveOneGroupOut()
        data_split = []
        for train_index, test_index in logo.split(instance_set,
                                                  label_set,
                                                  groups=instance_groups):
            data_split += [[train_index, test_index]]

        print('##### Building Random Forest predictor for set %d with the LeaveOnePairOut cross-validation procedure (%d iterations)' % \
        (trial + 1, len(data_split)))

        ################################ create overall sets of iterations for statistics

        sgp_overall_test_set = []
        sgp_overall_test_label = []
        sgp_overall_pred_label = []
        sgp_overall_prob_label = []
コード例 #17
0
def test_decoder_binary_classification():
    X, y = make_classification(n_samples=200, n_features=125, scale=3.0,
                               n_informative=5, n_classes=2, random_state=42)
    X, mask = to_niimgs(X, [5, 5, 5])

    # check classification with masker object
    model = Decoder(mask=NiftiMasker())
    model.fit(X, y)
    y_pred = model.predict(X)
    assert accuracy_score(y, y_pred) > 0.95

    # decoder object use predict_proba for scoring with logistic model
    model = Decoder(estimator='logistic_l2', mask=mask)
    model.fit(X, y)
    y_pred = model.predict(X)
    assert accuracy_score(y, y_pred) > 0.95

    # decoder object use prior as strategy (default) for dummy classifier
    model = Decoder(estimator='dummy_classifier', mask=mask)
    model.fit(X, y)
    y_pred = model.predict(X)
    assert accuracy_score(y, y_pred) == 0.5

    # decoder object use other strategy for dummy classifier
    param = dict(strategy='stratified')
    dummy_classifier.set_params(**param)
    model = Decoder(estimator=dummy_classifier, mask=mask)
    model.fit(X, y)
    y_pred = model.predict(X)
    assert accuracy_score(y, y_pred) >= 0.5
    # Returns model coefficients for dummy estimators as None
    assert model.coef_ is None
    # Dummy output are nothing but the attributes of the dummy estimators
    assert model.dummy_output_ is not None
    assert model.cv_scores_ is not None
    # model attribute n_outputs_ depending on target y ndim
    assert model.n_outputs_ == 1

    # decoder object use other scoring metric for dummy classifier
    model = Decoder(estimator='dummy_classifier', mask=mask)
    model.fit(X, y)
    y_pred = model.predict(X)
    assert roc_auc_score(y, y_pred) == 0.5

    model = Decoder(estimator='dummy_classifier', mask=mask, scoring='roc_auc')
    model.fit(X, y)
    assert np.mean(model.cv_scores_[0]) >= 0.5

    # Raises a not implemented error with strategy constant
    param = dict(strategy='constant')
    dummy_classifier.set_params(**param)
    model = Decoder(estimator=dummy_classifier, mask=mask)
    pytest.raises(NotImplementedError, model.fit, X, y)

    # check different screening_percentile value
    for screening_percentile in [100, 20, None]:
        model = Decoder(mask=mask, screening_percentile=screening_percentile)
        model.fit(X, y)
        y_pred = model.predict(X)
        assert accuracy_score(y, y_pred) > 0.95

    for clustering_percentile in [100, 99]:
        model = FREMClassifier(estimator='logistic_l2', mask=mask,
                               clustering_percentile=clustering_percentile,
                               screening_percentile=90, cv=5)
        model.fit(X, y)
        y_pred = model.predict(X)
        assert accuracy_score(y, y_pred) > 0.9

    # check cross-validation scheme and fit attribute with groups enabled
    rand_local = np.random.RandomState(42)
    for cv in [KFold(n_splits=5), LeaveOneGroupOut()]:
        model = Decoder(estimator='svc', mask=mask,
                        standardize=True, cv=cv)
        if isinstance(cv, LeaveOneGroupOut):
            groups = rand_local.binomial(2, 0.3, size=len(y))
        else:
            groups = None
        model.fit(X, y, groups=groups)
        y_pred = model.predict(X)
        assert accuracy_score(y, y_pred) > 0.9
コード例 #18
0
ファイル: nested_auto1.py プロジェクト: vasudevareddy37/code
    small_dict = {}

    # Create the path names for the training (X) data
    for name_x in filename:

        file_x = os.path.join(inputFolder, name_x)
        x = np.load(file_x)

        # Preprocess the training data (scale pixel values)
        x_scaled = x / 255

        # We are using a Support Vector Classifier with an RBF kernel
        clf = svm.SVC(kernel='rbf', gamma=0.01, C=100)

        # Use the leave-one-group-out method
        logo = LeaveOneGroupOut()
        cv = logo.split(x_scaled, y, groups=group)

        scores = cross_val_score(clf, x_scaled,
                                 y, cv=cv, scoring='recall_macro')

        loopMean = scores.mean()
        loopStd = scores.std()

        small_name_x = name_x[:-4]

        small_dict[small_name_x] = (loopMean, loopStd)

        print("Finished finding scores with %s data" % name_x)

    big_name = name[5:12]
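Note that logo.split(...) returns a generator that can be consumed only once, which is why the loop above rebuilds cv on every iteration. An alternative sketch, passing the splitter itself plus groups so scikit-learn builds the folds internally (same clf, x_scaled, y and group as above):

from sklearn.model_selection import LeaveOneGroupOut, cross_val_score

logo = LeaveOneGroupOut()
scores = cross_val_score(clf, x_scaled, y,
                         groups=group,
                         cv=logo,
                         scoring='recall_macro')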
コード例 #19
0
def lda_project(spike_times, spike_clusters, event_times, event_groups, pre_time=0, post_time=0.5,
                cross_validation='kfold', num_splits=5, prob_left=None, custom_validation=None):
    """
    Use linear discriminant analysis to project population vectors to the line that best separates
    the two groups. When cross-validation is used, the LDA projection is fitted on the training
    data after which the test data is projected to this projection.

    spike_times : 1D array
        spike times (in seconds)
    spike_clusters : 1D array
        cluster ids corresponding to each event in `spikes`
    event_times : 1D array
        times (in seconds) of the events from the two groups
    event_groups : 1D array
        group identities of the events, can be any number of groups, accepts integers and strings
    pre_time : float
        time (in seconds) preceding the event times
    post_time : float
        time (in seconds) following the event times
    cross_validation : string
        which cross-validation method to use, options are:
            'none'              No cross-validation
            'kfold'             K-fold cross-validation
            'leave-one-out'     Leave out the trial that is being decoded
            'block'             Leave out the block the to-be-decoded trial is in
            'custom'            Any custom cross-validation provided by the user
    num_splits : integer
        ** only for 'kfold' cross-validation **
        Number of splits to use for k-fold cross validation, a value of 5 means that the decoder
        will be trained on 4/5th of the data and used to predict the remaining 1/5th. This process
        is repeated five times so that all data has been used as both training and test set.
    prob_left : 1D array
        ** only for 'block' cross-validation **
        the probability of the stimulus appearing on the left for each trial in event_times
    custom_validation : generator
        ** only for 'custom' cross-validation **
        a generator object with the splits to be used for cross validation using this format:
            (
                (split1_train_idxs, split1_test_idxs),
                (split2_train_idxs, split2_test_idxs),
                (split3_train_idxs, split3_test_idxs),
             ...)

    Returns
    -------
    lda_projection : 1D array
        the position along the LDA projection axis for the population vector of each trial

    """

    # Check input
    assert cross_validation in ['none', 'kfold', 'leave-one-out', 'block', 'custom']
    assert event_times.shape[0] == event_groups.shape[0]
    if cross_validation == 'block':
        assert event_times.shape[0] == prob_left.shape[0]
    if cross_validation == 'custom':
        assert isinstance(custom_validation, types.GeneratorType)

    # Get matrix of all neuronal responses
    times = np.column_stack(((event_times - pre_time), (event_times + post_time)))
    pop_vector, cluster_ids = _get_spike_counts_in_bins(spike_times, spike_clusters, times)
    pop_vector = np.rot90(pop_vector)

    # Initialize
    lda = LinearDiscriminantAnalysis()
    lda_projection = np.zeros(event_groups.shape)

    if cross_validation == 'none':
        # Find the best LDA projection on all data and transform those data
        lda_projection = lda.fit_transform(pop_vector, event_groups)

    else:
        # Perform cross-validation
        if cross_validation == 'leave-one-out':
            cv = LeaveOneOut().split(pop_vector)
        elif cross_validation == 'kfold':
            cv = KFold(n_splits=num_splits).split(pop_vector)
        elif cross_validation == 'block':
            block_lengths = [sum(1 for i in g) for k, g in groupby(prob_left)]
            blocks = np.repeat(np.arange(len(block_lengths)), block_lengths)
            cv = LeaveOneGroupOut().split(pop_vector, groups=blocks)
        elif cross_validation == 'custom':
            cv = custom_validation

        # Loop over the splits into train and test
        for train_index, test_index in cv:

            # Find LDA projection on the training data
            lda.fit(pop_vector[train_index], [event_groups[j] for j in train_index])

            # Project the held-out test data to projection
            lda_projection[test_index] = np.rot90(lda.transform(pop_vector[test_index]))[0]

    return lda_projection
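The 'custom' option documented above expects a generator of (train_idxs, test_idxs) tuples. A minimal sketch of building one, here from GroupShuffleSplit purely for illustration; session_ids is a hypothetical per-trial grouping array, not something defined in the function above.

import numpy as np
from sklearn.model_selection import GroupShuffleSplit


def make_custom_validation(event_groups, session_ids, n_splits=5, seed=0):
    # Yields (train_idxs, test_idxs) tuples in the format expected by
    # the custom_validation argument of lda_project
    gss = GroupShuffleSplit(n_splits=n_splits, test_size=0.2, random_state=seed)
    for train_idxs, test_idxs in gss.split(np.zeros(len(event_groups)),
                                           event_groups, groups=session_ids):
        yield train_idxs, test_idxs

# lda_projection = lda_project(spike_times, spike_clusters, event_times,
#                              event_groups, cross_validation='custom',
#                              custom_validation=make_custom_validation(
#                                  event_groups, session_ids))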
def cross_validate(data, lambda_config=(2e0, 2e20, 11), TA=0, name="data_0"):
    """

    Parameters
    ----------
    data : Pandas Dataframe
        load dataframe using from data_load, using getData function.
    lambda_config : tuple, optional
        Lambda range for Ridge regression. The default is (2e0, 2e20, 11). Range from 
        Cross et al 2016 publication.
    TA : int., optional
        Test subject nr. The default is 0.
    name : str., optional
        Name of txt file with model performance. The default is "data_0".

    Returns
    -------
    CSV-file with results
    Plots

    """

    # For reproducibility
    random_state = 9999

    # Train one model per TA
    data = data[data['TA'] == TA]

    # Define lambda values for training models
    lambdas = np.linspace(lambda_config[0], lambda_config[1], lambda_config[2])

    # Split into train/testing with leave-one-group-out
    logo = LeaveOneGroupOut()

    # Shift EEG 150 ms back
    EEG_shifted = shift(data[data.columns[:16]].T, lag=-150, freq=64)
    data = data.iloc[:len(EEG_shifted.T)]
    data[data.columns[:16]] = EEG_shifted.T

    # Assign X, y and group variable
    X = data[data.columns[:16]]
    y = data['target']

    groups = data["trial"]
    n_outer_groups = len(np.unique(groups))

    # Initiate test errors
    MSEs = []
    MSEdummies = []

    # For cross correlation
    scores = []
    opt_lambda_list = []

    # Parameters for MNE
    tmin = -.25
    tmax = .1
    sfreq = 64

    ## Leave-trial-out CV ##

    # Outer fold
    i = 0
    for out_train_idx, out_test_idx in logo.split(X, y, groups):
        print("Outer fold %i / %i" % (i + 1, n_outer_groups))

        X_train = X.iloc[out_train_idx]
        y_train = y.iloc[out_train_idx]

        X_test = X.iloc[out_test_idx]
        y_test = y.iloc[out_test_idx]

        # Define inner groups, these are n - 1 of n total groups
        inner_groups = data["trial"].iloc[out_train_idx]
        n_inner_groups = len(np.unique(inner_groups))

        # Initiate errors for inner fold validations
        vals = np.zeros((n_inner_groups, lambda_config[2]))

        # Inner fold
        j = 0
        for inn_train_idx, inn_test_idx in logo.split(X_train, y_train,
                                                      inner_groups):
            print("\t Inner fold %i / %i" % (j + 1, n_inner_groups))

            inn_X_train = X_train.iloc[inn_train_idx]
            inn_y_train = y_train.iloc[inn_train_idx]

            inn_X_test = X_train.iloc[inn_test_idx]
            inn_y_test = y_train.iloc[inn_test_idx]

            # Validate model with all parameters
            k = 0
            for l in lambdas:
                # Define model with l parameter
                model = ReceptiveField(tmin,
                                       tmax,
                                       sfreq,
                                       feature_names=None,
                                       estimator=l,
                                       scoring="corrcoef")

                # Fit model to inner fold training data
                model.fit(np.asarray(inn_X_train), np.asarray(inn_y_train))

                # Compute cross correlation for regressional value
                val = model.score(np.asarray(inn_X_test),
                                  np.asarray(inn_y_test))
                # Add score to matrix
                vals[j, k] = val

                k += 1
            j += 1

        # Get optimal parameter
        param_score = np.sum(vals, axis=0)
        lambda_opt = lambdas[np.argmax(param_score)]
        print("Optimal lambda = %f" % lambda_opt)

        # Store optimal lambda parameter
        opt_lambda_list.append(lambda_opt)

        # Train optimal model
        model_opt = ReceptiveField(tmin,
                                   tmax,
                                   sfreq,
                                   feature_names=None,
                                   estimator=lambda_opt,
                                   scoring="corrcoef")

        # Fit the optimal model to the outer-fold training data
        model_opt.fit(np.asarray(X_train), np.asarray(y_train))

        # Compute error of optimal model
        score = model_opt.score(np.asarray(X_test), np.asarray(y_test))
        print('Score:')
        print(score)

        # Fit dummy model
        dummy_regr = DummyRegressor(strategy="mean")
        dummy_regr.fit(np.asarray(X_train), np.asarray(y_train))

        # Add error to list
        scores.append(score)
        MSE = mean_squared_error(np.asarray(y_test),
                                 model_opt.predict(np.asarray(X_test)),
                                 squared=True)
        MSEs.append(MSE)

        MSEdummy = mean_squared_error(np.asarray(y_test),
                                      dummy_regr.predict(np.asarray(X_test)),
                                      squared=True)
        MSEdummies.append(MSEdummy)

        i += 1

    ## Training and testing for optimal model ##

    # Making dataframes for each SNR cond
    data_0 = data[data['SNR'] == 0]
    data_1 = data[data['SNR'] == 1]
    data_2 = data[data['SNR'] == 2]

    # Shuffle data
    data_0 = data_0.sample(frac=1, random_state=random_state)
    data_1 = data_1.sample(frac=1, random_state=random_state)
    data_2 = data_2.sample(frac=1, random_state=random_state)

    # Split data 80/20 for training/testing
    train_0, test_0 = train_test_split(data_0,
                                       test_size=0.2,
                                       random_state=random_state)
    train_1, test_1 = train_test_split(data_1,
                                       test_size=0.2,
                                       random_state=random_state)
    train_2, test_2 = train_test_split(data_2,
                                       test_size=0.2,
                                       random_state=random_state)

    # Combine training dataframes into one
    data = train_0.append(train_1, ignore_index=True)
    data = data.append(train_2, ignore_index=True)

    # Combine testing dataframes into one
    data_test = test_0.append(test_1, ignore_index=True)
    data_test = data_test.append(test_2, ignore_index=True)

    # Mean score across all folds
    mu_score = np.mean(scores)
    print("Mean score = %f" % mu_score)

    best_fold = np.argmax(scores) + 1
    print("Best fold = %i" % best_fold)

    # Optimal model
    model_optimal = ReceptiveField(
        tmin,
        tmax,
        sfreq,
        feature_names=None,
        estimator=opt_lambda_list[np.argmax(scores)],
        scoring="corrcoef")

    # Fit optimal model to training data
    model_optimal.fit(np.asarray(data[data.columns[:16]]),
                      np.asarray(data["target"]))

    # Dummy classifier
    dummy_regr = DummyRegressor(strategy="mean")
    dummy_regr.fit(np.asarray(data[data.columns[:16]]),
                   np.asarray(data["target"]))

    # Compute cross correlation scores on test data for all three SNR conds
    score_0 = signal.correlate(model_optimal.predict(
        np.asarray(test_0[test_0.columns[:16]])),
                               np.asarray(test_0['target']),
                               mode='same') / len(test_0['target'])
    score_1 = signal.correlate(model_optimal.predict(
        np.asarray(test_1[test_1.columns[:16]])),
                               np.asarray(test_1['target']),
                               mode='same') / len(test_1['target'])
    score_2 = signal.correlate(model_optimal.predict(
        np.asarray(test_2[test_2.columns[:16]])),
                               np.asarray(test_2['target']),
                               mode='same') / len(test_2['target'])

    # Compute cross correlation scores on test data for all three SNR conds with dummy regressor
    score_0_dummy = signal.correlate(dummy_regr.predict(
        np.asarray(test_0[test_0.columns[:16]])),
                                     np.asarray(test_0['target']),
                                     mode='same') / len(test_0['target'])
    score_1_dummy = signal.correlate(dummy_regr.predict(
        np.asarray(test_1[test_1.columns[:16]])),
                                     np.asarray(test_1['target']),
                                     mode='same') / len(test_1['target'])
    score_2_dummy = signal.correlate(dummy_regr.predict(
        np.asarray(test_2[test_2.columns[:16]])),
                                     np.asarray(test_2['target']),
                                     mode='same') / len(test_2['target'])

    # Cross correlate with random speech
    random_corr = signal.correlate(
        model_optimal.predict(np.asarray(test_1[test_1.columns[:16]])),
        np.random.uniform(low=min(test_1['target']),
                          high=max(test_1['target']),
                          size=(np.asarray(test_1).shape[0], )),
        mode='same') / len(
            np.random.uniform(low=min(test_1['target']),
                              high=max(test_1['target']),
                              size=(np.asarray(test_1).shape[0], )))

    ### SHOW RESULTS IN PLOTS ###

    ## Make line plots ##

    # Define x-axes
    x_axis_0 = np.linspace(0, len(test_0['target']), num=len(test_0['target']))
    x_axis_1 = np.linspace(0, len(test_1['target']), num=len(test_1['target']))
    x_axis_2 = np.linspace(0, len(test_2['target']), num=len(test_2['target']))
    x_all = np.linspace(0,
                        len(data_test['target']),
                        num=len(data_test['target']))

    ## All SNRs ##
    # For True
    plt.plot(x_all,
             np.asarray(data_test['target']),
             color='sandybrown',
             label='True')

    # For MNE Ridge regression
    plt.plot(x_all,
             model_optimal.predict(
                 np.asarray(data_test[data_test.columns[:16]])),
             color='deepskyblue',
             label='Predicted')

    # For baseline dummy
    plt.plot(x_all,
             dummy_regr.predict(np.asarray(data_test[data_test.columns[:16]])),
             color='rebeccapurple',
             dashes=[6, 2],
             label='Baseline (mean)')
    plt.grid()
    plt.title(f'TA: {TA} · Predicted and True Speech Envelopes · All SNRs')
    plt.xlabel('Samples')
    plt.ylabel('Speech Envelope')
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.savefig(f'Figure All - TA {TA}.png')
    plt.show()

    ## -5 DB SNR ##
    # For True
    plt.plot(x_axis_0,
             np.asarray(test_0['target']),
             color='sandybrown',
             label='True')

    # For MNE Ridge regression
    plt.plot(x_axis_0,
             model_optimal.predict(np.asarray(test_0[test_0.columns[:16]])),
             color='deepskyblue',
             label='Predicted')

    # For baseline dummy
    plt.plot(x_axis_0,
             dummy_regr.predict(np.asarray(test_0[test_0.columns[:16]])),
             color='rebeccapurple',
             dashes=[6, 2],
             label='Baseline (mean)')
    plt.grid()
    plt.title(f'TA: {TA} · Predicted and True Speech Envelopes · -5 DB SNR')
    plt.xlabel('Samples')
    plt.ylabel('Speech Envelope')
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.savefig(f'Figure -5 DB - TA {TA}.png')
    plt.show()

    ## 0 DB SNR ##
    # For True
    plt.plot(x_axis_1,
             np.asarray(test_1['target']),
             color='sandybrown',
             label='True')

    # For MNE Ridge regression
    plt.plot(x_axis_1,
             model_optimal.predict(np.asarray(test_1[test_1.columns[:16]])),
             color='deepskyblue',
             label='Predicted')

    # For baseline dummy
    plt.plot(x_axis_1,
             dummy_regr.predict(np.asarray(test_1[test_1.columns[:16]])),
             color='rebeccapurple',
             dashes=[6, 2],
             label='Baseline (mean)')
    plt.grid()
    plt.title(f'TA: {TA} · Predicted and True Speech Envelopes · 0 DB SNR')
    plt.xlabel('Samples')
    plt.ylabel('Speech Envelope')
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.savefig(f'Figure 0 DB - TA {TA}.png')
    plt.show()

    ## +5 DB SNR ##
    # For True
    plt.plot(x_axis_2,
             np.asarray(test_2['target']),
             color='sandybrown',
             label='True')

    # For MNE Ridge regression
    plt.plot(x_axis_2,
             model_optimal.predict(np.asarray(test_2[test_2.columns[:16]])),
             color='deepskyblue',
             label='Predicted')

    # For baseline dummy
    plt.plot(x_axis_2,
             dummy_regr.predict(np.asarray(test_2[test_2.columns[:16]])),
             color='rebeccapurple',
             dashes=[6, 2],
             label='Baseline (mean)')
    plt.grid()
    plt.title(f'TA: {TA} · Predicted and True Speech Envelopes · +5 DB SNR')
    plt.xlabel('Samples')
    plt.ylabel('Speech Envelope')
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.savefig(f'Figure +5 DB - TA {TA}.png')
    plt.show()

    ## Bar chart to compare MSE for L2 and baseline ##

    measure = [np.mean(MSEs), np.mean(MSEdummies)]
    variance = [np.var(MSEs), np.var(MSEdummies)]
    x_labels = ['L2 (MNE)', 'Baseline']

    x_pos = [i for i, _ in enumerate(x_labels)]

    plt.bar(x_pos, measure, color='sandybrown', yerr=variance)
    plt.grid()
    plt.xlabel("Model")
    plt.ylabel("MSE")
    plt.title("MSEs of L-2 and Baseline Model Compared")
    plt.xticks(x_pos, x_labels)
    plt.savefig(f'MSEs compared - TA {TA}.png')
    plt.show()

    ## Make boxplot of CrossCorrs ##

    ticks = ['-5 DB', '0 DB', '+5 DB', 'Random']

    # Function to set the colors of the boxplots
    def set_box_color(bp, color):
        plt.setp(bp['boxes'], color=color)
        plt.setp(bp['whiskers'], color=color)
        plt.setp(bp['caps'], color=color)
        plt.setp(bp['medians'], color=color)

    plt.figure()

    bpl = plt.boxplot(
        [score_0, score_1, score_2],
        positions=np.array(range(len([score_0, score_1, score_2]))) * 2.0 -
        0.4,
        sym='',
        widths=0.6)
    bpr = plt.boxplot(
        [score_0_dummy, score_1_dummy, score_2_dummy],
        positions=np.array(
            range(len([score_0_dummy, score_1_dummy, score_2_dummy]))) * 2.0 +
        0.4,
        sym='',
        widths=0.6)
    bpu = plt.boxplot(random_corr, positions=[6], sym='', widths=0.6)
    set_box_color(bpl, 'deepskyblue')
    set_box_color(bpr, 'rebeccapurple')
    set_box_color(bpu, 'sandybrown')

    # Draw temporary purple and blue lines and use them to create a legend
    plt.plot([], c='deepskyblue', label='L2 MNE')
    plt.plot([], c='rebeccapurple', label='Baseline')
    plt.plot([], c='sandybrown', label='Random')
    plt.title(
        "Cross-correlation Between L-2-Predicted and True Envelopes in All SNR Levels & Random"
    )
    plt.legend()
    plt.ylabel("Cross-correlation")
    plt.grid()

    plt.xticks(range(0, len(ticks) * 2, 2), ticks)
    plt.xlim(-2, len(ticks) * 2)
    plt.tight_layout()
    plt.savefig(f'Boxplot over CrossCorr - TA {TA}.png')
    plt.show()

    # Make data matrix
    data_matrix = np.array([
        'TA:', TA, 'Optimal Lambda Value:', opt_lambda_list[np.argmax(scores)],
        "Best Score (Pearson's R):", scores[np.argmax(scores)],
        "Mean Score (Pearson's R):",
        np.mean(scores), 'Best MSE:',
        min(MSEs), 'Mean MSE:',
        np.mean(MSEs), 'Best MSE Dummy:',
        min(MSEdummies), 'Mean MSE Dummy',
        np.mean(MSEdummies), 'CrossCorr for -5DB SNR:',
        np.mean(score_0), 'CrossCorr for 0DB SNR:',
        np.mean(score_1), 'CrossCorr for +5DB SNR:',
        np.mean(score_2), 'CrossCorr for random:',
        np.mean(random_corr), 'Dummy CrossCorr for -5DB SNR:',
        np.mean(score_0_dummy), 'Dummy CrossCorr for 0DB SNR:',
        np.mean(score_1_dummy), 'Dummy CrossCorr for +5DB SNR:',
        np.mean(score_2_dummy)
    ]).T

    # Save as CSV in working directory
    np.savetxt(name, data_matrix, delimiter=",", fmt='%s')
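Stripped of the MNE-specific pieces, the nested leave-one-group-out scheme used above reduces to the skeleton below; this sketch swaps the ReceptiveField for a plain scikit-learn Ridge and assumes X, y and groups are NumPy arrays, so it only illustrates the structure, not the original pipeline.

import numpy as np
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import LeaveOneGroupOut


def nested_logo_ridge(X, y, groups, lambdas):
    # Outer LOGO estimates generalisation error; inner LOGO selects lambda
    logo = LeaveOneGroupOut()
    outer_mse = []
    for out_train, out_test in logo.split(X, y, groups):
        X_tr, y_tr, g_tr = X[out_train], y[out_train], groups[out_train]
        val_mse = np.zeros(len(lambdas))
        for in_train, in_val in logo.split(X_tr, y_tr, g_tr):
            for k, lam in enumerate(lambdas):
                model = Ridge(alpha=lam).fit(X_tr[in_train], y_tr[in_train])
                val_mse[k] += mean_squared_error(y_tr[in_val],
                                                 model.predict(X_tr[in_val]))
        best_lambda = lambdas[np.argmin(val_mse)]
        # Refit on the whole outer training set with the selected lambda
        best = Ridge(alpha=best_lambda).fit(X_tr, y_tr)
        outer_mse.append(mean_squared_error(y[out_test],
                                            best.predict(X[out_test])))
    return np.mean(outer_mse)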
コード例 #21
0
ファイル: ML_functions.py プロジェクト: azodichr/ML-Pipeline
	def Run_Regression_Model(df, reg, cv_num, ALG, df_unknowns, test_df,
		cv_sets, j, save):
		from sklearn.model_selection import cross_val_predict
		from sklearn.metrics import make_scorer
		from sklearn.metrics import mean_squared_error, r2_score
		from sklearn.metrics import explained_variance_score
		# Data from balanced dataframe
		y = df['Y']
		X = df.drop(['Y'], axis=1)

		# Obtain the predictions using 10 fold cross validation
		# (uses KFold cv by default):
		if isinstance(cv_sets, pd.DataFrame):
			from sklearn.model_selection import LeaveOneGroupOut
			cv_split = LeaveOneGroupOut()
			cv_folds = cv_split.split(X, y, cv_sets.iloc[:, j])
			cv_pred = cross_val_predict(estimator=reg, X=X, y=y, cv=cv_folds)
		else:
			cv_pred = cross_val_predict(estimator=reg, X=X, y=y, cv=cv_num)

		cv_pred_df = pd.DataFrame(data=cv_pred, index=df.index,
			columns=['pred'])

		# Get performance statistics from cross-validation
		y = y.astype(float)
		mse = mean_squared_error(y, cv_pred)
		evs = explained_variance_score(y, cv_pred)
		r2 = r2_score(y, cv_pred)
		cor = np.corrcoef(np.array(y), cv_pred)
		result = [mse, evs, r2, cor[0, 1]]

		reg.fit(X, y)

		# Save the model for future persistence
		print(f'\nSaving model as {save+".joblib"}\n')
		dump(reg, save+'.joblib')

		# Apply fit model to unknowns
		if isinstance(df_unknowns, pd.DataFrame):
			unk_pred = reg.predict(df_unknowns.drop(['Y'], axis=1))
			unk_pred_df = pd.DataFrame(data=unk_pred, index=df_unknowns.index,
				columns=['pred'])
			cv_pred_df = cv_pred_df.append(unk_pred_df)

		if not isinstance(test_df, str):
			test_y = test_df['Y']
			test_pred = reg.predict(test_df.drop(['Y'], axis=1))
			test_pred_df = pd.DataFrame(data=test_pred, index=test_df.index,
				columns=['pred'])
			cv_pred_df = cv_pred_df.append(test_pred_df)

			# Get performance stats
			mse_test = mean_squared_error(test_y, test_pred)
			evs_test = explained_variance_score(test_y, test_pred)
			r2_test = r2_score(test_y, test_pred)
			cor_test = np.corrcoef(np.array(test_y), test_pred)
			result_test = [mse_test, evs_test, r2_test, cor_test[0, 1]]

		# Try to extract importance scores
		try:
			importances = reg.feature_importances_
		except:
			try:
				importances = reg.coef_
			except:
				importances = "na"
				print("Cannot get importance scores")

		if not isinstance(test_df, str):
			return result, cv_pred_df, importances, result_test, reg
		else:
			return result, cv_pred_df, importances, reg
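In the function above, cv_sets is expected to be a DataFrame whose j-th column assigns every row of df to a cross-validation group for one repetition. A minimal sketch of building such a frame (the random group assignment here is purely illustrative, not how the pipeline defines its folds):

import numpy as np
import pandas as pd

n_repetitions, n_groups = 5, 10
rng = np.random.RandomState(42)

# One column per repetition; each column maps every row of df to a group id
cv_sets = pd.DataFrame(
    {'rep_%i' % j: rng.randint(0, n_groups, size=len(df))
     for j in range(n_repetitions)},
    index=df.index)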
コード例 #22
0
ファイル: classifer.py プロジェクト: Jiessie-jq/EE559
#temp = np.equal(guess, df['Class'])
#guess_acc = temp.sum()/13500*100

datadf = df[Config['selected_features']]

## original classes are in order, so shuffle them
if Config['debug']:
    data = datadf.sample(10)
else:
    data = datadf.sample(frac=1)
#
feature1 = data[Config['selected_features']]
label = data['Class']
groups = data['user']
feature = feature1.drop(columns=['Class', 'user'])
gss = LeaveOneGroupOut()
x = gss.split(feature, label, groups=groups)

clf = Config['model']
score = cross_val_score(clf, X=feature, y=label, cv=x)
print('the average of c.v. score is:')
print(score.mean())
print()
print('the standard deviation of c.v. score is:')
print(np.std(score))

feature1 = data[Config['selected_features']]
label = data['Class']
groups = data['user']
feature = feature1.drop(columns=['Class', 'user'])
gss = LeaveOneGroupOut()
コード例 #23
0
    LogisticRegression(C=args.c,
                       class_weight='balanced',
                       penalty='l2',
                       solver='liblinear',
                       verbose=0,
                       max_iter=1000))

params = model['logisticregression'].get_params()
print(
    f"Fitting {params['penalty']} penalized logistic regression with C={params['C']}."
)

scores_l2 = cross_validate(model,
                           X,
                           y,
                           cv=LeaveOneGroupOut().split(X, y, seqid),
                           scoring=pr_auc,
                           n_jobs=args.njobs,
                           return_estimator=True,
                           pre_dispatch='n_jobs',
                           verbose=1)

mean_score = scores_l2['test_score'].mean()
std_score = scores_l2['test_score'].std()

print(f'Mean score: {mean_score:.2f} (std {std_score:.2f})')

model = make_pipeline(
    StandardScaler(),
    LogisticRegression(C=args.c,
                       class_weight='balanced',
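pr_auc in the cross_validate call above is not defined in this excerpt; one common way to build such a precision-recall-AUC scorer (an assumption, not necessarily what the original script used) is shown below.

from sklearn.metrics import average_precision_score, make_scorer

# Simplest: the built-in average-precision scorer (area under the PR curve)
pr_auc = 'average_precision'

# Or explicitly, scoring on the positive-class probabilities:
# pr_auc = make_scorer(average_precision_score, needs_proba=True)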
コード例 #24
0
# initialize performance tables
acc_scores = ["participant_id", "modality"]
for c in clf_tokens:
    for s in fs_perc:
        acc_scores.extend([
            "ACC_%s_perc%s" % (c, s),
            "Perm_%s_perc%s" % (c, s),
            "P_%s_perc%s" % (c, s)
        ])
facc = os.path.join(
    vdir, "group_%s_MVPA-classifier-selection-PermACC_unimodal.csv" % task)
with open(facc, 'w') as fid:
    fid.write(','.join(acc_scores) + '\n')
# do MVPA for each subject
CV = LeaveOneGroupOut()  # leave-one-run-out cross-validation
for i in range(0, n):
    subj = subjects.index[i]
    print(
        "Perform ROI-based MVPA using classifiers: %s with LOROCV for subject: %s ......\n"
        % (clf_tokens, subj))
    # set up individual paths
    sdir = os.path.join(vdir, subj)  # subject working folder
    bdir = os.path.join(sdir, 'betas_afni')  # beta estimates
    pdir = os.path.join(sdir, 'tvrMVPC')  # Trial-wise Volume ROI-based MVPC
    if not os.path.exists(pdir): os.makedirs(pdir)
    fbet = "%s/%s_LSS_nilearn.nii.gz" % (bdir, subj)
    for imod in mods:
        # read labels and betas according to the modality
        labs_mod = labs_trl.isin(
            ['WV', 'PV']) if imod == 'visual' else labs_trl.isin(['WA', 'PA'])
コード例 #25
0
        assert tokenize(cv) == tokenize(cv)
        tokens.append(cv)
    assert len(set(tokens)) == len(tokens)

    cv = cvs[0]
    sol = cv.get_n_splits(np_X, np_y, np_groups)
    assert compute_n_splits(cv, np_X, np_y, np_groups) == sol

    with assert_dask_compute(True):
        assert compute_n_splits(cv, da_X, da_y, da_groups) == sol

    with assert_dask_compute(False):
        assert compute_n_splits(cv, np_X, da_y, da_groups) == sol


@pytest.mark.parametrize('cvs', [(LeaveOneGroupOut(), ),
                                 (LeavePGroupsOut(2), LeavePGroupsOut(3))])
def test_leave_group_out(cvs):
    tokens = []
    for cv in cvs:
        assert tokenize(cv) == tokenize(cv)
        tokens.append(cv)
    assert len(set(tokens)) == len(tokens)

    cv = cvs[0]
    sol = cv.get_n_splits(np_X, np_y, np_groups)
    assert compute_n_splits(cv, np_X, np_y, np_groups) == sol

    with assert_dask_compute(True):
        assert compute_n_splits(cv, da_X, da_y, da_groups) == sol
コード例 #26
0
ファイル: classify.py プロジェクト: shuaaa/audio-sentiment
def main(args, pipe=False):
    '''
    Checks passed arguments and performs requested actions.
    '''
    if not pipe:
        parser = argparse.ArgumentParser(
            description='Classify call segments as positive or negative.')
        parser.add_argument('-f',
                            '--features',
                            dest='feat_loc',
                            required=True,
                            help='Path to CSV feature file.')
        parser.add_argument(
            '-o',
            '--out',
            dest='out_loc',
            required=True,
            help='Path to where classification summary should be saved.')
        parser.add_argument('--hmm',
                            dest='hmm_flag',
                            action='store_true',
                            help='Classify with a Hidden Markov Model.')
        parser.add_argument('--rf',
                            dest='rf_flag',
                            action='store_true',
                            help='Classify with a random forest.')
        parser.add_argument('--n_components',
                            dest='n_components',
                            help='Number of components for the HMM.')
        parser.add_argument('--n_mix',
                            dest='n_mix',
                            help='Number of Gaussian mixtures for the HMM.')
        parser.add_argument(
            '--n_estimators',
            dest='n_estimators',
            help='Number of tree estimators for the random forest.')
        args = parser.parse_args()

    if args.hmm_flag or args.rf_flag:
        # store scores from all runs to calc stats
        hmm_chunk_scores = []
        hmm_overall_scores = []
        rf_chunk_scores = []
        rf_overall_scores = []

        # split data for leave-one-group(call)-out validation
        data, labels, ids = sep_data_labels(args.feat_loc)
        logo = LeaveOneGroupOut()
        curr_split = 1
        num_splits = logo.get_n_splits(data, labels, ids)

        # loop through all cross validation folds
        for train_index, test_index in logo.split(data, labels, ids):
            print('Split ' + str(curr_split) + ' out of ' + str(num_splits))
            data_train, data_test = data[train_index], data[test_index]
            labels_train, labels_test = labels[train_index], labels[test_index]
            # classify with the selected models
            if args.hmm_flag:
                if args.n_components:
                    n_components = int(args.n_components)
                else:
                    n_components = 2
                if args.n_mix:
                    n_mix = int(args.n_mix)
                else:
                    n_mix = 2
                hmm_model = HmmMorency(n_components=n_components, n_mix=n_mix)
                chunk_scores, call_score = train_and_test(
                    hmm_model, data_train, data_test, labels_train,
                    labels_test)
                hmm_chunk_scores.append(chunk_scores)
                hmm_overall_scores.append(call_score)

            if args.rf_flag:
                if args.n_estimators:
                    n_estimators = int(args.n_estimators)
                else:
                    n_estimators = 100
                rf_model = RandomForestClassifier(n_estimators=n_estimators,
                                                  n_jobs=-1,
                                                  random_state=10)
                chunk_scores, call_score = train_and_test(
                    rf_model, data_train, data_test, labels_train, labels_test)
                rf_chunk_scores.append(chunk_scores)
                rf_overall_scores.append(call_score)

            curr_split += 1

        # evaluate the scores for all models
        out_file = os.path.join(args.out_loc, 'results.txt')
        if args.hmm_flag:
            score_stats(
                'hmm, mix: ' + str(n_mix) + ' states: ' + str(n_components),
                hmm_chunk_scores, hmm_overall_scores, out_file)
        if args.rf_flag:
            score_stats('random forest, estimators: ' + str(n_estimators),
                        rf_chunk_scores, rf_overall_scores, out_file)

    else:
        sys.exit(
            'Must choose at least one classification method. (--hmm, --rf)')
コード例 #27
0
# Load feature matrices, labels, and groups (denoting which labeled time
# segment each row of the feature matrix comes from)
featuresAll = np.loadtxt(dataPath + dataName + '_all.csv', delimiter=',')
featuresAcc = np.loadtxt(dataPath + dataName + '_acc.csv', delimiter=',')
featuresEda = np.loadtxt(dataPath + dataName + '_eda.csv', delimiter=',')
labels = np.loadtxt(dataPath + dataName + '_label.csv')
groups = np.loadtxt(dataPath + dataName + '_groups.csv')
# Indicates the subjects that have no MAs, in order to exclude them during grid search
includeRowsTrain = np.logical_and(
    np.logical_and(np.where(groups != 5, True, False),
                   np.where(groups != 17, True, False)),
    np.where(groups != 18, True, False))

# Leave-one-group-out cross-validation
cv = LeaveOneGroupOut()

## Svm
# Parameter tuning by grid search
regParamC = 10.**np.arange(-3, 5)
regParamG = 10.**np.arange(-10, 1)
parameters = {'gamma': regParamG, 'C': regParamC}

svmgsAll = GridSearchCV(svm.SVC(),
                        parameters,
                        'roc_auc',
                        n_jobs=nJobs,
                        cv=cv,
                        refit=False,
                        verbose=1)
svmgsAll.fit(featuresAll[includeRowsTrain, :], labels[includeRowsTrain],
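When GridSearchCV is given a group-aware splitter such as LeaveOneGroupOut, the group labels must also be forwarded to fit; the truncated call above presumably does this, but here is a self-contained sketch (passing groups[includeRowsTrain] is an assumption about that call):

from sklearn import svm
from sklearn.model_selection import GridSearchCV, LeaveOneGroupOut

cv = LeaveOneGroupOut()
gs = GridSearchCV(svm.SVC(), {'C': [1, 10], 'gamma': [1e-3, 1e-2]},
                  scoring='roc_auc', cv=cv, refit=False)

# groups= lets the LeaveOneGroupOut splitter form the folds inside fit
gs.fit(featuresAll[includeRowsTrain, :], labels[includeRowsTrain],
       groups=groups[includeRowsTrain])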
コード例 #28
0
def whole_brain_analysis(model, fmri_runs, design_matrices, subject):
    #   - fmri_runs: list of fMRI data runs (1 for each run)
    #   - matrices: list of design matrices (1 for each run)
    distribution_array = None
    nb_runs = len(fmri_runs)
    nb_voxels = fmri_runs[0].shape[1]
    n_sample = params.n_sample
    scores_cv = np.zeros((nb_runs, nb_voxels))
    distribution_array = np.zeros((nb_runs, n_sample, nb_voxels))

    logo = LeaveOneGroupOut()  # leave one run out
    columns_index = np.arange(design_matrices[0].shape[1])
    shuffling = []
    cv = 0
    for _ in range(n_sample):
        np.random.shuffle(columns_index)
        # Append a copy: shuffle() works in place, so appending the array
        # itself would leave every entry pointing at the same permutation
        shuffling.append(columns_index.copy())
    for train, test in tqdm(logo.split(fmri_runs, groups=range(1,
                                                               nb_runs + 1))):
        fmri_data_train = np.vstack([
            fmri_runs[i] for i in train
        ])  # fmri_runs liste 2D colonne = voxels et chaque row = un t_i
        predictors_train = np.vstack([design_matrices[i] for i in train])

        if type(model) == sklearn.linear_model.RidgeCV:
            nb_samples = np.cumsum(
                [0] + [[fmri_runs[i] for i in train][i].shape[0]
                       for i in range(len([fmri_runs[i] for i in train]))
                       ])  # list of cumulative lengths
            indexes = {
                'run{}'.format(run + 1): [nb_samples[i], nb_samples[i + 1]]
                for i, run in enumerate(train)
            }
            model.cv = Splitter(
                indexes_dict=indexes, n_splits=nb_runs - 1
            )  # adequate splitter for cross-validate alpha taking into account groups
        print('Fitting model...')
        start = time()
        model_fitted = model.fit(predictors_train, fmri_data_train)
        # pickle.dump(model_fitted, open(join(paths.path2derivatives, 'fMRI/glm-indiv/english', str(model).split('(')[0] + '{}.sav'.format(test[0])), 'wb'))
        with open(os.path.join(paths.path2derivatives, 'fMRI', 'time2fit.txt'),
                  'w') as f:
            f.write(str(model_fitted))
            f.write('Model fitted in {}.'.format(time() - start))
        print('Model fitted in {}.'.format(time() - start))

        # return the R2_score for each voxel (=list)
        #r2 = get_r2_score(model_fitted, fmri_runs[test[0]], design_matrices[test[0]])
        r2, distribution = sample_r2(model_fitted,
                                     design_matrices[test[0]],
                                     fmri_runs[test[0]],
                                     shuffling=shuffling,
                                     n_sample=n_sample,
                                     alpha_percentile=params.alpha_percentile)

        # log the results
        # log(subject, voxel='whole brain', alpha=None, r2=r2)

        scores_cv[cv, :] = r2  # r2 is a 1d array: 1 value for each voxel
        distribution_array[
            cv, :, :] = distribution  # distribution is a 2d array: n_sample values for each voxel
        cv += 1
    # result = pd.DataFrame(scores, columns=['voxel #{}'.format(i) for i in range(scores.shape[1])])
    # result.to_csv(join(paths.path2derivatives, 'fMRI/glm-indiv/english', str(model).split('(')[0] + '.csv'))
    return scores_cv, distribution_array  # 2D arrays : (nb_runs_test, nb_voxels)
コード例 #29
0
                           param_grid=param_dist,
                           cv=cv,
                           n_jobs=-1,
                           verbose=True,
                           scoring='neg_root_mean_squared_error')
grid_search.fit(X_train_test, y_train_test, groups=groups_train_test)

best_grid = grid_search.best_estimator_
print("Best Grid")
print(best_grid)
#choosing the reg model from grid search
reg_model = grid_search.best_estimator_

###################### Regression ##########################
############# perform leave-one-group-out cross-validation on the training-testing set #############
logo = LeaveOneGroupOut()

#initialize vector to keep score for each fold
y_train_test_predictions = np.zeros(y_train_test.shape)

for train, test in logo.split(X_train_test,
                              y_train_test,
                              groups=groups_train_test):

    #perform train test split for the current fold
    X_train, X_test, y_train, y_test = X_train_test[train], X_train_test[
        test], y_train_test[train], y_train_test[test]

    model = reg_model  # regression model from the grid search portion

    model_trained = model.fit(X_train, y_train)
コード例 #30
0
    def findParametersAndEvaluate(self,
                                  data,
                                  strategy,
                                  label_name,
                                  group=None,
                                  dataset=None,
                                  cv=5):

        self.strategy = strategy
        self.results = {}

        print('-------------------------------')
        print(' STEP : Finding Parameters & Evaluate Models')
        print('-------------------------------')

        self.label_name_check(label_name)
        #print(self.labelset.columns)

        # store performance data for each strategy

        if (strategy == 'train_test_split' or strategy == 'all'):

            self.train_test = dict()
            for model in self.models.keys():
                self.train_test[model] = None

            print('===> Evaluation strategy: Train and Test Split   ')
            X_train, X_test, y_train, y_test = train_test_split(
                data,
                self.label_set[label_name],
                train_size=.7,
                random_state=self.seed)

            print('===> Parameters find-> Start')
            for model in self.models.keys():
                if model == 'vot':
                    continue
                if not self.configured:
                    gd = GridSearchCV(self.models[model],
                                      self.params[model],
                                      cv=cv,
                                      scoring='neg_root_mean_squared_error')
                    gd.fit(X_train, y_train)
                    print('       Parameters for ', model, ': ',
                          gd.best_params_)
                    self.models[model] = gd.best_estimator_

            print('===> Parameters find-> End')

            test_performances = dict()
            print('===> Test data performance[RMSE] ')
            for model in self.models.keys():
                self.models[model].fit(X_train, y_train)
                test_performances[model] = mean_squared_error(
                    y_test, self.models[model].predict(X_test), squared=False)
                #print('       Model[',model,']:',test_performances[model])
                self.train_test[model] = test_performances[model]
            print(self.train_test)

            self.results['train_test'] = self.train_test

        if (strategy == 'cross_val' or strategy == 'all'):

            self.cross_val = dict()
            cross_val = dict()
            for model in self.models.keys():
                self.cross_val[model] = None

            print('==============================================')
            print('Evaluation strategy: Cross Validation')
            print('==============================================')
            for model in self.models.keys():

                if model != 'vot' and not self.configured:
                    print('    ==> Finding params for ', model)
                    gd = GridSearchCV(self.models[model],
                                      self.params[model],
                                      cv=10,
                                      scoring='neg_root_mean_squared_error')
                    gd.fit(data, self.label_set[label_name])
                    print('        Parameters: ', gd.best_params_)
                    self.models[model] = gd.best_estimator_

                cross_val[model] = cross_val_score(
                    self.models[model],
                    data,
                    self.label_set[label_name],
                    scoring='neg_root_mean_squared_error',
                    cv=cv)
                #print('  Score[',model,']:',cross_val_scores[model])

                cross_val_mean = -1 * statistics.mean(cross_val[model])
                cross_val_var = statistics.variance(cross_val[model])

                self.cross_val[model] = [cross_val_mean, cross_val_var]

            self.results['cross_val'] = self.cross_val

        if (strategy == 'leave_one_group_out' or strategy == 'all'):

            self.leave_group = dict()
            for model in self.models.keys():
                self.leave_group[model] = None

            print('==============================================')
            print('Evaluation strategy: Leave one group out')
            print('==============================================')

            logo = LeaveOneGroupOut()
            n_splits = logo.get_n_splits(groups=group)

            error = dict()

            for model in self.models.keys():
                error[model] = [None] * n_splits

            k = 0
            for train_index, test_index in logo.split(
                    data, self.label_set[label_name], group):
                #print(test_index)

                X_train, y_train = data.iloc[train_index], self.label_set[
                    label_name][train_index]
                X_test, y_test = data.iloc[test_index], self.label_set[
                    label_name][test_index]

                for model in self.models.keys():

                    if model != 'vot' and not self.configured:

                        print('    ==> Finding params for ', model)
                        gd = GridSearchCV(
                            self.models[model],
                            self.params[model],
                            cv=10,
                            scoring='neg_root_mean_squared_error')
                        gd.fit(X_train, y_train)
                        print('        Parameters: ', gd.best_params_)
                        estimator = gd.best_estimator_

                        self.models[model] = estimator

                    self.models[model].fit(X_train, y_train)
                    error[model][k] = mean_squared_error(
                        y_test,
                        self.models[model].predict(X_test),
                        squared=False)

                    #print('    Model[',model,']:',error[model])

                k = k + 1

            for model in self.models.keys():
                err_mean = statistics.mean(error[model])
                err_var = statistics.variance(error[model])
                self.leave_group[model] = [err_mean, err_var]

            self.results['leave_group'] = self.leave_group

        if (strategy == 'leave_one_dataset_out' or strategy == 'all'):
            self.leave_dataset = dict()
            for model in self.models.keys():
                self.leave_dataset[model] = None

            print('==============================================')
            print('Evaluation strategy: Leave one dataset out')
            print('==============================================')

            logo = LeaveOneGroupOut()
            n_splits = logo.get_n_splits(groups=dataset)

            error = dict()

            for model in self.models.keys():
                error[model] = [None] * n_splits

            k = 0

            for train_index, test_index in logo.split(
                    data, self.label_set[label_name], dataset):

                X_train, y_train = data.iloc[train_index], self.label_set[
                    label_name][train_index]
                X_test, y_test = data.iloc[test_index], self.label_set[
                    label_name][test_index]

                for model in self.models.keys():

                    if model != 'vot' and not self.configured:

                        print('    ==> Finding params for ', model)
                        gd = GridSearchCV(
                            self.models[model],
                            self.params[model],
                            cv=10,
                            scoring='neg_root_mean_squared_error')
                        gd.fit(X_train, y_train)
                        #print('        Parameters: ',gd.best_params_)
                        estimator = gd.best_estimator_

                        self.models[model] = estimator

                    self.models[model].fit(X_train, y_train)

                    error[model][k] = mean_squared_error(
                        y_test,
                        self.models[model].predict(X_test),
                        squared=False)

                    #print('    Model[',model,']:',error[model])
                k = k + 1
            for model in self.models.keys():
                err_mean = statistics.mean(error[model])
                err_var = statistics.variance(error[model])

                self.leave_dataset[model] = [err_mean, err_var]

            self.results['leave_dataset'] = self.leave_dataset

        if (strategy == 'sorted_stratified' or strategy == 'all'):

            self.stratified = dict()
            for model in self.models.keys():
                self.stratified[model] = None

            # idea from https://scottclowe.com/2016-03-19-stratified-regression-partitions/
            print('==============================================')
            print('Evaluation strategy: Sorted Stratification')
            print('==============================================')

            label_df = pd.DataFrame(self.label_set)

            indices = label_df.sort_values(by=[label_name]).index.tolist()
            splits = dict()

            error = dict()
            for model in self.models.keys():
                error[model] = [None] * cv

            for i in range(cv):
                splits[i] = list()

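            # Deal the target-sorted indices into folds: within every consecutive
            # block of cv samples, each fold receives exactly one sample at random.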
            for i in range(len(indices)):
                if i % cv == 0:
                    pick = random.sample(range(cv), cv)
                cur_pick = pick.pop()
                splits[cur_pick].append(indices[i])

            for i in range(cv):
                test_index = splits[i]
                train_index = []
                for j in range(cv):
                    if j != i:
                        train_index = train_index + splits[j]

                ##########################################

                # Train and evaluate each model on this sorted-stratified split
                X_train, y_train = data.iloc[train_index], self.label_set[
                    label_name][train_index]
                X_test, y_test = data.iloc[test_index], self.label_set[
                    label_name][test_index]

                for model in self.models.keys():
                    if model != 'vot' and not self.configured:

                        print('    ==> Finding params for ', model)
                        gd = GridSearchCV(
                            self.models[model],
                            self.params[model],
                            cv=10,
                            scoring='neg_root_mean_squared_error')
                        gd.fit(X_train, y_train)
                        print('        Parameters: ', gd.best_params_)
                        estimator = gd.best_estimator_
                        self.models[model] = estimator

                    self.models[model].fit(X_train, y_train)

                    error[model][i] = mean_squared_error(
                        y_test,
                        self.models[model].predict(X_test),
                        squared=False)

                    #print('    Model[',model,']:',error[model])

            for model in self.models.keys():
                err_mean = statistics.mean(error[model])
                err_var = statistics.variance(error[model])
                self.stratified[model] = [err_mean, err_var]
                ##########################################

            self.results['stratified'] = self.stratified

        if strategy not in ('leave_one_group_out', 'leave_one_dataset_out',
                            'sorted_stratified', 'all'):
            print('Unsupported evaluation strategy')
            return None

        return self.results

        # Preparing dataframe with results for report generation
コード例 #31
0

df['class_label'] = df.apply(lambda row: give_class(row), axis=1)

data_train_x = df[(df.Year != 2018) & (df.Year != 2016)][train_columns]
data_train_y = df[(df.Year != 2018) & (df.Year != 2016)].class_label
data_dev_x = df[df.Year == 2016][train_columns]
data_dev_y = df[df.Year == 2016].class_label
data_all_x = df[df.Year != 2018][train_columns]
data_all_y = df[df.Year != 2018].class_label

# Temporal_cv------------------------------------

from sklearn.model_selection import LeaveOneGroupOut
groups = data_all_x.Year.values.tolist()
temporal_cv = LeaveOneGroupOut()

#logistic regression------------------------------------

from sklearn.linear_model import LogisticRegression

clever_print('logistic regression with no penalization')
basic_multi_logi = LogisticRegression(max_iter=10000000,
                                      penalty='l2',
                                      C=10000000000).fit(
                                          data_train_x, data_train_y)
print('accuracy on training')
print(accuracy_score(data_train_y, basic_multi_logi.predict(data_train_x)))
print('accuracy on dev')
print(accuracy_score(data_dev_y, basic_multi_logi.predict(data_dev_x)))
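The temporal_cv splitter defined above is not used in the visible part of this snippet; a minimal, hedged sketch of how it could score the same model with leave-one-year-out cross-validation (reusing the snippet's names; the cross_val_score call is an addition, not part of the original):

from sklearn.model_selection import cross_val_score

# cross_val_score clones the estimator for each fold, so the already-fit model can be reused
cv_scores = cross_val_score(basic_multi_logi, data_all_x, data_all_y,
                            groups=groups, cv=temporal_cv, scoring='accuracy')
print('leave-one-year-out accuracy per fold:', cv_scores)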
コード例 #32
0
for i,n in enumerate(sorted(names)):
    roi_name=fold+'mni4060/asymroi_'+smt+'_'+n+'.npz'            
    roi=np.load(roi_name)['roi']
    roi=roi[:,motor_label-1]
    roi_imp=roi[mask_imp]
    roi_imag=roi[mask_imag]
    roi_imp_all=np.vstack((roi_imp_all,roi_imp))
    roi_imag_all=np.vstack((roi_imag_all,roi_imag))
    y_imp_all=np.append(y_imp_all,y_imp)
    y_imag_all=np.append(y_imag_all,y_imag)
    groups=np.append(groups,np.ones(len(y_imp))*i)
result_cv_tr_imp=[] 
result_cv_tr_imag=[]   
pipeline = Pipeline([('scale', scaler),('svm', svm)])    
from sklearn.model_selection import LeaveOneGroupOut
logo = LeaveOneGroupOut()
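# Cross-decoding with leave-one-group-out splits (one group per input file): train on
# the 'imp' condition, test on the held-out group's 'imag' condition, then swap roles.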
for train_index, test_index in logo.split(roi_imp_all, y_imp_all, groups):
    X_train, X_test = roi_imp_all[train_index], roi_imag_all[test_index]
    y_train, y_test = y_imp_all[train_index], y_imp_all[test_index]
    pipeline.fit(X_train,y_train)
    prediction = pipeline.predict(X_test)  
    result_cv_tr_imp.append(accuracy_score(prediction,y_test))
    
    X_train, X_test = roi_imag_all[train_index], roi_imp_all[test_index]
    y_train, y_test = y_imp_all[train_index], y_imp_all[test_index]
    pipeline.fit(X_train,y_train)
    prediction = pipeline.predict(X_test)  
    result_cv_tr_imag.append(accuracy_score(prediction,y_test))

from scipy.stats import ttest_1samp
tt,p=ttest_1samp(np.array(result_cv_tr_imag),0.5)
コード例 #33
0
# osx_data_path = '/Users/elijahc/data/uminn/preprocessed'
# all_data = load_preprocessed(data_path=osx_data_path,file_names=file_names,simple=True)
# merged_data = load_preprocessed(data_path=osx_data_path,file_names=file_names,merge_keys=['pows','stages'],simple=True)

# linux
all_data = load_preprocessed(file_names=file_names, simple=True)
merged_data = load_preprocessed(file_names=file_names,
                                merge_keys=['pows', 'stages'],
                                simple=True)

scaler = StandardScaler()
scaler.fit(merged_data['pows'])

# One group label per sample, one group per recording in all_data
groups = np.concatenate([[i] * len(d['pows'])
                         for i, d in enumerate(all_data)])
# LeaveOneGroupOut takes no constructor arguments; groups are passed to split()
logo = LeaveOneGroupOut()
import ipdb

ipdb.set_trace()

model_params = dict(
    layer_spec=[64],
    # reg_weight=0.01,
    # num_labels=3,
    activ='relu',
)
fit_params = dict(
    # validation_split=0.1,
    epochs=500,
    verbose=0,
)
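The splitter above is never consumed in the visible portion of this snippet; a minimal, hedged sketch of how the flattened groups could drive the cross-validation loop (treating merged_data['stages'] as the labels is an assumption based on the merge_keys above, and the model fit itself is elided):

X = scaler.transform(merged_data['pows'])
y = np.asarray(merged_data['stages'])
for train_idx, test_idx in logo.split(X, y, groups=groups):
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]
    # ... build and fit the model defined by model_params / fit_params here ...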
コード例 #34
0
                         (DataImp_dropNA['Well Name'] == 'SHRIMPLIN')].index
DataImp_dropF9 = DataImp_dropNA.drop(dropidx)
wells_noPE = DataImp_dropF9['Well Name'].values
DataImp = DataImp_dropF9.drop(['Formation', 'Well Name', 'Depth'],
                              axis=1).copy()

Ximp = DataImp.loc[:, DataImp.columns != 'PE'].values
Yimp = DataImp.loc[:, 'PE'].values

from sklearn.preprocessing import RobustScaler
scaler = RobustScaler()
scaler.fit(Ximp)
Ximp_scaled = scaler.transform(Ximp)

from sklearn.metrics import mean_squared_error
logo = LeaveOneGroupOut()
R2list = []
mselist = []

start = time.time()

for train, test in logo.split(Ximp_scaled, Yimp, groups=wells_noPE):
    well_name = wells_noPE[test[0]]

    # Imputation using linear regression
    linear_model = LinearRegression()
    linear_model.fit(Ximp_scaled[train], Yimp[train])

    R2 = linear_model.score(Ximp_scaled[test], Yimp[test])  # R2
    print("Well name_test : ", well_name)
    print("R2: %.4f" % R2)
コード例 #35
0
def RF_classifier(X_data,Y_data,options=None):
    from sklearn.ensemble import RandomForestClassifier

    ####################
    # Parse user options
    ####################
    params = {}
    gridsearch   = False
    GS_settings  = None
    randomsearch = False
    RS_settings  = None
    accuracy = False
    cv_type = 'logo'
    scoring = 'f1'

    if (options is not None):

        if (("RF_parameters" in options)==True):
            params = options['RF_parameters']

        if (("grid_search" in options)==True):
            from sklearn.model_selection import GridSearchCV
            gridsearch = True
            GS_params   = options['grid_search']['parameter_grid']
            if (("settings" in options['grid_search'])==True): GS_settings = options['grid_search']['settings'] 

        if (("random_search" in options)==True):
            from sklearn.model_selection import RandomizedSearchCV
            from cfd2ml.utilities import convert_param_dist
            randomsearch = True
            RS_params, RS_Nmax   = convert_param_dist(options['random_search']['parameter_grid'])
            print('RS_Nmax = ', RS_Nmax)
            if (("settings" in options['random_search'])==True): RS_settings = options['random_search']['settings'] 

        if(randomsearch==True and gridsearch==True): quit('********** Stopping! grid_search and random_search both set *********')

        if (("accuracy" in options)==True):
            accuracy = options['accuracy']
            if (accuracy==True):
                from sklearn.model_selection import cross_validate
                from sklearn.metrics import precision_recall_curve, auc, f1_score, accuracy_score, balanced_accuracy_score, confusion_matrix
                from sklearn.base import clone
                from cfd2ml.utilities import print_cm

        if (("scoring" in options)==True):
            scoring = options['scoring']

        if (("cv_type" in options)==True):
            cv_type = options['cv_type']

    ##############
    # Prepare data
    ##############
    if(cv_type=='logo'): groups = X_data['group']
    X_data = X_data.drop(columns='group')

    # Find feature and target headers
    X_headers = X_data.columns
    Y_header  = Y_data.name

    nX = X_headers.size
    print('\nFeatures:')
    for i in range(0,nX):
        print('%d/%d: %s' %(i+1,nX,X_headers[i]) )
    print('\nTarget: ', Y_header)
  
    ########################
    # Prepare other settings
    ########################
    # Setting cross-validation type (either leave-one-group-out or stratified 10-fold)
    if(cv_type=='logo'):
        from sklearn.model_selection import LeaveOneGroupOut
        logo = LeaveOneGroupOut()
        ngroup = logo.get_n_splits(groups=groups)
        print('\nUsing Leave-One-Group-Out cross validation on ', ngroup, ' groups')
    elif(cv_type=='kfold'):
        from sklearn.model_selection import StratifiedKFold
        print('\nUsing 10-fold cross validation')
        k_fold = StratifiedKFold(n_splits=10, random_state=42,shuffle=True)
        cv = k_fold.split(X_data,Y_data)

    #########################
    # Training the classifier
    #########################
    # TODO TODO TODO - improve accuracy by using balanced or weighted random forest
    # (see https://statistics.berkeley.edu/sites/default/files/tech-reports/666.pdf)
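    # A hedged sketch of that option (not enabled here): scikit-learn exposes it via
    # the class_weight argument, e.g.
    #   clf = RandomForestClassifier(**params, class_weight='balanced_subsample', random_state=42)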
    if(gridsearch==True):
        # Finding optimal hyperparameters with GridSearchCV
        print('\n Performing GridSearchCV to find optimal hyperparameters for random forest classifier')
        clf = RandomForestClassifier(**params,random_state=42)
        if (cv_type=='logo'): cv = logo.split(X_data,Y_data,groups)
        GS_clf = GridSearchCV(estimator=clf,param_grid=GS_params, cv=cv, scoring=scoring, iid=False, verbose=2, **GS_settings)
        GS_clf.fit(X_data,Y_data)

        # Write out results to file
        scores_df = pd.DataFrame(GS_clf.cv_results_)#.sort_values(by='rank_test_score')
        scores_df.to_csv('GridSearch_results.csv')

        # Pick out best results
        best_params = GS_clf.best_params_
        best_score  = GS_clf.best_score_
        clf = GS_clf.best_estimator_  # (this clf has been fit to all of the X_data,Y_data)

        print('\nBest hyperparameters found:', best_params)
        print('\nScore with these hyperparameters:', best_score)

    elif(randomsearch==True):
        # Finding optimal hyperparameters with RandomSearchCV
        print('\n Performing RandomizedSearchCV to find optimal hyperparameters for random forest classifier')
        clf = RandomForestClassifier(**params,random_state=42)
        if (cv_type=='logo'): cv = logo.split(X_data,Y_data,groups)
        RS_clf = RandomizedSearchCV(estimator=clf,param_distributions=RS_params, cv=cv, scoring=scoring,iid=False, verbose=2, error_score=np.nan, **RS_settings)
        RS_clf.fit(X_data,Y_data)
        
        # Write out results to file
        scores_df = pd.DataFrame(RS_clf.cv_results_)#.sort_values(by='rank_test_score')
        scores_df.to_csv('RandomSearch_results.csv')

        # Pick out best results
        best_params = RS_clf.best_params_
        best_score  = RS_clf.best_score_
        clf = RS_clf.best_estimator_  # (this clf has been fit to all of the X_data,Y_data)

        print('\nBest hyperparameters found:', best_params)
        print('\nScore with these hyperparameters:', best_score)


    else:
        # Train RF classifier with hyperparameters given by user
        print('\nTraining random forest classifier with given hyperparameters')
        clf = RandomForestClassifier(**params)
        clf.fit(X_data,Y_data)

    # Cross validation accuracy metrics
    if(accuracy==True):
        print('\nPerforming cross validation to determine train and test accuracy/error, and precision-recall curves')

        #TODO - capability to decide on probability threshold, and predict with chosen threshold

        # Get generator object depending on cv strategy
        if (cv_type=='logo'): 
            cv = logo.split(X_data,Y_data,groups)
        elif(cv_type=='kfold'):
            cv = k_fold.split(X_data,Y_data)  # Need to regen "Generator" object

        fig1, ax1 = plt.subplots()

        # Init lists
        y_real   = []
        y_proba  = []
        train_f1 = []
        test_f1  = []
        train_A  = []
        test_A   = []
        train_BA = []
        test_BA  = []

        # Loop through CV folds
        i = 0
        for train_index, test_index in cv:
            X_train, X_test = X_data.iloc[train_index], X_data.iloc[test_index]
            Y_train, Y_test = Y_data.iloc[train_index], Y_data.iloc[test_index]

            # Train a fresh, unfitted copy of the classifier for this fold, so the
            # estimator returned at the end is not left fitted on the final fold only
            clf_cv = clone(clf)
            clf_cv.fit(X_train, Y_train)

            # Predict Y
            Y_pred_train = clf_cv.predict(X_train)
            Y_pred_test  = clf_cv.predict(X_test )

            # F1 scores
            f1score = f1_score(Y_test , Y_pred_test)
            train_f1.append(f1_score(Y_train, Y_pred_train) )
            test_f1.append(f1score)
            # Accuracy scores
            Ascore = accuracy_score(Y_test , Y_pred_test)
            train_A.append(accuracy_score(Y_train, Y_pred_train) )
            test_A.append(Ascore)
            # Balanced accuracy scores
            BAscore = balanced_accuracy_score(Y_test , Y_pred_test)
            train_BA.append(balanced_accuracy_score(Y_train, Y_pred_train) )
            test_BA.append(BAscore)

            # Print validation scores (training scores are stored to print mean later, but not printed for each fold)
            if(cv_type=='logo'):
                print('\nTest group = ', groups.iloc[test_index[0]])
            elif(cv_type=='kfold'):
                print('\nFold = ', i)
            print('-------------------')
            print('F1 score = %.2f %%' %(f1score*100) )
            print('Total error = %.2f %%' %((1.0-Ascore)*100) )
            print('Per-class error = %.2f %%' %((1.0-BAscore)*100) )

            # Print confusion matrix for this fold
            print('Confusion matrix:')
            confuse_mat = confusion_matrix(Y_test, Y_pred_test)
            print_cm(confuse_mat, ['Off','On'])
            
            # Prediction probability based on X_test (used for precision-recall curves)
            pred_proba = clf_cv.predict_proba(X_test)
            precision, recall, _ = precision_recall_curve(Y_test, pred_proba[:,1])
            lab = 'Fold %d AUC=%.4f' % (i+1, auc(recall, precision))
            ax1.step(recall, precision, label=lab)
            y_real.append(Y_test)
            y_proba.append(pred_proba[:,1])

            i += 1

        # Calculate errors from accuracies
        train_TE = 1.0 -  np.array(train_A)
        test_TE  = 1.0 -  np.array(test_A)
        train_CAE = 1.0 - np.array(train_BA)
        test_CAE  = 1.0 - np.array(test_BA)

        # Print performance scores
        print('\nMean training scores:')
        print('F1 score = %.2f %%' %(np.mean(train_f1)*100) )
        print('Total error = %.2f %%' %(np.mean(train_TE)*100) )
        print('Per-class error = %.2f %%' %(np.mean(train_CAE)*100) )
    
        print('\nMean validation scores:')
        print('F1 score = %.2f %%' %(np.mean(test_f1)*100) )
        print('Total error = %.2f %%' %(np.mean(test_TE)*100) )
        print('Per-class error = %.2f %%' %(np.mean(test_CAE)*100) )

        
        # Average precision-recall over folds, and plot curves
        y_real = np.concatenate(y_real)
        y_proba = np.concatenate(y_proba)
        precision, recall, _ = precision_recall_curve(y_real, y_proba)
        lab = 'Overall AUC=%.4f' % (auc(recall, precision))
        ax1.step(recall, precision, label=lab, lw=2, color='black')
        ax1.set_xlabel('Recall')
        ax1.set_ylabel('Precision')
        ax1.legend(loc='lower left', fontsize='small')
        

        plt.show()

    return clf
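A hedged usage sketch of the function above; the hyperparameter values and the shape of the inputs are illustrative assumptions, not taken from the original repository:

# X_data: pandas DataFrame of features including a 'group' column (required for cv_type='logo')
# Y_data: pandas Series of binary class labels
options = {
    'RF_parameters': {'n_estimators': 200, 'min_samples_leaf': 5},
    'cv_type': 'logo',    # leave-one-group-out, driven by X_data['group']
    'scoring': 'f1',
    'accuracy': True,     # run the per-fold accuracy / precision-recall report
}
clf = RF_classifier(X_data, Y_data, options=options)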