Python LeavePGroupsOut.LeavePGroupsOutの例、sklearn.model_selection.LeavePGroupsOut.LeavePGroupsOut Pythonの例

コード例 #1

0

ファイルを表示

ファイル: cross_validation.py プロジェクト: urubens/CellCounting

def cv_strategy(parameters):
    """Return the group-aware cross-validator selected by ``parameters``.

    ``parameters.cv_mode`` chooses the splitter and ``parameters.cv_param``
    is forwarded to it: as ``n_splits`` for ``'GKF'`` (GroupKFold) and as
    ``n_groups`` for ``'LPGO'`` (LeavePGroupsOut).

    Raises:
        ValueError: if ``cv_mode`` is neither ``'GKF'`` nor ``'LPGO'``.
    """
    mode = parameters.cv_mode
    if mode == 'GKF':
        return GroupKFold(n_splits=parameters.cv_param)
    if mode == 'LPGO':
        return LeavePGroupsOut(n_groups=parameters.cv_param)
    raise ValueError("Unknown CV mode")

コード例 #2

0

ファイルを表示

ファイル: ml.py プロジェクト: hyruuk/NeuroPy-MLToolbox

 def _init_atributes(self, y, groups):
     """Validate ``y``/``groups`` and lazily fill the unset attributes.

     Attributes that are still ``None`` are derived here: ``labels_list``
     (unique values of ``y``), ``n_labs`` (number of labels), ``n_each``
     (``n_groups`` divided by ``n_labs``), and ``lpgos``/``indexes`` (one
     ``LeavePGroupsOut(n_each)`` and one array of sample positions per
     label).

     Raises:
         Exception: if ``y`` or ``groups`` is None, or their lengths differ.
         AssertionError: if ``n_groups`` is not a multiple of ``n_labs``.
     """
     # Check for None first: calling len() on None would raise a TypeError
     # and hide the dedicated error messages below.
     if y is None:
         raise Exception("Error: y cannot be None")
     if groups is None:
         raise Exception("Error: this function requires a groups parameter")
     if len(y) != len(groups):
         raise Exception("Error: y and groups need to have the same length")
     if self.labels_list is None:
         self.labels_list = list(set(y))
     if self.n_labs is None:
         self.n_labs = len(self.labels_list)
     # Raised explicitly (instead of ``assert``) so the check is not stripped
     # under ``python -O``; the exception type is kept for existing callers.
     if self.n_groups % self.n_labs != 0:
         raise AssertionError(
             "Error: The number of groups to leave out must be a multiple of the number of classes"
         )
     if self.n_each is None:
         self.n_each = int(self.n_groups / self.n_labs)
     if self.lpgos is None:
         lpgos, indexes = [], []
         for label in self.labels_list:
             # assumes y supports element-wise comparison (e.g. a numpy
             # array) -- TODO confirm against callers
             indexes.append(np.where(y == label)[0])
             lpgos.append(LeavePGroupsOut(self.n_each))
         self.lpgos = lpgos
         self.indexes = np.array(indexes)

コード例 #3

0

ファイルを表示

ファイル: test_search.py プロジェクト: zihua/scikit-learn

def test_grid_search_groups():
    """GridSearchCV must hand ``groups`` through to group-aware splitters.

    With a group-based splitter, fitting without ``groups`` has to raise the
    splitter's ValueError and fitting with ``groups`` has to succeed.
    Splitters that do not use groups must fit without them.
    """
    random_state = np.random.RandomState(0)

    X, y = make_classification(n_samples=15, n_classes=2, random_state=0)
    groups = random_state.randint(0, 3, 15)

    estimator = LinearSVC(random_state=0)
    param_grid = {'C': [1]}
    missing_groups_msg = "The groups parameter should not be None"

    group_splitters = (LeaveOneGroupOut(), LeavePGroupsOut(2), GroupKFold(),
                       GroupShuffleSplit())
    for splitter in group_splitters:
        search = GridSearchCV(estimator, param_grid, cv=splitter)
        assert_raise_message(ValueError, missing_groups_msg, search.fit, X, y)
        search.fit(X, y, groups=groups)

    plain_splitters = (StratifiedKFold(), StratifiedShuffleSplit())
    for splitter in plain_splitters:
        search = GridSearchCV(estimator, param_grid, cv=splitter)
        # These splitters need no groups, so a plain fit must not raise.
        search.fit(X, y)

コード例 #4

0

ファイルを表示

ファイル: test_validation.py プロジェクト: victzh/scikit-learn

def test_cross_val_score_predict_groups():
    """Missing ``groups`` must surface as ValueError from both helpers.

    For every group-based splitter, ``cross_val_score`` and
    ``cross_val_predict`` are called without ``groups`` and are expected to
    raise the splitter's ValueError.
    """
    X, y = make_classification(n_samples=20, n_classes=2, random_state=0)

    estimator = SVC(kernel="linear")
    expected_msg = "The groups parameter should not be None"

    splitters = (
        LeaveOneGroupOut(),
        LeavePGroupsOut(2),
        GroupKFold(),
        GroupShuffleSplit(),
    )
    for splitter in splitters:
        # Same check for both entry points, score first and predict second.
        for cv_helper in (cross_val_score, cross_val_predict):
            assert_raise_message(ValueError, expected_msg, cv_helper,
                                 estimator=estimator, X=X, y=y, cv=splitter)

コード例 #5

0

ファイルを表示

ファイル: test_successive_halving.py プロジェクト: asaadeldin11/scikit-learn

def test_groups_support(Est):
    """The search estimator ``Est`` must forward ``groups`` to its splitter.

    With a group-based splitter, fitting without ``groups`` has to raise a
    ValueError matching the expected message and fitting with ``groups`` has
    to succeed. Splitters that do not use groups must fit without them.
    """
    random_state = np.random.RandomState(0)

    X, y = make_classification(n_samples=50, n_classes=2, random_state=0)
    groups = random_state.randint(0, 3, 50)

    estimator = LinearSVC(random_state=0)
    param_grid = {'C': [1]}
    error_msg = "The 'groups' parameter should not be None."

    group_splitters = (
        LeaveOneGroupOut(),
        LeavePGroupsOut(2),
        GroupKFold(n_splits=3),
        GroupShuffleSplit(random_state=0),
    )
    for splitter in group_splitters:
        search = Est(estimator, param_grid, cv=splitter)
        with pytest.raises(ValueError, match=error_msg):
            search.fit(X, y)
        search.fit(X, y, groups=groups)

    plain_splitters = (StratifiedKFold(),
                       StratifiedShuffleSplit(random_state=0))
    for splitter in plain_splitters:
        search = Est(estimator, param_grid, cv=splitter)
        # No groups are required here, so this must not raise.
        search.fit(X, y)

コード例 #6

0

ファイルを表示

def test_grid_search_groups():
    """``dcv.GridSearchCV`` must forward ``groups`` to group-aware splitters.

    With a group-based splitter, fitting without ``groups`` has to raise a
    ValueError mentioning that the parameter should not be None, and fitting
    with ``groups`` has to succeed. Other splitters must fit without groups.
    """
    random_state = np.random.RandomState(0)

    X, y = make_classification(n_samples=15, n_classes=2, random_state=0)
    groups = random_state.randint(0, 3, 15)

    estimator = LinearSVC(random_state=0)
    param_grid = {"C": [1]}

    group_splitters = (
        LeaveOneGroupOut(),
        LeavePGroupsOut(2),
        GroupKFold(n_splits=3),
        GroupShuffleSplit(n_splits=3),
    )
    for splitter in group_splitters:
        search = dcv.GridSearchCV(estimator, param_grid, cv=splitter)

        with pytest.raises(ValueError) as exc:
            assert search.fit(X, y)
        error_text = str(exc.value)
        assert "parameter should not be None" in error_text

        search.fit(X, y, groups=groups)

    plain_splitters = (StratifiedKFold(n_splits=3),
                       StratifiedShuffleSplit(n_splits=3))
    for splitter in plain_splitters:
        search = dcv.GridSearchCV(estimator, param_grid, cv=splitter)
        # No groups are required here, so this must not raise.
        search.fit(X, y)

コード例 #7

0

ファイルを表示

ファイル: model_direct.py プロジェクト: wangjiweisean/CVR_Pre

def cross_validation(X, y, pre_x, groups, model='LGB', test_days=1):
    """Blend per-fold predictions for ``pre_x``, weighted by inverse loss.

    ``groups`` is first coarsened with ``floor((groups + 1) / 2)``. The data
    is then split with ``LeavePGroupsOut(n_groups=test_days)``; each fold is
    fitted with ``LGB`` (when ``model == 'LGB'``) or ``LR`` otherwise.
    Each fold's prediction is weighted by ``1 / loss`` and the weights are
    normalised to sum to one.

    Args:
        X, y: indexable with the integer index arrays returned by the
            splitter (presumably numpy arrays -- TODO confirm).
        pre_x: data to predict on; ``pre_x.shape[0]`` sizes the result.
        groups: group labels, one per sample.
        model: ``'LGB'`` selects ``LGB``; any other value selects ``LR``.
        test_days: number of (coarsened) groups left out per fold.

    Returns:
        The weighted sum of the per-fold predictions.

    Note:
        A fold loss of exactly 0 raises ZeroDivisionError, as before.
    """
    groups = np.floor((groups + 1) / 2)

    logo = LeavePGroupsOut(n_groups=test_days)
    # Single-argument print(...) calls give the same output on Python 2 and
    # Python 3; the former print statements were Python 2 only.
    print(np.isnan(groups).astype(int).sum())
    print(np.unique(groups))

    fold_predictions = []
    fold_losses = []
    for fold, (train, test) in enumerate(
            logo.split(X, y, groups=groups), 1):
        print('times: %s' % (fold,))
        X_train, X_test = X[train], X[test]
        y_train, y_test = y[train], y[test]
        print('%s %s %s %s' % (X_train.shape, X_test.shape,
                               y_train.shape, y_test.shape))
        # presumably both helpers return (prediction for pre_x, fold loss)
        # -- verify against their definitions
        if model == 'LGB':
            pre, ll = LGB(X_train, X_test, y_train, y_test, pre_x)
        else:
            pre, ll = LR(X_train, X_test, y_train, y_test, pre_x)
        fold_losses.append(ll)
        fold_predictions.append(pre)

    # Lower loss means larger weight.
    weight = [1.0 / loss for loss in fold_losses]
    weight_sum = sum(weight)

    pre_sum = np.zeros(pre_x.shape[0])
    for pre, w in zip(fold_predictions, weight):
        pre_sum += pre * w / weight_sum

    print('weight %s' % (weight,))
    print('loss %s' % (fold_losses,))

    return pre_sum

コード例 #8

0

ファイルを表示

def fold_maker(X, fold_choice='default', n_fold=4, n_groups=2):
    """Build the list of CV folds for the requested strategy.

    Supported ``fold_choice`` values:
      * ``'default'``      -- unshuffled ``KFold(n_fold)``, then
        ``shuffle_group``.
      * ``'earthquake'``   -- leave-one-group-out on the loaded earthquake
        ids (no shuffling or filtering is applied).
      * ``'eqCombo'``      -- leave-one-group-out on ``eqComboMaker(n_fold)``
        ids, then ``shuffle_group``.
      * ``'k-earthquake'`` -- ``LeavePGroupsOut(n_groups)`` on the loaded
        earthquake ids, then ``min_valid_filter``.
      * ``'customize'``    -- folds produced by ``CVPipe``.

    Returns:
        A ``(folds, fold_choice)`` tuple where ``folds`` is a list.

    Raises:
        AttributeError: for any other ``fold_choice``.
    """
    if fold_choice == 'default':
        splitter = KFold(n_splits=n_fold, shuffle=False)
        folds = shuffle_group(splitter.split(X))
    elif fold_choice == 'earthquake':
        quake_ids = data_loader.load_earthquake_id()
        splitter = LeaveOneGroupOut()
        folds = splitter.split(X, groups=quake_ids)
    elif fold_choice == 'eqCombo':
        quake_ids = eqComboMaker(n_fold)
        splitter = LeaveOneGroupOut()
        folds = shuffle_group(splitter.split(X, groups=quake_ids))
    elif fold_choice == 'k-earthquake':
        quake_ids = data_loader.load_earthquake_id()
        splitter = LeavePGroupsOut(n_groups=n_groups)
        folds = min_valid_filter(splitter.split(X, groups=quake_ids))
    elif fold_choice == 'customize':
        pipe = CVPipe()
        folds = pipe.fold_iter(num_fold=n_fold, mini_quake_prob=0.3)
    else:
        raise AttributeError(f"Not support CV {fold_choice} yet...")

    return (list(folds), fold_choice)

コード例 #9

0

ファイルを表示

ファイル: test_split.py プロジェクト: enzlaur/ALGOCOMP2

def test_cross_validator_with_default_params():
    """Exercise each cross-validator built with (mostly) default parameters.

    For every splitter this checks that ``get_n_splits`` returns the
    expected count, that splitting 2-D and 1-D data gives the same folds,
    that both train and test indices are integer arrays, and that ``repr``
    matches the expected string.
    """
    n_samples = 4
    n_unique_groups = 4
    n_splits = 2
    p = 2
    n_shuffle_splits = 10  # (the default value)

    X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
    X_1d = np.array([1, 2, 3, 4])
    y = np.array([1, 1, 2, 2])
    groups = np.array([1, 2, 3, 4])
    loo = LeaveOneOut()
    lpo = LeavePOut(p)
    kf = KFold(n_splits)
    skf = StratifiedKFold(n_splits)
    lolo = LeaveOneGroupOut()
    lopo = LeavePGroupsOut(p)
    ss = ShuffleSplit(random_state=0)
    ps = PredefinedSplit([1, 1, 2, 2])  # n_splits = no. of unique folds = 2

    loo_repr = "LeaveOneOut()"
    lpo_repr = "LeavePOut(p=2)"
    kf_repr = "KFold(n_splits=2, random_state=None, shuffle=False)"
    skf_repr = "StratifiedKFold(n_splits=2, random_state=None, shuffle=False)"
    lolo_repr = "LeaveOneGroupOut()"
    lopo_repr = "LeavePGroupsOut(n_groups=2)"
    ss_repr = ("ShuffleSplit(n_splits=10, random_state=0, test_size=0.1, "
               "train_size=None)")
    ps_repr = "PredefinedSplit(test_fold=array([1, 1, 2, 2]))"

    n_splits_expected = [
        n_samples,
        comb(n_samples, p), n_splits, n_splits, n_unique_groups,
        comb(n_unique_groups, p), n_shuffle_splits, 2
    ]

    cvs = [loo, lpo, kf, skf, lolo, lopo, ss, ps]
    cv_reprs = [loo_repr, lpo_repr, kf_repr, skf_repr, lolo_repr, lopo_repr,
                ss_repr, ps_repr]

    for expected_n_splits, cv, cv_repr in zip(n_splits_expected, cvs,
                                              cv_reprs):
        # Test if get_n_splits works correctly
        assert_equal(expected_n_splits, cv.get_n_splits(X, y, groups))

        # Test if the cross-validator works as expected even if
        # the data is 1d
        np.testing.assert_equal(list(cv.split(X, y, groups)),
                                list(cv.split(X_1d, y, groups)))
        # Test that train, test indices returned are integers.
        # The second assertion used to re-check ``train``, so ``test`` was
        # never verified.
        for train, test in cv.split(X, y, groups):
            assert_equal(np.asarray(train).dtype.kind, 'i')
            assert_equal(np.asarray(test).dtype.kind, 'i')

        # Test if the repr works without any errors
        assert_equal(cv_repr, repr(cv))

コード例 #10

0

ファイルを表示

def test_leave_group_out_changing_groups():
    """Compare paired LeaveOneGroupOut / LeavePGroupsOut split generators.

    Two generators are created per splitter before ``groups_changing`` is
    overwritten with zeros; the folds from each pair must be identical.
    The number of splits for both splitters is checked as well.
    """
    groups = np.array([0, 1, 2, 1, 1, 2, 0, 0])
    X = np.ones(len(groups))
    groups_changing = np.array(groups, copy=True)

    # NOTE(review): the "*_changing" generators are built from ``groups``,
    # not from ``groups_changing`` -- confirm this is intended before
    # relying on this test to cover mutation of the groups array.
    logo_ref = LeaveOneGroupOut().split(X, groups=groups)
    logo_changing = LeaveOneGroupOut().split(X, groups=groups)
    lpgo_ref = LeavePGroupsOut(n_groups=2).split(X, groups=groups)
    lpgo_changing = LeavePGroupsOut(n_groups=2).split(X, groups=groups)
    groups_changing[:] = 0

    generator_pairs = ((logo_ref, logo_changing), (lpgo_ref, lpgo_changing))
    for reference, changing in generator_pairs:
        for (train, test), (train_chan, test_chan) in zip(reference,
                                                          changing):
            assert_array_equal(train, train_chan)
            assert_array_equal(test, test_chan)

    # ``y`` is not defined in this function; presumably a module-level
    # fixture -- verify.
    # Choosing 2 of the 3 unique groups gives C(3, 2) = 3 splits.
    assert_equal(3, LeavePGroupsOut(n_groups=2).get_n_splits(X, y, groups))
    # One split per unique group: C(3, 1) = 3.
    assert_equal(3, LeaveOneGroupOut().get_n_splits(X, y, groups))

コード例 #11

0

ファイルを表示

ファイル: test_split.py プロジェクト: enzlaur/ALGOCOMP2

def test_leave_one_p_group_out_error_on_fewer_number_of_groups():
    """Group splitters must reject inputs with too few unique groups.

    Covers empty input and a single group for LeaveOneGroupOut, and one or
    three unique groups for ``LeavePGroupsOut(n_groups=3)``. Errors are
    triggered by pulling the first split with ``next``.
    """
    empty = np.ones(0)
    assert_raise_message(ValueError, "Found array with 0 sample(s)", next,
                         LeaveOneGroupOut().split(empty, empty, empty))

    single = np.ones(1)
    logo_msg = ("The groups parameter contains fewer than 2 unique groups "
                "([ 1.]). LeaveOneGroupOut expects at least 2.")
    assert_raise_message(ValueError, logo_msg, next,
                         LeaveOneGroupOut().split(single, single, single))

    # Same message for both LeavePGroupsOut cases; only the printed array of
    # unique groups differs.
    lpgo_template = ("The groups parameter contains fewer than (or equal to) "
                     "n_groups (3) numbers of unique groups (%s). "
                     "LeavePGroupsOut expects that at least n_groups + 1 (4) "
                     "unique groups be present")
    cases = ((np.ones(1), "[ 1.]"), (np.arange(3), "[0 1 2]"))
    for data, unique_repr in cases:
        assert_raise_message(
            ValueError, lpgo_template % unique_repr, next,
            LeavePGroupsOut(n_groups=3).split(data, data, data))

コード例 #12

0

ファイルを表示

ファイル: Classification_bak.py プロジェクト: ssz66666/ADS_SH2_Project

def tmpFUN(dataset, group_label="groups", n_groups=2, y_label="groups",
           rf_n_estimators=2000, n_jobs=-1):
    """Fit a balanced random forest on every leave-P-groups-out split.

    For each split produced by ``LeavePGroupsOut(n_groups)``, the rows of
    ``dataset`` are divided into a training and a validation part, a
    ``RandomForestClassifier`` is fitted on the training part (with the
    ``y_label`` column as target and removed from the features) and used
    to predict the validation part.

    Args:
        dataset: table supporting ``.loc``/``.iloc``/``.drop`` (presumably a
            pandas DataFrame -- TODO confirm).
        group_label: column holding the group of each row.
        n_groups: number of groups left out per split.
        y_label: column holding the target.
        rf_n_estimators: number of trees in the forest.
        n_jobs: parallelism forwarded to the forest.

    Returns:
        None. NOTE(review): the per-split predictions are computed but not
        kept or returned -- confirm whether callers need them.
    """
    lpgo = LeavePGroupsOut(n_groups=n_groups)
    splits = lpgo.split(X=dataset,
                        y=dataset.loc[:, y_label],
                        groups=dataset.loc[:, group_label])

    for train_index, validate_index in splits:
        trainset = dataset.iloc[train_index, :]
        validateset = dataset.iloc[validate_index, :]
        X_train = trainset.drop(y_label, axis=1)
        y_train = trainset.loc[:, y_label]
        # ``X_test`` was never defined, which made ``predict`` raise a
        # NameError; build it from the validation rows the same way as
        # ``X_train``.
        X_test = validateset.drop(y_label, axis=1)

        RF_mod = RandomForestClassifier(n_estimators=rf_n_estimators,
                                        n_jobs=n_jobs,
                                        class_weight="balanced")
        RF_mod.fit(X_train, y_train)
        RF_pred = RF_mod.predict(X_test)

コード例 #13

0

ファイルを表示

ファイル: splitter.py プロジェクト: mcd01/arvalus-experiments

    def split(self):
        """Assemble the splitter factories and delegate to ``__split__``.

        When both ``use_test`` and ``use_validation`` are set, a 3/1/1 split
        is configured (40% held out and then halved for normal data; two
        groups held out and then one for anomaly data). Otherwise a 4/1
        split is configured and the sub-splitter factories return ``None``.
        """
        complete: bool = self.use_test and self.use_validation
        ratio_label = '3/1/1' if complete else '4/1'
        logging.info(f"Conducting a {ratio_label} Split.")

        normal_test_size = 0.4 if complete else 0.2
        anomaly_groups_out = 2 if complete else 1

        # Factories are kept lazy so that the splitters are only built when
        # ``__split__`` asks for them.
        split_args: dict = {
            "make_normal_splitter":
            lambda: StratifiedShuffleSplit(n_splits=self.n_splits,
                                           test_size=normal_test_size),
            "make_normal_sub_splitter":
            lambda: (StratifiedShuffleSplit(n_splits=1, test_size=0.5)
                     if complete else None),
            "make_anomaly_splitter":
            lambda: LeavePGroupsOut(n_groups=anomaly_groups_out),
            "make_anomaly_sub_splitter":
            lambda: (LeavePGroupsOut(n_groups=1) if complete else None),
            "use_test": self.use_test,
            "use_validation": self.use_validation,
        }

        return self.__split__(split_args)

コード例 #14

0

ファイルを表示

ファイル: utils.py プロジェクト: sidiatig/ot_ISPA

def create_cv(x, y, subjects, P):
    """
    Materialise all leave-P-groups-out splits as a list.

    :param x: samples passed to the splitter
    :param y: targets passed to the splitter
    :param subjects: group label of each sample
    :param P: number of groups left out in each split
    :return: list of ``(train_index, test_index)`` tuples
    """
    splitter = LeavePGroupsOut(n_groups=P)
    return [(train_index, test_index)
            for train_index, test_index in splitter.split(x, y, subjects)]

コード例 #15

0

ファイルを表示

def _cv_build(cv_scheme):
    """Translate a CV scheme mapping into a cross-validator.

    ``None`` is passed through. ``{'type': 'kfold'}`` yields a shuffled
    ``StratifiedKFold`` (``n_splits`` defaults to 6); ``{'type': 'loso'}``
    yields ``LeavePGroupsOut(n_groups=1)``.

    Raises:
        RuntimeError: for any other scheme.
    """
    LOG.debug('Building CV scheme: %s', str(cv_scheme))
    if cv_scheme is None:
        return None

    scheme_type = cv_scheme.get('type', '')

    if scheme_type == 'kfold':
        return StratifiedKFold(n_splits=cv_scheme.get('n_splits', 6),
                               shuffle=True)

    if scheme_type == 'loso':
        return LeavePGroupsOut(n_groups=1)

    raise RuntimeError('Unknown CV scheme (%s)' % str(cv_scheme))

コード例 #16

0

ファイルを表示

ファイル: test_split.py プロジェクト: enzlaur/ALGOCOMP2

def test_leave_one_p_group_out():
    """Check reprs and split invariants of the leave-group-out splitters.

    For LeaveOneGroupOut and LeavePGroupsOut (p = 1 and p = 2), and for
    every entry of ``test_groups``, this verifies the number of splits,
    that train and test never share a group, that train and test together
    cover all samples, and that exactly ``p`` groups are in each test set.
    """
    logo = LeaveOneGroupOut()
    lpgo_1 = LeavePGroupsOut(n_groups=1)
    lpgo_2 = LeavePGroupsOut(n_groups=2)

    # Make sure the repr works
    assert_equal(repr(logo), 'LeaveOneGroupOut()')
    assert_equal(repr(lpgo_1), 'LeavePGroupsOut(n_groups=1)')
    assert_equal(repr(lpgo_2), 'LeavePGroupsOut(n_groups=2)')
    assert_equal(repr(LeavePGroupsOut(n_groups=3)),
                 'LeavePGroupsOut(n_groups=3)')

    for cv, p_groups_out in ((logo, 1), (lpgo_1, 1), (lpgo_2, 2)):
        for groups_i in test_groups:
            n_groups = len(np.unique(groups_i))
            # C(n, 1) = n and C(n, 2) = n * (n - 1) / 2
            n_splits = (n_groups if p_groups_out == 1 else n_groups *
                        (n_groups - 1) / 2)
            X = y = np.ones(len(groups_i))

            # Test that the length is correct
            assert_equal(cv.get_n_splits(X, y, groups=groups_i), n_splits)

            groups_arr = np.asarray(groups_i)

            # Split using the original list / array / list of string groups_i
            for train, test in cv.split(X, y, groups=groups_i):
                # First test: no train group is in the test set and vice versa
                assert_array_equal(
                    np.intersect1d(groups_arr[train],
                                   groups_arr[test]).tolist(), [])

                # Second test: train and test add up to all the data
                assert_equal(len(train) + len(test), len(groups_i))

                # Third test:
                # The number of groups in test must be equal to p_groups_out.
                # This used assert_true, which treated p_groups_out as the
                # failure message and therefore never compared the values.
                assert_equal(np.unique(groups_arr[test]).shape[0],
                             p_groups_out)

コード例 #17

0

ファイルを表示

    def make_leave_out(X, y=None, p=5, strategy=None, group=None):
        """Return a generator of leave-out train/test index splits.

        With ``strategy == 'group'`` whole groups are left out (one group
        when ``p == 1``, otherwise ``p`` groups) and ``group`` is required.
        With any other ``strategy`` single samples are left out (one when
        ``p == 1``, otherwise ``p``).

        Raises:
            Exception: if the group strategy is requested without ``group``.
        """
        leave_single = p == 1

        if strategy != 'group':
            # Sample-wise leave-out; ``group`` is simply forwarded.
            spliter = LeaveOneOut() if leave_single else LeavePOut(p)
            return spliter.split(X, y=y, groups=group)

        spliter = LeaveOneGroupOut() if leave_single else LeavePGroupsOut(p)
        if group is None:
            raise Exception('Please provide group parameter.')
        return spliter.split(X, y=y, groups=group)

コード例 #18

0

ファイルを表示

 def grid_search_hyperparams(self,
                             model,
                             data,
                             feature_names,
                             hyperparam_grid,
                             n_leave_out=1):
     """Grid-search ``model`` with leave-P-groups-out CV; return best params.

     Splits are generated from the ``'group_id'`` column of ``data``,
     leaving ``n_leave_out`` groups out at a time. The search is scored
     with ``self.metric`` and run with ``n_jobs=-1``.

     Returns:
         The ``best_params_`` of the fitted ``GridSearchCV``.
     """
     features = data[feature_names]
     target = np.ravel(data[self.target])

     splitter = LeavePGroupsOut(n_groups=n_leave_out)
     cv_splits = splitter.split(features, target, groups=data['group_id'])

     search = GridSearchCV(model,
                           hyperparam_grid,
                           cv=cv_splits,
                           scoring=self.metric,
                           n_jobs=-1)
     search.fit(features, target)
     return search.best_params_

コード例 #19

0

ファイルを表示

ファイル: cross_validation.py プロジェクト: BCEM-UniAndes/Eukarya_classifier

def leave_P_out_iter(data,
                     labels,
                     s_labels,
                     train_grp_animals,
                     num_groups=2,
                     clf=None,
                     **kwargs):
    """Run leave-P-groups-out over groups derived from ``s_labels``.

    Groups are built with ``lb.grouping_crossval(s_labels,
    ani_gps=train_grp_animals)`` and split with
    ``LeavePGroupsOut(n_groups=num_groups)``.

    Args:
        data, labels: samples and targets forwarded to the helpers.
        s_labels: labels from which the groups are derived.
        train_grp_animals: forwarded as ``ani_gps`` to the grouping helper.
        num_groups: number of groups left out in each split. The original
            note says this can be 2 or 3 only; nothing here enforces that.
        clf: classifier selector; ``None`` requests the raw iterations.
        **kwargs: forwarded to ``get_train_cv_results``.

    Returns:
        The result of ``get_iterations`` when ``clf`` is None, otherwise the
        result of ``get_train_cv_results``.
    """
    lpgo = LeavePGroupsOut(n_groups=num_groups)
    groups_list = lb.grouping_crossval(s_labels, ani_gps=train_grp_animals)
    # Identity check: ``== None`` may be overridden by the object's __eq__.
    if clf is None:
        return get_iterations(data, lpgo, labels, groups_list)

    clf_function = get_train_function(clf)
    return get_train_cv_results(data, labels, lpgo, clf_function,
                                groups_list, **kwargs)

コード例 #20

0

ファイルを表示

ファイル: _lpso.py プロジェクト: dmalt/brainpipe

 def __init__(self, y, nsuj, pout=1, clf='lda', **clfArg):
     """Set up the leave-``pout``-subjects-out cross-validation and classifier.

     ``y`` is stored as given and also flattened (concatenate + ravel) into
     ``_ry``. When ``clf`` is an int or a str it is resolved through
     ``defClf`` using the flattened labels and ``clfArg``; any other value
     is stored unchanged.
     """
     self._y = y
     self._ry = np.ravel(np.concatenate(y))
     self._nsuj = nsuj
     self._pout = pout

     # Cross-validation object, decorated with descriptive attributes.
     cross_val = LeavePGroupsOut(pout)
     description = 'Leave %s subjects out' % (pout,)
     cross_val.shStr = description
     cross_val.lgStr = description
     cross_val.rep = 1
     cross_val.y = y[0]
     self._cv = cross_val

     # Classifier: build one from an int/str selector, else keep as given.
     if isinstance(clf, (int, str)):
         clf = defClf(self._ry, clf=clf, **clfArg)
     self._clf = clf

     # Info string:
     self._updatestring()
     # Statistics helper:
     self.stat = clfstat()
コード例 #21

0

ファイルを表示

ファイル: evaluation.py プロジェクト: Rosemary94/tcrregex

    def _cv_split_hold_out_by_subject_using_sklearn(self, tcrrep=None):
        """
        Return train/test index partitions that hold out one subject at a
        time, using sklearn's LeavePGroupsOut with ``n_groups=1``.

        Parameters
        ----------
        tcrrep : TCRrep class instance, optional
            Must provide ``clone_df.subject``, ``clone_df.epitope`` and
            ``paired_tcrregex``. Defaults to ``self.tcrrep``.

        Returns
        -------
        partitions : generator
            The generator returned by ``LeavePGroupsOut.split``.

        """
        if tcrrep is None:
            tcrrep = self.tcrrep
        clone_df = tcrrep.clone_df

        # Target vector `y`: each unique epitope mapped to an integer code.
        epitope_encoder = preprocessing.LabelEncoder()
        epitope_encoder.fit(list(clone_df.epitope.unique()))
        y = epitope_encoder.transform(clone_df.epitope)

        # `X` is the distance matrix (metric = 'precomputed').
        X = tcrrep.paired_tcrregex

        # Groups: each unique subject mapped to an integer code.
        subject_encoder = preprocessing.LabelEncoder().fit(
            list(clone_df.subject.unique()))
        groups = list(subject_encoder.transform(clone_df.subject))

        # Leave one group (subject) out per partition.
        splitter = LeavePGroupsOut(n_groups=1)
        splitter.get_n_splits(X, y, groups)
        return splitter.split(X, y, groups)

コード例 #22

0

ファイルを表示

ファイル: ModelCollection.py プロジェクト: Kotzly/TCC_EMG

    def create_splits(self, splits=None):
        """Configure ``self.splitter`` / ``self.splits`` from ``self.mode``.

        * ``'bootstrap'``  -- shuffle-split the unique values of ``self.si``
          and store, per split, the row positions whose id falls in the
          train or test part; ``self.splitter`` is set to ``None``.
        * ``'groupkfold'`` -- ``GroupKFold(n_splits)``.
        * ``'loso'``       -- ``LeaveOneGroupOut()``.
        * ``'lpso'``       -- ``LeavePGroupsOut`` leaving out
          ``n_unique_ids // n_splits`` groups.

        The ``splits`` argument is not read by this method, and any other
        mode leaves the instance untouched.
        """
        mode = self.mode

        if mode == 'bootstrap':
            unique_ids = np.unique(self.si)
            shuffler = ShuffleSplit(n_splits=self.n_splits,
                                    test_size=1-self.train_size)
            built_splits = []
            for train_pos, test_pos in shuffler.split(unique_ids):
                train_ids = unique_ids[train_pos]
                test_ids = unique_ids[test_pos]
                # Map the chosen ids back to the row positions in self.si.
                train_rows = np.nonzero([sid in train_ids for sid in self.si])
                test_rows = np.nonzero([sid in test_ids for sid in self.si])
                built_splits.append((train_rows, test_rows))
            self.splits = built_splits
            self.splitter = None
        elif mode == 'groupkfold':
            self.splitter = GroupKFold(n_splits=self.n_splits)
        elif mode == 'loso':
            self.splitter = LeaveOneGroupOut()
        elif mode == 'lpso':
            groups_out = len(np.unique(self.si))//self.n_splits
            self.splitter = LeavePGroupsOut(n_groups=groups_out)

コード例 #23

0

ファイルを表示

ファイル: data_management.py プロジェクト: ruivarandas/DSLproject

    def split_groups(filenames, labels, groups, size):
        """Pick one leave-``size``-groups-out split at random.

        The splits are walked in order; each one is accepted with a 5%
        chance (``random() > 0.95``). If none is accepted, the last split
        is used.

        Returns:
            ``(train_filenames, test_filenames, train_groups, train_labels)``
            as numpy arrays. NOTE(review): groups come before labels and no
            test labels/groups are returned -- confirm callers expect this.
        """
        lpgo = LeavePGroupsOut(n_groups=size)
        # After the loop ``train``/``test`` hold either the accepted split
        # or, when nothing was accepted, the final split. This replaces the
        # duplicated selection code of the two former branches.
        for train, test in lpgo.split(filenames, labels, groups=groups):
            if random() > 0.95:
                break

        filenames_arr = np.array(filenames)
        train_filenames = filenames_arr[train]
        test_filenames = filenames_arr[test]
        train_labels = np.array(labels)[train]
        train_groups = np.array(groups)[train]

        return train_filenames, test_filenames, train_groups, train_labels

コード例 #24

0

ファイルを表示

ファイル: CrossValidation.py プロジェクト: Palenoff/MachineLearningScoresSystem

def GetCVObject(type, **kwargs):
    """Instantiate the cross-validator named by ``type``.

    ``kwargs`` are forwarded to the constructor, except for ``LeaveOneOut``
    and ``LeaveOneGroupOut``, which are built without arguments. An
    unrecognised ``type`` yields ``None``.
    """
    if type == 'KFold':
        return KFold(**kwargs)
    if type == 'StratifiedKFold':
        return StratifiedKFold(**kwargs)
    if type == 'GroupKFold':
        return GroupKFold(**kwargs)
    if type == 'ShuffleSplit':
        return ShuffleSplit(**kwargs)
    if type == 'StratifiedShuffleSplit':
        return StratifiedShuffleSplit(**kwargs)
    if type == 'GroupShuffleSplit':
        return GroupShuffleSplit(**kwargs)
    if type == 'LeaveOneOut':
        return LeaveOneOut()
    if type == 'LeavePOut':
        return LeavePOut(**kwargs)
    if type == 'LeaveOneGroupOut':
        return LeaveOneGroupOut()
    if type == 'LeavePGroupsOut':
        return LeavePGroupsOut(**kwargs)
    # Unknown names fall through without an error.
    return None

コード例 #25

0

ファイルを表示

ファイル: models.py プロジェクト: GarrettCGraham/dynomics_public

def construct_exp_splits(feature_frame, leave_n_out=1):
    """ Build (train, test) index splits that hold out whole experiments.

        The first index level of feature_frame is used as the group of each
        row; the returned indices are positional (integer based), not label
        based.
        Input:
            feature_frame : DataFrame
                A pandas dataframe whose first index level identifies the
                experiment of each row
            leave_n_out : int
                The number of experiments to leave out in each cross
                validation fold
        Returns: [(Array, Array)]
            A list of (train index, test index) splits
    """
    experiment_ids = feature_frame.index.get_level_values(0)
    splitter = LeavePGroupsOut(n_groups=leave_n_out)
    feature_matrix = feature_frame.values

    cv_splits = []
    for train_index, test_index in splitter.split(feature_matrix,
                                                  groups=experiment_ids):
        cv_splits.append((train_index, test_index))
    return cv_splits

コード例 #26

0

ファイルを表示

    def test_grid_search_groups(self):
        """``TuneGridSearchCV`` must forward ``groups`` to group splitters.

        With a group-based splitter, fitting without ``groups`` must either
        log a "should not be None" message under ``ray.tune`` or raise a
        ValueError carrying it; fitting with ``groups`` must succeed.
        Splitters that do not use groups must fit without them.
        """
        random_state = np.random.RandomState(0)

        X, y = make_classification(n_samples=15, n_classes=2, random_state=0)
        groups = random_state.randint(0, 3, 15)

        estimator = LinearSVC(random_state=0)
        param_grid = {"C": [1]}

        group_splitters = (
            LeaveOneGroupOut(),
            LeavePGroupsOut(2),
            GroupKFold(n_splits=3),
            GroupShuffleSplit(n_splits=3),
        )
        for splitter in group_splitters:
            search = TuneGridSearchCV(estimator, param_grid, cv=splitter)
            try:
                with self.assertLogs("ray.tune") as captured:
                    search.fit(X, y)
                logged = str(captured.output)
                self.assertTrue(("parameter should not be None.") in logged)
            except ValueError as exc:
                # The failure may surface as an exception instead of a log.
                self.assertTrue("parameter should not be None" in str(exc))

            search.fit(X, y, groups=groups)

        plain_splitters = (StratifiedKFold(n_splits=3),
                           StratifiedShuffleSplit(n_splits=3))
        for splitter in plain_splitters:
            search = TuneGridSearchCV(estimator, param_grid, cv=splitter)
            # No groups are required here, so this must not raise.
            search.fit(X, y)

コード例 #27

0

ファイルを表示

    def fit(self):
        """Select and fit a random forest with leave-one-group-out CV.

        Does nothing (besides logging) when the classifier was loaded from
        a file. Otherwise a ``RobustGridSearchCV`` over ``self.param['rfc']``
        is fitted on the sample from ``_generate_sample`` with ``roc_auc``
        scoring, and the fitted search is stored in ``self._estimator``.
        """
        from sklearn.ensemble import RandomForestClassifier as RFC
        if self._pickled:
            LOG.info('Classifier was loaded from file, cancelling fitting.')
            return

        LOG.info('Start fitting ...')
        forest = RFC()
        param_grid = self.param['rfc']
        scorer = check_scoring(forest, scoring='roc_auc')
        search = RobustGridSearchCV(forest,
                                    param_grid,
                                    error_score=0.5,
                                    refit=True,
                                    scoring=scorer,
                                    n_jobs=self.n_jobs,
                                    cv=LeavePGroupsOut(n_groups=1),
                                    verbose=0)

        X, y, groups = self._generate_sample()
        self._estimator = search.fit(X, y, groups=groups)

        LOG.info('Model selection - best parameters (roc_auc=%f) %s',
                 search.best_score_, search.best_params_)

コード例 #28

0

ファイルを表示

def get_model():
    """Build the scale -> PCA -> Lasso pipeline and its grid search.

    The grid search tunes the Lasso ``alpha`` over ``10**-5 .. 10**5`` with
    leave-2-groups-out cross-validation, scored by ``nmse_loss`` (lower is
    better), and refits the best model.

    Returns:
        ``(model, selector)``: the unfitted pipeline and the ``GridSearchCV``
        wrapping it.
    """
    sc_params = dict()
    # ``normalize`` is no longer passed: it was removed from Lasso in
    # scikit-learn 1.2, and False was the default, so behaviour is the same.
    en_params = {"fit_intercept": True,
                 "precompute": False,
                 "random_state": 5}
    param_grid = {"estimator__alpha": np.power(10., np.arange(-5, 6))}
    pca_params = {"n_components": 0.95, "svd_solver": "full"}
    split_params = {"n_groups": 2}

    model = Pipeline([('scaler', StandardScaler(**sc_params)),
                      ('reducer', PCA(**pca_params)),
                      ('estimator', Lasso(**en_params))])

    score = make_scorer(nmse_loss, greater_is_better=False)

    selector = GridSearchCV(estimator=model,
                            param_grid=param_grid,
                            scoring=score,
                            cv=LeavePGroupsOut(**split_params),
                            n_jobs=4,
                            refit=True)

    return model, selector

コード例 #29

0

ファイルを表示

ファイル: sk-交叉验证.py プロジェクト: yuan413/sklearn

# ============ Group k-fold, leave-one-group-out, leave-P-groups-out, Group Shuffle Split ============
X = [0.1, 0.2, 2.2, 2.4, 2.3, 4.55, 5.8, 8.8, 9, 10]
y = ["a", "b", "b", "b", "c", "c", "c", "d", "d", "d"]
groups = [1, 1, 1, 2, 2, 2, 3, 3, 3, 3]

# Group k-fold: train and test sets never share a group.
gkf = GroupKFold(n_splits=3)
for train, test in gkf.split(X, y, groups=groups):
    print("组 k-fold分割：%s %s" % (train, test))

# Leave one group out.
logo = LeaveOneGroupOut()
for train, test in logo.split(X, y, groups=groups):
    print("留一组分割：%s %s" % (train, test))

# Leave P groups out (here P = 2).
lpgo = LeavePGroupsOut(n_groups=2)
for train, test in lpgo.split(X, y, groups=groups):
    print("留 P 组分割：%s %s" % (train, test))

# Randomised group split.
gss = GroupShuffleSplit(n_splits=4, test_size=0.5, random_state=0)
for train, test in gss.split(X, y, groups=groups):
    print("随机分割：%s %s" % (train, test))


# ============ Time series split ============
# A stray ``TimeSeriesSplit(max_train_size=None, n_splits=3)`` expression
# that built and discarded a second splitter was removed here.
# ``iris`` is not defined in this section; presumably loaded earlier in the
# file -- verify.
tscv = TimeSeriesSplit(n_splits=3)
for train, test in tscv.split(iris.data):
    print("时间序列分割：%s %s" % (train, test))

コード例 #30

0

ファイルを表示

ファイル: learning_pipelines.py プロジェクト: yinxiaojing1/pre_epi_seizures

def supervised_pipeline(label_struct, baseline_label_struct,
                        pipe, scaler, param_grid,
                        patient_list,
                        feature_slot,
                        hyper_param,
                        plot_eda_all_new,
                        learn_flag,
                        compute_all_new,
                        n_jobs
                       ):
    """Run the nested cross-validation learning pipeline and save its report.

    Steps, in order:

    1. Obtain the exploratory-analysis directory and, nested under it, the
       classification directory from
       ``prepare_disk_space_hyper_param_results``.
    2. If ``plot_eda_all_new`` is truthy, call ``plot_eda``.
    3. Load the data through ``interim_process``.
    4. If ``learn_flag`` is truthy, run ``cv.nested_cross_validation`` with
       ``LeavePGroupsOut(n_groups=1)`` for both the outer and the inner
       loop, tag every result with a group, build a report with
       ``cv.generate_classification_report`` and write it to
       ``classification_dir + 'classification_report.h5'`` under the key
       ``'/report'``.

    Parameters
    ----------
    label_struct, baseline_label_struct
        Forwarded unchanged to the directory helper and ``interim_process``.
    pipe
        Object exposing ``.steps`` (a sequence whose items have the step name
        at index 0); presumably an sklearn ``Pipeline`` -- confirm.
    scaler
        Used as the only entry of the ``interim_processing`` list.
    param_grid
        Hyper-parameter grid handed to the search function.
    patient_list, feature_slot, hyper_param
        Forwarded unchanged to the directory helper and ``interim_process``.
    plot_eda_all_new
        When truthy, ``plot_eda`` is called.
    learn_flag
        When truthy, the learning/report stage runs.
    compute_all_new
        Forwarded to ``cv.nested_cross_validation``.
    n_jobs
        Forwarded to ``cv.nested_cross_validation`` as ``n_jobs``.

    Returns
    -------
    None
        Results are printed and written to disk, not returned.
    """
    # Fixed storage locations of the datasets.
    disk = '/mnt/Seagate/pre_epi_seizures/'
    baseline_files = 'h5_files/processing_datasets/baseline_datasets_new'
    seizure_files = 'h5_files/processing_datasets/seizure_datasets_new'

    lead_list = ['ECG-']
    interim_processing = [scaler]
    assign_baseline = 'assign_equal_baseline_seizure'

    general_dir = disk + 'EDanalysis_new/'

    # Columns used to label results, and the column that defines CV groups.
    group_keys = ['patient_nr',
                  'seizure_nr',
                  'types_of_seizure',
                  'location']
    group_id = 'seizure_nr'

    # Directory for the exploratory data analysis.
    eda_dir = prepare_disk_space_hyper_param_results(
        directory=general_dir,
        patient_list=patient_list,
        lead_list=lead_list,
        scaler=scaler,
        interim_processing=interim_processing,
        assign_baseline=assign_baseline,
        label_struct=label_struct,
        baseline_label_struct=baseline_label_struct,
        feature_slot=feature_slot,
        group_id=group_id,
        hyper_param=hyper_param)

    # Name of the target column.
    label = 'label'

    # Cross-validation strategy: leave one group out, outer and inner loop.
    cv_out = LeavePGroupsOut(n_groups=1)
    cv_in = LeavePGroupsOut(n_groups=1)

    scoring = ['f1_micro']
    search_function = GridSearchCV

    # Only the step names take part in the directory description.
    pipe_steps = [step[0] for step in pipe.steps]

    # Directory (nested under eda_dir) for the classification objects.
    classification_dir = prepare_disk_space_hyper_param_results(
        directory=eda_dir,
        pipe=str(pipe_steps),
        param_grid=param_grid,
        cv_out=cv_out,
        cv_in=cv_in,
        scoring=scoring,
        search_function=search_function,
        label=label)

    if plot_eda_all_new:
        # NOTE(review): `data_groups_list` is not defined in this function;
        # unless it exists at module level this call raises NameError --
        # confirm.
        plot_eda(directory=classification_dir,
                 data_groups_list=data_groups_list)

    import classification.cross_validation as cv

    # Load the data; this happens whether or not learning is requested.
    data_struct = interim_process(disk, seizure_files, baseline_files,
                                  feature_slot, hyper_param,
                                  patient_list, lead_list,
                                  label_struct, baseline_label_struct,
                                  interim_processing)
    data = data_struct[0]
    features = data_struct[1]

    if learn_flag:
        # Prepare data for classification -- watch out for memory concerns.
        X = data[features]
        y = data[label]
        groups = data[group_id]
        learning_results = cv.nested_cross_validation(classification_dir,
                                                      X, y, groups,
                                                      pipe,
                                                      param_grid, scoring,
                                                      compute_all_new,
                                                      cv_out, cv_in,
                                                      search_function,
                                                      n_jobs=n_jobs)

        # Single-argument print calls behave the same on Python 2 and 3.
        print('These are the learning results')
        print(learning_results)

        # NOTE(review): this pairing relies on learning_results being in the
        # same order as the groupby keys -- confirm against
        # cv.nested_cross_validation.
        group_labels = data.groupby(group_keys).groups.keys()
        for learning_result, group in zip(learning_results, group_labels):
            learning_result['group'] = group
            learning_result['group_keys'] = group_keys

        report = cv.generate_classification_report(learning_results)
        report.to_hdf(classification_dir + 'classification_report.h5',
                      '/report')

        print(report)
    print('Done!')