Beispiel #1
0
def cross_validation(X, y, pre_x, groups, model='LGB', test_days=1):
    """Blend per-fold predictions for ``pre_x``, weighted by inverse fold loss.

    The group labels are first coarsened with ``floor((groups + 1) / 2)`` and
    then used for a leave-P-groups-out split with ``P = test_days``.  Each
    fold is fitted with ``LGB`` (when ``model == 'LGB'``) or ``LR`` (any other
    value); both are called as ``f(X_train, X_test, y_train, y_test, pre_x)``
    and must return a ``(prediction, loss)`` pair.

    Parameters
    ----------
    X, y : indexable with integer index arrays (``X[train]``)
    pre_x : object with a ``shape`` attribute; ``shape[0]`` sizes the result
    groups : array-like supporting ``+`` and ``/`` (group id per sample)
    model : str, ``'LGB'`` selects ``LGB``, anything else selects ``LR``
    test_days : int, number of (coarsened) groups left out per fold

    Returns
    -------
    numpy.ndarray
        Sum over folds of ``prediction * (1 / loss) / sum(1 / loss)``.

    Notes
    -----
    A fold loss of exactly 0 makes ``1.0 / loss`` fail; this is not guarded.
    The ``print`` calls use a single pre-formatted argument so the output is
    the same on Python 2 and Python 3.
    """
    groups = np.floor((groups + 1) / 2)

    logo = LeavePGroupsOut(n_groups=test_days)
    pre_sum = np.zeros(pre_x.shape[0])
    print(np.isnan(groups).astype(int).sum())
    print(np.unique(groups))

    fold_predictions = []
    fold_losses = []
    splits = logo.split(X, y, groups=groups)
    for fold_no, (train, test) in enumerate(splits, 1):
        print('times: %s' % (fold_no,))
        X_train, X_test = X[train], X[test]
        y_train, y_test = y[train], y[test]
        print('%s %s %s %s' % (X_train.shape, X_test.shape,
                               y_train.shape, y_test.shape))
        if model == 'LGB':
            pre, ll = LGB(X_train, X_test, y_train, y_test, pre_x)
        else:
            pre, ll = LR(X_train, X_test, y_train, y_test, pre_x)
        fold_losses.append(ll)
        fold_predictions.append(pre)

    # Lower loss -> larger weight; weights are normalised to sum to 1.
    weight = [1.0 / loss for loss in fold_losses]
    weight_sum = sum(weight)
    for pre, fold_weight in zip(fold_predictions, weight):
        pre_sum += pre * fold_weight / weight_sum

    print('weight %s' % (weight,))
    print('loss %s' % (fold_losses,))

    return pre_sum
Beispiel #2
0
class DKULeavePGroupsOut(object):
    """Leave-P-groups-out splitter whose groups are read from a column of X.

    The ``groups`` argument of ``get_n_splits`` / ``split`` is ignored; the
    group labels are always taken from the column of ``X`` whose label equals
    ``column_name``.  ``set_column_labels`` must be called before either
    method so that the column position can be resolved.
    """

    def __init__(self, column_name, p):
        """Store the group column name and build ``LeavePGroupsOut(p)``."""
        self.column_name = column_name
        self.splitter = LeavePGroupsOut(p)

    def set_column_labels(self, column_labels):
        """Record the ordered column labels of the ``X`` passed later on."""
        self.column_labels = column_labels

    def _groups_from(self, X):
        """Return the group column ``X[:, idx]`` for ``self.column_name``.

        Raises ``Exception`` when the column name is not in
        ``self.column_labels``.
        """
        try:
            column_idx = self.column_labels.index(self.column_name)
        except ValueError:
            raise Exception("Column %s not found among %s" %
                            (self.column_name, self.column_labels))
        return X[:, column_idx]

    def get_n_splits(self, X, y, groups=None):
        """Return (and print) the number of splits for the group column."""
        ret = self.splitter.get_n_splits(X, y, self._groups_from(X))
        print("Will use %s splits" % ret)
        return ret

    def split(self, X, y, groups=None):
        """Return the wrapped splitter's ``split`` result for the group column."""
        return self.splitter.split(X, y, self._groups_from(X))
def tmpFUN(dataset, group_label="groups", n_groups=2, y_label="groups", rf_n_estimators=2000, n_jobs=-1):
    """Fit and apply a random forest on every leave-P-groups-out split.

    For each split produced by ``LeavePGroupsOut(n_groups)`` over the rows of
    ``dataset`` (grouped by the ``group_label`` column), a balanced
    ``RandomForestClassifier`` is fitted on the training rows and used to
    predict the held-out rows.  The ``y_label`` column is the target and is
    dropped from the feature matrices.

    Parameters
    ----------
    dataset : object supporting ``.loc``, ``.iloc`` and ``.drop(label, axis=1)``
    group_label : column holding the group id of each row
    n_groups : number of groups held out per split
    y_label : column holding the target
    rf_n_estimators, n_jobs : forwarded to ``RandomForestClassifier``

    Notes
    -----
    The original body predicted on an undefined ``X_test``; it is now built
    from the validation rows.  The predictions are not stored or returned
    (the function returns ``None``, as before).
    NOTE(review): the snippet looks truncated after the ``predict`` call.
    """
    lpgo = LeavePGroupsOut(n_groups=n_groups)

    splits = lpgo.split(X=dataset,
                        y=dataset.loc[:, y_label],
                        groups=dataset.loc[:, group_label])
    for train_index, validate_index in splits:
        trainset = dataset.iloc[train_index, :]
        validateset = dataset.iloc[validate_index, :]
        X_train = trainset.drop(y_label, axis=1)
        y_train = trainset.loc[:, y_label]
        # Held-out features, built the same way as the training features.
        X_test = validateset.drop(y_label, axis=1)

        RF_mod = RandomForestClassifier(n_estimators=rf_n_estimators,
                                        n_jobs=n_jobs,
                                        class_weight="balanced")
        RF_mod.fit(X_train, y_train)
        RF_pred = RF_mod.predict(X_test)
Beispiel #4
0
def create_cv(x, y, subjects, P):
    """
    Materialise every leave-P-groups-out split as a list.

    :param x: samples, forwarded to ``LeavePGroupsOut.split``
    :param y: targets, forwarded to ``LeavePGroupsOut.split``
    :param subjects: group label of each sample (third ``split`` argument)
    :param P: number of groups left out per split (``n_groups``)
    :return: list of ``(train_index, test_index)`` tuples
    """
    splitter = LeavePGroupsOut(n_groups=P)
    return [(train_index, test_index)
            for train_index, test_index in splitter.split(x, y, subjects)]
Beispiel #5
0
class LeavePSubjectsOut():
    """Group splitter that falls back to a stored per-sample subject index.

    ``split`` and ``get_n_splits`` delegate to a ``LeavePGroupsOut`` instance,
    using ``subjects_indexes`` as the groups whenever no ``groups`` argument
    is supplied.
    """

    def __init__(self, subjects_indexes):
        self.subjects_indexes = subjects_indexes
        # NOTE(review): the first positional argument of LeavePGroupsOut is
        # ``n_groups``; an array of unique subject ids is passed here, not a
        # count.  Left unchanged because the intended P is not visible from
        # this class -- confirm with the author.
        self.splitter = LeavePGroupsOut(np.unique(subjects_indexes))

    def split(self, X=None, y=None, groups=None):
        """Return the wrapped splitter's ``split`` result.

        ``groups`` defaults to ``self.subjects_indexes`` when it is ``None``.
        """
        # Identity test: ``groups == None`` is evaluated element-wise for
        # numpy arrays and cannot be used as a truth value.
        if groups is None:
            groups = self.subjects_indexes
        return self.splitter.split(X, y, groups)

    def get_n_splits(self, X=None, y=None, groups=None):
        """Return the wrapped splitter's number of splits.

        ``groups`` defaults to ``self.subjects_indexes`` when it is ``None``.
        """
        if groups is None:
            groups = self.subjects_indexes
        return self.splitter.get_n_splits(X, y, groups)
Beispiel #6
0
def fold_maker(X, fold_choice='default', n_fold=4, n_groups=2):
    """Build the list of CV folds for the requested strategy.

    Supported ``fold_choice`` values:

    * ``'default'``      - unshuffled ``KFold(n_fold)`` passed through
      ``shuffle_group``
    * ``'earthquake'``   - ``LeaveOneGroupOut`` over the ids returned by
      ``data_loader.load_earthquake_id()``
    * ``'eqCombo'``      - ``LeaveOneGroupOut`` over ``eqComboMaker(n_fold)``,
      passed through ``shuffle_group``
    * ``'k-earthquake'`` - ``LeavePGroupsOut(n_groups)`` over the loaded
      earthquake ids, passed through ``min_valid_filter``
    * ``'customize'``    - ``CVPipe().fold_iter(num_fold=n_fold,
      mini_quake_prob=0.3)``

    Returns a ``(folds, fold_choice)`` tuple where ``folds`` is the fold
    iterator materialised with ``list``.  Any other ``fold_choice`` raises
    ``AttributeError``.
    """
    if fold_choice == 'default':
        splitter = KFold(n_splits=n_fold, shuffle=False)
        folds = shuffle_group(splitter.split(X))
    elif fold_choice == 'earthquake':
        quake_ids = data_loader.load_earthquake_id()
        folds = LeaveOneGroupOut().split(X, groups=quake_ids)
    elif fold_choice == 'eqCombo':
        quake_ids = eqComboMaker(n_fold)
        folds = shuffle_group(LeaveOneGroupOut().split(X, groups=quake_ids))
    elif fold_choice == 'k-earthquake':
        quake_ids = data_loader.load_earthquake_id()
        splitter = LeavePGroupsOut(n_groups=n_groups)
        folds = min_valid_filter(splitter.split(X, groups=quake_ids))
    elif fold_choice == 'customize':
        folds = CVPipe().fold_iter(num_fold=n_fold, mini_quake_prob=0.3)
    else:
        raise AttributeError(f"Not support CV {fold_choice} yet...")

    return (list(folds), fold_choice)
def cv_strategy(parameters):
    """Return the group cross-validator selected by ``parameters.cv_mode``.

    ``'GKF'`` gives ``GroupKFold(n_splits=parameters.cv_param)`` and
    ``'LPGO'`` gives ``LeavePGroupsOut(n_groups=parameters.cv_param)``.
    Any other mode raises ``ValueError``.
    """
    mode = parameters.cv_mode
    if mode == 'GKF':
        return GroupKFold(n_splits=parameters.cv_param)
    if mode == 'LPGO':
        return LeavePGroupsOut(n_groups=parameters.cv_param)
    raise ValueError("Unknown CV mode")
Beispiel #8
0
def test_cross_val_score_predict_groups():
    """Group splitters must make cross_val_score/predict fail without groups.

    For each group-based cross-validator, calling ``cross_val_score`` and
    ``cross_val_predict`` without a ``groups`` argument has to raise a
    ``ValueError`` carrying the "groups parameter should not be None" text.
    """
    X, y = make_classification(n_samples=20, n_classes=2, random_state=0)

    clf = SVC(kernel="linear")
    expected_message = "The groups parameter should not be None"

    splitters = (
        LeaveOneGroupOut(),
        LeavePGroupsOut(2),
        GroupKFold(),
        GroupShuffleSplit(),
    )
    for cv in splitters:
        # Same check for both helpers, score first and predict second.
        for cross_val_func in (cross_val_score, cross_val_predict):
            assert_raise_message(ValueError,
                                 expected_message,
                                 cross_val_func,
                                 estimator=clf,
                                 X=X,
                                 y=y,
                                 cv=cv)
Beispiel #9
0
 def _init_atributes(self, y, groups):
     """Validate ``y``/``groups`` and lazily fill the per-label settings.

     Attributes that are still ``None`` are derived as follows:

     * ``labels_list`` - the distinct values of ``y``
     * ``n_labs``      - ``len(labels_list)``
     * ``n_each``      - ``n_groups / n_labs`` as an int
     * ``lpgos``       - one ``LeavePGroupsOut(n_each)`` per label;
       ``indexes`` is set alongside it to the positions of each label in
       ``y`` (``np.where(y == label)[0]``)

     Raises
     ------
     Exception
         If ``y`` or ``groups`` is ``None`` or their lengths differ.
     AssertionError
         If ``n_groups`` is not a multiple of ``n_labs``.
     """
     # The None checks must run before len(); otherwise a missing argument
     # surfaces as a TypeError from len() and these messages are unreachable.
     if y is None:
         raise Exception("Error: y cannot be None")
     if groups is None:
         raise Exception("Error: this function requires a groups parameter")
     if len(y) != len(groups):
         raise Exception("Error: y and groups need to have the same length")
     if self.labels_list is None:
         self.labels_list = list(set(y))
     if self.n_labs is None:
         self.n_labs = len(self.labels_list)
     # Raised explicitly (same exception type as the former ``assert``) so
     # the check is not stripped when Python runs with -O.
     if self.n_groups % self.n_labs != 0:
         raise AssertionError(
             "Error: The number of groups to leave out must be a multiple of the number of classes"
         )
     if self.n_each is None:
         self.n_each = int(self.n_groups / self.n_labs)
     if self.lpgos is None:
         lpgos, indexes = [], []
         for label in self.labels_list:
             indexes.append(np.where(y == label)[0])
             lpgos.append(LeavePGroupsOut(self.n_each))
         self.lpgos = lpgos
         # NOTE(review): labels with different sample counts give a ragged
         # list here -- confirm np.array() handles that on the numpy in use.
         self.indexes = np.array(indexes)
Beispiel #10
0
def test_grid_search_groups():
    """GridSearchCV must require ``groups`` for group-based splitters.

    With a group splitter, ``fit`` without ``groups`` has to raise a
    ``ValueError`` and ``fit`` with ``groups`` has to succeed.  Splitters
    that do not use groups must fit without them.
    """
    rng = np.random.RandomState(0)

    X, y = make_classification(n_samples=15, n_classes=2, random_state=0)
    groups = rng.randint(0, 3, 15)

    estimator = LinearSVC(random_state=0)
    param_grid = {'C': [1]}
    missing_groups = "The groups parameter should not be None"

    group_splitters = [
        LeaveOneGroupOut(),
        LeavePGroupsOut(2),
        GroupKFold(),
        GroupShuffleSplit(),
    ]
    for splitter in group_splitters:
        search = GridSearchCV(estimator, param_grid, cv=splitter)
        assert_raise_message(ValueError, missing_groups, search.fit, X, y)
        search.fit(X, y, groups=groups)

    for splitter in [StratifiedKFold(), StratifiedShuffleSplit()]:
        search = GridSearchCV(estimator, param_grid, cv=splitter)
        # Non-group splitters need no ``groups``; this must not raise.
        search.fit(X, y)
def test_groups_support(Est):
    """The search estimator ``Est`` must require ``groups`` for group CV.

    ``Est`` is called as ``Est(estimator, param_grid, cv=...)``.  With a
    group splitter, ``fit`` without ``groups`` has to raise a ``ValueError``
    matching the expected message and ``fit`` with ``groups`` has to succeed.
    Splitters that do not use groups must fit without them.
    """
    rng = np.random.RandomState(0)

    X, y = make_classification(n_samples=50, n_classes=2, random_state=0)
    groups = rng.randint(0, 3, 50)

    estimator = LinearSVC(random_state=0)
    param_grid = {'C': [1]}
    error_msg = "The 'groups' parameter should not be None."

    group_splitters = [
        LeaveOneGroupOut(),
        LeavePGroupsOut(2),
        GroupKFold(n_splits=3),
        GroupShuffleSplit(random_state=0),
    ]
    for splitter in group_splitters:
        search = Est(estimator, param_grid, cv=splitter)
        with pytest.raises(ValueError, match=error_msg):
            search.fit(X, y)
        search.fit(X, y, groups=groups)

    plain_splitters = [
        StratifiedKFold(),
        StratifiedShuffleSplit(random_state=0),
    ]
    for splitter in plain_splitters:
        search = Est(estimator, param_grid, cv=splitter)
        # Non-group splitters need no ``groups``; this must not raise.
        search.fit(X, y)
Beispiel #12
0
def test_grid_search_groups():
    """dcv.GridSearchCV must require ``groups`` for group-based splitters.

    With a group splitter, ``fit`` without ``groups`` has to raise a
    ``ValueError`` whose text contains "parameter should not be None", and
    ``fit`` with ``groups`` has to succeed.  Splitters that do not use groups
    must fit without them.
    """
    rng = np.random.RandomState(0)

    X, y = make_classification(n_samples=15, n_classes=2, random_state=0)
    groups = rng.randint(0, 3, 15)

    clf = LinearSVC(random_state=0)
    grid = {"C": [1]}

    group_cvs = [
        LeaveOneGroupOut(),
        LeavePGroupsOut(2),
        GroupKFold(n_splits=3),
        GroupShuffleSplit(n_splits=3),
    ]
    for cv in group_cvs:
        gs = dcv.GridSearchCV(clf, grid, cv=cv)

        # The call itself is what must raise; wrapping it in ``assert``
        # added nothing because the assertion is never evaluated.
        with pytest.raises(ValueError) as exc:
            gs.fit(X, y)
        assert "parameter should not be None" in str(exc.value)

        gs.fit(X, y, groups=groups)

    non_group_cvs = [
        StratifiedKFold(n_splits=3),
        StratifiedShuffleSplit(n_splits=3),
    ]
    for cv in non_group_cvs:
        gs = dcv.GridSearchCV(clf, grid, cv=cv)
        # Should not raise an error
        gs.fit(X, y)
Beispiel #13
0
class Splits():
    """Subject-aware cross-validation splitter with several strategies.

    ``mode`` selects the strategy:

    * ``'bootstrap'``  - ``n_splits`` random partitions of the unique subject
      ids (``ShuffleSplit``), pre-computed and stored in ``self.splits``
    * ``'groupkfold'`` - ``GroupKFold(n_splits)``
    * ``'loso'``       - ``LeaveOneGroupOut()``
    * ``'lpso'``       - ``LeavePGroupsOut`` leaving out
      ``n_unique_subjects // n_splits`` groups

    For every mode except ``'bootstrap'`` the work is delegated to
    ``self.splitter`` and the ``X``/``y``/``groups`` arguments are forwarded.
    """

    def __init__(self, sub_indexes, train_size=0.33, n_splits=10, mode='loso'):
        # mode: 'bootstrap', 'groupkfold', 'loso' or 'lpso'
        self.si = sub_indexes
        self.train_size = train_size
        self.n_splits = n_splits
        self.mode = mode
        self.create_splits()

    def create_splits(self, splits=None):
        """Set ``self.splitter`` (and ``self.splits`` for bootstrap mode).

        The ``splits`` argument is ignored; it is kept for backward
        compatibility.

        Raises
        ------
        ValueError
            If ``self.mode`` is not one of the supported modes.  Previously
            an unknown mode left ``self.splitter`` unset, so the failure only
            showed up later as an ``AttributeError``.
        """
        if self.mode == 'bootstrap':
            unique = np.unique(self.si)

            rs = ShuffleSplit(n_splits=self.n_splits, test_size=1-self.train_size)
            splits = []
            for train, test in rs.split(unique):
                # Map the selected subject ids back to sample positions.
                # np.nonzero returns a 1-tuple of index arrays; that shape is
                # kept as-is for existing callers.
                train_ = np.nonzero(np.isin(self.si, unique[train]))
                test_ = np.nonzero(np.isin(self.si, unique[test]))
                splits.append((train_, test_))
            self.splits = splits
            self.splitter = None
        elif self.mode == 'groupkfold':
            self.splitter = GroupKFold(n_splits=self.n_splits)
        elif self.mode == 'loso':
            self.splitter = LeaveOneGroupOut()
        elif self.mode == 'lpso':
            self.splitter = LeavePGroupsOut(n_groups=len(np.unique(self.si))//self.n_splits)
        else:
            raise ValueError("Unknown split mode: %r" % (self.mode,))

    def get_n_splits(self, X=None, y=None, groups=None):
        """Return the number of splits (``self.n_splits`` in bootstrap mode)."""
        if self.splitter:
            return self.splitter.get_n_splits(X, y, groups)
        return self.n_splits

    def split(self, X=None, y=None, groups=None):
        """Yield ``(train, test)`` pairs from the splitter or stored splits."""
        if self.splitter:
            for i, j in self.splitter.split(X, y, groups):
                yield i, j
        else:
            for tt in self.splits:
                yield tt
Beispiel #14
0
def test_cross_validator_with_default_params():
    """Check n_splits, 1-d input, index dtype and repr of the basic splitters.

    For each cross-validator the test verifies that

    * ``get_n_splits`` returns the expected count,
    * splitting 2-d and 1-d data gives the same indices,
    * both the train and the test indices are integer arrays,
    * ``repr`` matches the expected string.
    """
    n_samples = 4
    n_unique_groups = 4
    n_splits = 2
    p = 2
    n_shuffle_splits = 10  # (the default value)

    X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
    X_1d = np.array([1, 2, 3, 4])
    y = np.array([1, 1, 2, 2])
    groups = np.array([1, 2, 3, 4])

    # (cross-validator, expected repr, expected number of splits)
    cases = [
        (LeaveOneOut(), "LeaveOneOut()", n_samples),
        (LeavePOut(p), "LeavePOut(p=2)", comb(n_samples, p)),
        (KFold(n_splits),
         "KFold(n_splits=2, random_state=None, shuffle=False)",
         n_splits),
        (StratifiedKFold(n_splits),
         "StratifiedKFold(n_splits=2, random_state=None, shuffle=False)",
         n_splits),
        (LeaveOneGroupOut(), "LeaveOneGroupOut()", n_unique_groups),
        (LeavePGroupsOut(p), "LeavePGroupsOut(n_groups=2)",
         comb(n_unique_groups, p)),
        (ShuffleSplit(random_state=0),
         ("ShuffleSplit(n_splits=10, random_state=0, test_size=0.1, "
          "train_size=None)"),
         n_shuffle_splits),
        # n_splits = no. of unique folds = 2
        (PredefinedSplit([1, 1, 2, 2]),
         "PredefinedSplit(test_fold=array([1, 1, 2, 2]))",
         2),
    ]

    for cv, cv_repr, expected_n_splits in cases:
        # Test if get_n_splits works correctly
        assert_equal(expected_n_splits, cv.get_n_splits(X, y, groups))

        # Test if the cross-validator works as expected even if
        # the data is 1d
        np.testing.assert_equal(list(cv.split(X, y, groups)),
                                list(cv.split(X_1d, y, groups)))
        # Test that train, test indices returned are integers
        # (the second check used to re-test ``train`` instead of ``test``)
        for train, test in cv.split(X, y, groups):
            assert_equal(np.asarray(train).dtype.kind, 'i')
            assert_equal(np.asarray(test).dtype.kind, 'i')

        # Test if the repr works without any errors
        assert_equal(cv_repr, repr(cv))
Beispiel #15
0
    def _cv_split_hold_out_by_subject_using_sklearn(self, tcrrep=None):
        """
        Return leave-one-subject-out train/test index partitions.

        Epitopes are label-encoded into the target vector, subjects are
        label-encoded into group ids, and ``LeavePGroupsOut(n_groups=1)`` is
        split over ``tcrrep.paired_tcrregex`` with those groups.

        Parameters
        ----------
        tcrrep : TCRrep class instance, optional
            Must provide ``clone_df.subject``, ``clone_df.epitope`` and
            ``paired_tcrregex``.  Defaults to ``self.tcrrep`` when None.

        Returns
        -------
        partitions : generator
            The object returned by ``LeavePGroupsOut.split``.

        """
        if tcrrep is None:
            tcrrep = self.tcrrep
        clones = tcrrep.clone_df

        # Target vector `y`: each unique epitope mapped to an integer code.
        epitope_encoder = preprocessing.LabelEncoder()
        epitope_encoder.fit(list(clones.epitope.unique()))
        y = epitope_encoder.transform(clones.epitope)

        # `X` is used as provided by the TCRrep instance.
        # NOTE(review): the original comment called this a precomputed
        # distance matrix -- confirm against the TCRrep definition.
        X = tcrrep.paired_tcrregex

        # Groups: each unique subject mapped to an integer code.
        subject_encoder = preprocessing.LabelEncoder().fit(
            list(clones.subject.unique()))
        groups = list(subject_encoder.transform(clones.subject))

        # Hold out one subject (group) per partition.
        splitter = LeavePGroupsOut(n_groups=1)
        # Result unused; the call is kept so the original call sequence on
        # the splitter is unchanged.
        splitter.get_n_splits(X, y, groups)
        return splitter.split(X, y, groups)
Beispiel #16
0
def test_leave_group_out_changing_groups():
    """Compare paired LeaveOneGroupOut / LeavePGroupsOut splits.

    Two split generators are created per splitter and must yield identical
    train/test indices; afterwards the number of splits is checked for 3
    unique groups.
    """
    groups = np.array([0, 1, 2, 1, 1, 2, 0, 0])
    X = np.ones(len(groups))
    groups_changing = np.array(groups, copy=True)
    lolo = LeaveOneGroupOut().split(X, groups=groups)
    # NOTE(review): the "_changing" generators are built from ``groups``, not
    # from ``groups_changing`` -- kept as in the original; confirm intent.
    lolo_changing = LeaveOneGroupOut().split(X, groups=groups)
    lplo = LeavePGroupsOut(n_groups=2).split(X, groups=groups)
    lplo_changing = LeavePGroupsOut(n_groups=2).split(X, groups=groups)
    groups_changing[:] = 0
    for llo, llo_changing in [(lolo, lolo_changing), (lplo, lplo_changing)]:
        for (train, test), (train_chan, test_chan) in zip(llo, llo_changing):
            assert_array_equal(train, train_chan)
            assert_array_equal(test, test_chan)

    # ``y`` was never defined in this test; ``X`` is passed in its place so
    # the calls no longer depend on an undefined name.
    # n_splits = no of 2 (p) group combinations of the unique groups = 3C2 = 3
    assert_equal(3, LeavePGroupsOut(n_groups=2).get_n_splits(X, y=X,
                                                             groups=groups))
    # n_splits = no of unique groups (C(uniq_lbls, 1) = n_unique_groups)
    assert_equal(3, LeaveOneGroupOut().get_n_splits(X, y=X, groups=groups))
Beispiel #17
0
    def create_splits(self, splits=None):
        """Set ``self.splitter`` (and ``self.splits`` for bootstrap mode).

        ``self.mode`` selects the strategy:

        * ``'bootstrap'``  - ``self.n_splits`` random partitions of the
          unique values of ``self.si`` (``ShuffleSplit``), stored in
          ``self.splits``; ``self.splitter`` is set to ``None``
        * ``'groupkfold'`` - ``GroupKFold(self.n_splits)``
        * ``'loso'``       - ``LeaveOneGroupOut()``
        * ``'lpso'``       - ``LeavePGroupsOut`` leaving out
          ``n_unique // self.n_splits`` groups

        The ``splits`` argument is ignored; it is kept for backward
        compatibility.

        Raises
        ------
        ValueError
            If ``self.mode`` is not one of the supported modes.  Previously
            an unknown mode silently left ``self.splitter`` unset.
        """
        if self.mode == 'bootstrap':
            unique = np.unique(self.si)

            rs = ShuffleSplit(n_splits=self.n_splits, test_size=1-self.train_size)
            splits = []
            for train, test in rs.split(unique):
                # Map the selected ids back to sample positions.  np.nonzero
                # returns a 1-tuple of index arrays; that shape is kept as-is
                # for existing callers.
                train_ = np.nonzero(np.isin(self.si, unique[train]))
                test_ = np.nonzero(np.isin(self.si, unique[test]))
                splits.append((train_, test_))
            self.splits = splits
            self.splitter = None
        elif self.mode == 'groupkfold':
            self.splitter = GroupKFold(n_splits=self.n_splits)
        elif self.mode == 'loso':
            self.splitter = LeaveOneGroupOut()
        elif self.mode == 'lpso':
            self.splitter = LeavePGroupsOut(n_groups=len(np.unique(self.si))//self.n_splits)
        else:
            raise ValueError("Unknown split mode: %r" % (self.mode,))
    def split_groups(filenames, labels, groups, size):
        """Pick one leave-P-groups-out split at random and apply it.

        The splits of ``LeavePGroupsOut(n_groups=size)`` are walked in order;
        each one is accepted when ``random() > 0.95``.  If none is accepted,
        the last split generated is used.

        Returns
        -------
        tuple
            ``(train_filenames, test_filenames, train_groups, train_labels)``
            as numpy arrays.
            NOTE(review): the test labels/groups are not returned and the
            order is groups-before-labels -- kept as in the original; confirm
            against the callers.
        """
        lpgo = LeavePGroupsOut(n_groups=size)
        # After the loop ``train``/``test`` hold either the accepted split or,
        # when the loop ran to completion, the final one.
        for train, test in lpgo.split(filenames, labels, groups=groups):
            if random() > 0.95:
                break

        train_filenames = np.array(filenames)[train]
        train_labels = np.array(labels)[train]
        train_groups = np.array(groups)[train]
        test_filenames = np.array(filenames)[test]

        return train_filenames, test_filenames, train_groups, train_labels
Beispiel #19
0
def test_leave_one_p_group_out_error_on_fewer_number_of_groups():
    """Group splitters must raise ValueError when given too few groups.

    Each case builds ``X = y = groups`` from one array, requests the first
    split and checks the error message.
    """
    too_few_for_logo = (
        "The groups parameter contains fewer than 2 unique groups ([ 1.]). "
        "LeaveOneGroupOut expects at least 2.")
    too_few_for_lpgo = (
        "The groups parameter contains fewer than (or equal to) n_groups "
        "(3) numbers of unique groups (%s). LeavePGroupsOut expects "
        "that at least n_groups + 1 (4) unique groups be present")

    # (data used for X, y and groups; splitter factory; expected message)
    cases = [
        (np.ones(0), LeaveOneGroupOut, "Found array with 0 sample(s)"),
        (np.ones(1), LeaveOneGroupOut, too_few_for_logo),
        (np.ones(1), lambda: LeavePGroupsOut(n_groups=3),
         too_few_for_lpgo % "[ 1.]"),
        (np.arange(3), lambda: LeavePGroupsOut(n_groups=3),
         too_few_for_lpgo % "[0 1 2]"),
    ]
    for data, make_splitter, msg in cases:
        X = y = groups = data
        assert_raise_message(ValueError, msg, next,
                             make_splitter().split(X, y, groups))
Beispiel #20
0
def _cv_build(cv_scheme):
    """Build a cross-validator from a scheme mapping.

    ``None`` is passed through as ``None``.  Otherwise ``cv_scheme['type']``
    selects the splitter: ``'kfold'`` gives a shuffled ``StratifiedKFold``
    with ``cv_scheme['n_splits']`` folds (default 6) and ``'loso'`` gives
    ``LeavePGroupsOut(n_groups=1)``.  Any other type raises ``RuntimeError``.
    """
    LOG.debug('Building CV scheme: %s', str(cv_scheme))
    if cv_scheme is None:
        return None

    scheme_type = cv_scheme.get('type', '')
    if scheme_type == 'kfold':
        return StratifiedKFold(n_splits=cv_scheme.get('n_splits', 6),
                               shuffle=True)
    if scheme_type == 'loso':
        return LeavePGroupsOut(n_groups=1)

    raise RuntimeError('Unknown CV scheme (%s)' % str(cv_scheme))
Beispiel #21
0
    def split(self):
        """Assemble the splitter factories and hand them to ``__split__``.

        A "complete" split (both ``use_test`` and ``use_validation`` truthy)
        is logged as 3/1/1, otherwise as 4/1.  The factories are zero-argument
        callables so the splitters are only created when ``__split__`` asks
        for them; the two sub-splitter factories return ``None`` when the
        split is not complete.
        """
        complete: bool = self.use_test and self.use_validation
        ratio = '3/1/1' if complete else '4/1'
        logging.info(f"Conducting a {ratio} Split.")

        def make_normal_splitter():
            return StratifiedShuffleSplit(n_splits=self.n_splits,
                                          test_size=0.4 if complete else 0.2)

        def make_normal_sub_splitter():
            if complete:
                return StratifiedShuffleSplit(n_splits=1, test_size=0.5)
            return None

        def make_anomaly_splitter():
            return LeavePGroupsOut(n_groups=2 if complete else 1)

        def make_anomaly_sub_splitter():
            if complete:
                return LeavePGroupsOut(n_groups=1)
            return None

        split_args: dict = {
            "make_normal_splitter": make_normal_splitter,
            "make_normal_sub_splitter": make_normal_sub_splitter,
            "make_anomaly_splitter": make_anomaly_splitter,
            "make_anomaly_sub_splitter": make_anomaly_sub_splitter,
            "use_test": self.use_test,
            "use_validation": self.use_validation,
        }

        return self.__split__(split_args)
def construct_exp_splits(feature_frame, leave_n_out=1):
    """Build leave-N-experiments-out (train, test) splits for a feature frame.

    The experiment of each row is taken from level 0 of the frame's index.
    The returned indices are integer positions into ``feature_frame.values``,
    not index labels.

    Input:
        feature_frame : DataFrame
            A pandas dataframe returned by extract_features_targets
            representing multiple experiments
        leave_n_out : int
            The number of experiments held out in each cross validation fold
    Returns: [(Array, Array)]
        A list of (train index, test index) splits
    """
    experiment_ids = feature_frame.index.get_level_values(0)
    splitter = LeavePGroupsOut(n_groups=leave_n_out)
    cv_splits = []
    for train_index, test_index in splitter.split(feature_frame.values,
                                                  groups=experiment_ids):
        cv_splits.append((train_index, test_index))
    return cv_splits
Beispiel #23
0
def test_leave_one_p_group_out():
    """Check repr, split count and split contents of the group-out splitters.

    For every entry of ``test_groups`` and each splitter the test verifies
    the number of splits, that train and test groups are disjoint, that train
    and test together cover all samples, and that exactly ``p`` groups end up
    in the test set.
    """
    logo = LeaveOneGroupOut()
    lpgo_1 = LeavePGroupsOut(n_groups=1)
    lpgo_2 = LeavePGroupsOut(n_groups=2)

    # Make sure the repr works
    assert_equal(repr(logo), 'LeaveOneGroupOut()')
    assert_equal(repr(lpgo_1), 'LeavePGroupsOut(n_groups=1)')
    assert_equal(repr(lpgo_2), 'LeavePGroupsOut(n_groups=2)')
    assert_equal(repr(LeavePGroupsOut(n_groups=3)),
                 'LeavePGroupsOut(n_groups=3)')

    for cv, p_groups_out in ((logo, 1), (lpgo_1, 1), (lpgo_2, 2)):
        for groups_i in test_groups:
            n_groups = len(np.unique(groups_i))
            n_splits = (n_groups if p_groups_out == 1 else n_groups *
                        (n_groups - 1) / 2)
            X = y = np.ones(len(groups_i))

            # Test that the length is correct
            assert_equal(cv.get_n_splits(X, y, groups=groups_i), n_splits)

            groups_arr = np.asarray(groups_i)

            # Split using the original list / array / list of string groups_i
            for train, test in cv.split(X, y, groups=groups_i):
                # First test: no train group is in the test set and vice versa
                assert_array_equal(
                    np.intersect1d(groups_arr[train],
                                   groups_arr[test]).tolist(), [])

                # Second test: train and test add up to all the data
                assert_equal(len(train) + len(test), len(groups_i))

                # Third test:
                # The number of groups in test must be equal to p_groups_out.
                # assert_equal is required here: assert_true would treat the
                # second argument as the failure message and always pass.
                assert_equal(np.unique(groups_arr[test]).shape[0],
                             p_groups_out)
Beispiel #24
0
    def make_leave_out(X, y=None, p=5, strategy=None, group=None):
        """Return a leave-out split generator over ``X``.

        With ``strategy == 'group'`` whole groups are held out
        (``LeaveOneGroupOut`` for ``p == 1``, otherwise ``LeavePGroupsOut(p)``)
        and ``group`` is mandatory.  With any other strategy individual
        samples are held out (``LeaveOneOut`` for ``p == 1``, otherwise
        ``LeavePOut(p)``); ``group`` is still forwarded as ``groups``.

        Raises ``Exception`` when the group strategy is requested without
        ``group``.
        """
        if strategy == 'group':
            # Group strategy: the splitter is built first, then the required
            # ``group`` argument is checked.
            if p == 1:
                spliter = LeaveOneGroupOut()
            else:
                spliter = LeavePGroupsOut(p)
            if group is None:
                raise Exception('Please provide group parameter.')
            return spliter.split(X, y=y, groups=group)

        # No specific strategy: leave out individual samples.
        if p == 1:
            spliter = LeaveOneOut()
        else:
            spliter = LeavePOut(p)
        return spliter.split(X, y=y, groups=group)
Beispiel #25
0
 def grid_search_hyperparams(self,
                             model,
                             data,
                             feature_names,
                             hyperparam_grid,
                             n_leave_out=1):
     """Grid-search ``hyperparam_grid`` with leave-P-groups-out CV.

     Folds are built from the ``'group_id'`` column of ``data``, holding out
     ``n_leave_out`` groups per fold.  Features are ``data[feature_names]``
     and the target is the flattened ``data[self.target]``.  Candidates are
     scored with ``self.metric`` and ``GridSearchCV`` runs with ``n_jobs=-1``.

     Returns the ``best_params_`` of the fitted search.
     """
     features = data[feature_names]
     target = np.ravel(data[self.target])

     splitter = LeavePGroupsOut(n_groups=n_leave_out)
     fold_indices = splitter.split(features, target, groups=data['group_id'])

     search = GridSearchCV(model,
                           hyperparam_grid,
                           cv=fold_indices,
                           scoring=self.metric,
                           n_jobs=-1)
     search.fit(features, target)
     return search.best_params_
def leave_P_out_iter(data,
                     labels,
                     s_labels,
                     train_grp_animals,
                     num_groups=2,
                     clf=None,
                     **kwargs):
    """Run leave-P-groups-out over groups derived from the sample labels.

    The group of each sample comes from
    ``lb.grouping_crossval(s_labels, ani_gps=train_grp_animals)``.

    Parameters
    ----------
    data, labels : forwarded to the iteration / training helpers
    s_labels : per-sample labels used to derive the groups
    train_grp_animals : forwarded to ``lb.grouping_crossval`` as ``ani_gps``
    num_groups : passed to ``LeavePGroupsOut`` as ``n_groups``.
        NOTE(review): the previous docstring described this as the number of
        groups in the train split and restricted it to 2 or 3 -- confirm.
    clf : classifier selector; when ``None`` only the iterations are returned
    **kwargs : forwarded to ``get_train_cv_results``

    Returns
    -------
    The result of ``get_iterations`` when ``clf`` is ``None``, otherwise the
    result of ``get_train_cv_results`` for the training function resolved by
    ``get_train_function(clf)``.
    """
    lpgo = LeavePGroupsOut(n_groups=num_groups)
    groups_list = lb.grouping_crossval(s_labels, ani_gps=train_grp_animals)
    # Identity test: ``clf == None`` would call a custom ``__eq__`` when the
    # classifier object defines one.
    if clf is None:
        return get_iterations(data, lpgo, labels, groups_list)

    clf_function = get_train_function(clf)
    return get_train_cv_results(data, labels, lpgo, clf_function,
                                groups_list, **kwargs)
Beispiel #27
0
 def __init__(self, y, nsuj, pout=1, clf='lda', **clfArg):
     """Set up a leave-P-subjects-out classification helper.

     Stores ``y``, its concatenated and flattened form, ``nsuj`` and
     ``pout``; builds ``LeavePGroupsOut(pout)`` and attaches descriptive
     attributes to it; resolves ``clf`` through ``defClf`` when it is given
     as an int or str; then refreshes the info string and creates the
     ``clfstat`` helper.
     """
     self._y = y
     self._ry = np.ravel(np.concatenate(y))
     self._nsuj = nsuj
     self._pout = pout

     # Cross-validation object, annotated with display strings and metadata.
     cv = LeavePGroupsOut(pout)
     self._cv = cv
     description = ' '.join(['Leave', str(pout), 'subjects out'])
     cv.shStr = description
     cv.lgStr = description
     cv.rep = 1
     cv.y = y[0]

     # Classifier: an int/str selector is turned into an object by defClf.
     if isinstance(clf, (int, str)):
         clf = defClf(self._ry, clf=clf, **clfArg)
     self._clf = clf

     # Info string:
     self._updatestring()

     # Statistics helper:
     self.stat = clfstat()
def GetCVObject(type, **kwargs):
    """Instantiate the scikit-learn cross-validator named by ``type``.

    ``kwargs`` are forwarded to the constructor, except for ``LeaveOneOut``
    and ``LeaveOneGroupOut``, which are built without arguments.  An
    unrecognised ``type`` returns ``None``.
    """
    # Factories are wrapped in lambdas so only the selected class is touched.
    factories = (
        ('KFold', lambda: KFold(**kwargs)),
        ('StratifiedKFold', lambda: StratifiedKFold(**kwargs)),
        ('GroupKFold', lambda: GroupKFold(**kwargs)),
        ('ShuffleSplit', lambda: ShuffleSplit(**kwargs)),
        ('StratifiedShuffleSplit', lambda: StratifiedShuffleSplit(**kwargs)),
        ('GroupShuffleSplit', lambda: GroupShuffleSplit(**kwargs)),
        ('LeaveOneOut', lambda: LeaveOneOut()),
        ('LeavePOut', lambda: LeavePOut(**kwargs)),
        ('LeaveOneGroupOut', lambda: LeaveOneGroupOut()),
        ('LeavePGroupsOut', lambda: LeavePGroupsOut(**kwargs)),
    )
    for name, build in factories:
        if type == name:
            return build()
    return None
Beispiel #29
0
    def test_grid_search_groups(self):
        """TuneGridSearchCV must require ``groups`` for group-based splitters.

        With a group splitter, ``fit`` without ``groups`` must either log a
        "parameter should not be None." message on the ``ray.tune`` logger or
        raise a ``ValueError`` containing that text; ``fit`` with ``groups``
        must succeed.  Splitters that do not use groups must fit without them.
        """
        rng = np.random.RandomState(0)

        X, y = make_classification(n_samples=15, n_classes=2, random_state=0)
        groups = rng.randint(0, 3, 15)

        estimator = LinearSVC(random_state=0)
        param_grid = {"C": [1]}

        group_splitters = [
            LeaveOneGroupOut(),
            LeavePGroupsOut(2),
            GroupKFold(n_splits=3),
            GroupShuffleSplit(n_splits=3),
        ]
        for splitter in group_splitters:
            search = TuneGridSearchCV(estimator, param_grid, cv=splitter)
            try:
                with self.assertLogs("ray.tune") as captured:
                    search.fit(X, y)
                logged = str(captured.output)
                self.assertTrue(("parameter should not be None.") in logged)
            except ValueError as exc:
                self.assertTrue("parameter should not be None" in str(exc))

            search.fit(X, y, groups=groups)

        plain_splitters = [
            StratifiedKFold(n_splits=3),
            StratifiedShuffleSplit(n_splits=3),
        ]
        for splitter in plain_splitters:
            search = TuneGridSearchCV(estimator, param_grid, cv=splitter)
            # Non-group splitters need no ``groups``; this must not raise.
            search.fit(X, y)
Beispiel #30
0
    def fit(self):
        """Select random-forest hyper-parameters with leave-one-group-out CV.

        Does nothing when ``self._pickled`` is truthy.  Otherwise a
        ``RobustGridSearchCV`` over ``self.param['rfc']`` is fitted on the
        sample returned by ``self._generate_sample()`` (``X, y, groups``),
        scored with ROC AUC, and the value returned by ``grid.fit`` is stored
        in ``self._estimator``.
        """
        from sklearn.ensemble import RandomForestClassifier as RFC
        # Skip fitting entirely when the pickled flag is set.
        if self._pickled:
            LOG.info('Classifier was loaded from file, cancelling fitting.')
            return

        LOG.info('Start fitting ...')
        estimator = RFC()
        # One group is held out per fold (LeavePGroupsOut with n_groups=1).
        grid = RobustGridSearchCV(estimator,
                                  self.param['rfc'],
                                  error_score=0.5,
                                  refit=True,
                                  scoring=check_scoring(estimator,
                                                        scoring='roc_auc'),
                                  n_jobs=self.n_jobs,
                                  cv=LeavePGroupsOut(n_groups=1),
                                  verbose=0)

        X, y, groups = self._generate_sample()
        self._estimator = grid.fit(X, y, groups=groups)

        LOG.info('Model selection - best parameters (roc_auc=%f) %s',
                 grid.best_score_, grid.best_params_)