def cv_strategy(parameters):
    """Return the group-aware cross-validator selected by ``parameters``.

    ``parameters.cv_mode`` chooses the splitter and ``parameters.cv_param``
    configures it:

    * ``'GKF'``  -> ``GroupKFold(n_splits=cv_param)``
    * ``'LPGO'`` -> ``LeavePGroupsOut(n_groups=cv_param)``

    Raises:
        ValueError: if ``cv_mode`` is neither of the values above.
    """
    mode = parameters.cv_mode
    if mode == 'GKF':
        return GroupKFold(n_splits=parameters.cv_param)
    if mode == 'LPGO':
        return LeavePGroupsOut(n_groups=parameters.cv_param)
    raise ValueError("Unknown CV mode")
def _init_atributes(self, y, groups):
    """Validate ``y``/``groups`` and lazily fill in the per-label split state.

    Attributes of ``self`` that are still ``None`` are derived here:
    ``labels_list`` (unique values of ``y``), ``n_labs`` (number of labels),
    ``n_each`` (``n_groups / n_labs``) and ``lpgos`` / ``indexes`` (one
    ``LeavePGroupsOut(n_each)`` and one array of sample positions per label).
    Attributes that are already set are left untouched.

    Raises:
        Exception: if ``y`` or ``groups`` is ``None`` or their lengths differ.
        AssertionError: if ``n_groups`` is not a multiple of ``n_labs``.
    """
    # The None checks have to come before len(): len(None) raises a
    # TypeError that would hide the intended, more helpful message.
    if y is None:
        raise Exception("Error: y cannot be None")
    if groups is None:
        raise Exception("Error: this function requires a groups parameter")
    if len(y) != len(groups):
        raise Exception("Error: y and groups need to have the same length")

    if self.labels_list is None:
        self.labels_list = list(set(y))
    if self.n_labs is None:
        self.n_labs = len(self.labels_list)
    assert (
        self.n_groups % self.n_labs == 0
    ), "Error: The number of groups to leave out must be a multiple of the number of classes"
    if self.n_each is None:
        self.n_each = int(self.n_groups / self.n_labs)
    if self.lpgos is None:
        # Compare against an ndarray so that a plain list/tuple ``y`` is
        # matched element-wise instead of collapsing to a single boolean.
        y_arr = np.asarray(y)
        lpgos, indexes = [], []
        for label in self.labels_list:
            indexes.append(np.where(y_arr == label)[0])
            lpgos.append(LeavePGroupsOut(self.n_each))
        self.lpgos = lpgos
        self.indexes = np.array(indexes)
def test_grid_search_groups():
    """GridSearchCV must hand ``groups`` through to group-aware splitters.

    For each group-based cross-validator, fitting without ``groups`` has to
    raise the splitter's ``ValueError`` and fitting with ``groups`` has to
    succeed.  Splitters that do not use groups must fit without them.
    """
    X, y = make_classification(n_samples=15, n_classes=2, random_state=0)
    groups = np.random.RandomState(0).randint(0, 3, 15)
    estimator = LinearSVC(random_state=0)
    param_grid = {'C': [1]}

    group_aware = (
        LeaveOneGroupOut(),
        LeavePGroupsOut(2),
        GroupKFold(),
        GroupShuffleSplit(),
    )
    for splitter in group_aware:
        search = GridSearchCV(estimator, param_grid, cv=splitter)
        # Without groups the splitter's error must surface through fit().
        assert_raise_message(ValueError,
                             "The groups parameter should not be None",
                             search.fit, X, y)
        search.fit(X, y, groups=groups)

    for splitter in (StratifiedKFold(), StratifiedShuffleSplit()):
        # Should not raise an error
        GridSearchCV(estimator, param_grid, cv=splitter).fit(X, y)
def test_cross_val_score_predict_groups():
    """cross_val_score and cross_val_predict must surface the missing-groups error.

    Every group-based cross-validator is run through both helpers without a
    ``groups`` argument; each call has to raise the splitter's ``ValueError``.
    """
    X, y = make_classification(n_samples=20, n_classes=2, random_state=0)
    estimator = SVC(kernel="linear")

    group_aware = (
        LeaveOneGroupOut(),
        LeavePGroupsOut(2),
        GroupKFold(),
        GroupShuffleSplit(),
    )
    for splitter in group_aware:
        for helper in (cross_val_score, cross_val_predict):
            assert_raise_message(ValueError,
                                 "The groups parameter should not be None",
                                 helper, estimator=estimator, X=X, y=y,
                                 cv=splitter)
def test_groups_support(Est):
    """The search class ``Est`` must forward ``groups`` to the splitter.

    With a group-based cross-validator, ``fit`` without ``groups`` has to
    raise ``ValueError`` and ``fit`` with ``groups`` has to succeed.  With a
    splitter that does not use groups, ``fit`` must work without them.
    """
    X, y = make_classification(n_samples=50, n_classes=2, random_state=0)
    groups = np.random.RandomState(0).randint(0, 3, 50)
    base_estimator = LinearSVC(random_state=0)
    param_grid = {'C': [1]}
    error_msg = "The 'groups' parameter should not be None."

    group_aware = (
        LeaveOneGroupOut(),
        LeavePGroupsOut(2),
        GroupKFold(n_splits=3),
        GroupShuffleSplit(random_state=0),
    )
    for splitter in group_aware:
        search = Est(base_estimator, param_grid, cv=splitter)
        with pytest.raises(ValueError, match=error_msg):
            search.fit(X, y)
        search.fit(X, y, groups=groups)

    group_agnostic = (StratifiedKFold(), StratifiedShuffleSplit(random_state=0))
    for splitter in group_agnostic:
        # Should not raise an error
        Est(base_estimator, param_grid, cv=splitter).fit(X, y)
def test_grid_search_groups():
    """dcv.GridSearchCV must hand ``groups`` through to group-aware splitters.

    Fitting without ``groups`` has to raise a ``ValueError`` mentioning
    "parameter should not be None" for each group-based splitter, while
    fitting with ``groups`` succeeds.  Splitters that do not use groups must
    fit without them.
    """
    X, y = make_classification(n_samples=15, n_classes=2, random_state=0)
    groups = np.random.RandomState(0).randint(0, 3, 15)
    estimator = LinearSVC(random_state=0)
    param_grid = {"C": [1]}

    group_aware = (
        LeaveOneGroupOut(),
        LeavePGroupsOut(2),
        GroupKFold(n_splits=3),
        GroupShuffleSplit(n_splits=3),
    )
    for splitter in group_aware:
        search = dcv.GridSearchCV(estimator, param_grid, cv=splitter)
        with pytest.raises(ValueError) as exc:
            assert search.fit(X, y)
        assert "parameter should not be None" in str(exc.value)
        search.fit(X, y, groups=groups)

    group_agnostic = (
        StratifiedKFold(n_splits=3),
        StratifiedShuffleSplit(n_splits=3),
    )
    for splitter in group_agnostic:
        # Should not raise an error
        dcv.GridSearchCV(estimator, param_grid, cv=splitter).fit(X, y)
def cross_validation(X, y, pre_x, groups, model='LGB', test_days=1):
    """Blend per-fold predictions for ``pre_x`` using inverse-loss weights.

    ``groups`` is first coarsened with ``floor((groups + 1) / 2)``; then
    ``test_days`` of the resulting groups are held out per fold
    (``LeavePGroupsOut``).  Each fold calls ``LGB`` when ``model == 'LGB'``
    and ``LR`` otherwise; both are expected to return a prediction for
    ``pre_x`` and a loss.  The fold predictions are combined with weights
    proportional to ``1 / loss``.

    Progress information is printed for every fold.

    Returns:
        Array of length ``pre_x.shape[0]`` holding the weighted average of
        the fold predictions.
    """
    groups = np.floor((groups + 1) / 2)
    splitter = LeavePGroupsOut(n_groups=test_days)

    # Single-argument print(...) emits the same text on Python 2 and 3,
    # unlike the print statement, which is a syntax error on Python 3.
    print(np.isnan(groups).astype(int).sum())
    print(np.unique(groups))

    fit_fold = LGB if model == 'LGB' else LR
    predictions, losses = [], []
    folds = splitter.split(X, y, groups=groups)
    for fold_no, (train, test) in enumerate(folds, 1):
        print('times: %d' % fold_no)
        X_train, X_test = X[train], X[test]
        y_train, y_test = y[train], y[test]
        print(' '.join(str(part.shape)
                       for part in (X_train, X_test, y_train, y_test)))
        pre, ll = fit_fold(X_train, X_test, y_train, y_test, pre_x)
        losses.append(ll)
        predictions.append(pre)

    # A lower loss gives a larger share of the blended prediction.
    weight = [1.0 / loss for loss in losses]
    weight_sum = sum(weight)

    pre_sum = np.zeros(pre_x.shape[0])
    for pre, fold_weight in zip(predictions, weight):
        pre_sum += pre * fold_weight / weight_sum

    print('weight %s' % (weight,))
    print('loss %s' % (losses,))
    return pre_sum
def fold_maker(X, fold_choice='default', n_fold=4, n_groups=2):
    """Materialise the folds for the requested cross-validation scheme.

    Supported ``fold_choice`` values:

    * ``'default'``      - unshuffled ``KFold(n_fold)`` passed through
      ``shuffle_group``.
    * ``'earthquake'``   - leave one earthquake id out.
    * ``'eqCombo'``      - leave one group out over the ids from
      ``eqComboMaker(n_fold)``, passed through ``shuffle_group``.
    * ``'k-earthquake'`` - leave ``n_groups`` earthquake ids out, passed
      through ``min_valid_filter``.
    * ``'customize'``    - folds produced by ``CVPipe``.

    Returns:
        ``(folds, fold_choice)`` where ``folds`` is the list of folds.

    Raises:
        AttributeError: for any other ``fold_choice``.
    """
    if fold_choice == 'default':
        k_fold = KFold(n_splits=n_fold, shuffle=False)
        fold_iter = shuffle_group(k_fold.split(X))
    elif fold_choice == 'earthquake':
        quake_ids = data_loader.load_earthquake_id()
        fold_iter = LeaveOneGroupOut().split(X, groups=quake_ids)
    elif fold_choice == 'eqCombo':
        quake_ids = eqComboMaker(n_fold)
        fold_iter = shuffle_group(
            LeaveOneGroupOut().split(X, groups=quake_ids))
    elif fold_choice == 'k-earthquake':
        quake_ids = data_loader.load_earthquake_id()
        leave_k_out = LeavePGroupsOut(n_groups=n_groups)
        fold_iter = min_valid_filter(leave_k_out.split(X, groups=quake_ids))
    elif fold_choice == 'customize':
        fold_iter = CVPipe().fold_iter(num_fold=n_fold, mini_quake_prob=0.3)
    else:
        raise AttributeError(f"Not support CV {fold_choice} yet...")

    return list(fold_iter), fold_choice
def test_cross_validator_with_default_params():
    """Check the basic cross-validators on a tiny four-sample data set.

    For each splitter the test verifies ``get_n_splits``, that 1-d and 2-d
    ``X`` give the same splits, that both train and test indices are
    integers, and that ``repr`` matches the expected string.
    """
    n_samples = 4
    n_unique_groups = 4
    n_splits = 2
    p = 2
    n_shuffle_splits = 10  # (the default value)

    X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
    X_1d = np.array([1, 2, 3, 4])
    y = np.array([1, 1, 2, 2])
    groups = np.array([1, 2, 3, 4])

    loo = LeaveOneOut()
    lpo = LeavePOut(p)
    kf = KFold(n_splits)
    skf = StratifiedKFold(n_splits)
    lolo = LeaveOneGroupOut()
    lopo = LeavePGroupsOut(p)
    ss = ShuffleSplit(random_state=0)
    ps = PredefinedSplit([1, 1, 2, 2])  # n_splits = no of unique folds = 2

    loo_repr = "LeaveOneOut()"
    lpo_repr = "LeavePOut(p=2)"
    kf_repr = "KFold(n_splits=2, random_state=None, shuffle=False)"
    skf_repr = "StratifiedKFold(n_splits=2, random_state=None, shuffle=False)"
    lolo_repr = "LeaveOneGroupOut()"
    lopo_repr = "LeavePGroupsOut(n_groups=2)"
    ss_repr = ("ShuffleSplit(n_splits=10, random_state=0, test_size=0.1, "
               "train_size=None)")
    ps_repr = "PredefinedSplit(test_fold=array([1, 1, 2, 2]))"

    n_splits_expected = [
        n_samples, comb(n_samples, p), n_splits, n_splits,
        n_unique_groups, comb(n_unique_groups, p), n_shuffle_splits, 2,
    ]
    cvs = [loo, lpo, kf, skf, lolo, lopo, ss, ps]
    cv_reprs = [loo_repr, lpo_repr, kf_repr, skf_repr,
                lolo_repr, lopo_repr, ss_repr, ps_repr]

    for cv, cv_repr, expected in zip(cvs, cv_reprs, n_splits_expected):
        # Test if get_n_splits works correctly
        assert_equal(expected, cv.get_n_splits(X, y, groups))

        # Test if the cross-validator works as expected even if
        # the data is 1d
        np.testing.assert_equal(list(cv.split(X, y, groups)),
                                list(cv.split(X_1d, y, groups)))

        # Test that train, test indices returned are integers.  Both index
        # arrays are checked; previously ``train`` was checked twice and
        # ``test`` not at all.
        for train, test in cv.split(X, y, groups):
            assert_equal(np.asarray(train).dtype.kind, 'i')
            assert_equal(np.asarray(test).dtype.kind, 'i')

        # Test if the repr works without any errors
        assert_equal(cv_repr, repr(cv))
def test_leave_group_out_changing_groups():
    """Compare two sets of group splitters created before an array is mutated.

    Two identical ``LeaveOneGroupOut`` generators and two identical
    ``LeavePGroupsOut(n_groups=2)`` generators are created, a copy of the
    groups array is zeroed, and the generators are then consumed pairwise
    and required to yield equal splits.  Finally the number of splits is
    checked for both splitters.
    """
    groups = np.array([0, 1, 2, 1, 1, 2, 0, 0])
    X = np.ones(len(groups))
    groups_changing = np.array(groups, copy=True)

    # NOTE(review): the "*_changing" generators receive ``groups`` and not
    # ``groups_changing``, so the mutation below never reaches a splitter -
    # confirm this is intended.
    lolo = LeaveOneGroupOut().split(X, groups=groups)
    lolo_changing = LeaveOneGroupOut().split(X, groups=groups)
    lplo = LeavePGroupsOut(n_groups=2).split(X, groups=groups)
    lplo_changing = LeavePGroupsOut(n_groups=2).split(X, groups=groups)
    groups_changing[:] = 0

    for llo, llo_changing in [(lolo, lolo_changing), (lplo, lplo_changing)]:
        for (train, test), (train_chan, test_chan) in zip(llo, llo_changing):
            assert_array_equal(train, train_chan)
            assert_array_equal(test, test_chan)

    # ``y`` is not defined in this function; ``X`` is passed as a stand-in
    # target so the test does not depend on a module-level ``y``.
    # n_splits = no of 2 (p) group combinations of the unique groups = 3C2 = 3
    assert_equal(
        3, LeavePGroupsOut(n_groups=2).get_n_splits(X, y=X, groups=groups))
    # n_splits = no of unique groups (C(uniq_lbls, 1) = n_unique_groups)
    assert_equal(3, LeaveOneGroupOut().get_n_splits(X, y=X, groups=groups))
def test_leave_one_p_group_out_error_on_fewer_number_of_groups():
    """Too few samples or too few groups must raise a descriptive ValueError.

    Each scenario uses one array as ``X``, ``y`` and ``groups`` at once and
    expects the first ``next()`` on the split generator to fail with the
    given message.
    """
    def expect_failure(message, splitter, data):
        # ``data`` plays the role of X, y and groups simultaneously.
        assert_raise_message(ValueError, message, next,
                             splitter.split(data, data, data))

    # No samples at all.
    expect_failure("Found array with 0 sample(s)",
                   LeaveOneGroupOut(), np.ones(0))

    # A single group is not enough for LeaveOneGroupOut.
    too_few_for_logo = (
        "The groups parameter contains fewer than 2 unique groups ([ 1.]). "
        "LeaveOneGroupOut expects at least 2.")
    expect_failure(too_few_for_logo, LeaveOneGroupOut(), np.ones(1))

    # One unique group, three requested.
    one_group_msg = (
        "The groups parameter contains fewer than (or equal to) n_groups "
        "(3) numbers of unique groups ([ 1.]). LeavePGroupsOut expects "
        "that at least n_groups + 1 (4) unique groups be present")
    expect_failure(one_group_msg, LeavePGroupsOut(n_groups=3), np.ones(1))

    # Exactly as many unique groups as requested is still too few.
    three_groups_msg = (
        "The groups parameter contains fewer than (or equal to) n_groups "
        "(3) numbers of unique groups ([0 1 2]). LeavePGroupsOut expects "
        "that at least n_groups + 1 (4) unique groups be present")
    expect_failure(three_groups_msg, LeavePGroupsOut(n_groups=3),
                   np.arange(3))
def tmpFUN(dataset, group_label = "groups", n_groups = 2, y_label = "groups", rf_n_estimators = 2000, n_jobs = -1):
    """Fit a balanced random forest on every leave-``n_groups``-out split.

    ``dataset`` is indexed with ``.loc`` / ``.iloc`` and ``.drop``.  For each
    split a ``RandomForestClassifier`` is trained on the training rows, with
    column ``y_label`` as the target and the remaining columns as features,
    and then asked to predict the held-out rows.

    Args:
        dataset: table holding the features, the target column and the
            group column.
        group_label: column whose values define the groups.
        n_groups: number of groups held out per split.
        y_label: column used as the target.
        rf_n_estimators: ``n_estimators`` for the forest.
        n_jobs: ``n_jobs`` for the forest.
    """
    lpgo = LeavePGroupsOut(n_groups=n_groups)
    splits = lpgo.split(X=dataset,
                        y=dataset.loc[:, y_label],
                        groups=dataset.loc[:, group_label])
    for train_index, validate_index in splits:
        trainset = dataset.iloc[train_index, :]
        validateset = dataset.iloc[validate_index, :]

        X_train = trainset.drop(y_label, axis=1)
        y_train = trainset.loc[:, y_label]
        # Features of the held-out rows.  The prediction used to reference
        # an undefined ``X_test`` and so raised NameError.
        X_validate = validateset.drop(y_label, axis=1)

        RF_mod = RandomForestClassifier(n_estimators=rf_n_estimators,
                                        n_jobs=n_jobs,
                                        class_weight="balanced")
        RF_mod.fit(X_train, y_train)
        # NOTE(review): the predictions are neither stored nor returned -
        # confirm what the caller is supposed to receive.
        RF_pred = RF_mod.predict(X_validate)
def split(self):
    """Assemble the splitter factories and delegate to ``self.__split__``.

    When both ``use_test`` and ``use_validation`` are truthy a "3/1/1" split
    is configured: 40% of the normal data is held out and halved by a
    sub-splitter, and two anomaly groups are left out with one group per
    sub-split.  Otherwise a "4/1" split is configured: 20% of the normal
    data and one anomaly group are held out, and the sub-splitter factories
    return ``None``.

    Returns:
        Whatever ``self.__split__`` returns for the assembled arguments.
    """
    complete: bool = self.use_test and self.use_validation
    ratio = '3/1/1' if complete else '4/1'
    logging.info(f"Conducting a {ratio} Split.")

    def normal_splitter():
        held_out = 0.4 if complete else 0.2
        return StratifiedShuffleSplit(n_splits=self.n_splits,
                                      test_size=held_out)

    def normal_sub_splitter():
        if complete:
            return StratifiedShuffleSplit(n_splits=1, test_size=0.5)
        return None

    def anomaly_splitter():
        return LeavePGroupsOut(n_groups=2 if complete else 1)

    def anomaly_sub_splitter():
        if complete:
            return LeavePGroupsOut(n_groups=1)
        return None

    split_args: dict = {
        "make_normal_splitter": normal_splitter,
        "make_normal_sub_splitter": normal_sub_splitter,
        "make_anomaly_splitter": anomaly_splitter,
        "make_anomaly_sub_splitter": anomaly_sub_splitter,
        "use_test": self.use_test,
        "use_validation": self.use_validation,
    }
    return self.__split__(split_args)
def create_cv(x, y, subjects, P):
    """
    Enumerate every leave-P-groups-out split.

    :param x: samples handed to the splitter
    :param y: targets handed to the splitter
    :param subjects: group label of each sample
    :param P: number of groups held out per split
    :return: list of ``(train_index, test_index)`` tuples
    """
    splitter = LeavePGroupsOut(n_groups=P)
    return [(train_index, test_index)
            for train_index, test_index in splitter.split(x, y, subjects)]
def _cv_build(cv_scheme):
    """Turn a CV scheme description into a cross-validator.

    ``cv_scheme`` is ``None`` or a mapping with a ``'type'`` entry:

    * ``None``    -> ``None``
    * ``'kfold'`` -> shuffled ``StratifiedKFold`` with ``n_splits`` taken
      from the scheme (default 6)
    * ``'loso'``  -> ``LeavePGroupsOut(n_groups=1)``

    Raises:
        RuntimeError: for any other scheme.
    """
    LOG.debug('Building CV scheme: %s', str(cv_scheme))
    if cv_scheme is None:
        return None

    scheme_type = cv_scheme.get('type', '')
    if scheme_type == 'kfold':
        return StratifiedKFold(n_splits=cv_scheme.get('n_splits', 6),
                               shuffle=True)
    if scheme_type == 'loso':
        return LeavePGroupsOut(n_groups=1)

    raise RuntimeError('Unknown CV scheme (%s)' % str(cv_scheme))
def test_leave_one_p_group_out():
    """Exercise LeaveOneGroupOut and LeavePGroupsOut on every ``test_groups`` entry.

    Checks the ``repr`` of the splitters, the number of splits, that train
    and test never share a group, that train and test together cover all
    samples, and that each test fold holds exactly ``p_groups_out`` groups.
    """
    logo = LeaveOneGroupOut()
    lpgo_1 = LeavePGroupsOut(n_groups=1)
    lpgo_2 = LeavePGroupsOut(n_groups=2)

    # Make sure the repr works
    assert_equal(repr(logo), 'LeaveOneGroupOut()')
    assert_equal(repr(lpgo_1), 'LeavePGroupsOut(n_groups=1)')
    assert_equal(repr(lpgo_2), 'LeavePGroupsOut(n_groups=2)')
    assert_equal(repr(LeavePGroupsOut(n_groups=3)),
                 'LeavePGroupsOut(n_groups=3)')

    for cv, p_groups_out in ((logo, 1), (lpgo_1, 1), (lpgo_2, 2)):
        for groups_i in test_groups:
            n_groups = len(np.unique(groups_i))
            n_splits = (n_groups if p_groups_out == 1
                        else n_groups * (n_groups - 1) / 2)
            X = y = np.ones(len(groups_i))

            # Test that the length is correct
            assert_equal(cv.get_n_splits(X, y, groups=groups_i), n_splits)

            groups_arr = np.asarray(groups_i)

            # Split using the original list / array / list of string groups_i
            for train, test in cv.split(X, y, groups=groups_i):
                # First test: no train group is in the test set and vice versa
                assert_array_equal(
                    np.intersect1d(groups_arr[train],
                                   groups_arr[test]).tolist(),
                    [])

                # Second test: train and test add up to all the data
                assert_equal(len(train) + len(test), len(groups_i))

                # Third test:
                # The number of groups in test must be equal to p_groups_out.
                # assert_equal is required here: assert_true would take the
                # second argument as the failure message and never compare.
                assert_equal(np.unique(groups_arr[test]).shape[0],
                             p_groups_out)
def make_leave_out(X, y=None, p=5, strategy=None, group=None):
    """Return a generator of leave-``p``-out train/test index pairs.

    With ``strategy == 'group'`` whole groups are held out
    (``LeaveOneGroupOut`` for ``p == 1``, ``LeavePGroupsOut(p)`` otherwise)
    and ``group`` is mandatory.  Any other ``strategy`` holds out individual
    samples (``LeaveOneOut`` for ``p == 1``, ``LeavePOut(p)`` otherwise).

    Raises:
        Exception: if the group strategy is requested without ``group``.
    """
    single = p == 1
    if strategy == 'group':
        # group strategy
        spliter = LeaveOneGroupOut() if single else LeavePGroupsOut(p)
        if group is None:
            raise Exception('Please provide group parameter.')
    else:
        # not specific strategy
        spliter = LeaveOneOut() if single else LeavePOut(p)

    return spliter.split(X, y=y, groups=group)
def grid_search_hyperparams(self, model, data, feature_names, hyperparam_grid, n_leave_out=1):
    """Grid-search ``hyperparam_grid`` with leave-``n_leave_out``-groups-out CV.

    Groups are read from ``data['group_id']``, features from
    ``data[feature_names]`` and the target from ``data[self.target]``.
    Candidates are scored with ``self.metric``.

    Returns:
        The ``best_params_`` of the fitted search.
    """
    features = data[feature_names]
    target = np.ravel(data[self.target])

    splitter = LeavePGroupsOut(n_groups=n_leave_out)
    cv_splits = splitter.split(features, target, groups=data['group_id'])

    search = GridSearchCV(model, hyperparam_grid, cv=cv_splits,
                          scoring=self.metric, n_jobs=-1)
    search.fit(data[feature_names], np.ravel(data[self.target]))
    return search.best_params_
def leave_P_out_iter(data, labels, s_labels, train_grp_animals, num_groups=2, clf=None, **kwargs):
    """Run leave-P-groups-out cross-validation over organism groups.

    Groups come from ``lb.grouping_crossval(s_labels,
    ani_gps=train_grp_animals)``.

    Args:
        num_groups: passed to ``LeavePGroupsOut`` as ``n_groups``.
            NOTE(review): the previous docstring called this "number of
            groups in the train split. Can be 2 or 3 only"; nothing in this
            function enforces that range - confirm.
        clf: when ``None`` only the iterations are produced; otherwise it is
            resolved with ``get_train_function`` and trained on every split.
        **kwargs: forwarded to ``get_train_cv_results``.

    Returns:
        The result of ``get_iterations`` when ``clf`` is ``None``, otherwise
        the result of ``get_train_cv_results``.
    """
    lpgo = LeavePGroupsOut(n_groups=num_groups)
    groups_list = lb.grouping_crossval(s_labels, ani_gps=train_grp_animals)

    # Identity check: ``clf == None`` would go through a custom ``__eq__``
    # if the classifier object defines one.
    if clf is None:
        return get_iterations(data, lpgo, labels, groups_list)

    clf_function = get_train_function(clf)
    return get_train_cv_results(data, labels, lpgo, clf_function,
                                groups_list, **kwargs)
def __init__(self, y, nsuj, pout=1, clf='lda', **clfArg):
    """Set up leave-``pout``-subjects-out classification.

    Stores ``y``, its concatenated and flattened form, ``nsuj`` and
    ``pout``; builds a ``LeavePGroupsOut(pout)`` cross-validator carrying a
    few descriptive attributes; resolves ``clf`` through ``defClf`` when it
    is given as an int or a string; refreshes the info string and creates
    the statistics helper.
    """
    self._y = y
    self._ry = np.ravel(np.concatenate(y))
    self._nsuj = nsuj
    self._pout = pout

    # Manage cross-validation:
    self._cv = cv = LeavePGroupsOut(pout)
    cv.shStr = cv.lgStr = 'Leave %s subjects out' % (pout,)
    cv.rep = 1
    cv.y = y[0]

    # Manage classifier :
    needs_building = isinstance(clf, (int, str))
    self._clf = defClf(self._ry, clf=clf, **clfArg) if needs_building else clf

    # Manage info:
    self._updatestring()

    # Stat tools:
    self.stat = clfstat()
def _cv_split_hold_out_by_subject_using_sklearn(self, tcrrep=None):
    """
    Build hold-one-subject-out train/test partitions with LeavePGroupsOut.

    Parameters
    ----------
    tcrrep : TCRrep class instance
        Provides ``clone_df.subject``, ``clone_df.epitope`` and
        ``paired_tcrregex``.  Defaults to ``self.tcrrep``.

    Returns
    -------
    partitions : generator object
        The generator returned by ``LeavePGroupsOut(n_groups=1).split``.
    """
    if tcrrep is None:
        tcrrep = self.tcrrep
    clones = tcrrep.clone_df

    # `y` target vector: each unique epitope mapped to a number
    epitope_encoder = preprocessing.LabelEncoder()
    epitope_encoder.fit(list(clones.epitope.unique()))
    y = epitope_encoder.transform(clones.epitope)

    # `X` distance matrix (metric = 'precomputed')
    X = tcrrep.paired_tcrregex

    # groups: each unique subject mapped to a number
    subject_encoder = preprocessing.LabelEncoder()
    subject_encoder = subject_encoder.fit(list(clones.subject.unique()))
    groups = list(subject_encoder.transform(clones.subject))

    # Leave P Groups Out
    splitter = LeavePGroupsOut(n_groups=1)
    # The split count is computed here but its value is not used.
    splitter.get_n_splits(X, y, groups)
    return splitter.split(X, y, groups)
def create_splits(self, splits=None):
    """Configure the splitter (or explicit splits) for ``self.mode``.

    * ``'groupkfold'`` - ``self.splitter = GroupKFold(self.n_splits)``
    * ``'loso'``       - ``self.splitter = LeaveOneGroupOut()``
    * ``'lpso'``       - ``self.splitter = LeavePGroupsOut`` leaving out
      ``n_unique(self.si) // self.n_splits`` groups
    * ``'bootstrap'``  - the unique values of ``self.si`` are shuffle-split
      ``self.n_splits`` times; every split is expanded to the matching
      sample positions and stored in ``self.splits``, and ``self.splitter``
      is set to ``None``.

    Any other mode leaves the instance untouched.  The ``splits`` argument
    is not read by this method.
    """
    mode = self.mode
    if mode == 'groupkfold':
        self.splitter = GroupKFold(n_splits=self.n_splits)
    elif mode == 'loso':
        self.splitter = LeaveOneGroupOut()
    elif mode == 'lpso':
        n_left_out = len(np.unique(self.si)) // self.n_splits
        self.splitter = LeavePGroupsOut(n_groups=n_left_out)
    elif mode == 'bootstrap':
        subjects = np.unique(self.si)
        shuffler = ShuffleSplit(n_splits=self.n_splits,
                                test_size=1-self.train_size)

        def positions_of(chosen):
            # Sample positions whose subject id is among ``chosen``.
            return np.nonzero([sid in chosen for sid in self.si])

        self.splits = [
            (positions_of(subjects[train]), positions_of(subjects[test]))
            for train, test in shuffler.split(subjects)
        ]
        self.splitter = None
def split_groups(filenames, labels, groups, size):
    """Pick one leave-``size``-groups-out split, chosen semi-randomly.

    The candidate splits are walked in order; each one is accepted when
    ``random() > 0.95``.  If no candidate is accepted, the last split is
    used.

    Returns:
        ``(train_filenames, test_filenames, train_groups, train_labels)`` as
        arrays.
        NOTE(review): test labels/groups are not returned and the training
        groups come before the training labels - confirm callers expect
        exactly this tuple.
    """
    lpgo = LeavePGroupsOut(n_groups=size)

    # After the loop ``train``/``test`` hold either the accepted split or,
    # when the draw never succeeded, the final split.
    for train, test in lpgo.split(filenames, labels, groups=groups):
        if random() > 0.95:
            break

    filenames_arr = np.array(filenames)
    labels_arr = np.array(labels)
    groups_arr = np.array(groups)
    return (filenames_arr[train], filenames_arr[test],
            groups_arr[train], labels_arr[train])
def GetCVObject(type, **kwargs):
    """Instantiate the cross-validator whose class name equals ``type``.

    ``kwargs`` are forwarded to the constructor, except for ``LeaveOneOut``
    and ``LeaveOneGroupOut``, which are built without arguments.  An
    unrecognised ``type`` yields ``None``.
    """
    # Factories are lazy so that only the requested class is touched.
    factories = (
        ('KFold', lambda: KFold(**kwargs)),
        ('StratifiedKFold', lambda: StratifiedKFold(**kwargs)),
        ('GroupKFold', lambda: GroupKFold(**kwargs)),
        ('ShuffleSplit', lambda: ShuffleSplit(**kwargs)),
        ('StratifiedShuffleSplit', lambda: StratifiedShuffleSplit(**kwargs)),
        ('GroupShuffleSplit', lambda: GroupShuffleSplit(**kwargs)),
        ('LeaveOneOut', lambda: LeaveOneOut()),
        ('LeavePOut', lambda: LeavePOut(**kwargs)),
        ('LeaveOneGroupOut', lambda: LeaveOneGroupOut()),
        ('LeavePGroupsOut', lambda: LeavePGroupsOut(**kwargs)),
    )
    for name, build in factories:
        if type == name:
            return build()
    return None
def construct_exp_splits(feature_frame, leave_n_out=1):
    """
    Build the (train, test) splits that hold out whole experiments.

    Experiments are identified by level 0 of ``feature_frame``'s index, and
    the returned indices are integer positions into ``feature_frame.values``
    (not index labels).

    Input:
        feature_frame : DataFrame
            A pandas dataframe returned by extract_features_targets
            representing multiple experiments
        leave_n_out : int
            The number of experiments to leave out in each cross
            validation fold
    Returns:
        [(Array, Array)]
            A list of (train index, test index) splits
    """
    experiment_ids = feature_frame.index.get_level_values(0)
    splitter = LeavePGroupsOut(n_groups=leave_n_out)

    cv_splits = []
    for train_index, test_index in splitter.split(feature_frame.values,
                                                  groups=experiment_ids):
        cv_splits.append((train_index, test_index))
    return cv_splits
def test_grid_search_groups(self):
    """TuneGridSearchCV must hand ``groups`` through to group-aware splitters.

    Fitting without ``groups`` must either log a message on the "ray.tune"
    logger or raise a ``ValueError``, in both cases mentioning "parameter
    should not be None".  Fitting with ``groups`` must succeed.  Splitters
    that do not use groups must fit without them.
    """
    X, y = make_classification(n_samples=15, n_classes=2, random_state=0)
    groups = np.random.RandomState(0).randint(0, 3, 15)
    estimator = LinearSVC(random_state=0)
    param_grid = {"C": [1]}

    group_aware = (
        LeaveOneGroupOut(),
        LeavePGroupsOut(2),
        GroupKFold(n_splits=3),
        GroupShuffleSplit(n_splits=3),
    )
    for splitter in group_aware:
        search = TuneGridSearchCV(estimator, param_grid, cv=splitter)
        try:
            with self.assertLogs("ray.tune") as cm:
                search.fit(X, y)
            self.assertTrue(
                ("parameter should not be None.") in str(cm.output))
        except ValueError as exc:
            self.assertTrue("parameter should not be None" in str(exc))

        search.fit(X, y, groups=groups)

    group_agnostic = (
        StratifiedKFold(n_splits=3),
        StratifiedShuffleSplit(n_splits=3),
    )
    for splitter in group_agnostic:
        # Should not raise an error
        TuneGridSearchCV(estimator, param_grid, cv=splitter).fit(X, y)
def fit(self):
    """Grid-search a random forest and keep the fitted search.

    Does nothing beyond logging when ``self._pickled`` is truthy.  Otherwise
    a ``RobustGridSearchCV`` over ``self.param['rfc']`` is fitted on the
    sample from ``self._generate_sample()``, scored with ROC AUC and
    validated with ``LeavePGroupsOut(n_groups=1)``.  The fitted search is
    stored in ``self._estimator``.
    """
    from sklearn.ensemble import RandomForestClassifier as RFC

    if self._pickled:
        LOG.info('Classifier was loaded from file, cancelling fitting.')
        return

    LOG.info('Start fitting ...')
    forest = RFC()
    rfc_grid = self.param['rfc']
    search = RobustGridSearchCV(
        forest,
        rfc_grid,
        error_score=0.5,
        refit=True,
        scoring=check_scoring(forest, scoring='roc_auc'),
        n_jobs=self.n_jobs,
        cv=LeavePGroupsOut(n_groups=1),
        verbose=0,
    )

    X, y, groups = self._generate_sample()
    self._estimator = search.fit(X, y, groups=groups)

    LOG.info('Model selection - best parameters (roc_auc=%f) %s',
             search.best_score_, search.best_params_)
def get_model():
    """Build the scaler -> PCA -> Lasso pipeline and its grid search.

    The grid search tunes ``estimator__alpha`` over the powers of ten from
    ``1e-5`` to ``1e5``, scores with ``nmse_loss`` (lower is better) and
    validates with ``LeavePGroupsOut(n_groups=2)``.

    Returns:
        ``(model, selector)`` - the unfitted pipeline and the unfitted
        ``GridSearchCV`` wrapping it.
    """
    model = Pipeline([
        ('scaler', StandardScaler()),
        ('reducer', PCA(n_components=0.95, svd_solver="full")),
        ('estimator', Lasso(fit_intercept=True, normalize=False,
                            precompute=False, random_state=5)),
    ])

    alphas = np.power(10., np.arange(-5, 6))
    selector = GridSearchCV(
        estimator=model,
        param_grid={"estimator__alpha": alphas},
        scoring=make_scorer(nmse_loss, greater_is_better=False),
        cv=LeavePGroupsOut(n_groups=2),
        n_jobs=4,
        refit=True,
    )
    return model, selector
# ===== Group k-fold, leave-one-group-out, leave-P-groups-out and Group Shuffle Split =====
# Each splitter below is run on the same ten samples and prints its
# train/test index arrays, one line per split.
X = [0.1, 0.2, 2.2, 2.4, 2.3, 4.55, 5.8, 8.8, 9, 10]
y = ["a", "b", "b", "b", "c", "c", "c", "d", "d", "d"]
groups = [1, 1, 1, 2, 2, 2, 3, 3, 3, 3]

# Group k-fold: the training set and the test set belong to different groups
gkf = GroupKFold(n_splits=3)
for train, test in gkf.split(X, y, groups=groups):
    print("组 k-fold分割:%s %s" % (train, test))

# Leave one group out
logo = LeaveOneGroupOut()
for train, test in logo.split(X, y, groups=groups):
    print("留一组分割:%s %s" % (train, test))

# Leave P groups out
lpgo = LeavePGroupsOut(n_groups=2)
for train, test in lpgo.split(X, y, groups=groups):
    print("留 P 组分割:%s %s" % (train, test))

# Random group split
gss = GroupShuffleSplit(n_splits=4, test_size=0.5, random_state=0)
for train, test in gss.split(X, y, groups=groups):
    print("随机分割:%s %s" % (train, test))

# ===== Time series split =====
# A stray expression that built and discarded a second TimeSeriesSplit
# stood here; it had no effect and was dropped.
tscv = TimeSeriesSplit(n_splits=3)
for train, test in tscv.split(iris.data):
    print("时间序列分割:%s %s" % (train, test))
def supervised_pipeline(label_struct, baseline_label_struct, pipe, scaler, param_grid, patient_list, feature_slot, hyper_param, plot_eda_all_new, learn_flag, compute_all_new, n_jobs ):
    """Run the supervised pipeline: prepare folders, load data, learn, report.

    Steps, in order:
      1. Fix the pipeline constants (disk paths, lead list, grouping keys).
      2. Create or look up the exploratory-data-analysis directory and the
         nested classification directory via
         ``prepare_disk_space_hyper_param_results``.
      3. Optionally call ``plot_eda`` (when ``plot_eda_all_new`` is truthy).
      4. Load the data with ``interim_process``.
      5. When ``learn_flag`` is truthy, run
         ``cv.nested_cross_validation`` with ``LeavePGroupsOut(n_groups=1)``
         for both the outer and the inner loop, ``GridSearchCV`` as the
         search function and ``'f1_micro'`` scoring.
      6. Attach the data group to every learning result, build the
         classification report and write it to
         ``classification_report.h5`` inside the classification directory.

    This is Python 2 code (print statements).

    NOTE(review): the original indentation of this function was lost; the
    nesting below (in particular where ``if learn_flag:`` ends) is a
    reconstruction - confirm against the original file.
    """
    # State the parameters of the pipeline
    disk = '/mnt/Seagate/pre_epi_seizures/'
    baseline_files = 'h5_files/processing_datasets/baseline_datasets_new'
    seizure_files = 'h5_files/processing_datasets/seizure_datasets_new'
    lead_list = ['ECG-']
    interim_processing = [scaler]
    # The next seven names are assigned but not referenced again in this
    # function.
    hist_bins = None
    dist = None
    flag_hist = True
    flag_andrews = True
    flag_series = True
    flag_box = True
    flag_pair = True
    assign_baseline = 'assign_equal_baseline_seizure'
    general_dir = disk + 'EDanalysis_new/'

    # choose data grouping
    group_keys= ['patient_nr', 'seizure_nr', 'types_of_seizure', 'location']
    group_id = 'seizure_nr'

    # Get initial directory, for Exploratory Data Analysis
    eda_dir = prepare_disk_space_hyper_param_results(directory=general_dir , patient_list = patient_list, lead_list = lead_list, scaler = scaler, interim_processing = interim_processing, assign_baseline = assign_baseline, label_struct = label_struct, baseline_label_struct = baseline_label_struct, feature_slot=feature_slot, group_id=group_id, hyper_param=hyper_param)

    # information for sklearn labeling
    label = 'label'

    # define cross-validation strategy
    cv_out = LeavePGroupsOut(n_groups=1)
    cv_in = LeavePGroupsOut(n_groups=1)

    # choose scoring
    scoring = ['f1_micro']

    # choose hyperparameter search function
    search_function = GridSearchCV

    # get steps of the pipeline
    pipe_steps = [step[0] for step in pipe.steps]

    # Get directory (should be nested) to save classification objects
    classification_dir = prepare_disk_space_hyper_param_results(directory=eda_dir, pipe = str(pipe_steps), param_grid = param_grid, cv_out = cv_out, cv_in = cv_in, scoring = scoring, search_function = search_function, label=label)

    if plot_eda_all_new:
        # NOTE(review): ``data_groups_list`` is not assigned anywhere in this
        # function; it has to come from module scope - confirm.
        plot_eda(directory=classification_dir, data_groups_list=data_groups_list)

    import classification.cross_validation as cv

    # ***********************************Learning****************************
    # Learn from data_struct using nested cross_validation
    # learning is an optimization and respective test results
    # for each partition of the dataset according to cv_out

    # Load the data, according to specification (loading made by convert pandas)
    data_struct = interim_process(disk, seizure_files, baseline_files, feature_slot, hyper_param, patient_list, lead_list, label_struct, baseline_label_struct, interim_processing)
    data = data_struct[0]
    features = data_struct[1]
    # ``meta_features`` is not referenced again in this function.
    meta_features = data_struct[2]

    if learn_flag:
        # prepare data for classification - watch out for memory concerns
        X = data[features]
        y = data[label]
        groups = data[group_id]
        learning_results = cv.nested_cross_validation(classification_dir, X,y, groups, pipe, param_grid, scoring, compute_all_new, cv_out, cv_in, search_function, n_jobs=n_jobs)
    #************************************************************************

    # NOTE(review): with the nesting reconstructed above,
    # ``learning_results`` is only bound when ``learn_flag`` is truthy -
    # confirm whether the code below was originally inside that branch.
    print 'These are the learning results'
    print learning_results

    # get data groups
    data_groups = data.groupby(group_keys)
    groups = data_groups.groups.keys()

    # Tag each learning result with a data group, pairing the two sequences
    # positionally.
    for learning_result, group in zip(learning_results, groups):
        learning_result['group'] = group
        learning_result['group_keys'] = group_keys

    cv_object = learning_results
    report = cv.generate_classification_report(cv_object)
    report.to_hdf(classification_dir + 'classification_report.h5', '/report' )
    print report
    print 'Done!'