def test_stratified_kfold_ratios():
    """Check that every KFoldStratified training fold keeps its mean near 50.

    ``y`` is 1000 standard-normal draws scaled by 20 and shifted by 50; the
    mean of each training split must fall inside the closed range [47, 53].
    """
    n_folds = 5
    y = pd.DataFrame(np.random.randn(1000)) * 20 + 50
    splitter = KFoldStratified(n_splits=n_folds)
    # X is only a placeholder with one entry per row of y.
    placeholder_X = np.zeros(len(y))
    for train, _ in splitter.split(placeholder_X, y):
        # y has a single column labelled 0, so [0] picks that column's mean.
        train_mean = y.iloc[train].mean()[0]
        assert 47 <= train_mean <= 53
def set_cv(Y=None, cv_dict=None):
    """Helper function to create a sci-kit learn compatible cv object using
    common parameters for prediction analyses.

    Args:
        Y: (pd.DataFrame) Pandas Dataframe of Y labels. Its length sets the
            number of samples that are split.
        cv_dict: (dict) Type of cross_validation to use. A dictionary of
            {'type': 'kfolds', 'n_folds': n},
            {'type': 'kfolds', 'n_folds': n, 'stratified': Y},
            {'type': 'kfolds', 'n_folds': n, 'subject_id': holdout}, or
            {'type': 'loso', 'subject_id': holdout}

    Returns:
        cv: a scikit-learn model-selection generator (the result of calling
            ``split`` on the selected splitter)

    Raises:
        ValueError: if ``cv_dict`` is not a dictionary, or if its 'type'
            entry is missing or is neither 'kfolds' nor 'loso'.

    """
    if not isinstance(cv_dict, dict):
        raise ValueError("Make sure 'cv_dict' is a dictionary.")

    # .get() so that a dictionary without a 'type' key reaches the descriptive
    # ValueError below instead of surfacing as a bare KeyError.
    cv_type = cv_dict.get('type')

    if cv_type == 'kfolds':
        if 'subject_id' in cv_dict:
            # Hold out subjects within each fold
            from sklearn.model_selection import GroupKFold
            gkf = GroupKFold(n_splits=cv_dict['n_folds'])
            cv = gkf.split(X=np.zeros(len(Y)), y=Y,
                           groups=cv_dict['subject_id'])
        elif 'stratified' in cv_dict:
            # Stratified K-Folds Continuous
            from nltools.cross_validation import KFoldStratified
            kfs = KFoldStratified(n_splits=cv_dict['n_folds'])
            cv = kfs.split(X=np.zeros(len(Y)), y=Y)
        else:
            # Normal K-Folds
            from sklearn.model_selection import KFold
            kf = KFold(n_splits=cv_dict['n_folds'])
            cv = kf.split(X=np.zeros(len(Y)), y=Y)
    elif cv_type == 'loso':
        # Leave One Subject Out
        from sklearn.model_selection import LeaveOneGroupOut
        loso = LeaveOneGroupOut()
        cv = loso.split(X=np.zeros(len(Y)), y=Y,
                        groups=cv_dict['subject_id'])
    else:
        raise ValueError(
            "Make sure you specify a dictionary of "
            "{'type': 'kfolds', 'n_folds': n}, "
            "{'type': 'kfolds', 'n_folds': n, 'stratified': Y}, "
            "{'type': 'kfolds', 'n_folds': n, 'subject_id': holdout}, "
            "or {'type': 'loso', 'subject_id': holdout}, "
            "where n = number of folds, and subject = vector of "
            "subject ids that corresponds to self.Y"
        )
    return cv
def set_cv(Y=None, cv_dict=None, return_generator=True):
    """Helper function to create a sci-kit learn compatible cv object using
    common parameters for prediction analyses.

    Args:
        Y: (pd.DataFrame) Pandas Dataframe of Y labels. Only needed when
            ``return_generator`` is True, where its length sets the number
            of samples that are split.
        cv_dict: (dict) Type of cross_validation to use. A dictionary of
            {'type': 'kfolds', 'n_folds': n},
            {'type': 'kfolds', 'n_folds': n, 'stratified': Y},
            {'type': 'kfolds', 'n_folds': n, 'subject_id': holdout}, or
            {'type': 'loso', 'subject_id': holdout}
        return_generator (bool): return a cv generator instead of an
            instance; default True

    Returns:
        cv: a scikit-learn model-selection generator when
            ``return_generator`` is True, otherwise the splitter instance
            itself.

    Raises:
        ValueError: if ``cv_dict`` is not a dictionary, or if its 'type'
            entry is missing or is neither 'kfolds' nor 'loso'.

    """
    if not isinstance(cv_dict, dict):
        raise ValueError("Make sure 'cv_dict' is a dictionary.")

    # .get() so that a dictionary without a 'type' key reaches the descriptive
    # ValueError below instead of surfacing as a bare KeyError.
    cv_type = cv_dict.get("type")
    # Whether split() must be given groups=cv_dict["subject_id"].
    needs_groups = False

    if cv_type == "kfolds":
        if "subject_id" in cv_dict:
            # Hold out subjects within each fold
            from sklearn.model_selection import GroupKFold
            cv_inst = GroupKFold(n_splits=cv_dict["n_folds"])
            needs_groups = True
        elif "stratified" in cv_dict:
            # Stratified K-Folds Continuous
            from nltools.cross_validation import KFoldStratified
            cv_inst = KFoldStratified(n_splits=cv_dict["n_folds"])
        else:
            # Normal K-Folds
            from sklearn.model_selection import KFold
            cv_inst = KFold(n_splits=cv_dict["n_folds"])
    elif cv_type == "loso":
        # Leave One Subject Out
        from sklearn.model_selection import LeaveOneGroupOut
        cv_inst = LeaveOneGroupOut()
        needs_groups = True
    else:
        raise ValueError(
            "Make sure you specify a dictionary of "
            "{'type': 'kfolds', 'n_folds': n}, "
            "{'type': 'kfolds', 'n_folds': n, 'stratified': Y}, "
            "{'type': 'kfolds', 'n_folds': n, 'subject_id': holdout}, "
            "or {'type': 'loso', 'subject_id': holdout}, "
            "where n = number of folds, and subject = vector of "
            "subject ids that corresponds to self.Y"
        )

    if not return_generator:
        # The bare instance does not depend on Y, so do not touch Y here;
        # building the generator first would fail on the default Y=None.
        return cv_inst

    # X is only a placeholder with one entry per row of Y.
    placeholder_X = np.zeros(len(Y))
    if needs_groups:
        return cv_inst.split(X=placeholder_X, y=Y, groups=cv_dict["subject_id"])
    return cv_inst.split(X=placeholder_X, y=Y)
def test_kfoldstratified():
    """Run check_cv_coverage on KFoldStratified for 50 and for 51 samples.

    With 5 folds, 50 samples divide evenly and 51 do not, so both cases are
    passed through the same coverage check.
    """
    n_folds = 5
    for n_samples in (50, 51):
        y = pd.DataFrame(np.random.randn(n_samples)) * 20 + 50
        splitter = KFoldStratified(n_splits=n_folds)
        check_cv_coverage(
            splitter,
            X=np.zeros(len(y)),
            y=y,
            groups=None,
            expected_n_splits=n_folds,
        )
def set_cv(cv_dict):
    """Helper function to create a sci-kit learn compatible cv object using
    common parameters for prediction analyses.

    Args:
        cv_dict: Type of cross_validation to use. A dictionary of
            {'type': 'kfolds', 'n_folds': n},
            {'type': 'kfolds', 'n_folds': n, 'stratified': Y},
            {'type': 'kfolds', 'n_folds': n, 'subject_id': holdout}, or
            {'type': 'loso', 'subject_id': holdout}.
            The plain K-Folds form (no 'stratified' or 'subject_id' entry)
            additionally reads an 'n' entry that is passed to KFold as ``n``.
            The dictionary is not modified.

    Returns:
        cv: a scikit-learn cross-validation instance

    Raises:
        ValueError: if ``cv_dict`` is not a dictionary, or if its 'type'
            entry is missing or is neither 'kfolds' nor 'loso'.

    """
    if not isinstance(cv_dict, dict):
        raise ValueError("Make sure 'cv_dict' is a dictionary.")

    # .get() so that a dictionary without a 'type' key reaches the descriptive
    # ValueError below instead of surfacing as a bare KeyError.
    cv_type = cv_dict.get('type')

    if cv_type == 'kfolds':
        if 'subject_id' in cv_dict:
            # Hold out subjects within each fold
            from nltools.cross_validation import KFoldSubject
            subject_id = cv_dict['subject_id']
            cv = KFoldSubject(len(subject_id), subject_id,
                              n_folds=cv_dict['n_folds'])
        elif 'stratified' in cv_dict:
            # Stratified K-Folds
            from nltools.cross_validation import KFoldStratified
            strata = cv_dict['stratified']
            if isinstance(strata, pd.DataFrame):
                # need to pass numpy array not pandas; convert into a local
                # so the caller's dictionary keeps its original DataFrame.
                strata = np.array(strata).flatten()
            cv = KFoldStratified(strata, n_folds=cv_dict['n_folds'])
        else:
            # Normal K-Folds
            # NOTE(review): sklearn.cross_validation is the legacy module that
            # was removed in scikit-learn 0.20; this branch needs an older
            # scikit-learn release.
            from sklearn.cross_validation import KFold
            cv = KFold(n=cv_dict['n'], n_folds=cv_dict['n_folds'])
    elif cv_type == 'loso':
        # Leave One Subject Out
        from nltools.cross_validation import LeaveOneSubjectOut
        subject_id = cv_dict['subject_id']
        cv = LeaveOneSubjectOut(len(subject_id), labels=subject_id)
    else:
        raise ValueError(
            "Make sure you specify a dictionary of "
            "{'type': 'kfolds', 'n_folds': n}, "
            "{'type': 'kfolds', 'n_folds': n, 'stratified': Y}, "
            "{'type': 'kfolds', 'n_folds': n, 'subject_id': holdout}, "
            "or {'type': 'loso', 'subject_id': holdout}, "
            "where n = number of folds, and subject = vector of "
            "subject ids that corresponds to self.Y"
        )
    return cv