Exemple #1
0
def test_stratified_kfold_ratios():
    """Check that every training fold keeps a mean close to the overall mean.

    Labels are drawn as ``randn * 20 + 50``, so each training split produced
    by ``KFoldStratified`` is expected to have a mean within [47, 53].
    """
    n_folds = 5
    y = pd.DataFrame(np.random.randn(1000)) * 20 + 50
    splitter = KFoldStratified(n_splits=n_folds)
    for train_idx, _held_out_idx in splitter.split(np.zeros(len(y)), y):
        # Only the training portion of each fold is inspected.
        train_mean = y.iloc[train_idx].mean()[0]
        assert 47 <= train_mean <= 53
def set_cv(Y=None, cv_dict=None):
    """Build a scikit-learn compatible cross-validation split generator.

    The splitter is chosen from the keys present in ``cv_dict``; ``Y`` is
    used for its length (to build a placeholder ``X``) and is passed to the
    splitter's ``split`` call as ``y``.

    Args:
        Y:  (pd.DataFrame) Pandas Dataframe of Y labels
        cv_dict: (dict) Type of cross_validation to use. A dictionary of
            {'type': 'kfolds', 'n_folds': n},
            {'type': 'kfolds', 'n_folds': n, 'stratified': Y},
            {'type': 'kfolds', 'n_folds': n, 'subject_id': holdout}, or
            {'type': 'loso', 'subject_id': holdout}
    Returns:
        cv: the object returned by the selected splitter's ``split`` call

    Raises:
        ValueError: if ``cv_dict`` is not a dictionary, or if its 'type'
            entry is neither 'kfolds' nor 'loso'.

    """
    if not isinstance(cv_dict, dict):
        raise ValueError("Make sure 'cv_dict' is a dictionary.")

    scheme = cv_dict['type']

    if scheme == 'kfolds':
        if 'subject_id' in cv_dict:
            # Keep all rows of a subject together inside a single fold.
            from sklearn.model_selection import GroupKFold
            splitter = GroupKFold(n_splits=cv_dict['n_folds'])
            return splitter.split(X=np.zeros(len(Y)),
                                  y=Y,
                                  groups=cv_dict['subject_id'])

        if 'stratified' in cv_dict:
            # Stratified K-Folds for continuous labels.
            from nltools.cross_validation import KFoldStratified
            splitter = KFoldStratified(n_splits=cv_dict['n_folds'])
        else:
            # Plain K-Folds.
            from sklearn.model_selection import KFold
            splitter = KFold(n_splits=cv_dict['n_folds'])
        return splitter.split(X=np.zeros(len(Y)), y=Y)

    if scheme == 'loso':
        # Leave one subject out per split.
        from sklearn.model_selection import LeaveOneGroupOut
        splitter = LeaveOneGroupOut()
        return splitter.split(X=np.zeros(len(Y)),
                              y=Y,
                              groups=cv_dict['subject_id'])

    raise ValueError("""Make sure you specify a dictionary of
                            {'type': 'kfolds', 'n_folds': n},
                            {'type': 'kfolds', 'n_folds': n, 'stratified': Y},
                            {'type': 'kfolds', 'n_folds': n,
                            'subject_id': holdout}, or {'type': 'loso',
                            'subject_id': holdout}, where n = number of folds,
                            and subject = vector of subject ids that
                            corresponds to self.Y""")
Exemple #3
0
def set_cv(Y=None, cv_dict=None, return_generator=True):
    """Create a scikit-learn compatible cross-validation object.

    A splitter instance is selected from the keys of ``cv_dict`` and its
    ``split`` method is called with a placeholder ``X`` of ``len(Y)`` zeros.
    Either the result of that call or the splitter instance is returned.

    Args:
        Y:  (pd.DataFrame) Pandas Dataframe of Y labels
        cv_dict: (dict) Type of cross_validation to use. A dictionary of
            {'type': 'kfolds', 'n_folds': n},
            {'type': 'kfolds', 'n_folds': n, 'stratified': Y},
            {'type': 'kfolds', 'n_folds': n, 'subject_id': holdout}, or
            {'type': 'loso', 'subject_id': holdout}
        return_generator (bool): return the result of ``split`` instead of
            the splitter instance; default True
    Returns:
        cv: the ``split`` result when ``return_generator`` is true, otherwise
            the splitter instance itself

    Raises:
        ValueError: if ``cv_dict`` is not a dictionary, or if its 'type'
            entry is neither 'kfolds' nor 'loso'.

    """
    if not isinstance(cv_dict, dict):
        raise ValueError("Make sure 'cv_dict' is a dictionary.")

    kind = cv_dict["type"]
    grouped = False  # whether split() must receive the subject ids as groups

    if kind == "kfolds":
        if "subject_id" in cv_dict:
            # Hold out whole subjects within each fold.
            from sklearn.model_selection import GroupKFold

            splitter_cls = GroupKFold
            grouped = True
        elif "stratified" in cv_dict:
            # Stratified K-Folds for continuous labels.
            from nltools.cross_validation import KFoldStratified

            splitter_cls = KFoldStratified
        else:
            # Plain K-Folds.
            from sklearn.model_selection import KFold

            splitter_cls = KFold
        cv_inst = splitter_cls(n_splits=cv_dict["n_folds"])
    elif kind == "loso":
        # Leave one subject out per split.
        from sklearn.model_selection import LeaveOneGroupOut

        cv_inst = LeaveOneGroupOut()
        grouped = True
    else:
        raise ValueError("""Make sure you specify a dictionary of
                            {'type': 'kfolds', 'n_folds': n},
                            {'type': 'kfolds', 'n_folds': n, 'stratified': Y},
                            {'type': 'kfolds', 'n_folds': n,
                            'subject_id': holdout}, or {'type': 'loso',
                            'subject_id': holdout}, where n = number of folds,
                            and subject = vector of subject ids that
                            corresponds to self.Y""")

    # split() is invoked regardless of return_generator, so its arguments
    # are always evaluated.
    placeholder_X = np.zeros(len(Y))
    if grouped:
        cv = cv_inst.split(X=placeholder_X, y=Y, groups=cv_dict["subject_id"])
    else:
        cv = cv_inst.split(X=placeholder_X, y=Y)

    return cv if return_generator else cv_inst
Exemple #4
0
def test_kfoldstratified():
    """Run ``check_cv_coverage`` on ``KFoldStratified`` for two sample sizes.

    The sizes are 50 (a multiple of the fold count) and 51 (not a multiple),
    each split into 5 folds.
    """
    n_folds = 5
    for n_samples in (50, 51):
        y = pd.DataFrame(np.random.randn(n_samples)) * 20 + 50
        cv = KFoldStratified(n_splits=n_folds)
        check_cv_coverage(cv,
                          X=np.zeros(len(y)),
                          y=y,
                          groups=None,
                          expected_n_splits=n_folds)
Exemple #5
0
def set_cv(cv_dict):
    """ Helper function to create a sci-kit learn compatible cv object using common parameters for prediction analyses.

    The caller's ``cv_dict`` is only read, never modified.

    Args:
        cv_dict: (dict, or dict subclass) Type of cross_validation to use. A dictionary of
            {'type': 'kfolds', 'n_folds': n},
            {'type': 'kfolds', 'n_folds': n, 'stratified': Y},
            {'type': 'kfolds', 'n_folds': n, 'subject_id': holdout}, or
            {'type': 'loso', 'subject_id': holdout}
            The plain K-Folds form additionally reads cv_dict['n'], which is
            passed to KFold as its ``n`` argument.
    Returns:
        cv: a scikit-learn cross-validation instance

    Raises:
        ValueError: if ``cv_dict`` is not a dictionary, or if its 'type'
            entry is neither 'kfolds' nor 'loso'.

     """

    if not isinstance(cv_dict, dict):
        raise ValueError("Make sure 'cv_dict' is a dictionary.")

    if cv_dict['type'] == 'kfolds':
        if 'subject_id' in cv_dict:
            # Hold out subjects within each fold
            from nltools.cross_validation import KFoldSubject
            cv = KFoldSubject(len(cv_dict['subject_id']),
                              cv_dict['subject_id'],
                              n_folds=cv_dict['n_folds'])
        elif 'stratified' in cv_dict:
            # Stratified K-Folds
            from nltools.cross_validation import KFoldStratified
            strata = cv_dict['stratified']
            if isinstance(strata, pd.DataFrame):
                # need to pass numpy array not pandas; convert into a local
                # so the caller's dictionary keeps its original DataFrame
                strata = np.array(strata).flatten()
            cv = KFoldStratified(strata, n_folds=cv_dict['n_folds'])
        else:
            # Normal K-Folds
            # NOTE(review): sklearn.cross_validation is the legacy module
            # path -- confirm the installed scikit-learn still provides it.
            from sklearn.cross_validation import KFold
            cv = KFold(n=cv_dict['n'], n_folds=cv_dict['n_folds'])
    elif cv_dict['type'] == 'loso':
        # Leave One Subject Out
        from nltools.cross_validation import LeaveOneSubjectOut
        cv = LeaveOneSubjectOut(len(cv_dict['subject_id']),
                                labels=cv_dict['subject_id'])
    else:
        raise ValueError("""Make sure you specify a dictionary of
            {'type': 'kfolds', 'n_folds': n},
            {'type': 'kfolds', 'n_folds': n, 'stratified': Y},
            {'type': 'kfolds', 'n_folds': n, 'subject_id': holdout}, or
            {'type': 'loso', 'subject_id': holdout},
            where n = number of folds, and subject = vector of subject ids that corresponds to self.Y"""
                         )
    return cv