    def checkme(keep_discrete_columns):
        """Utility"""
        _, _, df = mock_problem()
        prob = Problem(df, ['gene1', 'gender'], 'disease', 'yes')
        vectorized_prob = prob.vectorize(
            keep_discrete_columns=keep_discrete_columns)
        print(vectorized_prob.dataframe)
        nose.tools.eq_(vectorized_prob.outcome_column, prob.outcome_column)
        nose.tools.eq_(vectorized_prob.positive_outcome, prob.positive_outcome)
        np.testing.assert_array_equal(vectorized_prob.y, prob.y)

        if keep_discrete_columns:
            expected_columns = [
                'gene1', 'gene2', 'disease', 'gender', 'gender=male',
                'gender=female'
            ]
            nose.tools.assert_list_equal(
                list(vectorized_prob.dataframe['gender']),
                list(prob.dataframe['gender']))
        else:
            expected_columns = [
                'gene1', 'gene2', 'disease', 'gender=male', 'gender=female'
            ]

        nose.tools.assert_list_equal(
            sorted(list(vectorized_prob.dataframe.columns)),
            sorted(expected_columns))
        nose.tools.assert_list_equal(vectorized_prob.features,
                                     ['gender=female', 'gender=male', 'gene1'])
        np.testing.assert_almost_equal(vectorized_prob.X,
                                       np.asarray([[0, 1, 0.0], [0, 1, 0.2],
                                                   [1, 0, 0.4], [1, 0, 0.6],
                                                   [1, 0, 0.8]]),
                                       decimal=10)
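# The assertions in the vectorization utility above and in test_problem_creation
# below pin down most of what the shared mock_problem() fixture must return. The
# real helper is not part of this excerpt; the sketch below is a hypothetical
# reconstruction (the unused middle return value and the gene2 values are
# assumptions), relying on the module-level pandas import used throughout.
def mock_problem():
    """Hypothetical reconstruction of the three-value fixture used by the Problem tests"""
    feat_df = pd.DataFrame(
        {
            'gene1': [0.0, 0.2, 0.4, 0.6, 0.8],
            'gene2': [0.1, 0.3, 0.5, 0.7, 0.9],  # assumed values; only the column name is asserted
        },
        index=['S-0', 'S-1', 'S-2', 'S-3', 'S-4'])
    combined_df = feat_df.copy()
    combined_df['disease'] = ['yes', 'no', 'no', 'yes', 'yes']
    combined_df['gender'] = ['male', 'male', 'female', 'female', 'female']
    return feat_df, None, combined_df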
def test_problem_creation():
    """Validates that Problem instances behave as expected"""
    feat_df, _, combined_df = mock_problem()
    prob = Problem(combined_df, feat_df.columns, 'disease', 'yes')
    nose.tools.eq_(prob.n_features, 2)
    nose.tools.eq_(prob.n_samples, 5)
    nose.tools.assert_list_equal(prob.sample_ids,
                                 ['S-0', 'S-1', 'S-2', 'S-3', 'S-4'])
    np.testing.assert_array_equal(prob.y, [1, 0, 0, 1, 1])
    np.testing.assert_array_equal(prob.dataframe.values, combined_df.values)
    nose.tools.eq_(prob.X.shape[0], prob.n_samples)
    nose.tools.eq_(prob.X.shape[1], prob.n_features)
    np.testing.assert_array_equal(prob.X, feat_df.values)

    # Try subsetting features and a different outcome variable
    sub_prob = Problem(combined_df, ['gene2'], 'gender', 'male')
    nose.tools.eq_(sub_prob.n_features, 1)
    nose.tools.eq_(sub_prob.n_samples, 5)
    nose.tools.assert_list_equal(sub_prob.sample_ids,
                                 ['S-0', 'S-1', 'S-2', 'S-3', 'S-4'])
    np.testing.assert_array_equal(sub_prob.y, [1, 1, 0, 0, 0])
    np.testing.assert_array_equal(sub_prob.dataframe.values,
                                  combined_df.values)
    nose.tools.eq_(sub_prob.X.shape[0], sub_prob.n_samples)
    nose.tools.eq_(sub_prob.X.shape[1], sub_prob.n_features)
    np.testing.assert_array_equal(sub_prob.X.ravel(), feat_df.values[:, 1])
    def checkme(fraction):
        """Tests learning curve CV downsampling
        :param fraction: Float/double in [0,1] (inclusive on both ends) - sampling rate for CV generation
        """
        problem_size = 1000
        prob = Problem(mock_frame('A', problem_size), ['f1', 'f2'], 'y', 1)
        cv_gen = CVSplitGenerator(prob,
                                  10,
                                  2,
                                  random_state=np.random.RandomState(0xC0FFEE))
        sanity_check_cv_generator(prob, cv_gen)
        learning_curve = LearningCurveCVGenerator(
            fraction, cv_gen, random_state=cv_gen.random_state)

        train_occurrences = Counter()
        for train, test in learning_curve:
            nose.tools.eq_(len(train), int(
                problem_size * 0.9 *
                fraction))  # subsampled 900/100 split (10 folds)
            nose.tools.eq_(len(test), int(
                problem_size *
                0.1))  # verify that test set is 1/10 of the problem size

            train_occurrences.update(train.sample_ids)

        # This is a fairly weak test, but in general it's difficult to predict how many unique train samples we'll see,
        # especially when both the subsampling fraction and the total number of splits are small. It does protect
        # us against truly terrible bugs though, e.g. if we accidentally return the same training set over and over.
        nose.tools.assert_greater_equal(len(set(train_occurrences.keys())),
                                        problem_size * fraction)
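    # Hypothetical driver for the utility above (the real parametrization is not
    # part of this excerpt), using the nose test-generator pattern to sweep a few
    # representative sampling fractions.
    for fraction in [0.2, 0.5, 1.0]:
        yield checkme, fraction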
def test_no_column_overwrite():
    """Validates that we don't overwrite input values if the input contains NaNs in discrete columns"""
    df = pd.DataFrame({
        'A': ['a', 'aa', float('nan')],
        'B': ['b', 'bb', 'bbb'],
        'y': [0, 1, 1]
    })
    prob = Problem(df, ['A', 'B'], 'y', 1)
    vec = ProblemVectorizer()

    vec_prob = vec.fit_apply(prob, keep_discrete_columns=True)
    vec_df = vec_prob.dataframe

    nose.tools.assert_list_equal(sorted(vec_prob.features),
                                 ['A=a', 'A=aa', 'B=b', 'B=bb', 'B=bbb'])

    nose.tools.assert_list_equal(list(vec_df['A=a']), [1, 0, 0])
    nose.tools.assert_list_equal(list(vec_df['A=aa']), [0, 1, 0])

    nose.tools.assert_list_equal(list(vec_df['B=b']), [1, 0, 0])
    nose.tools.assert_list_equal(list(vec_df['B=bb']), [0, 1, 0])
    nose.tools.assert_list_equal(list(vec_df['B=bbb']), [0, 0, 1])

    # Original input columns shouldn't have changed.
    #
    # In the initial implementation, this test failed for column 'A'. This happened
    # because scikit's vectorizer creates an all-zero column with the exact same name if the input is
    # discrete and contains NaNs, which causes the original values to be overwritten.
    nose.tools.assert_list_equal(list(vec_df['A']), list(df['A']))
    nose.tools.assert_list_equal(list(vec_df['B']), list(df['B']))

    nose.tools.assert_list_equal(
        sorted(vec_df.columns),
        sorted(['A', 'A=a', 'A=aa', 'B', 'B=b', 'B=bb', 'B=bbb', 'y']))
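# A standalone sketch (not part of the test above) of the scikit-learn behaviour
# referenced in the comment: DictVectorizer one-hot encodes string values as
# 'A=a', but a NaN is a float, so the fitted vocabulary also gains a numeric
# feature named plain 'A' -- the same name as the original column, which is what
# made the overwrite possible. Names below are illustrative only.
def example_dictvectorizer_nan_column():
    from sklearn.feature_extraction import DictVectorizer
    vectorizer = DictVectorizer(sparse=False)
    vectorizer.fit_transform([{'A': 'a'}, {'A': 'aa'}, {'A': float('nan')}])
    print(sorted(vectorizer.vocabulary_))  # ['A', 'A=a', 'A=aa'] -- note the bare 'A'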
def test_problem_slicing():
    """Validates that we can slice problems along the sample axis"""
    _, _, df = mock_problem()
    prob = Problem(df, ['gene1', 'gene2'], 'disease', 'yes')

    male_prob = prob[prob.dataframe['gender'] == 'male']
    assert_metadata_eq(prob, male_prob)
    nose.tools.eq_(male_prob.n_samples, 2)
    nose.tools.eq_(male_prob.n_features, 2)
    np.testing.assert_array_equal(male_prob.y, [1, 0])
    np.testing.assert_array_equal(male_prob.X, prob.X[:2])

    custom_prob = prob.iloc([0, 2, 3])
    assert_metadata_eq(prob, custom_prob)
    nose.tools.eq_(custom_prob.n_samples, 3)
    nose.tools.eq_(custom_prob.n_features, 2)
    np.testing.assert_array_equal(custom_prob.y, [1, 0, 1])
    np.testing.assert_array_equal(custom_prob.X, prob.X[[0, 2, 3]])
def test_multiclass(working_dir):
    """ Tests machine learning classification workfloor with multiclass for iris dataset
        see http://scikit-learn.org/stable/modules/multiclass.html """

    out_dir = os.path.join(working_dir, 'learn_output')
    model_path = os.path.join(out_dir, 'model.txt')

    iris = datasets.load_iris()

    df = iris_to_df(iris)

    features = [feat for feat in df.columns if feat not in ['Target']]

    prob = Problem(df, features, "Target", positive_outcome=None)
    rnd = np.random.RandomState(2016)
    approach = SelectAndClassify(SelectKBest(score_func=f_pearson, k=3),
                                 RandomForestClassifier(random_state=rnd))

    learn_params = LearningParameters(
        metrics={'auc': roc_auc_score,
                 'accuracy': accuracy_from_confusion_matrix},
        treat_as_binary=False)
    cvg = CVSplitGenerator(prob,
                           n_folds=10,
                           n_repartitions=10,
                           random_state=rnd)

    cv = CrossValidatedAnalysis(prob,
                                approach,
                                cv_generator=cvg,
                                runner=SerialRunner(),
                                params=learn_params)

    results = cv.run()
    renderer = ReportRenderer(out_dir)
    ClassificationReport(renderer, False, prob.label_list).generate(results)
    nose.tools.ok_(
        os.path.exists(os.path.join(out_dir, 'sample_confusion_matrix.txt')))
    average_accuracy = compute_average_accuracy(results)
    nose.tools.assert_almost_equal(0.95, average_accuracy, delta=0.01)

    classifier = SelectAndClassify(SelectKBest(score_func=f_pearson, k=3),
                                   RandomForestClassifier(random_state=2016),
                                   name='test multiclass model').fit(prob)
    model = ClassificationModel(classifier, prob)
    model.write(model_path)

    read_model = ClassificationModel.read(model_path)

    auc_average = read_model.training_auc
    nose.tools.assert_almost_equal(1.0, auc_average, delta=1e-6)
def mock_problem():
    """ creates mock problem """
    X = np.random.normal(size=(100, 2))
    y = np.asarray([1] * 50 + [0] * 50)
    df = pd.DataFrame({
        'featA': X[:, 0],
        'featB': X[:, 1],
        'featC': ['foo', 'bar'] * 50,
        'y': y
    })
    prob = Problem(df, ['featA', 'featB', 'featC'], 'y', 1)
    return prob
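# A hypothetical follow-on check (not part of the original excerpt): running the
# mock problem above through the ProblemVectorizer used elsewhere in these tests
# should one-hot encode the discrete 'featC' column and leave the numeric features
# untouched.
def example_vectorize_mixed_features():
    prob = mock_problem()
    vec_prob = ProblemVectorizer().fit_apply(prob)
    assert sorted(vec_prob.features) == ['featA', 'featB', 'featC=bar', 'featC=foo']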
def test_pipeline():
    """Validates that pipelines work as expected"""
    prob = Problem(pd.DataFrame({'feat0': [0] * 100, 'y': [0, 1] * 50}), ['feat0'], 'y', 1)
    pipe = Pipeline([('step{}'.format(idx), CountingTransform()) for idx in range(50)])
    pipe.fit(prob)

    transformed_prob = pipe.apply(prob)
    nose.tools.eq_(transformed_prob.X.shape[0], 100)  # same number of samples
    nose.tools.eq_(transformed_prob.X.shape[1], 51)  # started with 1 feature, and added one extra for each transform

    for idx in range(transformed_prob.X.shape[1]):
        np.testing.assert_array_equal(transformed_prob.X[:, idx], [idx] * prob.X.shape[0])
    def checkme(working_dir, n_samples, n_features, k, make_classifier,
                test_vectorize):
        """Utility"""
        assert n_samples % 4 == 0
        model_path = os.path.join(working_dir, 'model.txt')
        prob = mock_problem(n_samples=n_samples, n_features=n_features)
        if test_vectorize:
            df = prob.dataframe
            df['discrete_1'] = ['foo', 'bar'] * int(n_samples / 2)
            df['discrete_2'] = ['foo', 'bar', 'baz',
                                float('nan')] * int(n_samples / 4)
            df['continuous_with_missing'] = [0, 1, 2, float('nan')] * int(
                n_samples / 4)
            prob = Problem(
                df, prob.features +
                ['discrete_1', 'discrete_2', 'continuous_with_missing'],
                prob.outcome_column, prob.positive_outcome)
            preprocess = ProblemVectorizer()
        else:
            preprocess = None

        approach = SelectAndClassify(SelectKBest(k=k),
                                     make_classifier(),
                                     preprocess=preprocess).fit(prob)
        model = ClassificationModel(approach, prob)

        model.write(model_path)
        reconstituted_model = ClassificationModel.read(model_path)

        model.validate()
        reconstituted_model.validate()

        np.testing.assert_array_equal(model.approach.apply(prob),
                                      reconstituted_model.approach.apply(prob))

        if preprocess is not None:
            approach_pipeline = ApproachPipeline([('preprocess', preprocess)])
            approach_with_pipeline = SelectAndClassify(
                SelectKBest(k=k),
                make_classifier(),
                preprocess=approach_pipeline).fit(prob)
            # test approach serialization with Pipeline from learners.py
            model_with_pipeline = ClassificationModel(approach_with_pipeline,
                                                      prob)
            model_path2 = os.path.join(working_dir, 'model2.txt')
            model_with_pipeline.write(model_path2)
            reconstituted_model2 = ClassificationModel.read(model_path2)
            reconstituted_model2.validate()
            np.testing.assert_array_almost_equal(
                model.approach.apply(prob),
                reconstituted_model2.approach.apply(prob), 14)
    def make_problem(self):
        """Creates a Problem instance using the current options"""
        df = pd.read_csv(self.input_file,
                         sep=self.separator,
                         index_col=0 if self.id_col is None else self.id_col)

        # pylint wrongly thinks that df is a tuple (it's a DataFrame), hence the disable below
        # pylint: disable=no-member
        all_features = [col for col in df.columns if col != self.target_label]
        return Problem(PandasDataSource(df, path=self.input_file),
                       features=self.features
                       if self.features is not None else all_features,
                       outcome_column=self.target_label,
                       positive_outcome=self.positive_value)
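    # A hypothetical input file for the options object above (column names and
    # values are illustrative only). With id_col left as None the first column is
    # used as the sample index, every non-target column becomes a feature unless an
    # explicit feature list is given, and target_label/positive_value select the
    # outcome:
    #
    #   sample_id,gene1,gene2,disease
    #   S-0,0.0,1.2,yes
    #   S-1,0.2,0.7,no
    #
    # e.g. target_label='disease' and positive_value='yes' would yield a
    # two-feature, binary-outcome Problem.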
def test_multiclass_label_subset():
    """ Tests y_score for multiclass problem with training set
    having subset of possible classes """
    data = []
    data2 = []
    class_values = ['A', 'B', 'C', 'D']
    for index_class in range(4):
        data, data2 = mock_coords_data(data, index_class,
                                       class_values[index_class], data2, True)

    df = pd.DataFrame(columns=['coord0', 'coord1', 'class'], data=data)
    prob = Problem(df, ['coord0', 'coord1'], 'class', None)
    df2 = pd.DataFrame(columns=['coord0', 'coord1', 'class'], data=data2)
    prob2 = Problem(df2, ['coord0', 'coord1'], 'class', None, prob.label_list)

    classifier = SelectAndClassify(SelectKBest(k='all'),
                                   LogisticRegression(),
                                   name='test multiclass model').fit(prob2)

    y_pred = classifier.predict(prob2)
    y_score = classifier.prediction_probabilities(prob2)
    # check that "C" class has probabilities 0
    for i_row in range(y_pred.shape[0]):
        nose.tools.assert_almost_equal(0.0, y_score[i_row, 2], delta=1e-6)
def test_multiclass_auc():
    """ Tests auc value for multiclass problem"""
    data = []
    class_values = ['A', 'B', 'C', 'D']
    for index_class in range(4):
        data, _ = mock_coords_data(data, index_class,
                                   class_values[index_class], None, True)

    df = pd.DataFrame(columns=['coord0', 'coord1', 'class'], data=data)
    prob = Problem(df, ['coord0', 'coord1'], 'class', None)
    classifier = SelectAndClassify(SelectKBest(k='all'),
                                   LogisticRegression(),
                                   name='test multiclass model').fit(prob)
    model = ClassificationModel(classifier, prob)
    auc_average = model.training_auc
    nose.tools.assert_almost_equal(0.853333333, auc_average, delta=1e-6)

    prob_binary = Problem(df, ['coord0', 'coord1'], 'class', 'A')
    classifier_binary = SelectAndClassify(SelectKBest(k='all'),
                                          LogisticRegression(),
                                          name='binary model').fit(prob_binary)
    model_binary = ClassificationModel(classifier_binary, prob_binary)
    auc_binary = model_binary.training_auc
    nose.tools.assert_almost_equal(auc_binary, auc_average, delta=1e-6)
    def checkme(permissive_or_not, fail_or_pass, expected_numeric,
                expected_discrete, df_columns):
        """Utility"""
        assert permissive_or_not in {'permissive', 'strict'}
        assert fail_or_pass in {'fail', 'pass'}
        df = pd.DataFrame({col: list(range(10)) for col in df_columns})
        df['y'] = [0, 1] * 5
        prob = Problem(df, df_columns, 'y', 1)
        vec = ProblemVectorizer(expected_numeric=expected_numeric,
                                expected_discrete=expected_discrete,
                                permissive=(permissive_or_not == 'permissive'))
        if fail_or_pass == 'pass':
            vec.fit_apply(prob)
        else:
            nose.tools.assert_raises(ValueError, lambda: vec.fit_apply(prob))
def test_problem_concatenation():
    """Validates that we can concatenate Problem instances"""
    _, _, df = mock_problem()
    df = df.sort_values('gender')  # need to sort so that we can reverse slicing by simple concatenation

    prob = Problem(df, ['gene1', 'gene2'], 'disease', 'yes')
    sub_prob_male = prob[prob.dataframe['gender'] == 'male']
    sub_prob_female = prob[prob.dataframe['gender'] == 'female']
    reconstituted_prob = sub_prob_female + sub_prob_male  # here's where the sort matters

    np.testing.assert_array_equal(reconstituted_prob.dataframe.values,
                                  prob.dataframe.values)
    np.testing.assert_array_equal(reconstituted_prob.outcome_column,
                                  prob.outcome_column)
    np.testing.assert_array_equal(reconstituted_prob.positive_outcome,
                                  prob.positive_outcome)
    nose.tools.assert_list_equal(reconstituted_prob.features, prob.features)
    nose.tools.assert_list_equal(reconstituted_prob.sample_ids,
                                 prob.sample_ids)

    # Incompatible outcome columns
    nose.tools.assert_raises(
        ValueError, lambda: sub_prob_male + Problem(
            sub_prob_female.dataframe, ['gene1', 'gene2'], 'gender', 'male'))

    # Incompatible positive outcome
    nose.tools.assert_raises(
        ValueError, lambda: sub_prob_male + Problem(
            sub_prob_female.dataframe, ['gene1', 'gene2'], 'disease', 'no'))

    # Incompatible features
    nose.tools.assert_raises(
        ValueError, lambda: sub_prob_male + Problem(sub_prob_female.dataframe,
                                                    ['f1'], 'disease', 'yes'))
    def checkme(n_pos, n_neg, n_folds, fail_or_pass):
        """Utility"""
        assert fail_or_pass in {'fail', 'pass'}
        y = np.asarray([1] * n_pos + [0] * n_neg)
        X = np.zeros((y.shape[0], 2))
        df = pd.DataFrame(data=X, columns=['f1', 'f2'])
        df['y'] = y
        prob = Problem(df, ['f1', 'f2'], 'y', 1)

        if fail_or_pass == 'pass':
            runner = lambda thunk: thunk()
        else:
            runner = lambda thunk: nose.tools.assert_raises(ValueError, thunk)

        cv = CVSplitGenerator(prob,
                              n_folds,
                              2,
                              random_state=np.random.RandomState(0xC0FFEE))
        runner(lambda: next(cv.__iter__()))
def mock_badvector_problem():
    """Mocks noisy DataFrames for testing vectorization"""
    feat_df, _, combined_df = mock_problem()

    # Feature columns are numeric, but we want to assign bad non-numeric values to test our pre-processing.
    # To be able to do this, we need to cast the columns to the object dtype first.
    for feat_name in feat_df.columns:
        feat_df[feat_name] = pd.Series(feat_df[feat_name], dtype=object, copy=True)

    feat_df.loc['S-1', 'gene1'] = 'invalid'
    feat_df.loc['S-2', 'gene1'] = 'nul'
    feat_df.loc['S-0', 'gene2'] = 'a'
    feat_df.loc['S-1', 'gene2'] = 'b'
    feat_df.loc['S-3', 'gene2'] = 0.5
    feat_df.loc['S-2', 'gene2'] = 'c'
    feat_df.loc['S-4', 'gene2'] = 'd'
    feat_df['disease'] = combined_df['disease']
    return Problem(feat_df, ['gene1', 'gene2'], 'disease', 'yes')
def mock_problem(n_samples, n_features, n_informative, theta):
    """Mocks up a problem for testing"""
    rand = np.random.RandomState(0xC0FFEE)
    X = rand.normal(size=(n_samples, n_features))
    y = rand.choice([0, 1], size=n_samples)

    informative_idx = rand.choice(list(range(n_informative)),
                                  size=n_informative,
                                  replace=False)
    for idx in informative_idx:
        X[y == 1, idx] += theta

    features = [
        'true-{}'.format(idx)
        if idx in informative_idx else 'null-{}'.format(idx)
        for idx in range(n_features)
    ]

    df = pd.DataFrame(data=X, columns=features)
    df['y'] = y
    return Problem(df, features, 'y', 1)
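# A hypothetical sanity check built on the fixture above (the function name and
# parameter values are illustrative, not part of the original suite): with a large
# theta, univariate selection should pick the informative 'true-*' features over
# the 'null-*' ones.
def example_select_informative_features():
    from sklearn.feature_selection import SelectKBest
    prob = mock_problem(n_samples=200, n_features=10, n_informative=3, theta=2.0)
    selector = SelectKBest(k=3).fit(prob.X, prob.y)
    chosen = [feat for feat, keep in zip(prob.features, selector.get_support()) if keep]
    assert all(feat.startswith('true-') for feat in chosen)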
    def _check_prediction_input(self, df):
        """Validates that a DataFrame has all the required columns for prediction, and returns a Problem instance
        that the underlying learning approach can be invoked on"""
        missing_features = sorted(
            set(self.training_problem.features) - set(df.columns))
        if len(missing_features) > 0:
            raise ValueError("Input is missing features (count={}): {}".format(
                len(missing_features), ', '.join(missing_features)))

        # TODO FIXME: LearningApproaches require a Problem instance when calling apply(). This is not ideal
        # because Problems assume an outcome column, which might not be known when applying to new data.
        # Here we just mock a null outcome column, but we should consider changing the interface so that
        # apply() accepts a data frame directly.
        classification_columns = self.training_problem.features + [
            self.training_problem.outcome_column
        ]
        classification_df = pd.DataFrame(df, columns=classification_columns)
        return Problem(classification_df, self.training_problem.features,
                       self.training_problem.outcome_column,
                       self.training_problem.positive_outcome,
                       self.training_problem.label_list)
def test_binary_report_with_score_vector():
    " Test that in binary case score as vector contains same data as with positive outcome only"
    data = []
    class_values = ['A', 'B']
    for index_class in range(4):
        data = mock_coords_data(data,
                                index_class,
                                class_values[index_class % 2],
                                data2=None,
                                append_missed=False)[0]

    df = pd.DataFrame(columns=['coord0', 'coord1', 'class'], data=data)
    prob = Problem(df, ['coord0', 'coord1'], 'class', 'B')

    classifier = SelectAndClassify(
        SelectKBest(k='all'),
        LogisticRegression(),
        name='test binary with score vector').fit(prob)
    y_score_positive = classifier.apply(prob)
    y_score_all = classifier.apply(prob, False)
    nose.tools.ok_(np.allclose(y_score_positive, y_score_all[:, 1]))
def test_grouping_cross_validation():
    """Validates that the grouping CV generator works as expected"""
    df = mock_frame('A', 100)
    df['group'] = ['group{}'.format(idx)
                   for idx in range(20)] * 5  # 20 groups repeated 5 times
    prob = Problem(df, ['f1', 'f2'], 'y', 1)

    cv = GroupingCVSplitGenerator(prob,
                                  group_by='group',
                                  n_folds=10,
                                  n_repartitions=10)
    sanity_check_cv_generator(prob, cv)

    for train, test in cv:
        nose.tools.eq_(train.n_samples,
                       18 * 5)  # 18/20 groups, 5 samples per group
        nose.tools.eq_(test.n_samples,
                       2 * 5)  # 2/20 groups, 5 samples per group

        nose.tools.eq_(
            set(train.dataframe['group']) & set(test.dataframe['group']),
            set())  # no groups overlap
def mock_problem(n_samples=1000, n_features=100, theta=0.5):
    """\
    Creates a mock problem with class-differentiated features.

    :param n_samples: number of samples
    :param n_features: number of features
    :param theta: measure of class separation in sigma units
    :return: a Problem instance

    """
    if n_samples % 2 != 0:
        raise ValueError('Number of samples has to be a multiple of 2')

    rand = np.random.RandomState(0x12345)
    X = rand.normal(size=(n_samples, n_features))
    y = np.zeros(X.shape[0])
    y[:int(X.shape[0] / 2)] = 1
    X[y == 1] += theta
    df = pd.DataFrame(columns=['feat{}'.format(idx) for idx in range(X.shape[1])],
                      data=X)
    df['y'] = y
    df['train_or_test'] = ['train', 'test'] * int(n_samples / 2)
    return Problem(df, [col for col in df if 'feat' in col], 'y', 1)
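# A hypothetical illustration of the theta parameter (the function and classifier
# choice are illustrative, not part of the original suite): larger theta means more
# class separation, so a plain classifier fit on the 'train' rows scores much better
# on the 'test' rows for theta=2.0 than for theta=0.0.
def example_theta_controls_separation():
    from sklearn.linear_model import LogisticRegression
    for theta in (0.0, 2.0):
        prob = mock_problem(n_samples=200, n_features=10, theta=theta)
        train = prob[prob.dataframe['train_or_test'] == 'train']
        test = prob[prob.dataframe['train_or_test'] == 'test']
        accuracy = LogisticRegression().fit(train.X, train.y).score(test.X, test.y)
        print(theta, accuracy)  # roughly 0.5 for theta=0.0, close to 1.0 for theta=2.0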
def test_y_for_multiclass_slicing():
    """ Testing y method for multiclass"""
    df = pd.DataFrame(columns=['gene', 'number'],
                      data=[['gene1', 'one'], ['gene2', 'two'],
                            ['gene3', 'three'], ['gene4', 'four'],
                            ['gene5', 'five']])
    prob = Problem(df, ['gene'], 'number', None)
    y = prob.y
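    # Without an explicit label_list, classes appear to be encoded in sorted order:
    # 'five'=0, 'four'=1, 'one'=2, 'three'=3, 'two'=4 -- hence [2, 4, 3, 1, 0] below.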
    nose.tools.assert_list_equal(list(y), [2, 4, 3, 1, 0])

    subset_prob = prob[prob.dataframe['gene'] != 'gene3']
    y_subset = subset_prob.y
    nose.tools.assert_list_equal(list(y_subset), [2, 4, 1, 0])

    subset_df = df[df['gene'] != 'gene3']
    prob_subset_df = Problem(subset_df, ['gene'], 'number', None)
    y_subset_df = prob_subset_df.y
    nose.tools.assert_list_equal(list(y_subset_df), [2, 3, 1, 0])

    prob_subset_df_with_list = Problem(subset_df, ['gene'], 'number', None,
                                       prob.label_list)
    y_subset_df_with_list = prob_subset_df_with_list.y
    nose.tools.assert_list_equal(list(y_subset_df_with_list), list(y_subset))

    custom_prob = prob.iloc([0, 2, 3])
    y_custom = custom_prob.y
    nose.tools.assert_list_equal(list(y_custom), [2, 3, 1])

    custom_df = df.iloc[[0, 2, 3]]
    prob_custom_df_with_list = Problem(custom_df, ['gene'], 'number', None)
    y_custom_df_with_list = prob_custom_df_with_list.y
    nose.tools.assert_list_equal(list(y_custom_df_with_list), [1, 2, 0])

    prob_custom_df = Problem(custom_df, ['gene'], 'number', None,
                             prob.label_list)
    y_custom_df = prob_custom_df.y
    nose.tools.assert_list_equal(list(y_custom_df), list(y_custom))
    def apply(self, problem):
        """Adds a new feature equal to self.count + 1 to the Problem"""
        df = pd.DataFrame(problem.dataframe)
        df[self.feature_name] = self.count + 1
        return Problem(df, problem.features + [self.feature_name], problem.outcome_column, problem.positive_outcome)
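    # The rest of this transform is not part of the excerpt. One plausible sketch,
    # consistent with test_pipeline above and with apply() as shown, is a fit()
    # that remembers how many features the incoming Problem already has (so step i
    # of the pipeline adds a constant column with value i + 1):
    def fit(self, problem):
        """Hypothetical fit: record the current feature count and derive a unique column name"""
        self.count = problem.n_features - 1
        self.feature_name = 'count_feature_{}'.format(self.count)
        return self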
    def checkme(cv_df, train_df, test_df, ignore_df):
        """Test utility: validates CV split properties for the given CV/train-only/test-only/ignored data frames"""
        for setname, df in [('cv', cv_df), ('train', train_df),
                            ('test', test_df), ('ignore', ignore_df)]:
            df['set'] = setname

        prob = Problem(
            pd.concat([
                pd.DataFrame(df)
                for df in (cv_df, train_df, test_df, ignore_df)
            ]), ['f1', 'f2'], 'y', 1)
        cv_gen = CVSplitGenerator(prob,
                                  10,
                                  2,
                                  random_state=np.random.RandomState(0xC0FFEE),
                                  train_filter=lambda meta: meta[
                                      'set'] == 'cv' or meta['set'] == 'train',
                                  test_filter=lambda meta: meta['set'] == 'cv'
                                  or meta['set'] == 'test')
        cv_gen = list(cv_gen)  # so that we can check the length
        nose.tools.eq_(len(cv_gen), 20 if len(cv_df) > 0 else 1)

        for cv_train, cv_test in cv_gen:
            np.testing.assert_allclose(len(cv_train),
                                       0.9 * len(cv_df) + len(train_df),
                                       atol=1.0)  # 90% CV + train-only
            np.testing.assert_allclose(len(cv_test),
                                       0.1 * len(cv_df) + len(test_df),
                                       atol=1.0)  # 10% CV + test-only

            # Sanity check: no train/test overlap
            nose.tools.eq_(
                set(cv_train.sample_ids) & set(cv_test.sample_ids), set())

            # Train samples: must be from either CV or train-only set
            # Test samples: must be from either CV or test-only set
            nose.tools.ok_(
                all([
                    sample in cv_df.index or sample in train_df.index
                    for sample in cv_train.sample_ids
                ]))
            nose.tools.ok_(
                all([
                    sample in cv_df.index or sample in test_df.index
                    for sample in cv_test.sample_ids
                ]))

            # All train-only and all test-only samples should be present
            nose.tools.ok_(
                all([
                    sample in cv_train.sample_ids for sample in train_df.index
                ]))
            nose.tools.ok_(
                all([sample in cv_test.sample_ids
                     for sample in test_df.index]))

            # Samples in ignore_df should never be emitted
            nose.tools.ok_(not any([
                sample in ignore_df.index
                for sample in cv_train.sample_ids + cv_test.sample_ids
            ]))
    def assert_fails(*args, **kwargs):
        """Utility: calls the Problem ctor with the given arguments and expects it to raise an error"""
        nose.tools.assert_raises(ValueError, lambda: Problem(*args, **kwargs))