def checkme(keep_discrete_columns):
    """Utility"""
    _, _, df = mock_problem()
    prob = Problem(df, ['gene1', 'gender'], 'disease', 'yes')
    vectorized_prob = prob.vectorize(keep_discrete_columns=keep_discrete_columns)
    print(vectorized_prob.dataframe)

    nose.tools.eq_(vectorized_prob.outcome_column, prob.outcome_column)
    nose.tools.eq_(vectorized_prob.positive_outcome, prob.positive_outcome)
    np.testing.assert_array_equal(vectorized_prob.y, prob.y)

    if keep_discrete_columns:
        expected_columns = ['gene1', 'gene2', 'disease', 'gender', 'gender=male', 'gender=female']
        nose.tools.assert_list_equal(list(vectorized_prob.dataframe['gender']),
                                     list(prob.dataframe['gender']))
    else:
        expected_columns = ['gene1', 'gene2', 'disease', 'gender=male', 'gender=female']

    nose.tools.assert_list_equal(sorted(list(vectorized_prob.dataframe.columns)),
                                 sorted(expected_columns))
    nose.tools.assert_list_equal(vectorized_prob.features, ['gender=female', 'gender=male', 'gene1'])
    np.testing.assert_almost_equal(vectorized_prob.X,
                                   np.asarray([[0, 1, 0.0],
                                               [0, 1, 0.2],
                                               [1, 0, 0.4],
                                               [1, 0, 0.6],
                                               [1, 0, 0.8]]),
                                   decimal=10)

def test_problem_creation():
    """Validates that Problem instances behave as expected"""
    feat_df, _, combined_df = mock_problem()
    prob = Problem(combined_df, feat_df.columns, 'disease', 'yes')
    nose.tools.eq_(prob.n_features, 2)
    nose.tools.eq_(prob.n_samples, 5)
    nose.tools.assert_list_equal(prob.sample_ids, ['S-0', 'S-1', 'S-2', 'S-3', 'S-4'])
    np.testing.assert_array_equal(prob.y, [1, 0, 0, 1, 1])
    np.testing.assert_array_equal(prob.dataframe.values, combined_df.values)
    nose.tools.eq_(prob.X.shape[0], prob.n_samples)
    nose.tools.eq_(prob.X.shape[1], prob.n_features)
    np.testing.assert_array_equal(prob.X, feat_df.values)

    # Try subsetting features and a different outcome variable
    sub_prob = Problem(combined_df, ['gene2'], 'gender', 'male')
    nose.tools.eq_(sub_prob.n_features, 1)
    nose.tools.eq_(sub_prob.n_samples, 5)
    nose.tools.assert_list_equal(sub_prob.sample_ids, ['S-0', 'S-1', 'S-2', 'S-3', 'S-4'])
    np.testing.assert_array_equal(sub_prob.y, [1, 1, 0, 0, 0])
    np.testing.assert_array_equal(sub_prob.dataframe.values, combined_df.values)
    nose.tools.eq_(sub_prob.X.shape[0], sub_prob.n_samples)
    nose.tools.eq_(sub_prob.X.shape[1], sub_prob.n_features)
    np.testing.assert_array_equal(sub_prob.X.ravel(), feat_df.values[:, 1])

def checkme(fraction):
    """Tests learning curve CV downsampling

    :param fraction: float in [0, 1] (inclusive on both ends) - sampling rate for CV generation
    """
    problem_size = 1000
    prob = Problem(mock_frame('A', problem_size), ['f1', 'f2'], 'y', 1)
    cv_gen = CVSplitGenerator(prob, 10, 2, random_state=np.random.RandomState(0xC0FFEE))
    sanity_check_cv_generator(prob, cv_gen)

    learning_curve = LearningCurveCVGenerator(fraction, cv_gen, random_state=cv_gen.random_state)
    train_occurrences = Counter()
    for train, test in learning_curve:
        nose.tools.eq_(len(train), int(problem_size * 0.9 * fraction))  # subsampled 900/100 split (10 folds)
        nose.tools.eq_(len(test), int(problem_size * 0.1))  # verify that the test set is 1/10 of the problem size
        train_occurrences.update(train.sample_ids)

    # This is a fairly weak test, but in general it's difficult to predict how many unique train samples
    # we'll see, especially when both the subsampling fraction and the total number of splits are small.
    # It does protect us against truly terrible bugs though, e.g. if we accidentally return the same
    # training set over and over.
    nose.tools.assert_greater_equal(len(set(train_occurrences.keys())), problem_size * fraction)

def test_no_column_overwrite():
    """Validates that we don't overwrite input values if the input contains NaNs in discrete columns"""
    df = pd.DataFrame({
        'A': ['a', 'aa', float('nan')],
        'B': ['b', 'bb', 'bbb'],
        'y': [0, 1, 1]
    })
    prob = Problem(df, ['A', 'B'], 'y', 1)
    vec = ProblemVectorizer()
    vec_prob = vec.fit_apply(prob, keep_discrete_columns=True)
    vec_df = vec_prob.dataframe

    nose.tools.assert_list_equal(sorted(vec_prob.features), ['A=a', 'A=aa', 'B=b', 'B=bb', 'B=bbb'])
    nose.tools.assert_list_equal(list(vec_df['A=a']), [1, 0, 0])
    nose.tools.assert_list_equal(list(vec_df['A=aa']), [0, 1, 0])
    nose.tools.assert_list_equal(list(vec_df['B=b']), [1, 0, 0])
    nose.tools.assert_list_equal(list(vec_df['B=bb']), [0, 1, 0])
    nose.tools.assert_list_equal(list(vec_df['B=bbb']), [0, 0, 1])

    # Original input columns shouldn't have changed.
    #
    # In the initial implementation, this test failed for column 'A'. This happened because scikit's
    # vectorizer creates an all-zero column with the exact same name if the input is discrete and
    # contains NaNs, which causes the original values to be overwritten.
    nose.tools.assert_list_equal(list(vec_df['A']), list(df['A']))
    nose.tools.assert_list_equal(list(vec_df['B']), list(df['B']))
    nose.tools.assert_list_equal(sorted(vec_df.columns),
                                 sorted(['A', 'A=a', 'A=aa', 'B', 'B=b', 'B=bb', 'B=bbb', 'y']))

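# The comment in the test above refers to scikit-learn's vectorizer behaviour. A minimal,
# self-contained sketch of that behaviour with DictVectorizer (an illustration only; whether
# ProblemVectorizer wraps DictVectorizer internally is an assumption, not shown by this test):
def _sketch_dictvectorizer_nan_column():
    """Illustrative sketch: DictVectorizer emits a bare numeric 'A' column when a discrete
    column contains NaN, alongside the 'A=<value>' indicator columns, which is how an original
    'A' column can get clobbered if names collide."""
    from sklearn.feature_extraction import DictVectorizer

    records = [{'A': 'a'}, {'A': 'aa'}, {'A': float('nan')}]  # NaN in an otherwise discrete column
    vec = DictVectorizer(sparse=False)
    X = vec.fit_transform(records)
    # Feature names include a plain 'A' (for the numeric NaN) next to 'A=a' and 'A=aa'.
    print(vec.feature_names_)
    print(X)
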
def test_problem_slicing():
    """Validates that we can slice problems along the sample axis"""
    _, _, df = mock_problem()
    prob = Problem(df, ['gene1', 'gene2'], 'disease', 'yes')

    male_prob = prob[prob.dataframe['gender'] == 'male']
    assert_metadata_eq(prob, male_prob)
    nose.tools.eq_(male_prob.n_samples, 2)
    nose.tools.eq_(male_prob.n_features, 2)
    np.testing.assert_array_equal(male_prob.y, [1, 0])
    np.testing.assert_array_equal(male_prob.X, prob.X[:2])

    custom_prob = prob.iloc([0, 2, 3])
    assert_metadata_eq(prob, custom_prob)
    nose.tools.eq_(custom_prob.n_samples, 3)
    nose.tools.eq_(custom_prob.n_features, 2)
    np.testing.assert_array_equal(custom_prob.y, [1, 0, 1])
    np.testing.assert_array_equal(custom_prob.X, prob.X[[0, 2, 3]])

def test_multiclass(working_dir):
    """Tests the machine learning classification workflow with multiclass outcomes on the iris
    dataset; see http://scikit-learn.org/stable/modules/multiclass.html"""
    out_dir = os.path.join(working_dir, 'learn_output')
    model_path = os.path.join(out_dir, 'model.txt')

    iris = datasets.load_iris()
    df = iris_to_df(iris)
    features = [feat for feat in df.columns if feat not in ['Target']]
    prob = Problem(df, features, 'Target', positive_outcome=None)

    rnd = np.random.RandomState(2016)
    approach = SelectAndClassify(SelectKBest(score_func=f_pearson, k=3),
                                 RandomForestClassifier(random_state=rnd))
    learn_params = LearningParameters(metrics={'auc': roc_auc_score,
                                               'accuracy': accuracy_from_confusion_matrix},
                                      treat_as_binary=False)
    cvg = CVSplitGenerator(prob, n_folds=10, n_repartitions=10, random_state=rnd)

    cv = CrossValidatedAnalysis(prob, approach, cv_generator=cvg, runner=SerialRunner(),
                                params=learn_params)
    results = cv.run()

    renderer = ReportRenderer(out_dir)
    ClassificationReport(renderer, False, prob.label_list).generate(results)
    nose.tools.ok_(os.path.exists(os.path.join(out_dir, 'sample_confusion_matrix.txt')))

    average_accuracy = compute_average_accuracy(results)
    nose.tools.assert_almost_equal(0.95, average_accuracy, delta=0.01)

    classifier = SelectAndClassify(SelectKBest(score_func=f_pearson, k=3),
                                   RandomForestClassifier(random_state=2016),
                                   name='test multiclass model').fit(prob)
    model = ClassificationModel(classifier, prob)
    model.write(model_path)

    read_model = ClassificationModel.read(model_path)
    auc_average = read_model.training_auc
    nose.tools.assert_almost_equal(1.0, auc_average, delta=1e-6)

def mock_problem():
    """Creates a mock problem"""
    X = np.random.normal(size=(100, 2))
    y = np.asarray([1] * 50 + [0] * 50)
    df = pd.DataFrame({
        'featA': X[:, 0],
        'featB': X[:, 1],
        'featC': ['foo', 'bar'] * 50,
        'y': y
    })
    prob = Problem(df, ['featA', 'featB', 'featC'], 'y', 1)
    return prob

def test_pipeline():
    """Validates that pipelines work as expected"""
    prob = Problem(pd.DataFrame({'feat0': [0] * 100, 'y': [0, 1] * 50}), ['feat0'], 'y', 1)
    pipe = Pipeline([('step{}'.format(idx), CountingTransform()) for idx in range(50)])
    pipe.fit(prob)

    transformed_prob = pipe.apply(prob)
    nose.tools.eq_(transformed_prob.X.shape[0], 100)  # same number of samples
    nose.tools.eq_(transformed_prob.X.shape[1], 51)  # started with 1 feature, and added one extra for each transform
    for idx in range(transformed_prob.X.shape[1]):
        np.testing.assert_array_equal(transformed_prob.X[:, idx], [idx] * prob.X.shape[0])

def checkme(working_dir, n_samples, n_features, k, make_classifier, test_vectorize):
    """Utility"""
    assert n_samples % 4 == 0
    model_path = os.path.join(working_dir, 'model.txt')
    prob = mock_problem(n_samples=n_samples, n_features=n_features)

    if test_vectorize:
        df = prob.dataframe
        df['discrete_1'] = ['foo', 'bar'] * int(n_samples / 2)
        df['discrete_2'] = ['foo', 'bar', 'baz', float('nan')] * int(n_samples / 4)
        df['continuous_with_missing'] = [0, 1, 2, float('nan')] * int(n_samples / 4)
        prob = Problem(df,
                       prob.features + ['discrete_1', 'discrete_2', 'continuous_with_missing'],
                       prob.outcome_column, prob.positive_outcome)
        preprocess = ProblemVectorizer()
    else:
        preprocess = None

    approach = SelectAndClassify(SelectKBest(k=k), make_classifier(), preprocess=preprocess).fit(prob)
    model = ClassificationModel(approach, prob)
    model.write(model_path)

    reconstituted_model = ClassificationModel.read(model_path)

    model.validate()
    reconstituted_model.validate()
    np.testing.assert_array_equal(model.approach.apply(prob), reconstituted_model.approach.apply(prob))

    if preprocess is not None:
        # Test approach serialization with a Pipeline from learners.py
        approach_pipeline = ApproachPipeline([('preprocess', preprocess)])
        approach_with_pipeline = SelectAndClassify(SelectKBest(k=k), make_classifier(),
                                                   preprocess=approach_pipeline).fit(prob)
        model_with_pipeline = ClassificationModel(approach_with_pipeline, prob)
        model_path2 = os.path.join(working_dir, 'model2.txt')
        model_with_pipeline.write(model_path2)
        reconstituted_model2 = ClassificationModel.read(model_path2)
        reconstituted_model2.validate()
        np.testing.assert_array_almost_equal(model.approach.apply(prob),
                                             reconstituted_model2.approach.apply(prob), 14)

def make_problem(self):
    """Creates a Problem instance using the current options"""
    df = pd.read_csv(self.input_file, sep=self.separator,
                     index_col=0 if self.id_col is None else self.id_col)

    # pylint wrongly thinks that df is a tuple (it's a DataFrame), hence the disable below
    # pylint: disable=no-member
    all_features = [col for col in df.columns if col != self.target_label]
    return Problem(PandasDataSource(df, path=self.input_file),
                   features=self.features if self.features is not None else all_features,
                   outcome_column=self.target_label,
                   positive_outcome=self.positive_value)

def test_multiclass_label_subset():
    """Tests y_score for a multiclass problem whose training set contains only a subset of the possible classes"""
    data = []
    data2 = []
    class_values = ['A', 'B', 'C', 'D']
    for index_class in range(4):
        data, data2 = mock_coords_data(data, index_class, class_values[index_class], data2, True)

    df = pd.DataFrame(columns=['coord0', 'coord1', 'class'], data=data)
    prob = Problem(df, ['coord0', 'coord1'], 'class', None)

    df2 = pd.DataFrame(columns=['coord0', 'coord1', 'class'], data=data2)
    prob2 = Problem(df2, ['coord0', 'coord1'], 'class', None, prob.label_list)

    classifier = SelectAndClassify(SelectKBest(k='all'), LogisticRegression(),
                                   name='test multiclass model').fit(prob2)
    y_pred = classifier.predict(prob2)
    y_score = classifier.prediction_probabilities(prob2)

    # Check that the "C" class has probability 0
    for i_row in range(y_pred.shape[0]):
        nose.tools.assert_almost_equal(0.0, y_score[i_row, 2], delta=1e-6)

def test_multiclass_auc():
    """Tests the AUC value for a multiclass problem"""
    data = []
    class_values = ['A', 'B', 'C', 'D']
    for index_class in range(4):
        data, _ = mock_coords_data(data, index_class, class_values[index_class], None, True)

    df = pd.DataFrame(columns=['coord0', 'coord1', 'class'], data=data)
    prob = Problem(df, ['coord0', 'coord1'], 'class', None)
    classifier = SelectAndClassify(SelectKBest(k='all'), LogisticRegression(),
                                   name='test multiclass model').fit(prob)
    model = ClassificationModel(classifier, prob)
    auc_average = model.training_auc
    nose.tools.assert_almost_equal(0.853333333, auc_average, delta=1e-6)

    prob_binary = Problem(df, ['coord0', 'coord1'], 'class', 'A')
    classifier_binary = SelectAndClassify(SelectKBest(k='all'), LogisticRegression(),
                                          name='binary model').fit(prob_binary)
    model_binary = ClassificationModel(classifier_binary, prob_binary)
    auc_binary = model_binary.training_auc
    nose.tools.assert_almost_equal(auc_binary, auc_average, delta=1e-6)

def checkme(permissive_or_not, fail_or_pass, expected_numeric, expected_discrete, df_columns):
    """Utility"""
    assert permissive_or_not in {'permissive', 'strict'}
    assert fail_or_pass in {'fail', 'pass'}

    df = pd.DataFrame({col: list(range(10)) for col in df_columns})
    df['y'] = [0, 1] * 5
    prob = Problem(df, df_columns, 'y', 1)
    vec = ProblemVectorizer(expected_numeric=expected_numeric,
                            expected_discrete=expected_discrete,
                            permissive=(permissive_or_not == 'permissive'))

    if fail_or_pass == 'pass':
        vec.fit_apply(prob)
    else:
        nose.tools.assert_raises(ValueError, lambda: vec.fit_apply(prob))

def test_problem_concatenation():
    """Validates that we can concatenate Problem instances"""
    _, _, df = mock_problem()
    df = df.sort_values('gender')  # need to sort so that we can reverse slicing by simple concatenation
    prob = Problem(df, ['gene1', 'gene2'], 'disease', 'yes')
    sub_prob_male = prob[prob.dataframe['gender'] == 'male']
    sub_prob_female = prob[prob.dataframe['gender'] == 'female']
    reconstituted_prob = sub_prob_female + sub_prob_male  # here's where the sort matters

    np.testing.assert_array_equal(reconstituted_prob.dataframe.values, prob.dataframe.values)
    np.testing.assert_array_equal(reconstituted_prob.outcome_column, prob.outcome_column)
    np.testing.assert_array_equal(reconstituted_prob.positive_outcome, prob.positive_outcome)
    nose.tools.assert_list_equal(reconstituted_prob.features, prob.features)
    nose.tools.assert_list_equal(reconstituted_prob.sample_ids, prob.sample_ids)

    # Incompatible outcome columns
    nose.tools.assert_raises(
        ValueError,
        lambda: sub_prob_male + Problem(sub_prob_female.dataframe, ['gene1', 'gene2'], 'gender', 'male'))

    # Incompatible positive outcome
    nose.tools.assert_raises(
        ValueError,
        lambda: sub_prob_male + Problem(sub_prob_female.dataframe, ['gene1', 'gene2'], 'disease', 'no'))

    # Incompatible features
    nose.tools.assert_raises(
        ValueError,
        lambda: sub_prob_male + Problem(sub_prob_female.dataframe, ['f1'], 'disease', 'yes'))

def checkme(n_pos, n_neg, n_folds, fail_or_pass):
    """Utility"""
    assert fail_or_pass in {'fail', 'pass'}
    y = np.asarray([1] * n_pos + [0] * n_neg)
    X = np.zeros((y.shape[0], 2))
    df = pd.DataFrame(data=X, columns=['f1', 'f2'])
    df['y'] = y
    prob = Problem(df, ['f1', 'f2'], 'y', 1)

    if fail_or_pass == 'pass':
        runner = lambda thunk: thunk()
    else:
        runner = lambda thunk: nose.tools.assert_raises(ValueError, thunk)

    cv = CVSplitGenerator(prob, n_folds, 2, random_state=np.random.RandomState(0xC0FFEE))
    runner(lambda: next(iter(cv)))

def mock_badvector_problem():
    """Mocks noisy DataFrames for testing vectorization"""
    feat_df, _, combined_df = mock_problem()

    # Feature columns are numeric, but we want to assign bad non-numeric values to test our
    # pre-processing. To be able to do this, we need to set the datatype to object.
    for feat_name in feat_df.columns:
        feat_df[feat_name] = pd.Series(feat_df[feat_name], dtype=object, copy=True)

    # Use single-step .loc[row, column] indexing rather than chained indexing so the assignments
    # are guaranteed to modify feat_df itself.
    feat_df.loc['S-1', 'gene1'] = 'invalid'
    feat_df.loc['S-2', 'gene1'] = 'nul'
    feat_df.loc['S-0', 'gene2'] = 'a'
    feat_df.loc['S-1', 'gene2'] = 'b'
    feat_df.loc['S-3', 'gene2'] = 0.5
    feat_df.loc['S-2', 'gene2'] = 'c'
    feat_df.loc['S-4', 'gene2'] = 'd'
    feat_df['disease'] = combined_df['disease']

    return Problem(feat_df, ['gene1', 'gene2'], 'disease', 'yes')

def mock_problem(n_samples, n_features, n_informative, theta):
    """Mocks up a problem for testing"""
    rand = np.random.RandomState(0xC0FFEE)
    X = rand.normal(size=(n_samples, n_features))
    y = rand.choice([0, 1], size=n_samples)
    informative_idx = rand.choice(list(range(n_informative)), size=n_informative, replace=False)
    for idx in informative_idx:
        X[y == 1, idx] += theta

    features = ['true-{}'.format(idx) if idx in informative_idx else 'null-{}'.format(idx)
                for idx in range(n_features)]
    df = pd.DataFrame(data=X, columns=features)
    df['y'] = y
    return Problem(df, features, 'y', 1)

def _check_prediction_input(self, df):
    """Validates that a DataFrame has all the required columns for prediction, and returns a
    Problem instance that the underlying learning approach can be invoked on"""
    missing_features = sorted(set(self.training_problem.features) - set(df.columns))
    if len(missing_features) > 0:
        raise ValueError("Input is missing features (count={}): {}".format(
            len(missing_features), ', '.join(missing_features)))

    # TODO FIXME: LearningApproaches require a Problem instance when calling apply(). This is not
    # ideal because Problems assume an outcome column, which might not be known when applying to
    # new data. Here we just mock a null outcome column, but we should consider changing the
    # interface so that apply() accepts a data frame directly.
    classification_columns = self.training_problem.features + [self.training_problem.outcome_column]
    classification_df = pd.DataFrame(df, columns=classification_columns)
    return Problem(classification_df,
                   self.training_problem.features,
                   self.training_problem.outcome_column,
                   self.training_problem.positive_outcome,
                   self.training_problem.label_list)

def test_binary_report_with_score_vector():
    """Tests that, in the binary case, the score vector contains the same data as the positive-outcome-only score"""
    data = []
    class_values = ['A', 'B']
    for index_class in range(4):
        data = mock_coords_data(data, index_class, class_values[index_class % 2],
                                data2=None, append_missed=False)[0]

    df = pd.DataFrame(columns=['coord0', 'coord1', 'class'], data=data)
    prob = Problem(df, ['coord0', 'coord1'], 'class', 'B')
    classifier = SelectAndClassify(SelectKBest(k='all'), LogisticRegression(),
                                   name='test binary with score vector').fit(prob)
    y_score_positive = classifier.apply(prob)
    y_score_all = classifier.apply(prob, False)
    nose.tools.ok_(np.allclose(y_score_positive, y_score_all[:, 1]))

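# Why column 1 is the positive class above: scikit-learn orders predict_proba columns by the
# sorted class labels (clf.classes_), so with labels {'A', 'B'} the 'B' column sits at index 1.
# A minimal, self-contained sketch of that behaviour with plain scikit-learn (this assumes, as
# the test implies, that classifier.apply(prob, False) returns the full probability matrix):
def _sketch_predict_proba_column_order():
    """Illustrative sketch: predict_proba column order follows sorted classes_."""
    import numpy as np
    from sklearn.linear_model import LogisticRegression

    X = np.array([[0.0], [1.0], [0.1], [0.9]])
    y = np.array(['A', 'B', 'A', 'B'])
    clf = LogisticRegression().fit(X, y)
    assert list(clf.classes_) == ['A', 'B']  # sorted labels, so column 1 corresponds to 'B'
    return clf.predict_proba(X)[:, 1]  # probability of the positive outcome 'B'
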
def test_grouping_cross_validation():
    """Validates that the grouping CV generator works as expected"""
    df = mock_frame('A', 100)
    df['group'] = ['group{}'.format(idx) for idx in range(20)] * 5  # 20 groups repeated 5 times
    prob = Problem(df, ['f1', 'f2'], 'y', 1)
    cv = GroupingCVSplitGenerator(prob, group_by='group', n_folds=10, n_repartitions=10)
    sanity_check_cv_generator(prob, cv)

    for train, test in cv:
        nose.tools.eq_(train.n_samples, 18 * 5)  # 18/20 groups, 5 samples per group
        nose.tools.eq_(test.n_samples, 2 * 5)  # 2/20 groups, 5 samples per group
        nose.tools.eq_(set(train.dataframe['group']) & set(test.dataframe['group']),
                       set())  # no groups overlap

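# The property tested above (whole groups never straddle the train/test boundary) is the same
# guarantee scikit-learn's GroupKFold provides. A minimal sketch of the analogous check with
# plain scikit-learn (GroupingCVSplitGenerator's internals are assumed, not shown here):
def _sketch_group_kfold_no_overlap():
    """Illustrative sketch: grouped CV keeps each group entirely in train or entirely in test."""
    import numpy as np
    from sklearn.model_selection import GroupKFold

    X = np.zeros((100, 2))
    groups = np.repeat(np.arange(20), 5)  # 20 groups of 5 samples each
    for train_idx, test_idx in GroupKFold(n_splits=10).split(X, groups=groups):
        assert set(groups[train_idx]) & set(groups[test_idx]) == set()  # no group overlap
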
def mock_problem(n_samples=1000, n_features=100, theta=0.5):
    """\
    Creates a mock problem with class-differentiated features.

    :param n_samples: number of samples
    :param n_features: number of features
    :param theta: measure of class separation in sigma units
    :return: a Problem instance
    """
    if n_samples % 2 != 0:
        raise ValueError('Number of samples has to be a multiple of 2')

    rand = np.random.RandomState(0x12345)
    X = rand.normal(size=(n_samples, n_features))
    y = np.zeros(X.shape[0])
    y[:int(X.shape[0] / 2)] = 1
    X[y == 1] += theta

    df = pd.DataFrame(columns=['feat{}'.format(idx) for idx in range(X.shape[1])], data=X)
    df['y'] = y
    df['train_or_test'] = ['train', 'test'] * int(n_samples / 2)
    return Problem(df, [col for col in df if 'feat' in col], 'y', 1)

def test_y_for_multiclass_slicing():
    """Tests the y property for multiclass problems under slicing"""
    df = pd.DataFrame(columns=['gene', 'number'],
                      data=[['gene1', 'one'], ['gene2', 'two'], ['gene3', 'three'],
                            ['gene4', 'four'], ['gene5', 'five']])
    prob = Problem(df, ['gene'], 'number', None)
    y = prob.y
    nose.tools.assert_list_equal(list(y), [2, 4, 3, 1, 0])

    subset_prob = prob[prob.dataframe['gene'] != 'gene3']
    y_subset = subset_prob.y
    nose.tools.assert_list_equal(list(y_subset), [2, 4, 1, 0])

    subset_df = df[df['gene'] != 'gene3']
    prob_subset_df = Problem(subset_df, ['gene'], 'number', None)
    y_subset_df = prob_subset_df.y
    nose.tools.assert_list_equal(list(y_subset_df), [2, 3, 1, 0])

    prob_subset_df_with_list = Problem(subset_df, ['gene'], 'number', None, prob.label_list)
    y_subset_df_with_list = prob_subset_df_with_list.y
    nose.tools.assert_list_equal(list(y_subset_df_with_list), list(y_subset))

    custom_prob = prob.iloc([0, 2, 3])
    y_custom = custom_prob.y
    nose.tools.assert_list_equal(list(y_custom), [2, 3, 1])

    custom_df = df.iloc[[0, 2, 3]]
    prob_custom_df = Problem(custom_df, ['gene'], 'number', None)  # no label list: labels are re-encoded
    y_custom_df = prob_custom_df.y
    nose.tools.assert_list_equal(list(y_custom_df), [1, 2, 0])

    prob_custom_df_with_list = Problem(custom_df, ['gene'], 'number', None, prob.label_list)
    y_custom_df_with_list = prob_custom_df_with_list.y
    nose.tools.assert_list_equal(list(y_custom_df_with_list), list(y_custom))

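# The encodings asserted above match scikit-learn's LabelEncoder convention of numbering the
# sorted unique labels. That Problem.y encodes multiclass outcomes this way is an assumption
# inferred from the expected values, not stated by the test itself. A minimal sketch:
def _sketch_sorted_label_encoding():
    """Illustrative sketch: sorted-label encoding yields [2, 4, 3, 1, 0] for the labels above."""
    from sklearn.preprocessing import LabelEncoder

    labels = ['one', 'two', 'three', 'four', 'five']
    encoded = LabelEncoder().fit_transform(labels)
    # Sorted classes are ['five', 'four', 'one', 'three', 'two'], hence the encoding below.
    assert list(encoded) == [2, 4, 3, 1, 0]
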
def apply(self, problem):
    """Adds a new feature equal to self.count + 1 to the Problem"""
    df = pd.DataFrame(problem.dataframe)
    df[self.feature_name] = self.count + 1
    return Problem(df, problem.features + [self.feature_name],
                   problem.outcome_column, problem.positive_outcome)

def checkme(cv_df, train_df, test_df, ignore_df):
    """Test utility: validates CV split properties for the given CV/train-only/test-only/ignored data frames"""
    for setname, df in [('cv', cv_df), ('train', train_df), ('test', test_df), ('ignore', ignore_df)]:
        df['set'] = setname

    prob = Problem(pd.concat([pd.DataFrame(df) for df in (cv_df, train_df, test_df, ignore_df)]),
                   ['f1', 'f2'], 'y', 1)
    cv_gen = CVSplitGenerator(prob, 10, 2,
                              random_state=np.random.RandomState(0xC0FFEE),
                              train_filter=lambda meta: meta['set'] == 'cv' or meta['set'] == 'train',
                              test_filter=lambda meta: meta['set'] == 'cv' or meta['set'] == 'test')
    cv_gen = list(cv_gen)  # so that we can check the length
    nose.tools.eq_(len(cv_gen), 20 if len(cv_df) > 0 else 1)

    for cv_train, cv_test in cv_gen:
        np.testing.assert_allclose(len(cv_train), 0.9 * len(cv_df) + len(train_df), atol=1.0)  # 90% CV + train-only
        np.testing.assert_allclose(len(cv_test), 0.1 * len(cv_df) + len(test_df), atol=1.0)  # 10% CV + test-only

        # Sanity check: no train/test overlap
        nose.tools.eq_(set(cv_train.sample_ids) & set(cv_test.sample_ids), set())

        # Train samples: must be from either CV or train-only set
        # Test samples: must be from either CV or test-only set
        nose.tools.ok_(all([sample in cv_df.index or sample in train_df.index
                            for sample in cv_train.sample_ids]))
        nose.tools.ok_(all([sample in cv_df.index or sample in test_df.index
                            for sample in cv_test.sample_ids]))

        # All train-only and all test-only samples should be present
        nose.tools.ok_(all([sample in cv_train.sample_ids for sample in train_df.index]))
        nose.tools.ok_(all([sample in cv_test.sample_ids for sample in test_df.index]))

        # Samples in ignore_df should never be emitted
        nose.tools.ok_(not any([sample in ignore_df.index
                                for sample in cv_train.sample_ids + cv_test.sample_ids]))

def assert_fails(*args, **kwargs):
    """Utility: calls the Problem ctor with the given arguments and expects it to raise an error"""
    nose.tools.assert_raises(ValueError, lambda: Problem(*args, **kwargs))