def checkme(selector_grid, classifier_grid, expected_grid, expected_size):
    """Utility: builds a SelectAndClassify learner and checks its parameter grid and grid size"""
    learner = SelectAndClassify(SelectKBest(), LogisticRegression(),
                                selector_grid=selector_grid,
                                classifier_grid=classifier_grid)
    actual_grid, actual_grid_size = learner._get_grid()  # pylint: disable=protected-access
    nose.tools.eq_(actual_grid, expected_grid)
    nose.tools.eq_(actual_grid_size, expected_size)

def test_predict_tool(working_dir):
    """Tests that the predict.py command line tool works as expected"""
    out_dir = os.path.join(working_dir, 'learn_output')
    model_path = os.path.join(out_dir, 'model.txt')
    predictions_path = os.path.join(out_dir, 'predictions.txt')

    # Mock up some input data
    prob_path, prob = mock_input(working_dir)
    os.mkdir(out_dir)

    # Train a model and save it to a file
    classifier = SelectAndClassify(SelectKBest(k=5), GaussianNB(), name='test model').fit(prob)
    model = ClassificationModel(classifier, prob)
    model.write(model_path)

    # Run the predict tool with the model using the training data loaded from a file, and validate that
    # the returned predictions match
    predict.main([model_path, prob_path, predictions_path, '--index_col', 'sample_id'])
    expected_predictions = pd.DataFrame({'sample_id': prob.sample_ids,
                                         'score': classifier.apply(prob)})
    actual_predictions = pd.read_csv(predictions_path, sep='\t')
    np.testing.assert_allclose(actual_predictions['score'].values, expected_predictions['score'].values)

def test_map_label_to_class_index():
    """Tests the SelectAndClassify.map_label_to_class_index utility"""
    nose.tools.eq_(SelectAndClassify.map_label_to_class_index(['A', 'B', 'C', 'D'], [0, 1, 3]),
                   {'A': 0, 'B': 1, 'D': 2})
    nose.tools.eq_(SelectAndClassify.map_label_to_class_index(['A', 'B', 'C', 'D'], [0, 3]),
                   {'A': 0, 'D': 1})
    nose.tools.eq_(SelectAndClassify.map_label_to_class_index(['A', 'B', 'C', 'D'], [2]),
                   {'C': 0})
    nose.tools.eq_(SelectAndClassify.map_label_to_class_index(['A', 'B', 'C', 'D'], [0, 1, 2]),
                   {'A': 0, 'B': 1, 'C': 2})

def test_multiclass(working_dir):
    """Tests the machine learning classification workflow on a multiclass problem (the iris dataset),
    see http://scikit-learn.org/stable/modules/multiclass.html"""
    out_dir = os.path.join(working_dir, 'learn_output')
    model_path = os.path.join(out_dir, 'model.txt')
    iris = datasets.load_iris()
    df = iris_to_df(iris)
    features = [feat for feat in df.columns if feat not in ['Target']]
    prob = Problem(df, features, "Target", positive_outcome=None)
    rnd = np.random.RandomState(2016)
    approach = SelectAndClassify(SelectKBest(score_func=f_pearson, k=3),
                                 RandomForestClassifier(random_state=rnd))
    learn_params = LearningParameters(metrics={'auc': roc_auc_score,
                                               'accuracy': accuracy_from_confusion_matrix},
                                      treat_as_binary=False)
    cvg = CVSplitGenerator(prob, n_folds=10, n_repartitions=10, random_state=rnd)

    cv = CrossValidatedAnalysis(prob, approach, cv_generator=cvg,
                                runner=SerialRunner(), params=learn_params)
    results = cv.run()

    renderer = ReportRenderer(out_dir)
    ClassificationReport(renderer, False, prob.label_list).generate(results)
    nose.tools.ok_(os.path.exists(os.path.join(out_dir, 'sample_confusion_matrix.txt')))

    average_accuracy = compute_average_accuracy(results)
    nose.tools.assert_almost_equal(0.95, average_accuracy, delta=0.01)

    classifier = SelectAndClassify(SelectKBest(score_func=f_pearson, k=3),
                                   RandomForestClassifier(random_state=2016),
                                   name='test multiclass model').fit(prob)
    model = ClassificationModel(classifier, prob)
    model.write(model_path)

    read_model = ClassificationModel.read(model_path)
    auc_average = read_model.training_auc
    nose.tools.assert_almost_equal(1.0, auc_average, delta=1e-6)

def checkme(selector_grid, classifier_grid, optimal_params):
    """Utility: runs grid search and verifies that we selected the right parameters"""
    prob = mock_problem()
    learner = SelectAndClassify(SelectKBest(), LogisticRegression(),
                                selector_grid=selector_grid,
                                classifier_grid=classifier_grid,
                                grid_search_scorer=make_test_grid_scorer(optimal_params),
                                grid_search_cv_folds=2,
                                grid_search_cv_repartitions=1,
                                randomized_grid_size_cutoff=None)
    model_params = learner.fit(prob).model.get_params()
    params_to_check = sorted(optimal_params.keys())
    nose.tools.assert_list_equal([(k, model_params[k]) for k in params_to_check],
                                 [(k, optimal_params[k]) for k in params_to_check])

def checkme(working_dir, n_samples, n_features, k, make_classifier, test_vectorize):
    """Utility: round-trips a fitted model through serialization and checks that predictions are preserved"""
    assert n_samples % 4 == 0
    model_path = os.path.join(working_dir, 'model.txt')
    prob = mock_problem(n_samples=n_samples, n_features=n_features)

    if test_vectorize:
        df = prob.dataframe
        df['discrete_1'] = ['foo', 'bar'] * int(n_samples / 2)
        df['discrete_2'] = ['foo', 'bar', 'baz', float('nan')] * int(n_samples / 4)
        df['continuous_with_missing'] = [0, 1, 2, float('nan')] * int(n_samples / 4)
        prob = Problem(df,
                       prob.features + ['discrete_1', 'discrete_2', 'continuous_with_missing'],
                       prob.outcome_column, prob.positive_outcome)
        preprocess = ProblemVectorizer()
    else:
        preprocess = None

    approach = SelectAndClassify(SelectKBest(k=k), make_classifier(), preprocess=preprocess).fit(prob)
    model = ClassificationModel(approach, prob)
    model.write(model_path)

    reconstituted_model = ClassificationModel.read(model_path)

    model.validate()
    reconstituted_model.validate()

    np.testing.assert_array_equal(model.approach.apply(prob), reconstituted_model.approach.apply(prob))

    if preprocess is not None:
        # Test approach serialization with a Pipeline from learners.py
        approach_pipeline = ApproachPipeline([('preprocess', preprocess)])
        approach_with_pipeline = SelectAndClassify(SelectKBest(k=k), make_classifier(),
                                                   preprocess=approach_pipeline).fit(prob)
        model_with_pipeline = ClassificationModel(approach_with_pipeline, prob)
        model_path2 = os.path.join(working_dir, 'model2.txt')
        model_with_pipeline.write(model_path2)
        reconstituted_model2 = ClassificationModel.read(model_path2)
        reconstituted_model2.validate()
        np.testing.assert_array_almost_equal(model.approach.apply(prob),
                                             reconstituted_model2.approach.apply(prob), 14)

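# A hypothetical invocation of the serialization utility above, for illustration only. The
# concrete values are assumptions rather than the module's real parameterizations; n_samples
# must stay divisible by 4 because of the mock discrete/missing-value columns, and
# make_classifier must be a zero-argument callable returning a classifier.
#
#     checkme(working_dir, n_samples=100, n_features=20, k=5,
#             make_classifier=GaussianNB, test_vectorize=True)
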
def checkme(n_samples, n_features, n_features_to_select, theta):
    """Utility: trains and evaluates a model on a mock problem and sanity-checks the results"""
    prob = mock_problem(n_samples, n_features, theta)
    train = prob[prob.dataframe['train_or_test'] == 'train']
    test = prob[prob.dataframe['train_or_test'] == 'test']

    approach = LoggingApproach(SelectAndClassify(SelectKBest(k=n_features_to_select), LogisticRegression()))
    results = train_and_evaluate_model(approach, train, test, LearningParameters())
    model = results['approach']['model']

    # Simple sanity checks
    nose.tools.assert_is_not_none(model)
    nose.tools.eq_(results['train']['n_samples'], len(train))
    nose.tools.eq_(results['test']['n_samples'], len(test))
    nose.tools.eq_(len(results['approach']['selected_features']), n_features_to_select)
    nose.tools.assert_greater(results['train']['metrics']['auc'], 0.75)
    nose.tools.assert_greater(results['test']['metrics']['auc'], 0.75)
    nose.tools.assert_greater(results['train']['metrics']['auc'], results['test']['metrics']['auc'])

    for name, sub_prob in [('train', train), ('test', test)]:
        nose.tools.assert_list_equal(results[name]['outcome'],
                                     list(sub_prob.dataframe[sub_prob.outcome_column]))
        nose.tools.eq_(results[name]['positive_outcome'], 1)

    # Make sure we trained on the right samples
    nose.tools.eq_(len(model.fit_problems), 1)  # fitting the same model multiple times might cause problems
    np.testing.assert_array_equal(train.X, model.fit_problems[-1].X)
    np.testing.assert_array_equal(train.y, model.fit_problems[-1].y)

    # Make sure we applied the model to both training and test
    for X in [train.X, test.X]:
        nose.tools.ok_(any([np.array_equal(X, applied_prob.X) for applied_prob in model.apply_problems]))

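# A hypothetical invocation of the evaluation utility above, for illustration only. The values
# are assumptions: theta controls the signal strength of mock_problem() and has to be large
# enough for the AUC > 0.75 assertions to hold, so treat these numbers as placeholders.
#
#     checkme(n_samples=200, n_features=20, n_features_to_select=5, theta=2.0)
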
def checkme(optimal_params):
    """Utility: runs grid search and verifies that we selected (approximately) the right parameters"""
    np.random.seed(0xC0FFEE)
    prob = mock_problem(n_samples=100)
    learner = SelectAndClassify(SelectKBest(), LogisticRegression(),
                                selector_grid={'k': [10, 20]},
                                classifier_grid={'C': np.linspace(0.5, 1.0, 1000)},
                                grid_search_scorer=make_test_grid_scorer(optimal_params),
                                grid_search_cv_folds=2,
                                grid_search_cv_repartitions=1,
                                randomized_grid_size_cutoff=100)
    model_params = learner.fit(prob).model.get_params()
    for param_name in sorted(optimal_params.keys()):
        # Might not be exactly optimal, but should be close
        tolerance = 0.05 * abs(optimal_params[param_name])
        nose.tools.assert_almost_equal(model_params[param_name], optimal_params[param_name],
                                       delta=tolerance)

    error_log.append(make_test_grid_scorer(optimal_params)(learner.model, prob.X, prob.y))

def test_comparative_report():
    """Verifies that we correctly generate a comparative classification report"""
    features = ['f1', 'f2', 'f3', 'f4', 'f5']
    cv_results_1 = mock_cv_results_list(features, 'my test approach 1',
                                        [['f1', 'f2'], ['f2', 'f5']],
                                        [[0.1, 0.2, 0.8, 1.0] * 25, [0.0, 0.1, 0.7, 0.9] * 25],
                                        [[0.1, 0.2, 0.8, 1.0] * 25, [0.0, 0.1, 0.7, 0.9] * 25],
                                        [0.9, 1.0], [0.66, 0.76], [0.05, 0.01])
    cv_results_2 = mock_cv_results_list(features, 'my test approach 1',
                                        [['f2', 'f3'], ['f3', 'f4']],
                                        [[0.1, 0.2, 0.8, 1.0] * 25, [0.0, 0.1, 0.7, 0.9] * 25],
                                        [[0.1, 0.2, 0.8, 1.0] * 25, [0.0, 0.1, 0.7, 0.9] * 25],
                                        [0.95, 0.97], [0.85, 0.78], [0.05, 0.01])
    approach1 = SelectAndClassify(SelectKBest(k=2), LogisticRegression(), name="logit")
    approach2 = SelectAndClassify(SelectKBest(k=3), GaussianNB(), name="nb")
    cv_results = {approach1: cv_results_1, approach2: cv_results_2}

    renderer = LoggingRenderer()
    report = ComparativeClassificationReport(renderer)
    report.generate(cv_results)

    for expected_plot in ['score_plots']:
        nose.tools.assert_true(expected_plot in renderer.plots)

    for expected_table in ['mean_scores', 'mean_metrics']:
        nose.tools.assert_true(expected_table in renderer.tables)

        # Should have the exact same number of entries for each approach
        groups = dict(list(renderer.tables[expected_table].groupby('approach')))
        nose.tools.eq_(set(groups.keys()), {'logit', 'nb'})
        assert_lengths_equal(*list(groups.values()))

    metrics_df = renderer.tables["mean_metrics"]
    nose.tools.assert_equal(dict(list(zip(metrics_df["approach"], metrics_df["train_auc"]))),
                            {'logit': 0.95, 'nb': 0.96})
    nose.tools.assert_equal(dict(list(zip(metrics_df["approach"], metrics_df["test_auc"]))),
                            {'logit': 0.71, 'nb': 0.815})

def test_preprocessing():
    """Tests feature preprocessing"""
    base_prob = mock_problem()
    base_prob.features.append('discrete_feat')

    # Derive a problem with a single discrete feature perfectly correlated with the label
    df = pd.DataFrame(base_prob.dataframe, copy=True)
    df['discrete_feat'] = 'negative'
    df['discrete_feat'].values[base_prob.y == 1] = 'positive'

    # Verify that a manual upfront vectorize is equivalent to passing a vectorizer as the preprocess step
    # to SelectAndClassify
    prob = base_prob.set_data(df)
    vectorized_prob = ProblemVectorizer().fit_apply(prob)

    baseline_classifier = SelectAndClassify(SelectKBest(k='all'), LogisticRegression(), preprocess=None)
    preprocess_classifier = SelectAndClassify(SelectKBest(k='all'), LogisticRegression(),
                                              preprocess=ProblemVectorizer())

    # First make sure that the baseline classifier cannot be fit on the unvectorized data
    nose.tools.assert_raises(ValueError, lambda: baseline_classifier.fit_apply(prob))

    baseline_scores = baseline_classifier.fit_apply(vectorized_prob)
    preprocess_scores = preprocess_classifier.fit_apply(prob)
    np.testing.assert_allclose(baseline_scores, preprocess_scores)

def make_learning_approaches(self):
    """Creates LearningApproaches from the learner options"""
    for k in self.n_features:
        for cls in self.classifiers:
            yield SelectAndClassify(SelectKBest(k=k), cls,
                                    name='SelectKBest(k={}) -> {}'.format(k, cls.__class__.__name__),
                                    preprocess=ProblemVectorizer())

def mock_model():
    """Creates a simple mock model for testing"""
    prob = mock_problem()
    logit = SelectAndClassify(selector=None, classifier=LogisticRegression(),
                              preprocess=ProblemVectorizer(), name="test model").fit(prob)
    return ClassificationModel(logit, prob)

def test_overlapping_train_and_test():
    """Validates that we fail if samples overlap between training and test"""
    prob = mock_problem()
    train = prob[prob.dataframe['train_or_test'] == 'train']
    test = prob[prob.dataframe['train_or_test'] == 'test']
    approach = SelectAndClassify(SelectKBest(k='all'), LogisticRegression())
    params = LearningParameters()

    train_and_evaluate_model(approach, train, test, params)  # no overlap -- should just work

    # Overlapping training and test sets should raise
    nose.tools.assert_raises(ValueError, lambda: train_and_evaluate_model(approach, train, train, params))
    nose.tools.assert_raises(ValueError, lambda: train_and_evaluate_model(approach, test, test, params))

def test_binary_report_with_score_vector():
    """Tests that in the binary case the score vector contains the same data as the positive-outcome-only score"""
    data = []
    class_values = ['A', 'B']
    for index_class in range(4):
        data = mock_coords_data(data, index_class, class_values[index_class % 2],
                                data2=None, append_missed=False)[0]
    df = pd.DataFrame(columns=['coord0', 'coord1', 'class'], data=data)
    prob = Problem(df, ['coord0', 'coord1'], 'class', 'B')
    classifier = SelectAndClassify(SelectKBest(k='all'), LogisticRegression(),
                                   name='test binary with score vector').fit(prob)
    y_score_positive = classifier.apply(prob)
    y_score_all = classifier.apply(prob, False)
    nose.tools.ok_(np.allclose(y_score_positive, y_score_all[:, 1]))

def test_null_feature_selector():
    """Validates that SelectAndClassify works with a null feature selector"""
    def make_fixed_rs():
        """Utility: makes a fixed random state for use in this test"""
        return np.random.RandomState(0xC0FFEE)

    prob = mock_problem()

    # selector=None and SelectKBest(k='all') should produce identical predictions
    no_select_approach = SelectAndClassify(None, LogisticRegression(random_state=make_fixed_rs()),
                                           classifier_grid={'C': [0.5, 1.0]},
                                           random_state=make_fixed_rs()).fit(prob)
    select_all_approach = SelectAndClassify(SelectKBest(k='all'),
                                            LogisticRegression(random_state=make_fixed_rs()),
                                            classifier_grid={'C': [0.5, 1.0]},
                                            random_state=make_fixed_rs()).fit(prob)

    # There should be no selection step in the underlying model
    nose.tools.eq_(len(no_select_approach.model.steps), len(select_all_approach.model.steps) - 1)

    # We should still be logging the right features
    nose.tools.assert_list_equal(no_select_approach.selected_features, prob.features)

    # Scores should be identical as k='all'
    np.testing.assert_allclose(no_select_approach.apply(prob), select_all_approach.apply(prob))

def test_multiclass_auc():
    """Tests the AUC value for a multiclass problem"""
    data = []
    class_values = ['A', 'B', 'C', 'D']
    for index_class in range(4):
        data, _ = mock_coords_data(data, index_class, class_values[index_class], None, True)
    df = pd.DataFrame(columns=['coord0', 'coord1', 'class'], data=data)
    prob = Problem(df, ['coord0', 'coord1'], 'class', None)
    classifier = SelectAndClassify(SelectKBest(k='all'), LogisticRegression(),
                                   name='test multiclass model').fit(prob)
    model = ClassificationModel(classifier, prob)
    auc_average = model.training_auc
    nose.tools.assert_almost_equal(0.853333333, auc_average, delta=1e-6)

    prob_binary = Problem(df, ['coord0', 'coord1'], 'class', 'A')
    classifier_binary = SelectAndClassify(SelectKBest(k='all'), LogisticRegression(),
                                          name='binary model').fit(prob_binary)
    model_binary = ClassificationModel(classifier_binary, prob_binary)
    auc_binary = model_binary.training_auc
    nose.tools.assert_almost_equal(auc_binary, auc_average, delta=1e-6)

def test_multiclass_label_subset():
    """Tests y_score for a multiclass problem whose training set contains only a subset of the possible classes"""
    data = []
    data2 = []
    class_values = ['A', 'B', 'C', 'D']
    for index_class in range(4):
        data, data2 = mock_coords_data(data, index_class, class_values[index_class], data2, True)
    df = pd.DataFrame(columns=['coord0', 'coord1', 'class'], data=data)
    prob = Problem(df, ['coord0', 'coord1'], 'class', None)
    df2 = pd.DataFrame(columns=['coord0', 'coord1', 'class'], data=data2)
    prob2 = Problem(df2, ['coord0', 'coord1'], 'class', None, prob.label_list)
    classifier = SelectAndClassify(SelectKBest(k='all'), LogisticRegression(),
                                   name='test multiclass model').fit(prob2)
    y_pred = classifier.predict(prob2)
    y_score = classifier.prediction_probabilities(prob2)

    # Check that the "C" class gets probability 0 for every sample
    for i_row in range(y_pred.shape[0]):
        nose.tools.assert_almost_equal(0.0, y_score[i_row, 2], delta=1e-6)

def cv_analysis_run_one(task_runner=None):
    """\
    Does one CV analysis run, then validates and returns the results
    :param task_runner: Task runner to use for the CV analysis mockup
    :return: CV analysis results
    """
    prob = mock_problem(n_samples=1000, n_features=100)
    approach = SelectAndClassify(SelectKBest(k=17),
                                 LogisticRegression(random_state=np.random.RandomState(0xC0FFEE)))
    cv_generator = CVSplitGenerator(prob, n_folds=10, n_repartitions=2,
                                    random_state=np.random.RandomState(0xC0FFEE))
    analysis = CrossValidatedAnalysis(prob, approach, cv_generator=cv_generator, runner=task_runner)

    results = analysis.run()
    nose.tools.eq_(len(results), cv_generator.n_total_splits)  # One per CV split
    for field_name in ['metrics', 'n_samples']:
        nose.tools.ok_(all([field_name in r['train'] for r in results]))
        nose.tools.ok_(all([field_name in r['test'] for r in results]))

    return results

def learning_curves_run_one(fractions=None):
    """Runs a single LearningCurves analysis and validates the output"""
    prob = mock_problem(n_samples=1000, n_features=100)
    approach = SelectAndClassify(SelectKBest(k=17),
                                 LogisticRegression(random_state=np.random.RandomState(0xC0FFEE)))
    cv_generator = CVSplitGenerator(prob, n_folds=10, n_repartitions=2,
                                    random_state=np.random.RandomState(0xC0FFEE))
    analysis = LearningCurveAnalysis(prob, approach, cv_generator=cv_generator,
                                     fractions=fractions, runner=SerialRunner())
    results = analysis.run()
    nose.tools.eq_(len(results), len(fractions))  # One per fraction

    for fraction in sorted(results):
        nose.tools.eq_(len(results[fraction]), cv_generator.n_total_splits)
        for field_name in ['metrics', 'n_samples']:
            nose.tools.ok_(all([field_name in r['train'] for r in results[fraction]]))
            nose.tools.ok_(all([field_name in r['test'] for r in results[fraction]]))

        seen_test_samples = Counter()
        for split_results in results[fraction]:
            # No train/test overlap
            nose.tools.eq_(set(split_results['train']['sample']) & set(split_results['test']['sample']), set())

            # Size of the test set should be 10% of problem size
            nose.tools.eq_(len(split_results['test']['sample']), 0.1 * prob.n_samples)

            # Size of the training set should be 90% of problem size * fraction
            nose.tools.assert_almost_equal(len(split_results['train']['sample']),
                                           0.9 * fraction * prob.n_samples, delta=1)

            # Record test samples
            seen_test_samples.update(split_results['test']['sample'])

        # Must have seen all test samples, all of them the same number of times
        nose.tools.eq_(set(seen_test_samples.keys()), set(prob.sample_ids))
        nose.tools.eq_(set(seen_test_samples.values()), set([cv_generator.n_repartitions]))

    # Test sets should be identical across fractions. Otherwise the difference between fractions would be a
    # product of both the training set size and different CV splits, but we only really care about the former.
    test_sets_by_fraction = {fraction: tuple([tuple(sorted(set(split_results['test']['sample'])))
                                              for split_results in results[fraction]])
                             for fraction in sorted(results.keys())}
    nose.tools.eq_(len(set(test_sets_by_fraction.values())), 1)

    return results

def test_feature_engineering():
    """Tests feature engineering"""
    prob = mock_problem()
    classifier = SelectAndClassify(SelectKBest(k='all'), LogisticRegression(),
                                   feature_engineering=PCA(n_components=2))
    model = classifier.fit(prob).model
    steps = dict(model.steps)
    nose.tools.ok_('PCA' in str(classifier))
    nose.tools.ok_('feature_engineering' in steps)
    nose.tools.assert_is_not_none(steps['feature_engineering'].components_)

    # Check that classifier.apply() works
    nose.tools.eq_(len(classifier.apply(prob)), prob.n_samples)

    # Test that SelectAndClassify still works without feature engineering
    classifier = SelectAndClassify(SelectKBest(k='all'), LogisticRegression())
    model = classifier.fit(prob).model
    steps = dict(model.steps)
    nose.tools.ok_('PCA' not in str(classifier))
    nose.tools.ok_('feature_engineering' not in steps)

def test_model_validation(working_dir):
    """Validates that we fail if a model has been corrupted or otherwise produces bad output"""
    model_path = os.path.join(working_dir, 'model.txt')
    prob = mock_problem()
    approach = SelectAndClassify(SelectKBest(k=7), LogisticRegression()).fit(prob)
    model = ClassificationModel(approach, prob)
    model.write(model_path)

    # Change an expected score for a sample -- this should cause model loading to fail because actual
    # classifier output will no longer match the expected output
    with open(model_path, 'r') as f:
        model_string = '\n'.join(f.readlines())

    nose.tools.ok_(str(model.expected_scores[17]) in model_string)
    bad_model_string = model_string.replace(str(model.expected_scores[17]),
                                            str(model.expected_scores[17] + 0.5))

    with open(model_path, 'w') as f:
        f.write(bad_model_string)

    nose.tools.assert_raises(ValueError, lambda: ClassificationModel.read(model_path))