    def checkme(selector_grid, classifier_grid, expected_grid, expected_size):
        """Utility"""
        learner = SelectAndClassify(SelectKBest(), LogisticRegression(), selector_grid=selector_grid,
                                    classifier_grid=classifier_grid)
        actual_grid, actual_grid_size = learner._get_grid()  # pylint: disable=protected-access
        nose.tools.eq_(actual_grid, expected_grid)
        nose.tools.eq_(actual_grid_size, expected_size)
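    # Illustrative only: a plausible call into the utility above. The expected grid layout is an
    # assumption, namely that SelectAndClassify prefixes grid parameters sklearn-Pipeline-style
    # ('selector__k', 'classifier__C') and that the grid size is the product of the value counts.
    checkme(selector_grid={'k': [5, 10]},
            classifier_grid={'C': [0.1, 1.0]},
            expected_grid={'selector__k': [5, 10], 'classifier__C': [0.1, 1.0]},
            expected_size=4)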
def test_predict_tool(working_dir):
    """Tests that the predict.py command line tool works as expected"""
    out_dir = os.path.join(working_dir, 'learn_output')
    model_path = os.path.join(out_dir, 'model.txt')
    predictions_path = os.path.join(out_dir, 'predictions.txt')

    # Mock up some input data
    prob_path, prob = mock_input(working_dir)
    os.mkdir(out_dir)

    # Train a model and save it to a file
    classifier = SelectAndClassify(SelectKBest(k=5),
                                   GaussianNB(),
                                   name='test model').fit(prob)
    model = ClassificationModel(classifier, prob)
    model.write(model_path)

    # Run the predict tool with the model using the training data loaded from a file, and validate that
    # the returned predictions match
    predict.main(
        [model_path, prob_path, predictions_path, '--index_col', 'sample_id'])

    expected_predictions = pd.DataFrame({
        'sample_id': prob.sample_ids,
        'score': classifier.apply(prob)
    })
    actual_predictions = pd.read_csv(predictions_path, sep='\t')

    np.testing.assert_allclose(actual_predictions['score'].values,
                               expected_predictions['score'].values)
def test_map_label_to_class_index():
    """ test utility SelectAndClassify.map_label_to_class_index """
    nose.tools.eq_(SelectAndClassify.map_label_to_class_index(['A', 'B', 'C', 'D'], [0, 1, 3]),
                   {'A': 0, 'B': 1, 'D': 2})
    nose.tools.eq_(SelectAndClassify.map_label_to_class_index(['A', 'B', 'C', 'D'], [0, 3]),
                   {'A': 0, 'D': 1})
    nose.tools.eq_(SelectAndClassify.map_label_to_class_index(['A', 'B', 'C', 'D'], [2]),
                   {'C': 0})
    nose.tools.eq_(SelectAndClassify.map_label_to_class_index(['A', 'B', 'C', 'D'], [0, 1, 2]),
                   {'A': 0, 'B': 1, 'C': 2})
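# map_label_to_class_index is exercised above; a minimal sketch of its apparent behaviour,
# inferred from the assertions: labels whose positions appear in the class-index list are
# re-numbered consecutively. Illustrative only; the real static method lives on SelectAndClassify.
def map_label_to_class_index_sketch(label_list, class_index_list):
    """Maps each retained label to its position among the retained class indices"""
    return {label_list[class_index]: position
            for position, class_index in enumerate(sorted(class_index_list))}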
def test_multiclass(working_dir):
    """ Tests machine learning classification workfloor with multiclass for iris dataset
        see http://scikit-learn.org/stable/modules/multiclass.html """

    out_dir = os.path.join(working_dir, 'learn_output')
    model_path = os.path.join(out_dir, 'model.txt')

    iris = datasets.load_iris()

    df = iris_to_df(iris)

    features = [feat for feat in df.columns if feat not in ['Target']]

    prob = Problem(df, features, "Target", positive_outcome=None)
    rnd = np.random.RandomState(2016)
    approach = SelectAndClassify(SelectKBest(score_func=f_pearson, k=3),
                                 RandomForestClassifier(random_state=rnd))

    learn_params = LearningParameters(
        metrics={'auc': roc_auc_score, 'accuracy': accuracy_from_confusion_matrix},
        treat_as_binary=False)
    cvg = CVSplitGenerator(prob,
                           n_folds=10,
                           n_repartitions=10,
                           random_state=rnd)

    cv = CrossValidatedAnalysis(prob,
                                approach,
                                cv_generator=cvg,
                                runner=SerialRunner(),
                                params=learn_params)

    results = cv.run()
    renderer = ReportRenderer(out_dir)
    ClassificationReport(renderer, False, prob.label_list).generate(results)
    nose.tools.ok_(
        os.path.exists(os.path.join(out_dir, 'sample_confusion_matrix.txt')))
    average_accuracy = compute_average_accuracy(results)
    nose.tools.assert_almost_equal(0.95, average_accuracy, delta=0.01)

    classifier = SelectAndClassify(SelectKBest(score_func=f_pearson, k=3),
                                   RandomForestClassifier(random_state=2016),
                                   name='test multiclass model').fit(prob)
    model = ClassificationModel(classifier, prob)
    model.write(model_path)

    read_model = ClassificationModel.read(model_path)

    auc_average = read_model.training_auc
    nose.tools.assert_almost_equal(1.0, auc_average, delta=1e-6)
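# iris_to_df is a small fixture used above; a minimal sketch under the assumption that it simply
# packs the sklearn iris Bunch into a DataFrame with the class label in a 'Target' column, which
# is all that test_multiclass relies on.
def iris_to_df_sketch(iris):
    """Converts the sklearn iris Bunch into a DataFrame with a 'Target' outcome column"""
    df = pd.DataFrame(iris.data, columns=iris.feature_names)
    df['Target'] = iris.target
    return df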
    def checkme(selector_grid, classifier_grid, optimal_params):
        """Utility: runs grid search and verifies that we selected the right parameters"""
        prob = mock_problem()
        learner = SelectAndClassify(SelectKBest(), LogisticRegression(), selector_grid=selector_grid,
                                    classifier_grid=classifier_grid,
                                    grid_search_scorer=make_test_grid_scorer(optimal_params),
                                    grid_search_cv_folds=2, grid_search_cv_repartitions=1,
                                    randomized_grid_size_cutoff=None)
        model_params = learner.fit(prob).model.get_params()
        params_to_check = sorted(optimal_params.keys())
        nose.tools.assert_list_equal([(k, model_params[k]) for k in params_to_check],
                                     [(k, optimal_params[k]) for k in params_to_check])
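# make_test_grid_scorer is a shared test helper; a plausible sketch, assuming it returns an
# sklearn-style scorer(estimator, X, y) that is largest when the estimator's parameters equal
# optimal_params, so that the grid search above is steered toward exactly those values.
def make_test_grid_scorer_sketch(optimal_params):
    """Builds a scorer that penalizes distance from the target parameter values"""
    def scorer(estimator, X, y):  # pylint: disable=unused-argument
        """Negative absolute distance between the estimator's params and the optimum"""
        params = estimator.get_params()
        return -sum(abs(params[name] - value) for name, value in optimal_params.items())
    return scorer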
    def checkme(working_dir, n_samples, n_features, k, make_classifier,
                test_vectorize):
        """Utility"""
        assert n_samples % 4 == 0
        model_path = os.path.join(working_dir, 'model.txt')
        prob = mock_problem(n_samples=n_samples, n_features=n_features)
        if test_vectorize:
            df = prob.dataframe
            df['discrete_1'] = ['foo', 'bar'] * int(n_samples / 2)
            df['discrete_2'] = ['foo', 'bar', 'baz',
                                float('nan')] * int(n_samples / 4)
            df['continuous_with_missing'] = [0, 1, 2, float('nan')] * int(
                n_samples / 4)
            prob = Problem(
                df, prob.features +
                ['discrete_1', 'discrete_2', 'continuous_with_missing'],
                prob.outcome_column, prob.positive_outcome)
            preprocess = ProblemVectorizer()
        else:
            preprocess = None

        approach = SelectAndClassify(SelectKBest(k=k),
                                     make_classifier(),
                                     preprocess=preprocess).fit(prob)
        model = ClassificationModel(approach, prob)

        model.write(model_path)
        reconstituted_model = ClassificationModel.read(model_path)

        model.validate()
        reconstituted_model.validate()

        np.testing.assert_array_equal(model.approach.apply(prob),
                                      reconstituted_model.approach.apply(prob))

        if preprocess is not None:
            approach_pipeline = ApproachPipeline([('preprocess', preprocess)])
            approach_with_pipeline = SelectAndClassify(
                SelectKBest(k=k),
                make_classifier(),
                preprocess=approach_pipeline).fit(prob)
            # test approach serialization with Pipeline from learners.py
            model_with_pipeline = ClassificationModel(approach_with_pipeline,
                                                      prob)
            model_path2 = os.path.join(working_dir, 'model2.txt')
            model_with_pipeline.write(model_path2)
            reconstituted_model2 = ClassificationModel.read(model_path2)
            reconstituted_model2.validate()
            np.testing.assert_array_almost_equal(
                model.approach.apply(prob),
                reconstituted_model2.approach.apply(prob), 14)
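    # Illustrative only: plausible calls the enclosing test might make into the utility above.
    # The classifier factories and sizes are assumptions; they respect the n_samples % 4 == 0 guard.
    checkme(working_dir, n_samples=100, n_features=20, k=5,
            make_classifier=GaussianNB, test_vectorize=False)
    checkme(working_dir, n_samples=100, n_features=20, k=5,
            make_classifier=LogisticRegression, test_vectorize=True)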
    def checkme(n_samples, n_features, n_features_to_select, theta):
        """Utility"""
        prob = mock_problem(n_samples, n_features, theta)
        train = prob[prob.dataframe['train_or_test'] == 'train']
        test = prob[prob.dataframe['train_or_test'] == 'test']

        approach = LoggingApproach(SelectAndClassify(SelectKBest(k=n_features_to_select), LogisticRegression()))
        results = train_and_evaluate_model(approach, train, test, LearningParameters())
        model = results['approach']['model']

        # Simple sanity checks
        nose.tools.assert_is_not_none(model)
        nose.tools.eq_(results['train']['n_samples'], len(train))
        nose.tools.eq_(results['test']['n_samples'], len(test))
        nose.tools.eq_(len(results['approach']['selected_features']), n_features_to_select)
        nose.tools.assert_greater(results['train']['metrics']['auc'], 0.75)
        nose.tools.assert_greater(results['test']['metrics']['auc'], 0.75)
        nose.tools.assert_greater(results['train']['metrics']['auc'], results['test']['metrics']['auc'])

        for name, sub_prob in [('train', train), ('test', test)]:
            nose.tools.assert_list_equal(results[name]['outcome'], list(sub_prob.dataframe[sub_prob.outcome_column]))
            nose.tools.eq_(results[name]['positive_outcome'], 1)

        # Make sure we trained on the right samples
        nose.tools.eq_(len(model.fit_problems), 1)  # fitting the same model multiple times might cause problems
        np.testing.assert_array_equal(train.X, model.fit_problems[-1].X)
        np.testing.assert_array_equal(train.y, model.fit_problems[-1].y)

        # Make sure we applied the model to both training and test
        for X in [train.X, test.X]:
            nose.tools.ok_(any([np.array_equal(X, applied_prob.X) for applied_prob in model.apply_problems]))
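# LoggingApproach is a test double used above; a minimal sketch, assuming it records every
# Problem passed to fit()/apply() (exposed as fit_problems/apply_problems) and otherwise
# delegates to the wrapped approach so that attributes such as selected_features keep working.
class LoggingApproachSketch(object):
    """Wraps a learning approach and logs the problems it was fit on and applied to"""
    def __init__(self, approach):
        self.approach = approach
        self.fit_problems = []
        self.apply_problems = []

    def fit(self, prob):
        """Records the training problem and fits the wrapped approach"""
        self.fit_problems.append(prob)
        self.approach.fit(prob)
        return self

    def apply(self, prob):
        """Records the applied problem and returns the wrapped approach's scores"""
        self.apply_problems.append(prob)
        return self.approach.apply(prob)

    def __getattr__(self, name):
        return getattr(self.approach, name)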
    def checkme(optimal_params):
        """Utility: runs grid search and verifies that we selected (approximately) the right parameters"""
        np.random.seed(0xC0FFEE)
        prob = mock_problem(n_samples=100)
        learner = SelectAndClassify(SelectKBest(), LogisticRegression(), selector_grid={'k': [10, 20]},
                                    classifier_grid={'C': np.linspace(0.5, 1.0, 1000)},
                                    grid_search_scorer=make_test_grid_scorer(optimal_params),
                                    grid_search_cv_folds=2, grid_search_cv_repartitions=1,
                                    randomized_grid_size_cutoff=100)
        model_params = learner.fit(prob).model.get_params()
        for param_name in sorted(optimal_params.keys()):
            # Might not be exactly optimal, but should be close
            tolerance = 0.05 * abs(optimal_params[param_name])
            nose.tools.assert_almost_equal(model_params[param_name], optimal_params[param_name],
                                           delta=tolerance)

        error_log.append(make_test_grid_scorer(optimal_params)(learner.model, prob.X, prob.y))
def test_comparative_report():
    """Verifies that we correctly generate a classification report"""
    features = ['f1', 'f2', 'f3', 'f4', 'f5']
    cv_results_1 = mock_cv_results_list(
        features, 'my test approach 1', [['f1', 'f2'], ['f2', 'f5']],
        [[0.1, 0.2, 0.8, 1.0] * 25, [0.0, 0.1, 0.7, 0.9] * 25],
        [[0.1, 0.2, 0.8, 1.0] * 25, [0.0, 0.1, 0.7, 0.9] * 25], [0.9, 1.0],
        [0.66, 0.76], [0.05, 0.01])
    cv_results_2 = mock_cv_results_list(
        features, 'my test approach 1', [['f2', 'f3'], ['f3', 'f4']],
        [[0.1, 0.2, 0.8, 1.0] * 25, [0.0, 0.1, 0.7, 0.9] * 25],
        [[0.1, 0.2, 0.8, 1.0] * 25, [0.0, 0.1, 0.7, 0.9] * 25], [0.95, 0.97],
        [0.85, 0.78], [0.05, 0.01])
    approach1 = SelectAndClassify(SelectKBest(k=2),
                                  LogisticRegression(),
                                  name="logit")
    approach2 = SelectAndClassify(SelectKBest(k=3), GaussianNB(), name="nb")
    cv_results = {approach1: cv_results_1, approach2: cv_results_2}
    renderer = LoggingRenderer()
    report = ComparativeClassificationReport(renderer)
    report.generate(cv_results)

    for expected_plot in ['score_plots']:
        nose.tools.assert_true(expected_plot in renderer.plots)

    for expected_table in ['mean_scores', 'mean_metrics']:
        nose.tools.assert_true(expected_table in renderer.tables)

        # Should have the exact same number of entries for each approach
        groups = dict(list(
            renderer.tables[expected_table].groupby('approach')))
        nose.tools.eq_(set(groups.keys()), {'logit', 'nb'})
        assert_lengths_equal(*list(groups.values()))

    metrics_df = renderer.tables["mean_metrics"]
    nose.tools.assert_equal(
        dict(list(zip(metrics_df["approach"], metrics_df["train_auc"]))), {
            'logit': 0.95,
            'nb': 0.96
        })
    nose.tools.assert_equal(
        dict(list(zip(metrics_df["approach"], metrics_df["test_auc"]))), {
            'logit': 0.71,
            'nb': 0.815
        })
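# assert_lengths_equal is a small helper used above; a minimal sketch, assuming it simply
# verifies that all of its arguments have the same length.
def assert_lengths_equal_sketch(*sequences):
    """Asserts that every passed sequence has the same length"""
    lengths = set(len(seq) for seq in sequences)
    nose.tools.eq_(len(lengths), 1, 'Expected equal lengths, got {}'.format(sorted(lengths)))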
def test_preprocessing():
    """Tests feature preprocessing"""
    base_prob = mock_problem()
    base_prob.features.append('discrete_feat')

    # Derive a problem with a single discrete feature perfectly correlated with the label
    df = pd.DataFrame(base_prob.dataframe, copy=True)
    df['discrete_feat'] = 'negative'
    df['discrete_feat'].values[base_prob.y == 1] = 'positive'

    # Verify that vectorizing manually up front is equivalent to passing a vectorizer as the preprocess step
    # to SelectAndClassify
    prob = base_prob.set_data(df)
    vectorized_prob = ProblemVectorizer().fit_apply(prob)

    baseline_classifier = SelectAndClassify(SelectKBest(k='all'), LogisticRegression(), preprocess=None)
    preprocess_classifier = SelectAndClassify(SelectKBest(k='all'), LogisticRegression(),
                                              preprocess=ProblemVectorizer())

    # First make sure that the baseline classifier cannot be fit on the unvectorized data
    nose.tools.assert_raises(ValueError, lambda: baseline_classifier.fit_apply(prob))

    baseline_scores = baseline_classifier.fit_apply(vectorized_prob)
    preprocess_scores = preprocess_classifier.fit_apply(prob)

    np.testing.assert_allclose(baseline_scores, preprocess_scores)
    def make_learning_approaches(self):
        """Creates LearningApproaches from the learner options"""
        for k in self.n_features:
            for cls in self.classifiers:
                yield SelectAndClassify(SelectKBest(k=k),
                                        cls,
                                        name='SelectKBest(k={}) -> {}'.format(k, cls.__class__.__name__),
                                        preprocess=ProblemVectorizer())
def mock_model():
    """Creates a simple mock model for testing"""
    prob = mock_problem()
    logit = SelectAndClassify(selector=None,
                              classifier=LogisticRegression(),
                              preprocess=ProblemVectorizer(),
                              name="test model").fit(prob)

    return ClassificationModel(logit, prob)
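# mock_problem is the central fixture in these tests; a rough sketch only, assuming it builds a
# Problem from random features, a binary 0/1 outcome (positive outcome 1), a 'sample_id' column
# and a 'train_or_test' split, with theta controlling how separable the classes are. The real
# fixture's column names and signal model may differ.
def mock_problem_sketch(n_samples=100, n_features=10, theta=1.0):
    """Builds a small synthetic binary classification Problem (even n_samples assumed)"""
    rng = np.random.RandomState(0)
    y = np.array([0, 1] * (n_samples // 2))
    features = ['feature_{}'.format(j) for j in range(n_features)]
    df = pd.DataFrame(rng.normal(size=(n_samples, n_features)) + theta * y[:, np.newaxis],
                      columns=features)
    df['outcome'] = y
    df['sample_id'] = ['sample_{}'.format(i) for i in range(n_samples)]
    df['train_or_test'] = ['train', 'test'] * (n_samples // 2)
    return Problem(df, features, 'outcome', positive_outcome=1)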
def test_overlapping_train_and_test():
    """Validates that we fail if samples overlap between training and test"""
    prob = mock_problem()
    train = prob[prob.dataframe['train_or_test'] == 'train']
    test = prob[prob.dataframe['train_or_test'] == 'test']

    approach = SelectAndClassify(SelectKBest(k='all'), LogisticRegression())
    params = LearningParameters()
    train_and_evaluate_model(approach, train, test, params)  # no overlap -- should just work
    nose.tools.assert_raises(ValueError, lambda: train_and_evaluate_model(approach, train, train, params))  # oops
    nose.tools.assert_raises(ValueError, lambda: train_and_evaluate_model(approach, test, test, params))  # oops
def test_binary_report_with_score_vector():
    " Test that in binary case score as vector contains same data as with positive outcome only"
    data = []
    class_values = ['A', 'B']
    for index_class in range(4):
        data = mock_coords_data(data,
                                index_class,
                                class_values[index_class % 2],
                                data2=None,
                                append_missed=False)[0]

    df = pd.DataFrame(columns=['coord0', 'coord1', 'class'], data=data)
    prob = Problem(df, ['coord0', 'coord1'], 'class', 'B')

    classifier = SelectAndClassify(
        SelectKBest(k='all'),
        LogisticRegression(),
        name='test binary with score vector').fit(prob)
    y_score_positive = classifier.apply(prob)
    y_score_all = classifier.apply(prob, False)
    nose.tools.ok_(np.allclose(y_score_positive, y_score_all[:, 1]))
def test_null_feature_selector():
    """Validates that SelectAndClassify works with a null feature selector"""
    def make_fixed_rs():
        """Utility: makes a fixed random state for use in this test"""
        return np.random.RandomState(0xC0FFEE)

    prob = mock_problem()

    # selector=None and SelectKBest(k='all') should produce identical predictions
    no_select_approach = SelectAndClassify(None, LogisticRegression(random_state=make_fixed_rs()),
                                           classifier_grid={'C': [0.5, 1.0]},
                                           random_state=make_fixed_rs()).fit(prob)
    select_all_approach = SelectAndClassify(SelectKBest(k='all'),
                                            LogisticRegression(random_state=make_fixed_rs()),
                                            classifier_grid={'C': [0.5, 1.0]},
                                            random_state=make_fixed_rs()).fit(prob)

    # There should be no selection step in the underlying model
    nose.tools.eq_(len(no_select_approach.model.steps), len(select_all_approach.model.steps) - 1)

    # We should still be logging the right features
    nose.tools.assert_list_equal(no_select_approach.selected_features, prob.features)

    # Scores should be identical as k='all'
    np.testing.assert_allclose(no_select_approach.apply(prob), select_all_approach.apply(prob))
def test_multiclass_auc():
    """ Tests auc value for multiclass problem"""
    data = []
    class_values = ['A', 'B', 'C', 'D']
    for index_class in range(4):
        data, _ = mock_coords_data(data, index_class,
                                   class_values[index_class], None, True)

    df = pd.DataFrame(columns=['coord0', 'coord1', 'class'], data=data)
    prob = Problem(df, ['coord0', 'coord1'], 'class', None)
    classifier = SelectAndClassify(SelectKBest(k='all'),
                                   LogisticRegression(),
                                   name='test multiclass model').fit(prob)
    model = ClassificationModel(classifier, prob)
    auc_average = model.training_auc
    nose.tools.assert_almost_equal(0.853333333, auc_average, delta=1e-6)

    prob_binary = Problem(df, ['coord0', 'coord1'], 'class', 'A')
    classifier_binary = SelectAndClassify(SelectKBest(k='all'),
                                          LogisticRegression(),
                                          name='binary model').fit(prob_binary)
    model_binary = ClassificationModel(classifier_binary, prob_binary)
    auc_binary = model_binary.training_auc
    nose.tools.assert_almost_equal(auc_binary, auc_average, delta=1e-6)
def test_multiclass_label_subset():
    """ Tests y_score for multiclass problem with training set
    having subset of possible classes """
    data = []
    data2 = []
    class_values = ['A', 'B', 'C', 'D']
    for index_class in range(4):
        data, data2 = mock_coords_data(data, index_class,
                                       class_values[index_class], data2, True)

    df = pd.DataFrame(columns=['coord0', 'coord1', 'class'], data=data)
    prob = Problem(df, ['coord0', 'coord1'], 'class', None)
    df2 = pd.DataFrame(columns=['coord0', 'coord1', 'class'], data=data2)
    prob2 = Problem(df2, ['coord0', 'coord1'], 'class', None, prob.label_list)

    classifier = SelectAndClassify(SelectKBest(k='all'),
                                   LogisticRegression(),
                                   name='test multiclass model').fit(prob2)

    y_pred = classifier.predict(prob2)
    y_score = classifier.prediction_probabilities(prob2)
    # check that "C" class has probabilities 0
    for i_row in range(y_pred.shape[0]):
        nose.tools.assert_almost_equal(0.0, y_score[i_row, 2], delta=1e-6)
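# mock_coords_data is the fixture shared by the multiclass tests; a rough sketch only, assuming it
# appends a small cluster of 2-D points labelled class_value around a per-class centre and, when
# append_missed is True, mirrors every class except 'C' into data2 so that prob2 above is trained
# on a strict subset of the labels. Cluster sizes and geometry are assumptions and will not
# reproduce the exact AUC values asserted above.
def mock_coords_data_sketch(data, index_class, class_value, data2=None, append_missed=False):
    """Appends [coord0, coord1, class] rows for one class; returns (data, data2)"""
    rng = np.random.RandomState(index_class)
    centre = 3.0 * index_class
    rows = [[centre + rng.uniform(-1, 1), centre + rng.uniform(-1, 1), class_value]
            for _ in range(25)]
    data = data + rows
    if append_missed:
        data2 = (data2 if data2 is not None else []) + (rows if class_value != 'C' else [])
    return data, data2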
def cv_analysis_run_one(task_runner=None):
    """\
    Does one CV analysis run, then validates and returns the results
    :param task_runner: Task runner to use for the CV analysis mockup
    :return: CV analysis results
    """
    prob = mock_problem(n_samples=1000, n_features=100)
    approach = SelectAndClassify(SelectKBest(k=17), LogisticRegression(random_state=np.random.RandomState(0xC0FFEE)))
    cv_generator = CVSplitGenerator(prob, n_folds=10, n_repartitions=2, random_state=np.random.RandomState(0xC0FFEE))
    analysis = CrossValidatedAnalysis(prob, approach, cv_generator=cv_generator, runner=task_runner)

    results = analysis.run()
    nose.tools.eq_(len(results), cv_generator.n_total_splits)  # One per CV split
    for field_name in ['metrics', 'n_samples']:
        nose.tools.ok_(all([field_name in r['train'] for r in results]))
        nose.tools.ok_(all([field_name in r['test'] for r in results]))
    return results
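# Illustrative only: a plausible driver for the helper above. SerialRunner appears elsewhere in
# this module; passing task_runner=None presumably falls back to the analysis' default runner.
def test_cv_analysis_serial_example():
    """Runs the CV analysis mockup with an explicit serial runner"""
    results = cv_analysis_run_one(task_runner=SerialRunner())
    nose.tools.ok_(len(results) > 0)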
def learning_curves_run_one(fractions=None):
    """Runs a single LearningCurves analysis and validates the output"""
    prob = mock_problem(n_samples=1000, n_features=100)
    approach = SelectAndClassify(SelectKBest(k=17), LogisticRegression(random_state=np.random.RandomState(0xC0FFEE)))
    cv_generator = CVSplitGenerator(prob, n_folds=10, n_repartitions=2, random_state=np.random.RandomState(0xC0FFEE))
    analysis = LearningCurveAnalysis(prob, approach, cv_generator=cv_generator, fractions=fractions,
                                     runner=SerialRunner())

    results = analysis.run()
    nose.tools.eq_(len(results), len(fractions))  # One per fraction
    for fraction in sorted(results):
        nose.tools.eq_(len(results[fraction]), cv_generator.n_total_splits)
        for field_name in ['metrics', 'n_samples']:
            nose.tools.ok_(all([field_name in r['train'] for r in results[fraction]]))
            nose.tools.ok_(all([field_name in r['test'] for r in results[fraction]]))

        seen_test_samples = Counter()
        for split_results in results[fraction]:
            # No train/test overlap
            nose.tools.eq_(set(split_results['train']['sample']) & set(split_results['test']['sample']), set())

            # Size of the test set should be 10% of problem size
            nose.tools.eq_(len(split_results['test']['sample']), 0.1 * prob.n_samples)

            # Size of the training set should be 90% of problem size * fraction
            nose.tools.assert_almost_equal(len(split_results['train']['sample']), 0.9 * fraction * prob.n_samples,
                                           delta=1)

            # Record test samples
            seen_test_samples.update(split_results['test']['sample'])

        # Must have seen all test samples, all of them the same number of times
        nose.tools.eq_(set(seen_test_samples.keys()), set(prob.sample_ids))
        nose.tools.eq_(set(seen_test_samples.values()), set([cv_generator.n_repartitions]))

        # Test sets should be identical across fractions. Otherwise the difference between fractions would
        # reflect both the training set size and the CV splits, but we only really care about the former.
        test_sets_by_fraction = {fraction: tuple([tuple(sorted(set(split_results['test']['sample'])))
                                                  for split_results in results[fraction]])
                                 for fraction in sorted(results.keys())}

        nose.tools.eq_(len(set(test_sets_by_fraction.values())), 1)
    return results
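# Illustrative only: a plausible driver for the helper above. The fractions are assumptions,
# chosen so that 0.9 * fraction * n_samples stays integral for the 1000-sample mock problem.
def test_learning_curves_example():
    """Runs learning curves for a couple of training-set fractions"""
    results = learning_curves_run_one(fractions=[0.5, 1.0])
    nose.tools.eq_(sorted(results.keys()), [0.5, 1.0])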
def test_feature_engineering():
    """Tests feature engineering"""
    prob = mock_problem()

    classifier = SelectAndClassify(SelectKBest(k='all'), LogisticRegression(), feature_engineering=PCA(n_components=2))
    model = classifier.fit(prob).model
    steps = dict(model.steps)

    nose.tools.ok_('PCA' in str(classifier))
    nose.tools.ok_('feature_engineering' in steps)
    nose.tools.assert_is_not_none(steps['feature_engineering'].components_)

    # Check that classifier.apply() works
    nose.tools.eq_(len(classifier.apply(prob)), prob.n_samples)

    # Test that SelectAndClassify still works without feature engineering
    classifier = SelectAndClassify(SelectKBest(k='all'), LogisticRegression())
    model = classifier.fit(prob).model
    steps = dict(model.steps)
    nose.tools.ok_('PCA' not in str(classifier))
    nose.tools.ok_('feature_engineering' not in steps)
def test_model_validation(working_dir):
    """Validates that we fail if a model has been corrupted or otherwise produces bad output"""
    model_path = os.path.join(working_dir, 'model.txt')
    prob = mock_problem()
    approach = SelectAndClassify(SelectKBest(k=7),
                                 LogisticRegression()).fit(prob)
    model = ClassificationModel(approach, prob)
    model.write(model_path)

    # Change an expected score for a sample -- this should cause model loading to fail because actual
    # classifier output will no longer match the expected output
    with open(model_path, 'r') as f:
        model_string = '\n'.join(f.readlines())
        nose.tools.ok_(str(model.expected_scores[17]) in model_string)
        bad_model_string = model_string.replace(
            str(model.expected_scores[17]),
            str(model.expected_scores[17] + 0.5))

    with open(model_path, 'w') as f:
        f.write(bad_model_string)

    nose.tools.assert_raises(ValueError,
                             lambda: ClassificationModel.read(model_path))