def checkme(make_classifier):
    """Utility: fits a classifier and verifies it survives a serialization roundtrip"""
    prob = mock_problem()
    logr = make_classifier().fit(prob.X, prob.y)
    decoded_logr = serializer_roundtrip(Serializer(), logr)
    np.testing.assert_array_equal(logr.predict_proba(prob.X),
                                  decoded_logr.predict_proba(prob.X))
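# For context, a minimal sketch of the serializer_roundtrip helper these tests
# rely on, assuming the Serializer exposes to_string/from_string methods (the
# method names here are hypothetical -- the real Serializer API may differ):
def _serializer_roundtrip_sketch(serializer, obj):
    """Serializes obj and immediately deserializes it, returning the copy"""
    return serializer.from_string(serializer.to_string(obj))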
def checkme(n_features, k, make_classifier):
    """Utility: fits a feature-selection + classification pipeline and verifies the roundtrip"""
    prob = mock_problem(n_features=n_features)
    pipe = Pipeline([('select', SelectKBest(k=k)),
                     ('classify', make_classifier())]).fit(prob.X, prob.y)
    decoded_pipe = serializer_roundtrip(Serializer(), pipe)
    np.testing.assert_array_equal(pipe.predict_proba(prob.X),
                                  decoded_pipe.predict_proba(prob.X))
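# Hypothetical nose-style driver (a sketch, not from the original suite)
# showing how a generator test would typically fan out over checkme's
# parameters, assuming checkme is in scope:
def test_pipeline_roundtrip_sketch():
    for k in (1, 5, 10):
        yield checkme, 10, k, LogisticRegression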
def checkme(params):
    """
    Test helper function.

    :param params: StandardScaler constructor params as a dictionary
    :return: None
    """
    prob = mock_problem(n_samples=100, n_features=4)
    scaler = StandardScaler(**params).fit(prob.X)
    reconstituted_scaler = serializer_roundtrip(Serializer(), scaler)
    np.testing.assert_allclose(reconstituted_scaler.transform(prob.X),
                               scaler.transform(prob.X))
def checkme(params):
    """
    Test helper function.

    :param params: PCA constructor params as a dictionary
    :return: None
    """
    prob = mock_problem(n_samples=100, n_features=4)
    pca = PCA(**params).fit(prob.X)
    reconstituted_pca = serializer_roundtrip(Serializer(), pca)
    np.testing.assert_allclose(reconstituted_pca.transform(prob.X),
                               pca.transform(prob.X))
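# Hypothetical driver (a sketch, not from the original suite) exercising the
# PCA helper with a few real PCA constructor arguments, assuming checkme is
# in scope:
def test_pca_roundtrip_sketch():
    for params in ({}, {'n_components': 2}, {'whiten': True}):
        yield checkme, params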
def mock_input(working_dir):
    """\
    Mocks an input problem in the given directory, returning a pair of (1) the
    path to the problem file and (2) the Problem instance
    """
    prob_path = os.path.join(working_dir, 'problem.txt')
    prob = mock_problem(n_samples=100, n_features=10)
    prob.dataframe.to_csv(prob_path, sep='\t', index=True, index_label='sample_id')
    return prob_path, prob
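# Hypothetical usage (a sketch, not from the original suite): read the mocked
# problem back from disk and sanity-check its shape, assuming pandas is
# imported as pd:
def test_mock_input_sketch(working_dir):
    prob_path, prob = mock_input(working_dir)
    df = pd.read_csv(prob_path, sep='\t', index_col='sample_id')
    assert df.shape[0] == 100  # n_samples used by mock_input above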
def checkme(working_dir, n_samples, n_features, k, make_classifier, test_vectorize):
    """Utility: writes a ClassificationModel to disk, reads it back, and verifies both agree"""
    assert n_samples % 4 == 0
    model_path = os.path.join(working_dir, 'model.txt')
    prob = mock_problem(n_samples=n_samples, n_features=n_features)

    if test_vectorize:
        df = prob.dataframe
        df['discrete_1'] = ['foo', 'bar'] * int(n_samples / 2)
        df['discrete_2'] = ['foo', 'bar', 'baz', float('nan')] * int(n_samples / 4)
        df['continuous_with_missing'] = [0, 1, 2, float('nan')] * int(n_samples / 4)
        prob = Problem(df,
                       prob.features + ['discrete_1', 'discrete_2', 'continuous_with_missing'],
                       prob.outcome_column, prob.positive_outcome)
        preprocess = ProblemVectorizer()
    else:
        preprocess = None

    approach = SelectAndClassify(SelectKBest(k=k), make_classifier(),
                                 preprocess=preprocess).fit(prob)
    model = ClassificationModel(approach, prob)
    model.write(model_path)

    reconstituted_model = ClassificationModel.read(model_path)
    model.validate()
    reconstituted_model.validate()
    np.testing.assert_array_equal(model.approach.apply(prob),
                                  reconstituted_model.approach.apply(prob))

    if preprocess is not None:
        approach_pipeline = ApproachPipeline([('preprocess', preprocess)])
        approach_with_pipeline = SelectAndClassify(SelectKBest(k=k), make_classifier(),
                                                   preprocess=approach_pipeline).fit(prob)

        # Test approach serialization with a Pipeline from learners.py
        model_with_pipeline = ClassificationModel(approach_with_pipeline, prob)
        model_path2 = os.path.join(working_dir, 'model2.txt')
        model_with_pipeline.write(model_path2)
        reconstituted_model2 = ClassificationModel.read(model_path2)
        reconstituted_model2.validate()
        # Compare to 14 decimal places: the two preprocess wirings may differ
        # by floating-point rounding
        np.testing.assert_array_almost_equal(model.approach.apply(prob),
                                             reconstituted_model2.approach.apply(prob), 14)
def checkme(params):
    """
    Test helper function.

    :param params: DecisionTreeClassifier constructor params as a dictionary
    :return: None
    """
    prob = mock_problem(n_samples=100, n_features=18)
    tree = DecisionTreeClassifier(**params).fit(prob.X, prob.y)
    reconstituted_dtc = serializer_roundtrip(Serializer(debug_deserialize=False), tree)
    np.testing.assert_array_equal(reconstituted_dtc.predict_proba(prob.X),
                                  tree.predict_proba(prob.X))
def checkme(params):
    """
    Test helper function.

    :param params: RandomForestClassifier constructor params as a dictionary
    :return: None
    """
    prob = mock_problem(n_samples=100, n_features=4)
    rfc = RandomForestClassifier(**params)
    rfc.fit(prob.X, prob.y)
    reconstituted_rfc = serializer_roundtrip(Serializer(), rfc)
    np.testing.assert_array_equal(reconstituted_rfc.predict(prob.X),
                                  rfc.predict(prob.X))
    np.testing.assert_array_equal(reconstituted_rfc.predict_proba(prob.X),
                                  rfc.predict_proba(prob.X))
def checkme(params, probability):
    """
    Test helper function.

    :param params: SVC constructor params as a dictionary
    :param probability: whether to test the output of predict_proba
    :return: None
    """
    prob = mock_problem(n_samples=100, n_features=3)
    svc = SVC(**params).fit(prob.X, prob.y)
    reconstituted_svc = serializer_roundtrip(Serializer(), svc)
    np.testing.assert_array_equal(reconstituted_svc.decision_function(prob.X),
                                  svc.decision_function(prob.X))
    if probability:
        np.testing.assert_array_equal(reconstituted_svc.predict_proba(prob.X),
                                      svc.predict_proba(prob.X))
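# Hypothetical driver (a sketch, not from the original suite). sklearn's SVC
# only supports predict_proba when constructed with probability=True, which
# is why checkme takes a separate flag:
def test_svc_roundtrip_sketch():
    yield checkme, {'kernel': 'linear'}, False
    yield checkme, {'kernel': 'rbf', 'probability': True}, True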
def test_model_validation(working_dir):
    """Validates that we fail if a model has been corrupted or otherwise produces bad output"""
    model_path = os.path.join(working_dir, 'model.txt')
    prob = mock_problem()
    approach = SelectAndClassify(SelectKBest(k=7), LogisticRegression()).fit(prob)
    model = ClassificationModel(approach, prob)
    model.write(model_path)

    # Change an expected score for a sample -- this should cause model loading
    # to fail because the actual classifier output will no longer match the
    # expected output
    with open(model_path, 'r') as f:
        model_string = f.read()
    nose.tools.ok_(str(model.expected_scores[17]) in model_string)
    bad_model_string = model_string.replace(str(model.expected_scores[17]),
                                            str(model.expected_scores[17] + 0.5))
    with open(model_path, 'w') as f:
        f.write(bad_model_string)

    nose.tools.assert_raises(ValueError, lambda: ClassificationModel.read(model_path))