def test_all_same_labels(db_engine_with_results_schema):
    """Degenerate-label handling: with an all-0 or all-1 label vector,
    accuracy is still computable (non-NULL best/worst/stochastic values),
    while roc_auc — which needs both classes present — is stored as NULL.
    """
    num_entities = 5
    trained_model, model_id = fake_trained_model(
        db_engine_with_results_schema,
        train_end_time=TRAIN_END_TIME,
    )
    # Run once with every label 0 and once with every label 1.
    for label_value in [0, 1]:
        labels = [label_value] * num_entities

        # We should be able to calculate accuracy even if all of the labels
        # are the same, but ROC_AUC requires some positive and some
        # negative labels, so we should get one NULL value
        # for this config
        training_metric_groups = [{"metrics": ["accuracy", "roc_auc"]}]

        # Acquire fake data and objects to be used in the tests
        model_evaluator = ModelEvaluator(
            {}, training_metric_groups, db_engine_with_results_schema,
        )
        # Unique matrix_uuid per label configuration (str(labels)) so the two
        # iterations write distinct evaluation rows.
        fake_matrix_store = MockMatrixStore(
            matrix_type="train",
            matrix_uuid=str(labels),
            label_count=num_entities,
            db_engine=db_engine_with_results_schema,
            init_labels=pd.DataFrame(
                {
                    "label_value": labels,
                    "entity_id": list(range(num_entities)),
                    "as_of_date": [TRAIN_END_TIME] * num_entities,
                }
            )
            .set_index(["entity_id", "as_of_date"])
            .label_value,
            init_as_of_dates=[TRAIN_END_TIME],
        )
        model_evaluator.evaluate(
            trained_model.predict_proba(labels)[:, 1], fake_matrix_store, model_id
        )
        # accuracy rows must be fully populated; roc_auc rows must be NULL.
        for metric, best, worst, stochastic in db_engine_with_results_schema.execute(
            f"""select metric, best_value, worst_value, stochastic_value
            from train_results.evaluations
            where model_id = %s and
            evaluation_start_time = %s
            order by 1""",
            (model_id, fake_matrix_store.as_of_dates[0]),
        ):
            if metric == "accuracy":
                assert best is not None
                assert worst is not None
                assert stochastic is not None
            else:
                assert best is None
                assert worst is None
                assert stochastic is None
def test_evaluation_with_protected_df(db_engine_with_results_schema):
    """Passing a protected_df (with a bias config that supplies threshold
    info — the only part that is truly required) should cause an Aequitas
    report to be written to the database.
    """
    evaluator = ModelEvaluator(
        testing_metric_groups=[
            {
                "metrics": ["precision@"],
                "thresholds": {"top_n": [3]},
            },
        ],
        training_metric_groups=[],
        bias_config={"thresholds": {"top_n": [2]}},
        db_engine=db_engine_with_results_schema,
    )

    labels = np.array([1, 0])
    scores = np.array([0.56, 0.55])
    matrix_store = MockMatrixStore(
        "test", "1234", 5, db_engine_with_results_schema, labels
    )
    _, model_id = fake_trained_model(
        db_engine_with_results_schema,
        train_end_time=TRAIN_END_TIME,
    )

    # One protected attribute, constant ("value1") for every entity in the
    # matrix; entity ids are taken from the design matrix index.
    entity_ids = matrix_store.design_matrix.index.levels[0].tolist()
    protected_df = pd.DataFrame(
        {"entity_id": entity_ids, "protectedattribute1": "value1"}
    )

    evaluator.evaluate(scores, matrix_store, model_id, protected_df)

    # Every Aequitas row written for this model/date should reflect the
    # bias-config threshold (2_abs) and the single attribute/value pair.
    for record in db_engine_with_results_schema.execute(
        """select * from test_results.aequitas
        where model_id = %s and evaluation_start_time = %s order by 1""",
        (model_id, matrix_store.as_of_dates[0]),
    ):
        assert record["model_id"] == model_id
        assert record["parameter"] == "2_abs"
        assert record["attribute_name"] == "protectedattribute1"
        assert record["attribute_value"] == "value1"
def test_model_scoring_inspections():
    """Legacy-schema variant: checks num_labeled_examples / num_positive_labels
    bookkeeping in results.evaluations, including a NaN label that should be
    excluded from labeled-example counts.
    """
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)
        metric_groups = [
            {
                'metrics': ['precision@', 'recall@', 'fpr@'],
                'thresholds': {
                    'percentiles': [50.0],
                    'top_n': [3]
                }
            },
            {
                # ensure we test a non-thresholded metric as well
                'metrics': ['accuracy'],
            }
        ]
        model_evaluator = ModelEvaluator(metric_groups, db_engine)
        _, model_id = fake_trained_model(
            'myproject', InMemoryModelStorageEngine('myproject'), db_engine)
        # numpy.nan label: 5 entities but only 4 labeled examples.
        labels = numpy.array([True, False, numpy.nan, True, False])
        prediction_probas = numpy.array([0.56, 0.4, 0.55, 0.5, 0.3])
        evaluation_start = datetime.datetime(2016, 4, 1)
        evaluation_end = datetime.datetime(2016, 7, 1)
        example_as_of_date_frequency = '1d'
        model_evaluator.evaluate(
            prediction_probas,
            labels,
            model_id,
            evaluation_start,
            evaluation_end,
            example_as_of_date_frequency)
        for record in db_engine.execute(
                '''select * from results.evaluations
                where model_id = %s and evaluation_start_time = %s
                order by 1''',
                (model_id, evaluation_start)):
            assert record['num_labeled_examples'] == 4
            assert record['num_positive_labels'] == 2
            # '' parameter -> non-thresholded metric (accuracy): all 4 labeled;
            # percentile threshold keeps 1; top_n threshold keeps 2.
            if record['parameter'] == '':
                assert record['num_labeled_above_threshold'] == 4
            elif 'pct' in record['parameter']:
                assert record['num_labeled_above_threshold'] == 1
            else:
                assert record['num_labeled_above_threshold'] == 2
def test_evaluation_with_sort_ties(db_engine_with_results_schema):
    """Tied scores at the top-n cutoff should yield distinct best/worst
    precision values, a stochastic estimate strictly between them, a
    nonzero standard deviation, and the configured number of sort trials.
    """
    evaluator = ModelEvaluator(
        testing_metric_groups=[
            {
                "metrics": ["precision@"],
                "thresholds": {"top_n": [3]},
            },
        ],
        training_metric_groups=[],
        db_engine=db_engine_with_results_schema,
    )

    labels = np.array([1, 0, 1, 0, 0])
    # Two entities share the 0.5 score, creating a tie exactly at the
    # top-3 cutoff, so precision@3 depends on tiebreaking order.
    scores = np.array([0.56, 0.55, 0.5, 0.5, 0.3])
    matrix_store = MockMatrixStore(
        "test", "1234", 5, db_engine_with_results_schema, labels
    )
    _, model_id = fake_trained_model(
        db_engine_with_results_schema,
        train_end_time=TRAIN_END_TIME,
    )

    evaluator.evaluate(scores, matrix_store, model_id)

    for record in db_engine_with_results_schema.execute(
        """select * from test_results.evaluations
        where model_id = %s and evaluation_start_time = %s order by 1""",
        (model_id, matrix_store.as_of_dates[0]),
    ):
        assert record["num_labeled_examples"] == 5
        assert record["num_positive_labels"] == 2
        # worst = 1/3 (tie resolved against us), best = 2/3 (tie in our favor)
        assert_almost_equal(float(record["worst_value"]), 0.33333, 5)
        assert_almost_equal(float(record["best_value"]), 0.66666, 5)
        assert record["num_sort_trials"] == SORT_TRIALS
        assert record["worst_value"] < record["stochastic_value"] < record["best_value"]
        assert record["standard_deviation"]
class ModelTester(object):
    """Orchestrates the post-training stage of the pipeline: individual
    importance calculation, prediction, and evaluation, for each model id
    against its test (and train) matrices.
    """

    def __init__(
        self,
        db_engine,
        model_storage_engine,
        matrix_storage_engine,
        replace,
        evaluator_config,
        individual_importance_config,
    ):
        # Retained for looking up test matrices by uuid in task generation.
        self.matrix_storage_engine = matrix_storage_engine
        self.predictor = Predictor(
            db_engine=db_engine,
            model_storage_engine=model_storage_engine,
            replace=replace,
        )
        # Defaults: top-5 ranks, 'uniform' method, when config omits them.
        self.individual_importance_calculator = IndividualImportanceCalculator(
            db_engine=db_engine,
            n_ranks=individual_importance_config.get("n_ranks", 5),
            methods=individual_importance_config.get("methods", ["uniform"]),
            replace=replace,
        )
        self.evaluator = ModelEvaluator(
            db_engine=db_engine,
            sort_seed=evaluator_config.get("sort_seed", None),
            testing_metric_groups=evaluator_config.get("testing_metric_groups", []),
            training_metric_groups=evaluator_config.get("training_metric_groups", []),
        )

    def generate_model_test_tasks(self, split, train_store, model_ids):
        """Build one task dict per non-empty test matrix in the split.

        Each task pairs a test store with the shared train store and the
        (falsy-filtered) list of model ids. Empty test matrices are logged
        and skipped.
        """
        test_tasks = []
        for test_matrix_def, test_uuid in zip(
            split["test_matrices"], split["test_uuids"]
        ):
            test_store = self.matrix_storage_engine.get_store(test_uuid)
            if test_store.empty:
                logging.warning(
                    """Test matrix for uuid %s
                    was empty, no point in generating predictions. Not creating test task.
                    """,
                    test_uuid,
                )
                continue
            test_tasks.append(
                {
                    "test_store": test_store,
                    "train_store": train_store,
                    # drop falsy model ids (e.g. None placeholders)
                    "model_ids": [model_id for model_id in model_ids if model_id],
                }
            )
        return test_tasks

    def process_model_test_task(self, test_store, train_store, model_ids):
        """Run importance calculation, prediction, and evaluation for every
        model id in the task, over both the test and train matrices.
        """
        as_of_times = test_store.metadata["as_of_times"]
        logging.info(
            "Testing and scoring all model ids with test matrix %s. "
            "as_of_times min: %s max: %s num: %s",
            test_store.uuid,
            min(as_of_times),
            max(as_of_times),
            len(as_of_times),
        )
        for model_id in model_ids:
            logging.info("Testing model id %s", model_id)
            self.individual_importance_calculator.calculate_and_save_all_methods_and_dates(
                model_id, test_store
            )

            # Generate predictions for the testing data then training data
            for store in (test_store, train_store):
                # Predictions are always aligned to the training matrix's
                # column order, regardless of which store is being scored.
                predictions_proba = self.predictor.predict(
                    model_id,
                    store,
                    misc_db_parameters=dict(),
                    train_matrix_columns=train_store.columns(),
                )
                self.evaluator.evaluate(
                    predictions_proba=predictions_proba,
                    matrix_store=store,
                    model_id=model_id,
                )
def test_evaluating_early_warning():
    """End-to-end check that ModelEvaluator writes exactly one evaluation row
    per configured metric/parameter combination: the full cross-product for
    testing metrics, and only accuracy/roc_auc for training metrics.
    """
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)
        testing_metric_groups = [
            {
                "metrics": [
                    "precision@",
                    "recall@",
                    "true positives@",
                    "true negatives@",
                    "false positives@",
                    "false negatives@",
                ],
                "thresholds": {"percentiles": [5.0, 10.0], "top_n": [5, 10]},
            },
            {
                "metrics": [
                    "f1",
                    "mediocre",
                    "accuracy",
                    "roc_auc",
                    "average precision score",
                ]
            },
            {"metrics": ["fbeta@"], "parameters": [{"beta": 0.75}, {"beta": 1.25}]},
        ]
        training_metric_groups = [{"metrics": ["accuracy", "roc_auc"]}]
        # 'mediocre' is a custom metric supplied via the always_half callable.
        custom_metrics = {"mediocre": always_half}
        model_evaluator = ModelEvaluator(
            testing_metric_groups,
            training_metric_groups,
            db_engine,
            custom_metrics=custom_metrics,
        )
        labels = fake_labels(5)
        fake_train_matrix_store = MockMatrixStore("train", "efgh", 5, db_engine, labels)
        fake_test_matrix_store = MockMatrixStore("test", "1234", 5, db_engine, labels)
        trained_model, model_id = fake_trained_model(db_engine)

        # Evaluate the testing metrics and test for all of them.
        model_evaluator.evaluate(
            trained_model.predict_proba(labels)[:, 1], fake_test_matrix_store, model_id
        )
        records = [
            row[0]
            for row in db_engine.execute(
                """select distinct(metric || parameter)
                from test_results.evaluations
                where model_id = %s and
                evaluation_start_time = %s
                order by 1""",
                (model_id, fake_test_matrix_store.as_of_dates[0]),
            )
        ]
        # Sorted cross-product of all configured testing metrics/thresholds.
        assert records == [
            "accuracy",
            "average precision score",
            "f1",
            "false [email protected]_pct",
            "false negatives@10_abs",
            "false [email protected]_pct",
            "false negatives@5_abs",
            "false [email protected]_pct",
            "false positives@10_abs",
            "false [email protected]_pct",
            "false positives@5_abs",
            "[email protected]_beta",
            "[email protected]_beta",
            "mediocre",
            "[email protected]_pct",
            "precision@10_abs",
            "[email protected]_pct",
            "precision@5_abs",
            "[email protected]_pct",
            "recall@10_abs",
            "[email protected]_pct",
            "recall@5_abs",
            "roc_auc",
            "true [email protected]_pct",
            "true negatives@10_abs",
            "true [email protected]_pct",
            "true negatives@5_abs",
            "true [email protected]_pct",
            "true positives@10_abs",
            "true [email protected]_pct",
            "true positives@5_abs",
        ]

        # Evaluate the training metrics and test
        model_evaluator.evaluate(
            trained_model.predict_proba(labels)[:, 1], fake_train_matrix_store, model_id
        )
        records = [
            row[0]
            for row in db_engine.execute(
                """select distinct(metric || parameter)
                from train_results.evaluations
                where model_id = %s and
                evaluation_start_time = %s
                order by 1""",
                (model_id, fake_train_matrix_store.as_of_dates[0]),
            )
        ]
        assert records == ["accuracy", "roc_auc"]
def test_model_scoring_inspections():
    """Checks the labeled-example bookkeeping columns written to
    test_results.evaluations (with a NaN test label excluded from counts)
    and the single-value accuracy written to train_results.evaluations.
    """
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)
        testing_metric_groups = [
            {
                "metrics": ["precision@", "recall@", "fpr@"],
                "thresholds": {"percentiles": [50.0], "top_n": [3]},
            },
            {
                # ensure we test a non-thresholded metric as well
                "metrics": ["accuracy"]
            },
        ]
        training_metric_groups = [
            {"metrics": ["accuracy"], "thresholds": {"percentiles": [50.0]}}
        ]
        model_evaluator = ModelEvaluator(
            testing_metric_groups, training_metric_groups, db_engine
        )
        # NaN label -> 5 entities but only 4 labeled testing examples.
        testing_labels = numpy.array([True, False, numpy.nan, True, False])
        testing_prediction_probas = numpy.array([0.56, 0.4, 0.55, 0.5, 0.3])
        training_labels = numpy.array(
            [False, False, True, True, True, False, True, True]
        )
        training_prediction_probas = numpy.array(
            [0.6, 0.4, 0.55, 0.70, 0.3, 0.2, 0.8, 0.6]
        )
        fake_train_matrix_store = MockMatrixStore(
            "train", "efgh", 5, db_engine, training_labels
        )
        fake_test_matrix_store = MockMatrixStore(
            "test", "1234", 5, db_engine, testing_labels
        )
        trained_model, model_id = fake_trained_model(db_engine)

        # Evaluate testing matrix and test the results
        model_evaluator.evaluate(
            testing_prediction_probas, fake_test_matrix_store, model_id
        )
        for record in db_engine.execute(
            """select * from test_results.evaluations
            where model_id = %s and
            evaluation_start_time = %s
            order by 1""",
            (model_id, fake_test_matrix_store.as_of_dates[0]),
        ):
            assert record["num_labeled_examples"] == 4
            assert record["num_positive_labels"] == 2
            # '' -> non-thresholded metric; pct -> percentile threshold keeps 1;
            # otherwise top_n threshold keeps 2 labeled examples.
            if record["parameter"] == "":
                assert record["num_labeled_above_threshold"] == 4
            elif "pct" in record["parameter"]:
                assert record["num_labeled_above_threshold"] == 1
            else:
                assert record["num_labeled_above_threshold"] == 2

        # Evaluate the training matrix and test the results
        model_evaluator.evaluate(
            training_prediction_probas, fake_train_matrix_store, model_id
        )
        for record in db_engine.execute(
            """select * from train_results.evaluations
            where model_id = %s and
            evaluation_start_time = %s
            order by 1""",
            (model_id, fake_train_matrix_store.as_of_dates[0]),
        ):
            assert record["num_labeled_examples"] == 8
            assert record["num_positive_labels"] == 5
            # accuracy at the 50th percentile threshold: 5/8 = 0.625
            assert record["value"] == 0.625
def test_evaluation_sorting_with_protected_df(db_engine_with_results_schema):
    # Test that if a protected_df is passed (along with bias config, the only real needed one
    # being threshold info), an Aequitas report is written to the database.
    # This variant checks per-group counts (size, positive/negative labels)
    # when the protected attribute varies across entities.
    model_evaluator = ModelEvaluator(
        testing_metric_groups=[
            {
                "metrics": ["precision@"],
                "thresholds": {"top_n": [3]},
            },
        ],
        training_metric_groups=[],
        bias_config={"thresholds": {"top_n": [2]}},
        db_engine=db_engine_with_results_schema,
    )
    testing_labels = np.array([1, 1, 1, 0, 1])
    testing_prediction_probas = np.array([0.56, 0.55, 0.92, 0.85, 0.24])
    # Explicit matrix + labels so the design-matrix index (entity_id,
    # as_of_date) is known and reusable for the protected_df below.
    fake_test_matrix_store = MockMatrixStore(
        "test",
        "1234",
        5,
        db_engine_with_results_schema,
        metadata_overrides={"as_of_times": [TRAIN_END_TIME]},
        matrix=pd.DataFrame.from_dict(
            {
                "entity_id": [1, 2, 3, 4, 5],
                "as_of_date": [pd.Timestamp(2016, 1, 1)] * 5,
                "feature_one": [3, 4, 3, 4, 3],
                "feature_two": [5, 6, 5, 6, 5],
                "label": testing_labels,
            }
        ).set_index(MatrixStore.indices),
        init_labels=pd.DataFrame(
            {
                "label_value": testing_labels,
                "entity_id": [1, 2, 3, 4, 5],
                "as_of_date": [pd.Timestamp(2016, 1, 1)] * 5,
            }
        )
        .set_index(["entity_id", "as_of_date"])
        .label_value,
        init_as_of_dates=[TRAIN_END_TIME],
    )
    trained_model, model_id = fake_trained_model(
        db_engine_with_results_schema,
        train_end_time=TRAIN_END_TIME,
    )
    # protected_df is aligned by sharing the design matrix's index rather
    # than by explicit entity_id/as_of_date columns.
    protected_df = pd.DataFrame(
        {
            # "entity_id": fake_test_matrix_store.design_matrix.index.levels[0].tolist(),
            # "as_of_date": fake_test_matrix_store.design_matrix.index.levels[1].tolist(),
            "protectedattribute1": ["low", "low", "low", "high", "high"]
        },
        index=fake_test_matrix_store.design_matrix.index,
    )

    # should be low has 3 records, all 1's; high has 2 records, one 1
    expected = {
        "low": {"group_size": 3, "group_label_neg": 0, "group_label_pos": 3},
        "high": {"group_size": 2, "group_label_neg": 1, "group_label_pos": 1},
    }
    model_evaluator.evaluate(
        testing_prediction_probas, fake_test_matrix_store, model_id, protected_df
    )
    for record in db_engine_with_results_schema.execute(
        """select * from test_results.aequitas
        where model_id = %s and evaluation_start_time = %s order by 1""",
        (model_id, fake_test_matrix_store.as_of_dates[0]),
    ):
        assert record["model_id"] == model_id
        assert record["parameter"] == "2_abs"
        assert record["attribute_name"] == "protectedattribute1"
        # Per-group counts must match the expectations for this group value.
        for col, value in expected[record["attribute_value"]].items():
            assert record[col] == value
def test_model_scoring_inspections(db_engine_with_results_schema):
    """Fixture-based variant: verifies labeled-example bookkeeping for the
    test matrix (NaN label excluded) and, for training accuracy where there
    are no score ties, that best/worst/stochastic values coincide and the
    sort-trial machinery is short-circuited (0 trials, 0 std deviation).
    """
    testing_metric_groups = [
        {
            "metrics": ["precision@", "recall@", "fpr@"],
            "thresholds": {"percentiles": [50.0], "top_n": [3]},
        },
        {
            # ensure we test a non-thresholded metric as well
            "metrics": ["accuracy"]
        },
    ]
    training_metric_groups = [
        {"metrics": ["accuracy"], "thresholds": {"percentiles": [50.0]}}
    ]
    model_evaluator = ModelEvaluator(
        testing_metric_groups,
        training_metric_groups,
        db_engine_with_results_schema,
    )
    # np.nan label -> 5 entities but only 4 labeled testing examples.
    testing_labels = np.array([1, 0, np.nan, 1, 0])
    testing_prediction_probas = np.array([0.56, 0.4, 0.55, 0.5, 0.3])
    training_labels = np.array([0, 0, 1, 1, 1, 0, 1, 1])
    training_prediction_probas = np.array([0.6, 0.4, 0.55, 0.70, 0.3, 0.2, 0.8, 0.6])
    fake_train_matrix_store = MockMatrixStore(
        "train", "efgh", 5, db_engine_with_results_schema, training_labels
    )
    fake_test_matrix_store = MockMatrixStore(
        "test", "1234", 5, db_engine_with_results_schema, testing_labels
    )
    trained_model, model_id = fake_trained_model(
        db_engine_with_results_schema,
        train_end_time=TRAIN_END_TIME,
    )

    # Evaluate testing matrix and test the results
    model_evaluator.evaluate(
        testing_prediction_probas, fake_test_matrix_store, model_id
    )
    for record in db_engine_with_results_schema.execute(
        """select * from test_results.evaluations
        where model_id = %s and
        evaluation_start_time = %s
        order by 1""",
        (model_id, fake_test_matrix_store.as_of_dates[0]),
    ):
        assert record["num_labeled_examples"] == 4
        assert record["num_positive_labels"] == 2
        # '' -> non-thresholded metric; pct -> percentile threshold keeps 1;
        # otherwise top_n threshold keeps 2 labeled examples.
        if record["parameter"] == "":
            assert record["num_labeled_above_threshold"] == 4
        elif "pct" in record["parameter"]:
            assert record["num_labeled_above_threshold"] == 1
        else:
            assert record["num_labeled_above_threshold"] == 2

    # Evaluate the training matrix and test the results
    model_evaluator.evaluate(
        training_prediction_probas, fake_train_matrix_store, model_id
    )
    for record in db_engine_with_results_schema.execute(
        """select * from train_results.evaluations
        where model_id = %s and
        evaluation_start_time = %s
        order by 1""",
        (model_id, fake_train_matrix_store.as_of_dates[0]),
    ):
        assert record["num_labeled_examples"] == 8
        assert record["num_positive_labels"] == 5
        assert record["worst_value"] == 0.625
        assert record["best_value"] == 0.625
        assert record["stochastic_value"] == 0.625
        # best/worst are same, should shortcut trials
        assert record["num_sort_trials"] == 0
        assert record["standard_deviation"] == 0
def test_evaluating_early_warning(db_engine_with_results_schema):
    """Fixture-based variant: verifies the full metric/parameter
    cross-product is written for testing metrics — both overall and for
    each configured subset — and that training metrics and matrix uuids
    are recorded correctly.
    """
    num_entities = 10
    labels = [0, 1, 0, 1, 0, 1, 0, 1, 0, 1]

    # Set up testing configuration parameters
    testing_metric_groups = [
        {
            "metrics": [
                "precision@",
                "recall@",
                "true positives@",
                "true negatives@",
                "false positives@",
                "false negatives@",
            ],
            "thresholds": {"percentiles": [5.0, 10.0], "top_n": [5, 10]},
        },
        {
            "metrics": [
                "f1",
                "mediocre",
                "accuracy",
                "roc_auc",
                "average precision score",
            ]
        },
        {"metrics": ["fbeta@"], "parameters": [{"beta": 0.75}, {"beta": 1.25}]},
    ]
    training_metric_groups = [{"metrics": ["accuracy", "roc_auc"]}]
    # 'mediocre' is a custom metric supplied via the always_half callable.
    custom_metrics = {"mediocre": always_half}

    # Acquire fake data and objects to be used in the tests
    model_evaluator = ModelEvaluator(
        testing_metric_groups,
        training_metric_groups,
        db_engine_with_results_schema,
        custom_metrics=custom_metrics,
    )
    fake_test_matrix_store = MockMatrixStore(
        matrix_type="test",
        matrix_uuid="efgh",
        label_count=num_entities,
        db_engine=db_engine_with_results_schema,
        init_labels=pd.DataFrame(
            {
                "label_value": labels,
                "entity_id": list(range(num_entities)),
                "as_of_date": [TRAIN_END_TIME] * num_entities,
            }
        )
        .set_index(["entity_id", "as_of_date"])
        .label_value,
        init_as_of_dates=[TRAIN_END_TIME],
    )
    fake_train_matrix_store = MockMatrixStore(
        matrix_type="train",
        matrix_uuid="1234",
        label_count=num_entities,
        db_engine=db_engine_with_results_schema,
        init_labels=pd.DataFrame(
            {
                "label_value": labels,
                "entity_id": list(range(num_entities)),
                "as_of_date": [TRAIN_END_TIME] * num_entities,
            }
        )
        .set_index(["entity_id", "as_of_date"])
        .label_value,
        init_as_of_dates=[TRAIN_END_TIME],
    )
    trained_model, model_id = fake_trained_model(
        db_engine_with_results_schema,
        train_end_time=TRAIN_END_TIME,
    )

    # ensure that the matrix uuid is present
    # NOTE(review): this runs before any test-matrix evaluation, so the
    # table may be empty here and all() passes vacuously — confirm intent.
    matrix_uuids = [
        row[0]
        for row in db_engine_with_results_schema.execute(
            "select matrix_uuid from test_results.evaluations"
        )
    ]
    assert all(matrix_uuid == "efgh" for matrix_uuid in matrix_uuids)

    # Evaluate the training metrics and test
    model_evaluator.evaluate(
        trained_model.predict_proba(labels)[:, 1], fake_train_matrix_store, model_id
    )
    records = [
        row[0]
        for row in db_engine_with_results_schema.execute(
            """select distinct(metric || parameter)
            from train_results.evaluations
            where model_id = %s and
            evaluation_start_time = %s
            order by 1""",
            (model_id, fake_train_matrix_store.as_of_dates[0]),
        )
    ]
    assert records == ["accuracy", "roc_auc"]

    # Run tests for overall and subset evaluations
    for subset in SUBSETS:
        if subset is None:
            # overall evaluation: no subset filter in the query
            where_hash = ""
        else:
            populate_subset_data(
                db_engine_with_results_schema, subset, list(range(num_entities))
            )
            SubsetFactory(subset_hash=filename_friendly_hash(subset))
            session.commit()
            where_hash = f"and subset_hash = '{filename_friendly_hash(subset)}'"

        # Evaluate the testing metrics and test for all of them.
        model_evaluator.evaluate(
            trained_model.predict_proba(labels)[:, 1],
            fake_test_matrix_store,
            model_id,
            subset=subset,
        )
        records = [
            row[0]
            for row in db_engine_with_results_schema.execute(
                f"""\
                select distinct(metric || parameter)
                from test_results.evaluations
                where model_id = %s and
                evaluation_start_time = %s
                {where_hash}
                order by 1
                """,
                (model_id, fake_test_matrix_store.as_of_dates[0]),
            )
        ]
        # Sorted cross-product of all configured testing metrics/thresholds.
        assert records == [
            "accuracy",
            "average precision score",
            "f1",
            "false [email protected]_pct",
            "false negatives@10_abs",
            "false [email protected]_pct",
            "false negatives@5_abs",
            "false [email protected]_pct",
            "false positives@10_abs",
            "false [email protected]_pct",
            "false positives@5_abs",
            "[email protected]_beta",
            "[email protected]_beta",
            "mediocre",
            "[email protected]_pct",
            "precision@10_abs",
            "[email protected]_pct",
            "precision@5_abs",
            "[email protected]_pct",
            "recall@10_abs",
            "[email protected]_pct",
            "recall@5_abs",
            "roc_auc",
            "true [email protected]_pct",
            "true negatives@10_abs",
            "true [email protected]_pct",
            "true negatives@5_abs",
            "true [email protected]_pct",
            "true positives@10_abs",
            "true [email protected]_pct",
            "true positives@5_abs",
        ]

        # Evaluate the training metrics and test
        model_evaluator.evaluate(
            trained_model.predict_proba(labels)[:, 1],
            fake_train_matrix_store,
            model_id,
            subset=subset,
        )
        records = [
            row[0]
            for row in db_engine_with_results_schema.execute(
                f"""select distinct(metric || parameter)
                from train_results.evaluations
                where model_id = %s and
                evaluation_start_time = %s
                {where_hash}
                order by 1""",
                (model_id, fake_train_matrix_store.as_of_dates[0]),
            )
        ]
        assert records == ["accuracy", "roc_auc"]

    # ensure that the matrix uuid is present
    matrix_uuids = [
        row[0]
        for row in db_engine_with_results_schema.execute(
            "select matrix_uuid from train_results.evaluations"
        )
    ]
    assert all(matrix_uuid == "1234" for matrix_uuid in matrix_uuids)
def test_integration():
    """Full pipeline integration: train a 4-model grid, predict on two test
    matrices (two as-of dates), evaluate, and verify the prediction and
    evaluation rows written for every (model, date) pair.
    """
    with rig_engines() as (db_engine, project_storage):
        train_store = get_matrix_store(
            project_storage,
            matrix_creator(),
            matrix_metadata_creator(matrix_type='train'))
        as_of_dates = [datetime.date(2016, 12, 21), datetime.date(2017, 1, 21)]
        # One single-entity test matrix per as-of date.
        test_stores = []
        for as_of_date in as_of_dates:
            matrix_store = get_matrix_store(
                project_storage,
                pandas.DataFrame.from_dict({
                    'entity_id': [3],
                    'feature_one': [8],
                    'feature_two': [5],
                    'label': [0]
                }).set_index('entity_id'),
                matrix_metadata_creator(end_time=as_of_date, indices=['entity_id']))
            test_stores.append(matrix_store)

        model_storage_engine = ModelStorageEngine(project_storage)
        experiment_hash = save_experiment_and_get_hash({}, db_engine)

        # instantiate pipeline objects
        trainer = ModelTrainer(
            experiment_hash=experiment_hash,
            model_storage_engine=model_storage_engine,
            db_engine=db_engine,
        )
        predictor = Predictor(model_storage_engine, db_engine)
        model_evaluator = ModelEvaluator(
            [{
                'metrics': ['precision@'],
                'thresholds': {'top_n': [5]}
            }],
            [{}],
            db_engine)

        # run the pipeline
        # 2 C values x 2 penalties -> 4 models
        grid_config = {
            'sklearn.linear_model.LogisticRegression': {
                'C': [0.00001, 0.0001],
                'penalty': ['l1', 'l2'],
                'random_state': [2193]
            }
        }
        model_ids = trainer.train_models(
            grid_config=grid_config,
            misc_db_parameters=dict(),
            matrix_store=train_store)
        for model_id in model_ids:
            for as_of_date, test_store in zip(as_of_dates, test_stores):
                predictions_proba = predictor.predict(
                    model_id,
                    test_store,
                    misc_db_parameters=dict(),
                    train_matrix_columns=['feature_one', 'feature_two'])
                model_evaluator.evaluate(
                    predictions_proba,
                    test_store,
                    model_id,
                )

        # assert
        # 1. that the predictions table entries are present and
        # can be linked to the original models
        records = [
            row for row in db_engine.execute(
                '''select entity_id, model_id, as_of_date
                from test_results.predictions
                join model_metadata.models using (model_id)
                order by 3, 2''')
        ]
        # 4 models x 2 dates for the single entity (id 3).
        assert records == [
            (3, 1, datetime.datetime(2016, 12, 21)),
            (3, 2, datetime.datetime(2016, 12, 21)),
            (3, 3, datetime.datetime(2016, 12, 21)),
            (3, 4, datetime.datetime(2016, 12, 21)),
            (3, 1, datetime.datetime(2017, 1, 21)),
            (3, 2, datetime.datetime(2017, 1, 21)),
            (3, 3, datetime.datetime(2017, 1, 21)),
            (3, 4, datetime.datetime(2017, 1, 21)),
        ]

        # that evaluations are there
        records = [
            row for row in db_engine.execute('''
                select model_id, evaluation_start_time, metric, parameter
                from test_results.evaluations order by 2, 1''')
        ]
        assert records == [
            (1, datetime.datetime(2016, 12, 21), 'precision@', '5_abs'),
            (2, datetime.datetime(2016, 12, 21), 'precision@', '5_abs'),
            (3, datetime.datetime(2016, 12, 21), 'precision@', '5_abs'),
            (4, datetime.datetime(2016, 12, 21), 'precision@', '5_abs'),
            (1, datetime.datetime(2017, 1, 21), 'precision@', '5_abs'),
            (2, datetime.datetime(2017, 1, 21), 'precision@', '5_abs'),
            (3, datetime.datetime(2017, 1, 21), 'precision@', '5_abs'),
            (4, datetime.datetime(2017, 1, 21), 'precision@', '5_abs'),
        ]
def test_integration():
    """Legacy integration variant using mocked S3 model storage: trains a
    4-model grid, predicts on two single-entity test matrices, evaluates,
    and verifies test_results.test_predictions / test_evaluations rows.
    """
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)
        init_engine(db_engine)
        with mock_s3():
            s3_conn = boto3.resource('s3')
            s3_conn.create_bucket(Bucket='econ-dev')
            project_path = 'econ-dev/inspections'

            # create train and test matrices
            train_matrix = pandas.DataFrame.from_dict({
                'entity_id': [1, 2],
                'feature_one': [3, 4],
                'feature_two': [5, 6],
                'label': [7, 8]
            }).set_index('entity_id')
            # NOTE(review): train_metadata is built but sample_metadata() is
            # passed to the store below — confirm which metadata is intended.
            train_metadata = {
                'feature_start_time': datetime.date(2012, 12, 20),
                'end_time': datetime.date(2016, 12, 20),
                'label_name': 'label',
                'label_timespan': '1y',
                'feature_names': ['ft1', 'ft2'],
                'metta-uuid': '1234',
                'indices': ['entity_id'],
                'matrix_type': 'train'
            }
            # Creates a matrix entry in the matrices table with uuid from train_metadata
            MatrixFactory(matrix_uuid="1234")
            session.commit()

            train_store = InMemoryMatrixStore(train_matrix, sample_metadata())

            as_of_dates = [
                datetime.date(2016, 12, 21),
                datetime.date(2017, 1, 21)
            ]
            # One single-entity test matrix per as-of date.
            test_stores = [
                InMemoryMatrixStore(
                    pandas.DataFrame.from_dict({
                        'entity_id': [3],
                        'feature_one': [8],
                        'feature_two': [5],
                        'label': [5]
                    }),
                    {
                        'label_name': 'label',
                        'label_timespan': '1y',
                        'end_time': as_of_date,
                        'metta-uuid': '1234',
                        'indices': ['entity_id'],
                        'matrix_type': 'test',
                        'as_of_date_frequency': '1month'
                    })
                for as_of_date in as_of_dates
            ]

            model_storage_engine = S3ModelStorageEngine(project_path)
            experiment_hash = save_experiment_and_get_hash({}, db_engine)

            # instantiate pipeline objects
            trainer = ModelTrainer(
                project_path=project_path,
                experiment_hash=experiment_hash,
                model_storage_engine=model_storage_engine,
                db_engine=db_engine,
            )
            predictor = Predictor(project_path, model_storage_engine, db_engine)
            model_evaluator = ModelEvaluator(
                [{
                    'metrics': ['precision@'],
                    'thresholds': {'top_n': [5]}
                }],
                [{}],
                db_engine)

            # run the pipeline
            # 2 C values x 2 penalties -> 4 models
            grid_config = {
                'sklearn.linear_model.LogisticRegression': {
                    'C': [0.00001, 0.0001],
                    'penalty': ['l1', 'l2'],
                    'random_state': [2193]
                }
            }
            model_ids = trainer.train_models(
                grid_config=grid_config,
                misc_db_parameters=dict(),
                matrix_store=train_store)
            for model_id in model_ids:
                for as_of_date, test_store in zip(as_of_dates, test_stores):
                    predictions_proba = predictor.predict(
                        model_id,
                        test_store,
                        misc_db_parameters=dict(),
                        train_matrix_columns=['feature_one', 'feature_two'])
                    model_evaluator.evaluate(
                        predictions_proba,
                        test_store,
                        model_id,
                    )

            # assert
            # 1. that the predictions table entries are present and
            # can be linked to the original models
            records = [
                row for row in db_engine.execute(
                    '''select entity_id, model_id, as_of_date
                    from test_results.test_predictions
                    join model_metadata.models using (model_id)
                    order by 3, 2''')
            ]
            # 4 models x 2 dates for the single entity (id 3).
            assert records == [
                (3, 1, datetime.datetime(2016, 12, 21)),
                (3, 2, datetime.datetime(2016, 12, 21)),
                (3, 3, datetime.datetime(2016, 12, 21)),
                (3, 4, datetime.datetime(2016, 12, 21)),
                (3, 1, datetime.datetime(2017, 1, 21)),
                (3, 2, datetime.datetime(2017, 1, 21)),
                (3, 3, datetime.datetime(2017, 1, 21)),
                (3, 4, datetime.datetime(2017, 1, 21)),
            ]

            # that evaluations are there
            records = [
                row for row in db_engine.execute('''
                    select model_id, evaluation_start_time, metric, parameter
                    from test_results.test_evaluations order by 2, 1''')
            ]
            assert records == [
                (1, datetime.datetime(2016, 12, 21), 'precision@', '5_abs'),
                (2, datetime.datetime(2016, 12, 21), 'precision@', '5_abs'),
                (3, datetime.datetime(2016, 12, 21), 'precision@', '5_abs'),
                (4, datetime.datetime(2016, 12, 21), 'precision@', '5_abs'),
                (1, datetime.datetime(2017, 1, 21), 'precision@', '5_abs'),
                (2, datetime.datetime(2017, 1, 21), 'precision@', '5_abs'),
                (3, datetime.datetime(2017, 1, 21), 'precision@', '5_abs'),
                (4, datetime.datetime(2017, 1, 21), 'precision@', '5_abs'),
            ]
def test_evaluating_early_warning():
    """Legacy variant (InMemoryModelStorageEngine): verifies one evaluation
    row per configured testing metric/parameter combination, and only
    accuracy/roc_auc for training metrics.
    """
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)
        testing_metric_groups = [{
            'metrics': ['precision@',
                        'recall@',
                        'true positives@',
                        'true negatives@',
                        'false positives@',
                        'false negatives@'],
            'thresholds': {
                'percentiles': [5.0, 10.0],
                'top_n': [5, 10]
            }
        }, {
            'metrics': ['f1',
                        'mediocre',
                        'accuracy',
                        'roc_auc',
                        'average precision score'],
        }, {
            'metrics': ['fbeta@'],
            'parameters': [{'beta': 0.75}, {'beta': 1.25}]
        }]
        training_metric_groups = [{'metrics': ['accuracy', 'roc_auc']}]
        # 'mediocre' is a custom metric supplied via the always_half callable.
        custom_metrics = {'mediocre': always_half}
        model_evaluator = ModelEvaluator(
            testing_metric_groups,
            training_metric_groups,
            db_engine,
            custom_metrics=custom_metrics
        )

        labels = fake_labels(5)
        fake_train_matrix_store = MockMatrixStore('train', 'efgh', 5, db_engine, labels)
        fake_test_matrix_store = MockMatrixStore('test', '1234', 5, db_engine, labels)
        trained_model, model_id = fake_trained_model(
            'myproject', InMemoryModelStorageEngine('myproject'), db_engine
        )

        # Evaluate the testing metrics and test for all of them.
        model_evaluator.evaluate(
            trained_model.predict_proba(labels)[:, 1],
            fake_test_matrix_store,
            model_id,
        )
        records = [
            row[0] for row in db_engine.execute(
                '''select distinct(metric || parameter)
                from test_results.evaluations
                where model_id = %s and
                evaluation_start_time = %s
                order by 1''',
                (model_id, fake_test_matrix_store.as_of_dates[0])
            )
        ]
        # Sorted cross-product of all configured testing metrics/thresholds.
        assert records == [
            'accuracy',
            'average precision score',
            'f1',
            'false [email protected]_pct',
            'false negatives@10_abs',
            'false [email protected]_pct',
            'false negatives@5_abs',
            'false [email protected]_pct',
            'false positives@10_abs',
            'false [email protected]_pct',
            'false positives@5_abs',
            '[email protected]_beta',
            '[email protected]_beta',
            'mediocre',
            '[email protected]_pct',
            'precision@10_abs',
            '[email protected]_pct',
            'precision@5_abs',
            '[email protected]_pct',
            'recall@10_abs',
            '[email protected]_pct',
            'recall@5_abs',
            'roc_auc',
            'true [email protected]_pct',
            'true negatives@10_abs',
            'true [email protected]_pct',
            'true negatives@5_abs',
            'true [email protected]_pct',
            'true positives@10_abs',
            'true [email protected]_pct',
            'true positives@5_abs'
        ]

        # Evaluate the training metrics and test
        model_evaluator.evaluate(
            trained_model.predict_proba(labels)[:, 1],
            fake_train_matrix_store,
            model_id,
        )
        records = [
            row[0] for row in db_engine.execute(
                '''select distinct(metric || parameter)
                from train_results.evaluations
                where model_id = %s and
                evaluation_start_time = %s
                order by 1''',
                (model_id, fake_train_matrix_store.as_of_dates[0])
            )
        ]
        assert records == ['accuracy', 'roc_auc']
def test_model_scoring_inspections():
    """Check the bookkeeping columns written alongside each evaluation:
    labeled-example counts, positive-label counts, and the count of
    labeled rows above the threshold, for both testing and training
    matrices (the testing labels include a NaN, i.e. an unlabeled row)."""
    with testing.postgresql.Postgresql() as postgresql:
        engine = create_engine(postgresql.url())
        ensure_db(engine)

        testing_metric_groups = [
            {
                'metrics': ['precision@', 'recall@', 'fpr@'],
                'thresholds': {'percentiles': [50.0], 'top_n': [3]},
            },
            {
                # ensure we test a non-thresholded metric as well
                'metrics': ['accuracy'],
            },
        ]
        training_metric_groups = [
            {'metrics': ['accuracy'], 'thresholds': {'percentiles': [50.0]}}
        ]
        model_evaluator = ModelEvaluator(
            testing_metric_groups, training_metric_groups, engine
        )

        testing_labels = numpy.array([True, False, numpy.nan, True, False])
        testing_prediction_probas = numpy.array([0.56, 0.4, 0.55, 0.5, 0.3])

        training_labels = numpy.array(
            [False, False, True, True, True, False, True, True]
        )
        training_prediction_probas = numpy.array(
            [0.6, 0.4, 0.55, 0.70, 0.3, 0.2, 0.8, 0.6]
        )

        fake_train_matrix_store = MockMatrixStore(
            'train', 'efgh', 5, engine, training_labels)
        fake_test_matrix_store = MockMatrixStore(
            'test', '1234', 5, engine, testing_labels)
        trained_model, model_id = fake_trained_model(
            'myproject', InMemoryModelStorageEngine('myproject'), engine
        )

        # Evaluate testing matrix and test the results
        model_evaluator.evaluate(
            testing_prediction_probas,
            fake_test_matrix_store,
            model_id,
        )
        for record in engine.execute(
            '''select * from test_results.evaluations
            where model_id = %s and evaluation_start_time = %s
            order by 1''',
            (model_id, fake_test_matrix_store.as_of_dates[0]),
        ):
            assert record['num_labeled_examples'] == 4
            assert record['num_positive_labels'] == 2
            if record['parameter'] == '':
                # non-thresholded metric: every labeled row counts
                assert record['num_labeled_above_threshold'] == 4
            elif 'pct' in record['parameter']:
                assert record['num_labeled_above_threshold'] == 1
            else:
                assert record['num_labeled_above_threshold'] == 2

        # Evaluate the training matrix and test the results
        model_evaluator.evaluate(
            training_prediction_probas,
            fake_train_matrix_store,
            model_id,
        )
        for record in engine.execute(
            '''select * from train_results.evaluations
            where model_id = %s and evaluation_start_time = %s
            order by 1''',
            (model_id, fake_train_matrix_store.as_of_dates[0]),
        ):
            assert record['num_labeled_examples'] == 8
            assert record['num_positive_labels'] == 5
            assert record['value'] == 0.625
class ModelTester(object):
    """Runs the post-training stages for a time split: individual feature
    importances, predictions, and metric evaluation for each trained model.

    Args:
        db_engine: SQLAlchemy engine used by all component objects
        project_path: storage path handed to the Predictor
        model_storage_engine: storage engine that holds trained models
        replace (bool): whether components should overwrite existing results
        evaluator_config (dict): requires 'metric_groups' and
            'training_metric_groups'; 'sort_seed' is optional
        individual_importance_config (dict): optional 'n_ranks' (default 5)
            and 'methods' (default ['uniform'])
    """

    def __init__(self, db_engine, project_path, model_storage_engine,
                 replace, evaluator_config, individual_importance_config):
        self.predictor = Predictor(
            db_engine=db_engine,
            model_storage_engine=model_storage_engine,
            project_path=project_path,
            replace=replace,
        )
        self.individual_importance_calculator = IndividualImportanceCalculator(
            db_engine=db_engine,
            n_ranks=individual_importance_config.get('n_ranks', 5),
            methods=individual_importance_config.get('methods', ['uniform']),
            replace=replace,
        )
        # ModelEvaluator's first metric-group parameter is named
        # 'testing_metric_groups' at every other call site in this file;
        # the previous 'metric_groups=' keyword would raise a TypeError.
        self.evaluator = ModelEvaluator(
            db_engine=db_engine,
            sort_seed=evaluator_config.get('sort_seed', None),
            testing_metric_groups=evaluator_config['metric_groups'],
            training_metric_groups=evaluator_config['training_metric_groups'],
        )

    def generate_model_test_tasks(self, split, train_store, model_ids,
                                  matrix_store_creator):
        """Build one task dict per non-empty test matrix in the split.

        Args:
            split (dict): needs parallel lists 'test_matrices'/'test_uuids'
            train_store: matrix store for the split's training matrix
            model_ids (iterable): model ids to test; falsy ids are dropped
            matrix_store_creator (callable): uuid -> matrix store

        Returns:
            list of dicts with 'test_store', 'train_store', 'model_ids'
        """
        test_tasks = []
        # zip keeps the two lists aligned; the matrix definition itself
        # is not needed here, only the uuid.
        for _matrix_def, test_uuid in zip(split['test_matrices'],
                                          split['test_uuids']):
            test_store = matrix_store_creator(test_uuid)
            if test_store.empty:
                logging.warning(
                    '''Test matrix for uuid %s
                    was empty, no point in generating predictions. Not creating test task.
                    ''', test_uuid)
                continue
            test_tasks.append({
                'test_store': test_store,
                'train_store': train_store,
                'model_ids': [model_id for model_id in model_ids if model_id],
            })
        return test_tasks

    def process_model_test_task(self, test_store, train_store, model_ids):
        """For each model id: save individual importances against the test
        matrix, then predict and evaluate on both the test and train
        matrices (train-matrix columns drive prediction alignment)."""
        as_of_times = test_store.metadata['as_of_times']
        logging.info(
            'Testing and scoring all model ids with test matrix %s. '
            'as_of_times min: %s max: %s num: %s',
            test_store.uuid,
            min(as_of_times),
            max(as_of_times),
            len(as_of_times),
        )
        for model_id in model_ids:
            logging.info('Testing model id %s', model_id)
            self.individual_importance_calculator\
                .calculate_and_save_all_methods_and_dates(
                    model_id,
                    test_store
                )

            # Generate predictions for the testing data then training data
            for store in (test_store, train_store):
                predictions_proba = self.predictor.predict(
                    model_id,
                    store,
                    misc_db_parameters=dict(),
                    train_matrix_columns=train_store.columns(),
                )
                self.evaluator.evaluate(
                    predictions_proba=predictions_proba,
                    matrix_store=store,
                    model_id=model_id,
                )
def test_integration():
    """Wire ModelTrainer, Predictor and ModelEvaluator together end to end
    and verify that predictions and evaluations land in the database for
    every trained model and every test-matrix as-of date."""
    with rig_engines() as (db_engine, project_storage):
        train_store = get_matrix_store(
            project_storage,
            matrix_creator(),
            matrix_metadata_creator(matrix_type="train"),
        )
        as_of_dates = [datetime.date(2016, 12, 21), datetime.date(2017, 1, 21)]

        # one tiny single-entity test matrix per as-of date
        test_stores = []
        for as_of_date in as_of_dates:
            single_entity_frame = pandas.DataFrame.from_dict({
                "entity_id": [3],
                "feature_one": [8],
                "feature_two": [5],
                "label": [0],
            }).set_index("entity_id")
            test_stores.append(
                get_matrix_store(
                    project_storage,
                    single_entity_frame,
                    matrix_metadata_creator(
                        end_time=as_of_date, indices=["entity_id"]
                    ),
                )
            )

        model_storage_engine = ModelStorageEngine(project_storage)
        experiment_hash = save_experiment_and_get_hash({}, db_engine)

        # instantiate pipeline objects
        trainer = ModelTrainer(
            experiment_hash=experiment_hash,
            model_storage_engine=model_storage_engine,
            db_engine=db_engine,
        )
        predictor = Predictor(model_storage_engine, db_engine)
        model_evaluator = ModelEvaluator(
            [{"metrics": ["precision@"], "thresholds": {"top_n": [5]}}],
            [{}],
            db_engine,
        )

        # run the pipeline
        grid_config = {
            "sklearn.linear_model.LogisticRegression": {
                "C": [0.00001, 0.0001],
                "penalty": ["l1", "l2"],
                "random_state": [2193],
            }
        }
        model_ids = trainer.train_models(
            grid_config=grid_config,
            misc_db_parameters=dict(),
            matrix_store=train_store,
        )
        for model_id in model_ids:
            for test_store in test_stores:
                predictions_proba = predictor.predict(
                    model_id,
                    test_store,
                    misc_db_parameters=dict(),
                    train_matrix_columns=["feature_one", "feature_two"],
                )
                model_evaluator.evaluate(
                    predictions_proba, test_store, model_id
                )

        # assert
        # 1. that the predictions table entries are present and
        # can be linked to the original models
        records = list(db_engine.execute(
            """select entity_id, model_id, as_of_date
            from test_results.predictions
            join model_metadata.models using (model_id)
            order by 3, 2"""))
        assert records == [
            (3, 1, datetime.datetime(2016, 12, 21)),
            (3, 2, datetime.datetime(2016, 12, 21)),
            (3, 3, datetime.datetime(2016, 12, 21)),
            (3, 4, datetime.datetime(2016, 12, 21)),
            (3, 1, datetime.datetime(2017, 1, 21)),
            (3, 2, datetime.datetime(2017, 1, 21)),
            (3, 3, datetime.datetime(2017, 1, 21)),
            (3, 4, datetime.datetime(2017, 1, 21)),
        ]

        # that evaluations are there
        records = list(db_engine.execute("""
            select model_id, evaluation_start_time, metric, parameter
            from test_results.evaluations order by 2, 1"""))
        assert records == [
            (1, datetime.datetime(2016, 12, 21), "precision@", "5_abs"),
            (2, datetime.datetime(2016, 12, 21), "precision@", "5_abs"),
            (3, datetime.datetime(2016, 12, 21), "precision@", "5_abs"),
            (4, datetime.datetime(2016, 12, 21), "precision@", "5_abs"),
            (1, datetime.datetime(2017, 1, 21), "precision@", "5_abs"),
            (2, datetime.datetime(2017, 1, 21), "precision@", "5_abs"),
            (3, datetime.datetime(2017, 1, 21), "precision@", "5_abs"),
            (4, datetime.datetime(2017, 1, 21), "precision@", "5_abs"),
        ]