def test_model_scoring_inspections(db_engine_with_results_schema):
    testing_metric_groups = [
        {
            "metrics": ["precision@", "recall@", "fpr@"],
            "thresholds": {"percentiles": [50.0], "top_n": [3]},
        },
        {
            # ensure we test a non-thresholded metric as well
            "metrics": ["accuracy"]
        },
    ]
    training_metric_groups = [
        {"metrics": ["accuracy"], "thresholds": {"percentiles": [50.0]}}
    ]
    model_evaluator = ModelEvaluator(
        testing_metric_groups,
        training_metric_groups,
        db_engine_with_results_schema,
    )

    testing_labels = np.array([1, 0, np.nan, 1, 0])
    testing_prediction_probas = np.array([0.56, 0.4, 0.55, 0.5, 0.3])

    training_labels = np.array([0, 0, 1, 1, 1, 0, 1, 1])
    training_prediction_probas = np.array(
        [0.6, 0.4, 0.55, 0.70, 0.3, 0.2, 0.8, 0.6]
    )

    fake_train_matrix_store = MockMatrixStore(
        "train", "efgh", 5, db_engine_with_results_schema, training_labels
    )
    fake_test_matrix_store = MockMatrixStore(
        "test", "1234", 5, db_engine_with_results_schema, testing_labels
    )

    trained_model, model_id = fake_trained_model(
        db_engine_with_results_schema,
        train_end_time=TRAIN_END_TIME,
    )

    # Evaluate testing matrix and test the results
    model_evaluator.evaluate(
        testing_prediction_probas, fake_test_matrix_store, model_id
    )
    for record in db_engine_with_results_schema.execute(
        """select * from test_results.evaluations
        where model_id = %s and evaluation_start_time = %s
        order by 1""",
        (model_id, fake_test_matrix_store.as_of_dates[0]),
    ):
        assert record["num_labeled_examples"] == 4
        assert record["num_positive_labels"] == 2
        if record["parameter"] == "":
            assert record["num_labeled_above_threshold"] == 4
        elif "pct" in record["parameter"]:
            assert record["num_labeled_above_threshold"] == 1
        else:
            assert record["num_labeled_above_threshold"] == 2

    # Evaluate the training matrix and test the results
    model_evaluator.evaluate(
        training_prediction_probas, fake_train_matrix_store, model_id
    )
    for record in db_engine_with_results_schema.execute(
        """select * from train_results.evaluations
        where model_id = %s and evaluation_start_time = %s
        order by 1""",
        (model_id, fake_train_matrix_store.as_of_dates[0]),
    ):
        assert record["num_labeled_examples"] == 8
        assert record["num_positive_labels"] == 5
        assert record["worst_value"] == 0.625
        assert record["best_value"] == 0.625
        assert record["stochastic_value"] == 0.625
        # best/worst are the same, so the evaluator should shortcut the trials
        assert record["num_sort_trials"] == 0
        assert record["standard_deviation"] == 0
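# A minimal sketch (illustrative only, not called by the suite) of why the
# training accuracy above is exactly 0.625 and why best_value equals
# worst_value: at the 50th-percentile cutoff the top 4 of 8 scores are
# predicted positive, and no tied scores straddle the cutoff, so every
# tie-breaking order yields the same confusion matrix and the evaluator can
# skip its sorting trials.
def _sketch_percentile_accuracy():
    probas = np.array([0.6, 0.4, 0.55, 0.70, 0.3, 0.2, 0.8, 0.6])
    labels = np.array([0, 0, 1, 1, 1, 0, 1, 1])
    cutoff = len(probas) // 2  # 50th percentile -> top 4 entities
    top = np.argsort(-probas, kind="stable")[:cutoff]
    preds = np.zeros_like(labels)
    preds[top] = 1
    # 5 of 8 predictions match their labels
    assert (preds == labels).mean() == 0.625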
def test_evaluating_early_warning():
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)
        metric_groups = [{
            'metrics': [
                'precision@', 'recall@', 'true positives@',
                'true negatives@', 'false positives@', 'false negatives@'
            ],
            'thresholds': {'percentiles': [5.0, 10.0], 'top_n': [5, 10]}
        }, {
            'metrics': [
                'f1', 'mediocre', 'accuracy', 'roc_auc',
                'average precision score'
            ],
        }, {
            'metrics': ['fbeta@'],
            'parameters': [{'beta': 0.75}, {'beta': 1.25}]
        }]
        custom_metrics = {'mediocre': always_half}
        model_evaluator = ModelEvaluator(
            metric_groups, db_engine, custom_metrics=custom_metrics
        )
        trained_model, model_id = fake_trained_model(
            'myproject', InMemoryModelStorageEngine('myproject'), db_engine
        )
        labels = fake_labels(5)
        as_of_date = datetime.date(2016, 5, 5)
        model_evaluator.evaluate(
            trained_model.predict_proba(labels)[:, 1],
            labels,
            model_id,
            as_of_date,
            as_of_date,
            '1y'
        )

        # assert that all of the records are there
        records = [
            row[0] for row in db_engine.execute(
                '''select distinct(metric || parameter)
                from results.evaluations
                where model_id = %s and evaluation_start_time = %s
                order by 1''',
                (model_id, as_of_date)
            )
        ]
        assert records == [
            'accuracy',
            'average precision score',
            'f1',
            'false [email protected]_pct',
            'false negatives@10_abs',
            'false [email protected]_pct',
            'false negatives@5_abs',
            'false [email protected]_pct',
            'false positives@10_abs',
            'false [email protected]_pct',
            'false positives@5_abs',
            '[email protected]_beta',
            '[email protected]_beta',
            'mediocre',
            '[email protected]_pct',
            'precision@10_abs',
            '[email protected]_pct',
            'precision@5_abs',
            '[email protected]_pct',
            'recall@10_abs',
            '[email protected]_pct',
            'recall@5_abs',
            'roc_auc',
            'true [email protected]_pct',
            'true negatives@10_abs',
            'true [email protected]_pct',
            'true negatives@5_abs',
            'true [email protected]_pct',
            'true positives@10_abs',
            'true [email protected]_pct',
            'true positives@5_abs'
        ]
def test_evaluating_early_warning(db_engine_with_results_schema):
    num_entities = 10
    labels = [0, 1, 0, 1, 0, 1, 0, 1, 0, 1]

    # Set up testing configuration parameters
    testing_metric_groups = [
        {
            "metrics": [
                "precision@",
                "recall@",
                "true positives@",
                "true negatives@",
                "false positives@",
                "false negatives@",
            ],
            "thresholds": {"percentiles": [5.0, 10.0], "top_n": [5, 10]},
        },
        {
            "metrics": [
                "f1",
                "mediocre",
                "accuracy",
                "roc_auc",
                "average precision score",
            ]
        },
        {"metrics": ["fbeta@"], "parameters": [{"beta": 0.75}, {"beta": 1.25}]},
    ]
    training_metric_groups = [{"metrics": ["accuracy", "roc_auc"]}]
    custom_metrics = {"mediocre": always_half}

    # Acquire fake data and objects to be used in the tests
    model_evaluator = ModelEvaluator(
        testing_metric_groups,
        training_metric_groups,
        db_engine_with_results_schema,
        custom_metrics=custom_metrics,
    )
    fake_test_matrix_store = MockMatrixStore(
        matrix_type="test",
        matrix_uuid="efgh",
        label_count=num_entities,
        db_engine=db_engine_with_results_schema,
        init_labels=pd.DataFrame(
            {
                "label_value": labels,
                "entity_id": list(range(num_entities)),
                "as_of_date": [TRAIN_END_TIME] * num_entities,
            }
        ).set_index(["entity_id", "as_of_date"]).label_value,
        init_as_of_dates=[TRAIN_END_TIME],
    )
    fake_train_matrix_store = MockMatrixStore(
        matrix_type="train",
        matrix_uuid="1234",
        label_count=num_entities,
        db_engine=db_engine_with_results_schema,
        init_labels=pd.DataFrame(
            {
                "label_value": labels,
                "entity_id": list(range(num_entities)),
                "as_of_date": [TRAIN_END_TIME] * num_entities,
            }
        ).set_index(["entity_id", "as_of_date"]).label_value,
        init_as_of_dates=[TRAIN_END_TIME],
    )
    trained_model, model_id = fake_trained_model(
        db_engine_with_results_schema,
        train_end_time=TRAIN_END_TIME,
    )

    # Evaluate the training metrics and test
    model_evaluator.evaluate(
        trained_model.predict_proba(labels)[:, 1],
        fake_train_matrix_store,
        model_id,
    )
    records = [
        row[0]
        for row in db_engine_with_results_schema.execute(
            """select distinct(metric || parameter)
            from train_results.evaluations
            where model_id = %s and
            evaluation_start_time = %s
            order by 1""",
            (model_id, fake_train_matrix_store.as_of_dates[0]),
        )
    ]
    assert records == ["accuracy", "roc_auc"]

    # Run tests for overall and subset evaluations
    for subset in SUBSETS:
        if subset is None:
            where_hash = ""
        else:
            populate_subset_data(
                db_engine_with_results_schema, subset, list(range(num_entities))
            )
            SubsetFactory(subset_hash=filename_friendly_hash(subset))
            session.commit()
            where_hash = f"and subset_hash = '{filename_friendly_hash(subset)}'"

        # Evaluate the testing metrics and test for all of them.
        model_evaluator.evaluate(
            trained_model.predict_proba(labels)[:, 1],
            fake_test_matrix_store,
            model_id,
            subset=subset,
        )
        records = [
            row[0]
            for row in db_engine_with_results_schema.execute(
                f"""select distinct(metric || parameter)
                from test_results.evaluations
                where model_id = %s and
                evaluation_start_time = %s
                {where_hash}
                order by 1""",
                (model_id, fake_test_matrix_store.as_of_dates[0]),
            )
        ]
        assert records == [
            "accuracy",
            "average precision score",
            "f1",
            "false [email protected]_pct",
            "false negatives@10_abs",
            "false [email protected]_pct",
            "false negatives@5_abs",
            "false [email protected]_pct",
            "false positives@10_abs",
            "false [email protected]_pct",
            "false positives@5_abs",
            "[email protected]_beta",
            "[email protected]_beta",
            "mediocre",
            "[email protected]_pct",
            "precision@10_abs",
            "[email protected]_pct",
            "precision@5_abs",
            "[email protected]_pct",
            "recall@10_abs",
            "[email protected]_pct",
            "recall@5_abs",
            "roc_auc",
            "true [email protected]_pct",
            "true negatives@10_abs",
            "true [email protected]_pct",
            "true negatives@5_abs",
            "true [email protected]_pct",
            "true positives@10_abs",
            "true [email protected]_pct",
            "true positives@5_abs",
        ]

        # Evaluate the training metrics and test
        model_evaluator.evaluate(
            trained_model.predict_proba(labels)[:, 1],
            fake_train_matrix_store,
            model_id,
            subset=subset,
        )
        records = [
            row[0]
            for row in db_engine_with_results_schema.execute(
                f"""select distinct(metric || parameter)
                from train_results.evaluations
                where model_id = %s and
                evaluation_start_time = %s
                {where_hash}
                order by 1""",
                (model_id, fake_train_matrix_store.as_of_dates[0]),
            )
        ]
        assert records == ["accuracy", "roc_auc"]

    # ensure that the matrix uuids are present, now that evaluations exist
    matrix_uuids = [
        row[0]
        for row in db_engine_with_results_schema.execute(
            "select matrix_uuid from test_results.evaluations"
        )
    ]
    assert all(matrix_uuid == "efgh" for matrix_uuid in matrix_uuids)

    matrix_uuids = [
        row[0]
        for row in db_engine_with_results_schema.execute(
            "select matrix_uuid from train_results.evaluations"
        )
    ]
    assert all(matrix_uuid == "1234" for matrix_uuid in matrix_uuids)
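# A minimal sketch of a custom metric like the `always_half` referenced above.
# Assumption: custom metrics follow a catwalk-style callable signature of
# (predictions_proba, predictions_binary, labels, parameters) -> float; check
# the evaluation module for the exact interface before relying on this.
def _sketch_always_half(predictions_proba, predictions_binary, labels, parameters):
    # Returns a constant 0.5 regardless of predictions; because the 'mediocre'
    # group is configured without thresholds, its parameter string is empty
    # and it shows up bare in the asserted metric lists.
    return 0.5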
def test_predictor_retrieve():
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)
        init_engine(db_engine)
        project_path = 'econ-dev/inspections'
        model_storage_engine = InMemoryModelStorageEngine(project_path)
        _, model_id = fake_trained_model(
            project_path,
            model_storage_engine,
            db_engine,
            train_matrix_uuid='1234'
        )
        predictor = Predictor(
            project_path, model_storage_engine, db_engine, replace=False
        )
        dayone = datetime.date(2011, 1, 1).strftime(
            predictor.expected_matrix_ts_format
        )
        daytwo = datetime.date(2011, 1, 2).strftime(
            predictor.expected_matrix_ts_format
        )

        # create prediction set
        matrix_data = {
            'entity_id': [1, 2, 1, 2],
            'as_of_date': [dayone, dayone, daytwo, daytwo],
            'feature_one': [3, 4, 5, 6],
            'feature_two': [5, 6, 7, 8],
            'label': [7, 8, 8, 7]
        }
        matrix = pandas.DataFrame.from_dict(matrix_data)\
            .set_index(['entity_id', 'as_of_date'])
        metadata = {
            'label_name': 'label',
            'end_time': AS_OF_DATE,
            'label_timespan': '3month',
            'metta-uuid': '1234',
            'indices': ['entity_id', 'as_of_date'],
            'matrix_type': 'test'
        }
        matrix_store = InMemoryMatrixStore(matrix, metadata)
        predict_proba = predictor.predict(
            model_id,
            matrix_store,
            misc_db_parameters=dict(),
            train_matrix_columns=['feature_one', 'feature_two']
        )

        # When run again, the predictions retrieved from the database
        # should match.
        #
        # Some trickiness here. Let's explain:
        #
        # If we are not careful, retrieving predictions from the database and
        # presenting them as a numpy array can result in a bad ordering,
        # since the given matrix may not be 'ordered' by any criteria
        # that can be easily represented by an ORDER BY clause.
        #
        # It will sometimes work, because without ORDER BY you will get
        # the rows back in the table's physical order, which, unless something
        # has happened to the table, will be the order you inserted them in,
        # which could very well be the order in the matrix. So it's not a bug
        # that would necessarily show itself immediately, but when it does go
        # wrong your scores will be garbage.
        #
        # So we simulate a table-order mutation that can happen over time:
        # remove the first row and put it at the end. If the Predictor doesn't
        # explicitly reorder the results, this will fail. We only run this on
        # TestPrediction because TrainPrediction behaves in exactly the same
        # way.
        reorder_session = sessionmaker(bind=db_engine)()
        obj = reorder_session.query(TestPrediction).first()
        reorder_session.delete(obj)
        reorder_session.commit()

        make_transient(obj)
        reorder_session = sessionmaker(bind=db_engine)()
        reorder_session.add(obj)
        reorder_session.commit()

        predictor.load_model = Mock()
        new_predict_proba = predictor.predict(
            model_id,
            matrix_store,
            misc_db_parameters=dict(),
            train_matrix_columns=['feature_one', 'feature_two']
        )
        assert_array_equal(new_predict_proba, predict_proba)
        assert not predictor.load_model.called
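# A hypothetical helper (not from the codebase) sketching the reordering the
# Predictor must perform, per the comments in the test above: realign rows
# fetched from the database to the matrix's (entity_id, as_of_date) index
# instead of trusting the table's physical row order.
def _sketch_realign(db_rows, matrix_index):
    # db_rows: iterable of (entity_id, as_of_date, score) tuples in any order.
    scores = {(entity_id, as_of_date): score
              for entity_id, as_of_date, score in db_rows}
    # A KeyError here means a matrix row has no stored prediction.
    return [scores[key] for key in matrix_index]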
def test_model_scoring_inspections():
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)
        testing_metric_groups = [{
            'metrics': ['precision@', 'recall@', 'fpr@'],
            'thresholds': {'percentiles': [50.0], 'top_n': [3]}
        }, {
            # ensure we test a non-thresholded metric as well
            'metrics': ['accuracy'],
        }]
        training_metric_groups = [{
            'metrics': ['accuracy'],
            'thresholds': {'percentiles': [50.0]}
        }]
        model_evaluator = ModelEvaluator(
            testing_metric_groups, training_metric_groups, db_engine
        )
        testing_labels = numpy.array([True, False, numpy.nan, True, False])
        testing_prediction_probas = numpy.array([0.56, 0.4, 0.55, 0.5, 0.3])

        training_labels = numpy.array(
            [False, False, True, True, True, False, True, True]
        )
        training_prediction_probas = numpy.array(
            [0.6, 0.4, 0.55, 0.70, 0.3, 0.2, 0.8, 0.6]
        )

        fake_train_matrix_store = MockMatrixStore(
            'train', 'efgh', 5, db_engine, training_labels
        )
        fake_test_matrix_store = MockMatrixStore(
            'test', '1234', 5, db_engine, testing_labels
        )

        trained_model, model_id = fake_trained_model(
            'myproject', InMemoryModelStorageEngine('myproject'), db_engine
        )

        # Evaluate testing matrix and test the results
        model_evaluator.evaluate(
            testing_prediction_probas,
            fake_test_matrix_store,
            model_id,
        )
        for record in db_engine.execute(
            '''select * from test_results.evaluations
            where model_id = %s and evaluation_start_time = %s
            order by 1''',
            (model_id, fake_test_matrix_store.as_of_dates[0])
        ):
            assert record['num_labeled_examples'] == 4
            assert record['num_positive_labels'] == 2
            if record['parameter'] == '':
                assert record['num_labeled_above_threshold'] == 4
            elif 'pct' in record['parameter']:
                assert record['num_labeled_above_threshold'] == 1
            else:
                assert record['num_labeled_above_threshold'] == 2

        # Evaluate the training matrix and test the results
        model_evaluator.evaluate(
            training_prediction_probas,
            fake_train_matrix_store,
            model_id,
        )
        for record in db_engine.execute(
            '''select * from train_results.evaluations
            where model_id = %s and evaluation_start_time = %s
            order by 1''',
            (model_id, fake_train_matrix_store.as_of_dates[0])
        ):
            assert record['num_labeled_examples'] == 8
            assert record['num_positive_labels'] == 5
            assert record['value'] == 0.625
def test_predictor():
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)
        init_engine(db_engine)
        with mock_s3():
            s3_conn = boto3.resource('s3')
            s3_conn.create_bucket(Bucket='econ-dev')
            project_path = 'econ-dev/inspections'
            model_storage_engine = S3ModelStorageEngine(project_path)
            _, model_id = fake_trained_model(
                project_path,
                model_storage_engine,
                db_engine,
                train_matrix_uuid='1234'
            )
            predictor = Predictor(project_path, model_storage_engine, db_engine)

            # create prediction set
            matrix = pandas.DataFrame.from_dict({
                'entity_id': [1, 2],
                'feature_one': [3, 4],
                'feature_two': [5, 6],
                'label': [7, 8]
            }).set_index('entity_id')
            metadata = {
                'label_name': 'label',
                'end_time': AS_OF_DATE,
                'label_timespan': '3month',
                'metta-uuid': '1234',
                'indices': ['entity_id'],
            }

            train_matrix_columns = ['feature_one', 'feature_two']

            # Runs the same test for training and testing predictions
            for mat_type in ("train", "test"):
                # Create the matrix to be tested and store in db
                metadata['matrix_type'] = mat_type
                matrix_store = InMemoryMatrixStore(matrix, metadata)

                # Note: the first time 'matrix' is used, the label column is
                # popped. It must be added back into 'matrix' to create
                # another matrix_store.
                matrix['label'] = [7, 8]

                predict_proba = predictor.predict(
                    model_id,
                    matrix_store,
                    misc_db_parameters=dict(),
                    train_matrix_columns=train_matrix_columns
                )

                # assert
                # 1. that the returned predictions are of the desired length
                assert len(predict_proba) == 2

                # 2. that the predictions table entries are present and
                # can be linked to the original models
                records = [
                    row for row in db_engine.execute(
                        '''select entity_id, as_of_date
                        from {}_results.predictions
                        join model_metadata.models using (model_id)'''.format(
                            mat_type
                        )
                    )
                ]
                assert len(records) == 2

                # 3. that the contained as_of_dates match what we sent in
                for record in records:
                    assert record[1].date() == AS_OF_DATE

                # 4. that the entity ids match the given dataset
                assert sorted([record[0] for record in records]) == [1, 2]

            # 5. running with the same model_id and a different as-of date,
            # then again with the same as-of date, only replaces the records
            # with the same date
            new_matrix = pandas.DataFrame.from_dict({
                'entity_id': [1, 2],
                'feature_one': [3, 4],
                'feature_two': [5, 6],
                'label': [7, 8]
            }).set_index('entity_id')
            new_metadata = {
                'label_name': 'label',
                'end_time': AS_OF_DATE + datetime.timedelta(days=1),
                'label_timespan': '3month',
                'metta-uuid': '1234',
                'indices': ['entity_id'],
            }

            # Runs the same test for training and testing predictions
            for mat_type in ("train", "test"):
                # Create the matrix to be tested and store in db
                new_metadata['matrix_type'] = mat_type
                new_matrix_store = InMemoryMatrixStore(new_matrix, new_metadata)

                # Adding 'label' column back into new_matrix
                new_matrix['label'] = [7, 8]

                predictor.predict(
                    model_id,
                    new_matrix_store,
                    misc_db_parameters=dict(),
                    train_matrix_columns=train_matrix_columns
                )
                predictor.predict(
                    model_id,
                    matrix_store,
                    misc_db_parameters=dict(),
                    train_matrix_columns=train_matrix_columns
                )
                records = [
                    row for row in db_engine.execute(
                        '''select entity_id, as_of_date
                        from {}_results.predictions
                        join model_metadata.models using (model_id)'''.format(
                            mat_type
                        )
                    )
                ]
                assert len(records) == 4

            # 6. that we can delete the model when done predicting with it
            predictor.delete_model(model_id)
            assert predictor.load_model(model_id) is None
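# A small sketch of the schema templating exercised above: predictions land in
# test_results.predictions or train_results.predictions depending on the
# matrix_type, so building the query needs exactly one substitution. The
# helper name is hypothetical.
def _sketch_predictions_query(mat_type):
    assert mat_type in ("train", "test")
    return (
        "select entity_id, as_of_date "
        "from {}_results.predictions "
        "join model_metadata.models using (model_id)"
    ).format(mat_type)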
def test_predictor_composite_index():
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)
        init_engine(db_engine)
        project_path = 'econ-dev/inspections'
        model_storage_engine = InMemoryModelStorageEngine(project_path)
        _, model_id = fake_trained_model(
            project_path,
            model_storage_engine,
            db_engine,
            train_matrix_uuid='1234'
        )
        predictor = Predictor(project_path, model_storage_engine, db_engine)
        dayone = datetime.datetime(2011, 1, 1)
        daytwo = datetime.datetime(2011, 1, 2)

        # create prediction set
        matrix = pandas.DataFrame.from_dict({
            'entity_id': [1, 2, 1, 2],
            'as_of_date': [dayone, dayone, daytwo, daytwo],
            'feature_one': [3, 4, 5, 6],
            'feature_two': [5, 6, 7, 8],
            'label': [7, 8, 8, 7]
        }).set_index(['entity_id', 'as_of_date'])
        metadata = {
            'label_name': 'label',
            'end_time': AS_OF_DATE,
            'label_timespan': '3month',
            'metta-uuid': '1234',
            'indices': ['entity_id', 'as_of_date'],
        }

        # Runs the same test for training and testing predictions
        for mat_type in ("train", "test"):
            # Create the matrix to be tested and store in db
            metadata['matrix_type'] = mat_type
            matrix_store = InMemoryMatrixStore(matrix, metadata)

            # Adding 'label' column back into matrix
            matrix['label'] = [7, 8, 8, 7]

            predict_proba = predictor.predict(
                model_id,
                matrix_store,
                misc_db_parameters=dict(),
                train_matrix_columns=['feature_one', 'feature_two']
            )

            # assert
            # 1. that the returned predictions are of the desired length
            assert len(predict_proba) == 4

            # 2. that the predictions table entries are present and
            # can be linked to the original models
            records = [
                row for row in db_engine.execute(
                    '''select entity_id, as_of_date
                    from {}_results.predictions
                    join model_metadata.models using (model_id)'''.format(
                        mat_type
                    )
                )
            ]
            assert len(records) == 4
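# A minimal sketch (illustrative only) of the composite index the matrix above
# relies on: with a (entity_id, as_of_date) MultiIndex the same entity appears
# once per as-of date, which is why two entities across two dates yield four
# predictions.
def _sketch_composite_index():
    dayone = datetime.datetime(2011, 1, 1)
    daytwo = datetime.datetime(2011, 1, 2)
    frame = pandas.DataFrame.from_dict({
        'entity_id': [1, 2, 1, 2],
        'as_of_date': [dayone, dayone, daytwo, daytwo],
        'score': [0.1, 0.2, 0.3, 0.4],
    }).set_index(['entity_id', 'as_of_date'])
    # Each (entity_id, as_of_date) pair is a distinct row: 2 entities x 2 dates
    assert len(frame) == 4
    assert frame.loc[(1, daytwo), 'score'] == 0.3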
def test_predictor():
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)
        with mock_s3():
            s3_conn = boto3.resource('s3')
            s3_conn.create_bucket(Bucket='econ-dev')
            project_path = 'econ-dev/inspections'
            model_storage_engine = S3ModelStorageEngine(s3_conn, project_path)
            _, model_id = fake_trained_model(
                project_path, model_storage_engine, db_engine
            )
            predictor = Predictor(project_path, model_storage_engine, db_engine)

            # create prediction set
            matrix = pandas.DataFrame.from_dict({
                'entity_id': [1, 2],
                'feature_one': [3, 4],
                'feature_two': [5, 6],
                'label': [7, 8]
            }).set_index('entity_id')
            metadata = {
                'label_name': 'label',
                'end_time': AS_OF_DATE,
                'label_window': '3month',
                'metta-uuid': '1234',
            }

            matrix_store = InMemoryMatrixStore(matrix, metadata)
            predict_proba = predictor.predict(
                model_id, matrix_store, misc_db_parameters=dict()
            )

            # assert
            # 1. that the returned predictions are of the desired length
            assert len(predict_proba) == 2

            # 2. that the predictions table entries are present and
            # can be linked to the original models
            records = [
                row for row in db_engine.execute(
                    '''select entity_id, as_of_date
                    from results.predictions
                    join results.models using (model_id)'''
                )
            ]
            assert len(records) == 2

            # 3. that the contained as_of_dates match what we sent in
            for record in records:
                assert record[1].date() == AS_OF_DATE

            # 4. that the entity ids match the given dataset
            assert sorted([record[0] for record in records]) == [1, 2]

            # 5. running with the same model_id and a different as-of date,
            # then again with the same as-of date, only replaces the records
            # with the same date
            new_matrix = pandas.DataFrame.from_dict({
                'entity_id': [1, 2],
                'feature_one': [3, 4],
                'feature_two': [5, 6],
                'label': [7, 8]
            }).set_index('entity_id')
            new_metadata = {
                'label_name': 'label',
                'end_time': AS_OF_DATE + datetime.timedelta(days=1),
                'label_window': '3month',
                'metta-uuid': '1234',
            }
            new_matrix_store = InMemoryMatrixStore(new_matrix, new_metadata)
            predictor.predict(model_id, new_matrix_store, misc_db_parameters=dict())
            predictor.predict(model_id, matrix_store, misc_db_parameters=dict())
            records = [
                row for row in db_engine.execute(
                    '''select entity_id, as_of_date
                    from results.predictions
                    join results.models using (model_id)'''
                )
            ]
            assert len(records) == 4

            # 6. that we can delete the model when done predicting with it
            predictor.delete_model(model_id)
            assert predictor.load_model(model_id) is None
def test_evaluating_early_warning():
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)
        testing_metric_groups = [
            {
                "metrics": [
                    "precision@",
                    "recall@",
                    "true positives@",
                    "true negatives@",
                    "false positives@",
                    "false negatives@",
                ],
                "thresholds": {"percentiles": [5.0, 10.0], "top_n": [5, 10]},
            },
            {
                "metrics": [
                    "f1",
                    "mediocre",
                    "accuracy",
                    "roc_auc",
                    "average precision score",
                ]
            },
            {"metrics": ["fbeta@"], "parameters": [{"beta": 0.75}, {"beta": 1.25}]},
        ]
        training_metric_groups = [{"metrics": ["accuracy", "roc_auc"]}]
        custom_metrics = {"mediocre": always_half}
        model_evaluator = ModelEvaluator(
            testing_metric_groups,
            training_metric_groups,
            db_engine,
            custom_metrics=custom_metrics,
        )

        labels = fake_labels(5)
        fake_train_matrix_store = MockMatrixStore(
            "train", "efgh", 5, db_engine, labels
        )
        fake_test_matrix_store = MockMatrixStore(
            "test", "1234", 5, db_engine, labels
        )
        trained_model, model_id = fake_trained_model(db_engine)

        # Evaluate the testing metrics and test for all of them.
        model_evaluator.evaluate(
            trained_model.predict_proba(labels)[:, 1],
            fake_test_matrix_store,
            model_id,
        )
        records = [
            row[0]
            for row in db_engine.execute(
                """select distinct(metric || parameter)
                from test_results.evaluations
                where model_id = %s and
                evaluation_start_time = %s
                order by 1""",
                (model_id, fake_test_matrix_store.as_of_dates[0]),
            )
        ]
        assert records == [
            "accuracy",
            "average precision score",
            "f1",
            "false [email protected]_pct",
            "false negatives@10_abs",
            "false [email protected]_pct",
            "false negatives@5_abs",
            "false [email protected]_pct",
            "false positives@10_abs",
            "false [email protected]_pct",
            "false positives@5_abs",
            "[email protected]_beta",
            "[email protected]_beta",
            "mediocre",
            "[email protected]_pct",
            "precision@10_abs",
            "[email protected]_pct",
            "precision@5_abs",
            "[email protected]_pct",
            "recall@10_abs",
            "[email protected]_pct",
            "recall@5_abs",
            "roc_auc",
            "true [email protected]_pct",
            "true negatives@10_abs",
            "true [email protected]_pct",
            "true negatives@5_abs",
            "true [email protected]_pct",
            "true positives@10_abs",
            "true [email protected]_pct",
            "true positives@5_abs",
        ]

        # Evaluate the training metrics and test
        model_evaluator.evaluate(
            trained_model.predict_proba(labels)[:, 1],
            fake_train_matrix_store,
            model_id,
        )
        records = [
            row[0]
            for row in db_engine.execute(
                """select distinct(metric || parameter)
                from train_results.evaluations
                where model_id = %s and
                evaluation_start_time = %s
                order by 1""",
                (model_id, fake_train_matrix_store.as_of_dates[0]),
            )
        ]
        assert records == ["accuracy", "roc_auc"]
def test_model_scoring_inspections():
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)
        testing_metric_groups = [
            {
                "metrics": ["precision@", "recall@", "fpr@"],
                "thresholds": {"percentiles": [50.0], "top_n": [3]},
            },
            {
                # ensure we test a non-thresholded metric as well
                "metrics": ["accuracy"]
            },
        ]
        training_metric_groups = [
            {"metrics": ["accuracy"], "thresholds": {"percentiles": [50.0]}}
        ]
        model_evaluator = ModelEvaluator(
            testing_metric_groups, training_metric_groups, db_engine
        )

        testing_labels = numpy.array([True, False, numpy.nan, True, False])
        testing_prediction_probas = numpy.array([0.56, 0.4, 0.55, 0.5, 0.3])

        training_labels = numpy.array(
            [False, False, True, True, True, False, True, True]
        )
        training_prediction_probas = numpy.array(
            [0.6, 0.4, 0.55, 0.70, 0.3, 0.2, 0.8, 0.6]
        )

        fake_train_matrix_store = MockMatrixStore(
            "train", "efgh", 5, db_engine, training_labels
        )
        fake_test_matrix_store = MockMatrixStore(
            "test", "1234", 5, db_engine, testing_labels
        )

        trained_model, model_id = fake_trained_model(db_engine)

        # Evaluate testing matrix and test the results
        model_evaluator.evaluate(
            testing_prediction_probas, fake_test_matrix_store, model_id
        )
        for record in db_engine.execute(
            """select * from test_results.evaluations
            where model_id = %s and evaluation_start_time = %s
            order by 1""",
            (model_id, fake_test_matrix_store.as_of_dates[0]),
        ):
            assert record["num_labeled_examples"] == 4
            assert record["num_positive_labels"] == 2
            if record["parameter"] == "":
                assert record["num_labeled_above_threshold"] == 4
            elif "pct" in record["parameter"]:
                assert record["num_labeled_above_threshold"] == 1
            else:
                assert record["num_labeled_above_threshold"] == 2

        # Evaluate the training matrix and test the results
        model_evaluator.evaluate(
            training_prediction_probas, fake_train_matrix_store, model_id
        )
        for record in db_engine.execute(
            """select * from train_results.evaluations
            where model_id = %s and evaluation_start_time = %s
            order by 1""",
            (model_id, fake_train_matrix_store.as_of_dates[0]),
        ):
            assert record["num_labeled_examples"] == 8
            assert record["num_positive_labels"] == 5
            assert record["value"] == 0.625