class Matrices(TestCase):
    """Experiment limited to the config sections listed in ``config``.

    The config holds the temporal, feature aggregation, cohort and label
    sections of the sample config plus its version.
    """

    config = {
        section: sample_config()[section]
        for section in (
            'temporal_config',
            'feature_aggregations',
            'cohort_config',
            'label_config',
            'config_version',
        )
    }

    def test_run(self):
        """Each matrix build task has a .csv and a .yaml file on disk."""
        with prepare_experiment(self.config) as experiment:
            experiment.run()
            matrices_path = experiment.matrices_directory
            # Only plain files count; subdirectories are ignored.
            files_on_disk = [
                entry
                for entry in os.listdir(matrices_path)
                if isfile(join(matrices_path, entry))
            ]
            build_tasks = experiment.matrix_build_tasks
            assert len(build_tasks) > 0
            for matrix_id in build_tasks:
                assert '{}.csv'.format(matrix_id) in files_on_disk
                assert '{}.yaml'.format(matrix_id) in files_on_disk

    def test_validate_nonstrict(self):
        """Non-strict validation completes without raising."""
        with prepare_experiment(self.config) as experiment:
            experiment.validate(strict=False)

    def test_validate_strict(self):
        """Default validation raises ValueError for this partial config."""
        with prepare_experiment(self.config) as experiment, \
                self.assertRaises(ValueError):
            experiment.validate()
class PostimputationFeatures(TestCase):
    """Experiment limited to the config sections listed in ``config``.

    The config holds the temporal, feature aggregation and cohort sections
    of the sample config plus its version.
    """

    config = {
        section: sample_config()[section]
        for section in (
            "temporal_config",
            "feature_aggregations",
            "cohort_config",
            "config_version",
        )
    }

    def test_run(self):
        """One populated ``_aggregation_imputed`` table per aggregation."""
        with prepare_experiment(self.config) as experiment:
            experiment.run()
            all_tables = schema_tables(
                experiment.features_schema_name, experiment.db_engine
            )
            imputed_tables = [
                name
                for name in all_tables.keys()
                if "_aggregation_imputed" in name
            ]
            expected_count = len(sample_config()["feature_aggregations"])
            assert len(imputed_tables) == expected_count
            for name in imputed_tables:
                table_should_have_data(name, experiment.db_engine)

    def test_validate_nonstrict(self):
        """Non-strict validation completes without raising."""
        with prepare_experiment(self.config) as experiment:
            experiment.validate(strict=False)

    def test_validate_strict(self):
        """Default validation raises ValueError for this partial config."""
        with prepare_experiment(self.config) as experiment, \
                self.assertRaises(ValueError):
            experiment.validate()
def test_fill_model_grid_presets():
    """Cover the four grid_config / model_grid_preset combinations."""
    # case 1: has grid, no preset -> the grid comes back unchanged
    grid_only = sample_config()
    assert fill_model_grid_presets(grid_only) == grid_only['grid_config']

    # case 2: has preset, no grid -> the 'quickstart' preset has 3 entries
    preset_only = sample_config()
    del preset_only['grid_config']
    preset_only['model_grid_preset'] = 'quickstart'
    assert len(fill_model_grid_presets(preset_only)) == 3

    # case 3: neither -> nothing to fill
    neither = sample_config()
    del neither['grid_config']
    assert fill_model_grid_presets(neither) is None

    # case 4: both -> rejected with KeyError
    both = sample_config()
    both['model_grid_preset'] = 'quickstart'
    with pytest.raises(KeyError):
        fill_model_grid_presets(both)
def test_restart_experiment(experiment_class):
    """A second run with replace=False must not call make_entity_date_table.

    The first run has to produce linked evaluations; the second experiment
    is built on the same database and project path with ``replace=False``
    and its ``make_entity_date_table`` is swapped for a mock.
    """
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        populate_source_data(db_engine)
        with TemporaryDirectory() as temp_dir:
            project_path = os.path.join(temp_dir, "inspections")

            def build_experiment(**extra):
                # A fresh sample config is loaded for every experiment.
                return experiment_class(
                    config=sample_config(),
                    db_engine=db_engine,
                    project_path=project_path,
                    cleanup=True,
                    **extra
                )

            experiment = build_experiment()
            experiment.run()
            assert num_linked_evaluations(db_engine) > 0

            experiment = build_experiment(replace=False)
            experiment.make_entity_date_table = mock.Mock()
            experiment.run()
            assert not experiment.make_entity_date_table.called
class PreimputationFeatures(TestCase):
    """Experiment limited to the config sections listed in ``config``.

    The config holds the temporal and feature aggregation sections of the
    sample config plus its version.
    """

    config = {
        section: sample_config()[section]
        for section in (
            'temporal_config',
            'feature_aggregations',
            'config_version',
        )
    }

    def test_run(self):
        """One populated ``_aggregation`` table exists per aggregation."""
        with prepare_experiment(self.config) as experiment:
            experiment.run()
            all_tables = schema_tables(
                experiment.features_schema_name, experiment.db_engine
            )
            aggregation_tables = [
                name for name in all_tables.keys() if '_aggregation' in name
            ]
            expected_count = len(sample_config()['feature_aggregations'])
            assert len(aggregation_tables) == expected_count
            for name in aggregation_tables:
                table_should_have_data(name, experiment.db_engine)

    def test_validate_nonstrict(self):
        """Non-strict validation completes without raising."""
        with prepare_experiment(self.config) as experiment:
            experiment.validate(strict=False)

    def test_validate_strict(self):
        """Default validation raises ValueError for this partial config."""
        with prepare_experiment(self.config) as experiment, \
                self.assertRaises(ValueError):
            experiment.validate()
def test_filepaths_and_queries_give_same_hashes(experiment_class):
    """Query-based and filepath-based configs yield the same identifiers.

    Two experiments are built, one from ``sample_config(query_source="query")``
    and one from ``sample_config(query_source="filepath")``; their experiment
    hash, cohort table name and labels table name must match.
    """
    with testing.postgresql.Postgresql() as postgresql:
        with TemporaryDirectory() as temp_dir:
            with mock.patch("triage.util.conf.open",
                            side_effect=open_side_effect):
                db_engine = create_engine(postgresql.url())
                populate_source_data(db_engine)

                query_config = sample_config(query_source="query")
                file_config = sample_config(query_source="filepath")

                shared_kwargs = dict(
                    db_engine=db_engine,
                    project_path=os.path.join(temp_dir, "inspections"),
                    cleanup=True,
                )
                from_queries = experiment_class(
                    config=query_config, **shared_kwargs)
                from_filepaths = experiment_class(
                    config=file_config, **shared_kwargs)

                for attribute in (
                    "experiment_hash",
                    "cohort_table_name",
                    "labels_table_name",
                ):
                    assert (getattr(from_queries, attribute)
                            == getattr(from_filepaths, attribute))
def test_fill_model_grid_presets():
    """Cover the four grid_config / model_grid_preset combinations."""
    # case 1: has grid, no preset -> the grid comes back unchanged
    grid_only = sample_config()
    assert fill_model_grid_presets(grid_only) == grid_only['grid_config']

    # case 2: has preset, no grid -> the 'quickstart' preset has 3 entries
    preset_only = sample_config()
    del preset_only['grid_config']
    preset_only['model_grid_preset'] = 'quickstart'
    assert len(fill_model_grid_presets(preset_only)) == 3

    # case 3: neither -> nothing to fill
    neither = sample_config()
    del neither['grid_config']
    assert fill_model_grid_presets(neither) is None

    # case 4: both -> a grid is still returned; check its size and the
    # decision tree entry's parameter lists
    both = sample_config()
    both['model_grid_preset'] = 'quickstart'
    merged = fill_model_grid_presets(both)
    assert len(merged) == 3
    tree_params = merged.get('sklearn.tree.DecisionTreeClassifier', {})
    assert len(tree_params.get('max_depth', [])) == 3
    assert len(tree_params.get('criterion', [])) == 1
class Matrices(TestCase):
    """Experiment limited to the config sections listed in ``config``.

    The config holds the temporal, feature aggregation, cohort and label
    sections of the sample config plus its version.
    """

    config = {
        section: sample_config()[section]
        for section in (
            "temporal_config",
            "feature_aggregations",
            "cohort_config",
            "label_config",
            "config_version",
        )
    }

    def test_run(self):
        """Each matrix build task has a .csv and a .yaml file on disk."""
        with prepare_experiment(self.config) as experiment:
            experiment.run()
            matrices_path = join(experiment.project_path, "matrices")
            # Only plain files count; subdirectories are ignored.
            files_on_disk = [
                entry
                for entry in os.listdir(matrices_path)
                if isfile(join(matrices_path, entry))
            ]
            build_tasks = experiment.matrix_build_tasks
            assert len(build_tasks) > 0
            for matrix_id in build_tasks:
                assert "{}.csv".format(matrix_id) in files_on_disk
                assert "{}.yaml".format(matrix_id) in files_on_disk

    def test_validate_nonstrict(self):
        """Non-strict validation completes without raising."""
        with prepare_experiment(self.config) as experiment:
            experiment.validate(strict=False)

    def test_validate_strict(self):
        """Default validation raises ValueError for this partial config."""
        with prepare_experiment(self.config) as experiment, \
                self.assertRaises(ValueError):
            experiment.validate()
def test_experiment_validator():
    """The validator runs on both the "query" and "filepath" sample configs."""
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)
        populate_source_data(db_engine)
        with mock.patch("triage.util.conf.open",
                        side_effect=open_side_effect):
            # A new validator is created for each config variant.
            for query_source in ("query", "filepath"):
                ExperimentValidator(db_engine).run(sample_config(query_source))
def test_experiment_tracker(test_engine, project_path):
    """Check the ExperimentRun row before and after a multicore run.

    After construction the row must be in the ``started`` state with all
    counters at zero; after ``run()`` it must be ``completed`` with the
    matrix and model counters matching the experiment's own task lists.
    """
    experiment = MultiCoreExperiment(
        config=sample_config(),
        db_engine=test_engine,
        project_path=project_path,
        n_processes=4,
    )

    def fetch_run():
        # A new session per lookup, so the second read does not reuse the first.
        return Session(bind=test_engine).query(ExperimentRun).get(experiment.run_id)

    tracked = fetch_run()
    assert tracked.current_status == ExperimentRunStatus.started
    assert tracked.experiment_hash == experiment.experiment_hash
    assert tracked.experiment_class_path == 'triage.experiments.multicore.MultiCoreExperiment'
    for populated_field in ('platform', 'os_user', 'installed_libraries'):
        assert getattr(tracked, populated_field)
    for counter in (
        'matrices_skipped',
        'matrices_errored',
        'matrices_made',
        'models_skipped',
        'models_errored',
        'models_made',
    ):
        assert getattr(tracked, counter) == 0

    experiment.run()

    tracked = fetch_run()
    assert tracked.start_method == "run"
    assert tracked.matrices_made == len(experiment.matrix_build_tasks)
    for counter in (
        'matrices_skipped',
        'matrices_errored',
        'models_skipped',
        'models_errored',
    ):
        assert getattr(tracked, counter) == 0
    model_hashes = [
        task['train_kwargs']['model_hash']
        for batch in experiment._all_train_test_batches()
        for task in batch.tasks
    ]
    assert tracked.models_made == len(model_hashes)
    for timestamp_field in (
        'matrix_building_started',
        'model_building_started',
        'last_updated_time',
    ):
        assert isinstance(getattr(tracked, timestamp_field), datetime.datetime)
    assert not tracked.stacktrace
    assert tracked.current_status == ExperimentRunStatus.completed
def test_build_error_cleanup_timeout(_clean_up_mock, experiment_class):
    """A build error followed by a cleanup timeout keeps both exceptions.

    ``generate_matrices`` is patched to raise RuntimeError and the cleanup
    timeout is set very short; calling the experiment must raise
    TimeoutError with the RuntimeError as its ``__context__``.

    ``_clean_up_mock`` is not used in the body; presumably it is injected
    by a patch decorator outside this view.
    """
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        with TemporaryDirectory() as temp_dir:
            experiment = experiment_class(
                config=sample_config(),
                db_engine=db_engine,
                project_path=os.path.join(temp_dir, "inspections"),
                cleanup=True,
                cleanup_timeout=0.02,  # Set short timeout
                # avoid catching the missing data at validation stage
                skip_validation=True,
            )
            build_patch = mock.patch.object(experiment, "generate_matrices")
            with build_patch as build_mock:
                build_mock.side_effect = RuntimeError("boom!")
                with pytest.raises(TimeoutError) as exc_info:
                    experiment()

    # Last exception is TimeoutError, but earlier error is preserved in
    # __context__, and will be noted as well in any standard traceback:
    assert exc_info.value.__context__ is build_mock.side_effect
def test_build_error_cleanup_timeout(_clean_up_mock, experiment_class):
    """A build error followed by a cleanup timeout keeps both exceptions.

    ``generate_matrices`` is patched to raise RuntimeError and the cleanup
    timeout is set very short; calling the experiment must raise
    TimeoutError with the RuntimeError as its ``__context__``.

    ``_clean_up_mock`` is not used in the body; presumably it is injected
    by a patch decorator outside this view.
    """
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)
        with TemporaryDirectory() as temp_dir:
            experiment = experiment_class(
                config=sample_config(),
                db_engine=db_engine,
                model_storage_class=FSModelStorageEngine,
                project_path=os.path.join(temp_dir, 'inspections'),
                cleanup=True,
                cleanup_timeout=0.02,  # Set short timeout
            )
            build_patch = mock.patch.object(experiment, 'generate_matrices')
            with build_patch as build_mock:
                build_mock.side_effect = RuntimeError('boom!')
                with pytest.raises(TimeoutError) as exc_info:
                    experiment()

    # Last exception is TimeoutError, but earlier error is preserved in
    # __context__, and will be noted as well in any standard traceback:
    assert exc_info.value.__context__ is build_mock.side_effect
def test_fill_timechop_config_missing():
    """Missing temporal settings are filled in; redundant ones are rejected.

    First, adding ``label_timespans`` on top of a complete temporal config
    must raise KeyError. Then, with the listed keys removed and only
    ``label_timespans`` given, the filled config must contain the expected
    values for every removed key and no ``label_timespans`` key.
    """
    remove_keys = [
        'model_update_frequency',
        'training_as_of_date_frequencies',
        'test_as_of_date_frequencies',
        'max_training_histories',
        'test_durations',
        'feature_start_time',
        'feature_end_time',
        'label_start_time',
        'label_end_time',
        'training_label_timespans',
        'test_label_timespans',
    ]

    # ensure redundant keys properly raise errors
    config = sample_config()
    config['temporal_config']['label_timespans'] = '1y'
    with pytest.raises(KeyError):
        fill_timechop_config_missing(config, None)

    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)
        populate_source_data(db_engine)

        config = sample_config()
        temporal = config['temporal_config']
        for key in remove_keys:
            temporal.pop(key)
        temporal['label_timespans'] = '1y'

        timechop_config = fill_timechop_config_missing(config, db_engine)

        expected_settings = {
            'model_update_frequency': '100y',
            'training_as_of_date_frequencies': '100y',
            'test_as_of_date_frequencies': '100y',
            'max_training_histories': '0d',
            'test_durations': '0d',
            'training_label_timespans': '1y',
            'test_label_timespans': '1y',
        }
        for key, expected in expected_settings.items():
            assert timechop_config[key] == expected

        assert 'label_timespans' not in timechop_config.keys()

        expected_dates = {
            'feature_start_time': '2010-10-01',
            'feature_end_time': '2013-10-01',
            'label_start_time': '2010-10-01',
            'label_end_time': '2013-10-01',
        }
        for key, expected in expected_dates.items():
            assert timechop_config[key] == expected
def test_profiling(db_engine):
    """A run with profile=True leaves one entry in ``profiling_stats``."""
    populate_source_data(db_engine)
    with TemporaryDirectory() as temp_dir:
        project_path = os.path.join(temp_dir, "inspections")
        SingleThreadedExperiment(
            config=sample_config(),
            db_engine=db_engine,
            project_path=project_path,
            profile=True,
        ).run()
        stats_dir = os.path.join(project_path, "profiling_stats")
        stats_entries = os.listdir(stats_dir)
        assert len(stats_entries) == 1
def test_noload_if_wrong_version(self):
    """Constructing an experiment with config_version "v0" raises ValueError."""
    stale_config = sample_config()
    stale_config["config_version"] = "v0"
    with TemporaryDirectory() as temp_dir, self.assertRaises(ValueError):
        SingleThreadedExperiment(
            config=stale_config,
            db_engine=None,
            project_path=os.path.join(temp_dir, "inspections"),
        )
class GetSplits(TestCase):
    """Experiment limited to the temporal section and the config version."""

    config = {
        section: sample_config()[section]
        for section in ("temporal_config", "config_version")
    }

    def test_run(self):
        """Running the experiment yields non-empty split definitions."""
        with prepare_experiment(self.config) as experiment:
            experiment.run()
            assert experiment.split_definitions

    def test_validate_nonstrict(self):
        """Non-strict validation completes without raising."""
        with prepare_experiment(self.config) as experiment:
            experiment.validate(strict=False)

    def test_validate_strict(self):
        """Default validation raises ValueError for this partial config."""
        with prepare_experiment(self.config) as experiment, \
                self.assertRaises(ValueError):
            experiment.validate()
class Cohort(TestCase):
    """Experiment limited to the temporal and cohort sections plus version."""

    config = {
        section: sample_config()[section]
        for section in ('temporal_config', 'cohort_config', 'config_version')
    }

    def test_run(self):
        """Running the experiment populates the sparse states table."""
        with prepare_experiment(self.config) as experiment:
            experiment.run()
            states_table = experiment.sparse_states_table_name
            table_should_have_data(states_table, experiment.db_engine)

    def test_validate_nonstrict(self):
        """Non-strict validation completes without raising."""
        with prepare_experiment(self.config) as experiment:
            experiment.validate(strict=False)

    def test_validate_strict(self):
        """Default validation raises ValueError for this partial config."""
        with prepare_experiment(self.config) as experiment, \
                self.assertRaises(ValueError):
            experiment.validate()
class Labels(TestCase):
    """Experiment limited to the temporal and label sections plus version."""

    config = {
        section: sample_config()[section]
        for section in ("temporal_config", "label_config", "config_version")
    }

    def test_run(self):
        """Running the experiment populates the labels table."""
        with prepare_experiment(self.config) as experiment:
            experiment.run()
            labels_table = experiment.labels_table_name
            table_should_have_data(labels_table, experiment.db_engine)

    def test_validate_nonstrict(self):
        """Non-strict validation completes without raising."""
        with prepare_experiment(self.config) as experiment:
            experiment.validate(strict=False)

    def test_validate_strict(self):
        """Default validation raises ValueError for this partial config."""
        with prepare_experiment(self.config) as experiment, \
                self.assertRaises(ValueError):
            experiment.validate()
def test_noload_if_wrong_version(self):
    """Constructing an experiment with config_version 'v0' raises ValueError."""
    stale_config = sample_config()
    stale_config['config_version'] = 'v0'
    with TemporaryDirectory() as temp_dir, self.assertRaises(ValueError):
        SingleThreadedExperiment(
            config=stale_config,
            db_engine=None,
            model_storage_class=FSModelStorageEngine,
            project_path=os.path.join(temp_dir, 'inspections'),
        )
def test_experiment_tracker_in_parts(test_engine, project_path):
    """Running stage by stage records the first stage as the start method.

    ``generate_matrices`` is called before ``train_and_test_models``, and
    the stored ExperimentRun must report "generate_matrices" as its
    ``start_method``.
    """
    experiment = SingleThreadedExperiment(
        config=sample_config(),
        db_engine=test_engine,
        project_path=project_path,
    )
    for stage in (experiment.generate_matrices,
                  experiment.train_and_test_models):
        stage()

    with scoped_session(test_engine) as session:
        tracked_run = session.query(ExperimentRun).get(experiment.run_id)
        assert tracked_run.start_method == "generate_matrices"
def test_serializable_engine_check_sqlalchemy_fail():
    """A plain sqlalchemy engine makes MultiCoreExperiment raise TypeError."""
    with testing.postgresql.Postgresql() as postgresql:
        # Deliberately use sqlalchemy's own create_engine here.
        vanilla_engine = sqlalchemy.create_engine(postgresql.url())
        with TemporaryDirectory() as temp_dir, pytest.raises(TypeError):
            MultiCoreExperiment(
                config=sample_config(),
                db_engine=vanilla_engine,
                project_path=os.path.join(temp_dir, "inspections"),
            )
def test_noload_if_wrong_version(self):
    """Constructing an experiment with config_version "v0" raises ValueError.

    ``triage.util.conf.open`` is patched with ``open_side_effect`` for the
    duration of the construction attempt.
    """
    stale_config = sample_config()
    stale_config["config_version"] = "v0"
    with TemporaryDirectory() as temp_dir:
        with mock.patch("triage.util.conf.open",
                        side_effect=open_side_effect):
            with self.assertRaises(ValueError):
                SingleThreadedExperiment(
                    config=stale_config,
                    db_engine=None,
                    project_path=os.path.join(temp_dir, "inspections"),
                )
def test_run(self):
    """One populated ``_aggregation_imputed`` table exists per aggregation."""
    with prepare_experiment(self.config) as experiment:
        experiment.run()
        all_tables = schema_tables(
            experiment.features_schema_name, experiment.db_engine
        )
        imputed_tables = [
            name
            for name in all_tables.keys()
            if '_aggregation_imputed' in name
        ]
        expected_count = len(sample_config()['feature_aggregations'])
        assert len(imputed_tables) == expected_count
        for name in imputed_tables:
            table_should_have_data(name, experiment.db_engine)
def test_fill_cohort_config_missing():
    """Without a cohort_config, the filled value is the 'all_entities' query.

    The expected query selects distinct entity ids from the union of the
    sample config's feature sources, filtered by ``{as_of_date}``.
    """
    config = sample_config()
    del config['cohort_config']

    expected_query = (
        "select distinct entity_id from "
        "((select entity_id, as_of_date as knowledge_date from "
        "(select * from cat_complaints) as t)\n union \n(select entity_id, "
        "as_of_date as knowledge_date from (select * from entity_zip_codes "
        "join zip_code_events using (zip_code)) as t)) as e "
        "where knowledge_date < '{as_of_date}'"
    )
    expected_cohort_config = {
        'query': expected_query,
        'name': 'all_entities',
    }

    assert fill_cohort_config_missing(config) == expected_cohort_config
def test_custom_label_name(experiment_class):
    """A label name set in the config reaches the generator and planner."""
    custom_name = "custom_label_name"
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        config = sample_config()
        config["label_config"]["name"] = custom_name
        with TemporaryDirectory() as temp_dir:
            experiment = experiment_class(
                config=config,
                db_engine=db_engine,
                project_path=os.path.join(temp_dir, "inspections"),
            )
            assert experiment.label_generator.label_name == custom_name
            assert experiment.planner.label_names == [custom_name]
def test_load_if_right_version(self):
    """With config_version set to CONFIG_VERSION the experiment is built."""
    current_config = sample_config()
    current_config["config_version"] = CONFIG_VERSION
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        with TemporaryDirectory() as temp_dir:
            project_path = os.path.join(temp_dir, "inspections")
            experiment = SingleThreadedExperiment(
                config=current_config,
                db_engine=db_engine,
                project_path=project_path,
            )
            assert isinstance(experiment, SingleThreadedExperiment)
def test_experiment_tracker_in_parts(test_engine, project_path):
    """Running stage by stage records the first stage as the start method.

    ``generate_matrices`` is called before ``train_and_test_models``, and
    the stored TriageRun must report "generate_matrices" as its
    ``start_method``.
    """
    # NOTE(review): the whole test is kept inside the patched-open block;
    # confirm this matches the intended extent of the patch.
    with mock.patch("triage.util.conf.open", side_effect=open_side_effect):
        experiment = SingleThreadedExperiment(
            config=sample_config(),
            db_engine=test_engine,
            project_path=project_path,
        )
        for stage in (experiment.generate_matrices,
                      experiment.train_and_test_models):
            stage()

        with scoped_session(test_engine) as session:
            tracked_run = session.query(TriageRun).get(experiment.run_id)
            assert tracked_run.start_method == "generate_matrices"
def test_cleanup_timeout(_clean_up_mock, experiment_class):
    """Calling the experiment with a very short cleanup timeout raises.

    ``_clean_up_mock`` is not used in the body; presumably it is injected
    by a patch decorator outside this view.
    """
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        populate_source_data(db_engine)
        with TemporaryDirectory() as temp_dir:
            project_path = os.path.join(temp_dir, "inspections")
            experiment = experiment_class(
                config=sample_config(),
                db_engine=db_engine,
                project_path=project_path,
                cleanup=True,
                cleanup_timeout=0.02,  # Set short timeout
            )
            with pytest.raises(TimeoutError):
                experiment()
def test_custom_label_name(experiment_class):
    """A label name set in the config reaches the generator and planner."""
    custom_name = 'custom_label_name'
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        config = sample_config()
        config['label_config']['name'] = custom_name
        with TemporaryDirectory() as temp_dir:
            experiment = experiment_class(
                config=config,
                db_engine=db_engine,
                model_storage_class=FSModelStorageEngine,
                project_path=os.path.join(temp_dir, 'inspections'),
            )
            assert experiment.label_generator.label_name == custom_name
            assert experiment.planner.label_names == [custom_name]
def finished_experiment(shared_db_engine, shared_project_storage):
    """Populate source data, run a sample experiment, and return it.

    The experiment is built from the unmodified sample config against the
    shared database engine and the shared project storage's path, so its
    database schemas and project storage can be queried afterwards.

    Returns: (triage.experiments.SingleThreadedExperiment)
    """
    populate_source_data(shared_db_engine)
    completed = SingleThreadedExperiment(
        sample_config(),
        db_engine=shared_db_engine,
        project_path=shared_project_storage.project_path,
    )
    completed.run()
    return completed