def triage(ctx, config_file, triage_db, replace, debug):
    """Load an experiment config and attach a ready experiment to ``ctx.obj``.

    Args:
        ctx: Context object; the constructed experiment is stored on
            ``ctx.obj``.
        config_file: File name of the experiment config, resolved under
            ``/triage/experiments``.
        triage_db: Database URL passed to ``create_engine``.
        replace: Forwarded to the experiment as ``replace``. Per the message
            printed below, a false value means preexisting matrices and
            models are reused.
        debug: When true, the root logger is configured at DEBUG level.
    """
    config_file = os.path.join(os.sep, "triage", "experiments", config_file)
    click.echo(f"Using the config file {config_file}")

    with open(config_file) as f:
        # safe_load instead of bare yaml.load(f): without an explicit Loader
        # the call can build arbitrary Python objects on old PyYAML releases
        # and raises TypeError on PyYAML 6+.
        experiments = yaml.safe_load(f)

    click.echo(
        "The output (matrices and models) of this experiment will be stored in triage/output"
    )
    click.echo(f"Using data stored in {triage_db}")
    click.echo(
        f"The experiment will utilize any preexisting matrix or model: {not replace}"
    )
    click.echo("Creating experiment object")

    experiment = SingleThreadedExperiment(
        config=experiments,
        db_engine=create_engine(triage_db),
        project_path='/triage/output',
        cleanup=True,
        replace=replace,
    )
    ctx.obj = experiment

    if debug:
        logging.basicConfig(level=logging.DEBUG)
        click.echo("Debug enabled (Expect A LOT of output at the screen!!!)")

    click.echo("Experiment loaded")
def test_query_to_df():
    """Check that ``query_to_df`` returns one row per record of each feature table.

    Every entry of ``features_tables`` is loaded by ``create_schemas``; the
    test selects everything from ``features.features<N>`` and compares the
    row count with the length of the N-th fixture table.
    """
    with testing.postgresql.Postgresql() as postgresql:
        # create an engine and generate a table with fake feature data
        engine = create_engine(postgresql.url())
        create_schemas(
            engine=engine,
            features_tables=features_tables,
            labels=labels,
            states=states,
        )

        with get_matrix_storage_engine() as matrix_storage_engine:
            builder = MatrixBuilder(
                db_config=db_config,
                matrix_storage_engine=matrix_storage_engine,
                experiment_hash=experiment_hash,
                engine=engine,
            )

            # enumerate gives the true position of each table; list.index()
            # would return the first *equal* table, so two identical fixture
            # tables would make the test query the same table twice.
            for table_number, table in enumerate(features_tables):
                df = builder.query_to_df(
                    "select * from features.features{}".format(table_number)
                )
                assert len(df) == len(table)
def test_test_matrix(self):
    """Build a matrix of type "test" and check the stored design matrix has 5 rows."""
    with testing.postgresql.Postgresql() as pg_server:
        # throwaway database populated with the fake feature/label/state data
        db = create_engine(pg_server.url())
        ensure_db(db)
        create_schemas(
            engine=db,
            features_tables=features_tables,
            labels=labels,
            states=states,
        )

        with get_matrix_storage_engine() as storage:
            matrix_builder = MatrixBuilder(
                db_config=db_config,
                matrix_storage_engine=storage,
                experiment_hash=experiment_hash,
                engine=db,
            )

            matrix_uuid = filename_friendly_hash(self.good_metadata)
            build_kwargs = {
                "as_of_times": self.good_dates,
                "label_name": "booking",
                "label_type": "binary",
                "feature_dictionary": self.good_feature_dictionary,
                "matrix_metadata": self.good_metadata,
                "matrix_uuid": matrix_uuid,
                "matrix_type": "test",
            }
            matrix_builder.build_matrix(**build_kwargs)

            stored = storage.get_store(matrix_uuid)
            assert len(stored.design_matrix) == 5
def test_build_error_cleanup_timeout(_clean_up_mock, experiment_class):
    """A failing build plus a very short cleanup timeout ends in ``TimeoutError``.

    ``_clean_up_mock`` is unused here (presumably injected by a patch
    decorator outside this view). ``experiment_class`` is the experiment
    class under test.
    """
    with testing.postgresql.Postgresql() as pg_server:
        engine = create_engine(pg_server.url())

        with TemporaryDirectory() as workdir:
            experiment_kwargs = dict(
                config=sample_config(),
                db_engine=engine,
                project_path=os.path.join(workdir, "inspections"),
                cleanup=True,
                cleanup_timeout=0.02,  # Set short timeout
                skip_validation=True,  # avoid catching the missing data at validation stage
            )
            experiment = experiment_class(**experiment_kwargs)

            build_failure = RuntimeError("boom!")
            with mock.patch.object(experiment, "generate_matrices") as patched_build:
                patched_build.side_effect = build_failure

                with pytest.raises(TimeoutError) as raised:
                    experiment()

    # The TimeoutError is raised last, but the original build error must
    # still be reachable through __context__ (and thus shown in tracebacks).
    assert raised.value.__context__ is build_failure
def setup_experiment(experiment_config_file):
    """Read the experiment config, connect to the database and prepare the output folder.

    Relies on a ``project_path`` name defined outside this function to locate
    ``config/joco_db_profile.yaml``.

    Args:
        experiment_config_file: Path handed to ``read_config_file``.

    Returns:
        A tuple ``(experiment_config, sql_engine, project_folder)``.
    """
    experiment_config = read_config_file(experiment_config_file)

    cred_folder = os.path.join(project_path, 'config')
    cred_file = os.path.join(cred_folder, 'joco_db_profile.yaml')
    print(cred_file)
    db = read_config_file(cred_file)

    sql_engine = create_engine(
        'postgresql+psycopg2://%s:%s@%s:%i/%s' % (
            db['user'],
            db['pass'],
            db['host'],
            db['port'],
            db['db'],
        )
    )

    data_folder = '/mnt/data/experiment_data/'
    project_folder = os.path.join(data_folder, 'joco', 'decoupled_nonwhite')

    print("=" * 10)
    print(project_folder)
    print("=" * 10)

    # makedirs also creates missing parent folders (os.mkdir raised when
    # '<data_folder>/joco' did not exist yet), and exist_ok avoids the race
    # between an exists() check and the creation.
    os.makedirs(project_folder, exist_ok=True)

    return experiment_config, sql_engine, project_folder
def test_restart_experiment(experiment_class):
    """Run an experiment twice; the second run (``replace=False``) must not call ``make_entity_date_table``."""
    with testing.postgresql.Postgresql() as pg_server:
        engine = create_engine(pg_server.url())
        populate_source_data(engine)

        with TemporaryDirectory() as workdir:

            def new_experiment(**extra):
                # both runs share everything except the optional extras
                return experiment_class(
                    config=sample_config(),
                    db_engine=engine,
                    project_path=os.path.join(workdir, "inspections"),
                    cleanup=True,
                    **extra
                )

            exp = new_experiment()
            exp.run()

            linked = num_linked_evaluations(engine)
            assert linked > 0

            exp = new_experiment(replace=False)
            exp.make_entity_date_table = mock.Mock()
            exp.run()
            assert not exp.make_entity_date_table.called
def test_build_error_cleanup_timeout(_clean_up_mock, experiment_class):
    """A build error followed by a timed-out cleanup raises ``TimeoutError``.

    The original ``RuntimeError`` must remain attached as ``__context__``.
    ``_clean_up_mock`` is not used in the body (presumably supplied by a
    patch decorator outside this view).
    """
    with testing.postgresql.Postgresql() as pg_server:
        engine = create_engine(pg_server.url())
        ensure_db(engine)

        with TemporaryDirectory() as workdir:
            experiment = experiment_class(
                config=sample_config(),
                db_engine=engine,
                model_storage_class=FSModelStorageEngine,
                project_path=os.path.join(workdir, 'inspections'),
                cleanup=True,
                cleanup_timeout=0.02,  # Set short timeout
            )

            original_error = RuntimeError('boom!')
            with mock.patch.object(experiment, 'generate_matrices') as patched_build:
                patched_build.side_effect = original_error

                with pytest.raises(TimeoutError) as caught:
                    experiment()

    # TimeoutError comes last; the earlier failure is preserved in
    # __context__ and therefore appears in any standard traceback.
    assert caught.value.__context__ is original_error
def test_filepaths_and_queries_give_same_hashes(experiment_class):
    """Configs built with ``query_source="query"`` and ``"filepath"`` must agree.

    The two experiments are compared on experiment hash, cohort table name
    and labels table name.
    """
    with testing.postgresql.Postgresql() as pg_server:
        with TemporaryDirectory() as workdir:
            with mock.patch("triage.util.conf.open", side_effect=open_side_effect):
                engine = create_engine(pg_server.url())
                populate_source_data(engine)

                # build both configs first, then both experiments, in that order
                configs = [
                    sample_config(query_source=source)
                    for source in ("query", "filepath")
                ]
                from_queries, from_filepaths = [
                    experiment_class(
                        config=cfg,
                        db_engine=engine,
                        project_path=os.path.join(workdir, "inspections"),
                        cleanup=True,
                    )
                    for cfg in configs
                ]

                for attribute in (
                    "experiment_hash",
                    "cohort_table_name",
                    "labels_table_name",
                ):
                    assert getattr(from_queries, attribute) == getattr(
                        from_filepaths, attribute
                    )
def setup_experiment(experiment_config_file, use_s3, s3_path):
    """Read the experiment config, connect to the database and pick the project folder.

    Credentials are read from ``conf/local/credentials_2.yaml`` (relative to
    the working directory); the ``db`` entry of that file supplies the
    connection settings.

    Args:
        experiment_config_file: Path handed to ``read_config_file``.
        use_s3: When this compares equal to ``False`` a local folder under
            ``/mnt/data/experiment_data`` is used (and created if needed);
            otherwise ``s3_path`` is used.
        s3_path: Project folder to use when ``use_s3`` is set.

    Returns:
        A tuple ``(experiment_config, sql_engine, project_folder)``.
    """
    project_path = ''
    experiment_config = read_config_file(experiment_config_file)

    cred_folder = os.path.join(project_path, 'conf', 'local')
    cred_file = os.path.join(cred_folder, 'credentials_2.yaml')
    configs = read_config_file(cred_file)
    db = configs['db']

    sql_engine = create_engine(
        'postgresql+psycopg2://%s:%s@%s:%i/%s' %
        (db['user'], db['pass'], db['host'], db['port'], db['db']))

    # The equality test is kept on purpose: values such as None still select
    # the s3 branch exactly as before.
    if use_s3 == False:  # noqa: E712
        data_folder = '/mnt/data/experiment_data'
        project_folder = os.path.join(data_folder, 'triage', 'elsal')
        # makedirs creates missing parents too (os.mkdir raised when
        # '<data_folder>/triage' was absent) and tolerates an existing folder.
        os.makedirs(project_folder, exist_ok=True)
    else:
        project_folder = s3_path

    print(project_folder)
    return experiment_config, sql_engine, project_folder
def test_nullcheck(self):
    """``build_matrix`` must raise ``ValueError`` for the ``features*_pre`` fixture data."""

    def dedupe_and_sort(rows):
        # keep the last row for each (r[0], r[1]) pair, ordered by (r[1], r[0])
        unique = {}
        for row in rows:
            unique[(row[0], row[1])] = row
        return sorted(unique.values(), key=lambda row: (row[1], row[0]))

    features_tables = [
        dedupe_and_sort(features0_pre),
        dedupe_and_sort(features1_pre),
    ]

    with testing.postgresql.Postgresql() as pg_server:
        # throwaway database populated with the fake feature data
        db = create_engine(pg_server.url())
        create_schemas(
            engine=db,
            features_tables=features_tables,
            labels=labels,
            states=states,
        )

        as_of_dates = [
            datetime.datetime(2016, month, 1, 0, 0) for month in (1, 2, 3)
        ]

        with get_matrix_storage_engine() as storage:
            matrix_builder = MatrixBuilder(
                db_config=db_config,
                matrix_storage_engine=storage,
                experiment_hash=experiment_hash,
                engine=db,
            )

            features_by_table = {
                "features0": ["f1", "f2"],
                "features1": ["f3", "f4"],
            }
            metadata = dict(
                matrix_id="hi",
                state="active",
                label_name="booking",
                end_time=datetime.datetime(2016, 3, 1, 0, 0),
                feature_start_time=datetime.datetime(2016, 1, 1, 0, 0),
                label_timespan="1 month",
                test_duration="1 month",
                indices=["entity_id", "as_of_date"],
            )
            matrix_uuid = filename_friendly_hash(metadata)

            with self.assertRaises(ValueError):
                matrix_builder.build_matrix(
                    as_of_times=as_of_dates,
                    label_name="booking",
                    label_type="binary",
                    feature_dictionary=features_by_table,
                    matrix_metadata=metadata,
                    matrix_uuid=matrix_uuid,
                    matrix_type="test",
                )
def test_make_entity_date_table():
    """Check that ``make_entity_date_table`` produces the expected entity/date rows.

    The table written by the builder is read back and compared cell by cell
    with the frame from ``create_entity_date_df``.
    """
    as_of_dates = [
        datetime.datetime(2016, month, 1, 0, 0) for month in (1, 2, 3)
    ]

    # reference frame of entity ids and dates
    expected = create_entity_date_df(
        labels=labels,
        states=states,
        as_of_dates=as_of_dates,
        label_name="booking",
        label_type="binary",
        label_timespan="1 month",
    )

    with testing.postgresql.Postgresql() as pg_server:
        # throwaway database populated with the fake feature data
        db = create_engine(pg_server.url())
        create_schemas(
            engine=db,
            features_tables=features_tables,
            labels=labels,
            states=states,
        )

        with get_matrix_storage_engine() as storage:
            matrix_builder = MatrixBuilder(
                db_config=db_config,
                matrix_storage_engine=storage,
                experiment_hash=experiment_hash,
                engine=db,
            )
            db.execute(
                "CREATE TABLE features.tmp_entity_date (a int, b date);")

            # function under test: creates the entity/date table
            table_name = matrix_builder.make_entity_date_table(
                as_of_times=as_of_dates,
                label_type="binary",
                label_name="booking",
                state="active",
                matrix_uuid="my_uuid",
                matrix_type="train",
                label_timespan="1 month",
            )

            # read the table back
            query = "select * from features.{} order by entity_id, as_of_date".format(
                table_name)
            actual = pd.read_sql(query, db)

            # every cell must match the reference frame
            assert (actual == expected).all().all()
def shared_db_engine():
    """Yield an engine connected to a temporary "test" PostgreSQL database.

    pytest fixture provider: the database is set up before the ``yield`` and
    torn down afterwards. The engine is disposed once the consumer is done
    with it.

    Yields:
        The engine created from the temporary database URL.
    """
    with testing.postgresql.Postgresql() as postgresql:
        engine = create_engine(postgresql.url())
        try:
            yield engine
        finally:
            # Dispose even when an exception is thrown into the generator at
            # the yield, so pooled connections are closed before the
            # temporary server shuts down.
            engine.dispose()
def connect(cred_folder):
    """Return an engine for the database described in ``san_jose_db.yaml``.

    Args:
        cred_folder: Folder that contains the credentials file.
    """
    # Earlier revisions pointed at 'donors_db_profile.yaml' or
    # 'elsal_db_profile2.yaml' instead.
    profile = read_config_file(os.path.join(cred_folder, 'san_jose_db.yaml'))

    url_parts = tuple(
        profile[key] for key in ('user', 'pass', 'host', 'port', 'db'))
    db_url = 'postgresql+psycopg2://%s:%s@%s:%i/%s' % url_parts

    return create_engine(db_url)
def test_replace_true_rerun(self):
    """With ``replace=True``, building the same matrix twice succeeds both times.

    After each build the stored design matrix must have 5 rows and a
    ``Matrix`` record must exist for the uuid.
    """
    with testing.postgresql.Postgresql() as pg_server:
        # throwaway database populated with the fake feature data
        db = create_engine(pg_server.url())
        ensure_db(db)
        create_schemas(
            engine=db,
            features_tables=features_tables,
            labels=labels,
            states=states,
        )

        metadata = matrix_metadata_creator(
            state="active", test_duration="1month", label_name="booking")
        as_of_dates = [
            datetime.datetime(2016, month, 1, 0, 0) for month in (1, 2, 3)
        ]
        features_by_table = {
            "features0": ["f1", "f2"],
            "features1": ["f3", "f4"],
        }
        matrix_uuid = filename_friendly_hash(metadata)
        build_kwargs = {
            "as_of_times": as_of_dates,
            "label_name": "booking",
            "label_type": "binary",
            "feature_dictionary": features_by_table,
            "matrix_metadata": metadata,
            "matrix_uuid": matrix_uuid,
            "matrix_type": "test",
        }

        with get_matrix_storage_engine() as storage:
            matrix_builder = MatrixBuilder(
                db_config=db_config,
                matrix_storage_engine=storage,
                experiment_hash=experiment_hash,
                engine=db,
                replace=True,
            )

            # first pass is the initial build, second pass is the rerun
            for _attempt in range(2):
                matrix_builder.build_matrix(**build_kwargs)
                assert len(storage.get_store(matrix_uuid).design_matrix) == 5
                assert matrix_builder.sessionmaker().query(Matrix).get(matrix_uuid)
def prepare_experiment(config):
    """Yield a ``SingleThreadedExperiment`` backed by a temporary database and folder.

    Args:
        config: Experiment configuration passed straight to the experiment.

    Yields:
        The experiment, created with ``cleanup=False``.
    """
    with testing.postgresql.Postgresql() as pg_server:
        engine = create_engine(pg_server.url())
        populate_source_data(engine)

        with TemporaryDirectory() as workdir:
            experiment_kwargs = dict(
                config=config,
                db_engine=engine,
                project_path=os.path.join(workdir, 'inspections'),
                cleanup=False,
            )
            exp = SingleThreadedExperiment(**experiment_kwargs)
            yield exp
def test_load_if_right_version(self):
    """An experiment loads when the config carries the current ``CONFIG_VERSION``."""
    cfg = sample_config()
    cfg["config_version"] = CONFIG_VERSION

    with testing.postgresql.Postgresql() as pg_server:
        engine = create_engine(pg_server.url())

        with TemporaryDirectory() as workdir:
            project_dir = os.path.join(workdir, "inspections")
            loaded = SingleThreadedExperiment(
                config=cfg,
                db_engine=engine,
                project_path=project_dir,
            )

    assert isinstance(loaded, SingleThreadedExperiment)
def test_custom_label_name(experiment_class):
    """A label name set in ``label_config`` reaches the label generator and the planner."""
    expected_name = "custom_label_name"

    with testing.postgresql.Postgresql() as pg_server:
        engine = create_engine(pg_server.url())

        cfg = sample_config()
        cfg["label_config"]["name"] = expected_name

        with TemporaryDirectory() as workdir:
            exp = experiment_class(
                config=cfg,
                db_engine=engine,
                project_path=os.path.join(workdir, "inspections"),
            )

            assert exp.label_generator.label_name == expected_name
            assert exp.planner.label_names == [expected_name]
def test_custom_label_name(experiment_class):
    """The configured label name is used by both the label generator and the planner.

    This variant passes ``FSModelStorageEngine`` as ``model_storage_class``.
    """
    wanted = 'custom_label_name'

    with testing.postgresql.Postgresql() as pg_server:
        engine = create_engine(pg_server.url())

        cfg = sample_config()
        cfg['label_config']['name'] = wanted

        with TemporaryDirectory() as workdir:
            experiment_kwargs = dict(
                config=cfg,
                db_engine=engine,
                model_storage_class=FSModelStorageEngine,
                project_path=os.path.join(workdir, 'inspections'),
            )
            exp = experiment_class(**experiment_kwargs)

            assert exp.label_generator.label_name == wanted
            assert exp.planner.label_names == [wanted]
def test_validate_default(experiment_class):
    """``run()`` must call ``validate`` exactly once by default."""
    with testing.postgresql.Postgresql() as pg_server:
        engine = create_engine(pg_server.url())
        populate_source_data(engine)

        with TemporaryDirectory() as workdir:
            exp = experiment_class(
                config=sample_config(),
                db_engine=engine,
                project_path=os.path.join(workdir, "inspections"),
                cleanup=True,
            )

            # swap validate for a mock so the call can be counted
            validate_spy = mock.MagicMock()
            exp.validate = validate_spy

            exp.run()
            exp.validate.assert_called_once()
def test_load_if_right_version(self):
    """An experiment loads when the config carries the current ``CONFIG_VERSION``.

    ``triage.util.conf.open`` is patched with ``open_side_effect`` for the
    duration of the test.
    """
    cfg = sample_config()
    cfg["config_version"] = CONFIG_VERSION

    with testing.postgresql.Postgresql() as pg_server:
        with TemporaryDirectory() as workdir:
            with mock.patch("triage.util.conf.open", side_effect=open_side_effect):
                engine = create_engine(pg_server.url())
                loaded = SingleThreadedExperiment(
                    config=cfg,
                    db_engine=engine,
                    project_path=os.path.join(workdir, "inspections"),
                )

    assert isinstance(loaded, SingleThreadedExperiment)
def test_custom_label_name(experiment_class):
    """A label name set in ``label_config`` reaches the label generator and the planner.

    ``triage.util.conf.open`` is patched with ``open_side_effect`` while the
    experiment is built and checked.
    """
    expected_name = "custom_label_name"

    with testing.postgresql.Postgresql() as pg_server:
        with TemporaryDirectory() as workdir:
            with mock.patch("triage.util.conf.open", side_effect=open_side_effect):
                engine = create_engine(pg_server.url())

                cfg = sample_config()
                cfg["label_config"]["name"] = expected_name

                exp = experiment_class(
                    config=cfg,
                    db_engine=engine,
                    project_path=os.path.join(workdir, "inspections"),
                )

                assert exp.label_generator.label_name == expected_name
                assert exp.planner.label_names == [expected_name]
def test_cleanup_timeout(_clean_up_mock, experiment_class):
    """Calling the experiment with a very short ``cleanup_timeout`` raises ``TimeoutError``.

    ``_clean_up_mock`` is unused in the body (presumably injected by a patch
    decorator outside this view).
    """
    with testing.postgresql.Postgresql() as pg_server:
        engine = create_engine(pg_server.url())
        populate_source_data(engine)

        with TemporaryDirectory() as workdir:
            experiment_kwargs = dict(
                config=sample_config(),
                db_engine=engine,
                project_path=os.path.join(workdir, "inspections"),
                cleanup=True,
                cleanup_timeout=0.02,  # Set short timeout
            )
            exp = experiment_class(**experiment_kwargs)

            with pytest.raises(TimeoutError):
                exp()
def connect(cred_folder):
    """Return an engine for the database described in ``joco_db_profile.yaml``.

    Args:
        cred_folder: Folder that contains the credentials file.
    """
    profile_path = os.path.join(cred_folder, 'joco_db_profile.yaml')
    profile = read_config_file(profile_path)

    # user, password, host, port and database name, in URL order
    url_parts = (
        profile['user'],
        profile['pass'],
        profile['host'],
        profile['port'],
        profile['db'],
    )
    return create_engine('postgresql+psycopg2://%s:%s@%s:%i/%s' % url_parts)
def run(config_filename, verbose, replace, predictions, validate_only):
    """Configure logging, assemble the experiment config and validate/run it.

    The main config is read from ``config/<config_filename>.yaml``; every
    file in ``config/features/`` contributes entries to the config's
    ``feature_aggregations`` list; database settings come from
    ``config/db_default_profile.json``. The project path is taken from a
    ``PROJECT_PATH`` name defined outside this function.

    Args:
        config_filename: Base name (without ``.yaml``) of the experiment config.
        verbose: When true, log at DEBUG level, otherwise INFO.
        replace: Forwarded to the experiment as ``replace``.
        predictions: Forwarded to the experiment as ``save_predictions``.
        validate_only: When true, stop after ``experiment.validate()``.
    """
    # configure logging; create the log folder first because FileHandler
    # does not create missing directories
    os.makedirs('logs', exist_ok=True)
    log_filename = 'logs/modeling_{}'.format(
        str(datetime.datetime.now()).replace(' ', '_').replace(':', ''))
    logging_level = logging.DEBUG if verbose else logging.INFO
    logging.basicConfig(
        format='%(asctime)s %(process)d %(levelname)s: %(message)s',
        level=logging_level,
        handlers=[logging.FileHandler(log_filename),
                  logging.StreamHandler()])

    features_directory = 'features'

    # load main experiment config
    with open('config/{}.yaml'.format(config_filename)) as f:
        # safe_load: bare yaml.load(f) has no Loader, which is unsafe on old
        # PyYAML releases and a TypeError on PyYAML 6+
        experiment_config = yaml.safe_load(f)

    # load feature configs and update experiment config with their contents
    all_feature_aggregations = []
    # os.listdir returns entries in arbitrary order; sorting keeps the
    # resulting config identical from run to run
    for filename in sorted(os.listdir('config/{}/'.format(features_directory))):
        with open('config/{}/{}'.format(features_directory, filename)) as f:
            feature_aggregations = yaml.safe_load(f)
        # an empty YAML file loads as None; treat it as "no aggregations"
        all_feature_aggregations.extend(feature_aggregations or [])
    experiment_config['feature_aggregations'] = all_feature_aggregations

    with open('config/db_default_profile.json') as f:
        db_profile = json.load(f)

    db_engine = create_engine(
        f"postgresql://{db_profile['user']}:{db_profile['pass']}@{db_profile['host']}/{db_profile['db']}"
    )

    experiment = MultiCoreExperiment(
        config=experiment_config,
        db_engine=db_engine,
        project_path=PROJECT_PATH,
        replace=replace,
        n_db_processes=4,
        n_processes=40,
        save_predictions=predictions,
    )

    experiment.validate()
    if not validate_only:
        experiment.run()
def test_validate_default(experiment_class):
    """``run()`` must call ``validate`` exactly once by default.

    ``triage.util.conf.open`` is patched with ``open_side_effect`` for the
    duration of the test.
    """
    with testing.postgresql.Postgresql() as pg_server:
        with TemporaryDirectory() as workdir:
            with mock.patch("triage.util.conf.open", side_effect=open_side_effect):
                engine = create_engine(pg_server.url())
                populate_source_data(engine)

                exp = experiment_class(
                    config=sample_config(),
                    db_engine=engine,
                    project_path=os.path.join(workdir, "inspections"),
                    cleanup=True,
                )

                # swap validate for a mock so the call can be counted
                exp.validate = mock.MagicMock()
                exp.run()
                exp.validate.assert_called_once()
def test_load_if_right_version(self):
    """An experiment loads when the config carries the current ``CONFIG_VERSION``.

    This variant prepares the database with ``ensure_db`` and passes
    ``FSModelStorageEngine`` as ``model_storage_class``.
    """
    cfg = sample_config()
    cfg['config_version'] = CONFIG_VERSION

    with testing.postgresql.Postgresql() as pg_server:
        engine = create_engine(pg_server.url())
        ensure_db(engine)

        with TemporaryDirectory() as workdir:
            experiment_kwargs = dict(
                config=cfg,
                db_engine=engine,
                model_storage_class=FSModelStorageEngine,
                project_path=os.path.join(workdir, 'inspections'),
            )
            loaded = SingleThreadedExperiment(**experiment_kwargs)

    assert isinstance(loaded, SingleThreadedExperiment)
def main():
    """Validate and run a single-threaded experiment from a YAML config.

    The database URL is read from the ``DBURL`` environment variable; the
    config path and project path come from ``parse_args()``.

    Raises:
        KeyError: if ``DBURL`` is not set in the environment.
    """
    args = parse_args()

    dburl = os.environ['DBURL']
    hiv_engine = create_engine(dburl, pool_pre_ping=True)

    with open(args.config_path) as f:
        # safe_load: bare yaml.load(f) has no Loader, which is unsafe on old
        # PyYAML releases and a TypeError on PyYAML 6+
        experiment_config = yaml.safe_load(f)

    experiment = SingleThreadedExperiment(
        config=experiment_config,
        db_engine=hiv_engine,
        project_path=args.project_path,
        replace=False,
    )

    experiment.validate()
    experiment.run()
def prepare_experiment(config):
    """Yield a partial-run ``SingleThreadedExperiment`` on a temporary database and folder.

    ``triage.util.conf.open`` is patched with ``open_side_effect`` while the
    experiment is built and while the consumer uses it.

    Args:
        config: Experiment configuration passed straight to the experiment.

    Yields:
        The experiment, created with ``cleanup=False`` and ``partial_run=True``.
    """
    with testing.postgresql.Postgresql() as pg_server:
        engine = create_engine(pg_server.url())
        populate_source_data(engine)

        with TemporaryDirectory() as workdir:
            open_patch = mock.patch(
                "triage.util.conf.open", side_effect=open_side_effect
            )
            with open_patch:
                experiment_kwargs = dict(
                    config=config,
                    db_engine=engine,
                    project_path=os.path.join(workdir, "inspections"),
                    cleanup=False,
                    partial_run=True,
                )
                exp = SingleThreadedExperiment(**experiment_kwargs)
                yield exp
def test_build_error(experiment_class):
    """A ``RuntimeError`` raised by ``generate_matrices`` propagates out of the experiment call."""
    with testing.postgresql.Postgresql() as pg_server:
        engine = create_engine(pg_server.url())

        with TemporaryDirectory() as workdir:
            experiment_kwargs = dict(
                config=sample_config(),
                db_engine=engine,
                project_path=os.path.join(workdir, "inspections"),
                cleanup=True,
            )
            exp = experiment_class(**experiment_kwargs)

            with mock.patch.object(exp, "generate_matrices") as patched_build:
                patched_build.side_effect = RuntimeError("boom!")

                with pytest.raises(RuntimeError):
                    exp()
def upgrade_if_clean(dburl):
    """Upgrade the database only if the results schema hasn't been created yet.

    When no ``results_schema_versions`` table exists, the database is
    upgraded via ``upgrade_db`` and the function returns. Otherwise the
    revision stored in that table is compared with the revisions known to
    this code and with the code's current head.

    Args:
        dburl: Database URL used for the alembic config, the engine and the
            upgrade.

    Raises:
        ValueError: if the database results schema version does not equal
            the code's version (either unknown to this code, or known but
            not the current head).
    """
    alembic_cfg = alembic_config(dburl)
    engine = create_engine(dburl)
    try:
        script_ = script.ScriptDirectory.from_config(alembic_cfg)
        if not table_exists('results_schema_versions', engine):
            logger.info(
                "No results_schema_versions table exists, which means that this installation "
                "is fresh. Upgrading db.")
            upgrade_db(dburl=dburl)
            return

        with engine.begin() as conn:
            current_revision = conn.execute(
                'select version_num from results_schema_versions limit 1'
            ).scalar()
    finally:
        # the engine is only needed for the checks above; release its
        # pooled connections instead of leaving them to garbage collection
        engine.dispose()

    logger.debug("Database's triage_metadata schema version is %s",
                 current_revision)
    triage_head = script_.get_current_head()
    logger.debug("Code's triage_metadata schema version is %s", triage_head)

    # a revision the code has never heard of means the database was
    # upgraded by some other (newer or divergent) version of the code
    database_is_ahead = not any(
        migration.revision == current_revision
        for migration in script_.walk_revisions()
    )
    if database_is_ahead:
        raise ValueError(
            f"Your database's results schema version, {current_revision}, is not a known "
            "revision to this version of Triage. Usually, this happens if you use a branch "
            "with a new results schema version and upgrade the database to that version. "
            "To use this version of Triage, you will likely need to check out that branch "
            f"and downgrade to {triage_head}",
        )
    elif current_revision != triage_head:
        raise ValueError(
            f"Your database's results schema revision, {current_revision}, is out of date "
            "for this version of Triage. However, your database can be upgraded to this "
            "revision. If you would like to upgrade your database from the console, and "
            "you've installed Triage, you may execute `triage db upgrade`. "
            "If the `triage` command is unavailable, (because you are running Triage directly "
            "from a repository checkout), then `manage alembic upgrade head`. "
            "The database changes may take a long time on a heavily populated database. "
            "Otherwise, you can also downgrade your Triage version to match your database."
        )