def get_data_from_ds(ds: DataSet, start: Optional[int] = None, end: Optional[int] = None) -> Dict[str, List[List]]: """ Returns a dictionary in the format {'name' : data}, where data is what dataset.get_data('name') returns, i.e., a list of lists, where the inner list is the row as inserted into the DB. with `start` and `end` only a subset of rows in the DB can be specified. """ names = [n for n, v in ds.paramspecs.items()] return {n: np.squeeze(ds.get_data(n, start=start, end=end)) for n in names}
def test_basic_extraction(two_empty_temp_db_connections, some_paramspecs): source_conn, target_conn = two_empty_temp_db_connections source_path = path_to_dbfile(source_conn) target_path = path_to_dbfile(target_conn) type_casters = {'numeric': float, 'array': (lambda x: np.array(x) if hasattr(x, '__iter__') else np.array([x])), 'text': str} source_exp = Experiment(conn=source_conn) source_dataset = DataSet(conn=source_conn, name="basic_copy_paste_name") with pytest.raises(RuntimeError) as excinfo: extract_runs_into_db(source_path, target_path, source_dataset.run_id) assert error_caused_by(excinfo, ('Dataset not completed. An incomplete ' 'dataset can not be copied. The ' 'incomplete dataset has GUID: ' f'{source_dataset.guid} and run_id: ' f'{source_dataset.run_id}')) for ps in some_paramspecs[1].values(): source_dataset.add_parameter(ps) for value in range(10): result = {ps.name: type_casters[ps.type](value) for ps in some_paramspecs[1].values()} source_dataset.add_result(result) source_dataset.add_metadata('goodness', 'fair') source_dataset.add_metadata('test', True) source_dataset.mark_complete() extract_runs_into_db(source_path, target_path, source_dataset.run_id) target_exp = Experiment(conn=target_conn, exp_id=1) length1 = len(target_exp) assert length1 == 1 # trying to insert the same run again should be a NOOP with raise_if_file_changed(target_path): extract_runs_into_db(source_path, target_path, source_dataset.run_id) assert len(target_exp) == length1 target_dataset = DataSet(conn=target_conn, run_id=1) # Now make the interesting comparisons: are the target objects the same as # the source objects? assert source_dataset.the_same_dataset_as(target_dataset) source_data = source_dataset.get_data(*source_dataset.parameters.split(',')) target_data = target_dataset.get_data(*target_dataset.parameters.split(',')) assert source_data == target_data exp_attrs = ['name', 'sample_name', 'format_string', 'started_at', 'finished_at'] for exp_attr in exp_attrs: assert getattr(source_exp, exp_attr) == getattr(target_exp, exp_attr) # trying to insert the same run again should be a NOOP with raise_if_file_changed(target_path): extract_runs_into_db(source_path, target_path, source_dataset.run_id)
def test_correct_experiment_routing(two_empty_temp_db_connections, some_paramspecs): """ Test that existing experiments are correctly identified AND that multiple insertions of the same runs don't matter (run insertion is idempotent) """ source_conn, target_conn = two_empty_temp_db_connections source_exp_1 = Experiment(conn=source_conn) # make 5 runs in first experiment exp_1_run_ids = [] for _ in range(5): source_dataset = DataSet(conn=source_conn, exp_id=source_exp_1.exp_id) exp_1_run_ids.append(source_dataset.run_id) for ps in some_paramspecs[2].values(): source_dataset.add_parameter(ps) for val in range(10): source_dataset.add_result({ps.name: val for ps in some_paramspecs[2].values()}) source_dataset.mark_complete() # make a new experiment with 1 run source_exp_2 = Experiment(conn=source_conn) ds = DataSet(conn=source_conn, exp_id=source_exp_2.exp_id, name="lala") exp_2_run_ids = [ds.run_id] for ps in some_paramspecs[2].values(): ds.add_parameter(ps) for val in range(10): ds.add_result({ps.name: val for ps in some_paramspecs[2].values()}) ds.mark_complete() source_path = path_to_dbfile(source_conn) target_path = path_to_dbfile(target_conn) # now copy 2 runs extract_runs_into_db(source_path, target_path, *exp_1_run_ids[:2]) target_exp1 = Experiment(conn=target_conn, exp_id=1) assert len(target_exp1) == 2 # copy two other runs, one of them already in extract_runs_into_db(source_path, target_path, *exp_1_run_ids[1:3]) assert len(target_exp1) == 3 # insert run from different experiment extract_runs_into_db(source_path, target_path, ds.run_id) assert len(target_exp1) == 3 target_exp2 = Experiment(conn=target_conn, exp_id=2) assert len(target_exp2) == 1 # finally insert every single run from experiment 1 extract_runs_into_db(source_path, target_path, *exp_1_run_ids) # check for idempotency once more by inserting all the runs but in another # order with raise_if_file_changed(target_path): extract_runs_into_db(source_path, target_path, *exp_1_run_ids[::-1]) target_exps = get_experiments(target_conn) assert len(target_exps) == 2 assert len(target_exp1) == 5 assert len(target_exp2) == 1 # check that all the datasets match up for run_id in exp_1_run_ids + exp_2_run_ids: source_ds = DataSet(conn=source_conn, run_id=run_id) target_ds = load_by_guid(guid=source_ds.guid, conn=target_conn) assert source_ds.the_same_dataset_as(target_ds) source_data = source_ds.get_data(*source_ds.parameters.split(',')) target_data = target_ds.get_data(*target_ds.parameters.split(',')) assert source_data == target_data