def test_dataset_validation(dataset_dir, transformer_cfg_dataset, records, batch_size, as_list):
    # Fit training dataset first so the transformer params are saved to
    # artifact_dir; the validation dataset loads them instead of fitting
    RecordDataset(
        artifact_dir=dataset_dir,
        cfg_dataset=transformer_cfg_dataset,
        records=_to_list(records, as_list),
        mode=RecordMode.TRAIN,
        batch_size=batch_size,
    )
    ds_val = RecordDataset(
        artifact_dir=dataset_dir,
        cfg_dataset=transformer_cfg_dataset,
        records=_to_list(records, as_list),
        mode=RecordMode.VALIDATION,
        batch_size=batch_size,
    )
    for ind in range(len(ds_val)):
        result = ds_val[ind]

        # Compute expected: mean-centered inputs and transformed outputs
        sample_inds = ds_val.sample_order[ind * batch_size:(ind + 1) * batch_size]
        batch_records = records.iloc[sample_inds]
        mX = np.array([49.5, 2])
        X = batch_records[["x1", "x2"]].values.reshape(len(sample_inds), 2)
        X -= mX
        y1 = batch_records["y1"].values.reshape(len(sample_inds), 1)
        y2 = batch_records["y2"].values.reshape(len(sample_inds), 1) % 5 - 2
        expected = ({"input_1": X}, {"output_1": y1, "output_2": y2})

        _assert_batch_equal(result, expected)

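# The helpers _to_list and _assert_batch_equal are referenced throughout these
# tests but not defined in this section. The sketches below are assumptions
# inferred from how the tests call them, not the module's actual definitions.


def _to_list(records, as_list):
    # Assumed behavior: optionally pass records as a list of dicts instead of
    # a DataFrame, so each test exercises both accepted input types
    return records.to_dict(orient="records") if as_list else records


def _assert_batch_equal(result, expected):
    # Assumed behavior: compare batch tuples of dict-of-ndarray structures
    # element-wise, since plain == is ambiguous for numpy arrays
    assert len(result) == len(expected)
    for res_part, exp_part in zip(result, expected):
        assert res_part.keys() == exp_part.keys()
        for key in exp_part:
            np.testing.assert_array_equal(res_part[key], exp_part[key])
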
def test_dataset_score(dataset_dir, transformer_cfg_dataset, records, batch_size, as_list):
    # Fit training dataset
    RecordDataset(
        artifact_dir=dataset_dir,
        cfg_dataset=transformer_cfg_dataset,
        records=_to_list(records, as_list),
        mode=RecordMode.TRAIN,
        batch_size=batch_size,
    )
    ds_score = RecordDataset(
        artifact_dir=dataset_dir,
        cfg_dataset=transformer_cfg_dataset,
        records=_to_list(records, as_list),
        mode=RecordMode.SCORE,
        batch_size=batch_size,
    )
    for ind in range(len(ds_score)):
        result = ds_score[ind]

        # Compute expected
        sample_inds = ds_score.sample_order[ind * batch_size:(ind + 1) * batch_size]
        batch_records = records.iloc[sample_inds]
        mX = np.array([49.5, 2])
        X = batch_records[["x1", "x2"]].values.reshape(len(sample_inds), 2)
        X -= mX
        expected = ({"input_1": X},)

        _assert_batch_equal(result, expected)

def test_dataset_init_transformers(dataset_dir, transformer_cfg_dataset, records, mode, sample_count):
    def _assert_dict_array_equal(d1, d2):
        assert len(d1) == len(d2)
        for k in d1.keys():
            np.testing.assert_array_equal(d1[k], d2[k])

    transformer_cfg_dataset["sample_count"] = sample_count
    batch_size = 32

    # Training dataset fits the transformer and saves its params
    ds_train = RecordDataset(
        artifact_dir=dataset_dir,
        cfg_dataset=transformer_cfg_dataset,
        records=records,
        mode=RecordMode.TRAIN,
        batch_size=batch_size,
    )
    ref_obj = {"mean_input": np.array([49.5, 2]), "mean_output": np.array([2])}
    ref_network_params = {"num_inputs": 1, "num_outputs": 2}
    _assert_dict_array_equal(ds_train.transformer.obj, ref_obj)
    assert ds_train.transformer.network_params == ref_network_params

    # Non-train dataset loads the saved params rather than refitting
    ds_non_train = RecordDataset(
        artifact_dir=dataset_dir,
        cfg_dataset=transformer_cfg_dataset,
        records=records,
        mode=mode,
        batch_size=batch_size,
    )
    _assert_dict_array_equal(ds_non_train.transformer.obj, ref_obj)

def test_dataset_train(dataset_dir, transformer_cfg_dataset, records, batch_size, sample_count, as_list):
    transformer_cfg_dataset["sample_count"] = sample_count
    ds_train = RecordDataset(
        artifact_dir=dataset_dir,
        cfg_dataset=transformer_cfg_dataset,
        records=_to_list(records, as_list),
        mode=RecordMode.TRAIN,
        batch_size=batch_size,
    )
    for ind in range(len(ds_train)):
        result = ds_train[ind]

        # Compute expected: unlike validation/score, train batches also pass
        # through the augmentor (X += 3, y2 *= 5) after the fitted transform
        sample_inds = ds_train.sample_order[ind * batch_size:(ind + 1) * batch_size]
        batch_records = records.iloc[sample_inds]
        mX = np.array([49.5, 2])
        X = batch_records[["x1", "x2"]].values.reshape(len(sample_inds), 2)
        X -= mX
        X += 3
        y1 = batch_records["y1"].values.reshape(len(sample_inds), 1)
        y2 = batch_records["y2"].values.reshape(len(sample_inds), 1) % 5 - 2
        y2 *= 5
        expected = ({"input_1": X}, {"output_1": y1, "output_2": y2})

        _assert_batch_equal(result, expected)

def test_dataset_sample_inds(dataset_dir, base_cfg_dataset, records, mode, sample_count):
    base_cfg_dataset["sample_count"] = sample_count
    batch_size = 32
    ds = RecordDataset(
        artifact_dir=dataset_dir,
        cfg_dataset=base_cfg_dataset,
        records=records,
        mode=mode,
        batch_size=batch_size,
    )
    if mode == RecordMode.TRAIN and sample_count is not None:
        assert ds.sample_inds == RecordDataset.convert_sample_count_to_inds(
            records[sample_count]
        )
    else:
        assert ds.sample_inds == list(range(len(records)))

def test_dataset_non_train_before_train(dataset_dir, transformer_cfg_dataset, records, mode):
    # Non-train modes load transformer params saved by a prior TRAIN fit;
    # without one, construction should fail
    batch_size = 32
    with pytest.raises(FileNotFoundError):
        RecordDataset(
            artifact_dir=dataset_dir,
            cfg_dataset=transformer_cfg_dataset,
            records=records,
            mode=mode,
            batch_size=batch_size,
        )

def test_dataset_sample_order(dataset_dir, base_cfg_dataset, records, mode, sample_count):
    base_cfg_dataset["sample_count"] = sample_count
    # Pin the seed so the shuffled orders below are deterministic and the
    # inequality assertions cannot flake
    base_cfg_dataset["seed"] = 13
    batch_size = 32
    ds = RecordDataset(
        artifact_dir=dataset_dir,
        cfg_dataset=base_cfg_dataset,
        records=records,
        mode=mode,
        batch_size=batch_size,
    )
    if mode in (RecordMode.TRAIN, RecordMode.VALIDATION):
        # Shuffling modes: sample_order is a permutation of sample_inds that
        # changes on shuffle() and on_epoch_end()
        assert ds.sample_order != ds.sample_inds
        assert sorted(ds.sample_order) == ds.sample_inds
        sample_order_1 = ds.sample_order
        ds.shuffle()
        sample_order_2 = ds.sample_order
        assert sample_order_1 != sample_order_2
        assert sorted(sample_order_1) == sorted(sample_order_2)
        ds.on_epoch_end()
        sample_order_3 = ds.sample_order
        assert sample_order_2 != sample_order_3
        assert sorted(sample_order_2) == sorted(sample_order_3)
    else:
        # Non-shuffling modes: sample_order is fixed and unaffected by
        # shuffle() and on_epoch_end()
        assert ds.sample_order == ds.sample_inds
        sample_order_1 = ds.sample_order
        ds.shuffle()
        sample_order_2 = ds.sample_order
        assert sample_order_1 == sample_order_2
        ds.on_epoch_end()
        sample_order_3 = ds.sample_order
        assert sample_order_2 == sample_order_3

def test_dataset_length(dataset_dir, base_cfg_dataset, records, mode, sample_count, batch_size):
    base_cfg_dataset["sample_count"] = sample_count
    ds = RecordDataset(
        artifact_dir=dataset_dir,
        cfg_dataset=base_cfg_dataset,
        records=records,
        mode=mode,
        batch_size=batch_size,
    )
    if mode == RecordMode.TRAIN and sample_count is not None:
        num_samples = records[sample_count].sum()
        assert len(ds) == int(np.ceil(num_samples / float(batch_size)))
    else:
        assert len(ds) == int(np.ceil(len(records) / float(batch_size)))

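# Worked example of the length formula (illustrative numbers, not a fixture
# from this file): with 100 records and batch_size=32, the final partial
# batch still counts, so len(ds) == ceil(100 / 32) == 4.
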
def test_dataset_init_basics(dataset_dir, base_cfg_dataset, records, mode):
    # Exploit the fact that the identity transformer does not save params
    batch_size = 32
    ds = RecordDataset(
        artifact_dir=dataset_dir,
        cfg_dataset=base_cfg_dataset,
        records=records,
        mode=mode,
        batch_size=batch_size,
    )
    assert ds.num_records == len(records)
    assert ds.records == records.to_dict(orient="records")
    assert ds.mode == mode
    assert ds.batch_size == batch_size
    assert hasattr(ds, "loader")
    assert hasattr(ds, "transformer")
    if mode == RecordMode.TRAIN:
        assert hasattr(ds, "augmentor")
    else:
        assert not hasattr(ds, "augmentor")
    assert ds.transformer.network_params == {}

def test_convert_sample_count_to_inds(s, result):
    assert RecordDataset.convert_sample_count_to_inds(s) == result
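
# Note (an inference from the tests above, not stated in this section):
# convert_sample_count_to_inds appears to expand per-record sample counts
# into repeated record indices, e.g. pd.Series([2, 1, 3]) -> [0, 0, 1, 2, 2, 2].
# That matches test_dataset_sample_inds and makes len(sample_inds) equal to
# records[sample_count].sum(), which test_dataset_length relies on.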