def test_resume_training(optimizer, generated_data, tmp_path):
    input_features, output_features = get_feature_configs()

    config = {
        "input_features": input_features,
        "output_features": output_features,
        "combiner": {"type": "concat"},
        "training": {
            "epochs": 2,
            "early_stop": 1000,
            "batch_size": 16,
            "optimizer": {"type": optimizer},
        },
    }

    # create sub-directory to store results
    results_dir = tmp_path / "results"
    results_dir.mkdir()

    _, _, _, _, output_dir1 = experiment_cli(
        config,
        training_set=generated_data.train_df,
        validation_set=generated_data.validation_df,
        test_set=generated_data.test_df,
    )

    config["training"]["epochs"] = 4

    experiment_cli(
        config,
        training_set=generated_data.train_df,
        validation_set=generated_data.validation_df,
        test_set=generated_data.test_df,
        model_resume_path=output_dir1,
    )

    _, _, _, _, output_dir2 = experiment_cli(
        config,
        training_set=generated_data.train_df,
        validation_set=generated_data.validation_df,
        test_set=generated_data.test_df,
    )

    # compare learning curves with and without resuming
    ts1 = load_json(os.path.join(output_dir1, "training_statistics.json"))
    ts2 = load_json(os.path.join(output_dir2, "training_statistics.json"))
    print("ts1", ts1)
    print("ts2", ts2)
    assert ts1["training"]["combined"]["loss"] == ts2["training"]["combined"]["loss"]

    # compare predictions with and without resuming
    y_pred1 = np.load(os.path.join(output_dir1, "y_predictions.npy"))
    y_pred2 = np.load(os.path.join(output_dir2, "y_predictions.npy"))
    print("y_pred1", y_pred1)
    print("y_pred2", y_pred2)
    assert np.all(np.isclose(y_pred1, y_pred2))

def test_experiment_model_resume(csv_filename):
    # Single sequence input, single category output
    # Tests saving a model file, loading it to rerun training and predict
    input_features = [sequence_feature(encoder='rnn', reduce_output='sum')]
    output_features = [category_feature(vocab_size=2, reduce_input='sum')]

    # Generate test data
    rel_path = generate_data(input_features, output_features, csv_filename)

    config = {
        'input_features': input_features,
        'output_features': output_features,
        'combiner': {
            'type': 'concat',
            'fc_size': 14
        },
        'training': {'epochs': 2}
    }

    _, _, _, _, output_dir = experiment_cli(config, dataset=rel_path)
    logger.info('Experiment Directory: {0}'.format(output_dir))

    experiment_cli(config, dataset=rel_path, model_resume_path=output_dir)

    predict_cli(os.path.join(output_dir, 'model'), dataset=rel_path)
    shutil.rmtree(output_dir, ignore_errors=True)

def test_sequence_generator(
    enc_encoder,
    enc_cell_type,
    dec_cell_type,
    dec_attention,
    dec_beam_width,
    dec_num_layers,
    loss_sampler,
    generate_deterministic_sequence,
):
    # Define input and output features
    input_features = [
        {
            "name": "in_seq",
            "type": "sequence",
            "encoder": enc_encoder,
            "cell_type": enc_cell_type,
            "reduce_output": None,
        }
    ]
    output_features = [
        {
            "name": "out_seq",
            "type": "sequence",
            "cell_type": dec_cell_type,
            "num_layers": dec_num_layers,
            "beam_width": dec_beam_width,
            "decoder": "generator",
            "attention": dec_attention,
            "reduce_input": None,
            "loss": {
                "type": "sampled_softmax_cross_entropy",
                "negative_samples": 10,
                "sampler": loss_sampler,
            },
        }
    ]
    model_definition = {
        "input_features": input_features,
        "output_features": output_features,
        "combiner": {"type": "concat", "fc_size": 14},
        "training": {
            "epochs": 2,
            "early_stop": 5,
            "batch_size": 80,
            "optimizer": {"type": "adam"},
            "learning_rate": 0.001,
        },
    }
    args = {
        "config": model_definition,
        "skip_save_processed_input": True,
        "skip_save_progress": True,
        "skip_save_unprocessed_output": True,
        "skip_save_model": True,
        "skip_save_log": True,
        "debug": False,
    }

    # Generate test data
    np.random.seed(42)
    df = generate_deterministic_sequence

    # run the experiment
    experiment_cli(dataset=df, **args)

def test_resume_training(optimizer, generated_data, tmp_path):
    input_features, output_features = get_feature_configs()

    config = {
        'input_features': input_features,
        'output_features': output_features,
        'combiner': {'type': 'concat'},
        'training': {
            'epochs': 2,
            'early_stop': 1000,
            'batch_size': 16,
            'optimizer': {'type': optimizer}
        }
    }

    # create sub-directory to store results
    results_dir = tmp_path / 'results'
    results_dir.mkdir()

    _, _, _, _, output_dir1 = experiment_cli(
        config,
        training_set=generated_data.train_df,
        validation_set=generated_data.validation_df,
        test_set=generated_data.test_df,
    )

    config['training']['epochs'] = 4

    experiment_cli(
        config,
        training_set=generated_data.train_df,
        validation_set=generated_data.validation_df,
        test_set=generated_data.test_df,
        model_resume_path=output_dir1,
    )

    _, _, _, _, output_dir2 = experiment_cli(
        config,
        training_set=generated_data.train_df,
        validation_set=generated_data.validation_df,
        test_set=generated_data.test_df,
    )

    # compare learning curves with and without resuming
    ts1 = load_json(os.path.join(output_dir1, 'training_statistics.json'))
    ts2 = load_json(os.path.join(output_dir2, 'training_statistics.json'))
    print('ts1', ts1)
    print('ts2', ts2)
    assert ts1['training']['combined']['loss'] == ts2['training']['combined']['loss']

    # compare predictions with and without resuming
    y_pred1 = np.load(os.path.join(output_dir1, 'y_predictions.npy'))
    y_pred2 = np.load(os.path.join(output_dir2, 'y_predictions.npy'))
    print('y_pred1', y_pred1)
    print('y_pred2', y_pred2)
    assert np.all(np.isclose(y_pred1, y_pred2))

def test_experiment_model_resume(tmpdir):
    # Single sequence input, single category output
    # Tests saving a model file, loading it to rerun training and predict
    input_features = [sequence_feature(encoder="rnn", reduce_output="sum")]
    output_features = [category_feature(vocab_size=2, reduce_input="sum")]

    # Generate test data
    rel_path = generate_data(input_features, output_features, os.path.join(tmpdir, "dataset.csv"))

    config = {
        "input_features": input_features,
        "output_features": output_features,
        "combiner": {"type": "concat", "output_size": 14},
        TRAINER: {"epochs": 2},
    }

    _, _, _, _, output_dir = experiment_cli(config, dataset=rel_path, output_directory=tmpdir)

    experiment_cli(config, dataset=rel_path, model_resume_path=output_dir)

    predict_cli(os.path.join(output_dir, "model"), dataset=rel_path)
    shutil.rmtree(output_dir, ignore_errors=True)

def test_numeric_transformer(transformer_key, tmpdir):
    Transformer = get_from_registry(transformer_key, numeric_transformation_registry)
    transformer_name = Transformer().__class__.__name__
    if transformer_name == 'Log1pTransformer':
        raw_values = np.random.lognormal(5, 2, size=100)
    else:
        raw_values = np.random.normal(5, 2, size=100)

    backend = LOCAL_BACKEND
    parameters = Transformer.fit_transform_params(raw_values, backend)
    if transformer_name in {'Log1pTransformer', 'IdentityTransformer'}:
        # should be empty
        assert not bool(parameters)
    else:
        # should not be empty
        assert bool(parameters)

    # instantiate numeric transformer
    numeric_transformer = Transformer(**parameters)

    # transform values
    transformed_values = numeric_transformer.transform(raw_values)

    # inverse transform the prior transformed values
    reconstructed_values = numeric_transformer.inverse_transform(transformed_values)

    # should now match
    assert np.allclose(raw_values, reconstructed_values)

    # now test numeric transformer with output feature
    df = pd.DataFrame(np.array([raw_values, raw_values]).T, columns=['x', 'y'])
    config = {
        'input_features': [
            {'name': 'x', 'type': 'numerical'}
        ],
        'output_features': [
            {'name': 'y', 'type': 'numerical',
             'preprocessing': {'normalization': transformer_key}}
        ],
        'combiner': {
            'type': 'concat',
        },
        'training': {
            'epochs': 2,
            'batch_size': 16,
        }
    }
    args = {
        'config': config,
        'skip_save_processed_input': True,
        'output_directory': os.path.join(tmpdir, 'results'),
        'logging_level': logging.WARN
    }

    # ensure no exceptions are raised
    experiment_cli(dataset=df, **args)

def train_model():
    train_df = pd.read_csv("../hand_dataset_training.csv")
    # print(train_df.shape)
    # print(train_df.head())
    test_df = pd.read_csv("../hand_dataset_testing.csv")

    experiment_cli(
        config="../model.yaml",
        training_set=train_df,
        test_set=test_df,
        output_directory="results",
        experiment_name="exp",
        random_seed=100,
    )

def run_experiment(input_features=None, output_features=None, config=None,
                   skip_save_processed_input=True, backend=None, **kwargs):
    """Helper method to avoid code repetition in running an experiment.

    Deletes the data saved to disk related to running an experiment.

    :param input_features: list of input feature dictionaries
    :param output_features: list of output feature dictionaries
    **kwargs you may also pass extra parameters to the experiment as keyword
    arguments
    :return: None
    """
    if input_features is None and output_features is None and config is None:
        raise ValueError("Cannot run test experiment without features or a config.")

    if config is None:
        config = {
            "input_features": input_features,
            "output_features": output_features,
            "combiner": {"type": "concat", "output_size": 14},
            TRAINER: {"epochs": 2},
        }

    with tempfile.TemporaryDirectory() as tmpdir:
        args = {
            "config": config,
            "backend": backend or LocalTestBackend(),
            "skip_save_training_description": True,
            "skip_save_training_statistics": True,
            "skip_save_processed_input": skip_save_processed_input,
            "skip_save_progress": True,
            "skip_save_unprocessed_output": True,
            "skip_save_model": True,
            "skip_save_predictions": True,
            "skip_save_eval_stats": True,
            "skip_collect_predictions": True,
            "skip_collect_overall_stats": True,
            "skip_save_log": True,
            "output_directory": tmpdir,
        }
        args.update(kwargs)
        experiment_cli(**args)

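
# A minimal usage sketch for the helper above, assuming the same test
# utilities used elsewhere in this suite (sequence_feature, category_feature,
# generate_data). The test name and feature values are hypothetical.
def test_run_experiment_smoke(csv_filename):
    input_features = [sequence_feature(encoder="embed", reduce_output="sum")]
    output_features = [category_feature(vocab_size=3, reduce_input="sum")]
    rel_path = generate_data(input_features, output_features, csv_filename)
    # all intermediate artifacts are skipped and written to a temp directory,
    # so nothing needs to be cleaned up afterwards
    run_experiment(input_features, output_features, dataset=rel_path)
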
def test_numeric_transformer(transformer_key, tmpdir):
    Transformer = get_from_registry(transformer_key, numeric_transformation_registry)
    transformer_name = Transformer().__class__.__name__
    if transformer_name == "Log1pTransformer":
        raw_values = np.random.lognormal(5, 2, size=100)
    else:
        raw_values = np.random.normal(5, 2, size=100)

    backend = LOCAL_BACKEND
    parameters = Transformer.fit_transform_params(raw_values, backend)
    if transformer_name in {"Log1pTransformer", "IdentityTransformer"}:
        # should be empty
        assert not bool(parameters)
    else:
        # should not be empty
        assert bool(parameters)

    # instantiate numeric transformer
    numeric_transformer = Transformer(**parameters)

    # transform values
    transformed_values = numeric_transformer.transform(raw_values)

    # inverse transform the prior transformed values
    reconstructed_values = numeric_transformer.inverse_transform(transformed_values)

    # should now match
    assert np.allclose(raw_values, reconstructed_values)

    # now test numeric transformer with output feature
    df = pd.DataFrame(np.array([raw_values, raw_values]).T, columns=["x", "y"])
    config = {
        "input_features": [{"name": "x", "type": "number"}],
        "output_features": [{"name": "y", "type": "number", "preprocessing": {"normalization": transformer_key}}],
        "combiner": {
            "type": "concat",
        },
        TRAINER: {
            "epochs": 2,
            "batch_size": 16,
        },
    }
    args = {
        "config": config,
        "skip_save_processed_input": True,
        "output_directory": os.path.join(tmpdir, "results"),
        "logging_level": logging.WARN,
    }

    # ensure no exceptions are raised
    experiment_cli(dataset=df, **args)

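
# Illustrative sketch of the fit/transform/inverse_transform round trip the
# two tests above exercise. This z-score stand-in is NOT Ludwig's
# implementation; it only mirrors the interface the tests assume.
class ZScoreSketch:
    def __init__(self, mean=0.0, std=1.0):
        self.mean = mean
        self.std = std

    @staticmethod
    def fit_transform_params(values, backend=None):
        # parameters learned from the data, later passed to __init__
        return {"mean": float(np.mean(values)), "std": float(np.std(values))}

    def transform(self, values):
        return (values - self.mean) / self.std

    def inverse_transform(self, values):
        return values * self.std + self.mean


def test_zscore_sketch_round_trip():
    # transform followed by inverse_transform recovers the input
    values = np.random.normal(5, 2, size=100)
    sketch = ZScoreSketch(**ZScoreSketch.fit_transform_params(values))
    assert np.allclose(values, sketch.inverse_transform(sketch.transform(values)))
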
def run_experiment(input_features, output_features, **kwargs):
    """Helper method to avoid code repetition in running an experiment.

    Deletes the data saved to disk after running the experiment.

    :param input_features: list of input feature dictionaries
    :param output_features: list of output feature dictionaries
    **kwargs you may also pass extra parameters to the experiment as keyword
    arguments
    :return: None
    """
    config = None
    if input_features is not None and output_features is not None:
        # This if is necessary so that the caller can call with
        # config_file (and not config)
        config = {
            "backend": "local",
            "input_features": input_features,
            "output_features": output_features,
            "combiner": {"type": "concat", "fc_size": 64, "num_fc_layers": 5},
            "training": {"epochs": 2},
        }

    args = {
        "config": config,
        "skip_save_processed_input": True,
        "skip_save_progress": True,
        "skip_save_unprocessed_output": True,
        "skip_save_model": True,
        "skip_save_log": True,
    }
    args.update(kwargs)
    exp_dir_name = experiment_cli(**args)
    shutil.rmtree(exp_dir_name, ignore_errors=True)

def test_experiment_sequence_combiner(sequence_combiner_encoder, csv_filename):
    # Sequence combiner
    input_features = [
        sequence_feature(
            name='seq1',
            min_len=5,
            max_len=5,
            encoder='rnn',
            cell_type='lstm',
            reduce_output=None
        ),
        sequence_feature(
            name='seq2',
            min_len=5,
            max_len=5,
            encoder='rnn',
            cell_type='lstm',
            reduce_output=None
        ),
        category_feature(vocab_size=5)
    ]
    output_features = [
        category_feature(reduce_input='sum', vocab_size=5)
    ]

    model_definition = {
        'input_features': input_features,
        'output_features': output_features,
        'training': {'epochs': 2},
        'combiner': {
            'type': 'sequence',
            'encoder': 'rnn',
            'main_sequence_feature': 'seq1',
            'reduce_output': None,
        }
    }

    # Generate test data
    rel_path = generate_data(input_features, output_features, csv_filename)

    for encoder in ENCODERS[:-2]:
        logger.error('sequence combiner. encoders: {0}, {1}'.format(encoder, encoder))
        input_features[0]['encoder'] = encoder
        input_features[1]['encoder'] = encoder

        model_definition['input_features'] = input_features
        exp_dir_name = experiment_cli(
            model_definition,
            skip_save_processed_input=False,
            skip_save_progress=True,
            skip_save_unprocessed_output=True,
            dataset=rel_path
        )
        shutil.rmtree(exp_dir_name, ignore_errors=True)

def test_experiment_sequence_combiner(sequence_encoder, csv_filename):
    config = {
        "input_features": [
            sequence_feature(
                name="seq1",
                min_len=5,
                max_len=5,
                encoder=sequence_encoder,
                cell_type="lstm",
                reduce_output=None,
            ),
            sequence_feature(
                name="seq2",
                min_len=5,
                max_len=5,
                encoder=sequence_encoder,
                cell_type="lstm",
                reduce_output=None,
            ),
            category_feature(vocab_size=5),
        ],
        "output_features": [category_feature(reduce_input="sum", vocab_size=5)],
        "training": {"epochs": 2},
        "combiner": {
            "type": "sequence",
            "encoder": "rnn",
            "main_sequence_feature": "seq1",
            "reduce_output": None,
        },
    }

    # Generate test data
    rel_path = generate_data(config["input_features"], config["output_features"], csv_filename)

    exp_dir_name = experiment_cli(
        config,
        skip_save_processed_input=False,
        skip_save_progress=True,
        skip_save_unprocessed_output=True,
        dataset=rel_path,
    )
    shutil.rmtree(exp_dir_name, ignore_errors=True)

def test_resume_training_mlflow(optimizer, generated_data, tmp_path):
    input_features, output_features = get_feature_configs()

    config = {
        "input_features": input_features,
        "output_features": output_features,
        "combiner": {"type": "concat"},
        TRAINER: {
            "epochs": 2,
            "batch_size": 16,
            "optimizer": {"type": optimizer},
        },
    }

    # create sub-directory to store results
    results_dir = tmp_path / "results"
    results_dir.mkdir()
    mlflow_uri = f"file://{tmp_path}/mlruns"
    experiment_name = optimizer + "_experiment"

    _, _, _, _, output_dir1 = experiment_cli(
        config,
        training_set=generated_data.train_df,
        validation_set=generated_data.validation_df,
        test_set=generated_data.test_df,
        callbacks=[MlflowCallback(mlflow_uri)],
        experiment_name=experiment_name,
    )

    # Can't change any artifact spec on a run once it has been logged to
    # mlflow, so skip changing epochs here.
    _, _, _, _, output_dir2 = experiment_cli(
        config,
        training_set=generated_data.train_df,
        validation_set=generated_data.validation_df,
        test_set=generated_data.test_df,
        model_resume_path=output_dir1,
        callbacks=[MlflowCallback(mlflow_uri)],
        experiment_name=experiment_name,
    )

    # make sure there is only one mlflow run id
    experiment = mlflow.get_experiment_by_name(experiment_name)
    previous_runs = mlflow.search_runs([experiment.experiment_id])
    assert len(previous_runs) == 1

def test_sequence_generator(enc_encoder,
                            enc_cell_type,
                            dec_cell_type,
                            dec_attention,
                            dec_beam_width,
                            dec_num_layers,
                            loss_sampler,
                            generate_deterministic_sequence):
    # Define input and output features
    input_features = [{
        'name': 'in_seq',
        'type': 'sequence',
        'encoder': enc_encoder,
        'cell_type': enc_cell_type,
        'reduce_output': None
    }]
    output_features = [{
        'name': 'out_seq',
        'type': 'sequence',
        'cell_type': dec_cell_type,
        'num_layers': dec_num_layers,
        'beam_width': dec_beam_width,
        'decoder': 'generator',
        'attention': dec_attention,
        'reduce_input': None,
        'loss': {
            'type': 'sampled_softmax_cross_entropy',
            'negative_samples': 10,
            'sampler': loss_sampler
        }
    }]
    model_definition = {
        'input_features': input_features,
        'output_features': output_features,
        'combiner': {
            'type': 'concat',
            'fc_size': 14
        },
        'training': {
            'epochs': 2,
            'early_stop': 5,
            'batch_size': 80,
            'optimizer': {'type': 'adam'},
            'learning_rate': 0.001,
        }
    }
    args = {
        'config': model_definition,
        'skip_save_processed_input': True,
        'skip_save_progress': True,
        'skip_save_unprocessed_output': True,
        'skip_save_model': True,
        'skip_save_log': True,
        'debug': False
    }

    # Generate test data
    np.random.seed(42)
    df = generate_deterministic_sequence

    # run the experiment
    experiment_cli(dataset=df, **args)

def test_early_stopping(early_stop, generated_data, tmp_path):
    input_features, output_features = get_feature_configs()

    config = {
        'input_features': input_features,
        'output_features': output_features,
        'combiner': {'type': 'concat'},
        'training': {
            'epochs': 30,
            'early_stop': early_stop,
            'batch_size': 16
        }
    }

    # create sub-directory to store results
    results_dir = tmp_path / 'results'
    results_dir.mkdir()

    # run experiment
    _, _, _, _, output_dir = experiment_cli(
        training_set=generated_data.train_df,
        validation_set=generated_data.validation_df,
        test_set=generated_data.test_df,
        output_directory=str(results_dir),
        config=config,
        skip_save_processed_input=True,
        skip_save_progress=True,
        skip_save_unprocessed_output=True,
        skip_save_model=True,
        skip_save_log=True
    )

    # test existence of required files
    train_stats_fp = os.path.join(output_dir, 'training_statistics.json')
    metadata_fp = os.path.join(output_dir, 'description.json')
    assert os.path.isfile(train_stats_fp)
    assert os.path.isfile(metadata_fp)

    # retrieve results so we can validate early stopping
    with open(train_stats_fp, 'r') as f:
        train_stats = json.load(f)
    with open(metadata_fp, 'r') as f:
        metadata = json.load(f)

    # get early stopping value
    early_stop_value = metadata['config']['training']['early_stop']

    # retrieve validation losses
    vald_losses = np.array(train_stats['validation']['combined']['loss'])
    last_epoch = vald_losses.shape[0]
    best_epoch = np.argmin(vald_losses)

    # confirm early stopping
    assert (last_epoch - best_epoch - 1) == early_stop_value

def test_model_progress_save(
        skip_save_progress,
        skip_save_model,
        generated_data,
        tmp_path
):
    input_features, output_features = get_feature_configs()

    config = {
        'input_features': input_features,
        'output_features': output_features,
        'combiner': {'type': 'concat'},
        'training': {'epochs': 5}
    }

    # create sub-directory to store results
    results_dir = tmp_path / 'results'
    results_dir.mkdir()

    # run experiment
    _, _, _, _, output_dir = experiment_cli(
        training_set=generated_data.train_df,
        validation_set=generated_data.validation_df,
        test_set=generated_data.test_df,
        output_directory=str(results_dir),
        config=config,
        skip_save_processed_input=True,
        skip_save_progress=skip_save_progress,
        skip_save_unprocessed_output=True,
        skip_save_model=skip_save_model,
        skip_save_log=True
    )

    # ========== Check for required result data sets =============
    model_dir = os.path.join(output_dir, 'model')
    files = [f for f in os.listdir(model_dir) if re.match(r'model_weights', f)]
    if skip_save_model:
        assert len(files) == 0
    else:
        # at least one .index and one .data file, but .data may be more
        assert len(files) >= 2
        assert os.path.isfile(os.path.join(output_dir, 'model', 'checkpoint'))

    if skip_save_progress:
        assert not os.path.isdir(
            os.path.join(output_dir, 'model', 'training_checkpoints')
        )
    else:
        assert os.path.isdir(
            os.path.join(output_dir, 'model', 'training_checkpoints')
        )

def test_early_stopping(early_stop, generated_data, tmp_path):
    input_features, output_features = get_feature_configs()

    config = {
        "input_features": input_features,
        "output_features": output_features,
        "combiner": {"type": "concat"},
        TRAINER: {
            "epochs": 30,
            "early_stop": early_stop,
            "batch_size": 16,
        },
    }

    # create sub-directory to store results
    results_dir = tmp_path / "results"
    results_dir.mkdir()

    # run experiment
    _, _, _, _, output_dir = experiment_cli(
        training_set=generated_data.train_df,
        validation_set=generated_data.validation_df,
        test_set=generated_data.test_df,
        output_directory=str(results_dir),
        config=config,
        skip_save_processed_input=True,
        skip_save_progress=True,
        skip_save_unprocessed_output=True,
        skip_save_model=True,
        skip_save_log=True,
    )

    # test existence of required files
    train_stats_fp = os.path.join(output_dir, "training_statistics.json")
    metadata_fp = os.path.join(output_dir, DESCRIPTION_FILE_NAME)
    assert os.path.isfile(train_stats_fp)
    assert os.path.isfile(metadata_fp)

    # retrieve results so we can validate early stopping
    with open(train_stats_fp) as f:
        train_stats = json.load(f)
    with open(metadata_fp) as f:
        metadata = json.load(f)

    # get early stopping value
    early_stop_value = metadata["config"][TRAINER]["early_stop"]

    # retrieve validation losses
    vald_losses_data = train_stats["validation"]["combined"]["loss"]

    last_evaluation = len(vald_losses_data) - 1
    best_evaluation = np.argmin(vald_losses_data)

    # confirm early stopping
    assert last_evaluation - best_evaluation == early_stop_value

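
# Toy illustration of the early stopping arithmetic asserted above (loss
# values are made up): with patience early_stop=2, training halts two
# evaluations after the best one, so the index gap equals the patience.
def test_early_stopping_arithmetic_sketch():
    vald_losses_data = [0.9, 0.7, 0.5, 0.6, 0.65]  # best loss at index 2
    last_evaluation = len(vald_losses_data) - 1    # 4
    best_evaluation = np.argmin(vald_losses_data)  # 2
    assert last_evaluation - best_evaluation == 2  # matches early_stop=2
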
def run_experiment(
    input_features,
    output_features,
    skip_save_processed_input=True,
    config=None,
    backend=None,
    **kwargs,
):
    """Helper method to avoid code repetition in running an experiment.

    Deletes the data saved to disk after running the experiment.

    :param input_features: list of input feature dictionaries
    :param output_features: list of output feature dictionaries
    **kwargs you may also pass extra parameters to the experiment as keyword
    arguments
    :return: None
    """
    if input_features is not None and output_features is not None:
        # This if is necessary so that the caller can call with
        # config_file (and not config)
        config = {
            "input_features": input_features,
            "output_features": output_features,
            "combiner": {"type": "concat", "fc_size": 14},
            "training": {"epochs": 2},
        }

    args = {
        "config": config,
        "backend": backend or LocalTestBackend(),
        "skip_save_training_description": True,
        "skip_save_training_statistics": True,
        "skip_save_processed_input": skip_save_processed_input,
        "skip_save_progress": True,
        "skip_save_unprocessed_output": True,
        "skip_save_model": True,
        "skip_save_predictions": True,
        "skip_save_eval_stats": True,
        "skip_collect_predictions": True,
        "skip_collect_overall_stats": True,
        "skip_save_log": True,
    }
    args.update(kwargs)
    _, _, _, _, exp_dir_name = experiment_cli(**args)
    shutil.rmtree(exp_dir_name, ignore_errors=True)

def run_experiment(input_features,
                   output_features,
                   skip_save_processed_input=True,
                   config=None,
                   backend=None,
                   **kwargs):
    """Helper method to avoid code repetition in running an experiment.

    Deletes the data saved to disk after running the experiment.

    :param input_features: list of input feature dictionaries
    :param output_features: list of output feature dictionaries
    **kwargs you may also pass extra parameters to the experiment as keyword
    arguments
    :return: None
    """
    if input_features is not None and output_features is not None:
        # This if is necessary so that the caller can call with
        # config_file (and not config)
        config = {
            'input_features': input_features,
            'output_features': output_features,
            'combiner': {
                'type': 'concat',
                'fc_size': 14
            },
            'training': {'epochs': 2}
        }

    args = {
        'config': config,
        'backend': backend or LocalTestBackend(),
        'skip_save_training_description': True,
        'skip_save_training_statistics': True,
        'skip_save_processed_input': skip_save_processed_input,
        'skip_save_progress': True,
        'skip_save_unprocessed_output': True,
        'skip_save_model': True,
        'skip_save_predictions': True,
        'skip_save_eval_stats': True,
        'skip_collect_predictions': True,
        'skip_collect_overall_stats': True,
        'skip_save_log': True
    }
    args.update(kwargs)
    _, _, _, _, exp_dir_name = experiment_cli(**args)
    shutil.rmtree(exp_dir_name, ignore_errors=True)

def test_model_progress_save(skip_save_progress, skip_save_model, generated_data, tmp_path):
    input_features, output_features = get_feature_configs()

    config = {
        "input_features": input_features,
        "output_features": output_features,
        "combiner": {"type": "concat"},
        TRAINER: {"epochs": 5},
    }

    # create sub-directory to store results
    results_dir = tmp_path / "results"
    results_dir.mkdir()

    # run experiment
    _, _, _, _, output_dir = experiment_cli(
        training_set=generated_data.train_df,
        validation_set=generated_data.validation_df,
        test_set=generated_data.test_df,
        output_directory=str(results_dir),
        config=config,
        skip_save_processed_input=True,
        skip_save_progress=skip_save_progress,
        skip_save_unprocessed_output=True,
        skip_save_model=skip_save_model,
        skip_save_log=True,
    )

    # ========== Check for required result data sets =============
    model_dir = os.path.join(output_dir, "model")
    files = [f for f in os.listdir(model_dir) if re.match(r"model_weights", f)]
    if skip_save_model:
        assert len(files) == 0
    else:
        assert len(files) == 1

    training_checkpoints_dir = os.path.join(output_dir, "model", "training_checkpoints")
    training_checkpoints = os.listdir(training_checkpoints_dir)
    if skip_save_progress:
        assert len(training_checkpoints) == 0
    else:
        assert len(training_checkpoints) > 0

def test_experiment_sequence_combiner_with_embed_encoder_fails(csv_filename):
    config = {
        "input_features": [
            sequence_feature(
                name="seq1",
                min_len=5,
                max_len=5,
                encoder="embed",
                cell_type="lstm",
                reduce_output=None,
            ),
            sequence_feature(
                name="seq2",
                min_len=5,
                max_len=5,
                encoder="embed",
                cell_type="lstm",
                reduce_output=None,
            ),
            category_feature(vocab_size=5),
        ],
        "output_features": [category_feature(reduce_input="sum", vocab_size=5)],
        "training": {"epochs": 2},
        "combiner": {
            "type": "sequence",
            "encoder": "rnn",
            "main_sequence_feature": "seq1",
            "reduce_output": None,
        },
    }

    # Generate test data
    rel_path = generate_data(config["input_features"], config["output_features"], csv_filename)

    # Encoding sequence features with 'embed' should fail with
    # SequenceConcatCombiner, since at least one sequence feature should be rank 3.
    # Because the call raises, no experiment directory is created, so there is
    # nothing to clean up afterwards.
    with pytest.raises(ValueError):
        experiment_cli(
            config,
            skip_save_processed_input=False,
            skip_save_progress=True,
            skip_save_unprocessed_output=True,
            dataset=rel_path,
        )

def run_experiment(input_features, output_features, **kwargs):
    """Helper method to avoid code repetition in running an experiment.

    Deletes the data saved to disk after running the experiment.

    :param input_features: list of input feature dictionaries
    :param output_features: list of output feature dictionaries
    **kwargs you may also pass extra parameters to the experiment as keyword
    arguments
    :return: None
    """
    config = None
    if input_features is not None and output_features is not None:
        # This if is necessary so that the caller can call with
        # config_file (and not config)
        config = {
            'input_features': input_features,
            'output_features': output_features,
            'combiner': {
                'type': 'concat',
                'fc_size': 64,
                'num_fc_layers': 5
            },
            'training': {'epochs': 2}
        }

    args = {
        'config': config,
        'skip_save_processed_input': True,
        'skip_save_progress': True,
        'skip_save_unprocessed_output': True,
        'skip_save_model': True,
        'skip_save_log': True
    }
    args.update(kwargs)
    exp_dir_name = experiment_cli(**args)
    shutil.rmtree(exp_dir_name, ignore_errors=True)

def test_regularization(generated_data, tmp_path):
    input_features, output_features = get_feature_configs()

    config = {
        'input_features': input_features,
        'output_features': output_features,
        'combiner': {'type': 'concat'},
        'training': {
            'epochs': 1,
            'batch_size': 16,
            'regularization_lambda': 1
        }
    }

    # create sub-directory to store results
    results_dir = tmp_path / 'results'
    results_dir.mkdir()

    regularization_losses = []
    for regularizer in [None, 'l1', 'l2', 'l1_l2']:
        tf.keras.backend.clear_session()
        np.random.seed(RANDOM_SEED)
        tf.random.set_seed(RANDOM_SEED)

        # setup regularization parameters
        config['output_features'][0]['weights_regularizer'] = regularizer
        config['output_features'][0]['bias_regularizer'] = regularizer
        config['output_features'][0]['activity_regularizer'] = regularizer

        # run experiment
        _, _, _, _, output_dir = experiment_cli(
            training_set=generated_data.train_df,
            validation_set=generated_data.validation_df,
            test_set=generated_data.test_df,
            output_directory=str(results_dir),
            config=config,
            experiment_name='regularization',
            model_name=str(regularizer),
            skip_save_processed_input=True,
            skip_save_progress=True,
            skip_save_unprocessed_output=True,
            skip_save_model=True,
            skip_save_log=True
        )

        # test existence of required files
        train_stats_fp = os.path.join(output_dir, 'training_statistics.json')
        metadata_fp = os.path.join(output_dir, 'description.json')
        assert os.path.isfile(train_stats_fp)
        assert os.path.isfile(metadata_fp)

        # retrieve results so we can compare training loss with regularization
        with open(train_stats_fp, 'r') as f:
            train_stats = json.load(f)

        # retrieve training losses for all epochs
        train_losses = np.array(train_stats['training']['combined']['loss'])
        regularization_losses.append(train_losses[0])

    # create a set of losses
    regularization_losses_set = set(regularization_losses)

    # ensure all losses obtained with the different methods are different
    assert len(regularization_losses) == len(regularization_losses_set)

def test_regularization(generated_data, tmp_path):
    input_features, output_features = get_feature_configs()

    config = {
        "input_features": input_features,
        "output_features": output_features,
        "combiner": {"type": "concat"},
        TRAINER: {
            "epochs": 1,
            "batch_size": 16,
            "regularization_lambda": 1,
        },
    }

    # create sub-directory to store results
    results_dir = tmp_path / "results"
    results_dir.mkdir()

    regularization_losses = []
    for regularizer in [None, "l1", "l2", "l1_l2"]:
        np.random.seed(RANDOM_SEED)
        torch.manual_seed(RANDOM_SEED)

        # setup regularization parameters
        config[TRAINER]["regularization_type"] = regularizer

        # run experiment
        _, _, _, _, output_dir = experiment_cli(
            training_set=generated_data.train_df,
            validation_set=generated_data.validation_df,
            test_set=generated_data.test_df,
            output_directory=str(results_dir),
            config=config,
            experiment_name="regularization",
            model_name=str(regularizer),
            skip_save_processed_input=True,
            skip_save_progress=True,
            skip_save_unprocessed_output=True,
            skip_save_model=True,
            skip_save_log=True,
        )

        # test existence of required files
        train_stats_fp = os.path.join(output_dir, "training_statistics.json")
        metadata_fp = os.path.join(output_dir, "description.json")
        assert os.path.isfile(train_stats_fp)
        assert os.path.isfile(metadata_fp)

        # retrieve results so we can compare training loss with regularization
        with open(train_stats_fp) as f:
            train_stats = json.load(f)

        # retrieve training losses for all epochs
        train_losses = np.array(train_stats[TRAINING]["combined"]["loss"])
        regularization_losses.append(train_losses[0])

    # create a set of losses
    regularization_losses_set = set(regularization_losses)

    # ensure all losses obtained with the different methods are different
    assert len(regularization_losses) == len(regularization_losses_set)

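
# Hedged sketch of why the first-epoch losses above should all differ: each
# regularization type adds a different penalty to the same weights. This is a
# standalone illustration, not Ludwig's internal loss computation.
def test_regularization_penalties_differ_sketch():
    weights = torch.tensor([0.5, -1.0, 2.0])
    l1 = weights.abs().sum()     # L1 penalty: 3.5
    l2 = (weights ** 2).sum()    # L2 penalty: 5.25
    l1_l2 = l1 + l2              # combined penalty: 8.75
    penalties = [0.0, float(l1), float(l2), float(l1_l2)]  # 0.0 ~ regularizer None
    # distinct penalties -> distinct training losses, mirroring the set check above
    assert len(set(penalties)) == len(penalties)
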