def test_experiment_model_resume(csv_filename):
    # Single sequence input, single category output
    # Tests saving a model file, loading it to rerun training and predict
    input_features = [sequence_feature(encoder='rnn', reduce_output='sum')]
    output_features = [category_feature(vocab_size=2, reduce_input='sum')]

    # Generate test data
    rel_path = generate_data(input_features, output_features, csv_filename)

    model_definition = {
        'input_features': input_features,
        'output_features': output_features,
        'combiner': {'type': 'concat', 'fc_size': 14},
        'training': {'epochs': 2}
    }

    exp_dir_name = full_experiment(model_definition, data_csv=rel_path)
    logger.info('Experiment Directory: {0}'.format(exp_dir_name))

    full_experiment(
        model_definition,
        data_csv=rel_path,
        model_resume_path=exp_dir_name
    )

    full_predict(os.path.join(exp_dir_name, 'model'), data_csv=rel_path)
    shutil.rmtree(exp_dir_name, ignore_errors=True)


def test_resume_training(optimizer, generated_data, tmp_path):
    input_features, output_features = get_feature_definitions()
    model_definition = {
        'input_features': input_features,
        'output_features': output_features,
        'combiner': {'type': 'concat'},
        'training': {
            'epochs': 2,
            'early_stop': 1000,
            'batch_size': 16,
            'optimizer': {'type': optimizer}
        }
    }

    # create sub-directory to store results
    results_dir = tmp_path / 'results'
    results_dir.mkdir()

    exp_dir_name_1 = full_experiment(
        model_definition,
        data_train_df=generated_data.train_df,
        data_validation_df=generated_data.validation_df,
        data_test_df=generated_data.test_df,
        output_directory=str(results_dir)
    )

    model_definition['training']['epochs'] = 4

    full_experiment(
        model_definition,
        data_train_df=generated_data.train_df,
        data_validation_df=generated_data.validation_df,
        data_test_df=generated_data.test_df,
        model_resume_path=exp_dir_name_1
    )

    exp_dir_name_2 = full_experiment(
        model_definition,
        data_train_df=generated_data.train_df,
        data_validation_df=generated_data.validation_df,
        data_test_df=generated_data.test_df
    )

    # compare learning curves with and without resuming
    ts1 = load_json(os.path.join(exp_dir_name_1, 'training_statistics.json'))
    ts2 = load_json(os.path.join(exp_dir_name_2, 'training_statistics.json'))
    print('ts1', ts1)
    print('ts2', ts2)
    assert (ts1['training']['combined']['loss'] ==
            ts2['training']['combined']['loss'])

    # compare predictions with and without resuming
    y_pred1 = np.load(os.path.join(exp_dir_name_1, 'y_predictions.npy'))
    y_pred2 = np.load(os.path.join(exp_dir_name_2, 'y_predictions.npy'))
    print('y_pred1', y_pred1)
    print('y_pred2', y_pred2)
    assert np.all(np.isclose(y_pred1, y_pred2))


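# Several tests in this module rely on a `generated_data` fixture and a
# `get_feature_definitions()` helper whose definitions fall outside this
# excerpt. The sketch below is a minimal, hypothetical reconstruction
# inferred from how the tests use them (two numerical inputs, one
# numerical output); names and fields here are assumptions, not the
# actual implementation.
import collections

import numpy as np
import pandas as pd
import pytest

GeneratedData = collections.namedtuple(
    'GeneratedData',
    ['train_df', 'validation_df', 'test_df']
)


def get_feature_definitions():
    # feature dictionaries in Ludwig's model-definition format
    input_features = [
        {'name': 'x1', 'type': 'numerical'},
        {'name': 'x2', 'type': 'numerical'}
    ]
    output_features = [{'name': 'y', 'type': 'numerical'}]
    return input_features, output_features


@pytest.fixture
def generated_data():
    # small random frames with a deterministic linear target
    def make_df(num_rows):
        x = np.random.uniform(size=(num_rows, 2))
        return pd.DataFrame(
            {'x1': x[:, 0], 'x2': x[:, 1], 'y': 2 * x[:, 0] + x[:, 1]}
        )

    return GeneratedData(make_df(64), make_df(16), make_df(16))

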
def test_experiment_sequence_combiner(sequence_combiner_encoder, csv_filename):
    # Sequence combiner
    input_features = [
        sequence_feature(
            name='seq1',
            min_len=5,
            max_len=5,
            encoder='rnn',
            cell_type='lstm',
            reduce_output=None
        ),
        sequence_feature(
            name='seq2',
            min_len=5,
            max_len=5,
            encoder='rnn',
            cell_type='lstm',
            reduce_output=None
        ),
        category_feature(vocab_size=5)
    ]
    output_features = [
        category_feature(reduce_input='sum', vocab_size=5)
    ]

    model_definition = {
        'input_features': input_features,
        'output_features': output_features,
        'training': {
            'epochs': 2
        },
        'combiner': {
            'type': 'sequence',
            'encoder': 'rnn',
            'main_sequence_feature': 'seq1',
            'reduce_output': None,
        }
    }

    # Generate test data
    rel_path = generate_data(input_features, output_features, csv_filename)

    for encoder in ENCODERS[:-2]:
        logger.info('sequence combiner. encoder: {0}'.format(encoder))
        input_features[0]['encoder'] = encoder
        input_features[1]['encoder'] = encoder

        model_definition['input_features'] = input_features

        exp_dir_name = full_experiment(
            model_definition,
            skip_save_processed_input=False,
            skip_save_progress=True,
            skip_save_unprocessed_output=True,
            data_csv=rel_path
        )
        shutil.rmtree(exp_dir_name, ignore_errors=True)


def test_early_stopping(early_stop, generated_data, tmp_path):
    input_features, output_features = get_feature_definitions()
    model_definition = {
        'input_features': input_features,
        'output_features': output_features,
        'combiner': {
            'type': 'concat'
        },
        'training': {
            'epochs': 30,
            'early_stop': early_stop,
            'batch_size': 16
        }
    }

    # create sub-directory to store results
    results_dir = tmp_path / 'results'
    results_dir.mkdir()

    # run experiment
    exp_dir_name = full_experiment(
        data_train_df=generated_data.train_df,
        data_validation_df=generated_data.validation_df,
        data_test_df=generated_data.test_df,
        output_directory=str(results_dir),
        model_definition=model_definition,
        skip_save_processed_input=True,
        skip_save_progress=True,
        skip_save_unprocessed_output=True,
        skip_save_model=True,
        skip_save_log=True
    )

    # test existence of required files
    train_stats_fp = os.path.join(exp_dir_name, 'training_statistics.json')
    metadata_fp = os.path.join(exp_dir_name, 'description.json')
    assert os.path.isfile(train_stats_fp)
    assert os.path.isfile(metadata_fp)

    # retrieve results so we can validate early stopping
    with open(train_stats_fp, 'r') as f:
        train_stats = json.load(f)
    with open(metadata_fp, 'r') as f:
        metadata = json.load(f)

    # get early stopping value
    early_stop_value = metadata['model_definition']['training']['early_stop']

    # retrieve validation losses
    vald_losses = np.array(train_stats['validation']['combined']['loss'])
    last_epoch = vald_losses.shape[0]
    best_epoch = np.argmin(vald_losses)

    # confirm early stopping
    assert (last_epoch - best_epoch - 1) == early_stop_value


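# The final assertion in test_early_stopping rests on a counting
# argument: training halts once `early_stop` consecutive epochs pass
# without improving the best validation loss, so the number of epochs
# recorded after the best one equals `early_stop`. Below is a tiny
# self-contained sanity test of that arithmetic; the loss values are
# illustrative, not output from a real run.
def test_early_stopping_gap_arithmetic():
    import numpy as np  # local import keeps the sketch self-contained

    # best loss at epoch index 1, followed by two non-improving epochs,
    # matches an early_stop setting of 2
    vald_losses = np.array([0.9, 0.5, 0.6, 0.7])
    last_epoch = vald_losses.shape[0]
    best_epoch = np.argmin(vald_losses)
    assert (last_epoch - best_epoch - 1) == 2

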
def test_model_progress_save(
        skip_save_progress,
        skip_save_model,
        generated_data,
        tmp_path
):
    input_features, output_features = get_feature_definitions()

    model_definition = {
        'input_features': input_features,
        'output_features': output_features,
        'combiner': {'type': 'concat'},
        'training': {'epochs': 5}
    }

    # create sub-directory to store results
    results_dir = tmp_path / 'results'
    results_dir.mkdir()

    # run experiment
    exp_dir_name = full_experiment(
        data_train_df=generated_data.train_df,
        data_validation_df=generated_data.validation_df,
        data_test_df=generated_data.test_df,
        output_directory=str(results_dir),
        model_definition=model_definition,
        skip_save_processed_input=True,
        skip_save_progress=skip_save_progress,
        skip_save_unprocessed_output=True,
        skip_save_model=skip_save_model,
        skip_save_log=True
    )

    # ========== Check for required result data sets =============
    model_dir = os.path.join(exp_dir_name, 'model')
    files = [f for f in os.listdir(model_dir)
             if re.match(r'model_weights', f)]
    if skip_save_model:
        assert len(files) == 0
    else:
        # at least one .index and one .data file, but .data may be more
        assert len(files) >= 2
        assert os.path.isfile(os.path.join(model_dir, 'checkpoint'))

    if skip_save_progress:
        assert not os.path.isdir(
            os.path.join(model_dir, 'training_checkpoints')
        )
    else:
        assert os.path.isdir(
            os.path.join(model_dir, 'training_checkpoints')
        )


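# Background for the file-count checks above: TensorFlow writes a
# checkpoint as an `.index` file plus one or more
# `.data-XXXXX-of-XXXXX` shards, all sharing the `model_weights`
# prefix, alongside a `checkpoint` bookkeeping file. That is why a
# saved model yields at least two matching files, with the number of
# `.data` shards varying by configuration.

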
def test_optimizers(optimizer_type, generated_data, tmp_path):
    input_features, output_features = get_feature_definitions()

    model_definition = {
        'input_features': input_features,
        'output_features': output_features,
        'combiner': {
            'type': 'concat'
        },
        'training': {
            'epochs': 5,
            'batch_size': 16,
            'optimizer': {'type': optimizer_type}
        }
    }

    # create sub-directory to store results
    results_dir = tmp_path / 'results'
    results_dir.mkdir()

    # run experiment
    exp_dir_name = full_experiment(
        data_train_df=generated_data.train_df,
        data_validation_df=generated_data.validation_df,
        data_test_df=generated_data.test_df,
        output_directory=str(results_dir),
        model_definition=model_definition,
        skip_save_processed_input=True,
        skip_save_progress=True,
        skip_save_unprocessed_output=True,
        skip_save_model=True,
        skip_save_log=True
    )

    # test existence of required files
    train_stats_fp = os.path.join(exp_dir_name, 'training_statistics.json')
    metadata_fp = os.path.join(exp_dir_name, 'description.json')
    assert os.path.isfile(train_stats_fp)
    assert os.path.isfile(metadata_fp)

    # retrieve results so we can validate that training converged
    with open(train_stats_fp, 'r') as f:
        train_stats = json.load(f)

    # retrieve training losses for first and last epochs
    train_losses = np.array(train_stats['training']['combined']['loss'])
    last_epoch = train_losses.shape[0]

    # ensure train loss for last epoch is less than first epoch
    assert train_losses[last_epoch - 1] < train_losses[0]


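# The bare parameters in the test signatures above (`optimizer`,
# `early_stop`, `skip_save_progress`, `skip_save_model`,
# `optimizer_type`) are supplied by pytest parametrization; the
# decorators fall outside this excerpt. A minimal runnable sketch of
# the pattern, with an illustrative (not exhaustive) list of optimizer
# names as an assumption:
import pytest


@pytest.mark.parametrize('demo_optimizer_type', ['sgd', 'adam', 'rmsprop'])
def test_parametrize_pattern_demo(demo_optimizer_type):
    # pytest invokes this test once per listed optimizer name
    assert demo_optimizer_type in {'sgd', 'adam', 'rmsprop'}

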
def run_experiment(input_features, output_features, **kwargs):
    """
    Helper method to avoid code repetition when running an experiment.
    Deletes the data saved to disk after running the experiment.

    :param input_features: list of input feature dictionaries
    :param output_features: list of output feature dictionaries
    :param kwargs: extra parameters passed to the experiment as
           keyword arguments
    :return: None
    """
    model_definition = None
    if input_features is not None and output_features is not None:
        # This if is necessary so that the caller can call with
        # model_definition_file (and not model_definition)
        model_definition = {
            'input_features': input_features,
            'output_features': output_features,
            'combiner': {
                'type': 'concat',
                'fc_size': 64,
                'num_fc_layers': 5
            },
            'training': {'epochs': 2}
        }

    args = {
        'model_definition': model_definition,
        'skip_save_processed_input': True,
        'skip_save_progress': True,
        'skip_save_unprocessed_output': True,
        'skip_save_model': True,
        'skip_save_log': True
    }
    args.update(kwargs)

    exp_dir_name = full_experiment(**args)
    shutil.rmtree(exp_dir_name, ignore_errors=True)


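# Example invocation of run_experiment (hypothetical, mirroring how the
# other tests build features; sequence_feature, category_feature and
# generate_data come from the test utilities already used in this
# module):
#
#     input_features = [sequence_feature(encoder='rnn', reduce_output='sum')]
#     output_features = [category_feature(vocab_size=2, reduce_input='sum')]
#     rel_path = generate_data(input_features, output_features, csv_filename)
#     run_experiment(input_features, output_features, data_csv=rel_path)

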
def test_regularization(generated_data, tmp_path):
    input_features, output_features = get_feature_definitions()

    model_definition = {
        'input_features': input_features,
        'output_features': output_features,
        'combiner': {
            'type': 'concat'
        },
        'training': {
            'epochs': 5,
            'batch_size': 16,
            'regularization_lambda': 1
        }
    }

    # create sub-directory to store results
    results_dir = tmp_path / 'results'
    results_dir.mkdir()

    regularization_losses = []
    for regularizer in [None, 'l1', 'l2', 'l1_l2']:
        tf.keras.backend.clear_session()
        np.random.seed(RANDOM_SEED)
        tf.random.set_seed(RANDOM_SEED)

        # setup regularization parameters
        model_definition['output_features'][0][
            'weights_regularizer'] = regularizer
        model_definition['output_features'][0][
            'bias_regularizer'] = regularizer
        model_definition['output_features'][0][
            'activity_regularizer'] = regularizer

        # run experiment
        exp_dir_name = full_experiment(
            data_train_df=generated_data.train_df,
            data_validation_df=generated_data.validation_df,
            data_test_df=generated_data.test_df,
            output_directory=str(results_dir),
            model_definition=model_definition,
            experiment_name='regularization',
            model_name=str(regularizer),
            skip_save_processed_input=True,
            skip_save_progress=True,
            skip_save_unprocessed_output=True,
            skip_save_model=True,
            skip_save_log=True
        )

        # test existence of required files
        train_stats_fp = os.path.join(
            exp_dir_name, 'training_statistics.json')
        metadata_fp = os.path.join(exp_dir_name, 'description.json')
        assert os.path.isfile(train_stats_fp)
        assert os.path.isfile(metadata_fp)

        # retrieve results so we can compare training loss with
        # regularization
        with open(train_stats_fp, 'r') as f:
            train_stats = json.load(f)

        # retrieve training losses for all epochs
        train_losses = np.array(train_stats['training']['combined']['loss'])
        regularization_losses.append(train_losses)

    # prepare for comparing training losses
    regularization_losses = np.array(regularization_losses).T

    # extract training losses w/o regularization
    reg_loss_none = regularization_losses[:, 0].reshape(-1, 1)

    # extract training losses with regularization
    reg_loss_l1_l2_l1l2 = regularization_losses[:, 1:]

    # ensure losses with l1, l2 and l1_l2 regularization are greater
    # than losses without regularization
    assert np.all(reg_loss_none < reg_loss_l1_l2_l1l2)


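# A small self-contained demo of the broadcasting behind the final
# assert in test_regularization: transposing gives shape
# (num_epochs, 4), so the (num_epochs, 1) unregularized column
# broadcasts against the (num_epochs, 3) block of l1/l2/l1_l2 losses,
# comparing every regularizer against the baseline epoch-by-epoch.
# The loss values below are illustrative only.
def test_regularization_broadcast_demo():
    import numpy as np  # local import keeps the sketch self-contained

    losses = np.array([
        [0.50, 0.60, 0.62, 0.65],  # epoch 1: none, l1, l2, l1_l2
        [0.40, 0.52, 0.55, 0.58]   # epoch 2: none, l1, l2, l1_l2
    ])
    none_col = losses[:, 0].reshape(-1, 1)  # shape (2, 1)
    reg_cols = losses[:, 1:]                # shape (2, 3)
    assert np.all(none_col < reg_cols)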