Example #1
def test_experiment_model_resume(csv_filename):
    # Single sequence input, single category output
    # Tests saving a model file, loading it to rerun training and predict
    input_features = [sequence_feature(encoder='rnn', reduce_output='sum')]
    output_features = [category_feature(vocab_size=2, reduce_input='sum')]
    # Generate test data
    rel_path = generate_data(input_features, output_features, csv_filename)

    model_definition = {
        'input_features': input_features,
        'output_features': output_features,
        'combiner': {'type': 'concat', 'fc_size': 14},
        'training': {'epochs': 2}
    }

    exp_dir_name = full_experiment(model_definition, data_csv=rel_path)
    logger.info('Experiment Directory: {0}'.format(exp_dir_name))

    full_experiment(
        model_definition,
        data_csv=rel_path,
        model_resume_path=exp_dir_name
    )

    full_predict(os.path.join(exp_dir_name, 'model'), data_csv=rel_path)
    shutil.rmtree(exp_dir_name, ignore_errors=True)
Example #2
def test_resume_training(optimizer, generated_data, tmp_path):
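    # Trains for 2 epochs, resumes training in the same experiment directory
    # up to 4 epochs, and verifies that the learning curves and predictions
    # match those of a single uninterrupted 4-epoch run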
    input_features, output_features = get_feature_definitions()
    model_definition = {
        'input_features': input_features,
        'output_features': output_features,
        'combiner': {'type': 'concat'},
        'training': {
            'epochs': 2,
            'early_stop': 1000,
            'batch_size': 16,
            'optimizer': {'type': optimizer}
        }
    }

    # create sub-directory to store results
    results_dir = tmp_path / 'results'
    results_dir.mkdir()

    exp_dir_name_1 = full_experiment(
        model_definition,
        data_train_df=generated_data.train_df,
        data_validation_df=generated_data.validation_df,
        data_test_df=generated_data.test_df,
        output_directory=str(results_dir)
    )

    model_definition['training']['epochs'] = 4

    full_experiment(
        model_definition,
        data_train_df=generated_data.train_df,
        data_validation_df=generated_data.validation_df,
        data_test_df=generated_data.test_df,
        model_resume_path=exp_dir_name_1
    )

    exp_dir_name_2 = full_experiment(
        model_definition,
        data_train_df=generated_data.train_df,
        data_validation_df=generated_data.validation_df,
        data_test_df=generated_data.test_df,
    )

    # compare learning curves with and without resuming
    ts1 = load_json(os.path.join(exp_dir_name_1, 'training_statistics.json'))
    ts2 = load_json(os.path.join(exp_dir_name_2, 'training_statistics.json'))
    print('ts1', ts1)
    print('ts2', ts2)
    assert (ts1['training']['combined']['loss'] ==
            ts2['training']['combined']['loss'])

    # compare predictions with and without resuming
    y_pred1 = np.load(os.path.join(exp_dir_name_1, 'y_predictions.npy'))
    y_pred2 = np.load(os.path.join(exp_dir_name_2, 'y_predictions.npy'))
    print('y_pred1', y_pred1)
    print('y_pred2', y_pred2)
    assert np.all(np.isclose(y_pred1, y_pred2))
Example #3
def test_experiment_sequence_combiner(sequence_combiner_encoder, csv_filename):
    # Sequence combiner
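    # combining two sequence inputs and a category input, repeated below for
    # several sequence encoders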
    input_features = [
        sequence_feature(
            name='seq1',
            min_len=5,
            max_len=5,
            encoder='rnn',
            cell_type='lstm',
            reduce_output=None
        ),
        sequence_feature(
            name='seq2',
            min_len=5,
            max_len=5,
            encoder='rnn',
            cell_type='lstm',
            reduce_output=None
        ),
        category_feature(vocab_size=5)
    ]
    output_features = [
        category_feature(reduce_input='sum', vocab_size=5)
    ]

    model_definition = {
        'input_features': input_features,
        'output_features': output_features,
        'training': {
            'epochs': 2
        },
        'combiner': {
            'type': 'sequence',
            'encoder': 'rnn',
            'main_sequence_feature': 'seq1',
            'reduce_output': None,
        }
    }

    # Generate test data
    rel_path = generate_data(input_features, output_features, csv_filename)

    for encoder in ENCODERS[:-2]:
        logger.info('sequence combiner. encoder: {0}'.format(encoder))
        input_features[0]['encoder'] = encoder
        input_features[1]['encoder'] = encoder

        model_definition['input_features'] = input_features

        exp_dir_name = full_experiment(
            model_definition,
            skip_save_processed_input=False,
            skip_save_progress=True,
            skip_save_unprocessed_output=True,
            data_csv=rel_path
        )
        shutil.rmtree(exp_dir_name, ignore_errors=True)
Example #4
def test_early_stopping(early_stop, generated_data, tmp_path):
    input_features, output_features = get_feature_definitions()

    model_definition = {
        'input_features': input_features,
        'output_features': output_features,
        'combiner': {
            'type': 'concat'
        },
        'training': {
            'epochs': 30,
            'early_stop': early_stop,
            'batch_size': 16
        }
    }

    # create sub-directory to store results
    results_dir = tmp_path / 'results'
    results_dir.mkdir()

    # run experiment
    exp_dir_name = full_experiment(
        data_train_df=generated_data.train_df,
        data_validation_df=generated_data.validation_df,
        data_test_df=generated_data.test_df,
        output_directory=str(results_dir),
        model_definition=model_definition,
        skip_save_processed_input=True,
        skip_save_progress=True,
        skip_save_unprocessed_output=True,
        skip_save_model=True,
        skip_save_log=True
    )

    # test existence of required files
    train_stats_fp = os.path.join(exp_dir_name, 'training_statistics.json')
    metadata_fp = os.path.join(exp_dir_name, 'description.json')
    assert os.path.isfile(train_stats_fp)
    assert os.path.isfile(metadata_fp)

    # retrieve results so we can validate early stopping
    with open(train_stats_fp, 'r') as f:
        train_stats = json.load(f)
    with open(metadata_fp, 'r') as f:
        metadata = json.load(f)

    # get early stopping value
    early_stop_value = metadata['model_definition']['training']['early_stop']

    # retrieve validation losses
    vald_losses = np.array(train_stats['validation']['combined']['loss'])
    last_epoch = vald_losses.shape[0]
    best_epoch = np.argmin(vald_losses)

    # confirm early stopping
    assert (last_epoch - best_epoch - 1) == early_stop_value
Example #5
def test_model_progress_save(
        skip_save_progress,
        skip_save_model,
        generated_data,
        tmp_path
):
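    # Verifies that model weight files are written only when skip_save_model
    # is disabled and that training checkpoints are written only when
    # skip_save_progress is disabled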
    input_features, output_features = get_feature_definitions()

    model_definition = {
        'input_features': input_features,
        'output_features': output_features,
        'combiner': {'type': 'concat'},
        'training': {'epochs': 5}
    }

    # create sub-directory to store results
    results_dir = tmp_path / 'results'
    results_dir.mkdir()

    # run experiment
    exp_dir_name = full_experiment(
        data_train_df=generated_data.train_df,
        data_validation_df=generated_data.validation_df,
        data_test_df=generated_data.test_df,
        output_directory=str(results_dir),
        model_definition=model_definition,
        skip_save_processed_input=True,
        skip_save_progress=skip_save_progress,
        skip_save_unprocessed_output=True,
        skip_save_model=skip_save_model,
        skip_save_log=True
    )

    # ========== Check for required result data sets =============
    model_dir = os.path.join(exp_dir_name, 'model')
    files = [f for f in os.listdir(model_dir)
             if re.match(r'model_weights', f)]
    if skip_save_model:
        assert len(files) == 0
    else:
        # at least one .index and one .data file, but .data may be more
        assert len(files) >= 2
        assert os.path.isfile(os.path.join(model_dir, 'checkpoint'))

    if skip_save_progress:
        assert not os.path.isdir(
            os.path.join(exp_dir_name, 'model', 'training_checkpoints')
        )
    else:
        assert os.path.isdir(
            os.path.join(exp_dir_name, 'model', 'training_checkpoints')
        )
Example #6
def test_optimizers(optimizer_type, generated_data, tmp_path):
    input_features, output_features = get_feature_definitions()

    model_definition = {
        'input_features': input_features,
        'output_features': output_features,
        'combiner': {
            'type': 'concat'
        },
        'training': {
            'epochs': 5,
            'batch_size': 16,
            'optimizer': {'type': optimizer_type}
        }
    }

    # create sub-directory to store results
    results_dir = tmp_path / 'results'
    results_dir.mkdir()

    # run experiment
    exp_dir_name = full_experiment(
        data_train_df=generated_data.train_df,
        data_validation_df=generated_data.validation_df,
        data_test_df=generated_data.test_df,
        output_directory=str(results_dir),
        model_definition=model_definition,
        skip_save_processed_input=True,
        skip_save_progress=True,
        skip_save_unprocessed_output=True,
        skip_save_model=True,
        skip_save_log=True
    )

    # test existence of required files
    train_stats_fp = os.path.join(exp_dir_name, 'training_statistics.json')
    metadata_fp = os.path.join(exp_dir_name, 'description.json')
    assert os.path.isfile(train_stats_fp)
    assert os.path.isfile(metadata_fp)

    # retrieve results so we can validate training progress
    with open(train_stats_fp, 'r') as f:
        train_stats = json.load(f)

    # retrieve training losses for first and last epochs
    train_losses = np.array(train_stats['training']['combined']['loss'])
    last_epoch = train_losses.shape[0]

    # ensure train loss for last epoch is less than first epoch
    assert train_losses[last_epoch - 1] < train_losses[0]
Example #7
def run_experiment(input_features, output_features, **kwargs):
    """
    Helper method to avoid code repetition in running an experiment. Deletes
    the data saved to disk after running the experiment
    :param input_features: list of input feature dictionaries
    :param output_features: list of output feature dictionaries
    **kwargs you may also pass extra parameters to the experiment as keyword
    arguments
    :return: None
    """
    model_definition = None
    if input_features is not None and output_features is not None:
        # This if is necessary so that the caller can call with
        # model_definition_file (and not model_definition)
        model_definition = {
            'input_features': input_features,
            'output_features': output_features,
            'combiner': {
                'type': 'concat',
                'fc_size': 64,
                'num_fc_layers': 5
            },
            'training': {
                'epochs': 2
            }
        }

    args = {
        'model_definition': model_definition,
        'skip_save_processed_input': True,
        'skip_save_progress': True,
        'skip_save_unprocessed_output': True,
        'skip_save_model': True,
        'skip_save_log': True
    }
    args.update(kwargs)

    exp_dir_name = full_experiment(**args)
    shutil.rmtree(exp_dir_name, ignore_errors=True)
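
For illustration, here is a minimal sketch of how such a helper can be invoked, reusing the sequence_feature, category_feature, and generate_data utilities from the examples above; the test name and feature settings below are hypothetical:

def test_run_experiment_usage(csv_filename):
    # Hypothetical usage sketch: single sequence input, single category output
    input_features = [sequence_feature(encoder='rnn', reduce_output='sum')]
    output_features = [category_feature(vocab_size=2, reduce_input='sum')]

    # Generate synthetic data, then delegate to the helper above, which runs
    # the experiment and removes the experiment directory afterwards
    rel_path = generate_data(input_features, output_features, csv_filename)
    run_experiment(input_features, output_features, data_csv=rel_path)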
Example #8
def test_regularization(generated_data, tmp_path):
    input_features, output_features = get_feature_definitions()

    model_definition = {
        'input_features': input_features,
        'output_features': output_features,
        'combiner': {
            'type': 'concat'
        },
        'training': {
            'epochs': 5,
            'batch_size': 16,
            'regularization_lambda': 1
        }
    }

    # create sub-directory to store results
    results_dir = tmp_path / 'results'
    results_dir.mkdir()

    regularization_losses = []
    for regularizer in [None, 'l1', 'l2', 'l1_l2']:
        tf.keras.backend.clear_session()
        np.random.seed(RANDOM_SEED)
        tf.random.set_seed(RANDOM_SEED)

        # setup regularization parameters
        model_definition['output_features'][0]['weights_regularizer'] = regularizer
        model_definition['output_features'][0]['bias_regularizer'] = regularizer
        model_definition['output_features'][0]['activity_regularizer'] = regularizer

        # run experiment
        exp_dir_name = full_experiment(
            data_train_df=generated_data.train_df,
            data_validation_df=generated_data.validation_df,
            data_test_df=generated_data.test_df,
            output_directory=str(results_dir),
            model_definition=model_definition,
            experiment_name='regularization',
            model_name=str(regularizer),
            skip_save_processed_input=True,
            skip_save_progress=True,
            skip_save_unprocessed_output=True,
            skip_save_model=True,
            skip_save_log=True
        )

        # test existence of required files
        train_stats_fp = os.path.join(exp_dir_name, 'training_statistics.json')
        metadata_fp = os.path.join(exp_dir_name, 'description.json')
        assert os.path.isfile(train_stats_fp)
        assert os.path.isfile(metadata_fp)

        # retrieve results so we can compare training loss with regularization
        with open(train_stats_fp, 'r') as f:
            train_stats = json.load(f)

        # retrieve training losses for all epochs
        train_losses = np.array(train_stats['training']['combined']['loss'])
        regularization_losses.append(train_losses)

    # prepare for comparing training losses
    regularization_losses = np.array(regularization_losses).T

    # extract training losses w/o regularization
    reg_loss_none = regularization_losses[:, 0].reshape(-1, 1)

    # extract training losses with regularization
    reg_loss_l1_l2_l1l2 = regularization_losses[:, 1:]

    # ensure the training losses with l1, l2 and l1_l2 regularization are
    # greater than the losses without regularization
    assert np.all(reg_loss_none < reg_loss_l1_l2_l1l2)