def test_experiment_sequence_combiner(sequence_encoder, csv_filename):
    config = {
        "input_features": [
            sequence_feature(name="seq1",
                             min_len=5,
                             max_len=5,
                             encoder=sequence_encoder,
                             cell_type="lstm",
                             reduce_output=None),
            sequence_feature(name="seq2",
                             min_len=5,
                             max_len=5,
                             encoder=sequence_encoder,
                             cell_type="lstm",
                             reduce_output=None),
            category_feature(vocab_size=5),
        ],
        "output_features":
        [category_feature(reduce_input="sum", vocab_size=5)],
        "training": {
            "epochs": 2
        },
        "combiner": {
            "type": "sequence",
            "encoder": "rnn",
            "main_sequence_feature": "seq1",
            "reduce_output": None,
        },
    }

    # Generate test data
    rel_path = generate_data(config["input_features"],
                             config["output_features"], csv_filename)

    exp_dir_name = experiment_cli(
        config,
        skip_save_processed_input=False,
        skip_save_progress=True,
        skip_save_unprocessed_output=True,
        dataset=rel_path,
    )
    shutil.rmtree(exp_dir_name, ignore_errors=True)
Example #2
def test_experiment_dataset_formats(data_format, csv_filename):
    # The primary focus of this test is to check whether exceptions are
    # raised for the different dataset formats and in_memory settings.

    input_features = [number_feature(), category_feature()]
    output_features = [category_feature(), number_feature()]

    config = {
        "input_features": input_features,
        "output_features": output_features,
        "combiner": {
            "type": "concat",
            "output_size": 14
        },
        "preprocessing": {},
        TRAINER: {
            "epochs": 2
        },
    }

    # setup training data format to test
    raw_data = generate_data(input_features, output_features, csv_filename)

    training_set_metadata = None

    if data_format == "hdf5":
        # hdf5 format
        training_set, _, _, training_set_metadata = preprocess_for_training(
            config, dataset=raw_data)
        dataset_to_use = training_set.data_hdf5_fp
    else:
        dataset_to_use = create_data_set_to_use(data_format, raw_data)

    # define Ludwig model
    model = LudwigModel(config=config)
    model.train(dataset=dataset_to_use,
                training_set_metadata=training_set_metadata,
                random_seed=default_random_seed)

    # run evaluation and prediction with the specified data format
    model.evaluate(dataset=dataset_to_use)
    model.predict(dataset=dataset_to_use)
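The `data_format` argument is presumably supplied by a pytest parametrization that this listing omits. A minimal sketch of such a decorator, using a hypothetical subset of the formats that `create_data_set_to_use` can produce:

import pytest

# Hypothetical parametrization; the exact set of formats exercised by the
# original test is not shown in this listing.
@pytest.mark.parametrize("data_format", ["csv", "json", "parquet", "hdf5"])
def test_experiment_dataset_formats(data_format, csv_filename):
    ...  # body as in the listing above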
Example #3
def test_basic_image_feature(num_channels, image_source, in_memory,
                             skip_save_processed_input, tmpdir):
    # Image Inputs
    image_dest_folder = os.path.join(tmpdir, "generated_images")

    input_features = [
        image_feature(
            folder=image_dest_folder,
            encoder="stacked_cnn",
            preprocessing={
                "in_memory": in_memory,
                "height": 12,
                "width": 12,
                "num_channels": num_channels,
                "num_processes": 5,
            },
            output_size=16,
            num_filters=8,
        )
    ]
    output_features = [category_feature(vocab_size=2, reduce_input="sum")]

    rel_path = generate_data(input_features, output_features,
                             os.path.join(tmpdir, "dataset.csv"))

    if image_source == "file":
        # use images from file
        run_experiment(input_features,
                       output_features,
                       dataset=rel_path,
                       skip_save_processed_input=skip_save_processed_input)
    else:
        # import image from file and store in dataframe as tensors.
        df = pd.read_csv(rel_path)
        image_feature_name = input_features[0]["name"]
        df[image_feature_name] = df[image_feature_name].apply(
            lambda x: torchvision.io.read_image(x))

        run_experiment(input_features,
                       output_features,
                       dataset=df,
                       skip_save_processed_input=skip_save_processed_input)
Example #4
def run_hyperopt_executor(
    sampler,
    executor,
    csv_filename,
    validate_output_feature=False,
    validation_metric=None,
):
    config = _get_config(sampler, executor)
    rel_path = generate_data(config["input_features"], config["output_features"], csv_filename)

    config = merge_with_defaults(config)

    hyperopt_config = config["hyperopt"]

    if validate_output_feature:
        hyperopt_config["output_feature"] = config["output_features"][0]["name"]
    if validation_metric:
        hyperopt_config["validation_metric"] = validation_metric

    update_hyperopt_params_with_defaults(hyperopt_config)

    parameters = hyperopt_config["parameters"]
    if sampler.get("search_alg", {}).get("type", "") == "bohb":
        # bohb does not support grid_search search space
        del parameters["utterance.cell_type"]

    split = hyperopt_config["split"]
    output_feature = hyperopt_config["output_feature"]
    metric = hyperopt_config["metric"]
    goal = hyperopt_config["goal"]

    hyperopt_sampler = get_build_hyperopt_sampler(sampler["type"])(goal, parameters, **sampler)

    hyperopt_executor = get_build_hyperopt_executor(executor["type"])(
        hyperopt_sampler, output_feature, metric, split, **executor
    )

    hyperopt_executor.execute(
        config,
        dataset=rel_path,
        backend="local",
    )
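`run_hyperopt_executor` is a helper rather than a test; callers hand it sampler and executor specification dictionaries that feed `_get_config`. A hedged sketch of one possible invocation, where the dictionary contents are placeholder assumptions rather than the suite's actual search-space settings:

def test_hyperopt_executor_basic(csv_filename):
    # Placeholder specs; real values come from the suite's parametrization.
    sampler = {"type": "ray", "num_samples": 2}
    executor = {"type": "ray"}
    run_hyperopt_executor(sampler, executor, csv_filename,
                          validate_output_feature=True,
                          validation_metric="accuracy")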
Example #5
def test_experiment_sequence_combiner_with_reduction_fails(csv_filename):
    config = {
        "input_features": [
            sequence_feature(
                name="seq1",
                min_len=5,
                max_len=5,
                encoder="embed",
                cell_type="lstm",
                reduce_output="sum",
            ),
            sequence_feature(
                name="seq2",
                min_len=5,
                max_len=5,
                encoder="embed",
                cell_type="lstm",
                reduce_output="sum",
            ),
            category_feature(vocab_size=5),
        ],
        "output_features":
        [category_feature(reduce_input="sum", vocab_size=5)],
        TRAINER: {
            "epochs": 2
        },
        "combiner": {
            "type": "sequence",
            "encoder": "rnn",
            "main_sequence_feature": "seq1",
            "reduce_output": None,
        },
    }

    # Generate test data
    rel_path = generate_data(config["input_features"],
                             config["output_features"], csv_filename)

    # Encoding sequence features with 'embed' should fail with SequenceConcatCombiner, since at least one sequence
    # feature should be rank 3.
    with pytest.raises(ValueError):
        run_experiment(config=config, dataset=rel_path)
Example #6
def test_api_callbacks(tmpdir, csv_filename, epochs, batch_size, num_examples, steps_per_checkpoint):
    mock_callback = mock.Mock(wraps=Callback())

    steps_per_epoch = num_examples / batch_size
    total_checkpoints = (steps_per_epoch / steps_per_checkpoint) * epochs
    total_batches = epochs * (num_examples / batch_size)

    input_features = [sequence_feature(reduce_output="sum")]
    output_features = [category_feature(vocab_size=5, reduce_input="sum")]

    config = {
        "input_features": input_features,
        "output_features": output_features,
        "combiner": {"type": "concat", "output_size": 14},
        TRAINER: {"epochs": epochs, "batch_size": batch_size, "steps_per_checkpoint": steps_per_checkpoint},
    }
    model = LudwigModel(config, callbacks=[mock_callback])

    data_csv = generate_data(
        input_features, output_features, os.path.join(tmpdir, csv_filename), num_examples=num_examples
    )
    val_csv = shutil.copyfile(data_csv, os.path.join(tmpdir, "validation.csv"))
    test_csv = shutil.copyfile(data_csv, os.path.join(tmpdir, "test.csv"))

    model.train(training_set=data_csv, validation_set=val_csv, test_set=test_csv)

    assert mock_callback.on_epoch_start.call_count == epochs
    assert mock_callback.on_epoch_end.call_count == epochs

    assert mock_callback.should_early_stop.call_count == total_checkpoints

    assert mock_callback.on_validation_start.call_count == total_checkpoints
    assert mock_callback.on_validation_end.call_count == total_checkpoints

    assert mock_callback.on_test_start.call_count == total_checkpoints
    assert mock_callback.on_test_end.call_count == total_checkpoints

    assert mock_callback.on_batch_start.call_count == total_batches
    assert mock_callback.on_batch_end.call_count == total_batches

    assert mock_callback.on_eval_end.call_count == total_checkpoints
    assert mock_callback.on_eval_start.call_count == total_checkpoints
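The call-count assertions above hold only when `num_examples` is an exact multiple of `batch_size` and `steps_per_epoch` is an exact multiple of `steps_per_checkpoint`. A hypothetical parametrization that satisfies both constraints (illustrative values, not necessarily those used by the suite):

import pytest

# 128 examples / batch size 8 = 16 steps per epoch; 16 / 8 = 2 checkpoints per epoch.
@pytest.mark.parametrize(
    "epochs,batch_size,num_examples,steps_per_checkpoint",
    [(2, 8, 128, 8)],
)
def test_api_callbacks(tmpdir, csv_filename, epochs, batch_size, num_examples, steps_per_checkpoint):
    ...  # body as in the listing above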
Example #7
def test_basic_image_feature(num_channels, image_source, in_memory,
                             skip_save_processed_input, csv_filename):
    # Image Inputs
    image_dest_folder = os.path.join(os.getcwd(), 'generated_images')

    input_features = [
        image_feature(folder=image_dest_folder,
                      encoder='stacked_cnn',
                      preprocessing={
                          'in_memory': in_memory,
                          'height': 12,
                          'width': 12,
                          'num_channels': num_channels,
                          'num_processes': 5
                      },
                      fc_size=16,
                      num_filters=8)
    ]
    output_features = [category_feature(vocab_size=2, reduce_input='sum')]

    rel_path = generate_data(input_features, output_features, csv_filename)

    if image_source == 'file':
        # use images from file
        run_experiment(input_features,
                       output_features,
                       dataset=rel_path,
                       skip_save_processed_input=skip_save_processed_input)
    else:
        # import image from file and store in dataframe as ndarrays
        df = pd.read_csv(rel_path)
        image_feature_name = input_features[0]['name']
        df[image_feature_name] = df[image_feature_name].apply(
            lambda x: imread(x))

        run_experiment(input_features,
                       output_features,
                       dataset=df,
                       skip_save_processed_input=skip_save_processed_input)

    # Delete the temporary data created
    shutil.rmtree(image_dest_folder, ignore_errors=True)
Example #8
def run(csv_filename):
    # Image Inputs
    image_dest_folder = os.path.join(os.getcwd(), "generated_images")

    # Inputs & Outputs
    input_features = [image_feature(folder=image_dest_folder)]
    output_features = [category_feature()]
    data_csv = generate_data(input_features, output_features, csv_filename)

    config = {
        "input_features": input_features,
        "output_features": output_features,
        "combiner": {"type": "concat", "output_size": 14},
        TRAINER: {"epochs": 2},
    }

    callback = CometCallback()
    model = LudwigModel(config, callbacks=[callback])
    output_dir = None

    # Wrap these methods so we can check that they were called
    callback.on_train_init = Mock(side_effect=callback.on_train_init)
    callback.on_train_start = Mock(side_effect=callback.on_train_start)

    with patch("comet_ml.Experiment.log_asset_data") as mock_log_asset_data:
        try:
            # Training with csv
            _, _, output_dir = model.train(dataset=data_csv)
            model.predict(dataset=data_csv)
        finally:
            if output_dir:
                shutil.rmtree(output_dir, ignore_errors=True)

    # Verify that the experiment was created successfully
    assert callback.cometml_experiment is not None

    # Check that these methods were called at least once
    callback.on_train_init.assert_called()
    callback.on_train_start.assert_called()

    # Check that we ran `train_model`, which calls into `log_asset_data`, successfully
    mock_log_asset_data.assert_called()
Example #9
def test_experiment_seq_seq1(csv_filename):
    # Single Sequence input, single sequence output
    # Only the following encoders are working
    input_features_template = Template(
        '[{name: utterance, type: text, reduce_output: null,'
        ' vocab_size: 10, min_len: 10, max_len: 10, encoder: ${encoder}}]')

    output_features = '[{name: iob, type: text, reduce_input: null,' \
                      ' vocab_size: 3, min_len: 10, max_len: 10,' \
                      ' decoder: tagger}]'
    # Generate test data
    rel_path = generate_data(input_features_template.substitute(encoder='rnn'),
                             output_features, csv_filename)

    encoders2 = ['embed', 'rnn', 'cnnrnn']
    for encoder in encoders2:
        logging.info('Test 2, Encoder: {0}'.format(encoder))

        input_features = input_features_template.substitute(encoder=encoder)
        run_experiment(input_features, output_features, data_csv=rel_path)
Example #10
def test_api_skip_parameters_predict(
    tmpdir,
    csv_filename,
    skip_save_unprocessed_output,
    skip_save_predictions,
):
    # Single sequence input, single category output
    input_features = [category_feature(vocab_size=5)]
    output_features = [category_feature(vocab_size=5)]

    # Generate test data
    rel_path = generate_data(input_features, output_features, os.path.join(tmpdir, csv_filename))
    run_api_commands(
        input_features,
        output_features,
        data_csv=rel_path,
        output_dir=tmpdir,
        skip_save_unprocessed_output=skip_save_unprocessed_output,
        skip_save_predictions=skip_save_predictions,
    )
def test_sequence_tagger(enc_cell_type, attention, csv_filename):
    # Define input and output features
    input_features = [sequence_feature(max_len=10, encoder="rnn", cell_type=enc_cell_type, reduce_output=None)]
    output_features = [
        sequence_feature(
            max_len=10,
            decoder="tagger",
            attention=attention,
            reduce_input=None,
        )
    ]

    # Generate test data
    rel_path = generate_data(input_features, output_features, csv_filename)

    # setup sampled softmax loss
    output_features[0].update({"loss": {"type": "sampled_softmax_cross_entropy", "negative_samples": 7}})

    # run the experiment
    run_experiment(input_features, output_features, dataset=rel_path)
Example #12
def test_sequence_tagger(enc_cell_type, attention, csv_filename):
    # Define input and output features
    input_features = [
        sequence_feature(max_len=10,
                         encoder="rnn",
                         cell_type=enc_cell_type,
                         reduce_output=None)
    ]
    output_features = [
        sequence_feature(max_len=10,
                         decoder="tagger",
                         attention=attention,
                         reduce_input=None)
    ]

    # Generate test data
    rel_path = generate_data(input_features, output_features, csv_filename)

    # run the experiment
    run_experiment(input_features, output_features, dataset=rel_path)
Example #13
def test_sequence_tagger(enc_cell_type, csv_filename):
    # Define input and output features
    input_features = [
        sequence_feature(max_len=10,
                         encoder='rnn',
                         cell_type='lstm',
                         reduce_output=None)
    ]
    output_features = [
        sequence_feature(max_len=10, decoder='tagger', reduce_input=None)
    ]

    # Generate test data
    rel_path = generate_data(input_features, output_features, csv_filename)

    # setup encoder specification
    input_features[0]['cell_type'] = enc_cell_type

    # run the experiment
    run_experiment(input_features, output_features, data_csv=rel_path)
Example #14
def test_experiment_attention(csv_filename):
    # Machine translation with attention
    input_features = [
        sequence_feature(encoder='rnn', cell_type='lstm', max_len=10)
    ]
    output_features = [
        sequence_feature(
            max_len=10,
            cell_type='lstm',
            decoder='generator',
            attention='bahdanau'
        )
    ]

    # Generate test data
    rel_path = generate_data(input_features, output_features, csv_filename)

    for attention in ['bahdanau', 'luong']:
        output_features[0]['attention'] = attention
        run_experiment(input_features, output_features, data_csv=rel_path)
Example #15
def run_test_gbm_category(tmpdir, backend_config):
    """Test that the GBM model can train and predict a categorical output (multiclass classification)."""
    input_features = [number_feature(), category_feature(reduce_output="sum")]
    vocab_size = 3
    output_feature = category_feature(vocab_size=vocab_size)
    output_features = [output_feature]

    csv_filename = os.path.join(tmpdir, "training.csv")
    dataset_filename = generate_data(input_features,
                                     output_features,
                                     csv_filename,
                                     num_examples=100)

    config = {
        MODEL_TYPE: "gbm",
        "input_features": input_features,
        "output_features": output_features,
        TRAINER: {
            "num_boost_round": 2
        },
    }

    model = LudwigModel(config, backend=backend_config)

    _, _, output_directory = model.train(
        dataset=dataset_filename,
        output_directory=tmpdir,
        skip_save_processed_input=True,
        skip_save_progress=True,
        skip_save_unprocessed_output=True,
        skip_save_log=True,
    )
    model.load(os.path.join(tmpdir, "api_experiment_run", "model"))
    preds, _ = model.predict(dataset=dataset_filename,
                             output_directory=output_directory)

    prob_col = preds[output_feature["name"] + "_probabilities"]
    if backend_config["type"] == "ray":
        prob_col = prob_col.compute()
    assert len(prob_col.iloc[0]) == (vocab_size + 1)
    assert prob_col.apply(sum).mean() == pytest.approx(1.0)
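`run_test_gbm_category` takes a backend configuration dictionary, and the `"ray"` branch above suggests it is exercised with more than one backend. A minimal sketch of driving it with the local backend (a hypothetical wrapper, not the suite's actual test):

def test_gbm_category_local_backend(tmpdir):
    # Assumed minimal local backend config; a real test might also cover Ray.
    backend_config = {"type": "local"}
    run_test_gbm_category(tmpdir, backend_config)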
    def __init__(self, csv_filename):
        self.csv_file = csv_filename
        self.model = None
        self.input_features = [
            text_feature(vocab_size=10, min_len=1, representation='sparse'),
            category_feature(vocab_size=10)
        ]
        self.output_features = [
            category_feature(vocab_size=2, reduce_input='sum')]
        encoder = 'parallel_cnn'
        data_csv = generate_data(
            self.input_features,
            self.output_features,
            self.csv_file
        )
        self.input_features[0]['encoder'] = encoder
        self.setup_model()
        test_df, train_df, val_df = obtain_df_splits(data_csv)
        self.train_stats = self.model.train(
            data_train_df=train_df,
            data_validation_df=val_df
        )
        self.test_stats_full = self.model.test(
            data_df=test_df
        )
        self.output_feature_name = self.output_features[0]['name']
        # probabilities need to be a list of lists containing each row's data
        # from the probability columns
        # ref: https://uber.github.io/ludwig/api/#test - Return
        num_probs = self.output_features[0]['vocab_size']
        self.probability = self.test_stats_full[0].iloc[:, 1:(num_probs + 2)].values
        self.ground_truth_metadata = self.model.train_set_metadata
        target_predictions = test_df[self.output_feature_name]
        self.ground_truth = np.asarray([
            self.ground_truth_metadata[self.output_feature_name]['str2idx'][test_row]
            for test_row in target_predictions
        ])
        self.prediction_raw = self.test_stats_full[0].iloc[:, 0].tolist()
        self.prediction = np.asarray([
            self.ground_truth_metadata[self.output_feature_name]['str2idx'][pred_row]
            for pred_row in self.prediction_raw])
Example #17
def test_add_feature_data(feature_type, tmpdir):
    preprocessing_params = {
        "audio_file_length_limit_in_s": 3.0,
        "missing_value_strategy": BACKFILL,
        "in_memory": True,
        "padding_value": 0,
        "norm": "per_file",
        "type": feature_type,
        "window_length_in_s": 0.04,
        "window_shift_in_s": 0.02,
        "num_fft_points": None,
        "window_type": "hamming",
        "num_filter_bands": 80,
    }
    audio_dest_folder = os.path.join(tmpdir, "generated_audio")
    audio_feature_config = audio_feature(audio_dest_folder, preprocessing=preprocessing_params)
    data_df_path = generate_data(
        [audio_feature_config],
        [category_feature(vocab_size=5, reduce_input="sum")],
        os.path.join(tmpdir, "data.csv"),
        num_examples=10,
    )
    data_df = pd.read_csv(data_df_path)
    metadata = {
        audio_feature_config["name"]: AudioFeatureMixin.get_feature_meta(
            data_df[audio_feature_config["name"]], preprocessing_params, LOCAL_BACKEND
        )
    }

    proc_df = {}
    AudioFeatureMixin.add_feature_data(
        feature_config=audio_feature_config,
        input_df=data_df,
        proc_df=proc_df,
        metadata=metadata,
        preprocessing_parameters=preprocessing_params,
        backend=LOCAL_BACKEND,
        skip_save_processed_input=False,
    )

    assert len(proc_df[audio_feature_config[PROC_COLUMN]]) == 10
Example #18
def test_server_integration(csv_filename):
    # Image Inputs
    image_dest_folder = os.path.join(os.getcwd(), 'generated_images')

    # Resnet encoder
    input_features = [
        image_feature(folder=image_dest_folder,
                      encoder='resnet',
                      preprocessing={
                          'in_memory': True,
                          'height': 8,
                          'width': 8,
                          'num_channels': 3
                      },
                      fc_size=16,
                      num_filters=8),
        text_feature(encoder='embed', min_len=1),
        numerical_feature(normalization='zscore')
    ]
    output_features = [
        categorical_feature(vocab_size=2, reduce_input='sum'),
        numerical_feature()
    ]

    rel_path = generate_data(input_features, output_features, csv_filename)
    model = train_model(input_features, output_features, data_csv=rel_path)

    app = server(model)
    client = TestClient(app)
    response = client.post('/predict')
    assert response.json() == ALL_FEATURES_PRESENT_ERROR

    data_df = read_csv(rel_path)
    data, files = convert_to_form(data_df.T.to_dict()[0])
    response = client.post('/predict', data=data, files=files)

    response_keys = sorted(list(response.json().keys()))
    assert response_keys == sorted(output_keys_for(output_features))

    shutil.rmtree(model.exp_dir_name, ignore_errors=True)
    shutil.rmtree(image_dest_folder)
Example #19
def test_remote_training_set(tmpdir, fs_protocol):
    output_directory = f"{fs_protocol}://{tmpdir}"

    input_features = [sequence_feature(reduce_output="sum")]
    output_features = [category_feature(vocab_size=2, reduce_input="sum")]

    csv_filename = os.path.join(tmpdir, "training.csv")
    data_csv = generate_data(input_features, output_features, csv_filename)
    val_csv = shutil.copyfile(data_csv, os.path.join(tmpdir, "validation.csv"))
    test_csv = shutil.copyfile(data_csv, os.path.join(tmpdir, "test.csv"))

    data_csv = f"{fs_protocol}://{os.path.abspath(data_csv)}"
    val_csv = f"{fs_protocol}://{os.path.abspath(val_csv)}"
    test_csv = f"{fs_protocol}://{os.path.abspath(test_csv)}"

    config = {
        "input_features": input_features,
        "output_features": output_features,
        "combiner": {"type": "concat", "output_size": 14},
        TRAINER: {"epochs": 2},
    }

    config_path = os.path.join(tmpdir, "config.yaml")
    with open(config_path, "w") as f:
        yaml.dump(config, f)
    config_path = f"{fs_protocol}://{config_path}"

    backend_config = {
        "type": "local",
    }
    backend = initialize_backend(backend_config)

    model = LudwigModel(config_path, backend=backend)
    _, _, output_directory = model.train(
        training_set=data_csv, validation_set=val_csv, test_set=test_csv, output_directory=output_directory
    )
    model.predict(dataset=test_csv, output_directory=output_directory)

    # Train again, this time the cache will be used
    # Resume from the remote output directory
    model.train(training_set=data_csv, validation_set=val_csv, test_set=test_csv, model_resume_path=output_directory)
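`fs_protocol` selects an fsspec filesystem prefix for the paths above; the simplest case is the local `file` protocol. A hedged sketch of a parametrization covering only that case (remote protocols would need additional fixtures and credentials):

import pytest

@pytest.mark.parametrize("fs_protocol", ["file"])
def test_remote_training_set(tmpdir, fs_protocol):
    ...  # body as in the listing above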
Example #20
def test_missing_values_drop_rows(csv_filename, tmpdir):
    data_csv_path = os.path.join(tmpdir, csv_filename)

    kwargs = {PREPROCESSING: {"missing_value_strategy": DROP_ROW}}
    input_features = [
        number_feature(),
        binary_feature(),
        category_feature(vocab_size=3),
    ]
    output_features = [
        binary_feature(**kwargs),
        number_feature(**kwargs),
        category_feature(vocab_size=3, **kwargs),
        sequence_feature(vocab_size=3, **kwargs),
        text_feature(vocab_size=3, **kwargs),
        set_feature(vocab_size=3, **kwargs),
        vector_feature(),
    ]
    backend = LocalTestBackend()
    config = {
        "input_features": input_features,
        "output_features": output_features,
        TRAINER: {
            "epochs": 2
        }
    }

    training_data_csv_path = generate_data(input_features, output_features,
                                           data_csv_path)
    df = pd.read_csv(training_data_csv_path)

    # set 10% of values to NaN
    nan_percent = 0.1
    ix = [(row, col) for row in range(df.shape[0])
          for col in range(df.shape[1])]
    for row, col in random.sample(ix, int(round(nan_percent * len(ix)))):
        df.iat[row, col] = np.nan

    # run preprocessing
    ludwig_model = LudwigModel(config, backend=backend)
    ludwig_model.preprocess(dataset=df)
Example #21
def test_experiment_tied_weights(csv_filename):
    # Single sequence input, single category output
    input_features = [
        text_feature(name="text_feature1",
                     min_len=1,
                     encoder="cnnrnn",
                     reduce_output="sum"),
        text_feature(name="text_feature2",
                     min_len=1,
                     encoder="cnnrnn",
                     reduce_output="sum",
                     tied="text_feature1"),
    ]
    output_features = [category_feature(vocab_size=2, reduce_input="sum")]

    # Generate test data
    rel_path = generate_data(input_features, output_features, csv_filename)
    for encoder in ENCODERS:
        input_features[0]["encoder"] = encoder
        input_features[1]["encoder"] = encoder
        run_experiment(input_features, output_features, dataset=rel_path)
Example #22
def test_experiment_infer_image_metadata(csv_filename: str):
    # Image Inputs
    image_dest_folder = os.path.join(os.getcwd(), "generated_images")

    # Stacked CNN encoder
    input_features = [
        image_feature(folder=image_dest_folder, encoder="stacked_cnn", fc_size=16, num_filters=8),
        text_feature(encoder="embed", min_len=1),
        numerical_feature(normalization="zscore"),
    ]
    output_features = [category_feature(vocab_size=2, reduce_input="sum"), numerical_feature()]

    rel_path = generate_data(input_features, output_features, csv_filename)

    # remove the image preprocessing section to force inferring image metadata
    input_features[0].pop("preprocessing")

    run_experiment(input_features, output_features, dataset=rel_path)

    # Delete the temporary data created
    shutil.rmtree(image_dest_folder)
Example #23
def test_experiment_tied_weights(csv_filename):
    # Single sequence input, single category output
    input_features = [
        text_feature(name='text_feature1',
                     min_len=1,
                     encoder='cnnrnn',
                     reduce_output='sum'),
        text_feature(name='text_feature2',
                     min_len=1,
                     encoder='cnnrnn',
                     reduce_output='sum',
                     tied_weights='text_feature1')
    ]
    output_features = [categorical_feature(vocab_size=2, reduce_input='sum')]

    # Generate test data
    rel_path = generate_data(input_features, output_features, csv_filename)
    for encoder in ENCODERS:
        input_features[0]['encoder'] = encoder
        input_features[1]['encoder'] = encoder
        run_experiment(input_features, output_features, data_csv=rel_path)
Example #24
def test_tune_batch_size_lr(tmpdir):
    with ray_start(num_cpus=2, num_gpus=None):
        config = {
            "input_features": [
                number_feature(normalization="zscore"),
                set_feature(),
                binary_feature(),
            ],
            "output_features": [category_feature(vocab_size=2, reduce_input="sum")],
            "combiner": {"type": "concat", "output_size": 14},
            TRAINER: {"epochs": 2, "batch_size": "auto", "learning_rate": "auto"},
        }

        backend_config = {**RAY_BACKEND_CONFIG}

        csv_filename = os.path.join(tmpdir, "dataset.csv")
        dataset_csv = generate_data(config["input_features"], config["output_features"], csv_filename, num_examples=100)
        dataset_parquet = create_data_set_to_use("parquet", dataset_csv)
        model = run_api_experiment(config, dataset=dataset_parquet, backend_config=backend_config)
        assert model.config[TRAINER]["batch_size"] != "auto"
        assert model.config[TRAINER]["learning_rate"] != "auto"
Example #25
def test_api_skip_parameters_predict(
    csv_filename,
    skip_save_unprocessed_output,
    skip_save_predictions,
):
    # Single sequence input, single category output
    input_features = [category_feature(vocab_size=2)]
    output_features = [category_feature(vocab_size=2)]

    with tempfile.TemporaryDirectory() as output_dir:
        # Generate test data
        rel_path = generate_data(input_features, output_features,
                                 os.path.join(output_dir, csv_filename))
        run_api_commands(
            input_features,
            output_features,
            data_csv=rel_path,
            output_dir=output_dir,
            skip_save_unprocessed_output=skip_save_unprocessed_output,
            skip_save_predictions=skip_save_predictions,
        )
Example #26
def _prepare_data(csv_filename, config_filename):
    # Single sequence input, single category output
    input_features = [sequence_feature(reduce_output='sum')]
    output_features = [category_feature(vocab_size=2, reduce_input='sum')]

    # Generate test data
    dataset_filename = generate_data(input_features, output_features,
                                     csv_filename)

    # generate config file
    config = {
        'input_features': input_features,
        'output_features': output_features,
        'combiner': {'type': 'concat', 'fc_size': 14},
        'training': {'epochs': 2}
    }

    with open(config_filename, 'w') as f:
        yaml.dump(config, f)

    return dataset_filename
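`_prepare_data` writes the YAML config as a side effect and returns the generated dataset path; a caller would then hand both files to a training entry point. A hypothetical usage sketch (file names and assertions are assumptions, not the suite's actual follow-up):

import os

def test_prepare_data_outputs(tmpdir):
    csv_filename = os.path.join(tmpdir, 'data.csv')
    config_filename = os.path.join(tmpdir, 'config.yaml')
    dataset_filename = _prepare_data(csv_filename, config_filename)
    assert os.path.exists(dataset_filename)
    assert os.path.exists(config_filename)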
Example #27
def test_experiment_seq_seq_model_def_file(csv_filename, yaml_filename):
    # seq-to-seq test using a model definition file instead of a dictionary
    input_features = [text_feature(reduce_output=None, encoder='embed')]
    output_features = [
        text_feature(reduce_input=None, vocab_size=3, decoder='tagger')
    ]

    # Save the model definition to a yaml file
    model_definition = {
        'input_features': input_features,
        'output_features': output_features,
        'combiner': {'type': 'concat', 'fc_size': 14},
        'training': {'epochs': 2}
    }
    with open(yaml_filename, 'w') as yaml_out:
        yaml.safe_dump(model_definition, yaml_out)

    rel_path = generate_data(input_features, output_features, csv_filename)
    run_experiment(
        None, None, data_csv=rel_path, model_definition_file=yaml_filename
    )
def test_sequence_generator(enc_encoder, enc_cell_type, dec_cell_type,
                            csv_filename):
    # Define input and output features
    input_features = [
        sequence_feature(min_len=5,
                         max_len=10,
                         encoder=enc_encoder,
                         cell_type=enc_cell_type)
    ]
    output_features = [
        sequence_feature(min_len=5,
                         max_len=10,
                         decoder="generator",
                         cell_type=dec_cell_type)
    ]

    # Generate test data
    rel_path = generate_data(input_features, output_features, csv_filename)

    # run the experiment
    run_experiment(input_features, output_features, dataset=rel_path)
Example #29
def test_visual_question_answering(csv_filename):
    image_dest_folder = os.path.join(os.getcwd(), 'generated_images')
    input_features = [
        image_feature(folder=image_dest_folder,
                      encoder='resnet',
                      preprocessing={
                          'in_memory': True,
                          'height': 8,
                          'width': 8,
                          'num_channels': 3
                      },
                      fc_size=8,
                      num_filters=8),
        text_feature(encoder='embed', min_len=1, level='word'),
    ]
    output_features = [sequence_feature(decoder='generator', cell_type='lstm')]
    rel_path = generate_data(input_features, output_features, csv_filename)
    run_experiment(input_features, output_features, data_csv=rel_path)

    # Delete the temporary data created
    shutil.rmtree(image_dest_folder)
Example #30
def test_torchscript_e2e_date(tmpdir, csv_filename):
    data_csv_path = os.path.join(tmpdir, csv_filename)
    input_features = [
        date_feature(),
    ]
    output_features = [
        binary_feature(),
    ]
    backend = LocalTestBackend()
    config = {
        "input_features": input_features,
        "output_features": output_features,
        TRAINER: {
            "epochs": 2
        }
    }
    training_data_csv_path = generate_data(input_features, output_features,
                                           data_csv_path)

    validate_torchscript_outputs(tmpdir, config, backend,
                                 training_data_csv_path)