Example #1
def test_missing_values_drop_rows(csv_filename, tmpdir):
    data_csv_path = os.path.join(tmpdir, csv_filename)

    kwargs = {PREPROCESSING: {"missing_value_strategy": DROP_ROW}}
    input_features = [
        number_feature(),
        binary_feature(),
        category_feature(vocab_size=3),
    ]
    output_features = [
        binary_feature(**kwargs),
        number_feature(**kwargs),
        category_feature(vocab_size=3, **kwargs),
        sequence_feature(vocab_size=3, **kwargs),
        text_feature(vocab_size=3, **kwargs),
        set_feature(vocab_size=3, **kwargs),
        vector_feature(),
    ]
    backend = LocalTestBackend()
    config = {
        "input_features": input_features,
        "output_features": output_features,
        TRAINER: {
            "epochs": 2
        }
    }

    training_data_csv_path = generate_data(input_features, output_features,
                                           data_csv_path)
    df = pd.read_csv(training_data_csv_path)

    # set 10% of values to NaN
    nan_percent = 0.1
    ix = [(row, col) for row in range(df.shape[0])
          for col in range(df.shape[1])]
    for row, col in random.sample(ix, int(round(nan_percent * len(ix)))):
        df.iat[row, col] = np.nan

    # run preprocessing
    ludwig_model = LudwigModel(config, backend=backend)
    ludwig_model.preprocess(dataset=df)
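A note on the config above: the `**kwargs` expansion attaches a per-feature preprocessing section to every output feature. Assuming the `PREPROCESSING` and `DROP_ROW` constants from `ludwig.constants` resolve to their usual string values, each of those output feature dicts ends up looking roughly like this (the feature name is a placeholder):

# Illustrative only; assumes PREPROCESSING == "preprocessing" and
# DROP_ROW == "drop_row", with a made-up feature name.
example_output_feature = {
    "name": "binary_output_1",
    "type": "binary",
    "preprocessing": {"missing_value_strategy": "drop_row"},
}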
Example #2
def __init__(self, csv_filename):
    self.csv_file = csv_filename
    self.model = None
    self.input_features = [
        text_feature(vocab_size=10, min_len=1, representation='sparse'),
        category_feature(vocab_size=10)
    ]
    self.output_features = [
        category_feature(vocab_size=2, reduce_input='sum')]
    encoder = 'parallel_cnn'
    data_csv = generate_data(
        self.input_features,
        self.output_features,
        self.csv_file
    )
    self.input_features[0]['encoder'] = encoder
    self.setup_model()
    test_df, train_df, val_df = obtain_df_splits(data_csv)
    self.train_stats = self.model.train(
        data_train_df=train_df,
        data_validation_df=val_df
    )
    self.test_stats_full = self.model.test(
        data_df=test_df
    )
    self.output_feature_name = self.output_features[0]['name']
    # probabilities need to be list of lists containing each row data
    # from the probability columns
    # ref: https://uber.github.io/ludwig/api/#test - Return
    num_probs = self.output_features[0]['vocab_size']
    self.probability = self.test_stats_full[0].iloc[:, 1:(num_probs+2)].values
    self.ground_truth_metadata = self.model.train_set_metadata
    target_predictions = test_df[self.output_feature_name]
    self.ground_truth = np.asarray([
        self.ground_truth_metadata[self.output_feature_name]['str2idx'][test_row]
        for test_row in target_predictions
    ])
    self.prediction_raw = self.test_stats_full[0].iloc[:, 0].tolist()
    self.prediction = np.asarray([
        self.ground_truth_metadata[self.output_feature_name]['str2idx'][pred_row]
        for pred_row in self.prediction_raw])
Example #3
def test_experiment_image_inputs(csv_filename):
    # Image Inputs
    image_dest_folder = os.path.join(os.getcwd(), 'generated_images')

    # Resnet encoder
    input_features = [
        image_feature(
            folder=image_dest_folder,
            encoder='resnet',
            preprocessing={
                'in_memory': True,
                'height': 8,
                'width': 8,
                'num_channels': 3
            },
            fc_size=16,
            num_filters=8
        ),
        text_feature(encoder='embed', min_len=1),
        numerical_feature(normalization='zscore')
    ]
    output_features = [
        category_feature(vocab_size=2, reduce_input='sum'),
        numerical_feature()
    ]

    rel_path = generate_data(input_features, output_features, csv_filename)
    run_experiment(input_features, output_features, data_csv=rel_path)

    # Stacked CNN encoder
    input_features[0]['encoder'] = 'stacked_cnn'
    rel_path = generate_data(input_features, output_features, csv_filename)
    run_experiment(input_features, output_features, data_csv=rel_path)

    # Stacked CNN encoder, in_memory = False
    input_features[0]['preprocessing']['in_memory'] = False
    rel_path = generate_data(input_features, output_features, csv_filename)
    run_experiment(input_features, output_features, data_csv=rel_path)

    # Delete the temporary data created
    shutil.rmtree(image_dest_folder)
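Most of the examples here call a `run_experiment` helper from Ludwig's integration-test utilities rather than the public API directly. Purely as an assumption about that helper, it boils down to something like the sketch below: assemble a small config around the generated features and delegate to `experiment_cli` (the real helper also forwards several `skip_save_*` flags).

from ludwig.experiment import experiment_cli


def run_experiment(input_features, output_features, **kwargs):
    # Build a minimal config around the synthetic features and run a full
    # train/evaluate cycle; kwargs forwards the dataset (the older examples on
    # this page pass data_csv= to an earlier incarnation of this helper).
    config = {
        'input_features': input_features,
        'output_features': output_features,
        'combiner': {'type': 'concat'},
        'training': {'epochs': 2},
    }
    experiment_cli(config, skip_save_processed_input=True, **kwargs)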
Example #4
def test_server_integration(csv_filename):
    # Image Inputs
    image_dest_folder = os.path.join(os.getcwd(), 'generated_images')

    # Resnet encoder
    input_features = [
        image_feature(folder=image_dest_folder,
                      encoder='resnet',
                      preprocessing={
                          'in_memory': True,
                          'height': 8,
                          'width': 8,
                          'num_channels': 3
                      },
                      fc_size=16,
                      num_filters=8),
        text_feature(encoder='embed', min_len=1),
        numerical_feature(normalization='zscore')
    ]
    output_features = [
        category_feature(vocab_size=2, reduce_input='sum'),
        numerical_feature()
    ]

    rel_path = generate_data(input_features, output_features, csv_filename)
    model = train_model(input_features, output_features, data_csv=rel_path)

    app = server(model)
    client = TestClient(app)
    response = client.post('/predict')
    assert response.json() == ALL_FEATURES_PRESENT_ERROR

    data_df = read_csv(rel_path)
    data, files = convert_to_form(data_df.T.to_dict()[0])
    response = client.post('/predict', data=data, files=files)

    response_keys = sorted(list(response.json().keys()))
    assert response_keys == sorted(output_keys_for(output_features))

    shutil.rmtree(model.exp_dir_name, ignore_errors=True)
    shutil.rmtree(image_dest_folder)
Example #5
def test_experiment_infer_image_metadata(csv_filename: str):
    # Image Inputs
    image_dest_folder = os.path.join(os.getcwd(), "generated_images")

    # Resnet encoder
    input_features = [
        image_feature(folder=image_dest_folder, encoder="stacked_cnn", fc_size=16, num_filters=8),
        text_feature(encoder="embed", min_len=1),
        numerical_feature(normalization="zscore"),
    ]
    output_features = [category_feature(vocab_size=2, reduce_input="sum"), numerical_feature()]

    rel_path = generate_data(input_features, output_features, csv_filename)

    # remove image preprocessing section to force inferring image meta data
    input_features[0].pop("preprocessing")

    run_experiment(input_features, output_features, dataset=rel_path)

    # Delete the temporary data created
    shutil.rmtree(image_dest_folder)
Example #6
def test_visual_question_answering(csv_filename):
    image_dest_folder = os.path.join(os.getcwd(), 'generated_images')
    input_features = [
        image_feature(folder=image_dest_folder,
                      encoder='resnet',
                      preprocessing={
                          'in_memory': True,
                          'height': 8,
                          'width': 8,
                          'num_channels': 3
                      },
                      fc_size=8,
                      num_filters=8),
        text_feature(encoder='embed', min_len=1, level='word'),
    ]
    output_features = [sequence_feature(decoder='generator', cell_type='lstm')]
    rel_path = generate_data(input_features, output_features, csv_filename)
    run_experiment(input_features, output_features, data_csv=rel_path)

    # Delete the temporary data created
    shutil.rmtree(image_dest_folder)
Example #7
def test_experiment_image_inputs(image_params: ImageParams, csv_filename: str):
    # Image Inputs
    image_dest_folder = os.path.join(os.getcwd(), "generated_images")

    # Resnet encoder
    input_features = [
        image_feature(
            folder=image_dest_folder,
            encoder="resnet",
            preprocessing={
                "in_memory": True,
                "height": 12,
                "width": 12,
                "num_channels": 3,
                "num_processes": 5
            },
            fc_size=16,
            num_filters=8,
        ),
        text_feature(encoder="embed", min_len=1),
        numerical_feature(normalization="zscore"),
    ]
    output_features = [
        category_feature(vocab_size=2, reduce_input="sum"),
        numerical_feature()
    ]

    input_features[0]["encoder"] = image_params.image_encoder
    input_features[0]["preprocessing"]["in_memory"] = image_params.in_memory_flag
    rel_path = generate_data(input_features, output_features, csv_filename)
    run_experiment(
        input_features,
        output_features,
        dataset=rel_path,
        skip_save_processed_input=image_params.skip_save_processed_input,
    )

    # Delete the temporary data created
    shutil.rmtree(image_dest_folder)
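The `image_params: ImageParams` signature above (and in the similar image tests further down) depends on a pytest parametrization that is not part of the snippet. Purely as a hypothetical reconstruction, with field names inferred from how the object is used and parameter values that are illustrative rather than taken from the source, the setup could look like this:

from collections import namedtuple

import pytest

ImageParams = namedtuple(
    "ImageParams",
    "image_encoder in_memory_flag skip_save_processed_input",
)


@pytest.mark.parametrize(
    "image_params",
    [
        ImageParams("resnet", True, True),
        ImageParams("stacked_cnn", False, False),
    ],
)
def test_experiment_image_inputs(image_params: ImageParams, csv_filename: str):
    ...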
Example #8
def test_hyperopt_run_hyperopt(csv_filename):
    with ray_start_4_cpus():
        input_features = [
            text_feature(name="utterance", cell_type="lstm", reduce_output="sum"),
            category_feature(vocab_size=2, reduce_input="sum"),
        ]

        output_features = [category_feature(vocab_size=2, reduce_input="sum")]

        rel_path = generate_data(input_features, output_features, csv_filename)

        config = {
            "input_features": input_features,
            "output_features": output_features,
            "combiner": {"type": "concat", "num_fc_layers": 2},
            "training": {"epochs": 2, "learning_rate": 0.001},
        }

        output_feature_name = output_features[0]["name"]

        hyperopt_configs = {
            "parameters": {
                "training.learning_rate": {
                    "space": "loguniform",
                    "lower": 0.001,
                    "upper": 0.1,
                },
                output_feature_name + ".fc_size": {"space": "randint", "lower": 32, "upper": 256},
                output_feature_name + ".num_fc_layers": {"space": "randint", "lower": 2, "upper": 6},
            },
            "goal": "minimize",
            "output_feature": output_feature_name,
            "validation_metrics": "loss",
            "executor": {"type": "ray"},
            "sampler": {"type": "ray", "num_samples": 2},
        }

        # add hyperopt parameter space to the config
        config["hyperopt"] = hyperopt_configs
        run_hyperopt(config, rel_path)
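`ray_start_4_cpus` above is a test helper, not part of the Ray API. A minimal sketch of the kind of context manager it stands for, assuming it only brings a local Ray instance up for the trial and tears it down afterwards:

import contextlib

import ray


@contextlib.contextmanager
def ray_start_4_cpus():
    # Spin up a throwaway local Ray cluster for the hyperopt run and make sure
    # it is shut down even if the test body raises.
    address_info = ray.init(num_cpus=4)
    try:
        yield address_info
    finally:
        ray.shutdown()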
Example #9
def test_experiment_image_inputs(image_parms: ImageParms, csv_filename: str):
    # Image Inputs
    image_dest_folder = os.path.join(os.getcwd(), 'generated_images')

    # Resnet encoder
    input_features = [
        image_feature(
            folder=image_dest_folder,
            encoder='resnet',
            preprocessing={
                'in_memory': True,
                'height': 12,
                'width': 12,
                'num_channels': 3,
                'num_processes': 5
            },
            fc_size=16,
            num_filters=8
        ),
        text_feature(encoder='embed', min_len=1),
        numerical_feature(normalization='zscore')
    ]
    output_features = [
        category_feature(vocab_size=2, reduce_input='sum'),
        numerical_feature()
    ]

    input_features[0]['encoder'] = image_parms.image_encoder
    input_features[0]['preprocessing']['in_memory'] = image_parms.in_memory_flag
    rel_path = generate_data(input_features, output_features, csv_filename)
    run_experiment(
        input_features,
        output_features,
        dataset=rel_path,
        skip_save_processed_input=image_parms.skip_save_processed_input
    )

    # Delete the temporary data created
    shutil.rmtree(image_dest_folder)
Example #10
def test_experiment_image_inputs(image_params: ImageParams, tmpdir):
    # Image Inputs
    image_dest_folder = os.path.join(tmpdir, "generated_images")

    # Resnet encoder
    input_features = [
        image_feature(
            folder=image_dest_folder,
            encoder="resnet",
            preprocessing={
                "in_memory": True,
                "height": 12,
                "width": 12,
                "num_channels": 3,
                "num_processes": 5
            },
            output_size=16,
            num_filters=8,
        ),
        text_feature(encoder="embed", min_len=1),
        number_feature(normalization="zscore"),
    ]
    output_features = [
        category_feature(vocab_size=2, reduce_input="sum"),
        number_feature()
    ]

    input_features[0]["encoder"] = image_params.image_encoder
    input_features[0]["preprocessing"]["in_memory"] = image_params.in_memory_flag
    rel_path = generate_data(input_features, output_features,
                             os.path.join(tmpdir, "dataset.csv"))

    run_experiment(
        input_features,
        output_features,
        dataset=rel_path,
        skip_save_processed_input=image_params.skip_save_processed_input,
    )
Example #11
def test_visual_question_answering(tmpdir):
    image_dest_folder = os.path.join(tmpdir, "generated_images")
    input_features = [
        image_feature(
            folder=image_dest_folder,
            encoder="resnet",
            preprocessing={
                "in_memory": True,
                "height": 8,
                "width": 8,
                "num_channels": 3,
                "num_processes": 5
            },
            output_size=8,
            num_filters=8,
        ),
        text_feature(encoder="embed", min_len=1),
    ]
    output_features = [sequence_feature(decoder="generator", cell_type="lstm")]
    rel_path = generate_data(input_features, output_features,
                             os.path.join(tmpdir, "dataset.csv"))

    run_experiment(input_features, output_features, dataset=rel_path)
Example #12
def test_sequence_tagger_text(
        csv_filename
):
    # Define input and output features
    input_features = [
        text_feature(
            max_len=10,
            encoder='rnn',
            reduce_output=None
        )
    ]
    output_features = [
        sequence_feature(
            max_len=10,
            decoder='tagger',
            reduce_input=None
        )
    ]

    # Generate test data
    rel_path = generate_data(input_features, output_features, csv_filename)

    # run the experiment
    run_experiment(input_features, output_features, dataset=rel_path)
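For orientation, the feature helpers used above return plain dicts; before `run_experiment` assembles them into a config, they look roughly like the following (the `name` values are placeholders, since the helpers generate random names along with matching synthetic data):

# Illustrative expansion only; names are made up, not the generated ones.
input_features = [{
    'name': 'text_input_1',
    'type': 'text',
    'max_len': 10,
    'encoder': 'rnn',
    'reduce_output': None,
}]
output_features = [{
    'name': 'sequence_output_1',
    'type': 'sequence',
    'max_len': 10,
    'decoder': 'tagger',
    'reduce_input': None,
}]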
Example #13
def run_test_gbm_output_not_supported(tmpdir, backend_config):
    """Test that an error is raised when the output feature is not supported by the model."""
    input_features = [number_feature(), category_feature(reduce_output="sum")]
    output_features = [text_feature()]

    csv_filename = os.path.join(tmpdir, "training.csv")
    dataset_filename = generate_data(input_features,
                                     output_features,
                                     csv_filename,
                                     num_examples=100)

    config = {
        MODEL_TYPE: "gbm",
        "input_features": input_features,
        "output_features": output_features
    }

    model = LudwigModel(config, backend=backend_config)
    with pytest.raises(
            ValueError,
            match="Model type GBM only supports numerical, categorical, or binary output features"
    ):
        model.train(dataset=dataset_filename, output_directory=tmpdir)
Example #14
def test_visual_question_answering(csv_filename):
    image_dest_folder = os.path.join(os.getcwd(), "generated_images")
    input_features = [
        image_feature(
            folder=image_dest_folder,
            encoder="resnet",
            preprocessing={
                "in_memory": True,
                "height": 8,
                "width": 8,
                "num_channels": 3,
                "num_processes": 5
            },
            fc_size=8,
            num_filters=8,
        ),
        text_feature(encoder="embed", min_len=1, level="word"),
    ]
    output_features = [sequence_feature(decoder="generator", cell_type="lstm")]
    rel_path = generate_data(input_features, output_features, csv_filename)
    run_experiment(input_features, output_features, dataset=rel_path)

    # Delete the temporary data created
    shutil.rmtree(image_dest_folder)
Example #15
def test_experiment_multiple_seq_seq(csv_filename):
    # Multiple inputs, Multiple outputs
    input_features = [
        text_feature(vocab_size=100, min_len=1, encoder='stacked_cnn'),
        numerical_feature(),
        category_feature(vocab_size=10, embedding_size=5),
        set_feature(),
        sequence_feature(vocab_size=10, max_len=10, encoder='embed')
    ]
    output_features = [
        category_feature(vocab_size=2, reduce_input='sum'),
        sequence_feature(vocab_size=10, max_len=5),
        numerical_feature()
    ]

    rel_path = generate_data(input_features, output_features, csv_filename)
    run_experiment(input_features, output_features, data_csv=rel_path)

    # Use generator as decoder
    output_features = [
        category_feature(vocab_size=2, reduce_input='sum'),
        sequence_feature(vocab_size=10, max_len=5, decoder='generator'),
        numerical_feature()
    ]

    rel_path = generate_data(input_features, output_features, csv_filename)
    run_experiment(input_features, output_features, data_csv=rel_path)

    # Generator decoder and reduce_input = None
    output_features = [
        category_feature(vocab_size=2, reduce_input='sum'),
        sequence_feature(max_len=5, decoder='generator', reduce_input=None),
        numerical_feature()
    ]
    rel_path = generate_data(input_features, output_features, csv_filename)
    run_experiment(input_features, output_features, data_csv=rel_path)
Example #16
            sequence_feature(min_len=5,
                             max_len=10,
                             encoder='rnn',
                             cell_type='lstm',
                             reduce_output=None)
        ],
        # output feature
        [sequence_feature(max_len=10, decoder='tagger', reduce_input=None)]),
    FeaturesToUse(
        # input feature
        [
            numerical_feature(normalization='zscore'),
            numerical_feature(normalization='zscore')
        ],
        # output feature
        [text_feature()]),
]


@pytest.mark.parametrize('features_to_use', FEATURES_TO_TEST)
def test_kfold_cv_cli(features_to_use: FeaturesToUse):
    # k-fold cross validation cli
    num_folds = 3

    # setup temporary directory to run test
    with tempfile.TemporaryDirectory() as tmpdir:

        training_data_fp = os.path.join(tmpdir, 'train.csv')
        config_fp = os.path.join(tmpdir, 'config.yaml')
        results_dir = os.path.join(tmpdir, 'results')
        statistics_fp = os.path.join(results_dir,
Example #17
def test_model_save_reload_api(csv_filename, tmp_path):
    tf.random.set_seed(1234)

    image_dest_folder = os.path.join(os.getcwd(), 'generated_images')
    audio_dest_folder = os.path.join(os.getcwd(), 'generated_audio')

    input_features = [
        binary_feature(),
        numerical_feature(),
        category_feature(vocab_size=3),
        sequence_feature(vocab_size=3),
        text_feature(vocab_size=3, encoder='rnn', cell_type='lstm',
                     num_layers=2, bidirectional=True),
        vector_feature(),
        image_feature(image_dest_folder),
        audio_feature(audio_dest_folder, encoder='stacked_cnn'),
        timeseries_feature(encoder='parallel_cnn'),
        sequence_feature(vocab_size=3, encoder='stacked_parallel_cnn'),
        date_feature(),
        h3_feature(),
        set_feature(vocab_size=3),
        bag_feature(vocab_size=3),
    ]

    output_features = [
        binary_feature(),
        numerical_feature(),
        category_feature(vocab_size=3),
        sequence_feature(vocab_size=3),
        text_feature(vocab_size=3),
        set_feature(vocab_size=3),
        vector_feature(),
    ]

    # Generate test data
    data_csv_path = generate_data(input_features, output_features,
                                  csv_filename)

    #############
    # Train model
    #############
    config = {
        'input_features': input_features,
        'output_features': output_features,
        'training': {'epochs': 2}
    }

    data_df = read_csv(data_csv_path)
    data_df[SPLIT] = get_split(data_df)
    training_set, test_set, validation_set = split_dataset_ttv(
        data_df,
        SPLIT
    )
    training_set = pd.DataFrame(training_set)
    validation_set = pd.DataFrame(validation_set)
    test_set = pd.DataFrame(test_set)

    # create sub-directory to store results
    results_dir = tmp_path / 'results'
    results_dir.mkdir()

    # perform initial model training
    backend = LocalTestBackend()
    ludwig_model1 = LudwigModel(config, backend=backend)
    _, _, output_dir = ludwig_model1.train(
        training_set=training_set,
        validation_set=validation_set,
        test_set=test_set,
        output_directory='results'  # results_dir
    )

    preds_1, _ = ludwig_model1.predict(dataset=validation_set)

    def check_model_equal(ludwig_model2):
        # Compare model predictions
        preds_2, _ = ludwig_model2.predict(dataset=validation_set)
        assert set(preds_1.keys()) == set(preds_2.keys())
        for key in preds_1:
            assert preds_1[key].dtype == preds_2[key].dtype, key
            assert np.all(a == b for a, b in zip(preds_1[key], preds_2[key])), key
            # assert preds_2[key].dtype == preds_3[key].dtype, key
            # assert list(preds_2[key]) == list(preds_3[key]), key

        # Compare model weights
        # this has to be done after the predict calls because of TF2 lazy restoration
        for if_name in ludwig_model1.model.input_features:
            if1 = ludwig_model1.model.input_features[if_name]
            if2 = ludwig_model2.model.input_features[if_name]
            for if1_w, if2_w in zip(if1.encoder_obj.weights,
                                    if2.encoder_obj.weights):
                assert np.allclose(if1_w.numpy(), if2_w.numpy())

        c1 = ludwig_model1.model.combiner
        c2 = ludwig_model2.model.combiner
        for c1_w, c2_w in zip(c1.weights, c2.weights):
            assert np.allclose(c1_w.numpy(), c2_w.numpy())

        for of_name in ludwig_model1.model.output_features:
            of1 = ludwig_model1.model.output_features[of_name]
            of2 = ludwig_model2.model.output_features[of_name]
            for of1_w, of2_w in zip(of1.decoder_obj.weights,
                                    of2.decoder_obj.weights):
                assert np.allclose(of1_w.numpy(), of2_w.numpy())

    # Test saving and loading the model explicitly
    with tempfile.TemporaryDirectory() as tmpdir:
        ludwig_model1.save(tmpdir)
        ludwig_model_loaded = LudwigModel.load(tmpdir, backend=backend)
        check_model_equal(ludwig_model_loaded)

    # Test loading the model from the experiment directory
    ludwig_model_exp = LudwigModel.load(
        os.path.join(output_dir, 'model'),
        backend=backend
    )
    check_model_equal(ludwig_model_exp)
Example #18
def test_model_save_reload_api(tmpdir, csv_filename, tmp_path):
    torch.manual_seed(1)
    random.seed(1)
    np.random.seed(1)

    image_dest_folder = os.path.join(os.getcwd(), "generated_images")
    audio_dest_folder = os.path.join(os.getcwd(), "generated_audio")

    input_features = [
        binary_feature(),
        number_feature(),
        category_feature(vocab_size=3),
        sequence_feature(vocab_size=3),
        text_feature(vocab_size=3,
                     encoder="rnn",
                     cell_type="lstm",
                     num_layers=2,
                     bidirectional=True),
        vector_feature(),
        image_feature(image_dest_folder),
        audio_feature(audio_dest_folder, encoder="stacked_cnn"),
        timeseries_feature(encoder="parallel_cnn"),
        sequence_feature(vocab_size=3, encoder="stacked_parallel_cnn"),
        date_feature(),
        h3_feature(),
        set_feature(vocab_size=3),
        bag_feature(vocab_size=3),
    ]

    output_features = [
        binary_feature(),
        number_feature(),
        category_feature(vocab_size=3),
        sequence_feature(vocab_size=3),
        text_feature(vocab_size=3),
        set_feature(vocab_size=3),
        vector_feature(),
    ]

    # Generate test data
    data_csv_path = generate_data(input_features,
                                  output_features,
                                  csv_filename,
                                  num_examples=50)

    #############
    # Train model
    #############
    config = {
        "input_features": input_features,
        "output_features": output_features,
        TRAINER: {
            "epochs": 2
        }
    }

    data_df = read_csv(data_csv_path)
    splitter = get_splitter("random")
    training_set, validation_set, test_set = splitter.split(
        data_df, LocalTestBackend())

    # create sub-directory to store results
    results_dir = tmp_path / "results"
    results_dir.mkdir()

    # perform initial model training
    backend = LocalTestBackend()
    ludwig_model1 = LudwigModel(config, backend=backend)
    _, _, output_dir = ludwig_model1.train(
        training_set=training_set,
        validation_set=validation_set,
        test_set=test_set,
        output_directory="results",  # results_dir
    )

    preds_1, _ = ludwig_model1.predict(dataset=validation_set)

    def check_model_equal(ludwig_model2):
        # Compare model predictions
        preds_2, _ = ludwig_model2.predict(dataset=validation_set)
        assert set(preds_1.keys()) == set(preds_2.keys())
        for key in preds_1:
            assert preds_1[key].dtype == preds_2[key].dtype, key
            assert np.all(a == b
                          for a, b in zip(preds_1[key], preds_2[key])), key
            # assert preds_2[key].dtype == preds_3[key].dtype, key
            # assert list(preds_2[key]) == list(preds_3[key]), key

        # Compare model weights
        for if_name in ludwig_model1.model.input_features:
            if1 = ludwig_model1.model.input_features[if_name]
            if2 = ludwig_model2.model.input_features[if_name]
            for if1_w, if2_w in zip(if1.encoder_obj.parameters(),
                                    if2.encoder_obj.parameters()):
                assert torch.allclose(if1_w, if2_w)

        c1 = ludwig_model1.model.combiner
        c2 = ludwig_model2.model.combiner
        for c1_w, c2_w in zip(c1.parameters(), c2.parameters()):
            assert torch.allclose(c1_w, c2_w)

        for of_name in ludwig_model1.model.output_features:
            of1 = ludwig_model1.model.output_features[of_name]
            of2 = ludwig_model2.model.output_features[of_name]
            for of1_w, of2_w in zip(of1.decoder_obj.parameters(),
                                    of2.decoder_obj.parameters()):
                assert torch.allclose(of1_w, of2_w)

    ludwig_model1.save(tmpdir)
    ludwig_model_loaded = LudwigModel.load(tmpdir, backend=backend)
    check_model_equal(ludwig_model_loaded)

    # Test loading the model from the experiment directory
    ludwig_model_exp = LudwigModel.load(os.path.join(output_dir, "model"),
                                        backend=backend)
    check_model_equal(ludwig_model_exp)
Example #19
def run_hyperopt_executor(sampler,
                          executor,
                          csv_filename,
                          validate_output_feature=False,
                          validation_metric=None):
    input_features = [
        text_feature(name="utterance", cell_type="lstm", reduce_output="sum"),
        category_feature(vocab_size=2, reduce_input="sum")
    ]

    output_features = [category_feature(vocab_size=2, reduce_input="sum")]

    rel_path = generate_data(input_features, output_features, csv_filename)

    config = {
        "input_features": input_features,
        "output_features": output_features,
        "combiner": {
            "type": "concat",
            "num_fc_layers": 2
        },
        "training": {
            "epochs": 2,
            "learning_rate": 0.001
        },
        "hyperopt": {
            **HYPEROPT_CONFIG,
            "executor": executor,
            "sampler": sampler,
        },
    }

    config = merge_with_defaults(config)

    hyperopt_config = config["hyperopt"]

    if validate_output_feature:
        hyperopt_config['output_feature'] = output_features[0]['name']
    if validation_metric:
        hyperopt_config['validation_metric'] = validation_metric

    update_hyperopt_params_with_defaults(hyperopt_config)

    parameters = hyperopt_config["parameters"]
    if sampler.get("search_alg", {}).get("type", "") == 'bohb':
        # bohb does not support grid_search search space
        del parameters['utterance.cell_type']

    split = hyperopt_config["split"]
    output_feature = hyperopt_config["output_feature"]
    metric = hyperopt_config["metric"]
    goal = hyperopt_config["goal"]

    hyperopt_sampler = get_build_hyperopt_sampler(sampler["type"])(goal,
                                                                   parameters,
                                                                   **sampler)

    hyperopt_executor = get_build_hyperopt_executor(executor["type"])(
        hyperopt_sampler, output_feature, metric, split, **executor)

    hyperopt_executor.execute(config, dataset=rel_path)
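`HYPEROPT_CONFIG` above is a module-level constant that the snippet does not include. The bohb branch that deletes `parameters['utterance.cell_type']` implies it contains at least a grid-search entry for that parameter; the shape below is a hypothetical reconstruction for orientation only, with illustrative values.

# Hypothetical; only the presence of a utterance.cell_type grid_search entry is
# implied by the code above, the remaining values are illustrative.
HYPEROPT_CONFIG = {
    "parameters": {
        "training.learning_rate": {
            "space": "loguniform",
            "lower": 0.001,
            "upper": 0.1,
        },
        "utterance.cell_type": {
            "space": "grid_search",
            "values": ["rnn", "gru", "lstm"],
        },
    },
    "goal": "minimize",
}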
Example #20
def test_hyperopt_run_hyperopt(csv_filename, samplers):
    input_features = [
        text_feature(name="utterance", cell_type="lstm", reduce_output="sum"),
        category_feature(vocab_size=2, reduce_input="sum")
    ]

    output_features = [category_feature(vocab_size=2, reduce_input="sum")]

    rel_path = generate_data(input_features, output_features, csv_filename)

    config = {
        "input_features": input_features,
        "output_features": output_features,
        "combiner": {
            "type": "concat",
            "num_fc_layers": 2
        },
        "training": {
            "epochs": 2,
            "learning_rate": 0.001
        }
    }

    output_feature_name = output_features[0]['name']

    hyperopt_configs = {
        "parameters": {
            "training.learning_rate": {
                "type": "float",
                "low": 0.0001,
                "high": 0.01,
                "space": "log",
                "steps": 3,
            },
            output_feature_name + ".fc_layers": {
                'type': 'category',
                'values': [
                    [{'fc_size': 512}, {'fc_size': 256}],
                    [{'fc_size': 512}],
                    [{'fc_size': 256}],
                ]
            },
            output_feature_name + ".fc_size": {
                "type": "int",
                "low": 32,
                "high": 256,
                "steps": 5
            },
            output_feature_name + ".num_fc_layers": {
                'type': 'int',
                'low': 1,
                'high': 5,
                'space': 'linear',
                'steps': 4
            }
        },
        "goal": "minimize",
        'output_feature': output_feature_name,
        'validation_metrics': 'loss',
        'executor': {
            'type': 'serial'
        },
        'sampler': {
            'type': samplers["type"],
            'num_samples': 2
        }
    }

    # add hyperopt parameter space to the config
    config['hyperopt'] = hyperopt_configs

    hyperopt_results = hyperopt(config,
                                dataset=rel_path,
                                output_directory='results_hyperopt')

    # check for return results
    assert isinstance(hyperopt_results, list)

    # check for existence of the hyperopt statistics file
    assert os.path.isfile(
        os.path.join('results_hyperopt', 'hyperopt_statistics.json'))

    if os.path.isfile(
            os.path.join('results_hyperopt', 'hyperopt_statistics.json')):
        os.remove(os.path.join('results_hyperopt', 'hyperopt_statistics.json'))
Example #21
def test_torchscript(csv_filename, should_load_model):
    #######
    # Setup
    #######
    with tempfile.TemporaryDirectory() as tmpdir:
        dir_path = tmpdir
        data_csv_path = os.path.join(tmpdir, csv_filename)
        image_dest_folder = os.path.join(tmpdir, "generated_images")
        audio_dest_folder = os.path.join(tmpdir, "generated_audio")

        # Single sequence input, single category output
        input_features = [
            binary_feature(),
            numerical_feature(),
            category_feature(vocab_size=3),
            sequence_feature(vocab_size=3),
            text_feature(vocab_size=3),
            vector_feature(),
            image_feature(image_dest_folder),
            audio_feature(audio_dest_folder),
            timeseries_feature(),
            date_feature(),
            date_feature(),
            h3_feature(),
            set_feature(vocab_size=3),
            bag_feature(vocab_size=3),
        ]

        output_features = [
            category_feature(vocab_size=3),
            binary_feature(),
            numerical_feature(),
            set_feature(vocab_size=3),
            vector_feature()
            # TODO(#1333): Re-enable.
            # sequence_feature(vocab_size=3),
            # text_feature(vocab_size=3),
        ]

        predictions_column_name = "{}_predictions".format(output_features[0]["name"])

        # Generate test data
        data_csv_path = generate_data(input_features, output_features, data_csv_path)

        #############
        # Train model
        #############
        backend = LocalTestBackend()
        config = {"input_features": input_features, "output_features": output_features, "training": {"epochs": 2}}
        ludwig_model = LudwigModel(config, backend=backend)
        ludwig_model.train(
            dataset=data_csv_path,
            skip_save_training_description=True,
            skip_save_training_statistics=True,
            skip_save_model=True,
            skip_save_progress=True,
            skip_save_log=True,
            skip_save_processed_input=True,
        )

        ###################
        # save Ludwig model
        ###################
        ludwigmodel_path = os.path.join(dir_path, "ludwigmodel")
        shutil.rmtree(ludwigmodel_path, ignore_errors=True)
        ludwig_model.save(ludwigmodel_path)

        ###################
        # load Ludwig model
        ###################
        if should_load_model:
            ludwig_model = LudwigModel.load(ludwigmodel_path, backend=backend)

        ##############################
        # collect weight tensors names
        ##############################
        original_predictions_df, _ = ludwig_model.predict(dataset=data_csv_path)
        original_weights = deepcopy(list(ludwig_model.model.parameters()))

        #################
        # save torchscript
        #################
        torchscript_path = os.path.join(dir_path, "torchscript")
        shutil.rmtree(torchscript_path, ignore_errors=True)
        ludwig_model.model.save_torchscript(torchscript_path)

        ###################################################
        # load Ludwig model, obtain predictions and weights
        ###################################################
        ludwig_model = LudwigModel.load(ludwigmodel_path, backend=backend)
        loaded_prediction_df, _ = ludwig_model.predict(dataset=data_csv_path)
        loaded_weights = deepcopy(list(ludwig_model.model.parameters()))

        #####################################################
        # restore torchscript, obtain predictions and weights
        #####################################################
        training_set_metadata_json_fp = os.path.join(ludwigmodel_path, TRAIN_SET_METADATA_FILE_NAME)

        dataset, training_set_metadata = preprocess_for_prediction(
            ludwig_model.config,
            dataset=data_csv_path,
            training_set_metadata=training_set_metadata_json_fp,
            backend=backend,
        )

        restored_model = torch.jit.load(torchscript_path)

        # Check the outputs for one of the features for correctness
        # Here we choose the first output feature (categorical)
        of_name = list(ludwig_model.model.output_features.keys())[0]

        data_to_predict = {
            name: torch.from_numpy(dataset.dataset[feature.proc_column])
            for name, feature in ludwig_model.model.input_features.items()
        }

        # Get predictions from restored torchscript.
        logits = restored_model(data_to_predict)
        restored_predictions = torch.argmax(
            output_feature_utils.get_output_feature_tensor(logits, of_name, "logits"), -1
        )

        restored_predictions = [training_set_metadata[of_name]["idx2str"][idx] for idx in restored_predictions]

        restored_weights = deepcopy(list(restored_model.parameters()))

        #########
        # Cleanup
        #########
        shutil.rmtree(ludwigmodel_path, ignore_errors=True)
        shutil.rmtree(torchscript_path, ignore_errors=True)

        ###############################################
        # Check if weights and predictions are the same
        ###############################################

        # Check that the weight values match the original model.
        assert utils.is_all_close(original_weights, loaded_weights)
        assert utils.is_all_close(original_weights, restored_weights)

        # Check that predictions are identical to the original model.
        assert np.all(original_predictions_df[predictions_column_name] == loaded_prediction_df[predictions_column_name])

        assert np.all(original_predictions_df[predictions_column_name] == restored_predictions)
Example #22
def test_experiment_seq_seq_tagger(csv_filename, encoder):
    input_features = [text_feature(reduce_output=None, encoder=encoder)]
    output_features = [text_feature(decoder="tagger")]
    rel_path = generate_data(input_features, output_features, csv_filename)

    run_experiment(input_features, output_features, dataset=rel_path)
Example #23
def test_server_integration(csv_filename):
    # Image Inputs
    image_dest_folder = os.path.join(os.getcwd(), 'generated_images')

    # Resnet encoder
    input_features = [
        image_feature(folder=image_dest_folder,
                      preprocessing={
                          'in_memory': True,
                          'height': 8,
                          'width': 8,
                          'num_channels': 3
                      },
                      fc_size=16,
                      num_filters=8),
        text_feature(encoder='embed', min_len=1),
        numerical_feature(normalization='zscore')
    ]
    output_features = [category_feature(vocab_size=2), numerical_feature()]

    rel_path = generate_data(input_features, output_features, csv_filename)
    model, output_dir = train_model(input_features,
                                    output_features,
                                    data_csv=rel_path)

    app = server(model)
    client = TestClient(app)
    response = client.get('/')
    assert response.status_code == 200

    response = client.post('/predict')
    assert response.json() == ALL_FEATURES_PRESENT_ERROR

    data_df = read_csv(rel_path)

    # One-off prediction
    first_entry = data_df.T.to_dict()[0]
    data, files = convert_to_form(first_entry)
    server_response = client.post('/predict', data=data, files=files)
    server_response = server_response.json()

    server_response_keys = sorted(list(server_response.keys()))
    assert server_response_keys == sorted(output_keys_for(output_features))

    model_output, _ = model.predict(dataset=[first_entry], data_format=dict)
    model_output = model_output.to_dict('records')[0]
    assert model_output == server_response

    # Batch prediction
    assert len(data_df) > 1
    files = convert_to_batch_form(data_df)
    server_response = client.post('/batch_predict', files=files)
    server_response = server_response.json()

    server_response_keys = sorted(server_response['columns'])
    assert server_response_keys == sorted(output_keys_for(output_features))
    assert len(data_df) == len(server_response['data'])

    model_output, _ = model.predict(dataset=data_df)
    model_output = model_output.to_dict('split')
    assert model_output == server_response

    # Cleanup
    shutil.rmtree(output_dir, ignore_errors=True)
    shutil.rmtree(image_dest_folder, ignore_errors=True)
Example #24
            sequence_feature(
                max_len=10,
                decoder='tagger',
                reduce_input=None
            )
        ]
    ),
    FeaturesToUse(
        # input feature
        [
            numerical_feature(normalization='zscore'),
            numerical_feature(normalization='zscore')
        ],
        # output feature
        [
            text_feature()
        ]
    ),
]


@pytest.mark.parametrize('features_to_use', FEATURES_TO_TEST)
def test_kfold_cv_cli(features_to_use: FeaturesToUse):
    # k-fold cross validation cli
    num_folds = 3

    # setup temporary directory to run test
    with tempfile.TemporaryDirectory() as tmpdir:

        training_data_fp = os.path.join(tmpdir, 'train.csv')
        config_fp = os.path.join(tmpdir, 'config.yaml')
Example #25
def test_savedmodel(csv_filename, should_load_model):
    #######
    # Setup
    #######
    with tempfile.TemporaryDirectory() as tmpdir:
        dir_path = tmpdir
        data_csv_path = os.path.join(tmpdir, csv_filename)
        image_dest_folder = os.path.join(tmpdir, 'generated_images')
        audio_dest_folder = os.path.join(tmpdir, 'generated_audio')

        # Single sequence input, single category output
        input_features = [
            binary_feature(),
            numerical_feature(),
            category_feature(vocab_size=3),
            sequence_feature(vocab_size=3),
            text_feature(vocab_size=3),
            vector_feature(),
            image_feature(image_dest_folder),
            audio_feature(audio_dest_folder),
            timeseries_feature(),
            date_feature(),
            h3_feature(),
            set_feature(vocab_size=3),
            bag_feature(vocab_size=3),
        ]

        output_features = [
            category_feature(vocab_size=3),
            binary_feature(),
            numerical_feature(),
            sequence_feature(vocab_size=3),
            text_feature(vocab_size=3),
            set_feature(vocab_size=3),
            vector_feature()
        ]

        predictions_column_name = '{}_predictions'.format(
            output_features[0]['name'])

        # Generate test data
        data_csv_path = generate_data(input_features, output_features,
                                      data_csv_path)

        #############
        # Train model
        #############
        backend = LocalTestBackend()
        config = {
            'input_features': input_features,
            'output_features': output_features,
            'training': {
                'epochs': 2
            }
        }
        ludwig_model = LudwigModel(config, backend=backend)
        ludwig_model.train(
            dataset=data_csv_path,
            skip_save_training_description=True,
            skip_save_training_statistics=True,
            skip_save_model=True,
            skip_save_progress=True,
            skip_save_log=True,
            skip_save_processed_input=True,
        )

        ###################
        # save Ludwig model
        ###################
        ludwigmodel_path = os.path.join(dir_path, 'ludwigmodel')
        shutil.rmtree(ludwigmodel_path, ignore_errors=True)
        ludwig_model.save(ludwigmodel_path)

        ###################
        # load Ludwig model
        ###################
        if should_load_model:
            ludwig_model = LudwigModel.load(ludwigmodel_path, backend=backend)

        ##############################
        # collect weight tensors names
        ##############################
        original_predictions_df, _ = ludwig_model.predict(
            dataset=data_csv_path)
        original_weights = deepcopy(ludwig_model.model.trainable_variables)

        #################
        # save savedmodel
        #################
        savedmodel_path = os.path.join(dir_path, 'savedmodel')
        shutil.rmtree(savedmodel_path, ignore_errors=True)
        ludwig_model.model.save_savedmodel(savedmodel_path)

        ###################################################
        # load Ludwig model, obtain predictions and weights
        ###################################################
        ludwig_model = LudwigModel.load(ludwigmodel_path, backend=backend)
        loaded_prediction_df, _ = ludwig_model.predict(dataset=data_csv_path)
        loaded_weights = deepcopy(ludwig_model.model.trainable_variables)

        #################################################
        # restore savedmodel, obtain predictions and weights
        #################################################
        training_set_metadata_json_fp = os.path.join(
            ludwigmodel_path, TRAIN_SET_METADATA_FILE_NAME)

        dataset, training_set_metadata = preprocess_for_prediction(
            ludwig_model.config,
            dataset=data_csv_path,
            training_set_metadata=training_set_metadata_json_fp,
            backend=backend,
        )

        restored_model = tf.saved_model.load(savedmodel_path)

        # Check the outputs for one of the features for correctness
        # Here we choose the first output feature (categorical)
        of_name = list(ludwig_model.model.output_features.keys())[0]

        data_to_predict = {
            name: tf.convert_to_tensor(dataset.dataset[feature.proc_column],
                                       dtype=feature.get_input_dtype())
            for name, feature in ludwig_model.model.input_features.items()
        }

        logits = restored_model(data_to_predict, False, None)

        restored_predictions = tf.argmax(logits[of_name]['logits'],
                                         -1,
                                         name='predictions_{}'.format(of_name))
        restored_predictions = tf.map_fn(
            lambda idx: training_set_metadata[of_name]['idx2str'][idx],
            restored_predictions,
            dtype=tf.string)

        restored_weights = deepcopy(restored_model.trainable_variables)

        #########
        # Cleanup
        #########
        shutil.rmtree(ludwigmodel_path, ignore_errors=True)
        shutil.rmtree(savedmodel_path, ignore_errors=True)

        ###############################################
        # Check if weights and predictions are the same
        ###############################################

        # check for same number of weights as original model
        assert len(original_weights) == len(loaded_weights)
        assert len(original_weights) == len(restored_weights)

        # check to ensure weight values match the original model
        loaded_weights_match = np.all([
            np.all(
                np.isclose(original_weights[i].numpy(),
                           loaded_weights[i].numpy()))
            for i in range(len(original_weights))
        ])

        original_weights = sorted(original_weights, key=lambda w: w.name)
        restored_weights = sorted(restored_weights, key=lambda w: w.name)

        restored_weights_match = np.all([
            np.all(
                np.isclose(original_weights[i].numpy(),
                           restored_weights[i].numpy()))
            for i in range(len(original_weights))
        ])

        assert loaded_weights_match and restored_weights_match

        #  Are predictions identical to original ones?
        loaded_predictions_match = np.all(
            original_predictions_df[predictions_column_name] ==
            loaded_prediction_df[predictions_column_name])

        restored_predictions_match = np.all(
            original_predictions_df[predictions_column_name] ==
            restored_predictions.numpy().astype('str'))

        assert loaded_predictions_match and restored_predictions_match
Example #26
def hyperopt_results():
    """
    This function generates hyperopt results
    """
    input_features = [
        text_feature(name="utterance", cell_type="lstm", reduce_output="sum"),
        category_feature(vocab_size=2, reduce_input="sum")]

    output_features = [category_feature(vocab_size=2, reduce_input="sum")]

    csv_filename = uuid.uuid4().hex[:10].upper() + '.csv'
    rel_path = generate_data(input_features, output_features, csv_filename)

    config = {
        "input_features": input_features,
        "output_features": output_features,
        "combiner": {"type": "concat", "num_fc_layers": 2},
        "training": {"epochs": 2, "learning_rate": 0.001}
    }

    output_feature_name = output_features[0]['name']

    hyperopt_configs = {
        "parameters": {
            "training.learning_rate": {
                "type": "float",
                "low": 0.0001,
                "high": 0.01,
                "space": "log",
                "steps": 3,
            },
            output_feature_name + ".fc_size": {
                "type": "int",
                "low": 32,
                "high": 256,
                "steps": 5
            },
            output_feature_name + ".num_fc_layers": {
                'type': 'int',
                'low': 1,
                'high': 5,
                'space': 'linear',
                'steps': 4
            }
        },
        "goal": "minimize",
        'output_feature': output_feature_name,
        'validation_metrics': 'loss',
        'executor': {'type': 'serial'},
        'sampler': {'type': 'random', 'num_samples': 2}
    }

    # add hyperopt parameter space to the config
    config['hyperopt'] = hyperopt_configs

    hyperopt(
        config,
        dataset=rel_path,
        output_directory='results'
    )

    return os.path.abspath('results')
Example #27
def test_confidence_thresholding_2thresholds_3d_vis_api(csv_filename):
    """Ensure pdf and png figures can be saved via visualization API call.

    :param csv_filename: csv fixture from tests.fixtures.filenames.csv_filename
    :return: None
    """
    input_features = [
        text_feature(vocab_size=10, min_len=1, encoder='stacked_cnn'),
        numerical_feature(),
        category_feature(vocab_size=10, embedding_size=5),
        set_feature(),
        sequence_feature(vocab_size=10, max_len=10, encoder='embed')
    ]
    output_features = [
        category_feature(vocab_size=2, reduce_input='sum'),
        category_feature(vocab_size=2, reduce_input='sum')
    ]
    encoder = 'parallel_cnn'
    with TemporaryDirectory() as tmpvizdir:
        # Generate test data
        data_csv = generate_data(input_features, output_features,
                                 os.path.join(tmpvizdir, csv_filename))
        input_features[0]['encoder'] = encoder
        model = run_api_experiment(input_features, output_features)
        test_df, train_df, val_df = obtain_df_splits(data_csv)
        _, _, output_dir = model.train(
            training_set=train_df,
            validation_set=val_df,
            output_directory=os.path.join(tmpvizdir, 'results')
        )
        test_stats, predictions, _ = model.evaluate(
            dataset=test_df,
            collect_predictions=True,
            output_directory=output_dir
        )

        output_feature_name1 = output_features[0]['name']
        output_feature_name2 = output_features[1]['name']
        # probabilities need to be list of lists containing each row data from the
        # probability columns ref: https://ludwig-ai.github.io/ludwig-docs/api/#test - Return
        probability1 = predictions.iloc[:, [2, 3, 4]].values
        probability2 = predictions.iloc[:, [7, 8, 9]].values

        ground_truth_metadata = model.training_set_metadata
        target_predictions1 = test_df[output_feature_name1]
        target_predictions2 = test_df[output_feature_name2]
        ground_truth1 = np.asarray([
            ground_truth_metadata[output_feature_name1]['str2idx'][prediction]
            for prediction in target_predictions1
        ])
        ground_truth2 = np.asarray([
            ground_truth_metadata[output_feature_name2]['str2idx'][prediction]
            for prediction in target_predictions2
        ])
        viz_outputs = ('pdf', 'png')
        for viz_output in viz_outputs:
            vis_output_pattern_pdf = os.path.join(
                output_dir, '*.{}'.format(viz_output)
            )
            visualize.confidence_thresholding_2thresholds_3d(
                [probability1, probability2],
                [ground_truth1, ground_truth2],
                model.training_set_metadata,
                [output_feature_name1, output_feature_name2],
                labels_limit=0,
                output_directory=output_dir,
                file_format=viz_output
            )
            figure_cnt = glob.glob(vis_output_pattern_pdf)
            assert 1 == len(figure_cnt)
Example #28
def test_hyperopt_run_hyperopt(csv_filename, ray_start_4_cpus):
    input_features = [
        text_feature(name="utterance", cell_type="lstm", reduce_output="sum"),
        category_feature(vocab_size=2, reduce_input="sum")
    ]

    output_features = [category_feature(vocab_size=2, reduce_input="sum")]

    rel_path = generate_data(input_features, output_features, csv_filename)

    config = {
        "input_features": input_features,
        "output_features": output_features,
        "combiner": {
            "type": "concat",
            "num_fc_layers": 2
        },
        "training": {
            "epochs": 2,
            "learning_rate": 0.001
        }
    }

    output_feature_name = output_features[0]['name']

    hyperopt_configs = {
        "parameters": {
            "training.learning_rate": {
                "space": "loguniform",
                "lower": 0.001,
                "upper": 0.1,
            },
            output_feature_name + ".fc_size": {
                "space": "randint",
                "lower": 32,
                "upper": 256
            },
            output_feature_name + ".num_fc_layers": {
                "space": "randint",
                "lower": 2,
                "upper": 6
            }
        },
        "goal": "minimize",
        'output_feature': output_feature_name,
        'validation_metrics': 'loss',
        'executor': {
            'type': 'ray'
        },
        'sampler': {
            'type': 'ray',
            'num_samples': 2
        }
    }

    # add hyperopt parameter space to the config
    config['hyperopt'] = hyperopt_configs

    hyperopt_results = hyperopt(config,
                                dataset=rel_path,
                                output_directory='results_hyperopt')

    # check for return results
    assert isinstance(hyperopt_results, list)

    # check for existence of the hyperopt statistics file
    assert os.path.isfile(
        os.path.join('results_hyperopt', 'hyperopt_statistics.json'))
Example #29
def t_neuropod(csv_filename):
    #######
    # Setup
    #######
    dir_path = os.path.dirname(csv_filename)
    image_dest_folder = os.path.join(os.getcwd(), 'generated_images')
    audio_dest_folder = os.path.join(os.getcwd(), 'generated_audio')

    input_features = [
        binary_feature(),
        numerical_feature(),
        category_feature(vocab_size=3),
        sequence_feature(vocab_size=3),
        text_feature(vocab_size=3),
        vector_feature(),
        image_feature(image_dest_folder),
        audio_feature(audio_dest_folder),
        timeseries_feature(),
        date_feature(),
        h3_feature(),
        set_feature(vocab_size=3),
        bag_feature(vocab_size=3),
    ]

    output_features = [
        binary_feature(),
        numerical_feature(),
        category_feature(vocab_size=3),
        sequence_feature(vocab_size=3),
        text_feature(vocab_size=3),
        set_feature(vocab_size=3),
        vector_feature()
    ]

    # Generate test data
    data_csv_path = generate_data(input_features, output_features,
                                  csv_filename)

    #############
    # Train model
    #############
    model_definition = {
        'input_features': input_features,
        'output_features': output_features,
        'training': {
            'epochs': 2
        }
    }
    ludwig_model = LudwigModel(model_definition)
    ludwig_model.train(
        data_csv=data_csv_path,
        skip_save_training_description=True,
        skip_save_training_statistics=True,
        skip_save_model=True,
        skip_save_progress=True,
        skip_save_log=True,
        skip_save_processed_input=True,
    )
    original_predictions_df = ludwig_model.predict(data_csv=data_csv_path)

    ###################
    # save Ludwig model
    ###################
    ludwigmodel_path = os.path.join(dir_path, 'ludwigmodel')
    shutil.rmtree(ludwigmodel_path, ignore_errors=True)
    ludwig_model.save(ludwigmodel_path)

    ################
    # build neuropod
    ################
    neuropod_path = os.path.join(dir_path, 'neuropod')
    export_neuropod(ludwigmodel_path, neuropod_path=neuropod_path)

    ########################
    # predict using neuropod
    ########################
    data_df = pd.read_csv(data_csv_path)
    if_dict = {
        input_feature['name']: np.expand_dims(
            np.array([str(x) for x in data_df[input_feature['name']].tolist()],
                     dtype='str'), 1)
        for input_feature in input_features
    }

    from neuropod.loader import load_neuropod
    neuropod_model = load_neuropod(neuropod_path)
    preds = neuropod_model.infer(if_dict)

    for key in preds:
        preds[key] = np.squeeze(preds[key])

    #########
    # cleanup
    #########
    # Delete the temporary data created
    for path in [
            ludwigmodel_path, neuropod_path, image_dest_folder,
            audio_dest_folder
    ]:
        if os.path.exists(path):
            if os.path.isfile(path):
                os.remove(path)
            else:
                shutil.rmtree(path, ignore_errors=True)

    ########
    # checks
    ########
    for output_feature in output_features:
        output_feature_name = output_feature['name']
        output_feature_type = output_feature['type']

        if (output_feature_name + "_predictions" in preds
                and output_feature_name + "_predictions"
                in original_predictions_df):
            neuropod_pred = preds[output_feature_name +
                                  "_predictions"].tolist()
            if output_feature_type == BINARY:
                neuropod_pred = list(map(str2bool, neuropod_pred))
            if output_feature_type in {SEQUENCE, TEXT, SET}:
                neuropod_pred = list(map(lambda x: x.split(), neuropod_pred))

            original_pred = original_predictions_df[output_feature_name +
                                                    "_predictions"].tolist()

            assert neuropod_pred == original_pred

        if (output_feature_name + "_probability" in preds
                and output_feature_name + "_probability"
                in original_predictions_df):
            neuropod_prob = preds[output_feature_name +
                                  "_probability"].tolist()
            if output_feature_type in {SEQUENCE, TEXT, SET}:
                neuropod_prob = list(
                    map(lambda x: [float(n) for n in x.split()],
                        neuropod_prob))
            if any(isinstance(el, list) for el in neuropod_prob):
                neuropod_prob = np.array(
                    list(itertools.zip_longest(*neuropod_prob, fillvalue=0))).T

            original_prob = original_predictions_df[output_feature_name +
                                                    "_probability"].tolist()
            if any(isinstance(el, list) for el in original_prob):
                original_prob = np.array(
                    list(itertools.zip_longest(*original_prob, fillvalue=0))).T

            assert np.isclose(neuropod_prob, original_prob).all()

        if (output_feature_name + "_probabilities" in preds
                and output_feature_name + "_probabilities"
                in original_predictions_df):
            neuropod_prob = preds[output_feature_name +
                                  "_probabilities"].tolist()

            original_prob = original_predictions_df[output_feature_name +
                                                    "_probabilities"].tolist()

            assert np.isclose(neuropod_prob, original_prob).all()
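The checks above call a str2bool helper that is not defined in this snippet (Ludwig provides such a string utility, but its exact implementation is not shown here). A minimal sketch of what it is assumed to do, for illustration only:

def str2bool(value):
    # Hypothetical stand-in for the helper used above: treat common textual
    # spellings of truth values as True, everything else as False.
    return str(value).strip().lower() in {"true", "t", "yes", "y", "1"}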
Beispiel #30
0
def test_hyperopt_run_hyperopt(csv_filename, samplers):
    """Run a small serial hyperopt search for each sampler configuration and check its results."""
    input_features = [
        text_feature(name="utterance", cell_type="lstm", reduce_output="sum"),
        category_feature(vocab_size=2, reduce_input="sum"),
    ]

    output_features = [category_feature(vocab_size=2, reduce_input="sum")]

    rel_path = generate_data(input_features, output_features, csv_filename)

    config = {
        "input_features": input_features,
        "output_features": output_features,
        "combiner": {
            "type": "concat",
            "num_fc_layers": 2
        },
        TRAINER: {
            "epochs": 2,
            "learning_rate": 0.001
        },
    }

    output_feature_name = output_features[0]["name"]

    hyperopt_configs = {
        "parameters": {
            "trainer.learning_rate": {
                "type": "float",
                "low": 0.0001,
                "high": 0.01,
                "space": "log",
                "steps": 3,
            },
            output_feature_name + ".fc_layers": {
                "type":
                "category",
                "values": [
                    [{
                        "output_size": 64
                    }, {
                        "output_size": 32
                    }],
                    [{
                        "output_size": 64
                    }],
                    [{
                        "output_size": 32
                    }],
                ],
            },
            output_feature_name + ".output_size": {
                "type": "int",
                "low": 16,
                "high": 36,
                "steps": 5
            },
            output_feature_name + ".num_fc_layers": {
                "type": "int",
                "low": 1,
                "high": 5,
                "space": "linear",
                "steps": 4
            },
        },
        "goal": "minimize",
        "output_feature": output_feature_name,
        "validation_metrics": "loss",
        "executor": {
            "type": "serial"
        },
        "sampler": {
            "type": samplers["type"],
            "num_samples": 2
        },
    }

    # add hyperopt parameter space to the config
    config["hyperopt"] = hyperopt_configs

    hyperopt_results = hyperopt(config,
                                dataset=rel_path,
                                output_directory="results_hyperopt")

    # check for return results
    assert isinstance(hyperopt_results, HyperoptResults)

    # check for existence of the hyperopt statistics file
    hyperopt_stats_path = os.path.join("results_hyperopt",
                                       "hyperopt_statistics.json")
    assert os.path.isfile(hyperopt_stats_path)

    # clean up the statistics file produced by this run
    if os.path.isfile(hyperopt_stats_path):
        os.remove(hyperopt_stats_path)
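The samplers argument in the test above comes from a fixture or parametrization that is not part of this listing. A hedged sketch of how such a parametrized fixture could look (the sampler types below are illustrative assumptions, not necessarily the ones the original test module exercises):

import pytest

# Assumed sampler configurations; the real test module may use a different set.
SAMPLERS = [
    {"type": "grid"},
    {"type": "random"},
]


@pytest.fixture(params=SAMPLERS, ids=lambda sampler: sampler["type"])
def samplers(request):
    # The test runs once per sampler configuration listed above.
    return request.param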