Example #1
def run_api_experiment(input_features, output_features, data_csv):
    """
    Helper method to avoid code repetition in running an experiment
    :param input_features: input schema
    :param output_features: output schema
    :param data_csv: path to data
    :return: None
    """
    model_definition = model_definition_template.substitute(
        input_name=input_features,
        output_name=output_features
    )

    model = LudwigModel(yaml.safe_load(model_definition))

    # Training with csv
    model.train(
        data_csv=data_csv,
        skip_save_processed_input=True,
        skip_save_progress=True,
        skip_save_unprocessed_output=True
    )
    model.predict(data_csv=data_csv)

    # Training with dataframe
    data_df = read_csv(data_csv)
    model.train(
        data_df=data_df,
        skip_save_processed_input=True,
        skip_save_progress=True,
        skip_save_unprocessed_output=True
    )
    model.predict(data_df=data_df)
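A minimal sketch of how this helper might be invoked. Because the config is built via model_definition_template.substitute, the feature arguments are assumed to be YAML fragments rather than Python dicts; the fragments and the CSV path below are placeholders, not part of the original example.

# Hypothetical call to the helper above; all values are illustrative.
input_yaml = "[{name: utterance, type: text}]"
output_yaml = "[{name: intent, type: category}]"
run_api_experiment(input_yaml, output_yaml, data_csv="intents.csv")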
Example #2
def predict_with_backend(tmpdir,
                         config,
                         data_csv_path,
                         backend,
                         patch_args=None):
    with init_backend(backend):
        if backend == "ray":
            backend = RAY_BACKEND_CONFIG
            backend["processor"]["type"] = "dask"

        ludwig_model = LudwigModel(config, backend=backend)
        _, _, output_directory = ludwig_model.train(
            dataset=data_csv_path,
            output_directory=os.path.join(tmpdir, "output"),
        )
        # Check that metadata JSON saves and loads correctly
        ludwig_model = LudwigModel.load(os.path.join(output_directory,
                                                     "model"))

        if patch_args is not None:
            with mock.patch(*patch_args):
                preds_df, _ = ludwig_model.predict(dataset=data_csv_path)
        else:
            preds_df, _ = ludwig_model.predict(dataset=data_csv_path)

    return preds_df, ludwig_model
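A sketch of how such a helper is typically used: train and predict with two backends, then check the outputs for parity. tmpdir, config, and data_csv_path are placeholders supplied by the enclosing test.

# Hypothetical parity check between backends; all arguments are placeholders.
local_preds, _ = predict_with_backend(tmpdir, config, data_csv_path, backend="local")
ray_preds, _ = predict_with_backend(tmpdir, config, data_csv_path, backend="ray")
if hasattr(ray_preds, "compute"):
    ray_preds = ray_preds.compute()  # materialize Dask predictions first
assert set(local_preds.columns) == set(ray_preds.columns)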
Example #3
def train_with_backend(backend, config, dataset=None, training_set=None, validation_set=None, test_set=None):
    model = LudwigModel(config, backend=backend)
    output_dir = None

    try:
        _, _, output_dir = model.train(
            dataset=dataset,
            training_set=training_set,
            validation_set=validation_set,
            test_set=test_set,
            skip_save_processed_input=True,
            skip_save_progress=True,
            skip_save_unprocessed_output=True
        )

        if dataset is None:
            dataset = training_set

        import dask.dataframe as dd
        if isinstance(dataset, dd.DataFrame):
            # For now, prediction must be done on a Pandas DataFrame
            dataset = dataset.compute()

        model.predict(dataset=dataset)
        return model.model.get_weights()
    finally:
        # Remove results/intermediate data saved to disk
        shutil.rmtree(output_dir, ignore_errors=True)
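Because the helper returns the trained weights, it lends itself to cross-backend consistency checks; a hedged sketch, assuming fixed seeds make both runs reproducible and that RAY_BACKEND_CONFIG is defined as in Example #2.

# Hypothetical cross-backend comparison; config and dataset are placeholders.
local_weights = train_with_backend("local", config, dataset=dataset)
ray_weights = train_with_backend(RAY_BACKEND_CONFIG, config, dataset=dataset)
for lw, rw in zip(local_weights, ray_weights):
    assert np.allclose(lw, rw)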
Example #4
def _run_test(input_features=None, output_features=None, combiner=None):
    with tempfile.TemporaryDirectory() as tmpdir:
        input_features = input_features or [
            sequence_feature(reduce_output="sum"),
            number_feature(),
        ]
        output_features = output_features or [
            category_feature(vocab_size=2, reduce_input="sum")
        ]
        combiner = combiner or {"type": "concat"}

        csv_filename = os.path.join(tmpdir, "training.csv")
        data_csv = generate_data(input_features, output_features, csv_filename)

        config = {
            "input_features": input_features,
            "output_features": output_features,
            "combiner": combiner,
            TRAINER: {
                "epochs": 2
            },
        }

        model = LudwigModel(config, backend=LocalTestBackend())
        _, _, output_directory = model.train(
            dataset=data_csv,
            output_directory=tmpdir,
        )
        model.predict(dataset=data_csv, output_directory=output_directory)
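Since every argument has a default, the helper above can be exercised as-is or with a single override; a minimal sketch (the alternative combiner type is an assumption).

# Hypothetical invocations of the helper above.
_run_test()                             # defaults: concat combiner
_run_test(combiner={"type": "tabnet"})  # assumed alternative combiner type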
Example #5
def test_api_training_set(csv_filename):
    with tempfile.TemporaryDirectory() as tmpdir:
        input_features = [sequence_feature(reduce_output='sum')]
        output_features = [category_feature(vocab_size=2, reduce_input='sum')]

        data_csv = generate_data(input_features, output_features, csv_filename)
        val_csv = shutil.copyfile(data_csv,
                                  os.path.join(tmpdir, 'validation.csv'))
        test_csv = shutil.copyfile(data_csv, os.path.join(tmpdir, 'test.csv'))

        config = {
            'input_features': input_features,
            'output_features': output_features,
            'combiner': {
                'type': 'concat',
                'fc_size': 14
            },
        }
        model = LudwigModel(config)
        model.train(training_set=data_csv,
                    validation_set=val_csv,
                    test_set=test_csv)
        model.predict(dataset=test_csv)

        # Train again, this time the HDF5 cache will be used
        model.train(training_set=data_csv,
                    validation_set=val_csv,
                    test_set=test_csv)
Example #6
def run_api_experiment(input_features, output_features, dataset, **kwargs):
    config = {
        'input_features': input_features,
        'output_features': output_features,
        'combiner': {'type': 'concat', 'fc_size': 14},
        'training': {'epochs': 2}
    }

    model = LudwigModel(config)
    output_dir = None

    try:
        # Training with csv
        _, _, output_dir = model.train(
            dataset=dataset,
            **kwargs
        )

        model.predict(dataset=dataset)

        # Attempt to load the saved model; it should broadcast successfully
        model_dir = os.path.join(output_dir, 'model') if output_dir else None
        loaded_model = LudwigModel.load(model_dir)

        # Model loading should broadcast weights from coordinator
        loaded_weights = loaded_model.model.get_weights()
        bcast_weights = hvd.broadcast_object(loaded_weights)
        for loaded, bcast in zip(loaded_weights, bcast_weights):
            assert np.allclose(loaded, bcast)
    finally:
        if output_dir:
            shutil.rmtree(output_dir, ignore_errors=True)
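hvd.broadcast_object presupposes an initialized Horovod context; a minimal sketch of the setup such a test runs under. Which framework module applies depends on the Ludwig version; the get_weights() call suggests the TensorFlow era here.

# Minimal Horovod setup assumed by the test above; the framework module is a guess.
import horovod.tensorflow as hvd
hvd.init()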
Example #7
def test_api_training_set(csv_filename):
    with tempfile.TemporaryDirectory() as tmpdir:
        input_features = [sequence_feature(reduce_output="sum")]
        output_features = [category_feature(vocab_size=5, reduce_input="sum")]

        data_csv = generate_data(input_features, output_features, csv_filename)
        val_csv = shutil.copyfile(data_csv,
                                  os.path.join(tmpdir, "validation.csv"))
        test_csv = shutil.copyfile(data_csv, os.path.join(tmpdir, "test.csv"))

        config = {
            "input_features": input_features,
            "output_features": output_features,
            "combiner": {
                "type": "concat",
                "fc_size": 14
            },
        }
        model = LudwigModel(config)
        model.train(training_set=data_csv,
                    validation_set=val_csv,
                    test_set=test_csv)
        model.predict(dataset=test_csv)

        # Train again, this time the HDF5 cache will be used
        model.train(training_set=data_csv,
                    validation_set=val_csv,
                    test_set=test_csv)
Example #8
def train_model(input_features, output_features, data_csv):
    """
    Helper method to avoid code repetition in running an experiment
    :param input_features: input schema
    :param output_features: output schema
    :param data_csv: path to data
    :return: tuple of (trained model, output directory)
    """
    model_definition = {
        'input_features': input_features,
        'output_features': output_features,
        'combiner': {
            'type': 'concat',
            'fc_size': 14
        },
        'training': {
            'epochs': 2
        }
    }
    model = LudwigModel(model_definition)
    _, _, output_dir = model.train(dataset=data_csv,
                                   skip_save_processed_input=True,
                                   skip_save_progress=True,
                                   skip_save_unprocessed_output=True)
    model.predict(dataset=data_csv, output_directory=output_dir)

    return model, output_dir
Example #9
def test_missing_value_prediction(csv_filename):
    random.seed(1)
    np.random.seed(1)
    with tempfile.TemporaryDirectory() as tmpdir:
        input_features = [
            category_feature(
                vocab_size=2,
                reduce_input="sum",
                preprocessing=dict(missing_value_strategy="fill_with_mode"))
        ]
        output_features = [binary_feature()]

        dataset = pd.read_csv(
            generate_data(input_features, output_features, csv_filename))

        config = {
            "input_features": input_features,
            "output_features": output_features,
            "combiner": {
                "type": "concat",
                "fc_size": 14
            },
        }
        model = LudwigModel(config)
        _, _, output_dir = model.train(dataset=dataset,
                                       output_directory=tmpdir)

        # Set the input column to None; missing values should be replaced with
        # the mode learned from the training set
        dataset[input_features[0]["name"]] = None
        model.predict(dataset=dataset)

        model = LudwigModel.load(os.path.join(output_dir, "model"))
        model.predict(dataset=dataset)
Example #10
def train_and_predict_model(input_features, output_features, data_csv,
                            output_directory):
    """Helper method to avoid code repetition for training a model and using it for prediction.

    :param input_features: input schema
    :param output_features: output schema
    :param data_csv: path to data
    :param output_directory: model output directory
    :return: the trained model
    """
    config = {
        "input_features": input_features,
        "output_features": output_features,
        "combiner": {
            "type": "concat",
            "output_size": 14
        },
        TRAINER: {
            "epochs": 2
        },
    }
    model = LudwigModel(config, backend=LocalTestBackend())
    model.train(
        dataset=data_csv,
        skip_save_processed_input=True,
        skip_save_progress=True,
        skip_save_unprocessed_output=True,
        output_directory=output_directory,
    )
    model.predict(dataset=data_csv, output_directory=output_directory)
    return model
Example #11
def test_missing_value_prediction(csv_filename):
    with tempfile.TemporaryDirectory() as tmpdir:
        input_features = [
            category_feature(
                vocab_size=2,
                reduce_input='sum',
                preprocessing=dict(missing_value_strategy='fill_with_mode'))
        ]
        output_features = [binary_feature()]

        dataset = pd.read_csv(
            generate_data(input_features, output_features, csv_filename))

        config = {
            'input_features': input_features,
            'output_features': output_features,
            'combiner': {
                'type': 'concat',
                'fc_size': 14
            },
        }
        model = LudwigModel(config)
        _, _, output_dir = model.train(dataset=dataset,
                                       output_directory=tmpdir)

        # Set the input column to None; missing values should be replaced with
        # the mode learned from the training set
        dataset[input_features[0]['name']] = None
        model.predict(dataset=dataset)

        model = LudwigModel.load(os.path.join(output_dir, 'model'))
        model.predict(dataset=dataset)
Example #12
def run_api_experiment(input_features, output_features, dataset, **kwargs):
    config = {
        "input_features": input_features,
        "output_features": output_features,
        "combiner": {"type": "concat", "fc_size": 14},
        "training": {"epochs": 2},
    }

    model = LudwigModel(config)
    output_dir = None

    try:
        # Training with csv
        _, _, output_dir = model.train(dataset=dataset, **kwargs)

        model.predict(dataset=dataset)

        # Attempt to load the saved model; it should broadcast successfully
        model_dir = os.path.join(output_dir, "model") if output_dir else None
        loaded_model = LudwigModel.load(model_dir)

        # Model loading should broadcast weights from coordinator
        loaded_state = loaded_model.model.state_dict()
        bcast_state = hvd.broadcast_object(loaded_state)
        for loaded, bcast in zip(loaded_state.values(), bcast_state.values()):
            assert np.allclose(loaded, bcast)
    finally:
        if output_dir:
            shutil.rmtree(output_dir, ignore_errors=True)
Example #13
def test_experiment_dataset_formats(data_format):
    # the primary focus of this test is to determine whether exceptions are
    # raised for different dataset formats and the in_memory setting

    input_features = [
        numerical_feature(),
        category_feature()
    ]
    output_features = [
        category_feature(),
        numerical_feature()
    ]

    config = {
        'input_features': input_features,
        'output_features': output_features,
        'combiner': {
            'type': 'concat',
            'fc_size': 14
        },
        'preprocessing': {},
        'training': {'epochs': 2}
    }

    # create temporary name for train and test data sets
    csv_filename = 'train_' + uuid.uuid4().hex[:10].upper() + '.csv'

    # setup training data format to test
    raw_data = generate_data(input_features, output_features,
                             csv_filename)

    training_set_metadata = None

    if data_format == 'hdf5':
        # hdf5 format
        training_set, _, _, training_set_metadata = preprocess_for_training(
            config,
            dataset=raw_data
        )
        dataset_to_use = training_set.data_hdf5_fp
    else:
        dataset_to_use = create_data_set_to_use(data_format, raw_data)

    # define Ludwig model
    model = LudwigModel(
        config=config
    )
    model.train(
        dataset=dataset_to_use,
        training_set_metadata=training_set_metadata,
        random_seed=default_random_seed
    )

    # run functions with the specified data format
    model.evaluate(dataset=dataset_to_use)
    model.predict(dataset=dataset_to_use)

    # Delete the temporary data created
    delete_temporary_data(csv_filename)
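The data_format argument is presumably supplied by a pytest parametrization along these lines; the format list is an assumption drawn from formats create_data_set_to_use commonly handles.

# Assumed parametrization for the test above; the format list is illustrative.
@pytest.mark.parametrize("data_format", ["csv", "parquet", "json", "hdf5"])
def test_experiment_dataset_formats(data_format):
    ...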
Example #14
def run_api_experiment(input_features, output_features, data_csv):
    """
    Helper method to avoid code repetition in running an experiment
    :param input_features: input schema
    :param output_features: output schema
    :param data_csv: path to data
    :return: None
    """
    config = {
        "input_features": input_features,
        "output_features": output_features,
        "combiner": {
            "type": "concat",
            "fc_size": 14
        },
        "training": {
            "epochs": 2
        },
    }

    model = LudwigModel(config)
    output_dir = None

    try:
        # Training with csv
        _, _, output_dir = model.train(
            dataset=data_csv,
            skip_save_processed_input=True,
            skip_save_progress=True,
            skip_save_unprocessed_output=True,
        )
        model.predict(dataset=data_csv)

        model_dir = os.path.join(output_dir, "model")
        loaded_model = LudwigModel.load(model_dir)

        # Necessary before call to get_weights() to materialize the weights
        loaded_model.predict(dataset=data_csv)

        model_weights = model.model.get_weights()
        loaded_weights = loaded_model.model.get_weights()
        for model_weight, loaded_weight in zip(model_weights, loaded_weights):
            assert np.allclose(model_weight, loaded_weight)
    finally:
        # Remove results/intermediate data saved to disk
        shutil.rmtree(output_dir, ignore_errors=True)

    try:
        # Training with dataframe
        data_df = read_csv(data_csv)
        _, _, output_dir = model.train(
            dataset=data_df,
            skip_save_processed_input=True,
            skip_save_progress=True,
            skip_save_unprocessed_output=True,
        )
        model.predict(dataset=data_df)
    finally:
        shutil.rmtree(output_dir, ignore_errors=True)
Example #15
def run(csv_filename):
    # Check that comet has been imported successfully as a contrib package
    contrib_instances = ludwig.contrib.contrib_registry["instances"]
    assert len(contrib_instances) == 1

    comet_instance = contrib_instances[0]
    assert isinstance(comet_instance, Comet)

    # Image Inputs
    image_dest_folder = os.path.join(os.getcwd(), 'generated_images')

    # Inputs & Outputs
    input_features = [image_feature(folder=image_dest_folder)]
    output_features = [category_feature()]
    data_csv = generate_data(input_features, output_features, csv_filename)

    config = {
        'input_features': input_features,
        'output_features': output_features,
        'combiner': {
            'type': 'concat',
            'fc_size': 14
        },
        'training': {
            'epochs': 2
        }
    }

    model = LudwigModel(config)
    output_dir = None

    # Wrap these methods so we can check that they were called
    comet_instance.train_init = Mock(side_effect=comet_instance.train_init)
    comet_instance.train_model = Mock(side_effect=comet_instance.train_model)

    with patch('comet_ml.Experiment.log_asset_data') as mock_log_asset_data:
        try:
            # Training with csv
            _, _, output_dir = model.train(dataset=data_csv)
            model.predict(dataset=data_csv)
        finally:
            if output_dir:
                shutil.rmtree(output_dir, ignore_errors=True)

    # Verify that the experiment was created successfully
    assert comet_instance.cometml_experiment is not None

    # Check that these methods were called at least once
    comet_instance.train_init.assert_called()
    comet_instance.train_model.assert_called()

    # Check that we ran `train_model`, which calls into `log_asset_data`, successfully
    mock_log_asset_data.assert_called()
Example #16
def test_remote_training_set(tmpdir, fs_protocol):
    with tempfile.TemporaryDirectory() as outdir:
        output_directory = f"{fs_protocol}://{outdir}"

        input_features = [sequence_feature(reduce_output="sum")]
        output_features = [category_feature(vocab_size=2, reduce_input="sum")]

        csv_filename = os.path.join(tmpdir, "training.csv")
        data_csv = generate_data(input_features, output_features, csv_filename)
        val_csv = shutil.copyfile(data_csv,
                                  os.path.join(tmpdir, "validation.csv"))
        test_csv = shutil.copyfile(data_csv, os.path.join(tmpdir, "test.csv"))

        data_csv = f"{fs_protocol}://{os.path.abspath(data_csv)}"
        val_csv = f"{fs_protocol}://{os.path.abspath(val_csv)}"
        test_csv = f"{fs_protocol}://{os.path.abspath(test_csv)}"

        config = {
            "input_features": input_features,
            "output_features": output_features,
            "combiner": {
                "type": "concat",
                "fc_size": 14
            },
            "training": {
                "epochs": 2
            },
        }

        config_path = os.path.join(tmpdir, "config.yaml")
        with open(config_path, "w") as f:
            yaml.dump(config, f)
        config_path = f"{fs_protocol}://{config_path}"

        backend_config = {
            "type": "local",
        }
        backend = initialize_backend(backend_config)

        model = LudwigModel(config_path, backend=backend)
        _, _, output_directory = model.train(training_set=data_csv,
                                             validation_set=val_csv,
                                             test_set=test_csv,
                                             output_directory=output_directory)
        model.predict(dataset=test_csv, output_directory=output_directory)

        # Train again, this time the cache will be used
        # Resume from the remote output directory
        model.train(training_set=data_csv,
                    validation_set=val_csv,
                    test_set=test_csv,
                    model_resume_path=output_directory)
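fs_protocol would likewise come from a fixture or parametrization; a hedged sketch using the local "file" protocol, which fsspec resolves without extra dependencies.

# Assumed parametrization; "file" is the simplest fsspec protocol to exercise.
@pytest.mark.parametrize("fs_protocol", ["file"])
def test_remote_training_set(tmpdir, fs_protocol):
    ...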
Example #17
def test_remote_training_set(tmpdir, fs_protocol, cache_format):
    with tempfile.TemporaryDirectory() as outdir:
        output_directory = f'{fs_protocol}://{outdir}'

        input_features = [sequence_feature(reduce_output='sum')]
        output_features = [category_feature(vocab_size=2, reduce_input='sum')]

        csv_filename = os.path.join(tmpdir, 'training.csv')
        data_csv = generate_data(input_features, output_features, csv_filename)
        val_csv = shutil.copyfile(data_csv,
                                  os.path.join(tmpdir, 'validation.csv'))
        test_csv = shutil.copyfile(data_csv, os.path.join(tmpdir, 'test.csv'))

        data_csv = f'{fs_protocol}://{os.path.abspath(data_csv)}'
        val_csv = f'{fs_protocol}://{os.path.abspath(val_csv)}'
        test_csv = f'{fs_protocol}://{os.path.abspath(test_csv)}'

        config = {
            'input_features': input_features,
            'output_features': output_features,
            'combiner': {'type': 'concat', 'fc_size': 14},
            'training': {'epochs': 2},
        }

        config_path = os.path.join(tmpdir, 'config.yaml')
        with open(config_path, 'w') as f:
            yaml.dump(config, f)
        config_path = f'{fs_protocol}://{config_path}'

        backend_config = {
            'type': 'local',
            'cache_format': cache_format
        }
        backend = initialize_backend(backend_config)

        model = LudwigModel(config_path, backend=backend)
        _, _, output_directory = model.train(
            training_set=data_csv,
            validation_set=val_csv,
            test_set=test_csv,
            output_directory=output_directory
        )
        model.predict(dataset=test_csv,
                      output_directory=output_directory)

        # Train again, this time the cache will be used
        # Resume from the remote output directory
        model.train(training_set=data_csv,
                    validation_set=val_csv,
                    test_set=test_csv,
                    model_resume_path=output_directory)
Example #18
def test_api_train_online(csv_filename):
    input_features = [sequence_feature(reduce_output="sum")]
    output_features = [category_feature(vocab_size=5, reduce_input="sum")]
    data_csv = generate_data(input_features, output_features, csv_filename)

    config = {
        "input_features": input_features,
        "output_features": output_features,
        "combiner": {"type": "concat", "output_size": 14},
    }
    model = LudwigModel(config)

    for i in range(2):
        model.train_online(dataset=data_csv)
    model.predict(dataset=data_csv)
Example #19
def test_api_train_online(csv_filename):
    input_features = [sequence_feature(reduce_output='sum')]
    output_features = [category_feature(vocab_size=2, reduce_input='sum')]
    data_csv = generate_data(input_features, output_features, csv_filename)

    config = {
        'input_features': input_features,
        'output_features': output_features,
        'combiner': {'type': 'concat', 'fc_size': 14},
    }
    model = LudwigModel(config)

    for i in range(2):
        model.train_online(dataset=data_csv)
    model.predict(dataset=data_csv)
Example #20
def test_api_save_torchscript(tmpdir):
    """Tests successful saving and loading of model in TorchScript format."""
    input_features = [category_feature(vocab_size=5)]
    output_features = [category_feature(name="class", vocab_size=5, reduce_input="sum")]

    data_csv = generate_data(input_features, output_features, os.path.join(tmpdir, "dataset.csv"))
    val_csv = shutil.copyfile(data_csv, os.path.join(tmpdir, "validation.csv"))
    test_csv = shutil.copyfile(data_csv, os.path.join(tmpdir, "test.csv"))

    config = {
        "input_features": input_features,
        "output_features": output_features,
        "combiner": {"type": "concat", "output_size": 14},
    }
    model = LudwigModel(config)
    _, _, output_dir = model.train(
        training_set=data_csv, validation_set=val_csv, test_set=test_csv, output_directory=tmpdir
    )

    test_df = pd.read_csv(test_csv)
    output_df_expected, _ = model.predict(test_df, return_type=pd.DataFrame)

    save_path = os.path.join(output_dir, "model")
    os.makedirs(save_path, exist_ok=True)
    model.save_torchscript(save_path)
    inference_module = InferenceModule.from_directory(save_path)
    output_df, _ = inference_module.predict(test_df, return_type=pd.DataFrame)

    for col in output_df.columns:
        assert output_df[col].equals(output_df_expected[col])
Example #21
def run(csv_filename):
    # Image Inputs
    image_dest_folder = os.path.join(os.getcwd(), "generated_images")

    # Inputs & Outputs
    input_features = [image_feature(folder=image_dest_folder)]
    output_features = [category_feature()]
    data_csv = generate_data(input_features, output_features, csv_filename)

    config = {
        "input_features": input_features,
        "output_features": output_features,
        "combiner": {
            "type": "concat",
            "fc_size": 14
        },
        "training": {
            "epochs": 2
        },
    }

    callback = CometCallback()
    model = LudwigModel(config, callbacks=[callback])
    output_dir = None

    # Wrap these methods so we can check that they were called
    callback.on_train_init = Mock(side_effect=callback.on_train_init)
    callback.on_train_start = Mock(side_effect=callback.on_train_start)

    with patch("comet_ml.Experiment.log_asset_data") as mock_log_asset_data:
        try:
            # Training with csv
            _, _, output_dir = model.train(dataset=data_csv)
            model.predict(dataset=data_csv)
        finally:
            if output_dir:
                shutil.rmtree(output_dir, ignore_errors=True)

    # Verify that the experiment was created successfully
    assert callback.cometml_experiment is not None

    # Check that these methods were called at least once
    callback.on_train_init.assert_called()
    callback.on_train_start.assert_called()

    # Check that training ran and called into `log_asset_data` successfully
    mock_log_asset_data.assert_called()
Example #22
def test_experiment_dataset_formats(data_format, csv_filename):
    # the primary focus of this test is to determine whether exceptions are
    # raised for different dataset formats and the in_memory setting

    input_features = [number_feature(), category_feature()]
    output_features = [category_feature(), number_feature()]

    config = {
        "input_features": input_features,
        "output_features": output_features,
        "combiner": {
            "type": "concat",
            "output_size": 14
        },
        "preprocessing": {},
        TRAINER: {
            "epochs": 2
        },
    }

    # setup training data format to test
    raw_data = generate_data(input_features, output_features, csv_filename)

    training_set_metadata = None

    if data_format == "hdf5":
        # hdf5 format
        training_set, _, _, training_set_metadata = preprocess_for_training(
            config, dataset=raw_data)
        dataset_to_use = training_set.data_hdf5_fp
    else:
        dataset_to_use = create_data_set_to_use(data_format, raw_data)

    # define Ludwig model
    model = LudwigModel(config=config)
    model.train(dataset=dataset_to_use,
                training_set_metadata=training_set_metadata,
                random_seed=default_random_seed)

    # run functions with the specified data format
    model.evaluate(dataset=dataset_to_use)
    model.predict(dataset=dataset_to_use)
Example #23
def train_model(input_features, output_features, data_csv):
    """
    Helper method to avoid code repetition in running an experiment
    :param input_features: input schema
    :param output_features: output schema
    :param data_csv: path to data
    :return: the trained model
    """
    model_definition = {
        'input_features': input_features,
        'output_features': output_features,
        'combiner': {
            'type': 'concat',
            'fc_size': 14
        },
        'training': {
            'epochs': 2
        }
    }

    model = LudwigModel(model_definition)

    # Training with csv
    model.train(data_csv=data_csv,
                skip_save_processed_input=True,
                skip_save_progress=True,
                skip_save_unprocessed_output=True)

    model.predict(data_csv=data_csv)

    # Remove results/intermediate data saved to disk
    shutil.rmtree(model.exp_dir_name, ignore_errors=True)

    # Training with dataframe
    data_df = read_csv(data_csv)
    model.train(data_df=data_df,
                skip_save_processed_input=True,
                skip_save_progress=True,
                skip_save_unprocessed_output=True)
    model.predict(data_df=data_df)
    return model
Example #24
def train_model(input_features, output_features, data_csv):
    """Helper method to avoid code repetition in running an experiment.

    :param input_features: input schema
    :param output_features: output schema
    :param data_csv: path to data
    :return: tuple of (trained model, output directory)
    """
    config = {
        "input_features": input_features,
        "output_features": output_features,
        "combiner": {"type": "concat", "fc_size": 14},
        "training": {"epochs": 2},
    }
    model = LudwigModel(config, backend=LocalTestBackend())
    _, _, output_dir = model.train(
        dataset=data_csv, skip_save_processed_input=True, skip_save_progress=True, skip_save_unprocessed_output=True
    )
    model.predict(dataset=data_csv, output_directory=output_dir)

    return model, output_dir
Example #25
def run_api_experiment(input_features, output_features, data_csv, **kwargs):
    model_definition = {
        'input_features': input_features,
        'output_features': output_features,
        'combiner': {
            'type': 'concat',
            'fc_size': 14
        },
        'training': {
            'epochs': 2
        }
    }

    model = LudwigModel(model_definition)

    try:
        # Training with csv
        model.train(data_csv=data_csv, **kwargs)

        model.predict(data_csv=data_csv)
    finally:
        if model.exp_dir_name:
            shutil.rmtree(model.exp_dir_name, ignore_errors=True)
Example #26
def test_model_weights_match_training(tmpdir, csv_filename):
    np.random.seed(1)

    input_features = [number_feature()]
    output_features = [number_feature()]
    output_feature_name = output_features[0][NAME]

    # Generate test data
    data_csv_path = generate_data(input_features,
                                  output_features,
                                  os.path.join(tmpdir, csv_filename),
                                  num_examples=100)

    config = {
        "input_features": input_features,
        "output_features": output_features,
        "trainer": {
            "epochs": 5,
            "batch_size": 32
        },
    }

    model = LudwigModel(config=config)

    training_stats, _, _ = model.train(training_set=data_csv_path,
                                       random_seed=1919)

    # generate predictions from training data
    df = pd.read_csv(data_csv_path)
    predictions = model.predict(df)

    # compute loss on predictions from training data
    loss_function = MSELoss()
    loss = loss_function(
        torch.tensor(predictions[0][output_feature_name +
                                    "_predictions"].values),  # predictions
        torch.tensor(df[output_feature_name].values),  # target
    ).type(torch.float32)

    # get last loss value from training
    last_training_loss = torch.tensor(
        training_stats[TRAINING][output_feature_name][LOSS][-1])

    # loss from predictions should match last loss value recorded during training
    assert torch.isclose(loss, last_training_loss), (
        "Model predictions on training set did not generate same loss value as in training. "
        "Need to confirm that weights were correctly captured in model.")
Example #27
def run_test_gbm_number(tmpdir, backend_config):
    """Test that the GBM model can train and predict a numerical output (regression)."""
    # Given a dataset with a single input feature and a single output feature,
    input_features = [number_feature(), category_feature(reduce_output="sum")]
    output_feature = number_feature()
    output_features = [output_feature]

    csv_filename = os.path.join(tmpdir, "training.csv")
    dataset_filename = generate_data(input_features,
                                     output_features,
                                     csv_filename,
                                     num_examples=100)

    config = {
        MODEL_TYPE: "gbm",
        "input_features": input_features,
        "output_features": output_features,
        TRAINER: {
            "num_boost_round": 2
        },
    }

    # When I train a model on the dataset, load the model from the output directory, and
    # predict on the dataset
    model = LudwigModel(config, backend=backend_config)

    model.train(
        dataset=dataset_filename,
        output_directory=tmpdir,
        skip_save_processed_input=True,
        skip_save_progress=True,
        skip_save_unprocessed_output=True,
        skip_save_log=True,
    )
    model = LudwigModel.load(os.path.join(tmpdir, "api_experiment_run", "model"),
                             backend=backend_config)
    preds, _ = model.predict(
        dataset=dataset_filename,
        output_directory=os.path.join(tmpdir, "predictions"),
    )

    # Then the predictions should be included in the output
    pred_col = preds[output_feature["name"] + "_predictions"]
    if backend_config["type"] == "ray":
        pred_col = pred_col.compute()
    assert pred_col.dtype == float
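The backend_config these GBM tests receive would look roughly like one of the following; the Ray variant is an assumption modeled on RAY_BACKEND_CONFIG from Example #2, and tmpdir is a placeholder.

# Hypothetical backend configs for the GBM tests above.
local_backend_config = {"type": "local"}
ray_backend_config = {"type": "ray", "processor": {"type": "dask"}}
run_test_gbm_number(tmpdir, local_backend_config)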
Example #28
def test_whylogs_callback_local(tmpdir):
    epochs = 2
    batch_size = 8
    num_examples = 32

    input_features = [sequence_feature(reduce_output="sum")]
    output_features = [category_feature(vocab_size=2, reduce_input="sum")]

    config = {
        "input_features": input_features,
        "output_features": output_features,
        "combiner": {
            "type": "concat",
            "output_size": 14
        },
        TRAINER: {
            "epochs": epochs,
            "batch_size": batch_size
        },
    }

    data_csv = generate_data(input_features,
                             output_features,
                             os.path.join(tmpdir, "train.csv"),
                             num_examples=num_examples)
    val_csv = shutil.copyfile(data_csv, os.path.join(tmpdir, "validation.csv"))
    test_csv = shutil.copyfile(data_csv, os.path.join(tmpdir, "test.csv"))

    exp_name = "whylogs_test_local"
    callback = WhyLogsCallback()

    model = LudwigModel(config, callbacks=[callback])
    model.train(training_set=data_csv,
                validation_set=val_csv,
                test_set=test_csv,
                experiment_name=exp_name)
    _, _ = model.predict(test_csv)

    local_training_output_dir = "output/training"
    local_prediction_output_dir = "output/prediction"

    assert os.path.isdir(local_training_output_dir) is True
    assert os.path.isdir(local_prediction_output_dir) is True
Example #29
def train_with_backend(
    backend,
    config,
    dataset=None,
    training_set=None,
    validation_set=None,
    test_set=None,
    predict=True,
    evaluate=True,
    callbacks=None,
):
    model = LudwigModel(config, backend=backend, callbacks=callbacks)
    output_dir = None

    try:
        _, _, output_dir = model.train(
            dataset=dataset,
            training_set=training_set,
            validation_set=validation_set,
            test_set=test_set,
            skip_save_processed_input=True,
            skip_save_progress=True,
            skip_save_unprocessed_output=True,
            skip_save_log=True,
        )

        if dataset is None:
            dataset = training_set

        if predict:
            preds, _ = model.predict(dataset=dataset)
            assert preds is not None

        if evaluate:
            _, eval_preds, _ = model.evaluate(dataset=dataset)
            assert eval_preds is not None

        return model
    finally:
        # Remove results/intermediate data saved to disk
        shutil.rmtree(output_dir, ignore_errors=True)
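A sketch of calling this richer helper with evaluation disabled; config and dataset are placeholders from the enclosing test.

# Hypothetical call; config and dataset come from the surrounding test.
model = train_with_backend("local", config, dataset=dataset, evaluate=False)
assert model is not None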
Example #30
def run_test_gbm_category(tmpdir, backend_config):
    """Test that the GBM model can train and predict a categorical output (multiclass classification)."""
    input_features = [number_feature(), category_feature(reduce_output="sum")]
    vocab_size = 3
    output_feature = category_feature(vocab_size=vocab_size)
    output_features = [output_feature]

    csv_filename = os.path.join(tmpdir, "training.csv")
    dataset_filename = generate_data(input_features,
                                     output_features,
                                     csv_filename,
                                     num_examples=100)

    config = {
        MODEL_TYPE: "gbm",
        "input_features": input_features,
        "output_features": output_features,
        TRAINER: {
            "num_boost_round": 2
        },
    }

    model = LudwigModel(config, backend=backend_config)

    _, _, output_directory = model.train(
        dataset=dataset_filename,
        output_directory=tmpdir,
        skip_save_processed_input=True,
        skip_save_progress=True,
        skip_save_unprocessed_output=True,
        skip_save_log=True,
    )
    model = LudwigModel.load(os.path.join(tmpdir, "api_experiment_run", "model"),
                             backend=backend_config)
    preds, _ = model.predict(dataset=dataset_filename,
                             output_directory=output_directory)

    prob_col = preds[output_feature["name"] + "_probabilities"]
    if backend_config["type"] == "ray":
        prob_col = prob_col.compute()
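    # The probabilities vector has vocab_size + 1 entries, presumably because
    # Ludwig reserves an extra slot for the <UNK> token.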
    assert len(prob_col.iloc[0]) == (vocab_size + 1)
    assert prob_col.apply(sum).mean() == pytest.approx(1.0)