Example 1
def run_test_gbm_category(tmpdir, backend_config):
    """Test that the GBM model can train and predict a categorical output (multiclass classification)."""
    input_features = [number_feature(), category_feature(reduce_output="sum")]
    vocab_size = 3
    output_feature = category_feature(vocab_size=vocab_size)
    output_features = [output_feature]

    csv_filename = os.path.join(tmpdir, "training.csv")
    dataset_filename = generate_data(input_features,
                                     output_features,
                                     csv_filename,
                                     num_examples=100)

    config = {
        MODEL_TYPE: "gbm",
        "input_features": input_features,
        "output_features": output_features,
        TRAINER: {
            "num_boost_round": 2
        },
    }

    model = LudwigModel(config, backend=backend_config)

    _, _, output_directory = model.train(
        dataset=dataset_filename,
        output_directory=tmpdir,
        skip_save_processed_input=True,
        skip_save_progress=True,
        skip_save_unprocessed_output=True,
        skip_save_log=True,
    )
    model.load(os.path.join(tmpdir, "api_experiment_run", "model"))
    preds, _ = model.predict(dataset=dataset_filename,
                             output_directory=output_directory)

    prob_col = preds[output_feature["name"] + "_probabilities"]
    if backend_config["type"] == "ray":
        prob_col = prob_col.compute()
    assert len(prob_col.iloc[0]) == (vocab_size + 1)
    assert prob_col.apply(sum).mean() == pytest.approx(1.0)
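
A hedged sketch of how this helper might be driven from pytest; the backend value below is hypothetical and only illustrates the expected shape:

import pytest

@pytest.mark.parametrize("backend_config", [{"type": "local"}])
def test_gbm_category_local(tmpdir, backend_config):
    # delegate to the helper defined above
    run_test_gbm_category(tmpdir, backend_config)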
Example 2
def test_experiment_ignore_torch_seed(raw_dataset_fp: str,
                                      random_seed: int) -> None:
    """Test reproducibility of experiment API when an unrelated torch random operation is performed between the
    Ludwig operations.

    Args:
        raw_dataset_fp (str): file path for data to be used as part of this test
        random_seed(int): random seed integer to use for test

    Returns: None
    """
    # define Ludwig model
    model1 = LudwigModel(config=CONFIG, logging_level=logging.WARN)

    evaluation_statistics1, training_statistics1, preprocessed_data1, _ = model1.experiment(
        dataset=raw_dataset_fp,
        random_seed=random_seed,
        skip_save_processed_input=True)

    # invoke torch random functions with unrelated seed to
    # see if it affects Ludwig reproducibility
    torch.manual_seed(random_seed + 5)
    torch.rand((5, ))

    model2 = LudwigModel(config=CONFIG, logging_level=logging.WARN)
    evaluation_statistics2, training_statistics2, preprocessed_data2, _ = model2.experiment(
        dataset=raw_dataset_fp,
        random_seed=random_seed,
        skip_save_processed_input=True)

    # confirm data splits are reproducible
    for i in range(3):
        for k in preprocessed_data1[i].dataset:
            # same seeds should result in same output
            assert np.all(preprocessed_data1[i].dataset[k] ==
                          preprocessed_data2[i].dataset[k])

    # confirm results are reproducible: same seeds should give the same output
    assert training_statistics1 == training_statistics2
    assert evaluation_statistics1 == evaluation_statistics2
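
CONFIG is defined elsewhere in the test module; a minimal, hypothetical shape compatible with the calls above (feature names are illustrative):

CONFIG = {
    'input_features': [{'name': 'x', 'type': 'numerical'}],
    'output_features': [{'name': 'y', 'type': 'numerical'}],
    'training': {'epochs': 2},
}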
Example 3
def run_api_experiment(input_features, output_features, data_csv, **kwargs):
    model_definition = {
        'input_features': input_features,
        'output_features': output_features,
        'combiner': {
            'type': 'concat',
            'fc_size': 14
        },
        'training': {
            'epochs': 2
        }
    }

    model = LudwigModel(model_definition)

    try:
        # Training with csv
        model.train(data_csv=data_csv, **kwargs)

        model.predict(data_csv=data_csv)
    finally:
        if model.exp_dir_name:
            shutil.rmtree(model.exp_dir_name, ignore_errors=True)
Example 4
def test_preprocess(raw_dataset_fp: str, random_seed: int,
                    second_seed_offset: int) -> None:
    """Test reproducibility of train/validation/test splits.

    Args:
        raw_dataset_fp (str): file path for data to be used as part of this test
        random_seed(int): random seed integer to use for test
        second_seed_offset(int): zero to use same random seed for second test, non-zero to use a different
            seed for the second run.

    Returns: None
    """
    # define Ludwig model
    model1 = LudwigModel(config=CONFIG)

    # preprocess the raw data set, specify seed
    preprocessed_data1 = model1.preprocess(raw_dataset_fp,
                                           random_seed=random_seed)

    # perform second preprocess operation
    model2 = LudwigModel(config=CONFIG)
    # preprocess same raw data set with same seed
    preprocessed_data2 = model2.preprocess(raw_dataset_fp,
                                           random_seed=random_seed +
                                           second_seed_offset)

    # confirm data splits are reproducible
    for i in range(3):
        for k in preprocessed_data1[i].dataset:
            if second_seed_offset == 0:
                # same seeds should result in same output
                assert np.all(preprocessed_data1[i].dataset[k] ==
                              preprocessed_data2[i].dataset[k])
            else:
                # non-zero second_seed_offset uses different seeds and should result in different output
                assert not np.all(preprocessed_data1[i].dataset[k] ==
                                  preprocessed_data2[i].dataset[k])
Example 5
import pandas as pd

from ludwig.api import LudwigModel


def main():
    folder_name = "testeste"

    train_df = pd.read_csv(folder_name + '/train.csv')
    print(train_df)
    predict_df = pd.read_csv(folder_name + '/predict.csv')

    model_definition = {
        'input_features': [
            {'name': 'image_path', 'type': 'image', 'encoder': 'stacked_cnn'}
        ],
        'output_features': [
            {'name': 'class', 'type': 'binary'}
        ]
    }

    model = LudwigModel(model_definition)
    train_stats = model.train(data_df=train_df)

    # model = LudwigModel.load("trainedModel")
    predictions = model.predict(data_df=predict_df)

    # predictions.to_numpy() would expose the raw boolean values, one per row

    print("=========================PREDICTION 1=========================")
    print(predictions.to_string())

    model.close()


if __name__ == '__main__':
    main()
Example 6
#!/usr/bin/env python
# coding: utf-8

# # Simple Model Training Example
#
# This example is the API example for this Ludwig command line example
# (https://uber.github.io/ludwig/examples/#kaggles-titanic-predicting-survivors).

# Import required libraries

from ludwig.api import LudwigModel
import logging
import shutil

# clean out prior results
shutil.rmtree('./results', ignore_errors=True)

# Define Ludwig model object that drives model training
model = LudwigModel(model_definition_file='./model1_definition.yaml',
                    logging_level=logging.INFO)

# initiate model training
train_stats = model.train(data_csv='./data/train.csv',
                          experiment_name='simple_experiment',
                          model_name='simple_model')

model.close()
Example 7
def predict_cli(model_path: str,
                dataset: Union[str, dict, pd.DataFrame] = None,
                data_format: str = None,
                split: str = FULL,
                batch_size: int = 128,
                skip_save_unprocessed_output: bool = False,
                skip_save_predictions: bool = False,
                output_directory: str = 'results',
                gpus: Union[str, int, List[int]] = None,
                gpu_memory_limit: int = None,
                allow_parallel_threads: bool = True,
                callbacks: List[Callback] = None,
                backend: Union[Backend, str] = None,
                logging_level: int = logging.INFO,
                debug: bool = False,
                **kwargs) -> None:
    """
    Loads pre-trained model to make predictions on the provided data set.

    # Inputs

    :param model_path: (str) filepath to pre-trained model.
    :param dataset: (Union[str, dict, pandas.DataFrame], default: `None`)
        source containing the entire dataset to be used in the prediction.
    :param data_format: (str, default: `None`) format to interpret data
        sources. Will be inferred automatically if not specified.  Valid
        formats are `'auto'`, `'csv'`, `'excel'`, `'feather'`,
        `'fwf'`, `'hdf5'` (cache file produced during previous training),
        `'html'` (file containing a single HTML `<table>`), `'json'`, `'jsonl'`,
        `'parquet'`, `'pickle'` (pickled Pandas DataFrame), `'sas'`, `'spss'`,
        `'stata'`, `'tsv'`.
    :param split: (str, default: `full`) split on which
        to perform predictions. Valid values are `'training'`, `'validation'`,
        `'test'` and `'full'`.
    :param batch_size: (int, default `128`) size of batches for processing.
    :param skip_save_unprocessed_output: (bool, default: `False`) by default
        predictions and their probabilities are saved in both raw
        unprocessed numpy files containing tensors and as postprocessed
        CSV files (one for each output feature). If this parameter is True,
        only the CSV ones are saved and the numpy ones are skipped.
    :param skip_save_predictions: (bool, default: `False`) skips saving test
        predictions CSV files
    :param output_directory: (str, default: `'results'`) the directory that
        will contain the training statistics, TensorBoard logs, the saved
        model and the training progress files.
    :param gpus: (list, default: `None`) list of GPUs that are available
        for training.
    :param gpu_memory_limit: (int, default: `None`) maximum memory in MB to
        allocate per GPU device.
    :param allow_parallel_threads: (bool, default: `True`) allow TensorFlow
        to use multithreading parallelism to improve performance at
        the cost of determinism.
    :param callbacks: (list, default: `None`) a list of
        `ludwig.callbacks.Callback` objects that provide hooks into the
        Ludwig pipeline.
    :param backend: (Union[Backend, str]) `Backend` or string name
        of backend to use to execute preprocessing / training steps.
    :param logging_level: (int) Log level that will be sent to stderr.
    :param debug: (bool, default: `False`) if `True` turns on `tfdbg` with
        `inf_or_nan` checks.
    :param kwargs: additional keyword arguments.

    # Returns

    :return: (`None`)
    """
    model = LudwigModel.load(
        model_path,
        logging_level=logging_level,
        backend=backend,
        gpus=gpus,
        gpu_memory_limit=gpu_memory_limit,
        allow_parallel_threads=allow_parallel_threads,
        callbacks=callbacks,
    )
    model.predict(
        dataset=dataset,
        data_format=data_format,
        split=split,
        batch_size=batch_size,
        skip_save_unprocessed_output=skip_save_unprocessed_output,
        skip_save_predictions=skip_save_predictions,
        output_directory=output_directory,
        return_type='dict',
        debug=debug,
    )
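
A hedged usage sketch for predict_cli; the model path and CSV file below are illustrative, not taken from the source:

# hypothetical invocation: load a saved model and write predictions to disk
predict_cli(
    model_path='./results/experiment_run/model',
    dataset='./data/new_examples.csv',
    output_directory='./predictions',
)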
Example 8
def run_experiment(
    config,
    parameters=None,
    dataset=None,
    training_set=None,
    validation_set=None,
    test_set=None,
    training_set_metadata=None,
    data_format=None,
    experiment_name="hyperopt",
    model_name="run",
    # model_load_path=None,
    model_resume_path=None,
    eval_split=VALIDATION,
    skip_save_training_description=False,
    skip_save_training_statistics=False,
    skip_save_model=False,
    skip_save_progress=False,
    skip_save_log=False,
    skip_save_processed_input=False,
    skip_save_unprocessed_output=False,
    skip_save_predictions=False,
    skip_save_eval_stats=False,
    output_directory="results",
    gpus=None,
    gpu_memory_limit=None,
    allow_parallel_threads=True,
    callbacks=None,
    backend=None,
    random_seed=default_random_seed,
    debug=False,
    **kwargs,
):
    for callback in callbacks or []:
        callback.on_hyperopt_trial_start(parameters)

    # Collect training and validation losses and metrics
    # & append it to `results`
    model = LudwigModel(
        config=config,
        backend=backend,
        gpus=gpus,
        gpu_memory_limit=gpu_memory_limit,
        allow_parallel_threads=allow_parallel_threads,
        callbacks=callbacks,
    )
    eval_stats, train_stats, _, _ = model.experiment(
        dataset=dataset,
        training_set=training_set,
        validation_set=validation_set,
        test_set=test_set,
        training_set_metadata=training_set_metadata,
        data_format=data_format,
        experiment_name=experiment_name,
        model_name=model_name,
        # model_load_path=model_load_path,
        model_resume_path=model_resume_path,
        eval_split=eval_split,
        skip_save_training_description=skip_save_training_description,
        skip_save_training_statistics=skip_save_training_statistics,
        skip_save_model=skip_save_model,
        skip_save_progress=skip_save_progress,
        skip_save_log=skip_save_log,
        skip_save_processed_input=skip_save_processed_input,
        skip_save_unprocessed_output=skip_save_unprocessed_output,
        skip_save_predictions=skip_save_predictions,
        skip_save_eval_stats=skip_save_eval_stats,
        output_directory=output_directory,
        skip_collect_predictions=True,
        skip_collect_overall_stats=False,
        random_seed=random_seed,
        debug=debug,
    )

    for callback in callbacks or []:
        callback.on_hyperopt_trial_end(parameters)

    return train_stats, eval_stats
Example 9
def test_binary_predictions(tmpdir, distinct_values):
    input_features = [
        category_feature(vocab_size=3),
    ]

    feature = binary_feature()
    output_features = [
        feature,
    ]

    data_csv_path = generate_data(
        input_features,
        output_features,
        os.path.join(tmpdir, 'dataset.csv'),
    )
    data_df = pd.read_csv(data_csv_path)

    # Optionally convert bool values to strings, e.g., {'Yes', 'No'}
    false_value, true_value = distinct_values
    data_df[feature[NAME]] = data_df[feature[NAME]].map(lambda x: true_value
                                                        if x else false_value)

    config = {
        'input_features': input_features,
        'output_features': output_features,
        'training': {
            'epochs': 1
        }
    }
    ludwig_model = LudwigModel(config)
    _, _, output_directory = ludwig_model.train(
        dataset=data_df,
        output_directory=os.path.join(tmpdir, 'output'),
    )

    # Check that metadata JSON saves and loads correctly
    ludwig_model = LudwigModel.load(os.path.join(output_directory, 'model'))

    # Produce an even mix of True and False predictions, as the model may be biased towards
    # one direction without training
    def random_logits(*args, **kwargs):
        return tf.convert_to_tensor(
            np.random.uniform(low=-1.0, high=1.0, size=(len(data_df), )))

    with mock.patch(
            'ludwig.features.binary_feature.BinaryOutputFeature.logits',
            random_logits):
        preds_df, _ = ludwig_model.predict(dataset=data_csv_path)

    cols = set(preds_df.columns)
    assert f'{feature[NAME]}_predictions' in cols
    assert f'{feature[NAME]}_probabilities_{str(false_value)}' in cols
    assert f'{feature[NAME]}_probabilities_{str(true_value)}' in cols
    assert f'{feature[NAME]}_probability' in cols

    for pred, prob_0, prob_1, prob in zip(
            preds_df[f'{feature[NAME]}_predictions'],
            preds_df[f'{feature[NAME]}_probabilities_{str(false_value)}'],
            preds_df[f'{feature[NAME]}_probabilities_{str(true_value)}'],
            preds_df[f'{feature[NAME]}_probability'],
    ):
        assert pred == false_value or pred == true_value
        if pred == true_value:
            assert prob_1 == prob
        else:
            assert prob_0 == prob
        assert prob_0 == 1 - prob_1
Example 10
def test_savedmodel(csv_filename, should_load_model):
    #######
    # Setup
    #######
    with tempfile.TemporaryDirectory() as tmpdir:
        dir_path = tmpdir
        data_csv_path = os.path.join(tmpdir, csv_filename)
        image_dest_folder = os.path.join(tmpdir, 'generated_images')
        audio_dest_folder = os.path.join(tmpdir, 'generated_audio')

        # Single sequence input, single category output
        input_features = [
            binary_feature(),
            numerical_feature(),
            category_feature(vocab_size=3),
            sequence_feature(vocab_size=3),
            text_feature(vocab_size=3),
            vector_feature(),
            image_feature(image_dest_folder),
            audio_feature(audio_dest_folder),
            timeseries_feature(),
            date_feature(),
            h3_feature(),
            set_feature(vocab_size=3),
            bag_feature(vocab_size=3),
        ]

        output_features = [
            category_feature(vocab_size=3),
            binary_feature(),
            numerical_feature(),
            sequence_feature(vocab_size=3),
            text_feature(vocab_size=3),
            set_feature(vocab_size=3),
            vector_feature()
        ]

        predictions_column_name = '{}_predictions'.format(
            output_features[0]['name'])

        # Generate test data
        data_csv_path = generate_data(input_features, output_features,
                                      data_csv_path)

        #############
        # Train model
        #############
        backend = LocalTestBackend()
        config = {
            'input_features': input_features,
            'output_features': output_features,
            'training': {
                'epochs': 2
            }
        }
        ludwig_model = LudwigModel(config, backend=backend)
        ludwig_model.train(
            dataset=data_csv_path,
            skip_save_training_description=True,
            skip_save_training_statistics=True,
            skip_save_model=True,
            skip_save_progress=True,
            skip_save_log=True,
            skip_save_processed_input=True,
        )

        ###################
        # save Ludwig model
        ###################
        ludwigmodel_path = os.path.join(dir_path, 'ludwigmodel')
        shutil.rmtree(ludwigmodel_path, ignore_errors=True)
        ludwig_model.save(ludwigmodel_path)

        ###################
        # load Ludwig model
        ###################
        if should_load_model:
            ludwig_model = LudwigModel.load(ludwigmodel_path, backend=backend)

        ##############################
        # collect weight tensors names
        ##############################
        original_predictions_df, _ = ludwig_model.predict(
            dataset=data_csv_path)
        original_weights = deepcopy(ludwig_model.model.trainable_variables)

        #################
        # save savedmodel
        #################
        savedmodel_path = os.path.join(dir_path, 'savedmodel')
        shutil.rmtree(savedmodel_path, ignore_errors=True)
        ludwig_model.model.save_savedmodel(savedmodel_path)

        ###################################################
        # load Ludwig model, obtain predictions and weights
        ###################################################
        ludwig_model = LudwigModel.load(ludwigmodel_path, backend=backend)
        loaded_prediction_df, _ = ludwig_model.predict(dataset=data_csv_path)
        loaded_weights = deepcopy(ludwig_model.model.trainable_variables)

        #################################################
        # restore savedmodel, obtain predictions and weights
        #################################################
        training_set_metadata_json_fp = os.path.join(
            ludwigmodel_path, TRAIN_SET_METADATA_FILE_NAME)

        dataset, training_set_metadata = preprocess_for_prediction(
            ludwig_model.config,
            dataset=data_csv_path,
            training_set_metadata=training_set_metadata_json_fp,
            backend=backend,
        )

        restored_model = tf.saved_model.load(savedmodel_path)

        # Check the outputs for one of the features for correctness
        # Here we choose the first output feature (categorical)
        of_name = list(ludwig_model.model.output_features.keys())[0]

        data_to_predict = {
            name: tf.convert_to_tensor(dataset.dataset[feature.proc_column],
                                       dtype=feature.get_input_dtype())
            for name, feature in ludwig_model.model.input_features.items()
        }

        logits = restored_model(data_to_predict, False, None)

        restored_predictions = tf.argmax(logits[of_name]['logits'],
                                         -1,
                                         name='predictions_{}'.format(of_name))
        restored_predictions = tf.map_fn(
            lambda idx: training_set_metadata[of_name]['idx2str'][idx],
            restored_predictions,
            dtype=tf.string)

        restored_weights = deepcopy(restored_model.trainable_variables)

        #########
        # Cleanup
        #########
        shutil.rmtree(ludwigmodel_path, ignore_errors=True)
        shutil.rmtree(savedmodel_path, ignore_errors=True)

        ###############################################
        # Check if weights and predictions are the same
        ###############################################

        # check for same number of weights as original model
        assert len(original_weights) == len(loaded_weights)
        assert len(original_weights) == len(restored_weights)

        # check to ensure weight values match the original model
        loaded_weights_match = np.all([
            np.all(
                np.isclose(original_weights[i].numpy(),
                           loaded_weights[i].numpy()))
            for i in range(len(original_weights))
        ])

        original_weights = sorted(original_weights, key=lambda w: w.name)
        restored_weights = sorted(restored_weights, key=lambda w: w.name)

        restored_weights_match = np.all([
            np.all(
                np.isclose(original_weights[i].numpy(),
                           restored_weights[i].numpy()))
            for i in range(len(original_weights))
        ])

        assert loaded_weights_match and restored_weights_match

        # Are predictions identical to the original ones?
        loaded_predictions_match = np.all(
            original_predictions_df[predictions_column_name] ==
            loaded_prediction_df[predictions_column_name])

        restored_predictions_match = np.all(
            original_predictions_df[predictions_column_name] ==
            restored_predictions.numpy().astype('str'))

        assert loaded_predictions_match and restored_predictions_match
Example 11
import logging

from ludwig.api import LudwigModel
from ludwig.datasets import higgs

model = LudwigModel(
    config='medium_config.yaml',
    logging_level=logging.INFO,
)

higgs_df = higgs.load()
model.train(dataset=higgs_df,
            experiment_name='higgs_medium',
            model_name='higgs_tabnet_medium')
Example 12
def run_api_experiment_separated_datasets(input_features, output_features,
                                          data_csv):
    """Helper method to avoid code repetition in running an experiment.

    :param input_features: input schema
    :param output_features: output schema
    :param data_csv: path to data
    :return: None
    """
    config = {
        "input_features": input_features,
        "output_features": output_features,
        "combiner": {
            "type": "concat",
            "fc_size": 14
        },
        "training": {
            "epochs": 2
        },
    }

    model = LudwigModel(config)

    # Training with dataframe
    data_df = read_csv(data_csv)
    train_df = data_df.sample(frac=0.8)
    test_df = data_df.drop(train_df.index).sample(frac=0.5)
    validation_df = data_df.drop(train_df.index).drop(test_df.index)

    basename, ext = os.path.splitext(data_csv)
    train_fname = basename + ".train" + ext
    val_fname = basename + ".validation" + ext
    test_fname = basename + ".test" + ext
    output_dirs = []

    try:
        train_df.to_csv(train_fname)
        validation_df.to_csv(val_fname)
        test_df.to_csv(test_fname)

        # Training with csv
        _, _, output_dir = model.train(
            training_set=train_fname,
            skip_save_processed_input=True,
            skip_save_progress=True,
            skip_save_unprocessed_output=True,
        )
        output_dirs.append(output_dir)

        _, _, output_dir = model.train(
            training_set=train_fname,
            validation_set=val_fname,
            skip_save_processed_input=True,
            skip_save_progress=True,
            skip_save_unprocessed_output=True,
        )
        output_dirs.append(output_dir)

        _, _, output_dir = model.train(
            training_set=train_fname,
            validation_set=val_fname,
            test_set=test_fname,
            skip_save_processed_input=True,
            skip_save_progress=True,
            skip_save_unprocessed_output=True,
        )
        output_dirs.append(output_dir)

        _, output_dir = model.predict(dataset=test_fname)
        output_dirs.append(output_dir)

    finally:
        # Remove results/intermediate data saved to disk
        os.remove(train_fname)
        os.remove(val_fname)
        os.remove(test_fname)
        for output_dir in output_dirs:
            shutil.rmtree(output_dir, ignore_errors=True)

    output_dirs = []
    try:
        _, _, output_dir = model.train(
            training_set=train_df,
            skip_save_processed_input=True,
            skip_save_progress=True,
            skip_save_unprocessed_output=True,
        )
        output_dirs.append(output_dir)

        _, _, output_dir = model.train(
            training_set=train_df,
            validation_set=validation_df,
            skip_save_processed_input=True,
            skip_save_progress=True,
            skip_save_unprocessed_output=True,
        )
        output_dirs.append(output_dir)

        _, _, output_dir = model.train(
            training_set=train_df,
            validation_set=validation_df,
            test_set=test_df,
            skip_save_processed_input=True,
            skip_save_progress=True,
            skip_save_unprocessed_output=True,
        )
        output_dirs.append(output_dir)

        _, output_dir = model.predict(dataset=data_df)
        output_dirs.append(output_dir)

    finally:
        for output_dir in output_dirs:
            shutil.rmtree(output_dir, ignore_errors=True)
Example 13
def run_api_commands(
    input_features,
    output_features,
    data_csv,
    output_dir,
    skip_save_training_description=False,
    skip_save_training_statistics=False,
    skip_save_model=False,
    skip_save_progress=False,
    skip_save_log=False,
    skip_save_processed_input=False,
    skip_save_unprocessed_output=False,
    skip_save_predictions=False,
    skip_save_eval_stats=False,
    skip_collect_predictions=False,
    skip_collect_overall_stats=False,
):
    """Helper method to avoid code repetition in running an experiment.

    :param input_features: input schema
    :param output_features: output schema
    :param data_csv: path to data
    :return: None
    """
    config = {
        "input_features": input_features,
        "output_features": output_features,
        "combiner": {
            "type": "concat",
            "fc_size": 14
        },
        "training": {
            "epochs": 2
        },
    }

    model = LudwigModel(config)

    # Training with csv
    model.train(
        dataset=data_csv,
        skip_save_training_description=skip_save_training_description,
        skip_save_training_statistics=skip_save_training_statistics,
        skip_save_model=skip_save_model,
        skip_save_progress=skip_save_progress,
        skip_save_log=skip_save_log,
        skip_save_processed_input=skip_save_processed_input,
        output_directory=output_dir,
    )
    model.predict(
        dataset=data_csv,
        skip_save_unprocessed_output=skip_save_unprocessed_output,
        skip_save_predictions=skip_save_predictions,
        output_directory=output_dir,
    )
    model.evaluate(
        dataset=data_csv,
        skip_save_unprocessed_output=skip_save_unprocessed_output,
        skip_save_predictions=skip_save_predictions,
        skip_save_eval_stats=skip_save_eval_stats,
        collect_predictions=not skip_collect_predictions,
        collect_overall_stats=not skip_collect_overall_stats,
        output_directory=output_dir,
    )
    model.experiment(
        dataset=data_csv,
        skip_save_training_description=skip_save_training_description,
        skip_save_training_statistics=skip_save_training_statistics,
        skip_save_model=skip_save_model,
        skip_save_progress=skip_save_progress,
        skip_save_log=skip_save_log,
        skip_save_processed_input=skip_save_processed_input,
        skip_save_unprocessed_output=skip_save_unprocessed_output,
        skip_save_predictions=skip_save_predictions,
        skip_save_eval_stats=skip_save_eval_stats,
        skip_collect_predictions=skip_collect_predictions,
        skip_collect_overall_stats=skip_collect_overall_stats,
        output_directory=output_dir,
    )
Example 14
def test_api_training_determinism(csv_filename):
    with tempfile.TemporaryDirectory() as tmpdir:
        input_features = [sequence_feature(reduce_output="sum")]
        output_features = [category_feature(vocab_size=5, reduce_input="sum")]

        data_csv = generate_data(input_features, output_features, csv_filename)

        config = {
            "input_features": input_features,
            "output_features": output_features,
            "combiner": {
                "type": "concat",
                "fc_size": 14
            },
        }

        # Train the model 3 times:
        #
        # 1. seed x
        # 2. seed y
        # 3. seed x
        #
        # Check that models (1) and (3) produce the same weights,
        # but (1) and (2) do not
        rand_x = 42
        rand_y = 24

        model_1 = LudwigModel(config)
        model_1.train(dataset=data_csv,
                      output_directory=tmpdir,
                      random_seed=rand_x)

        model_2 = LudwigModel(config)
        model_2.train(dataset=data_csv,
                      output_directory=tmpdir,
                      random_seed=rand_y)

        model_3 = LudwigModel(config)
        model_3.train(dataset=data_csv,
                      output_directory=tmpdir,
                      random_seed=rand_x)

        model_weights_1 = get_weights(model_1.model)
        model_weights_2 = get_weights(model_2.model)
        model_weights_3 = get_weights(model_3.model)

        divergence = False
        for weight_1, weight_2 in zip(model_weights_1, model_weights_2):
            if not torch.allclose(weight_1, weight_2):
                divergence = True
                break
        assert divergence, "model_1 and model_2 have identical weights with different seeds!"

        for weight_1, weight_3 in zip(model_weights_1, model_weights_3):
            assert torch.allclose(weight_1, weight_3)
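
get_weights is a helper defined elsewhere in the test module; a minimal sketch, assuming the Ludwig model wraps a standard torch.nn.Module:

def get_weights(model):
    # snapshot each parameter tensor so later training cannot mutate it
    return [param.data.clone() for param in model.parameters()]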
Example 15
"""Sample ludwig training code for training an NMT model (en -> fr) on WMT15 (https://www.statmt.org/wmt15/).

The dataset is rather large (8GB), which can take several minutes to preprocess.
"""

import logging
import shutil

from ludwig.api import LudwigModel
from ludwig.datasets import wmt15

# clean out prior results
shutil.rmtree("./results", ignore_errors=True)

# Download and prepare the dataset
training_set = wmt15.load()

model = LudwigModel(config="./config_small.yaml", logging_level=logging.INFO)

(
    train_stats,  # dictionary containing training statistics
    preprocessed_data,  # tuple Ludwig Dataset objects of pre-processed training data
    output_directory,  # location of training results stored on disk
) = model.train(dataset=training_set, experiment_name="simple_experiment", model_name="simple_model")
Example 16
from flask import Flask, request, jsonify  # loading in Flask
from ludwig.api import LudwigModel  # loading in Ludwig
import pandas as pd  # loading pandas for reading csv

# creating a Flask application
app = Flask(__name__)

# Load the model
model = LudwigModel.load('model')


# creating predict url and only allowing post requests.
@app.route('/predict', methods=['POST'])
def predict():
    # Get data from Post request
    data = request.get_json()
    # Build a single-row DataFrame from the request payload
    df = pd.DataFrame([str(data['text'])], columns=['content'])
    print(df.head())
    # making predictions
    pred = model.predict(dataset=df, data_format='df')
    print(pred)
    # returning the predictions as json
    return jsonify(pred['airline_sentiment_predictions'][0])


if __name__ == '__main__':
    app.run(port=3000, debug=True)
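
To exercise the endpoint, one could POST a JSON payload whose 'text' key matches what the handler reads; a sketch using the requests library (the URL assumes the port configured above):

import requests

response = requests.post(
    'http://localhost:3000/predict',
    json={'text': 'The flight was delayed for three hours.'},
)
print(response.json())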
Example 17
def collect_activations(model_path: str,
                        layers: List[str],
                        dataset: str,
                        data_format: str = None,
                        split: str = FULL,
                        batch_size: int = 128,
                        output_directory: str = 'results',
                        gpus: List[str] = None,
                        gpu_memory_limit: int = None,
                        allow_parallel_threads: bool = True,
                        backend: Union[Backend, str] = None,
                        debug: bool = False,
                        **kwargs) -> List[str]:
    """
    Uses the pretrained model to collect the tensors corresponding to a
    datapoint in the dataset. Saves the tensors to the experiment directory.

    # Inputs

    :param model_path: (str) filepath to pre-trained model.
    :param layers: (List[str]) list of strings for layer names in the model
        to collect activations.
    :param dataset: (str) source
        containing the data to make predictions.
    :param data_format: (str, default: `None`) format to interpret data
        sources. Will be inferred automatically if not specified.  Valid
        formats are `'auto'`, `'csv'`, `'excel'`, `'feather'`,
        `'fwf'`, `'hdf5'` (cache file produced during previous training),
        `'html'` (file containing a single HTML `<table>`), `'json'`, `'jsonl'`,
        `'parquet'`, `'pickle'` (pickled Pandas DataFrame), `'sas'`, `'spss'`,
        `'stata'`, `'tsv'`.
    :param split: (str, default: `full`) split on which
        to perform predictions. Valid values are `'training'`, `'validation'`,
        `'test'` and `'full'`.
    :param batch_size: (int, default `128`) size of batches for processing.
    :param output_directory: (str, default: `'results'`) the directory that
        will contain the training statistics, TensorBoard logs, the saved
        model and the training progress files.
    :param gpus: (list, default: `None`) list of GPUs that are available
        for training.
    :param gpu_memory_limit: (int, default: `None`) maximum memory in MB to
        allocate per GPU device.
    :param allow_parallel_threads: (bool, default: `True`) allow TensorFlow
        to use multithreading parallelism to improve performance at
        the cost of determinism.
    :param backend: (Union[Backend, str]) `Backend` or string name
        of backend to use to execute preprocessing / training steps.
    :param debug: (bool, default: `False`) if `True` turns on `tfdbg` with
        `inf_or_nan` checks.

    # Return

    :return: (List[str]) list of filepath to `*.npy` files containing
        the activations.
    """
    logger.info('Dataset path: {}'.format(dataset))
    logger.info('Model path: {}'.format(model_path))
    logger.info('Output path: {}'.format(output_directory))
    logger.info('\n')

    model = LudwigModel.load(model_path,
                             gpus=gpus,
                             gpu_memory_limit=gpu_memory_limit,
                             allow_parallel_threads=allow_parallel_threads,
                             backend=backend)

    # collect activations
    print_boxed('COLLECT ACTIVATIONS')
    collected_tensors = model.collect_activations(layers,
                                                  dataset,
                                                  data_format=data_format,
                                                  split=split,
                                                  batch_size=batch_size,
                                                  debug=debug)

    # saving
    os.makedirs(output_directory, exist_ok=True)
    saved_filenames = save_tensors(collected_tensors, output_directory)

    logger.info('Saved to: {0}'.format(output_directory))
    return saved_filenames
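
A hedged usage sketch for collect_activations; the layer name depends entirely on the trained model's architecture and is hypothetical here:

npy_paths = collect_activations(
    model_path='./results/experiment_run/model',
    layers=['utterance_fc'],  # layer name is illustrative
    dataset='./data/examples.csv',
    output_directory='./activations',
)
print(npy_paths)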
Example 18
def experiment_cli(config: dict,
                   config_file: str = None,
                   dataset: Union[str, dict, pd.DataFrame] = None,
                   training_set: Union[str, dict, pd.DataFrame] = None,
                   validation_set: Union[str, dict, pd.DataFrame] = None,
                   test_set: Union[str, dict, pd.DataFrame] = None,
                   training_set_metadata: Union[str, dict] = None,
                   data_format: str = None,
                   experiment_name: str = 'experiment',
                   model_name: str = 'run',
                   model_load_path: str = None,
                   model_resume_path: str = None,
                   eval_split: str = TEST,
                   skip_save_training_description: bool = False,
                   skip_save_training_statistics: bool = False,
                   skip_save_model: bool = False,
                   skip_save_progress: bool = False,
                   skip_save_log: bool = False,
                   skip_save_processed_input: bool = False,
                   skip_save_unprocessed_output: bool = False,
                   skip_save_predictions: bool = False,
                   skip_save_eval_stats: bool = False,
                   skip_collect_predictions: bool = False,
                   skip_collect_overall_stats: bool = False,
                   output_directory: str = 'results',
                   gpus: Union[str, int, List[int]] = None,
                   gpu_memory_limit: int = None,
                   allow_parallel_threads: bool = True,
                   backend: Union[Backend, str] = None,
                   random_seed: int = default_random_seed,
                   debug: bool = False,
                   logging_level: int = logging.INFO,
                   **kwargs):
    """Trains a model on a dataset's training and validation splits and
    uses it to predict on the test split.
    It saves the trained model and the statistics of training and testing.

    # Inputs

    :param config: (dict) config which defines the different
        parameters of the model, features, preprocessing and training.
    :param config_file: (str, default: `None`) the filepath string
        that specifies the config.  It is a yaml file.
    :param dataset: (Union[str, dict, pandas.DataFrame], default: `None`)
        source containing the entire dataset to be used in the experiment.
        If it has a split column, it will be used for splitting (0 for train,
        1 for validation, 2 for test), otherwise the dataset will be
        randomly split.
    :param training_set: (Union[str, dict, pandas.DataFrame], default: `None`)
        source containing training data.
    :param validation_set: (Union[str, dict, pandas.DataFrame], default: `None`)
        source containing validation data.
    :param test_set: (Union[str, dict, pandas.DataFrame], default: `None`)
        source containing test data.
    :param training_set_metadata: (Union[str, dict], default: `None`)
        metadata JSON file or loaded metadata.  Intermediate preprocessed
        structure containing the mappings of the input
        dataset created the first time an input file is used in the same
        directory with the same name and a '.meta.json' extension.
    :param data_format: (str, default: `None`) format to interpret data
        sources. Will be inferred automatically if not specified.  Valid
        formats are `'auto'`, `'csv'`, `'excel'`, `'feather'`,
        `'fwf'`, `'hdf5'` (cache file produced during previous training),
        `'html'` (file containing a single HTML `<table>`), `'json'`, `'jsonl'`,
        `'parquet'`, `'pickle'` (pickled Pandas DataFrame), `'sas'`, `'spss'`,
        `'stata'`, `'tsv'`.
    :param experiment_name: (str, default: `'experiment'`) name for
        the experiment.
    :param model_name: (str, default: `'run'`) name of the model that is
        being used.
    :param model_load_path: (str, default: `None`) if this is specified the
        loaded model will be used as initialization
        (useful for transfer learning).
    :param model_resume_path: (str, default: `None`) resumes training of
        the model from the path specified. The config is restored.
        In addition to config, training statistics, loss for each
        epoch and the state of the optimizer are restored such that
        training can be effectively continued from a previously interrupted
        training process.
    :param eval_split: (str, default: `test`) split on which
        to perform evaluation. Valid values are `training`, `validation`
        and `test`.
    :param skip_save_training_description: (bool, default: `False`) disables
        saving the description JSON file.
    :param skip_save_training_statistics: (bool, default: `False`) disables
        saving training statistics JSON file.
    :param skip_save_model: (bool, default: `False`) disables
        saving model weights and hyperparameters each time the model
        improves. By default Ludwig saves model weights after each epoch
        the validation metric improves, but if the model is really big
        that can be time consuming. If you do not want to keep
        the weights and just find out what performance a model can get
        with a set of hyperparameters, use this parameter to skip it,
        but the model will not be loadable later on and the returned model
        will have the weights obtained at the end of training, instead of
        the weights of the epoch with the best validation performance.
    :param skip_save_progress: (bool, default: `False`) disables saving
        progress each epoch. By default Ludwig saves weights and stats
        after each epoch for enabling resuming of training, but if
        the model is really big that can be time consuming and will use
        twice as much space, use this parameter to skip it, but training
        cannot be resumed later on.
    :param skip_save_log: (bool, default: `False`) disables saving
        TensorBoard logs. By default Ludwig saves logs for the TensorBoard,
        but if it is not needed turning it off can slightly increase the
        overall speed.
    :param skip_save_processed_input: (bool, default: `False`) if input
        dataset is provided it is preprocessed and cached by saving an HDF5
        and JSON files to avoid running the preprocessing again. If this
        parameter is `False`, the HDF5 and JSON file are not saved.
    :param skip_save_unprocessed_output: (bool, default: `False`) by default
        predictions and their probabilities are saved in both raw
        unprocessed numpy files containing tensors and as postprocessed
        CSV files (one for each output feature). If this parameter is True,
        only the CSV ones are saved and the numpy ones are skipped.
    :param skip_save_predictions: (bool, default: `False`) skips saving test
        predictions CSV files
    :param skip_save_eval_stats: (bool, default: `False`) skips saving test
        statistics JSON file
    :param skip_collect_predictions: (bool, default: `False`) skips
        collecting post-processed predictions during eval.
    :param skip_collect_overall_stats: (bool, default: `False`) skips
        collecting overall stats during eval.
    :param output_directory: (str, default: `'results'`) the directory that
        will contain the training statistics, TensorBoard logs, the saved
        model and the training progress files.
    :param gpus: (list, default: `None`) list of GPUs that are available
        for training.
    :param gpu_memory_limit: (int, default: `None`) maximum memory in MB to
        allocate per GPU device.
    :param allow_parallel_threads: (bool, default: `True`) allow TensorFlow
        to use multithreading parallelism to improve performance at
        the cost of determinism.
    :param backend: (Union[Backend, str]) `Backend` or string name
        of backend to use to execute preprocessing / training steps.
    :param random_seed: (int: default: 42) random seed used for weights
        initialization, splits and any other random function.
    :param debug: (bool, default: `False`) if `True` turns on `tfdbg` with
        `inf_or_nan` checks.
    :param logging_level: (int) Log level that will be sent to stderr.

    # Return
    :return: (Tuple[LudwigModel, dict, dict, tuple, str])
        `(model, evaluation_statistics, training_statistics,
        preprocessed_data, output_directory)`, where `model` is the
        LudwigModel instance, `evaluation_statistics` is a dictionary with
        evaluation performance statistics on the test set,
        `training_statistics` is a dictionary of training statistics for
        each output feature containing loss and metrics values for each
        epoch, `preprocessed_data` is a tuple containing the preprocessed
        `(training_set, validation_set, test_set)`, and `output_directory`
        is the filepath string to where results are stored.

    """
    backend = initialize_backend(backend)

    config = check_which_config(config, config_file)

    if model_load_path:
        model = LudwigModel.load(model_load_path)
    else:
        model = LudwigModel(
            config=config,
            logging_level=logging_level,
            backend=backend,
            gpus=gpus,
            gpu_memory_limit=gpu_memory_limit,
            allow_parallel_threads=allow_parallel_threads,
        )
    (eval_stats, train_stats, preprocessed_data,
     output_directory) = model.experiment(
         dataset=dataset,
         training_set=training_set,
         validation_set=validation_set,
         test_set=test_set,
         training_set_metadata=training_set_metadata,
         data_format=data_format,
         experiment_name=experiment_name,
         model_name=model_name,
         model_resume_path=model_resume_path,
         eval_split=eval_split,
         skip_save_training_description=skip_save_training_description,
         skip_save_training_statistics=skip_save_training_statistics,
         skip_save_model=skip_save_model,
         skip_save_progress=skip_save_progress,
         skip_save_log=skip_save_log,
         skip_save_processed_input=skip_save_processed_input,
         skip_save_unprocessed_output=skip_save_unprocessed_output,
         skip_save_predictions=skip_save_predictions,
         skip_save_eval_stats=skip_save_eval_stats,
         skip_collect_predictions=skip_collect_predictions,
         skip_collect_overall_stats=skip_collect_overall_stats,
         output_directory=output_directory,
         random_seed=random_seed,
         debug=debug,
     )

    return model, eval_stats, train_stats, preprocessed_data, output_directory
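
A hedged end-to-end sketch of calling experiment_cli with an in-memory config; the feature names and dataset path are illustrative:

config = {
    'input_features': [{'name': 'text', 'type': 'text'}],
    'output_features': [{'name': 'label', 'type': 'category'}],
}
model, eval_stats, train_stats, _, output_dir = experiment_cli(
    config,
    dataset='./data/reviews.csv',  # hypothetical dataset path
)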
Example 19
import logging
import shutil

import yaml

from ludwig.api import LudwigModel

# clean out prior results
shutil.rmtree('./results', ignore_errors=True)


# set up Python dictionary to hold model training parameters
with open('./config.yaml', 'r') as f:
    config = yaml.safe_load(f.read())

# Define Ludwig model object that drives model training
model = LudwigModel(config,
                    logging_level=logging.INFO)


# initiate model training
(
    train_stats,  # training statistics
    _,
    output_directory  # location for training results saved to disk
) = model.train(
    training_set='./data/mnist_dataset_training.csv',
    test_set='./data/mnist_dataset_testing.csv',
    experiment_name='simple_image_experiment',
    model_name='single_model',
    skip_save_processed_input=True
)
Example 20
from ludwig.api import LudwigModel

model = LudwigModel(model_definition_file='model_definition.yaml')
train_stats = model.train(data_csv='training_dataframe.csv')

# obtain predictions
predictions, test_stats = model.test(data_csv='test_dataframe.csv')

print(predictions)
print('===========================')
print(test_stats)

# closing model
model.close()
Example 21
 def __init__(self, data_root, backend):
     self.ludwig_model = LudwigModel.load(data_root, backend=backend)
Example 22
    def train(self):
        training_dataframe, model_definition, timeseries_cols = self._create_ludwig_dataframe(
            'train')

        if len(timeseries_cols) > 0:
            training_dataframe, model_definition = self._translate_df_to_timeseries_format(
                training_dataframe, model_definition, timeseries_cols, 'train')

        with disable_console_output(True):
            # <---- Ludwig currently broken, since the model can't be
            # initialized without train_set_metadata and train_set_metadata
            # can't be obtained without running train... see this issue for
            # updates: https://github.com/uber/ludwig/issues/295
            #model.initialize_model(train_set_metadata={})
            #train_stats = model.train_online(data_df=training_dataframe) # ??Where to add model_name?? ----> model_name=self.transaction.lmd['name']

            ludwig_save_is_working = False

            if not ludwig_save_is_working:
                shutil.rmtree('results', ignore_errors=True)

            if self.transaction.lmd['rebuild_model'] is True:
                model = LudwigModel(model_definition)
                merged_model_definition = model.model_definition
                train_set_metadata = build_metadata(
                    training_dataframe,
                    (merged_model_definition['input_features'] +
                     merged_model_definition['output_features']),
                    merged_model_definition['preprocessing'])
                model.initialize_model(train_set_metadata=train_set_metadata,
                                       gpus=self.get_useable_gpus())

                train_stats = model.train(
                    data_df=training_dataframe,
                    model_name=self.transaction.lmd['name'],
                    skip_save_model=ludwig_save_is_working,
                    skip_save_progress=True,
                    gpus=self.get_useable_gpus())
            else:
                model = LudwigModel.load(model_dir=self.get_model_dir())
                train_stats = model.train(
                    data_df=training_dataframe,
                    model_name=self.transaction.lmd['name'],
                    skip_save_model=ludwig_save_is_working,
                    skip_save_progress=True,
                    gpus=self.get_useable_gpus())

            for k in train_stats['train']:
                if k not in self.transaction.lmd['model_accuracy']['train']:
                    self.transaction.lmd['model_accuracy']['train'][k] = []
                    self.transaction.lmd['model_accuracy']['test'][k] = []
                elif k != 'combined':
                    # We should be adding the accuracy here, but we only have
                    # it for 'combined', so use that for now; this only
                    # affects multi-output scenarios anyway.
                    pass
                else:
                    self.transaction.lmd['model_accuracy']['train'][k].extend(
                        train_stats['train'][k]['accuracy'])
                    self.transaction.lmd['model_accuracy']['test'][k].extend(
                        train_stats['test'][k]['accuracy'])
            '''
            @ TRAIN ONLINE BIT That's not working
            model = LudwigModel.load(self.transaction.lmd['ludwig_data']['ludwig_save_path'])
            for i in range(0,100):
                train_stats = model.train_online(data_df=training_dataframe)
                # The resulting train_stats are "None"... wonderful -_-
            '''

        ludwig_model_savepath = os.path.join(
            CONFIG.MINDSDB_STORAGE_PATH,
            self.transaction.lmd['name'] + '_ludwig_data')
        if ludwig_save_is_working:
            model.save(ludwig_model_savepath)
            model.close()
        else:
            shutil.rmtree(ludwig_model_savepath, ignore_errors=True)
            shutil.move(os.path.join('results',
                                     os.listdir('results')[0]),
                        ludwig_model_savepath)
        self.transaction.lmd['ludwig_data'] = {
            'ludwig_save_path': ludwig_model_savepath
        }
        self.transaction.hmd['ludwig_data'] = {
            'model_definition': model_definition
        }
Example 23
 def best_model(self) -> LudwigModel:
     return LudwigModel.load(os.path.join(self.path_to_best_model, "model"))
Example 24
def run_server(model_path, host, port):
    model = LudwigModel.load(model_path)
    app = server(model)
    uvicorn.run(app, host=host, port=port)
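
A hedged invocation of run_server; the model path is illustrative:

# hypothetical: serve a saved model over HTTP on port 8000
run_server('./results/experiment_run/model', host='0.0.0.0', port=8000)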
Example 25
def preprocess_cli(preprocessing_config: Union[str, dict] = None,
                   dataset: Union[str, dict, pd.DataFrame] = None,
                   training_set: Union[str, dict, pd.DataFrame] = None,
                   validation_set: Union[str, dict, pd.DataFrame] = None,
                   test_set: Union[str, dict, pd.DataFrame] = None,
                   training_set_metadata: Union[str, dict] = None,
                   data_format: str = None,
                   random_seed: int = default_random_seed,
                   logging_level: int = logging.INFO,
                   callbacks: List[Callback] = None,
                   backend: Union[Backend, str] = None,
                   **kwargs) -> None:
    """*train* defines the entire training procedure used by Ludwig's
    internals. Requires most of the parameters that are taken into the model.
    Builds a full ludwig model and performs the training.

    :param preprocessing_config: (Union[str, dict]) in-memory representation of
            config or string path to a YAML config file.
    :param dataset: (Union[str, dict, pandas.DataFrame], default: `None`)
        source containing the entire dataset to be used for training.
        If it has a split column, it will be used for splitting (0 for train,
        1 for validation, 2 for test), otherwise the dataset will be
        randomly split.
    :param training_set: (Union[str, dict, pandas.DataFrame], default: `None`)
        source containing training data.
    :param validation_set: (Union[str, dict, pandas.DataFrame], default: `None`)
        source containing validation data.
    :param test_set: (Union[str, dict, pandas.DataFrame], default: `None`)
        source containing test data.
    :param training_set_metadata: (Union[str, dict], default: `None`)
        metadata JSON file or loaded metadata.  Intermediate preprocessed
        structure containing the mappings of the input
        dataset created the first time an input file is used in the same
        directory with the same name and a '.meta.json' extension.
    :param data_format: (str, default: `None`) format to interpret data
        sources. Will be inferred automatically if not specified.  Valid
        formats are `'auto'`, `'csv'`, `'excel'`, `'feather'`,
        `'fwf'`, `'hdf5'` (cache file produced during previous training),
        `'html'` (file containing a single HTML `<table>`), `'json'`, `'jsonl'`,
        `'parquet'`, `'pickle'` (pickled Pandas DataFrame), `'sas'`, `'spss'`,
        `'stata'`, `'tsv'`.
    :param callbacks: (list, default: `None`) a list of
        `ludwig.callbacks.Callback` objects that provide hooks into the
        Ludwig pipeline.
    :param backend: (Union[Backend, str]) `Backend` or string name
        of backend to use to execute preprocessing / training steps.
    :param random_seed: (int, default: 42) random seed used for weights
        initialization, splits and any other random function.
    :param logging_level: (int) Log level that will be sent to stderr.

    # Return

    :return: (`None`)
    """
    model = LudwigModel(
        config=preprocessing_config,
        logging_level=logging_level,
        callbacks=callbacks,
        backend=backend,
    )
    model.preprocess(
        dataset=dataset,
        training_set=training_set,
        validation_set=validation_set,
        test_set=test_set,
        training_set_metadata=training_set_metadata,
        data_format=data_format,
        skip_save_processed_input=False,
        random_seed=random_seed,
    )
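A minimal usage sketch for preprocess_cli; the config and dataset paths are illustrative assumptions:

# run preprocessing only, without training (paths are assumptions)
preprocess_cli(
    preprocessing_config='config.yaml',
    dataset='./data/train.csv',
    random_seed=42,
)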
Example 26
]

# list to collect training statistics for each model variation
list_of_train_stats = []

# Train models
for model_option in list_of_fc_layers:
    print('>>>> training: ', model_option.name)

    # set up Python dictionary to hold model training parameters
    model_definition = base_model.copy()
    model_definition['input_features'][0]['fc_layers'] = model_option.fc_layers
    model_definition['training']['epochs'] = 8

    # Define Ludwig model object that drives model training
    model = LudwigModel(model_definition, logging_level=logging.INFO)

    # initiate model training
    train_stats = model.train(data_csv='./data/mnist_dataset_training.csv',
                              experiment_name='multiple_experiment',
                              model_name=model_option.name)

    # save training stats for later use
    list_of_train_stats.append(
        TrainingResult(name=model_option.name, train_stats=train_stats))

    print('>>>>>>> completed: ', model_option.name, '\n')

    model.close()

# generating learning curves from training
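The snippet ends at the point where the learning curves would be generated. A minimal sketch of that step, assuming Ludwig's learning_curves visualization utility and the list_of_train_stats collected in the loop above; output_directory and file_format are illustrative assumptions:

from ludwig.visualize import learning_curves

# plot training/validation curves for each trained variation
learning_curves(
    train_stats_per_model=[r.train_stats for r in list_of_train_stats],
    output_feature_name=None,
    model_names=[r.name for r in list_of_train_stats],
    output_directory='./visualizations',
    file_format='png',
)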
Example 27
    def execute(
        self,
        config,
        dataset=None,
        training_set=None,
        validation_set=None,
        test_set=None,
        training_set_metadata=None,
        data_format=None,
        experiment_name="hyperopt",
        model_name="run",
        # model_load_path=None,
        # model_resume_path=None,
        skip_save_training_description=False,
        skip_save_training_statistics=False,
        skip_save_model=False,
        skip_save_progress=False,
        skip_save_log=False,
        skip_save_processed_input=True,
        skip_save_unprocessed_output=False,
        skip_save_predictions=False,
        skip_save_eval_stats=False,
        output_directory="results",
        gpus=None,
        gpu_memory_limit=None,
        allow_parallel_threads=True,
        callbacks=None,
        backend=None,
        random_seed=default_random_seed,
        debug=False,
        **kwargs,
    ) -> HyperoptResults:
        trial_results = []
        trials = 0
        while not self.hyperopt_sampler.finished():
            sampled_parameters = self.hyperopt_sampler.sample_batch()
            metric_scores = []

            for i, parameters in enumerate(sampled_parameters):
                modified_config = substitute_parameters(copy.deepcopy(config), parameters)

                trial_id = trials + i

                model = LudwigModel(
                    config=modified_config,
                    backend=backend,
                    gpus=gpus,
                    gpu_memory_limit=gpu_memory_limit,
                    allow_parallel_threads=allow_parallel_threads,
                    callbacks=callbacks,
                )
                eval_stats, train_stats, _, _ = model.experiment(
                    dataset=dataset,
                    training_set=training_set,
                    validation_set=validation_set,
                    test_set=test_set,
                    training_set_metadata=training_set_metadata,
                    data_format=data_format,
                    experiment_name=f"{experiment_name}_{trial_id}",
                    model_name=model_name,
                    # model_load_path=model_load_path,
                    # model_resume_path=model_resume_path,
                    eval_split=self.split,
                    skip_save_training_description=skip_save_training_description,
                    skip_save_training_statistics=skip_save_training_statistics,
                    skip_save_model=skip_save_model,
                    skip_save_progress=skip_save_progress,
                    skip_save_log=skip_save_log,
                    skip_save_processed_input=skip_save_processed_input,
                    skip_save_unprocessed_output=skip_save_unprocessed_output,
                    skip_save_predictions=skip_save_predictions,
                    skip_save_eval_stats=skip_save_eval_stats,
                    output_directory=output_directory,
                    skip_collect_predictions=True,
                    skip_collect_overall_stats=False,
                    random_seed=random_seed,
                    debug=debug,
                )
                metric_score = self.get_metric_score(train_stats)
                metric_scores.append(metric_score)

                trial_results.append(
                    TrialResults(
                        parameters=parameters,
                        metric_score=metric_score,
                        training_stats=train_stats,
                        eval_stats=eval_stats,
                    )
                )
            trials += len(sampled_parameters)

            self.hyperopt_sampler.update_batch(zip(sampled_parameters, metric_scores))

        ordered_trials = self.sort_hyperopt_results(trial_results)
        return HyperoptResults(ordered_trials=ordered_trials)
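Each trial trains on a copy of the base config rewritten with the sampled values. A sketch of what that substitution does for dotted parameter names such as 'training.learning_rate' (a hypothetical stand-in, not Ludwig's actual substitute_parameters implementation):

import copy

def substitute_parameters_sketch(config: dict, parameters: dict) -> dict:
    # walk each dotted name into the nested config and overwrite the leaf
    config = copy.deepcopy(config)
    for dotted_name, value in parameters.items():
        node = config
        *path, leaf = dotted_name.split('.')
        for key in path:
            node = node.setdefault(key, {})
        node[leaf] = value
    return config

# e.g. {'training.learning_rate': 0.001} sets config['training']['learning_rate']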
Example 28
def test_model_save_reload_api(csv_filename, tmp_path):
    tf.random.set_seed(1234)

    image_dest_folder = os.path.join(os.getcwd(), 'generated_images')
    audio_dest_folder = os.path.join(os.getcwd(), 'generated_audio')

    input_features = [
        binary_feature(),
        numerical_feature(),
        category_feature(vocab_size=3),
        sequence_feature(vocab_size=3),
        text_feature(vocab_size=3, encoder='rnn', cell_type='lstm',
                     num_layers=2, bidirectional=True),
        vector_feature(),
        image_feature(image_dest_folder),
        audio_feature(audio_dest_folder, encoder='stacked_cnn'),
        timeseries_feature(encoder='parallel_cnn'),
        sequence_feature(vocab_size=3, encoder='stacked_parallel_cnn'),
        date_feature(),
        h3_feature(),
        set_feature(vocab_size=3),
        bag_feature(vocab_size=3),
    ]

    output_features = [
        binary_feature(),
        numerical_feature(),
        category_feature(vocab_size=3),
        sequence_feature(vocab_size=3),
        text_feature(vocab_size=3),
        set_feature(vocab_size=3),
        vector_feature(),
    ]

    # Generate test data
    data_csv_path = generate_data(input_features, output_features,
                                  csv_filename)

    #############
    # Train model
    #############
    config = {
        'input_features': input_features,
        'output_features': output_features,
        'training': {'epochs': 2}
    }

    data_df = read_csv(data_csv_path)
    data_df[SPLIT] = get_split(data_df)
    training_set, test_set, validation_set = split_dataset_ttv(
        data_df,
        SPLIT
    )
    training_set = pd.DataFrame(training_set)
    validation_set = pd.DataFrame(validation_set)
    test_set = pd.DataFrame(test_set)

    # create sub-directory to store results
    results_dir = tmp_path / 'results'
    results_dir.mkdir()

    # perform initial model training
    backend = LocalTestBackend()
    ludwig_model1 = LudwigModel(config, backend=backend)
    _, _, output_dir = ludwig_model1.train(
        training_set=training_set,
        validation_set=validation_set,
        test_set=test_set,
        output_directory=str(results_dir)
    )

    preds_1, _ = ludwig_model1.predict(dataset=validation_set)

    def check_model_equal(ludwig_model2):
        # Compare model predictions
        preds_2, _ = ludwig_model2.predict(dataset=validation_set)
        assert set(preds_1.keys()) == set(preds_2.keys())
        for key in preds_1:
            assert preds_1[key].dtype == preds_2[key].dtype, key
            assert all(a == b for a, b in zip(preds_1[key], preds_2[key])), key

        # Compare model weights
        # (must be done after the predict calls because of TF2 lazy restoration)
        for if_name in ludwig_model1.model.input_features:
            if1 = ludwig_model1.model.input_features[if_name]
            if2 = ludwig_model2.model.input_features[if_name]
            for if1_w, if2_w in zip(if1.encoder_obj.weights,
                                    if2.encoder_obj.weights):
                assert np.allclose(if1_w.numpy(), if2_w.numpy())

        c1 = ludwig_model1.model.combiner
        c2 = ludwig_model2.model.combiner
        for c1_w, c2_w in zip(c1.weights, c2.weights):
            assert np.allclose(c1_w.numpy(), c2_w.numpy())

        for of_name in ludwig_model1.model.output_features:
            of1 = ludwig_model1.model.output_features[of_name]
            of2 = ludwig_model2.model.output_features[of_name]
            for of1_w, of2_w in zip(of1.decoder_obj.weights,
                                    of2.decoder_obj.weights):
                assert np.allclose(of1_w.numpy(), of2_w.numpy())

    # Test saving and loading the model explicitly
    with tempfile.TemporaryDirectory() as tmpdir:
        ludwig_model1.save(tmpdir)
        ludwig_model_loaded = LudwigModel.load(tmpdir, backend=backend)
        check_model_equal(ludwig_model_loaded)

    # Test loading the model from the experiment directory
    ludwig_model_exp = LudwigModel.load(
        os.path.join(output_dir, 'model'),
        backend=backend
    )
    check_model_equal(ludwig_model_exp)
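Note that np.all over a generator expression does not iterate it: NumPy wraps the generator in a 0-d object array, which is truthy. Element-wise comparisons like the one in check_model_equal therefore need the built-in all() or an explicit array comparison. A small demonstration:

import numpy as np

# np.all wraps a generator in a 0-d object array, which is truthy,
# so this assertion passes even though the elements differ
assert np.all(a == b for a, b in zip([1, 2], [9, 9]))

# iterating the comparison explicitly catches the mismatch
assert not all(a == b for a, b in zip([1, 2], [9, 9]))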
Example 29
    def best_model(self) -> LudwigModel:
        return LudwigModel.load(self.path_to_best_model)
Example 30
def test_experiment_image_dataset(train_format, train_in_memory, test_format,
                                  test_in_memory):
    # the primary focus of this test is to determine whether exceptions are
    # raised for the different dataset formats and in_memory settings
    # Image Inputs
    image_dest_folder = os.path.join(os.getcwd(), 'generated_images')

    input_features = [
        image_feature(folder=image_dest_folder,
                      encoder='stacked_cnn',
                      preprocessing={
                          'in_memory': True,
                          'height': 12,
                          'width': 12,
                          'num_channels': 3,
                          'num_processes': 5
                      },
                      fc_size=16,
                      num_filters=8),
    ]
    output_features = [
        category_feature(vocab_size=2, reduce_input='sum'),
    ]

    config = {
        'input_features': input_features,
        'output_features': output_features,
        'combiner': {
            'type': 'concat',
            'fc_size': 14
        },
        'preprocessing': {},
        'training': {
            'epochs': 2
        }
    }

    # create temporary name for train and test data sets
    train_csv_filename = 'train_' + uuid.uuid4().hex[:10].upper() + '.csv'
    test_csv_filename = 'test_' + uuid.uuid4().hex[:10].upper() + '.csv'

    # setup training data format to test
    train_data = generate_data(input_features, output_features,
                               train_csv_filename)
    config['input_features'][0]['preprocessing']['in_memory'] = train_in_memory
    training_set_metadata = None

    if train_format == 'hdf5':
        # hdf5 format
        train_set, _, _, training_set_metadata = preprocess_for_training(
            config, dataset=train_data)
        train_dataset_to_use = train_set.data_hdf5_fp
    else:
        train_dataset_to_use = create_data_set_to_use(train_format, train_data)

    # define Ludwig model
    model = LudwigModel(config=config)
    model.train(dataset=train_dataset_to_use,
                training_set_metadata=training_set_metadata)

    model.config['input_features'][0]['preprocessing']['in_memory'] = test_in_memory

    # setup test data format to test
    test_data = generate_data(input_features, output_features,
                              test_csv_filename)

    if test_format == 'hdf5':
        # hdf5 format
        # create hdf5 data set
        _, test_set, _, training_set_metadata_for_test = preprocess_for_training(
            model.config, dataset=test_data)
        test_dataset_to_use = test_set.data_hdf5_fp
    else:
        test_dataset_to_use = create_data_set_to_use(test_format, test_data)

    # run functions with the specified data format
    model.evaluate(dataset=test_dataset_to_use)
    model.predict(dataset=test_dataset_to_use)

    # Delete the temporary data created
    shutil.rmtree(image_dest_folder)
    delete_temporary_data(train_csv_filename)
    delete_temporary_data(test_csv_filename)
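The pytest parametrization that supplies train_format, train_in_memory, test_format, and test_in_memory is not shown in the snippet; a plausible sketch covering a subset of the combinations:

import pytest

# hypothetical subset of format combinations; the actual test matrix is larger
@pytest.mark.parametrize('test_format, test_in_memory', [('csv', True), ('hdf5', False)])
@pytest.mark.parametrize('train_format, train_in_memory', [('csv', True), ('hdf5', False)])
def test_experiment_image_dataset(train_format, train_in_memory,
                                  test_format, test_in_memory):
    ...  # body as defined above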