Code Example #1
def test_experiment_dataset_formats(data_format):
    # primary focus of this test is to determine if exceptions are
    # raised for different data set formats and in_memory setting

    input_features = [
        numerical_feature(),
        category_feature()
    ]
    output_features = [
        category_feature(),
        numerical_feature()
    ]

    config = {
        'input_features': input_features,
        'output_features': output_features,
        'combiner': {
            'type': 'concat',
            'fc_size': 14
        },
        'preprocessing': {},
        'training': {'epochs': 2}
    }

    # create temporary name for train and test data sets
    csv_filename = 'train_' + uuid.uuid4().hex[:10].upper() + '.csv'

    # setup training data format to test
    raw_data = generate_data(input_features, output_features,
                               csv_filename)

    training_set_metadata = None

    if data_format == 'hdf5':
        # hdf5 format
        training_set, _, _, training_set_metadata = preprocess_for_training(
            config,
            dataset=raw_data
        )
        dataset_to_use = training_set.data_hdf5_fp
    else:
        dataset_to_use = create_data_set_to_use(data_format, raw_data)

    # define Ludwig model
    model = LudwigModel(
        config=config
    )
    model.train(
        dataset=dataset_to_use,
        training_set_metadata=training_set_metadata,
        random_seed=default_random_seed
    )

    # run functions with the specified data format
    model.evaluate(dataset=dataset_to_use)
    model.predict(dataset=dataset_to_use)

    # Delete the temporary data created
    delete_temporary_data(csv_filename)
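
The snippets in this listing omit their import blocks and the pytest parametrization that supplies arguments such as data_format. Below is a minimal sketch of what Example #1 likely relies on; the test-utility import path and the list of formats are assumptions, not taken from the snippet itself.

# Hedged sketch of the imports and parametrization assumed by the test above.
# The tests.integration_tests.utils path mirrors the Ludwig repo layout; adjust as needed.
import uuid

import pytest

from ludwig.api import LudwigModel
from ludwig.data.preprocessing import preprocess_for_training
from ludwig.utils.defaults import default_random_seed
from tests.integration_tests.utils import (
    category_feature,
    create_data_set_to_use,
    delete_temporary_data,
    generate_data,
    numerical_feature,
)


@pytest.mark.parametrize("data_format", ["csv", "parquet", "hdf5"])  # assumed format list
def test_experiment_dataset_formats(data_format):
    ...  # body as in Example #1 above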
Code Example #2
def test_resource_usage_tracker(tmpdir):
    train_df = pd.DataFrame(np.random.normal(0, 1, size=(100, 3)),
                            columns=["input_1", "input_2", "output_1"])
    eval_df = pd.DataFrame(np.random.normal(0, 1, size=(20, 3)),
                           columns=["input_1", "input_2", "output_1"])

    config = {
        "input_features": [{
            "name": "input_1",
            "type": "number"
        }, {
            "name": "input_2",
            "type": "number"
        }],
        "output_features": [{
            "name": "output_1",
            "type": "number"
        }],
        "combiner": {
            "type": "concat",
            "output_size": 14
        },
        TRAINER: {
            "epochs": 1
        },
    }

    model = LudwigModel(config=config, backend="local")

    with ResourceUsageTracker(tag="train",
                              output_dir=tmpdir,
                              logging_interval=0.05,
                              num_examples=len(train_df)):
        model.train(
            dataset=train_df,
            output_directory=tmpdir,
            skip_save_training_description=True,
            skip_save_training_statistics=True,
            skip_save_model=True,
            skip_save_progress=True,
            skip_save_log=True,
            skip_save_processed_input=True,
        )

    with ResourceUsageTracker(tag="evaluate",
                              output_dir=tmpdir,
                              logging_interval=0.05,
                              num_examples=len(eval_df)):
        model.evaluate(dataset=eval_df)

    assert os.path.exists(
        os.path.join(tmpdir, "train_resource_usage_metrics.json"))
    assert os.path.exists(
        os.path.join(tmpdir, "evaluate_resource_usage_metrics.json"))

    shutil.rmtree(tmpdir)
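
The assertions above pin down the tracker's output contract: one "<tag>_resource_usage_metrics.json" file per tracked block. A hedged sketch of reading those files back (this would have to run before the final shutil.rmtree(tmpdir); the JSON structure itself depends on the ResourceUsageTracker implementation):

# Hedged sketch: load the metrics files whose existence the test asserts.
import json
import os

for tag in ("train", "evaluate"):
    path = os.path.join(tmpdir, f"{tag}_resource_usage_metrics.json")
    with open(path) as f:
        metrics = json.load(f)
    # The keys inside depend on the tracker implementation (CPU/GPU/memory usage, etc.).
    print(tag, metrics)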
Code Example #3
File: test_experiment.py Project: ludwig-ai/ludwig
def test_experiment_dataset_formats(data_format, csv_filename):
    # primary focus of this test is to determine if exceptions are
    # raised for different data set formats and in_memory setting

    input_features = [number_feature(), category_feature()]
    output_features = [category_feature(), number_feature()]

    config = {
        "input_features": input_features,
        "output_features": output_features,
        "combiner": {
            "type": "concat",
            "output_size": 14
        },
        "preprocessing": {},
        TRAINER: {
            "epochs": 2
        },
    }

    # setup training data format to test
    raw_data = generate_data(input_features, output_features, csv_filename)

    training_set_metadata = None

    if data_format == "hdf5":
        # hdf5 format
        training_set, _, _, training_set_metadata = preprocess_for_training(
            config, dataset=raw_data)
        dataset_to_use = training_set.data_hdf5_fp
    else:
        dataset_to_use = create_data_set_to_use(data_format, raw_data)

    # define Ludwig model
    model = LudwigModel(config=config)
    model.train(dataset=dataset_to_use,
                training_set_metadata=training_set_metadata,
                random_seed=default_random_seed)

    # run functions with the specified data format
    model.evaluate(dataset=dataset_to_use)
    model.predict(dataset=dataset_to_use)
Code Example #4
def train_with_backend(
    backend,
    config,
    dataset=None,
    training_set=None,
    validation_set=None,
    test_set=None,
    predict=True,
    evaluate=True,
    callbacks=None,
):
    model = LudwigModel(config, backend=backend, callbacks=callbacks)
    output_dir = None

    try:
        _, _, output_dir = model.train(
            dataset=dataset,
            training_set=training_set,
            validation_set=validation_set,
            test_set=test_set,
            skip_save_processed_input=True,
            skip_save_progress=True,
            skip_save_unprocessed_output=True,
            skip_save_log=True,
        )

        if dataset is None:
            dataset = training_set

        if predict:
            preds, _ = model.predict(dataset=dataset)
            assert preds is not None

        if evaluate:
            _, eval_preds, _ = model.evaluate(dataset=dataset)
            assert eval_preds is not None

        return model
    finally:
        # Remove results/intermediate data saved to disk
        shutil.rmtree(output_dir, ignore_errors=True)
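
Example #4 is a shared helper rather than a test in itself. A hedged usage sketch follows; the feature schema and the "local" backend string are illustrative, not taken from the snippet.

# Hypothetical call to the train_with_backend helper defined above.
import numpy as np
import pandas as pd

df = pd.DataFrame(np.random.normal(size=(100, 2)), columns=["in_1", "out_1"])
config = {
    "input_features": [{"name": "in_1", "type": "number"}],
    "output_features": [{"name": "out_1", "type": "number"}],
    "trainer": {"epochs": 1},  # "training" in older Ludwig versions
}
model = train_with_backend("local", config, dataset=df, predict=True, evaluate=True)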
Code Example #5
def test_experiment_image_dataset(train_format, train_in_memory, test_format,
                                  test_in_memory):
    # primary focus of this test is to determine if exceptions are
    # raised for different data set formats and in_memory setting
    # Image Inputs
    image_dest_folder = os.path.join(os.getcwd(), 'generated_images')

    input_features = [
        image_feature(folder=image_dest_folder,
                      encoder='stacked_cnn',
                      preprocessing={
                          'in_memory': True,
                          'height': 12,
                          'width': 12,
                          'num_channels': 3,
                          'num_processes': 5
                      },
                      fc_size=16,
                      num_filters=8),
    ]
    output_features = [
        category_feature(vocab_size=2, reduce_input='sum'),
    ]

    config = {
        'input_features': input_features,
        'output_features': output_features,
        'combiner': {
            'type': 'concat',
            'fc_size': 14
        },
        'preprocessing': {},
        'training': {
            'epochs': 2
        }
    }

    # create temporary name for train and test data sets
    train_csv_filename = 'train_' + uuid.uuid4().hex[:10].upper() + '.csv'
    test_csv_filename = 'test_' + uuid.uuid4().hex[:10].upper() + '.csv'

    # setup training data format to test
    train_data = generate_data(input_features, output_features,
                               train_csv_filename)
    config['input_features'][0]['preprocessing']['in_memory'] \
        = train_in_memory
    training_set_metadata = None

    if train_format == 'hdf5':
        # hdf5 format
        train_set, _, _, training_set_metadata = preprocess_for_training(
            config, dataset=train_data)
        train_dataset_to_use = train_set.data_hdf5_fp
    else:
        train_dataset_to_use = create_data_set_to_use(train_format, train_data)

    # define Ludwig model
    model = LudwigModel(config=config)
    model.train(dataset=train_dataset_to_use,
                training_set_metadata=training_set_metadata)

    model.config['input_features'][0]['preprocessing']['in_memory'] \
        = test_in_memory

    # setup test data format to test
    test_data = generate_data(input_features, output_features,
                              test_csv_filename)

    if test_format == 'hdf5':
        # hdf5 format
        # create hdf5 data set
        _, test_set, _, training_set_metadata_for_test = preprocess_for_training(
            model.config, dataset=test_data)
        test_dataset_to_use = test_set.data_hdf5_fp
    else:
        test_dataset_to_use = create_data_set_to_use(test_format, test_data)

    # run functions with the specified data format
    model.evaluate(dataset=test_dataset_to_use)
    model.predict(dataset=test_dataset_to_use)

    # Delete the temporary data created
    shutil.rmtree(image_dest_folder)
    delete_temporary_data(train_csv_filename)
    delete_temporary_data(test_csv_filename)
Code Example #6
def run_api_commands(
    input_features,
    output_features,
    data_csv,
    output_dir,
    skip_save_training_description=False,
    skip_save_training_statistics=False,
    skip_save_model=False,
    skip_save_progress=False,
    skip_save_log=False,
    skip_save_processed_input=False,
    skip_save_unprocessed_output=False,
    skip_save_predictions=False,
    skip_save_eval_stats=False,
    skip_collect_predictions=False,
    skip_collect_overall_stats=False,
):
    """Helper method to avoid code repetition in running an experiment.

    :param input_features: input schema
    :param output_features: output schema
    :param data_csv: path to data
    :return: None
    """
    config = {
        "input_features": input_features,
        "output_features": output_features,
        "combiner": {
            "type": "concat",
            "fc_size": 14
        },
        "training": {
            "epochs": 2
        },
    }

    model = LudwigModel(config)

    # Training with csv
    model.train(
        dataset=data_csv,
        skip_save_training_description=skip_save_training_description,
        skip_save_training_statistics=skip_save_training_statistics,
        skip_save_model=skip_save_model,
        skip_save_progress=skip_save_progress,
        skip_save_log=skip_save_log,
        skip_save_processed_input=skip_save_processed_input,
        output_directory=output_dir,
    )
    model.predict(
        dataset=data_csv,
        skip_save_unprocessed_output=skip_save_unprocessed_output,
        skip_save_predictions=skip_save_predictions,
        output_directory=output_dir,
    )
    model.evaluate(
        dataset=data_csv,
        skip_save_unprocessed_output=skip_save_unprocessed_output,
        skip_save_predictions=skip_save_predictions,
        skip_save_eval_stats=skip_save_eval_stats,
        collect_predictions=not skip_collect_predictions,
        collect_overall_stats=not skip_collect_overall_stats,
        output_directory=output_dir,
    )
    model.experiment(
        dataset=data_csv,
        skip_save_training_description=skip_save_training_description,
        skip_save_training_statistics=skip_save_training_statistics,
        skip_save_model=skip_save_model,
        skip_save_progress=skip_save_progress,
        skip_save_log=skip_save_log,
        skip_save_processed_input=skip_save_processed_input,
        skip_save_unprocessed_output=skip_save_unprocessed_output,
        skip_save_predictions=skip_save_predictions,
        skip_save_eval_stats=skip_save_eval_stats,
        skip_collect_predictions=skip_collect_predictions,
        skip_collect_overall_stats=skip_collect_overall_stats,
        output_directory=output_dir,
    )
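
A hedged call sketch for the helper above; the feature definitions and paths are placeholders (Example #6 targets the older Ludwig config keys, hence "numerical" and fc_size):

# Hypothetical invocation of run_api_commands with minimal schemas and placeholder paths.
input_features = [{"name": "x", "type": "numerical"}]
output_features = [{"name": "y", "type": "category"}]
run_api_commands(
    input_features,
    output_features,
    data_csv="data.csv",      # placeholder path
    output_dir="results",     # placeholder directory
    skip_save_model=True,
    skip_save_processed_input=True,
)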
Code Example #7
File: test_experiment.py Project: ludwig-ai/ludwig
def test_experiment_image_dataset(train_format, train_in_memory, test_format,
                                  test_in_memory, tmpdir):
    # Image Inputs
    image_dest_folder = os.path.join(tmpdir, "generated_images")

    input_features = [
        image_feature(
            folder=image_dest_folder,
            encoder="stacked_cnn",
            preprocessing={
                "in_memory": True,
                "height": 12,
                "width": 12,
                "num_channels": 3,
                "num_processes": 5
            },
            output_size=16,
            num_filters=8,
        ),
    ]
    output_features = [
        category_feature(vocab_size=2, reduce_input="sum"),
    ]

    config = {
        "input_features": input_features,
        "output_features": output_features,
        "combiner": {
            "type": "concat",
            "output_size": 14
        },
        "preprocessing": {},
        TRAINER: {
            "epochs": 2
        },
    }

    # create temporary name for train and test data sets
    train_csv_filename = os.path.join(
        tmpdir, "train_" + uuid.uuid4().hex[:10].upper() + ".csv")
    test_csv_filename = os.path.join(
        tmpdir, "test_" + uuid.uuid4().hex[:10].upper() + ".csv")

    # setup training data format to test
    train_data = generate_data(input_features, output_features,
                               train_csv_filename)
    config["input_features"][0]["preprocessing"]["in_memory"] = train_in_memory
    training_set_metadata = None

    backend = LocalTestBackend()
    if train_format == "hdf5":
        # hdf5 format
        train_set, _, _, training_set_metadata = preprocess_for_training(
            config,
            dataset=train_data,
            backend=backend,
        )
        train_dataset_to_use = train_set.data_hdf5_fp
    else:
        train_dataset_to_use = create_data_set_to_use(train_format, train_data)

    # define Ludwig model
    model = LudwigModel(
        config=config,
        backend=backend,
    )
    model.train(dataset=train_dataset_to_use,
                training_set_metadata=training_set_metadata)

    model.config["input_features"][0]["preprocessing"][
        "in_memory"] = test_in_memory

    # setup test data format to test
    test_data = generate_data(input_features, output_features,
                              test_csv_filename)

    if test_format == "hdf5":
        # hdf5 format
        # create hdf5 data set
        _, test_set, _, training_set_metadata_for_test = preprocess_for_training(
            model.config,
            dataset=test_data,
            backend=backend,
        )
        test_dataset_to_use = test_set.data_hdf5_fp
    else:
        test_dataset_to_use = create_data_set_to_use(test_format, test_data)

    # run functions with the specified data format
    model.evaluate(dataset=test_dataset_to_use)
    model.predict(dataset=test_dataset_to_use)
Code Example #8

config = {
    'input_features': input_features,
    'output_features': output_features,
    'combiner': {'type': 'concat', 'fc_size': 14},
    'training': {'epochs': 2}
}


model = LudwigModel(config)

train_stats, _, _ = model.train(dataset=df)

st.header('Eval Stats')
eval_stats, _, _ = model.evaluate(dataset=df)
#st.write(eval_stats)
#st.write(type(eval_stats))

# WORKS!
st.subheader('In JSON format')
json_object = json.dumps(str(eval_stats), indent=4)
st.write(json_object)
# DOESN'T WORK YET
st.subheader('In dataframe format')
st.write('separate dictionaries from main dictionary')
df = pd.DataFrame([eval_stats], columns=eval_stats.keys())
st.table(df)


######################
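
The part marked "DOESN'T WORK YET" above stems from eval_stats being a nested dict, typically {output_feature: {metric: value}}, so wrapping it in a single-row DataFrame keeps the inner dicts intact. A hedged sketch that flattens it into one row per metric before displaying; it assumes the metric values are scalars, so nested per-class stats would need extra handling:

# Hedged sketch: flatten eval_stats (from model.evaluate above) for st.table().
import pandas as pd
import streamlit as st

rows = [
    {"output_feature": feature, "metric": metric, "value": value}
    for feature, metrics in eval_stats.items()
    for metric, value in metrics.items()
]
stats_df = pd.DataFrame(rows)
st.table(stats_df)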
Code Example #9
def page_settings(state):
    st.title("Train your text classifier!")
    display_state_values(state)

    #st.write("---")

    import os
    #ModelFiles = os.path.isfile("test.csv")
    ModelFiles = os.path.isfile("training_set_metadata.json")

    if not ModelFiles:

        st.warning('Please train a model first!')
        #st.stop()
        #st.success('ModelFiles_are_saved')

        import pandas as pd
        import io
        import base64

        uploaded_file = st.file_uploader("Choose a file", key="2")

        if uploaded_file is not None:
            # Can be used wherever a "file-like" object is accepted:
            df = pd.read_csv(uploaded_file)
            uploaded_file.seek(0)
            #df.seek(0)
            df.columns = ["doc_text", "class"]
            st.write(df)

        else:
            #st.warning('Upload the CSV to be trained')
            st.stop()

        input_features = [{'name': 'doc_text', 'type': 'text'}]
        output_features = [{'name': 'class', 'type': 'category'}]

        config = {
            'input_features': input_features,
            'output_features': output_features,
            'combiner': {
                'type': 'concat',
                'fc_size': 14
            },
            'training': {
                'epochs': 2
            }
        }

        model = LudwigModel(config)

        import pandas as pd

        train_stats, _, _ = model.train(dataset=df)

        st.header('Eval Stats')
        eval_stats, _, _ = model.evaluate(dataset=df)
        #st.write(eval_stats)
        #st.write(type(eval_stats))

        # WORKS!
        st.subheader('In JSON format')
        json_object = json.dumps(str(eval_stats), indent=4)
        # DOESN'T WORK YET
        st.subheader('In dataframe format')
        st.write(json_object)
        st.write('separate dictionaries from main dictionary')
        df = pd.DataFrame([eval_stats], columns=eval_stats.keys())
        #st.table("df")
        #st.table(df)

        #Save model
        model.save(cwd)

    else:
        st.success(
            'The model has now been trained, you can start making predictions!'
        )
        st.image('arrow2.png', width=325)
Code Example #10
def train_with_backend(
    backend,
    config,
    dataset=None,
    training_set=None,
    validation_set=None,
    test_set=None,
    predict=True,
    evaluate=True,
    callbacks=None,
    skip_save_processed_input=True,
):
    model = LudwigModel(config, backend=backend, callbacks=callbacks)
    output_dir = None

    try:
        _, _, output_dir = model.train(
            dataset=dataset,
            training_set=training_set,
            validation_set=validation_set,
            test_set=test_set,
            skip_save_processed_input=skip_save_processed_input,
            skip_save_progress=True,
            skip_save_unprocessed_output=True,
            skip_save_log=True,
        )

        if dataset is None:
            dataset = training_set

        if predict:
            preds, _ = model.predict(dataset=dataset)
            assert preds is not None

        if evaluate:
            eval_stats, eval_preds, _ = model.evaluate(
                dataset=dataset,
                collect_overall_stats=False,
                collect_predictions=True)
            assert eval_preds is not None

            # Test that eval_stats are approx equal when using local backend
            with tempfile.TemporaryDirectory() as tmpdir:
                model.save(tmpdir)
                local_model = LudwigModel.load(tmpdir,
                                               backend=LocalTestBackend())
                local_eval_stats, _, _ = local_model.evaluate(
                    dataset=dataset,
                    collect_overall_stats=False,
                    collect_predictions=False)

                # Filter out metrics that are not being aggregated correctly for now
                # TODO(travis): https://github.com/ludwig-ai/ludwig/issues/1956
                def filter(stats):
                    return {
                        k: {
                            metric_name: value
                            for metric_name, value in v.items()
                            if metric_name not in
                            {"loss", "root_mean_squared_percentage_error"}
                        }
                        for k, v in stats.items()
                    }

                for (k1, v1), (k2, v2) in zip(
                        filter(eval_stats).items(),
                        filter(local_eval_stats).items()):
                    assert k1 == k2
                    for (name1,
                         metric1), (name2,
                                    metric2) in zip(v1.items(), v2.items()):
                        assert name1 == name2
                        assert np.isclose(
                            metric1, metric2, rtol=1e-04, atol=1e-5
                        ), f"metric {name1}: {metric1} != {metric2}"

        return model
    finally:
        # Remove results/intermediate data saved to disk
        shutil.rmtree(output_dir, ignore_errors=True)
Code Example #11
File: model_training.py Project: ludwig-ai/ludwig
    # Define Ludwig model object that drive model training
    model = LudwigModel(config=model_id + "_config.yaml",
                        logging_level=logging.WARN)

    # initiate model training
    train_stats, _, _ = model.train(
        training_set=training_set,
        validation_set=val_set,
        test_set=test_set,
        experiment_name="balance_example",
        model_name=model_id,
        skip_save_model=True,
    )

    # evaluate model on test_set
    eval_stats, _, _ = model.evaluate(test_set)

    # save eval stats for later use
    list_of_eval_stats.append(eval_stats)

    print(">>>>>>> completed: ", model_id, "\n")

compare_performance(
    list_of_eval_stats,
    "Response",
    model_names=list_of_model_ids,
    output_directory="./visualizations",
    file_format="png",
)
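
The indented block at the top of Example #11 is lifted from inside a loop; the names it references (model_id, list_of_eval_stats, list_of_model_ids, the data splits) are defined outside the excerpt. A hedged reconstruction of that surrounding context, with hypothetical identifiers:

# Hypothetical loop context assumed by the snippet above: one "<model_id>_config.yaml"
# per model variant, with eval results collected for compare_performance afterwards.
list_of_model_ids = ["baseline", "oversampled"]  # hypothetical model variants
list_of_eval_stats = []

for model_id in list_of_model_ids:
    ...  # the LudwigModel training/evaluation block shown above runs here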
Code Example #12

if __name__ == "__main__":
    if sys.argv[1] == "train":
        if len(sys.argv) != 5:
            print("Incorrect number of arguments. Please use format:\npython main.py train <path-to-input-csv-file> <path-to-output-csv-file> <ludwig-model-definition>")
            sys.exit(1)

        preprocess_dataset(sys.argv[2], sys.argv[3], 1)

        config = sys.argv[4]
        model = LudwigModel(config)
        train_stats = model.experiment(dataset=sys.argv[3], training_set=sys.argv[3], validation_set=sys.argv[3],
                                       test_set=sys.argv[3], experiment_name='covid_inference', model_name='train')

        print(train_stats)
    elif sys.argv[1] == "evaluate":
        if len(sys.argv) != 6:
            print("Incorrect number of arguments. Please use format:\npython main.py evaluate <path-to-trained-model> <path-to-input-csv-file> <path-to-output-csv-file> <ludwig-model-definition>")
            sys.exit(1)

        preprocess_dataset(sys.argv[3], sys.argv[4], 0)

        config = sys.argv[5]
        model = LudwigModel.load(sys.argv[2])
        train_stats = model.evaluate(dataset=sys.argv[4], skip_save_predictions=False,
                                     skip_save_eval_stats=False, collect_predictions=True, collect_overall_stats=True)
        print(train_stats)
    else:
        print(
            "Incorrect arguments. Please use format:\n python main.py [train/evaluate]")
        # ludwig train --dataset sys.argv[2] --config sys.argv[3]
Code Example #13
def train_and_eval_on_split(
        model_definition,
        eval_split=VALIDATION,
        dataset=None,
        training_set=None,
        validation_set=None,
        test_set=None,
        training_set_metadata=None,
        data_format=None,
        experiment_name="hyperopt",
        model_name="run",
        # model_load_path=None,
        # model_resume_path=None,
        skip_save_training_description=False,
        skip_save_training_statistics=False,
        skip_save_model=False,
        skip_save_progress=False,
        skip_save_log=False,
        skip_save_processed_input=False,
        skip_save_unprocessed_output=False,
        skip_save_predictions=False,
        skip_save_eval_stats=False,
        output_directory="results",
        gpus=None,
        gpu_memory_limit=None,
        allow_parallel_threads=True,
        use_horovod=None,
        random_seed=default_random_seed,
        debug=False,
        **kwargs):
    # Collect training and validation losses and metrics
    # & append it to `results`
    model = LudwigModel(
        model_definition=model_definition,
        use_horovod=use_horovod,
        gpus=gpus,
        gpu_memory_limit=gpu_memory_limit,
        allow_parallel_threads=allow_parallel_threads,
    )

    train_stats, preprocessed_data, _ = model.train(
        dataset=dataset,
        training_set=training_set,
        validation_set=validation_set,
        test_set=test_set,
        training_set_metadata=training_set_metadata,
        data_format=data_format,
        experiment_name=experiment_name,
        model_name=model_name,
        skip_save_training_description=skip_save_training_description,
        skip_save_training_statistics=skip_save_training_statistics,
        skip_save_model=skip_save_model,
        skip_save_progress=skip_save_progress,
        skip_save_log=skip_save_log,
        skip_save_processed_input=skip_save_processed_input,
        output_directory=output_directory,
        random_seed=random_seed,
        debug=debug,
    )

    if model_definition[TRAINING]["eval_batch_size"] > 0:
        batch_size = model_definition[TRAINING]["eval_batch_size"]
    else:
        batch_size = model_definition[TRAINING]["batch_size"]

    training_set, validation_set, test_set, train_set_metadata = preprocessed_data
    eval_set = validation_set
    if eval_split == TRAINING:
        eval_set = training_set
    elif eval_split == VALIDATION:
        eval_set = validation_set
    elif eval_split == TEST:
        eval_set = test_set

    test_results, postproc_predictions, _ = model.evaluate(
        dataset=eval_set,
        data_format=data_format,
        batch_size=batch_size,
        skip_save_unprocessed_output=skip_save_unprocessed_output,
        skip_save_predictions=skip_save_predictions,
        skip_save_eval_stats=skip_save_eval_stats,
        output_directory=output_directory,
        return_type=dict,
        debug=debug,
    )

    return train_stats, test_results
Code Example #14
    dataset = twitter_bots.TwitterBots(cache_dir=".")
    training_set, val_set, test_set = dataset.load(split=True)
    # Moves profile images into local directory, so relative paths in the dataset will be resolved.
    rename(os.path.join(dataset.processed_dataset_path, "profile_images"),
           "./profile_images")

    with open("./config.yaml") as f:
        config = yaml.safe_load(f.read())

    model = LudwigModel(config, logging_level=logging.INFO)

    train_stats, preprocessed_data, output_directory = model.train(
        dataset=training_set)

    # Generates predictions and performance statistics for the test set.
    test_stats, predictions, output_directory = model.evaluate(
        test_set, collect_predictions=True, collect_overall_stats=True)

    confusion_matrix(
        [test_stats],
        model.training_set_metadata,
        "account_type",
        top_n_classes=[2],
        model_names=[""],
        normalize=True,
        output_directory="./visualizations",
        file_format="png",
    )

    # Visualizes learning curves, which show how performance metrics changed over time during training.
    learning_curves(train_stats,
                    output_feature_name="account_type",