Esempio n. 1
0
def test_roc_curves_from_test_statistics_vis_api(csv_filename):
    """Ensure pdf and png figures can be saved via visualization API call.

    :param csv_filename: csv fixture from tests.fixtures.filenames.csv_filename
    :return: None
    """
    input_features = [binary_feature(), bag_feature()]
    output_features = [binary_feature()]

    # Generate test data
    data_csv = generate_data(input_features, output_features, csv_filename)
    output_feature_name = output_features[0]['name']

    model = run_api_experiment(input_features, output_features)
    data_df = read_csv(data_csv)
    _, _, output_dir = model.train(dataset=data_df)
    # extract test metrics
    test_stats, _, _ = model.evaluate(dataset=data_df,
                                      collect_overall_stats=True,
                                      output_directory=output_dir)
    test_stats = test_stats
    viz_outputs = ('pdf', 'png')
    for viz_output in viz_outputs:
        vis_output_pattern_pdf = os.path.join(output_dir, '*.{}'.format(
            viz_output))
        visualize.roc_curves_from_test_statistics(
            [test_stats, test_stats],
            output_feature_name,
            model_names=['Model1', 'Model2'],
            output_directory=output_dir,
            file_format=viz_output
        )
        figure_cnt = glob.glob(vis_output_pattern_pdf)
        assert 1 == len(figure_cnt)
    shutil.rmtree(output_dir, ignore_errors=True)
Esempio n. 2
0
def test_validate_with_preprocessing_defaults():
    config = {
        "input_features": [
            audio_feature("/tmp/destination_folder", preprocessing=AudioFeatureMixin.preprocessing_defaults),
            bag_feature(preprocessing=BagFeatureMixin.preprocessing_defaults),
            binary_feature(preprocessing=BinaryFeatureMixin.preprocessing_defaults),
            category_feature(preprocessing=CategoryFeatureMixin.preprocessing_defaults),
            date_feature(preprocessing=DateFeatureMixin.preprocessing_defaults),
            h3_feature(preprocessing=H3FeatureMixin.preprocessing_defaults),
            image_feature("/tmp/destination_folder", preprocessing=ImageFeatureMixin.preprocessing_defaults),
            numerical_feature(preprocessing=NumericalFeatureMixin.preprocessing_defaults),
            sequence_feature(preprocessing=SequenceFeatureMixin.preprocessing_defaults),
            set_feature(preprocessing=SetFeatureMixin.preprocessing_defaults),
            text_feature(preprocessing=TextFeatureMixin.preprocessing_defaults),
            timeseries_feature(preprocessing=TimeseriesFeatureMixin.preprocessing_defaults),
            vector_feature(preprocessing=VectorFeatureMixin.preprocessing_defaults),
        ],
        "output_features": [{"name": "target", "type": "category"}],
        "training": {
            "decay": True,
            "learning_rate": 0.001,
            "validation_field": "target",
            "validation_metric": "accuracy",
        },
    }

    validate_config(config)
    config = merge_with_defaults(config)
    validate_config(config)
Esempio n. 3
0
def test_torchscript_e2e_tabnet_combiner(csv_filename, tmpdir):
    data_csv_path = os.path.join(tmpdir, csv_filename)
    # Configure features to be tested:
    input_features = [
        binary_feature(),
        number_feature(),
        category_feature(vocab_size=3),
        bag_feature(vocab_size=3),
        set_feature(vocab_size=3),
    ]
    output_features = [
        binary_feature(),
        number_feature(),
        category_feature(vocab_size=3),
    ]
    backend = LocalTestBackend()
    config = {
        "input_features": input_features,
        "output_features": output_features,
        COMBINER: {
            "type": "tabnet",
            "num_total_blocks": 2,
            "num_shared_blocks": 2,
        },
        TRAINER: {
            "epochs": 2
        },
    }

    # Generate training data
    training_data_csv_path = generate_data(input_features, output_features,
                                           data_csv_path)

    validate_torchscript_outputs(tmpdir, config, backend,
                                 training_data_csv_path)
Esempio n. 4
0
def test_roc_curves_from_test_statistics_vis_api(csv_filename):
    """Ensure pdf and png figures can be saved via visualization API call.

    :param csv_filename: csv fixture from tests.fixtures.filenames.csv_filename
    :return: None
    """
    input_features = [binary_feature(), bag_feature()]
    output_features = [binary_feature()]
    encoder = 'parallel_cnn'

    # Generate test data
    data_csv = generate_data(input_features, output_features, csv_filename)
    output_feature_name = output_features[0]['name']
    input_features[0]['encoder'] = encoder
    model = run_api_experiment(input_features, output_features)
    data_df = read_csv(data_csv)
    model.train(data_df=data_df)
    test_stats = model.test(data_df=data_df)[1]
    viz_outputs = ('pdf', 'png')
    for viz_output in viz_outputs:
        vis_output_pattern_pdf = model.exp_dir_name + '/*.{}'.format(
            viz_output)
        visualize.roc_curves_from_test_statistics(
            [test_stats, test_stats],
            output_feature_name,
            model_namess=['Model1', 'Model2'],
            output_directory=model.exp_dir_name,
            file_format=viz_output)
        figure_cnt = glob.glob(vis_output_pattern_pdf)
        assert 1 == len(figure_cnt)
    shutil.rmtree(model.exp_dir_name, ignore_errors=True)
Esempio n. 5
0
def test_experiment_various_feature_types(csv_filename):
    input_features = [binary_feature(), bag_feature()]
    output_features = [set_feature(max_len=3, vocab_size=5)]

    # Generate test data
    rel_path = generate_data(input_features, output_features, csv_filename)
    run_experiment(input_features, output_features, data_csv=rel_path)
Esempio n. 6
0
def test_config_features():
    all_input_features = [
        audio_feature("/tmp/destination_folder"),
        bag_feature(),
        binary_feature(),
        category_feature(),
        date_feature(),
        h3_feature(),
        image_feature("/tmp/destination_folder"),
        number_feature(),
        sequence_feature(),
        set_feature(),
        text_feature(),
        timeseries_feature(),
        vector_feature(),
    ]
    all_output_features = [
        binary_feature(),
        category_feature(),
        number_feature(),
        sequence_feature(),
        set_feature(),
        text_feature(),
        vector_feature(),
    ]

    # validate config with all features
    config = {
        "input_features": all_input_features,
        "output_features": all_output_features,
    }
    validate_config(config)

    # make sure all defaults provided also registers as valid
    config = merge_with_defaults(config)
    validate_config(config)

    # test various invalid output features
    input_only_features = [
        feature for feature in all_input_features
        if feature["type"] not in output_type_registry.keys()
    ]
    for input_feature in input_only_features:
        config = {
            "input_features": all_input_features,
            "output_features": all_output_features + [input_feature],
        }

        dtype = input_feature["type"]
        with pytest.raises(ValidationError,
                           match=rf"^'{dtype}' is not one of .*"):
            validate_config(config)
Esempio n. 7
0
def test_config_features():
    all_input_features = [
        audio_feature('/tmp/destination_folder'),
        bag_feature(),
        binary_feature(),
        category_feature(),
        date_feature(),
        h3_feature(),
        image_feature('/tmp/destination_folder'),
        numerical_feature(),
        sequence_feature(),
        set_feature(),
        text_feature(),
        timeseries_feature(),
        vector_feature(),
    ]
    all_output_features = [
        binary_feature(),
        category_feature(),
        numerical_feature(),
        sequence_feature(),
        set_feature(),
        text_feature(),
        vector_feature(),
    ]

    # validate config with all features
    config = {
        'input_features': all_input_features,
        'output_features': all_output_features,
    }
    validate_config(config)

    # make sure all defaults provided also registers as valid
    config = merge_with_defaults(config)
    validate_config(config)

    # test various invalid output features
    input_only_features = [
        feature for feature in all_input_features
        if feature['type'] not in OUTPUT_FEATURE_TYPES
    ]
    for input_feature in input_only_features:
        config = {
            'input_features': all_input_features,
            'output_features': all_output_features + [input_feature],
        }

        dtype = input_feature['type']
        with pytest.raises(ValidationError,
                           match=rf"^'{dtype}' is not one of .*"):
            validate_config(config)
Esempio n. 8
0
def test_torchscript_e2e_tabular(csv_filename, tmpdir):
    data_csv_path = os.path.join(tmpdir, csv_filename)
    # Configure features to be tested:
    bin_str_feature = binary_feature()
    transformed_number_features = [
        number_feature(preprocessing={"normalization": numeric_transformer})
        for numeric_transformer in numeric_transformation_registry.keys()
    ]
    input_features = [
        bin_str_feature,
        binary_feature(),
        *transformed_number_features,
        category_feature(vocab_size=3),
        bag_feature(vocab_size=3),
        set_feature(vocab_size=3),
        vector_feature(),
        # TODO: future support
        # date_feature(),
        # h3_feature(),
    ]
    output_features = [
        bin_str_feature,
        binary_feature(),
        number_feature(),
        category_feature(vocab_size=3),
        set_feature(vocab_size=3),
        vector_feature(),
        sequence_feature(vocab_size=3),
        text_feature(vocab_size=3),
    ]
    backend = LocalTestBackend()
    config = {
        "input_features": input_features,
        "output_features": output_features,
        TRAINER: {
            "epochs": 2
        }
    }

    # Generate training data
    training_data_csv_path = generate_data(input_features, output_features,
                                           data_csv_path)

    # Convert bool values to strings, e.g., {'Yes', 'No'}
    df = pd.read_csv(training_data_csv_path)
    false_value, true_value = "No", "Yes"
    df[bin_str_feature[NAME]] = df[bin_str_feature[NAME]].map(
        lambda x: true_value if x else false_value)
    df.to_csv(training_data_csv_path)

    validate_torchscript_outputs(tmpdir, config, backend,
                                 training_data_csv_path)
Esempio n. 9
0
def test_ray_tabular(ray_start_2_cpus):
    input_features = [
        sequence_feature(reduce_output='sum'),
        numerical_feature(normalization='zscore'),
        set_feature(),
        text_feature(),
        binary_feature(),
        bag_feature(),
        vector_feature(),
        h3_feature(),
        date_feature(),
    ]
    output_features = [category_feature(vocab_size=2, reduce_input='sum')]
    run_test_parquet(input_features, output_features)
Esempio n. 10
0
def test_ray_tabular():
    input_features = [
        sequence_feature(reduce_output="sum"),
        category_feature(vocab_size=2, reduce_input="sum"),
        numerical_feature(normalization="zscore"),
        set_feature(),
        binary_feature(),
        bag_feature(),
        vector_feature(),
        h3_feature(),
        date_feature(),
    ]
    output_features = [
        binary_feature(),
        numerical_feature(normalization="zscore"),
    ]
    run_test_parquet(input_features, output_features)
Esempio n. 11
0
def test_ray_tabular(df_engine):
    if df_engine == "modin" and sys.version_info < (3, 7):
        pytest.skip("Modin is not supported with Python 3.6 at this time")

    input_features = [
        sequence_feature(reduce_output="sum"),
        category_feature(vocab_size=2, reduce_input="sum"),
        number_feature(normalization="zscore"),
        set_feature(),
        binary_feature(),
        bag_feature(),
        vector_feature(),
        h3_feature(),
        date_feature(),
    ]
    output_features = [
        binary_feature(),
        number_feature(normalization="zscore"),
    ]
    run_test_parquet(input_features, output_features, df_engine=df_engine)
Esempio n. 12
0
def test_ray_tabular(df_engine):
    input_features = [
        sequence_feature(reduce_output="sum"),
        category_feature(vocab_size=2, reduce_input="sum"),
        number_feature(normalization="zscore"),
        set_feature(),
        binary_feature(),
        bag_feature(),
        vector_feature(),
        h3_feature(),
        date_feature(),
    ]
    output_features = [
        binary_feature(bool2str=["No", "Yes"]),
        binary_feature(),
        number_feature(normalization="zscore"),
    ]
    run_test_with_features(
        input_features,
        output_features,
        df_engine=df_engine,
    )
Esempio n. 13
0
def test_roc_curves_from_test_statistics_vis_api(csv_filename):
    """Ensure pdf and png figures can be saved via visualization API call.

    :param csv_filename: csv fixture from tests.fixtures.filenames.csv_filename
    :return: None
    """
    input_features = [binary_feature(), bag_feature()]
    output_features = [binary_feature()]

    with TemporaryDirectory() as tmpvizdir:
        # Generate test data
        data_csv = generate_data(input_features, output_features,
                                 os.path.join(tmpvizdir, csv_filename))
        output_feature_name = output_features[0]["name"]

        model = run_api_experiment(input_features, output_features)
        data_df = read_csv(data_csv)
        _, _, output_dir = model.train(dataset=data_df,
                                       output_directory=os.path.join(
                                           tmpvizdir, "results"))
        # extract test metrics
        test_stats, _, _ = model.evaluate(dataset=data_df,
                                          collect_overall_stats=True,
                                          output_directory=output_dir)
        test_stats = test_stats
        viz_outputs = ("pdf", "png")
        for viz_output in viz_outputs:
            vis_output_pattern_pdf = os.path.join(output_dir,
                                                  f"*.{viz_output}")
            visualize.roc_curves_from_test_statistics(
                [test_stats, test_stats],
                output_feature_name,
                model_names=["Model1", "Model2"],
                output_directory=output_dir,
                file_format=viz_output,
            )
            figure_cnt = glob.glob(vis_output_pattern_pdf)
            assert 1 == len(figure_cnt)
Esempio n. 14
0
def test_savedmodel(csv_filename, should_load_model):
    #######
    # Setup
    #######
    with tempfile.TemporaryDirectory() as tmpdir:
        dir_path = tmpdir
        data_csv_path = os.path.join(tmpdir, csv_filename)
        image_dest_folder = os.path.join(tmpdir, 'generated_images')
        audio_dest_folder = os.path.join(tmpdir, 'generated_audio')

        # Single sequence input, single category output
        input_features = [
            binary_feature(),
            numerical_feature(),
            category_feature(vocab_size=3),
            sequence_feature(vocab_size=3),
            text_feature(vocab_size=3),
            vector_feature(),
            image_feature(image_dest_folder),
            audio_feature(audio_dest_folder),
            timeseries_feature(),
            date_feature(),
            h3_feature(),
            set_feature(vocab_size=3),
            bag_feature(vocab_size=3),
        ]

        output_features = [
            category_feature(vocab_size=3),
            binary_feature(),
            numerical_feature(),
            sequence_feature(vocab_size=3),
            text_feature(vocab_size=3),
            set_feature(vocab_size=3),
            vector_feature()
        ]

        predictions_column_name = '{}_predictions'.format(
            output_features[0]['name'])

        # Generate test data
        data_csv_path = generate_data(input_features, output_features,
                                      data_csv_path)

        #############
        # Train model
        #############
        backend = LocalTestBackend()
        config = {
            'input_features': input_features,
            'output_features': output_features,
            'training': {
                'epochs': 2
            }
        }
        ludwig_model = LudwigModel(config, backend=backend)
        ludwig_model.train(
            dataset=data_csv_path,
            skip_save_training_description=True,
            skip_save_training_statistics=True,
            skip_save_model=True,
            skip_save_progress=True,
            skip_save_log=True,
            skip_save_processed_input=True,
        )

        ###################
        # save Ludwig model
        ###################
        ludwigmodel_path = os.path.join(dir_path, 'ludwigmodel')
        shutil.rmtree(ludwigmodel_path, ignore_errors=True)
        ludwig_model.save(ludwigmodel_path)

        ###################
        # load Ludwig model
        ###################
        if should_load_model:
            ludwig_model = LudwigModel.load(ludwigmodel_path, backend=backend)

        ##############################
        # collect weight tensors names
        ##############################
        original_predictions_df, _ = ludwig_model.predict(
            dataset=data_csv_path)
        original_weights = deepcopy(ludwig_model.model.trainable_variables)

        #################
        # save savedmodel
        #################
        savedmodel_path = os.path.join(dir_path, 'savedmodel')
        shutil.rmtree(savedmodel_path, ignore_errors=True)
        ludwig_model.model.save_savedmodel(savedmodel_path)

        ###################################################
        # load Ludwig model, obtain predictions and weights
        ###################################################
        ludwig_model = LudwigModel.load(ludwigmodel_path, backend=backend)
        loaded_prediction_df, _ = ludwig_model.predict(dataset=data_csv_path)
        loaded_weights = deepcopy(ludwig_model.model.trainable_variables)

        #################################################
        # restore savedmodel, obtain predictions and weights
        #################################################
        training_set_metadata_json_fp = os.path.join(
            ludwigmodel_path, TRAIN_SET_METADATA_FILE_NAME)

        dataset, training_set_metadata = preprocess_for_prediction(
            ludwig_model.config,
            dataset=data_csv_path,
            training_set_metadata=training_set_metadata_json_fp,
            backend=backend,
        )

        restored_model = tf.saved_model.load(savedmodel_path)

        # Check the outputs for one of the features for correctness
        # Here we choose the first output feature (categorical)
        of_name = list(ludwig_model.model.output_features.keys())[0]

        data_to_predict = {
            name: tf.convert_to_tensor(dataset.dataset[feature.proc_column],
                                       dtype=feature.get_input_dtype())
            for name, feature in ludwig_model.model.input_features.items()
        }

        logits = restored_model(data_to_predict, False, None)

        restored_predictions = tf.argmax(logits[of_name]['logits'],
                                         -1,
                                         name='predictions_{}'.format(of_name))
        restored_predictions = tf.map_fn(
            lambda idx: training_set_metadata[of_name]['idx2str'][idx],
            restored_predictions,
            dtype=tf.string)

        restored_weights = deepcopy(restored_model.trainable_variables)

        #########
        # Cleanup
        #########
        shutil.rmtree(ludwigmodel_path, ignore_errors=True)
        shutil.rmtree(savedmodel_path, ignore_errors=True)

        ###############################################
        # Check if weights and predictions are the same
        ###############################################

        # check for same number of weights as original model
        assert len(original_weights) == len(loaded_weights)
        assert len(original_weights) == len(restored_weights)

        # check to ensure weight valuess match the original model
        loaded_weights_match = np.all([
            np.all(
                np.isclose(original_weights[i].numpy(),
                           loaded_weights[i].numpy()))
            for i in range(len(original_weights))
        ])

        original_weights = sorted(original_weights, key=lambda w: w.name)
        restored_weights = sorted(restored_weights, key=lambda w: w.name)

        restored_weights_match = np.all([
            np.all(
                np.isclose(original_weights[i].numpy(),
                           restored_weights[i].numpy()))
            for i in range(len(original_weights))
        ])

        assert loaded_weights_match and restored_weights_match

        #  Are predictions identical to original ones?
        loaded_predictions_match = np.all(
            original_predictions_df[predictions_column_name] ==
            loaded_prediction_df[predictions_column_name])

        restored_predictions_match = np.all(
            original_predictions_df[predictions_column_name] ==
            restored_predictions.numpy().astype('str'))

        assert loaded_predictions_match and restored_predictions_match
Esempio n. 15
0
def test_model_save_reload_api(csv_filename, tmp_path):
    torch.manual_seed(1)
    random.seed(1)
    np.random.seed(1)

    image_dest_folder = os.path.join(os.getcwd(), "generated_images")
    audio_dest_folder = os.path.join(os.getcwd(), "generated_audio")

    input_features = [
        binary_feature(),
        numerical_feature(),
        category_feature(vocab_size=3),
        sequence_feature(vocab_size=3),
        text_feature(vocab_size=3, encoder="rnn", cell_type="lstm", num_layers=2, bidirections=True),
        vector_feature(),
        image_feature(image_dest_folder),
        audio_feature(audio_dest_folder, encoder="stacked_cnn"),
        timeseries_feature(encoder="parallel_cnn"),
        sequence_feature(vocab_size=3, encoder="stacked_parallel_cnn"),
        date_feature(),
        h3_feature(),
        set_feature(vocab_size=3),
        bag_feature(vocab_size=3),
    ]

    output_features = [
        binary_feature(),
        numerical_feature(),
        category_feature(vocab_size=3),
<<<<<<< HEAD
        # TODO(shreya): Reintroduce sequence and text after sequence output feature.
=======
        # TODO(#1333): Reintroduce sequence and text after sequence output feature.
>>>>>>> upstream/master
        # sequence_feature(vocab_size=3),
        # text_feature(vocab_size=3),
        set_feature(vocab_size=3),
        vector_feature(),
    ]

    # Generate test data
    data_csv_path = generate_data(input_features, output_features, csv_filename)

    #############
    # Train model
    #############
    config = {"input_features": input_features, "output_features": output_features, "training": {"epochs": 2}}

    data_df = read_csv(data_csv_path)
    data_df[SPLIT] = get_split(data_df)
    training_set, test_set, validation_set = split_dataset_ttv(data_df, SPLIT)
    training_set = pd.DataFrame(training_set)
    validation_set = pd.DataFrame(validation_set)
    test_set = pd.DataFrame(test_set)

    # create sub-directory to store results
    results_dir = tmp_path / "results"
    results_dir.mkdir()

    # perform initial model training
    backend = LocalTestBackend()
    ludwig_model1 = LudwigModel(config, backend=backend)
    _, _, output_dir = ludwig_model1.train(
        training_set=training_set,
        validation_set=validation_set,
        test_set=test_set,
        output_directory="results",  # results_dir
    )

    preds_1, _ = ludwig_model1.predict(dataset=validation_set)

    def check_model_equal(ludwig_model2):
        # Compare model predictions
        preds_2, _ = ludwig_model2.predict(dataset=validation_set)
        assert set(preds_1.keys()) == set(preds_2.keys())
        for key in preds_1:
            assert preds_1[key].dtype == preds_2[key].dtype, key
            assert np.all(a == b for a, b in zip(preds_1[key], preds_2[key])), key
            # assert preds_2[key].dtype == preds_3[key].dtype, key
            # assert list(preds_2[key]) == list(preds_3[key]), key

        # Compare model weights
        # this has to be done after predicts because of TF2 lazy restoration
        for if_name in ludwig_model1.model.input_features:
            if1 = ludwig_model1.model.input_features[if_name]
            if2 = ludwig_model2.model.input_features[if_name]
            for if1_w, if2_w in zip(if1.encoder_obj.parameters(), if2.encoder_obj.parameters()):
                assert torch.allclose(if1_w, if2_w)

        c1 = ludwig_model1.model.combiner
        c2 = ludwig_model2.model.combiner
        for c1_w, c2_w in zip(c1.parameters(), c2.parameters()):
            assert torch.allclose(c1_w, c2_w)

        for of_name in ludwig_model1.model.output_features:
            of1 = ludwig_model1.model.output_features[of_name]
            of2 = ludwig_model2.model.output_features[of_name]
            for of1_w, of2_w in zip(of1.decoder_obj.parameters(), of2.decoder_obj.parameters()):
                assert torch.allclose(of1_w, of2_w)

    # Test saving and loading the model explicitly
    with tempfile.TemporaryDirectory() as tmpdir:
        ludwig_model1.save(tmpdir)
        ludwig_model_loaded = LudwigModel.load(tmpdir, backend=backend)
        check_model_equal(ludwig_model_loaded)

    # Test loading the model from the experiment directory
    ludwig_model_exp = LudwigModel.load(os.path.join(output_dir, "model"), backend=backend)
    check_model_equal(ludwig_model_exp)
Esempio n. 16
0
        assert feature_name in preproc_inputs.keys(
        ), f'feature "{feature_name}" not found.'

        feature_values = preproc_inputs[feature_name]
        assert utils.is_all_close(
            feature_values, feature_values_expected
        ), f'feature "{feature_name}" value mismatch.'


@pytest.mark.parametrize(
    "feature",
    [
        number_feature(),
        binary_feature(),
        category_feature(vocab_size=3),
        bag_feature(vocab_size=3),
        set_feature(vocab_size=3),
        text_feature(vocab_size=3),
        sequence_feature(vocab_size=3),
        timeseries_feature(),
        h3_feature(),
        # TODO: future support
        # audio_feature(),  # default BACKFILL strategy is unintuitive at inference time
        # image_feature(),  # default BACKFILL strategy is unintuitive at inference time
        # vector_feature(), # does not have a missing_value_strategy
        # date_feature(),   # default fill with datetime.now() strategy is not scriptable
    ],
)
def test_torchscript_preproc_with_nans(tmpdir, csv_filename, feature):
    data_csv_path = os.path.join(tmpdir, csv_filename)
    input_features = [
Esempio n. 17
0
def test_model_save_reload_api(csv_filename, tmp_path):
    tf.random.set_seed(1234)

    image_dest_folder = os.path.join(os.getcwd(), 'generated_images')
    audio_dest_folder = os.path.join(os.getcwd(), 'generated_audio')

    input_features = [
        binary_feature(),
        numerical_feature(),
        category_feature(vocab_size=3),
        sequence_feature(vocab_size=3),
        text_feature(vocab_size=3, encoder='rnn', cell_type='lstm',
                     num_layers=2, bidirections=True),
        vector_feature(),
        image_feature(image_dest_folder),
        audio_feature(audio_dest_folder, encoder='stacked_cnn'),
        timeseries_feature(encoder='parallel_cnn'),
        sequence_feature(vocab_size=3, encoder='stacked_parallel_cnn'),
        date_feature(),
        h3_feature(),
        set_feature(vocab_size=3),
        bag_feature(vocab_size=3),
    ]

    output_features = [
        binary_feature(),
        numerical_feature(),
        category_feature(vocab_size=3),
        sequence_feature(vocab_size=3),
        text_feature(vocab_size=3),
        set_feature(vocab_size=3),
        vector_feature(),
    ]

    # Generate test data
    data_csv_path = generate_data(input_features, output_features,
                                  csv_filename)

    #############
    # Train model
    #############
    config = {
        'input_features': input_features,
        'output_features': output_features,
        'training': {'epochs': 2}
    }

    data_df = read_csv(data_csv_path)
    data_df[SPLIT] = get_split(data_df)
    training_set, test_set, validation_set = split_dataset_ttv(
        data_df,
        SPLIT
    )
    training_set = pd.DataFrame(training_set)
    validation_set = pd.DataFrame(validation_set)
    test_set = pd.DataFrame(test_set)

    # create sub-directory to store results
    results_dir = tmp_path / 'results'
    results_dir.mkdir()

    # perform initial model training
    backend = LocalTestBackend()
    ludwig_model1 = LudwigModel(config, backend=backend)
    _, _, output_dir = ludwig_model1.train(
        training_set=training_set,
        validation_set=validation_set,
        test_set=test_set,
        output_directory='results'  # results_dir
    )

    preds_1, _ = ludwig_model1.predict(dataset=validation_set)

    def check_model_equal(ludwig_model2):
        # Compare model predictions
        preds_2, _ = ludwig_model2.predict(dataset=validation_set)
        assert set(preds_1.keys()) == set(preds_2.keys())
        for key in preds_1:
            assert preds_1[key].dtype == preds_2[key].dtype, key
            assert np.all(a == b for a, b in zip(preds_1[key], preds_2[key])), key
            # assert preds_2[key].dtype == preds_3[key].dtype, key
            # assert list(preds_2[key]) == list(preds_3[key]), key

        # Compare model weights
        # this has to be done after predicts because of TF2 lazy restoration
        for if_name in ludwig_model1.model.input_features:
            if1 = ludwig_model1.model.input_features[if_name]
            if2 = ludwig_model2.model.input_features[if_name]
            for if1_w, if2_w in zip(if1.encoder_obj.weights,
                                    if2.encoder_obj.weights):
                assert np.allclose(if1_w.numpy(), if2_w.numpy())

        c1 = ludwig_model1.model.combiner
        c2 = ludwig_model2.model.combiner
        for c1_w, c2_w in zip(c1.weights, c2.weights):
            assert np.allclose(c1_w.numpy(), c2_w.numpy())

        for of_name in ludwig_model1.model.output_features:
            of1 = ludwig_model1.model.output_features[of_name]
            of2 = ludwig_model2.model.output_features[of_name]
            for of1_w, of2_w in zip(of1.decoder_obj.weights,
                                    of2.decoder_obj.weights):
                assert np.allclose(of1_w.numpy(), of2_w.numpy())

    # Test saving and loading the model explicitly
    with tempfile.TemporaryDirectory() as tmpdir:
        ludwig_model1.save(tmpdir)
        ludwig_model_loaded = LudwigModel.load(tmpdir, backend=backend)
        check_model_equal(ludwig_model_loaded)

    # Test loading the model from the experiment directory
    ludwig_model_exp = LudwigModel.load(
        os.path.join(output_dir, 'model'),
        backend=backend
    )
    check_model_equal(ludwig_model_exp)
Esempio n. 18
0

@pytest.mark.parametrize(
    "input_test_feature, output_test_feature",
    [
        ([category_feature()], [binary_feature(), binary_feature()]),
        ([category_feature()], [category_feature(vocab_size=5), category_feature(vocab_size=7)]),
        ([category_feature()], [numerical_feature(), numerical_feature()]),
        # ([category_feature()],
        #  [sequence_feature(vocab_size=5), sequence_feature(vocab_size=7)]),
        ([set_feature(vocab_size=5)], [set_feature(vocab_size=5), set_feature(vocab_size=7)]),
        # ([category_feature()],
        #  [text_feature(vocab_size=5), text_feature(vocab_size=7)]),
        ([category_feature()], [vector_feature(), vector_feature()]),
        ([vector_feature()], [vector_feature(), vector_feature()]),
        ([bag_feature()], [vector_feature(), vector_feature()]),
    ],
)
def test_feature_multiple_outputs(input_test_feature, output_test_feature, csv_filename):
    # Generate test data
    rel_path = generate_data(input_test_feature, output_test_feature, csv_filename, 1001)

    run_experiment(input_test_feature, output_test_feature, dataset=rel_path)


def test_category_int_dtype(tmpdir):
    feature = category_feature()
    input_features = [feature]
    output_features = [binary_feature()]

    csv_fname = generate_data(input_features, output_features, os.path.join(tmpdir, "dataset.csv"))
Esempio n. 19
0
def t_neuropod(csv_filename):
    #######
    # Setup
    #######
    dir_path = os.path.dirname(csv_filename)
    image_dest_folder = os.path.join(os.getcwd(), 'generated_images')
    audio_dest_folder = os.path.join(os.getcwd(), 'generated_audio')

    input_features = [
        binary_feature(),
        numerical_feature(),
        category_feature(vocab_size=3),
        sequence_feature(vocab_size=3),
        text_feature(vocab_size=3),
        vector_feature(),
        image_feature(image_dest_folder),
        audio_feature(audio_dest_folder),
        timeseries_feature(),
        date_feature(),
        h3_feature(),
        set_feature(vocab_size=3),
        bag_feature(vocab_size=3),
    ]

    output_features = [
        binary_feature(),
        numerical_feature(),
        category_feature(vocab_size=3),
        sequence_feature(vocab_size=3),
        text_feature(vocab_size=3),
        set_feature(vocab_size=3),
        vector_feature()
    ]

    # Generate test data
    data_csv_path = generate_data(input_features, output_features,
                                  csv_filename)

    #############
    # Train model
    #############
    model_definition = {
        'input_features': input_features,
        'output_features': output_features,
        'training': {
            'epochs': 2
        }
    }
    ludwig_model = LudwigModel(model_definition)
    ludwig_model.train(
        data_csv=data_csv_path,
        skip_save_training_description=True,
        skip_save_training_statistics=True,
        skip_save_model=True,
        skip_save_progress=True,
        skip_save_log=True,
        skip_save_processed_input=True,
    )
    original_predictions_df = ludwig_model.predict(data_csv=data_csv_path)

    ###################
    # save Ludwig model
    ###################
    ludwigmodel_path = os.path.join(dir_path, 'ludwigmodel')
    shutil.rmtree(ludwigmodel_path, ignore_errors=True)
    ludwig_model.save(ludwigmodel_path)

    ################
    # build neuropod
    ################
    neuropod_path = os.path.join(dir_path, 'neuropod')
    export_neuropod(ludwigmodel_path, neuropod_path=neuropod_path)

    ########################
    # predict using neuropod
    ########################
    data_df = pd.read_csv(data_csv_path)
    if_dict = {
        input_feature['name']: np.expand_dims(
            np.array([str(x) for x in data_df[input_feature['name']].tolist()],
                     dtype='str'), 1)
        for input_feature in input_features
    }

    from neuropod.loader import load_neuropod
    neuropod_model = load_neuropod(neuropod_path)
    preds = neuropod_model.infer(if_dict)

    for key in preds:
        preds[key] = np.squeeze(preds[key])

    #########
    # cleanup
    #########
    # Delete the temporary data created
    for path in [
            ludwigmodel_path, neuropod_path, image_dest_folder,
            audio_dest_folder
    ]:
        if os.path.exists(path):
            if os.path.isfile(path):
                os.remove(path)
            else:
                shutil.rmtree(path, ignore_errors=True)

    ########
    # checks
    ########
    for output_feature in output_features:
        output_feature_name = output_feature['name']
        output_feature_type = output_feature['type']

        if (output_feature_name + "_predictions" in preds
                and output_feature_name + "_predictions"
                in original_predictions_df):
            neuropod_pred = preds[output_feature_name +
                                  "_predictions"].tolist()
            if output_feature_type == BINARY:
                neuropod_pred = list(map(lambda x: str2bool(x), neuropod_pred))
            if output_feature_type in {SEQUENCE, TEXT, SET}:
                neuropod_pred = list(map(lambda x: x.split(), neuropod_pred))

            original_pred = original_predictions_df[output_feature_name +
                                                    "_predictions"].tolist()

            assert neuropod_pred == original_pred

        if (output_feature_name + "_probability" in preds
                and output_feature_name + "_probability"
                in original_predictions_df):
            neuropod_prob = preds[output_feature_name +
                                  "_probability"].tolist()
            if output_feature_type in {SEQUENCE, TEXT, SET}:
                neuropod_prob = list(
                    map(lambda x: [float(n) for n in x.split()],
                        neuropod_prob))
            if any(isinstance(el, list) for el in neuropod_prob):
                neuropod_prob = np.array(
                    list(itertools.zip_longest(*neuropod_prob, fillvalue=0))).T

            original_prob = original_predictions_df[output_feature_name +
                                                    "_probability"].tolist()
            if any(isinstance(el, list) for el in original_prob):
                original_prob = np.array(
                    list(itertools.zip_longest(*original_prob, fillvalue=0))).T

            assert np.isclose(neuropod_prob, original_prob).all()

        if (output_feature_name + "_probabilities" in preds
                and output_feature_name + "_probabilities"
                in original_predictions_df):
            neuropod_prob = preds[output_feature_name +
                                  "_probabilities"].tolist()

            original_prob = original_predictions_df[output_feature_name +
                                                    "_probabilities"].tolist()

            assert np.isclose(neuropod_prob, original_prob).all()
Esempio n. 20
0
def test_neuropod(csv_filename):
    #######
    # Setup
    #######
    with tempfile.TemporaryDirectory() as tmpdir:
        dir_path = tmpdir
        data_csv_path = os.path.join(tmpdir, csv_filename)
        image_dest_folder = os.path.join(tmpdir, "generated_images")
        audio_dest_folder = os.path.join(tmpdir, "generated_audio")

        input_features = [
            binary_feature(),
            numerical_feature(),
            category_feature(vocab_size=3),
            sequence_feature(vocab_size=3),
            text_feature(vocab_size=3),
            vector_feature(),
            image_feature(image_dest_folder),
            audio_feature(audio_dest_folder),
            timeseries_feature(),
            date_feature(),
            h3_feature(),
            set_feature(vocab_size=3),
            bag_feature(vocab_size=3),
        ]

        output_features = [
            binary_feature(),
            numerical_feature(),
            category_feature(vocab_size=3),
            sequence_feature(vocab_size=3),
            text_feature(vocab_size=3),
            set_feature(vocab_size=3),
            vector_feature(),
        ]

        # Generate test data
        data_csv_path = generate_data(input_features, output_features, data_csv_path)

        #############
        # Train model
        #############
        config = {"input_features": input_features, "output_features": output_features, "training": {"epochs": 2}}
        ludwig_model = LudwigModel(config, backend=LocalTestBackend())
        ludwig_model.train(
            dataset=data_csv_path,
            skip_save_training_description=True,
            skip_save_training_statistics=True,
            skip_save_progress=True,
            skip_save_log=True,
            skip_save_processed_input=True,
            output_directory=dir_path,
        )

        data_df = pd.read_csv(data_csv_path)
        original_predictions_df, _ = ludwig_model.predict(dataset=data_df)

        ###################
        # save Ludwig model
        ###################
        ludwigmodel_path = os.path.join(dir_path, "ludwigmodel")
        shutil.rmtree(ludwigmodel_path, ignore_errors=True)
        ludwig_model.save(ludwigmodel_path)

        ################
        # build neuropod
        ################
        neuropod_path = os.path.join(dir_path, "neuropod")
        shutil.rmtree(neuropod_path, ignore_errors=True)
        export_neuropod(ludwigmodel_path, neuropod_path=neuropod_path, entrypoint="get_test_model")

        ########################
        # predict using neuropod
        ########################
        if_dict = {
            input_feature["name"]: np.expand_dims(
                np.array([str(x) for x in data_df[input_feature["name"]].tolist()], dtype="str"), 1
            )
            for input_feature in input_features
        }

        from neuropod.loader import load_neuropod

        neuropod_model = load_neuropod(neuropod_path, _always_use_native=False)
        preds = neuropod_model.infer(if_dict)

        for key in preds:
            preds[key] = np.squeeze(preds[key])

        #########
        # cleanup
        #########
        # Delete the temporary data created
        for path in [ludwigmodel_path, neuropod_path, image_dest_folder, audio_dest_folder]:
            if os.path.exists(path):
                if os.path.isfile(path):
                    os.remove(path)
                else:
                    shutil.rmtree(path, ignore_errors=True)

        ########
        # checks
        ########
        for output_feature in output_features:
            output_feature_name = output_feature["name"]
            output_feature_type = output_feature["type"]

            if (
                output_feature_name + "_predictions" in preds
                and output_feature_name + "_predictions" in original_predictions_df
            ):
                neuropod_pred = preds[output_feature_name + "_predictions"].tolist()
                if output_feature_type == BINARY:
                    neuropod_pred = [str2bool(x) for x in neuropod_pred]
                if output_feature_type in {SEQUENCE, TEXT, SET}:
                    neuropod_pred = [x.split() for x in neuropod_pred]

                original_pred = original_predictions_df[output_feature_name + "_predictions"].tolist()

                assert neuropod_pred == original_pred

            if (
                output_feature_name + "_probability" in preds
                and output_feature_name + "_probability" in original_predictions_df
            ):
                neuropod_prob = preds[output_feature_name + "_probability"].tolist()
                if output_feature_type in {SEQUENCE, TEXT, SET}:
                    neuropod_prob = [[float(n) for n in x.split()] for x in neuropod_prob]
                if any(isinstance(el, list) for el in neuropod_prob):
                    neuropod_prob = np.array(list(itertools.zip_longest(*neuropod_prob, fillvalue=0))).T

                original_prob = original_predictions_df[output_feature_name + "_probability"].tolist()
                if any(isinstance(el, list) for el in original_prob):
                    original_prob = np.array(list(itertools.zip_longest(*original_prob, fillvalue=0))).T

                assert np.allclose(neuropod_prob, original_prob)

            if (
                output_feature_name + "_probabilities" in preds
                and output_feature_name + "_probabilities" in original_predictions_df
            ):
                neuropod_prob = preds[output_feature_name + "_probabilities"].tolist()

                original_prob = original_predictions_df[output_feature_name + "_probabilities"].tolist()

                assert np.allclose(neuropod_prob, original_prob)
Esempio n. 21
0
def test_torchscript(csv_filename, should_load_model):
    #######
    # Setup
    #######
    with tempfile.TemporaryDirectory() as tmpdir:
        dir_path = tmpdir
        data_csv_path = os.path.join(tmpdir, csv_filename)
        image_dest_folder = os.path.join(tmpdir, "generated_images")
        audio_dest_folder = os.path.join(tmpdir, "generated_audio")

        # Single sequence input, single category output
        input_features = [
            binary_feature(),
            numerical_feature(),
            category_feature(vocab_size=3),
            sequence_feature(vocab_size=3),
            text_feature(vocab_size=3),
            vector_feature(),
            image_feature(image_dest_folder),
            audio_feature(audio_dest_folder),
            timeseries_feature(),
            date_feature(),
            date_feature(),
            h3_feature(),
            set_feature(vocab_size=3),
            bag_feature(vocab_size=3),
        ]

        output_features = [
            category_feature(vocab_size=3),
            binary_feature(),
            numerical_feature(),
            set_feature(vocab_size=3),
            vector_feature()
            # TODO(#1333): Re-enable.
            # sequence_feature(vocab_size=3),
            # text_feature(vocab_size=3),
        ]

        predictions_column_name = "{}_predictions".format(output_features[0]["name"])

        # Generate test data
        data_csv_path = generate_data(input_features, output_features, data_csv_path)

        #############
        # Train model
        #############
        backend = LocalTestBackend()
        config = {"input_features": input_features, "output_features": output_features, "training": {"epochs": 2}}
        ludwig_model = LudwigModel(config, backend=backend)
        ludwig_model.train(
            dataset=data_csv_path,
            skip_save_training_description=True,
            skip_save_training_statistics=True,
            skip_save_model=True,
            skip_save_progress=True,
            skip_save_log=True,
            skip_save_processed_input=True,
        )

        ###################
        # save Ludwig model
        ###################
        ludwigmodel_path = os.path.join(dir_path, "ludwigmodel")
        shutil.rmtree(ludwigmodel_path, ignore_errors=True)
        ludwig_model.save(ludwigmodel_path)

        ###################
        # load Ludwig model
        ###################
        if should_load_model:
            ludwig_model = LudwigModel.load(ludwigmodel_path, backend=backend)

        ##############################
        # collect weight tensors names
        ##############################
        original_predictions_df, _ = ludwig_model.predict(dataset=data_csv_path)
        original_weights = deepcopy(list(ludwig_model.model.parameters()))

        #################
        # save torchscript
        #################
        torchscript_path = os.path.join(dir_path, "torchscript")
        shutil.rmtree(torchscript_path, ignore_errors=True)
        ludwig_model.model.save_torchscript(torchscript_path)

        ###################################################
        # load Ludwig model, obtain predictions and weights
        ###################################################
        ludwig_model = LudwigModel.load(ludwigmodel_path, backend=backend)
        loaded_prediction_df, _ = ludwig_model.predict(dataset=data_csv_path)
        loaded_weights = deepcopy(list(ludwig_model.model.parameters()))

        #####################################################
        # restore torchscript, obtain predictions and weights
        #####################################################
        training_set_metadata_json_fp = os.path.join(ludwigmodel_path, TRAIN_SET_METADATA_FILE_NAME)

        dataset, training_set_metadata = preprocess_for_prediction(
            ludwig_model.config,
            dataset=data_csv_path,
            training_set_metadata=training_set_metadata_json_fp,
            backend=backend,
        )

        restored_model = torch.jit.load(torchscript_path)

        # Check the outputs for one of the features for correctness
        # Here we choose the first output feature (categorical)
        of_name = list(ludwig_model.model.output_features.keys())[0]

        data_to_predict = {
            name: torch.from_numpy(dataset.dataset[feature.proc_column])
            for name, feature in ludwig_model.model.input_features.items()
        }

        # Get predictions from restored torchscript.
        logits = restored_model(data_to_predict)
        restored_predictions = torch.argmax(
            output_feature_utils.get_output_feature_tensor(logits, of_name, "logits"), -1
        )

        restored_predictions = [training_set_metadata[of_name]["idx2str"][idx] for idx in restored_predictions]

        restored_weights = deepcopy(list(restored_model.parameters()))

        #########
        # Cleanup
        #########
        shutil.rmtree(ludwigmodel_path, ignore_errors=True)
        shutil.rmtree(torchscript_path, ignore_errors=True)

        ###############################################
        # Check if weights and predictions are the same
        ###############################################

        # Check to weight values match the original model.
        assert utils.is_all_close(original_weights, loaded_weights)
        assert utils.is_all_close(original_weights, restored_weights)

        # Check that predictions are identical to the original model.
        assert np.all(original_predictions_df[predictions_column_name] == loaded_prediction_df[predictions_column_name])

        assert np.all(original_predictions_df[predictions_column_name] == restored_predictions)
Esempio n. 22
0
def test_model_save_reload_api(tmpdir, csv_filename, tmp_path):
    torch.manual_seed(1)
    random.seed(1)
    np.random.seed(1)

    image_dest_folder = os.path.join(os.getcwd(), "generated_images")
    audio_dest_folder = os.path.join(os.getcwd(), "generated_audio")

    input_features = [
        binary_feature(),
        number_feature(),
        category_feature(vocab_size=3),
        sequence_feature(vocab_size=3),
        text_feature(vocab_size=3,
                     encoder="rnn",
                     cell_type="lstm",
                     num_layers=2,
                     bidirections=True),
        vector_feature(),
        image_feature(image_dest_folder),
        audio_feature(audio_dest_folder, encoder="stacked_cnn"),
        timeseries_feature(encoder="parallel_cnn"),
        sequence_feature(vocab_size=3, encoder="stacked_parallel_cnn"),
        date_feature(),
        h3_feature(),
        set_feature(vocab_size=3),
        bag_feature(vocab_size=3),
    ]

    output_features = [
        binary_feature(),
        number_feature(),
        category_feature(vocab_size=3),
        sequence_feature(vocab_size=3),
        text_feature(vocab_size=3),
        set_feature(vocab_size=3),
        vector_feature(),
    ]

    # Generate test data
    data_csv_path = generate_data(input_features,
                                  output_features,
                                  csv_filename,
                                  num_examples=50)

    #############
    # Train model
    #############
    config = {
        "input_features": input_features,
        "output_features": output_features,
        TRAINER: {
            "epochs": 2
        }
    }

    data_df = read_csv(data_csv_path)
    splitter = get_splitter("random")
    training_set, validation_set, test_set = splitter.split(
        data_df, LocalTestBackend())

    # create sub-directory to store results
    results_dir = tmp_path / "results"
    results_dir.mkdir()

    # perform initial model training
    backend = LocalTestBackend()
    ludwig_model1 = LudwigModel(config, backend=backend)
    _, _, output_dir = ludwig_model1.train(
        training_set=training_set,
        validation_set=validation_set,
        test_set=test_set,
        output_directory="results",  # results_dir
    )

    preds_1, _ = ludwig_model1.predict(dataset=validation_set)

    def check_model_equal(ludwig_model2):
        # Compare model predictions
        preds_2, _ = ludwig_model2.predict(dataset=validation_set)
        assert set(preds_1.keys()) == set(preds_2.keys())
        for key in preds_1:
            assert preds_1[key].dtype == preds_2[key].dtype, key
            assert np.all(a == b
                          for a, b in zip(preds_1[key], preds_2[key])), key
            # assert preds_2[key].dtype == preds_3[key].dtype, key
            # assert list(preds_2[key]) == list(preds_3[key]), key

        # Compare model weights
        for if_name in ludwig_model1.model.input_features:
            if1 = ludwig_model1.model.input_features[if_name]
            if2 = ludwig_model2.model.input_features[if_name]
            for if1_w, if2_w in zip(if1.encoder_obj.parameters(),
                                    if2.encoder_obj.parameters()):
                assert torch.allclose(if1_w, if2_w)

        c1 = ludwig_model1.model.combiner
        c2 = ludwig_model2.model.combiner
        for c1_w, c2_w in zip(c1.parameters(), c2.parameters()):
            assert torch.allclose(c1_w, c2_w)

        for of_name in ludwig_model1.model.output_features:
            of1 = ludwig_model1.model.output_features[of_name]
            of2 = ludwig_model2.model.output_features[of_name]
            for of1_w, of2_w in zip(of1.decoder_obj.parameters(),
                                    of2.decoder_obj.parameters()):
                assert torch.allclose(of1_w, of2_w)

    ludwig_model1.save(tmpdir)
    ludwig_model_loaded = LudwigModel.load(tmpdir, backend=backend)
    check_model_equal(ludwig_model_loaded)

    # Test loading the model from the experiment directory
    ludwig_model_exp = LudwigModel.load(os.path.join(output_dir, "model"),
                                        backend=backend)
    check_model_equal(ludwig_model_exp)