Beispiel #1
0
def test_torchscript_preproc_with_nans(tmpdir, csv_filename, feature):
    """Compare TorchScript preprocessing with Python preprocessing on NaN data.

    A model with the single input ``feature`` and one binary output is
    initialized on data generated with ``nan_percent=0.2``. The same CSV is
    then preprocessed twice: once through ``preprocess_for_prediction`` and
    once through the scripted module's ``preprocessor_forward``. Every
    feature present in both results must be close according to
    ``utils.is_all_close``.

    Args:
        tmpdir: Directory in which the CSV is written (presumably the pytest
            ``tmpdir`` fixture; confirm against the test setup).
        csv_filename: File name of the generated CSV inside ``tmpdir``.
        feature: Input feature config under test.
    """
    csv_target = os.path.join(tmpdir, csv_filename)

    input_features = [feature]
    output_features = [binary_feature()]
    backend = LocalTestBackend()
    config = {
        "input_features": input_features,
        "output_features": output_features,
        TRAINER: {"epochs": 2},
    }

    training_data_csv_path = generate_data(
        input_features, output_features, csv_target, nan_percent=0.2
    )

    # Build the Ludwig model together with its scripted counterpart.
    ludwig_model, script_module = initialize_torchscript_module(
        tmpdir, config, backend, training_data_csv_path
    )

    # Reference result: preprocessing performed by the Python code path.
    preproc_inputs_expected, _ = preprocess_for_prediction(
        ludwig_model.config,
        training_data_csv_path,
        ludwig_model.training_set_metadata,
        backend=backend,
        include_outputs=False,
    )

    # Result under test: preprocessing performed by the scripted module.
    raw_frame = pd.read_csv(training_data_csv_path)
    inputs = to_inference_module_input_from_dataframe(
        raw_frame, config, load_paths=True
    )
    preproc_inputs = script_module.preprocessor_forward(inputs)

    for proc_column, expected_values in preproc_inputs_expected.dataset.items():
        # Strip the proc suffix: everything from the last underscore onward.
        suffix_start = proc_column.rfind("_")
        feature_name = proc_column[:suffix_start]

        # Features the scripted preprocessor did not return are not compared.
        if feature_name in preproc_inputs.keys():
            actual_values = preproc_inputs[feature_name]
            assert utils.is_all_close(
                actual_values, expected_values
            ), f"feature: {feature_name}"
Beispiel #2
0
def validate_torchscript_outputs(tmpdir, config, backend, training_data_csv_path, tolerance=1e-8):
    """Assert that the scripted module reproduces the Ludwig model's predictions.

    The Ludwig model and its scripted module are obtained from
    ``initialize_torchscript_module``. Predictions from
    ``ludwig_model.predict`` are the reference. For every output feature,
    the PREDICTIONS, PROBABILITIES and LOGITS entries that the reference
    contains must also be present in the scripted output, carry no gradient
    (``utils.has_no_grad``) and be close to the reference
    (``utils.is_all_close``).

    Args:
        tmpdir: Working directory handed to ``initialize_torchscript_module``.
        config: Ludwig config used to build the model and the module inputs.
        backend: Backend handed to ``initialize_torchscript_module``.
        training_data_csv_path: CSV used for initialization and prediction.
        tolerance: Accepted for callers, but not referenced anywhere in this
            body; ``utils.is_all_close`` is called without it.
    """
    ludwig_model, script_module = initialize_torchscript_module(
        tmpdir, config, backend, training_data_csv_path
    )

    # Reference predictions from the Python model, returned as a dict.
    preds_dict, _ = ludwig_model.predict(
        dataset=training_data_csv_path, return_type=dict
    )

    # Predictions from the scripted module on the same rows.
    raw_frame = pd.read_csv(training_data_csv_path)
    inputs = to_inference_module_input_from_dataframe(
        raw_frame, config, load_paths=True
    )
    outputs = script_module(inputs)

    # TODO: these are the only outputs we provide from Torchscript for now
    ts_outputs = {PREDICTIONS, PROBABILITIES, LOGITS}

    for feature_name, feature_outputs_expected in preds_dict.items():
        assert feature_name in outputs
        feature_outputs = outputs[feature_name]

        for output_name, output_values_expected in feature_outputs_expected.items():
            # Only outputs emitted by the scripted module are compared.
            if output_name in ts_outputs:
                assert output_name in feature_outputs
                output_values = feature_outputs[output_name]
                assert utils.has_no_grad(
                    output_values
                ), f'"{feature_name}.{output_name}" tensors have gradients'
                assert utils.is_all_close(
                    output_values, output_values_expected
                ), f'"{feature_name}.{output_name}" tensors are not close to ludwig model'
Beispiel #3
0
def test_torchscript(csv_filename, should_load_model):
    """Round-trip a trained Ludwig model through TorchScript and compare results.

    Steps, all inside one temporary directory:

    1. Generate a CSV for a mix of input and output feature types.
    2. Train a ``LudwigModel`` for two epochs and save it.
    3. Optionally reload the saved model (``should_load_model``).
    4. Record predictions and a deep copy of the parameters.
    5. Export the model with ``save_torchscript``.
    6. Reload the saved Ludwig model and record its predictions and weights.
    7. Load the TorchScript artifact with ``torch.jit.load``, run it on the
       preprocessed dataset and decode the first output feature via argmax
       over its logits and the ``idx2str`` metadata.
    8. Assert that weights match across all three models and that the
       predictions for the first output feature are identical.

    Args:
        csv_filename: File name of the generated CSV inside the temporary
            directory (presumably a pytest fixture; confirm in conftest).
        should_load_model: When truthy, the saved Ludwig model is reloaded
            before the original predictions and weights are recorded.
    """
    #######
    # Setup
    #######
    with tempfile.TemporaryDirectory() as tmpdir:
        # dir_path is an alias of tmpdir; both names are used below.
        dir_path = tmpdir
        data_csv_path = os.path.join(tmpdir, csv_filename)
        image_dest_folder = os.path.join(tmpdir, "generated_images")
        audio_dest_folder = os.path.join(tmpdir, "generated_audio")

        # A mix of input feature types (date_feature appears twice).
        input_features = [
            binary_feature(),
            numerical_feature(),
            category_feature(vocab_size=3),
            sequence_feature(vocab_size=3),
            text_feature(vocab_size=3),
            vector_feature(),
            image_feature(image_dest_folder),
            audio_feature(audio_dest_folder),
            timeseries_feature(),
            date_feature(),
            date_feature(),
            h3_feature(),
            set_feature(vocab_size=3),
            bag_feature(vocab_size=3),
        ]

        # The first output feature (category) is the one whose predictions
        # are compared at the end of the test.
        output_features = [
            category_feature(vocab_size=3),
            binary_feature(),
            numerical_feature(),
            set_feature(vocab_size=3),
            vector_feature()
            # TODO(#1333): Re-enable.
            # sequence_feature(vocab_size=3),
            # text_feature(vocab_size=3),
        ]

        # Column of the predictions dataframe that holds the first output
        # feature's predictions.
        predictions_column_name = "{}_predictions".format(output_features[0]["name"])

        # Generate test data
        data_csv_path = generate_data(input_features, output_features, data_csv_path)

        #############
        # Train model
        #############
        backend = LocalTestBackend()
        config = {"input_features": input_features, "output_features": output_features, "training": {"epochs": 2}}
        ludwig_model = LudwigModel(config, backend=backend)
        ludwig_model.train(
            dataset=data_csv_path,
            skip_save_training_description=True,
            skip_save_training_statistics=True,
            skip_save_model=True,
            skip_save_progress=True,
            skip_save_log=True,
            skip_save_processed_input=True,
        )

        ###################
        # save Ludwig model
        ###################
        ludwigmodel_path = os.path.join(dir_path, "ludwigmodel")
        # Remove any leftover directory before saving.
        shutil.rmtree(ludwigmodel_path, ignore_errors=True)
        ludwig_model.save(ludwigmodel_path)

        ###################
        # load Ludwig model
        ###################
        if should_load_model:
            ludwig_model = LudwigModel.load(ludwigmodel_path, backend=backend)

        ##############################################
        # collect original predictions and weights
        ##############################################
        original_predictions_df, _ = ludwig_model.predict(dataset=data_csv_path)
        # Deep copy so the recorded weights are independent of the model object.
        original_weights = deepcopy(list(ludwig_model.model.parameters()))

        #################
        # save torchscript
        #################
        torchscript_path = os.path.join(dir_path, "torchscript")
        shutil.rmtree(torchscript_path, ignore_errors=True)
        ludwig_model.model.save_torchscript(torchscript_path)

        ###################################################
        # load Ludwig model, obtain predictions and weights
        ###################################################
        ludwig_model = LudwigModel.load(ludwigmodel_path, backend=backend)
        loaded_prediction_df, _ = ludwig_model.predict(dataset=data_csv_path)
        loaded_weights = deepcopy(list(ludwig_model.model.parameters()))

        #####################################################
        # restore torchscript, obtain predictions and weights
        #####################################################
        training_set_metadata_json_fp = os.path.join(ludwigmodel_path, TRAIN_SET_METADATA_FILE_NAME)

        # Preprocess the CSV using the metadata file saved with the model.
        dataset, training_set_metadata = preprocess_for_prediction(
            ludwig_model.config,
            dataset=data_csv_path,
            training_set_metadata=training_set_metadata_json_fp,
            backend=backend,
        )

        restored_model = torch.jit.load(torchscript_path)

        # Check the outputs for one of the features for correctness
        # Here we choose the first output feature (categorical)
        of_name = list(ludwig_model.model.output_features.keys())[0]

        # Map each input feature name to a tensor built from its processed
        # column in the preprocessed dataset.
        data_to_predict = {
            name: torch.from_numpy(dataset.dataset[feature.proc_column])
            for name, feature in ludwig_model.model.input_features.items()
        }

        # Get predictions from restored torchscript.
        logits = restored_model(data_to_predict)
        # argmax over the last dimension of the chosen feature's logits
        # yields the predicted class indices.
        restored_predictions = torch.argmax(
            output_feature_utils.get_output_feature_tensor(logits, of_name, "logits"), -1
        )

        # Decode class indices into labels via the idx2str metadata so they
        # can be compared with the predictions dataframe column.
        restored_predictions = [training_set_metadata[of_name]["idx2str"][idx] for idx in restored_predictions]

        restored_weights = deepcopy(list(restored_model.parameters()))

        #########
        # Cleanup
        #########
        # Both paths live under the temporary directory; everything needed
        # for the assertions below has already been read into memory.
        shutil.rmtree(ludwigmodel_path, ignore_errors=True)
        shutil.rmtree(torchscript_path, ignore_errors=True)

        ###############################################
        # Check if weights and predictions are the same
        ###############################################

        # Check that weight values match the original model.
        assert utils.is_all_close(original_weights, loaded_weights)
        assert utils.is_all_close(original_weights, restored_weights)

        # Check that predictions are identical to the original model.
        assert np.all(original_predictions_df[predictions_column_name] == loaded_prediction_df[predictions_column_name])

        assert np.all(original_predictions_df[predictions_column_name] == restored_predictions)
Beispiel #4
0
def test_torchscript_preproc_timeseries_alternative_type(
        tmpdir, csv_filename, padding, fill_value):
    """Check TorchScript timeseries preprocessing when inputs are tensor lists.

    A model with one timeseries input and one binary output is initialized
    on data generated with ``nan_percent=0.2``. The timeseries column of the
    inference inputs is converted from whitespace-separated strings to a
    list of float tensors before being passed to the scripted module's
    ``preprocessor_forward``. Every column produced by
    ``preprocess_for_prediction`` must be present in the scripted result and
    close to it according to ``utils.is_all_close``.

    Args:
        tmpdir: Directory in which the CSV is written (presumably the pytest
            ``tmpdir`` fixture; confirm against the test setup).
        csv_filename: File name of the generated CSV inside ``tmpdir``.
        padding: Value for the feature's ``"padding"`` preprocessing option.
        fill_value: Value for the feature's ``"fill_value"`` preprocessing
            option.
    """
    data_csv_path = os.path.join(tmpdir, csv_filename)
    feature = timeseries_feature(
        preprocessing={
            "padding": padding,
            "timeseries_length_limit": 4,
            # Forward the caller-supplied value so that each fill_value
            # actually exercises a distinct preprocessing configuration
            # instead of always running with a fixed "1.0".
            "fill_value": fill_value,
        },
        max_len=7,
    )
    input_features = [
        feature,
    ]
    output_features = [
        binary_feature(),
    ]
    backend = LocalTestBackend()
    config = {
        "input_features": input_features,
        "output_features": output_features,
        TRAINER: {
            "epochs": 2
        }
    }
    training_data_csv_path = generate_data(input_features,
                                           output_features,
                                           data_csv_path,
                                           nan_percent=0.2)

    # Initialize Ludwig model
    ludwig_model, script_module = initialize_torchscript_module(
        tmpdir, config, backend, training_data_csv_path)

    # Obtain preprocessed inputs from Python model
    preproc_inputs_expected, _ = preprocess_for_prediction(
        ludwig_model.config,
        training_data_csv_path,
        ludwig_model.training_set_metadata,
        backend=backend,
        include_outputs=False,
    )

    df = pd.read_csv(training_data_csv_path)
    inputs = to_inference_module_input_from_dataframe(df,
                                                      config,
                                                      load_paths=True)

    def transform_timeseries_from_str_list_to_tensor_list(timeseries_list):
        """Parse each whitespace-separated string into a 1-D float tensor."""
        return [
            torch.tensor([float(x) for x in timeseries_str.split()])
            for timeseries_str in timeseries_list
        ]

    # Replace the string representation with the alternative input type
    # (a list of tensors) that this test is about.
    inputs[feature[NAME]] = transform_timeseries_from_str_list_to_tensor_list(
        inputs[feature[NAME]])

    preproc_inputs = script_module.preprocessor_forward(inputs)

    # Check that preproc_inputs is the same as preproc_inputs_expected.
    for feature_name_expected, feature_values_expected in preproc_inputs_expected.dataset.items():
        # Remove the proc suffix: everything from the last underscore onward.
        feature_name = feature_name_expected[:feature_name_expected.rfind("_")]
        assert feature_name in preproc_inputs, f'feature "{feature_name}" not found.'

        feature_values = preproc_inputs[feature_name]
        assert utils.is_all_close(
            feature_values, feature_values_expected
        ), f'feature "{feature_name}" value mismatch.'
Beispiel #5
0
def test_torchscript_preproc_vector_alternative_type(tmpdir, csv_filename,
                                                     vector_type):
    """Check TorchScript vector preprocessing for non-string input containers.

    A model with one vector input and one binary output is initialized on
    generated data. The vector column of the inference inputs is converted
    from whitespace-separated strings to float tensors; when ``vector_type``
    equals ``torch.Tensor`` the tensors are additionally stacked into a
    single tensor, otherwise they stay a list. The scripted module's
    ``preprocessor_forward`` result is then compared, feature by feature,
    with the output of ``preprocess_for_prediction``.

    Args:
        tmpdir: Directory in which the CSV is written (presumably the pytest
            ``tmpdir`` fixture; confirm against the test setup).
        csv_filename: File name of the generated CSV inside ``tmpdir``.
        vector_type: Container type selector; only equality with
            ``torch.Tensor`` is checked here.
    """
    csv_target = os.path.join(tmpdir, csv_filename)
    feature = vector_feature()

    input_features = [feature]
    output_features = [binary_feature()]
    backend = LocalTestBackend()
    config = {
        "input_features": input_features,
        "output_features": output_features,
        TRAINER: {"epochs": 2},
    }
    training_data_csv_path = generate_data(
        input_features, output_features, csv_target
    )

    # Build the Ludwig model together with its scripted counterpart.
    ludwig_model, script_module = initialize_torchscript_module(
        tmpdir, config, backend, training_data_csv_path
    )

    # Reference result: preprocessing performed by the Python code path.
    preproc_inputs_expected, _ = preprocess_for_prediction(
        ludwig_model.config,
        training_data_csv_path,
        ludwig_model.training_set_metadata,
        backend=backend,
        include_outputs=False,
    )

    raw_frame = pd.read_csv(training_data_csv_path)
    inputs = to_inference_module_input_from_dataframe(
        raw_frame, config, load_paths=True
    )

    def transform_vector_list(vector_list, vector_type):
        """Parse string vectors into tensors, stacking them if requested."""
        parsed = [
            torch.tensor([float(token) for token in vector_str.split()])
            for vector_str in vector_list
        ]
        if vector_type == torch.Tensor:
            return torch.stack(parsed)
        return parsed

    column = feature[NAME]
    inputs[column] = transform_vector_list(inputs[column], vector_type)

    preproc_inputs = script_module.preprocessor_forward(inputs)

    for proc_column, expected_values in preproc_inputs_expected.dataset.items():
        # Strip the proc suffix: everything from the last underscore onward.
        suffix_start = proc_column.rfind("_")
        feature_name = proc_column[:suffix_start]

        # Features the scripted preprocessor did not return are not compared.
        if feature_name in preproc_inputs.keys():
            actual_values = preproc_inputs[feature_name]
            assert utils.is_all_close(
                actual_values, expected_values
            ), f"feature: {feature_name}"