def run_api_experiment(input_features, output_features, data_csv):
    """Train and predict twice with one model: once from a CSV path, once from a DataFrame.

    :param input_features: input schema
    :param output_features: output schema
    :param data_csv: path to data
    :return: None
    """
    definition_str = model_definition_template.substitute(
        input_name=input_features, output_name=output_features
    )
    model = LudwigModel(yaml.safe_load(definition_str))

    # Flags shared by both training passes: keep intermediate artifacts off disk.
    skip_flags = dict(
        skip_save_processed_input=True,
        skip_save_progress=True,
        skip_save_unprocessed_output=True,
    )

    # Pass 1: train/predict directly from the CSV file on disk.
    model.train(data_csv=data_csv, **skip_flags)
    model.predict(data_csv=data_csv)

    # Pass 2: same data, loaded into a DataFrame first.
    frame = read_csv(data_csv)
    model.train(data_df=frame, **skip_flags)
    model.predict(data_df=frame)
def predict_with_backend(tmpdir, config, data_csv_path, backend, patch_args=None):
    """Train with the given backend, reload the saved model, and return its predictions.

    If ``patch_args`` is provided, prediction runs inside ``mock.patch(*patch_args)``.

    :return: (predictions DataFrame, reloaded LudwigModel)
    """
    with init_backend(backend):
        if backend == "ray":
            # Swap in the full ray backend config with a dask processor.
            backend = RAY_BACKEND_CONFIG
            backend["processor"]["type"] = "dask"

        ludwig_model = LudwigModel(config, backend=backend)
        _, _, output_directory = ludwig_model.train(
            dataset=data_csv_path,
            output_directory=os.path.join(tmpdir, "output"),
        )
        # Reload from disk so the metadata JSON round-trip is also exercised.
        ludwig_model = LudwigModel.load(os.path.join(output_directory, "model"))

        if patch_args is None:
            preds_df, _ = ludwig_model.predict(dataset=data_csv_path)
        else:
            with mock.patch(*patch_args):
                preds_df, _ = ludwig_model.predict(dataset=data_csv_path)

    return preds_df, ludwig_model
def train_with_backend(backend, config, dataset=None, training_set=None, validation_set=None, test_set=None):
    """Train a model on the given backend, run prediction, and return model weights.

    :param backend: backend (name or config) to train with
    :param config: Ludwig model config
    :param dataset: combined dataset (mutually exclusive with the split sets)
    :param training_set: training split (used when dataset is None)
    :param validation_set: validation split
    :param test_set: test split
    :return: the trained model's weights
    """
    model = LudwigModel(config, backend=backend)
    output_dir = None
    try:
        _, _, output_dir = model.train(
            dataset=dataset,
            training_set=training_set,
            validation_set=validation_set,
            test_set=test_set,
            skip_save_processed_input=True,
            skip_save_progress=True,
            skip_save_unprocessed_output=True
        )

        if dataset is None:
            dataset = training_set

        import dask.dataframe as dd
        if isinstance(dataset, dd.DataFrame):
            # For now, prediction must be done on Pandas DataFrame
            dataset = dataset.compute()
        model.predict(dataset=dataset)

        return model.model.get_weights()
    finally:
        # Remove results/intermediate data saved to disk.
        # BUG FIX: if train() raised before returning, output_dir is still None and
        # shutil.rmtree(None) raises TypeError even with ignore_errors=True,
        # masking the original training exception — so guard for None first.
        if output_dir is not None:
            shutil.rmtree(output_dir, ignore_errors=True)
def _run_test(input_features=None, output_features=None, combiner=None):
    """Train and predict a small model, supplying default features/combiner when omitted."""
    with tempfile.TemporaryDirectory() as tmpdir:
        # Fall back to a minimal schema for any argument not provided.
        if input_features is None:
            input_features = [
                sequence_feature(reduce_output="sum"),
                number_feature(),
            ]
        if output_features is None:
            output_features = [category_feature(vocab_size=2, reduce_input="sum")]
        if combiner is None:
            combiner = {"type": "concat"}

        data_csv = generate_data(
            input_features, output_features, os.path.join(tmpdir, "training.csv")
        )

        config = {
            "input_features": input_features,
            "output_features": output_features,
            "combiner": combiner,
            TRAINER: {"epochs": 2},
        }

        model = LudwigModel(config, backend=LocalTestBackend())
        _, _, output_directory = model.train(dataset=data_csv, output_directory=tmpdir)
        model.predict(dataset=data_csv, output_directory=output_directory)
def test_api_training_set(csv_filename):
    """Train with explicit train/val/test splits, predict, then retrain to hit the HDF5 cache."""
    with tempfile.TemporaryDirectory() as tmpdir:
        input_features = [sequence_feature(reduce_output='sum')]
        output_features = [category_feature(vocab_size=2, reduce_input='sum')]

        data_csv = generate_data(input_features, output_features, csv_filename)
        val_csv = shutil.copyfile(data_csv, os.path.join(tmpdir, 'validation.csv'))
        test_csv = shutil.copyfile(data_csv, os.path.join(tmpdir, 'test.csv'))

        model = LudwigModel({
            'input_features': input_features,
            'output_features': output_features,
            'combiner': {'type': 'concat', 'fc_size': 14},
        })
        model.train(training_set=data_csv, validation_set=val_csv, test_set=test_csv)
        model.predict(dataset=test_csv)

        # The second training run should reuse the HDF5 cache built above.
        model.train(training_set=data_csv, validation_set=val_csv, test_set=test_csv)
def run_api_experiment(input_features, output_features, dataset, **kwargs):
    """Train, predict, then reload the saved model and check its weights broadcast cleanly."""
    config = {
        'input_features': input_features,
        'output_features': output_features,
        'combiner': {'type': 'concat', 'fc_size': 14},
        'training': {'epochs': 2}
    }

    model = LudwigModel(config)
    output_dir = None
    try:
        # Training with csv
        _, _, output_dir = model.train(dataset=dataset, **kwargs)
        model.predict(dataset=dataset)

        # Attempt loading saved model, should broadcast successfully
        model_dir = os.path.join(output_dir, 'model') if output_dir else None
        loaded_model = LudwigModel.load(model_dir)

        # Model loading should broadcast weights from coordinator
        local_weights = loaded_model.model.get_weights()
        broadcast_weights = hvd.broadcast_object(local_weights)
        for local_w, bcast_w in zip(local_weights, broadcast_weights):
            assert np.allclose(local_w, bcast_w)
    finally:
        if output_dir:
            shutil.rmtree(output_dir, ignore_errors=True)
def test_api_training_set(csv_filename):
    """Train with explicit splits, predict on the test split, then retrain via the HDF5 cache."""
    with tempfile.TemporaryDirectory() as tmpdir:
        input_features = [sequence_feature(reduce_output="sum")]
        output_features = [category_feature(vocab_size=5, reduce_input="sum")]

        data_csv = generate_data(input_features, output_features, csv_filename)
        val_csv = shutil.copyfile(data_csv, os.path.join(tmpdir, "validation.csv"))
        test_csv = shutil.copyfile(data_csv, os.path.join(tmpdir, "test.csv"))

        model = LudwigModel({
            "input_features": input_features,
            "output_features": output_features,
            "combiner": {"type": "concat", "fc_size": 14},
        })
        model.train(training_set=data_csv, validation_set=val_csv, test_set=test_csv)
        model.predict(dataset=test_csv)

        # The second training run reuses the HDF5 cache produced by the first.
        model.train(training_set=data_csv, validation_set=val_csv, test_set=test_csv)
def train_model(input_features, output_features, data_csv):
    """Train a small concat model on the CSV and predict with it.

    :param input_features: input schema
    :param output_features: output schema
    :param data_csv: path to data
    :return: (trained model, output directory)
    """
    model = LudwigModel({
        'input_features': input_features,
        'output_features': output_features,
        'combiner': {'type': 'concat', 'fc_size': 14},
        'training': {'epochs': 2},
    })
    _, _, output_dir = model.train(
        dataset=data_csv,
        skip_save_processed_input=True,
        skip_save_progress=True,
        skip_save_unprocessed_output=True,
    )
    model.predict(dataset=data_csv, output_directory=output_dir)
    return model, output_dir
def test_missing_value_prediction(csv_filename):
    """Missing category inputs at predict time are imputed with the training-set mode."""
    random.seed(1)
    np.random.seed(1)
    with tempfile.TemporaryDirectory() as tmpdir:
        input_features = [
            category_feature(
                vocab_size=2,
                reduce_input="sum",
                preprocessing=dict(missing_value_strategy="fill_with_mode"),
            )
        ]
        output_features = [binary_feature()]

        dataset = pd.read_csv(generate_data(input_features, output_features, csv_filename))

        model = LudwigModel({
            "input_features": input_features,
            "output_features": output_features,
            "combiner": {"type": "concat", "fc_size": 14},
        })
        _, _, output_dir = model.train(dataset=dataset, output_directory=tmpdir)

        # Null out the input column; prediction should fill each missing value
        # with the mode computed from the training set.
        dataset[input_features[0]["name"]] = None
        model.predict(dataset=dataset)

        # The same behavior should hold after reloading the model from disk.
        model = LudwigModel.load(os.path.join(output_dir, "model"))
        model.predict(dataset=dataset)
def train_and_predict_model(input_features, output_features, data_csv, output_directory):
    """Train a small concat model and use it for prediction.

    :param input_features: input schema
    :param output_features: output schema
    :param data_csv: path to data
    :param output_directory: model output directory
    :return: the trained LudwigModel
    """
    config = {
        "input_features": input_features,
        "output_features": output_features,
        "combiner": {"type": "concat", "output_size": 14},
        TRAINER: {"epochs": 2},
    }
    model = LudwigModel(config, backend=LocalTestBackend())
    model.train(
        dataset=data_csv,
        output_directory=output_directory,
        skip_save_processed_input=True,
        skip_save_progress=True,
        skip_save_unprocessed_output=True,
    )
    model.predict(dataset=data_csv, output_directory=output_directory)
    return model
def test_missing_value_prediction(csv_filename):
    """Missing category inputs at predict time fall back to the mode from training."""
    with tempfile.TemporaryDirectory() as tmpdir:
        input_features = [
            category_feature(
                vocab_size=2,
                reduce_input='sum',
                preprocessing=dict(missing_value_strategy='fill_with_mode'),
            )
        ]
        output_features = [binary_feature()]

        dataset = pd.read_csv(generate_data(input_features, output_features, csv_filename))

        model = LudwigModel({
            'input_features': input_features,
            'output_features': output_features,
            'combiner': {'type': 'concat', 'fc_size': 14},
        })
        _, _, output_dir = model.train(dataset=dataset, output_directory=tmpdir)

        # Null out the input column; prediction should impute the mode learned
        # from the training set.
        dataset[input_features[0]['name']] = None
        model.predict(dataset=dataset)

        # The same should hold for a model reloaded from disk.
        model = LudwigModel.load(os.path.join(output_dir, 'model'))
        model.predict(dataset=dataset)
def run_api_experiment(input_features, output_features, dataset, **kwargs):
    """Train, predict, reload the saved model, and check its state_dict broadcasts unchanged."""
    config = {
        "input_features": input_features,
        "output_features": output_features,
        "combiner": {"type": "concat", "fc_size": 14},
        "training": {"epochs": 2},
    }

    model = LudwigModel(config)
    output_dir = None
    try:
        # Training with csv
        _, _, output_dir = model.train(dataset=dataset, **kwargs)
        model.predict(dataset=dataset)

        # Attempt loading saved model, should broadcast successfully
        model_dir = os.path.join(output_dir, "model") if output_dir else None
        loaded_model = LudwigModel.load(model_dir)

        # Model loading should broadcast weights from coordinator
        local_state = loaded_model.model.state_dict()
        broadcast_state = hvd.broadcast_object(local_state)
        for local_t, bcast_t in zip(local_state.values(), broadcast_state.values()):
            assert np.allclose(local_t, bcast_t)
    finally:
        if output_dir:
            shutil.rmtree(output_dir, ignore_errors=True)
def test_experiment_dataset_formats(data_format):
    """Smoke-test train/evaluate/predict across data set formats.

    The primary focus of this test is to determine if exceptions are
    raised for different data set formats and in_memory settings.
    """
    input_features = [
        numerical_feature(),
        category_feature()
    ]
    output_features = [
        category_feature(),
        numerical_feature()
    ]

    config = {
        'input_features': input_features,
        'output_features': output_features,
        'combiner': {'type': 'concat', 'fc_size': 14},
        'preprocessing': {},
        'training': {'epochs': 2}
    }

    # create temporary name for train and test data sets
    csv_filename = 'train_' + uuid.uuid4().hex[:10].upper() + '.csv'

    # setup training data format to test
    raw_data = generate_data(input_features, output_features, csv_filename)

    # BUG FIX: the cleanup call was previously unconditional at the end of the
    # function, so any exception above it leaked the generated train_*.csv
    # files into the working directory. Wrap in try/finally so temporary data
    # is always deleted.
    try:
        training_set_metadata = None
        if data_format == 'hdf5':
            # hdf5 format requires preprocessing first to build the cache file
            training_set, _, _, training_set_metadata = preprocess_for_training(
                config,
                dataset=raw_data
            )
            dataset_to_use = training_set.data_hdf5_fp
        else:
            dataset_to_use = create_data_set_to_use(data_format, raw_data)

        # define Ludwig model
        model = LudwigModel(config=config)
        model.train(
            dataset=dataset_to_use,
            training_set_metadata=training_set_metadata,
            random_seed=default_random_seed
        )

        # run functions with the specified data format
        model.evaluate(dataset=dataset_to_use)
        model.predict(dataset=dataset_to_use)
    finally:
        # Delete the temporary data created
        delete_temporary_data(csv_filename)
def run_api_experiment(input_features, output_features, data_csv):
    """
    Helper method to avoid code repetition in running an experiment
    :param input_features: input schema
    :param output_features: output schema
    :param data_csv: path to data
    :return: None
    """
    config = {
        "input_features": input_features,
        "output_features": output_features,
        "combiner": {"type": "concat", "fc_size": 14},
        "training": {"epochs": 2},
    }

    model = LudwigModel(config)
    output_dir = None

    try:
        # Training with csv
        _, _, output_dir = model.train(
            dataset=data_csv,
            skip_save_processed_input=True,
            skip_save_progress=True,
            skip_save_unprocessed_output=True,
        )
        model.predict(dataset=data_csv)

        model_dir = os.path.join(output_dir, "model")
        loaded_model = LudwigModel.load(model_dir)

        # Necessary before call to get_weights() to materialize the weights
        loaded_model.predict(dataset=data_csv)

        # Reloaded weights must match the in-memory model's weights.
        model_weights = model.model.get_weights()
        loaded_weights = loaded_model.model.get_weights()
        for model_weight, loaded_weight in zip(model_weights, loaded_weights):
            assert np.allclose(model_weight, loaded_weight)
    finally:
        # Remove results/intermediate data saved to disk.
        # BUG FIX: if train() raised before returning, output_dir is still None;
        # shutil.rmtree(None) raises TypeError even with ignore_errors=True,
        # which would mask the original exception — guard for None first.
        if output_dir is not None:
            shutil.rmtree(output_dir, ignore_errors=True)

    try:
        # Training with dataframe
        data_df = read_csv(data_csv)
        _, _, output_dir = model.train(
            dataset=data_df,
            skip_save_processed_input=True,
            skip_save_progress=True,
            skip_save_unprocessed_output=True,
        )
        model.predict(dataset=data_df)
    finally:
        # Same None guard as above (output_dir may hold the already-removed
        # first-run path if train() raised, which ignore_errors handles).
        if output_dir is not None:
            shutil.rmtree(output_dir, ignore_errors=True)
def run(csv_filename):
    """Exercise the comet contrib integration end-to-end on a tiny image model.

    Verifies that the Comet contrib instance is registered, that its
    train_init/train_model hooks fire during training, and that asset data is
    logged to the (patched) comet_ml experiment.
    """
    # Check that comet has been imported successfully as a contrib package
    contrib_instances = ludwig.contrib.contrib_registry["instances"]
    assert len(contrib_instances) == 1

    comet_instance = contrib_instances[0]
    assert isinstance(comet_instance, Comet)

    # Image Inputs
    image_dest_folder = os.path.join(os.getcwd(), 'generated_images')

    # Inputs & Outputs
    input_features = [image_feature(folder=image_dest_folder)]
    output_features = [category_feature()]
    data_csv = generate_data(input_features, output_features, csv_filename)

    config = {
        'input_features': input_features,
        'output_features': output_features,
        'combiner': {'type': 'concat', 'fc_size': 14},
        'training': {'epochs': 2}
    }

    model = LudwigModel(config)
    output_dir = None

    # Wrap these methods so we can check that they were called
    comet_instance.train_init = Mock(side_effect=comet_instance.train_init)
    comet_instance.train_model = Mock(side_effect=comet_instance.train_model)

    with patch('comet_ml.Experiment.log_asset_data') as mock_log_asset_data:
        try:
            # Training with csv
            _, _, output_dir = model.train(dataset=data_csv)
            model.predict(dataset=data_csv)
        finally:
            if output_dir:
                shutil.rmtree(output_dir, ignore_errors=True)

    # Verify that the experiment was created successfully
    assert comet_instance.cometml_experiment is not None

    # Check that these methods were called at least once
    comet_instance.train_init.assert_called()
    comet_instance.train_model.assert_called()

    # Check that we ran `train_model`, which calls into `log_assert_data`, successfully
    mock_log_asset_data.assert_called()
def test_remote_training_set(tmpdir, fs_protocol):
    """Train, predict, and resume with data/config/outputs addressed through a remote fs protocol."""
    with tempfile.TemporaryDirectory() as outdir:
        output_directory = f"{fs_protocol}://{outdir}"

        input_features = [sequence_feature(reduce_output="sum")]
        output_features = [category_feature(vocab_size=2, reduce_input="sum")]

        data_csv = generate_data(
            input_features, output_features, os.path.join(tmpdir, "training.csv")
        )
        val_csv = shutil.copyfile(data_csv, os.path.join(tmpdir, "validation.csv"))
        test_csv = shutil.copyfile(data_csv, os.path.join(tmpdir, "test.csv"))

        # Address every dataset through the remote protocol.
        data_csv = f"{fs_protocol}://{os.path.abspath(data_csv)}"
        val_csv = f"{fs_protocol}://{os.path.abspath(val_csv)}"
        test_csv = f"{fs_protocol}://{os.path.abspath(test_csv)}"

        config = {
            "input_features": input_features,
            "output_features": output_features,
            "combiner": {"type": "concat", "fc_size": 14},
            "training": {"epochs": 2},
        }
        config_path = os.path.join(tmpdir, "config.yaml")
        with open(config_path, "w") as f:
            yaml.dump(config, f)
        config_path = f"{fs_protocol}://{config_path}"

        backend = initialize_backend({"type": "local"})
        model = LudwigModel(config_path, backend=backend)
        _, _, output_directory = model.train(
            training_set=data_csv,
            validation_set=val_csv,
            test_set=test_csv,
            output_directory=output_directory,
        )
        model.predict(dataset=test_csv, output_directory=output_directory)

        # Train again, this time the cache will be used.
        # Resume from the remote output directory.
        model.train(
            training_set=data_csv,
            validation_set=val_csv,
            test_set=test_csv,
            model_resume_path=output_directory,
        )
def test_remote_training_set(tmpdir, fs_protocol, cache_format):
    """Train, predict, and resume through a remote fs protocol with the given cache format."""
    with tempfile.TemporaryDirectory() as outdir:
        output_directory = f'{fs_protocol}://{outdir}'

        input_features = [sequence_feature(reduce_output='sum')]
        output_features = [category_feature(vocab_size=2, reduce_input='sum')]

        data_csv = generate_data(
            input_features, output_features, os.path.join(tmpdir, 'training.csv')
        )
        val_csv = shutil.copyfile(data_csv, os.path.join(tmpdir, 'validation.csv'))
        test_csv = shutil.copyfile(data_csv, os.path.join(tmpdir, 'test.csv'))

        # Address every dataset through the remote protocol.
        data_csv = f'{fs_protocol}://{os.path.abspath(data_csv)}'
        val_csv = f'{fs_protocol}://{os.path.abspath(val_csv)}'
        test_csv = f'{fs_protocol}://{os.path.abspath(test_csv)}'

        config = {
            'input_features': input_features,
            'output_features': output_features,
            'combiner': {'type': 'concat', 'fc_size': 14},
            'training': {'epochs': 2},
        }
        config_path = os.path.join(tmpdir, 'config.yaml')
        with open(config_path, 'w') as f:
            yaml.dump(config, f)
        config_path = f'{fs_protocol}://{config_path}'

        backend = initialize_backend({'type': 'local', 'cache_format': cache_format})
        model = LudwigModel(config_path, backend=backend)
        _, _, output_directory = model.train(
            training_set=data_csv,
            validation_set=val_csv,
            test_set=test_csv,
            output_directory=output_directory
        )
        model.predict(dataset=test_csv, output_directory=output_directory)

        # Train again, this time the cache will be used.
        # Resume from the remote output directory.
        model.train(
            training_set=data_csv,
            validation_set=val_csv,
            test_set=test_csv,
            model_resume_path=output_directory,
        )
def test_api_train_online(csv_filename):
    """Run two passes of online training, then predict."""
    input_features = [sequence_feature(reduce_output="sum")]
    output_features = [category_feature(vocab_size=5, reduce_input="sum")]
    data_csv = generate_data(input_features, output_features, csv_filename)

    model = LudwigModel({
        "input_features": input_features,
        "output_features": output_features,
        "combiner": {"type": "concat", "output_size": 14},
    })

    for _ in range(2):
        model.train_online(dataset=data_csv)
    model.predict(dataset=data_csv)
def test_api_train_online(csv_filename):
    """Online training over two passes, followed by prediction."""
    input_features = [sequence_feature(reduce_output='sum')]
    output_features = [category_feature(vocab_size=2, reduce_input='sum')]
    data_csv = generate_data(input_features, output_features, csv_filename)

    model = LudwigModel({
        'input_features': input_features,
        'output_features': output_features,
        'combiner': {'type': 'concat', 'fc_size': 14},
    })
    for _ in range(2):
        model.train_online(dataset=data_csv)
    model.predict(dataset=data_csv)
def test_api_save_torchscript(tmpdir):
    """Tests successful saving and loading of model in TorchScript format."""
    input_features = [category_feature(vocab_size=5)]
    output_features = [category_feature(name="class", vocab_size=5, reduce_input="sum")]

    data_csv = generate_data(input_features, output_features, os.path.join(tmpdir, "dataset.csv"))
    val_csv = shutil.copyfile(data_csv, os.path.join(tmpdir, "validation.csv"))
    test_csv = shutil.copyfile(data_csv, os.path.join(tmpdir, "test.csv"))

    model = LudwigModel({
        "input_features": input_features,
        "output_features": output_features,
        "combiner": {"type": "concat", "output_size": 14},
    })
    _, _, output_dir = model.train(
        training_set=data_csv, validation_set=val_csv, test_set=test_csv, output_directory=tmpdir
    )

    # Baseline predictions from the in-memory model.
    test_df = pd.read_csv(test_csv)
    output_df_expected, _ = model.predict(test_df, return_type=pd.DataFrame)

    # Export to TorchScript and predict through the inference module.
    save_path = os.path.join(output_dir, "model")
    os.makedirs(save_path, exist_ok=True)
    model.save_torchscript(save_path)
    inference_module = InferenceModule.from_directory(save_path)
    output_df, _ = inference_module.predict(test_df, return_type=pd.DataFrame)

    # TorchScript predictions must match the in-memory model column-for-column.
    for col in output_df.columns:
        assert output_df[col].equals(output_df_expected[col])
def run(csv_filename):
    """Exercise the CometCallback end-to-end on a tiny image model.

    Verifies the on_train_init/on_train_start hooks fire during training and
    that asset data is logged to the (patched) comet_ml experiment.
    """
    # Image Inputs
    image_dest_folder = os.path.join(os.getcwd(), "generated_images")

    # Inputs & Outputs
    input_features = [image_feature(folder=image_dest_folder)]
    output_features = [category_feature()]
    data_csv = generate_data(input_features, output_features, csv_filename)

    config = {
        "input_features": input_features,
        "output_features": output_features,
        "combiner": {"type": "concat", "fc_size": 14},
        "training": {"epochs": 2},
    }

    callback = CometCallback()
    model = LudwigModel(config, callbacks=[callback])
    output_dir = None

    # Wrap these methods so we can check that they were called
    callback.on_train_init = Mock(side_effect=callback.on_train_init)
    callback.on_train_start = Mock(side_effect=callback.on_train_start)

    with patch("comet_ml.Experiment.log_asset_data") as mock_log_asset_data:
        try:
            # Training with csv
            _, _, output_dir = model.train(dataset=data_csv)
            model.predict(dataset=data_csv)
        finally:
            if output_dir:
                shutil.rmtree(output_dir, ignore_errors=True)

    # Verify that the experiment was created successfully
    assert callback.cometml_experiment is not None

    # Check that these methods were called at least once
    callback.on_train_init.assert_called()
    callback.on_train_start.assert_called()

    # Check that we ran `train_model`, which calls into `log_assert_data`, successfully
    mock_log_asset_data.assert_called()
def test_experiment_dataset_formats(data_format, csv_filename):
    """Ensure train/evaluate/predict accept each supported dataset format.

    The primary focus is to determine if exceptions are raised for different
    data set formats and in_memory settings.
    """
    input_features = [number_feature(), category_feature()]
    output_features = [category_feature(), number_feature()]

    config = {
        "input_features": input_features,
        "output_features": output_features,
        "combiner": {"type": "concat", "output_size": 14},
        "preprocessing": {},
        TRAINER: {"epochs": 2},
    }

    # setup training data format to test
    raw_data = generate_data(input_features, output_features, csv_filename)

    if data_format == "hdf5":
        # hdf5 requires preprocessing first to produce the cache file
        training_set, _, _, training_set_metadata = preprocess_for_training(
            config, dataset=raw_data
        )
        dataset_to_use = training_set.data_hdf5_fp
    else:
        training_set_metadata = None
        dataset_to_use = create_data_set_to_use(data_format, raw_data)

    model = LudwigModel(config=config)
    model.train(
        dataset=dataset_to_use,
        training_set_metadata=training_set_metadata,
        random_seed=default_random_seed,
    )

    # run functions with the specified data format
    model.evaluate(dataset=dataset_to_use)
    model.predict(dataset=dataset_to_use)
def train_model(input_features, output_features, data_csv):
    """Train/predict from a CSV path, then again from a DataFrame, and return the model.

    :param input_features: input schema
    :param output_features: output schema
    :param data_csv: path to data
    :return: the trained LudwigModel
    """
    model = LudwigModel({
        'input_features': input_features,
        'output_features': output_features,
        'combiner': {'type': 'concat', 'fc_size': 14},
        'training': {'epochs': 2},
    })

    # Flags shared by both training passes.
    skip_flags = dict(
        skip_save_processed_input=True,
        skip_save_progress=True,
        skip_save_unprocessed_output=True,
    )

    # Round 1: train/predict straight from the CSV file.
    model.train(data_csv=data_csv, **skip_flags)
    model.predict(data_csv=data_csv)

    # Remove results/intermediate data saved to disk
    shutil.rmtree(model.exp_dir_name, ignore_errors=True)

    # Round 2: same data via a DataFrame.
    data_df = read_csv(data_csv)
    model.train(data_df=data_df, **skip_flags)
    model.predict(data_df=data_df)
    return model
def train_model(input_features, output_features, data_csv):
    """Helper method to avoid code repetition in running an experiment.

    :param input_features: input schema
    :param output_features: output schema
    :param data_csv: path to data
    :return: (trained model, output directory)
    """
    model = LudwigModel(
        {
            "input_features": input_features,
            "output_features": output_features,
            "combiner": {"type": "concat", "fc_size": 14},
            "training": {"epochs": 2},
        },
        backend=LocalTestBackend(),
    )
    _, _, output_dir = model.train(
        dataset=data_csv,
        skip_save_processed_input=True,
        skip_save_progress=True,
        skip_save_unprocessed_output=True,
    )
    model.predict(dataset=data_csv, output_directory=output_dir)
    return model, output_dir
def run_api_experiment(input_features, output_features, data_csv, **kwargs):
    """Train and predict on the CSV, always cleaning up the experiment directory."""
    model = LudwigModel({
        'input_features': input_features,
        'output_features': output_features,
        'combiner': {'type': 'concat', 'fc_size': 14},
        'training': {'epochs': 2},
    })
    try:
        # Training with csv
        model.train(data_csv=data_csv, **kwargs)
        model.predict(data_csv=data_csv)
    finally:
        # Remove the experiment directory even when training/prediction fails.
        if model.exp_dir_name:
            shutil.rmtree(model.exp_dir_name, ignore_errors=True)
def test_model_weights_match_training(tmpdir, csv_filename):
    """Verify that the loss recomputed from predictions on the training set
    matches the last training-loss value recorded by the trainer, i.e. the
    model object holds the weights from the end of training."""
    np.random.seed(1)

    input_features = [number_feature()]
    output_features = [number_feature()]
    output_feature_name = output_features[0][NAME]

    # Generate test data
    data_csv_path = generate_data(
        input_features, output_features, os.path.join(tmpdir, csv_filename), num_examples=100
    )

    config = {
        "input_features": input_features,
        "output_features": output_features,
        "trainer": {"epochs": 5, "batch_size": 32},
    }

    model = LudwigModel(config=config, )

    # Fixed seed so the training trajectory (and final loss) is deterministic.
    training_stats, _, _ = model.train(training_set=data_csv_path, random_seed=1919)

    # generate predicitons from training data
    df = pd.read_csv(data_csv_path)
    predictions = model.predict(df)

    # compute loss on predictions from training data
    loss_function = MSELoss()
    loss = loss_function(
        torch.tensor(predictions[0][output_feature_name + "_predictions"].values),  # predictions
        torch.tensor(df[output_feature_name].values),  # target
    ).type(torch.float32)

    # get last loss value from training
    last_training_loss = torch.tensor(training_stats[TRAINING][output_feature_name][LOSS][-1])

    # loss from predictions should match last loss value recorded during training
    assert torch.isclose(loss, last_training_loss), (
        "Model predictions on training set did not generate same loss value as in training. "
        "Need to confirm that weights were correctly captured in model."
    )
def run_test_gbm_number(tmpdir, backend_config):
    """Test that the GBM model can train and predict a numerical output (regression)."""
    # Given a dataset with two input features and a single number output feature,
    input_features = [number_feature(), category_feature(reduce_output="sum")]
    output_feature = number_feature()
    output_features = [output_feature]

    csv_filename = os.path.join(tmpdir, "training.csv")
    dataset_filename = generate_data(input_features, output_features, csv_filename, num_examples=100)

    config = {
        MODEL_TYPE: "gbm",
        "input_features": input_features,
        "output_features": output_features,
        TRAINER: {"num_boost_round": 2},
    }

    # When I train a model on the dataset, load the model from the output directory, and
    # predict on the dataset
    model = LudwigModel(config, backend=backend_config)
    model.train(
        dataset=dataset_filename,
        output_directory=tmpdir,
        skip_save_processed_input=True,
        skip_save_progress=True,
        skip_save_unprocessed_output=True,
        skip_save_log=True,
    )
    # BUG FIX: LudwigModel.load is a classmethod that RETURNS the loaded model;
    # the original code discarded its result and kept predicting with the
    # in-memory model, so the saved artifacts were never actually exercised.
    model = LudwigModel.load(
        os.path.join(tmpdir, "api_experiment_run", "model"), backend=backend_config
    )

    preds, _ = model.predict(
        dataset=dataset_filename,
        output_directory=os.path.join(tmpdir, "predictions"),
    )

    # Then the predictions should be included in the output
    pred_col = preds[output_feature["name"] + "_predictions"]
    if backend_config["type"] == "ray":
        # Ray predictions come back as a dask series; materialize before checks.
        pred_col = pred_col.compute()
    assert pred_col.dtype == float
def test_whylogs_callback_local(tmpdir):
    """WhyLogs callback writes profiles locally for both training and prediction."""
    epochs = 2
    batch_size = 8
    num_examples = 32

    input_features = [sequence_feature(reduce_output="sum")]
    output_features = [category_feature(vocab_size=2, reduce_input="sum")]

    config = {
        "input_features": input_features,
        "output_features": output_features,
        "combiner": {"type": "concat", "output_size": 14},
        TRAINER: {"epochs": epochs, "batch_size": batch_size},
    }

    data_csv = generate_data(
        input_features,
        output_features,
        os.path.join(tmpdir, "train.csv"),
        num_examples=num_examples,
    )
    val_csv = shutil.copyfile(data_csv, os.path.join(tmpdir, "validation.csv"))
    test_csv = shutil.copyfile(data_csv, os.path.join(tmpdir, "test.csv"))

    exp_name = "whylogs_test_local"
    callback = WhyLogsCallback()

    model = LudwigModel(config, callbacks=[callback])
    model.train(
        training_set=data_csv,
        validation_set=val_csv,
        test_set=test_csv,
        experiment_name=exp_name,
    )
    _, _ = model.predict(test_csv)

    # The callback writes its profiles under ./output in the working directory.
    local_training_output_dir = "output/training"
    local_prediction_output_dir = "output/prediction"
    assert os.path.isdir(local_training_output_dir) is True
    assert os.path.isdir(local_prediction_output_dir) is True
def train_with_backend(
    backend,
    config,
    dataset=None,
    training_set=None,
    validation_set=None,
    test_set=None,
    predict=True,
    evaluate=True,
    callbacks=None,
):
    """Train a model on the given backend, optionally predict/evaluate, and return it.

    :param backend: backend (name or config) to train with
    :param config: Ludwig model config
    :param dataset: combined dataset (mutually exclusive with the split sets)
    :param training_set: training split (used when dataset is None)
    :param validation_set: validation split
    :param test_set: test split
    :param predict: whether to run prediction after training
    :param evaluate: whether to run evaluation after training
    :param callbacks: optional list of Ludwig callbacks
    :return: the trained LudwigModel
    """
    model = LudwigModel(config, backend=backend, callbacks=callbacks)
    output_dir = None
    try:
        _, _, output_dir = model.train(
            dataset=dataset,
            training_set=training_set,
            validation_set=validation_set,
            test_set=test_set,
            skip_save_processed_input=True,
            skip_save_progress=True,
            skip_save_unprocessed_output=True,
            skip_save_log=True,
        )

        if dataset is None:
            dataset = training_set

        if predict:
            preds, _ = model.predict(dataset=dataset)
            assert preds is not None
        if evaluate:
            _, eval_preds, _ = model.evaluate(dataset=dataset)
            assert eval_preds is not None

        return model
    finally:
        # Remove results/intermediate data saved to disk.
        # BUG FIX: if train() raised before returning, output_dir is still None;
        # shutil.rmtree(None) raises TypeError even with ignore_errors=True,
        # masking the original exception — guard for None first.
        if output_dir is not None:
            shutil.rmtree(output_dir, ignore_errors=True)
def run_test_gbm_category(tmpdir, backend_config):
    """Test that the GBM model can train and predict a categorical output (multiclass classification)."""
    input_features = [number_feature(), category_feature(reduce_output="sum")]
    vocab_size = 3
    output_feature = category_feature(vocab_size=vocab_size)
    output_features = [output_feature]

    csv_filename = os.path.join(tmpdir, "training.csv")
    dataset_filename = generate_data(input_features, output_features, csv_filename, num_examples=100)

    config = {
        MODEL_TYPE: "gbm",
        "input_features": input_features,
        "output_features": output_features,
        TRAINER: {"num_boost_round": 2},
    }

    model = LudwigModel(config, backend=backend_config)
    _, _, output_directory = model.train(
        dataset=dataset_filename,
        output_directory=tmpdir,
        skip_save_processed_input=True,
        skip_save_progress=True,
        skip_save_unprocessed_output=True,
        skip_save_log=True,
    )
    # BUG FIX: LudwigModel.load is a classmethod that RETURNS the loaded model;
    # the original code discarded its result and kept predicting with the
    # in-memory model, so the saved artifacts were never actually exercised.
    model = LudwigModel.load(
        os.path.join(tmpdir, "api_experiment_run", "model"), backend=backend_config
    )

    preds, _ = model.predict(dataset=dataset_filename, output_directory=output_directory)

    prob_col = preds[output_feature["name"] + "_probabilities"]
    if backend_config["type"] == "ray":
        # Ray predictions come back as a dask series; materialize before checks.
        prob_col = prob_col.compute()
    # One probability per vocab entry plus the <UNK> slot; probabilities sum to 1.
    assert len(prob_col.iloc[0]) == (vocab_size + 1)
    assert prob_col.apply(sum).mean() == pytest.approx(1.0)