def compare_perf():
    """Compare the evaluation performance of two trained models.

    Loads the models saved under ``results/rt_run/model`` and
    ``results/rt_zscore_run/model`` inside the Ludwig output directory,
    evaluates each one on ``rotten_tomatoes_test.csv`` and hands both sets of
    evaluation statistics to ``compare_performance``, requesting PNG output
    in that same output directory. The output directory is printed at the end.
    """
    test_file = SCRIPT_DIR / 'rotten_tomatoes_test.csv'
    output_dir = get_ludwig_output_dir()
    model_name = "run"
    model_names = ["rt", "rt_zscore"]

    list_of_eval_stats = []
    for experiment_name in model_names:
        experiment_dir = experiment_name + '_' + model_name
        model_dir = output_dir / 'results' / experiment_dir / 'model'
        model = LudwigModel.load(model_dir, backend='local')
        # evaluate() yields three values; only the statistics are needed here.
        eval_stats, _, _ = model.evaluate(dataset=str(test_file))
        list_of_eval_stats.append(eval_stats)

    compare_performance(
        list_of_eval_stats,
        "recommended",
        model_names=model_names,
        output_directory=output_dir,
        file_format="png",
    )
    print(f'{output_dir=}')
def predict(self, mode='predict', ignore_columns=None):
    """Run the saved Ludwig model on the transaction's data.

    :param mode: forwarded to ``self._create_ludwig_dataframe`` when building
        the dataframe to predict on.
    :param ignore_columns: names of columns whose values are replaced with
        ``None`` before predicting. ``None`` (the default) blanks nothing.
    :return: the predictions, with the ``_predictions`` suffix removed from
        every column name.
    """
    ignore_columns = [] if ignore_columns is None else ignore_columns

    predict_dataframe, _ = self._create_ludwig_dataframe(mode)
    # The definition stored on the transaction replaces the one returned above.
    model_definition = self.transaction.hmd['ludwig_data']['model_definition']

    order_by = self.transaction.lmd['model_order_by']
    timeseries_cols = [] if order_by is None else [entry[0] for entry in order_by]

    if timeseries_cols:
        predict_dataframe, model_definition = self._translate_df_to_timeseries_format(
            predict_dataframe, model_definition, timeseries_cols)

    for ignore_col in ignore_columns:
        try:
            predict_dataframe[ignore_col] = [None] * len(predict_dataframe[ignore_col])
        except KeyError:
            # The column is not present under its own name; blank the
            # '_year' / '_month' / '_day' columns derived from it instead
            # (presumably a date column that was split upstream - confirm).
            for date_appendage in ['_year', '_month', '_day']:
                part_col = ignore_col + date_appendage
                predict_dataframe[part_col] = [None] * len(predict_dataframe[part_col])

    # Load the model once, with Ludwig's output suppressed. An earlier extra
    # load outside this context created a second, never-used model instance.
    with disable_ludwig_output():
        model = LudwigModel.load(self.transaction.lmd['ludwig_data']['ludwig_save_path'])
        predictions = model.predict(data_df=predict_dataframe)

    renamed_columns = {
        col_name: col_name.replace('_predictions', '') for col_name in predictions
    }
    return predictions.rename(columns=renamed_columns)
def predict_test_with_ludwig(self):
    """Predict on ``dataset/test.csv`` with the saved CIFAR experiment model.

    The predictions are written to ``predicted.csv`` (with header, without
    index) and also printed.
    """
    model = LudwigModel.load("results/api_experiment_cifar/model")
    test_predictions = model.predict(data_csv="dataset/test.csv")
    test_predictions.to_csv(path_or_buf="predicted.csv", header=True, index=False)
    print(test_predictions)
def export_triton(model_path, output_path="model_repository", model_name="ludwig_model", model_version=1, **kwargs):
    """Exports a model in torchscript format with config for Triton serving.

    # Inputs

    :param model_path: (str) filepath to pre-trained model.
    :param output_path: (str, default: `'model_repository'`) directory to
        store the triton models.
    :param model_name: (str, default: `'ludwig_model'`) save triton under
        this name.
    :param model_version: (int, default: `1`) save triton under this version.
    :param kwargs: accepted and ignored.

    # Return

    :returns: (`None`)
    """
    banner = (
        f"Model path: {model_path}",
        f"Output path: {output_path}",
        f"Model name: {model_name}",
        f"Model version: {model_version}",
        "\n",
    )
    for message in banner:
        logger.info(message)

    loaded_model = LudwigModel.load(model_path)
    os.makedirs(output_path, exist_ok=True)
    utils_export_triton(loaded_model, output_path, model_name, model_version)

    logger.info(f"Saved to: {output_path}")
def predict(self, mode='predict', ignore_columns=None):
    """Run the saved Ludwig model on the transaction's data.

    :param mode: forwarded to ``self._create_ludwig_dataframe`` when building
        the dataframe to predict on.
    :param ignore_columns: names of columns whose values are replaced with
        ``None`` before predicting. ``None`` (the default) blanks nothing.
    :return: the predictions, with the ``_predictions`` suffix removed from
        every column name.
    """
    ignore_columns = [] if ignore_columns is None else ignore_columns

    predict_dataframe, _, timeseries_cols = self._create_ludwig_dataframe(mode)
    # The definition stored on the transaction replaces the one returned above.
    model_definition = self.transaction.hmd['ludwig_data']['model_definition']

    if len(timeseries_cols) > 0:
        predict_dataframe, model_definition = self._translate_df_to_timeseries_format(
            predict_dataframe, model_definition, timeseries_cols)

    for ignore_col in ignore_columns:
        try:
            predict_dataframe[ignore_col] = [None] * len(predict_dataframe[ignore_col])
        except KeyError:
            # The column is not present under its own name; blank the
            # '_year' / '_month' / '_day' columns derived from it instead
            # (presumably a date column that was split upstream - confirm).
            for date_appendage in ['_year', '_month', '_day']:
                part_col = ignore_col + date_appendage
                predict_dataframe[part_col] = [None] * len(predict_dataframe[part_col])

    with disable_console_output(True):
        model_dir = self.get_model_dir()
        model = LudwigModel.load(model_dir=model_dir)
        predictions = model.predict(data_df=predict_dataframe,
                                    gpus=self.get_useable_gpus())

    renamed_columns = {
        col_name: col_name.replace('_predictions', '') for col_name in predictions
    }
    return predictions.rename(columns=renamed_columns)
def run_api_experiment(input_features, output_features, dataset, **kwargs):
    """Train a small model, reload it from disk and compare broadcast weights.

    Builds a concat-combiner config trained for two epochs, trains and
    predicts on ``dataset`` (extra ``kwargs`` go to ``model.train``), loads
    the saved model back and asserts that its weights are close to the copy
    returned by ``hvd.broadcast_object``. The training output directory is
    removed afterwards.
    """
    config = dict(
        input_features=input_features,
        output_features=output_features,
        combiner={'type': 'concat', 'fc_size': 14},
        training={'epochs': 2},
    )
    model = LudwigModel(config)

    output_dir = None
    try:
        # Training with csv
        _, _, output_dir = model.train(dataset=dataset, **kwargs)
        model.predict(dataset=dataset)

        # Attempt loading saved model, should broadcast successfully
        model_dir = None
        if output_dir:
            model_dir = os.path.join(output_dir, 'model')
        loaded_model = LudwigModel.load(model_dir)

        # Model loading should broadcast weights from coordinator
        loaded_weights = loaded_model.model.get_weights()
        bcast_weights = hvd.broadcast_object(loaded_weights)
        for local_weight, received_weight in zip(loaded_weights, bcast_weights):
            assert np.allclose(local_weight, received_weight)
    finally:
        if output_dir:
            shutil.rmtree(output_dir, ignore_errors=True)
def run_api_experiment(input_features, output_features, dataset, **kwargs):
    """Train a small model, reload it from disk and compare broadcast state.

    Builds a concat-combiner config trained for two epochs, trains and
    predicts on ``dataset`` (extra ``kwargs`` go to ``model.train``), loads
    the saved model back and asserts that every entry of its ``state_dict``
    is close to the copy returned by ``hvd.broadcast_object``. The training
    output directory is removed afterwards.
    """
    config = dict(
        input_features=input_features,
        output_features=output_features,
        combiner={"type": "concat", "fc_size": 14},
        training={"epochs": 2},
    )
    model = LudwigModel(config)

    output_dir = None
    try:
        # Training with csv
        _, _, output_dir = model.train(dataset=dataset, **kwargs)
        model.predict(dataset=dataset)

        # Attempt loading saved model, should broadcast successfully
        model_dir = None
        if output_dir:
            model_dir = os.path.join(output_dir, "model")
        loaded_model = LudwigModel.load(model_dir)

        # Model loading should broadcast weights from coordinator
        loaded_state = loaded_model.model.state_dict()
        bcast_state = hvd.broadcast_object(loaded_state)
        state_pairs = zip(loaded_state.values(), bcast_state.values())
        for local_tensor, received_tensor in state_pairs:
            assert np.allclose(local_tensor, received_tensor)
    finally:
        if output_dir:
            shutil.rmtree(output_dir, ignore_errors=True)
def collect_weights(model_path: str, tensors: List[str], output_directory: str = "results", **kwargs) -> List[str]:
    """Loads a pretrained model and collects weights.

    # Inputs

    :param model_path: (str) filepath to pre-trained model.
    :param tensors: (List[str]) names of the tensors to collect weights from.
    :param output_directory: (str, default: `'results'`) the directory where
        collected weights will be stored. It is created if missing.
    :param kwargs: accepted and ignored.

    # Return

    :return: (List[str]) the value returned by `save_tensors` for the
        collected weights, i.e. the saved file paths.
    """
    for message in (f"Model path: {model_path}", f"Output path: {output_directory}", "\n"):
        logger.info(message)

    loaded_model = LudwigModel.load(model_path)

    # collect weights
    print_boxed("COLLECT WEIGHTS")
    weights_by_name = loaded_model.collect_weights(tensors)

    # saving
    os.makedirs(output_directory, exist_ok=True)
    saved_filenames = save_tensors(weights_by_name, output_directory)

    logger.info(f"Saved to: {output_directory}")
    return saved_filenames
def predict_with_backend(tmpdir, config, data_csv_path, backend, patch_args=None):
    """Train a model with the given backend, reload it and predict.

    :param tmpdir: directory under which the ``output`` training directory
        is created.
    :param config: Ludwig config used to build the model.
    :param data_csv_path: dataset used for both training and prediction.
    :param backend: backend passed to ``init_backend``; the string ``"ray"``
        is replaced by a copy of ``RAY_BACKEND_CONFIG`` whose processor type
        is set to ``"dask"``.
    :param patch_args: optional positional arguments for ``mock.patch``;
        when given, prediction runs inside that patch.
    :return: tuple ``(preds_df, ludwig_model)`` with the predictions and the
        model reloaded from disk.
    """
    import copy

    with init_backend(backend):
        if backend == "ray":
            # Work on a private copy: editing RAY_BACKEND_CONFIG in place
            # would leak the "dask" processor setting into every other user
            # of the shared module-level config.
            backend = copy.deepcopy(RAY_BACKEND_CONFIG)
            backend["processor"]["type"] = "dask"

        ludwig_model = LudwigModel(config, backend=backend)
        _, _, output_directory = ludwig_model.train(
            dataset=data_csv_path,
            output_directory=os.path.join(tmpdir, "output"),
        )

        # Check that metadata JSON saves and loads correctly
        ludwig_model = LudwigModel.load(os.path.join(output_directory, "model"))

        if patch_args is not None:
            with mock.patch(*patch_args):
                preds_df, _ = ludwig_model.predict(dataset=data_csv_path)
        else:
            preds_df, _ = ludwig_model.predict(dataset=data_csv_path)

        return preds_df, ludwig_model
def test_model_loaded_from_old_config_prediction_works(tmpdir):
    """A model archive saved by an older release still loads and predicts.

    Downloads and extracts the archived model into ``tmpdir``, loads it from
    ``old_model/model`` and checks the ``Survived`` prediction for a single
    example row.
    """
    # Titanic model based on 0.5.3.
    old_model_url = "https://predibase-public-us-west-2.s3.us-west-2.amazonaws.com/ludwig_unit_tests/old_model.zip"
    archive_path = wget.download(old_model_url, tmpdir)
    with zipfile.ZipFile(archive_path, "r") as archive:
        archive.extractall(tmpdir)

    passenger = {
        "PassengerId": 892,
        "Pclass": 3,
        "Name": "Kelly, Mr. James",
        "Sex": "male",
        "Age": 34.5,
        "SibSp": 0,
        "Parch": 0,
        "Ticket": "330911",
        "Fare": 7.8292,
        "Cabin": None,
        "Embarked": "Q",
    }
    test_set = pd.DataFrame(passenger, index=[0])

    model_dir = os.path.join(tmpdir, "old_model/model")
    ludwig_model = LudwigModel.load(model_dir)
    predictions, _ = ludwig_model.predict(dataset=test_set)

    survived = predictions.to_dict()["Survived_predictions"]
    assert survived == {0: False}
def test_missing_value_prediction(csv_filename):
    """Prediction works when the whole input column is missing.

    Trains on generated data with a category input that uses the
    ``fill_with_mode`` missing-value strategy, blanks that input column and
    predicts both with the in-memory model and with the model reloaded from
    the training output directory.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        mode_filled_input = category_feature(
            vocab_size=2,
            reduce_input='sum',
            preprocessing={'missing_value_strategy': 'fill_with_mode'},
        )
        input_features = [mode_filled_input]
        output_features = [binary_feature()]

        data_path = generate_data(input_features, output_features, csv_filename)
        dataset = pd.read_csv(data_path)

        config = dict(
            input_features=input_features,
            output_features=output_features,
            combiner={'type': 'concat', 'fc_size': 14},
        )
        model = LudwigModel(config)
        _, _, output_dir = model.train(dataset=dataset, output_directory=tmpdir)

        # Set the input column to None, we should be able to replace the
        # missing value with the mode from the training set
        input_column = input_features[0]['name']
        dataset[input_column] = None
        model.predict(dataset=dataset)

        model = LudwigModel.load(os.path.join(output_dir, 'model'))
        model.predict(dataset=dataset)
def export_neuropod(model_path, output_path="neuropod", model_name="neuropod", **kwargs):
    """Exports a model to Neuropod.

    # Inputs

    :param model_path: (str) filepath to pre-trained model.
    :param output_path: (str, default: `'neuropod'`) directory to store the
        neuropod model. It is created if missing.
    :param model_name: (str, default: `'neuropod'`) save neuropod under this
        name.
    :param kwargs: accepted and ignored.

    # Return

    :returns: (`None`)
    """
    for message in (f"Model path: {model_path}", f"Output path: {output_path}", "\n"):
        logger.info(message)

    loaded_model = LudwigModel.load(model_path)
    os.makedirs(output_path, exist_ok=True)
    utils_export_neuropod(loaded_model, output_path, model_name)

    logger.info(f"Saved to: {output_path}")
def export_torchscript(model_path: str, model_only: bool = False, output_path: str = "torchscript", device: Optional[str] = None, **kwargs) -> None:
    """Exports a model to torchscript.

    # Inputs

    :param model_path: (str) filepath to pre-trained model.
    :param model_only: (bool, default: `False`) If true, scripts and exports
        the model only.
    :param output_path: (str, default: `'torchscript'`) directory to store
        torchscript. It is created if missing.
    :param device: (Optional[str], default: `None`) forwarded unchanged to
        `model.save_torchscript`.
    :param kwargs: accepted and ignored.

    # Return

    :returns: (`None`)
    """
    banner = (
        f"Model path: {model_path}",
        f"Saving model only: {model_only}",
        f"Output path: {output_path}",
        "\n",
    )
    for message in banner:
        logger.info(message)

    loaded_model = LudwigModel.load(model_path)
    os.makedirs(output_path, exist_ok=True)
    loaded_model.save_torchscript(output_path, model_only=model_only, device=device)

    logger.info(f"Saved to: {output_path}")
def test_collect_weights(csv_filename):
    """Weights saved by ``collect_weights`` match the trained model's weights.

    Trains a model, checks that it and a copy reloaded from disk both expose
    three weight tensors, runs ``collect_weights`` into a temporary directory
    and compares every saved ``.npy`` file with the in-memory weight.
    The training output directory is removed afterwards.
    """
    output_dir = None
    try:
        # This will reset the layer numbering scheme TensorFlow uses.
        # Otherwise, when we load the model, its layer names will be appended
        # with "_1".
        tf.keras.backend.reset_uids()

        model, output_dir = _train(*_prepare_data(csv_filename))
        model_path = os.path.join(output_dir, 'model')

        # 1 for the encoder (embeddings),
        # 2 for the decoder classifier (w and b)
        weights = [tensor for _, tensor in model.model.collect_weights()]
        assert len(weights) == 3

        # Load model from disk to ensure correct weight names
        tf.keras.backend.reset_uids()
        model_loaded = LudwigModel.load(model_path)
        tensor_names = [tensor_name for tensor_name, _ in model_loaded.collect_weights()]
        assert len(tensor_names) == 3

        tf.keras.backend.reset_uids()
        with tempfile.TemporaryDirectory() as output_directory:
            filenames = collect_weights(model_path, tensor_names, output_directory)
            assert len(filenames) == 3

            for expected_weight, filename in zip(weights, filenames):
                saved_weight = np.load(filename)
                assert np.allclose(expected_weight.numpy(), saved_weight, rtol=1.e-4), filename
    finally:
        if output_dir:
            shutil.rmtree(output_dir, ignore_errors=True)
def test_collect_weights(tmpdir, csv_filename):
    """Weights saved by ``collect_weights`` match the trained model's weights.

    Trains a model, checks that it and a copy reloaded from disk both expose
    three weight tensors, runs ``collect_weights`` into ``tmpdir`` and
    compares every saved ``.npy`` file with the in-memory tensor.
    The training output directory is removed afterwards.
    """
    output_dir = None
    try:
        model, output_dir = _train(*_prepare_data(csv_filename))
        model_path = os.path.join(output_dir, "model")

        # 1 for the encoder (embeddings).
        # 2 for the decoder classifier (w and b).
        weights = [tensor for _, tensor in model.model.collect_weights()]
        assert len(weights) == 3

        # Load model from disk to ensure correct weight names
        model_loaded = LudwigModel.load(model_path)
        tensor_names = [tensor_name for tensor_name, _ in model_loaded.collect_weights()]
        assert len(tensor_names) == 3

        filenames = collect_weights(model_path, tensor_names, tmpdir)
        assert len(filenames) == 3

        for expected_weight, filename in zip(weights, filenames):
            saved_weight = torch.from_numpy(np.load(filename)).to(DEVICE)
            assert torch.allclose(expected_weight, saved_weight, rtol=1.0e-4), filename
    finally:
        if output_dir:
            shutil.rmtree(output_dir, ignore_errors=True)
def test_missing_value_prediction(csv_filename):
    """Prediction works when the whole input column is missing.

    Seeds ``random`` and ``numpy.random``, trains on generated data with a
    category input that uses the ``fill_with_mode`` missing-value strategy,
    blanks that input column and predicts both with the in-memory model and
    with the model reloaded from the training output directory.
    """
    random.seed(1)
    np.random.seed(1)

    with tempfile.TemporaryDirectory() as tmpdir:
        mode_filled_input = category_feature(
            vocab_size=2,
            reduce_input="sum",
            preprocessing={"missing_value_strategy": "fill_with_mode"},
        )
        input_features = [mode_filled_input]
        output_features = [binary_feature()]

        data_path = generate_data(input_features, output_features, csv_filename)
        dataset = pd.read_csv(data_path)

        config = dict(
            input_features=input_features,
            output_features=output_features,
            combiner={"type": "concat", "fc_size": 14},
        )
        model = LudwigModel(config)
        _, _, output_dir = model.train(dataset=dataset, output_directory=tmpdir)

        # Set the input column to None, we should be able to replace the
        # missing value with the mode from the training set
        input_column = input_features[0]["name"]
        dataset[input_column] = None
        model.predict(dataset=dataset)

        model = LudwigModel.load(os.path.join(output_dir, "model"))
        model.predict(dataset=dataset)
def predict_cli(model_path, dataset=None, data_format=None, batch_size=128, skip_save_unprocessed_output=False, skip_save_predictions=False, output_directory='results', gpus=None, gpu_memory_limit=None, allow_parallel_threads=True, use_horovod=None, logging_level=logging.INFO, debug=False, **kwargs):
    """Load a saved model and run prediction on a dataset.

    ``logging_level``, ``use_horovod``, ``gpus``, ``gpu_memory_limit`` and
    ``allow_parallel_threads`` are forwarded to ``LudwigModel.load``; the
    remaining named arguments are forwarded to ``model.predict``, which is
    always called with ``return_type=dict``. Extra ``kwargs`` are accepted
    and ignored. Nothing is returned.
    """
    load_options = dict(
        logging_level=logging_level,
        use_horovod=use_horovod,
        gpus=gpus,
        gpu_memory_limit=gpu_memory_limit,
        allow_parallel_threads=allow_parallel_threads,
    )
    predict_options = dict(
        dataset=dataset,
        data_format=data_format,
        batch_size=batch_size,
        skip_save_unprocessed_output=skip_save_unprocessed_output,
        skip_save_predictions=skip_save_predictions,
        output_directory=output_directory,
        return_type=dict,
        debug=debug,
    )

    model = LudwigModel.load(model_path, **load_options)
    model.predict(**predict_options)
def test_tune_batch_size_and_lr(tmpdir):
    """"auto" batch sizes and learning rate are resolved by training.

    Starts from a config whose batch size, eval batch size and learning rate
    are all ``"auto"``, trains, and checks that concrete values replaced them
    both on the trained model and on the model reloaded from disk.
    """
    with tempfile.TemporaryDirectory() as outdir:
        input_features = [sequence_feature(reduce_output="sum")]
        output_features = [category_feature(vocab_size=2, reduce_input="sum")]

        csv_filename = os.path.join(tmpdir, "training.csv")
        data_csv = generate_data(input_features, output_features, csv_filename)
        val_csv = shutil.copyfile(data_csv, os.path.join(tmpdir, "validation.csv"))
        test_csv = shutil.copyfile(data_csv, os.path.join(tmpdir, "test.csv"))

        trainer_section = {
            "epochs": 2,
            "batch_size": "auto",
            "eval_batch_size": "auto",
            "learning_rate": "auto",
        }
        config = {
            "input_features": input_features,
            "output_features": output_features,
            "combiner": {"type": "concat", "output_size": 14},
            TRAINER: trainer_section,
        }

        model = LudwigModel(config, backend=LocalTestBackend())

        # check preconditions
        for auto_key in (BATCH_SIZE, EVAL_BATCH_SIZE, LEARNING_RATE):
            assert model.config[TRAINER][auto_key] == "auto"

        _, _, output_directory = model.train(
            training_set=data_csv,
            validation_set=val_csv,
            test_set=test_csv,
            output_directory=outdir,
        )

        def check_postconditions(model):
            trainer = model.config[TRAINER]

            # check batch size
            assert trainer[BATCH_SIZE] != "auto"
            assert trainer[BATCH_SIZE] > 1

            assert trainer[EVAL_BATCH_SIZE] != "auto"
            assert trainer[EVAL_BATCH_SIZE] > 1

            assert trainer[BATCH_SIZE] == trainer[EVAL_BATCH_SIZE]

            # check learning rate
            assert trainer[LEARNING_RATE] != "auto"
            assert trainer[LEARNING_RATE] > 0

        check_postconditions(model)

        model = LudwigModel.load(os.path.join(output_directory, "model"))

        # loaded model should retain the tuned params
        check_postconditions(model)
def collect_activations(model_path, layers, dataset, data_format=None, batch_size=128, output_directory='results', gpus=None, gpu_memory_limit=None, allow_parallel_threads=True, use_horovod=None, debug=False, **kwargs):
    """Uses the pretrained model to collect the tensors corresponding to a
    datapoint in the dataset. Saves the tensors to the output directory.

    :param model_path: Is the model from which the tensors will be collected
    :param layers: List of layer names we wish to collect the output from
    :param dataset: The data from which the tensors are collected; forwarded
        to `model.collect_activations`
    :param data_format: Forwarded to `model.collect_activations`
    :param batch_size: Batch size
    :param output_directory: Output directory; created if missing
    :param gpus: The total number of GPUs that the model intends to use
    :param gpu_memory_limit: (int: default: `None`) maximum memory in MB to
        allocate per GPU device.
    :param allow_parallel_threads: (bool, default: `True`) allow TensorFlow
        to use multithreading parallelism to improve performance at the cost
        of determinism.
    :param use_horovod: Forwarded to `LudwigModel.load`
    :param debug: To step through the stack traces and find possible errors
    :param kwargs: accepted and ignored
    :returns: the value returned by `save_tensors` for the collected tensors
    """
    banner = (
        'Dataset path: {}'.format(dataset),
        'Model path: {}'.format(model_path),
        'Output path: {}'.format(output_directory),
        '\n',
    )
    for message in banner:
        logger.info(message)

    load_options = dict(
        gpus=gpus,
        gpu_memory_limit=gpu_memory_limit,
        allow_parallel_threads=allow_parallel_threads,
        use_horovod=use_horovod,
    )
    model = LudwigModel.load(model_path, **load_options)

    # collect activations
    print_boxed('COLLECT ACTIVATIONS')
    collected_tensors = model.collect_activations(
        layers,
        dataset,
        data_format=data_format,
        batch_size=batch_size,
        debug=debug,
    )

    # saving
    os.makedirs(output_directory, exist_ok=True)
    saved_filenames = save_tensors(collected_tensors, output_directory)

    logger.info('Saved to: {0}'.format(output_directory))
    return saved_filenames
def run_api_experiment(input_features, output_features, data_csv):
    """
    Helper method to avoid code repetition in running an experiment.

    Trains on the CSV path, reloads the saved model and asserts its weights
    match the trained model; then trains again on the same data loaded as a
    dataframe. Output directories are removed after each phase.

    :param input_features: input schema
    :param output_features: output schema
    :param data_csv: path to data
    :return: None
    """
    config = {
        "input_features": input_features,
        "output_features": output_features,
        "combiner": {"type": "concat", "fc_size": 14},
        "training": {"epochs": 2},
    }
    model = LudwigModel(config)
    train_kwargs = dict(
        skip_save_processed_input=True,
        skip_save_progress=True,
        skip_save_unprocessed_output=True,
    )

    output_dir = None
    try:
        # Training with csv
        _, _, output_dir = model.train(dataset=data_csv, **train_kwargs)
        model.predict(dataset=data_csv)

        model_dir = os.path.join(output_dir, "model")
        loaded_model = LudwigModel.load(model_dir)

        # Necessary before call to get_weights() to materialize the weights
        loaded_model.predict(dataset=data_csv)

        model_weights = model.model.get_weights()
        loaded_weights = loaded_model.model.get_weights()
        for model_weight, loaded_weight in zip(model_weights, loaded_weights):
            assert np.allclose(model_weight, loaded_weight)
    finally:
        # Remove results/intermediate data saved to disk. output_dir is still
        # None when train() itself failed, so there is nothing to remove then.
        if output_dir:
            shutil.rmtree(output_dir, ignore_errors=True)

    # Reset so a failure in the second phase does not try to remove the
    # directory that was already cleaned up above.
    output_dir = None
    try:
        # Training with dataframe
        data_df = read_csv(data_csv)
        _, _, output_dir = model.train(dataset=data_df, **train_kwargs)
        model.predict(dataset=data_df)
    finally:
        if output_dir:
            shutil.rmtree(output_dir, ignore_errors=True)
def run_test_gbm_number(tmpdir, backend_config):
    """Test that the GBM model can train and predict a numerical output (regression).

    :param tmpdir: directory used for the generated dataset, the training
        output and the predictions.
    :param backend_config: backend configuration dict; its ``"type"`` entry
        decides whether the prediction column must be computed first.
    """
    # Given a dataset with a single input feature and a single output feature,
    input_features = [number_feature(), category_feature(reduce_output="sum")]
    output_feature = number_feature()
    output_features = [output_feature]

    csv_filename = os.path.join(tmpdir, "training.csv")
    dataset_filename = generate_data(input_features, output_features, csv_filename, num_examples=100)

    config = {
        MODEL_TYPE: "gbm",
        "input_features": input_features,
        "output_features": output_features,
        TRAINER: {"num_boost_round": 2},
    }

    # When I train a model on the dataset, load the model from the output
    # directory, and predict on the dataset
    model = LudwigModel(config, backend=backend_config)
    model.train(
        dataset=dataset_filename,
        output_directory=tmpdir,
        skip_save_processed_input=True,
        skip_save_progress=True,
        skip_save_unprocessed_output=True,
        skip_save_log=True,
    )

    # LudwigModel.load returns the loaded model; keep it so that prediction
    # really runs on the reloaded model rather than the in-memory one.
    # The same backend is passed so the ray branch below still applies
    # (assumes load accepts the same backend config as the constructor).
    model = LudwigModel.load(
        os.path.join(tmpdir, "api_experiment_run", "model"),
        backend=backend_config,
    )
    preds, _ = model.predict(
        dataset=dataset_filename,
        output_directory=os.path.join(tmpdir, "predictions"),
    )

    # Then the predictions should be included in the output
    pred_col = preds[output_feature["name"] + "_predictions"]
    if backend_config["type"] == "ray":
        pred_col = pred_col.compute()
    assert pred_col.dtype == float
def predict_test_with_ludwig(self):
    """Predict on ``../../test.tsv`` with the saved IMDB review model.

    The predictions are written to ``predicted.csv`` (with header, without
    index) and also printed.
    """
    model = LudwigModel.load("results/api_experiment_imdb_review/model")
    test_frame = pd.read_table("../../test.tsv", sep='\t')
    test_predictions = model.predict(data_df=test_frame)
    test_predictions.to_csv(path_or_buf="predicted.csv", header=True, index=False)
    print(test_predictions)
def initialize(cfg):
    """Load the configured Ludwig model into module globals and print a summary.

    Reads ``cfg['pth_mdls']``, ``cfg['model_to_load']`` and
    ``cfg['port_num']``. Sets the globals ``MODL`` (the loaded model),
    ``MMTA`` (contents of ``train_set_metadata.json``), ``FEAS_IN`` and
    ``FEAS_OUT`` (input / output feature entries taken from the model's
    hyperparameters).

    :param cfg: mapping holding the keys listed above.
    :raises Exception: when the model directory does not exist.
    """
    pth_mdl = os.path.join(cfg['pth_mdls'], cfg['model_to_load'], "model")
    if not os.path.isdir(pth_mdl):
        raise Exception(
            "Could not find the model specified in the models directory: {}".
            format(pth_mdl))

    global MODL, MMTA, FEAS_IN, FEAS_OUT

    # load a model
    st = time.time()
    print("...loading model from {}".format(pth_mdl))
    MODL = LudwigModel.load(pth_mdl)
    MODL.set_logging_level(logging.ERROR)
    # The training-set metadata is read separately from the model directory.
    with open(os.path.join(pth_mdl, "train_set_metadata.json")) as f:
        MMTA = json.load(f)
    print("...loaded model in {:.2f}s".format(time.time() - st))

    FEAS_IN = [fea for fea in MODL.model.hyperparameters['input_features']]
    FEAS_OUT = [fea for fea in MODL.model.hyperparameters['output_features']]

    # Build one tab-indented description line per feature for the banner.
    input_features_desc, output_features_desc = "", ""
    for fea in FEAS_IN:
        if fea['type'] == "image":
            # Image inputs also report their width, height and channel mode.
            input_features_desc += "\t{}\t({}: {})\n".format(
                fea['name'], fea['type'], "({}, {}) {}".format(
                    fea['width'], fea['height'],
                    fe.util.chan_count_to_mode(fea['num_channels'])))
        else:
            input_features_desc += "\t{}\t({})\n".format(
                fea['name'], fea['type'])

    for fea in FEAS_OUT:
        if fea['type'] == "category":
            # Attach the feature's metadata entry so the class labels
            # (idx2str) can be listed; this mutates the entry in FEAS_OUT.
            fea['meta'] = MMTA[fea['name']]
            output_features_desc += "\t{}\t(category with {} classes)\n".format(
                fea['name'], fea['num_classes'])
            output_features_desc += "\t\t\t{}\n".format(", ".join(
                fea['meta']['idx2str']))
        else:
            output_features_desc += "\t{}\t({})\n".format(
                fea['name'], fea['type'])

    #print(output_features)
    # NOTE(review): the line breaks inside this message look collapsed in this
    # copy of the file - confirm the intended layout before relying on it.
    console_msg = """#################### FRESH EYES #################### I've just loaded a saved model from {0} To check that the server is working, go to http://localhost:{1}/ It looks like the model I loaded requires the following inputs to make a prediction: {2} Make note of these names, as these particular fields will be required by API calls. It looks like the following values will result from a prediction. {3} Make note of these as well, as these fields will be returned to API calls. 
Invoke Cntl+C to stop the server #################### FRESH EYES #################### """
    print(
        console_msg.format(pth_mdl, cfg['port_num'], input_features_desc,
                           output_features_desc))
def get_tags(sentence):
    """Return the slot predictions of the tagger model for one sentence.

    The sentence is stripped, lower-cased and cleared of ASCII punctuation
    before being passed to the model under the ``utterance`` key.

    :param sentence: (str) the raw utterance.
    :return: the ``.values`` of the ``slots_predictions`` prediction column.
    """
    # Normalise first so a bad input fails before a model is loaded.
    sentence = sentence.strip().lower().translate(
        str.maketrans('', '', string.punctuation))

    tagger = LudwigModel.load('../tagger')
    try:
        predictions = tagger.predict(data_dict={'utterance': [sentence]})
        return predictions['slots_predictions'].values
    finally:
        # Release the model even when prediction raises.
        tagger.close()
def run_test_gbm_category(tmpdir, backend_config):
    """Test that the GBM model can train and predict a categorical output (multiclass classification).

    :param tmpdir: directory used for the generated dataset and the training
        output.
    :param backend_config: backend configuration dict; its ``"type"`` entry
        decides whether the probability column must be computed first.
    """
    input_features = [number_feature(), category_feature(reduce_output="sum")]
    vocab_size = 3
    output_feature = category_feature(vocab_size=vocab_size)
    output_features = [output_feature]

    csv_filename = os.path.join(tmpdir, "training.csv")
    dataset_filename = generate_data(input_features, output_features, csv_filename, num_examples=100)

    config = {
        MODEL_TYPE: "gbm",
        "input_features": input_features,
        "output_features": output_features,
        TRAINER: {"num_boost_round": 2},
    }

    model = LudwigModel(config, backend=backend_config)
    _, _, output_directory = model.train(
        dataset=dataset_filename,
        output_directory=tmpdir,
        skip_save_processed_input=True,
        skip_save_progress=True,
        skip_save_unprocessed_output=True,
        skip_save_log=True,
    )

    # LudwigModel.load returns the loaded model; keep it so that prediction
    # really runs on the reloaded model rather than the in-memory one.
    # The same backend is passed so the ray branch below still applies
    # (assumes load accepts the same backend config as the constructor).
    model = LudwigModel.load(
        os.path.join(tmpdir, "api_experiment_run", "model"),
        backend=backend_config,
    )
    preds, _ = model.predict(dataset=dataset_filename, output_directory=output_directory)

    prob_col = preds[output_feature["name"] + "_probabilities"]
    if backend_config["type"] == "ray":
        prob_col = prob_col.compute()
    # One probability per class plus one extra entry, and each row sums to 1.
    assert len(prob_col.iloc[0]) == (vocab_size + 1)
    assert prob_col.apply(sum).mean() == pytest.approx(1.0)
def make_inference():
    """Predict the label of two sample hand images and print the result.

    Loads the model saved under ``results/exp_run/model``, predicts on two
    hard-coded image paths (with placeholder labels of -1) and prints the
    ``label_predictions`` values of the first element of the result.
    """
    trained_model = LudwigModel.load("results/exp_run/model")

    # Raw strings keep the Windows backslashes literal. Written as plain
    # strings, sequences such as "\p" and "\h" are invalid escapes that
    # Python only tolerates with a warning.
    image_paths = [
        r"D:\programs\python\exampl\hand_recognition\infer/right (24).jpg",
        r"D:\programs\python\exampl\hand_recognition\infer\left (26).jpg",
    ]
    prediction = trained_model.predict(
        {
            "image_path": image_paths,
            "label": [-1, -1],
        }
    )
    print(prediction[0]["label_predictions"].values)
def run_test_with_ludwig(model_path, test_file_csv):
    """
    Wrap around Ludwig testing.

    :param model_path: path in which already trained model is
    :param test_file_csv: path to csv file with test data points
    :return: predictions from the model and dictionary with stats for human debug
    """
    model = LudwigModel.load(model_path)
    try:
        predictions, test_stats = model.test(data_csv=test_file_csv)
    finally:
        # Release the model even when testing raises.
        model.close()
    return predictions, test_stats
def train(self):
    """Train the Ludwig model for the current transaction and save it.

    Builds the training dataframe and model definition, converts them to the
    timeseries format when ``model_order_by`` is set, trains either a fresh
    model (``rebuild_model`` is True) or the previously saved one, records
    the reported accuracies and saves the model under
    ``CONFIG.MINDSDB_STORAGE_PATH``. The save path and the model definition
    are stored back on the transaction metadata.
    """
    training_dataframe, model_definition = self._create_ludwig_dataframe('train')

    order_by = self.transaction.lmd['model_order_by']
    timeseries_cols = [] if order_by is None else [entry[0] for entry in order_by]

    if timeseries_cols:
        training_dataframe, model_definition = self._translate_df_to_timeseries_format(
            training_dataframe, model_definition, timeseries_cols, 'train')

    with disable_ludwig_output(True):
        # Online training (initialize_model + train_online) is not used:
        # Ludwig cannot initialise a model without train_set_metadata, and
        # that metadata is only produced by train(). See
        # https://github.com/uber/ludwig/issues/295
        if self.transaction.lmd['rebuild_model'] is True:
            model = LudwigModel(model_definition)
        else:
            model = LudwigModel.load(self.transaction.lmd['ludwig_data']['ludwig_save_path'])

        train_stats = model.train(
            data_df=training_dataframe,
            model_name=self.transaction.lmd['name'],
            skip_save_model=True,
        )

    model_accuracy = self.transaction.lmd['model_accuracy']
    for k in train_stats['train']:
        if k not in model_accuracy['train']:
            model_accuracy['train'][k] = []
            model_accuracy['test'][k] = []
        elif k == 'combined':
            # Compared by value: the previous identity check (`is not`) on a
            # string literal depends on interning and could skip this branch.
            # Accuracy is only recorded for 'combined' for now, which only
            # affects multi-output scenarios.
            model_accuracy['train'][k].extend(train_stats['train'][k]['accuracy'])
            model_accuracy['test'][k].extend(train_stats['test'][k]['accuracy'])

    ludwig_model_savepath = os.path.join(
        CONFIG.MINDSDB_STORAGE_PATH,
        self.transaction.lmd['name'] + '_ludwig_data',
    )
    try:
        model.save(ludwig_model_savepath)
    finally:
        # Release the model even when saving fails.
        model.close()

    self.transaction.lmd['ludwig_data'] = {'ludwig_save_path': ludwig_model_savepath}
    self.transaction.hmd['ludwig_data'] = {'model_definition': model_definition}
def do_predictions(prediction_dictionary, target_folder):
    """Reload the model in ``target_folder`` and print its path predictions.

    For every (query, skus_in_session) pair in ``prediction_dictionary`` the
    predicted path is printed with ``<PAD>`` tokens removed and the
    remaining tokens joined by ``' > '``. Returns ``None``.
    """
    # reload the model
    model = LudwigModel.load(target_folder)
    # get predictions
    predictions = model.predict(data_dict=prediction_dictionary)

    rows = zip(
        prediction_dictionary['query'],
        prediction_dictionary['skus_in_session'],
        predictions['path_predictions'],
    )
    for input_q, input_skus, output in rows:
        predicted_path = ' > '.join(token for token in output if token != '<PAD>')
        print("\nInput: <{}, {}>, predicted path: {}".format(input_q, input_skus, predicted_path))

    return None
def predict_test_with_ludwig(self):
    """Predict on the processed test CSV with the saved cancer diagnosis model.

    Two columns whose names contain a space are renamed to the underscore
    form before predicting. The predictions are written to ``predicted.csv``
    (with header, without index) and also printed.
    """
    model = LudwigModel.load("results/api_experiment_cancer_diagnosis/model")

    column_fixes = {
        "concave points_se": "concave_points_se",
        "concave points_worst": "concave_points_worst",
    }
    test_frame = pd.read_csv("../../processed_data/test.csv")
    test_frame = test_frame.rename(columns=column_fixes)

    test_predictions = model.predict(data_df=test_frame)
    test_predictions.to_csv(path_or_buf="predicted.csv", header=True, index=False)
    print(test_predictions)