def test_experiment_dataset_formats(data_format):
    # primary focus of this test is to determine if exceptions are
    # raised for different data set formats and in_memory setting
    input_features = [numerical_feature(), category_feature()]
    output_features = [category_feature(), numerical_feature()]

    config = {
        'input_features': input_features,
        'output_features': output_features,
        'combiner': {'type': 'concat', 'fc_size': 14},
        'preprocessing': {},
        'training': {'epochs': 2}
    }

    # create temporary name for train and test data sets
    csv_filename = 'train_' + uuid.uuid4().hex[:10].upper() + '.csv'

    # setup training data format to test
    raw_data = generate_data(input_features, output_features, csv_filename)

    training_set_metadata = None

    if data_format == 'hdf5':
        # hdf5 format
        training_set, _, _, training_set_metadata = preprocess_for_training(
            config,
            dataset=raw_data
        )
        dataset_to_use = training_set.data_hdf5_fp
    else:
        dataset_to_use = create_data_set_to_use(data_format, raw_data)

    # define Ludwig model
    model = LudwigModel(config=config)
    model.train(
        dataset=dataset_to_use,
        training_set_metadata=training_set_metadata,
        random_seed=default_random_seed
    )

    # run functions with the specified data format
    model.evaluate(dataset=dataset_to_use)
    model.predict(dataset=dataset_to_use)

    # delete the temporary data created
    delete_temporary_data(csv_filename)
def test_resource_usage_tracker(tmpdir):
    train_df = pd.DataFrame(np.random.normal(0, 1, size=(100, 3)), columns=["input_1", "input_2", "output_1"])
    eval_df = pd.DataFrame(np.random.normal(0, 1, size=(20, 3)), columns=["input_1", "input_2", "output_1"])

    config = {
        "input_features": [
            {"name": "input_1", "type": "number"},
            {"name": "input_2", "type": "number"},
        ],
        "output_features": [{"name": "output_1", "type": "number"}],
        "combiner": {"type": "concat", "output_size": 14},
        TRAINER: {"epochs": 1},
    }

    model = LudwigModel(config=config, backend="local")

    with ResourceUsageTracker(tag="train", output_dir=tmpdir, logging_interval=0.05, num_examples=len(train_df)):
        model.train(
            dataset=train_df,
            output_directory=tmpdir,
            skip_save_training_description=True,
            skip_save_training_statistics=True,
            skip_save_model=True,
            skip_save_progress=True,
            skip_save_log=True,
            skip_save_processed_input=True,
        )

    with ResourceUsageTracker(tag="evaluate", output_dir=tmpdir, logging_interval=0.05, num_examples=len(eval_df)):
        model.evaluate(dataset=eval_df)

    assert os.path.exists(os.path.join(tmpdir, "train_resource_usage_metrics.json"))
    assert os.path.exists(os.path.join(tmpdir, "evaluate_resource_usage_metrics.json"))

    shutil.rmtree(tmpdir)
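# A minimal follow-up sketch (not part of the test above): loading one of the
# metrics files that ResourceUsageTracker writes. The only assumption taken
# from the test is the f"{tag}_resource_usage_metrics.json" naming pattern
# asserted there; the structure of the JSON payload is not assumed.
import json
import os


def load_resource_usage_metrics(output_dir, tag):
    # e.g. tag="train" -> <output_dir>/train_resource_usage_metrics.json
    with open(os.path.join(output_dir, f"{tag}_resource_usage_metrics.json")) as f:
        return json.load(f)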
def test_experiment_dataset_formats(data_format, csv_filename):
    # primary focus of this test is to determine if exceptions are
    # raised for different data set formats and in_memory setting
    input_features = [number_feature(), category_feature()]
    output_features = [category_feature(), number_feature()]

    config = {
        "input_features": input_features,
        "output_features": output_features,
        "combiner": {"type": "concat", "output_size": 14},
        "preprocessing": {},
        TRAINER: {"epochs": 2},
    }

    # setup training data format to test
    raw_data = generate_data(input_features, output_features, csv_filename)

    training_set_metadata = None

    if data_format == "hdf5":
        # hdf5 format
        training_set, _, _, training_set_metadata = preprocess_for_training(config, dataset=raw_data)
        dataset_to_use = training_set.data_hdf5_fp
    else:
        dataset_to_use = create_data_set_to_use(data_format, raw_data)

    # define Ludwig model
    model = LudwigModel(config=config)
    model.train(
        dataset=dataset_to_use,
        training_set_metadata=training_set_metadata,
        random_seed=default_random_seed,
    )

    # run functions with the specified data format
    model.evaluate(dataset=dataset_to_use)
    model.predict(dataset=dataset_to_use)
def train_with_backend(
    backend,
    config,
    dataset=None,
    training_set=None,
    validation_set=None,
    test_set=None,
    predict=True,
    evaluate=True,
    callbacks=None,
):
    model = LudwigModel(config, backend=backend, callbacks=callbacks)
    output_dir = None

    try:
        _, _, output_dir = model.train(
            dataset=dataset,
            training_set=training_set,
            validation_set=validation_set,
            test_set=test_set,
            skip_save_processed_input=True,
            skip_save_progress=True,
            skip_save_unprocessed_output=True,
            skip_save_log=True,
        )

        if dataset is None:
            dataset = training_set

        if predict:
            preds, _ = model.predict(dataset=dataset)
            assert preds is not None

        if evaluate:
            _, eval_preds, _ = model.evaluate(dataset=dataset)
            assert eval_preds is not None

        return model
    finally:
        # Remove results/intermediate data saved to disk
        shutil.rmtree(output_dir, ignore_errors=True)
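# A hedged usage sketch for the train_with_backend helper above. The DataFrame,
# column names, and config here are hypothetical placeholders, not taken from
# the tests; "local" is the standard in-process Ludwig backend.
import pandas as pd

example_df = pd.DataFrame(
    {
        "color": ["red", "blue", "red", "blue", "red", "blue"],
        "label": [True, False, True, False, True, False],
    }
)
example_config = {
    "input_features": [{"name": "color", "type": "category"}],
    "output_features": [{"name": "label", "type": "binary"}],
}
# Trains, predicts, and evaluates, then removes the output directory on exit.
trained_model = train_with_backend("local", example_config, dataset=example_df)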
def test_experiment_image_dataset(train_format, train_in_memory, test_format, test_in_memory):
    # primary focus of this test is to determine if exceptions are
    # raised for different data set formats and in_memory setting

    # Image Inputs
    image_dest_folder = os.path.join(os.getcwd(), 'generated_images')

    input_features = [
        image_feature(
            folder=image_dest_folder,
            encoder='stacked_cnn',
            preprocessing={
                'in_memory': True,
                'height': 12,
                'width': 12,
                'num_channels': 3,
                'num_processes': 5
            },
            fc_size=16,
            num_filters=8
        ),
    ]
    output_features = [
        category_feature(vocab_size=2, reduce_input='sum'),
    ]

    config = {
        'input_features': input_features,
        'output_features': output_features,
        'combiner': {'type': 'concat', 'fc_size': 14},
        'preprocessing': {},
        'training': {'epochs': 2}
    }

    # create temporary name for train and test data sets
    train_csv_filename = 'train_' + uuid.uuid4().hex[:10].upper() + '.csv'
    test_csv_filename = 'test_' + uuid.uuid4().hex[:10].upper() + '.csv'

    # setup training data format to test
    train_data = generate_data(input_features, output_features, train_csv_filename)
    config['input_features'][0]['preprocessing']['in_memory'] = train_in_memory

    training_set_metadata = None

    if train_format == 'hdf5':
        # hdf5 format
        train_set, _, _, training_set_metadata = preprocess_for_training(
            config,
            dataset=train_data
        )
        train_dataset_to_use = train_set.data_hdf5_fp
    else:
        train_dataset_to_use = create_data_set_to_use(train_format, train_data)

    # define Ludwig model
    model = LudwigModel(config=config)
    model.train(
        dataset=train_dataset_to_use,
        training_set_metadata=training_set_metadata
    )

    model.config['input_features'][0]['preprocessing']['in_memory'] = test_in_memory

    # setup test data format to test
    test_data = generate_data(input_features, output_features, test_csv_filename)

    if test_format == 'hdf5':
        # hdf5 format
        # create hdf5 data set
        _, test_set, _, training_set_metadata_for_test = preprocess_for_training(
            model.config,
            dataset=test_data
        )
        test_dataset_to_use = test_set.data_hdf5_fp
    else:
        test_dataset_to_use = create_data_set_to_use(test_format, test_data)

    # run functions with the specified data format
    model.evaluate(dataset=test_dataset_to_use)
    model.predict(dataset=test_dataset_to_use)

    # delete the temporary data created
    shutil.rmtree(image_dest_folder)
    delete_temporary_data(train_csv_filename)
    delete_temporary_data(test_csv_filename)
def run_api_commands(
    input_features,
    output_features,
    data_csv,
    output_dir,
    skip_save_training_description=False,
    skip_save_training_statistics=False,
    skip_save_model=False,
    skip_save_progress=False,
    skip_save_log=False,
    skip_save_processed_input=False,
    skip_save_unprocessed_output=False,
    skip_save_predictions=False,
    skip_save_eval_stats=False,
    skip_collect_predictions=False,
    skip_collect_overall_stats=False,
):
    """Helper method to avoid code repetition in running an experiment.

    :param input_features: input schema
    :param output_features: output schema
    :param data_csv: path to data
    :return: None
    """
    config = {
        "input_features": input_features,
        "output_features": output_features,
        "combiner": {"type": "concat", "fc_size": 14},
        "training": {"epochs": 2},
    }
    model = LudwigModel(config)

    # Training with csv
    model.train(
        dataset=data_csv,
        skip_save_training_description=skip_save_training_description,
        skip_save_training_statistics=skip_save_training_statistics,
        skip_save_model=skip_save_model,
        skip_save_progress=skip_save_progress,
        skip_save_log=skip_save_log,
        skip_save_processed_input=skip_save_processed_input,
        output_directory=output_dir,
    )

    model.predict(
        dataset=data_csv,
        skip_save_unprocessed_output=skip_save_unprocessed_output,
        skip_save_predictions=skip_save_predictions,
        output_directory=output_dir,
    )

    model.evaluate(
        dataset=data_csv,
        skip_save_unprocessed_output=skip_save_unprocessed_output,
        skip_save_predictions=skip_save_predictions,
        skip_save_eval_stats=skip_save_eval_stats,
        collect_predictions=not skip_collect_predictions,
        collect_overall_stats=not skip_collect_overall_stats,
        output_directory=output_dir,
    )

    model.experiment(
        dataset=data_csv,
        skip_save_training_description=skip_save_training_description,
        skip_save_training_statistics=skip_save_training_statistics,
        skip_save_model=skip_save_model,
        skip_save_progress=skip_save_progress,
        skip_save_log=skip_save_log,
        skip_save_processed_input=skip_save_processed_input,
        skip_save_unprocessed_output=skip_save_unprocessed_output,
        skip_save_predictions=skip_save_predictions,
        skip_save_eval_stats=skip_save_eval_stats,
        skip_collect_predictions=skip_collect_predictions,
        skip_collect_overall_stats=skip_collect_overall_stats,
        output_directory=output_dir,
    )
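# A hedged usage sketch for run_api_commands above, using plain feature dicts
# rather than the test suite's feature-constructor helpers; "train.csv" and
# "results" are placeholder paths, not values from the original code.
run_api_commands(
    input_features=[{"name": "doc_text", "type": "text"}],
    output_features=[{"name": "class", "type": "category"}],
    data_csv="train.csv",  # placeholder: an existing CSV with matching columns
    output_dir="results",
    skip_save_model=True,
)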
def test_experiment_image_dataset(train_format, train_in_memory, test_format, test_in_memory, tmpdir):
    # Image Inputs
    image_dest_folder = os.path.join(tmpdir, "generated_images")

    input_features = [
        image_feature(
            folder=image_dest_folder,
            encoder="stacked_cnn",
            preprocessing={
                "in_memory": True,
                "height": 12,
                "width": 12,
                "num_channels": 3,
                "num_processes": 5,
            },
            output_size=16,
            num_filters=8,
        ),
    ]
    output_features = [
        category_feature(vocab_size=2, reduce_input="sum"),
    ]

    config = {
        "input_features": input_features,
        "output_features": output_features,
        "combiner": {"type": "concat", "output_size": 14},
        "preprocessing": {},
        TRAINER: {"epochs": 2},
    }

    # create temporary name for train and test data sets
    train_csv_filename = os.path.join(tmpdir, "train_" + uuid.uuid4().hex[:10].upper() + ".csv")
    test_csv_filename = os.path.join(tmpdir, "test_" + uuid.uuid4().hex[:10].upper() + ".csv")

    # setup training data format to test
    train_data = generate_data(input_features, output_features, train_csv_filename)
    config["input_features"][0]["preprocessing"]["in_memory"] = train_in_memory

    training_set_metadata = None
    backend = LocalTestBackend()

    if train_format == "hdf5":
        # hdf5 format
        train_set, _, _, training_set_metadata = preprocess_for_training(
            config,
            dataset=train_data,
            backend=backend,
        )
        train_dataset_to_use = train_set.data_hdf5_fp
    else:
        train_dataset_to_use = create_data_set_to_use(train_format, train_data)

    # define Ludwig model
    model = LudwigModel(config=config, backend=backend)
    model.train(dataset=train_dataset_to_use, training_set_metadata=training_set_metadata)

    model.config["input_features"][0]["preprocessing"]["in_memory"] = test_in_memory

    # setup test data format to test
    test_data = generate_data(input_features, output_features, test_csv_filename)

    if test_format == "hdf5":
        # hdf5 format
        # create hdf5 data set
        _, test_set, _, training_set_metadata_for_test = preprocess_for_training(
            model.config,
            dataset=test_data,
            backend=backend,
        )
        test_dataset_to_use = test_set.data_hdf5_fp
    else:
        test_dataset_to_use = create_data_set_to_use(test_format, test_data)

    # run functions with the specified data format
    model.evaluate(dataset=test_dataset_to_use)
    model.predict(dataset=test_dataset_to_use)
config = {
    'input_features': input_features,
    'output_features': output_features,
    'combiner': {'type': 'concat', 'fc_size': 14},
    'training': {'epochs': 2}
}
model = LudwigModel(config)
train_stats, _, _ = model.train(dataset=df)

st.header('Eval Stats')
eval_stats, _, _ = model.evaluate(dataset=df)
# st.write(eval_stats)
# st.write(type(eval_stats))

# WORKS!
st.subheader('In JSON format')
json_object = json.dumps(str(eval_stats), indent=4)
st.write(json_object)

# DOESN'T WORK YET
st.subheader('In dataframe format')
st.write('separate dictionaries from main dictionary')
df = pd.DataFrame([eval_stats], columns=eval_stats.keys())
st.table(df)

######################
def page_settings(state):
    st.title("Train your text classifier!")
    display_state_values(state)
    # st.write("---")

    import os

    # ModelFiles = os.path.isfile("test.csv")
    ModelFiles = os.path.isfile("training_set_metadata.json")
    if not ModelFiles:
        st.warning('Please train a model first!')
        # st.stop()
        # st.success('ModelFiles_are_saved')

        import pandas as pd
        import io
        import base64

        uploaded_file = st.file_uploader("Choose a file", key="2")
        if uploaded_file is not None:
            # Can be used wherever a "file-like" object is accepted:
            df = pd.read_csv(uploaded_file)
            uploaded_file.seek(0)
            # df.seek(0)
            df.columns = ["doc_text", "class"]
            st.write(df)
        else:
            # st.warning('Upload the CSV to be trained')
            st.stop()

        input_features = [{'name': 'doc_text', 'type': 'text'}]
        output_features = [{'name': 'class', 'type': 'category'}]

        config = {
            'input_features': input_features,
            'output_features': output_features,
            'combiner': {'type': 'concat', 'fc_size': 14},
            'training': {'epochs': 2}
        }

        model = LudwigModel(config)
        train_stats, _, _ = model.train(dataset=df)

        st.header('Eval Stats')
        eval_stats, _, _ = model.evaluate(dataset=df)
        # st.write(eval_stats)
        # st.write(type(eval_stats))

        # WORKS!
        st.subheader('In JSON format')
        json_object = json.dumps(str(eval_stats), indent=4)
        st.write(json_object)

        # DOESN'T WORK YET
        st.subheader('In dataframe format')
        st.write('separate dictionaries from main dictionary')
        df = pd.DataFrame([eval_stats], columns=eval_stats.keys())
        # st.table("df")
        # st.table(df)

        # Save model
        model.save(cwd)  # cwd: save directory defined elsewhere in the app
    else:
        st.success('The model has now been trained, you can start making predictions!')
        st.image('arrow2.png', width=325)
def train_with_backend(
    backend,
    config,
    dataset=None,
    training_set=None,
    validation_set=None,
    test_set=None,
    predict=True,
    evaluate=True,
    callbacks=None,
    skip_save_processed_input=True,
):
    model = LudwigModel(config, backend=backend, callbacks=callbacks)
    output_dir = None

    try:
        _, _, output_dir = model.train(
            dataset=dataset,
            training_set=training_set,
            validation_set=validation_set,
            test_set=test_set,
            skip_save_processed_input=skip_save_processed_input,
            skip_save_progress=True,
            skip_save_unprocessed_output=True,
            skip_save_log=True,
        )

        if dataset is None:
            dataset = training_set

        if predict:
            preds, _ = model.predict(dataset=dataset)
            assert preds is not None

        if evaluate:
            eval_stats, eval_preds, _ = model.evaluate(
                dataset=dataset, collect_overall_stats=False, collect_predictions=True
            )
            assert eval_preds is not None

            # Test that eval_stats are approx equal when using local backend
            with tempfile.TemporaryDirectory() as tmpdir:
                model.save(tmpdir)
                local_model = LudwigModel.load(tmpdir, backend=LocalTestBackend())
                local_eval_stats, _, _ = local_model.evaluate(
                    dataset=dataset, collect_overall_stats=False, collect_predictions=False
                )

                # Filter out metrics that are not being aggregated correctly for now
                # TODO(travis): https://github.com/ludwig-ai/ludwig/issues/1956
                def filter(stats):
                    return {
                        k: {
                            metric_name: value
                            for metric_name, value in v.items()
                            if metric_name not in {"loss", "root_mean_squared_percentage_error"}
                        }
                        for k, v in stats.items()
                    }

                for (k1, v1), (k2, v2) in zip(filter(eval_stats).items(), filter(local_eval_stats).items()):
                    assert k1 == k2
                    for (name1, metric1), (name2, metric2) in zip(v1.items(), v2.items()):
                        assert name1 == name2
                        assert np.isclose(
                            metric1, metric2, rtol=1e-04, atol=1e-5
                        ), f"metric {name1}: {metric1} != {metric2}"

        return model
    finally:
        # Remove results/intermediate data saved to disk
        shutil.rmtree(output_dir, ignore_errors=True)
# Define Ludwig model object that drives model training
model = LudwigModel(config=model_id + "_config.yaml", logging_level=logging.WARN)

# initiate model training
train_stats, _, _ = model.train(
    training_set=training_set,
    validation_set=val_set,
    test_set=test_set,
    experiment_name="balance_example",
    model_name=model_id,
    skip_save_model=True,
)

# evaluate model on test_set
eval_stats, _, _ = model.evaluate(test_set)

# save eval stats for later use
list_of_eval_stats.append(eval_stats)

print(">>>>>>> completed: ", model_id, "\n")

compare_performance(
    list_of_eval_stats,
    "Response",
    model_names=list_of_model_ids,
    output_directory="./visualizations",
    file_format="png",
)
if __name__ == "__main__":
    if sys.argv[1] == "train":
        if len(sys.argv) != 5:
            print(
                "Incorrect number of arguments. Please use format:\n"
                "python main.py train <path-to-input-csv-file> <path-to-output-csv-file> <ludwig-model-definition>"
            )
            sys.exit(1)  # abort: required arguments are missing
        preprocess_dataset(sys.argv[2], sys.argv[3], 1)
        config = sys.argv[4]
        model = LudwigModel(config)
        train_stats = model.experiment(
            dataset=sys.argv[3],
            training_set=sys.argv[3],
            validation_set=sys.argv[3],
            test_set=sys.argv[3],
            experiment_name='covid_inference',
            model_name='train',
        )
        print(train_stats)
    elif sys.argv[1] == "evaluate":
        if len(sys.argv) != 6:
            print(
                "Incorrect number of arguments. Please use format:\n"
                "python main.py evaluate <path-to-trained-model> <path-to-input-csv-file> <path-to-output-csv-file> <ludwig-model-definition>"
            )
            sys.exit(1)  # abort: required arguments are missing
        preprocess_dataset(sys.argv[3], sys.argv[4], 0)
        config = sys.argv[5]
        model = LudwigModel.load(sys.argv[2])
        train_stats = model.evaluate(
            dataset=sys.argv[4],
            skip_save_predictions=False,
            skip_save_eval_stats=False,
            collect_predictions=True,
            collect_overall_stats=True,
        )
        print(train_stats)
    else:
        print("Incorrect arguments. Please use format:\n python main.py [train/evaluate]")

# ludwig train --dataset sys.argv[2] --config sys.argv[3]
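# Example invocations of the script above (all paths are placeholders; the
# model directory for "evaluate" follows Ludwig's results/<experiment>_<model>/model
# layout implied by the experiment_name and model_name used in "train"):
#   python main.py train raw_input.csv preprocessed.csv config.yaml
#   python main.py evaluate results/covid_inference_train/model raw_input.csv preprocessed.csv config.yaml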
def train_and_eval_on_split(
    model_definition,
    eval_split=VALIDATION,
    dataset=None,
    training_set=None,
    validation_set=None,
    test_set=None,
    training_set_metadata=None,
    data_format=None,
    experiment_name="hyperopt",
    model_name="run",
    # model_load_path=None,
    # model_resume_path=None,
    skip_save_training_description=False,
    skip_save_training_statistics=False,
    skip_save_model=False,
    skip_save_progress=False,
    skip_save_log=False,
    skip_save_processed_input=False,
    skip_save_unprocessed_output=False,
    skip_save_predictions=False,
    skip_save_eval_stats=False,
    output_directory="results",
    gpus=None,
    gpu_memory_limit=None,
    allow_parallel_threads=True,
    use_horovod=None,
    random_seed=default_random_seed,
    debug=False,
    **kwargs,
):
    # Collect training and validation losses and metrics
    # & append it to `results`
    model = LudwigModel(
        model_definition=model_definition,
        use_horovod=use_horovod,
        gpus=gpus,
        gpu_memory_limit=gpu_memory_limit,
        allow_parallel_threads=allow_parallel_threads,
    )

    train_stats, preprocessed_data, _ = model.train(
        dataset=dataset,
        training_set=training_set,
        validation_set=validation_set,
        test_set=test_set,
        training_set_metadata=training_set_metadata,
        data_format=data_format,
        experiment_name=experiment_name,
        model_name=model_name,
        skip_save_training_description=skip_save_training_description,
        skip_save_training_statistics=skip_save_training_statistics,
        skip_save_model=skip_save_model,
        skip_save_progress=skip_save_progress,
        skip_save_log=skip_save_log,
        skip_save_processed_input=skip_save_processed_input,
        output_directory=output_directory,
        random_seed=random_seed,
        debug=debug,
    )

    if model_definition[TRAINING]["eval_batch_size"] > 0:
        batch_size = model_definition[TRAINING]["eval_batch_size"]
    else:
        batch_size = model_definition[TRAINING]["batch_size"]

    training_set, validation_set, test_set, train_set_metadata = preprocessed_data

    eval_set = validation_set
    if eval_split == TRAINING:
        eval_set = training_set
    elif eval_split == VALIDATION:
        eval_set = validation_set
    elif eval_split == TEST:
        eval_set = test_set

    test_results, postproc_predictions, _ = model.evaluate(
        dataset=eval_set,
        data_format=data_format,
        batch_size=batch_size,
        skip_save_unprocessed_output=skip_save_unprocessed_output,
        skip_save_predictions=skip_save_predictions,
        skip_save_eval_stats=skip_save_eval_stats,
        output_directory=output_directory,
        return_type=dict,
        debug=debug,
    )

    return train_stats, test_results
dataset = twitter_bots.TwitterBots(cache_dir=".")
training_set, val_set, test_set = dataset.load(split=True)

# Moves profile images into local directory, so relative paths in the dataset will be resolved.
rename(os.path.join(dataset.processed_dataset_path, "profile_images"), "./profile_images")

with open("./config.yaml") as f:
    config = yaml.safe_load(f.read())

model = LudwigModel(config, logging_level=logging.INFO)
train_stats, preprocessed_data, output_directory = model.train(dataset=training_set)

# Generates predictions and performance statistics for the test set.
test_stats, predictions, output_directory = model.evaluate(
    test_set, collect_predictions=True, collect_overall_stats=True
)

confusion_matrix(
    [test_stats],
    model.training_set_metadata,
    "account_type",
    top_n_classes=[2],
    model_names=[""],
    normalize=True,
    output_directory="./visualizations",
    file_format="png",
)

# Visualizes learning curves, which show how performance metrics changed over time during training.
learning_curves(
    train_stats,
    output_feature_name="account_type",
    output_directory="./visualizations",
    file_format="png",
)