def train_and_test(model_definition, dataset_file, target_folder):
    """Train a Ludwig model on a CSV file, save it and print its test accuracy.

    Args:
        model_definition: Model definition handed to ``LudwigModel``.
        dataset_file: Path of the CSV file used for both training and testing.
        target_folder: Location passed to ``model.save``.
    """
    model = LudwigModel(model_definition, logging_level=DEBUG)
    try:
        model.train(data_csv=dataset_file)
        model.save(target_folder)

        # Optionally a separate test file can be supplied, or Ludwig's
        # built-in "split" column mechanism can be used.
        _, test_stats = model.test(data_csv=dataset_file)
        print(test_stats['combined']['accuracy'])
    finally:
        # Close the model even when training, saving or testing raises.
        model.close()
def main():
    """Train and evaluate a binary wine-quality classifier with Ludwig.

    Reads ``./winequality-red.csv``, converts the ``quality`` column into a
    0/1 label, trains on the resulting frame, tests on the same frame and
    saves the model under ``"wine"``.
    """
    data = pd.read_csv('./winequality-red.csv')

    # Label is 0 where quality < 6.5 and 1 everywhere else.  Negating the
    # "< 6.5" mask (instead of testing ">= 6.5") keeps the row-by-row rule the
    # original loop applied, including for values that compare False to 6.5.
    is_low_quality = data["quality"] < 6.5
    data["quality"] = (~is_low_quality).astype(int)

    numerical_inputs = (
        'fixed_acidity',
        'volatile_acidity',
        'citric_acid',
        'residual_sugar',
        'chlorides',
        'free_sulfur_dioxide',
        'total_sulfur_dioxide',
        'density',
        'pH',
        'sulphates',
        'alcohol',
    )
    model_definition = {
        "input_features": [
            {'name': column, 'type': 'numerical'}
            for column in numerical_inputs
        ],
        "output_features": [{'name': 'quality', 'type': 'binary'}],
    }

    model = LudwigModel(model_definition)
    try:
        model.train(data)
        model.test(data_df=data)
        model.save("wine")
    finally:
        # Close the model even when one of the steps above raises.
        model.close()
def train(self):
    """Train a Ludwig model on the transaction's training data and save it.

    The training frame and model definition come from
    ``self._create_ludwig_dataframe('train')``; when ``model_order_by`` is
    set, both are first translated to the time-series format.  Depending on
    ``lmd['rebuild_model']`` either a new model is trained, or the model at
    ``lmd['ludwig_data']['ludwig_save_path']`` is loaded and trained again.
    The model is then saved under ``CONFIG.MINDSDB_STORAGE_PATH`` and the
    save path / model definition are recorded in ``lmd`` / ``hmd``.
    """
    training_dataframe, model_definition = self._create_ludwig_dataframe('train')
    lmd = self.transaction.lmd

    order_by = lmd['model_order_by']
    timeseries_cols = [] if order_by is None else [entry[0] for entry in order_by]
    if timeseries_cols:
        training_dataframe, model_definition = self._translate_df_to_timeseries_format(
            training_dataframe, model_definition, timeseries_cols, 'train')

    with disable_ludwig_output(True):
        # NOTE: online training is not used; per the original author the model
        # cannot be initialized without train_set_metadata, which is only
        # produced by running train. See
        # https://github.com/uber/ludwig/issues/295
        if lmd['rebuild_model'] is True:
            model = LudwigModel(model_definition)
        else:
            # Only build a model when it is really needed: previously a fresh
            # LudwigModel was created and then dropped without close() here.
            model = LudwigModel.load(lmd['ludwig_data']['ludwig_save_path'])

        train_stats = model.train(
            data_df=training_dataframe,
            model_name=lmd['name'],
            skip_save_model=True)

        accuracy = lmd['model_accuracy']
        for k in train_stats['train']:
            if k not in accuracy['train']:
                accuracy['train'][k] = []
                accuracy['test'][k] = []
            elif k == 'combined':
                # Compare by value: ``k is not 'combined'`` tested object
                # identity and could skip this branch for an equal string.
                accuracy['train'][k].extend(train_stats['train'][k]['accuracy'])
                accuracy['test'][k].extend(train_stats['test'][k]['accuracy'])
            # Other keys: accuracy is only available for 'combined' for now;
            # this only affects multi-output scenarios.

        ludwig_model_savepath = os.path.join(
            CONFIG.MINDSDB_STORAGE_PATH, lmd['name'] + '_ludwig_data')
        model.save(ludwig_model_savepath)
        model.close()

    self.transaction.lmd['ludwig_data'] = {'ludwig_save_path': ludwig_model_savepath}
    self.transaction.hmd['ludwig_data'] = {'model_definition': model_definition}
def main():
    """Train a binary image classifier with Ludwig and print its predictions.

    Reads ``train.csv`` and ``predict.csv`` from the ``testeste`` folder,
    trains a ``stacked_cnn`` image model on the former and prints the
    predictions for the latter.
    """
    folder_name = "testeste"
    train_df = pd.read_csv(folder_name + '/train.csv')
    print(train_df)
    predict_df = pd.read_csv(folder_name + '/predict.csv')

    model_definition = {
        'input_features': [
            {'name': 'image_path', 'type': 'image', 'encoder': 'stacked_cnn'},
        ],
        'output_features': [
            {'name': 'class', 'type': 'binary'},
        ],
    }

    model = LudwigModel(model_definition)
    try:
        model.train(data_df=train_df)
        predictions = model.predict(data_df=predict_df)

        print("=========================PREDICTION 1=========================")
        print(predictions.to_string())
    finally:
        # Close the model even when training or prediction raises.
        model.close()
def train(self):
    """Train a Ludwig model on the transaction's training data and store it.

    The training frame, model definition and time-series columns come from
    ``self._create_ludwig_dataframe('train')``.  Depending on
    ``lmd['rebuild_model']`` a new model is initialized from freshly built
    metadata, or the model in ``self.get_model_dir()`` is loaded; it is then
    trained on the frame.  ``ludwig_save_is_working`` is hard-coded to
    ``False``, so instead of calling ``model.save`` the first entry of the
    local ``results`` directory is moved to the save path.
    """
    training_dataframe, model_definition, timeseries_cols = self._create_ludwig_dataframe(
        'train')

    if len(timeseries_cols) > 0:
        training_dataframe, model_definition = self._translate_df_to_timeseries_format(
            training_dataframe, model_definition, timeseries_cols, 'train')

    lmd = self.transaction.lmd

    with disable_console_output(True):
        # NOTE: online training is not used; per the original author the model
        # cannot be initialized without train_set_metadata, which is only
        # produced by running train. See
        # https://github.com/uber/ludwig/issues/295
        ludwig_save_is_working = False

        if not ludwig_save_is_working:
            # Start from an empty ``results`` directory so that its first
            # entry (moved below) belongs to this run.
            shutil.rmtree('results', ignore_errors=True)

        if lmd['rebuild_model'] is True:
            model = LudwigModel(model_definition)
            merged_model_definition = model.model_definition
            train_set_metadata = build_metadata(
                training_dataframe,
                (merged_model_definition['input_features'] +
                 merged_model_definition['output_features']),
                merged_model_definition['preprocessing'])
            model.initialize_model(train_set_metadata=train_set_metadata,
                                   gpus=self.get_useable_gpus())
        else:
            model = LudwigModel.load(model_dir=self.get_model_dir())

        # Both branches train with exactly the same arguments.
        train_stats = model.train(
            data_df=training_dataframe,
            model_name=lmd['name'],
            skip_save_model=ludwig_save_is_working,
            skip_save_progress=True,
            gpus=self.get_useable_gpus())

        accuracy = lmd['model_accuracy']
        for k in train_stats['train']:
            if k not in accuracy['train']:
                accuracy['train'][k] = []
                accuracy['test'][k] = []
            elif k == 'combined':
                # Compare by value: ``k is not 'combined'`` tested object
                # identity and could skip this branch for an equal string.
                accuracy['train'][k].extend(train_stats['train'][k]['accuracy'])
                accuracy['test'][k].extend(train_stats['test'][k]['accuracy'])
            # Other keys: accuracy is only available for 'combined' for now;
            # this only affects multi-output scenarios.

        ludwig_model_savepath = os.path.join(
            CONFIG.MINDSDB_STORAGE_PATH, lmd['name'] + '_ludwig_data')

        if ludwig_save_is_working:
            model.save(ludwig_model_savepath)
            model.close()
        else:
            # Replace any previous save with the directory found in
            # ``results`` (presumably written by ``model.train`` - confirm).
            shutil.rmtree(ludwig_model_savepath, ignore_errors=True)
            shutil.move(os.path.join('results', os.listdir('results')[0]),
                        ludwig_model_savepath)

    self.transaction.lmd['ludwig_data'] = {
        'ludwig_save_path': ludwig_model_savepath
    }
    self.transaction.hmd['ludwig_data'] = {
        'model_definition': model_definition
    }
#!/usr/bin/env python
# coding: utf-8

# # Simple Model Training Example
#
# This example is the API example for this Ludwig command line example
# (https://uber.github.io/ludwig/examples/#kaggles-titanic-predicting-survivors).

# Import required libraries
import logging
import shutil

from ludwig.api import LudwigModel

# Clean out prior results; a missing directory is not an error.
shutil.rmtree('./results', ignore_errors=True)

# Define Ludwig model object that drives model training
model = LudwigModel(model_definition_file='./model1_definition.yaml',
                    logging_level=logging.INFO)

try:
    # Initiate model training
    train_stats = model.train(data_csv='./data/train.csv',
                              experiment_name='simple_experiment',
                              model_name='simple_model')
finally:
    # Close the model even when training raises.
    model.close()
def test_savedmodel(csv_filename):
    """Check that saving and restoring a model keeps weights and predictions.

    A small sequence-to-category model is trained, saved both as a Ludwig
    model and as a SavedModel, and each copy is loaded back.  The test
    asserts that both copies have the same number of weights, that weight
    values are close to the original ones, and that predictions on the
    training CSV are identical to the original predictions.
    """
    base_dir = os.path.dirname(csv_filename)

    # --- Setup: single sequence input, single category output ---
    seq_feature = sequence_feature()
    seq_feature['encoder'] = 'parallel_cnn'
    input_features = [seq_feature]
    output_features = [category_feature(vocab_size=2)]
    predictions_column_name = '{}_predictions'.format(
        output_features[0]['name'])

    # Generate test data
    data_csv_path = generate_data(input_features, output_features,
                                  csv_filename)

    # --- Train model ---
    model_definition = {
        'input_features': input_features,
        'output_features': output_features,
        'training': {'epochs': 2},
    }
    ludwig_model = LudwigModel(model_definition)
    ludwig_model.train(
        data_csv=data_csv_path,
        skip_save_training_description=True,
        skip_save_training_statistics=True,
        skip_save_model=True,
        skip_save_progress=True,
        skip_save_log=True,
        skip_save_processed_input=True,
    )

    # --- Save Ludwig model ---
    ludwigmodel_path = os.path.join(base_dir, 'ludwigmodel')
    shutil.rmtree(ludwigmodel_path, ignore_errors=True)
    ludwig_model.save(ludwigmodel_path)

    # --- Save SavedModel ---
    savedmodel_path = os.path.join(base_dir, 'savedmodel')
    shutil.rmtree(savedmodel_path, ignore_errors=True)
    ludwig_model.model.save_savedmodel(savedmodel_path)

    # --- Reference predictions and weights from the trained model ---
    original_predictions_df = ludwig_model.predict(data_csv=data_csv_path)
    original_weights = deepcopy(ludwig_model.model.model.trainable_variables)
    ludwig_model.close()

    # --- Load Ludwig model, obtain predictions and weights ---
    ludwig_model = LudwigModel.load(ludwigmodel_path)
    loaded_prediction_df = ludwig_model.predict(data_csv=data_csv_path)
    loaded_weights = deepcopy(ludwig_model.model.model.trainable_variables)

    # --- Restore SavedModel, obtain predictions and weights ---
    train_set_metadata_json_fp = os.path.join(ludwigmodel_path,
                                              TRAIN_SET_METADATA_FILE_NAME)
    dataset, train_set_metadata = preprocess_for_prediction(
        ludwigmodel_path,
        split=FULL,
        data_csv=data_csv_path,
        train_set_metadata=train_set_metadata_json_fp,
        evaluate_performance=False)

    restored_model = tf.saved_model.load(savedmodel_path)

    inner_model = ludwig_model.model.model
    if_name = list(inner_model.input_features.keys())[0]
    of_name = list(inner_model.output_features.keys())[0]

    input_tensor = tf.convert_to_tensor(dataset.dataset[if_name],
                                        dtype=tf.int32)
    logits = restored_model({if_name: input_tensor}, False, None)

    restored_predictions = tf.argmax(logits[of_name]['logits'],
                                     -1,
                                     name='predictions_{}'.format(of_name))
    # Map class indices back to their string labels.
    restored_predictions = tf.map_fn(
        lambda idx: train_set_metadata[of_name]['idx2str'][idx],
        restored_predictions,
        dtype=tf.string)

    restored_weights = deepcopy(restored_model.trainable_variables)

    # --- Cleanup (done before the assertions so a failure leaves no files) ---
    shutil.rmtree(ludwigmodel_path, ignore_errors=True)
    shutil.rmtree(savedmodel_path, ignore_errors=True)

    # --- Check that weights and predictions are the same ---
    # Same number of weights as the original model.
    assert len(original_weights) == len(loaded_weights)
    assert len(original_weights) == len(restored_weights)

    # Weight values match the original model.
    loaded_weights_match = np.all([
        np.allclose(reference.numpy(), candidate.numpy())
        for reference, candidate in zip(original_weights, loaded_weights)
    ])
    restored_weights_match = np.all([
        np.allclose(reference.numpy(), candidate.numpy())
        for reference, candidate in zip(original_weights, restored_weights)
    ])
    assert loaded_weights_match and restored_weights_match

    # Predictions are identical to the original ones.
    original_predictions = original_predictions_df[predictions_column_name]
    loaded_predictions_match = np.all(
        original_predictions == loaded_prediction_df[predictions_column_name])
    restored_predictions_match = np.all(
        original_predictions == restored_predictions.numpy().astype('str'))
    assert loaded_predictions_match and restored_predictions_match
def test_savedmodel(csv_filename):
    """Check that a Ludwig model and its SavedModel export agree.

    A small sequence-to-category model is trained and saved both as a Ludwig
    model and as a SavedModel.  The Ludwig model is loaded back and the
    SavedModel is loaded into a fresh TF1-style session.  The test prints and
    asserts that all trainable weights are identical between the two, that
    the loaded model reproduces the original predictions, and that the
    SavedModel reproduces the loaded model's predictions.
    """
    dir_path = os.path.dirname(csv_filename)

    # --- Setup: single sequence input, single category output ---
    sf = sequence_feature()
    sf['encoder'] = 'parallel_cnn'
    input_features = [sf]
    input_feature_name = input_features[0]['name']
    input_feature_tensor_name = '{}/{}_placeholder:0'.format(
        input_feature_name, input_feature_name)

    output_features = [category_feature(vocab_size=2)]
    output_feature_name = output_features[0]['name']
    output_feature_tensor_name = '{}/predictions_{}/predictions_{}:0'.format(
        output_feature_name, output_feature_name, output_feature_name)
    predictions_column_name = '{}_predictions'.format(output_feature_name)

    # Generate test data
    data_csv_path = generate_data(input_features, output_features,
                                  csv_filename)

    # --- Train model ---
    model_definition = {
        'input_features': input_features,
        'output_features': output_features,
        'training': {'epochs': 2},
    }
    ludwig_model = LudwigModel(model_definition)
    ludwig_model.train(
        data_csv=data_csv_path,
        skip_save_training_description=True,
        skip_save_training_statistics=True,
        skip_save_model=True,
        skip_save_progress=True,
        skip_save_log=True,
        skip_save_processed_input=True,
    )
    original_predictions_df = ludwig_model.predict(data_csv=data_csv_path)

    # --- Save Ludwig model ---
    ludwigmodel_path = os.path.join(dir_path, 'ludwigmodel')
    shutil.rmtree(ludwigmodel_path, ignore_errors=True)
    ludwig_model.save(ludwigmodel_path)

    # --- Save SavedModel ---
    savedmodel_path = os.path.join(dir_path, 'savedmodel')
    shutil.rmtree(savedmodel_path, ignore_errors=True)
    ludwig_model.model.save_savedmodel(savedmodel_path)

    # --- Collect weight tensor names ---
    with ludwig_model.model.session:
        all_variables_names = [
            v.name for v in tf.compat.v1.trainable_variables()
        ]
    ludwig_model.close()

    # --- Load Ludwig model, obtain predictions and weights ---
    ludwig_model = LudwigModel.load(ludwigmodel_path)
    ludwig_prediction_df = ludwig_model.predict(data_csv=data_csv_path)
    ludwig_weights = ludwig_model.model.collect_weights(all_variables_names)
    ludwig_model.close()

    # --- Load SavedModel, obtain predictions and weights ---
    train_set_metadata_json_fp = os.path.join(ludwigmodel_path,
                                              TRAIN_SET_METADATA_FILE_NAME)
    dataset, train_set_metadata = preprocess_for_prediction(
        ludwigmodel_path,
        split=FULL,
        data_csv=data_csv_path,
        train_set_metadata=train_set_metadata_json_fp,
        evaluate_performance=False)

    with tf.compat.v1.Session() as sess:
        tf.saved_model.loader.load(sess, [tf.saved_model.SERVING],
                                   savedmodel_path)

        predictions = sess.run(
            output_feature_tensor_name,
            feed_dict={
                input_feature_tensor_name: dataset.get(input_feature_name),
            })

        # Map class indices back to their string labels.
        idx2str = train_set_metadata[output_feature_name]["idx2str"]
        savedmodel_prediction_df = pd.DataFrame(
            data=[idx2str[p] for p in predictions],
            columns=[predictions_column_name])

        savedmodel_weights = sess.run({n: n for n in all_variables_names})

    # --- Cleanup (done before the assertions so a failure leaves no files) ---
    shutil.rmtree(ludwigmodel_path, ignore_errors=True)
    shutil.rmtree(savedmodel_path, ignore_errors=True)

    # --- Check that weights and predictions are the same ---
    # Each comparison is evaluated once and reused for the report and the
    # assertion.
    weights_identical = [
        (var, np.all(ludwig_weights[var] == savedmodel_weights[var]))
        for var in all_variables_names
    ]
    loaded_matches_original = np.all(
        original_predictions_df[predictions_column_name] ==
        ludwig_prediction_df[predictions_column_name])
    savedmodel_matches_loaded = np.all(
        ludwig_prediction_df[predictions_column_name] ==
        savedmodel_prediction_df[predictions_column_name])

    for var, identical in weights_identical:
        print("Are the weights in {} identical?".format(var), identical)
    print("Are loaded model predictions identical to original ones?",
          loaded_matches_original)
    print("Are savedmodel predictions identical to loaded model?",
          savedmodel_matches_loaded)

    for var, identical in weights_identical:
        assert identical
    assert loaded_matches_original
    assert savedmodel_matches_loaded
def train(self):
    """Train a Ludwig model on the transaction's training data and store it.

    The training frame, model definition, time-series columns, heavy-data
    flag and ``lmd['ludwig_tf_self_col_map']`` all come from
    ``self._create_ludwig_dataframe('train')``.  Depending on
    ``lmd['rebuild_model']`` a new model is initialized from freshly built
    metadata, or the model in ``self._get_model_dir()`` is loaded.  Frames
    longer than the batch size (20 million rows, or 40 rows for heavy data)
    are trained batch by batch; shorter frames are trained in one call.
    ``ludwig_save_is_working`` is hard-coded to ``False``, so instead of
    calling ``model.save`` the first entry of the local ``results``
    directory is moved to the save path.
    """
    (training_dataframe, model_definition, timeseries_cols, has_heavy_data,
     self.transaction.lmd['ludwig_tf_self_col_map']
     ) = self._create_ludwig_dataframe('train')

    if len(timeseries_cols) > 0:
        training_dataframe, model_definition = self._translate_df_to_timeseries_format(
            training_dataframe, model_definition, timeseries_cols, 'train')

    lmd = self.transaction.lmd

    with disable_console_output(True):
        # NOTE: online training is not used; per the original author the model
        # cannot be initialized without train_set_metadata, which is only
        # produced by running train. See
        # https://github.com/uber/ludwig/issues/295
        ludwig_save_is_working = False

        if not ludwig_save_is_working:
            # Start from an empty ``results`` directory so that its first
            # entry (moved below) belongs to this run.
            shutil.rmtree('results', ignore_errors=True)

        if lmd['rebuild_model'] is True:
            model = LudwigModel(model_definition)
            merged_model_definition = model.model_definition
            train_set_metadata = build_metadata(
                training_dataframe,
                (merged_model_definition['input_features'] +
                 merged_model_definition['output_features']),
                merged_model_definition['preprocessing'])
            model.initialize_model(train_set_metadata=train_set_metadata,
                                   gpus=self._get_useable_gpus())
        else:
            model = LudwigModel.load(model_dir=self._get_model_dir())

        # Maximum number of rows handed to a single ``model.train`` call.
        split_by = 40 if has_heavy_data else int(20 * pow(10, 6))
        df_len = len(training_dataframe)

        if df_len > split_by:
            # ``range`` always advances, so the loop terminates and every
            # batch holds at least one row; the former ``while`` loop had a
            # ``continue`` that never moved its index forward.
            for start in range(0, df_len, split_by):
                end = start + split_by
                self.transaction.log.info(
                    f'Training with batch from index {start} to index {end}')
                # NOTE(review): reset_index() keeps the old index as an extra
                # column; confirm whether drop=True was intended.
                training_sample = training_dataframe.iloc[start:end].reset_index()
                train_stats = model.train(
                    data_df=training_sample,
                    model_name=lmd['name'],
                    skip_save_model=ludwig_save_is_working,
                    skip_save_progress=True,
                    gpus=self._get_useable_gpus())
        else:
            train_stats = model.train(
                data_df=training_dataframe,
                model_name=lmd['name'],
                skip_save_model=ludwig_save_is_working,
                skip_save_progress=True,
                gpus=self._get_useable_gpus())

        # Only the statistics of the last train call are recorded.
        accuracy = lmd['model_accuracy']
        for k in train_stats['train']:
            if k not in accuracy['train']:
                accuracy['train'][k] = []
                accuracy['test'][k] = []
            elif k == 'combined':
                accuracy['train'][k].extend(train_stats['train'][k]['accuracy'])
                accuracy['test'][k].extend(train_stats['test'][k]['accuracy'])
            # Other keys: accuracy is only available for 'combined' for now;
            # this only affects multi-output scenarios.

        ludwig_model_savepath = os.path.join(CONFIG.MINDSDB_STORAGE_PATH,
                                             lmd['name'], 'ludwig_data')

        # Make sure the per-model parent directory exists before saving/moving.
        Path(CONFIG.MINDSDB_STORAGE_PATH).joinpath(lmd['name']).mkdir(
            mode=0o777, exist_ok=True, parents=True)

        if ludwig_save_is_working:
            model.save(ludwig_model_savepath)
            model.close()
        else:
            # Replace any previous save with the directory found in
            # ``results`` (presumably written by ``model.train`` - confirm).
            shutil.rmtree(ludwig_model_savepath, ignore_errors=True)
            shutil.move(os.path.join('results', os.listdir('results')[0]),
                        ludwig_model_savepath)

    self.transaction.lmd['ludwig_data'] = {
        'ludwig_save_path': ludwig_model_savepath
    }
    self.transaction.hmd['ludwig_data'] = {
        'model_definition': model_definition
    }