def _preprocess_df_for_training(
        features,
        data_df=None,
        data_train_df=None,
        data_validation_df=None,
        data_test_df=None,
        preprocessing_params=default_preprocessing_parameters,
        random_seed=default_random_seed
):
    """
    Method to pre-process dataframes. This doesn't have the option to save the
    processed data as hdf5, as we don't expect users to need that: the data
    can be processed in memory.
    """
    if data_df is not None:
        # needs preprocessing
        logging.info('Using full dataframe')
        logging.info('Building dataset (it may take a while)')

    elif data_train_df is not None:
        # needs preprocessing
        logging.info('Using training dataframe')
        logging.info('Building dataset (it may take a while)')
        data_df = concatenate_df(
            data_train_df,
            data_validation_df,
            data_test_df
        )

    data, train_set_metadata = build_dataset_df(
        data_df,
        features,
        preprocessing_params,
        random_seed=random_seed
    )
    training_set, test_set, validation_set = split_dataset_tvt(
        data,
        data['split']
    )
    return training_set, test_set, validation_set, train_set_metadata
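# Illustrative usage sketch (not part of the original source): calling
# _preprocess_df_for_training on an in-memory DataFrame. `example_df` and
# `example_features` are hypothetical names; the features list would normally
# come from the model definition.
#
#     example_df = read_csv('example_data.csv')
#     example_features = (model_definition['input_features'] +
#                         model_definition['output_features'])
#     train_set, test_set, val_set, metadata = _preprocess_df_for_training(
#         example_features,
#         data_df=example_df
#     )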
def load_data(
        hdf5_file_path,
        input_features,
        output_features,
        split_data=True,
        shuffle_training=False
):
    logger.info('Loading data from: {0}'.format(hdf5_file_path))
    # Load data from file
    hdf5_data = h5py.File(hdf5_file_path, 'r')
    dataset = {}
    for input_feature in input_features:
        if input_feature['type'] == TEXT:
            text_data_field = text_feature_data_field(input_feature)
            dataset[text_data_field] = hdf5_data[text_data_field].value
        else:
            dataset[input_feature['name']] = hdf5_data[
                input_feature['name']
            ].value
    for output_feature in output_features:
        if output_feature['type'] == TEXT:
            dataset[text_feature_data_field(output_feature)] = hdf5_data[
                text_feature_data_field(output_feature)
            ].value
        else:
            dataset[output_feature['name']] = hdf5_data[
                output_feature['name']
            ].value
        if 'limit' in output_feature:
            dataset[output_feature['name']] = collapse_rare_labels(
                dataset[output_feature['name']],
                output_feature['limit']
            )

    if not split_data:
        hdf5_data.close()
        return dataset

    split = hdf5_data['split'].value
    hdf5_data.close()
    training_set, test_set, validation_set = split_dataset_tvt(dataset, split)

    # shuffle up
    if shuffle_training:
        training_set = data_utils.shuffle_dict_unison_inplace(training_set)

    return training_set, test_set, validation_set
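# Illustrative sketch (assumption, not part of the module): loading a
# previously saved hdf5 dataset and splitting it into the three sets.
# 'example.hdf5' and `model_definition` are hypothetical placeholders.
#
#     training_set, test_set, validation_set = load_data(
#         'example.hdf5',
#         model_definition['input_features'],
#         model_definition['output_features'],
#         split_data=True,
#         shuffle_training=True
#     )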
def obtain_df_splits(data_csv):
    """Split input data csv file into train, validation and test dataframes.

    :param data_csv: Input data CSV file.
    :return test_df, train_df, val_df: Train, validation and test dataframe
            splits
    """
    data_df = read_csv(data_csv)
    # Obtain data split array mapping data rows to split type
    # 0-train, 1-validation, 2-test
    data_split = get_split(data_df)

    train_split, test_split, val_split = split_dataset_tvt(data_df, data_split)
    # Splits are python dictionaries, not dataframes, so they need to be
    # converted.
    test_df = pd.DataFrame(test_split)
    train_df = pd.DataFrame(train_split)
    val_df = pd.DataFrame(val_split)
    return test_df, train_df, val_df
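# Illustrative sketch (hypothetical file name): splitting a csv into the three
# dataframes. Note the return order is test, train, validation.
#
#     test_df, train_df, val_df = obtain_df_splits('example_data.csv')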
def _preprocess_csv_for_training(
        features,
        data_csv=None,
        data_train_csv=None,
        data_validation_csv=None,
        data_test_csv=None,
        train_set_metadata_json=None,
        skip_save_processed_input=False,
        preprocessing_params=default_preprocessing_parameters,
        random_seed=default_random_seed
):
    """
    Method to pre-process csv data

    :param features: list of all features (input + output)
    :param data_csv: path to the csv data
    :param data_train_csv: training csv data
    :param data_validation_csv: validation csv data
    :param data_test_csv: test csv data
    :param train_set_metadata_json: train set metadata json
    :param skip_save_processed_input: if False, the pre-processed data is
           saved as .hdf5 files in the same location as the csvs with the
           same names.
    :param preprocessing_params: preprocessing parameters
    :param random_seed: random seed
    :return: training, test, validation datasets, training metadata
    """
    train_set_metadata = None
    if train_set_metadata_json is not None:
        train_set_metadata = load_metadata(train_set_metadata_json)

    if data_csv is not None:
        # Use data and ignore _train, _validation and _test.
        # Also ignore train set metadata; the data needs preprocessing.
        logger.info(
            'Using full raw csv, no hdf5 and json file '
            'with the same name have been found'
        )
        logger.info('Building dataset (it may take a while)')

        data, train_set_metadata = build_dataset(
            data_csv,
            features,
            preprocessing_params,
            train_set_metadata=train_set_metadata,
            random_seed=random_seed
        )
        if not skip_save_processed_input:
            logger.info('Writing dataset')
            data_hdf5_fp = replace_file_extension(data_csv, 'hdf5')
            data_utils.save_hdf5(data_hdf5_fp, data, train_set_metadata)

            logger.info('Writing train set metadata with vocabulary')
            train_set_metadata_json_fp = replace_file_extension(
                data_csv,
                'json'
            )
            data_utils.save_json(
                train_set_metadata_json_fp,
                train_set_metadata
            )

        training_set, test_set, validation_set = split_dataset_tvt(
            data,
            data['split']
        )

    elif data_train_csv is not None:
        # Use data_train (including _validation and _test if they are present)
        # and ignore data and train set metadata; the data needs preprocessing.
        logger.info(
            'Using training raw csv, no hdf5 and json '
            'file with the same name have been found'
        )
        logger.info('Building dataset (it may take a while)')

        concatenated_df = concatenate_csv(
            data_train_csv,
            data_validation_csv,
            data_test_csv
        )
        concatenated_df.csv = data_train_csv

        data, train_set_metadata = build_dataset_df(
            concatenated_df,
            features,
            preprocessing_params,
            train_set_metadata=train_set_metadata,
            random_seed=random_seed
        )
        training_set, test_set, validation_set = split_dataset_tvt(
            data,
            data['split']
        )
        if not skip_save_processed_input:
            logger.info('Writing dataset')
            data_train_hdf5_fp = replace_file_extension(data_train_csv, 'hdf5')
            data_utils.save_hdf5(
                data_train_hdf5_fp,
                training_set,
                train_set_metadata
            )
            if validation_set is not None:
                data_validation_hdf5_fp = replace_file_extension(
                    data_validation_csv,
                    'hdf5'
                )
                data_utils.save_hdf5(
                    data_validation_hdf5_fp,
                    validation_set,
                    train_set_metadata
                )
            if test_set is not None:
                data_test_hdf5_fp = replace_file_extension(
                    data_test_csv,
                    'hdf5'
                )
                data_utils.save_hdf5(
                    data_test_hdf5_fp,
                    test_set,
                    train_set_metadata
                )
            logger.info('Writing train set metadata with vocabulary')
            train_set_metadata_json_fp = replace_file_extension(
                data_train_csv,
                'json'
            )
            data_utils.save_json(train_set_metadata_json_fp, train_set_metadata)

    return training_set, test_set, validation_set, train_set_metadata
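# Illustrative sketch (hypothetical paths): preprocessing separate train /
# validation / test csvs and writing the resulting hdf5 and json files next
# to them.
#
#     train_set, test_set, val_set, metadata = _preprocess_csv_for_training(
#         features,
#         data_train_csv='train.csv',
#         data_validation_csv='validation.csv',
#         data_test_csv='test.csv',
#         skip_save_processed_input=False
#     )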
def preprocess_for_prediction(
        model_path,
        split,
        data_csv=None,
        data_hdf5=None,
        train_set_metadata=None,
        evaluate_performance=True
):
    """Preprocesses the dataset to parse it into a format that is usable by the
    Ludwig core

    :param model_path: Path of the saved model directory; it is joined with
           the model hyperparameters file name to load the model definition
    :type model_path: Str
    :param split: Which split of the data to use (training, validation,
           test or full)
    :param data_csv: The CSV input data file
    :param data_hdf5: The hdf5 data file if there is no csv data file
    :param train_set_metadata: Train set metadata for the input features
    :param evaluate_performance: If False does not load output features
    :returns: Dataset, Train set metadata
    """
    model_definition = load_json(
        os.path.join(model_path, MODEL_HYPERPARAMETERS_FILE_NAME)
    )
    for input_feature in model_definition['input_features']:
        if 'preprocessing' in input_feature:
            if 'in_memory' in input_feature['preprocessing']:
                if not input_feature['preprocessing']['in_memory']:
                    logger.warning(
                        'WARNING: When running predict in_memory flag should '
                        'be true. Overriding and setting it to true for '
                        'feature <{}>'.format(input_feature['name'])
                    )
                    input_feature['preprocessing']['in_memory'] = True
    preprocessing_params = merge_dict(
        default_preprocessing_parameters,
        model_definition['preprocessing']
    )
    output_features = model_definition[
        'output_features'] if evaluate_performance else []
    features = model_definition['input_features'] + output_features

    # Check if hdf5 file already exists
    if data_csv is not None:
        data_hdf5_fp = replace_file_extension(data_csv, 'hdf5')
        if os.path.isfile(data_hdf5_fp):
            logger.info('Found hdf5 with the same filename of the csv, '
                        'using it instead')
            data_csv = None
            data_hdf5 = data_hdf5_fp
    else:
        data_hdf5_fp = None

    # Load data
    train_set_metadata = load_metadata(train_set_metadata)
    if split == FULL:
        if data_hdf5 is not None:
            dataset = load_data(
                data_hdf5,
                model_definition['input_features'],
                output_features,
                split_data=False,
                shuffle_training=False
            )
        else:
            dataset, train_set_metadata = build_dataset(
                data_csv,
                features,
                preprocessing_params,
                train_set_metadata=train_set_metadata
            )
    else:
        if data_hdf5 is not None:
            training_set, test_set, validation_set = load_data(
                data_hdf5,
                model_definition['input_features'],
                output_features,
                shuffle_training=False
            )
            if split == TRAINING:
                dataset = training_set
            elif split == VALIDATION:
                dataset = validation_set
            else:  # if split == TEST
                dataset = test_set
        else:
            dataset, train_set_metadata = build_dataset(
                data_csv,
                features,
                preprocessing_params,
                train_set_metadata=train_set_metadata
            )
            # build_dataset adds a split column if there is none in the csv,
            # so if we want to check whether the csv contained a split column
            # we have to check the csv, not the built dataset.
            # The logic is that if there is no split in the original csv
            # we treat the split parameter as if it was == full
            if csv_contains_column(data_csv, SPLIT):
                training_set, test_set, validation_set = split_dataset_tvt(
                    dataset,
                    dataset[SPLIT]
                )
                if split == TRAINING:
                    dataset = training_set
                elif split == VALIDATION:
                    dataset = validation_set
                else:  # if split == TEST
                    dataset = test_set
            else:
                logger.warning(
                    'You requested the {} split, but the data CSV '
                    'does not contain a "split" column, so the '
                    'full data will be used instead'.format(split)
                )

    replace_text_feature_level(
        features,
        [dataset]
    )

    dataset = Dataset(
        dataset,
        model_definition['input_features'],
        output_features,
        train_set_metadata.get(DATA_TRAIN_HDF5_FP)
    )

    return dataset, train_set_metadata
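# Illustrative sketch (hypothetical paths): preparing the test split of a new
# csv for prediction with a previously trained model. The model directory and
# metadata json paths are placeholders.
#
#     dataset, train_set_metadata = preprocess_for_prediction(
#         model_path='results/experiment_run/model',
#         split=TEST,
#         data_csv='new_data.csv',
#         train_set_metadata='train_set_metadata.json',
#         evaluate_performance=False
#     )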
def preprocess_for_training(
        model_definition,
        dataset_type='generic',
        data_df=None,
        data_train_df=None,
        data_validation_df=None,
        data_test_df=None,
        data_csv=None,
        data_train_csv=None,
        data_validation_csv=None,
        data_test_csv=None,
        data_hdf5=None,
        data_train_hdf5=None,
        data_validation_hdf5=None,
        data_test_hdf5=None,
        train_set_metadata_json=None,
        skip_save_processed_input=False,
        preprocessing_params=default_preprocessing_parameters,
        random_seed=default_random_seed
):
    # Check if hdf5 and json already exist
    data_hdf5_fp = None
    data_train_hdf5_fp = None
    data_validation_hdf5_fp = None
    data_test_hdf5_fp = None
    train_set_metadata_json_fp = 'metadata.json'
    if data_csv is not None:
        data_hdf5_fp = os.path.splitext(data_csv)[0] + '.hdf5'
        train_set_metadata_json_fp = os.path.splitext(data_csv)[0] + '.json'
        if (os.path.isfile(data_hdf5_fp) and
                os.path.isfile(train_set_metadata_json_fp)):
            logging.info('Found hdf5 and json with the same filename '
                         'of the csv, using them instead')
            data_csv = None
            data_hdf5 = data_hdf5_fp
            train_set_metadata_json = train_set_metadata_json_fp

    if data_train_csv is not None:
        data_train_hdf5_fp = os.path.splitext(data_train_csv)[0] + '.hdf5'
        train_set_metadata_json_fp = os.path.splitext(
            data_train_csv
        )[0] + '.json'
        if (os.path.isfile(data_train_hdf5_fp) and
                os.path.isfile(train_set_metadata_json_fp)):
            logging.info('Found hdf5 and json with the same filename of '
                         'the train csv, using them instead')
            data_train_csv = None
            data_train_hdf5 = data_train_hdf5_fp
            train_set_metadata_json = train_set_metadata_json_fp

    if data_validation_csv is not None:
        data_validation_hdf5_fp = os.path.splitext(
            data_validation_csv
        )[0] + '.hdf5'
        if os.path.isfile(data_validation_hdf5_fp):
            logging.info('Found hdf5 with the same filename of '
                         'the validation csv, using it instead')
            data_validation_csv = None
            data_validation_hdf5 = data_validation_hdf5_fp

    if data_test_csv is not None:
        data_test_hdf5_fp = os.path.splitext(data_test_csv)[0] + '.hdf5'
        if os.path.isfile(data_test_hdf5_fp):
            logging.info('Found hdf5 with the same filename of '
                         'the test csv, using it instead')
            data_test_csv = None
            data_test_hdf5 = data_test_hdf5_fp

    model_definition['data_hdf5_fp'] = data_hdf5_fp

    # Decide if to preprocess or just load
    features = (model_definition['input_features'] +
                model_definition['output_features'])
    (concatenate_csv,
     concatenate_df,
     build_dataset,
     build_dataset_df) = get_dataset_fun(dataset_type)

    if data_df is not None:
        # needs preprocessing
        logging.info('Using full dataframe')
        logging.info('Building dataset (it may take a while)')
        data, train_set_metadata = build_dataset_df(
            data_df,
            features,
            preprocessing_params,
            random_seed=random_seed
        )
        if not skip_save_processed_input:
            logging.info('Writing dataset')
            data_utils.save_hdf5(data_hdf5_fp, data, train_set_metadata)
            logging.info('Writing train set metadata with vocabulary')
            data_utils.save_json(train_set_metadata_json_fp,
                                 train_set_metadata)
        training_set, test_set, validation_set = split_dataset_tvt(
            data,
            data['split']
        )

    elif data_train_df is not None:
        # needs preprocessing
        logging.info('Using training dataframe')
        logging.info('Building dataset (it may take a while)')
        concatenated_df = concatenate_df(
            data_train_df,
            data_validation_df,
            data_test_df
        )
        data, train_set_metadata = build_dataset_df(
            concatenated_df,
            features,
            preprocessing_params,
            random_seed=random_seed
        )
        training_set, test_set, validation_set = split_dataset_tvt(
            data,
            data['split']
        )
        if not skip_save_processed_input:
            logging.info('Writing dataset')
            data_utils.save_hdf5(data_train_hdf5_fp,
                                 training_set,
                                 train_set_metadata)
            if validation_set is not None:
                data_utils.save_hdf5(data_validation_hdf5_fp,
                                     validation_set,
                                     train_set_metadata)
            if test_set is not None:
                data_utils.save_hdf5(data_test_hdf5_fp,
                                     test_set,
                                     train_set_metadata)
            logging.info('Writing train set metadata with vocabulary')
            data_utils.save_json(train_set_metadata_json_fp,
                                 train_set_metadata)

    elif data_csv is not None:
        # Use data and ignore _train, _validation and _test.
        # Also ignore train set metadata; the data needs preprocessing.
        logging.info('Using full raw csv, no hdf5 and json file '
                     'with the same name have been found')
        logging.info('Building dataset (it may take a while)')
        data, train_set_metadata = build_dataset(
            data_csv,
            features,
            preprocessing_params,
            random_seed=random_seed
        )
        if not skip_save_processed_input:
            logging.info('Writing dataset')
            data_utils.save_hdf5(data_hdf5_fp, data, train_set_metadata)
            logging.info('Writing train set metadata with vocabulary')
            data_utils.save_json(train_set_metadata_json_fp,
                                 train_set_metadata)
        training_set, test_set, validation_set = split_dataset_tvt(
            data,
            data['split']
        )

    elif data_train_csv is not None:
        # Use data_train (including _validation and _test if they are present)
        # and ignore data and train set metadata; the data needs preprocessing.
        logging.info('Using training raw csv, no hdf5 and json '
                     'file with the same name have been found')
        logging.info('Building dataset (it may take a while)')
        concatenated_df = concatenate_csv(
            data_train_csv,
            data_validation_csv,
            data_test_csv
        )
        concatenated_df.csv = data_train_csv
        data, train_set_metadata = build_dataset_df(
            concatenated_df,
            features,
            preprocessing_params,
            random_seed=random_seed
        )
        training_set, test_set, validation_set = split_dataset_tvt(
            data,
            data['split']
        )
        if not skip_save_processed_input:
            logging.info('Writing dataset')
            data_utils.save_hdf5(data_train_hdf5_fp,
                                 training_set,
                                 train_set_metadata)
            if validation_set is not None:
                data_utils.save_hdf5(data_validation_hdf5_fp,
                                     validation_set,
                                     train_set_metadata)
            if test_set is not None:
                data_utils.save_hdf5(data_test_hdf5_fp,
                                     test_set,
                                     train_set_metadata)
            logging.info('Writing train set metadata with vocabulary')
            data_utils.save_json(train_set_metadata_json_fp,
                                 train_set_metadata)

    elif data_hdf5 is not None and train_set_metadata_json is not None:
        # use data and train set metadata
        # doesn't need preprocessing, just load
        logging.info('Using full hdf5 and json')
        training_set, test_set, validation_set = load_data(
            data_hdf5,
            model_definition['input_features'],
            model_definition['output_features'],
            shuffle_training=True
        )
        train_set_metadata = load_metadata(train_set_metadata_json)

    elif data_train_hdf5 is not None and train_set_metadata_json is not None:
        # use data and train set metadata
        # doesn't need preprocessing, just load
        logging.info('Using hdf5 and json')
        training_set = load_data(
            data_train_hdf5,
            model_definition['input_features'],
            model_definition['output_features'],
            split_data=False
        )
        train_set_metadata = load_metadata(train_set_metadata_json)
        if data_validation_hdf5 is not None:
            validation_set = load_data(
                data_validation_hdf5,
                model_definition['input_features'],
                model_definition['output_features'],
                split_data=False
            )
        else:
            validation_set = None
        if data_test_hdf5 is not None:
            test_set = load_data(
                data_test_hdf5,
                model_definition['input_features'],
                model_definition['output_features'],
                split_data=False
            )
        else:
            test_set = None

    else:
        raise RuntimeError('Insufficient input parameters')

    replace_text_feature_level(
        model_definition,
        [training_set, validation_set, test_set]
    )

    training_dataset = Dataset(
        training_set,
        model_definition['input_features'],
        model_definition['output_features'],
        data_hdf5_fp
    )

    validation_dataset = None
    if validation_set is not None:
        validation_dataset = Dataset(
            validation_set,
            model_definition['input_features'],
            model_definition['output_features'],
            data_hdf5_fp
        )

    test_dataset = None
    if test_set is not None:
        test_dataset = Dataset(
            test_set,
            model_definition['input_features'],
            model_definition['output_features'],
            data_hdf5_fp
        )

    return (training_dataset,
            validation_dataset,
            test_dataset,
            train_set_metadata)
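# Illustrative sketch (hypothetical path): the typical training entry point,
# starting from a single csv; existing hdf5/json files with matching names
# are detected and reused automatically.
#
#     (training_dataset,
#      validation_dataset,
#      test_dataset,
#      train_set_metadata) = preprocess_for_training(
#         model_definition,
#         data_csv='example_data.csv',
#         random_seed=42
#     )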
def test_model_save_reload_API(csv_filename, tmp_path):
    tf.random.set_seed(1234)

    dir_path = os.path.dirname(csv_filename)
    image_dest_folder = os.path.join(os.getcwd(), 'generated_images')
    audio_dest_folder = os.path.join(os.getcwd(), 'generated_audio')

    input_features = [
        binary_feature(),
        numerical_feature(),
        category_feature(vocab_size=3),
        sequence_feature(vocab_size=3),
        text_feature(vocab_size=3, encoder='rnn', cell_type='lstm',
                     num_layers=2, bidirectional=True),
        vector_feature(),
        image_feature(image_dest_folder),
        audio_feature(audio_dest_folder, encoder='stacked_cnn'),
        timeseries_feature(encoder='parallel_cnn'),
        sequence_feature(vocab_size=3, encoder='stacked_parallel_cnn'),
        date_feature(),
        h3_feature(),
        set_feature(vocab_size=3),
        bag_feature(vocab_size=3),
    ]

    output_features = [
        binary_feature(),
        numerical_feature(),
        category_feature(vocab_size=3),
        sequence_feature(vocab_size=3),
        text_feature(vocab_size=3),
        set_feature(vocab_size=3),
        vector_feature(),
    ]

    # Generate test data
    data_csv_path = generate_data(input_features, output_features,
                                  csv_filename)

    #############
    # Train model
    #############
    model_definition = {
        'input_features': input_features,
        'output_features': output_features,
        'training': {'epochs': 2}
    }

    data_df = read_csv(data_csv_path)
    training_set, test_set, validation_set = split_dataset_tvt(
        data_df,
        get_split(data_df)
    )
    training_set = pd.DataFrame(training_set)
    validation_set = pd.DataFrame(validation_set)
    test_set = pd.DataFrame(test_set)

    # create sub-directory to store results
    results_dir = tmp_path / 'results'
    results_dir.mkdir()

    # perform initial model training
    ludwig_model1 = LudwigModel(model_definition)
    train_stats = ludwig_model1.train(
        data_train_df=training_set,
        data_validation_df=validation_set,
        data_test_df=test_set,
        output_directory='results'  # results_dir
    )

    preds_1 = ludwig_model1.predict(data_df=validation_set)

    def check_model_equal(ludwig_model2):
        # Compare model predictions
        preds_2 = ludwig_model2.predict(data_df=validation_set)
        assert set(preds_1.keys()) == set(preds_2.keys())
        for key in preds_1:
            assert preds_1[key].dtype == preds_2[key].dtype, key
            assert list(preds_1[key]) == list(preds_2[key]), key
            # assert preds_2[key].dtype == preds_3[key].dtype, key
            # assert list(preds_2[key]) == list(preds_3[key]), key

        # Compare model weights
        # this has to be done after predicts because of TF2 lazy restoration
        for if_name in ludwig_model1.model.model.input_features:
            if1 = ludwig_model1.model.model.input_features[if_name]
            if2 = ludwig_model2.model.model.input_features[if_name]
            for if1_w, if2_w in zip(if1.encoder_obj.weights,
                                    if2.encoder_obj.weights):
                assert np.allclose(if1_w.numpy(), if2_w.numpy())

        c1 = ludwig_model1.model.model.combiner
        c2 = ludwig_model2.model.model.combiner
        for c1_w, c2_w in zip(c1.weights, c2.weights):
            assert np.allclose(c1_w.numpy(), c2_w.numpy())

        for of_name in ludwig_model1.model.model.output_features:
            of1 = ludwig_model1.model.model.output_features[of_name]
            of2 = ludwig_model2.model.model.output_features[of_name]
            for of1_w, of2_w in zip(of1.decoder_obj.weights,
                                    of2.decoder_obj.weights):
                assert np.allclose(of1_w.numpy(), of2_w.numpy())

    # Test saving and loading the model explicitly
    with tempfile.TemporaryDirectory() as tmpdir:
        ludwig_model1.save(tmpdir)
        ludwig_model_loaded = LudwigModel.load(tmpdir)
        check_model_equal(ludwig_model_loaded)

    # Test loading the model from the experiment directory
    ludwig_model_exp = LudwigModel.load(
        os.path.join(ludwig_model1.exp_dir_name, 'model')
    )
    check_model_equal(ludwig_model_exp)