def _preprocess_df_for_training(
        features,
        data_df=None,
        data_train_df=None,
        data_validation_df=None,
        data_test_df=None,
        preprocessing_params=default_preprocessing_parameters,
        random_seed=default_random_seed
):
    """Pre-process in-memory dataframes and split them for training.

    This doesn't have the option to save the processed data as hdf5, as we
    don't expect users to do this: the data can be processed in memory.

    :param features: features handed to ``build_dataset_df``
    :param data_df: full dataframe; takes precedence over ``data_train_df``
    :param data_train_df: training dataframe, used when ``data_df`` is None
    :param data_validation_df: optional validation dataframe, concatenated
           with ``data_train_df``
    :param data_test_df: optional test dataframe, concatenated with
           ``data_train_df``
    :param preprocessing_params: parameters forwarded to ``build_dataset_df``
    :param random_seed: seed forwarded to ``build_dataset_df``
    :return: ``(training_set, test_set, validation_set, train_set_metadata)``
             -- note that the test set comes before the validation set
    :raises ValueError: if neither ``data_df`` nor ``data_train_df`` is given
    """
    if data_df is not None:
        # needs preprocessing
        logging.info('Using full dataframe')
        logging.info('Building dataset (it may take a while)')

    elif data_train_df is not None:
        # needs preprocessing
        logging.info('Using training dataframe')
        logging.info('Building dataset (it may take a while)')
        data_df = concatenate_df(
            data_train_df,
            data_validation_df,
            data_test_df
        )

    else:
        # Fail early with a clear message instead of handing None to
        # build_dataset_df and surfacing an unrelated error from deep inside.
        raise ValueError(
            'Insufficient input parameters: either data_df or '
            'data_train_df must be provided'
        )

    data, train_set_metadata = build_dataset_df(
        data_df,
        features,
        preprocessing_params,
        random_seed=random_seed
    )
    training_set, test_set, validation_set = split_dataset_tvt(
        data,
        data['split']
    )
    return training_set, test_set, validation_set, train_set_metadata
def test_number_feature_wrong_dtype(csv_filename, tmpdir):
    """Check preprocessing of a number feature whose values are all non-numeric strings.

    Every entry of the generated number column is overwritten with a random
    lowercase string before preprocessing. The processed column is then
    expected to hold only 0.0 (the replacement for invalid values) and no
    row may be lost.
    """
    target_csv = os.path.join(tmpdir, csv_filename)

    num_feat = number_feature()
    inputs = [num_feat]
    outputs = [binary_feature()]
    config = {"input_features": inputs, "output_features": outputs}

    raw_df = pd.read_csv(generate_data(inputs, outputs, target_csv))

    # Overwrite each number with a random 10-letter string.
    def _random_word(_):
        alphabet = string.ascii_lowercase
        return "".join(random.choice(alphabet) for _ in range(10))

    raw_df[num_feat[COLUMN]] = raw_df[num_feat[COLUMN]].apply(_random_word)

    # Run preprocessing on the corrupted dataframe.
    backend = LocalTestBackend()
    model = LudwigModel(config, backend=backend)
    train_ds, val_ds, test_ds, _ = model.preprocess(dataset=raw_df)

    split_frames = [split.to_df() for split in (train_ds, val_ds, test_ds)]
    processed_df = concatenate_df(*split_frames, backend)

    # All rows must survive and every invalid value must have been replaced.
    assert len(processed_df) == len(raw_df)
    assert np.all(processed_df[num_feat[PROC_COLUMN]] == 0.0)
def test_image_resizing_num_channel_handling(csv_filename):
    """
    This test creates two image datasets with 3 channels and 1 channel. The
    combination of this data is used to train a model. This checks the cases
    where the user may or may not specify a number of channels in the
    config

    The generated image folder is removed in a ``finally`` clause so that a
    failing run does not leave images behind in the working directory.

    :param csv_filename: csv destination handed to ``generate_data``
    :return: None
    """
    # Image Inputs
    image_dest_folder = os.path.join(os.getcwd(), 'generated_images')

    # Resnet encoder
    input_features = [
        image_feature(
            folder=image_dest_folder,
            encoder='resnet',
            preprocessing={
                'in_memory': True,
                'height': 8,
                'width': 8,
                'num_channels': 3,
                'num_processes': 5
            },
            fc_size=8,
            num_filters=8
        ),
        text_feature(encoder='embed', min_len=1),
        numerical_feature(normalization='minmax')
    ]
    output_features = [binary_feature(), numerical_feature()]

    try:
        rel_path = generate_data(
            input_features, output_features, csv_filename, num_examples=50
        )
        df1 = read_csv(rel_path)

        # Regenerate the data with single-channel images
        input_features[0]['preprocessing']['num_channels'] = 1
        rel_path = generate_data(
            input_features, output_features, csv_filename, num_examples=50
        )
        df2 = read_csv(rel_path)

        # Mix both datasets and overwrite the csv with the combination
        df = concatenate_df(df1, df2, None, LOCAL_BACKEND)
        df.to_csv(rel_path, index=False)

        # Here the user specifies number of channels.
        # Exception shouldn't be thrown
        run_experiment(input_features, output_features, dataset=rel_path)

        del input_features[0]['preprocessing']['num_channels']

        # User now doesn't specify num channels. Should throw exception
        with pytest.raises(ValueError):
            run_experiment(input_features, output_features, dataset=rel_path)
    finally:
        # Delete the temporary data created, even when the test fails.
        # ignore_errors avoids masking the real failure if the folder was
        # never created.
        shutil.rmtree(image_dest_folder, ignore_errors=True)
def test_image_resizing_num_channel_handling(csv_filename):
    """This test creates two image datasets with 3 channels and 1 channel. The combination of this data is used to
    train a model. This checks the cases where the user may or may not specify a number of channels in the config.

    The generated image folder is removed in a ``finally`` clause so that a failing run does not leave images behind
    in the working directory.

    :param csv_filename: csv destination handed to ``generate_data``
    :return: None
    """
    # Image Inputs
    image_dest_folder = os.path.join(os.getcwd(), "generated_images")

    # Resnet encoder
    input_features = [
        image_feature(
            folder=image_dest_folder,
            encoder="resnet",
            preprocessing={
                "in_memory": True,
                "height": 8,
                "width": 8,
                "num_channels": 3,
                "num_processes": 5
            },
            fc_size=8,
            num_filters=8,
        ),
        text_feature(encoder="embed", min_len=1),
        numerical_feature(normalization="minmax"),
    ]
    output_features = [binary_feature(), numerical_feature()]

    try:
        rel_path = generate_data(input_features, output_features, csv_filename, num_examples=50)
        df1 = read_csv(rel_path)

        # Regenerate the data with single-channel images
        input_features[0]["preprocessing"]["num_channels"] = 1
        rel_path = generate_data(input_features, output_features, csv_filename, num_examples=50)
        df2 = read_csv(rel_path)

        # Mix both datasets and overwrite the csv with the combination
        df = concatenate_df(df1, df2, None, LOCAL_BACKEND)
        df.to_csv(rel_path, index=False)

        # Here the user specifies number of channels. Exception shouldn't be thrown
        run_experiment(input_features, output_features, dataset=rel_path)

        del input_features[0]["preprocessing"]["num_channels"]

        # User doesn't specify num channels, but num channels is inferred. Exception shouldn't be thrown
        run_experiment(input_features, output_features, dataset=rel_path)
    finally:
        # Delete the temporary data created, even when the test fails. ignore_errors avoids masking the real
        # failure if the folder was never created.
        shutil.rmtree(image_dest_folder, ignore_errors=True)
def _save_hdf5_if_path(hdf5_fp, data, train_set_metadata):
    # Write one processed dataset to hdf5, skipping the write when no destination path is known.
    if hdf5_fp is None:
        logging.warning(
            'No hdf5 destination path available, skipping dataset writing'
        )
        return
    data_utils.save_hdf5(hdf5_fp, data, train_set_metadata)


def _save_full_dataset(data, data_hdf5_fp, train_set_metadata,
                       train_set_metadata_json_fp):
    # Persist the unsplit processed dataset and its metadata json.
    logging.info('Writing dataset')
    _save_hdf5_if_path(data_hdf5_fp, data, train_set_metadata)
    logging.info('Writing train set metadata with vocabulary')
    data_utils.save_json(train_set_metadata_json_fp, train_set_metadata)


def _save_split_datasets(training_set, validation_set, test_set,
                         data_train_hdf5_fp, data_validation_hdf5_fp,
                         data_test_hdf5_fp, train_set_metadata,
                         train_set_metadata_json_fp):
    # Persist each available split to its own hdf5 file plus the shared metadata json.
    logging.info('Writing dataset')
    _save_hdf5_if_path(data_train_hdf5_fp, training_set, train_set_metadata)
    if validation_set is not None:
        _save_hdf5_if_path(data_validation_hdf5_fp, validation_set,
                           train_set_metadata)
    if test_set is not None:
        _save_hdf5_if_path(data_test_hdf5_fp, test_set, train_set_metadata)
    logging.info('Writing train set metadata with vocabulary')
    data_utils.save_json(train_set_metadata_json_fp, train_set_metadata)


def preprocess_for_training(
        model_definition,
        dataset_type='generic',
        data_df=None,
        data_train_df=None,
        data_validation_df=None,
        data_test_df=None,
        data_csv=None,
        data_train_csv=None,
        data_validation_csv=None,
        data_test_csv=None,
        data_hdf5=None,
        data_train_hdf5=None,
        data_validation_hdf5=None,
        data_test_hdf5=None,
        train_set_metadata_json=None,
        skip_save_processed_input=False,
        preprocessing_params=default_preprocessing_parameters,
        random_seed=default_random_seed
):
    """Build or load the training, validation and test datasets.

    For every csv argument, an hdf5 file (and, for the full and training
    csv, a json file) with the same base name is looked up first; when
    found, it is used instead of the csv. The first usable input is then
    picked in this order: ``data_df``, ``data_train_df``, ``data_csv``,
    ``data_train_csv``, ``data_hdf5`` + ``train_set_metadata_json``,
    ``data_train_hdf5`` + ``train_set_metadata_json``.

    Unless ``skip_save_processed_input`` is set, freshly processed data is
    written to hdf5 next to the source csv and the metadata to json. When no
    hdf5 destination is known (e.g. dataframe inputs without a csv), the
    hdf5 write is skipped instead of being attempted with a ``None`` path.

    Side effects visible here: ``model_definition['data_hdf5_fp']`` is set,
    and ``model_definition`` is passed to ``replace_text_feature_level``.

    :param model_definition: dict with at least ``'input_features'`` and
           ``'output_features'``
    :param dataset_type: forwarded to ``get_dataset_fun`` to select the
           concatenate / build functions
    :param data_df: full dataframe
    :param data_train_df: training dataframe
    :param data_validation_df: validation dataframe
    :param data_test_df: test dataframe
    :param data_csv: path of the full csv
    :param data_train_csv: path of the training csv
    :param data_validation_csv: path of the validation csv
    :param data_test_csv: path of the test csv
    :param data_hdf5: path of the full hdf5
    :param data_train_hdf5: path of the training hdf5
    :param data_validation_hdf5: path of the validation hdf5
    :param data_test_hdf5: path of the test hdf5
    :param train_set_metadata_json: path of the metadata json, required by
           the hdf5 inputs
    :param skip_save_processed_input: when True, nothing is written to disk
    :param preprocessing_params: forwarded to the dataset build functions
    :param random_seed: forwarded to the dataset build functions
    :return: ``(training_dataset, validation_dataset, test_dataset,
             train_set_metadata)``; validation and test datasets are None
             when the corresponding set is None
    :raises RuntimeError: if none of the input combinations is satisfied
    """
    # Check if hdf5 and json already exist
    data_hdf5_fp = None
    data_train_hdf5_fp = None
    data_validation_hdf5_fp = None
    data_test_hdf5_fp = None
    train_set_metadata_json_fp = 'metadata.json'

    if data_csv is not None:
        data_hdf5_fp = os.path.splitext(data_csv)[0] + '.hdf5'
        train_set_metadata_json_fp = os.path.splitext(data_csv)[0] + '.json'
        if (os.path.isfile(data_hdf5_fp) and
                os.path.isfile(train_set_metadata_json_fp)):
            logging.info('Found hdf5 and json with the same filename '
                         'of the csv, using them instead')
            data_csv = None
            data_hdf5 = data_hdf5_fp
            train_set_metadata_json = train_set_metadata_json_fp

    if data_train_csv is not None:
        data_train_hdf5_fp = os.path.splitext(data_train_csv)[0] + '.hdf5'
        train_set_metadata_json_fp = os.path.splitext(
            data_train_csv)[0] + '.json'
        if (os.path.isfile(data_train_hdf5_fp) and
                os.path.isfile(train_set_metadata_json_fp)):
            logging.info('Found hdf5 and json with the same filename of '
                         'the train csv, using them instead')
            data_train_csv = None
            data_train_hdf5 = data_train_hdf5_fp
            train_set_metadata_json = train_set_metadata_json_fp

    if data_validation_csv is not None:
        data_validation_hdf5_fp = os.path.splitext(
            data_validation_csv)[0] + '.hdf5'
        if os.path.isfile(data_validation_hdf5_fp):
            logging.info('Found hdf5 with the same filename of '
                         'the validation csv, using it instead')
            data_validation_csv = None
            data_validation_hdf5 = data_validation_hdf5_fp

    if data_test_csv is not None:
        data_test_hdf5_fp = os.path.splitext(data_test_csv)[0] + '.hdf5'
        if os.path.isfile(data_test_hdf5_fp):
            # Message previously said "validation csv" (copy-paste error)
            logging.info('Found hdf5 with the same filename of '
                         'the test csv, using it instead')
            data_test_csv = None
            data_test_hdf5 = data_test_hdf5_fp

    model_definition['data_hdf5_fp'] = data_hdf5_fp

    # Decide if to preprocess or just load
    features = (model_definition['input_features'] +
                model_definition['output_features'])
    (
        concatenate_csv,
        concatenate_df,
        build_dataset,
        build_dataset_df
    ) = get_dataset_fun(dataset_type)

    if data_df is not None:
        # needs preprocessing
        logging.info('Using full dataframe')
        logging.info('Building dataset (it may take a while)')
        data, train_set_metadata = build_dataset_df(
            data_df,
            features,
            preprocessing_params,
            random_seed=random_seed
        )
        # Save before splitting so the full dataset is what gets written
        if not skip_save_processed_input:
            _save_full_dataset(data, data_hdf5_fp, train_set_metadata,
                               train_set_metadata_json_fp)
        training_set, test_set, validation_set = split_dataset_tvt(
            data,
            data['split']
        )

    elif data_train_df is not None:
        # needs preprocessing
        logging.info('Using training dataframe')
        logging.info('Building dataset (it may take a while)')
        concatenated_df = concatenate_df(
            data_train_df,
            data_validation_df,
            data_test_df
        )
        data, train_set_metadata = build_dataset_df(
            concatenated_df,
            features,
            preprocessing_params,
            random_seed=random_seed
        )
        training_set, test_set, validation_set = split_dataset_tvt(
            data,
            data['split']
        )
        if not skip_save_processed_input:
            _save_split_datasets(
                training_set, validation_set, test_set,
                data_train_hdf5_fp, data_validation_hdf5_fp,
                data_test_hdf5_fp, train_set_metadata,
                train_set_metadata_json_fp
            )

    elif data_csv is not None:
        # Use data and ignore _train, _validation and _test.
        # Also ignore data and train set metadata
        # needs preprocessing
        logging.info('Using full raw csv, no hdf5 and json file '
                     'with the same name have been found')
        logging.info('Building dataset (it may take a while)')
        data, train_set_metadata = build_dataset(
            data_csv,
            features,
            preprocessing_params,
            random_seed=random_seed
        )
        # Save before splitting so the full dataset is what gets written
        if not skip_save_processed_input:
            _save_full_dataset(data, data_hdf5_fp, train_set_metadata,
                               train_set_metadata_json_fp)
        training_set, test_set, validation_set = split_dataset_tvt(
            data,
            data['split']
        )

    elif data_train_csv is not None:
        # use data_train (including _validation and _test if they are present)
        # and ignore data and train set metadata
        # needs preprocessing
        logging.info('Using training raw csv, no hdf5 and json '
                     'file with the same name have been found')
        logging.info('Building dataset (it may take a while)')
        concatenated_df = concatenate_csv(
            data_train_csv,
            data_validation_csv,
            data_test_csv
        )
        concatenated_df.csv = data_train_csv
        data, train_set_metadata = build_dataset_df(
            concatenated_df,
            features,
            preprocessing_params,
            random_seed=random_seed
        )
        training_set, test_set, validation_set = split_dataset_tvt(
            data,
            data['split']
        )
        if not skip_save_processed_input:
            _save_split_datasets(
                training_set, validation_set, test_set,
                data_train_hdf5_fp, data_validation_hdf5_fp,
                data_test_hdf5_fp, train_set_metadata,
                train_set_metadata_json_fp
            )

    elif data_hdf5 is not None and train_set_metadata_json is not None:
        # use data and train set metadata
        # doesn't need preprocessing, just load
        logging.info('Using full hdf5 and json')
        training_set, test_set, validation_set = load_data(
            data_hdf5,
            model_definition['input_features'],
            model_definition['output_features'],
            shuffle_training=True
        )
        train_set_metadata = load_metadata(train_set_metadata_json)

    elif data_train_hdf5 is not None and train_set_metadata_json is not None:
        # use data and train set metadata
        # doesn't need preprocessing, just load
        logging.info('Using hdf5 and json')
        training_set = load_data(
            data_train_hdf5,
            model_definition['input_features'],
            model_definition['output_features'],
            split_data=False
        )
        train_set_metadata = load_metadata(train_set_metadata_json)
        if data_validation_hdf5 is not None:
            validation_set = load_data(
                data_validation_hdf5,
                model_definition['input_features'],
                model_definition['output_features'],
                split_data=False
            )
        else:
            validation_set = None
        if data_test_hdf5 is not None:
            test_set = load_data(
                data_test_hdf5,
                model_definition['input_features'],
                model_definition['output_features'],
                split_data=False
            )
        else:
            test_set = None

    else:
        raise RuntimeError('Insufficient input parameters')

    replace_text_feature_level(
        model_definition,
        [training_set, validation_set, test_set]
    )

    training_dataset = Dataset(
        training_set,
        model_definition['input_features'],
        model_definition['output_features'],
        data_hdf5_fp
    )

    validation_dataset = None
    if validation_set is not None:
        validation_dataset = Dataset(
            validation_set,
            model_definition['input_features'],
            model_definition['output_features'],
            data_hdf5_fp
        )

    test_dataset = None
    if test_set is not None:
        test_dataset = Dataset(
            test_set,
            model_definition['input_features'],
            model_definition['output_features'],
            data_hdf5_fp
        )

    return (
        training_dataset,
        validation_dataset,
        test_dataset,
        train_set_metadata
    )
def test_image_resizing_num_channel_handling(csv_filename):
    """
    This test creates two image datasets with 3 channels and 1 channel. The
    combination of this data is used to train a model. This checks the cases
    where the user may or may not specify a number of channels in the
    model definition

    The generated images and their folder are removed in a ``finally``
    clause so that a failing run does not leave them behind in the working
    directory.

    :param csv_filename: csv destination handed to ``generate_data``
    :return: None
    """
    # Image Inputs
    image_dest_folder = os.path.join(os.getcwd(), 'generated_images')

    try:
        input_features_template = Template(
            "[{type: text, name: random_text, vocab_size: 100,"
            " max_len: 10, encoder: stacked_cnn}, {type: numerical,"
            " name: random_number}, "
            "{type: image, name: random_image, encoder: ${encoder}, preprocessing:"
            " {width: 10, in_memory: ${in_memory},"
            " height: 10, num_channels: 3},"
            " resnet_size: 8, destination_folder: ${folder}}]")

        # Resnet encoder
        input_features = input_features_template.substitute(
            encoder='resnet',
            folder=image_dest_folder,
            in_memory='true',
        )
        output_features = "[{type: binary, name: intent, reduce_input: sum}, " \
                          "{type: numerical, name: random_num_output}]"

        rel_path = generate_data(
            input_features, output_features, csv_filename
        )

        df1 = pd.read_csv(rel_path)

        # Same features, but with single-channel images
        input_features_template = Template(
            "[{type: text, name: random_text, vocab_size: 100,"
            " max_len: 10, encoder: stacked_cnn}, {type: numerical,"
            " name: random_number}, "
            "{type: image, name: random_image, preprocessing: {width: 10,"
            " in_memory: ${in_memory}, height: 10, num_channels: 1},"
            " encoder: ${encoder},"
            " resnet_size: 8, destination_folder: ${folder}}]")

        input_features = input_features_template.substitute(
            encoder='resnet',
            folder=image_dest_folder,
            in_memory='true',
        )
        rel_path = generate_data(
            input_features, output_features, csv_filename
        )
        df2 = pd.read_csv(rel_path)

        # Mix both datasets and overwrite the csv with the combination
        df = concatenate_df(df1, df2, None)
        df.to_csv(rel_path, index=False)

        # Here the user specifies number of channels.
        # Exception shouldn't be thrown
        run_experiment(input_features, output_features, rel_path)

        input_features_template = Template(
            "[{type: text, name: random_text, vocab_size: 100,"
            " max_len: 10, encoder: stacked_cnn}, {type: numerical,"
            " name: random_number}, "
            "{type: image, name: random_image, preprocessing: {width: 10, "
            "in_memory: ${in_memory}, height: 10} , encoder: ${encoder},"
            " resnet_size: 8, destination_folder: ${folder}}]")

        input_features = input_features_template.substitute(
            encoder='resnet',
            folder=image_dest_folder,
            in_memory='true',
        )

        # User now doesn't specify num channels. Should throw exception
        with pytest.raises(ValueError):
            run_experiment(input_features, output_features, rel_path)
    finally:
        # Delete the temporary data created, even when the test fails.
        # The isdir guard avoids masking the real failure when the folder
        # was never created.
        if os.path.isdir(image_dest_folder):
            all_images = glob.glob(os.path.join(image_dest_folder, '*.jpg'))
            for im in all_images:
                os.remove(im)
            os.rmdir(image_dest_folder)