def delete_temporary_data(csv_path):
    """
    Helper method to delete temporary data created for running tests. Deletes
    the csv and hdf5/json data (if any)
    :param csv_path: path to the csv data file
    :return: None
    """
    if os.path.isfile(csv_path):
        os.remove(csv_path)

    json_path = replace_file_extension(csv_path, 'meta.json')
    if os.path.isfile(json_path):
        os.remove(json_path)

    hdf5_path = replace_file_extension(csv_path, 'hdf5')
    if os.path.isfile(hdf5_path):
        os.remove(hdf5_path)
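# Illustrative usage (a sketch, not part of the original test utilities):
# delete_temporary_data is typically called in a finally block so the generated
# csv and its cached hdf5/meta.json siblings are removed even when assertions
# fail. The feature lists and generate_data call below are assumptions made for
# the example.
def _example_cleanup_pattern(csv_filename):
    input_features = [category_feature(vocab_size=5)]
    output_features = [category_feature(vocab_size=2)]
    data_csv_path = generate_data(input_features, output_features, csv_filename)
    try:
        pass  # run the actual test against data_csv_path here
    finally:
        delete_temporary_data(data_csv_path)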
def preprocess_for_prediction(
        model_path,
        split,
        data_csv=None,
        data_hdf5=None,
        train_set_metadata=None,
        evaluate_performance=True
):
    """Preprocesses the dataset to parse it into a format that is usable by the
    Ludwig core.

    :param model_path: path to the trained model directory; the model
           hyperparameters file inside it is loaded to build the model
           definition
    :type model_path: Str
    :param split: which split of the data to use
           (full, training, validation or test)
    :param data_csv: The CSV input data file
    :param data_hdf5: The hdf5 data file if there is no csv data file
    :param train_set_metadata: Train set metadata for the input features
    :param evaluate_performance: If False does not load output features
    :returns: Dataset, Train set metadata
    """
    model_definition = load_json(
        os.path.join(model_path, MODEL_HYPERPARAMETERS_FILE_NAME)
    )

    for input_feature in model_definition['input_features']:
        if 'preprocessing' in input_feature:
            if 'in_memory' in input_feature['preprocessing']:
                if not input_feature['preprocessing']['in_memory']:
                    logger.warning(
                        'WARNING: When running predict in_memory flag should '
                        'be true. Overriding and setting it to true for '
                        'feature <{}>'.format(input_feature['name'])
                    )
                    input_feature['preprocessing']['in_memory'] = True

    preprocessing_params = merge_dict(
        default_preprocessing_parameters,
        model_definition['preprocessing']
    )
    output_features = model_definition[
        'output_features'] if evaluate_performance else []
    features = model_definition['input_features'] + output_features

    # Check if hdf5 file already exists
    if data_csv is not None:
        data_hdf5_fp = replace_file_extension(data_csv, 'hdf5')
        if os.path.isfile(data_hdf5_fp):
            logger.info('Found hdf5 with the same filename of the csv, '
                        'using it instead')
            data_csv = None
            data_hdf5 = data_hdf5_fp
    else:
        data_hdf5_fp = None

    # Load data
    train_set_metadata = load_metadata(train_set_metadata)
    if split == 'full':
        if data_hdf5 is not None:
            dataset = load_data(
                data_hdf5,
                model_definition['input_features'],
                output_features,
                split_data=False,
                shuffle_training=False
            )
        else:
            dataset, train_set_metadata = build_dataset(
                data_csv,
                features,
                preprocessing_params,
                train_set_metadata=train_set_metadata
            )
    else:
        if data_hdf5 is not None:
            training, test, validation = load_data(
                data_hdf5,
                model_definition['input_features'],
                output_features,
                shuffle_training=False
            )
            if split == 'training':
                dataset = training
            elif split == 'validation':
                dataset = validation
            else:  # if split == 'test':
                dataset = test
        else:
            dataset, train_set_metadata = build_dataset(
                data_csv,
                features,
                preprocessing_params,
                train_set_metadata=train_set_metadata
            )

    replace_text_feature_level(
        features,
        [dataset]
    )

    dataset = Dataset(
        dataset,
        model_definition['input_features'],
        output_features,
        data_hdf5_fp,
    )

    return dataset, train_set_metadata
def _preprocess_csv_for_training(
        features,
        data_csv=None,
        data_train_csv=None,
        data_validation_csv=None,
        data_test_csv=None,
        train_set_metadata_json=None,
        skip_save_processed_input=False,
        preprocessing_params=default_preprocessing_parameters,
        random_seed=default_random_seed
):
    """
    Method to pre-process csv data
    :param features: list of all features (input + output)
    :param data_csv: path to the csv data
    :param data_train_csv: training csv data
    :param data_validation_csv: validation csv data
    :param data_test_csv: test csv data
    :param train_set_metadata_json: train set metadata json
    :param skip_save_processed_input: if False, the pre-processed data is saved
           as .hdf5 files in the same location as the csvs with the same names.
    :param preprocessing_params: preprocessing parameters
    :param random_seed: random seed
    :return: training, test, validation datasets, training metadata
    """
    train_set_metadata = None
    if train_set_metadata_json is not None:
        train_set_metadata = load_metadata(train_set_metadata_json)

    if data_csv is not None:
        # Use data and ignore _train, _validation and _test.
        # Also ignore data and train set metadata needs preprocessing
        logger.info(
            'Using full raw csv, no hdf5 and json file '
            'with the same name have been found'
        )
        logger.info('Building dataset (it may take a while)')

        data, train_set_metadata = build_dataset(
            data_csv,
            features,
            preprocessing_params,
            train_set_metadata=train_set_metadata,
            random_seed=random_seed
        )

        if not skip_save_processed_input:
            logger.info('Writing dataset')
            data_hdf5_fp = replace_file_extension(data_csv, 'hdf5')
            data_utils.save_hdf5(data_hdf5_fp, data, train_set_metadata)

            logger.info('Writing train set metadata with vocabulary')
            train_set_metadata_json_fp = replace_file_extension(
                data_csv,
                'json'
            )
            data_utils.save_json(
                train_set_metadata_json_fp, train_set_metadata)

        training_set, test_set, validation_set = split_dataset_tvt(
            data,
            data['split']
        )

    elif data_train_csv is not None:
        # use data_train (including _validation and _test if they are present)
        # and ignore data and train set metadata
        # needs preprocessing
        logger.info(
            'Using training raw csv, no hdf5 and json '
            'file with the same name have been found'
        )
        logger.info('Building dataset (it may take a while)')

        concatenated_df = concatenate_csv(
            data_train_csv,
            data_validation_csv,
            data_test_csv
        )
        concatenated_df.csv = data_train_csv

        data, train_set_metadata = build_dataset_df(
            concatenated_df,
            features,
            preprocessing_params,
            train_set_metadata=train_set_metadata,
            random_seed=random_seed
        )

        training_set, test_set, validation_set = split_dataset_tvt(
            data,
            data['split']
        )

        if not skip_save_processed_input:
            logger.info('Writing dataset')
            data_train_hdf5_fp = replace_file_extension(data_train_csv, 'hdf5')
            data_utils.save_hdf5(
                data_train_hdf5_fp,
                training_set,
                train_set_metadata
            )

            if validation_set is not None:
                data_validation_hdf5_fp = replace_file_extension(
                    data_validation_csv,
                    'hdf5'
                )
                data_utils.save_hdf5(
                    data_validation_hdf5_fp,
                    validation_set,
                    train_set_metadata
                )

            if test_set is not None:
                data_test_hdf5_fp = replace_file_extension(
                    data_test_csv,
                    'hdf5'
                )
                data_utils.save_hdf5(
                    data_test_hdf5_fp,
                    test_set,
                    train_set_metadata
                )

            logger.info('Writing train set metadata with vocabulary')
            train_set_metadata_json_fp = replace_file_extension(
                data_train_csv,
                'json'
            )
            data_utils.save_json(train_set_metadata_json_fp, train_set_metadata)

    return training_set, test_set, validation_set, train_set_metadata
def preprocess_for_training_by_type(
        model_definition,
        data_type,
        all_data_fp=None,
        train_fp=None,
        validation_fp=None,
        test_fp=None,
        all_data_df=None,
        train_df=None,
        validation_df=None,
        test_df=None,
        train_set_metadata_json=None,
        skip_save_processed_input=False,
        preprocessing_params=default_preprocessing_parameters,
        random_seed=default_random_seed
):
    if all_data_fp is not None and train_fp is not None:
        raise ValueError('Use either one file for all data or 3 files for '
                         'train, test and validation')

    if data_type not in ['hdf5', 'csv', 'pandas']:
        raise ValueError('Invalid type of data provided')

    features = (model_definition['input_features'] +
                model_definition['output_features'])

    data_hdf5_fp = None

    if data_type == 'pandas':
        # Preprocess data frames
        (
            training_set,
            test_set,
            validation_set,
            train_set_metadata
        ) = _preprocess_df_for_training(
            features,
            all_data_df,
            train_df,
            validation_df,
            test_df,
            train_set_metadata_json=train_set_metadata_json,
            preprocessing_params=preprocessing_params,
            random_seed=random_seed
        )
    elif data_type == 'hdf5' and train_set_metadata_json is None:
        raise ValueError('train set metadata file is not found along with '
                         'hdf5 data')
    elif data_type == 'hdf5':
        if all_data_fp is not None:
            data_hdf5_fp = replace_file_extension(all_data_fp, 'hdf5')
            logger.info('Using full hdf5 and json')
            training_set, test_set, validation_set = load_data(
                all_data_fp,
                model_definition['input_features'],
                model_definition['output_features'],
                shuffle_training=True
            )
            train_set_metadata = load_metadata(train_set_metadata_json)
        elif train_fp is not None:
            logger.info('Using hdf5 and json')
            training_set = load_data(
                train_fp,
                model_definition['input_features'],
                model_definition['output_features'],
                split_data=False
            )
            train_set_metadata = load_metadata(train_set_metadata_json)

            validation_set = None
            if validation_fp is not None:
                validation_set = load_data(
                    validation_fp,
                    model_definition['input_features'],
                    model_definition['output_features'],
                    split_data=False
                )

            test_set = None
            if test_fp is not None:
                test_set = load_data(
                    test_fp,
                    model_definition['input_features'],
                    model_definition['output_features'],
                    split_data=False
                )

    elif data_type == 'csv':
        data_hdf5_fp = replace_file_extension(
            all_data_fp, 'hdf5'
        )
        model_definition['data_hdf5_fp'] = data_hdf5_fp

        if all_data_fp is not None:
            if (file_exists_with_diff_extension(all_data_fp, 'hdf5') and
                    file_exists_with_diff_extension(all_data_fp, 'json')):
                # use hdf5 data instead
                logger.info(
                    'Found hdf5 and json with the same filename '
                    'of the csv, using them instead'
                )
                return preprocess_for_training_by_type(
                    model_definition,
                    'hdf5',
                    all_data_fp=replace_file_extension(all_data_fp, 'hdf5'),
                    train_set_metadata_json=replace_file_extension(
                        all_data_fp,
                        'json'
                    ),
                    skip_save_processed_input=skip_save_processed_input,
                    preprocessing_params=preprocessing_params,
                    random_seed=random_seed
                )
            else:
                (
                    training_set,
                    test_set,
                    validation_set,
                    train_set_metadata
                ) = _preprocess_csv_for_training(
                    features=features,
                    data_csv=all_data_fp,
                    data_train_csv=None,
                    data_validation_csv=None,
                    data_test_csv=None,
                    train_set_metadata_json=train_set_metadata_json,
                    skip_save_processed_input=skip_save_processed_input,
                    preprocessing_params=preprocessing_params,
                    random_seed=random_seed
                )
        else:
            if (file_exists_with_diff_extension(train_fp, 'hdf5') and
                    file_exists_with_diff_extension(train_fp, 'json') and
                    file_exists_with_diff_extension(validation_fp, 'hdf5') and
                    file_exists_with_diff_extension(test_fp, 'hdf5')):
                logger.info(
                    'Found hdf5 and json with the same filename '
                    'of the csvs, using them instead.'
                )
                return preprocess_for_training_by_type(
                    model_definition,
                    'hdf5',
                    train_fp=replace_file_extension(train_fp, 'hdf5'),
                    validation_fp=replace_file_extension(
                        validation_fp,
                        'hdf5'
                    ),
                    test_fp=replace_file_extension(test_fp, 'hdf5'),
                    train_set_metadata_json=replace_file_extension(
                        train_fp,
                        'json'
                    ),
                    skip_save_processed_input=skip_save_processed_input,
                    preprocessing_params=preprocessing_params,
                    random_seed=random_seed
                )
            else:
                (
                    training_set,
                    test_set,
                    validation_set,
                    train_set_metadata
                ) = _preprocess_csv_for_training(
                    features=features,
                    data_csv=None,
                    data_train_csv=train_fp,
                    data_validation_csv=validation_fp,
                    data_test_csv=test_fp,
                    train_set_metadata_json=train_set_metadata_json,
                    skip_save_processed_input=skip_save_processed_input,
                    preprocessing_params=preprocessing_params,
                    random_seed=random_seed
                )
    else:
        raise RuntimeError('Insufficient input parameters')

    replace_text_feature_level(
        model_definition['input_features'] +
        model_definition['output_features'],
        [training_set, validation_set, test_set]
    )

    training_dataset = Dataset(
        training_set,
        model_definition['input_features'],
        model_definition['output_features'],
        data_hdf5_fp
    )

    validation_dataset = None
    if validation_set is not None:
        validation_dataset = Dataset(
            validation_set,
            model_definition['input_features'],
            model_definition['output_features'],
            data_hdf5_fp
        )

    test_dataset = None
    if test_set is not None:
        test_dataset = Dataset(
            test_set,
            model_definition['input_features'],
            model_definition['output_features'],
            data_hdf5_fp
        )

    return (
        training_dataset,
        validation_dataset,
        test_dataset,
        train_set_metadata
    )
def create_data_set_to_use(data_format, raw_data):
    # helper function for generating training and test data with specified format
    # handles all data formats except for hdf5
    # assumes raw_data is a csv dataset generated by
    # tests.integration_tests.utils.generate_data() function

    # support for writing to a fwf dataset based on this stackoverflow posting:
    # https://stackoverflow.com/questions/16490261/python-pandas-write-dataframe-to-fixed-width-file-to-fwf
    from tabulate import tabulate

    def to_fwf(df, fname):
        content = tabulate(df.values.tolist(), list(df.columns), tablefmt="plain")
        open(fname, "w").write(content)

    pd.DataFrame.to_fwf = to_fwf

    dataset_to_use = None

    if data_format == "csv":
        dataset_to_use = raw_data

    elif data_format in {"df", "dict"}:
        dataset_to_use = pd.read_csv(raw_data)
        if data_format == "dict":
            dataset_to_use = dataset_to_use.to_dict(orient="list")

    elif data_format == "excel":
        dataset_to_use = replace_file_extension(raw_data, "xlsx")
        pd.read_csv(raw_data).to_excel(dataset_to_use, index=False)

    elif data_format == "excel_xls":
        dataset_to_use = replace_file_extension(raw_data, "xls")
        pd.read_csv(raw_data).to_excel(dataset_to_use, index=False)

    elif data_format == "feather":
        dataset_to_use = replace_file_extension(raw_data, "feather")
        pd.read_csv(raw_data).to_feather(dataset_to_use)

    elif data_format == "fwf":
        dataset_to_use = replace_file_extension(raw_data, "fwf")
        pd.read_csv(raw_data).to_fwf(dataset_to_use)

    elif data_format == "html":
        dataset_to_use = replace_file_extension(raw_data, "html")
        pd.read_csv(raw_data).to_html(dataset_to_use, index=False)

    elif data_format == "json":
        dataset_to_use = replace_file_extension(raw_data, "json")
        pd.read_csv(raw_data).to_json(dataset_to_use, orient="records")

    elif data_format == "jsonl":
        dataset_to_use = replace_file_extension(raw_data, "jsonl")
        pd.read_csv(raw_data).to_json(dataset_to_use, orient="records", lines=True)

    elif data_format == "parquet":
        dataset_to_use = replace_file_extension(raw_data, "parquet")
        pd.read_csv(raw_data).to_parquet(dataset_to_use, index=False)

    elif data_format == "pickle":
        dataset_to_use = replace_file_extension(raw_data, "pickle")
        pd.read_csv(raw_data).to_pickle(dataset_to_use)

    elif data_format == "stata":
        dataset_to_use = replace_file_extension(raw_data, "stata")
        pd.read_csv(raw_data).to_stata(dataset_to_use)

    elif data_format == "tsv":
        dataset_to_use = replace_file_extension(raw_data, "tsv")
        pd.read_csv(raw_data).to_csv(dataset_to_use, sep="\t", index=False)

    else:
        # raise so unrecognized formats fail loudly instead of returning None
        raise ValueError(f"'{data_format}' is an unrecognized data format")

    return dataset_to_use
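# Illustrative usage (a sketch, not part of the original test helpers): convert
# a generated csv into one of the supported formats and train on it. The
# feature lists, minimal config, and generate_data call are assumptions made
# for the example.
def _example_train_on_parquet(csv_filename):
    input_features = [category_feature(vocab_size=5)]
    output_features = [category_feature(vocab_size=2)]
    raw_csv = generate_data(input_features, output_features, csv_filename)
    parquet_dataset = create_data_set_to_use("parquet", raw_csv)
    model = LudwigModel(
        {"input_features": input_features, "output_features": output_features}
    )
    model.train(dataset=parquet_dataset)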
def preprocess_for_training(
        model_definition,
        dataset_type='generic',
        data_df=None,
        data_train_df=None,
        data_validation_df=None,
        data_test_df=None,
        data_csv=None,
        data_train_csv=None,
        data_validation_csv=None,
        data_test_csv=None,
        data_hdf5=None,
        data_train_hdf5=None,
        data_validation_hdf5=None,
        data_test_hdf5=None,
        train_set_metadata_json=None,
        skip_save_processed_input=False,
        preprocessing_params=default_preprocessing_parameters,
        random_seed=default_random_seed):
    # Sanity Check to make sure some data source is provided
    data_sources_provided = [
        data_df, data_train_df,
        data_csv, data_train_csv,
        data_hdf5, data_train_hdf5
    ]
    data_sources_not_none = [x is not None for x in data_sources_provided]
    if not any(data_sources_not_none):
        raise ValueError('No training data is provided!')

    # Check if hdf5 and json already exist. If they do, use the hdf5 data,
    # instead of the csvs
    data_hdf5_fp = None
    if data_csv is not None:
        data_hdf5_fp = replace_file_extension(data_csv, 'hdf5')
        train_set_metadata_json_fp = replace_file_extension(data_csv, 'json')
        if os.path.isfile(data_hdf5_fp) and os.path.isfile(
                train_set_metadata_json_fp):
            logging.info('Found hdf5 and json with the same filename '
                         'of the csv, using them instead')
            data_csv = None
            data_hdf5 = data_hdf5_fp
            train_set_metadata_json = train_set_metadata_json_fp

    if data_train_csv is not None:
        data_train_hdf5_fp = replace_file_extension(data_train_csv, 'hdf5')
        train_set_metadata_json_fp = replace_file_extension(
            data_train_csv,
            'json',
        )
        if os.path.isfile(data_train_hdf5_fp) and os.path.isfile(
                train_set_metadata_json_fp):
            logging.info('Found hdf5 and json with the same filename of '
                         'the train csv, using them instead')
            data_train_csv = None
            data_train_hdf5 = data_train_hdf5_fp
            train_set_metadata_json = train_set_metadata_json_fp

    if data_validation_csv is not None:
        data_validation_hdf5_fp = replace_file_extension(
            data_validation_csv,
            'hdf5'
        )
        if os.path.isfile(data_validation_hdf5_fp):
            logging.info('Found hdf5 with the same filename of '
                         'the validation csv, using it instead')
            data_validation_csv = None
            data_validation_hdf5 = data_validation_hdf5_fp

    if data_test_csv is not None:
        data_test_hdf5_fp = replace_file_extension(data_test_csv, 'hdf5')
        if os.path.isfile(data_test_hdf5_fp):
            logging.info('Found hdf5 with the same filename of '
                         'the test csv, using it instead')
            data_test_csv = None
            data_test_hdf5 = data_test_hdf5_fp

    model_definition['data_hdf5_fp'] = data_hdf5_fp

    # Decide if to preprocess or just load
    features = (model_definition['input_features'] +
                model_definition['output_features'])
    (concatenate_csv,
     concatenate_df,
     build_dataset,
     build_dataset_df) = get_dataset_fun(dataset_type)

    if data_df is not None or data_train_df is not None:
        # Preprocess data frames
        (training_set,
         test_set,
         validation_set,
         train_set_metadata) = _preprocess_df_for_training(
            features,
            data_df,
            data_train_df,
            data_validation_df,
            data_test_df,
            preprocessing_params,
            random_seed)
    elif data_csv is not None or data_train_csv is not None:
        # Preprocess csv data
        (training_set,
         test_set,
         validation_set,
         train_set_metadata) = _preprocess_csv_for_training(
            features,
            data_csv,
            data_train_csv,
            data_validation_csv,
            data_test_csv,
            skip_save_processed_input,
            preprocessing_params,
            random_seed)
    elif data_hdf5 is not None and train_set_metadata_json is not None:
        # use data and train set metadata
        # does not need preprocessing, just load
        logging.info('Using full hdf5 and json')
        training_set, test_set, validation_set = load_data(
            data_hdf5,
            model_definition['input_features'],
            model_definition['output_features'],
            shuffle_training=True)
        train_set_metadata = load_metadata(train_set_metadata_json)
    elif data_train_hdf5 is not None and train_set_metadata_json is not None:
        # use data and train set metadata
        # does not need preprocessing, just load
        logging.info('Using hdf5 and json')
        training_set = load_data(data_train_hdf5,
                                 model_definition['input_features'],
                                 model_definition['output_features'],
                                 split_data=False)
        train_set_metadata = load_metadata(train_set_metadata_json)
        if data_validation_hdf5 is not None:
            validation_set = load_data(data_validation_hdf5,
                                       model_definition['input_features'],
                                       model_definition['output_features'],
                                       split_data=False)
        else:
            validation_set = None
        if data_test_hdf5 is not None:
            test_set = load_data(data_test_hdf5,
                                 model_definition['input_features'],
                                 model_definition['output_features'],
                                 split_data=False)
        else:
            test_set = None
    else:
        raise RuntimeError('Insufficient input parameters')

    replace_text_feature_level(
        model_definition['input_features'] +
        model_definition['output_features'],
        [training_set, validation_set, test_set])

    training_dataset = Dataset(training_set,
                               model_definition['input_features'],
                               model_definition['output_features'],
                               data_hdf5_fp)

    validation_dataset = None
    if validation_set is not None:
        validation_dataset = Dataset(validation_set,
                                     model_definition['input_features'],
                                     model_definition['output_features'],
                                     data_hdf5_fp)

    test_dataset = None
    if test_set is not None:
        test_dataset = Dataset(test_set,
                               model_definition['input_features'],
                               model_definition['output_features'],
                               data_hdf5_fp)

    return (training_dataset,
            validation_dataset,
            test_dataset,
            train_set_metadata)
def preprocess_for_prediction(
        model_path,
        split,
        data_csv=None,
        data_hdf5=None,
        train_set_metadata=None,
        evaluate_performance=True
):
    """Preprocesses the dataset to parse it into a format that is usable by the
    Ludwig core.

    :param model_path: path to the trained model directory; the model
           hyperparameters file inside it is loaded to build the model
           definition
    :type model_path: Str
    :param split: which split of the data to use
           (full, training, validation or test)
    :param data_csv: The CSV input data file
    :param data_hdf5: The hdf5 data file if there is no csv data file
    :param train_set_metadata: Train set metadata for the input features
    :param evaluate_performance: If False does not load output features
    :returns: Dataset, Train set metadata
    """
    model_definition = load_json(
        os.path.join(model_path, MODEL_HYPERPARAMETERS_FILE_NAME)
    )

    for input_feature in model_definition['input_features']:
        if 'preprocessing' in input_feature:
            if 'in_memory' in input_feature['preprocessing']:
                if not input_feature['preprocessing']['in_memory']:
                    logger.warning(
                        'WARNING: When running predict in_memory flag should '
                        'be true. Overriding and setting it to true for '
                        'feature <{}>'.format(input_feature['name'])
                    )
                    input_feature['preprocessing']['in_memory'] = True

    preprocessing_params = merge_dict(
        default_preprocessing_parameters,
        model_definition['preprocessing']
    )
    output_features = model_definition[
        'output_features'] if evaluate_performance else []
    features = model_definition['input_features'] + output_features

    # Check if hdf5 file already exists
    if data_csv is not None:
        data_hdf5_fp = replace_file_extension(data_csv, 'hdf5')
        if os.path.isfile(data_hdf5_fp):
            logger.info('Found hdf5 with the same filename of the csv, '
                        'using it instead')
            data_csv = None
            data_hdf5 = data_hdf5_fp
    else:
        data_hdf5_fp = None

    # Load data
    train_set_metadata = load_metadata(train_set_metadata)
    if split == FULL:
        if data_hdf5 is not None:
            dataset = load_data(
                data_hdf5,
                model_definition['input_features'],
                output_features,
                split_data=False,
                shuffle_training=False
            )
        else:
            dataset, train_set_metadata = build_dataset(
                data_csv,
                features,
                preprocessing_params,
                train_set_metadata=train_set_metadata
            )
    else:
        if data_hdf5 is not None:
            training_set, test_set, validation_set = load_data(
                data_hdf5,
                model_definition['input_features'],
                output_features,
                shuffle_training=False
            )
            if split == TRAINING:
                dataset = training_set
            elif split == VALIDATION:
                dataset = validation_set
            else:  # if split == TEST:
                dataset = test_set
        else:
            dataset, train_set_metadata = build_dataset(
                data_csv,
                features,
                preprocessing_params,
                train_set_metadata=train_set_metadata
            )
            # build_dataset adds a split column if there is none in the csv
            # so if we want to check if the csv contained a split column
            # we have to check in the csv not in the built dataset.
            # The logic is that if there is no split in the original csv
            # we treat the split parameter as if it was == full
            if csv_contains_column(data_csv, SPLIT):
                training_set, test_set, validation_set = split_dataset_tvt(
                    dataset,
                    dataset[SPLIT]
                )
                if split == TRAINING:
                    dataset = training_set
                elif split == VALIDATION:
                    dataset = validation_set
                else:  # if split == TEST:
                    dataset = test_set
            else:
                logger.warning(
                    'You requested the {} split, but the data CSV '
                    'does not contain a "split" column, so the '
                    'full data will be used instead'.format(split)
                )

    replace_text_feature_level(
        features,
        [dataset]
    )

    dataset = Dataset(
        dataset,
        model_definition['input_features'],
        output_features,
        train_set_metadata.get(DATA_TRAIN_HDF5_FP)
    )

    return dataset, train_set_metadata
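# Illustrative usage (a sketch, not part of the original module): preprocess a
# csv for prediction on its test split, reusing the metadata saved next to a
# trained model. The paths shown are assumptions made for the example.
def _example_preprocess_test_split():
    dataset, train_set_metadata = preprocess_for_prediction(
        model_path='results/experiment_run/model',
        split=TEST,
        data_csv='data/test.csv',
        train_set_metadata='results/experiment_run/model/train_set_metadata.json',
        evaluate_performance=True
    )
    return dataset, train_set_metadata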
def create_data_set_to_use(data_format, raw_data):
    # helper function for generating training and test data with specified format
    # handles all data formats except for hdf5
    # assumes raw_data is a csv dataset generated by
    # tests.integration_tests.utils.generate_data() function

    # support for writing to a fwf dataset based on this stackoverflow posting:
    # https://stackoverflow.com/questions/16490261/python-pandas-write-dataframe-to-fixed-width-file-to-fwf
    from tabulate import tabulate

    def to_fwf(df, fname):
        content = tabulate(df.values.tolist(), list(df.columns), tablefmt="plain")
        open(fname, "w").write(content)

    pd.DataFrame.to_fwf = to_fwf

    dataset_to_use = None

    if data_format == 'csv':
        dataset_to_use = raw_data

    elif data_format in {'df', 'dict'}:
        dataset_to_use = pd.read_csv(raw_data)
        if data_format == 'dict':
            dataset_to_use = dataset_to_use.to_dict(orient='list')

    elif data_format == 'excel':
        dataset_to_use = replace_file_extension(raw_data, 'xlsx')
        pd.read_csv(raw_data).to_excel(
            dataset_to_use,
            index=False
        )

    elif data_format == 'excel_xls':
        dataset_to_use = replace_file_extension(raw_data, 'xls')
        pd.read_csv(raw_data).to_excel(
            dataset_to_use,
            index=False
        )

    elif data_format == 'feather':
        dataset_to_use = replace_file_extension(raw_data, 'feather')
        pd.read_csv(raw_data).to_feather(
            dataset_to_use
        )

    elif data_format == 'fwf':
        dataset_to_use = replace_file_extension(raw_data, 'fwf')
        pd.read_csv(raw_data).to_fwf(
            dataset_to_use
        )

    elif data_format == 'html':
        dataset_to_use = replace_file_extension(raw_data, 'html')
        pd.read_csv(raw_data).to_html(
            dataset_to_use,
            index=False
        )

    elif data_format == 'json':
        dataset_to_use = replace_file_extension(raw_data, 'json')
        pd.read_csv(raw_data).to_json(
            dataset_to_use,
            orient='records'
        )

    elif data_format == 'jsonl':
        dataset_to_use = replace_file_extension(raw_data, 'jsonl')
        pd.read_csv(raw_data).to_json(
            dataset_to_use,
            orient='records',
            lines=True
        )

    elif data_format == 'parquet':
        dataset_to_use = replace_file_extension(raw_data, 'parquet')
        pd.read_csv(raw_data).to_parquet(
            dataset_to_use,
            index=False
        )

    elif data_format == 'pickle':
        dataset_to_use = replace_file_extension(raw_data, 'pickle')
        pd.read_csv(raw_data).to_pickle(
            dataset_to_use
        )

    elif data_format == 'stata':
        dataset_to_use = replace_file_extension(raw_data, 'stata')
        pd.read_csv(raw_data).to_stata(
            dataset_to_use
        )

    elif data_format == 'tsv':
        dataset_to_use = replace_file_extension(raw_data, 'tsv')
        pd.read_csv(raw_data).to_csv(
            dataset_to_use,
            sep='\t',
            index=False
        )

    else:
        # raise so unrecognized formats fail loudly instead of returning None
        raise ValueError(
            "'{}' is an unrecognized data format".format(data_format)
        )

    return dataset_to_use
def test_cache_checksum(csv_filename, tmp_path):
    # setup for training
    input_features = [category_feature(vocab_size=5)]
    output_features = [category_feature(vocab_size=2, top_k=2)]

    source_dataset = os.path.join(tmp_path, csv_filename)
    source_dataset = generate_data(input_features, output_features, source_dataset)

    config = {
        "input_features": input_features,
        "output_features": output_features,
        "preprocessing": {"text": {"most_common_word": 1000}},
        TRAINER: {"epochs": 2},
    }

    backend = LocalTestBackend()
    cache_fname = replace_file_extension(source_dataset, TRAINING_PREPROC_FILE_NAME)

    # conduct initial training
    output_directory = os.path.join(tmp_path, "results")
    model = LudwigModel(config, backend=backend)
    model.train(dataset=source_dataset, output_directory=output_directory)
    first_training_timestamp = os.path.getmtime(cache_fname)

    # conduct second training, should not force recreating hdf5
    model = LudwigModel(config, backend=backend)
    model.train(dataset=source_dataset, output_directory=output_directory)
    current_training_timestamp = os.path.getmtime(cache_fname)

    # time stamps should be the same
    assert first_training_timestamp == current_training_timestamp

    # force recreating cache file by changing checksum
    prior_training_timestamp = current_training_timestamp
    config["preprocessing"]["text"]["most_common_word"] = 2000
    model = LudwigModel(config, backend=backend)
    model.train(dataset=source_dataset, output_directory=output_directory)
    current_training_timestamp = os.path.getmtime(cache_fname)

    # timestamp should differ
    assert prior_training_timestamp < current_training_timestamp

    # force recreating cache by updating modification time of source dataset
    prior_training_timestamp = current_training_timestamp
    os.utime(source_dataset)
    model = LudwigModel(config, backend=backend)
    model.train(dataset=source_dataset, output_directory=output_directory)
    current_training_timestamp = os.path.getmtime(cache_fname)

    # timestamps should be different
    assert prior_training_timestamp < current_training_timestamp

    # force change in feature preprocessing
    prior_training_timestamp = current_training_timestamp
    input_features = config["input_features"].copy()
    input_features[0]["preprocessing"] = {"lowercase": True}
    config["input_features"] = input_features
    model = LudwigModel(config, backend=backend)
    model.train(dataset=source_dataset, output_directory=output_directory)
    current_training_timestamp = os.path.getmtime(cache_fname)

    # timestamps should be different
    assert prior_training_timestamp < current_training_timestamp

    # force change in features names (and properties)
    prior_training_timestamp = current_training_timestamp
    input_features = [category_feature(vocab_size=5), category_feature()]
    source_dataset = generate_data(input_features, output_features, source_dataset)
    config["input_features"] = input_features
    model = LudwigModel(config, backend=backend)
    model.train(dataset=source_dataset, output_directory=output_directory)
    current_training_timestamp = os.path.getmtime(cache_fname)

    # timestamps should be different
    assert prior_training_timestamp < current_training_timestamp

    # force change in Ludwig version
    prior_training_timestamp = current_training_timestamp
    global_vars.LUDWIG_VERSION = "new_version"
    model = LudwigModel(config, backend=backend)
    model.train(dataset=source_dataset, output_directory=output_directory)
    current_training_timestamp = os.path.getmtime(cache_fname)

    # timestamps should be different
    assert prior_training_timestamp < current_training_timestamp
def test_cache_checksum(csv_filename, tmp_path):
    # setup for training
    input_features = [category_feature(vocab_size=5)]
    output_features = [category_feature(vocab_size=2)]

    source_dataset = os.path.join(tmp_path, csv_filename)
    source_dataset = generate_data(input_features, output_features,
                                   source_dataset)

    config = {
        'input_features': input_features,
        'output_features': output_features,
        'preprocessing': {'text': {'most_common_word': 1000}},
        'training': {'epochs': 2}
    }

    # conduct initial training
    output_directory = os.path.join(tmp_path, 'results')
    model = LudwigModel(config)
    _, _, train_output_directory1 = \
        model.train(dataset=source_dataset, output_directory=output_directory)
    first_training_timestamp = \
        os.path.getmtime(replace_file_extension(source_dataset, 'hdf5'))

    # conduct second training, should not force recreating hdf5
    model = LudwigModel(config)
    _, _, train_output_directory2 = \
        model.train(dataset=source_dataset, output_directory=output_directory)
    second_training_timestamp = \
        os.path.getmtime(replace_file_extension(source_dataset, 'hdf5'))

    # time stamps should be the same
    assert first_training_timestamp == second_training_timestamp

    # force recreating cache file by changing checksum
    config['preprocessing']['text']['most_common_word'] = 2000
    model = LudwigModel(config)
    _, _, train_output_directory3 = \
        model.train(dataset=source_dataset, output_directory=output_directory)
    third_training_timestamp = \
        os.path.getmtime(replace_file_extension(source_dataset, 'hdf5'))

    # timestamp should differ
    assert first_training_timestamp < third_training_timestamp

    # force recreating cache by updating modification time of source dataset
    os.utime(source_dataset)
    model = LudwigModel(config)
    _, _, train_output_directory4 = \
        model.train(dataset=source_dataset, output_directory=output_directory)
    fourth_training_timestamp = \
        os.path.getmtime(replace_file_extension(source_dataset, 'hdf5'))

    # timestamps should be different
    assert third_training_timestamp < fourth_training_timestamp

    # force change in feature preprocessing
    input_features = config['input_features'].copy()
    input_features[0]['preprocessing'] = {'lowercase': True}
    config['input_features'] = input_features
    model = LudwigModel(config)
    _, _, train_output_directory5 = \
        model.train(dataset=source_dataset, output_directory=output_directory)
    fifth_training_timestamp = \
        os.path.getmtime(replace_file_extension(source_dataset, 'hdf5'))

    # timestamps should be different
    assert fourth_training_timestamp < fifth_training_timestamp

    # force change in features names (and properties)
    input_features = [category_feature(vocab_size=5), category_feature()]
    source_dataset = generate_data(input_features, output_features,
                                   source_dataset)
    config['input_features'] = input_features
    model = LudwigModel(config)
    _, _, train_output_directory5 = \
        model.train(dataset=source_dataset, output_directory=output_directory)
    sixth_training_timestamp = \
        os.path.getmtime(replace_file_extension(source_dataset, 'hdf5'))

    # timestamps should be different
    assert fifth_training_timestamp < sixth_training_timestamp

    # force change in Ludwig version
    global_vars.LUDWIG_VERSION = 'new_version'
    model = LudwigModel(config)
    _, _, train_output_directory5 = \
        model.train(dataset=source_dataset, output_directory=output_directory)
    seventh_training_timestamp = \
        os.path.getmtime(replace_file_extension(source_dataset, 'hdf5'))

    # timestamps should be different
    assert sixth_training_timestamp < seventh_training_timestamp
def test_cache_checksum(csv_filename, tmp_path):
    # setup for training
    input_features = [category_feature(vocab_size=5)]
    output_features = [category_feature(vocab_size=2, top_k=2)]

    source_dataset = os.path.join(tmp_path, csv_filename)
    source_dataset = generate_data(input_features, output_features, source_dataset)

    config = {
        INPUT_FEATURES: input_features,
        OUTPUT_FEATURES: output_features,
        DEFAULTS: {CATEGORY: {PREPROCESSING: {"fill_value": "<UNKNOWN>"}}},
        TRAINER: {EPOCHS: 2},
    }

    backend = LocalTestBackend()
    cache_fname = replace_file_extension(source_dataset, TRAINING_PREPROC_FILE_NAME)

    # conduct initial training
    output_directory = os.path.join(tmp_path, "results")
    model = LudwigModel(config, backend=backend)
    model.train(dataset=source_dataset, output_directory=output_directory)
    first_training_timestamp = os.path.getmtime(cache_fname)

    # conduct second training, should not force recreating hdf5
    model = LudwigModel(config, backend=backend)
    model.train(dataset=source_dataset, output_directory=output_directory)
    current_training_timestamp = os.path.getmtime(cache_fname)

    # time stamps should be the same
    assert first_training_timestamp == current_training_timestamp

    # force recreating cache file by changing checksum by updating defaults
    prior_training_timestamp = current_training_timestamp
    config[DEFAULTS][CATEGORY][PREPROCESSING]["fill_value"] = "<EMPTY>"
    model = LudwigModel(config, backend=backend)
    model.train(dataset=source_dataset, output_directory=output_directory)
    current_training_timestamp = os.path.getmtime(cache_fname)

    # timestamp should differ
    assert prior_training_timestamp < current_training_timestamp

    # force recreating cache by updating modification time of source dataset
    prior_training_timestamp = current_training_timestamp
    os.utime(source_dataset)
    model = LudwigModel(config, backend=backend)
    model.train(dataset=source_dataset, output_directory=output_directory)
    current_training_timestamp = os.path.getmtime(cache_fname)

    # timestamps should be different
    assert prior_training_timestamp < current_training_timestamp

    # force change in feature preprocessing
    prior_training_timestamp = current_training_timestamp
    input_features = config[INPUT_FEATURES].copy()
    input_features[0][PREPROCESSING] = {"lowercase": True}
    config[INPUT_FEATURES] = input_features
    model = LudwigModel(config, backend=backend)
    model.train(dataset=source_dataset, output_directory=output_directory)
    current_training_timestamp = os.path.getmtime(cache_fname)

    # timestamps should be different
    assert prior_training_timestamp < current_training_timestamp

    # force change in features names (and properties)
    prior_training_timestamp = current_training_timestamp
    input_features = [category_feature(vocab_size=5), category_feature()]
    source_dataset = generate_data(input_features, output_features, source_dataset)
    config[INPUT_FEATURES] = input_features
    model = LudwigModel(config, backend=backend)
    model.train(dataset=source_dataset, output_directory=output_directory)
    current_training_timestamp = os.path.getmtime(cache_fname)

    # timestamps should be different
    assert prior_training_timestamp < current_training_timestamp

    # force change in Ludwig version
    prior_training_timestamp = current_training_timestamp
    global_vars.LUDWIG_VERSION = "new_version"
    model = LudwigModel(config, backend=backend)
    model.train(dataset=source_dataset, output_directory=output_directory)
    current_training_timestamp = os.path.getmtime(cache_fname)

    # timestamps should be different
    assert prior_training_timestamp < current_training_timestamp
import logging

import pandas as pd
import dask.dataframe as dd

from ludwig.api import LudwigModel
from ludwig.utils.data_utils import replace_file_extension

csv_path = "./data/train_combined.csv"

dataset_to_use = replace_file_extension(csv_path, 'parquet')
pd.read_csv(csv_path).to_parquet(
    dataset_to_use,
    index=False
)
# dataset_to_use = dd.read_csv(csv_path)

model = LudwigModel(
    # config='./config/large.yaml',
    config='./config/small.yaml',
    logging_level=logging.INFO,
    backend='ray')

# model = LudwigModel(
#     # config='./config/large.yaml',
#     config='./config/small.yaml',
#     logging_level=logging.INFO)

(
    train_stats,