def preprocess_for_prediction(
        model_path,
        split,
        data_csv=None,
        data_hdf5=None,
        train_set_metadata=None,
        evaluate_performance=True
):
    """Load or build the dataset that a saved model will predict on.

    The model definition is read from the hyperparameters file found in
    ``model_path``. Any input feature whose preprocessing sets
    ``in_memory`` to a falsy value is overridden to ``True`` (a warning is
    logged for each one).

    :param model_path: directory containing the model hyperparameters file
    :type model_path: Str
    :param split: ``'full'`` to use all the data; otherwise ``'training'``
           or ``'validation'`` select that split and any other value selects
           the test split. The split is only applied when reading from hdf5;
           data built from a CSV is always used in full.
    :param data_csv: path of the CSV input data file. If an hdf5 file with
           the same name exists next to it, that file is used instead.
    :param data_hdf5: path of the hdf5 data file, used when no CSV is given
    :param train_set_metadata: passed to ``load_metadata`` to obtain the
           train set metadata
    :param evaluate_performance: if False, output features are not loaded
    :returns: Dataset, Train set metadata
    """
    definition = load_json(
        os.path.join(model_path, MODEL_HYPERPARAMETERS_FILE_NAME)
    )

    # Prediction requires every input feature to be held in memory.
    for feature in definition['input_features']:
        if ('preprocessing' in feature
                and 'in_memory' in feature['preprocessing']
                and not feature['preprocessing']['in_memory']):
            logger.warning(
                'WARNING: When running predict in_memory flag should '
                'be true. Overriding and setting it to true for '
                'feature <{}>'.format(feature['name'])
            )
            feature['preprocessing']['in_memory'] = True

    preprocessing_params = merge_dict(
        default_preprocessing_parameters,
        definition['preprocessing']
    )

    if evaluate_performance:
        output_features = definition['output_features']
    else:
        output_features = []
    features = definition['input_features'] + output_features

    # Prefer an already preprocessed hdf5 sitting next to the CSV.
    data_hdf5_fp = None
    if data_csv is not None:
        data_hdf5_fp = replace_file_extension(data_csv, 'hdf5')
        if os.path.isfile(data_hdf5_fp):
            logger.info('Found hdf5 with the same filename of the csv, '
                        'using it instead')
            data_csv = None
            data_hdf5 = data_hdf5_fp

    train_set_metadata = load_metadata(train_set_metadata)

    if data_hdf5 is None:
        # CSV input: the whole file is preprocessed regardless of `split`.
        dataset, train_set_metadata = build_dataset(
            data_csv,
            features,
            preprocessing_params,
            train_set_metadata=train_set_metadata
        )
    elif split == 'full':
        dataset = load_data(
            data_hdf5,
            definition['input_features'],
            output_features,
            split_data=False,
            shuffle_training=False
        )
    else:
        training, test, validation = load_data(
            data_hdf5,
            definition['input_features'],
            output_features,
            shuffle_training=False
        )
        if split == 'training':
            dataset = training
        elif split == 'validation':
            dataset = validation
        else:  # anything else is treated as 'test'
            dataset = test

    replace_text_feature_level(features, [dataset])

    wrapped = Dataset(
        dataset,
        definition['input_features'],
        output_features,
        data_hdf5_fp,
    )
    return wrapped, train_set_metadata
def create_dataset(self, dataset, tag, config, training_set_metadata):
    """Wrap ``dataset`` in a ``Dataset`` object.

    :param dataset: the data to wrap
    :param tag: accepted but not used by this implementation
    :param config: passed to ``get_proc_features`` to obtain the features
    :param training_set_metadata: mapping from which the
           ``DATA_TRAIN_HDF5_FP`` entry is read (``None`` when absent)
    :returns: the constructed ``Dataset``
    """
    proc_features = get_proc_features(config)
    train_hdf5_fp = training_set_metadata.get(DATA_TRAIN_HDF5_FP)
    return Dataset(dataset, proc_features, train_hdf5_fp)
def preprocess_for_training_by_type(
        model_definition,
        data_type,
        all_data_fp=None,
        train_fp=None,
        validation_fp=None,
        test_fp=None,
        all_data_df=None,
        train_df=None,
        validation_df=None,
        test_df=None,
        train_set_metadata_json=None,
        skip_save_processed_input=False,
        preprocessing_params=default_preprocessing_parameters,
        random_seed=default_random_seed
):
    """Build the training, validation and test datasets for one data type.

    Depending on ``data_type`` the data is preprocessed from dataframes
    (``'pandas'``), preprocessed from CSV files (``'csv'``) or loaded from
    already preprocessed hdf5 files (``'hdf5'``). For CSV input, when hdf5
    and json files with the same names already exist next to the CSVs, the
    function calls itself with ``data_type='hdf5'`` to reuse them.

    :param model_definition: model definition dictionary; its
           ``'data_hdf5_fp'`` entry is set when ``data_type`` is ``'csv'``
    :param data_type: one of ``'hdf5'``, ``'csv'`` or ``'pandas'``
    :param all_data_fp: path of a single file containing all the data
    :param train_fp: path of the training file (exclusive with
           ``all_data_fp``)
    :param validation_fp: path of the validation file
    :param test_fp: path of the test file
    :param all_data_df: dataframe containing all the data
    :param train_df: dataframe containing the training data
    :param validation_df: dataframe containing the validation data
    :param test_df: dataframe containing the test data
    :param train_set_metadata_json: train set metadata json file; required
           when ``data_type`` is ``'hdf5'``
    :param skip_save_processed_input: forwarded to the CSV preprocessing
    :param preprocessing_params: preprocessing parameters
    :param random_seed: random seed forwarded to the preprocessing
    :returns: tuple ``(training_dataset, validation_dataset, test_dataset,
           train_set_metadata)``; validation and test are ``None`` when the
           corresponding split is not available
    :raises ValueError: if both ``all_data_fp`` and ``train_fp`` are given,
           if ``data_type`` is not supported, or if hdf5 data is given
           without train set metadata
    :raises RuntimeError: if hdf5 data is requested but neither
           ``all_data_fp`` nor ``train_fp`` is provided
    """
    if all_data_fp is not None and train_fp is not None:
        raise ValueError('Use either one file for all data or 3 files for '
                         'train, test and validation')

    if data_type not in ['hdf5', 'csv', 'pandas']:
        raise ValueError('Invalid type of data provided')

    input_features = model_definition['input_features']
    output_features = model_definition['output_features']
    features = input_features + output_features

    data_hdf5_fp = None

    if data_type == 'pandas':
        # Preprocess data frames
        (
            training_set,
            test_set,
            validation_set,
            train_set_metadata
        ) = _preprocess_df_for_training(
            features,
            all_data_df,
            train_df,
            validation_df,
            test_df,
            train_set_metadata_json=train_set_metadata_json,
            preprocessing_params=preprocessing_params,
            random_seed=random_seed
        )
    elif data_type == 'hdf5' and train_set_metadata_json is None:
        raise ValueError('train set metadata file is not found along with hdf5 '
                         'data')
    elif data_type == 'hdf5':
        if all_data_fp is not None:
            data_hdf5_fp = replace_file_extension(all_data_fp, 'hdf5')
            logger.info('Using full hdf5 and json')
            training_set, test_set, validation_set = load_data(
                all_data_fp,
                input_features,
                output_features,
                shuffle_training=True
            )
            train_set_metadata = load_metadata(train_set_metadata_json)
        elif train_fp is not None:
            logger.info('Using hdf5 and json')
            training_set = load_data(
                train_fp,
                input_features,
                output_features,
                split_data=False
            )
            train_set_metadata = load_metadata(train_set_metadata_json)

            validation_set = None
            if validation_fp is not None:
                validation_set = load_data(
                    validation_fp,
                    input_features,
                    output_features,
                    split_data=False
                )

            test_set = None
            if test_fp is not None:
                test_set = load_data(
                    test_fp,
                    input_features,
                    output_features,
                    split_data=False
                )
        else:
            # Without this the code below would fail on unbound locals.
            raise RuntimeError('Insufficient input parameters')
    elif data_type == 'csv':
        # Only derive the hdf5 path when a single data file is given:
        # with separate train/validation/test files all_data_fp is None.
        if all_data_fp is not None:
            data_hdf5_fp = replace_file_extension(all_data_fp, 'hdf5')
        model_definition['data_hdf5_fp'] = data_hdf5_fp

        if all_data_fp is not None:
            if (file_exists_with_diff_extension(all_data_fp, 'hdf5') and
                    file_exists_with_diff_extension(all_data_fp, 'json')):
                # use hdf5 data instead
                logger.info(
                    'Found hdf5 and json with the same filename '
                    'of the csv, using them instead'
                )
                return preprocess_for_training_by_type(
                    model_definition,
                    'hdf5',
                    all_data_fp=replace_file_extension(all_data_fp, 'hdf5'),
                    train_set_metadata_json=replace_file_extension(
                        all_data_fp, 'json'
                    ),
                    skip_save_processed_input=skip_save_processed_input,
                    preprocessing_params=preprocessing_params,
                    random_seed=random_seed
                )
            (
                training_set,
                test_set,
                validation_set,
                train_set_metadata
            ) = _preprocess_csv_for_training(
                features=features,
                data_csv=all_data_fp,
                data_train_csv=None,
                data_validation_csv=None,
                data_test_csv=None,
                train_set_metadata_json=train_set_metadata_json,
                skip_save_processed_input=skip_save_processed_input,
                preprocessing_params=preprocessing_params,
                random_seed=random_seed
            )
        else:
            if (file_exists_with_diff_extension(train_fp, 'hdf5') and
                    file_exists_with_diff_extension(train_fp, 'json') and
                    file_exists_with_diff_extension(validation_fp, 'hdf5') and
                    file_exists_with_diff_extension(test_fp, 'hdf5')):
                logger.info(
                    'Found hdf5 and json with the same filename '
                    'of the csvs, using them instead.'
                )
                return preprocess_for_training_by_type(
                    model_definition,
                    'hdf5',
                    train_fp=replace_file_extension(train_fp, 'hdf5'),
                    validation_fp=replace_file_extension(
                        validation_fp, 'hdf5'
                    ),
                    test_fp=replace_file_extension(test_fp, 'hdf5'),
                    # The metadata json checked above sits next to the
                    # training csv; all_data_fp is None in this branch.
                    train_set_metadata_json=replace_file_extension(
                        train_fp, 'json'
                    ),
                    skip_save_processed_input=skip_save_processed_input,
                    preprocessing_params=preprocessing_params,
                    random_seed=random_seed
                )
            (
                training_set,
                test_set,
                validation_set,
                train_set_metadata
            ) = _preprocess_csv_for_training(
                features=features,
                data_csv=None,
                data_train_csv=train_fp,
                data_validation_csv=validation_fp,
                data_test_csv=test_fp,
                train_set_metadata_json=train_set_metadata_json,
                skip_save_processed_input=skip_save_processed_input,
                preprocessing_params=preprocessing_params,
                random_seed=random_seed
            )
    else:
        # Not reachable after the data_type check above; kept as a safeguard.
        raise RuntimeError('Insufficient input parameters')

    replace_text_feature_level(
        input_features + output_features,
        [training_set, validation_set, test_set]
    )

    training_dataset = Dataset(
        training_set,
        input_features,
        output_features,
        data_hdf5_fp
    )

    validation_dataset = None
    if validation_set is not None:
        validation_dataset = Dataset(
            validation_set,
            input_features,
            output_features,
            data_hdf5_fp
        )

    test_dataset = None
    if test_set is not None:
        test_dataset = Dataset(
            test_set,
            input_features,
            output_features,
            data_hdf5_fp
        )

    return (
        training_dataset,
        validation_dataset,
        test_dataset,
        train_set_metadata
    )
def train_online(
        self,
        data_df=None,
        data_csv=None,
        data_dict=None,
        batch_size=None,
        learning_rate=None,
        regularization_lambda=None,
        dropout_rate=None,
        bucketing_field=None,
        gpus=None,
        gpu_fraction=1,
        logging_level=logging.ERROR,
):
    """This function is used to perform one epoch of training of the model
    on the specified dataset.

    There are three ways to provide data: by dataframes using the `data_df`
    parameter, by CSV using the `data_csv` parameter and by dictionary,
    using the `data_dict` parameter. `data_csv` and `data_dict` are only
    read when `data_df` is not provided.

    # Inputs

    :param data_df: (DataFrame) dataframe containing data.
    :param data_csv: (string) input data CSV file.
    :param data_dict: (dict) input data dictionary. It is expected to
           contain one key for each field and the values have to be lists
           of the same length. Each index in the lists corresponds to one
           datapoint. For example a data set consisting of two datapoints
           with a text and a class may be provided as the following dict
           `{'text_field_name': ['text of the first datapoint',
           'text of the second datapoint'], 'class_field_name':
           ['class_datapoints_1', 'class_datapoints_2']}`.
    :param batch_size: (int) the batch size to use for training. By default
           it's the one specified in the model definition.
    :param learning_rate: (float) the learning rate to use for training. By
           default the value is the one specified in the model definition.
    :param regularization_lambda: (float) the regularization lambda
           parameter to use for training. By default the value is the one
           specified in the model definition.
    :param dropout_rate: (float) the dropout rate to use for training. By
           default the value is the one specified in the model definition.
    :param bucketing_field: (string) the bucketing field to use for
           bucketing the data. By default the value is the one specified
           in the model definition.
    :param gpus: (string, default: `None`) list of GPUs to use (it uses the
           same syntax of CUDA_VISIBLE_DEVICES)
    :param gpu_fraction: (float, default `1.0`) fraction of GPU memory to
           initialize the process with
    :param logging_level: (int, default: `logging.ERROR`) logging level to
           use for logging. Use logging constants like `logging.DEBUG`,
           `logging.INFO` and `logging.ERROR`. By default only errors will
           be printed.

    # Raises

    :raises ValueError: if the model, its definition or the train set
            metadata are missing.
    """
    logging.getLogger().setLevel(logging_level)
    if logging_level in {logging.WARNING, logging.ERROR, logging.CRITICAL}:
        set_disable_progressbar(True)

    if (self.model is None or self.model_definition is None
            or self.train_set_metadata is None):
        raise ValueError('Model has not been initialized or loaded')

    if data_df is None:
        data_df = self._read_data(data_csv, data_dict)
        # NOTE(review): the collapsed source does not show whether this
        # assignment sat inside the `if`; it is kept here so that a
        # caller-provided dataframe is not modified - confirm.
        data_df.csv = data_csv

    training_params = self.model_definition['training']
    if batch_size is None:
        batch_size = training_params['batch_size']
    if learning_rate is None:
        learning_rate = training_params['learning_rate']
    if regularization_lambda is None:
        regularization_lambda = training_params['regularization_lambda']
    if dropout_rate is None:
        # A trailing comma here used to turn the default into a 1-tuple.
        dropout_rate = training_params['dropout_rate']
    if bucketing_field is None:
        bucketing_field = training_params['bucketing_field']

    logging.debug('Preprocessing {} datapoints'.format(len(data_df)))
    features_to_load = (self.model_definition['input_features'] +
                        self.model_definition['output_features'])
    preprocessed_data = build_data(
        data_df,
        features_to_load,
        self.train_set_metadata,
        self.model_definition['preprocessing']
    )
    replace_text_feature_level(
        self.model_definition['input_features'] +
        self.model_definition['output_features'],
        [preprocessed_data]
    )
    dataset = Dataset(
        preprocessed_data,
        self.model_definition['input_features'],
        self.model_definition['output_features'],
        None
    )

    logging.debug('Training batch')
    self.model.train_online(
        dataset,
        batch_size=batch_size,
        learning_rate=learning_rate,
        regularization_lambda=regularization_lambda,
        dropout_rate=dropout_rate,
        bucketing_field=bucketing_field,
        gpus=gpus,
        gpu_fraction=gpu_fraction
    )
def _predict( self, data_df=None, data_csv=None, data_dict=None, return_type=pd.DataFrame, batch_size=128, gpus=None, gpu_fraction=1, only_predictions=True, logging_level=logging.ERROR, ): logging.getLogger().setLevel(logging_level) if logging_level in {logging.WARNING, logging.ERROR, logging.CRITICAL}: set_disable_progressbar(True) if (self.model is None or self.model_definition is None or self.train_set_metadata is None): raise ValueError('Model has not been trained or loaded') if data_df is None: data_df = self._read_data(data_csv, data_dict) logging.debug('Preprocessing {} datapoints'.format(len(data_df))) features_to_load = self.model_definition['input_features'] if not only_predictions: features_to_load += self.model_definition['output_features'] preprocessed_data = build_data(data_df, features_to_load, self.train_set_metadata, self.model_definition['preprocessing']) replace_text_feature_level( self.model_definition['input_features'] + ([] if only_predictions else self.model_definition['output_features']), [preprocessed_data]) dataset = Dataset(preprocessed_data, self.model_definition['input_features'], [] if only_predictions else self.model_definition['output_features'], None) logging.debug('Predicting') predict_results = self.model.predict(dataset, batch_size, only_predictions=only_predictions, gpus=gpus, gpu_fraction=gpu_fraction, session=getattr( self.model, 'session', None)) if not only_predictions: calculate_overall_stats(predict_results, self.model_definition['output_features'], dataset, self.train_set_metadata) logging.debug('Postprocessing') if (return_type == 'dict' or return_type == 'dictionary' or return_type == dict): postprocessed_predictions = postprocess( predict_results, self.model_definition['output_features'], self.train_set_metadata) elif (return_type == 'dataframe' or return_type == 'df' or return_type == pd.DataFrame): postprocessed_predictions = postprocess_df( predict_results, self.model_definition['output_features'], 
self.train_set_metadata) else: logging.warning('Unrecognized return_type: {}. ' 'Returning DataFrame.'.format(return_type)) postprocessed_predictions = postprocess( predict_results, self.model_definition['output_features'], self.train_set_metadata) return postprocessed_predictions, predict_results
def preprocess_for_training(
        model_definition,
        dataset_type='generic',
        data_df=None,
        data_train_df=None,
        data_validation_df=None,
        data_test_df=None,
        data_csv=None,
        data_train_csv=None,
        data_validation_csv=None,
        data_test_csv=None,
        data_hdf5=None,
        data_train_hdf5=None,
        data_validation_hdf5=None,
        data_test_hdf5=None,
        train_set_metadata_json=None,
        skip_save_processed_input=False,
        preprocessing_params=default_preprocessing_parameters,
        random_seed=default_random_seed):
    """Produce the training, validation and test datasets for training.

    Data can come from dataframes, CSV files or preprocessed hdf5 files.
    For every CSV given, an hdf5 file with the same name next to it (plus a
    json for the full / training CSV) takes its place when present.
    Dataframes have priority over CSVs, which have priority over hdf5.

    ``model_definition['data_hdf5_fp']`` is set to the hdf5 path derived
    from ``data_csv`` (``None`` when ``data_csv`` is not given).

    :returns: tuple ``(training_dataset, validation_dataset, test_dataset,
              train_set_metadata)``; validation and test are ``None`` when
              the corresponding split is missing
    :raises ValueError: if no training data source is provided
    :raises RuntimeError: if the provided sources are insufficient
    """
    # Sanity check: at least one source of training data is required.
    candidate_sources = (
        data_df, data_train_df,
        data_csv, data_train_csv,
        data_hdf5, data_train_hdf5
    )
    if all(source is None for source in candidate_sources):
        raise ValueError('No training data is provided!')

    # Swap each CSV for its cached hdf5 (and json) when they exist.
    data_hdf5_fp = None

    if data_csv is not None:
        data_hdf5_fp = replace_file_extension(data_csv, 'hdf5')
        metadata_fp = replace_file_extension(data_csv, 'json')
        if os.path.isfile(data_hdf5_fp) and os.path.isfile(metadata_fp):
            logging.info('Found hdf5 and json with the same filename '
                         'of the csv, using them instead')
            data_csv = None
            data_hdf5 = data_hdf5_fp
            train_set_metadata_json = metadata_fp

    if data_train_csv is not None:
        train_hdf5_fp = replace_file_extension(data_train_csv, 'hdf5')
        metadata_fp = replace_file_extension(
            data_train_csv,
            'json',
        )
        if os.path.isfile(train_hdf5_fp) and os.path.isfile(metadata_fp):
            logging.info('Found hdf5 and json with the same filename of '
                         'the train csv, using them instead')
            data_train_csv = None
            data_train_hdf5 = train_hdf5_fp
            train_set_metadata_json = metadata_fp

    if data_validation_csv is not None:
        validation_hdf5_fp = replace_file_extension(
            data_validation_csv, 'hdf5')
        if os.path.isfile(validation_hdf5_fp):
            logging.info('Found hdf5 with the same filename of '
                         'the validation csv, using it instead')
            data_validation_csv = None
            data_validation_hdf5 = validation_hdf5_fp

    if data_test_csv is not None:
        test_hdf5_fp = replace_file_extension(data_test_csv, 'hdf5')
        if os.path.isfile(test_hdf5_fp):
            logging.info('Found hdf5 with the same filename of '
                         'the test csv, using it instead')
            data_test_csv = None
            data_test_hdf5 = test_hdf5_fp

    model_definition['data_hdf5_fp'] = data_hdf5_fp

    in_features = model_definition['input_features']
    out_features = model_definition['output_features']
    features = in_features + out_features

    # The returned functions are not used below; the call is kept as is.
    _, _, _, _ = get_dataset_fun(dataset_type)

    have_metadata = train_set_metadata_json is not None

    # Decide whether to preprocess or just load.
    if data_df is not None or data_train_df is not None:
        # Preprocess data frames
        (training_set,
         test_set,
         validation_set,
         train_set_metadata) = _preprocess_df_for_training(
            features,
            data_df,
            data_train_df,
            data_validation_df,
            data_test_df,
            preprocessing_params,
            random_seed)
    elif data_csv is not None or data_train_csv is not None:
        # Preprocess csv data
        (training_set,
         test_set,
         validation_set,
         train_set_metadata) = _preprocess_csv_for_training(
            features,
            data_csv,
            data_train_csv,
            data_validation_csv,
            data_test_csv,
            skip_save_processed_input,
            preprocessing_params,
            random_seed)
    elif data_hdf5 is not None and have_metadata:
        # Already preprocessed: a single hdf5 holding all the splits.
        logging.info('Using full hdf5 and json')
        training_set, test_set, validation_set = load_data(
            data_hdf5,
            in_features,
            out_features,
            shuffle_training=True)
        train_set_metadata = load_metadata(train_set_metadata_json)
    elif data_train_hdf5 is not None and have_metadata:
        # Already preprocessed: one hdf5 per split.
        logging.info('Using hdf5 and json')
        training_set = load_data(
            data_train_hdf5,
            in_features,
            out_features,
            split_data=False)
        train_set_metadata = load_metadata(train_set_metadata_json)

        validation_set = None
        if data_validation_hdf5 is not None:
            validation_set = load_data(
                data_validation_hdf5,
                in_features,
                out_features,
                split_data=False)

        test_set = None
        if data_test_hdf5 is not None:
            test_set = load_data(
                data_test_hdf5,
                in_features,
                out_features,
                split_data=False)
    else:
        raise RuntimeError('Insufficient input parameters')

    replace_text_feature_level(
        in_features + out_features,
        [training_set, validation_set, test_set])

    training_dataset = Dataset(
        training_set, in_features, out_features, data_hdf5_fp)

    if validation_set is None:
        validation_dataset = None
    else:
        validation_dataset = Dataset(
            validation_set, in_features, out_features, data_hdf5_fp)

    if test_set is None:
        test_dataset = None
    else:
        test_dataset = Dataset(
            test_set, in_features, out_features, data_hdf5_fp)

    return (training_dataset,
            validation_dataset,
            test_dataset,
            train_set_metadata)
def _predict( self, data_df=None, data_csv=None, data_dict=None, return_type=pd.DataFrame, batch_size=128, evaluate_performance=False, skip_save_unprocessed_output=False, gpus=None, gpu_fraction=1, ): if (self.model is None or self.model_definition is None or self.train_set_metadata is None): raise ValueError('Model has not been trained or loaded') if data_df is None: data_df = self._read_data(data_csv, data_dict) logger.debug('Preprocessing {} datapoints'.format(len(data_df))) # Added [:] to next line, before I was just assigning, # this way I'm copying the list. If you don't do it, you are actually # modifying the input feature list when you add output features, # which you definitely don't want to do features_to_load = self.model_definition['input_features'][:] if evaluate_performance: output_features = self.model_definition['output_features'] else: output_features = [] features_to_load += output_features num_overrides = override_in_memory_flag( self.model_definition['input_features'], True) if num_overrides > 0: logger.warning( 'Using in_memory = False is not supported for Ludwig API.') preprocessed_data = build_data(data_df, features_to_load, self.train_set_metadata, self.model_definition['preprocessing']) replace_text_feature_level(features_to_load, [preprocessed_data]) dataset = Dataset(preprocessed_data, self.model_definition['input_features'], output_features, None) logger.debug('Predicting') predict_results = self.model.predict( dataset, batch_size, evaluate_performance=evaluate_performance, gpus=gpus, gpu_fraction=gpu_fraction, session=getattr(self.model, 'session', None)) if evaluate_performance: calculate_overall_stats(predict_results, self.model_definition['output_features'], dataset, self.train_set_metadata) logger.debug('Postprocessing') if (return_type == 'dict' or return_type == 'dictionary' or return_type == dict): postprocessed_predictions = postprocess( predict_results, self.model_definition['output_features'], self.train_set_metadata, 
experiment_dir_name=self.exp_dir_name, skip_save_unprocessed_output=skip_save_unprocessed_output, ) elif (return_type == 'dataframe' or return_type == 'df' or return_type == pd.DataFrame): postprocessed_predictions = postprocess_df( predict_results, self.model_definition['output_features'], self.train_set_metadata, experiment_dir_name=self.exp_dir_name, skip_save_unprocessed_output=skip_save_unprocessed_output, ) else: logger.warning('Unrecognized return_type: {}. ' 'Returning DataFrame.'.format(return_type)) postprocessed_predictions = postprocess( predict_results, self.model_definition['output_features'], self.train_set_metadata, experiment_dir_name=self.exp_dir_name, skip_save_unprocessed_output=skip_save_unprocessed_output, ) return postprocessed_predictions, predict_results
def preprocess_for_prediction(
        model_path,
        split,
        data_csv=None,
        data_hdf5=None,
        train_set_metadata=None,
        evaluate_performance=True
):
    """Preprocesses the dataset to parse it into a format that is usable by
    the Ludwig core

    The model definition is read from the hyperparameters file found in
    ``model_path``. Any input feature whose preprocessing sets
    ``in_memory`` to a falsy value is overridden to ``True`` (a warning is
    logged for each one).

    :param model_path: directory containing the model hyperparameters file
    :type model_path: Str
    :param split: ``FULL`` to use all the data; otherwise ``TRAINING`` or
           ``VALIDATION`` select that split and any other value selects
           the test split. When the data comes from a CSV without a split
           column, the full data is used and a warning is logged.
    :param data_csv: The CSV input data file. If an hdf5 file with the same
           name exists next to it, that file is used instead.
    :param data_hdf5: The hdf5 data file if there is no csv data file
    :param train_set_metadata: passed to ``load_metadata`` to obtain the
           train set metadata for the input features
    :param evaluate_performance: If False does not load output features
    :returns: Dataset, Train set metadata
    """
    model_definition = load_json(
        os.path.join(model_path, MODEL_HYPERPARAMETERS_FILE_NAME)
    )
    input_features = model_definition['input_features']

    # Prediction requires every input feature to be held in memory.
    for input_feature in input_features:
        if ('preprocessing' in input_feature
                and 'in_memory' in input_feature['preprocessing']
                and not input_feature['preprocessing']['in_memory']):
            logger.warning(
                'WARNING: When running predict in_memory flag should '
                'be true. Overriding and setting it to true for '
                'feature <{}>'.format(input_feature['name'])
            )
            input_feature['preprocessing']['in_memory'] = True

    preprocessing_params = merge_dict(
        default_preprocessing_parameters,
        model_definition['preprocessing']
    )
    output_features = (
        model_definition['output_features'] if evaluate_performance else []
    )
    features = input_features + output_features

    # Check if hdf5 file already exists
    if data_csv is not None:
        data_hdf5_fp = replace_file_extension(data_csv, 'hdf5')
        if os.path.isfile(data_hdf5_fp):
            logger.info('Found hdf5 with the same filename of the csv, '
                        'using it instead')
            data_csv = None
            data_hdf5 = data_hdf5_fp

    def _pick_split(training_set, test_set, validation_set):
        # Anything that is not TRAINING or VALIDATION is treated as TEST.
        if split == TRAINING:
            return training_set
        if split == VALIDATION:
            return validation_set
        return test_set

    # Load data
    train_set_metadata = load_metadata(train_set_metadata)
    if split == FULL:
        if data_hdf5 is not None:
            dataset = load_data(
                data_hdf5,
                input_features,
                output_features,
                split_data=False,
                shuffle_training=False
            )
        else:
            dataset, train_set_metadata = build_dataset(
                data_csv,
                features,
                preprocessing_params,
                train_set_metadata=train_set_metadata
            )
    else:
        if data_hdf5 is not None:
            training_set, test_set, validation_set = load_data(
                data_hdf5,
                input_features,
                output_features,
                shuffle_training=False
            )
            dataset = _pick_split(training_set, test_set, validation_set)
        else:
            dataset, train_set_metadata = build_dataset(
                data_csv,
                features,
                preprocessing_params,
                train_set_metadata=train_set_metadata
            )
            # build_dataset adds a split column if there is none in the csv
            # so if we want to check if the csv contained a split column
            # we have to check in the csv not in the built dataset.
            # The logic is that if there is no split in the original csv
            # we treat the split parameter as if it was == full
            if csv_contains_column(data_csv, SPLIT):
                training_set, test_set, validation_set = split_dataset_tvt(
                    dataset,
                    dataset[SPLIT]
                )
                dataset = _pick_split(training_set, test_set, validation_set)
            else:
                # The placeholder used to be left unformatted.
                logger.warning(
                    'You requested the {} split, but the data CSV '
                    'does not contain a "split" column, so the '
                    'full data will be used instead'.format(split)
                )

    replace_text_feature_level(
        features,
        [dataset]
    )

    dataset = Dataset(
        dataset,
        input_features,
        output_features,
        train_set_metadata.get(DATA_TRAIN_HDF5_FP)
    )

    return dataset, train_set_metadata
def preprocess_for_prediction(model_path,
                              split,
                              dataset_type='generic',
                              data_csv=None,
                              data_hdf5=None,
                              train_set_metadata=None,
                              only_predictions=False):
    """Load or build the dataset that a saved model will predict on.

    :param model_path: directory containing the model hyperparameters file
    :type model_path: Str
    :param split: ``'full'`` to use all the data; otherwise ``'training'``
           or ``'validation'`` select that split and any other value selects
           the test split. The split is only applied when reading from hdf5;
           data built from a CSV is always used in full.
    :param dataset_type: passed to ``get_dataset_fun`` to select the
           function used to build the dataset
    :type dataset_type: Str
    :param data_csv: path of the CSV input data file. If an hdf5 file with
           the same name exists next to it, that file is used instead.
    :param data_hdf5: path of the hdf5 data file, used when no CSV is given
    :param train_set_metadata: passed to ``load_metadata`` to obtain the
           train set metadata
    :param only_predictions: if True, output features are not loaded
    :returns: Dataset, Train set metadata
    """
    hyperparameters_fp = os.path.join(model_path,
                                      MODEL_HYPERPARAMETERS_FILE_NAME)
    model_definition = load_json(hyperparameters_fp)
    preprocessing_params = merge_dict(default_preprocessing_parameters,
                                      model_definition['preprocessing'])

    # Prefer an already preprocessed hdf5 sitting next to the CSV.
    if data_csv is not None:
        candidate_hdf5 = os.path.splitext(data_csv)[0] + '.hdf5'
        if os.path.isfile(candidate_hdf5):
            logging.info(
                'Found hdf5 with the same filename of the csv, using it instead'
            )
            data_csv = None
            data_hdf5 = candidate_hdf5

    _, _, build_dataset, _ = get_dataset_fun(dataset_type)
    train_set_metadata = load_metadata(train_set_metadata)

    input_features = model_definition['input_features']
    if only_predictions:
        output_features = []
    else:
        output_features = model_definition['output_features']
    features = input_features + output_features

    if data_hdf5 is None:
        # CSV input: the whole file is preprocessed regardless of `split`.
        dataset, train_set_metadata = build_dataset(
            data_csv,
            features,
            preprocessing_params,
            train_set_metadata=train_set_metadata)
    elif split == 'full':
        dataset = load_data(data_hdf5,
                            input_features,
                            output_features,
                            split_data=False,
                            shuffle_training=False)
    else:
        training, test, validation = load_data(
            data_hdf5,
            input_features,
            output_features,
            shuffle_training=False)
        if split == 'training':
            dataset = training
        elif split == 'validation':
            dataset = validation
        else:  # anything else is treated as 'test'
            dataset = test

    replace_text_feature_level(model_definition, [dataset])

    wrapped = Dataset(
        dataset,
        input_features,
        output_features,
        data_hdf5,
    )
    return wrapped, train_set_metadata
def preprocess_for_training(
        model_definition,
        dataset_type='generic',
        data_df=None,
        data_train_df=None,
        data_validation_df=None,
        data_test_df=None,
        data_csv=None,
        data_train_csv=None,
        data_validation_csv=None,
        data_test_csv=None,
        data_hdf5=None,
        data_train_hdf5=None,
        data_validation_hdf5=None,
        data_test_hdf5=None,
        train_set_metadata_json=None,
        skip_save_processed_input=False,
        preprocessing_params=default_preprocessing_parameters,
        random_seed=default_random_seed):
    """Builds or loads the training, validation and test datasets.

    The first available data source is used, in this order: `data_df`,
    `data_train_df`, `data_csv`, `data_train_csv`, `data_hdf5` (with
    `train_set_metadata_json`), `data_train_hdf5` (with
    `train_set_metadata_json`). For every CSV argument, if an hdf5 file
    (and, for `data_csv` / `data_train_csv`, also a json file) with the same
    base name exists, those files are used in place of the CSV.

    As a side effect `model_definition['data_hdf5_fp']` is set to the hdf5
    path derived from `data_csv` (None if `data_csv` was not provided).

    :param model_definition: Model definition dictionary; its
           'input_features' and 'output_features' are read
    :param dataset_type: Passed to `get_dataset_fun` to select the
           concatenate / build functions
    :param data_df: Dataframe with the full data
    :param data_train_df: Dataframe with the training data
    :param data_validation_df: Dataframe with the validation data
    :param data_test_df: Dataframe with the test data
    :param data_csv: CSV file with the full data
    :param data_train_csv: CSV file with the training data
    :param data_validation_csv: CSV file with the validation data
    :param data_test_csv: CSV file with the test data
    :param data_hdf5: hdf5 file with the full preprocessed data
    :param data_train_hdf5: hdf5 file with the preprocessed training data
    :param data_validation_hdf5: hdf5 file with the preprocessed
           validation data
    :param data_test_hdf5: hdf5 file with the preprocessed test data
    :param train_set_metadata_json: json file with the train set metadata,
           needed when loading from hdf5
    :param skip_save_processed_input: If True the preprocessed data and the
           train set metadata are not written to disk
    :param preprocessing_params: Preprocessing parameters handed to the
           dataset building function
    :param random_seed: Random seed handed to the dataset building function
    :returns: Tuple (training dataset, validation dataset, test dataset,
           train set metadata); validation and test are None when absent
    :raises RuntimeError: If none of the data sources above is provided
    """
    # Check if hdf5 and json already exist
    data_hdf5_fp = None
    data_train_hdf5_fp = None
    data_validation_hdf5_fp = None
    data_test_hdf5_fp = None
    # Fallback location of the metadata when no CSV path is available
    train_set_metadata_json_fp = 'metadata.json'

    if data_csv is not None:
        data_hdf5_fp = os.path.splitext(data_csv)[0] + '.hdf5'
        train_set_metadata_json_fp = os.path.splitext(data_csv)[0] + '.json'
        if (os.path.isfile(data_hdf5_fp) and
                os.path.isfile(train_set_metadata_json_fp)):
            logging.info('Found hdf5 and json with the same filename '
                         'of the csv, using them instead')
            data_csv = None
            data_hdf5 = data_hdf5_fp
            train_set_metadata_json = train_set_metadata_json_fp

    if data_train_csv is not None:
        data_train_hdf5_fp = os.path.splitext(data_train_csv)[0] + '.hdf5'
        train_set_metadata_json_fp = os.path.splitext(
            data_train_csv)[0] + '.json'
        if (os.path.isfile(data_train_hdf5_fp) and
                os.path.isfile(train_set_metadata_json_fp)):
            logging.info('Found hdf5 and json with the same filename of '
                         'the train csv, using them instead')
            data_train_csv = None
            data_train_hdf5 = data_train_hdf5_fp
            train_set_metadata_json = train_set_metadata_json_fp

    if data_validation_csv is not None:
        data_validation_hdf5_fp = os.path.splitext(
            data_validation_csv)[0] + '.hdf5'
        if os.path.isfile(data_validation_hdf5_fp):
            logging.info('Found hdf5 with the same filename of '
                         'the validation csv, using it instead')
            data_validation_csv = None
            data_validation_hdf5 = data_validation_hdf5_fp

    if data_test_csv is not None:
        data_test_hdf5_fp = os.path.splitext(data_test_csv)[0] + '.hdf5'
        if os.path.isfile(data_test_hdf5_fp):
            logging.info('Found hdf5 with the same filename of '
                         'the test csv, using it instead')
            data_test_csv = None
            data_test_hdf5 = data_test_hdf5_fp

    model_definition['data_hdf5_fp'] = data_hdf5_fp

    # Decide if to preprocess or just load
    features = (model_definition['input_features'] +
                model_definition['output_features'])
    (concatenate_csv,
     concatenate_df,
     build_dataset,
     build_dataset_df) = get_dataset_fun(dataset_type)

    if data_df is not None:
        # needs preprocessing
        logging.info('Using full dataframe')
        logging.info('Building dataset (it may take a while)')
        data, train_set_metadata = build_dataset_df(
            data_df,
            features,
            preprocessing_params,
            random_seed=random_seed)
        if not skip_save_processed_input:
            # NOTE(review): data_hdf5_fp is None here unless data_csv was
            # also provided -- confirm save_hdf5 copes with that
            _save_full_dataset(data, train_set_metadata, data_hdf5_fp,
                               train_set_metadata_json_fp)
        training_set, test_set, validation_set = split_dataset_tvt(
            data, data['split'])

    elif data_train_df is not None:
        # needs preprocessing
        logging.info('Using training dataframe')
        logging.info('Building dataset (it may take a while)')
        concatenated_df = concatenate_df(data_train_df,
                                         data_validation_df,
                                         data_test_df)
        data, train_set_metadata = build_dataset_df(
            concatenated_df,
            features,
            preprocessing_params,
            random_seed=random_seed)
        training_set, test_set, validation_set = split_dataset_tvt(
            data, data['split'])
        if not skip_save_processed_input:
            # NOTE(review): the hdf5 paths are None here unless the
            # matching *_csv arguments were also provided -- confirm
            _save_split_datasets(
                training_set, validation_set, test_set, train_set_metadata,
                data_train_hdf5_fp, data_validation_hdf5_fp,
                data_test_hdf5_fp, train_set_metadata_json_fp)

    elif data_csv is not None:
        # Use data and ignore _train, _validation and _test.
        # Also ignore data and train set metadata
        # needs preprocessing
        logging.info('Using full raw csv, no hdf5 and json file '
                     'with the same name have been found')
        logging.info('Building dataset (it may take a while)')
        data, train_set_metadata = build_dataset(
            data_csv,
            features,
            preprocessing_params,
            random_seed=random_seed)
        if not skip_save_processed_input:
            _save_full_dataset(data, train_set_metadata, data_hdf5_fp,
                               train_set_metadata_json_fp)
        training_set, test_set, validation_set = split_dataset_tvt(
            data, data['split'])

    elif data_train_csv is not None:
        # use data_train (including _validation and _test if they are
        # present) and ignore data and train set metadata
        # needs preprocessing
        logging.info('Using training raw csv, no hdf5 and json '
                     'file with the same name have been found')
        logging.info('Building dataset (it may take a while)')
        concatenated_df = concatenate_csv(data_train_csv,
                                          data_validation_csv,
                                          data_test_csv)
        concatenated_df.csv = data_train_csv
        data, train_set_metadata = build_dataset_df(
            concatenated_df,
            features,
            preprocessing_params,
            random_seed=random_seed)
        training_set, test_set, validation_set = split_dataset_tvt(
            data, data['split'])
        if not skip_save_processed_input:
            _save_split_datasets(
                training_set, validation_set, test_set, train_set_metadata,
                data_train_hdf5_fp, data_validation_hdf5_fp,
                data_test_hdf5_fp, train_set_metadata_json_fp)

    elif data_hdf5 is not None and train_set_metadata_json is not None:
        # use data and train set metadata
        # doesn't need preprocessing, just load
        logging.info('Using full hdf5 and json')
        training_set, test_set, validation_set = load_data(
            data_hdf5,
            model_definition['input_features'],
            model_definition['output_features'],
            shuffle_training=True)
        train_set_metadata = load_metadata(train_set_metadata_json)

    elif data_train_hdf5 is not None and train_set_metadata_json is not None:
        # use data and train set metadata
        # doesn't need preprocessing, just load
        logging.info('Using hdf5 and json')
        training_set = load_data(data_train_hdf5,
                                 model_definition['input_features'],
                                 model_definition['output_features'],
                                 split_data=False)
        train_set_metadata = load_metadata(train_set_metadata_json)
        if data_validation_hdf5 is not None:
            validation_set = load_data(data_validation_hdf5,
                                       model_definition['input_features'],
                                       model_definition['output_features'],
                                       split_data=False)
        else:
            validation_set = None
        if data_test_hdf5 is not None:
            test_set = load_data(data_test_hdf5,
                                 model_definition['input_features'],
                                 model_definition['output_features'],
                                 split_data=False)
        else:
            test_set = None

    else:
        raise RuntimeError('Insufficient input parameters')

    replace_text_feature_level(model_definition,
                               [training_set, validation_set, test_set])

    training_dataset = Dataset(training_set,
                               model_definition['input_features'],
                               model_definition['output_features'],
                               data_hdf5_fp)

    validation_dataset = None
    if validation_set is not None:
        validation_dataset = Dataset(validation_set,
                                     model_definition['input_features'],
                                     model_definition['output_features'],
                                     data_hdf5_fp)

    test_dataset = None
    if test_set is not None:
        test_dataset = Dataset(test_set,
                               model_definition['input_features'],
                               model_definition['output_features'],
                               data_hdf5_fp)

    return (training_dataset,
            validation_dataset,
            test_dataset,
            train_set_metadata)


def _save_full_dataset(data, train_set_metadata, hdf5_fp, metadata_json_fp):
    """Writes the whole preprocessed dataset and its metadata to disk."""
    logging.info('Writing dataset')
    data_utils.save_hdf5(hdf5_fp, data, train_set_metadata)
    logging.info('Writing train set metadata with vocabulary')
    data_utils.save_json(metadata_json_fp, train_set_metadata)


def _save_split_datasets(training_set, validation_set, test_set,
                         train_set_metadata, train_hdf5_fp,
                         validation_hdf5_fp, test_hdf5_fp,
                         metadata_json_fp):
    """Writes each available split to its own hdf5, then the metadata."""
    logging.info('Writing dataset')
    data_utils.save_hdf5(train_hdf5_fp, training_set, train_set_metadata)
    if validation_set is not None:
        data_utils.save_hdf5(validation_hdf5_fp, validation_set,
                             train_set_metadata)
    if test_set is not None:
        data_utils.save_hdf5(test_hdf5_fp, test_set, train_set_metadata)
    logging.info('Writing train set metadata with vocabulary')
    data_utils.save_json(metadata_json_fp, train_set_metadata)
def train_online(self, data_df=None, data_csv=None, data_dict=None,
                 batch_size=None, learning_rate=None,
                 regularization_lambda=None, bucketing_field=None):
    """Performs one online training call of the model on the given data.

    # Inputs

    :param data_df: (DataFrame) dataframe containing data.
    :param data_csv: (string) input data CSV file.
    :param data_dict: (dict) input data dictionary. It is expected to
           contain one key for each field and the values have to be lists
           of the same length. Each index in the lists corresponds to one
           datapoint. For example a data set consisting of two datapoints
           with a text and a class may be provided as the following dict
           ``{'text_field_name': ['text of the first datapoint',
           'text of the second datapoint'], 'class_field_name':
           ['class_datapoints_1', 'class_datapoints_2']}``.
    :param batch_size: (int) the batch size to use for training.
           By default it's the one specified in the model definition.
    :param learning_rate: (float) the learning rate to use for training.
           By default it's the one specified in the model definition.
    :param regularization_lambda: (float) the regularization lambda
           parameter to use for training. By default it's the one
           specified in the model definition.
    :param bucketing_field: (string) the bucketing field to use for
           bucketing the data. By default it's the one specified in the
           model definition.

    There are three ways to provide data: by dataframe using the `data_df`
    parameter, by CSV using the `data_csv` parameter and by dictionary
    using the `data_dict` parameter. When `data_df` is not given, the data
    is read from `data_csv` / `data_dict`.
    """
    is_ready = (self.model is not None and
                self.model_definition is not None and
                self.train_set_metadata is not None)
    if not is_ready:
        raise ValueError('Model has not been initialized or loaded')

    if data_df is None:
        data_df = self._read_data(data_csv, data_dict)
        data_df.csv = data_csv

    def _or_model_default(value, key):
        # The training section is only looked up when no value was given
        if value is not None:
            return value
        return self.model_definition[TRAINING][key]

    batch_size = _or_model_default(batch_size, 'batch_size')
    learning_rate = _or_model_default(learning_rate, 'learning_rate')
    regularization_lambda = _or_model_default(regularization_lambda,
                                              'regularization_lambda')
    bucketing_field = _or_model_default(bucketing_field, 'bucketing_field')

    logger.debug('Preprocessing {} datapoints'.format(len(data_df)))
    all_features = (self.model_definition['input_features'] +
                    self.model_definition['output_features'])
    processed = build_data(
        data_df,
        all_features,
        self.train_set_metadata,
        self.model_definition['preprocessing']
    )
    # A freshly built feature list is passed here, as a separate object
    # from the one given to build_data
    replace_text_feature_level(
        self.model_definition['input_features'] +
        self.model_definition['output_features'],
        [processed]
    )
    online_dataset = Dataset(
        processed,
        self.model_definition['input_features'],
        self.model_definition['output_features'],
        None
    )

    logger.debug('Training batch')
    self.model.train_online(
        online_dataset,
        batch_size=batch_size,
        learning_rate=learning_rate,
        regularization_lambda=regularization_lambda,
        bucketing_field=bucketing_field
    )
def _predict(
        self,
        data_df=None,
        data_csv=None,
        data_dict=None,
        return_type=pd.DataFrame,
        batch_size=128,
        gpus=None,
        gpu_fraction=1,
        evaluate_performance=False,
        logging_level=logging.ERROR,
):
    """Runs the model on the given data and postprocesses the predictions.

    :param data_df: (DataFrame) dataframe containing data. When None the
           data is read from `data_csv` / `data_dict`.
    :param data_csv: (string) input data CSV file.
    :param data_dict: (dict) input data dictionary.
    :param return_type: 'dict', 'dictionary' or `dict` to obtain the
           postprocessed predictions as a dictionary; 'dataframe', 'df' or
           `pd.DataFrame` to obtain them as a DataFrame. Any other value
           logs a warning and falls back to a DataFrame.
    :param batch_size: (int) batch size handed to the model.
    :param gpus: handed to the model's predict call.
    :param gpu_fraction: handed to the model's predict call.
    :param evaluate_performance: (bool) if True the output features are
           loaded too and overall stats are calculated on the results.
    :param logging_level: level set on the 'ludwig' logger; for WARNING,
           ERROR and CRITICAL the progress bar is disabled as well.
    :returns: Tuple (postprocessed predictions, raw prediction results)
    :raises ValueError: if the model, its definition or the train set
           metadata are missing.
    """
    logging.getLogger('ludwig').setLevel(logging_level)
    if logging_level in {logging.WARNING, logging.ERROR, logging.CRITICAL}:
        set_disable_progressbar(True)

    if (self.model is None or self.model_definition is None or
            self.train_set_metadata is None):
        raise ValueError('Model has not been trained or loaded')

    if data_df is None:
        data_df = self._read_data(data_csv, data_dict)

    logger.debug('Preprocessing {} datapoints'.format(len(data_df)))
    if evaluate_performance:
        output_features = self.model_definition['output_features']
    else:
        output_features = []
    # Concatenation builds a new list, so the model definition's input
    # feature list is never modified when output features are added
    features_to_load = (
        self.model_definition['input_features'] + output_features)

    preprocessed_data = build_data(
        data_df,
        features_to_load,
        self.train_set_metadata,
        self.model_definition['preprocessing']
    )
    replace_text_feature_level(features_to_load, [preprocessed_data])
    dataset = Dataset(
        preprocessed_data,
        self.model_definition['input_features'],
        output_features,
        None
    )

    logger.debug('Predicting')
    predict_results = self.model.predict(
        dataset,
        batch_size,
        evaluate_performance=evaluate_performance,
        gpus=gpus,
        gpu_fraction=gpu_fraction,
        session=getattr(self.model, 'session', None)
    )

    if evaluate_performance:
        calculate_overall_stats(
            predict_results,
            self.model_definition['output_features'],
            dataset,
            self.train_set_metadata
        )

    logger.debug('Postprocessing')
    if return_type in ('dict', 'dictionary', dict):
        postprocess_fn = postprocess
    else:
        if return_type not in ('dataframe', 'df', pd.DataFrame):
            logger.warning('Unrecognized return_type: {}. '
                           'Returning DataFrame.'.format(return_type))
        # The fallback really returns a DataFrame, as the warning states
        postprocess_fn = postprocess_df

    postprocessed_predictions = postprocess_fn(
        predict_results,
        self.model_definition['output_features'],
        self.train_set_metadata
    )

    return postprocessed_predictions, predict_results