def get_preprocessing_params(model_definition):
    model_definition = merge_with_defaults(model_definition)

    global_preprocessing_parameters = model_definition['preprocessing']
    features = (
        model_definition['input_features'] +
        model_definition['output_features']
    )

    global_preprocessing_parameters = merge_dict(
        default_preprocessing_parameters,
        global_preprocessing_parameters
    )

    merged_preprocessing_params = []
    for feature in features:
        if 'preprocessing' in feature:
            local_preprocessing_parameters = merge_dict(
                global_preprocessing_parameters[feature['type']],
                feature['preprocessing']
            )
        else:
            local_preprocessing_parameters = global_preprocessing_parameters[
                feature['type']
            ]
        merged_preprocessing_params.append(
            (feature['name'], feature['type'], local_preprocessing_parameters)
        )

    return merged_preprocessing_params
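# Usage sketch for get_preprocessing_params. The feature names and types
# below are illustrative, not part of this module; the sketch assumes the
# module's registries and defaults are in scope. It is wrapped in a function
# so it does not run at import time.
def _example_get_preprocessing_params():
    model_definition = {
        'input_features': [{'name': 'utterance', 'type': 'text'}],
        'output_features': [{'name': 'intent', 'type': 'category'}],
    }
    # one (name, type, preprocessing) tuple per input and output feature
    for name, feature_type, params in get_preprocessing_params(
            model_definition):
        print(name, feature_type, params)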
def build_dataset_df(dataset_df,
                     features,
                     global_preprocessing_parameters,
                     train_set_metadata=None,
                     random_seed=default_random_seed,
                     **kwargs):
    global_preprocessing_parameters = merge_dict(
        default_preprocessing_parameters,
        global_preprocessing_parameters
    )

    if train_set_metadata is None:
        train_set_metadata = build_metadata(
            dataset_df,
            features,
            global_preprocessing_parameters
        )

    data_val = build_data(
        dataset_df,
        features,
        train_set_metadata,
        global_preprocessing_parameters
    )

    data_val['split'] = get_split(
        dataset_df,
        force_split=global_preprocessing_parameters['force_split'],
        split_probabilities=global_preprocessing_parameters[
            'split_probabilities'
        ],
        stratify=global_preprocessing_parameters['stratify'],
        random_seed=random_seed
    )

    return data_val, train_set_metadata
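# Usage sketch for build_dataset_df: build preprocessed data plus a 'split'
# array from a raw DataFrame. Columns and feature names are hypothetical and
# the sketch assumes pandas plus this module's defaults.
def _example_build_dataset_df():
    import pandas as pd
    df = pd.DataFrame({'utterance': ['hi there', 'bye now'],
                       'intent': ['greet', 'farewell']})
    features = [{'name': 'utterance', 'type': 'text'},
                {'name': 'intent', 'type': 'category'}]
    data, metadata = build_dataset_df(df, features, {})
    print(data['split'], sorted(metadata.keys()))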
def merge_with_defaults(model_definition):
    _perform_sanity_checks(model_definition)

    # ===== Preprocessing =====
    model_definition['preprocessing'] = merge_dict(
        default_preprocessing_parameters,
        model_definition.get('preprocessing', {})
    )

    stratify = model_definition['preprocessing']['stratify']

    if stratify is not None:
        if stratify not in [x['name'] for x in
                            model_definition['output_features']]:
            raise ValueError('Stratify must be in output features')
        if ([x for x in model_definition['output_features']
             if x['name'] == stratify][0]['type']
                not in [BINARY, CATEGORY]):
            raise ValueError('Stratify feature must be binary or category')

    # ===== Model =====
    set_default_value(model_definition, 'combiner',
                      {'type': default_combiner_type})

    # ===== Training =====
    set_default_value(model_definition, TRAINING, default_training_params)

    for param, value in default_training_params.items():
        set_default_value(model_definition[TRAINING], param, value)

    set_default_value(
        model_definition[TRAINING],
        'validation_measure',
        output_type_registry[model_definition['output_features'][0][
            'type']].default_validation_measure
    )

    # ===== Training Optimizer =====
    optimizer = model_definition[TRAINING]['optimizer']
    default_optimizer_params = get_default_optimizer_params(optimizer['type'])
    for param in default_optimizer_params:
        set_default_value(optimizer, param, default_optimizer_params[param])

    # ===== Input Features =====
    for input_feature in model_definition['input_features']:
        get_from_registry(input_feature['type'],
                          input_type_registry).populate_defaults(input_feature)

    # ===== Output features =====
    for output_feature in model_definition['output_features']:
        get_from_registry(output_feature['type'],
                          output_type_registry).populate_defaults(
            output_feature)

    return model_definition
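# Usage sketch for merge_with_defaults (hypothetical minimal definition).
# After the call, preprocessing, combiner, training and per-feature defaults
# are all filled in.
def _example_merge_with_defaults():
    model_definition = {
        'input_features': [{'name': 'utterance', 'type': 'text'}],
        'output_features': [{'name': 'intent', 'type': 'category'}],
    }
    merged = merge_with_defaults(model_definition)
    print(merged['combiner'], merged[TRAINING]['validation_measure'])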
def build_metadata(dataset_df, features, global_preprocessing_parameters):
    train_set_metadata = {}
    for feature in features:
        get_feature_meta = get_from_registry(
            feature['type'],
            base_type_registry
        ).get_feature_meta
        if 'preprocessing' in feature:
            preprocessing_parameters = merge_dict(
                global_preprocessing_parameters[feature['type']],
                feature['preprocessing']
            )
        else:
            preprocessing_parameters = global_preprocessing_parameters[
                feature['type']
            ]
        train_set_metadata[feature['name']] = get_feature_meta(
            dataset_df[feature['name']].astype(str),
            preprocessing_parameters
        )
    return train_set_metadata
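# Usage sketch for build_metadata: collect per-feature metadata (for example
# a category vocabulary) from the raw DataFrame. Column names are
# hypothetical; assumes pandas and this module's defaults.
def _example_build_metadata():
    import pandas as pd
    df = pd.DataFrame({'intent': ['greet', 'farewell', 'greet']})
    features = [{'name': 'intent', 'type': 'category'}]
    metadata = build_metadata(df, features, default_preprocessing_parameters)
    print(metadata['intent'])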
def overwrite_defaults(self, feature):
    attributes = self.__dict__.keys()
    remaining_dict = dict(feature)

    for k in feature.keys():
        if k in attributes:
            if (isinstance(feature[k], dict) and hasattr(self, k)
                    and isinstance(getattr(self, k), dict)):
                # merge nested dicts instead of replacing them wholesale
                setattr(self, k, merge_dict(getattr(self, k), feature[k]))
            else:
                setattr(self, k, feature[k])
            # consumed keys are dropped; anything left over had no
            # matching attribute and is returned to the caller
            del remaining_dict[k]

    return remaining_dict
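# Usage sketch for overwrite_defaults with an illustrative toy class; in the
# real module this function is used as a method on feature objects.
def _example_overwrite_defaults():
    class _ToyFeature:
        def __init__(self):
            self.max_sequence_length = 256
            self.preprocessing = {'lowercase': True}

    feature = {
        'max_sequence_length': 128,
        'preprocessing': {'lowercase': False},
        'encoder': 'rnn',
    }
    toy = _ToyFeature()
    leftover = overwrite_defaults(toy, feature)
    # toy.max_sequence_length == 128, toy.preprocessing is merged, and
    # leftover == {'encoder': 'rnn'} since no matching attribute exists
    print(toy.__dict__, leftover)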
def build_data(dataset_df, features, train_set_metadata,
               global_preprocessing_parameters):
    data = {}
    for feature in features:
        add_feature_data = get_from_registry(
            feature['type'],
            base_type_registry
        ).add_feature_data
        if 'preprocessing' in feature:
            preprocessing_parameters = merge_dict(
                global_preprocessing_parameters[feature['type']],
                feature['preprocessing']
            )
        else:
            preprocessing_parameters = global_preprocessing_parameters[
                feature['type']
            ]
        handle_missing_values(
            dataset_df,
            feature,
            preprocessing_parameters
        )
        # record the effective preprocessing parameters in the metadata so
        # they can be reused at prediction time
        if feature['name'] not in train_set_metadata:
            train_set_metadata[feature['name']] = {}
        train_set_metadata[
            feature['name']
        ]['preprocessing'] = preprocessing_parameters
        add_feature_data(
            feature,
            dataset_df,
            data,
            train_set_metadata,
            preprocessing_parameters
        )
    return data
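# Usage sketch for build_data: metadata from build_metadata is reused, and
# the per-feature preprocessing parameters are recorded into it. Columns are
# hypothetical; assumes pandas and this module's defaults.
def _example_build_data():
    import pandas as pd
    df = pd.DataFrame({'intent': ['greet', 'farewell']})
    features = [{'name': 'intent', 'type': 'category'}]
    metadata = build_metadata(df, features, default_preprocessing_parameters)
    data = build_data(df, features, metadata,
                      default_preprocessing_parameters)
    print(sorted(data.keys()), metadata['intent']['preprocessing'])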
def preprocess_for_prediction(model_path,
                              split,
                              data_csv=None,
                              data_hdf5=None,
                              train_set_metadata=None,
                              evaluate_performance=True):
    """Preprocesses a dataset into the format used by the Ludwig core.

        :param model_path: Directory of a saved model; its hyperparameter
               file is loaded to reconstruct the model definition
        :type model_path: Str
        :param split: Which split of the data to return
               (training, validation, test or full)
        :param data_csv: The CSV input data file
        :param data_hdf5: The hdf5 data file if there is no csv data file
        :param train_set_metadata: Train set metadata for the input features
        :param evaluate_performance: If False does not load output features
        :returns: Dataset, Train set metadata
    """
    model_definition = load_json(
        os.path.join(model_path, MODEL_HYPERPARAMETERS_FILE_NAME)
    )
    for input_feature in model_definition['input_features']:
        if 'preprocessing' in input_feature:
            if 'in_memory' in input_feature['preprocessing']:
                if not input_feature['preprocessing']['in_memory']:
                    logger.warning(
                        'WARNING: When running predict in_memory flag should '
                        'be true. Overriding and setting it to true for '
                        'feature <{}>'.format(input_feature['name'])
                    )
                    input_feature['preprocessing']['in_memory'] = True
    preprocessing_params = merge_dict(
        default_preprocessing_parameters,
        model_definition['preprocessing']
    )
    output_features = model_definition[
        'output_features'] if evaluate_performance else []
    features = model_definition['input_features'] + output_features

    # Check if hdf5 file already exists
    if data_csv is not None:
        data_hdf5_fp = replace_file_extension(data_csv, 'hdf5')
        if os.path.isfile(data_hdf5_fp):
            logger.info('Found hdf5 with the same filename of the csv, '
                        'using it instead')
            data_csv = None
            data_hdf5 = data_hdf5_fp
    else:
        data_hdf5_fp = None

    # Load data
    train_set_metadata = load_metadata(train_set_metadata)
    if split == FULL:
        if data_hdf5 is not None:
            dataset = load_data(
                data_hdf5,
                model_definition['input_features'],
                output_features,
                split_data=False,
                shuffle_training=False
            )
        else:
            dataset, train_set_metadata = build_dataset(
                data_csv,
                features,
                preprocessing_params,
                train_set_metadata=train_set_metadata
            )
    else:
        if data_hdf5 is not None:
            training_set, test_set, validation_set = load_data(
                data_hdf5,
                model_definition['input_features'],
                output_features,
                shuffle_training=False
            )
            if split == TRAINING:
                dataset = training_set
            elif split == VALIDATION:
                dataset = validation_set
            else:  # if split == TEST:
                dataset = test_set
        else:
            dataset, train_set_metadata = build_dataset(
                data_csv,
                features,
                preprocessing_params,
                train_set_metadata=train_set_metadata
            )
            # build_dataset adds a split column if there is none in the csv,
            # so to check whether the csv contained a split column we have to
            # look at the csv, not at the built dataset. If the original csv
            # has no split, the split parameter is treated as if it were FULL.
            if csv_contains_column(data_csv, SPLIT):
                training_set, test_set, validation_set = split_dataset_tvt(
                    dataset,
                    dataset[SPLIT]
                )
                if split == TRAINING:
                    dataset = training_set
                elif split == VALIDATION:
                    dataset = validation_set
                else:  # if split == TEST:
                    dataset = test_set
            else:
                logger.warning('You requested the {} split, but the data CSV '
                               'does not contain a "split" column, so the '
                               'full data will be used instead'.format(split))

    replace_text_feature_level(
        features,
        [dataset]
    )

    dataset = Dataset(
        dataset,
        model_definition['input_features'],
        output_features,
        train_set_metadata.get(DATA_TRAIN_HDF5_FP)
    )

    return dataset, train_set_metadata
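# Usage sketch for preprocess_for_prediction. The paths are placeholders and
# TEST is this module's split constant; metadata is loaded from the saved
# model directory produced at training time.
def _example_preprocess_for_prediction():
    dataset, metadata = preprocess_for_prediction(
        model_path='results/experiment_run/model',
        split=TEST,
        data_csv='new_data.csv',
        train_set_metadata='results/experiment_run/model/'
                           'train_set_metadata.json',
        evaluate_performance=True
    )
    # dataset wraps the preprocessed tensors; metadata mirrors the
    # training-time metadata for the model's features
    print(sorted(metadata.keys()))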