def test_one_hot_encoding():
    """Check one-hot encoding of str columns, then of str and int columns."""
    encoded = PreProcess(A).process_one_hot(convert_int=False)

    # Every row must be "hot" in exactly one of the four trailing columns.
    trailing_sums = encoded.iloc[:, -4:].sum(axis=1)
    assert (trailing_sums == np.ones(7)).all()

    # (one-hot column, row position, expected flag)
    spot_checks = (
        ('f1_0', 0, 1.0),
        ('f1_0', 1, 0.0),
        ('f1_0', -3, 1.0),
        ('f1_0', -1, 1.0),
        ('f1_1', 1, 1.0),
        ('f1_1', 0, 0.0),
        ('f1_2', 2, 1.0),
        ('f1_2', -2, 1.0),
        ('f1_2', -1, 0.0),
        ('f1_3', 3, 1.0),
        ('f1_3', 0, 0.0),
    )
    for column, row, flag in spot_checks:
        assert encoded[column].values[row] == flag

    # Raw ndarray input must encode to the same values as the DataFrame.
    from_array = PreProcess(A.values).process_one_hot(convert_int=False)
    assert np.allclose(encoded, from_array)

    # With convert_int=True the 'f3' column must be gone and the two column
    # groups 1:5 and 5: must each sum to one per row.
    encoded = PreProcess(A).process_one_hot(convert_int=True)
    assert 'f3' not in encoded
    first_group = encoded.iloc[:, 1:5].sum(axis=1)
    assert (first_group == np.ones(7)).all()
    second_group = encoded.iloc[:, 5:].sum(axis=1)
    assert (second_group == np.ones(7)).all()
def test_categories():
    """
    Verify predefined categories handle missing data
    """
    proc = PreProcess(A)
    out = proc.process_one_hot(convert_int=False)
    assert (out.columns == ['f2', 'f3', 'f1_0', 'f1_1', 'f1_2', 'f1_3']).all()

    # Verify columns are created for missing categories
    # and that the new one-hot columns have names corresponding to their values
    proc = PreProcess(A)
    out0 = proc.process_one_hot(
        convert_int=False,
        categories={'f1': ['a', 'b', 'c', 'd', 'missing']})
    assert (out0.columns == ['f2', 'f3', 'a', 'b', 'c', 'd', 'missing']).all()
    assert (out0['missing'] == np.zeros(7)).all()

    # verify ordering works.
    out1 = proc.process_one_hot(
        convert_int=False,
        categories={'f1': ['missing', 'd', 'c', 'a', 'b']})
    assert (out1.columns == ['f2', 'f3', 'missing', 'd', 'c', 'a', 'b']).all()
    assert all(out0.a == out1.a)
    assert all(out0.b == out1.b)
    assert all(out0.c == out1.c)
    assert all(out0.d == out1.d)
    assert (out1['missing'] == np.zeros(7)).all()

    assert out1.a.values[0] == 1
    assert out1.a.values[1] == 0
    assert out1.a.values[2] == 0
    assert out1.a.values[3] == 0
    assert out1.a.values[4] == 1

    # Verify good error with bad categories input.
    try:
        proc.process_one_hot(categories={'f1': ['a', 'b', 'c']})
    except ValueError as e:
        assert 'Found unknown categories' in str(e)
    else:
        # Without this branch the test would pass silently when no error
        # is raised at all for the incomplete category list.
        raise AssertionError('Expected a ValueError for categories that do '
                             'not cover every value in the data')
def __init__(self, model, feature_names=None, label_names=None,
             norm_params=None, normalize=(True, False),
             one_hot_categories=None):
    """
    Parameters
    ----------
    model : OBJ
        Initialized model object
    feature_names : list
        Ordered list of feature names. A single str is wrapped in a list;
        an ndarray or pandas Index is converted with .tolist().
    label_names : list
        Ordered list of label (output) names. Converted the same way as
        feature_names.
    norm_params : dict, optional
        Dictionary mapping feature and label names (keys) to normalization
        parameters (mean, stdev), by default None (stored as an empty dict)
    normalize : bool | tuple, optional
        Boolean flag(s) as to whether features and labels should be
        normalized. Possible values:
        - True means normalize both
        - False means don't normalize either
        - Tuple of flags (normalize_feature, normalize_label)
        by default (True, False)
    one_hot_categories : dict, optional
        Features to one-hot encode using given categories, if None do
        not run one-hot encoding, by default None
    """
    def _to_name_list(names):
        # Coerce a str / ndarray / Index into a plain list; anything else
        # (including None) is passed through untouched.
        if isinstance(names, str):
            return [names]
        if isinstance(names, (np.ndarray, pd.Index)):
            return names.tolist()
        return names

    self._model = model
    self._feature_names = _to_name_list(feature_names)
    self._label_names = _to_name_list(label_names)
    self._norm_params = {} if norm_params is None else norm_params
    self._normalize = self._parse_normalize(normalize)

    # Validate the categories before storing them
    if one_hot_categories is not None:
        PreProcess.check_one_hot_categories(one_hot_categories)

    self._one_hot_categories = one_hot_categories
def _normalize_arr(self, arr, names):
    """
    Normalize array and save normalization parameters to given names

    Parameters
    ----------
    arr : ndarray
        Array of features/label to normalize
    names : list
        List of feature/label names

    Returns
    -------
    norm_arr : ndarray
        Normalized features/label

    Raises
    ------
    RuntimeError
        If the number of names does not match the number of items
        reported by self._get_item_number(arr)
    """
    n_names = self._get_item_number(arr)
    if len(names) != n_names:
        # Report the count that was actually compared. Indexing
        # arr.shape[1] here would raise IndexError for 1D input and hide
        # the intended RuntimeError.
        msg = ("Number of item names ({}) does not match number of items "
               "({})".format(len(names), n_names))
        logger.error(msg)
        raise RuntimeError(msg)

    means, stdevs = self.get_norm_params(names)
    # Only store parameters when they were not already fully available
    update = means is None or stdevs is None

    norm_arr, means, stdevs = PreProcess.normalize(arr, mean=means,
                                                   stdev=stdevs)
    if update:
        for i, n in enumerate(names):
            norm_params = {n: {'mean': means[i], 'stdev': stdevs[i]}}
            self._norm_params.update(norm_params)

    return norm_arr
def _normalize_df(self, df):
    """
    Normalize a DataFrame column-wise and record any newly computed
    normalization parameters under the column names

    Parameters
    ----------
    df : pandas.DataFrame
        DataFrame of features/label to normalize

    Returns
    -------
    norm_df : pandas.DataFrame
        Normalized features/label
    """
    columns = df.columns
    known_means, known_stdevs = self.get_norm_params(columns)
    have_params = known_means is not None and known_stdevs is not None

    norm_df, col_means, col_stdevs = PreProcess.normalize(
        df, mean=known_means, stdev=known_stdevs)

    if not have_params:
        # Parameters were missing beforehand, so store the ones just used
        for pos, col in enumerate(columns):
            entry = {'mean': col_means[pos], 'stdev': col_stdevs[pos]}
            self._norm_params.update({col: entry})

    return norm_df
def _normalize_dict(self, items): """ Normalize given dictionary of items (features | labels) Parameters ---------- items : dict mapping of names to vectors Returns ------- norm_items : dict mapping of names to normalized-feature vectors """ norm_items = {} for key, value in items.items(): mean = self.get_mean(key) stdev = self.get_stdev(key) update = mean is None or stdev is None try: value, mean, stdev = PreProcess.normalize(value, mean=mean, stdev=stdev) if update: norm_params = {key: {'mean': mean, 'stdev': stdev}} self._norm_params.update(norm_params) except Exception as ex: msg = "Could not normalize {}:\n{}".format(key, ex) logger.warning(msg) warn(msg) norm_items[key] = value return norm_items
def test_OHE():
    """
    Test one-hot encoding through ModelBase.parse_features
    """
    categories = ['d', 'e', 'f']
    ohe_features = FEATURES.copy()
    ohe_features['categorical'] = np.random.choice(categories,
                                                   len(FEATURES))

    model = ModelBase(None,
                      feature_names=ohe_features.columns,
                      label_names=LABELS.columns,
                      normalize=True,
                      one_hot_categories={'categorical': categories})

    raw = FEATURES.values.astype('float32')
    baseline, means, stdevs = PreProcess.normalize(raw)
    parsed = model.parse_features(ohe_features)

    # The first two parsed columns must match the normalized baseline
    assert np.allclose(baseline, parsed[:, :2])
    model_means = np.array(model.feature_means, dtype='float32')
    assert np.allclose(means, model_means[:2])
    model_stdevs = np.array(model.feature_stdevs, dtype='float32')
    assert np.allclose(stdevs, model_stdevs[:2])

    # One-hot columns must not have normalization parameters
    for category in categories:
        assert model.get_mean(category) is None
        assert model.get_stdev(category) is None

    # Categories appear as model features, not as input features ...
    assert all(np.isin(categories, model.feature_names))
    assert not any(np.isin(categories, model.input_feature_names))
    # ... while the source column is an input feature only.
    assert 'categorical' not in model.feature_names
    assert 'categorical' in model.input_feature_names
def _unnormalize_df(self, df):
    """
    Un-normalize DataFrame

    Parameters
    ----------
    df : pandas.DataFrame
        DataFrame of features/label to un-normalize

    Returns
    -------
    df : pandas.DataFrame
        Native features/label df if norm params are not None, otherwise
        the input df unchanged
    """
    means, stdevs = self.get_norm_params(df.columns)

    if means is None or stdevs is None:
        # Nothing to undo with: warn and hand back the input untouched
        msg = ("Normalization parameters are unavailable, df will not be "
               "un-normalized!")
        logger.warning(msg)
        warn(msg)
        return df

    # Work on a copy so the caller's DataFrame is not modified
    return PreProcess.unnormalize(df.copy(), means, stdevs)
def _unnormalize_arr(self, arr, names):
    """
    Un-normalize array using given names

    Parameters
    ----------
    arr : ndarray
        Array of features/label to un-normalize
    names : list
        List of feature/label names

    Returns
    -------
    arr : ndarray
        Native features/label array if norm params are not None

    Raises
    ------
    RuntimeError
        If the number of names does not match the number of items
        reported by self._get_item_number(arr)
    """
    n_names = self._get_item_number(arr)
    if len(names) != n_names:
        # Report the count that was actually compared. Indexing
        # arr.shape[1] here would raise IndexError for 1D input and hide
        # the intended RuntimeError.
        msg = ("Number of item names ({}) does not match number of items "
               "({})".format(len(names), n_names))
        logger.error(msg)
        raise RuntimeError(msg)

    means, stdevs = self.get_norm_params(names)
    if means is not None and stdevs is not None:
        # Un-normalize a copy so the caller's array is left untouched
        arr = PreProcess.unnormalize(arr.copy(), means, stdevs)
    else:
        msg = ("Normalization parameters are unavailable, arr will not be "
               "un-normalized!")
        logger.warning(msg)
        warn(msg)

    return arr
def _unnormalize_dict(self, items): """ Un-normalize given dictionary of items (features | labels) Parameters ---------- items : dict mapping of names to vectors Returns ------- native_items : dict mapping of names to native vectors """ native_items = {} for key, value in items.items(): norm_params = self.normalization_parameters[key] if norm_params is not None: value = PreProcess.unnormalize(value, norm_params['mean'], norm_params['stdev']) else: msg = ("Normalization Parameters unavailable, {} will not be " "un-normalized!".format(key)) logger.warning(msg) warn(msg) native_items[key] = value return native_items
def _parse_features(self, features, names=None, process_one_hot=True,
                    **kwargs):
    """
    Parse features

    Parameters
    ----------
    features : pandas.DataFrame | dict | ndarray
        Features to train on or predict from
    names : list, optional
        List of feature names, by default None
    process_one_hot : bool, optional
        Check for and process one-hot variables, by default True
    kwargs : dict, optional
        kwargs for PreProcess.one_hot

    Returns
    -------
    features : ndarray
        Parsed features array normalized and with str columns converted
        to one hot vectors if desired

    Raises
    ------
    RuntimeError
        If the parsed data is not 2D, if the number of columns does not
        match the number of known feature names, or if the provided
        names differ from the known feature names
    """
    features, feature_names = self._parse_data(features, names=names)

    if len(features.shape) != 2:
        msg = ('{} can only use 2D data as input!'.format(
            self.__class__.__name__))
        logger.error(msg)
        raise RuntimeError(msg)

    if self.feature_names is not None:
        n_expected = len(self.feature_names)
        if features.shape[1] != n_expected:
            # Report the same count that was compared
            msg = ('data has {} features but expected {}'.format(
                features.shape[1], n_expected))
            logger.error(msg)
            raise RuntimeError(msg)

    if self._feature_names is None:
        self._feature_names = feature_names
    elif self.feature_names != feature_names:
        # Expected names are the ones stored on the model; the provided
        # names are the ones parsed from the input.
        msg = ('Expecting features with names: {}, but was provided with: '
               '{}!'.format(self.feature_names, feature_names))
        logger.error(msg)
        raise RuntimeError(msg)

    if process_one_hot:
        # return_ind is forced on so the one-hot column indices come back
        kwargs.update({'return_ind': True})
        features, one_hot_ind = PreProcess.one_hot(features, **kwargs)
        if one_hot_ind:
            one_hot_features = [self.feature_names[i] for i in one_hot_ind]
            self._check_one_hot_norm_params(one_hot_features)

    if self.normalize_features:
        features = self.normalize(features, names=feature_names)

    return features
def test_norm_df():
    """Test ModelBase Normalization on a dataframe"""
    model = ModelBase(None, feature_names=FEATURES.columns,
                      label_names=LABELS.columns, normalize=True)

    baseline, means, stdevs = PreProcess.normalize(FEATURES)
    test = model.parse_features(FEATURES)
    assert np.allclose(baseline.values, test)
    assert np.allclose(means, model.feature_means)
    assert np.allclose(stdevs, model.feature_stdevs)

    baseline, means, stdevs = PreProcess.normalize(LABELS)
    test = model._parse_labels(LABELS)
    # This comparison was previously evaluated but never asserted
    assert np.allclose(baseline.values, test)
    assert np.allclose(means, model.label_means)
    assert np.allclose(stdevs, model.label_stdevs)
def parse_features(self, features, names=None, **kwargs):
    """
    Pre-process feature data ahead of training or prediction.

    One-hot encoding is applied based on self.one_hot_categories and
    normalization based on self.normalize_features.

    Parameters
    ----------
    features : pandas.DataFrame | dict | ndarray
        Features to train on or predict from
    names : list, optional
        List of feature names, by default None
    kwargs : dict, optional
        kwargs for PreProcess.one_hot

    Returns
    -------
    features : ndarray
        Parsed features array normalized and with str columns converted
        to one hot vectors if desired
    """
    features, feature_names = self._parse_data(features, names=names)

    if len(features.shape) != 2:
        err = '{} can only use 2D data as input!'.format(
            self.__class__.__name__)
        logger.error(err)
        raise RuntimeError(err)

    if self.feature_names is None:
        self._feature_names = feature_names

    # One-hot encode only when categories are configured and every
    # provided name is a known input feature name.
    run_one_hot = False
    if self.one_hot_categories is not None:
        run_one_hot = all(np.isin(feature_names, self.input_feature_names))

    if run_one_hot:
        self._check_one_hot_feature_names(feature_names)
        kwargs['feature_names'] = feature_names
        kwargs['categories'] = self.one_hot_categories
        features = PreProcess.one_hot(features, **kwargs)
    elif self.feature_names != feature_names:
        err = ('Expecting features with names: {}, but was provided with: '
               '{}!'.format(self.feature_names, feature_names))
        logger.error(err)
        raise RuntimeError(err)

    if self.normalize_features:
        features = self.normalize(features, names=self.feature_names)

    n_columns = features.shape[1]
    if n_columns != self.feature_dims:
        err = 'data has {} features but expected {}'.format(
            n_columns, self.feature_dims)
        logger.error(err)
        raise RuntimeError(err)

    return features
def _check_one_hot_feature_names(self, feature_names):
    """
    Compare the one-hot feature names derived from the input names with
    self.feature_names; when they differ, validate the one-hot categories
    and replace the stored feature names with the derived ones

    Parameters
    ----------
    feature_names : list
        Input feature names
    """
    derived_names = self.make_one_hot_feature_names(
        feature_names, self.one_hot_categories)

    if derived_names == self.feature_names:
        # Stored names already reflect the one-hot encoding
        return

    # Categories are checked against feature and (if set) label names
    check_names = feature_names.copy()
    if self.label_names is not None:
        check_names += self.label_names

    PreProcess.check_one_hot_categories(self.one_hot_categories,
                                        feature_names=check_names)
    self._feature_names = derived_names
def test_norm_arr():
    """Test ModelBase Normalization on numpy arrays"""
    feature_names = FEATURES.columns.tolist()
    label_names = LABELS.columns.tolist()
    model = ModelBase(None, feature_names=feature_names,
                      label_names=label_names, normalize=True)

    # (raw values, names, parser, model attribute prefix)
    cases = (
        (FEATURES.values, feature_names, model.parse_features, 'feature'),
        (LABELS.values, label_names, model._parse_labels, 'label'),
    )
    for values, names, parser, prefix in cases:
        expected, means, stdevs = PreProcess.normalize(values)
        parsed = parser(values, names=names)
        assert np.allclose(expected, parsed)
        assert np.allclose(means, getattr(model, prefix + '_means'))
        assert np.allclose(stdevs, getattr(model, prefix + '_stdevs'))
def unnormalize_prediction(self, prediction):
    """
    Unnormalize prediction if needed

    Only the first entry of self.label_means / self.label_stdevs is used.

    Parameters
    ----------
    prediction : ndarray
        Model prediction

    Returns
    -------
    prediction : ndarray
        Native prediction, or the input unchanged when no label mean is
        available
    """
    means = self.label_means[0]
    # Compare against None explicitly: a stored mean of exactly 0.0 is a
    # valid normalization parameter and must not be treated as "missing".
    if means is not None:
        stdevs = self.label_stdevs[0]
        prediction = PreProcess.unnormalize(prediction, means, stdevs)

    return prediction
def build(cls, p_fun, feature_names, label_names,
          normalize=(True, False), one_hot_categories=None,
          loss_weights=(0.5, 0.5), hidden_layers=None, input_layer=None,
          output_layer=None, layers_obj=None, metric='mae',
          initializer=None, optimizer=None, learning_rate=0.01,
          history=None, kernel_reg_rate=0.0, kernel_reg_power=1,
          bias_reg_rate=0.0, bias_reg_power=1, name=None):
    """
    Build phygnn model from given features, layers and kwargs

    Parameters
    ----------
    p_fun : function
        Physics function to guide the neural network loss function.
        This fun must take (phygnn, y_true, y_predicted, p, **p_kwargs)
        as arguments with datatypes (PhysicsGuidedNeuralNetwork, tf.Tensor,
        np.ndarray, np.ndarray). The function must return a tf.Tensor
        object with a single numeric loss value (output.ndim == 0).
    feature_names : list | str
        Ordered list of feature names. A single str is treated as a
        one-item list.
    label_names : list | str
        Ordered list of label (output) names. A single str is treated as
        a one-item list.
    normalize : bool | tuple, optional
        Boolean flag(s) as to whether features and labels should be
        normalized. Possible values:
        - True means normalize both
        - False means don't normalize either
        - Tuple of flags (normalize_feature, normalize_label)
        by default (True, False)
    one_hot_categories : dict, optional
        Features to one-hot encode using given categories, if None do
        not run one-hot encoding, by default None
    loss_weights : tuple, optional
        Loss weights for the neural network y_true vs y_predicted
        and for the p_fun loss, respectively. For example,
        loss_weights=(0.0, 1.0) would simplify the phygnn loss function
        to just the p_fun output.
    hidden_layers : list, optional
        List of dictionaries of key word arguments for each hidden
        layer in the NN. Dense linear layers can be input with their
        activations or separately for more explicit control over the layer
        ordering. For example, this is a valid input for hidden_layers that
        will yield 8 hidden layers (10 layers including input+output):
            [{'units': 64, 'activation': 'relu', 'dropout': 0.01},
             {'units': 64},
             {'batch_normalization': {'axis': -1}},
             {'activation': 'relu'},
             {'dropout': 0.01},
             {'class': 'Flatten'},
             ]
    input_layer : None | bool | dict
        Input layer specification. Can be a dictionary similar to
        hidden_layers specifying a dense / conv / lstm layer. Will
        default to a keras InputLayer with input shape = n_features.
        Can be False if the input layer will be included in the
        hidden_layers input.
    output_layer : None | bool | list | dict
        Output layer specification. Can be a list/dict similar to
        hidden_layers input specifying a dense layer with activation.
        For example, for a classification problem with a single output,
        output_layer should be [{'units': 1}, {'activation': 'sigmoid'}].
        This defaults to a single dense layer with no activation
        (best for regression problems). Can be False if the output layer
        will be included in the hidden_layers input.
    layers_obj : None | phygnn.utilities.tf_layers.Layers
        Optional initialized Layers object to set as the model layers
        including pre-set weights. This option will override the
        hidden_layers, input_layer, and output_layer arguments.
    metric : str, optional
        Loss metric option for the NN loss function (not the physical
        loss function). Must be a valid key in phygnn.loss_metrics.METRICS
    initializer : tensorflow.keras.initializers, optional
        Instantiated initializer object. None defaults to GlorotUniform
    optimizer : tensorflow.keras.optimizers | dict | None
        Instantiated tf.keras.optimizers object or a dict optimizer config
        from tf.keras.optimizers.get_config(). None defaults to Adam.
    learning_rate : float, optional
        Optimizer learning rate. Not used if optimizer input arg is a
        pre-initialized object or if optimizer input arg is a config dict.
    history : None | pd.DataFrame, optional
        Learning history if continuing a training session.
    kernel_reg_rate : float, optional
        Kernel regularization rate. Increasing this value above zero will
        add a structural loss term to the loss function that
        disincentivizes large hidden layer weights and should reduce
        model complexity. Setting this to 0.0 will disable kernel
        regularization.
    kernel_reg_power : int, optional
        Kernel regularization power. kernel_reg_power=1 is L1
        regularization (lasso regression), and kernel_reg_power=2 is L2
        regularization (ridge regression).
    bias_reg_rate : float, optional
        Bias regularization rate. Increasing this value above zero will
        add a structural loss term to the loss function that
        disincentivizes large hidden layer biases and should reduce
        model complexity. Setting this to 0.0 will disable bias
        regularization.
    bias_reg_power : int, optional
        Bias regularization power. bias_reg_power=1 is L1
        regularization (lasso regression), and bias_reg_power=2 is L2
        regularization (ridge regression).
    name : None | str
        Optional model name for debugging.

    Returns
    -------
    model : PhygnnModel
        Initialized PhygnnModel instance
    """
    # A bare str would otherwise be measured character-by-character by
    # len() and could not be concatenated with the label name list.
    if isinstance(feature_names, str):
        feature_names = [feature_names]

    if isinstance(label_names, str):
        label_names = [label_names]

    if one_hot_categories is not None:
        check_names = feature_names + label_names
        PreProcess.check_one_hot_categories(one_hot_categories,
                                            feature_names=check_names)
        # From here on feature_names are the post-encoding names, which
        # set the network's input width.
        feature_names = cls.make_one_hot_feature_names(
            feature_names, one_hot_categories)

    model = PhysicsGuidedNeuralNetwork(p_fun,
                                       loss_weights=loss_weights,
                                       n_features=len(feature_names),
                                       n_labels=len(label_names),
                                       hidden_layers=hidden_layers,
                                       input_layer=input_layer,
                                       output_layer=output_layer,
                                       layers_obj=layers_obj,
                                       metric=metric,
                                       initializer=initializer,
                                       optimizer=optimizer,
                                       learning_rate=learning_rate,
                                       history=history,
                                       kernel_reg_rate=kernel_reg_rate,
                                       kernel_reg_power=kernel_reg_power,
                                       bias_reg_rate=bias_reg_rate,
                                       bias_reg_power=bias_reg_power,
                                       feature_names=feature_names,
                                       output_names=label_names,
                                       name=name)

    model = cls(model, feature_names=feature_names,
                label_names=label_names, normalize=normalize,
                one_hot_categories=one_hot_categories)

    return model
def build(cls, feature_names, label_names, normalize=(True, False),
          one_hot_categories=None, hidden_layers=None,
          learning_rate=0.001, loss="mean_squared_error",
          metrics=('mae', 'mse'), optimizer_class=Adam, **kwargs):
    """
    Build tensorflow sequential model from given features, layers and
    kwargs

    Parameters
    ----------
    feature_names : list | str
        Ordered list of feature names. A single str is treated as a
        one-item list.
    label_names : list | str
        Ordered list of label (output) names. A single str is treated as
        a one-item list.
    normalize : bool | tuple, optional
        Boolean flag(s) as to whether features and labels should be
        normalized. Possible values:
        - True means normalize both
        - False means don't normalize either
        - Tuple of flags (normalize_feature, normalize_label)
        by default (True, False)
    one_hot_categories : dict, optional
        Features to one-hot encode using given categories, if None do
        not run one-hot encoding, by default None
    hidden_layers : list, optional
        List of tensorflow layers.Dense kwargs (dictionaries)
        if None use a single linear layer, by default None
    learning_rate : float, optional
        tensorflow optimizer learning rate, by default 0.001
    loss : str, optional
        name of objective function, by default "mean_squared_error"
    metrics : list, optional
        List of metrics to be evaluated by the model during training and
        testing, by default ('mae', 'mse')
    optimizer_class : tf.keras.optimizers, optional
        Optional explicit request of optimizer. This should be a class
        that will be instantiated in the TfModel.compile_model() method
        The default is the Adam optimizer
    kwargs : dict
        kwargs for tensorflow.keras.models.compile

    Returns
    -------
    model : TfModel
        Initialized TfModel obj
    """
    # A bare str would otherwise be measured character-by-character by
    # len() and could not be concatenated with the label name list.
    if isinstance(feature_names, str):
        feature_names = [feature_names]

    if isinstance(label_names, str):
        label_names = [label_names]

    if one_hot_categories is not None:
        check_names = feature_names + label_names
        PreProcess.check_one_hot_categories(one_hot_categories,
                                            feature_names=check_names)
        # From here on feature_names are the post-encoding names, which
        # set the number of model inputs.
        feature_names = cls.make_one_hot_feature_names(
            feature_names, one_hot_categories)

    model = cls.compile_model(len(feature_names),
                              n_labels=len(label_names),
                              hidden_layers=hidden_layers,
                              learning_rate=learning_rate, loss=loss,
                              metrics=metrics,
                              optimizer_class=optimizer_class, **kwargs)

    model = cls(model, feature_names=feature_names,
                label_names=label_names, normalize=normalize,
                one_hot_categories=one_hot_categories)

    return model
def build_trained(cls, features, label, normalize=True,
                  one_hot_categories=None, shuffle=True, save_path=None,
                  compile_kwargs=None, parse_kwargs=None, fit_kwargs=None):
    """
    Compile a Random Forest Model, wrap it, train it on the given
    features and label, and optionally save it

    Parameters
    ----------
    features : pandas.DataFrame
        Model features
    label : pandas.DataFrame
        label to train on
    normalize : bool | tuple, optional
        Boolean flag(s) as to whether features and labels should be
        normalized. Possible values:
        - True means normalize both
        - False means don't normalize either
        - Tuple of flags (normalize_feature, normalize_label)
        by default True
    one_hot_categories : dict, optional
        Features to one-hot encode using given categories, if None do
        not run one-hot encoding, by default None
    shuffle : bool
        Flag to randomly subset the validation data and batch selection
        from features and labels.
    save_path : str
        Directory path to save model to. The RandomForest Model will be
        saved to the directory while the framework parameters will be
        saved in json.
    compile_kwargs : dict
        kwargs for sklearn.ensemble.RandomForestRegressor
    parse_kwargs : dict
        kwargs for cls.parse_features
    fit_kwargs : dict
        kwargs for sklearn.ensemble.RandomForestRegressor.fit

    Returns
    -------
    model : RandomForestModel
        Initialized and trained RandomForestModel obj
    """
    compile_kwargs = {} if compile_kwargs is None else compile_kwargs

    # Only the names are needed here; the parsed data is discarded
    _, feature_names = cls._parse_data(features)
    _, label_name = cls._parse_data(label)

    regressor = cls.compile_model(**compile_kwargs)

    run_one_hot = one_hot_categories is not None
    if run_one_hot:
        PreProcess.check_one_hot_categories(
            one_hot_categories, feature_names=feature_names + label_name)
        feature_names = cls.make_one_hot_feature_names(
            feature_names, one_hot_categories)

    model = cls(regressor,
                feature_names=feature_names,
                label_name=label_name,
                normalize=normalize,
                one_hot_categories=one_hot_categories)

    model.train_model(features, label,
                      shuffle=shuffle,
                      parse_kwargs=parse_kwargs,
                      fit_kwargs=fit_kwargs)

    if save_path is not None:
        model.save_model(save_path)

    return model