def evaluate(self, test_data: MODData) -> float:
    """Evaluate the model on the passed MODData and return the loss.

    Parameters:
        test_data: A featurized and feature-selected `MODData`
            object containing the descriptors used in training.

    Returns:
        Loss score, i.e. the first entry of the value returned by
        `self.model.evaluate(x, y)`.

    """
    # Replace inf/NaN features by 0 so they cannot produce NaN predictions.
    x = (
        test_data.get_featurized_df()
        .replace([np.inf, -np.inf, np.nan], 0)[
            self.optimal_descriptors[: self.n_feat]
        ]
        .values
    )

    # Scale the input features:
    x = np.nan_to_num(x)
    if self._scaler is not None:
        x = self._scaler.transform(x)
        # Anything the scaler turned into NaN is mapped to -1.
        x = np.nan_to_num(x, nan=-1)

    # Build one target array per entry of `targets_flatten`.
    y = []
    for targ in self.targets_flatten:
        if self.num_classes[targ] >= 2:  # Classification
            y_inner = tf.keras.utils.to_categorical(
                test_data.df_targets[targ].values,
                num_classes=self.num_classes[targ],
            )
        else:
            # `np.float` was a plain alias of the builtin `float` and has been
            # removed from NumPy (1.24); the builtin gives the same dtype.
            y_inner = test_data.df_targets[targ].values.astype(float, copy=False)
        y.append(y_inner)

    return self.model.evaluate(x, y)[0]
def predict(self, test_data: MODData) -> pd.DataFrame:
    """Run the model on the passed MODData and tabulate its outputs.

    Parameters:
        test_data: A featurized and feature-selected `MODData`
            object containing the descriptors used in training.

    Returns:
        A `pandas.DataFrame` of predicted target values, with one column
        per entry of `self.targets_flatten` and indexed by
        `test_data.structure_ids`.

    """
    selected = self.optimal_descriptors[:self.n_feat]
    features = test_data.get_featurized_df()[selected].values

    # Scale the input features (only when a scaler is available):
    scaler = self._scaler
    if scaler is not None:
        features = np.nan_to_num(scaler.transform(features))

    raw = np.array(self.model.predict(features))
    if self._multi_target:
        # Indexing requires a 3-D stack here; keep output 0 of every target
        # and put the targets on the column axis.
        values = raw[:, :, 0].transpose()
    else:
        # Indexing requires (at least) a 2-D array here; keep output 0.
        values = raw[:, 0].transpose()

    return pd.DataFrame(
        values,
        index=test_data.structure_ids,
        columns=self.targets_flatten,
    )
def fit(self, data: MODData, val_fraction=0.0, val_key=None, lr=0.001,
        epochs=200, batch_size=128, xscale='minmax', yscale=None):
    """Compile and fit `self.model` on the features and targets of `data`.

    Parameters:
        data: The `MODData` providing the featurized dataframe, the target
            dataframe, the target names and the optimal descriptors.
        val_fraction: Passed to `fit(...)` as `validation_split`; when
            positive, validation figures are printed after each epoch.
        val_key: The name used in the per-epoch validation printout (and in
            the tracked metric key when `self.PP` is truthy).
        lr: The learning rate given to the Adam optimizer.
        epochs: The number of epochs passed to `fit(...)`.
        batch_size: The batch size passed to `fit(...)`.
        xscale: Feature scaling mode; `'minmax'` and `'standard'` are
            handled, any other value leaves the features unscaled.
        yscale: Not used by this method.

    """
    print('new')
    self.xscale = xscale
    self.target_names = data.names
    self.optimal_descriptors = data.get_optimal_descriptors()

    selected = self.optimal_descriptors[:self.n_feat]
    x = data.get_featurized_df()[selected].values
    print(x.shape)
    # One row per target after the transpose.
    y = data.get_target_df()[self.targets_flatten].values.transpose()
    print(y.shape)

    # Scale the input features:
    if self.xscale == 'minmax':
        # Column-wise rescaling to the range [-0.5, 0.5].
        self.xmin, self.xmax = x.min(axis=0), x.max(axis=0)
        x = (x - self.xmin) / (self.xmax - self.xmin) - 0.5
    elif self.xscale == 'standard':
        self.scaler = StandardScaler()
        x = self.scaler.fit_transform(x)

    # Constant columns give 0/0 under min-max scaling; NaN becomes 0 here.
    x = np.nan_to_num(x)

    # Per-epoch progress printer; the tracked validation key depends on PP.
    if val_fraction > 0:
        tracked = 'val_{}_mae'.format(val_key) if self.PP else 'val_mae'

        def report(epoch, logs):
            print("epoch {}: loss: {:.3f}, val_loss:{:.3f} val_{}:{:.3f}".format(
                epoch, logs['loss'], logs['val_loss'], val_key, logs[tracked]))
    else:
        def report(epoch, logs):
            print("epoch {}: loss: {:.3f}".format(epoch, logs['loss']))

    print_callback = LambdaCallback(on_epoch_end=report)

    print('compile', flush=True)
    self.model.compile(
        loss='mse',
        optimizer=keras.optimizers.Adam(lr=lr),
        metrics=['mae'],
        loss_weights=self.weights,
    )

    print('fit', flush=True)
    self.model.fit(
        x=x,
        y=list(y),
        epochs=epochs,
        batch_size=batch_size,
        verbose=0,
        validation_split=val_fraction,
        callbacks=[print_callback],
    )
def predict(self, test_data: MODData, return_prob=False) -> pd.DataFrame:
    """Run the model on the passed MODData and tabulate its outputs.

    Parameters:
        test_data: A featurized and feature-selected `MODData`
            object containing the descriptors used in training.
        return_prob: For classification targets only: when true, return
            one `"<target>_prob_<j>"` column per class holding the
            normalised model outputs; otherwise return the index of the
            largest output in a single `"<target>"` column.

    Returns:
        A `pandas.DataFrame` containing the predicted values of the
        targets, indexed by `test_data.structure_ids`.

    """
    # Neutralise inf/NaN features so they cannot produce NaN predictions.
    cleaned = test_data.get_featurized_df().replace(
        [np.inf, -np.inf, np.nan], 0)
    features = np.nan_to_num(
        cleaned[self.optimal_descriptors[:self.n_feat]].values)

    # Scale the input features:
    if self._scaler is not None:
        features = np.nan_to_num(self._scaler.transform(features), nan=-1)

    outputs = np.array(self.model.predict(features))
    if outputs.ndim == 2:
        # A 2-D result is treated as a single target: add a leading axis.
        outputs = outputs[np.newaxis]

    columns = {}
    for target_idx, name in enumerate(self.targets_flatten):
        is_classifier = self.num_classes[name] >= 2
        if not is_classifier:
            columns[name] = outputs[target_idx, :, 0]
            continue

        scores = outputs[target_idx, :, :]
        if not return_prob:
            columns[name] = np.argmax(scores, axis=1)
            continue

        # Divide each row by its sum so the class columns add up to one.
        normalised = scores / scores.sum(axis=1).reshape((-1, 1))
        for cls in range(normalised.shape[-1]):
            columns["{}_prob_{}".format(name, cls)] = normalised[:, cls]

    predictions = pd.DataFrame(columns)
    predictions.index = test_data.structure_ids
    return predictions
def fit(
    self,
    training_data: MODData,
    val_fraction: float = 0.0,
    val_key: Optional[str] = None,
    val_data: Optional[MODData] = None,
    lr: float = 0.001,
    epochs: int = 200,
    batch_size: int = 128,
    xscale: Optional[str] = "minmax",
    metrics: List[str] = ["mae"],
    callbacks: Optional[List[Callable]] = None,
    verbose: int = 0,
    loss: str = "mse",
    **fit_params,
) -> None:
    """Train the model on the passed training `MODData` object.

    Parameters:
        training_data: A `MODData` that has been featurized and
            feature selected. The first `self.n_feat` entries in
            `training_data.get_optimal_descriptors()` will be used
            for training.
        val_fraction: The fraction of the training data to use as a
            validation set for tracking model performance during
            training.
        val_key: The target name to track on the validation set
            during training, if performing multi-target learning.
        val_data: An optional `MODData` used as explicit validation data.
        lr: The learning rate.
        epochs: The maximum number of epochs to train for.
        batch_size: The batch size to use for training.
        xscale: The feature scaler to use, either `None`,
            `'minmax'` or `'standard'`. With `None` the features are
            passed to the model unscaled.
        metrics: A list of tf.keras metrics to pass to `compile(...)`.
            The default list is never mutated by this method.
        callbacks: Optional list of callbacks to pass to `fit(...)`. The
            passed list is not modified; when `verbose` is truthy a
            printing callback is added to a copy of it.
        verbose: Passed to `fit(...)`; when truthy, a per-epoch summary
            is also printed.
        loss: The built-in tf.keras loss to pass to `compile(...)`. It is
            replaced by `"categorical_crossentropy"` as soon as one
            target is a classification target.
        fit_params: Any additional parameters to pass to `fit(...)`,
            these will be overwritten by the explicit keyword
            arguments above.

    Raises:
        RuntimeError: If `self.n_feat` exceeds the number of optimal
            descriptors available in `training_data`.

    """
    optimal_descriptors = training_data.get_optimal_descriptors()
    if self.n_feat > len(optimal_descriptors):
        raise RuntimeError(
            "The model requires more features than computed in data. "
            f"Please reduce n_feat below or equal to {len(optimal_descriptors)}"
        )

    self.xscale = xscale
    self.target_names = list(self.weights.keys())
    self.optimal_descriptors = optimal_descriptors

    x = training_data.get_featurized_df()[
        self.optimal_descriptors[: self.n_feat]
    ].values

    # For compatibility with MODNet 0.1.7; if there is only one target in the training data,
    # use that for the name of the target too.
    if (
        len(self.targets_flatten) == 1
        and len(training_data.df_targets.columns) == 1
    ):
        self.targets_flatten = list(training_data.df_targets.columns)

    y = []
    for targ in self.targets_flatten:
        if self.num_classes[targ] >= 2:  # Classification
            y_inner = tf.keras.utils.to_categorical(
                training_data.df_targets[targ].values,
                num_classes=self.num_classes[targ],
            )
            loss = "categorical_crossentropy"
        else:
            # `np.float` was a plain alias of the builtin `float` and has
            # been removed from NumPy (1.24); the builtin is equivalent.
            y_inner = training_data.df_targets[targ].values.astype(
                float, copy=False
            )
        y.append(y_inner)

    # Scale the input features:
    if self.xscale == "minmax":
        self._scaler = MinMaxScaler(feature_range=(-0.5, 0.5))
    elif self.xscale == "standard":
        self._scaler = StandardScaler()
    elif self.xscale is None:
        # Documented "no scaling" mode: make sure no stale scaler from an
        # earlier fit is applied, here or at prediction time.
        self._scaler = None

    if self._scaler is not None:
        x = self._scaler.fit_transform(x)
    x = np.nan_to_num(x, nan=-1)

    if val_data is not None:
        val_x = val_data.get_featurized_df()[
            self.optimal_descriptors[: self.n_feat]
        ].values
        if self._scaler is not None:
            val_x = self._scaler.transform(val_x)
        val_x = np.nan_to_num(val_x, nan=-1)
        try:
            val_y = list(
                val_data.get_target_df()[self.targets_flatten]
                .values.astype(float, copy=False)
                .transpose()
            )
        except Exception:
            # Deliberate best-effort fallback kept from the original code:
            # use every target column of the validation data instead.
            val_y = list(
                val_data.get_target_df()
                .values.astype(float, copy=False)
                .transpose()
            )
        validation_data = (val_x, val_y)
    else:
        validation_data = None

    # Optionally set up print callback
    if verbose:
        if val_fraction > 0 or validation_data:
            if self._multi_target and val_key is not None:
                val_metric_key = f"val_{val_key}_mae"
            else:
                val_metric_key = "val_mae"
            print_callback = tf.keras.callbacks.LambdaCallback(
                on_epoch_end=lambda epoch, logs: print(
                    f"epoch {epoch}: loss: {logs['loss']:.3f}, "
                    f"val_loss:{logs['val_loss']:.3f} {val_metric_key}:{logs[val_metric_key]:.3f}"
                )
            )
        else:
            print_callback = tf.keras.callbacks.LambdaCallback(
                on_epoch_end=lambda epoch, logs: print(
                    f"epoch {epoch}: loss: {logs['loss']:.3f}"
                )
            )

        # Build a new list rather than appending, so the caller's list does
        # not grow by one print callback on every call to `fit`.
        if callbacks is None:
            callbacks = [print_callback]
        else:
            callbacks = [*callbacks, print_callback]

    # Extra keyword arguments are forwarded to `fit(...)`; the explicit
    # entries below take precedence over them, as documented.
    fit_params = {
        **fit_params,
        "x": x,
        "y": y,
        "epochs": epochs,
        "batch_size": batch_size,
        "verbose": verbose,
        "validation_split": val_fraction,
        "validation_data": validation_data,
        "callbacks": callbacks,
    }

    self.model.compile(
        loss=loss,
        # `lr` is a deprecated alias of `learning_rate` in tf.keras.
        optimizer=tf.keras.optimizers.Adam(learning_rate=lr),
        metrics=metrics,
        loss_weights=self.weights,
    )
    history = self.model.fit(**fit_params)
    self.history = history.history
def predict(self, test_data: MODData, return_prob=False, return_unc=False) -> pd.DataFrame:
    """Predict the target values for the passed MODData.

    The model is evaluated repeatedly on the same inputs and the repeated
    outputs are aggregated per target.

    Parameters:
        test_data: A featurized and feature-selected `MODData`
            object containing the descriptors used in training.
        return_prob: For a classification tasks only: whether to return the
            probability of each class OR only return the most probable class.
        return_unc: whether to return the standard deviation as a second
            dataframe.

    Returns:
        A `pandas.DataFrame` containing the predicted values of the targets.
        If return_unc=True, two `pandas.DataFrame` : (predictions, std)
        containing the predicted values of the targets and the standard
        deviations over the repeated model evaluations. For classification
        targets with `return_prob=False`, the second dataframe holds the
        largest mean model output of each sample instead of a standard
        deviation.

    """
    # Replace inf/NaN features by 0 so they cannot produce NaN predictions.
    x = (
        test_data.get_featurized_df()
        .replace([np.inf, -np.inf, np.nan], 0)[
            self.optimal_descriptors[: self.n_feat]
        ]
        .values
    )

    # Scale the input features:
    x = np.nan_to_num(x)
    if self._scaler is not None:
        x = self._scaler.transform(x)
        x = np.nan_to_num(x)

    # Number of repeated model evaluations that are aggregated below.
    n_draws = 1000
    all_predictions = []
    for _ in range(n_draws):
        p = self.model.predict(x)
        if len(self.targets_flatten) == 1:
            # Single target: add a leading axis so `p[i]` selects the target.
            p = np.array([p])
        all_predictions.append(p)

    p_dic = {}
    unc_dic = {}
    for i, name in enumerate(self.targets_flatten):
        # All draws for this target, stacked along a new leading axis.
        preds = np.array([pred[i] for pred in all_predictions])

        if self.num_classes[name] >= 2:
            if return_prob:
                # Normalise over the class axis; `keepdims` keeps the sum
                # broadcastable against the stacked draws.
                probs = preds / preds.sum(axis=-1, keepdims=True)
                # Aggregate over the draws only (axis 0), keeping one value
                # per sample and per class.
                mean_prob = probs.mean(axis=0)
                std_prob = probs.std(axis=0)
                for j in range(mean_prob.shape[-1]):
                    column = "{}_prob_{}".format(name, j)
                    p_dic[column] = mean_prob[:, j]
                    unc_dic[column] = std_prob[:, j]
            else:
                mean_scores = preds.mean(axis=0)
                p_dic[name] = np.argmax(mean_scores, axis=1)
                unc_dic[name] = np.max(mean_scores, axis=1)
        else:
            p_dic[name] = preds.mean(axis=0)[:, 0]
            unc_dic[name] = preds.std(axis=0)[:, 0]

    predictions = pd.DataFrame(p_dic)
    unc = pd.DataFrame(unc_dic)
    predictions.index = test_data.structure_ids
    unc.index = test_data.structure_ids

    if return_unc:
        return predictions, unc
    return predictions