Example 1
    def evaluate(self, test_data: MODData) -> float:
        """Evaluate the model on the passed MODData and return the corresponding loss.

        Parameters:
            test_data: A featurized and feature-selected `MODData`
                object containing the descriptors used in training.

        Returns:
            The loss score on the passed data.
        """
        # Prevent NaN predictions if some features are inf:
        x = (test_data.get_featurized_df().replace(
            [np.inf, -np.inf, np.nan],
            0)[self.optimal_descriptors[:self.n_feat]].values)

        # Scale the input features:
        x = np.nan_to_num(x)
        if self._scaler is not None:
            x = self._scaler.transform(x)
            x = np.nan_to_num(x, nan=-1)

        y = []
        for targ in self.targets_flatten:
            if self.num_classes[targ] >= 2:  # Classification
                y_inner = tf.keras.utils.to_categorical(
                    test_data.df_targets[targ].values,
                    num_classes=self.num_classes[targ],
                )
            else:
                y_inner = test_data.df_targets[targ].values.astype(np.float64,
                                                                   copy=False)
            y.append(y_inner)

        return self.model.evaluate(x, y)[0]
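A minimal usage sketch (the `model` and `test` names below are illustrative, assuming a trained instance of this class and a featurized, feature-selected `MODData`):

    # Hypothetical usage: compute the loss on held-out data.
    loss = model.evaluate(test)
    print(f"Test loss: {loss:.4f}")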
Example 2
    def predict(self, test_data: MODData) -> pd.DataFrame:
        """Predict the target values for the passed MODData.

        Parameters:
            test_data: A featurized and feature-selected `MODData`
                object containing the descriptors used in training.

        Returns:
            A `pandas.DataFrame` containing the predicted values of the targets.
        """

        x = test_data.get_featurized_df()[
            self.optimal_descriptors[:self.n_feat]
        ].values

        # Scale the input features:
        if self._scaler is not None:
            x = self._scaler.transform(x)

        x = np.nan_to_num(x)

        if self._multi_target:
            p = np.array(self.model.predict(x))[:, :, 0].transpose()
        else:
            p = np.array(self.model.predict(x))[:, 0].transpose()

        predictions = pd.DataFrame(p)
        predictions.columns = self.targets_flatten
        predictions.index = test_data.structure_ids

        return predictions
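A brief usage sketch (again assuming an illustrative trained `model` and a featurized `MODData` named `test`):

    # Hypothetical usage: one column per target, indexed by structure id.
    predictions = model.predict(test)
    print(predictions.head())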
Example 3
    def fit(self, data: MODData, val_fraction=0.0, val_key=None, lr=0.001,
            epochs=200, batch_size=128, xscale='minmax', yscale=None):
        """Train the model on the passed MODData object."""
        self.xscale = xscale
        self.target_names = data.names
        self.optimal_descriptors = data.get_optimal_descriptors()
        x = data.get_featurized_df()[self.optimal_descriptors[:self.n_feat]].values
        y = data.get_target_df()[self.targets_flatten].values.transpose()

        # Scale the input features:
        if self.xscale == 'minmax':
            self.xmin = x.min(axis=0)
            self.xmax = x.max(axis=0)
            x = (x - self.xmin) / (self.xmax - self.xmin) - 0.5
        elif self.xscale == 'standard':
            self.scaler = StandardScaler()
            x = self.scaler.fit_transform(x)

        x = np.nan_to_num(x)

        # Print the loss (and validation metrics, if any) after each epoch:
        if val_fraction > 0:
            if self.PP:
                print_callback = LambdaCallback(
                    on_epoch_end=lambda epoch, logs: print(
                        "epoch {}: loss: {:.3f}, val_loss:{:.3f} val_{}:{:.3f}".format(
                            epoch, logs['loss'], logs['val_loss'], val_key,
                            logs['val_{}_mae'.format(val_key)])))
            else:
                print_callback = LambdaCallback(
                    on_epoch_end=lambda epoch, logs: print(
                        "epoch {}: loss: {:.3f}, val_loss:{:.3f} val_{}:{:.3f}".format(
                            epoch, logs['loss'], logs['val_loss'], val_key,
                            logs['val_mae'])))
        else:
            print_callback = LambdaCallback(
                on_epoch_end=lambda epoch, logs: print(
                    "epoch {}: loss: {:.3f}".format(epoch, logs['loss'])))

        fit_params = {
            'x': x,
            'y': list(y),
            'epochs': epochs,
            'batch_size': batch_size,
            'verbose': 0,
            'validation_split': val_fraction,
            'callbacks': [print_callback],
        }

        self.model.compile(loss='mse',
                           optimizer=keras.optimizers.Adam(learning_rate=lr),
                           metrics=['mae'],
                           loss_weights=self.weights)
        self.model.fit(**fit_params)
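A training sketch for this older signature (the `model` and `train` names, the 'eform' target key, and the hyperparameter values are all assumptions for illustration):

    # Hypothetical usage: train with a 10% validation split; 'eform' is a
    # placeholder target name used only for the printed validation metric.
    model.fit(train, val_fraction=0.1, val_key='eform', lr=0.001,
              epochs=200, batch_size=128, xscale='minmax')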
Example 4
    def predict(self, test_data: MODData, return_prob=False) -> pd.DataFrame:
        """Predict the target values for the passed MODData.

        Parameters:
            test_data: A featurized and feature-selected `MODData`
                object containing the descriptors used in training.
            return_prob: For classification tasks only: whether to return the
                probability of each class or only the most probable class.

        Returns:
            A `pandas.DataFrame` containing the predicted values of the targets.
        """
        # Prevent NaN predictions if some features are inf:
        x = (test_data.get_featurized_df().replace(
            [np.inf, -np.inf, np.nan],
            0)[self.optimal_descriptors[:self.n_feat]].values)

        # Scale the input features:
        x = np.nan_to_num(x)
        if self._scaler is not None:
            x = self._scaler.transform(x)
            x = np.nan_to_num(x, nan=-1)

        p = np.array(self.model.predict(x))
        if len(p.shape) == 2:
            p = np.array([p])
        p_dic = {}
        for i, name in enumerate(self.targets_flatten):
            if self.num_classes[name] >= 2:
                if return_prob:
                    temp = p[i, :, :] / (p[i, :, :].sum(axis=1)).reshape(
                        (-1, 1))
                    for j in range(temp.shape[-1]):
                        p_dic["{}_prob_{}".format(name, j)] = temp[:, j]
                else:
                    p_dic[name] = np.argmax(p[i, :, :], axis=1)
            else:
                p_dic[name] = p[i, :, 0]
        predictions = pd.DataFrame(p_dic)
        predictions.index = test_data.structure_ids

        return predictions
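A usage sketch for the classification path (names illustrative):

    # Hypothetical usage: per-class probabilities vs. the most probable class.
    probs = model.predict(test, return_prob=True)   # columns '<target>_prob_0', '<target>_prob_1', ...
    labels = model.predict(test)                    # one integer class label per sample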
Example 5
    def fit(
        self,
        training_data: MODData,
        val_fraction: float = 0.0,
        val_key: Optional[str] = None,
        val_data: Optional[MODData] = None,
        lr: float = 0.001,
        epochs: int = 200,
        batch_size: int = 128,
        xscale: Optional[str] = "minmax",
        metrics: List[str] = ["mae"],
        callbacks: Optional[List[Callable]] = None,
        verbose: int = 0,
        loss: str = "mse",
        **fit_params,
    ) -> None:
        """Train the model on the passed training `MODData` object.

        Parameters:
            training_data: A `MODData` that has been featurized and
                feature selected. The first `self.n_feat` entries in
                `training_data.get_optimal_descriptors()` will be used
                for training.
            val_fraction: The fraction of the training data to use as a
                validation set for tracking model performance during
                training.
            val_key: The target name to track on the validation set
                during training, if performing multi-target learning.
            lr: The learning rate.
            epochs: The maximum number of epochs to train for.
            batch_size: The batch size to use for training.
            xscale: The feature scaler to use, either `None`,
                `'minmax'` or `'standard'`.
            metrics: A list of tf.keras metrics to pass to `compile(...)`.
            loss: The built-in tf.keras loss to pass to `compile(...)`.
            fit_params: Any additional parameters to pass to `fit(...)`;
                these will be overwritten by the explicit keyword
                arguments above.

        """

        if self.n_feat > len(training_data.get_optimal_descriptors()):
            raise RuntimeError(
                "The model requires more features than computed in data. "
                f"Please reduce n_feat below or equal to {len(training_data.get_optimal_descriptors())}"
            )

        self.xscale = xscale
        self.target_names = list(self.weights.keys())
        self.optimal_descriptors = training_data.get_optimal_descriptors()

        x = training_data.get_featurized_df()[
            self.optimal_descriptors[:self.n_feat]].values

        # For compatibility with MODNet 0.1.7; if there is only one target in the training data,
        # use that for the name of the target too.
        if (len(self.targets_flatten) == 1
                and len(training_data.df_targets.columns) == 1):
            self.targets_flatten = list(training_data.df_targets.columns)

        y = []
        for targ in self.targets_flatten:
            if self.num_classes[targ] >= 2:  # Classification
                y_inner = tf.keras.utils.to_categorical(
                    training_data.df_targets[targ].values,
                    num_classes=self.num_classes[targ],
                )
                loss = "categorical_crossentropy"
            else:
                y_inner = training_data.df_targets[targ].values.astype(
                    np.float64, copy=False)
            y.append(y_inner)

        # Scale the input features:
        if self.xscale == "minmax":
            self._scaler = MinMaxScaler(feature_range=(-0.5, 0.5))

        elif self.xscale == "standard":
            self._scaler = StandardScaler()

        # `xscale=None` sets no new scaler; only scale if one is available:
        if self._scaler is not None:
            x = self._scaler.fit_transform(x)
        x = np.nan_to_num(x, nan=-1)

        if val_data is not None:
            val_x = val_data.get_featurized_df()[
                self.optimal_descriptors[:self.n_feat]].values
            val_x = self._scaler.transform(val_x)
            val_x = np.nan_to_num(val_x, nan=-1)
            try:
                val_y = list(val_data.get_target_df()[
                    self.targets_flatten].values.astype(
                        np.float64, copy=False).transpose())
            except Exception:
                val_y = list(val_data.get_target_df().values.astype(
                    np.float64, copy=False).transpose())
            validation_data = (val_x, val_y)
        else:
            validation_data = None

        # Optionally set up print callback
        if verbose:
            if val_fraction > 0 or validation_data:
                if self._multi_target and val_key is not None:
                    val_metric_key = f"val_{val_key}_mae"
                else:
                    val_metric_key = "val_mae"
                print_callback = tf.keras.callbacks.LambdaCallback(
                    on_epoch_end=lambda epoch, logs: print(
                        f"epoch {epoch}: loss: {logs['loss']:.3f}, "
                        f"val_loss:{logs['val_loss']:.3f} {val_metric_key}:{logs[val_metric_key]:.3f}"
                    ))

            else:
                print_callback = tf.keras.callbacks.LambdaCallback(
                    on_epoch_end=lambda epoch, logs: print(
                        f"epoch {epoch}: loss: {logs['loss']:.3f}"))

            # Register the print callback whether or not validation is used:
            if callbacks is None:
                callbacks = [print_callback]
            else:
                callbacks.append(print_callback)

        # Merge any user-supplied fit parameters; the explicit keyword
        # arguments above take precedence:
        fit_params = {
            **fit_params,
            "x": x,
            "y": y,
            "epochs": epochs,
            "batch_size": batch_size,
            "verbose": verbose,
            "validation_split": val_fraction,
            "validation_data": validation_data,
            "callbacks": callbacks,
        }

        self.model.compile(
            loss=loss,
            optimizer=tf.keras.optimizers.Adam(learning_rate=lr),
            metrics=metrics,
            loss_weights=self.weights,
        )
        history = self.model.fit(**fit_params)
        self.history = history.history
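A sketch of a training run with an explicit validation set (the `train` and `val` MODData objects, the 'eform' key, and the hyperparameter values are assumptions):

    # Hypothetical usage: track MAE on a held-out validation MODData.
    model.fit(
        train,
        val_data=val,
        val_key='eform',   # placeholder target name for the printed metric
        lr=0.001,
        epochs=200,
        batch_size=64,
        xscale='minmax',
        metrics=['mae'],
        verbose=1,
    )
    print(model.history['loss'][-1])   # final training loss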
Example 6
    def predict(self,
                test_data: MODData,
                return_prob=False,
                return_unc=False) -> pd.DataFrame:
        """Predict the target values for the passed MODData.

        Parameters:
            test_data: A featurized and feature-selected `MODData`
                object containing the descriptors used in training.
            return_prob: For classification tasks only: whether to return the
                probability of each class or only the most probable class.
            return_unc: Whether to return the standard deviation as a second dataframe.

        Returns:
            A `pandas.DataFrame` containing the predicted values of the targets.
            If `return_unc=True`, a tuple of two `pandas.DataFrame`s,
            `(predictions, std)`, containing the predicted values of the
            targets and the standard deviations of the epistemic uncertainty.
        """
        # Prevent NaN predictions if some features are inf:
        x = (test_data.get_featurized_df().replace(
            [np.inf, -np.inf, np.nan],
            0)[self.optimal_descriptors[:self.n_feat]].values)

        # Scale the input features:
        x = np.nan_to_num(x)
        if self._scaler is not None:
            x = self._scaler.transform(x)
            x = np.nan_to_num(x)

        all_predictions = []

        # Perform 1000 stochastic forward passes to sample the predictive
        # distribution behind the epistemic uncertainty estimate:
        for _ in range(1000):
            p = self.model.predict(x)
            if len(self.targets_flatten) == 1:
                p = np.array([p])
            all_predictions.append(p)

        p_dic = {}
        unc_dic = {}
        for i, name in enumerate(self.targets_flatten):
            if self.num_classes[name] >= 2:
                if return_prob:
                    preds = np.array([pred[i] for pred in all_predictions])
                    # Normalise each stochastic sample into class probabilities:
                    probs = preds / preds.sum(axis=-1, keepdims=True)
                    mean_prob = probs.mean(axis=0)
                    std_prob = probs.std(axis=0)
                    for j in range(mean_prob.shape[-1]):
                        p_dic["{}_prob_{}".format(name, j)] = mean_prob[:, j]
                        unc_dic["{}_prob_{}".format(name, j)] = std_prob[:, j]
                else:
                    p_dic[name] = np.argmax(
                        np.array([pred[i]
                                  for pred in all_predictions]).mean(axis=0),
                        axis=1,
                    )
                    # The highest mean class probability serves as a confidence proxy:
                    unc_dic[name] = np.max(
                        np.array([pred[i]
                                  for pred in all_predictions]).mean(axis=0),
                        axis=1,
                    )
            else:
                mean_p = np.array([pred[i]
                                   for pred in all_predictions]).mean(axis=0)
                std_p = np.array([pred[i]
                                  for pred in all_predictions]).std(axis=0)
                p_dic[name] = mean_p[:, 0]
                unc_dic[name] = std_p[:, 0]

        predictions = pd.DataFrame(p_dic)
        unc = pd.DataFrame(unc_dic)

        predictions.index = test_data.structure_ids
        unc.index = test_data.structure_ids

        if return_unc:
            return predictions, unc
        else:
            return predictions
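A usage sketch for the uncertainty-aware path (names illustrative):

    # Hypothetical usage: mean predictions plus epistemic standard deviations.
    predictions, std = model.predict(test, return_unc=True)
    print(predictions.join(std, rsuffix='_std').head())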