Example #1
0
    def train(self, train_adata, condition_key, le=None, n_epochs=1000, batch_size=256):
        """Fit the backend model on `train_adata`.

        Densifies the AnnData, encodes the condition column into flat integer
        labels, wraps features and labels in a shuffling `Loader`, and hands
        the optimization loop off to `self.model_backend`.

        # Parameters
            train_adata: AnnData
                training data; `.X` provides the feature matrix.
            condition_key: str
                column of `train_adata.obs` holding the condition labels.
            le:
                optional pre-fit label encoder passed through to `label_encoder`.
            n_epochs: int
                number of epochs forwarded to the backend trainer.
            batch_size: int
                mini-batch size forwarded to the backend trainer.
        """
        dense_adata = remove_sparsity(train_adata)

        features = dense_adata.X
        encoded_labels, _ = label_encoder(dense_adata, le, condition_key)
        flat_labels = np.reshape(encoded_labels, (-1,))

        data_loader = Loader(features, labels=flat_labels, shuffle=True)
        self.model_backend.train(data_loader, n_epochs, batch_size)
Example #2
0
 def train(self, train_data, use_validation=False, valid_data=None, n_epochs=25, batch_size=32, early_stop_limit=20,
           threshold=0.0025, initial_run=True, shuffle=True):
     """
         Train the network for up to `n_epochs` epochs on `train_data`,
         optionally early-stopping on a validation loss computed from
         `valid_data`. The trained model is checkpointed to
         `self.model_to_use` via `self.saver`.

         # Parameters
             train_data: AnnData
                 training data; mini-batches are sliced from `.X`.
             use_validation: bool
                 if `True`, `valid_data` must be provided and is used for
                 early stopping.
             valid_data: AnnData
                 validation data; required when `use_validation` is `True`.
             n_epochs: int
                 maximum number of training epochs.
             batch_size: int
                 mini-batch size for both training and validation passes.
             early_stop_limit: int
                 number of consecutive epochs without sufficient validation
                 improvement after which training stops.
             threshold: float
                 minimum decrease in validation loss for an epoch to count
                 as an improvement for early stopping.
             initial_run: bool
                 if `True`, reset the TF global step and train from scratch;
                 if `False`, restore the last checkpoint and resume.
             shuffle: bool
                 accepted for API compatibility; not used by this
                 implementation (batches are taken in order).

         # Returns
             Nothing is returned; the model is saved to `self.model_to_use`.
     """
     # Fresh run: zero the global step so step-dependent terms restart.
     if initial_run:
         log.info("----Training----")
         assign_step_zero = tensorflow.assign(self.global_step, 0)
         _init_step = self.sess.run(assign_step_zero)
     # Resume: restore previously checkpointed weights before training.
     if not initial_run:
         self.saver.restore(self.sess, self.model_to_use)
     train_labels, le = label_encoder(train_data)
     if use_validation and valid_data is None:
         raise Exception("valid_data is None but use_validation is True.")
     if use_validation:
         valid_labels, _ = label_encoder(valid_data)
     loss_hist = []
     patience = early_stop_limit
     min_delta = threshold
     patience_cnt = 0
     for it in range(n_epochs):
         # Advance the global step once per epoch; `current_step` feeds
         # `self.time_step` in every batch of this epoch.
         increment_global_step_op = tensorflow.assign(self.global_step, self.global_step + 1)
         _step = self.sess.run(increment_global_step_op)
         current_step = self.sess.run(self.global_step)
         train_loss = 0
         # Sequential (unshuffled) mini-batches over the training set.
         for lower in range(0, train_data.shape[0], batch_size):
             upper = min(lower + batch_size, train_data.shape[0])
             # Densify sparse slices; feed the same labels to encoder and decoder.
             if sparse.issparse(train_data.X):
                 x_mb = train_data[lower:upper, :].X.A
             else:
                 x_mb = train_data[lower:upper, :].X
             y_mb = train_labels[lower:upper]
             _, current_loss_train = self.sess.run([self.solver, self.vae_loss],
                                                   feed_dict={self.x: x_mb, self.encoder_labels: y_mb,
                                                              self.decoder_labels: y_mb,
                                                              self.time_step: current_step,
                                                              self.size: len(x_mb), self.is_training: True})
             train_loss += current_loss_train
         # NOTE(review): `//` truncates the reported mean loss, and
         # `train_data.shape[0] // batch_size` is 0 (ZeroDivisionError) when
         # the dataset is smaller than one batch — confirm intended.
         print(f"iteration {it}: {train_loss // (train_data.shape[0] // batch_size)}")
         if use_validation:
             valid_loss = 0
             for lower in range(0, valid_data.shape[0], batch_size):
                 upper = min(lower + batch_size, valid_data.shape[0])
                 if sparse.issparse(valid_data.X):
                     x_mb = valid_data[lower:upper, :].X.A
                 else:
                     x_mb = valid_data[lower:upper, :].X
                 y_mb = valid_labels[lower:upper]
                 # Validation pass only evaluates the loss (is_training=False,
                 # no optimizer op).
                 current_loss_valid = self.sess.run(self.vae_loss,
                                                    feed_dict={self.x: x_mb, self.encoder_labels: y_mb,
                                                               self.decoder_labels: y_mb,
                                                               self.time_step: current_step,
                                                               self.size: len(x_mb),
                                                               self.is_training: False})
                 valid_loss += current_loss_valid
             loss_hist.append(valid_loss / valid_data.shape[0])
             # Early stopping: reset patience on a big-enough improvement,
             # otherwise count this epoch against the patience budget.
             if it > 0 and loss_hist[it - 1] - loss_hist[it] > min_delta:
                 patience_cnt = 0
             else:
                 patience_cnt += 1
             if patience_cnt > patience:
                 os.makedirs(self.model_to_use, exist_ok=True)
                 save_path = self.saver.save(self.sess, self.model_to_use)
                 break
     # for/else: this branch runs only when the loop completed without the
     # early-stop `break`, so the model is saved exactly once either way.
     else:
         os.makedirs(self.model_to_use, exist_ok=True)
         save_path = self.saver.save(self.sess, self.model_to_use)
     log.info(f"Model saved in file: {save_path}. Training finished")
    def train(self,
              train_adata,
              valid_adata=None,
              condition_encoder=None,
              condition_key='condition',
              n_epochs=25,
              batch_size=32,
              early_stop_limit=20,
              lr_reducer=10,
              threshold=0.0025,
              monitor='val_loss',
              shuffle=True,
              verbose=2,
              save=True):
        """
            Train the CVAE for up to `n_epochs` epochs on `train_adata`,
            optionally validating on `valid_adata`.

            # Parameters
                train_adata: AnnData
                    training data; `.X` is densified in place if sparse.
                valid_adata: AnnData
                    optional validation data, passed to Keras as
                    `validation_data`.
                condition_encoder:
                    optional pre-fit encoder forwarded to `label_encoder`.
                condition_key: str
                    column of `.obs` holding the condition labels.
                n_epochs: int
                    maximum number of training epochs.
                batch_size: int
                    mini-batch size for `Model.fit`.
                early_stop_limit: int
                    patience (epochs) for `EarlyStopping`; disabled if <= 0.
                lr_reducer: int
                    patience (epochs) for `ReduceLROnPlateau`; disabled if <= 0.
                threshold: float
                    `min_delta` for early stopping.
                monitor: str
                    metric watched by the early-stopping/LR callbacks.
                shuffle: bool
                    whether Keras shuffles training batches each epoch.
                verbose: int
                    Keras verbosity; values > 2 switch to a custom
                    per-epoch `print_message` callback instead.
                save: bool
                    if `True`, call `self.save_model()` after fitting.

            # Returns
                Nothing is returned; the model is trained (and optionally
                saved) in place.
        """
        # Encode conditions once; the model takes one-hot labels for both
        # encoder and decoder inputs and integer labels as a fit target.
        labels_int, _ = label_encoder(train_adata, condition_encoder,
                                      condition_key)
        labels_1hot = to_categorical(labels_int,
                                     num_classes=self.n_conditions)

        training_callbacks = [
            History(),
            CSVLogger(filename="./csv_logger.log"),
        ]
        if early_stop_limit > 0:
            training_callbacks.append(
                EarlyStopping(patience=early_stop_limit,
                              monitor=monitor,
                              min_delta=threshold))
        if lr_reducer > 0:
            training_callbacks.append(
                ReduceLROnPlateau(monitor=monitor,
                                  patience=lr_reducer,
                                  verbose=verbose))

        # Very verbose mode: silence Keras and report progress through the
        # project's own print_message callback instead.
        if verbose > 2:
            training_callbacks.append(
                LambdaCallback(on_epoch_end=lambda epoch, logs: print_message(
                    epoch, logs, n_epochs, verbose)))
            keras_verbose = 0
        else:
            keras_verbose = verbose

        if sparse.issparse(train_adata.X):
            train_adata.X = train_adata.X.A

        model_inputs = [train_adata.X, labels_1hot, labels_1hot]
        model_targets = [train_adata.X, labels_int]

        # Only attach validation_data when validation data was supplied.
        fit_extras = {}
        if valid_adata is not None:
            if sparse.issparse(valid_adata.X):
                valid_adata.X = valid_adata.X.A

            v_labels_int, _ = label_encoder(valid_adata,
                                            condition_encoder,
                                            condition_key)
            v_labels_1hot = to_categorical(v_labels_int,
                                           num_classes=self.n_conditions)
            fit_extras['validation_data'] = (
                [valid_adata.X, v_labels_1hot, v_labels_1hot],
                [valid_adata.X, v_labels_int],
            )

        self.cvae_model.fit(x=model_inputs,
                            y=model_targets,
                            epochs=n_epochs,
                            batch_size=batch_size,
                            shuffle=shuffle,
                            callbacks=training_callbacks,
                            verbose=keras_verbose,
                            **fit_extras)
        if save:
            self.save_model()
            (valid_adata.obs[cell_type_key] == specific_cell_type) &
            (valid_adata.obs[condition_key].isin(target_conditions)))]

        # Latent-space size for the CVAE (hard-coded in this benchmark run).
        z_dim = 100
        network = CVAE(
            x_dimension=net_train_adata.X.shape[1],
            z_dimension=z_dim,
            alpha=0.1,
            model_path=f"./models/CVAE/{data_name}/{specific_cell_type}/cvae")

        # Train with a held-out validation split so early stopping can kick in.
        network.train(net_train_adata,
                      use_validation=True,
                      valid_data=net_valid_adata,
                      n_epochs=120)

        # NOTE(review): `train_labels` is not used in this visible span, and
        # `le` is indexed like a name->code mapping below — confirm against
        # the `label_encoder` return convention.
        train_labels, _ = label_encoder(train_adata, le, 'condition')
        cell_type_adata = train_adata[train_adata.obs[cell_type_key] ==
                                      specific_cell_type]

        # Take control-condition cells of the held-out cell type and translate
        # them to the target condition with the trained network.
        unperturbed_data = cell_type_adata[cell_type_adata.obs[condition_key]
                                           == control_condition]
        target_labels = np.zeros(
            (len(unperturbed_data), 1)) + le[target_condition]
        source_labels = np.zeros(
            (len(unperturbed_data), 1)) + le[control_condition]
        pred_adata = network.predict(unperturbed_data, source_labels,
                                     target_labels)
        # Tag predictions so downstream evaluation can tell methods apart.
        pred_adata.obs[condition_key] = [
            f"{specific_cell_type}_pred_{target_condition}"
        ] * len(target_labels)
        pred_adata.obs['method'] = 'CVAE'
    (valid_adata.obs[cell_type_key] == specific_cell_type) &
    (valid_adata.obs[condition_key].isin(target_conditions)))]

# Latent-space size for the CVAE (hard-coded in this benchmark script).
z_dim = 100
network = CVAE(
    x_dimension=net_train_adata.X.shape[1],
    z_dimension=z_dim,
    alpha=0.1,
    model_path=f"../models/CVAE/{data_name}/{specific_cell_type}/cvae")

# Train with a held-out validation split so early stopping can kick in.
network.train(net_train_adata,
              use_validation=True,
              valid_data=net_valid_adata,
              n_epochs=120)

# NOTE(review): `train_labels` is not used again in this visible span, and
# `le` is indexed like a name->code mapping below — confirm against the
# `label_encoder` return convention.
train_labels, _ = label_encoder(train_adata, le, 'condition')
cell_type_adata = train_adata[train_adata.obs[cell_type_key] ==
                              specific_cell_type]

# Take control-condition cells of the held-out cell type and translate them
# to the target condition with the trained network.
unperturbed_data = cell_type_adata[cell_type_adata.obs[condition_key] ==
                                   control_condition]
target_labels = np.zeros((len(unperturbed_data), 1)) + le[target_condition]
source_labels = np.zeros((len(unperturbed_data), 1)) + le[control_condition]
pred_adata = network.predict(unperturbed_data, source_labels, target_labels)
# Tag predictions so downstream evaluation can tell methods apart, then
# persist the reconstructed cells for comparison against other methods.
pred_adata.obs[condition_key] = [
    f"{specific_cell_type}_pred_{target_condition}"
] * len(target_labels)
pred_adata.obs['method'] = 'CVAE'
pred_adata.write(
    f"./data/reconstructed/{data_name}/CVAE-{specific_cell_type}.h5ad")