Example #1
    def train_final_vae(self, model_config):
        model_config["name"] = model_config["name"] + "_FULL"
        model_dir = self.get_model_dir(model_config["name"])
        create_dir(model_dir)
        model_config["model_dir"] = model_dir

        n_epochs = 2 if self.debug else 200
        full_dataset = Dataset.concatenate(*self.datasets)
        final_vae = VAE(model_config)
        final_vae.train(full_dataset,
                        epochs=n_epochs,
                        batch_size=50,
                        validation_dataset=full_dataset)
        latent_reps = final_vae.encode(full_dataset.features)
        results = np.hstack((
            np.expand_dims(full_dataset.sample_data[0], axis=1),  # cell ids
            latent_reps,
            np.expand_dims(full_dataset.sample_data[1], axis=1),  # cell types
            np.expand_dims(full_dataset.sample_data[2], axis=1),  # cell subtypes
        ))

        header = ["cell_ids"]
        header += ["dim{}".format(i)
                   for i in range(1, model_config["latent_size"] + 1)]
        header += ["cell_type", "cell_subtype"]
        header = np.array(header)

        results = np.vstack((header, results))

        save_data_table(
            results, model_config["model_dir"] + "/latent_representations.txt")
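
For downstream analysis it can help to load the saved table back into a dataframe. A minimal sketch, assuming pandas is available and that save_data_table writes tab-separated values (the delimiter is an assumption):

import pandas as pd

# Load the saved latent representations; `sep` is an assumption about
# save_data_table's delimiter.
df = pd.read_csv("latent_representations.txt", sep="\t")
latent_cols = [c for c in df.columns if c.startswith("dim")]
print(df[latent_cols].shape)  # (n_cells, latent_size)
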
Example #2
    def train_vae(self, case_config):
        model_config = self.get_model_config(case_config)
        create_dir(model_config["model_dir"])

        avg_valid_loss = 0.0
        for k in range(10):  # 10-fold cross-validation
            train_dataset = Dataset.concatenate(*(self.datasets[:k] +
                                                  self.datasets[(k + 1):]))
            valid_dataset = self.datasets[k]
            # Start training!
            vae = VAE(model_config)

            epochs = 2 if self.debug else 100

            vae.train(train_dataset,
                      epochs=epochs,
                      batch_size=50,
                      validation_dataset=valid_dataset)

            fold_valid_loss = vae.evaluate(valid_dataset)
            self.logger.info("{}|Fold #{} Loss = {:f}".format(
                model_config["name"], k + 1, fold_valid_loss))

            avg_valid_loss += fold_valid_loss

            if self.debug:
                break

        avg_valid_loss /= 10
        self.logger.info("{}|Avg Validation Loss = {:f}".format(
            model_config["name"], avg_valid_loss))

        self.case_counter += 1

        return {
            "status": STATUS_OK,
            "loss": avg_valid_loss,
            "name": model_config["name"],
            "model_config": model_config
        }
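
STATUS_OK is hyperopt's success flag, which suggests train_vae is written as an fmin objective. A minimal wiring sketch, assuming a `trainer` instance of this class and a hypothetical search space over latent_size:

from hyperopt import Trials, fmin, hp, tpe

# Hypothetical search space; the real one depends on get_model_config.
space = {"latent_size": hp.choice("latent_size", [16, 32, 64])}

trials = Trials()
best = fmin(fn=trainer.train_vae,  # `trainer` instantiates this class
            space=space,
            algo=tpe.suggest,
            max_evals=25,
            trials=trials)
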
Example #3
    def train_final_ae(self, model_config):
        model_config["name"] = model_config["name"] + "_FULL"
        model_dir = self.get_model_dir(model_config["name"])
        create_dir(model_dir)
        model_config["model_dir"] = model_dir

        n_epochs = 2 if self.debug else 100
        full_dataset = Dataset.concatenate(*self.datasets)

        self.logger.info("Training Final AE: " + model_config["name"])
        final_ae = AE(model_config)
        final_ae.train(full_dataset,
                       epochs=n_epochs,
                       batch_size=50,
                       validation_dataset=full_dataset)
        loss = final_ae.evaluate(full_dataset)
        self.logger.info("{}|Loss = {:f}".format(model_config["name"], loss))

        self.logger.info("Creating latent represenations...")
        latent_reps = final_ae.encode(full_dataset.features)

        results = np.hstack((
            np.expand_dims(full_dataset.sample_data[0], axis=1),  # cell ids
            latent_reps,
            np.expand_dims(full_dataset.sample_data[1], axis=1),  # cell types
            np.expand_dims(full_dataset.sample_data[2], axis=1),  # cell subtypes
        ))

        latent_size = int(model_config["encoder_layers"][-1].split(":")[1])
        header = ["cell_ids"]
        header += ["dim{}".format(i) for i in range(1, latent_size + 1)]
        header += ["cell_type", "cell_subtype"]
        header = np.array(header)

        results = np.vstack((header, results))

        self.logger.info("Saving results")
        save_data_table(
            results, model_config["model_dir"] + "/latent_representations.txt")
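
The header construction above implies that entries in encoder_layers are "type:size" strings whose last element carries the latent width. A small illustration of that convention (the exact spec format is inferred from the split(":") call and is otherwise an assumption):

# Hypothetical layer spec; only the "name:size" shape is inferred from
# the code above.
encoder_layers = ["dense:512", "dense:128", "dense:32"]
latent_size = int(encoder_layers[-1].split(":")[1])
print(latent_size)  # 32
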
    def train_final_model(self, model_config, batch_size=None):
        model_dir = self.get_model_dir(model_config["name"])
        create_dir(model_dir)
        model_config["model_dir"] = model_dir

        if batch_size is None:
            if "batch_size" in model_config:
                batch_size = model_config["batch_size"]
            else:
                raise ValueError(
                    "No batch size specified for model training")

        full_dataset = Dataset.concatenate(*self.datasets)

        if self.logger is not None:
            self.logger.info("Training Final Model: {}".format(
                model_config["name"]))

        model = self.model_class(model_config)
        if self.debug:
            self.epochs = 2
        train_history = model.train(full_dataset,
                                    epochs=self.epochs,
                                    batch_size=batch_size)

        metrics = model.evaluate(full_dataset)
        if self.logger is not None:
            for k, v in metrics.items():
                self.logger.info("{}|{} = {:f}".format(model_config["name"], k,
                                                       v))

        return {
            "model": model,
            "train_history": train_history,
            "dataset": full_dataset,
            "metrics": metrics
        }
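
A hedged usage sketch for train_final_model; the `trainer` instance and its config keys are assumptions based on the attributes the method reads:

# Hypothetical call; the trainer is expected to expose model_class,
# epochs, datasets, get_model_dir, and an optional logger.
result = trainer.train_final_model({"name": "best_model"}, batch_size=50)
print(result["metrics"])
model = result["model"]
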
    def train_case_model(self,
                         case_config,
                         batch_size=None,
                         loss_metric="loss"):
        model_config = self.get_model_config(case_config)
        create_dir(model_config["model_dir"])

        if self.logger is not None:
            self.logger.info("Training %s..." % model_config["name"])

        status = STATUS_OK
        avg_valid_metrics = {}
        for k in range(self.n_folds):
            train_dataset = Dataset.concatenate(*(self.datasets[:k] +
                                                  self.datasets[(k + 1):]))
            valid_dataset = self.datasets[k]

            model = self.model_class(model_config)

            if batch_size is None:
                if "batch_size" in model_config:
                    batch_size = model_config["batch_size"]
                elif "batch_size" in case_config:
                    batch_size = case_config["batch_size"]
                else:
                    raise ValueError(
                        "No batch size specified for model training")

            if self.debug:
                self.epochs = 2
            model.train(train_dataset,
                        epochs=self.epochs,
                        batch_size=batch_size,
                        validation_dataset=valid_dataset)

            fold_valid_metrics = model.evaluate(valid_dataset)
            if not isinstance(fold_valid_metrics, dict):
                raise TypeError("Evaluate method of model must return a "
                                "dictionary of metric names and values")

            if np.any(np.isnan(list(fold_valid_metrics.values()))) or \
                    np.any(np.isinf(list(fold_valid_metrics.values()))):
                for key in fold_valid_metrics.keys():
                    avg_valid_metrics[key] = None
                status = STATUS_FAIL
                break
            else:
                for name, value in fold_valid_metrics.items():
                    if name in avg_valid_metrics:
                        avg_valid_metrics[name] += value
                    else:
                        avg_valid_metrics[name] = value

                    if self.logger is not None:
                        self.logger.info("{}|Fold #{}|{} = {:f}".format(
                            model_config["name"], k + 1, name, value))

            if self.debug:
                break

        if status != STATUS_FAIL:
            for name, metric in avg_valid_metrics.items():
                metric /= self.n_folds
                avg_valid_metrics[name] = metric
                if self.logger is not None:
                    self.logger.info("{}|Avg {} = {:f}".format(
                        model_config["name"], name, metric))

        self.case_counter += 1

        return {
            "status": status,
            "model_config": model_config,
            "loss": avg_valid_metrics[loss_metric],
            "avg_valid_metrics": avg_valid_metrics
        }
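
The slicing in the fold loop implements leave-one-fold-out cross-validation: fold k is held out and the remaining folds are concatenated for training. A tiny illustration with plain lists standing in for Dataset objects:

# Plain-list stand-in for the Dataset fold rotation above.
datasets = ["fold0", "fold1", "fold2", "fold3", "fold4"]
for k in range(len(datasets)):
    train = datasets[:k] + datasets[k + 1:]
    valid = datasets[k]
    print(valid, "vs", train)
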
import csv

import numpy as np
import six


def save_data_table(data, filepath, delimiter="\t"):
    # Signature reconstructed from the calls above; the tab delimiter is an
    # assumption. On Python 2, csv requires a byte-string delimiter.
    delimiter = str(delimiter) if six.PY2 else delimiter

    with open(filepath, "w") as f:
        writer = csv.writer(
            f, delimiter=delimiter, quoting=csv.QUOTE_MINIMAL)
        for r in data:
            writer.writerow(r)


# load_data() and stratified_kfold() are helpers from the surrounding project.
cell_ids, features, cell_types, cell_subtypes = load_data()

datasets = stratified_kfold(
    features, cell_subtypes,
    [cell_ids, cell_types, cell_subtypes],
    n_folds=5, convert_labels_to_int=True)
full_dataset = Dataset.concatenate(*datasets)
n_epochs = 200

# model_config is not defined in this excerpt; a hypothetical minimal
# configuration stands in so the snippet reads end to end.
model_config = {"name": "final_vae", "latent_size": 32, "batch_size": 50}
final_vae = VAE(model_config)
final_vae.train(full_dataset,
                epochs=n_epochs, batch_size=model_config["batch_size"])
loss = final_vae.evaluate(full_dataset)
print(loss)

latent_reps = final_vae.encode(full_dataset.features)
results = np.hstack((
    np.expand_dims(full_dataset.sample_data[0], axis=1),
    latent_reps,
    np.expand_dims(full_dataset.sample_data[1], axis=1),
    np.expand_dims(full_dataset.sample_data[2], axis=1)
))
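
To mirror train_final_vae above, the script would presumably finish by prepending a header row and saving the table; the output filename echoes the methods above, and the header width is derived from latent_reps rather than the config:

header = np.array(
    ["cell_ids"] +
    ["dim{}".format(i) for i in range(1, latent_reps.shape[1] + 1)] +
    ["cell_type", "cell_subtype"])
results = np.vstack((header, results))
save_data_table(results, "latent_representations.txt")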