Example #1
    def train_final_vae(self, model_config):
        model_config["name"] = model_config["name"] + "_FULL"
        model_dir = self.get_model_dir(model_config["name"])
        create_dir(model_dir)
        model_config["model_dir"] = model_dir

        n_epochs = 2 if self.debug else 200
        full_dataset = Dataset.concatenate(*self.datasets)
        final_vae = VAE(model_config)
        final_vae.train(full_dataset,
                        epochs=n_epochs,
                        batch_size=50,
                        validation_dataset=full_dataset)
        latent_reps = final_vae.encode(full_dataset.features)
        results = np.hstack((
            np.expand_dims(full_dataset.sample_data[0], axis=1),
            latent_reps,
            np.expand_dims(full_dataset.sample_data[1], axis=1),
            np.expand_dims(full_dataset.sample_data[2], axis=1)
        ))

        header = ["cell_ids"]
        for l in range(1, model_config["latent_size"] + 1):
            header.append("dim{}".format(l))
        header.append("cell_type")
        header.append("cell_subtype")
        header = np.array(header)

        results = np.vstack((header, results))

        save_data_table(
            results, model_config["model_dir"] + "/latent_representations.txt")
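
A minimal sketch of how this helper might be invoked. The config keys shown
("name", "latent_size", "encoder_layers", "optimizer") come from the other
examples here; the "experiment" object is a hypothetical holder of the
datasets and directory helpers:

model_config = {
    "name": "PBMC_VAE",                # hypothetical experiment name
    "latent_size": 16,                 # drives the dim1..dim16 header columns
    "encoder_layers": ["Dense:256:activation='elu'"],
    "optimizer": "adam",
}
experiment.train_final_vae(model_config)
# writes <model_dir>/latent_representations.txt with columns:
#   cell_ids, dim1..dim16, cell_type, cell_subtype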
Example #2
    def train_vae(self, case_config):
        model_config = self.get_model_config(case_config)
        create_dir(model_config["model_dir"])

        avg_valid_loss = 0.0
        for k in range(0, 10):
            train_dataset = Dataset.concatenate(*(self.datasets[:k] +
                                                  self.datasets[(k + 1):]))
            valid_dataset = self.datasets[k]
            # Start training!
            vae = VAE(model_config)

            if self.debug:
                epochs = 2
            else:
                epochs = 100

            vae.train(train_dataset,
                      epochs=epochs,
                      batch_size=50,
                      validation_dataset=valid_dataset)

            fold_valid_loss = vae.evaluate(valid_dataset)
            self.logger.info("{}|Fold #{} Loss = {:f}".format(
                model_config["name"], k + 1, fold_valid_loss))

            avg_valid_loss += fold_valid_loss

            if self.debug:
                break

        avg_valid_loss /= 10
        self.logger.info("{}|Avg Validation Loss = {:f}".format(
            model_config["name"], avg_valid_loss))

        self.case_counter += 1

        return {
            "status": STATUS_OK,
            "loss": avg_valid_loss,
            "name": model_config["name"],
            "model_config": model_config
        }
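
The slicing idiom above implements leave-one-fold-out selection. A tiny
self-contained illustration of just that pattern, on plain lists:

# Sketch: fold k is held out, all other folds are concatenated for training.
datasets = ["fold0", "fold1", "fold2", "fold3", "fold4"]
for k in range(len(datasets)):
    train_folds = datasets[:k] + datasets[k + 1:]  # every fold except k
    valid_fold = datasets[k]                       # held-out fold
    print(k, train_folds, valid_fold)
# k=0 trains on folds 1-4 and validates on fold 0, and so on.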
Example #3
    def train_final_ae(self, model_config):
        model_config["name"] = model_config["name"] + "_FULL"
        model_dir = self.get_model_dir(model_config["name"])
        create_dir(model_dir)
        model_config["model_dir"] = model_dir

        n_epochs = 2 if self.debug else 100
        full_dataset = Dataset.concatenate(*self.datasets)

        self.logger.info("Training Final AE: " + model_config["name"])
        final_ae = AE(model_config)
        final_ae.train(full_dataset,
                       epochs=n_epochs,
                       batch_size=50,
                       validation_dataset=full_dataset)
        loss = final_ae.evaluate(full_dataset)
        self.logger.info("{}|Loss = {:f}".format(model_config["name"], loss))

        self.logger.info("Creating latent represenations...")
        latent_reps = final_ae.encode(full_dataset.features)

        results = np.hstack((
            np.expand_dims(full_dataset.sample_data[0], axis=1),
            latent_reps,
            np.expand_dims(full_dataset.sample_data[1], axis=1),
            np.expand_dims(full_dataset.sample_data[2], axis=1)
        ))

        header = ["cell_ids"]
        latent_size = int(model_config["encoder_layers"][-1].split(":")[1])
        for l in range(1, latent_size + 1):
            header.append("dim{}".format(l))
        header.append("cell_type")
        header.append("cell_subtype")
        header = np.array(header)

        results = np.vstack((header, results))

        self.logger.info("Saving results")
        save_data_table(
            results, model_config["model_dir"] + "/latent_representations.txt")
Example #4
    def train_final_model(self, model_config, batch_size=None):
        model_dir = self.get_model_dir(model_config["name"])
        create_dir(model_dir)
        model_config["model_dir"] = model_dir

        if batch_size is None:
            if "batch_size" in model_config:
                batch_size = model_config["batch_size"]
            else:
                raise Exception("No batch size specified \
                                for model training")

        full_dataset = Dataset.concatenate(*self.datasets)

        if self.logger is not None:
            self.logger.info("Training Final Model: {}".format(
                model_config["name"]))

        model = self.model_class(model_config)
        if self.debug:
            self.epochs = 2
        train_history = model.train(full_dataset,
                                    epochs=self.epochs,
                                    batch_size=batch_size)

        metrics = model.evaluate(full_dataset)
        if self.logger is not None:
            for k, v in metrics.items():
                self.logger.info("{}|{} = {:f}".format(model_config["name"], k,
                                                       v))

        return {
            "model": model,
            "train_history": train_history,
            "dataset": full_dataset,
            "metrics": metrics
        }
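
For reference, a hedged usage sketch; "runner" stands in for whatever object
carries these methods, and the keys are exactly those of the returned dict:

result = runner.train_final_model(model_config, batch_size=50)
result["model"]          # the trained model instance
result["train_history"]  # whatever model.train() returned
result["metrics"]        # metric name -> value from model.evaluate()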
Example #5
    def run(self):
        (x_train, y_train), (x_test, y_test) = mnist.load_data()
        x_train = (x_train.astype("float32") - 127.5) / 127.5
        x_train = np.expand_dims(x_train, axis=3)

        x_test = (x_test.astype("float32") - 127.5) / 127.5
        x_test = np.expand_dims(x_test, axis=3)

        train_dataset = Dataset(x_train,
                                y_train,
                                flatten=False,
                                to_one_hot=False)
        # test_dataset = Dataset(x_test, y_test,
        #                        flatten=False, to_one_hot=False)

        model_name = "MNIST_GAN"
        model_dir = self.get_model_dir(model_name)

        create_dir(model_dir)

        model_config = {
            "name": model_name,
            "model_dir": model_dir,
            "input_shape": (28, 28, 1),
            "generator_layers": [
                "Dense:1024:activation='tanh'", "Dense:128*7*7",
                "BatchNormalization", "Activation:'tanh'",
                "Reshape:(7, 7, -1)", "UpSampling2D:size=(2,2)",
                "Conv2D:64:5:padding='same':activation='tanh'",
                "UpSampling2D:size=(2,2)", "Conv2D:1:5:padding='same'",
                "Activation:'tanh'"
            ],
            "discriminator_layers": [
                "Conv2D:64:5:padding='same':activation='tanh'",
                "MaxPooling2D:pool_size=(2,2)",
                "Conv2D:128:5:padding='same':activation='tanh'",
                "MaxPooling2D:pool_size=(2,2)", "Flatten",
                "Dense:1024:activation='tanh'", "Dense:1:activation='sigmoid'"
            ],
            "prior_size": 64,
            "discriminator_loss": "binary_crossentropy",
            "gan_optimizer": "adam:lr=1e-4",
            "discriminator_optimizer": "adam:lr=1e-3"
        }

        if self.debug:
            iterations = 3
        else:
            iterations = 5000

        gan = GAN(model_config)
        g_loss, d_loss_real, d_loss_gen = gan.train(train_dataset,
                                                    iterations,
                                                    batch_size=64)
        print("Generator Loss: ", g_loss)
        print("Discriminator Loss (Real): ", d_loss_real)
        print("Discriminator Loss (Generated): ", d_loss_gen)
        print("Finished training GAN")
Example #6
    def train_case_model(self,
                         case_config,
                         batch_size=None,
                         loss_metric="loss"):
        model_config = self.get_model_config(case_config)
        create_dir(model_config["model_dir"])

        if self.logger is not None:
            self.logger.info("Training %s..." % model_config["name"])

        status = STATUS_OK
        avg_valid_metrics = {}
        for k in range(0, self.n_folds):
            train_dataset = Dataset.concatenate(*(self.datasets[:k] +
                                                  self.datasets[(k + 1):]))
            valid_dataset = self.datasets[k]

            model = self.model_class(model_config)

            if batch_size is None:
                if "batch_size" in model_config:
                    batch_size = model_config["batch_size"]
                elif "batch_size" in case_config:
                    batch_size = case_config["batch_size"]
                else:
                    raise Exception("No batch size specified \
                                    for model training")

            if self.debug:
                self.epochs = 2
            model.train(train_dataset,
                        epochs=self.epochs,
                        batch_size=batch_size,
                        validation_dataset=valid_dataset)

            fold_valid_metrics = model.evaluate(valid_dataset)
            if not isinstance(fold_valid_metrics, dict):
                raise TypeError("Evaluate method of model must return a "
                                "dictionary of metric names and values")

            if np.any(np.isnan(list(fold_valid_metrics.values()))) or \
                    np.any(np.isinf(list(fold_valid_metrics.values()))):
                for key in fold_valid_metrics.keys():
                    avg_valid_metrics[key] = None
                status = STATUS_FAIL
                break
            else:
                for name, value in fold_valid_metrics.items():
                    if name in avg_valid_metrics:
                        avg_valid_metrics[name] += value
                    else:
                        avg_valid_metrics[name] = value

                    if self.logger is not None:
                        self.logger.info("{}|Fold #{}|{} = {:f}".format(
                            model_config["name"], k + 1, name, value))

            if self.debug:
                break

        if status != STATUS_FAIL:
            for name, metric in avg_valid_metrics.items():
                metric /= self.n_folds
                avg_valid_metrics[name] = metric
                if self.logger is not None:
                    self.logger.info("{}|Avg {} = {:f}".format(
                        model_config["name"], name, metric))

        self.case_counter += 1

        return {
            "status": status,
            "model_config": model_config,
            "loss": avg_valid_metrics[loss_metric],
            "avg_valid_metrics": avg_valid_metrics
        }
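
The accumulate-then-divide logic above, isolated into a runnable sketch with
made-up fold metrics:

import numpy as np

fold_metrics = [{"loss": 0.52, "acc": 0.91}, {"loss": 0.48, "acc": 0.93}]
avg = {}
for fold in fold_metrics:
    # mirror the NaN/inf guard: abort averaging on non-finite values
    if not np.all(np.isfinite(list(fold.values()))):
        raise RuntimeError("non-finite metric, failing this case")
    for name, value in fold.items():
        avg[name] = avg.get(name, 0.0) + value
avg = {name: value / len(fold_metrics) for name, value in avg.items()}
print(avg)  # averages out to roughly {'loss': 0.5, 'acc': 0.92}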
Example #7
    def run(self):
        (x_train, y_train), (x_test, y_test) = mnist.load_data()
        x_train = (x_train.astype("float32") - 127.5) / 255
        x_test = (x_test.astype("float32") - 127.5) / 255

        train_dataset = Dataset(x_train, y_train,
                                flatten=True, to_one_hot=False)
        test_dataset = Dataset(x_test, y_test,
                               flatten=True, to_one_hot=False)

        model_name = "MNIST_AAE"
        model_type = UnsupervisedClusteringAdversarialAutoencoder
        model_dir = self.get_model_dir(model_name)
        create_dir(model_dir)

        aae_model_config = {
            "name": model_name,
            "model_dir": model_dir,

            "input_shape": (784,),
            "encoder_layers": [
                "Dense:256:activation='relu'",
            ],
            "z_latent_distribution": "gaussian:2",
            "z_prior_distribution": "gaussian:2:mean=0.0:stddev=5.0",
            "output_distribution": "mean_gaussian:784",
            "z_discriminator_layers": [
                "Dense:50:activation='relu'",
                "Dense:25:activation='relu'"
            ],

            "autoencoder_optimizer": "adam:lr=0.001",
            "z_discriminator_optimizer": "adam:lr=0.001",

            "autoencoder_callbacks": {
                "file_logger": {"file": "autoencoder_model.training.log"}
            },
            "z_discriminator_callbacks": {
                "file_logger": {"file": "discriminator_model.training.log"}
            }
        }

        kadurin_aae_model_config = {
            "name": model_name,
            "model_dir": model_dir,

            "input_shape": (784,),
            "encoder_layers": [
                "Dense:256:activation='relu'",
            ],
            "z_latent_distribution": "gaussian:2",
            "z_prior_distribution": "gaussian:2:mean=0.0:stddev=5.0",
            "output_distribution": "mean_gaussian:784",
            "z_discriminator_layers": [
                "Dense:50:activation='relu'",
                "Dense:25:activation='relu'"
            ],
            "discriminative_power": 0.6,

            "autoencoder_optimizer": "adam:lr=0.0001",
            "z_discriminator_optimizer": "adam:lr=0.0001",

            "z_combined_callbacks": {
                "file_logger": {"file": "combined_model.training.log"}
            },
            "z_discriminator_callbacks": {
                "file_logger": {"file": "discriminator_model.training.log"}
            }
        }

        unsupervised_clustering_aae_config = {
            "name": model_name,
            "model_dir": model_dir,

            "input_shape": (784,),
            "encoder_layers": [
                "Dense:256:activation='relu'"
            ],

            "z_latent_distribution": "gaussian:2",
            "z_prior_distribution": "gaussian:2:mean=0.0:stddev=5.0",
            "n_clusters": 10,
            "output_distribution": "mean_gaussian:784",

            "z_discriminator_layers": [
                "Dense:50:activation='relu'",
                "Dense:25:activation='relu'"
            ],

            "y_discriminator_layers": [
                "Dense:50:activation='relu'",
                "Dense:25:activation='relu'"
            ],

            "autoencoder_optimizer": "adam:lr=0.0001",
            "z_discriminator_optimizer": "adam:lr=0.0001",
            "y_discriminator_optimizer": "adam:lr=0.0001",

            "autoencoder_callbacks": {
                "file_logger": {"file": "autoencoder_model.training.log"}
            }
            # "z_discriminator_callbacks": {},
            # "y_discriminator_callbacks": {},
            # "z_adversarial_callbacks": {},
            # "y_adversarial_callbacks": {}
        }

        if self.debug:
            epochs = 5
        else:
            epochs = 50

        # Of the three configs above, only the unsupervised clustering one
        # is used; aae_model_config and kadurin_aae_model_config are unused.
        aae = model_type(unsupervised_clustering_aae_config)
        aae.train(train_dataset, epochs=epochs, batch_size=100,
                  validation_dataset=test_dataset, verbose=2)

        latent_space = aae.encode(test_dataset.features)
        style, clusters = latent_space[0], latent_space[1]
        # overwrite the encoder's cluster output with explicit assignments
        clusters = aae.cluster(test_dataset.features)

        # results = np.hstack((
        #     latent_space,
        #     np.expand_dims(test_dataset.labels, axis=1)
        # ))

        # header = []
        # for l in range(1, 3):
        #     header.append("dim{}".format(l))
        # header.append("digit")
        # header = np.array(header)
        #
        # results = np.vstack((header, results))
        #
        # self.logger.info("Saving results")
        # save_data_table(
        #     results,
        #     model_config["model_dir"] + "/latent_representations.txt")

        print("ARI: ", adjusted_rand_score(test_dataset.labels, clusters))
        print("Clusters:", np.unique(clusters, return_counts=True))

        plt.figure(figsize=(6, 6))
        plt.scatter(style[:, 0], style[:, 1],
                    c=y_test, cmap="rainbow")
        plt.colorbar()
        plt.show()
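
adjusted_rand_score compares two labelings up to a permutation of cluster
ids, so a perfect but relabelled clustering still scores 1.0:

from sklearn.metrics import adjusted_rand_score

labels = [0, 0, 1, 1, 2, 2]
clusters = [1, 1, 2, 2, 0, 0]   # same grouping, different ids
print(adjusted_rand_score(labels, clusters))  # 1.0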
Example #8
import csv
import six


def save_data_table(data, filepath, delimiter="\t"):
    # The def line and imports were cut off in this snippet; the signature
    # is reconstructed from the call sites in the examples above, and the
    # tab delimiter default is an assumption.
    delimiter = str(delimiter) if six.PY2 else delimiter

    with open(filepath, "w") as f:
        writer = csv.writer(
            f, delimiter=delimiter, quoting=csv.QUOTE_MINIMAL)
        for r in data:
            writer.writerow(r)


cell_ids, features, cell_types, cell_subtypes = load_data()

datasets = stratified_kfold(
    features, cell_subtypes,
    [cell_ids, cell_types, cell_subtypes],
    n_folds=5, convert_labels_to_int=True)
full_dataset = Dataset.concatenate(*datasets)
n_epochs = 200

# model_config is assumed to be defined earlier in the original script; a
# minimal hypothetical stand-in, using keys seen in the other examples:
model_config = {
    "name": "FINAL_VAE",
    "input_shape": (features.shape[1],),
    "encoder_layers": ["Dense:256:activation='elu'"],
    "latent_size": 16,
    "optimizer": "adam",
    "batch_size": 50,
}

final_vae = VAE(model_config)
final_vae.train(full_dataset,
                epochs=n_epochs, batch_size=model_config["batch_size"])
loss = final_vae.evaluate(full_dataset)
print(loss)

latent_reps = final_vae.encode(full_dataset.features)
results = np.hstack((
    np.expand_dims(full_dataset.sample_data[0], axis=1),
    latent_reps,
    np.expand_dims(full_dataset.sample_data[1], axis=1),
    np.expand_dims(full_dataset.sample_data[2], axis=1)
))
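
Why the expand_dims calls are needed: np.hstack requires arrays of matching
rank, so the 1-D id and label columns are reshaped to (N, 1) first. A small
stand-alone demonstration with fabricated values:

import numpy as np

ids = np.array(["c1", "c2"])
latent = np.array([[0.1, 0.2], [0.3, 0.4]])
table = np.hstack((np.expand_dims(ids, axis=1), latent.astype(str)))
print(table)
# [['c1' '0.1' '0.2']
#  ['c2' '0.3' '0.4']]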
Example #9
    def run(self):
        (x_train, y_train), (x_test, y_test) = mnist.load_data()
        x_train = (x_train.astype("float32") - 127.5) / 255
        x_test = (x_test.astype("float32") - 127.5) / 255

        train_dataset = Dataset(x_train,
                                y_train,
                                flatten=True,
                                to_one_hot=False)
        test_dataset = Dataset(x_test, y_test, flatten=True, to_one_hot=False)

        model_name = "MNIST_VAE"
        model_dir = self.get_model_dir(model_name)

        create_dir(model_dir)

        model_config = {
            "name": model_name,
            "model_dir": model_dir,
            "input_shape": (784, ),
            "continuous": True,
            "encoder_layers":
            ["Dense:256:activation='elu'", "BatchNormalization"],
            "latent_size": 2,
            "optimizer": "adam"
        }

        if self.debug:
            epochs = 3
        else:
            epochs = 50

        vae = VAE(model_config)
        vae.train(train_dataset,
                  epochs=epochs,
                  batch_size=100,
                  validation_dataset=test_dataset)

        latent_reps = vae.encode(test_dataset.features)

        results = np.hstack(
            (latent_reps, np.expand_dims(test_dataset.labels, axis=1)))

        header = []
        for l in range(1, model_config["latent_size"] + 1):
            header.append("dim{}".format(l))
        header.append("digit")
        header = np.array(header)

        results = np.vstack((header, results))

        self.logger.info("Saving results")
        save_data_table(
            results, model_config["model_dir"] + "/latent_representations.txt")

        plt.figure(figsize=(6, 6))
        plt.scatter(latent_reps[:, 0],
                    latent_reps[:, 1],
                    c=y_test,
                    cmap="rainbow")
        plt.colorbar()
        plt.show()