def train_final_vae(self, model_config):
    """Train a final VAE on the concatenation of all folds and save its
    latent representations to disk.

    Mirrors ``train_final_ae``: logs training progress, evaluates the
    trained model on the full dataset, and writes a table of
    ``cell_ids | dim1..dimK | cell_type | cell_subtype`` to
    ``<model_dir>/latent_representations.txt``.

    Args:
        model_config (dict): model configuration; mutated in place —
            ``name`` gets a ``_FULL`` suffix and ``model_dir`` is set.
    """
    model_config["name"] = model_config["name"] + "_FULL"
    model_dir = self.get_model_dir(model_config["name"])
    create_dir(model_dir)
    model_config["model_dir"] = model_dir

    # Short run in debug mode, full-length training otherwise.
    n_epochs = 2 if self.debug else 200

    full_dataset = Dataset.concatenate(*self.datasets)

    # Log progress the same way train_final_ae does (consistency fix:
    # this method previously trained and saved silently).
    self.logger.info("Training Final VAE: " + model_config["name"])
    final_vae = VAE(model_config)
    final_vae.train(full_dataset,
                    epochs=n_epochs,
                    batch_size=50,
                    validation_dataset=full_dataset)

    loss = final_vae.evaluate(full_dataset)
    self.logger.info("{}|Loss = {:f}".format(model_config["name"], loss))

    self.logger.info("Creating latent representations...")
    latent_reps = final_vae.encode(full_dataset.features)
    # Columns: sample_data[0] (ids), latent dims, sample_data[1] and
    # sample_data[2] (type / subtype labels).
    results = np.hstack((
        np.expand_dims(full_dataset.sample_data[0], axis=1),
        latent_reps,
        np.expand_dims(full_dataset.sample_data[1], axis=1),
        np.expand_dims(full_dataset.sample_data[2], axis=1)))

    header = ["cell_ids"]
    header.extend("dim{}".format(d)
                  for d in range(1, model_config["latent_size"] + 1))
    header.append("cell_type")
    header.append("cell_subtype")

    results = np.vstack((np.array(header), results))
    self.logger.info("Saving results")
    save_data_table(
        results,
        model_config["model_dir"] + "/latent_representations.txt")
def train_vae(self, case_config):
    """Train a VAE for one hyperparameter case using 10-fold
    cross-validation and return a hyperopt-style result dict.

    Args:
        case_config (dict): raw case parameters; converted to a full
            model config via ``self.get_model_config``.

    Returns:
        dict: ``status``, average validation ``loss``, model ``name``
        and the resolved ``model_config``.
    """
    model_config = self.get_model_config(case_config)
    create_dir(model_config["model_dir"])

    n_folds = 10
    total_valid_loss = 0.0
    folds_run = 0
    for k in range(n_folds):
        # Fold k is held out for validation; the rest are training data.
        train_dataset = Dataset.concatenate(
            *(self.datasets[:k] + self.datasets[(k + 1):]))
        valid_dataset = self.datasets[k]

        vae = VAE(model_config)
        epochs = 2 if self.debug else 100
        vae.train(train_dataset,
                  epochs=epochs,
                  batch_size=50,
                  validation_dataset=valid_dataset)

        fold_valid_loss = vae.evaluate(valid_dataset)
        self.logger.info("{}|Fold #{} Loss = {:f}".format(
            model_config["name"], k + 1, fold_valid_loss))
        total_valid_loss += fold_valid_loss
        folds_run += 1

        if self.debug:
            break

    # BUG FIX: previously divided by a hard-coded 10 even when debug mode
    # broke out after a single fold, under-reporting the average by 10x.
    avg_valid_loss = total_valid_loss / folds_run
    self.logger.info("{}|Avg Validation Loss = {:f}".format(
        model_config["name"], avg_valid_loss))

    self.case_counter += 1

    return {
        "status": STATUS_OK,
        "loss": avg_valid_loss,
        "name": model_config["name"],
        "model_config": model_config
    }
def train_final_ae(self, model_config):
    """Train a final autoencoder on the concatenation of all folds and
    save its latent representations to disk.

    Writes a table of ``cell_ids | dim1..dimK | cell_type |
    cell_subtype`` to ``<model_dir>/latent_representations.txt``, where
    K is the width of the last encoder layer.

    Args:
        model_config (dict): model configuration; mutated in place —
            ``name`` gets a ``_FULL`` suffix and ``model_dir`` is set.
    """
    model_config["name"] = model_config["name"] + "_FULL"
    model_dir = self.get_model_dir(model_config["name"])
    create_dir(model_dir)
    model_config["model_dir"] = model_dir

    # Short run in debug mode, full-length training otherwise.
    n_epochs = 2 if self.debug else 100

    full_dataset = Dataset.concatenate(*self.datasets)

    self.logger.info("Training Final AE: " + model_config["name"])
    final_ae = AE(model_config)
    final_ae.train(full_dataset,
                   epochs=n_epochs,
                   batch_size=50,
                   validation_dataset=full_dataset)

    loss = final_ae.evaluate(full_dataset)
    self.logger.info("{}|Loss = {:f}".format(model_config["name"], loss))

    # Typo fix in log message: "represenations" -> "representations".
    self.logger.info("Creating latent representations...")
    latent_reps = final_ae.encode(full_dataset.features)
    # Columns: sample_data[0] (ids), latent dims, sample_data[1] and
    # sample_data[2] (type / subtype labels).
    results = np.hstack((
        np.expand_dims(full_dataset.sample_data[0], axis=1),
        latent_reps,
        np.expand_dims(full_dataset.sample_data[1], axis=1),
        np.expand_dims(full_dataset.sample_data[2], axis=1)))

    # Latent width comes from the last encoder layer spec, formatted as
    # "<something>:<size>".
    latent_size = int(model_config["encoder_layers"][-1].split(":")[1])
    header = ["cell_ids"]
    header.extend("dim{}".format(d) for d in range(1, latent_size + 1))
    header.append("cell_type")
    header.append("cell_subtype")

    results = np.vstack((np.array(header), results))
    self.logger.info("Saving results")
    save_data_table(
        results,
        model_config["model_dir"] + "/latent_representations.txt")
def train_final_model(self, model_config, batch_size=None):
    """Train a final model (``self.model_class``) on the concatenation
    of all folds and evaluate it on the same data.

    Args:
        model_config (dict): model configuration; mutated in place —
            ``model_dir`` is set.
        batch_size (int, optional): overrides
            ``model_config["batch_size"]`` when given.

    Returns:
        dict: ``model``, ``train_history``, ``dataset`` and ``metrics``.

    Raises:
        Exception: if no batch size is given and none is present in
            ``model_config``.
    """
    model_dir = self.get_model_dir(model_config["name"])
    create_dir(model_dir)
    model_config["model_dir"] = model_dir

    if batch_size is None:
        if "batch_size" in model_config:
            batch_size = model_config["batch_size"]
        else:
            # BUG FIX: the original message was a backslash-continued
            # string literal, which embedded the source indentation
            # whitespace into the message text.
            raise Exception("No batch size specified for model training")

    full_dataset = Dataset.concatenate(*self.datasets)

    if self.logger is not None:
        self.logger.info("Training Final Model: {}".format(
            model_config["name"]))

    model = self.model_class(model_config)

    # NOTE: debug mode deliberately clobbers self.epochs, matching the
    # behavior of train_case_model.
    if self.debug:
        self.epochs = 2

    train_history = model.train(full_dataset,
                                epochs=self.epochs,
                                batch_size=batch_size)

    metrics = model.evaluate(full_dataset)
    if self.logger is not None:
        for metric_name, metric_value in metrics.items():
            self.logger.info("{}|{} = {:f}".format(
                model_config["name"], metric_name, metric_value))

    return {
        "model": model,
        "train_history": train_history,
        "dataset": full_dataset,
        "metrics": metrics
    }
def train_case_model(self, case_config, batch_size=None, loss_metric="loss"):
    """Train one hyperparameter case with ``self.n_folds``-fold
    cross-validation and return a hyperopt-style result dict.

    Each fold trains a fresh ``self.model_class`` instance; fold metrics
    are averaged. A fold producing NaN/inf metrics aborts the case with
    ``STATUS_FAIL`` and ``None`` metric values.

    Args:
        case_config (dict): raw case parameters.
        batch_size (int, optional): overrides the batch size from
            ``model_config``/``case_config``.
        loss_metric (str): which averaged metric to report as ``loss``.

    Returns:
        dict: ``status``, ``model_config``, ``loss`` and
        ``avg_valid_metrics``.

    Raises:
        Exception: if no batch size can be resolved.
        TypeError: if ``model.evaluate`` does not return a dict.
    """
    model_config = self.get_model_config(case_config)
    create_dir(model_config["model_dir"])
    if self.logger is not None:
        self.logger.info("Training %s..." % model_config["name"])

    # Resolve the batch size once — it is loop-invariant, so there is no
    # reason to re-check it inside the fold loop as before.
    if batch_size is None:
        if "batch_size" in model_config:
            batch_size = model_config["batch_size"]
        elif "batch_size" in case_config:
            batch_size = case_config["batch_size"]
        else:
            # BUG FIX: the original message was a backslash-continued
            # string literal, which embedded the source indentation
            # whitespace into the message text.
            raise Exception("No batch size specified for model training")

    # NOTE: debug mode deliberately clobbers self.epochs, matching the
    # behavior of train_final_model.
    if self.debug:
        self.epochs = 2

    status = STATUS_OK
    avg_valid_metrics = {}
    folds_run = 0
    for k in range(self.n_folds):
        # Fold k is held out for validation; the rest are training data.
        train_dataset = Dataset.concatenate(
            *(self.datasets[:k] + self.datasets[(k + 1):]))
        valid_dataset = self.datasets[k]

        model = self.model_class(model_config)
        model.train(train_dataset,
                    epochs=self.epochs,
                    batch_size=batch_size,
                    validation_dataset=valid_dataset)

        fold_valid_metrics = model.evaluate(valid_dataset)
        if not isinstance(fold_valid_metrics, dict):
            raise TypeError("Evaluate method of model must return a "
                            "dictionary of metric names and values")

        metric_values = list(fold_valid_metrics.values())
        if np.any(np.isnan(metric_values)) or np.any(np.isinf(metric_values)):
            # Divergent fold: null out all metrics and fail the case.
            for key in fold_valid_metrics.keys():
                avg_valid_metrics[key] = None
            status = STATUS_FAIL
            break

        for name, value in fold_valid_metrics.items():
            avg_valid_metrics[name] = avg_valid_metrics.get(name, 0.0) + value
            if self.logger is not None:
                self.logger.info("{}|Fold #{}|{} = {:f}".format(
                    model_config["name"], k + 1, name, value))

        folds_run += 1
        if self.debug:
            break

    if status != STATUS_FAIL:
        # BUG FIX: divide by the number of folds actually run — debug mode
        # breaks after one fold, so dividing by self.n_folds under-reported
        # every averaged metric.
        for name in avg_valid_metrics:
            avg_valid_metrics[name] /= folds_run
            if self.logger is not None:
                self.logger.info("{}|Avg {} = {:f}".format(
                    model_config["name"], name, avg_valid_metrics[name]))

    self.case_counter += 1

    return {
        "status": status,
        "model_config": model_config,
        "loss": avg_valid_metrics[loss_metric],
        "avg_valid_metrics": avg_valid_metrics
    }
# NOTE(review): this chunk begins mid-function — the statements below read as
# the tail of a CSV-writing helper (its `def` line, with `filepath`, `data`
# and `delimiter` parameters, is outside this view), followed by flat script
# code. Indentation is reconstructed; confirm against the full file.
delimiter = str(delimiter) if six.PY2 else delimiter  # csv module on Python 2 requires a byte-string delimiter
with open(filepath, "w") as f:
    writer = csv.writer(
        f, delimiter=delimiter, quoting=csv.QUOTE_MINIMAL)
    for r in data:
        writer.writerow(r)

# --- script-level code: load data, build folds, train a VAE on everything ---
cell_ids, features, cell_types, cell_subtypes = load_data()
# Stratify folds on cell_subtypes; carry ids/types/subtypes along as
# per-sample metadata (becomes dataset.sample_data[0..2] below).
datasets = stratified_kfold(
    features, cell_subtypes, [cell_ids, cell_types, cell_subtypes],
    n_folds=5, convert_labels_to_int=True)
full_dataset = Dataset.concatenate(*datasets)

n_epochs = 200
# NOTE(review): model_config is not defined anywhere in this view —
# presumably assigned earlier in the file; verify before running.
final_vae = VAE(model_config)
final_vae.train(full_dataset,
                epochs=n_epochs,
                batch_size=model_config["batch_size"])
loss = final_vae.evaluate(full_dataset)
print(loss)

latent_reps = final_vae.encode(full_dataset.features)
# Columns: sample_data[0] (ids), latent dims, sample_data[1] and
# sample_data[2] (type / subtype labels) — same layout the trainer
# methods use when saving latent representation tables.
results = np.hstack((
    np.expand_dims(full_dataset.sample_data[0], axis=1),
    latent_reps,
    np.expand_dims(full_dataset.sample_data[1], axis=1),
    np.expand_dims(full_dataset.sample_data[2], axis=1)
))