def main():
    """Train VaDER on the prepared dataset and print the learned clusters."""
    output_path = Path('../output/try2_exactly_7_times')
    output_path.mkdir(exist_ok=True)
    save_path = output_path / 'vader.ckpt'
    # alternative loader kept for reference:
    # w_train, x_train, names = read_premade(DAYS_ORDERED)
    w_train, x_train, names = read_data()
    # z-score the inputs over the whole tensor
    mean = np.mean(x_train)
    std = np.std(x_train)
    x_train = (x_train - mean) / std
    model = VADER(x_train=x_train, w_train=w_train, save_path=save_path,
                  n_hidden=[128, 32], k=5, learning_rate=1e-3,
                  output_activation=None, recurrent=True, batch_size=8,
                  alpha=0.1)
    model.pre_fit(n_epoch=20, verbose=True)   # pre-train without latent loss
    model.fit(n_epoch=100, verbose=True)      # train with latent loss
    clusters = model.cluster(x_train, w_train)       # cluster assignments
    reconstructions = model.predict(x_train)         # re-constructions
    print(model.get_clusters_on_x())
def test_vader_recur(self):
    """Smoke-test recurrent VaDER end-to-end: fit, cluster, predict, loss."""
    X_train, W_train, y_train = generate_x_w_y(7, 400)
    # y_train only monitors performance against a known ground truth;
    # it may be omitted when no ground truth is available.
    # noinspection PyTypeChecker
    model = VADER(X_train=X_train, W_train=W_train, y_train=y_train,
                  save_path=None, n_hidden=[12, 2], k=4, learning_rate=1e-3,
                  output_activation=None, recurrent=True, batch_size=16)
    model.pre_fit(n_epoch=10, verbose=True)  # pre-train without latent loss
    model.fit(n_epoch=10, verbose=True)      # train with latent loss
    labels = model.cluster(X_train)
    assert any(labels)
    assert len(labels) == len(X_train)
    recon = model.predict(X_train)
    assert recon.shape == X_train.shape
    losses = model.get_loss(X_train)
    assert losses
    for key in ("reconstruction_loss", "latent_loss"):
        assert key in losses
        assert losses[key] >= 0
def test1():
    """Train recurrent VaDER on prepared data, persisting the inputs first.

    Fix: the pickle dumps previously used bare ``open()`` calls whose file
    handles were never closed (resource leak); they are now wrapped in
    context managers. Dead commented-out json.dump code was removed.
    """
    save_path = os.path.join('test_vader', 'vader.ckpt')
    x_train, y_train, w_train = prepare_data()
    # persist the training tensors for later inspection / reuse
    for fname, obj in (("x_train.pickle", x_train),
                       ("y_train.pickle", y_train),
                       ("w_train.pickle", w_train)):
        with open(fname, "wb") as fh:
            pickle.dump(obj, fh)
    # Note: y_train is used purely for monitoring performance when a ground
    # truth clustering is available. It can be omitted if no ground truth
    # is available.
    vader = VADER(x_train=x_train, w_train=w_train, y_train=y_train,
                  save_path=save_path, n_hidden=[12, 2], k=4,
                  learning_rate=1e-3, output_activation=None, recurrent=True,
                  batch_size=16)
    vader.pre_fit(n_epoch=50, verbose=True)  # pre-train without latent loss
    vader.fit(n_epoch=50, verbose=True)      # train with latent loss
    c = vader.cluster(x_train)  # cluster assignments
    p = vader.predict(x_train)  # re-constructions
def test_vader_save_load(self):
    """Clustering must be identical before saving and after re-loading."""
    save_path = "test_vader_save_load"
    if os.path.exists(save_path):
        shutil.rmtree(save_path)
    X_train, W_train, y_train = generate_x_w_y(7, 400)
    # noinspection PyTypeChecker
    model = VADER(X_train=X_train, W_train=W_train, y_train=y_train,
                  save_path=save_path, n_hidden=[12, 2], k=4,
                  learning_rate=1e-3, output_activation=None, recurrent=True,
                  batch_size=16)
    model.pre_fit(n_epoch=10, verbose=True)
    model.fit(n_epoch=10, verbose=True)
    labels_before = model.cluster(X_train)
    # round-trip the model through disk and cluster again
    restored = VADER.load_model(save_path, X_train, W_train, y_train)
    labels_after = restored.cluster(X_train)
    if os.path.exists(save_path):
        shutil.rmtree(save_path)
    assert list(labels_before) == list(labels_after)
def _fit_vader(self, X_train: ndarray, W_train: Optional[ndarray]) -> VADER:
    """Build and train a VaDER model from this object's hyper-parameters.

    Reads k, n_hidden, learning_rate, batch_size and alpha from
    ``self.params_dict``, pre-fits briefly without the latent loss, then
    runs the full training loop with early stopping and returns the model.
    """
    # read hyper-parameters up front (same key order as before)
    k = self.params_dict["k"]
    n_hidden = self.params_dict["n_hidden"]
    learning_rate = self.params_dict["learning_rate"]
    batch_size = self.params_dict["batch_size"]
    alpha = self.params_dict["alpha"]
    # noinspection PyTypeChecker
    model = VADER(
        X_train=X_train,
        W_train=W_train,
        save_path=None,
        n_hidden=n_hidden,
        k=k,
        seed=self.seed,
        learning_rate=learning_rate,
        recurrent=True,
        batch_size=batch_size,
        alpha=alpha,
    )
    # short warm-up without the latent loss, then the full training run
    model.pre_fit(n_epoch=10, verbose=False)
    model.fit(
        n_epoch=self.n_epoch,
        verbose=False,
        early_stopping_ratio=self.early_stopping_ratio,
        early_stopping_batch_size=self.early_stopping_batch_size,
    )
    return model
def test_vader_nonrecur(self):
    """Smoke-test non-recurrent VaDER (plain VAE with a GM prior)."""
    NUM_OF_TIME_POINTS = 7
    X_train, y_train = generate_x_y_for_nonrecur(NUM_OF_TIME_POINTS, 400)
    # noinspection PyTypeChecker
    model = VADER(X_train=X_train, y_train=y_train, n_hidden=[12, 2], k=2,
                  learning_rate=1e-3, output_activation=None,
                  recurrent=False, batch_size=16)
    model.pre_fit(n_epoch=10, verbose=True)  # pre-train without latent loss
    model.fit(n_epoch=10, verbose=True)      # train with latent loss
    labels = model.cluster(X_train)
    assert any(labels)
    assert len(labels) == len(X_train)
    recon = model.predict(X_train)
    assert recon.shape == X_train.shape
    losses = model.get_loss(X_train)
    assert losses
    for key in ("reconstruction_loss", "latent_loss"):
        assert key in losses
        assert losses[key] >= 0
    # draw samples from the fitted generative model
    NUM_OF_GENERATED_SAMPLES = 10
    generated = model.generate(NUM_OF_GENERATED_SAMPLES)
    assert generated
    assert "clusters" in generated
    assert "samples" in generated
    assert len(generated["clusters"]) == NUM_OF_GENERATED_SAMPLES
    assert generated["samples"].shape == (NUM_OF_GENERATED_SAMPLES,
                                          NUM_OF_TIME_POINTS)
# Consensus clustering branch: train args.n_consensus independent models
# and collect per-repeat results (the *_repeats lists are presumably
# appended to further down, outside this view).
if args.n_consensus and args.n_consensus > 1:
    loss_history_pdf = matplotlib.backends.backend_pdf.PdfPages(
        loss_history_file_path)
    y_pred_repeats = []
    effective_k_repeats = []
    train_reconstruction_loss_repeats = []
    train_latent_loss_repeats = []
    for j in range(args.n_consensus):
        # NOTE(review): `i` comes from an enclosing loop not visible here.
        seed = f"{args.seed}{i}{j}" if args.seed else None
        # NOTE(review): the per-repeat `seed` computed above is never used;
        # the constructor receives `seed=args.seed` instead. Confirm whether
        # `seed=seed` was intended so the repeats differ deterministically.
        # noinspection PyTypeChecker
        vader = VADER(X_train=input_data, W_train=input_weights, k=args.k,
                      n_hidden=n_hidden, learning_rate=args.learning_rate,
                      batch_size=args.batch_size, alpha=args.alpha,
                      seed=args.seed, save_path=args.save_path,
                      output_activation=None, recurrent=True)
        # brief warm-up without latent loss, then the full training run
        vader.pre_fit(n_epoch=10, verbose=False)
        vader.fit(n_epoch=args.n_epoch, verbose=False,
                  early_stopping_ratio=args.early_stopping_ratio,
                  early_stopping_batch_size=args.early_stopping_batch_size)
        # one loss-history page per repeat, all collected in a single PDF
        fig = plot_loss_history(vader, model_name=f"Model #{j}")
        loss_history_pdf.savefig(fig)
        # noinspection PyTypeChecker
        clustering = vader.cluster(input_data, input_weights)
        # number of clusters that actually received members
        effective_k = len(Counter(clustering))
# Dynamically load the user-supplied data-reader module from a file path
# given on the command line, then instantiate its DataReader class.
data_reader_spec = importlib.util.spec_from_file_location(
    "data_reader", args.data_reader_script)
data_reader_module = importlib.util.module_from_spec(data_reader_spec)
data_reader_spec.loader.exec_module(data_reader_module)
data_reader = data_reader_module.DataReader()
x_tensor = data_reader.read_data(args.input_data_file)
# presumably the weight tensor flags observed vs. missing entries —
# confirm against generate_wtensor_from_xtensor
w_tensor = generate_wtensor_from_xtensor(x_tensor)
input_data = np.nan_to_num(x_tensor)  # NaNs replaced with 0 for training
input_weights = w_tensor
features = data_reader.features
time_points = data_reader.time_points
x_label = data_reader.time_point_meaning
ids_list = data_reader.ids_list
# restore a previously trained model and derive a report suffix from its
# hyper-parameters so output filenames identify the configuration
vader = VADER.load_model(args.load_path, input_data, input_weights)
n_hidden = [str(layer_size) for layer_size in vader.n_hidden]
report_suffix = f"k{str(vader.K)}" \
                f"_n_hidden{'_'.join(n_hidden)}" \
                f"_learning_rate{str(vader.learning_rate)}" \
                f"_batch_size{str(vader.batch_size)}" \
                f"_n_epoch{str(vader.n_epoch)}" \
                f"_seed{str(args.seed)}"
plot_file_path = os.path.join(
    args.output_path, f"z_scores_trajectories_{report_suffix}.pdf")
clustering_file_path = os.path.join(args.output_path,
                                    f"clustering_{report_suffix}.csv")
clustering = vader.cluster(input_data, input_weights)
# one row per subject id: its assigned cluster index
pd.Series(list(clustering), index=ids_list, dtype=np.int64,
          name='Cluster').to_csv(clustering_file_path)
# Randomly set 50% of values to missing (0: missing, 1: present) # Note: All X_train[i,j] for which W_train[i,j] == 0 are treated as missing (i.e. their specific value is ignored) W_train = np.random.choice(2, X_train.shape) import pickle pickle.dump(X_train, open("X_train.pickle", "wb")) pickle.dump(y_train, open("y_train.pickle", "wb")) pickle.dump(W_train, open("W_train.pickle", "wb")) # Note: y_train is used purely for monitoring performance when a ground truth clustering is available. # It can be omitted if no ground truth is available. vader = VADER(X_train=X_train, W_train=W_train, y_train=y_train, save_path=save_path, n_hidden=[12, 2], k=4, learning_rate=1e-3, output_activation=None, recurrent=True, batch_size=16) # pre-train without latent loss vader.pre_fit(n_epoch=50, verbose=True) # train with latent loss vader.fit(n_epoch=50, verbose=True) # get the clusters c = vader.cluster(X_train) # get the re-constructions p = vader.predict(X_train) # compute the loss given the network l = vader.get_loss(X_train)
def test2():
    """Train non-recurrent VaDER, then cluster, reconstruct, score and sample.

    Fixes: the loss was computed twice via two identical ``get_loss`` calls
    (the duplicate is removed), and the ambiguous single-letter locals were
    renamed for readability.
    """
    x_train, y_train = get_dete_for_seconed_test()
    vader = VADER(x_train=x_train, y_train=y_train, n_hidden=[12, 2], k=2,
                  learning_rate=1e-3, output_activation=None,
                  recurrent=False, batch_size=16)
    # pre-train without latent loss
    vader.pre_fit(n_epoch=50, verbose=True)
    # train with latent loss
    vader.fit(n_epoch=50, verbose=True)
    # get the clusters
    clusters = vader.cluster(x_train)
    # get the re-constructions
    reconstructions = vader.predict(x_train)
    # compute the loss given the network
    loss = vader.get_loss(x_train)
    # generate some samples
    generated = vader.generate(10)
def test_vader_transfer_learning(self):
    """Fit on one dataset, then continue training on a second dataset."""
    X_train, W_train, y_train = generate_x_w_y(7, 400)
    # noinspection PyTypeChecker
    model = VADER(X_train=X_train, W_train=W_train, y_train=y_train,
                  save_path=None, n_hidden=[12, 2], k=4, learning_rate=1e-3,
                  output_activation=None, recurrent=True, batch_size=16)
    model.pre_fit(n_epoch=10, verbose=True)  # pre-train without latent loss
    model.fit(n_epoch=10, verbose=True)      # train with latent loss
    # swap in fresh data and fine-tune the already-trained network
    X_train_ft, W_train_ft, y_train_ft = generate_x_w_y(7, 400)
    model.set_inputs(X_train_ft, W_train_ft, y_train_ft)
    model.pre_fit(n_epoch=10, verbose=True)
    model.fit(n_epoch=10, verbose=True)
    labels = model.cluster(X_train_ft)
    assert any(labels)
    assert len(labels) == len(X_train_ft)
    recon = model.predict(X_train_ft)
    assert recon.shape == X_train_ft.shape
    losses = model.get_loss(X_train_ft)
    assert losses
    for key in ("reconstruction_loss", "latent_loss"):
        assert key in losses
        assert losses[key] >= 0
def test_vader_save_load_transfer_learning(self):
    """Train, save weights, load them into a fresh model, and fine-tune.

    Fixes: the checkpoint path was assembled as ``f"{save_folder}//weights"``
    with a doubled, hard-coded separator; it is now built portably with
    ``os.path.join``. The unused ``clustering_before_loading`` binding was
    dropped (the cluster call itself is kept to exercise the trained model).
    """
    save_folder = "test_vader_save_load_transfer_learning"
    save_path = os.path.join(save_folder, "weights")
    if os.path.exists(save_folder):
        shutil.rmtree(save_folder)
    X_train, W_train, y_train = generate_x_w_y(7, 400)
    # noinspection PyTypeChecker
    vader = VADER(X_train=X_train, W_train=W_train, y_train=y_train,
                  save_path=save_path, n_hidden=[12, 2], k=4,
                  learning_rate=1e-3, output_activation=None, recurrent=True,
                  batch_size=16)
    vader.pre_fit(n_epoch=10, verbose=True)
    vader.fit(n_epoch=10, verbose=True)
    vader.cluster(X_train)  # exercise clustering on the source model
    # build a fresh model on new data and transfer the saved weights into it
    X_train_ft, W_train_ft, y_train_ft = generate_x_w_y(7, 400)
    vader = VADER(X_train=X_train_ft, W_train=W_train_ft,
                  y_train=y_train_ft, save_path=None, n_hidden=[12, 2], k=4,
                  learning_rate=1e-3, output_activation=None, recurrent=True,
                  batch_size=16)
    vader.load_weights(save_path)
    vader.pre_fit(n_epoch=10, verbose=True)
    vader.fit(n_epoch=10, verbose=True)
    # get the clusters
    clustering = vader.cluster(X_train_ft)
    if os.path.exists(save_folder):
        shutil.rmtree(save_folder)
    assert any(clustering)
    assert len(clustering) == len(X_train_ft)
    # get the re-constructions
    prediction = vader.predict(X_train_ft)
    assert prediction.shape == X_train_ft.shape
    # compute the loss given the network
    loss = vader.get_loss(X_train_ft)
    assert loss
    assert "reconstruction_loss" in loss
    assert "latent_loss" in loss
    assert loss["reconstruction_loss"] >= 0
    assert loss["latent_loss"] >= 0