def run_experiment(_run):
    assert _run.info.get("tensorflow", None) is None
    with tf.Session() as s:
        with LogFileWriter(ex):
            swr = tf.summary.FileWriter(logdir=TEST_LOG_DIR, graph=s.graph)
            assert swr is not None
            assert _run.info["tensorflow"]["logdirs"] == [TEST_LOG_DIR]
            tf.summary.FileWriter(TEST_LOG_DIR2, s.graph)
            assert _run.info["tensorflow"]["logdirs"] == [TEST_LOG_DIR, TEST_LOG_DIR2]
def run_experiment(_run):
    assert _run.info.get("tensorflow", None) is None
    with tf.Session() as s:
        # Capturing the log directory should be done only in scope of the context manager
        try:
            with LogFileWriter(ex):
                swr = tf.summary.FileWriter(logdir=TEST_LOG_DIR, graph=s.graph)
                assert swr is not None
                assert _run.info["tensorflow"]["logdirs"] == [TEST_LOG_DIR]
                raise ValueError("I want to be raised!")
        except ValueError:
            pass
        # This should not be captured:
        tf.summary.FileWriter("/tmp/whatever", s.graph)
        assert _run.info["tensorflow"]["logdirs"] == [TEST_LOG_DIR]
def train(ds, arch, ms, cuda, log_dir, seed, save_model, save_best_only,
          early_stop, unsupervized, _run):
    # fix seed
    print("seed: ", seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)

    if ds['fold'] is None:
        ds['fold'] = 0  # pick first and only column in csv files
        log_dir = os.path.join(log_dir, str(_run._id), _run.experiment_info['name'])
    else:
        import pathlib
        log_dir = os.path.join(log_dir, 'fold' + str(ds['fold']))
        pathlib.Path(log_dir).mkdir(parents=True, exist_ok=True)
        cfg_path = os.path.abspath(os.path.join(log_dir, os.pardir, 'config.json'))
        with open(cfg_path, 'w') as outfile:
            json.dump(_run.config, outfile)
        print("log_dir JSON: ", log_dir)

    if 'expert_models' in ms.keys():
        ms['expert_models'] = [
            os.path.join(x, f"fold{ds['fold']}", "checkpoint.pth.tar")
            for x in ms['expert_models']
        ]

    if _run._id is not None:
        # Capture TensorBoard logs with sacred
        with LogFileWriter(ex):
            logger = Logger(log_dir, _run)
    else:
        logger = Logger(log_dir, None)

    # Fit the model
    clf = Base(logger=logger, cuda=cuda, verbose=True)
    clf.fit(arch, ms, **ds, early_stop=early_stop, unsupervized=unsupervized)
    _run.info['modelstr'] = str(clf.model)
    if save_model:
        clf.save_checkpoint_(save_best_only=save_best_only)

    return clf.best_acc_
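# A minimal, self-contained sketch (not part of the original code; the helper name,
# base directory and dummy config are hypothetical) of the per-fold layout that
# train() assumes above: one sub-directory per fold, with a shared config.json
# written one level above the fold directories.
import json
import os
import pathlib


def make_fold_logdir(base_dir, fold, config):
    # Create <base_dir>/fold<fold>/ and dump the run config next to the fold dirs.
    log_dir = os.path.join(base_dir, 'fold' + str(fold))
    pathlib.Path(log_dir).mkdir(parents=True, exist_ok=True)
    cfg_path = os.path.abspath(os.path.join(log_dir, os.pardir, 'config.json'))
    with open(cfg_path, 'w') as outfile:
        json.dump(config, outfile)
    return log_dir


# Example: creates /tmp/example_logs/fold0/ and /tmp/example_logs/config.json
print(make_fold_logdir('/tmp/example_logs', 0, {'lr': 1e-3}))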
def run_experiment(_run):
    assert _run.info.get("tensorflow", None) is None
    with tf.Session() as s:
        # Without using the LogFileWriter context manager, nothing should change
        swr = tf.summary.FileWriter(logdir=TEST_LOG_DIR, graph=s.graph)
        assert swr is not None
        assert _run.info.get("tensorflow", None) is None
        # Capturing the log directory should be done only in scope of the context manager
        with LogFileWriter(ex):
            swr = tf.summary.FileWriter(logdir=TEST_LOG_DIR, graph=s.graph)
            assert swr is not None
            assert _run.info["tensorflow"]["logdirs"] == [TEST_LOG_DIR]
            tf.summary.FileWriter(TEST_LOG_DIR2, s.graph)
            assert _run.info["tensorflow"]["logdirs"] == [TEST_LOG_DIR, TEST_LOG_DIR2]
        # This should not be captured:
        tf.summary.FileWriter("/tmp/whatever", s.graph)
        assert _run.info["tensorflow"]["logdirs"] == [TEST_LOG_DIR, TEST_LOG_DIR2]
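# A minimal sketch (assumptions: TF 1.x and sacred are installed; the temp paths
# and the function name run_logdir_capture are illustrative, not taken from the
# original tests) of how a function like the ones above can be attached to a
# Sacred Experiment so that LogFileWriter records FileWriter log directories
# into _run.info["tensorflow"]["logdirs"].
import tensorflow as tf
from sacred import Experiment
from sacred.stflow import LogFileWriter

TEST_LOG_DIR = "/tmp/sacred_tf_test_log"

ex = Experiment("tensorflow_logdir_capture")


@ex.main
def run_logdir_capture(_run):
    assert _run.info.get("tensorflow", None) is None
    with tf.Session() as s:
        with LogFileWriter(ex):
            # FileWriters created inside the context are recorded in _run.info
            tf.summary.FileWriter(logdir=TEST_LOG_DIR, graph=s.graph)
        assert _run.info["tensorflow"]["logdirs"] == [TEST_LOG_DIR]


if __name__ == "__main__":
    ex.run()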
def train_model(model, data_train, data_val, generator, lr_val, num_epochs, batch_size,
                logdir, ex_name, val_epochs, modelpath, learning_rate, epochs_pretrain,
                latent_dim, num_clusters):
    """Trains the VarPSOM model.

    Args:
        model (VarPSOM): VarPSOM model to train.
        data_train (np.array): Training set.
        data_val (np.array): Validation/test set.
        generator (generator): Data generator for the batches.
        lr_val (tf.Tensor): Placeholder for the learning rate value.
        num_epochs (int): Number of training epochs.
        batch_size (int): Batch size for the training.
        logdir (path): Directory for the experiment logs.
        ex_name (string): Unique name of this particular run.
        val_epochs (bool): If "True" clustering results are saved every 10 epochs on default output files.
        modelpath (path): Path for the model checkpoints.
        learning_rate (float): Learning rate for the optimization.
        epochs_pretrain (int): Number of VAE pretraining epochs.
        latent_dim (int): Dimensionality of the VarPSOM's latent space.
        num_clusters (int): Number of clusters.
    """
    epochs = 0
    iterations = 0
    train_gen = generator("train", batch_size)
    val_gen = generator("val", batch_size)
    len_data_train = len(data_train)
    len_data_val = len(data_val)
    num_batches = len_data_train // batch_size
    saver = tf.train.Saver(max_to_keep=1)
    summaries = tf.summary.merge_all()

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        test_losses = []
        test_losses_mean = []
        with LogFileWriter(ex):
            train_writer = tf.summary.FileWriter(logdir + "/train", sess.graph)
            test_writer = tf.summary.FileWriter(logdir + "/test", sess.graph)
        train_step_SOMVAE, train_step_ae = model.optimize
        x = model.inputs
        p = model.p
        is_training = model.is_training
        graph = tf.get_default_graph()
        z = graph.get_tensor_by_name("reconstruction_e/decoder/z_e:0")
        print("\n**********Starting job {}********* \n".format(ex_name))
        pbar = tqdm(total=(num_epochs + epochs_pretrain) * (num_batches))

        print("\n\nAutoencoder Pretraining...\n")
        a = np.zeros((batch_size, num_clusters))
        dp = {p: a, is_training: True, z: np.zeros((batch_size, latent_dim))}
        for epoch in range(epochs_pretrain):
            for i in range(num_batches):
                batch_data, _, _ = next(train_gen)
                f_dic = {x: batch_data, lr_val: learning_rate}
                f_dic.update(dp)
                train_step_ae.run(feed_dict=f_dic)
                if i % 100 == 0:
                    batch_val, _, _ = next(val_gen)
                    f_dic = {x: batch_val}
                    f_dic.update(dp)
                    test_loss, summary = sess.run([model.loss_reconstruction_ze, summaries],
                                                  feed_dict=f_dic)
                    test_writer.add_summary(summary, tf.train.global_step(sess, model.global_step))
                    f_dic = {x: batch_data}
                    f_dic.update(dp)
                    train_loss, summary = sess.run([model.loss_reconstruction_ze, summaries],
                                                   feed_dict=f_dic)
                    train_writer.add_summary(summary, tf.train.global_step(sess, model.global_step))
                pbar.set_postfix(epoch=epoch, train_loss=train_loss, test_loss=test_loss,
                                 refresh=False)
                pbar.update(1)

        print("\nClusters initialization...\n")
        z_e = []
        for t in range(9):
            z_e.extend(
                sess.run(model.sample_z_e,
                         feed_dict={
                             x: data_train[int(len(data_train) / 10) * t:
                                           int(len(data_train) / 10) * (t + 1)],
                             is_training: True,
                             z: np.zeros((int(len(data_train) / 10), latent_dim))
                         }))
        z_e.extend(
            sess.run(model.sample_z_e,
                     feed_dict={
                         x: data_train[int(len(data_train) / 10) * 9:],
                         is_training: True,
                         z: np.zeros((int(len(data_val) / 10), latent_dim))
                     }))
        z_e = np.array(z_e)
        assign_mu_op = model.get_assign_cluster_centers_op(features=z_e)
        _ = sess.run(assign_mu_op)

        print("\nTraining...\n")
        for epoch in range(num_epochs):
            epochs += 1
            # Compute initial soft probabilities between data points and centroids
            q = []
            for t in range(9):
                q.extend(
                    sess.run(model.q,
                             feed_dict={
                                 x: data_train[int(len(data_train) / 10) * t:
                                               int(len(data_train) / 10) * (t + 1)],
                                 is_training: True,
                                 z: np.zeros((int(len(data_train) / 10), latent_dim))
                             }))
            q.extend(
                sess.run(model.q,
                         feed_dict={
                             x: data_train[int(len(data_train) / 10) * 9:],
                             is_training: True,
                             z: np.zeros((int(len(data_train) / 10), latent_dim))
                         }))
            q = np.array(q)
            ppt = model.target_distribution(q)
            q = sess.run(model.q,
                         feed_dict={
                             x: data_val,
                             is_training: True,
                             z: np.zeros((len(data_val), latent_dim))
                         })
            ppv = model.target_distribution(q)

            # Train
            for i in range(num_batches):
                iterations += 1
                batch_data, _, ii = next(train_gen)
                ftrain = {
                    p: ppt[ii * batch_size:(ii + 1) * batch_size],
                    is_training: True,
                    z: np.zeros((batch_size, latent_dim))
                }
                f_dic = {x: batch_data, lr_val: learning_rate}
                f_dic.update(ftrain)
                train_step_SOMVAE.run(feed_dict=f_dic)

                batch_val, _, ii = next(val_gen)
                fval = {
                    p: ppv[ii * batch_size:(ii + 1) * batch_size],
                    is_training: True,
                    z: np.zeros((batch_size, latent_dim))
                }
                f_dic = {x: batch_val}
                f_dic.update(fval)
                test_loss, summary = sess.run([model.loss, summaries], feed_dict=f_dic)
                test_losses.append(test_loss)
                if i % 100 == 0:
                    test_writer.add_summary(summary, tf.train.global_step(sess, model.global_step))
                    f_dic = {x: batch_data}
                    f_dic.update(ftrain)
                    train_loss, summary = sess.run([model.loss, summaries], feed_dict=f_dic)
                    train_writer.add_summary(summary, tf.train.global_step(sess, model.global_step))
                if i % 1000 == 0:
                    test_loss_mean = np.mean(test_losses)
                    test_losses_mean.append(test_loss_mean)
                    test_losses = []
                if len(test_losses_mean) > 0:
                    test_s = test_losses_mean[-1]
                else:
                    test_s = test_losses_mean
                pbar.set_postfix(epoch=epoch, train_loss=train_loss, test_loss=test_s,
                                 refresh=False)
                pbar.update(1)

            saver.save(sess, modelpath)
            if val_epochs == True and epochs % 10 == 0:
                saver.save(sess, modelpath)
                results = evaluate_model(model, generator, len_data_val, x, modelpath, epochs)
                if results is None:
                    return None

        saver.save(sess, modelpath)
        results = evaluate_model(model, generator, len_data_val, x, modelpath, epochs)
    return results
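# A minimal NumPy sketch (an assumption, not the repository's implementation) of the
# DEC-style target distribution that model.target_distribution(q) is expected to
# compute above: square the soft cluster assignments, normalize by cluster frequency,
# and renormalize per sample so each row is again a probability distribution.
import numpy as np


def target_distribution(q):
    # q: (n_samples, n_clusters) soft assignments; rows sum to 1.
    weight = q ** 2 / q.sum(axis=0)           # emphasize confident assignments
    return (weight.T / weight.sum(axis=1)).T  # renormalize each row


q_example = np.array([[0.9, 0.1], [0.4, 0.6]])
print(target_distribution(q_example))  # rows still sum to 1, but are sharper than q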
def train_model(model, data_train, data_val, endpoints_total_val, lr_val, num_epochs,
                patience, batch_size, logdir, modelpath, learning_rate, interactive,
                only_evaluate, benchmark, train_ratio):
    """Trains the SOM-VAE model.

    Args:
        model (SOM-VAE): SOM-VAE model to train.
        data_train (np.array): Training set.
        data_val (np.array): Validation/test set.
        endpoints_total_val (np.array): Validation/test labels.
        lr_val (tf.Tensor): Placeholder for the learning rate value.
        num_epochs (int): Number of epochs to train.
        patience (int): Patience parameter for the early stopping.
        batch_size (int): Batch size for the training generator.
        logdir (path): Directory for saving the logs.
        modelpath (path): Path for saving the model checkpoints.
        learning_rate (float): Learning rate for the optimization.
        interactive (bool): Indicator if we want to have an interactive progress bar for training.
        only_evaluate (bool): If True, do not train; just load the saved model and evaluate it.
        benchmark (bool): If True, report fitting times and exit without evaluation.
        train_ratio: Ratio of the training data used (only reported in benchmark mode).
    """
    len_data_val = len(data_val)
    val_gen = batch_generator(data_train, data_val, endpoints_total_val, mode="val")
    x = model.inputs

    if benchmark:
        times_per_epoch = []

    # Train the model
    if not only_evaluate:
        len_data_train = len(data_train)
        num_batches = len_data_train // batch_size
        train_gen = batch_generator(data_train, data_val, endpoints_total_val, mode="train")
        saver = tf.train.Saver(keep_checkpoint_every_n_hours=0.5)
        summaries = tf.summary.merge_all()

        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            patience_count = 0
            test_losses = []
            with LogFileWriter(ex):
                train_writer = tf.summary.FileWriter(logdir + "/train", sess.graph)
                test_writer = tf.summary.FileWriter(logdir + "/test", sess.graph)
            print("Training...")
            train_step_SOMVAE, train_step_prob = model.optimize
            if interactive:
                pbar = tqdm(total=num_epochs * (num_batches))
            if benchmark:
                t_begin_all = timeit.default_timer()

            for epoch in range(num_epochs):
                if benchmark:
                    t_begin = timeit.default_timer()
                batch_val, _ = next(val_gen)
                test_loss, summary = sess.run(
                    [model.loss, summaries],
                    feed_dict={x: batch_val, model.is_training: True,
                               model.prediction_input: np.zeros(batch_size)})
                test_losses.append(test_loss)
                test_writer.add_summary(summary, tf.train.global_step(sess, model.global_step))
                # Early stopping: keep the checkpoint of the best validation loss
                if test_losses[-1] == min(test_losses):
                    saver.save(sess, modelpath, global_step=epoch)
                    patience_count = 0
                else:
                    patience_count += 1
                if patience_count >= patience:
                    break

                for i in range(num_batches):
                    batch_data = next(train_gen)
                    if i % 100 == 0:
                        train_loss, summary = sess.run(
                            [model.loss, summaries],
                            feed_dict={x: batch_data, model.is_training: True,
                                       model.prediction_input: np.zeros(batch_size)})
                        train_writer.add_summary(summary,
                                                 tf.train.global_step(sess, model.global_step))
                    train_step_SOMVAE.run(
                        feed_dict={x: batch_data, lr_val: learning_rate,
                                   model.is_training: True,
                                   model.prediction_input: np.zeros(batch_size)})
                    train_step_prob.run(
                        feed_dict={x: batch_data, lr_val: learning_rate * 100,
                                   model.is_training: True,
                                   model.prediction_input: np.zeros(batch_size)})
                    if interactive:
                        pbar.set_postfix(epoch=epoch, train_loss=train_loss,
                                         test_loss=test_loss, refresh=False)
                        pbar.update(1)

                if benchmark:
                    t_end = timeit.default_timer()
                    times_per_epoch.append(t_end - t_begin)

            if benchmark:
                t_end_all = timeit.default_timer()
                ttime_all = t_end_all - t_begin_all

            saver.save(sess, modelpath)
            if interactive:
                # Close the progress bar only if one was created
                pbar.close()

        if benchmark:
            print("Total time series: {}/{}".format(train_ratio, len(data_train)))
            print("Fitting time per epoch: {:.3f}".format(np.mean(times_per_epoch)))
            print("Total fitting time: {:.3f}".format(ttime_all))
            sys.exit(0)

    # Evaluate the model in any case
    with tf.Session() as sess:
        results = evaluate_model(model, x, val_gen, len_data_val, modelpath)
    return results
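# A small, framework-free sketch (hypothetical loss values, not from the original run)
# of the early-stopping pattern used above: keep the checkpoint for the best validation
# loss seen so far and stop once it has not improved for `patience` consecutive epochs.
def early_stopping_demo(val_losses, patience):
    best = float("inf")
    patience_count = 0
    for epoch, loss in enumerate(val_losses):
        if loss < best:
            best = loss          # a checkpoint would be saved here
            patience_count = 0
        else:
            patience_count += 1
        if patience_count >= patience:
            return epoch         # stop training at this epoch
    return len(val_losses) - 1


print(early_stopping_demo([1.0, 0.8, 0.7, 0.71, 0.72, 0.73], patience=2))  # -> 4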
def train_model(model, x, lr_val, num_epochs, patience, batch_size, logdir, modelpath,
                learning_rate, interactive, generator):
    """Trains the SOM-VAE model.

    Args:
        model (SOM-VAE): SOM-VAE model to train.
        x (tf.Tensor): Input tensor or placeholder.
        lr_val (tf.Tensor): Placeholder for the learning rate value.
        num_epochs (int): Number of epochs to train.
        patience (int): Patience parameter for the early stopping.
        batch_size (int): Batch size for the training generator.
        logdir (path): Directory for saving the logs.
        modelpath (path): Path for saving the model checkpoints.
        learning_rate (float): Learning rate for the optimization.
        interactive (bool): Indicator if we want to have an interactive progress bar for training.
        generator (generator): Generator for the data batches.
    """
    train_gen = generator("train", batch_size)
    val_gen = generator("val", batch_size)

    # NOTE: data_train is not a parameter of this function; it is expected to be
    # available in the enclosing scope.
    num_batches = len(data_train) // batch_size

    saver = tf.train.Saver(keep_checkpoint_every_n_hours=2.)
    summaries = tf.summary.merge_all()

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        patience_count = 0
        test_losses = []
        with LogFileWriter(ex):
            train_writer = tf.summary.FileWriter(logdir + "/train", sess.graph)
            test_writer = tf.summary.FileWriter(logdir + "/test", sess.graph)
        print("Training...")
        train_step_SOMVAE, train_step_prob = model.optimize
        try:
            if interactive:
                pbar = tqdm(total=num_epochs * (num_batches))
            for epoch in range(num_epochs):
                batch_val = next(val_gen)
                test_loss, summary = sess.run([model.loss, summaries],
                                              feed_dict={x: batch_val})
                test_losses.append(test_loss)
                test_writer.add_summary(summary, tf.train.global_step(sess, model.global_step))
                if test_losses[-1] == min(test_losses):
                    saver.save(sess, modelpath, global_step=epoch)
                    patience_count = 0
                else:
                    patience_count += 1
                if patience_count >= patience:
                    break
                for i in range(num_batches):
                    batch_data = next(train_gen)
                    if i % 100 == 0:
                        train_loss, summary = sess.run([model.loss, summaries],
                                                       feed_dict={x: batch_data})
                        train_writer.add_summary(summary,
                                                 tf.train.global_step(sess, model.global_step))
                    train_step_SOMVAE.run(feed_dict={x: batch_data, lr_val: learning_rate})
                    train_step_prob.run(feed_dict={x: batch_data, lr_val: learning_rate * 100})
                    if interactive:
                        pbar.set_postfix(epoch=epoch, train_loss=train_loss,
                                         test_loss=test_loss, refresh=False)
                        pbar.update(1)
        except KeyboardInterrupt:
            pass
        finally:
            saver.save(sess, modelpath)
            if interactive:
                pbar.close()
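# A minimal sketch (an assumption about the expected interface, not the repository's
# actual data pipeline) of the `generator` argument used above: generator(mode,
# batch_size) returns an infinite iterator over random batches drawn from the
# corresponding data split. The split shapes below are illustrative.
import numpy as np


def make_generator(data_splits):
    def generator(mode, batch_size):
        data = data_splits[mode]
        while True:
            idx = np.random.choice(len(data), batch_size, replace=False)
            yield data[idx]
    return generator


splits = {"train": np.random.rand(1000, 28, 28, 1),
          "val": np.random.rand(100, 28, 28, 1)}
generator = make_generator(splits)
train_gen = generator("train", 32)
print(next(train_gen).shape)  # (32, 28, 28, 1)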
def train_model(model, data_train, data_val, endpoints_total_val, lr_val, num_epochs,
                batch_size, latent_dim, som_dim, learning_rate, epochs_pretrain, ex_name,
                logdir, modelpath, val_epochs, save_pretrain, use_saved_pretrain,
                benchmark, train_ratio):
    """Trains the T-DPSOM model.

    Params:
        model (T-DPSOM): T-DPSOM model to train.
        data_train (np.array): Training set.
        data_val (np.array): Validation/test set.
        endpoints_total_val (np.array): Validation/test labels.
        lr_val (tf.Tensor): Placeholder for the learning rate value.
        num_epochs (int): Number of training epochs.
        batch_size (int): Batch size for the training.
        latent_dim (int): Dimensionality of the T-DPSOM's latent space.
        som_dim (list): Dimensionality of the self-organizing map.
        learning_rate (float): Learning rate for the optimization.
        epochs_pretrain (int): Number of VAE pretraining epochs.
        ex_name (string): Unique name of this particular run.
        logdir (path): Directory for the experiment logs.
        modelpath (path): Path for the model checkpoints.
        val_epochs (bool): If "True" clustering results are saved every 10 epochs on default output files.
    """
    max_n_step = 72
    epochs = 0
    iterations = 0
    pretrainpath = "../models/pretrain/LSTM"
    len_data_train = len(data_train)
    len_data_val = len(data_val)
    num_batches = len_data_train // batch_size
    train_gen = batch_generator(data_train, data_val, endpoints_total_val, mode="train")
    val_gen = batch_generator(data_train, data_val, endpoints_total_val, mode="val")
    saver = tf.train.Saver(max_to_keep=50)
    summaries = tf.summary.merge_all()

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        test_losses = []
        test_losses_mean = []
        with LogFileWriter(ex):
            train_writer = tf.summary.FileWriter(logdir + "/train", sess.graph)
            test_writer = tf.summary.FileWriter(logdir + "/test", sess.graph)
        train_step_SOMVAE, train_step_ae, train_step_som, train_step_prob = model.optimize
        x = model.inputs
        p = model.p
        is_training = model.is_training
        graph = tf.get_default_graph()
        init_1 = graph.get_tensor_by_name("prediction/next_state/init_state:0")
        z_e_p = graph.get_tensor_by_name("prediction/next_state/input_lstm:0")
        z_e_rec = graph.get_tensor_by_name('reconstruction_e/decoder/z_e:0')
        training_dic = {
            is_training: True,
            z_e_p: np.zeros((max_n_step * batch_size, latent_dim)),
            init_1: np.zeros((2, batch_size, 100)),
            z_e_rec: np.zeros((max_n_step * batch_size, latent_dim))
        }
        pbar = tqdm(total=(num_epochs + epochs_pretrain * 3) * (num_batches))
        print("\n**********Starting job {}********* \n".format(ex_name))
        a = np.zeros((batch_size * 72, som_dim[0] * som_dim[1]))
        dp = {p: a}
        dp.update(training_dic)

        if benchmark:
            ttime_per_epoch = []
            ttime_ae_per_epoch = []
            ttime_som_per_epoch = []
            ttime_pred_per_epoch = []

        if use_saved_pretrain:
            print("\n\nUsing Saved Pretraining...\n")
            saver.restore(sess, pretrainpath)
        else:
            print("\n\nAutoencoder Pretraining...\n")
            if benchmark:
                t_begin_all = timeit.default_timer()
            for epoch in range(epochs_pretrain):
                if benchmark:
                    t_begin = timeit.default_timer()
                for i in range(num_batches):
                    batch_data, ii = next(train_gen)
                    f_dic = {x: batch_data, lr_val: learning_rate}
                    f_dic.update(dp)
                    train_step_ae.run(feed_dict=f_dic)
                    if i % 100 == 0:
                        batch_val, _, ii = next(val_gen)
                        f_dic = {x: batch_val}
                        f_dic.update(dp)
                        test_loss, summary = sess.run([model.loss_reconstruction_ze, summaries],
                                                      feed_dict=f_dic)
                        test_writer.add_summary(summary,
                                                tf.train.global_step(sess, model.global_step))
                        f_dic = {x: batch_data}
                        f_dic.update(dp)
                        train_loss, summary = sess.run([model.loss_reconstruction_ze, summaries],
                                                       feed_dict=f_dic)
                        train_writer.add_summary(summary,
                                                 tf.train.global_step(sess, model.global_step))
                    pbar.set_postfix(epoch=epoch, train_loss=train_loss, test_loss=test_loss,
                                     refresh=False)
                    pbar.update(1)
                if benchmark:
                    t_end = timeit.default_timer()
                    ttime_ae_per_epoch.append(t_end - t_begin)
            if benchmark:
                t_end_all = timeit.default_timer()
                ttime_ae_pretrain = t_end_all - t_begin_all

            print("\n\nSOM initialization...\n")
            if benchmark:
                t_begin_all = timeit.default_timer()
            for epoch in range(epochs_pretrain // 3):
                if benchmark:
                    t_begin = timeit.default_timer()
                for i in range(num_batches):
                    batch_data, ii = next(train_gen)
                    f_dic = {x: batch_data, lr_val: 0.1}
                    f_dic.update(dp)
                    train_step_som.run(feed_dict=f_dic)
                    if i % 100 == 0:
                        batch_val, _, ii = next(val_gen)
                        f_dic = {x: batch_val}
                        f_dic.update(dp)
                        test_loss, summary = sess.run([model.loss_a, summaries], feed_dict=f_dic)
                        test_writer.add_summary(summary,
                                                tf.train.global_step(sess, model.global_step))
                        f_dic = {x: batch_data}
                        f_dic.update(dp)
                        train_loss, summary = sess.run([model.loss_a, summaries], feed_dict=f_dic)
                        train_writer.add_summary(summary,
                                                 tf.train.global_step(sess, model.global_step))
                    pbar.set_postfix(epoch=epoch, train_loss=train_loss, test_loss=test_loss,
                                     refresh=False)
                    pbar.update(1)
                if benchmark:
                    t_end = timeit.default_timer()
                    ttime_som_per_epoch.append(t_end - t_begin)
            for epoch in range(epochs_pretrain // 3):
                if benchmark:
                    t_begin = timeit.default_timer()
                for i in range(num_batches):
                    batch_data, ii = next(train_gen)
                    f_dic = {x: batch_data, lr_val: 0.01}
                    f_dic.update(dp)
                    train_step_som.run(feed_dict=f_dic)
                    if i % 100 == 0:
                        batch_val, _, ii = next(val_gen)
                        f_dic = {x: batch_val}
                        f_dic.update(dp)
                        test_loss, summary = sess.run([model.loss_a, summaries], feed_dict=f_dic)
                        test_writer.add_summary(summary,
                                                tf.train.global_step(sess, model.global_step))
                        f_dic = {x: batch_data}
                        f_dic.update(dp)
                        train_loss, summary = sess.run([model.loss_a, summaries], feed_dict=f_dic)
                        train_writer.add_summary(summary,
                                                 tf.train.global_step(sess, model.global_step))
                    pbar.set_postfix(epoch=epoch, train_loss=train_loss, test_loss=test_loss,
                                     refresh=False)
                    pbar.update(1)
                if benchmark:
                    t_end = timeit.default_timer()
                    ttime_som_per_epoch.append(t_end - t_begin)
            for epoch in range(epochs_pretrain // 3):
                if benchmark:
                    t_begin = timeit.default_timer()
                for i in range(num_batches):
                    batch_data, ii = next(train_gen)
                    f_dic = {x: batch_data, lr_val: 0.01}
                    f_dic.update(dp)
                    train_step_som.run(feed_dict=f_dic)
                    if i % 100 == 0:
                        batch_val, _, ii = next(val_gen)
                        f_dic = {x: batch_val}
                        f_dic.update(dp)
                        test_loss, summary = sess.run([model.loss_a, summaries], feed_dict=f_dic)
                        test_writer.add_summary(summary,
                                                tf.train.global_step(sess, model.global_step))
                        f_dic = {x: batch_data}
                        f_dic.update(dp)
                        train_loss, summary = sess.run([model.loss_a, summaries], feed_dict=f_dic)
                        train_writer.add_summary(summary,
                                                 tf.train.global_step(sess, model.global_step))
                    pbar.set_postfix(epoch=epoch, train_loss=train_loss, test_loss=test_loss,
                                     refresh=False)
                    pbar.update(1)
                if benchmark:
                    t_end = timeit.default_timer()
                    ttime_som_per_epoch.append(t_end - t_begin)
            if benchmark:
                t_end_all = timeit.default_timer()
                ttime_som = t_end_all - t_begin_all
            if save_pretrain:
                saver.save(sess, pretrainpath)

        print("\n\nTraining...\n")
        if benchmark:
            t_begin_all = timeit.default_timer()
        for epoch in range(num_epochs):
            if benchmark:
                t_begin = timeit.default_timer()
            epochs += 1
            print(epochs)
            f_dic = {x: data_train}
            f_dic.update(training_dic)
            q = []
            for t in range(19):
                q.extend(
                    sess.run(model.q,
                             feed_dict={
                                 x: data_train[int(len(data_train) / 20) * t:
                                               int(len(data_train) / 20) * (t + 1)]
                             }))
            q.extend(
                sess.run(model.q,
                         feed_dict={x: data_train[int(len(data_train) / 20) * 19:]}))
            q = np.array(q)
            ppt = model.target_distribution(q)

            q = []
            f_dic = {x: data_val}
            f_dic.update(training_dic)
            for t in range(9):
                q.extend(
                    sess.run(model.q,
                             feed_dict={
                                 x: data_val[int(len(data_val) / 10) * t:
                                             int(len(data_val) / 10) * (t + 1)]
                             }))
            q.extend(
                sess.run(model.q,
                         feed_dict={x: data_val[int(len(data_val) / 10) * 9:]}))
            q = np.array(q)
            ppv = model.target_distribution(q)

            for i in range(num_batches):
                iterations += 1
                batch_data, ii = next(train_gen)
                ftrain = {p: ppt[ii * batch_size * 72:(ii + 1) * batch_size * 72]}
                f_dic = {x: batch_data, lr_val: learning_rate}
                f_dic.update(ftrain)
                f_dic.update(training_dic)
                train_step_SOMVAE.run(feed_dict=f_dic)
                train_step_prob.run(feed_dict=f_dic)

                batch_val, _, ii = next(val_gen)
                fval = {p: ppv[ii * batch_size * 72:(ii + 1) * batch_size * 72]}
                f_dic = {x: batch_val}
                f_dic.update(fval)
                f_dic.update(training_dic)
                test_loss, summary = sess.run([model.loss, summaries], feed_dict=f_dic)
                test_losses.append(test_loss)
                if i % 100 == 0:
                    test_writer.add_summary(summary, tf.train.global_step(sess, model.global_step))
                    f_dic = {x: batch_data}
                    f_dic.update(ftrain)
                    f_dic.update(training_dic)
                    train_loss, summary = sess.run([model.loss, summaries], feed_dict=f_dic)
                    if math.isnan(train_loss):
                        return None
                    train_writer.add_summary(summary, tf.train.global_step(sess, model.global_step))
                if i % 1000 == 0:
                    test_loss_mean = np.mean(test_losses)
                    test_losses_mean.append(test_loss_mean)
                    test_losses = []
                if len(test_losses_mean) > 0:
                    test_s = test_losses_mean[-1]
                else:
                    test_s = test_losses_mean
                pbar.set_postfix(epoch=epoch, train_loss=train_loss, test_loss=test_s,
                                 refresh=False)
                pbar.update(1)

            if val_epochs == True and epoch % 5 == 0:
                path = "../models/exp/exp" + str(epoch) + "/LSTM"
                saver.save(sess, path)
                # results = evaluate_model(model, x, val_gen, len_data_val, modelpath, epochs)
            if benchmark:
                t_end = timeit.default_timer()
                ttime_per_epoch.append(t_end - t_begin)
        if benchmark:
            t_end_all = timeit.default_timer()
            ttime_training = t_end_all - t_begin_all

        print("\n\nPrediction Finetuning...\n")
        if benchmark:
            t_begin_all = timeit.default_timer()
        for epoch in range(200):
            if benchmark:
                t_begin = timeit.default_timer()
            for i in range(num_batches):
                batch_data, ii = next(train_gen)
                f_dic = {x: batch_data, lr_val: learning_rate * 10}
                f_dic.update(dp)
                train_step_prob.run(feed_dict=f_dic)
                if i % 100 == 0:
                    batch_val, _, ii = next(val_gen)
                    f_dic = {x: batch_val}
                    f_dic.update(dp)
                    test_loss, summary = sess.run([model.loss_prediction, summaries],
                                                  feed_dict=f_dic)
                    test_writer.add_summary(summary, tf.train.global_step(sess, model.global_step))
                    f_dic = {x: batch_data}
                    f_dic.update(dp)
                    train_loss, summary = sess.run([model.loss_prediction, summaries],
                                                   feed_dict=f_dic)
                    train_writer.add_summary(summary, tf.train.global_step(sess, model.global_step))
                pbar.set_postfix(epoch=epoch, train_loss=train_loss, test_loss=test_loss,
                                 refresh=False)
                pbar.update(1)
            if benchmark:
                t_end = timeit.default_timer()
                ttime_pred_per_epoch.append(t_end - t_begin)
        if benchmark:
            t_end_all = timeit.default_timer()
            ttime_pred = t_end_all - t_begin_all

        saver.save(sess, modelpath)
        results = evaluate_model(model, x, val_gen, len_data_val, modelpath, epochs)
        pbar.close()

        if benchmark:
            print("\nNumber of time series in train: {} %, {}".format(train_ratio,
                                                                      len(data_train)))
            print("SOM init time: {:.3f}".format(ttime_som))
            print("SOM init time per epoch: {:.3f}".format(np.mean(ttime_som_per_epoch)))
            print("AE pretrain time: {:.3f}".format(ttime_ae_pretrain))
            print("AE pretrain time per epoch: {:.3f}".format(np.mean(ttime_ae_per_epoch)))
            print("Training time: {:.3f}".format(ttime_training))
            print("Training time per epoch: {:.3f}".format(np.mean(ttime_per_epoch)))
            print("Pred finetuning time: {:.3f}".format(ttime_pred))
            print("Pred finetuning time per epoch: {:.3f}".format(np.mean(ttime_pred_per_epoch)))
            sys.exit(0)

    return results
def train_model(model, data_train, data_val, generator, lr_val, num_epochs, batch_size,
                logdir, ex_name, validation, val_epochs, modelpath, learning_rate,
                epochs_pretrain, som_dim, latent_dim, use_saved_pretrain,
                learning_rate_pretrain, save_pretrain):
    """Trains the DPSOM model.

    Args:
        model (DPSOM): DPSOM model to train.
        data_train (np.array): Training set.
        data_val (np.array): Validation/test set.
        generator (generator): Data generator for the batches.
        lr_val (tf.Tensor): Placeholder for the learning rate value.
        num_epochs (int): Number of training epochs.
        batch_size (int): Batch size for the training.
        logdir (path): Directory for the experiment logs.
        ex_name (string): Unique name of this particular run.
        val_epochs (bool): If "True" clustering results are saved every 10 epochs on default output files.
        modelpath (path): Path for the model checkpoints.
        learning_rate (float): Learning rate for the optimization.
        epochs_pretrain (int): Number of VAE pretraining epochs.
        som_dim (list): Dimensionality of the self-organizing map.
        latent_dim (int): Dimensionality of the DPSOM's latent space.
    """
    epochs = 0
    iterations = 0
    train_gen = generator("train", batch_size)
    if validation:
        val_gen = generator("val", batch_size)
    else:
        val_gen = generator("test", batch_size)
    len_data_train = len(data_train)
    len_data_val = len(data_val)
    num_batches = len_data_train // batch_size
    saver = tf.train.Saver(max_to_keep=5)
    summaries = tf.summary.merge_all()

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        test_losses = []
        test_losses_mean = []
        pretrainpath = "../models/pretrainVAE/VAE"
        with LogFileWriter(ex):
            train_writer = tf.summary.FileWriter(logdir + "/train", sess.graph)
            test_writer = tf.summary.FileWriter(logdir + "/test", sess.graph)
        train_step_VARPSOM, train_step_vae, train_step_som = model.optimize
        x = model.inputs
        p = model.p
        is_training = model.is_training
        graph = tf.get_default_graph()
        z = graph.get_tensor_by_name("reconstruction_e/decoder/z_e:0")
        print("\n**********Starting job {}********* \n".format(ex_name))
        pbar = tqdm(total=(num_epochs + epochs_pretrain + 40) * num_batches)

        if use_saved_pretrain:
            print("\n\nUsing Saved Pretraining...\n")
            saver.restore(sess, pretrainpath)
        else:
            print("\n\nAutoencoder Pretraining...\n")
            a = np.zeros((batch_size, som_dim[0] * som_dim[1]))
            dp = {p: a, is_training: True, z: np.zeros((batch_size, latent_dim))}
            for epoch in range(epochs_pretrain):
                for i in range(num_batches):
                    batch_data, _, _ = next(train_gen)
                    f_dic = {x: batch_data, lr_val: learning_rate_pretrain}
                    f_dic.update(dp)
                    train_step_vae.run(feed_dict=f_dic)
                    if i % 100 == 0:
                        batch_val, _, _ = next(val_gen)
                        f_dic = {x: batch_val}
                        f_dic.update(dp)
                        test_loss, summary = sess.run([model.loss_reconstruction_ze, summaries],
                                                      feed_dict=f_dic)
                        test_writer.add_summary(summary,
                                                tf.train.global_step(sess, model.global_step))
                        f_dic = {x: batch_data}
                        f_dic.update(dp)
                        train_loss, summary = sess.run([model.loss_reconstruction_ze, summaries],
                                                       feed_dict=f_dic)
                        train_writer.add_summary(summary,
                                                 tf.train.global_step(sess, model.global_step))
                    pbar.set_postfix(epoch=epoch, train_loss=train_loss, test_loss=test_loss,
                                     refresh=False)
                    pbar.update(1)

            print("\n\nSOM initialization...\n")
            for epoch in range(5):
                for i in range(num_batches):
                    batch_data, _, ii = next(train_gen)
                    f_dic = {x: batch_data, lr_val: 0.9}
                    f_dic.update(dp)
                    train_step_som.run(feed_dict=f_dic)
                    if i % 100 == 0:
                        batch_val, _, ii = next(val_gen)
                        f_dic = {x: batch_val}
                        f_dic.update(dp)
                        test_loss, summary = sess.run([model.loss_a, summaries], feed_dict=f_dic)
                        test_writer.add_summary(summary,
                                                tf.train.global_step(sess, model.global_step))
                        f_dic = {x: batch_data}
                        f_dic.update(dp)
                        train_loss, summary = sess.run([model.loss_a, summaries], feed_dict=f_dic)
                        train_writer.add_summary(summary,
                                                 tf.train.global_step(sess, model.global_step))
                    pbar.set_postfix(epoch=epoch, train_loss=train_loss, test_loss=test_loss,
                                     refresh=False)
                    pbar.update(1)
            for epoch in range(5):
                for i in range(num_batches):
                    batch_data, _, ii = next(train_gen)
                    f_dic = {x: batch_data, lr_val: 0.3}
                    f_dic.update(dp)
                    train_step_som.run(feed_dict=f_dic)
                    if i % 100 == 0:
                        batch_val, _, ii = next(val_gen)
                        f_dic = {x: batch_val}
                        f_dic.update(dp)
                        test_loss, summary = sess.run([model.loss_a, summaries], feed_dict=f_dic)
                        test_writer.add_summary(summary,
                                                tf.train.global_step(sess, model.global_step))
                        f_dic = {x: batch_data}
                        f_dic.update(dp)
                        train_loss, summary = sess.run([model.loss_a, summaries], feed_dict=f_dic)
                        train_writer.add_summary(summary,
                                                 tf.train.global_step(sess, model.global_step))
                    pbar.set_postfix(epoch=epoch, train_loss=train_loss, test_loss=test_loss,
                                     refresh=False)
                    pbar.update(1)
            for epoch in range(5):
                for i in range(num_batches):
                    batch_data, _, ii = next(train_gen)
                    f_dic = {x: batch_data, lr_val: 0.1}
                    f_dic.update(dp)
                    train_step_som.run(feed_dict=f_dic)
                    if i % 100 == 0:
                        batch_val, _, ii = next(val_gen)
                        f_dic = {x: batch_val}
                        f_dic.update(dp)
                        test_loss, summary = sess.run([model.loss_a, summaries], feed_dict=f_dic)
                        test_writer.add_summary(summary,
                                                tf.train.global_step(sess, model.global_step))
                        f_dic = {x: batch_data}
                        f_dic.update(dp)
                        train_loss, summary = sess.run([model.loss_a, summaries], feed_dict=f_dic)
                        train_writer.add_summary(summary,
                                                 tf.train.global_step(sess, model.global_step))
                    pbar.set_postfix(epoch=epoch, train_loss=train_loss, test_loss=test_loss,
                                     refresh=False)
                    pbar.update(1)
            for epoch in range(5):
                for i in range(num_batches):
                    batch_data, _, ii = next(train_gen)
                    f_dic = {x: batch_data, lr_val: 0.01}
                    f_dic.update(dp)
                    train_step_som.run(feed_dict=f_dic)
                    if i % 100 == 0:
                        batch_val, _, ii = next(val_gen)
                        f_dic = {x: batch_val}
                        f_dic.update(dp)
                        test_loss, summary = sess.run([model.loss_a, summaries], feed_dict=f_dic)
                        test_writer.add_summary(summary,
                                                tf.train.global_step(sess, model.global_step))
                        f_dic = {x: batch_data}
                        f_dic.update(dp)
                        train_loss, summary = sess.run([model.loss_a, summaries], feed_dict=f_dic)
                        train_writer.add_summary(summary,
                                                 tf.train.global_step(sess, model.global_step))
                    pbar.set_postfix(epoch=epoch, train_loss=train_loss, test_loss=test_loss,
                                     refresh=False)
                    pbar.update(1)
            if save_pretrain:
                saver.save(sess, pretrainpath)

        print("\n\nTraining...\n")
        lratios = []
        l2ratios = []
        l3ratios = []
        for epoch in range(num_epochs):
            epochs += 1
            # Compute initial soft probabilities between data points and centroids
            q = []
            for t in range(9):
                q.extend(
                    sess.run(model.q,
                             feed_dict={
                                 x: data_train[int(len(data_train) / 10) * t:
                                               int(len(data_train) / 10) * (t + 1)],
                                 is_training: True,
                                 z: np.zeros((int(len(data_train) / 10), latent_dim))
                             }))
            q.extend(
                sess.run(model.q,
                         feed_dict={
                             x: data_train[int(len(data_train) / 10) * 9:],
                             is_training: True,
                             z: np.zeros((int(len(data_train) / 10), latent_dim))
                         }))
            q = np.array(q)
            ppt = model.target_distribution(q)
            q = sess.run(model.q,
                         feed_dict={
                             x: data_val,
                             is_training: True,
                             z: np.zeros((len(data_val), latent_dim))
                         })
            ppv = model.target_distribution(q)

            # Train
            for i in range(num_batches):
                iterations += 1
                batch_data, _, ii = next(train_gen)
                ftrain = {
                    p: ppt[ii * batch_size:(ii + 1) * batch_size],
                    is_training: True,
                    z: np.zeros((batch_size, latent_dim))
                }
                f_dic = {x: batch_data, lr_val: learning_rate}
                f_dic.update(ftrain)
                train_step_VARPSOM.run(feed_dict=f_dic)

                batch_val, _, ii = next(val_gen)
                fval = {
                    p: ppv[ii * batch_size:(ii + 1) * batch_size],
                    is_training: True,
                    z: np.zeros((batch_size, latent_dim))
                }
                f_dic = {x: batch_val}
                f_dic.update(fval)
                test_loss, summary = sess.run([model.loss, summaries], feed_dict=f_dic)
                test_losses.append(test_loss)
                if i % 100 == 0:
                    test_writer.add_summary(summary, tf.train.global_step(sess, model.global_step))
                    f_dic = {x: batch_data}
                    f_dic.update(ftrain)
                    train_loss, summary = sess.run([model.loss, summaries], feed_dict=f_dic)
                    elbo_loss = sess.run([model.theta * model.loss_reconstruction_ze],
                                         feed_dict=f_dic)
                    cah_loss = sess.run([model.gamma * model.loss_commit], feed_dict=f_dic)
                    ssom_loss = sess.run([model.beta * model.loss_som], feed_dict=f_dic)
                    cah_ssom_ratio = cah_loss[0] / ssom_loss[0]
                    vae_cah_ratio = elbo_loss[0] / cah_loss[0]
                    clust_vae_ratio = elbo_loss[0] / (ssom_loss[0] + cah_loss[0])
                    lratios.append(cah_ssom_ratio)
                    l2ratios.append(vae_cah_ratio)
                    l3ratios.append(clust_vae_ratio)
                    train_writer.add_summary(summary, tf.train.global_step(sess, model.global_step))
                if i % 1000 == 0:
                    test_loss_mean = np.mean(test_losses)
                    test_losses_mean.append(test_loss_mean)
                    test_losses = []
                if len(test_losses_mean) > 0:
                    test_s = test_losses_mean[-1]
                else:
                    test_s = test_losses_mean
                pbar.set_postfix(epoch=epoch, train_loss=train_loss, test_loss=test_s,
                                 ssom=ssom_loss, cah=cah_loss, vae=elbo_loss,
                                 cs_ratio=np.mean(lratios), vc_ratio=np.mean(l2ratios),
                                 cr_ratio=np.mean(l3ratios), refresh=False)
                pbar.update(1)

            saver.save(sess, modelpath)
            if val_epochs == True and epochs % 10 == 0:
                saver.save(sess, modelpath)
                results = evaluate_model(model, generator, len_data_val, x, modelpath, epochs)
                if results is None:
                    return None

        saver.save(sess, modelpath)
        results = evaluate_model(model, generator, len_data_val, x, modelpath, epochs)
    return results
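# A small, framework-free sketch (illustrative values only, not outputs of the model
# above) of the loss-ratio diagnostics logged in the DPSOM training loop: given the
# three weighted terms of the objective (reconstruction/ELBO, commitment/CAH, and SOM
# loss), the loop tracks how large each term is relative to the others and reports
# running means in the progress bar.
import numpy as np


def loss_ratios(elbo_loss, cah_loss, ssom_loss):
    return {
        "cah/ssom": cah_loss / ssom_loss,
        "vae/cah": elbo_loss / cah_loss,
        "vae/(ssom+cah)": elbo_loss / (ssom_loss + cah_loss),
    }


history = [loss_ratios(120.0, 4.0, 2.0), loss_ratios(100.0, 3.5, 2.1)]
print({k: np.mean([h[k] for h in history]) for k in history[0]})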