def _mlp_configured(input_, num_actions, hiddens, dueling=False, summaries=False):
    """ Constructs an MLP Q-network, optionally with a dueling architecture """

    # build standard MLP
    if not dueling:
        output = _mlp(input_, num_actions, hiddens)
    else:
        # advantage value function
        A = _mlp(input_, num_actions, hiddens)

        # state-value function
        V = _mlp(input_, num_actions, hiddens)

        # plot mean state value
        if summaries:
            scalar_summary('state_value', tf.reduce_mean(V))

        # mean-center advantage values
        A_mean = tf.reduce_mean(A, axis=1)
        A_centered = A - tf.expand_dims(A_mean, axis=1)

        output = V + A_centered

    return output
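
# Hedged sketch (not part of the original code base): a plain-numpy check of the
# mean-centering used in the dueling branch above. `_demo_dueling_combine` is a
# hypothetical helper added only for illustration.
def _demo_dueling_combine():
    import numpy as np

    A = np.array([[1.0, 2.0, 3.0]])  # advantage estimates for one state, 3 actions
    V = np.array([[0.5]])            # state value, broadcast over actions

    # subtract the per-state mean advantage, then add the state value
    A_centered = A - A.mean(axis=1, keepdims=True)
    Q = V + A_centered               # -> [[-0.5, 0.5, 1.5]]
    return Q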
def __init__(self, m, ovo):
    # placeholders for train and test error metrics
    self.mse_ph = tf.placeholder(tf.float32, (), name='mse-train')
    self.mae_ph = tf.placeholder(tf.float32, (), name='mae-train')

    self.val_mse_ph = tf.placeholder(tf.float32, (), name='mse-test')
    self.val_mae_ph = tf.placeholder(tf.float32, (), name='mae-test')

    # image placeholder for the predicted-ball-position visualisation
    self.im_ph = tf.placeholder(tf.uint8, (1, 210 * 3, 160 * 5, 3), name='pred-ball-pos-ph')

    # separate scalar summaries for train and test metrics
    tr_sum = []
    tr_sum.append(scalar_summary('mse-train', self.mse_ph, scope='train'))
    tr_sum.append(scalar_summary('mae-train', self.mae_ph, scope='train'))

    te_sum = []
    te_sum.append(scalar_summary('mse-test', self.val_mse_ph, scope='test'))
    te_sum.append(scalar_summary('mae-test', self.val_mae_ph, scope='test'))

    self.im_sum = tf.summary.image('pred-ball-pos', self.im_ph)

    self.mtr_sum = tf.summary.merge(tr_sum)
    self.mte_sum = tf.summary.merge(te_sum)

    # `home`, `model_name` and `sess` are expected to be defined in the enclosing module
    self.fw = tf.summary.FileWriter(f'{home}/ball/{model_name}', graph=sess.graph)

    self.ovo = ovo
    self.step = 0
    self.m = m
def _setup_tensorboard(self):
    """
    Adds all variables that might help debugging to Tensorboard.
    At the end, the FileWriter is constructed pointing to the specified directory.
    """

    # more placeholders for summarised variables; along with summaries
    self.eps_ph = tf.placeholder(tf.float32, (), name='epsilon')
    self.rew_ph = tf.placeholder(tf.float32, (), name='rolling-reward')

    scalar_summary('epsilon', self.eps_ph)
    scalar_summary('reward', self.rew_ph)

    # display q_values while training
    for a_i in range(self.num_actions):
        scalar_summary('QTa_{}'.format(a_i + 1),
                       tf.reduce_mean(self.target_tp1[:, a_i]), scope='Q-Values')
        scalar_summary('Qa_{}'.format(a_i + 1),
                       tf.reduce_mean(self.q_t[:, a_i]), scope='Q-Values')

    # plot network weights
    with tf.variable_scope('weights'):
        for qv in self.q_net_vars:
            tf.summary.histogram('{}'.format(qv.name), qv)
        for tv in self.target_net_vars:
            tf.summary.histogram('{}'.format(tv.name), tv)

    # gradient histograms
    with tf.variable_scope('gradients'):
        for g in self.gradients:
            tf.summary.histogram('{}-grad'.format(g[1].name), g[0])
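
# Hedged, self-contained sketch (not part of the original code base): the general
# placeholder -> summary -> FileWriter flow that the method above relies on. It uses
# plain `tf.summary.scalar` instead of the repo's `scalar_summary` helper, and the
# log directory '/tmp/dqn-summary-demo' is made up for illustration.
def _demo_scalar_summary_feed():
    graph = tf.Graph()
    with graph.as_default():
        eps_ph = tf.placeholder(tf.float32, (), name='epsilon')
        rew_ph = tf.placeholder(tf.float32, (), name='rolling-reward')

        tf.summary.scalar('epsilon', eps_ph)
        tf.summary.scalar('reward', rew_ph)
        merged = tf.summary.merge_all()

    with tf.Session(graph=graph) as sess:
        writer = tf.summary.FileWriter('/tmp/dqn-summary-demo', graph=sess.graph)

        # feed current epsilon and rolling reward, then write the merged summary
        summary = sess.run(merged, feed_dict={eps_ph: 0.1, rew_ph: 21.0})
        writer.add_summary(summary, global_step=0)
        writer.flush()
        writer.close()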
def _loss(self):
    """ Defines the loss as laid out in the original Nature paper """

    with tf.variable_scope('loss'):

        # double Q: the action is chosen by the online q net while its value is taken
        # from the target network; otherwise use the maximum target q value
        if self.double_q:
            act_tp1_idxs = tf.stop_gradient(tf.argmax(self.q_tp1, axis=1))
            q_tp1 = tf.reduce_sum(
                self.target_tp1 * tf.one_hot(act_tp1_idxs, self.num_actions), axis=1)
        else:
            q_tp1 = tf.reduce_max(self.target_tp1, axis=1)

        # bellman target
        y = self._L_r + (self.gamma * (1.0 - self._L_d) * q_tp1)

        # select q value of taken action
        qj = tf.reduce_sum(self.q_t * tf.one_hot(self._L_a, self.num_actions), axis=1)

        # TD errors
        self._td_errors = qj - y

        # elementwise huber loss (no reduction) so that per-sample
        # importance weights can be applied below
        loss = tf.losses.huber_loss(y, qj, reduction=tf.losses.Reduction.NONE)

        if self.use_tensorboard:
            scalar_summary('target', tf.reduce_mean(y))
            scalar_summary('huber-loss', tf.reduce_mean(loss))
            tf.summary.histogram('selected_Q', qj)

        # importance sampling weights
        if self.prioritized_replay:
            updates = tf.reduce_mean(self._is_weights * loss)
        else:
            updates = tf.reduce_mean(loss)

    return updates
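
# Hedged sketch (not part of the original code base): the Bellman target from `_loss`
# computed with plain numpy for a toy batch. `_demo_bellman_target` is a hypothetical
# helper and all numbers are made up for illustration.
def _demo_bellman_target():
    import numpy as np

    gamma = 0.99
    r = np.array([1.0, 0.0])                 # rewards
    d = np.array([0.0, 1.0])                 # done flags; 1.0 stops bootstrapping
    q_target_tp1 = np.array([[0.5, 2.0],
                             [1.5, 0.3]])    # target-network Q values for s_{t+1}

    # y = r + gamma * (1 - done) * max_a' Q_target(s', a')
    y = r + gamma * (1.0 - d) * q_target_tp1.max(axis=1)   # -> [2.98, 0.0]

    q_taken = np.array([2.5, 0.2])           # Q(s, a) of the actions actually taken
    td_errors = q_taken - y                  # -> approx. [-0.48, 0.2]
    return y, td_errors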
def _setup_tensorboard(self):
    """
    Adds all variables that might help debugging to Tensorboard.
    At the end, the FileWriter is constructed pointing to the specified directory.
    """
    self.logger.info('Saving Tensorboard summaries to {}'.format(self.tensorboard_dir))

    self.ret_ph = tf.placeholder(tf.float32, (), name='mean-return')
    self.kl_ph = tf.placeholder(tf.float32, (), name='kl')
    self.pl_diff_ph = tf.placeholder(tf.float32, (), name='pl-diff')

    scalar_summary('mean-return', self.ret_ph)
    scalar_summary('kl', self.kl_ph)
    scalar_summary('pl-diff', self.pl_diff_ph)

    with tf.variable_scope('loss'):
        scalar_summary('value-loss', self.vf_loss)
        scalar_summary('policy-loss', self.policy_loss)
        # scalar_summary('policy-entropy', self.pi_entropy)

    with tf.variable_scope('value'):
        scalar_summary('value_target', tf.reduce_mean(self.d_rew_ph))
        scalar_summary('value', tf.reduce_mean(self.values))

    # plot network weights
    with tf.variable_scope('weights'):
        for pv in self.theta_vars:
            tf.summary.histogram('{}'.format(pv.name), pv)

    # gradient histograms
    with tf.variable_scope('gradients'):
        vector_summary('policy-gradient', self.pg)
def _tensorboard_setup(self):
    """ Tensorboard (TB) setup """

    with tf.variable_scope('{}-ph'.format(self.name)):
        self.bps_ph = tf.placeholder(tf.int32, (), name='batches-per-second')
        self.ep_ph = tf.placeholder(tf.int32, (), name='episode')

    scalar_summary('batches-per-second', self.bps_ph)
    scalar_summary('episode', self.ep_ph)

    self.v_loss = tf.placeholder(tf.float32, (), name='vae-loss')
    self.rel_ph = tf.placeholder(tf.float32, (), name='rec-loss')
    self.kll_ph = tf.placeholder(tf.float32, (), name='kl-loss')
    self.klls_ph = [
        tf.placeholder(tf.float32, (), name=f'z{i}-kl')
        for i in range(self.latent_dim)
    ]

    with tf.variable_scope('loss'):
        scalar_summary('reconstruction-loss', self.rel_ph)
        scalar_summary('total-loss', self.vae_loss)
        scalar_summary('kl-loss', self.kll_ph)

    for i in range(self.latent_dim):
        scalar_summary(f'z{i}-kl', self.klls_ph[i], scope='z-kl')

    self.merge_op = tf.summary.merge_all()

    import os
    home = os.environ['HOME']
    self.writer = tf.summary.FileWriter(f'{home}/vae/{self.savename}',
                                        graph=tf.get_default_graph())
def train(self, dataset, batch_size=155, num_episodes=50, print_freq=5):
    import numpy as np
    import time
    import datetime

    from tabulate import tabulate

    from forkan.common import CSVLogger
    from forkan.common.tf_utils import scalar_summary

    num_samples = len(dataset)

    assert np.max(dataset) <= 1, 'provide normalized dataset!'

    self.log.info('Training on {} samples for {} episodes.'.format(num_samples, num_episodes))

    tstart = time.time()
    nb = 1

    train_op = tf.train.AdamOptimizer().minimize(self.vae_loss)

    csv_header = ['date', '#episode', '#batch', 'rec-loss', 'kl-loss'] + \
                 ['z{}-kl'.format(i) for i in range(self.latent_dim)]
    csv = CSVLogger('{}/progress.csv'.format(self.savepath), *csv_header)

    # placeholders and summaries for the per-batch losses
    rel_ph = tf.placeholder(tf.float32, (), name='rec-loss')
    kll_ph = tf.placeholder(tf.float32, (), name='kl-loss')
    klls_ph = [
        tf.placeholder(tf.float32, (), name=f'z{i}-kl')
        for i in range(self.latent_dim)
    ]

    scalar_summary('reconstruction-loss', rel_ph, scope='vae-loss')
    scalar_summary('kl-loss', kll_ph, scope='vae-loss')

    for i in range(self.latent_dim):
        scalar_summary(f'z{i}-kl', klls_ph[i], scope='z-kl')

    merged_ = tf.summary.merge_all()
    writer = tf.summary.FileWriter(f'{self.savepath}/board', self.s.graph)

    self.s.run(tf.global_variables_initializer())

    # build a fixed stack of gradient images used to visualise reconstructions
    du = []
    for _ in range(5):
        a = np.linspace(0, 1, 64)
        ar = np.repeat(a, 64, 0).reshape([64, 64])
        du.append(ar)
    print(np.asarray(du).shape)
    du = np.reshape(du, [1, 5, 64, 64, 1])

    file_writer = tf.summary.FileWriter('/Users/llach/board_test')

    im_ph = tf.placeholder(tf.float32, shape=(1, 64, 128, 1))
    im_sum = tf.summary.image('img', im_ph)

    # rollout N episodes
    for ep in range(num_episodes):

        # shuffle dataset
        np.random.shuffle(dataset)

        for n, idx in enumerate(np.arange(0, num_samples, batch_size)):
            bps = max(int(nb / (time.time() - tstart)), 1)
            x = dataset[idx:min(idx + batch_size, num_samples), ...]

            _, loss, re_loss, kl_losses = self.s.run(
                [train_op, self.vae_loss, self.re_loss, self.kl_loss],
                feed_dict={self.X: x})

            # mean losses
            re_loss = np.mean(re_loss)
            kl_loss = self.beta * np.sum(kl_losses)

            fd = {
                rel_ph: re_loss,
                kll_ph: kl_loss,
            }
            for i, kph in enumerate(klls_ph):
                fd.update({kph: kl_losses[i]})

            suma = self.s.run(merged_, feed_dict=fd)
            writer.add_summary(suma, nb)

            # increase batch counter
            nb += 1

            csv.writeline(datetime.datetime.now().isoformat(), ep, nb, re_loss, kl_loss, *kl_losses)

            if n % print_freq == 0 and print_freq != -1:
                total_batches = (num_samples // batch_size) * num_episodes

                perc = (nb / total_batches) * 100
                steps2go = total_batches - nb
                secs2go = steps2go / bps
                min2go = secs2go / 60

                hrs = int(min2go // 60)
                mins = int(min2go) % 60

                tab = tabulate([
                    ['name', f'retrainvae-clean-b{self.beta}'],
                    ['episode', ep],
                    ['batch', n],
                    ['bps', bps],
                    ['rec-loss', re_loss],
                    ['kl-loss', kl_loss],
                    ['ETA', '{}h {}min'.format(hrs, mins)],
                    ['done', '{}%'.format(int(perc))],
                ])
                print('\n{}'.format(tab))

                # reconstruct the fixed gradient stack and log it as an image summary
                reca = self.reconstruct_stacked(du)
                print(reca[0].shape, ar.shape)
                fin = np.concatenate((reca[0], np.expand_dims(ar, axis=-1)), axis=1)
                isu = self.s.run(im_sum, feed_dict={im_ph: np.expand_dims(fin, axis=0)})
                file_writer.add_summary(isu, nb)
                file_writer.flush()

        self.save()

    file_writer.close()
    self.save()
    print('training done!')
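
# Hedged sketch (not part of the original code base): the ETA / progress arithmetic used
# in the logging branch of `train`, reproduced with plain Python. `_demo_eta` and all
# numbers below are made up for illustration.
def _demo_eta():
    num_samples, batch_size, num_episodes = 10000, 155, 50
    nb, bps = 500, 4                                  # batches done so far, batches per second

    total_batches = (num_samples // batch_size) * num_episodes   # 64 * 50 = 3200
    perc = (nb / total_batches) * 100                 # ~15.6 % done
    steps2go = total_batches - nb                     # 2700 batches remaining
    min2go = (steps2go / bps) / 60                    # ~11.25 minutes
    hrs, mins = int(min2go // 60), int(min2go) % 60   # -> 0 h 11 min
    return perc, hrs, mins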