def setup_model(self): """ Create all the functions and tensorflow graphs necessary to train the model """ assert issubclass(self.policy, MetaLstmActorCriticPolicy), "Error: the input policy for the A2C model must be an " \ "instance of MetaLstmActorCriticPolicy." self.graph = tf.Graph() with self.graph.as_default(): self.sess = tf_utils.make_session(graph=self.graph) # azért nincs step model mert ugyanaz a lépés (n_batch) így felesleges. policy_model = self.policy(sess=self.sess, input_length=self.input_length, output_length=self.output_length, n_steps=self.n_steps, window_size=self.window_size, layers=self.layers, lstm_units=self.lstm_units) with tf.variable_scope("loss", reuse=False): self.actions_ph = policy_model.pdtype.sample_placeholder([self.n_steps], name="action_ph") self.advs_ph = tf.placeholder(tf.float32, [self.n_steps], name="advs_ph") self.rewards_ph = tf.placeholder(tf.float32, [self.n_steps], name="rewards_ph") self.learning_rate_ph = tf.placeholder(tf.float32, [], name="learning_rate_ph") neglogpac = policy_model.proba_distribution.neglogp(self.actions_ph) self.entropy = tf.reduce_mean(policy_model.proba_distribution.entropy()) self.pg_loss = tf.reduce_mean(self.advs_ph * neglogpac) self.vf_loss = mse(tf.squeeze(policy_model.value_fn), self.rewards_ph) loss = self.pg_loss - self.entropy * self.ent_coef + self.vf_loss * self.vf_coef self.trainable_variables = tf_utils.find_trainable_variables("model") grads = tf.gradients(loss, self.trainable_variables) if self.max_grad_norm is not None: grads, _ = tf.clip_by_global_norm(grads, self.max_grad_norm) grads = list(zip(grads, self.trainable_variables)) trainer = tf.train.RMSPropOptimizer(learning_rate=self.learning_rate_ph, decay=self.alpha, epsilon=self.epsilon) self.apply_backprop = trainer.apply_gradients(grads) self.step = policy_model.step self.policy_model = policy_model self.value = self.policy_model.value tf.global_variables_initializer().run(session=self.sess)
def setup_model(self):
    with SetVerbosity(self.verbose):

        assert issubclass(self.policy, ActorCriticPolicy), "Error: the input policy for the A2C model must be an " \
                                                           "instance of common.policies.ActorCriticPolicy."

        self.graph = tf.Graph()
        with self.graph.as_default():
            self.sess = tf_util.make_session(graph=self.graph)

            self.n_batch = self.n_envs * self.n_steps

            n_batch_step = None
            n_batch_train = None
            if issubclass(self.policy, LstmPolicy):
                n_batch_step = self.n_envs
                n_batch_train = self.n_envs * self.n_steps

            step_model = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1,
                                     n_batch_step, reuse=False)

            with tf.variable_scope("train_model", reuse=True,
                                   custom_getter=tf_util.outer_scope_getter("train_model")):
                train_model = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs,
                                          self.n_steps, n_batch_train, reuse=True)

            with tf.variable_scope("loss", reuse=False):
                self.actions_ph = train_model.pdtype.sample_placeholder([None], name="action_ph")
                self.advs_ph = tf.placeholder(tf.float32, [None], name="advs_ph")
                self.rewards_ph = tf.placeholder(tf.float32, [None], name="rewards_ph")
                self.learning_rate_ph = tf.placeholder(tf.float32, [], name="learning_rate_ph")

                neglogpac = train_model.proba_distribution.neglogp(self.actions_ph)
                self.entropy = tf.reduce_mean(train_model.proba_distribution.entropy())
                self.pg_loss = tf.reduce_mean(self.advs_ph * neglogpac)
                self.vf_loss = mse(tf.squeeze(train_model.value_fn), self.rewards_ph)
                loss = self.pg_loss - self.entropy * self.ent_coef + self.vf_loss * self.vf_coef

                tf.summary.scalar('entropy_loss', self.entropy)
                tf.summary.scalar('policy_gradient_loss', self.pg_loss)
                tf.summary.scalar('value_function_loss', self.vf_loss)
                tf.summary.scalar('loss', loss)

                self.params = find_trainable_variables("model")
                grads = tf.gradients(loss, self.params)
                if self.max_grad_norm is not None:
                    grads, _ = tf.clip_by_global_norm(grads, self.max_grad_norm)
                grads = list(zip(grads, self.params))

            with tf.variable_scope("input_info", reuse=False):
                tf.summary.scalar('discounted_rewards', tf.reduce_mean(self.rewards_ph))
                tf.summary.histogram('discounted_rewards', self.rewards_ph)
                tf.summary.scalar('learning_rate', tf.reduce_mean(self.learning_rate))
                tf.summary.histogram('learning_rate', self.learning_rate)
                tf.summary.scalar('advantage', tf.reduce_mean(self.advs_ph))
                tf.summary.histogram('advantage', self.advs_ph)
                if len(self.observation_space.shape) == 3:
                    tf.summary.image('observation', train_model.obs_ph)
                else:
                    tf.summary.histogram('observation', train_model.obs_ph)

            trainer = tf.train.RMSPropOptimizer(learning_rate=self.learning_rate_ph, decay=self.alpha,
                                                epsilon=self.epsilon)
            self.apply_backprop = trainer.apply_gradients(grads)

            self.train_model = train_model
            self.step_model = step_model
            self.step = step_model.step
            # self.step_with_attention = step_model.step_with_attention
            self.proba_step = step_model.proba_step
            self.value = step_model.value
            self.initial_state = step_model.initial_state
            tf.global_variables_initializer().run(session=self.sess)

            self.summary = tf.summary.merge_all()
def setup_model(self):
    with SetVerbosity(self.verbose):

        assert issubclass(self.policy, ActorCriticPolicy), "Error: the input policy for the A2C model must be an " \
                                                           "instance of common.policies.ActorCriticPolicy."

        self.graph = tf.Graph()
        with self.graph.as_default():
            self.set_random_seed(self.seed)
            self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess, graph=self.graph)

            self.n_batch = self.n_envs * self.n_steps

            n_batch_step = None
            n_batch_train = None
            if issubclass(self.policy, RecurrentActorCriticPolicy):
                n_batch_step = self.n_envs
                n_batch_train = self.n_envs * self.n_steps

            step_model = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1,
                                     n_batch_step, reuse=False, **self.policy_kwargs)

            with tf.variable_scope("train_model", reuse=True,
                                   custom_getter=tf_util.outer_scope_getter("train_model")):
                train_model = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs,
                                          self.n_steps, n_batch_train, reuse=True, **self.policy_kwargs)

            with tf.variable_scope("loss", reuse=False):
                self.actions_ph = train_model.pdtype.sample_placeholder([None], name="action_ph")
                self.advs_ph = tf.placeholder(tf.float32, [None], name="advs_ph")
                self.rewards_ph = tf.placeholder(tf.float32, [None], name="rewards_ph")
                self.learning_rate_ph = tf.placeholder(tf.float32, [], name="learning_rate_ph")

                neglogpac = train_model.proba_distribution.neglogp(self.actions_ph)
                self.entropy = tf.reduce_mean(train_model.proba_distribution.entropy())
                self.pg_loss = tf.reduce_mean(self.advs_ph * neglogpac)
                self.vf_loss = mse(tf.squeeze(train_model.value_flat), self.rewards_ph)
                # https://arxiv.org/pdf/1708.04782.pdf#page=9, https://arxiv.org/pdf/1602.01783.pdf#page=4
                # and https://github.com/dennybritz/reinforcement-learning/issues/34
                # suggest to add an entropy component in order to improve exploration.
                loss = self.pg_loss - self.entropy * self.ent_coef + self.vf_loss * self.vf_coef

                tf.summary.scalar('entropy_loss', self.entropy)
                tf.summary.scalar('policy_gradient_loss', self.pg_loss)
                tf.summary.scalar('value_function_loss', self.vf_loss)
                tf.summary.scalar('loss', loss)

                self.params = tf_util.get_trainable_vars("model")
                grads = tf.gradients(loss, self.params)
                if self.max_grad_norm is not None:
                    grads, _ = tf.clip_by_global_norm(grads, self.max_grad_norm)
                grads = list(zip(grads, self.params))

            with tf.variable_scope("input_info", reuse=False):
                tf.summary.scalar('discounted_rewards', tf.reduce_mean(self.rewards_ph))
                tf.summary.scalar('learning_rate', tf.reduce_mean(self.learning_rate_ph))
                tf.summary.scalar('advantage', tf.reduce_mean(self.advs_ph))
                if self.full_tensorboard_log:
                    tf.summary.histogram('discounted_rewards', self.rewards_ph)
                    tf.summary.histogram('learning_rate', self.learning_rate_ph)
                    tf.summary.histogram('advantage', self.advs_ph)
                    if tf_util.is_image(self.observation_space):
                        tf.summary.image('observation', train_model.obs_ph)
                    else:
                        tf.summary.histogram('observation', train_model.obs_ph)

            trainer = tf.train.RMSPropOptimizer(learning_rate=self.learning_rate_ph, decay=self.alpha,
                                                epsilon=self.epsilon)
            self.apply_backprop = trainer.apply_gradients(grads)

            self.train_model = train_model
            self.step_model = step_model
            self.step = step_model.step
            self.proba_step = step_model.proba_step
            self.value = step_model.value
            self.initial_state = step_model.initial_state
            self.attention = step_model.attention
            tf.global_variables_initializer().run(session=self.sess)

            self.summary = tf.summary.merge_all()
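# --- Illustration only (not part of the models above) ---
# A minimal NumPy sketch of the scalar A2C objective built inside the "loss" scopes above:
#     loss = mean(advantages * neglogp) - ent_coef * entropy + vf_coef * mse(values, returns)
# The function name and the coefficient defaults below are placeholders, not values taken
# from the code above.
import numpy as np

def a2c_loss_sketch(advantages, neglogp, entropy, values, returns, ent_coef=0.01, vf_coef=0.25):
    pg_loss = np.mean(advantages * neglogp)        # policy-gradient term
    vf_loss = np.mean((values - returns) ** 2)     # value-function regression (mse)
    return pg_loss - ent_coef * np.mean(entropy) + vf_coef * vf_loss

# Example with dummy rollout statistics:
# a2c_loss_sketch(np.array([0.5, -0.2]), np.array([1.1, 0.7]),
#                 np.array([1.3, 1.2]), np.array([0.9, 0.4]), np.array([1.0, 0.3]))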
def setup_model(self):
    with SetVerbosity(self.verbose):

        assert issubclass(self.policy, ActorCriticPolicy), "Error: the input policy for the ACKTR model must be " \
                                                           "an instance of common.policies.ActorCriticPolicy."

        # Enable continuous actions tricks (normalized advantage)
        self.continuous_actions = isinstance(self.action_space, Box)

        self.graph = tf.Graph()
        with self.graph.as_default():
            self.sess = tf_util.make_session(num_cpu=self.nprocs, graph=self.graph)

            n_batch_step = None
            n_batch_train = None
            if issubclass(self.policy, RecurrentActorCriticPolicy):
                n_batch_step = self.n_envs
                n_batch_train = self.n_envs * self.n_steps

            step_model = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1,
                                     n_batch_step, reuse=False, **self.policy_kwargs)

            self.params = params = tf_util.get_trainable_vars("model")

            with tf.variable_scope("train_model", reuse=True,
                                   custom_getter=tf_util.outer_scope_getter("train_model")):
                train_model = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs,
                                          self.n_steps, n_batch_train, reuse=True, **self.policy_kwargs)

            with tf.variable_scope("loss", reuse=False, custom_getter=tf_util.outer_scope_getter("loss")):
                self.advs_ph = advs_ph = tf.placeholder(tf.float32, [None])
                self.rewards_ph = rewards_ph = tf.placeholder(tf.float32, [None])
                self.learning_rate_ph = learning_rate_ph = tf.placeholder(tf.float32, [])
                self.actions_ph = train_model.pdtype.sample_placeholder([None])

                neg_log_prob = train_model.proba_distribution.neglogp(self.actions_ph)

                # training loss
                pg_loss = tf.reduce_mean(advs_ph * neg_log_prob)
                self.entropy = entropy = tf.reduce_mean(train_model.proba_distribution.entropy())
                self.pg_loss = pg_loss = pg_loss - self.ent_coef * entropy
                self.vf_loss = vf_loss = mse(tf.squeeze(train_model.value_fn), rewards_ph)
                train_loss = pg_loss + self.vf_coef * vf_loss

                # Fisher loss construction
                self.pg_fisher = pg_fisher_loss = -tf.reduce_mean(neg_log_prob)
                sample_net = train_model.value_fn + tf.random_normal(tf.shape(train_model.value_fn))
                self.vf_fisher = vf_fisher_loss = -self.vf_fisher_coef * tf.reduce_mean(
                    tf.pow(train_model.value_fn - tf.stop_gradient(sample_net), 2))
                self.joint_fisher = pg_fisher_loss + vf_fisher_loss

                tf.summary.scalar('entropy_loss', self.entropy)
                tf.summary.scalar('policy_gradient_loss', pg_loss)
                tf.summary.scalar('policy_gradient_fisher_loss', pg_fisher_loss)
                tf.summary.scalar('value_function_loss', self.vf_loss)
                tf.summary.scalar('value_function_fisher_loss', vf_fisher_loss)
                tf.summary.scalar('loss', train_loss)

                self.grads_check = tf.gradients(train_loss, params)

            with tf.variable_scope("input_info", reuse=False):
                tf.summary.scalar('discounted_rewards', tf.reduce_mean(self.rewards_ph))
                tf.summary.scalar('learning_rate', tf.reduce_mean(self.learning_rate_ph))
                tf.summary.scalar('advantage', tf.reduce_mean(self.advs_ph))
                if self.full_tensorboard_log:
                    tf.summary.histogram('discounted_rewards', self.rewards_ph)
                    tf.summary.histogram('learning_rate', self.learning_rate_ph)
                    tf.summary.histogram('advantage', self.advs_ph)
                    if tf_util.is_image(self.observation_space):
                        tf.summary.image('observation', train_model.obs_ph)
                    else:
                        tf.summary.histogram('observation', train_model.obs_ph)

            with tf.variable_scope("kfac", reuse=False, custom_getter=tf_util.outer_scope_getter("kfac")):
                with tf.device('/gpu:0'):
                    self.optim = optim = kfac.KfacOptimizer(learning_rate=learning_rate_ph,
                                                            clip_kl=self.kfac_clip, momentum=0.9,
                                                            kfac_update=self.kfac_update, epsilon=0.01,
                                                            stats_decay=0.99,
                                                            async_eigen_decomp=self.async_eigen_decomp,
                                                            cold_iter=10,
                                                            max_grad_norm=self.max_grad_norm,
                                                            verbose=self.verbose)

                    optim.compute_and_apply_stats(self.joint_fisher, var_list=params)

            self.train_model = train_model
            self.step_model = step_model
            self.step = step_model.step
            self.proba_step = step_model.proba_step
            self.value = step_model.value
            self.initial_state = step_model.initial_state
            tf.global_variables_initializer().run(session=self.sess)

            self.summary = tf.summary.merge_all()
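# --- Illustration only (not part of the models above) ---
# A NumPy sketch of the "Fisher loss construction" used by the ACKTR variants above: the policy
# term is the mean log-likelihood of the sampled actions, and the value term measures the
# distance of the value head from a noise-perturbed copy of itself (tf.stop_gradient detaches
# that copy in the graph; plain NumPy has no gradients, so here it is just a constant).
# The function name and default coefficient are placeholders, not taken from the code above.
import numpy as np

def fisher_loss_sketch(neglogp, values, vf_fisher_coef=1.0, rng=np.random):
    pg_fisher = -np.mean(neglogp)
    sample_net = values + rng.standard_normal(values.shape)   # value + unit Gaussian noise
    vf_fisher = -vf_fisher_coef * np.mean((values - sample_net) ** 2)
    return pg_fisher + vf_fisher                              # joint Fisher objective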
def setup_model(self):
    with SetVerbosity(self.verbose):

        self.graph = tf.Graph()
        with self.graph.as_default():
            self.sess = tf_util.make_session(num_cpu=self.nprocs, graph=self.graph)

            self.advs_ph = advs_ph = tf.placeholder(tf.float32, [None])
            self.rewards_ph = rewards_ph = tf.placeholder(tf.float32, [None])
            self.pg_lr_ph = pg_lr_ph = tf.placeholder(tf.float32, [])

            n_batch_step = None
            n_batch_train = None
            if issubclass(self.policy, LstmPolicy):
                n_batch_step = self.n_envs
                n_batch_train = self.n_envs * self.n_steps

            self.model = step_model = self.policy(self.sess, self.observation_space, self.action_space,
                                                  self.n_envs, 1, n_batch_step, reuse=False)
            self.model2 = train_model = self.policy(self.sess, self.observation_space, self.action_space,
                                                    self.n_envs, self.n_steps, n_batch_train, reuse=True)

            self.action_ph = action_ph = train_model.pdtype.sample_placeholder([None])

            logpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.policy, labels=action_ph)
            self.logits = train_model.policy

            # training loss
            pg_loss = tf.reduce_mean(advs_ph * logpac)
            self.entropy = entropy = tf.reduce_mean(calc_entropy(train_model.policy))
            self.pg_loss = pg_loss = pg_loss - self.ent_coef * entropy
            self.vf_loss = vf_loss = mse(tf.squeeze(train_model.value_fn), rewards_ph)
            train_loss = pg_loss + self.vf_coef * vf_loss

            # Fisher loss construction
            self.pg_fisher = pg_fisher_loss = -tf.reduce_mean(logpac)
            sample_net = train_model.value_fn + tf.random_normal(tf.shape(train_model.value_fn))
            self.vf_fisher = vf_fisher_loss = -self.vf_fisher_coef * tf.reduce_mean(
                tf.pow(train_model.value_fn - tf.stop_gradient(sample_net), 2))
            self.joint_fisher = pg_fisher_loss + vf_fisher_loss

            self.params = params = find_trainable_variables("model")

            self.grads_check = tf.gradients(train_loss, params)

            with tf.device('/gpu:0'):
                self.optim = optim = kfac.KfacOptimizer(learning_rate=pg_lr_ph, clip_kl=self.kfac_clip,
                                                        momentum=0.9, kfac_update=1, epsilon=0.01,
                                                        stats_decay=0.99, async=1, cold_iter=10,
                                                        max_grad_norm=self.max_grad_norm, verbose=self.verbose)

                optim.compute_and_apply_stats(self.joint_fisher, var_list=params)

            self.train_model = train_model
            self.step_model = step_model
            self.step = step_model.step
            self.proba_step = step_model.proba_step
            self.value = step_model.value
            self.initial_state = step_model.initial_state
            tf.global_variables_initializer().run(session=self.sess)
def setup_train_model(self, transfer=False):
    with SetVerbosity(self.verbose):

        assert issubclass(self.policy, MultiTaskActorCriticPolicy), "Error: the input policy for the A2C model must be an " \
                                                                    "instance of MultiTaskActorCriticPolicy."

        self.graph = tf.Graph()
        with self.graph.as_default():
            self.sess = tf_utils.make_session(graph=self.graph)

            self.n_batch = self.n_envs_per_task * self.n_steps

            step_model = self.policy(self.sess, self.tasks, self.observation_space_dict, self.action_space_dict,
                                     self.n_envs_per_task, n_steps=1, reuse=False)

            with tf.variable_scope("train_model", reuse=True,
                                   custom_getter=tf_utils.outer_scope_getter("train_model")):
                train_model = self.policy(self.sess, self.tasks, self.observation_space_dict,
                                          self.action_space_dict, self.n_envs_per_task, self.n_steps, reuse=True)

            with tf.variable_scope("loss", reuse=False):
                self.actions_ph = tf.placeholder(dtype=tf.int32, shape=[None], name="actions_ph")
                self.advs_ph = tf.placeholder(tf.float32, [None], name="advs_ph")  # advantages
                self.rewards_ph = tf.placeholder(tf.float32, [None], name="rewards_ph")
                self.learning_rate_ph = tf.placeholder(tf.float32, [], name="learning_rate_ph")

                neglogpac = {}
                losses = {}
                for task in self.tasks:
                    neglogpac[task] = train_model.proba_distribution_dict[task].neglogp(self.actions_ph)
                    self.entropy[task] = tf.reduce_mean(train_model.proba_distribution_dict[task].entropy())
                    self.pg_loss[task] = tf.reduce_mean(self.advs_ph * neglogpac[task])  # policy gradient loss
                    self.vf_loss[task] = mse(tf.squeeze(train_model.value_fn_dict[task]), self.rewards_ph)
                    losses[task] = self.pg_loss[task] - self.entropy[task] * self.ent_coef + \
                        self.vf_loss[task] * self.vf_coef

                    tf.summary.scalar(task + '_policy_gradient_loss', self.pg_loss[task])
                    tf.summary.scalar(task + '_value_function_loss', self.vf_loss[task])

            with tf.variable_scope("input_info", reuse=False):
                tf.summary.scalar('learning_rate', tf.reduce_mean(self.learning_rate_ph))

            optimizers = {}
            grads_and_vars = {}
            self.apply_backprop = {}
            for task in self.tasks:
                optimizers[task] = tf.train.RMSPropOptimizer(learning_rate=self.learning_rate_ph,
                                                             decay=self.alpha, epsilon=self.epsilon)
                grads_and_vars[task] = optimizers[task].compute_gradients(losses[task])
                if self.max_grad_norm is not None:
                    grads = [grad for grad, var in grads_and_vars[task]]
                    vars = [var for grad, var in grads_and_vars[task]]
                    clipped_grads, _ = tf.clip_by_global_norm(grads, self.max_grad_norm)
                    grads_and_vars[task] = list(zip(clipped_grads, vars))
                self.apply_backprop[task] = optimizers[task].apply_gradients(grads_and_vars[task])

            self.train_model = train_model
            self.step_model = step_model
            self.step = step_model.step
            self.value = step_model.value
            self.trainable_variables = tf_utils.find_trainable_variables("model")
            tf.global_variables_initializer().run(session=self.sess)

            self.summary = tf.summary.merge_all()

            if not transfer:
                self.sess.graph.finalize()
def setup_model(self):
    with SetVerbosity(self.verbose):

        assert issubclass(self.policy, ActorCriticPolicy), "Error: the input policy for the ACKTR model must be " \
                                                           "an instance of common.policies.ActorCriticPolicy."

        if isinstance(self.action_space, Box):
            raise NotImplementedError("WIP: ACKTR does not support Continuous actions yet.")

        self.graph = tf.Graph()
        with self.graph.as_default():
            self.sess = tf_util.make_session(num_cpu=self.nprocs, graph=self.graph)

            n_batch_step = None
            n_batch_train = None
            if issubclass(self.policy, LstmPolicy):
                n_batch_step = self.n_envs
                n_batch_train = self.n_envs * self.n_steps

            self.model = step_model = self.policy(self.sess, self.observation_space, self.action_space,
                                                  self.n_envs, 1, n_batch_step, reuse=False)

            self.params = params = find_trainable_variables("model")

            with tf.variable_scope("train_model", reuse=True,
                                   custom_getter=tf_util.outer_scope_getter("train_model")):
                self.model2 = train_model = self.policy(self.sess, self.observation_space, self.action_space,
                                                        self.n_envs, self.n_steps, n_batch_train, reuse=True)

            with tf.variable_scope("loss", reuse=False, custom_getter=tf_util.outer_scope_getter("loss")):
                self.advs_ph = advs_ph = tf.placeholder(tf.float32, [None])
                self.rewards_ph = rewards_ph = tf.placeholder(tf.float32, [None])
                self.pg_lr_ph = pg_lr_ph = tf.placeholder(tf.float32, [])
                self.action_ph = action_ph = train_model.pdtype.sample_placeholder([None])

                logpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.policy,
                                                                        labels=action_ph)
                self.logits = train_model.policy

                # training loss
                pg_loss = tf.reduce_mean(advs_ph * logpac)
                self.entropy = entropy = tf.reduce_mean(calc_entropy(train_model.policy))
                self.pg_loss = pg_loss = pg_loss - self.ent_coef * entropy
                self.vf_loss = vf_loss = mse(tf.squeeze(train_model.value_fn), rewards_ph)
                train_loss = pg_loss + self.vf_coef * vf_loss

                # Fisher loss construction
                self.pg_fisher = pg_fisher_loss = -tf.reduce_mean(logpac)
                sample_net = train_model.value_fn + tf.random_normal(tf.shape(train_model.value_fn))
                self.vf_fisher = vf_fisher_loss = - self.vf_fisher_coef * tf.reduce_mean(
                    tf.pow(train_model.value_fn - tf.stop_gradient(sample_net), 2))
                self.joint_fisher = pg_fisher_loss + vf_fisher_loss

                tf.summary.scalar('entropy_loss', self.entropy)
                tf.summary.scalar('policy_gradient_loss', pg_loss)
                tf.summary.scalar('policy_gradient_fisher_loss', pg_fisher_loss)
                tf.summary.scalar('value_function_loss', self.vf_loss)
                tf.summary.scalar('value_function_fisher_loss', vf_fisher_loss)
                tf.summary.scalar('loss', train_loss)

                self.grads_check = tf.gradients(train_loss, params)

            with tf.variable_scope("input_info", reuse=False):
                tf.summary.scalar('discounted_rewards', tf.reduce_mean(self.rewards_ph))
                tf.summary.histogram('discounted_rewards', self.rewards_ph)
                tf.summary.scalar('learning_rate', tf.reduce_mean(self.pg_lr_ph))
                tf.summary.histogram('learning_rate', self.pg_lr_ph)
                tf.summary.scalar('advantage', tf.reduce_mean(self.advs_ph))
                tf.summary.histogram('advantage', self.advs_ph)
                if len(self.observation_space.shape) == 3:
                    tf.summary.image('observation', train_model.obs_ph)
                else:
                    tf.summary.histogram('observation', train_model.obs_ph)

            with tf.variable_scope("kfac", reuse=False, custom_getter=tf_util.outer_scope_getter("kfac")):
                with tf.device('/gpu:0'):
                    self.optim = optim = kfac.KfacOptimizer(learning_rate=pg_lr_ph, clip_kl=self.kfac_clip,
                                                            momentum=0.9, kfac_update=1, epsilon=0.01,
                                                            stats_decay=0.99, async=1, cold_iter=10,
                                                            max_grad_norm=self.max_grad_norm,
                                                            verbose=self.verbose)

                    optim.compute_and_apply_stats(self.joint_fisher, var_list=params)

            self.train_model = train_model
            self.step_model = step_model
            self.step = step_model.step
            self.proba_step = step_model.proba_step
            self.value = step_model.value
            self.initial_state = step_model.initial_state
            tf.global_variables_initializer().run(session=self.sess)

            self.summary = tf.summary.merge_all()
def setup_model(self):
    with SetVerbosity(self.verbose):

        assert issubclass(self.policy, ActorCriticPolicy), "Error: the input policy for the A2C model must be an " \
                                                           "instance of common.policies.ActorCriticPolicy."
        assert issubclass(self.policy, FeedForwardPolicy), "Error: the input policy for the A2C model must be an " \
                                                           "instance of common.policies.FeedFowardPolicy."

        self.graph = tf.Graph()
        with self.graph.as_default():
            self.sess = tf_util.make_session(graph=self.graph)

            self.n_batch = self.n_envs * self.n_steps

            n_batch_step = None
            n_batch_train = None
            n_batch_sil = None
            if issubclass(self.policy, LstmPolicy):
                n_batch_step = self.n_envs
                n_batch_train = self.n_envs * self.n_steps
            # TODO: Add
            n_batch_sil = 512

            step_model = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1,
                                     n_batch_step, reuse=False)

            # TODO: Add
            with tf.variable_scope("train_model", reuse=True,
                                   custom_getter=tf_util.outer_scope_getter("train_model")):
                train_model = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs,
                                          self.n_steps, n_batch_train, reuse=True)
            with tf.variable_scope("sil_model", reuse=True,
                                   custom_getter=tf_util.outer_scope_getter("sil_model")):
                sil_model = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs,
                                        self.n_steps, n_batch_sil, reuse=True)

            with tf.variable_scope("loss", reuse=False):
                # self.actions_ph = train_model.pdtype.sample_placeholder([None], name="action_ph")
                self.actions_ph = train_model.action_ph
                self.advs_ph = tf.placeholder(tf.float32, [None], name="advs_ph")
                self.rewards_ph = tf.placeholder(tf.float32, [None], name="rewards_ph")
                self.successor_feature_ph = tf.placeholder(tf.float32, [None, FEATURE_SIZE],
                                                           name="successor_feature_ph")
                self.learning_rate_ph = tf.placeholder(tf.float32, [], name="learning_rate_ph")

                neglogpac = train_model.proba_distribution.neglogp(self.actions_ph)

                last_frame = tf.reshape(train_model.obs_ph[..., 3], shape=[-1, 84 * 84])
                recons_losses = tf.squared_difference(x=last_frame, y=train_model.recons_mod)
                self.recons_loss = tf.losses.mean_squared_error(labels=last_frame,
                                                                predictions=train_model.recons_mod)

                self.entropy = tf.reduce_mean(train_model.proba_distribution.entropy())
                self.pg_loss = tf.reduce_mean(self.advs_ph * neglogpac)
                if self.use_recons:
                    self.vf_loss = mse(tf.squeeze(train_model.value_fn),
                                       self.rewards_ph + self.recons_intri * tf.stop_gradient(self.recons_loss))
                else:
                    self.vf_loss = mse(tf.squeeze(train_model.value_fn), self.rewards_ph)
                # TODO: loss of SF
                self.sf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.successor_feature),
                                                  self.successor_feature_ph))

                loss = self.pg_loss - \
                    self.entropy * self.ent_coef + \
                    self.vf_loss * self.vf_coef
                if self.use_recons:
                    loss += self.recons_loss * self.recons_coef
                elif self.use_sf:
                    loss += self.sf_loss * self.sf_coef + \
                        self.recons_loss * self.recons_coef

                tf.summary.scalar('recons_loss/max', tf.reduce_max(recons_losses))
                tf.summary.scalar('recons_loss/min', tf.reduce_min(recons_losses))
                tf.summary.scalar('recons_loss', self.recons_loss)
                tf.summary.scalar('entropy_loss', self.entropy)
                tf.summary.scalar('policy_gradient_loss', self.pg_loss)
                tf.summary.scalar('value_function_loss', self.vf_loss)
                tf.summary.scalar('successor_feature_loss', self.sf_loss)
                tf.summary.scalar('loss', loss)

                self.params = find_trainable_variables("model")
                grads = tf.gradients(loss, self.params)
                if self.max_grad_norm is not None:
                    grads, _ = tf.clip_by_global_norm(grads, self.max_grad_norm)
                grads = list(zip(grads, self.params))

            _last_frame = tf.reshape(last_frame, [-1, 84, 84, 1])
            _recons_mod = tf.reshape(train_model.recons_mod, [-1, 84, 84, 1])

            with tf.variable_scope("input_info", reuse=False):
                tf.summary.scalar('discounted_rewards', tf.reduce_mean(self.rewards_ph))
                tf.summary.histogram('discounted_rewards', self.rewards_ph)
                tf.summary.scalar('learning_rate', tf.reduce_mean(self.learning_rate))
                tf.summary.histogram('learning_rate', self.learning_rate)
                tf.summary.scalar('advantage', tf.reduce_mean(self.advs_ph))
                tf.summary.histogram('advantage', self.advs_ph)
                tf.summary.image('last_frame', _last_frame)
                tf.summary.image('reconstruction', _recons_mod)
                if len(self.observation_space.shape) == 3:
                    tf.summary.image('observation', train_model.obs_ph)
                else:
                    tf.summary.histogram('observation', train_model.obs_ph)

            trainer = tf.train.RMSPropOptimizer(learning_rate=self.learning_rate_ph, decay=self.alpha,
                                                epsilon=self.epsilon)
            self.apply_backprop = trainer.apply_gradients(grads)

            # TODO: Add
            self.sil = SelfImitation(model_ob=sil_model.obs_ph,
                                     model_vf=sil_model.value_fn,
                                     model_entropy=sil_model.proba_distribution.entropy(),
                                     fn_value=sil_model.value,
                                     fn_neg_log_prob=sil_model.proba_distribution.neglogp,
                                     ac_space=self.action_space,
                                     fn_reward=np.sign,
                                     n_env=self.n_envs,
                                     n_update=self.sil_update,
                                     beta=self.sil_beta)
            self.sil.build_train_op(params=self.params,
                                    optim=trainer,
                                    lr=self.learning_rate_ph,
                                    max_grad_norm=self.max_grad_norm)

            self.train_model = train_model
            self.step_model = step_model
            # self.step = step_model.step
            self.step = step_model.step_with_sf
            self.estimate_recons = step_model.estimate_recons
            self.proba_step = step_model.proba_step
            self.value = step_model.value
            # TODO: Add
            self.successor_feature = step_model.estimate_sf
            self.initial_state = step_model.initial_state
            tf.global_variables_initializer().run(session=self.sess)

            self.summary = tf.summary.merge_all()
def setup_model(self):
    with SetVerbosity(self.verbose):

        self.graph = tf.Graph()
        with self.graph.as_default():
            self.sess = tf_util.make_session(graph=self.graph)

            self.n_batch = self.n_envs * self.n_steps

            n_batch_step = None
            n_batch_train = None
            if issubclass(self.policy, LstmPolicy):
                n_batch_step = self.n_envs
                n_batch_train = self.n_envs * self.n_steps

            step_model = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1,
                                     n_batch_step, reuse=False)
            train_model = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs,
                                      self.n_steps, n_batch_train, reuse=True)

            self.actions_ph = train_model.pdtype.sample_placeholder([None])
            self.advs_ph = tf.placeholder(tf.float32, [None])
            self.rewards_ph = tf.placeholder(tf.float32, [None])
            self.learning_rate_ph = tf.placeholder(tf.float32, [])

            neglogpac = train_model.proba_distribution.neglogp(self.actions_ph)
            self.entropy = tf.reduce_mean(train_model.proba_distribution.entropy())
            self.pg_loss = tf.reduce_mean(self.advs_ph * neglogpac)
            self.vf_loss = mse(tf.squeeze(train_model.value_fn), self.rewards_ph)
            loss = self.pg_loss - self.entropy * self.ent_coef + self.vf_loss * self.vf_coef

            self.params = find_trainable_variables("model")
            grads = tf.gradients(loss, self.params)
            if self.max_grad_norm is not None:
                grads, _ = tf.clip_by_global_norm(grads, self.max_grad_norm)
            grads = list(zip(grads, self.params))

            trainer = tf.train.RMSPropOptimizer(learning_rate=self.learning_rate_ph, decay=self.alpha,
                                                epsilon=self.epsilon)
            self.apply_backprop = trainer.apply_gradients(grads)

            self.train_model = train_model
            self.step_model = step_model
            self.step = step_model.step
            self.proba_step = step_model.proba_step
            self.value = step_model.value
            self.initial_state = step_model.initial_state
            tf.global_variables_initializer().run(session=self.sess)
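# --- Illustration only (hypothetical usage, not taken from the code above) ---
# A sketch of how a training step would typically feed the placeholders created in
# setup_model() under TF1. The exact feed keys (obs_ph, states_ph, dones_ph) depend on the
# policy class in use, so treat the attribute names below as assumptions.
def train_step_sketch(model, obs, states, masks, actions, advantages, returns, learning_rate):
    feed_dict = {model.train_model.obs_ph: obs,
                 model.actions_ph: actions,
                 model.advs_ph: advantages,
                 model.rewards_ph: returns,
                 model.learning_rate_ph: learning_rate}
    if states is not None:  # recurrent policies also need their state/mask inputs
        feed_dict[model.train_model.states_ph] = states
        feed_dict[model.train_model.dones_ph] = masks
    pg_loss, vf_loss, entropy, _ = model.sess.run(
        [model.pg_loss, model.vf_loss, model.entropy, model.apply_backprop], feed_dict)
    return pg_loss, vf_loss, entropy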