def define_simulation_graph(batch_env, algo_cls, config): """Define the algorithm and environment interaction. Args: batch_env: In-graph environments object. algo_cls: Constructor of a batch algorithm. config: Configuration object for the algorithm. Returns: Object providing graph elements via attributes. """ # pylint: disable=unused-variable step = tf.Variable(0, False, dtype=tf.int32, name='global_step') is_training = tf.placeholder(tf.bool, name='is_training') should_log = tf.placeholder(tf.bool, name='should_log') do_report = tf.placeholder(tf.bool, name='do_report') force_reset = tf.placeholder(tf.bool, name='force_reset') algo = algo_cls(batch_env, step, is_training, should_log, config) # algo_cls is e.g. ppo.PPOAlgorithm, a vectorized implementation of the PPO algorithm by John Schulman. done, score, summary = tools.simulate(batch_env, algo, should_log, force_reset) # tools.simulate returns a tuple of tensors: done flags for the current episodes, possibly intermediate scores for the episodes, and a summary tensor. message = 'Graph contains {} trainable variables.' tf.logging.info(message.format(tools.count_weights())) # pylint: enable=unused-variable return tools.AttrDict(locals())
def define_simulation_graph(batch_env, algo_cls, config): """Define the algorithm and environment interaction. Args: batch_env: In-graph environments object. algo_cls: Constructor of a batch algorithm. config: Configuration object for the algorithm. Returns: Object providing graph elements via attributes. """ # pylint: disable=unused-variable step = tf.Variable(0, False, dtype=tf.int32, name='global_step') is_training = tf.placeholder(tf.bool, name='is_training') should_log = tf.placeholder(tf.bool, name='should_log') do_report = tf.placeholder(tf.bool, name='do_report') force_reset = tf.placeholder(tf.bool, name='force_reset') algo = algo_cls(batch_env, step, is_training, should_log, config) done, score, summary = tools.simulate(batch_env, algo, should_log, force_reset) message = 'Graph contains {} trainable variables.' tf.logging.info(message.format(tools.count_weights())) # pylint: enable=unused-variable return tools.AttrDict(locals())
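Since define_simulation_graph returns tools.AttrDict(locals()), every local name (step, is_training, should_log, do_report, force_reset, algo, done, score, summary) becomes an attribute of the returned object. A hypothetical usage sketch in the TF1 session style implied by the placeholders above; the loop structure, the choice of ppo.PPOAlgorithm, and batch_env/config being already constructed are assumptions, not code from this file:

import tensorflow as tf

# Hypothetical: batch_env, config, and ppo.PPOAlgorithm are assumed to exist in scope.
graph = define_simulation_graph(batch_env, ppo.PPOAlgorithm, config)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # One simulation step; the boolean placeholders defined above must be fed.
    done, score = sess.run([graph.done, graph.score], feed_dict={
        graph.is_training: True,
        graph.should_log: False,
        graph.do_report: False,
        graph.force_reset: False,
    })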
def main(_): tf_utils.set_up_logging() os.environ["CUDA_VISIBLE_DEVICES"] = FLAGS.GPU if not FLAGS.config: raise KeyError('You must specify a configuration.') if FLAGS.load_from: logdir = FLAGS.logdir = FLAGS.load_from else: if FLAGS.logdir and os.path.exists(FLAGS.logdir): run_number = [ int(f.split("-")[0]) for f in os.listdir(FLAGS.logdir) if os.path.isdir(os.path.join(FLAGS.logdir, f)) and FLAGS.config in f ] run_number = max(run_number) + 1 if len(run_number) > 0 else 0 else: run_number = 0 logdir = FLAGS.logdir and os.path.expanduser( os.path.join(FLAGS.logdir, '{}-{}'.format(run_number, FLAGS.config))) try: config = tf_utils.load_config(logdir) except IOError: config = tools.AttrDict(getattr(configs, FLAGS.config)()) config = tf_utils.save_config(config, logdir) run_wild(config, logdir)
def main(_): utility.set_up_logging() if not FLAGS.config: raise KeyError('You must specify a configuration.') if FLAGS.load_from: logdir = FLAGS.logdir = FLAGS.load_from else: if FLAGS.logdir and os.path.exists(FLAGS.logdir): run_number = [ int(f.split("-")[0]) for f in os.listdir(FLAGS.logdir) if os.path.isdir(os.path.join(FLAGS.logdir, f)) and FLAGS.config in f ] run_number = max(run_number) + 1 if len(run_number) > 0 else 0 else: run_number = 0 logdir = FLAGS.logdir and os.path.expanduser( os.path.join(FLAGS.logdir, '{}-{}'.format(run_number, FLAGS.config))) # recreate_directory_structure(logdir) try: config = utility.load_config(logdir) except IOError: config = tools.AttrDict(getattr(configs, FLAGS.config)()) config = utility.save_config(config, logdir) train(config, FLAGS.env_processes, logdir)
def main(_): """Configure logging.""" utility.set_up_logging() """Check the configuration and set up the log directory structure for it.""" if not FLAGS.config: raise KeyError('You must specify a configuration.') if FLAGS.load_from: logdir = FLAGS.logdir = FLAGS.load_from else: """If the config log directory already exists, increase the counter number and set up the log dir.""" if FLAGS.logdir and os.path.exists(FLAGS.logdir): run_number = [ int(f.split("-")[0]) for f in os.listdir(FLAGS.logdir) if os.path.isdir(os.path.join(FLAGS.logdir, f)) and FLAGS.config in f ] run_number = max(run_number) + 1 if len(run_number) > 0 else 0 else: run_number = 0 logdir = FLAGS.logdir and os.path.expanduser( os.path.join(FLAGS.logdir, '{}-{}'.format(run_number, FLAGS.config))) """If the config log directory already exists, try to load the config file from it. Otherwise create a new config file corresponding to the user-specified config from config.py.""" try: config = utility.load_config(logdir) except IOError: config = tools.AttrDict(getattr(configs, FLAGS.config)()) config = utility.save_config(config, logdir) """Run the specified task.""" run(config, logdir)
def _train_model(self, data): with tf.GradientTape() as model_tape: embed = self._encode(data) # print(embed,data['action']) post, prior = self._dynamics.observe(embed, data['action'], data['desc']) feat = self._dynamics.get_feat(post) image_pred = self._decode(feat) reward_pred = self._reward(feat) likes = tools.AttrDict() if self._c.cpc: # print("using cpc") pred = self._cpc_pred(embed) # print(pred,feat) cpc_loss = -1. * tf.math.reduce_mean( tools.compute_cpc_loss(pred, feat, self._c)) # caution! model_loss = cpc_loss else: model_loss = cpc_loss = 0 likes.image = tf.reduce_mean(image_pred.log_prob( data['image'])) likes.reward = tf.reduce_mean(reward_pred.log_prob(data['reward'])) if self._c.pcont: pcont_pred = self._pcont(feat) pcont_target = self._c.discount * data['discount'] likes.pcont = tf.reduce_mean(pcont_pred.log_prob(pcont_target)) likes.pcont *= self._c.pcont_scale prior_dist = self._dynamics.get_dist(prior) post_dist = self._dynamics.get_dist(post) div = tf.reduce_mean(tfd.kl_divergence(post_dist, prior_dist)) div = tf.maximum(div, self._c.free_nats) # model_loss = self._c.kl_scale * div - sum(likes.values()) model_loss += self._c.kl_scale * div - sum(likes.values()) model_loss /= float(self._strategy.num_replicas_in_sync) model_norm = self._model_opt(model_tape, model_loss)
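The call tools.compute_cpc_loss(pred, feat, self._c) above is repository-specific. For orientation only, here is a minimal sketch of the generic InfoNCE-style objective that CPC losses usually take, treating matching rows of pred and feat as positive pairs and all other rows as negatives; the helper name infonce_cpc_loss is hypothetical, and sign conventions may differ from the repo's implementation:

import tensorflow as tf

def infonce_cpc_loss(pred, feat):
    # pred, feat: [batch, dim]; row i of pred should score highest against row i of feat.
    logits = tf.matmul(pred, feat, transpose_b=True)   # [batch, batch] similarity scores
    labels = tf.range(tf.shape(logits)[0])             # positives lie on the diagonal
    return tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels, logits=logits))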
def define_config(): config = tools.AttrDict() # General. config.device = 0 config.logdir = pathlib.Path('.') config.seed = 0 config.steps = 5e6 config.eval_every = 1e4 config.log_every = 1e3 config.log_scalars = True config.log_images = True config.gpu_growth = True config.precision = 16 # Environment. config.task = 'dmc_walker_walk' config.envs = 1 config.parallel = 'none' config.action_repeat = 2 config.time_limit = 1000 config.prefill = 5000 config.eval_noise = 0.0 config.clip_rewards = 'none' # Model. config.deter_size = 200 config.stoch_size = 30 config.num_units = 400 config.dense_act = 'elu' config.cnn_act = 'relu' config.cnn_depth = 32 config.pcont = False config.free_nats = 3.0 config.kl_scale = 1.0 config.pcont_scale = 10.0 config.weight_decay = 0.0 config.weight_decay_pattern = r'.*' # Training. config.batch_size = 50 config.batch_length = 50 config.train_every = 1000 config.train_steps = 100 config.pretrain = 100 config.model_lr = 6e-4 config.value_lr = 8e-5 config.actor_lr = 8e-5 config.grad_clip = 100.0 config.dataset_balance = False # Behavior. config.discount = 0.99 config.disclam = 0.95 config.horizon = 15 config.action_dist = 'tanh_normal' config.action_init_std = 5.0 config.expl = 'additive_gaussian' config.expl_amount = 0.3 config.expl_decay = 0.0 config.expl_min = 0.0 config.ent_warm_up = 0 config.ent_alpha = 0.2 return config
def _train(self, data, test_data, log_images, step=1, should_print=False): with tf.GradientTape() as model_tape: embed = self._encode(data) post, prior = self._dynamics.observe(embed, data['action']) feat = self._dynamics.get_feat(post) image_pred = self._decode(feat) reward_pred = self._reward(feat) likes = tools.AttrDict() likes.image = tf.reduce_mean(image_pred.log_prob(data['image'])) likes.reward = tf.reduce_mean(reward_pred.log_prob(data['reward'])) if self._c.pcont: pcont_pred = self._pcont(feat) pcont_target = self._c.discount * data['discount'] likes.pcont = tf.reduce_mean(pcont_pred.log_prob(pcont_target)) likes.pcont *= self._c.pcont_scale prior_dist = self._dynamics.get_dist(prior) post_dist = self._dynamics.get_dist(post) div = tf.reduce_mean(tfd.kl_divergence(post_dist, prior_dist)) div = tf.maximum(div, self._c.free_nats) model_loss = self._c.kl_scale * div - sum(likes.values()) model_loss /= float(self._strategy.num_replicas_in_sync) model_norm = self._model_opt(model_tape, model_loss) with tf.GradientTape() as actor_tape: if step % 100000 == 0 and step > 0: if should_print: self.already_printed_dict[step] = True test_embed = self._encode(test_data) test_post, test_prior = self._dynamics.observe( test_embed, test_data['action']) imag_feat = self._imagine_ahead(test_post) imag_feat_sliced = imag_feat[:] decoded_images = self._decode(imag_feat_sliced) for j in range(100): for i in [5]: current_normal = decoded_images[j][i].distribution mean = current_normal.loc normalized_mean = tf.math.divide( tf.math.subtract(mean, tf.reduce_min(mean)), tf.math.subtract(tf.reduce_max(mean), tf.reduce_min(mean))) normalized_mean_int = tf.image.convert_image_dtype( normalized_mean, tf.uint8) image_file = tf.io.encode_jpeg(normalized_mean_int) file_name = "./img/steps{}traj{}img{}.jpg".format( step, i, j) tf.io.write_file(tf.constant(file_name), image_file)
def _train(self, data, log_images): with tf.GradientTape() as model_tape: embed = self._encode(data) post, prior = self._dynamics.observe(embed, data['action']) feat = self._dynamics.get_feat(post) image_pred = self._decode(feat) reward_pred = self._reward(feat) likes = tools.AttrDict() likes.image = tf.reduce_mean(image_pred.log_prob(data[self._c.obs_type])) likes.reward = tf.reduce_mean(reward_pred.log_prob(data['reward'])) if self._c.pcont: pcont_pred = self._pcont(feat) pcont_target = self._c.discount * data['discount'] likes.pcont = tf.reduce_mean(pcont_pred.log_prob(pcont_target)) likes.pcont *= self._c.pcont_scale prior_dist = self._dynamics.get_dist(prior) post_dist = self._dynamics.get_dist(post) div = tf.reduce_mean(tfd.kl_divergence(post_dist, prior_dist)) div = tf.maximum(div, self._c.free_nats) model_loss = self._c.kl_scale * div - sum(likes.values()) model_loss /= float(self._strategy.num_replicas_in_sync) with tf.GradientTape() as actor_tape: imag_feat = self._imagine_ahead(post) reward = tf.cast(self._reward(imag_feat).mode(), 'float') # cast: to address the output of bernoulli if self._c.pcont: pcont = self._pcont(imag_feat).mean() else: pcont = self._c.discount * tf.ones_like(reward) value = self._value(imag_feat).mode() returns = tools.lambda_return( reward[:-1], value[:-1], pcont[:-1], bootstrap=value[-1], lambda_=self._c.disclam, axis=0) discount = tf.stop_gradient(tf.math.cumprod(tf.concat( [tf.ones_like(pcont[:1]), pcont[:-2]], 0), 0)) actor_loss = -tf.reduce_mean(discount * returns) actor_loss /= float(self._strategy.num_replicas_in_sync) with tf.GradientTape() as value_tape: value_pred = self._value(imag_feat)[:-1] target = tf.stop_gradient(returns) value_loss = -tf.reduce_mean(discount * value_pred.log_prob(target)) value_loss /= float(self._strategy.num_replicas_in_sync) model_norm = self._model_opt(model_tape, model_loss) actor_norm = self._actor_opt(actor_tape, actor_loss) value_norm = self._value_opt(value_tape, value_loss) if tf.distribute.get_replica_context().replica_id_in_sync_group == 0: if self._c.log_scalars: self._scalar_summaries( data, feat, prior_dist, post_dist, likes, div, model_loss, value_loss, actor_loss, model_norm, value_norm, actor_norm) if tf.equal(log_images, True): self._image_summaries(data, embed, image_pred) self._reward_summaries(data, reward_pred)
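Several of these _train variants build the actor target with tools.lambda_return(reward[:-1], value[:-1], pcont[:-1], bootstrap=value[-1], lambda_=...). Below is a plain-NumPy sketch of the backward recurrence that call is assumed to compute, based on its signature here; the library implementation is vectorized and may differ in detail:

import numpy as np

def lambda_return_ref(reward, value, pcont, bootstrap, lambda_):
    # R_t = r_t + pcont_t * ((1 - lambda) * v(s_{t+1}) + lambda * R_{t+1}), with R_T = bootstrap.
    # lambda = 1 recovers the discounted Monte Carlo return; lambda = 0 the one-step TD target.
    next_value = np.append(value[1:], bootstrap)
    returns = np.zeros_like(reward, dtype=np.float64)
    last = bootstrap
    for t in reversed(range(len(reward))):
        last = reward[t] + pcont[t] * ((1.0 - lambda_) * next_value[t] + lambda_ * last)
        returns[t] = last
    return returns

# e.g. lambda_return_ref(np.ones(5), np.ones(5), 0.99 * np.ones(5), bootstrap=1.0, lambda_=0.95)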
def define_config(): config = tools.AttrDict() # General. config.logdir = pathlib.Path('./logdir/atari_Krull_dreamer/') config.seed = 0 config.steps = 2e7 config.eval_every = 1e5 config.log_every = 1e3 config.log_scalars = True config.log_images = True config.gpu_growth = True # True config.precision = 16 # Environment. config.task = 'atari_Krull' config.envs = 1 # server-40 2*8 config.parallel = 'thread' # none thread process config.action_repeat = 4 # atari 4, mujoco 4 config.time_limit = 27000 # atari 27000,mujoco 1000 config.prefill = 5000 config.eval_noise = 0.001 # atari 0.001,mujoco 0.0 config.clip_rewards = 'tanh' # atari tanh, mujoco none # Model. config.deter_size = 200 # model_size/hidden_size config.stoch_size = 30 # 30 config.num_units = 400 # 400 config.dense_act = 'elu' config.cnn_act = 'relu' # config.cnn_depth = 32 # 32 config.pcont = True # atari True, mujoco False (what is this?) # continuous? config.free_nats = 3.0 # config.kl_scale = 0.1 # atari 0.1 mujoco 1.0 config.pcont_scale = 0.99 # 10.0 config.weight_decay = 0.0 config.weight_decay_pattern = r'.*' # Training. config.batch_size = 50 config.batch_length = 50 config.train_every = 1000 config.train_steps = 100 config.pretrain = 100 config.model_lr = 6e-4 config.value_lr = 8e-5 config.actor_lr = 8e-5 config.grad_clip = 100.0 config.dataset_balance = False # Behavior. config.discount = 1.0 # atari 1.0, mujoco 0.99 config.disclam = 0.95 # nonzero means Monte Carlo discounted return config.horizon = 10 # atari 10, mujoco 15 config.action_dist = 'onehot' # atari onehot, mujoco tanh_normal config.action_init_std = 5.0 # 5.0 config.expl = 'epsilon_greedy' # exploration : atari epsilon_greedy, mujoco epsilon_greedy config.expl_amount = 0.4 # atari 0.4, mujoco 0.3 config.expl_decay = 2e6 # atari 0.0, mujoco 0.0 config.expl_min = 0.25 # atari 0.1 in paper/0.01, mujoco 0.0 return config
def _train(self, data, log_images): with tf.GradientTape() as model_tape: embed = self._encode(data) post, prior = self._dynamics.observe(embed, data['action']) feat = self._dynamics.get_feat(post) image_pred = self._decode(feat) reward_pred = self._reward(feat) likes = tools.AttrDict() likes.image = tf.reduce_mean(image_pred.log_prob(data['laser'])) likes.reward = tf.reduce_mean(reward_pred.log_prob(data['reward'])) if self._c.pcont: pcont_pred = self._pcont(feat) pcont_target = self._c.discount * data['discount'] likes.pcont = tf.reduce_mean(pcont_pred.log_prob(pcont_target)) likes.pcont *= self._c.pcont_scale prior_dist = self._dynamics.get_dist(prior) post_dist = self._dynamics.get_dist(post) div = tf.reduce_mean(tfd.kl_divergence(post_dist, prior_dist)) div = tf.maximum(div, self._c.free_nats) model_loss = self._c.kl_scale * div - sum(likes.values()) with tf.GradientTape() as actor_tape: imag_feat = self._imagine_ahead(post) reward = self._reward(imag_feat).mode() if self._c.pcont: pcont = self._pcont(imag_feat).mean() else: pcont = self._c.discount * tf.ones_like(reward) value = self._value(imag_feat).mode() returns = tools.lambda_return(reward[:-1], value[:-1], pcont[:-1], bootstrap=value[-1], lambda_=self._c.disclam, axis=0) discount = tf.stop_gradient( tf.math.cumprod( tf.concat([tf.ones_like(pcont[:1]), pcont[:-2]], 0), 0)) actor_loss = -tf.reduce_mean(discount * returns) with tf.GradientTape() as value_tape: value_pred = self._value(imag_feat)[:-1] target = tf.stop_gradient(returns) value_loss = -tf.reduce_mean( discount * value_pred.log_prob(target)) model_norm = self._model_opt(model_tape, model_loss) actor_norm = self._actor_opt(actor_tape, actor_loss) value_norm = self._value_opt(value_tape, value_loss) if self._c.log_scalars: self._scalar_summaries(data, feat, prior_dist, post_dist, likes, div, model_loss, value_loss, actor_loss, model_norm, value_norm, actor_norm)
def _model_train_step(self, data, prefix='train'): with tf.GradientTape() as model_tape: embed = self._encode(data) post, prior = self._dynamics.observe(embed, data['action']) feat = self._dynamics.get_feat(post) image_pred = self._decode(feat) reward_pred = self._reward(feat) likes = tools.AttrDict() likes.image = tf.reduce_mean( tf.boolean_mask(image_pred.log_prob(data['image']), data['mask'])) likes.reward = tf.reduce_mean( tf.boolean_mask(reward_pred.log_prob(data['reward']), data['mask'])) if self._c.pcont: pcont_pred = self._pcont(feat) pcont_target = data['terminal'] likes.pcont = tf.reduce_mean( tf.boolean_mask(pcont_pred.log_prob(pcont_target), data['mask'])) likes.pcont *= self._c.pcont_scale for key in prior.keys(): prior[key] = tf.boolean_mask(prior[key], data['mask']) post[key] = tf.boolean_mask(post[key], data['mask']) prior_dist = self._dynamics.get_dist(prior) post_dist = self._dynamics.get_dist(post) div = tf.reduce_mean(tfd.kl_divergence(post_dist, prior_dist)) model_loss = self._c.kl_scale * div - sum(likes.values()) if prefix == 'train': model_norm = self._model_opt(model_tape, model_loss) self._model_step += 1 if self._model_step % self._c.log_every == 0: self._image_summaries(data, embed, image_pred, self._model_step, prefix) model_summaries = dict() model_summaries['model_train/KL Divergence'] = tf.reduce_mean(div) model_summaries['model_train/image_recon'] = tf.reduce_mean( likes.image) model_summaries['model_train/reward_recon'] = tf.reduce_mean( likes.reward) model_summaries['model_train/model_loss'] = tf.reduce_mean( model_loss) if prefix == 'train': model_summaries['model_train/model_norm'] = tf.reduce_mean( model_norm) if self._c.pcont: model_summaries['model_train/terminal_recon'] = tf.reduce_mean( likes.pcont) self._write_summaries(model_summaries, self._model_step)
def main(_): """Create or load configuration and launch the trainer.""" utility.set_up_logging() if not FLAGS.config: raise KeyError('You must specify a configuration.') logdir = FLAGS.logdir and os.path.expanduser( os.path.join(FLAGS.logdir, '{}-{}'.format(FLAGS.timestamp, FLAGS.config))) try: config = utility.load_config(logdir) except IOError: config = tools.AttrDict(getattr(configs, FLAGS.config)()) config = utility.save_config(config, logdir) for score in train(config, FLAGS.env_processes): tf.logging.info('Score {}.'.format(score))
def _train(self, data, log_images): ''' define the RL algorithms: the world model does observation, the actor net does imagination, the critic net does imagination ''' with tf.GradientTape() as model_tape: ''' the world model, which is _dynamics(RSSM) ''' # data: {'action': shape=(25, 50, 4) float16, 'reward':shape=(25, 50) float16, #'discount': shape=(25, 50)float16, 'image': shape=(25, 50, 64, 64, 3) float16>} # 25: batch_size/num of GPU, 50:batch_length embed = self._encode(data) # (25, 50, 1024) post, prior = self._dynamics.observe( embed, data['action'] ) # the world model tries to dream from the first step to the last step. # post: post['meant'].shape: (25, 50, 30) feat = self._dynamics.get_feat( post) # feat: (25, batch_length, 230) image_pred = self._decode( feat) # image_pred.sample(): (25, batch_length, 64, 64, 3) reward_pred = self._reward( feat) # reward_pred.sample(): (25, batch_length) likes = tools.AttrDict( ) # collect the likelihoods (probabilities of the events that actually happened) likes.image = tf.reduce_mean(image_pred.log_prob(data['image'])) likes.reward = tf.reduce_mean( reward_pred.log_prob(data['reward']) ) # data['reward'].shape: (25, 50) => log_prob each step : (25, 50) scalar => likes.reward (mean of logprob) : () scalar if self._c.pcont: # in my view, this lets the model learn by itself which steps to focus on. pcont_pred = self._pcont(feat) pcont_target = self._c.discount * data['discount'] likes.pcont = tf.reduce_mean(pcont_pred.log_prob(pcont_target)) likes.pcont *= self._c.pcont_scale prior_dist = self._dynamics.get_dist(prior) post_dist = self._dynamics.get_dist(post) div = tf.reduce_mean(tfd.kl_divergence(post_dist, prior_dist)) div = tf.maximum(div, self._c.free_nats) ''' the model loss is exactly the VAE loss of the world model (which is a VAE-style sample generator) ''' model_loss = self._c.kl_scale * div - sum(likes.values( )) # likes.values() contains the log probs of image and reward model_loss /= float(self._strategy.num_replicas_in_sync) ''' dreamer ''' # with tf.GradientTape() as actor_tape: # imag_feat = self._imagine_ahead(post) # scanning to get prior for each prev state, step(policy&world model) for horizon(15) steps # print("imag_feat:",imag_feat.shape) # (15, 1225, 230) # reward = self._reward(imag_feat).mode() # get reward for every step # (15, 1225) # print("reward:",reward) # if self._c.pcont: # pcont = self._pcont(imag_feat).mean() # else: # pcont = self._c.discount * tf.ones_like(reward) # value = self._value(imag_feat).mode() # (15, 1250), 15 is horizon, 1250 is batch_length*batch_size/num of GPUs. # returns = tools.lambda_return( # reward[:-1], value[:-1], pcont[:-1], # bootstrap=value[-1], lambda_=self._c.disclam, axis=0) # an exponentially-weighted average of the estimates V for different k to balance bias and variance # # print("returns: ",returns) # (14, 1225) # discount = tf.stop_gradient(tf.math.cumprod(tf.concat( # [tf.ones_like(pcont[:1]), pcont[:-2]], 0), 0)) # so as not to affect the world model # print("discount:",discount.shape) # actor_loss = -tf.reduce_mean(discount * returns) # !!!!! not using policy gradient !!!! directly maximize the return # actor_loss /= float(self._strategy.num_replicas_in_sync) # with tf.GradientTape() as value_tape: # value_pred = self._value(imag_feat)[:-1] # target = tf.stop_gradient(returns) # print("target:",target.shape) # (14, 1225) # print("value_pred.log_prob(target).shape:",value_pred.log_prob(target).shape) # (14, 1225) # value_loss = -tf.reduce_mean(discount * value_pred.log_prob(target)) # to directly predict the return. 
the gradient does not affect the world model # value_loss /= float(self._strategy.num_replicas_in_sync) ''' A2C ''' with tf.GradientTape() as actor_tape: # imagine ahead to get state, action, reward over the imagination horizon # imag_feat: (15, 1250, 230) # also, revise the imagine-ahead func and img_step func to get the action it takes # imag_act: (15, 1250, number of action) => (15, 25, 50, number of action) imag_feat, imagine_action = self._imagine_ahead_and_get_action( post ) # scanning to get prior for each prev state, step(policy&world model) for horizon(15) steps # print("imagine_action:",imagine_action) # (15, 1225, number of action) if self._c.pcont: reduce_batch_length = self._c.batch_length - 1 # do not take the last one reduce_horizon = self._c.horizon - 1 else: reduce_batch_length = self._c.batch_length reduce_horizon = self._c.horizon - 1 imagine_action = tf.reshape( imagine_action, [self._c.horizon, -1, reduce_batch_length, self._actdim]) # print("imagine_action:",imagine_action) # (15, 25, 49 or 50, number of action) imagine_action = imagine_action[:, :, :reduce_batch_length - 10, :] # for td argmax_imagine_action = tf.argmax(imagine_action, -1) # one_hot_imagine_action = tf.one_hot(tf.argmax(imagine_action,-1),self._actdim) # print("imagine_action:",imagine_action.shape) # (15, 25, 39, 4) # print("one_hot_imagine_action:",one_hot_imagine_action.shape) # (15, 25, 39, 4) # Preprocess reward for actor and critic. the sliding window size determines TD-N (10) # (15, 25, 50, 230) => (15, 25, 50, 1) => (15, 25, 50) # sliding for window 10: (15, 25, 50) =slidesum=> (15, 25, 40) # # first step: advantage, first and after: model-based(planning) advantage # # imagine reward first step (15, 25, 50) # discount (14, 1225) => (14,25,50,1) => (14,25,39,1) reward = self._reward( imag_feat).mode() # get reward for every step # (15, 1250) reward = tf.reshape(reward, [self._c.horizon, -1, reduce_batch_length, 1]) dim5_reward = tf.expand_dims(reward, -1) sum_reward = tf.extract_volume_patches( dim5_reward, [1, 1, 10, 1, 1], [1, 1, 1, 1, 1], "VALID") # needs to be 5-dimensional sum_reward = tf.reduce_sum(sum_reward, -1) # (15, 25, 40, 1) if self._c.pcont: pcont = self._pcont(imag_feat).mean() else: pcont = self._c.discount * tf.ones_like(reward) discount = tf.math.cumprod( tf.concat([tf.ones_like(pcont[:1]), pcont[:-2]], 0), 0 ) # for A2C, if we want to learn pcont, do not stop the gradient print("discount:", discount.shape) # (14, 1225) discount = tf.reshape(discount, [reduce_horizon, -1, reduce_batch_length, 1]) discount = discount[:, :, :reduce_batch_length - 10, :] print("discount:", discount.shape) # discount: (14, 25, 39, 1) # value prediction # this value function predicts the current value up to TD-N (not the sum to the end of the imagination horizon) # (15, 25, 50, 230) => (15, 25, 50) # (15, 25, [0:40]) => (15, 25, 40) st # (15, 25, [10:50]) => (15, 25, 40) st+1 # reward(15, 25, [0:40]) + (value prediction st(15,25, 40) - st+1(15, 25, 40)) => (15, 25, 40) # get advantage # stop gradient (15,25,40) value = self._value(imag_feat).mode( ) # (15, 1250), 15 is horizon, 1250 is batch_length*batch_size/num of GPUs. 
value = tf.reshape(value, [self._c.horizon, -1, reduce_batch_length, 1 ]) # (15, 1250 or 1245) => [15,-1,50 or 49,1] st_value = value[:, :, :reduce_batch_length - 10] stp1_value = value[:, :, 1:1 + reduce_batch_length - 10] print("st_value:", st_value.shape) # st_value: (15, 25, 40 or 39, 1) print("stp1_value:", stp1_value.shape) # stp1_value: (15, 25, 40 or 39, 1) # advantage actor-critic policy gradient # action(15, 25, [0:40]) * advantage(15, 25, 40) => (15, 25, 40) # reduce mean(15, 25, 40) if self._c.pcont: sum_reward = sum_reward[:, :, :reduce_batch_length - 10, :] # (15, 25, 39, 1) advantage = sum_reward + st_value - stp1_value # (15, 25, 39, 1) advantage = tf.stop_gradient(advantage) # update only actor print("imagine_action:", imagine_action.shape) # (15, 25, 39, 4) print("argmax_imagine_action:", argmax_imagine_action.shape) # (15, 25, 39, 4) policy_gradient = tf.keras.losses.sparse_categorical_crossentropy( argmax_imagine_action, imagine_action, from_logits=False) print("policy_gradient:", policy_gradient.shape) # (15, 25, 39) policy_gradient = tf.expand_dims(policy_gradient, -1) * advantage print("policy_gradient:", policy_gradient.shape) # (15, 25, 39, 1) policy_gradient = policy_gradient[:-1] * discount # (14, 25, 39, 1)*(14, 25, 39, 1) actor_loss = tf.reduce_mean(policy_gradient) actor_loss /= float(self._strategy.num_replicas_in_sync) with tf.GradientTape() as value_tape: # value loss # (15, 25, 40)st # slide reward: (15, 25, 40) # reduce_mean(l2((15, 25, 40),(15, 25, 40)) value = self._value(imag_feat).mode( ) # (15, 1250), 15 is horizon, 1250 is batch_length*batch_size/num of GPUs. value = tf.reshape(value, [self._c.horizon, -1, reduce_batch_length, 1 ]) # (15, 1250 or 1245) => [15,-1,50 or 49,1] st_value = value[:, :, :reduce_batch_length - 10] value_MSE = tf.keras.losses.MSE(tf.stop_gradient(sum_reward), st_value) print("value_MSE:", value_MSE.shape) value_MSE = tf.expand_dims(value_MSE[:-1], -1) * discount value_loss = tf.reduce_mean(value_MSE) value_loss /= float(self._strategy.num_replicas_in_sync) model_norm = self._model_opt(model_tape, model_loss) actor_norm = self._actor_opt(actor_tape, actor_loss) value_norm = self._value_opt(value_tape, value_loss) if tf.distribute.get_replica_context().replica_id_in_sync_group == 0: if self._c.log_scalars: self._scalar_summaries(data, feat, prior_dist, post_dist, likes, div, model_loss, value_loss, actor_loss, model_norm, value_norm, actor_norm) if tf.equal(log_images, True): self._image_summaries(data, embed, image_pred)
def _train(self, data, log_images): with tf.GradientTape() as model_tape: embed = self._encode(data) post, prior = self._dynamics.observe(embed, data["action"]) feat = self._dynamics.get_feat(post) image_pred = self._decode(feat) reward_pred = self._reward(feat) likes = tools.AttrDict() likes.image = tf.reduce_mean(image_pred.log_prob(data["image"])) ###################################################################### # RE3: + intrinsic rewards rand_embed_ = tf.stop_gradient(self._rand_encode(data)) rand_embed = tf.reshape(rand_embed_, [-1, 50]) dist = tf.norm(rand_embed[:, None, :] - rand_embed[None, :, :], axis=-1) int_reward = -1.0 * tf.math.top_k(-dist, k=self._c.k).values[:, -1] norm_int_reward = self._rms(int_reward) norm_int_reward = tf.reshape(norm_int_reward, tf.shape(rand_embed_)[:-1]) likes.reward = tf.reduce_mean( reward_pred.log_prob(data["reward"] + self._c.beta * norm_int_reward) ) ###################################################################### if self._c.pcont: pcont_pred = self._pcont(feat) pcont_target = self._c.discount * data["discount"] likes.pcont = tf.reduce_mean(pcont_pred.log_prob(pcont_target)) likes.pcont *= self._c.pcont_scale prior_dist = self._dynamics.get_dist(prior) post_dist = self._dynamics.get_dist(post) div = tf.reduce_mean(tfd.kl_divergence(post_dist, prior_dist)) div = tf.maximum(div, self._c.free_nats) model_loss = self._c.kl_scale * div - sum(likes.values()) model_loss /= float(self._strategy.num_replicas_in_sync) with tf.GradientTape() as actor_tape: imag_feat = self._imagine_ahead(post) reward = self._reward(imag_feat).mode() if self._c.pcont: pcont = self._pcont(imag_feat).mean() else: pcont = self._c.discount * tf.ones_like(reward) value = self._value(imag_feat).mode() returns = tools.lambda_return( reward[:-1], value[:-1], pcont[:-1], bootstrap=value[-1], lambda_=self._c.disclam, axis=0, ) discount = tf.stop_gradient( tf.math.cumprod(tf.concat([tf.ones_like(pcont[:1]), pcont[:-2]], 0), 0) ) actor_loss = -tf.reduce_mean(discount * returns) actor_loss /= float(self._strategy.num_replicas_in_sync) with tf.GradientTape() as value_tape: value_pred = self._value(imag_feat)[:-1] target = tf.stop_gradient(returns) value_loss = -tf.reduce_mean(discount * value_pred.log_prob(target)) value_loss /= float(self._strategy.num_replicas_in_sync) model_norm = self._model_opt(model_tape, model_loss) actor_norm = self._actor_opt(actor_tape, actor_loss) value_norm = self._value_opt(value_tape, value_loss) if tf.distribute.get_replica_context().replica_id_in_sync_group == 0: if self._c.log_scalars: self._scalar_summaries( data, feat, prior_dist, post_dist, likes, div, model_loss, value_loss, actor_loss, model_norm, value_norm, actor_norm, int_reward, norm_int_reward, ) if tf.equal(log_images, True): self._image_summaries(data, embed, image_pred)
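In the RE3 block above, -1.0 * tf.math.top_k(-dist, k=self._c.k).values[:, -1] selects, for each row, the k-th smallest entry of the pairwise distance matrix (negating turns top-k into bottom-k); since a point's distance to itself is zero, this is its distance to its (k-1)-th nearest other point, which RE3 uses as an intrinsic-reward proxy for state entropy. A plain-NumPy illustration of that selection; knn_distance is a hypothetical helper, not code from this repository:

import numpy as np

def knn_distance(embed, k):
    # embed: [n, d] features from the fixed random encoder.
    dist = np.linalg.norm(embed[:, None, :] - embed[None, :, :], axis=-1)  # [n, n] pairwise distances
    # Sorted column 0 is each point's zero distance to itself, so column k - 1
    # matches -1.0 * top_k(-dist, k).values[:, -1].
    return np.sort(dist, axis=-1)[:, k - 1]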
def _train(self, data, log_images): with tf.GradientTape() as model_tape: embed = self._encode(data) post, prior = self._dynamics.observe(embed, data['action']) feat = self._dynamics.get_feat(post) image_pred = self._decode(feat) reward_pred = self._reward(feat) likes = tools.AttrDict() likes.image = tf.reduce_mean(image_pred.log_prob(data['image'])) likes.reward = tf.reduce_mean(reward_pred.log_prob(data['reward'])) if self._c.pcont: pcont_pred = self._pcont(feat) pcont_target = self._c.discount * data['discount'] likes.pcont = tf.reduce_mean(pcont_pred.log_prob(pcont_target)) likes.pcont *= self._c.pcont_scale prior_dist = self._dynamics.get_dist(prior) post_dist = self._dynamics.get_dist(post) div = tf.reduce_mean(tfd.kl_divergence(post_dist, prior_dist)) div = tf.maximum(div, self._c.free_nats) model_loss = self._c.kl_scale * div - sum(likes.values()) model_loss /= float(self._strategy.num_replicas_in_sync) with tf.GradientTape() as actor_tape: alset = [] alset_re = [] imag_feat, prob_traj = self._imagine_ahead(post) prob_traj=tf.reduce_sum(prob_traj,0,keepdims=True) prob_traj = (prob_traj - tf.reduce_mean(prob_traj)) / (tf.math.reduce_std(prob_traj) + 1e-9) +1 prob_traj = tf.clip_by_value(prob_traj, 1 - self._c.reweight_clip, 1 + self._c.reweight_clip) prob_traj = prob_traj entropy = tf.reduce_mean(self._actor(tf.stop_gradient(feat)).entropy()) tlikes = tools.AttrDict() tlikes.reward = reward_pred.log_prob(data['reward']) reward = self._reward(imag_feat).mode() if self._c.pcont: pcont = self._pcont(imag_feat).mean() else: pcont = self._c.discount * tf.ones_like(reward) value = self._value(imag_feat).mode() returns = tools.lambda_return( reward[:-1], value[:-1], pcont[:-1], bootstrap=value[-1], lambda_=self._c.disclam, axis=0) discount = tf.stop_gradient(tf.math.cumprod(tf.concat( [tf.ones_like(pcont[:1]), pcont[:-2]], 0), 0)) alset.append(-tf.reduce_mean(discount * returns)) alset_re.append(-tf.reduce_mean(tf.stop_gradient(prob_traj)* discount * returns)) actor_loss = tf.reduce_mean(tf.stack(alset_re)) - self._c.ent_alpha * entropy actor_loss /= float(self._strategy.num_replicas_in_sync) with tf.GradientTape() as value_tape: value_pred = self._value(imag_feat)[:-1] target = tf.stop_gradient(returns) value_loss = -tf.reduce_mean(prob_traj * discount * value_pred.log_prob(target)) value_loss /= float(self._strategy.num_replicas_in_sync) model_norm = self._model_opt(model_tape, model_loss) actor_norm = self._actor_opt(actor_tape, actor_loss) value_norm = self._value_opt(value_tape, value_loss) if tf.distribute.get_replica_context().replica_id_in_sync_group == 0: if self._c.log_scalars: self._scalar_summaries( data, feat, prior_dist, post_dist, likes, div, model_loss, value_loss, actor_loss, entropy, model_norm, value_norm, actor_norm) if tf.equal(log_images, True): self._image_summaries(data, embed, image_pred)
def _train(self, data, log_images, init_horizon, imagine_depth): with tf.GradientTape() as model_tape: embed = self._encode(data) post, prior = self._dynamics.observe(embed, data['action']) feat = self._dynamics.get_feat(post) image_pred = self._decode(feat) reward_pred = self._reward(feat) likes = tools.AttrDict() likes.image = tf.reduce_mean(image_pred.log_prob(data['image'])) likes.reward = tf.reduce_mean(reward_pred.log_prob(data['reward'])) if self._c.pcont: pcont_pred = self._pcont(feat) pcont_target = self._c.discount * data['discount'] likes.pcont = tf.reduce_mean(pcont_pred.log_prob(pcont_target)) likes.pcont *= self._c.pcont_scale prior_dist = self._dynamics.get_dist(prior) post_dist = self._dynamics.get_dist(post) div = tf.reduce_mean(tfd.kl_divergence(post_dist, prior_dist)) div = tf.maximum(div, self._c.free_nats) model_loss = self._c.kl_scale * div - sum(likes.values()) model_loss /= float(self._strategy.num_replicas_in_sync) with tf.GradientTape() as actor_tape: flatten = lambda x: tf.reshape(x, [-1] + list(x.shape[2:])) discount = None imag_feats = [] returns_lst = [] discounts = [] actor_loss = 0.0 horizon = init_horizon for depth in range(imagine_depth): if self._c.pcont: # Last step could be terminal. post = {k: v[:, :-1] for k, v in post.items()} post = {k: flatten(v) for k, v in post.items()} if depth != 0: post = { k: tf.stop_gradient( tf.gather(v, indices=max_indexes, axis=0)) for k, v in post.items() } imag_feat, post = self._imagine_ahead(post, horizon) tf.print("Imagination Features:", tf.shape(imag_feat)) reward = self._reward(imag_feat).mode() if self._c.pcont: pcont = self._pcont(imag_feat).mean() else: pcont = self._c.discount * tf.ones_like(reward) value = self._value(imag_feat).mode() returns = tools.lambda_return(reward[:-1], value[:-1], pcont[:-1], bootstrap=value[-1], lambda_=self._c.disclam, axis=0) discount = tf.stop_gradient( tf.math.cumprod( tf.concat([tf.ones_like(pcont[:1]), pcont[:-2]], 0), 0)) if depth != imagine_depth - 1: if self._c.branch_type == "reward": flat_reward = flatten(reward) max_indexes = tf.math.top_k( flat_reward, k=int(2500 / self._strategy.num_replicas_in_sync), sorted=False)[1] elif self._c.branch_type == "uniform": flat_reward = flatten(reward) max_indexes = tf.random.uniform( [int(2500 / self._strategy.num_replicas_in_sync)], minval=0, maxval=flat_reward.shape[0], dtype=tf.int32) elif self._c.branch_type == "value": flat_value = flatten(value) max_indexes = tf.math.top_k( flat_value, k=int(2500 / self._strategy.num_replicas_in_sync), sorted=False)[1] horizon = int(horizon * self._c.imagine_decay) imag_feats.append(imag_feat) returns_lst.append(returns) discounts.append(discount) actor_loss += -tf.reduce_mean(discount * returns) actor_loss /= float(self._strategy.num_replicas_in_sync * imagine_depth) with tf.GradientTape() as value_tape: value_loss = 0.0 for imag_feat, returns, discount in zip(imag_feats, returns_lst, discounts): value_pred = self._value(imag_feat)[:-1] target = tf.stop_gradient(returns) value_loss += -tf.reduce_mean( discount * value_pred.log_prob(target)) value_loss /= float(self._strategy.num_replicas_in_sync * imagine_depth) model_norm = self._model_opt(model_tape, model_loss) actor_norm = self._actor_opt(actor_tape, actor_loss) value_norm = self._value_opt(value_tape, value_loss) if tf.distribute.get_replica_context().replica_id_in_sync_group == 0: if self._c.log_scalars: self._scalar_summaries(data, feat, prior_dist, post_dist, likes, div, model_loss, value_loss, actor_loss, model_norm, value_norm, 
actor_norm) if tf.equal(log_images, True): self._image_summaries(data, embed, image_pred)
def define_config(): config = tools.AttrDict() # General. config.logdir = pathlib.Path('.') config.seed = 0 config.steps = 5e6 config.eval_every = 1e4 config.log_every = 1e3 config.log_scalars = True config.log_images = True config.gpu_growth = True config.precision = 16 # Environment. config.task = 'gridworld_water' config.envs = 1 config.parallel = 'none' config.action_repeat = 1 config.time_limit = 200 config.prefill = 5000 config.eval_noise = 0.05 config.clip_rewards = 'none' # Model. # config.deter_size = 200 # config.stoch_size = 30 # config.num_units = 400 config.deter_size = 16 config.stoch_size = 16 config.num_units = 256 config.dense_act = 'elu' config.cnn_act = 'relu' config.cnn_depth = 32 config.pcont = True config.free_nats = 3.0 config.kl_scale = 2.0 #NOTE:TUNE config.pcont_scale = 10.0 config.weight_decay = 0.0 config.weight_decay_pattern = r'.*' # Training. config.batch_size = 32 config.batch_length = 8 config.train_every = 1000 config.train_steps = 100 config.pretrain = 100 config.model_lr = 6e-4 config.value_lr = 8e-5 config.actor_lr = 8e-5 config.grad_clip = 100.0 # config.dataset_balance = False config.dataset_balance = True # Behavior. config.discount = 0.99 config.disclam = 0.95 config.horizon = 15 config.action_init_std = 5.0 # config.action_dist = 'tanh_normal' # config.expl = 'additive_gaussian' # config.expl_amount = 0.3 config.action_dist = 'onehot' # config.action_dist = 'gumbel' config.expl = 'epsilon_greedy' config.expl_amount = 0.3 config.expl_decay = 1e5 config.expl_min = 0.05 # add by haohu config.cpc = False config.cpc_num_units = 64 config.cpc_latent_size = config.stoch_size + config.deter_size config.cpc_num_layers = 3 config.cpc_contrast = 'window' config.cpc_batch_amount = 8 config.cpc_time_amount = 4 return config
def define_config(): config = tools.AttrDict() # General. config.logdir = pathlib.Path('.') config.seed = 0 config.steps = 2e6 config.eval_every = 1e4 config.log_every = 1e3 config.log_scalars = True config.log_images = True config.gpu_growth = True config.precision = 32 # Environment. config.task = 'dmc_cup_catch' config.envs = 1 config.parallel = 'none' config.action_repeat = 2 config.time_limit = 1000 config.prefill = 5000 config.eval_noise = 0.0 config.clip_rewards = 'none' # Model. config.deter_size = 200 config.stoch_size = 30 config.num_units = 400 config.dense_act = 'elu' config.cnn_act = 'relu' config.cnn_depth = 32 config.pcont = False config.free_nats = 3.0 config.kl_scale = 1.0 config.pcont_scale = 10.0 config.weight_decay = 0.0 config.weight_decay_pattern = r'.*' # Training. config.batch_size = 50 config.batch_length = 50 config.train_every = 1000 config.train_steps = 100 config.pretrain = 100 config.model_lr = 6e-4 config.value_lr = 8e-5 config.actor_lr = 8e-5 config.grad_clip = 100.0 config.dataset_balance = False # Behavior. config.discount = 0.99 config.disclam = 0.95 config.horizon = 15 config.action_dist = 'tanh_normal' config.action_init_std = 5.0 config.expl = 'additive_gaussian' config.expl_amount = 0.3 config.expl_decay = 0.0 config.expl_min = 0.0 config.id = 'debug' config.use_state = False # Sim2real transfer config.real_world_prob = -1 # fraction of samples trained on which are from the real world (probably involves oversampling real-world samples) config.sample_real_every = 2 # How often we should sample from the real world #these values are for testing dmc_cup_catch config.mass_mean = 0.2 config.mass_range = 0.01 return config
def define_config(): config = tools.AttrDict() # General. config.logdir = pathlib.Path('.') config.seed = 0 config.steps = 5e6 config.eval_every = 1e4 config.log_every = 1e3 config.log_scalars = True config.log_images = True config.gpu_growth = True config.precision = 16 # Environment. config.task = 'dmc_walker_walk' config.envs = 1 config.parallel = 'none' config.action_repeat = 2 config.time_limit = 1000 config.prefill = 5000 config.eval_noise = 0.0 config.clip_rewards = 'none' # Model. config.deter_size = 200 config.stoch_size = 30 config.num_units = 400 config.dense_act = 'elu' config.cnn_act = 'relu' config.cnn_depth = 32 config.pcont = False config.free_nats = 3.0 config.kl_scale = 1.0 config.pcont_scale = 10.0 config.weight_decay = 0.0 config.weight_decay_pattern = r'.*' # Training. config.batch_size = 50 config.batch_length = 50 config.train_every = 1000 config.train_steps = 100 config.pretrain = 100 config.model_lr = 6e-4 config.value_lr = 8e-5 config.actor_lr = 8e-5 config.grad_clip = 100.0 config.dataset_balance = False # Behavior. config.discount = 0.99 config.disclam = 0.95 config.horizon = 15 config.action_dist = 'tanh_normal' config.action_init_std = 5.0 config.expl = 'additive_gaussian' config.expl_amount = 0.3 config.expl_decay = 0.0 config.expl_min = 0.0 config.log_imgs = False # natural or not config.natural = False # obs model config.obs_model = 'contrastive' # SAC settings config.num_Qs = 2 # use dreamer and SAC for hybrid actor-critic training config.use_sac = True config.use_dreamer = True # use trajectory optimization config.trajectory_opt = True config.traj_opt_lr = 0.003 config.num_samples = 20 return config
def define_config(): """ Default definition of command-line arguments. """ config = tools.AttrDict() # General. config.datetime = datetime.now().strftime( "%m-%d-%Y %H:%M:%S") # just for logging config config.seed = random.randint(2, 10**6) config.logdir = pathlib.Path('logs/experiments') config.steps = 5e6 config.eval_every = 1e4 config.log_every = 1e3 config.log_scalars = True config.log_images = True config.log_videos = True config.gpu_growth = True config.precision = 32 config.obs_type = 'lidar' # Environment. config.track = 'austria' config.task = 'max_progress' config.action_repeat = 4 config.eval_episodes = 5 config.time_limit_train = 2000 config.time_limit_test = 4000 config.prefill_agent = 'gap_follower' config.prefill = 5000 config.eval_noise = 0.0 config.clip_rewards = 'none' config.clip_rewards_min = -1 config.clip_rewards_max = 1 # Model. config.encoded_obs_dim = 1080 config.deter_size = 200 config.stoch_size = 30 config.num_units = 400 config.reward_out_dist = 'normal' config.dense_act = 'elu' config.cnn_act = 'relu' config.pcont = True config.free_nats = 3.0 config.kl_scale = 1.0 config.pcont_scale = 10.0 config.weight_decay = 0.0 config.weight_decay_pattern = r'.*' # Training. config.batch_size = 50 config.batch_length = 50 config.train_every = 1000 config.train_steps = 100 config.pretrain = 100 config.model_lr = 6e-4 config.value_lr = 8e-5 config.actor_lr = 8e-5 config.grad_clip = 1.0 config.dataset_balance = False # Behavior. config.discount = 0.99 config.disclam = 0.95 config.horizon = 15 config.action_dist = 'tanh_normal' config.action_init_std = 5.0 config.expl = 'additive_gaussian' config.expl_amount = 0.3 config.expl_decay = 0.0 config.expl_min = 0.3 return config
def define_config(): config = tools.AttrDict() # General. config.logdir = pathlib.Path('.logdir') #config.datadir = pathlib.Path('./data/') config.datadir = pathlib.Path('.datadir/walker') config.seed = 0 config.log_every = 1000 config.save_every = 5000 config.log_scalars = True config.log_images = True config.gpu_growth = True # Environment. config.task = 'dmc_walker_walk' config.envs = 1 config.parallel = 'none' config.action_repeat = 2 config.time_limit = 1000 config.im_size = 64 config.eval_noise = 0.0 config.clip_rewards = 'none' config.precision = 32 # Model. config.deter_size = 256 config.stoch_size = 64 config.num_models = 7 config.num_units = 256 config.proprio = False config.penalty_type = 'log_prob' config.dense_act = 'elu' config.cnn_act = 'relu' config.cnn_depth = 32 config.pcont = False config.kl_scale = 1.0 config.pcont_scale = 10.0 config.weight_decay = 0.0 config.weight_decay_pattern = r'.*' # Training. config.load_model = False config.load_agent = False config.load_buffer = False config.train_steps = 100000 config.model_train_steps = 25000 config.model_batch_size = 64 config.model_batch_length = 50 config.agent_batch_size = 256 config.cql_samples = 16 config.start_training = 50000 config.agent_train_steps = 100000 config.agent_itters_per_step = 200 config.buffer_size = 2e6 config.model_lr = 6e-4 config.q_lr = 3e-4 config.actor_lr = 3e-4 config.grad_clip = 100.0 config.tau = 5e-3 config.target_update_interval = 1 config.dataset_balance = False # Behavior. config.lmbd = 5.0 config.alpha = 0.0 config.sample = True config.discount = 0.99 config.disclam = 0.95 config.horizon = 5 config.done_treshold = 0.5 config.action_dist = 'tanh_normal' config.action_init_std = 5.0 config.expl = 'additive_gaussian' config.expl_amount = 0.2 config.expl_decay = 0.0 config.expl_min = 0.0 return config
def _train(self, data, log_images): with tf.GradientTape() as model_tape: if 'success' in data: success_rate = tf.reduce_sum( data['success']) / data['success'].shape[1] else: success_rate = tf.convert_to_tensor(-1) embed = self._encode(data) if 'state' in data: embed = tf.concat([data['state'], embed], axis=-1) post, prior = self._dynamics.observe(embed, data['action']) feat = self._dynamics.get_feat(post) image_pred = self._decode(feat) reward_pred = self._reward(feat) likes = tools.AttrDict() likes.image = tf.reduce_mean(image_pred.log_prob(data['image'])) reward_obj = reward_pred.log_prob(data['reward']) # Mask out the elements which came from the real world env reward_obj = reward_obj * (1 - data['real_world']) likes.reward = tf.reduce_mean(reward_obj) if self._c.pcont: pcont_pred = self._pcont(feat) pcont_target = self._c.discount * data['discount'] likes.pcont = tf.reduce_mean(pcont_pred.log_prob(pcont_target)) likes.pcont *= self._c.pcont_scale prior_dist = self._dynamics.get_dist(prior) post_dist = self._dynamics.get_dist(post) div = tf.reduce_mean(tfd.kl_divergence(post_dist, prior_dist)) div = tf.maximum(div, self._c.free_nats) model_loss = self._c.kl_scale * div - sum(likes.values()) model_loss /= float(self._strategy.num_replicas_in_sync) with tf.GradientTape() as actor_tape: imag_feat = self._imagine_ahead(post) reward = self._reward(imag_feat).mode() if self._c.pcont: pcont = self._pcont(imag_feat).mean() else: pcont = self._c.discount * tf.ones_like(reward) value = self._value(imag_feat).mode() returns = tools.lambda_return(reward[:-1], value[:-1], pcont[:-1], bootstrap=value[-1], lambda_=self._c.disclam, axis=0) discount = tf.stop_gradient( tf.math.cumprod( tf.concat([tf.ones_like(pcont[:1]), pcont[:-2]], 0), 0)) actor_loss = -tf.reduce_mean(discount * returns) actor_loss /= float(self._strategy.num_replicas_in_sync) with tf.GradientTape() as value_tape: value_pred = self._value(imag_feat)[:-1] target = tf.stop_gradient(returns) value_loss = -tf.reduce_mean( discount * value_pred.log_prob(target)) value_loss /= float(self._strategy.num_replicas_in_sync) model_norm = self._model_opt(model_tape, model_loss) actor_norm = self._actor_opt(actor_tape, actor_loss) value_norm = self._value_opt(value_tape, value_loss) if tf.distribute.get_replica_context().replica_id_in_sync_group == 0: if self._c.log_scalars: self._scalar_summaries(data, feat, prior_dist, post_dist, likes, div, model_loss, value_loss, actor_loss, model_norm, value_norm, actor_norm, success_rate) if tf.equal(log_images, True): self._image_summaries(data, embed, image_pred)
def _train(self, data, log_images): with tf.GradientTape() as model_tape: embed = self._encode(data) post, prior = self._dynamics.observe(embed, data['action']) feat = self._dynamics.get_feat(post) reward_pred = self._reward(feat) likes = tools.AttrDict() likes.reward = tf.reduce_mean(reward_pred.log_prob(data['reward'])) # if we use the generative observation model, we need to perform observation reconstruction image_pred = self._decode(feat) # compute the contrastive loss directly in CVRL cont_loss = self._contrastive(feat, embed) # the contrastive / generative implementation of the observation model p(o|s) if self._c.obs_model == 'generative': likes.image = tf.reduce_mean(image_pred.log_prob( data['image'])) elif self._c.obs_model == 'contrastive': likes.image = tf.reduce_mean(cont_loss) if self._c.pcont: pcont_pred = self._pcont(feat) pcont_target = self._c.discount * data['discount'] likes.pcont = tf.reduce_mean(pcont_pred.log_prob(pcont_target)) likes.pcont *= self._c.pcont_scale prior_dist = self._dynamics.get_dist(prior) post_dist = self._dynamics.get_dist(post) div = tf.reduce_mean(tfd.kl_divergence(post_dist, prior_dist)) div = tf.maximum(div, self._c.free_nats) model_loss = self._c.kl_scale * div - sum(likes.values()) assert self._c.use_dreamer or self._c.use_sac if self._c.use_dreamer: with tf.GradientTape() as actor_tape: imag_feat = self._imagine_ahead(post) reward = self._reward(imag_feat).mode() if self._c.pcont: pcont = self._pcont(imag_feat).mean() else: pcont = self._c.discount * tf.ones_like(reward) value = self._value(imag_feat).mode() returns = tools.lambda_return(reward[:-1], value[:-1], pcont[:-1], bootstrap=value[-1], lambda_=self._c.disclam, axis=0) discount = tf.stop_gradient( tf.math.cumprod( tf.concat([tf.ones_like(pcont[:1]), pcont[:-2]], 0), 0)) actor_loss = -tf.reduce_mean(discount * returns) with tf.GradientTape() as value_tape: value_pred = self._value(imag_feat)[:-1] target = tf.stop_gradient(returns) value_loss = -tf.reduce_mean(discount * value_pred.log_prob(target)) actor_norm = self._actor_opt(actor_tape, actor_loss) value_norm = self._value_opt(value_tape, value_loss) else: actor_norm = actor_loss = 0 value_norm = value_loss = 0 model_norm = self._model_opt(model_tape, model_loss) states = tf.concat([post['stoch'], post['deter']], axis=-1) rewards = data['reward'] dones = tf.zeros_like(rewards) actions = data['action'] # if we use SAC, add the SAC training if self._c.use_sac: self._sac._do_training(self._step, states, actions, rewards, dones) if tf.distribute.get_replica_context().replica_id_in_sync_group == 0: if self._c.log_scalars: self._scalar_summaries(data, feat, prior_dist, post_dist, likes, div, model_loss, value_loss, actor_loss, model_norm, value_norm, actor_norm) if tf.equal(log_images, True) and self._c.log_imgs: self._image_summaries(data, embed, image_pred)