Example #1
def define_simulation_graph(batch_env, algo_cls, config):
    """Define the algortihm and environment interaction.

  Args:
    batch_env: In-graph environments object.
    algo_cls: Constructor of a batch algorithm.
    config: Configuration object for the algorithm.

  Returns:
    Object providing graph elements via attributes.
  """
    # pylint: disable=unused-variable
    step = tf.Variable(0, False, dtype=tf.int32, name='global_step')

    is_training = tf.placeholder(tf.bool, name='is_training')
    should_log = tf.placeholder(tf.bool, name='should_log')
    do_report = tf.placeholder(tf.bool, name='do_report')
    force_reset = tf.placeholder(tf.bool, name='force_reset')

    algo = algo_cls(batch_env, step, is_training, should_log, config)
    # Here algo_cls=ppo.PPOAlgorithm, "a vectorized implementation of the PPO
    # algorithm by John Schulman."

    done, score, summary = tools.simulate(batch_env, algo, should_log,
                                          force_reset)
    """"
   tools.simulate Returns:
    Tuple of tensors containing done flags for the current episodes, possibly
    intermediate scores for the episodes, and a summary tensor.
  """
    message = 'Graph contains {} trainable variables.'
    tf.logging.info(message.format(tools.count_weights()))
    # pylint: enable=unused-variable
    return tools.AttrDict(locals())
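The example above only builds the graph; the placeholders (is_training, should_log, do_report, force_reset) still have to be fed when the graph is executed. A minimal sketch of a TF1-style driver loop, assuming the attribute names exposed by tools.AttrDict(locals()) above (the loop itself is not part of the repository):

import tensorflow as tf

def drive_simulation(graph, num_steps=100):
    """Step the in-graph simulation, feeding the boolean control placeholders."""
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for index in range(num_steps):
            done, score = sess.run(
                [graph.done, graph.score],
                feed_dict={
                    graph.is_training: True,
                    graph.should_log: index % 10 == 0,
                    graph.do_report: False,
                    graph.force_reset: index == 0,
                })
    return done, score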
Example #2
def define_simulation_graph(batch_env, algo_cls, config):
    """Define the algortihm and environment interaction.

  Args:
    batch_env: In-graph environments object.
    algo_cls: Constructor of a batch algorithm.
    config: Configuration object for the algorithm.

  Returns:
    Object providing graph elements via attributes.
  """
    # pylint: disable=unused-variable

    step = tf.Variable(0, False, dtype=tf.int32, name='global_step')
    is_training = tf.placeholder(tf.bool, name='is_training')
    should_log = tf.placeholder(tf.bool, name='should_log')
    do_report = tf.placeholder(tf.bool, name='do_report')
    force_reset = tf.placeholder(tf.bool, name='force_reset')
    algo = algo_cls(batch_env, step, is_training, should_log, config)
    done, score, summary = tools.simulate(batch_env, algo, should_log,
                                          force_reset)
    message = 'Graph contains {} trainable variables.'
    tf.logging.info(message.format(tools.count_weights()))
    # pylint: enable=unused-variable
    return tools.AttrDict(locals())
Example #3
def main(_):
    tf_utils.set_up_logging()
    os.environ["CUDA_VISIBLE_DEVICES"] = FLAGS.GPU
    if not FLAGS.config:
        raise KeyError('You must specify a configuration.')
    if FLAGS.load_from:
        logdir = FLAGS.logdir = FLAGS.load_from
    else:
        if FLAGS.logdir and os.path.exists(FLAGS.logdir):
            run_number = [
                int(f.split("-")[0]) for f in os.listdir(FLAGS.logdir)
                if os.path.isdir(os.path.join(FLAGS.logdir, f))
                and FLAGS.config in f
            ]
            run_number = max(run_number) + 1 if len(run_number) > 0 else 0
        else:
            run_number = 0
        logdir = FLAGS.logdir and os.path.expanduser(
            os.path.join(FLAGS.logdir, '{}-{}'.format(run_number,
                                                      FLAGS.config)))
    try:
        config = tf_utils.load_config(logdir)
    except IOError:
        config = tools.AttrDict(getattr(configs, FLAGS.config)())
        config = tf_utils.save_config(config, logdir)
    run_wild(config, logdir)
Example #4
def main(_):
    utility.set_up_logging()
    if not FLAGS.config:
        raise KeyError('You must specify a configuration.')
    if FLAGS.load_from:
        logdir = FLAGS.logdir = FLAGS.load_from
    else:
        if FLAGS.logdir and os.path.exists(FLAGS.logdir):
            run_number = [
                int(f.split("-")[0]) for f in os.listdir(FLAGS.logdir)
                if os.path.isdir(os.path.join(FLAGS.logdir, f))
                and FLAGS.config in f
            ]
            run_number = max(run_number) + 1 if len(run_number) > 0 else 0
        else:
            run_number = 0
        logdir = FLAGS.logdir and os.path.expanduser(
            os.path.join(FLAGS.logdir, '{}-{}'.format(run_number,
                                                      FLAGS.config)))
        # recreate_directory_structure(logdir)
    try:
        config = utility.load_config(logdir)
    except IOError:
        config = tools.AttrDict(getattr(configs, FLAGS.config)())
        config = utility.save_config(config, logdir)
    train(config, FLAGS.env_processes, logdir)
Example #5
def main(_):
    """Configure logging."""
    utility.set_up_logging()
    """Assert configuration and set-up directory log structure of the configuration."""
    if not FLAGS.config:
        raise KeyError('You must specify a configuration.')
    if FLAGS.load_from:
        logdir = FLAGS.logdir = FLAGS.load_from
    else:
        """If config log directory already exists, increase the counter number and setup log dir."""
        if FLAGS.logdir and os.path.exists(FLAGS.logdir):
            run_number = [
                int(f.split("-")[0]) for f in os.listdir(FLAGS.logdir)
                if os.path.isdir(os.path.join(FLAGS.logdir, f))
                and FLAGS.config in f
            ]
            run_number = max(run_number) + 1 if len(run_number) > 0 else 0
        else:
            run_number = 0
        logdir = FLAGS.logdir and os.path.expanduser(
            os.path.join(FLAGS.logdir, '{}-{}'.format(run_number,
                                                      FLAGS.config)))
    """If config log directory already exists, try to load config file from it. Otherwise create a new config file 
  coresponding to the user specified config from the config.py"""
    try:
        config = utility.load_config(logdir)
    except IOError:
        config = tools.AttrDict(getattr(configs, FLAGS.config)())
        config = utility.save_config(config, logdir)
    """Run the task specified."""
    run(config, logdir)
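The main() variants above read FLAGS.config, FLAGS.logdir, FLAGS.load_from and similar attributes, but the flag declarations are not part of these snippets. A hypothetical sketch of how they might be declared with TF1's tf.app.flags (names and defaults are assumptions inferred from the usage above):

import tensorflow as tf

FLAGS = tf.app.flags.FLAGS
tf.app.flags.DEFINE_string('logdir', None, 'Base directory for run logs.')
tf.app.flags.DEFINE_string('config', None, 'Name of a config function in configs.py.')
tf.app.flags.DEFINE_string('load_from', None, 'Existing run directory to resume from.')
tf.app.flags.DEFINE_boolean('env_processes', True, 'Step environments in separate processes.')

def main(_):
    # Stand-in for the real main(); just shows the parsed flag values.
    print('config:', FLAGS.config, 'logdir:', FLAGS.logdir)

if __name__ == '__main__':
    tf.app.run()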
Example #6
 def _train_model(self, data):
     with tf.GradientTape() as model_tape:
         embed = self._encode(data)
         # print(embed,data['action'])
         post, prior = self._dynamics.observe(embed, data['action'],
                                              data['desc'])
         feat = self._dynamics.get_feat(post)
         image_pred = self._decode(feat)
         reward_pred = self._reward(feat)
         likes = tools.AttrDict()
         if self._c.cpc:
             # print("using cpc")
             pred = self._cpc_pred(embed)
             # print(pred,feat)
             cpc_loss = -1. * tf.math.reduce_mean(
                 tools.compute_cpc_loss(pred, feat, self._c))  # caution!
             model_loss = cpc_loss
         else:
             model_loss = cpc_loss = 0
             likes.image = tf.reduce_mean(image_pred.log_prob(
                 data['image']))
         likes.reward = tf.reduce_mean(reward_pred.log_prob(data['reward']))
         if self._c.pcont:
             pcont_pred = self._pcont(feat)
             pcont_target = self._c.discount * data['discount']
             likes.pcont = tf.reduce_mean(pcont_pred.log_prob(pcont_target))
             likes.pcont *= self._c.pcont_scale
         prior_dist = self._dynamics.get_dist(prior)
         post_dist = self._dynamics.get_dist(post)
         div = tf.reduce_mean(tfd.kl_divergence(post_dist, prior_dist))
         div = tf.maximum(div, self._c.free_nats)
         # model_loss = self._c.kl_scale * div - sum(likes.values())
         model_loss += self._c.kl_scale * div - sum(likes.values())
         model_loss /= float(self._strategy.num_replicas_in_sync)
     model_norm = self._model_opt(model_tape, model_loss)
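tools.compute_cpc_loss is called above but not shown here. For orientation, a common InfoNCE-style contrastive loss between predictions and latent features looks roughly like the sketch below; this is an illustrative stand-in under that assumption, not the repository's implementation (note that the example negates whatever the real function returns):

import tensorflow as tf

def infonce_cpc_loss(pred, feat):
    """Score matching (pred, feat) pairs against all other pairs in the batch."""
    pred = tf.reshape(pred, [-1, pred.shape[-1]])      # (N, D)
    feat = tf.reshape(feat, [-1, feat.shape[-1]])      # (N, D)
    logits = tf.matmul(pred, feat, transpose_b=True)   # (N, N) similarity scores
    labels = tf.range(tf.shape(logits)[0])             # positives on the diagonal
    return tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels, logits=logits))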
Example #7
def define_config():
    config = tools.AttrDict()
    # General.
    config.device = 0
    config.logdir = pathlib.Path('.')
    config.seed = 0
    config.steps = 5e6
    config.eval_every = 1e4
    config.log_every = 1e3
    config.log_scalars = True
    config.log_images = True
    config.gpu_growth = True
    config.precision = 16
    # Environment.
    config.task = 'dmc_walker_walk'
    config.envs = 1
    config.parallel = 'none'
    config.action_repeat = 2
    config.time_limit = 1000
    config.prefill = 5000
    config.eval_noise = 0.0
    config.clip_rewards = 'none'
    # Model.
    config.deter_size = 200
    config.stoch_size = 30
    config.num_units = 400
    config.dense_act = 'elu'
    config.cnn_act = 'relu'
    config.cnn_depth = 32
    config.pcont = False
    config.free_nats = 3.0
    config.kl_scale = 1.0
    config.pcont_scale = 10.0
    config.weight_decay = 0.0
    config.weight_decay_pattern = r'.*'
    # Training.
    config.batch_size = 50
    config.batch_length = 50
    config.train_every = 1000
    config.train_steps = 100
    config.pretrain = 100
    config.model_lr = 6e-4
    config.value_lr = 8e-5
    config.actor_lr = 8e-5
    config.grad_clip = 100.0
    config.dataset_balance = False
    # Behavior.
    config.discount = 0.99
    config.disclam = 0.95
    config.horizon = 15
    config.action_dist = 'tanh_normal'
    config.action_init_std = 5.0
    config.expl = 'additive_gaussian'
    config.expl_amount = 0.3
    config.expl_decay = 0.0
    config.expl_min = 0.0
    config.ent_warm_up = 0
    config.ent_alpha = 0.2
    return config
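define_config() only builds the default values; they are typically exposed on the command line by generating one argument per key. A sketch of that pattern, assuming tools.AttrDict behaves like a regular dict (this is not the repository's own main()):

import argparse

def _arg_type(default):
    # argparse's type=bool is a known trap, so parse booleans explicitly;
    # every other value reuses its own constructor (int, float, str, Path, ...).
    if isinstance(default, bool):
        return lambda text: text.lower() in ('1', 'true', 'yes')
    return type(default)

def parse_overrides(config):
    parser = argparse.ArgumentParser()
    for key, value in config.items():
        parser.add_argument('--' + key, type=_arg_type(value), default=value)
    return parser.parse_args()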
Example #8
    def _train(self, data, test_data, log_images, step=1, should_print=False):
        with tf.GradientTape() as model_tape:
            embed = self._encode(data)
            post, prior = self._dynamics.observe(embed, data['action'])

            feat = self._dynamics.get_feat(post)
            image_pred = self._decode(feat)
            reward_pred = self._reward(feat)
            likes = tools.AttrDict()
            likes.image = tf.reduce_mean(image_pred.log_prob(data['image']))
            likes.reward = tf.reduce_mean(reward_pred.log_prob(data['reward']))
            if self._c.pcont:
                pcont_pred = self._pcont(feat)
                pcont_target = self._c.discount * data['discount']
                likes.pcont = tf.reduce_mean(pcont_pred.log_prob(pcont_target))
                likes.pcont *= self._c.pcont_scale
            prior_dist = self._dynamics.get_dist(prior)
            post_dist = self._dynamics.get_dist(post)
            div = tf.reduce_mean(tfd.kl_divergence(post_dist, prior_dist))
            div = tf.maximum(div, self._c.free_nats)
            model_loss = self._c.kl_scale * div - sum(likes.values())
            model_loss /= float(self._strategy.num_replicas_in_sync)

        model_norm = self._model_opt(model_tape, model_loss)

        with tf.GradientTape() as actor_tape:

            if step % 100000 == 0 and step > 0:

                if should_print:

                    self.already_printed_dict[step] = True

                    test_embed = self._encode(test_data)
                    test_post, test_prior = self._dynamics.observe(
                        test_embed, test_data['action'])
                    imag_feat = self._imagine_ahead(test_post)

                    imag_feat_sliced = imag_feat[:]
                    decoded_images = self._decode(imag_feat_sliced)

                    for j in range(100):
                        for i in [5]:
                            current_normal = decoded_images[j][i].distribution
                            mean = current_normal.loc

                            normalized_mean = tf.math.divide(
                                tf.math.subtract(mean, tf.reduce_min(mean)),
                                tf.math.subtract(tf.reduce_max(mean),
                                                 tf.reduce_min(mean)))

                            normalized_mean_int = tf.image.convert_image_dtype(
                                normalized_mean, tf.uint8)
                            image_file = tf.io.encode_jpeg(normalized_mean_int)
                            file_name = "./img/steps{}traj{}img{}.jpg".format(
                                step, i, j)
                            tf.io.write_file(tf.constant(file_name),
                                             image_file)
Example #9
    def _train(self, data, log_images):
        with tf.GradientTape() as model_tape:
            embed = self._encode(data)
            post, prior = self._dynamics.observe(embed, data['action'])
            feat = self._dynamics.get_feat(post)
            image_pred = self._decode(feat)
            reward_pred = self._reward(feat)
            likes = tools.AttrDict()
            likes.image = tf.reduce_mean(image_pred.log_prob(data[self._c.obs_type]))
            likes.reward = tf.reduce_mean(reward_pred.log_prob(data['reward']))
            if self._c.pcont:
                pcont_pred = self._pcont(feat)
                pcont_target = self._c.discount * data['discount']
                likes.pcont = tf.reduce_mean(pcont_pred.log_prob(pcont_target))
                likes.pcont *= self._c.pcont_scale
            prior_dist = self._dynamics.get_dist(prior)
            post_dist = self._dynamics.get_dist(post)
            div = tf.reduce_mean(tfd.kl_divergence(post_dist, prior_dist))
            div = tf.maximum(div, self._c.free_nats)
            model_loss = self._c.kl_scale * div - sum(likes.values())
            model_loss /= float(self._strategy.num_replicas_in_sync)

        with tf.GradientTape() as actor_tape:
            imag_feat = self._imagine_ahead(post)
            reward = tf.cast(self._reward(imag_feat).mode(), 'float')  # cast: to address the output of bernoulli
            if self._c.pcont:
                pcont = self._pcont(imag_feat).mean()
            else:
                pcont = self._c.discount * tf.ones_like(reward)
            value = self._value(imag_feat).mode()
            returns = tools.lambda_return(
                reward[:-1], value[:-1], pcont[:-1],
                bootstrap=value[-1], lambda_=self._c.disclam, axis=0)
            discount = tf.stop_gradient(tf.math.cumprod(tf.concat(
                [tf.ones_like(pcont[:1]), pcont[:-2]], 0), 0))
            actor_loss = -tf.reduce_mean(discount * returns)
            actor_loss /= float(self._strategy.num_replicas_in_sync)

        with tf.GradientTape() as value_tape:
            value_pred = self._value(imag_feat)[:-1]
            target = tf.stop_gradient(returns)
            value_loss = -tf.reduce_mean(discount * value_pred.log_prob(target))
            value_loss /= float(self._strategy.num_replicas_in_sync)

        model_norm = self._model_opt(model_tape, model_loss)
        actor_norm = self._actor_opt(actor_tape, actor_loss)
        value_norm = self._value_opt(value_tape, value_loss)

        if tf.distribute.get_replica_context().replica_id_in_sync_group == 0:
            if self._c.log_scalars:
                self._scalar_summaries(
                    data, feat, prior_dist, post_dist, likes, div,
                    model_loss, value_loss, actor_loss, model_norm, value_norm,
                    actor_norm)
            if tf.equal(log_images, True):
                self._image_summaries(data, embed, image_pred)
                self._reward_summaries(data, reward_pred)
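The actor loss above depends on tools.lambda_return. A NumPy sketch of the recursion it is expected to compute (illustrative only, ignoring batching, gradients, and the axis argument):

import numpy as np

def lambda_return_np(reward, value, pcont, bootstrap, lambda_):
    """returns[t] = reward[t] + pcont[t] * ((1 - lambda) * value[t+1] + lambda * returns[t+1])."""
    next_value = np.concatenate([value[1:], bootstrap[None]], axis=0)
    returns = np.zeros_like(reward)
    last = bootstrap
    for t in reversed(range(len(reward))):
        last = reward[t] + pcont[t] * ((1 - lambda_) * next_value[t] + lambda_ * last)
        returns[t] = last
    return returns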
Example #10
def define_config():
  config = tools.AttrDict()
  # General.
  config.logdir = pathlib.Path('./logdir/atari_Krull_dreamer/')
  config.seed = 0
  config.steps = 2e7
  config.eval_every = 1e5
  config.log_every = 1e3
  config.log_scalars = True
  config.log_images = True
  config.gpu_growth = True   # True
  config.precision = 16
  # Environment.
  config.task = 'atari_Krull'
  config.envs = 1               # server-40 2*8
  config.parallel = 'thread'      # none thread process
  config.action_repeat = 4      # atari 4, mujoco 4
  config.time_limit = 27000     # atari 27000,mujoco 1000
  config.prefill = 5000
  config.eval_noise = 0.001     # atari 0.001,mujoco 0.0
  config.clip_rewards = 'tanh'  # atari tanh, mujoco none
  # Model.
  config.deter_size = 200   # model_size/hidden_size
  config.stoch_size = 30    # 30
  config.num_units = 400    # 400
  config.dense_act = 'elu'
  config.cnn_act = 'relu'   #
  config.cnn_depth = 32     # 32
  config.pcont = True       # atari True, mujoco False (what is this?) # continuous?
  config.free_nats = 3.0    #
  config.kl_scale = 0.1     # atari 0.1 mujoco 1.0
  config.pcont_scale = 0.99 # 10.0
  config.weight_decay = 0.0
  config.weight_decay_pattern = r'.*'
  # Training.
  config.batch_size = 50
  config.batch_length = 50
  config.train_every = 1000
  config.train_steps = 100
  config.pretrain = 100
  config.model_lr = 6e-4
  config.value_lr = 8e-5
  config.actor_lr = 8e-5
  config.grad_clip = 100.0
  config.dataset_balance = False
  # Behavior.
  config.discount = 1.0         # atari 1.0, mujoco 0.99
  config.disclam = 0.95         # if nonzero: Monte Carlo discounted return
  config.horizon = 10           # atari 10, mujoco 15
  config.action_dist = 'onehot' # atari onehot, mujoco tanh_normal
  config.action_init_std = 5.0  # 5.0
  config.expl = 'epsilon_greedy'# exploration : atari epsilon_greedy, mujoco epsilon_greedy
  config.expl_amount = 0.4      # atari 0.4, mujoco 0.3
  config.expl_decay = 2e6       # atari 0.0, mujoco 0.0
  config.expl_min = 0.25         # atari 0.1 in paper/0.01, mujoco 0.0 
  return config
Example #11
    def _train(self, data, log_images):
        with tf.GradientTape() as model_tape:
            embed = self._encode(data)
            post, prior = self._dynamics.observe(embed, data['action'])
            feat = self._dynamics.get_feat(post)
            image_pred = self._decode(feat)
            reward_pred = self._reward(feat)
            likes = tools.AttrDict()
            likes.image = tf.reduce_mean(image_pred.log_prob(data['laser']))
            likes.reward = tf.reduce_mean(reward_pred.log_prob(data['reward']))
            if self._c.pcont:
                pcont_pred = self._pcont(feat)
                pcont_target = self._c.discount * data['discount']
                likes.pcont = tf.reduce_mean(pcont_pred.log_prob(pcont_target))
                likes.pcont *= self._c.pcont_scale
            prior_dist = self._dynamics.get_dist(prior)
            post_dist = self._dynamics.get_dist(post)
            div = tf.reduce_mean(tfd.kl_divergence(post_dist, prior_dist))
            div = tf.maximum(div, self._c.free_nats)
            model_loss = self._c.kl_scale * div - sum(likes.values())

        with tf.GradientTape() as actor_tape:
            imag_feat = self._imagine_ahead(post)
            reward = self._reward(imag_feat).mode()
            if self._c.pcont:
                pcont = self._pcont(imag_feat).mean()
            else:
                pcont = self._c.discount * tf.ones_like(reward)
            value = self._value(imag_feat).mode()
            returns = tools.lambda_return(reward[:-1],
                                          value[:-1],
                                          pcont[:-1],
                                          bootstrap=value[-1],
                                          lambda_=self._c.disclam,
                                          axis=0)
            discount = tf.stop_gradient(
                tf.math.cumprod(
                    tf.concat([tf.ones_like(pcont[:1]), pcont[:-2]], 0), 0))
            actor_loss = -tf.reduce_mean(discount * returns)

        with tf.GradientTape() as value_tape:
            value_pred = self._value(imag_feat)[:-1]
            target = tf.stop_gradient(returns)
            value_loss = -tf.reduce_mean(
                discount * value_pred.log_prob(target))

        model_norm = self._model_opt(model_tape, model_loss)
        actor_norm = self._actor_opt(actor_tape, actor_loss)
        value_norm = self._value_opt(value_tape, value_loss)

        if self._c.log_scalars:
            self._scalar_summaries(data, feat, prior_dist, post_dist, likes,
                                   div, model_loss, value_loss, actor_loss,
                                   model_norm, value_norm, actor_norm)
Example #12
    def _model_train_step(self, data, prefix='train'):
        with tf.GradientTape() as model_tape:
            embed = self._encode(data)
            post, prior = self._dynamics.observe(embed, data['action'])
            feat = self._dynamics.get_feat(post)
            image_pred = self._decode(feat)
            reward_pred = self._reward(feat)
            likes = tools.AttrDict()
            likes.image = tf.reduce_mean(
                tf.boolean_mask(image_pred.log_prob(data['image']),
                                data['mask']))
            likes.reward = tf.reduce_mean(
                tf.boolean_mask(reward_pred.log_prob(data['reward']),
                                data['mask']))
            if self._c.pcont:
                pcont_pred = self._pcont(feat)
                pcont_target = data['terminal']
                likes.pcont = tf.reduce_mean(
                    tf.boolean_mask(pcont_pred.log_prob(pcont_target),
                                    data['mask']))
                likes.pcont *= self._c.pcont_scale

            for key in prior.keys():
                prior[key] = tf.boolean_mask(prior[key], data['mask'])
                post[key] = tf.boolean_mask(post[key], data['mask'])

            prior_dist = self._dynamics.get_dist(prior)
            post_dist = self._dynamics.get_dist(post)
            div = tf.reduce_mean(tfd.kl_divergence(post_dist, prior_dist))
            model_loss = self._c.kl_scale * div - sum(likes.values())

        if prefix == 'train':
            model_norm = self._model_opt(model_tape, model_loss)
            self._model_step += 1

        if self._model_step % self._c.log_every == 0:
            self._image_summaries(data, embed, image_pred, self._model_step,
                                  prefix)
            model_summaries = dict()
            model_summaries['model_train/KL Divergence'] = tf.reduce_mean(div)
            model_summaries['model_train/image_recon'] = tf.reduce_mean(
                likes.image)
            model_summaries['model_train/reward_recon'] = tf.reduce_mean(
                likes.reward)
            model_summaries['model_train/model_loss'] = tf.reduce_mean(
                model_loss)
            if prefix == 'train':
                model_summaries['model_train/model_norm'] = tf.reduce_mean(
                    model_norm)
            if self._c.pcont:
                model_summaries['model_train/terminal_recon'] = tf.reduce_mean(
                    likes.pcont)
            self._write_summaries(model_summaries, self._model_step)
Example #13
def main(_):
    """Create or load configuration and launch the trainer."""
    utility.set_up_logging()
    if not FLAGS.config:
        raise KeyError('You must specify a configuration.')
    logdir = FLAGS.logdir and os.path.expanduser(
        os.path.join(FLAGS.logdir, '{}-{}'.format(FLAGS.timestamp,
                                                  FLAGS.config)))
    try:
        config = utility.load_config(logdir)
    except IOError:
        config = tools.AttrDict(getattr(configs, FLAGS.config)())
        config = utility.save_config(config, logdir)
    for score in train(config, FLAGS.env_processes):
        tf.logging.info('Score {}.'.format(score))
Example #14
    def _train(self, data, log_images):
        '''
        Defines the RL algorithm:

        the world model: does observation
        the actor net: does imagination
        the critic net: does imagination
        '''

        with tf.GradientTape() as model_tape:
            '''
            the world model, which is _dynamics(RSSM)
            '''
            # data: {'action': shape=(25, 50, 4) float16, 'reward':shape=(25, 50) float16,
            #'discount': shape=(25, 50)float16, 'image': shape=(25, 50, 64, 64, 3) float16>}
            # 25: batch_size/num of GPU, 50:batch_length
            embed = self._encode(data)  # (25, 50, 1024)
            post, prior = self._dynamics.observe(
                embed, data['action']
            )  # world model try to dream from first step to last step.
            # post: post['meant'].shape: (25, 50, 30)

            feat = self._dynamics.get_feat(
                post)  # feat: (25, batch_length, 230)

            image_pred = self._decode(
                feat)  # image_pred.sample(): (25, batch_length, 64, 64, 3)
            reward_pred = self._reward(
                feat)  # reward_pred.sample(): (25, batch_length)

            likes = tools.AttrDict(
            )  # collect the likelihoods (log prob of actually observed events)

            likes.image = tf.reduce_mean(image_pred.log_prob(data['image']))

            likes.reward = tf.reduce_mean(
                reward_pred.log_prob(data['reward'])
            )  #  data['reward'].shape: (25, 50)  => log_prob each step : (25, 50) scalar =>  likes.reward(mean of logprob) : () scalar

            if self._c.pcont:  # in my view, this lets the model learn by itself which steps to focus on.
                pcont_pred = self._pcont(feat)
                pcont_target = self._c.discount * data['discount']
                likes.pcont = tf.reduce_mean(pcont_pred.log_prob(pcont_target))
                likes.pcont *= self._c.pcont_scale
            prior_dist = self._dynamics.get_dist(prior)
            post_dist = self._dynamics.get_dist(post)
            div = tf.reduce_mean(tfd.kl_divergence(post_dist, prior_dist))
            div = tf.maximum(div, self._c.free_nats)
            '''
            the model loss is exactly the VAE-style loss of the world model (which acts as a VAE-like sample generator)
            '''
            model_loss = self._c.kl_scale * div - sum(likes.values(
            ))  # likes.values() contains the log prob of image and reward

            model_loss /= float(self._strategy.num_replicas_in_sync)
        '''
        dreamer
        '''

        # with tf.GradientTape() as actor_tape:

        #     imag_feat = self._imagine_ahead(post) # scanning to get prior for each previous state, step(policy&world model) for horizon(15) steps
        #     print("imag_feat:",imag_feat.shape) # (15, 1225, 230)
        #     reward = self._reward(imag_feat).mode() # get reward for every step # (15, 1225)
        #     print("reward:",reward)
        #     if self._c.pcont:
        #         pcont = self._pcont(imag_feat).mean()
        #     else:
        #         pcont = self._c.discount * tf.ones_like(reward)
        #     value = self._value(imag_feat).mode() # (15, 1250), 15 is horizon, 1250 is batch_length*batch_size/num of GPUs.

        #     returns = tools.lambda_return(
        #         reward[:-1], value[:-1], pcont[:-1],
        #         bootstrap=value[-1], lambda_=self._c.disclam, axis=0) # an exponentially-weighted average of the estimates V for different k to balance bias and variance
        #     # print("returns: ",returns) # (14, 1225)

        #     discount = tf.stop_gradient(tf.math.cumprod(tf.concat(
        #         [tf.ones_like(pcont[:1]), pcont[:-2]], 0), 0)) # not to effect the world model
        #     print("discount:",discount.shape)
        #     actor_loss = -tf.reduce_mean(discount * returns) # !!!!! not using policy gradient !!!! directly maximize the return
        #     actor_loss /= float(self._strategy.num_replicas_in_sync)

        # with tf.GradientTape() as value_tape:
        #     value_pred = self._value(imag_feat)[:-1]
        #     target = tf.stop_gradient(returns)
        #     print("target:",target.shape) # (14, 1225)
        #     print("value_pred.log_prob(target).shape:",value_pred.log_prob(target).shape) # (14, 1225)
        #     value_loss = -tf.reduce_mean(discount * value_pred.log_prob(target)) # to directly predict the return; the gradient does not affect the world model
        #     value_loss /= float(self._strategy.num_replicas_in_sync)
        '''
        A2C
        '''

        with tf.GradientTape() as actor_tape:
            # imagine ahead to get state, action, reward up to the imagination horizon
            # imag_feat: (15, 1250, 230)
            # also, revise the image ahead func and img_step func to get the action it take
            # imag_act: (15, 1250, number of action) => (15, 25, 50, number of action)

            imag_feat, imagine_action = self._imagine_ahead_and_get_action(
                post
            )  # scanning to get prior for each previous state, step(policy&world model) for horizon(15) steps
            #  print("imagine_action:",imagine_action) # (15, 1225, number of action)
            if self._c.pcont:
                reduce_batch_length = self._c.batch_length - 1  # do not take the last one
                reduce_horizon = self._c.horizon - 1
            else:
                reduce_batch_length = self._c.batch_length
                reduce_horizon = self._c.horizon - 1

            imagine_action = tf.reshape(
                imagine_action,
                [self._c.horizon, -1, reduce_batch_length, self._actdim])
            # print("imagine_action:",imagine_action) # (15, 25, 49 or 50, number of action)
            imagine_action = imagine_action[:, :, :reduce_batch_length -
                                            10, :]  # for td
            argmax_imagine_action = tf.argmax(imagine_action, -1)
            # one_hot_imagine_action = tf.one_hot(tf.argmax(imagine_action,-1),self._actdim)
            # print("imagine_action:",imagine_action.shape) # (15, 25, 39, 4)
            # print("one_hot_imagine_action:",one_hot_imagine_action.shape) # (15, 25, 39, 4)

            # Preprocess reward for actor and critic. The sliding window size decides TD-N (10).
            # (15, 25, 50, 230) => (15, 25, 50, 1) => (15, 25, 50)
            # sliding for window 10: (15, 25, 50) =slidesum=> (15, 25, 40)
            # # first step: advantage, first and after: model-based(planning) advantage
            # # imagine reward first step (15, 25, 50)
            # discount (14, 1225) => (14,25,50,1) => (14,25,39,1)

            reward = self._reward(
                imag_feat).mode()  # get reward for every step # (15, 1250)
            reward = tf.reshape(reward,
                                [self._c.horizon, -1, reduce_batch_length, 1])
            dim5_reward = tf.expand_dims(reward, -1)
            sum_reward = tf.extract_volume_patches(
                dim5_reward, [1, 1, 10, 1, 1], [1, 1, 1, 1, 1],
                "VALID")  # need to be dimension 5
            sum_reward = tf.reduce_sum(sum_reward, -1)  # (15, 25, 40, 1)

            if self._c.pcont:
                pcont = self._pcont(imag_feat).mean()
            else:
                pcont = self._c.discount * tf.ones_like(reward)

            discount = tf.math.cumprod(
                tf.concat([tf.ones_like(pcont[:1]), pcont[:-2]], 0), 0
            )  # for A2C: if we want to learn pcont, do not stop the gradient
            print("discount:", discount.shape)  # (14, 1225)
            discount = tf.reshape(discount,
                                  [reduce_horizon, -1, reduce_batch_length, 1])
            discount = discount[:, :, :reduce_batch_length - 10, :]
            print("discount:", discount.shape)  # discount: (14, 25, 39, 1)

            # value prediction: this value function predicts the current value up to TD-N (not the sum to the end of the imagination horizon)
            # (15, 25, 50, 230) => (15, 25, 50)
            # (15, 25, [0:40]) => (15, 25, 40) st
            # (15, 25, [10:50]) => (15, 25, 40) st+1
            # reward(15, 25, [0:40]) + (value prediction st(15,25, 40) - st+1(15, 25, 40)) => (15, 25, 40) # get advantage
            # stop gedient(15,25,40)

            value = self._value(imag_feat).mode(
            )  # (15, 1250), 15 is horizon, 1250 is batch_length*batch_size/num of GPUs.
            value = tf.reshape(value,
                               [self._c.horizon, -1, reduce_batch_length, 1
                                ])  # (15, 1250 or 1245) => [15,-1,50 or 49,1]
            st_value = value[:, :, :reduce_batch_length - 10]
            stp1_value = value[:, :, 1:1 + reduce_batch_length - 10]
            print("st_value:",
                  st_value.shape)  # st_value: (15, 25, 40 or 39, 1)
            print("stp1_value:",
                  stp1_value.shape)  # stp1_value: (15, 25, 40 or 39, 1)

            # advantage actor-critic policy gradient
            # action(15, 25, [0:40]) * advantage(15, 25, 40) => (15, 25, 40)
            # reduce mean(15, 25, 40)
            if self._c.pcont:
                sum_reward = sum_reward[:, :, :reduce_batch_length -
                                        10, :]  # (15, 25, 39, 1)

            advantage = sum_reward + st_value - stp1_value  # (15, 25, 39, 1)
            advantage = tf.stop_gradient(advantage)  # update only actor

            print("imagine_action:", imagine_action.shape)  # (15, 25, 39, 4)
            print("argmax_imagine_action:",
                  argmax_imagine_action.shape)  # (15, 25, 39, 4)

            policy_gradient = tf.keras.losses.sparse_categorical_crossentropy(
                argmax_imagine_action, imagine_action, from_logits=False)
            print("policy_gradient:", policy_gradient.shape)  # (15, 25, 39)
            policy_gradient = tf.expand_dims(policy_gradient, -1) * advantage
            print("policy_gradient:", policy_gradient.shape)  # (15, 25, 39, 1)
            policy_gradient = policy_gradient[:-1] * discount  # (14, 25, 39, 1)*(14, 25, 39, 1)

            actor_loss = tf.reduce_mean(policy_gradient)
            actor_loss /= float(self._strategy.num_replicas_in_sync)

        with tf.GradientTape() as value_tape:

            # value loss
            # (15, 25, 40)st
            # slide reward: (15, 25, 40)
            # reduce_mean(l2((15, 25, 40),(15, 25, 40))
            value = self._value(imag_feat).mode(
            )  # (15, 1250), 15 is horizon, 1250 is batch_length*batch_size/num of GPUs.
            value = tf.reshape(value,
                               [self._c.horizon, -1, reduce_batch_length, 1
                                ])  # (15, 1250 or 1245) => [15,-1,50 or 49,1]
            st_value = value[:, :, :reduce_batch_length - 10]

            value_MSE = tf.keras.losses.MSE(tf.stop_gradient(sum_reward),
                                            st_value)
            print("value_MSE:", value_MSE.shape)
            value_MSE = tf.expand_dims(value_MSE[:-1], -1) * discount

            value_loss = tf.reduce_mean(value_MSE)
            value_loss /= float(self._strategy.num_replicas_in_sync)

        model_norm = self._model_opt(model_tape, model_loss)
        actor_norm = self._actor_opt(actor_tape, actor_loss)
        value_norm = self._value_opt(value_tape, value_loss)

        if tf.distribute.get_replica_context().replica_id_in_sync_group == 0:
            if self._c.log_scalars:
                self._scalar_summaries(data, feat, prior_dist, post_dist,
                                       likes, div, model_loss, value_loss,
                                       actor_loss, model_norm, value_norm,
                                       actor_norm)
            if tf.equal(log_images, True):
                self._image_summaries(data, embed, image_pred)
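The TD-N reward sum above is built with tf.extract_volume_patches over a 5-D tensor. For clarity, the same sliding-window sum over the length axis can be written as a NumPy sketch (window size 10, as hard-coded in the example):

import numpy as np

def sliding_reward_sum(reward, window=10):
    """reward: (horizon, batch, length). Sum over each window of `window` consecutive steps."""
    cumsum = np.cumsum(reward, axis=-1)
    cumsum = np.concatenate([np.zeros_like(cumsum[..., :1]), cumsum], axis=-1)
    return cumsum[..., window:] - cumsum[..., :-window]   # (horizon, batch, length - window + 1)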
Example #15
    def _train(self, data, log_images):
        with tf.GradientTape() as model_tape:
            embed = self._encode(data)
            post, prior = self._dynamics.observe(embed, data["action"])
            feat = self._dynamics.get_feat(post)
            image_pred = self._decode(feat)
            reward_pred = self._reward(feat)

            likes = tools.AttrDict()
            likes.image = tf.reduce_mean(image_pred.log_prob(data["image"]))

            ######################################################################
            # RE3: + intrinsic rewards
            rand_embed_ = tf.stop_gradient(self._rand_encode(data))
            rand_embed = tf.reshape(rand_embed_, [-1, 50])
            dist = tf.norm(rand_embed[:, None, :] - rand_embed[None, :, :], axis=-1)
            int_reward = -1.0 * tf.math.top_k(-dist, k=self._c.k).values[:, -1]
            norm_int_reward = self._rms(int_reward)
            norm_int_reward = tf.reshape(norm_int_reward, tf.shape(rand_embed_)[:-1])
            likes.reward = tf.reduce_mean(
                reward_pred.log_prob(data["reward"] + self._c.beta * norm_int_reward)
            )
            ######################################################################

            if self._c.pcont:
                pcont_pred = self._pcont(feat)
                pcont_target = self._c.discount * data["discount"]
                likes.pcont = tf.reduce_mean(pcont_pred.log_prob(pcont_target))
                likes.pcont *= self._c.pcont_scale
            prior_dist = self._dynamics.get_dist(prior)
            post_dist = self._dynamics.get_dist(post)
            div = tf.reduce_mean(tfd.kl_divergence(post_dist, prior_dist))
            div = tf.maximum(div, self._c.free_nats)
            model_loss = self._c.kl_scale * div - sum(likes.values())
            model_loss /= float(self._strategy.num_replicas_in_sync)

        with tf.GradientTape() as actor_tape:
            imag_feat = self._imagine_ahead(post)
            reward = self._reward(imag_feat).mode()
            if self._c.pcont:
                pcont = self._pcont(imag_feat).mean()
            else:
                pcont = self._c.discount * tf.ones_like(reward)
            value = self._value(imag_feat).mode()
            returns = tools.lambda_return(
                reward[:-1],
                value[:-1],
                pcont[:-1],
                bootstrap=value[-1],
                lambda_=self._c.disclam,
                axis=0,
            )
            discount = tf.stop_gradient(
                tf.math.cumprod(tf.concat([tf.ones_like(pcont[:1]), pcont[:-2]], 0), 0)
            )
            actor_loss = -tf.reduce_mean(discount * returns)
            actor_loss /= float(self._strategy.num_replicas_in_sync)

        with tf.GradientTape() as value_tape:
            value_pred = self._value(imag_feat)[:-1]
            target = tf.stop_gradient(returns)
            value_loss = -tf.reduce_mean(discount * value_pred.log_prob(target))
            value_loss /= float(self._strategy.num_replicas_in_sync)

        model_norm = self._model_opt(model_tape, model_loss)
        actor_norm = self._actor_opt(actor_tape, actor_loss)
        value_norm = self._value_opt(value_tape, value_loss)

        if tf.distribute.get_replica_context().replica_id_in_sync_group == 0:
            if self._c.log_scalars:
                self._scalar_summaries(
                    data,
                    feat,
                    prior_dist,
                    post_dist,
                    likes,
                    div,
                    model_loss,
                    value_loss,
                    actor_loss,
                    model_norm,
                    value_norm,
                    actor_norm,
                    int_reward,
                    norm_int_reward,
                )
            if tf.equal(log_images, True):
                self._image_summaries(data, embed, image_pred)
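The RE3 block above rewards states that are far from their neighbours in the random-encoder embedding space. A NumPy sketch of that intrinsic reward (distance to the k-th entry of each sorted row of the pairwise-distance matrix, counting the point itself; the running-mean normalisation done by self._rms is omitted):

import numpy as np

def knn_intrinsic_reward(embed, k):
    """embed: (N, D) random-encoder features. Returns (N,) k-NN distances."""
    dist = np.linalg.norm(embed[:, None, :] - embed[None, :, :], axis=-1)  # (N, N)
    # Sorted row index 0 is the point itself (distance 0); index k - 1 matches
    # the top_k(-dist, k).values[:, -1] expression used in the example.
    return np.sort(dist, axis=-1)[:, k - 1]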
Example #16
    def _train(self, data, log_images):
        with tf.GradientTape() as model_tape:
            embed = self._encode(data)
            post, prior = self._dynamics.observe(embed, data['action'])
            feat = self._dynamics.get_feat(post)
            image_pred = self._decode(feat)
            reward_pred = self._reward(feat)
            likes = tools.AttrDict()
            likes.image = tf.reduce_mean(image_pred.log_prob(data['image']))
            likes.reward = tf.reduce_mean(reward_pred.log_prob(data['reward']))
            if self._c.pcont:
                pcont_pred = self._pcont(feat)
                pcont_target = self._c.discount * data['discount']
                likes.pcont = tf.reduce_mean(pcont_pred.log_prob(pcont_target))
                likes.pcont *= self._c.pcont_scale
            prior_dist = self._dynamics.get_dist(prior)
            post_dist = self._dynamics.get_dist(post)
            div = tf.reduce_mean(tfd.kl_divergence(post_dist, prior_dist))
            div = tf.maximum(div, self._c.free_nats)
            model_loss = self._c.kl_scale * div - sum(likes.values())
            model_loss /= float(self._strategy.num_replicas_in_sync)


        with tf.GradientTape() as actor_tape:
            alset = []
            alset_re = []
            imag_feat, prob_traj = self._imagine_ahead(post)
            prob_traj = tf.reduce_sum(prob_traj, 0, keepdims=True)
            prob_traj = (prob_traj - tf.reduce_mean(prob_traj)) / (
                tf.math.reduce_std(prob_traj) + 1e-9) + 1
            prob_traj = tf.clip_by_value(
                prob_traj, 1 - self._c.reweight_clip, 1 + self._c.reweight_clip)

            entropy = tf.reduce_mean(self._actor(tf.stop_gradient(feat)).entropy())

            tlikes = tools.AttrDict()
            tlikes.reward = reward_pred.log_prob(data['reward'])
            reward = self._reward(imag_feat).mode()

            if self._c.pcont:
                pcont = self._pcont(imag_feat).mean()
            else:
                pcont = self._c.discount * tf.ones_like(reward)

            value = self._value(imag_feat).mode()

            returns = tools.lambda_return(
                reward[:-1], value[:-1], pcont[:-1],
                bootstrap=value[-1], lambda_=self._c.disclam, axis=0)
            discount = tf.stop_gradient(tf.math.cumprod(tf.concat(
                [tf.ones_like(pcont[:1]), pcont[:-2]], 0), 0))
            alset.append(-tf.reduce_mean(discount * returns))
            alset_re.append(-tf.reduce_mean(tf.stop_gradient(prob_traj)* discount * returns))
            actor_loss = tf.reduce_mean(tf.stack(alset_re)) - self._c.ent_alpha * entropy

            actor_loss /= float(self._strategy.num_replicas_in_sync)

        with tf.GradientTape() as value_tape:
            value_pred = self._value(imag_feat)[:-1]
            target = tf.stop_gradient(returns)
            value_loss = -tf.reduce_mean(prob_traj * discount * value_pred.log_prob(target))
            value_loss /= float(self._strategy.num_replicas_in_sync)

        model_norm = self._model_opt(model_tape, model_loss)
        actor_norm = self._actor_opt(actor_tape, actor_loss)
        value_norm = self._value_opt(value_tape, value_loss)
        if tf.distribute.get_replica_context().replica_id_in_sync_group == 0:
            if self._c.log_scalars:
                self._scalar_summaries(
                    data, feat, prior_dist, post_dist, likes, div,
                    model_loss, value_loss, actor_loss, entropy, model_norm, value_norm,
                    actor_norm)
            if tf.equal(log_images, True):
                self._image_summaries(data, embed, image_pred)
Example #17
    def _train(self, data, log_images, init_horizon, imagine_depth):
        with tf.GradientTape() as model_tape:
            embed = self._encode(data)
            post, prior = self._dynamics.observe(embed, data['action'])
            feat = self._dynamics.get_feat(post)
            image_pred = self._decode(feat)
            reward_pred = self._reward(feat)
            likes = tools.AttrDict()
            likes.image = tf.reduce_mean(image_pred.log_prob(data['image']))
            likes.reward = tf.reduce_mean(reward_pred.log_prob(data['reward']))
            if self._c.pcont:
                pcont_pred = self._pcont(feat)
                pcont_target = self._c.discount * data['discount']
                likes.pcont = tf.reduce_mean(pcont_pred.log_prob(pcont_target))
                likes.pcont *= self._c.pcont_scale
            prior_dist = self._dynamics.get_dist(prior)
            post_dist = self._dynamics.get_dist(post)
            div = tf.reduce_mean(tfd.kl_divergence(post_dist, prior_dist))
            div = tf.maximum(div, self._c.free_nats)
            model_loss = self._c.kl_scale * div - sum(likes.values())
            model_loss /= float(self._strategy.num_replicas_in_sync)

        with tf.GradientTape() as actor_tape:
            flatten = lambda x: tf.reshape(x, [-1] + list(x.shape[2:]))
            discount = None
            imag_feats = []
            returns_lst = []
            discounts = []
            actor_loss = 0.0
            horizon = init_horizon
            for depth in range(imagine_depth):
                if self._c.pcont:  # Last step could be terminal.
                    post = {k: v[:, :-1] for k, v in post.items()}
                post = {k: flatten(v) for k, v in post.items()}
                if depth != 0:
                    post = {
                        k: tf.stop_gradient(
                            tf.gather(v, indices=max_indexes, axis=0))
                        for k, v in post.items()
                    }
                imag_feat, post = self._imagine_ahead(post, horizon)
                tf.print("Imagination Features:", tf.shape(imag_feat))
                reward = self._reward(imag_feat).mode()
                if self._c.pcont:
                    pcont = self._pcont(imag_feat).mean()
                else:
                    pcont = self._c.discount * tf.ones_like(reward)
                value = self._value(imag_feat).mode()
                returns = tools.lambda_return(reward[:-1],
                                              value[:-1],
                                              pcont[:-1],
                                              bootstrap=value[-1],
                                              lambda_=self._c.disclam,
                                              axis=0)

                discount = tf.stop_gradient(
                    tf.math.cumprod(
                        tf.concat([tf.ones_like(pcont[:1]), pcont[:-2]], 0),
                        0))

                if depth != imagine_depth - 1:
                    if self._c.branch_type == "reward":
                        flat_reward = flatten(reward)
                        max_indexes = tf.math.top_k(
                            flat_reward,
                            k=int(2500 / self._strategy.num_replicas_in_sync),
                            sorted=False)[1]
                    elif self._c.branch_type == "uniform":
                        flat_reward = flatten(reward)
                        max_indexes = tf.random.uniform(
                            [int(2500 / self._strategy.num_replicas_in_sync)],
                            minval=0,
                            maxval=flat_reward.shape[0],
                            dtype=tf.int32)
                    elif self._c.branch_type == "value":
                        flat_value = flatten(value)
                        max_indexes = tf.math.top_k(
                            flat_value,
                            k=int(2500 / self._strategy.num_replicas_in_sync),
                            sorted=False)[1]

                horizon = int(horizon * self._c.imagine_decay)

                imag_feats.append(imag_feat)
                returns_lst.append(returns)
                discounts.append(discount)
                actor_loss += -tf.reduce_mean(discount * returns)

            actor_loss /= float(self._strategy.num_replicas_in_sync *
                                imagine_depth)

        with tf.GradientTape() as value_tape:
            value_loss = 0.0
            for imag_feat, returns, discount in zip(imag_feats, returns_lst,
                                                    discounts):
                value_pred = self._value(imag_feat)[:-1]
                target = tf.stop_gradient(returns)
                value_loss += -tf.reduce_mean(
                    discount * value_pred.log_prob(target))
            value_loss /= float(self._strategy.num_replicas_in_sync *
                                imagine_depth)

        model_norm = self._model_opt(model_tape, model_loss)
        actor_norm = self._actor_opt(actor_tape, actor_loss)
        value_norm = self._value_opt(value_tape, value_loss)

        if tf.distribute.get_replica_context().replica_id_in_sync_group == 0:
            if self._c.log_scalars:
                self._scalar_summaries(data, feat, prior_dist, post_dist,
                                       likes, div, model_loss, value_loss,
                                       actor_loss, model_norm, value_norm,
                                       actor_norm)
            if tf.equal(log_images, True):
                self._image_summaries(data, embed, image_pred)
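Between imagination depths, the example selects the best flattened states (by reward, value, or uniformly) and restarts imagination from them. A compact helper expressing that selection step, with shapes following the example's flattened layout (illustrative, not the repository's code):

import tensorflow as tf

def select_branch_states(post, scores, k):
    """post: dict of (N, ...) state tensors; scores: (N,). Keep the k best-scoring states."""
    indices = tf.math.top_k(scores, k=k, sorted=False).indices
    return {key: tf.stop_gradient(tf.gather(value, indices, axis=0))
            for key, value in post.items()}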
Example #18
def define_config():
    config = tools.AttrDict()
    # General.
    config.logdir = pathlib.Path('.')
    config.seed = 0
    config.steps = 5e6
    config.eval_every = 1e4
    config.log_every = 1e3
    config.log_scalars = True
    config.log_images = True
    config.gpu_growth = True
    config.precision = 16
    # Environment.
    config.task = 'gridworld_water'
    config.envs = 1
    config.parallel = 'none'
    config.action_repeat = 1
    config.time_limit = 200
    config.prefill = 5000
    config.eval_noise = 0.05
    config.clip_rewards = 'none'
    # Model.
    # config.deter_size = 200
    # config.stoch_size = 30
    # config.num_units = 400
    config.deter_size = 16
    config.stoch_size = 16
    config.num_units = 256
    config.dense_act = 'elu'
    config.cnn_act = 'relu'
    config.cnn_depth = 32
    config.pcont = True
    config.free_nats = 3.0
    config.kl_scale = 2.0  #NOTE:TUNE
    config.pcont_scale = 10.0
    config.weight_decay = 0.0
    config.weight_decay_pattern = r'.*'
    # Training.
    config.batch_size = 32
    config.batch_length = 8
    config.train_every = 1000
    config.train_steps = 100
    config.pretrain = 100
    config.model_lr = 6e-4
    config.value_lr = 8e-5
    config.actor_lr = 8e-5
    config.grad_clip = 100.0
    # config.dataset_balance = False
    config.dataset_balance = True
    # Behavior.
    config.discount = 0.99
    config.disclam = 0.95
    config.horizon = 15
    config.action_init_std = 5.0

    # config.action_dist = 'tanh_normal'
    # config.expl = 'additive_gaussian'
    # config.expl_amount = 0.3

    config.action_dist = 'onehot'
    # config.action_dist = 'gumbel'
    config.expl = 'epsilon_greedy'
    config.expl_amount = 0.3

    config.expl_decay = 1e5
    config.expl_min = 0.05

    # add by haohu
    config.cpc = False
    config.cpc_num_units = 64
    config.cpc_latent_size = config.stoch_size + config.deter_size
    config.cpc_num_layers = 3
    config.cpc_contrast = 'window'
    config.cpc_batch_amount = 8
    config.cpc_time_amount = 4

    return config
Example #19
def define_config():
    config = tools.AttrDict()
    # General.
    config.logdir = pathlib.Path('.')
    config.seed = 0
    config.steps = 2e6
    config.eval_every = 1e4
    config.log_every = 1e3
    config.log_scalars = True
    config.log_images = True
    config.gpu_growth = True
    config.precision = 32
    # Environment.
    config.task = 'dmc_cup_catch'
    config.envs = 1
    config.parallel = 'none'
    config.action_repeat = 2
    config.time_limit = 1000
    config.prefill = 5000
    config.eval_noise = 0.0
    config.clip_rewards = 'none'
    # Model.
    config.deter_size = 200
    config.stoch_size = 30
    config.num_units = 400
    config.dense_act = 'elu'
    config.cnn_act = 'relu'
    config.cnn_depth = 32
    config.pcont = False
    config.free_nats = 3.0
    config.kl_scale = 1.0
    config.pcont_scale = 10.0
    config.weight_decay = 0.0
    config.weight_decay_pattern = r'.*'
    # Training.
    config.batch_size = 50
    config.batch_length = 50
    config.train_every = 1000
    config.train_steps = 100
    config.pretrain = 100
    config.model_lr = 6e-4
    config.value_lr = 8e-5
    config.actor_lr = 8e-5
    config.grad_clip = 100.0
    config.dataset_balance = False
    # Behavior.
    config.discount = 0.99
    config.disclam = 0.95
    config.horizon = 15
    config.action_dist = 'tanh_normal'
    config.action_init_std = 5.0
    config.expl = 'additive_gaussian'
    config.expl_amount = 0.3
    config.expl_decay = 0.0
    config.expl_min = 0.0
    config.id = 'debug'
    config.use_state = False

    # Sim2real transfer
    config.real_world_prob = -1  # fraction of samples trained on which are from the real world (probably involves oversampling real-world samples)
    config.sample_real_every = 2  # How often we should sample from the real world

    #these values are for testing dmc_cup_catch
    config.mass_mean = 0.2
    config.mass_range = 0.01

    return config
Example #20
def define_config():
    config = tools.AttrDict()
    # General.
    config.logdir = pathlib.Path('.')
    config.seed = 0
    config.steps = 5e6
    config.eval_every = 1e4
    config.log_every = 1e3
    config.log_scalars = True
    config.log_images = True
    config.gpu_growth = True
    config.precision = 16
    # Environment.
    config.task = 'dmc_walker_walk'
    config.envs = 1
    config.parallel = 'none'
    config.action_repeat = 2
    config.time_limit = 1000
    config.prefill = 5000
    config.eval_noise = 0.0
    config.clip_rewards = 'none'
    # Model.
    config.deter_size = 200
    config.stoch_size = 30
    config.num_units = 400
    config.dense_act = 'elu'
    config.cnn_act = 'relu'
    config.cnn_depth = 32
    config.pcont = False
    config.free_nats = 3.0
    config.kl_scale = 1.0
    config.pcont_scale = 10.0
    config.weight_decay = 0.0
    config.weight_decay_pattern = r'.*'
    # Training.
    config.batch_size = 50
    config.batch_length = 50
    config.train_every = 1000
    config.train_steps = 100
    config.pretrain = 100
    config.model_lr = 6e-4
    config.value_lr = 8e-5
    config.actor_lr = 8e-5
    config.grad_clip = 100.0
    config.dataset_balance = False
    # Behavior.
    config.discount = 0.99
    config.disclam = 0.95
    config.horizon = 15
    config.action_dist = 'tanh_normal'
    config.action_init_std = 5.0
    config.expl = 'additive_gaussian'
    config.expl_amount = 0.3
    config.expl_decay = 0.0
    config.expl_min = 0.0
    config.log_imgs = False

    # natural or not
    config.natural = False

    # obs model
    config.obs_model = 'contrastive'

    # SAC settings
    config.num_Qs = 2

    # use dreamer and SAC for hybrid actor-critic training
    config.use_sac = True
    config.use_dreamer = True

    # use trajectory optimization
    config.trajectory_opt = True
    config.traj_opt_lr = 0.003
    config.num_samples = 20
    return config
Example #21
def define_config():
    """
    Default definition of command-line arguments.
    """
    config = tools.AttrDict()
    # General.
    config.datetime = datetime.now().strftime(
        "%m-%d-%Y %H:%M:%S")  # just for logging config
    config.seed = random.randint(2, 10**6)
    config.logdir = pathlib.Path('logs/experiments')
    config.steps = 5e6
    config.eval_every = 1e4
    config.log_every = 1e3
    config.log_scalars = True
    config.log_images = True
    config.log_videos = True
    config.gpu_growth = True
    config.precision = 32
    config.obs_type = 'lidar'
    # Environment.
    config.track = 'austria'
    config.task = 'max_progress'
    config.action_repeat = 4
    config.eval_episodes = 5
    config.time_limit_train = 2000
    config.time_limit_test = 4000
    config.prefill_agent = 'gap_follower'
    config.prefill = 5000
    config.eval_noise = 0.0
    config.clip_rewards = 'none'
    config.clip_rewards_min = -1
    config.clip_rewards_max = 1
    # Model.
    config.encoded_obs_dim = 1080
    config.deter_size = 200
    config.stoch_size = 30
    config.num_units = 400
    config.reward_out_dist = 'normal'
    config.dense_act = 'elu'
    config.cnn_act = 'relu'
    config.pcont = True
    config.free_nats = 3.0
    config.kl_scale = 1.0
    config.pcont_scale = 10.0
    config.weight_decay = 0.0
    config.weight_decay_pattern = r'.*'
    # Training.
    config.batch_size = 50
    config.batch_length = 50
    config.train_every = 1000
    config.train_steps = 100
    config.pretrain = 100
    config.model_lr = 6e-4
    config.value_lr = 8e-5
    config.actor_lr = 8e-5
    config.grad_clip = 1.0
    config.dataset_balance = False
    # Behavior.
    config.discount = 0.99
    config.disclam = 0.95
    config.horizon = 15
    config.action_dist = 'tanh_normal'
    config.action_init_std = 5.0
    config.expl = 'additive_gaussian'
    config.expl_amount = 0.3
    config.expl_decay = 0.0
    config.expl_min = 0.3
    return config
Example #22
def define_config():
    config = tools.AttrDict()
    # General.
    config.logdir = pathlib.Path('.logdir')
    #config.datadir = pathlib.Path('./data/')
    config.datadir = pathlib.Path('.datadir/walker')
    config.seed = 0
    config.log_every = 1000
    config.save_every = 5000
    config.log_scalars = True
    config.log_images = True
    config.gpu_growth = True

    # Environment.
    config.task = 'dmc_walker_walk'
    config.envs = 1
    config.parallel = 'none'
    config.action_repeat = 2
    config.time_limit = 1000
    config.im_size = 64
    config.eval_noise = 0.0
    config.clip_rewards = 'none'
    config.precision = 32

    # Model.
    config.deter_size = 256
    config.stoch_size = 64
    config.num_models = 7
    config.num_units = 256
    config.proprio = False
    config.penalty_type = 'log_prob'
    config.dense_act = 'elu'
    config.cnn_act = 'relu'
    config.cnn_depth = 32
    config.pcont = False
    config.kl_scale = 1.0
    config.pcont_scale = 10.0
    config.weight_decay = 0.0
    config.weight_decay_pattern = r'.*'

    # Training.
    config.load_model = False
    config.load_agent = False
    config.load_buffer = False
    config.train_steps = 100000
    config.model_train_steps = 25000
    config.model_batch_size = 64
    config.model_batch_length = 50
    config.agent_batch_size = 256
    config.cql_samples = 16
    config.start_training = 50000
    config.agent_train_steps = 100000
    config.agent_itters_per_step = 200
    config.buffer_size = 2e6
    config.model_lr = 6e-4
    config.q_lr = 3e-4
    config.actor_lr = 3e-4
    config.grad_clip = 100.0
    config.tau = 5e-3
    config.target_update_interval = 1
    config.dataset_balance = False

    # Behavior.
    config.lmbd = 5.0
    config.alpha = 0.0
    config.sample = True
    config.discount = 0.99
    config.disclam = 0.95
    config.horizon = 5
    config.done_treshold = 0.5
    config.action_dist = 'tanh_normal'
    config.action_init_std = 5.0
    config.expl = 'additive_gaussian'
    config.expl_amount = 0.2
    config.expl_decay = 0.0
    config.expl_min = 0.0
    return config
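Each of these configs defines expl, expl_amount, expl_decay and expl_min, but the consuming code is again outside the snippet. A minimal sketch, assuming the usual Dreamer-style schedule for additive Gaussian action noise, is given below; exploration_noise is a hypothetical helper and step is the current environment step.

import tensorflow as tf

def exploration_noise(action, step, config):
    # Start from the base noise scale and decay it towards expl_min over time.
    amount = config.expl_amount
    if config.expl_decay:
        amount *= 0.5 ** (step / config.expl_decay)
    if config.expl_min:
        amount = max(amount, config.expl_min)
    if config.expl == 'additive_gaussian':
        # Perturb the action with Gaussian noise and keep it in [-1, 1].
        noise = tf.random.normal(tf.shape(action), stddev=amount)
        return tf.clip_by_value(action + noise, -1.0, 1.0)
    return action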
Example #23
    def _train(self, data, log_images):
        with tf.GradientTape() as model_tape:
            if 'success' in data:
                success_rate = tf.reduce_sum(
                    data['success']) / data['success'].shape[1]
            else:
                success_rate = tf.convert_to_tensor(-1)
            embed = self._encode(data)
            if 'state' in data:
                embed = tf.concat([data['state'], embed], axis=-1)
            post, prior = self._dynamics.observe(embed, data['action'])
            feat = self._dynamics.get_feat(post)
            image_pred = self._decode(feat)
            reward_pred = self._reward(feat)
            likes = tools.AttrDict()
            likes.image = tf.reduce_mean(image_pred.log_prob(data['image']))
            reward_obj = reward_pred.log_prob(data['reward'])

            # Mask out reward terms that came from the real-world environment.
            reward_obj = reward_obj * (1 - data['real_world'])

            likes.reward = tf.reduce_mean(reward_obj)
            if self._c.pcont:
                pcont_pred = self._pcont(feat)
                pcont_target = self._c.discount * data['discount']
                likes.pcont = tf.reduce_mean(pcont_pred.log_prob(pcont_target))
                likes.pcont *= self._c.pcont_scale
            prior_dist = self._dynamics.get_dist(prior)
            post_dist = self._dynamics.get_dist(post)
            div = tf.reduce_mean(tfd.kl_divergence(post_dist, prior_dist))
            div = tf.maximum(div, self._c.free_nats)
            model_loss = self._c.kl_scale * div - sum(likes.values())
            model_loss /= float(self._strategy.num_replicas_in_sync)

        with tf.GradientTape() as actor_tape:
            imag_feat = self._imagine_ahead(post)
            reward = self._reward(imag_feat).mode()
            if self._c.pcont:
                pcont = self._pcont(imag_feat).mean()
            else:
                pcont = self._c.discount * tf.ones_like(reward)
            value = self._value(imag_feat).mode()
            returns = tools.lambda_return(reward[:-1],
                                          value[:-1],
                                          pcont[:-1],
                                          bootstrap=value[-1],
                                          lambda_=self._c.disclam,
                                          axis=0)
            discount = tf.stop_gradient(
                tf.math.cumprod(
                    tf.concat([tf.ones_like(pcont[:1]), pcont[:-2]], 0), 0))
            actor_loss = -tf.reduce_mean(discount * returns)
            actor_loss /= float(self._strategy.num_replicas_in_sync)

        with tf.GradientTape() as value_tape:
            value_pred = self._value(imag_feat)[:-1]
            target = tf.stop_gradient(returns)
            value_loss = -tf.reduce_mean(
                discount * value_pred.log_prob(target))
            value_loss /= float(self._strategy.num_replicas_in_sync)

        model_norm = self._model_opt(model_tape, model_loss)
        actor_norm = self._actor_opt(actor_tape, actor_loss)
        value_norm = self._value_opt(value_tape, value_loss)

        if tf.distribute.get_replica_context().replica_id_in_sync_group == 0:
            if self._c.log_scalars:
                self._scalar_summaries(data, feat, prior_dist, post_dist,
                                       likes, div, model_loss, value_loss,
                                       actor_loss, model_norm, value_norm,
                                       actor_norm, success_rate)
            if tf.equal(log_images, True):
                self._image_summaries(data, embed, image_pred)
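The actor update above relies on tools.lambda_return to turn imagined rewards, values and continuation probabilities into targets. As a reference, the numpy sketch below spells out the recursion this helper is assumed to implement, R_t = r_t + pcont_t * ((1 - lambda) * V_{t+1} + lambda * R_{t+1}), bootstrapped with the final value estimate; it handles a single 1-D sequence, whereas the helper in the code operates on [horizon, batch] tensors along axis 0.

import numpy as np

def lambda_return_reference(reward, value, pcont, bootstrap, lambda_):
    # reward, value, pcont: arrays of shape [horizon]; bootstrap: scalar V_H.
    next_values = np.concatenate([value[1:], [bootstrap]])
    returns = np.zeros_like(reward, dtype=np.float64)
    last = bootstrap
    for t in reversed(range(len(reward))):
        # Blend the one-step target with the recursively computed return.
        last = reward[t] + pcont[t] * (
            (1.0 - lambda_) * next_values[t] + lambda_ * last)
        returns[t] = last
    return returns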
Example #24
    def _train(self, data, log_images):
        with tf.GradientTape() as model_tape:
            embed = self._encode(data)
            post, prior = self._dynamics.observe(embed, data['action'])
            feat = self._dynamics.get_feat(post)
            reward_pred = self._reward(feat)
            likes = tools.AttrDict()
            likes.reward = tf.reduce_mean(reward_pred.log_prob(data['reward']))

            # Both observation objectives are computed here; config.obs_model
            # below selects which one enters the loss.
            # Generative model: reconstruct the observation from the features.
            image_pred = self._decode(feat)
            # Contrastive model: compute the contrastive loss directly, as in CVRL.
            cont_loss = self._contrastive(feat, embed)

            # Contrastive or generative implementation of the observation model p(o|s).
            if self._c.obs_model == 'generative':
                likes.image = tf.reduce_mean(image_pred.log_prob(
                    data['image']))
            elif self._c.obs_model == 'contrastive':
                likes.image = tf.reduce_mean(cont_loss)

            if self._c.pcont:
                pcont_pred = self._pcont(feat)
                pcont_target = self._c.discount * data['discount']
                likes.pcont = tf.reduce_mean(pcont_pred.log_prob(pcont_target))
                likes.pcont *= self._c.pcont_scale

            prior_dist = self._dynamics.get_dist(prior)
            post_dist = self._dynamics.get_dist(post)
            div = tf.reduce_mean(tfd.kl_divergence(post_dist, prior_dist))
            div = tf.maximum(div, self._c.free_nats)
            model_loss = self._c.kl_scale * div - sum(likes.values())

        assert self._c.use_dreamer or self._c.use_sac

        if self._c.use_dreamer:
            with tf.GradientTape() as actor_tape:
                imag_feat = self._imagine_ahead(post)
                reward = self._reward(imag_feat).mode()
                if self._c.pcont:
                    pcont = self._pcont(imag_feat).mean()
                else:
                    pcont = self._c.discount * tf.ones_like(reward)
                value = self._value(imag_feat).mode()
                returns = tools.lambda_return(reward[:-1],
                                              value[:-1],
                                              pcont[:-1],
                                              bootstrap=value[-1],
                                              lambda_=self._c.disclam,
                                              axis=0)
                discount = tf.stop_gradient(
                    tf.math.cumprod(
                        tf.concat([tf.ones_like(pcont[:1]), pcont[:-2]], 0),
                        0))
                actor_loss = -tf.reduce_mean(discount * returns)

            with tf.GradientTape() as value_tape:
                value_pred = self._value(imag_feat)[:-1]
                target = tf.stop_gradient(returns)
                value_loss = -tf.reduce_mean(
                    discount * value_pred.log_prob(target))

            actor_norm = self._actor_opt(actor_tape, actor_loss)
            value_norm = self._value_opt(value_tape, value_loss)
        else:
            actor_norm = actor_loss = 0
            value_norm = value_loss = 0

        model_norm = self._model_opt(model_tape, model_loss)
        states = tf.concat([post['stoch'], post['deter']], axis=-1)
        rewards = data['reward']
        dones = tf.zeros_like(rewards)
        actions = data['action']

        # If SAC is enabled, additionally run a SAC training step on the posterior states.
        if self._c.use_sac:
            self._sac._do_training(self._step, states, actions, rewards, dones)

        if tf.distribute.get_replica_context().replica_id_in_sync_group == 0:
            if self._c.log_scalars:
                self._scalar_summaries(data, feat, prior_dist, post_dist,
                                       likes, div, model_loss, value_loss,
                                       actor_loss, model_norm, value_norm,
                                       actor_norm)
            if tf.equal(log_images, True) and self._c.log_imgs:
                self._image_summaries(data, embed, image_pred)
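Example #24 calls self._contrastive(feat, embed), whose implementation is not shown. In CVRL-style agents the contrastive observation model is commonly an InfoNCE-style objective that scores each latent feature against every observation embedding in the batch. The sketch below is an assumption of that shape; the bilinear weight W and the flattening of the time and batch axes are illustrative choices, not taken from the code.

import tensorflow as tf

def contrastive_log_prob(feat, embed, W):
    # Flatten [time, batch, dim] -> [N, dim] so that every other element in
    # the batch serves as a negative sample.
    feat = tf.reshape(feat, [-1, feat.shape[-1]])
    embed = tf.reshape(embed, [-1, embed.shape[-1]])
    # Bilinear similarity between every state feature and every embedding.
    logits = tf.matmul(tf.matmul(feat, W), embed, transpose_b=True)  # [N, N]
    # The matching pair sits on the diagonal; score it against all negatives.
    labels = tf.range(tf.shape(logits)[0])
    return -tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=labels, logits=logits)

The mean of this per-example quantity would then play the role of likes.image in the branch where config.obs_model == 'contrastive'.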