Example #1
    def testTrainWithRnn(self):
        actor_net = actor_distribution_rnn_network.ActorDistributionRnnNetwork(
            self._obs_spec,
            self._action_spec,
            input_fc_layer_params=None,
            output_fc_layer_params=None,
            conv_layer_params=None,
            lstm_size=(40, ),
        )

        critic_net = critic_rnn_network.CriticRnnNetwork(
            (self._obs_spec, self._action_spec),
            observation_fc_layer_params=(16, ),
            action_fc_layer_params=(16, ),
            joint_fc_layer_params=(16, ),
            lstm_size=(16, ),
            output_fc_layer_params=None,
        )

        counter = common.create_variable('test_train_counter')

        optimizer_fn = tf.compat.v1.train.AdamOptimizer

        agent = sac_agent.SacAgent(
            self._time_step_spec,
            self._action_spec,
            critic_network=critic_net,
            actor_network=actor_net,
            actor_optimizer=optimizer_fn(1e-3),
            critic_optimizer=optimizer_fn(1e-3),
            alpha_optimizer=optimizer_fn(1e-3),
            train_step_counter=counter,
        )

        batch_size = 5
        observations = tf.constant([[[1, 2], [3, 4], [5, 6]]] * batch_size,
                                   dtype=tf.float32)
        actions = tf.constant([[[0], [1], [1]]] * batch_size, dtype=tf.float32)
        time_steps = ts.TimeStep(step_type=tf.constant([[1] * 3] * batch_size,
                                                       dtype=tf.int32),
                                 reward=tf.constant([[1] * 3] * batch_size,
                                                    dtype=tf.float32),
                                 discount=tf.constant([[1] * 3] * batch_size,
                                                      dtype=tf.float32),
                                 observation=observations)

        experience = trajectory.Trajectory(time_steps.step_type, observations,
                                           actions, (), time_steps.step_type,
                                           time_steps.reward,
                                           time_steps.discount)

        # Force variable creation.
        agent.policy.variables()
        if tf.executing_eagerly():
            loss = lambda: agent.train(experience)
        else:
            loss = agent.train(experience)

        self.evaluate(tf.compat.v1.initialize_all_variables())
        self.assertEqual(self.evaluate(counter), 0)
        self.evaluate(loss)
        self.assertEqual(self.evaluate(counter), 1)
Example #2
 def testInitialValue(self):
     counter = common.create_variable('counter', 1)
     self.evaluate(tf.compat.v1.global_variables_initializer())
     self.assertEqual(self.evaluate(counter), 1)
Example #3
 def testMultipleCounters(self):
     counter1 = common.create_variable('counter', 1)
     counter2 = common.create_variable('counter', 2)
     self.evaluate(tf.compat.v1.global_variables_initializer())
     self.assertEqual(self.evaluate(counter1), 1)
     self.assertEqual(self.evaluate(counter2), 2)
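
A minimal, self-contained sketch of the counter pattern exercised by Examples #2 and #3, assuming an eager TF2 setup (the variable name below is illustrative, not taken from the tests):

import tensorflow as tf
from tf_agents.utils import common

# create_variable defaults to a scalar tf.int64 variable.
counter = common.create_variable('train_step_counter', initial_value=0)
counter.assign_add(1)            # increment in place
print(int(counter.numpy()))      # -> 1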
Example #4
 def create_acc(spec):
   return common.create_variable(
       initial_value=np.zeros((batch_size,) + spec.shape),
       shape=(batch_size,) + spec.shape,
       dtype=spec.dtype,
       name='Accumulator/' + spec.name)
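
Example #4 assumes `batch_size` and `spec` are defined in an enclosing scope. A hedged, standalone sketch of the same accumulator pattern, with illustrative values for those two names:

import numpy as np
import tensorflow as tf
from tf_agents.utils import common

batch_size = 4
spec = tf.TensorSpec(shape=(3,), dtype=tf.float32, name='reward')

# One zero-initialized accumulator row per batch element.
acc = common.create_variable(
    name='Accumulator/' + spec.name,
    initial_value=np.zeros((batch_size,) + tuple(spec.shape)),
    shape=(batch_size,) + tuple(spec.shape),
    dtype=spec.dtype)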
Example #5
def train_eval(
        root_dir,
        random_seed=None,
        # Dataset params
        domain_name='cartpole',
        task_name='swingup',
        frame_shape=(84, 84, 3),
        image_aug_type='random_shifting',  # None/'random_shifting'
        frame_stack=3,
        action_repeat=4,
        # Params for learning
        num_env_steps=1000000,
        learn_ceb=True,
        use_augmented_q=False,
        # Params for CEB
        e_ctor=encoders.FRNConv,
        e_head_ctor=encoders.MVNormalDiagParamHead,
        b_ctor=encoders.FRNConv,
        b_head_ctor=encoders.MVNormalDiagParamHead,
        conv_feature_dim=50,  # deterministic feature used by actor/critic/ceb
        ceb_feature_dim=50,
        ceb_action_condition=True,
        ceb_backward_encode_rewards=True,
        initial_feature_step=0,
        feature_lr=3e-4,
        feature_lr_schedule=None,
        ceb_beta=0.01,
        ceb_beta_schedule=None,
        ceb_generative_ratio=0.0,
        ceb_generative_items=None,
        feature_grad_clip=None,
        enc_ema_tau=0.05,  # if enc_ema_tau=None, ceb also learns backward encoder
        use_critic_grad=True,
        # Params for SAC
        actor_kernel_init='glorot_uniform',
        normal_proj_net=sac_agent.sac_normal_projection_net,
        critic_kernel_init='glorot_uniform',
        critic_last_kernel_init='glorot_uniform',
        actor_fc_layers=(256, 256),
        critic_obs_fc_layers=None,
        critic_action_fc_layers=None,
        critic_joint_fc_layers=(256, 256),
        # Params for collect
        collect_every=1,
        initial_collect_steps=1000,
        collect_steps_per_iteration=1,
        replay_buffer_capacity=100000,
        # Params for target update
        target_update_tau=0.005,
        target_update_period=1,
        # Params for train
        batch_size=256,
        actor_learning_rate=3e-4,
        actor_lr_schedule=None,
        critic_learning_rate=3e-4,
        critic_lr_schedule=None,
        alpha_learning_rate=3e-4,
        alpha_lr_schedule=None,
        td_errors_loss_fn=tf.compat.v1.losses.mean_squared_error,
        gamma=0.99,
        reward_scale_factor=1.0,
        gradient_clipping=None,
        use_tf_functions=True,
        drivers_in_graph=True,
        # Params for eval
        num_eval_episodes=10,
        eval_env_interval=5000,  # number of env steps
        greedy_eval_policy=True,
        train_next_frame_decoder=False,
        # Params for summaries and logging
        baseline_log_fn=None,
        checkpoint_env_interval=100000,  # number of env steps
        log_env_interval=1000,  # number of env steps
        summary_interval=1000,
        image_summary_interval=0,
        summaries_flush_secs=10,
        debug_summaries=False,
        summarize_grads_and_vars=False,
        eval_metrics_callback=None):
    """train and eval for PI-SAC."""
    if random_seed is not None:
        tf.compat.v1.set_random_seed(random_seed)
        np.random.seed(random_seed)

    # Load baseline logs and write to tensorboard
    if baseline_log_fn is not None:
        baseline_log_fn(root_dir, domain_name, task_name, action_repeat)

    if root_dir is None:
        raise AttributeError('train_eval requires a root_dir.')

    # Set iterations and intervals to be computed relative to the number of
    # environment steps rather than the number of gradient steps.
    num_iterations = (
        num_env_steps * collect_every // collect_steps_per_iteration +
        (initial_feature_step))
    checkpoint_interval = (checkpoint_env_interval * collect_every //
                           collect_steps_per_iteration)
    eval_interval = (eval_env_interval * collect_every //
                     collect_steps_per_iteration)
    log_interval = (log_env_interval * collect_every //
                    collect_steps_per_iteration)
    logging.info('num_env_steps = %d (env steps)', num_env_steps)
    logging.info('initial_feature_step = %d (gradient steps)',
                 initial_feature_step)
    logging.info('num_iterations = %d (gradient steps)', num_iterations)
    logging.info('checkpoint interval (env steps) = %d',
                 checkpoint_env_interval)
    logging.info('checkpoint interval (gradient steps) = %d',
                 checkpoint_interval)
    logging.info('eval interval (env steps) = %d', eval_env_interval)
    logging.info('eval interval (gradient steps) = %d', eval_interval)
    logging.info('log interval (env steps) = %d', log_env_interval)
    logging.info('log interval (gradient steps) = %d', log_interval)

    root_dir = os.path.expanduser(root_dir)

    summary_writer = tf.compat.v2.summary.create_file_writer(
        root_dir, flush_millis=summaries_flush_secs * 1000)
    summary_writer.set_as_default()

    eval_histograms = [
        pisac_metric_utils.ReturnHistogram(buffer_size=num_eval_episodes),
    ]

    eval_metrics = [
        tf_metrics.AverageReturnMetric(buffer_size=num_eval_episodes),
        pisac_metric_utils.ReturnStddevMetric(buffer_size=num_eval_episodes),
        tf_metrics.AverageEpisodeLengthMetric(buffer_size=num_eval_episodes)
    ]

    # create training environment
    render_configs = {
        'height': frame_shape[0],
        'width': frame_shape[1],
        'camera_id': dict(quadruped=2).get(domain_name, 0),
    }

    tf_env = tf_py_environment.TFPyEnvironment(
        env_load_fn(domain_name, task_name, render_configs, frame_stack,
                    action_repeat))
    eval_tf_env = tf_py_environment.TFPyEnvironment(
        env_load_fn(domain_name, task_name, render_configs, frame_stack,
                    action_repeat))

    # Define global step
    g_step = common.create_variable('g_step')

    # Spec
    ims_shape = frame_shape[:2] + (frame_shape[2] * frame_stack, )
    ims_spec = tf.TensorSpec(shape=ims_shape, dtype=tf.uint8)
    conv_feature_spec = tf.TensorSpec(shape=(conv_feature_dim, ),
                                      dtype=tf.float32)
    action_spec = tf_env.action_spec()

    # Forward encoder
    e_enc = e_ctor(ims_spec, output_dim=conv_feature_dim, name='e')
    e_enc_t = e_ctor(ims_spec, output_dim=conv_feature_dim, name='e_t')
    e_enc.create_variables()
    e_enc_t.create_variables()
    common.soft_variables_update(e_enc.variables,
                                 e_enc_t.variables,
                                 tau=1.0,
                                 tau_non_trainable=1.0)

    # Forward encoder head
    if e_head_ctor is None:
        e_head = None
    else:
        stacked_action_spec = tensor_spec.BoundedTensorSpec(
            action_spec.shape[:-1] + (action_spec.shape[-1] * frame_stack),
            action_spec.dtype,
            action_spec.minimum.tolist() * frame_stack,
            action_spec.maximum.tolist() * frame_stack, action_spec.name)
        e_head_spec = [conv_feature_spec, stacked_action_spec
                       ] if ceb_action_condition else conv_feature_spec
        e_head = e_head_ctor(e_head_spec,
                             output_dim=ceb_feature_dim,
                             name='e_head')
        e_head.create_variables()

    # Backward encoder
    b_enc = b_ctor(ims_spec, output_dim=conv_feature_dim, name='b')
    b_enc.create_variables()

    # Backward encoder head
    if b_head_ctor is None:
        b_head = None
    else:
        stacked_reward_spec = tf.TensorSpec(shape=(frame_stack, ),
                                            dtype=tf.float32)
        b_head_spec = [conv_feature_spec, stacked_reward_spec
                       ] if ceb_backward_encode_rewards else conv_feature_spec
        b_head = b_head_ctor(b_head_spec,
                             output_dim=ceb_feature_dim,
                             name='b_head')
        b_head.create_variables()

    # future decoder for generative formulation
    future_deconv = None
    future_reward_mlp = None
    y_decoders = None
    if ceb_generative_ratio > 0.0:
        future_deconv = utils.SimpleDeconv(conv_feature_spec,
                                           output_tensor_spec=ims_spec)
        future_deconv.create_variables()

        future_reward_mlp = utils.MLP(conv_feature_spec,
                                      hidden_dims=(ceb_feature_dim,
                                                   ceb_feature_dim // 2,
                                                   frame_stack))
        future_reward_mlp.create_variables()

        y_decoders = [future_deconv, future_reward_mlp]

    m_vars = e_enc.trainable_variables
    if enc_ema_tau is None:
        m_vars += b_enc.trainable_variables
    else:  # do not train b_enc
        common.soft_variables_update(e_enc.variables,
                                     b_enc.variables,
                                     tau=1.0,
                                     tau_non_trainable=1.0)

    if e_head_ctor is not None:
        m_vars += e_head.trainable_variables
    if b_head_ctor is not None:
        m_vars += b_head.trainable_variables
    if ceb_generative_ratio > 0.0:
        m_vars += future_deconv.trainable_variables
        m_vars += future_reward_mlp.trainable_variables

    feature_lr_fn = schedule_utils.get_schedule_fn(base=feature_lr,
                                                   sched=feature_lr_schedule,
                                                   step=g_step)
    m_optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=feature_lr_fn)

    # CEB beta schedule, e.g. 'berp@0:1.0:1000_10000:0.3:0'
    beta_fn = schedule_utils.get_schedule_fn(base=ceb_beta,
                                             sched=ceb_beta_schedule,
                                             step=g_step)

    def img_pred_summary_fn(obs, pred):
        utils.replay_summary('y0',
                             g_step,
                             reshape=True,
                             frame_stack=frame_stack,
                             image_summary_interval=image_summary_interval)(
                                 obs, None)
        utils.replay_summary('y0_pred',
                             g_step,
                             reshape=True,
                             frame_stack=frame_stack,
                             image_summary_interval=image_summary_interval)(
                                 pred, None)
        utils.replay_summary('y0_pred_diff',
                             g_step,
                             reshape=True,
                             frame_stack=frame_stack,
                             image_summary_interval=image_summary_interval)(
                                 ((obs - pred) / 2.0 + 0.5), None)

    ceb = ceb_task.CEB(beta_fn=beta_fn,
                       generative_ratio=ceb_generative_ratio,
                       generative_items=ceb_generative_items,
                       step_counter=g_step,
                       img_pred_summary_fn=img_pred_summary_fn)
    m_ceb = ceb_task.CEBTask(
        ceb,
        e_enc,
        b_enc,
        forward_head=e_head,
        backward_head=b_head,
        y_decoders=y_decoders,
        learn_backward_enc=(enc_ema_tau is None),
        action_condition=ceb_action_condition,
        backward_encode_rewards=ceb_backward_encode_rewards,
        optimizer=m_optimizer,
        grad_clip=feature_grad_clip,
        global_step=g_step)

    if train_next_frame_decoder:
        ns_optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=1e-3)
        next_frame_deconv = utils.SimpleDeconv(conv_feature_spec,
                                               output_tensor_spec=ims_spec)
        next_frame_decoder = utils.PixelDecoder(
            next_frame_deconv,
            optimizer=ns_optimizer,
            step_counter=g_step,
            image_summary_interval=image_summary_interval,
            frame_stack=frame_stack)
        next_frame_deconv.create_variables()

    # Agent training
    actor_lr_fn = schedule_utils.get_schedule_fn(base=actor_learning_rate,
                                                 sched=actor_lr_schedule,
                                                 step=g_step)
    critic_lr_fn = schedule_utils.get_schedule_fn(base=critic_learning_rate,
                                                  sched=critic_lr_schedule,
                                                  step=g_step)
    alpha_lr_fn = schedule_utils.get_schedule_fn(base=alpha_learning_rate,
                                                 sched=alpha_lr_schedule,
                                                 step=g_step)

    actor_net = actor_distribution_network.ActorDistributionNetwork(
        conv_feature_spec,
        action_spec,
        kernel_initializer=actor_kernel_init,
        fc_layer_params=actor_fc_layers,
        activation_fn=tf.keras.activations.relu,
        continuous_projection_net=normal_proj_net)

    critic_net = critic_network.CriticNetwork(
        (conv_feature_spec, action_spec),
        observation_fc_layer_params=critic_obs_fc_layers,
        action_fc_layer_params=critic_action_fc_layers,
        joint_fc_layer_params=critic_joint_fc_layers,
        activation_fn=tf.nn.relu,
        kernel_initializer=critic_kernel_init,
        last_kernel_initializer=critic_last_kernel_init)

    tf_agent = sac_agent.SacAgent(
        ts.time_step_spec(observation_spec=conv_feature_spec),
        action_spec,
        actor_network=actor_net,
        critic_network=critic_net,
        actor_optimizer=tf.compat.v1.train.AdamOptimizer(
            learning_rate=actor_lr_fn),
        critic_optimizer=tf.compat.v1.train.AdamOptimizer(
            learning_rate=critic_lr_fn),
        alpha_optimizer=tf.compat.v1.train.AdamOptimizer(
            learning_rate=alpha_lr_fn),
        target_update_tau=target_update_tau,
        target_update_period=target_update_period,
        td_errors_loss_fn=td_errors_loss_fn,
        gamma=gamma,
        reward_scale_factor=reward_scale_factor,
        gradient_clipping=gradient_clipping,
        debug_summaries=debug_summaries,
        summarize_grads_and_vars=summarize_grads_and_vars,
        train_step_counter=g_step)
    tf_agent.initialize()

    env_steps = tf_metrics.EnvironmentSteps(prefix='Train')
    average_return = tf_metrics.AverageReturnMetric(
        prefix='Train',
        buffer_size=num_eval_episodes,
        batch_size=tf_env.batch_size)
    train_metrics = [
        tf_metrics.NumberOfEpisodes(prefix='Train'), env_steps, average_return,
        tf_metrics.AverageEpisodeLengthMetric(prefix='Train',
                                              buffer_size=num_eval_episodes,
                                              batch_size=tf_env.batch_size),
        tf_metrics.AverageReturnMetric(name='LatestReturn',
                                       prefix='Train',
                                       buffer_size=1,
                                       batch_size=tf_env.batch_size)
    ]

    # Collect and eval policies
    initial_collect_policy = random_tf_policy.RandomTFPolicy(
        tf_env.time_step_spec(), action_spec)

    eval_policy = tf_agent.policy
    if greedy_eval_policy:
        eval_policy = greedy_policy.GreedyPolicy(eval_policy)

    def obs_to_feature(observation):
        feature, _ = e_enc(observation['pixels'], training=False)
        return tf.stop_gradient(feature)

    eval_policy = FeaturePolicy(policy=eval_policy,
                                time_step_spec=tf_env.time_step_spec(),
                                obs_to_feature_fn=obs_to_feature)

    collect_policy = FeaturePolicy(policy=tf_agent.collect_policy,
                                   time_step_spec=tf_env.time_step_spec(),
                                   obs_to_feature_fn=obs_to_feature)

    # Make the replay buffer.
    replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
        data_spec=collect_policy.trajectory_spec,
        batch_size=1,
        max_length=replay_buffer_capacity)
    replay_observer = [replay_buffer.add_batch]

    # Checkpoints
    train_checkpointer = common.Checkpointer(
        ckpt_dir=os.path.join(root_dir, 'train'),
        agent=tf_agent,
        actor_net=actor_net,
        critic_net=critic_net,
        global_step=g_step,
        metrics=tfa_metric_utils.MetricsGroup(train_metrics, 'train_metrics'))
    train_checkpointer.initialize_or_restore()

    policy_checkpointer = common.Checkpointer(ckpt_dir=os.path.join(
        root_dir, 'policy'),
                                              policy=eval_policy,
                                              global_step=g_step)
    policy_checkpointer.initialize_or_restore()

    rb_checkpointer = common.Checkpointer(ckpt_dir=os.path.join(
        root_dir, 'replay_buffer'),
                                          max_to_keep=1,
                                          replay_buffer=replay_buffer,
                                          global_step=g_step)
    rb_checkpointer.initialize_or_restore()

    if learn_ceb:
        d = dict()
        if future_deconv is not None:
            d.update(future_deconv=future_deconv)
        if future_reward_mlp is not None:
            d.update(future_reward_mlp=future_reward_mlp)
        model_ckpt = common.Checkpointer(ckpt_dir=os.path.join(
            root_dir, 'model'),
                                         forward_encoder=e_enc,
                                         forward_encoder_target=e_enc_t,
                                         forward_head=e_head,
                                         backward_encoder=b_enc,
                                         backward_head=b_head,
                                         global_step=g_step,
                                         **d)
    else:
        model_ckpt = common.Checkpointer(ckpt_dir=os.path.join(
            root_dir, 'model'),
                                         forward_encoder=e_enc,
                                         forward_encoder_target=e_enc_t,
                                         global_step=g_step)
    model_ckpt.initialize_or_restore()

    if train_next_frame_decoder:
        next_frame_decoder_ckpt = common.Checkpointer(
            ckpt_dir=os.path.join(root_dir, 'next_frame_decoder'),
            next_frame_decoder=next_frame_decoder,
            next_frame_deconv=next_frame_deconv,
            global_step=g_step)
        next_frame_decoder_ckpt.initialize_or_restore()

    if use_tf_functions and not drivers_in_graph:
        collect_policy.action = common.function(collect_policy.action)

    initial_collect_driver = dynamic_step_driver.DynamicStepDriver(
        tf_env,
        initial_collect_policy,
        observers=replay_observer + train_metrics,
        num_steps=initial_collect_steps)
    collect_driver = dynamic_step_driver.DynamicStepDriver(
        tf_env,
        collect_policy,
        observers=replay_observer + train_metrics,
        num_steps=collect_steps_per_iteration)

    if use_tf_functions and drivers_in_graph:
        initial_collect_driver.run = common.function(
            initial_collect_driver.run)
        collect_driver.run = common.function(collect_driver.run)

    # Collect initial replay data.
    if env_steps.result() == 0 or replay_buffer.num_frames() == 0:
        qj(initial_collect_steps,
           'Initializing replay buffer by collecting random experience',
           tic=1)
        initial_collect_driver.run()
        for train_metric in train_metrics:
            train_metric.tf_summaries(train_step=env_steps.result())
        qj(s='Done initializing replay buffer', toc=1)

    time_step = None
    policy_state = collect_policy.get_initial_state(tf_env.batch_size)

    time_acc = 0
    env_steps_before = env_steps.result().numpy()

    paddings = tf.constant([[4, 4], [4, 4], [0, 0]])

    def random_shifting(traj, meta):
        x0 = traj.observation['pixels'][0]
        x1 = traj.observation['pixels'][1]
        y0 = traj.observation['pixels'][frame_stack]
        y1 = traj.observation['pixels'][frame_stack + 1]
        x0 = tf.pad(x0, paddings, 'SYMMETRIC')
        x1 = tf.pad(x1, paddings, 'SYMMETRIC')
        y0 = tf.pad(y0, paddings, 'SYMMETRIC')
        y1 = tf.pad(y1, paddings, 'SYMMETRIC')
        x0a = tf.image.random_crop(x0, ims_shape)
        x1a = tf.image.random_crop(x1, ims_shape)
        x0 = tf.image.random_crop(x0, ims_shape)
        x1 = tf.image.random_crop(x1, ims_shape)
        y0 = tf.image.random_crop(y0, ims_shape)
        y1 = tf.image.random_crop(y1, ims_shape)
        return (traj, (x0, x1, x0a, x1a, y0, y1)), meta

    # Dataset generates trajectories with shape [B, T, ...]
    num_steps = frame_stack + 2
    with tf.device('/cpu:0'):
        if image_aug_type == 'random_shifting':
            dataset = replay_buffer.as_dataset(
                sample_batch_size=batch_size,
                num_steps=num_steps).unbatch().filter(
                    utils.filter_invalid_transition).map(
                        random_shifting,
                        num_parallel_calls=3).batch(batch_size).map(
                            utils.replay_summary(
                                'replay/filtered',
                                order_frame_stack=True,
                                frame_stack=frame_stack,
                                image_summary_interval=image_summary_interval,
                                has_augmentations=True))
        elif image_aug_type is None:
            dataset = replay_buffer.as_dataset(
                sample_batch_size=batch_size,
                num_steps=num_steps).unbatch().filter(
                    utils.filter_invalid_transition).batch(batch_size).map(
                        utils.replay_summary(
                            'replay/filtered',
                            order_frame_stack=True,
                            frame_stack=frame_stack,
                            image_summary_interval=image_summary_interval,
                            has_augmentations=False))
        else:
            raise NotImplementedError
    iterator_nstep = iter(dataset)

    def model_train_step(experience):
        if image_aug_type == 'random_shifting':
            experience, cropped_frames = experience
            x0, x1, _, _, y0, y1 = cropped_frames
            r0, r1, a0, a1 = utils.split_xy(experience,
                                            frame_stack,
                                            rewards_n_actions_only=True)
            x0 = x0[:, None, ...]
            x1 = x1[:, None, ...]
            y0 = y0[:, None, ...]
            y1 = y1[:, None, ...]
        elif image_aug_type is None:
            x0, x1, y0, y1, r0, r1, a0, a1 = utils.split_xy(
                experience, frame_stack, rewards_n_actions_only=False)
        else:
            raise NotImplementedError

        # Flatten stacked actions
        action_shape = a0.shape.as_list()
        a0 = tf.reshape(a0, [action_shape[0], action_shape[1], -1])
        a1 = tf.reshape(a1, [action_shape[0], action_shape[1], -1])

        if image_summary_interval > 0:
            utils.replay_summary(
                'ceb/x0',
                g_step,
                reshape=True,
                frame_stack=frame_stack,
                image_summary_interval=image_summary_interval)(x0, None)
            utils.replay_summary(
                'ceb/x1',
                g_step,
                reshape=True,
                frame_stack=frame_stack,
                image_summary_interval=image_summary_interval)(x1, None)
            utils.replay_summary(
                'ceb/y0',
                g_step,
                reshape=True,
                frame_stack=frame_stack,
                image_summary_interval=image_summary_interval)(y0, None)
            utils.replay_summary(
                'ceb/y1',
                g_step,
                reshape=True,
                frame_stack=frame_stack,
                image_summary_interval=image_summary_interval)(y1, None)

        ceb_loss, feat_x0, zx0 = m_ceb.train(x0, a0, y0, y1, r0, r1, m_vars)
        if train_next_frame_decoder:
            # zx0: [B, 1, Z]
            zx0 = tf.squeeze(zx0, axis=1)
            # y0: [B, 1, H, W, Cxframe_stack]
            next_obs = tf.cast(tf.squeeze(y0, axis=1), tf.float32) / 255.0
            next_frame_decoder.train(next_obs, tf.stop_gradient(zx0))

        if enc_ema_tau is not None:
            common.soft_variables_update(e_enc.variables,
                                         b_enc.variables,
                                         tau=enc_ema_tau,
                                         tau_non_trainable=enc_ema_tau)

    def agent_train_step(experience):
        # preprocess experience
        if image_aug_type == 'random_shifting':
            experience, cropped_frames = experience
            x0, x1, x0a, x1a, y0, y1 = cropped_frames
            experience = tf.nest.map_structure(
                lambda t: composite.slice_to(t, axis=1, end=2), experience)
            time_steps, actions, next_time_steps = (
                tf_agent.experience_to_transitions(experience))  # pylint: disable=protected-access
        elif image_aug_type is None:
            experience = tf.nest.map_structure(
                lambda t: composite.slice_to(t, axis=1, end=2), experience)
            time_steps, actions, next_time_steps = (
                tf_agent.experience_to_transitions(experience))  # pylint: disable=protected-access
            x0 = time_steps.observation['pixels']
            x1 = next_time_steps.observation['pixels']
        else:
            raise NotImplementedError

        tf_agent.train_pix(time_steps,
                           actions,
                           next_time_steps,
                           x0,
                           x1,
                           x0a=x0a if use_augmented_q else None,
                           x1a=x1a if use_augmented_q else None,
                           e_enc=e_enc,
                           e_enc_t=e_enc_t,
                           q_aug=use_augmented_q,
                           use_critic_grad=use_critic_grad)

    def checkpoint(step):
        rb_checkpointer.save(global_step=step)
        train_checkpointer.save(global_step=step)
        policy_checkpointer.save(global_step=step)
        model_ckpt.save(global_step=step)
        if train_next_frame_decoder:
            next_frame_decoder_ckpt.save(global_step=step)

    def evaluate():
        # Override outer record_if that may be out of sync with respect to the
        # env_steps.result() value used for the summary step.
        with tf.compat.v2.summary.record_if(True):
            qj(g_step.numpy(), 'Starting eval at step', tic=1)
            results = pisac_metric_utils.eager_compute(
                eval_metrics,
                eval_tf_env,
                eval_policy,
                histograms=eval_histograms,
                num_episodes=num_eval_episodes,
                train_step=env_steps.result(),
                summary_writer=summary_writer,
                summary_prefix='Eval',
                use_function=drivers_in_graph,
            )
            if eval_metrics_callback is not None:
                eval_metrics_callback(results, env_steps.result())
            tfa_metric_utils.log_metrics(eval_metrics)
            qj(s='Finished eval', toc=1)

    def update_target():
        common.soft_variables_update(
            e_enc.variables,
            e_enc_t.variables,
            tau=tf_agent.target_update_tau,
            tau_non_trainable=tf_agent.target_update_tau)
        common.soft_variables_update(
            tf_agent._critic_network_1.variables,  # pylint: disable=protected-access
            tf_agent._target_critic_network_1.variables,  # pylint: disable=protected-access
            tau=tf_agent.target_update_tau,
            tau_non_trainable=tf_agent.target_update_tau)
        common.soft_variables_update(
            tf_agent._critic_network_2.variables,  # pylint: disable=protected-access
            tf_agent._target_critic_network_2.variables,  # pylint: disable=protected-access
            tau=tf_agent.target_update_tau,
            tau_non_trainable=tf_agent.target_update_tau)

    if use_tf_functions:
        if learn_ceb:
            m_ceb.train = common.function(m_ceb.train)
            model_train_step = common.function(model_train_step)
        agent_train_step = common.function(agent_train_step)
        tf_agent.train_pix = common.function(tf_agent.train_pix)
        update_target = common.function(update_target)
        if train_next_frame_decoder:
            next_frame_decoder.train = common.function(
                next_frame_decoder.train)

    if not learn_ceb and initial_feature_step > 0:
        raise ValueError('Not learning CEB but initial_feature_step > 0')

    with tf.summary.record_if(
            lambda: tf.math.equal(g_step % summary_interval, 0)):
        if learn_ceb and g_step.numpy() < initial_feature_step:
            qj(initial_feature_step, 'Pretraining CEB...', tic=1)
            for _ in range(g_step.numpy(), initial_feature_step):
                with tf.name_scope('LearningRates'):
                    tf.summary.scalar(name='CEB learning rate',
                                      data=feature_lr_fn(),
                                      step=g_step)
                experience, _ = next(iterator_nstep)
                model_train_step(experience)
                g_step.assign_add(1)
            qj(s='Done pretraining CEB.', toc=1)

    first_step = True
    for _ in range(g_step.numpy(), num_iterations):
        g_step_val = g_step.numpy()
        start_time = time.time()

        with tf.summary.record_if(
                lambda: tf.math.equal(g_step % summary_interval, 0)):

            with tf.name_scope('LearningRates'):
                tf.summary.scalar(name='Actor learning rate',
                                  data=actor_lr_fn(),
                                  step=g_step)
                tf.summary.scalar(name='Critic learning rate',
                                  data=critic_lr_fn(),
                                  step=g_step)
                tf.summary.scalar(name='Alpha learning rate',
                                  data=alpha_lr_fn(),
                                  step=g_step)
                if learn_ceb:
                    tf.summary.scalar(name='CEB learning rate',
                                      data=feature_lr_fn(),
                                      step=g_step)

            with tf.name_scope('Train'):
                tf.summary.scalar(name='StepsVsEnvironmentSteps',
                                  data=env_steps.result(),
                                  step=g_step)
                tf.summary.scalar(name='StepsVsAverageReturn',
                                  data=average_return.result(),
                                  step=g_step)

            if g_step_val % collect_every == 0:
                time_step, policy_state = collect_driver.run(
                    time_step=time_step,
                    policy_state=policy_state,
                )

            experience, _ = next(iterator_nstep)
            agent_train_step(experience)
            if (g_step_val -
                    initial_feature_step) % tf_agent.target_update_period == 0:
                update_target()
            if learn_ceb:
                model_train_step(experience)
            time_acc += time.time() - start_time

        # Increment global step counter.
        g_step.assign_add(1)
        g_step_val = g_step.numpy()

        if (g_step_val - initial_feature_step) % log_interval == 0:
            for train_metric in train_metrics:
                train_metric.tf_summaries(train_step=env_steps.result())
            logging.info('env steps = %d, average return = %f',
                         env_steps.result(), average_return.result())
            env_steps_per_sec = (env_steps.result().numpy() -
                                 env_steps_before) / time_acc
            logging.info('%.3f env steps/sec', env_steps_per_sec)
            tf.compat.v2.summary.scalar(name='env_steps_per_sec',
                                        data=env_steps_per_sec,
                                        step=env_steps.result())
            time_acc = 0
            env_steps_before = env_steps.result().numpy()

        if (g_step_val - initial_feature_step) % eval_interval == 0:
            eval_start_time = time.time()
            evaluate()
            logging.info('eval time %.3f sec', time.time() - eval_start_time)

        if (g_step_val - initial_feature_step) % checkpoint_interval == 0:
            checkpoint(g_step_val)

        # Write gin config to Tensorboard
        if first_step:
            summ = utils.Summ(0, root_dir)
            conf = gin.operative_config_str()
            conf = '    ' + conf.replace('\n', '\n    ')
            summ.text('gin/config', conf)
            summ.flush()
            first_step = False

    # Final checkpoint.
    checkpoint(g_step.numpy())

    # Final evaluation.
    evaluate()
Example #6
    def __init__(self,
                 time_step_spec,
                 action_spec,
                 critic_network,
                 actor_network,
                 actor_optimizer,
                 critic_optimizer,
                 alpha_optimizer,
                 actor_policy_ctor=actor_policy.ActorPolicy,
                 squash_actions=True,
                 target_update_tau=1.0,
                 target_update_period=1,
                 td_errors_loss_fn=tf.math.squared_difference,
                 gamma=1.0,
                 reward_scale_factor=1.0,
                 initial_log_alpha=0.0,
                 target_entropy=None,
                 gradient_clipping=None,
                 debug_summaries=False,
                 summarize_grads_and_vars=False,
                 train_step_counter=None,
                 name=None):
        """Creates a SAC Agent.

    Args:
      time_step_spec: A `TimeStep` spec of the expected time_steps.
      action_spec: A nest of BoundedTensorSpec representing the actions.
      critic_network: A function critic_network((observations, actions)) that
        returns the q_values for each observation and action.
      actor_network: A function actor_network(observation, action_spec) that
        returns an action distribution.
      actor_optimizer: The optimizer to use for the actor network.
      critic_optimizer: The default optimizer to use for the critic network.
      alpha_optimizer: The default optimizer to use for the alpha variable.
      actor_policy_ctor: The policy class to use.
      squash_actions: Whether or not to use tanh to squash actions between
        -1 and 1.
      target_update_tau: Factor for soft update of the target networks.
      target_update_period: Period for soft update of the target networks.
      td_errors_loss_fn:  A function for computing the elementwise TD errors
        loss.
      gamma: A discount factor for future rewards.
      reward_scale_factor: Multiplicative scale for the reward.
      initial_log_alpha: Initial value for log_alpha.
      target_entropy: The target average policy entropy, for updating alpha.
      gradient_clipping: Norm length to clip gradients.
      debug_summaries: A bool to gather debug summaries.
      summarize_grads_and_vars: If True, gradient and network variable summaries
        will be written during training.
      train_step_counter: An optional counter to increment every time the train
        op is run.  Defaults to the global_step.
      name: The name of this agent. All variables in this module will fall
        under that name. Defaults to the class name.
    """
        tf.Module.__init__(self, name=name)

        self._critic_network1 = critic_network
        self._critic_network2 = critic_network.copy(name='CriticNetwork2')
        self._target_critic_network1 = critic_network.copy(
            name='TargetCriticNetwork1')
        self._target_critic_network2 = critic_network.copy(
            name='TargetCriticNetwork2')
        self._actor_network = actor_network

        policy = actor_policy_ctor(time_step_spec=time_step_spec,
                                   action_spec=action_spec,
                                   actor_network=self._actor_network)

        self._log_alpha = common.create_variable(
            'initial_log_alpha',
            initial_value=initial_log_alpha,
            dtype=tf.float32,
            trainable=True)

        # If target_entropy was not passed, set it to negative of the total number
        # of action dimensions.
        if target_entropy is None:
            flat_action_spec = tf.nest.flatten(action_spec)
            target_entropy = -np.sum([
                np.product(single_spec.shape.as_list())
                for single_spec in flat_action_spec
            ])

        self._squash_actions = squash_actions
        self._target_update_tau = target_update_tau
        self._target_update_period = target_update_period
        self._actor_optimizer = actor_optimizer
        self._critic_optimizer = critic_optimizer
        self._alpha_optimizer = alpha_optimizer
        self._td_errors_loss_fn = td_errors_loss_fn
        self._gamma = gamma
        self._reward_scale_factor = reward_scale_factor
        self._target_entropy = target_entropy
        self._gradient_clipping = gradient_clipping
        self._debug_summaries = debug_summaries
        self._summarize_grads_and_vars = summarize_grads_and_vars

        super(SacAgent,
              self).__init__(time_step_spec,
                             action_spec,
                             policy=policy,
                             collect_policy=policy,
                             train_sequence_length=2,
                             debug_summaries=debug_summaries,
                             summarize_grads_and_vars=summarize_grads_and_vars,
                             train_step_counter=train_step_counter)
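
As in the constructor above, create_variable can also build a trainable scalar such as the SAC log-alpha temperature. A minimal standalone sketch (the names mirror Example #6; the usage line is illustrative):

import tensorflow as tf
from tf_agents.utils import common

log_alpha = common.create_variable(
    'initial_log_alpha', initial_value=0.0, dtype=tf.float32, trainable=True)
alpha = tf.exp(log_alpha)  # entropy temperature used in the SAC losses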
Example #7
 def __init__(self, variable_scope='num_episodes_step_observer'):
     with tf.compat.v1.variable_scope(variable_scope):
         self._num_episodes = common.create_variable('num_episodes',
                                                     0,
                                                     shape=[],
                                                     dtype=tf.int32)
Example #8
  def __init__(self,

               # counter
               train_step_counter,

               # specs
               time_step_spec,
               action_spec,

               # networks
               critic_network,
               actor_network,
               model_network,
               compressor_network,

               # optimizers
               actor_optimizer,
               critic_optimizer,
               alpha_optimizer,
               model_optimizer,

               # target update
               target_update_tau=1.0,
               target_update_period=1,

               # inputs and stop gradients
               critic_input='state',
               actor_input='state',
               critic_input_stop_gradient=True,
               actor_input_stop_gradient=False,

               # model stuff
               model_batch_size=256, # will round to nearest full trajectory
               ac_batch_size=128,

               # other
               episodes_per_trial=1,
               num_tasks_per_train=1,
               num_batches_per_sampled_trials=1,
               td_errors_loss_fn=tf.math.squared_difference,
               gamma=1.0,
               reward_scale_factor=1.0,
               task_reward_dim=None,
               initial_log_alpha=0.0,
               target_entropy=None,
               gradient_clipping=None,
               control_timestep=None,
               num_images_per_summary=1,

               offline_ratio=None,
               override_reward_func=None,
               ):

    tf.Module.__init__(self)
    self.override_reward_func = override_reward_func
    self.offline_ratio = offline_ratio

    ################
    # critic
    ################
    # networks
    self._critic_network1 = critic_network
    self._critic_network2 = critic_network.copy(name='CriticNetwork2')
    self._target_critic_network1 = critic_network.copy(name='TargetCriticNetwork1')
    self._target_critic_network2 = critic_network.copy(name='TargetCriticNetwork2')
    # update the target networks
    self._target_update_tau = target_update_tau
    self._target_update_period = target_update_period
    self._update_target = self._get_target_updater(tau=self._target_update_tau, period=self._target_update_period)

    ################
    # model
    ################
    self._model_network = model_network
    self.model_input = self._model_network.model_input

    ################
    # compressor
    ################
    self._compressor_network = compressor_network

    ################
    # actor
    ################
    self._actor_network = actor_network

    ################
    # policies
    ################

    self.condition_on_full_latent_dist = (actor_input=="latentDistribution" and critic_input=="latentDistribution")
    
    # Both policies below share the same actor network, but they may process
    # the latents they feed to the actor network in different ways.

    # used for eval
    which_posterior='first'
    if self._model_network.sparse_reward_inputs:
      which_rew_input='sparse'
    else:
      which_rew_input='dense'

    policy = MeldPolicy(
        time_step_spec=time_step_spec,
        action_spec=action_spec,
        actor_network=self._actor_network,
        model_network=self._model_network,
        actor_input=actor_input,
        which_posterior=which_posterior,
        which_rew_input=which_rew_input,
        )

    # used for collecting data during training

    # Overwrite if specified (e.g., for a double agent)
    which_posterior='first'
    if self._model_network.sparse_reward_inputs:
      which_rew_input='sparse'
    else:
      which_rew_input='dense'

    collect_policy = MeldPolicy(
      time_step_spec=time_step_spec,
      action_spec=action_spec,
      actor_network=self._actor_network,
      model_network=self._model_network,
      actor_input=actor_input,
      which_posterior=which_posterior,
      which_rew_input=which_rew_input,
      )


    ################
    # more vars
    ################
    self.num_batches_per_sampled_trials = num_batches_per_sampled_trials
    self.episodes_per_trial = episodes_per_trial
    self._task_reward_dim = task_reward_dim
    self._log_alpha = common.create_variable(
        'initial_log_alpha',
        initial_value=initial_log_alpha,
        dtype=tf.float32,
        trainable=True)

    # If target_entropy was not passed, set it to negative of the total number
    # of action dimensions.
    if target_entropy is None:
      flat_action_spec = tf.nest.flatten(action_spec)
      target_entropy = -np.sum([
        np.product(single_spec.shape.as_list())
        for single_spec in flat_action_spec
      ])

    self._actor_optimizer = actor_optimizer
    self._critic_optimizer = critic_optimizer
    self._alpha_optimizer = alpha_optimizer
    self._model_optimizer = model_optimizer
    self._td_errors_loss_fn = td_errors_loss_fn
    self._gamma = gamma
    self._reward_scale_factor = reward_scale_factor
    self._target_entropy = target_entropy
    self._gradient_clipping = gradient_clipping

    self._critic_input = critic_input
    self._actor_input = actor_input
    self._critic_input_stop_gradient = critic_input_stop_gradient
    self._actor_input_stop_gradient = actor_input_stop_gradient
    self._model_batch_size = model_batch_size
    self._ac_batch_size = ac_batch_size
    self._control_timestep = control_timestep
    self._num_images_per_summary = num_images_per_summary
    self._actor_time_step_spec = time_step_spec._replace(observation=actor_network.input_tensor_spec)
    self._num_tasks_per_train = num_tasks_per_train

    ################
    # init tf agent
    ################

    super(MeldAgent, self).__init__(
        time_step_spec,
        action_spec,
        policy=policy,
        collect_policy=collect_policy, #used to set self.step_spec
        train_sequence_length=None, #train function can accept experience of any length T (i.e., [B,T,...])
        train_step_counter=train_step_counter)

    self._train_model_fn = common.function_in_tf1()(self._train_model)
    self._train_ac_fn = common.function_in_tf1()(self._train_ac)
Example #9
  def testSequencePreprocessNotBatched(self):
    counter = common.create_variable('test_train_counter')
    n_time_steps = 3
    agent = ppo_agent.PPOAgent(
        self._time_step_spec,
        self._action_spec,
        tf.compat.v1.train.AdamOptimizer(),
        actor_net=DummyActorNet(
            self._obs_spec,
            self._action_spec,
        ),
        value_net=DummyValueNet(self._obs_spec),
        normalize_observations=False,
        num_epochs=1,
        use_gae=False,
        use_td_lambda_return=False,
        compute_value_and_advantage_in_train=False,
        train_step_counter=counter)
    observations = tf.constant([[1, 2], [3, 4], [5, 6]], dtype=tf.float32)

    mid_time_step_val = ts.StepType.MID.tolist()
    time_steps = ts.TimeStep(
        step_type=tf.constant(
            [mid_time_step_val] * n_time_steps, dtype=tf.int32),
        reward=tf.constant([1] * n_time_steps, dtype=tf.float32),
        discount=tf.constant([1] * n_time_steps, dtype=tf.float32),
        observation=observations)
    actions = tf.constant([[0], [1], [1]], dtype=tf.float32)

    old_action_distribution_parameters = {
        'loc': tf.constant([[0.0]] * n_time_steps, dtype=tf.float32),
        'scale': tf.constant([[1.0]] * n_time_steps, dtype=tf.float32),
    }

    value_preds = tf.constant([9., 15., 21.], dtype=tf.float32)
    policy_info = {
        'dist_params': old_action_distribution_parameters,
        'value_prediction': value_preds,
    }
    experience = trajectory.Trajectory(time_steps.step_type, observations,
                                       actions, policy_info,
                                       time_steps.step_type, time_steps.reward,
                                       time_steps.discount)

    returned_experience = agent.preprocess_sequence(experience)
    self.evaluate(tf.compat.v1.initialize_all_variables())

    self.assertAllClose(observations, returned_experience.observation)
    self.assertAllClose(actions, returned_experience.action)

    self.assertAllClose(old_action_distribution_parameters,
                        returned_experience.policy_info['dist_params'])
    self.assertEqual(n_time_steps,
                     returned_experience.policy_info['return'].shape)
    self.assertAllClose([40.4821, 30.79],
                        returned_experience.policy_info['return'][:-1])
    self.assertEqual(
        n_time_steps,
        returned_experience.policy_info['normalized_advantage'].shape)
    self.assertAllClose(
        [1., -1.], returned_experience.policy_info['normalized_advantage'][:-1])
Example #10
 def create_variable(spec):
   return common.create_variable(
       name=spec.name,
       dtype=spec.dtype,
       shape=[batch_size] + spec.shape.as_list())
Example #11
  def testSaveAction(self, seeded, has_state, distribution_net,
                     has_input_fn_and_spec):
    with tf.compat.v1.Graph().as_default():
      tf.compat.v1.set_random_seed(self._global_seed)
      with tf.compat.v1.Session().as_default():
        global_step = common.create_variable('train_step', initial_value=0)
        if distribution_net:
          network = actor_distribution_network.ActorDistributionNetwork(
              self._time_step_spec.observation, self._action_spec)
          policy = actor_policy.ActorPolicy(
              time_step_spec=self._time_step_spec,
              action_spec=self._action_spec,
              actor_network=network)
        else:
          if has_state:
            network = q_rnn_network.QRnnNetwork(
                input_tensor_spec=self._time_step_spec.observation,
                action_spec=self._action_spec,
                lstm_size=(40,))
          else:
            network = q_network.QNetwork(
                input_tensor_spec=self._time_step_spec.observation,
                action_spec=self._action_spec)

          policy = q_policy.QPolicy(
              time_step_spec=self._time_step_spec,
              action_spec=self._action_spec,
              q_network=network)

        action_seed = 98723

        batch_size = 3
        action_inputs = tensor_spec.sample_spec_nest(
            (self._time_step_spec, policy.policy_state_spec),
            outer_dims=(batch_size,),
            seed=4)
        action_input_values = self.evaluate(action_inputs)
        action_input_tensors = tf.nest.map_structure(tf.convert_to_tensor,
                                                     action_input_values)

        action_output = policy.action(*action_input_tensors, seed=action_seed)
        distribution_output = policy.distribution(*action_input_tensors)
        self.assertIsInstance(
            distribution_output.action, tfp.distributions.Distribution)

        self.evaluate(tf.compat.v1.global_variables_initializer())

        action_output_dict = collections.OrderedDict(
            ((spec.name, value) for (spec, value) in zip(
                tf.nest.flatten(policy.policy_step_spec),
                tf.nest.flatten(action_output))))

        # Check output of the flattened signature call.
        (action_output_value, action_output_dict) = self.evaluate(
            (action_output, action_output_dict))

        distribution_output_value = self.evaluate(_sample_from_distributions(
            distribution_output))

        input_fn_and_spec = None
        if has_input_fn_and_spec:
          input_fn_and_spec = (_convert_string_vector_to_action_input,
                               tf.TensorSpec((7,), tf.string, name='example'))

        saver = policy_saver.PolicySaver(
            policy,
            batch_size=None,
            use_nest_path_signatures=False,
            seed=action_seed,
            input_fn_and_spec=input_fn_and_spec,
            train_step=global_step)
        path = os.path.join(self.get_temp_dir(), 'save_model_action')
        saver.save(path)

    with tf.compat.v1.Graph().as_default():
      tf.compat.v1.set_random_seed(self._global_seed)
      with tf.compat.v1.Session().as_default():
        reloaded = tf.compat.v2.saved_model.load(path)

        self.assertIn('action', reloaded.signatures)
        reloaded_action = reloaded.signatures['action']
        if has_input_fn_and_spec:
          self._compare_input_output_specs(
              reloaded_action,
              expected_input_specs=input_fn_and_spec[1],
              expected_output_spec=policy.policy_step_spec,
              batch_input=True)

        else:
          self._compare_input_output_specs(
              reloaded_action,
              expected_input_specs=(self._time_step_spec,
                                    policy.policy_state_spec),
              expected_output_spec=policy.policy_step_spec,
              batch_input=True)

        # Reload action_input_values as tensors in the new graph.
        action_input_tensors = tf.nest.map_structure(tf.convert_to_tensor,
                                                     action_input_values)

        action_input_spec = (self._time_step_spec, policy.policy_state_spec)
        function_action_input_dict = collections.OrderedDict(
            (spec.name, value) for (spec, value) in zip(
                tf.nest.flatten(action_input_spec),
                tf.nest.flatten(action_input_tensors)))

        # NOTE(ebrevdo): The graph-level seeds for the policy and the reloaded
        # model are equal, which in addition to seeding the call to action() and
        # PolicySaver helps ensure equality of the output of action() in both
        # cases.
        self.assertEqual(reloaded_action.graph.seed, self._global_seed)

        # The seed= argument for the SavedModel action call was given at
        # creation of the PolicySaver.
        if has_input_fn_and_spec:
          action_string_vector = _convert_action_input_to_string_vector(
              action_input_tensors)
          action_string_vector_values = self.evaluate(action_string_vector)
          reloaded_action_output_dict = reloaded_action(action_string_vector)
          reloaded_action_output = reloaded.action(action_string_vector)
          reloaded_distribution_output = reloaded.distribution(
              action_string_vector)
          self.assertIsInstance(reloaded_distribution_output.action,
                                tfp.distributions.Distribution)

        else:
          # This is the flat-signature function.
          reloaded_action_output_dict = reloaded_action(
              **function_action_input_dict)
          # This is the non-flat function.
          reloaded_action_output = reloaded.action(*action_input_tensors)
          reloaded_distribution_output = reloaded.distribution(
              *action_input_tensors)
          self.assertIsInstance(reloaded_distribution_output.action,
                                tfp.distributions.Distribution)

          if not has_state:
            # Try both cases: one with an empty policy_state and one with no
            # policy_state.  Compare them.

            # NOTE(ebrevdo): The first call to .action() must be stored in
            # reloaded_action_output because this is the version being compared
            # later against the true action_output and the values will change
            # after the first call due to randomness.
            reloaded_action_output_no_input_state = reloaded.action(
                action_input_tensors[0])
            reloaded_distribution_output_no_input_state = reloaded.distribution(
                action_input_tensors[0])
            # Even with a seed, multiple calls to action will get different
            # values, so here we just check the signature matches.
            self.assertIsInstance(
                reloaded_distribution_output_no_input_state.action,
                tfp.distributions.Distribution)
            tf.nest.map_structure(self.match_dtype_shape,
                                  reloaded_action_output_no_input_state,
                                  reloaded_action_output)

            tf.nest.map_structure(
                self.match_dtype_shape,
                _sample_from_distributions(
                    reloaded_distribution_output_no_input_state),
                _sample_from_distributions(reloaded_distribution_output))

        self.evaluate(tf.compat.v1.global_variables_initializer())
        (reloaded_action_output_dict,
         reloaded_action_output_value) = self.evaluate(
             (reloaded_action_output_dict, reloaded_action_output))

        reloaded_distribution_output_value = self.evaluate(
            _sample_from_distributions(reloaded_distribution_output))

        self.assertAllEqual(action_output_dict.keys(),
                            reloaded_action_output_dict.keys())

        for k in action_output_dict:
          if seeded:
            self.assertAllClose(
                action_output_dict[k],
                reloaded_action_output_dict[k],
                msg='\nMismatched dict key: %s.' % k)
          else:
            self.match_dtype_shape(
                action_output_dict[k],
                reloaded_action_output_dict[k],
                msg='\nMismatch dict key: %s.' % k)

        # With non-signature functions, we can check that passing a seed does
        # the right thing the second time.
        if seeded:
          tf.nest.map_structure(self.assertAllClose, action_output_value,
                                reloaded_action_output_value)
        else:
          tf.nest.map_structure(self.match_dtype_shape, action_output_value,
                                reloaded_action_output_value)

        tf.nest.map_structure(self.assertAllClose,
                              distribution_output_value,
                              reloaded_distribution_output_value)

    ## TFLite tests.

    # The converter must run outside of a TF1 graph context, even in
    # eager mode, to ensure the TF2 path is being executed.  Only
    # works in TF2.
    if tf.compat.v1.executing_eagerly_outside_functions():
      tflite_converter = tf.lite.TFLiteConverter.from_saved_model(
          path, signature_keys=['action'])
      tflite_converter.target_spec.supported_ops = [
          tf.lite.OpsSet.TFLITE_BUILTINS,
          # TODO(b/111309333): Remove this when `has_input_fn_and_spec`
          # is `False` once TFLite has native support for RNG ops, atan, etc.
          tf.lite.OpsSet.SELECT_TF_OPS,
      ]
      tflite_serialized_model = tflite_converter.convert()

      tflite_interpreter = tf.lite.Interpreter(
          model_content=tflite_serialized_model)

      tflite_runner = tflite_interpreter.get_signature_runner('action')
      tflite_signature = tflite_interpreter.get_signature_list()['action']

      if has_input_fn_and_spec:
        tflite_action_input_dict = {
            'example': action_string_vector_values,
        }
      else:
        tflite_action_input_dict = collections.OrderedDict(
            (spec.name, value) for (spec, value) in zip(
                tf.nest.flatten(action_input_spec),
                tf.nest.flatten(action_input_values)))

      self.assertEqual(
          set(tflite_signature['inputs']),
          set(tflite_action_input_dict))
      self.assertEqual(
          set(tflite_signature['outputs']),
          set(action_output_dict))

      tflite_output = tflite_runner(**tflite_action_input_dict)

      self.assertAllClose(tflite_output, action_output_dict)
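
For reference, the save-and-convert flow exercised above can be condensed into a small standalone helper. This is a minimal sketch, not part of the test: `my_policy` stands for any TF-Agents `TFPolicy`, `train_step` for a step-counter variable (e.g. from `common.create_variable`), and `export_dir` for a writable directory.

import tensorflow as tf
from tf_agents.policies import policy_saver

def export_policy_to_tflite(my_policy, train_step, export_dir):
    # Save the policy as a SavedModel exposing an 'action' signature.
    saver = policy_saver.PolicySaver(my_policy, batch_size=None,
                                     train_step=train_step)
    saver.save(export_dir)

    # Convert only the 'action' signature to TFLite.
    converter = tf.lite.TFLiteConverter.from_saved_model(
        export_dir, signature_keys=['action'])
    # SELECT_TF_OPS is needed while some policy ops (RNG, atan, ...) lack
    # native TFLite kernels, as noted in the test above.
    converter.target_spec.supported_ops = [
        tf.lite.OpsSet.TFLITE_BUILTINS,
        tf.lite.OpsSet.SELECT_TF_OPS,
    ]
    tflite_model = converter.convert()

    # Wrap the flatbuffer in an interpreter and return its signature runner.
    interpreter = tf.lite.Interpreter(model_content=tflite_model)
    return interpreter.get_signature_runner('action')

The returned runner is then invoked exactly like `tflite_runner(**tflite_action_input_dict)` in the test.
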
Example #12
0
    def __init__(self, action_spec: BoundedTensorSpec):
        super().__init__(action_spec)

        self._highest_return = common.create_variable("highest_reward",
                                                      float("-inf"),
                                                      dtype=tf.float32)
Example #13
0
 def testNonScalarInitialValue(self):
   var = common.create_variable('var', [1, 2], shape=None)
   self.evaluate(tf.compat.v1.global_variables_initializer())
   self.assertAllEqual(self.evaluate(var), [1, 2])
Example #14
0
    def __init__(self,
                 observation_spec,
                 action_spec,
                 actor_network: DistributionNetwork,
                 critic_network: Network,
                 critic_loss=None,
                 target_entropy=None,
                 initial_log_alpha=0.0,
                 target_update_tau=0.05,
                 target_update_period=1,
                 dqda_clipping=None,
                 actor_optimizer=None,
                 critic_optimizer=None,
                 alpha_optimizer=None,
                 gradient_clipping=None,
                 debug_summaries=False,
                 name="SacAlgorithm"):
        """Create a SacAlgorithm

        Args:
            action_spec (nested BoundedTensorSpec): representing the actions.
            actor_network (Network): The network will be called with
                call(observation, step_type).
            critic_network (Network): The network will be called with
                call(observation, action, step_type).
            critic_loss (None|OneStepTDLoss): an object for calculating critic loss.
                If None, a default OneStepTDLoss will be used.
            initial_log_alpha (float): initial value for variable log_alpha
            target_entropy (float|None): The target average policy entropy, for updating alpha.
            target_update_tau (float): Factor for soft update of the target
                networks.
            target_update_period (int): Period for soft update of the target
                networks.
            dqda_clipping (float): when computing the actor loss, clips the
                gradient dqda element-wise between [-dqda_clipping, dqda_clipping].
                Does not perform clipping if dqda_clipping == 0.
            actor_optimizer (tf.optimizers.Optimizer): The optimizer for actor.
            critic_optimizer (tf.optimizers.Optimizer): The optimizer for critic.
            alpha_optimizer (tf.optimizers.Optimizer): The optimizer for alpha.
            gradient_clipping (float): Norm length to clip gradients.
            debug_summaries (bool): True if debug summaries should be created.
            name (str): The name of this algorithm.
        """
        critic_network1 = critic_network
        critic_network2 = critic_network.copy(name='CriticNetwork2')
        log_alpha = tfa_common.create_variable(name='log_alpha',
                                               initial_value=initial_log_alpha,
                                               dtype=tf.float32,
                                               trainable=True)
        super().__init__(
            observation_spec,
            action_spec,
            train_state_spec=SacState(
                share=SacShareState(actor=actor_network.state_spec),
                actor=SacActorState(critic1=critic_network.state_spec,
                                    critic2=critic_network.state_spec),
                critic=SacCriticState(
                    critic1=critic_network.state_spec,
                    critic2=critic_network.state_spec,
                    target_critic1=critic_network.state_spec,
                    target_critic2=critic_network.state_spec)),
            optimizer=[actor_optimizer, critic_optimizer, alpha_optimizer],
            trainable_module_sets=[[actor_network],
                                   [critic_network1, critic_network2],
                                   [log_alpha]],
            gradient_clipping=gradient_clipping,
            debug_summaries=debug_summaries,
            name=name)

        self._log_alpha = log_alpha
        self._actor_network = actor_network
        self._critic_network1 = critic_network1
        self._critic_network2 = critic_network2
        self._target_critic_network1 = self._critic_network1.copy(
            name='target_critic_network1')
        self._target_critic_network2 = self._critic_network2.copy(
            name='target_critic_network2')
        self._actor_optimizer = actor_optimizer
        self._critic_optimizer = critic_optimizer
        self._alpha_optimizer = alpha_optimizer

        if critic_loss is None:
            critic_loss = OneStepTDLoss(debug_summaries=debug_summaries)
        self._critic_loss = critic_loss

        flat_action_spec = tf.nest.flatten(self._action_spec)
        self._is_continuous = tensor_spec.is_continuous(flat_action_spec[0])
        if target_entropy is None:
            target_entropy = np.sum(
                list(
                    map(dist_utils.calc_default_target_entropy,
                        flat_action_spec)))
        self._target_entropy = target_entropy

        self._dqda_clipping = dqda_clipping

        self._update_target = common.get_target_updater(
            models=[self._critic_network1, self._critic_network2],
            target_models=[
                self._target_critic_network1, self._target_critic_network2
            ],
            tau=target_update_tau,
            period=target_update_period)
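
The `target_update_tau` / `target_update_period` arguments control the usual soft ("Polyak") target update. The sketch below is a generic illustration of that rule, not the ALF `common.get_target_updater` implementation: every `period` training steps each target variable is moved a fraction `tau` of the way toward its source variable.

import tensorflow as tf

def soft_update(source_vars, target_vars, tau):
    """Polyak averaging: target <- (1 - tau) * target + tau * source."""
    for src, tgt in zip(source_vars, target_vars):
        tgt.assign((1.0 - tau) * tgt + tau * src)

# Example: with tau=0.05, a target variable at 0.0 tracking a source at 1.0
# moves to 0.05 after a single update.
src = [tf.Variable(1.0)]
tgt = [tf.Variable(0.0)]
soft_update(src, tgt, tau=0.05)
assert abs(tgt[0].numpy() - 0.05) < 1e-6
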
Example #15
0
    def testSaveAction(self, seeded, has_state, distribution_net,
                       has_input_fn_and_spec):
        with tf.compat.v1.Graph().as_default():
            tf.compat.v1.set_random_seed(self._global_seed)
            with tf.compat.v1.Session().as_default():
                global_step = common.create_variable('train_step',
                                                     initial_value=0)
                if distribution_net:
                    network = actor_distribution_network.ActorDistributionNetwork(
                        self._time_step_spec.observation, self._action_spec)
                    policy = actor_policy.ActorPolicy(
                        time_step_spec=self._time_step_spec,
                        action_spec=self._action_spec,
                        actor_network=network)
                else:
                    if has_state:
                        network = q_rnn_network.QRnnNetwork(
                            input_tensor_spec=self._time_step_spec.observation,
                            action_spec=self._action_spec,
                            lstm_size=(40, ))
                    else:
                        network = q_network.QNetwork(
                            input_tensor_spec=self._time_step_spec.observation,
                            action_spec=self._action_spec)

                    policy = q_policy.QPolicy(
                        time_step_spec=self._time_step_spec,
                        action_spec=self._action_spec,
                        q_network=network)

                action_seed = 98723

                batch_size = 3
                action_inputs = tensor_spec.sample_spec_nest(
                    (self._time_step_spec, policy.policy_state_spec),
                    outer_dims=(batch_size, ),
                    seed=4)
                action_input_values = self.evaluate(action_inputs)
                action_input_tensors = tf.nest.map_structure(
                    tf.convert_to_tensor, action_input_values)

                action_output = policy.action(*action_input_tensors,
                                              seed=action_seed)
                distribution_output = policy.distribution(
                    *action_input_tensors)
                self.assertIsInstance(distribution_output.action,
                                      tfp.distributions.Distribution)

                self.evaluate(tf.compat.v1.global_variables_initializer())

                action_output_dict = dict(((spec.name, value) for (
                    spec,
                    value) in zip(tf.nest.flatten(policy.policy_step_spec),
                                  tf.nest.flatten(action_output))))

                # Check output of the flattened signature call.
                (action_output_value, action_output_dict) = self.evaluate(
                    (action_output, action_output_dict))

                distribution_output_value = self.evaluate(
                    _sample_from_distributions(distribution_output))

                input_fn_and_spec = None
                if has_input_fn_and_spec:
                    input_fn_and_spec = (
                        self._convert_string_vector_to_action_input,
                        tf.TensorSpec((7, ), tf.string, name='example'))

                saver = policy_saver.PolicySaver(
                    policy,
                    batch_size=None,
                    use_nest_path_signatures=False,
                    seed=action_seed,
                    input_fn_and_spec=input_fn_and_spec,
                    train_step=global_step)
                path = os.path.join(self.get_temp_dir(), 'save_model_action')
                saver.save(path)

        with tf.compat.v1.Graph().as_default():
            tf.compat.v1.set_random_seed(self._global_seed)
            with tf.compat.v1.Session().as_default():
                reloaded = tf.compat.v2.saved_model.load(path)

                self.assertIn('action', reloaded.signatures)
                reloaded_action = reloaded.signatures['action']
                if has_input_fn_and_spec:
                    self._compare_input_output_specs(
                        reloaded_action,
                        expected_input_specs=input_fn_and_spec[1],
                        expected_output_spec=policy.policy_step_spec,
                        batch_input=True)

                else:
                    self._compare_input_output_specs(
                        reloaded_action,
                        expected_input_specs=(self._time_step_spec,
                                              policy.policy_state_spec),
                        expected_output_spec=policy.policy_step_spec,
                        batch_input=True)

                # Reload action_input_values as tensors in the new graph.
                action_input_tensors = tf.nest.map_structure(
                    tf.convert_to_tensor, action_input_values)

                action_input_spec = (self._time_step_spec,
                                     policy.policy_state_spec)
                function_action_input_dict = dict(
                    (spec.name, value)
                    for (spec,
                         value) in zip(tf.nest.flatten(action_input_spec),
                                       tf.nest.flatten(action_input_tensors)))

                # NOTE(ebrevdo): The graph-level seeds for the policy and the reloaded
                # model are equal, which in addition to seeding the call to action() and
                # PolicySaver helps ensure equality of the output of action() in both
                # cases.
                self.assertEqual(reloaded_action.graph.seed, self._global_seed)

                def match_dtype_shape(x, y, msg=None):
                    self.assertEqual(x.shape, y.shape, msg=msg)
                    self.assertEqual(x.dtype, y.dtype, msg=msg)

                # The seed= argument for the SavedModel action call was given at
                # creation of the PolicySaver.
                if has_input_fn_and_spec:
                    action_string_vector = self._convert_action_input_to_string_vector(
                        action_input_tensors)
                    reloaded_action_output_dict = reloaded_action(
                        action_string_vector)
                    reloaded_action_output = reloaded.action(
                        action_string_vector)
                    reloaded_distribution_output = reloaded.distribution(
                        action_string_vector)
                    self.assertIsInstance(reloaded_distribution_output.action,
                                          tfp.distributions.Distribution)

                else:
                    # This is the flat-signature function.
                    reloaded_action_output_dict = reloaded_action(
                        **function_action_input_dict)
                    # This is the non-flat function.
                    reloaded_action_output = reloaded.action(
                        *action_input_tensors)
                    reloaded_distribution_output = reloaded.distribution(
                        *action_input_tensors)
                    self.assertIsInstance(reloaded_distribution_output.action,
                                          tfp.distributions.Distribution)

                    if not has_state:
                        # Try both cases: one with an empty policy_state and one with no
                        # policy_state.  Compare them.

                        # NOTE(ebrevdo): The first call to .action() must be stored in
                        # reloaded_action_output because this is the version being compared
                        # later against the true action_output and the values will change
                        # after the first call due to randomness.
                        reloaded_action_output_no_input_state = reloaded.action(
                            action_input_tensors[0])
                        reloaded_distribution_output_no_input_state = reloaded.distribution(
                            action_input_tensors[0])
                        # Even with a seed, multiple calls to action will get different
                        # values, so here we just check the signature matches.
                        self.assertIsInstance(
                            reloaded_distribution_output_no_input_state.action,
                            tfp.distributions.Distribution)
                        tf.nest.map_structure(
                            match_dtype_shape,
                            reloaded_action_output_no_input_state,
                            reloaded_action_output)

                        tf.nest.map_structure(
                            match_dtype_shape,
                            _sample_from_distributions(
                                reloaded_distribution_output_no_input_state),
                            _sample_from_distributions(
                                reloaded_distribution_output))

                self.evaluate(tf.compat.v1.global_variables_initializer())
                (reloaded_action_output_dict,
                 reloaded_action_output_value) = self.evaluate(
                     (reloaded_action_output_dict, reloaded_action_output))

                reloaded_distribution_output_value = self.evaluate(
                    _sample_from_distributions(reloaded_distribution_output))

                self.assertAllEqual(action_output_dict.keys(),
                                    reloaded_action_output_dict.keys())

                for k in action_output_dict:
                    if seeded:
                        self.assertAllClose(action_output_dict[k],
                                            reloaded_action_output_dict[k],
                                            msg='\nMismatched dict key: %s.' %
                                            k)
                    else:
                        match_dtype_shape(action_output_dict[k],
                                          reloaded_action_output_dict[k],
                                          msg='\nMismatch dict key: %s.' % k)

                # With non-signature functions, we can check that passing a seed does
                # the right thing the second time.
                if seeded:
                    tf.nest.map_structure(self.assertAllClose,
                                          action_output_value,
                                          reloaded_action_output_value)
                else:
                    tf.nest.map_structure(match_dtype_shape,
                                          action_output_value,
                                          reloaded_action_output_value)

                tf.nest.map_structure(self.assertAllClose,
                                      distribution_output_value,
                                      reloaded_distribution_output_value)
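
After `tf.saved_model.load`, the test exercises two call styles: the flat `signatures['action']` function, which takes keyword arguments named after the flattened input specs, and the attribute-style `reloaded.action(...)`, which takes the original nested structures. A minimal sketch of the difference, assuming `reloaded`, `time_step`, `policy_state`, and `action_input_spec` are already in scope as in the test:

import tensorflow as tf

# Assumed to be in scope, as in the test above:
#   reloaded          -- result of tf.saved_model.load(path)
#   time_step         -- a batched ts.TimeStep
#   policy_state      -- matching policy state (possibly ())
#   action_input_spec -- (time_step_spec, policy_state_spec)

# Flat SavedModel signature: keyword arguments named after the flattened specs.
flat_action = reloaded.signatures['action']
flat_inputs = {
    spec.name: value
    for spec, value in zip(tf.nest.flatten(action_input_spec),
                           tf.nest.flatten((time_step, policy_state)))
}
output_dict = flat_action(**flat_inputs)   # a flat dict of output tensors

# Attribute-style call: pass the nested structures directly and get a
# PolicyStep nest back.
policy_step = reloaded.action(time_step, policy_state)
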
Example #16
0
  def testTrain(self, num_epochs, use_td_lambda_return,
                compute_value_and_advantage_in_train):
    # Mock the build_train_op to return an op for incrementing this counter.
    counter = common.create_variable('test_train_counter')
    agent = ppo_agent.PPOAgent(
        self._time_step_spec,
        self._action_spec,
        tf.compat.v1.train.AdamOptimizer(),
        actor_net=DummyActorNet(
            self._obs_spec,
            self._action_spec,
        ),
        value_net=DummyValueNet(self._obs_spec),
        normalize_observations=False,
        num_epochs=num_epochs,
        use_gae=use_td_lambda_return,
        use_td_lambda_return=use_td_lambda_return,
        compute_value_and_advantage_in_train=compute_value_and_advantage_in_train,
        train_step_counter=counter)
    observations = tf.constant([
        [[1, 2], [3, 4], [5, 6]],
        [[1, 2], [3, 4], [5, 6]],
    ],
                               dtype=tf.float32)

    mid_time_step_val = ts.StepType.MID.tolist()
    time_steps = ts.TimeStep(
        step_type=tf.constant([[mid_time_step_val] * 3] * 2, dtype=tf.int32),
        reward=tf.constant([[1] * 3] * 2, dtype=tf.float32),
        discount=tf.constant([[1] * 3] * 2, dtype=tf.float32),
        observation=observations)
    actions = tf.constant([[[0], [1], [1]], [[0], [1], [1]]], dtype=tf.float32)

    action_distribution_parameters = {
        'loc': tf.constant([[[0.0]] * 3] * 2, dtype=tf.float32),
        'scale': tf.constant([[[1.0]] * 3] * 2, dtype=tf.float32),
    }
    value_preds = tf.constant([[9., 15., 21.], [9., 15., 21.]],
                              dtype=tf.float32)

    policy_info = {
        'dist_params': action_distribution_parameters,
    }
    if not compute_value_and_advantage_in_train:
      policy_info['value_prediction'] = value_preds
    experience = trajectory.Trajectory(time_steps.step_type, observations,
                                       actions, policy_info,
                                       time_steps.step_type, time_steps.reward,
                                       time_steps.discount)
    if not compute_value_and_advantage_in_train:
      experience = agent._preprocess(experience)

    if tf.executing_eagerly():
      loss = lambda: agent.train(experience)
    else:
      loss = agent.train(experience)

    # Assert that counter starts out at zero.
    self.evaluate(tf.compat.v1.initialize_all_variables())
    self.assertEqual(0, self.evaluate(counter))
    loss_type = self.evaluate(loss)
    loss_numpy = loss_type.loss

    # Assert that loss is not zero as we are training in a non-episodic env.
    self.assertNotEqual(
        loss_numpy,
        0.0,
        msg=('Loss is exactly zero, looks like no training '
             'was performed due to incomplete episodes.'))

    # Assert that train_op ran increment_counter num_epochs times.
    self.assertEqual(num_epochs, self.evaluate(counter))
Example #17
0
    def testSaveGetInitialState(self):
        network = q_rnn_network.QRnnNetwork(
            input_tensor_spec=self._time_step_spec.observation,
            action_spec=self._action_spec,
            lstm_size=(40, ))

        policy = q_policy.QPolicy(time_step_spec=self._time_step_spec,
                                  action_spec=self._action_spec,
                                  q_network=network)

        train_step = common.create_variable('train_step', initial_value=0)
        saver_nobatch = policy_saver.PolicySaver(
            policy,
            train_step=train_step,
            batch_size=None,
            use_nest_path_signatures=False)
        path = os.path.join(self.get_temp_dir(),
                            'save_model_initial_state_nobatch')

        self.evaluate(tf.compat.v1.global_variables_initializer())

        with self.cached_session():
            saver_nobatch.save(path)
            reloaded_nobatch = tf.compat.v2.saved_model.load(path)
            self.evaluate(
                tf.compat.v1.initializers.variables(
                    reloaded_nobatch.model_variables))

        self.assertIn('get_initial_state', reloaded_nobatch.signatures)
        reloaded_get_initial_state = (
            reloaded_nobatch.signatures['get_initial_state'])
        self._compare_input_output_specs(
            reloaded_get_initial_state,
            expected_input_specs=(tf.TensorSpec(dtype=tf.int32,
                                                shape=(),
                                                name='batch_size'), ),
            expected_output_spec=policy.policy_state_spec,
            batch_input=False,
            batch_size=None)

        initial_state = policy.get_initial_state(batch_size=3)
        initial_state = self.evaluate(initial_state)

        reloaded_nobatch_initial_state = reloaded_nobatch.get_initial_state(
            batch_size=3)
        reloaded_nobatch_initial_state = self.evaluate(
            reloaded_nobatch_initial_state)
        tf.nest.map_structure(self.assertAllClose, initial_state,
                              reloaded_nobatch_initial_state)

        saver_batch = policy_saver.PolicySaver(policy,
                                               train_step=train_step,
                                               batch_size=3,
                                               use_nest_path_signatures=False)
        path = os.path.join(self.get_temp_dir(),
                            'save_model_initial_state_batch')
        with self.cached_session():
            saver_batch.save(path)
            reloaded_batch = tf.compat.v2.saved_model.load(path)
            self.evaluate(
                tf.compat.v1.initializers.variables(
                    reloaded_batch.model_variables))
        self.assertIn('get_initial_state', reloaded_batch.signatures)
        reloaded_get_initial_state = reloaded_batch.signatures[
            'get_initial_state']
        self._compare_input_output_specs(
            reloaded_get_initial_state,
            expected_input_specs=(),
            expected_output_spec=policy.policy_state_spec,
            batch_input=False,
            batch_size=3)

        reloaded_batch_initial_state = reloaded_batch.get_initial_state()
        reloaded_batch_initial_state = self.evaluate(
            reloaded_batch_initial_state)
        tf.nest.map_structure(self.assertAllClose, initial_state,
                              reloaded_batch_initial_state)
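
The two savers above produce different `get_initial_state` signatures: with `batch_size=None` the batch size is an argument of the exported function, while a fixed `batch_size` bakes it in. A minimal usage sketch, reusing the `reloaded_nobatch` and `reloaded_batch` objects from the test:

import tensorflow as tf

# Assumed: reloaded_nobatch and reloaded_batch are the two SavedModels
# produced above (saved with batch_size=None and batch_size=3 respectively).

# batch_size=None: the batch size is supplied at call time.
state_nobatch = reloaded_nobatch.get_initial_state(batch_size=3)

# batch_size=3: the saved function takes no arguments.
state_batch = reloaded_batch.get_initial_state()

# Both yield the same zero LSTM state structure for a batch of 3.
tf.nest.map_structure(
    lambda a, b: tf.debugging.assert_near(a, b), state_nobatch, state_batch)
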
Example #18
0
  def testStatelessValueNetTrain(self, compute_value_and_advantage_in_train):
    counter = common.create_variable('test_train_counter')
    actor_net = actor_distribution_rnn_network.ActorDistributionRnnNetwork(
        self._time_step_spec.observation,
        self._action_spec,
        input_fc_layer_params=None,
        output_fc_layer_params=None,
        lstm_size=(20,))
    value_net = value_network.ValueNetwork(
        self._time_step_spec.observation, fc_layer_params=None)
    agent = ppo_agent.PPOAgent(
        self._time_step_spec,
        self._action_spec,
        optimizer=tf.compat.v1.train.AdamOptimizer(),
        actor_net=actor_net,
        value_net=value_net,
        num_epochs=1,
        train_step_counter=counter,
        compute_value_and_advantage_in_train=compute_value_and_advantage_in_train
    )
    observations = tf.constant([
        [[1, 2], [3, 4], [5, 6]],
        [[1, 2], [3, 4], [5, 6]],
    ],
                               dtype=tf.float32)

    mid_time_step_val = ts.StepType.MID.tolist()
    time_steps = ts.TimeStep(
        step_type=tf.constant([[mid_time_step_val] * 3] * 2, dtype=tf.int32),
        reward=tf.constant([[1] * 3] * 2, dtype=tf.float32),
        discount=tf.constant([[1] * 3] * 2, dtype=tf.float32),
        observation=observations)
    actions = tf.constant([[[0], [1], [1]], [[0], [1], [1]]], dtype=tf.float32)

    action_distribution_parameters = {
        'loc': tf.constant([[[0.0]] * 3] * 2, dtype=tf.float32),
        'scale': tf.constant([[[1.0]] * 3] * 2, dtype=tf.float32),
    }
    value_preds = tf.constant([[9., 15., 21.], [9., 15., 21.]],
                              dtype=tf.float32)

    policy_info = {
        'dist_params': action_distribution_parameters,
    }
    if not compute_value_and_advantage_in_train:
      policy_info['value_prediction'] = value_preds
    experience = trajectory.Trajectory(time_steps.step_type, observations,
                                       actions, policy_info,
                                       time_steps.step_type, time_steps.reward,
                                       time_steps.discount)
    if not compute_value_and_advantage_in_train:
      experience = agent._preprocess(experience)

    if tf.executing_eagerly():
      loss = lambda: agent.train(experience)
    else:
      loss = agent.train(experience)

    self.evaluate(tf.compat.v1.initialize_all_variables())

    loss_type = self.evaluate(loss)
    loss_numpy = loss_type.loss
    # Assert that loss is not zero as we are training in a non-episodic env.
    self.assertNotEqual(
        loss_numpy,
        0.0,
        msg=('Loss is exactly zero, looks like no training '
             'was performed due to incomplete episodes.'))
Example #19
0
  def __init__(self,
               data_spec,
               batch_size,
               max_length=1000,
               scope='TFUniformReplayBuffer',
               device='cpu:*',
               table_fn=table.Table,
               dataset_drop_remainder=False,
               dataset_window_shift=None,
               stateful_dataset=False):
    """Creates a TFUniformReplayBuffer.

    The TFUniformReplayBuffer stores episodes in `B == batch_size` blocks of
    size `L == max_length`, with total frame capacity
    `C == L * B`.  Storage looks like:

    ```
    block1 ep1 frame1
               frame2
           ...
           ep2 frame1
               frame2
           ...
           <L frames total>
    block2 ep1 frame1
               frame2
           ...
           ep2 frame1
               frame2
           ...
           <L frames total>
    ...
    blockB ep1 frame1
               frame2
           ...
           ep2 frame1
               frame2
           ...
           <L frames total>
    ```
    Multiple episodes may be stored within a given block, up to `max_length`
    frames total.  In practice, new episodes will overwrite old ones as the
    block rolls over its `max_length`.

    Args:
      data_spec: A TensorSpec or a list/tuple/nest of TensorSpecs describing a
        single item that can be stored in this buffer.
      batch_size: Batch dimension of tensors when adding to buffer.
      max_length: The maximum number of items that can be stored in a single
        batch segment of the buffer.
      scope: Scope prefix for variables and ops created by this class.
      device: A TensorFlow device to place the Variables and ops.
      table_fn: Function to create tables `table_fn(data_spec, capacity)` that
        can read/write nested tensors.
      dataset_drop_remainder: If `True`, then when calling
        `as_dataset` with arguments `single_deterministic_pass=True` and
        `sample_batch_size is not None`, the final batch will be dropped if it
        does not contain exactly `sample_batch_size` items.  This is helpful for
        static shape inference as the resulting tensors will always have
        leading dimension `sample_batch_size` instead of `None`.
      dataset_window_shift: Window shift used when calling `as_dataset` with
        arguments `single_deterministic_pass=True` and `num_steps is not None`.
        This determines how the resulting frames are windowed.  If `None`, then
        there is no overlap created between frames and each frame is seen
        exactly once.  For example, if `max_length=5`, `num_steps=2`,
        `sample_batch_size=None`, and `dataset_window_shift=None`, then the
        datasets returned will have frames `{[0, 1], [2, 3], [4]}`.

        If `dataset_window_shift is not None`, then windows are created with a
        stride of `dataset_window_shift`, so each frame can appear in up to
        `num_steps` windows.  For example, if `max_length=5`, `num_steps=2`,
        `sample_batch_size=None`, and `dataset_window_shift=1`, then the
        datasets returned will have windows of shifted, overlapping frames:
        `{[0, 1], [1, 2], [2, 3], [3, 4], [4]}`.

        For more details, see the documentation of `tf.data.Dataset.window`,
        specifically for the `shift` argument.

        The default behavior is to not overlap frames
        (`dataset_window_shift=None`) but users often want to see all
        combinations of frame sequences, in which case `dataset_window_shift=1`
        is the appropriate value.
      stateful_dataset: whether the dataset contains stateful ops or not.
    """
    self._batch_size = batch_size
    self._max_length = max_length
    capacity = self._batch_size * self._max_length
    super(TFUniformReplayBuffer, self).__init__(
        data_spec, capacity, stateful_dataset)

    self._id_spec = tensor_spec.TensorSpec([], dtype=tf.int64, name='id')
    self._capacity_value = np.int64(self._capacity)
    self._batch_offsets = (
        tf.range(self._batch_size, dtype=tf.int64) * self._max_length)
    self._scope = scope
    self._device = device
    self._table_fn = table_fn
    self._dataset_drop_remainder = dataset_drop_remainder
    self._dataset_window_shift = dataset_window_shift
    with tf.device(self._device), tf.compat.v1.variable_scope(self._scope):
      self._capacity = tf.constant(capacity, dtype=tf.int64)
      self._data_table = table_fn(self._data_spec, self._capacity_value)
      self._id_table = table_fn(self._id_spec, self._capacity_value)
      self._last_id = common.create_variable('last_id', -1)
      self._last_id_cs = tf.CriticalSection(name='last_id')
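
A minimal usage sketch of the layout described in the docstring, assuming the standard TF-Agents constructors: each `add_batch` call writes one frame into each of the `batch_size` blocks, and `as_dataset` reads frames back as (optionally windowed) sub-sequences.

import tensorflow as tf
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.specs import tensor_spec

data_spec = tensor_spec.TensorSpec([2], tf.float32, name='obs')
rb = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    data_spec, batch_size=3, max_length=5)   # capacity C = 3 * 5 = 15 frames

# Each add_batch() writes one frame into each of the 3 blocks.
for t in range(5):
    rb.add_batch(tf.fill([3, 2], float(t)))

# Uniform random sampling of length-2 sub-sequences.
dataset = rb.as_dataset(sample_batch_size=4, num_steps=2)
item, _ = next(iter(dataset))   # item has shape [4, 2, 2]

# For a deterministic pass over every overlapping [t, t+1] pair, construct the
# buffer with dataset_window_shift=1 and call
# as_dataset(single_deterministic_pass=True, num_steps=2).
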
Example #20
0
    def __init__(self,
                 time_step_spec,
                 action_spec,
                 optimizer=None,
                 actor_net=None,
                 value_net=None,
                 importance_ratio_clipping=0.0,
                 lambda_value=0.95,
                 discount_factor=0.99,
                 entropy_regularization=0.0,
                 policy_l2_reg=0.0,
                 value_function_l2_reg=0.0,
                 value_pred_loss_coef=0.5,
                 num_epochs=25,
                 use_gae=False,
                 use_td_lambda_return=False,
                 normalize_rewards=True,
                 reward_norm_clipping=10.0,
                 normalize_observations=True,
                 log_prob_clipping=0.0,
                 kl_cutoff_factor=2.0,
                 kl_cutoff_coef=1000.0,
                 initial_adaptive_kl_beta=1.0,
                 adaptive_kl_target=0.01,
                 adaptive_kl_tolerance=0.3,
                 gradient_clipping=None,
                 check_numerics=False,
                 debug_summaries=False,
                 summarize_grads_and_vars=False,
                 train_step_counter=None,
                 name=None):
        """Creates a PPO Agent.

    Args:
      time_step_spec: A `TimeStep` spec of the expected time_steps.
      action_spec: A nest of BoundedTensorSpec representing the actions.
      optimizer: Optimizer to use for the agent.
      actor_net: A function actor_net(observations, action_spec) that returns
        tensor of action distribution params for each observation. Takes nested
        observation and returns nested action.
      value_net: A function value_net(time_steps) that returns value tensor from
        neural net predictions for each observation. Takes nested observation
        and returns batch of value_preds.
      importance_ratio_clipping: Epsilon in clipped, surrogate PPO objective.
        For more detail, see explanation at the top of the doc.
      lambda_value: Lambda parameter for TD-lambda computation.
      discount_factor: Discount factor for return computation.
      entropy_regularization: Coefficient for entropy regularization loss term.
      policy_l2_reg: Coefficient for l2 regularization of policy weights.
      value_function_l2_reg: Coefficient for l2 regularization of value function
        weights.
      value_pred_loss_coef: Multiplier for value prediction loss to balance with
        policy gradient loss.
      num_epochs: Number of epochs for computing policy updates.
      use_gae: If True (default False), uses generalized advantage estimation
        for computing per-timestep advantage. Else, just subtracts value
        predictions from empirical return.
      use_td_lambda_return: If True (default False), uses td_lambda_return for
        training value function. (td_lambda_return = gae_advantage +
        value_predictions)
      normalize_rewards: If true, keeps moving variance of rewards and
        normalizes incoming rewards.
      reward_norm_clipping: Value above and below which to clip the normalized
        reward.
      normalize_observations: If true, keeps moving mean and variance of
        observations and normalizes incoming observations.
      log_prob_clipping: +/- value for clipping log probs to prevent inf / NaN
        values.  Default: no clipping.
      kl_cutoff_factor: If policy KL changes more than this much for any single
        timestep, adds a squared KL penalty to loss function.
      kl_cutoff_coef: Loss coefficient for kl cutoff term.
      initial_adaptive_kl_beta: Initial value for beta coefficient of adaptive
        kl penalty.
      adaptive_kl_target: Desired kl target for policy updates. If actual kl is
        far from this target, adaptive_kl_beta will be updated.
      adaptive_kl_tolerance: A tolerance for adaptive_kl_beta. Mean KL above (1
        + tol) * adaptive_kl_target, or below (1 - tol) * adaptive_kl_target,
        will cause adaptive_kl_beta to be updated.
      gradient_clipping: Norm length to clip gradients.  Default: no clipping.
      check_numerics: If true, adds tf.debugging.check_numerics to help find
        NaN / Inf values. For debugging only.
      debug_summaries: A bool to gather debug summaries.
      summarize_grads_and_vars: If true, gradient summaries will be written.
      train_step_counter: An optional counter to increment every time the train
        op is run.  Defaults to the global_step.
      name: The name of this agent. All variables in this module will fall
        under that name. Defaults to the class name.

    Raises:
      ValueError: If the actor_net is not a DistributionNetwork.
    """
        if not isinstance(actor_net, network.DistributionNetwork):
            raise ValueError(
                'actor_net must be an instance of a DistributionNetwork.')

        tf.Module.__init__(self, name=name)

        self._optimizer = optimizer
        self._actor_net = actor_net
        self._value_net = value_net
        self._importance_ratio_clipping = importance_ratio_clipping
        self._lambda = lambda_value
        self._discount_factor = discount_factor
        self._entropy_regularization = entropy_regularization
        self._policy_l2_reg = policy_l2_reg
        self._value_function_l2_reg = value_function_l2_reg
        self._value_pred_loss_coef = value_pred_loss_coef
        self._num_epochs = num_epochs
        self._use_gae = use_gae
        self._use_td_lambda_return = use_td_lambda_return
        self._reward_norm_clipping = reward_norm_clipping
        self._log_prob_clipping = log_prob_clipping
        self._kl_cutoff_factor = kl_cutoff_factor
        self._kl_cutoff_coef = kl_cutoff_coef
        self._adaptive_kl_target = adaptive_kl_target
        self._adaptive_kl_tolerance = adaptive_kl_tolerance
        self._gradient_clipping = gradient_clipping or 0.0
        self._check_numerics = check_numerics

        if initial_adaptive_kl_beta > 0.0:
            # TODO(kbanoop): Rename create_variable.
            self._adaptive_kl_beta = common.create_variable(
                'adaptive_kl_beta', initial_adaptive_kl_beta, dtype=tf.float32)
        else:
            self._adaptive_kl_beta = None

        self._reward_normalizer = None
        if normalize_rewards:
            self._reward_normalizer = tensor_normalizer.StreamingTensorNormalizer(
                tensor_spec.TensorSpec([], tf.float32),
                scope='normalize_reward')

        self._observation_normalizer = None
        if normalize_observations:
            self._observation_normalizer = (
                tensor_normalizer.StreamingTensorNormalizer(
                    time_step_spec.observation,
                    scope='normalize_observations'))

        policy = greedy_policy.GreedyPolicy(
            ppo_policy.PPOPolicy(
                time_step_spec=time_step_spec,
                action_spec=action_spec,
                actor_network=actor_net,
                value_network=value_net,
                observation_normalizer=self._observation_normalizer,
                clip=False,
                collect=False))

        collect_policy = ppo_policy.PPOPolicy(
            time_step_spec=time_step_spec,
            action_spec=action_spec,
            actor_network=actor_net,
            value_network=value_net,
            observation_normalizer=self._observation_normalizer,
            clip=False,
            collect=True)

        self._action_distribution_spec = (self._actor_net.output_spec)

        super(PPOAgent,
              self).__init__(time_step_spec,
                             action_spec,
                             policy,
                             collect_policy,
                             train_sequence_length=None,
                             debug_summaries=debug_summaries,
                             summarize_grads_and_vars=summarize_grads_and_vars,
                             train_step_counter=train_step_counter)
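
The `use_gae` / `use_td_lambda_return` options refer to the standard generalized advantage estimation recursion. The NumPy sketch below illustrates how `lambda_value` and `discount_factor` combine rewards and value predictions; it is an illustration of the formula, not the agent's internal implementation.

import numpy as np

def gae_advantages(rewards, values, next_values, discounts, lambda_value):
    """delta_t = r_t + gamma * V(s_{t+1}) - V(s_t);
    A_t = delta_t + gamma * lambda * A_{t+1}."""
    deltas = rewards + discounts * next_values - values
    advantages = np.zeros_like(deltas)
    gae = 0.0
    for t in reversed(range(len(deltas))):
        gae = deltas[t] + discounts[t] * lambda_value * gae
        advantages[t] = gae
    return advantages

rewards = np.array([1.0, 1.0, 1.0])
values = np.array([9.0, 15.0, 21.0])       # value predictions, as in the tests above
next_values = np.array([15.0, 21.0, 0.0])  # bootstrap value of 0.0 assumed at the end
discounts = 0.99 * np.ones(3)
adv = gae_advantages(rewards, values, next_values, discounts, lambda_value=0.95)
# With use_td_lambda_return=True, the value net is trained on adv + values.
td_lambda_returns = adv + values
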
Example #21
0
 def __init__(self, name='NumberOfEpisodes', prefix='Metrics', dtype=tf.int64):
   super(NumberOfEpisodes, self).__init__(name=name, prefix=prefix)
   self.dtype = dtype
   self.number_episodes = common.create_variable(
       initial_value=0, dtype=self.dtype, shape=(), name='number_episodes')
Example #22
0
    def __init__(self,
                 time_step_spec: ts.TimeStep,
                 action_spec: types.NestedTensorSpec,
                 critic_network: network.Network,
                 actor_network: network.Network,
                 actor_optimizer: types.Optimizer,
                 critic_optimizer: types.Optimizer,
                 alpha_optimizer: types.Optimizer,
                 actor_loss_weight: types.Float = 1.0,
                 critic_loss_weight: types.Float = 0.5,
                 alpha_loss_weight: types.Float = 1.0,
                 actor_policy_ctor: Callable[
                     ..., tf_policy.TFPolicy] = actor_policy.ActorPolicy,
                 critic_network_2: Optional[network.Network] = None,
                 target_critic_network: Optional[network.Network] = None,
                 target_critic_network_2: Optional[network.Network] = None,
                 target_update_tau: types.Float = 1.0,
                 target_update_period: types.Int = 1,
                 td_errors_loss_fn: types.LossFn = tf.math.squared_difference,
                 gamma: types.Float = 1.0,
                 reward_scale_factor: types.Float = 1.0,
                 initial_log_alpha: types.Float = 0.0,
                 use_log_alpha_in_alpha_loss: bool = True,
                 target_entropy: Optional[types.Float] = None,
                 gradient_clipping: Optional[types.Float] = None,
                 debug_summaries: bool = False,
                 summarize_grads_and_vars: bool = False,
                 train_step_counter: Optional[tf.Variable] = None,
                 name: Optional[Text] = None):
        """Creates a SAC Agent.

    Args:
      time_step_spec: A `TimeStep` spec of the expected time_steps.
      action_spec: A nest of BoundedTensorSpec representing the actions.
      critic_network: A function critic_network((observations, actions)) that
        returns the q_values for each observation and action.
      actor_network: A function actor_network(observation, action_spec) that
        returns action distribution.
      actor_optimizer: The optimizer to use for the actor network.
      critic_optimizer: The default optimizer to use for the critic network.
      alpha_optimizer: The default optimizer to use for the alpha variable.
      actor_loss_weight: The weight on actor loss.
      critic_loss_weight: The weight on critic loss.
      alpha_loss_weight: The weight on alpha loss.
      actor_policy_ctor: The policy class to use.
      critic_network_2: (Optional.)  A `tf_agents.network.Network` to be used as
        the second critic network during Q learning.  The weights from
        `critic_network` are copied if this is not provided.
      target_critic_network: (Optional.)  A `tf_agents.network.Network` to be
        used as the target critic network during Q learning. Every
        `target_update_period` train steps, the weights from `critic_network`
        are copied (possibly with smoothing via `target_update_tau`) to `
        target_critic_network`.  If `target_critic_network` is not provided, it
        is created by making a copy of `critic_network`, which initializes a new
        network with the same structure and its own layers and weights.
        Performing a `Network.copy` does not work when the network instance
        already has trainable parameters (e.g., has already been built, or when
        the network is sharing layers with another).  In these cases, it is up
        to you to build a copy having weights that are not shared with the
        original `critic_network`, so that this can be used as a target network.
        If you provide a `target_critic_network` that shares any weights with
        `critic_network`, a warning will be logged but no exception is thrown.
      target_critic_network_2: (Optional.) Similar network as
        target_critic_network but for the critic_network_2. See documentation
        for target_critic_network. Will only be used if 'critic_network_2' is
        also specified.
      target_update_tau: Factor for soft update of the target networks.
      target_update_period: Period for soft update of the target networks.
      td_errors_loss_fn:  A function for computing the elementwise TD errors
        loss.
      gamma: A discount factor for future rewards.
      reward_scale_factor: Multiplicative scale for the reward.
      initial_log_alpha: Initial value for log_alpha.
      use_log_alpha_in_alpha_loss: A boolean, whether using log_alpha or alpha
        in alpha loss. Certain implementations of SAC use log_alpha as log
        values are generally nicer to work with.
      target_entropy: The target average policy entropy, for updating alpha. The
        default value is the negative of the total number of action dimensions.
      gradient_clipping: Norm length to clip gradients.
      debug_summaries: A bool to gather debug summaries.
      summarize_grads_and_vars: If True, gradient and network variable summaries
        will be written during training.
      train_step_counter: An optional counter to increment every time the train
        op is run.  Defaults to the global_step.
      name: The name of this agent. All variables in this module will fall under
        that name. Defaults to the class name.
    """
        tf.Module.__init__(self, name=name)

        self._check_action_spec(action_spec)

        net_observation_spec = time_step_spec.observation
        critic_spec = (net_observation_spec, action_spec)

        self._critic_network_1 = critic_network

        if critic_network_2 is not None:
            self._critic_network_2 = critic_network_2
        else:
            self._critic_network_2 = critic_network.copy(name='CriticNetwork2')
            # Do not use target_critic_network_2 if critic_network_2 is None.
            target_critic_network_2 = None

        # Wait until critic_network_2 has been copied from critic_network_1 before
        # creating variables on both.
        self._critic_network_1.create_variables(critic_spec)
        self._critic_network_2.create_variables(critic_spec)

        if target_critic_network:
            target_critic_network.create_variables(critic_spec)

        self._target_critic_network_1 = (
            common.maybe_copy_target_network_with_checks(
                self._critic_network_1,
                target_critic_network,
                input_spec=critic_spec,
                name='TargetCriticNetwork1'))

        if target_critic_network_2:
            target_critic_network_2.create_variables(critic_spec)
        self._target_critic_network_2 = (
            common.maybe_copy_target_network_with_checks(
                self._critic_network_2,
                target_critic_network_2,
                input_spec=critic_spec,
                name='TargetCriticNetwork2'))

        if actor_network:
            actor_network.create_variables(net_observation_spec)
        self._actor_network = actor_network

        policy = actor_policy_ctor(time_step_spec=time_step_spec,
                                   action_spec=action_spec,
                                   actor_network=self._actor_network,
                                   training=False)

        self._train_policy = actor_policy_ctor(
            time_step_spec=time_step_spec,
            action_spec=action_spec,
            actor_network=self._actor_network,
            training=True)

        self._log_alpha = common.create_variable(
            'initial_log_alpha',
            initial_value=initial_log_alpha,
            dtype=tf.float32,
            trainable=True)

        if target_entropy is None:
            target_entropy = self._get_default_target_entropy(action_spec)

        self._use_log_alpha_in_alpha_loss = use_log_alpha_in_alpha_loss
        self._target_update_tau = target_update_tau
        self._target_update_period = target_update_period
        self._actor_optimizer = actor_optimizer
        self._critic_optimizer = critic_optimizer
        self._alpha_optimizer = alpha_optimizer
        self._actor_loss_weight = actor_loss_weight
        self._critic_loss_weight = critic_loss_weight
        self._alpha_loss_weight = alpha_loss_weight
        self._td_errors_loss_fn = td_errors_loss_fn
        self._gamma = gamma
        self._reward_scale_factor = reward_scale_factor
        self._target_entropy = target_entropy
        self._gradient_clipping = gradient_clipping
        self._debug_summaries = debug_summaries
        self._summarize_grads_and_vars = summarize_grads_and_vars
        self._update_target = self._get_target_updater(
            tau=self._target_update_tau, period=self._target_update_period)

        train_sequence_length = 2 if not critic_network.state_spec else None

        super(SacAgent, self).__init__(
            time_step_spec,
            action_spec,
            policy=policy,
            collect_policy=policy,
            train_sequence_length=train_sequence_length,
            debug_summaries=debug_summaries,
            summarize_grads_and_vars=summarize_grads_and_vars,
            train_step_counter=train_step_counter,
        )

        self._as_transition = data_converter.AsTransition(
            self.data_context, squeeze_time_dim=(train_sequence_length == 2))
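
Per the docstring, an unspecified `target_entropy` defaults to the negative of the total number of action dimensions. A hedged sketch of that computation is below; the library's internal helper may differ in detail (some versions scale the result), so treat this as illustrative only.

import numpy as np
import tensorflow as tf
from tf_agents.specs import tensor_spec

def default_target_entropy(action_spec):
    # -dim(A): one nat of entropy budget per action dimension.
    flat = tf.nest.flatten(action_spec)
    return -float(np.sum([np.prod(s.shape.as_list()) for s in flat]))

spec = tensor_spec.BoundedTensorSpec([2], tf.float32, -1.0, 1.0)
assert default_target_entropy(spec) == -2.0
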
Example #23
0
 def __init__(self, name='EnvironmentSteps', prefix='Metrics', dtype=tf.int64):
   super(EnvironmentSteps, self).__init__(name=name, prefix=prefix)
   self.dtype = dtype
   self.environment_steps = common.create_variable(
       initial_value=0, dtype=self.dtype, shape=(), name='environment_steps')
Example #24
0
    def __init__(self,
                 time_step_spec: ts.TimeStep,
                 action_spec: types.NestedTensorSpec,
                 critic_network: network.Network,
                 actor_network: network.Network,
                 actor_optimizer: types.Optimizer,
                 critic_optimizer: types.Optimizer,
                 alpha_optimizer: types.Optimizer,
                 actor_loss_weight: types.Float = 1.0,
                 critic_loss_weight: types.Float = 0.5,
                 alpha_loss_weight: types.Float = 1.0,
                 actor_policy_ctor: Callable[
                     ..., tf_policy.TFPolicy] = actor_policy.ActorPolicy,
                 critic_network_2: Optional[network.Network] = None,
                 target_critic_network: Optional[network.Network] = None,
                 target_critic_network_2: Optional[network.Network] = None,
                 target_update_tau: types.Float = 1.0,
                 target_update_period: types.Int = 1,
                 td_errors_loss_fn: types.LossFn = tf.math.squared_difference,
                 gamma: types.Float = 1.0,
                 sigma: types.Float = 0.9,
                 reward_scale_factor: types.Float = 1.0,
                 initial_log_alpha: types.Float = 0.0,
                 use_log_alpha_in_alpha_loss: bool = True,
                 target_entropy: Optional[types.Float] = None,
                 gradient_clipping: Optional[types.Float] = None,
                 debug_summaries: bool = False,
                 summarize_grads_and_vars: bool = False,
                 train_step_counter: Optional[tf.Variable] = None,
                 name: Optional[Text] = None):
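        """Creates a SAC agent variant.

        The arguments mirror `sac_agent.SacAgent.__init__` above; the extra
        `sigma` value is simply stored on the instance as `self.sigma`.
        """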

        tf.Module.__init__(self, name=name)

        self._check_action_spec(action_spec)

        net_observation_spec = time_step_spec.observation
        critic_spec = (net_observation_spec, action_spec)

        self._critic_network_1 = critic_network

        if critic_network_2 is not None:
            self._critic_network_2 = critic_network_2
        else:
            self._critic_network_2 = critic_network.copy(name='CriticNetwork2')
            # Do not use target_critic_network_2 if critic_network_2 is None.
            target_critic_network_2 = None

        # Wait until critic_network_2 has been copied from critic_network_1 before
        # creating variables on both.
        self._critic_network_1.create_variables(critic_spec)
        self._critic_network_2.create_variables(critic_spec)

        if target_critic_network:
            target_critic_network.create_variables(critic_spec)

        self._target_critic_network_1 = (
            common.maybe_copy_target_network_with_checks(
                self._critic_network_1,
                target_critic_network,
                input_spec=critic_spec,
                name='TargetCriticNetwork1'))

        if target_critic_network_2:
            target_critic_network_2.create_variables(critic_spec)
        self._target_critic_network_2 = (
            common.maybe_copy_target_network_with_checks(
                self._critic_network_2,
                target_critic_network_2,
                input_spec=critic_spec,
                name='TargetCriticNetwork2'))

        if actor_network:
            actor_network.create_variables(net_observation_spec)
        self._actor_network = actor_network

        policy = actor_policy_ctor(time_step_spec=time_step_spec,
                                   action_spec=action_spec,
                                   actor_network=self._actor_network,
                                   training=False)

        self._train_policy = actor_policy_ctor(
            time_step_spec=time_step_spec,
            action_spec=action_spec,
            actor_network=self._actor_network,
            training=True)

        self._log_alpha = common.create_variable(
            'initial_log_alpha',
            initial_value=initial_log_alpha,
            dtype=tf.float32,
            trainable=True)

        if target_entropy is None:
            target_entropy = self._get_default_target_entropy(action_spec)

        self._use_log_alpha_in_alpha_loss = use_log_alpha_in_alpha_loss
        self._target_update_tau = target_update_tau
        self._target_update_period = target_update_period
        self._actor_optimizer = actor_optimizer
        self._critic_optimizer = critic_optimizer
        self._alpha_optimizer = alpha_optimizer
        self._actor_loss_weight = actor_loss_weight
        self._critic_loss_weight = critic_loss_weight
        self._alpha_loss_weight = alpha_loss_weight
        self._td_errors_loss_fn = td_errors_loss_fn
        self._gamma = gamma
        self._reward_scale_factor = reward_scale_factor
        self._target_entropy = target_entropy
        self._gradient_clipping = gradient_clipping
        self._debug_summaries = debug_summaries
        self._summarize_grads_and_vars = summarize_grads_and_vars
        self._update_target = self._get_target_updater(
            tau=self._target_update_tau, period=self._target_update_period)

        self.sigma = sigma

        train_sequence_length = 2 if not critic_network.state_spec else None

        super(sac_agent.SacAgent,
              self).__init__(time_step_spec,
                             action_spec,
                             policy=policy,
                             collect_policy=policy,
                             train_sequence_length=train_sequence_length,
                             debug_summaries=debug_summaries,
                             summarize_grads_and_vars=summarize_grads_and_vars,
                             train_step_counter=train_step_counter,
                             validate_args=False)

        self._as_transition = data_converter.AsTransition(
            self.data_context, squeeze_time_dim=(train_sequence_length == 2))
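
The constructor above falls back to `self._get_default_target_entropy(action_spec)` when `target_entropy` is not given, but that helper is not part of this snippet. Below is a minimal sketch of what it presumably computes, mirroring the inline fallback in the older SacAgent constructor shown later in this listing (negative of the total number of action dimensions); the standalone function form here is illustrative only.

import numpy as np
import tensorflow as tf


def _get_default_target_entropy(action_spec):
    # Hypothetical reconstruction: flatten the (possibly nested) action spec,
    # count the action dimensions, and negate the total.
    flat_action_spec = tf.nest.flatten(action_spec)
    return -np.sum(
        [np.prod(spec.shape.as_list()) for spec in flat_action_spec])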
Ejemplo n.º 25
0
 def testDefaults(self):
     counter = common.create_variable('counter')
     self.evaluate(tf.compat.v1.global_variables_initializer())
     self.assertEqual(self.evaluate(counter), 0)
Ejemplo n.º 26
0
    def __init__(self,
                 time_step_spec,
                 action_spec,
                 critic_network,
                 actor_network,
                 actor_optimizer,
                 critic_optimizer,
                 alpha_optimizer,
                 actor_policy_ctor=actor_policy.ActorPolicy,
                 critic_network_2=None,
                 target_critic_network=None,
                 target_critic_network_2=None,
                 target_update_tau=1.0,
                 target_update_period=1,
                 td_errors_loss_fn=tf.math.squared_difference,
                 gamma=1.0,
                 reward_scale_factor=1.0,
                 initial_log_alpha=0.0,
                 target_entropy=None,
                 gradient_clipping=None,
                 debug_summaries=False,
                 summarize_grads_and_vars=False,
                 train_step_counter=None,
                 name=None):
        """Creates a SAC Agent.

    Args:
      time_step_spec: A `TimeStep` spec of the expected time_steps.
      action_spec: A nest of BoundedTensorSpec representing the actions.
      critic_network: A function critic_network((observations, actions)) that
        returns the q_values for each observation and action.
      actor_network: A function actor_network(observation, action_spec) that
        returns action distribution.
      actor_optimizer: The optimizer to use for the actor network.
      critic_optimizer: The default optimizer to use for the critic network.
      alpha_optimizer: The default optimizer to use for the alpha variable.
      actor_policy_ctor: The policy class to use.
      critic_network_2: (Optional.)  A `tf_agents.network.Network` to be used as
        the second critic network during Q learning.  The weights from
        `critic_network` are copied if this is not provided.
      target_critic_network: (Optional.)  A `tf_agents.network.Network` to be
        used as the target critic network during Q learning. Every
        `target_update_period` train steps, the weights from `critic_network`
        are copied (possibly with smoothing via `target_update_tau`) to
        `target_critic_network`.  If `target_critic_network` is not provided, it
        is created by making a copy of `critic_network`, which initializes a new
        network with the same structure and its own layers and weights.
        Performing a `Network.copy` does not work when the network instance
        already has trainable parameters (e.g., has already been built, or when
        the network is sharing layers with another).  In these cases, it is up
        to you to build a copy having weights that are not shared with the
        original `critic_network`, so that this can be used as a target network.
        If you provide a `target_critic_network` that shares any weights with
        `critic_network`, a warning will be logged but no exception is thrown.
      target_critic_network_2: (Optional.) Similar network as
        target_critic_network but for the critic_network_2. See documentation
        for target_critic_network. Will only be used if 'critic_network_2' is
        also specified.
      target_update_tau: Factor for soft update of the target networks.
      target_update_period: Period for soft update of the target networks.
      td_errors_loss_fn:  A function for computing the elementwise TD errors
        loss.
      gamma: A discount factor for future rewards.
      reward_scale_factor: Multiplicative scale for the reward.
      initial_log_alpha: Initial value for log_alpha.
      target_entropy: The target average policy entropy, for updating alpha. The
        default value is the negative of the total number of action dimensions.
      gradient_clipping: Norm length to clip gradients.
      debug_summaries: A bool to gather debug summaries.
      summarize_grads_and_vars: If True, gradient and network variable summaries
        will be written during training.
      train_step_counter: An optional counter to increment every time the train
        op is run.  Defaults to the global_step.
      name: The name of this agent. All variables in this module will fall under
        that name. Defaults to the class name.
    """
        tf.Module.__init__(self, name=name)

        flat_action_spec = tf.nest.flatten(action_spec)
        for spec in flat_action_spec:
            if spec.dtype.is_integer:
                raise NotImplementedError(
                    'SacAgent does not currently support discrete actions. '
                    'Action spec: {}'.format(action_spec))

        self._critic_network_1 = critic_network
        self._critic_network_1.create_variables()
        if target_critic_network:
            target_critic_network.create_variables()
        self._target_critic_network_1 = (
            common.maybe_copy_target_network_with_checks(
                self._critic_network_1, target_critic_network,
                'TargetCriticNetwork1'))

        if critic_network_2 is not None:
            self._critic_network_2 = critic_network_2
        else:
            self._critic_network_2 = critic_network.copy(name='CriticNetwork2')
            # Do not use target_critic_network_2 if critic_network_2 is None.
            target_critic_network_2 = None
        self._critic_network_2.create_variables()
        if target_critic_network_2:
            target_critic_network_2.create_variables()
        self._target_critic_network_2 = (
            common.maybe_copy_target_network_with_checks(
                self._critic_network_2, target_critic_network_2,
                'TargetCriticNetwork2'))

        if actor_network:
            actor_network.create_variables()
        self._actor_network = actor_network

        policy = actor_policy_ctor(time_step_spec=time_step_spec,
                                   action_spec=action_spec,
                                   actor_network=self._actor_network)

        self._log_alpha = common.create_variable(
            'initial_log_alpha',
            initial_value=initial_log_alpha,
            dtype=tf.float32,
            trainable=True)

        # If target_entropy was not passed, set it to negative of the total number
        # of action dimensions.
        if target_entropy is None:
            flat_action_spec = tf.nest.flatten(action_spec)
            target_entropy = -np.sum([
                np.product(single_spec.shape.as_list())
                for single_spec in flat_action_spec
            ])

        self._target_update_tau = target_update_tau
        self._target_update_period = target_update_period
        self._actor_optimizer = actor_optimizer
        self._critic_optimizer = critic_optimizer
        self._alpha_optimizer = alpha_optimizer
        self._td_errors_loss_fn = td_errors_loss_fn
        self._gamma = gamma
        self._reward_scale_factor = reward_scale_factor
        self._target_entropy = target_entropy
        self._gradient_clipping = gradient_clipping
        self._debug_summaries = debug_summaries
        self._summarize_grads_and_vars = summarize_grads_and_vars
        self._update_target = self._get_target_updater(
            tau=self._target_update_tau, period=self._target_update_period)

        train_sequence_length = 2 if not critic_network.state_spec else None

        super(SacAgent,
              self).__init__(time_step_spec,
                             action_spec,
                             policy=policy,
                             collect_policy=policy,
                             train_sequence_length=train_sequence_length,
                             debug_summaries=debug_summaries,
                             summarize_grads_and_vars=summarize_grads_and_vars,
                             train_step_counter=train_step_counter)
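
Both SAC constructors in this listing register a target updater via `self._get_target_updater(tau=..., period=...)`, which is not shown here. The sketch below is a hedged reconstruction of the soft-update rule it conventionally wraps (new_target = tau * source + (1 - tau) * target, applied every `period` train steps); the standalone function name is illustrative, and `common.soft_variables_update` / `common.Periodically` from `tf_agents.utils.common` are assumed to be available.

from tf_agents.utils import common


def make_target_updater_sketch(source_variables, target_variables,
                               tau=0.005, period=1):
    # Illustrative only: blend source weights into the target-network weights
    # with factor `tau`, gated so the update only fires every `period` steps.
    def update():
        return common.soft_variables_update(
            source_variables, target_variables, tau=tau)
    return common.Periodically(update, period, name='periodic_target_update')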
Ejemplo n.º 27
0
 def testIncrement(self):
     counter = common.create_variable('counter', 0)
     inc_counter = counter.assign_add(1)
     self.evaluate(tf.compat.v1.global_variables_initializer())
     self.assertEqual(self.evaluate(inc_counter), 1)
Ejemplo n.º 28
0
  def __init__(
      self,
      time_step_spec,
      action_spec,
      optimizer = None,
      actor_net = None,
      value_net = None,
      importance_ratio_clipping = 0.0,
      lambda_value = 0.95,
      discount_factor = 0.99,
      entropy_regularization = 0.0,
      policy_l2_reg = 0.0,
      value_function_l2_reg = 0.0,
      shared_vars_l2_reg = 0.0,
      value_pred_loss_coef = 0.5,
      num_epochs = 25,
      use_gae = False,
      use_td_lambda_return = False,
      normalize_rewards = True,
      reward_norm_clipping = 10.0,
      normalize_observations = True,
      log_prob_clipping = 0.0,
      kl_cutoff_factor = 0.0,
      kl_cutoff_coef = 0.0,
      initial_adaptive_kl_beta = 0.0,
      adaptive_kl_target = 0.0,
      adaptive_kl_tolerance = 0.0,
      gradient_clipping = None,
      value_clipping = None,
      check_numerics = False,
      # TODO(b/150244758): Change the default to False once we move
      # clients onto Reverb.
      compute_value_and_advantage_in_train = True,
      update_normalizers_in_train = True,
      debug_summaries = False,
      summarize_grads_and_vars = False,
      train_step_counter = None,
      name = 'AttentionPPOAgent'):
    """Creates a PPO Agent.

    Args:
      time_step_spec: A `TimeStep` spec of the expected time_steps.
      action_spec: A nest of `BoundedTensorSpec` representing the actions.
      optimizer: Optimizer to use for the agent, default to using
        `tf.compat.v1.train.AdamOptimizer`.
      actor_net: A `network.DistributionNetwork` which maps observations to
        action distributions. Commonly, it is set to
        `actor_distribution_network.ActorDistributionNetwork`.
      value_net: A `Network` which returns the value prediction for input
        states, with `call(observation, step_type, network_state)`. Commonly, it
        is set to `value_network.ValueNetwork`.
      importance_ratio_clipping: Epsilon in clipped, surrogate PPO objective.
        For more detail, see explanation at the top of the doc.
      lambda_value: Lambda parameter for TD-lambda computation.
      discount_factor: Discount factor for return computation. Default to `0.99`
        which is the value used for all environments from (Schulman, 2017).
      entropy_regularization: Coefficient for entropy regularization loss term.
        Default to `0.0` because no entropy bonus was used in (Schulman, 2017).
      policy_l2_reg: Coefficient for L2 regularization of unshared actor_net
        weights. Default to `0.0` because no L2 regularization was applied on
        the policy network weights in (Schulman, 2017).
      value_function_l2_reg: Coefficient for l2 regularization of unshared value
        function weights. Default to `0.0` because no L2 regularization was
        applied on the value network weights in (Schulman, 2017).
      shared_vars_l2_reg: Coefficient for l2 regularization of weights shared
        between actor_net and value_net. Default to `0.0` because no L2
        regularization was applied on the policy network or value network
        weights in (Schulman, 2017).
      value_pred_loss_coef: Multiplier for value prediction loss to balance with
        policy gradient loss. Default to `0.5`, which was used for all
        environments in the OpenAI baseline implementation. This parameter is
        irrelevant unless you are sharing part of actor_net and value_net. In
        that case, you would want to tune this coefficient, whose value depends
        on the network architecture of your choice.
      num_epochs: Number of epochs for computing policy updates. (Schulman, 2017)
        sets this to 10 for Mujoco, 15 for Roboschool and 3 for Atari.
      use_gae: If True (default False), uses generalized advantage estimation
        for computing per-timestep advantage. Else, just subtracts value
        predictions from empirical return.
      use_td_lambda_return: If True (default False), uses td_lambda_return for
        training value function; here: `td_lambda_return = gae_advantage +
          value_predictions`. `use_gae` must be set to `True` as well to enable
          TD-lambda returns. If `use_td_lambda_return` is set to True while
          `use_gae` is False, the empirical return will be used and a warning
          will be logged.
      normalize_rewards: If true, keeps moving variance of rewards and
        normalizes incoming rewards. While not mentioned directly in (Schulman,
        2017), reward normalization was implemented in OpenAI baselines and
        (Ilyas et al., 2018) pointed out that it largely improves performance.
        You may refer to Figure 1 of https://arxiv.org/pdf/1811.02553.pdf for a
          comparison with and without reward scaling.
      reward_norm_clipping: Value above and below to clip normalized reward.
        Additional optimization proposed in (Ilyas et al., 2018) set to `5` or
        `10`.
      normalize_observations: If `True`, keeps moving mean and variance of
        observations and normalizes incoming observations. Additional
        optimization proposed in (Ilyas et al., 2018). If true, and the
        observation spec is not tf.float32 (such as Atari), please manually
        convert the observation spec received from the environment to tf.float32
        before creating the networks. Otherwise, the normalized input to the
        network (float32) will have a different dtype than what the network
        expects, resulting in a mismatch error.
        Example usage:
        ```python
        observation_tensor_spec, action_spec, time_step_tensor_spec = (
            spec_utils.get_tensor_specs(env))
        normalized_observation_tensor_spec = tf.nest.map_structure(
            lambda s: tf.TensorSpec(dtype=tf.float32, shape=s.shape, name=s.name),
            observation_tensor_spec)
        actor_net = actor_distribution_network.ActorDistributionNetwork(
            normalized_observation_tensor_spec, ...)
        value_net = value_network.ValueNetwork(
            normalized_observation_tensor_spec, ...)
        # Note that the agent still uses the original time_step_tensor_spec
        # from the environment.
        agent = ppo_clip_agent.PPOClipAgent(
            time_step_tensor_spec, action_spec, actor_net, value_net, ...)
        ```
      log_prob_clipping: +/- value for clipping log probs to prevent inf / NaN
        values.  Default: no clipping.
      kl_cutoff_factor: Only meaningful when `kl_cutoff_coef > 0.0`. A multiplier
        used for calculating the KL cutoff ( = `kl_cutoff_factor *
        adaptive_kl_target`). If policy KL averaged across the batch changes
        more than the cutoff, a squared cutoff loss would be added to the loss
        function.
      kl_cutoff_coef: kl_cutoff_coef and kl_cutoff_factor are additional params
        if one wants to use a KL cutoff loss term in addition to the adaptive KL
        loss term. Default to 0.0 to disable the KL cutoff loss term as this was
        not used in the paper.  kl_cutoff_coef is the coefficient to multiply by
        the KL cutoff loss term, before adding to the total loss function.
      initial_adaptive_kl_beta: Initial value for beta coefficient of adaptive
        KL penalty. This initial value is not important in practice because the
        algorithm quickly adjusts to it. A common default is 1.0.
      adaptive_kl_target: Desired KL target for policy updates. If actual KL is
        far from this target, adaptive_kl_beta will be updated. You should tune
        this for your environment. 0.01 was found to perform well for Mujoco.
      adaptive_kl_tolerance: A tolerance for adaptive_kl_beta. Mean KL above `(1
        + tol) * adaptive_kl_target`, or below `(1 - tol) * adaptive_kl_target`,
        will cause `adaptive_kl_beta` to be updated. `0.5` was chosen
        heuristically in the paper, but the algorithm is not very sensitive to
        it.
      gradient_clipping: Norm length to clip gradients.  Default: no clipping.
      value_clipping: Difference between new and old value predictions are
        clipped to this threshold. Value clipping could be helpful when training
        very deep networks. Default: no clipping.
      check_numerics: If true, adds `tf.debugging.check_numerics` to help find
        NaN / Inf values. For debugging only.
      compute_value_and_advantage_in_train: A bool to indicate where value
        prediction and advantage calculation happen.  If True, both happen in
        agent.train(). If False, value prediction is computed during data
        collection. This argument must be set to `False` if mini batch learning
        is enabled.
      update_normalizers_in_train: A bool to indicate whether normalizers are
        updated as parts of the `train` method. Set to `False` if mini batch
        learning is enabled, or if `train` is called on multiple iterations of
        the same trajectories. In that case, you would need to use `PPOLearner`
        (which updates all the normalizers outside of the agent). This ensures
        that normalizers are updated in the same way as (Schulman, 2017).
      debug_summaries: A bool to gather debug summaries.
      summarize_grads_and_vars: If true, gradient summaries will be written.
      train_step_counter: An optional counter to increment every time the train
        op is run.  Defaults to the global_step.
      name: The name of this agent. All variables in this module will fall under
        that name. Defaults to the class name.

    Raises:
      TypeError: if `actor_net` or `value_net` is not of type
        `tf_agents.networks.Network`.
    """
    if not isinstance(actor_net, network.Network):
      raise TypeError('actor_net must be an instance of a network.Network.')
    if not isinstance(value_net, network.Network):
      raise TypeError('value_net must be an instance of a network.Network.')

    # PPOPolicy validates these, so we skip validation here.
    actor_net.create_variables(time_step_spec.observation)
    value_net.create_variables(time_step_spec.observation)

    tf.Module.__init__(self, name=name)

    self._optimizer = optimizer
    self._actor_net = actor_net
    self._value_net = value_net
    self._importance_ratio_clipping = importance_ratio_clipping
    self._lambda = lambda_value
    self._discount_factor = discount_factor
    self._entropy_regularization = entropy_regularization
    self._policy_l2_reg = policy_l2_reg
    self._value_function_l2_reg = value_function_l2_reg
    self._shared_vars_l2_reg = shared_vars_l2_reg
    self._value_pred_loss_coef = value_pred_loss_coef
    self._num_epochs = num_epochs
    self._use_gae = use_gae
    self._use_td_lambda_return = use_td_lambda_return
    self._reward_norm_clipping = reward_norm_clipping
    self._log_prob_clipping = log_prob_clipping
    self._kl_cutoff_factor = kl_cutoff_factor
    self._kl_cutoff_coef = kl_cutoff_coef
    self._adaptive_kl_target = adaptive_kl_target
    self._adaptive_kl_tolerance = adaptive_kl_tolerance
    self._gradient_clipping = gradient_clipping or 0.0
    self._value_clipping = value_clipping or 0.0
    self._check_numerics = check_numerics
    self._compute_value_and_advantage_in_train = (
        compute_value_and_advantage_in_train)
    self.update_normalizers_in_train = update_normalizers_in_train
    if not isinstance(self._optimizer, tf.keras.optimizers.Optimizer):
      logging.warning(
          'Only tf.keras.optimizers.Optimizers are well supported, got a '
          'non-TF2 optimizer: %s', self._optimizer)

    self._initial_adaptive_kl_beta = initial_adaptive_kl_beta
    if initial_adaptive_kl_beta > 0.0:
      self._adaptive_kl_beta = common.create_variable(
          'adaptive_kl_beta', initial_adaptive_kl_beta, dtype=tf.float32)
    else:
      self._adaptive_kl_beta = None

    self._reward_normalizer = None
    if normalize_rewards:
      self._reward_normalizer = tensor_normalizer.StreamingTensorNormalizer(
          tensor_spec.TensorSpec([], tf.float32), scope='normalize_reward')

    self._observation_normalizer = None
    if normalize_observations:
      self._observation_normalizer = (
          tensor_normalizer.StreamingTensorNormalizer(
              time_step_spec.observation, scope='normalize_observations'))

    self._advantage_normalizer = tensor_normalizer.StreamingTensorNormalizer(
        tensor_spec.TensorSpec([], tf.float32), scope='normalize_advantages')

    policy = greedy_policy.GreedyPolicy(
        attention_ppo_policy.AttentionPPOPolicy(
            time_step_spec=time_step_spec,
            action_spec=action_spec,
            actor_network=actor_net,
            value_network=value_net,
            observation_normalizer=self._observation_normalizer,
            clip=False,
            collect=False))

    collect_policy = attention_ppo_policy.AttentionPPOPolicy(
        time_step_spec=time_step_spec,
        action_spec=action_spec,
        actor_network=actor_net,
        value_network=value_net,
        observation_normalizer=self._observation_normalizer,
        clip=False,
        collect=True,
        compute_value_and_advantage_in_train=(
            self._compute_value_and_advantage_in_train),
    )

    if isinstance(self._actor_net, network.DistributionNetwork):
      # Legacy behavior
      self._action_distribution_spec = self._actor_net.output_spec
    else:
      self._action_distribution_spec = self._actor_net.create_variables(
          time_step_spec.observation)

    # Set training_data_spec to collect_data_spec with augmented policy info,
    # iff return and normalized advantage are saved in preprocess_sequence.
    if self._compute_value_and_advantage_in_train:
      training_data_spec = None
    else:
      training_policy_info = collect_policy.trajectory_spec.policy_info.copy()
      training_policy_info.update({
          'value_prediction':
              collect_policy.trajectory_spec.policy_info['value_prediction'],
          'return':
              tensor_spec.TensorSpec(shape=[], dtype=tf.float32),
          'advantage':
              tensor_spec.TensorSpec(shape=[], dtype=tf.float32),
      })
      training_data_spec = collect_policy.trajectory_spec.replace(
          policy_info=training_policy_info)

    super(ppo_agent.PPOAgent, self).__init__(
        time_step_spec,
        action_spec,
        policy,
        collect_policy,
        train_sequence_length=None,
        training_data_spec=training_data_spec,
        debug_summaries=debug_summaries,
        summarize_grads_and_vars=summarize_grads_and_vars,
        train_step_counter=train_step_counter)

    # This must be built after super() which sets up self.data_context.
    self._collected_as_transition = data_converter.AsTransition(
        self.collect_data_context, squeeze_time_dim=False)

    self._as_trajectory = data_converter.AsTrajectory(
        self.data_context, sequence_length=None)
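
The `adaptive_kl_target` and `adaptive_kl_tolerance` arguments documented above drive the adaptive KL penalty, but the update of `adaptive_kl_beta` happens elsewhere in the agent. The following is a hedged sketch of the tolerance-based adjustment the docstring describes; the function name and the 1.5 scaling factor are assumptions for illustration, not taken from this code.

import tensorflow as tf


def update_adaptive_kl_beta_sketch(adaptive_kl_beta, mean_kl,
                                   adaptive_kl_target=0.01,
                                   adaptive_kl_tolerance=0.5):
    # Raise the penalty coefficient when the observed mean KL overshoots
    # (1 + tol) * target; lower it when KL undershoots (1 - tol) * target.
    if mean_kl > (1.0 + adaptive_kl_tolerance) * adaptive_kl_target:
        adaptive_kl_beta.assign(adaptive_kl_beta * 1.5)
    elif mean_kl < (1.0 - adaptive_kl_tolerance) * adaptive_kl_target:
        adaptive_kl_beta.assign(adaptive_kl_beta / 1.5)
    return adaptive_kl_beta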
Ejemplo n.º 29
0
 def testInitialValueWithShape(self):
     counter = common.create_variable('counter', 1, shape=(2, ))
     self.evaluate(tf.compat.v1.global_variables_initializer())
     self.assertAllEqual(self.evaluate(counter), [1, 1])
Ejemplo n.º 30
0
  def testTrainWithLagrange(self, use_lagrange_cql_alpha,
                            use_variable_for_cql_alpha,
                            log_cql_alpha_clipping,
                            expected_cql_alpha_step_one,
                            expected_cql_alpha_step_two,
                            expected_cql_loss_step_one,
                            expected_cql_loss_step_two):
    if use_variable_for_cql_alpha:
      cql_alpha = tf.Variable(5.0)
      cql_alpha_var = cql_alpha  # Getting around type checking.
    else:
      cql_alpha = 5.0
    cql_alpha_learning_rate = 0.5
    cql_tau = 10
    num_cql_samples = 5

    actor_net = actor_distribution_network.ActorDistributionNetwork(
        self._obs_spec, self._action_spec, fc_layer_params=None)
    critic_net = critic_network.CriticNetwork(
        (self._obs_spec, self._action_spec),
        observation_fc_layer_params=(16,),
        action_fc_layer_params=(16,),
        joint_fc_layer_params=(16,),
        kernel_initializer='glorot_uniform',
        last_kernel_initializer='glorot_uniform')

    counter = common.create_variable('test_train_counter')
    optimizer_fn = tf.compat.v1.train.AdamOptimizer
    agent = cql_sac_agent.CqlSacAgent(
        self._time_step_spec,
        self._action_spec,
        critic_network=critic_net,
        actor_network=actor_net,
        actor_optimizer=optimizer_fn(1e-3),
        critic_optimizer=optimizer_fn(1e-3),
        alpha_optimizer=optimizer_fn(1e-3),
        cql_alpha=cql_alpha,
        num_cql_samples=num_cql_samples,
        include_critic_entropy_term=False,
        use_lagrange_cql_alpha=use_lagrange_cql_alpha,
        cql_alpha_learning_rate=cql_alpha_learning_rate,
        cql_tau=cql_tau,
        random_seed=self._random_seed,
        log_cql_alpha_clipping=log_cql_alpha_clipping,
        train_step_counter=counter)

    batch_size = 5
    observations = tf.constant(
        [[[1, 2], [3, 4]]] * batch_size, dtype=tf.float32)
    actions = tf.constant([[[0], [1]]] * batch_size, dtype=tf.float32)
    time_steps = ts.TimeStep(
        step_type=tf.constant([[1] * 2] * batch_size, dtype=tf.int32),
        reward=tf.constant([[1] * 2] * batch_size, dtype=tf.float32),
        discount=tf.constant([[1] * 2] * batch_size, dtype=tf.float32),
        observation=observations)

    experience = trajectory.Trajectory(time_steps.step_type, observations,
                                       actions, (), time_steps.step_type,
                                       time_steps.reward, time_steps.discount)

    # Force variable creation.
    agent.policy.variables()

    if not tf.executing_eagerly():
      # Build the train op first so that optimizer variables are created and
      # can be initialized.
      loss_op = agent.train(experience)
      with self.cached_session() as sess:
        common.initialize_uninitialized_variables(sess)
      self.assertEqual(self.evaluate(counter), 0)
      self.evaluate(loss_op)
      self.assertEqual(self.evaluate(counter), 1)
    else:
      # Training step one.
      self.assertEqual(self.evaluate(counter), 0)
      loss = self.evaluate(agent.train(experience))
      self.assertEqual(self.evaluate(counter), 1)
      self.assertAllClose(loss.extra.cql_loss, expected_cql_loss_step_one)
      self.assertAllClose(loss.extra.cql_alpha, expected_cql_alpha_step_one)
      if use_lagrange_cql_alpha:
        self.assertGreater(loss.extra.cql_alpha_loss, 0)
      else:
        self.assertEqual(loss.extra.cql_alpha_loss, 0)

      # Training step two.
      if use_variable_for_cql_alpha:
        cql_alpha_var.assign_add(1)
      loss = self.evaluate(agent.train(experience))
      self.assertEqual(self.evaluate(counter), 2)
      self.assertAllClose(loss.extra.cql_loss, expected_cql_loss_step_two)
      self.assertAllClose(loss.extra.cql_alpha, expected_cql_alpha_step_two)
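
The Lagrange branch of this test (`use_lagrange_cql_alpha=True`) expects `cql_alpha` to adjust itself against the `cql_tau` budget and `cql_alpha_loss` to be non-zero. The sketch below shows one common Lagrangian-style dual update, not necessarily the exact rule inside `CqlSacAgent`; the function name, the `(min, max)` form of `log_cql_alpha_clipping`, and the plain gradient step are assumptions for illustration.

import tensorflow as tf


def lagrange_cql_alpha_step_sketch(log_cql_alpha, cql_loss, cql_tau,
                                   learning_rate=0.5,
                                   log_cql_alpha_clipping=None):
    # log_cql_alpha is a trainable tf.Variable; cql_alpha = exp(log_cql_alpha)
    # acts as a dual variable that grows when the CQL regularizer exceeds the
    # cql_tau budget and shrinks when it stays below it.
    with tf.GradientTape() as tape:
        cql_alpha = tf.exp(log_cql_alpha)
        cql_alpha_loss = -cql_alpha * (tf.stop_gradient(cql_loss) - cql_tau)
    grad = tape.gradient(cql_alpha_loss, log_cql_alpha)
    log_cql_alpha.assign_sub(learning_rate * grad)
    if log_cql_alpha_clipping is not None:
        low, high = log_cql_alpha_clipping
        log_cql_alpha.assign(tf.clip_by_value(log_cql_alpha, low, high))
    return cql_alpha_loss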