Example no. 1
 def action_spec(self):
     return tensor_spec.from_spec(
         array_spec.BoundedArraySpec((), np.int32, minimum=0, maximum=2))
Example no. 2
    def __init__(self,
                 time_step_spec: ts.TimeStep,
                 action_spec: types.NestedTensorSpec,
                 q_network: network.Network,
                 emit_log_probability: bool = False,
                 observation_and_action_constraint_splitter: Optional[
                     types.Splitter] = None,
                 validate_action_spec_and_network: bool = True,
                 name: Optional[Text] = None):
        """Builds a Q-Policy given a q_network.

    Args:
      time_step_spec: A `TimeStep` spec of the expected time_steps.
      action_spec: A nest of BoundedTensorSpec representing the actions.
      q_network: An instance of a `tf_agents.network.Network`,
        callable via `network(observation, step_type) -> (output, final_state)`.
      emit_log_probability: Whether to emit log-probs in info of `PolicyStep`.
      observation_and_action_constraint_splitter: A function used to process
        observations with action constraints. These constraints can indicate,
        for example, a mask of valid/invalid actions for a given state of the
        environment.
        The function takes in a full observation and returns a tuple consisting
        of 1) the part of the observation intended as input to the network and
        2) the constraint. An example
        `observation_and_action_constraint_splitter` could be as simple as:
        ```
        def observation_and_action_constraint_splitter(observation):
          return observation['network_input'], observation['constraint']
        ```
        *Note*: when using `observation_and_action_constraint_splitter`, make
        sure the provided `q_network` is compatible with the network-specific
        half of the output of the `observation_and_action_constraint_splitter`.
        In particular, `observation_and_action_constraint_splitter` will be
        called on the observation before passing to the network.
        If `observation_and_action_constraint_splitter` is None, action
        constraints are not applied.
      validate_action_spec_and_network: If `True` (default),
        action_spec is checked to make sure it is a single scalar spec
        with a minimum of zero.  Also validates that the network's output
        matches the spec.
      name: The name of this policy. All variables in this module will fall
        under that name. Defaults to the class name.

    Raises:
      ValueError: If `q_network.action_spec` exists and is not compatible with
        `action_spec`.
      NotImplementedError: If `action_spec` contains more than one
        `BoundedTensorSpec`.
    """
        action_spec = tensor_spec.from_spec(action_spec)
        time_step_spec = tensor_spec.from_spec(time_step_spec)

        network_action_spec = getattr(q_network, 'action_spec', None)

        if network_action_spec is not None:
            action_spec = cast(tf.TypeSpec, action_spec)
            if not action_spec.is_compatible_with(network_action_spec):
                raise ValueError(
                    'action_spec must be compatible with q_network.action_spec; '
                    'instead got action_spec=%s, q_network.action_spec=%s' %
                    (action_spec, network_action_spec))

        flat_action_spec = tf.nest.flatten(action_spec)
        if len(flat_action_spec) > 1:
            raise ValueError(
                'Only scalar actions are supported now, but action spec is: {}'
                .format(action_spec))
        if validate_action_spec_and_network:
            spec = flat_action_spec[0]
            if spec.shape.rank > 0:
                raise ValueError(
                    'Only scalar actions are supported now, but action spec is: {}'
                    .format(action_spec))

            if spec.minimum != 0:
                raise ValueError(
                    'Action specs should have minimum of 0, but saw: {0}'.
                    format(spec))

            num_actions = spec.maximum - spec.minimum + 1
            network_utils.check_single_floating_network_output(
                q_network.create_variables(), (num_actions, ), str(q_network))

        # We need to maintain the flat action spec for dtype, shape and range.
        self._flat_action_spec = flat_action_spec[0]

        self._q_network = q_network
        super(QPolicy,
              self).__init__(time_step_spec,
                             action_spec,
                             policy_state_spec=q_network.state_spec,
                             clip=False,
                             emit_log_probability=emit_log_probability,
                             observation_and_action_constraint_splitter=(
                                 observation_and_action_constraint_splitter),
                             name=name)
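For context, a minimal construction sketch (not from the source; it assumes a `QNetwork` from `tf_agents.networks.q_network`, the `q_policy` module import, and tensor specs matching the environment):

q_net = q_network.QNetwork(
    input_tensor_spec=time_step_spec.observation,
    action_spec=action_spec)
policy = q_policy.QPolicy(time_step_spec, action_spec, q_network=q_net)
action_step = policy.action(time_step)  # PolicyStep(action, state, info)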
Example no. 3
    def __init__(
            self,
            root_dir,
            env_name,
            num_iterations=200,
            max_episode_frames=108000,  # ALE frames
            terminal_on_life_loss=False,
            conv_layer_params=((32, (8, 8), 4), (64, (4, 4), 2), (64, (3, 3),
                                                                  1)),
            fc_layer_params=(512, ),
            # Params for collect
            initial_collect_steps=80000,  # ALE frames
            epsilon_greedy=0.01,
            epsilon_decay_period=1000000,  # ALE frames
            replay_buffer_capacity=1000000,
            # Params for train
            train_steps_per_iteration=1000000,  # ALE frames
            update_period=16,  # ALE frames
            target_update_tau=1.0,
            target_update_period=32000,  # ALE frames
            batch_size=32,
            learning_rate=2.5e-4,
            n_step_update=2,
            gamma=0.99,
            reward_scale_factor=1.0,
            gradient_clipping=None,
            # Params for eval
            do_eval=True,
            eval_steps_per_iteration=500000,  # ALE frames
            eval_epsilon_greedy=0.001,
            # Params for checkpoints, summaries, and logging
            log_interval=1000,
            summary_interval=1000,
            summaries_flush_secs=10,
            debug_summaries=True,
            summarize_grads_and_vars=True,
            eval_metrics_callback=None):
        """A simple Atari train and eval for DQN.

    Args:
      root_dir: Directory to write log files to.
      env_name: Fully-qualified name of the Atari environment (e.g. Pong-v0).
      num_iterations: Number of train/eval iterations to run.
      max_episode_frames: Maximum length of a single episode, in ALE frames.
      terminal_on_life_loss: Whether to simulate an episode termination when a
        life is lost.
      conv_layer_params: Params for convolutional layers of QNetwork.
      fc_layer_params: Params for fully connected layers of QNetwork.
      initial_collect_steps: Number of ALE frames to process before beginning to
        train. Since this is in ALE frames, there will be
        initial_collect_steps/4 items in the replay buffer when training starts.
      epsilon_greedy: Final epsilon value to decay to for training.
      epsilon_decay_period: Period over which to decay epsilon, from 1.0 to
        epsilon_greedy (defined above).
      replay_buffer_capacity: Maximum number of items to store in the replay
        buffer.
      train_steps_per_iteration: Number of ALE frames to run through for each
        iteration of training.
      update_period: Run a train operation every update_period ALE frames.
      target_update_tau: Coefficient for soft target network updates (1.0 ==
        hard updates).
      target_update_period: Period, in ALE frames, to copy the live network to
        the target network.
      batch_size: Number of frames to include in each training batch.
      learning_rate: RMS optimizer learning rate.
      n_step_update: The number of steps to consider when computing TD error and
        TD loss. Applies standard single-step updates when set to 1.
      gamma: Discount for future rewards.
      reward_scale_factor: Scaling factor for rewards.
      gradient_clipping: Norm length to clip gradients.
      do_eval: If True, run an eval every iteration. If False, skip eval.
      eval_steps_per_iteration: Number of ALE frames to run through for each
        iteration of evaluation.
      eval_epsilon_greedy: Epsilon value to use for the evaluation policy (0 ==
        totally greedy policy).
      log_interval: Log stats to the terminal every log_interval training
        steps.
      summary_interval: Write TF summaries every summary_interval training
        steps.
      summaries_flush_secs: Flush summaries to disk every summaries_flush_secs
        seconds.
      debug_summaries: If True, write additional summaries for debugging (see
        dqn_agent for which summaries are written).
      summarize_grads_and_vars: Include gradients in summaries.
      eval_metrics_callback: A callback function that takes (metric_dict,
        global_step) as parameters. Called after every eval with the results of
        the evaluation.
    """
        self._update_period = update_period / ATARI_FRAME_SKIP
        self._train_steps_per_iteration = (train_steps_per_iteration /
                                           ATARI_FRAME_SKIP)
        self._do_eval = do_eval
        self._eval_steps_per_iteration = eval_steps_per_iteration / ATARI_FRAME_SKIP
        self._eval_epsilon_greedy = eval_epsilon_greedy
        self._initial_collect_steps = initial_collect_steps / ATARI_FRAME_SKIP
        self._summary_interval = summary_interval
        self._num_iterations = num_iterations
        self._log_interval = log_interval
        self._eval_metrics_callback = eval_metrics_callback

        with gin.unlock_config():
            gin.bind_parameter(('tf_agents.environments.atari_preprocessing.'
                                'AtariPreprocessing.terminal_on_life_loss'),
                               terminal_on_life_loss)

        root_dir = os.path.expanduser(root_dir)
        train_dir = os.path.join(root_dir, 'train')
        eval_dir = os.path.join(root_dir, 'eval')

        train_summary_writer = tf.compat.v2.summary.create_file_writer(
            train_dir, flush_millis=summaries_flush_secs * 1000)
        train_summary_writer.set_as_default()
        self._train_summary_writer = train_summary_writer

        self._eval_summary_writer = None
        if self._do_eval:
            self._eval_summary_writer = tf.compat.v2.summary.create_file_writer(
                eval_dir, flush_millis=summaries_flush_secs * 1000)
            self._eval_metrics = [
                py_metrics.AverageReturnMetric(name='PhaseAverageReturn',
                                               buffer_size=np.inf),
                py_metrics.AverageEpisodeLengthMetric(
                    name='PhaseAverageEpisodeLength', buffer_size=np.inf),
            ]

        self._global_step = tf.compat.v1.train.get_or_create_global_step()
        with tf.compat.v2.summary.record_if(lambda: tf.math.equal(
                self._global_step % self._summary_interval, 0)):
            self._env = suite_atari.load(
                env_name,
                max_episode_steps=max_episode_frames / ATARI_FRAME_SKIP,
                gym_env_wrappers=suite_atari.
                DEFAULT_ATARI_GYM_WRAPPERS_WITH_STACKING)
            self._env = batched_py_environment.BatchedPyEnvironment(
                [self._env])

            observation_spec = tensor_spec.from_spec(
                self._env.observation_spec())
            time_step_spec = ts.time_step_spec(observation_spec)
            action_spec = tensor_spec.from_spec(self._env.action_spec())

            with tf.device('/cpu:0'):
                epsilon = tf.compat.v1.train.polynomial_decay(
                    1.0,
                    self._global_step,
                    epsilon_decay_period / ATARI_FRAME_SKIP /
                    self._update_period,
                    end_learning_rate=epsilon_greedy)

            with tf.device('/gpu:0'):
                optimizer = tf.compat.v1.train.RMSPropOptimizer(
                    learning_rate=learning_rate,
                    decay=0.95,
                    momentum=0.0,
                    epsilon=0.00001,
                    centered=True)
                categorical_q_net = AtariCategoricalQNetwork(
                    observation_spec,
                    action_spec,
                    conv_layer_params=conv_layer_params,
                    fc_layer_params=fc_layer_params)
                agent = categorical_dqn_agent.CategoricalDqnAgent(
                    time_step_spec,
                    action_spec,
                    categorical_q_network=categorical_q_net,
                    optimizer=optimizer,
                    epsilon_greedy=epsilon,
                    n_step_update=n_step_update,
                    target_update_tau=target_update_tau,
                    target_update_period=(target_update_period /
                                          ATARI_FRAME_SKIP /
                                          self._update_period),
                    gamma=gamma,
                    reward_scale_factor=reward_scale_factor,
                    gradient_clipping=gradient_clipping,
                    debug_summaries=debug_summaries,
                    summarize_grads_and_vars=summarize_grads_and_vars,
                    train_step_counter=self._global_step)

                self._collect_policy = py_tf_policy.PyTFPolicy(
                    agent.collect_policy)

                if self._do_eval:
                    self._eval_policy = py_tf_policy.PyTFPolicy(
                        epsilon_greedy_policy.EpsilonGreedyPolicy(
                            policy=agent.policy,
                            epsilon=self._eval_epsilon_greedy))

                py_observation_spec = self._env.observation_spec()
                py_time_step_spec = ts.time_step_spec(py_observation_spec)
                py_action_spec = policy_step.PolicyStep(
                    self._env.action_spec())
                data_spec = trajectory.from_transition(py_time_step_spec,
                                                       py_action_spec,
                                                       py_time_step_spec)
                self._replay_buffer = py_hashed_replay_buffer.PyHashedReplayBuffer(
                    data_spec=data_spec, capacity=replay_buffer_capacity)

            with tf.device('/cpu:0'):
                ds = self._replay_buffer.as_dataset(
                    sample_batch_size=batch_size, num_steps=n_step_update + 1)
                ds = ds.prefetch(4)
                ds = ds.apply(
                    tf.data.experimental.prefetch_to_device('/gpu:0'))

            with tf.device('/gpu:0'):
                self._ds_itr = tf.compat.v1.data.make_one_shot_iterator(ds)
                experience = self._ds_itr.get_next()
                self._train_op = agent.train(experience)

                self._env_steps_metric = py_metrics.EnvironmentSteps()
                self._step_metrics = [
                    py_metrics.NumberOfEpisodes(),
                    self._env_steps_metric,
                ]
                self._train_metrics = self._step_metrics + [
                    py_metrics.AverageReturnMetric(buffer_size=10),
                    py_metrics.AverageEpisodeLengthMetric(buffer_size=10),
                ]
                # The _train_phase_metrics average over an entire train iteration,
                # rather than the rolling average of the last 10 episodes.
                self._train_phase_metrics = [
                    py_metrics.AverageReturnMetric(name='PhaseAverageReturn',
                                                   buffer_size=np.inf),
                    py_metrics.AverageEpisodeLengthMetric(
                        name='PhaseAverageEpisodeLength', buffer_size=np.inf),
                ]
                self._iteration_metric = py_metrics.CounterMetric(
                    name='Iteration')

                # Summaries written from python should run every time they are
                # generated.
                with tf.compat.v2.summary.record_if(True):
                    self._steps_per_second_ph = tf.compat.v1.placeholder(
                        tf.float32, shape=(), name='steps_per_sec_ph')
                    self._steps_per_second_summary = tf.compat.v2.summary.scalar(
                        name='global_steps_per_sec',
                        data=self._steps_per_second_ph,
                        step=self._global_step)

                    for metric in self._train_metrics:
                        metric.tf_summaries(train_step=self._global_step,
                                            step_metrics=self._step_metrics)

                    for metric in self._train_phase_metrics:
                        metric.tf_summaries(
                            train_step=self._global_step,
                            step_metrics=(self._iteration_metric, ))
                    self._iteration_metric.tf_summaries(
                        train_step=self._global_step)

                    if self._do_eval:
                        with self._eval_summary_writer.as_default():
                            for metric in self._eval_metrics:
                                metric.tf_summaries(
                                    train_step=self._global_step,
                                    step_metrics=(self._iteration_metric, ))

                self._train_checkpointer = common.Checkpointer(
                    ckpt_dir=train_dir,
                    agent=agent,
                    global_step=self._global_step,
                    optimizer=optimizer,
                    metrics=metric_utils.MetricsGroup(
                        self._train_metrics + self._train_phase_metrics +
                        [self._iteration_metric], 'train_metrics'))
                self._policy_checkpointer = common.Checkpointer(
                    ckpt_dir=os.path.join(train_dir, 'policy'),
                    policy=agent.policy,
                    global_step=self._global_step)
                self._rb_checkpointer = common.Checkpointer(
                    ckpt_dir=os.path.join(train_dir, 'replay_buffer'),
                    max_to_keep=1,
                    replay_buffer=self._replay_buffer)

                self._init_agent_op = agent.initialize()
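As a quick check of the frame-based arguments used above, here is how they convert to agent steps (a sketch assuming the conventional ATARI_FRAME_SKIP = 4):

ATARI_FRAME_SKIP = 4                                        # assumed frame skip
update_period_steps = 16 // ATARI_FRAME_SKIP                # a train op every 4 agent steps
initial_collect_items = 80000 // ATARI_FRAME_SKIP           # 20000 items in the replay buffer
target_update_ops = 32000 // ATARI_FRAME_SKIP // update_period_steps  # target copy every 2000 train ops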
Example no. 4
def train_agent(iterations, modeldir, logdir, policydir):
    """Train and convert the model using TF Agents."""

    train_py_env = planestrike_py_environment.PlaneStrikePyEnvironment(
        board_size=BOARD_SIZE, discount=DISCOUNT, max_steps=BOARD_SIZE**2)
    eval_py_env = planestrike_py_environment.PlaneStrikePyEnvironment(
        board_size=BOARD_SIZE, discount=DISCOUNT, max_steps=BOARD_SIZE**2)

    train_env = tf_py_environment.TFPyEnvironment(train_py_env)
    eval_env = tf_py_environment.TFPyEnvironment(eval_py_env)

    # Alternatively you could use ActorDistributionNetwork as actor_net
    actor_net = tfa.networks.Sequential(
        [
            tfa.keras_layers.InnerReshape([BOARD_SIZE, BOARD_SIZE],
                                          [BOARD_SIZE**2]),
            tf.keras.layers.Dense(FC_LAYER_PARAMS, activation='relu'),
            tf.keras.layers.Dense(BOARD_SIZE**2),
            tf.keras.layers.Lambda(
                lambda t: tfp.distributions.Categorical(logits=t)),
        ],
        input_spec=train_py_env.observation_spec())

    optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE)

    train_step_counter = tf.Variable(0)

    tf_agent = reinforce_agent.ReinforceAgent(
        train_env.time_step_spec(),
        train_env.action_spec(),
        actor_network=actor_net,
        optimizer=optimizer,
        normalize_returns=True,
        train_step_counter=train_step_counter)

    tf_agent.initialize()

    eval_policy = tf_agent.policy
    collect_policy = tf_agent.collect_policy

    tf_policy_saver = policy_saver.PolicySaver(collect_policy)

    # Use reverb as replay buffer
    replay_buffer_signature = tensor_spec.from_spec(tf_agent.collect_data_spec)
    table = reverb.Table(
        REPLAY_BUFFER_TABLE_NAME,
        max_size=REPLAY_BUFFER_CAPACITY,
        sampler=reverb.selectors.Uniform(),
        remover=reverb.selectors.Fifo(),
        rate_limiter=reverb.rate_limiters.MinSize(1),
        signature=replay_buffer_signature
    )  # specify signature here for validation at insertion time

    reverb_server = reverb.Server([table])

    replay_buffer = reverb_replay_buffer.ReverbReplayBuffer(
        tf_agent.collect_data_spec,
        sequence_length=None,
        table_name=REPLAY_BUFFER_TABLE_NAME,
        local_server=reverb_server)

    replay_buffer_observer = reverb_utils.ReverbAddEpisodeObserver(
        replay_buffer.py_client, REPLAY_BUFFER_TABLE_NAME,
        REPLAY_BUFFER_CAPACITY)

    # Optimize by wrapping some of the code in a graph using TF function.
    tf_agent.train = common.function(tf_agent.train)

    # Evaluate the agent's policy once before training.
    avg_return = compute_avg_return_and_steps(eval_env, tf_agent.policy,
                                              NUM_EVAL_EPISODES)

    summary_writer = tf.summary.create_file_writer(logdir)

    for i in range(iterations):
        # Collect a few episodes using collect_policy and save to the replay buffer.
        collect_episode(train_py_env, collect_policy,
                        COLLECT_EPISODES_PER_ITERATION, replay_buffer_observer)

        # Use data from the buffer and update the agent's network.
        iterator = iter(replay_buffer.as_dataset(sample_batch_size=1))
        trajectories, _ = next(iterator)
        tf_agent.train(experience=trajectories)
        replay_buffer.clear()

        logger = tf.get_logger()
        if i % EVAL_INTERVAL == 0:
            avg_return, avg_episode_length = compute_avg_return_and_steps(
                eval_env, eval_policy, NUM_EVAL_EPISODES)
            with summary_writer.as_default():
                tf.summary.scalar('Average return', avg_return, step=i)
                tf.summary.scalar('Average episode length',
                                  avg_episode_length,
                                  step=i)
                summary_writer.flush()
            logger.info(
                'iteration = {0}: Average Return = {1}, Average Episode Length = {2}'
                .format(i, avg_return, avg_episode_length))

    summary_writer.close()

    tf_policy_saver.save(policydir)
    # Convert to tflite model
    converter = tf.lite.TFLiteConverter.from_saved_model(
        policydir, signature_keys=['action'])
    converter.target_spec.supported_ops = [
        tf.lite.OpsSet.TFLITE_BUILTINS,  # enable TensorFlow Lite ops.
        tf.lite.OpsSet.SELECT_TF_OPS  # enable TensorFlow ops.
    ]
    tflite_policy = converter.convert()
    with open(os.path.join(modeldir, 'planestrike_tf_agents.tflite'),
              'wb') as f:
        f.write(tflite_policy)
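A small, optional sanity check of the exported model (a sketch; it assumes the file written above and that the converter kept the 'action' signature):

interpreter = tf.lite.Interpreter(
    model_path=os.path.join(modeldir, 'planestrike_tf_agents.tflite'))
interpreter.allocate_tensors()
print(interpreter.get_signature_list())  # should list the 'action' signature and its inputs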
Example no. 5
 def testFromStringSpec(self):
     spec = tensor_spec.from_spec(array_spec.ArraySpec([1], np.string_))
     self.assertEqual(tf.string, spec.dtype)
Example no. 6
def main(_):
    logging.set_verbosity(logging.INFO)

    # Wait for the collect policy to become available, then load it.
    collect_policy_dir = os.path.join(FLAGS.root_dir,
                                      learner.POLICY_SAVED_MODEL_DIR,
                                      learner.COLLECT_POLICY_SAVED_MODEL_DIR)
    collect_policy = train_utils.wait_for_policy(collect_policy_dir,
                                                 load_specs_from_pbtxt=True)

    samples_per_insert = FLAGS.samples_per_insert
    min_table_size_before_sampling = FLAGS.min_table_size_before_sampling

    # Create the signature for the variable container holding the policy weights.
    train_step = train_utils.create_train_step()
    variables = {
        reverb_variable_container.POLICY_KEY: collect_policy.variables(),
        reverb_variable_container.TRAIN_STEP_KEY: train_step
    }
    variable_container_signature = tf.nest.map_structure(
        lambda variable: tf.TensorSpec(variable.shape, dtype=variable.dtype),
        variables)
    logging.info('Signature of variables: \n%s', variable_container_signature)

    # Create the signature for the replay buffer holding observed experience.
    replay_buffer_signature = tensor_spec.from_spec(
        collect_policy.collect_data_spec)
    replay_buffer_signature = tf.nest.map_structure(
        lambda s: tf.TensorSpec((None, ) + s.shape, s.dtype, s.name),
        replay_buffer_signature)
    logging.info('Signature of experience: \n%s', replay_buffer_signature)

    if samples_per_insert is not None:
        # Use SamplesPerInsertRatio limiter
        samples_per_insert_tolerance = (_SAMPLES_PER_INSERT_TOLERANCE_RATIO *
                                        samples_per_insert)
        error_buffer = min_table_size_before_sampling * samples_per_insert_tolerance

        experience_rate_limiter = reverb.rate_limiters.SampleToInsertRatio(
            min_size_to_sample=min_table_size_before_sampling,
            samples_per_insert=samples_per_insert,
            error_buffer=error_buffer)
    else:
        # Use MinSize limiter
        experience_rate_limiter = reverb.rate_limiters.MinSize(
            min_table_size_before_sampling)

    # Create and start the replay buffer and variable container server.
    server = reverb.Server(
        tables=[
            reverb.Table(  # Replay buffer storing experience.
                name=reverb_replay_buffer.DEFAULT_TABLE,
                sampler=reverb.selectors.Uniform(),
                remover=reverb.selectors.Fifo(),
                rate_limiter=experience_rate_limiter,
                max_size=FLAGS.replay_buffer_capacity,
                max_times_sampled=0,
                signature=replay_buffer_signature,
            ),
            reverb.Table(  # Variable container storing policy parameters.
                name=reverb_variable_container.DEFAULT_TABLE,
                sampler=reverb.selectors.Uniform(),
                remover=reverb.selectors.Fifo(),
                rate_limiter=reverb.rate_limiters.MinSize(1),
                max_size=1,
                max_times_sampled=0,
                signature=variable_container_signature,
            ),
        ],
        port=FLAGS.port)
    server.wait()
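Since `server.wait()` blocks, a quick way to verify the server from a separate process is a Reverb client (a sketch; assumes the server is reachable on localhost at FLAGS.port):

client = reverb.Client(f'localhost:{FLAGS.port}')
print(client.server_info())  # should report both tables with their signatures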
Example no. 7
 def observation_spec(self):
     return tensor_spec.from_spec(self._envs[0].tf_env.observation_spec())
Example no. 8
initial_collect_steps = 100  # @param {type:"integer"}
collect_steps_per_iteration = 1  # @param {type:"integer"}
replay_buffer_max_length = 100000  # @param {type:"integer"}

batch_size = 64  # @param {type:"integer"}
learning_rate = 1e-3  # @param {type:"number"}
log_interval = 5  # @param {type:"integer"}

num_eval_episodes = 10  # @param {type:"integer"}
eval_interval = 5  # @param {type:"integer"}

# Consider layers that go big -> small -> big, e.g.
fc_layer_params = (1024, 256, 64, 256, 1024)
#fc_layer_params = (100, 50)
# Maybe change env -> tf_env
action_tensor_spec = tensor_spec.from_spec(train_env.action_spec())
num_actions = action_tensor_spec.maximum - action_tensor_spec.minimum + 1

# Define a helper function to create Dense layers configured with the right
# activation and kernel initializer.
def dense_layer(num_units):
  return tf.keras.layers.Dense(
      num_units,
      activation=tf.keras.activations.relu,
      kernel_initializer=tf.keras.initializers.VarianceScaling(
          scale=2.0, mode='fan_in', distribution='truncated_normal'))

# QNetwork consists of a sequence of Dense layers followed by a dense layer
# with `num_actions` units to generate one q_value per available action as
# its output.
flatten_layer = tf.keras.layers.Flatten()
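A possible completion of the QNetwork described above, following the standard TF-Agents DQN tutorial pattern (a sketch; `sequential` is assumed to be `tf_agents.networks.sequential`):

dense_layers = [dense_layer(num_units) for num_units in fc_layer_params]
q_values_layer = tf.keras.layers.Dense(
    num_actions,
    activation=None,
    kernel_initializer=tf.keras.initializers.RandomUniform(
        minval=-0.03, maxval=0.03),
    bias_initializer=tf.keras.initializers.Constant(-0.2))
q_net = sequential.Sequential([flatten_layer] + dense_layers + [q_values_layer])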
Example no. 9
    def __init__(self,
                 time_step_spec: ts.TimeStep,
                 action_spec: types.NestedTensorSpec,
                 policy_state_spec: types.NestedTensorSpec = (),
                 info_spec: types.NestedTensorSpec = (),
                 clip: bool = True,
                 emit_log_probability: bool = False,
                 automatic_state_reset: bool = True,
                 observation_and_action_constraint_splitter: Optional[
                     types.Splitter] = None,
                 validate_args: bool = True,
                 name: Optional[Text] = None):
        """Initialization of TFPolicy class.

    Args:
      time_step_spec: A `TimeStep` spec of the expected time_steps. Usually
        provided by the user to the subclass.
      action_spec: A nest of BoundedTensorSpec representing the actions. Usually
        provided by the user to the subclass.
      policy_state_spec: A nest of TensorSpec representing the policy_state.
        Provided by the subclass, not directly by the user.
      info_spec: A nest of TensorSpec representing the policy info. Provided by
        the subclass, not directly by the user.
      clip: Whether to clip actions to spec before returning them.  Default
        True. Most policy-based algorithms (PCL, PPO, REINFORCE) use unclipped
        continuous actions for training.
      emit_log_probability: Emit log-probabilities of actions, if supported. If
        True, policy_step.info will have CommonFields.LOG_PROBABILITY set.
        Please consult utility methods provided in policy_step for setting and
        retrieving these. When working with custom policies, either provide a
        dictionary info_spec or a namedtuple with the field 'log_probability'.
      automatic_state_reset:  If `True`, then `get_initial_policy_state` is used
        to clear state in `action()` and `distribution()` for time steps
        where `time_step.is_first()`.
      observation_and_action_constraint_splitter: A function used to process
        observations with action constraints. These constraints can indicate,
        for example, a mask of valid/invalid actions for a given state of the
        environment. The function takes in a full observation and returns a
        tuple consisting of 1) the part of the observation intended as input to
        the network and 2) the constraint. An example
        `observation_and_action_constraint_splitter` could be as simple as:
        ```
        def observation_and_action_constraint_splitter(observation):
          return observation['network_input'], observation['constraint']
        ```
        *Note*: when using `observation_and_action_constraint_splitter`, make
          sure the provided `q_network` is compatible with the network-specific
          half of the output of the
          `observation_and_action_constraint_splitter`. In particular,
          `observation_and_action_constraint_splitter` will be called on the
          observation before passing to the network. If
          `observation_and_action_constraint_splitter` is None, action
          constraints are not applied.
      validate_args: Python bool.  Whether to verify inputs to, and outputs of,
        functions like `action` and `distribution` against spec structures,
        dtypes, and shapes.

        Research code may prefer to set this value to `False` to allow iterating
        on input and output structures without being hamstrung by overly
        rigid checking (at the cost of harder-to-debug errors).

        See also `TFAgent.validate_args`.
      name: A name for this module. Defaults to the class name.
    """
        super(TFPolicy, self).__init__(name=name)
        common.check_tf1_allowed()
        common.tf_agents_gauge.get_cell('TFAPolicy').set(True)
        common.assert_members_are_not_overridden(base_cls=TFPolicy,
                                                 instance=self)
        if not isinstance(time_step_spec, ts.TimeStep):
            raise ValueError(
                'The `time_step_spec` must be an instance of `TimeStep`, but is `{}`.'
                .format(type(time_step_spec)))

        self._time_step_spec = tensor_spec.from_spec(time_step_spec)
        self._action_spec = tensor_spec.from_spec(action_spec)
        self._policy_state_spec = tensor_spec.from_spec(policy_state_spec)
        self._emit_log_probability = emit_log_probability
        self._validate_args = validate_args

        if emit_log_probability:
            log_probability_spec = tensor_spec.BoundedTensorSpec(
                shape=(),
                dtype=tf.float32,
                maximum=0,
                minimum=-float('inf'),
                name='log_probability')
            log_probability_spec = tf.nest.map_structure(
                lambda _: log_probability_spec, action_spec)
            info_spec = policy_step.set_log_probability(
                info_spec, log_probability_spec)  # pytype: disable=wrong-arg-types

        self._info_spec = tensor_spec.from_spec(info_spec)
        self._setup_specs()
        self._clip = clip
        self._action_fn = common.function_in_tf1()(self._action)
        self._automatic_state_reset = automatic_state_reset
        self._observation_and_action_constraint_splitter = (
            observation_and_action_constraint_splitter)
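To make the contract concrete, here is a minimal subclass sketch (not from the source): a uniform-random policy over a single scalar `BoundedTensorSpec` with minimum 0, implementing only `_distribution` and inheriting `action()` from `TFPolicy`:

import tensorflow as tf
import tensorflow_probability as tfp

from tf_agents.policies import tf_policy
from tf_agents.trajectories import policy_step


class UniformTFPolicy(tf_policy.TFPolicy):
    """Samples actions uniformly from a scalar, zero-based action spec."""

    def _distribution(self, time_step, policy_state):
        batch_size = tf.shape(time_step.step_type)[0]
        num_actions = self.action_spec.maximum - self.action_spec.minimum + 1
        # Zero logits over the valid actions yield a uniform Categorical.
        uniform = tfp.distributions.Categorical(
            logits=tf.zeros([batch_size, num_actions]),
            dtype=self.action_spec.dtype)
        return policy_step.PolicyStep(uniform, policy_state)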
Example no. 10
memory_features = True
lose_on_illegal_move = False
drawless = False
conv_features = True

train_py_environment = gofish_env.GoFishEnv(
    bot,
    max_visible_opponent_hand_size=10,
    max_visible_deck_size=10,
    drawless=drawless,
    lose_on_illegal_move=lose_on_illegal_move,
    memory_features=memory_features,
    conv_features=conv_features)

print('Validating env.')
utils.validate_py_environment(train_py_environment, episodes=5)
print('Validation complete.')

eval_py_environment = gofish_env.GoFishEnv(
    bot,
    max_visible_opponent_hand_size=10,
    max_visible_deck_size=10,
    drawless=drawless,
    lose_on_illegal_move=lose_on_illegal_move,
    memory_features=memory_features,
    conv_features=conv_features)
train_env = tf_py_environment.TFPyEnvironment(train_py_environment)
eval_env = tf_py_environment.TFPyEnvironment(eval_py_environment)

action_tensor_spec = tensor_spec.from_spec(train_py_environment.action_spec())
num_actions = action_tensor_spec.maximum - action_tensor_spec.minimum + 1

if conv_features:
    # The convolutional branch is not included in this snippet; a placeholder
    # keeps the block syntactically valid.
    pass
else:
    # Define a helper function to create Dense layers configured with the right
    # activation and kernel initializer.
    def dense_layer(num_units):
      return tf.keras.layers.Dense(
          num_units,
          activation=tf.keras.activations.relu,
          kernel_initializer=tf.keras.initializers.VarianceScaling(
              scale=2.0, mode='fan_in', distribution='truncated_normal'))

    # QNetwork consists of a sequence of Dense layers followed by a dense layer
    # with `num_actions` units to generate one q_value per available action as
    # its output.
Example no. 11
    py_env = suite_pybullet.load('AntBulletEnv-v0')
    py_env.render(mode="human")
    env = tf_py_environment.TFPyEnvironment(py_env)

    strategy = strategy_utils.get_strategy(tpu=False, use_gpu=True)

    replay_buffer_capacity = 2000
    learning_rate = 1e-3
    fc_layer_params = [128, 64, 64]

    num_iterations = 100

    log_interval = 2
    eval_interval = 2

    action_tensor_spec = tensor_spec.from_spec(env.action_spec())

    num_actions = action_tensor_spec.shape[0]

    with strategy.scope():
        collect_policy = tf.saved_model.load(
            '/tmp/models/expert/AntBulletEnv-v0')

        dense_layers = [
            Dense(num_units, activation=relu) for num_units in fc_layer_params
        ]

        output_layer = Dense(num_actions, activation=None)

        cloning_net = Sequential(dense_layers + [output_layer])
        optimizer = Adam(learning_rate=learning_rate)
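A plausible continuation of this snippet (a sketch, not from the source) wires `cloning_net` and `optimizer` into the behavioral-cloning agent whose constructor appears in Example no. 18; the import is assumed to be `tf_agents.agents.behavioral_cloning.behavioral_cloning_agent`, and the call would sit inside the `strategy.scope()` block above:

bc_agent = behavioral_cloning_agent.BehavioralCloningAgent(
    env.time_step_spec(),
    env.action_spec(),
    cloning_network=cloning_net,
    optimizer=optimizer,
    num_outer_dims=2,
    train_step_counter=tf.Variable(0))
bc_agent.initialize()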
Example no. 12
  def _action(self, time_step, policy_state, seed):
    observation_and_action_constraint_splitter = (
        self.observation_and_action_constraint_splitter)

    outer_dims = nest_utils.get_outer_shape(time_step, self._time_step_spec)
    if observation_and_action_constraint_splitter is not None:
      observation, mask = observation_and_action_constraint_splitter(
          time_step.observation)

      if self._stationary_mask is not None:
        mask = mask * self._stationary_mask

      action_spec = tensor_spec.from_spec(self.action_spec)
      action_spec = cast(tensor_spec.BoundedTensorSpec, action_spec)
      zero_logits = tf.cast(tf.zeros_like(mask), tf.float32)
      masked_categorical = masked.MaskedCategorical(zero_logits, mask)
      action_ = tf.cast(masked_categorical.sample() + action_spec.minimum,
                        action_spec.dtype)

      # If the action spec says each action should be shaped (1,), add another
      # dimension so the final shape is (B, 1) rather than (B,).
      if action_spec.shape.rank == 1:
        action_ = tf.expand_dims(action_, axis=-1)
      policy_info = tensor_spec.sample_spec_nest(
          self._info_spec, outer_dims=outer_dims)
    else:
      observation = time_step.observation
      action_spec = cast(tensor_spec.BoundedTensorSpec, self.action_spec)

      if self._accepts_per_arm_features:
        max_num_arms = action_spec.maximum - action_spec.minimum + 1
        batch_size = tf.shape(time_step.step_type)[0]
        num_actions = observation.get(
            bandit_spec_utils.NUM_ACTIONS_FEATURE_KEY,
            tf.ones(shape=(batch_size,), dtype=tf.int32) * max_num_arms)
        mask = tf.sequence_mask(num_actions, max_num_arms)
        zero_logits = tf.cast(tf.zeros_like(mask), tf.float32)
        masked_categorical = masked.MaskedCategorical(zero_logits, mask)
        action_ = tf.nest.map_structure(
            lambda t: tf.cast(masked_categorical.sample() + t.minimum, t.dtype),
            action_spec)
      elif self._stationary_mask is not None:
        batch_size = tf.shape(time_step.step_type)[0]
        mask = tf.tile(self._stationary_mask, [batch_size, 1])
        zero_logits = tf.cast(tf.zeros_like(mask), tf.float32)
        masked_categorical = masked.MaskedCategorical(zero_logits, mask)
        action_ = tf.cast(masked_categorical.sample() + action_spec.minimum,
                          action_spec.dtype)
      else:
        action_ = tensor_spec.sample_spec_nest(
            self._action_spec, seed=seed, outer_dims=outer_dims)

      policy_info = tensor_spec.sample_spec_nest(
          self._info_spec, outer_dims=outer_dims)

    # Update policy info with chosen arm features.
    if self._accepts_per_arm_features:
      def _gather_fn(t):
        return tf.gather(params=t, indices=action_, batch_dims=1)
      chosen_arm_features = tf.nest.map_structure(
          _gather_fn, observation[bandit_spec_utils.PER_ARM_FEATURE_KEY])

      if policy_utilities.has_chosen_arm_features(self._info_spec):
        policy_info = policy_info._replace(
            chosen_arm_features=chosen_arm_features)

    # TODO(b/78181147): Investigate why this control dependency is required.
    def _maybe_convert_sparse_tensor(t):
      if isinstance(t, tf.SparseTensor):
        return tf.sparse.to_dense(t)
      else:
        return t
    if time_step is not None:
      with tf.control_dependencies(
          tf.nest.flatten(tf.nest.map_structure(_maybe_convert_sparse_tensor,
                                                time_step))):
        action_ = tf.nest.map_structure(tf.identity, action_)

    if self.emit_log_probability:
      if (self._accepts_per_arm_features
          or observation_and_action_constraint_splitter is not None
          or self._stationary_mask is not None):
        action_spec = cast(tensor_spec.BoundedTensorSpec, self.action_spec)
        log_probability = masked_categorical.log_prob(
            action_ - action_spec.minimum)
      else:
        log_probability = tf.nest.map_structure(
            lambda s: _calculate_log_probability(outer_dims, s),
            self._action_spec)
      policy_info = policy_step.set_log_probability(policy_info,
                                                    log_probability)

    step = policy_step.PolicyStep(action_, policy_state, policy_info)
    return step
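For intuition, the masking pattern above amounts to giving every action a zero logit and letting the mask exclude invalid ones, which yields a uniform distribution over the valid actions (a standalone sketch; assumes `masked` is `tf_agents.distributions.masked`):

mask = tf.constant([[1, 0, 1, 1]])                          # one batch entry, 4 actions
zero_logits = tf.cast(tf.zeros_like(mask), tf.float32)
masked_categorical = masked.MaskedCategorical(zero_logits, mask)
action = masked_categorical.sample()                        # uniform over actions {0, 2, 3}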
Example no. 13
def get_action_spec(robot_type):
    return tensor_spec.from_spec(specs.BoundedArraySpec(
        shape=(), dtype=np.int32, minimum=0, maximum=(VectorEnv.get_action_space(robot_type) - 1), name='action'))
Example no. 14
  def __init__(self,
               input_tensor_spec,
               output_tensor_spec,
               fc_layer_params=None,
               dropout_layer_params=None,
               conv_layer_params=None,
               activation_fn=tf.keras.activations.relu,
               kernel_initializer=None,
               last_kernel_initializer=None,
               name='ActorNetwork'):
    """Creates an instance of `ActorNetwork`.

    Args:
      input_tensor_spec: A nest of `tensor_spec.TensorSpec` representing the
        inputs.
      output_tensor_spec: A nest of `tensor_spec.BoundedTensorSpec` representing
        the outputs.
      fc_layer_params: Optional list of fully_connected parameters, where each
        item is the number of units in the layer.
      dropout_layer_params: Optional list of dropout layer parameters, each item
        is the fraction of input units to drop or a dictionary of parameters
        according to the keras.Dropout documentation. The additional parameter
        `permanent`, if set to True, allows applying dropout at inference for
        approximated Bayesian inference. The dropout layers are interleaved with
        the fully connected layers; there is a dropout layer after each fully
        connected layer, except if the entry in the list is None. This list must
        have the same length as fc_layer_params, or be None.
      conv_layer_params: Optional list of convolution layers parameters, where
        each item is a length-three tuple indicating (filters, kernel_size,
        stride).
      activation_fn: Activation function, e.g. tf.nn.relu, slim.leaky_relu, ...
      kernel_initializer: kernel initializer for all layers except for the value
        regression layer. If None, a VarianceScaling initializer will be used.
      last_kernel_initializer: kernel initializer for the value regression
         layer. If None, a RandomUniform initializer will be used.
      name: A string representing the name of the network.

    Raises:
      ValueError: If `input_tensor_spec` or `output_tensor_spec` contains more than one
        item, or if the action data type is not `float`.
    """

    super(ActorNetwork, self).__init__(
        input_tensor_spec=input_tensor_spec,
        state_spec=(),
        name=name)

    output_tensor_spec = tensor_spec.from_spec(output_tensor_spec)

    if len(tf.nest.flatten(input_tensor_spec)) > 1:
      raise ValueError('Only a single observation is supported by this network')

    flat_action_spec = tf.nest.flatten(output_tensor_spec)
    if len(flat_action_spec) > 1:
      raise ValueError('Only a single action is supported by this network')
    self._single_action_spec = flat_action_spec[0]

    if self._single_action_spec.dtype not in [tf.float32, tf.float64]:
      raise ValueError('Only float actions are supported by this network.')

    if kernel_initializer is None:
      kernel_initializer = tf.compat.v1.keras.initializers.VarianceScaling(
          scale=1. / 3., mode='fan_in', distribution='uniform')
    if last_kernel_initializer is None:
      last_kernel_initializer = tf.keras.initializers.RandomUniform(
          minval=-0.003, maxval=0.003)

    # TODO(kbanoop): Replace mlp_layers with encoding networks.
    self._mlp_layers = utils.mlp_layers(
        conv_layer_params,
        fc_layer_params,
        dropout_layer_params,
        activation_fn=activation_fn,
        kernel_initializer=kernel_initializer,
        name='input_mlp')

    self._mlp_layers.append(
        tf.keras.layers.Dense(
            flat_action_spec[0].shape.num_elements(),
            activation=tf.keras.activations.tanh,
            kernel_initializer=last_kernel_initializer,
            name='action'))

    self._output_tensor_spec = output_tensor_spec
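A minimal usage sketch for the network defined above (not from the source; `observation_spec`, `action_spec`, and `observations` are assumed placeholders for a single continuous-action setting):

actor_net = ActorNetwork(
    input_tensor_spec=observation_spec,
    output_tensor_spec=action_spec,
    fc_layer_params=(400, 300))
actions, _ = actor_net(observations)  # actions are tanh-squashed to fit the spec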
Example no. 15
def _action_space_fixture(gym_space_bound, gym_space_shape):
    gym_space = gym.spaces.Box(low=-gym_space_bound,
                               high=gym_space_bound,
                               shape=gym_space_shape,
                               dtype=np.float32)
    return tensor_spec.from_spec(spec_from_gym_space(gym_space, name="action"))
Example no. 16
    def __init__(
            self,
            time_step_spec: ts.TimeStep,
            action_spec: types.NestedTensorSpec,
            q_network: network.Network,
            optimizer: types.Optimizer,
            observation_and_action_constraint_splitter: Optional[
                types.Splitter] = None,
            epsilon_greedy: Optional[types.FloatOrReturningFloat] = 0.1,
            n_step_update: int = 1,
            boltzmann_temperature: Optional[
                types.FloatOrReturningFloat] = None,
            emit_log_probability: bool = False,
            # Params for target network updates
            target_q_network: Optional[network.Network] = None,
            target_update_tau: types.Float = 1.0,
            target_update_period: int = 1,
            # Params for training.
            td_errors_loss_fn: Optional[types.LossFn] = None,
            gamma: types.Float = 1.0,
            reward_scale_factor: types.Float = 1.0,
            gradient_clipping: Optional[types.Float] = None,
            # Params for debugging
            debug_summaries: bool = False,
            summarize_grads_and_vars: bool = False,
            train_step_counter: Optional[tf.Variable] = None,
            name: Optional[Text] = None):
        """Creates a DQN Agent.

    Args:
      time_step_spec: A `TimeStep` spec of the expected time_steps.
      action_spec: A nest of BoundedTensorSpec representing the actions.
      q_network: A `tf_agents.network.Network` to be used by the agent. The
        network will be called with `call(observation, step_type)` and should
        emit logits over the action space.
      optimizer: The optimizer to use for training.
      observation_and_action_constraint_splitter: A function used to process
        observations with action constraints. These constraints can indicate,
        for example, a mask of valid/invalid actions for a given state of the
        environment.
        The function takes in a full observation and returns a tuple consisting
        of 1) the part of the observation intended as input to the network and
        2) the constraint. An example
        `observation_and_action_constraint_splitter` could be as simple as:
        ```
        def observation_and_action_constraint_splitter(observation):
          return observation['network_input'], observation['constraint']
        ```
        *Note*: when using `observation_and_action_constraint_splitter`, make
        sure the provided `q_network` is compatible with the network-specific
        half of the output of the `observation_and_action_constraint_splitter`.
        In particular, `observation_and_action_constraint_splitter` will be
        called on the observation before passing to the network.
        If `observation_and_action_constraint_splitter` is None, action
        constraints are not applied.
      epsilon_greedy: probability of choosing a random action in the default
        epsilon-greedy collect policy (used only if a wrapper is not provided to
        the collect_policy method). Only one of epsilon_greedy and
        boltzmann_temperature should be provided.
      n_step_update: The number of steps to consider when computing TD error and
        TD loss. Defaults to single-step updates. Note that this requires the
        user to call train on Trajectory objects with a time dimension of
        `n_step_update + 1`. However, note that we do not yet support
        `n_step_update > 1` in the case of RNNs (i.e., non-empty
        `q_network.state_spec`).
      boltzmann_temperature: Temperature value to use for Boltzmann sampling of
        the actions during data collection. The closer to 0.0, the higher the
        probability of choosing the best action. Only one of epsilon_greedy and
        boltzmann_temperature should be provided.
      emit_log_probability: Whether policies emit log probabilities or not.
      target_q_network: (Optional.)  A `tf_agents.network.Network`
        to be used as the target network during Q learning.  Every
        `target_update_period` train steps, the weights from
        `q_network` are copied (possibly with smoothing via
        `target_update_tau`) to `target_q_network`.

        If `target_q_network` is not provided, it is created by
        making a copy of `q_network`, which initializes a new
        network with the same structure and its own layers and weights.

        Network copying is performed via the `Network.copy` superclass method,
        and may inadvertently lead the resulting network to share weights
        with the original.  This can happen if, for example, the original
        network accepted a pre-built Keras layer in its `__init__`, or
        accepted a Keras layer that wasn't built, but neglected to create
        a new copy.

        In these cases, it is up to you to provide a target Network having
        weights that are not shared with the original `q_network`.
        If you provide a `target_q_network` that shares any
        weights with `q_network`, a warning will be logged but
        no exception is thrown.

        Note: shallow copies of Keras layers may be built via the code:

        ```python
        new_layer = type(layer).from_config(layer.get_config())
        ```
      target_update_tau: Factor for soft update of the target networks.
      target_update_period: Period for soft update of the target networks.
      td_errors_loss_fn: A function for computing the TD errors loss. If None, a
        default value of element_wise_huber_loss is used. This function takes as
        input the target and the estimated Q values and returns the loss for
        each element of the batch.
      gamma: A discount factor for future rewards.
      reward_scale_factor: Multiplicative scale for the reward.
      gradient_clipping: Norm length to clip gradients.
      debug_summaries: A bool to gather debug summaries.
      summarize_grads_and_vars: If True, gradient and network variable summaries
        will be written during training.
      train_step_counter: An optional counter to increment every time the train
        op is run.  Defaults to the global_step.
      name: The name of this agent. All variables in this module will fall
        under that name. Defaults to the class name.

    Raises:
      ValueError: If `action_spec` contains more than one action or action
        spec minimum is not equal to 0.
      ValueError: If the q networks do not emit floating point outputs with
        inner shape matching `action_spec`.
      NotImplementedError: If `q_network` has non-empty `state_spec` (i.e., an
        RNN is provided) and `n_step_update > 1`.
    """
        tf.Module.__init__(self, name=name)

        action_spec = tensor_spec.from_spec(action_spec)
        self._check_action_spec(action_spec)

        if epsilon_greedy is not None and boltzmann_temperature is not None:
            raise ValueError(
                'Configured both epsilon_greedy value {} and temperature {}, '
                'however only one of them can be used for exploration.'.format(
                    epsilon_greedy, boltzmann_temperature))

        self._observation_and_action_constraint_splitter = (
            observation_and_action_constraint_splitter)
        self._q_network = q_network
        net_observation_spec = time_step_spec.observation
        if observation_and_action_constraint_splitter:
            net_observation_spec, _ = observation_and_action_constraint_splitter(
                net_observation_spec)
        q_network.create_variables(net_observation_spec)
        if target_q_network:
            target_q_network.create_variables(net_observation_spec)
        self._target_q_network = common.maybe_copy_target_network_with_checks(
            self._q_network,
            target_q_network,
            input_spec=net_observation_spec,
            name='TargetQNetwork')

        self._check_network_output(self._q_network, 'q_network')
        self._check_network_output(self._target_q_network, 'target_q_network')

        self._epsilon_greedy = epsilon_greedy
        self._n_step_update = n_step_update
        self._boltzmann_temperature = boltzmann_temperature
        self._optimizer = optimizer
        self._td_errors_loss_fn = (td_errors_loss_fn
                                   or common.element_wise_huber_loss)
        self._gamma = gamma
        self._reward_scale_factor = reward_scale_factor
        self._gradient_clipping = gradient_clipping
        self._update_target = self._get_target_updater(target_update_tau,
                                                       target_update_period)

        policy, collect_policy = self._setup_policy(time_step_spec,
                                                    action_spec,
                                                    boltzmann_temperature,
                                                    emit_log_probability)

        if q_network.state_spec and n_step_update != 1:
            raise NotImplementedError(
                'DqnAgent does not currently support n-step updates with stateful '
                'networks (i.e., RNNs), but n_step_update = {}'.format(
                    n_step_update))

        train_sequence_length = (n_step_update +
                                 1 if not q_network.state_spec else None)

        super(DqnAgent, self).__init__(
            time_step_spec,
            action_spec,
            policy,
            collect_policy,
            train_sequence_length=train_sequence_length,
            debug_summaries=debug_summaries,
            summarize_grads_and_vars=summarize_grads_and_vars,
            train_step_counter=train_step_counter,
        )

        if q_network.state_spec:
            # AsNStepTransition does not support emitting [B, T, ...] tensors,
            # which we need for DQN-RNN.
            self._as_transition = data_converter.AsTransition(
                self.data_context, squeeze_time_dim=False)
        else:
            # This reduces the n-step return and removes the extra time dimension,
            # allowing the rest of the computations to be independent of the
            # n-step parameter.
            self._as_transition = data_converter.AsNStepTransition(
                self.data_context, gamma=gamma, n=n_step_update)
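For reference, a typical construction sketch following the public DQN tutorial (assumes a `q_net` and a `train_env` built elsewhere, `dqn_agent` from `tf_agents.agents.dqn`, and `common` from `tf_agents.utils`):

agent = dqn_agent.DqnAgent(
    train_env.time_step_spec(),
    train_env.action_spec(),
    q_network=q_net,
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
    td_errors_loss_fn=common.element_wise_squared_loss,
    train_step_counter=tf.Variable(0))
agent.initialize()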
Example no. 17
 def _nb_actions(self):
     """return number of actions"""
     action_tensor_spec = tensor_spec.from_spec(self.env.action_spec())
     return action_tensor_spec.maximum - action_tensor_spec.minimum + 1
Example no. 18
    def __init__(
            self,
            time_step_spec: ts.TimeStep,
            action_spec: types.NestedTensorSpec,
            cloning_network: network.Network,
            optimizer: types.Optimizer,
            num_outer_dims: Literal[1, 2] = 1,  # pylint: disable=bad-whitespace
            epsilon_greedy: types.Float = 0.1,
            loss_fn: Optional[Callable[[types.NestedTensor, bool],
                                       types.Tensor]] = None,
            gradient_clipping: Optional[types.Float] = None,
            # Params for debugging.
            debug_summaries: bool = False,
            summarize_grads_and_vars: bool = False,
            train_step_counter: Optional[tf.Variable] = None,
            name: Optional[Text] = None):
        """Creates an instance of a Behavioral Cloning agent.

    Args:
      time_step_spec: A `TimeStep` spec of the expected time_steps.
      action_spec: A nest of BoundedTensorSpec representing the actions.
      cloning_network: A `tf_agents.networks.Network` to be used by the agent.
        The network will be called as

          ```
          network(observation, step_type=step_type, network_state=initial_state)
          ```
        and must return a 2-tuple with elements `(output, next_network_state)`
      optimizer: The optimizer to use for training.
      num_outer_dims: The number of outer dimensions for the agent. Must be
        either 1 or 2. If 2, training will require both a batch_size and time
        dimension on every Tensor; if 1, training will require only a batch_size
        outer dimension.
      epsilon_greedy: Probability of choosing a random action in the default
        epsilon-greedy collect policy (used only if actions are discrete).
      loss_fn: A function for computing the error between the output of the
        cloning network and the action that was taken. If None, the loss
        depends on the action dtype. The `loss_fn` is called with parameters:
        `(experience, training)`, and must return a loss value for each element
        of the batch.
      gradient_clipping: Norm length to clip gradients.
      debug_summaries: A bool to gather debug summaries.
      summarize_grads_and_vars: If True, gradient and network variable summaries
        will be written during training.
      train_step_counter: An optional counter to increment every time the train
        op is run.  Defaults to the global_step.
      name: The name of this agent. All variables in this module will fall
        under that name. Defaults to the class name.
    """
        tf.Module.__init__(self, name=name)
        self._cloning_network = cloning_network
        self._optimizer = optimizer
        self._gradient_clipping = gradient_clipping

        action_spec = tensor_spec.from_spec(action_spec)
        flat_action_spec = tf.nest.flatten(action_spec)
        continuous_specs = [
            tensor_spec.is_continuous(s) for s in flat_action_spec
        ]

        if not flat_action_spec:
            raise ValueError(
                'The `action_spec` must contain at least one action.')

        single_discrete_scalar_action = (
            len(flat_action_spec) == 1 and flat_action_spec[0].shape.rank == 0
            and not tensor_spec.is_continuous(flat_action_spec[0]))
        single_continuous_action = (len(flat_action_spec) == 1
                                    and tensor_spec.is_continuous(
                                        flat_action_spec[0]))

        if (not loss_fn and not single_discrete_scalar_action
                and not single_continuous_action):
            raise ValueError(
                'A `loss_fn` must be provided unless there is a single, scalar '
                'discrete action or a single (scalar or non-scalar) continuous '
                'action.')

        self._network_output_spec = cloning_network.create_variables(
            time_step_spec.observation)

        # If there is a mix of continuous and discrete actions, use an actor
        # policy via `_setup_as_continuous`; this is fine as long as the user
        # provided a custom `loss_fn`, which was verified above.
        if any(continuous_specs):
            policy, collect_policy = self._setup_as_continuous(
                time_step_spec, action_spec, loss_fn)
        else:
            policy, collect_policy = self._setup_as_discrete(
                time_step_spec, action_spec, loss_fn, epsilon_greedy)

        super(BehavioralCloningAgent,
              self).__init__(time_step_spec,
                             action_spec,
                             policy,
                             collect_policy,
                             train_sequence_length=None,
                             debug_summaries=debug_summaries,
                             summarize_grads_and_vars=summarize_grads_and_vars,
                             train_step_counter=train_step_counter)

        self._as_trajectory = data_converter.AsTrajectory(
            self.data_context,
            sequence_length=None,
            num_outer_dims=num_outer_dims)
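
The discrete-vs-continuous dispatch above hinges on inspecting the flattened action spec; the same check can be reproduced standalone. A small sketch with a hypothetical scalar discrete spec:

import tensorflow as tf
from tf_agents.specs import tensor_spec

spec = tensor_spec.BoundedTensorSpec((), tf.int32, minimum=0, maximum=3)
flat = tf.nest.flatten(spec)
single_discrete_scalar_action = (
    len(flat) == 1 and flat[0].shape.rank == 0
    and not tensor_spec.is_continuous(flat[0]))  # -> True, so no loss_fn needed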
Esempio n. 19
0
    def __init__(self,
                 time_step_spec: ts.TimeStep,
                 action_spec: types.NestedTensorSpec,
                 policy: tf_policy.TFPolicy,
                 collect_policy: tf_policy.TFPolicy,
                 train_sequence_length: Optional[int],
                 num_outer_dims: int = 2,
                 training_data_spec: Optional[types.NestedTensorSpec] = None,
                 debug_summaries: bool = False,
                 summarize_grads_and_vars: bool = False,
                 enable_summaries: bool = True,
                 train_step_counter: Optional[tf.Variable] = None):
        """Meant to be called by subclass constructors.

    Args:
      time_step_spec: A nest of tf.TypeSpec representing the time_steps.
        Provided by the user.
      action_spec: A nest of BoundedTensorSpec representing the actions.
        Provided by the user.
      policy: An instance of `tf_policy.TFPolicy` representing the
        Agent's current policy.
      collect_policy: An instance of `tf_policy.TFPolicy` representing the
        Agent's current data collection policy (used to set `self.step_spec`).
      train_sequence_length: A python integer or `None`, signifying the number
        of time steps required from tensors in `experience` as passed to
        `train()`.  All tensors in `experience` will be shaped `[B, T, ...]` but
        for certain agents, `T` should be fixed.  For example, DQN requires
        transitions in the form of 2 time steps, so for a non-RNN DQN Agent, set
        this value to 2.  For agents that don't care, or which can handle `T`
        unknown at graph build time (i.e. most RNN-based agents), set this
        argument to `None`.
      num_outer_dims: The number of outer dimensions for the agent. Must be
        either 1 or 2. If 2, training will require both a batch_size and time
        dimension on every Tensor; if 1, training will require only a batch_size
        outer dimension.
      training_data_spec: A nest of TensorSpec specifying the structure of data
        the train() function expects. If None, defaults to the trajectory_spec
        of the collect_policy.
      debug_summaries: A bool; if true, subclasses should gather debug
        summaries.
      summarize_grads_and_vars: A bool; if true, subclasses should additionally
        collect gradient and variable summaries.
      enable_summaries: A bool; if false, subclasses should not gather any
        summaries (debug or otherwise); subclasses should gate *all* summaries
        using either `summaries_enabled`, `debug_summaries`, or
        `summarize_grads_and_vars` properties.
      train_step_counter: An optional counter to increment every time the train
        op is run.  Defaults to the global_step.

    Raises:
      ValueError: If `num_outer_dims` is not in `[1, 2]`.
    """
        common.check_tf1_allowed()
        common.tf_agents_gauge.get_cell("TFAgent").set(True)
        common.tf_agents_gauge.get_cell(str(type(self))).set(True)
        if not isinstance(time_step_spec, ts.TimeStep):
            raise TypeError(
                "The `time_step_spec` must be an instance of `TimeStep`, but is `{}`."
                .format(type(time_step_spec)))

        if num_outer_dims not in [1, 2]:
            raise ValueError("num_outer_dims must be in [1, 2].")

        time_step_spec = tensor_spec.from_spec(time_step_spec)
        action_spec = tensor_spec.from_spec(action_spec)
        self._time_step_spec = time_step_spec
        self._action_spec = action_spec
        self._policy = policy
        self._collect_policy = collect_policy
        self._train_sequence_length = train_sequence_length
        self._num_outer_dims = num_outer_dims
        self._debug_summaries = debug_summaries
        self._summarize_grads_and_vars = summarize_grads_and_vars
        self._enable_summaries = enable_summaries
        self._training_data_spec = training_data_spec
        # Data context for data collected directly from the collect policy.
        self._collect_data_context = data_converter.DataContext(
            time_step_spec=self._time_step_spec,
            action_spec=self._action_spec,
            info_spec=collect_policy.info_spec)
        # Data context for data passed to train().  May be different if
        # training_data_spec is provided.
        if training_data_spec is not None:
            training_data_spec = tensor_spec.from_spec(training_data_spec)
            # training_data_spec can be any structure, so build a data_context
            # on a best-effort basis, falling back to the collect data spec.
            training_discount_spec = getattr(training_data_spec, "discount",
                                             time_step_spec.discount)
            training_observation_spec = getattr(training_data_spec,
                                                "observation",
                                                time_step_spec.observation)
            training_reward_spec = getattr(training_data_spec, "reward",
                                           time_step_spec.reward)
            training_step_type_spec = getattr(training_data_spec, "step_type",
                                              time_step_spec.step_type)
            training_policy_info_spec = getattr(training_data_spec,
                                                "policy_info",
                                                collect_policy.info_spec)
            training_action_spec = getattr(training_data_spec, "action",
                                           action_spec)
            self._data_context = data_converter.DataContext(
                time_step_spec=ts.TimeStep(
                    discount=training_discount_spec,
                    observation=training_observation_spec,
                    reward=training_reward_spec,
                    step_type=training_step_type_spec),
                action_spec=training_action_spec,
                info_spec=training_policy_info_spec)
        else:
            self._data_context = data_converter.DataContext(
                time_step_spec=time_step_spec,
                action_spec=action_spec,
                info_spec=collect_policy.info_spec)
        if train_step_counter is None:
            train_step_counter = tf.compat.v1.train.get_or_create_global_step()
        self._train_step_counter = train_step_counter
        self._train_fn = common.function_in_tf1()(self._train)
        self._initialize_fn = common.function_in_tf1()(self._initialize)
        self._preprocess_sequence_fn = common.function_in_tf1()(
            self._preprocess_sequence)
        self._loss_fn = common.function_in_tf1()(self._loss)
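
The best-effort construction of the training data context is just `getattr` with a fallback, applied field by field; a tiny illustration with a hypothetical partial spec that only overrides the reward:

import collections

# Hypothetical partial training-data spec carrying only a reward field.
PartialSpec = collections.namedtuple('PartialSpec', ['reward'])
partial = PartialSpec(reward='custom_reward_spec')

# Present fields win; missing fields fall back to the collect-time spec.
reward_spec = getattr(partial, 'reward', 'collect_reward_spec')        # custom
discount_spec = getattr(partial, 'discount', 'collect_discount_spec')  # fallback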
Esempio n. 20
0
def main(_):
    # setting up
    start_time = time.time()
    tf.compat.v1.enable_resource_variables()
    tf.compat.v1.disable_eager_execution()
    logging.set_verbosity(logging.INFO)
    global observation_omit_size, goal_coord, sample_count, iter_count, episode_size_buffer, episode_return_buffer

    root_dir = os.path.abspath(os.path.expanduser(FLAGS.logdir))
    if not tf.io.gfile.exists(root_dir):
        tf.io.gfile.makedirs(root_dir)
    log_dir = os.path.join(root_dir, FLAGS.environment)

    if not tf.io.gfile.exists(log_dir):
        tf.io.gfile.makedirs(log_dir)
    save_dir = os.path.join(log_dir, "models")
    if not tf.io.gfile.exists(save_dir):
        tf.io.gfile.makedirs(save_dir)

    print("directory for recording experiment data:", log_dir)

    # restore counters and buffers in case training was paused and resumed
    try:
        sample_count = np.load(os.path.join(log_dir,
                                            "sample_count.npy")).tolist()
        iter_count = np.load(os.path.join(log_dir, "iter_count.npy")).tolist()
        episode_size_buffer = np.load(
            os.path.join(log_dir, "episode_size_buffer.npy")).tolist()
        episode_return_buffer = np.load(
            os.path.join(log_dir, "episode_return_buffer.npy")).tolist()
    except Exception:  # no saved state found; start from fresh counters
        sample_count = 0
        iter_count = 0
        episode_size_buffer = []
        episode_return_buffer = []

    train_summary_writer = tf.compat.v2.summary.create_file_writer(
        os.path.join(log_dir, "train", "in_graph_data"),
        flush_millis=10 * 1000)
    train_summary_writer.set_as_default()

    global_step = tf.compat.v1.train.get_or_create_global_step()
    with tf.compat.v2.summary.record_if(True):
        # environment related stuff
        env = do.get_environment(env_name=FLAGS.environment)
        py_env = wrap_env(
            skill_wrapper.SkillWrapper(
                env,
                num_latent_skills=FLAGS.num_skills,
                skill_type=FLAGS.skill_type,
                preset_skill=None,
                min_steps_before_resample=FLAGS.min_steps_before_resample,
                resample_prob=FLAGS.resample_prob,
            ),
            max_episode_steps=FLAGS.max_env_steps,
        )

        # all specifications required for all networks and agents
        py_action_spec = py_env.action_spec()
        tf_action_spec = tensor_spec.from_spec(
            py_action_spec)  # policy, critic action spec
        env_obs_spec = py_env.observation_spec()
        py_env_time_step_spec = ts.time_step_spec(
            env_obs_spec)  # replay buffer time_step spec
        if observation_omit_size > 0:
            agent_obs_spec = array_spec.BoundedArraySpec(
                (env_obs_spec.shape[0] - observation_omit_size, ),
                env_obs_spec.dtype,
                minimum=env_obs_spec.minimum,
                maximum=env_obs_spec.maximum,
                name=env_obs_spec.name,
            )  # policy, critic observation spec
        else:
            agent_obs_spec = env_obs_spec
        py_agent_time_step_spec = ts.time_step_spec(
            agent_obs_spec)  # policy, critic time_step spec
        tf_agent_time_step_spec = tensor_spec.from_spec(
            py_agent_time_step_spec)

        if not FLAGS.reduced_observation:
            skill_dynamics_observation_size = (
                py_env_time_step_spec.observation.shape[0] - FLAGS.num_skills)
        else:
            skill_dynamics_observation_size = FLAGS.reduced_observation

        # TODO(architsh): Shift coordinate hiding to actor_net and critic_net (also useful for further image-based processing)
        actor_net = actor_distribution_network.ActorDistributionNetwork(
            tf_agent_time_step_spec.observation,
            tf_action_spec,
            fc_layer_params=(FLAGS.hidden_layer_size, ) * 2,
            continuous_projection_net=do._normal_projection_net,
        )

        critic_net = critic_network.CriticNetwork(
            (tf_agent_time_step_spec.observation, tf_action_spec),
            observation_fc_layer_params=None,
            action_fc_layer_params=None,
            joint_fc_layer_params=(FLAGS.hidden_layer_size, ) * 2,
        )

        if (FLAGS.skill_dynamics_relabel_type is not None
                and "importance_sampling" in FLAGS.skill_dynamics_relabel_type
                and FLAGS.is_clip_eps > 1.0):
            reweigh_batches_flag = True
        else:
            reweigh_batches_flag = False

        agent = dads_agent.DADSAgent(
            # DADS parameters
            save_dir,
            skill_dynamics_observation_size,
            observation_modify_fn=do.process_observation,
            restrict_input_size=observation_omit_size,
            latent_size=FLAGS.num_skills,
            latent_prior=FLAGS.skill_type,
            prior_samples=FLAGS.random_skills,
            fc_layer_params=(FLAGS.hidden_layer_size, ) * 2,
            normalize_observations=FLAGS.normalize_data,
            network_type=FLAGS.graph_type,
            num_mixture_components=FLAGS.num_components,
            fix_variance=FLAGS.fix_variance,
            reweigh_batches=reweigh_batches_flag,
            skill_dynamics_learning_rate=FLAGS.skill_dynamics_lr,
            # SAC parameters
            time_step_spec=tf_agent_time_step_spec,
            action_spec=tf_action_spec,
            actor_network=actor_net,
            critic_network=critic_net,
            target_update_tau=0.005,
            target_update_period=1,
            actor_optimizer=tf.compat.v1.train.AdamOptimizer(
                learning_rate=FLAGS.agent_lr),
            critic_optimizer=tf.compat.v1.train.AdamOptimizer(
                learning_rate=FLAGS.agent_lr),
            alpha_optimizer=tf.compat.v1.train.AdamOptimizer(
                learning_rate=FLAGS.agent_lr),
            td_errors_loss_fn=tf.compat.v1.losses.mean_squared_error,
            gamma=FLAGS.agent_gamma,
            reward_scale_factor=1.0 / (FLAGS.agent_entropy + 1e-12),
            gradient_clipping=None,
            debug_summaries=FLAGS.debug,
            train_step_counter=global_step,
        )

        # evaluation policy
        eval_policy = py_tf_policy.PyTFPolicy(agent.policy)

        # collection policy
        if FLAGS.collect_policy == "default":
            collect_policy = py_tf_policy.PyTFPolicy(agent.collect_policy)
        elif FLAGS.collect_policy == "ou_noise":
            collect_policy = py_tf_policy.PyTFPolicy(
                ou_noise_policy.OUNoisePolicy(agent.collect_policy,
                                              ou_stddev=0.2,
                                              ou_damping=0.15))

        # relabelling policy deals with batches of data, unlike collect and eval
        relabel_policy = py_tf_policy.PyTFPolicy(agent.collect_policy)

        # constructing a replay buffer, need a python spec
        policy_step_spec = policy_step.PolicyStep(action=py_action_spec,
                                                  state=(),
                                                  info=())

        if (FLAGS.skill_dynamics_relabel_type is not None
                and "importance_sampling" in FLAGS.skill_dynamics_relabel_type
                and FLAGS.is_clip_eps > 1.0):
            policy_step_spec = policy_step_spec._replace(
                info=policy_step.set_log_probability(
                    policy_step_spec.info,
                    array_spec.ArraySpec(
                        shape=(), dtype=np.float32, name="action_log_prob"),
                ))

        trajectory_spec = from_transition(py_env_time_step_spec,
                                          policy_step_spec,
                                          py_env_time_step_spec)
        capacity = FLAGS.replay_buffer_capacity
        # for all the data collected
        rbuffer = py_uniform_replay_buffer.PyUniformReplayBuffer(
            capacity=capacity, data_spec=trajectory_spec)

        if FLAGS.train_skill_dynamics_on_policy:
            # for on-policy data (if something special is required)
            on_buffer = py_uniform_replay_buffer.PyUniformReplayBuffer(
                capacity=FLAGS.initial_collect_steps + FLAGS.collect_steps +
                10,
                data_spec=trajectory_spec,
            )

        # insert experience manually with relabelled rewards and skills
        agent.build_agent_graph()
        agent.build_skill_dynamics_graph()
        agent.create_savers()

        # saving this way requires the saver to live outside the agent object
        train_checkpointer = common.Checkpointer(
            ckpt_dir=os.path.join(save_dir, "agent"),
            agent=agent,
            global_step=global_step,
        )
        policy_checkpointer = common.Checkpointer(
            ckpt_dir=os.path.join(save_dir, "policy"),
            policy=agent.policy,
            global_step=global_step,
        )
        rb_checkpointer = common.Checkpointer(
            ckpt_dir=os.path.join(save_dir, "replay_buffer"),
            max_to_keep=1,
            replay_buffer=rbuffer,
        )

        setup_time = time.time() - start_time
        print("Setup time:", setup_time)

        with tf.compat.v1.Session().as_default() as sess:
            eval_policy.session = sess
            eval_policy.initialize(None)
            eval_policy.restore(os.path.join(FLAGS.logdir, "models", "policy"))

            plotdir = os.path.join(FLAGS.logdir, "plots")
            if not os.path.exists(plotdir):
                os.mkdir(plotdir)
            do.FLAGS = FLAGS
            do.eval_loop(eval_dir=plotdir,
                         eval_policy=eval_policy,
                         plot_name="plot")
Esempio n. 21
0
 def action_spec(self):
     return tensor_spec.from_spec(self._envs[0].tf_env.action_spec())
Esempio n. 22
0
    def load(self):
        # setting up
        tf.compat.v1.enable_resource_variables()
        tf.compat.v1.disable_eager_execution()

        root_dir = os.path.abspath(os.path.expanduser(self.flags.logdir))
        if not tf.io.gfile.exists(root_dir):
            tf.io.gfile.makedirs(root_dir)
        log_dir = os.path.join(root_dir, self.flags.environment)

        if not tf.io.gfile.exists(log_dir):
            tf.io.gfile.makedirs(log_dir)
        save_dir = os.path.join(log_dir, "models")
        if not tf.io.gfile.exists(save_dir):
            tf.io.gfile.makedirs(save_dir)

        train_summary_writer = tf.compat.v2.summary.create_file_writer(
            os.path.join(log_dir, "train", "in_graph_data"),
            flush_millis=10 * 1000)
        train_summary_writer.set_as_default()

        global_step = tf.compat.v1.train.get_or_create_global_step()
        with tf.compat.v2.summary.record_if(True):
            # environment related stuff
            env = do.get_environment(env_name=self.flags.environment)
            py_env = wrap_env(
                skill_wrapper.SkillWrapper(
                    env,
                    num_latent_skills=self.flags.num_skills,
                    skill_type=self.flags.skill_type,
                    preset_skill=None,
                    min_steps_before_resample=self.flags.
                    min_steps_before_resample,
                    resample_prob=self.flags.resample_prob,
                ),
                max_episode_steps=self.flags.max_env_steps,
            )

            # all specifications required for all networks and agents
            py_action_spec = py_env.action_spec()
            tf_action_spec = tensor_spec.from_spec(
                py_action_spec)  # policy, critic action spec
            env_obs_spec = py_env.observation_spec()
            py_env_time_step_spec = ts.time_step_spec(
                env_obs_spec)  # replay buffer time_step spec
            if self.flags.observation_omission_size > 0:
                agent_obs_spec = array_spec.BoundedArraySpec(
                    (env_obs_spec.shape[0] -
                     self.flags.observation_omission_size, ),
                    env_obs_spec.dtype,
                    minimum=env_obs_spec.minimum,
                    maximum=env_obs_spec.maximum,
                    name=env_obs_spec.name,
                )  # policy, critic observation spec
            else:
                agent_obs_spec = env_obs_spec
            py_agent_time_step_spec = ts.time_step_spec(
                agent_obs_spec)  # policy, critic time_step spec
            tf_agent_time_step_spec = tensor_spec.from_spec(
                py_agent_time_step_spec)

            if not self.flags.reduced_observation:
                skill_dynamics_observation_size = (
                    py_env_time_step_spec.observation.shape[0] -
                    self.flags.num_skills)
            else:
                skill_dynamics_observation_size = self.flags.reduced_observation

            # TODO(architsh): Shift coordinate hiding to actor_net and critic_net (also useful for further image-based processing)
            actor_net = actor_distribution_network.ActorDistributionNetwork(
                tf_agent_time_step_spec.observation,
                tf_action_spec,
                fc_layer_params=(self.flags.hidden_layer_size, ) * 2,
                continuous_projection_net=do._normal_projection_net,
            )

            critic_net = critic_network.CriticNetwork(
                (tf_agent_time_step_spec.observation, tf_action_spec),
                observation_fc_layer_params=None,
                action_fc_layer_params=None,
                joint_fc_layer_params=(self.flags.hidden_layer_size, ) * 2,
            )

            if (self.flags.skill_dynamics_relabel_type is not None
                    and "importance_sampling"
                    in self.flags.skill_dynamics_relabel_type
                    and self.flags.is_clip_eps > 1.0):
                reweigh_batches_flag = True
            else:
                reweigh_batches_flag = False

            agent = dads_agent.DADSAgent(
                # DADS parameters
                save_dir,
                skill_dynamics_observation_size,
                observation_modify_fn=self.process_observation,
                restrict_input_size=self.flags.observation_omission_size,
                latent_size=self.flags.num_skills,
                latent_prior=self.flags.skill_type,
                prior_samples=self.flags.random_skills,
                fc_layer_params=(self.flags.hidden_layer_size, ) * 2,
                normalize_observations=self.flags.normalize_data,
                network_type=self.flags.graph_type,
                num_mixture_components=self.flags.num_components,
                fix_variance=self.flags.fix_variance,
                reweigh_batches=reweigh_batches_flag,
                skill_dynamics_learning_rate=self.flags.skill_dynamics_lr,
                # SAC parameters
                time_step_spec=tf_agent_time_step_spec,
                action_spec=tf_action_spec,
                actor_network=actor_net,
                critic_network=critic_net,
                target_update_tau=0.005,
                target_update_period=1,
                actor_optimizer=tf.compat.v1.train.AdamOptimizer(
                    learning_rate=self.flags.agent_lr),
                critic_optimizer=tf.compat.v1.train.AdamOptimizer(
                    learning_rate=self.flags.agent_lr),
                alpha_optimizer=tf.compat.v1.train.AdamOptimizer(
                    learning_rate=self.flags.agent_lr),
                td_errors_loss_fn=tf.compat.v1.losses.mean_squared_error,
                gamma=self.flags.agent_gamma,
                reward_scale_factor=1.0 / (self.flags.agent_entropy + 1e-12),
                gradient_clipping=None,
                debug_summaries=self.flags.debug,
                train_step_counter=global_step,
            )

            # evaluation policy
            eval_policy = py_tf_policy.PyTFPolicy(agent.policy)

            # constructing a replay buffer, need a python spec
            policy_step_spec = policy_step.PolicyStep(action=py_action_spec,
                                                      state=(),
                                                      info=())

            if (self.flags.skill_dynamics_relabel_type is not None
                    and "importance_sampling"
                    in self.flags.skill_dynamics_relabel_type
                    and self.flags.is_clip_eps > 1.0):
                policy_step_spec = policy_step_spec._replace(
                    info=policy_step.set_log_probability(
                        policy_step_spec.info,
                        array_spec.ArraySpec(
                            shape=(
                            ), dtype=np.float32, name="action_log_prob"),
                    ))

            # insert experience manually with relabelled rewards and skills
            agent.build_agent_graph()
            agent.build_skill_dynamics_graph()

            with tf.compat.v1.Session().as_default() as sess:
                eval_policy.session = sess
                eval_policy.initialize(None)
                eval_policy.restore(
                    os.path.join(self.flags.logdir, "models", "policy"))
                self.policy = eval_policy
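
A possible way to drive this loader, assuming the method lives on a class (here called `DADSPolicyLoader`, a hypothetical name) that stores the flags object on `self.flags`:

# Hypothetical usage; the class name and flags object are assumptions.
loader = DADSPolicyLoader(flags)
loader.load()
eval_policy = loader.policy  # a py_tf_policy.PyTFPolicy bound to a session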
Esempio n. 23
0
def train_eval(
        root_dir,
        env_name='CartPole-v0',
        # Training params
        initial_collect_steps=1000,
        num_iterations=100000,
        fc_layer_params=(100, ),
        # Agent params
        epsilon_greedy=0.1,
        batch_size=64,
        learning_rate=1e-3,
        n_step_update=1,
        gamma=0.99,
        target_update_tau=0.05,
        target_update_period=5,
        reward_scale_factor=1.0,
        # Replay params
        reverb_port=None,
        replay_capacity=100000,
        # Others
        policy_save_interval=1000,
        eval_interval=1000,
        eval_episodes=10):
    """Trains and evaluates DQN."""
    collect_env = suite_gym.load(env_name)
    eval_env = suite_gym.load(env_name)

    time_step_tensor_spec = tensor_spec.from_spec(collect_env.time_step_spec())
    action_tensor_spec = tensor_spec.from_spec(collect_env.action_spec())

    train_step = train_utils.create_train_step()
    num_actions = action_tensor_spec.maximum - action_tensor_spec.minimum + 1

    # Define a helper function to create Dense layers configured with the right
    # activation and kernel initializer.
    def dense_layer(num_units):
        return tf.keras.layers.Dense(
            num_units,
            activation=tf.keras.activations.relu,
            kernel_initializer=tf.keras.initializers.VarianceScaling(
                scale=2.0, mode='fan_in', distribution='truncated_normal'))

    # QNetwork consists of a sequence of Dense layers followed by a dense layer
    # with `num_actions` units that generates one q_value per available action
    # as its output.
    dense_layers = [dense_layer(num_units) for num_units in fc_layer_params]
    q_values_layer = tf.keras.layers.Dense(
        num_actions,
        activation=None,
        kernel_initializer=tf.keras.initializers.RandomUniform(minval=-0.03,
                                                               maxval=0.03),
        bias_initializer=tf.keras.initializers.Constant(-0.2))
    q_net = sequential.Sequential(dense_layers + [q_values_layer])

    agent = dqn_agent.DqnAgent(
        time_step_tensor_spec,
        action_tensor_spec,
        q_network=q_net,
        epsilon_greedy=epsilon_greedy,
        n_step_update=n_step_update,
        target_update_tau=target_update_tau,
        target_update_period=target_update_period,
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        td_errors_loss_fn=common.element_wise_squared_loss,
        gamma=gamma,
        reward_scale_factor=reward_scale_factor,
        train_step_counter=train_step)

    table_name = 'uniform_table'
    table = reverb.Table(table_name,
                         max_size=replay_capacity,
                         sampler=reverb.selectors.Uniform(),
                         remover=reverb.selectors.Fifo(),
                         rate_limiter=reverb.rate_limiters.MinSize(1))
    reverb_server = reverb.Server([table], port=reverb_port)
    reverb_replay = reverb_replay_buffer.ReverbReplayBuffer(
        agent.collect_data_spec,
        sequence_length=2,
        table_name=table_name,
        local_server=reverb_server)
    rb_observer = reverb_utils.ReverbAddTrajectoryObserver(
        reverb_replay.py_client,
        table_name,
        sequence_length=2,
        stride_length=1)

    dataset = reverb_replay.as_dataset(num_parallel_calls=3,
                                       sample_batch_size=batch_size,
                                       num_steps=2).prefetch(3)
    experience_dataset_fn = lambda: dataset

    saved_model_dir = os.path.join(root_dir, learner.POLICY_SAVED_MODEL_DIR)
    env_step_metric = py_metrics.EnvironmentSteps()

    learning_triggers = [
        triggers.PolicySavedModelTrigger(
            saved_model_dir,
            agent,
            train_step,
            interval=policy_save_interval,
            metadata_metrics={triggers.ENV_STEP_METADATA_KEY:
                              env_step_metric}),
        triggers.StepPerSecondLogTrigger(train_step, interval=100),
    ]

    dqn_learner = learner.Learner(root_dir,
                                  train_step,
                                  agent,
                                  experience_dataset_fn,
                                  triggers=learning_triggers)

    # If we haven't trained yet, make sure we collect some random samples first
    # to fill up the replay buffer with some experience.
    random_policy = random_py_policy.RandomPyPolicy(
        collect_env.time_step_spec(), collect_env.action_spec())
    initial_collect_actor = actor.Actor(collect_env,
                                        random_policy,
                                        train_step,
                                        steps_per_run=initial_collect_steps,
                                        observers=[rb_observer])
    logging.info('Doing initial collect.')
    initial_collect_actor.run()

    tf_collect_policy = agent.collect_policy
    collect_policy = py_tf_eager_policy.PyTFEagerPolicy(tf_collect_policy,
                                                        use_tf_function=True)

    collect_actor = actor.Actor(
        collect_env,
        collect_policy,
        train_step,
        steps_per_run=1,
        observers=[rb_observer, env_step_metric],
        metrics=actor.collect_metrics(10),
        summary_dir=os.path.join(root_dir, learner.TRAIN_DIR),
    )

    tf_greedy_policy = agent.policy
    greedy_policy = py_tf_eager_policy.PyTFEagerPolicy(tf_greedy_policy,
                                                       use_tf_function=True)

    eval_actor = actor.Actor(
        eval_env,
        greedy_policy,
        train_step,
        episodes_per_run=eval_episodes,
        metrics=actor.eval_metrics(eval_episodes),
        summary_dir=os.path.join(root_dir, 'eval'),
    )

    if eval_interval:
        logging.info('Evaluating.')
        eval_actor.run_and_log()

    logging.info('Training.')
    for _ in range(num_iterations):
        collect_actor.run()
        dqn_learner.run(iterations=1)

        if eval_interval and dqn_learner.train_step_numpy % eval_interval == 0:
            logging.info('Evaluating.')
            eval_actor.run_and_log()

    rb_observer.close()
    reverb_server.stop()
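
Running this example end to end only requires a log directory; every other argument has a default, and `reverb_port=None` lets the Reverb server pick a free port. A minimal invocation, with shortened iteration counts for a quick smoke test:

train_eval(
    root_dir='/tmp/dqn_cartpole',
    num_iterations=1000,
    eval_interval=500)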
Esempio n. 24
0
def create_tensor_specs(data_spec, episode_len):
    spec = tuple([data_spec for _ in range(episode_len)])
    tensor_data_spec = tensor_spec.from_spec(data_spec)
    tensor_episode_spec = tensor_spec.from_spec((spec, spec))
    return tensor_data_spec, tensor_episode_spec
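
A small usage sketch, assuming a 4-dimensional float observation spec and a 3-step episode:

import numpy as np
from tf_agents.specs import array_spec

data_spec = array_spec.ArraySpec(shape=(4,), dtype=np.float32, name='obs')
tensor_data_spec, tensor_episode_spec = create_tensor_specs(
    data_spec, episode_len=3)
# tensor_data_spec is a single TensorSpec; tensor_episode_spec is a pair of
# 3-tuples of that spec.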
Esempio n. 25
0
def train_eval(
        root_dir,
        env_name='CartPole-v0',
        num_iterations=100000,
        fc_layer_params=(100, ),
        # Params for collect
        initial_collect_steps=1000,
        collect_steps_per_iteration=1,
        epsilon_greedy=0.1,
        replay_buffer_capacity=100000,
        # Params for target update
        target_update_tau=0.05,
        target_update_period=5,
        # Params for train
        train_steps_per_iteration=1,
        batch_size=64,
        learning_rate=1e-3,
        n_step_update=1,
        gamma=0.99,
        reward_scale_factor=1.0,
        gradient_clipping=None,
        # Params for eval
        num_eval_episodes=10,
        eval_interval=1000,
        # Params for checkpoints, summaries and logging
        train_checkpoint_interval=10000,
        policy_checkpoint_interval=5000,
        log_interval=1000,
        summaries_flush_secs=10,
        debug_summaries=False,
        summarize_grads_and_vars=False,
        eval_metrics_callback=None):
    """A simple train and eval for DQN."""
    root_dir = os.path.expanduser(root_dir)
    train_dir = os.path.join(root_dir, 'train')
    eval_dir = os.path.join(root_dir, 'eval')

    train_summary_writer = tf.compat.v2.summary.create_file_writer(
        train_dir, flush_millis=summaries_flush_secs * 1000)
    train_summary_writer.set_as_default()

    eval_summary_writer = tf.compat.v2.summary.create_file_writer(
        eval_dir, flush_millis=summaries_flush_secs * 1000)
    eval_metrics = [
        py_metrics.AverageReturnMetric(buffer_size=num_eval_episodes),
        py_metrics.AverageEpisodeLengthMetric(buffer_size=num_eval_episodes),
    ]

    # Note this is a python environment.
    env = batched_py_environment.BatchedPyEnvironment(
        [suite_gym.load(env_name)])
    eval_py_env = suite_gym.load(env_name)

    # Convert specs to BoundedTensorSpec.
    action_spec = tensor_spec.from_spec(env.action_spec())
    observation_spec = tensor_spec.from_spec(env.observation_spec())
    time_step_spec = ts.time_step_spec(observation_spec)

    q_net = q_network.QNetwork(observation_spec,
                               action_spec,
                               fc_layer_params=fc_layer_params)

    # The agent must be built in graph mode.
    global_step = tf.compat.v1.train.get_or_create_global_step()
    agent = dqn_agent.DqnAgent(
        time_step_spec,
        action_spec,
        q_network=q_net,
        epsilon_greedy=epsilon_greedy,
        n_step_update=n_step_update,
        target_update_tau=target_update_tau,
        target_update_period=target_update_period,
        optimizer=tf.compat.v1.train.AdamOptimizer(
            learning_rate=learning_rate),
        td_errors_loss_fn=dqn_agent.element_wise_squared_loss,
        gamma=gamma,
        reward_scale_factor=reward_scale_factor,
        gradient_clipping=gradient_clipping,
        debug_summaries=debug_summaries,
        summarize_grads_and_vars=summarize_grads_and_vars,
        train_step_counter=global_step)

    tf_collect_policy = agent.collect_policy
    collect_policy = py_tf_policy.PyTFPolicy(tf_collect_policy)
    greedy_policy = py_tf_policy.PyTFPolicy(agent.policy)
    random_policy = random_py_policy.RandomPyPolicy(env.time_step_spec(),
                                                    env.action_spec())

    # Python replay buffer.
    replay_buffer = py_uniform_replay_buffer.PyUniformReplayBuffer(
        capacity=replay_buffer_capacity,
        data_spec=tensor_spec.to_nest_array_spec(agent.collect_data_spec))

    time_step = env.reset()

    # Initialize the replay buffer with some transitions. We use the random
    # policy to initialize the replay buffer to make sure we get a good
    # distribution of actions.
    for _ in range(initial_collect_steps):
        time_step = collect_step(env, time_step, random_policy, replay_buffer)

    # TODO(b/112041045) Use global_step as counter.
    train_checkpointer = common.Checkpointer(ckpt_dir=train_dir,
                                             agent=agent,
                                             global_step=global_step)

    policy_checkpointer = common.Checkpointer(
        ckpt_dir=os.path.join(train_dir, 'policy'),
        policy=agent.policy,
        global_step=global_step)

    ds = replay_buffer.as_dataset(sample_batch_size=batch_size,
                                  num_steps=n_step_update + 1)
    ds = ds.prefetch(4)
    itr = tf.compat.v1.data.make_initializable_iterator(ds)

    experience = itr.get_next()

    train_op = common.function(agent.train)(experience)

    with eval_summary_writer.as_default(), \
         tf.compat.v2.summary.record_if(True):
        for eval_metric in eval_metrics:
            eval_metric.tf_summaries(train_step=global_step)

    with tf.compat.v1.Session() as session:
        train_checkpointer.initialize_or_restore(session)
        common.initialize_uninitialized_variables(session)
        session.run(itr.initializer)
        # Copy the Q-network weights to the target Q-network.
        session.run(agent.initialize())
        train = session.make_callable(train_op)
        global_step_call = session.make_callable(global_step)
        session.run(train_summary_writer.init())
        session.run(eval_summary_writer.init())

        # Compute initial evaluation metrics.
        global_step_val = global_step_call()
        metric_utils.compute_summaries(
            eval_metrics,
            eval_py_env,
            greedy_policy,
            num_episodes=num_eval_episodes,
            global_step=global_step_val,
            log=True,
            callback=eval_metrics_callback,
        )

        timed_at_step = global_step_val
        collect_time = 0
        train_time = 0
        steps_per_second_ph = tf.compat.v1.placeholder(tf.float32,
                                                       shape=(),
                                                       name='steps_per_sec_ph')
        steps_per_second_summary = tf.compat.v2.summary.scalar(
            name='global_steps_per_sec',
            data=steps_per_second_ph,
            step=global_step)

        for _ in range(num_iterations):
            start_time = time.time()
            for _ in range(collect_steps_per_iteration):
                time_step = collect_step(env, time_step, collect_policy,
                                         replay_buffer)
            collect_time += time.time() - start_time
            start_time = time.time()
            for _ in range(train_steps_per_iteration):
                loss = train()
            train_time += time.time() - start_time
            global_step_val = global_step_call()
            if global_step_val % log_interval == 0:
                logging.info('step = %d, loss = %f', global_step_val,
                             loss.loss)
                steps_per_sec = ((global_step_val - timed_at_step) /
                                 (collect_time + train_time))
                session.run(steps_per_second_summary,
                            feed_dict={steps_per_second_ph: steps_per_sec})
                logging.info('%.3f steps/sec', steps_per_sec)
                logging.info(
                    '%s', 'collect_time = {}, train_time = {}'.format(
                        collect_time, train_time))
                timed_at_step = global_step_val
                collect_time = 0
                train_time = 0

            if global_step_val % train_checkpoint_interval == 0:
                train_checkpointer.save(global_step=global_step_val)

            if global_step_val % policy_checkpoint_interval == 0:
                policy_checkpointer.save(global_step=global_step_val)

            if global_step_val % eval_interval == 0:
                metric_utils.compute_summaries(
                    eval_metrics,
                    eval_py_env,
                    greedy_policy,
                    num_episodes=num_eval_episodes,
                    global_step=global_step_val,
                    log=True,
                    callback=eval_metrics_callback,
                )
                # Reset timing to avoid counting eval time.
                timed_at_step = global_step_val
                start_time = time.time()
Esempio n. 26
0
if __name__ == '__main__':
    # COMMAND-LINE ARGUMENTS
    parser = argparse.ArgumentParser('Read-From-Bigtable Script')
    parser.add_argument('--gcp-project-id', type=str, default='for-robolab-cbai')
    parser.add_argument('--cbt-instance-id', type=str, default='rab-rl-bigtable')
    parser.add_argument('--cbt-table-name', type=str, default='cartpole-experience-replay')
    args = parser.parse_args()

    # INITIALIZE RL AGENT
    observation_spec = tensor_spec.BoundedTensorSpec(             # Make observation spec manually
            shape=(len(min_array_obs),), dtype=np.float32, minimum=min_array_obs, maximum=max_array_obs)

    action_spec = tensor_spec.BoundedTensorSpec(                  # Make action spec manually
            shape=(), dtype=np.int32, minimum=0, maximum=max_nb_actions - 1)

    time_step_spec = ts.time_step_spec(tensor_spec.from_spec(observation_spec))

    q_net = q_network.QNetwork(
        observation_spec,
        action_spec,
        fc_layer_params=fc_layer_params)

    optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate)
    train_step_counter = tf.compat.v2.Variable(0, dtype='int64')
    
    tf_agent = dqn_agent.DqnAgent(
        time_step_spec,
        action_spec,
        q_network=q_net,
        optimizer=optimizer,
        td_errors_loss_fn=tf.compat.v2.keras.losses.MSE,
        train_step_counter=train_step_counter)  # assumed closing; the original snippet is truncated here
Esempio n. 27
0
def ai_game():
    pygame.init()
    display = pygame.display.set_mode((HEIGHT, WIDTH))
    pygame.display.set_caption("Snake")
    font = pygame.font.SysFont("Times New Roman", 24)
    # snake_agent = SnakeAgent()
    # game(display, snake_agent)
    time.sleep(5)

    train_env = SnakeGameEnv(display, font)
    eval_env = SnakeGameEnv(display, font)
    # env = CardGameEnv()
    # utils.validate_py_environment(env)
    fc_layer_params = (100, 50)
    action_tensor_spec = tensor_spec.from_spec(train_env.action_spec())
    num_actions = action_tensor_spec.maximum - action_tensor_spec.minimum + 1
    train_env = tf_py_environment.TFPyEnvironment(train_env)
    eval_env = tf_py_environment.TFPyEnvironment(eval_env)

    # QNetwork consists of a sequence of Dense layers followed by a dense layer
    # with `num_actions` units that generates one q_value per available action
    # as its output.
    dense_layers = [dense_layer(num_units) for num_units in fc_layer_params]
    q_values_layer = tf.keras.layers.Dense(
        num_actions,
        activation=None,
        kernel_initializer=tf.keras.initializers.RandomUniform(
            minval=-0.03, maxval=0.03),
        bias_initializer=tf.keras.initializers.Constant(-0.2))
    q_net = sequential.Sequential(dense_layers + [q_values_layer])
    optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)
    train_step_counter = tf.Variable(0)
    agent = dqn_agent.DqnAgent(
        train_env.time_step_spec(),
        train_env.action_spec(),
        q_network=q_net,
        optimizer=optimizer,
        td_errors_loss_fn=common.element_wise_squared_loss,
        train_step_counter=train_step_counter)
    agent.initialize()
    print('Initialized Agent')
    random_policy = random_tf_policy.RandomTFPolicy(train_env.time_step_spec(),
                                                    train_env.action_spec())

    print('Reset time spec')
    time_step = train_env.reset()
    random_policy.action(time_step)
    print('Successfully instantiated random policy')
    replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
        data_spec=agent.collect_data_spec,
        batch_size=train_env.batch_size,
        max_length=REPLAY_BUFFER_MAX_LEN)

    print('Created replay buffer, collecting data ... ')
    collect_data(train_env, random_policy, replay_buffer, INITIAL_COLLECT_STEPS)
    dataset = replay_buffer.as_dataset(
        num_parallel_calls=3,
        sample_batch_size=BATCH_SIZE,
        num_steps=2).prefetch(3)
    print('Collecting data complete')
    iterator = iter(dataset)
    # Reset the train step
    agent.train_step_counter.assign(0)
    avg_return = compute_avg_return(train_env, agent.policy, NUM_EVAL_EPISODES)
    returns = [avg_return]
    print('Beginning to train...')
    for i in range(NUM_ITERATIONS):
        collect_data(train_env, agent.collect_policy, replay_buffer, COLLECT_STEPS_PER_ITERATION)

        experience, unused_info = next(iterator)
        train_loss = agent.train(experience).loss
        step = agent.train_step_counter.numpy()
        #print(train_env.time_step_spec())

        print(f"Training agent through iteration {(i / NUM_ITERATIONS) * 100:.2f}%...")
        if step % LOG_INTERVAL == 0:
            pass
            #print('step = {0}: loss = {1}'.format(step, train_loss))

        if step % EVAL_INTERVAL == 0:
            #avg_return = compute_avg_return(train_env, agent.policy, NUM_EVAL_EPISODES)
            #print('step = {0}: Average Return = {1}'.format(step, avg_return))
            #returns.append(avg_return)
            pass

    """
Esempio n. 28
0
 def to_event(s):
   return (s.event_spec if isinstance(s, DistributionSpecV2)
           else tensor_spec.from_spec(s))
Esempio n. 29
0
    def __init__(self, environment, check_dims=False, isolation=False):
        """Initializes a new `TFPyEnvironment`.

    Args:
      environment: Environment to interact with, implementing
        `py_environment.PyEnvironment`.  Or a `callable` that returns
        an environment of this form.  If a `callable` is provided and
        `isolation` is enabled, the callable is executed in the dedicated
        thread.
      check_dims: Whether to check batch dimensions of actions in `step`.
      isolation: If this value is `False` (default), interactions with
        the environment will occur within whatever thread the methods of the
        `TFPyEnvironment` are run from.  For example, in TF graph mode, methods
        like `step` are called from multiple threads created by the TensorFlow
        engine; calls to step the environment are guaranteed to be sequential,
        but not from the same thread.  This creates problems for environments
        that are not thread-safe.

        Using isolation ensures not only that a dedicated thread (or
        thread-pool) is used to interact with the environment, but also that
        interaction with the environment happens in a serialized manner.

        If `isolation == True`, a dedicated thread is created for
        interactions with the environment.

        If `isolation` is an instance of `multiprocessing.pool.Pool` (this
        includes instances of `multiprocessing.pool.ThreadPool`, a.k.a.
        `multiprocessing.dummy.Pool`, and `multiprocessing.Pool`), then this
        pool is used to interact with the environment.

        **NOTE** If using `isolation` with a `BatchedPyEnvironment`, ensure
        you create the `BatchedPyEnvironment` with `multithreading=False`, since
        otherwise the multithreading in that wrapper reverses the effects of
        this one.

    Raises:
      TypeError: If `environment` is not an instance of
        `py_environment.PyEnvironment` or subclasses, or is a callable that does
        not return an instance of `PyEnvironment`.
      TypeError: If `isolation` is not `True`, `False`, or an instance of
        `multiprocessing.pool.Pool`.
    """
        if not isolation:
            self._pool = None
        elif isinstance(isolation, pool.Pool):
            self._pool = isolation
        elif isolation:
            self._pool = pool.ThreadPool(1)
        else:
            raise TypeError(
                'isolation should be True, False, or an instance of '
                'a multiprocessing Pool or ThreadPool.  Saw: {}'.format(
                    isolation))

        if callable(environment):
            environment = self._execute(environment)
        if not isinstance(environment, py_environment.PyEnvironment):
            raise TypeError(
                'Environment should implement py_environment.PyEnvironment')

        if not environment.batched:
            # If executing in an isolated thread, do not enable multiprocessing for
            # this environment.
            environment = batched_py_environment.BatchedPyEnvironment(
                [environment], multithreading=not self._pool)
        self._env = environment
        self._check_dims = check_dims

        if isolation and getattr(self._env, '_parallel_execution', None):
            logging.warning(
                'Wrapped environment is executing in parallel.  '
                'Perhaps it is a BatchedPyEnvironment with multithreading=True, '
                'or it is a ParallelPyEnvironment.  This conflicts with the '
                '`isolation` arg passed to TFPyEnvironment: interactions with the '
                'wrapped environment are no longer guaranteed to happen in a common '
                'thread.  Environment: %s', (self._env, ))

        action_spec = tensor_spec.from_spec(self._env.action_spec())
        time_step_spec = tensor_spec.from_spec(self._env.time_step_spec())
        batch_size = self._env.batch_size if self._env.batch_size else 1

        super(TFPyEnvironment, self).__init__(time_step_spec, action_spec,
                                              batch_size)

        # Gather all the dtypes and shapes of the elements in time_step.
        self._time_step_dtypes = [
            s.dtype for s in tf.nest.flatten(self.time_step_spec())
        ]

        self._time_step = None
        self._lock = threading.Lock()
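
Typical construction, assuming a Gym-based python environment that is not thread-safe and therefore benefits from the `isolation` option described above:

from tf_agents.environments import suite_gym, tf_py_environment

# isolation=True serializes all interaction with the wrapped environment on a
# dedicated thread, as explained in the docstring.
py_env = suite_gym.load('CartPole-v0')
tf_env = tf_py_environment.TFPyEnvironment(py_env, isolation=True)
time_step = tf_env.reset()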
Esempio n. 30
0
 def observation_spec(self):
     return tensor_spec.from_spec(
         array_spec.BoundedArraySpec((1, ),
                                     np.int32,
                                     minimum=[1],
                                     maximum=[2]))