Code Example #1
File: dqn_agent.py  Project: wuzh07/agents
  def _setup_policy(self, time_step_spec, action_spec,
                    boltzmann_temperature, emit_log_probability):

    policy = q_policy.QPolicy(
        time_step_spec,
        action_spec,
        q_network=self._q_network,
        emit_log_probability=emit_log_probability,
        observation_and_action_constraint_splitter=(
            self._observation_and_action_constraint_splitter))

    if boltzmann_temperature is not None:
      collect_policy = boltzmann_policy.BoltzmannPolicy(
          policy, temperature=self._boltzmann_temperature)
    else:
      collect_policy = epsilon_greedy_policy.EpsilonGreedyPolicy(
          policy, epsilon=self._epsilon_greedy)
    policy = greedy_policy.GreedyPolicy(policy)

    # Create self._target_greedy_policy in order to compute target Q-values.
    target_policy = q_policy.QPolicy(
        time_step_spec,
        action_spec,
        q_network=self._target_q_network,
        observation_and_action_constraint_splitter=(
            self._observation_and_action_constraint_splitter))
    self._target_greedy_policy = greedy_policy.GreedyPolicy(target_policy)

    return policy, collect_policy
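
The pattern above pairs an EpsilonGreedyPolicy (for data collection) with a GreedyPolicy (for evaluation) around the same QPolicy. A minimal, self-contained sketch of that wiring follows; the spec shapes, network, and epsilon value are illustrative assumptions, not taken from the project above.

import tensorflow as tf
from tf_agents.networks import q_network
from tf_agents.policies import epsilon_greedy_policy, greedy_policy, q_policy
from tf_agents.specs import tensor_spec
from tf_agents.trajectories import time_step as ts

# Illustrative specs (assumed): a 4-dim observation and 3 discrete actions.
observation_spec = tensor_spec.TensorSpec([4], tf.float32)
action_spec = tensor_spec.BoundedTensorSpec((), tf.int32, minimum=0, maximum=2)
time_step_spec = ts.time_step_spec(observation_spec)

q_net = q_network.QNetwork(observation_spec, action_spec)
base_policy = q_policy.QPolicy(time_step_spec, action_spec, q_network=q_net)
# Exploration policy for data collection; epsilon=0.1 is an assumed value.
collect_policy = epsilon_greedy_policy.EpsilonGreedyPolicy(base_policy, epsilon=0.1)
# Deterministic policy for evaluation.
policy = greedy_policy.GreedyPolicy(base_policy)

# One batched step through the collect policy (eager TF2).
time_step = ts.restart(tf.zeros([1, 4]), batch_size=1)
action_step = collect_policy.action(time_step)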
Code Example #2
 def _get_policies(self, time_step_spec, action_spec, cloning_network):
   policy = q_policy.QPolicy(
       time_step_spec, action_spec, q_network=self._cloning_network)
   collect_policy = epsilon_greedy_policy.EpsilonGreedyPolicy(
       policy, epsilon=self._epsilon_greedy)
   policy = greedy_policy.GreedyPolicy(policy)
   return policy, collect_policy
Code Example #3
    def _setup_policy(self, time_step_spec, action_spec, boltzmann_temperature,
                      emit_log_probability):
        policy = categorical_q_policy.CategoricalQPolicy(
            time_step_spec,
            action_spec,
            self._q_network,
            self._min_q_value,
            self._max_q_value,
            observation_and_action_constraint_splitter=(
                self._observation_and_action_constraint_splitter))

        if boltzmann_temperature is not None:
            collect_policy = boltzmann_policy.BoltzmannPolicy(
                policy, temperature=boltzmann_temperature)
        else:
            collect_policy = epsilon_greedy_policy.EpsilonGreedyPolicy(
                policy, epsilon=self._epsilon_greedy)
        policy = greedy_policy.GreedyPolicy(policy)

        target_policy = categorical_q_policy.CategoricalQPolicy(
            time_step_spec,
            action_spec,
            self._target_q_network,
            self._min_q_value,
            self._max_q_value,
            observation_and_action_constraint_splitter=(
                self._observation_and_action_constraint_splitter))
        self._target_greedy_policy = greedy_policy.GreedyPolicy(target_policy)

        return policy, collect_policy
Code Example #4
    def testTensorEpsilon(self):
        epsilon_ph = tf.placeholder(tf.float32, shape=())
        policy = epsilon_greedy_policy.EpsilonGreedyPolicy(self._policy,
                                                           epsilon=epsilon_ph)
        self.assertEqual(policy.time_step_spec(), self._time_step_spec)
        self.assertEqual(policy.action_spec(), self._action_spec)

        policy_state = policy.get_initial_state(batch_size=2)
        action_step = policy.action(self._time_step, policy_state, seed=54)
        nest.assert_same_structure(self._action_spec, action_step.action)

        self.evaluate(tf.global_variables_initializer())
        with self.cached_session() as sess:
            for epsilon in [0.0, 0.2, 0.7, 1.0]:
                # Collect num_steps (here 1000) steps with the current value of epsilon.
                actions = []
                num_steps = 1000
                for _ in range(num_steps):
                    action_ = sess.run(action_step.action,
                                       {epsilon_ph: epsilon})[0]
                    self.assertIn(action_, [0, 1, 2])
                    actions.append(action_)

                # Verify that action distribution changes as we vary epsilon.
                self.checkActionDistribution(actions, epsilon, num_steps)
Code Example #5
  def testTensorEpsilon(self, epsilon):
    policy = epsilon_greedy_policy.EpsilonGreedyPolicy(
        self._policy, epsilon=epsilon)
    self.assertEqual(policy.time_step_spec, self._time_step_spec)
    self.assertEqual(policy.action_spec, self._action_spec)

    policy_state = policy.get_initial_state(batch_size=2)
    time_step = tf.nest.map_structure(tf.convert_to_tensor, self._time_step)

    @common.function
    def action_step_fn(time_step=time_step):
      return policy.action(time_step, policy_state, seed=54)

    tf.nest.assert_same_structure(
        self._action_spec,
        self.evaluate(action_step_fn(time_step)).action)

    if tf.executing_eagerly():
      action_step = action_step_fn
    else:
      action_step = action_step_fn()

    actions = []

    num_steps = 1000
    for _ in range(num_steps):
      action_ = self.evaluate(action_step).action[0]
      self.assertIn(action_, [0, 1, 2])
      actions.append(action_)

    # Verify that action distribution changes as we vary epsilon.
    self.checkActionDistribution(actions, epsilon, num_steps)
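
The checkActionDistribution helper referenced by these tests is not included in the snippets. A rough sketch of what such a check could assert is shown below: with K discrete actions, epsilon-greedy picks the greedy action with probability (1 - epsilon) + epsilon / K, so its observed frequency should land near that value. The helper name, tolerance, and defaults here are assumptions.

import collections

def check_action_distribution(actions, epsilon, num_steps,
                              greedy_action=0, num_actions=3, tol=0.05):
  # Expected frequency of the greedy action under epsilon-greedy: chosen
  # greedily with prob (1 - epsilon), or uniformly at random with prob epsilon.
  expected = (1.0 - epsilon) + epsilon / num_actions
  observed = collections.Counter(actions)[greedy_action] / float(num_steps)
  assert abs(observed - expected) < tol, (observed, expected)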
Code Example #6
    def __init__(
            self,
            time_step_spec,
            action_spec,
            reward_network,
            optimizer,
            epsilon,
            # Params for training.
            error_loss_fn=tf.compat.v1.losses.mean_squared_error,
            gradient_clipping=None,
            # Params for debugging.
            debug_summaries=False,
            summarize_grads_and_vars=False,
            train_step_counter=None,
            name=None):
        """Creates a Neural Epsilon Greedy Agent.

    Args:
      time_step_spec: A `TimeStep` spec of the expected time_steps.
      action_spec: A nest of `BoundedTensorSpec` representing the actions.
      reward_network: A `tf_agents.network.Network` to be used by the agent. The
        network will be called with call(observation, step_type) and it is
        expected to provide a reward prediction for all actions.
      optimizer: The optimizer to use for training.
      epsilon: A float representing the probability of choosing a random action
        instead of the greedy action.
      error_loss_fn: A function for computing the error loss, taking parameters
        labels, predictions, and weights (any function from tf.losses would
        work). The default is `tf.losses.mean_squared_error`.
      gradient_clipping: A float representing the norm length to clip gradients
        (or None for no clipping.)
      debug_summaries: A Python bool, default False. When True, debug summaries
        are gathered.
      summarize_grads_and_vars: A Python bool, default False. When True,
        gradients and network variable summaries are written during training.
      train_step_counter: An optional `tf.Variable` to increment every time the
        train op is run.  Defaults to the `global_step`.
      name: Python str name of this agent. All variables in this module will
        fall under that name. Defaults to the class name.

    Raises:
      ValueError: If the action spec contains more than one action or it is
      not a bounded scalar int32 spec with minimum 0.
    """
        super(NeuralEpsilonGreedyAgent,
              self).__init__(time_step_spec=time_step_spec,
                             action_spec=action_spec,
                             reward_network=reward_network,
                             optimizer=optimizer,
                             observation_and_action_constraint_splitter=None,
                             error_loss_fn=error_loss_fn,
                             gradient_clipping=gradient_clipping,
                             debug_summaries=debug_summaries,
                             summarize_grads_and_vars=summarize_grads_and_vars,
                             train_step_counter=train_step_counter,
                             name=name)
        self._policy = epsilon_greedy_policy.EpsilonGreedyPolicy(
            self._policy, epsilon=epsilon)
        self._collect_policy = self._policy
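
A hedged construction sketch based only on the signature shown above; the specs, the choice of a plain QNetwork as the reward network (one output per arm), and the hyperparameter values are assumptions.

import tensorflow as tf
from tf_agents.bandits.agents import neural_epsilon_greedy_agent
from tf_agents.networks import q_network
from tf_agents.specs import tensor_spec
from tf_agents.trajectories import time_step as ts

observation_spec = tensor_spec.TensorSpec([8], tf.float32)
action_spec = tensor_spec.BoundedTensorSpec((), tf.int32, minimum=0, maximum=3)

# Assumption: a QNetwork serves as the reward network since it emits one value per action.
reward_net = q_network.QNetwork(observation_spec, action_spec)

agent = neural_epsilon_greedy_agent.NeuralEpsilonGreedyAgent(
    ts.time_step_spec(observation_spec),
    action_spec,
    reward_network=reward_net,
    optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=1e-3),
    epsilon=0.1)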
Code Example #7
def get_policy(env, q_net, epsilon_callback):

    q_plcy = q_policy.QPolicy(env.time_step_spec(),
                              env.action_spec(),
                              q_network=q_net)
    # greedy_plcy = greedy_policy.GreedyPolicy(q_plcy)
    ep_greedy_plcy = epsilon_greedy_policy.EpsilonGreedyPolicy(
        q_plcy, epsilon_callback)
    plcy = ep_greedy_plcy
    return plcy
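
As examples #7 and #19 suggest, epsilon need not be a fixed Python float: EpsilonGreedyPolicy also appears to accept a variable or a zero-argument callable that is re-evaluated on every action. A simple linearly decaying callable, with assumed schedule values, might look like this:

def make_decaying_epsilon(start=1.0, end=0.05, decay_steps=10000):
    state = {'step': 0}
    def epsilon_callback():
        # Linear decay from `start` to `end` over `decay_steps` calls.
        frac = min(state['step'] / float(decay_steps), 1.0)
        state['step'] += 1
        return start + frac * (end - start)
    return epsilon_callback

# Usage with the get_policy function above (epsilon is passed positionally there):
# plcy = get_policy(env, q_net, make_decaying_epsilon())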
Code Example #8
    def testInfoFromGreedy(self):
        PolicyInfo = collections.namedtuple(  # pylint: disable=invalid-name
            'PolicyInfo',
            ('log_probability', 'predicted_rewards', 'bandit_policy_type'))
        # Set default empty tuple for all fields.
        PolicyInfo.__new__.__defaults__ = ((), ) * len(PolicyInfo._fields)

        info_spec = PolicyInfo(
            bandit_policy_type=self._bandit_policy_type_spec,
            log_probability=tensor_spec.BoundedTensorSpec(
                shape=(),
                dtype=tf.float32,
                maximum=0,
                minimum=-float('inf'),
                name='log_probability'))

        policy_with_info_spec = fixed_policy.FixedPolicy(
            np.asarray(self._greedy_action, dtype=np.int32),
            self._time_step_spec,
            self._action_spec,
            policy_info=PolicyInfo(
                bandit_policy_type=self._bandit_policy_type),
            info_spec=info_spec)

        epsilon = 0.2
        policy = epsilon_greedy_policy.EpsilonGreedyPolicy(
            policy_with_info_spec,
            epsilon=epsilon,
            info_fields_to_inherit_from_greedy=['log_probability'])
        self.assertEqual(policy.time_step_spec, self._time_step_spec)
        self.assertEqual(policy.action_spec, self._action_spec)

        time_step = tf.nest.map_structure(tf.convert_to_tensor,
                                          self._time_step)

        @common.function
        def action_step_fn(time_step=time_step):
            return policy.action(time_step, policy_state=(), seed=54)

        tf.nest.assert_same_structure(
            self._action_spec,
            self.evaluate(action_step_fn(time_step)).action)

        if tf.executing_eagerly():
            action_step = action_step_fn
        else:
            action_step = action_step_fn()

        step = self.evaluate(action_step)
        tf.nest.assert_same_structure(info_spec, step.info)

        self.checkBanditPolicyTypeShape(step.info.bandit_policy_type,
                                        batch_size=2)
        self.assertAllEqual(step.info.log_probability,
                            tf.zeros_like(step.info.log_probability))
Code Example #9
 def _get_policies(self, time_step_spec, action_spec, cloning_network):
     policy = q_policy.QPolicy(
         time_step_spec,
         action_spec,
         q_network=self._cloning_network,
         # Unlike DQN, we support continuous action spaces - in which case
         # the policy just emits the network output.  In that case, we
         # don't care if the action_spec is a scalar integer value.
         validate_action_spec_and_network=False,
     )
     collect_policy = epsilon_greedy_policy.EpsilonGreedyPolicy(
         policy, epsilon=self._epsilon_greedy)
     policy = greedy_policy.GreedyPolicy(policy)
     return policy, collect_policy
Code Example #10
 def _setup_as_discrete(self, time_step_spec, action_spec, loss_fn,
                        epsilon_greedy):
     self._loss_fn = loss_fn or self._discrete_loss
     # Unlike DQN, we support continuous action spaces - in which case
     # the policy just emits the network output.  In that case, we
     # don't care if the action_spec is a scalar integer value.
     policy = q_policy.QPolicy(
         time_step_spec,
         action_spec,
         q_network=self._cloning_network,
         validate_action_spec_and_network=False,
     )
     collect_policy = epsilon_greedy_policy.EpsilonGreedyPolicy(
         policy, epsilon=epsilon_greedy)
     policy = greedy_policy.GreedyPolicy(policy)
     return policy, collect_policy
Code Example #11
    def _setup_policy(self, time_step_spec, action_spec, boltzmann_temperature,
                      emit_log_probability):

        policy = q_policy.QPolicy(time_step_spec,
                                  action_spec,
                                  q_network=self._q_network,
                                  emit_log_probability=emit_log_probability)

        if boltzmann_temperature is not None:
            collect_policy = boltzmann_policy.BoltzmannPolicy(
                policy, temperature=self._boltzmann_temperature)
        else:
            collect_policy = epsilon_greedy_policy.EpsilonGreedyPolicy(
                policy, epsilon=self._epsilon_greedy)
        policy = greedy_policy.GreedyPolicy(policy)

        return policy, collect_policy
Code Example #12
    def _setup_policy(self, time_step_spec, action_spec, emit_log_probability):
        policy = qtopt_cem_policy.CEMPolicy(
            time_step_spec,
            action_spec,
            q_network=self._target_q_network,
            sampler=self._sampler,
            init_mean=self._init_mean_cem,
            init_var=self._init_var_cem,
            info_spec=self._info_spec,
            num_samples=self._num_samples_cem,
            num_elites=self._num_elites_cem,
            num_iterations=self._num_iter_cem,
            emit_log_probability=emit_log_probability,
            training=False)

        collect_policy = epsilon_greedy_policy.EpsilonGreedyPolicy(
            policy, epsilon=self._epsilon_greedy)

        return policy, collect_policy
Code Example #13
  def testFixedEpsilon(self, epsilon):
    policy = epsilon_greedy_policy.EpsilonGreedyPolicy(self._policy,
                                                       epsilon=epsilon)
    self.assertEqual(policy.time_step_spec(), self._time_step_spec)
    self.assertEqual(policy.action_spec(), self._action_spec)

    policy_state = policy.get_initial_state(batch_size=2)
    action_step = policy.action(self._time_step, policy_state, seed=54)
    nest.assert_same_structure(self._action_spec, action_step.action)

    self.evaluate(tf.global_variables_initializer())
    # Collect 100 steps with the current value of epsilon.
    actions = []
    num_steps = 100
    for _ in range(num_steps):
      action_ = self.evaluate(action_step.action)[0]
      self.assertIn(action_, [0, 1, 2])
      actions.append(action_)

    self.checkActionDistribution(actions, epsilon, num_steps)
Code Example #14
  def _setup_as_discrete(self, time_step_spec, action_spec, loss_fn,
                         epsilon_greedy):
    self._bc_loss_fn = loss_fn or self._discrete_loss

    if any(isinstance(d, distribution_utils.DistributionSpecV2) for
           d in tf.nest.flatten([self._network_output_spec])):
      # If the output of the cloning network contains a distribution.
      base_policy = actor_policy.ActorPolicy(time_step_spec, action_spec,
                                             self._cloning_network)
    else:
      # If the output of the cloning network is logits.
      base_policy = q_policy.QPolicy(
          time_step_spec,
          action_spec,
          q_network=self._cloning_network,
          validate_action_spec_and_network=False)
    policy = greedy_policy.GreedyPolicy(base_policy)
    collect_policy = epsilon_greedy_policy.EpsilonGreedyPolicy(
        base_policy, epsilon=epsilon_greedy)
    return policy, collect_policy
Code Example #15
    def testInfoSpec(self):
        PolicyInfo = collections.namedtuple(  # pylint: disable=invalid-name
            'PolicyInfo', ('log_probability', 'predicted_rewards'))
        # Set default empty tuple for all fields.
        PolicyInfo.__new__.__defaults__ = ((), ) * len(PolicyInfo._fields)

        info_spec = PolicyInfo()
        policy_with_info_spec = fixed_policy.FixedPolicy(
            np.asarray([self._greedy_action], dtype=np.int32),
            self._time_step_spec,
            self._action_spec,
            policy_info=PolicyInfo(),
            info_spec=info_spec)

        epsilon = 0.2
        policy = epsilon_greedy_policy.EpsilonGreedyPolicy(
            policy_with_info_spec, epsilon=epsilon)
        self.assertEqual(policy.time_step_spec, self._time_step_spec)
        self.assertEqual(policy.action_spec, self._action_spec)

        time_step = tf.nest.map_structure(tf.convert_to_tensor,
                                          self._time_step)

        @common.function
        def action_step_fn(time_step=time_step):
            return policy.action(time_step, policy_state=(), seed=54)

        tf.nest.assert_same_structure(
            self._action_spec,
            self.evaluate(action_step_fn(time_step)).action)

        if tf.executing_eagerly():
            action_step = action_step_fn
        else:
            action_step = action_step_fn()

        step = self.evaluate(action_step)
        tf.nest.assert_same_structure(info_spec, step.info)
Code Example #16
    def __init__(
            self,
            time_step_spec,
            action_spec,
            categorical_q_network,
            optimizer,
            min_q_value=-10.0,
            max_q_value=10.0,
            epsilon_greedy=0.1,
            n_step_update=1,
            boltzmann_temperature=None,
            # Params for target network updates
            target_update_tau=1.0,
            target_update_period=1,
            # Params for training.
            td_errors_loss_fn=None,
            gamma=1.0,
            reward_scale_factor=1.0,
            gradient_clipping=None,
            # Params for debugging
            debug_summaries=False,
            summarize_grads_and_vars=False,
            train_step_counter=None,
            name=None):
        """Creates a Categorical DQN Agent.

    Args:
      time_step_spec: A `TimeStep` spec of the expected time_steps.
      action_spec: A `BoundedTensorSpec` representing the actions.
      categorical_q_network: A categorical_q_network.CategoricalQNetwork that
        returns the q_distribution for each action.
      optimizer: The optimizer to use for training.
      min_q_value: A float specifying the minimum Q-value, used for setting up
        the support.
      max_q_value: A float specifying the maximum Q-value, used for setting up
        the support.
      epsilon_greedy: probability of choosing a random action in the default
        epsilon-greedy collect policy (used only if a wrapper is not provided to
        the collect_policy method).
      n_step_update: The number of steps to consider when computing TD error and
        TD loss. Defaults to single-step updates. Note that this requires the
        user to call train on Trajectory objects with a time dimension of
        `n_step_update + 1`. However, note that we do not yet support
        `n_step_update > 1` in the case of RNNs (i.e., non-empty
        `q_network.state_spec`).
      boltzmann_temperature: Temperature value to use for Boltzmann sampling of
        the actions during data collection. The closer to 0.0, the higher the
        probability of choosing the best action.
      target_update_tau: Factor for soft update of the target networks.
      target_update_period: Period for soft update of the target networks.
      td_errors_loss_fn: A function for computing the TD errors loss. If None, a
        default value of element_wise_huber_loss is used. This function takes as
        input the target and the estimated Q values and returns the loss for
        each element of the batch.
      gamma: A discount factor for future rewards.
      reward_scale_factor: Multiplicative scale for the reward.
      gradient_clipping: Norm length to clip gradients.
      debug_summaries: A bool to gather debug summaries.
      summarize_grads_and_vars: If True, gradient and network variable summaries
        will be written during training.
      train_step_counter: An optional counter to increment every time the train
        op is run.  Defaults to the global_step.
      name: The name of this agent. All variables in this module will fall
        under that name. Defaults to the class name.

    Raises:
      TypeError: If the action spec contains more than one action.
    """
        num_atoms = getattr(categorical_q_network, 'num_atoms', None)
        if num_atoms is None:
            raise TypeError(
                'Expected categorical_q_network to have property '
                '`num_atoms`, but it doesn\'t (note: you likely want to '
                'use a CategoricalQNetwork). Network is: %s' %
                (categorical_q_network, ))

        self._num_atoms = num_atoms
        self._min_q_value = min_q_value
        self._max_q_value = max_q_value
        self._support = tf.linspace(min_q_value, max_q_value, num_atoms)

        super(CategoricalDqnAgent,
              self).__init__(time_step_spec,
                             action_spec,
                             categorical_q_network,
                             optimizer,
                             epsilon_greedy=epsilon_greedy,
                             n_step_update=n_step_update,
                             boltzmann_temperature=boltzmann_temperature,
                             target_update_tau=target_update_tau,
                             target_update_period=target_update_period,
                             td_errors_loss_fn=td_errors_loss_fn,
                             gamma=gamma,
                             reward_scale_factor=reward_scale_factor,
                             gradient_clipping=gradient_clipping,
                             debug_summaries=debug_summaries,
                             summarize_grads_and_vars=summarize_grads_and_vars,
                             train_step_counter=train_step_counter,
                             name=name)

        policy = categorical_q_policy.CategoricalQPolicy(
            min_q_value, max_q_value, self._q_network, self._action_spec)
        if boltzmann_temperature is not None:
            self._collect_policy = boltzmann_policy.BoltzmannPolicy(
                policy, temperature=self._boltzmann_temperature)
        else:
            self._collect_policy = epsilon_greedy_policy.EpsilonGreedyPolicy(
                policy, epsilon=self._epsilon_greedy)
        self._policy = greedy_policy.GreedyPolicy(policy)
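
For reference, the support built with tf.linspace above is what turns the categorical (C51-style) network output into scalar Q-values: per-action atom probabilities are weighted by the support and summed, and the greedy policy takes the argmax. A small illustrative computation, with assumed shapes and default values:

import tensorflow as tf

min_q_value, max_q_value, num_atoms = -10.0, 10.0, 51       # assumed defaults
support = tf.linspace(min_q_value, max_q_value, num_atoms)  # shape [51]

# Fake per-action atom probabilities, shape [batch, num_actions, num_atoms].
logits = tf.random.normal([2, 4, num_atoms])
probs = tf.nn.softmax(logits, axis=-1)

q_values = tf.reduce_sum(probs * support, axis=-1)  # shape [2, 4]
greedy_actions = tf.argmax(q_values, axis=-1)       # what GreedyPolicy would pick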
Code Example #17
def train():
    summary_interval = 1000
    summaries_flush_secs = 10
    num_eval_episodes = 5
    root_dir = '/tmp/tensorflow/logs/tfenv01'
    train_dir = os.path.join(root_dir, 'train')
    eval_dir = os.path.join(root_dir, 'eval')
    train_summary_writer = tf.compat.v2.summary.create_file_writer(
        train_dir, flush_millis=summaries_flush_secs*1000)
    train_summary_writer.set_as_default()
    eval_summary_writer = tf.compat.v2.summary.create_file_writer(
        eval_dir, flush_millis=summaries_flush_secs*1000)
    # maybe py_metrics?
    eval_metrics = [
        tf_metrics.AverageReturnMetric(buffer_size=num_eval_episodes),
        tf_metrics.AverageEpisodeLengthMetric(buffer_size=num_eval_episodes),
    ]

    environment = TradeEnvironment()
    # utils.validate_py_environment(environment, episodes=5)
    # Environments
    global_step = tf.compat.v1.train.get_or_create_global_step()
    with tf.compat.v2.summary.record_if(
            lambda: tf.math.equal(global_step % summary_interval, 0)):
        train_env = tf_py_environment.TFPyEnvironment(environment)
        eval_env = tf_py_environment.TFPyEnvironment(environment)

        num_iterations = 50
        fc_layer_params = (512, )  # ~ (17 + 1001) / 2
        input_fc_layer_params = (50, )
        output_fc_layer_params = (20, )
        lstm_size = (30, )
        
        initial_collect_steps = 20
        collect_steps_per_iteration = 1
        collect_episodes_per_iteration = 1  # the same as above
        batch_size = 64
        replay_buffer_capacity = 10000
        
        train_sequence_length = 10

        gamma = 0.99  # check if 1.0 works as well
        target_update_tau = 0.05
        target_update_period = 5
        epsilon_greedy = 0.1
        gradient_clipping = None
        reward_scale_factor = 1.0

        learning_rate = 1e-2
        log_interval = 30
        eval_interval = 15
        # Assumed values: the names below are used later in this snippet but
        # were not defined in the original.
        train_steps_per_iteration = 1
        train_checkpoint_interval = 15
        policy_checkpoint_interval = 15
        rb_checkpoint_interval = 15
        eval_metrics_callback = None

        # train_env.observation_spec(),
        q_net = q_rnn_network.QRnnNetwork(
            train_env.time_step_spec().observation,
            train_env.action_spec(),
            input_fc_layer_params=input_fc_layer_params,
            lstm_size=lstm_size,
            output_fc_layer_params=output_fc_layer_params,
        )

        optimizer = tf.compat.v1.train.AdamOptimizer(
            learning_rate=learning_rate)

        tf_agent = dqn_agent.DqnAgent(
            train_env.time_step_spec(),
            train_env.action_spec(),
            q_network=q_net,
            optimizer=optimizer,
            epsilon_greedy=epsilon_greedy,
            target_update_tau=target_update_tau,
            target_update_period=target_update_period,
            td_errors_loss_fn=dqn_agent.element_wise_squared_loss,
            gamma=gamma,
            reward_scale_factor=reward_scale_factor,
            gradient_clipping=gradient_clipping,
            debug_summaries=False,
            summarize_grads_and_vars=False,
            train_step_counter=global_step,
        )

        replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
            tf_agent.collect_data_spec,
            batch_size=train_env.batch_size,
            max_length=replay_buffer_capacity,
        )

        train_metrics = [
            tf_metrics.NumberOfEpisodes(),
            tf_metrics.EnvironmentSteps(),
            tf_metrics.AverageReturnMetric(),
            tf_metrics.AverageEpisodeLengthMetric(),
        ]

        # Policy which does not allow some actions in certain states
        q_policy = FilteredQPolicy(
            tf_agent._time_step_spec, 
            tf_agent._action_spec, 
            q_network=tf_agent._q_network,
        )

        # Valid policy to pre-fill replay buffer
        initial_collect_policy = DummyTradePolicy(
            train_env.time_step_spec(),
            train_env.action_spec(),
        )
        print('Initial collecting...')
        initial_collect_op = dynamic_episode_driver.DynamicEpisodeDriver(
            train_env,
            initial_collect_policy,
            observers=[replay_buffer.add_batch] + train_metrics,
            num_episodes=initial_collect_steps,
        ).run()

        # Main agent's policy; greedy one
        policy = greedy_policy.GreedyPolicy(q_policy)
        # Policy used for evaluation, the same as above
        eval_policy = greedy_policy.GreedyPolicy(q_policy)
    
        tf_agent._policy = policy
        collect_policy = epsilon_greedy_policy.EpsilonGreedyPolicy(
            q_policy, epsilon=tf_agent._epsilon_greedy)
        # Patch random policy for epsilon greedy collect policy
        filtered_random_tf_policy = FilteredRandomTFPolicy(
            time_step_spec=policy.time_step_spec,
            action_spec=policy.action_spec,
        )
        collect_policy._random_policy = filtered_random_tf_policy
        tf_agent._collect_policy = collect_policy
        collect_op = dynamic_episode_driver.DynamicEpisodeDriver(
            train_env,
            collect_policy,
            observers=[replay_buffer.add_batch] + train_metrics,
            num_episodes=collect_episodes_per_iteration,
        ).run()
        dataset = replay_buffer.as_dataset(
            num_parallel_calls=3,
            sample_batch_size=batch_size,
            num_steps=train_sequence_length+1,
        ).prefetch(3)

        iterator = iter(dataset) 
        experience, _ = next(iterator)
        loss_info = common.function(tf_agent.train)(experience=experience)

        # Checkpoints
        train_checkpointer = common.Checkpointer(
            ckpt_dir=train_dir,
            agent=tf_agent,
            global_step=global_step,
            metrics=metric_utils.MetricsGroup(train_metrics, 'train_metrics'),
        )
        policy_checkpointer = common.Checkpointer(
            ckpt_dir=os.path.join(train_dir, 'policy'),
            policy=tf_agent.policy,
            global_step=global_step,
        )
        rb_checkpointer = common.Checkpointer(
            ckpt_dir=os.path.join(train_dir, 'replay_buffer'),
            max_to_keep=1,
            replay_buffer=replay_buffer,
        )
        
        summary_ops = []
        for train_metric in train_metrics:
            summary_ops.append(train_metric.tf_summaries(
                train_step=global_step,
                step_metrics=train_metrics[:2],
            ))

        with eval_summary_writer.as_default(), \
                tf.compat.v2.summary.record_if(True):
            for eval_metric in eval_metrics:
                eval_metric.tf_summaries(train_step=global_step)

        init_agent_op = tf_agent.initialize()
    
        with tf.compat.v1.Session() as sess:
            # sess.run(train_summary_writer.init())
            # sess.run(eval_summary_writer.init())
            
            # Initialize the graph
            # tfe.Saver().restore()
            # train_checkpointer.initialize_or_restore()
            # rb_checkpointer.initialize_or_restore()
            # sess.run(iterator.initializer)
            common.initialize_uninitialized_variables(sess)

            sess.run(init_agent_op)
            print('Collecting initial experience...')
            sess.run(initial_collect_op)

            global_step_val = sess.run(global_step)
            metric_utils.compute_summaries(
                eval_metrics,
                eval_env,
                eval_policy,
                num_episodes=num_eval_episodes,
                global_step=global_step_val,
                callback=eval_metrics_callback,
                log=True,
            )

            collect_call = sess.make_callable(collect_op)
            train_step_call = sess.make_callable([loss_info, summary_ops])
            global_step_call = sess.make_callable(global_step)

            timed_at_step = global_step_call()
            time_acc = 0
            steps_per_second_ph = tf.compat.v1.placeholder(
                tf.float32, shape=(), name='steps_per_sec_ph')
            steps_per_second_summary = tf.compat.v2.summary.scalar(
                name='global_steps_per_sec',
                data=steps_per_second_ph,
                step=global_step,
            )

            # Train
            for i in range(num_iterations):
                start_time = time.time()
                collect_call()

                for _ in range(train_steps_per_iteration):
                    loss_info_value, _ = train_step_call()
                time_acc += time.time() - start_time
                global_step_val = global_step_call()

                if global_step_val % log_interval == 0:
                    print('step=%d, loss=%f' %
                          (global_step_val, loss_info_value.loss))
                    steps_per_sec = (global_step_val - timed_at_step) / time_acc
                    print('%.3f steps/sec' % steps_per_sec)
                    sess.run(
                        steps_per_second_summary,
                        feed_dict={steps_per_second_ph: steps_per_sec},
                    )
                    timed_at_step = global_step_val
                    time_acc = 0

                # Save checkpoints
                if global_step_val % train_checkpoint_interval == 0:
                    train_checkpointer.save(global_step=global_step_val)

                if global_step_val % policy_checkpoint_interval == 0:
                    policy_checkpointer.save(global_step=global_step_val)

                if global_step_val % rb_checkpoint_interval == 0:
                    rb_checkpointer.save(global_step=global_step_val)

                # Evaluate
                if global_step_val % eval_interval == 0:
                    metric_utils.compute_summaries(
                        eval_metrics,
                        eval_env,
                        eval_policy,
                        num_episodes=num_eval_episodes,
                        global_step=global_step_val,
                        log=True,
                        callback=eval_metrics_callback,
                    )
    print('Done!')        
Code Example #18
    def __init__(
            self,
            root_dir,
            env_name,
            num_iterations=200,
            max_episode_frames=108000,  # ALE frames
            terminal_on_life_loss=False,
            conv_layer_params=((32, (8, 8), 4), (64, (4, 4), 2), (64, (3, 3),
                                                                  1)),
            fc_layer_params=(512, ),
            # Params for collect
            initial_collect_steps=80000,  # ALE frames
            epsilon_greedy=0.01,
            epsilon_decay_period=1000000,  # ALE frames
            replay_buffer_capacity=1000000,
            # Params for train
            train_steps_per_iteration=1000000,  # ALE frames
            update_period=16,  # ALE frames
            target_update_tau=1.0,
            target_update_period=32000,  # ALE frames
            batch_size=32,
            learning_rate=2.5e-4,
            n_step_update=2,
            gamma=0.99,
            reward_scale_factor=1.0,
            gradient_clipping=None,
            # Params for eval
            do_eval=True,
            eval_steps_per_iteration=500000,  # ALE frames
            eval_epsilon_greedy=0.001,
            # Params for checkpoints, summaries, and logging
            log_interval=1000,
            summary_interval=1000,
            summaries_flush_secs=10,
            debug_summaries=True,
            summarize_grads_and_vars=True,
            eval_metrics_callback=None):
        """A simple Atari train and eval for DQN.

    Args:
      root_dir: Directory to write log files to.
      env_name: Fully-qualified name of the Atari environment (e.g. Pong-v0).
      num_iterations: Number of train/eval iterations to run.
      max_episode_frames: Maximum length of a single episode, in ALE frames.
      terminal_on_life_loss: Whether to simulate an episode termination when a
        life is lost.
      conv_layer_params: Params for convolutional layers of QNetwork.
      fc_layer_params: Params for fully connected layers of QNetwork.
      initial_collect_steps: Number of ALE frames to process before
        beginning to train. Since this is in ALE frames, there will be
        initial_collect_steps/4 items in the replay buffer when training starts.
      epsilon_greedy: Final epsilon value to decay to for training.
      epsilon_decay_period: Period over which to decay epsilon, from 1.0 to
        epsilon_greedy (defined above).
      replay_buffer_capacity: Maximum number of items to store in the replay
        buffer.
      train_steps_per_iteration: Number of ALE frames to run through for each
        iteration of training.
      update_period: Run a train operation every update_period ALE frames.
      target_update_tau: Coefficient for soft target network updates (1.0 ==
        hard updates).
      target_update_period: Period, in ALE frames, to copy the live network to
        the target network.
      batch_size: Number of frames to include in each training batch.
      learning_rate: RMS optimizer learning rate.
      n_step_update: The number of steps to consider when computing TD error and
        TD loss. Applies standard single-step updates when set to 1.
      gamma: Discount for future rewards.
      reward_scale_factor: Scaling factor for rewards.
      gradient_clipping: Norm length to clip gradients.
      do_eval: If True, run an eval every iteration. If False, skip eval.
      eval_steps_per_iteration: Number of ALE frames to run through for each
        iteration of evaluation.
      eval_epsilon_greedy: Epsilon value to use for the evaluation policy (0 ==
        totally greedy policy).
      log_interval: Log stats to the terminal every log_interval training
        steps.
      summary_interval: Write TF summaries every summary_interval training
        steps.
      summaries_flush_secs: Flush summaries to disk every summaries_flush_secs
        seconds.
      debug_summaries: If True, write additional summaries for debugging (see
        dqn_agent for which summaries are written).
      summarize_grads_and_vars: Include gradients in summaries.
      eval_metrics_callback: A callback function that takes (metric_dict,
        global_step) as parameters. Called after every eval with the results of
        the evaluation.
    """
        self._update_period = update_period / ATARI_FRAME_SKIP
        self._train_steps_per_iteration = (train_steps_per_iteration /
                                           ATARI_FRAME_SKIP)
        self._do_eval = do_eval
        self._eval_steps_per_iteration = eval_steps_per_iteration / ATARI_FRAME_SKIP
        self._eval_epsilon_greedy = eval_epsilon_greedy
        self._initial_collect_steps = initial_collect_steps / ATARI_FRAME_SKIP
        self._summary_interval = summary_interval
        self._num_iterations = num_iterations
        self._log_interval = log_interval
        self._eval_metrics_callback = eval_metrics_callback

        with gin.unlock_config():
            gin.bind_parameter(('tf_agents.environments.atari_preprocessing.'
                                'AtariPreprocessing.terminal_on_life_loss'),
                               terminal_on_life_loss)

        root_dir = os.path.expanduser(root_dir)
        train_dir = os.path.join(root_dir, 'train')
        eval_dir = os.path.join(root_dir, 'eval')

        train_summary_writer = tf.compat.v2.summary.create_file_writer(
            train_dir, flush_millis=summaries_flush_secs * 1000)
        train_summary_writer.set_as_default()
        self._train_summary_writer = train_summary_writer

        self._eval_summary_writer = None
        if self._do_eval:
            self._eval_summary_writer = tf.compat.v2.summary.create_file_writer(
                eval_dir, flush_millis=summaries_flush_secs * 1000)
            self._eval_metrics = [
                py_metrics.AverageReturnMetric(name='PhaseAverageReturn',
                                               buffer_size=np.inf),
                py_metrics.AverageEpisodeLengthMetric(
                    name='PhaseAverageEpisodeLength', buffer_size=np.inf),
            ]

        self._global_step = tf.compat.v1.train.get_or_create_global_step()
        with tf.compat.v2.summary.record_if(lambda: tf.math.equal(
                self._global_step % self._summary_interval, 0)):
            self._env = suite_atari.load(
                env_name,
                max_episode_steps=max_episode_frames / ATARI_FRAME_SKIP,
                gym_env_wrappers=suite_atari.
                DEFAULT_ATARI_GYM_WRAPPERS_WITH_STACKING)
            self._env = batched_py_environment.BatchedPyEnvironment(
                [self._env])

            observation_spec = tensor_spec.from_spec(
                self._env.observation_spec())
            time_step_spec = ts.time_step_spec(observation_spec)
            action_spec = tensor_spec.from_spec(self._env.action_spec())

            with tf.device('/cpu:0'):
                epsilon = tf.compat.v1.train.polynomial_decay(
                    1.0,
                    self._global_step,
                    epsilon_decay_period / ATARI_FRAME_SKIP /
                    self._update_period,
                    end_learning_rate=epsilon_greedy)

            with tf.device('/gpu:0'):
                optimizer = tf.compat.v1.train.RMSPropOptimizer(
                    learning_rate=learning_rate,
                    decay=0.95,
                    momentum=0.0,
                    epsilon=0.00001,
                    centered=True)
                categorical_q_net = AtariCategoricalQNetwork(
                    observation_spec,
                    action_spec,
                    conv_layer_params=conv_layer_params,
                    fc_layer_params=fc_layer_params)
                agent = categorical_dqn_agent.CategoricalDqnAgent(
                    time_step_spec,
                    action_spec,
                    categorical_q_network=categorical_q_net,
                    optimizer=optimizer,
                    epsilon_greedy=epsilon,
                    n_step_update=n_step_update,
                    target_update_tau=target_update_tau,
                    target_update_period=(target_update_period /
                                          ATARI_FRAME_SKIP /
                                          self._update_period),
                    gamma=gamma,
                    reward_scale_factor=reward_scale_factor,
                    gradient_clipping=gradient_clipping,
                    debug_summaries=debug_summaries,
                    summarize_grads_and_vars=summarize_grads_and_vars,
                    train_step_counter=self._global_step)

                self._collect_policy = py_tf_policy.PyTFPolicy(
                    agent.collect_policy)

                if self._do_eval:
                    self._eval_policy = py_tf_policy.PyTFPolicy(
                        epsilon_greedy_policy.EpsilonGreedyPolicy(
                            policy=agent.policy,
                            epsilon=self._eval_epsilon_greedy))

                py_observation_spec = self._env.observation_spec()
                py_time_step_spec = ts.time_step_spec(py_observation_spec)
                py_action_spec = policy_step.PolicyStep(
                    self._env.action_spec())
                data_spec = trajectory.from_transition(py_time_step_spec,
                                                       py_action_spec,
                                                       py_time_step_spec)
                self._replay_buffer = py_hashed_replay_buffer.PyHashedReplayBuffer(
                    data_spec=data_spec, capacity=replay_buffer_capacity)

            with tf.device('/cpu:0'):
                ds = self._replay_buffer.as_dataset(
                    sample_batch_size=batch_size, num_steps=n_step_update + 1)
                ds = ds.prefetch(4)
                ds = ds.apply(
                    tf.data.experimental.prefetch_to_device('/gpu:0'))

            with tf.device('/gpu:0'):
                self._ds_itr = tf.compat.v1.data.make_one_shot_iterator(ds)
                experience = self._ds_itr.get_next()
                self._train_op = agent.train(experience)

                self._env_steps_metric = py_metrics.EnvironmentSteps()
                self._step_metrics = [
                    py_metrics.NumberOfEpisodes(),
                    self._env_steps_metric,
                ]
                self._train_metrics = self._step_metrics + [
                    py_metrics.AverageReturnMetric(buffer_size=10),
                    py_metrics.AverageEpisodeLengthMetric(buffer_size=10),
                ]
                # The _train_phase_metrics average over an entire train iteration,
                # rather than the rolling average of the last 10 episodes.
                self._train_phase_metrics = [
                    py_metrics.AverageReturnMetric(name='PhaseAverageReturn',
                                                   buffer_size=np.inf),
                    py_metrics.AverageEpisodeLengthMetric(
                        name='PhaseAverageEpisodeLength', buffer_size=np.inf),
                ]
                self._iteration_metric = py_metrics.CounterMetric(
                    name='Iteration')

                # Summaries written from python should run every time they are
                # generated.
                with tf.compat.v2.summary.record_if(True):
                    self._steps_per_second_ph = tf.compat.v1.placeholder(
                        tf.float32, shape=(), name='steps_per_sec_ph')
                    self._steps_per_second_summary = tf.compat.v2.summary.scalar(
                        name='global_steps_per_sec',
                        data=self._steps_per_second_ph,
                        step=self._global_step)

                    for metric in self._train_metrics:
                        metric.tf_summaries(train_step=self._global_step,
                                            step_metrics=self._step_metrics)

                    for metric in self._train_phase_metrics:
                        metric.tf_summaries(
                            train_step=self._global_step,
                            step_metrics=(self._iteration_metric, ))
                    self._iteration_metric.tf_summaries(
                        train_step=self._global_step)

                    if self._do_eval:
                        with self._eval_summary_writer.as_default():
                            for metric in self._eval_metrics:
                                metric.tf_summaries(
                                    train_step=self._global_step,
                                    step_metrics=(self._iteration_metric, ))

                self._train_checkpointer = common.Checkpointer(
                    ckpt_dir=train_dir,
                    agent=agent,
                    global_step=self._global_step,
                    optimizer=optimizer,
                    metrics=metric_utils.MetricsGroup(
                        self._train_metrics + self._train_phase_metrics +
                        [self._iteration_metric], 'train_metrics'))
                self._policy_checkpointer = common.Checkpointer(
                    ckpt_dir=os.path.join(train_dir, 'policy'),
                    policy=agent.policy,
                    global_step=self._global_step)
                self._rb_checkpointer = common.Checkpointer(
                    ckpt_dir=os.path.join(train_dir, 'replay_buffer'),
                    max_to_keep=1,
                    replay_buffer=self._replay_buffer)

                self._init_agent_op = agent.initialize()
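
The snippet above anneals epsilon with tf.compat.v1.train.polynomial_decay driven by the global step. A roughly equivalent TF2-style setup, assuming (as in examples #7 and #19) that EpsilonGreedyPolicy accepts a zero-argument callable for epsilon, could use a Keras schedule:

import tensorflow as tf

train_step = tf.Variable(0, dtype=tf.int64)
# Assumed numbers: decay from fully random (1.0) to 0.01 over 250k train steps.
schedule = tf.keras.optimizers.schedules.PolynomialDecay(
    initial_learning_rate=1.0,
    decay_steps=250000,
    end_learning_rate=0.01)
epsilon_fn = lambda: schedule(train_step)
# e.g. epsilon_greedy_policy.EpsilonGreedyPolicy(agent.policy, epsilon=epsilon_fn)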
Code Example #19
    def decay():
        nonlocal epsilon
        nonlocal decay_step
        _epsilon = epsilon
        epsilon = max(epsilon - epsilon_change, epsilon_min)
        decay_step += 1
        if decay_step % 500 == 0:
            print('Decaying epsilon from {0} to {1}'.format(_epsilon, epsilon))
        return _epsilon

    return decay


eval_policy = agent.policy  # Greedy policy
# collect_policy = agent.collect_policy  # Epsilon-greedy policy
collect_policy = epsilon_greedy_policy.EpsilonGreedyPolicy(
    agent.collect_policy.wrapped_policy, epsilon=decaying_epsilon())
random_policy = random_tf_policy.RandomTFPolicy(
    action_spec=collect_policy.action_spec,
    time_step_spec=collect_policy.time_step_spec)  # Random policy

# Replay buffer collection
replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    data_spec=agent.collect_data_spec,
    batch_size=train_env.batch_size,
    max_length=replay_buffer_max_length)

replay_observer = [replay_buffer.add_batch]

collect_op = dynamic_step_driver.DynamicStepDriver(
    train_env,
    random_policy,
Code Example #20
File: dqn_agent.py  Project: bohblue2/agents
    def __init__(
            self,
            time_step_spec,
            action_spec,
            q_network,
            optimizer,
            epsilon_greedy=0.1,
            # Params for target network updates
            target_update_tau=1.0,
            target_update_period=1,
            # Params for training.
            td_errors_loss_fn=None,
            gamma=1.0,
            reward_scale_factor=1.0,
            gradient_clipping=None,
            # Params for debugging
            debug_summaries=False,
            summarize_grads_and_vars=False):
        """Creates a DQN Agent.

    Args:
      time_step_spec: A `TimeStep` spec of the expected time_steps.
      action_spec: A nest of BoundedTensorSpec representing the actions.
      q_network: A tf_agents.network.Network to be used by the agent. The
        network will be called with call(observation, step_type).
      optimizer: The optimizer to use for training.
      epsilon_greedy: probability of choosing a random action in the default
        epsilon-greedy collect policy (used only if a wrapper is not provided to
        the collect_policy method).
      target_update_tau: Factor for soft update of the target networks.
      target_update_period: Period for soft update of the target networks.
      td_errors_loss_fn: A function for computing the TD errors loss. If None, a
        default value of element_wise_huber_loss is used. This function takes as
        input the target and the estimated Q values and returns the loss for
        each element of the batch.
      gamma: A discount factor for future rewards.
      reward_scale_factor: Multiplicative scale for the reward.
      gradient_clipping: Norm length to clip gradients.
      debug_summaries: A bool to gather debug summaries.
      summarize_grads_and_vars: If True, gradient and network variable summaries
        will be written during training.

    Raises:
      ValueError: If the action spec contains more than one action.
    """
        flat_action_spec = nest.flatten(action_spec)
        self._num_actions = [
            spec.maximum - spec.minimum + 1 for spec in flat_action_spec
        ]

        # TODO(oars): Get DQN working with more than one dim in the actions.
        if len(flat_action_spec) > 1 or flat_action_spec[0].shape.ndims > 1:
            raise ValueError('Only one dimensional actions are supported now.')

        self._q_network = q_network
        self._target_q_network = self._q_network.copy(name='TargetQNetwork')
        self._epsilon_greedy = epsilon_greedy
        self._target_update_tau = target_update_tau
        self._target_update_period = target_update_period
        self._optimizer = optimizer
        self._td_errors_loss_fn = td_errors_loss_fn or element_wise_huber_loss
        self._gamma = gamma
        self._reward_scale_factor = reward_scale_factor
        self._gradient_clipping = gradient_clipping

        self._target_update_train_op = None

        policy = q_policy.QPolicy(time_step_spec,
                                  action_spec,
                                  q_network=self._q_network)

        collect_policy = epsilon_greedy_policy.EpsilonGreedyPolicy(
            policy, epsilon=self._epsilon_greedy)
        policy = greedy_policy.GreedyPolicy(policy)

        super(DqnAgent, self).__init__(
            time_step_spec,
            action_spec,
            policy,
            collect_policy,
            train_sequence_length=2 if not q_network.state_spec else None,
            debug_summaries=debug_summaries,
            summarize_grads_and_vars=summarize_grads_and_vars)
Code Example #21
    def __init__(
            self,
            time_step_spec,
            action_spec,
            categorical_q_network,
            optimizer,
            min_q_value=-10.0,
            max_q_value=10.0,
            epsilon_greedy=0.1,
            n_step_update=1,
            boltzmann_temperature=None,
            # Params for target network updates
            target_categorical_q_network=None,
            target_update_tau=1.0,
            target_update_period=1,
            # Params for training.
            td_errors_loss_fn=None,
            gamma=1.0,
            reward_scale_factor=1.0,
            gradient_clipping=None,
            # Params for debugging
            debug_summaries=False,
            summarize_grads_and_vars=False,
            train_step_counter=None,
            name=None):
        """Creates a Categorical DQN Agent.

    Args:
      time_step_spec: A `TimeStep` spec of the expected time_steps.
      action_spec: A `BoundedTensorSpec` representing the actions.
      categorical_q_network: A categorical_q_network.CategoricalQNetwork that
        returns the q_distribution for each action.
      optimizer: The optimizer to use for training.
      min_q_value: A float specifying the minimum Q-value, used for setting up
        the support.
      max_q_value: A float specifying the maximum Q-value, used for setting up
        the support.
      epsilon_greedy: probability of choosing a random action in the default
        epsilon-greedy collect policy (used only if a wrapper is not provided to
        the collect_policy method).
      n_step_update: The number of steps to consider when computing TD error and
        TD loss. Defaults to single-step updates. Note that this requires the
        user to call train on Trajectory objects with a time dimension of
        `n_step_update + 1`. However, note that we do not yet support
        `n_step_update > 1` in the case of RNNs (i.e., non-empty
        `q_network.state_spec`).
      boltzmann_temperature: Temperature value to use for Boltzmann sampling of
        the actions during data collection. The closer to 0.0, the higher the
        probability of choosing the best action.
      target_categorical_q_network: (Optional.)  A `tf_agents.network.Network`
        to be used as the target network during Q learning.  Every
        `target_update_period` train steps, the weights from
        `categorical_q_network` are copied (possibly with smoothing via
        `target_update_tau`) to `target_categorical_q_network`.

        If `target_categorical_q_network` is not provided, it is created by
        making a copy of `categorical_q_network`, which initializes a new
        network with the same structure and its own layers and weights.

        Network copying is performed via the `Network.copy` superclass method,
        and may inadvertently lead to the resulting network sharing weights
        with the original.  This can happen if, for example, the original
        network accepted a pre-built Keras layer in its `__init__`, or
        accepted a Keras layer that wasn't built, but neglected to create
        a new copy.

        In these cases, it is up to you to provide a target Network having
        weights that are not shared with the original `categorical_q_network`.
        If you provide a `target_categorical_q_network` that shares any
        weights with `categorical_q_network`, a warning will be logged but
        no exception is thrown.

        Note: shallow copies of Keras layers may be built via the code:

        ```python
        new_layer = type(layer).from_config(layer.get_config())
        ```
      target_update_tau: Factor for soft update of the target networks.
      target_update_period: Period for soft update of the target networks.
      td_errors_loss_fn: A function for computing the TD errors loss. If None, a
        default value of huber_loss is used. This function takes as input the
        target and the estimated Q values and returns the loss for each element
        of the batch.
      gamma: A discount factor for future rewards.
      reward_scale_factor: Multiplicative scale for the reward.
      gradient_clipping: Norm length to clip gradients.
      debug_summaries: A bool to gather debug summaries.
      summarize_grads_and_vars: If True, gradient and network variable summaries
        will be written during training.
      train_step_counter: An optional counter to increment every time the train
        op is run.  Defaults to the global_step.
      name: The name of this agent. All variables in this module will fall
        under that name. Defaults to the class name.

    Raises:
      TypeError: If the action spec contains more than one action.
    """
        super(CategoricalDqnAgent,
              self).__init__(time_step_spec,
                             action_spec,
                             categorical_q_network,
                             optimizer,
                             epsilon_greedy=epsilon_greedy,
                             n_step_update=n_step_update,
                             boltzmann_temperature=boltzmann_temperature,
                             target_q_network=target_categorical_q_network,
                             target_update_tau=target_update_tau,
                             target_update_period=target_update_period,
                             td_errors_loss_fn=td_errors_loss_fn,
                             gamma=gamma,
                             reward_scale_factor=reward_scale_factor,
                             gradient_clipping=gradient_clipping,
                             debug_summaries=debug_summaries,
                             summarize_grads_and_vars=summarize_grads_and_vars,
                             train_step_counter=train_step_counter,
                             name=name)

        def check_atoms(net, label):
            num_atoms = getattr(net, 'num_atoms', None)
            if num_atoms is None:
                raise TypeError(
                    'Expected {} to have property `num_atoms`, but it '
                    'doesn\'t (note: you likely want to use a '
                    'CategoricalQNetwork). Network is: {}'.format(label, net))
            return num_atoms

        num_atoms = check_atoms(self._q_network, 'categorical_q_network')
        target_num_atoms = check_atoms(self._target_q_network,
                                       'target_categorical_q_network')
        if num_atoms != target_num_atoms:
            raise ValueError(
                'categorical_q_network and target_categorical_q_network have '
                'different numbers of atoms: {} vs. {}'.format(
                    num_atoms, target_num_atoms))
        self._num_atoms = num_atoms
        self._min_q_value = min_q_value
        self._max_q_value = max_q_value
        self._support = tf.linspace(min_q_value, max_q_value, num_atoms)

        policy = categorical_q_policy.CategoricalQPolicy(
            min_q_value, max_q_value, self._q_network, self._action_spec)
        if boltzmann_temperature is not None:
            self._collect_policy = boltzmann_policy.BoltzmannPolicy(
                policy, temperature=self._boltzmann_temperature)
        else:
            self._collect_policy = epsilon_greedy_policy.EpsilonGreedyPolicy(
                policy, epsilon=self._epsilon_greedy)
        self._policy = greedy_policy.GreedyPolicy(policy)

        target_policy = categorical_q_policy.CategoricalQPolicy(
            min_q_value, max_q_value, self._target_q_network,
            self._action_spec)
        self._target_greedy_policy = greedy_policy.GreedyPolicy(target_policy)
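The agent above builds a fixed support of `num_atoms` evenly spaced values between `min_q_value` and `max_q_value` and treats each action's output as a categorical distribution over that support. The following is a minimal, self-contained sketch (example values assumed, not taken from the snippet) of how expected Q-values and a greedy action can be recovered from such a distribution:

```python
import tensorflow as tf

# Assumed example values; the real agent takes these from its constructor.
min_q_value, max_q_value, num_atoms = -10.0, 10.0, 51
support = tf.linspace(min_q_value, max_q_value, num_atoms)   # [num_atoms]

# Hypothetical network output: logits over atoms for each of 4 actions.
logits = tf.random.normal([2, 4, num_atoms])                 # [batch, actions, atoms]
probabilities = tf.nn.softmax(logits, axis=-1)

# Expected Q-value per action is the probability-weighted sum of the support.
q_values = tf.reduce_sum(probabilities * support, axis=-1)   # [batch, actions]
greedy_actions = tf.argmax(q_values, axis=-1)                # [batch]
```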
Code example #22
File: mixed_td3_agent.py Project: yj8907/agents
    def __init__(self,
                 time_step_spec,
                 action_spec,
                 actor_network,
                 q_network,
                 actor_optimizer,
                 critic_optimizer,
                 exploration_noise_std=0.1,
                 boltzmann_temperature=None,
                 epsilon_greedy=0.1,
                 q_network_2=None,
                 target_actor_network=None,
                 target_q_network=None,
                 target_q_network_2=None,
                 target_update_tau=1.0,
                 target_update_period=1,
                 actor_update_period=1,
                 dqda_clipping=None,
                 td_errors_loss_fn=None,
                 gamma=1.0,
                 reward_scale_factor=1.0,
                 target_policy_noise=0.2,
                 target_policy_noise_clip=0.5,
                 gradient_clipping=None,
                 debug_summaries=False,
                 summarize_grads_and_vars=False,
                 train_step_counter=None,
                 action_params_mask=None,
                 n_step_update=1,
                 name=None):
        """Creates a Td3Agent Agent.

    Args:
      time_step_spec: A `TimeStep` spec of the expected time_steps.
      action_spec: A namedtuple of nested BoundedTensorSpec representing the actions.
      actor_network: A tf_agents.network.Network to be used by the agent. The
        network will be called with call(observation, step_type).
      q_network: A tf_agents.network.Network to be used by the agent. The
        network will be called with call(observation, action, step_type).
      actor_optimizer: The default optimizer to use for the actor network.
      critic_optimizer: The default optimizer to use for the critic network.
      exploration_noise_std: Scale factor on exploration policy noise.
      q_network_2: (Optional.)  A `tf_agents.network.Network` to be used as
        the second critic network during Q learning.  The weights from
        `q_network` are copied if this is not provided.
      target_actor_network: (Optional.)  A `tf_agents.network.Network` to be
        used as the target actor network during Q learning. Every
        `target_update_period` train steps, the weights from `actor_network` are
        copied (possibly with smoothing via `target_update_tau`) to
        `target_actor_network`.  If `target_actor_network` is not provided, it is
        created by making a copy of `actor_network`, which initializes a new
        network with the same structure and its own layers and weights.
        Performing a `Network.copy` does not work when the network instance
        already has trainable parameters (e.g., has already been built, or when
        the network is sharing layers with another).  In these cases, it is up
        to you to build a copy having weights that are not shared with the
        original `actor_network`, so that this can be used as a target network.
        If you provide a `target_actor_network` that shares any weights with
        `actor_network`, a warning will be logged but no exception is thrown.
      target_q_network: (Optional.) Similar network as target_actor_network
        but for the q_network. See documentation for target_actor_network.
      target_q_network_2: (Optional.) Similar network as
        target_actor_network but for the q_network_2. See documentation for
        target_actor_network. Will only be used if 'q_network_2' is also
        specified.
      target_update_tau: Factor for soft update of the target networks.
      target_update_period: Period for soft update of the target networks.
      actor_update_period: Period for the optimization step on actor network.
      dqda_clipping: A scalar or float clips the gradient dqda element-wise
        between [-dqda_clipping, dqda_clipping]. Default is None representing no
        clipping.
      td_errors_loss_fn:  A function for computing the TD errors loss. If None,
        a default value of elementwise huber_loss is used.
      gamma: A discount factor for future rewards.
      reward_scale_factor: Multiplicative scale for the reward.
      target_policy_noise: Scale factor on target action noise.
      target_policy_noise_clip: Value to clip noise.
      gradient_clipping: Norm length to clip gradients.
      debug_summaries: A bool to gather debug summaries.
      summarize_grads_and_vars: If True, gradient and network variable summaries
        will be written during training.
      train_step_counter: An optional counter to increment every time the train
        op is run.  Defaults to the global_step.
      action_params_mask: A mask of continuous action parameters for each
        discrete action.
      name: The name of this agent. All variables in this module will fall
        under that name. Defaults to the class name.
    """
        tf.Module.__init__(self, name=name)
        self._actor_network = actor_network
        self._target_actor_network = common.maybe_copy_target_network_with_checks(
            self._actor_network, target_actor_network, 'TargetActorNetwork')

        # critic network here is Q-network
        self._q_network_1 = q_network
        self._target_q_network_1 = (
            common.maybe_copy_target_network_with_checks(
                self._q_network_1, target_q_network, 'TargetCriticNetwork1'))

        if q_network_2 is not None:
            self._q_network_2 = q_network_2
        else:
            self._q_network_2 = q_network.copy(name='CriticNetwork2')
            # Do not use target_q_network_2 if q_network_2 is None.
            target_q_network_2 = None
        self._target_q_network_2 = (
            common.maybe_copy_target_network_with_checks(
                self._q_network_2, target_q_network_2, 'TargetCriticNetwork2'))
        self._actor_optimizer = actor_optimizer
        self._critic_optimizer = critic_optimizer

        self._exploration_noise_std = exploration_noise_std
        self._epsilon_greedy = epsilon_greedy
        self._boltzmann_temperature = boltzmann_temperature
        self._target_update_tau = target_update_tau
        self._target_update_period = target_update_period
        self._actor_update_period = actor_update_period
        self._dqda_clipping = dqda_clipping
        self._td_errors_loss_fn = (td_errors_loss_fn
                                   or common.element_wise_huber_loss)
        self._gamma = gamma
        self._reward_scale_factor = reward_scale_factor
        self._target_policy_noise = target_policy_noise
        self._target_policy_noise_clip = target_policy_noise_clip
        self._gradient_clipping = gradient_clipping

        self._update_target = self._get_target_updater(target_update_tau,
                                                       target_update_period)

        policy = actor_policy.ActorPolicy(
            time_step_spec=time_step_spec,
            action_spec=action_spec.actor_network,
            actor_network=self._actor_network,
            clip=True)
        policy = mixed_q_policy.MixedQPolicy(policy,
                                             time_step_spec=time_step_spec,
                                             action_spec=action_spec.q_network,
                                             q_network=q_network)

        collect_policy = actor_policy.ActorPolicy(
            time_step_spec=time_step_spec,
            action_spec=action_spec.actor_network,
            actor_network=self._actor_network,
            clip=False)
        collect_policy = gaussian_policy.GaussianPolicy(
            collect_policy, scale=self._exploration_noise_std, clip=True)
        collect_policy = mixed_q_policy.MixedQPolicy(
            collect_policy,
            time_step_spec=time_step_spec,
            action_spec=action_spec.q_network,
            q_network=q_network)
        if boltzmann_temperature is not None:
            collect_policy = boltzmann_policy.BoltzmannPolicy(
                collect_policy, temperature=self._boltzmann_temperature)
        else:
            collect_policy = epsilon_greedy_policy.EpsilonGreedyPolicy(
                collect_policy, epsilon=self._epsilon_greedy)

        # Create self._target_greedy_policy in order to compute target Q-values.

        target_policy = actor_policy.ActorPolicy(
            time_step_spec=time_step_spec,
            action_spec=action_spec.actor_network,
            actor_network=self._target_actor_network,
            clip=True)
        target_policy = mixed_q_policy.MixedQPolicy(
            target_policy,
            time_step_spec=time_step_spec,
            action_spec=action_spec.q_network,
            q_network=self._target_q_network_1)
        self._target_greedy_policy = greedy_policy.GreedyPolicy(target_policy)
        self._action_params_mask = action_params_mask
        self._n_step_update = n_step_update
        if action_spec.actor_network is not None and action_params_mask is None:
            raise ValueError(
                "action_params_mask is required for actor network")

        super(MixedTd3Agent,
              self).__init__(time_step_spec,
                             action_spec,
                             policy,
                             collect_policy,
                             train_sequence_length=2
                             if not self._actor_network.state_spec else None,
                             debug_summaries=debug_summaries,
                             summarize_grads_and_vars=summarize_grads_and_vars,
                             train_step_counter=train_step_counter)
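Both target-network arguments above rely on the periodic soft update controlled by `target_update_tau` and `target_update_period`. A minimal sketch of that update rule (not the tf_agents implementation, which the agent builds via `self._get_target_updater`) looks like this:

```python
import tensorflow as tf

def soft_update(source_variables, target_variables, tau=0.05):
    # Every `target_update_period` train steps:
    #   target <- tau * source + (1 - tau) * target
    # tau == 1.0 reduces to a hard copy of the source weights.
    for source, target in zip(source_variables, target_variables):
        target.assign(tau * source + (1.0 - tau) * target)
```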
Code example #23
    def __init__(
            self,
            time_step_spec,
            action_spec,
            reward_network,
            optimizer,
            epsilon,
            observation_and_action_constraint_splitter=None,
            # Params for training.
            error_loss_fn=tf.compat.v1.losses.mean_squared_error,
            gradient_clipping=None,
            # Params for debugging.
            debug_summaries=False,
            summarize_grads_and_vars=False,
            enable_summaries=True,
            expose_predicted_rewards=False,
            train_step_counter=None,
            name=None):
        """Creates a Neural Epsilon Greedy Agent.

    Args:
      time_step_spec: A `TimeStep` spec of the expected time_steps.
      action_spec: A nest of `BoundedTensorSpec` representing the actions.
      reward_network: A `tf_agents.network.Network` to be used by the agent. The
        network will be called with call(observation, step_type) and it is
        expected to provide a reward prediction for all actions.
        *Note*: when using `observation_and_action_constraint_splitter`, make
        sure the `reward_network` is compatible with the network-specific half
        of the output of the `observation_and_action_constraint_splitter`. In
        particular, `observation_and_action_constraint_splitter` will be called
        on the observation before passing to the network.
      optimizer: The optimizer to use for training.
      epsilon: A float representing the probability of choosing a random action
        instead of the greedy action.
      observation_and_action_constraint_splitter: A function used for masking
        valid/invalid actions with each state of the environment. The function
        takes in a full observation and returns a tuple consisting of 1) the
        part of the observation intended as input to the bandit agent and
        policy, and 2) the boolean mask. This function should also work with a
        `TensorSpec` as input, and should output `TensorSpec` objects for the
        observation and mask.
      error_loss_fn: A function for computing the error loss, taking parameters
        labels, predictions, and weights (any function from tf.losses would
        work). The default is `tf.losses.mean_squared_error`.
      gradient_clipping: A float representing the norm length to clip gradients
        (or None for no clipping.)
      debug_summaries: A Python bool, default False. When True, debug summaries
        are gathered.
      summarize_grads_and_vars: A Python bool, default False. When True,
        gradients and network variable summaries are written during training.
      enable_summaries: A Python bool, default True. When False, all summaries
        (debug or otherwise) should not be written.
      expose_predicted_rewards: (bool) Whether to expose the predicted rewards
        in the policy info field under the name 'predicted_rewards'.
      train_step_counter: An optional `tf.Variable` to increment every time the
        train op is run.  Defaults to the `global_step`.
      name: Python str name of this agent. All variables in this module will
        fall under that name. Defaults to the class name.

    Raises:
      ValueError: If the action spec contains more than one action or if it is
        not a bounded scalar int32 spec with minimum 0.
    """
        super(NeuralEpsilonGreedyAgent,
              self).__init__(time_step_spec=time_step_spec,
                             action_spec=action_spec,
                             reward_network=reward_network,
                             optimizer=optimizer,
                             observation_and_action_constraint_splitter=(
                                 observation_and_action_constraint_splitter),
                             error_loss_fn=error_loss_fn,
                             gradient_clipping=gradient_clipping,
                             debug_summaries=debug_summaries,
                             summarize_grads_and_vars=summarize_grads_and_vars,
                             enable_summaries=enable_summaries,
                             expose_predicted_rewards=expose_predicted_rewards,
                             train_step_counter=train_step_counter,
                             name=name)
        self._policy = epsilon_greedy_policy.EpsilonGreedyPolicy(
            self._policy, epsilon=epsilon)
        self._collect_policy = self._policy
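The `observation_and_action_constraint_splitter` argument documented above is just a function that separates the network input from the action mask. A hypothetical splitter, assuming the environment emits a dict with `'observation'` and `'mask'` entries (names are illustrative, not from the snippet), could be:

```python
def observation_and_action_constraint_splitter(obs):
    # First element feeds the reward network, second is the boolean action mask.
    return obs['observation'], obs['mask']
```

The same callable must also work when passed the corresponding `TensorSpec` nest, which a plain dict lookup like this already does.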
Code example #24
    def __init__(
            self,
            time_step_spec,
            action_spec,
            cloning_network,
            optimizer,
            epsilon_greedy=0.1,
            # Params for training.
            loss_fn=None,
            gradient_clipping=None,
            # Params for debugging
            debug_summaries=False,
            summarize_grads_and_vars=False):
        """Creates an behavioral cloning Agent.

    Args:
      time_step_spec: A `TimeStep` spec of the expected time_steps.
      action_spec: A nest of BoundedTensorSpec representing the actions.
      cloning_network: A tf_agents.network.Network to be used by the agent.
        The network will be called as

          ```
          network(observation, step_type, network_state=None)
          ```
        (with `network_state` optional) and must return a 2-tuple with elements
        `(output, next_network_state)` where `output` will be passed as the
        first argument to `loss_fn`, and used by a `Policy`.  Input tensors will
        be shaped `[batch, time, ...]` when training, and they will be shaped
        `[batch, ...]` when the network is called within a `Policy`.  If
        `cloning_network` has an empty network state, then for training
        `time` will always be `1` (individual examples).
      optimizer: The optimizer to use for training.
      epsilon_greedy: probability of choosing a random action in the default
        epsilon-greedy collect policy (used only if a wrapper is not provided to
        the collect_policy method).
      loss_fn: A function for computing the error between the output of the
        cloning network and the action that was taken. If None, the loss
        depends on the action dtype.  If the dtype is integer, then `loss_fn`
        is

        ```python
        def loss_fn(logits, action):
          return tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=action - action_spec.minimum, logits=logits)
        ```

        If the dtype is floating point, the loss is
        `tf.math.squared_difference`.

        `loss_fn` must return a loss value for each element of the batch.
      gradient_clipping: Norm length to clip gradients.
      debug_summaries: A bool to gather debug summaries.
      summarize_grads_and_vars: If True, gradient and network variable summaries
        will be written during training.

    Raises:
      NotImplementedError: If the action spec contains more than one action.
    """
        flat_action_spec = nest.flatten(action_spec)
        self._num_actions = [
            spec.maximum - spec.minimum + 1 for spec in flat_action_spec
        ]

        # TODO(oars): Get behavioral cloning working with more than one dim in
        # the actions.
        if len(flat_action_spec) > 1:
            raise NotImplementedError(
                'Multi-arity actions are not currently supported.')
        if flat_action_spec[0].dtype.is_floating:
            if loss_fn is None:
                loss_fn = tf.math.squared_difference
        else:
            if flat_action_spec[0].shape.ndims > 1:
                raise NotImplementedError(
                    'Only scalar and one dimensional integer actions are supported.'
                )
            if loss_fn is None:
                # TODO(ebrevdo): Maybe move the subtraction of the minimum into a
                # self._label_fn and rewrite this.
                def xent_loss_fn(logits, actions):
                    # Subtract the minimum so that we get a proper cross entropy loss on
                    # [0, maximum - minimum).
                    return tf.nn.sparse_softmax_cross_entropy_with_logits(
                        logits=logits,
                        labels=actions - flat_action_spec[0].minimum)

                loss_fn = xent_loss_fn

        self._cloning_network = cloning_network
        self._loss_fn = loss_fn
        self._epsilon_greedy = epsilon_greedy
        self._optimizer = optimizer
        self._gradient_clipping = gradient_clipping

        policy = q_policy.QPolicy(time_step_spec,
                                  action_spec,
                                  q_network=self._cloning_network)
        collect_policy = epsilon_greedy_policy.EpsilonGreedyPolicy(
            policy, epsilon=self._epsilon_greedy)
        policy = greedy_policy.GreedyPolicy(policy)

        super(BehavioralCloningAgent,
              self).__init__(time_step_spec,
                             action_spec,
                             policy,
                             collect_policy,
                             train_sequence_length=1
                             if not cloning_network.state_spec else None,
                             debug_summaries=debug_summaries,
                             summarize_grads_and_vars=summarize_grads_and_vars)
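A hedged usage sketch for the constructor above, assuming the standard tf_agents module layout (the fork this snippet comes from may place the classes elsewhere) and an already-built TF environment named `train_env`:

```python
import tensorflow as tf
from tf_agents.networks import q_network
from tf_agents.agents.behavioral_cloning import behavioral_cloning_agent

# A Q-network doubles as the cloning network: it maps observations to logits
# over the discrete actions that the demonstrations contain.
cloning_net = q_network.QNetwork(
    train_env.observation_spec(),
    train_env.action_spec(),
    fc_layer_params=(100,))

agent = behavioral_cloning_agent.BehavioralCloningAgent(
    train_env.time_step_spec(),
    train_env.action_spec(),
    cloning_network=cloning_net,
    optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=1e-3))
agent.initialize()
```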
Code example #25
 def _setup_policy(self, time_step_spec, action_spec):
     policy = Policy(time_step_spec, action_spec, network=self._network)
     collect_policy = epsilon_greedy_policy.EpsilonGreedyPolicy(
         policy, epsilon=self._epsilon_greedy)
     return policy, collect_policy
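Every `_setup_policy` variant in these examples wraps its base policy in `EpsilonGreedyPolicy` for collection. A minimal sketch of the underlying rule (not the tf_agents implementation): with probability `epsilon` take a uniformly random action, otherwise take the greedy action.

```python
import tensorflow as tf

def epsilon_greedy_action(q_values, epsilon, seed=None):
    # q_values: [batch, num_actions]; returns int32 actions of shape [batch].
    batch_size = tf.shape(q_values)[0]
    num_actions = tf.shape(q_values)[1]
    greedy = tf.argmax(q_values, axis=-1, output_type=tf.int32)
    random = tf.random.uniform(
        [batch_size], maxval=num_actions, dtype=tf.int32, seed=seed)
    explore = tf.random.uniform([batch_size], seed=seed) < epsilon
    return tf.where(explore, random, greedy)
```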
Code example #26
  def __init__(
      self,
      time_step_spec,
      action_spec,
      q_network,
      optimizer,
      epsilon_greedy=0.1,
      boltzmann_temperature=None,
      # Params for target network updates
      target_update_tau=1.0,
      target_update_period=1,
      # Params for training.
      td_errors_loss_fn=None,
      gamma=1.0,
      reward_scale_factor=1.0,
      gradient_clipping=None,
      # Params for debugging
      debug_summaries=False,
      summarize_grads_and_vars=False,
      train_step_counter=None,
      name=None):
    """Creates a DQN Agent.

    Args:
      time_step_spec: A `TimeStep` spec of the expected time_steps.
      action_spec: A nest of BoundedTensorSpec representing the actions.
      q_network: A tf_agents.network.Network to be used by the agent. The
        network will be called with call(observation, step_type).
      optimizer: The optimizer to use for training.
      epsilon_greedy: probability of choosing a random action in the default
        epsilon-greedy collect policy (used only if a wrapper is not provided to
        the collect_policy method).
      boltzmann_temperature: Temperature value to use for Boltzmann sampling of
        the actions during data collection. The closer to 0.0, the higher the
        probability of choosing the best action.
      target_update_tau: Factor for soft update of the target networks.
      target_update_period: Period for soft update of the target networks.
      td_errors_loss_fn: A function for computing the TD errors loss. If None, a
        default value of element_wise_huber_loss is used. This function takes as
        input the target and the estimated Q values and returns the loss for
        each element of the batch.
      gamma: A discount factor for future rewards.
      reward_scale_factor: Multiplicative scale for the reward.
      gradient_clipping: Norm length to clip gradients.
      debug_summaries: A bool to gather debug summaries.
      summarize_grads_and_vars: If True, gradient and network variable summaries
        will be written during training.
      train_step_counter: An optional counter to increment every time the train
        op is run.  Defaults to the global_step.
      name: The name of this agent. All variables in this module will fall
        under that name. Defaults to the class name.

    Raises:
      ValueError: If the action spec contains more than one action or action
        spec minimum is not equal to 0.
    """
    tf.Module.__init__(self, name=name)

    flat_action_spec = tf.nest.flatten(action_spec)
    self._num_actions = [
        spec.maximum - spec.minimum + 1 for spec in flat_action_spec
    ]

    # TODO(oars): Get DQN working with more than one dim in the actions.
    if len(flat_action_spec) > 1 or flat_action_spec[0].shape.ndims > 1:
      raise ValueError('Only one dimensional actions are supported now.')

    if not all(spec.minimum == 0 for spec in flat_action_spec):
      raise ValueError(
          'Action specs should have minimum of 0, but saw: {0}'.format(
              [spec.minimum for spec in flat_action_spec]))

    if epsilon_greedy is not None and boltzmann_temperature is not None:
      raise ValueError(
          'Configured both epsilon_greedy value {} and temperature {}, '
          'however only one of them can be used for exploration.'.format(
              epsilon_greedy, boltzmann_temperature))

    self._q_network = q_network
    self._target_q_network = self._q_network.copy(name='TargetQNetwork')
    self._epsilon_greedy = epsilon_greedy
    self._boltzmann_temperature = boltzmann_temperature
    self._optimizer = optimizer
    self._td_errors_loss_fn = td_errors_loss_fn or element_wise_huber_loss
    self._gamma = gamma
    self._reward_scale_factor = reward_scale_factor
    self._gradient_clipping = gradient_clipping
    self._update_target = self._get_target_updater(
        target_update_tau, target_update_period)

    policy = q_policy.QPolicy(
        time_step_spec, action_spec, q_network=self._q_network)

    if boltzmann_temperature is not None:
      collect_policy = boltzmann_policy.BoltzmannPolicy(
          policy, temperature=self._boltzmann_temperature)
    else:
      collect_policy = epsilon_greedy_policy.EpsilonGreedyPolicy(
          policy, epsilon=self._epsilon_greedy)
    policy = greedy_policy.GreedyPolicy(policy)

    super(DqnAgent, self).__init__(
        time_step_spec,
        action_spec,
        policy,
        collect_policy,
        train_sequence_length=2 if not q_network.state_spec else None,
        debug_summaries=debug_summaries,
        summarize_grads_and_vars=summarize_grads_and_vars,
        train_step_counter=train_step_counter)
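The `td_errors_loss_fn` default mentioned in the docstring, `element_wise_huber_loss`, takes the target and estimated Q values and returns a loss per batch element. A hedged sketch of that kind of loss (the actual tf_agents helper may differ in details such as the `delta` threshold):

```python
import tensorflow as tf

def element_wise_huber_loss(targets, predictions, delta=1.0):
    error = targets - predictions
    abs_error = tf.abs(error)
    quadratic = tf.minimum(abs_error, delta)   # quadratic region near zero
    linear = abs_error - quadratic             # linear region for large errors
    return 0.5 * tf.square(quadratic) + delta * linear
```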
Code example #27
File: dqn_agent.py Project: LONG-9621/Stackedcapsule
  def __init__(
      self,
      time_step_spec,
      action_spec,
      q_network,
      optimizer,
      epsilon_greedy=0.1,
      n_step_update=1,
      boltzmann_temperature=None,
      emit_log_probability=False,
      update_period=None,
      # Params for target network updates
      target_update_tau=1.0,
      target_update_period=1,
      # Params for training.
      td_errors_loss_fn=None,
      gamma=1.0,
      reward_scale_factor=1.0,
      gradient_clipping=None,
      # Params for debugging
      debug_summaries=False,
      enable_functions=True,
      summarize_grads_and_vars=False,
      train_step_counter=None,
      name=None):
    """Creates a DQN Agent.

    Args:
      time_step_spec: A `TimeStep` spec of the expected time_steps.
      action_spec: A nest of BoundedTensorSpec representing the actions.
      q_network: A tf_agents.network.Network to be used by the agent. The
        network will be called with call(observation, step_type).
      optimizer: The optimizer to use for training.
      epsilon_greedy: probability of choosing a random action in the default
        epsilon-greedy collect policy (used only if a wrapper is not provided to
        the collect_policy method).
      n_step_update: The number of steps to consider when computing TD error and
        TD loss. Defaults to single-step updates. Note that this requires the
        user to call train on Trajectory objects with a time dimension of
        `n_step_update + 1`. However, note that we do not yet support
        `n_step_update > 1` in the case of RNNs (i.e., non-empty
        `q_network.state_spec`).
      boltzmann_temperature: Temperature value to use for Boltzmann sampling of
        the actions during data collection. The closer to 0.0, the higher the
        probability of choosing the best action.
      emit_log_probability: Whether policies emit log probabilities or not.
      update_period: Update period (forwarded to the base agent).
      target_update_tau: Factor for soft update of the target networks.
      target_update_period: Period for soft update of the target networks.
      td_errors_loss_fn: A function for computing the TD errors loss. If None, a
        default value of element_wise_huber_loss is used. This function takes as
        input the target and the estimated Q values and returns the loss for
        each element of the batch.
      gamma: A discount factor for future rewards.
      reward_scale_factor: Multiplicative scale for the reward.
      gradient_clipping: Norm length to clip gradients.
      debug_summaries: A bool to gather debug summaries.
      enable_functions: A bool to decide whether or not to enable `tf.function`
        wrapping of agent methods.
      summarize_grads_and_vars: If True, gradient and network variable summaries
        will be written during training.
      train_step_counter: An optional counter to increment every time the train
        op is run.  Defaults to the global_step.
      name: The name of this agent. All variables in this module will fall under
        that name. Defaults to the class name.

    Raises:
      ValueError: If the action spec contains more than one action or action
        spec minimum is not equal to 0.
      NotImplementedError: If `q_network` has non-empty `state_spec` (i.e., an
        RNN is provided) and `n_step_update > 1`.
    """
    tf.Module.__init__(self, name=name)

    flat_action_spec = tf.nest.flatten(action_spec)
    self._num_actions = [
        spec.maximum - spec.minimum + 1 for spec in flat_action_spec
    ]

    if len(flat_action_spec) > 1 or flat_action_spec[0].shape.ndims > 1:
      raise ValueError('Only one dimensional actions are supported now.')

    if not all(spec.minimum == 0 for spec in flat_action_spec):
      raise ValueError(
          'Action specs should have minimum of 0, but saw: {0}'.format(
              [spec.minimum for spec in flat_action_spec]))

    if epsilon_greedy is not None and boltzmann_temperature is not None:
      raise ValueError(
          'Configured both epsilon_greedy value {} and temperature {}, '
          'however only one of them can be used for exploration.'.format(
              epsilon_greedy, boltzmann_temperature))

    self._q_network = q_network
    self._target_q_network = self._q_network.copy(name='TargetQNetwork')
    self._epsilon_greedy = epsilon_greedy
    self._n_step_update = n_step_update
    self._boltzmann_temperature = boltzmann_temperature
    self._optimizer = optimizer
    self._td_errors_loss_fn = td_errors_loss_fn or element_wise_huber_loss
    self._gamma = gamma
    self._reward_scale_factor = reward_scale_factor
    self._gradient_clipping = gradient_clipping
    self._update_target = self._get_target_updater(target_update_tau,
                                                   target_update_period)

    policy = q_policy.QPolicy(
        time_step_spec,
        action_spec,
        q_network=self._q_network,
        emit_log_probability=emit_log_probability)

    if boltzmann_temperature is not None:
      collect_policy = boltzmann_policy.BoltzmannPolicy(
          policy, temperature=self._boltzmann_temperature)
    else:
      collect_policy = epsilon_greedy_policy.EpsilonGreedyPolicy(
          policy, epsilon=self._epsilon_greedy)
    policy = greedy_policy.GreedyPolicy(policy)

    if q_network.state_spec and n_step_update != 1:
      raise NotImplementedError(
          'DqnAgent does not currently support n-step updates with stateful '
          'networks (i.e., RNNs), but n_step_update = {}'.format(n_step_update))

    train_sequence_length = (
        n_step_update + 1 if not q_network.state_spec else None)

    super(DqnAgent, self).__init__(
        time_step_spec,
        action_spec,
        policy,
        collect_policy,
        train_sequence_length=train_sequence_length,
        update_period=update_period,
        debug_summaries=debug_summaries,
        enable_functions=enable_functions,
        summarize_grads_and_vars=summarize_grads_and_vars,
        train_step_counter=train_step_counter)

    tf.compat.v1.summary.scalar(
        'epsilon/' + self.name,
        self._epsilon_greedy,
        collections=['train_' + self.name])
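The `n_step_update` argument above changes the TD target from a single-step bootstrap to an n-step return, which is why training trajectories must carry `n_step_update + 1` time steps. A minimal sketch of the kind of target being computed (an assumed form, not lifted from this fork):

```python
import tensorflow as tf

def n_step_td_target(rewards, discounts, bootstrap_q, gamma):
    # rewards, discounts: [batch, n]; bootstrap_q: [batch], the target network's
    # Q-value estimate at step t + n.  Folding right-to-left yields
    # r_t + gamma*d_t*(r_{t+1} + gamma*d_{t+1}*(... + gamma*d_{t+n-1}*Q)).
    target = bootstrap_q
    n = rewards.shape[1]
    for k in reversed(range(n)):
        target = rewards[:, k] + gamma * discounts[:, k] * target
    return target
```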
Code example #28
    def __init__(
            self,
            time_step_spec: types.TimeStep,
            action_spec: types.BoundedTensorSpec,
            reward_network: types.Network,
            optimizer: types.Optimizer,
            epsilon: float,
            observation_and_action_constraint_splitter: Optional[
                types.Splitter] = None,
            accepts_per_arm_features: bool = False,
            constraints: Iterable[constr.NeuralConstraint] = (),
            # Params for training.
            error_loss_fn: types.LossFn = tf.compat.v1.losses.mean_squared_error,
            gradient_clipping: Optional[float] = None,
            # Params for debugging.
            debug_summaries: bool = False,
            summarize_grads_and_vars: bool = False,
            enable_summaries: bool = True,
            emit_policy_info: Tuple[Text, ...] = (),
            train_step_counter: Optional[tf.Variable] = None,
            laplacian_matrix: Optional[types.Float] = None,
            laplacian_smoothing_weight: float = 0.001,
            info_fields_to_inherit_from_greedy: Sequence[Text] = (),
            name: Optional[Text] = None):
        """Creates a Neural Epsilon Greedy Agent.

    For more details about the Laplacian smoothing regularization, please see
    the documentation of the `GreedyRewardPredictionAgent`.

    Args:
      time_step_spec: A `TimeStep` spec of the expected time_steps.
      action_spec: A nest of `BoundedTensorSpec` representing the actions.
      reward_network: A `tf_agents.network.Network` to be used by the agent. The
        network will be called with call(observation, step_type) and it is
        expected to provide a reward prediction for all actions.
        *Note*: when using `observation_and_action_constraint_splitter`, make
        sure the `reward_network` is compatible with the network-specific half
        of the output of the `observation_and_action_constraint_splitter`. In
        particular, `observation_and_action_constraint_splitter` will be called
        on the observation before passing to the network.
      optimizer: The optimizer to use for training.
      epsilon: A float representing the probability of choosing a random action
        instead of the greedy action.
      observation_and_action_constraint_splitter: A function used for masking
        valid/invalid actions with each state of the environment. The function
        takes in a full observation and returns a tuple consisting of 1) the
        part of the observation intended as input to the bandit agent and
        policy, and 2) the boolean mask. This function should also work with a
        `TensorSpec` as input, and should output `TensorSpec` objects for the
        observation and mask.
      accepts_per_arm_features: (bool) Whether the policy accepts per-arm
        features.
      constraints: iterable of constraint objects that are instances of
        `tf_agents.bandits.agents.NeuralConstraint`.
      error_loss_fn: A function for computing the error loss, taking parameters
        labels, predictions, and weights (any function from tf.losses would
        work). The default is `tf.losses.mean_squared_error`.
      gradient_clipping: A float representing the norm length to clip gradients
        (or None for no clipping.)
      debug_summaries: A Python bool, default False. When True, debug summaries
        are gathered.
      summarize_grads_and_vars: A Python bool, default False. When True,
        gradients and network variable summaries are written during training.
      enable_summaries: A Python bool, default True. When False, all summaries
        (debug or otherwise) should not be written.
      emit_policy_info: (tuple of strings) what side information we want to get
        as part of the policy info. Allowed values can be found in
        `policy_utilities.PolicyInfo`.
      train_step_counter: An optional `tf.Variable` to increment every time the
        train op is run.  Defaults to the `global_step`.
      laplacian_matrix: A float `Tensor` shaped `[num_actions, num_actions]`.
        This holds the Laplacian matrix used to regularize the smoothness of the
        estimated expected reward function. This only applies to problems where
        the actions have a graph structure. If `None`, the regularization is not
        applied.
      laplacian_smoothing_weight: A float that determines the weight of the
        regularization term. Note that this has no effect if `laplacian_matrix`
        above is `None`.
      info_fields_to_inherit_from_greedy: List of info fields that are reported
        from the greedy policy even when exploratory action is taken.
      name: Python str name of this agent. All variables in this module will
        fall under that name. Defaults to the class name.

    Raises:
      ValueError: If the action spec contains more than one action or if it is
        not a bounded scalar int32 spec with minimum 0.
    """
        super(NeuralEpsilonGreedyAgent, self).__init__(
            time_step_spec=time_step_spec,
            action_spec=action_spec,
            reward_network=reward_network,
            optimizer=optimizer,
            observation_and_action_constraint_splitter=(
                observation_and_action_constraint_splitter),
            accepts_per_arm_features=accepts_per_arm_features,
            constraints=constraints,
            error_loss_fn=error_loss_fn,
            gradient_clipping=gradient_clipping,
            debug_summaries=debug_summaries,
            summarize_grads_and_vars=summarize_grads_and_vars,
            enable_summaries=enable_summaries,
            emit_policy_info=emit_policy_info,
            train_step_counter=train_step_counter,
            laplacian_matrix=laplacian_matrix,
            laplacian_smoothing_weight=laplacian_smoothing_weight,
            name=name)
        self._policy = epsilon_greedy_policy.EpsilonGreedyPolicy(
            self._policy,
            epsilon=epsilon,
            info_fields_to_inherit_from_greedy=info_fields_to_inherit_from_greedy)
        self._collect_policy = self._policy
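The Laplacian smoothing arguments documented above add a penalty of the form `weight * r^T L r`, where `r` is the vector of predicted rewards and `L` is the graph Laplacian over actions; the term is small when neighboring actions receive similar reward estimates. A hypothetical illustration (the real `GreedyRewardPredictionAgent` implementation may differ):

```python
import tensorflow as tf

def laplacian_smoothing_penalty(predicted_rewards, laplacian_matrix, weight):
    # predicted_rewards: [num_actions]; laplacian_matrix: [num_actions, num_actions].
    r = tf.reshape(predicted_rewards, [-1, 1])
    return weight * tf.squeeze(
        tf.matmul(r, tf.matmul(laplacian_matrix, r), transpose_a=True))
```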
Code example #29
File: tf_env02.py Project: bmwant/chemister
def train():
    global VERBOSE
    environment = TradeEnvironment()
    # utils.validate_py_environment(environment, episodes=5)
    # Environments
    train_env = tf_py_environment.TFPyEnvironment(environment)
    eval_env = tf_py_environment.TFPyEnvironment(environment)

    num_iterations = 50
    fc_layer_params = (512, )  # ~ (17 + 1001) / 2
    input_fc_layer_params = (17, )
    output_fc_layer_params = (20, )
    lstm_size = (17, )
    initial_collect_steps = 20
    collect_steps_per_iteration = 1
    batch_size = 64
    replay_buffer_capacity = 10000

    gamma = 0.99  # check if 1 will work here
    target_update_tau = 0.05
    target_update_period = 5
    epsilon_greedy = 0.1
    reward_scale_factor = 1.0
    learning_rate = 1e-2
    log_interval = 30
    num_eval_episodes = 5
    eval_interval = 15

    # q_net = q_network.QNetwork(
    #     train_env.observation_spec(),
    #     train_env.action_spec(),
    #     fc_layer_params=fc_layer_params,
    # )

    q_net = q_rnn_network.QRnnNetwork(
        train_env.observation_spec(),
        train_env.action_spec(),
        input_fc_layer_params=input_fc_layer_params,
        lstm_size=lstm_size,
        output_fc_layer_params=output_fc_layer_params,
    )

    optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate)

    train_step_counter = tf.compat.v2.Variable(0)

    tf_agent = dqn_agent.DqnAgent(
        train_env.time_step_spec(),
        train_env.action_spec(),
        q_network=q_net,
        optimizer=optimizer,
        epsilon_greedy=epsilon_greedy,
        target_update_tau=target_update_tau,
        target_update_period=target_update_period,
        gamma=gamma,
        reward_scale_factor=reward_scale_factor,
        td_errors_loss_fn=dqn_agent.element_wise_squared_loss,
        train_step_counter=train_step_counter,
        gradient_clipping=None,
        debug_summaries=False,
        summarize_grads_and_vars=False,
    )

    q_policy = FilteredQPolicy(
        tf_agent._time_step_spec,
        tf_agent._action_spec,
        q_network=tf_agent._q_network,
    )

    # Valid policy to pre-fill replay buffer
    dummy_policy = DummyTradePolicy(
        train_env.time_step_spec(),
        train_env.action_spec(),
    )

    # Main agent's policy; greedy one
    policy = greedy_policy.GreedyPolicy(q_policy)
    filtered_random_py_policy = FilteredRandomPyPolicy(
        time_step_spec=policy.time_step_spec,
        action_spec=policy.action_spec,
    )
    filtered_random_tf_policy = tf_py_policy.TFPyPolicy(
        filtered_random_py_policy)
    collect_policy = epsilon_greedy_policy.EpsilonGreedyPolicy(
        q_policy, epsilon=tf_agent._epsilon_greedy)
    # Patch random policy for epsilon greedy collect policy

    filtered_random_tf_policy = FilteredRandomTFPolicy(
        time_step_spec=policy.time_step_spec,
        action_spec=policy.action_spec,
    )
    collect_policy._random_policy = filtered_random_tf_policy

    tf_agent._policy = policy
    tf_agent._collect_policy = collect_policy
    tf_agent.initialize()

    replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
        data_spec=tf_agent.collect_data_spec,
        batch_size=train_env.batch_size,
        max_length=replay_buffer_capacity,
    )
    print(
        'Pre-filling replay buffer in {} steps'.format(initial_collect_steps))
    for _ in range(initial_collect_steps):
        traj = collect_step(train_env, dummy_policy)
        replay_buffer.add_batch(traj)

    dataset = replay_buffer.as_dataset(
        num_parallel_calls=3,
        sample_batch_size=batch_size,
        num_steps=2,
    ).prefetch(3)

    iterator = iter(dataset)
    # Train
    tf_agent.train = common.function(tf_agent.train)

    tf_agent.train_step_counter.assign(0)

    avg_return = compute_avg_return(eval_env, tf_agent.policy,
                                    num_eval_episodes)

    returns = [avg_return]

    print('Starting iterations...')
    for i in range(num_iterations):

        # fill replay buffer
        for _ in range(collect_steps_per_iteration):
            traj = collect_step(train_env, tf_agent.collect_policy)
            # Add trajectory to the replay buffer
            replay_buffer.add_batch(traj)

        experience, _ = next(iterator)
        train_loss = tf_agent.train(experience)

        step = tf_agent.train_step_counter.numpy()

        if step % log_interval == 0:
            print('step = {0}: loss = {1}'.format(step, train_loss.loss))

        if step % eval_interval == 0:
            avg_return = compute_avg_return(eval_env, tf_agent.policy,
                                            num_eval_episodes)
            print('step = {0}: avg return = {1}'.format(step, avg_return))
            returns.append(avg_return)

    print('Finished {} iterations!'.format(num_iterations))

    print('Playing with resulting policy')
    VERBOSE = True
    r = compute_avg_return(eval_env, tf_agent.policy, 1)
    print('Result: {}'.format(r))
    steps = range(0, num_iterations + 1, eval_interval)

    # merged = tf.summary.merge_all()
    # writer = tf.summary.FileWriter(FLAGS.log_dir)
    #
    # writer.close()
    print('Check out chart for learning')
    plt.plot(steps, returns)
    plt.ylabel('Average Return')
    plt.xlabel('Step')
    plt.ylim(top=1000)
    plt.show()
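The `train()` function above calls `collect_step` and `compute_avg_return`, which are not part of the excerpt. Minimal sketches of those helpers in the style of the standard TF-Agents DQN tutorial (assumed, since the project's own definitions are not shown):

```python
from tf_agents.trajectories import trajectory

def collect_step(environment, policy):
    # Take one step in the environment and package it as a Trajectory
    # suitable for `replay_buffer.add_batch`.
    time_step = environment.current_time_step()
    action_step = policy.action(time_step)
    next_time_step = environment.step(action_step.action)
    return trajectory.from_transition(time_step, action_step, next_time_step)

def compute_avg_return(environment, policy, num_episodes=10):
    total_return = 0.0
    for _ in range(num_episodes):
        time_step = environment.reset()
        episode_return = 0.0
        while not time_step.is_last():
            action_step = policy.action(time_step)
            time_step = environment.step(action_step.action)
            episode_return += time_step.reward
        total_return += episode_return
    # Average per-episode undiscounted return as a Python float.
    return (total_return / num_episodes).numpy()[0]
```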