Example #1
    def test_vtrace_from_logits(self, batch_size=2):
        """Tests V-trace calculated from logits."""
        seq_len = 5
        num_actions = 3
        clip_rho_threshold = None  # No clipping.
        clip_pg_rho_threshold = None  # No clipping.

        values = {
            "behavior_policy_logits":
            _shaped_arange(seq_len, batch_size, num_actions),
            "target_policy_logits":
            _shaped_arange(seq_len, batch_size, num_actions),
            "actions":
            np.random.randint(0, num_actions - 1, size=(seq_len, batch_size)),
            "discounts":
            np.array(  # T, B where B_i: [0.9 / (i+1)] * T
                [[0.9 / (b + 1) for b in range(batch_size)]
                 for _ in range(seq_len)],
                dtype=np.float32,
            ),
            "rewards":
            _shaped_arange(seq_len, batch_size),
            "values":
            _shaped_arange(seq_len, batch_size) / batch_size,
            "bootstrap_value":
            _shaped_arange(batch_size) + 1.0,  # B
        }
        values = {k: torch.from_numpy(v) for k, v in values.items()}

        from_logits_output = vtrace.from_logits(
            clip_rho_threshold=clip_rho_threshold,
            clip_pg_rho_threshold=clip_pg_rho_threshold,
            **values,
        )

        target_log_probs = vtrace.action_log_probs(
            values["target_policy_logits"], values["actions"])
        behavior_log_probs = vtrace.action_log_probs(
            values["behavior_policy_logits"], values["actions"])
        log_rhos = target_log_probs - behavior_log_probs

        # Calculate V-trace using the ground truth logits.
        from_iw = vtrace.from_importance_weights(
            log_rhos=log_rhos,
            discounts=values["discounts"],
            rewards=values["rewards"],
            values=values["values"],
            bootstrap_value=values["bootstrap_value"],
            clip_rho_threshold=clip_rho_threshold,
            clip_pg_rho_threshold=clip_pg_rho_threshold,
        )

        assert_allclose(from_iw.vs, from_logits_output.vs)
        assert_allclose(from_iw.pg_advantages,
                        from_logits_output.pg_advantages)
        assert_allclose(behavior_log_probs,
                        from_logits_output.behavior_action_log_probs)
        assert_allclose(target_log_probs,
                        from_logits_output.target_action_log_probs)
        assert_allclose(log_rhos, from_logits_output.log_rhos)
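For reference, `from_importance_weights` implements the V-trace target recursion from Espeholt et al. (2018). The sketch below is a minimal NumPy restatement of that recursion for time-major [T, B] inputs; it mirrors the paper's definition rather than the library's exact code.

import numpy as np

def vtrace_reference(log_rhos, discounts, rewards, values, bootstrap_value,
                     clip_rho_threshold=1.0, clip_pg_rho_threshold=1.0):
    """V-trace targets vs and policy-gradient advantages from the paper's
    recursion. All array arguments are time-major: [T, B]."""
    rhos = np.exp(log_rhos)
    clipped_rhos = rhos if clip_rho_threshold is None else np.minimum(clip_rho_threshold, rhos)
    clipped_pg_rhos = rhos if clip_pg_rho_threshold is None else np.minimum(clip_pg_rho_threshold, rhos)
    cs = np.minimum(1.0, rhos)  # c-bar fixed at 1, as in the paper's defaults.

    # V(x_{t+1}) for every t, with the bootstrap value appended at the end.
    values_t_plus_1 = np.concatenate([values[1:], bootstrap_value[None]], axis=0)
    deltas = clipped_rhos * (rewards + discounts * values_t_plus_1 - values)

    # Backward recursion:
    # vs_t - V(x_t) = delta_t + discount_t * c_t * (vs_{t+1} - V(x_{t+1})).
    acc = np.zeros_like(bootstrap_value)
    vs_minus_v = np.zeros_like(values)
    for t in reversed(range(len(rewards))):
        acc = deltas[t] + discounts[t] * cs[t] * acc
        vs_minus_v[t] = acc
    vs = vs_minus_v + values

    # Policy-gradient advantage uses vs_{t+1} as the bootstrap target.
    vs_t_plus_1 = np.concatenate([vs[1:], bootstrap_value[None]], axis=0)
    pg_advantages = clipped_pg_rhos * (rewards + discounts * vs_t_plus_1 - values)
    return vs, pg_advantages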
Example #2
def learner(model, data, ps, args):
    """Learner to get trajectories from Actors."""
    optimizer = optim.RMSprop(model.parameters(),
                              lr=args.lr,
                              eps=args.epsilon,
                              weight_decay=args.decay,
                              momentum=args.momentum)
    batch_size = args.batch_size
    baseline_cost = args.baseline_cost
    entropy_cost = args.entropy_cost
    gamma = args.gamma
    save_path = args.save_path
    """Gets trajectories from actors and trains learner."""
    batch = []
    best = 0.
    while True:
        trajectory = data.get()
        batch.append(trajectory)
        if torch.cuda.is_available():
            trajectory.cuda()
        if len(batch) < batch_size:
            continue
        behaviour_logits, obs, actions, rewards, dones, hx = make_time_major(
            batch)
        optimizer.zero_grad()
        logits, values = model(obs, actions, rewards, dones, hx=hx)
        bootstrap_value = values[-1]
        actions, behaviour_logits, dones, rewards = actions[
            1:], behaviour_logits[1:], dones[1:], rewards[1:]
        logits, values = logits[:-1], values[:-1]
        discounts = (~dones).float() * gamma
        vs, pg_advantages = vtrace.from_logits(
            behaviour_policy_logits=behaviour_logits,
            target_policy_logits=logits,
            actions=actions,
            discounts=discounts,
            rewards=rewards,
            values=values,
            bootstrap_value=bootstrap_value)
        # policy gradient loss
        # logits are time-major [T, B, num_actions]; cross_entropy expects the
        # class dimension second, hence the permute.
        cross_entropy = F.cross_entropy(
            logits.permute(0, 2, 1), actions, reduction='none')
        loss = (cross_entropy * pg_advantages.detach()).sum()
        # baseline_loss
        loss += baseline_cost * .5 * (vs - values).pow(2).sum()
        # entropy_loss
        loss += entropy_cost * -(-F.softmax(logits, -1) *
                                 F.log_softmax(logits, -1)).sum(-1).sum()
        loss.backward()
        optimizer.step()
        model.cpu()
        ps.push(model.state_dict())
        mean_reward = rewards.mean().item()
        if mean_reward > best:
            best = mean_reward
            torch.save(model.state_dict(), save_path)
        if torch.cuda.is_available():
            model.cuda()
        batch = []
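The loss above combines three terms (policy gradient, baseline regression, entropy bonus). As an illustrative restatement only (not this repo's API), the same objective can be written with torch.distributions, assuming time-major [T, B, num_actions] logits and [T, B] actions, values, vs and pg_advantages.

import torch
from torch.distributions import Categorical

def impala_loss_sketch(logits, actions, vs, values, pg_advantages,
                       baseline_cost=0.5, entropy_cost=0.01):
    """Illustrative three-term IMPALA loss; V-trace outputs treated as constants."""
    dist = Categorical(logits=logits)
    # Policy gradient: -log pi(a_t | s_t) * advantage_t.
    pg_loss = -(dist.log_prob(actions) * pg_advantages.detach()).sum()
    # Baseline regression towards the V-trace targets vs.
    baseline_loss = 0.5 * (vs.detach() - values).pow(2).sum()
    # Entropy bonus: subtracted, so minimizing the loss raises entropy.
    entropy = dist.entropy().sum()
    return pg_loss + baseline_cost * baseline_loss - entropy_cost * entropy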
Example #3
def build_learner(agent, agent_state, env_outputs, agent_outputs):
  """Builds the learner loop.

  Args:
    agent: A snt.RNNCore module outputting `AgentOutput` named tuples, with an
      `unroll` call for computing the outputs for a whole trajectory.
    agent_state: The initial agent state for each sequence in the batch.
    env_outputs: A `StepOutput` namedtuple where each field is of shape
      [T+1, ...].
    agent_outputs: An `AgentOutput` namedtuple where each field is of shape
      [T+1, ...].

  Returns:
    A tuple of (done, infos, and environment frames) where
    the environment frames tensor causes an update.
  """
  learner_outputs, _ = agent.unroll(agent_outputs.action, env_outputs,
                                    agent_state)

  # Use last baseline value (from the value function) to bootstrap.
  bootstrap_value = learner_outputs.baseline[-1]

  # At this point, the environment outputs at time step `t` are the inputs that
  # lead to the learner_outputs at time step `t`. After the following shifting,
  # the actions in agent_outputs and learner_outputs at time step `t` is what
  # leads to the environment outputs at time step `t`.
  agent_outputs = nest.map_structure(lambda t: t[1:], agent_outputs)
  rewards, infos, done, _ = nest.map_structure(
      lambda t: t[1:], env_outputs)
  learner_outputs = nest.map_structure(lambda t: t[:-1], learner_outputs)

  if FLAGS.reward_clipping == 'abs_one':
    clipped_rewards = tf.clip_by_value(rewards, -1, 1)
  elif FLAGS.reward_clipping == 'soft_asymmetric':
    squeezed = tf.tanh(rewards / 5.0)
    # Negative rewards are given less weight than positive rewards.
    clipped_rewards = tf.where(rewards < 0, .3 * squeezed, squeezed) * 5.

  discounts = tf.to_float(~done) * FLAGS.discounting

  # Compute V-trace returns and weights.
  # Note, this is put on the CPU because it's faster than on GPU. It can be
  # improved further with XLA-compilation or with a custom TensorFlow operation.
  with tf.device('/cpu'):
    vtrace_returns = vtrace.from_logits(
        behaviour_policy_logits=agent_outputs.policy_logits,
        target_policy_logits=learner_outputs.policy_logits,
        actions=agent_outputs.action,
        discounts=discounts,
        rewards=clipped_rewards,
        values=learner_outputs.baseline,
        bootstrap_value=bootstrap_value)

  # Compute loss as a weighted sum of the baseline loss, the policy gradient
  # loss and an entropy regularization term.
  total_loss = compute_policy_gradient_loss(
      learner_outputs.policy_logits, agent_outputs.action,
      vtrace_returns.pg_advantages)
  total_loss += FLAGS.baseline_cost * compute_baseline_loss(
      vtrace_returns.vs - learner_outputs.baseline)
  total_loss += FLAGS.entropy_cost * compute_entropy_loss(
      learner_outputs.policy_logits)

  # Optimization
  num_env_frames = tf.train.get_global_step()
  learning_rate = tf.train.polynomial_decay(FLAGS.learning_rate, num_env_frames,
                                            FLAGS.total_environment_frames, 0)
  optimizer = tf.train.RMSPropOptimizer(learning_rate, FLAGS.decay,
                                        FLAGS.momentum, FLAGS.epsilon)
  train_op = optimizer.minimize(total_loss)

  # Merge updating the network and environment frames into a single tensor.
  with tf.control_dependencies([train_op]):
    num_env_frames_and_train = num_env_frames.assign_add(
        FLAGS.batch_size * FLAGS.unroll_length * FLAGS.num_action_repeats)

  # Adding a few summaries.
  tf.summary.scalar('learning_rate', learning_rate)
  tf.summary.scalar('total_loss', total_loss)
  tf.summary.histogram('action', agent_outputs.action)

  return done, infos, num_env_frames_and_train
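The helpers `compute_policy_gradient_loss`, `compute_baseline_loss` and `compute_entropy_loss` are not shown on this page. The sketch below is consistent with how they are used above and mirrors the helpers in DeepMind's scalable_agent; treat the exact definitions as an assumption.

import tensorflow as tf  # TF1-style API, as in the examples above


def compute_baseline_loss(advantages):
  # 0.5 * sum of squared differences between V-trace targets and the baseline.
  return .5 * tf.reduce_sum(tf.square(advantages))


def compute_entropy_loss(logits):
  # Negative entropy, so adding it with a positive cost encourages exploration.
  policy = tf.nn.softmax(logits)
  log_policy = tf.nn.log_softmax(logits)
  entropy_per_timestep = tf.reduce_sum(-policy * log_policy, axis=-1)
  return -tf.reduce_sum(entropy_per_timestep)


def compute_policy_gradient_loss(logits, actions, advantages):
  cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
      labels=actions, logits=logits)
  advantages = tf.stop_gradient(advantages)
  return tf.reduce_sum(cross_entropy * advantages)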
Example #4
    def test_vtrace_from_logits(self, batch_size):
        """Tests V-trace calculated from logits."""
        seq_len = 5
        num_actions = 3
        clip_rho_threshold = None  # No clipping.
        clip_pg_rho_threshold = None  # No clipping.

        dummy_config = {"model": None}

        # Intentionally leaving shapes unspecified to test if V-trace can
        # deal with that.
        placeholders = {
            # T, B, NUM_ACTIONS
            "behaviour_policy_logits": tf.placeholder(
                dtype=tf.float32, shape=[None, None, None]),
            # T, B, NUM_ACTIONS
            "target_policy_logits": tf.placeholder(
                dtype=tf.float32, shape=[None, None, None]),
            "actions": tf.placeholder(dtype=tf.int32, shape=[None, None]),
            "discounts": tf.placeholder(dtype=tf.float32, shape=[None, None]),
            "rewards": tf.placeholder(dtype=tf.float32, shape=[None, None]),
            "values": tf.placeholder(dtype=tf.float32, shape=[None, None]),
            "bootstrap_value": tf.placeholder(dtype=tf.float32, shape=[None]),
        }

        from_logits_output = vtrace.from_logits(
            clip_rho_threshold=clip_rho_threshold,
            clip_pg_rho_threshold=clip_pg_rho_threshold,
            config=dummy_config,
            **placeholders)

        target_log_probs = vtrace.log_probs_from_logits_and_actions(
            placeholders["target_policy_logits"], placeholders["actions"],
            dummy_config)
        behaviour_log_probs = vtrace.log_probs_from_logits_and_actions(
            placeholders["behaviour_policy_logits"], placeholders["actions"],
            dummy_config)
        log_rhos = target_log_probs - behaviour_log_probs
        ground_truth = (log_rhos, behaviour_log_probs, target_log_probs)

        values = {
            "behaviour_policy_logits": _shaped_arange(seq_len, batch_size,
                                                      num_actions),
            "target_policy_logits": _shaped_arange(seq_len, batch_size,
                                                   num_actions),
            "actions": np.random.randint(
                0, num_actions - 1, size=(seq_len, batch_size)),
            "discounts": np.array(  # T, B where B_i: [0.9 / (i+1)] * T
                [[0.9 / (b + 1) for b in range(batch_size)]
                 for _ in range(seq_len)]),
            "rewards": _shaped_arange(seq_len, batch_size),
            "values": _shaped_arange(seq_len, batch_size) / batch_size,
            "bootstrap_value": _shaped_arange(batch_size) + 1.0,  # B
        }

        feed_dict = {placeholders[k]: v for k, v in values.items()}
        with self.test_session() as session:
            from_logits_output_v = session.run(
                from_logits_output, feed_dict=feed_dict)
            (ground_truth_log_rhos, ground_truth_behaviour_action_log_probs,
             ground_truth_target_action_log_probs) = session.run(
                 ground_truth, feed_dict=feed_dict)

        # Calculate V-trace using the ground truth logits.
        from_iw = vtrace.from_importance_weights(
            log_rhos=ground_truth_log_rhos,
            discounts=values["discounts"],
            rewards=values["rewards"],
            values=values["values"],
            bootstrap_value=values["bootstrap_value"],
            clip_rho_threshold=clip_rho_threshold,
            clip_pg_rho_threshold=clip_pg_rho_threshold)

        with self.test_session() as session:
            from_iw_v = session.run(from_iw)

        self.assertAllClose(from_iw_v.vs, from_logits_output_v.vs)
        self.assertAllClose(from_iw_v.pg_advantages,
                            from_logits_output_v.pg_advantages)
        self.assertAllClose(ground_truth_behaviour_action_log_probs,
                            from_logits_output_v.behaviour_action_log_probs)
        self.assertAllClose(ground_truth_target_action_log_probs,
                            from_logits_output_v.target_action_log_probs)
        self.assertAllClose(ground_truth_log_rhos,
                            from_logits_output_v.log_rhos)
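The `_shaped_arange` helper used throughout these tests is not defined on this page; in the upstream V-trace tests it is essentially an arange reshaped to the requested shape. A sketch (the exact scaling in the original may differ):

import numpy as np

def _shaped_arange(*shape):
    """Deterministic test data: 0, 1, 2, ... reshaped to `shape`."""
    return np.arange(np.prod(shape), dtype=np.float32).reshape(*shape)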
Example #5
def build_learner(agent, env_outputs, agent_outputs, env_id):
  """Builds the learner loop.

  Args:
    agent: A snt.RNNCore module outputting `AgentOutput` named tuples, with an
      `unroll` call for computing the outputs for a whole trajectory.
    env_outputs: A `StepOutput` namedtuple where each field is of shape
      [T+1, ...].
    agent_outputs: An `AgentOutput` namedtuple where each field is of shape
      [T+1, ...].
    env_id: Integer id of the game for each sequence in the batch, used to
      select the per-game value-function statistics.

  Returns:
    A tuple of (done, infos, environment frames) where the environment frames
    tensor causes an update, followed by the outputs of `agent.update_moments`.
  """

  # Need to map the game name, e.g. 'BreakoutNoFrameSkip-v4', to an integer.
  def get_single_game_info(_tuple):
    single_env_id, game_info = _tuple
    return game_info[single_env_id]

  # Retrieve the specific games in the current batch. 
  def get_batch_value(batch):
    return tf.map_fn(get_single_game_info, (env_id, batch), dtype=tf.float32)

  learner_outputs = agent.unroll(agent_outputs.action, env_outputs)
  un_normalized_vf = learner_outputs.un_normalized_vf
  normalized_vf = learner_outputs.normalized_vf

  game_specific_un_normalized_vf = tf.map_fn(
      get_batch_value, un_normalized_vf, dtype=tf.float32)
  game_specific_normalized_vf = tf.map_fn(
      get_batch_value, normalized_vf, dtype=tf.float32)

  # Ensure the learner separates the value functions for each game,
  # following equation (10) in (Hessel et al., 2018).
  learner_outputs = learner_outputs._replace(
      un_normalized_vf=game_specific_un_normalized_vf,
      normalized_vf=game_specific_normalized_vf)
  # Use last baseline value (from the value function) to bootstrap.
  bootstrap_value = learner_outputs.un_normalized_vf[-1]
 
  # At this point, the environment outputs at time step `t` are the inputs that
  # lead to the learner_outputs at time step `t`. After the following shifting,
  # the actions in agent_outputs and learner_outputs at time step `t` is what
  # leads to the environment outputs at time step `t`.
  agent_outputs = nest.map_structure(lambda t: t[1:], agent_outputs)
  rewards, infos, done, _ = nest.map_structure(
      lambda t: t[1:], env_outputs)
  learner_outputs = nest.map_structure(lambda t: t[:-1], learner_outputs)

  if FLAGS.reward_clipping == 'abs_one':
    clipped_rewards = tf.clip_by_value(rewards, -1, 1)
  elif FLAGS.reward_clipping == 'soft_asymmetric':
    squeezed = tf.tanh(rewards / 5.0)
    # Negative rewards are given less weight than positive rewards.
    clipped_rewards = tf.where(rewards < 0, .3 * squeezed, squeezed) * 5.

  discounts = tf.to_float(~done) * FLAGS.discounting
  game_specific_mean = tf.gather(agent._mean, env_id)
  game_specific_std = tf.gather(agent._std, env_id)

  # Compute V-trace returns and weights.
  # Note, this is put on the CPU because it's faster than on GPU. It can be
  # improved further with XLA-compilation or with a custom TensorFlow operation.
  with tf.device('/cpu'):
    vtrace_returns = vtrace.from_logits(
        behaviour_policy_logits=agent_outputs.policy_logits,
        target_policy_logits=learner_outputs.policy_logits,
        actions=agent_outputs.action,
        discounts=discounts,
        rewards=clipped_rewards,
        un_normalized_values=learner_outputs.un_normalized_vf,
        normalized_values=learner_outputs.normalized_vf,
        mean=game_specific_mean,
        std=game_specific_std,
        bootstrap_value=bootstrap_value)

  # First term of equation (7) in (Hessel et al., 2018)
  normalized_vtrace = (vtrace_returns.vs - game_specific_mean) / game_specific_std

  normalized_vtrace = nest.map_structure(tf.stop_gradient, normalized_vtrace)


  # Compute loss as a weighted sum of the baseline loss, the policy gradient
  # loss and an entropy regularization term.
  total_loss = compute_policy_gradient_loss(
      learner_outputs.policy_logits, agent_outputs.action,
      vtrace_returns.pg_advantages)

  baseline_loss = compute_baseline_loss(
       normalized_vtrace - learner_outputs.normalized_vf)
  # Average the baseline loss over the unroll length.
  baseline_loss = tf.divide(baseline_loss, FLAGS.unroll_length)

  total_loss += FLAGS.baseline_cost * baseline_loss
  total_loss += FLAGS.entropy_cost * compute_entropy_loss(
      learner_outputs.policy_logits)

  # Optimization
  num_env_frames = tf.train.get_global_step()

  learning_rate = tf.train.polynomial_decay(FLAGS.learning_rate, num_env_frames,
                                            FLAGS.total_environment_frames, 0)

  optimizer = tf.train.RMSPropOptimizer(learning_rate, FLAGS.decay,
                                        FLAGS.momentum, FLAGS.epsilon)

  # Optionally clip gradients by global norm.
  if FLAGS.gradient_clipping > 0.0:
    variables = tf.trainable_variables()
    gradients = tf.gradients(total_loss, variables)
    gradients, _ = tf.clip_by_global_norm(gradients, FLAGS.gradient_clipping)
    train_op = optimizer.apply_gradients(zip(gradients, variables))
  else:
    train_op = optimizer.minimize(total_loss)

  # Merge updating the network and environment frames into a single tensor.
  with tf.control_dependencies([train_op]):
    num_env_frames_and_train = num_env_frames.assign_add(
        FLAGS.batch_size * FLAGS.unroll_length)

  # Adding a few summaries.
  tf.summary.scalar('learning_rate', learning_rate)
  tf.summary.scalar('total_loss', total_loss)
  tf.summary.histogram('action', agent_outputs.action)

  return (done, infos, num_env_frames_and_train) + (agent.update_moments(vtrace_returns.vs, env_id))
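`agent.update_moments` and the per-game `agent._mean` / `agent._std` used above are not shown; they correspond to the PopArt statistics of Hessel et al. (2018). Below is a minimal NumPy sketch of such a per-game moment update; the class name, the decay constant and the averaging over T are illustrative assumptions, not the repo's implementation.

import numpy as np

class PopArtMoments:
    """Per-game running moments in the spirit of PopArt (Hessel et al., 2018)."""

    def __init__(self, num_games, beta=3e-4):
        self.beta = beta
        self.mean = np.zeros(num_games, dtype=np.float32)     # first moment
        self.mean_sq = np.ones(num_games, dtype=np.float32)   # second moment

    def update(self, vtrace_vs, game_ids):
        # vtrace_vs: [T, B] un-normalized value targets; game_ids: [B] ints.
        # Averaging over T is a simplification; per-step updates also work.
        targets = np.asarray(vtrace_vs).mean(axis=0)
        for g, v in zip(game_ids, targets):
            self.mean[g] += self.beta * (v - self.mean[g])
            self.mean_sq[g] += self.beta * (v * v - self.mean_sq[g])

    def std(self, game_ids):
        var = self.mean_sq[game_ids] - self.mean[game_ids] ** 2
        return np.sqrt(np.clip(var, 1e-4, 1e6))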
Example #6
  def test_vtrace_from_logits(self, batch_size):
    """Tests V-trace calculated from logits."""
    seq_len = 5
    num_actions = 3
    clip_rho_threshold = None  # No clipping.
    clip_pg_rho_threshold = None  # No clipping.

    # Intentionally leaving shapes unspecified to test if V-trace can
    # deal with that.
    placeholders = {
        # T, B, NUM_ACTIONS
        'behaviour_policy_logits':
            tf.placeholder(dtype=tf.float32, shape=[None, None, None]),
        # T, B, NUM_ACTIONS
        'target_policy_logits':
            tf.placeholder(dtype=tf.float32, shape=[None, None, None]),
        'actions':
            tf.placeholder(dtype=tf.int32, shape=[None, None]),
        'discounts':
            tf.placeholder(dtype=tf.float32, shape=[None, None]),
        'rewards':
            tf.placeholder(dtype=tf.float32, shape=[None, None]),
        'values':
            tf.placeholder(dtype=tf.float32, shape=[None, None]),
        'bootstrap_value':
            tf.placeholder(dtype=tf.float32, shape=[None]),
    }

    from_logits_output = vtrace.from_logits(
        clip_rho_threshold=clip_rho_threshold,
        clip_pg_rho_threshold=clip_pg_rho_threshold,
        **placeholders)

    target_log_probs = vtrace.log_probs_from_logits_and_actions(
        placeholders['target_policy_logits'], placeholders['actions'])
    behaviour_log_probs = vtrace.log_probs_from_logits_and_actions(
        placeholders['behaviour_policy_logits'], placeholders['actions'])
    log_rhos = target_log_probs - behaviour_log_probs
    ground_truth = (log_rhos, behaviour_log_probs, target_log_probs)

    values = {
        'behaviour_policy_logits':
            _shaped_arange(seq_len, batch_size, num_actions),
        'target_policy_logits':
            _shaped_arange(seq_len, batch_size, num_actions),
        'actions':
            np.random.randint(0, num_actions - 1, size=(seq_len, batch_size)),
        'discounts':
            np.array(  # T, B where B_i: [0.9 / (i+1)] * T
                [[0.9 / (b + 1)
                  for b in range(batch_size)]
                 for _ in range(seq_len)]),
        'rewards':
            _shaped_arange(seq_len, batch_size),
        'values':
            _shaped_arange(seq_len, batch_size) / batch_size,
        'bootstrap_value':
            _shaped_arange(batch_size) + 1.0,  # B
    }

    feed_dict = {placeholders[k]: v for k, v in values.items()}
    with self.test_session() as session:
      from_logits_output_v = session.run(
          from_logits_output, feed_dict=feed_dict)
      (ground_truth_log_rhos, ground_truth_behaviour_action_log_probs,
       ground_truth_target_action_log_probs) = session.run(
           ground_truth, feed_dict=feed_dict)

    # Calculate V-trace using the ground truth logits.
    from_iw = vtrace.from_importance_weights(
        log_rhos=ground_truth_log_rhos,
        discounts=values['discounts'],
        rewards=values['rewards'],
        values=values['values'],
        bootstrap_value=values['bootstrap_value'],
        clip_rho_threshold=clip_rho_threshold,
        clip_pg_rho_threshold=clip_pg_rho_threshold)

    with self.test_session() as session:
      from_iw_v = session.run(from_iw)

    self.assertAllClose(from_iw_v.vs, from_logits_output_v.vs)
    self.assertAllClose(from_iw_v.pg_advantages,
                        from_logits_output_v.pg_advantages)
    self.assertAllClose(ground_truth_behaviour_action_log_probs,
                        from_logits_output_v.behaviour_action_log_probs)
    self.assertAllClose(ground_truth_target_action_log_probs,
                        from_logits_output_v.target_action_log_probs)
    self.assertAllClose(ground_truth_log_rhos, from_logits_output_v.log_rhos)
Example #7
    def training_process(self):

        # sample a batch of trajectories from memory and stack them, default batch_size=16
        # dim of trajectories: (batch, seq_len, -1)
        transitions = self.replay_memory.sample()
        batch = Transition(*zip(*transitions))
        state_batch = torch.stack(batch.state, dim=0)
        action_batch = torch.stack(batch.action, dim=0)
        reward_batch = torch.stack(batch.reward, dim=0)
        done_batch = torch.stack(batch.done, dim=0)
        behavior_logits_batch = torch.stack(batch.logits, dim=0)

        # make time major, dim of trajectories: (seq_len, batch, -1), for further computation
        state_batch = torch.transpose(state_batch, 0, 1)
        action_batch = torch.transpose(action_batch, 0, 1)
        reward_batch = torch.transpose(reward_batch, 0, 1)
        done_batch = torch.transpose(done_batch, 0, 1)
        if len(behavior_logits_batch.shape) == 4:
            # in case logits in (batch, seq_len, 1, #num_action), squeeze then permute it to (seq, batch, #num_action)
            behavior_logits_batch = behavior_logits_batch.squeeze(2)
        behavior_logits_batch = behavior_logits_batch.permute(1, 0, 2)

        # feed in to neural network, get learner output
        target_logits, baseline = self.agent(x=state_batch,
                                             action=action_batch,
                                             reward=reward_batch,
                                             dones=done_batch,
                                             core_state=None,
                                             isactor=False)

        # make time major of learner output
        target_logits = target_logits.permute(1, 0, 2)
        baseline = torch.transpose(baseline, 0, 1)

        # Use last baseline value (from the baseline function) to bootstrap.
        bootstrap_value = baseline[-1]

        # At this point, the environment outputs at time step `t` are the inputs that
        # lead to the learner_outputs at time step `t`. After the following shifting,
        # the actions in agent_outputs and learner_outputs at time step `t` is what
        # leads to the environment outputs at time step `t`.
        actions = action_batch.view(action_batch.shape[0], -1).type(torch.long)[1:]
        behaviour_logits = behavior_logits_batch[1:]
        rewards = reward_batch.view(reward_batch.shape[0], -1)[1:]
        dones = done_batch.view(done_batch.shape[0], -1)[1:]

        target_logits, baseline = target_logits[:-1], baseline[:-1]

        discounts = (~dones).float() * self.gamma

        vs, pg_advantages = vtrace.from_logits(
            behaviour_policy_logits=behaviour_logits,
            target_policy_logits=target_logits,
            actions=actions,
            discounts=discounts,
            rewards=rewards,
            values=baseline,
            bootstrap_value=bootstrap_value)

        self.optimizer.zero_grad()

        criterion = agent.MyLoss()
        loss = criterion.compute_policy_gradient_loss(
            target_logits, actions, pg_advantages)  # policy_gradient_loss
        loss += self.baseline_cost * criterion.compute_baseline_loss(
            vs=vs, baseline=baseline)  # baseline_loss
        loss += self.entropy_cost * criterion.compute_entropy_loss(
            target_logits)  # entropy regularization

        # Note: unlike a supervised-learning loss, the value of the RL loss only
        # reflects the magnitude and direction of the update (reward or
        # punishment), not model quality.
        # The commented-out block below compares our V-trace and loss against
        # the TensorFlow IMPALA reference implementation.
        # vtrace_tf.from_logits(
        #     behaviour_policy_logits=tf.convert_to_tensor(behaviour_logits.detach().numpy()),
        #     target_policy_logits=tf.convert_to_tensor(target_logits.detach().numpy()),
        #     actions=tf.convert_to_tensor(actions.int().detach().numpy()),
        #     discounts=tf.convert_to_tensor(discounts.detach().numpy()),
        #     rewards=tf.convert_to_tensor(rewards.detach().numpy()),
        #     values1=tf.convert_to_tensor(baseline.detach().numpy()),
        #     bootstrap_value=tf.convert_to_tensor(bootstrap_value.detach().numpy()))
        # # tf vs, tf pg_advantages will be printed in vtrace_tf.py
        # print('torch vs', vs)
        # print('torch pg_advantages', pg_advantages)

        # tf_loss = vtrace_tf.compute_policy_gradient_loss(tf.convert_to_tensor(target_logits.detach().numpy()),
        #                                                      tf.convert_to_tensor(actions.detach().numpy()),
        #                                                     tf.convert_to_tensor(pg_advantages.detach().numpy())) \
        #     + self.baseline_cost * vtrace_tf.compute_baseline_loss(tf.convert_to_tensor(vs.detach().numpy()),
        #                                                            tf.convert_to_tensor(baseline.detach().numpy())) \
        #     + self.entropy_cost * vtrace_tf.compute_entropy_loss(tf.convert_to_tensor(target_logits.detach().numpy()))
        # print('torch loss', loss)
        # print('tf loss', tf_loss)

        loss.backward()
        self.optimizer.step()
        self.loss_dict.append(loss.item())

        return
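The shift-by-one bookkeeping explained in the comments of Examples #2 and #7 (drop the first actor step, drop the last learner step) can be restated as a tiny helper for clarity; this is illustrative only, not part of either repo.

def shift_for_vtrace(actions, behaviour_logits, rewards, dones,
                     target_logits, values):
    """Align tensors so that index t pairs the action a_t with the reward and
    done flag it produced, and with the learner outputs that predicted it."""
    return (actions[1:], behaviour_logits[1:], rewards[1:], dones[1:],
            target_logits[:-1], values[:-1])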
Example #8
def test_vtrace_from_logit():
    """V-trace를 로짓에서 계산 테스트."""
    seq_len = 5  # n-step
    num_actions = 3
    batch_size = 2
    clip_rho_threshold = None  # No clipping.
    clip_pg_rho_threshold = None  # No clipping.

    np.random.seed(0)
    values = {
        'behavior_policy_logits':
        _shaped_arange(seq_len, batch_size, num_actions),
        'target_policy_logits':
        _shaped_arange(seq_len, batch_size, num_actions),
        'actions':
        np.random.randint(0, num_actions - 1, size=(seq_len, batch_size)),
        'discounts':
        np.array(  # T, B where B_i: [0.9 / (i+1)] * T
            [[0.9 / (b + 1) for b in range(batch_size)]
             for _ in range(seq_len)]),
        'rewards':
        _shaped_arange(seq_len, batch_size),
        'values':
        _shaped_arange(seq_len, batch_size) / batch_size,
        'bootstrap_value':
        _shaped_arange(batch_size) + 1.0,  # B
    }

    from_logit_output = vtrace.from_logits(
        clip_rho_threshold=clip_rho_threshold,
        clip_pg_rho_threshold=clip_pg_rho_threshold,
        **values)

    ground_truth_target_log_probs = vtrace.log_probs_from_logits_and_actions(
        values['target_policy_logits'], values['actions'])
    ground_truth_behavior_log_probs = vtrace.log_probs_from_logits_and_actions(
        values['behavior_policy_logits'], values['actions'])
    ground_truth_log_rhos = ground_truth_target_log_probs - \
        ground_truth_behavior_log_probs

    from_iw = vtrace.from_importance_weights(
        log_rhos=ground_truth_log_rhos,
        discounts=values['discounts'],
        rewards=values['rewards'],
        values=values['values'],
        bootstrap_value=values['bootstrap_value'],
        clip_rho_threshold=clip_rho_threshold,
        clip_pg_rho_threshold=clip_pg_rho_threshold)

    # importance-weights result == from-logits result == ground truth
    for g, o in zip(from_iw.vs, from_logit_output.vs):
        assert np.allclose(g, o.data.tolist())
    for g, o in zip(from_iw.pg_advantages, from_logit_output.pg_advantages):
        assert np.allclose(g, o.data.tolist())
    for g, o in zip(ground_truth_behavior_log_probs,
                    from_logit_output.behavior_action_log_probs):
        assert np.allclose(g, o.data.tolist())
    for g, o in zip(ground_truth_target_log_probs,
                    from_logit_output.target_action_log_probs):
        assert np.allclose(g, o.data.tolist())
    for g, o in zip(ground_truth_log_rhos, from_logit_output.log_rhos):
        assert np.allclose(g, o.data.tolist())

    logits = torch.Tensor(values['behavior_policy_logits'])
    actions = torch.LongTensor(values['actions'])
    advantages = from_iw.pg_advantages
    loss = calc_loss(logits, actions, advantages)
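`calc_loss` is not defined on this page; the following is a hypothetical stand-in consistent with how it is called above, i.e. a plain policy-gradient loss from [T, B, A] logits, [T, B] long actions and [T, B] advantages.

import torch
import torch.nn.functional as F

def calc_loss(logits, actions, advantages):
    """Hypothetical: -sum(log pi(a_t | s_t) * advantage_t) over all steps."""
    log_probs = F.log_softmax(logits, dim=-1)                        # [T, B, A]
    taken = log_probs.gather(-1, actions.unsqueeze(-1)).squeeze(-1)  # [T, B]
    advantages = torch.as_tensor(advantages, dtype=taken.dtype)
    return -(taken * advantages.detach()).sum()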
Example #10
def build_learner(agent, agent_state, env_outputs, agent_outputs,
                  teacher_task_ph):
  """Builds the learner loop.

  Args:
    agent: A snt.RNNCore module outputting `AgentOutput` named tuples, with an
      `unroll` call for computing the outputs for a whole trajectory.
    agent_state: The initial agent state for each sequence in the batch.
    env_outputs: A `StepOutput` namedtuple where each field is of shape
      [T+1, ...].
    agent_outputs: An `AgentOutput` namedtuple where each field is of shape
      [T+1, ...].
    teacher_task_ph: Placeholder holding the task currently selected by the
      Teacher; used to mask out minibatch parts belonging to other tasks.

  Returns:
    A tuple of (done, infos, environment frames, progress signal) where
    the environment frames tensor causes an update.
  """
  learner_outputs, _ = agent.unroll(agent_outputs.action, env_outputs,
                                    agent_state)
  teacher_selected_task = tf.identity(teacher_task_ph)

  # Use last baseline value (from the value function) to bootstrap.
  bootstrap_value = learner_outputs.baseline[-1]

  # At this point, the environment outputs at time step `t` are the inputs that
  # lead to the learner_outputs at time step `t`. After the following shifting,
  # the actions in agent_outputs and learner_outputs at time step `t` is what
  # leads to the environment outputs at time step `t`.
  agent_outputs = nest.map_structure(lambda t: t[1:], agent_outputs)
  rewards, infos, done, _ = nest.map_structure(
      lambda t: t[1:], env_outputs)
  learner_outputs = nest.map_structure(lambda t: t[:-1], learner_outputs)

  if FLAGS.reward_clipping == 'abs_one':
    clipped_rewards = tf.clip_by_value(rewards, -1, 1)
  elif FLAGS.reward_clipping == 'soft_asymmetric':
    squeezed = tf.tanh(rewards / 5.0)
    # Negative rewards are given less weight than positive rewards.
    # We don't have negative rewards, so this branch is effectively redundant here.
    clipped_rewards = tf.where(rewards < 0, .3 * squeezed, squeezed) * 5.

  discounts = tf.to_float(~done) * FLAGS.discounting

  # Compute V-trace returns and weights.
  # Note, this is put on the CPU because it's faster than on GPU. It can be
  # improved further with XLA-compilation or with a custom TensorFlow operation.
  with tf.device('/cpu'):
    vtrace_returns = vtrace.from_logits(
        behaviour_policy_logits=agent_outputs.policy_logits,
        target_policy_logits=learner_outputs.policy_logits,
        actions=agent_outputs.action,
        discounts=discounts,
        rewards=clipped_rewards,
        values=learner_outputs.baseline,
        bootstrap_value=bootstrap_value)

  # Compute loss as a weighted sum of the baseline loss, the policy gradient
  # loss and an entropy regularization term.
  total_loss = compute_policy_gradient_loss(
      learner_outputs.policy_logits, agent_outputs.action,
      vtrace_returns.pg_advantages)
  total_loss += FLAGS.baseline_cost * compute_baseline_loss(
      vtrace_returns.vs - learner_outputs.baseline)
  total_loss += FLAGS.entropy_cost * compute_entropy_loss(
      learner_outputs.policy_logits)

  # Optimization
  num_env_frames = tf.train.get_global_step()
  learning_rate = tf.train.polynomial_decay(FLAGS.learning_rate, num_env_frames,
                                            FLAGS.total_environment_frames, 0)
  optimizer = tf.train.RMSPropOptimizer(learning_rate, FLAGS.decay,
                                        FLAGS.momentum, FLAGS.epsilon)
  train_op = optimizer.minimize(total_loss)

  # Compute progress signal
  if FLAGS.progress_signal == 'reward':
    # Keep returns at end of episodes.
    # Discard parts of the minibatch using other tasks than what the Teacher
    # expects.
    episode_returns_correct_task = tf.boolean_mask(
        rewards,
        tf.logical_and(done, tf.equal(infos.task_name, teacher_selected_task)))
    progress_signal = tf.where(
        tf.size(episode_returns_correct_task) > 0,
        x=tf.reduce_mean(episode_returns_correct_task, name='progress_reward'),
        y=0.)  # float literal to match the dtype of x
  elif FLAGS.progress_signal == 'advantage':
    # For Advantage, we will compute returns[t] - returns[t-k] below, when
    # preparing to update the Teacher.
    # So just return reward[t] (again handling the wrong tasks parts)
    episode_returns_correct_task = tf.boolean_mask(
        rewards,
        tf.logical_and(done, tf.equal(infos.task_name, teacher_selected_task)))
    progress_signal = tf.where(
        tf.size(episode_returns_correct_task) > 0,
        x=tf.reduce_mean(episode_returns_correct_task, name='progress_reward'),
        y=0.)  # float literal to match the dtype of x
  elif FLAGS.progress_signal == 'gradient_norm':
    # compute norm of gradients as the progress signal
    params = tf.trainable_variables()
    gradients = tf.gradients(total_loss, params)
    gradient_norm = tf.global_norm(gradients)
    # TODO renormalize gradients hack, should be done adaptively...
    progress_signal = tf.divide(
        gradient_norm, 500., name='progress_gradient_norm')
  else:
    progress_signal = tf.constant(0.)

  # Merge updating the network and environment frames into a single tensor.
  with tf.control_dependencies([train_op]):
    num_env_frames_and_train = num_env_frames.assign_add(
        FLAGS.batch_size * FLAGS.unroll_length * FLAGS.num_action_repeats)

  # Adding a few summaries.
  tf.summary.scalar('learning_rate', learning_rate)
  tf.summary.scalar('total_loss', total_loss)
  tf.summary.histogram('action', agent_outputs.action)
  tf.summary.scalar('progress_signal', progress_signal)

  return done, infos, num_env_frames_and_train, progress_signal