Code example #1
    def test_vtrace_from_logits(self, batch_size=2):
        """Tests V-trace calculated from logits."""
        seq_len = 5
        num_actions = 3
        clip_rho_threshold = None  # No clipping.
        clip_pg_rho_threshold = None  # No clipping.

        values = {
            "behavior_policy_logits":
            _shaped_arange(seq_len, batch_size, num_actions),
            "target_policy_logits":
            _shaped_arange(seq_len, batch_size, num_actions),
            "actions":
            np.random.randint(0, num_actions - 1, size=(seq_len, batch_size)),
            "discounts":
            np.array(  # T, B where B_i: [0.9 / (i+1)] * T
                [[0.9 / (b + 1) for b in range(batch_size)]
                 for _ in range(seq_len)],
                dtype=np.float32,
            ),
            "rewards":
            _shaped_arange(seq_len, batch_size),
            "values":
            _shaped_arange(seq_len, batch_size) / batch_size,
            "bootstrap_value":
            _shaped_arange(batch_size) + 1.0,  # B
        }
        values = {k: torch.from_numpy(v) for k, v in values.items()}

        from_logits_output = vtrace.from_logits(
            clip_rho_threshold=clip_rho_threshold,
            clip_pg_rho_threshold=clip_pg_rho_threshold,
            **values,
        )

        target_log_probs = vtrace.action_log_probs(
            values["target_policy_logits"], values["actions"])
        behavior_log_probs = vtrace.action_log_probs(
            values["behavior_policy_logits"], values["actions"])
        log_rhos = target_log_probs - behavior_log_probs

        # Calculate V-trace using the ground truth logits.
        from_iw = vtrace.from_importance_weights(
            log_rhos=log_rhos,
            discounts=values["discounts"],
            rewards=values["rewards"],
            values=values["values"],
            bootstrap_value=values["bootstrap_value"],
            clip_rho_threshold=clip_rho_threshold,
            clip_pg_rho_threshold=clip_pg_rho_threshold,
        )

        assert_allclose(from_iw.vs, from_logits_output.vs)
        assert_allclose(from_iw.pg_advantages,
                        from_logits_output.pg_advantages)
        assert_allclose(behavior_log_probs,
                        from_logits_output.behavior_action_log_probs)
        assert_allclose(target_log_probs,
                        from_logits_output.target_action_log_probs)
        assert_allclose(log_rhos, from_logits_output.log_rhos)
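The test above relies on two helpers that are not shown in the snippet. The sketch below assumes NumPy-style implementations (an ascending-values array builder and a thin wrapper around np.testing.assert_allclose); the project's own versions may differ.

import numpy as np


def _shaped_arange(*shape):
    # Ascending float32 values reshaped to the requested shape, e.g.
    # _shaped_arange(2, 3) -> [[0., 1., 2.], [3., 4., 5.]].
    return np.arange(np.prod(shape), dtype=np.float32).reshape(*shape)


def assert_allclose(actual, desired, rtol=1e-6, atol=1e-6):
    # Elementwise comparison of two CPU torch tensors with loose tolerances.
    np.testing.assert_allclose(
        actual.detach().cpu().numpy(),
        desired.detach().cpu().numpy(),
        rtol=rtol,
        atol=atol,
    )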
Code example #2
def learn(
    flags,
    learner_queue,
    model,
    actor_model,
    optimizer,
    scheduler,
    stats,
    plogger,
    lock=threading.Lock(),
):
    for tensors in learner_queue:
        tensors = nest.map(lambda t: t.to(flags.learner_device), tensors)

        batch, initial_agent_state = tensors
        env_outputs, actor_outputs = batch
        frame, reward, done, *_ = env_outputs

        lock.acquire()  # Only one thread learning at a time.
        learner_outputs, unused_state = model(
            dict(frame=frame, reward=reward, done=done), initial_agent_state
        )

        # Take final value function slice for bootstrapping.
        learner_outputs = AgentOutput._make(learner_outputs)
        bootstrap_value = learner_outputs.baseline[-1]

        # Move from obs[t] -> action[t] to action[t] -> obs[t].
        batch = nest.map(lambda t: t[1:], batch)
        learner_outputs = nest.map(lambda t: t[:-1], learner_outputs)

        # Turn into namedtuples again.
        env_outputs, actor_outputs = batch
        env_outputs = EnvOutput._make(env_outputs)
        actor_outputs = AgentOutput._make(actor_outputs)
        learner_outputs = AgentOutput._make(learner_outputs)

        if flags.reward_clipping == "abs_one":
            clipped_rewards = torch.clamp(env_outputs.rewards, -1, 1)
        elif flags.reward_clipping == "none":
            clipped_rewards = env_outputs.rewards

        discounts = (1 - env_outputs.done).float() * flags.discounting

        vtrace_returns = vtrace.from_logits(
            behavior_policy_logits=actor_outputs.policy_logits,
            target_policy_logits=learner_outputs.policy_logits,
            actions=actor_outputs.action,
            discounts=discounts,
            rewards=clipped_rewards,
            values=learner_outputs.baseline,
            bootstrap_value=bootstrap_value,
        )

        pg_loss = compute_policy_gradient_loss(
            learner_outputs.policy_logits,
            actor_outputs.action,
            vtrace_returns.pg_advantages,
        )
        baseline_loss = flags.baseline_cost * compute_baseline_loss(
            vtrace_returns.vs - learner_outputs.baseline
        )
        entropy_loss = flags.entropy_cost * compute_entropy_loss(
            learner_outputs.policy_logits
        )

        total_loss = pg_loss + baseline_loss + entropy_loss

        scheduler.step()
        optimizer.zero_grad()
        total_loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), flags.grad_norm_clipping)
        optimizer.step()

        actor_model.load_state_dict(model.state_dict())

        episode_returns = env_outputs.episode_return[env_outputs.done]
        stats["step"] = stats.get("step", 0) + flags.unroll_length * flags.batch_size
        stats["episode_returns"] = tuple(episode_returns.cpu().numpy())
        stats["mean_episode_return"] = torch.mean(episode_returns).item()
        stats["mean_episode_step"] = torch.mean(env_outputs.episode_step.float()).item()
        stats["total_loss"] = total_loss.item()
        stats["pg_loss"] = pg_loss.item()
        stats["baseline_loss"] = baseline_loss.item()
        stats["entropy_loss"] = entropy_loss.item()

        stats["learner_queue_size"] = learner_queue.size()

        plogger.log(stats)

        if not len(episode_returns):
            # Hide the mean-of-empty-tuple NaN as it scares people.
            stats["mean_episode_return"] = None

        lock.release()
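vtrace.from_logits used above is a thin wrapper that converts policy logits into action log-probabilities and then applies the importance-weighted V-trace recursion (code example #1 checks exactly this equivalence). The function below is an illustrative restatement of that core computation from Espeholt et al. (2018) for time-major [T, B] tensors, not the library's exact implementation.

import torch


def vtrace_targets_sketch(log_rhos, discounts, rewards, values, bootstrap_value,
                          clip_rho_threshold=1.0, clip_c_threshold=1.0):
    # log_rhos, discounts, rewards, values: [T, B]; bootstrap_value: [B].
    rhos = torch.exp(log_rhos)
    clipped_rhos = torch.clamp(rhos, max=clip_rho_threshold)
    cs = torch.clamp(rhos, max=clip_c_threshold)

    # V(x_{t+1}) for every step, with the bootstrap value appended at the end.
    values_t_plus_1 = torch.cat([values[1:], bootstrap_value.unsqueeze(0)], dim=0)
    deltas = clipped_rhos * (rewards + discounts * values_t_plus_1 - values)

    # Backward recursion: (v_t - V(x_t)) = delta_t + gamma_t * c_t * (v_{t+1} - V(x_{t+1})).
    acc = torch.zeros_like(bootstrap_value)
    corrections = []
    for t in reversed(range(rewards.shape[0])):
        acc = deltas[t] + discounts[t] * cs[t] * acc
        corrections.append(acc)
    corrections.reverse()
    vs = torch.stack(corrections) + values

    # Policy-gradient advantages: rho_t * (r_t + gamma_t * v_{t+1} - V(x_t)).
    vs_t_plus_1 = torch.cat([vs[1:], bootstrap_value.unsqueeze(0)], dim=0)
    pg_advantages = clipped_rhos * (rewards + discounts * vs_t_plus_1 - values)
    return vs, pg_advantages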
Code example #3
File: monobeast.py  Project: nicoladainese96/SC2-RL
def learn(
        flags,
        actor_model,  # single actor model with shared memory (to be confirmed)
        model,
        batch,
        optimizer,
        scheduler,
        lock=threading.Lock(),  # noqa: B008
):
    """Performs a learning (optimization) step."""
    with lock:

        learner_outputs = model.learner_step(batch)

        # Take final value function slice for bootstrapping.
        bootstrap_value = learner_outputs["baseline_trg"][-1]  # V_learner(s_T)
        entropy = learner_outputs['entropy']

        #rearranged_batch = {}
        #rearranged_batch['done'] = batch['done'][:-1] # done_{0}, ..., done_{T-1}
        #rearranged_batch['done'] = batch['done'][1:]
        #rearranged_batch['bootstrap'] = batch['bootstrap'][1:]
        #rearranged_batch['reward'] = batch['reward'][1:] # reward_{0}, ..., reward_{T-1}
        #rearranged_batch['log_prob'] = batch['log_prob'][:-1] # log_prob_{0}, ..., log_prob_{T-1}

        # gets [log_prob_{0}, ..., log_prob_{T-1}] and [V_{0},...,V_{T-1}]
        learner_outputs = {
            key: tensor[:-1]
            for key, tensor in learner_outputs.items() if key != 'entropy'
        }

        rewards = batch['reward'][1:]
        if flags.reward_clipping == "abs_one":
            clipped_rewards = torch.clamp(rewards, -1, 1)
        elif flags.reward_clipping == "none":
            clipped_rewards = rewards

        #discounts = (~rearranged_batch["done"]).float() * flags.discounting # 0 if done, gamma otherwise

        vtrace_returns = vtrace.from_logits(
            behavior_action_log_probs=batch['log_prob'][:-1],  # actor
            target_action_log_probs=learner_outputs["log_prob"],  # learner
            not_done=(~batch['done'][1:]).float(),
            bootstrap=batch['bootstrap'][1:],
            gamma=flags.discounting,
            rewards=clipped_rewards,
            values=learner_outputs["baseline"],
            values_trg=learner_outputs["baseline_trg"],
            bootstrap_value=bootstrap_value,  # coming from the learner too
        )

        pg_loss = compute_policy_gradient_loss(
            learner_outputs["log_prob"],
            vtrace_returns.pg_advantages,
        )

        baseline_loss = flags.baseline_cost * compute_baseline_loss(
            vtrace_returns.vs - learner_outputs["baseline"])

        entropy_loss = flags.entropy_cost * entropy
        total_loss = pg_loss + baseline_loss + entropy_loss
        # Episode returns are only available when batch["done"] contains True
        # entries, which is not the case for every unroll, since the unroll
        # length is typically shorter than the episode length.
        episode_returns = batch["episode_return"][batch["done"]]
        stats = {
            "episode_returns": tuple(episode_returns.cpu().numpy()),
            "mean_episode_return": torch.mean(episode_returns).item(),
            "total_loss": total_loss.item(),
            "pg_loss": pg_loss.item(),
            "baseline_loss": baseline_loss.item(),
            "entropy_loss": entropy_loss.item(),
        }

        optimizer.zero_grad()
        total_loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), flags.grad_norm_clipping)
        optimizer.step()
        if flags.optim == "RMSprop":
            scheduler.step()
        actor_model.load_state_dict(model.state_dict())
        return stats
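This fork's compute_policy_gradient_loss takes action log-probabilities instead of logits. A hypothetical equivalent is sketched below; the function name and the sum reduction are assumptions rather than the repository's code.

import torch


def compute_policy_gradient_loss(log_probs, advantages):
    # log_probs and advantages are [T, B]; the advantages are treated as
    # constants so gradients flow only through the log-probabilities.
    return torch.sum(-log_probs * advantages.detach())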
Code example #4
def learn(flags,
          actor_model,
          model,
          batch,
          initial_agent_state,
          optimizer,
          scheduler,
          stats,
          lock=threading.Lock(),
          envs=None):
    """Performs a learning (optimization) step."""
    with lock:
        # forward pass with gradients
        learner_outputs, unused_state = model(batch, initial_agent_state)

        # Take final value function slice for bootstrapping.
        bootstrap_value = learner_outputs["baseline"][-1]

        # Move from obs[t] -> action[t] to action[t] -> obs[t].
        batch = {key: tensor[1:] for key, tensor in batch.items()}
        learner_outputs = {
            key: tensor[:-1]
            for key, tensor in learner_outputs.items()
        }

        # if specified, clip rewards between -1 and 1
        rewards = batch["reward"]
        if flags.reward_clipping == "abs_one":
            clipped_rewards = torch.clamp(rewards, -1, 1)
        elif flags.reward_clipping == "none":
            clipped_rewards = rewards

        # the "~"/tilde operator is apparently kind of a complement or # inverse, so maybe this just reverses
        # the "done" tensor? in that case would discounting only be applied when the game was NOT done?
        discounts = (~batch["done"]).float() * flags.discounting

        # prepare tensors for computation of the loss
        task = F.one_hot(batch["task"].long(), flags.num_tasks).float()
        clipped_rewards = clipped_rewards[:, :, None]
        discounts = discounts[:, :, None]

        # prepare PopArt parameters as well
        mu = model.baseline.mu[None, None, :]
        sigma = model.baseline.sigma[None, None, :]

        # Get the V-trace returns; once computed, they can be plugged directly
        # into the PopArt equations below.
        vtrace_returns = vtrace.from_logits(
            behavior_policy_logits=batch["policy_logits"],
            target_policy_logits=learner_outputs["policy_logits"],
            actions=batch["action"],
            discounts=discounts,
            rewards=clipped_rewards,
            values=learner_outputs["baseline"],
            bootstrap_value=bootstrap_value,
            normalized_values=learner_outputs["normalized_baseline"],
            mu=mu,
            sigma=sigma)

        # PopArt normalization
        with torch.no_grad():
            normalized_vs = (vtrace_returns.vs - mu) / sigma

        # policy gradient loss
        pg_loss = compute_policy_gradient_loss(
            learner_outputs["policy_logits"],
            batch["action"],
            vtrace_returns.pg_advantages * task,
        )

        # value function/baseline loss (1/2 * squared difference between V-trace and value function)
        baseline_loss = flags.baseline_cost * compute_baseline_loss(
            # vtrace_returns.vs - learner_outputs["baseline"]
            (normalized_vs - learner_outputs["normalized_baseline"]) * task)

        # entropy loss encouraging a diverse action distribution (standard
        # entropy of the policy over actions)
        entropy_loss = flags.entropy_cost * compute_entropy_loss(
            learner_outputs["policy_logits"])

        total_loss = pg_loss + baseline_loss + entropy_loss

        # Backward pass with gradient-norm clipping, then step the optimizer
        # and the learning-rate scheduler.
        optimizer.zero_grad()
        total_loss.backward()
        # plot_grad_flow(model.named_parameters(), flags)
        gradient_tracker.process_backward_pass(model.named_parameters())
        nn.utils.clip_grad_norm_(model.parameters(), flags.grad_norm_clipping)
        optimizer.step()
        scheduler.step()

        # update the PopArt parameters, which the optimizer does not take care of
        if flags.use_popart:
            model.baseline.update_parameters(vtrace_returns.vs, task)

        # update the actor model with the new parameters
        actor_model.load_state_dict(model.state_dict())

        # get the returns only for finished episodes (where the game was played to completion)
        episode_returns = batch["episode_return"][batch["done"]]
        stats["step"] = stats.get("step",
                                  0) + flags.unroll_length * flags.batch_size
        stats["episode_returns"] = tuple(episode_returns.cpu().numpy())
        stats["mean_episode_return"] = torch.mean(episode_returns).item()
        stats["total_loss"] = total_loss.item()
        stats["pg_loss"] = pg_loss.item()
        stats["baseline_loss"] = baseline_loss.item()
        stats["entropy_loss"] = entropy_loss.item()
        stats["mu"] = mu[0, 0, :]
        stats["sigma"] = sigma[0, 0, :]
        if "env_step" not in stats:
            stats["env_step"] = {}
        for task in batch["task"][0].cpu().numpy():
            stats["env_step"][envs[task]] = stats["env_step"].get(
                envs[task], 0) + flags.unroll_length

        return stats
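The PopArt head referenced above (model.baseline with its mu and sigma buffers, the normalized_baseline output, and update_parameters) is not shown. Below is an illustrative, hypothetical multi-task PopArt layer in the spirit of Hessel et al. (2019); the class name, beta, and initialization are assumptions, and the project's actual implementation may track its statistics differently.

import torch
import torch.nn as nn


class PopArtLayer(nn.Module):
    def __init__(self, input_dim, num_tasks, beta=1e-4):
        super().__init__()
        self.beta = beta
        self.weight = nn.Parameter(torch.zeros(num_tasks, input_dim))
        self.bias = nn.Parameter(torch.zeros(num_tasks))
        self.register_buffer("mu", torch.zeros(num_tasks))
        self.register_buffer("sigma", torch.ones(num_tasks))

    def forward(self, x):
        # x: [T, B, input_dim]; returns (unnormalized, normalized) baselines,
        # both of shape [T, B, num_tasks].
        normalized = x @ self.weight.t() + self.bias
        return normalized * self.sigma + self.mu, normalized

    @torch.no_grad()
    def update_parameters(self, vs, task):
        # vs: [T, B, num_tasks] unnormalized V-trace targets;
        # task: one-hot [T, B, num_tasks] selecting the active task per step.
        counts = task.sum(dim=(0, 1))
        present = counts > 0
        counts = counts.clamp(min=1.0)
        mu_batch = (vs * task).sum(dim=(0, 1)) / counts
        nu_batch = ((vs ** 2) * task).sum(dim=(0, 1)) / counts

        # Exponential moving averages of the first and second moments,
        # updated only for tasks present in this batch.
        nu_old = self.sigma ** 2 + self.mu ** 2
        mu = torch.where(present, (1 - self.beta) * self.mu + self.beta * mu_batch, self.mu)
        nu = torch.where(present, (1 - self.beta) * nu_old + self.beta * nu_batch, nu_old)
        sigma = (nu - mu ** 2).clamp(min=1e-6).sqrt()

        # Rescale the output layer so unnormalized predictions are preserved
        # across the statistics update.
        self.weight.mul_((self.sigma / sigma).unsqueeze(1))
        self.bias.copy_((self.sigma * self.bias + self.mu - mu) / sigma)
        self.mu.copy_(mu)
        self.sigma.copy_(sigma)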
Code example #5
File: polybeast.py  Project: ln-e/spiralpp
def learn(
        flags,
        learner_queue,
        d_queue,
        model,
        actor_model,
        D,
        optimizer,
        scheduler,
        stats,
        plogger,
        lock=threading.Lock(),
):
    for tensors in learner_queue:
        tensors = nest.map(
            lambda t: t.to(flags.learner_device, non_blocking=True), tensors)

        batch, agent_state, image = tensors

        env_outputs, actor_outputs, noise = batch
        batch = (env_outputs, actor_outputs)
        frame, reward, done, *_ = env_outputs

        d_queue.put((frame, image.squeeze(0)))

        lock.acquire()  # Only one thread learning at a time.
        optimizer.zero_grad()

        actor_outputs = AgentOutput._make(actor_outputs)

        if flags.condition:
            condition = image
        else:
            condition = None

        model = model.train()
        learner_outputs, agent_state = model(
            dict(
                obs=frame,
                condition=condition,
                action=actor_outputs.action,
                noise=noise,
                done=done,
            ),
            agent_state,
        )

        if flags.use_tca:
            frame = torch.flatten(frame, 0, 1)
            if flags.condition:
                condition = torch.flatten(condition, 0, 1)
        else:
            frame = frame[-1]
            if flags.condition:
                condition = condition[-1]

        D = D.eval()
        with torch.no_grad():
            if flags.condition:
                p = D(frame, condition).view(-1, flags.batch_size)
            else:
                p = D(frame).view(-1, flags.batch_size)

            if flags.use_tca:
                d_reward = p[1:] - p[:-1]
                reward = reward[1:] + d_reward
            else:
                reward[-1] = reward[-1] + p
                reward = reward[1:]

            # empty condition
            condition = None

        # Take final value function slice for bootstrapping.
        learner_outputs = AgentOutput._make(learner_outputs)
        bootstrap_value = learner_outputs.baseline[-1]

        # Move from obs[t] -> action[t] to action[t] -> obs[t].
        batch = nest.map(lambda t: t[1:], batch)
        learner_outputs = nest.map(lambda t: t[:-1], learner_outputs)

        # Turn into namedtuples again.
        env_outputs, actor_outputs = batch

        env_outputs = EnvOutput._make(env_outputs)
        actor_outputs = AgentOutput._make(actor_outputs)
        learner_outputs = AgentOutput._make(learner_outputs)

        discounts = (~env_outputs.done).float() * flags.discounting

        action = actor_outputs.action.unbind(dim=2)

        vtrace_returns = vtrace.from_logits(
            behavior_policy_logits=actor_outputs.policy_logits,
            target_policy_logits=learner_outputs.policy_logits,
            actions=action,
            discounts=discounts,
            rewards=reward,
            values=learner_outputs.baseline,
            bootstrap_value=bootstrap_value,
        )

        pg_loss = compute_policy_gradient_loss(
            learner_outputs.policy_logits,
            action,
            vtrace_returns.pg_advantages,
        )
        baseline_loss = flags.baseline_cost * compute_baseline_loss(
            vtrace_returns.vs - learner_outputs.baseline)
        entropy_loss = flags.entropy_cost * compute_entropy_loss(
            learner_outputs.policy_logits)

        total_loss = pg_loss + baseline_loss + entropy_loss

        total_loss.backward()

        nn.utils.clip_grad_norm_(model.parameters(), flags.grad_norm_clipping)

        optimizer.step()
        scheduler.step()

        actor_model.load_state_dict(model.state_dict())

        stats["step"] = stats.get("step",
                                  0) + flags.unroll_length * flags.batch_size
        stats["total_loss"] = total_loss.item()
        stats["pg_loss"] = pg_loss.item()
        stats["baseline_loss"] = baseline_loss.item()
        stats["entropy_loss"] = entropy_loss.item()
        stats["final_reward"] = reward[-1].mean().item()
        stats["episode_reward"] = reward.mean(dim=1).sum().item()
        stats["learner_queue_size"] = learner_queue.size()

        if flags.condition:
            if flags.use_tca:
                _, C, H, W = frame.shape
                frame = frame.view(flags.unroll_length, flags.batch_size, C, H,
                                   W)
                frame = frame[-1]
            stats["l2_loss"] = F.mse_loss(frame, image.squeeze(0)).item()

        plogger.log(stats)
        lock.release()
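For reference, the discriminator-based reward shaping used when flags.use_tca is set can be summarized as a small standalone function; this is an illustrative restatement with assumed names, not the project's code.

def tca_shaped_rewards(env_rewards, disc_scores):
    # env_rewards, disc_scores: time-major [T, B]. With temporal credit
    # assignment, each step receives the change in discriminator score between
    # consecutive canvases, mirroring "d_reward = p[1:] - p[:-1]" and
    # "reward = reward[1:] + d_reward" above.
    return env_rewards[1:] + (disc_scores[1:] - disc_scores[:-1])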
Code example #6
File: monobeast.py  Project: guydav/torchbeast
def learn(
    flags,
    actor_model,
    model,
    batch,
    initial_agent_state,
    optimizer,
    scheduler,
    lock=threading.Lock(),  # noqa: B008
):
    """Performs a learning (optimization) step."""
    with lock:
        learner_outputs, unused_state = model(batch, initial_agent_state)

        # Take final value function slice for bootstrapping.
        bootstrap_value = learner_outputs["baseline"][-1]

        # Move from obs[t] -> action[t] to action[t] -> obs[t].
        batch = {key: tensor[1:] for key, tensor in batch.items()}
        learner_outputs = {key: tensor[:-1] for key, tensor in learner_outputs.items()}

        rewards = batch["reward"]
        if flags.reward_clipping == "abs_one":
            clipped_rewards = torch.clamp(rewards, -1, 1)
        elif flags.reward_clipping == "none":
            clipped_rewards = rewards

        discounts = (~batch["done"]).float() * flags.discounting

        vtrace_returns = vtrace.from_logits(
            behavior_policy_logits=batch["policy_logits"],
            target_policy_logits=learner_outputs["policy_logits"],
            actions=batch["action"],
            discounts=discounts,
            rewards=clipped_rewards,
            values=learner_outputs["baseline"],
            bootstrap_value=bootstrap_value,
        )

        pg_loss = compute_policy_gradient_loss(
            learner_outputs["policy_logits"],
            batch["action"],
            vtrace_returns.pg_advantages,
        )
        baseline_loss = flags.baseline_cost * compute_baseline_loss(
            vtrace_returns.vs - learner_outputs["baseline"]
        )
        entropy_loss = flags.entropy_cost * compute_entropy_loss(
            learner_outputs["policy_logits"]
        )

        total_loss = pg_loss + baseline_loss + entropy_loss

        episode_returns = batch["episode_return"][batch["done"]]
        stats = {
            "episode_returns": tuple(episode_returns.cpu().numpy()),
            "mean_episode_return": torch.mean(episode_returns).item(),
            "total_loss": total_loss.item(),
            "pg_loss": pg_loss.item(),
            "baseline_loss": baseline_loss.item(),
            "entropy_loss": entropy_loss.item(),
        }

        optimizer.zero_grad()
        total_loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), flags.grad_norm_clipping)
        optimizer.step()
        scheduler.step()

        actor_model.load_state_dict(model.state_dict())
        return stats
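The three loss helpers used throughout these examples (compute_policy_gradient_loss, compute_baseline_loss, compute_entropy_loss) are not shown in the snippets. The sketch below follows how they are commonly defined in torchbeast-style IMPALA code; treat it as an approximation rather than each project's exact version.

import torch
import torch.nn.functional as F


def compute_baseline_loss(advantages):
    # 0.5 * sum of squared V-trace advantages (value-function regression loss).
    return 0.5 * torch.sum(advantages ** 2)


def compute_entropy_loss(logits):
    # Negative entropy summed over the batch, so minimizing it maximizes entropy.
    policy = F.softmax(logits, dim=-1)
    log_policy = F.log_softmax(logits, dim=-1)
    return torch.sum(policy * log_policy)


def compute_policy_gradient_loss(logits, actions, advantages):
    # Cross-entropy of the taken actions, weighted by the (detached) V-trace
    # policy-gradient advantages.
    cross_entropy = F.nll_loss(
        F.log_softmax(torch.flatten(logits, 0, 1), dim=-1),
        target=torch.flatten(actions, 0, 1),
        reduction="none",
    )
    cross_entropy = cross_entropy.view_as(advantages)
    return torch.sum(cross_entropy * advantages.detach())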
Code example #7
def learn(
        flags,
        learner_queue,
        model,
        actor_model,
        D,
        optimizer,
        scheduler,
        stats,
        plogger,
        lock=threading.Lock(),
):
    for tensors in learner_queue:
        new_obs = tensors[1]
        tensors = edit_tuple(tensors, 1, new_obs["canvas"])

        tensors = nest.map(
            lambda t: t.to(flags.learner_device, non_blocking=True), tensors)

        batch, new_frame, initial_agent_state = tensors

        env_outputs, actor_outputs = batch
        obs, reward, done, step, _ = env_outputs

        lock.acquire()  # Only one thread learning at a time.

        if flags.use_tca:
            discriminator_reward = tca_reward_function(flags, obs, new_frame,
                                                       D)

            reward = env_outputs[1]
            env_outputs = edit_tuple(env_outputs, 1,
                                     reward + discriminator_reward)
            batch = edit_tuple(batch, 0, env_outputs)
        else:
            if done.any().item():
                discriminator_reward = reward_function(flags, done, new_frame,
                                                       D)

                reward = env_outputs[1]
                env_outputs = edit_tuple(env_outputs, 1,
                                         reward + discriminator_reward)
                batch = edit_tuple(batch, 0, env_outputs)

        optimizer.zero_grad()

        actor_outputs = AgentOutput._make(actor_outputs)

        learner_outputs, agent_state = model(obs, done, initial_agent_state)

        # Take final value function slice for bootstrapping.
        learner_outputs = AgentOutput._make(learner_outputs)
        bootstrap_value = learner_outputs.baseline[-1]

        # Move from obs[t] -> action[t] to action[t] -> obs[t].
        batch = nest.map(lambda t: t[1:], batch)
        learner_outputs = nest.map(lambda t: t[:-1], learner_outputs)

        # Turn into namedtuples again.
        env_outputs, actor_outputs = batch

        env_outputs = EnvOutput._make(env_outputs)
        actor_outputs = AgentOutput._make(actor_outputs)
        learner_outputs = AgentOutput._make(learner_outputs)

        discounts = (~env_outputs.done).float() * flags.discounting

        vtrace_returns = vtrace.from_logits(
            behavior_policy_logits=actor_outputs.policy_logits,
            target_policy_logits=learner_outputs.policy_logits,
            actions=actor_outputs.action,
            discounts=discounts,
            rewards=env_outputs.reward,
            values=learner_outputs.baseline,
            bootstrap_value=bootstrap_value,
        )

        vtrace_returns = vtrace.VTraceFromLogitsReturns._make(vtrace_returns)

        pg_loss = compute_policy_gradient_loss(
            learner_outputs.policy_logits,
            actor_outputs.action,
            vtrace_returns.pg_advantages,
        )
        baseline_loss = flags.baseline_cost * compute_baseline_loss(
            vtrace_returns.vs - learner_outputs.baseline)
        entropy_loss = flags.entropy_cost * compute_entropy_loss(
            learner_outputs.policy_logits)

        total_loss = pg_loss + baseline_loss + entropy_loss

        total_loss.backward()

        nn.utils.clip_grad_norm_(model.parameters(), flags.grad_norm_clipping)

        optimizer.step()
        scheduler.step()

        actor_model.load_state_dict(model.state_dict())

        episode_returns = env_outputs.episode_return[env_outputs.done]

        stats["step"] = stats.get("step",
                                  0) + flags.unroll_length * flags.batch_size
        stats["episode_returns"] = tuple(episode_returns.cpu().numpy())
        stats["mean_environment_return"] = episode_returns.mean().item()
        stats["mean_discriminator_return"] = discriminator_reward.mean().item()
        stats["mean_episode_return"] = (stats["mean_environment_return"] +
                                        stats["mean_discriminator_return"])
        stats["total_loss"] = total_loss.item()
        stats["pg_loss"] = pg_loss.item()
        stats["baseline_loss"] = baseline_loss.item()
        stats["entropy_loss"] = entropy_loss.item()
        stats["learner_queue_size"] = learner_queue.size()

        if flags.condition and new_frame.numel() != 0:
            stats["l2_loss"] = F.mse_loss(
                *new_frame.split(split_size=new_frame.shape[1] //
                                 2, dim=1)).item()

        plogger.log(stats)
        lock.release()