Example 1
def main():
    print("#######")
    print(
        "WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards"
    )
    print("#######")

    os.environ['OMP_NUM_THREADS'] = '1'

    if args.vis:
        from visdom import Visdom
        viz = Visdom()
        win = None

    envs = [
        make_env(args.env_name, args.seed, i, args.log_dir)
        for i in range(args.num_processes)
    ]

    if args.num_processes > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)

    if len(envs.observation_space.shape) == 1:
        envs = VecNormalize(envs)

    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

    obs_numel = reduce(operator.mul, obs_shape, 1)

    if len(obs_shape) == 3 and obs_numel > 1024:
        actor_critic = CNNPolicy(obs_shape[0], envs.action_space,
                                 args.recurrent_policy)
    else:
        assert not args.recurrent_policy, \
            "Recurrent policy is not implemented for the MLP controller"
        actor_critic = MLPPolicy(obs_numel, envs.action_space)

    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]

    if args.cuda:
        actor_critic.cuda()

    if args.algo == 'a2c':
        optimizer = optim.RMSprop(actor_critic.parameters(),
                                  args.lr,
                                  eps=args.eps,
                                  alpha=args.alpha)
    elif args.algo == 'ppo':
        optimizer = optim.Adam(actor_critic.parameters(),
                               args.lr,
                               eps=args.eps)
    elif args.algo == 'acktr':
        optimizer = KFACOptimizer(actor_critic)

    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape,
                              envs.action_space, actor_critic.state_size)
    current_obs = torch.zeros(args.num_processes, *obs_shape)

    def update_current_obs(obs):
        shape_dim0 = envs.observation_space.shape[0]
        obs = torch.from_numpy(obs).float()
        if args.num_stack > 1:
            current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
        current_obs[:, -shape_dim0:] = obs

    obs = envs.reset()
    update_current_obs(obs)

    rollouts.observations[0].copy_(current_obs)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])

    if args.cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()

    start = time.time()
    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            value, action, action_log_prob, states = actor_critic.act(
                Variable(rollouts.observations[step], volatile=True),
                Variable(rollouts.states[step], volatile=True),
                Variable(rollouts.masks[step], volatile=True))
            cpu_actions = action.data.squeeze(1).cpu().numpy()

            # Observe reward and next obs
            obs, reward, done, info = envs.step(cpu_actions)
            reward = torch.from_numpy(np.expand_dims(np.stack(reward),
                                                     1)).float()
            episode_rewards += reward

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            if args.cuda:
                masks = masks.cuda()

            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks

            update_current_obs(obs)
            rollouts.insert(step, current_obs, states.data, action.data,
                            action_log_prob.data, value.data, reward, masks)

        next_value = actor_critic(
            Variable(rollouts.observations[-1], volatile=True),
            Variable(rollouts.states[-1], volatile=True),
            Variable(rollouts.masks[-1], volatile=True))[0].data

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        if args.algo in ['a2c', 'acktr']:
            values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions(
                Variable(rollouts.observations[:-1].view(-1, *obs_shape)),
                Variable(rollouts.states[0].view(-1, actor_critic.state_size)),
                Variable(rollouts.masks[:-1].view(-1, 1)),
                Variable(rollouts.actions.view(-1, action_shape)))

            values = values.view(args.num_steps, args.num_processes, 1)
            action_log_probs = action_log_probs.view(args.num_steps,
                                                     args.num_processes, 1)

            advantages = Variable(rollouts.returns[:-1]) - values
            value_loss = advantages.pow(2).mean()

            action_loss = -(Variable(advantages.data) *
                            action_log_probs).mean()

            if args.algo == 'acktr' and optimizer.steps % optimizer.Ts == 0:
                # Sampled Fisher, see Martens 2014
                actor_critic.zero_grad()
                pg_fisher_loss = -action_log_probs.mean()

                value_noise = Variable(torch.randn(values.size()))
                if args.cuda:
                    value_noise = value_noise.cuda()

                sample_values = values + value_noise
                vf_fisher_loss = -(values -
                                   Variable(sample_values.data)).pow(2).mean()

                fisher_loss = pg_fisher_loss + vf_fisher_loss
                optimizer.acc_stats = True
                fisher_loss.backward(retain_graph=True)
                optimizer.acc_stats = False

            optimizer.zero_grad()
            (value_loss * args.value_loss_coef + action_loss -
             dist_entropy * args.entropy_coef).backward()

            if args.algo == 'a2c':
                nn.utils.clip_grad_norm(actor_critic.parameters(),
                                        args.max_grad_norm)

            optimizer.step()
        elif args.algo == 'ppo':
            advantages = rollouts.returns[:-1] - rollouts.value_preds[:-1]
            advantages = (advantages - advantages.mean()) / (advantages.std() +
                                                             1e-5)

            for e in range(args.ppo_epoch):
                if args.recurrent_policy:
                    data_generator = rollouts.recurrent_generator(
                        advantages, args.num_mini_batch)
                else:
                    data_generator = rollouts.feed_forward_generator(
                        advantages, args.num_mini_batch)

                for sample in data_generator:
                    observations_batch, states_batch, actions_batch, \
                       return_batch, masks_batch, old_action_log_probs_batch, \
                            adv_targ = sample

                    # Reshape to do in a single forward pass for all steps
                    values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions(
                        Variable(observations_batch), Variable(states_batch),
                        Variable(masks_batch), Variable(actions_batch))

                    adv_targ = Variable(adv_targ)
                    ratio = torch.exp(action_log_probs -
                                      Variable(old_action_log_probs_batch))
                    surr1 = ratio * adv_targ
                    surr2 = torch.clamp(ratio, 1.0 - args.clip_param,
                                        1.0 + args.clip_param) * adv_targ
                    action_loss = -torch.min(
                        surr1,
                        surr2).mean()  # PPO's pessimistic surrogate (L^CLIP)

                    value_loss = (Variable(return_batch) -
                                  values).pow(2).mean()

                    optimizer.zero_grad()
                    (value_loss + action_loss -
                     dist_entropy * args.entropy_coef).backward()
                    nn.utils.clip_grad_norm(actor_critic.parameters(),
                                            args.max_grad_norm)
                    optimizer.step()

        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [
                save_model,
                hasattr(envs, 'ob_rms') and envs.ob_rms or None
            ]

            torch.save(save_model,
                       os.path.join(save_path, args.env_name + ".pt"))

        if j % args.log_interval == 0:
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            print(
                "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        final_rewards.mean(), final_rewards.median(),
                        final_rewards.min(), final_rewards.max(),
                        dist_entropy.data[0], value_loss.data[0],
                        action_loss.data[0]))
        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name,
                                  args.algo)
            except IOError:
                pass
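The call to rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau) hides the return computation. Below is a minimal sketch of the usual GAE recursion it is expected to implement; the function name, argument layout, and mask convention here are assumptions, and the actual RolloutStorage may differ.

import torch

def compute_gae_returns(rewards, values, masks, next_value, gamma, tau):
    # rewards, values, masks: (num_steps, num_processes, 1) tensors.
    # masks[t] is 0.0 if the transition at step t ended the episode, else 1.0.
    # next_value: (num_processes, 1) bootstrap value for the state after the last step.
    num_steps = rewards.size(0)
    values = torch.cat([values, next_value.unsqueeze(0)], dim=0)
    returns = torch.zeros_like(rewards)
    gae = torch.zeros_like(next_value)
    for t in reversed(range(num_steps)):
        # One-step TD residual; the mask cuts the bootstrap at episode boundaries.
        delta = rewards[t] + gamma * values[t + 1] * masks[t] - values[t]
        gae = delta + gamma * tau * masks[t] * gae
        returns[t] = gae + values[t]
    return returns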
Example 2
class PPOAgent(ResearchAgent):
    """The TensorForceAgent. Acts through the algorith, not here."""
    def __init__(self, actor_critic, character=characters.Bomber, **kwargs):
        self._actor_critic = actor_critic
        super(PPOAgent, self).__init__(character, **kwargs)

    def cuda(self):
        self._actor_critic.cuda()
        if hasattr(self, "_rollout"):
            self._rollout.cuda()

    @property
    def model(self):
        return self._actor_critic

    @property
    def optimizer(self):
        return self._optimizer

    def set_eval(self):
        self._actor_critic.eval()

    def set_train(self):
        self._actor_critic.train()

    def _rollout_data(self, step, num_agent, num_agent_end=None):
        if num_agent_end is not None:
            assert (num_agent_end > num_agent)
            observations = Variable(
                self._rollout.observations[step, num_agent:num_agent_end])
            states = Variable(self._rollout.states[step,
                                                   num_agent:num_agent_end])
            masks = Variable(self._rollout.masks[step,
                                                 num_agent:num_agent_end])
        else:
            observations = Variable(self._rollout.observations[step,
                                                               num_agent],
                                    volatile=True)
            states = Variable(self._rollout.states[step, num_agent],
                              volatile=True)
            masks = Variable(self._rollout.masks[step, num_agent],
                             volatile=True)
        return observations, states, masks

    def actor_critic_act(self, step, num_agent=0, deterministic=False):
        """Uses the actor_critic to take action.
        Args:
          step: The int timestep at which we are acting.
          num_agent: Agent id that's running. Non-zero when agent has copies.
        Returns:
          See the actor_critic's act function in model.py.
        """
        # NOTE: Training uses this --> it uses act(..., deterministic=False).
        return self._actor_critic.act(*self.get_rollout_data(step, num_agent),
                                      deterministic=deterministic)

    def get_rollout_data(self, step, num_agent, num_agent_end=None):
        return self._rollout_data(step, num_agent, num_agent_end)

    def actor_critic_call(self, step, num_agent=0):
        observations, states, masks = self._rollout_data(step, num_agent)
        return self._actor_critic(observations, states, masks)[0].data

    def _evaluate_actions(self, observations, states, masks, actions):
        return self._actor_critic.evaluate_actions(observations, states, masks,
                                                   actions)

    def _optimize(self,
                  value_loss,
                  action_loss,
                  dist_entropy,
                  entropy_coef,
                  value_loss_coef,
                  max_grad_norm,
                  kl_loss=None,
                  kl_factor=0,
                  only_value_loss=False,
                  add_nonlin=False):
        self._optimizer.zero_grad()
        # Only update the value head (used at the beginning of fine-tuning a
        # model trained with BC, which has no value predictor).
        if only_value_loss:
            loss = value_loss * value_loss_coef
            # Stop the gradients from flowing through the parameters used to
            # compute the actions (i.e. the actor / policy head) and only
            # backprop the value loss through the value head (i.e. parameters
            # used exclusively to predict the value).
            for p in self._actor_critic.parameters():
                p.requires_grad = False
            for p in self._actor_critic.critic_linear.parameters():
                p.requires_grad = True
            if add_nonlin:
                for p in self._actor_critic.fc_critic.parameters():
                    p.requires_grad = True
            loss.backward()
        else:
            loss = value_loss * value_loss_coef + action_loss \
                    - dist_entropy * entropy_coef
            if kl_factor > 0 and kl_loss is not None:
                loss += kl_factor * kl_loss
            loss.backward()

        nn.utils.clip_grad_norm(self._actor_critic.parameters(), max_grad_norm)
        self._optimizer.step()
        if hasattr(self, '_scheduler'):
            self._scheduler.step(loss)

        if only_value_loss:
            for p in self._actor_critic.parameters():
                p.requires_grad = True

    def halve_lr(self):
        for i, param_group in enumerate(self._optimizer.param_groups):
            old_lr = float(param_group['lr'])
            new_lr = max(old_lr * 0.5, 1e-7)
            param_group['lr'] = new_lr

    def compute_advantages(self, next_value_agents, use_gae, gamma, tau):
        for num_agent, next_value in enumerate(next_value_agents):
            self._rollout.compute_returns(next_value, use_gae, gamma, tau,
                                          num_agent)
        advantages = self._rollout.compute_advantages()
        diff = (advantages - advantages.mean())
        advantages = diff / (advantages.std() + 1e-5)
        return advantages

    def initialize(self, args, obs_shape, action_space,
                   num_training_per_episode, num_episodes, total_steps,
                   num_epoch, optimizer_state_dict, num_steps, uniform_v,
                   uniform_v_prior):
        params = self._actor_critic.parameters()
        self._optimizer = optim.Adam(params, lr=args.lr, eps=args.eps)
        if optimizer_state_dict:
            self._optimizer.load_state_dict(optimizer_state_dict)
        if args.use_lr_scheduler:
            self._scheduler = optim.lr_scheduler.ReduceLROnPlateau(
                self._optimizer, mode='min', verbose=True)

        self._rollout = RolloutStorage(num_steps, args.num_processes,
                                       obs_shape, action_space,
                                       self._actor_critic.state_size,
                                       num_training_per_episode)
        self.num_episodes = num_episodes
        self.total_steps = total_steps
        self.num_epoch = num_epoch
        self.uniform_v = uniform_v
        self.uniform_v_prior = uniform_v_prior

    def update_rollouts(self, obs, timestep):
        self._rollout.observations[timestep, :, :, :, :, :].copy_(obs)

    def insert_rollouts(self,
                        step,
                        current_obs,
                        states,
                        action,
                        action_log_prob,
                        value,
                        reward,
                        mask,
                        action_log_prob_distr=None,
                        dagger_prob_distr=None,
                        expert_action_log_prob=None,
                        training_action_log_prob=None):
        self._rollout.insert(step,
                             current_obs,
                             states,
                             action,
                             action_log_prob,
                             value,
                             reward,
                             mask,
                             action_log_prob_distr,
                             dagger_prob_distr,
                             expert_action_log_prob=expert_action_log_prob,
                             training_action_log_prob=training_action_log_prob)

    def ppo(self,
            advantages,
            num_mini_batch,
            batch_size,
            num_steps,
            clip_param,
            entropy_coef,
            value_loss_coef,
            max_grad_norm,
            action_space,
            anneal=False,
            lr=1e-4,
            eps=1e-5,
            kl_factor=0,
            only_value_loss=False,
            add_nonlin=False,
            use_is=False,
            use_retrace=False,
            lambda_retrace=1.0):
        action_losses = []
        value_losses = []
        dist_entropies = []
        kl_losses = []
        kl_loss = None
        total_losses = []

        if hasattr(self._actor_critic, 'gru'):
            data_generator = self._rollout.recurrent_generator(
                advantages, num_mini_batch, batch_size, num_steps, kl_factor,
                use_is)
        else:
            data_generator = self._rollout.feed_forward_generator(
                advantages, num_mini_batch, batch_size, num_steps,
                action_space, kl_factor, use_is)

        for sample in data_generator:
            observations_batch, states_batch, actions_batch, return_batch, \
                masks_batch, old_action_log_probs_batch, adv_targ, \
                action_log_probs_distr_batch, dagger_probs_distr_batch, \
                expert_action_log_probs_batch, training_action_log_probs_batch \
                = sample

            # Reshape to do in a single forward pass for all steps
            result = self._evaluate_actions(Variable(observations_batch),
                                            Variable(states_batch),
                                            Variable(masks_batch),
                                            Variable(actions_batch))
            values, action_log_probs, dist_entropy, states = result

            adv_targ = Variable(adv_targ)
            ratio = action_log_probs
            ratio -= Variable(old_action_log_probs_batch)
            ratio = torch.exp(ratio)

            surr1 = ratio * adv_targ
            surr2 = torch.clamp(ratio, 1.0 - clip_param, 1.0 + clip_param)
            surr2 *= adv_targ
            action_loss = -torch.min(surr1, surr2).mean()

            value_loss = (Variable(return_batch) - values) \
                         .pow(2).mean()

            total_loss = value_loss * value_loss_coef + action_loss \
                        - dist_entropy * entropy_coef

            if kl_factor > 0 and not use_is:
                criterion = nn.KLDivLoss()
                kl_loss = criterion(Variable(action_log_probs_distr_batch),
                                    Variable(dagger_probs_distr_batch))
                total_loss += kl_factor * kl_loss

            self._optimize(value_loss, action_loss, dist_entropy, entropy_coef,
                           value_loss_coef, max_grad_norm, kl_loss, kl_factor,
                           only_value_loss, add_nonlin)
            lr = self._optimizer.param_groups[0]['lr']

            action_losses.append(action_loss.data[0])
            value_losses.append(value_loss.data[0])
            dist_entropies.append(dist_entropy.data[0])
            if kl_factor > 0 and not use_is:
                kl_losses.append(kl_loss.data[0])
            total_losses.append(total_loss.data[0])

        return action_losses, value_losses, dist_entropies, \
            kl_losses, total_losses, lr

    def copy_ex_model(self):
        """Creates a copy without the model. This is for operating with homogenous training."""
        return PPOAgent(None,
                        self._character,
                        num_processes=self._num_processes)

    def copy_with_model(self):
        """Creates a copy with the model. This is for operating with frozen backplay."""
        return PPOAgent(self._actor_critic,
                        self._character,
                        num_processes=self._num_processes)

    def after_epoch(self):
        self._rollout.after_epoch()

    def set_new_model(self, model, cuda=False):
        self._actor_critic = model
        if cuda:
            self._actor_critic.cuda()
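In the kl_factor branch of ppo() above, nn.KLDivLoss receives the agent's log-probabilities first and the DAgger/expert probabilities second. As a reminder of that convention, here is a minimal sketch; policy_logits and expert_probs are placeholder names, not identifiers from the source.

import torch.nn as nn
import torch.nn.functional as F

def kl_to_expert(policy_logits, expert_probs):
    # nn.KLDivLoss expects log-probabilities as input and probabilities as
    # target; it computes KL(target || input distribution), averaged by the
    # default reduction.
    log_probs = F.log_softmax(policy_logits, dim=-1)
    return nn.KLDivLoss()(log_probs, expert_probs)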
Example 3
def main():
    print("######")
    print("HELLO! Returns start with infinity values")
    print("######")

    os.environ['OMP_NUM_THREADS'] = '1'

    if args.random_task:
        env_params = {
            'wt': np.round(np.random.uniform(0.5, 1.0), 2),
            'x': np.round(np.random.uniform(-0.1, 0.1), 2),
            'y': np.round(np.random.uniform(-0.1, 0.1), 2),
            'z': np.round(np.random.uniform(0.15, 0.2), 2),
        }
    else:
        env_params = {
            'wt': args.euclidean_weight,
            'x': args.goal_x,
            'y': args.goal_y,
            'z': args.goal_z,
        }
    envs = [make_env(args.env_name, args.seed, i, args.log_dir, **env_params)
            for i in range(args.num_processes)]

    if args.num_processes > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)

    envs = VecNormalize(envs, ob=False)

    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

    if len(envs.observation_space.shape) == 3:
        actor_critic = CNNPolicy(obs_shape[0], envs.action_space, args.recurrent_policy)
    else:
        assert not args.recurrent_policy, \
            "Recurrent policy is not implemented for the MLP controller"
        actor_critic = MLPPolicy(obs_shape[0], envs.action_space)

    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]

    if args.cuda:
        actor_critic.cuda()

    if args.algo == 'a2c':
        optimizer = optim.RMSprop(actor_critic.parameters(), args.lr, eps=args.eps, alpha=args.alpha)
    elif args.algo == 'ppo':
        optimizer = optim.Adam(actor_critic.parameters(), args.lr, eps=args.eps)
    elif args.algo == 'acktr':
        optimizer = KFACOptimizer(actor_critic)

    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape, envs.action_space, actor_critic.state_size)
    current_obs = torch.zeros(args.num_processes, *obs_shape)

    def update_current_obs(obs):
        shape_dim0 = envs.observation_space.shape[0]
        obs = torch.from_numpy(obs).float()
        if args.num_stack > 1:
            current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
        current_obs[:, -shape_dim0:] = obs

    obs = envs.reset()
    update_current_obs(obs)

    rollouts.observations[0].copy_(current_obs)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])

    if args.cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()

    actor_critic.input_norm.update(rollouts.observations[0])

    last_return = -np.inf
    best_return = -np.inf
    best_models = None

    start = time.time()
    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            value, action, action_log_prob, states = actor_critic.act(Variable(rollouts.observations[step], volatile=True),
                                                                      Variable(rollouts.states[step], volatile=True),
                                                                      Variable(rollouts.masks[step], volatile=True))
            cpu_actions = action.data.squeeze(1).cpu().numpy()

            # Observe reward and next obs
            obs, reward, done, info = envs.step(cpu_actions)
            reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float()
            episode_rewards += reward

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            if args.cuda:
                masks = masks.cuda()

            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks

            update_current_obs(obs)
            rollouts.insert(step, current_obs, states.data, action.data, action_log_prob.data, value.data, reward, masks)
            actor_critic.input_norm.update(rollouts.observations[step + 1])

        next_value = actor_critic(Variable(rollouts.observations[-1], volatile=True),
                                  Variable(rollouts.states[-1], volatile=True),
                                  Variable(rollouts.masks[-1], volatile=True))[0].data

        rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau)

        if args.algo in ['a2c', 'acktr']:
            values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions(Variable(rollouts.observations[:-1].view(-1, *obs_shape)),
                                                                                           Variable(rollouts.states[0].view(-1, actor_critic.state_size)),
                                                                                           Variable(rollouts.masks[:-1].view(-1, 1)),
                                                                                           Variable(rollouts.actions.view(-1, action_shape)))

            values = values.view(args.num_steps, args.num_processes, 1)
            action_log_probs = action_log_probs.view(args.num_steps, args.num_processes, 1)

            advantages = Variable(rollouts.returns[:-1]) - values
            value_loss = advantages.pow(2).mean()

            action_loss = -(Variable(advantages.data) * action_log_probs).mean()

            if args.algo == 'acktr' and optimizer.steps % optimizer.Ts == 0:
                # Sampled Fisher, see Martens 2014
                actor_critic.zero_grad()
                pg_fisher_loss = -action_log_probs.mean()

                value_noise = Variable(torch.randn(values.size()))
                if args.cuda:
                    value_noise = value_noise.cuda()

                sample_values = values + value_noise
                vf_fisher_loss = -(values - Variable(sample_values.data)).pow(2).mean()

                fisher_loss = pg_fisher_loss + vf_fisher_loss
                optimizer.acc_stats = True
                fisher_loss.backward(retain_graph=True)
                optimizer.acc_stats = False

            optimizer.zero_grad()
            (value_loss * args.value_loss_coef + action_loss - dist_entropy * args.entropy_coef).backward()

            if args.algo == 'a2c':
                nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm)

            optimizer.step()
        elif args.algo == 'ppo':
            advantages = rollouts.returns[:-1] - rollouts.value_preds[:-1]
            advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-5)

            for e in range(args.ppo_epoch):
                if args.recurrent_policy:
                    data_generator = rollouts.recurrent_generator(advantages,
                                                            args.num_mini_batch)
                else:
                    data_generator = rollouts.feed_forward_generator(advantages,
                                                            args.num_mini_batch)

                for sample in data_generator:
                    observations_batch, states_batch, actions_batch, \
                       return_batch, masks_batch, old_action_log_probs_batch, \
                            adv_targ = sample

                    # Reshape to do in a single forward pass for all steps
                    values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions(Variable(observations_batch),
                                                                                                   Variable(states_batch),
                                                                                                   Variable(masks_batch),
                                                                                                   Variable(actions_batch))

                    adv_targ = Variable(adv_targ)
                    ratio = torch.exp(action_log_probs - Variable(old_action_log_probs_batch))
                    surr1 = ratio * adv_targ
                    surr2 = torch.clamp(ratio, 1.0 - args.clip_param, 1.0 + args.clip_param) * adv_targ
                    action_loss = -torch.min(surr1, surr2).mean() # PPO's pessimistic surrogate (L^CLIP)

                    value_loss = (Variable(return_batch) - values).pow(2).mean()

                    optimizer.zero_grad()
                    (value_loss + action_loss - dist_entropy * args.entropy_coef).backward()
                    nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm)
                    optimizer.step()

        rollouts.after_update()

        if args.vis and j % args.vis_interval == 0:
            last_return = plot(logger, args.log_dir)

        if last_return > best_return:
            best_return = last_return
            try:
                os.makedirs(os.path.dirname(args.save_path))
            except OSError:
                pass

            info = {
                'return': best_return,
                'reward_norm': np.sqrt(envs.ret_rms.var + envs.epsilon)
            }

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            torch.save((save_model, env_params, info), args.save_path)

        if j % args.log_interval == 0:
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            print("Updates {}, num timesteps {}, FPS {}, average return {:.5f}, best_return {:.5f}, value loss {:.5f}, policy loss {:.5f}".
                  format(j, total_num_steps,
                         int(total_num_steps / (end - start)),
                         last_return, best_return,
                         value_loss.data[0], action_loss.data[0]))
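This variant calls actor_critic.input_norm.update(...) on every stored observation batch. The class below is a minimal sketch of a running mean/variance normalizer of the kind such an input_norm attribute is usually expected to be; the class name and method signatures are assumptions, not taken from the source.

import torch

class RunningNorm:
    def __init__(self, shape, eps=1e-5):
        self.mean = torch.zeros(shape)
        self.var = torch.ones(shape)
        self.count = eps

    def update(self, batch):
        # Fold a batch of observations into the running statistics
        # (parallel mean/variance combination).
        batch = batch.view(-1, *self.mean.size())
        batch_mean = batch.mean(0)
        batch_var = batch.var(0, unbiased=False)
        batch_count = batch.size(0)
        delta = batch_mean - self.mean
        total = self.count + batch_count
        self.mean = self.mean + delta * batch_count / total
        m_a = self.var * self.count
        m_b = batch_var * batch_count
        self.var = (m_a + m_b + delta.pow(2) * self.count * batch_count / total) / total
        self.count = total

    def __call__(self, x):
        # Standardize inputs with the running statistics.
        return (x - self.mean) / torch.sqrt(self.var + 1e-8)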
Example 4
class PPOAgent(BaseAgent):
    """The TensorForceAgent. Acts through the algorith, not here."""
    def __init__(self, agent, actor_critic):
        self._actor_critic = actor_critic
        self._agent = agent

    def cuda(self):
        self._actor_critic.cuda()
        self._rollout.cuda()

    def get_model(self):
        return self._actor_critic

    def get_optimizer(self):
        return self._optimizer

    def act(self, obs, action_space):
        """This agent has its own way of inducing actions."""
        return None

    def act_pytorch(self, step, num_agent=0):
        """Uses the actor_critic to act.

        Args:
          step: The int timestep at which we are acting.
          num_agent: The agent id that is acting. This is non-zero when this agent has copies.

        Returns:
          See the actor_critic's act function in model.py.
        """
        return self._actor_critic.act(
            Variable(self._rollout.observations[step, num_agent],
                     volatile=True),
            Variable(self._rollout.states[step, num_agent], volatile=True),
            Variable(self._rollout.masks[step, num_agent], volatile=True))

    def run_actor_critic(self, step, num_agent=0):
        return self._actor_critic(
            Variable(self._rollout.observations[step][num_agent],
                     volatile=True),
            Variable(self._rollout.states[step][num_agent], volatile=True),
            Variable(self._rollout.masks[step][num_agent],
                     volatile=True))[0].data

    def evaluate_actions(self, observations, states, masks, actions):
        return self._actor_critic.evaluate_actions(observations, states, masks,
                                                   actions)

    def optimize(self, value_loss, action_loss, dist_entropy, entropy_coef,
                 max_grad_norm):
        self._optimizer.zero_grad()
        (value_loss + action_loss - dist_entropy * entropy_coef).backward()
        nn.utils.clip_grad_norm(self._actor_critic.parameters(), max_grad_norm)
        self._optimizer.step()

    def compute_advantages(self, next_value_agents, use_gae, gamma, tau):
        for num_agent, next_value in enumerate(next_value_agents):
            self._rollout.compute_returns(next_value, use_gae, gamma, tau,
                                          num_agent)
        advantages = self._rollout.compute_advantages()
        advantages = (advantages - advantages.mean()) / (advantages.std() +
                                                         1e-5)
        return advantages

    def initialize(self, args, obs_shape, action_space,
                   num_training_per_episode):
        self._optimizer = optim.Adam(self._actor_critic.parameters(),
                                     args.lr,
                                     eps=args.eps)
        self._rollout = RolloutStorage(args.num_steps, args.num_processes,
                                       obs_shape, action_space,
                                       self._actor_critic.state_size,
                                       num_training_per_episode)

    def update_rollouts(self, obs, timestep):
        self._rollout.observations[timestep, :, :, :, :, :].copy_(obs)

    def insert_rollouts(self, step, current_obs, states, action,
                        action_log_prob, value, reward, mask):
        self._rollout.insert(step, current_obs, states, action,
                             action_log_prob, value, reward, mask)

    def feed_forward_generator(self, advantage, args):
        return self._rollout.feed_forward_generator(advantage, args)

    def copy(self, agent):
        # NOTE: Ugh. This is bad.
        return PPOAgent(agent, None)

    def after_update(self):
        self._rollout.after_update()
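None of this wrapper's methods are exercised in the listing itself. The sketch below is a hypothetical wiring of a single update, only to show the intended call order; the surrounding training loop (environment stepping, the args fields, the loss construction) is assumed rather than taken from the source.

def run_one_update(agent, args, obs_shape, action_space, num_training_per_episode):
    # Hypothetical driver: allocate the optimizer and rollout storage once.
    agent.initialize(args, obs_shape, action_space, num_training_per_episode)
    for step in range(args.num_steps):
        value, action, action_log_prob, states = agent.act_pytorch(step)
        # ...step the environments with `action`, then store the transition:
        # agent.insert_rollouts(step, current_obs, states.data, action.data,
        #                       action_log_prob.data, value.data, reward, mask)
    next_value = agent.run_actor_critic(step + 1)
    advantages = agent.compute_advantages([next_value], args.use_gae,
                                          args.gamma, args.tau)
    for sample in agent.feed_forward_generator(advantages, args):
        # evaluate_actions(...) on the minibatch, build the losses, then:
        # agent.optimize(value_loss, action_loss, dist_entropy,
        #                args.entropy_coef, args.max_grad_norm)
        pass
    agent.after_update()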
Example 5
def main():
    print("#######")
    print(
        "WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards"
    )
    print("#######")
    print(args)

    try:
        os.makedirs(args.log_dir)
    except OSError:
        files = glob.glob(os.path.join(args.log_dir, '*.monitor.csv'))
        for f in files:
            os.remove(f)

    torch.cuda.manual_seed(args.seed)
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    random.seed(args.seed)

    for gamma in args.gamma:
        with open(args.log_dir + '/MSE_' + str(gamma) + '_monitor.csv',
                  "wt") as monitor_file:
            monitor = csv.writer(monitor_file)
            monitor.writerow([
                'update', 'error',
                str(int(args.num_frames) // args.num_steps)
            ])

    os.environ['OMP_NUM_THREADS'] = '1'

    print("Using env {}".format(args.env_name))

    envs = [
        make_env(args.env_name, args.seed, i, args.log_dir)
        for i in range(args.num_processes)
    ]

    if args.num_processes > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)

    if len(envs.observation_space.shape) == 1:
        envs = VecNormalize(envs)

    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

    num_heads = len(
        args.gamma) if not args.reward_predictor else len(args.gamma) - 1

    if len(envs.observation_space.shape) == 3:
        actor_critic = CNNPolicy(obs_shape[0],
                                 envs.action_space,
                                 num_heads=num_heads,
                                 hidden_size=args.hidden_size)
    else:
        actor_critic = MLPPolicy(obs_shape[0],
                                 envs.action_space,
                                 num_heads=num_heads,
                                 reward_predictor=args.reward_predictor,
                                 use_s=args.use_s,
                                 use_s_a=args.use_s_a,
                                 use_s_a_sprime=args.use_s_a_sprime)

    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]

    if args.cuda:
        actor_critic.cuda()

    lrs = [args.lr] * len(actor_critic.param_groups)

    if not args.reward_predictor:
        assert len(actor_critic.param_groups) == len(lrs)
        model_params = [{
            'params': model_p,
            'lr': args.lr
        } for model_p, lr in zip(actor_critic.param_groups, lrs)]
    else:
        model_params = [{
            'params': model_p,
            'lr': p_lr
        } for model_p, p_lr in zip(actor_critic.param_groups[:-1], lrs)]
        model_params.append({
            'params': actor_critic.param_groups[-1],
            'lr': args.lr_rp
        })

    if args.algo == 'a2c':
        optimizer = optim.RMSprop(model_params,
                                  args.lr,
                                  eps=args.eps,
                                  alpha=args.alpha)

    elif args.algo == 'ppo':
        optimizer = optim.Adam(model_params, args.lr, eps=args.eps)

    rollouts = RolloutStorage(args.num_steps,
                              args.num_processes,
                              obs_shape,
                              envs.action_space,
                              actor_critic.state_size,
                              gamma=args.gamma,
                              use_rp=args.reward_predictor)
    current_obs = torch.zeros(args.num_processes, *obs_shape)

    def update_current_obs(obs, obs_tensor):
        shape_dim0 = envs.observation_space.shape[0]
        obs = torch.from_numpy(obs).float()
        if args.num_stack > 1:
            obs_tensor[:, :-shape_dim0] = obs_tensor[:, shape_dim0:]
        obs_tensor[:, -shape_dim0:] = obs

    obs = envs.reset()

    update_current_obs(obs, current_obs)

    rollouts.observations[0].copy_(current_obs)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])

    advantages_list = []

    if args.cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()

    start = time.time()
    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            value, action, action_log_prob, states = actor_critic.act(
                Variable(rollouts.observations[step], volatile=True),
                Variable(rollouts.states[step], volatile=True),
                Variable(rollouts.masks[step], volatile=True))
            cpu_actions = action.data.squeeze(1).cpu().numpy()

            cpu_actions = add_gaussian_noise(cpu_actions, args.action_noise)

            # Observe reward and next obs
            obs, raw_reward, done, info = envs.step(cpu_actions)

            reward = np.copy(raw_reward)

            reward = add_gaussian_noise(reward, args.reward_noise)

            reward = epsilon_greedy(reward, args.reward_epsilon,
                                    args.reward_high, args.reward_low)

            raw_reward = torch.from_numpy(
                np.expand_dims(np.stack(raw_reward), 1)).float()

            episode_rewards += raw_reward

            reward = torch.from_numpy(np.expand_dims(np.stack(reward),
                                                     1)).float()

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            if args.cuda:
                masks = masks.cuda()

            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks

            update_current_obs(obs, current_obs)

            if args.reward_predictor:
                r_hat = actor_critic.predict_reward(
                    Variable(rollouts.observations[step], volatile=True),
                    action, Variable(current_obs, volatile=True))
                p_hat = min(args.rp_burn_in, j) / args.rp_burn_in
                estimate_reward = (1 -
                                   p_hat) * reward + p_hat * r_hat.data.cpu()
                reward = torch.cat([reward, estimate_reward], dim=-1)
                value = torch.cat([r_hat, value], dim=-1).data
            else:
                value = value.data

            rollouts.insert(step, current_obs, states.data, action.data,
                            action_log_prob.data, value, reward, masks,
                            raw_reward)

        next_value = actor_critic(
            Variable(rollouts.observations[-1], volatile=True),
            Variable(rollouts.states[-1], volatile=True),
            Variable(rollouts.masks[-1], volatile=True))[0].data

        if args.reward_predictor:
            if args.use_s or args.use_s_a:
                r_hat = actor_critic.predict_reward(
                    Variable(rollouts.observations[-1], volatile=True),
                    Variable(rollouts.actions[-1], volatile=True), None).data
                next_value = torch.cat([r_hat, next_value], dim=-1)
            else:
                next_value = torch.cat([
                    torch.zeros(list(next_value.size())[:-1] + [1]), next_value
                ],
                                       dim=-1)

        rollouts.compute_returns(next_value, args.use_gae, args.tau)

        if args.algo in ['a2c']:

            batch_states = Variable(rollouts.states[0].view(
                -1, actor_critic.state_size))
            batch_masks = Variable(rollouts.masks[:-1].view(-1, 1))
            batch_obs = Variable(rollouts.observations[:-1].view(
                -1, *obs_shape))
            batch_actions = Variable(rollouts.actions.view(-1, action_shape))

            values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions(
                batch_obs, batch_states, batch_masks, batch_actions)
            if args.reward_predictor:
                batch_obs_prime = Variable(rollouts.observations[1:].view(
                    -1, *obs_shape))
                values = torch.cat([
                    actor_critic.predict_reward(batch_obs, batch_actions,
                                                batch_obs_prime), values
                ],
                                   dim=-1)
            returns_as_variable = Variable(rollouts.returns[:-1])

            batched_v_loss = 0

            values = values.view(returns_as_variable.size())
            action_log_probs = action_log_probs.view(args.num_steps,
                                                     args.num_processes, 1)

            advantages = returns_as_variable - values
            value_loss = advantages.pow(2).sum(-1).mean()
            action_loss = -(Variable(advantages[:, :, -1].unsqueeze(-1).data) *
                            action_log_probs).mean()
            if args.reward_predictor:
                rp_error = (values[:, :, 0].data -
                            rollouts.raw_rewards).pow(2).mean()
                advantages_list.append([
                    rp_error,
                    advantages[:, :, -1].pow(2).mean().data.cpu().numpy()[0]
                ])
            else:
                advantages_list.append(
                    advantages[:, :, -1].pow(2).mean().data.cpu().numpy()[0])

            optimizer.zero_grad()
            (batched_v_loss + value_loss * args.value_loss_coef + action_loss -
             dist_entropy * args.entropy_coef).backward()

            if args.algo == 'a2c':
                nn.utils.clip_grad_norm(actor_critic.parameters(),
                                        args.max_grad_norm)

            optimizer.step()
        elif args.algo == 'ppo':
            advantages = rollouts.returns[:-1] - rollouts.value_preds[:-1]

            advantages = advantages[:, :, -1]

            advantages = (advantages - advantages.mean()) / (advantages.std() +
                                                             1e-5)

            for e in range(args.ppo_epoch):
                data_generator = rollouts.feed_forward_generator(
                    advantages, args.num_mini_batch)

                for sample in data_generator:
                    observations_batch, states_batch, actions_batch, \
                       return_batch, masks_batch, old_action_log_probs_batch, \
                            adv_targ, observations_batch_prime, true_rewards_batch, \
                            noisy_observations_batch, true_observations_batch = sample

                    # Reshape to do in a single forward pass for all steps
                    values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions(
                        Variable(observations_batch), Variable(states_batch),
                        Variable(masks_batch), Variable(actions_batch))

                    if args.reward_predictor:
                        values = torch.cat([
                            actor_critic.predict_reward(
                                Variable(observations_batch),
                                Variable(actions_batch),
                                Variable(observations_batch_prime)), values
                        ],
                                           dim=-1)

                    adv_targ = Variable(adv_targ)
                    ratio = torch.exp(action_log_probs -
                                      Variable(old_action_log_probs_batch))
                    surr1 = ratio * adv_targ
                    surr2 = torch.clamp(ratio, 1.0 - args.clip_param,
                                        1.0 + args.clip_param) * adv_targ
                    action_loss = -torch.min(
                        surr1,
                        surr2).mean()  # PPO's pessimistic surrogate (L^CLIP)

                    td = (Variable(return_batch) - values).pow(2)

                    value_loss = td.sum(-1).mean()

                    if args.reward_predictor:
                        rp_error = (values[:, 0].data -
                                    true_rewards_batch).pow(2).mean()
                        advantages_list.append(
                            [rp_error, td[:, -1].mean(0).data.cpu().numpy()])
                    else:
                        advantages_list.append(
                            td[:, -1].mean(0).data.cpu().numpy())

                    optimizer.zero_grad()
                    (value_loss + action_loss -
                     dist_entropy * args.entropy_coef).backward()

                    nn.utils.clip_grad_norm(actor_critic.parameters(),
                                            args.max_grad_norm)
                    optimizer.step()

        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [
                save_model,
                hasattr(envs, 'ob_rms') and envs.ob_rms or None
            ]

            torch.save(save_model,
                       os.path.join(save_path, args.env_name + ".pt"))

        if j % args.log_interval == 0:
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            print(
                "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, "
                "entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}".format(
                    j, total_num_steps, int(total_num_steps / (end - start)),
                    final_rewards.mean(), final_rewards.median(),
                    final_rewards.min(), final_rewards.max(),
                    dist_entropy.data[0], value_loss.data[0],
                    action_loss.data[0]))

            if len(advantages_list) > 2:
                advantages_array = np.array(advantages_list).reshape(
                    -1, len(args.gamma)).T
                for g, gamma in enumerate(args.gamma):
                    with open(
                            args.log_dir + '/MSE_' + str(gamma) +
                            '_monitor.csv', "a") as monitor_file:
                        monitor = csv.writer(monitor_file)
                        monitor.writerow(
                            [total_num_steps,
                             np.mean(advantages_array[g])])

            advantages_list = []
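The helpers add_gaussian_noise and epsilon_greedy are used above to corrupt actions and rewards but are not shown. The definitions below are a plausible minimal sketch under that reading of the arguments (noise scale, corruption probability, high/low replacement values); the real helpers may differ.

import numpy as np

def add_gaussian_noise(x, scale):
    # No-op when the noise scale is zero or negative.
    if scale <= 0:
        return x
    return x + np.random.normal(0.0, scale, size=np.shape(x))

def epsilon_greedy(reward, epsilon, high, low):
    # With probability epsilon, replace each entry with one of two extreme values.
    reward = np.asarray(reward, dtype=np.float32)
    corrupt = np.random.rand(*reward.shape) < epsilon
    extremes = np.where(np.random.rand(*reward.shape) < 0.5, high, low)
    return np.where(corrupt, extremes, reward)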
Example 6
class ReinforceAgent(ResearchAgent):
    """The TensorForceAgent. Acts through the algorith, not here."""
    def __init__(self, actor_critic, character=characters.Bomber, **kwargs):
        self._actor_critic = actor_critic
        super(ReinforceAgent, self).__init__(character, **kwargs)

    def cuda(self):
        self._actor_critic.cuda()
        if hasattr(self, "_rollout"):
            self._rollout.cuda()

    @property
    def model(self):
        return self._actor_critic

    @property
    def optimizer(self):
        return self._optimizer

    def set_eval(self):
        self._actor_critic.eval()

    def set_train(self):
        self._actor_critic.train()

    def _rollout_data(self, step, num_agent, num_agent_end=None):
        if num_agent_end is not None:
            assert (num_agent_end > num_agent)
            observations = Variable(
                self._rollout.observations[step, num_agent:num_agent_end],
                volatile=True)
            states = Variable(self._rollout.states[step,
                                                   num_agent:num_agent_end],
                              volatile=True)
            masks = Variable(self._rollout.masks[step,
                                                 num_agent:num_agent_end],
                             volatile=True)
        else:
            observations = Variable(self._rollout.observations[step,
                                                               num_agent],
                                    volatile=True)
            states = Variable(self._rollout.states[step, num_agent],
                              volatile=True)
            masks = Variable(self._rollout.masks[step, num_agent],
                             volatile=True)
        return observations, states, masks

    def actor_critic_act(self, step, num_agent=0, deterministic=False):
        """Uses the actor_critic to take action.
        Args:
          step: The int timestep at which we are acting.
          num_agent: Agent id that's running. Non-zero when agent has copies.
        Returns:
          See the actor_critic's act function in model.py.
        """
        return self._actor_critic.act(*self.get_rollout_data(step, num_agent),
                                      deterministic=deterministic)

    def get_rollout_data(self, step, num_agent, num_agent_end=None):
        return self._rollout_data(step, num_agent, num_agent_end)

    def actor_critic_call(self, step, num_agent=0):
        observations, states, masks = self._rollout_data(step, num_agent)
        return self._actor_critic(observations, states, masks)[0].data

    def _evaluate_actions(self, observations, states, masks, actions):
        return self._actor_critic.evaluate_actions(observations, states, masks,
                                                   actions)

    def _optimize(self,
                  pg_loss,
                  max_grad_norm,
                  kl_loss=None,
                  kl_factor=0,
                  use_is=False):
        self._optimizer.zero_grad()
        loss = pg_loss
        if kl_factor > 0:  # and not use_is:
            loss += kl_factor * kl_loss
        loss.backward()
        nn.utils.clip_grad_norm(self._actor_critic.parameters(), max_grad_norm)
        self._optimizer.step()
        if hasattr(self, '_scheduler'):
            self._scheduler.step(loss)

    def halve_lr(self):
        for i, param_group in enumerate(self._optimizer.param_groups):
            old_lr = float(param_group['lr'])
            new_lr = max(old_lr * 0.5, 1e-7)
            param_group['lr'] = new_lr

    def compute_advantages(self, next_value_agents, use_gae, gamma, tau):
        for num_agent, next_value in enumerate(next_value_agents):
            self._rollout.compute_returns(next_value, use_gae, gamma, tau,
                                          num_agent)
        advantages = self._rollout.compute_advantages(reinforce=True)
        diff = (advantages - advantages.mean())
        advantages = diff / (advantages.std() + 1e-5)
        return advantages

    def initialize(self, args, obs_shape, action_space,
                   num_training_per_episode, num_episodes, total_steps,
                   num_epoch, optimizer_state_dict, num_steps, uniform_v,
                   uniform_v_prior):
        params = self._actor_critic.parameters()
        self._optimizer = optim.Adam(params, lr=args.lr, eps=args.eps)
        if optimizer_state_dict:
            self._optimizer.load_state_dict(optimizer_state_dict)
        if args.use_lr_scheduler:
            self._scheduler = optim.lr_scheduler.ReduceLROnPlateau(
                self._optimizer, mode='min', verbose=True)

        self._rollout = RolloutStorage(num_steps, args.num_processes,
                                       obs_shape, action_space,
                                       self._actor_critic.state_size,
                                       num_training_per_episode)
        self.num_episodes = num_episodes
        self.total_steps = total_steps
        self.num_epoch = num_epoch
        self.uniform_v = uniform_v
        self.uniform_v_prior = uniform_v_prior

    def update_rollouts(self, obs, timestep):
        self._rollout.observations[timestep, :, :, :, :, :].copy_(obs)

    def insert_rollouts(self,
                        step,
                        current_obs,
                        states,
                        action,
                        action_log_prob,
                        value,
                        reward,
                        mask,
                        action_log_prob_distr=None,
                        dagger_prob_distr=None,
                        expert_action_log_prob=None,
                        training_action_log_prob=None):
        self._rollout.insert(step, current_obs, states, action,
                             action_log_prob, value, reward, mask,
                             action_log_prob_distr, dagger_prob_distr,
                             expert_action_log_prob, training_action_log_prob)

    def reinforce(self,
                  advantages,
                  num_mini_batch,
                  batch_size,
                  num_steps,
                  max_grad_norm,
                  action_space,
                  anneal=False,
                  lr=1e-4,
                  eps=1e-5,
                  kl_factor=0,
                  use_is=False,
                  use_retrace=False,
                  lambda_retrace=1.0):
        pg_losses = []
        kl_losses = []
        kl_loss = None
        total_losses = []

        for sample in self._rollout.feed_forward_generator(
                advantages, num_mini_batch, batch_size, num_steps,
                action_space, kl_factor, use_is):
            observations_batch, states_batch, actions_batch, return_batch, \
                masks_batch, old_action_log_probs_batch, adv_targ, \
                action_log_probs_distr_batch, dagger_probs_distr_batch, \
                expert_action_log_probs_batch, training_action_log_probs_batch = sample

            # Reshape to do in a single forward pass for all steps
            result = self._evaluate_actions(Variable(observations_batch),
                                            Variable(states_batch),
                                            Variable(masks_batch),
                                            Variable(actions_batch))
            values, action_log_probs, dist_entropy, states = result

            adv_targ = Variable(adv_targ)
            logprob = action_log_probs

            if use_is:
                behavior_action_probs_batch = kl_factor * torch.exp(Variable(expert_action_log_probs_batch)) + \
                    (1 - kl_factor) * torch.exp(Variable(training_action_log_probs_batch))

                training_action_probs_batch = torch.exp(
                    Variable(training_action_log_probs_batch))

                training_single_action_probs_batch = training_action_probs_batch.gather(
                    1, Variable(actions_batch))
                behavior_single_action_probs_batch = behavior_action_probs_batch.gather(
                    1, Variable(actions_batch))

                is_ratio = training_single_action_probs_batch / behavior_single_action_probs_batch
                if use_retrace:
                    # Retrace-style truncation: cap the importance ratio at 1.
                    truncated_ratio = np.array(
                        [min(1, p.data[0]) for p in is_ratio])
                    truncated_ratio = Variable(
                        torch.from_numpy(
                            truncated_ratio).float().cuda().unsqueeze(1))
                    importance_weight = lambda_retrace * truncated_ratio
                else:
                    importance_weight = is_ratio

                pg_loss = -(logprob * adv_targ * importance_weight).mean()
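                # Rough reading of the objective above: loss = -E[w * log_pi(a|s) * A],
                # where w is either the raw importance ratio pi/mu or, with use_retrace,
                # a truncated ratio lambda_retrace * min(1, pi/mu), so a single large
                # ratio (e.g. 2.3 -> 1.0) cannot dominate the minibatch gradient.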

                print("\n#################################################\n")
                print("action log probs ", logprob)
                print("adv targ ", adv_targ)
                print("training probs ", training_action_probs_batch)
                print("behavior probs ", behavior_action_probs_batch)
                print("ratio ", is_ratio)
                print("IS ", importance_weight)
                print("loss: ", pg_loss)
                print("#################################################\n")

            else:
                pg_loss = -(logprob * adv_targ).mean()

            total_loss = pg_loss

            if kl_factor > 0:  # and not use_is:
                criterion = nn.KLDivLoss()
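                # Note: nn.KLDivLoss expects its first argument to be log-probabilities
                # and its target to be probabilities, which presumably is why the rollout
                # stores log-probs for the policy and plain probs for the dagger expert.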
                kl_loss = criterion(Variable(action_log_probs_distr_batch),
                                    Variable(dagger_probs_distr_batch))
                total_loss += kl_factor * kl_loss

            # _optimize adds kl_factor * kl_loss itself, so pass the raw pg_loss here to
            # avoid counting the KL term twice (total_loss is kept for logging below).
            self._optimize(pg_loss, max_grad_norm, kl_loss, kl_factor,
                           use_is)
            lr = self._optimizer.param_groups[0]['lr']

            pg_losses.append(pg_loss.data[0])
            if kl_factor > 0:  # and not use_is:
                kl_losses.append(kl_loss.data[0])
            total_losses.append(total_loss.data[0])

        return pg_losses, kl_losses, total_losses, lr

    def copy_ex_model(self):
        """Creates a copy without the model. This is for operating with homogenous training."""
        return ReinforceAgent(None, self._character)

    def after_epoch(self):
        self._rollout.after_epoch()
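    # A rough usage sketch, assumed from the method names above rather than taken from
    # the original training loop:
    #   agent.initialize(args, obs_shape, action_space, ...)
    #   for step in range(num_steps): agent.insert_rollouts(step, obs, states, ...)
    #   advantages = agent.compute_advantages(next_value_agents, use_gae, gamma, tau)
    #   pg_losses, kl_losses, total_losses, lr = agent.reinforce(advantages, ...)
    #   agent.after_epoch()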
Esempio n. 7
0
class PPOAgent(object):
	def __init__(self,args):
		self.args = args
		self.device = torch.device('cuda') if args.cuda else torch.device('cpu')
		dummy_env = gym.make(self.args.env_name)
		self.actor = ACNet(dummy_env.action_space.n,args.feedforward)
		del dummy_env
		if args.load_dir is not None:
			actorState = torch.load(args.load_dir,map_location=lambda storage, loc: storage)
		if args.continue_training:
			self.actor.load_state_dict(actorState)
			print("Loaded pretrained model successfully")
		if args.transfer:
			self.actor.load_autoturn_model(actorState)
		if args.cuda:
			self.actor.cuda()
		self.actor_optimizer = optim.Adam(self.actor.parameters(),lr=self.args.lr)
		self.env_list = [make_env(self.args.env_name,self.args.seed,i) for i in range(self.args.num_processes)]
		if self.args.num_processes > 1:
			self.envs = gym_vecenv.SubprocVecEnv(self.env_list)
		else:
			self.envs = gym_vecenv.DummyVecEnv(self.env_list)
		if len(self.envs.observation_space.shape) == 1:
			self.envs = gym_vecenv.VecNormalize(self.envs)
		
		self.obs_shape = self.envs.observation_space.shape
		self.obs_shape = (self.obs_shape[0] * args.num_stack, *self.obs_shape[1:])
		self.state_shape = 1 if args.feedforward else 256
		self.rollouts = RolloutStorage(self.args.num_fwd_steps, self.args.num_processes, self.obs_shape, self.envs.action_space, self.state_shape)
		self.num_updates = int(args.num_frames)//args.num_fwd_steps//args.num_processes
		self.current_obs = torch.zeros(self.args.num_processes,*self.obs_shape)
		self.writer = SummaryWriter(log_dir=self.args.save_dir)
		self.fortress_threshold = 650
		self.scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(self.actor_optimizer,
											mode='max',factor=0.2,patience=15,verbose=True,threshold=1e-3,
											threshold_mode='rel')
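		# With mode='max' the scheduler multiplies the learning rate by factor (0.2) once
		# the monitored value (mean reward, see the commented scheduler.step call in
		# train()) has stopped improving for patience (15) consecutive checks.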
		#self.scheduler2 = torch.optim.lr_scheduler.MultiStepLR(self.actor_optimizer,milestones=[40,80],gamma=0.3)
	
	def update_current_obs(self,obs):
		shape_dim0 = self.envs.observation_space.shape[0]
		obs = torch.from_numpy(obs).float()
		if self.args.num_stack > 1:
				self.current_obs[:, :-shape_dim0] = self.current_obs[:, shape_dim0:]
		self.current_obs[:, -shape_dim0:] = obs

	def train(self):

		obs = self.envs.reset()
		self.update_current_obs(obs)
		self.rollouts.observations[0].copy_(self.current_obs)

		episode_rewards = torch.zeros([self.args.num_processes,1])
		final_rewards = torch.zeros([self.args.num_processes,1])
		num_destruction = 0
		if self.args.cuda:
			self.current_obs = self.current_obs.cuda()
			self.rollouts.cuda()

		start = time.time()
		num_episodes = 0
		for iteration in range(self.num_updates):
			for step in range(self.args.num_fwd_steps):
				with torch.no_grad():
					value,action,action_log_prob,states = self.actor.act(self.rollouts.observations[step],
																															self.rollouts.states[step],
																															self.rollouts.masks[step])
				cpu_actions = action.squeeze(1).cpu().numpy()
				obs,reward,done,info = self.envs.step(cpu_actions)
				num_destruction += sum(info)
				reward = torch.from_numpy(np.expand_dims(np.stack(reward),1)).float()

				episode_rewards += reward
				masks = torch.FloatTensor([[0.0] if i else [1.0] for i in done])
				final_rewards*=masks
				final_rewards += (1-masks)*episode_rewards
				episode_rewards *= masks

				if self.args.cuda:
					masks = masks.cuda()
				if self.current_obs.dim() == 4:
					self.current_obs *= masks.unsqueeze(2).unsqueeze(2)
				else:
					self.current_obs *= masks

				self.update_current_obs(obs)
				self.rollouts.insert(step,self.current_obs,states,action,action_log_prob,value,reward,masks)

			with torch.no_grad():
				next_value = self.actor.get_value(self.rollouts.observations[-1],
																					self.rollouts.states[-1],
																					self.rollouts.masks[-1]).detach()
			
			self.rollouts.compute_returns(next_value,True,self.args.gamma,self.args.tau)
			if not self.args.a2c:
				advantages = self.rollouts.returns[:-1] - self.rollouts.value_preds[:-1]
				advantages = (advantages - advantages.mean())/(advantages.std()+1e-5)

				for i in range(self.args.ppo_epoch):
					if self.args.feedforward:
						data_generator = self.rollouts.feed_forward_generator(advantages,self.args.num_mini_batch)
					else:
						data_generator = self.rollouts.recurrent_generator(advantages,self.args.num_mini_batch)
					for sample in data_generator:
						observations_batch, states_batch, actions_batch, \
							return_batch, masks_batch, old_action_log_probs_batch, \
								adv_targ = sample
						values,action_log_probs,dist_entropy,states = self.actor.evaluate_actions(observations_batch,
																																											states_batch,
																																											masks_batch,
																																											actions_batch)
						ratio = torch.exp(action_log_probs - old_action_log_probs_batch)
						surr1 = ratio*adv_targ
						surr2 = torch.clamp(ratio,1.0-self.args.clip_param,1.0+self.args.clip_param)*adv_targ
						action_loss = -torch.min(surr1,surr2).mean()
						value_loss = (values-return_batch).pow(2).mean()
						self.actor_optimizer.zero_grad()
						actorLoss = action_loss + self.args.value_loss_coeff*value_loss - self.args.entropy_coeff*dist_entropy
						actorLoss.backward()
						torch.nn.utils.clip_grad_norm_(self.actor.parameters(),self.args.max_grad_norm)
						self.actor_optimizer.step()
			else:
				values,action_log_probs,dist_entropy,states = self.actor.evaluate_actions(self.rollouts.observations[:-1].view(-1,*self.obs_shape),
																																									self.rollouts.states[0].view(-1,1 if self.args.feedforward else 256),
																																									self.rollouts.masks[:-1].view(-1,1),
																																									self.rollouts.actions.view(-1,1))
				values = values.view(self.args.num_fwd_steps,self.args.num_processes,1)
				action_log_probs = action_log_probs.view(self.args.num_fwd_steps,self.args.num_processes,1)
				advantages = self.rollouts.returns[:-1] - values
				value_loss = advantages.pow(2).mean()
				action_loss = -(advantages.detach()*action_log_probs).mean()
				self.actor_optimizer.zero_grad()
				actorLoss = action_loss + self.args.value_loss_coeff*value_loss - self.args.entropy_coeff*dist_entropy
				actorLoss.backward()
				self.actor_optimizer.step()

			self.rollouts.after_update()

			if num_destruction>self.fortress_threshold:
				torch.save(self.actor.state_dict(),self.args.save_dir+'/'+self.args.env_name+'_'+str(iteration)+'_ppo_actor.pth.tar')
				self.fortress_threshold = num_destruction

			if iteration%self.args.log_interval == 0:
				end = time.time()
				total_num_steps = (iteration+1)*self.args.num_processes*self.args.num_fwd_steps
				num_destruction /= self.args.num_processes
				print("Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}, num fortress destroyed {:.2f}".
								format(iteration, total_num_steps,
											 int(total_num_steps / (end - start)),
											 final_rewards.mean(),
											 final_rewards.median(),
											 final_rewards.min(),
											 final_rewards.max(), dist_entropy.item(),
											 value_loss.item(), action_loss.item(),num_destruction))
				self.writer.add_scalar('data/rewardmean',final_rewards.mean(),total_num_steps)
				self.writer.add_scalar('data/distentropy',dist_entropy.item(),total_num_steps)
				self.writer.add_scalar('data/valueloss',value_loss.item(),total_num_steps)
				self.writer.add_scalar('data/actionloss',action_loss.item(),total_num_steps)
				self.writer.add_scalar('data/numdestruction',num_destruction,total_num_steps)
				#self.scheduler.step(final_rewards.mean())
				#self.scheduler2.step()
				num_destruction = 0

			if iteration%self.args.save_interval==0:
				torch.save(self.actor.state_dict(),self.args.save_dir+'/'+self.args.env_name+'_'+str(iteration)+'_ppo_actor.pth.tar')
				self.writer.export_scalars_to_json(self.args.save_dir+'/'+self.args.env_name+"_all_scalars.json")

		self.envs.close()
		self.writer.close()
Esempio n. 8
0
def main():
    """
    主程序
    :return:
    """
    num_cls = args.wave_num * args.k + 1  # 所有的路由和波长选择组合,加上啥都不选
    action_shape = 1  # action的维度,默认是1.
    num_updates = int(
        args.steps) // args.workers // args.num_steps  # 梯度一共需要更新的次数
    if args.append_route.startswith("True"):
        channel_num = args.wave_num + args.k
    else:
        channel_num = args.wave_num

    # Parse the weight argument
    if args.weight.startswith('None'):
        weight = None
    else:
        weight = args.weight
    # Build the actor_critic
    if args.mode.startswith('alg'):
        # ksp(args, weight)
        return
    elif args.mode.startswith('learning'):
        # In CNN learning mode, the obs shape should be CHW
        obs_shape = (channel_num, args.img_height, args.img_width)
        if args.cnn.startswith('mobilenetv2'):
            actor_critic = MobileNetV2(in_channels=channel_num,
                                       num_classes=num_cls,
                                       t=6)
        elif args.cnn.startswith('simplenet'):
            actor_critic = SimpleNet(in_channels=channel_num,
                                     num_classes=num_cls)
        elif args.cnn.startswith('simplestnet'):
            actor_critic = SimplestNet(in_channels=channel_num,
                                       num_classes=num_cls)
        elif args.cnn.startswith('alexnet'):
            actor_critic = AlexNet(in_channels=channel_num,
                                   num_classes=num_cls)
        elif args.cnn.startswith('squeezenet'):
            actor_critic = SqueezeNet(in_channels=channel_num,
                                      num_classes=num_cls,
                                      version=1.0)
        elif args.cnn.startswith('expandsimplenet'):
            actor_critic = ExpandSimpleNet(in_channels=channel_num,
                                           num_classes=num_cls,
                                           expand_factor=args.expand_factor)
        elif args.cnn.startswith('deepersimplenet'):
            actor_critic = DeeperSimpleNet(in_channels=channel_num,
                                           num_classes=num_cls,
                                           expand_factor=args.expand_factor)
        else:
            raise NotImplementedError

        # Build the optimizer
        if args.algo.startswith("a2c"):
            optimizer = optim.RMSprop(actor_critic.parameters(),
                                      lr=args.base_lr,
                                      eps=args.epsilon,
                                      alpha=args.alpha)
        elif args.algo.startswith("ppo"):
            optimizer = optim.Adam(actor_critic.parameters(),
                                   lr=args.base_lr,
                                   eps=args.epsilon)
        else:
            raise NotImplementedError
    else:
        raise NotImplementedError

    if args.cuda.startswith("True"):
        # Use CUDA for computation if requested
        actor_critic.cuda()
        # actor_critic = DistModule(actor_critic)

    # Check whether we are in evaluation mode
    if args.evaluate:
        print("evaluate mode")
        models = {}
        times = 1
        prefix = "trained_models"
        directory = os.path.join(prefix, 'a2c', args.cnn, args.step_over)
        env = RwaGame(net_config=args.net,
                      wave_num=args.wave_num,
                      rou=args.rou,
                      miu=args.miu,
                      max_iter=args.max_iter,
                      k=args.k,
                      mode=args.mode,
                      img_width=args.img_width,
                      img_height=args.img_height,
                      weight=weight,
                      step_over=args.step_over)

        for model_file in reversed(
                sorted(os.listdir(directory),
                       key=lambda item: int(item.split('.')[0]))):
            model_file = os.path.join(directory, model_file)
            print("evaluate model {}".format(model_file))
            params = torch.load(model_file)
            actor_critic.load_state_dict(params['state_dict'])
            actor_critic.eval()

            models[params['update_i']] = {}

            print("model loading is finished")
            for t in range(times):
                total_reward, total_services, allocated_services = 0, 0, 0
                obs, reward, done, info = env.reset()
                while not done:
                    inp = Variable(torch.Tensor(obs).unsqueeze(0),
                                   volatile=True)  # no gradient tracking
                    value, action, action_log_prob = actor_critic.act(
                        inputs=inp, deterministic=True)  # deterministic action selection
                    action = action.data.numpy()[0]
                    obs, reward, done, info = env.step(action=action[0])
                    total_reward += reward
                    if reward == ARRIVAL_NEWPORT or reward == ARRIVAL_NOPORT:
                        allocated_services += 1
                    if args.step_over.startswith('one_time'):
                        if info:
                            total_services += 1
                    elif args.step_over.startswith('one_service'):
                        total_services += 1
                    else:
                        raise NotImplementedError
                models[params['update_i']]['time'] = t
                models[params['update_i']]['reward'] = total_reward
                models[params['update_i']]['total_services'] = total_services
                models[params['update_i']][
                    'allocated_services'] = allocated_services
                models[params['update_i']]['bp'] = (
                    total_services - allocated_services) / total_services
        # Print the simulation results
        # print("|updated model|test index|reward|bp|total services|allocated services|")
        # print("|:-----|:-----|:-----|:-----|:-----|:-----|")
        # for m in sorted(models):
            for i in range(times):
                print("|{up}|{id}|{r}|{bp:.4f}|{ts}|{als}|".format(
                    up=params['update_i'],
                    id=models[params['update_i']]['time'],
                    r=models[params['update_i']]['reward'],
                    bp=models[params['update_i']]['bp'],
                    ts=models[params['update_i']]['total_services'],
                    als=models[params['update_i']]['allocated_services']))
        return

    # Create the game environments
    envs = [
        make_env(net_config=args.net,
                 wave_num=args.wave_num,
                 k=args.k,
                 mode=args.mode,
                 img_width=args.img_width,
                 img_height=args.img_height,
                 weight=weight,
                 step_over=args.step_over) for _ in range(args.workers)
    ]
    envs = SubprocEnv(envs)
    # Create the container that stores and updates rollout variables during the game
    rollout = RolloutStorage(num_steps=args.num_steps,
                             num_processes=args.workers,
                             obs_shape=obs_shape,
                             action_shape=action_shape)
    current_obs = torch.zeros(args.workers, *obs_shape)

    observation, _, _, _ = envs.reset()
    update_current_obs(current_obs, observation, channel_num)

    rollout.observations[0].copy_(current_obs)
    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.workers, 1])
    final_rewards = torch.zeros([args.workers, 1])

    if args.cuda.startswith("True"):
        current_obs = current_obs.cuda()
        rollout.cuda()

    start = time.time()
    log_start = time.time()
    total_services = 0  # number of service requests that arrived during log_interval
    allocated_services = 0  # number of services successfully allocated during log_interval
    update_begin = 0

    # Check whether we are resuming a previous training run
    if args.resume:
        pms = torch.load(args.resume)
        actor_critic.load_state_dict(pms['state_dict'])
        optimizer.load_state_dict(pms['optimizer'])
        update_begin = pms['update_i']
        print("resume process from update_i {}, with base_lr {}".format(
            update_begin, args.base_lr))

    for updata_i in range(update_begin, num_updates):
        update_start = time.time()
        for step in range(args.num_steps):
            # Select actions
            inp = Variable(rollout.observations[step], volatile=True)  # no gradient tracking
            value, action, action_log_prob = actor_critic.act(
                inputs=inp, deterministic=False)
            # print(action)
            # Squeeze the dimension and run on CPU. Since no GPU is involved here this is
            # effectively a no-op, kept only as a reminder.
            cpu_actions = action.data.squeeze(1).cpu().numpy()
            # Observe reward and the next obs
            envs.step_async(cpu_actions)
            obs, reward, done, info = envs.step_wait()  # reward and done are (n,)-shaped numpy.ndarray vectors
            #  if reward == ARRIVAL_NEWPORT_NEWPORT or reward == ARRIVAL_NOPORT_NEWPORT or reward == ARRIVAL_NOPORT_NOPORT:
            #     allocated_services += 1
            print(reward)
            for i in reward:
                if i == ARRIVAL_NEWPORT or i == ARRIVAL_NOPORT:
                    allocated_services += 1
        #  allocated_services += (reward==ARRIVAL_NEWPORT_NEWPORT or reward==ARRIVAL_NOPORT_NEWPORT or reward==ARRIVAL_NOPORT_NOPORT).any().sum()  # count the rewards that correspond to successful allocations
        # TODO: unresolved
            if args.step_over.startswith('one_service'):
                total_services += (info == True).sum()  # count how many service-arrival events this step contains
            # elif args.step_over.startswith('one_service'):
            #     total_services += args.workers
            else:
                raise NotImplementedError
            reward = torch.from_numpy(np.expand_dims(reward, 1)).float()
            episode_rewards += reward  # accumulate the reward
            # If the game is over, reset episode_rewards and final_rewards and start
            # accumulating again from the returned reward.
            masks = torch.FloatTensor([[0.0] if d else [1.0] for d in done
                                       ])  # True --> 0, False --> 1
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks
            #            if done[len(done)-1]:
            #               print('final port count at game end:', envs.get_all_edges_port())

            if args.cuda.startswith("True"):
                masks = masks.cuda()

            # Expand masks by two dimensions and multiply with current_obs, so the obs of
            # finished game processes become 0 (an all-black image, i.e. the game-over frame).
            current_obs *= masks.unsqueeze(2).unsqueeze(2)
            update_current_obs(current_obs=current_obs,
                               obs=obs,
                               channel_num=channel_num)
            # Store the results obtained in this step
            rollout.insert(step=step,
                           current_obs=current_obs,
                           action=action.data,
                           action_log_prob=action_log_prob.data,
                           value_pred=value.data,
                           reward=reward,
                           mask=masks)

        # TODO: force stop
        # envs.close()
        # return

        # Be careful not to reuse variables defined in the for loop above; mind the
        # naming and use of the variables below.
        next_inp = Variable(rollout.observations[-1], volatile=True)  # no gradient tracking
        next_value = actor_critic(next_inp)[0].data  # value estimate for the next step
        rollout.compute_returns(next_value=next_value,
                                use_gae=False,
                                gamma=args.gamma,
                                tau=None)
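        # With use_gae=False the returns are presumably filled backwards as plain
        # discounted sums, roughly:
        #   returns[t] = rewards[t] + gamma * masks[t + 1] * returns[t + 1]
        # bootstrapped from next_value at the last index.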

        if args.algo.startswith('a2c'):
            # A2C gradient update
            inps = Variable(rollout.observations[:-1].view(-1, *obs_shape))
            acts = Variable(rollout.actions.view(-1, action_shape))

            # print("a2cs's acts size is {}".format(acts.size()))
            value, action_log_probs, cls_entropy = actor_critic.evaluate_actions(
                inputs=inps, actions=acts)
            print(cls_entropy.data)
            # print("inputs' shape is {}".format(inps.size()))
            # print("value's shape is {}".format(value.size()))
            value = value.view(args.num_steps, args.workers, 1)
            # print("action_log_probs's shape is {}".format(action_log_probs.size()))
            action_log_probs = action_log_probs.view(args.num_steps,
                                                     args.workers, 1)
            # Compute the losses
            advantages = Variable(rollout.returns[:-1]) - value
            value_loss = advantages.pow(2).mean()  # L2Loss or MSE Loss
            action_loss = -(Variable(advantages.data) *
                            action_log_probs).mean()
            total_loss = value_loss * args.value_loss_coef + action_loss - cls_entropy * args.entropy_coef

            optimizer.zero_grad()
            total_loss.backward()
            # Gradient clipping (https://www.cnblogs.com/lindaxin/p/7998196.html)
            nn.utils.clip_grad_norm(actor_critic.parameters(),
                                    args.max_grad_norm)
            # average_gradients(actor_critic)
            optimizer.step()
        elif args.algo.startswith('ppo'):
            # PPO gradient update
            advantages = rollout.returns[:-1] - rollout.value_preds[:-1]
            advantages = (advantages - advantages.mean()) / (advantages.std() +
                                                             1e-5)
            for e in range(args.ppo_epoch):
                data_generator = rollout.feed_forward_generator(
                    advantages, args.num_mini_batch)

                for sample in data_generator:
                    observations_batch, actions_batch, \
                    return_batch, masks_batch, old_action_log_probs_batch, \
                    adv_targ = sample

                    # Reshape to do in a single forward pass for all steps
                    values, action_log_probs, cls_entropy = actor_critic.evaluate_actions(
                        Variable(observations_batch), Variable(actions_batch))

                    adv_targ = Variable(adv_targ)
                    ratio = torch.exp(action_log_probs -
                                      Variable(old_action_log_probs_batch))
                    surr1 = ratio * adv_targ
                    surr2 = torch.clamp(ratio, 1.0 - args.clip_param,
                                        1.0 + args.clip_param) * adv_targ
                    action_loss = -torch.min(
                        surr1,
                        surr2).mean()  # PPO's pessimistic surrogate (L^CLIP)

                    value_loss = (Variable(return_batch) -
                                  values).pow(2).mean()

                    # Apply the PPO update (mirroring the A2C branch above)
                    optimizer.zero_grad()
                    total_loss = (value_loss * args.value_loss_coef + action_loss -
                                  cls_entropy * args.entropy_coef)
                    total_loss.backward()
                    nn.utils.clip_grad_norm(actor_critic.parameters(),
                                            args.max_grad_norm)
                    optimizer.step()

        # Post-update bookkeeping
        rollout.after_update()
        update_time = time.time() - update_start
        print("updates {} finished, cost time {}:{}".format(
            updata_i, update_time // 60, update_time % 60))
        # print("total services is {}".format(total_services))
        # Save the model
        if updata_i % args.save_interval == 0:
            save_path = os.path.join(args.save_dir, 'a2c')
            save_path = os.path.join(save_path, args.cnn)
            save_path = os.path.join(save_path, args.step_over)
            save_path = os.path.join(save_path, args.parameter)
            if os.path.exists(save_path) and os.path.isdir(save_path):
                pass
            else:
                os.makedirs(save_path)
            save_file = os.path.join(save_path, str(updata_i) + '.tar')
            save_content = {
                'update_i': updata_i,
                'state_dict': actor_critic.state_dict(),
                'optimizer': optimizer.state_dict(),
                'mean_reward': final_rewards.mean()
            }
            torch.save(save_content, save_file)

        # Print the log
        if updata_i % args.log_interval == 0:
            end = time.time()
            interval = end - log_start
            remaining_seconds = (num_updates - updata_i -
                                 1) / args.log_interval * interval
            remaining_hours = int(remaining_seconds // 3600)
            remaining_minutes = int((remaining_seconds % 3600) / 60)
            total_num_steps = (updata_i + 1) * args.workers * args.num_steps
            blocked_services = total_services - allocated_services
            bp = blocked_services / total_services
            wave_port_num, total_port_num = envs.get_all_edges_port()
            wave_occ_sum, resource_utilization_rate = envs.get_resourceUtilization(
            )

            print(
                "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, \
            entropy {:.5f}, value loss {:.5f}, policy loss {:.8f}, remaining time {}:{}, blocking rate {}/{}={}, \
                  per-wavelength port counts {}, total port count {}, bandwidth occupancy {}, resource utilization {}".format(
                    updata_i, total_num_steps,
                    int(total_num_steps / (end - start)), final_rewards.mean(),
                    final_rewards.median(), final_rewards.min(),
                    final_rewards.max(), cls_entropy.data, value_loss.data,
                    action_loss.data, remaining_hours, remaining_minutes,
                    blocked_services, total_services, bp, wave_port_num,
                    total_port_num, wave_occ_sum, resource_utilization_rate))
            # raise NotImplementedError
            total_services = 0
            allocated_services = 0
            log_start = time.time()

    envs.close()
Esempio n. 9
0
        if env_info.local_done[0]:
            break

    if step == 0:
        print('ppo update')
        # do ppo update
        next_value = agent(obs)[0].data

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)
        advantages = rollouts.returns[:-1] - rollouts.value_preds[:-1]
        advantages = (advantages - advantages.mean()) / (advantages.std() +
                                                         1e-5)

        for e in range(args.ppo_epoch):
            data_generator = rollouts.feed_forward_generator(
                advantages, args.num_mini_batch)

            for sample in data_generator:
                observations_batch, actions_batch, \
                return_batch, masks_batch, old_action_log_probs_batch, \
                adv_targ = sample

                # Reshape to do in a single forward pass for all steps
                action_log_probs, dist_entropy, values = agent.evaluate_action(
                    Variable(observations_batch), Variable(actions_batch))

                adv_targ = Variable(adv_targ)
                ratio = torch.exp(action_log_probs -
                                  Variable(old_action_log_probs_batch))
                surr1 = ratio * adv_targ
                surr2 = torch.clamp(ratio, 1.0 - args.clip_param,
Esempio n. 10
0
def main():
    print("#######")
    print(
        "WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards"
    )
    print("#######")

    os.environ['OMP_NUM_THREADS'] = '1'

    if args.vis:
        from visdom import Visdom
        viz = Visdom()
        win = None

    envs = [
        make_env(args.env_name, args.seed, i, args.log_dir)
        for i in range(args.num_processes)
    ]

    if args.num_processes > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)

    if len(envs.observation_space.shape) == 1:
        envs = VecNormalize(envs)

    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:]
                 )  # I guess the obs_shape[0] is channel number

    if len(envs.observation_space.shape) == 3:
        actor_critic = CNNPolicy(obs_shape[0], envs.action_space,
                                 args.recurrent_policy)
    else:
        assert not args.recurrent_policy, \
            "Recurrent policy is not implemented for the MLP controller"
        actor_critic = MLPPolicy(obs_shape[0], envs.action_space)

    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]

    if args.cuda:
        actor_critic.cuda()

    if args.algo == 'a2c':
        optimizer = optim.RMSprop(actor_critic.parameters(),
                                  args.lr,
                                  eps=args.eps,
                                  alpha=args.alpha)
    elif args.algo == 'ppo':
        optimizer = optim.Adam(actor_critic.parameters(),
                               args.lr,
                               eps=args.eps)
    elif args.algo == 'acktr':
        optimizer = KFACOptimizer(actor_critic)

    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape,
                              envs.action_space, actor_critic.state_size)
    current_obs = torch.zeros(args.num_processes, *obs_shape)

    def update_current_obs(obs):
        shape_dim0 = envs.observation_space.shape[0]
        obs = torch.from_numpy(obs).float()
        if args.num_stack > 1:
            current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
        current_obs[:, -shape_dim0:] = obs

    obs = envs.reset()
    update_current_obs(obs)

    rollouts.observations[0].copy_(current_obs)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])

    if args.cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()

    start = time.time()
    for j in range(num_updates):
        for step in range(args.num_steps):
            # args.num_steps should be the length of interactions before each updating/training
            # Sample actions
            value, action, action_log_prob, states = actor_critic.act(
                Variable(rollouts.observations[step], volatile=True),
                Variable(rollouts.states[step], volatile=True),
                Variable(rollouts.masks[step], volatile=True))
            # act() returns the state value, the sampled action, its log-prob and the hidden states
            cpu_actions = action.data.squeeze(1).cpu().numpy()

            # Observe reward and next obs
            obs, reward, done, info = envs.step(cpu_actions)
            reward = torch.from_numpy(np.expand_dims(np.stack(reward),
                                                     1)).float()
            episode_rewards += reward

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            if args.cuda:
                masks = masks.cuda()

            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks

            update_current_obs(obs)
            rollouts.insert(
                step, current_obs, states.data, action.data,
                action_log_prob.data, value.data, reward, masks
            )  # so the rollout stores one batch of interaction sequences, each sequence has length of args.num_steps

        next_value = actor_critic(
            Variable(rollouts.observations[-1], volatile=True),
            Variable(rollouts.states[-1], volatile=True),
            Variable(rollouts.masks[-1], volatile=True))[0].data

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)
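        # When args.use_gae is set, the returns are presumably built from GAE:
        #   delta_t = r_t + gamma * V(s_{t+1}) * mask_{t+1} - V(s_t)
        #   A_t     = delta_t + gamma * tau * mask_{t+1} * A_{t+1}
        # with returns[t] = A_t + V(s_t); otherwise plain discounted returns are used.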

        if args.algo in ['a2c', 'acktr']:
            values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions(
                Variable(rollouts.observations[:-1].view(-1, *obs_shape)),
                Variable(rollouts.states[0].view(-1, actor_critic.state_size)),
                Variable(rollouts.masks[:-1].view(-1, 1)),
                Variable(rollouts.actions.view(-1, action_shape)))
            # values should be values of observations, states are the hidden states used in rnn module, by pwang8

            values = values.view(
                args.num_steps, args.num_processes,
                1)  # values are estimated current state values
            action_log_probs = action_log_probs.view(args.num_steps,
                                                     args.num_processes, 1)

            # rollouts.returns are the "action" values calculated following Bellman's equation: gamma * state_value(t+1) + reward(t)
            advantages = Variable(
                rollouts.returns[:-1]
            ) - values  # This is also the definition of advantage value (action_value - state_value).
            value_loss = advantages.pow(
                2).mean()  # values are estimated current state_value(t)

            action_loss = -(Variable(advantages.data) *
                            action_log_probs).mean()

            # If ACKTR is utilized, not only is a different optimizer used, but an additional loss term is also added
            if args.algo == 'acktr' and optimizer.steps % optimizer.Ts == 0:
                # Sampled fisher, see Martens 2014
                actor_critic.zero_grad()
                pg_fisher_loss = -action_log_probs.mean()

                value_noise = Variable(torch.randn(values.size()))
                if args.cuda:
                    value_noise = value_noise.cuda()

                sample_values = values + value_noise
                vf_fisher_loss = -(
                    values - Variable(sample_values.data)
                ).pow(2).mean(
                )  # unclear how this differs from just sampling some random noise directly

                fisher_loss = pg_fisher_loss + vf_fisher_loss
                optimizer.acc_stats = True
                fisher_loss.backward(retain_graph=True)
                optimizer.acc_stats = False

            optimizer.zero_grad()
            (value_loss * args.value_loss_coef + action_loss -
             dist_entropy * args.entropy_coef).backward()

            if args.algo == 'a2c':
                nn.utils.clip_grad_norm(actor_critic.parameters(),
                                        args.max_grad_norm)

            optimizer.step()
        elif args.algo == 'ppo':
            advantages = rollouts.returns[:-1] - rollouts.value_preds[:-1]  # calculating the advantage of each action
            advantages = (advantages - advantages.mean()) / (advantages.std() +
                                                             1e-5)

            # The difference from this ppo optimization to the optimization above is that: it updates params for
            # multiple epochs in ppo optimization. Because of this, it samples from the rollouts storage a minibatch
            # every time to calculate gradient. Sampling is conducted for optimization purpose.
            for e in range(args.ppo_epoch):
                if args.recurrent_policy:
                    data_generator = rollouts.recurrent_generator(
                        advantages, args.num_mini_batch)
                else:
                    data_generator = rollouts.feed_forward_generator(
                        advantages, args.num_mini_batch)

                for sample in data_generator:
                    observations_batch, states_batch, actions_batch, \
                       return_batch, masks_batch, old_action_log_probs_batch, \
                            adv_targ = sample

                    # Reshape to do in a single forward pass for all steps
                    values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions(
                        Variable(observations_batch), Variable(states_batch),
                        Variable(masks_batch), Variable(actions_batch))
                    # For the 1st epoch of updating, I guess the action_log_probs are the same as old_action_log_probs_batch
                    # because params of the NN have not been updated at that time. But later, in other updating epochs,
                    # this ratio will drift away from 1, since old_action_log_probs_batch is not recomputed during
                    # these param updating epochs.
                    # action_log_probs is the log prob of the action actually taken by the agent, so it's one value here, not
                    # the log_prob of all actions for a given input observation/state. By pwang8, Dec 31, 2017
                    adv_targ = Variable(adv_targ)
                    ratio = torch.exp(action_log_probs -
                                      Variable(old_action_log_probs_batch))
                    surr1 = ratio * adv_targ
                    surr2 = torch.clamp(ratio, 1.0 - args.clip_param,
                                        1.0 + args.clip_param) * adv_targ
                    action_loss = -torch.min(
                        surr1,
                        surr2).mean()  # PPO's pessimistic surrogate (L^CLIP)
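                    # e.g. with clip_param = 0.2 and adv_targ = +1.0, a ratio of 1.5
                    # gives surr1 = 1.5 and surr2 = 1.2, so the min keeps 1.2: the
                    # objective stops rewarding policy moves beyond the clip range
                    # within a single update.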
                    # compared to a2c, the major difference for ppo is that action_loss is calculated in controlled way
                    value_loss = (Variable(return_batch) -
                                  values).pow(2).mean()

                    optimizer.zero_grad()
                    (value_loss + action_loss -
                     dist_entropy * args.entropy_coef).backward()
                    nn.utils.clip_grad_norm(actor_critic.parameters(),
                                            args.max_grad_norm)
                    optimizer.step()

        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [
                save_model,
                hasattr(envs, 'ob_rms') and envs.ob_rms or None
            ]

            torch.save(save_model,
                       os.path.join(save_path, args.env_name + ".pt"))

        if j % args.log_interval == 0:
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            print(
                "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        final_rewards.mean(), final_rewards.median(),
                        final_rewards.min(), final_rewards.max(),
                        dist_entropy.data[0], value_loss.data[0],
                        action_loss.data[0]))
        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name,
                                  args.algo)
            except IOError:
                pass
Esempio n. 11
0
def main():

    num_updates = int(
        config.max_num_frames) // args.num_steps // config.a2c.num_processes
    n_times_is_converging = 0

    print("num_updates:     " + str(num_updates))

    print("stop_learning:   " + str(config.a2c.stop_learning))

    # Initializing evaluation
    evaluator = Evaluator(evaluation_id)

    os.environ['OMP_NUM_THREADS'] = '1'

    envs = [
        make_env(config.env_name, args.seed, i, evaluation_id)
        for i in range(config.a2c.num_processes)
    ]

    if config.a2c.num_processes > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)

    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])
    obs_numel = reduce(operator.mul, obs_shape, 1)

    actor_critic = Policy(obs_numel, envs.action_space)

    # Maxime: log some info about the model and its size
    modelSize = 0
    for p in actor_critic.parameters():
        pSize = reduce(operator.mul, p.size(), 1)
        modelSize += pSize
    print(str(actor_critic))
    print('Total model size: %d' % modelSize)

    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]

    if args.cuda:
        actor_critic.cuda()

    if config.a2c.algorithm == 'a2c':
        optimizer = optim.RMSprop(actor_critic.parameters(),
                                  args.lr,
                                  eps=args.eps,
                                  alpha=args.alpha)
    elif config.a2c.algorithm == 'ppo':
        optimizer = optim.Adam(actor_critic.parameters(),
                               args.lr,
                               eps=args.eps)
    elif config.a2c.algorithm == 'acktr':
        optimizer = KFACOptimizer(actor_critic)

    rollouts = RolloutStorage(args.num_steps, config.a2c.num_processes,
                              obs_shape, envs.action_space,
                              actor_critic.state_size)
    current_obs = torch.zeros(config.a2c.num_processes, *obs_shape)

    def update_current_obs(obs):
        shape_dim0 = envs.observation_space.shape[0]
        obs = torch.from_numpy(obs).float()
        if args.num_stack > 1:
            current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
        current_obs[:, -shape_dim0:] = obs

    obs = envs.reset()
    update_current_obs(obs)
    rollouts.observations[0].copy_(current_obs)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([config.a2c.num_processes, 1])
    final_rewards = torch.zeros([config.a2c.num_processes, 1])

    if args.cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()
    start = time.time()

    send_env_name = False
    for j in range(num_updates):

        if n_times_is_converging > 1:
            print("Converged...")
            break

        for step in range(args.num_steps):
            # Sample actions
            value, action, action_log_prob, states = actor_critic.act(
                Variable(rollouts.observations[step], volatile=True),
                Variable(rollouts.states[step], volatile=True),
                Variable(rollouts.masks[step], volatile=True))
            cpu_actions = action.data.squeeze(1).cpu().numpy()

            # Obser reward and next obs
            obs, reward, done, info = envs.step(cpu_actions)

            evaluator.update(done, info)

            reward = torch.from_numpy(np.expand_dims(np.stack(reward),
                                                     1)).float()
            episode_rewards += reward

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            if args.cuda:
                masks = masks.cuda()

            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            elif current_obs.dim() == 3:
                current_obs *= masks.unsqueeze(2)
            else:
                current_obs *= masks

            update_current_obs(obs)
            rollouts.insert(step, current_obs, states.data, action.data,
                            action_log_prob.data, value.data, reward, masks)

        next_value = actor_critic(
            Variable(rollouts.observations[-1], volatile=True),
            Variable(rollouts.states[-1], volatile=True),
            Variable(rollouts.masks[-1], volatile=True))[0].data

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        if config.a2c.algorithm in ['a2c', 'acktr']:
            values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions(
                Variable(rollouts.observations[:-1].view(-1, *obs_shape)),
                Variable(rollouts.states[:-1].view(-1,
                                                   actor_critic.state_size)),
                Variable(rollouts.masks[:-1].view(-1, 1)),
                Variable(rollouts.actions.view(-1, action_shape)))

            values = values.view(args.num_steps, config.a2c.num_processes, 1)
            action_log_probs = action_log_probs.view(args.num_steps,
                                                     config.a2c.num_processes,
                                                     1)

            advantages = Variable(rollouts.returns[:-1]) - values
            value_loss = advantages.pow(2).mean()

            action_loss = -(Variable(advantages.data) *
                            action_log_probs).mean()

            if config.a2c.algorithm == 'acktr' and optimizer.steps % optimizer.Ts == 0:
                # Sampled fisher, see Martens 2014
                actor_critic.zero_grad()
                pg_fisher_loss = -action_log_probs.mean()

                value_noise = Variable(torch.randn(values.size()))
                if args.cuda:
                    value_noise = value_noise.cuda()

                sample_values = values + value_noise
                vf_fisher_loss = -(values -
                                   Variable(sample_values.data)).pow(2).mean()

                fisher_loss = pg_fisher_loss + vf_fisher_loss
                optimizer.acc_stats = True
                fisher_loss.backward(retain_graph=True)
                optimizer.acc_stats = False

            optimizer.zero_grad()
            (value_loss * args.value_loss_coef + action_loss -
             dist_entropy * args.entropy_coef).backward()

            if config.a2c.algorithm == 'a2c':
                nn.utils.clip_grad_norm(actor_critic.parameters(),
                                        args.max_grad_norm)

            optimizer.step()
        elif config.a2c.algorithm == 'ppo':
            advantages = rollouts.returns[:-1] - rollouts.value_preds[:-1]
            advantages = (advantages - advantages.mean()) / (advantages.std() +
                                                             1e-5)

            for e in range(args.ppo_epoch):
                if args.recurrent_policy:
                    data_generator = rollouts.recurrent_generator(
                        advantages, args.num_mini_batch)
                else:
                    data_generator = rollouts.feed_forward_generator(
                        advantages, args.num_mini_batch)

                for sample in data_generator:
                    observations_batch, states_batch, actions_batch, \
                       return_batch, masks_batch, old_action_log_probs_batch, \
                            adv_targ = sample

                    # Reshape to do in a single forward pass for all steps
                    values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions(
                        Variable(observations_batch), Variable(states_batch),
                        Variable(masks_batch), Variable(actions_batch))

                    adv_targ = Variable(adv_targ)
                    ratio = torch.exp(action_log_probs -
                                      Variable(old_action_log_probs_batch))
                    surr1 = ratio * adv_targ
                    surr2 = torch.clamp(ratio, 1.0 - args.clip_param,
                                        1.0 + args.clip_param) * adv_targ
                    action_loss = -torch.min(
                        surr1,
                        surr2).mean()  # PPO's pessimistic surrogate (L^CLIP)

                    value_loss = (Variable(return_batch) -
                                  values).pow(2).mean()

                    optimizer.zero_grad()
                    (value_loss + action_loss -
                     dist_entropy * args.entropy_coef).backward()
                    nn.utils.clip_grad_norm(actor_critic.parameters(),
                                            args.max_grad_norm)
                    optimizer.step()

        rollouts.after_update()

        save_dir = "../a2c_trained_model/"
        if j % config.a2c.save_model_interval == 0:
            save_path = save_dir
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [
                save_model,
                hasattr(envs, 'ob_rms') and envs.ob_rms or None
            ]

            torch.save(save_model,
                       os.path.join(save_path, config.env_name + ".pt"))

        if j % config.a2c.save_evaluation_interval == 0:
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps

            # if the environment name and the envelope state was not send
            if not send_env_name:
                evaluator.save(j, total_num_steps, final_rewards, dist_entropy,
                               value_loss, action_loss, config.env_name,
                               config.envelope)
                send_env_name = True
            else:
                evaluator.save(j, total_num_steps, final_rewards, dist_entropy,
                               value_loss, action_loss)

            if evaluator.is_converging:
                n_times_is_converging += 1
            else:
                n_times_is_converging = 0

            print(
                "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.2f}/{:.2f}, min/max reward {:.2f}/{:.2f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        final_rewards.mean(), final_rewards.median(),
                        final_rewards.min(), final_rewards.max(),
                        dist_entropy.data[0], value_loss.data[0],
                        action_loss.data[0]))

        if config.visdom and j % config.visdom_interval == 0:
            win = visdom_plot(total_num_steps, final_rewards.mean())
Esempio n. 12
0
def main():
    os.environ['OMP_NUM_THREADS'] = '1'

    if args.vis:
        from visdom import Visdom
        viz = Visdom()
        win = None

    envs = [
        make_env(args.env_name, args.seed, i, args.log_dir,
                 args.start_container) for i in range(args.num_processes)
    ]

    test_envs = [
        make_env(args.env_name, args.seed, i, args.log_dir,
                 args.start_container) for i in range(args.num_processes)
    ]

    if args.num_processes > 1:
        envs = SubprocVecEnv(envs)
        test_envs = SubprocVecEnv(test_envs)
    else:
        envs = DummyVecEnv(envs)
        test_envs = DummyVecEnv(test_envs)

    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

    if args.saved_encoder_model:
        obs_shape = (args.num_stack, args.latent_space_size)

    obs_numel = reduce(operator.mul, obs_shape, 1)

    if len(obs_shape) == 3 and obs_numel > 1024:
        actor_critic = CNNPolicy(obs_shape[0], envs.action_space,
                                 args.recurrent_policy)
    else:
        assert not args.recurrent_policy, \
            "Recurrent policy is not implemented for the MLP controller"
        actor_critic = MLPPolicy(obs_numel, envs.action_space)

    modelSize = 0
    for p in actor_critic.parameters():
        pSize = reduce(operator.mul, p.size(), 1)
        modelSize += pSize
    print(str(actor_critic))
    print('Total model size: %d' % modelSize)

    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]

    if args.resume_experiment:
        print("\n############## Loading saved model ##############\n")
        actor_critic, ob_rms = torch.load(
            os.path.join(save_path, args.env_name + args.save_tag + ".pt"))
        tr.load(os.path.join(log_path, args.env_name + args.save_tag + ".p"))

    if args.cuda:
        actor_critic.cuda()

    if args.algo == 'a2c':
        optimizer = optim.RMSprop(actor_critic.parameters(),
                                  args.lr,
                                  eps=args.eps,
                                  alpha=args.alpha)
    elif args.algo == 'ppo':
        optimizer = optim.Adam(actor_critic.parameters(),
                               args.lr,
                               eps=args.eps)
    elif args.algo == 'acktr':
        optimizer = KFACOptimizer(actor_critic)

    print(obs_shape)

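    # Separate rollout buffers and observation tensors are kept for training
    # and for the periodic test episodes so the two never interfere.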
    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape,
                              envs.action_space, actor_critic.state_size)
    rollouts_test = RolloutStorage(args.num_steps_test, args.num_processes,
                                   obs_shape, envs.action_space,
                                   actor_critic.state_size)
    current_obs = torch.zeros(args.num_processes, *obs_shape)
    current_obs_test = torch.zeros(args.num_processes, *obs_shape)

    def update_current_obs(obs, test=False):
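        # Shift the frame stack left and write the newest observation into the
        # last slot; if a saved VAE encoder is used, raw observations are first
        # mapped to the latent space.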
        shape_dim0 = envs.observation_space.shape[0]
        if args.saved_encoder_model:
            shape_dim0 = 1
            obs, _ = vae.encode(Variable(torch.cuda.FloatTensor(obs)))
            obs = obs.data.cpu().numpy()
        obs = torch.from_numpy(obs).float()
        if not test:
            if args.num_stack > 1:
                current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
            current_obs[:, -shape_dim0:] = obs
        else:
            if args.num_stack > 1:
                current_obs_test[:, :-shape_dim0] = current_obs_test[:, shape_dim0:]
            current_obs_test[:, -shape_dim0:] = obs

    obs = envs.reset()
    update_current_obs(obs)
    rollouts.observations[0].copy_(current_obs)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])
    reward_avg = 0

    if args.cuda:
        current_obs = current_obs.cuda()
        current_obs_test = current_obs_test.cuda()
        rollouts.cuda()
        rollouts_test.cuda()

    start = time.time()

    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            value, action, action_log_prob, states = actor_critic.act(
                Variable(rollouts.observations[step], volatile=True),
                Variable(rollouts.states[step], volatile=True),
                Variable(rollouts.masks[step], volatile=True))
            cpu_actions = action.data.squeeze(1).cpu().numpy()

            # Observation, reward and next obs
            obs, reward, done, info = envs.step(cpu_actions)

            # Maxime: clip the reward to be non-negative and scale it down (/400) for more reliable training
            # This code deals poorly with large reward values
            reward = np.clip(reward, a_min=0, a_max=None) / 400

            reward = torch.from_numpy(np.expand_dims(np.stack(reward),
                                                     1)).float()
            episode_rewards += reward

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
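            # Episode bookkeeping: when an episode ends (mask == 0), move its
            # accumulated reward into final_rewards and reset the accumulator.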
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            tr.episodes_done += args.num_processes - masks.sum()

            if args.cuda:
                masks = masks.cuda()

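            # Zero out the stacked observations of environments that just
            # finished an episode (image observations need the mask broadcast
            # over the spatial dimensions).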
            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks

            update_current_obs(obs)
            rollouts.insert(step, current_obs, states.data, action.data,
                            action_log_prob.data, value.data, reward, masks)

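        # Bootstrap: the value estimate of the last observation closes the
        # return computation below.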
        next_value = actor_critic(
            Variable(rollouts.observations[-1], volatile=True),
            Variable(rollouts.states[-1], volatile=True),
            Variable(rollouts.masks[-1], volatile=True))[0].data

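        # Compute discounted returns (or GAE estimates when args.use_gae is
        # set) for every step of the rollout, using gamma and tau.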
        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)
        tr.iterations_done += 1

        if args.algo in ['a2c', 'acktr']:
            values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions(
                Variable(rollouts.observations[:-1].view(-1, *obs_shape)),
                Variable(rollouts.states[0].view(-1, actor_critic.state_size)),
                Variable(rollouts.masks[:-1].view(-1, 1)),
                Variable(rollouts.actions.view(-1, action_shape)))

            values = values.view(args.num_steps, args.num_processes, 1)
            action_log_probs = action_log_probs.view(args.num_steps,
                                                     args.num_processes, 1)

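            # A2C losses: advantage = return - value estimate; the value loss
            # is the mean squared advantage, and the policy loss uses the
            # detached advantage times the action log-probability.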
            advantages = Variable(rollouts.returns[:-1]) - values
            value_loss = advantages.pow(2).mean()

            action_loss = -(Variable(advantages.data) *
                            action_log_probs).mean()

            if args.algo == 'acktr' and optimizer.steps % optimizer.Ts == 0:
                # Sampled fisher, see Martens 2014
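                # The Fisher estimate combines the policy-gradient term with a
                # value term built from noise-perturbed value targets.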
                actor_critic.zero_grad()
                pg_fisher_loss = -action_log_probs.mean()

                value_noise = Variable(torch.randn(values.size()))
                if args.cuda:
                    value_noise = value_noise.cuda()

                sample_values = values + value_noise
                vf_fisher_loss = -(values -
                                   Variable(sample_values.data)).pow(2).mean()

                fisher_loss = pg_fisher_loss + vf_fisher_loss
                optimizer.acc_stats = True
                fisher_loss.backward(retain_graph=True)
                optimizer.acc_stats = False

            optimizer.zero_grad()
            (value_loss * args.value_loss_coef + action_loss -
             dist_entropy * args.entropy_coef).backward()

            if args.algo == 'a2c':
                nn.utils.clip_grad_norm(actor_critic.parameters(),
                                        args.max_grad_norm)

            optimizer.step()

        elif args.algo == 'ppo':
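            # Normalize advantages over the whole rollout to stabilize the
            # PPO updates.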
            advantages = rollouts.returns[:-1] - rollouts.value_preds[:-1]
            advantages = (advantages - advantages.mean()) / (advantages.std() +
                                                             1e-5)

            for e in range(args.ppo_epoch):
                if args.recurrent_policy:
                    data_generator = rollouts.recurrent_generator(
                        advantages, args.num_mini_batch)
                else:
                    data_generator = rollouts.feed_forward_generator(
                        advantages, args.num_mini_batch)

                for sample in data_generator:
                    observations_batch, states_batch, actions_batch, \
                       return_batch, masks_batch, old_action_log_probs_batch, \
                            adv_targ = sample

                    # Reshape to do in a single forward pass for all steps
                    values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions(
                        Variable(observations_batch), Variable(states_batch),
                        Variable(masks_batch), Variable(actions_batch))

                    adv_targ = Variable(adv_targ)
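                    # Importance ratio pi_new / pi_old, computed in log space;
                    # it is clipped to [1 - clip_param, 1 + clip_param] below.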
                    ratio = torch.exp(action_log_probs -
                                      Variable(old_action_log_probs_batch))
                    surr1 = ratio * adv_targ
                    surr2 = torch.clamp(ratio, 1.0 - args.clip_param,
                                        1.0 + args.clip_param) * adv_targ
                    action_loss = -torch.min(
                        surr1,
                        surr2).mean()  # PPO's pessimistic surrogate (L^CLIP)

                    value_loss = (Variable(return_batch) -
                                  values).pow(2).mean()

                    optimizer.zero_grad()
                    (value_loss + action_loss -
                     dist_entropy * args.entropy_coef).backward()
                    nn.utils.clip_grad_norm(actor_critic.parameters(),
                                            args.max_grad_norm)
                    optimizer.step()

        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "":

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [
                save_model,
                hasattr(envs, 'ob_rms') and envs.ob_rms or None
            ]

            torch.save(
                save_model,
                os.path.join(save_path, args.env_name + args.save_tag + ".pt"))

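            # Periodic evaluation: run num_tests episodes on the separate test
            # environments. Note that only the first test environment's reward
            # is accumulated (reward_test[0]); with more than one test process
            # the `if done_test:` check below would be ambiguous.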
            total_test_reward_list = []
            step_test_list = []

            for _ in range(args.num_tests):
                test_obs = test_envs.reset()
                update_current_obs(test_obs, test=True)
                rollouts_test.observations[0].copy_(current_obs_test)
                step_test = 0
                total_test_reward = 0

                while step_test < args.num_steps_test:
                    value_test, action_test, action_log_prob_test, states_test = actor_critic.act(
                        Variable(rollouts_test.observations[step_test],
                                 volatile=True),
                        Variable(rollouts_test.states[step_test],
                                 volatile=True),
                        Variable(rollouts_test.masks[step_test],
                                 volatile=True))
                    cpu_actions_test = action_test.data.squeeze(
                        1).cpu().numpy()

                    # Observation, reward and next obs
                    obs_test, reward_test, done_test, info_test = test_envs.step(
                        cpu_actions_test)

                    # The masks here don't really matter for evaluation, but we keep them for consistency
                    masks_test = torch.FloatTensor(
                        [[0.0] if done_test_ else [1.0]
                         for done_test_ in done_test])

                    # Maxime: clip the reward to be non-negative and scale it down (/400) for more reliable training
                    # This code deals poorly with large reward values
                    reward_test = np.clip(reward_test, a_min=0,
                                          a_max=None) / 400

                    total_test_reward += reward_test[0]
                    reward_test = torch.from_numpy(
                        np.expand_dims(np.stack(reward_test), 1)).float()

                    update_current_obs(obs_test)
                    rollouts_test.insert(step_test, current_obs_test,
                                         states_test.data, action_test.data,
                                         action_log_prob_test.data,
                                         value_test.data, reward_test,
                                         masks_test)

                    step_test += 1

                    if done_test:
                        break

                #rollouts_test.reset() # Need to reinitialise with .cuda(); don't forget
                total_test_reward_list.append(total_test_reward)
                step_test_list.append(step_test)

            append_to(tr.test_reward, tr,
                      sum(total_test_reward_list) / args.num_tests)
            append_to(tr.test_episode_len, tr,
                      sum(step_test_list) / args.num_tests)

            logger.log_scalar_rl(
                "test_reward", tr.test_reward[0], args.sliding_wsize,
                [tr.episodes_done, tr.global_steps_done, tr.iterations_done])
            logger.log_scalar_rl(
                "test_episode_len", tr.test_episode_len[0], args.sliding_wsize,
                [tr.episodes_done, tr.global_steps_done, tr.iterations_done])

            # Saving all the MyContainer variables
            tr.save(
                os.path.join(log_path, args.env_name + args.save_tag + ".p"))

        if j % args.log_interval == 0:
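            # Exponential moving average of the mean episode reward
            # (decay 0.99).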
            reward_avg = 0.99 * reward_avg + 0.01 * final_rewards.mean()
            end = time.time()
            tr.global_steps_done = (j + 1) * args.num_processes * args.num_steps

            print(
                "Updates {}, num timesteps {}, FPS {}, running avg reward {:.3f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}"
                .format(j, tr.global_steps_done,
                        int(tr.global_steps_done / (end - start)), reward_avg,
                        dist_entropy.data[0], value_loss.data[0],
                        action_loss.data[0]))

            append_to(tr.pg_loss, tr, action_loss.data[0])
            append_to(tr.val_loss, tr, value_loss.data[0])
            append_to(tr.entropy_loss, tr, dist_entropy.data[0])
            append_to(tr.train_reward_avg, tr, reward_avg)

            logger.log_scalar_rl(
                "train_pg_loss", tr.pg_loss[0], args.sliding_wsize,
                [tr.episodes_done, tr.global_steps_done, tr.iterations_done])
            logger.log_scalar_rl(
                "train_val_loss", tr.val_loss[0], args.sliding_wsize,
                [tr.episodes_done, tr.global_steps_done, tr.iterations_done])
            logger.log_scalar_rl(
                "train_entropy_loss", tr.entropy_loss[0], args.sliding_wsize,
                [tr.episodes_done, tr.global_steps_done, tr.iterations_done])
            logger.log_scalar_rl(
                "train_reward_avg", tr.train_reward_avg[0], args.sliding_wsize,
                [tr.episodes_done, tr.global_steps_done, tr.iterations_done])
            """
            print("Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}".
                format(
                    j,
                    total_num_steps,
                    int(total_num_steps / (end - start)),
                    final_rewards.mean(),
                    final_rewards.median(),
                    final_rewards.min(),
                    final_rewards.max(), dist_entropy.data[0],
                    value_loss.data[0], action_loss.data[0])
                )
            """

        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name,
                                  args.algo)
            except IOError:
                pass