Example 1
class A2C:
    def __init__(self,
                 env_name="BipedalWalker-v2",
                 num_steps=5,
                 num_workers=10,
                 num_updates=10000,
                 log_frequency=10,
                 use_gae=True,
                 gamma=0.99,
                 tau=0.95,
                 entropy_coef=0.01):

        observation_space, action_space = get_env_info(env_name)
        self.num_steps = num_steps
        self.num_updates = num_updates
        self.log_frequency = log_frequency
        self.use_gae = use_gae
        self.gamma = gamma
        self.tau = tau
        self.entropy_coef = entropy_coef
        self.max_grad_norm = 0.5

        self.simulator = RolloutCollector(env_name, num_workers)
        self.eval_env = gym.make(env_name)
        self.obs_dim = observation_space.shape[0]
        self.action_dim = action_space.shape[0]
        self.storage = RolloutStorage(num_steps, num_workers,
                                      observation_space.shape, action_space)
        self.policy = Actor(self.obs_dim, self.action_dim)
        self.V = Critic(self.obs_dim)

        self.actor_optimizer = optim.Adam(self.policy.parameters(), lr=5e-4)
        self.critic_optimizer = optim.Adam(self.V.parameters(), lr=5e-4)

        # track statistics
        self.episode_count = 0

    def get_actions(self, obs_n):
        with torch.no_grad():
            obs_batch = torch.FloatTensor(np.stack(obs_n))
            dist = self.policy(obs_batch)
            action_sample = dist.sample()
            values = self.V(obs_batch)
            action_n = [
                action_sample[i].numpy() for i in range(len(action_sample))
            ]
        return action_n, action_sample, values

    def update_storage(self, obs, actions, rewards, values, dones):
        self.episode_count += torch.sum(dones).item()
        masks = 1 - dones
        self.storage.insert(obs, actions, values, rewards, masks)

    def set_initial_observations(self, observations):
        self.storage.obs[0].copy_(observations)

    def compute_advantages(self):
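        # advantage = bootstrapped return minus the critic's value prediction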
        advantages = self.storage.returns[:-1] - self.storage.values[:-1]
        # standardize the advantages
        advantages = (advantages - advantages.mean()) / (advantages.std() +
                                                         1e-5)
        return advantages

    def update(self):
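        # bootstrap the returns with the critic's estimate for the last stored observation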
        with torch.no_grad():
            next_value = self.V(self.storage.obs[-1])

        self.storage.compute_returns(next_value, self.use_gae, self.gamma,
                                     self.tau)
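        # scale the returns (and hence the value targets) by 0.1, a reward-scaling heuristic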
        self.storage.returns.mul_(0.1)
        advantages = self.compute_advantages()
        obs_batch, actions_batch, values_batch, return_batch, adv_targ = self.storage.build_batch(
            advantages)

        # Update the policy
        self.actor_optimizer.zero_grad()
        action_dist = self.policy(obs_batch)
        action_log_probs = action_dist.log_prob(actions_batch)
        objective = torch.mean(adv_targ * action_log_probs)
        policy_loss = -objective

        # compute the value loss
        self.critic_optimizer.zero_grad()
        value_loss = F.mse_loss(self.V(obs_batch), return_batch)

        # compute other losses
        entropy_loss = -torch.mean(action_dist.entropy())

        # sum the losses, backprop, and step
        net_loss = policy_loss + value_loss + self.entropy_coef * entropy_loss
        net_loss.backward()

        nn.utils.clip_grad_norm_(self.policy.parameters(), self.max_grad_norm)
        nn.utils.clip_grad_norm_(self.V.parameters(), self.max_grad_norm)

        self.critic_optimizer.step()
        self.actor_optimizer.step()
        return (value_loss.detach().item(), -policy_loss.detach().item(),
                -entropy_loss.detach().item())

    def evaluate(self, n=20, render=False):
        env = self.eval_env
        action_bounds = [env.action_space.low, env.action_space.high]
        all_rewards = []
        for i in range(n):
            episode_rewards = []
            state = env.reset()
            terminal = False
            while not terminal:
                dist = self.policy(torch.FloatTensor(state).view(1, -1))
                action = dist.sample().numpy().reshape(-1)
                action = np.clip(action, action_bounds[0], action_bounds[1])
                next_state, reward, terminal, info = env.step(action)
                episode_rewards.append(reward)
                state = next_state
                if render:
                    fps = 8.0
                    env.render()
                    time.sleep(1 / fps)
            all_rewards.append(np.sum(episode_rewards))
        all_rewards = np.array(all_rewards)
        env.reset()
        return all_rewards

    def __iter__(self):
        obs_n = self.simulator.reset()
        for u in range(self.num_updates):
            self.set_initial_observations(torch.FloatTensor(np.stack(obs_n)))
            for t in range(self.num_steps):
                # Compute actions using policy given latest observation
                action_n, actions, values = self.get_actions(obs_n)

                # Give action to each worker and take an environment step
                obs_n, reward_n, done_n = self.simulator.step(action_n)

                observations = torch.FloatTensor(np.stack(obs_n))
                rewards = torch.FloatTensor(np.vstack(reward_n))
                dones = torch.FloatTensor(np.vstack(done_n))

                # Update the storage
                self.update_storage(observations, actions, rewards, values,
                                    dones)

            value_loss, objective, mean_policy_entropy = self.update()
            self.storage.after_update()

            if (u + 1) % self.log_frequency == 0:
                eval_episode_returns = self.evaluate()
                yield self.episode_count, eval_episode_returns, value_loss, objective, mean_policy_entropy
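
A minimal sketch of how this iterator could be driven, assuming the module-level imports (gym, numpy, torch) and the helper classes referenced in __init__ (get_env_info, RolloutCollector, RolloutStorage, Actor, Critic) are importable from the surrounding project; the unpacked tuple mirrors exactly what __iter__ yields every log_frequency updates.

if __name__ == "__main__":
    agent = A2C(env_name="BipedalWalker-v2", num_workers=10)
    for episodes, eval_returns, value_loss, objective, entropy in agent:
        # eval_returns is the numpy array produced by evaluate()
        print("episodes {}, eval return {:.1f}, value loss {:.4f}, "
              "objective {:.4f}, entropy {:.4f}".format(
                  episodes, eval_returns.mean(), value_loss, objective,
                  entropy))
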
Example 2
def main():
    print("#######")
    print(
        "WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards"
    )
    print("#######")

    os.environ['OMP_NUM_THREADS'] = '1'

    envs = [
        make_env(args.env_name,
                 seed=args.seed,
                 digit=args.digit,
                 rank=i,
                 log_dir=args.log_dir,
                 use_patience=args.use_patience)
        for i in range(args.num_processes)
    ]

    if args.num_processes > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)

    if len(envs.observation_space.shape) == 1:
        envs = VecNormalize(envs)

    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])
    print(obs_shape)

    actor_critic = CNNPolicy(obs_shape[0], envs.action_space,
                             args.recurrent_policy)

    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]

    if args.cuda:
        actor_critic.cuda()

    if args.algo == 'a2c':
        optimizer = optim.RMSprop(actor_critic.parameters(),
                                  args.lr,
                                  eps=args.eps,
                                  alpha=args.alpha)
    elif args.algo == 'ppo':
        optimizer = optim.Adam(actor_critic.parameters(),
                               args.lr,
                               eps=args.eps)
    elif args.algo == 'acktr':
        optimizer = KFACOptimizer(actor_critic)

    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape,
                              envs.action_space, actor_critic.state_size)
    current_obs = torch.zeros(args.num_processes, *obs_shape)

    def update_current_obs(obs):
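        # shift the existing frames toward the front and write the newest
        # observation into the last shape_dim0 channels of the stack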
        shape_dim0 = envs.observation_space.shape[0]
        obs = torch.from_numpy(obs).float()
        if args.num_stack > 1:
            current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
        current_obs[:, -shape_dim0:] = obs

    obs = envs.reset()
    update_current_obs(obs)

    rollouts.observations[0].copy_(current_obs)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])
    episode_lengths = torch.zeros([args.num_processes, 1])

    if args.cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()

    start = time.time()
    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            value, action, action_log_prob, states = actor_critic.act(
                Variable(rollouts.observations[step], volatile=True),
                Variable(rollouts.states[step], volatile=True),
                Variable(rollouts.masks[step], volatile=True))
            cpu_actions = action.data.squeeze(1).cpu().numpy()

            # Observe reward and next obs
            obs, reward, done, info = envs.step(cpu_actions)
            reward = torch.from_numpy(np.expand_dims(np.stack(reward),
                                                     1)).float()
            episode_rewards += reward

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks
            episode_lengths += torch.ones(episode_lengths.size())
            episode_lengths *= masks

            if args.cuda:
                masks = masks.cuda()

            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks

            update_current_obs(obs)
            rollouts.insert(current_obs, states.data, action.data,
                            action_log_prob.data, value.data, reward, masks)

        next_value = actor_critic.get_value(
            Variable(rollouts.observations[-1], volatile=True),
            Variable(rollouts.states[-1], volatile=True),
            Variable(rollouts.masks[-1], volatile=True)).data

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        if args.algo in ['a2c', 'acktr']:
            values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions(
                Variable(rollouts.observations[:-1].view(-1, *obs_shape)),
                Variable(rollouts.states[0].view(-1, actor_critic.state_size)),
                Variable(rollouts.masks[:-1].view(-1, 1)),
                Variable(rollouts.actions.view(-1, action_shape)))

            values = values.view(args.num_steps, args.num_processes, 1)
            action_log_probs = action_log_probs.view(args.num_steps,
                                                     args.num_processes, 1)

            advantages = Variable(rollouts.returns[:-1]) - values
            value_loss = advantages.pow(2).mean()

            action_loss = -(Variable(advantages.data) *
                            action_log_probs).mean()

            if args.algo == 'acktr' and optimizer.steps % optimizer.Ts == 0:
                # Sampled fisher, see Martens 2014
                actor_critic.zero_grad()
                pg_fisher_loss = -action_log_probs.mean()

                value_noise = Variable(torch.randn(values.size()))
                if args.cuda:
                    value_noise = value_noise.cuda()

                sample_values = values + value_noise
                vf_fisher_loss = -(values -
                                   Variable(sample_values.data)).pow(2).mean()

                fisher_loss = pg_fisher_loss + vf_fisher_loss
                optimizer.acc_stats = True
                fisher_loss.backward(retain_graph=True)
                optimizer.acc_stats = False

            optimizer.zero_grad()
            (value_loss * args.value_loss_coef + action_loss -
             dist_entropy * args.entropy_coef).backward()

            if args.algo == 'a2c':
                nn.utils.clip_grad_norm(actor_critic.parameters(),
                                        args.max_grad_norm)

            optimizer.step()
        elif args.algo == 'ppo':
            advantages = rollouts.returns[:-1] - rollouts.value_preds[:-1]
            advantages = (advantages - advantages.mean()) / (advantages.std() +
                                                             1e-5)

            for e in range(args.ppo_epoch):
                if args.recurrent_policy:
                    data_generator = rollouts.recurrent_generator(
                        advantages, args.num_mini_batch)
                else:
                    data_generator = rollouts.feed_forward_generator(
                        advantages, args.num_mini_batch)

                for sample in data_generator:
                    observations_batch, states_batch, actions_batch, \
                       return_batch, masks_batch, old_action_log_probs_batch, \
                            adv_targ = sample

                    # Reshape to do in a single forward pass for all steps
                    values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions(
                        Variable(observations_batch), Variable(states_batch),
                        Variable(masks_batch), Variable(actions_batch))

                    adv_targ = Variable(adv_targ)
                    ratio = torch.exp(action_log_probs -
                                      Variable(old_action_log_probs_batch))
                    surr1 = ratio * adv_targ
                    surr2 = torch.clamp(ratio, 1.0 - args.clip_param,
                                        1.0 + args.clip_param) * adv_targ
                    action_loss = -torch.min(
                        surr1,
                        surr2).mean()  # PPO's pessimistic surrogate (L^CLIP)

                    value_loss = (Variable(return_batch) -
                                  values).pow(2).mean()

                    optimizer.zero_grad()
                    (value_loss + action_loss -
                     dist_entropy * args.entropy_coef).backward()
                    nn.utils.clip_grad_norm(actor_critic.parameters(),
                                            args.max_grad_norm)
                    optimizer.step()

        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [
                save_model,
                hasattr(envs, 'ob_rms') and envs.ob_rms or None
            ]

            torch.save(save_model,
                       os.path.join(save_path, args.env_name + ".pt"))

        if j % args.log_interval == 0:
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            print(
                "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}, Episode lengths {:.2f}"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        final_rewards.mean(), final_rewards.median(),
                        final_rewards.min(), final_rewards.max(),
                        dist_entropy.data[0], value_loss.data[0],
                        action_loss.data[0], episode_lengths.mean()))
        if j > 0 and j % args.vis_interval == 0:
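            # visdom plotting hook, left empty in this snippet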
            pass
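
The args namespace used throughout main() (and the bare num_updates) are defined elsewhere in the original script. The sketch below only lists the flags that actually appear in the snippet above; every default value is an illustrative assumption, not necessarily the script's real default.

import argparse

parser = argparse.ArgumentParser()
# environment / rollout collection
parser.add_argument('--env-name', default='PongNoFrameskip-v4')
parser.add_argument('--seed', type=int, default=1)
parser.add_argument('--digit', type=int, default=0)
parser.add_argument('--log-dir', default='/tmp/gym')
parser.add_argument('--use-patience', action='store_true')
parser.add_argument('--num-processes', type=int, default=16)
parser.add_argument('--num-steps', type=int, default=5)
parser.add_argument('--num-stack', type=int, default=4)
parser.add_argument('--recurrent-policy', action='store_true')
parser.add_argument('--cuda', action='store_true')
# optimization
parser.add_argument('--algo', default='a2c', choices=['a2c', 'ppo', 'acktr'])
parser.add_argument('--lr', type=float, default=7e-4)
parser.add_argument('--eps', type=float, default=1e-5)
parser.add_argument('--alpha', type=float, default=0.99)
parser.add_argument('--use-gae', action='store_true')
parser.add_argument('--gamma', type=float, default=0.99)
parser.add_argument('--tau', type=float, default=0.95)
parser.add_argument('--value-loss-coef', type=float, default=0.5)
parser.add_argument('--entropy-coef', type=float, default=0.01)
parser.add_argument('--max-grad-norm', type=float, default=0.5)
# PPO-specific
parser.add_argument('--ppo-epoch', type=int, default=4)
parser.add_argument('--num-mini-batch', type=int, default=32)
parser.add_argument('--clip-param', type=float, default=0.2)
# logging / checkpointing
parser.add_argument('--save-interval', type=int, default=100)
parser.add_argument('--save-dir', default='./trained_models/')
parser.add_argument('--log-interval', type=int, default=10)
parser.add_argument('--vis-interval', type=int, default=100)
args = parser.parse_args()

# referenced as a module-level name inside main()
num_updates = 10000
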
Example 3
    def run(self):
        # (16, 4, 84, 84)
        current_obs = np.zeros([NUM_PROCESSES, *self.obs_shape])
        episode_rewards = np.zeros([NUM_PROCESSES, 1])
        final_rewards = np.zeros([NUM_PROCESSES, 1])

        # torch.Size([16, 1, 84, 84])
        obs = self.env.reset()
        # store the newest obs at the front of the frame stack
        current_obs[:, :1] = obs

        storage = RolloutStorage(NUM_ADVANCED_STEP, NUM_PROCESSES,
                                 self.obs_shape, current_obs)

        for j in tqdm(range(NUM_UPDATES)):
            for step in range(NUM_ADVANCED_STEP):
                #with torch.no_grad():
                _, cpu_actions = self.actor_critic.predict(
                    storage.observations[step] / 255)
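                # sample one action per process from the predicted action probabilities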
                action = np.argmax(np.array(
                    [np.random.multinomial(1, x) for x in cpu_actions]),
                                   axis=1)

                # obs size:(16, 1, 84, 84)
                obs, reward, done, info = self.env.step(action)

                reward = reward.reshape(-1, 1)
                episode_rewards += reward

                final_rewards[done] = episode_rewards[done]
                episode_rewards[done] = 0

                # zero out the current observation wherever the episode finished
                current_obs[done] = 0

                # stack the frames
                current_obs[:, 1:] = current_obs[:, :-1]  # copy frames 1-3 into slots 2-4
                current_obs[:, :1] = obs  # store the newest obs in slot 1

                # insert this step's transition into the storage object
                storage.insert(current_obs, action, reward, done)

            # compute the state value predicted from the state after the last advanced step
            #with torch.no_grad():
            input_obs = storage.observations[-1] / 255
            next_value, _ = self.actor_critic.predict(input_obs)

            # compute the discounted returns for every step
            storage.compute_discounted_rewards(next_value)

            # update the network and the storage
            self.global_brain.update(storage)
            storage.after_update()

            # logging: print intermediate progress
            if j % 100 == 0:
                print(
                    "finished frames {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}"
                    .format(j * NUM_PROCESSES * NUM_ADVANCED_STEP,
                            final_rewards.mean(), np.median(final_rewards),
                            final_rewards.min(), final_rewards.max()))

            # save the network weights
            if j % 12500 == 0:
                self.actor_critic.save('weight_' + str(j) + '.pth')

        # end of the training loop
        self.actor_critic.save('weight_end.pth')
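
The storage classes used above are not shown, so as a reference, here is a minimal sketch of the bootstrapped discounted return that a method like compute_discounted_rewards(next_value) (or compute_returns without GAE) conventionally computes. The array shapes, the GAMMA constant, and the explicit masks argument (1 - done) are assumptions for illustration, not the actual internals of RolloutStorage.

import numpy as np

GAMMA = 0.99  # assumed discount factor


def discounted_returns(rewards, masks, next_value):
    """rewards, masks: (num_steps, num_processes, 1); next_value: (num_processes, 1).

    R_t = r_t + GAMMA * R_{t+1} * mask_t, bootstrapped from the critic's
    value estimate of the state reached after the last advanced step.
    """
    num_steps = rewards.shape[0]
    returns = np.zeros((num_steps + 1,) + next_value.shape)
    returns[-1] = next_value  # bootstrap value for the final state
    for t in reversed(range(num_steps)):
        returns[t] = rewards[t] + GAMMA * returns[t + 1] * masks[t]
    return returns[:-1]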