Example #1
    def __init__(self,
                 env_name="BipedalWalker-v2",
                 num_steps=5,
                 num_workers=10,
                 num_updates=10000,
                 log_frequency=10,
                 use_gae=True,
                 gamma=0.99,
                 tau=0.95,
                 entropy_coef=0.01):

        observation_space, action_space = get_env_info(env_name)
        self.num_steps = num_steps
        self.num_updates = num_updates
        self.log_frequency = log_frequency
        self.use_gae = use_gae
        self.gamma = gamma
        self.tau = tau
        self.entropy_coef = entropy_coef
        self.max_grad_norm = 0.5

        self.simulator = RolloutCollector(env_name, num_workers)
        self.eval_env = gym.make(env_name)
        self.obs_dim = observation_space.shape[0]
        self.action_dim = action_space.shape[0]
        self.storage = RolloutStorage(num_steps, num_workers,
                                      observation_space.shape, action_space)
        self.policy = Actor(self.obs_dim, self.action_dim)
        self.V = Critic(self.obs_dim)

        self.actor_optimizer = optim.Adam(self.policy.parameters(), lr=5e-4)
        self.critic_optimizer = optim.Adam(self.V.parameters(), lr=5e-4)

        # track statistics
        self.episode_count = 0
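
The `use_gae`, `gamma`, and `tau` arguments feed the return computation the storage performs before each update (see `compute_returns` in Example #9 below). A minimal standalone sketch of that GAE(gamma, tau) recurrence, using an illustrative helper name and plain Python lists instead of the real tensors:

def compute_gae(rewards, values, masks, next_value, gamma=0.99, tau=0.95):
    # masks[t] = 0.0 if the episode ended at step t, else 1.0
    # delta_t = r_t + gamma * V(s_{t+1}) * mask_t - V(s_t)
    # A_t     = delta_t + gamma * tau * mask_t * A_{t+1}
    values = list(values) + [next_value]
    gae, returns = 0.0, []
    for t in reversed(range(len(rewards))):
        delta = rewards[t] + gamma * values[t + 1] * masks[t] - values[t]
        gae = delta + gamma * tau * masks[t] * gae
        returns.insert(0, gae + values[t])  # return target = advantage + value baseline
    return returns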
Example #2
    def __init__(self, env, args):

        # Hyperparameters
        self.lr = 7e-4
        self.gamma = 0.9
        self.hidden_size = 512
        self.update_freq = 5
        self.n_processes = args.remotes
        self.seed = 42
        self.max_steps = 1e9
        self.grad_norm = 0.5
        self.entropy_weight = 0.05
        self.eps = np.finfo(np.float32).eps.item()

        #######################    NOTE: You need to implement
        self.recurrent = True  # <- ActorCritic._forward_rnn()
        #######################    Please check a2c/actor_critic.py

        self.display_freq = 1000
        self.save_freq = 1
        self.save_dir = './ckpts/'

        torch.manual_seed(self.seed)
        torch.cuda.manual_seed_all(self.seed)

        self.envs = env
        if self.envs is None:
            self.envs = MultiEnv()
            self.envs.configure(remotes=self.n_processes)

        self.device = torch.device("cuda:0" if use_cuda else "cpu")

        observation = self.envs.reset()
        self.obs_shape = np.transpose(observation[0], (2, 0, 1)).shape
        self.act_shape = args.action_space

        self.rollouts = RolloutStorage(self.update_freq, self.n_processes,
                                       self.obs_shape, self.act_shape,
                                       self.hidden_size)
        self.model = ActorCritic(self.obs_shape, self.act_shape,
                                 self.hidden_size,
                                 self.recurrent).to(self.device)
        self.optimizer = RMSprop(self.model.parameters(), lr=self.lr, eps=1e-5)

        if args.test_a2c:
            self.load_model('./ckpts/model_1239.pt')

        self.hidden = None
        self.init_game_setting()
Example #3
    def __init__(self, env, args):

        # Hyperparameters
        self.lr = 7e-4
        self.gamma = 0.9
        self.hidden_size = 512
        self.update_freq = 5
        self.n_processes = 64
        self.seed = 7122
        self.max_steps = 1e7
        self.grad_norm = 0.5
        self.entropy_weight = 0.05

        #######################    NOTE: You need to implement
        self.recurrent = False # <- ActorCritic._forward_rnn()
        #######################    Please check a2c/actor_critic.py
        
        self.display_freq = 4000
        self.save_freq = 100000
        self.save_dir = './checkpoints/'

        torch.manual_seed(self.seed)
        torch.cuda.manual_seed_all(self.seed)
         
        self.envs = env
        if self.envs is None:
            self.envs = make_vec_envs('SuperMarioBros-v0', self.seed,
                    self.n_processes)
        self.device = torch.device("cuda:1" if use_cuda else "cpu")

        self.obs_shape = self.envs.observation_space.shape
        self.act_shape = self.envs.action_space.n

        self.rollouts = RolloutStorage(self.update_freq, self.n_processes,
                self.obs_shape, self.act_shape, self.hidden_size) 
        self.model = ActorCritic(self.obs_shape, self.act_shape,
                self.hidden_size, self.recurrent).to(self.device)
        self.optimizer = RMSprop(self.model.parameters(), lr=self.lr, 
                eps=1e-5)

        if args.test_mario:
            self.load_model(os.path.join('mario.pt'))
            print('finish model loading ...')

        self.hidden = None
        self.init_game_setting()
Example #4
    def _update(self):
        # Compute returns
        # R_t = reward_t + gamma * R_{t+1}
        state_values_true = self.calc_actual_state_values(
            self.rollouts.rewards, self.rollouts.dones)

        # Compute actor critic loss (value_loss, action_loss)
        # OPTIONAL: You can also maximize entropy to encourage exploration
        # loss = value_loss + action_loss (- entropy_weight * entropy)

        s = Variable(torch.FloatTensor(self.rollouts.obs))
        action_probs, state_values_est, hiddens = self.model(s)
        action_log_probs = action_probs.log()
        a = Variable(torch.LongTensor(self.rollouts.actions).view(-1, 1))
        chosen_action_log_probs = action_log_probs.gather(1, a)
        # The advantage is also the TD error
        advantages = state_values_true - state_values_est
        entropy = -(action_probs * action_log_probs).sum(1).mean()
        # Policy gradient: maximize log pi(a|s) * advantage, i.e. minimize its negative;
        # the advantage is treated as a constant in the actor term.
        action_loss = -(chosen_action_log_probs * advantages.detach()).mean()
        value_loss = advantages.pow(2).mean()

        loss = value_loss + action_loss - 0.0001 * entropy  # entropy_weight = 0.0001
        # Update
        self.optimizer.zero_grad()
        loss.backward()
        clip_grad_norm_(self.model.parameters(), self.grad_norm)
        self.optimizer.step()

        # Clear rollouts after update
        self.rollouts.reset()

        return loss.item()
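
The recurrence in the comment at the top of this example, R_t = reward_t + gamma * R_{t+1}, is just a backwards pass over the rewards; a tiny self-contained sketch with illustrative numbers:

def discounted_returns(rewards, gamma, bootstrap=0.0):
    # Walk backwards: R_t = r_t + gamma * R_{t+1}, seeded with a bootstrap value
    R, out = bootstrap, []
    for r in reversed(rewards):
        R = r + gamma * R
        out.insert(0, R)
    return out

print(discounted_returns([1.0, 0.0, 2.0], gamma=0.9))  # ~ [2.62, 1.8, 2.0]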
Example #5
class AgentA2C:
    def __init__(self, env, args):

        # Hyperparameters
        self.lr = 7e-4
        self.gamma = 0.9
        self.hidden_size = 512
        self.update_freq = 5
        self.n_processes = args.remotes
        self.seed = 42
        self.max_steps = 1e9
        self.grad_norm = 0.5
        self.entropy_weight = 0.05
        self.eps = np.finfo(np.float32).eps.item()

        #######################    NOTE: You need to implement
        self.recurrent = True  # <- ActorCritic._forward_rnn()
        #######################    Please check a2c/actor_critic.py

        self.display_freq = 1000
        self.save_freq = 1
        self.save_dir = './ckpts/'

        torch.manual_seed(self.seed)
        torch.cuda.manual_seed_all(self.seed)

        self.envs = env
        if self.envs is None:
            self.envs = MultiEnv()
            self.envs.configure(remotes=self.n_processes)

        self.device = torch.device("cuda:0" if use_cuda else "cpu")

        observation = self.envs.reset()
        self.obs_shape = np.transpose(observation[0], (2, 0, 1)).shape
        self.act_shape = args.action_space

        self.rollouts = RolloutStorage(self.update_freq, self.n_processes,
                                       self.obs_shape, self.act_shape,
                                       self.hidden_size)
        self.model = ActorCritic(self.obs_shape, self.act_shape,
                                 self.hidden_size,
                                 self.recurrent).to(self.device)
        self.optimizer = RMSprop(self.model.parameters(), lr=self.lr, eps=1e-5)

        if args.test_a2c:
            self.load_model('./ckpts/model_1239.pt')

        self.hidden = None
        self.init_game_setting()

    def _update(self):
        # R_t = reward_t + gamma * R_{t+1}
        with torch.no_grad():
            next_value, _, _ = self.model(self.rollouts.obs[-1],
                                          self.rollouts.hiddens[-1],
                                          self.rollouts.masks[-1])

        self.rollouts.returns[-1] = next_value.detach()
        for step in reversed(range(self.rollouts.rewards.size(0))):
            self.rollouts.returns[step] = self.rollouts.rewards[step] + \
                                            (self.rollouts.returns[step + 1] * \
                                             self.gamma * \
                                             self.rollouts.masks[step + 1])

        # Compute actor critic loss (value_loss, action_loss)
        # OPTIONAL: You can also maximize entropy to encourage exploration
        # loss = value_loss + action_loss (- entropy_weight * entropy)
        values, action_probs, _ = self.model(
            self.rollouts.obs[:-1].view(-1, self.obs_shape[0],
                                        self.obs_shape[1], self.obs_shape[2]),
            self.rollouts.hiddens[0], self.rollouts.masks[:-1].view(-1, 1))
        distribution = torch.distributions.Categorical(action_probs)
        log_probs = distribution.log_prob(
            self.rollouts.actions.flatten()).flatten()
        returns = self.rollouts.returns[:-1].flatten()
        values = values.flatten()
        value_loss = F.smooth_l1_loss(returns, values)
        advantages = returns - values
        action_loss = -(log_probs * advantages.detach()).mean()
        entropy = distribution.entropy().mean()
        loss = value_loss + action_loss + (-self.entropy_weight * entropy)

        # Update
        self.optimizer.zero_grad()
        loss.backward()
        clip_grad_norm_(self.model.parameters(), self.grad_norm)
        self.optimizer.step()

        # Clear rollouts after update (RolloutStorage.reset())
        self.rollouts.reset()

        return loss.item()

    def _step(self, obs, hiddens, masks):
        with torch.no_grad():
            # Sample actions from the output distributions
            # HINT: you can use torch.distributions.Categorical
            values, action_probs, hiddens = self.model(obs, hiddens, masks)
            actions = torch.distributions.Categorical(action_probs).sample()

        transformed_action = multiActionTransform(actions.cpu().numpy())
        obs, rewards, dones, infos = self.envs.step(transformed_action)

        # Store transitions (obs, hiddens, actions, values, rewards, masks)
        # You need to convert arrays to tensors first
        # HINT: masks = (1 - dones)
        obs = torch.from_numpy(obs).to(self.device).permute(0, 3, 1, 2)
        masks = torch.from_numpy(1 - dones).to(self.device)
        rewards = torch.from_numpy(rewards).to(self.device)
        penalty_rewards = (1 - masks) * -10
        rewards = rewards + penalty_rewards.double()

        self.rollouts.insert(obs, hiddens, actions.unsqueeze(1), values,
                             rewards.unsqueeze(1), masks.unsqueeze(1))

    def train(self):

        print('~' * 100)
        print('START TRAINING'.center(100, '~'))
        print('~' * 100)

        running_reward = deque(maxlen=self.update_freq * 2)
        episode_rewards = torch.zeros(self.n_processes, 1).to(self.device)
        total_steps = 0

        # Store first observation
        obs = torch.from_numpy(self.envs.reset()).to(self.device).permute(
            0, 3, 1, 2)
        self.rollouts.obs[0].copy_(obs)
        self.rollouts.to(self.device)

        max_reward = 0.0
        counter = 0
        continual_crash = 0

        while True:
            try:
                # Update once every n-steps
                for step in range(self.update_freq):
                    self._step(self.rollouts.obs[step],
                               self.rollouts.hiddens[step],
                               self.rollouts.masks[step])

                    # Calculate episode rewards
                    episode_rewards += self.rollouts.rewards[step]
                    for r, m in zip(episode_rewards,
                                    self.rollouts.masks[step + 1]):
                        if m == 0:
                            running_reward.append(r.item())
                    episode_rewards *= self.rollouts.masks[step + 1]

                loss = self._update()
                total_steps += self.update_freq * self.n_processes

                # Log & save model
                if len(running_reward) == 0:
                    avg_reward = 0
                else:
                    avg_reward = sum(running_reward) / len(running_reward)

                if total_steps % self.display_freq == 0:
                    print(
                        'Steps: %d/%d | Avg reward: %f | Max reward: %f' %
                        (total_steps, self.max_steps, avg_reward, max_reward))
                    with open('a2c_log.txt', 'a') as fout:
                        fout.write(str(avg_reward) + '\n')

                if total_steps % self.save_freq == 0:
                    self.save_model('model_{}.pt'.format(counter), avg_reward)
                    counter += 1

                if avg_reward > max_reward:
                    max_reward = avg_reward
                    self.save_model('model_max_{}.pt'.format(counter),
                                    max_reward)
                    counter += 1

                if total_steps >= self.max_steps:
                    break

                continual_crash = 0

            except Exception as e:
                continual_crash += 1

                if continual_crash >= 10:
                    print('=' * 100)
                    print(e)
                    print("Crashed 10 times in a row -- stopping")
                    print('=' * 100)
                    raise e
                else:
                    print('#' * 100)
                    print(e)
                    print("Env crashed -- recreating the environment")
                    print('#' * 100)

                    time.sleep(60)
                    self.envs = MultiEnv(resize=(250, 150))
                    self.envs.configure(remotes=self.n_processes)
                    time.sleep(60)

    def save_model(self, filename, max_reward):
        if not os.path.isdir(self.save_dir):
            os.mkdir(self.save_dir)
        print('model saved: ' + filename + ' (' + str(max_reward) + ')')
        torch.save(self.model, os.path.join(self.save_dir, filename))

    def load_model(self, path):
        if use_cuda:
            self.model = torch.load(path)
        else:
            self.model = torch.load(path, map_location='cpu')

    def init_game_setting(self):
        if self.recurrent:
            self.hidden = torch.zeros(1, self.hidden_size).to(self.device)

    def make_action(self, observation, test=False):
        with torch.no_grad():
            observation = torch.from_numpy(observation).float().permute(
                0, 3, 1, 2).to(self.device)
            _, action_prob, hidden = self.model(
                observation, self.hidden,
                torch.ones(1, 1).to(self.device))
            self.hidden = hidden
            action = torch.distributions.Categorical(action_prob).sample()

        return action.cpu().numpy()
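
A minimal driver sketch for this agent. The `args` fields mirror exactly what `__init__` reads (`remotes`, `action_space`, `test_a2c`); the concrete values are illustrative, passing `env=None` lets the constructor build its own `MultiEnv`, and the module-level dependencies (`MultiEnv`, `use_cuda`, `ActorCritic`, `RolloutStorage`) are assumed to be importable as in the original project:

from types import SimpleNamespace

# Hypothetical argument bundle; only the fields the constructor touches are set.
args = SimpleNamespace(remotes=4, action_space=12, test_a2c=False)

agent = AgentA2C(env=None, args=args)
agent.train()  # runs until total_steps >= max_steps, logging averages to a2c_log.txt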
Example #6
def main():
    print("#######")
    print(
        "WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards"
    )
    print("#######")

    os.environ['OMP_NUM_THREADS'] = '1'

    envs = [
        make_env(args.env_name,
                 seed=args.seed,
                 digit=args.digit,
                 rank=i,
                 log_dir=args.log_dir,
                 use_patience=args.use_patience)
        for i in range(args.num_processes)
    ]

    if args.num_processes > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)

    if len(envs.observation_space.shape) == 1:
        envs = VecNormalize(envs)

    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])
    print(obs_shape)

    actor_critic = CNNPolicy(obs_shape[0], envs.action_space,
                             args.recurrent_policy)

    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]

    if args.cuda:
        actor_critic.cuda()

    if args.algo == 'a2c':
        optimizer = optim.RMSprop(actor_critic.parameters(),
                                  args.lr,
                                  eps=args.eps,
                                  alpha=args.alpha)
    elif args.algo == 'ppo':
        optimizer = optim.Adam(actor_critic.parameters(),
                               args.lr,
                               eps=args.eps)
    elif args.algo == 'acktr':
        optimizer = KFACOptimizer(actor_critic)

    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape,
                              envs.action_space, actor_critic.state_size)
    current_obs = torch.zeros(args.num_processes, *obs_shape)

    def update_current_obs(obs):
        shape_dim0 = envs.observation_space.shape[0]
        obs = torch.from_numpy(obs).float()
        if args.num_stack > 1:
            current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
        current_obs[:, -shape_dim0:] = obs

    obs = envs.reset()
    update_current_obs(obs)

    rollouts.observations[0].copy_(current_obs)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])
    episode_lengths = torch.zeros([args.num_processes, 1])

    if args.cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()

    start = time.time()
    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            value, action, action_log_prob, states = actor_critic.act(
                Variable(rollouts.observations[step], volatile=True),
                Variable(rollouts.states[step], volatile=True),
                Variable(rollouts.masks[step], volatile=True))
            cpu_actions = action.data.squeeze(1).cpu().numpy()

            # Observe reward and next obs
            obs, reward, done, info = envs.step(cpu_actions)
            reward = torch.from_numpy(np.expand_dims(np.stack(reward),
                                                     1)).float()
            episode_rewards += reward

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks
            episode_lengths += torch.ones(episode_lengths.size())
            episode_lengths *= masks

            if args.cuda:
                masks = masks.cuda()

            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks

            update_current_obs(obs)
            rollouts.insert(current_obs, states.data, action.data,
                            action_log_prob.data, value.data, reward, masks)

        next_value = actor_critic.get_value(
            Variable(rollouts.observations[-1], volatile=True),
            Variable(rollouts.states[-1], volatile=True),
            Variable(rollouts.masks[-1], volatile=True)).data

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        if args.algo in ['a2c', 'acktr']:
            values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions(
                Variable(rollouts.observations[:-1].view(-1, *obs_shape)),
                Variable(rollouts.states[0].view(-1, actor_critic.state_size)),
                Variable(rollouts.masks[:-1].view(-1, 1)),
                Variable(rollouts.actions.view(-1, action_shape)))

            values = values.view(args.num_steps, args.num_processes, 1)
            action_log_probs = action_log_probs.view(args.num_steps,
                                                     args.num_processes, 1)

            advantages = Variable(rollouts.returns[:-1]) - values
            value_loss = advantages.pow(2).mean()

            action_loss = -(Variable(advantages.data) *
                            action_log_probs).mean()

            if args.algo == 'acktr' and optimizer.steps % optimizer.Ts == 0:
                # Sampled fisher, see Martens 2014
                actor_critic.zero_grad()
                pg_fisher_loss = -action_log_probs.mean()

                value_noise = Variable(torch.randn(values.size()))
                if args.cuda:
                    value_noise = value_noise.cuda()

                sample_values = values + value_noise
                vf_fisher_loss = -(values -
                                   Variable(sample_values.data)).pow(2).mean()

                fisher_loss = pg_fisher_loss + vf_fisher_loss
                optimizer.acc_stats = True
                fisher_loss.backward(retain_graph=True)
                optimizer.acc_stats = False

            optimizer.zero_grad()
            (value_loss * args.value_loss_coef + action_loss -
             dist_entropy * args.entropy_coef).backward()

            if args.algo == 'a2c':
                nn.utils.clip_grad_norm(actor_critic.parameters(),
                                        args.max_grad_norm)

            optimizer.step()
        elif args.algo == 'ppo':
            advantages = rollouts.returns[:-1] - rollouts.value_preds[:-1]
            advantages = (advantages - advantages.mean()) / (advantages.std() +
                                                             1e-5)

            for e in range(args.ppo_epoch):
                if args.recurrent_policy:
                    data_generator = rollouts.recurrent_generator(
                        advantages, args.num_mini_batch)
                else:
                    data_generator = rollouts.feed_forward_generator(
                        advantages, args.num_mini_batch)

                for sample in data_generator:
                    observations_batch, states_batch, actions_batch, \
                       return_batch, masks_batch, old_action_log_probs_batch, \
                            adv_targ = sample

                    # Reshape to do in a single forward pass for all steps
                    values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions(
                        Variable(observations_batch), Variable(states_batch),
                        Variable(masks_batch), Variable(actions_batch))

                    adv_targ = Variable(adv_targ)
                    ratio = torch.exp(action_log_probs -
                                      Variable(old_action_log_probs_batch))
                    surr1 = ratio * adv_targ
                    surr2 = torch.clamp(ratio, 1.0 - args.clip_param,
                                        1.0 + args.clip_param) * adv_targ
                    action_loss = -torch.min(
                        surr1,
                        surr2).mean()  # PPO's pessimistic surrogate (L^CLIP)

                    value_loss = (Variable(return_batch) -
                                  values).pow(2).mean()

                    optimizer.zero_grad()
                    (value_loss + action_loss -
                     dist_entropy * args.entropy_coef).backward()
                    nn.utils.clip_grad_norm(actor_critic.parameters(),
                                            args.max_grad_norm)
                    optimizer.step()

        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [
                save_model,
                hasattr(envs, 'ob_rms') and envs.ob_rms or None
            ]

            torch.save(save_model,
                       os.path.join(save_path, args.env_name + ".pt"))

        if j % args.log_interval == 0:
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            print(
                "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}, Episode lengths {:.2f}"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        final_rewards.mean(), final_rewards.median(),
                        final_rewards.min(), final_rewards.max(),
                        dist_entropy.data[0], value_loss.data[0],
                        action_loss.data[0], episode_lengths.mean()))
        if j > 0 and j % args.vis_interval == 0:
            pass
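
The PPO branch above minimizes the clipped ("pessimistic") surrogate L^CLIP; the same term pulled out into a compact standalone sketch (function and argument names are illustrative):

import torch

def ppo_clip_loss(new_log_probs, old_log_probs, advantages, clip_param=0.2):
    # ratio = pi_new(a|s) / pi_old(a|s), computed in log space for stability
    ratio = torch.exp(new_log_probs - old_log_probs)
    surr1 = ratio * advantages
    surr2 = torch.clamp(ratio, 1.0 - clip_param, 1.0 + clip_param) * advantages
    # element-wise minimum of the two surrogates, negated to give a loss to minimize
    return -torch.min(surr1, surr2).mean()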
Example #7
class AgentMario:
    def __init__(self, env, args):

        # Hyperparameters
        self.lr = 7e-4
        self.gamma = 0.9
        self.hidden_size = 512
        self.update_freq = 5
        self.n_processes = 16
        self.seed = 7122
        self.max_steps = 1e7
        self.grad_norm = 0.5
        self.entropy_weight = 0.05

        #######################    NOTE: You need to implement
        self.recurrent = True  # <- ActorCritic._forward_rnn()
        #######################    Please check a2c/actor_critic.py

        self.display_freq = 4000
        self.save_freq = 100000
        self.save_dir = './checkpoints/'

        torch.manual_seed(self.seed)
        torch.cuda.manual_seed_all(self.seed)

        self.envs = env
        if self.envs is None:
            self.envs = make_vec_envs('SuperMarioBros-v0', self.seed,
                                      self.n_processes)
        self.device = torch.device("cuda:0" if use_cuda else "cpu")

        self.obs_shape = self.envs.observation_space.shape
        self.act_shape = self.envs.action_space.n

        self.rollouts = RolloutStorage(self.update_freq, self.n_processes,
                                       self.obs_shape, self.act_shape,
                                       self.hidden_size)
        self.model = ActorCritic(self.obs_shape, self.act_shape,
                                 self.hidden_size,
                                 self.recurrent).to(self.device)
        self.optimizer = RMSprop(self.model.parameters(), lr=self.lr, eps=1e-5)

        self.hidden = None
        self.init_game_setting()

    ####
    def calc_actual_state_values(self, rewards, dones):
        R = []
        rewards.reverse()

        # If we happen to end the set on a terminal state, set next return to zero
        if dones[-1] == True:
            next_return = 0

        # If not terminal state, bootstrap v(s) using our critic
        # TODO: don't need to estimate again, just take from last value of v(s) estimates
        else:
            s = torch.from_numpy(self.rollouts.obs[-1]).float().unsqueeze(
                0)  #states
            next_return = self.model.get_state_value(Variable(s)).data[0][0]

            # Backup from last state to calculate "true" returns for each state in the set
        R.append(next_return)
        dones.reverse()
        for r in range(1, len(rewards)):
            if not dones[r]:
                this_return = rewards[r] + next_return * self.gamma
            else:
                this_return = 0
            R.append(this_return)
            next_return = this_return

        R.reverse()
        state_values_true = Variable(torch.FloatTensor(R)).unsqueeze(1)

        return state_values_true
####

    def _update(self):
        # Compute returns
        # R_t = reward_t + gamma * R_{t+1}
        state_values_true = self.calc_actual_state_values(
            self.rollouts.rewards, self.rollouts.dones)

        # Compute actor critic loss (value_loss, action_loss)
        # OPTIONAL: You can also maximize entropy to encourage exploration
        # loss = value_loss + action_loss (- entropy_weight * entropy)

        s = Variable(torch.FloatTensor(self.rollouts.obs))
        action_probs, state_values_est, hiddens = self.model(s)
        action_log_probs = action_probs.log()
        a = Variable(torch.LongTensor(self.rollouts.actions).view(-1, 1))
        chosen_action_log_probs = action_log_probs.gather(1, a)
        # The advantage is also the TD error
        advantages = state_values_true - state_values_est
        entropy = -(action_probs * action_log_probs).sum(1).mean()
        # Policy gradient: maximize log pi(a|s) * advantage, i.e. minimize its negative;
        # the advantage is treated as a constant in the actor term.
        action_loss = -(chosen_action_log_probs * advantages.detach()).mean()
        value_loss = advantages.pow(2).mean()

        loss = value_loss + action_loss - 0.0001 * entropy  # entropy_weight = 0.0001
        # Update
        self.optimizer.zero_grad()
        loss.backward()
        clip_grad_norm_(self.model.parameters(), self.grad_norm)
        self.optimizer.step()

        # Clear rollouts after update
        self.rollouts.reset()

        return loss.item()

    def _step(self, obs, hiddens, masks):
        with torch.no_grad():
            # Sample actions from the output distributions
            # HINT: you can use torch.distributions.Categorical
            action_probs, values, hiddens = self.make_action(obs, hiddens, masks)
            actions = torch.distributions.Categorical(action_probs).sample()

        obs, rewards, dones, infos = self.envs.step(actions.cpu().numpy())

        # Store transitions (obs, hiddens, actions, values, rewards, masks)
        # Convert arrays to tensors first; masks = (1 - dones)
        obs = torch.from_numpy(obs).to(self.device)
        rewards = torch.from_numpy(rewards).float().to(self.device).unsqueeze(1)
        masks = torch.from_numpy(1 - dones).float().to(self.device).unsqueeze(1)
        self.rollouts.insert(obs, hiddens, actions.unsqueeze(1), values, rewards, masks)

    def train(self):

        print('Start training')
        running_reward = deque(maxlen=10)
        episode_rewards = torch.zeros(self.n_processes, 1).to(self.device)
        total_steps = 0

        # Store first observation
        obs = torch.from_numpy(self.envs.reset()).to(self.device)
        self.rollouts.obs[0].copy_(obs)  #torch.Size([16, 4, 84, 84])
        self.rollouts.to(self.device)

        while True:
            # Update once every n-steps
            for step in range(self.update_freq):
                print("# ******************step***********************", step)
                #print("self.rollouts.actions[step]", self.rollouts.actions[step])
                # print("self.rollouts.obs[step]", self.rollouts.hiddens[step])
                # print("self.rollouts.obs[step]", self.rollouts.masks[step])
                self._step(self.rollouts.obs[step],
                           self.rollouts.hiddens[step],
                           self.rollouts.masks[step])

                # Calculate episode rewards
                episode_rewards += self.rollouts.rewards[step]
                for r, m in zip(episode_rewards,
                                self.rollouts.masks[step + 1]):
                    if m == 0:
                        running_reward.append(r.item())
                episode_rewards *= self.rollouts.masks[step + 1]

            loss = self._update()
            total_steps += self.update_freq * self.n_processes

            # Log & save model
            if len(running_reward) == 0:
                avg_reward = 0
            else:
                avg_reward = sum(running_reward) / len(running_reward)

            if total_steps % self.display_freq == 0:
                print('Steps: %d/%d | Avg reward: %f' %
                      (total_steps, self.max_steps, avg_reward))

            if total_steps % self.save_freq == 0:
                self.save_model('model.pt')

            if total_steps >= self.max_steps:
                break

    def save_model(self, filename):
        torch.save(self.model, os.path.join(self.save_dir, filename))

    def load_model(self, path):
        self.model = torch.load(path)

    def init_game_setting(self):
        if self.recurrent:
            self.hidden = torch.zeros(1, self.hidden_size).to(self.device)

    def make_action(self, observation, hiddens, masks, test=False):
        # Run the model to get action probabilities, state values, and hidden states;
        # the caller (_step) samples an action from the returned distribution.
        values, action_probs, hiddens = self.model(observation, hiddens, masks)
        return action_probs, values, hiddens
Example #8
class AgentMario:
    def __init__(self, env, args):

        # Hyperparameters
        self.lr = 7e-4
        self.gamma = 0.9
        self.hidden_size = 512
        self.update_freq = 5
        self.n_processes = 16
        self.seed = 7122
        self.max_steps = 1e7
        self.grad_norm = 0.5
        self.entropy_weight = 0.05

        #######################    NOTE: You need to implement
        self.recurrent = True  # <- ActorCritic._forward_rnn()
        #######################    Please check a2c/actor_critic.py

        self.display_freq = 4000
        self.save_freq = 100000
        self.save_dir = './checkpoints/'

        torch.manual_seed(self.seed)
        torch.cuda.manual_seed_all(self.seed)

        self.envs = env
        if self.envs is None:
            self.envs = make_vec_envs('SuperMarioBros-v0', self.seed,
                                      self.n_processes)
        self.device = torch.device("cuda:0" if use_cuda else "cpu")

        self.obs_shape = self.envs.observation_space.shape
        self.act_shape = self.envs.action_space.n

        self.rollouts = RolloutStorage(self.update_freq, self.n_processes,
                                       self.obs_shape, self.act_shape,
                                       self.hidden_size)
        self.model = ActorCritic(self.obs_shape, self.act_shape,
                                 self.hidden_size,
                                 self.recurrent).to(self.device)
        self.optimizer = RMSprop(self.model.parameters(), lr=self.lr, eps=1e-5)

        self.hidden = None
        self.init_game_setting()

    def _update(self):
        # TODO: Compute returns
        # R_t = reward_t + gamma * R_{t+1}

        # TODO:
        # Compute actor critic loss (value_loss, action_loss)
        # OPTIONAL: You can also maximize entropy to encourage exploration
        # loss = value_loss + action_loss (- entropy_weight * entropy)

        # Update
        self.optimizer.zero_grad()
        loss.backward()
        clip_grad_norm_(self.model.parameters(), self.grad_norm)
        self.optimizer.step()

        # TODO:
        # Clear rollouts after update (RolloutStorage.reset())

        return loss.item()

    def _step(self, obs, hiddens, masks):
        with torch.no_grad():
            pass
            # TODO:
            # Sample actions from the output distributions
            # HINT: you can use torch.distributions.Categorical

        obs, rewards, dones, infos = self.envs.step(actions.cpu().numpy())

        # TODO:
        # Store transitions (obs, hiddens, actions, values, rewards, masks)
        # You need to convert arrays to tensors first
        # HINT: masks = (1 - dones)

    def train(self):

        print('Start training')
        running_reward = deque(maxlen=10)
        episode_rewards = torch.zeros(self.n_processes, 1).to(self.device)
        total_steps = 0

        # Store first observation
        obs = torch.from_numpy(self.envs.reset()).to(self.device)
        self.rollouts.obs[0].copy_(obs)
        self.rollouts.to(self.device)

        while True:
            # Update once every n-steps
            for step in range(self.update_freq):
                self._step(self.rollouts.obs[step],
                           self.rollouts.hiddens[step],
                           self.rollouts.masks[step])

                # Calculate episode rewards
                episode_rewards += self.rollouts.rewards[step]
                for r, m in zip(episode_rewards,
                                self.rollouts.masks[step + 1]):
                    if m == 0:
                        running_reward.append(r.item())
                episode_rewards *= self.rollouts.masks[step + 1]

            loss = self._update()
            total_steps += self.update_freq * self.n_processes

            # Log & save model
            if len(running_reward) == 0:
                avg_reward = 0
            else:
                avg_reward = sum(running_reward) / len(running_reward)

            if total_steps % self.display_freq == 0:
                print('Steps: %d/%d | Avg reward: %f' %
                      (total_steps, self.max_steps, avg_reward))

            if total_steps % self.save_freq == 0:
                self.save_model('model.pt')

            if total_steps >= self.max_steps:
                break

    def save_model(self, filename):
        torch.save(self.model, os.path.join(self.save_dir, filename))

    def load_model(self, path):
        self.model = torch.load(path)

    def init_game_setting(self):
        if self.recurrent:
            self.hidden = torch.zeros(1, self.hidden_size).to(self.device)

    def make_action(self, observation, test=False):
        # TODO: Use your model to choose an action
        return action
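
For comparison with the completed agents above, one possible way to fill in the `_step` TODOs of this skeleton. This is a sketch only: it assumes the same `RolloutStorage.insert` signature and CHW observations used in the other examples.

    def _step(self, obs, hiddens, masks):
        with torch.no_grad():
            # Sample actions from the categorical policy output
            values, action_probs, hiddens = self.model(obs, hiddens, masks)
            actions = torch.distributions.Categorical(action_probs).sample()

        obs, rewards, dones, infos = self.envs.step(actions.cpu().numpy())

        # Convert arrays to tensors and store the transition; masks = (1 - dones)
        obs = torch.from_numpy(obs).to(self.device)
        rewards = torch.from_numpy(rewards).float().to(self.device).unsqueeze(1)
        masks = torch.from_numpy(1 - dones).float().to(self.device).unsqueeze(1)
        self.rollouts.insert(obs, hiddens, actions.unsqueeze(1), values, rewards, masks)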
Example #9
class A2C:
    def __init__(self,
                 env_name="BipedalWalker-v2",
                 num_steps=5,
                 num_workers=10,
                 num_updates=10000,
                 log_frequency=10,
                 use_gae=True,
                 gamma=0.99,
                 tau=0.95,
                 entropy_coef=0.01):

        observation_space, action_space = get_env_info(env_name)
        self.num_steps = num_steps
        self.num_updates = num_updates
        self.log_frequency = log_frequency
        self.use_gae = use_gae
        self.gamma = gamma
        self.tau = tau
        self.entropy_coef = entropy_coef
        self.max_grad_norm = 0.5

        self.simulator = RolloutCollector(env_name, num_workers)
        self.eval_env = gym.make(env_name)
        self.obs_dim = observation_space.shape[0]
        self.action_dim = action_space.shape[0]
        self.storage = RolloutStorage(num_steps, num_workers,
                                      observation_space.shape, action_space)
        self.policy = Actor(self.obs_dim, self.action_dim)
        self.V = Critic(self.obs_dim)

        self.actor_optimizer = optim.Adam(self.policy.parameters(), lr=5e-4)
        self.critic_optimizer = optim.Adam(self.V.parameters(), lr=5e-4)

        # track statistics
        self.episode_count = 0

    def get_actions(self, obs_n):
        with torch.no_grad():
            obs_batch = torch.FloatTensor(np.stack(obs_n))
            dist = self.policy(obs_batch)
            action_sample = dist.sample()
            values = self.V(obs_batch)
            action_n = [
                action_sample[i].numpy() for i in range(len(action_sample))
            ]
        return action_n, action_sample, values

    def update_storage(self, obs, actions, rewards, values, dones):
        self.episode_count += torch.sum(dones).item()
        masks = 1 - dones
        self.storage.insert(obs, actions, values, rewards, masks)

    def set_initial_observations(self, observations):
        self.storage.obs[0].copy_(observations)

    def compute_advantages(self):
        advantages = self.storage.returns[:-1] - self.storage.values[:-1]
        # standardize the advantages
        advantages = (advantages - advantages.mean()) / (advantages.std() +
                                                         1e-5)
        return advantages

    def update(self):
        with torch.no_grad():
            next_value = self.V(self.storage.obs[-1])

        self.storage.compute_returns(next_value, self.use_gae, self.gamma,
                                     self.tau)
        self.storage.returns.mul_(0.1)
        advantages = self.compute_advantages()
        obs_batch, actions_batch, values_batch, return_batch, adv_targ = self.storage.build_batch(
            advantages)

        # Update the policy
        self.actor_optimizer.zero_grad()
        action_dist = self.policy(obs_batch)
        action_log_probs = action_dist.log_prob(actions_batch)
        objective = torch.mean(adv_targ * action_log_probs)
        policy_loss = -objective

        # compute the value loss
        self.critic_optimizer.zero_grad()
        value_loss = F.mse_loss(self.V(obs_batch), return_batch)

        # compute other losses
        entropy_loss = -torch.mean(action_dist.entropy())

        # sum the losses, backprop, and step
        net_loss = policy_loss + value_loss + self.entropy_coef * entropy_loss
        net_loss.backward()

        nn.utils.clip_grad_norm_(self.policy.parameters(), self.max_grad_norm)
        nn.utils.clip_grad_norm_(self.V.parameters(), self.max_grad_norm)

        self.critic_optimizer.step()
        self.actor_optimizer.step()
        return (value_loss.detach().item(), -policy_loss.detach().item(),
                -entropy_loss.detach().item())

    def evaluate(self, n=20, render=False):
        env = self.eval_env
        action_bounds = [env.action_space.low, env.action_space.high]
        all_rewards = []
        for i in range(n):
            episode_rewards = []
            state = env.reset()
            terminal = False
            while not terminal:
                dist = self.policy(torch.FloatTensor(state).view(1, -1))
                action = dist.sample().numpy().reshape(-1)
                action = np.clip(action, action_bounds[0], action_bounds[1])
                next_state, reward, terminal, info = env.step(action)
                episode_rewards.append(reward)
                state = next_state
                if render:
                    fps = 8.0
                    env.render()
                    time.sleep(1 / fps)
            all_rewards.append(np.sum(episode_rewards))
        all_rewards = np.array(all_rewards)
        env.reset()
        return all_rewards

    def __iter__(self):
        obs_n = self.simulator.reset()
        for u in range(self.num_updates):
            self.set_initial_observations(torch.FloatTensor(np.stack(obs_n)))
            for t in range(self.num_steps):
                # Compute actions using policy given latest observation
                action_n, actions, values = self.get_actions(obs_n)

                # Give action to each worker and take an environment step
                obs_n, reward_n, done_n = self.simulator.step(action_n)

                observations = torch.FloatTensor(np.stack(obs_n))
                rewards = torch.FloatTensor(np.vstack(reward_n))
                dones = torch.FloatTensor(np.vstack(done_n))

                # Update the storage
                self.update_storage(observations, actions, rewards, values,
                                    dones)

            value_loss, objective, mean_policy_entropy = self.update()
            self.storage.after_update()

            if (u + 1) % self.log_frequency == 0:
                eval_episode_returns = self.evaluate()
                yield self.episode_count, eval_episode_returns, value_loss, objective, mean_policy_entropy
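
Because `__iter__` drives the whole training loop and yields a logging tuple every `log_frequency` updates, using this class is just a for-loop; a minimal usage sketch (the print format is illustrative):

agent = A2C(env_name="BipedalWalker-v2", num_updates=10000, log_frequency=10)
for episodes, eval_returns, value_loss, objective, entropy in agent:
    print("episodes=%d | eval return=%.1f | value loss=%.4f | entropy=%.3f" %
          (episodes, eval_returns.mean(), value_loss, entropy))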
Example #10
    def run(self):
        # (16, 4, 84, 84)
        current_obs = np.zeros([NUM_PROCESSES, *self.obs_shape])
        episode_rewards = np.zeros([NUM_PROCESSES, 1])
        final_rewards = np.zeros([NUM_PROCESSES, 1])

        # torch.Size([16, 1, 84, 84])
        obs = self.env.reset()
        # store the latest obs at the head of the frame stack
        current_obs[:, :1] = obs

        storage = RolloutStorage(NUM_ADVANCED_STEP, NUM_PROCESSES,
                                 self.obs_shape, current_obs)

        for j in tqdm(range(NUM_UPDATES)):
            for step in range(NUM_ADVANCED_STEP):
                #with torch.no_grad():
                _, cpu_actions = self.actor_critic.predict(
                    storage.observations[step] / 255)
                action = np.argmax(np.array(
                    [np.random.multinomial(1, x) for x in cpu_actions]),
                                   axis=1)

                # obs size:(16, 1, 84, 84)
                obs, reward, done, info = self.env.step(action)

                reward = reward.reshape(-1, 1)
                episode_rewards += reward

                final_rewards[done] = episode_rewards[done]
                episode_rewards[done] = 0

                # zero out the current state when the episode is done
                current_obs[done] = 0

                # stack the frames
                current_obs[:, 1:] = current_obs[:, :-1]  # shift frames 1-3 into slots 2-4
                current_obs[:, :1] = obs  # store the latest obs in slot 1

                # insert this step's transition into the rollout storage
                storage.insert(current_obs, action, reward, done)

            # compute the state value predicted from the state at the final advanced step
            #with torch.no_grad():
            input_obs = storage.observations[-1] / 255
            next_value, _ = self.actor_critic.predict(input_obs)

            # compute the discounted return for every step
            storage.compute_discounted_rewards(next_value)

            # update the network and the storage
            self.global_brain.update(storage)
            storage.after_update()

            # log: print intermediate progress
            if j % 100 == 0:
                print(
                    "finished frames {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}"
                    .format(j * NUM_PROCESSES * NUM_ADVANCED_STEP,
                            final_rewards.mean(), np.median(final_rewards),
                            final_rewards.min(), final_rewards.max()))

            # save the network weights
            if j % 12500 == 0:
                self.actor_critic.save('weight_' + str(j) + '.pth')

        # end of the training loop
        self.actor_critic.save('weight_end.pth')
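
The `np.random.multinomial(1, x)` followed by `argmax` in the loop above is just per-row categorical sampling from the action probabilities; an equivalent sketch with an illustrative helper name:

import numpy as np

def sample_actions(action_probs):
    # action_probs: (num_processes, num_actions); each row sums to 1
    return np.array([np.random.choice(len(p), p=p) for p in action_probs])

# e.g. sample_actions(np.array([[0.1, 0.9], [0.5, 0.5]])) -> something like array([1, 0])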
Example #11
class AgentMario:
    def __init__(self, env, args):

        # Hyperparameters
        self.lr = 7e-4
        self.gamma = 0.9
        self.hidden_size = 512
        self.update_freq = 5
        self.n_processes = 64
        self.seed = 7122
        self.max_steps = 1e7
        self.grad_norm = 0.5
        self.entropy_weight = 0.05

        #######################    NOTE: You need to implement
        self.recurrent = False # <- ActorCritic._forward_rnn()
        #######################    Please check a2c/actor_critic.py
        
        self.display_freq = 4000
        self.save_freq = 100000
        self.save_dir = './checkpoints/'

        torch.manual_seed(self.seed)
        torch.cuda.manual_seed_all(self.seed)
         
        self.envs = env
        if self.envs is None:
            self.envs = make_vec_envs('SuperMarioBros-v0', self.seed,
                    self.n_processes)
        self.device = torch.device("cuda:1" if use_cuda else "cpu")

        self.obs_shape = self.envs.observation_space.shape
        self.act_shape = self.envs.action_space.n

        self.rollouts = RolloutStorage(self.update_freq, self.n_processes,
                self.obs_shape, self.act_shape, self.hidden_size) 
        self.model = ActorCritic(self.obs_shape, self.act_shape,
                self.hidden_size, self.recurrent).to(self.device)
        self.optimizer = RMSprop(self.model.parameters(), lr=self.lr, 
                eps=1e-5)

        if args.test_mario:
            self.load_model(os.path.join('mario.pt'))
            print('finish model loading ...')

        self.hidden = None
        self.init_game_setting()
   
    def _update(self):
        # TODO: Compute returns
        # R_t = reward_t + gamma * R_{t+1}
        rewards = self.rollouts.rewards
        obs = self.rollouts.obs
        hiddens = self.rollouts.hiddens
        masks = self.rollouts.masks
        actions = self.rollouts.actions
        preds = self.rollouts.value_preds

        # 5 x 16 x 1
        Vt = preds[:-1]
        Vt_1 = self.gamma * preds[1:] * masks[:-1]
        
        # 5 x 16
        Advantage = (rewards - (Vt - Vt_1)).detach()
        R = Advantage.squeeze(-1)

        # TODO:
        # Compute actor critic loss (value_loss, action_loss)
        # OPTIONAL: You can also maximize entropy to encourage exploration
        # loss = value_loss + action_loss (- entropy_weight * entropy)
        entropys = []
        logP = []
        Q_values = []

        for idx, (ob, hidden, mask) in enumerate(zip(obs, hiddens, masks)):
            value, action_prob, _ = self.model(ob, hidden, mask)
            Q_values.append(value)
            if idx != obs.size(0)-1:
                m = Categorical(action_prob)
                logP.append(m.log_prob(actions[idx].squeeze(-1)))
                entropys.append(torch.mean(m.entropy()))

        logP = torch.stack(logP, 0)
        action_loss = torch.mean(-R * logP)

        Q_values = torch.stack(Q_values, 0)
        Qt = Q_values[:-1]
        Qt_1 = rewards + self.gamma * preds[1:] * masks[:-1]

        mse = torch.nn.MSELoss()
        value_loss = mse(Qt, Qt_1)

        # Weight the entropy bonus as described in the comment above
        entropys = sum(entropys) / len(entropys)
        loss = value_loss + action_loss - self.entropy_weight * entropys

        # Update
        self.optimizer.zero_grad()
        loss.backward()
        clip_grad_norm_(self.model.parameters(), self.grad_norm)
        self.optimizer.step()
        
        # TODO:
        # Clear rollouts after update (RolloutStorage.reset())
        self.rollouts.reset()

        return loss.item()

    def _step(self, obs, hiddens, masks):
        from torch.autograd import Variable
        from torch.distributions import Categorical
        import numpy as np
        
        with torch.no_grad():
            # TODO:
            # Sample actions from the output distributions
            # HINT: you can use torch.distributions.Categorical
            values, action_probs, hiddens = self.model(obs, hiddens, masks)
            m = Categorical(action_probs)
            actions = m.sample()

        obs, rewards, dones, infos = self.envs.step(actions.cpu().numpy())
        
        # TODO:
        # Store transitions (obs, hiddens, actions, values, rewards, masks)
        # You need to convert arrays to tensors first
        # HINT: masks = (1 - dones)
        obs      = Variable(torch.FloatTensor(np.float32(obs)))
        rewards  = Variable(torch.FloatTensor(np.float32(rewards)))
        dones    = Variable(torch.FloatTensor(np.float32(dones))).unsqueeze(1)
        masks    = torch.ones(masks.shape) - dones
        self.rollouts.insert(obs, hiddens, actions.unsqueeze(-1), values, rewards.unsqueeze(-1), masks)

        
    def train(self):
        # logging
        import logging
        logging.basicConfig(filename="mario_reward.log", level=logging.INFO)

        print('Start training')
        running_reward = deque(maxlen=10)
        episode_rewards = torch.zeros(self.n_processes, 1).to(self.device)
        total_steps = 0
        
        # Store first observation
        obs = torch.from_numpy(self.envs.reset()).to(self.device)
        self.rollouts.obs[0].copy_(obs)
        self.rollouts.to(self.device)
        
        while True:
            # Update once every n-steps
            for step in range(self.update_freq):
                self._step(
                    self.rollouts.obs[step],
                    self.rollouts.hiddens[step],
                    self.rollouts.masks[step])

                # Calculate episode rewards
                episode_rewards += self.rollouts.rewards[step]
                for r, m in zip(episode_rewards, self.rollouts.masks[step + 1]):
                    if m == 0:
                        running_reward.append(r.item())
                episode_rewards *= self.rollouts.masks[step + 1]

            loss = self._update()
            total_steps += self.update_freq * self.n_processes

            # Log & save model
            if len(running_reward) == 0:
                avg_reward = 0
            else:
                avg_reward = sum(running_reward) / len(running_reward)

            if total_steps % self.display_freq == 0:
                logging.info("{},{}".format(total_steps, avg_reward))
                print('Steps: %d/%d | Avg reward: %f'%
                        (total_steps, self.max_steps, avg_reward))
            
            if total_steps % self.save_freq == 0:
                self.save_model('model.pt')
            
            if total_steps >= self.max_steps:
                break

    def save_model(self, filename):
        torch.save(self.model, os.path.join(self.save_dir, filename))

    def load_model(self, path):
        self.model = torch.load(path, map_location=torch.device('cpu'))

    def init_game_setting(self):
        if self.recurrent:
            self.hidden = torch.zeros(1, self.hidden_size).to(self.device)

    def make_action(self, observation, test=False):
        # Use the model to choose an action (greedy w.r.t. the policy probabilities)
        observation = torch.from_numpy(observation).float().unsqueeze(0).to(self.device)
        with torch.no_grad():
            value, action_prob, hidden = self.model(
                observation, self.hidden,
                torch.ones(1, 1).to(self.device))
        action = torch.argmax(action_prob, dim=-1).item()
        return action
Example #12
class AgentMario:  #actor agent
    def __init__(self, env, args):

        # Hyperparameters
        self.lr = 7e-4
        self.gamma = 0.99
        self.hidden_size = 512
        self.update_freq = 5
        self.n_processes = 16
        self.seed = 7122
        self.max_steps = 1e7
        self.grad_norm = 0.5
        self.entropy_weight = 0.05

        #######################    NOTE: You need to implement
        self.recurrent = False  # <- ActorCritic._forward_rnn()
        #######################    Please check a2c/actor_critic.py

        if args.test_mario:
            self.load_model('./checkpoints/model.pt')
        self.display_freq = 4000
        self.save_freq = 10000
        self.save_dir = './checkpoints/'

        torch.manual_seed(self.seed)
        torch.cuda.manual_seed_all(self.seed)

        self.envs = env
        if self.envs is None:
            self.envs = make_vec_envs('SuperMarioBros-v0', self.seed,
                                      self.n_processes)
        self.device = torch.device("cuda:0" if use_cuda else "cpu")

        self.obs_shape = self.envs.observation_space.shape
        self.act_shape = self.envs.action_space.n
        #print(self.obs_shape) #(4, 84, 84)
        #print(self.act_shape) #12
        self.rollouts = RolloutStorage(self.update_freq, self.n_processes,
                                       self.obs_shape, self.act_shape,
                                       self.hidden_size)
        self.model = ActorCritic(self.obs_shape, self.act_shape,
                                 self.hidden_size,
                                 self.recurrent).to(self.device)
        self.optimizer = RMSprop(self.model.parameters(), lr=self.lr, eps=1e-5)

        # Load trained weights only after the model has been built, otherwise
        # the freshly constructed ActorCritic would overwrite them.
        if args.test_mario:
            self.load_model('./checkpoints/model.pt')

        self.hidden = None
        self.init_game_setting()

    def _update(self):
        # Compute the bootstrapped one-step return target:
        #   R_t = r_t + gamma * mask_{t+1} * V(s_{t+1})
        # Shapes: obs [n_steps+1, 16, 4, 84, 84], actions [n_steps, 16, 1],
        #         rewards [n_steps, 16, 1], value_preds [n_steps+1, 16, 1]
        # see https://github.com/ikostrikov/pytorch-a2c-ppo-acktr-gail/blob/master/a2c_ppo_acktr/algo/a2c_acktr.py
        obs_shape = self.rollouts.obs.size()[2:]
        num_steps, num_processes, _ = self.rollouts.rewards.size()

        discounted_return = torch.zeros(self.update_freq, self.n_processes,
                                        1).to(self.device)
        for t in range(self.update_freq - 1, -1, -1):
            # mask out the bootstrap value when the episode ended at step t
            discounted_return[t] = self.rollouts.rewards[t] \
                + self.gamma * self.rollouts.masks[t + 1] * self.rollouts.value_preds[t + 1]

        # Re-evaluate the whole rollout in one batched forward pass
        # (batch size = n_steps * n_processes).
        values, action_probs, hiddens = self.model(
            self.rollouts.obs[:-1].view(-1, *obs_shape),
            self.rollouts.hiddens[0].view(-1, self.model.hidden_size),
            self.rollouts.masks[:-1].view(-1, 1))
        values = values.view(num_steps, num_processes, 1)
        action_probs = action_probs.view(num_steps, num_processes, -1)

        # Probability (and log-probability) of the actions that were taken
        action_probs = action_probs.gather(2, self.rollouts.actions)
        action_log_probs = action_probs.log()

        # Advantage A_t = R_t - V(s_t)
        advantages = discounted_return - values

        # Actor-critic loss: the critic regresses onto the return target, the
        # actor follows the policy gradient with the advantage detached so it
        # does not backpropagate into the critic. (An entropy bonus weighted by
        # self.entropy_weight could be added here to encourage exploration.)
        # see https://github.com/jcwleo/mario_rl/blob/master/mario_a2c.py
        # and https://github.com/ikostrikov/pytorch-a2c-ppo-acktr-gail
        critic_loss = advantages.pow(2).mean()
        actor_loss = -(advantages.detach() * action_log_probs).mean()
        loss = actor_loss + critic_loss

        # Update
        self.optimizer.zero_grad()
        loss.backward()
        clip_grad_norm_(self.model.parameters(), self.grad_norm)
        self.optimizer.step()

        # Clear rollouts after the update
        self.rollouts.reset()
        return loss.item()

    def _step(self, obs, hiddens, masks):
        # Collect one transition from each of the n_processes parallel workers.
        # obs [16, 4, 84, 84], hiddens [16, 512], masks [16, 1]
        with torch.no_grad():
            values, action_probs, hiddens = self.model(obs, hiddens, masks)
            # Sample actions from the output distributions
            # (action_probs is already a softmax over the 12 discrete actions)
            m = Categorical(action_probs)
            actions = m.sample()
            action_log_probs = m.log_prob(actions).unsqueeze(1)
            actions = actions.squeeze(0)
            # see https://github.com/jcwleo/mario_rl/blob/master/mario_a2c.py
            # and https://github.com/ikostrikov/pytorch-a2c-ppo-acktr-gail/blob/master/main.py

        # Step all environments at once; the returned obs is s_{t+1} and the
        # rewards are the r_t obtained for the sampled actions a_t.
        obs, rewards, dones, infos = self.envs.step(actions.cpu().numpy())

        # Store the transition (obs: s_{t+1}, hiddens, actions: a_t,
        # values: V(s_t), rewards: r_t, masks = 1 - dones), converting the
        # numpy arrays to tensors first.
        values = values.squeeze(0)
        actions = actions.unsqueeze(1)
        obs = torch.from_numpy(obs)
        rewards = torch.from_numpy(rewards).unsqueeze(1)
        masks = torch.from_numpy(1 - dones.astype('float32')).unsqueeze(1)
        self.rollouts.insert(obs, hiddens, actions, action_log_probs, values,
                             rewards, masks)

    def train(self):

        print('Start training')
        running_reward = deque(maxlen=10)
        episode_rewards = torch.zeros(self.n_processes, 1).to(self.device)
        total_steps = 0

        # Store first observation; rollouts.obs holds n_steps + 1 frames per
        # worker ([6, 16, 4, 84, 84], see ../a2c/storage.py).
        obs = torch.from_numpy(self.envs.reset()).to(self.device)
        self.rollouts.obs[0].copy_(obs)
        self.rollouts.to(self.device)
        while True:
            # Update once every n-steps
            for step in range(self.update_freq):
                self._step(self.rollouts.obs[step],
                           self.rollouts.hiddens[step],
                           self.rollouts.masks[step])

                # Calculate episode rewards
                episode_rewards += self.rollouts.rewards[step]
                for r, m in zip(episode_rewards,
                                self.rollouts.masks[step + 1]):
                    if m == 0:
                        running_reward.append(r.item())
                episode_rewards *= self.rollouts.masks[step + 1]

            loss = self._update()  #update here
            total_steps += self.update_freq * self.n_processes

            # Log & save model
            if len(running_reward) == 0:
                avg_reward = 0
            else:
                avg_reward = sum(running_reward) / len(running_reward)

            if total_steps % self.display_freq == 0:
                print('Steps: %d/%d | Avg reward: %f' %
                      (total_steps, self.max_steps, avg_reward))

            if total_steps % self.save_freq == 0:
                self.save_model('model.pt')

            if total_steps >= self.max_steps:
                break

    def save_model(self, filename):
        print("Save the model to ", self.save_dir)
        torch.save(self.model, os.path.join(self.save_dir, filename))

    def load_model(self, path):
        print("Load the model from ", path)
        self.model = torch.load(path, map_location=self.device)

    def init_game_setting(self):
        if self.recurrent:
            self.hidden = torch.zeros(1, self.hidden_size).to(self.device)

    def make_action(self, observation, test=False):
        # Called by the test script with a single observation of shape (4, 84, 84).
        observation = torch.from_numpy(observation).to(
            self.device).unsqueeze(0)
        # Batch size is 1 here, so the recurrent hidden state and mask are
        # sized [1, hidden_size] and [1, 1].
        # see https://github.com/jcwleo/mario_rl/blob/master/mario_a2c.py
        # and https://github.com/ikostrikov/pytorch-a2c-ppo-acktr-gail/blob/master/evaluation.py
        eval_recurrent_hidden_states = torch.zeros(1, self.model.hidden_size,
                                                   device=self.device)
        eval_masks = torch.zeros(1, 1, device=self.device)
        with torch.no_grad():
            _, action_probs, _ = self.model(observation,
                                            eval_recurrent_hidden_states,
                                            eval_masks)
        # Greedy action: index of the highest probability (action_probs: [1, 12])
        action = action_probs.max(1)[1].item()
        return action
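
The _update above folds the actor and critic objectives into one scalar loss. A compact sketch of that A2C loss for a discrete policy (a2c_loss and its argument layout are illustrative, assuming action_probs is the actor's softmax output):

import torch
from torch.distributions import Categorical

def a2c_loss(action_probs, actions, values, returns, entropy_weight=0.05):
    """A2C loss: policy gradient with a detached advantage plus a value
    regression term and an entropy bonus.

    action_probs: [batch, n_actions] softmax output of the actor head
    actions:      [batch] sampled action indices
    values:       [batch, 1] critic estimates V(s_t)
    returns:      [batch, 1] return targets R_t
    """
    dist = Categorical(action_probs)
    log_probs = dist.log_prob(actions).unsqueeze(1)         # log pi(a_t|s_t)
    advantages = returns - values                           # A_t = R_t - V(s_t)
    actor_loss = -(advantages.detach() * log_probs).mean()  # no grad into the critic
    critic_loss = advantages.pow(2).mean()
    entropy = dist.entropy().mean()
    return actor_loss + critic_loss - entropy_weight * entropy

Detaching the advantage keeps the policy-gradient term from pushing gradients into the value head; only the squared-advantage term trains the critic.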
Ejemplo n.º 13
0
class AgentMario:
    def __init__(self, env, args):

        # Hyperparameters
        self.lr = 7e-4
        self.gamma = 0.9
        self.hidden_size = 512
        self.update_freq = 5
        self.n_processes = 16
        self.seed = 7122
        self.max_steps = 6e6
        self.grad_norm = 0.5
        self.entropy_weight = 0.05

        #######################    NOTE: You need to implement
        self.recurrent = True  # <- ActorCritic._forward_rnn()
        #######################    Please check a2c/actor_critic.py

        self.display_freq = 4000
        self.save_freq = 100000
        self.save_dir = './checkpoints/'

        torch.manual_seed(self.seed)
        torch.cuda.manual_seed_all(self.seed)

        self.envs = env
        if self.envs is None:
            self.envs = make_vec_envs('SuperMarioBros-v0', self.seed,
                                      self.n_processes)
        self.device = torch.device("cuda:0" if use_cuda else "cpu")

        self.obs_shape = self.envs.observation_space.shape
        self.act_shape = self.envs.action_space.n

        self.rollouts = RolloutStorage(self.update_freq, self.n_processes,
                                       self.obs_shape, self.act_shape,
                                       self.hidden_size)
        self.model = ActorCritic(self.obs_shape, self.act_shape,
                                 self.hidden_size,
                                 self.recurrent).to(self.device)
        self.optimizer = RMSprop(self.model.parameters(), lr=self.lr, eps=1e-5)

        # Load trained weights only after the model has been built, otherwise
        # the freshly constructed ActorCritic would overwrite them.
        if args.test_mario:
            self.load_model('./checkpoints/model.pt')

        self.hidden = None
        self.init_game_setting()

    def _update(self):
        # Compute returns, bootstrapping the last one from the critic:
        # R_t = reward_t + gamma * mask_{t+1} * R_{t+1}, with R_T = V(s_T)
        with torch.no_grad():
            next_value, _, _ = self.model(self.rollouts.obs[-1],
                                          self.rollouts.hiddens[-1],
                                          self.rollouts.masks[-1])
        self.rollouts.returns[-1].copy_(next_value)
        for step in reversed(range(self.rollouts.rewards.size(0))):
            self.rollouts.returns[step] = self.rollouts.returns[step+1] * \
                self.gamma * self.rollouts.masks[step+1] + self.rollouts.rewards[step]

        # TODO:
        # Compute actor critic loss (value_loss, action_loss)
        # OPTIONAL: You can also maxmize entropy to encourage exploration
        # loss = value_loss + action_loss (- entropy_weight * entropy)

        obs_shape = self.rollouts.obs.size()[2:]
        action_shape = self.rollouts.actions.size()[-1]
        num_steps, num_processes, _ = self.rollouts.rewards.size()

        values, action_probs, hiddens = self.model(
            self.rollouts.obs[:-1].view(-1, *obs_shape),
            self.rollouts.hiddens[0].view(-1, self.hidden_size),
            self.rollouts.masks[:-1].view(-1, 1))
        m = Categorical(action_probs)
        log_probs = m.log_prob(self.rollouts.actions.view(-1))
        entropys = m.entropy().mean()

        values = values.view(num_steps, num_processes, 1)
        log_probs = log_probs.view(num_steps, num_processes, 1)

        advantages = self.rollouts.returns[:-1] - values
        value_loss = advantages.pow(2).mean()

        action_loss = -(advantages.detach() * log_probs).mean()
        loss = (value_loss + action_loss) - (entropys * self.entropy_weight)
        # Update
        self.optimizer.zero_grad()
        loss.backward()
        clip_grad_norm_(self.model.parameters(), self.grad_norm)
        self.optimizer.step()

        # TODO:
        # Clear rollouts after update (RolloutStorage.reset())
        self.rollouts.reset()

        return loss.item()

    def _step(self, obs, hiddens, masks):
        with torch.no_grad():
            values, action_probs, hiddens = self.model(obs, hiddens, masks)
            # Sample actions from the output distributions
            # (torch.distributions.Categorical over the softmax output)
            m = Categorical(action_probs)
            actions = m.sample()

        obs, rewards, dones, infos = self.envs.step(actions.cpu().numpy())

        # Store transitions (obs, hiddens, actions, values, rewards, masks);
        # arrays are converted to tensors first and masks = (1 - dones).
        masks = torch.FloatTensor([[0.0] if done else [1.0] for done in dones])
        obs = torch.from_numpy(obs).to(self.device)
        rewards = torch.from_numpy(rewards).unsqueeze(1).to(self.device)
        actions = actions.unsqueeze(1)
        self.rollouts.insert(obs, hiddens, actions, values, rewards, masks)

    def train(self):

        print('Start training')
        running_reward = deque(maxlen=10)
        episode_rewards = torch.zeros(self.n_processes, 1).to(self.device)
        total_steps = 0

        # Store first observation
        obs = torch.from_numpy(self.envs.reset()).to(self.device)
        self.rollouts.obs[0].copy_(obs)
        self.rollouts.to(self.device)
        x_value = []
        y_value = []
        while True:
            # Update once every n-steps
            for step in range(self.update_freq):
                self._step(self.rollouts.obs[step],
                           self.rollouts.hiddens[step],
                           self.rollouts.masks[step])

                # Calculate episode rewards
                episode_rewards += self.rollouts.rewards[step]
                for r, m in zip(episode_rewards,
                                self.rollouts.masks[step + 1]):
                    if m == 0:
                        running_reward.append(r.item())
                episode_rewards *= self.rollouts.masks[step + 1]

            loss = self._update()
            total_steps += self.update_freq * self.n_processes

            # Log & save model
            if len(running_reward) == 0:
                avg_reward = 0
            else:
                avg_reward = sum(running_reward) / len(running_reward)

            if total_steps % self.display_freq == 0:
                print('Steps: %d/%d | Avg reward: %f' %
                      (total_steps, self.max_steps, avg_reward))
                x_value.append(total_steps)
                y_value.append(avg_reward)

            if total_steps % self.save_freq == 0:
                self.save_model('model.pt')
            if total_steps >= self.max_steps:
                break
        self.save_curve(x_value, y_value, 'mario_curve')

    def save_curve(self, x_values, y_values, title):
        # Append the (steps, avg reward) learning curve to mario.json.
        import json

        tmp = {title: {'x': x_values, 'y': y_values}}

        if os.path.isfile('./mario.json'):
            with open('mario.json', 'r') as f:
                file = json.load(f)
            file.update(tmp)
            with open('mario.json', 'w') as f:
                json.dump(file, f)
        else:
            with open('mario.json', 'w') as f:
                json.dump(tmp, f)

    def save_model(self, filename):
        torch.save(self.model, os.path.join(self.save_dir, filename))

    def load_model(self, path):
        self.model = torch.load(path, map_location=self.device)

    def init_game_setting(self):
        if self.recurrent:
            self.hidden = torch.zeros(1, self.hidden_size).to(self.device)

    def make_action(self, observation, test=False):
        # Use the model to choose an action; the recurrent hidden state is
        # carried in rollouts.hiddens[0] between consecutive calls.
        with torch.no_grad():
            obs = torch.from_numpy(observation).to(self.device)
            self.rollouts.obs[0].copy_(obs)
            self.rollouts.to(self.device)
            _, action_probs, self.rollouts.hiddens[0] = self.model(
                self.rollouts.obs[0], self.rollouts.hiddens[0],
                self.rollouts.masks[0])
            m = Categorical(action_probs)
            action = m.sample().cpu().numpy()
        return action[0]
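
Every train loop above tracks per-worker episode rewards with the same mask trick: accumulate step rewards, log a worker's total when its mask drops to 0, then zero that worker's accumulator. A small self-contained illustration (the numbers here are made up):

import torch
from collections import deque

n_processes = 4
episode_rewards = torch.zeros(n_processes, 1)
running_reward = deque(maxlen=10)

# One fake step: worker 2 just finished its episode (mask = 0).
step_rewards = torch.tensor([[1.0], [0.5], [2.0], [0.0]])
masks = torch.tensor([[1.0], [1.0], [0.0], [1.0]])

episode_rewards += step_rewards
for r, m in zip(episode_rewards, masks):
    if m == 0:
        running_reward.append(r.item())   # log the finished episode's total
episode_rewards *= masks                  # reset the finished worker's accumulator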
Ejemplo n.º 14
0
    def __init__(self, env, args):

        self.use_gae = True
        self.use_standard = False
        # Hyperparameters
        self.lr = 7e-4
        self.gamma = 0.90
        self.tau = 0.95
        self.hidden_size = 512
        self.update_freq = 5
        self.n_processes = 16
        self.seed = 7122
        self.max_steps = 1e7
        self.grad_norm = 0.5
        self.clip_param = 0.2
        self.entropy_weight = 0.05

        #######################    NOTE: You need to implement
        self.recurrent = False  # <- ActorCritic._forward_rnn()
        #######################    Please check a2c/actor_critic.py

        self.display_freq = 4000
        self.save_freq = 20000

        if args.test_a2c:
            if args.model_path is None:
                raise Exception('give --model_path')
        else:
            if args.folder_name is None:
                raise Exception('give --folder_name')
            self.model_dir = os.path.join('./model', args.folder_name)
            if not os.path.exists(self.model_dir):
                os.mkdir(self.model_dir)

        self.plot = {'reward': []}

        torch.manual_seed(self.seed)
        torch.cuda.manual_seed_all(self.seed)

        self.envs = env
        if self.envs is None:
            self.envs = make_vec_envs('SuperMarioBros-v0', self.seed,
                                      self.n_processes)

        self.device = torch.device("cuda:0" if use_cuda else "cpu")
        self.obs_shape = self.envs.observation_space.shape
        self.act_shape = self.envs.action_space.n

        self.rollouts = RolloutStorage(self.update_freq, self.n_processes,
                                       self.obs_shape, self.act_shape,
                                       self.hidden_size)

        self.model = ActorCritic(self.obs_shape, self.act_shape,
                                 self.hidden_size, self.recurrent)

        self.ppo_epochs = 4
        self.ppo_batch_size = 5

        if args.test_a2c:
            self.load_model(args.model_path)

        self.model = self.model.to(self.device)
        self.optimizer = RMSprop(self.model.parameters(), lr=self.lr, eps=1e-5)

        self.hidden = None
        self.init_game_setting()
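
The use_gae / tau hyperparameters above drive a Generalized Advantage Estimation pass over the rollout, as implemented in the next example. As a standalone reference, a minimal GAE sketch (gae_advantages and the tensor layout are illustrative assumptions; tau plays the role of the usual lambda):

import torch

def gae_advantages(rewards, values, masks, gamma=0.90, tau=0.95):
    """Generalized Advantage Estimation.

    rewards: [n_steps, n_processes, 1]
    values:  [n_steps + 1, n_processes, 1]  (includes V(s_T) for bootstrapping)
    masks:   [n_steps + 1, n_processes, 1]  (0 where the episode ended)
    """
    n_steps = rewards.size(0)
    advantages = torch.zeros_like(rewards)
    gae = torch.zeros_like(values[0])
    for t in reversed(range(n_steps)):
        delta = rewards[t] + gamma * values[t + 1] * masks[t + 1] - values[t]
        gae = delta + gamma * tau * masks[t + 1] * gae
        advantages[t] = gae
    return advantages  # the return target is advantages + values[:-1]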
Ejemplo n.º 15
0
class AgentA2C:
    def __init__(self, env, args):

        self.use_gae = True
        self.use_standard = False
        # Hyperparameters
        self.lr = 7e-4
        self.gamma = 0.90
        self.tau = 0.95
        self.hidden_size = 512
        self.update_freq = 5
        self.n_processes = 16
        self.seed = 7122
        self.max_steps = 1e7
        self.grad_norm = 0.5
        self.clip_param = 0.2
        self.entropy_weight = 0.05

        #######################    NOTE: You need to implement
        self.recurrent = False  # <- ActorCritic._forward_rnn()
        #######################    Please check a2c/actor_critic.py

        self.display_freq = 4000
        self.save_freq = 20000

        if args.test_a2c:
            if args.model_path is None:
                raise Exception('give --model_path')
        else:
            if args.folder_name is None:
                raise Exception('give --folder_name')
            self.model_dir = os.path.join('./model', args.folder_name)
            if not os.path.exists(self.model_dir):
                os.mkdir(self.model_dir)

        self.plot = {'reward': []}

        torch.manual_seed(self.seed)
        torch.cuda.manual_seed_all(self.seed)

        self.envs = env
        if self.envs is None:
            self.envs = make_vec_envs('SuperMarioBros-v0', self.seed,
                                      self.n_processes)

        self.device = torch.device("cuda:0" if use_cuda else "cpu")
        self.obs_shape = self.envs.observation_space.shape
        self.act_shape = self.envs.action_space.n

        self.rollouts = RolloutStorage(self.update_freq, self.n_processes,
                                       self.obs_shape, self.act_shape,
                                       self.hidden_size)

        self.model = ActorCritic(self.obs_shape, self.act_shape,
                                 self.hidden_size, self.recurrent)

        self.ppo_epochs = 4
        self.ppo_batch_size = 5

        if args.test_a2c:
            self.load_model(args.model_path)

        self.model = self.model.to(self.device)
        self.optimizer = RMSprop(self.model.parameters(), lr=self.lr, eps=1e-5)

        self.hidden = None
        self.init_game_setting()

    def ppo_iter(self, mini_batch_size, states, hiddens, masks, actions,
                 log_probs, returns, advantage):
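        # Yield random mini-batches along the time dimension of the rollout;
        # batch_size here is the number of stored steps (update_freq), and the
        # indices are drawn with replacement.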
        batch_size = states.size(0)
        for _ in range(batch_size // mini_batch_size):
            rand_ids = np.random.randint(0, batch_size, mini_batch_size)
            yield states[rand_ids, :], hiddens[rand_ids, :], masks[
                rand_ids, :], actions[rand_ids, :], log_probs[
                    rand_ids, :], returns[rand_ids, :], advantage[rand_ids, :]

    def _update(self):

        # Bootstrap from the critic: R_T = V(s_T), then R_t = reward_t + gamma * R_{t+1}
        with torch.no_grad():
            Return = self.model.get_estimate_returns(self.rollouts.obs[-1],
                                                     self.rollouts.hiddens[-1],
                                                     self.rollouts.masks[-1])

        self.rollouts.value_preds[-1].copy_(Return)
        self.rollouts.returns[-1].copy_(Return * self.rollouts.masks[-1])

        if self.use_standard:
            # Standardize rewards; the small constant guards against a zero std.
            self.rollouts.rewards = (
                self.rollouts.rewards - self.rollouts.rewards.mean()) / (
                    self.rollouts.rewards.std() + 1e-8)

        if self.use_gae:
            gae = 0
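            # GAE: delta_t = r_t + gamma * V(s_{t+1}) * m_{t+1} - V(s_t)
            #      A_t     = delta_t + gamma * tau * m_{t+1} * A_{t+1}
            # and the stored return target is R_t = A_t + V(s_t).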
            for r in reversed(range(len(self.rollouts.rewards))):
                delta = self.rollouts.rewards[r] \
                        + self.gamma * self.rollouts.value_preds[r+1] * self.rollouts.masks[r+1] \
                        - self.rollouts.value_preds[r]
                gae = delta + self.gamma * self.tau * self.rollouts.masks[
                    r + 1] * gae
                Return = gae + self.rollouts.value_preds[r]
                self.rollouts.returns[r].copy_(Return)
        else:
            for r in reversed(range(len(self.rollouts.rewards))):
                Return = self.rollouts.rewards[
                    r] + self.gamma * Return * self.rollouts.masks[r + 1]
                self.rollouts.returns[r].copy_(Return)

        # Compute actor critic loss (value_loss, action_loss)
        # OPTIONAL: You can also maxmize entropy to encourage exploration
        # loss = value_loss + action_loss (- entropy_weight * entropy)

        # Re-evaluate value estimates and action probabilities for the whole
        # rollout in a single batched forward pass; no gradients are needed
        # here because these serve as the "old" policy for the PPO ratio.
        with torch.no_grad():
            est_returns, log_probs, _ = self.model(
                self.rollouts.obs[:-1].view(
                    self.n_processes * self.update_freq, *self.obs_shape),
                self.rollouts.hiddens[:-1].view(
                    self.n_processes * self.update_freq, -1),
                self.rollouts.masks[:-1].view(
                    self.n_processes * self.update_freq, -1),
            )
        states = self.rollouts.obs[:-1]
        hiddens = self.rollouts.hiddens[:-1]
        masks = self.rollouts.masks[:-1]
        actions = self.rollouts.actions
        returns = self.rollouts.returns[:-1]
        est_returns = est_returns.view(self.update_freq, self.n_processes, -1)
        # log_probs actually holds action probabilities; pick out the
        # probability of each action that was taken.
        log_probs = log_probs.gather(
            1, actions.view(self.n_processes * self.update_freq,
                            -1)).view(self.update_freq, self.n_processes, -1)
        advantages = returns - est_returns

        all_loss = []

        for _ in range(self.ppo_epochs):
            for state, hidden, mask, action, old_log_probs, return_, advantage in self.ppo_iter(
                    self.ppo_batch_size, states, hiddens, masks, actions,
                    log_probs, returns, advantages):

                action = action.view(self.n_processes * self.ppo_batch_size,
                                     -1)
                return_ = return_.view(self.n_processes * self.ppo_batch_size,
                                       -1)
                state = state.view(self.n_processes * self.ppo_batch_size,
                                   *self.obs_shape)
                hidden = hidden.view(self.n_processes * self.ppo_batch_size,
                                     -1)
                mask = mask.view(self.n_processes * self.ppo_batch_size, -1)
                old_log_probs = old_log_probs.view(
                    self.n_processes * self.ppo_batch_size, -1)
                advantage = advantage.view(
                    self.n_processes * self.ppo_batch_size, -1)

                value, new_log_probs, _ = self.model(state, hidden, mask)

                # NOTE: despite their names, new_log_probs / old_log_probs hold
                # action probabilities; the .log() calls below convert them.
                # PPO probability ratio r_t = pi_new(a_t|s_t) / pi_old(a_t|s_t)
                ratio = (new_log_probs.gather(1, action).log() -
                         old_log_probs.log()).exp()

                surr1 = ratio * advantage
                surr2 = torch.clamp(ratio, 1.0 - self.clip_param,
                                    1.0 + self.clip_param) * advantage

                # action loss (clipped surrogate policy objective)
                action_loss = -torch.min(surr1, surr2).mean()
                # value loss (squared error against the computed returns)
                value_loss = (return_ - value).pow(2).mean()
                # entropy bonus H = -sum_a p(a) log p(a); subtracting
                # entropy_weight * H from the loss encourages exploration
                entropy = -(new_log_probs * new_log_probs.log()).sum(1).mean()
                # loss
                loss = 0.5 * value_loss + action_loss - self.entropy_weight * entropy

                # Update
                self.optimizer.zero_grad()
                loss.backward()
                clip_grad_norm_(self.model.parameters(), self.grad_norm)
                self.optimizer.step()
                all_loss.append(loss.item())

        # Clear rollouts after update (RolloutStorage.reset())
        self.rollouts.reset()
        return sum(all_loss) / len(all_loss)

    def _step(self, obs, hiddens, masks):

        with torch.no_grad():
            values, action_probs, hiddens = self.model(obs, hiddens, masks)

        actions = Categorical(action_probs.detach()).sample()

        # Sample actions from the output distributions
        obs, rewards, dones, infos = self.envs.step(actions.cpu().numpy())
        obs = torch.from_numpy(obs)
        rewards = torch.from_numpy(rewards).unsqueeze(1)
        masks = torch.from_numpy(1 - dones.astype('float32')).unsqueeze(1)
        actions = actions.unsqueeze(1)

        self.rollouts.insert(
            obs,  #next
            hiddens,  #next
            actions,  #now
            action_probs,  #now
            values,  #now
            rewards,  #now
            masks)  #next

        # Store transitions (obs, hiddens, actions, values, rewards, masks)

    def train(self):

        print('Start training')
        running_reward = deque(maxlen=10)
        episode_rewards = torch.zeros(self.n_processes, 1).to(self.device)
        total_steps = 0
        best_reward = 0

        # Store first observation
        obs = torch.from_numpy(self.envs.reset()).to(self.device)
        self.rollouts.obs[0].copy_(obs)
        self.rollouts.to(self.device)

        while True:
            # Update once every n-steps
            for step in range(self.update_freq):
                self._step(self.rollouts.obs[step],
                           self.rollouts.hiddens[step],
                           self.rollouts.masks[step])

                # Calculate episode rewards
                episode_rewards += self.rollouts.rewards[step]
                for r, m in zip(episode_rewards,
                                self.rollouts.masks[step + 1]):
                    if m == 0:
                        running_reward.append(r.item())
                episode_rewards *= self.rollouts.masks[step + 1]

            loss = self._update()
            total_steps += self.update_freq * self.n_processes

            # Log & save model
            if len(running_reward) == 0:
                avg_reward = 0
            else:
                avg_reward = sum(running_reward) / len(running_reward)

            self.plot['reward'].append(avg_reward)

            print('Steps: %d/%d | Avg reward: %f | Loss: %f' %
                  (total_steps, self.max_steps, avg_reward, loss),
                  end='\r')

            if total_steps % self.display_freq == 0:
                print('Steps: %d/%d | Avg reward: %f' %
                      (total_steps, self.max_steps, avg_reward))
                if total_steps % self.save_freq == 0:

                    with open(os.path.join(self.model_dir, 'plot.json'),
                              'w') as f:
                        json.dump(self.plot, f)
                    #if int(avg_reward) > best_reward:
                    best_reward = int(avg_reward)
                    self.save_model(
                        os.path.join(
                            self.model_dir,
                            's{}_r{}_model.pt'.format(total_steps,
                                                      best_reward)))

            if total_steps >= self.max_steps:
                break

    def save_model(self, path):
        torch.save(
            {
                'model': self.model,
                'optimizer': self.optimizer.state_dict()
            }, path)

    def load_model(self, path):
        print('Load model from', path)
        self.model = torch.load(path)['model']

    def init_game_setting(self):
        if self.recurrent:
            self.hidden = torch.zeros(1, self.hidden_size).to(self.device)

    def make_action(self, observation, test=False):

        obs = torch.FloatTensor([observation]).to(self.device)
        #self.rollouts.obs[0].copy_(obs)
        #self.rollouts.to(self.device)
        with torch.no_grad():
            action_probs, _ = self.model.get_action_probs(obs, None, None)
        action = action_probs.max(1)[1].item()
        return action
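
For reference, the clipped surrogate objective that _update above optimizes, written as a standalone function over a flattened batch (ppo_loss and its argument layout are illustrative assumptions; the 0.5 value coefficient and the entropy bonus mirror the code above):

import torch

def ppo_loss(new_action_probs, actions, old_probs, advantages, returns, values,
             clip_param=0.2, entropy_weight=0.05):
    """PPO clipped surrogate loss for a discrete policy.

    new_action_probs: [batch, n_actions] softmax of the current policy
    actions:          [batch, 1] indices of the actions that were taken
    old_probs:        [batch, 1] probabilities of those actions under the old policy
    advantages, returns, values: [batch, 1]
    """
    new_probs = new_action_probs.gather(1, actions)
    ratio = (new_probs.log() - old_probs.log()).exp()     # pi_new / pi_old
    surr1 = ratio * advantages
    surr2 = torch.clamp(ratio, 1.0 - clip_param, 1.0 + clip_param) * advantages
    action_loss = -torch.min(surr1, surr2).mean()         # clipped surrogate
    value_loss = (returns - values).pow(2).mean()         # critic regression
    entropy = -(new_action_probs * new_action_probs.log()).sum(1).mean()
    return 0.5 * value_loss + action_loss - entropy_weight * entropy

Clipping the ratio to [1 - clip_param, 1 + clip_param] bounds how far a single update can move the policy away from the one that collected the rollout.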