Example #1
class PPOAgent:
    def __init__(self, config):

        self.config = config
        self.memory = Memory()
        self.device = 'cpu'
        self.env = gym.make(config['env'])

        # boolean for discrete action space:
        self.discrete_action_bool = isinstance(self.env.action_space, Discrete)
        self.gamma = config['gamma']
        self.lambd = config['lambda']
        self.c1 = config['c1']
        self.c2 = config['c2']
        self.norm_reward = config["reward_norm"]
        self.loss_name = config['loss_name']
        self.beta_kl = config['beta_KL']

        self.batch_size = config["batch_size"]
        if not (self.discrete_action_bool):
            print("Low : ", self.env.action_space.low)
            print("High : ", self.env.action_space.high)

        # set random seeds
        np.random.seed(config['seed'])
        torch.manual_seed(config['seed'])
        self.env.seed(config['seed'])

        # Critic
        self.value_network = CustomValueNetwork(
            self.env.observation_space.shape[0], 64, 1).to(self.device)
        self.value_network_optimizer: optim.Optimizer = optim.Adam(
            self.value_network.parameters(), lr=config['lr'])
        # Actor
        if self.discrete_action_bool:
            self.actor_network = CustomDiscreteActorNetwork(
                self.env.observation_space.shape[0], 64,
                self.env.action_space.n).to(self.device)
        else:
            self.actor_network = ContinuousActorNetwork(
                self.env.observation_space.shape[0], 64,
                self.env.action_space.shape[0], self.config["std"],
                self.env).to(self.device)

        self.actor_network_optimizer: optim.Optimizer = optim.Adam(
            self.actor_network.parameters(), lr=config['lr'])

        # keep successive policy estimates in memory (used for probability ratios)
        self.probs_list = []  # probabilities of the actions taken (discrete case)
        self.mean_list = []  # mean estimates (continuous case)

    def _returns_advantages(self, values, next_value):
        """Compute the cumulative discounted returns and the GAE advantages.

        Rewards and done flags are read from ``self.memory``.

        Parameters
        ----------
        values : array
            An array of shape (batch_size,) containing the values given by the value network
        next_value : float
            The value of the next state given by the value network

        Returns
        -------
        returns : array
            The cumulative discounted rewards
        advantages : array
            The GAE advantages
        """

        rewards = np.array(self.memory.rewards)
        if self.norm_reward:
            rewards = (rewards - rewards.mean()) / (rewards.std() + 1e-5)

        returns, advantages = [], []
        last = next_value
        gae = 0

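        # GAE recursion implemented below:
        #   delta_t = r_t + gamma * V(s_{t+1}) * (1 - done_t) - V(s_t)
        #   A_t     = delta_t + gamma * lambda * (1 - done_t) * A_{t+1}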
        for i in reversed(range(len(self.memory))):
            # build the returns: discounted cumulative reward, bootstrapped
            # with next_value at the end of the rollout
            last = rewards[i] + self.gamma * last * (1 - self.memory.dones[i])
            returns.insert(0, last)

            # build the advantages
            delta = rewards[i] + self.gamma * next_value * (
                1 - self.memory.dones[i]) - values[i]
            gae = delta + self.gamma * self.lambd * (
                1 - self.memory.dones[i]) * gae
            advantages.insert(0, gae)
            next_value = values[i]

        returns = torch.FloatTensor(returns).to(self.device)
        advantages = torch.FloatTensor(advantages).to(self.device)

        return returns, advantages

    def training(self, epochs, optimize_every, max_episodes, max_steps):
        """Train the agent, updating the networks every `optimize_every` steps.

        Parameters
        ----------
        epochs : int
            Number of optimization epochs per policy update
        optimize_every : int
            Number of environment steps between two policy updates
        max_episodes : int
            Maximum number of training episodes
        max_steps : int
            Maximum number of steps per episode
        """
        t1 = datetime.datetime.now()

        episode_count = 0
        timestep_count = 0
        rewards_test = []
        solved = False

        loss_evol = {'loss': [], 'dry_loss': [], 'entropy': []}
        if self.loss_name not in [
                "A2C_loss", "adaptative_KL_loss", "clipped_loss"
        ]:
            print('Unknown loss function, falling back to the clipped loss')
        else:
            print('Loss : ', self.loss_name)

        for ep in range(max_episodes):
            if not solved:
                episode_count += 1
                obs = self.env.reset()

                for i in range(max_steps):
                    timestep_count += 1
                    self.memory.observations.append(obs)
                    obs_t = torch.from_numpy(obs).float().to(self.device)
                    action = self.actor_network.select_action(obs_t.view(
                        1, -1))

                    if self.discrete_action_bool:
                        action = int(action)
                        self.memory.actions.append(action)
                        obs, reward, done, _ = self.env.step(action)

                    else:
                        self.memory.actions.append(action)
                        obs, reward, done, _ = self.env.step(action.view(-1))

                    # store termination status and reward
                    self.memory.dones.append(done)
                    self.memory.rewards.append(reward)

                    if (timestep_count % optimize_every) == 0:
                        for epoch in range(epochs):
                            loss_val, dry_loss_val, entrop_val = self.optimize_model(
                                obs)
                            if epoch == epochs - 1:
                                loss_evol["loss"].append(loss_val)
                                loss_evol["dry_loss"].append(dry_loss_val)
                                loss_evol["entropy"].append(entrop_val)

                        self.memory.clear_memory()

                    if done:
                        break

            # Evaluate every 50 episodes (plus the first and the last one)
            if ep == 1 or (ep > 0 and ep % 50 == 0) or (ep
                                                        == max_episodes - 1):
                rewards_test.append(
                    np.array([self.evaluate() for _ in range(50)]))
                print(
                    f'Episode {ep}/{max_episodes}: Mean rewards: {round(rewards_test[-1].mean(), 2)}, Std: {round(rewards_test[-1].std(), 2)}'
                )
                # stop once the evaluation mean hits the maximum score of 500
                if round(rewards_test[-1].mean(), 2) == 500.:
                    solved = True

        self.env.close()
        t2 = datetime.datetime.now()

        # save rewards
        r = pd.DataFrame(
            (itertools.chain(*(itertools.product([i], rewards_test[i])
                               for i in range(len(rewards_test))))),
            columns=['Episode', 'Reward'])
        r["Episode"] = r["Episode"] * 50
        r["loss_name"] = self.loss_name

        # Total time elapsed
        time = t2 - t1
        if episode_count == max_episodes:
            print('The agent did not reach the perfect score')
        else:
            print(
                f'The agent reached the perfect score over a total of {episode_count} episodes'
            )
        print('Total time elapsed during training : ', time)
        r["time"] = time
        loss_evol = pd.DataFrame(loss_evol).astype(float)
        loss_evol["loss_name"] = self.loss_name
        loss_evol["Update"] = range(len(loss_evol))

        return r, loss_evol

    def compute_proba_ratio(self, prob, actions):
        if self.discrete_action_bool:
            # probabilities from the previous policy (fall back to the
            # current ones on the very first update)
            old_prob = self.probs_list[-1] if len(self.probs_list) == 1 \
                else self.probs_list[-2]

        else:
            old_prob_mean = self.mean_list[-1] if len(self.mean_list) == 1 \
                else self.mean_list[-2]

            diag = torch.tensor(self.config['std'] *
                                np.ones(old_prob_mean.size()[1])).float()
            dist = Normal(old_prob_mean, scale=diag)
            old_prob = dist.log_prob(actions).detach()

            # build new ones
            dist = Normal(prob, scale=diag)
            prob = dist.log_prob(actions)

        if self.discrete_action_bool:
            # compute the ratio directly using gather function
            num = prob.gather(1, actions.long().view(-1, 1))
            denom = old_prob.detach().gather(1, actions.long().view(-1, 1))
            ratio_vect = num.view(-1) / denom.view(-1)

        else:
            if np.isnan(prob.cpu().detach().numpy()).any():
                print("NaN encountered in new log-probabilities")

            if np.isnan(old_prob.cpu().detach().numpy()).any():
                print("NaN encountered in old log-probabilities")
            # probability ratio from log-probabilities:
            # r = exp(log pi_new - log pi_old)
            ratio_vect = torch.exp(prob - old_prob)

        if np.isnan(ratio_vect.cpu().detach().numpy()).any():
            print("NaN encountered in proba ratio")

        return ratio_vect, old_prob

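    # Clipped surrogate objective (standard PPO):
    #   L^CLIP = -sum_t min( r_t * A_t, clip(r_t, 1 - eps, 1 + eps) * A_t )
    # where r_t is the new/old probability ratio from compute_proba_ratio.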
    def clipped_loss(self, prob, actions, advantages):

        ratio_vect = self.compute_proba_ratio(prob, actions)[0]
        if len(actions.size()) > 1 and not (self.discrete_action_bool):
            ratio_vect = torch.prod(ratio_vect, dim=1)
        loss1 = ratio_vect * advantages
        loss2 = torch.clamp(ratio_vect, 1 - self.config['eps_clipping'],
                            1 + self.config['eps_clipping']) * advantages
        loss = -torch.sum(torch.min(loss1, loss2))
        return loss

    def adaptative_KL_loss(self, prob, actions, advantages, observations):
        if self.discrete_action_bool:
            ratio_vect, old_prob = self.compute_proba_ratio(prob, actions)
            kl = torch.zeros(1)
            for i in range(prob.size()[0]):
                kl += (old_prob[i] * (old_prob[i].log() - prob[i].log())).sum()

        else:
            ratio_vect = self.compute_proba_ratio(prob, actions)[0]
            if len(actions.size()) > 1 and not (self.discrete_action_bool):
                ratio_vect = torch.prod(ratio_vect, dim=1)
            if len(self.mean_list) == 1:
                kl = torch.tensor(0.)
            else:
                mu = prob
                mu_old = self.mean_list[-2].detach()
                a = (mu - mu_old) / torch.tensor(
                    self.config["std"] * np.ones(actions.size())).float()
                b = (mu - mu_old)
                if len(actions.size()) > 1:
                    a = torch.prod(a, axis=1)
                    b = torch.prod(b, axis=1)
                kl = torch.dot(a, b) / 2

        loss = -torch.sum((ratio_vect * advantages)) + self.beta_kl * kl
        if np.isnan(torch.mean(kl).cpu().detach().numpy()):
            print("Nan encountered in average KL divergence")
        if kl < self.config["d_targ"] / 1.5:
            self.beta_kl = self.beta_kl / 2
        elif kl > self.config["d_targ"] * 1.5:
            self.beta_kl = self.beta_kl * 2
        return loss

    def A2C_loss(self, prob, actions, advantages):
        loss = 0.
        if self.discrete_action_bool:
            for i in range(len(actions)):
                loss -= torch.log(prob[i, int(actions[i])] +
                                  1e-6) * advantages[i]
        else:
            diag = torch.tensor(self.config["std"] *
                                np.ones(prob.size()[1])).float()
            dist = Normal(prob, scale=diag)
            prob = dist.log_prob(actions)
            if actions.size()[1] > 1:
                # joint log-probability of independent action dimensions
                prob = torch.sum(prob, dim=1)
            # prob already holds log-probabilities; negate so that minimizing
            # the loss maximizes the advantage-weighted log-likelihood
            loss = -torch.dot(prob.view(-1), advantages)

        return loss

    def optimize_model(self, next_obs):

        losses = {"loss": [], "dry_loss": [], "entropy": []}
        idx = torch.arange(len(self.memory))

        observations = torch.tensor(
            np.array(self.memory.observations)).float().to(self.device)
        if np.isnan(observations.cpu().detach().numpy()).any():
            print("nan in observations")

        if self.discrete_action_bool:
            actions = torch.tensor(self.memory.actions).float().to(self.device)
        else:
            actions = torch.squeeze(torch.stack(self.memory.actions),
                                    1).float().to(self.device)

        next_obs = torch.from_numpy(next_obs).float().to(self.device)
        next_value = self.value_network.predict(next_obs)
        # detach: the returns/advantages bookkeeping must stay out of the graph
        values = self.value_network(observations).view(-1).detach().cpu().numpy()
        returns, advantages = self._returns_advantages(values, next_value)
        returns = returns.float().to(self.device)
        advantages = advantages.float().to(self.device)

        for i in range(0, returns.size()[0], self.batch_size):
            batch_observations = observations[i:i + self.batch_size]
            batch_actions = actions[i:i + self.batch_size]
            batch_returns = returns[i:i + self.batch_size]
            batch_advantages = advantages[i:i + self.batch_size]

            # Critic loss
            self.value_network_optimizer.zero_grad()
            net_values: torch.Tensor = self.value_network(batch_observations)
            critic_loss = F.mse_loss(net_values.view(-1), batch_returns)
            critic_loss.backward()
            self.value_network_optimizer.step()

            # Actor & Entropy loss
            if np.isnan(batch_observations.cpu().detach().numpy()).any():
                print("nan in batch observations")

            prob: torch.Tensor = self.actor_network.forward(batch_observations)
            if np.isnan(prob.cpu().detach().numpy()).any():
                print("NaN encountered in actor network output")

            if self.discrete_action_bool:
                self.probs_list.append(prob.detach())

            else:
                self.mean_list.append(prob.detach())

            if self.loss_name == "clipped_loss":
                loss = self.clipped_loss(prob, batch_actions, batch_advantages)

            elif self.loss_name == "adaptative_KL_loss":
                loss = self.adaptative_KL_loss(prob, batch_actions,
                                               batch_advantages,
                                               batch_observations)

            elif self.loss_name == "A2C_loss":
                loss = self.A2C_loss(prob, batch_actions, batch_advantages)

            else:
                loss = self.clipped_loss(prob, batch_actions, batch_advantages)

            dry_loss = loss
            entropy_term = -torch.sum(prob * torch.log(prob + 1e-6))
            # out-of-place update so dry_loss keeps the entropy-free value
            loss = loss - self.c2 * entropy_term
            self.actor_network_optimizer.zero_grad()
            loss.backward()
            self.actor_network_optimizer.step()

            losses["loss"].append(loss.mean().item())
            losses["dry_loss"].append(dry_loss.mean().item())
            losses["entropy"].append(entropy_term.mean().item())

        return np.mean(losses["loss"]), np.mean(losses["dry_loss"]), np.mean(
            losses["entropy"])

    def evaluate(self, render=False):
        env = self.monitor_env if render else self.env
        observation = env.reset()

        observation = torch.from_numpy(observation).float().to(self.device)
        reward_episode = 0
        done = False
        with torch.no_grad():
            while not done:
                policy = self.actor_network(observation)

                if self.discrete_action_bool:
                    action = int(torch.multinomial(policy, 1))
                    observation, reward, done, info = env.step(action)
                else:
                    action = self.actor_network.select_action(observation)
                    observation, reward, done, info = env.step(action.view(-1))
                observation = torch.from_numpy(observation).float().to(
                    self.device)
                reward_episode += reward

        env.close()
        if render:
            show_video("./gym-results")
            print(f'Reward: {reward_episode}')
        return reward_episode
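
A minimal usage sketch for the agent above, assuming the old gym API it relies on (env.seed, 4-tuple step) and a CartPole-style task; every key below is one the class actually reads, but the values are illustrative:

config = {
    'env': 'CartPole-v1',          # any gym env id (discrete here)
    'seed': 42,
    'gamma': 0.99,
    'lambda': 0.95,                # GAE lambda
    'c1': 0.5,                     # read by __init__ but unused in the code shown
    'c2': 0.01,                    # entropy bonus coefficient
    'reward_norm': True,
    'loss_name': 'clipped_loss',   # or 'adaptative_KL_loss' / 'A2C_loss'
    'beta_KL': 1.0,                # initial beta for the adaptive-KL loss
    'batch_size': 64,
    'lr': 3e-4,
    'std': 0.1,                    # fixed std, continuous actions only
    'eps_clipping': 0.2,
    'd_targ': 0.01,                # KL target for the adaptive-KL loss
}

agent = PPOAgent(config)
rewards_df, losses_df = agent.training(
    epochs=4, optimize_every=2048, max_episodes=500, max_steps=500)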
Example #2
def main(env_name):

    # fetch all the parameters
    args = Args(env_name)
    env = args.env
    max_epochs = args.max_epochs
    max_timesteps = args.max_timesteps
    update_timestep = args.update_timestep
    print_interval = args.print_interval

    # initialize the memory
    memory = Memory()

    # create the agent instance
    agent = Agent(input_size=args.input_size,
                  output_size=args.output_size,
                  hidden_size=args.hidden_size,
                  lr=args.lr,
                  beta=args.beta,
                  gamma=args.gamma,
                  update_epoch=args.update_epoch,
                  epsilon=args.epsilon)

    reward_plot = [0]  # average reward per print_interval window, for plotting
    timestep_count = 0  # step counter, cleared every update_timestep steps
    interval_reward = 0  # reward accumulated over print_interval epochs, then cleared
    interval_timestep = 0  # timesteps accumulated over print_interval epochs, then cleared

    file_name = 'RL_Proj_2/{}.txt'.format(args.env_name)

    # training loop
    for epoch in range(1, max_epochs + 1):
        state = env.reset()  # reset the env to get an initial state
        # the agent acts step by step
        for timestep in range(max_timesteps):
            timestep_count += 1

            # sample an action from the old policy and step the environment
            action = agent.old_policy.act(state, memory)
            state, reward, done, _ = env.step(action)
            memory.rewards.append(reward)
            memory.is_done.append(done)

            # check whether the policy should be updated
            if timestep_count % update_timestep == 0:
                agent.update(memory)
                memory.clear_memory()
                timestep_count = 0

            interval_reward += reward
            env.render()
            if done:
                break

        interval_timestep += timestep

        # print statistics every print_interval epochs
        if epoch % print_interval == 0:
            interval_timestep = interval_timestep / print_interval
            interval_reward = interval_reward / print_interval

            reward_plot.append(interval_reward)

            # store the data
            with open(file_name, 'a') as f:
                f.write(
                    str(epoch) + ' ' + str(interval_timestep) + ' ' +
                    str(interval_reward) + '\n')

            print('Epoch {} \t average timestep: {} \t reward: {}'.format(
                epoch, interval_timestep, interval_reward))

            interval_reward = 0
            interval_timestep = 0

    # save the model once training is done
    torch.save(agent.policy.state_dict(),
               'RL_Proj_2/{}.pth'.format(args.env_name))

    # plot the reward curve
    plt.plot(reward_plot)
    plt.xlabel('Epoch (x {})'.format(print_interval))
    plt.ylabel('Reward')
    plt.savefig('RL_Proj_2/{}.png'.format(args.env_name))
    plt.show()
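
The Memory class used above is not shown; here is a minimal sketch consistent with how the loop uses it (the states, actions and log_probs fields are assumptions about what old_policy.act records):

class Memory:
    """Rollout buffer for the PPO update (a sketch, not the original class)."""

    def __init__(self):
        self.states = []     # assumed: appended by old_policy.act
        self.actions = []    # assumed: appended by old_policy.act
        self.log_probs = []  # assumed: appended by old_policy.act
        self.rewards = []    # appended in the training loop above
        self.is_done = []    # appended in the training loop above

    def clear_memory(self):
        del self.states[:]
        del self.actions[:]
        del self.log_probs[:]
        del self.rewards[:]
        del self.is_done[:]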
Example #3
            # reset, then randomize the first three state dimensions
            state = env.reset()
            state[0] = np.random.randint(20, 30)
            state[1] = np.random.randint(20, 30)
            state[2] = np.random.randint(20, 30)
            break

        if num_transitions % 500 == 0:
            plot_hover_result(num_transitions, rewards_plot, sum_rewards_plot,
                              poses, True)

    if force_break:
        break

    print("going to train the model")
    ppo.update(memory)
    memory.clear_memory()

    avg_length += t
    #rewards_plot.append(running_reward)

    # stop training if avg_reward > solved_reward
    '''
    if running_reward > (PRINT_EVERY * SOLVED_REWARD):
        print("########## Solved! ##########")
        torch.save(ppo.policy.state_dict(), './PPO_continuous_solved_{}.pth'.format(args.env_name))
        break
    '''

    # save every 50 episodes
    if i_episode % 50 == 0:
        torch.save(ppo.policy.state_dict(),
Example #4
def worker(name, input_shape, n_actions, global_agent, global_icm,
           optimizer, icm_optimizer, env_id, n_threads, icm=False):
    T_MAX = 20

    local_agent = ActorCritic(input_shape, n_actions)

    if icm:
        local_icm = ICM(input_shape, n_actions)
        algo = 'ICM'
    else:
        intrinsic_reward = T.zeros(1)
        algo = 'A3C'

    memory = Memory()

    env = gym.make(env_id)

    t_steps, max_eps, episode, scores, avg_score = 0, 1000, 0, [], 0

    while episode < max_eps:
        obs = env.reset()
        hx = T.zeros(1, 256)
        score, done, ep_steps = 0, False, 0
        while not done:
            state = T.tensor([obs], dtype=T.float)
            action, value, log_prob, hx = local_agent(state, hx)
            obs_, reward, done, info = env.step(action)
            t_steps += 1
            ep_steps += 1
            score += reward
            reward = 0  # turn off extrinsic rewards
            memory.remember(obs, action, reward, obs_, value, log_prob)
            obs = obs_
            if ep_steps % T_MAX == 0 or done:
                states, actions, rewards, new_states, values, log_probs = \
                        memory.sample_memory()
                if icm:
                    intrinsic_reward, L_I, L_F = \
                            local_icm.calc_loss(states, new_states, actions)

                loss = local_agent.calc_loss(obs, hx, done, rewards, values,
                                             log_probs, intrinsic_reward)

                optimizer.zero_grad()
                hx = hx.detach()
                if icm:
                    icm_optimizer.zero_grad()
                    (L_I + L_F).backward()

                loss.backward()
                T.nn.utils.clip_grad_norm_(local_agent.parameters(), 40)

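                # copy the locally computed gradients into the shared global
                # model, step the shared optimizer, then pull the updated
                # weights back into the local copy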
                for local_param, global_param in zip(
                                        local_agent.parameters(),
                                        global_agent.parameters()):
                    global_param._grad = local_param.grad
                optimizer.step()
                local_agent.load_state_dict(global_agent.state_dict())

                if icm:
                    for local_param, global_param in zip(
                                            local_icm.parameters(),
                                            global_icm.parameters()):
                        global_param._grad = local_param.grad
                    icm_optimizer.step()
                    local_icm.load_state_dict(global_icm.state_dict())
                memory.clear_memory()

        if name == '1':
            scores.append(score)
            avg_score = np.mean(scores[-100:])
            print('{} episode {} thread {} of {} steps {:.2f}M score {:.2f} '
                  'intrinsic_reward {:.2f} avg score (100) {:.1f}'.format(
                      algo, episode, name, n_threads,
                      t_steps/1e6, score,
                      T.sum(intrinsic_reward),
                      avg_score))
        episode += 1
    if name == '1':
        x = list(range(episode))
        fname = algo + '_CartPole_no_rewards.png'
        plot_learning_curve(x, scores, fname)
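
The worker above expects shared global networks and optimizers. A minimal launch sketch, assuming the ActorCritic and ICM classes from this example and a 4-dimensional CartPole observation; the original most likely uses a custom shared optimizer (e.g. a SharedAdam), for which plain Adam stands in here:

import torch.multiprocessing as mp

if __name__ == '__main__':
    env_id, input_shape, n_actions, n_threads = 'CartPole-v1', [4], 2, 4

    global_agent = ActorCritic(input_shape, n_actions)
    global_agent.share_memory()  # workers write grads straight into these params
    optimizer = T.optim.Adam(global_agent.parameters(), lr=1e-4)

    global_icm = ICM(input_shape, n_actions)
    global_icm.share_memory()
    icm_optimizer = T.optim.Adam(global_icm.parameters(), lr=1e-4)

    workers = [mp.Process(target=worker,
                          args=(str(i + 1), input_shape, n_actions,
                                global_agent, global_icm, optimizer,
                                icm_optimizer, env_id, n_threads, True))
               for i in range(n_threads)]
    for w in workers:
        w.start()
    for w in workers:
        w.join()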