Example 1
class PPO:
    def __init__(self,
                 env_id,
                 render=False,
                 num_process=4,
                 min_batch_size=2048,
                 lr_p=3e-4,
                 lr_v=3e-4,
                 gamma=0.99,
                 tau=0.95,
                 clip_epsilon=0.2,
                 ppo_epochs=10,
                 ppo_mini_batch_size=64,
                 seed=1,
                 model_path=None):
        self.env_id = env_id
        self.gamma = gamma
        self.tau = tau
        self.ppo_epochs = ppo_epochs
        self.ppo_mini_batch_size = ppo_mini_batch_size
        self.clip_epsilon = clip_epsilon
        self.render = render
        self.num_process = num_process
        self.lr_p = lr_p
        self.lr_v = lr_v
        self.min_batch_size = min_batch_size
        self.model_path = model_path
        self.seed = seed

        self._init_model()

    def _init_model(self):
        """init model from parameters"""
        self.env, env_continuous, self.num_states, self.num_actions = get_env_info(
            self.env_id)

        # seeding
        torch.manual_seed(self.seed)
        self.env.seed(self.seed)

        self.policy_net = Policy(self.num_states, self.num_actions).to(device)

        self.value_net = Value(self.num_states).to(device)
        self.running_state = ZFilter((self.num_states, ), clip=5)

        if self.model_path:
            print("Loading Saved Model {}_ppo.p from {}/{}_ppo.p".format(
                self.env_id, self.model_path, self.env_id))
            data = pickle.load(
                open('{}/{}_ppo.p'.format(self.model_path, self.env_id), "rb"))
            self.policy_net, self.value_net, self.running_state = data.policy_net, data.value_net, data.running_state

        self.collector = MemoryCollector(self.env,
                                         self.policy_net,
                                         render=self.render,
                                         running_state=self.running_state,
                                         num_process=self.num_process)

        self.optimizer_p = optim.Adam(self.policy_net.parameters(),
                                      lr=self.lr_p)
        self.optimizer_v = optim.Adam(self.value_net.parameters(),
                                      lr=self.lr_v)

    def choose_action(self, state):
        """select action"""
        state = FLOAT(state).unsqueeze(0).to(device)
        with torch.no_grad():
            action, log_prob = self.policy_net.get_action_log_prob(state)

        action = action.cpu().numpy()[0]
        return action

    def eval(self, i_iter, render=False):
        state = self.env.reset()
        test_reward = 0
        while True:
            if render:
                self.env.render()
            state = self.running_state(state)
            action = self.choose_action(state)
            state, reward, done, _ = self.env.step(action)

            test_reward += reward
            if done:
                break
        print(f"Iter: {i_iter}, test Reward: {test_reward}")
        self.env.close()

    def learn(self, writer, i_iter):
        """learn model"""
        memory, log = self.collector.collect_samples(self.min_batch_size)

        print(
            f"Iter: {i_iter}, num steps: {log['num_steps']}, total reward: {log['total_reward']: .4f}, "
            f"min reward: {log['min_episode_reward']: .4f}, max reward: {log['max_episode_reward']: .4f}, "
            f"average reward: {log['avg_reward']: .4f}, sample time: {log['sample_time']: .4f}"
        )

        # record reward information
        writer.add_scalar("rewards/total_reward", log['total_reward'], i_iter)
        writer.add_scalar("rewards/average_reward", log['avg_reward'], i_iter)
        writer.add_scalar("rewards/min_reward", log['min_episode_reward'],
                          i_iter)
        writer.add_scalar("rewards/max_reward", log['max_episode_reward'],
                          i_iter)
        writer.add_scalar("rewards/num_steps", log['num_steps'], i_iter)

        batch, permuted_batch = memory.sample()  # sample all items in memory
        #  ('state', 'action', 'reward', 'next_state', 'mask', 'log_prob')
        batch_state = FLOAT(batch.state).to(device)
        batch_action = FLOAT(batch.action).to(device)
        batch_reward = FLOAT(batch.reward).to(device)
        batch_next_state = FLOAT(batch.next_state).to(device)
        batch_mask = FLOAT(batch.mask).to(device)
        batch_log_prob = FLOAT(batch.log_prob).to(device)

        with torch.no_grad():
            batch_value = self.value_net(batch_state)

        batch_advantage, batch_return = estimate_advantages(
            batch_reward, batch_mask, batch_value, self.gamma, self.tau)

        alg_step_stats = {}
        if self.ppo_mini_batch_size:
            batch_size = batch_state.shape[0]
            mini_batch_num = int(
                math.ceil(batch_size / self.ppo_mini_batch_size))

            # update with mini-batch
            for _ in range(self.ppo_epochs):
                index = torch.randperm(batch_size)

                for i in range(mini_batch_num):
                    ind = index[slice(
                        i * self.ppo_mini_batch_size,
                        min(batch_size, (i + 1) * self.ppo_mini_batch_size))]
                    state, action = batch_state[ind], batch_action[ind]
                    returns, advantages = batch_return[ind], batch_advantage[ind]
                    old_log_pis = batch_log_prob[ind]

                    alg_step_stats = ppo_step(self.policy_net, self.value_net,
                                              self.optimizer_p,
                                              self.optimizer_v, 1, state,
                                              action, returns, advantages,
                                              old_log_pis, self.clip_epsilon,
                                              1e-3)
        else:
            for _ in range(self.ppo_epochs):
                alg_step_stats = ppo_step(self.policy_net, self.value_net,
                                          self.optimizer_p, self.optimizer_v,
                                          1, batch_state, batch_action,
                                          batch_return, batch_advantage,
                                          batch_log_prob, self.clip_epsilon,
                                          1e-3)

        return alg_step_stats

    def save(self, save_path):
        """save model"""
        check_path(save_path)
        pickle.dump((self.policy_net, self.value_net, self.running_state),
                    open('{}/{}_ppo.p'.format(save_path, self.env_id), 'wb'))
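
A minimal driver sketch for the PPO class above. It is not taken from the original source: the environment id, iteration count, eval/save intervals, and TensorBoard log directory are assumptions made for illustration.

# Hypothetical usage sketch (assumed names: "Hopper-v2", "runs/ppo_hopper", "trained_models")
from torch.utils.tensorboard import SummaryWriter

ppo = PPO(env_id="Hopper-v2", num_process=4, min_batch_size=2048, seed=1)
writer = SummaryWriter(log_dir="runs/ppo_hopper")

for i_iter in range(1, 501):
    ppo.learn(writer, i_iter)       # collect min_batch_size steps and run PPO updates
    if i_iter % 50 == 0:
        ppo.eval(i_iter)            # roll out one evaluation episode
        ppo.save("trained_models")  # pickle the policy, value net and state filter

writer.close()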
Example 2
class Learner:
    def __init__(self, learning_rate=0.01, FILE="Model/goodPolicy.pth"):
        self.FILE = FILE
        self.device = torch.device(
            'cuda' if torch.cuda.is_available() else 'cpu')
        self.policy = Policy().to(self.device)
        self.policy.load_state_dict(torch.load(self.FILE))
        self.policy.eval()
        self.criterion = nn.CrossEntropyLoss()
        self.learning_rate = learning_rate
        self.optimizer = torch.optim.Adam(self.policy.parameters(),
                                          lr=self.learning_rate)

    def simulate(self, episode: int, policyPercent: float, show=False):
        """
        Simulate the CartPole process.
        :param episode: number of episodes to simulate
        :param policyPercent: fraction of steps on which the learned policy (instead of a random action) is used
        :param show: render the environment while simulating
        :return: list of ([trajectory of actions], [trajectory of observations], totalReward)
        """
        env = gym.make('CartPole-v0')
        result = []
        for i_episode in range(episode):
            actions = []
            observations = []
            totalReward = 500  # if not failed
            observation = env.reset()
            for t in range(500):
                if show: env.render()
                # convert the numpy observation to a float32 tensor on self.device
                observationTensor = torch.as_tensor(observation,
                                                    dtype=torch.float32).to(self.device)
                observations.append(observation.tolist())
                if random.random() <= policyPercent:  # mix the learned policy with random actions
                    with torch.no_grad():
                        action = torch.max(self.policy(observationTensor),
                                           0)[1].item()  # 0 or 1
                else:
                    action = random.randint(0, 1)
                actions.append(action)
                observation, reward, done, info = env.step(action)
                if done:
                    totalReward = t + 1
                    # print(f"Episode finished after {t + 1} timesteps")
                    break
            result.append((actions, observations, totalReward))
        env.close()
        return result

    def trainPolicy(self, episodes, policyPercent=0.8):
        """ Train the policy """
        # First play several times to determine the average reward.
        trajectoriesForAvgRwd = self.simulate(20, 1)
        averageReward = sum(i[2] for i in trajectoriesForAvgRwd) / len(trajectoriesForAvgRwd)
        print(f"Average reward: {averageReward}")

        trajectoriesForTrain = self.simulate(episodes, policyPercent)
        for trainTrajectory in trajectoriesForTrain:
            if trainTrajectory[2] > averageReward:
                # forward
                predictAction = self.policy(
                    torch.tensor(trainTrajectory[1]).to(self.device))
                loss = self.criterion(
                    predictAction,
                    torch.tensor(trainTrajectory[0]).to(self.device))

                # backwards
                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()
        torch.save(self.policy.state_dict(), self.FILE)
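
A hedged usage sketch for the Learner class above; the number of training rounds, the episode counts, and the final rendered run are illustrative assumptions, not part of the original.

# Hypothetical usage sketch: alternate training rounds, then watch the greedy policy
learner = Learner(learning_rate=0.01, FILE="Model/goodPolicy.pth")

for _ in range(10):                                        # assumed number of training rounds
    learner.trainPolicy(episodes=50, policyPercent=0.8)

learner.simulate(episode=3, policyPercent=1.0, show=True)  # render a few fully-greedy episodes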
Example 3
class Agent():
    def __init__(self, params):
        self.params = params
        self.__state_dim = params['state_dim']
        self.__action_dim = params['action_dim']
        self.__buffer_size = params['buffer_size']
        self.__batch_size = params['batch_size']
        self.__gamma = params['gamma']
        self.__tau = params['tau']
        self.__lr = params['lr']
        self.__update_every = params['update_every']
        eps = params['eps']
        eps_decay = params['eps_decay']
        min_eps = params['min_eps']
        seed = params['seed']
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

        # Q-Network
        critic_params = dict()
        critic_params['seed'] = seed
        critic_params['arch_params'] = params['arch_params_critic']
        self.critic_local = QNetwork(critic_params).to(device)
        self.critic_target = QNetwork(critic_params).to(device)
        self.optimizer_critic = optim.Adam(self.critic_local.parameters(),
                                           lr=self.__lr)

        #Policy
        actor_params = dict()
        actor_params['seed'] = seed
        actor_params['arch_params'] = params['arch_params_actor']
        actor_params['noise_type'] = params['noise_type']
        actor_params['eps'] = eps
        actor_params['eps_decay'] = eps_decay
        actor_params['min_eps'] = min_eps
        self.actor_local = Policy(actor_params).to(device)
        self.actor_target = Policy(actor_params).to(device)
        self.optimizer_actor = optim.Adam(self.actor_local.parameters(),
                                          lr=self.__lr)

        self.__memory = ReplayBuffer(self.__buffer_size, self.__batch_size)
        self.__t_step = 0

    def memorize_experience(self, state, action, reward, next_state, done):
        self.__memory.add(state, action, reward, next_state, done)
        self.__t_step = (self.__t_step + 1)

    def choose_action(self, state):
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        state = torch.from_numpy(state.astype(np.float32)).to(device)
        action, action_perturbed = self.actor_local(state)
        return action, action_perturbed

    def learn_from_past_experiences(self):
        if self.__t_step % self.__update_every == 0:
            if len(self.__memory) > self.__batch_size:
                experiences = self.__memory.sample()
                self.update_Qnet_and_policy(experiences)

    def update_Qnet_and_policy(self, experiences):
        states, actions, rewards, next_states, dones = experiences
        next_actions, next_actions_perturbed = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, next_actions)
        Q_targets = rewards + self.__gamma * Q_targets_next * (1 - dones)  # bootstrapped term is 0 when done
        Q_expected = self.critic_local(states, actions)
        loss_func = nn.MSELoss()
        loss_critic = loss_func(Q_expected, Q_targets.detach())

        self.optimizer_critic.zero_grad()
        loss_critic.backward()
        self.optimizer_critic.step()

        predicted_actions, predicted_actions_perturbed = self.actor_local(
            states)  # new predicted actions, not the ones stored in buffer

        if self.params['noise_type'] == 'parameter':
            # adapt the parameter-space noise scale: if the mean squared distance between the
            # clean and perturbed actions is too big (>= 0.3), shrink eps; otherwise grow it
            if (predicted_actions -
                    predicted_actions_perturbed).pow(2).mean() >= 0.3:
                self.actor_local.eps /= 1.01
                self.actor_target.eps /= 1.01
            else:
                self.actor_local.eps *= 1.01
                self.actor_target.eps *= 1.01

        loss_actor = -self.critic_local(states, predicted_actions).mean()

        self.optimizer_actor.zero_grad()
        loss_actor.backward()
        self.optimizer_actor.step()

        self.soft_update(self.critic_local, self.critic_target)
        self.soft_update(self.actor_local, self.actor_target)

    def update_eps(self):
        self.actor_local.eps = max(
            self.actor_local.eps * self.actor_local.eps_decay,
            self.actor_local.min_eps)
        self.actor_target.eps = max(
            self.actor_target.eps * self.actor_target.eps_decay,
            self.actor_target.min_eps)

    def soft_update(self, local_model, target_model):
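        # Polyak averaging: theta_target <- tau * theta_local + (1 - tau) * theta_target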
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(self.__tau * local_param.data +
                                    (1.0 - self.__tau) * target_param.data)

    def save_weights(self, save_to):
        actor_params = {
            'actor_params': self.actor_local.policy_params,
            'state_dict': self.actor_local.state_dict()
        }
        critic_params = {
            'critic_params': self.critic_local.qnet_params,
            'state_dict': self.critic_local.state_dict()
        }

        file = dict()
        file['critic_params'] = critic_params
        file['actor_params'] = actor_params
        torch.save(file, save_to)

    def load_weights(self, load_from):
        checkpoint = torch.load(load_from)
        qnet_params = checkpoint['critic_params']
        policy_params = checkpoint['actor_params']

        self.actor_local = Policy(policy_params['actor_params'])
        self.actor_local.load_state_dict(
            checkpoint['actor_params']['state_dict'])

        self.critic_local = QNetwork(qnet_params['critic_params'])
        self.critic_local.load_state_dict(
            checkpoint['critic_params']['state_dict'])
        return self
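
A sketch of the interaction loop this Agent appears to expect, assuming a Gym-style continuous-control environment and a params dict shaped like the one consumed in __init__; the environment name and the episode/step counts are illustrative assumptions.

# Hypothetical training loop (environment name and loop bounds are assumptions)
env = gym.make("Pendulum-v1")
agent = Agent(params)                                 # params assumed to be defined as in __init__

for episode in range(200):
    state = env.reset()
    for t in range(1000):
        action, action_perturbed = agent.choose_action(state)
        noisy_action = action_perturbed.detach().cpu().numpy()
        next_state, reward, done, _ = env.step(noisy_action)
        agent.memorize_experience(state, noisy_action, reward, next_state, done)
        agent.learn_from_past_experiences()           # updates every `update_every` steps
        state = next_state
        if done:
            break
    agent.update_eps()                                # decay the exploration noise once per episode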
Example 4
def seed_everything(seed_value):
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    os.environ['PYTHONHASHSEED'] = str(seed_value)

    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed_value)
        torch.cuda.manual_seed_all(seed_value)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False  # benchmarking must stay off for deterministic behaviour


seed_everything(seed)

env = LunarLander()
policy = Policy(env.observation_dim, env.action_dim)
optimizer = optim.Adam(policy.parameters(), lr=lr)
eps = np.finfo(np.float32).eps.item()


def select_action(state):
    state = torch.from_numpy(state).float().unsqueeze(0)
    probs = policy(state)
    m = Categorical(probs)
    action = m.sample()
    policy.saved_log_probs.append(m.log_prob(action))
    return action.item()


def finish_episode():
    R = 0
    policy_loss = []