Example 1
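A DDPG-style actor-critic agent: local and target networks for both actor and critic, a replay buffer, soft target updates, and adaptive exploration noise.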
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

# QNetwork, Policy and ReplayBuffer are assumed to be defined elsewhere in the project.


class Agent:
    def __init__(self, params):
        self.params = params
        self.__state_dim = params['state_dim']
        self.__action_dim = params['action_dim']
        self.__buffer_size = params['buffer_size']
        self.__batch_size = params['batch_size']
        self.__gamma = params['gamma']
        self.__tau = params['tau']
        self.__lr = params['lr']
        self.__update_every = params['update_every']
        eps = params['eps']
        eps_decay = params['eps_decay']
        min_eps = params['min_eps']
        seed = params['seed']
        self.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")

        # Q-Network
        critic_params = dict()
        critic_params['seed'] = seed
        critic_params['arch_params'] = params['arch_params_critic']
        self.critic_local = QNetwork(critic_params).to(self.device)
        self.critic_target = QNetwork(critic_params).to(self.device)
        self.optimizer_critic = optim.Adam(self.critic_local.parameters(),
                                           lr=self.__lr)

        # Policy
        actor_params = dict()
        actor_params['seed'] = seed
        actor_params['arch_params'] = params['arch_params_actor']
        actor_params['noise_type'] = params['noise_type']
        actor_params['eps'] = eps
        actor_params['eps_decay'] = eps_decay
        actor_params['min_eps'] = min_eps
        self.actor_local = Policy(actor_params).to(self.device)
        self.actor_target = Policy(actor_params).to(self.device)
        self.optimizer_actor = optim.Adam(self.actor_local.parameters(),
                                          lr=self.__lr)

        self.__memory = ReplayBuffer(self.__buffer_size, self.__batch_size)
        self.__t_step = 0

    def memorize_experience(self, state, action, reward, next_state, done):
        self.__memory.add(state, action, reward, next_state, done)
        self.__t_step += 1

    def choose_action(self, state):
        # np.float was removed from NumPy; use float32 to match the model weights
        state = torch.from_numpy(state.astype(np.float32)).to(self.device)
        action, action_perturbed = self.actor_local(state)
        return action, action_perturbed

    def learn_from_past_experiences(self):
        if self.__t_step % self.__update_every == 0:
            if len(self.__memory) > self.__batch_size:
                experiences = self.__memory.sample()
                self.update_Qnet_and_policy(experiences)

    def update_Qnet_and_policy(self, experiences):
        states, actions, rewards, next_states, dones = experiences
        # Critic update: TD target computed from the target networks
        next_actions, next_actions_perturbed = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, next_actions)
        # if done == True the bootstrap term vanishes
        Q_targets = rewards + self.__gamma * Q_targets_next * (1 - dones)
        Q_expected = self.critic_local(states, actions)
        loss_func = nn.MSELoss()
        loss_critic = loss_func(Q_expected, Q_targets.detach())

        self.optimizer_critic.zero_grad()
        loss_critic.backward()
        self.optimizer_critic.step()

        predicted_actions, predicted_actions_perturbed = self.actor_local(
            states)  # new predicted actions, not the ones stored in buffer

        if self.params['noise_type'] == 'parameter':
            # if the mean squared distance between predicted_actions and
            # predicted_actions_perturbed is too big (>= 0.3), shrink the
            # parameter noise; otherwise grow it
            if (predicted_actions -
                    predicted_actions_perturbed).pow(2).mean() >= 0.3:
                self.actor_local.eps /= 1.01
                self.actor_target.eps /= 1.01
            else:
                self.actor_local.eps *= 1.01
                self.actor_target.eps *= 1.01

        loss_actor = -self.critic_local(states, predicted_actions).mean()

        self.optimizer_actor.zero_grad()
        loss_actor.backward()
        self.optimizer_actor.step()

        self.soft_update(self.critic_local, self.critic_target)
        self.soft_update(self.actor_local, self.actor_target)

    def update_eps(self):
        self.actor_local.eps = max(
            self.actor_local.eps * self.actor_local.eps_decay,
            self.actor_local.min_eps)
        self.actor_target.eps = max(
            self.actor_target.eps * self.actor_target.eps_decay,
            self.actor_target.min_eps)

    def soft_update(self, local_model, target_model):
        # Polyak averaging: theta_target <- tau * theta_local + (1 - tau) * theta_target
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(self.__tau * local_param.data +
                                    (1.0 - self.__tau) * target_param.data)

    def save_weights(self, save_to):
        actor_params = {
            'actor_params': self.actor_local.policy_params,
            'state_dict': self.actor_local.state_dict()
        }
        critic_params = {
            'critic_params': self.critic_local.qnet_params,
            'state_dict': self.critic_local.state_dict()
        }

        checkpoint = dict()  # avoid shadowing the builtin 'file' / leaking a handle
        checkpoint['critic_params'] = critic_params
        checkpoint['actor_params'] = actor_params
        torch.save(checkpoint, save_to)  # torch.save accepts a path directly

    def load_weights(self, load_from):
        checkpoint = torch.load(load_from, map_location=self.device)
        qnet_params = checkpoint['critic_params']
        policy_params = checkpoint['actor_params']

        self.actor_local = Policy(policy_params['actor_params']).to(self.device)
        self.actor_local.load_state_dict(
            checkpoint['actor_params']['state_dict'])

        self.critic_local = QNetwork(qnet_params['critic_params']).to(self.device)
        self.critic_local.load_state_dict(
            checkpoint['critic_params']['state_dict'])
        return self
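A minimal sketch of how this Agent might be driven, assuming a gym-style environment whose observations are NumPy float arrays and a params dict with the keys read in __init__; env, params, num_episodes and max_t are illustrative names, not part of the original example.

# Hypothetical training loop for the Agent above (env and params are assumptions).
agent = Agent(params)
for episode in range(num_episodes):
    state = env.reset()
    for t in range(max_t):
        # act with exploration noise, then step the environment
        action, action_perturbed = agent.choose_action(state)
        next_state, reward, done, info = env.step(
            action_perturbed.detach().cpu().numpy())
        agent.memorize_experience(state, action_perturbed, reward, next_state, done)
        agent.learn_from_past_experiences()
        state = next_state
        if done:
            break
    agent.update_eps()  # decay the exploration noise after each episode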
Example 2
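A CartPole learner that bootstraps from a saved policy: it simulates episodes with a mix of policy and random actions, then trains on the trajectories that beat the average reward.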
import random

import gym
import torch
import torch.nn as nn

# Policy is assumed to be defined elsewhere in the project.


class Learner:
    def __init__(self, learning_rate=0.01, FILE="Model/goodPolicy.pth"):
        self.FILE = FILE
        self.device = torch.device(
            'cuda' if torch.cuda.is_available() else 'cpu')
        self.policy = Policy().to(self.device)
        self.policy.load_state_dict(
            torch.load(self.FILE, map_location=self.device))
        self.policy.eval()
        self.criterion = nn.CrossEntropyLoss()
        self.learning_rate = learning_rate
        self.optimizer = torch.optim.Adam(self.policy.parameters(),
                                          lr=self.learning_rate)

    def simulate(self, episode: int, policyPercent: float, show=False):
        """
        Simulate the cartpole process
        :param episode: number of episode want to simulate, how many percentage of policy want to use
        :return: list of ([trajectory of actions], [trajectory of observation], totalReward)
        """
        env = gym.make('CartPole-v0')
        result = []
        for i_episode in range(episode):
            actions = []
            observations = []
            totalReward = 500  # if not failed
            observation = env.reset()
            for t in range(500):
                if show: env.render()
                # convert the NumPy observation to a float32 tensor on the device
                # (torch.tensor() on an existing tensor triggers a copy warning)
                observationTensor = torch.as_tensor(
                    observation, dtype=torch.float32).to(self.device)
                observations.append(observation.tolist())
                if random.random() <= policyPercent:  # mix policy with random choice
                    with torch.no_grad():
                        action = torch.max(self.policy(observationTensor),
                                           0)[1].item()  # 0 or 1
                else:
                    action = random.randint(0, 1)
                actions.append(action)
                observation, reward, done, info = env.step(action)
                if done:
                    totalReward = t + 1
                    # print(f"Episode finished after {t + 1} timesteps")
                    break
            result.append((actions, observations, totalReward))
        env.close()
        return result

    def trainPolicy(self, episodes, policyPercent=0.8):
        """ Train the policy """
        # First play serval times to determine the average reward.
        trajectoriesForAvgRwd = self.simulate(20, 1)
        averageReward = sum([i[2] for i in trajectoriesForAvgRwd
                             ]) / len(trajectoriesForAvgRwd)
        print(averageReward)

        trajectoriesForTrain = self.simulate(episodes, policyPercent)
        self.policy.train()  # re-enable train mode (eval() was set in __init__)
        for trainTrajectory in trajectoriesForTrain:
            if trainTrajectory[2] > averageReward:
                # forward
                predictAction = self.policy(
                    torch.tensor(trainTrajectory[1]).to(self.device))
                loss = self.criterion(
                    predictAction,
                    torch.tensor(trainTrajectory[0]).to(self.device))

                # backwards
                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()
        torch.save(self.policy.state_dict(), self.FILE)
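A minimal usage sketch for the Learner above, assuming Model/goodPolicy.pth already holds saved Policy weights; the episode counts below are illustrative.

# Hypothetical usage (assumes the checkpoint file already exists).
learner = Learner(learning_rate=0.01)
learner.trainPolicy(episodes=50, policyPercent=0.8)  # one round of self-training
results = learner.simulate(5, 1, show=True)          # watch the updated policy
print([r[2] for r in results])                       # total reward per episode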