Code example #1
class AgentDDPG:
    def __init__(self, state_size, action_size, seed):
        """

        :state_size: size of the state vector
        :action_size: size of the action vector
        """

        self.state_size = state_size
        self.action_size = action_size
        self.t_step = 0
        self.score = 0.0
        self.best = 0.0
        self.seed = seed
        # Running totals used by get_episode_score()
        self.total_reward = 0.0
        self.count = 0
        self.learning_rate_actor = 0.0001
        self.learning_rate_critic = 0.001

        # Instances of the policy network (actor) and the value network (critic),
        # each with a local and a target copy.

        # Actor local and target network definitions
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.seed).to(device)

        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.seed).to(device)

        # Critic local and target
        self.critic_local = Critic(self.state_size, self.action_size,
                                   self.seed).to(device)

        self.critic_target = Critic(self.state_size, self.action_size,
                                    self.seed).to(device)
        # Actor Optimizer
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=self.learning_rate_actor)

        # Critic Optimizer
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=self.learning_rate_critic)

        # Make sure local and target start with the same weights
        self.actor_target.load_state_dict(self.actor_local.state_dict())
        self.critic_target.load_state_dict(self.critic_local.state_dict())

        # Parameters for the Algorithm
        self.gamma = 0.99  # Discount factor
        self.tau = 0.001  # Soft-update rate for the target network parameters

    # Actor determines what to do based on the policy
    def act_local(self, state):
        # Given a state return the action recommended by the policy actor_local
        # Reshape the state to fit the torch tensor input
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)

        # Pass the state to the actor_local model to get the action
        # recommended by the policy for this state.
        # Set the actor_local model to evaluation mode (predict, not train).
        self.actor_local.eval()
        # Make sure this forward pass is not counted in the
        # gradient calculation.
        with torch.no_grad():
            actions = self.actor_local(state)
        # set the model back to training mode
        self.actor_local.train()

        # Return actions tensor
        return actions.detach()

    def act_target(self, states):
        # Pass the states to the actor_target model to get the actions
        # recommended by the policy for these states.
        # Set the actor_target model to evaluation mode (predict, not train).
        self.actor_target.eval()
        # Make sure this forward pass is not counted in the
        # gradient calculation.
        with torch.no_grad():
            actions = self.actor_target(states)
        # set the model back to training mode
        self.actor_target.train()

        # Return actions tensor
        return actions.detach()

    def get_episode_score(self):
        """
        Calculate the episode scores
        :return: None
        """
        # Update score and best score
        self.score = self.total_reward / float(
            self.count) if self.count else 0.0
        if self.score > self.best:
            self.best = self.score

    def save_model_weights(self):
        torch.save(self.actor_local.state_dict(), './checkpoints.pkl')
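
The agent above assumes Actor and Critic network classes and a global device that are not shown. A minimal sketch of what they might look like, with assumed hidden sizes and a tanh-bounded actor output (these details are illustrative, not taken from the original project):

import torch
import torch.nn as nn
import torch.nn.functional as F

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


class Actor(nn.Module):
    """Deterministic policy: maps a state to an action in [-1, 1]."""

    def __init__(self, state_size, action_size, seed, hidden=256):
        super().__init__()
        torch.manual_seed(seed)
        self.fc1 = nn.Linear(state_size, hidden)
        self.fc2 = nn.Linear(hidden, hidden)
        self.out = nn.Linear(hidden, action_size)

    def forward(self, state):
        x = F.relu(self.fc1(state))
        x = F.relu(self.fc2(x))
        return torch.tanh(self.out(x))  # bounded action


class Critic(nn.Module):
    """Q-value function: maps a (state, action) pair to a scalar."""

    def __init__(self, state_size, action_size, seed, hidden=256):
        super().__init__()
        torch.manual_seed(seed)
        self.fc1 = nn.Linear(state_size + action_size, hidden)
        self.fc2 = nn.Linear(hidden, hidden)
        self.out = nn.Linear(hidden, 1)

    def forward(self, state, action):
        x = torch.cat([state, action], dim=-1)  # join along the feature axis
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return self.out(x)
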
Code example #2
class AgentDDPG:
    def __init__(self, state_size, action_size, seed):
        """

        :state_size: size of the state vector
        :action_size: size of the action vector
        """

        self.state_size = state_size
        self.action_size = action_size
        self.t_step = 0
        self.score = 0.0
        self.best = 0.0
        self.seed = seed
        self.total_reward = 0.0
        self.count = 0
        self.learning_rate_actor = 0.0001
        self.learning_rate_critic = 0.001
        self.batch_size = 128
        self.update_every = 1

        # Instances of the policy network (actor) and the value network (critic),
        # each with a local and a target copy.

        # Actor local and target network definitions
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.seed).to(device)

        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.seed).to(device)

        # Critic local and target
        self.critic_local = Critic(self.state_size, self.action_size,
                                   self.seed).to(device)

        self.critic_target = Critic(self.state_size, self.action_size,
                                    self.seed).to(device)
        # Actor Optimizer
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=self.learning_rate_actor)

        # Critic Optimizer
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=self.learning_rate_critic)

        # Make sure local and target start with the same weights
        self.actor_target.load_state_dict(self.actor_local.state_dict())
        self.critic_target.load_state_dict(self.critic_local.state_dict())

        # Initialize the Ornstein-Uhlenbeck noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Initialize the Replay Memory
        self.buffer_size = 1000000
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Parameters for the Algorithm
        self.gamma = 0.99  # Discount factor
        self.tau = 0.001  # Soft-update rate for the target network parameters

    # Actor interact with the environment through the step
    def step(self, state, action, reward, next_state, done):
        # Add to the total reward the reward of this time step
        self.total_reward += reward
        # Increase your count based on the number of rewards
        # received in the episode
        self.count += 1
        # Stored experience tuple in the replay buffer
        self.memory.add(state, action, reward, next_state, done)

        # Learn every update_every time steps.
        self.t_step = (self.t_step + 1) % self.update_every
        if self.t_step == 0:

            # Check to see if you have enough to produce a batch
            # and learn from it

            if len(self.memory) > self.batch_size:
                experiences = self.memory.sample()
                # Train the networks using the experiences
                self.learn(experiences)

        # Roll over last state action (not needed)
        # self.last_state = next_state

    # Actor determines what to do based on the policy
    def act(self, state):
        # Given a state return the action recommended by the policy
        # Reshape the state to fit the torch tensor input
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)

        # Pass the state to the actor_local model to get the action
        # recommended by the policy for this state.
        # Set the actor_local model to evaluation mode (predict, not train).
        self.actor_local.eval()
        # Make sure this forward pass is not counted in the
        # gradient calculation.
        with torch.no_grad():
            actions = self.actor_local(state)
        # set the model back to training mode
        self.actor_local.train()

        # Because we are exploring we add some noise to the
        # action vector
        return list(actions.detach().cpu().numpy().reshape(self.action_size,) +
                    self.noise.sample())

    # This is the Actor learning logic called when the agent
    # take a step to learn
    def learn(self, experiences):
        """
        Learning means that the networks parameters needs to be updated
        Using the experineces batch.
        Network learns from experiences not form interaction with the
        environment
        """

        # Reshape the experience tuples into separate arrays of states, actions,
        # rewards, next_states and dones.
        # Each member of the tuple is converted into a column vector.
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences
                            if e is not None]).astype(np.float32).reshape(
                                -1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None
                            ]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences
                          if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])

        # Now convert the numpy arrays for states, actions and next_states to torch tensors;
        # rewards and dones do not need to be tensors.
        states = torch.from_numpy(states).float().unsqueeze(0).to(device)
        actions = torch.from_numpy(actions).float().unsqueeze(0).to(device)
        next_states = torch.from_numpy(next_states).float().unsqueeze(0).to(
            device)

        # First we pass a batch of next states to the actor target so it tells us
        # which actions to execute. We use the actor target network instead of the
        # actor local network to keep the learning targets stable.

        # Set the target network to eval mode because this pass is not part of the
        # training; its weights are altered by a soft update, not by an optimizer.
        self.actor_target.eval()
        with torch.no_grad():
            next_state_actions = self.actor_target(next_states).detach()
        self.actor_target.train()

        # The critic evaluates the actions taken by the actor in the next state and
        # generates the Q(s,a) value of the next state given those actions. These
        # (action, next_state) pairs come from the ReplayBuffer, not from interacting
        # with the environment.
        # Remember the Critic (Q-value function) takes states and actions as inputs.
        # We calculate the q_targets of the next state and use them to compute the
        # current state Q-value with the Bellman equation.

        # Set the target network to eval mode because this pass is not part of the
        # training; its weights are altered by a soft update, not by an optimizer.
        self.critic_target.eval()
        with torch.no_grad():
            q_targets_next_state_action_values = self.critic_target(
                next_states, next_state_actions).detach()
        self.critic_target.train()

        # With the next-state Q-values (a vector of Q(s,a) values for the randomly
        # sampled next_states from the replay buffer) we calculate the CURRENT state
        # target Q(s,a) using the one-step TD equation and the q_targets value we
        # got from the critic_target net.
        # For terminal states the bootstrapped term is zeroed out, so the target is
        # just the reward. These targets are used to train the critic_local model
        # in a supervised-learning fashion.
        q_targets = torch.from_numpy(
            rewards + self.gamma *
            q_targets_next_state_action_values.cpu().numpy() *
            (1 - dones)).float().to(device)

        # --- Optimize the local Critic Model ----#

        # Here we start the supervised training of the critic_local network:
        # we pass a batch of (state, action) samples and it produces the expected
        # Q-value of each state-action pair.
        q_expected = self.critic_local(states, actions)

        # Clear grad buffer values in preparation.
        self.critic_optimizer.zero_grad()

        # Loss for the critic_local model: smooth L1 (Huber) loss between
        # the q_expected values and the q_target values.
        critic_loss = F.smooth_l1_loss(q_expected, q_targets)
        critic_loss.backward(retain_graph=True)

        # gradient clipping
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)

        # optimize the critic_local model using the optimizer defined for the critic
        # In the init function of this class
        self.critic_optimizer.step()

        # --- Optimize the local Actor Model ---#

        # Get the actor actions using the experience buffer states
        actor_actions = self.actor_local(states)

        # Use as the loss the negative sum of the Q-values produced by the critic_local
        # model for the actions that actor_local outputs on the sampled states.
        loss_actor = -1 * torch.sum(
            self.critic_local.forward(states, actor_actions))

        # Set the model gradients to zero in preparation
        self.actor_optimizer.zero_grad()

        # Back propagate
        loss_actor.backward()

        # optimize the actor_local model using the optimizer defined for the actor
        # In the init function of this class
        self.actor_optimizer.step()

        # Soft-update target models
        self.soft_update(self.critic_local, self.critic_target)
        self.soft_update(self.actor_local, self.actor_target)

    def soft_update(self, local_model, target_model):

        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(self.tau * local_param.data +
                                    (1.0 - self.tau) * target_param.data)

    def get_episode_score(self):
        """
        Calculate the episode scores
        :return: None
        """
        # Update score and best score
        self.score = self.total_reward / float(
            self.count) if self.count else 0.0
        if self.score > self.best:
            self.best = self.score

    def save_model_weights(self):
        torch.save(self.actor_local.state_dict(), './checkpoints.pkl')
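
The agent relies on an OUNoise process and a ReplayBuffer that are not shown. A minimal sketch of both, matching the constructors used in this example, OUNoise(size, mu, theta, sigma) and ReplayBuffer(buffer_size, batch_size); the internals are illustrative only:

import random
from collections import deque, namedtuple

import numpy as np

Experience = namedtuple(
    "Experience", ["state", "action", "reward", "next_state", "done"])


class OUNoise:
    """Ornstein-Uhlenbeck process: temporally correlated exploration noise."""

    def __init__(self, size, mu, theta, sigma):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        self.state = self.mu.copy()

    def sample(self):
        # dx = theta * (mu - x) + sigma * N(0, 1)
        dx = self.theta * (self.mu - self.state) \
            + self.sigma * np.random.standard_normal(len(self.state))
        self.state = self.state + dx
        return self.state


class ReplayBuffer:
    """Fixed-size buffer of experience tuples, sampled uniformly."""

    def __init__(self, buffer_size, batch_size):
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size

    def add(self, state, action, reward, next_state, done):
        self.memory.append(Experience(state, action, reward, next_state, done))

    def sample(self):
        # learn() iterates over the returned experiences one by one
        return random.sample(self.memory, k=self.batch_size)

    def __len__(self):
        return len(self.memory)
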
Code example #3
class TD3(object):
    # Twin Delayed DDPG (TD3)
    def __init__(self, state_dim, action_dim, max_action):
        self.actor = Actor(state_dim, action_dim, max_action).to(device)
        self.actor_target = Actor(state_dim, action_dim, max_action).to(device)
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters())

        self.critic = Critic(state_dim, action_dim).to(device)
        self.critic_target = Critic(state_dim, action_dim).to(device)
        self.critic_target.load_state_dict(self.critic.state_dict())
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters())

        self.max_action = max_action

    def select_action(self, state):
        state = torch.Tensor(state.reshape(1, -1)).to(device)
        return self.actor(state).cpu().data.numpy().flatten()

    def train(self,
              replay_buffer,
              iterations,
              batch_size=100,
              discount=0.99,
              tau=0.005,
              policy_noise=0.2,
              noise_clip=0.5,
              policy_freq=2):
        for it in range(iterations):
            # step 4: we sample a batch of transitions (s, s', a, r, done) from the memory
            batch_state, batch_next_state, batch_actions, batch_rewards, batch_dones = replay_buffer.sample(
                batch_size)
            state = torch.Tensor(batch_state).to(device)
            next_state = torch.Tensor(batch_next_state).to(device)
            action = torch.Tensor(batch_actions).to(device)
            reward = torch.Tensor(batch_rewards).to(device)
            done = torch.Tensor(batch_dones).to(device)

            # step 5: from the next state s', the actor target plays the next action a'
            next_action = self.actor_target(next_state)

            # step 6: we add Gaussian noise to the next action a' and we clamp it in a range of values supported by the environment
            noise = torch.Tensor(batch_actions).data.normal_(
                0, policy_noise).to(device)
            noise = noise.clamp(-noise_clip, noise_clip)
            next_action = (next_action + noise).clamp(-self.max_action,
                                                      self.max_action)

            # step 7: the two critic targets take each the couple (s', a') as input and return  2 Q-values  Qt1(s', a') and Qt2(s', a') as output
            target_Q1, target_Q2 = self.critic_target(next_state, next_action)

            # step 8: we keep the minimum of these Q-values min(Qt1, Qt2)
            target_Q = torch.min(target_Q1, target_Q2)

            # step 9: we get the final target of the 2 critic models, which is : Qt = r + gamma * target_Q
            target_Q = reward + (1 - done) * discount * target_Q

            # step 10: the 2 critic models take each the couple (s, a) as input and return 2 Q-values Qt1(s, a) and Qt2(s, a) as outputs
            current_Q1, current_Q2 = self.critic(state, action)

            # step 11: we compute the loss from the 2 critic models: critic_loss = mse_loss(Q1(s,a), Qt) + mse_loss(Q2(s,a), Qt)
            critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(
                current_Q2, target_Q)

            # step 12: we backpropagate the critic loss and update the parameters of the 2 critic models with the Adam optimizer
            self.critic_optimizer.zero_grad()
            critic_loss.backward()
            self.critic_optimizer.step()

            # step 13: once every 2 iterations, we update our Actor model by performing gradient ascent on the output of the first critic model
            if it % policy_freq == 0:
                # deterministic policy gradient DPG
                actor_loss = -self.critic.Q1(state, self.actor(state)).mean()
                self.actor.zero_grad()
                actor_loss.backward()
                self.actor_optimizer.step()

                # Delay
                # step 14: still once every 2 iterations, we update the weights of the actor target by polyak averaging
                for param, target_param in zip(self.actor.parameters(),
                                               self.actor_target.parameters()):
                    target_param.data.copy_(tau * param.data +
                                            (1 - tau) * target_param.data)

                # step 15: still once every 2 iterations, we update the weights of the critic target by polyak averaging
                for param, target_param in zip(
                        self.critic.parameters(),
                        self.critic_target.parameters()):
                    target_param.data.copy_(tau * param.data +
                                            (1 - tau) * target_param.data)

    def save(self, filename, directory):
        torch.save(self.actor.state_dict(),
                   '%s/%s_actor.pth' % (directory, filename))
        torch.save(self.critic.state_dict(),
                   '%s/%s_critic.pth' % (directory, filename))

    def load(self, filename, directory):
        self.actor.load_state_dict(
            torch.load('%s/%s_actor.pth' % (directory, filename)))
        self.critic.load_state_dict(
            torch.load('%s/%s_critic.pth' % (directory, filename)))
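
This TD3 class is normally driven by an outer collection loop that stores transitions and calls train() once per episode. A hedged sketch of such a driver, assuming a Gym-style env and a replay_buffer whose add() takes (state, next_state, action, reward, done) in that order; the function name run_training and all hyper-parameter values are illustrative:

import numpy as np


def run_training(env, policy, replay_buffer, max_action,
                 total_steps=100_000, expl_noise=0.1, batch_size=100):
    """Collect transitions with Gaussian exploration noise and call
    policy.train() once per episode (illustrative driver only)."""
    state, done = env.reset(), False
    episode_timesteps = 0
    for _ in range(total_steps):
        episode_timesteps += 1

        # deterministic policy action plus exploration noise, clipped to range
        action = policy.select_action(np.array(state))
        action = (action + np.random.normal(0, expl_noise * max_action,
                                            size=action.shape)
                  ).clip(-max_action, max_action)

        next_state, reward, done, _ = env.step(action)
        # assumed buffer API: add(state, next_state, action, reward, done)
        replay_buffer.add(state, next_state, action, reward, float(done))
        state = next_state

        if done:
            # train for as many iterations as steps taken in the episode
            policy.train(replay_buffer, episode_timesteps,
                         batch_size=batch_size)
            state, done = env.reset(), False
            episode_timesteps = 0
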
Code example #4
File: agent.py  Project: tahsmith/drlnd-p2-reacher
class Agent:
    def __init__(self, device, state_size, action_size, buffer_size=10,
                 batch_size=10,
                 actor_learning_rate=1e-4,
                 critic_learning_rate=1e-3,
                 discount_rate=0.99,
                 tau=0.1,
                 steps_per_update=4,
                 action_range=None,
                 dropout_p=0.0,
                 weight_decay=0.0001,
                 noise_max=0.2,
                 noise_decay=1.0,
                 n_agents=1
                 ):
        self.device: torch.device = device
        self.state_size = state_size
        self.action_size = action_size

        self.critic_control = Critic(state_size, action_size).to(device)
        self.critic_control.dropout.p = dropout_p
        self.critic_target = Critic(state_size, action_size).to(device)
        self.critic_target.eval()
        self.critic_optimizer = torch.optim.Adam(
            self.critic_control.parameters(),
            weight_decay=weight_decay,
            lr=critic_learning_rate)

        self.actor_control = Actor(state_size, action_size, action_range).to(
            device)
        self.actor_control.dropout.p = dropout_p
        self.actor_target = Actor(state_size, action_size, action_range).to(
            device)
        self.actor_target.eval()
        self.actor_optimizer = torch.optim.Adam(
            self.actor_control.parameters(),
            weight_decay=weight_decay,
            lr=actor_learning_rate)

        self.batch_size = batch_size
        self.min_buffer_size = batch_size
        self.replay_buffer = ReplayBuffer(device, state_size, action_size,
                                          buffer_size)

        self.discount_rate = discount_rate

        self.tau = tau

        self.step_count = 0
        self.steps_per_update = steps_per_update

        self.noise_max = noise_max
        self.noise = OUNoise([n_agents, action_size], 15071988, sigma=self.noise_max)
        self.noise_decay = noise_decay
        self.last_score = float('-inf')

    def policy(self, state, add_noise=True):
        state = torch.from_numpy(state).float().to(self.device)
        self.actor_control.eval()
        with torch.no_grad():
            action = self.actor_control(state).cpu().numpy()
        self.actor_control.train()
        if add_noise:
            noise = self.noise.sample()
            action += noise
        return action

    def step(self, state, action, reward, next_state, done):
        p = self.calculate_p(state, action, reward, next_state, done)

        for i in range(state.shape[0]):
            self.replay_buffer.add(state[i, :], action[i, :], reward[i],
                                   next_state[i, :], done[i], p[i])
        if self.step_count % self.steps_per_update == 0:
            self.learn()
        self.step_count += 1

    def learn(self):
        if len(self.replay_buffer) < self.min_buffer_size:
            return
        indices, (states, actions, rewards, next_states, dones, p) = \
            self.replay_buffer.sample(self.batch_size)

        self.actor_control.eval()
        error = self.bellman_eqn_error(
            states, actions, rewards, next_states, dones)
        self.actor_control.train()

        importance_scaling = (self.replay_buffer.buffer_size * p) ** -1
        importance_scaling /= importance_scaling.max()
        self.critic_optimizer.zero_grad()
        loss = (importance_scaling * (error ** 2)).sum() / self.batch_size
        loss.backward()
        self.critic_optimizer.step()

        self.actor_optimizer.zero_grad()
        expected_actions = self.actor_control(states)
        critic_score = self.critic_control(states, expected_actions)
        loss = -1 * (importance_scaling * critic_score).sum() / self.batch_size
        loss.backward()
        self.actor_optimizer.step()

        self.update_target(self.critic_control, self.critic_target)
        self.update_target(self.actor_control, self.actor_target)

        self.replay_buffer.update(indices, error.detach().abs().cpu() + 1e-3)

    def bellman_eqn_error(self, states, actions, rewards, next_states, dones):
        """Double DQN error - use the control network to get the best action
        and apply the target network to it to get the target reward which is
        used for the bellman eqn error.
        """
        next_actions = self.actor_control(next_states)

        target_action_values = self.critic_target(next_states, next_actions)

        target_rewards = (
                rewards
                + self.discount_rate * (1 - dones) * target_action_values
        )

        current_rewards = self.critic_control(states, actions)
        error = current_rewards - target_rewards
        return error

    def calculate_p(self, state, action, reward, next_state, done):
        next_state = torch.from_numpy(next_state).float().to(
            self.device)
        state = torch.from_numpy(state).float().to(self.device)
        action = torch.from_numpy(action).float().to(self.device)
        reward = torch.from_numpy(reward).float().to(self.device)
        done = torch.from_numpy(done).float().to(
            self.device)

        done = done.unsqueeze(1)
        reward = reward.unsqueeze(1)

        self.actor_control.eval()
        self.critic_control.eval()

        with torch.no_grad():
            retval = abs(
                self.bellman_eqn_error(state, action, reward, next_state,
                                       done)) + 1e-3
        self.critic_control.train()
        self.actor_control.train()
        return retval

    def update_target(self, control, target):
        for target_param, control_param in zip(
                target.parameters(),
                control.parameters()):
            target_param.data.copy_(
                self.tau * control_param.data + (1.0 - self.tau) *
                target_param.data)

    def end_of_episode(self, final_score):
        self.step_count = 0

        self.noise.sigma *= self.noise_decay
        self.last_score = final_score
        self.noise.reset()

    def save(self, path):
        torch.save(self.critic_control.state_dict(), path + '-critic.p')
        torch.save(self.actor_control.state_dict(), path + '-actor.p')

    def restore(self, path):
        self.critic_control.load_state_dict(
            torch.load(path + '-critic.p', map_location='cpu'))
        self.actor_control.load_state_dict(
            torch.load(path + '-actor.p', map_location='cpu'))
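
The Agent above samples with priorities: replay_buffer.sample() returns indices plus the sampling probability p of each transition, and update() refreshes priorities from the TD error. A minimal proportional prioritized buffer matching that interface is sketched below as a stand-in for the project's ReplayBuffer; the storage layout and sampling details are assumptions:

import numpy as np
import torch


class PrioritizedReplayBuffer:
    """Proportional prioritized replay matching the interface used above:
    sample() returns indices plus a batch that includes each transition's
    sampling probability p, and update() refreshes stored priorities."""

    def __init__(self, device, state_size, action_size, buffer_size):
        self.device = device
        self.buffer_size = buffer_size
        self.states = np.zeros((buffer_size, state_size), dtype=np.float32)
        self.actions = np.zeros((buffer_size, action_size), dtype=np.float32)
        self.rewards = np.zeros((buffer_size, 1), dtype=np.float32)
        self.next_states = np.zeros((buffer_size, state_size), dtype=np.float32)
        self.dones = np.zeros((buffer_size, 1), dtype=np.float32)
        self.priorities = np.zeros(buffer_size, dtype=np.float32)
        self.pos = 0
        self.full = False

    def __len__(self):
        return self.buffer_size if self.full else self.pos

    def add(self, state, action, reward, next_state, done, priority):
        i = self.pos
        self.states[i] = state
        self.actions[i] = action
        self.rewards[i] = reward
        self.next_states[i] = next_state
        self.dones[i] = done
        self.priorities[i] = float(priority)
        self.pos = (self.pos + 1) % self.buffer_size
        self.full = self.full or self.pos == 0

    def sample(self, batch_size):
        n = len(self)
        p = self.priorities[:n] / self.priorities[:n].sum()
        idx = np.random.choice(n, batch_size, p=p)

        def to_tensor(array):
            return torch.from_numpy(array[idx]).to(self.device)

        batch = (to_tensor(self.states), to_tensor(self.actions),
                 to_tensor(self.rewards), to_tensor(self.next_states),
                 to_tensor(self.dones),
                 torch.from_numpy(p[idx].reshape(-1, 1)).to(self.device))
        return idx, batch

    def update(self, indices, priorities):
        self.priorities[indices] = np.asarray(priorities).reshape(-1)
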
Code example #5
class ActorCritic(Model):
    def __init__(self,
                 observation_space_size,
                 action_space_size,
                 name=None,
                 env_name=None,
                 model_config=None,
                 play_mode=False):

        if name is None:
            name = "Unnamed-ActorCritic"
        super(ActorCritic,
              self).__init__(observation_space_size, action_space_size, name,
                             env_name, model_config, play_mode)

    def build_model(self):

        self.policy_net = Actor(self.observation_space_size,
                                self.action_space_size)
        self.critic_net = Critic(self.observation_space_size)

        if self.model_config is None:
            self.gamma = 0.99

            self.actor_optimizer = optim.Adam(self.policy_net.parameters())
            self.actor_loss = nn.MSELoss()

            self.critic_optimizer = optim.Adam(self.critic_net.parameters())
            self.critic_loss = nn.MSELoss()

            self.get_epsilon = self.get_epsilon_default
        else:
            pass

    def save_checkpoint(self, n=0, filepath=None):
        """
        n - number of epoch / episode or whatever is used for enumeration
        """

        # TO DO: ADD OTHER RELEVANT PARAMETERS
        checkpoint = {
            'policy': self.policy_net.state_dict(),
            'critic': self.critic_net.state_dict(),
            'actor_optimizer': self.actor_optimizer.state_dict(),
            'critic_optimizer': self.critic_optimizer.state_dict()
        }
        super(ActorCritic, self).save_checkpoint(n, filepath, checkpoint)

    def load_checkpoint(self, filepath):
        # TO DO: ADD OTHER RELEVANT parameters
        checkpoint = torch.load(filepath)
        self.policy_net.load_state_dict(checkpoint['policy'])
        self.critic_net.load_state_dict(checkpoint['critic'])
        self.actor_optimizer.load_state_dict(checkpoint['actor_optimizer'])
        self.critic_optimizer.load_state_dict(checkpoint['critic_optimizer'])

    def prepare_sample(self, sample):
        sample = np.array(sample)
        states = torch.tensor(sample[:, 0], dtype=torch.float32)
        actions = torch.tensor(sample[:, 1], dtype=torch.float32)
        rewards = torch.tensor(sample[:, 2], dtype=torch.float32)
        next_states = torch.tensor(sample[:, 3], dtype=torch.float32)
        dones = torch.tensor(sample[:, 4], dtype=torch.int32)

        return states, actions, rewards, next_states, dones

    def critic_update(self, V, V_target):
        self.critic_optimizer.zero_grad()
        critic_loss = self.critic_loss(V, V_target)
        critic_loss.backward()
        self.critic_optimizer.step()

        return critic_loss.item()

    def actor_update(self, advantages, actions, mus):
        self.actor_optimizer.zero_grad()
        actor_loss = self.actor_loss(actions, mus)
        # Reduce to a scalar so backward() works for batches of advantages too
        gradient_term = (advantages * actor_loss).mean()
        gradient_term.backward()
        self.actor_optimizer.step()

        return actor_loss.item()

    def update(self, sample, prepare_state=None):
        actor_running_loss = []
        critic_running_loss = []

        for state, action, reward, next_state, done in sample:
            if prepare_state is not None:
                state = prepare_state(state)
                next_state = prepare_state(next_state)

            state = torch.tensor(state, dtype=torch.float32)
            next_state = torch.tensor(next_state, dtype=torch.float32)
            action = torch.tensor(action, dtype=torch.float32)

            # Update Critic
            V = self.critic_net.forward(state)
            V_target = torch.tensor([reward], dtype=torch.float32)
            if not done:
                # Bootstrap from the next state; detach so the target stays fixed
                V_target = V_target + \
                    self.gamma * self.critic_net.forward(next_state).detach()

            critic_loss = self.critic_update(V, V_target)
            critic_running_loss.append(critic_loss)

            # Update Actor
            advantage = (V_target - V).detach()
            mu = self.policy_net(state)

            actor_loss = self.actor_update(advantage, action, mu)
            actor_running_loss.append(actor_loss)

        return actor_running_loss, critic_running_loss

    def batch_update(self, sample, prepare_state=None):
        actor_running_loss = []
        critic_running_loss = []

        states, actions, rewards, next_states, dones = self.prepare_sample(
            sample)

        # Update Critic
        V = self.critic_net.forward(states)
        # Keep rewards/dones as column vectors so the broadcast stays
        # element-wise, and detach the bootstrapped term
        V_target = rewards.unsqueeze(1) + self.gamma * \
            self.critic_net.forward(next_states).detach() * \
            (1 - dones.unsqueeze(1))

        critic_loss = self.critic_update(V, V_target)
        critic_running_loss.append(critic_loss)

        # Update Actor
        advantage = (V_target - V).detach()
        mu = self.policy_net(states)

        actor_loss = self.actor_update(advantage, actions, mu)
        actor_running_loss.append(actor_loss)

        return actor_running_loss, critic_running_loss
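
Unlike the Q(s, a) critics in the DDPG/TD3 examples, the Critic here is constructed from the observation size only, i.e. a state-value network V(s). A minimal sketch of the Actor/Critic pair this class assumes; hidden sizes and activations are illustrative:

import torch
import torch.nn as nn
import torch.nn.functional as F


class Actor(nn.Module):
    """Policy head mu(s) used as the mean action."""

    def __init__(self, observation_space_size, action_space_size, hidden=128):
        super().__init__()
        self.fc1 = nn.Linear(observation_space_size, hidden)
        self.mu = nn.Linear(hidden, action_space_size)

    def forward(self, state):
        return torch.tanh(self.mu(F.relu(self.fc1(state))))


class Critic(nn.Module):
    """State-value network V(s); note that it does not take the action."""

    def __init__(self, observation_space_size, hidden=128):
        super().__init__()
        self.fc1 = nn.Linear(observation_space_size, hidden)
        self.v = nn.Linear(hidden, 1)

    def forward(self, state):
        return self.v(F.relu(self.fc1(state)))
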
Code example #6
class Agent(object):
    def __init__(self, n_states, n_actions, lr_actor, lr_critic, tau, gamma,
                 mem_size, actor_l1_size, actor_l2_size, critic_l1_size,
                 critic_l2_size, batch_size):

        self.gamma = gamma
        self.tau = tau
        self.memory = ReplayBuffer(mem_size, n_states, n_actions)
        self.batch_size = batch_size

        self.actor = Actor(lr_actor, n_states, n_actions, actor_l1_size,
                           actor_l2_size)
        self.critic = Critic(lr_critic, n_states, n_actions, critic_l1_size,
                             critic_l2_size)

        self.target_actor = Actor(lr_actor, n_states, n_actions, actor_l1_size,
                                  actor_l2_size)
        self.target_critic = Critic(lr_critic, n_states, n_actions,
                                    critic_l1_size, critic_l2_size)

        self.noise = OUActionNoise(mu=np.zeros(n_actions), sigma=0.005)

        self.update_network_parameters(tau=1)

    def choose_action(self, observation):
        self.actor.eval()
        observation = torch.tensor(observation,
                                   dtype=torch.float).to(self.actor.device)
        mu = self.actor.forward(observation).to(self.actor.device)

        # add noise to action - for exploration
        mu_prime = mu + torch.tensor(self.noise(), dtype=torch.float).to(
            self.actor.device)
        self.actor.train()

        return mu_prime.cpu().detach().numpy()

    def choose_action_no_train(self, observation):
        self.actor.eval()
        observation = torch.tensor(observation,
                                   dtype=torch.float).to(self.actor.device)
        mu = self.actor.forward(observation).to(self.actor.device)

        return mu.cpu().detach().numpy()

    def remember(self, state, action, reward, new_state, done):
        self.memory.push(state, action, reward, new_state, done)

    def learn(self):
        if self.memory.idx_last < self.batch_size:
            # not enough data in replay buffer
            return

        # select random events
        state, action, reward, new_state, done = self.memory.sample_buffer(
            self.batch_size)

        reward = torch.tensor(reward, dtype=torch.float).to(self.critic.device)
        done = torch.tensor(done).to(self.critic.device)
        new_state = torch.tensor(new_state,
                                 dtype=torch.float).to(self.critic.device)
        action = torch.tensor(action, dtype=torch.float).to(self.critic.device)
        state = torch.tensor(state, dtype=torch.float).to(self.critic.device)

        self.target_actor.eval()
        self.target_critic.eval()
        self.critic.eval()
        target_actions = self.target_actor.forward(new_state)
        critic_value_ = self.target_critic.forward(new_state, target_actions)
        critic_value = self.critic.forward(state, action)

        target = []
        for j in range(self.batch_size):
            target.append(reward[j] + self.gamma * critic_value_[j] * done[j])
        target = torch.tensor(target).to(self.critic.device)
        target = target.view(self.batch_size, 1)

        self.critic.train()
        self.critic.optimizer.zero_grad()
        critic_loss = F.mse_loss(target, critic_value)
        critic_loss.backward()
        self.critic.optimizer.step()

        self.critic.eval()
        self.actor.optimizer.zero_grad()
        mu = self.actor.forward(state)
        self.actor.train()
        actor_loss = -self.critic.forward(state, mu)
        actor_loss = torch.mean(actor_loss)
        actor_loss.backward()
        self.actor.optimizer.step()

        self.update_network_parameters()

    def update_network_parameters(self, tau=None):
        if tau is None:
            tau = self.tau

        actor_params = self.actor.named_parameters()
        critic_params = self.critic.named_parameters()
        target_actor_params = self.target_actor.named_parameters()
        target_critic_params = self.target_critic.named_parameters()

        critic_state_dict = dict(critic_params)
        actor_state_dict = dict(actor_params)
        target_critic_dict = dict(target_critic_params)
        target_actor_dict = dict(target_actor_params)

        for name in critic_state_dict:
            critic_state_dict[name] = tau*critic_state_dict[name].clone() + \
                                      (1-tau)*target_critic_dict[name].clone()

        self.target_critic.load_state_dict(critic_state_dict)

        for name in actor_state_dict:
            actor_state_dict[name] = tau*actor_state_dict[name].clone() + \
                                      (1-tau)*target_actor_dict[name].clone()
        self.target_actor.load_state_dict(actor_state_dict)

    def save_models(self):
        timestamp = time.strftime("%Y%m%d-%H%M%S")

        self.actor.save("actor_" + timestamp)
        self.target_actor.save("target_actor_" + timestamp)
        self.critic.save("critic_" + timestamp)
        self.target_critic.save("target_critic_" + timestamp)

    def load_models(self, fn_actor, fn_target_actor, fn_critic,
                    fn_target_critic):
        self.actor.load_checkpoint(fn_actor)
        self.target_actor.load_checkpoint(fn_target_actor)
        self.critic.load_checkpoint(fn_critic)
        self.target_critic.load_checkpoint(fn_target_critic)
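
OUActionNoise is not shown; the agent constructs it with mu and sigma keywords and calls the instance like a function. A minimal Ornstein-Uhlenbeck sketch with that calling convention (the theta, dt and x0 defaults are assumptions):

import numpy as np


class OUActionNoise:
    """Ornstein-Uhlenbeck noise; the instance is called like a function,
    matching self.noise() above."""

    def __init__(self, mu, sigma=0.15, theta=0.2, dt=1e-2, x0=None):
        self.mu = mu
        self.sigma = sigma
        self.theta = theta
        self.dt = dt
        self.x0 = x0
        self.reset()

    def __call__(self):
        # x_{t+1} = x_t + theta * (mu - x_t) * dt + sigma * sqrt(dt) * N(0, 1)
        x = (self.x_prev
             + self.theta * (self.mu - self.x_prev) * self.dt
             + self.sigma * np.sqrt(self.dt)
             * np.random.normal(size=self.mu.shape))
        self.x_prev = x
        return x

    def reset(self):
        self.x_prev = self.x0 if self.x0 is not None else np.zeros_like(self.mu)
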
Code example #7
class TD3(object):
    """Agent class that handles the training of the networks
    and provides outputs as actions.

    Args:
        state_dim (array): state size
        action_dim (array): action size
        max_action (array): action range bounds
        device (device): cuda or cpu to process the tensors
        discount (float): discount factor
        tau (float): soft update rate for main networks to target networks
        policy_noise (float): how much noise to add to actions
        noise_clip (float): clip factor for the policy noise
        policy_freq (int): frequency of delayed policy updates

    """
    def __init__(self, state_dim, action_dim, max_action, discount, tau,
                 policy_noise, noise_clip, policy_freq, device):

        self.state_dim = len(state_dim[0])
        self.action_dim = len(action_dim)
        self.max_action = max_action[2]
        self.actor = Actor(self.state_dim, self.action_dim,
                           self.max_action).to(device)
        self.actor_target = copy.deepcopy(self.actor).float()
        # self.actor_target = Actor(state_dim, action_dim, self.max_action).to(device)
        # self.actor_target.load_state_dict(self.actor.state_dict())
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(),
                                                lr=3e-4)  # or 1e-3

        self.critic = Critic(self.state_dim, self.action_dim).to(device)
        self.critic_target = copy.deepcopy(self.critic).float()
        # self.critic_target = Critic(state_dim, action_dim).to(device)
        # self.critic_target.load_state_dict(self.critic.state_dict())
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(),
                                                 lr=3e-4)  # or 1e-2

        self.device = device
        self.max_action = max_action
        self.discount = discount
        self.tau = tau
        self.policy_noise = policy_noise
        self.noise_clip = noise_clip
        self.policy_freq = policy_freq

        self.total_it = 0

    def select_action(self, state):
        """Select an appropriate action from the agent policy
            Args:
                state (array): current state of environment

            Returns:
                action (float): action clipped within action range
        """

        state = torch.FloatTensor(state.reshape(1, -1)).to(self.device)

        #  if noise != 0:
        #      action_dim = len(self.env.action_space())
        #      action = (action + np.random.normal(0, noise, size=action_dim))
        #  action_space_low, _, action_space_high = self.env.action_domain()
        #  return action.clip(action_space_low, action_space_high)

        return self.actor(state).cpu().data.numpy().flatten()

    def train(self, replay_buffer, batch_size=100):
        """Train and update actor and critic networks
            Args:
                replay_buffer (ReplayBuffer): buffer for experience replay
                batch_size(int): batch size to sample from replay buffer
            Return:
                actor_loss (float): loss from actor network
                critic_loss (float): loss from critic network
        """
        self.total_it += 1
        # Sample replay buffer
        state, next_state, action, reward, done = replay_buffer.sample(
            batch_size)

        state = torch.from_numpy(
            np.asarray([np.array(i.item().values()) for i in state]))

        next_state = np.asarray(
            [np.array(i.item().values()) for i in next_state])
        reward = torch.as_tensor(reward, dtype=torch.float32)
        done = torch.as_tensor(done, dtype=torch.float32)

        with torch.no_grad():
            # select an action according to the policy and add clipped Gaussian
            # noise (randn_like draws from N(0, 1))
            noise = (torch.randn_like(torch.from_numpy(action)) *
                     self.policy_noise).clamp(-self.noise_clip,
                                              self.noise_clip)

            next_state = torch.tensor(next_state, dtype=torch.float32)
            next_action = (self.actor_target(next_state) +
                           noise.to(torch.float32)).clamp(
                               self.max_action[0], self.max_action[2])

            # Compute the target Q value with the target critics on (s', a')
            target_Q1, target_Q2 = self.critic_target(next_state, next_action)
            target_Q = torch.min(target_Q1, target_Q2)
            target_Q = reward + done * self.discount * target_Q

        # update action datatype, can't do earlier, use np.array earlier
        action = torch.as_tensor(action, dtype=torch.float32)

        # get current Q estimates
        current_Q1, current_Q2 = self.critic(state, action)

        # compute critic loss
        critic_loss = F.mse_loss(current_Q1, target_Q[:1, :].transpose(
            0, 1)) + F.mse_loss(current_Q2, target_Q[:1, :].transpose(0, 1))

        # optimize the critic
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # delayed policy updates
        if self.total_it % self.policy_freq == 0:
            # compute the actor loss
            actor_loss = -self.critic.get_q(state, self.actor(state)).mean()

            # optimize the actor
            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()

            # Update the frozen target models
            for param, target_param in zip(self.critic.parameters(),
                                           self.critic_target.parameters()):
                target_param.data.copy_(self.tau * param.data +
                                        (1 - self.tau) * target_param.data)

            for param, target_param in zip(self.actor.parameters(),
                                           self.actor_target.parameters()):
                target_param.data.copy_(self.tau * param.data +
                                        (1 - self.tau) * target_param.data)

    def save(self, filename, directory):
        torch.save(self.actor.state_dict(),
                   '%s/%s_actor.pth' % (directory, filename))
        torch.save(self.critic.state_dict(),
                   '%s/%s_critic.pth' % (directory, filename))

    def load(self, filename="best_avg", directory="./saves"):
        self.actor.load_state_dict(
            torch.load('%s/%s_actor.pth' % (directory, filename)))
        self.critic.load_state_dict(
            torch.load('%s/%s_critic.pth' % (directory, filename)))
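
The critic used here returns two Q-values from forward() and exposes a get_q() helper that the delayed actor update calls. A hedged sketch of such a twin-Q critic in the spirit of TD3; layer sizes are assumptions:

import torch
import torch.nn as nn
import torch.nn.functional as F


class Critic(nn.Module):
    """Twin Q-networks: forward() returns both heads, get_q() only Q1,
    which is what the delayed actor update above uses."""

    def __init__(self, state_dim, action_dim, hidden=256):
        super().__init__()
        # Q1 head
        self.l1 = nn.Linear(state_dim + action_dim, hidden)
        self.l2 = nn.Linear(hidden, hidden)
        self.l3 = nn.Linear(hidden, 1)
        # Q2 head
        self.l4 = nn.Linear(state_dim + action_dim, hidden)
        self.l5 = nn.Linear(hidden, hidden)
        self.l6 = nn.Linear(hidden, 1)

    def forward(self, state, action):
        sa = torch.cat([state, action], dim=-1)
        q1 = self.l3(F.relu(self.l2(F.relu(self.l1(sa)))))
        q2 = self.l6(F.relu(self.l5(F.relu(self.l4(sa)))))
        return q1, q2

    def get_q(self, state, action):
        # Q1 only, used for the deterministic policy gradient
        sa = torch.cat([state, action], dim=-1)
        return self.l3(F.relu(self.l2(F.relu(self.l1(sa)))))
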
Code example #8
File: td3.py  Project: TomoyaAkiyama/TD3
class TD3:
    def __init__(self,
                 device,
                 state_dim,
                 action_dim,
                 action_max,
                 gamma=0.99,
                 tau=0.005,
                 lr=3e-4,
                 policy_noise=0.2,
                 noise_clip=0.5,
                 exploration_noise=0.1,
                 policy_freq=2):

        self.actor = Actor(state_dim, 256, action_dim, action_max).to(device)
        self.target_actor = copy.deepcopy(self.actor)
        self.actor_optimizer = optim.Adam(params=self.actor.parameters(),
                                          lr=lr)
        self.critic = Critic(state_dim, 256, action_dim).to(device)
        self.target_critic = copy.deepcopy(self.critic)
        self.critic_optimizer = optim.Adam(params=self.critic.parameters(),
                                           lr=lr)

        self.device = device
        self.gamma = gamma
        self.tau = tau
        self.policy_noise = policy_noise
        self.noise_clip = noise_clip
        self.policy_freq = policy_freq

        self.rollout_actor = TD3RolloutActor(state_dim, action_dim, action_max,
                                             exploration_noise)
        self.sync_rollout_actor()

        self.iteration_num = 0

    def train(self, replay_buffer, batch_size=256):
        self.iteration_num += 1

        st, nx_st, ac, rw, mask = replay_buffer.sample(batch_size)
        with torch.no_grad():
            noise = (torch.randn_like(ac) * self.policy_noise).clamp(
                -self.noise_clip, self.noise_clip)
            nx_ac = self.target_actor.forward(nx_st, noise)

            target_q1, target_q2 = self.target_critic.forward(nx_st, nx_ac)
            min_q = torch.min(target_q1, target_q2)
            target_q = rw + mask * self.gamma * min_q

        q1, q2 = self.critic.forward(st, ac)
        critic_loss = F.mse_loss(q1, target_q) + F.mse_loss(q2, target_q)
        self.critic.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        if self.iteration_num % self.policy_freq == 0:
            actor_loss = -self.critic.q1(st, self.actor.forward(st)).mean()
            self.actor.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()

            for param, target_param in zip(self.critic.parameters(),
                                           self.target_critic.parameters()):
                target_param.data.copy_(self.tau * param.data +
                                        (1 - self.tau) * target_param.data)

            for param, target_param in zip(self.actor.parameters(),
                                           self.target_actor.parameters()):
                target_param.data.copy_(self.tau * param.data +
                                        (1 - self.tau) * target_param.data)

        self.sync_rollout_actor()

    def sync_rollout_actor(self):
        for param, target_param in zip(self.actor.parameters(),
                                       self.rollout_actor.parameters()):
            target_param.data.copy_(param.data.cpu())

    def save(self, path):
        torch.save(self.critic.state_dict(), os.path.join(path, 'critic.pth'))
        torch.save(self.target_critic.state_dict(),
                   os.path.join(path, 'target_critic.pth'))
        torch.save(self.critic_optimizer.state_dict(),
                   os.path.join(path, 'critic_optimizer.pth'))

        torch.save(self.actor.state_dict(), os.path.join(path, 'actor.pth'))
        torch.save(self.target_actor.state_dict(),
                   os.path.join(path, 'target_actor.pth'))
        torch.save(self.actor_optimizer.state_dict(),
                   os.path.join(path, 'actor_optimizer.pth'))

    def load(self, path):
        self.critic.load_state_dict(
            torch.load(os.path.join(path, 'critic.pth')))
        self.target_critic.load_state_dict(
            torch.load(os.path.join(path, 'target_critic.pth')))
        self.critic_optimizer.load_state_dict(
            torch.load(os.path.join(path, 'critic_optimizer.pth')))

        self.actor.load_state_dict(torch.load(os.path.join(path, 'actor.pth')))
        self.target_actor.load_state_dict(
            torch.load(os.path.join(path, 'target_actor.pth')))
        self.actor_optimizer.load_state_dict(
            torch.load(os.path.join(path, 'actor_optimizer.pth')))
        self.sync_rollout_actor()
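
The Polyak-averaging loops above appear in almost every example. A small standalone helper that performs the same update, shown only as a refactoring sketch; for instance, soft_update(self.critic, self.target_critic, self.tau) replaces the first loop in train():

import torch


def soft_update(source, target, tau):
    """Polyak averaging: target <- tau * source + (1 - tau) * target.
    Equivalent to the explicit parameter loops used in the examples."""
    with torch.no_grad():
        for s_param, t_param in zip(source.parameters(), target.parameters()):
            t_param.data.copy_(tau * s_param.data + (1.0 - tau) * t_param.data)
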
Code example #9
class TD3(object):
    """Agent class that handles the training of the networks
    and provides outputs as actions.
    """

    def __init__(self):
        state_dim = cons.STATE_DIM.flatten().shape[0]
        action_dim = cons.ACTION_DIM
        self.actor = Actor(state_dim, action_dim, cons.MAX_ACTION).to(cons.DEVICE)
        self.actor_target = Actor(state_dim,  action_dim, cons.MAX_ACTION).to(cons.DEVICE)
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=3e-4)  # or 1e-3

        self.critic = Critic(state_dim,  action_dim).to(cons.DEVICE)
        self.critic_target = Critic(state_dim,  action_dim).to(cons.DEVICE)
        self.critic_target.load_state_dict(self.critic.state_dict())
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=3e-4)  # or 1e-3

        self.total_it = 0
        self.critic_loss_plot = []
        self.actor_loss_plot = []

    def select_action(self, state, noise=cons.POLICY_NOISE):
        """Select an appropriate action from the agent policy
            Args:
                state (array): current state of environment
                noise (float): how much noise to add to actions
            Returns:
                action (list): nn action results
        """
        state = torch.FloatTensor(state).to(cons.DEVICE)
        action = self.actor(state)
        # action space noise introduces noise to change the likelihoods of each action the agent might take
        if noise != 0:
            # creates tensor of gaussian noise
            noise = torch.clamp(torch.randn(14, dtype=torch.float32, device='cuda') * noise,
                                min=-cons.NOISE_CLIP, max=cons.NOISE_CLIP)
        action = action + noise
        action = torch.clamp(action, min=cons.MIN_ACTION, max=cons.MAX_ACTION)
        return action

    def train(self, replay_buffer, iterations):
        """Train and update actor and critic networks
            Args:
                replay_buffer (ReplayBuffer): buffer for experience replay
                iterations (int): how many times to run training
            Return:
                actor_loss (float): loss from actor network
                critic_loss (float): loss from critic network
        """
        for it in range(iterations):
            self.total_it += 1  # keep track of the total training iterations
            # Sample replay buffer (priority replay)
            # choose type of replay
            if cons.PRIORITY:
                state, action, reward, next_state, done, weights, indexes = replay_buffer.sample(cons.BATCH_SIZE,
                                                                                     beta=cons.BETA_SCHED.value(it))
            else:
                state, action, reward, next_state, done = replay_buffer.sample(cons.BATCH_SIZE)

            state = torch.from_numpy(state).float().to(cons.DEVICE)                 # torch.Size([100, 14])
            next_state = torch.from_numpy(next_state).float().to(cons.DEVICE)       # torch.Size([100, 14])
            action = torch.from_numpy(action).float().to(cons.DEVICE)               # torch.Size([100, 14])
            reward = torch.as_tensor(reward, dtype=torch.float32).to(cons.DEVICE)   # torch.Size([100])
            done = torch.as_tensor(done, dtype=torch.float32).to(cons.DEVICE)       # torch.Size([100])

            with torch.no_grad():
                # select an action according to the policy and add clipped noise
                next_action = self.actor_target(next_state)
                noise = torch.clamp(torch.randn((100, 14), dtype=torch.float32, device='cuda') * cons.POLICY_NOISE,
                                    min=-cons.NOISE_CLIP, max=cons.NOISE_CLIP)
                next_action = torch.clamp((next_action + noise), min=cons.MIN_ACTION, max=cons.MAX_ACTION)

                # Compute the target Q value with the target critics on (s', a')
                target_q1, target_q2 = self.critic_target(next_state.float(), next_action.float())
                target_q = torch.min(target_q1, target_q2)
                gamma = torch.ones((100, 1), dtype=torch.float32, device='cuda')
                gamma = gamma.new_full((100, 1), cons.GAMMA)
                target_q = reward.unsqueeze(1) + (done.unsqueeze(1) * gamma * target_q).detach()

            # get current Q estimates
            current_q1, current_q2 = self.critic(state.float(), action.float())

            # compute critic loss
            critic_loss = F.mse_loss(current_q1, target_q) + F.mse_loss(current_q2, target_q)
            cons.TD3_REPORT.write_critic_loss(self.total_it, it, critic_loss)

            # optimize the critic
            self.critic_optimizer.zero_grad()
            critic_loss.backward()
            self.critic_optimizer.step()

            # using the minimum of the q values as the weight, use min to prevent overestimation
            if cons.PRIORITY:
                new_priorities = torch.flatten(torch.min(current_q1, current_q2))
                # convert any negative priorities to a minimum value, can't have a negative priority
                new_priorities = torch.clamp(new_priorities, min=0.0000001).tolist()  # convert to a list for storage
                replay_buffer.update_priorities(indexes, new_priorities)

            # delayed policy updates
            if it % cons.POLICY_FREQ == 0:  # update the actor policy less frequently

                # compute the actor loss (no detach here: the gradient has to
                # flow back through the actor's action)
                q_action = self.actor(state)
                actor_loss = -self.critic.get_q(state, q_action).mean()
                cons.TD3_REPORT.write_actor_loss(self.total_it, it, actor_loss, 1)

                # optimize the actor
                self.actor_optimizer.zero_grad()
                actor_loss.backward()
                self.actor_optimizer.step()
                self.actor_loss_plot.append(actor_loss.item())

                # Update the frozen target models
                for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()):
                    target_param.data.copy_(cons.TAU * param.data + (1 - cons.TAU) * target_param.data)

                for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()):
                    target_param.data.copy_(cons.TAU * param.data + (1 - cons.TAU) * target_param.data)

    def save(self, filename, directory):
        torch.save(self.actor.state_dict(), '%s/%s_actor.pth' % (directory, filename))
        torch.save(self.critic.state_dict(), '%s/%s_critic.pth' % (directory, filename))

    def load(self, filename="best_avg", directory="td3/saves/shared_agent"):
        self.actor.load_state_dict(torch.load('%s/%s_actor.pth' % (directory, filename)))
        self.critic.load_state_dict(torch.load('%s/%s_critic.pth' % (directory, filename)))
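
train() above uses min(Q1, Q2) itself as the new priority. Prioritized replay is more commonly driven by the absolute TD error; a hedged sketch of that alternative as a standalone helper (it is not what this project does):

import torch


def per_priorities(current_q1, current_q2, target_q, eps=1e-6):
    """Conventional PER priority: the absolute one-step TD error of the more
    pessimistic critic, plus a small constant so no priority becomes zero."""
    with torch.no_grad():
        td_error = torch.min((current_q1 - target_q).abs(),
                             (current_q2 - target_q).abs())
    return (td_error.flatten() + eps).tolist()
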
Code example #10
class Agent:
    def __init__(self, state_size, action_size):
        self._state_size = state_size
        self._action_size = action_size

        # Actor network
        self._actor_local = Actor(state_size, action_size).to(device)
        self._actor_target = Actor(state_size, action_size).to(device)
        self._actor_optimizer = optim.Adam(self._actor_local.parameters())

        # Critic network
        self._critic_local = Critic(state_size, action_size).to(device)
        self._critic_target = Critic(state_size, action_size).to(device)
        self._critic_optimizer = optim.Adam(self._critic_local.parameters())

        # Memory
        self._memory = Memory(BUFFER_SIZE)

        # Do equal weights
        self.hard_update(self._actor_local, self._actor_target)
        self.hard_update(self._critic_local, self._critic_target)

    def step(self, state, action, reward, next_state, done):
        self._memory.push((state, action, reward, next_state, done))

        if len(self._memory) > BATCH_SIZE:
            for _ in range(UPDATES_PER_STEP):
                samples = self._memory.sample(BATCH_SIZE)
                self.learn(samples)

    def act(self, state):
        state = torch.from_numpy(state).float().to(device)

        if binom.rvs(1, PROBABILITY_RAND_STEP):
            # scipy's uniform(loc, scale) spans [loc, loc + scale], so use
            # uniform(-1, 2) to draw a random exploration action in [-1, 1]
            action = np.array([uniform(-1, 2).rvs()])
        else:
            self._actor_local.eval()
            with torch.no_grad():
                action = self._actor_local(state).cpu().data.numpy()
            self._actor_local.train()

        return np.clip(action, -1, 1)

    def hard_update(self, local, target):
        for target_param, local_param in zip(target.parameters(),
                                             local.parameters()):
            target_param.data.copy_(local_param.data)

    def soft_update(self, local, target, tau):
        for target_param, local_param in zip(target.parameters(),
                                             local.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1 - tau) * target_param.data)

    def learn(self, samples):

        states, actions, rewards, next_states, dones = samples

        actions_next = self._actor_target(next_states)
        Q_targets_next = self._critic_target(next_states, actions_next)
        Q_targets = rewards + (GAMMA * Q_targets_next * (1 - dones))

        Q_expected = self._critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)

        self._critic_optimizer.zero_grad()
        critic_loss.backward()
        self._critic_optimizer.step()

        actions_pred = self._actor_local(states)
        actor_loss = -self._critic_local(states, actions_pred).mean()

        self._actor_optimizer.zero_grad()
        actor_loss.backward()
        self._actor_optimizer.step()

        self.soft_update(self._critic_local, self._critic_target, TAU)
        self.soft_update(self._actor_local, self._actor_target, TAU)

    def save(self):
        torch.save(self._actor_local.state_dict(), ACTOR_PATH)
        torch.save(self._critic_local.state_dict(), CRITIC_PATH)

    def load(self):
        self._actor_local.load_state_dict(torch.load(ACTOR_PATH))
        self._actor_local.eval()
        self._critic_local.load_state_dict(torch.load(CRITIC_PATH))
        self._critic_local.eval()
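
This agent depends on module-level constants and a Memory class that are not shown. A sketch with placeholder values and a minimal uniform-replay Memory matching how learn() unpacks its samples; all values and names below are assumptions, not the original project's configuration:

import random
from collections import deque

import numpy as np
import torch

# Placeholder values only; the original project's constants are not shown.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
BUFFER_SIZE = int(1e5)
BATCH_SIZE = 128
UPDATES_PER_STEP = 1
PROBABILITY_RAND_STEP = 0.05   # chance of taking a random exploration step
GAMMA = 0.99
TAU = 1e-3
ACTOR_PATH = "actor.pth"
CRITIC_PATH = "critic.pth"


class Memory:
    """Uniform replay memory returning batched tensors, matching how
    learn() above unpacks (states, actions, rewards, next_states, dones)."""

    def __init__(self, buffer_size):
        self.buffer = deque(maxlen=buffer_size)

    def push(self, transition):
        self.buffer.append(transition)

    def sample(self, batch_size):
        batch = random.sample(self.buffer, k=batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)

        def to_tensor(rows):
            return torch.from_numpy(
                np.vstack(rows).astype(np.float32)).to(device)

        return (to_tensor(states), to_tensor(actions), to_tensor(rewards),
                to_tensor(next_states), to_tensor(dones))

    def __len__(self):
        return len(self.buffer)
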