class DDPGAgent(Agent):
    def __init__(self, env, config: DDPGConfig):
        super().__init__(env)
        self.config = config
        self.replay_buffer = ReplayBuffer(config.buffer_size,
                                          config.batch_size)

        # Actor
        self.actor_current = Actor(env.state_size, env.action_size,
                                   config.fc1_units,
                                   config.fc2_units).to(device)
        self.actor_target = Actor(env.state_size, env.action_size,
                                  config.fc1_units,
                                  config.fc2_units).to(device)
        self.actor_optimizer = torch.optim.Adam(
            self.actor_current.parameters(), lr=config.learning_rate)

        # Critic
        self.critic_current = Critic(env.state_size, env.action_size,
                                     config.fc1_units,
                                     config.fc2_units).to(device)
        self.critic_target = Critic(env.state_size, env.action_size,
                                    config.fc1_units,
                                    config.fc2_units).to(device)
        self.critic_optimizer = torch.optim.Adam(
            self.critic_current.parameters(), lr=config.learning_rate)

        self.metrics = Metrics()

    def restore(self, actor_file, critic_file):
        self.actor_current.load_state_dict(torch.load(actor_file))
        self.critic_current.load_state_dict(torch.load(critic_file))

    def compute_action(self, state, epsilon=0):
        action = self.actor_current.action_values_for(state)
        if np.random.random() < epsilon:
            action += np.random.randn(self.env.action_size) * epsilon
            action = np.clip(action, -1, 1)
        return action

    def train(self,
              n_steps,
              update_every,
              print_every,
              epsilon_init=1.0,
              epsilon_decay=0.995,
              epsilon_min=0.01):
        epsilon = epsilon_init
        state = self._warmup(epsilon)
        self.metrics.plot()

        for t_step in range(1, n_steps + 1):
            state = self._step(state, epsilon)
            epsilon = max(epsilon_min, epsilon * epsilon_decay)

            if t_step % update_every == 0:
                self._batch_train()
                if self._check_solved():
                    break

            if t_step % print_every == 0:
                print(f"Step #{t_step}" +
                      f", Running score {self.metrics.running_score():.2f}" +
                      f", Total episodes {self.metrics.episode_count}")

    def _warmup(self, epsilon):
        state = self.env.reset(train_mode=True)
        needed_experiences = max(
            0, self.replay_buffer.batch_size - len(self.replay_buffer))
        for i in range(needed_experiences):
            state = self._step(state, epsilon)
        return state

    def _step(self, state, epsilon):
        action = self.compute_action(state, epsilon)
        next_state, reward, done = self.env.step(action)
        self.replay_buffer.add(
            Experience(state, action, reward, next_state, done))
        self.metrics.on_step(reward, done)
        if done:
            return self.env.reset(train_mode=True)
        return next_state

    def _batch_train(self):
        states, actions, rewards, next_states, dones = self.replay_buffer.sample(
        )

        # Update Critic

        target_actions_next = self.actor_target(next_states)
        target_values_next = self.critic_target(
            next_states, target_actions_next).detach().max(1)[0].unsqueeze(1)
        target_values = rewards + (self.config.gamma * target_values_next *
                                   (1 - dones))
        expected_values = self.critic_current(states, actions)

        critic_loss = F.mse_loss(expected_values, target_values)
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()
        self.critic_target.soft_update(self.critic_current, self.config.tau)

        # Update Actor

        current_actions = self.actor_current(states)
        actor_loss = -self.critic_current(states, current_actions).mean()
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()
        self.actor_target.soft_update(self.actor_current, self.config.tau)

    def _check_solved(self):
        if self.metrics.running_score() >= 30:
            print(
                f"\nEnvironment solved in {self.metrics.episode_count} episodes!\t"
                + f"Average Score: {self.metrics.running_score():.2f}")
            torch.save(self.actor_current.state_dict(), "actor_model.pt")
            torch.save(self.critic_current.state_dict(), "critic_model.pt")
            return True

        return False
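
# A self-contained sketch of the bootstrapped target used in _batch_train() above:
# y = r + gamma * Q_target(s', mu_target(s')) * (1 - done). The tiny networks and
# the random batch below are illustrative stand-ins, not the Actor/Critic classes
# used elsewhere in this file.
import torch
import torch.nn as nn

state_size, action_size, batch = 4, 2, 8
actor_target_demo = nn.Sequential(nn.Linear(state_size, 16), nn.ReLU(),
                                  nn.Linear(16, action_size), nn.Tanh())
critic_target_demo = nn.Sequential(nn.Linear(state_size + action_size, 16), nn.ReLU(),
                                   nn.Linear(16, 1))

rewards = torch.randn(batch, 1)
next_states = torch.randn(batch, state_size)
dones = torch.randint(0, 2, (batch, 1)).float()
gamma = 0.99

with torch.no_grad():                                    # targets are never back-propagated through
    next_actions = actor_target_demo(next_states)        # mu_target(s')
    q_next = critic_target_demo(torch.cat([next_states, next_actions], dim=1))  # Q_target(s', a')
    targets = rewards + gamma * q_next * (1 - dones)     # zero bootstrap on terminal states

print(targets.shape)  # torch.Size([8, 1])
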
state_size = env.state_size
action_size = env.action_size
# Note: `states` and `num_agents` are assumed to come from an earlier environment reset cell (not shown here)
print('There are {} agents.'.format(states.shape[0]))
print('Each agent observes a state with length: {}'.format(state_size))
print('Each agent performs an action of size: {}'.format(action_size))
print('The state for the first agent looks like:', states[0])
print('The state shape looks like:', states.shape)

####################################################################################################

BUFFER_SIZE = int(1e6)
BATCH_SIZE = 256
random_seed = 0

# Local and Target Actor Networks
actor_local = Actor(state_size, action_size, random_seed)
actor_target = Actor(state_size, action_size, random_seed)

# Local and Target Critic Networks
state_action_size = state_size + action_size
critic_local = Critic(num_agents * state_action_size, num_agents, random_seed)
critic_target = Critic(num_agents * state_action_size, num_agents, random_seed)

# Noise processes
noise_process1 = OUNoise(action_size,
                         random_seed,
                         mu=0.,
                         theta=0.15,
                         sigma=0.1)
noise_process2 = OUNoise(action_size,
                         random_seed,
                         mu=0.,
                         theta=0.15,
                         sigma=0.1)
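
# The OUNoise class used throughout these snippets is not shown; the following is a
# minimal, commonly used Ornstein-Uhlenbeck implementation matching the
# (size, seed, mu, theta, sigma) signature. It is a sketch, not the exact original.
import copy
import numpy as np


class OUNoiseSketch:
    """Ornstein-Uhlenbeck process: dx = theta * (mu - x) + sigma * N(0, 1)."""

    def __init__(self, size, seed, mu=0.0, theta=0.15, sigma=0.1):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.rng = np.random.RandomState(seed)
        self.reset()

    def reset(self):
        """Reset the internal state to the long-run mean."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Advance the process by one step and return the new noise value."""
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * self.rng.standard_normal(self.mu.shape)
        self.state = x + dx
        return self.state

demo_noise = OUNoiseSketch(size=2, seed=0)
print(demo_noise.sample(), demo_noise.sample())
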
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, num_agents, random_seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        self.seed = random.seed(random_seed)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        self.copy_weights(self.actor_local, self.actor_target)
        self.copy_weights(self.critic_local, self.critic_target)

        print("\nActor network...\n", self.actor_local)
        print("\nCritic network...\n", self.critic_local)

        # Noise process
        self.noise = OUNoise(self.num_agents * action_size, random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed)

    @classmethod
    def copy_weights(cls, src, dst):
        """Clones the weights from the source to the target"""
        for src_wts, dst_wts in zip(src.parameters(), dst.parameters()):
            dst_wts.data.copy_(src_wts.data)

    def step(self, states, actions, rewards, next_states, is_dones):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward

        # Add one experience tuple per agent to the replay buffer
        for state, action, reward, next_state, is_done in zip(
                states, actions, rewards, next_states, is_dones):
            self.memory.add(state, action, reward, next_state, is_done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample()
            self.learn(experiences, GAMMA)

    def act(self, state, add_noise=True, weight=1.0):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += weight * self.noise.sample().reshape(
                (-1, self.action_size))
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)

        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
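
# A standalone check of the soft (Polyak) update rule implemented above,
# theta_target <- tau*theta_local + (1 - tau)*theta_target, using two throwaway
# linear layers. Purely illustrative; it does not touch the Agent class.
import torch
import torch.nn as nn

local_demo = nn.Linear(3, 3)
target_demo = nn.Linear(3, 3)
tau = 0.001

before = target_demo.weight.data.clone()
for target_param, local_param in zip(target_demo.parameters(), local_demo.parameters()):
    target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)

expected = tau * local_demo.weight.data + (1.0 - tau) * before
print(torch.allclose(target_demo.weight.data, expected))  # True
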
class Agent():
    """Agent that interacts with and learns from the environment."""

    def __init__(self, id, state_size, action_size, config = Config()):
        """Initialize an Agent object.
        
        Params
        ======
            id (int): id used to identify the agent
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            config (Config): the agents configuration
        """
        self.state_size = state_size
        self.action_size = action_size
        self.id = id

        self.t_step = 0

        self.config = config

        random.seed(config.random_seed)

        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

        # Actor & Target Network 
        self.actor_local = Actor(state_size, action_size, config.random_seed, config.actor_hidden_units, config.use_bn).to(self.device)
        self.actor_target = Actor(state_size, action_size, config.random_seed, config.actor_hidden_units, config.use_bn).to(self.device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=config.lr_actor)

        # Critic & Target Network
        self.critic_local = Critic(state_size, action_size, config.random_seed, config.critic_hidden_units, config.use_bn).to(self.device)
        self.critic_target = Critic(state_size, action_size, config.random_seed, config.critic_hidden_units, config.use_bn).to(self.device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=config.lr_critic, weight_decay=config.weight_decay)

        # Noise process
        self.noise = OUNoise(action_size, config.random_seed, config.noise_mu, config.noise_theta, config.noise_sigma)
        
        # Replay memory
        if config.use_per:
            self.memory = NaivePrioritizedReplayBuffer(action_size, config.buffer_size, config.batch_size, config.random_seed, config.per_alpha,config.per_epsilon)
        else:
            self.memory = ReplayBuffer(action_size, config.buffer_size, config.batch_size, config.random_seed)
    
    def step(self, state, action, reward, next_state, done, beta=None):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every n time steps.
        self.t_step = (self.t_step + 1) % self.config.update_n_step
        if self.t_step != 0:
            return

        # If enough samples are available in memory, get random subset and learn
        if len(self.memory) > self.config.batch_size:
            if self.config.use_per:
                assert beta is not None
                experiences, weights = self.memory.sample(beta)
                states = torch.from_numpy(np.vstack([e.state for e in experiences if e is not None])).float().to(self.device)
                actions = torch.from_numpy(np.vstack([e.action for e in experiences if e is not None])).float().to(self.device)
                rewards = torch.from_numpy(np.vstack([e.reward for e in experiences if e is not None])).float().to(self.device)
                next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences if e is not None])).float().to(self.device)
                dones = torch.from_numpy(np.vstack([e.done for e in experiences if e is not None]).astype(np.uint8)).float().to(self.device)
                weights = torch.from_numpy(np.vstack(weights)).float().to(self.device)

                experiences = (states, actions, rewards, next_states, dones)
                self.learn(experiences, self.config.gamma, weights)
            else:
                experiences = self.memory.sample()

                states = torch.from_numpy(np.vstack([e.state for e in experiences if e is not None])).float().to(self.device)
                actions = torch.from_numpy(np.vstack([e.action for e in experiences if e is not None])).float().to(self.device)
                rewards = torch.from_numpy(np.vstack([e.reward for e in experiences if e is not None])).float().to(self.device)
                next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences if e is not None])).float().to(self.device)
                dones = torch.from_numpy(np.vstack([e.done for e in experiences if e is not None]).astype(np.uint8)).float().to(self.device)

                experiences = (states, actions, rewards, next_states, dones)
                self.learn(experiences, self.config.gamma)


    def act(self, state):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(self.device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if self.config.add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma, weights=None):
        """
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
            weights (array_like): importance-sampling weights that compensate for the non-uniform
                                    sampling (used only with prioritized experience replay)
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)

        if self.config.use_per:
            td_error = Q_expected - Q_targets
            critic_loss = (td_error) ** 2
                
            critic_loss = critic_loss * weights
            critic_loss = critic_loss.mean()

            self.memory.update_priorities(np.hstack(td_error.detach().cpu().numpy()))

        else:
            critic_loss = F.mse_loss(Q_expected, Q_targets)

        # Minimize the loss (gradients are clipped after backward(), before the optimizer step)
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ------------------- update target networks ------------------- #
        self.soft_update(self.critic_local, self.critic_target, self.config.tau)
        self.soft_update(self.actor_local, self.actor_target, self.config.tau)                    

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)

    def getId(self):
        """ Return the ID of the agent """
        return self.id 

    def summary(self):
        """ Return a brief summary of the agent"""
        s = 'DDPG Agent {}:\n'.format(self.id)
        s += self.config.__str__()
        s += self.actor_local.__str__()
        s += self.critic_local.__str__()
        return s
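
# The PER variant above expects a beta value to be passed into step(). This sketch
# shows the usual annealing schedule (beta -> 1 over training) and how
# importance-sampling weights are typically formed from sampling probabilities.
# The schedule parameters here are illustrative assumptions, not values taken from
# the code above.
import numpy as np

def beta_by_step(t_step, beta_start=0.4, beta_steps=100_000):
    """Linearly anneal beta from beta_start to 1.0 over beta_steps environment steps."""
    return min(1.0, beta_start + t_step * (1.0 - beta_start) / beta_steps)

# Importance-sampling weights for a sampled minibatch, normalised by the maximum
# weight so that updates are only ever scaled down.
probs = np.array([0.4, 0.3, 0.2, 0.1])   # sampling probabilities of the sampled transitions
buffer_len = 1000
beta = beta_by_step(t_step=50_000)
weights = (buffer_len * probs) ** (-beta)
weights /= weights.max()

print(round(beta, 2), np.round(weights, 3))
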
class Agent():
	''' DDPG agent '''
	def __init__(self, state_size, action_size, num_agents, seed, actor_hidden_layers, critic_hidden_layers, use_batch_norm=False, use_noise=False):
		super(Agent, self).__init__()
		
		self.state_size = state_size
		self.action_size = action_size
		
		self.random_seed = random.seed(seed)
		
		# Actor networks
		self.actor_local = Actor(state_size, action_size, seed, actor_hidden_layers, use_batch_norm).to(device)
		self.actor_target = Actor(state_size, action_size, seed, actor_hidden_layers, use_batch_norm).to(device)
		self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=ACTOR_LR)
		copy_weights(self.actor_local, self.actor_target)
		
		# Critic networks
		self.critic_local = Critic(state_size, action_size, seed, critic_hidden_layers).to(device)
		self.critic_target = Critic(state_size, action_size, seed, critic_hidden_layers).to(device)
		self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=CRITIC_LR)
		copy_weights(self.critic_local, self.critic_target)
		
		# Replay Memory
		self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
		
		# Noise process
		self.noise = OUNoise((num_agents, action_size), seed)
		self.use_noise = use_noise

		self.t_step = 0

	def step(self, states, actions, rewards, next_states, dones):
		''' Save experience in replay memory, and use random sample from buffer to learn. '''
		for state, action, reward, next_state, done in zip(states, actions, rewards, next_states, dones):
			self.memory.add(state, action, reward, next_state, done)

		# update the time step counter
		self.t_step = (self.t_step + 1) % UPDATE_EVERY
		if self.t_step == 0:
			# time to learn again, provided there are enough samples in memory
			if len(self.memory) > BATCH_SIZE:
				experiences = self.memory.sample()
				self.learn(experiences, GAMMA)

	def act(self, state):
		''' Returns actions for a given state as per current policy '''
		
		# Make current state into a Tensor that can be passed as input to the network
		state = torch.from_numpy(state).float().to(device)

		# Set network in evaluation mode to prevent things like dropout from happening
		self.actor_local.eval()

		# Turn off the autograd engine
		with torch.no_grad():
			# Do a forward pass through the network
			action_values = self.actor_local(state).cpu().data.numpy()

		# Put network back into training mode
		self.actor_local.train()
		
		if self.use_noise:
			action_values += self.noise.sample()

		return np.clip(action_values, -1, 1)
		
	def reset(self):
		''' Reset the noise in the OU process '''
		self.noise.reset()


	def learn(self, experiences, gamma):
		''' Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) '''

		states, actions, rewards, next_states, dones = experiences

		# ------------------------ Update Critic Network ------------------------ #
		next_actions = self.actor_target(next_states)
		Q_targets_prime = self.critic_target(next_states, next_actions)

		# Compute y_i
		Q_targets = rewards + (gamma * Q_targets_prime * (1 - dones))

		# Compute the critic loss
		Q_expected = self.critic_local(states, actions)
		critic_loss = F.mse_loss(Q_expected, Q_targets)
		# Minimise the loss
		self.critic_optimizer.zero_grad() # Reset the gradients to prevent accumulation
		critic_loss.backward()            # Compute gradients
		torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
		self.critic_optimizer.step()      # Update weights

		# ------------------------ Update Actor Network ------------------------- #
		# Compute the actor loss
		actions_pred = self.actor_local(states)
		actor_loss = -self.critic_local(states, actions_pred).mean()

		# Minimise the loss
		self.actor_optimizer.zero_grad()
		actor_loss.backward()
		self.actor_optimizer.step()


		# ------------------------ Update Target Networks ----------------------- #
		self.soft_update(self.critic_local, self.critic_target, TAU)
		self.soft_update(self.actor_local, self.actor_target, TAU)
		
	def soft_update(self, local_model, target_model, tau):
		"""Soft update model parameters.
		θ_target = τ*θ_local + (1 - τ)*θ_target

		Params
		======
			local_model (PyTorch model): weights will be copied from
			target_model (PyTorch model): weights will be copied to
			tau (float): interpolation parameter 
		"""
		for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
			target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
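
# None of the snippets above include the ReplayBuffer itself; this is a minimal
# uniform-sampling buffer in the same spirit (deque of named tuples, sampled into
# torch tensors). The field names and signature are assumptions based on how the
# buffer is used above, not the original class.
import random
from collections import deque, namedtuple

import numpy as np
import torch

ExperienceSketch = namedtuple("ExperienceSketch",
                              ["state", "action", "reward", "next_state", "done"])


class ReplayBufferSketch:
    def __init__(self, buffer_size, batch_size, seed):
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        self.memory.append(ExperienceSketch(state, action, reward, next_state, done))

    def sample(self):
        batch = random.sample(self.memory, k=self.batch_size)
        states = torch.from_numpy(np.vstack([e.state for e in batch])).float()
        actions = torch.from_numpy(np.vstack([e.action for e in batch])).float()
        rewards = torch.from_numpy(np.vstack([e.reward for e in batch])).float()
        next_states = torch.from_numpy(np.vstack([e.next_state for e in batch])).float()
        dones = torch.from_numpy(np.vstack([e.done for e in batch]).astype(np.uint8)).float()
        return states, actions, rewards, next_states, dones

    def __len__(self):
        return len(self.memory)

buffer = ReplayBufferSketch(buffer_size=int(1e5), batch_size=2, seed=0)
for _ in range(4):
    buffer.add(np.random.randn(3), np.random.randn(1), 0.0, np.random.randn(3), False)
states, actions, rewards, next_states, dones = buffer.sample()
print(states.shape)  # torch.Size([2, 3])
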
class Agent:
    """Interacts with and learns from the environment."""
    def __init__(
        self,
        state_size,
        action_size,
        observed_state_size,
        observed_action_size,
        random_seed,
        actor_local_load_filename=None,
        actor_target_load_filename=None,
        critic_local_load_filename=None,
        critic_target_load_filename=None,
    ):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            observed_state_size(int): dimension of the states of all agents
            observed_action_size(int): dimension of the actions of all agents
            random_seed (int): random seed
            actor_local_load_filename   : if given, file holding the initial weights of the local actor NN
            actor_target_load_filename  : if given, file holding the initial weights of the target actor NN
            critic_local_load_filename  : if given, file holding the initial weights of the local critic NN
            critic_target_load_filename : if given, file holding the initial weights of the target critic NN
        """
        self.state_size = state_size
        self.action_size = action_size
        self.observed_state_size = observed_state_size
        self.observed_action_size = observed_action_size
        self.seed = random.seed(random_seed)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=cfg.LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(observed_state_size, observed_action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(observed_state_size, observed_action_size,
                                    random_seed).to(device)

        self.critic_optimizer = optim.Adam(
            self.critic_local.parameters(),
            lr=cfg.LR_CRITIC,
            weight_decay=cfg.WEIGHT_DECAY,
        )
        self.load(
            actor_local_load_filename,
            actor_target_load_filename,
            critic_local_load_filename,
            critic_target_load_filename,
        )

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = ReplayBuffer(cfg.BUFFER_SIZE, cfg.BATCH_SIZE,
                                   random_seed)
        self.t_step = 0
        self.epsilon = cfg.EPSILON
        self.epsilon_decay = cfg.EPSILON_DECAY

    def step(self, states, actions, rewards, next_states, dones, t_step):
        pass

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            self.epsilon = self.epsilon_decay * self.epsilon
            action += self.epsilon * self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        pass

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def save(
        self,
        actor_local_save_filename,
        actor_target_save_filename,
        critic_local_save_filename,
        critic_target_save_filename,
    ):
        torch.save(self.actor_local.state_dict(), actor_local_save_filename)
        torch.save(self.actor_target.state_dict(), actor_target_save_filename)
        torch.save(self.critic_local.state_dict(), critic_local_save_filename)
        torch.save(self.critic_target.state_dict(),
                   critic_target_save_filename)

    def load(
        self,
        actor_local_load_filename,
        actor_target_load_filename=None,
        critic_local_load_filename=None,
        critic_target_load_filename=None,
    ):
        if actor_local_load_filename is not None:
            self.actor_local.load_state_dict(
                torch.load(actor_local_load_filename))
        if actor_target_load_filename is not None:
            self.actor_target.load_state_dict(
                torch.load(actor_target_load_filename))
        if critic_local_load_filename is not None:
            self.critic_local.load_state_dict(
                torch.load(critic_local_load_filename))
        if critic_target_load_filename is not None:
            self.critic_target.load_state_dict(
                torch.load(critic_target_load_filename))
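
# A quick round-trip illustration of the save()/load() pattern above: persist a
# state_dict with torch.save and restore it with load_state_dict. It uses a
# throwaway linear layer and a temporary file rather than the Actor/Critic networks.
import tempfile

import torch
import torch.nn as nn

net = nn.Linear(4, 2)
clone = nn.Linear(4, 2)

with tempfile.NamedTemporaryFile(suffix=".pth", delete=False) as f:
    path = f.name
torch.save(net.state_dict(), path)
clone.load_state_dict(torch.load(path))

print(torch.allclose(net.weight, clone.weight))  # True
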
class Agent():
    def __init__(self, state_size, action_size, random_seed=42):

        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        self.hard_copy_weights(self.actor_target, self.actor_local)
        self.hard_copy_weights(self.critic_target, self.critic_local)

        self.noise = OUNoise(action_size, random_seed)

        #self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)

    def step(self, memory, state, action, reward, next_state, done):

        memory.add(state, action, reward, next_state, done)

        if len(memory) > BATCH_SIZE:
            experiences = memory.sample()
            self.learn(experiences, GAMMA)

    def act(self, state, add_noise=True):

        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):

        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        if GRAD_CLIPPING > 0.0:
            torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(),
                                           GRAD_CLIPPING)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):

        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def hard_copy_weights(self, target, source):

        for target_param, param in zip(target.parameters(),
                                       source.parameters()):
            target_param.data.copy_(param.data)
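
# A compact, self-contained illustration of the actor update used in learn() above:
# the policy is pushed uphill on the critic's value estimate via
# actor_loss = -critic(s, actor(s)).mean(). Tiny stand-in networks only; this is a
# sketch of the update, not the Actor/Critic classes used in these agents.
import torch
import torch.nn as nn
import torch.optim as optim

state_size, action_size, batch = 4, 2, 16
actor_demo = nn.Sequential(nn.Linear(state_size, 32), nn.ReLU(),
                           nn.Linear(32, action_size), nn.Tanh())
critic_demo = nn.Sequential(nn.Linear(state_size + action_size, 32), nn.ReLU(),
                            nn.Linear(32, 1))
actor_optimizer_demo = optim.Adam(actor_demo.parameters(), lr=1e-3)

states = torch.randn(batch, state_size)
actions_pred = actor_demo(states)
actor_loss = -critic_demo(torch.cat([states, actions_pred], dim=1)).mean()

actor_optimizer_demo.zero_grad()
actor_loss.backward()            # gradients flow through the critic into the actor
actor_optimizer_demo.step()      # only the actor's parameters are updated here

print(float(actor_loss))
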