Code example #1
    def __init__(self, state_size, action_size, random_seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(
            state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(
            state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(
            self.actor_local.parameters(), lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(
            state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(
            state_size, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(
            self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = ReplayBuffer(
            action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)
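Examples #1, #3, #4 and #7 rely on module-level hyperparameters (BUFFER_SIZE, BATCH_SIZE, GAMMA, TAU, LR_ACTOR, LR_CRITIC, WEIGHT_DECAY, UPDATE_EVERY, UPDATE_TIMES) and a global device object defined outside the snippets shown here. The following is a minimal sketch of such a constants block; the concrete values are illustrative assumptions, not the values used by the original projects.

import torch

# Illustrative hyperparameter values (assumptions); the original projects define their own.
BUFFER_SIZE = int(1e6)   # replay buffer size
BATCH_SIZE = 128         # minibatch size
GAMMA = 0.99             # discount factor
TAU = 1e-3               # soft-update interpolation factor
LR_ACTOR = 1e-4          # actor learning rate
LR_CRITIC = 1e-3         # critic learning rate
WEIGHT_DECAY = 0.0       # critic L2 weight decay
UPDATE_EVERY = 20        # environment steps between learning phases
UPDATE_TIMES = 10        # gradient updates per learning phase

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")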
Code example #2
    def __init__(self,
                 dim_obs,
                 dim_act,
                 actor_lr=0.001,
                 critic_lr=0.01,
                 gamma=0.9,
                 capacity=1000,
                 batch_size=64,
                 tau=0.01,
                 hidden_size=64):
        self.gamma = gamma
        self.memory = ReplayMemory(capacity)
        self.batch_size = batch_size
        self.tau = tau
        self.device = 'cuda' if GPU_CONFIG.use_cuda else 'cpu'  # PyTorch expects 'cuda', not 'gpu'
        self.learn_cnt = 0
        self.FloatTensor = th.cuda.FloatTensor if GPU_CONFIG.use_cuda else th.FloatTensor

        self.critic = Critic(dim_obs, dim_act, hidden_size).to(self.device)
        self.actor = Actor(dim_obs, dim_act, hidden_size).to(self.device)
        self.target_critic = Critic(dim_obs, dim_act,
                                    hidden_size).to(self.device)
        self.target_actor = Actor(dim_obs, dim_act,
                                  hidden_size).to(self.device)
        self.target_critic.load_state_dict(self.critic.state_dict())
        self.target_actor.load_state_dict(self.actor.state_dict())
        # for target_param, param in zip(self.target_actor.parameters(), self.actor.parameters()):
        #     target_param.data.copy_(param.data)  # alternative approach: TODO, try it

        self.critic_optimizer = th.optim.Adam(self.critic.parameters(),
                                              lr=critic_lr)
        self.actor_optimizer = th.optim.Adam(self.actor.parameters(),
                                             lr=actor_lr)
Code example #3
    def __init__(self, state_size, action_size, random_seed):
        """
        Initialize the agent
        :param state_size: state space size
        :param action_size: action space size
        :param random_seed: seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        self.hard_copy_weights(self.actor_target, self.actor_local)
        self.hard_copy_weights(self.critic_target, self.critic_local)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed)

        # Iteration counter
        self.step_counter = 0
Code example #4
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, random_seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(
            state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(
            state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(
            self.actor_local.parameters(), lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(
            state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(
            state_size, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(
            self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = ReplayBuffer(
            action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)

    def step(self, state, action, reward, next_state, done, timestep):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        for i in range(20):  # one experience tuple per parallel agent (20 agents)
            self.memory.add(state[i], action[i],
                            reward[i], next_state[i], done[i])

        # Learn every 20 time steps, if enough samples are available in memory
        if timestep % 20 == 0:
            if len(self.memory) > BATCH_SIZE:
                for i in range(10):  # 10 learning passes per update
                    experiences = self.memory.sample()
                    self.learn(experiences, GAMMA)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            for i in range(20):
                action[i] += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value
        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(
                tau*local_param.data + (1.0-tau)*target_param.data)
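The Agent above stores transitions in a ReplayBuffer whose implementation is not part of the snippet. A minimal sketch compatible with the calls made in step() and learn() (constructor signature, add, sample with no arguments, __len__) is shown below; the field names and internal layout are assumptions, and examples #5/#6 use a slightly different ReplayBuffer signature.

import random
from collections import deque, namedtuple

import numpy as np
import torch

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


class ReplayBuffer:
    """Fixed-size buffer of experience tuples (sketch of the interface used above)."""

    def __init__(self, action_size, buffer_size, batch_size, seed):
        self.action_size = action_size
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.experience = namedtuple(
            "Experience", ["state", "action", "reward", "next_state", "done"])
        random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        """Store a single transition."""
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self):
        """Return a random minibatch as float tensors on the global device."""
        experiences = random.sample(self.memory, k=self.batch_size)
        states = torch.from_numpy(np.vstack([e.state for e in experiences])).float().to(device)
        actions = torch.from_numpy(np.vstack([e.action for e in experiences])).float().to(device)
        rewards = torch.from_numpy(np.vstack([e.reward for e in experiences])).float().to(device)
        next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences])).float().to(device)
        dones = torch.from_numpy(np.vstack([e.done for e in experiences]).astype(np.uint8)).float().to(device)
        return (states, actions, rewards, next_states, dones)

    def __len__(self):
        return len(self.memory)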
Code example #5
File: agent.py  Project: alessandroleite/reacher-drl
    def __init__(self,
                 state_size,
                 action_size,
                 n_agents,
                 seed,
                 buffer_size=int(1e6),
                 batch_size=200,
                 lr_actor=1e-4,
                 lr_critic=1e-3,
                 gamma=0.99,
                 weight_decay=0,
                 tau=1e-3,
                 update_frequency=20,
                 n_learns=10):
        """Initialize a DDPG agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            n_agents (int): number of agents
            seed (int): random seed
            buffer_size (int): replay buffer size
            batch_size (int): minibatch size
            lr_actor (float): learning rate of the actor
            lr_critic (float): learning rate of the critic
            gamma (float): discount factor
            weight_decay (float): critic L2 weight decay
            tau (float): value for soft update of target parameters
            update_frequency (int): how many steps to execute between learning phases
            n_learns (int): how many learning passes to run per update
            
        """
        self.state_size = state_size
        self.action_size = action_size
        self.n_agents = n_agents
        self.gamma = gamma
        self.batch_size = batch_size
        self.tau = tau
        self.seed = random.seed(seed)
        self.update_frequency = update_frequency
        self.n_learns = n_learns

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, seed).to(device)
        self.actor_target = Actor(state_size, action_size, seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=lr_actor)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, seed).to(device)
        self.critic_target = Critic(state_size, action_size, seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=lr_critic,
                                           weight_decay=weight_decay)

        # Noise process
        self.noise = Ornstein((n_agents, action_size), seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, buffer_size, seed, device)

        # Initialize the time step (for every update_frequency steps)
        self.t_step = 0
Code example #6
File: agent.py  Project: alessandroleite/reacher-drl
class DDPGAgent:
    """Interacts with and learns from the environment."""
    def __init__(self,
                 state_size,
                 action_size,
                 n_agents,
                 seed,
                 buffer_size=int(1e6),
                 batch_size=200,
                 lr_actor=1e-4,
                 lr_critic=1e-3,
                 gamma=0.99,
                 weight_decay=0,
                 tau=1e-3,
                 update_frequency=20,
                 n_learns=10):
        """Initialize a DDPG agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            n_agents (int): number of agents
            seed (int): random seed
            buffer_size (int): replay buffer size
            batch_size (int): minibatch size
            lr_actor (float): learning rate of the actor
            lr_critic (float): learning rate of the critic
            gamma (float): discount factor
            weight_decay (float): critic L2 weight decay
            tau (float): value for soft update of target parameters
            update_frequency (int): how many steps to execute between learning phases
            n_learns (int): how many learning passes to run per update
            
        """
        self.state_size = state_size
        self.action_size = action_size
        self.n_agents = n_agents
        self.gamma = gamma
        self.batch_size = batch_size
        self.tau = tau
        self.seed = random.seed(seed)
        self.update_frequency = update_frequency
        self.n_learns = n_learns

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, seed).to(device)
        self.actor_target = Actor(state_size, action_size, seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=lr_actor)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, seed).to(device)
        self.critic_target = Critic(state_size, action_size, seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=lr_critic,
                                           weight_decay=weight_decay)

        # Noise process
        self.noise = Ornstein((n_agents, action_size), seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, buffer_size, seed, device)

        # Initialize the time step (for every update_frequency steps)
        self.t_step = 0

    def step(self, states, actions, rewards, next_states, dones):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        for state, action, reward, next_state, done in zip(
                states, actions, rewards, next_states, dones):
            self.memory.add(state, action, reward, next_state, done)

        self.t_step = (self.t_step + 1) % self.update_frequency
        if self.t_step == 0:
            # Learn, if enough samples are available in memory
            for _ in range(self.n_learns):
                if len(self.memory) > self.batch_size:
                    experiences = self.memory.sample(self.batch_size)
                    self.learn(experiences, self.gamma)

    def act(self, states, add_noise=True):
        """Returns actions for given state as per current policy."""
        states = torch.from_numpy(states).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(states).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, self.tau)
        self.soft_update(self.actor_local, self.actor_target, self.tau)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
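The DDPGAgent above exposes act(), step() and reset(), but the snippets do not show how an environment drives them. The sketch below illustrates one possible training loop; the environment object env and its reset()/step() interface, as well as the episode counts, are assumptions for illustration only.

import numpy as np


def train(env, agent, n_episodes=200, max_t=1000):
    """Drive the DDPGAgent against a hypothetical multi-agent environment `env`."""
    scores = []
    for episode in range(1, n_episodes + 1):
        states = env.reset()                     # assumed shape: (n_agents, state_size)
        agent.reset()                            # reset the exploration noise process
        episode_scores = np.zeros(agent.n_agents)
        for _ in range(max_t):
            actions = agent.act(states, add_noise=True)
            next_states, rewards, dones = env.step(actions)  # assumed return values
            agent.step(states, actions, rewards, next_states, dones)
            episode_scores += rewards
            states = next_states
            if np.any(dones):
                break
        scores.append(float(episode_scores.mean()))
    return scores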
Code example #7
class Agent:
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, random_seed):
        """
        Initialize the agent
        :param state_size: state space size
        :param action_size: action space size
        :param random_seed: seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        self.hard_copy_weights(self.actor_target, self.actor_local)
        self.hard_copy_weights(self.critic_target, self.critic_local)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed)

        # Iteration counter
        self.step_counter = 0

    @staticmethod
    def hard_copy_weights(target, source):
        """ copy weights from source to target network"""
        for target_param, param in zip(target.parameters(),
                                       source.parameters()):
            target_param.data.copy_(param.data)

    def step(self, states, actions, rewards, next_states, dones):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        for state, action, reward, next_state, done in zip(
                states, actions, rewards, next_states, dones):
            self.memory.add(state, action, reward, next_state, done)
        self.step_counter += 1

        # Learn, if enough samples are available in memory
        if self.step_counter % UPDATE_EVERY == 0:
            if len(self.memory) > BATCH_SIZE:
                for i in range(0, UPDATE_TIMES):
                    experiences = self.memory.sample()
                    self.learn(experiences, GAMMA)
                self.step_counter = 0

    def act(self, states, add_noise=False):
        """Returns actions for given state as per current policy."""
        states = torch.from_numpy(states).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            actions = self.actor_local(states).cpu().data.numpy()

        self.actor_local.train()
        if add_noise:
            actions += self.noise.sample()
        return np.clip(actions, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """
        Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
        actor_target(state) -> action
        critic_target(state, action) -> Q-value

        :param experiences: tensor of (s,a,r,s') tuples
        :param gamma: discount factor
        :return:
        """

        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """
        Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        :param local_model: local model where weights are copied from
        :param target_model: target model where weights are copied to
        :param tau: soft update rate
        :return:
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
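Several examples construct an OUNoise(action_size, random_seed) (or, in examples #5/#6, an Ornstein((n_agents, action_size), seed)) whose definition is not shown. A minimal Ornstein-Uhlenbeck sketch compatible with the reset()/sample() calls above follows; the mu, theta and sigma defaults are assumptions.

import copy
import random

import numpy as np


class OUNoise:
    """Ornstein-Uhlenbeck process (sketch of the interface used above)."""

    def __init__(self, size, seed, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)   # `size` may be an int or a (n_agents, action_size) tuple
        self.theta = theta
        self.sigma = sigma
        random.seed(seed)
        self.reset()

    def reset(self):
        """Reset the internal state to the mean."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Advance the process one step and return the new state as a noise sample."""
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.standard_normal(self.mu.shape)
        self.state = x + dx
        return self.state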
Code example #8
class DDPG:
    def __init__(self,
                 dim_obs,
                 dim_act,
                 actor_lr=0.001,
                 critic_lr=0.01,
                 gamma=0.9,
                 capacity=1000,
                 batch_size=64,
                 tau=0.01,
                 hidden_size=64):
        self.gamma = gamma
        self.memory = ReplayMemory(capacity)
        self.batch_size = batch_size
        self.tau = tau
        self.device = 'cuda' if GPU_CONFIG.use_cuda else 'cpu'  # PyTorch expects 'cuda', not 'gpu'
        self.learn_cnt = 0
        self.FloatTensor = th.cuda.FloatTensor if GPU_CONFIG.use_cuda else th.FloatTensor

        self.critic = Critic(dim_obs, dim_act, hidden_size).to(self.device)
        self.actor = Actor(dim_obs, dim_act, hidden_size).to(self.device)
        self.target_critic = Critic(dim_obs, dim_act,
                                    hidden_size).to(self.device)
        self.target_actor = Actor(dim_obs, dim_act,
                                  hidden_size).to(self.device)
        self.target_critic.load_state_dict(self.critic.state_dict())
        self.target_actor.load_state_dict(self.actor.state_dict())
        # for target_param, param in zip(self.target_actor.parameters(), self.actor.parameters()):
        #     target_param.data.copy_(param.data)  # alternative approach: TODO, try it

        self.critic_optimizer = th.optim.Adam(self.critic.parameters(),
                                              lr=critic_lr)
        self.actor_optimizer = th.optim.Adam(self.actor.parameters(),
                                             lr=actor_lr)

    def learn(self):
        # sample batch from all memory
        transitions = self.memory.sample(self.batch_size)
        batch = Experience(*zip(*transitions))  # class(list)
        obs_batch = self.FloatTensor(np.array(batch.obs))  # shape: (batch_size, dim_obs)
        # obs_batch = th.tensor(obs_batch, device=self.device, dtype=th.float)  # alternative approach
        logger.debug("obs_batch: {}".format(obs_batch.shape))
        action_batch = self.FloatTensor(np.array(batch.action))  # (batch, 1)
        logger.debug('action batch: {}'.format(action_batch.shape))
        reward_batch = self.FloatTensor(np.array(batch.reward)).view(
            self.batch_size, 1)  # (batch, 1)
        logger.debug('reward_batch: {}'.format(reward_batch.shape))
        next_obs_batch = self.FloatTensor(np.array(batch.next_obs))
        done_batch = self.FloatTensor(np.array(batch.done)).view(
            self.batch_size, 1)  # (batch, 1)
        logger.debug('done_batch: {}'.format(done_batch))

        # c loss
        self.critic_optimizer.zero_grad()
        q_eval = self.critic(obs_batch, action_batch)
        next_action = self.target_actor(next_obs_batch).detach()
        q_next = self.target_critic(next_obs_batch, next_action).detach()
        q_target = reward_batch + self.gamma * q_next * (1 - done_batch)
        c_loss = nn.MSELoss()(q_eval, q_target)
        c_loss.backward()
        self.critic_optimizer.step()

        # a loss
        self.actor_optimizer.zero_grad()
        current_action = self.actor(obs_batch)
        policy_loss = self.critic(obs_batch, current_action)
        a_loss = -policy_loss.mean()
        a_loss.backward()
        self.actor_optimizer.step()

        # soft update actor_target, critic_target
        soft_update(self.target_critic, self.critic, self.tau)
        soft_update(self.target_actor, self.actor, self.tau)

        a_loss = a_loss.detach().cpu().numpy() if GPU_CONFIG.use_cuda else a_loss.detach().numpy()
        c_loss = c_loss.detach().cpu().numpy() if GPU_CONFIG.use_cuda else c_loss.detach().numpy()
        return a_loss, c_loss

    @th.no_grad()
    def select_action(self, obs):
        obs = self.FloatTensor(obs).unsqueeze(0)
        action = self.actor(obs).detach()
        action = action.cpu().numpy() if GPU_CONFIG.use_cuda else action.numpy()
        return action
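The DDPG class above depends on a module-level Experience tuple, a ReplayMemory and a soft_update() helper defined elsewhere in its project. The sketch below is consistent with how they are called in learn(); the push() method name and internal details are assumptions.

import random
from collections import deque, namedtuple

# Field names follow the attributes accessed in learn(): obs, action, reward, next_obs, done.
Experience = namedtuple('Experience', ('obs', 'action', 'reward', 'next_obs', 'done'))


class ReplayMemory:
    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    def push(self, *args):
        """Store one transition as an Experience tuple (method name is an assumption)."""
        self.buffer.append(Experience(*args))

    def sample(self, batch_size):
        """Return a list of uniformly sampled transitions."""
        return random.sample(self.buffer, batch_size)

    def __len__(self):
        return len(self.buffer)


def soft_update(target, source, tau):
    """theta_target = tau * theta_source + (1 - tau) * theta_target."""
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(tau * param.data + (1.0 - tau) * target_param.data)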
Code example #9
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, device, memory, config):
        """Initialize an Agent object.
        
        Params
        ======
            device (object): hardware device to run on CPU or GPU
            memory (object): memory for replay buffer
            config (dict)
                - "state_size": dimension of each state
                - "action_size": dimension of each action
                - "buffer_size": replay buffer size
                - "batch_size": minibatch size
                - "random_seed": random seed
                - "gamma": discount factor
                - "tau": for soft update of target parameters
                - "weight_decay": L2 weight decay
                - "learn_every": learn from replay buffer every time step
                - "learn_batch_size": number of batches to learn from replay buffer every learn_every time step
                - "grad_clip": gradient value to clip at for critic
                - "eps_start": starting value of epsilon, for epsilon-greedy action selection
                - "eps_end": minimum value of epsilon
                - "eps_decay": multiplicative factor (per episode) for decreasing epsilon
                - "print_every": Print average every x episode,
                - "episode_steps": Maximum number of steps to run for each episode
                - "mu": mu for noise
                - "theta": theta for noise 
                - "sigma": sigma for noise
                - "actor": actor specific config object
                    - "fc":  array of input sizes for hidden layers
                    - "learning_rate": learning rate 
                - "critic": actor specific config object
                    - "fc":  array of input sizes for hidden layers
                    - "learning_rate": learning rate
        """
        self.num_agents = config['num_agents']
        self.state_size = config['state_size']
        self.action_size = config['action_size']
        if config['random_seed'] is not None:
            self.seed = random.seed(config['random_seed'])
        else:
            self.seed = random.seed()
        self.eps = config['eps_start']
        self.eps_decay = config['eps_decay']
        self.eps_end = config['eps_end']

        self.device = device
        # Replay memory
        self.memory = memory
        self.batch_size = config['batch_size']
        self.gamma = config['gamma']
        self.tau = config['tau']
        self.lr_actor = config['actor']['learning_rate']
        self.lr_critic = config['critic']['learning_rate']
        self.weight_decay = config['weight_decay']
        self.learn_every = config['learn_every']
        self.learn_batch_size = config['learn_batch_size']
        self.grad_clip = config['grad_clip']

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(config).to(self.device)
        self.actor_target = Actor(config).to(self.device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=self.lr_actor)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(config).to(self.device)
        self.critic_target = Critic(config).to(self.device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=self.lr_critic,
                                           weight_decay=self.weight_decay)

        # Noise process
        self.noise = OUNoise(config)

    def step(self, states, actions, rewards, next_states, dones, timestep):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        for i in range(self.num_agents):
            self.memory.add(states[i], actions[i], rewards[i], next_states[i],
                            dones[i])

        # Learn every learn_every time steps, if enough samples are available in memory
        if len(self.memory) > self.batch_size and timestep % self.learn_every == 0:
            for i in range(self.learn_batch_size):
                experiences = self.memory.sample()
                self.learn(experiences)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(self.device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.eps * self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def reset_episode(self):
        self.reset()
        self.memory.reset_episode()

    def learn_best_episode(self):
        self.learn(self.memory.sample(True))

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        # gradient clipping for critic
        if self.grad_clip > 0:
            torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(),
                                           self.grad_clip)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target)
        self.soft_update(self.actor_local, self.actor_target)

        if self.eps_decay > 0:
            self.eps = max(self.eps_end,
                           self.eps - self.eps_decay)  # decrease epsilon
            self.reset()

    def soft_update(self, local_model, target_model):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(self.tau * local_param.data +
                                    (1.0 - self.tau) * target_param.data)
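The Agent above is configured entirely through a config dict. An illustrative dict covering the keys read in __init__ and listed in the docstring might look like the following; every value here is an assumption, chosen only to show the expected structure.

config = {
    'num_agents': 20,
    'state_size': 33,
    'action_size': 4,
    'buffer_size': int(1e6),
    'batch_size': 128,
    'random_seed': 0,
    'gamma': 0.99,
    'tau': 1e-3,
    'weight_decay': 0.0,
    'learn_every': 20,
    'learn_batch_size': 10,
    'grad_clip': 1.0,
    'eps_start': 1.0,
    'eps_end': 0.05,
    'eps_decay': 1e-6,
    'print_every': 100,
    'episode_steps': 1000,
    'mu': 0.0,
    'theta': 0.15,
    'sigma': 0.2,
    'actor': {'fc': [400, 300], 'learning_rate': 1e-4},
    'critic': {'fc': [400, 300], 'learning_rate': 1e-3},
}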