Code example #1
    def __init__(self, state_size, action_size, random_seed=1):
        """Initialize an Agent object.

        Params
        ======
                state_size (int): dimension of each state
                action_size (int): dimension of each action
                random_seed (int): random seed
        """

        # Store parameters
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.epsilon = EPSILON
        self.lr_actor = LR_ACTOR
        self.lr_critic = LR_CRITIC
        self.lr_decay = WEIGHT_DECAY
        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=self.lr_actor)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=self.lr_critic)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        self.timestep = 0
        # Replay memory
        self.memory = FifoMemory(BUFFER_SIZE, BATCH_SIZE)
        # Success memory holds samples that led to a positive reward
        self.memory_success = FifoMemory(int(BUFFER_SIZE), int(BATCH_SIZE))
        # Rolling short-term memory of the 5 most recent samples
        self.memory_short = FifoMemory(5, 5)
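
FifoMemory is not defined in these snippets. A minimal sketch of a bounded FIFO buffer matching the interface the examples rely on (add(), sample(), clear(), len(), a .samples sequence of experience tuples and a .batch_size attribute) is shown below; the Experience field names, the tensor conversion and the device handling are assumptions, not the original implementation.

import random
from collections import deque, namedtuple

import numpy as np
import torch

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

Experience = namedtuple("Experience",
                        ["state", "action", "reward", "next_state", "done"])


class FifoMemory:
    """Bounded FIFO experience buffer (illustrative sketch, not the original)."""

    def __init__(self, buffer_size, batch_size):
        self.samples = deque(maxlen=buffer_size)  # oldest entries drop out first
        self.batch_size = batch_size

    def add(self, state, action, reward, next_state, done):
        self.samples.append(Experience(state, action, reward, next_state, done))

    def clear(self):
        self.samples.clear()

    def sample(self):
        """Draw a uniform random minibatch and convert it to tensors on device."""
        batch = random.sample(self.samples, k=self.batch_size)
        states = torch.from_numpy(np.vstack([e.state for e in batch])).float().to(device)
        actions = torch.from_numpy(np.vstack([e.action for e in batch])).float().to(device)
        rewards = torch.from_numpy(np.vstack([e.reward for e in batch])).float().to(device)
        next_states = torch.from_numpy(np.vstack([e.next_state for e in batch])).float().to(device)
        dones = torch.from_numpy(np.vstack([e.done for e in batch]).astype(np.uint8)).float().to(device)
        return states, actions, rewards, next_states, dones

    def __len__(self):
        return len(self.samples)
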
Code example #2
    def __init__(self, state_size, action_size, random_seed=1):
        """Initialize an Agent object.

        Params
        ======
                state_size (int): dimension of each state
                action_size (int): dimension of each action
                random_seed (int): random seed
        """

        # Store parameters
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.epsilon = EPSILON
        self.lr_actor = LR_ACTOR
        self.lr_critic = LR_CRITIC
        self.lr_decay = WEIGHT_DECAY
        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=self.lr_actor)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=self.lr_critic)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = FifoMemory(BUFFER_SIZE, BATCH_SIZE)

        # Success memory stores samples that led to a positive reward
        self.memory_success = FifoMemory(int(BUFFER_SIZE), int(BATCH_SIZE))

        # Rolling sample memory of last 10 samples
        self.memory_short = FifoMemory(10, 10)
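
Both examples above, and several below, reference module-level hyperparameter constants that are defined elsewhere in each repository. The block below shows illustrative placeholder values only, not the original authors' settings.

# Placeholder values for the module-level constants used by these examples.
BUFFER_SIZE = int(1e6)        # replay buffer size
BATCH_SIZE = 128              # minibatch size
GAMMA = 0.99                  # discount factor
TAU = 1e-3                    # soft-update interpolation factor
LR_ACTOR = 1e-4               # actor learning rate
LR_CRITIC = 1e-3              # critic learning rate
WEIGHT_DECAY = 0.0            # L2 weight decay for the critic optimizer
EPSILON = 1.0                 # initial scale of the exploration noise
EPSILON_DECAY = 0.999         # multiplicative decay applied to epsilon in act()
LEARN_NUM_MEMORY = 1          # learning passes per step on the main memory
LEARN_NUM_MEMORY_SUCCESS = 1  # learning passes per step on the success memory
UPDATE_EVERY = 2              # actor/target update period used in example #7
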
Code example #3
    def __init__(self, state_size, action_size, random_seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic One Network (w/ Target Network)
        self.critic_local_one = Critic(state_size, action_size,
                                       random_seed).to(device)
        self.critic_target_one = Critic(state_size, action_size,
                                        random_seed).to(device)
        self.critic_one_optimizer = optim.Adam(
            self.critic_local_one.parameters(),
            lr=LR_CRITIC,
            weight_decay=WEIGHT_DECAY)

        # Critic Two Network (w/ Target Network)
        self.critic_local_two = Critic(state_size, action_size,
                                       random_seed).to(device)
        self.critic_target_two = Critic(state_size, action_size,
                                        random_seed).to(device)
        self.critic_two_optimizer = optim.Adam(
            self.critic_local_two.parameters(),
            lr=LR_CRITIC,
            weight_decay=WEIGHT_DECAY)
        #
        # # Noise process
        # self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed)

        # Counter
        self.t_step = 0

        # learn_counter
        self.learn_ctr = 0
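
The Actor and Critic network classes are imported from each repository's model module, which is not shown. As a point of reference, a minimal deterministic policy network compatible with the Actor constructor calls in these examples (seeded, with optional hidden-layer sizes) might look like the sketch below; the layer sizes and activations are assumptions.

import torch
import torch.nn as nn
import torch.nn.functional as F


class Actor(nn.Module):
    """Deterministic policy network sketch; not the original model definition."""

    def __init__(self, state_size, action_size, seed, fc1_units=256, fc2_units=128):
        super().__init__()
        torch.manual_seed(seed)
        self.fc1 = nn.Linear(state_size, fc1_units)
        self.fc2 = nn.Linear(fc1_units, fc2_units)
        self.fc3 = nn.Linear(fc2_units, action_size)

    def forward(self, state):
        x = F.relu(self.fc1(state))
        x = F.relu(self.fc2(x))
        return torch.tanh(self.fc3(x))  # actions bounded to [-1, 1]
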
Code example #4
    def __init__(self, num_agents, state_size, action_size, random_seed,
                 actor_fc1_units, actor_fc2_units, critic_fcs1_units,
                 critic_fc2_units, buffer_size, batch_size, gamma, tau,
                 lr_actor, lr_critic, weight_decay, ou_mu, ou_theta, ou_sigma,
                 update_every_t_steps, num_of_updates):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
            buffer_size (int) : replay buffer size
            batch_size (int) : minibatch size
            gamma (float) : discount factor
            tau (float) : for soft update of target parameter
            lr_actor (float) : learning rate of the actor 
            lr_critic (float) : learning rate of the critic 
            weight_decay (float) : L2 weight decay
            ou_mu (float) : OUNoise mu
            ou_theta (float) : OUNoise theta
            ou_sigma (float) : OUNoise sigma
            update_every_t_steps (int): timesteps between updates
            num_of_updates (int): num of update passes when updating
        """
        print(
            "[AGENT INFO] DDPG constructor initialized parameters:\n num_agents={} \n state_size={} \n action_size={} \n random_seed={} \n actor_fc1_units={} \n actor_fc2_units={} \n critic_fcs1_units={} \n critic_fc2_units={} \n buffer_size={} \n batch_size={} \n gamma={} \n tau={} \n lr_actor={} \n lr_critic={} \n weight_decay={} \n ou_mu={}\n ou_theta={}\n ou_sigma={}\n update_every_t_steps={}\n num_of_updates={}\n"
            .format(num_agents, state_size, action_size, random_seed,
                    actor_fc1_units, actor_fc2_units, critic_fcs1_units,
                    critic_fc2_units, buffer_size, batch_size, gamma, tau,
                    lr_actor, lr_critic, weight_decay, ou_mu, ou_theta,
                    ou_sigma, update_every_t_steps, num_of_updates))

        self.num_agents = num_agents
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.actor_fc1_units = actor_fc1_units
        self.actor_fc2_units = actor_fc2_units
        self.critic_fcs1_units = critic_fcs1_units
        self.critic_fc2_units = critic_fc2_units
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.lr_actor = lr_actor
        self.lr_critic = lr_critic
        self.weight_decay = weight_decay
        self.ou_mu = ou_mu
        self.ou_theta = ou_theta
        self.ou_sigma = ou_sigma
        self.update_every_t_steps = update_every_t_steps
        self.num_of_updates = num_of_updates

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, random_seed,
                                 actor_fc1_units, actor_fc2_units).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed,
                                  actor_fc1_units, actor_fc2_units).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=lr_actor)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, random_seed,
                                   critic_fcs1_units,
                                   critic_fc2_units).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed,
                                    critic_fcs1_units,
                                    critic_fc2_units).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=lr_critic,
                                           weight_decay=self.weight_decay)

        # Noise process
        self.noise = OUNoise(action_size,
                             random_seed,
                             mu=self.ou_mu,
                             theta=self.ou_theta,
                             sigma=self.ou_sigma)

        # Replay memory
        self.memory = ReplayBuffer(action_size, buffer_size, batch_size,
                                   random_seed)
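
OUNoise is likewise defined outside the snippets: the constructors above pass the action size, a seed and optionally mu/theta/sigma, and the agents call sample() and reset(). A common minimal mean-reverting implementation, not the authors' exact code, is sketched here.

import copy

import numpy as np


class OUNoise:
    """Ornstein-Uhlenbeck noise process (illustrative sketch of the interface used above)."""

    def __init__(self, size, seed, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.rng = np.random.default_rng(seed)
        self.reset()

    def reset(self):
        """Reset the internal state to the long-run mean."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Advance the mean-reverting process one step and return the new state."""
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * self.rng.standard_normal(len(x))
        self.state = x + dx
        return self.state
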
Code example #5
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, num_agents, state_size, action_size, random_seed,
                 actor_fc1_units, actor_fc2_units, critic_fcs1_units,
                 critic_fc2_units, buffer_size, batch_size, gamma, tau,
                 lr_actor, lr_critic, weight_decay, ou_mu, ou_theta, ou_sigma,
                 update_every_t_steps, num_of_updates):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
            buffer_size (int) : replay buffer size
            batch_size (int) : minibatch size
            gamma (float) : discount factor
            tau (float) : for soft update of target parameter
            lr_actor (float) : learning rate of the actor 
            lr_critic (float) : learning rate of the critic 
            weight_decay (float) : L2 weight decay
            ou_mu (float) : OUNoise mu
            ou_theta (float) : OUNoise theta
            ou_sigma (float) : OUNoise sigma
            update_every_t_steps (int): timesteps between updates
            num_of_updates (int): num of update passes when updating
        """
        print(
            "[AGENT INFO] DDPG constructor initialized parameters:\n num_agents={} \n state_size={} \n action_size={} \n random_seed={} \n actor_fc1_units={} \n actor_fc2_units={} \n critic_fcs1_units={} \n critic_fc2_units={} \n buffer_size={} \n batch_size={} \n gamma={} \n tau={} \n lr_actor={} \n lr_critic={} \n weight_decay={} \n ou_mu={}\n ou_theta={}\n ou_sigma={}\n update_every_t_steps={}\n num_of_updates={}\n"
            .format(num_agents, state_size, action_size, random_seed,
                    actor_fc1_units, actor_fc2_units, critic_fcs1_units,
                    critic_fc2_units, buffer_size, batch_size, gamma, tau,
                    lr_actor, lr_critic, weight_decay, ou_mu, ou_theta,
                    ou_sigma, update_every_t_steps, num_of_updates))

        self.num_agents = num_agents
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.actor_fc1_units = actor_fc1_units
        self.actor_fc2_units = actor_fc2_units
        self.critic_fcs1_units = critic_fcs1_units
        self.critic_fc2_units = critic_fc2_units
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.lr_actor = lr_actor
        self.lr_critic = lr_critic
        self.weight_decay = weight_decay
        self.ou_mu = ou_mu
        self.ou_theta = ou_theta
        self.ou_sigma = ou_sigma
        self.update_every_t_steps = update_every_t_steps
        self.num_of_updates = num_of_updates

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, random_seed,
                                 actor_fc1_units, actor_fc2_units).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed,
                                  actor_fc1_units, actor_fc2_units).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=lr_actor)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, random_seed,
                                   critic_fcs1_units,
                                   critic_fc2_units).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed,
                                    critic_fcs1_units,
                                    critic_fc2_units).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=lr_critic,
                                           weight_decay=self.weight_decay)

        # Noise process
        self.noise = OUNoise(action_size,
                             random_seed,
                             mu=self.ou_mu,
                             theta=self.ou_theta,
                             sigma=self.ou_sigma)

        # Replay memory
        self.memory = ReplayBuffer(action_size, buffer_size, batch_size,
                                   random_seed)

        # Make sure the target networks start with the same weights as the local networks
        #self.hard_copy(self.actor_target, self.actor_local)
        #self.hard_copy(self.critic_target, self.critic_local)

    def step(self, states, actions, rewards, next_states, dones, timestep):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        for state, action, reward, next_state, done in zip(
                states, actions, rewards, next_states, dones):
            self.memory.add(state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if (len(self.memory) > self.batch_size
                and timestep % self.update_every_t_steps == 0):
            for _ in range(self.num_of_updates):
                experiences = self.memory.sample()
                self.learn(experiences, self.gamma)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q1_targets_next, Q2_targets_next = self.critic_target(
            next_states, actions_next)
        Q_targets_next = torch.min(Q1_targets_next, Q2_targets_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q1_expected, Q2_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q1_expected, Q_targets) + F.mse_loss(
            Q2_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local.Q1(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, self.tau)
        self.soft_update(self.actor_local, self.actor_target, self.tau)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def hard_copy(self, target, source):
        for target_param, param in zip(target.parameters(),
                                       source.parameters()):
            target_param.data.copy_(param.data)
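
ReplayBuffer, used by the agents above that construct it as ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed), is also defined elsewhere and exposes the same add()/sample()/len() interface. Building on the hypothetical FifoMemory sketch given after example #1, a compact sketch could be:

import random


class ReplayBuffer(FifoMemory):
    """Uniform replay memory sketch; interface inferred from the calls above,
    reusing the hypothetical FifoMemory class sketched after example #1."""

    def __init__(self, action_size, buffer_size, batch_size, seed):
        super().__init__(buffer_size, batch_size)
        self.action_size = action_size
        random.seed(seed)
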
Code example #6
class TD3Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, random_seed=1):
        """Initialize an Agent object.

        Params
        ======
                state_size (int): dimension of each state
                action_size (int): dimension of each action
                random_seed (int): random seed
        """

        # Store parameters
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.epsilon = EPSILON
        self.lr_actor = LR_ACTOR
        self.lr_critic = LR_CRITIC
        self.lr_decay = WEIGHT_DECAY
        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=self.lr_actor)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=self.lr_critic)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = FifoMemory(BUFFER_SIZE, BATCH_SIZE)

        # Success memory stores samples that led to a positive reward
        self.memory_success = FifoMemory(int(BUFFER_SIZE), int(BATCH_SIZE))

        # Rolling sample memory of last 10 samples
        self.memory_short = FifoMemory(10, 10)

    def update_model(self, state, action, reward, next_state, done):
        self.step(state, action, reward, next_state, done)

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""

        # Remember whether the success memory was already usable before this step
        reached = len(self.memory_success) >= BATCH_SIZE

        self.memory.add(state, action, reward, next_state, done)
        self.memory_short.add(state, action, reward, next_state, done)

        # Fill the success memory when the agent receives a positive reward
        if reward > 0.0:
            for i in range(len(self.memory_short)):
                sample = self.memory_short.samples[i]
                self.memory_success.add(sample.state, sample.action,
                                        sample.reward, sample.next_state,
                                        sample.done)
            self.memory_short.clear()

        if not reached and len(self.memory_success) > BATCH_SIZE:
            print("Success memory ready for use!")

        # Train with the complete replay memory
        if len(self.memory) > BATCH_SIZE:
            for i in range(LEARN_NUM_MEMORY):
                experiences = self.memory.sample()
                # delay=0, so the actor and target networks are updated on every call
                self.learn(experiences, 0, GAMMA)

        # Train with the success replay memory
        if len(self.memory_success) > self.memory_success.batch_size:
            for i in range(LEARN_NUM_MEMORY_SUCCESS):
                experiences_success = self.memory_success.sample()
                self.learn(experiences_success, 0, GAMMA)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()

        # TD3 --> Action noise regularisation
        if add_noise:
            action += self.epsilon * self.noise.sample()

        # Clip the action to the valid range [-1, 1]
        clipped_action = np.clip(action, -1, 1)
        
        self.epsilon *= EPSILON_DECAY

        return clipped_action

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, delay, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
                actor_target(state) -> action
                critic_target(state, action) -> Q-value

        Params
        ======
                experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
                delay (int): the actor and target networks are only updated when this is 0
                gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # TD3 --> Using a pair of critic networks (The twin part of the title)

        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next1, Q_targets_next2 = self.critic_target(next_states, actions_next)

        # TD3 --> take the minimum of both critics to avoid overestimation
        # (disabled here; only the first critic's estimate is used)
        # Q_targets_next = torch.min(Q_targets_next1, Q_targets_next2)
        Q_targets_next = Q_targets_next1

        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected1, Q_expected2 = self.critic_local(states, actions)

        # Compute the critic loss; the summed twin loss is commented out and
        # only the first critic's MSE against the target is used
        # critic_loss = F.mse_loss(Q_expected1, Q_targets) + F.mse_loss(Q_expected2, Q_targets)
        critic_loss = F.mse_loss(Q_expected1, Q_targets)
        # minimize loss [TRAIN]
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # TD3 --> Delayed updates of the actor = policy (The delayed part)
        # Compute actor loss
        if delay == 0:
            actions_pred = self.actor_local(states)

            # compute loss [HOW MUCH OFF?]
            actor_loss = -self.critic_local.Q1(states, actions_pred).mean()
            
            # minimize loss [TRAIN]
            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()

            # ----------------------- update target networks ----------------------- #
            self.soft_update(self.critic_local, self.critic_target, TAU)
            self.soft_update(self.actor_local, self.actor_target, TAU)

        # ---------------------------- update noise ---------------------------- #
        self.noise.reset()

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
                local_model: PyTorch model (weights will be copied from)
                target_model: PyTorch model (weights will be copied to)
                tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
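
The TD3Agent above is driven by an external training loop that is not part of the example. A minimal sketch is shown below, assuming a classic Gym-style environment object (reset() returns the state, step() returns a 4-tuple) whose continuous actions lie in [-1, 1]; both are assumptions, not part of the original code.

def train(env, n_episodes=200, max_t=1000, seed=1):
    """Run a simple training loop against the TD3Agent defined above."""
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.shape[0]
    agent = TD3Agent(state_size, action_size, random_seed=seed)

    returns = []
    for episode in range(1, n_episodes + 1):
        state = env.reset()
        agent.reset()                  # reset the OU noise process
        score = 0.0
        for _ in range(max_t):
            action = agent.act(state)  # noisy action from the current policy
            next_state, reward, done, _ = env.step(action)
            agent.step(state, action, reward, next_state, done)
            state = next_state
            score += reward
            if done:
                break
        returns.append(score)
        print("Episode {}\tReturn: {:.2f}".format(episode, score))
    return returns
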
Code example #7
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, random_seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic One Network (w/ Target Network)
        self.critic_local_one = Critic(state_size, action_size,
                                       random_seed).to(device)
        self.critic_target_one = Critic(state_size, action_size,
                                        random_seed).to(device)
        self.critic_one_optimizer = optim.Adam(
            self.critic_local_one.parameters(),
            lr=LR_CRITIC,
            weight_decay=WEIGHT_DECAY)

        # Critic Two Network (w/ Target Network)
        self.critic_local_two = Critic(state_size, action_size,
                                       random_seed).to(device)
        self.critic_target_two = Critic(state_size, action_size,
                                        random_seed).to(device)
        self.critic_two_optimizer = optim.Adam(
            self.critic_local_two.parameters(),
            lr=LR_CRITIC,
            weight_decay=WEIGHT_DECAY)
        #
        # # Noise process
        # self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed)

        # Counter
        self.t_step = 0

        # learn_counter
        self.learn_ctr = 0

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        # self.t_step = (self.t_step + 1) % UPDATE_EVERY
        #
        # if self.t_step == 0:

        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample()
            self.learn(experiences, GAMMA)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += np.random.normal(0, 0.2, size=action.shape)
        return np.clip(action, -1, 1)

#     def act(self, state, add_noise=True):
#         """Returns actions for given state as per current policy."""

#         if self.t_step < WARMUP:
#             action = np.random.normal(scale=0.1, size=(self.action_size))
#         else:
#             state = torch.from_numpy(state).float().to(device)
#             self.actor_local.eval()
#             with torch.no_grad():
#                 action = self.actor_local(state).cpu().data.numpy()
#             self.actor_local.train()
#             if add_noise:
#                 action += np.random.normal(0, 0.1, action.shape)

#         #update counter
#         self.t_step += 1

#         return np.clip(action, -1, 1)

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        noise = torch.randn_like(actions_next).mul(0.2)
        noise = noise.clamp(-0.5, 0.5)
        actions_next = (actions_next + noise).clamp(-1, 1)

        # actions_next = self.actor_target(next_states)
        # actions_next = actions_next + torch.clamp(torch.from_numpy(np.random.normal(loc=0, scale=0.2, size=actions_next.shape)).float().to(device), -0.5, 0.5)
        # actions_next = torch.clamp(actions_next, self.min_size[0], self.max_size[0])

        critic_one_target = self.critic_target_one(next_states, actions_next)
        critic_two_target = self.critic_target_two(next_states, actions_next)

        Q_targets_next = torch.min(critic_one_target, critic_two_target)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Detach so no gradients flow back through the target computation
        Q_targets = Q_targets.detach()

        critic_one_expected = self.critic_local_one(states, actions)
        critic_two_expected = self.critic_local_two(states, actions)

        # Compute both critics' losses against the shared target and minimize their sum
        critic_one_loss = F.mse_loss(critic_one_expected, Q_targets)
        critic_two_loss = F.mse_loss(critic_two_expected, Q_targets)
        critic_loss = critic_one_loss + critic_two_loss

        self.critic_one_optimizer.zero_grad()
        self.critic_two_optimizer.zero_grad()

        critic_loss.backward()

        self.critic_one_optimizer.step()
        self.critic_two_optimizer.step()

        self.learn_ctr = (self.learn_ctr + 1) % UPDATE_EVERY

        if self.learn_ctr != 0:
            return

        # ---------------------------- update actor ---------------------------- #

        # Compute actor loss
        actor_loss = -self.critic_local_one(states,
                                            self.actor_local(states)).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local_one, self.critic_target_one, TAU)
        self.soft_update(self.critic_local_two, self.critic_target_two, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
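
Example #6 calls self.critic_local(states, actions) expecting a (Q1, Q2) pair and uses a separate critic_local.Q1() method for the actor loss, whereas example #7 keeps two independent single-output critics. A hypothetical twin-headed critic matching the first interface, with placeholder layer sizes, could look like this sketch.

import torch
import torch.nn as nn
import torch.nn.functional as F


class TwinCritic(nn.Module):
    """Hypothetical twin Q-network matching the (Q1, Q2) / .Q1() interface used above."""

    def __init__(self, state_size, action_size, seed, fc1_units=256, fc2_units=256):
        super().__init__()
        torch.manual_seed(seed)
        # First Q head
        self.q1_fc1 = nn.Linear(state_size + action_size, fc1_units)
        self.q1_fc2 = nn.Linear(fc1_units, fc2_units)
        self.q1_out = nn.Linear(fc2_units, 1)
        # Second Q head
        self.q2_fc1 = nn.Linear(state_size + action_size, fc1_units)
        self.q2_fc2 = nn.Linear(fc1_units, fc2_units)
        self.q2_out = nn.Linear(fc2_units, 1)

    def Q1(self, state, action):
        """Q-value of the first head only (used for the actor loss)."""
        x = torch.cat([state, action], dim=1)
        x = F.relu(self.q1_fc1(x))
        x = F.relu(self.q1_fc2(x))
        return self.q1_out(x)

    def forward(self, state, action):
        """Return both Q estimates for clipped double-Q learning."""
        x = torch.cat([state, action], dim=1)
        q1 = F.relu(self.q1_fc2(F.relu(self.q1_fc1(x))))
        q2 = F.relu(self.q2_fc2(F.relu(self.q2_fc1(x))))
        return self.q1_out(q1), self.q2_out(q2)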