    def __init__(self, state_size, action_size, seed, memory_type):

        # keep params
        self.state_size = state_size
        self.action_size = action_size
        random.seed(seed)
        self.seed = seed

        # Create local and target network
        self.qnn_local = QNetwork(state_size,
                                  action_size,
                                  seed,
                                  fc1_units=74,
                                  fc2_units=74).to(device)
        self.qnn_target = QNetwork(state_size,
                                   action_size,
                                   seed,
                                   fc1_units=74,
                                   fc2_units=74).to(device)

        # Init optimizer
        self.optimizer = optim.Adam(self.qnn_local.parameters(), lr=LR)

        # Replay Memory
        if memory_type == 0:
            print("Using FifoMemory...")
            self.memory = FifoMemory(BUFFER_SIZE)
        else:
            print("Using simple DequeMemory...")
            self.memory = DequeMemory(action_size, BUFFER_SIZE, BATCH_SIZE,
                                      seed)

        self.t_step = 0
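
The snippets on this page construct replay buffers (FifoMemory / DequeMemory) whose implementations are not shown. Below is a minimal sketch of the interface the agents appear to assume (add, sample, clear, __len__, and the .samples attribute accessed in Example #5); the constructor signature and the fact that sample() returns raw experiences rather than batched tensors are assumptions, not the original classes.

import random
from collections import deque, namedtuple

Experience = namedtuple("Experience",
                        ["state", "action", "reward", "next_state", "done"])

class FifoMemory:
    """Sketch of a FIFO replay buffer: the oldest samples are evicted first."""

    def __init__(self, buffer_size, batch_size=64):
        # A deque with maxlen gives FIFO eviction for free
        self.samples = deque(maxlen=buffer_size)
        self.batch_size = batch_size

    def add(self, state, action, reward, next_state, done):
        # Store a single transition
        self.samples.append(Experience(state, action, reward, next_state, done))

    def sample(self):
        # Uniform random minibatch; the agents above additionally expect the
        # batch to be converted into torch tensors, which is omitted here
        return random.sample(list(self.samples), k=self.batch_size)

    def clear(self):
        # Used by the TD3 example to empty the short-term memory
        self.samples.clear()

    def __len__(self):
        return len(self.samples)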
Example #2
    def __init__(self, state_size, action_size, random_seed):
        """Initialize an Agent object.

        Params
        ======
                state_size (int): dimension of each state
                action_size (int): dimension of each action
                random_seed (int): random seed
        """

        # Store parameters
        self.state_size = state_size
        self.action_size = action_size
        random.seed(random_seed)
        self.seed = random_seed
        self.epsilon = EPSILON

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = FifoMemory(BUFFER_SIZE, BATCH_SIZE)
        # Short-term memory holds only the most recent samples (1/100 of the full buffer size)
        self.memory_short = FifoMemory(int(BUFFER_SIZE / 100), BATCH_SIZE)
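
The DDPG/TD3 examples also rely on an OUNoise class that is not shown. A minimal Ornstein-Uhlenbeck noise sketch with the interface used above (sample() and reset()); the mu, theta and sigma values are assumptions.

import copy
import random
import numpy as np

class OUNoise:
    """Ornstein-Uhlenbeck process for temporally correlated exploration noise."""

    def __init__(self, size, seed, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        random.seed(seed)
        self.reset()

    def reset(self):
        # Reset the internal state back to the mean
        self.state = copy.copy(self.mu)

    def sample(self):
        # dx = theta * (mu - x) + sigma * N(0, 1)
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.standard_normal(len(x))
        self.state = x + dx
        return self.state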
Example #3
    def __init__(self, state_size, action_size, random_seed=1):
        """Initialize an Agent object.

        Params
        ======
                state_size (int): dimension of each state
                action_size (int): dimension of each action
                random_seed (int): random seed
        """

        # Store parameters
        self.state_size = state_size
        self.action_size = action_size
        random.seed(random_seed)
        self.seed = random_seed
        self.epsilon = EPSILON
        self.lr_actor = LR_ACTOR
        self.lr_critic = LR_CRITIC
        self.lr_decay = WEIGHT_DECAY
        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=self.lr_actor)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=self.lr_critic)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = FifoMemory(BUFFER_SIZE, BATCH_SIZE)

        # Success memory stores, for every positive reward, the 10 samples that preceded it
        self.memory_success = FifoMemory(BUFFER_SIZE, BATCH_SIZE)

        # Rolling memory of the 10 most recent samples
        self.memory_short = FifoMemory(10, 10)
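
All of these snippets reference module-level hyperparameters that are defined elsewhere. A hedged configuration sketch follows; every value is an assumption except LEARN_EVERY and LEARN_NUM, which the Udacity hint in Example #4 puts at 20 and 10.

import torch

BUFFER_SIZE = int(1e6)        # replay buffer size (assumed)
BATCH_SIZE = 128              # minibatch size (assumed)
GAMMA = 0.99                  # discount factor (assumed)
TAU = 1e-3                    # soft-update interpolation factor (assumed)
LR = 5e-4                     # DQN learning rate (assumed)
LR_ACTOR = 1e-4               # actor learning rate (assumed)
LR_CRITIC = 1e-3              # critic learning rate (assumed)
WEIGHT_DECAY = 0.0            # L2 weight decay for the critic optimizer (assumed)
EPSILON = 1.0                 # initial noise scale (assumed)
EPSILON_DECAY = 1e-6          # noise scale decay (assumed)
LEARN_EVERY = 20              # learn every 20 timesteps (from the Udacity hint)
LEARN_NUM = 10                # training passes per learning step (from the hint)
LEARN_NUM_MEMORY = 1          # training passes on the full replay memory (assumed)
LEARN_NUM_MEMORY_SUCCESS = 1  # training passes on the success memory (assumed)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")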
Example #4
class AgentTD3():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, random_seed):
        """Initialize an Agent object.

        Params
        ======
                state_size (int): dimension of each state
                action_size (int): dimension of each action
                random_seed (int): random seed
        """

        # Store parameters
        self.state_size = state_size
        self.action_size = action_size
        random.seed(random_seed)
        self.seed = random_seed
        self.epsilon = EPSILON

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = FifoMemory(BUFFER_SIZE, BATCH_SIZE)
        # Short-term memory holds only the most recent samples (1/100 of the full buffer size)
        self.memory_short = FifoMemory(int(BUFFER_SIZE / 100), BATCH_SIZE)

    def step(self, state, action, reward, next_state, done, timestep):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)
        self.memory_short.add(state, action, reward, next_state, done)

        # Learn at a defined interval, once enough samples are available in memory.
        # Hint from the Udacity benchmark: learn every 20 timesteps and run 10 training passes.
        if len(self.memory) > BATCH_SIZE and timestep % LEARN_EVERY == 0:
            for _ in range(LEARN_NUM):
                experiences = self.memory.sample()
                experiences_short = self.memory_short.sample()

                # Delayed policy update: the actor is only trained when timestep % 2 == 0
                self.learn(experiences_short, timestep % 2, GAMMA)
                self.learn(experiences, timestep % 2, GAMMA)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()

        # TD3 --> Action noise regularisation
        if add_noise:
            action += self.epsilon * self.noise.sample()

        # Clip the action to the valid range [-1, 1] so the added noise
        # cannot push it outside the action bounds.
        clipped_action = np.clip(action, -1, 1)

        return clipped_action

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, delay, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
                actor_target(state) -> action
                critic_target(state, action) -> Q-value

        Params
        ======
                experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
                delay (int): the actor and target networks are only updated when delay == 0
                gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # TD3 --> Use a pair of critic networks (the "twin" in TD3)

        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next1, Q_targets_next2 = self.critic_target(
            next_states, actions_next)

        # TD3 --> Take the minimum of both critics to avoid overestimation
        Q_targets_next = torch.min(Q_targets_next1, Q_targets_next2)

        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Get the current Q-value estimates from both local critics
        Q_expected1, Q_expected2 = self.critic_local(states, actions)

        # Critic loss: sum of both critics' MSE against the shared target
        critic_loss = F.mse_loss(Q_expected1, Q_targets) + F.mse_loss(
            Q_expected2, Q_targets)

        # Minimize the critic loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # TD3 --> Delayed updates of the actor (policy) network (the "delayed" in TD3)
        # Compute actor loss
        if delay == 0:
            actions_pred = self.actor_local(states)

            # Actor loss: maximize Q1 of the local critic for the predicted actions
            actor_loss = -self.critic_local.Q1(states, actions_pred).mean()

            # Minimize the actor loss
            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()

            # ----------------------- update target networks ----------------------- #
            self.soft_update(self.critic_local, self.critic_target, TAU)
            self.soft_update(self.actor_local, self.actor_target, TAU)

        # ---------------------------- update noise ---------------------------- #
        # Decay the noise scale (kept non-negative) and reset the OU process
        self.epsilon = max(0.0, self.epsilon - EPSILON_DECAY)
        self.noise.reset()

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
                local_model: PyTorch model (weights will be copied from)
                target_model: PyTorch model (weights will be copied to)
                tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
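
A sketch of how AgentTD3 could be driven in a training loop. The gym-style environment API and the episode settings below are hypothetical; only the act/step/reset calls mirror the class above.

def train(agent, env, n_episodes=200, max_t=1000):
    """Minimal training loop: act, step the environment, feed the transition back."""
    scores = []
    for episode in range(1, n_episodes + 1):
        state = env.reset()
        agent.reset()                      # reset the OU noise process each episode
        score = 0.0
        for t in range(max_t):
            action = agent.act(state)      # noisy action from the current policy
            next_state, reward, done, _ = env.step(action)
            agent.step(state, action, reward, next_state, done, t)
            state = next_state
            score += reward
            if done:
                break
        scores.append(score)
        print(f"Episode {episode}\tScore: {score:.2f}")
    return scores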
Example #5
class TD3Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, random_seed=1):
        """Initialize an Agent object.

        Params
        ======
                state_size (int): dimension of each state
                action_size (int): dimension of each action
                random_seed (int): random seed
        """

        # Store parameters
        self.state_size = state_size
        self.action_size = action_size
        random.seed(random_seed)
        self.seed = random_seed
        self.epsilon = EPSILON
        self.lr_actor = LR_ACTOR
        self.lr_critic = LR_CRITIC
        self.lr_decay = WEIGHT_DECAY
        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=self.lr_actor)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=self.lr_critic)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = FifoMemory(BUFFER_SIZE, BATCH_SIZE)

        # Success memory stores, for every positive reward, the 10 samples that preceded it
        self.memory_success = FifoMemory(BUFFER_SIZE, BATCH_SIZE)

        # Rolling memory of the 10 most recent samples
        self.memory_short = FifoMemory(10, 10)

    def update_model(self, state, action, reward, next_state, done):
        self.step(state, action, reward, next_state, done)

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""

        # Remember whether the success memory already held a full batch before this step
        reached = len(self.memory_success) >= BATCH_SIZE

        self.memory.add(state, action, reward, next_state, done)
        self.memory_short.add(state, action, reward, next_state, done)

        # Whenever the agent receives a positive reward, copy the short-term
        # memory (the last 10 samples) into the success memory
        if reward > 0.0:
            for i in range(len(self.memory_short)):
                self.memory_success.add(
                    self.memory_short.samples[i].state,
                    self.memory_short.samples[i].action,
                    self.memory_short.samples[i].reward,
                    self.memory_short.samples[i].next_state,
                    self.memory_short.samples[i].done)

            self.memory_short.clear()

        # Announce once the success memory has grown large enough to sample from
        if not reached and len(self.memory_success) > BATCH_SIZE:
            print("Success memory ready for use!")

        # Train with the complete replay memory
        if len(self.memory) > BATCH_SIZE:
            for i in range(LEARN_NUM_MEMORY):
                experiences = self.memory.sample()
                # delay == 0, so the actor is updated on every learn() call here
                self.learn(experiences, 0, GAMMA)

        # Train with the success replay memory
        if len(self.memory_success) > self.memory_success.batch_size:
            for i in range(LEARN_NUM_MEMORY_SUCCESS):
                experiences_success = self.memory_success.sample()
                self.learn(experiences_success, 0, GAMMA)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()

        # TD3 --> Action noise regularisation
        if add_noise:
            action += self.epsilon * self.noise.sample()

        # Clip the action to the valid range [-1, 1] so the added noise
        # cannot push it outside the action bounds.
        clipped_action = np.clip(action, -1, 1)

        # Decay the noise scale multiplicatively after every action
        self.epsilon *= EPSILON_DECAY

        return clipped_action

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, delay, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
                actor_target(state) -> action
                critic_target(state, action) -> Q-value

        Params
        ======
                experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
                delay (int): the actor and target networks are only updated when delay == 0
                gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # TD3 --> Use a pair of critic networks (the "twin" in TD3)

        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next1, Q_targets_next2 = self.critic_target(next_states, actions_next)

        # TD3 normally takes the minimum of both critics to avoid overestimation,
        # but this variant deliberately uses only the first critic's target
        # Q_targets_next = torch.min(Q_targets_next1, Q_targets_next2)
        Q_targets_next = Q_targets_next1

        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Get the current Q-value estimates from both local critics
        Q_expected1, Q_expected2 = self.critic_local(states, actions)

        # Critic loss: only the first critic's MSE against the target is used here
        # (the summed twin-critic loss is kept below for reference)
        # critic_loss = F.mse_loss(Q_expected1, Q_targets) + F.mse_loss(Q_expected2, Q_targets)
        critic_loss = F.mse_loss(Q_expected1, Q_targets)
        # Minimize the critic loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # TD3 --> Delayed updates of the actor (policy) network (the "delayed" in TD3)
        # Compute actor loss
        if delay == 0:
            actions_pred = self.actor_local(states)

            # Actor loss: maximize Q1 of the local critic for the predicted actions
            actor_loss = -self.critic_local.Q1(states, actions_pred).mean()
            
            # Minimize the actor loss
            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()

            # ----------------------- update target networks ----------------------- #
            self.soft_update(self.critic_local, self.critic_target, TAU)
            self.soft_update(self.actor_local, self.actor_target, TAU)

        # ---------------------------- update noise ---------------------------- #
        self.noise.reset()

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
                local_model: PyTorch model (weights will be copied from)
                target_model: PyTorch model (weights will be copied to)
                tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
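
Both TD3 classes unpack two Q-values from self.critic_local(states, actions) and call self.critic_local.Q1(...) in the actor update. Below is a sketch of a twin critic exposing that interface; the layer sizes and names are assumptions, not the Critic class used in these examples.

import torch
import torch.nn as nn
import torch.nn.functional as F

class Critic(nn.Module):
    """Twin Q-network sketch: two independent critics over (state, action)."""

    def __init__(self, state_size, action_size, seed, fc_units=256):
        super().__init__()
        torch.manual_seed(seed)
        # Critic 1
        self.fc1 = nn.Linear(state_size + action_size, fc_units)
        self.fc2 = nn.Linear(fc_units, fc_units)
        self.fc3 = nn.Linear(fc_units, 1)
        # Critic 2
        self.fc4 = nn.Linear(state_size + action_size, fc_units)
        self.fc5 = nn.Linear(fc_units, fc_units)
        self.fc6 = nn.Linear(fc_units, 1)

    def forward(self, state, action):
        # Return both Q-value estimates, as unpacked in learn()
        xu = torch.cat([state, action], dim=1)
        q1 = self.fc3(F.relu(self.fc2(F.relu(self.fc1(xu)))))
        q2 = self.fc6(F.relu(self.fc5(F.relu(self.fc4(xu)))))
        return q1, q2

    def Q1(self, state, action):
        # First critic only, used for the actor (policy) loss
        xu = torch.cat([state, action], dim=1)
        return self.fc3(F.relu(self.fc2(F.relu(self.fc1(xu)))))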