Beispiel #1
0
class Actor():
    def __init__(self, state_size, action_size, random_seed, learning_rate,
                 noise, device):
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.learning_rate = learning_rate

        self.actor_local = ActorNetwork(state_size, action_size,
                                        random_seed).to(device)
        self.actor_target = ActorNetwork(state_size, action_size,
                                         random_seed).to(device)
        hard_update(self.actor_target, self.actor_local)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=self.learning_rate)

        self.noise = noise
        self.device = device

    def act(self, state, noise_factor, add_noise):
        """ Returns actions for given state as per given policy"""
        state = torch.from_numpy(state).float().to(self.device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += (noise_factor * self.noise.sample())
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()
Beispiel #2
0
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, agent_id, args):

        self.state_size = state_size
        self.action_size = action_size
        self.seed = args['seed']
        self.device = args['device']
        self.args = args

        # Q-Network
        self.actor_network = ActorNetwork(state_size, action_size,
                                          args).to(self.device)
        self.actor_target = ActorNetwork(state_size, action_size,
                                         args).to(self.device)
        self.actor_optimizer = optim.Adam(self.actor_network.parameters(),
                                          lr=args['LR_ACTOR'])

        #Model takes too long to run --> load model weights from previous run (took > 24hours on my machine)
        if not agent_id:
            self.actor_network.load_state_dict(torch.load(
                args['agent_p0_path']),
                                               strict=False)
            self.actor_target.load_state_dict(torch.load(
                args['agent_p0_path']),
                                              strict=False)
        else:
            self.actor_network.load_state_dict(torch.load(
                args['agent_p1_path']),
                                               strict=False)
            self.actor_target.load_state_dict(torch.load(
                args['agent_p1_path']),
                                              strict=False)

        # Replay memory
        self.memory = ReplayBuffer(action_size, args['BUFFER_SIZE'],
                                   args['BATCH_SIZE'], self.seed)

        # Noise process
        self.noise = OUNoise(action_size, self.seed)

        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory

        self.memory.add(state, action, reward, next_state, done)

        if len(self.memory) > self.args['BATCH_SIZE']:
            experiences = self.memory.sample()
            self.train(experiences)

    def act(self, current_state):

        with torch.no_grad():

            self.actor_network.eval()

            input_state = torch.from_numpy(current_state).float().to(
                self.device)

            with torch.no_grad():
                action = self.actor_network(input_state).cpu().data.numpy()

            self.actor_network.train()

            action += self.noise.sample()

        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def train(self, experiences):

        global states_
        global next_states_
        global actions_
        global max_min_actions_vector
        global max_min_states_vector

        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #

        with torch.no_grad():
            # Get predicted next-state actions and Q values from target models
            actions_next = self.actor_target(next_states)
            Q_targets_next = mCritic.target(next_states, actions_next)

            # Compute Q targets for current states (y_i)
            Q_targets = rewards + (GAMMA * Q_targets_next * (1 - dones))

        # Compute critic loss
        Q_expected = mCritic.network(states, actions)
        mCritic_loss = F.mse_loss(Q_expected, Q_targets)

        # Minimize the loss
        mCritic.optimizer.zero_grad()
        mCritic_loss.backward()
        mCritic.optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_network(states)
        actor_loss = -mCritic.network(states, actions_pred).mean()

        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(mCritic.network, mCritic.target, TAU)
        self.soft_update(self.actor_network, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
Beispiel #3
0
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, random_seed, agent_size=1):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.agent_size = agent_size

        self.local_actor = ActorNetwork(state_size, action_size,
                                        random_seed).to(device)
        self.target_actor = ActorNetwork(state_size, action_size,
                                         random_seed).to(device)
        self.local_critic = CriticNetwork(state_size, action_size,
                                          random_seed).to(device)
        self.target_critic = CriticNetwork(state_size, action_size,
                                           random_seed).to(device)

        self.opt_actor = optim.Adam(self.local_actor.parameters(), lr=LR_ACTOR)
        self.opt_critic = optim.Adam(self.local_critic.parameters(),
                                     lr=LR_CRITIC,
                                     weight_decay=WEIGHT_DECAY)

        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed)

    def save_experience(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience
        self.memory.add(state, action, reward, next_state, done)

    def multi_step(self, t):
        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE:
            if t % 20 == 0:
                for i in range(0, 10):
                    self.learn(self.memory.sample(), GAMMA)
            else:
                pass

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.local_actor.eval()
        with torch.no_grad():
            action = self.local_actor(state).cpu().data.numpy()
        self.local_actor.train()
        if add_noise:
            for a in range(0, self.agent_size):
                action[a] += self.noise.sample()
        return np.clip(action, -1, 1)  # all actions between -1 and 1

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """
        Target and Local Critics-Actors are used to sove the moving targets problem.
        TargetActor generates the next action, and TargetCritic generates the corresponding Q-value.
        This function updates policy and value parameters using given batch of experience tuples.

        Q_targets = r + gamma * critic_t(next_state, actor_t(next_state))

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.target_actor(next_states)
        Q_targets_next = self.target_critic(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.local_critic(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.opt_critic.zero_grad()
        critic_loss.backward()
        #use gradient clipping when training the critic network
        torch.nn.utils.clip_grad_norm_(self.local_critic.parameters(), 1)
        self.opt_critic.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.local_actor(states)
        actor_loss = -self.local_critic(states, actions_pred).mean()
        # Minimize the loss
        self.opt_actor.zero_grad()
        torch.nn.utils.clip_grad_norm_(self.local_actor.parameters(), 1)
        actor_loss.backward()
        self.opt_actor.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.local_critic, self.target_critic, TAU)
        self.soft_update(self.local_actor, self.target_actor, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        this function manages the update of local and target models syncing
        theta_target = tau*theta_local + (1 - tau)*theta_target
        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
class Agent():
    def __init__(self, state_size, action_size, n_agents, seed):
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        self.stacked_state_size = state_size * n_agents
        self.stacked_action_size = action_size * n_agents

        # Actor networks
        self.actor_local = ActorNetwork(state_size, action_size,
                                        seed).to(device)
        self.actor_target = ActorNetwork(state_size, action_size,
                                         seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=ACTOR_LR)

        # Critic networks
        self.critic_local = CriticNetwork(self.stacked_state_size,
                                          self.stacked_action_size,
                                          seed).to(device)
        self.critic_target = CriticNetwork(self.stacked_state_size,
                                           self.stacked_action_size,
                                           seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=CRITIC_LR)

        # OUNoise
        self.exploration_noise = OUNoise(action_size, seed)

    def act(self, state):
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)

        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()

        # Add exploration noise
        action += self.exploration_noise.sample()

        return np.clip(action, -1, 1)

    def update(self, states, current_agent_states, actions,
               current_agent_actions, target_next_actions, rewards,
               current_agent_rewards, next_states, dones, current_agent_dones,
               action_preds):
        flatten_states = torch.reshape(states, shape=(BATCH_SIZE, -1))
        flatten_next_states = torch.reshape(next_states,
                                            shape=(BATCH_SIZE, -1))
        flatten_actions = torch.reshape(actions, shape=(BATCH_SIZE, -1))

        y = current_agent_rewards + GAMMA * self.critic_target(
            flatten_next_states,
            target_next_actions) * (1 - current_agent_dones)

        # Critic loss
        critic_loss = F.mse_loss(
            y, self.critic_local(flatten_states, flatten_actions))

        # Critic backprop
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # Actor loss
        actor_loss = -self.critic_local(flatten_states, action_preds).mean()

        # Actor backprop
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # Soft updates
        self.update_target_network()

    def update_target_network(self):
        for target_param, local_param in zip(self.actor_target.parameters(),
                                             self.actor_local.parameters()):
            target_param.data.copy_(TAU * local_param.data +
                                    (1.0 - TAU) * target_param.data)

        for target_param, local_param in zip(self.critic_target.parameters(),
                                             self.critic_local.parameters()):
            target_param.data.copy_(TAU * local_param.data +
                                    (1.0 - TAU) * target_param.data)
class Actor:

    def __init__(self,
        device,
        key,
        state_size, action_size, random_seed,
        memory, noise,
        lr, weight_decay,
        checkpoint_folder = './Saved_Model/'):

        self.DEVICE = device

        self.KEY = key

        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        # Hyperparameters
        self.LR = lr
        self.WEIGHT_DECAY = weight_decay

        self.CHECKPOINT_FOLDER = checkpoint_folder

        # Actor Network (w/ Target Network)
        self.local = ActorNetwork(state_size, action_size, random_seed).to(self.DEVICE)
        self.target = ActorNetwork(state_size, action_size, random_seed).to(self.DEVICE)
        self.optimizer = optim.Adam(self.local.parameters(), lr=self.LR)

        self.checkpoint_full_name = self.CHECKPOINT_FOLDER + 'checkpoint_actor_' + str(self.KEY) + '.pth'
        if os.path.isfile(self.checkpoint_full_name):
            self.local.load_state_dict(torch.load(self.checkpoint_full_name))
            self.target.load_state_dict(torch.load(self.checkpoint_full_name))

        # Replay memory
        self.memory = memory

        # Noise process
        self.noise = noise

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(self.DEVICE)

        self.local.eval()
        with torch.no_grad():
            action = self.local(state).cpu().data.numpy()
        self.local.train()

        if add_noise:
            action += self.noise.sample()

        return np.clip(action, -1, 1)

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

    def reset(self):
        self.noise.reset()

    def checkpoint(self):
        torch.save(self.local.state_dict(), self.checkpoint_full_name)
Beispiel #6
0
class DDPGAgent:
    def __init__(self,
                 state_size,
                 action_size,
                 num_agents,
                 hidden_in_actor=512,
                 hidden_out_actor=256,
                 lr_actor=1e-4,
                 hidden_in_critic=512,
                 hidden_out_critic=256,
                 lr_critic=3e-4,
                 weight_decay_critic=0,
                 seed=1,
                 device='cpu'):
        super(DDPGAgent, self).__init__()

        self.device = device

        # Actor
        self.actor = ActorNetwork(state_size, hidden_in_actor,
                                  hidden_out_actor, action_size,
                                  seed).to(device)
        self.target_actor = ActorNetwork(state_size, hidden_in_actor,
                                         hidden_out_actor, action_size,
                                         seed).to(device)
        self.actor_optimizer = Adam(self.actor.parameters(), lr=lr_actor)

        # Target
        self.critic = CriticNetwork(state_size, action_size, num_agents,
                                    hidden_in_critic, hidden_out_critic,
                                    seed).to(device)
        self.target_critic = CriticNetwork(state_size, action_size, num_agents,
                                           hidden_in_critic, hidden_out_critic,
                                           seed).to(device)
        self.critic_optimizer = Adam(self.critic.parameters(),
                                     lr=lr_critic,
                                     weight_decay=weight_decay_critic)

        # Noise
        self.noise = OUNoise(action_size, seed, scale=1.0)

        # initialize targets same as original networks
        hard_update(self.target_actor, self.actor)
        hard_update(self.target_critic, self.critic)

    def reset(self):
        self.noise.reset()

    def act(self, obs, noise_factor=0.0):

        if torch.is_tensor(obs):
            states = obs
        else:
            states = torch.from_numpy(obs).float().to(self.device)

        self.actor.eval()
        with torch.no_grad():
            actions = self.actor(states).cpu().data.numpy()
        self.actor.train()
        actions += noise_factor * self.noise.sample()
        return np.clip(actions, -1, 1)

    def target_act(self, obs):

        if torch.is_tensor(obs):
            states = obs
        else:
            states = torch.from_numpy(obs).float().to(self.device)

        self.target_actor.eval()
        with torch.no_grad():
            actions = self.target_actor(states).cpu().data.numpy()
        self.target_actor.train()
        return np.clip(actions, -1, 1)
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, memory, seed=None):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        
        self.state_size = state_size
        self.action_size = action_size
        
        if seed is not None:
            self.seed = seed

        # create the local and target actor networks
        self.actor_local = ActorNetwork(state_size, action_size, seed).to(device)
        self.actor_target = ActorNetwork(state_size, action_size, seed).to(device)
        
        # create the local and target critic networks
        self.critic_local = CriticNetwork(state_size, action_size, seed).to(device)
        self.critic_target = CriticNetwork(state_size, action_size, seed).to(device)
        
        # optimizers for local actor and critic 
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR, weight_decay=0.0)
        
        # MSE loss for updating the critic
        # self.critic_loss_function = nn.MSELoss()
        self.critic_loss_function = nn.SmoothL1Loss()

        # copy the local networks weights to the target network 
        self.copy_weights_from_local_to_target()
        
        # Replay memory
        self.memory = memory
        
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
        
        # init the noise class to sample from
        self.noise = GaussianNoise(self.action_size)
        
    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)
        
        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                for _ in range(LEARN_TIMES):
                    experiences = self.memory.sample()
                    self.learn(experiences, GAMMA)
                self.soft_update_all()


    def copy_weights_from_local_to_target(self):
        # ensure that the local and target networks are initialized with the same random weights
        # or copy you saved weights after loading into local
        for target_param, param in zip(self.actor_target.parameters(), self.actor_local.parameters()):
            target_param.data.copy_(param.data)
        for target_param, param in zip(self.critic_target.parameters(), self.critic_local.parameters()):
            target_param.data.copy_(param.data)

    def act(self, state, add_noise=False):
        """Returns actions for given state as per current policy. 
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        
        # get predicted actions for current state from actor network
        self.actor_local.eval()
        with torch.no_grad():
            action_values = self.actor_local(state)
        self.actor_local.train()

        # take the predicted actions and add noise, used as exploration in a continuous environment
        action_values = action_values.cpu().data.numpy()
        
        if add_noise == True:
            action_values += self.noise.sample()
        
        return action_values
        
    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        
        # unpack the experiences tuple 
        states, actions, rewards, next_states, dones = experiences
        
        # computer the loss for the actor network per the DDPG algorithm
        actor_local_predicted_actions = self.actor_local(states)
        policy_loss = -self.critic_local(states, actor_local_predicted_actions).mean()
        
        # compute the loss for the critic network per the DDPG algorithm
        predicted_Q_vals = self.critic_local(states, actions)
        predicted_actions = self.actor_target(next_states)
        Q_next = self.critic_target(next_states, predicted_actions)
        Q_targets = rewards + (gamma * Q_next * (1 - dones))
        
        critic_loss = self.critic_loss_function(predicted_Q_vals, Q_targets)
        
        # update the networks
        self.actor_optimizer.zero_grad()
        policy_loss.backward()
        self.actor_optimizer.step()
        
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()
        
    
    def soft_update_all(self):
        # and soft update the target networks
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)
        
        
    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter   # use percent tau local_param.data and rest target_param.data
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
Beispiel #8
0
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, agent_id):

        self.state_size  = state_size
        self.action_size = action_size
        self.seed        = args['seed']
        self.device      = args['device']
        #self.args        = args

        # Q-Network
        self.actor_network    = ActorNetwork(state_size, action_size).to(self.device)
        self.actor_target     = ActorNetwork(state_size, action_size).to(self.device)
        self.actor_optimizer  = optim.Adam(self.actor_network.parameters(), lr=args['LR_ACTOR'])
        
        #Model takes too long to run --> load model weights from previous run (took > 24hours on my machine)
        #if not agent_id:
        #    self.actor_network.load_state_dict(torch.load(args['agent_p0_path']), strict=False)
        #    self.actor_target.load_state_dict(torch.load(args['agent_p0_path']), strict=False)
        #else:
        #    self.actor_network.load_state_dict(torch.load(args['agent_p1_path']), strict=False)
        #    self.actor_target.load_state_dict(torch.load(args['agent_p1_path']), strict=False)
        
        # Replay memory
        self.memory      = ReplayBuffer(action_size, args['BUFFER_SIZE'], args['BATCH_SIZE'], self.device, self.seed)
        
        # Noise process
        self.noise       = OUNoise(action_size, self.seed)
        
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step      = 0
        
        self.mCriticLoss = 0
        
        self.actorLoss   = 0
        
    
    def step(self, state, action, reward, next_state, done, mCritic):
        # Save experience in replay memory
        
        self.memory.add(state, action, reward, next_state, done)
        
        if len(self.memory) > args['BATCH_SIZE']:
            experiences = self.memory.sample()
            self.train(experiences, mCritic)
            

    def act(self, current_state):
        
        with torch.no_grad():
                
            self.actor_network.eval()
            
            # PUT CONDITIONAL CHECK -> if CNN reshape, ELSE...dont
            #input_state = torch.from_numpy(current_state).float().reshape(args['reshape_size']).unsqueeze(0).unsqueeze(0).to(self.device)
            
            input_state = torch.from_numpy(current_state).float().to(self.device)
                
            with torch.no_grad():
                action  = self.actor_network(input_state).cpu().data.numpy()

            self.actor_network.train()
                
            #action     += self.noise.sample()
            
        return action

    def reset(self):
        self.noise.reset()
        
        
    def train(self, experiences, mCritic):
        
        global states_
        global next_states_
        global actions_
        global max_min_actions_vector
        global max_min_states_vector

        states, actions, rewards, next_states, dones = experiences

        
        # ---------------------------- update critic ---------------------------- #
        
        with torch.no_grad():
            # Get predicted next-state actions and Q values from target models
            actions_next   = self.actor_target(next_states)
            
            #PUT CONDITIONAL CHECK: if CNN reshape ELSE..Dont...
            #Q_targets_next = mCritic.target(next_states, actions_next[np.newaxis, :])
            Q_targets_next = mCritic.target(next_states, actions_next)

            # Compute Q targets for current states (y_i)
            Q_targets      = rewards + (args['GAMMA'] * Q_targets_next * (1 - dones))
        
        
        # Compute critic loss
        Q_expected         = mCritic.network(states, actions)
        mCritic_loss       = F.mse_loss(Q_expected, Q_targets)