Code example #1
import random

import numpy as np
import torch
import torch.nn.functional as F
import torch.optim as optim

# Actor, Critic, OUNoise, and ReplayBuffer are assumed to be defined in the
# project's companion model and utility modules.


class DDPG_Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, num_agents, seed, device,
                 buffer_size=int(1e5), batch_size=128, num_batches = 5, update_every=10,
                 gamma=0.99, tau=8e-3,
                 learning_rate_actor=1e-3, learning_rate_critic=1e-3, weight_decay=0.0001,                
                 hidden_layers_actor=[32,32], hidden_layers_critic=[32, 32, 32],
                 add_noise=True, start_eps=5.0, end_eps=0.0, end_eps_episode=500,
                 agent_id=-1):
        """Initialize an Agent object.
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            num_agents (int): number of agents
            seed (int): random seed
            hidden_layers_actor / hidden_layers_critic (list of int; optional): number of nodes in each hidden layer
            buffer_size (int ; optional): replay buffer size
            batch_size (int; optional): minibatch size
            gamma (float; optional): discount factor
            tau (float; optional): for soft update of target parameters
            learning_rate_X (float; optional): learning rate for X=actor or critic
        """
        print('In DDPG_Agent: seed = ', seed)
        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        random.seed(seed)
        self.seed = seed
        self.device = device
        
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.update_every = update_every
        self.num_batches = num_batches
        
        self.gamma = gamma
        self.tau = tau
        
        self.lr_actor = learning_rate_actor
        self.lr_critic = learning_rate_critic
        self.weight_decay_critic = weight_decay
        
        self.add_noise = add_noise
        self.start_eps = start_eps
        self.eps = start_eps
        self.end_eps = end_eps
        self.eps_decay = 1/(end_eps_episode*num_batches)  # set decay rate based on epsilon end target
        self.timestep = 0
        
        self.agent_id = agent_id
     
        ### SET UP THE ACTOR NETWORK ###
        # Assign model parameters and assign device
        model_params_actor  = [state_size, action_size, seed, hidden_layers_actor]
        
        # Create the Actor Network (w/ Target Network)
        self.actor_local = Actor(*model_params_actor).to(self.device)
        self.actor_target = Actor(*model_params_actor).to(self.device)
        
        # Set up optimizer for the Actor network
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=self.lr_actor)       
        
        ### SET UP THE CRITIC NETWORK ###
        model_params_critic = [state_size, action_size, seed, hidden_layers_critic]

        # Create the Critic Network (w/ Target Network)
        self.critic_local = Critic(*model_params_critic).to(self.device)
        self.critic_target = Critic(*model_params_critic).to(self.device)
        
        # Set up optimizer for the Critic Network
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=self.lr_critic, weight_decay=self.weight_decay_critic)

        # Noise process
        self.noise = OUNoise(action_size, self.seed)
        
        # Replay memory
        self.memory = ReplayBuffer(action_size, buffer_size, batch_size, seed, device)

    def step(self, states, actions, rewards, next_states, dones, agent_number):
        # Increment timestep by 1
        self.timestep += 1
        
        # Save experience in replay memory
        self.memory.add(states, actions, rewards, next_states, dones)
        
        # If there are enough samples and a model update is due at this time step
        if len(self.memory) > self.batch_size and self.timestep%self.update_every == 0:
            # For each batch
            for i in range(self.num_batches):
                # Sample experiences from memory
                experiences = self.memory.sample()
        
                # Learn from the experience
                self.learn(experiences, self.gamma, agent_number)

    def act(self, state, scale_noise=True):
        """Returns actions for given state as per current policy.
        Params
        ======
            state (array_like): current state
            scale_noise (bool): if True, scale the additive OU noise by the current epsilon
        """
        state = torch.from_numpy(state).float().to(self.device)
        
        # Switch to evaluation mode and compute the action for the current state
        self.actor_local.eval()
        with torch.no_grad():
            # Get the action for this agent as a (1, action_size) array
            action = np.array([self.actor_local(state[0]).cpu().data.numpy()])
            
        # get back to train mode
        self.actor_local.train()
        
        # Add exploration noise to the action
        # Note, we want the magnitude of noise to decrease as the agent keeps learning
        action += int(scale_noise)*(self.eps)*self.noise.sample()
        
        return np.clip(action, -1.0, 1.0)
    
    def reset(self):
        """
        Reset the noise, and all neural network parameters for the current agent
        """
        self.noise.reset()
        self.eps = self.start_eps
        self.timestep = 0
        self.critic_local.reset_parameters()
        self.actor_local.reset_parameters()
        self.critic_target.reset_parameters()
        self.actor_target.reset_parameters()
        
        # Re-create the optimizer for the Actor network
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=self.lr_actor)
        
        # Set up optimizer for the Critic Network
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=self.lr_critic, weight_decay=self.weight_decay_critic)
        
        # Clear the experience buffer
        self.memory.clear_buffer()
        
    def reset_noise(self):
        """
        Reset the noise only
        """
        self.noise.reset()
   
    def learn(self, experiences, gamma, agent_number):
        ####     DRAW FROM MEMORY AND PREPARE SARS DATA        ####
        # From the experiences buffer, separate out S_t, A_t, R_t, S_t+1, done data
        states, actions, rewards, next_states, dones = experiences
        
        # NOTE: actions has dimension of batch_size x concatenated action for all agents
      
        # get the next action for the current agent for the entire batch
        actions_next = self.actor_target(next_states)
    
        # Construct the full next-action vector for the critic: this agent's slice comes from
        # the target actor, the other agent's slice is taken from the sampled batch
        # (the hard-coded index 2 assumes two agents with two-dimensional actions)
        if agent_number == 0:
            actions_next = torch.cat((actions_next, actions[:,2:]), dim=1)
        else:
            actions_next = torch.cat((actions[:,:2], actions_next), dim=1)
        
        ####    UPDATE CRITIC   ####
        # Get predicted next-state actions and Q values from target models
        # Get the next targets
        Q_targets_next = self.critic_target(next_states, actions_next)
        
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones))
        
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        
        # Define the loss
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        
        # Clip the critic gradients at norm 1
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()
   

        # --------------UPDATE ACTOR -----------------------#
        # Compute actor loss
        actions_pred = self.actor_local(states)

        # Construct action prediction vector relative to each agent
        if agent_number == 0:
            actions_pred = torch.cat((actions_pred, actions[:,2:]), dim=1)
        else:
            actions_pred = torch.cat((actions[:,:2], actions_pred), dim=1)
        
        # Calculate the loss; the minus sign lets gradient descent perform ascent on the critic value
        actor_loss = -self.critic_local(states, actions_pred).mean()
        
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # Update the target networks using the local and target networks
        self.soft_update(self.critic_local, self.critic_target, self.tau)
        self.soft_update(self.actor_local, self.actor_target, self.tau)
        
        # update noise decay parameter
        self.eps -= self.eps_decay
        self.eps = max(self.eps, self.end_eps)
        self.noise.reset()

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        X_target = tau*X_local + (1 - tau)*X_target
        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
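For reference, here is a minimal, illustrative sketch of how two such agents could be driven in a two-agent environment. The environment handle `env`, its reset/step API, and the state/action sizes are placeholder assumptions; the exact array shapes that `step()` expects also depend on the ReplayBuffer and Critic implementations, which are not shown here.

# Usage sketch (illustrative only): env, state_size=24 and action_size=2 are assumed.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
agent_0 = DDPG_Agent(state_size=24, action_size=2, num_agents=2, seed=0,
                     device=device, agent_id=0)
agent_1 = DDPG_Agent(state_size=24, action_size=2, num_agents=2, seed=0,
                     device=device, agent_id=1)

states = env.reset()                                       # assumed shape: (2, state_size)
while True:
    # Each agent acts on its own observation; act() returns a (1, action_size) array
    action_0 = agent_0.act(states[0:1])
    action_1 = agent_1.act(states[1:2])
    actions = np.concatenate((action_0, action_1), axis=1)  # (1, 2*action_size)

    next_states, rewards, dones = env.step(actions)        # assumed env API
    # Both agents store the joint experience; agent_number selects the action slice to replace
    agent_0.step(states, actions, rewards[0], next_states, dones[0], agent_number=0)
    agent_1.step(states, actions, rewards[1], next_states, dones[1], agent_number=1)

    states = next_states
    if np.any(dones):
        break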
Code example #2
import numpy as np
import torch
import torch.nn.functional as F
import torch.optim as optim

# Actor, Critic, OUNoise, and ReplayBuffer are assumed to come from the
# project's companion model and utility modules.


class DdpgAgent():
    def __init__(self, config, seed, device="cpu"):
        self.seed = seed
        
        # -- Set environment
        self.action_size = config["env"]["action_size"]
        self.env = config["env"]["simulator"]
        self.brain_name = config["env"]["brain_name"]
        self.num_agents = config["env"]["num_agents"]


        # -- Construct Actor/Critic models
        self.actor_local = Actor(config["env"]["state_size"], config["env"]["action_size"], seed, config["actor"]["hidden_layers"]).to(device)
        self.actor_target = Actor(config["env"]["state_size"], config["env"]["action_size"], seed, config["actor"]["hidden_layers"]).to(device)
        self.checkpoint = {"state_size":config["env"]["state_size"],
                           "action_size":config["env"]["action_size"],
                           "hidden_layers":config["actor"]["hidden_layers"],
                           "state_dict":self.actor_local.state_dict()}
        
        self.critic_local = Critic(config["env"]["state_size"], config["env"]["action_size"], seed, config["critic"]["hidden_layers"]).to(device)
        self.critic_target = Critic(config["env"]["state_size"], config["env"]["action_size"], seed, config["critic"]["hidden_layers"]).to(device)

        
        # -- Configure optimizer
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=config["learning"]["lr_actor"])
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=config["learning"]["lr_critic"])

        self.optimizer_lr_decay = config["learning"]["lr_decay"]["activate"]
        self.actor_optimizer_lr_scheduler = optim.lr_scheduler.StepLR(self.actor_optimizer,
                                                                      step_size=config["learning"]["lr_decay"]["actor_step"],
                                                                      gamma=config["learning"]["lr_decay"]["actor_gamma"])
        self.critic_optimizer_lr_scheduler = optim.lr_scheduler.StepLR(self.critic_optimizer,
                                                                       step_size=config["learning"]["lr_decay"]["critic_step"],
                                                                       gamma=config["learning"]["lr_decay"]["critic_gamma"])
        
        # -- Set learning parameters
        self.batch_size = config["learning"]["batch_size"]
        self.buffer_size = config["learning"]["buffer_size"]
        self.discount = config["learning"]["discount"]
        self.max_t = config["learning"]["max_t"]
        self.tau = config["learning"]["soft_update_tau"]
        self.learn_every_n_steps = config["learning"]["learn_every_n_steps"]
        self.num_learn_steps = config["learning"]["num_learn_steps"]
        self.checkpointfile = config["learning"]["checkpointfile"]
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size, seed, device)
                
        self.device=device
        
        self.add_noise = True
        self.ou_noise = OUNoise(self.action_size, seed)
        
        self.hard_copy(self.actor_local, self.actor_target)
        self.hard_copy(self.critic_local, self.critic_target)
        
    def steps(self):
        if self.optimizer_lr_decay:
            self.actor_optimizer_lr_scheduler.step()
            self.critic_optimizer_lr_scheduler.step()
            
        env_info = self.env.reset(train_mode=True)[self.brain_name]
        self.ou_noise.reset()
        state = env_info.vector_observations
        score = np.zeros(self.num_agents)
        self.step_ctr = 0
        while True:
            action = self.act(state)
            env_info = self.env.step(action)[self.brain_name]
            next_state = env_info.vector_observations    # get next state (for each agent)
            reward = env_info.rewards                     # get reward (for each agent)
            done = env_info.local_done                    # check whether the episode finished
            self.step(state, action, reward, next_state, done)
            state = next_state
            score += reward
            if np.any(done):
                break
        
        return score, self.step_ctr
            
    def step(self, state, action, reward, next_state, done):
        self.memory.add(state, action, reward, next_state, done)
        self.step_ctr += 1
        if len(self.memory) > self.batch_size and self.step_ctr % self.learn_every_n_steps == 0:
            for _ in range(self.num_learn_steps):
                self.learn()
            
    def act(self, state):
        state = torch.from_numpy(state).float().to(self.device)
        self.actor_local.eval()   # switch to evaluation mode
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()  # switch back to training mode
        
        if self.add_noise:
            action += self.ou_noise.sample()
        
        return np.clip(action, -1, 1)
    
    def learn(self):
        states, actions, rewards, next_states, dones = self.memory.sample_random()
        
        # -------------------- Update Critic -----------------------------
        # Get predicted next-state actions and Q values from the target models
        next_actions = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, next_actions).detach()

        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (self.discount * Q_targets_next * (1 - dones))
        
        # Compute Critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
#        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()
        
        # -------------------- Update Actor -----------------------------
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        
        # Minimize loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()
        
        # ------------------ Update Target Networks --------------------
        self.soft_update(self.critic_local, self.critic_target)
        self.soft_update(self.actor_local, self.actor_target)
        
    def soft_update(self, local_model, target_model):
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(self.tau*local_param.data + (1.0-self.tau)*target_param.data)
        
    def hard_copy(self, model_a, model_b):
        """ copy model_a to model_b """
        for param_a, param_b in zip(model_a.parameters(), model_b.parameters()):
            param_b.data.copy_(param_a)
            
    def reset(self):
        self.actor_local.reset_parameters()
        self.actor_target.reset_parameters()
        self.critic_local.reset_parameters()
        self.critic_target.reset_parameters()
#        self.hard_copy(self.actor_local, self.actor_target)
#        self.hard_copy(self.critic_local, self.critic_target)
        
    def set_lr(self, actor_lr=None, critic_lr=None):
        """Override the current learning rate of the actor and/or critic optimizer."""
        if actor_lr is not None:
            for param_group in self.actor_optimizer.param_groups:
                param_group["lr"] = actor_lr
        if critic_lr is not None:
            for param_group in self.critic_optimizer.param_groups:
                param_group["lr"] = critic_lr
 
    def save_model(self):
        torch.save(self.checkpoint, self.checkpointfile)
        
    def add_noise_on_act(self, noise_on_act):
        """ When noise_on_act is True, OU noise is added in act() """
        self.add_noise = noise_on_act
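As a usage reference, the sketch below shows the nested config dict this constructor reads, together with a minimal training loop. All hyperparameter values, the simulator handle `env`, and `brain_name` are illustrative placeholders rather than values from the original project.

# Illustrative config for DdpgAgent; every value here is a placeholder.
config = {
    "env": {
        "simulator": env,            # assumed: a Unity ML-Agents style environment handle
        "brain_name": brain_name,    # assumed: name of the brain to control
        "state_size": 33,
        "action_size": 4,
        "num_agents": 20,
    },
    "actor":  {"hidden_layers": [128, 128]},
    "critic": {"hidden_layers": [128, 128]},
    "learning": {
        "lr_actor": 1e-3,
        "lr_critic": 1e-3,
        "lr_decay": {"activate": False, "actor_step": 100, "actor_gamma": 0.5,
                     "critic_step": 100, "critic_gamma": 0.5},
        "batch_size": 128,
        "buffer_size": int(1e5),
        "discount": 0.99,
        "max_t": 1000,
        "soft_update_tau": 1e-3,
        "learn_every_n_steps": 20,
        "num_learn_steps": 10,
        "checkpointfile": "checkpoint.pth",
    },
}

agent = DdpgAgent(config, seed=0, device="cpu")
for episode in range(200):
    score, n_steps = agent.steps()   # runs one episode and learns along the way
    print("episode {}: mean score {:.2f} over {} steps".format(episode, np.mean(score), n_steps))
agent.save_model()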


# The agent below relies on module-level definitions that are not shown in this listing:
# env, device, LR_ACTOR, LR_CRITIC, GAMMA, TAU, plus the torch/numpy imports above and
# `from collections import deque`.
class Agent():
    def __init__(self, state_size, action_size, seed):
        self.actor_local = Actor(state_size, action_size, seed).to(device)
        self.actor_target = Actor(state_size, action_size, seed).to(device)
        self.qvalue_local = Critic(state_size, action_size, seed).to(device)
        self.qvalue_target = Critic(state_size, action_size, seed).to(device)
        self.seed = seed

        self.actor_target.load_state_dict(self.actor_local.state_dict())
        self.qvalue_target.load_state_dict(self.qvalue_local.state_dict())
        self.noise = OUNoise(action_size, self.seed)

        self.qvalue_optimizer = optim.Adam(self.qvalue_local.parameters(),
                                           lr=LR_CRITIC)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

    def reset(self):
        self.actor_local.reset_parameters()
        self.qvalue_local.reset_parameters()
        self.actor_target.reset_parameters()
        self.qvalue_target.reset_parameters()
        self.noise.reset()

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def into_tensor(self, state):
        return torch.tensor(state).float().to(device)

    def play(self, max_len=200):
        state1 = env.reset()
        score = 0
        for _ in range(max_len):
            # act() expects a NumPy state; keep tensor copies for the critic
            action1 = self.act(state1)
            state1_t = self.into_tensor(state1)
            action1_t = self.into_tensor(action1)
            value1_pred = self.qvalue_local(state1_t, action1_t)

            state2, reward, done, _ = env.step(action1)

            state2_t = self.into_tensor(state2)
            action2 = self.actor_target(state2_t)
            value2 = self.qvalue_target(state2_t, action2).detach()  # no gradient through the targets

            if not done:
                expected_value = self.into_tensor(reward) + GAMMA * value2
            else:
                expected_value = self.into_tensor(reward)
            score += reward

            ## Critic Update
            qvalue_loss = F.mse_loss(value1_pred, expected_value)
            self.qvalue_optimizer.zero_grad()
            qvalue_loss.backward()
            self.qvalue_optimizer.step()

            ## Actor Update
            # The action must come from actor_local so that gradients reach the actor
            actor_loss = -self.qvalue_local(state1_t, self.actor_local(state1_t)).mean()
            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()
            state1 = state2

            self.soft_update(self.actor_local, self.actor_target, TAU)
            self.soft_update(self.qvalue_local, self.qvalue_target, TAU)
            if done:
                break
        return score

    def soft_update(self, local_model, target_model, tau):
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def run(self, iteration, max_len=200):
        self.score_deque = deque(maxlen=100)
        for episode in range(iteration):
            score = self.play(max_len)
            self.score_deque.append(score)
            print("\rIteration {} with Current Score {}".format(
                episode, score),
                  end=" ")
            if (episode % 50 == 0):
                print("\rIteration {} with Average Score {} ".format(
                    episode,
                    sum(self.score_deque) / len(self.score_deque)))
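
Finally, a minimal, illustrative setup for running this Agent: the class depends on module-level names (env, device, LR_ACTOR, LR_CRITIC, GAMMA, TAU), so the sketch defines placeholder values for them. The choice of Pendulum-v0 and the classic Gym API (reset returning a state, step returning a 4-tuple) are assumptions made to match play()'s unpacking, not part of the original project.

# Illustrative globals for the Agent class above; values are placeholders.
import gym
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
LR_ACTOR = 1e-4
LR_CRITIC = 1e-3
GAMMA = 0.99
TAU = 1e-3

env = gym.make("Pendulum-v0")   # any continuous-action env with the old Gym API works here
state_size = env.observation_space.shape[0]
action_size = env.action_space.shape[0]

agent = Agent(state_size, action_size, seed=0)
agent.run(iteration=500, max_len=200)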