Example #1
class Agent():
    def __init__(self, state_size, action_size, n_agents, random_seed):

        self.state_size = state_size
        self.action_size = action_size
        self.n_agents = n_agents
        self.seed = random.seed(random_seed)

        #Actor Network
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)

        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        #Critic Network
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)

        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        #Noise Process
        self.noise = OUNoise((n_agents, action_size), random_seed)

        #Replay Memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed)

    def step(self, states, actions, rewards, next_states, dones, timestep):

        #Save each agent's experience in replay memory
        for state, action, reward, next_state, done in zip(
                states, actions, rewards, next_states, dones):
            self.memory.add(state, action, reward, next_state, done)

        if timestep % N_LEARN_TIMESTEPS != 0:
            return

        #If enough samples are available in memory
        if len(self.memory) > BATCH_SIZE:
            for i in range(N_LEARN_UPDATES):
                #Load sample of tuples from memory
                experiences = self.memory.sample()

                #Learn from a randomly selected sample
                self.learn(experiences, GAMMA)

    def act(self, state, add_noise=True):

        state = torch.from_numpy(state).float().to(device)

        self.actor_local.eval()

        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()

        self.actor_local.train()

        if add_noise:
            action += self.noise.sample()

        #Return action
        return np.clip(action, -1, 1)

    def reset(self):

        self.noise.reset()

    def learn(self, experiences, gamma):

        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        #Get predicted actions + Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)

        #Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        #Critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)

        #Minimize loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        #Actor Loss
        actions_pred = self.actor_local(states)

        #Negative sign for gradient ascent
        actor_loss = -self.critic_local(states, actions_pred).mean()

        #Minimize Loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        for local_param, target_param in zip(local_model.parameters(),
                                             target_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
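The snippets on this page reference module-level hyperparameters, a `device` handle, and companion `Actor`/`Critic`/`OUNoise`/`ReplayBuffer` classes that are defined elsewhere in each repository. A minimal sketch of what Example #1 assumes is shown below; the names match the code above, but the values are typical DDPG placeholders, not the original author's settings.

import torch

# Assumed hyperparameters -- placeholders, not the original settings
BUFFER_SIZE = int(1e6)       # replay buffer size
BATCH_SIZE = 128             # minibatch size
GAMMA = 0.99                 # discount factor
TAU = 1e-3                   # soft-update interpolation factor
LR_ACTOR = 1e-4              # actor learning rate
LR_CRITIC = 1e-3             # critic learning rate
WEIGHT_DECAY = 0.0           # L2 penalty for the critic optimizer
N_LEARN_TIMESTEPS = 20       # learn every N environment steps
N_LEARN_UPDATES = 10         # gradient updates per learning step

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")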
Example #2
class Agent():
    def __init__(self, state_size, action_size, random_seed):
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        # Actor w/ target
        self.actor_local = Actor(state_size, action_size,
                                 seed=random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  seed=random_seed).to(device)
        self.actor_opt = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Critic w/ target
        self.critic_local = Critic(state_size, action_size,
                                   seed=random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    seed=random_seed).to(device)
        self.critic_opt = optim.Adam(self.critic_local.parameters(),
                                     lr=LR_CRITIC,
                                     weight_decay=WEIGHT_DECAY)

        # Misc
        self.noise = OUNoise(action_size, random_seed)
        self.memory = ReplayBuffer(BUFFER_SIZE, BATCH_SIZE, random_seed)

    def step(self, state, action, reward, next_state, done):
        self.memory.add(state, action, reward, next_state, done)
        if len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample()
            self.learn(experiences, GAMMA)

    def act(self, state, add_noise=True):
        state = torch.from_numpy(state).float().to(device)

        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()

        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, +1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        states, actions, rewards, next_states, dones = experiences

        # update critic
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        Q_targets = rewards + gamma * Q_targets_next * (1 - dones)

        Q_expected = self.critic_local(states, actions)

        critic_loss = F.mse_loss(Q_expected, Q_targets)
        self.critic_opt.zero_grad()
        critic_loss.backward()
        self.critic_opt.step()

        # update actor
        actions_pred = self.actor_local(states)

        actor_loss = -self.critic_local(states, actions_pred).mean()
        self.actor_opt.zero_grad()
        actor_loss.backward()
        self.actor_opt.step()

        # target network updates
        self.soft_update(self.actor_local, self.actor_target, TAU)
        self.soft_update(self.critic_local, self.critic_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            mixed_param = (tau * local_param.data
                           + (1 - tau) * target_param.data)
            target_param.data.copy_(mixed_param)
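Both examples above (and most that follow) rely on an `OUNoise` helper that is not shown. A minimal Ornstein-Uhlenbeck process compatible with the `OUNoise(size, seed)` and `OUNoise(size, seed, mu, theta, sigma)` calls used on this page is sketched below; the parameter defaults and the Gaussian increments are assumptions.

import copy
import random

import numpy as np


class OUNoise:
    """Ornstein-Uhlenbeck process (sketch; defaults are assumed)."""

    def __init__(self, size, seed, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)   # size may be an int or a (n_agents, action_size) tuple
        self.theta = theta
        self.sigma = sigma
        self.seed = random.seed(seed)
        self.reset()

    def reset(self):
        """Reset the internal state to the mean."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Update the internal state and return it as a noise sample."""
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.standard_normal(self.mu.shape)
        self.state = x + dx
        return self.state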
class Agent():
    """Interacts with and learns from the environment."""
    
    def __init__(self, params, device = DEVICE, critic_input_size = None):
        """Initialize an Agent object.
        """
        
        self.params = params
        self.state_size = params.STATE_SIZE
        self.action_size = params.ACTION_SIZE
        self.seed = params.SEED
        self.tau = params.TAU
        
        self.device = device
        
        if critic_input_size is None:
            critic_input_size = 2 * (self.state_size + self.action_size)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(self.state_size, self.action_size, self.seed).to(device)
        self.actor_target = Actor(self.state_size, self.action_size, self.seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), 
                                           lr=params.LR_ACTOR, weight_decay=params.WEIGHT_DECAY_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(critic_input_size, self.seed).to(device)
        self.critic_target = Critic(critic_input_size, self.seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), 
                                           lr=params.LR_CRITIC, weight_decay=params.WEIGHT_DECAY_CRITIC)

        # Noise process
        self.noise = OUNoise(self.action_size, self.seed, 
                             mu=0., theta=params.NOISE_THETA, sigma=params.NOISE_SIGMA)
        
        # Parameters for learning
        self.gamma = params.GAMMA
        self.learning_step = 0 # Counter for learning steps
    
    def act(self, state, add_noise=False, sigma=0.1):
        """
        Returns actions for given state as per current policy.
        Arguments:
            state - input state
            add_noise - can be:
                False   - no noise added (default)
                'OU'    - Ornstein-Uhlenbeck noise added
                'rand'  - uniformly random noise added
            sigma - scale of the 'rand' noise; samples are drawn from [-sigma/2, sigma/2)
        """
        state = torch.from_numpy(state).float().to(self.device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            if add_noise == 'OU':
                action += self.noise.sample()
            else:
                action += sigma * np.random.rand(len(action)) - sigma / 2
                
            return np.clip(action, -1, 1) # Clipping is necessary if we are adding noise
        else:
            return action
        
    def reset(self):
        self.noise.reset()

    def learn(self, 
              states, actions, rewards, next_states, dones,
              next_actions, 
              ag2_states, ag2_actions, ag2_next_states, 
              ag2_next_actions):              
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            states, actions, rewards, next_states, dones - parameters for agent
            next_actions - actions produced by target network
            ag2_states, ag2_actions, ag2_next_states - parameters for the other agent
            ag2_next_actions - actions produced by target network of the other agent
        """

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        with torch.no_grad():
            Q_targets_next = self.critic_target(next_states, next_actions, ag2_next_states, ag2_next_actions)
        
            # Compute Q targets for current states 
            Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones))
            
        # Compute critic loss
        Q_expected = self.critic_local(states, actions, ag2_states, ag2_actions)
            
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        pred_actions = self.actor_local(states)
        actor_loss = -self.critic_local(states, pred_actions, ag2_states, ag2_next_actions).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, self.tau)
        self.soft_update(self.actor_local, self.actor_target, self.tau)                     

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
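The class above does not own a replay buffer; its `learn` expects the caller to pass both agents' tensors. A hedged sketch of how a two-agent wrapper might drive it is shown below; the batch tensors, their shapes, and the `maddpg_update` helper are assumptions, not part of the original code.

import torch


def maddpg_update(agent1, agent2, batch1, batch2):
    """Hypothetical joint update for two Agent instances built from the same params."""
    s1, a1, r1, ns1, d1 = batch1          # agent 1 transitions (torch tensors)
    s2, a2, r2, ns2, d2 = batch2          # agent 2 transitions from the same timesteps

    with torch.no_grad():
        na1 = agent1.actor_target(ns1)    # next actions from each agent's target policy
        na2 = agent2.actor_target(ns2)

    agent1.learn(s1, a1, r1, ns1, d1, na1, s2, a2, ns2, na2)
    agent2.learn(s2, a2, r2, ns2, d2, na2, s1, a1, ns1, na1)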
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self,
                 state_size,
                 action_size,
                 num_agents,
                 seed,
                 fc1=400,
                 fc2=300):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.num_agents = num_agents

        self.noise = [
            OrnsteinUhlenbeckProcess(size=(action_size, ), std=0.2)
            for i in range(num_agents)
        ]

        # actor local and target network (Policy gradient)
        self.actor_local = Actor(state_size, action_size, fc1, fc2,
                                 seed).to(device)
        self.actor_target = Actor(state_size, action_size, fc1, fc2,
                                  seed).to(device)

        # critic local and target network (Q-Learning)
        self.critic_local = Critic(state_size, action_size, fc1, fc2,
                                   seed).to(device)
        self.critic_target = Critic(state_size, action_size, fc1, fc2,
                                    seed).to(device)

        # optimizer for critic and actor network
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=ACTOR_LR)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=CRITIC_LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)

        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        for i in range(self.num_agents):
            self.memory.add(state[i], action[i], reward[i], next_state[i],
                            done[i])

        self.t_step += 1

        # If enough samples are available in memory, get random subset and learn
        if len(self.memory) > BATCH_SIZE:
            if self.t_step % UPDATE_EVERY == 0:
                for i in range(UPDATE_TIMES):
                    experiences = self.memory.sample()
                    self.learn(experiences, GAMMA)

    def act(self, state, training=True):
        """Returns continous actions values for all action for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
        """

        state = torch.from_numpy(state).float().detach().to(device)

        self.actor_local.eval()
        with torch.no_grad():
            actions = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()

        noise = np.array(
            [self.noise[i].sample() for i in range(self.num_agents)])

        return np.clip(actions + noise, -1, 1)

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value
        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """

        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        # clip gradients after backward() so the clipping actually takes effect
        torch.nn.utils.clip_grad_norm_(self.actor_local.parameters(), 1)
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def reset_random(self):
        for i in range(self.num_agents):
            self.noise[i].reset_states()
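Most examples on this page also assume a `ReplayBuffer(action_size, buffer_size, batch_size, seed)` class whose `sample()` returns torch tensors. A minimal fixed-size buffer consistent with how the `learn` methods unpack experiences is sketched below; the `device` argument and its default are additions for self-containment.

import random
from collections import deque, namedtuple

import numpy as np
import torch


class ReplayBuffer:
    """Fixed-size experience replay buffer (sketch)."""

    def __init__(self, action_size, buffer_size, batch_size, seed, device="cpu"):
        self.action_size = action_size    # kept for signature compatibility; unused here
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.experience = namedtuple(
            "Experience", ["state", "action", "reward", "next_state", "done"])
        self.seed = random.seed(seed)
        self.device = device

    def add(self, state, action, reward, next_state, done):
        """Add a new experience to memory."""
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self):
        """Randomly sample a batch of experiences and convert them to tensors."""
        experiences = random.sample(self.memory, k=self.batch_size)
        states = torch.from_numpy(np.vstack([e.state for e in experiences])).float().to(self.device)
        actions = torch.from_numpy(np.vstack([e.action for e in experiences])).float().to(self.device)
        rewards = torch.from_numpy(np.vstack([e.reward for e in experiences])).float().to(self.device)
        next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences])).float().to(self.device)
        dones = torch.from_numpy(np.vstack([e.done for e in experiences]).astype(np.uint8)).float().to(self.device)
        return (states, actions, rewards, next_states, dones)

    def __len__(self):
        return len(self.memory)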
Example #5
def main():
    env = gym.make(args.env_name)
    env.seed(args.seed)
    torch.manual_seed(args.seed)

    num_inputs = env.observation_space.shape[0]
    num_actions = env.action_space.shape[0]
    running_state = ZFilter((num_inputs,), clip=5)

    print('state size:', num_inputs) 
    print('action size:', num_actions)

    actor = Actor(num_inputs, num_actions, args)
    critic = Critic(num_inputs, args)
    discrim = Discriminator(num_inputs + num_actions, args)

    actor_optim = optim.Adam(actor.parameters(), lr=args.learning_rate)
    critic_optim = optim.Adam(critic.parameters(), lr=args.learning_rate, 
                              weight_decay=args.l2_rate) 
    discrim_optim = optim.Adam(discrim.parameters(), lr=args.learning_rate)
    
    # load demonstrations
    expert_demo, _ = pickle.load(open('./expert_demo/expert_demo.p', "rb"))
    demonstrations = np.array(expert_demo)
    print("demonstrations.shape", demonstrations.shape)
    
    # writer = SummaryWriter(args.logdir)
    
    if args.load_model is not None:
        saved_ckpt_path = os.path.join(os.getcwd(), 'save_model', str(args.load_model))
        ckpt = torch.load(saved_ckpt_path)

        actor.load_state_dict(ckpt['actor'])
        critic.load_state_dict(ckpt['critic'])
        discrim.load_state_dict(ckpt['discrim'])

        running_state.rs.n = ckpt['z_filter_n']
        running_state.rs.mean = ckpt['z_filter_m']
        running_state.rs.sum_square = ckpt['z_filter_s']

        print("Loaded OK ex. Zfilter N {}".format(running_state.rs.n))

    
    episodes = 0    

    for iter in range(args.max_iter_num):
        actor.eval(), critic.eval()
        memory = deque()

        steps = 0
        scores = []

        while steps < args.total_sample_size: 
            state = env.reset()
            score = 0

            state = running_state(state)
            
            for _ in range(10000): 
                if args.render:
                    env.render()

                steps += 1

                mu, std = actor(torch.Tensor(state).unsqueeze(0))
                action = get_action(mu, std)[0]
                next_state, reward, done, _ = env.step(action)
                irl_reward = get_reward(discrim, state, action)

                if done:
                    mask = 0
                else:
                    mask = 1

                memory.append([state, action, irl_reward, mask])

                next_state = running_state(next_state)
                state = next_state

                score += reward

                if done:
                    break
            
            episodes += 1
            scores.append(score)
        
        score_avg = np.mean(scores)
        print('{} episode score is {:.2f}'.format(episodes, score_avg))
        # writer.add_scalar('log/score', float(score_avg), iter)

        actor.train(), critic.train(), discrim.train() 
        train_discrim(discrim, memory, discrim_optim, demonstrations, args)
        train_actor_critic(actor, critic, memory, actor_optim, critic_optim, args)
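Example #5 calls `get_action` and `get_reward` helpers that live elsewhere in that repository. The sketch below shows one plausible implementation under the common GAIL convention that the policy is Gaussian and the imitation reward is -log D(s, a); the exact sign convention depends on how the discriminator is trained, so treat these as assumptions rather than the original code.

import math

import torch


def get_action(mu, std):
    """Sample a continuous action from the policy's Gaussian (returns a numpy batch)."""
    action = torch.normal(mu, std)
    return action.data.numpy()


def get_reward(discrim, state, action):
    """Imitation reward from the discriminator, assumed to score (s, a) pairs in (0, 1)."""
    state_action = torch.cat([torch.Tensor(state), torch.Tensor(action)])
    with torch.no_grad():
        return -math.log(discrim(state_action)[0].item() + 1e-8)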
Example #6
class Agent():
    """Interacts with and learns from the environment."""
    
    def __init__(self, state_size, action_size, random_seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        # Actor Network (w/ Target Network)
        self.actor_local1 = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target1 = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer1 = optim.Adam(self.actor_local1.parameters(), lr=LR_ACTOR)
        self.actor_local2 = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target2 = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer2 = optim.Adam(self.actor_local2.parameters(), lr=LR_ACTOR)
        #critic_state_size = np.reshape(state_size, 48)
        # Critic Network (w/ Target Network)
        self.critic_local1 = Critic(state_size*2, action_size, random_seed).to(device)
        self.critic_target1 = Critic(state_size*2,action_size, random_seed).to(device)
        self.critic_optimizer1 = optim.Adam(self.critic_local1.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)
        self.critic_local2 = Critic(state_size*2, action_size, random_seed).to(device)
        self.critic_target2 = Critic(state_size*2, action_size, random_seed).to(device)
        self.critic_optimizer2 = optim.Adam(self.critic_local2.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)
    
    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
       
        self.memory.add(state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > 7000:
            experiences = self.memory.sample()
            self.learn(experiences, GAMMA)
    
    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        #self.noise = OUNoise(self.action_size, random_seed,sigma=sigma)
        state = torch.from_numpy(state).float().to(device)
        self.actor_local1.eval()
        with torch.no_grad():
            action1 = self.actor_local1(state[0]).cpu().data.numpy()
        self.actor_local1.train()
        if add_noise:
            action1 += self.noise.sample()
        self.actor_local2.eval()
        with torch.no_grad():
            action2 = self.actor_local2(state[1]).cpu().data.numpy()
        self.actor_local2.train()
        if add_noise:
            action2 += self.noise.sample()
        
        return np.vstack((np.clip(action1, -1, 1), np.clip(action2, -1, 1)))

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences
        
        statesforcritic = torch.reshape(states, (BATCH_SIZE, self.state_size*2))
        nextstatesforcritic = torch.reshape(next_states, (BATCH_SIZE, self.state_size*2))
        actionsforcritic = torch.reshape(actions, (BATCH_SIZE, self.action_size*2))
        nextstatesforactor = torch.split(nextstatesforcritic, self.state_size, 1)
        statesforactor = torch.split(statesforcritic, self.state_size, 1)
        actionsforactor = torch.split(actionsforcritic, self.action_size, 1)
        rewardsforactor = torch.split(rewards, 1, 1)
        donesforactor = torch.split(dones, 1, 1)
      
        # --------------------------- update critic 1---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next1 = self.actor_target1(nextstatesforactor[0])
        actions_next2 = self.actor_target2(nextstatesforactor[1])
        actions_next = torch.cat((actions_next1, actions_next2), 1)
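        # NOTE: this concatenated vector is not used below; each critic in this
        # implementation only receives its own agent's next action.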
        
        Q_targets_next_1 = self.critic_target1(nextstatesforcritic, actions_next1)
        # Compute Q targets for current states (y_i)
        Q_targets1 = rewardsforactor[0] + (gamma * Q_targets_next_1 * (1 - donesforactor[0]))
        # Compute critic loss
        Q_expected1 = self.critic_local1(statesforcritic, actionsforactor[0])
        critic_loss1 = F.mse_loss(Q_expected1, Q_targets1)
        # Minimize the loss
        self.critic_optimizer1.zero_grad()
        critic_loss1.backward()
        self.critic_optimizer1.step()
        
        
        Q_targets_next_2 = self.critic_target2(nextstatesforcritic, actions_next2)
        # Compute Q targets for current states (y_i)
        Q_targets2 = rewardsforactor[1] + (gamma * Q_targets_next_2 * (1 - donesforactor[1]))
        # Compute critic loss
        Q_expected2 = self.critic_local2(statesforcritic, actionsforactor[1])
        critic_loss2 = F.mse_loss(Q_expected2, Q_targets2)
        # Minimize the loss
        self.critic_optimizer2.zero_grad()
        critic_loss2.backward()
        self.critic_optimizer2.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred1 = self.actor_local1(statesforactor[0])
        
        actor_loss1 = -self.critic_local1(statesforcritic, actions_pred1).mean()
        # Minimize the loss
        self.actor_optimizer1.zero_grad()
        actor_loss1.backward()
        self.actor_optimizer1.step()
        
        actions_pred2 = self.actor_local2(statesforactor[1])
        actor_loss2 = -self.critic_local2(statesforcritic, actions_pred2).mean()
        # Minimize the loss
        self.actor_optimizer2.zero_grad()
        actor_loss2.backward()
        self.actor_optimizer2.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local1, self.critic_target1, TAU)
        self.soft_update(self.critic_local2, self.critic_target2, TAU)
        self.soft_update(self.actor_local1, self.actor_target1, TAU)
        self.soft_update(self.actor_local2, self.actor_target2, TAU)
                     

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, n_agents, random_seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise((n_agents, action_size), random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed)

        # Epsilon
        self.epsilon = EPSILON


#         # Make sure target is with the same weight as the source
#         self.hard_update(self.actor_target, self.actor_local)
#         self.hard_update(self.critic_target, self.critic_local)

    def step(self, states, actions, rewards, next_states, dones, timestep):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward

        for state, action, reward, next_state, done in zip(
                states, actions, rewards, next_states, dones):
            self.memory.add(state, action, reward, next_state, done)
        #self.memory.add(states, actions, rewards, next_states, dones)

        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE and timestep % UPDATE_EVERY == 0:
            for _ in range(UPDATE_TIMES):
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()

        if add_noise:

            # epsilon decay
            self.epsilon -= EPSILON_DECAY
            self.epsilon = np.maximum(self.epsilon, 0.001)
            action += self.epsilon * self.noise.sample()

        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        # gradient clipping for the critic (currently disabled)
        #torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)

        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def load_weights(self, cp_actor, cp_critic):
        self.critic_local.load_state_dict(torch.load(cp_critic))
        self.actor_local.load_state_dict(torch.load(cp_actor))

    def eval_act(self, state):
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        return action
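A hedged sketch of how the `load_weights` and `eval_act` methods above might be used to run a saved policy. The checkpoint names, the placeholder sizes, the `env` object, its gym-style vectorized API, and the module-level `device` are all assumptions.

import numpy as np
import torch

agent = Agent(state_size=33, action_size=4, n_agents=20, random_seed=0)   # placeholder sizes
agent.load_weights("checkpoint_actor.pth", "checkpoint_critic.pth")       # actor first, then critic

states = env.reset()                                   # env is assumed to exist
for _ in range(1000):
    states_t = torch.from_numpy(states).float().to(device)
    actions = agent.eval_act(states_t)                 # deterministic, no exploration noise
    states, rewards, dones, _ = env.step(np.clip(actions, -1, 1))
    if np.any(dones):
        break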
Example #8
class Agent():
    """Interacts with and learns from the environment."""
    
    def __init__(self, state_size, action_size, random_seed, num_agents=1):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
            num_agents (int) : number of agents in the environment 
        """
        
        """
        Base Working for multiple agents
        ======
        
        Many different agents will sample the environment at the same time to get different states, 
        for which based on the current policy actions will be decided, rewards will be received along with
        the next states. All the agents update the same experience replay buffer and utilise the same neural 
        net to decide on the optimal set of actions. This should theoretically increase training efficiency 
        since so many different states are being experienced at the same time.
        """
        
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise((num_agents,action_size),random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)
        
        # Initialize target networks with the same weights as the local networks (Student Hub discussion)
        self.hard_update(self.actor_local, self.actor_target)
        self.hard_update(self.critic_local, self.critic_target)
    
    def step(self, states, actions, rewards, next_states, dones):
        """Save experience in replay memory."""
        # Save experience / reward
        for state, action, reward, next_state, done in zip(states, actions, rewards, next_states, dones):
            self.memory.add(state, action, reward, next_state, done)
    
    """To decouple learning from experience collection and use random sample from buffer to learn."""
    def update(self):
        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample()
            self.learn(experiences, GAMMA)

    def act(self, state, eps, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        
        if add_noise and np.random.random() < eps:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        # clip gradients before the optimizer step so the clipping takes effect
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()
        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)                     

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
            
    def hard_update(self,local_model,target_model):
        
        for target_param,local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(local_param.data)
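Example #8 separates experience collection (`step`) from learning (`update`) and gates its exploration noise on an epsilon passed to `act`. A hedged training-loop sketch follows; the environment API, the placeholder sizes, and the decay schedule are assumptions.

import numpy as np

agent = Agent(state_size=33, action_size=4, random_seed=0, num_agents=20)  # placeholder sizes
eps, eps_decay, eps_min = 1.0, 0.995, 0.05
for episode in range(2000):
    states = env.reset()                               # env is assumed to exist
    agent.reset()
    while True:
        actions = agent.act(states, eps)               # noise applied with probability eps
        next_states, rewards, dones, _ = env.step(actions)
        agent.step(states, actions, rewards, next_states, dones)
        agent.update()                                 # one learning step per environment step
        states = next_states
        if np.any(dones):
            break
    eps = max(eps * eps_decay, eps_min)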
class Agent():
    """Implements a DDPG Agent

    Args:
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        train (bool, optional): whether to build the training machinery (target networks, optimizers, noise, replay buffer)
        device (str, optional): device used for tensor operations
        buffer_size (int, optional): size of the experience replay buffer
        batch_size (int, optional): size of batch sampled for experience replay
        lr (float, optional): learning rate of both actor and critic models
        gamma (float, optional): discount factor
        tau (float, optional): soft update rate
        update_freq (int, optional): number of environment steps between learning phases
        nb_updates (int, optional): number of gradient updates per learning phase
        noise_mean (float, optional): mean of the Ornstein-Uhlenbeck process
        noise_theta (float, optional): theta parameter of the Ornstein-Uhlenbeck process
        noise_sigma (float, optional): sigma parameter of the Ornstein-Uhlenbeck process
        eps (float, optional): initial scale applied to exploration noise
        eps_decay (float, optional): amount subtracted from eps after each learning phase
        grad_clip (float, optional): gradient clipping threshold for the critic
    """

    def __init__(self, state_size, action_size, train=False, device=None, buffer_size=1e6, batch_size=128,
                 lr=1e-3, gamma=0.99, tau=1e-3, update_freq=20, nb_updates=10,
                 noise_mean=0, noise_theta=0.05, noise_sigma=0.15, eps=1.0, eps_decay=1e-6,
                 grad_clip=1.0):

        self.state_size = state_size
        self.action_size = action_size
        self.train = train
        self.bs = batch_size
        self.gamma = gamma
        self.tau = tau
        self.grad_clip = grad_clip
        self.update_freq = update_freq
        self.nb_updates = nb_updates
        self.eps = eps
        self.eps_decay = eps_decay

        if device is None:
            if torch.cuda.is_available():
                device = 'cuda:0'
            else:
                device = 'cpu'
        self.device = torch.device(device)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size).to(self.device)
        if self.train:
            self.actor_target = Actor(state_size, action_size).to(self.device)
            self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=lr)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size).to(self.device)
        if self.train:
            self.critic_target = Critic(state_size, action_size).to(self.device)
            self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=lr, weight_decay=0.)

            # Noise process
            self.noise = OUNoise(action_size, noise_mean, noise_theta, noise_sigma)

            # Replay memory
            self.memory = ReplayBuffer(action_size, int(buffer_size), batch_size, self.device)

    def step(self, state, action, reward, next_state, done, timestep):
        """Save experience in replay memory, and use random sample from buffer to learn."""

        if not self.train:
            raise ValueError('agent cannot be trained if constructor argument train=False')

        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.bs and timestep % self.update_freq == 0:
            for _ in range(self.nb_updates):
                experiences = self.memory.sample()
                self.learn(experiences, self.gamma)

    def act(self, state, add_noise=True):
        """Resolves action for given state as per current policy.

        Args:
            state (numpy.ndarray): current state representation
            add_noise (bool, optional): should noise be add to action value
        Returns:
            numpy.ndarray: clipped action value
        """
        state = torch.from_numpy(state).float().to(self.device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        if self.train:
            self.actor_local.train()

            if add_noise:
                action += self.eps * self.noise.sample()

        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Args:
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """

        if not self.train:
            raise ValueError('agent cannot be trained if constructor argument train=False')
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions.to(dtype=torch.float32))
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        # gradient clipping for critic
        if self.grad_clip > 0:
            torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), self.grad_clip)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, self.tau)
        self.soft_update(self.actor_local, self.actor_target, self.tau)

        # --------------------- and update epsilon decay ----------------------- #
        if self.eps_decay > 0:
            self.eps -= self.eps_decay
            self.noise.reset()

    @staticmethod
    def soft_update(local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Args:
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
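Because this implementation only builds target networks, optimizers, the noise process, and the replay buffer when train=True, an inference-only agent is cheap to construct. A hedged sketch follows; the placeholder sizes, the checkpoint filename, and `state` are assumptions.

import torch

# Training agent: full DDPG machinery is created.
train_agent = Agent(state_size=33, action_size=4, train=True)

# Inference-only agent: just the local networks; calling step() or learn() raises ValueError.
eval_agent = Agent(state_size=33, action_size=4, train=False)
eval_agent.actor_local.load_state_dict(
    torch.load("actor_checkpoint.pth", map_location=eval_agent.device))   # assumed filename
action = eval_agent.act(state)   # exploration noise is skipped because self.train is False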
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, num_agents, random_seed):
        """Initialize an Agent object.
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            num_agents (int): number of agents
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        self.seed = random.seed(random_seed)
        self.eps = EPS_START
        # set decay rate based on the episode at which epsilon should reach its final value
        self.eps_decay = 1 / (EPS_EP_END * LEARN_NUM)
        self.timestep = 0

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise((num_agents, action_size), random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed)

    def step(self, state, action, reward, next_state, done, agent_number):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        self.timestep += 1
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)
        # Learn, if enough samples are available in memory and at learning interval settings
        if len(self.memory) > BATCH_SIZE and self.timestep % LEARN_EVERY == 0:
            for _ in range(LEARN_NUM):
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA, agent_number)

    def act(self, states, add_noise):
        """Returns actions for both agents as per current policy, given their respective states."""
        states = torch.from_numpy(states).float().to(device)
        actions = np.zeros((self.num_agents, self.action_size))
        self.actor_local.eval()
        with torch.no_grad():
            # get action for each agent and concatenate them
            for agent_num, state in enumerate(states):
                action = self.actor_local(state).cpu().data.numpy()
                actions[agent_num, :] = action
        self.actor_local.train()
        # add noise to actions
        if add_noise:
            actions += self.eps * self.noise.sample()
        actions = np.clip(actions, -1, 1)
        return actions

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma, agent_number):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value
        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        # Construct next actions vector relative to the agent
        if agent_number == 0:
            actions_next = torch.cat((actions_next, actions[:, 2:]), dim=1)
        else:
            actions_next = torch.cat((actions[:, :2], actions_next), dim=1)
        # Compute Q targets for current states (y_i)
        Q_targets_next = self.critic_target(next_states, actions_next)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        # Construct action prediction vector relative to each agent
        if agent_number == 0:
            actions_pred = torch.cat((actions_pred, actions[:, 2:]), dim=1)
        else:
            actions_pred = torch.cat((actions[:, :2], actions_pred), dim=1)
        # Compute actor loss
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

        # update noise decay parameter
        self.eps -= self.eps_decay
        self.eps = max(self.eps, EPS_FINAL)
        self.noise.reset()

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
Example #11
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self,
                 state_size,
                 action_size,
                 random_seed,
                 hidden_sizes_actor=[64, 64],
                 hidden_sizes_critic=[128, 64, 32]):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
            hidden_sizes_actor (list): list of neurons in each layer of the actor network
            hidden_sizes_critic (list): list of neurons in each layer of the critic network
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.hidden_sizes_actor = hidden_sizes_actor
        self.hidden_sizes_critic = hidden_sizes_critic

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, random_seed,
                                 hidden_sizes_actor).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed,
                                  hidden_sizes_actor).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR,
                                          weight_decay=WEIGHT_DECAY_AC)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, random_seed,
                                   hidden_sizes_critic).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed,
                                    hidden_sizes_critic).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY_CR)

        # Initialize target networks with same weights:
        self.soft_update(self.critic_local, self.critic_target, 1)
        self.soft_update(self.actor_local, self.actor_target, 1)

        # Add Ornstein-Uhlenbeck noise
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed)

    def show_actor_local(self):
        network = self.actor_local
        x = Variable(torch.randn(1, self.state_size))
        y = network(x)
        return make_dot(y, params=dict(list(network.named_parameters())))

    def show_actor_target(self):
        network = self.actor_target
        x = Variable(torch.randn(1, self.state_size))
        y = network(x)
        return make_dot(y, params=dict(list(network.named_parameters())))

    def show_critic_local(self):
        network = self.critic_local
        x1 = Variable(torch.randn(1, self.state_size))
        x2 = Variable(torch.randn(1, self.action_size))
        y = network(x1, x2)
        return make_dot(y, params=dict(list(network.named_parameters())))

    def show_critic_target(self):
        network = self.critic_target
        x1 = Variable(torch.randn(1, self.state_size))
        x2 = Variable(torch.randn(1, self.action_size))
        y = network(x1, x2)
        return make_dot(y, params=dict(list(network.named_parameters())))

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        for s, a, r, n_s, d in zip(state, action, reward, next_state, done):
            self.memory.add(s, a, r, n_s, d)

        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample()
            self.learn(experiences, GAMMA)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
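Every agent in this listing draws exploration noise from an OUNoise process, but that class is not reproduced in the snippets. A minimal Ornstein-Uhlenbeck sketch that matches the constructor and the reset()/sample() calls made above; the mu, theta and sigma defaults are common choices, not values taken from the original source:

import copy
import random
import numpy as np

class OUNoise:
    """Ornstein-Uhlenbeck process (illustrative sketch, not the original class)."""

    def __init__(self, size, seed, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)   # size may be an int or a (n_agents, action_size) tuple
        self.theta = theta
        self.sigma = sigma
        random.seed(seed)
        self.reset()

    def reset(self):
        """Reset the internal state to the long-running mean."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Advance the process one step and return the new state as the noise sample."""
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.standard_normal(self.mu.shape)
        self.state = x + dx
        return self.state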
class DDPGAgent:
    def __init__(self,
                 state_size,
                 action_size,
                 random_seed,
                 lr_actor=LR_ACTOR,
                 lr_critic=LR_CRITIC,
                 weight_decay=WEIGHT_DECAY):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each agent's observation
            action_size (int): dimension of each action
            random_seed (int): random seed
            lr_actor (float): actor learning rate
            lr_critic (float): critic learning rate
            weight_decay (float): L2 weight decay for the critic optimizer
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=lr_actor)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size * 2, action_size * 2,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size * 2, action_size * 2,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=lr_critic,
                                           weight_decay=weight_decay)

        # Add noise process to agent
        self.noise = OUNoise(action_size, random_seed**2)

    def act(self, obs, add_noise=True):
        """Returns actions for given state as per current policy."""

        obs = torch.from_numpy(np.expand_dims(obs, 0)).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(obs).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.squeeze(np.clip(action, -1, 1), axis=0)

    def target_act(self, obs):
        """get target network actions from the agent in the MADDPG object """
        obs = torch.from_numpy(np.expand_dims(obs, 0)).float().to(device)
        action = self.actor_target(obs)
        return action

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
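The ReplayBuffer used throughout is also not shown. A minimal sketch consistent with the four-argument constructor and the add()/sample()/__len__ interface used above (one of the later examples passes a device argument explicitly instead); device is assumed to be defined as in the surrounding snippets, and sampling is uniform:

import random
from collections import deque, namedtuple

import numpy as np
import torch

class ReplayBuffer:
    """Fixed-size, uniformly sampled experience buffer (illustrative sketch)."""

    def __init__(self, action_size, buffer_size, batch_size, seed):
        self.action_size = action_size
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.experience = namedtuple(
            "Experience", ["state", "action", "reward", "next_state", "done"])
        random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self):
        experiences = random.sample(self.memory, k=self.batch_size)
        states = torch.from_numpy(np.vstack([e.state for e in experiences])).float().to(device)
        actions = torch.from_numpy(np.vstack([e.action for e in experiences])).float().to(device)
        rewards = torch.from_numpy(np.vstack([e.reward for e in experiences])).float().to(device)
        next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences])).float().to(device)
        dones = torch.from_numpy(np.vstack([e.done for e in experiences]).astype(np.uint8)).float().to(device)
        return (states, actions, rewards, next_states, dones)

    def __len__(self):
        return len(self.memory)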
Example #13
0
class DDPG():
    '''
    Deep deterministic policy gradient (DDPG) agent.
    '''
    def __init__(self, state_size, action_size, random_seed, gamma, lr_actor,
                 lr_critic, weight_decay, tau, buffer_size, batch_size,
                 update_rate, updates_per_step):
        '''
        Initialize an DDPG Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
            gamma (float): discount factor
            lr_actor (float): actor learning rate
            lr_critic (float): critic learning rate
            weight_decay (float): L2 weight decay for the critic optimizer
            tau (float): interpolation factor for the soft target update
            buffer_size (int): replay buffer capacity
            batch_size (int): minibatch size
            update_rate (int): number of steps between learning phases
            updates_per_step (int): learning updates performed per learning phase
            
        '''

        self.state_size = state_size
        self.action_size = action_size

        self.seed = random.seed(random_seed)

        # Hyperparameter
        self.gamma = gamma
        self.lr_actor = lr_actor
        self.lr_critic = lr_critic
        self.tau = tau
        self.update_rate = update_rate
        self.updates_per_step = updates_per_step

        # Instantiate Actor Networks
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)

        # Instantiate Critic Networks
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)

        # Instantiate Optimizers
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=lr_actor)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=lr_critic,
                                           weight_decay=weight_decay)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, buffer_size, batch_size,
                                   random_seed)

        self.update_counter = 0

    def step(self, state, action, reward, next_state, done):
        '''
        Save experience in replay memory, and use random sample from buffer to learn.
        '''

        # Store experiences
        self.memory.add(state, action, reward, next_state, done)

        if len(self.memory) > self.memory.batch_size:

            # Update counter
            self.update_counter += 1

            if self.update_counter >= self.update_rate:
                for _ in range(self.updates_per_step):
                    experiences = self.memory.sample()
                    self.learn(experiences, self.gamma)

                self.update_counter = 0

    def act(self, state, add_noise=True):
        '''
        Returns actions for given state as per current policy.
        '''

        state = torch.from_numpy(state).float().to(device)

        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()

        if add_noise:
            action += self.noise.sample()

        return np.clip(action, -1, 1)

    def reset(self):
        '''
        Reset the noise process.
        '''
        self.noise.reset()

    def learn(self, experiences, gamma):
        '''
        Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        '''

        states, actions, rewards, next_states, dones = experiences

        # update critic ##########################################################

        # compute predicted Q values
        next_actions = self.actor_target(next_states)
        next_Q_targets = self.critic_target(next_states, next_actions)

        # compute Q values for current states
        Q_targets = rewards + (gamma * next_Q_targets * (1 - dones))

        # compute loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)

        # update weights
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # update actor ##########################################################

        # compute loss
        pred_actions = self.actor_local(states)
        actor_loss = -self.critic_local(states, pred_actions).mean()

        # update weights
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # update target networks
        self.soft_update(self.critic_local, self.critic_target, self.tau)
        self.soft_update(self.actor_local, self.actor_target, self.tau)

    def soft_update(self, local_model, target_model, tau):
        '''
        Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        '''

        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
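The target Q_targets = rewards + gamma * Q_targets_next * (1 - dones) used by every learn() method in this listing masks out the bootstrap term for terminal transitions. A small numeric check with dummy tensors:

import torch

gamma = 0.99
rewards = torch.tensor([[1.0], [0.5]])
Q_targets_next = torch.tensor([[10.0], [10.0]])
dones = torch.tensor([[0.0], [1.0]])   # second transition is terminal

Q_targets = rewards + gamma * Q_targets_next * (1 - dones)
print(Q_targets)   # tensor([[10.9000], [0.5000]]) -- bootstrap applies only to the non-terminal row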
Example #14
0
def main():
    env = gym.make(args.env_name)
    env.seed(args.seed)
    torch.manual_seed(args.seed)

    num_inputs = env.observation_space.shape[0]
    num_actions = env.action_space.shape[0]
    running_state = ZFilter((num_inputs,), clip=5)

    print('state size:', num_inputs) 
    print('action size:', num_actions)

    actor = Actor(num_inputs, num_actions, args)
    critic = Critic(num_inputs, args)
    discrim = Discriminator(num_inputs + num_actions, args)

    actor_optim = optim.Adam(actor.parameters(), lr=args.learning_rate)
    critic_optim = optim.Adam(critic.parameters(), lr=args.learning_rate, 
                              weight_decay=args.l2_rate) 
    discrim_optim = optim.Adam(discrim.parameters(), lr=args.learning_rate)
    
    # load demonstrations
    expert_demo, _ = pickle.load(open('./expert_demo/expert_demo.p', "rb"))
    demonstrations = np.array(expert_demo)
    print("demonstrations.shape", demonstrations.shape)
    
    writer = SummaryWriter(args.logdir)

    if args.load_model is not None:
        saved_ckpt_path = os.path.join(os.getcwd(), 'save_model', str(args.load_model))
        ckpt = torch.load(saved_ckpt_path)

        actor.load_state_dict(ckpt['actor'])
        critic.load_state_dict(ckpt['critic'])
        discrim.load_state_dict(ckpt['discrim'])

        running_state.rs.n = ckpt['z_filter_n']
        running_state.rs.mean = ckpt['z_filter_m']
        running_state.rs.sum_square = ckpt['z_filter_s']

        print("Loaded OK ex. Zfilter N {}".format(running_state.rs.n))

    
    episodes = 0
    train_discrim_flag = True

    for iter in range(args.max_iter_num):
        actor.eval(), critic.eval()
        memory = deque()

        steps = 0
        scores = []

        while steps < args.total_sample_size: 
            state = env.reset()
            score = 0

            state = running_state(state)
            
            for _ in range(10000): 
                if args.render:
                    env.render()

                steps += 1

                mu, std = actor(torch.Tensor(state).unsqueeze(0))
                action = get_action(mu, std)[0]
                next_state, reward, done, _ = env.step(action)
                irl_reward = get_reward(discrim, state, action)

                if done:
                    mask = 0
                else:
                    mask = 1

                memory.append([state, action, irl_reward, mask])

                next_state = running_state(next_state)
                state = next_state

                score += reward

                if done:
                    break
            
            episodes += 1
            scores.append(score)
        
        score_avg = np.mean(scores)
        print('{}:: {} episode score is {:.2f}'.format(iter, episodes, score_avg))
        writer.add_scalar('log/score', float(score_avg), iter)

        actor.train(), critic.train(), discrim.train()
        if train_discrim_flag:
            expert_acc, learner_acc = train_discrim(discrim, memory, discrim_optim, demonstrations, args)
            print("Expert: %.2f%% | Learner: %.2f%%" % (expert_acc * 100, learner_acc * 100))
            if expert_acc > args.suspend_accu_exp and learner_acc > args.suspend_accu_gen:
                train_discrim_flag = False
        train_actor_critic(actor, critic, memory, actor_optim, critic_optim, args)

        if iter % 100 == 0:  # save a checkpoint every 100 iterations
            score_avg = int(score_avg)

            model_path = os.path.join(os.getcwd(),'save_model')
            if not os.path.isdir(model_path):
                os.makedirs(model_path)

            ckpt_path = os.path.join(model_path, 'ckpt_'+ str(score_avg)+'.pth.tar')

            save_checkpoint({
                'actor': actor.state_dict(),
                'critic': critic.state_dict(),
                'discrim': discrim.state_dict(),
                'z_filter_n':running_state.rs.n,
                'z_filter_m': running_state.rs.mean,
                'z_filter_s': running_state.rs.sum_square,
                'args': args,
                'score': score_avg
            }, filename=ckpt_path)
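Example #14 calls get_action(mu, std) and get_reward(discrim, state, action), which are defined elsewhere in that codebase. A hedged sketch of what GAIL-style helpers of that shape typically look like, assuming a Gaussian policy head and a discriminator whose output lies in (0, 1); the -log D(s, a) form of the imitation reward is an assumption for illustration, not taken from this document:

import torch

def get_action(mu, std):
    # Sample an action from the Gaussian policy head; mu and std have shape (1, num_actions).
    action = torch.normal(mu, std)
    return action.data.numpy()

def get_reward(discrim, state, action):
    # Imitation reward from the discriminator: larger when the (state, action)
    # pair looks expert-like (assumed -log D convention).
    state = torch.tensor(state, dtype=torch.float32)
    action = torch.tensor(action, dtype=torch.float32)
    with torch.no_grad():
        d = discrim(torch.cat([state, action], dim=-1).unsqueeze(0))
    return -float(torch.log(d + 1e-8))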
Example #15
0
class DDPGAgent:
    def __init__(self, config):
        self.config = config
        self.seed = config.seed

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(config.action_size, config.state_size,
                                 config.actor_hidden_units,
                                 config.seed).to(device)
        self.actor_target = Actor(config.action_size, config.state_size,
                                  config.actor_hidden_units,
                                  config.seed).to(device)
        self.actor_optimizer = torch.optim.Adam(self.actor_local.parameters(),
                                                lr=config.actor_learning_rate)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(config.action_size, config.state_size,
                                   config.critic_hidden_units,
                                   config.seed).to(device)
        self.critic_target = Critic(config.action_size, config.state_size,
                                    config.critic_hidden_units,
                                    config.seed).to(device)

        self.critic_optimizer = torch.optim.Adam(
            self.critic_local.parameters(), lr=config.critic_learning_rate)

        # ----------------------- initialize target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, 1)
        self.soft_update(self.actor_local, self.actor_target, 1)

        self.noise = OUNoise(config.action_size, config.seed)

        if config.shared_replay_buffer:
            self.memory = config.memory
        else:
            self.memory = ReplayBuffer(config.action_size, config.buffer_size,
                                       config.batch_size, config.seed)

    def reset(self):
        self.noise.reset()

    def act(self, states):
        """Returns actions for given state as per current policy."""
        states = torch.from_numpy(states).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            actions = self.actor_local(states).cpu().data.numpy()
        self.actor_local.train()
        actions += self.noise.sample()
        return np.clip(actions, -1, 1)

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value
        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """

        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        #         torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target,
                         self.config.tau)
        self.soft_update(self.actor_local, self.actor_target, self.config.tau)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
class Agent():
    """Interacts with and learns from the environment."""
    
    def __init__(self, state_size, action_size, random_seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)
    
    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample()
            self.learn(experiences, GAMMA)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)                     

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
class Agent():
    def __init__(self, state_size, action_size, config, n_agents=1, seed=0):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            config (dict): hyperparameter dictionary (a sample sketch follows this class)
            n_agents (int): number of agents it will control in the environment
            seed (int): random seed
        """
        self.config = config
        self.state_size = state_size
        self.action_size = action_size
        self.seed = np.random.seed(seed)
        random.seed(seed)
        self.n_agents = n_agents

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size,
                                 action_size,
                                 leak=config['LEAKINESS'],
                                 seed=seed).to(device)
        self.actor_target = Actor(state_size,
                                  action_size,
                                  leak=config['LEAKINESS'],
                                  seed=seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=config['LR_ACTOR'])

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size,
                                   action_size,
                                   leak=config['LEAKINESS'],
                                   seed=seed).to(device)
        self.critic_target = Critic(state_size,
                                    action_size,
                                    leak=config['LEAKINESS'],
                                    seed=seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=config['LR_CRITIC'])

        # Noise process
        self.noise = OUNoise(action_size, seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, config['BUFFER_SIZE'],
                                   config['BATCH_SIZE'], seed)
        self.timesteps = 0

    def step(self, states, actions, rewards, next_states, dones):
        """ Given a batch of S,A,R,S' experiences, it saves them into the
            experience buffer, and occasionally samples from the experience
            buffer to perform training steps.
        """
        self.timesteps += 1
        for i in range(self.n_agents):
            self.memory.add(states[i], actions[i], rewards[i], next_states[i],
                            dones[i])

        if (len(self.memory) > self.config['BATCH_SIZE']) and (self.timesteps %
                                                               20 == 0):
            for _ in range(10):
                experiences = self.memory.sample()
                self.learn(experiences, self.config['GAMMA'])

    def act(self, states, add_noise=True):
        """ Given a list of states for each agent it returns the actions to be
            taken by each agent based on the current policy.
            Returns a numpy array of shape [n_agents, n_actions]
            NOTE: clips actions to be between -1, 1
        Args:
            states:    (np.ndarray) one row of state per agent, shape [n_agents, state_size]
            add_noise: (bool) add noise to the actions?
        """
        states = torch.from_numpy(states).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            actions = self.actor_local(states).cpu().data.numpy()
        self.actor_local.train()

        if add_noise:
            actions += [self.noise.sample() for _ in range(self.n_agents)]
        return np.clip(actions, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value
        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        self.update_critic(states, actions, rewards, gamma, next_states, dones)
        self.update_actor(states)
        self.update_target_networks()

    def update_critic(self, states, actions, rewards, gamma, next_states,
                      dones):

        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)

        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)

        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

    def update_actor(self, states):
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()

        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

    def update_target_networks(self):
        self.soft_update(self.critic_local, self.critic_target,
                         self.config['TAU'])
        self.soft_update(self.actor_local, self.actor_target,
                         self.config['TAU'])

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    @property
    def device(self):
        return device
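The Agent above is driven entirely by a config dictionary. Judging from the keys it reads, a compatible configuration might look like the sketch below; the numeric values and the environment sizes are placeholders, not values from the original:

config = {
    'LEAKINESS': 0.01,        # leaky-ReLU slope used by Actor and Critic
    'LR_ACTOR': 1e-4,         # actor learning rate
    'LR_CRITIC': 1e-3,        # critic learning rate
    'BUFFER_SIZE': int(1e6),  # replay buffer capacity
    'BATCH_SIZE': 128,        # minibatch size
    'GAMMA': 0.99,            # discount factor
    'TAU': 1e-3,              # soft-update interpolation factor
}
agent = Agent(state_size=33, action_size=4, config=config, n_agents=20, seed=0)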
Example #18
0
class Agent():
    """This agent will interact and learn from the UNITY LM Tennis environment."""
    def __init__(self, state_size, action_size, random_seed):
        """Initialize the Agent.
        
        Parameters:
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        # Actor Network and its target network
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network and its target network
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = [OUNoise(action_size, random_seed) for i in range(2)]

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed)

    def step(self, state, action, reward, next_state, done, timestep):
        """Save the experiences in replay buffer, reuse these samples when learning."""
        self.memory.add(state, action, reward, next_state, done)

        # Start learning when enough samples are present in memory

        if timestep % 2 == 0 and len(self.memory) > BATCH_SIZE:
            # sample ten times from memory
            for _ in range(10):
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, add_noise=True):
        """Get actions for state following current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()

        # Add each agent's noise sample to its action row for off-policy exploration
        if add_noise:
            for i in range(2):
                action[i] += self.noise[i].sample()
        #Clip for training stability
        return np.clip(action, -1, 1)

    def reset(self):
        """ Reset the noise process """
        for i in range(2):
            self.noise[i].reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Parameters:
        
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        ############# Update critic network #####################################
        # Get next-state actions and Q values from target network
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        ################# update actor #############################################
        # Compute actor's loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize actor's loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        ############### update targets ##############################################
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Parameters
        
            local_model: MLP from which weights will be copied
            target_model: MLP to which weights will be copied to
            tau: interpolation parameter when copying the weights
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
Example #19
0
class Agent:
    def __init__(self,
                 num_agents,
                 state_size,
                 action_size,
                 random_seed,
                 gamma=0.99,
                 tau=1e-3,
                 lr_actor=1e-4,
                 lr_critic=3e-4,
                 weight_decay=1e-4,
                 fc1_a=32,
                 fc2_a=32,
                 fc1_c=32,
                 fc2_c=32,
                 buffer_size=int(1e5),
                 batch_size=64,
                 update_every=4,
                 sigma=0.2):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
        """
        #
        random.seed(random_seed)
        self.num_agents = num_agents
        self.state_size = state_size
        self.action_size = action_size
        self.gamma = gamma
        self.tau = tau
        self.batch_size = batch_size
        self.update_every = update_every
        # Actor Network
        self.actor_local = Actor(state_size,
                                 action_size,
                                 random_seed,
                                 fc1_units=fc1_a,
                                 fc2_units=fc2_a).to(device)
        self.actor_target = Actor(state_size,
                                  action_size,
                                  random_seed,
                                  fc1_units=fc1_a,
                                  fc2_units=fc2_a).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=lr_actor)
        # Critic Network
        self.critic_local = Critic(state_size,
                                   action_size,
                                   random_seed,
                                   fcs1_units=fc1_c,
                                   fc2_units=fc2_c).to(device)
        self.critic_target = Critic(state_size,
                                    action_size,
                                    random_seed,
                                    fcs1_units=fc1_c,
                                    fc2_units=fc2_c).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=lr_critic,
                                           weight_decay=weight_decay)
        # Replay memory
        self.memory = ReplayBuffer(action_size, buffer_size, batch_size,
                                   random_seed)
        # Noise process
        self.noise = OUNoise((num_agents, action_size),
                             random_seed,
                             sigma=sigma)
        # Initialize time step (for updating every update_every steps)
        self.t_step = 0
        # print networks info
        print(self.actor_local)
        summary(self.actor_local, input_size=(state_size, ))
        print(self.critic_local)
        summary(self.critic_local,
                input_size=[(state_size, ), (action_size, )])

    def reset(self):
        self.noise.reset()

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        # add noise to actions
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def step(self, state, action, reward, next_state, done):
        # Save experience / reward
        for i in range(self.num_agents):
            self.memory.add(state[i, :], action[i, :], reward[i],
                            next_state[i, :], done[i])
        self.t_step = (self.t_step + 1) % self.update_every
        if self.t_step == 0:
            # Learn, if enough samples are available in memory
            if len(self.memory) > self.batch_size:
                experiences = self.memory.sample()
                self.learn(experiences)

    def learn(self, experiences):
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        #torch.nn.utils.clip_grad_norm(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, self.tau)
        self.soft_update(self.actor_local, self.actor_target, self.tau)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
Example #20
0
class Agent(object):
    def __init__(self, nb_states, nb_actions, args):
        if args.seed > 0:
            self.seed(args.seed)

        self.nb_states = nb_states
        self.nb_actions = nb_actions

        # Create Actor and Critic Network
        self.actor = Actor(self.nb_states, self.nb_actions, args.init_w)
        self.actor_target = Actor(self.nb_states, self.nb_actions, args.init_w)

        self.critic = Critic(self.nb_states, self.nb_actions, args.init_w)
        self.critic_target = Critic(self.nb_states, self.nb_actions,
                                    args.init_w)

        self.reward_predictor = Critic(self.nb_states, self.nb_actions,
                                       args.init_w)

        hard_update(self.actor_target,
                    self.actor)  # Make sure target is with the same weight
        hard_update(self.critic_target, self.critic)

        #Create replay buffer
        self.random_process = OrnsteinUhlenbeckProcess(size=nb_actions,
                                                       theta=args.ou_theta,
                                                       mu=args.ou_mu,
                                                       sigma=args.ou_sigma)

        # Hyper-parameters
        self.batch_size = args.bsize
        self.trajectory_length = args.trajectory_length
        self.tau = args.tau
        self.discount = args.discount
        self.depsilon = 1.0 / args.epsilon

        #
        self.epsilon = 1.0
        self.is_training = True

        #
        if USE_CUDA: self.cuda()

    def eval(self):
        self.actor.eval()
        self.actor_target.eval()
        self.critic.eval()
        self.critic_target.eval()

    def random_action(self):
        action = np.random.uniform(-1., 1., self.nb_actions)
        return action

    def select_action(self, state, noise_enable=True, decay_epsilon=True):
        action, _ = self.actor(to_tensor(np.array([state])))
        action = to_numpy(action).squeeze(0)
        if noise_enable:
            action += self.is_training * max(self.epsilon,
                                             0) * self.random_process.sample()

        action = np.clip(action, -1., 1.)
        if decay_epsilon:
            self.epsilon -= self.depsilon

        return action

    def reset_lstm_hidden_state(self, done=True):
        self.actor.reset_lstm_hidden_state(done)

    def reset(self):
        self.random_process.reset_states()

    def cuda(self):
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic.cuda()
        self.critic_target.cuda()
        self.reward_predictor.cuda()

    def load_weights(self, output):
        if output is None: return False

        self.actor.load_state_dict(torch.load('{}/actor.pkl'.format(output)))

        self.critic.load_state_dict(torch.load('{}/critic.pkl'.format(output)))

        return True

    def save_model(self, output):
        torch.save(self.actor.state_dict(), '{}/actor.pkl'.format(output))
        torch.save(self.critic.state_dict(), '{}/critic.pkl'.format(output))
class DDPGAgent():
    """Deep Deterministic Policy Gradient Agent"""
    def __init__(self, state_size, action_size, random_seed, buffer_size,
                 batch_size, gamma, tau, lr_actor, lr_critic, weight_decay,
                 update_every, update_times):
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.update_every = update_every
        self.update_times = update_times

        # initialize Actor Network
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=lr_actor)

        # initialize Critic Network
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=lr_critic,
                                           weight_decay=weight_decay)

        self.noise = OUNoise(action_size, random_seed)
        self.memory = ReplayBuffer(action_size, buffer_size, batch_size,
                                   random_seed, device)

        self.step_count = 0

    def step(self, state, action, reward, next_state, done):
        self.memory.add(state, action, reward, next_state, done)

        self.step_count += 1
        self.step_count %= self.update_every

        if len(self.memory) > self.batch_size and self.step_count == 0:
            for _ in range(self.update_times):
                experiences = self.memory.sample()
                self.learn(experiences, self.gamma)

    def act(self, state, add_noise=True):
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()

        if add_noise:
            action += self.noise.sample()

        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        # --- update critic ---
        states, actions, rewards, next_states, dones = experiences

        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        Q_expected = self.critic_local(states, actions)

        critic_loss = F.mse_loss(Q_expected, Q_targets)

        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # --- update actor ---
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()

        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # update target networks
        self.soft_update(self.critic_local, self.critic_target, self.tau)
        self.soft_update(self.actor_local, self.actor_target, self.tau)

    def soft_update(self, local_model, target_model, tau):
        target_params = target_model.parameters()
        local_params = local_model.parameters()
        for target, local in zip(target_params, local_params):
            target.data.copy_(tau * local.data + (1.0 - tau) * target.data)

    def save(self, actor_local_path, actor_target_path, critic_local_path,
             critic_target_path):
        torch.save(self.actor_local.state_dict(), actor_local_path)
        torch.save(self.actor_target.state_dict(), actor_target_path)
        torch.save(self.critic_local.state_dict(), critic_local_path)
        torch.save(self.critic_target.state_dict(), critic_target_path)

    def load(self, actor_local_path, actor_target_path, critic_local_path,
             critic_target_path):
        self.actor_local.load_state_dict(torch.load(actor_local_path))
        self.actor_target.load_state_dict(torch.load(actor_target_path))

        self.critic_local.load_state_dict(torch.load(critic_local_path))
        self.critic_target.load_state_dict(torch.load(critic_target_path))
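A short usage sketch of the save/load methods above; the file names are arbitrary placeholders:

# Persist all four networks after training ...
agent.save('actor_local.pth', 'actor_target.pth',
           'critic_local.pth', 'critic_target.pth')

# ... and restore them into a freshly constructed agent later.
agent.load('actor_local.pth', 'actor_target.pth',
           'critic_local.pth', 'critic_target.pth')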
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, random_seed, num_agents):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """


        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.num_agents = num_agents

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size*self.num_agents, action_size*self.num_agents, random_seed).to(device)
        self.critic_target = Critic(state_size*self.num_agents, action_size*self.num_agents, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, random_seed, sigma=0.1)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)

    def step(self, time, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE:
            if time % UPDATE_EVERY_TIMESTAPES == 0:
                for i in range(UPDATE_TIMES):
                    experiences = self.memory.sample()
                    self.learn(experiences, GAMMA)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        full_state = states.view(BATCH_SIZE, -1)
        next_full_state = next_states.view(BATCH_SIZE, -1)
        actions = actions.view(BATCH_SIZE, -1)

        with torch.no_grad():
            actions_next = [self.actor_target(next_states[:, i, :]) for i in range(self.num_agents)]

        actions_next = torch.cat(actions_next, dim=-1)

        # ---------------------------- update critic ---------------------------- #
        # Get predicted Q values for the next states from the target critic
        Q_targets_next = self.critic_target(next_full_state.to(device), actions_next.to(device))  # Q-value approximator

        # Compute Q targets for current states (y_i)
        rewards = rewards.sum(dim=-1, keepdim=True)  # merge rewards across agents (single shared critic target)
        dones = dones.max(dim=-1, keepdim=True)[0]   # a joint transition is terminal if any agent is done
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(full_state, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = [self.actor_local(states[:, i, :]) for i in range(self.num_agents)]
        actions_pred = torch.cat(actions_pred, dim=1)

        actor_loss = -self.critic_local(full_state, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
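
# The OUNoise helper constructed above is not included in this listing. The snippet
# below is only a minimal sketch of a standard Ornstein-Uhlenbeck noise process with
# the same reset()/sample() interface; the class name and the parameter values
# (mu, theta, sigma) are assumptions for illustration, not taken from the example.
import copy

import numpy as np


class OUNoiseSketch:
    """Ornstein-Uhlenbeck process (illustrative sketch)."""

    def __init__(self, size, seed, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        np.random.seed(seed)
        self.reset()

    def reset(self):
        """Reset the internal state to the long-running mean."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Advance the process one step and return the new state as a noise sample."""
        dx = self.theta * (self.mu - self.state) \
            + self.sigma * np.random.standard_normal(self.mu.shape)
        self.state = self.state + dx
        return self.state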
Example #23
0
class Agent():
    def __init__(self, state_size, action_size, buffer_size, batch_size,
                 num_agents, seed, gamma, tau, lr_actor, lr_critic,
                 weight_decay, update_every, num_updates):
        '''
        ----------------------------------
        Parameters
        
        state_size:   # of states
        action_size:  # of actions
        buffer_size:  size of the memory buffer
        batch_size:   sample minibatch size
        num_agents:   # of agents
        seed:         seed for random
        gamma:        discount rate for future rewards
        tau:          interpolation factor for soft update of target network
        lr_actor:     learning rate of Actor
        lr_critic:    learning rate of Critic
        weight_decay: L2 weight decay
        update_every: number of time steps between learning updates
        num_updates:  number of learning updates performed at each update step
        ----------------------------------
        '''

        self.action_size = action_size
        self.state_size = state_size
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.num_agents = num_agents
        self.gamma = gamma
        self.tau = tau
        self.lr_actor = lr_actor
        self.lr_critic = lr_critic
        self.weight_decay = weight_decay
        self.update_every = update_every
        self.num_updates = num_updates
        self.t_step = 0
        self.seed = random.seed(seed)

        # Actor network agent
        self.actor_local = Actor(state_size, action_size, seed).to(device)
        self.actor_target = Actor(state_size, action_size, seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=lr_actor)

        # Critic network agent
        self.critic_local = Critic(state_size, action_size, seed).to(device)
        self.critic_target = Critic(state_size, action_size, seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=lr_critic,
                                           weight_decay=weight_decay)

        # Noise process
        self.noise = OUNoise((num_agents, action_size), seed)

        # Experience replay memory
        self.memory = ReplayBuffer(action_size, buffer_size, batch_size, seed)

    def step(self, state, action, reward, next_state, done):
        '''
        Agent takes the next step:
        - save the most recent environment transition to the ReplayBuffer for each agent
        - every update_every time steps, train the policy and value networks on num_updates random samples from memory
        '''
        for s, a, r, ns, d in zip(state, action, reward, next_state, done):
            self.memory.add(s, a, r, ns, d)

        self.t_step = (self.t_step + 1) % self.update_every
        if self.t_step == 0:
            if len(self.memory) > self.batch_size:
                for _ in range(self.num_updates):
                    experiences = self.memory.sample()
                    self.learn(experiences, self.gamma)

    def act(self, state, add_noise=True):
        '''
        Agent selects action based on current state and selected policy
        '''
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()

        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        '''
        Agent updates policy and value parameters based on experiences (state, action, reward, next_state, done)
        
        Q_targets = r + gamma * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value
        '''
        states, actions, rewards, next_states, dones = experiences

        #--------- update critic -----------------------#
        # get current Q
        Q_expected = self.critic_local(states, actions)
        # get next action
        next_actions = self.actor_target(next_states)
        # get Qsa_next
        Q_targets_next = self.critic_target(next_states, next_actions)
        # calculate target with reward and Qsa_next
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # calculate loss
        critic_loss = F.mse_loss(Q_expected, Q_targets)

        # minimize loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        #--------- update actor ------------------------#
        # compute actor loss
        pred_actions = self.actor_local(states)
        actor_loss = -self.critic_local(states, pred_actions).mean()

        # minimize loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        #---------- update target networks -------------#
        # update target network parameters
        self.soft_update(self.critic_local, self.critic_target, self.tau)
        self.soft_update(self.actor_local, self.actor_target, self.tau)

    def soft_update(self, local_model, target_model, tau):
        '''
        Update target network weights gradually with an interpolation rate of TAU
        '''
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
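
# The ReplayBuffer used by these agents is also not shown here. A minimal sketch with
# the same add()/sample()/__len__() interface might look like the following; the
# deque-plus-namedtuple layout and the class name are assumptions for illustration.
import random
from collections import deque, namedtuple

import numpy as np
import torch

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


class ReplayBufferSketch:
    """Fixed-size buffer of experience tuples (illustrative sketch)."""

    def __init__(self, action_size, buffer_size, batch_size, seed):
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.experience = namedtuple(
            "Experience", ["state", "action", "reward", "next_state", "done"])
        random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        """Append a new experience to memory."""
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self):
        """Randomly sample a minibatch of experiences and stack them into tensors."""
        experiences = random.sample(self.memory, k=self.batch_size)
        states = torch.from_numpy(np.vstack([e.state for e in experiences])).float().to(device)
        actions = torch.from_numpy(np.vstack([e.action for e in experiences])).float().to(device)
        rewards = torch.from_numpy(np.vstack([e.reward for e in experiences])).float().to(device)
        next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences])).float().to(device)
        dones = torch.from_numpy(
            np.vstack([e.done for e in experiences]).astype(np.uint8)).float().to(device)
        return (states, actions, rewards, next_states, dones)

    def __len__(self):
        return len(self.memory)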
Example #24
0
class DDPG_Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, idx, random_seed=0):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.idx = idx

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

    def act(self, state, add_noise=True, nu=1.0):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += nu * self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, actions_next, actions_pred, freq):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(next_state) -> action
            critic_target(next_state, next_action) -> Q-value
        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            actions_next (list): next-state actions computed by each agent's target actor
            actions_pred (list): actions predicted by each agent's local actor for the current states
            freq: update frequency (not used in this method)
        """
        states, actions, rewards, next_states, dones = experiences
        idxt = torch.tensor([self.idx - 1]).to(device)

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target model
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards.index_select(
            1, idxt) + (GAMMA * Q_targets_next *
                        (1 - dones.index_select(1, idxt)))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
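
# DDPG_Agent.learn above picks out this agent's reward and done columns with
# index_select(1, idxt). A tiny standalone demonstration of that indexing, using
# made-up values (batch of 4 transitions, 2 agents):
import torch

rewards = torch.tensor([[0.0, 1.0],
                        [0.5, 0.0],
                        [0.0, 0.0],
                        [1.0, 1.0]])           # shape: [batch, num_agents]
idxt = torch.tensor([0])                       # corresponds to self.idx - 1 for agent 1
agent_rewards = rewards.index_select(1, idxt)  # shape: [batch, 1], this agent's column
print(agent_rewards.squeeze(1))                # tensor([0.0000, 0.5000, 0.0000, 1.0000])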
Example #25
0
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, **kwargs):
        """Initialize an Agent object.
        
        Params
        ======
        """

        self.agent_mh_size = kwargs['agent_mh_size']
        self.agent_inventory_size = kwargs['agent_inventory_size']
        self.world_state_size = kwargs['world_state_size']
        self.action_size = kwargs['action_size']
        self.seed = kwargs['random_seed']
        self.iter = 0
        self.noise_scale = 1.0
        
        # Actor Network (w/ Target Network)
        self.actor_local = Actor(self.agent_mh_size, self.agent_inventory_size, self.world_state_size, self.action_size, self.seed).to(device)
        self.actor_target = Actor(self.agent_mh_size, self.agent_inventory_size, self.world_state_size, self.action_size, self.seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)
        #self.actor_optimizer = optim.Adam(self.actor_local.parameters())
        self.actor_scheduler = optim.lr_scheduler.StepLR(self.actor_optimizer, step_size=200, gamma=0.99)
        
        
        # Critic Network (w/ Target Network)
        self.critic_local = Critic(self.agent_mh_size, self.agent_inventory_size, self.world_state_size, self.action_size, self.seed).to(device)
        self.critic_target = Critic(self.agent_mh_size, self.agent_inventory_size, self.world_state_size, self.action_size, self.seed).to(device)

        params = list(self.critic_local.parameters()) + list(self.actor_local.parameters())
        #self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)
        self.critic_optimizer = optim.Adam(params, lr=LR_CRITIC)
        self.critic_scheduler = optim.lr_scheduler.StepLR(self.critic_optimizer, step_size=200, gamma=0.99)
        
        self.hard_copy_weights(self.actor_target, self.actor_local)
        self.hard_copy_weights(self.critic_target, self.critic_local)


        # Noise process
        self.noise = OUNoise(self.action_size, self.seed)

        # Replay memory
        #self.memory = ReplayBuffer(self.action_size, BUFFER_SIZE, BATCH_SIZE, self.seed)
        # Prioritized replay memory
        self.memory = NaivePrioritizedBuffer(BUFFER_SIZE, BATCH_SIZE, self.seed)

        if all(key in kwargs for key in ('actor_chkpt_file', 'critic_chkpt_file',
                                         'actor_chkpt_file_t', 'critic_chkpt_file_t')):
            checkpoint_actor = torch.load(kwargs['actor_chkpt_file'])
            checkpoint_critic = torch.load(kwargs['critic_chkpt_file'])
            self.actor_local.load_state_dict(checkpoint_actor)
            self.critic_local.load_state_dict(checkpoint_critic)
            checkpoint_actor_t = torch.load(kwargs['actor_chkpt_file_t'])
            checkpoint_critic_t = torch.load(kwargs['critic_chkpt_file_t'])
            self.actor_target.load_state_dict(checkpoint_actor_t)
            self.critic_target.load_state_dict(checkpoint_critic_t)

    def flatten_action(self, action):
        """Flatten a possibly nested action list into a flat list."""
        action_flat = []
        for x in action:
            if isinstance(x, list):
                action_flat.extend(x)
            else:
                action_flat.append(x)
        return action_flat

    def get_states(self, mainhand, inventory, pov):
        """Build (main-hand, inventory, world) state arrays from raw observations."""
        agent_state_mainhand = np.array([
            mainhand['damage'],
            mainhand['maxDamage'],
            equipments.get(mainhand['type'], -1),
        ])

        inventory_keys = [
            'coal', 'cobblestone', 'crafting_table', 'dirt', 'furnace',
            'iron_axe', 'iron_ingot', 'iron_ore', 'iron_pickaxe', 'log',
            'planks', 'stick', 'stone', 'stone_axe', 'stone_pickaxe',
            'torch', 'wooden_axe', 'wooden_pickaxe',
        ]
        agent_state_inventory = np.array([inventory[key] for key in inventory_keys])

        # Move the channel axis first: (H, W, C) -> (C, W, H)
        world_state = np.swapaxes(np.array(pov), 0, 2)

        return agent_state_mainhand, agent_state_inventory, world_state

        
    def hard_copy_weights(self, target, source):
        """ copy weights from source to target network (part of initialization)"""
        for target_param, param in zip(target.parameters(), source.parameters()):
            target_param.data.copy_(param.data)

    def step(self, mainhand, inventory, pov, action, reward, mainhand_n, inventory_n, pov_n, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        
        agent_state_mainhand, agent_state_inventory, world_state = self.get_states(mainhand, inventory, pov)
        agent_state_mainhand_n, agent_state_inventory_n, world_state_n = self.get_states(mainhand_n, inventory_n, pov_n)
        
        self.memory.add(agent_state_mainhand, agent_state_inventory, world_state, action, reward, agent_state_mainhand_n, agent_state_inventory_n, world_state_n, done)
        
        # Learn, if enough samples are available in memory
        self.iter = (self.iter + 1) % 1
        if len(self.memory) > BATCH_SIZE:
            for _ in range(2):
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

        #self.actor_scheduler.step()
        #self.critic_scheduler.step()


    def learn_from_players(self, experiences, mh_ts, invent_ts, writer, loss_list):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        
        e = experiences
        self.memory.add(e[0], e[1], e[2], e[3], e[4], e[5], e[6], e[7], e[8])

        # Learn, if enough samples are available in memory

        if len(self.memory) > BATCH_SIZE:
            
            experiences = self.memory.sample() 
            #(states, states_2, actions, rewards, next_states, next_states_2, dones) = experiences      
            self.iter = self.iter+1     
            loss_1, loss_2 = self.learn_2(experiences, GAMMA, writer)
            loss_list.append((loss_1, loss_2))
            
            self.iter = self.iter+1
            experiences = self.memory.sample()
            loss_1, loss_2 = self.learn_2(experiences, GAMMA, writer)
            loss_list.append((loss_1, loss_2))

        #self.actor_scheduler.step()
        #self.critic_scheduler.step()

    
    def act(self, mainhand, inventory, pov,  add_noise=True, noise_scale=1.0):
        """Returns actions for given state as per current policy."""

        agent_state_mainhand, agent_state_inventory, world_state = self.get_states(mainhand, inventory, pov)        
        
        s1 = torch.from_numpy(agent_state_mainhand).float().unsqueeze(dim=0).to(device)
        s3 = torch.from_numpy(agent_state_inventory).float().unsqueeze(dim=0).to(device)

        s2 = torch.from_numpy(world_state).float().unsqueeze(dim=0).to(device) 

        
        self.actor_local.eval()
        with torch.no_grad():
            action, action_raw, _, _, _, _, _, _, _, _ = self.actor_local(s1, s2, s3)
            
        self.actor_local.train()
        
        return action, action_raw, agent_state_mainhand, agent_state_inventory

    def reset(self):
        self.noise.reset()

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)

    def get_action_loss(self, writer, gt, onehot_probs, mh_state_loss, inventory_state_loss, \
        world_state_loss, q_diff_loss=None, q_value_loss=None):

        # q_value_loss may be None on the purely supervised path, so guard before detaching
        if q_value_loss is not None:
            q_value_loss = q_value_loss.detach() / 14
        attack_loss = F.binary_cross_entropy_with_logits(onehot_probs[:,0], gt[:,0])
        back_loss = F.binary_cross_entropy_with_logits(onehot_probs[:,1], gt[:,1])
        pitch_loss = F.mse_loss(onehot_probs[:,2], gt[:,2])
        yaw_loss = F.mse_loss(onehot_probs[:,3], gt[:,3])
        craft_loss = F.cross_entropy(onehot_probs[:,4:9], gt[:,4].long())
        equip_loss = F.cross_entropy(onehot_probs[:,9:17], gt[:,5].long())
        forward_loss = F.binary_cross_entropy_with_logits(onehot_probs[:,17], gt[:,6])
        jump_loss = F.binary_cross_entropy_with_logits(onehot_probs[:,18], gt[:,7])
        left_loss = F.binary_cross_entropy_with_logits(onehot_probs[:,19], gt[:,8])
        nearby_craft_loss = F.cross_entropy(onehot_probs[:,20:28], gt[:,9].long())
        nearby_smelt_loss = F.cross_entropy(onehot_probs[:,28:31], gt[:,10].long())
        place_loss = F.cross_entropy(onehot_probs[:,31:38], gt[:,11].long())
        right_loss = F.binary_cross_entropy_with_logits(onehot_probs[:,38], gt[:,12])
        sneak_loss = F.binary_cross_entropy_with_logits(onehot_probs[:,39], gt[:,13])
        sprint_loss = F.binary_cross_entropy_with_logits(onehot_probs[:,40], gt[:,14])
        

        writer.add_scalars('Losses', {"attack":attack_loss, "back":back_loss, \
            "craft":craft_loss, "equip":equip_loss, "forward":forward_loss, \
            "jump":jump_loss, "left":left_loss, "nearbyCraft":nearby_craft_loss, \
            "nearbySmelt":nearby_smelt_loss, "place":place_loss, "right":right_loss, \
            "sneak":sneak_loss, "sprint":sprint_loss}, global_step=self.iter)

        writer.add_scalars('Camera Losses', {"pitch":pitch_loss, "yaw":yaw_loss}, global_step=self.iter)

        writer.add_scalars('State Prediction Losses', {"MainHand":mh_state_loss, "Inventory":inventory_state_loss, "World":world_state_loss}, global_step=self.iter)


        self.actor_optimizer.zero_grad()
        self.critic_optimizer.zero_grad()

        if q_value_loss is None and q_diff_loss is None:
            torch.autograd.backward([attack_loss,back_loss,pitch_loss,yaw_loss,craft_loss,equip_loss,\
                    forward_loss,jump_loss,left_loss,nearby_craft_loss,nearby_smelt_loss,place_loss, \
                    right_loss,sneak_loss,sprint_loss,mh_state_loss,inventory_state_loss, \
                    world_state_loss])
        else:

            writer.add_scalars('Q Values', {"Q Value":q_value_loss, "Q Difference":q_diff_loss}, global_step=self.iter)

            torch.autograd.backward([attack_loss,back_loss,pitch_loss,yaw_loss,craft_loss,equip_loss,\
                    forward_loss,jump_loss,left_loss,nearby_craft_loss,nearby_smelt_loss,place_loss, \
                    right_loss,sneak_loss,sprint_loss, q_diff_loss])

            # torch.autograd.backward([attack_loss,back_loss,pitch_loss,yaw_loss,craft_loss,equip_loss,\
            #         forward_loss,jump_loss,left_loss,nearby_craft_loss,nearby_smelt_loss,place_loss, \
            #         right_loss,sneak_loss,sprint_loss,mh_state_loss,inventory_state_loss, \
            #         world_state_loss, q_diff_loss])

        torch.nn.utils.clip_grad_norm_(self.actor_local.parameters(), 1)
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        
        self.actor_optimizer.step()
        self.critic_optimizer.step()
        
        return pitch_loss, yaw_loss


    def learn_1(self, experiences, gamma):

        ( a_states_mh, a_states_invent, w_states, actions, rewards, a_next_states_mh, a_next_states_invent, w_next_states, dones ) = experiences

        a_states_mh = a_states_mh.to(device)
        a_states_invent = a_states_invent.to(device)
        w_states = w_states.to(device)

        a_next_states_mh = a_next_states_mh.to(device)
        a_next_states_invent = a_next_states_invent.to(device)
        w_next_states = w_next_states.to(device)


        # predict next actions and next next state with actor
        with torch.no_grad():
            _ , _ , _ , Q_next , _ , _ , _ = self.actor_local(a_next_states_mh, w_next_states, a_next_states_invent)
            Q_current_2 = rewards + (gamma * Q_next * (1 - dones))


        #get next state (from experiences) descriptors
        with torch.no_grad():
            n_wsd = self.actor_local.get_wsd(w_next_states)
            n_asmhd = self.actor_local.get_asmhd(a_next_states_mh)
            n_asinventd = self.actor_local.get_asinventoryd(a_next_states_invent)

        # predict actions and next state with actor
        actions_pred, actions_pred_raw, action_logits, Q_current, n_wsd_predict, n_asmhd_predict, n_asinventd_predict = \
                self.actor_local(a_states_mh, w_states, a_states_invent)


        # calculate loss for actor
        loss_1, loss_2 = self.get_action_loss(actions, action_logits, \
                F.mse_loss(n_asmhd, n_asmhd_predict), F.mse_loss(n_asinventd, n_asinventd_predict), \
                F.mse_loss(n_wsd, n_wsd_predict), F.mse_loss(Q_current, Q_current_2.detach()))

        print("Actor Losses:{} {}".format(loss_1.item(), loss_2.item()))
        return loss_1, loss_2


    def learn_2(self, experiences, gamma, writer):
        
        #states, actions, rewards, next_states, dones, indices, weights = experiences
        ( a_states_mh, a_states_invent, w_states, actions, rewards, a_next_states_mh, a_next_states_invent, w_next_states, dones ) = experiences

        a_states_mh = a_states_mh.to(device)
        a_states_invent = a_states_invent.to(device)
        w_states = w_states.to(device)

        a_next_states_mh = a_next_states_mh.to(device)
        a_next_states_invent = a_next_states_invent.to(device)
        w_next_states = w_next_states.to(device)



        #get next state (from experiences) descriptors and Q_next
        with torch.no_grad():
            _, _, _, Q_next, _, _, _, wsd_next, mhd_next, inventd_next = \
                self.actor_local(a_next_states_mh, w_next_states, a_next_states_invent)
            Q_next = Q_next.detach()
            Q_current_2 = rewards + (gamma * Q_next * (1 - dones))
            wsd_next = wsd_next.detach()
            mhd_next = mhd_next.detach()
            inventd_next = inventd_next.detach()



        # predict actions and next-state descriptors with the local actor
        _, action_raw, action_logits, Q_current, n_wsd_predict, n_asmhd_predict, n_asinventd_predict, _, _, _ = \
                self.actor_local(a_states_mh, w_states, a_states_invent)

        # calculate loss for actor
        loss_1, loss_2 = self.get_action_loss(writer, actions, action_logits, \
                F.mse_loss(mhd_next, n_asmhd_predict), F.mse_loss(inventd_next, n_asinventd_predict), \
                F.mse_loss(wsd_next, n_wsd_predict), F.mse_loss(Q_current, Q_current_2), -Q_current.mean())

        print("Actor Losses:{} {}".format(loss_1.item(), loss_2.item()))
        return loss_1, loss_2


    def learn_3(self, experiences, gamma):
        
        #states, actions, rewards, next_states, dones, indices, weights = experiences
        ( a_states_mh, a_states_invent, w_states, actions, rewards, a_next_states_mh, a_next_states_invent, w_next_states, dones ) = experiences

        a_states_mh = a_states_mh.to(device)
        a_states_invent = a_states_invent.to(device)
        w_states = w_states.to(device)

        a_next_states_mh = a_next_states_mh.to(device)
        a_next_states_invent = a_next_states_invent.to(device)
        w_next_states = w_next_states.to(device)


        # predict actions 
        
        _ , actions_pred_raw, action_logits, _ , _ , _ , _ = \
                self.actor_local(a_states_mh, w_states, a_states_invent)

        #get next state (from experiences) descriptors
        with torch.no_grad():
            n_wsd = self.critic_local.get_wsd(w_next_states)
            n_asmhd = self.critic_local.get_asmhd(a_next_states_mh)
            n_asinventd = self.critic_local.get_asinventoryd(a_next_states_invent)


        # Compute Q value of current state (from experiences)
        Q_current, n_wsd_predict, n_asmhd_predict, n_asinventd_predict = self.critic_local(a_states_mh, a_states_invent, w_states, actions)
        
        # calculate loss for actor/critic
        loss_1, _ = self.get_action_loss(actions, action_logits, \
                F.mse_loss(n_asmhd, n_asmhd_predict), F.mse_loss(n_asinventd, n_asinventd_predict), \
                F.mse_loss(n_wsd, n_wsd_predict))

        


        # Compute the Q value of the next state (the next state comes from experience; the rest is predicted with the actor and critic)

        # predict action in the next state
        actions_next, actions_next_raw, action_logits, _ , _ , _ , _ = self.actor_local(a_next_states_mh, w_next_states, a_next_states_invent)
        # predict Q value in the next state
        Q_next, _ , _ , _ = self.critic_local(a_next_states_mh, a_next_states_invent, w_next_states, actions_next_raw)
        
        
        # Alternative Q value through Bellman equations
        Q_current_2 = rewards + (gamma * Q_next * (1 - dones))


        # Compute critic loss
        critic_loss = F.mse_loss(Q_current, Q_current_2.detach())

        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        torch.nn.utils.clip_grad_norm_(self.actor_local.parameters(), 1)
        self.critic_optimizer.step()


        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

        print("Actor Losses:{} {}".format(loss_1.item(), critic_loss.item()))

        return loss_1, critic_loss
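
# get_action_loss above backpropagates a whole list of per-head losses with one call
# to torch.autograd.backward. The toy check below (illustrative values only) shows
# that this accumulates the same gradients as summing the losses and calling
# .backward() on the single scalar:
import torch

w = torch.ones(3, requires_grad=True)
loss_a = (w * 2.0).sum()
loss_b = (w ** 2).sum()
torch.autograd.backward([loss_a, loss_b])   # gradients of loss_a + loss_b
print(w.grad)                               # tensor([4., 4., 4.])

w.grad.zero_()
loss_a = (w * 2.0).sum()                    # rebuild the graph before a second backward
loss_b = (w ** 2).sum()
(loss_a + loss_b).backward()
print(w.grad)                               # tensor([4., 4., 4.])
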
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, num_agents, random_seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action 
            num_agents (int): number of agents
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, num_agents,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, num_agents,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

    def step(self, memory):
        """Save experience in replay memory, and use random sample from buffer to learn."""

        # Learn, if enough samples are available in memory
        if len(memory) > BATCH_SIZE:
            experiences = memory.sample()
            self.learn(experiences, GAMMA)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        t_next_states = torch.cat(next_states, dim=1).to(device)
        t_states = torch.cat(states, dim=1).to(device)
        t_actions = torch.cat(actions, dim=1).to(device)

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = [self.actor_target(next_state) for next_state in next_states]
        actions_next = torch.cat(actions_next, dim=1).to(device)
        Q_targets_next = self.critic_target(t_next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(t_states, t_actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = [self.actor_local(state) for state in states]
        actions_pred = torch.cat(actions_pred, dim=1).to(device)
        actor_loss = -self.critic_local(t_states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
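
# The critic update above forms the TD target Q_targets = rewards + gamma *
# Q_targets_next * (1 - dones), so the bootstrap term is zeroed on terminal
# transitions. A tiny numeric illustration with made-up values:
import torch

gamma = 0.99
rewards = torch.tensor([[0.1], [1.0]])
Q_targets_next = torch.tensor([[2.0], [2.0]])
dones = torch.tensor([[0.0], [1.0]])        # second transition is terminal

Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
print(Q_targets)                            # tensor([[2.0800], [1.0000]])
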
class Agent():
    """Agent which interacts and learns from environment"""
    def __init__(self, state_size, action_size, random_seed=0):
        """Initialize an Agent
        Params
        =======
            state_size (int): dimensions of each state
            action_size (int): dimensions of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.epsilon = EPSILON

        # Actor Network
        self.actor_local = Actor(state_size,
                                 action_size,
                                 random_seed,
                                 leakiness=LEAK_FACTOR).to(device)
        self.actor_target = Actor(state_size,
                                  action_size,
                                  random_seed,
                                  leakiness=LEAK_FACTOR).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network
        self.critic_local = Critic(state_size,
                                   action_size,
                                   random_seed,
                                   leakiness=LEAK_FACTOR).to(device)
        self.critic_target = Critic(state_size,
                                    action_size,
                                    random_seed,
                                    leakiness=LEAK_FACTOR).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed)

    def add_to_memory(self, states, actions, rewards, next_states, dones):
        """Save experience in replay memory"""
        self.memory.add(states, actions, rewards, next_states, dones)

    def learn_from_memory(self, timestep):
        """Sample experience tuples from the replay memory every LEARN_EVERY timesteps"""
        if len(self.memory) > BATCH_SIZE and timestep % LEARN_EVERY == 0:
            for _ in range(LEARN_NUM):
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def step(self, states, actions, rewards, next_states, dones, timestep):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        self.add_to_memory(states, actions, rewards, next_states, dones)
        self.learn_from_memory(timestep)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()

        if add_noise:
            action += self.epsilon * self.noise.sample()

        return np.clip(action, -1, 1)

    def reset(self):
        """resets the current noise value"""
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters given batch of experience tuples
            Q_targets = r + gamma * critic_target(next_state, actor_target(next_state))
            where:
                actor_target(state)  -> action
                critic_target(state, action) -> Q_value
            Params
            =======
                experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
                gamma (float): discount factor
        """

        states, actions, rewards, next_states, dones = experiences

        #----------------update critic-------------------------------#
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)

        # Compute Q_targets for current state
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)

        # Minimize loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()

        # Clipping gradients
        if GRAD_CLIPPING > 0:
            torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(),
                                           GRAD_CLIPPING)
        self.critic_optimizer.step()

        #--------------update actor-------------------------------#

        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()

        # Minimize loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        #------------update target networks---------------------#
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

        #-----------update epsilon decay------------------------#
        self.epsilon *= EPSILON_DECAY
        self.noise.reset()

    def soft_update(self, local_model, target_model, tau):
        """ Soft updating target model's parameters
            theta_target = tau*theta_local + (1-tau)*theta_target
            
            Params
            =======
                local_model: Pytorch model
                target_model: Pytorch model
                tau: interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1 - tau) * target_param.data)
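
# The agent above scales its exploration noise by epsilon and multiplies epsilon by
# EPSILON_DECAY after every learning step, so exploration fades as training
# progresses. A short illustration of that schedule; the values 1.0 and 0.999 below
# are assumptions for the demo, not the EPSILON/EPSILON_DECAY constants used above.
epsilon, epsilon_decay = 1.0, 0.999
for learn_step in range(3001):
    if learn_step % 1000 == 0:
        print("learn step {:4d} -> noise scale {:.3f}".format(learn_step, epsilon))
    epsilon *= epsilon_decay
# learn step    0 -> noise scale 1.000
# learn step 1000 -> noise scale 0.368
# learn step 2000 -> noise scale 0.135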
Example #28
0
class DDPGAgent:
    def __init__(self, total_agents, state_size, action_size, seed):
        self.device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
        #self.device = 'cpu'

        self.total_agents = total_agents
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        self.actor_local = Actor(self.state_size, self.action_size,
                                 seed).to(self.device)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  seed).to(self.device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        self.critic_local = Critic(self.state_size, self.action_size,
                                   seed).to(self.device)
        self.critic_target = Critic(self.state_size, self.action_size,
                                    seed).to(self.device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        #self.noise = OrnsteinUhlenbeckNoise(action_size, seed)
        self.noise = OrnsteinUhlenbeckProcess((self.total_agents, action_size),
                                              std=LinearSchedule(0.2))

        self.replay_buffer = UniformReplayBuffer(
            BUFFER_SIZE, BATCH_SIZE * self.total_agents, seed, self.device)
        #self.replay_buffer = PrioritizedReplay(BUFFER_SIZE, self.device)

        print('Device used: {}'.format(self.device))

        print('Actor Local DDPG ->', self.actor_local)
        print('Actor Target DDPG ->', self.actor_target)

        print('Critic Local DDPG ->', self.critic_local)
        print('Critic Target DDPG ->', self.critic_target)

    def reset(self):
        self.noise.reset()

    def act(self, states, add_noise=False):
        states = torch.from_numpy(states).float().to(self.device)

        self.actor_local.eval()
        with torch.no_grad():
            actions = self.actor_local(states).cpu().data.numpy()
        self.actor_local.train()

        return np.clip(actions +
                       self.noise.sample(), -1, 1) if add_noise else actions

    def step(self, states, actions, rewards, next_states, dones):
        for state, action, reward, next_state, done in zip(
                states, actions, rewards, next_states, dones):
            self.replay_buffer.add(state, action, reward, next_state, done)

        # Sample and learn on every step (no minimum-buffer-size check is applied here)
        return self._learn(self.replay_buffer.sample(), GAMMA)

    def _learn(self, experiences, gamma):
        states, actions, rewards, next_states, dones = experiences

        # ---------- CRITIC UPDATE --------------------
        next_actions = self.actor_target(next_states)
        next_rewards = self.critic_target(next_states, next_actions)
        target_rewards = rewards + gamma * next_rewards * (1 - dones)
        predicted_rewards = self.critic_local(states, actions)

        critic_loss = F.mse_loss(predicted_rewards, target_rewards)
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------- ACTOR UPDATE --------------------
        predicted_actions = self.actor_local(states)
        actor_loss = -self.critic_local(states, predicted_actions).mean()
        #print('\rActor Loss: {:.6f} - Critic Loss: {:.6f}'.format(actor_loss, critic_loss), end='')
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        self._soft_update(self.critic_local, self.critic_target, TAU)
        self._soft_update(self.actor_local, self.actor_target, TAU)

        return critic_loss.cpu().data.numpy(), actor_loss.cpu().data.numpy()

    def _soft_update(self, local_model, target_model, tau):
        for local_parameter, target_parameter in zip(
                local_model.parameters(), target_model.parameters()):
            target_parameter.data.copy_((1.0 - tau) * target_parameter.data +
                                        tau * local_parameter.data)
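
# _soft_update above blends the target parameters toward the local parameters by a
# factor tau on every call. A toy numeric check (illustrative values) that repeated
# soft updates move the target weight geometrically toward the local weight:
import torch

tau = 0.1
local = torch.tensor([1.0])
target = torch.tensor([0.0])
for step in range(1, 31):
    target = tau * local + (1.0 - tau) * target
    if step in (1, 10, 30):
        print("step {:2d}: target = {:.4f}".format(step, target.item()))
# step  1: target = 0.1000
# step 10: target = 0.6513
# step 30: target = 0.9576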
Example #29
0
class Agent(object):
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, random_seed, hyperparameters):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
            hyperparameters (dict): dictionary with hyperparameters
        """

        # initialize the random generator to ensure reproducibility
        random.seed(random_seed)

        # Read hyperparameters from Config dict
        self.hyperparamaters = hyperparameters

        self.state_size = state_size
        self.action_size = action_size
        self.step_counter = 0
        self.epsilon = float(self.hyperparamaters['EPSILON_START'])

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(
            self.actor_local.parameters(),
            lr=float(self.hyperparamaters['LR_ACTOR']))

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(
            self.critic_local.parameters(),
            lr=float(self.hyperparamaters['LR_CRITIC']),
            weight_decay=float(self.hyperparamaters['WEIGHT_DECAY']))

        # Noise process
        self.noise = OUNoise(action_size)

        # Replay memory
        self.memory = ReplayBuffer(action_size,
                                   int(self.hyperparamaters['BUFFER_SIZE']),
                                   int(self.hyperparamaters['BATCH_SIZE']))

        # Hard update so that weights of local and target are identical
        self.hard_update(self.actor_target, self.actor_local)
        self.hard_update(self.critic_target, self.critic_local)

    def step_add_to_memory(self, states, actions, rewards, next_states, dones):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        for i in range(len(states)):
            self.memory.add(states[i], actions[i], rewards[i], next_states[i],
                            dones[i])
        self.step_counter += 1
        # Learn, if enough samples are available in memory
        self.step_learn()

    def step_learn(self):
        if self.step_counter % int(self.hyperparamaters['LEARN_EVERY']) == 0:
            if len(self.memory) > int(self.hyperparamaters['BATCH_SIZE']):
                for _ in range(int(self.hyperparamaters['LEARN_TIMES'])):
                    experiences = self.memory.sample()
                    self.learn(experiences,
                               float(self.hyperparamaters['GAMMA']))

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)

        self.actor_local.eval()

        scalar = False
        with torch.no_grad():
            if state.dim() == 1:
                state.unsqueeze_(0)
                scalar = True
            action = self.actor_local(state).cpu().data.numpy()
            if scalar:
                action = np.squeeze(action)

        self.actor_local.train()

        if add_noise:
            action += self.epsilon * self.noise.sample()

        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def update_epsilon(self):
        self.epsilon = max(
            self.epsilon * float(self.hyperparamaters['EPSILON_DECAY']),
            float(self.hyperparamaters['EPSILON_END']))

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        indexes, states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # -----------------------------update td_error for ranking ------------- #
        deltas = list(
            torch.abs(Q_expected - Q_targets).cpu().detach().numpy().flatten())
        for index, delta in zip(indexes, deltas):
            self.memory.td_error_update(index, delta)

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target,
                         float(self.hyperparamaters['TAU']))
        self.soft_update(self.actor_local, self.actor_target,
                         float(self.hyperparamaters['TAU']))

        # ------------------------ update epsilon and noise -------------------- #
        self.update_epsilon()
        self.noise.reset()

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def hard_update(self, target, source):
        for target_param, param in zip(target.parameters(),
                                       source.parameters()):
            target_param.data.copy_(param.data)
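
# The replay memory used above exposes a td_error_update(index, delta) hook so that
# sampled transitions can be re-weighted by their TD error after each critic update.
# That class is not part of this listing; the sketch below only illustrates one way
# such an interface could be backed by proportional priorities (every detail here,
# including the class name and the returned tuple, is an assumption).
import numpy as np


class TDErrorReplaySketch:
    """Replay buffer whose sampling probability follows |TD error| (illustrative sketch)."""

    def __init__(self, buffer_size, batch_size, eps=1e-3):
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.eps = eps                     # keeps every priority strictly positive
        self.memory = []
        self.priorities = []

    def add(self, experience):
        """Store an experience with the current maximum priority (simplified eviction)."""
        max_priority = max(self.priorities, default=1.0)
        if len(self.memory) >= self.buffer_size:
            self.memory.pop(0)
            self.priorities.pop(0)
        self.memory.append(experience)
        self.priorities.append(max_priority)

    def sample(self):
        """Return (indexes, experiences) drawn proportionally to the stored priorities."""
        probs = np.array(self.priorities) / sum(self.priorities)
        indexes = np.random.choice(len(self.memory), self.batch_size, p=probs)
        return list(indexes), [self.memory[i] for i in indexes]

    def td_error_update(self, index, delta):
        """Update one transition's priority from its absolute TD error."""
        self.priorities[index] = abs(delta) + self.eps

    def __len__(self):
        return len(self.memory)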
Example #30
0
def main():
    env = gym.make(args.env_name)
    env.seed(args.seed)
    torch.manual_seed(args.seed)

    num_inputs = env.observation_space.shape[0]
    num_actions = env.action_space.shape[0]
    running_state = ZFilter((num_inputs,), clip=5)

    print('state size:', num_inputs) 
    print('action size:', num_actions)

    actor = Actor(num_inputs, num_actions, args)
    critic = Critic(num_inputs, args)

    actor_optim = optim.Adam(actor.parameters(), lr=args.learning_rate)
    critic_optim = optim.Adam(critic.parameters(), lr=args.learning_rate, 
                              weight_decay=args.l2_rate)

    writer = SummaryWriter(comment="-ppo_iter-" + str(args.max_iter_num))
    
    if args.load_model is not None:
        saved_ckpt_path = os.path.join(os.getcwd(), 'save_model', str(args.load_model))
        ckpt = torch.load(saved_ckpt_path)

        actor.load_state_dict(ckpt['actor'])
        critic.load_state_dict(ckpt['critic'])

        running_state.rs.n = ckpt['z_filter_n']
        running_state.rs.mean = ckpt['z_filter_m']
        running_state.rs.sum_square = ckpt['z_filter_s']

        print("Loaded OK ex. Zfilter N {}".format(running_state.rs.n))

    
    episodes = 0    

    for iter in range(args.max_iter_num):
        actor.eval(), critic.eval()
        memory = deque()

        steps = 0
        scores = []

        while steps < args.total_sample_size: 
            state = env.reset()
            score = 0

            state = running_state(state)
            
            for _ in range(10000): 
                if args.render:
                    env.render()

                steps += 1

                mu, std = actor(torch.Tensor(state).unsqueeze(0))
                action = get_action(mu, std)[0]
                next_state, reward, done, _ = env.step(action)

                mask = 0 if done else 1

                memory.append([state, action, reward, mask])

                next_state = running_state(next_state)
                state = next_state

                score += reward

                if done:
                    break
            
            episodes += 1
            scores.append(score)
        
        score_avg = np.mean(scores)
        print('{}:: {} episode score is {:.2f}'.format(iter, episodes, score_avg))
        writer.add_scalar('log/score', float(score_avg), iter)

        actor.train(), critic.train()
        train_model(actor, critic, memory, actor_optim, critic_optim, args)

        if iter % 100 == 0:
            score_avg = int(score_avg)

            model_path = os.path.join(os.getcwd(),'save_model')
            if not os.path.isdir(model_path):
                os.makedirs(model_path)

            ckpt_path = os.path.join(model_path, 'ckpt_'+ str(score_avg)+'.pth.tar')

            save_checkpoint({
                'actor': actor.state_dict(),
                'critic': critic.state_dict(),
                'z_filter_n':running_state.rs.n,
                'z_filter_m': running_state.rs.mean,
                'z_filter_s': running_state.rs.sum_square,
                'args': args,
                'score': score_avg
            }, filename=ckpt_path)
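The PPO driver in Example #30 depends on helpers that are not shown (get_action, save_checkpoint, ZFilter, train_model). The two simplest ones could plausibly look like the sketch below, assuming a diagonal-Gaussian policy head; these are illustrative stand-ins, not the repository's exact code:

import torch


def get_action(mu, std):
    """Sample an action from a diagonal Gaussian policy and return it as a numpy array."""
    action = torch.normal(mu, std)
    return action.detach().cpu().numpy()


def save_checkpoint(state, filename):
    """Persist a checkpoint dict (network weights, ZFilter statistics, args, score)."""
    torch.save(state, filename)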
Example #31
0
class Agent:
    def __init__(self, state_size, action_size, args, device="cpu"):
        """
        Initialize DDPG agent for each agent in environment
        :param state_size: State size of the environment
        :param action_size: Action size of the environment
        :param args: Hyper-Parameters for training process
        :param device: Device to utilize
        """
        self.action_size = action_size
        self.state_size = state_size
        self.device = device
        self.discount_factor = args["discount_factor"]
        self.tau = args["tau"]

        self.actor_local = Actor(state_size, action_size).to(device)
        self.actor_target = Actor(state_size, action_size).to(device)
        self.actor_optimizer = Adam(self.actor_local.parameters(), lr=args["lr_actor"])

        self.critic_local = Critic(state_size, action_size).to(device)
        self.critic_target = Critic(state_size, action_size).to(device)
        self.critic_optimizer = Adam(self.critic_local.parameters(), lr=args["lr_critic"],
                                     weight_decay=args["weight_decay"])

        self.hard_update_actor(self.actor_local)
        self.hard_update_critic(self.critic_local)

        self.noise = OUNoise(action_size)

    def learn(self, batch):
        """
        Learn from given batch
        :param batch: Sampled batch from experience replay buffer
        :return: (critic_loss, actor_loss)
        Adapted from the Udacity ddpg_bipedal implementation.
        """

        if batch is None:
            return None

        states, actions, rewards, next_states, dones = batch

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (self.discount_factor * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update_critic(self.critic_local)
        self.soft_update_actor(self.actor_local)

        return critic_loss.cpu().data.numpy(), actor_loss.cpu().data.numpy()

    def hard_update_actor(self, model):
        """
        Hard update for the actor model
        :param model: Local model whose weights are copied into the actor target
        """
        for target_param, param in zip(self.actor_target.parameters(), model.parameters()):
            target_param.data.copy_(param.data)

    def hard_update_critic(self, model):
        """
        Hard update for the critic model
        :param model: Local model whose weights are copied into the critic target
        """
        for target_param, param in zip(self.critic_target.parameters(), model.parameters()):
            target_param.data.copy_(param.data)

    def soft_update_actor(self, model):
        """
        Soft update for the actor model
        :param model: Local model whose weights are blended into the actor target
        """
        for target_param, param in zip(self.actor_target.parameters(), model.parameters()):
            target_param.data.copy_(target_param.data * (1.0 - self.tau) + param.data * self.tau)

    def soft_update_critic(self, model):
        """
        Soft update for the critic model
        :param model: Local model whose weights are blended into the critic target
        """
        for target_param, param in zip(self.critic_target.parameters(), model.parameters()):
            target_param.data.copy_(target_param.data * (1.0 - self.tau) + param.data * self.tau)

    def act(self, state, add_noise=True):
        """
        Interact with the environment: decide actions for the given state, optionally adding exploration noise.
        :param state: Current state of the environment
        :param add_noise: Whether to add exploration noise to the action
        :return: Decided actions
        """
        state = torch.from_numpy(state).float().to(self.device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)
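Several of these agents draw exploration noise from an OUNoise object whose implementation is not included. A minimal Ornstein-Uhlenbeck process compatible with the noise.sample()/noise.reset() calls used above might look like this (theta and sigma are common defaults, not values taken from these examples):

import copy

import numpy as np


class OUNoise:
    """Ornstein-Uhlenbeck process: dx = theta * (mu - x) dt + sigma * dW."""

    def __init__(self, size, seed=None, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.rng = np.random.default_rng(seed)
        self.reset()

    def reset(self):
        """Reset the internal state to the mean."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Advance the process one step and return the new state as the noise sample."""
        dx = self.theta * (self.mu - self.state) + self.sigma * self.rng.standard_normal(self.state.shape)
        self.state = self.state + dx
        return self.state

Because consecutive samples are correlated, OU noise tends to produce smoother exploration than independent Gaussian noise, which is why it is the conventional choice for DDPG.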
Example #32
0
class DDPG(object):
    def __init__(self, nb_status, nb_actions, args, writer):
        self.clip_actor_grad = args.clip_actor_grad
        self.nb_status = nb_status * args.window_length
        self.nb_actions = nb_actions
        self.writer = writer
        self.select_time = 0
        
        # Create Actor and Critic Network
        net_cfg = {
            'hidden1':args.hidden1, 
            'hidden2':args.hidden2, 
            'init_method':args.init_method
        }

        self.actor = Actor(self.nb_status, self.nb_actions, **net_cfg)
        self.actor_target = Actor(self.nb_status, self.nb_actions, **net_cfg)
        self.actor_optim  = Adam(self.actor.parameters(), lr=args.prate)

        self.critic = Critic(self.nb_status, self.nb_actions, **net_cfg)
        self.critic_target = Critic(self.nb_status, self.nb_actions, **net_cfg)
        self.critic_optim  = Adam(self.critic.parameters(), lr=args.rate)

        hard_update(self.actor_target, self.actor)  # Make sure the target starts with the same weights
        hard_update(self.critic_target, self.critic)
        
        #Create replay buffer
        self.memory = rpm(args.rmsize)
        self.random_process = Myrandom(size=nb_actions)

        # Hyper-parameters
        self.batch_size = args.batch_size
        self.tau = args.tau
        self.discount = args.discount
        self.depsilon = 1.0 / args.epsilon

        # 
        self.epsilon = 1.0
        self.s_t = None # Most recent state
        self.a_t = None # Most recent action
        self.use_cuda = args.cuda
        # 
        if self.use_cuda: self.cuda()

    def update_policy(self, train_actor = True):
        # Sample batch
        state_batch, action_batch, reward_batch, \
            next_state_batch, terminal_batch = self.memory.sample_batch(self.batch_size)

        # Prepare for the target q batch
        next_q_values = self.critic_target([
            to_tensor(next_state_batch, volatile=True),
            self.actor_target(to_tensor(next_state_batch, volatile=True)),
        ])
        # print('batch of picture is ok')
        next_q_values.volatile = False

        target_q_batch = to_tensor(reward_batch) + \
            self.discount * to_tensor((1 - terminal_batch.astype(np.float))) * next_q_values

        # Critic update
        self.critic.zero_grad()

        q_batch = self.critic([to_tensor(state_batch), to_tensor(action_batch)])

        # print(reward_batch, next_q_values*self.discount, target_q_batch, terminal_batch.astype(np.float))
        value_loss = nn.MSELoss()(q_batch, target_q_batch)
        value_loss.backward()
        self.critic_optim.step()

        self.actor.zero_grad()

        policy_loss = -self.critic([
            to_tensor(state_batch),
            self.actor(to_tensor(state_batch))
        ])

        policy_loss = policy_loss.mean()
        policy_loss.backward()

        if self.clip_actor_grad is not None:
            torch.nn.utils.clip_grad_norm(self.actor.parameters(), float(self.clip_actor_grad))

            if self.writer is not None:
                mean_policy_grad = np.array(np.mean([np.linalg.norm(p.grad.data.cpu().numpy().ravel()) for p in self.actor.parameters()]))
                #print(mean_policy_grad)
                self.writer.add_scalar('train/mean_policy_grad', mean_policy_grad, self.select_time)

        if train_actor:
            self.actor_optim.step()

        # Target update
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)

        return -policy_loss, value_loss

    def eval(self):
        self.actor.eval()
        self.actor_target.eval()
        self.critic.eval()
        self.critic_target.eval()

    def train(self):
        self.actor.train()
        self.actor_target.train()
        self.critic.train()
        self.critic_target.train()

    def cuda(self):
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic.cuda()
        self.critic_target.cuda()

    def observe(self, r_t, s_t1, done):
        self.memory.append([self.s_t, self.a_t, r_t, s_t1, done])
        self.s_t = s_t1

    def random_action(self):
        action = np.random.uniform(-1.,1.,self.nb_actions)
        self.a_t = action
        return action

    def select_action(self, s_t, decay_epsilon=True, return_fix=False, noise_level=0):
        self.eval()
        # print(s_t.shape)
        action = to_numpy(
            self.actor(to_tensor(np.array([s_t])))
        ).squeeze(0)
            
        self.train()
        noise_level = noise_level * max(self.epsilon, 0)
        
        action = action * (1 - noise_level) + (self.random_process.sample() * noise_level)
        action = np.clip(action, -1., 1.)

        if decay_epsilon:
            self.epsilon -= self.depsilon

        self.a_t = action
        return action

    def reset(self, obs):
        self.s_t = obs
        self.random_process.reset_status()

    def load_weights(self, output, num=1):        
        if output is None: return
        self.actor.load_state_dict(
            torch.load('{}/actor{}.pkl'.format(output, num))
        )
        self.actor_target.load_state_dict(
            torch.load('{}/actor{}.pkl'.format(output, num))
        )
        self.critic.load_state_dict(
            torch.load('{}/critic{}.pkl'.format(output, num))
        )
        self.critic_target.load_state_dict(
            torch.load('{}/critic{}.pkl'.format(output, num))
        )

    def save_model(self, output, num):
        if self.use_cuda:
            self.actor.cpu()
            self.critic.cpu()
        torch.save(
            self.actor.state_dict(),
            '{}/actor{}.pkl'.format(output, num)
        )
        torch.save(
            self.critic.state_dict(),
            '{}/critic{}.pkl'.format(output, num)
        )
        if self.use_cuda:
            self.actor.cuda()
            self.critic.cuda()
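Example #32 (and #34 below) relies on module-level helpers — hard_update, soft_update, to_tensor, to_numpy — and on the pre-0.4 PyTorch Variable/volatile API. On a current PyTorch those helpers could be sketched as follows, keeping the same call signatures; the volatile flag is accepted only for compatibility (wrap calls in torch.no_grad() instead). This is an assumption about the missing utilities, not the original code:

import numpy as np
import torch


def to_tensor(ndarray, volatile=False, dtype=torch.float32):
    """Convert a numpy array to a tensor; 'volatile' is ignored on modern PyTorch."""
    return torch.as_tensor(np.asarray(ndarray), dtype=dtype)


def to_numpy(tensor):
    """Detach a tensor, move it to the CPU and return a numpy array."""
    return tensor.detach().cpu().numpy()


def hard_update(target, source):
    """Copy every parameter of source into target."""
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(param.data)


def soft_update(target, source, tau):
    """Polyak averaging: target <- tau * source + (1 - tau) * target."""
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(tau * param.data + (1.0 - tau) * target_param.data)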
Example #33
0
class Agent():
    def __init__(self, state_size, action_size, seed, hparams, identity):
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.hparams = hparams
        self.identity = identity

        self.actor_local = Actor(state_size, action_size, seed).to(device)
        self.actor_target = Actor(state_size, action_size, seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=self.hparams["lr_actor"])

        for target_param, source_param in zip(self.actor_target.parameters(),
                                              self.actor_local.parameters()):
            target_param.data.copy_(source_param.data)

        self.critic_local = Critic(state_size, action_size, seed).to(device)
        self.critic_target = Critic(state_size, action_size, seed).to(device)
        self.critic_optimizer = optim.Adam(
            self.critic_local.parameters(),
            lr=self.hparams["lr_critic"],
            weight_decay=self.hparams["weight_decay"])

        for target_param, source_param in zip(self.critic_target.parameters(),
                                              self.critic_local.parameters()):
            target_param.data.copy_(source_param.data)

        #Controller will handle shared memory
        self.memory = ReplayBuffer(action_size, self.hparams["buffer_size"],
                                   self.hparams["batch_size"], seed)
        self.noise = OUNoise(action_size, seed)

    def act(self, states, add_noise=True):
        """Returns actions for given state as per current policy."""

        #Controller will handle concatenating the actions from each agent
        if not torch.is_tensor(states):
            states = torch.from_numpy(states).float().to(device)

        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(states).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample() * self.hparams['epsilon']
        return np.clip(action, -1, 1)

    # Handle step in controller
    def step(self, states, actions, rewards, next_states, dones, ep):
        self.memory.add(states, actions, rewards, next_states, dones)

        if len(self.memory) > self.hparams["batch_size"] and ep % 5 == 0 and ep > 100:
            for _ in range(4):
                experiences = self.memory.sample()
                self.learn(experiences)

    def learn(self, experiences):

        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (self.hparams["gamma"] * Q_targets_next *
                               (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target)
        self.soft_update(self.actor_local, self.actor_target)

        self.hparams['epsilon'] *= self.hparams['epsilon_decay']

    def reset(self):
        self.noise.reset()

    def soft_update(self, local_model, target_model):
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(self.hparams["tau"] * local_param.data +
                                    (1.0 - self.hparams["tau"]) *
                                    target_param.data)

    def print_models(self):
        print("Agent ", str(self.identity), " ", self.actor_local)
        print("Agent ", str(self.identity), " ", self.critic_local)

    def save_models(self):
        torch.save(self.actor_local.state_dict(),
                   str(self.identity) + "_actor_weights.pth")
        torch.save(self.critic_local.state_dict(),
                   str(self.identity) + "_critic_weights.pth")
Example #34
0
class DDPG(object):
    def __init__(self, nb_status, nb_actions, args, writer):
        self.clip_actor_grad = args.clip_actor_grad
        self.nb_status = nb_status * args.window_length
        self.nb_actions = nb_actions
        self.discrete = args.discrete
        self.pic = args.pic
        self.writer = writer
        self.select_time = 0        
        if self.pic:
            self.nb_status = args.pic_status
        
        # Create Actor and Critic Network
        net_cfg = {
            'hidden1':args.hidden1, 
            'hidden2':args.hidden2, 
            'use_bn':args.bn,
            'init_method':args.init_method
        }
        if args.pic:
            self.cnn = CNN(1, args.pic_status)
            self.cnn_target = CNN(1, args.pic_status)
            self.cnn_optim = Adam(self.cnn.parameters(), lr=args.crate)
        self.actor = Actor(self.nb_status, self.nb_actions, **net_cfg)
        self.actor_target = Actor(self.nb_status, self.nb_actions, **net_cfg)
        self.actor_optim  = Adam(self.actor.parameters(), lr=args.prate)

        self.critic = Critic(self.nb_status, self.nb_actions, **net_cfg)
        self.critic_target = Critic(self.nb_status, self.nb_actions, **net_cfg)
        self.critic_optim  = Adam(self.critic.parameters(), lr=args.rate)

        hard_update(self.actor_target, self.actor)  # Make sure the target starts with the same weights
        hard_update(self.critic_target, self.critic)
        if args.pic:
            hard_update(self.cnn_target, self.cnn)
        
        #Create replay buffer
        self.memory = rpm(args.rmsize) # SequentialMemory(limit=args.rmsize, window_length=args.window_length)
        self.random_process = Myrandom(size=nb_actions)

        # Hyper-parameters
        self.batch_size = args.batch_size
        self.tau = args.tau
        self.discount = args.discount
        self.depsilon = 1.0 / args.epsilon

        # 
        self.epsilon = 1.0
        self.s_t = None # Most recent state
        self.a_t = None # Most recent action
        self.use_cuda = args.cuda
        # 
        if self.use_cuda: self.cuda()

    def normalize(self, pic):
        pic = pic.swapaxes(0, 2).swapaxes(1, 2)
        return pic

    def update_policy(self):
        # Sample batch
        state_batch, action_batch, reward_batch, \
            next_state_batch, terminal_batch = self.memory.sample_batch(self.batch_size)

        # Prepare for the target q batch
        if self.pic:
            state_batch = np.array([self.normalize(x) for x in state_batch])
            state_batch = to_tensor(state_batch, volatile=True)
            state_batch = self.cnn(state_batch)
            next_state_batch = np.array([self.normalize(x) for x in next_state_batch])
            next_state_batch = to_tensor(next_state_batch, volatile=True)
            next_state_batch = self.cnn_target(next_state_batch)
            next_q_values = self.critic_target([
                next_state_batch,
                self.actor_target(next_state_batch)
            ])
        else:
            next_q_values = self.critic_target([
                to_tensor(next_state_batch, volatile=True),
                self.actor_target(to_tensor(next_state_batch, volatile=True)),
            ])
        # print('batch of picture is ok')
        next_q_values.volatile = False

        target_q_batch = to_tensor(reward_batch) + \
            self.discount * to_tensor((1 - terminal_batch.astype(np.float))) * next_q_values

        # Critic update
        self.critic.zero_grad()
        if self.pic: self.cnn.zero_grad()

        if self.pic:
            state_batch.volatile = False
            q_batch = self.critic([state_batch, to_tensor(action_batch)])
        else:
            q_batch = self.critic([to_tensor(state_batch), to_tensor(action_batch)])

        # print(reward_batch, next_q_values*self.discount, target_q_batch, terminal_batch.astype(np.float))
        value_loss = criterion(q_batch, target_q_batch)
        value_loss.backward()
        self.critic_optim.step()
        if self.pic: self.cnn_optim.step()

        self.actor.zero_grad()
        if self.pic: self.cnn.zero_grad()

        if self.pic:
            state_batch.volatile = False
            policy_loss = -self.critic([
                state_batch,
                self.actor(state_batch)
            ])
        else:
            policy_loss = -self.critic([
                to_tensor(state_batch),
                self.actor(to_tensor(state_batch))
            ])

        policy_loss = policy_loss.mean()
        policy_loss.backward()

        if self.clip_actor_grad is not None:
            torch.nn.utils.clip_grad_norm(self.actor.parameters(), float(self.clip_actor_grad))

            if self.writer is not None:
                mean_policy_grad = np.array(np.mean([np.linalg.norm(p.grad.data.cpu().numpy().ravel()) for p in self.actor.parameters()]))
                #print(mean_policy_grad)
                self.writer.add_scalar('train/mean_policy_grad', mean_policy_grad, self.select_time)

        self.actor_optim.step()
        if self.pic: self.cnn_optim.step()

        # Target update
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)
        if self.pic:
            soft_update(self.cnn_target, self.cnn, self.tau)

        return -policy_loss, value_loss

    def eval(self):
        self.actor.eval()
        self.actor_target.eval()
        self.critic.eval()
        self.critic_target.eval()
        if(self.pic):
            self.cnn.eval()
            self.cnn_target.eval()

    def train(self):
        self.actor.train()
        self.actor_target.train()
        self.critic.train()
        self.critic_target.train()
        if(self.pic):
            self.cnn.train()
            self.cnn_target.train()

    def cuda(self):
        self.cnn.cuda()
        self.cnn_target.cuda()
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic.cuda()
        self.critic_target.cuda()

    def observe(self, r_t, s_t1, done):
        self.memory.append([self.s_t, self.a_t, r_t, s_t1, done])
        self.s_t = s_t1

    def random_action(self, fix=False):
        action = np.random.uniform(-1.,1.,self.nb_actions)
        self.a_t = action
        if self.discrete and not fix:
            action = action.argmax()
#        if self.pic:
#            action = np.concatenate((softmax(action[:16]), softmax(action[16:])))
        return action
        
    def select_action(self, s_t, decay_epsilon=True, return_fix=False, noise_level=0):
        self.eval()
        if self.pic:
            s_t = self.normalize(s_t)
            s_t = self.cnn(to_tensor(np.array([s_t])))
        if self.pic:
            action = to_numpy(
                self.actor_target(s_t)
            ).squeeze(0)
        else:
            action = to_numpy(
                self.actor(to_tensor(np.array([s_t])))
            ).squeeze(0)
        self.train()
        noise_level = noise_level * max(self.epsilon, 0)

        if np.random.uniform(0, 1) < noise_level:
            action = self.random_action(fix=True)  # epsilon-greedy exploration

        if decay_epsilon:
            self.epsilon -= self.depsilon
        self.a_t = action
        
        if return_fix:
            return action
        if self.discrete:
            return action.argmax()
        else:
            return action

    def reset(self, obs):
        self.s_t = obs
        self.random_process.reset_status()

    def load_weights(self, output, num=1):        
        if output is None: return
        self.actor.load_state_dict(
            torch.load('{}/actor{}.pkl'.format(output, num))
        )
        self.actor_target.load_state_dict(
            torch.load('{}/actor{}.pkl'.format(output, num))
        )
        self.critic.load_state_dict(
            torch.load('{}/critic{}.pkl'.format(output, num))
        )
        self.critic_target.load_state_dict(
            torch.load('{}/critic{}.pkl'.format(output, num))
        )

    def save_model(self, output, num):
        if self.use_cuda:
            self.cnn.cpu()
            self.actor.cpu()
            self.critic.cpu()
        torch.save(
            self.actor.state_dict(),
            '{}/actor{}.pkl'.format(output, num)
        )
        torch.save(
            self.critic.state_dict(),
            '{}/critic{}.pkl'.format(output, num)
        )
        if self.use_cuda:
            self.cnn.cuda()
            self.actor.cuda()
            self.critic.cuda()
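Both DDPG variants above sample minibatches from rpm(args.rmsize) and draw noise from Myrandom(size=nb_actions); neither class is included in these snippets. A plausible minimal replay memory with the append/sample_batch interface they use is sketched below (the name rpm is kept for compatibility, but this is not the original implementation):

import random
from collections import deque

import numpy as np


class rpm:
    """Ring-buffer replay memory with the append/sample_batch interface used above."""

    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    def append(self, transition):
        # transition = [state, action, reward, next_state, done]
        self.buffer.append(transition)

    def sample_batch(self, batch_size):
        batch = random.sample(list(self.buffer), batch_size)
        states, actions, rewards, next_states, dones = map(np.array, zip(*batch))
        return states, actions, rewards, next_states, dones

    def __len__(self):
        return len(self.buffer)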
Example #35
0
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, num_agents, state_size, action_size, seed):
        """Initialize an Agent object.
        
        Params
        ======
            num_agents (int): number of agents acting in the environment
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.num_agents = num_agents
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Actor Network with target net
        self.actor_local = Actor(state_size, action_size, seed).to(device)
        self.actor_target = Actor(state_size, action_size, seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Critic Network with target net
        self.critic_local = Critic(state_size, action_size, seed).to(device)
        self.critic_target = Critic(state_size, action_size, seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

        # Ornstein-Uhlenbeck noise
        self.noise = OU_Noise(action_size, seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def reset(self):
        self.noise.reset()
        self.t_step = 0
    
    def step(self, states, actions, rewards, next_states, dones):
        # Save experience in replay memory        
        for i in range(self.num_agents):
            self.memory.add(states[i], actions[i], rewards[i], next_states[i], dones[i])
        
        # update the network UPDATE_TIMES times for every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                for i in range(UPDATE_TIMES):
                    experiences = self.memory.sample()
                    self.learn(experiences, GAMMA)
                
    def act(self, states, add_noise=True):
        """
        Returns actions for the given states as per the current policy.
        :param states: current states, one row per agent
        :param add_noise: whether to add Ornstein-Uhlenbeck noise
        """
        states = torch.from_numpy(states).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action_values = self.actor_local(states).cpu().data.numpy()
        self.actor_local.train()

        # add OU_noise to action to explore
        if add_noise:
            action_values += self.noise.sample()

        return np.clip(action_values, -1, 1)

    def learn(self, experiences, gamma):
        """
        Update policy and value parameters using given batch of experience tuples.

        :param experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples
        :param gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ------------------- update critic ------------------- #
        # get predicted next state, actions and Q values from target network
        actions_next = self.actor_target(next_states)
        Qtargets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states 
        Qtargets = rewards + (gamma * Qtargets_next * (1 - dones))

        # Get expected Q values from local model
        Qexpected = self.critic_local(states, actions)
        # calculate the batch loss
        critic_loss = F.mse_loss(Qexpected, Qtargets)

        # minimize critic loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()        # backward pass
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)  #gradient clipping
        self.critic_optimizer.step()   # perform a single optimization step (parameter update)

        # ------------------- update actor ------------------- #
        # compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()

        # minimize actor loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()  # backward pass
        self.actor_optimizer.step()  # perform a single optimization step (parameter update)

        # ------------------- update target network ------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """
        Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        :param local_model (PyTorch model): weights will be copied from
        :param target_model (PyTorch model): weights will be copied to
        :param tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
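Finally, a short sketch of how an agent like the one above is typically driven, assuming a vectorized environment that returns one observation, reward and done flag per agent; the env API and episode bookkeeping here are assumptions for illustration, not part of the original example:

import numpy as np


def train(env, agent, n_episodes=500, max_t=1000):
    """Run DDPG episodes; the agent stores every per-agent transition and
    learns on its own schedule inside agent.step()."""
    scores = []
    for episode in range(1, n_episodes + 1):
        states = env.reset()                              # shape: (num_agents, state_size)
        agent.reset()
        episode_scores = np.zeros(agent.num_agents)
        for _ in range(max_t):
            actions = agent.act(states)
            next_states, rewards, dones = env.step(actions)   # assumed env API
            agent.step(states, actions, rewards, next_states, dones)
            episode_scores += rewards
            states = next_states
            if np.any(dones):
                break
        scores.append(float(episode_scores.mean()))
        print("Episode {}\tAverage score (last 100): {:.2f}".format(episode, np.mean(scores[-100:])))
    return scores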