class DDPGAgent:
    def __init__(self, in_actor, hidden_in_actor, hidden_out_actor, out_actor, in_critic, hidden_in_critic, hidden_out_critic, lr_actor=1.0e-2, lr_critic=1.0e-2):
        super(DDPGAgent, self).__init__()

        self.actor = Network(in_actor, hidden_in_actor, hidden_out_actor, out_actor, actor=True).to(device)
        self.critic = Network(in_critic, hidden_in_critic, hidden_out_critic, 1).to(device)
        self.target_actor = Network(in_actor, hidden_in_actor, hidden_out_actor, out_actor, actor=True).to(device)
        self.target_critic = Network(in_critic, hidden_in_critic, hidden_out_critic, 1).to(device)

        self.noise = OUNoise(out_actor, scale=1.0)

        # initialize targets same as original networks
        hard_update(self.target_actor, self.actor)
        hard_update(self.target_critic, self.critic)

        self.actor_optimizer = Adam(self.actor.parameters(), lr=lr_actor)
        self.critic_optimizer = Adam(self.critic.parameters(), lr=lr_critic, weight_decay=1.e-5)

    def act(self, obs, noise=0.0):
        obs = obs.to(device)
        action = self.actor(obs) + noise*self.noise.noise()
        return action

    def target_act(self, obs, noise=0.0):
        obs = obs.to(device)
        action = self.target_actor(obs) + noise*self.noise.noise()
        return action
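A minimal usage sketch for the class above, assuming the Tennis-style dimensions used by the later examples (24-dim observations, 2-dim actions, a centralised critic seeing 2*24 + 2*2 = 52 inputs). The layer sizes, observation, and noise level are illustrative placeholders, and the snippet assumes CPU tensors; it is not part of the original code.

# Hypothetical instantiation: sizes follow the Tennis examples further down this page.
agent = DDPGAgent(in_actor=24, hidden_in_actor=256, hidden_out_actor=128, out_actor=2,
                  in_critic=52, hidden_in_critic=256, hidden_out_critic=128)

obs = torch.rand(24)                   # dummy single-agent observation
action = agent.act(obs, noise=0.1)     # deterministic policy output plus scaled OU noise
action = torch.clamp(action, -1, 1)    # keep the action inside the environment's range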
Example #2
class DDPGAgent:
    def __init__(
            self,
            in_actor,
            out_actor,
            n_filt_actor,
            kernel_size_actor,
            stride_actor,
            fc_units_actor,
            in_critic,
            n_filt_critic,
            kernel_size_critic,
            stride_critic,
            fc_units_critic,
            lr_actor=1.0e-3,
            lr_critic=1.0e-5):  # 1e-5 was getting to 0.4 score (sporadically)
        super(DDPGAgent, self).__init__()

        self.actor = Network(in_actor,
                             out_actor,
                             n_filt_actor,
                             kernel_size_actor,
                             stride_actor,
                             fc_units_actor,
                             actor=True).to(device)
        self.critic = Network(in_critic, 1, n_filt_critic, kernel_size_critic,
                              stride_critic, fc_units_critic).to(device)
        self.target_actor = Network(in_actor,
                                    out_actor,
                                    n_filt_actor,
                                    kernel_size_actor,
                                    stride_actor,
                                    fc_units_actor,
                                    actor=True).to(device)
        self.target_critic = Network(in_critic, 1, n_filt_critic,
                                     kernel_size_critic, stride_critic,
                                     fc_units_critic).to(device)

        self.noise = OUNoise(out_actor, scale=.1)

        # initialize targets same as original networks
        hard_update(self.target_actor, self.actor)
        hard_update(self.target_critic, self.critic)

        self.actor_optimizer = Adam(self.actor.parameters(), lr=lr_actor)
        self.critic_optimizer = Adam(self.critic.parameters(),
                                     lr=lr_critic,
                                     weight_decay=1e-3)

    def act(self, obs, noise=0.0):
        obs = obs.to(device)
        action = self.actor(obs) + noise * self.noise.noise()
        return action

    def target_act(self, obs, noise=0.0):
        obs = obs.to(device)
        action = self.target_actor(obs) + noise * self.noise.noise()
        return action
Example #3
class DDPGAgent:
    def __init__(self, in_actor, hidden_in_actor, hidden_out_actor, out_actor, in_critic, hidden_in_critic, hidden_out_critic, lr_actor=1.0e-2, lr_critic=1.0e-2):
        super(DDPGAgent, self).__init__()

        self.actor = Network(in_actor, hidden_in_actor, hidden_out_actor, out_actor, actor=True).to(device)
        self.critic = Network(in_critic, hidden_in_critic, hidden_out_critic, 1).to(device)
        self.target_actor = Network(in_actor, hidden_in_actor, hidden_out_actor, out_actor, actor=True).to(device)
        self.target_critic = Network(in_critic, hidden_in_critic, hidden_out_critic, 1).to(device)

        #self.noise = OUNoise(out_actor, scale=1.0)
        self.noise = RNoise(out_actor, 0.5)  # RNoise is defined elsewhere; a possible sketch follows this class
        
        self.epsilon = 1.
        self.epsilon_decay_rate = 0.999
        self.epsilon_min = 0.2

        # initialize targets same as original networks
        hard_update(self.target_actor, self.actor)
        hard_update(self.target_critic, self.critic)

        self.actor_optimizer = Adam(self.actor.parameters(), lr=lr_actor, weight_decay=0.0)
        self.critic_optimizer = Adam(self.critic.parameters(), lr=lr_critic, weight_decay=0.0)
        
    def epsilon_decay(self):
        self.epsilon = max(self.epsilon_decay_rate*self.epsilon, self.epsilon_min)

    def act(self, obs, rand=0., add_noise=True):
        if np.random.random() < rand:
            action = np.random.randn(2)                      # sample a random 2-dim action for this agent
            action = np.clip(action, -1, 1)                  # keep all action values between -1 and 1
            action = torch.tensor(action, dtype=torch.float)
        else:
            obs = obs.to(device)
            
            self.actor.eval()
            with torch.no_grad():
                action = self.actor(obs)
            self.actor.train()
            if add_noise:
                action += self.epsilon * self.noise.noise()
                action = action.squeeze(0)

        return action
    
    def reset(self):
        self.noise.reset()

    def target_act(self, obs, noise=0.0):
        obs = obs.to(device)
        action = self.target_actor(obs) + noise*self.noise.noise()
        return action
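The example above replaces the OU process with an RNoise class that is not shown on this page. Given only how it is constructed and called (RNoise(out_actor, 0.5), .noise(), .reset()), a plausible minimal implementation is scaled zero-mean Gaussian noise; the sketch below is an assumption, not the original author's class.

import torch

class RNoise:
    """Assumed implementation: zero-mean Gaussian noise with a fixed scale."""
    def __init__(self, action_dimension, scale=0.5):
        self.action_dimension = action_dimension
        self.scale = scale

    def noise(self):
        # fresh Gaussian sample on every call (returns a CPU tensor;
        # move it to the actor's device if the policy runs on GPU)
        return self.scale * torch.randn(self.action_dimension)

    def reset(self):
        # stateless, so nothing to reset; kept for API parity with OUNoise
        pass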
Example #4
class MADDPG:
    def __init__(self, discount_factor=0.95, tau=0.1):
        super(MADDPG, self).__init__()

        # DDPGAgent instances are used only to train the independent actors
        self.maddpg_agent = [
            DDPGAgent(24, 256, 128, 2),
            DDPGAgent(24, 256, 128, 2)
        ]

        # A single critic shared by both agents
        # critic input = full observation (2*24 = 48) + both agents' actions (2 + 2) = 52
        self.critic = Network(52, 256, 128, 1).to(device)
        self.target_critic = Network(52, 256, 128, 1).to(device)

        # initialize targets same as original networks
        hard_update(self.target_critic, self.critic)

        self.critic_optimizer = Adam(self.critic.parameters(),
                                     lr=1.0e-3,
                                     weight_decay=0.0)

        self.discount_factor = discount_factor
        self.tau = tau
        self.iter = 0

    def get_actors(self):
        """get actors of all the agents in the MADDPG object"""
        actors = [ddpg_agent.actor for ddpg_agent in self.maddpg_agent]
        return actors

    def get_target_actors(self):
        """get target_actors of all the agents in the MADDPG object"""
        target_actors = [
            ddpg_agent.target_actor for ddpg_agent in self.maddpg_agent
        ]
        return target_actors

    def act(self, obs_all_agents, noise=0.0):
        """get actions from all agents in the MADDPG object"""
        actions = [
            agent.act(obs, noise)
            for agent, obs in zip(self.maddpg_agent, obs_all_agents)
        ]
        return actions

    def target_act(self, obs_all_agents_list, noise=0.0):
        """get target network actions from all the agents in the MADDPG object """
        target_actions_list = []
        for obs_all_agents in obs_all_agents_list:
            target_actions = []
            for ddpg_agent, obs in zip(self.maddpg_agent, obs_all_agents):
                target_actions.append(ddpg_agent.target_act(obs, noise))
            target_actions_list.append(torch.stack(target_actions))
        return target_actions_list

    def act_on_list(self, obs_all_agents_list, agent_number):
        actions_list = []
        for obs_all_agents in obs_all_agents_list:
            actions = []
            for i in range(len(self.maddpg_agent)):
                if i == agent_number:
                    actions.append(self.maddpg_agent[i].actor(
                        obs_all_agents[i]))
                else:
                    actions.append(self.maddpg_agent[i].actor(
                        obs_all_agents[i]).detach())
            actions_list.append(torch.stack(actions))
        return actions_list

    @staticmethod
    def convert_samples_to_tensor(samples):
        obs, actions, rewards, next_obs, dones = [], [], [], [], []
        for sample in samples:
            obs.append(torch.tensor(sample[0], dtype=torch.float))
            actions.append(torch.tensor(sample[1], dtype=torch.float))
            rewards.append(torch.tensor(sample[2], dtype=torch.float))
            next_obs.append(torch.tensor(sample[3], dtype=torch.float))
            dones.append(torch.tensor(sample[4], dtype=torch.float))
        return obs, actions, rewards, next_obs, dones

    def update(self, samples, agent_number):
        """update the critics and actors of all the agents """

        obs_full, actions, rewards, next_obs_full, dones = self.convert_samples_to_tensor(
            samples)

        obs_full_s = torch.stack(obs_full)
        next_obs_full_s = torch.stack(next_obs_full)

        obs_full_c = torch.reshape(obs_full_s, (len(samples), -1))
        next_obs_full_c = torch.reshape(next_obs_full_s, (len(samples), -1))

        agent = self.maddpg_agent[agent_number]
        self.critic_optimizer.zero_grad()

        # critic loss = batch mean of (y - Q(s, a))^2, where Q comes from the local critic
        # y = reward at this timestep + discount * Q_target(s_{t+1}, a_{t+1}) from the target networks
        target_actions = self.target_act(next_obs_full_s)
        target_actions_s = torch.stack(target_actions)
        target_actions_c = torch.reshape(target_actions_s, (len(samples), -1))

        target_critic_input = torch.cat((next_obs_full_c, target_actions_c),
                                        dim=1).to(device)

        with torch.no_grad():
            q_next = self.target_critic(target_critic_input)

        rewards_s = torch.stack(rewards)
        dones_s = torch.stack(dones)
        y = rewards_s[:, agent_number].view(
            -1, 1) + self.discount_factor * q_next * (
                1 - dones_s[:, agent_number].view(-1, 1))

        actions_s = torch.stack(actions)
        actions_c = torch.reshape(actions_s, (len(samples), -1))
        critic_input = torch.cat((obs_full_c, actions_c), dim=1).to(device)
        q = self.critic(critic_input)

        huber_loss = torch.nn.SmoothL1Loss()
        critic_loss = huber_loss(q, y.detach())
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic.parameters(), 1)
        self.critic_optimizer.step()

        #update actor network using policy gradient
        agent.actor_optimizer.zero_grad()
        q_input = self.act_on_list(obs_full_s, agent_number)
        q_input_s = torch.stack(q_input)

        q_input_c = torch.reshape(q_input_s, (len(samples), -1))
        # combine all the actions and observations for input to critic
        q_input2 = torch.cat((obs_full_c, q_input_c), dim=1)

        # get the policy gradient
        actor_loss = -self.critic(q_input2).mean()
        actor_loss.backward()
        torch.nn.utils.clip_grad_norm_(agent.actor.parameters(), 1)
        agent.actor_optimizer.step()

        # return the scalar loss values so the caller can log them
        al = actor_loss.cpu().detach().item()
        cl = critic_loss.cpu().detach().item()
        return al, cl

    def update_targets(self):
        """soft update targets"""
        self.iter += 1
        for ddpg_agent in self.maddpg_agent:
            soft_update(ddpg_agent.target_actor, ddpg_agent.actor, self.tau)
        soft_update(self.target_critic, self.critic, self.tau)
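A hedged end-to-end sketch of driving the MADDPG class above with a fake replay batch, to make the expected sample layout concrete: each sample is an (obs, actions, rewards, next_obs, dones) tuple with one row per agent, matching convert_samples_to_tensor. The batch contents and sizes are random placeholders and the snippet assumes the Network/DDPGAgent definitions from this example's repository.

import numpy as np

maddpg = MADDPG(discount_factor=0.95, tau=0.1)

# Fake batch of 4 transitions for 2 agents (24-dim observations, 2-dim actions).
samples = []
for _ in range(4):
    obs = np.random.rand(2, 24).astype(np.float32)
    actions = np.random.uniform(-1, 1, (2, 2)).astype(np.float32)
    rewards = np.zeros(2, dtype=np.float32)
    next_obs = np.random.rand(2, 24).astype(np.float32)
    dones = np.zeros(2, dtype=np.float32)
    samples.append((obs, actions, rewards, next_obs, dones))

for agent_number in range(2):    # one critic/actor update per agent
    maddpg.update(samples, agent_number)
maddpg.update_targets()          # soft-update every target network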
Example #5
class DDPGAgent:
    def __init__(self, state_size, action_size, hidden_layers, gamma, tau,
                 lr_actor, lr_critic, weight_decay, seed):
        """Initialize DDPG agent."""
        self.state_size = state_size
        self.action_size = action_size
        self.gamma = gamma
        self.tau = tau
        self.actor = Network(state_size,
                             hidden_layers[0],
                             hidden_layers[1],
                             action_size,
                             seed,
                             actor=True).to(device)
        self.critic = Network(2 * (state_size + action_size), hidden_layers[2],
                              hidden_layers[3], 1, seed).to(device)
        self.target_actor = Network(state_size,
                                    hidden_layers[0],
                                    hidden_layers[1],
                                    action_size,
                                    seed,
                                    actor=True).to(device)
        self.target_critic = Network(2 * (state_size + action_size),
                                     hidden_layers[2], hidden_layers[3], 1,
                                     seed).to(device)

        self.noise = OUNoise(action_size, seed, scale=1.0)
        '''
        # initialize targets same as original networks
        self.hard_update(self.target_actor, self.actor)
        self.hard_update(self.target_critic, self.critic)
        '''

        self.actor_optimizer = Adam(self.actor.parameters(), lr=lr_actor)
        self.critic_optimizer = Adam(self.critic.parameters(),
                                     lr=lr_critic,
                                     weight_decay=weight_decay)

    def act(self, state):
        """Calculate actions under current policy for a specific agent."""
        state = torch.from_numpy(state).float().to(device)
        self.actor.eval()
        with torch.no_grad():
            action = self.actor(state).cpu().data.numpy()
        self.actor.train()
        action += self.noise.noise()
        return np.clip(action, -1, 1)

    def step_learn(self, experiences):
        """Update actor and critic using sampled experiences."""

        # states_list: list (length num_agents) of 2D tensors (batch_size x state_size)
        # actions_list: list (length num_agents) of 2D tensors (batch_size x action_size)
        # rewards: 2D tensor (batch_size x num_agents)
        # next_states_list: list (length num_agents) of 2D tensors (batch_size x state_size)
        # dones: 2D tensor (batch_size x num_agents)
        # (see the batch-construction sketch after this class)
        states_list, actions_list, rewards, next_states_list, dones = experiences

        next_full_states = torch.cat(next_states_list, dim=1).to(
            device)  # 2D tensor (batch_size * (num_agents*state_size))
        full_states = torch.cat(states_list, dim=1).to(
            device)  # 2D tensor (batch_size * (num_agents*state_size))
        full_actions = torch.cat(actions_list, dim=1).to(
            device)  # 2D tensor (batch_size * (num_agents*action_size))

        # update critic
        next_actions_list = [
            self.target_actor(next_states) for next_states in next_states_list
        ]
        next_full_actions = torch.cat(next_actions_list, dim=1).to(device)
        Q_target_next = self.target_critic(
            next_full_states, next_full_actions)  # 2D tensor (batch_size * 1)
        '''
        Q_target = rewards[:, idx_agent].view(-1, 1) + \
            self.gamma * Q_target_next * (1.0 - dones[:, idx_agent].view(-1, 1))
        '''
        Q_target = rewards + (self.gamma * Q_target_next * (1.0 - dones))
        Q_predict = self.critic(full_states,
                                full_actions)  # 2D tensor (batch_size * 1)
        critic_loss = F.mse_loss(Q_predict, Q_target)
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # update actor
        '''
        action_pred = self.actor(states_list[idx_agent])     # 2D tensor (batch_size * action_size)
        actions_list_update = actions_list.copy()
        actions_list_update[idx_agent] = action_pred
        full_actions_update = torch.cat(actions_list_update, dim = 1).to(device)   # 2D tensor (batch_size * (num_agents*action_size))
        '''
        actions_pred = [self.actor(states) for states in states_list]
        actions_pred_tensor = torch.cat(actions_pred, dim=1).to(device)
        # actor_loss = -self.critic(full_states, full_actions_update).mean()
        actor_loss = -self.critic(full_states, actions_pred_tensor).mean()
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # soft update target networks
        self.soft_update(self.target_critic, self.critic, self.tau)
        self.soft_update(self.target_actor, self.actor, self.tau)

    def hard_update(self, target, source):
        for target_param, param in zip(target.parameters(),
                                       source.parameters()):
            target_param.data.copy_(param.data)

    def soft_update(self, target, source, tau):
        for target_param, param in zip(target.parameters(),
                                       source.parameters()):
            target_param.data.copy_(target_param.data * (1.0 - tau) +
                                    param.data * tau)

    def reset(self):
        self.noise.reset()
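The step_learn method above expects its experiences argument in the layout described in its opening comments. A minimal sketch of assembling such a batch; all shapes and the random contents are illustrative assumptions.

import torch

batch_size, num_agents, state_size, action_size = 128, 2, 24, 2

states_list = [torch.rand(batch_size, state_size) for _ in range(num_agents)]
actions_list = [torch.rand(batch_size, action_size) * 2 - 1 for _ in range(num_agents)]
rewards = torch.zeros(batch_size, num_agents)
next_states_list = [torch.rand(batch_size, state_size) for _ in range(num_agents)]
dones = torch.zeros(batch_size, num_agents)

experiences = (states_list, actions_list, rewards, next_states_list, dones)
# agent.step_learn(experiences)   # with an agent built from the class above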
Example #6
class DDPGAgent:
    def __init__(self, in_actor, hidden_in_actor, hidden_out_actor, out_actor, in_critic, hidden_in_critic, hidden_out_critic, lr_actor=1.0e-4, lr_critic=1.0e-4):
        super(DDPGAgent, self).__init__()
        
        self.state_size = in_actor
        self.action_size = out_actor 

        self.actor = Network(in_actor, hidden_in_actor, hidden_out_actor, out_actor, actor=True).to(device)
        self.critic = Network(in_critic, hidden_in_critic, hidden_out_critic, 1).to(device)
        self.target_actor = Network(in_actor, hidden_in_actor, hidden_out_actor, out_actor, actor=True).to(device)
        self.target_critic = Network(in_critic, hidden_in_critic, hidden_out_critic, 1).to(device)

        self.noise = OUNoise(out_actor, scale=1.0 )
#         self.noise = OUNoise(action_size) #single agent only
        self.noise_scale = NOISE_START

        
        # initialize targets same as original networks
        hard_update(self.target_actor, self.actor)
        hard_update(self.target_critic, self.critic)

        self.actor_optimizer = Adam(self.actor.parameters(), lr=lr_actor)
        self.critic_optimizer = Adam(self.critic.parameters(), lr=lr_critic, weight_decay=1.e-5)


#     def act(self, obs, noise=0.0):
#         obs = obs.to(device)
#         action = self.actor(obs) + noise*self.noise.noise().to(device)
#         return action
    
    def act(self, states, i_episode, add_noise=True):
        """Returns actions for given state as per current policy."""
        
        if self.noise_scale > NOISE_END:
            #self.noise_scale *= NOISE_REDUCTION
            self.noise_scale = NOISE_REDUCTION**(i_episode-EPISODES_BEFORE_TRAINING)
        #else keep the previous value
        
        if not add_noise:
            self.noise_scale = 0.0
                                    
#         states = torch.from_numpy(states).float().to(DEVICE)
        self.actor.eval()
        with torch.no_grad():
            actions = self.actor(states)#.cpu().data.numpy()
        self.actor.train()
        
        #add noise
        actions += self.noise_scale*self.add_noise2() #works much better than OU Noise process
#         actions += self.noise_scale*self.noise.noise()
        
        return actions

    def add_noise2(self):
        # zero-mean Gaussian noise; sigma of 0.5 keeps most actions inside the [-1, 1] clipping range
        noise = 0.5*torch.randn(self.action_size).to(device)
        return noise
    
    def reset(self):
        self.noise.reset()

    def target_act(self, obs, noise=0.0):
        obs = obs.to(device)
        action = self.target_actor(obs) + noise*self.noise.noise().to(device)
        return action
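The exponential noise schedule in act above relies on module-level constants (NOISE_START, NOISE_END, NOISE_REDUCTION, EPISODES_BEFORE_TRAINING) that are not shown on this page; the values below are illustrative assumptions, used only to show how the scale decays once training starts.

# Hypothetical values for the module-level constants referenced above.
NOISE_START = 1.0
NOISE_END = 0.1
NOISE_REDUCTION = 0.999
EPISODES_BEFORE_TRAINING = 300

# noise_scale = NOISE_REDUCTION ** (i_episode - EPISODES_BEFORE_TRAINING) decays
# geometrically from 1.0; the agent stops updating it once it drops below NOISE_END.
for i_episode in (300, 1000, 3000):
    print(i_episode, NOISE_REDUCTION ** (i_episode - EPISODES_BEFORE_TRAINING))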
Example #7
class DDPGAgent:
    def __init__(self,
                 in_actor,
                 out_actor,
                 in_critic,
                 lr_actor=1.0e-4,
                 lr_critic=1.0e-3):
        super(DDPGAgent, self).__init__()

        hidden_in_actor = 64
        hidden_out_actor = 128
        hidden_in_critic = hidden_in_actor
        hidden_out_critic = hidden_out_actor

        self.actor = Network(in_actor,
                             hidden_in_actor,
                             hidden_out_actor,
                             out_actor,
                             actor=True).to(device)
        self.critic = Network(in_critic,
                              hidden_in_critic,
                              hidden_out_critic,
                              1,
                              out_actor,
                              actor=False).to(device)
        self.target_actor = Network(in_actor,
                                    hidden_in_actor,
                                    hidden_out_actor,
                                    out_actor,
                                    actor=True).to(device)
        self.target_critic = Network(in_critic,
                                     hidden_in_critic,
                                     hidden_out_critic,
                                     1,
                                     out_actor,
                                     actor=False).to(device)

        self.noise = OUNoise(out_actor, scale=0.9)  #scale 1.0
        self.noise_shape = out_actor

        # initialize targets same as original networks
        hard_update(self.target_actor, self.actor)
        hard_update(self.target_critic, self.critic)

        self.actor_optimizer = Adam(self.actor.parameters(), lr=lr_actor)
        WD = 1e-5
        self.critic_optimizer = Adam(self.critic.parameters(),
                                     lr=lr_critic,
                                     weight_decay=WD)

    def reset(self):
        self.noise.reset()

    def noisef(self, mean=0, sigma=0.08):
        return np.random.normal(mean, sigma, self.noise_shape)

    def act(self, obs, noise=0.0):
        obs = obs.to(device)
        action = self.actor(obs).cpu().data.numpy() + noise * self.noisef()  # Gaussian noise in place of self.noise.noise()
        return action

    def target_act(self, obs, noise=0.0):
        obs = obs.to(device)
        action = self.target_actor(obs).cpu()  # noise term omitted: + noise * self.noisef()
        return action
Example #8
class DDPGAgent:
    def __init__(self,
                 state_size,
                 action_size,
                 hidden_in_dim,
                 hidden_out_dim,
                 extrem_out=64,
                 num_agents=2,
                 lr_actor=1.0e-4,
                 lr_critic=1.0e-3):
        super(DDPGAgent, self).__init__()
        critic_state_size = state_size * num_agents
        critic_action_size = (action_size * (num_agents))
        self.actor = Network(state_size,
                             action_size,
                             hidden_in_dim,
                             hidden_out_dim,
                             hidden_extrem_out=extrem_out,
                             actor=True).to(device)
        self.critic = Network(critic_state_size,
                              critic_action_size,
                              hidden_in_dim,
                              hidden_out_dim,
                              hidden_extrem_out=extrem_out).to(device)
        self.target_actor = Network(state_size,
                                    action_size,
                                    hidden_in_dim,
                                    hidden_out_dim,
                                    hidden_extrem_out=extrem_out,
                                    actor=True).to(device)
        self.target_critic = Network(critic_state_size,
                                     critic_action_size,
                                     hidden_in_dim,
                                     hidden_out_dim,
                                     hidden_extrem_out=extrem_out).to(device)

        self.noise = OUNoise(action_size, scale=1.0)

        # initialize targets same as original networks
        hard_update(self.target_actor, self.actor)
        hard_update(self.target_critic, self.critic)

        self.actor_optimizer = Adam(self.actor.parameters(), lr=lr_actor)
        self.critic_optimizer = Adam(self.critic.parameters(),
                                     lr=lr_critic,
                                     weight_decay=0)

        print("critic", self.critic, self.target_critic, "optim",
              self.critic_optimizer)
        print("actor", self.actor, self.target_actor, "optim",
              self.actor_optimizer)

    def act(self, obs, noise=0.0, batch=True):
        obs = obs.to(device)
        self.actor.eval()
        act = self.actor(obs, batch=batch).cpu().data
        no = noise * self.noise.noise()
        #print( "act" , act , "noise" , no)
        action = act + no
        self.actor.train()
        return np.clip(action, -1, 1)

    def target_act(self, obs, noise=0.0, batch=True):
        obs = obs.to(device)
        self.target_actor.eval()
        action = self.target_actor(
            obs, batch=batch).cpu().data + noise * self.noise.noise()
        self.target_actor.train()
        return np.clip(action, -1, 1)
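Every snippet on this page leans on small helper utilities (hard_update, soft_update, OUNoise) defined elsewhere in the respective repositories. The sketch below is one plausible, minimal version consistent with how they are called above; the Ornstein-Uhlenbeck parameters (mu, theta, sigma) are assumptions.

import torch

def hard_update(target, source):
    # copy source parameters into target (used right after construction)
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(param.data)

def soft_update(target, source, tau):
    # Polyak averaging: target <- (1 - tau) * target + tau * source
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(target_param.data * (1.0 - tau) + param.data * tau)

class OUNoise:
    """Assumed Ornstein-Uhlenbeck noise matching the OUNoise(dim, scale=...) calls above."""
    def __init__(self, action_dimension, seed=None, scale=0.1, mu=0.0, theta=0.15, sigma=0.2):
        # `seed` is accepted because one example passes it positionally
        if seed is not None:
            torch.manual_seed(seed)
        self.action_dimension = action_dimension
        self.scale = scale
        self.mu = mu
        self.theta = theta
        self.sigma = sigma
        self.state = torch.ones(self.action_dimension) * self.mu

    def reset(self):
        self.state = torch.ones(self.action_dimension) * self.mu

    def noise(self):
        # one Euler step of the OU process, returned scaled
        dx = self.theta * (self.mu - self.state) + self.sigma * torch.randn(self.action_dimension)
        self.state = self.state + dx
        return self.state * self.scale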