Code Example #1
    def __init__(self, game, num_agents, state_size, action_size, name, random_seed=0,
                    lr_critic=1e-3, lr_actor=1e-3,
                    fc1_units=400, fc2_units=300,
                    buffer_size=int(1e6), batch_size=128,
                    gamma=0.99, tau=1e-3,
                    max_norm=1.0,
                    epsilon_start=1.0, epsilon_end=0.1, epsilon_decay=0.99,
                    exploration_mu=0.0, exploration_theta=0.15, exploration_sigma=0.2):
        
        """Initialize an Agent object.
        Args:
            game (class Game): mediator in the chain-of-responsibility design pattern (broker chain).
            random_seed (int): random seed.
            
            max_norm (float): value of clip_grad_norm for critic optimizer
        """
        super().__init__()
        
        self.index_agent = None
        
        self.game = game
        self.num_agents = num_agents
            
        self.state_size = state_size
        self.action_size = action_size
        self.name = name
        self.seed = random.seed(random_seed)
        
        self.max_norm = max_norm
        self.epsilon = epsilon_start
        self.epsilon_end = epsilon_end
        self.epsilon_decay = epsilon_decay
        
        # Actor Network (w/ Target Network)
        self.actor_local = MADDPGActorVersion3(state_size, action_size, 
                                               fc1_units=fc1_units, fc2_units=fc2_units).to(device)
        self.actor_target = MADDPGActorVersion3(state_size, action_size, 
                                                fc1_units=fc1_units, fc2_units=fc2_units).to(device)
        
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=lr_actor)
        
        # Critic Network (w/ Target Network)
        self.critic_local = MADDPGCriticVersion4(num_agents, state_size, action_size, 
                                                 fcs1_units=fc1_units, fc2_units=fc2_units).to(device)
        self.critic_target = MADDPGCriticVersion4(num_agents, state_size, action_size, 
                                                  fcs1_units=fc1_units, fc2_units=fc2_units).to(device)
        
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=lr_critic)
        
        # Noise process for action exploration
        self.noise = OUNoise(self.action_size, exploration_mu, exploration_theta, exploration_sigma)

        # parameter of discounted reward
        self.gamma = gamma
        
        # soft update parameter
        self.tau = tau
        
        self.batch_size = batch_size
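
The OUNoise process constructed above is not part of this snippet; a minimal Ornstein-Uhlenbeck sketch compatible with the (size, mu, theta, sigma) call used here might look like this:

import copy

import numpy as np


class OUNoise:
    """Ornstein-Uhlenbeck process producing temporally correlated exploration noise."""

    def __init__(self, size, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        # Reset the internal state to the long-running mean.
        self.state = copy.copy(self.mu)

    def sample(self):
        # Drift back toward mu plus Gaussian diffusion, then return the new state.
        dx = self.theta * (self.mu - self.state) + self.sigma * np.random.standard_normal(len(self.state))
        self.state = self.state + dx
        return self.state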
Code Example #2
    def __init__(self,
                 agent_count,
                 observation_size,
                 action_size,
                 actor_optim_params,
                 critic_optim_params,
                 soft_update_tau,
                 discount_gamma,
                 use_batch_norm,
                 seed,
                 actor_network_states,
                 critic_network_states,
                 device):

        self._soft_update_tau = soft_update_tau
        self._gamma = discount_gamma

        # actor networks
        self._actor_local = ActorNetwork(
            observation_size, action_size, use_batch_norm, seed
        ).to(device)

        self._actor_target = ActorNetwork(
            observation_size, action_size, use_batch_norm, seed
        ).to(device)

        # critic networks
        self._critic_local = CriticNetwork(
            observation_size * agent_count, action_size * agent_count, use_batch_norm, seed
        ).to(device)

        self._critic_target = CriticNetwork(
            observation_size * agent_count, action_size * agent_count, use_batch_norm, seed
        ).to(device)

        # optimizers
        self._actor_optimizer = optim.Adam(
            self._actor_local.parameters(),
            **actor_optim_params
        )

        self._critic_optimizer = optim.Adam(
            self._critic_local.parameters(),
            **critic_optim_params
        )

        if actor_network_states is not None:
            self._actor_local.load_state_dict(actor_network_states[0])
            self._actor_target.load_state_dict(actor_network_states[1])

        if critic_network_states is not None:
            self._critic_local.load_state_dict(critic_network_states[0])
            self._critic_target.load_state_dict(critic_network_states[1])

        self.noise = OUNoise(action_size, seed)
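
This constructor appears again as part of the Brain class in Code Example #8 below. A hypothetical construction call, with illustrative hyperparameters, could look like this (the optimizer parameter dicts are unpacked directly into optim.Adam):

import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

brain = Brain(
    agent_count=2,
    observation_size=24,
    action_size=2,
    actor_optim_params={"lr": 1e-4},
    critic_optim_params={"lr": 1e-3, "weight_decay": 0.0},
    soft_update_tau=1e-3,
    discount_gamma=0.99,
    use_batch_norm=True,
    seed=0,
    actor_network_states=None,   # or a (local_state_dict, target_state_dict) pair from a checkpoint
    critic_network_states=None,
    device=device,
)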
Code Example #3
    def __init__(self, task, buffer_size, batch_size, gamma, tau,
                 actor_dropout, critic_dropout, exploration_theta,
                 exploration_sigma, actor_lr, critic_lr):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high
        self.actor_dropout = actor_dropout
        self.critic_dropout = critic_dropout
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high,
                                 self.actor_dropout, self.actor_lr)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high,
                                  self.actor_dropout, self.actor_lr)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size,
                                   self.critic_dropout, self.critic_lr)
        self.critic_target = Critic(self.state_size, self.action_size,
                                    self.critic_dropout, self.critic_lr)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 5
        self.exploration_theta = exploration_theta
        self.exploration_sigma = exploration_sigma
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.memory = PrioritizedReplayBuffer(self.buffer_size,
                                              self.batch_size)

        # Algorithm parameters
        self.gamma = gamma  # discount factor
        self.tau = tau  # for soft update of target parameters

        self.best_score = -np.inf
Code Example #4
class MADDPGAgentVersion5(BaseAgent):
    def __init__(self, game, num_agents, state_size, action_size, name, random_seed=0,
                    lr_critic=1e-3, lr_actor=1e-3,
                    fc1_units=400, fc2_units=300,
                    buffer_size=int(1e6), batch_size=128,
                    gamma=0.99, tau=1e-3,
                    max_norm=1.0,
                    epsilon_start=1.0, epsilon_end=0.1, epsilon_decay=0.99,
                    exploration_mu=0.0, exploration_theta=0.15, exploration_sigma=0.2):
        
        """Initialize an Agent object.
        Args:
            game (class Game): mediator in the chain-of-responsibility design pattern (broker chain).
            random_seed (int): random seed.
            
            max_norm (float): value of clip_grad_norm for critic optimizer
        """
        super().__init__()
        
        self.index_agent = None
        
        self.game = game
        self.num_agents = num_agents
            
        self.state_size = state_size
        self.action_size = action_size
        self.name = name
        self.seed = random.seed(random_seed)
        
        self.max_norm = max_norm
        self.epsilon = epsilon_start
        self.epsilon_end = epsilon_end
        self.epsilon_decay = epsilon_decay
        
        # Actor Network (w/ Target Network)
        self.actor_local = MADDPGActorVersion2(state_size, action_size, random_seed, 
                                               fc1_units=fc1_units, fc2_units=fc2_units).to(device)
        self.actor_target = MADDPGActorVersion2(state_size, action_size, random_seed, 
                                                fc1_units=fc1_units, fc2_units=fc2_units).to(device)
        
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=lr_actor)
        
        # Critic Network (w/ Target Network)
        self.critic_local = MADDPGCriticVersion3(num_agents, state_size, action_size, 
                                                 fcs1_units=fc1_units, fc2_units=fc2_units,
                                                 seed=random_seed).to(device)
        self.critic_target = MADDPGCriticVersion3(num_agents, state_size, action_size, 
                                                  fcs1_units=fc1_units, fc2_units=fc2_units,
                                                 seed=random_seed).to(device)
        
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=lr_critic)
        
        # Noise process for action exploration
        self.noise = OUNoise(self.action_size, exploration_mu, exploration_theta, exploration_sigma)

        # parameter of discounted reward
        self.gamma = gamma
        
        # soft update parameter
        self.tau = tau
        
        self.batch_size = batch_size

        
    def step(self, states, actions, rewards, next_states, dones):
        """
        Args:
            states (numpy.array): states.shape[1] = (state_size*num_agents)
            actions (numpy.array): actions.shape[1] = (actions_size*num_agents)
            next_states (numpy.array): next_states.shape[1] = (state_size*num_agents)
        """
        
        self.learn(states, actions, rewards, next_states, dones)

       
    def act(self, state, add_noise=True):
        """
            Returns actions for given state.
            The input size of actor networks is state_size.
        """
        
        state = torch.from_numpy(state).float().to(device)
        
        with torch.no_grad(): 
            self.actor_local.eval()

            action = self.actor_local(state).cpu().data.numpy()

            self.actor_local.train()

            if add_noise:
                action += self.epsilon * self.noise.sample()

        return np.clip(action, -1, 1)    
    
    
    def reset(self):
        self.noise.reset()

        
    def forward_all(self, next_states):
        """
        Get next_actions. This is a chain-of-responsibility design pattern. (Broker chain)
        
        Return:
            1d differentiable tensor of next_actions.
        """
        q = ActionQuery()
        
        for i, agent in enumerate(self.game):
            # get next_state_i of agent_i
            n_state = next_states[:, i*self.state_size: (i+1)*self.state_size]
            
            
            if agent == self:
                detach = False
            else:
                detach = True
                
            # predict next_action and append it to actionQuery.actions
            agent.query(n_state, q, detach)
            
        return q.next_actions
    
    
    def query(self, next_state, q, detach):
        """
        Args:
            q (class ActionQuery): parcel that stores actions
        """
        
        next_action = self.actor_local(next_state)
        
        if detach is True:
            next_action = next_action.detach()
        
        if q.next_actions is None:
            q.next_actions = next_action
        else:
            q.next_actions = torch.cat((q.next_actions, next_action), dim=1)    
            


    def learn(self, states, actions, rewards, next_states, dones):
        """Update policy and value parameters using given batch of experience tuples.
        For agent i:
            Q_target_i = r_i + gamma * critic_target(next_state, actor_target(next_state))
            
        where:
            actor_target(state) -> actions for all agent
            critic_target(state, action) -> Q-value
        
        Args:
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        
        # split the joint experience fields so that agent i can be updated
        experience_unpacks = ExperienceUnpack(states, actions, rewards, next_states, dones,
                                              self.state_size, self.action_size, self.num_agents)
        
        # unpack the fields belonging to agent i
        if self.index_agent is None:
            self.index_agent = self.game.index_of_agent(self)
            
            
            
        states_i, actions_i, rewards_i, next_states_i, dones_i = experience_unpacks[self.index_agent]

#         assert (states_i.shape[1] == (self.state_size)), 'Wrong shape of states_i'
#         assert (actions_i.shape[1] == (self.action_size)), 'Wrong shape of actions_i'
#         assert (rewards_i.shape[1] == (1)), 'Wrong shape of rewards_i'
#         assert (dones_i.shape[1] == (1)), 'Wrong shape of dones_i'

        # train critic
        # loss function = Q_target (TD 1-step bootstrapping) - Q_local (current)
        next_actions = self.forward_all(next_states)

        assert (next_actions.shape[1] == (self.action_size * self.num_agents)), 'Wrong shape of next_actions'

        Q_targets_next = self.critic_target(next_states, next_actions)

        Q_target_i = rewards_i + (self.gamma * Q_targets_next * (1-dones_i))
        Q_expected = self.critic_local(states, actions)

        critic_loss = F.mse_loss(Q_expected, Q_target_i)

        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), self.max_norm)
        self.critic_optimizer.step()

        # train actor
        actions_pred = self.forward_all(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()

        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

            
#         # update critic
#         self.soft_update(self.critic_local, self.critic_target, self.tau)

#         # update actors
#         self.soft_update(self.actor_local, self.actor_target, self.tau)
        
        #------ update noise ---#
        self.epsilon = max(self.epsilon * self.epsilon_decay, self.epsilon_end)
        self.noise.reset()
         
        
    def update_targets(self):
        # update critic
        self.soft_update(self.critic_local, self.critic_target, self.tau)

        # update actors
        self.soft_update(self.actor_local, self.actor_target, self.tau)
        
            
    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Args:
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
      

    def model_dicts(self):
        m_dicts = {'critic_{}'.format(self.name): self.critic_target,
                   'actor_{}'.format(self.name): self.actor_target}
        
        return m_dicts                                         
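
Neither ActionQuery nor the game mediator is included in this snippet; a minimal sketch of the broker-chain helpers that forward_all() and query() rely on could look like the following (class layout and any method names beyond those used above are assumptions):

class ActionQuery:
    """Parcel passed along the agent chain; accumulates each agent's predicted actions."""

    def __init__(self):
        # Becomes a (batch_size, action_size * num_agents) tensor as agents append to it.
        self.next_actions = None


class Game:
    """Mediator that owns the agents, is iterable, and lets an agent look up its own index."""

    def __init__(self):
        self.agents = []

    def add_agent(self, agent):
        self.agents.append(agent)

    def __iter__(self):
        return iter(self.agents)

    def index_of_agent(self, agent):
        return self.agents.index(agent)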
Code Example #5
    def __init__(self,
                 state_size,
                 action_size,
                 random_seed,
                 lr_actor=1e-2,
                 lr_critic=1e-2,
                 fc1_units=128,
                 fc2_units=128,
                 buffer_size=int(1e6),
                 batch_size=50,
                 gamma=0.95,
                 tau=1e-2,
                 max_norm=1.0,
                 learn_period=100,
                 learn_sampling_num=50,
                 adam_critic_weight_decay=0.0,
                 name=None,
                 exploration_mu=0.0,
                 exploration_sigma=0.2,
                 exploration_theta=0.15):
        """Initialize an Agent object.
        Args:
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
            
        """
        super().__init__()
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        self.max_norm = max_norm
        self.learn_period = learn_period
        self.learn_sampling_num = learn_sampling_num

        self.actor_local = DDPGActorVersion1(state_size,
                                             action_size,
                                             random_seed,
                                             fc1_units=fc1_units,
                                             fc2_units=fc2_units).to(device)

        self.actor_target = DDPGActorVersion1(state_size,
                                              action_size,
                                              random_seed,
                                              fc1_units=fc1_units,
                                              fc2_units=fc2_units).to(device)

        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=lr_actor)

        # Critic Network (w/ Target Network)
        self.critic_local = DDPGCriticVersion1(state_size,
                                               action_size,
                                               random_seed,
                                               fcs1_units=fc1_units,
                                               fc2_units=fc2_units).to(device)

        self.critic_target = DDPGCriticVersion1(state_size,
                                                action_size,
                                                random_seed,
                                                fcs1_units=fc1_units,
                                                fc2_units=fc2_units).to(device)

        self.critic_optimizer = optim.Adam(
            self.critic_local.parameters(),
            lr=lr_critic,
            weight_decay=adam_critic_weight_decay)
        # Noise process for action exploration

        self.exploration_mu = exploration_mu
        self.exploration_theta = exploration_theta  # (Timothy Lillicrap, 2016)
        self.exploration_sigma = exploration_sigma  # (Timothy Lillicrap, 2016)

        self.noise = OUNoise(action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory

        self.memory = ReplayBuffer(action_size, buffer_size, batch_size,
                                   random_seed, device)

        self.gamma = gamma

        # soft update parameter
        self.tau = tau

        self.batch_size = batch_size

        self.name = name

        self.time_step = 0
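
The ReplayBuffer used above is not shown; a minimal uniform-sampling sketch matching the (action_size, buffer_size, batch_size, seed, device) constructor and the add/sample/len interface could be:

import random
from collections import deque, namedtuple

import numpy as np
import torch


class ReplayBuffer:
    def __init__(self, action_size, buffer_size, batch_size, seed, device):
        self.action_size = action_size
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.experience = namedtuple("Experience",
                                     field_names=["state", "action", "reward", "next_state", "done"])
        self.device = device
        random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self):
        # Uniformly sample a batch and convert each field to a float tensor on the target device.
        experiences = random.sample(self.memory, k=self.batch_size)
        states = torch.from_numpy(np.vstack([e.state for e in experiences])).float().to(self.device)
        actions = torch.from_numpy(np.vstack([e.action for e in experiences])).float().to(self.device)
        rewards = torch.from_numpy(np.vstack([e.reward for e in experiences])).float().to(self.device)
        next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences])).float().to(self.device)
        dones = torch.from_numpy(np.vstack([e.done for e in experiences]).astype(np.uint8)).float().to(self.device)
        return states, actions, rewards, next_states, dones

    def __len__(self):
        return len(self.memory)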
Code Example #6
class DDPGAgentVersion1(BaseAgent):
    def __init__(self,
                 state_size,
                 action_size,
                 random_seed,
                 lr_actor=1e-2,
                 lr_critic=1e-2,
                 fc1_units=128,
                 fc2_units=128,
                 buffer_size=int(1e6),
                 batch_size=50,
                 gamma=0.95,
                 tau=1e-2,
                 max_norm=1.0,
                 learn_period=100,
                 learn_sampling_num=50,
                 adam_critic_weight_decay=0.0,
                 name=None,
                 exploration_mu=0.0,
                 exploration_sigma=0.2,
                 exploration_theta=0.15):
        """Initialize an Agent object.
        Args:
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
            
        """
        super().__init__()
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        self.max_norm = max_norm
        self.learn_period = learn_period
        self.learn_sampling_num = learn_sampling_num

        self.actor_local = DDPGActorVersion1(state_size,
                                             action_size,
                                             random_seed,
                                             fc1_units=fc1_units,
                                             fc2_units=fc2_units).to(device)

        self.actor_target = DDPGActorVersion1(state_size,
                                              action_size,
                                              random_seed,
                                              fc1_units=fc1_units,
                                              fc2_units=fc2_units).to(device)

        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=lr_actor)

        # Critic Network (w/ Target Network)
        self.critic_local = DDPGCriticVersion1(state_size,
                                               action_size,
                                               random_seed,
                                               fcs1_units=fc1_units,
                                               fc2_units=fc2_units).to(device)

        self.critic_target = DDPGCriticVersion1(state_size,
                                                action_size,
                                                random_seed,
                                                fcs1_units=fc1_units,
                                                fc2_units=fc2_units).to(device)

        self.critic_optimizer = optim.Adam(
            self.critic_local.parameters(),
            lr=lr_critic,
            weight_decay=adam_critic_weight_decay)
        # Noise process for action exploration

        self.exploration_mu = exploration_mu
        self.exploration_theta = exploration_theta  # (Timothy Lillicrap, 2016)
        self.exploration_sigma = exploration_sigma  # (Timothy Lillicrap, 2016)

        self.noise = OUNoise(action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory

        self.memory = ReplayBuffer(action_size, buffer_size, batch_size,
                                   random_seed, device)

        self.gamma = gamma

        # soft update parameter
        self.tau = tau

        self.batch_size = batch_size

        self.name = name

        self.time_step = 0

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        self.time_step += 1

        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if (len(self.memory) > self.batch_size) and (self.time_step %
                                                     self.learn_period == 0):
            for _ in range(self.learn_sampling_num):
                experiences = self.memory.sample()
                self.learn(experiences, self.gamma)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)

        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()

        self.actor_local.train()

        if add_noise:
            action += self.noise.sample()

        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        
        Q_targets = r + gamma * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value
        
        Args:
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences
        # train critic
        # loss function = Q_target (TD 1-step bootstrapping) - Q_local (current)
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)

        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        Q_expected = self.critic_local(states, actions)

        critic_loss = F.mse_loss(Q_expected, Q_targets)

        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(),
                                       self.max_norm)
        self.critic_optimizer.step()

        # train actor (policy gradient)
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()

        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # update critic_target
        self.soft_update(self.critic_local, self.critic_target, self.tau)

        # update actor_target
        self.soft_update(self.actor_local, self.actor_target, self.tau)

        #------ update noise ---#
        self.noise.reset()

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Args:
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def model_dicts(self):
        return {
            'agent_{}_actor'.format(self.name): self.actor_target,
            'agent_{}_critic'.format(self.name): self.critic_target
        }
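
A hypothetical training loop for this agent, assuming a Gym-style environment whose observations match state_size (the function and variable names here are illustrative):

def train(env, agent, n_episodes=1000, max_t=1000):
    scores = []
    for episode in range(1, n_episodes + 1):
        state = env.reset()
        agent.reset()          # reset the OU noise process each episode
        score = 0.0
        for _ in range(max_t):
            action = agent.act(state, add_noise=True)
            next_state, reward, done, _ = env.step(action)
            agent.step(state, action, reward, next_state, done)
            state = next_state
            score += reward
            if done:
                break
        scores.append(score)
    return scores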
Code Example #7
class DDPGAgentVersion5(BaseAgent):
    def __init__(self,
                 state_size,
                 action_size,
                 random_seed,
                 lr_actor=1e-2,
                 lr_critic=1e-2,
                 fc1_units=128,
                 fc2_units=128,
                 buffer_size=int(1e6),
                 batch_size=50,
                 gamma=0.95,
                 tau=1e-2,
                 max_norm=1.0,
                 learn_period=100,
                 learn_sampling_num=50,
                 adam_critic_weight_decay=0.0,
                 name=None,
                 exploration_mu=0.0,
                 exploration_sigma=0.2,
                 exploration_theta=0.15,
                 epsilon_start=1.0,
                 epsilon_end=0.1,
                 epsilon_decay=0.99):
        """Initialize an Agent object.
        Args:
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
            
        """
        super().__init__()
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        self.max_norm = max_norm
        self.learn_period = learn_period
        self.learn_sampling_num = learn_sampling_num
        self.epsilon = epsilon_start
        self.epsilon_end = epsilon_end
        self.epsilon_decay = epsilon_decay

        self.actor_local = DDPGActorVersion1(state_size,
                                             action_size,
                                             random_seed,
                                             fc1_units=fc1_units,
                                             fc2_units=fc2_units).to(device)

        self.actor_target = DDPGActorVersion1(state_size,
                                              action_size,
                                              random_seed,
                                              fc1_units=fc1_units,
                                              fc2_units=fc2_units).to(device)

        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=lr_actor)

        # Critic Network (w/ Target Network)
        self.critic_local = DDPGCriticVersion1(state_size,
                                               action_size,
                                               random_seed,
                                               fcs1_units=fc1_units,
                                               fc2_units=fc2_units).to(device)

        self.critic_target = DDPGCriticVersion1(state_size,
                                                action_size,
                                                random_seed,
                                                fcs1_units=fc1_units,
                                                fc2_units=fc2_units).to(device)

        self.critic_optimizer = optim.Adam(
            self.critic_local.parameters(),
            lr=lr_critic,
            weight_decay=adam_critic_weight_decay)
        # Noise process for action exploration

        self.exploration_mu = exploration_mu
        self.exploration_theta = exploration_theta  # (Timothy Lillicrap, 2016)
        self.exploration_sigma = exploration_sigma  # (Timothy Lillicrap, 2016)

        self.noise = OUNoise(action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory

        #self.memory = ReplayBuffer(action_size, buffer_size, batch_size, random_seed, device)

        self.memory = PrioritizedReplayBuffer(action_size, buffer_size,
                                              batch_size, random_seed, device)

        # Prioritized Replay Buffer Params
        #self.a, self.b = 0.7, 0.5   # rank-based variant
        self.a, self.b = 0.6, 0.4  # proportional variant

        self.e = 1e-3  # 0.01 * (reward of each time step) = 0.01 * 0.1

        # parameter of discounted reward
        self.gamma = gamma

        # soft update parameter
        self.tau = tau

        self.batch_size = batch_size

        self.name = name

        self.time_step = 0

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        self.time_step += 1

        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if (len(self.memory) > self.batch_size) and (self.time_step %
                                                     self.learn_period == 0):

            for _ in range(self.learn_sampling_num):
                experiences = self.memory.sample()
                self.learn(experiences, self.gamma)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)

        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()

        self.actor_local.train()

        if add_noise:
            action += self.epsilon * self.noise.sample()

        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        
        Q_targets = r + gamma * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value
        
        Args:
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones, indices, probs = experiences
        # train critic
        # loss function = Q_target (TD 1-step bootstrapping) - Q_local (current)
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)

        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        Q_expected = self.critic_local(states, actions)

        # keep per-sample squared TD errors (no reduction) so the importance-sampling weights can act per sample
        critic_loss = F.mse_loss(Q_expected, Q_targets, reduction='none')

        # compute td error (delta) for updating prioritized replay buffer
        abs_td_error = torch.abs(Q_targets - Q_expected)

        # Calculate importance sampling weight
        if probs:
            # w_i = (N * P(i))^(-b), normalized by the maximum weight
            weights = (np.array(probs).reshape(-1, 1) * len(self.memory))**(-self.b)
            weights /= np.max(weights)
        else:
            weights = np.ones(critic_loss.shape, dtype=np.float64)

        # Calculate weighted loss
        weighted_critic_loss = torch.mean(
            torch.from_numpy(weights).float().to(device) * critic_loss)
        self.critic_optimizer.zero_grad()
        weighted_critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(),
                                       self.max_norm)
        self.critic_optimizer.step()

        if indices:
            # convert errors to priorities and update them
            self.memory.update(
                indices,
                list(
                    abs_td_error.detach().to('cpu').numpy().squeeze()**self.a +
                    self.e))

        # train actor (policy gradient)
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()

        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # update critic_target
        self.soft_update(self.critic_local, self.critic_target, self.tau)

        # update actor_target
        self.soft_update(self.actor_local, self.actor_target, self.tau)

        #------ update noise ---#
        self.epsilon = max(self.epsilon * self.epsilon_decay, self.epsilon_end)
        self.noise.reset()

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Args:
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def model_dicts(self):
        return {
            'agent_{}_actor'.format(self.name): self.actor_target,
            'agent_{}_critic'.format(self.name): self.critic_target
        }
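
For reference, the prioritized-replay arithmetic condensed inside learn() above (priority = |TD error|^a + e, importance-sampling weight = (N * P(i))^(-b) normalized by its maximum) written out as standalone helpers; the function names are illustrative:

import numpy as np


def priority_from_td_error(abs_td_error, a=0.6, e=1e-3):
    # p_i = |delta_i|^a + e keeps every transition sampleable.
    return np.abs(abs_td_error) ** a + e


def importance_sampling_weights(probs, memory_size, b=0.4):
    # w_i = (N * P(i))^(-b), scaled so the largest weight is 1.
    weights = (memory_size * np.asarray(probs, dtype=np.float64)) ** (-b)
    return weights / weights.max()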
Code Example #8
class Brain:
    def __init__(self,
                 agent_count,
                 observation_size,
                 action_size,
                 actor_optim_params,
                 critic_optim_params,
                 soft_update_tau,
                 discount_gamma,
                 use_batch_norm,
                 seed,
                 actor_network_states,
                 critic_network_states,
                 device):

        self._soft_update_tau = soft_update_tau
        self._gamma = discount_gamma

        # actor networks
        self._actor_local = ActorNetwork(
            observation_size, action_size, use_batch_norm, seed
        ).to(device)

        self._actor_target = ActorNetwork(
            observation_size, action_size, use_batch_norm, seed
        ).to(device)

        # critic networks
        self._critic_local = CriticNetwork(
            observation_size * agent_count, action_size * agent_count, use_batch_norm, seed
        ).to(device)

        self._critic_target = CriticNetwork(
            observation_size * agent_count, action_size * agent_count, use_batch_norm, seed
        ).to(device)

        # optimizers
        self._actor_optimizer = optim.Adam(
            self._actor_local.parameters(),
            **actor_optim_params
        )

        self._critic_optimizer = optim.Adam(
            self._critic_local.parameters(),
            **critic_optim_params
        )

        if actor_network_states is not None:
            self._actor_local.load_state_dict(actor_network_states[0])
            self._actor_target.load_state_dict(actor_network_states[1])

        if critic_network_states is not None:
            self._critic_local.load_state_dict(critic_network_states[0])
            self._critic_target.load_state_dict(critic_network_states[1])

        self.noise = OUNoise(action_size, seed)

    def get_actor_model_states(self):
        return self._actor_local.state_dict(), self._actor_target.state_dict()

    def get_critic_model_states(self):
        return self._critic_local.state_dict(), self._critic_target.state_dict()

    def act(self, observation, target=False, noise=0.0, train=False):
        """
        :param observation: tensor of shape == (b, observation_size)
        :param target: true to evaluate with target
        :param noise: OU noise factor
        :param train: True for training mode else eval mode
        :return: action: tensor of shape == (b, action_size)
        """

        actor = self._actor_target if target else self._actor_local

        if train:
            actor.train()
        else:
            actor.eval()

        action_values = actor(observation)

        if noise > 0:
            noise = torch.tensor(
                noise * self.noise.sample(),
                dtype=observation.dtype,
                device=observation.device
            )
        else:
            noise = 0

        return action_values + noise

    def update_actor(self, all_obs, all_pred_actions):
        """
        Actor
        :param all_obs: array of shape == (b, observation_size * n_agents)
        :param all_pred_actions: array of shape == (b, action_size * n_agents)
        :return:
        """

        actor_loss = -self._critic_local(all_obs, all_pred_actions).mean()

        self._actor_optimizer.zero_grad()
        actor_loss.backward(retain_graph=True)
        self._actor_optimizer.step()

    def update_critic(self, rewards, dones,
                      all_obs, all_actions, all_next_obs, all_next_actions):
        """
        Critic receives observation and actions of all agents as input
        :param rewards: array of shape == (b, 1)
        :param dones: array of shape == (b, 1)
        :param all_obs: array of shape == (b, n_agents, observation_size)
        :param all_actions: array of shape == (b, n_agents, action_size)
        :param all_next_obs:  array of shape == (b, n_agents, observation_size)
        :param all_next_actions: array of shape == (b, n_agents, action_size)
        """

        with torch.no_grad():
            q_target_next = self._critic_target(all_next_obs, all_next_actions)

        q_target = rewards + self._gamma * q_target_next * (1 - dones)

        q_expected = self._critic_local(all_obs, all_actions)

        # mse loss, manual calculation due to mse_loss bug, as of 0.4.1
        # https://github.com/pytorch/pytorch/issues/10148
        # critic_loss = F.mse_loss(q_expected, q_target.detach())
        critic_loss = ((q_expected - q_target.detach()) ** 2).mean()

        self._critic_optimizer.zero_grad()
        critic_loss.backward()
        self._critic_optimizer.step()

    def update_targets(self):
        self._soft_update(self._actor_local, self._actor_target, self._soft_update_tau)
        self._soft_update(self._critic_local, self._critic_target, self._soft_update_tau)

    def reset(self):
        self.noise.reset()

    @staticmethod
    def _soft_update(local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ * θ_local + (1 - τ) * θ_target
        :param local_model: model will be copied from
        :param target_model: model will be copied to
        :param tau: interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
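
A hypothetical sketch of how one learning step might compose the Brain methods above for several agents; the sampling code, the per-agent tensor lists, and the flattened (batch, n_agents * size) critic inputs are assumptions:

import torch


def maddpg_learn_step(brains, obs_list, actions_list, rewards_list, next_obs_list, dones_list):
    """obs_list[i], actions_list[i], ... hold agent i's batch tensors."""
    all_obs = torch.cat(obs_list, dim=1)
    all_actions = torch.cat(actions_list, dim=1)
    all_next_obs = torch.cat(next_obs_list, dim=1)

    # Target-policy actions of every agent, used only inside the critics' TD targets.
    all_next_actions = torch.cat(
        [b.act(next_obs_list[i], target=True) for i, b in enumerate(brains)], dim=1)

    for i, brain in enumerate(brains):
        # Centralized critic update for agent i.
        brain.update_critic(rewards_list[i], dones_list[i],
                            all_obs, all_actions, all_next_obs, all_next_actions)

        # Actor update: keep gradients only through agent i's own policy.
        all_pred_actions = torch.cat(
            [b.act(obs_list[j], train=(j == i)) if j == i
             else b.act(obs_list[j]).detach()
             for j, b in enumerate(brains)], dim=1)
        brain.update_actor(all_obs, all_pred_actions)

        brain.update_targets()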
Code Example #9
File: ddpg_main.py  Project: cyzhao1991/dream-RL
def main(gpu_num, exp_num, env=None):
    dir_name = 'Data/checkpoint/'
    if not os.path.isdir(dir_name):
        os.makedirs(dir_name)

    with open('log.txt', 'a') as text_file:
        text_file.write('gpu %i exp %i started.\n' % (gpu_num, exp_num))

    with tf.device('/gpu:%i' % (gpu_num)):
        pms = Paras_base().pms
        pms.save_model = True
        pms.save_dir = dir_name
        env = CartPoleEnv() if env is None else env
        action_size = env.action_space.shape[0]
        observation_size = env.observation_space.shape[0]
        max_action = env.action_space.high[0]
        pms.obs_shape = observation_size
        pms.max_iter = 1000000
        pms.action_shape = action_size
        pms.max_action = max_action
        pms.num_of_paths = 100
        pms.name_scope = 'ddpg'
        config = tf.ConfigProto()
        config.gpu_options.per_process_gpu_memory_fraction = 0.20
        sess = tf.Session(config=config)

        state_ph = tf.placeholder(tf.float32, [None, pms.obs_shape])
        action_ph = tf.placeholder(tf.float32, [None, pms.action_shape])
        critic_input_ph = tf.concat([state_ph, action_ph], axis=1)
        actor_net = Fcnn(sess,
                         pms.obs_shape,
                         pms.action_shape, [400, 300],
                         name=pms.name_scope + '_actor_r',
                         if_bias=[False],
                         activation=['relu', 'relu', 'None'],
                         input_tf=state_ph)
        actor_target_net = Fcnn(sess,
                                pms.obs_shape,
                                pms.action_shape, [400, 300],
                                name=pms.name_scope + '_actor_t',
                                if_bias=[False],
                                activation=['relu', 'relu', 'None'],
                                input_tf=state_ph)
        critic_net = Fcnn(sess,
                          pms.obs_shape + pms.action_shape,
                          1, [400, 300],
                          name=pms.name_scope + '_critic_r',
                          if_bias=[False],
                          activation=['relu', 'relu', 'None'],
                          input_tf=critic_input_ph)
        critic_target_net = Fcnn(sess,
                                 pms.obs_shape + pms.action_shape,
                                 1, [400, 300],
                                 name=pms.name_scope + '_critic_t',
                                 if_bias=[False],
                                 activation=['relu', 'relu', 'None'],
                                 input_tf=critic_input_ph)
        critic_net.state_ph = state_ph
        critic_net.action_ph = action_ph
        actor = DeterministicActor(actor_net, sess, pms)
        actor_target = DeterministicActor(actor_target_net, sess, pms)

        replay_buffer = ReplayBuffer(buffer_size=pms.buffer_size)
        ounoise = OUNoise(pms.action_shape)
        learn_agent = DDPGagent(env, actor, critic_net, actor_target,
                                critic_target_net, replay_buffer, ounoise,
                                sess, pms, [None])

    saver = tf.train.Saver()
    learn_agent.saver = saver
    sess.run(tf.global_variables_initializer())
    saving_result = learn_agent.learn()
    sess.close()
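
A hypothetical command-line entry point for this script; the argument names are illustrative:

if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--gpu', type=int, default=0, help='GPU index passed to main()')
    parser.add_argument('--exp', type=int, default=0, help='experiment number passed to main()')
    args = parser.parse_args()

    main(args.gpu, args.exp)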
Code Example #10
class DDPG():
    """Reinforcement Learning agent that learns using DDPG."""
    def __init__(self, task, buffer_size, batch_size, gamma, tau,
                 actor_dropout, critic_dropout, exploration_theta,
                 exploration_sigma, actor_lr, critic_lr):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high
        self.actor_dropout = actor_dropout
        self.critic_dropout = critic_dropout
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high,
                                 self.actor_dropout, self.actor_lr)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high,
                                  self.actor_dropout, self.actor_lr)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size,
                                   self.critic_dropout, self.critic_lr)
        self.critic_target = Critic(self.state_size, self.action_size,
                                    self.critic_dropout, self.critic_lr)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 5
        self.exploration_theta = exploration_theta
        self.exploration_sigma = exploration_sigma
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.memory = PrioritizedReplayBuffer(self.buffer_size,
                                              self.batch_size)

        # Algorithm parameters
        self.gamma = gamma  # discount factor
        self.tau = tau  # for soft update of target parameters

        self.best_score = -np.inf

    def reset_episode(self):
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state

        self.total_reward = 0.0
        self.count = 0
        return state

    def step(self, action, reward, next_state, done):
        # Save experience / reward
        #self.memory.add(self.last_state, action, reward, next_state, done)
        #Generate the parameters in order to calculate the TD error
        next_state_predict = np.reshape(next_state, [-1, self.state_size])
        last_state_predict = np.reshape(self.last_state, [-1, self.state_size])
        action_predict = np.reshape(action, [-1, self.action_size])
        #next_state_action = np.concatenate([next_state, action])
        Q_target_next = self.critic_target.model.predict(
            [next_state_predict, action_predict])[0]
        Q_local = self.critic_local.model.predict(
            [last_state_predict, action_predict])[0]

        #Calculate the TD error in order to generate the priority value of the experience
        td_error = reward + self.gamma * Q_target_next - Q_local

        #Normalize the TD error with TANH as advised by Google's DeepMind paper "Prioritized Experience Replay": https://arxiv.org/pdf/1511.05952.pdf
        #td_error = math.tanh(td_error[0])

        self.memory.add(self.last_state, action, reward, next_state, done,
                        abs(td_error[0]))

        self.total_reward += reward
        self.count += 1

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences, idx_sample, is_weights = self.memory.sample_priority()
            self.learn(experiences, idx_sample, is_weights)

        # Roll over last state and action
        self.last_state = next_state

    def act(self, state, test=False):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        if test == False:
            return list(action +
                        self.noise.sample())  # add some noise for exploration
        else:
            return list(action)

    def learn(self, experiences, idx_sample, is_weights):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences
                            if e is not None]).astype(np.float32).reshape(
                                -1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None
                            ]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences
                          if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])
        is_weights = is_weights.reshape(-1, 1)

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (
            1 - dones) * is_weights
        self.critic_local.model.train_on_batch(x=[states, actions],
                                               y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients,
                                   1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

        #Generate the new TD error value and update the priority value within the Replay Buffer
        td_error = rewards + self.gamma * Q_targets_next * (1 -
                                                            dones) - Q_targets

        #Normalize the TD error with TANH as advised by Google's DeepMind paper "Prioritized Experience Replay": https://arxiv.org/pdf/1511.05952.pdf
        #td_error = np.tanh(td_error)

        self.memory.update_priority(idx=idx_sample, error=td_error)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(
            target_weights
        ), "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 -
                                                  self.tau) * target_weights
        target_model.set_weights(new_weights)

    def test_control(self, file_output='data.txt'):
        state = self.reset_episode()
        done = False
        #Results with the conditions of the quadcopter
        labels = [
            'time', 'x', 'y', 'z', 'phi', 'theta', 'psi', 'x_velocity',
            'y_velocity', 'z_velocity', 'phi_velocity', 'theta_velocity',
            'psi_velocity', 'rotor_speed1', 'rotor_speed2', 'rotor_speed3',
            'rotor_speed4'
        ]
        results = {x: [] for x in labels}

        # Run the simulation, and save the results.
        with open(file_output, 'w') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(labels)
            while True:
                action = self.act(state, test=True)
                #action = self.act(state, test=False)
                next_state, reward, done = self.task.step(action)
                state = next_state
                to_write = [self.task.sim.time] + list(
                    self.task.sim.pose) + list(self.task.sim.v) + list(
                        self.task.sim.angular_v) + list(action)
                for ii in range(len(labels)):
                    results[labels[ii]].append(to_write[ii])
                writer.writerow(to_write)
                if done:
                    break
        #Shows the results of the control
        control_results(results)

    #Useful for testing
    def update_score(self):
        self.score = self.total_reward / float(
            self.count) if self.count else 0.0
        if self.score > self.best_score:
            self.best_score = self.score
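
The PrioritizedReplayBuffer used by this agent is not shown; a minimal proportional-variant sketch matching the (buffer_size, batch_size) constructor and the add/sample_priority/update_priority/len interface could look like this (the alpha, beta, and eps defaults are assumptions):

from collections import namedtuple

import numpy as np

Experience = namedtuple("Experience",
                        ["state", "action", "reward", "next_state", "done"])


class PrioritizedReplayBuffer:
    def __init__(self, buffer_size, batch_size, alpha=0.6, beta=0.4, eps=1e-3):
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.alpha = alpha    # how strongly priorities shape sampling
        self.beta = beta      # importance-sampling correction strength
        self.eps = eps        # keeps every priority strictly positive
        self.memory = []
        self.priorities = []

    def add(self, state, action, reward, next_state, done, error):
        if len(self.memory) >= self.buffer_size:   # drop the oldest entry when full
            self.memory.pop(0)
            self.priorities.pop(0)
        self.memory.append(Experience(state, action, reward, next_state, done))
        self.priorities.append((abs(error) + self.eps) ** self.alpha)

    def sample_priority(self):
        # Sample indices proportionally to priority; return experiences, their
        # indices, and normalized importance-sampling weights.
        probs = np.asarray(self.priorities) / np.sum(self.priorities)
        idx_sample = np.random.choice(len(self.memory), self.batch_size, p=probs)
        experiences = [self.memory[i] for i in idx_sample]
        is_weights = (len(self.memory) * probs[idx_sample]) ** (-self.beta)
        is_weights = is_weights / is_weights.max()
        return experiences, idx_sample, is_weights

    def update_priority(self, idx, error):
        for i, e in zip(idx, np.abs(np.ravel(error))):
            self.priorities[i] = (float(e) + self.eps) ** self.alpha

    def __len__(self):
        return len(self.memory)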