Ejemplo n.º 1
0
class BiCNet():
    def __init__(self, s_dim, a_dim, n_agents, **kwargs):
        self.s_dim = s_dim
        self.a_dim = a_dim
        self.config = kwargs['config']
        self.n_agents = n_agents
        self.device = 'cuda' if self.config.use_cuda else 'cpu'
        # Networks
        self.policy = Actor(s_dim, a_dim, n_agents)
        self.policy_target = Actor(s_dim, a_dim, n_agents)
        self.critic = Critic(s_dim, a_dim, n_agents)
        self.critic_target = Critic(s_dim, a_dim, n_agents)

        if self.config.use_cuda:
            self.policy.cuda()
            self.policy_target.cuda()
            self.critic.cuda()
            self.critic_target.cuda()

        self.policy_optimizer = torch.optim.Adam(self.policy.parameters(),
                                                 lr=self.config.a_lr)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(),
                                                 lr=self.config.c_lr)

        hard_update(self.policy, self.policy_target)
        hard_update(self.critic, self.critic_target)

        self.random_process = OrnsteinUhlenbeckProcess(
            size=self.a_dim,
            theta=self.config.ou_theta,
            mu=self.config.ou_mu,
            sigma=self.config.ou_sigma)
        self.replay_buffer = list()
        self.epsilon = 1.
        self.depsilon = self.epsilon / self.config.epsilon_decay

        self.c_loss = None
        self.a_loss = None
        self.action_log = list()

    def choose_action(self, obs, noisy=True):
        obs = torch.Tensor([obs]).to(self.device)

        action = self.policy(obs).cpu().detach().numpy()[0]
        self.action_log.append(action)

        if noisy:
            for agent_idx in range(self.n_agents):
                pass
                # action[agent_idx] += self.epsilon * self.random_process.sample()
            self.epsilon -= self.depsilon
            self.epsilon = max(self.epsilon, 0.001)
        np.clip(action, -1., 1.)

        return action

    def reset(self):
        self.random_process.reset_states()
        self.action_log.clear()

    def prep_train(self):
        self.policy.train()
        self.critic.train()
        self.policy_target.train()
        self.critic_target.train()

    def prep_eval(self):
        self.policy.eval()
        self.critic.eval()
        self.policy_target.eval()
        self.critic_target.eval()

    def random_action(self):
        return np.random.uniform(low=-1, high=1, size=(self.n_agents, 2))

    def memory(self, s, a, r, s_, done):
        self.replay_buffer.append((s, a, r, s_, done))

        if len(self.replay_buffer) >= self.config.memory_length:
            self.replay_buffer.pop(0)

    def get_batches(self):
        experiences = random.sample(self.replay_buffer, self.config.batch_size)

        state_batches = np.array([_[0] for _ in experiences])
        action_batches = np.array([_[1] for _ in experiences])
        reward_batches = np.array([_[2] for _ in experiences])
        next_state_batches = np.array([_[3] for _ in experiences])
        done_batches = np.array([_[4] for _ in experiences])

        return state_batches, action_batches, reward_batches, next_state_batches, done_batches

    def train(self):

        state_batches, action_batches, reward_batches, next_state_batches, done_batches = self.get_batches(
        )

        state_batches = torch.Tensor(state_batches).to(self.device)
        action_batches = torch.Tensor(action_batches).to(self.device)
        reward_batches = torch.Tensor(reward_batches).reshape(
            self.config.batch_size, self.n_agents, 1).to(self.device)
        next_state_batches = torch.Tensor(next_state_batches).to(self.device)
        done_batches = torch.Tensor(
            (done_batches == False) * 1).reshape(self.config.batch_size,
                                                 self.n_agents,
                                                 1).to(self.device)

        target_next_actions = self.policy_target.forward(next_state_batches)
        target_next_q = self.critic_target.forward(next_state_batches,
                                                   target_next_actions)
        main_q = self.critic(state_batches, action_batches)
        '''
        How to concat each agent's Q value?
        '''
        #target_next_q = target_next_q
        #main_q = main_q.mean(dim=1)
        '''
        Reward Norm
        '''
        # reward_batches = (reward_batches - reward_batches.mean(dim=0)) / reward_batches.std(dim=0) / 1024

        # Critic Loss
        self.critic.zero_grad()
        baselines = reward_batches + done_batches * self.config.gamma * target_next_q
        loss_critic = torch.nn.MSELoss()(main_q, baselines.detach())
        loss_critic.backward()
        torch.nn.utils.clip_grad_norm_(self.critic.parameters(), 0.5)
        self.critic_optimizer.step()

        # Actor Loss
        self.policy.zero_grad()
        clear_action_batches = self.policy.forward(state_batches)
        loss_actor = -self.critic.forward(state_batches,
                                          clear_action_batches).mean()
        loss_actor += (clear_action_batches**2).mean() * 1e-3
        loss_actor.backward()
        torch.nn.utils.clip_grad_norm_(self.policy.parameters(), 0.5)
        self.policy_optimizer.step()

        # This is for logging
        self.c_loss = loss_critic.item()
        self.a_loss = loss_actor.item()

        soft_update(self.policy, self.policy_target, self.config.tau)
        soft_update(self.critic, self.critic_target, self.config.tau)

    def get_loss(self):
        return self.c_loss, self.a_loss

    def get_action_std(self):
        return np.array(self.action_log).std(axis=-1).mean()
Ejemplo n.º 2
0
class Preyer:
    def __init__(self, s_dim, a_dim, **kwargs):
        self.s_dim = s_dim
        self.a_dim = a_dim
        self.config = kwargs['config']
        self.device = 'cuda' if self.config.use_cuda else 'cpu'

        self.actor = Actor(s_dim, a_dim)
        self.actor_target = Actor(s_dim, a_dim)
        self.critic = Critic(s_dim, a_dim, 1)
        self.critic_target = Critic(s_dim, a_dim, 1)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(),
                                                lr=self.config.a_lr)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(),
                                                 lr=self.config.c_lr)
        self.c_loss = 0
        self.a_loss = 0

        if self.config.use_cuda:
            self.actor.cuda()
            self.actor_target.cuda()
            self.critic.cuda()
            self.critic_target.cuda()

        hard_update(self.actor, self.actor_target)
        hard_update(self.critic, self.critic_target)

        self.random_process = OrnsteinUhlenbeckProcess(
            size=self.a_dim,
            theta=self.config.ou_theta,
            mu=self.config.ou_mu,
            sigma=self.config.ou_sigma)
        self.replay_buffer = list()
        self.epsilon = 1.
        self.depsilon = self.epsilon / self.config.epsilon_decay

    def memory(self, s, a, r, s_, done):
        self.replay_buffer.append((s, a, r, s_, done))

        if len(self.replay_buffer) >= self.config.memory_length:
            self.replay_buffer.pop(0)

    def get_batches(self):
        experiences = random.sample(self.replay_buffer, self.config.batch_size)

        state_batches = np.array([_[0] for _ in experiences])
        action_batches = np.array([_[1] for _ in experiences])
        reward_batches = np.array([_[2] for _ in experiences])
        next_state_batches = np.array([_[3] for _ in experiences])
        done_batches = np.array([_[4] for _ in experiences])

        return state_batches, action_batches, reward_batches, next_state_batches, done_batches

    def choose_action(self, s, noisy=True):
        if self.config.use_cuda:
            s = Variable(torch.cuda.FloatTensor(s))
        else:
            s = Variable(torch.FloatTensor(s))
        a = self.actor.forward(s).cpu().detach().numpy()

        if noisy:
            a += max(self.epsilon, 0.001) * self.random_process.sample()
            self.epsilon -= self.depsilon
        a = np.clip(a, -1., 1.)

        return np.array([a])

    def random_action(self):
        action = np.random.uniform(low=-1., high=1., size=(1, self.a_dim))
        return action

    def reset(self):
        self.random_process.reset_states()

    def train(self):
        state_batches, action_batches, reward_batches, next_state_batches, done_batches = self.get_batches(
        )

        state_batches = Variable(torch.Tensor(state_batches).to(self.device))
        action_batches = Variable(
            torch.Tensor(action_batches).reshape(-1, 1).to(self.device))
        reward_batches = Variable(
            torch.Tensor(reward_batches).reshape(-1, 1).to(self.device))
        next_state_batches = Variable(
            torch.Tensor(next_state_batches).to(self.device))
        done_batches = Variable(
            torch.Tensor(
                (done_batches == False) * 1).reshape(-1, 1).to(self.device))

        target_next_actions = self.actor_target.forward(
            next_state_batches).detach()
        target_next_q = self.critic_target.forward(
            next_state_batches, target_next_actions).detach()

        main_q = self.critic(state_batches, action_batches)

        # Critic Loss
        self.critic.zero_grad()
        baselines = reward_batches + done_batches * self.config.gamma * target_next_q
        loss_critic = torch.nn.MSELoss()(main_q, baselines)
        loss_critic.backward()
        self.critic_optimizer.step()

        # Actor Loss
        self.actor.zero_grad()
        clear_action_batches = self.actor.forward(state_batches)
        loss_actor = (
            -self.critic.forward(state_batches, clear_action_batches)).mean()
        loss_actor.backward()
        self.actor_optimizer.step()

        # This is for logging
        self.c_loss = loss_critic.item()
        self.a_loss = loss_actor.item()

        soft_update(self.actor, self.actor_target, self.config.tau)
        soft_update(self.critic, self.critic_target, self.config.tau)

    def getLoss(self):
        return self.c_loss, self.a_loss