Example #1
    def __init__(self, n_states, n_actions, hidden_dim):
        """Agent class that choose action and train

        Args:
            input_dim (int): input dimension
            output_dim (int): output dimension
            hidden_dim (int): hidden dimension
        """

        self.q_local = QNetwork(n_states, n_actions, hidden_dim=hidden_dim).to(device)
        self.q_target = QNetwork(n_states, n_actions, hidden_dim=hidden_dim).to(device)

        self.mse_loss = torch.nn.MSELoss()
        self.optim = optim.Adam(self.q_local.parameters(), lr=LEARNING_RATE)

        self.n_states = n_states
        self.n_actions = n_actions

        #  ReplayMemory: trajectory is saved here
        self.replay_memory = ReplayMemory(10000)
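
This example (and Example #3 below) assumes a `ReplayMemory` class and a `Transition` tuple defined elsewhere in the repository. A minimal sketch of such a buffer, with field names inferred from how `learn()` unpacks the batch later; the actual implementation in those projects may differ:

import random
from collections import deque, namedtuple

# Field names inferred from how the batch is unpacked in learn(); assumed, not verified.
Transition = namedtuple('Transition',
                        ('state', 'action', 'reward', 'next_state', 'done'))


class ReplayMemory(object):
    """Fixed-size FIFO buffer of transitions (minimal sketch)."""

    def __init__(self, capacity):
        self.memory = deque(maxlen=capacity)

    def push(self, *args):
        """Store a transition, discarding the oldest one when the buffer is full."""
        self.memory.append(Transition(*args))

    def sample(self, batch_size):
        """Return a uniformly sampled minibatch of transitions."""
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)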
Example #2
    # where it is shared
    print("Creating optimizers...")
    optimizer = torch.optim.RMSprop(DQN_main.parameters())
    # optimizer_next_object = torch.optim.RMSprop(DQN_next_object_main.parameters())
    # optimizer_predicate = torch.optim.RMSprop(DQN_predicate_main.parameters())
    # optimizer_attribute = torch.optim.RMSprop(DQN_attribute_main.parameters())
    print("Done!")

    # define loss functions
    loss_fn = nn.MSELoss()
    # loss_fn_predicate = nn.MSELoss()
    # loss_fn_next_object = nn.MSELoss()

    # create replay buffer
    print("Creating replay buffer...")
    replay_buffer = ReplayMemory(replay_buffer_capacity,
                                 replay_buffer_minimum_number_samples)
    print("Done!")

    # load skip thought model
    # skip_thought_model = skipthoughts.load_model()
    # skip_thought_encoder = skipthoughts.Encoder(skip_thought_model)

    # load train data samples
    if args.train:
        train_data_samples = json.load(open(args.train_data))
        train_dataset = VGDataset(train_data_samples, args.images_dir)
        train_data_loader = DataLoader(dataset=train_dataset,
                                       batch_size=args.batch_size,
                                       shuffle=True,
                                       num_workers=args.num_workers,
                                       collate_fn=collate)
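
Unlike the single-argument buffer in the other examples, this `ReplayMemory` is constructed with both a capacity and a minimum number of samples. A hypothetical variant consistent with that signature, which only allows sampling once the minimum has been collected (the real implementation in this project may differ):

import random
from collections import deque


class ReplayMemory(object):
    """Hypothetical replay buffer that refuses to sample until it holds enough transitions."""

    def __init__(self, capacity, minimum_number_samples):
        self.memory = deque(maxlen=capacity)
        self.minimum_number_samples = minimum_number_samples

    def push(self, transition):
        self.memory.append(transition)

    def can_sample(self):
        return len(self.memory) >= self.minimum_number_samples

    def sample(self, batch_size):
        assert self.can_sample(), "replay buffer has not reached its minimum size yet"
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)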
Example #3
class Agent(object):
    def __init__(self, n_states, n_actions, hidden_dim):
        """Agent class that choose action and train

        Args:
            input_dim (int): input dimension
            output_dim (int): output dimension
            hidden_dim (int): hidden dimension
        """

        self.q_local = QNetwork(n_states, n_actions, hidden_dim=hidden_dim).to(device)
        self.q_target = QNetwork(n_states, n_actions, hidden_dim=hidden_dim).to(device)

        self.mse_loss = torch.nn.MSELoss()
        self.optim = optim.Adam(self.q_local.parameters(), lr=LEARNING_RATE)

        self.n_states = n_states
        self.n_actions = n_actions

        #  ReplayMemory: trajectory is saved here
        self.replay_memory = ReplayMemory(10000)

    def get_action(self, state, eps, check_eps=True):
        """Returns an action

        Args:
            state : 2-D tensor of shape (n, input_dim)
            eps (float): eps-greedy for exploration

        Returns: int: action index
        """
        global steps_done
        sample = random.random()

        if not check_eps or sample > eps:
            with torch.no_grad():
                # max(1) returns (values, indices) over each row; the indices are
                # the positions of the maxima, i.e. the greedy actions, so we pick
                # the action with the largest expected reward.
                # (`Variable`/`volatile` are deprecated; `torch.no_grad()` is the
                # modern replacement.)
                return self.q_local(
                    Variable(state).type(FloatTensor)).data.max(1)[1].view(1, 1)
        else:
            ## return LongTensor([[random.randrange(2)]])
            return torch.tensor([[random.randrange(self.n_actions)]],
                                device=device)

    def learn(self, experiences, gamma):
        """Prepare minibatch and train them

        Args:
        experiences (List[Transition]): Minibatch of `Transition`
        gamma (float): Discount rate of Q_target
        """

        if len(self.replay_memory.memory) < BATCH_SIZE:
            return

        transitions = self.replay_memory.sample(BATCH_SIZE)

        batch = Transition(*zip(*transitions))

        states = torch.cat(batch.state)
        actions = torch.cat(batch.action)
        rewards = torch.cat(batch.reward)
        next_states = torch.cat(batch.next_state)
        dones = torch.cat(batch.done)

        # Double DQN target: the local network chooses the greedy action for each
        # next state, and the target network evaluates that chosen action.
        Q_max_action = self.q_local(next_states).detach().max(1)[1].unsqueeze(1)
        Q_targets_next = self.q_target(next_states).gather(1, Q_max_action).reshape(-1)

        # Compute the TD targets
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Q(s_t, a) according to the local network, for the actions actually taken
        Q_expected = self.q_local(states).gather(1, actions)

        #self.q_local.train(mode=True)
        self.optim.zero_grad()

        #print('Q_expected.shape: ', Q_expected.shape)
        #print('Q_targets_next.shape: ', Q_targets_next.shape)
        #print('Q_targets.shape: ', Q_targets.shape)

        loss = self.mse_loss(Q_expected, Q_targets.unsqueeze(1))

        # backpropagation of loss to NN
        loss.backward()
        self.optim.step()
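
The `QNetwork` instantiated by this agent is defined elsewhere in the project; a minimal sketch of a two-layer MLP matching the constructor signature used above (the actual architecture is an assumption):

import torch.nn as nn
import torch.nn.functional as F


class QNetwork(nn.Module):
    """Minimal MLP mapping a state vector to one Q-value per action (assumed architecture)."""

    def __init__(self, n_states, n_actions, hidden_dim=16):
        super(QNetwork, self).__init__()
        self.fc1 = nn.Linear(n_states, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, n_actions)

    def forward(self, state):
        x = F.relu(self.fc1(state))
        return self.fc2(x)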
Example #4
    def __init__(self, gamma, tau, actor_hidden_size, critic_hidden_size,
                 observation_space, action_space, args):

        self.num_inputs = observation_space.shape[0]
        self.action_space = action_space
        self.actor_hidden_size = actor_hidden_size
        self.critic_hidden_size = critic_hidden_size
        self.comm_hidden_size = actor_hidden_size // 2
        self.gamma = gamma
        self.tau = tau
        self.args = args
        # queue of samples used for updating the attention unit
        self.queue = queue.Queue()

        # Define actor part 1
        self.actor_p1 = ActorPart1(self.num_inputs,
                                   actor_hidden_size).to(device)
        self.actor_target_p1 = ActorPart1(self.num_inputs,
                                          actor_hidden_size).to(device)

        # attention unit is not end-to-end trained
        self.atten = AttentionUnit(actor_hidden_size,
                                   actor_hidden_size).to(device)
        self.atten_optim = Adam(self.atten.parameters(), lr=self.args.actor_lr)

        # Define Communication Channel
        self.comm = CommunicationChannel(actor_hidden_size,
                                         self.comm_hidden_size).to(device)
        self.comm_target = CommunicationChannel(
            actor_hidden_size, self.comm_hidden_size).to(device)
        self.comm_optim = Adam(self.comm.parameters(), lr=self.args.actor_lr)

        # Define actor part 2
        # input -- [thoughts, integrated thoughts]
        self.actor_p2 = ActorPart2(
            actor_hidden_size + self.comm_hidden_size * 2, self.action_space,
            actor_hidden_size).to(device)
        self.actor_target_p2 = ActorPart2(
            actor_hidden_size + self.comm_hidden_size * 2, self.action_space,
            actor_hidden_size).to(device)
        self.actor_optim = Adam([{
            'params': self.actor_p1.parameters(),
            'lr': self.args.actor_lr
        }, {
            'params': self.actor_p2.parameters(),
            'lr': self.args.actor_lr
        }])

        self.critic = Critic(self.num_inputs, self.action_space,
                             critic_hidden_size).to(device)
        self.critic_target = Critic(self.num_inputs, self.action_space,
                                    critic_hidden_size).to(device)
        self.critic_optim = Adam(self.critic.parameters(),
                                 lr=self.args.critic_lr)

        # Make sure the target networks start with the same weights
        hard_update(self.actor_target_p1, self.actor_p1)
        hard_update(self.comm_target, self.comm)
        hard_update(self.actor_target_p2, self.actor_p2)
        hard_update(self.critic_target, self.critic)

        # Create replay buffer
        self.memory = ReplayMemory(args.memory_size)
        self.random_process = OrnsteinUhlenbeckProcess(size=action_space.n,
                                                       theta=args.ou_theta,
                                                       mu=args.ou_mu,
                                                       sigma=args.ou_sigma)
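
The `OrnsteinUhlenbeckProcess` used for exploration noise is defined elsewhere. A minimal sketch of the usual discretized OU update, x_{t+1} = x_t + theta*(mu - x_t) + sigma*N(0, 1) with dt = 1; the constructor arguments match how it is called above, but the internals are an assumption:

import numpy as np


class OrnsteinUhlenbeckProcess(object):
    """Temporally correlated exploration noise (minimal sketch, dt assumed to be 1)."""

    def __init__(self, size, theta=0.15, mu=0.0, sigma=0.2):
        self.size = size
        self.theta = theta
        self.mu = mu
        self.sigma = sigma
        self.reset()

    def reset(self):
        # Start every episode at the long-run mean.
        self.x = np.ones(self.size) * self.mu

    def sample(self):
        # Mean-reverting step plus Gaussian perturbation.
        dx = self.theta * (self.mu - self.x) + self.sigma * np.random.randn(self.size)
        self.x = self.x + dx
        return self.x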
Example #5
class ATOC_trainer(object):
    def __init__(self, gamma, tau, actor_hidden_size, critic_hidden_size,
                 observation_space, action_space, args):

        self.num_inputs = observation_space.shape[0]
        self.action_space = action_space
        self.actor_hidden_size = actor_hidden_size
        self.critic_hidden_size = critic_hidden_size
        self.comm_hidden_size = actor_hidden_size // 2
        self.gamma = gamma
        self.tau = tau
        self.args = args
        # queue of samples used for updating the attention unit
        self.queue = queue.Queue()

        # Define actor part 1
        self.actor_p1 = ActorPart1(self.num_inputs,
                                   actor_hidden_size).to(device)
        self.actor_target_p1 = ActorPart1(self.num_inputs,
                                          actor_hidden_size).to(device)

        # attention unit is not end-to-end trained
        self.atten = AttentionUnit(actor_hidden_size,
                                   actor_hidden_size).to(device)
        self.atten_optim = Adam(self.atten.parameters(), lr=self.args.actor_lr)

        # Define Communication Channel
        self.comm = CommunicationChannel(actor_hidden_size,
                                         self.comm_hidden_size).to(device)
        self.comm_target = CommunicationChannel(
            actor_hidden_size, self.comm_hidden_size).to(device)
        self.comm_optim = Adam(self.comm.parameters(), lr=self.args.actor_lr)

        # Define actor part 2
        # input -- [thoughts, integrated thoughts]
        self.actor_p2 = ActorPart2(
            actor_hidden_size + self.comm_hidden_size * 2, self.action_space,
            actor_hidden_size).to(device)
        self.actor_target_p2 = ActorPart2(
            actor_hidden_size + self.comm_hidden_size * 2, self.action_space,
            actor_hidden_size).to(device)
        self.actor_optim = Adam([{
            'params': self.actor_p1.parameters(),
            'lr': self.args.actor_lr
        }, {
            'params': self.actor_p2.parameters(),
            'lr': self.args.actor_lr
        }])

        self.critic = Critic(self.num_inputs, self.action_space,
                             critic_hidden_size).to(device)
        self.critic_target = Critic(self.num_inputs, self.action_space,
                                    critic_hidden_size).to(device)
        self.critic_optim = Adam(self.critic.parameters(),
                                 lr=self.args.critic_lr)

        # Make sure the target networks start with the same weights
        hard_update(self.actor_target_p1, self.actor_p1)
        hard_update(self.comm_target, self.comm)
        hard_update(self.actor_target_p2, self.actor_p2)
        hard_update(self.critic_target, self.critic)

        # Create replay buffer
        self.memory = ReplayMemory(args.memory_size)
        self.random_process = OrnsteinUhlenbeckProcess(size=action_space.n,
                                                       theta=args.ou_theta,
                                                       mu=args.ou_mu,
                                                       sigma=args.ou_sigma)

    def update_thoughts(self, thoughts, C):
        batch_size = 1
        nagents = thoughts.shape[0]
        thoughts = thoughts.clone().detach()

        for index in range(nagents):
            if not C[index, index]: continue
            input_comm = []
            # collect the thoughts of agent_i's neighbours
            for j in range(nagents):
                if C[index, j]:
                    input_comm.append(thoughts[j])
            input_comm = torch.stack(input_comm, dim=0).unsqueeze(
                0)  # (1, m, actor_hidden_size)
            # feed the communication channel to integrate the thoughts
            hidden_state = self.initHidden(batch_size)
            integrated_thoughts, _ = self.comm(
                input_comm, hidden_state)  # (1, m, 2*comm_hidden_size)
            integrated_thoughts = integrated_thoughts.squeeze()

            # write the integrated thoughts back to the group members
            thoughts[C[index]] = integrated_thoughts

        return thoughts

    def select_action(self, thoughts, inter_thoughts, C, action_noise=True):
        nagents = thoughts.shape[0]

        # merge individual thoughts and integrated thoughts
        is_comm = C.any(dim=0)  # (nagents)
        # agents without communication are padded with zeros
        for i in range(nagents):
            if not is_comm[i]:
                inter_thoughts[i] = 0

        # TODO: should the order be [integrated_thoughts, individual_thoughts]?
        # (nagents, actor_hidden_size+2*comm_hidden_size)
        input_actor2 = torch.cat((thoughts, inter_thoughts), dim=-1)
        # input to part II of the actor
        actor2_action = self.actor_p2(input_actor2)
        action = actor2_action.data.numpy()

        return action

    def calc_delta_Q(self, obs_n, action_n, thoughts, C):
        obs_n = torch.FloatTensor(obs_n).to(device)
        action_n = torch.FloatTensor(action_n).to(device)
        nagents = obs_n.shape[0]

        for index in range(nagents):
            group_Q = []
            actual_group_Q = []
            if not C[index, index]: continue
            for j in range(nagents):
                if not C[index, j]: continue
                h_j = torch.cat((thoughts[j], torch.zeros_like(thoughts[j])),
                                dim=-1).unsqueeze(0)
                action_j = self.actor_p2(h_j)  # (1, action_shape)
                actual_action_j = action_n[j].unsqueeze(0)  # (1, action_shape)

                Q_j = self.critic(obs_n[j].unsqueeze(0), action_j)  # (1, 1)
                actual_Q_j = self.critic(obs_n[j].unsqueeze(0),
                                         actual_action_j)

                group_Q.append(Q_j.squeeze())
                actual_group_Q.append(actual_Q_j.squeeze())
            group_Q = torch.stack(group_Q, dim=0)
            actual_group_Q = torch.stack(actual_group_Q, dim=0)  # (m, )
            delta_Q = actual_group_Q.mean() - group_Q.mean()

            # store the thought and delta_Q
            h_i = thoughts[index].data.numpy()  # (actor_hidden_size, )
            delta_Q = delta_Q.data.numpy()  # 1
            self.queue.put((h_i, delta_Q))

    def update_parameters(self):
        batch_size = self.args.batch_size
        batch = self.memory.sample(batch_size)
        obs_n_batch = torch.FloatTensor(batch.obs_n).to(
            device)  # (batch_size, nagents, obs_shape)
        action_n_batch = torch.FloatTensor(batch.action_n).to(
            device)  # (batch_size, nagents, action_shape)
        reward_n_batch = torch.FloatTensor(batch.reward_n).unsqueeze(-1).to(
            device)  # (batch_size, nagents, 1)
        next_obs_n_batch = torch.FloatTensor(batch.next_obs_n).to(
            device)  # (batch_size, nagents, obs_shape)
        C_batch = torch.BoolTensor(batch.C).to(
            device)  # (batch_size, nagents, nagents)
        nagents = obs_n_batch.shape[1]

        # -----------------------------------------------------------------------------------------
        #                               sample agents without communication
        # -----------------------------------------------------------------------------------------
        # True --> communication, False --> no communication
        # TODO: uncomment the block below
        # ind = C_batch.any(dim=1)                                            # (batch_size, nagents)
        # obs_n = obs_n_batch[ind==False]
        # action_n = action_n_batch[ind==False]
        # reward_n = reward_n_batch[ind==False]
        # next_obs_n = next_obs_n_batch[ind==False]                           # (sample_agents, shape)

        # # update critic
        # thoughts_n = self.actor_target_p1(next_obs_n)                       # (sample_agents, actor_hiddensize)
        # padding = torch.zeros(thoughts_n.shape[0], 2*self.comm_hidden_size)
        # input_target_actor2 = torch.cat((thoughts_n, padding), dim=-1)      # (sample_agents, hiddensize)
        # next_action_n = self.actor_target_p2(input_target_actor2)           # (sample_agents, action_shape)
        # next_Q_n = self.critic_target(next_obs_n, next_action_n)            # (sample_agents, 1)

        # target_Q_n = reward_n + (self.gamma * next_Q_n).detach()            # (sample_agents, 1)
        # Q_n = self.critic(obs_n, action_n)                                  # (sample_agents, 1)

        # value_loss = F.mse_loss(target_Q_n, Q_n)
        # self.critic_optim.zero_grad()
        # value_loss.backward()
        # self.critic_optim.step()

        # # update actor
        # thoughts_actor = self.actor_p1(obs_n)
        # padding_actor = torch.zeros(thoughts_actor.shape[0], 2*self.comm_hidden_size)
        # input_actor2 = torch.torch.cat((thoughts_actor, padding_actor), dim=-1)
        # action_n_actor = self.actor_p2(input_actor2)
        # policy_loss = -self.critic(obs_n, action_n_actor)

        # policy_loss = policy_loss.mean()
        # self.actor_optim.zero_grad()
        # policy_loss.backward()
        # self.actor_optim.step()

        # -----------------------------------------------------------------------------------------
        #                            sample agents with communication
        # -----------------------------------------------------------------------------------------

        # update critic
        target_Q = []
        Q = []
        for batch_index in range(batch_size):
            is_comm = C_batch[batch_index].any(dim=0)  # (nagents,)
            next_thoughts_n = self.actor_target_p1(
                next_obs_n_batch[batch_index])  # (nagents, actor_hiddensize)
            # communication
            padding = next_thoughts_n.clone().detach()
            for agent_i in range(nagents):
                if not C_batch[batch_index, agent_i, agent_i]: continue

                thoughts_m = padding[C_batch[batch_index, agent_i]].unsqueeze(
                    0)  # (1, m, actor_hiddensize)
                hidden_state = self.initHidden(1)
                inter_thoughts, _ = self.comm_target(
                    thoughts_m, hidden_state)  # (1, m, 2*comm_hidden_size)
                inter_thoughts = inter_thoughts.squeeze()  # (m, 2*comm_hidden_size)

                # write the integrated thoughts back into the clone -- communication within the group
                # TODO: can the in-place assignment be avoided?
                padding = padding.clone()
                padding[C_batch[batch_index, agent_i]] = inter_thoughts

            # select action for m agents with communication
            padding[~is_comm] = 0.0
            input_target_actor2 = torch.cat(
                (next_thoughts_n, padding),
                dim=-1)  # (nagents, actor_hidden_size + 2*comm_hidden_size)
            next_action_n = self.actor_target_p2(
                input_target_actor2)  # (nagents, action_shape)
            # print('next_action_n shape', next_action_n.shape)
            next_obs_m = next_obs_n_batch[batch_index,
                                          is_comm]  # (m, obs_shape)
            next_action_m = next_action_n[is_comm]  # (m, action_shape)

            next_Q_m = self.critic_target(next_obs_m, next_action_m)  # (m, 1)
            reward_m = reward_n_batch[batch_index, is_comm]  # (m, 1)
            target_Q_m = reward_m + (self.gamma * next_Q_m).detach()  # (m, 1)

            obs_m = obs_n_batch[batch_index, is_comm]
            action_m = action_n_batch[batch_index, is_comm]
            Q_m = self.critic(obs_m, action_m)

            target_Q.append(target_Q_m)
            Q.append(Q_m)

        target_Q = torch.stack(target_Q, dim=0)
        Q = torch.stack(Q, dim=0)
        critic_loss = F.mse_loss(target_Q, Q)
        self.critic_optim.zero_grad()
        critic_loss.backward()
        self.critic_optim.step()

        # update actor and communication channel
        actor_loss = []
        for batch_index in range(batch_size):
            is_comm = C_batch[batch_index].any(dim=0)  # (nagents, )
            thoughts_n = self.actor_p1(
                obs_n_batch[batch_index])  # (nagents, actor_hiddensize)
            # communication
            padding = thoughts_n.clone().detach()
            for agent_i in range(nagents):
                if not C_batch[batch_index, agent_i, agent_i]: continue

                thoughts_m = padding[C_batch[batch_index, agent_i]].unsqueeze(
                    0)  # (1, m, actor_hiddensize)
                hidden_state = self.initHidden(1)
                inter_thoughts, _ = self.comm(
                    thoughts_m, hidden_state)  # (1, m, 2*comm_hiddensize)
                inter_thoughts = inter_thoughts.squeeze()

                # TODO: Can this avoid in-place operation and pass the gradient?
                padding = padding.clone()
                padding[C_batch[batch_index, agent_i]] = inter_thoughts

            # select action for m agents with communication
            padding[~is_comm] = 0.0
            input_actor2 = torch.cat(
                (thoughts_n, padding),
                dim=-1)  # (nagents, a_hiddensize+c_hiddensize)
            action_n = self.actor_p2(input_actor2)  # (nagents, action shape)
            action_m = action_n[is_comm]  # (m, action shape)
            obs_m = obs_n_batch[batch_index, is_comm]  # (m, obs shape)

            actor_loss_batch = -self.critic(obs_m, action_m)  # (m, 1)
            actor_loss.append(actor_loss_batch)

        actor_loss = torch.stack(actor_loss, dim=0)  # (batch_size, m, 1)
        # print('actor_loss shape', actor_loss.shape)
        actor_loss = actor_loss.mean()
        self.actor_optim.zero_grad()
        self.comm_optim.zero_grad()
        actor_loss.backward()
        self.actor_optim.step()
        self.comm_optim.step()

        soft_update(self.actor_target_p1, self.actor_p1, self.tau)
        soft_update(self.actor_target_p2, self.actor_p2, self.tau)
        soft_update(self.comm_target, self.comm, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)
        return critic_loss.item(), actor_loss.item()

    def update_attention_unit(self):
        h_i_batch = []
        delta_Q_batch = []
        while not self.queue.empty():
            h_i, delta_Q = self.queue.get()
            h_i_batch.append(h_i)
            delta_Q_batch.append(delta_Q)

        print("delta_Q_batch values", delta_Q_batch)
        h_i_batch = torch.FloatTensor(h_i_batch).to(
            device)  # (batch_size, actor_hiddensize)
        delta_Q_batch = torch.FloatTensor(delta_Q_batch).to(
            device)  # (batch_size, )
        p_i = self.atten(h_i_batch)  # (batch_size, 1)
        p_i = p_i.squeeze()

        # min-max normalization
        delta_Q_batch = (delta_Q_batch - delta_Q_batch.min()) / (
            delta_Q_batch.max() - delta_Q_batch.min())

        # binary cross-entropy between the attention output p_i and the
        # normalized delta_Q; reduce to a scalar so backward() can be called
        loss = (-delta_Q_batch * torch.log(p_i) -
                (1 - delta_Q_batch) * torch.log(1 - p_i)).mean()
        self.atten_optim.zero_grad()
        loss.backward()
        self.atten_optim.step()

    def get_thoughts(self, obs_n):
        obs_n_tensor = torch.FloatTensor(obs_n).to(
            device)  # (nagents, obs_shape)
        thoughts = self.actor_p1(obs_n_tensor)
        return thoughts

    def initiate_group(self, obs_n, m, thoughts):
        obs_n = np.array(obs_n)
        nagents = obs_n.shape[0]

        # decide whether to initiate communication
        atten_out = self.atten(thoughts)  # (nagents, 1)
        is_comm = (atten_out > 0.5).squeeze()  # (nagents, )
        C = torch.zeros(nagents, nagents).bool()

        # relative position
        other_pos = (obs_n[:, -(nagents - 1) * 2:]).reshape(
            -1, nagents - 1, 2)  # (nagents, nagents-1, 2)
        other_dist = np.sqrt(np.sum(np.square(other_pos),
                                    axis=-1))  # (nagents, nagents-1)
        # insert each agent's own (zero) distance into other_dist -> total_dist
        total_dist = []
        for i in range(nagents):
            total_dist.append(np.insert(other_dist[i], obj=i, values=0.0))
        total_dist = np.stack(total_dist)  # (nagents, nagents)
        # the id of top-m agents (including itself)
        index = np.argsort(total_dist, axis=-1)
        assert m <= nagents
        neighbour_m = index[:, :m]  # (nagents, m)

        for index, comm in enumerate(is_comm):
            if comm: C[index, neighbour_m[index]] = True

        # TODO: test the other parts of this project without attention unit
        C = torch.zeros(nagents, nagents)
        C[0] = 1
        C = C.bool()

        return C

    def initHidden(self, batch_size):
        return torch.zeros((2 * 1, batch_size, self.comm_hidden_size))

    def save_model(self, env_name, suffix=""):
        if not os.path.exists('models/'):
            os.makedirs('models/')

        save_path = "models/ddpg_{}_{}".format(env_name, suffix)
        model = {
            'actor_p1': self.actor_p1.state_dict(),
            'actor_target_p1': self.actor_target_p1.state_dict(),
            'actor_p2': self.actor_p2.state_dict(),
            'actor_target_p2': self.actor_target_p2.state_dict(),
            'critic': self.critic.state_dict(),
            'critic_target': self.critic_target.state_dict(),
            'comm': self.comm.state_dict(),
            'comm_target': self.comm_target.state_dict(),
            'atten': self.atten.state_dict()
        }
        torch.save(model, save_path)
        print('Saving models to {}'.format(save_path))

    def load_model(self, env_name, suffix=""):
        load_path = "models/ddpg_{}_{}".format(env_name, suffix)
        print('Loading models from {}'.format(load_path))
        model = torch.load(load_path)
        self.actor_p1.load_state_dict(model['actor_p1'])
        self.actor_target_p1.load_state_dict(model['actor_target_p1'])
        self.actor_p2.load_state_dict(model['actor_p2'])
        self.actor_target_p2.load_state_dict(model['actor_target_p2'])
        self.critic.load_state_dict(model['critic'])
        self.critic_target.load_state_dict(model['critic_target'])
        self.comm.load_state_dict(model['comm'])
        self.comm_target.load_state_dict(model['comm_target'])
        self.atten.load_state_dict(model['atten'])
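
The `hard_update` and `soft_update` helpers called in the constructor and in `update_parameters` are the standard DDPG-style target-network updates; a sketch consistent with how they are invoked here (target network first, source network second):

def hard_update(target, source):
    """Copy the source network's parameters into the target network."""
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(param.data)


def soft_update(target, source, tau):
    """Polyak averaging: target <- tau * source + (1 - tau) * target."""
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(tau * param.data + (1.0 - tau) * target_param.data)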
Example #6
class Agent(object):

    def __init__(self, n_states, n_actions, hidden_dim, lr, device):
        """Agent class that choose action and train

        Args:
            n_states (int): input dimension
            n_actions (int): output dimension
            hidden_dim (int): hidden dimension
        """

        self.device = device

        self.q_local = QNetwork(n_states, n_actions, hidden_dim=hidden_dim).to(self.device)
        self.q_target = QNetwork(n_states, n_actions, hidden_dim=hidden_dim).to(self.device)
        
        self.mse_loss = torch.nn.MSELoss()
        self.optim = optim.Adam(self.q_local.parameters(), lr=lr)
        
        self.n_states = n_states
        self.n_actions = n_actions
        

        #  ReplayMemory: trajectory is saved here
        self.replay_memory = ReplayMemory(10000)
        

    def get_action(self, state, eps, check_eps=True):
        """Returns an action

        Args:
            state : 2-D tensor of shape (n, n_states)
            eps (float): eps-greedy for exploration

        Returns: int: action index
        """
        global steps_done
        sample = random.random()

        if not check_eps or sample > eps:
            with torch.no_grad():
                return self.q_local(Variable(state).type(FloatTensor)).data.max(1)[1].view(1, 1)
        else:
            ## return LongTensor([[random.randrange(2)]])
            return torch.tensor([[random.randrange(self.n_actions)]], device=self.device)


    def learn(self, experiences, gamma):
        """Prepare minibatch and train them

        Args:
        experiences (List[Transition]): batch of `Transition`
        gamma (float): Discount rate of Q_target
        """
        
        if len(self.replay_memory.memory) < BATCH_SIZE:
            return
            
        transitions = self.replay_memory.sample(BATCH_SIZE)
        
        batch = Transition(*zip(*transitions))
                        
        states = torch.cat(batch.state)
        actions = torch.cat(batch.action)
        rewards = torch.cat(batch.reward)
        next_states = torch.cat(batch.next_state)
        dones = torch.cat(batch.done)
        
            
        # Compute Q(s_t, a): the model computes Q(s_t) and we select the columns
        # for the actions that were actually taken, according to the current
        # estimate held by the q_local network.
        Q_expected = self.q_local(states).gather(1, actions)

        Q_targets_next = self.q_target(next_states).detach().max(1)[0] 

        # Compute the expected Q values
        Q_targets = rewards + (gamma * Q_targets_next * (1-dones))
        
        self.q_local.train(mode=True)        
        self.optim.zero_grad()
        loss = self.mse_loss(Q_expected, Q_targets.unsqueeze(1))
        # backpropagation of loss to NN        
        loss.backward()
        self.optim.step()
               
        
    def soft_update(self, local_model, target_model, tau):
        """ tau (float): interpolation parameter"""
        
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)     
            
    def hard_update(self, local, target):
        for target_param, param in zip(target.parameters(), local.parameters()):
            target_param.data.copy_(param.data)            
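
This agent also depends on a few module-level names (`FloatTensor`, `steps_done`, `BATCH_SIZE`, `Transition`) that the enclosing script is expected to define. A plausible setup block; the concrete values and field names are assumptions:

import torch
from collections import namedtuple

# Module-level names referenced by the Agent above; the concrete values are assumptions.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')  # also passed to Agent()
FloatTensor = torch.cuda.FloatTensor if torch.cuda.is_available() else torch.FloatTensor
LongTensor = torch.cuda.LongTensor if torch.cuda.is_available() else torch.LongTensor

BATCH_SIZE = 64   # minibatch size consumed in learn()
steps_done = 0    # global step counter referenced in get_action()

# Field names assumed from how learn() unpacks the minibatch.
Transition = namedtuple('Transition',
                        ('state', 'action', 'reward', 'next_state', 'done'))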
Example #7
# Determine the state and action dimensions from the environment so that the
# network layers can be initialized with the correct shapes.

state_size = torch.tensor(env.observation()).shape[0]
act_size = env.action_space.shape[0]

n_actions = 3

policy_net = DQN(state_size, act_size).to(device)
target_net = DQN(state_size, act_size).to(device)
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

optimizer = torch.optim.RMSprop(policy_net.parameters())
memory = ReplayMemory(10000)

steps_done = 0


def converter(observation):
    state = torch.tensor(observation).float().to(device)
    return state


def select_action(state):
    global steps_done
    sample = random.random()
    eps_threshold = EPS_END + (EPS_START - EPS_END) * \
        math.exp(-1. * steps_done / EPS_DECAY)
    steps_done += 1
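
The snippet ends right after the epsilon threshold is computed. In the standard DQN recipe this function follows, the body typically continues along these lines (a sketch under that assumption, not this project's exact code):

    # Sketch of the usual epsilon-greedy branch (assumed continuation, indented
    # to sit inside select_action).
    if sample > eps_threshold:
        with torch.no_grad():
            # Exploit: pick the action with the highest predicted Q-value.
            return policy_net(state).max(-1)[1].view(1, 1)
    else:
        # Explore: pick a random action index.
        return torch.tensor([[random.randrange(n_actions)]],
                            device=device, dtype=torch.long)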
Example #8
vis = Visdom()
win_score = None
win_actor_score = None
win_critic_loss = None

actor = Actor(state_size * 2, action_size * 2).to(device)
actor_target = Actor(state_size * 2, action_size * 2).to(device)
critic = Critic(state_size * 2, n_action=action_size * 2).to(device)
critic_target = Critic(state_size * 2, n_action=action_size * 2).to(device)
for target_param, param in zip(critic_target.parameters(),
                               critic.parameters()):
    target_param.data.copy_(param.data)
for target_param, param in zip(actor_target.parameters(), actor.parameters()):
    target_param.data.copy_(param.data)
replay_buffer = ReplayMemory(args.replay_capacity)
criterion = nn.MSELoss()
optim_critic = torch.optim.Adam(critic.parameters(),
                                lr=args.lr_critic,
                                weight_decay=args.weight_decay_critic)
optim_actor = torch.optim.Adam(actor.parameters(), lr=args.lr_actor)

loss_critic = []
score_actor = []
score = 0
steps = 0
noise_std = args.noise_std_start

for i in range(args.episodes):
    env_info = env.reset(train_mode=True)[brain_name]
    state = torch.from_numpy(