Code example #1
File: train.py  Project: horacepan/qap
import random

import torch
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable

# Project-level helpers (GraphGenerator, ReplayBuffer, Struc2Vec, QNet,
# Transition, arg_max_action, get_reward, util) are assumed to be importable
# from elsewhere in the horacepan/qap project; they are not shown here.


def train_v1(eps_start, eps_end, eps_decay, n_step, mem_capacity, num_episodes,
             embed_dim, iters, batch_size=32, gamma=0.99):
    # batch_size and gamma are referenced below; they are assumed
    # hyperparameters and given default values here.
    graph_generator = GraphGenerator(16, 16)
    memory = ReplayBuffer(mem_capacity)
    steps_done = 0
    gnn = Struc2Vec(embed_dim, iters)
    qnet = QNet(embed_dim)
    optimizer = optim.Adam(list(gnn.parameters()) + list(qnet.parameters()),
                           lr=0.0001,
                           weight_decay=1e-4)
    for e in range(num_episodes):
        node_labels, adj, edge_weights = graph_generator.next()
        vtx_feats = gnn(node_labels, adj, edge_weights)
        remaining_vertices = set(range(len(adj)))
        state = Variable(torch.zeros(embed_dim))
        curr_tour = []
        T = len(adj)
        rewards = []
        states = [state]

        for t in range(T):
            eps_threshold = util.get_eps_threshold(eps_start, eps_end,
                                                   eps_decay, steps_done)
            if random.random() > eps_threshold:
                # greedy action; arg_max_action is assumed to return a
                # (vertex, q_value) pair over the remaining vertices
                curr_vtx, _ = arg_max_action(qnet, vtx_feats,
                                             remaining_vertices)
            else:
                # random action (sample from a sequence, not a set)
                curr_vtx = random.choice(tuple(remaining_vertices))

            action = vtx_feats[curr_vtx]
            # reward maintenance
            est_reward = qnet(state, action)  # estimated Q-value; unused below
            reward = get_reward(curr_tour, curr_vtx, edge_weights)
            rewards.append(reward)

            # update states
            curr_tour.append(curr_vtx)
            remaining_vertices.remove(curr_vtx)
            states.append(state + action)
            # wait till after doing the memory stuff to add the state

            # we only do these updates after n steps
            if t >= n_step:
                # next_reward (the best Q-value over the remaining vertices)
                # is computed here but not used below
                _, next_reward = arg_max_action(qnet, vtx_feats,
                                                remaining_vertices)
                state_tminusn = states[-n_step]  # this is a torch tensor
                # curr_tour[-n_step] is the vertex id; index into vtx_feats
                # for its embedding
                action_tminusn = vtx_feats[curr_tour[-n_step]]
                reward_tminusn = sum(rewards[-n_step:])
                memory.push(state_tminusn, action_tminusn, reward_tminusn,
                            state, action)

                transitions = memory.sample(batch_size)
                # batch.state, batch.action, batch.reward, etc are now tuples
                # TODO: this looks a bit gross....
                batch = Transition(*zip(*transitions))
                state_batch = torch.cat([s.unsqueeze(0) for s in batch.state],
                                        dim=0)
                action_batch = torch.cat(
                    [a.unsqueeze(0) for a in batch.action], dim=0)
                reward_batch = torch.FloatTensor(
                    [float(r) for r in batch.reward])  # assumes scalar rewards
                newstate_batch = torch.cat(
                    [ns.unsqueeze(0) for ns in batch.new_state], dim=0)
                max_action_batch = torch.cat(
                    [ma.unsqueeze(0) for ma in batch.max_action], dim=0)

                # TODO: make qnet allow batch
                # does the experience replay memory contain state/action/reward/next_state
                # from only the current episode's graph? Or can any graph seen before be
                # in the memory?
                # The argmax action is the thing taken at time t-n_step right?
                oldstate_action_value = qnet(state_batch, action_batch)
                newstate_action_value = qnet(newstate_batch, max_action_batch)
                # detach the bootstrapped target so gradients only flow
                # through the current Q-value estimate
                expected_sa_values = reward_batch + gamma * newstate_action_value.detach()
                loss = F.mse_loss(oldstate_action_value, expected_sa_values)

                optimizer.zero_grad()
                loss.backward()
                # clamp grads?
                optimizer.step()

            state += action
            steps_done += 1
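
The excerpt above also relies on a Transition record and a ReplayBuffer defined elsewhere in the horacepan/qap project. A minimal sketch of what those helpers might look like is shown below; the five field names are taken from how batch and memory.push are used above, while the deque-based buffer and uniform sampling are assumptions rather than the project's actual implementation.

import random
from collections import deque, namedtuple

# Field names mirror the attributes accessed on `batch` in train_v1.
Transition = namedtuple(
    'Transition', ('state', 'action', 'reward', 'new_state', 'max_action'))


class ReplayBuffer:
    """Fixed-capacity experience buffer with uniform sampling (a sketch)."""

    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    def push(self, *args):
        # Store one transition; the oldest entry is dropped once full.
        self.buffer.append(Transition(*args))

    def sample(self, batch_size):
        return random.sample(list(self.buffer), batch_size)

    def __len__(self):
        return len(self.buffer)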
Code example #2
import math
import random

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable

# QNet, PolicyNet, and Terrain are project-specific classes that are not
# shown in this excerpt.


class SoftQLearning:
    def __init__(self):
        self.dO = 2  # Observation space dimensions
        self.dA = 2  # Action space dimensions

        self.criterion = nn.MSELoss()

        self.q_net = QNet()
        self.q_optimizer = optim.SGD(self.q_net.parameters(), lr=0.0001)
        self.policy_net = PolicyNet()
        self.policy_optimizer = optim.SGD(self.policy_net.parameters(),
                                          lr=0.001)
        self.terrain = Terrain()

        self.replay_buffer_maxlen = 50
        self.replay_buffer = []
        self.exploration_prob = 0.0
        self.alpha = 1.0
        self.value_alpha = 0.3

        # 32 unit-length actions evenly spaced around the unit circle
        self.action_set = []
        for j in range(32):
            self.action_set.append((math.sin((2 * math.pi / 32) * j),
                                    math.cos((2 * math.pi / 32) * j)))

    def forward_QNet(self, obs, action):
        inputs = Variable(torch.FloatTensor([obs + action]))
        q_pred = self.q_net(inputs)
        return q_pred

    def forward_PolicyNet(self, obs, noise):
        inputs = Variable(torch.FloatTensor([obs + noise]))
        action_pred = self.policy_net(inputs)
        return action_pred

    def collect_samples(self):
        self.replay_buffer = []
        self.terrain.resetgame()
        while True:
            self.terrain.plotgame()
            current_state = self.terrain.player.getposition()
            """
            best_action = self.action_set[0]
            for j in range(32):
                # Sample 32 actions and use them in the next state to get maximum Q_value
                action_temp = self.action_set[j]
                print self.forward_QNet(current_state, action_temp).data.numpy()[0][0]
                if self.forward_QNet(current_state, action_temp).data.numpy()[0][0] > self.forward_QNet(current_state, best_action).data.numpy()[0][0]:
                    best_action = action_temp
            print "Exploration prob:", self.exploration_prob
            """
            best_action = tuple(
                self.forward_PolicyNet(
                    current_state,
                    (np.random.normal(0.0, 0.5), np.random.normal(
                        0.0, 0.5))).data.numpy()[0].tolist())
            if random.uniform(0.0, 1.0) < self.exploration_prob:
                x_val = random.uniform(-1.0, 1.0)
                best_action = (x_val, random.choice([-1.0, 1.0]) *
                               math.sqrt(1.0 - x_val * x_val))
            print "Action:", best_action
            current_reward = self.terrain.player.action(best_action)
            print "Reward:", current_reward
            next_state = self.terrain.player.getposition()
            self.replay_buffer.append(
                [current_state, best_action, current_reward, next_state])
            if self.terrain.checkepisodeend() or len(
                    self.replay_buffer) > self.replay_buffer_maxlen:
                self.terrain.resetgame()
                break

    def rbf_kernel(self, input1, input2):
        # Gaussian RBF kernel k(x, y) = exp(-c * ||x - y||^2) with c = 3.14
        diff = np.asarray(input1) - np.asarray(input2)
        return np.exp(-3.14 * np.dot(diff, diff))

    def rbf_kernel_grad(self, input1, input2):
        # Gradient of the RBF kernel with respect to input1
        diff = np.asarray(input1) - np.asarray(input2)
        mult_val = self.rbf_kernel(input1, input2) * -2 * 3.14
        return [x * mult_val for x in diff]

    def train_network(self):
        for t in range(50):
            i = random.randint(0, len(self.replay_buffer) - 1)
            current_state = self.replay_buffer[i][0]
            current_action = self.replay_buffer[i][1]
            current_reward = self.replay_buffer[i][2]
            next_state = self.replay_buffer[i][3]

            # Perform updates on the Q-Network
            best_q_val_next = 0
            for j in range(32):
                # Sample 32 actions and use them in the next state to get an estimate of the state value
                action_temp = self.action_set[j]
                q_value_temp = (1.0 / self.value_alpha) * self.forward_QNet(
                    next_state, action_temp).data.numpy()[0][0]
                # importance weight: divide by the uniform proposal density 1/32
                q_value_temp = np.exp(q_value_temp) / (1.0 / 32)
                best_q_val_next += q_value_temp * (1.0 / 32)
            # soft state value: V(s') = alpha * log mean(exp(Q(s', a)/alpha) / q(a))
            best_q_val_next = self.value_alpha * np.log(best_q_val_next)
            print("Best Q Val:", best_q_val_next)
            inputs_cur = Variable(
                torch.FloatTensor([current_state + current_action]),
                requires_grad=True)
            predicted_q = self.q_net(inputs_cur)
            expected_q = current_reward + 0.99 * best_q_val_next
            expected_q = ((1 - self.alpha) * predicted_q.data.numpy()[0][0] +
                          self.alpha * expected_q)
            expected_q = Variable(torch.FloatTensor([[expected_q]]))
            loss = self.criterion(predicted_q, expected_q)
            loss.backward()

            # Perform updates on the Policy-Network using SVGD
            action_predicted = self.forward_PolicyNet(current_state,
                                                      (0.0, 0.0))
            final_action_gradient = [0.0, 0.0]
            for j in range(32):
                action_temp = tuple(
                    self.forward_PolicyNet(
                        current_state,
                        (np.random.normal(0.0, 0.5), np.random.normal(
                            0.0, 0.5))).data.numpy()[0].tolist())
                inputs_temp = Variable(torch.FloatTensor(
                    [current_state + action_temp]),
                                       requires_grad=True)
                predicted_q = self.q_net(inputs_temp)

                # Perform standard Q-value computation for each of the selected actions
                best_q_val_next = 0
                for k in range(32):
                    # Sample 32 actions and use them in the next state to get an estimate of the state value
                    action_temp_2 = self.action_set[k]
                    q_value_temp = (
                        1.0 / self.value_alpha) * self.forward_QNet(
                            next_state, action_temp_2).data.numpy()[0][0]
                    q_value_temp = np.exp(q_value_temp) / (1.0 / 32)
                    best_q_val_next += q_value_temp * (1.0 / 32)
                best_q_val_next = self.value_alpha * np.log(best_q_val_next)
                expected_q = current_reward + 0.99 * best_q_val_next
                expected_q = ((1 - self.alpha) * predicted_q.data.numpy()[0][0]
                              + self.alpha * expected_q)
                expected_q = Variable(torch.FloatTensor([[expected_q]]))
                loss = self.criterion(predicted_q, expected_q)
                loss.backward()

                action_gradient_temp = [
                    inputs_temp.grad.data.numpy()[0][2],
                    inputs_temp.grad.data.numpy()[0][3]
                ]
                kernel_val = self.rbf_kernel(list(action_temp),
                                             action_predicted.data.numpy()[0])
                kernel_grad = self.rbf_kernel_grad(
                    list(action_temp),
                    action_predicted.data.numpy()[0])
                # SVGD: combine the kernel-weighted Q-gradient and the
                # (alpha-scaled) kernel gradient element-wise
                final_temp_grad = [
                    g * kernel_val + k * self.value_alpha
                    for g, k in zip(action_gradient_temp, kernel_grad)
                ]
                final_action_gradient[0] += (1.0 / 32) * final_temp_grad[0]
                final_action_gradient[1] += (1.0 / 32) * final_temp_grad[1]
            print(final_action_gradient)
            # propagate the averaged SVGD action gradient through the policy net
            action_predicted.backward(
                torch.FloatTensor([final_action_gradient]))

            # Apply the updates using the optimizers, then clear the
            # accumulated gradients for the next iteration
            self.q_optimizer.step()
            self.q_optimizer.zero_grad()
            self.policy_optimizer.step()
            self.policy_optimizer.zero_grad()
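
Neither excerpt includes a driver loop. As a rough usage sketch (the epoch count is an arbitrary assumption, and QNet, PolicyNet, and Terrain must be importable from the project), the soft Q-learning agent above could be exercised like this:

if __name__ == '__main__':
    agent = SoftQLearning()
    for epoch in range(1000):  # epoch count is an arbitrary choice
        # Gather one episode of on-policy experience into the replay buffer,
        # then take 50 sampled gradient steps on the Q- and policy networks.
        agent.collect_samples()
        agent.train_network()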