Code Example #1
File: softqlearning.py Project: tyGavinZJU/rllite
    def __init__(self):
        self.dO = 2  # Observation space dimensions
        self.dA = 2  # Action space dimensions

        self.criterion = nn.MSELoss()

        self.q_net = QNet()
        self.q_optimizer = torch.optim.Adam(self.q_net.parameters())
        self.policy_net = PolicyNet()
        self.policy_optimizer = torch.optim.Adam(self.policy_net.parameters())
        self.terrain = Terrain()

        self.replay_buffer_maxlen = 50
        self.replay_buffer = []
        self.exploration_prob = 0.0
        self.alpha = 1.0
        self.value_alpha = 0.3

        self.action_set = []
        for j in range(32):
            self.action_set.append((np.sin(
                (3.14 * 2 / 32) * j), np.cos((3.14 * 2 / 32) * j)))
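        # The action set is 32 unit-length direction vectors spaced evenly around
        # the circle: (sin(2*pi*j/32), cos(2*pi*j/32)), with pi approximated as 3.14.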
Code Example #2
class QPlayer(Player):
    def __init__(self):
        super().__init__()
        self.exploration_rate = 0.2
        self.qnet = QNet()

    def take_turn(self, field):
        if np.random.random() < self.exploration_rate:
            take = self.random_turn(field)
        else:
            take = self.greedy_turn(field)
        return take

    def random_turn(self, field):
        return np.random.randint(0, 7)

    def greedy_turn(self, field):
        return np.argmax(self.qnet.predict(field))
Code Example #3
File: qnet_test.py Project: ngc92/neural-control
def test_target_net(use_target, **kwargs):
    g = tf.Graph()
    net = QNet(1, 1, 1, graph=g, target_net=use_target, **kwargs)
    sess = tf.Session(graph=g)
    with g.as_default(), sess:
        gstep = tf.Variable(0,
                            dtype=tf.int64,
                            trainable=False,
                            name="global_step")
        net._qnet = QNetGraph(gstep)
        net._build_value_network(mini_arch)

        # at this point, g contains one MARKER variable
        # we can verify that:
        assert count_markers() == 1, "Expect one MARKER variable, something is wrong with the test"

        # Now, depending on whether we want separate vars for target and value, we expect one or two counters
        net._build_target_network(mini_arch)
        if use_target:
            assert count_markers() == 2, "Expect two MARKER variables, target network was not correctly built."
        else:
            assert count_markers() == 1, "Expect one MARKER variable, target network should have reused the value network's variables."

        if use_target:
            sess.run(tf.global_variables_initializer())
            gv = tf.GraphKeys.GLOBAL_VARIABLES
            target_marker = [
                v for v in tf.get_collection(gv, scope="target_network")
                if v.name.endswith("MARKER:0")
            ]
            old_value_target = target_marker[0].eval()
            value_marker = [
                v for v in tf.get_collection(gv, scope="value_network")
                if v.name.endswith("MARKER:0")
            ]
            old_value_source = value_marker[0].eval()

            assert old_value_source != old_value_target, "Target and value nets should be initialized differently"
            net._qnet.update_target(session=sess)

            assert value_marker[0].eval() == old_value_source, "Update should not change value net"
            assert target_marker[0].eval() == old_value_source, "Update should set target net to value net"
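The test above calls count_markers and mini_arch, which are defined elsewhere in qnet_test.py and not shown here. A minimal sketch of what count_markers might look like, assuming the MARKER variables are ordinary global variables whose names end in "MARKER:0" (as the collection filters above suggest):

def count_markers():
    # Hypothetical reconstruction; the real helper lives elsewhere in qnet_test.py.
    # Counts variables in the default graph whose names end in "MARKER:0".
    return len([
        v for v in tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
        if v.name.endswith("MARKER:0")
    ])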
Code Example #4
File: main.py Project: donmahallem/connect_four_tf
class Game:
    def __init__(self):
        self.discount = 0.8
        self.qnet = QNet()

    def create_training_pair(self, old_state, action, reward, new_state):
        # Ask the model for the Q values of the old state (inference)
        states = np.zeros(
            (2, old_state.shape[0], old_state.shape[1], old_state.shape[2]))
        states[0] = old_state
        states[1] = new_state
        Q_values = self.qnet.predict(states)
        old_state_Q_values = Q_values[0]

        # Ask the model for the Q values of the new state (inference)
        new_state_Q_values = Q_values[1]

        # Real Q value for the action we took. This is what we will train towards.
        old_state_Q_values[action] = reward + \
            self.discount * np.amax(new_state_Q_values)

        return old_state, old_state_Q_values

    def minimax(self, field, player, depth=5):
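        # Negamax formulation: the recursive score is negated for the opponent
        # (player * -1), so the same maximisation serves both players.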
        if depth == 0:
            return evaluate(field)
        bestScore = -100000000
        for i in range(0, 7):
            if field.isColumnFull(i):
                continue
            fieldClone = field.clone()
            fieldClone.put(i, player)
            subScore = -self.minimax(fieldClone, player * -1, depth - 1)
            if subScore > bestScore:
                bestScore = subScore
        return bestScore
Code Example #5
File: train.py Project: horacepan/qap
def train_v1(eps_start, eps_end, eps_decay, n_step, mem_capacity, num_episodes,
             embed_dim, iters):
    graph_generator = GraphGenerator(16, 16)
    memory = ReplayBuffer(mem_capacity)
    steps_done = 0
    gnn = Struc2Vec(embed_dim, iters)
    qnet = QNet(embed_dim)
    optimizer = optim.Adam(list(gnn.parameters()) + list(qnet.parameters()),
                           lr=0.0001,
                           weight_decay=1e-4)
    for e in range(num_episodes):
        node_labels, adj, edge_weights = graph_generator.next()
        vtx_feats = gnn(node_labels, adj, edge_weights)
        remaining_vertices = set([i for i in range(len(adj))])
        state = Variable(torch.zeros(embed_dim))
        curr_tour = []
        T = len(adj)
        rewards = []
        states = [state]

        for t in range(T):
            eps_threshold = util.get_eps_threshold(eps_start, eps_end,
                                                   eps_decay, steps_done)
            if random.random() > eps_threshold:
                # arg max action
                curr_vtx = arg_max_action(qnet, vtx_feats,
                                          remaining_vertices)
            else:
                # random action
                curr_vtx = random.sample(remaining_vertices, 1)[0]

            action = vtx_feats[curr_vtx]
            # reward maintenance
            est_reward = qnet(state, curr_vtx)
            reward = get_reward(curr_tour, curr_vtx, edge_weights)
            rewards.append(reward)

            # update states
            curr_tour.append(curr_vtx)
            remaining_vertices.remove(curr_vtx)
            states.append(state + action)
            # wait till after doing the memory stuff to add the state

            # we only do these updates after n steps
            if t >= n_step:
                _, next_reward = arg_max_action(qnet, vtx_feats,
                                                remaining_vertices)
                state_tminusn = states[-n_step]  # this is a torch tensor
                action_tminusn = vtx_feats[
                    curr_tour[-n_step]]  # embedding of the vertex chosen at t - n_step
                reward_tminusn = sum(rewards[-n_step:])
                memory.push(state_tminusn, action_tminusn, reward_tminusn,
                            state, action)

                transitions = memory.sample(batch_size)
                # batch.state, batch.action, batch.reward, etc are now tuples
                # TODO: this looks a bit gross....
                batch = Transition(*zip(*transitions))
                state_batch = torch.cat([s.unsqueeze(0) for s in batch.state],
                                        dim=0)
                action_batch = torch.cat(
                    [a.unsqueeze(0) for a in batch.action], dim=0)
                reward_batch = torch.cat(batch.reward)
                newstate_batch = torch.cat(
                    [ns.unsqueeze(0) for ns in batch.new_state], dim=0)
                max_action_batch = torch.cat(
                    [ma.unsqueeze(0) for ma in batch.max_action], dim=0)

                # TODO: make qnet allow batch
                # does the experience replay memory contain state/action/reward/next_state
                # from only the current episode's graph? Or can any graph seen before be
                # in the memory?
                # The argmax action is the thing taken at time t-n_step right?
                oldstate_action_value = qnet(state_batch, action_batch)
                newstate_action_value = qnet(newstate_batch, max_action_batch)
                expected_sa_values = reward_batch + gamma * newstate_action_value
                loss = F.mse_loss(oldstate_action_value, expected_sa_values)

                optimizer.zero_grad()
                loss.backward()
                # clamp grads?
                optimizer.step()

            state += action
            steps_done += 1
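The training loop above references Transition, batch_size, and gamma, which are presumably module-level definitions in train.py and are not shown in this excerpt. A minimal sketch of what they might look like, with the namedtuple field order matching the memory.push call:

from collections import namedtuple

# Hypothetical module-level definitions assumed by the loop above; the actual
# values live elsewhere in train.py.
Transition = namedtuple(
    'Transition', ('state', 'action', 'reward', 'new_state', 'max_action'))
batch_size = 32  # assumed replay mini-batch size
gamma = 0.99     # assumed discount factor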
Code Example #6
        self.limit = limit

    def add(self, data):
        self.memory.append(data)
        overhead = len(self.memory) - self.limit
        if overhead > 0:
            self.memory = self.memory[overhead:]

    def count(self):
        return len(self.memory)


mem = Memory()

exploration_rate = 0.2
net = QNet()


def take_turns(field, player, max_depth=None):
    if max_depth is not None and max_depth == 0:
        return 0
    old_field = np.copy(field.getField())
    if np.random.random() < exploration_rate:
        take_turn = np.random.randint(0, 7)
    else:
        np_field = old_field.reshape((1, 6, 7))
        if player == -1:
            np_field *= -1
        predictions = net.predict(np_field)[0]
        take_turn = np.argmax(predictions)
    _, _, done, reward = field.put(take_turn, player)
Code Example #7
    def __init__(self):
        super().__init__()
        self.exploration_rate = 0.2
        self.qnet = QNet()
Code Example #8
class SoftQLearning:
    def __init__(self):
        self.dO = 2  # Observation space dimensions
        self.dA = 2  # Action space dimensions

        self.criterion = nn.MSELoss()

        self.q_net = QNet()
        self.q_optimizer = optim.SGD(self.q_net.parameters(), lr=0.0001)
        self.policy_net = PolicyNet()
        self.policy_optimizer = optim.SGD(self.policy_net.parameters(),
                                          lr=0.001)
        self.terrain = Terrain()

        self.replay_buffer_maxlen = 50
        self.replay_buffer = []
        self.exploration_prob = 0.0
        self.alpha = 1.0
        self.value_alpha = 0.3

        self.action_set = []
        for j in range(32):
            self.action_set.append((math.sin(
                (3.14 * 2 / 32) * j), math.cos((3.14 * 2 / 32) * j)))

    def forward_QNet(self, obs, action):
        inputs = Variable(torch.FloatTensor([obs + action]))
        q_pred = self.q_net(inputs)
        return q_pred

    def forward_PolicyNet(self, obs, noise):
        inputs = Variable(torch.FloatTensor([obs + noise]))
        action_pred = self.policy_net(inputs)
        return action_pred

    def collect_samples(self):
        self.replay_buffer = []
        self.terrain.resetgame()
        while (1):
            self.terrain.plotgame()
            current_state = self.terrain.player.getposition()
            """
            best_action = self.action_set[0]
            for j in range(32):
                # Sample 32 actions and use them in the next state to get maximum Q_value
                action_temp = self.action_set[j]
                print self.forward_QNet(current_state, action_temp).data.numpy()[0][0]
                if self.forward_QNet(current_state, action_temp).data.numpy()[0][0] > self.forward_QNet(current_state, best_action).data.numpy()[0][0]:
                    best_action = action_temp
            print "Exploration prob:", self.exploration_prob
            """
            best_action = tuple(
                self.forward_PolicyNet(
                    current_state,
                    (np.random.normal(0.0, 0.5), np.random.normal(
                        0.0, 0.5))).data.numpy()[0].tolist())
            if random.uniform(0.0, 1.0) < self.exploration_prob:
                x_val = random.uniform(-1.0, 1.0)
                best_action = (x_val, random.choice([-1.0, 1.0]) *
                               math.sqrt(1.0 - x_val * x_val))
            print "Action:", best_action
            current_reward = self.terrain.player.action(best_action)
            print "Reward:", current_reward
            next_state = self.terrain.player.getposition()
            self.replay_buffer.append(
                [current_state, best_action, current_reward, next_state])
            if self.terrain.checkepisodeend() or len(
                    self.replay_buffer) > self.replay_buffer_maxlen:
                self.terrain.resetgame()
                break

    def rbf_kernel(self, input1, input2):
        return np.exp(-3.14 * (np.dot(input1 - input2, input1 - input2)))

    def rbf_kernel_grad(self, input1, input2):
        diff = (input1 - input2)
        mult_val = self.rbf_kernel(input1, input2) * -2 * 3.14
        return [x * mult_val for x in diff]

    def train_network(self):
        for t in range(50):
            i = random.randint(0, len(self.replay_buffer) - 1)
            current_state = self.replay_buffer[i][0]
            current_action = self.replay_buffer[i][1]
            current_reward = self.replay_buffer[i][2]
            next_state = self.replay_buffer[i][3]

            # Perform updates on the Q-Network
            best_q_val_next = 0
            for j in range(32):
                # Sample 32 actions and use them in the next state to get an estimate of the state value
                action_temp = self.action_set[j]
                q_value_temp = (1.0 / self.value_alpha) * self.forward_QNet(
                    next_state, action_temp).data.numpy()[0][0]
                q_value_temp = np.exp(q_value_temp) / (1.0 / 32)
                best_q_val_next += q_value_temp * (1.0 / 32)
            best_q_val_next = self.value_alpha * np.log(best_q_val_next)
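            # The loop above is an importance-sampled estimate of the soft value
            # V(s') = value_alpha * log E_a[exp(Q(s', a) / value_alpha)], i.e. a
            # LogSumExp ("soft max") over the 32 sampled actions.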
            print "Best Q Val:", best_q_val_next
            inputs_cur = Variable(torch.FloatTensor([
                (current_state + current_action)
            ]),
                                  requires_grad=True)
            predicted_q = self.q_net(inputs_cur)
            expected_q = current_reward + 0.99 * best_q_val_next
            expected_q = (1 - self.alpha) * predicted_q.data.numpy(
            )[0][0] + self.alpha * expected_q
            expected_q = Variable(torch.FloatTensor([[expected_q]]))
            loss = self.criterion(predicted_q, expected_q)
            loss.backward()

            # Perform updates on the Policy-Network using SVGD
            action_predicted = self.forward_PolicyNet(current_state,
                                                      (0.0, 0.0))
            final_action_gradient = [0.0, 0.0]
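            # SVGD estimate of the gradient at the predicted action a:
            #   (1/32) * sum_j [ k(a_j, a) * dQ/da_j + value_alpha * dk(a_j, a)/da_j ],
            # where k is the rbf_kernel defined above, accumulated over the loop below.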
            for j in range(32):
                action_temp = tuple(
                    self.forward_PolicyNet(
                        current_state,
                        (np.random.normal(0.0, 0.5), np.random.normal(
                            0.0, 0.5))).data.numpy()[0].tolist())
                inputs_temp = Variable(torch.FloatTensor(
                    [current_state + action_temp]),
                                       requires_grad=True)
                predicted_q = self.q_net(inputs_temp)

                # Perform standard Q-value computation for each of the selected actions
                best_q_val_next = 0
                for k in range(32):
                    # Sample 32 actions and use them in the next state to get an estimate of the state value
                    action_temp_2 = self.action_set[k]
                    q_value_temp = (
                        1.0 / self.value_alpha) * self.forward_QNet(
                            next_state, action_temp_2).data.numpy()[0][0]
                    q_value_temp = np.exp(q_value_temp) / (1.0 / 32)
                    best_q_val_next += q_value_temp * (1.0 / 32)
                best_q_val_next = self.value_alpha * np.log(best_q_val_next)
                expected_q = current_reward + 0.99 * best_q_val_next
                expected_q = (1 - self.alpha) * predicted_q.data.numpy(
                )[0][0] + self.alpha * expected_q
                expected_q = Variable(torch.FloatTensor([[expected_q]]))
                loss = self.criterion(predicted_q, expected_q)
                loss.backward()

                action_gradient_temp = [
                    inputs_temp.grad.data.numpy()[0][2],
                    inputs_temp.grad.data.numpy()[0][3]
                ]
                kernel_val = self.rbf_kernel(list(action_temp),
                                             action_predicted.data.numpy()[0])
                kernel_grad = self.rbf_kernel_grad(
                    list(action_temp),
                    action_predicted.data.numpy()[0])
                final_temp_grad = (
                    [x * kernel_val for x in action_gradient_temp] +
                    [x * self.value_alpha for x in kernel_grad])
                final_action_gradient[0] += (1.0 / 32) * final_temp_grad[0]
                final_action_gradient[1] += (1.0 / 32) * final_temp_grad[1]
            print(final_action_gradient)
            action_predicted.backward(
                torch.FloatTensor([final_action_gradient]))

            # Apply the accumulated gradients, then clear them for the next iteration
            self.q_optimizer.step()
            self.q_optimizer.zero_grad()
            self.policy_optimizer.step()
            self.policy_optimizer.zero_grad()
Code Example #9
File: main.py Project: donmahallem/connect_four_tf
    def __init__(self):
        self.discount = 0.8
        self.qnet = QNet()
Code Example #10
File: main.py Project: donmahallem/connect_four_tf
from qnet import QNet
from field import Field
import numpy as np

a = Field()

net = QNet()
net.getModel().summary()
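# Self-play demo: player 1 moves greedily from the QNet's predictions, player -1
# moves uniformly at random, until the 6x7 board fills up or one side wins.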
for i in range(6 * 7 + 2):
    player = 1 if i % 2 == 0 else -1
    if player > 0:
        x = np.argmax(net.predict(a.getField().reshape((1, 6, 7)))[0])
    else:
        x = np.random.randint(0, 7)
    f1, action, done, reward = a.put(x, player)
    print(i)
    print(a.getField())
    if done:
        print("Player " + str(reward) + " won")
        break


class Game:
    def __init__(self):
        self.discount = 0.8
        self.qnet = QNet()

    def create_training_pair(self, old_state, action, reward, new_state):
        # Ask the model for the Q values of the old state (inference)
        states = np.zeros(
            (2, old_state.shape[0], old_state.shape[1], old_state.shape[2]))