def __init__(self):
    self.dO = 2  # Observation space dimensions
    self.dA = 2  # Action space dimensions
    self.criterion = nn.MSELoss()
    self.q_net = QNet()
    self.q_optimizer = torch.optim.Adam(self.q_net.parameters())
    self.policy_net = PolicyNet()
    self.policy_optimizer = torch.optim.Adam(self.policy_net.parameters())
    self.terrain = Terrain()
    self.replay_buffer_maxlen = 50
    self.replay_buffer = []
    self.exploration_prob = 0.0
    self.alpha = 1.0
    self.value_alpha = 0.3
    # 32 candidate actions spaced evenly around the unit circle
    self.action_set = []
    for j in range(32):
        self.action_set.append((np.sin((3.14 * 2 / 32) * j),
                                np.cos((3.14 * 2 / 32) * j)))
class QPlayer(Player):
    def __init__(self):
        super().__init__()
        self.exploration_rate = 0.2
        self.qnet = QNet()

    def take_turn(self, field):
        # Epsilon-greedy: explore with probability exploration_rate,
        # otherwise play the column with the highest predicted Q value.
        if np.random.random() < self.exploration_rate:
            take = self.random_turn(field)
        else:
            take = self.greedy_turn(field)
        return take

    def random_turn(self, field):
        return np.random.randint(0, 7)

    def greedy_turn(self, field):
        return np.argmax(self.qnet.predict(field))
def test_target_net(use_target, **kwargs):
    g = tf.Graph()
    net = QNet(1, 1, 1, graph=g, target_net=use_target, **kwargs)
    sess = tf.Session(graph=g)
    with g.as_default(), sess:
        gstep = tf.Variable(0, dtype=tf.int64, trainable=False, name="global_step")
        net._qnet = QNetGraph(gstep)
        net._build_value_network(mini_arch)
        # At this point, g contains one MARKER variable; verify that:
        assert count_markers() == 1, \
            "Expect one MARKER variable, something is wrong with the test"

        # Depending on whether we want separate variables for the target and value
        # networks, we expect one or two markers after building the target network.
        net._build_target_network(mini_arch)
        if use_target:
            assert count_markers() == 2, \
                "Expect two MARKER variables, target network was not correctly built."
        else:
            assert count_markers() == 1, \
                "Expect one MARKER variable, target network seems to reuse wrongly."

        if use_target:
            sess.run(tf.global_variables_initializer())
            gv = tf.GraphKeys.GLOBAL_VARIABLES
            target_marker = [
                v for v in tf.get_collection(gv, scope="target_network")
                if v.name.endswith("MARKER:0")
            ]
            old_value_target = target_marker[0].eval()
            value_marker = [
                v for v in tf.get_collection(gv, scope="value_network")
                if v.name.endswith("MARKER:0")
            ]
            old_value_source = value_marker[0].eval()
            assert old_value_source != old_value_target, \
                "Target and value nets should be initialized differently"

            net._qnet.update_target(session=sess)
            assert value_marker[0].eval() == old_value_source, \
                "Update should not change value net"
            assert target_marker[0].eval() == old_value_source, \
                "Update should set target net to value net"
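# count_markers() is referenced above but not defined in this excerpt. A minimal
# sketch of what it might look like, assuming the MARKER variables are ordinary
# global variables whose names end in "MARKER:0"; the naming convention is
# inferred from the assertions above and may differ in the real test module:
def count_markers():
    return len([
        v for v in tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
        if v.name.endswith("MARKER:0")
    ])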
class Game:
    def __init__(self):
        self.discount = 0.8
        self.qnet = QNet()

    def create_training_pair(self, old_state, action, reward, new_state):
        # Ask the model for the Q values of the old state (inference)
        states = np.zeros(
            (2, old_state.shape[0], old_state.shape[1], old_state.shape[2]))
        states[0] = old_state
        states[1] = new_state
        Q_values = self.qnet.predict(states)
        old_state_Q_values = Q_values[0]
        # Ask the model for the Q values of the new state (inference)
        new_state_Q_values = Q_values[1]
        # Real Q value for the action we took. This is what we will train towards.
        old_state_Q_values[action] = reward + \
            self.discount * np.amax(new_state_Q_values)
        return old_state, old_state_Q_values

    def minimax(self, field, player, depth=5):
        if depth == 0:
            return evaluate(field)
        bestScore = -100000000
        for i in range(0, 7):
            if field.isColumnFull(i):
                continue
            fieldClone = field.clone()
            fieldClone.put(i, player)
            # Negamax: the opponent's best score is the negation of ours.
            subScore = -self.minimax(fieldClone, player * -1, depth - 1)
            if subScore > bestScore:
                bestScore = subScore
        return bestScore
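# create_training_pair() returns one (state, target_Q_vector) supervised example.
# A minimal sketch of how a batch of such pairs could be fed back into the
# network; the getModel() accessor is taken from the script further below, while
# train_on_transitions() itself and the fit() call are hypothetical and assume
# the underlying model is a Keras model:
def train_on_transitions(game, transitions):
    xs, ys = [], []
    for old_state, action, reward, new_state in transitions:
        x, y = game.create_training_pair(old_state, action, reward, new_state)
        xs.append(x)
        ys.append(y)
    game.qnet.getModel().fit(np.array(xs), np.array(ys), verbose=0)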
def train_v1(eps_start, eps_end, eps_decay, n_step, mem_capacity, num_episodes,
             embed_dim, iters):
    graph_generator = GraphGenerator(16, 16)
    memory = ReplayBuffer(mem_capacity)
    steps_done = 0
    gnn = Struc2Vec(embed_dim, iters)
    qnet = QNet(embed_dim)
    optimizer = optim.Adam(list(gnn.parameters()) + list(qnet.parameters()),
                           lr=0.0001, weight_decay=1e-4)
    # Note: batch_size and gamma are assumed to be defined at module level.
    for e in range(num_episodes):
        node_labels, adj, edge_weights = graph_generator.next()
        vtx_feats = gnn(node_labels, adj, edge_weights)
        remaining_vertices = set([i for i in range(len(adj))])
        state = Variable(torch.zeros(embed_dim))
        curr_tour = []
        T = len(adj)
        rewards = []
        states = [state]
        for t in range(T):
            eps_threshold = util.get_eps_threshold(eps_start, eps_end,
                                                   eps_decay, steps_done)
            if random.random() > eps_threshold:
                # arg max action (arg_max_action is assumed to return the best
                # vertex together with its Q value)
                curr_vtx, _ = arg_max_action(qnet, vtx_feats, remaining_vertices)
            else:
                # random action
                curr_vtx = random.sample(remaining_vertices, 1)[0]
            action = vtx_feats[curr_vtx]
            # reward maintenance
            est_reward = qnet(state, action)
            reward = get_reward(curr_tour, curr_vtx, edge_weights)
            rewards.append(reward)
            # update states
            curr_tour.append(curr_vtx)
            remaining_vertices.remove(curr_vtx)
            states.append(state + action)
            # wait till after doing the memory stuff to add the state
            # we only do these updates after n steps
            if t >= n_step:
                _, next_reward = arg_max_action(qnet, vtx_feats,
                                                remaining_vertices)
                state_tminusn = states[-n_step]  # this is a torch tensor
                action_tminusn = vtx_feats[
                    curr_tour[-n_step]]  # curr_tour[-n_step] gives the vertex id
                reward_tminusn = sum(rewards[-n_step:])
                memory.push(state_tminusn, action_tminusn, reward_tminusn,
                            state, action)

                transitions = memory.sample(batch_size)
                # batch.state, batch.action, batch.reward, etc. are now tuples
                # TODO: this looks a bit gross....
                batch = Transition(*zip(*transitions))
                state_batch = torch.cat(
                    [s.unsqueeze(0) for s in batch.state], dim=0)
                action_batch = torch.cat(
                    [a.unsqueeze(0) for a in batch.action], dim=0)
                reward_batch = torch.cat(batch.reward)
                newstate_batch = torch.cat(
                    [ns.unsqueeze(0) for ns in batch.new_state], dim=0)
                max_action_batch = torch.cat(
                    [ma.unsqueeze(0) for ma in batch.max_action], dim=0)
                # TODO: make qnet allow batch
                # Does the experience replay memory contain state/action/reward/next_state
                # from only the current episode's graph? Or can any graph seen before be
                # in the memory?
                # The argmax action is the thing taken at time t-n_step right?
                oldstate_action_value = qnet(state_batch, action_batch)
                newstate_action_value = qnet(newstate_batch, max_action_batch)
                expected_sa_values = reward_batch + gamma * newstate_action_value
                loss = F.mse_loss(oldstate_action_value, expected_sa_values)
                optimizer.zero_grad()
                loss.backward()
                # clamp grads?
                optimizer.step()
            state += action
            steps_done += 1
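# train_v1() assumes a Transition record and a ReplayBuffer with push()/sample().
# A minimal sketch of what those could look like; the field names mirror how
# batch.state, batch.action, batch.reward, batch.new_state and batch.max_action
# are used above, but the real classes in this project may differ:
from collections import namedtuple
import random

Transition = namedtuple(
    "Transition", ("state", "action", "reward", "new_state", "max_action"))


class ReplayBuffer:
    def __init__(self, capacity):
        self.capacity = capacity
        self.buffer = []

    def push(self, *args):
        # Store one Transition, evicting the oldest entry once full.
        self.buffer.append(Transition(*args))
        if len(self.buffer) > self.capacity:
            self.buffer.pop(0)

    def sample(self, batch_size):
        return random.sample(self.buffer, batch_size)

    def __len__(self):
        return len(self.buffer)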
        self.limit = limit

    def add(self, data):
        self.memory.append(data)
        # Drop the oldest entries once the buffer grows past its limit.
        overhead = len(self.memory) - self.limit
        if overhead > 0:
            self.memory = self.memory[overhead:]

    def count(self):
        return len(self.memory)


mem = Memory()
exploration_rate = 0.2
net = QNet()


def take_turns(field, player, max_depth=None):
    if max_depth is not None and max_depth == 0:
        return 0
    old_field = np.copy(field.getField())
    # Epsilon-greedy move selection: explore with probability exploration_rate.
    if np.random.random() < exploration_rate:
        take_turn = np.random.randint(0, 7)
    else:
        np_field = old_field.reshape((1, 6, 7))
        # Flip the board encoding for player -1 so the net always sees its own
        # pieces as +1.
        if player == -1:
            np_field *= -1
        predictions = net.predict(np_field)[0]
        take_turn = np.argmax(predictions)
    _, _, done, reward = field.put(take_turn, player)
class SoftQLearning:
    def __init__(self):
        self.dO = 2  # Observation space dimensions
        self.dA = 2  # Action space dimensions
        self.criterion = nn.MSELoss()
        self.q_net = QNet()
        self.q_optimizer = optim.SGD(self.q_net.parameters(), lr=0.0001)
        self.policy_net = PolicyNet()
        self.policy_optimizer = optim.SGD(self.policy_net.parameters(), lr=0.001)
        self.terrain = Terrain()
        self.replay_buffer_maxlen = 50
        self.replay_buffer = []
        self.exploration_prob = 0.0
        self.alpha = 1.0
        self.value_alpha = 0.3
        # 32 candidate actions spaced evenly around the unit circle
        self.action_set = []
        for j in range(32):
            self.action_set.append((math.sin((3.14 * 2 / 32) * j),
                                    math.cos((3.14 * 2 / 32) * j)))

    def forward_QNet(self, obs, action):
        inputs = Variable(torch.FloatTensor([obs + action]))
        q_pred = self.q_net(inputs)
        return q_pred

    def forward_PolicyNet(self, obs, noise):
        inputs = Variable(torch.FloatTensor([obs + noise]))
        action_pred = self.policy_net(inputs)
        return action_pred

    def collect_samples(self):
        self.replay_buffer = []
        self.terrain.resetgame()
        while True:
            self.terrain.plotgame()
            current_state = self.terrain.player.getposition()
            """
            best_action = self.action_set[0]
            for j in range(32):
                # Sample 32 actions and use them in the next state to get maximum Q_value
                action_temp = self.action_set[j]
                print self.forward_QNet(current_state, action_temp).data.numpy()[0][0]
                if self.forward_QNet(current_state, action_temp).data.numpy()[0][0] > self.forward_QNet(current_state, best_action).data.numpy()[0][0]:
                    best_action = action_temp
            print "Exploration prob:", self.exploration_prob
            """
            best_action = tuple(
                self.forward_PolicyNet(
                    current_state,
                    (np.random.normal(0.0, 0.5),
                     np.random.normal(0.0, 0.5))).data.numpy()[0].tolist())
            if random.uniform(0.0, 1.0) < self.exploration_prob:
                # Explore: pick a random point on the unit circle.
                x_val = random.uniform(-1.0, 1.0)
                best_action = (x_val, random.choice([-1.0, 1.0]) *
                               math.sqrt(1.0 - x_val * x_val))
            print("Action:", best_action)
            current_reward = self.terrain.player.action(best_action)
            print("Reward:", current_reward)
            next_state = self.terrain.player.getposition()
            self.replay_buffer.append(
                [current_state, best_action, current_reward, next_state])
            if self.terrain.checkepisodeend() or len(
                    self.replay_buffer) > self.replay_buffer_maxlen:
                self.terrain.resetgame()
                break

    def rbf_kernel(self, input1, input2):
        return np.exp(-3.14 * (np.dot(input1 - input2, input1 - input2)))

    def rbf_kernel_grad(self, input1, input2):
        diff = (input1 - input2)
        mult_val = self.rbf_kernel(input1, input2) * -2 * 3.14
        return [x * mult_val for x in diff]

    def train_network(self):
        for t in range(50):
            i = random.randint(0, len(self.replay_buffer) - 1)
            current_state = self.replay_buffer[i][0]
            current_action = self.replay_buffer[i][1]
            current_reward = self.replay_buffer[i][2]
            next_state = self.replay_buffer[i][3]

            # Perform updates on the Q-Network
            best_q_val_next = 0
            for j in range(32):
                # Sample 32 actions and use them in the next state to get an
                # estimate of the state value
                action_temp = self.action_set[j]
                q_value_temp = (1.0 / self.value_alpha) * self.forward_QNet(
                    next_state, action_temp).data.numpy()[0][0]
                q_value_temp = np.exp(q_value_temp) / (1.0 / 32)
                best_q_val_next += q_value_temp * (1.0 / 32)
            best_q_val_next = self.value_alpha * np.log(best_q_val_next)
            print("Best Q Val:", best_q_val_next)
            inputs_cur = Variable(
                torch.FloatTensor([(current_state + current_action)]),
                requires_grad=True)
            predicted_q = self.q_net(inputs_cur)
            expected_q = current_reward + 0.99 * best_q_val_next
            expected_q = (1 - self.alpha) * predicted_q.data.numpy()[0][0] + \
                self.alpha * expected_q
            expected_q = Variable(torch.FloatTensor([[expected_q]]))
            loss = self.criterion(predicted_q, expected_q)
            loss.backward()

            # Perform updates on the Policy-Network using SVGD
            action_predicted = self.forward_PolicyNet(current_state, (0.0, 0.0))
            final_action_gradient = [0.0, 0.0]
            for j in range(32):
                action_temp = tuple(
                    self.forward_PolicyNet(
                        current_state,
                        (np.random.normal(0.0, 0.5),
                         np.random.normal(0.0, 0.5))).data.numpy()[0].tolist())
                inputs_temp = Variable(
                    torch.FloatTensor([current_state + action_temp]),
                    requires_grad=True)
                predicted_q = self.q_net(inputs_temp)

                # Perform standard Q-value computation for each of the selected actions
                best_q_val_next = 0
                for k in range(32):
                    # Sample 32 actions and use them in the next state to get an
                    # estimate of the state value
                    action_temp_2 = self.action_set[k]
                    q_value_temp = (1.0 / self.value_alpha) * self.forward_QNet(
                        next_state, action_temp_2).data.numpy()[0][0]
                    q_value_temp = np.exp(q_value_temp) / (1.0 / 32)
                    best_q_val_next += q_value_temp * (1.0 / 32)
                best_q_val_next = self.value_alpha * np.log(best_q_val_next)
                expected_q = current_reward + 0.99 * best_q_val_next
                expected_q = (1 - self.alpha) * predicted_q.data.numpy()[0][0] + \
                    self.alpha * expected_q
                expected_q = Variable(torch.FloatTensor([[expected_q]]))
                loss = self.criterion(predicted_q, expected_q)
                loss.backward()
                # Gradient of the Q loss with respect to the action part of the input
                # (indices 2 and 3 of the [obs, action] concatenation).
                action_gradient_temp = [
                    inputs_temp.grad.data.numpy()[0][2],
                    inputs_temp.grad.data.numpy()[0][3]
                ]
                kernel_val = self.rbf_kernel(list(action_temp),
                                             action_predicted.data.numpy()[0])
                kernel_grad = self.rbf_kernel_grad(list(action_temp),
                                                   action_predicted.data.numpy()[0])
                # SVGD update direction: k(a_j, a) * dQ/da_j + value_alpha * dk/da_j
                final_temp_grad = [
                    g * kernel_val + kg * self.value_alpha
                    for g, kg in zip(action_gradient_temp, kernel_grad)
                ]
                final_action_gradient[0] += (1.0 / 32) * final_temp_grad[0]
                final_action_gradient[1] += (1.0 / 32) * final_temp_grad[1]
            print(final_action_gradient)
            action_predicted.backward(
                torch.FloatTensor([final_action_gradient]))

            # Apply the updates using the optimizers (step before zeroing, so the
            # accumulated gradients are actually applied).
            self.q_optimizer.step()
            self.q_optimizer.zero_grad()
            self.policy_optimizer.step()
            self.policy_optimizer.zero_grad()
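# For reference: the inner loops in train_network() estimate the soft state value
# V(s) = value_alpha * log( sum_j exp(Q(s, a_j) / value_alpha) ) over the 32
# sampled actions (the 1/(1/32) importance weight and the 1/32 average cancel).
# A numerically equivalent but more stable sketch using the log-sum-exp trick;
# q_values is assumed to be a NumPy array of Q(s, a_j) for the sampled actions:
def soft_value(q_values, value_alpha=0.3):
    scaled = np.asarray(q_values) / value_alpha
    m = np.max(scaled)
    return value_alpha * (m + np.log(np.sum(np.exp(scaled - m))))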
from qnet import QNet
from field import Field
import numpy as np

a = Field()
net = QNet()
net.getModel().summary()
for i in range(6 * 7 + 2):
    player = 1 if i % 2 == 0 else -1
    if player > 0:
        x = np.argmax(net.predict(a.getField().reshape((1, 6, 7)))[0])
    else:
        x = np.random.randint(0, 7)
    f1, action, done, reward = a.put(x, player)
    print(i)
    print(a.getField())
    if done:
        print("Player " + str(reward) + " won")
        break


class Game:
    def __init__(self):
        self.discount = 0.8
        self.qnet = QNet()

    def create_training_pair(self, old_state, action, reward, new_state):
        # Ask the model for the Q values of the old state (inference)
        states = np.zeros(
            (2, old_state.shape[0], old_state.shape[1], old_state.shape[2]))