def train_v1(eps_start, eps_end, eps_decay, n_step, mem_capacity,
             num_episodes, embed_dim, iters):
    """Train the graph embedding and Q-network with n-step replayed updates.

    For every episode a graph is drawn from ``GraphGenerator(16, 16)``, its
    vertices are embedded once with ``Struc2Vec``, and a tour is built one
    vertex at a time with an epsilon-greedy policy. The state is the running
    sum of the embeddings of the vertices picked so far. From step ``n_step``
    onwards each step pushes one n-step transition into the replay buffer,
    samples a batch and takes one Adam step on the MSE between
    ``qnet(state, action)`` and ``reward + gamma * qnet(new_state, max_action)``.

    ``batch_size``, ``gamma``, ``Transition``, ``util``, ``arg_max_action`` and
    ``get_reward`` are not defined in this function and are resolved from
    module scope.

    Args:
        eps_start: Forwarded to ``util.get_eps_threshold``.
        eps_end: Forwarded to ``util.get_eps_threshold``.
        eps_decay: Forwarded to ``util.get_eps_threshold``.
        n_step: Number of steps spanned by one stored transition.
        mem_capacity: Capacity handed to ``ReplayBuffer``.
        num_episodes: Number of graphs (episodes) to train on.
        embed_dim: Size of the state / vertex embedding vectors.
        iters: Second constructor argument of ``Struc2Vec``.

    Returns:
        None. The trained networks are local to this function.

    NOTE(review): the stored transition is
    ``(S[t-n], a[t-n], sum(r[t-n:t]), S[t], a[t])``. The original mixed
    ``states[-n_step]`` with ``curr_tour[-n_step]``, which point at different
    timesteps; confirm this window is the intended one.
    NOTE(review): replayed tensors stay attached to the per-episode ``gnn``
    graph, so repeated ``backward()`` calls through it may need the
    embeddings to be recomputed or detached -- TODO confirm.
    """
    graph_generator = GraphGenerator(16, 16)
    memory = ReplayBuffer(mem_capacity)
    steps_done = 0

    gnn = Struc2Vec(embed_dim, iters)
    qnet = QNet(embed_dim)
    optimizer = optim.Adam(
        list(gnn.parameters()) + list(qnet.parameters()),
        lr=0.0001,
        weight_decay=1e-4)

    for _ in range(num_episodes):
        node_labels, adj, edge_weights = graph_generator.next()
        vtx_feats = gnn(node_labels, adj, edge_weights)

        num_vertices = len(adj)
        remaining_vertices = set(range(num_vertices))
        state = Variable(torch.zeros(embed_dim))
        curr_tour = []
        rewards = []
        # states[k] is the state in which curr_tour[k] was chosen.
        states = [state]

        for t in range(num_vertices):
            eps_threshold = util.get_eps_threshold(
                eps_start, eps_end, eps_decay, steps_done)
            if random.random() > eps_threshold:
                # Greedy action.
                curr_vtx = arg_max_action(qnet, vtx_feats, remaining_vertices)
            else:
                # Random action. random.sample() rejects sets on
                # Python >= 3.11, so draw from a list instead.
                curr_vtx = random.choice(list(remaining_vertices))
            action = vtx_feats[curr_vtx]

            reward = get_reward(curr_tour, curr_vtx, edge_weights)
            rewards.append(reward)

            curr_tour.append(curr_vtx)
            remaining_vertices.remove(curr_vtx)
            # Build a new tensor instead of `state += action`: the in-place
            # form would also rewrite the tensor already stored in `states`.
            next_state = state + action
            states.append(next_state)

            # Updates only start once a full n-step window is available.
            if t >= n_step:
                start = t - n_step
                state_tminusn = states[start]
                action_tminusn = vtx_feats[curr_tour[start]]
                reward_tminusn = sum(rewards[start:t])
                # TODO: the action just taken is stored as "max action"; an
                # arg max over the remaining vertices may be what is wanted.
                memory.push(state_tminusn, action_tminusn, reward_tminusn,
                            state, action)

                transitions = memory.sample(batch_size)
                # Transpose a list of transitions into one Transition whose
                # fields are tuples.
                batch = Transition(*zip(*transitions))
                state_batch = torch.stack(batch.state, dim=0)
                action_batch = torch.stack(batch.action, dim=0)
                # assumes each reward is a tensor with at least one
                # dimension -- TODO confirm against get_reward
                reward_batch = torch.cat(batch.reward)
                newstate_batch = torch.stack(batch.new_state, dim=0)
                max_action_batch = torch.stack(batch.max_action, dim=0)

                # TODO: make qnet allow batch
                oldstate_action_value = qnet(state_batch, action_batch)
                newstate_action_value = qnet(newstate_batch, max_action_batch)
                expected_sa_values = (
                    reward_batch + gamma * newstate_action_value)

                loss = F.mse_loss(oldstate_action_value, expected_sa_values)
                optimizer.zero_grad()
                loss.backward()
                # clamp grads?
                optimizer.step()  # was missing: gradients were never applied

            state = next_state
            steps_done += 1
class SoftQLearning:
    """Soft Q-learning agent with a Q-network and a noise-driven policy network.

    ``collect_samples`` rolls the policy out on ``Terrain`` and fills the
    replay buffer with ``[state, action, reward, next_state]`` entries.
    ``train_network`` replays entries to update the Q-network towards a soft
    Bellman target and the policy network with a kernel-weighted (SVGD-style)
    action direction.
    """

    # Size of the fixed action set and number of policy samples per update.
    NUM_ACTION_SAMPLES = 32
    # Number of replayed transitions per call to train_network().
    TRAIN_ITERATIONS = 50
    # Discount applied to the soft value of the next state.
    DISCOUNT = 0.99
    # Coefficient of the squared distance inside the RBF kernel exponent.
    RBF_GAMMA = 3.14

    def __init__(self):
        self.dO = 2  # Observation space dimensions
        self.dA = 2  # Action space dimensions
        self.criterion = nn.MSELoss()
        self.q_net = QNet()
        self.q_optimizer = optim.SGD(self.q_net.parameters(), lr=0.0001)
        self.policy_net = PolicyNet()
        self.policy_optimizer = optim.SGD(
            self.policy_net.parameters(), lr=0.001)
        self.terrain = Terrain()
        self.replay_buffer_maxlen = 50
        self.replay_buffer = []
        self.exploration_prob = 0.0
        self.alpha = 1.0  # Weight of the bootstrapped target vs. current Q
        self.value_alpha = 0.3  # Temperature of the soft value
        # Evenly spaced points on the unit circle.
        step = 2 * math.pi / self.NUM_ACTION_SAMPLES
        self.action_set = [
            (math.sin(step * j), math.cos(step * j))
            for j in range(self.NUM_ACTION_SAMPLES)
        ]

    @staticmethod
    def _join(first, second):
        """Concatenate two sequences into one flat tuple of network inputs."""
        return tuple(first) + tuple(second)

    def forward_QNet(self, obs, action):
        """Return the Q-network output for one (observation, action) pair.

        The two sequences are concatenated into a single-row batch.
        """
        inputs = Variable(torch.FloatTensor([self._join(obs, action)]))
        return self.q_net(inputs)

    def forward_PolicyNet(self, obs, noise):
        """Return the policy-network output for one (observation, noise) pair.

        The two sequences are concatenated into a single-row batch.
        """
        inputs = Variable(torch.FloatTensor([self._join(obs, noise)]))
        return self.policy_net(inputs)

    def _sample_policy_action(self, state):
        """Draw N(0, 0.5) noise and return the policy's action as a tuple."""
        noise = (np.random.normal(0.0, 0.5), np.random.normal(0.0, 0.5))
        output = self.forward_PolicyNet(state, noise)
        return tuple(output.data.numpy()[0].tolist())

    def collect_samples(self):
        """Run one episode and refill the replay buffer from scratch.

        The episode stops when the terrain reports its end or when the
        buffer holds ``replay_buffer_maxlen`` transitions.
        """
        self.replay_buffer = []
        self.terrain.resetgame()
        while True:
            self.terrain.plotgame()
            current_state = self.terrain.player.getposition()

            best_action = self._sample_policy_action(current_state)
            if random.uniform(0.0, 1.0) < self.exploration_prob:
                # Explore with a random point on the unit circle.
                x_val = random.uniform(-1.0, 1.0)
                best_action = (
                    x_val,
                    random.choice([-1.0, 1.0])
                    * math.sqrt(1.0 - x_val * x_val))
            print("Action: %s" % (best_action,))

            current_reward = self.terrain.player.action(best_action)
            print("Reward: %s" % (current_reward,))
            next_state = self.terrain.player.getposition()

            self.replay_buffer.append(
                [current_state, best_action, current_reward, next_state])
            # `>=` keeps the buffer at the configured maximum; the previous
            # `>` allowed one extra entry.
            if (self.terrain.checkepisodeend()
                    or len(self.replay_buffer) >= self.replay_buffer_maxlen):
                self.terrain.resetgame()
                break

    def rbf_kernel(self, input1, input2):
        """Return exp(-RBF_GAMMA * ||input1 - input2||^2).

        Both arguments may be any numeric sequence (list, tuple, ndarray).
        """
        diff = (np.asarray(input1, dtype=float)
                - np.asarray(input2, dtype=float))
        return np.exp(-self.RBF_GAMMA * np.dot(diff, diff))

    def rbf_kernel_grad(self, input1, input2):
        """Return the gradient of ``rbf_kernel`` w.r.t. ``input1`` as a list."""
        diff = (np.asarray(input1, dtype=float)
                - np.asarray(input2, dtype=float))
        scale = -2.0 * self.RBF_GAMMA * self.rbf_kernel(input1, input2)
        return [x * scale for x in diff]

    def _soft_value(self, state):
        """Return value_alpha * log(sum_a exp(Q(state, a) / value_alpha)).

        The sum runs over ``self.action_set``. The maximum is factored out
        before exponentiating so large Q-values do not overflow.
        """
        scaled_q = np.array(
            [self.forward_QNet(state, action).data.numpy()[0][0]
             for action in self.action_set],
            dtype=np.float64) / self.value_alpha
        peak = scaled_q.max()
        return self.value_alpha * (
            peak + np.log(np.sum(np.exp(scaled_q - peak))))

    def _td_target(self, reward, soft_value_next, predicted_q):
        """Return the constant 1x1 regression target for ``predicted_q``.

        The bootstrapped value ``reward + DISCOUNT * soft_value_next`` is
        blended with the current prediction using ``self.alpha``.
        """
        bootstrapped = reward + self.DISCOUNT * soft_value_next
        current = predicted_q.data.numpy()[0][0]
        blended = (1 - self.alpha) * current + self.alpha * bootstrapped
        return Variable(torch.FloatTensor([[float(blended)]]))

    def train_network(self):
        """Replay TRAIN_ITERATIONS random transitions and update both networks.

        Does nothing when the replay buffer is empty.

        NOTE(review): as in the original, the per-sample action gradient is
        the gradient of the TD loss (not of Q itself), and the resulting
        direction is passed to ``backward()`` unchanged, so the SGD step
        moves the policy output against it. Confirm both against the
        intended SVGD update.
        """
        if not self.replay_buffer:
            return

        for _ in range(self.TRAIN_ITERATIONS):
            (current_state, current_action,
             current_reward, next_state) = random.choice(self.replay_buffer)

            # Clear old gradients BEFORE computing new ones. The original
            # zeroed them right before step(), which made every update a
            # no-op.
            self.q_optimizer.zero_grad()
            self.policy_optimizer.zero_grad()

            # The Q-network is constant for the whole iteration, so the soft
            # value of the next state is computed once and reused below.
            soft_value_next = self._soft_value(next_state)
            print("Best Q Val: %s" % (soft_value_next,))

            # Perform updates on the Q-Network
            inputs_cur = Variable(torch.FloatTensor(
                [self._join(current_state, current_action)]))
            predicted_q = self.q_net(inputs_cur)
            expected_q = self._td_target(
                current_reward, soft_value_next, predicted_q)
            loss = self.criterion(predicted_q, expected_q)
            loss.backward()

            # Perform updates on the Policy-Network using SVGD
            action_predicted = self.forward_PolicyNet(
                current_state, (0.0, 0.0))
            anchor_action = np.array(
                action_predicted.data.numpy()[0], dtype=np.float64)
            obs_dim = len(tuple(current_state))
            final_action_gradient = np.zeros(len(anchor_action))

            for _ in range(self.NUM_ACTION_SAMPLES):
                action_temp = self._sample_policy_action(current_state)
                inputs_temp = Variable(
                    torch.FloatTensor(
                        [self._join(current_state, action_temp)]),
                    requires_grad=True)
                sample_q = self.q_net(inputs_temp)
                sample_target = self._td_target(
                    current_reward, soft_value_next, sample_q)
                sample_loss = self.criterion(sample_q, sample_target)
                # autograd.grad yields the input gradient without adding to
                # the Q-network parameter gradients, so the Q update above
                # is driven by the replayed transition only.
                (input_grad,) = torch.autograd.grad(sample_loss, inputs_temp)
                action_gradient_temp = input_grad.data.numpy()[0][obs_dim:]

                kernel_val = self.rbf_kernel(action_temp, anchor_action)
                kernel_grad = np.asarray(
                    self.rbf_kernel_grad(action_temp, anchor_action))
                # Element-wise sum of both terms. The original used list `+`
                # (concatenation), which dropped the kernel-gradient term.
                final_action_gradient += (
                    kernel_val * action_gradient_temp
                    + self.value_alpha * kernel_grad
                ) / self.NUM_ACTION_SAMPLES

            direction = [float(x) for x in final_action_gradient]
            print(direction)
            action_predicted.backward(torch.FloatTensor([direction]))

            # Apply the updates using the optimizers
            self.q_optimizer.step()
            self.policy_optimizer.step()