def sarsa_nstep_diff_live(W, Nruns):
    print("Running nstep SARSA live")
    for run in range(Nruns):
        print("Run " + str(run + 1))
        sim_environment.start_new_run(run)
        curr_s = initial_state_generate()
        t = 0
        while True:
            intersection = t % 4
            # Signals belonging to the intersection that acts at this step
            if intersection == 3:
                a_space = [13, 14, 15]
            else:
                a_space = [4 * intersection + 1, 4 * intersection + 2,
                           4 * intersection + 3, 4 * intersection + 4]
            # Act greedily (epsilon = 0) with the trained weight vector W
            a = epsilon_greedy_a(0, a_space, curr_s, W)
            env_param = sim_environment.take_action(a)
            next_s = env_param['next_state']
            r = env_param['rwd']
            # A reward of 1000 signals that all traffic has left the simulation
            if r == 1000:
                print("End of simulation at t = " + str(t))
                break
            curr_s = next_s
            t += 1
    return
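
# NOTE (sketch): epsilon_greedy_a() is defined elsewhere in this repo. Judging
# only from how it is called above -- epsilon_greedy_a(eps, a_space, s, W) --
# it is assumed to be an epsilon-greedy selector over the linear Q estimate
# q_est(). The hypothetical helper below illustrates that assumption; its name
# and the random tie-breaking are not taken from the repo's actual code.
def epsilon_greedy_a_sketch(eps, a_space, s, w):
    # Explore with probability eps, otherwise act greedily w.r.t. q_est()
    if random.random() < eps:
        return int(np.random.choice(a_space))
    q_vals = [q_est(s, a, w) for a in a_space]
    q_max = max(q_vals)
    best = [a for a, q in zip(a_space, q_vals) if q == q_max]  # ties broken at random
    return int(np.random.choice(best))
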
def static_signalling(Nruns):
    print("Running Static signalling")
    for run in range(Nruns):
        print("Run " + str(run + 1))
        sim_environment.start_new_run(run)
        initial_state_generate()
        curr_a = 1  # cyclic test
        t = 0
        counter = [1, 0, 0, 0]
        while True:
            next_intersection = (t + 1) % 4
            if next_intersection == 3:
                a_space = [13, 14, 15]
            else:
                a_space = [4 * next_intersection + 1, 4 * next_intersection + 2,
                           4 * next_intersection + 3, 4 * next_intersection + 4]
            next_a = a_space[counter[next_intersection]]
            if counter[next_intersection] != len(a_space) - 1:
                counter[next_intersection] += 1
            else:
                counter[next_intersection] = 0
            env_param = sim_environment.take_action(curr_a)
            r = env_param['rwd']
            if r == 1000:
                print("End of simulation at t = " + str(t))
                break
            curr_a = next_a
            t += 1
    return
def qr_dqn_live(load):
    print("Running QR-DQN Live")
    # Delete any existing images
    if platform.system() == 'Windows':
        os.system("del .\\img\\*.png")
    elif platform.system() == 'Linux':
        os.system("rm ./img/*.png")
    if load:
        # Load the trained network from disk
        model = Network(len_state=STATE_LEN, num_quant=NUM_QUANTS, num_actions=NUM_ACTIONS)
        model.load_state_dict(torch.load(TMPATH))
        model.eval()
    else:
        # Copy the weights of the network trained in this session
        model = Network(len_state=STATE_LEN, num_quant=NUM_QUANTS, num_actions=NUM_ACTIONS)
        model.load_state_dict(Z.state_dict())
        model.eval()
    t = 0
    sim_environment.start_new_run(0)
    state = initial_state_generate()
    plt.show()
    plt.ion()
    while True:
        plt.clf()
        plt.title('step = %s' % t)
        intersection = t % 4
        if intersection == 3:
            a_space = [12, 13, 14]
        else:
            a_space = [4 * intersection, 4 * intersection + 1,
                       4 * intersection + 2, 4 * intersection + 3]
        action = model.select_action(torch.Tensor([state]), a_space, 0)
        observ = sim_environment.take_action(action + 1)
        state = observ['next_state']
        reward = observ['rwd']
        done = 1 if reward == -100 else 0
        t += 1
        # Plot the return distribution (quantiles) of every action
        Zval = model(torch.Tensor([state])).detach().numpy()
        for i in range(NUM_ACTIONS):
            x, y = get_plot(Zval[0][i])
            plt.plot(x, y, label='%s Q=%.1f' % (i + 1, Zval[0][i].mean()))
        plt.legend(bbox_to_anchor=(1.1, 1.1), ncol=NUM_ACTIONS, prop={'size': 3})
        if done:
            break
        plt.savefig('./img/%s.png' % t)
        plt.pause(0.001)
    plt.close()
    print("Steps = ", t)
    return
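
# NOTE (sketch): get_plot() lives elsewhere in this repo. From its use above
# (turning the NUM_QUANTS quantile values of one action into x/y data for
# plt.plot), it is assumed to build the empirical CDF of that action's return
# distribution. A hypothetical stand-in under that assumption:
def get_plot_sketch(quantile_values):
    # x: sorted quantile locations; y: cumulative probabilities 1/N, 2/N, ..., 1
    x = np.sort(quantile_values)
    y = (np.arange(len(quantile_values)) + 1) / float(len(quantile_values))
    return x, y
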
def qr_dqn_live_noplots(Nruns, load):
    print("Running QR-DQN Live (no plots)")
    if load:
        model = Network(len_state=STATE_LEN, num_quant=NUM_QUANTS, num_actions=NUM_ACTIONS)
        model.load_state_dict(torch.load(TMPATH))
        model.eval()
    else:
        model = Network(len_state=STATE_LEN, num_quant=NUM_QUANTS, num_actions=NUM_ACTIONS)
        model.load_state_dict(Z.state_dict())
        model.eval()
    for run in range(Nruns):
        t = 0
        sim_environment.start_new_run(run)
        state = initial_state_generate()
        while True:
            intersection = t % 4
            if intersection == 3:
                a_space = [12, 13, 14]
            else:
                a_space = [4 * intersection, 4 * intersection + 1,
                           4 * intersection + 2, 4 * intersection + 3]
            action = model.select_action(torch.Tensor([state]), a_space, 0)
            observ = sim_environment.take_action(action + 1)
            state = observ['next_state']
            reward = observ['rwd']
            done = 1 if reward == -100 else 0
            t += 1
            if done:
                break
        print("Steps = ", t)
    return
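
# NOTE (sketch): Network is the QR-DQN model class defined elsewhere in this
# repo (the same class backs Z and Ztgt in qr_dqn_train below). The assumed
# minimal version here is inferred from the constructor arguments and from how
# model(...) and model.select_action(...) are used above; the layer sizes and
# the internals are illustrative assumptions, not the repo's implementation.
import torch.nn as nn

class NetworkSketch(nn.Module):
    def __init__(self, len_state, num_quant, num_actions):
        super().__init__()
        self.num_quant = num_quant
        self.num_actions = num_actions
        # Small MLP that outputs num_actions * num_quant quantile values
        self.layers = nn.Sequential(
            nn.Linear(len_state, 64), nn.ReLU(),
            nn.Linear(64, num_actions * num_quant))

    def forward(self, x):
        # Shape: (batch, num_actions, num_quant)
        return self.layers(x).view(-1, self.num_actions, self.num_quant)

    def select_action(self, state, a_space, eps):
        # Epsilon-greedy over the quantile means, restricted to a_space
        if np.random.rand() < eps:
            return int(np.random.choice(a_space))
        q = self(state).mean(2).detach().numpy()[0]
        return int(a_space[int(np.argmax(q[a_space]))])
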
def lqf(Nruns):
    print("Running LQF")
    for run in range(Nruns):
        print("Run " + str(run + 1))
        sim_environment.start_new_run(run)
        initial_state_generate()
        curr_a = random.randint(1, 4)
        t = 0
        while True:
            next_intersection = (t + 1) % 4
            env_param = sim_environment.take_action(curr_a)
            next_s = env_param['next_state']
            r = env_param['rwd']
            if r == -100:
                print("End of simulation at t = " + str(t))
                break
            if next_intersection == 3:
                a_space = [13, 14, 15]
            else:
                a_space = [4 * next_intersection + 1, 4 * next_intersection + 2,
                           4 * next_intersection + 3, 4 * next_intersection + 4]
            # Pick the signal whose state entry (assumed queue length) is largest,
            # breaking ties at random
            q_next = []
            for a_temp in a_space:
                q_next.append(next_s[a_temp - 1])
            q_max = max(q_next)
            q_max_index = [i for i, j in enumerate(q_next) if j == q_max]
            rand_greedy_q = np.random.choice(q_max_index)
            next_a = a_space[rand_greedy_q]
            curr_a = next_a
            t += 1
    return
def qr_dqn_train(Nruns):
    print("Running QR-DQN Training")
    # Quantile midpoints: tau_i = (2i + 1) / (2 * NUM_QUANTS)
    tau = torch.Tensor((2 * np.arange(NUM_QUANTS) + 1) / (2.0 * NUM_QUANTS)).view(1, -1)
    logger = Logger('q-net', fmt={'loss': '.5f'})
    steps_done = 0
    running_reward = 0
    for run in range(Nruns):
        t = 0
        sum_reward = 0.0
        memory = ReplayMemory(REPLAY_MEM_SIZE)  # Initialize replay buffer
        sim_environment.start_new_run(run)
        state = initial_state_generate()
        while True:
            intersection = t % 4
            if intersection == 3:
                a_space = [12, 13, 14]
            else:
                a_space = [4 * intersection, 4 * intersection + 1,
                           4 * intersection + 2, 4 * intersection + 3]
            action = Z.select_action(torch.Tensor([state]), a_space, calc_epsilon(steps_done))
            observ = sim_environment.take_action(action + 1)
            next_state = observ['next_state']
            reward = observ['rwd']
            done = 1 if reward == -100 else 0
            steps_done += 1
            t += 1
            if not done:
                memory.push(state, action, next_state, reward, float(done))
            sum_reward += reward
            if len(memory) < BATCH_SIZE:
                state = next_state
                continue
            states, actions, rewards, next_states, dones = memory.sample(BATCH_SIZE)
            # Quantiles of the chosen actions under the online network
            theta = Z(states)[np.arange(BATCH_SIZE), actions]
            # Target-network quantiles and greedy next actions, restricted to the
            # action space of the next intersection
            Znext = Ztgt(next_states).detach()
            Qnext_sa = Znext.mean(2)
            anext_max = torch.zeros([BATCH_SIZE], dtype=torch.long)
            for i in range(BATCH_SIZE):
                next_aspace = calc_next_aspace(int(actions[i]))
                temp = Qnext_sa[i, :]
                anext_max[i] = temp[next_aspace].max(0)[1] + next_aspace[0]
            Znext_max = Znext[np.arange(BATCH_SIZE), anext_max]
            Ttheta = rewards + GAMMA * (1 - dones) * Znext_max
            # Quantile regression Huber loss
            diff = Ttheta.t().unsqueeze(-1) - theta
            loss = huber(diff) * (tau - (diff.detach() < 0).float()).abs()
            loss = loss.mean()
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            state = next_state
            # Periodically sync the target network with the online network
            if steps_done % NN_SYNC_FREQ == 0:
                Ztgt.load_state_dict(Z.state_dict())
            if done:
                running_reward = sum_reward if not running_reward else 0.2 * sum_reward + 0.8 * running_reward
                logger.add(run + 1, steps=t, running_reward=running_reward, loss=loss.data.numpy())
                logger.iter_info()
                break
    torch.save(Z.state_dict(), TMPATH)
    return
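
# NOTE (sketch): huber(), calc_epsilon() and calc_next_aspace() are helpers
# defined elsewhere in this repo. The hypothetical versions below are inferred
# from how qr_dqn_train() uses them; the constants (Huber threshold, epsilon
# schedule) are illustrative assumptions only.
def huber_sketch(x, k=1.0):
    # Elementwise Huber loss on the quantile TD errors
    return torch.where(x.abs() <= k, 0.5 * x.pow(2), k * (x.abs() - 0.5 * k))

def calc_epsilon_sketch(steps_done, eps_start=1.0, eps_end=0.1, eps_decay=5000):
    # Exponentially decaying exploration rate
    return eps_end + (eps_start - eps_end) * math.exp(-steps_done / eps_decay)

def calc_next_aspace_sketch(action):
    # The intersection that acts next follows round-robin order; actions 0-11
    # belong to intersections 0-2 (4 signals each), actions 12-14 to intersection 3
    next_intersection = (action // 4 + 1) % 4
    if next_intersection == 3:
        return [12, 13, 14]
    return [4 * next_intersection, 4 * next_intersection + 1,
            4 * next_intersection + 2, 4 * next_intersection + 3]
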
def sarsa_nstep_diff_train(n, c, epsilon, Nruns):
    print("Running nstep SARSA training")
    buff_len = n + 1
    weight = np.zeros([S_LEN * A_LEN + 1, 1])
    avg_reward = 0
    dl_counter = 0
    for run in range(Nruns):
        print("Run " + str(run + 1))
        r_arr = np.zeros(n, dtype=int)
        a_arr = np.zeros(buff_len, dtype=int)
        s_arr = []
        alpha = 0.1 / (run + 1)
        beta = c * alpha
        e = math.exp(-run)
        # Start new run
        sim_environment.start_new_run(run)
        curr_s = initial_state_generate()
        curr_a = random.randint(1, 4)
        a_arr[0] = curr_a
        s_arr.insert(0, curr_s)
        t = 0
        while True:
            next_intersection = (t + 1) % 4
            # Signals corresponding to that intersection
            if next_intersection == 3:
                a_space = [13, 14, 15]
            else:
                a_space = [4 * next_intersection + 1, 4 * next_intersection + 2,
                           4 * next_intersection + 3, 4 * next_intersection + 4]
            # Take action on the current intersection
            env_param = sim_environment.take_action(curr_a)
            # If all the traffic has left the simulation, sim_environment.py returns a reward of 1000
            r = env_param['rwd']
            if r == 1000:
                print("End of simulation at t = " + str(t))
                break
            next_s = env_param['next_state']
            r_arr[t % n] = r
            # Choose the next action epsilon-greedily
            next_a = epsilon_greedy_a(e, a_space, next_s, weight[:, 0])
            # Store the next state and next action
            s_arr.insert((t + 1) % (n + 1), next_s)
            a_arr[(t + 1) % (n + 1)] = next_a
            tau = t - n + 1
            # n-step differential SARSA update (Sutton & Barto, Reinforcement Learning)
            if tau >= 0:
                q_tau_n = q_est(s_arr[(tau + n) % (n + 1)], a_arr[(tau + n) % (n + 1)], weight[:, 0])
                q_tau = q_est(s_arr[tau % (n + 1)], a_arr[tau % (n + 1)], weight[:, 0])
                do_error = sum(r_arr) - n * avg_reward + q_tau_n - q_tau
                avg_reward = avg_reward + beta * do_error
                phi_s_a_tau = phi(s_arr[tau % (n + 1)], a_arr[tau % (n + 1)])
                weight[:, 0] = weight[:, 0] + alpha * do_error * np.transpose(phi_s_a_tau)
            curr_a = next_a
            t += 1
    W = weight[:, 0]
    return W
def sarsa_nstep_diff_train(n, c, epsilon, Nruns):
    print("Running nstep SARSA training")
    buff_len = n + 1
    r_arr = np.zeros(n, dtype=int)
    a_arr = np.zeros(buff_len, dtype=int)
    s_arr = []
    weight = np.zeros([S_LEN * A_LEN + 1, 1])
    for run in range(Nruns):
        print("Run " + str(run + 1))
        avg_reward = 10  # initialize avg reward
        e = epsilon - run * (epsilon / Nruns)
        if e < 0.4:
            e = 0.4
        sim_environment.start_new_run(run)
        curr_s = initial_state_generate()
        curr_a = random.randint(1, 4)
        a_arr[0] = curr_a
        s_arr.insert(0, curr_s)
        t = 0
        while True:
            next_intersection = (t + 1) % 4
            if next_intersection == 3:
                a_space = [13, 14, 15]
            else:
                a_space = [4 * next_intersection + 1, 4 * next_intersection + 2,
                           4 * next_intersection + 3, 4 * next_intersection + 4]
            alpha = 1 / (math.ceil((t + 1) / 10))
            beta = c * alpha
            env_param = sim_environment.take_action(curr_a)
            r = env_param['rwd']
            if r == -100:
                print("Simulation time", t)  # for test
                break
            next_s = env_param['next_state']
            r_arr[t % n] = r
            next_a = epsilon_greedy_a(e, a_space, next_s, weight[:, 0])
            s_arr.insert((t + 1) % (n + 1), next_s)
            a_arr[(t + 1) % (n + 1)] = next_a
            tau = t - n + 1
            if tau >= 0:
                q_tau_n = q_est(s_arr[(tau + n) % (n + 1)], a_arr[(tau + n) % (n + 1)], weight[:, 0])
                q_tau = q_est(s_arr[tau % (n + 1)], a_arr[tau % (n + 1)], weight[:, 0])
                do_error = sum(r_arr) - n * avg_reward + q_tau_n - q_tau
                avg_reward = avg_reward + beta * do_error
                phi_s_a_tau = phi(s_arr[tau % (n + 1)], a_arr[tau % (n + 1)])
                weight[:, 0] = weight[:, 0] + alpha * do_error * np.transpose(phi_s_a_tau)
            curr_a = next_a
            t += 1
    W = weight[:, 0]
    return W
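
# NOTE (sketch): phi() and q_est() implement the linear function approximation
# that both n-step SARSA trainers above rely on; they are defined elsewhere in
# this repo. The hypothetical versions below are inferred only from the weight
# shape [S_LEN * A_LEN + 1, 1] (one block of S_LEN state features per action,
# plus a bias term); the actual feature construction may differ.
def phi_sketch(s, a):
    # Copy the state into the feature block of action a (1-indexed), plus a bias
    features = np.zeros(S_LEN * A_LEN + 1)
    start = (a - 1) * S_LEN
    features[start:start + S_LEN] = s
    features[-1] = 1.0
    return features

def q_est_sketch(s, a, w):
    # Linear state-action value estimate: q(s, a) = w . phi(s, a)
    return float(np.dot(w, phi_sketch(s, a)))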