def Learn_policy_from_data(paths, g, Q, N, Vx_rzns, Vy_rzns, num_of_paths=10, num_actions=36, ALPHA=0.5, method='reverse_order', num_passes=1):
    """Learn Q values from the training trajectories and extract the greedy policy."""
    sampling_interval = SAMPLING_Interval
    # Q = EstimateQ_with_parallel_trajs(paths, g, pos_const, sampling_interval, Q, N, Vx, Vy, num_of_paths)
    # Q, max_Qdel_list = EstimateQ_mids_mids2(paths, g, Q, N, Vx_rzns, Vy_rzns, num_of_paths, num_actions, ALPHA, sampling_interval)
    Q, max_Qdel_list = learn_Q_from_trajs(paths, g, Q, N, Vx_rzns, Vy_rzns, num_of_paths, num_actions, ALPHA,
                                          sampling_interval, method=method, num_passes=num_passes)

    # Compute the greedy policy from the learned Q values
    policy = initialise_policy(g)
    for s in Q.keys():
        newa, _ = max_dict(Q[s])
        policy[s] = newa

    return Q, policy, max_Qdel_list
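# max_dict() and initialise_policy() are helper functions defined elsewhere in this repo.
# For reference, a minimal sketch of max_dict, assuming it is the usual argmax-over-a-dict
# helper that its call sites suggest (the repo's actual implementation may differ):
def max_dict_sketch(d):
    """Return the (key, value) pair with the largest value in dict d."""
    best_key, best_val = None, float('-inf')
    for k, v in d.items():
        if v > best_val:
            best_key, best_val = k, v
    return best_key, best_val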
def Q_update(Q, N, max_delQ, sars, ALPHA, grid, N_inc):
    s1, a1, r1, s2 = sars
    if not grid.is_terminal(s1) and grid.if_within_actionable_time(s1):
        # if (s1[1], s1[2]) != grid.endpos:
        N[s1][a1] += N_inc
        alpha1 = ALPHA / N[s1][a1]
        q_s1_a1 = r1
        if not grid.is_terminal(s2) and grid.if_within_actionable_time(s2):
            # if (s2[1], s2[2]) != grid.endpos:
            _, val = max_dict(Q[s2])
            q_s1_a1 = r1 + val
        old_qsa = Q[s1][a1]
        Q[s1][a1] += alpha1 * (q_s1_a1 - Q[s1][a1])
        delQ = np.abs(old_qsa - Q[s1][a1])
        if delQ > max_delQ:
            max_delQ = delQ
    return Q, N, max_delQ
def Q_update(Q, N, max_delQ, sars):
    # Standalone variant of Q_update that reads grid, N_inc and ALPHA from the enclosing scope
    s1, a1, r1, s2 = sars
    if s1 != grid.endpos:
        N[s1][a1] += N_inc
        alpha1 = ALPHA / N[s1][a1]
        q_s1_a1 = r1
        if s2 != grid.endpos:
            _, val = max_dict(Q[s2])
            q_s1_a1 = r1 + val
        old_qsa = Q[s1][a1]
        Q[s1][a1] += alpha1 * (q_s1_a1 - Q[s1][a1])
        delQ = np.abs(old_qsa - Q[s1][a1])
        if delQ > max_delQ:
            max_delQ = delQ
    return Q, N, max_delQ
def Learn_policy_from_data(paths, g, Q, N, vel_field_data, nmodes, train_path_ids, n_inc, num_actions=36, ALPHA=0.5, method='reverse_order', num_passes=1):
    global N_inc
    N_inc = n_inc
    print("$$$$$$$$$$$ CHECK in buildQ: N_inc = ", N_inc)

    sampling_interval = SAMPLING_Interval
    # Q = EstimateQ_with_parallel_trajs(paths, g, pos_const, sampling_interval, Q, N, Vx, Vy, train_path_ids)
    # Q, max_Qdel_list = EstimateQ_mids_mids2(paths, g, Q, N, Vx_rzns, Vy_rzns, num_of_paths, num_actions, ALPHA, sampling_interval)
    Q, N, max_Qdel_list = learn_Q_from_trajs(paths, g, Q, N, vel_field_data, nmodes, train_path_ids, num_actions, ALPHA,
                                             sampling_interval, method=method, num_passes=num_passes)

    # Compute the greedy policy from the learned Q values
    policy = initialise_policy(g)
    for s in Q.keys():
        newa, _ = max_dict(Q[s])
        policy[s] = newa

    return Q, N, policy, max_Qdel_list
def update_Q_in_future_kth_rzn(g, Q, N, vel_field_data, nmodes, s1, rzn, eps):
    """
    Almost the same as Run_Q_learning_episode().
    s2: current state while simulating the rollout.
    """
    t, i, j = s1
    g.set_state(s1)
    dummy_policy = None  # stochastic_action_eps_greedy() uses Q here, so the policy argument is ignored anyway
    # a1 = stochastic_action_eps_greedy(policy, s1, g, eps, Q=Q)
    count = 0
    max_delQ = 0

    # while not g.is_terminal() and g.if_within_TD_actionable_time():
    while not g.is_terminal(s1) and not g.if_edge_state(s1) and g.if_within_actionable_time():
        # Will have to change this for general time
        t, i, j = s1
        a1 = stochastic_action_eps_greedy(dummy_policy, s1, g, eps, Q=Q)
        vx, vy = extract_velocity(vel_field_data, t, i, j, rzn)
        r = g.move_exact(a1, vx, vy, rzn)
        # r = g.move_exact(a1, Vx_rzns[rzn, i, j], Vy_rzns[rzn, i, j])
        s2 = g.current_state()
        # if g.is_terminal() or (not g.if_within_actionable_time()):

        alpha = ALPHA / N[s1][a1]
        N[s1][a1] += N_inc

        # max_q_s2_a2 = 0 if the next state is a terminal state / edge state / outside actionable time
        max_q_s2_a2 = 0
        if not g.is_terminal(s2) and not g.if_edge_state(s2) and g.if_within_actionable_time():
            a2, max_q_s2_a2 = max_dict(Q[s2])

        old_qsa = Q[s1][a1]
        Q[s1][a1] = Q[s1][a1] + alpha * (r + max_q_s2_a2 - Q[s1][a1])
        if np.abs(old_qsa - Q[s1][a1]) > max_delQ:
            max_delQ = np.abs(old_qsa - Q[s1][a1])

        s1 = s2
        # t, i, j = s1

    return Q, N
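# update_Q_in_future_rollouts() is called in run_and_plot_onboard_routing_episodes() below but is
# defined elsewhere in the repo. A minimal, hypothetical sketch consistent with that call site and
# with update_Q_in_future_kth_rzn() above; the realization list and epsilon value are assumptions,
# not the repo's actual settings:
def update_Q_in_future_rollouts_sketch(g, Q, N, cs1as2_list, vel_field_data, nmodes, loop_count,
                                       rollout_rzns=(0, 1, 2), eps=0.1):
    """Refine Q onboard by replaying recent transitions through a few assumed future realizations."""
    for cs1_a_s2 in cs1as2_list:
        if cs1_a_s2 is None:  # the deque is pre-filled with None entries
            continue
        _, _, s2 = cs1_a_s2  # (cs1, a, s2); roll out from the reached state s2
        for rzn in rollout_rzns:
            Q, N = update_Q_in_future_kth_rzn(g, Q, N, vel_field_data, nmodes, s2, rzn, eps)
    return Q, N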
def learn_Q_from_exp_buffer(grid, exp_buffer, Q, N, ALPHA, method='reverse_order', num_passes=1):
    """
    Learns Q values after building the experience buffer.
    Contains 2 types of methods:
        1. reverse pass through the buffer ('reverse_order')
        2. random (iid) pass through the buffer ('iid')
    :param grid: grid/environment object
    :param exp_buffer: list with one inner list of (s, a, r, s') tuples per training trajectory
    :param Q: dict of dicts of Q values, indexed as Q[s][a]
    :param N: dict of dicts of visit counts, indexed as N[s][a]
    :param ALPHA: base learning rate (effective rate is ALPHA / N[s][a])
    :param method: 'reverse_order' or 'iid'
    :param num_passes: maximum number of passes over the buffer
    :return: updated Q, updated N, and the list of per-pass max Q changes
    """
    # print("$$$$ CHECK: ")
    # for kth_traj_buffer in exp_buffer:
    #     print("* * * * * *")
    #     for i in range(3):
    #         print(kth_traj_buffer[i])

    if not (method == 'reverse_order' or method == 'iid'):
        print("No such method for learning Q values from trajectories")
        return

    print("In Build_Q_... learning method = ", method)
    max_delQ_list = []

    if method == 'reverse_order':
        for Pass in range(num_passes):
            print("in Build_Q_.. : pass ", Pass)
            max_delQ = 0
            for kth_traj_buffer in exp_buffer:
                for sars in kth_traj_buffer:
                    Q, N, max_delQ = Q_update(Q, N, max_delQ, sars, ALPHA, grid, N_inc)
            max_delQ_list.append(max_delQ)
            print('max_delQ= ', max_delQ)
            # print("Q[start] = ", Q[grid.start_state])
            print('Q[s]: best a, val =', max_dict(Q[grid.start_state]))
            if max_delQ < max_delQ_threshold:
                print("Qs converged")
                break

    if method == 'iid':
        flatten = lambda l: [item for sublist in l for item in sublist]
        exp_buffer = flatten(exp_buffer)
        idx_list = np.arange(len(exp_buffer))
        print(len(exp_buffer))
        for Pass in range(num_passes):
            print("in Build_Q_.. : pass ", Pass)
            random.shuffle(idx_list)
            max_delQ = 0
            for i in idx_list:
                sars = exp_buffer[i]
                Q, N, max_delQ = Q_update(Q, N, max_delQ, sars, ALPHA, grid, N_inc)
            max_delQ_list.append(max_delQ)
            print('max_delQ= ', max_delQ)
            # print("Q[start] = ", Q[grid.start_state])
            print('Q[s]: best a, val =', max_dict(Q[grid.start_state]))
            if max_delQ < max_delQ_threshold:
                print("Qs converged")
                break

    return Q, N, max_delQ_list
def run_and_plot_onboard_routing_episodes(setup_grid_params, Q, N, fpath, fname):
    # setup_grid_params: g, xs, ys, X, Y, vel_field_data, nmodes, useful_num_rzns, paths, params, param_str
    g, xs, ys, X, Y, vel_field_data, nmodes, _, paths, _, _ = setup_grid_params
    g.make_bcrumb_dict(paths, train_id_list)
    gcopy = copy.deepcopy(g)
    # Copy Q to Qcopy (deep copies of Q and N are made per test realization inside the loop below)

    msize = 15
    # fsize = 3
    fig = plt.figure(figsize=(10, 10))
    ax = fig.add_subplot(1, 1, 1)
    ax.set_xlim(0, 1)
    ax.set_ylim(0, 1)
    minor_ticks = [i / 100.0 for i in range(101) if i % 20 != 0]
    major_ticks = [i / 100.0 for i in range(0, 120, 20)]
    ax.set_xticks(minor_ticks, minor=True)
    ax.set_xticks(major_ticks, minor=False)
    ax.set_yticks(major_ticks, minor=False)
    ax.set_yticks(minor_ticks, minor=True)
    ax.grid(b=True, which='both', color='#CCCCCC', axis='both', linestyle='-', alpha=0.5)
    ax.tick_params(axis='both', which='both', labelsize=6)
    ax.set_xlabel('X (Non-Dim)')
    ax.set_ylabel('Y (Non-Dim)')

    st_point = g.start_state
    plt.scatter(g.xs[st_point[1]], g.ys[g.ni - 1 - st_point[0]], marker='o', s=msize, color='k', zorder=1e5)
    plt.scatter(g.xs[g.endpos[1]], g.ys[g.ni - 1 - g.endpos[0]], marker='*', s=msize * 2, color='k', zorder=1e5)
    plt.gca().set_aspect('equal', adjustable='box')
    # plt.quiver(X, Y, Vx_rzns[0, :, :], Vy_rzns[0, :, :])

    t_list = []
    traj_list = []
    bad_count = 0

    # for k in range(len(test_id_list)):
    for k in range(n_test_paths_range[0], n_test_paths_range[1]):
        Qcopy = copy.deepcopy(Q)
        Ncopy = copy.deepcopy(N)
        rzn = test_id_list[k]
        init_list = [None for i in range(rollout_interval)]
        cs1as2_list = deque(init_list)  # fixed-length record of the latest transitions
        print("-------- In rzn ", rzn, " of test_id_list ---------")

        g.set_state(g.start_state)
        dont_plot = False
        bad_flag = False
        xtr = []
        ytr = []
        s1 = g.start_state
        t, i, j = s1
        cs1 = (t, g.x, g.y, i, j)
        a, q_s_a = max_dict(Qcopy[s1])
        xtr.append(g.x)
        ytr.append(g.y)
        loop_count = 0

        # while not g.is_terminal() and g.if_within_actionable_time() and g.current_state:
        #     print("__CHECK__ t, i, j")
        while True:
            loop_count += 1
            vx, vy = extract_velocity(vel_field_data, t, i, j, rzn)
            r = g.move_exact(a, vx, vy)
            # r = g.move_exact(a, Vx_rzns[rzn, i, j], Vy_rzns[rzn, i, j])
            s2 = g.current_state()
            (t, i, j) = s2
            cs1_a_s2 = (cs1, a, s2)

            # keep the n latest transitions, where n = rollout_interval
            cs1as2_list.pop()
            cs1as2_list.appendleft(cs1_a_s2)

            xtr.append(g.x)
            ytr.append(g.y)

            if g.if_edge_state((i, j)):
                bad_count += 1
                # dont_plot = True
                break

            if (not g.is_terminal(almost=True)) and g.if_within_actionable_time():
                if loop_count % rollout_interval == 0:
                    print("------------loopcount/mission_time =", loop_count)
                    # for kk in range(len(cs1as2_list)):
                    #     check_cs1_a_s2 = cs1as2_list[kk]
                    #     check_cs1 = check_cs1_a_s2[0]
                    #     check_s2 = check_cs1_a_s2[2]
                    #     tij1 = (check_cs1[0], check_cs1[3], check_cs1[4])
                    #     print("check: ", tij1, check_s2)
                    Qcopy, Ncopy = update_Q_in_future_rollouts(gcopy, Qcopy, Ncopy, cs1as2_list, vel_field_data, nmodes, loop_count)
                s1 = s2  # for the next iteration of the loop
                cs1 = (t, g.x, g.y, i, j)
                a, q_s_a = max_dict(Qcopy[s1])
            elif g.is_terminal(almost=True):
                break
            else:
                # i.e. not terminal and not within actionable time.
                # Terminality has already been checked above; if time reaches nt (i.e. outside
                # actionable time), increment bad_count and do not plot.
                bad_count += 1
                bad_flag = True
                # dont_plot = True
                break

        if dont_plot == False:
            plt.plot(xtr, ytr)

        # If bad_flag is True, append None to the lists; these Nones are counted later.
        if bad_flag == False:
            traj_list.append((xtr, ytr))
            t_list.append(t)  # ADDED for trajectory comparison
        else:
            traj_list.append(None)
            t_list.append(None)

    if fname != None:
        plt.savefig(join(fpath, fname), bbox_inches="tight", dpi=200)
        plt.cla()
        plt.close(fig)
        writePolicytoFile(t_list, join(fpath, fname + 'tlist'))
        picklePolicy(traj_list, join(fpath, fname + '_coord_traj'))
        print("*** pickled phase2 traj_list ***")

    return t_list, bad_count
def EstimateQ_mids_mids2(paths, grid, Q, N, Vx_rzns, Vy_rzns, num_of_paths, num_actions, ALPHA, sampling_interval):
    # Considers transitions from middle of state to middle of state.
    # Chooses the correct actions by taking the velocity field into consideration.
    # Generates the velocity field realization here.
    max_delQ_list = []

    # Pick a trajectory from paths and store it in reverse order
    for k in range(num_of_paths):
        if k % 500 == 0:
            print("traj_", k)
        max_delQ = 0

        # Set up the corresponding realisation of the velocity field
        # may have to build the realisation here!!!!
        # Vxt = Vx_rzns[k, :, :, :]
        # Vyt = Vy_rzns[k, :, :, :]
        # Jugaad
        Vxt = Vx_rzns[k, :, :]
        Vyt = Vy_rzns[k, :, :]

        # For all trajectories in the list of paths
        trajectory = paths[0, k]
        state_traj = []
        coord_traj = []
        test_trajx = []
        test_trajy = []

        # *********ASSUMING THAT 5DT IN TRAJ DATA IS 1 SECOND********
        # s_t = 1
        s_i = None
        s_j = None
        for j in range(0, len(trajectory) - 1, sampling_interval):
            # the '-1' in the range is to avoid reading the NaN at the end of the path data
            s_i, s_j = compute_cell(grid, trajectory[j])
            # state_traj.append((s_t, s_i, s_j))
            # coord_traj.append((grid.ts[s_t], trajectory[j][0], trajectory[j][1]))
            state_traj.append((s_i, s_j))
            coord_traj.append((trajectory[j][0], trajectory[j][1]))
            # test_trajx.append(trajectory[j][0])
            # test_trajy.append(trajectory[j][1])
            # s_t += 1

        # If the last sampled point is not the endpoint of the trajectory, include it in state_traj/coord_traj
        # s_i_end, s_j_end = compute_cell(grid, trajectory[-2])
        # if (s_i, s_j) != (s_i_end, s_j_end):
        #     state_traj.append((s_t, s_i, s_j))
        #     coord_traj.append((grid.ts[s_t], trajectory[-2][0], trajectory[-2][1]))
        #     test_trajx.append(trajectory[-2][0])
        #     test_trajy.append(trajectory[-2][1])

        # Reverse trajectory orders
        state_traj.reverse()
        coord_traj.reverse()
        test_trajx.reverse()
        test_trajy.reverse()

        # Since the trajectory data does not contain start point info, add it explicitly
        # p, m, n = grid.start_state
        m, n = grid.start_state
        x0 = grid.xs[n]
        y0 = grid.ys[grid.ni - 1 - m]
        state_traj.append(grid.start_state)
        # coord_traj.append((grid.ts[p], x0, y0))
        coord_traj.append((x0, y0))
        # test_trajx.append(x0)
        # test_trajy.append(y0)
        # if k % 500 == 0:
        #     plt.plot(test_trajx, test_trajy, '-o')

        # Update Q values based on state and possible actions
        for i in range(len(state_traj) - 1):
            s1 = state_traj[i + 1]
            s2 = state_traj[i]
            # t, m, n = s1
            m, n = s1
            p1 = coord_traj[i + 1]
            p2 = coord_traj[i]
            # COMMENTING THIS STATEMENT BELOW
            # if (s1[1], s1[2]) != (s2[1], s2[2]):
            # vx = Vxt[t, i, j]
            a1 = Calculate_action(s1, p1, p2, Vxt, Vyt, grid)
            # print("EstQ: YO")
            if (s1[0], s1[1]) != grid.endpos:
                N[s1][a1] += N_inc
                alpha1 = ALPHA / N[s1][a1]

                # Update Q considering that a1 was performed
                grid.set_state(s1, xcoord=p1[0], ycoord=p1[1])
                r1 = grid.move_exact(a1, Vxt[m, n], Vyt[m, n])
                q_s_a1 = r1
                next_s = grid.current_state()
                if (next_s[0], next_s[1]) != grid.endpos:
                    _, val = max_dict(Q[next_s])
                    q_s_a1 = r1 + val
                old_qsa = Q[s1][a1]
                Q[s1][a1] += alpha1 * (q_s_a1 - Q[s1][a1])
                if np.abs(old_qsa - Q[s1][a1]) > max_delQ:
                    max_delQ = np.abs(old_qsa - Q[s1][a1])

        max_delQ_list.append(max_delQ)

    return Q, max_delQ_list
def learn_Q_from_exp_buffer(grid, exp_buffer, Q, N, ALPHA, method='reverse_order', num_passes=1):
    """
    Learns Q values after building the experience buffer.
    Contains 2 types of methods:
        1. reverse pass through the buffer ('reverse_order')
        2. random (iid) pass through the buffer ('iid')
    :param grid: grid/environment object
    :param exp_buffer: list with one inner list of (s, a, r, s') tuples per training trajectory
    :param Q: dict of dicts of Q values, indexed as Q[s][a]
    :param N: dict of dicts of visit counts, indexed as N[s][a]
    :param ALPHA: base learning rate (effective rate is ALPHA / N[s][a])
    :param method: 'reverse_order' or 'iid'
    :param num_passes: maximum number of passes over the buffer
    :return: updated Q and the list of per-pass max Q changes
    """

    def Q_update(Q, N, max_delQ, sars):
        # One Q-learning backup for a single (s, a, r, s') transition
        s1, a1, r1, s2 = sars
        if s1 != grid.endpos:
            N[s1][a1] += N_inc
            alpha1 = ALPHA / N[s1][a1]
            q_s1_a1 = r1
            if s2 != grid.endpos:
                _, val = max_dict(Q[s2])
                q_s1_a1 = r1 + val
            old_qsa = Q[s1][a1]
            Q[s1][a1] += alpha1 * (q_s1_a1 - Q[s1][a1])
            delQ = np.abs(old_qsa - Q[s1][a1])
            if delQ > max_delQ:
                max_delQ = delQ
        return Q, N, max_delQ

    if not (method == 'reverse_order' or method == 'iid'):
        print("No such method for learning Q values from trajectories")
        return

    print("In Build_Q_... learning method = ", method)
    max_delQ_list = []

    if method == 'reverse_order':
        for Pass in range(num_passes):
            print("in Build_Q_.. : pass ", Pass)
            max_delQ = 0
            for kth_traj_buffer in exp_buffer:
                for sars in kth_traj_buffer:
                    Q, N, max_delQ = Q_update(Q, N, max_delQ, sars)
            max_delQ_list.append(max_delQ)
            print('max_delQ= ', max_delQ)
            print("Q[start] = ", Q[grid.startpos])
            print('Q[s]: best a, val =', max_dict(Q[grid.startpos]))
            if max_delQ < 1:
                print("Qs converged")
                break

    if method == 'iid':
        flatten = lambda l: [item for sublist in l for item in sublist]
        exp_buffer = flatten(exp_buffer)
        idx_list = np.arange(len(exp_buffer))
        print(len(exp_buffer))
        for Pass in range(num_passes):
            print("in Build_Q_.. : pass ", Pass)
            random.shuffle(idx_list)
            max_delQ = 0
            for i in idx_list:
                sars = exp_buffer[i]
                Q, N, max_delQ = Q_update(Q, N, max_delQ, sars)
            max_delQ_list.append(max_delQ)
            print('max_delQ= ', max_delQ)
            print("Q[start] = ", Q[grid.startpos])
            print('Q[s]: best a, val =', max_dict(Q[grid.startpos]))
            if max_delQ < 1:
                print("Qs converged")
                break

    return Q, max_delQ_list
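# Hedged usage sketch for learn_Q_from_exp_buffer (toy data, not from the repo). It assumes the
# module-level imports/helpers (numpy as np, max_dict) are available, and that Q and N are dicts
# of dicts keyed by state then action, which is how the function indexes them above. ToyGrid and
# all toy_* names are hypothetical placeholders, not repo objects.
if __name__ == '__main__':
    class ToyGrid:
        startpos = (0, 0)
        endpos = (1, 1)

    toy_states = [(0, 0), (0, 1), (1, 1)]
    toy_actions = [0, 1]
    Q_toy = {s: {a: 0.0 for a in toy_actions} for s in toy_states}
    N_toy = {s: {a: 0 for a in toy_actions} for s in toy_states}
    N_inc = 1  # the nested Q_update reads N_inc from module scope

    # One inner list of (s, a, r, s') tuples per training trajectory, in update order
    toy_exp_buffer = [[((0, 1), 1, -1.0, (1, 1)), ((0, 0), 0, -1.0, (0, 1))]]

    Q_toy, toy_max_delQ_list = learn_Q_from_exp_buffer(ToyGrid(), toy_exp_buffer, Q_toy, N_toy,
                                                       ALPHA=0.5, method='reverse_order')
    print(toy_max_delQ_list)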