Example #1
def Learn_policy_from_data(paths,
                           g,
                           Q,
                           N,
                           Vx_rzns,
                           Vy_rzns,
                           num_of_paths=10,
                           num_actions=36,
                           ALPHA=0.5,
                           method='reverse_order',
                           num_passes=1):

    sampling_interval = SAMPLING_Interval
    # Q = EstimateQ_with_parallel_trajs(paths, g, pos_const, sampling_interval, Q, N, Vx, Vy, num_of_paths)
    # Q, max_Qdel_list= EstimateQ_mids_mids2(paths, g, Q, N, Vx_rzns, Vy_rzns, num_of_paths, num_actions, ALPHA, sampling_interval )
    Q, max_Qdel_list = learn_Q_from_trajs(paths,
                                          g,
                                          Q,
                                          N,
                                          Vx_rzns,
                                          Vy_rzns,
                                          num_of_paths,
                                          num_actions,
                                          ALPHA,
                                          sampling_interval,
                                          method=method,
                                          num_passes=num_passes)
    # Compute policy
    policy = initialise_policy(g)
    for s in Q.keys():
        newa, _ = max_dict(Q[s])
        policy[s] = newa

    return Q, policy, max_Qdel_list
def Q_update(Q, N, max_delQ, sars, ALPHA, grid, N_inc):
    # One tabular Q-learning backup for a single (s, a, r, s') sample.
    s1, a1, r1, s2 = sars
    if not grid.is_terminal(s1) and grid.if_within_actionable_time(s1):     # if (s1[1], s1[2]) != grid.endpos:
        N[s1][a1] += N_inc
        alpha1 = ALPHA / N[s1][a1]
        q_s1_a1 = r1
        if not grid.is_terminal(s2) and grid.if_within_actionable_time(s2): # if (s2[1], s2[2]) != grid.endpos:
            _, val = max_dict(Q[s2])
            q_s1_a1 = r1 + val
        old_qsa = Q[s1][a1]
        Q[s1][a1] += alpha1 * (q_s1_a1 - Q[s1][a1])
        delQ = np.abs(old_qsa - Q[s1][a1])
        if delQ > max_delQ:
            max_delQ = delQ
    return Q, N, max_delQ
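
The Q_update helper above applies the standard tabular Q-learning backup Q[s][a] += (ALPHA / N[s][a]) * (r + max_a' Q[s'][a'] - Q[s][a]), with a step size that decays with the visit count N[s][a]. Below is a minimal, self-contained sketch of one such backup on toy dictionaries; the states, rewards and the simplified max_dict are made up for illustration and are not part of the project.

def max_dict(d):
    # return (argmax key, max value) of an {action: value} dict
    best_a = max(d, key=d.get)
    return best_a, d[best_a]

ALPHA, N_inc = 0.5, 1
Q = {'s1': {'a': 0.0, 'b': 0.0}, 's2': {'a': 2.0, 'b': 5.0}}
N = {'s1': {'a': 0, 'b': 0}}

s1, a1, r1, s2 = 's1', 'a', -1.0, 's2'   # one (s, a, r, s') sample
N[s1][a1] += N_inc
alpha1 = ALPHA / N[s1][a1]               # step size decays with the visit count
_, val = max_dict(Q[s2])                 # bootstrap from the best next action
Q[s1][a1] += alpha1 * (r1 + val - Q[s1][a1])
print(Q['s1'])                           # {'a': 2.0, 'b': 0.0}
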
Example #3
def Q_update(Q, N, max_delQ, sars):
    s1, a1, r1, s2 = sars
    if s1 != grid.endpos:
        N[s1][a1] += N_inc
        alpha1 = ALPHA / N[s1][a1]
        q_s1_a1 = r1
        if s2 != grid.endpos:
            _, val = max_dict(Q[s2])
            q_s1_a1 = r1 + val
        old_qsa = Q[s1][a1]
        Q[s1][a1] += alpha1 * (q_s1_a1 - Q[s1][a1])
        delQ = np.abs(old_qsa - Q[s1][a1])
        if delQ > max_delQ:
            max_delQ = delQ
    return Q, N, max_delQ
def Learn_policy_from_data(paths, g, Q, N, vel_field_data, nmodes, train_path_ids, n_inc, num_actions = 36, ALPHA=0.5, method = 'reverse_order', num_passes = 1):

    global N_inc
    N_inc = n_inc
    print("$$$$$$$$$$$ CHECk in buildQ: N_inc = ", N_inc)

    sampling_interval = SAMPLING_Interval
    # Q = EstimateQ_with_parallel_trajs(paths, g, pos_const, sampling_interval, Q, N, Vx, Vy, train_path_ids)
    # Q, max_Qdel_list= EstimateQ_mids_mids2(paths, g, Q, N, Vx_rzns, Vy_rzns, num_of_paths, num_actions, ALPHA, sampling_interval )
    Q, N, max_Qdel_list = learn_Q_from_trajs(paths, g, Q, N, vel_field_data, nmodes, train_path_ids, num_actions, ALPHA, sampling_interval, method=method, num_passes=num_passes)
    # Compute policy
    policy = initialise_policy(g)
    for s in Q.keys():
        newa, _ = max_dict(Q[s])
        policy[s] = newa

    return Q, N, policy, max_Qdel_list
def update_Q_in_future_kth_rzn(g, Q, N, vel_field_data, nmodes, s1, rzn, eps):
    """
    almost same as from Run_Q_learning_episode()
    s2: current state in whilie simulating roolout
    """

    t, i, j = s1
    g.set_state(s1)
    dummy_policy = None   # stochastic_action_eps_greedy() here uses Q, so the policy argument is ignored anyway
    # a1 = stochastic_action_eps_greedy(policy, s1, g, eps, Q=Q)
    count = 0
    max_delQ = 0

    # while not g.is_terminal() and g.if_within_TD_actionable_time():
    while not g.is_terminal(s1) and not g.if_edge_state(s1) and g.if_within_actionable_time():
        """Will have to change this for general time"""
        
        t, i, j = s1
        a1 = stochastic_action_eps_greedy(dummy_policy, s1, g, eps, Q=Q)
        vx, vy = extract_velocity(vel_field_data, t, i, j, rzn)
        r = g.move_exact(a1, vx, vy, rzn)
        # r = g.move_exact(a1, Vx_rzns[rzn, i, j], Vy_rzns[rzn, i, j])
        s2 = g.current_state()
        # if g.is_terminal() or (not g.if_within_actionable_time()):

        N[s1][a1] += N_inc
        alpha = ALPHA / N[s1][a1]   # increment the visit count before computing the step size (avoids division by zero on the first visit)

        # max_q_s2_a2 = 0 if the next state is a terminal state / edge state / outside actionable time
        max_q_s2_a2 = 0
        if not g.is_terminal(s2) and not g.if_edge_state(s2) and g.if_within_actionable_time():
            a2, max_q_s2_a2 = max_dict(Q[s2])

        old_qsa = Q[s1][a1]
        Q[s1][a1] = Q[s1][a1] + alpha*(r + max_q_s2_a2 - Q[s1][a1])

        if np.abs(old_qsa - Q[s1][a1]) > max_delQ:
            max_delQ = np.abs(old_qsa - Q[s1][a1])


        s1 = s2
        # t, i, j = s1

    return Q, N
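
update_Q_in_future_kth_rzn relies on stochastic_action_eps_greedy to pick actions epsilon-greedily from Q (which is why the policy argument can be a dummy). That helper is defined elsewhere in the project; the snippet below is only a hypothetical stand-in illustrating epsilon-greedy selection over a Q[s] dictionary.

import random

def eps_greedy_action(Q_s, eps):
    # with probability eps explore uniformly, otherwise act greedily w.r.t. Q_s
    if random.random() < eps:
        return random.choice(list(Q_s.keys()))
    return max(Q_s, key=Q_s.get)

print(eps_greedy_action({'up': 1.0, 'down': -2.0}, eps=0.1))   # usually 'up'
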
def learn_Q_from_exp_buffer(grid, exp_buffer, Q, N, ALPHA, method='reverse_order', num_passes =1):
    """
    Learns Q values from a pre-built experience buffer. Supports two methods: 1. a reverse pass through each trajectory's buffer ('reverse_order')  2. a random pass through the flattened buffer ('iid')
    :param grid:
    :param exp_buffer:
    :param Q:
    :param N:
    :param ALPHA:
    :param method:
    :param num_passes:
    :return:
    """
    # print("$$$$ CHECK: ")
    # for kth_traj_buffer in exp_buffer:
    #     print("* *  * *  * *")
    #     for i in range(3):
    #         print(kth_traj_buffer[i])

    if not (method == 'reverse_order' or method == 'iid'):
        print("No such method learning Q values from traj")
        return

    print("In Build_Q_...   learning method = ", method)
    max_delQ_list = []

    if method == 'reverse_order':
        for Pass in range(num_passes):
            print("in Build_Q_.. : pass ", Pass)
            max_delQ = 0
            for kth_traj_buffer in exp_buffer:
                for sars in kth_traj_buffer:
                    Q, N, max_delQ = Q_update(Q, N, max_delQ, sars, ALPHA, grid, N_inc)

            max_delQ_list.append(max_delQ)
            print('max_delQ= ',max_delQ)
            # print("Q[start] = ", Q[grid.start_state])
            print('Q[s]: best a, val =', max_dict(Q[grid.start_state]))
            if max_delQ < max_delQ_threshold:
                print("Qs converged")
                break

    if method == 'iid':
        flatten = lambda l: [item for sublist in l for item in sublist]
        exp_buffer = flatten(exp_buffer)
        idx_list= np.arange(len(exp_buffer))
        print(len(exp_buffer))

        for Pass in range(num_passes):
            print("in Build_Q_.. : pass ", Pass)
            random.shuffle(idx_list)
            max_delQ = 0
            for i in idx_list:
                sars = exp_buffer[i]
                Q, N, max_delQ = Q_update(Q, N, max_delQ, sars, ALPHA, grid, N_inc)

            max_delQ_list.append(max_delQ)
            print('max_delQ= ', max_delQ)
            # print("Q[start] = ", Q[grid.start_state])
            print('Q[s]: best a, val =', max_dict(Q[grid.start_state]))
            if max_delQ < max_delQ_threshold:
                print("Qs converged")
                break

    return Q, N, max_delQ_list
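
The experience buffer consumed above is a list containing one inner list of (s, a, r, s') tuples per training trajectory. The toy sketch below (states and rewards are made up) shows how the two methods traverse it: 'reverse_order' sweeps each trajectory buffer in its stored order, while 'iid' flattens all trajectories and shuffles the indices to break temporal correlation.

import random

# one inner list of (s, a, r, s') tuples per trajectory (values are made up)
exp_buffer = [
    [(('t2', 1, 1), 'a0', -1.0, ('t3', 0, 1)), (('t1', 2, 1), 'a1', -1.0, ('t2', 1, 1))],
    [(('t2', 1, 2), 'a1', -1.0, ('t3', 0, 2)), (('t1', 2, 2), 'a0', -1.0, ('t2', 1, 2))],
]

# 'reverse_order': sweep each trajectory buffer in its stored order
for kth_traj_buffer in exp_buffer:
    for sars in kth_traj_buffer:
        pass   # Q_update(Q, N, max_delQ, sars, ...) would go here

# 'iid': flatten all trajectories, then visit the samples in shuffled order
flat = [sars for traj in exp_buffer for sars in traj]
idx_list = list(range(len(flat)))
random.shuffle(idx_list)
for i in idx_list:
    sars = flat[i]   # Q_update(...) on a randomly drawn sample
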
def run_and_plot_onboard_routing_episodes(setup_grid_params, Q, N, fpath, fname):
    # g, xs, ys, X, Y, vel_field_data, nmodes, useful_num_rzns, paths, params, param_str
    g, xs, ys, X, Y, vel_field_data, nmodes, _, paths, _, _ = setup_grid_params
    g.make_bcrumb_dict(paths, train_id_list)
   
    gcopy = copy.deepcopy(g)
    # Copy Q to Qcopy

    msize = 15
    # fsize = 3

    fig = plt.figure(figsize=(10, 10))
    ax = fig.add_subplot(1, 1, 1)
    ax.set_xlim(0,1)
    ax.set_ylim(0,1)

    minor_ticks = [i/100.0 for i in range(101) if i%20!=0]
    major_ticks = [i/100.0 for i in range(0,120,20)]

    ax.set_xticks(minor_ticks, minor=True)
    ax.set_xticks(major_ticks, minor=False)
    ax.set_yticks(major_ticks, minor=False)
    ax.set_yticks(minor_ticks, minor=True)

    ax.grid(True, which='both', color='#CCCCCC', axis='both', linestyle='-', alpha=0.5)
    ax.tick_params(axis='both', which='both', labelsize=6)

    ax.set_xlabel('X (Non-Dim)')
    ax.set_ylabel('Y (Non-Dim)')

    st_point= g.start_state
    plt.scatter(g.xs[st_point[1]], g.ys[g.ni - 1 - st_point[0]], marker = 'o', s = msize, color = 'k', zorder = 1e5)
    plt.scatter(g.xs[g.endpos[1]], g.ys[g.ni - 1 - g.endpos[0]], marker = '*', s = msize*2, color ='k', zorder = 1e5)
    plt.gca().set_aspect('equal', adjustable='box')

    # plt.quiver(X, Y, Vx_rzns[0, :, :], Vy_rzns[0, :, :])

    
    t_list=[]
    traj_list = []
    bad_count = 0
    # for k in range(len(test_id_list)):
    for k in range(n_test_paths_range[0], n_test_paths_range[1]):
        Qcopy = copy.deepcopy(Q)
        Ncopy = copy.deepcopy(N)
        rzn = test_id_list[k]

        init_list = [None for i in range(rollout_interval)]
        cs1as2_list = deque(init_list)         # fixed-length window of the most recent transitions

        print("-------- In rzn ", rzn, " of test_id_list ---------")
        g.set_state(g.start_state)
        dont_plot =False
        bad_flag = False

        xtr = []
        ytr = []

        s1 = g.start_state
        t, i, j = s1
        cs1 = (t, g.x, g.y, i, j)
        a, q_s_a = max_dict(Qcopy[s1])

        xtr.append(g.x)
        ytr.append(g.y)
        loop_count = 0
        # while not g.is_terminal() and g.if_within_actionable_time() and g.current_state:
        # print("__CHECK__ t, i, j")
        while True:
            loop_count += 1
            vx, vy = extract_velocity(vel_field_data, t, i, j, rzn)
            r = g.move_exact(a, vx, vy)
            # r = g.move_exact(a, Vx_rzns[rzn, i, j], Vy_rzns[rzn, i, j])
            s2 = g.current_state()
            (t, i, j) = s2
            cs1_a_s2 = (cs1, a, s2)

            # keep n latest transitions where n = rollout_interval
            cs1as2_list.pop()
            cs1as2_list.appendleft(cs1_a_s2)

            xtr.append(g.x)
            ytr.append(g.y)


            if g.if_edge_state((i,j)):
                bad_count += 1
                # dont_plot=True
                break
            if (not g.is_terminal(almost=True)) and g.if_within_actionable_time():
                if loop_count % rollout_interval == 0:
                    print("------------loopcount/mission_time =", loop_count)
                    # for kk in range(len(cs1as2_list)):
                    #     check_cs1_a_s2 = cs1as2_list[kk]
                    #     check_cs1 = check_cs1_a_s2[0]
                    #     check_s2 = check_cs1_a_s2[2]
                    #     tij1 = (check_cs1[0],check_cs1[3],check_cs1[4])
                        # print("check: ", tij1, check_s2)
                    Qcopy, Ncopy = update_Q_in_future_rollouts(gcopy, Qcopy, Ncopy, cs1as2_list, vel_field_data, nmodes, loop_count)
                s1 = s2 #for next iteration of loop
                cs1 = (t, g.x, g.y, i, j)
                a, q_s_a = max_dict(Qcopy[s1])
            elif g.is_terminal(almost = True):
                break
            else:
                # i.e. not terminal and not within actionable time.
                # Terminality was already checked above; if the state is not terminal
                # but time has reached nt (i.e. outside actionable time), increment bad_count and don't plot.
                bad_count += 1
                bad_flag = True
                # dont_plot=True
                break


        if not dont_plot:
            plt.plot(xtr, ytr)
        # if bad_flag is True then append None to the lists. These Nones are counted later
        if not bad_flag:
            traj_list.append((xtr,ytr))
            t_list.append(t)
        # ADDED for trajectory comparison
        else:
            traj_list.append(None)
            t_list.append(None)


    if fname is not None:
        plt.savefig(join(fpath, fname), bbox_inches="tight", dpi=200)
        plt.cla()
        plt.close(fig)
        writePolicytoFile(t_list, join(fpath, fname + 'tlist'))
        picklePolicy(traj_list, join(fpath, fname + '_coord_traj'))
        print("*** pickled phase2 traj_list ***")

    return t_list, bad_count
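
run_and_plot_onboard_routing_episodes keeps a fixed-length window of the most recent transitions (cs1as2_list) via a manual pop()/appendleft() on a pre-filled deque, and re-learns Q over rollouts every rollout_interval steps. A deque constructed with maxlen behaves the same way; the following standalone sketch (the window size and transitions are made up) shows that equivalent pattern.

from collections import deque

rollout_interval = 3
window = deque(maxlen=rollout_interval)        # oldest entries drop off the right automatically
for step in range(5):
    transition = ((step, 0.0, 0.0, 0, 0), 'a', (step + 1, 0, 0))   # dummy (cs1, a, s2)-shaped tuple
    window.appendleft(transition)
print([tr[0][0] for tr in window])             # only the 3 most recent steps remain: [4, 3, 2]
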
Example #8
def EstimateQ_mids_mids2(paths, grid, Q, N, Vx_rzns, Vy_rzns, num_of_paths,
                         num_actions, ALPHA, sampling_interval):
    # considers transition from middle of state to middle of state
    # chooses correct actions by taking into consideration velocity field
    # generates velocity field realization here

    max_delQ_list = []
    #pick trajectory from paths and store in reverse order
    for k in range(num_of_paths):
        if k % 500 == 0:
            print("traj_", k)

        max_delQ = 0
        # setup corresponding realisation of velocity field
        """may have to build the realisation here!!!!"""
        # Vxt = Vx_rzns[k,:,:,:]
        # Vyt = Vy_rzns[k,:,:,:]
        """Jugaad"""
        Vxt = Vx_rzns[k, :, :]
        Vyt = Vy_rzns[k, :, :]

        # for all trajectories in the list of paths
        trajectory = paths[0, k]
        state_traj = []
        coord_traj = []

        test_trajx = []
        test_trajy = []

        #*********ASSUMING THAT 5DT IN TRAJ DATA IS 1 SECOND********
        # s_t = 1
        s_i = None
        s_j = None

        # the '-1' in the range end avoids reading the NaN at the end of the path data
        for j in range(0, len(trajectory) - 1, sampling_interval):
            s_i, s_j = compute_cell(grid, trajectory[j])

            # state_traj.append((s_t, s_i, s_j))
            # coord_traj.append((grid.ts[s_t],trajectory[j][0], trajectory[j][1]))
            state_traj.append((s_i, s_j))
            coord_traj.append((trajectory[j][0], trajectory[j][1]))

            # test_trajx.append(trajectory[j][0])
            # test_trajy.append(trajectory[j][1])
            # s_t+=1

        # if the last sampled point is not endpoint of trajectory, include it in the state/coord_traj
        # s_i_end, s_j_end = compute_cell(grid, trajectory[-2])
        # if (s_i, s_j) != (s_i_end, s_j_end):
        #     state_traj.append((s_t, s_i, s_j))
        #     coord_traj.append((grid.ts[s_t], trajectory[-2][0], trajectory[-2][1]))
        #     test_trajx.append(trajectory[-2][0])
        #     test_trajy.append(trajectory[-2][1])
        #Reverse trajectory orders
        state_traj.reverse()
        coord_traj.reverse()
        test_trajx.reverse()
        test_trajy.reverse()

        # since traj data does not contain start point info, adding it explicitly
        # p, m, n = grid.start_state

        m, n = grid.start_state
        x0 = grid.xs[n]
        y0 = grid.ys[grid.ni - 1 - m]
        state_traj.append(grid.start_state)
        # coord_traj.append((grid.ts[p],x0,y0))
        coord_traj.append((x0, y0))

        # test_trajx.append(x0)
        # # test_trajy.append(y0)
        # if k%500==0:
        #     plt.plot(test_trajx, test_trajy, '-o')
        #Update Q values based on state and possible actions

        for i in range(len(state_traj) - 1):
            s1 = state_traj[i + 1]
            s2 = state_traj[i]
            # t ,m,n=s1
            m, n = s1
            p1 = coord_traj[i + 1]
            p2 = coord_traj[i]
            """COMMENTING THIS STATEMENT BELOW"""
            # if (s1[1],s1[2])!=(s2[1],s2[2]):

            #vx=Vxt[t,i,j]
            a1 = Calculate_action(s1, p1, p2, Vxt, Vyt, grid)
            # print("EstQ: YO")
            if (s1[0], s1[1]) != grid.endpos:

                N[s1][a1] += N_inc
                alpha1 = ALPHA / N[s1][a1]

                #update Q considering a1 was performed
                grid.set_state(s1, xcoord=p1[0], ycoord=p1[1])
                r1 = grid.move_exact(a1, Vxt[m, n], Vyt[m, n])
                q_s_a1 = r1
                next_s = grid.current_state()

                if (next_s[0], next_s[1]) != grid.endpos:
                    _, val = max_dict(Q[next_s])
                    q_s_a1 = r1 + val

                old_qsa = Q[s1][a1]
                Q[s1][a1] += alpha1 * (q_s_a1 - Q[s1][a1])

                if np.abs(old_qsa - Q[s1][a1]) > max_delQ:
                    max_delQ = np.abs(old_qsa - Q[s1][a1])

        max_delQ_list.append(max_delQ)

    return Q, max_delQ_list
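
EstimateQ_mids_mids2 first converts each recorded path into a sequence of grid cells via compute_cell, whose implementation is not shown here. The snippet below is only a hypothetical sketch of that kind of coordinate-to-cell mapping (nearest grid node, with the row axis flipped the way grid.ys is indexed above); the real helper may differ.

import numpy as np

def compute_cell_sketch(xs, ys, point):
    # hypothetical nearest-node lookup; the project's compute_cell may differ
    x, y = point
    j = int(np.argmin(np.abs(np.asarray(xs) - x)))                 # column index from x
    i = len(ys) - 1 - int(np.argmin(np.abs(np.asarray(ys) - y)))   # row index, y axis flipped
    return i, j

xs = np.linspace(0, 1, 11)
ys = np.linspace(0, 1, 11)
print(compute_cell_sketch(xs, ys, (0.32, 0.71)))   # -> (3, 3)
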
Example #9
def learn_Q_from_exp_buffer(grid,
                            exp_buffer,
                            Q,
                            N,
                            ALPHA,
                            method='reverse_order',
                            num_passes=1):
    """
    Learns Q values from a pre-built experience buffer. Supports two methods: 1. a reverse pass through each trajectory's buffer ('reverse_order')  2. a random pass through the flattened buffer ('iid')
    :param grid:
    :param exp_buffer:
    :param Q:
    :param N:
    :param ALPHA:
    :param method:
    :param num_passes:
    :return:
    """
    def Q_update(Q, N, max_delQ, sars):
        s1, a1, r1, s2 = sars
        if s1 != grid.endpos:
            N[s1][a1] += N_inc
            alpha1 = ALPHA / N[s1][a1]
            q_s1_a1 = r1
            if s2 != grid.endpos:
                _, val = max_dict(Q[s2])
                q_s1_a1 = r1 + val
            old_qsa = Q[s1][a1]
            Q[s1][a1] += alpha1 * (q_s1_a1 - Q[s1][a1])
            delQ = np.abs(old_qsa - Q[s1][a1])
            if delQ > max_delQ:
                max_delQ = delQ
        return Q, N, max_delQ

    if not (method == 'reverse_order' or method == 'iid'):
        print("No such method learning Q values from traj")
        return

    print("In Build_Q_...   learning method = ", method)
    max_delQ_list = []

    if method == 'reverse_order':
        for Pass in range(num_passes):
            print("in Build_Q_.. : pass ", Pass)
            max_delQ = 0
            for kth_traj_buffer in exp_buffer:
                for sars in kth_traj_buffer:
                    Q, N, max_delQ = Q_update(Q, N, max_delQ, sars)

            max_delQ_list.append(max_delQ)
            print('max_delQ= ', max_delQ)
            print("Q[start] = ", Q[grid.startpos])
            print('Q[s]: best a, val =', max_dict(Q[grid.startpos]))
            if max_delQ < 1:
                print("Qs converged")
                break

    if method == 'iid':
        flatten = lambda l: [item for sublist in l for item in sublist]
        exp_buffer = flatten(exp_buffer)
        idx_list = np.arange(len(exp_buffer))
        print(len(exp_buffer))

        for Pass in range(num_passes):
            print("in Build_Q_.. : pass ", Pass)
            random.shuffle(idx_list)
            max_delQ = 0
            for i in idx_list:
                sars = exp_buffer[i]
                Q, N, max_delQ = Q_update(Q, N, max_delQ, sars)

            max_delQ_list.append(max_delQ)
            print('max_delQ= ', max_delQ)
            print("Q[start] = ", Q[grid.startpos])
            print('Q[s]: best a, val =', max_dict(Q[grid.startpos]))
            if max_delQ < 1:
                print("Qs converged")
                break

    return Q, max_delQ_list
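
A small numeric aside on the ALPHA / N(s, a) step size used by all of the Q_update variants above: with ALPHA = 1 the decaying step size reduces to an incremental running average of the bootstrapped targets. The target values below are arbitrary numbers chosen just to show the arithmetic.

targets = [4.0, 2.0, 6.0]         # arbitrary sample targets r + max_a' Q(s', a')
q, n, ALPHA = 0.0, 0, 1.0
for tgt in targets:
    n += 1
    q += (ALPHA / n) * (tgt - q)  # same update form as Q[s1][a1] above
print(q)                          # 4.0, the mean of the three targets
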