Example #1
0
#     eval_obs_array = pickle.loads(f.read())


def seed_func():
    """Draw a random integer seed from the half-open range [0, 1000)."""
    lowest, past_highest = 0, 1000
    return np.random.randint(lowest, past_highest)

# Run length and update cadence.
# NOTE(review): presumably num_timesteps counts environment steps and
# learning_freq is the number of steps between updates — confirm against the
# code that consumes dqn_config.
num_timesteps = 2.5e7
learning_freq = 1
# training iterations to go (a float, because num_timesteps is a float)
num_iter = num_timesteps / learning_freq

# piecewise learning rate: 1e-4 up to num_iter / 10, 5e-5 from num_iter / 2 on
# NOTE(review): how PiecewiseSchedule behaves between endpoints is not visible
# here — confirm whether it interpolates.
lr_multiplier = 1.0
learning_rate = PiecewiseSchedule([
    (0, 1e-4 * lr_multiplier),
    (num_iter / 10, 1e-4 * lr_multiplier),
    (num_iter / 2,  5e-5 * lr_multiplier),
], outside_value=5e-5 * lr_multiplier)

# piecewise exploration rate: every endpoint and outside_value are 0.05, so
# all listed values are identical
exploration = PiecewiseSchedule([
    (0, 0.05),
    (num_iter / 2, 0.05),
    (num_iter * 3 / 4, 0.05),
    (num_iter * 7 / 8, 0.05),
], outside_value=0.05)

dqn_config = {
    'seed': seed_func,  # will override game settings
    'num_timesteps': num_timesteps,
    'replay_buffer_size': 1000000,
Example #2
0
def main(layers, t_hor, ind, nrolls, bts, ler_r, mom, teps, renew, imp, q):
    """Fit the network built by TransDef to action labels and save snapshots.

    Args:
        layers: Layer sizes handed to TransDef; ``layers[0] - 1`` is the
            number of state columns used when sampling training states.
        t_hor: Horizon; the iteration count is
            ``int(abs(t_hor) / dt) * renew + 1``.
        ind: Worker index, used only in the start-up message.
        nrolls: Number of states sampled for each training set.
        bts: Mini-batch size of each optimizer step.
        ler_r: Not used by the active training code.
        mom: Momentum passed to the RMSProp optimizer.
        teps: Not used by the active training code.
        renew: Number of iterations between training-set regenerations.
        imp: Not used by the active training code.
        q: Not used by the active training code.
    """
    # Upper and lower bound of each of the three control channels.
    max_list = [0.1, 0.1, 11.81]
    min_list = [-0.1, -0.1, 7.81]

    g = 9.81

    # Single-argument call form: prints the same text on Python 2 and 3.
    print('Starting worker-' + str(ind))

    # Regular evaluation grids.
    # NOTE(review): none of the grid_* arrays is read by the active training
    # code; they are only referenced from disabled plotting code.  grid_check
    # holds Nx**3 rows, so consider dropping these if plotting stays disabled.
    f = 1
    Nx = 100 * f + 1
    minn = [-5.0, -10.0, -5.0, -10.0, 0.0, -10.0]
    maxx = [5.0, 10.0, 5.0, 10.0, 2 * np.pi, 10.0]

    X = np.linspace(minn[0], maxx[0], Nx)
    Y = np.linspace(minn[2], maxx[2], Nx)
    Z = np.linspace(minn[4], maxx[4], Nx)
    X_, Y_, Z_ = np.meshgrid(X, Y, Z)
    X, Y = np.meshgrid(X, Y)
    XX = np.reshape(X, [-1, 1])
    YY = np.reshape(Y, [-1, 1])
    XX_ = np.reshape(X_, [-1, 1])
    YY_ = np.reshape(Y_, [-1, 1])
    ZZ_ = np.reshape(Z_, [-1, 1])
    grid_check = np.concatenate(
        (XX_, np.ones(XX_.shape), YY_, np.ones(XX_.shape), ZZ_,
         np.zeros(XX_.shape)),
        axis=1)
    grid_eval = np.concatenate(
        (XX, YY, 0.0 * np.ones(XX.shape), 6.0 * np.ones(XX.shape)), axis=1)
    grid_eval_ = np.concatenate(
        (XX, YY,
         (2.0 / 3.0) * np.pi * np.ones(XX.shape), 6.0 * np.ones(XX.shape)),
        axis=1)
    grid_eval__ = np.concatenate(
        (XX, YY,
         (4.0 / 3.0) * np.pi * np.ones(XX.shape), 6.0 * np.ones(XX.shape)),
        axis=1)
    grid_evall = np.concatenate(
        (XX, YY, 0.0 * np.ones(XX.shape), 12.0 * np.ones(XX.shape)), axis=1)
    grid_evall_ = np.concatenate(
        (XX, YY,
         (2.0 / 3.0) * np.pi * np.ones(XX.shape), 12.0 * np.ones(XX.shape)),
        axis=1)
    grid_evall__ = np.concatenate(
        (XX, YY,
         (4.0 / 3.0) * np.pi * np.ones(XX.shape), 12.0 * np.ones(XX.shape)),
        axis=1)

    # Weights plus biases of a fully connected net with these layer sizes.
    nofparams = sum(n_in * n_out + n_out
                    for n_in, n_out in zip(layers[:-1], layers[1:]))
    print('Number of Params is: ' + str(nofparams))

    H_length = t_hor
    center = np.array([[0.0, 0.0, 0.0, 0.0, 0.0, 0.0]])
    depth = 2.0
    incl = 1.0

    ##################### DEFINITIONS #####################
    dt = 0.05  # integration step of one policy stage
    num_ac = 3  # control channels; 2**num_ac discrete actions
    iters = int(np.abs(t_hor) / dt) * renew + 1

    ##################### INSTANTIATIONS #################
    states, y, Tt, L, l_r, lb, reg, cross_entropy = TransDef(
        "Critic", False, layers, depth, incl, center)
    # Fraction of rows whose arg-max output matches the arg-max of the label.
    predicted_class = tf.argmax(Tt, dimension=1)
    target_class = tf.argmax(y, dimension=1)
    accuracy = tf.reduce_mean(
        tf.cast(tf.equal(predicted_class, target_class), tf.float32))

    V_func_vars = tf.get_collection(tf.GraphKeys.VARIABLES, scope='Critic')

    var_grad_ = tf.gradients(Tt, states)[0]
    grad_x = tf.slice(var_grad_, [0, 0], [-1, layers[0] - 1])

    # Re-initialisation ops over the 'Critic' variables, in name order.
    # The active training code never runs them.
    critic_vars_by_name = sorted(V_func_vars, key=lambda v: v.name)
    set_to_zero = tf.group(*[
        var.assign(tf.zeros(tf.shape(var))) for var in critic_vars_by_name
    ])
    set_to_not_zero = tf.group(*[
        var.assign(tf.random_uniform(tf.shape(var), minval=-0.1, maxval=0.1))
        for var in critic_vars_by_name
    ])

    # DEFINE LOSS (L itself comes from TransDef; these weights are unused)
    lmbda = 0.0
    beta = 0.00

    # DEFINE OPTIMIZER
    # The learning rate is fed at every training step through this scalar.
    nu = tf.placeholder(tf.float32, shape=[])

    lr_schedule = PiecewiseSchedule([
        (0, 0.1),
        (10000, 0.01),
        (20000, 0.001),
        (30000, 0.0001),
    ], outside_value=0.0001)

    train_step = tf.train.RMSPropOptimizer(learning_rate=nu,
                                           momentum=mom).minimize(L)

    # Turns action indices into one-hot rows of width 2**num_ac.
    hot_input = tf.placeholder(tf.int64, shape=(None))
    make_hot = tf.one_hot(hot_input, 2**num_ac, on_value=1, off_value=0)

    # INITIALIZE GRAPH
    theta = tf.trainable_variables()
    sess = tf.Session()
    init = tf.initialize_all_variables()
    sess.run(init)

    def V_0(x):
        """Return, per row of ``x``, the largest absolute entry minus 1.0.

        The result keeps a trailing axis of length one (a column).  It is
        negative exactly when every entry of the row lies strictly inside
        (-1, 1).
        """
        largest_magnitude = np.max(np.abs(x), axis=1, keepdims=True)
        return largest_magnitude - 1.0

    def p_corr(ALL_x):
        """Wrap every entry of ``ALL_x`` into the interval [0, 2*pi)."""
        full_turn = 2.0 * np.pi
        return np.mod(ALL_x, full_turn)

    def F(ALL_x, opt_a, opt_b=None):
        """Evaluate the time derivative of every state row.

        Args:
            ALL_x: 2-D array with one state per row; columns 1..3 are read.
            opt_a: 2-D control array, one row per state; columns 0..2 are
                read.
            opt_b: 2-D disturbance array, one row per state; columns 0..2
                are subtracted from the first three derivatives.  ``None``
                means no disturbance.

        Returns:
            2-D array with six columns holding the derivative of each row.
        """
        if opt_b is None:
            # Callers may pass None for "no disturbance"; subscripting None
            # below would raise TypeError, so substitute zeros instead.
            opt_b = np.zeros((ALL_x.shape[0], 3))

        # NOTE(review): the first three derivatives read state columns 1..3,
        # whereas ConvCosSin in this file labels columns 0..2 "pos" and 3..5
        # "vel" — confirm the intended state layout.
        col1 = ALL_x[:, 1, None] - opt_b[:, 0, None]
        col2 = ALL_x[:, 2, None] - opt_b[:, 1, None]
        col3 = ALL_x[:, 3, None] - opt_b[:, 2, None]
        # The first two control channels are scaled by g (with opposite
        # signs); the third is offset by g.
        col4 = g * opt_a[:, 0, None]
        col5 = -g * opt_a[:, 1, None]
        col6 = opt_a[:, 2, None] - g

        return np.concatenate((col1, col2, col3, col4, col5, col6), axis=1)

    ####################### RECURSIVE FUNC ####################

    def RK4(ALL_x, dtt, opt_a, opt_b):
        """Advance every state row by ``dtt`` with one classical RK4 step.

        ``opt_a`` and ``opt_b`` are handed unchanged to ``F`` at all four
        stages, i.e. the inputs are held constant over the step.  Returns the
        new state array; ``ALL_x`` is not modified.
        """
        half_step = dtt / 2.0

        slope_start = F(ALL_x, opt_a, opt_b)
        slope_mid_a = F(ALL_x + np.multiply(half_step, slope_start),
                        opt_a, opt_b)
        slope_mid_b = F(ALL_x + np.multiply(half_step, slope_mid_a),
                        opt_a, opt_b)
        slope_end = F(ALL_x + np.multiply(dtt, slope_mid_b), opt_a, opt_b)

        # Weighted average of the four slopes: 1, 2, 2, 1 over 6.
        blended = (slope_start + 2.0 * slope_mid_a + 2.0 * slope_mid_b +
                   slope_end)
        return ALL_x + np.multiply((dtt / 6.0), blended)

    # Every sign pattern over the num_ac control channels; +1 selects that
    # channel's upper bound and -1 its lower bound.
    perms = list(itertools.product([-1, 1], repeat=num_ac))
    true_ac_list = [
        [(sign == 1) * upper + (sign == -1) * lower
         for sign, lower, upper in zip(pattern, min_list, max_list)]
        for pattern in perms
    ]

    def Hot_to_Cold(hots, ac_list):
        """Map each score row to the action stored at its arg-max index.

        ``hots`` is a 2-D array; the column index of the largest entry in
        each row selects an entry of ``ac_list``.  Returns the selected
        entries as an array, one per row of ``hots``.
        """
        winning_columns = hots.argmax(axis=1)
        chosen = [ac_list[column] for column in winning_columns]
        return np.asarray(chosen)

    def getPI(ALL_x, F_PI=None, subSamples=1):
        """Pick, for every start state, the action whose rollout ends lowest.

        Each action in ``true_ac_list`` is applied to every row of ``ALL_x``
        for one step of length ``dt``.  The resulting states are then rolled
        forward once per entry of ``F_PI``: the network weights are replaced
        by that entry, the arg-max of the network output selects the action,
        and the state advances by another ``dt``.  ``V_0`` of columns 0..2 of
        the final states scores each first action.

        Args:
            ALL_x: 2-D array of start states, one per row.
            F_PI: Sequence of weight snapshots (each as returned by
                ``sess.run(theta)``), applied in order.  ``None`` or an empty
                sequence means only the first step is simulated.
            subSamples: Number of equal RK4 sub-steps per ``dt``.

        Returns:
            Tuple ``(labels, values_, filterr)``: one-hot rows from
            ``make_hot`` for the lowest-scoring first action, the matching
            scores as a column, and the boolean mask over the rows of
            ``ALL_x`` that were kept.  ``labels`` and ``values_`` only
            contain the kept rows.
        """
        if F_PI is None:
            F_PI = []

        def _advance(x, controls):
            # Zero disturbance, passed explicitly: F subscripts its
            # disturbance argument, so None cannot be handed through.
            calm = np.zeros((x.shape[0], 3))
            for _ in range(subSamples):
                x = RK4(x, dt / float(subSamples), controls, calm)
            return x

        # Remember the live weights; the rollout below overwrites them.
        current_params = sess.run(theta)

        # First step: one copy of the batch per action, stacked action-major.
        next_states = np.concatenate([
            _advance(ALL_x, np.asarray(action) * np.ones([ALL_x.shape[0], 1]))
            for action in true_ac_list
        ], axis=0)

        try:
            for params in F_PI:
                # NOTE(review): every assign() call adds a new op to the
                # graph, so the graph grows with each getPI call; consider
                # building the assign ops once.
                for variable, value in zip(theta, params):
                    sess.run(variable.assign(value))

                hots = sess.run(Tt, {states: ConvCosSin(next_states)})
                next_states = _advance(next_states,
                                       Hot_to_Cold(hots, true_ac_list))
        finally:
            # Put the live weights back even if the rollout failed.
            for variable, value in zip(theta, current_params):
                sess.run(variable.assign(value))

        # NOTE(review): the original also tracked a running minimum of V_0
        # along the rollout but never read it; the scores below use the final
        # states only.  Confirm which of the two is intended.
        values_ = V_0(next_states[:, [0, 1, 2]])
        # Rows: start states.  Columns: first actions.
        compare_vals_ = values_.reshape([-1, ALL_x.shape[0]]).T
        index_best_a_ = compare_vals_.argmin(axis=1)
        values_ = np.min(compare_vals_, axis=1, keepdims=True)

        # Keep the states for which at least one action scores above -1.0.
        filterr = np.max(compare_vals_, axis=1) > -1.0
        index_best_a_ = index_best_a_[filterr]
        values_ = values_[filterr]
        print("States filtered out: " + str(len(filterr) - np.sum(filterr)))

        return sess.run(make_hot, {hot_input: index_best_a_}), values_, filterr

#    def getTraj(ALL_x,F_PI=[],subSamples=1,StepsLeft=None,Noise = False):
#
#        current_params = sess.run(theta);
#
#        if(StepsLeft == None): StepsLeft = len(F_PI);
#
#        next_states = ALL_x;
#        traj = [next_states];
#        actions = [];
#
#        for params in F_PI[len(F_PI)-StepsLeft:]:
#            for ind in range(len(params)): #Reload pi*(x,t+dt) parameters
#                sess.run(theta[ind].assign(params[ind]));
#
#            hots = sess.run(Tt,{states:ConvCosSin(next_states)});
#            opt_a = Hot_to_Cold(hots,true_ac_list)
#            for _ in range(subSamples):
#                next_states = RK4(next_states,dt/float(subSamples),opt_a,None);
#                if Noise:
#                    next_states = next_states + np.random.normal(size=next_states.shape)*0.01
#                traj.append(next_states);
#                actions.append(hots.argmax(axis=1)[0]);
#                #values = np.min((values,V_0(next_states[:,[0,1]])),axis=0);
#
#        for ind in range(len(current_params)): #Reload pi*(x,t+dt) parameters
#            sess.run(theta[ind].assign(current_params[ind]));
#
#        return traj,V_0(next_states[:,[0,2]]),actions;

#    def RSScatterPlot(F_PI=[],v_slice=[1,1,0],s_left=None):
#        if(s_left == None): s_left = len(F_PI);
#        ALL_x = np.random.uniform(-5.0,5.0,(nrolls/10,layers[0]-1));
#        ALL_x[:,1] = v_slice[0]
#        ALL_x[:,3] = v_slice[1]
#        ALL_x[:,4] = ALL_x[:,4]*np.pi/5.0 + np.pi;
#        ALL_x[:,5] = v_slice[2]
#        _,VAL,_ = getTraj(ALL_x,F_PI=F_PI,subSamples=4,StepsLeft=s_left,Noise=False);
#        fi = (VAL < 0.0)
#        mini_reach_ = ALL_x[fi[:,0]]
#        fig = plt.figure(1)
#        ax = fig.add_subplot(111, projection='3d')
#        ax.scatter(mini_reach_[:,0], mini_reach_[:,2], mini_reach_[:,4]);
#        plt.pause(20);

    def ConvCosSin(ALL_x):
        """Scale raw states into the form fed to the network.

        Columns 0..2 are divided by 5.0 and columns 3..5 by 10.0; the six
        scaled columns are returned side by side.  Despite the name, no
        cosine or sine is taken.
        """
        first_three = ALL_x[:, [0, 1, 2]] / 5.0
        last_three = ALL_x[:, [3, 4, 5]] / 10.0
        return np.hstack((first_three, last_three))

    # *****************************************************************************
    #
    # ============================= MAIN LOOP ====================================
    #
    # *****************************************************************************
    # Horizon time of the policy being fitted; stepped back by dt at every
    # training-set refresh.  It is bookkeeping only and feeds no computation.
    t = 0.0
    # Weight snapshots of the finished policies, newest first.
    ALL_PI = []
    # Schedule value at step 0.  It only shows up in the first progress line:
    # the loop sets the rate to a constant before every optimizer step.
    nunu = lr_schedule.value(0)

    def _sample_labelled_states(count, policies):
        """Draw ``count`` random states and label them with getPI.

        Returns the kept raw states, their network-input form and the one-hot
        labels, in that order.
        """
        raw = np.random.uniform(-5.0, 5.0, (count, layers[0] - 1))
        # Columns 3..5 are stretched from [-5, 5) to [-10, 10).
        raw[:, [3, 4, 5]] = raw[:, [3, 4, 5]] * 2.0
        labels, _, keep = getPI(raw, policies, subSamples=3)
        raw = raw[keep]
        return raw, ConvCosSin(raw), labels

    for i in range(iters):

        if np.mod(i, renew) == 0:
            # Compare by value: `is` on integers only happens to work for
            # small values cached by CPython.
            if i != 0:
                # The network trained so far becomes the newest policy stage.
                ALL_PI.insert(0, sess.run(theta))

                ALL_x, pre_ALL_x, PI = _sample_labelled_states(nrolls, ALL_PI)
                # Held-out set, one hundredth the size of the training set.
                ALL_x_, pre_ALL_x_, PI_ = _sample_labelled_states(
                    nrolls // 100, ALL_PI)

                t = t - dt
                print('Again.')
            else:
                # Timed with its own variable so that the horizon time `t`
                # is not overwritten by a wall-clock value.
                started = time.time()
                ALL_x, pre_ALL_x, PI = _sample_labelled_states(nrolls, ALL_PI)
                elapsed = time.time() - started
                print("Compute Data Time = " + str(elapsed))

                ALL_x_, pre_ALL_x_, PI_ = _sample_labelled_states(
                    nrolls // 100, ALL_PI)

        # |||||||||||| ----  PRINT ----- ||||||||||||
        if np.mod(i, 200) == 0:
            train_acc = sess.run(accuracy, {states: pre_ALL_x, y: PI})
            test_acc = sess.run(accuracy, {states: pre_ALL_x_, y: PI_})
            print(str(i) + ") | TR_ACC = " + str(train_acc) +
                  " | TE_ACC = " + str(test_acc) +
                  " | Lerning Rate = " + str(nunu))

        # Constant learning rate; lr_schedule is not consulted here.
        nunu = 0.01
        # Mini-batch drawn with replacement from the current training set.
        tmp = np.random.randint(len(ALL_x), size=bts)
        sess.run(train_step,
                 feed_dict={
                     states: pre_ALL_x[tmp],
                     y: PI[tmp],
                     nu: nunu
                 })

    # Close the file deterministically instead of leaking the handle.
    with open("policies6Dreach_h50.pkl", "wb") as policy_file:
        pickle.dump(ALL_PI, policy_file)
Example #3
0
    eval_obs_array = pickle.loads(f.read())


def seed_func():
    """Return a random integer seed that is at least 0 and below 1000."""
    seed = np.random.randint(0, 1000)
    return seed

# Run length and update cadence.
# NOTE(review): presumably num_timesteps counts environment steps and
# learning_freq is the number of steps between updates — confirm against the
# code that consumes dqn_config.
num_timesteps = 2e7
learning_freq = 4
# training iterations to go (a float, because num_timesteps is a float)
num_iter = num_timesteps / learning_freq

# piecewise learning rate (larger): 4e-4 at the start, 2e-4 at num_iter / 2,
# 1e-4 from num_iter * 3 / 4 on
# NOTE(review): how PiecewiseSchedule behaves between endpoints is not visible
# here — confirm whether it interpolates.
lr_multiplier = 1.0
learning_rate = PiecewiseSchedule([
    (0, 4e-4 * lr_multiplier),
    (num_iter / 2, 2e-4 * lr_multiplier),
    (num_iter * 3 / 4,  1e-4 * lr_multiplier),
], outside_value=1e-4 * lr_multiplier)

# piecewise exploration rate: 1.0 at the start, falling to 0.05 at
# num_iter * 7 / 8 and staying there
exploration = PiecewiseSchedule([
    (0, 1.0),
    (num_iter / 2, 0.7),
    (num_iter * 3 / 4, 0.1),
    (num_iter * 7 / 8, 0.05),
], outside_value=0.05)

dqn_config = {
    'seed': seed_func,  # will override game settings
    'num_timesteps': num_timesteps,
    'replay_buffer_size': 1000000,

def seed_func():
    """Pick a random integer seed in the half-open range [0, 1000)."""
    seed_range = (0, 1000)
    return np.random.randint(*seed_range)


# Run length and update cadence.
# NOTE(review): presumably num_timesteps counts environment steps and
# learning_freq is the number of steps between updates — confirm against the
# code that consumes these values.
num_timesteps = 2e7  # 400 epoch
learning_freq = 4
# training iterations to go (a float, because num_timesteps is a float)
num_iter = num_timesteps / learning_freq

# piecewise learning rate: 1e-4 up to num_iter / 10, 5e-5 from num_iter / 2 on
# NOTE(review): how PiecewiseSchedule behaves between endpoints is not visible
# here — confirm whether it interpolates.
lr_multiplier = 1.0
learning_rate = PiecewiseSchedule([
    (0, 1e-4 * lr_multiplier),
    (num_iter / 10, 1e-4 * lr_multiplier),
    (num_iter / 2, 5e-5 * lr_multiplier),
],
                                  outside_value=5e-5 * lr_multiplier)

# Second learning-rate schedule: every endpoint and outside_value are
# 2.5e-4 * lr_multiplier, so all listed values are identical.
# NOTE(review): what "term" refers to is not visible in this chunk.
learning_rate_term = PiecewiseSchedule([
    (0, 2.5e-4 * lr_multiplier),
    (num_iter / 20, 2.5e-4 * lr_multiplier),
    (num_iter / 5, 2.5e-4 * lr_multiplier),
    (num_iter / 2, 2.5e-4 * lr_multiplier),
    (num_iter * 3 / 4, 2.5e-4 * lr_multiplier),
],
                                       outside_value=2.5e-4 * lr_multiplier)

# piecewise exploration rate
exploration = PiecewiseSchedule([
    (0, 1.0),
Example #5
0
def lander_exploration_schedule(num_timesteps):
    """Build the exploration schedule for a run of ``num_timesteps`` steps.

    The schedule has two endpoints: value 1 at step 0 and value 0.02 at
    ``num_timesteps * 0.1``; 0.02 is also the value outside the endpoints.
    """
    floor_value = 0.02
    endpoints = [(0, 1), (num_timesteps * 0.1, floor_value)]
    return PiecewiseSchedule(endpoints, outside_value=floor_value)
Example #6
0
   eval_obs_array = pickle.loads(f.read())


def seed_func():
    """Produce a random integer seed from 0 (inclusive) to 1000 (exclusive)."""
    exclusive_limit = 1000
    return np.random.randint(0, exclusive_limit)

# Run length and update cadence.
# NOTE(review): presumably num_timesteps counts environment steps and
# learning_freq is the number of steps between updates — confirm against the
# code that consumes these values.
num_timesteps = 2e6  # 40 epoch
learning_freq = 4
# training iterations to go (a float, because num_timesteps is a float)
num_iter = num_timesteps / learning_freq

# piecewise learning rate: 2e-4 at the start, 1e-4 at num_iter / 2,
# 5e-5 from num_iter * 3 / 4 on
# NOTE(review): how PiecewiseSchedule behaves between endpoints is not visible
# here — confirm whether it interpolates.
lr_multiplier = 1.0
learning_rate = PiecewiseSchedule([
    (0, 2e-4 * lr_multiplier),
    (num_iter / 2, 1e-4 * lr_multiplier),
    (num_iter * 3 / 4,  5e-5 * lr_multiplier),
], outside_value=5e-5 * lr_multiplier)

# Second learning-rate schedule: the listed values rise from 2e-4 to 5e-2 at
# num_iter / 20, then fall to 5e-4 at num_iter * 7 / 8.
# NOTE(review): the 5e-2 peak is 250x the starting value — confirm it is
# intended.  What "term" refers to is not visible in this chunk.
learning_rate_term = PiecewiseSchedule([
    (0, 2e-4 * lr_multiplier),
    (num_iter / 40, 1e-3 * lr_multiplier),
    (num_iter / 20, 5e-2 * lr_multiplier),
    (num_iter * 3 / 4, 5e-3 * lr_multiplier),
    (num_iter * 7 / 8,  5e-4 * lr_multiplier),
], outside_value=5e-4 * lr_multiplier)

# piecewise exploration rate
exploration = PiecewiseSchedule([
    (0, 1.0),
    (num_iter / 40, 0.97),
Example #7
0
def torcs_config_ft(tag):
    """Apply ``torcs_config(tag)``, then override two FLAGS entries.

    ``FLAGS.torcs_demo`` is switched off and ``FLAGS.exploration_schedule``
    is replaced by a schedule whose endpoints (steps 0 and 10) and outside
    value are all 0.05.
    """
    torcs_config(tag)
    FLAGS.torcs_demo = False

    fixed_rate = 0.05
    schedule = PiecewiseSchedule([(0, fixed_rate), (10, fixed_rate)],
                                 outside_value=fixed_rate)
    FLAGS.exploration_schedule = schedule
def main(layers,t_hor,ind,nrolls,bts,ler_r,mom,teps,renew,imp,q):
    # Train (or interactively inspect) a "Control" and a "Disturbance"
    # classifier network for a 6-state system whose dynamics are given by F().
    #
    # Parameters, as used in this function:
    #   layers : layer sizes handed to TransDef; layers[0]-1 is the number of
    #            raw state columns that get sampled.
    #   t_hor  : horizon; iters = int(abs(t_hor)/dt)*renew + 1.
    #   ind    : worker id, only printed at start-up.
    #   nrolls : number of sampled training states per renewal (nrolls/100
    #            states are sampled for the held-out set).
    #   bts    : minibatch size for each training step.
    #   ler_r  : only referenced by commented-out code.
    #   mom    : momentum passed to RMSPropOptimizer.
    #   teps   : stored in `steps`, not used otherwise.
    #   renew  : number of training iterations between policy snapshots.
    #   imp    : 1.0 / 2.0 select the interactive modes, anything else trains.
    #   q      : not referenced in this function.

    # Quad Params
    # Lower/upper bounds of the two control inputs (opt_a in F).
    T1Max = 36.7875/2.0; 
    T1Min = 0;
    T2Max = 36.7875/2.0;
    T2Min = 0;
    max_list = [T1Max,T2Max];
    min_list = [T1Min,T2Min];
    
    #Disturbance
    # Lower/upper bounds of the two disturbance inputs (opt_b in F).
    max_list_ = [0.5,0.5];
    min_list_ = [-0.5,-0.5];
    
    # Physical constants read by F() through closure.
    m = 1.25; 
    grav = 9.81;
    transDrag = 0.25; 
    rotDrag = 0.02255; 
    Iyy = 0.03; 
    l = 0.5; 


    print 'Starting worker-' + str(ind)

    # Evaluation grids with Nx points per axis. The grid_eval* arrays are only
    # referenced by commented-out plotting code further down; grid_check is
    # overwritten in the imp == 2.0 mode before it is used.
    f = 1;
    Nx = 100*f + 1;
    minn = [-5.0,-10.0,-5.0,-10.0,0.0,-10.0];
    maxx = [ 5.0, 10.0, 5.0, 10.0,2*np.pi, 10.0];
    
    X = np.linspace(minn[0],maxx[0],Nx);
    Y = np.linspace(minn[2],maxx[2],Nx);
    Z = np.linspace(minn[4],maxx[4],Nx);
    X_,Y_,Z_ = np.meshgrid(X, Y, Z);    
    X,Y = np.meshgrid(X, Y);
    XX = np.reshape(X,[-1,1]);
    YY = np.reshape(Y,[-1,1]);
    XX_ = np.reshape(X_,[-1,1]);
    YY_ = np.reshape(Y_,[-1,1]);
    ZZ_ = np.reshape(Z_,[-1,1]); grid_check = np.concatenate((XX_,np.ones(XX_.shape),YY_,np.ones(XX_.shape),ZZ_,np.zeros(XX_.shape)),axis=1);
    grid_eval = np.concatenate((XX,YY,0.0*np.ones(XX.shape),6.0*np.ones(XX.shape)),axis=1);
    grid_eval_ = np.concatenate((XX,YY,(2.0/3.0)*np.pi*np.ones(XX.shape),6.0*np.ones(XX.shape)),axis=1);
    grid_eval__ = np.concatenate((XX,YY,(4.0/3.0)*np.pi*np.ones(XX.shape),6.0*np.ones(XX.shape)),axis=1);
    grid_evall = np.concatenate((XX,YY,0.0*np.ones(XX.shape),12.0*np.ones(XX.shape)),axis=1);
    grid_evall_ = np.concatenate((XX,YY,(2.0/3.0)*np.pi*np.ones(XX.shape),12.0*np.ones(XX.shape)),axis=1);
    grid_evall__ = np.concatenate((XX,YY,(4.0/3.0)*np.pi*np.ones(XX.shape),12.0*np.ones(XX.shape)),axis=1);    


    # Calculate number of parameters of the policy
    # (in*out weights plus out biases for every consecutive pair of layers).
    nofparams = 0;
    for i in xrange(len(layers)-1):
        nofparams += layers[i]*layers[i+1] + layers[i+1];
    print 'Number of Params is: ' + str(nofparams)
    
    # H_length is not read again in this function; center, depth and incl are
    # forwarded to TransDef.
    H_length = t_hor;
    center = np.array([[0.0,0.0,0.0,0.0,0.0,0.0]])
    depth = 2.0;
    incl = 1.0;

    ##################### DEFINITIONS #####################
    #layers = [2 + 1,10,1];                                                    #VAR
    #ssize = layers[0] - 1;
    # Integration step used by getPI/getTraj.
    dt = 0.05;                                                                 #VAR
    # Number of control inputs; 2**num_ac corner actions are enumerated below.
    num_ac = 2;
    # Total training iterations: `renew` iterations per dt-step of the horizon.
    iters = int(np.abs(t_hor)/dt)*renew + 1; 
    ##################### INSTANTIATIONS #################
    # TransDef is defined elsewhere. From its use below: `states` is the fed
    # input, `y` the fed target, `Tt` the fetched network output and `L` the
    # minimised loss. The remaining return values are not used in active code.
    states,y,Tt,L,l_r,lb,reg, cross_entropy = TransDef("Control",False,layers,depth,incl,center);
    states_,y_,Tt_,L_,l_r_,lb_,reg_, cross_entropy_ = TransDef("Disturbance",False,layers,depth,incl,center);
    # Accuracy = fraction of rows whose argmax output equals the argmax target.
    ola1 = tf.argmax(Tt,dimension=1)
    ola2 = tf.argmax(y,dimension=1)
    ola3 = tf.equal(ola1,ola2)
    accuracy = tf.reduce_mean(tf.cast(ola3, tf.float32));
    ola1_ = tf.argmax(Tt_,dimension=1)
    ola2_ = tf.argmax(y_,dimension=1)
    ola3_ = tf.equal(ola1_,ola2_)
    accuracy_ = tf.reduce_mean(tf.cast(ola3_, tf.float32));    
    #a_layers = layers;
    #a_layers[-1] = 2; #We have two actions
    #states_,y_,Tt_,l_r_,lb_,reg_ = TransDef("Actor",False,a_layers,depth,incl,center,outp=True);
    
    # Variables of each network, collected by scope name; getPI/getTraj
    # snapshot and reload them through these lists.
    C_func_vars = tf.get_collection(tf.GraphKeys.VARIABLES, scope='Control');
    D_func_vars = tf.get_collection(tf.GraphKeys.VARIABLES, scope='Disturbance');
    
    #var_grad = tf.gradients(Tt_,states_)[0]
    # Gradient of the control output w.r.t. its input, restricted to the first
    # layers[0]-1 columns. Not read again in active code.
    var_grad_ = tf.gradients(Tt,states)[0]
    grad_x = tf.slice(var_grad_,[0,0],[-1,layers[0]-1]);
    #theta = tf.trainable_variables();

    # Grouped ops that zero / randomly re-initialise the Control variables.
    # Both are only run from commented-out lines in the training loop.
    set_to_zero = []
    for var  in sorted(C_func_vars,        key=lambda v: v.name):
        set_to_zero.append(var.assign(tf.zeros(tf.shape(var))))
    set_to_zero = tf.group(*set_to_zero)
    
    set_to_not_zero = []
    for var  in sorted(C_func_vars,        key=lambda v: v.name):
        set_to_not_zero.append(var.assign(tf.random_uniform(tf.shape(var),minval=-0.1,maxval=0.1)));
    set_to_not_zero = tf.group(*set_to_not_zero)    

    # DEFINE LOSS

    # lmbda and beta only appear in the commented-out loss variants below
    # (beta is reassigned before the training loop).
    lmbda = 0.0;#1.0**(-3.5);#0.01;
    beta = 0.00;
    #L = tf.sqrt(tf.reduce_mean(tf.reduce_sum(tf.square(tf.sub(y,Tt)),1,keep_dims=True))) + beta*tf.reduce_mean(tf.reduce_max(tf.abs(grad_x),reduction_indices=1,keep_dims=True));
    #L = tf.reduce_mean(tf.mul(tf.exp(imp*t_vec),tf.abs(tf.sub(y,Tt)))) + lmbda*reg;
    #L = tf.reduce_mean(tf.abs(tf.sub(y,Tt))) + lmbda*reg;    

    # DEFINE OPTIMIZER

    #nu = 5.01;
    #nunu = ler_r;#0.00005;
    # Scalar learning-rate placeholder, fed on every training step.
    nu = tf.placeholder(tf.float32, shape=[])                                         #VAR

    #lr_multiplier = ler_r
    # Only queried once (for the initial `nunu`); the training loop then uses a
    # fixed learning rate of 0.001.
    lr_schedule = PiecewiseSchedule([
                                         (0, 0.1),
                                         (10000, 0.01 ),
                                         (20000, 0.001 ),
                                         (30000, 0.0001 ),
                                    ],
                                    outside_value=0.0001)

    #optimizer = tf.train.GradientDescentOptimizer(nu)
    #optimizer
    #train_step = tf.train.MomentumOptimizer(learning_rate=nu,momentum=mom).minimize(L)
    #optimizer 
    #train_step = tf.train.AdamOptimizer(learning_rate=nu).minimize(L);
    # One RMSProp update op per network, sharing the learning-rate placeholder.
    train_step = tf.train.RMSPropOptimizer(learning_rate=nu,momentum=mom).minimize(L);
    train_step_ = tf.train.RMSPropOptimizer(learning_rate=nu,momentum=mom).minimize(L_);
    #optimizer = tf.train.RMSPropOptimizer(learning_rate=nu,momentum=mom);
    #gvs = optimizer.compute_gradients(L,theta);
    #capped_gvs = [(tf.clip_by_value(grad, -3., 3.), var) for grad, var in gvs];
    #train_step = optimizer.apply_gradients(gvs);
    #train_step = tf.train.AdagradOptimizer(learning_rate=nu,initial_accumulator_value=0.5).minimize(L);

    # Converts integer action indices into one-hot rows. The depth is
    # hard-coded to 4, which equals 2**num_ac and 2**dist_ac for two inputs.
    hot_input = tf.placeholder(tf.int64,shape=(None));   
    make_hot = tf.one_hot(hot_input, 4, on_value=1, off_value=0)

    # INITIALIZE GRAPH
    sess = tf.Session();
    init = tf.initialize_all_variables();
    sess.run(init);
    def V_0(x):
        """Return the row-wise infinity norm of ``x`` minus 1, as a column.

        The result has one row per row of ``x`` and is negative exactly where
        every entry of that row has absolute value below 1.
        """
        row_inf_norm = np.linalg.norm(x, ord=np.inf, axis=1, keepdims=True)
        return row_inf_norm - 1.0

    def p_corr(ALL_x):
        """Wrap the given values into the interval [0, 2*pi)."""
        full_turn = 2.0*np.pi
        return np.mod(ALL_x, full_turn)

    def F(ALL_x,opt_a,opt_b):
        """Evaluate the state derivative for a batch of states.

        ``ALL_x`` holds one 6-column state per row, ``opt_a`` the two control
        inputs and ``opt_b`` the two disturbance inputs (one row per state, or
        a single row that broadcasts). Returns an array with the same layout
        as ``ALL_x`` containing the time derivative of every column.

        Columns 1, 3 and 5 are the rates of columns 0, 2 and 4, and column 4
        is the angle used in the sin/cos terms (presumably
        [x, vx, z, vz, phi, omega] -- confirm against the caller).
        """
        rate_0 = ALL_x[:,1,None]
        rate_2 = ALL_x[:,3,None]
        angle = ALL_x[:,4,None]
        rate_4 = ALL_x[:,5,None]
        sin_phi = np.sin(angle)
        cos_phi = np.cos(angle)

        in_a0 = opt_a[:,0,None]
        in_a1 = opt_a[:,1,None]

        # The disturbance enters additively on the two position derivatives.
        d_col0 = rate_0 + opt_b[:,0,None]
        d_col1 = (-1.0*transDrag*rate_0/m
                  - np.multiply(in_a0,sin_phi)/m
                  - np.multiply(in_a1,sin_phi)/m)
        d_col2 = rate_2 + opt_b[:,1,None]
        # NOTE(review): unlike d_col1, the gravity/drag term here is not divided
        # by m. This looks like a missing "/m" -- confirm before changing,
        # since stored policies were trained with this form.
        d_col3 = (-1.0*(m*grav + transDrag*rate_2)
                  + np.multiply(in_a0,cos_phi)/m
                  + np.multiply(in_a1,cos_phi)/m)
        d_col4 = rate_4
        d_col5 = -1.0*(1.0/Iyy)*rotDrag*rate_4 - (l/Iyy)*in_a0 + (l/Iyy)*in_a1

        return np.concatenate((d_col0,d_col1,d_col2,d_col3,d_col4,d_col5),axis=1)

    ####################### RECURSIVE FUNC ####################

    def RK4(ALL_x,dtt,opt_a,opt_b):
        """Advance a batch of states by ``dtt`` with a 4th-order Runge-Kutta step.

        The inputs ``opt_a`` / ``opt_b`` are held constant over the step.
        Column 4 of every intermediate state and of the result is wrapped with
        ``p_corr``. ``ALL_x`` itself is not modified.
        """
        def _stage_state(slope, step):
            # State at which the next slope is evaluated, with the angle wrapped.
            stage = ALL_x + np.multiply(step, slope)
            stage[:,4] = p_corr(stage[:,4])
            return stage

        half_step = dtt/2.0
        k1 = F(ALL_x,opt_a,opt_b)
        k2 = F(_stage_state(k1, half_step),opt_a,opt_b)
        k3 = F(_stage_state(k2, half_step),opt_a,opt_b)
        k4 = F(_stage_state(k3, dtt),opt_a,opt_b)

        Snx = ALL_x + np.multiply((dtt/6.0),(k1 + 2.0*k2 + 2.0*k3 + k4))
        Snx[:,4] = p_corr(Snx[:,4])
        return Snx

    # Enumerate every corner combination of the control bounds: a +1 entry of
    # a sign tuple selects the matching max_list value, a -1 entry the
    # matching min_list value.
    perms = list(itertools.product([-1,1], repeat=num_ac))
    true_ac_list = [
        [(sign==1)*upper + (sign==-1)*lower
         for sign,lower,upper in zip(combo,min_list,max_list)]
        for combo in perms
    ]

    # Same enumeration for the disturbance inputs and their bounds.
    dist_ac = 2
    perms_ = list(itertools.product([-1,1], repeat=dist_ac))
    true_ac_list_ = [
        [(sign==1)*upper + (sign==-1)*lower
         for sign,lower,upper in zip(combo_,min_list_,max_list_)]
        for combo_ in perms_
    ]
    
    def Hot_to_Cold(hots,ac_list):
        """Map each row of ``hots`` to the ``ac_list`` entry at its argmax.

        Returns an array with one row per row of ``hots``.
        """
        chosen = hots.argmax(axis=1)
        return np.asarray([ac_list[idx] for idx in chosen])
    
    def getPI(ALL_x,F_PI=None, F_PI_=None, subSamples=1):
        """Compute one-hot control/disturbance labels for a batch of states.

        For every state in ``ALL_x`` each (control, disturbance) corner pair is
        applied for one ``dt`` step. The resulting states are then rolled
        forward under the stored policies ``F_PI`` (control) and ``F_PI_``
        (disturbance), one ``dt`` step per stored policy, and scored with
        ``V_0`` on columns 0 and 2 of the final state. Only the final state is
        scored, not the minimum along the trajectory.

        The disturbance picks the pair-minimising input for each control, and
        the control picks the input whose worst case is largest.

        Args:
            ALL_x: array with one 6-column state per row.
            F_PI, F_PI_: lists of variable-value snapshots for the Control /
                Disturbance networks, earliest-applied first. ``None`` means
                no stored policies.
            subSamples: number of RK4 sub-steps per ``dt`` step.

        Returns:
            Tuple ``(control_one_hot, disturbance_one_hot)`` produced by
            ``make_hot``. Its depth is hard-coded to 4, so more than two
            inputs per player also requires changing ``make_hot``.

        The live network variables are overwritten while rolling out and
        restored before returning.
        """
        F_PI = [] if F_PI is None else F_PI
        F_PI_ = [] if F_PI_ is None else F_PI_

        # Snapshot the live weights so they can be restored afterwards.
        current_params = sess.run(C_func_vars)
        current_params_ = sess.run(D_func_vars)

        n_states = ALL_x.shape[0]
        n_ctrl = len(perms)
        n_dist = len(perms_)
        sub_dt = dt/float(subSamples)

        # First step: try every (control k, disturbance i) pair. Rows are
        # stacked control-major, i.e. block index = k*n_dist + i.
        per_control = []
        for k in range(n_ctrl):
            opt_a = np.asarray(true_ac_list[k])*np.ones([n_states,1])
            per_disturbance = []
            for i in range(n_dist):
                opt_b = np.asarray(true_ac_list_[i])*np.ones([n_states,1])
                Snx = ALL_x
                for _ in range(subSamples):
                    Snx = RK4(Snx,sub_dt,opt_a,opt_b)
                per_disturbance.append(Snx)
            per_control.append(np.concatenate(per_disturbance,axis=0))
        next_states_ = np.concatenate(per_control,axis=0)

        # Remaining steps: follow the stored policies.
        for params,params_ in zip(F_PI,F_PI_):
            for idx,value in enumerate(params):
                sess.run(C_func_vars[idx].assign(value))
            for idx,value in enumerate(params_):
                sess.run(D_func_vars[idx].assign(value))

            tmp = ConvCosSin(next_states_)
            opt_a = Hot_to_Cold(sess.run(Tt,{states:tmp}),true_ac_list)
            opt_b = Hot_to_Cold(sess.run(Tt_,{states_:tmp}),true_ac_list_)
            for _ in range(subSamples):
                next_states_ = RK4(next_states_,sub_dt,opt_a,opt_b)

        values_ = V_0(next_states_[:,[0,2]])
        # Shape (n_states, n_ctrl*n_dist); column k*n_dist + i holds the value
        # for control k against disturbance i.
        pre_compare_vals_ = values_.reshape([-1,n_states]).T

        # For each control, the disturbance that minimises the value. Slices
        # are n_dist wide and there are n_ctrl of them; the original used the
        # two counts the other way round, which only worked while they were equal.
        worst_dist_idx = []
        worst_dist_val = []
        for k in range(n_ctrl):
            block = pre_compare_vals_[:,k*n_dist:(k+1)*n_dist,None]
            worst_dist_idx.append(np.argmin(block,axis=1))
            worst_dist_val.append(np.min(block,axis=1))
        finalF = np.concatenate(worst_dist_val,axis=1)
        index_best_a_ = np.argmax(finalF,axis=1)
        finalF_ = np.concatenate(worst_dist_idx,axis=1)
        index_best_b_ = np.array([finalF_[row,index_best_a_[row]] for row in range(len(index_best_a_))])

        # Restore the live weights.
        for idx,value in enumerate(current_params):
            sess.run(C_func_vars[idx].assign(value))
        for idx,value in enumerate(current_params_):
            sess.run(D_func_vars[idx].assign(value))

        return sess.run(make_hot,{hot_input:index_best_a_}),sess.run(make_hot,{hot_input:index_best_b_})

#    def getTraj(ALL_x,F_PI=[],F_PI_=[],subSamples=1,StepsLeft=None,Noise = False):
#
#        current_params = sess.run(C_func_vars);
#        current_params_ = sess.run(D_func_vars);
#        
#        if(StepsLeft == None): StepsLeft = len(F_PI);        
#        
#        next_states_ = ALL_x;
#        traj = [next_states_];
#        actions = [];
#
#        for params,params_ in zip(F_PI,F_PI_):
#            for ind in range(len(params)): #Reload pi*(x,t+dt) parameters
#                sess.run(C_func_vars[ind].assign(params[ind]));
#            for ind in range(len(params_)): #Reload pi*(x,t+dt) parameters
#                sess.run(D_func_vars[ind].assign(params_[ind]));            
#
#            tmp = ConvCosSin(next_states_);
#            hots = sess.run(Tt,{states:tmp});
#            opt_a = Hot_to_Cold(hots,true_ac_list)   
#            hots_ = sess.run(Tt_,{states_:tmp});
#            opt_b = Hot_to_Cold(hots_,true_ac_list_)            
#            for _ in range(subSamples):
#                next_states_ = RK4(next_states_,dt/float(subSamples),opt_a,opt_b);
#                traj.append(next_states_); 
#                actions.append(hots.argmax(axis=1)[0]);
#
#        for ind in range(len(current_params)): #Reload pi*(x,t+dt) parameters
#            sess.run(C_func_vars[ind].assign(current_params[ind]));
#        for ind in range(len(current_params_)): #Reload pi*(x,t+dt) parameters
#            sess.run(D_func_vars[ind].assign(current_params_[ind]));
#                        
#        return traj,actions#,V_0(next_states[:,[0,2]]),actions; 

    def getTraj(ALL_x,F_PI=None,F_PI_=None,subSamples=1,StepsLeft=None,Noise = False):
        """Roll states forward under the stored control/disturbance policies.

        Args:
            ALL_x: array with one 6-column state per row.
            F_PI, F_PI_: lists of variable-value snapshots for the Control /
                Disturbance networks. ``None`` means no stored policies.
            subSamples: number of RK4 sub-steps per ``dt`` step.
            StepsLeft: use only the last ``StepsLeft`` stored policies;
                ``None`` uses all of them. Values larger than the number of
                stored policies are clamped; previously they produced a
                negative slice start and silently selected the wrong policies.
            Noise: when truthy, the disturbance network is ignored and a
                single randomly chosen disturbance corner is applied to the
                whole batch at every step.

        Returns:
            ``(traj, actions, values)``. ``traj`` is the list of state arrays
            (initial state first, one entry per sub-step), ``actions`` holds
            the control index chosen for the first row of the batch at each
            sub-step, and ``values`` is ``V_0`` of columns 0 and 2 of the
            final states.

        The live network variables are overwritten while rolling out and
        restored before returning.
        """
        F_PI = [] if F_PI is None else F_PI
        F_PI_ = [] if F_PI_ is None else F_PI_

        # Snapshot the live weights so they can be restored afterwards.
        current_params = sess.run(C_func_vars)
        current_params_ = sess.run(D_func_vars)

        if StepsLeft is None:
            StepsLeft = len(F_PI)

        # Clamp the slice starts at 0 so an oversized StepsLeft means "all".
        first = max(0,len(F_PI)-StepsLeft)
        first_ = max(0,len(F_PI_)-StepsLeft)

        next_states_ = ALL_x
        traj = [next_states_]
        actions = []
        sub_dt = dt/float(subSamples)

        for params,params_ in zip(F_PI[first:],F_PI_[first_:]):
            for idx,value in enumerate(params):
                sess.run(C_func_vars[idx].assign(value))
            for idx,value in enumerate(params_):
                sess.run(D_func_vars[idx].assign(value))

            tmp = ConvCosSin(next_states_)
            hots = sess.run(Tt,{states:tmp})
            opt_a = Hot_to_Cold(hots,true_ac_list)
            if not Noise:
                hots_ = sess.run(Tt_,{states_:tmp})
            else:
                # One random one-hot row; it broadcasts over the batch in F.
                hots_ = np.zeros((1,2**dist_ac))
                hots_[0][np.random.randint(2**dist_ac)] = 1
            opt_b = Hot_to_Cold(hots_,true_ac_list_)

            # Only the first row's action is recorded (used for plot colours).
            first_row_action = hots.argmax(axis=1)[0]
            for _ in range(subSamples):
                next_states_ = RK4(next_states_,sub_dt,opt_a,opt_b)
                traj.append(next_states_)
                actions.append(first_row_action)

        # Restore the live weights.
        for idx,value in enumerate(current_params):
            sess.run(C_func_vars[idx].assign(value))
        for idx,value in enumerate(current_params_):
            sess.run(D_func_vars[idx].assign(value))

        return traj,actions,V_0(next_states_[:,[0,2]])

    def ConvCosSin(ALL_x):
        """Convert raw 6-column states into the 7-column network input.

        Output columns, in order: columns 0 and 2 divided by 5, columns 1 and
        3 divided by 10, column 5 divided by 30, then the sine and cosine of
        column 4.
        """
        angle = ALL_x[:,4,None]
        features = (
            ALL_x[:,[0,2]]/5.0,
            ALL_x[:,[1,3]]/10.0,
            ALL_x[:,[5]]/30.0,
            np.sin(angle),
            np.cos(angle),
        )
        return np.concatenate(features,axis=1)
    # *****************************************************************************
    #
    # ============================= MAIN LOOP ====================================
    #
    # *****************************************************************************
    # Bookkeeping shared by the interactive modes and the training loop.
    t1 = time.time()
    t = 0.0
    mse = np.inf
    k = 0
    kk = 0
    beta = 3.0
    batch_size = bts
    tau = 1000.0
    steps = teps
    # Stored policy snapshots (Control / Disturbance), newest first.
    ALL_PI = []
    ALL_PI_ = []
    nunu = lr_schedule.value(k)

    # One scatter colour per control index (4 corner actions).
    act_color = ['r','g','b','y']
    policy_path = "policies6D_C&D_h30_h30.pkl"

    # NOTE(review): the prompts below rely on Python 2 input(), which
    # eval()s whatever is typed. Only run these modes with trusted input.
    # pickle.load likewise must only be pointed at a trusted file.
    if(imp == 1.0):
        # Interactive mode 1: roll out a user-supplied state and scatter the
        # trajectory, coloured by the chosen control index.
        with open(policy_path, "rb") as policy_file:
            ALL_PI,ALL_PI_ = pickle.load(policy_file)
        while True:
            state_get = input('State: ')
            sub_smpl = input('SUBSAMPLING: ')
            # NOTE(review): pause_len is read but never used in this mode,
            # and there is no plt.pause/plt.show call -- confirm intent.
            pause_len = input('Pause: ')
            s_left = input("How many steps left to go (max. "+str(len(ALL_PI))+")? -> ")
            noise = input("Noise? (0/1): ")
            traj,act,_ = getTraj(state_get,F_PI=ALL_PI,F_PI_=ALL_PI_,subSamples=sub_smpl,StepsLeft=s_left,Noise=noise)
            # traj has one more entry than act (the initial state); repeat
            # the last action so both have the same length.
            act.append(act[-1])
            all_to = np.concatenate(traj)
            plt.scatter(all_to[:,[0]],all_to[:,[2]],c=[act_color[i] for i in act])
    elif(imp == 2.0):
        # Interactive mode 2: sample random rates at a fixed position/angle
        # and plot those whose final value is negative.
        with open(policy_path, "rb") as policy_file:
            ALL_PI,ALL_PI_ = pickle.load(policy_file)
        fig = plt.figure(1)
        while True:
            sub_smpl = input('SUBSAMPLING: ')
            pause_len = input('Pause: ')
            s_left = input("How many steps left to go (max. "+str(len(ALL_PI))+")? -> ")
            grid_check = np.random.uniform(-10.0,10.0,(nrolls,layers[0]-1))
            grid_check[:,0] = 1.5
            grid_check[:,2] = 1.5
            grid_check[:,4] = 1.5
            _,_,nn_vals = getTraj(grid_check,F_PI=ALL_PI,F_PI_=ALL_PI_,subSamples=sub_smpl,StepsLeft=s_left)
            fi = (nn_vals < 0.0)
            mini_reach_ = grid_check[fi[:,0]]
            ax = fig.add_subplot(111, projection='3d')
            ax.scatter(mini_reach_[:,1], mini_reach_[:,3], mini_reach_[:,5])
            plt.pause(pause_len)
    
    def _sample_states(count):
        # Draw `count` states uniformly: columns 0 and 2 in [-5, 5], columns 1
        # and 3 in [-10, 10], column 4 in [0, 2*pi], column 5 in [-30, 30].
        samples = np.random.uniform(-5.0,5.0,(count,layers[0]-1))
        samples[:,1] = samples[:,1]*2.0
        samples[:,3] = samples[:,3]*2.0
        samples[:,4] = samples[:,4]*np.pi/5.0 + np.pi
        samples[:,5] = samples[:,5]*6.0
        return samples

    def _labelled_batch(count,policies_c,policies_d):
        # Sample states and label them with getPI against the given policies.
        raw = _sample_states(count)
        labels_c,labels_d = getPI(raw,policies_c,policies_d,subSamples=3)
        return raw,ConvCosSin(raw),labels_c,labels_d

    # Fixed start states whose trajectories are plotted after every snapshot.
    ALL_xx = np.array([[-0.3,0.0,1.0,0.0,0.0,0.0],
                       [-0.2,0.0,1.0,0.0,np.pi/4,0.0],
                       [-0.1,0.0,1.0,0.0,np.pi/2 - 0.3,0.0],
                       [-0.1,0.0,1.0,0.0,np.pi/2,0.0],
                       [0.1,0.0,1.0,0.0,np.pi/2 + 0.3,0.0],
                       [0.2,0.0,1.0,0.0,np.pi/2 + 0.7,0.0],
                       [0.3,0.0,1.0,0.0,np.pi,0.0]])

    for i in xrange(iters):

        if np.mod(i,renew) == 0:
            # Integers are compared by value here; the original used
            # `is` / `is not`, which depends on interpreter int caching.
            if i != 0:
                # Store the policies just trained as the newest snapshot.
                ALL_PI.insert(0,sess.run(C_func_vars))
                ALL_PI_.insert(0,sess.run(D_func_vars))

                # Scatter colours depend only on the action labels.
                plt.figure(2)
                plt.clf()
                for tmmp in range(ALL_xx.shape[0]):
                    traj,act,_ = getTraj(ALL_xx[[tmmp],:],F_PI=ALL_PI,F_PI_=ALL_PI_,subSamples=10)
                    act.append(act[-1])
                    all_to = np.concatenate(traj)
                    plt.scatter(all_to[:,[0]],all_to[:,[2]],c=[act_color[ii] for ii in act])
                plt.pause(0.25)

                # New training / held-out data labelled against all snapshots.
                ALL_x,pre_ALL_x,PI_c,PI_d = _labelled_batch(nrolls,ALL_PI,ALL_PI_)
                ALL_x_,pre_ALL_x_,PI_c_,PI_d_ = _labelled_batch(nrolls//100,ALL_PI,ALL_PI_)

                t = t - dt
                print('Again.')
            else:
                # First iteration: no stored policies yet. A separate timer
                # name is used so the horizon time `t` is not overwritten
                # with a wall-clock value.
                tic = time.time()
                ALL_x,pre_ALL_x,PI_c,PI_d = _labelled_batch(nrolls,[],[])
                elapsed = time.time() - tic
                print("Compute Data Time = "+str(elapsed))

                ALL_x_,pre_ALL_x_,PI_c_,PI_d_ = _labelled_batch(nrolls//100,[],[])

        # |||||||||||| ----  PRINT ----- ||||||||||||

        if(np.mod(i,200) == 0):
            train_acc = sess.run(accuracy,{states:pre_ALL_x,y:PI_c})
            test_acc = sess.run(accuracy,{states:pre_ALL_x_,y:PI_c_})
            train_acc_ = sess.run(accuracy_,{states_:pre_ALL_x,y_:PI_d})
            test_acc_ = sess.run(accuracy_,{states_:pre_ALL_x_,y_:PI_d_})
            print(str(i) + ") control | TR_ACC = " + str(train_acc) + " | TE_ACC = " + str(test_acc) + " | Learning Rate = " + str(nunu))
            print(str(i) + ") disturb | TR_ACC = " + str(train_acc_) + " | TE_ACC = " + str(test_acc_) + " | Learning Rate = " + str(nunu))

        # Fixed learning rate; lr_schedule is not consulted inside the loop.
        nunu = 0.001
        tmp = np.random.randint(len(ALL_x), size=bts)
        sess.run(train_step, feed_dict={states:pre_ALL_x[tmp],y:PI_c[tmp],nu:nunu})
        sess.run(train_step_, feed_dict={states_:pre_ALL_x[tmp],y_:PI_d[tmp],nu:nunu})

    # Persist both snapshot lists, closing the file deterministically.
    with open( "policies6D_C&D_h30_h30.pkl", "wb" ) as policy_file:
        pickle.dump([ALL_PI,ALL_PI_],policy_file)
Example #9
0

def seed_func():
    """Return a random integer seed in the half-open range [0, 1000)."""
    seed = np.random.randint(low=0, high=1000)
    return seed


# Total number of environment timesteps to train for.
num_timesteps = 1e7
# One learning step is taken every `learning_freq` timesteps.
learning_freq = 4
# training iterations to go
num_iter = num_timesteps / learning_freq

# piecewise learning rate
# NOTE(review): PiecewiseSchedule presumably interpolates between the
# (iteration, value) breakpoints and returns `outside_value` past the last
# one -- confirm against its definition.
lr_multiplier = 1.0
learning_rate = PiecewiseSchedule([
    (0, 2e-4 * lr_multiplier),
    (num_iter / 2, 1e-4 * lr_multiplier),
    (num_iter * 3 / 4, 5e-5 * lr_multiplier),
],
                                  outside_value=5e-5 * lr_multiplier)

# piecewise exploration rate: starts at 1.0, reaches 0.1 at num_iter / 12 and
# 0.01 at num_iter / 2, which is also the outside value.
exploration = PiecewiseSchedule(
    [
        (0, 1.0),
        (num_iter / 12, 0.1),
        # (num_iter * 3 / 4, 0.1),
        (num_iter / 2, 0.01),
    ],
    outside_value=0.01)

######### transfer only #########
Example #10
0
def main(layers, t_hor, ind, nrolls, bts, ler_r, mom, teps, renew, imp, q):
    """Fit the "Critic" network against periodically regenerated targets.

    Every ``renew`` iterations a fresh set of states is sampled, each state is
    advanced one RK4 step of length ``dt`` under every +/- combination of the
    control bounds, and the smallest network output over those successors
    becomes the regression target for that state.  Between regenerations the
    network is trained with RMSProp on random mini-batches of the current
    state/target set.  The loop runs for a fixed 1,000,000 iterations and
    returns nothing; progress is reported through prints and a matplotlib
    figure.

    Args:
        layers: Layer sizes handed to ``TransDef``; ``layers[0]`` is also the
            number of columns of the sampled state arrays.
        t_hor: Stored in ``H_length``; not otherwise used in this function.
        ind: Worker index, only used in the start-up message.
        nrolls: Number of training states per regeneration; the test set
            holds ``nrolls / 1000`` states.
        bts: Mini-batch size.
        ler_r: Unused here (the learning rate fed to the optimizer is fixed
            at 0.001).
        mom: Momentum passed to ``tf.train.RMSPropOptimizer``.
        teps: Stored in ``steps``; not otherwise used in this function.
        renew: Number of iterations between target regenerations.
        imp: Unused here.
        q: Unused here.
    """
    # Control bounds.  opt_ac() scales gradient signs by wMin / aMin and
    # V_ret() enumerates the +/- combinations of max_list.
    wMax = 3.0
    wMin = -1.0 * wMax
    aMax = 2 * np.pi / 10.0
    aMin = -1.0 * aMax
    max_list = [wMax, aMax]

    print('Starting worker-' + str(ind))

    # Lower / upper bounds per state column; the first two also span the
    # evaluation grid below.
    Nx = 101
    minn = [-5.0, -5.0, 0.0, 6.0]
    maxx = [5.0, 5.0, 2 * np.pi, 12.0]

    # Six evaluation grids over columns 0 and 1: three fixed values of
    # column 2 (0, 2*pi/3, 4*pi/3) for each of two fixed values of column 3
    # (6.0 and 12.0).
    X = np.linspace(minn[0], maxx[0], Nx)
    Y = np.linspace(minn[1], maxx[1], Nx)
    X, Y = np.meshgrid(X, Y)
    XX = np.reshape(X, [-1, 1])
    YY = np.reshape(Y, [-1, 1])
    grid_eval = np.concatenate(
        (XX, YY, 0.0 * np.ones(XX.shape), 6.0 * np.ones(XX.shape)), axis=1)
    grid_eval_ = np.concatenate(
        (XX, YY,
         (2.0 / 3.0) * np.pi * np.ones(XX.shape), 6.0 * np.ones(XX.shape)),
        axis=1)
    grid_eval__ = np.concatenate(
        (XX, YY,
         (4.0 / 3.0) * np.pi * np.ones(XX.shape), 6.0 * np.ones(XX.shape)),
        axis=1)
    grid_evall = np.concatenate(
        (XX, YY, 0.0 * np.ones(XX.shape), 12.0 * np.ones(XX.shape)), axis=1)
    grid_evall_ = np.concatenate(
        (XX, YY,
         (2.0 / 3.0) * np.pi * np.ones(XX.shape), 12.0 * np.ones(XX.shape)),
        axis=1)
    grid_evall__ = np.concatenate(
        (XX, YY,
         (4.0 / 3.0) * np.pi * np.ones(XX.shape), 12.0 * np.ones(XX.shape)),
        axis=1)

    # Reference data: all but the last column are fed to the network as
    # states, the last column is compared with the network output.  Columns
    # 1 and 2 are swapped and column 2 is wrapped into [0, 2*pi) to match
    # the state layout used here.
    reach100s = sio.loadmat('flat_1s.mat')
    reach100s = reach100s["M"]
    reach100s[:, [1, 2]] = reach100s[:, [2, 1]]
    reach100s[:, 2] = np.mod(reach100s[:, 2], 2.0 * np.pi)
    #mean_data = np.mean(reach100s[:,:-1],axis=0);
    #std_data = np.std(reach100s[:,:-1],axis=0);

    # Weights plus biases of a fully connected net with these layer sizes.
    nofparams = 0
    for layer_idx in xrange(len(layers) - 1):
        nofparams += (layers[layer_idx] * layers[layer_idx + 1] +
                      layers[layer_idx + 1])
    print('Number of Params is: ' + str(nofparams))

    H_length = t_hor
    #-1.0; #Has to be negative                                 #VAR
    iters = 1000000
    #VAR
    #center = np.array([[0.0,0.0]])
    center = np.array([[0.0, 0.0, 0.0, 0.0, 0.0, 0.0]])
    depth = 2.0
    incl = 1.0

    ##################### DEFINITIONS #####################
    #layers = [2 + 1,10,1];                                                    #VAR
    #ssize = layers[0] - 1;
    dt = 0.05
    #VAR
    num_ac = 2
    ##################### INSTANTIATIONS #################
    states, y, Tt, l_r, lb, reg = TransDef("Critic", False, layers, depth,
                                           incl, center)
    #a_layers = layers;
    #a_layers[-1] = 2; #We have two actions
    #states_,y_,Tt_,l_r_,lb_,reg_ = TransDef("Actor",False,a_layers,depth,incl,center,outp=True);

    V_func_vars = tf.get_collection(tf.GraphKeys.VARIABLES, scope='Critic')
    #A_func_vars = tf.get_collection(tf.GraphKeys.VARIABLES, scope='Actor');

    #var_grad = tf.gradients(Tt_,states_)[0]
    # Gradient of the network output with respect to its input states.
    var_grad_ = tf.gradients(Tt, states)[0]
    grad_x = tf.slice(var_grad_, [0, 0], [-1, layers[0] - 1])
    #theta = tf.trainable_variables();

    # Op that overwrites every Critic variable with zeros.
    set_to_zero = []
    for var in sorted(V_func_vars, key=lambda v: v.name):
        set_to_zero.append(var.assign(tf.zeros(tf.shape(var))))
    set_to_zero = tf.group(*set_to_zero)

    # Op that re-initialises every Critic variable uniformly in [-0.1, 0.1).
    set_to_not_zero = []
    for var in sorted(V_func_vars, key=lambda v: v.name):
        set_to_not_zero.append(
            var.assign(
                tf.random_uniform(tf.shape(var), minval=-0.1, maxval=0.1)))
    set_to_not_zero = tf.group(*set_to_not_zero)

    # DEFINE LOSS

    lmbda = 0.0
    #1.0**(-3.5);#0.01;
    beta = 0.00
    # Root of the mean per-sample squared error, plus a gradient-magnitude
    # penalty that is switched off while beta == 0.
    L = tf.sqrt(
        tf.reduce_mean(
            tf.reduce_sum(tf.square(tf.sub(y, Tt)), 1, keep_dims=True))
    ) + beta * tf.reduce_mean(
        tf.reduce_max(tf.abs(grad_x), reduction_indices=1, keep_dims=True))
    #L = tf.reduce_mean(tf.mul(tf.exp(imp*t_vec),tf.abs(tf.sub(y,Tt)))) + lmbda*reg;
    #L = tf.reduce_mean(tf.abs(tf.sub(y,Tt))) + lmbda*reg;

    # DEFINE OPTIMIZER

    #nu = 5.01;
    #nunu = ler_r;#0.00005;
    nu = tf.placeholder(tf.float32, shape=[])  #VAR

    #lr_multiplier = ler_r
    lr_schedule = PiecewiseSchedule([
        (0, 0.01),
        (renew * 2 / 4, 0.007),
        (renew * 3 / 4, 0.005),
        (renew * 4 / 4, 0.002),
    ],
                                    outside_value=0.001)

    #train_step = tf.train.GradientDescentOptimizer(nu).minimize(L)
    #optimizer = tf.train.MomentumOptimizer(learning_rate=nu,momentum=mom);#.minimize(L)
    #optimizer = tf.train.AdamOptimizer(learning_rate=nu);
    optimizer = tf.train.RMSPropOptimizer(learning_rate=nu, momentum=mom)
    gvs = optimizer.compute_gradients(L, V_func_vars)
    # NOTE(review): capped_gvs is built but the *unclipped* gvs are applied
    # below.  Left as-is because switching would change training dynamics;
    # confirm whether clipping was meant to be active.
    capped_gvs = [(tf.clip_by_value(grad, -3., 3.), var) for grad, var in gvs]
    train_step = optimizer.apply_gradients(gvs)
    #train_step = tf.train.AdagradOptimizer(learning_rate=nu,initial_accumulator_value=0.5).minimize(L);

    # INITIALIZE GRAPH
    sess = tf.Session()
    init = tf.initialize_all_variables()
    sess.run(init)

    def p_corr(ALL_x):
        """Wrap angles into [0, 2*pi)."""
        ALL_x = np.mod(ALL_x, 2.0 * np.pi)
        return ALL_x

    def F(ALL_x, opt_a, opt_b):
        """Return the state derivative for each row of ALL_x.

        Columns 0/1 advance with column 3 times cos/sin of column 2; columns
        2 and 3 are driven directly by the two controls in opt_a.  opt_b is
        accepted but unused.
        """
        sin_phi = np.sin(ALL_x[:, 2, None])
        cos_phi = np.cos(ALL_x[:, 2, None])

        col1 = np.multiply(ALL_x[:, 3, None], cos_phi)
        col2 = np.multiply(ALL_x[:, 3, None], sin_phi)
        col3 = opt_a[:, 0, None]
        col4 = opt_a[:, 1, None]

        return np.concatenate((col1, col2, col3, col4), axis=1)

    ####################### RECURSIVE FUNC ####################

    def RK4(ALL_x, dtt, opt_a, opt_b):
        """Advance ALL_x by one classical Runge-Kutta step of length dtt.

        The controls are held constant over the step and column 2 is wrapped
        with p_corr() after every intermediate stage.
        """
        k1 = F(ALL_x, opt_a, opt_b)
        #### !!!
        # ~~~~ Compute optimal input (k2)
        ALL_tmp = ALL_x + np.multiply(dtt / 2.0, k1)
        ALL_tmp[:, 2] = p_corr(ALL_tmp[:, 2])

        k2 = F(ALL_tmp, opt_a, opt_b)
        #### !!!
        # ~~~~ Compute optimal input (k3)
        ALL_tmp = ALL_x + np.multiply(dtt / 2.0, k2)
        ALL_tmp[:, 2] = p_corr(ALL_tmp[:, 2])

        k3 = F(ALL_tmp, opt_a, opt_b)
        #### !!!
        # ~~~~ Compute optimal input (k4)
        ALL_tmp = ALL_x + np.multiply(dtt, k3)
        ALL_tmp[:, 2] = p_corr(ALL_tmp[:, 2])

        k4 = F(ALL_tmp, opt_a, opt_b)
        #### !!!

        Snx = ALL_x + np.multiply((dtt / 6.0), (k1 + 2.0 * k2 + 2.0 * k3 + k4))
        Snx[:, 2] = p_corr(Snx[:, 2])
        return Snx

    def opt_ac(grad):
        """Map gradient signs (columns 2 and 3) to bang-bang controls.

        Returns (opt_a, None); a positive gradient selects wMin / aMin and a
        negative one the opposite sign.
        """
        opt_dir_1_ = np.sign(
            grad[:, 2, None]
        ) * wMin  #np.floor((np.sign(grad[:,1,None])+1.0)/2.0)*wMin + np.ceil((np.sign(grad[:,1,None])-1.0)/2.0)*wMin;
        opt_dir_2_ = np.sign(
            grad[:, 3, None]
        ) * aMin  #np.floor((np.sign(grad[:,3,None])+1.0)/2.0)*aMin + np.ceil((np.sign(grad[:,3,None])-1.0)/2.0)*aMax;
        opt_a = np.concatenate((opt_dir_1_, opt_dir_2_), axis=1)
        return opt_a, None

    def V_ret(ALL_x):
        """Return (targets, best_actions) for the given states.

        Every +/- combination of max_list is applied for one RK4 step; the
        target is the smallest network output over the resulting successors
        and best_actions holds the control that achieved it.
        """
        due = np.inf * np.ones([ALL_x.shape[0], 1])
        perms = list(itertools.product([-1, 1], repeat=num_ac))
        opt_actions = np.zeros([ALL_x.shape[0], num_ac])
        for ac_tuple in perms:  #2**num_actions
            ac_list = [tmp1 * tmp2 for tmp1, tmp2 in zip(ac_tuple, max_list)]
            opt_a = np.asarray(ac_list) * np.ones([ALL_x.shape[0], 1])
            Snx = RK4(ALL_x, dt, opt_a, None)
            due_tmp = np.min(np.concatenate((due, sess.run(Tt, {states: Snx})),
                                            axis=1),
                             axis=1,
                             keepdims=True)
            # Rows where this action strictly improved on the best so far.
            b_indexes = (due_tmp < due)[:, 0]
            opt_actions[b_indexes] = opt_a[b_indexes]
            due = due_tmp

        # NOTE(review): uno, filt and V are computed but not returned (the
        # alternative return value is commented out below), and filt only
        # looks at the successors of the last enumerated action.
        uno = sess.run(Tt, {states: ALL_x})
        filt = [((Snx[:, k, None] > maxx[k]) | (Snx[:, k, None] < minn[k]))
                for k in range(len(minn))]
        filt = np.any(filt, axis=0)
        #due[filt] = np.inf;

        V = np.min(np.concatenate((uno, due), axis=1), axis=1, keepdims=True)

        return due, opt_actions
        #V,opt_actions;

    def sample_states(count):
        """Draw `count` random states.

        All columns start uniform in [-5, 5); column 2 is then mapped to
        [0, 2*pi) and column 3 to [6, 12).
        """
        xs = np.random.uniform(-5.0, 5.0, (count, layers[0]))
        xs[:, 2] = xs[:, 2] * np.pi / 5.0 + np.pi
        xs[:, 3] = xs[:, 3] * 3.0 / 5.0 + 9.0
        return xs

    # *****************************************************************************
    #
    # ============================= MAIN LOOP ====================================
    #                     ( )
    # *****************************************************************************
    t1 = time.time()
    t = 0.0
    mse = np.inf
    k = 0
    kk = 0
    beta = 3.0
    batch_size = bts
    tau = 1000.0
    steps = teps
    nunu = lr_schedule.value(k)
    for i in xrange(iters):

        # Integers are compared by value; identity checks against literals
        # only work by accident of CPython's small-int cache.
        if (np.mod(i, renew) == 0 and i != 0):

            # Agreement between the gradient-sign policy of the freshly
            # trained network and the enumerated best actions stored for the
            # same states.  Vectorised so that no comprehension index leaks
            # into (and overwrites) the loop counter `i` under Python 2.
            get_grads = sess.run(var_grad_, {states: ALL_x})
            opt_a, _ = opt_ac(get_grads)
            opaye = np.all(np.float32(opa) == np.float32(opt_a), axis=1)
            get_grads_ = sess.run(var_grad_, {states: ALL_x_})
            opt_a_, _ = opt_ac(get_grads_)
            opaye_ = np.all(np.float32(opa_) == np.float32(opt_a_), axis=1)
            # Normalise by the actual sample counts instead of hard-coded
            # 1e6 / 1e3 so the figures stay fractions for any nrolls.
            print("Train Accuracy = " + str(
                float(np.sum(opaye)) / len(opa)) + " Test Accuracy = " + str(
                    float(np.sum(opaye_)) / len(opa_)))

            k = 0
            # Regenerate states and targets.  The best actions are kept too,
            # so the next accuracy report compares actions for the states
            # that are actually in ALL_x / ALL_x_ at that point.
            ALL_x = sample_states(nrolls)
            V, opa = V_ret(ALL_x)

            ALL_x_ = sample_states(nrolls / 1000)
            V_, opa_ = V_ret(ALL_x_)

            ZR = sess.run(Tt, {states: reach100s[:, :-1]})
            error1 = ZR - reach100s[:, -1, None]

            #error1 = 0.0;#targ_nn - sess.run(Tt,{states:into_nn});

            #            log_avg_error[kk] = np.max(np.abs(error1));
            #            log_error[kk] = np.mean(np.abs(error1));

            # Network output on the six grids; positive values are zeroed
            # before plotting.
            Z000 = np.reshape(sess.run(Tt, {states: grid_eval}), X.shape)
            Z001 = np.reshape(sess.run(Tt, {states: grid_eval_}), X.shape)
            Z002 = np.reshape(sess.run(Tt, {states: grid_eval__}), X.shape)
            #filter_in = (Z000 <= 0.05) #& (Z000 >= 0.05);
            filter_out = (Z000 > 0.00)  #| (Z000 < -0.05);
            filter_out_ = (Z001 > 0.00)  #| (Z000 < -0.05);
            filter_out__ = (Z002 > 0.00)  #| (Z000 < -0.05);
            #Z000[filter_in] = 1.0;
            Z000[filter_out] = 0.0
            Z001[filter_out_] = 0.0
            Z002[filter_out__] = 0.0

            Z000l = np.reshape(sess.run(Tt, {states: grid_evall}), X.shape)
            Z001l = np.reshape(sess.run(Tt, {states: grid_evall_}), X.shape)
            Z002l = np.reshape(sess.run(Tt, {states: grid_evall__}), X.shape)
            #filter_in = (Z000 <= 0.05) #& (Z000 >= 0.05);
            filter_outl = (Z000l > 0.00)  #| (Z000 < -0.05);
            filter_out_l = (Z001l > 0.00)  #| (Z000 < -0.05);
            filter_out__l = (Z002l > 0.00)  #| (Z000 < -0.05);
            #Z000[filter_in] = 1.0;
            Z000l[filter_outl] = 0.0
            Z001l[filter_out_l] = 0.0
            Z002l[filter_out__l] = 0.0

            plt.clf()
            #plt.plot(ALL_t_, np.abs(allE), 'ro');
            #plt.axis([-1.0, 0.0, 0.0, 10.0])
            plt.subplot(2, 3, 1)
            plt.imshow(Z000, cmap='gray')
            plt.subplot(2, 3, 2)
            plt.imshow(Z001, cmap='gray')
            plt.subplot(2, 3, 3)
            plt.imshow(Z002, cmap='gray')
            plt.subplot(2, 3, 4)
            plt.imshow(Z000l, cmap='gray')
            plt.subplot(2, 3, 5)
            plt.imshow(Z001l, cmap='gray')
            plt.subplot(2, 3, 6)
            plt.imshow(Z002l, cmap='gray')
            plt.pause(0.01)

            print(str(t) + " || " + str(np.max(np.abs(error1))) + " , " + str(
                np.mean(np.abs(error1))) + " REG = " + str(
                    sess.run(reg)) + ") | MSE = " + str(mse) + "|ITR=" + str(
                        i))  #VAR
            t = t - dt

        #elif(i is 0):
        elif (np.mod(i, renew) == 0 and i == 0):

            # First iteration: targets are generated with every Critic
            # variable set to zero, then the variables are re-randomised
            # before training starts.
            k = 0
            sess.run(set_to_zero)
            ALL_x = sample_states(nrolls)
            V, opa = V_ret(ALL_x)

            ALL_x_ = sample_states(nrolls / 1000)
            V_, opa_ = V_ret(ALL_x_)
            sess.run(set_to_not_zero)

        # |||||||||||| ----  PRINT ----- ||||||||||||

        if (np.mod(i, 200) == 0):

            mse = sess.run(L, {
                states: ALL_x,
                y: V
            })
            test_e = sess.run(L, {
                states: ALL_x_,
                y: V_
            })
            print(str(i) + ") | MSE = " + str(mse) + " | Test_E = " + str(
                test_e) + " | Lerning Rate = " + str(nunu))

        # Fixed learning rate; lr_schedule is only consulted once, before
        # the loop.
        nunu = 0.001
        #lr_schedule.value(k);
        #nunu = ler_r/(np.mod(i,renew)+1.0);
        tmp = np.random.randint(len(ALL_x), size=bts)
        sess.run(train_step,
                 feed_dict={
                     states: ALL_x[tmp],
                     y: V[tmp],
                     nu: nunu
                 })
Example #11
0
def main(layers,t_hor,ind,nrolls,bts,ler_r,mom,teps,renew,imp,q):
    """Train a sequence of bang-bang action classifiers, one per time step.

    The "Critic" network is trained as a classifier that, for a sampled
    state, predicts which of the 2**num_ac extreme controls gives the
    smallest cost.  The cost of a control is the largest V_0 value seen
    while applying it for one step of length ``dt`` and then following the
    previously stored policies.  Every ``renew`` iterations the current
    parameters are pushed to the front of ``ALL_PI`` and new training/test
    sets are generated.  When the loop ends ``ALL_PI`` is pickled to
    "policies6Dreach_h50.pkl".

    If ``imp == 1.0`` the function instead loads that pickle and enters an
    endless interactive loop that plots trajectories; training is never
    reached in that mode.

    Args:
        layers: Layer sizes handed to ``TransDef``; sampled states have
            ``layers[0] - 2`` columns.
        t_hor: Horizon; ``int(abs(t_hor) / dt) * renew + 1`` iterations run.
        ind: Worker index, only used in the start-up message.
        nrolls: Number of states in each of the training and test sets.
        bts: Mini-batch size.
        ler_r: Unused here (the learning rate fed to the optimizer is fixed
            at 0.01).
        mom: Momentum passed to ``tf.train.RMSPropOptimizer``.
        teps: Stored in ``steps``; not otherwise used in this function.
        renew: Number of iterations between policy snapshots.
        imp: ``1.0`` selects the interactive replay mode described above.
        q: Unused here.
    """
    # Physical constants feeding the d1..d6 / f1, f2 coefficients of F().
    # NOTE(review): presumably a cart (m0) carrying a double pendulum
    # (m1, L1 and m2, L2) - confirm against the model derivation.
    m0 = 1.5
    m1 = 0.5
    m2 = 0.75
    L1 = 0.5
    l1 = L1/2.0
    L2 = 0.75
    l2 = L2/2.0
    I1 = m1*L1**2 / 12.0
    I2 = m2*L2**2 / 12.0

    d1 = m0+m1+m2
    d2 = (m1/2.0 + m2)*L1
    d3 = m2*l2
    d4 = (m1/3.0 + m2)*L1**2
    d5 = m2*L1*l2
    d6 = m2*l2**2 + I2

    g = 9.81

    f1 = (m1*l1 + m2*L1)*g
    f2 = m2*l2*g

    # Lower / upper bound of the single control input.
    min_list = [-1.0]
    max_list = [1.0]

    print('Starting worker-' + str(ind))

    f = 1
    Nx = 100*f + 1
    minn = [-5.0,-10.0,-5.0,-10.0,0.0,-10.0]
    maxx = [ 5.0, 10.0, 5.0, 10.0,2*np.pi, 10.0]

    # Calculate number of parameters of the policy
    nofparams = 0
    for layer_idx in xrange(len(layers)-1):
        nofparams += layers[layer_idx]*layers[layer_idx+1] + layers[layer_idx+1]
    print('Number of Params is: ' + str(nofparams))

    H_length = t_hor
    center = np.array([[0.0,0.0,0.0,0.0,0.0,0.0]])
    depth = 2.0
    incl = 1.0

    ##################### DEFINITIONS #####################
    #layers = [2 + 1,10,1];                                                    #VAR
    #ssize = layers[0] - 1;
    dt = 0.1                                                                  #VAR
    num_ac = 1
    iters = int(np.abs(t_hor)/dt)*renew + 1
    ##################### INSTANTIATIONS #################
    states,y,Tt,L,l_r,lb,reg, cross_entropy = TransDef("Critic",False,layers,depth,incl,center)
    # Fraction of rows whose arg-max network output matches the arg-max label.
    ola1 = tf.argmax(Tt,dimension=1)
    ola2 = tf.argmax(y,dimension=1)
    ola3 = tf.equal(ola1,ola2)
    accuracy = tf.reduce_mean(tf.cast(ola3, tf.float32))
    #a_layers = layers;
    #a_layers[-1] = 2; #We have two actions
    #states_,y_,Tt_,l_r_,lb_,reg_ = TransDef("Actor",False,a_layers,depth,incl,center,outp=True);

    V_func_vars = tf.get_collection(tf.GraphKeys.VARIABLES, scope='Critic')
    #A_func_vars = tf.get_collection(tf.GraphKeys.VARIABLES, scope='Actor');

    #var_grad = tf.gradients(Tt_,states_)[0]
    var_grad_ = tf.gradients(Tt,states)[0]
    grad_x = tf.slice(var_grad_,[0,0],[-1,layers[0]-1])
    #theta = tf.trainable_variables();

    # Ops that zero / re-randomise every Critic variable.  They are built
    # but not run anywhere in this function.
    set_to_zero = []
    for var in sorted(V_func_vars, key=lambda v: v.name):
        set_to_zero.append(var.assign(tf.zeros(tf.shape(var))))
    set_to_zero = tf.group(*set_to_zero)

    set_to_not_zero = []
    for var in sorted(V_func_vars, key=lambda v: v.name):
        set_to_not_zero.append(var.assign(tf.random_uniform(tf.shape(var),minval=-0.1,maxval=0.1)))
    set_to_not_zero = tf.group(*set_to_not_zero)

    # DEFINE LOSS (L itself is returned by TransDef above)

    lmbda = 0.0  #1.0**(-3.5);#0.01;
    beta = 0.00
    #L = tf.sqrt(tf.reduce_mean(tf.reduce_sum(tf.square(tf.sub(y,Tt)),1,keep_dims=True))) + beta*tf.reduce_mean(tf.reduce_max(tf.abs(grad_x),reduction_indices=1,keep_dims=True));
    #L = tf.reduce_mean(tf.mul(tf.exp(imp*t_vec),tf.abs(tf.sub(y,Tt)))) + lmbda*reg;
    #L = tf.reduce_mean(tf.abs(tf.sub(y,Tt))) + lmbda*reg;

    # DEFINE OPTIMIZER

    #nu = 5.01;
    #nunu = ler_r;#0.00005;
    nu = tf.placeholder(tf.float32, shape=[])                                         #VAR

    #lr_multiplier = ler_r
    lr_schedule = PiecewiseSchedule([
                                         (0, 0.1),
                                         (10000, 0.01 ),
                                         (20000, 0.001 ),
                                         (30000, 0.0001 ),
                                    ],
                                    outside_value=0.0001)

    #train_step = tf.train.MomentumOptimizer(learning_rate=nu,momentum=mom).minimize(L)
    #train_step = tf.train.AdamOptimizer(learning_rate=nu).minimize(L);
    train_step = tf.train.RMSPropOptimizer(learning_rate=nu,momentum=mom).minimize(L)
    #train_step = tf.train.AdagradOptimizer(learning_rate=nu,initial_accumulator_value=0.5).minimize(L);

    # Converts integer action indices into one-hot rows of width 2**num_ac.
    hot_input = tf.placeholder(tf.int64,shape=(None))
    make_hot = tf.one_hot(hot_input, 2**num_ac, on_value=1, off_value=0)

    # INITIALIZE GRAPH
    theta = tf.trainable_variables()
    sess = tf.Session()
    init = tf.initialize_all_variables()
    sess.run(init)

    def V_0(x):
        """Return the row-wise Euclidean norm of x as a column vector."""
        #return np.linalg.norm(x,ord=np.inf,axis=1,keepdims=True)
        return np.linalg.norm(x,axis=1,keepdims=True)

    def p_corr(ALL_x):
        """Wrap angles into [-pi, pi)."""
        ALL_x = np.mod(ALL_x + np.pi,2.0*np.pi) - np.pi
        return ALL_x

    def F(ALL_x,opt_a,opt_b):
        """Return the state derivative for each row of ALL_x.

        Columns 0-2 of the result are the velocities (state columns 3-5);
        columns 3-5 are accelerations built from the D/C/G terms below with
        a 0.1 damping coefficient on each velocity, driven by opt_a.  opt_b
        is accepted but unused.
        """
        v1 = ALL_x[:,3,None]
        w1 = ALL_x[:,4,None]
        w2 = ALL_x[:,5,None]
        # t1 / t2 are the angles themselves: they are only used inside
        # cos(t1-t2) / sin(t1-t2).  They were previously assigned
        # np.cos(angle), which made those terms cos(cos(a) - cos(b)).
        t1 = ALL_x[:,1,None]
        t2 = ALL_x[:,2,None]
        cos_t1 = np.cos(t1)
        sin_t1 = np.sin(t1)
        cos_t2 = np.cos(t2)
        sin_t2 = np.sin(t2)

        #n_c = d4*(d3*cos_t2)**2.0 + d1*(d5*np.cos(t1-t2))**2.0 + d6*((d2*cos_t1)**2.0 -d1*d4) - 2.0*d2*d3*d5*cos_t1*np.cos(t1-t2)*cos_t2
        #n_c = (d1*d4*d6 - d1*(d5*np.cos(t2-t1))**2.0 - d6*(d2*cos_t1)**2.0 + 2.0*d2*d3*d5*cos_t2*cos_t1*np.cos(t2-t1) - d4*(d3*cos_t2)**2.0);
        try:
            cos_12 = np.cos(t1-t2)
            sin_12 = np.sin(t1-t2)

            D11 = (d4*d6 - (cos_12*d5)**2.0)
            D12 = (d3*d5*cos_t2*cos_12 - d2*d6*cos_t1)
            D13 = (d2*d5*cos_12*cos_t1 - d3*d4*cos_t2)
            D21 = D12
            D22 = (d1*d6 - (d3*cos_t2)**2)
            D23 = (d2*d3*cos_t2*cos_t1 - d1*d5*cos_12)
            D31 = D13
            D32 = D23
            D33 = (d1*d4 - (d2*cos_t1)**2.0)

            # Common divisor of the three accelerations.  (Three cofactor
            # expansions n_c, n_c2, n_c3 used to be computed here as well
            # but were never read.)
            n_c_ = L1**2*L2**2*m2*(m0*m1 + m1**2*sin_t1**2 + m1*m2*sin_t1**2 + m0*m2*sin_12**2)

            C11 = 0.0
            C12 = -d2*sin_t1*w1
            C13 = -d3*sin_t2*w2
            C21 = 0.0
            C22 = 0.0
            C23 = d5*sin_12*w2
            C31 = 0.0
            C32 = -d5*sin_12*w1
            C33 = 0.0

            G1 = 0.0
            G2 = -f1*sin_t1
            G3 = -f2*sin_t2

            # Products D*C and D*G, with the zero entries of C already dropped.
            DC11 = 0.0
            DC12 = D11*C12 + D13*C32
            DC13 = D11*C13 + D12*C23
            DC21 = 0.0
            DC22 = D21*C12 + D23*C32
            DC23 = D21*C13 + D22*C23
            DC31 = 0.0
            DC32 = D31*C12 + D33*C32
            DC33 = D31*C13 + D32*C23

            DG1 = D11*G1 + D12*G2 + D13*G3
            DG2 = D21*G1 + D22*G2 + D23*G3
            DG3 = D31*G1 + D32*G2 + D33*G3

            col1 = v1
            col2 = w1
            col3 = w2
            col4 = ( -(DC11*v1 + DC12*w1 + DC13*w2) - 0.1*v1 - DG1 + D11*opt_a)/n_c_
            col5 = ( -(DC21*v1 + DC22*w1 + DC23*w2) - 0.1*w1 - DG2 + D21*opt_a)/n_c_
            col6 = ( -(DC31*v1 + DC32*w1 + DC33*w2) - 0.1*w2 - DG3 + D31*opt_a)/n_c_
        except RuntimeWarning:
            # Only reachable when warnings are configured to raise.
            # Re-raise: the columns are undefined at this point, so falling
            # through would only fail with an unrelated unbound-name error.
            print("Whoops...")
            raise

        return np.concatenate((col1,col2,col3,col4,col5,col6),axis=1)

    #Dynamics (reference notes kept from the original source)
#
#    (a) d1 = m0+m1+m2;
#    (b) d2 = m1*l1 + m2*L2
#    (c) d3 = m2*l2
#    (d) d4 = m1*l1**2 + m2*L1**2 + I1
#    (e) d5 = m2*L1*l2
#    (f) d6 = m2*l2**2 + I2
#
#    g = 9.81;
#
#    f1 = (m1*l1 + m2*L1)*g
#    f2 = m2*l2*g
    # NOTE(review): (b) above reads m2*L2 while the code uses
    # (m1/2 + m2)*L1, i.e. m1*l1 + m2*L1 - confirm which one is intended.

    ####################### RECURSIVE FUNC ####################

    def RK4(ALL_x,dtt,opt_a,opt_b):
        """Advance ALL_x by one classical Runge-Kutta step of length dtt.

        The control is held constant over the step and the angle columns
        1 and 2 are wrapped with p_corr() after every stage.
        """
        k1 = F(ALL_x,opt_a,opt_b)  #### !!!
        # ~~~~ Compute optimal input (k2)
        ALL_tmp = ALL_x + np.multiply(dtt/2.0,k1)
        ALL_tmp[:,[1,2]] = p_corr(ALL_tmp[:,[1,2]])

        k2 = F(ALL_tmp,opt_a,opt_b)  #### !!!
        # ~~~~ Compute optimal input (k3)
        ALL_tmp = ALL_x + np.multiply(dtt/2.0,k2)
        ALL_tmp[:,[1,2]] = p_corr(ALL_tmp[:,[1,2]])

        k3 = F(ALL_tmp,opt_a,opt_b)  #### !!!
        # ~~~~ Compute optimal input (k4)
        ALL_tmp = ALL_x + np.multiply(dtt,k3)
        ALL_tmp[:,[1,2]] = p_corr(ALL_tmp[:,[1,2]])

        k4 = F(ALL_tmp,opt_a,opt_b)  #### !!!

        Snx = ALL_x + np.multiply((dtt/6.0),(k1 + 2.0*k2 + 2.0*k3 + k4)) #np.multiply(dtt,k1)
        Snx[:,[1,2]] = p_corr(Snx[:,[1,2]])
        return Snx

    # One control vector per sign pattern: +1 selects the upper bound,
    # -1 the lower bound.  Its order defines the class index of each action.
    perms = list(itertools.product([-1,1], repeat=num_ac))
    true_ac_list = [
        [hi if sign == 1 else lo
         for sign, lo, hi in zip(ac_tuple, min_list, max_list)]
        for ac_tuple in perms
    ]

    def Hot_to_Cold(hots,ac_list):
        """Map each row of class scores to the control vector of its arg-max."""
        best = hots.argmax(axis=1)
        return np.asarray([ac_list[idx] for idx in best])

    def ConvCosSin(ALL_x):
        """Convert raw states into network inputs.

        Output columns: column 0 / 5, column 3 / 10, columns 4-5 / 5,
        sin of columns 1-2, cos of columns 1-2.
        """
        sin_phi = np.sin(ALL_x[:,[1,2]])
        cos_phi = np.cos(ALL_x[:,[1,2]])
        pos = ALL_x[:,[0]]/5.0
        vel = ALL_x[:,[3]]/10.0
        arate = ALL_x[:,[4,5]]/5.0
        ret_val = np.concatenate((pos,vel,arate,sin_phi,cos_phi),axis=1)
        return ret_val

    def getPI(ALL_x,F_PI=None,subSamples=1):
        """Label each state with the action that minimises its worst cost.

        Each candidate action is applied for one ``dt`` (split into
        ``subSamples`` RK4 sub-steps); the resulting states are then rolled
        forward through every parameter set in ``F_PI``, tracking the
        running maximum of V_0 over state columns 1-3.  The network
        parameters active on entry are restored before returning.

        Returns:
            (one_hot_best_action, min_value, filterr), where filterr is
            currently always 0.
        """
        # A None default avoids sharing one mutable list between calls.
        if F_PI is None:
            F_PI = []

        current_params = sess.run(theta)

        next_states = []
        for ac in true_ac_list:
            opt_a = np.asarray(ac)*np.ones([ALL_x.shape[0],1])
            Snx = ALL_x
            for _ in range(subSamples):
                Snx = RK4(Snx,dt/float(subSamples),opt_a,None)
            next_states.append(Snx)
        # Successors of all actions stacked along axis 0, action-major.
        next_states = np.concatenate(next_states,axis=0)
        values = V_0(next_states[:,[1,2,3]])

        # NOTE(review): every theta[ind].assign(...) call adds a new op to
        # the graph, so the graph keeps growing with each reload; consider
        # prebuilt assign ops fed through placeholders.
        for params in F_PI:
            for ind in range(len(params)): #Reload pi*(x,t+dt) parameters
                sess.run(theta[ind].assign(params[ind]))

            for _ in range(subSamples):
                hots = sess.run(Tt,{states:ConvCosSin(next_states)})
                opt_a = Hot_to_Cold(hots,true_ac_list)
                next_states = RK4(next_states,dt/float(subSamples),opt_a,None)
                values = np.max((values,V_0(next_states[:,[1,2,3]])),axis=0)

        values_ = values#V_0(next_states);
        # Reshape to (n_states, n_actions) and pick the cheapest action.
        compare_vals_ = values_.reshape([-1,ALL_x.shape[0]]).T         #Changed to values instead of values_
        index_best_a_ = compare_vals_.argmin(axis=1)                    #Changed to ARGMIN
        values_ = np.min(compare_vals_,axis=1,keepdims=True)

        filterr = 0#np.max(compare_vals_,axis=1) > -0.8
        #index_best_a_ = index_best_a_[filterr]
        #values_ = values_[filterr]
        #print("States filtered out: "+str(len(filterr)-np.sum(filterr)))

        for ind in range(len(current_params)): #Restore the parameters active on entry
            sess.run(theta[ind].assign(current_params[ind]))

        return sess.run(make_hot,{hot_input:index_best_a_}),values_,filterr

    def getTraj(ALL_x,F_PI=None,subSamples=1,StepsLeft=None,Noise=False, Static=False):
        """Roll states forward under stored policies and record the path.

        Args:
            ALL_x: Start states.
            F_PI: List of stored parameter sets (one per time step).
            subSamples: RK4 sub-steps per ``dt``.
            StepsLeft: Only the last ``StepsLeft`` entries of ``F_PI`` are
                used; defaults to all of them.
            Noise: If true, zero-mean Gaussian noise (std 0.01) is added to
                the state before every integration sub-step.
            Static: If true, a single parameter set
                (``F_PI[len(F_PI)-StepsLeft]``) is applied for a
                user-entered number of steps instead of stepping through
                ``F_PI``.

        Returns:
            (traj, actions, V_0 of columns 0 and 2 of the final states).
            The network parameters active on entry are restored.
        """
        # A None default avoids sharing one mutable list between calls.
        if F_PI is None:
            F_PI = []

        current_params = sess.run(theta)

        if StepsLeft is None:
            StepsLeft = len(F_PI)

        next_states = ALL_x
        traj = [next_states]
        actions = []

        if Static:
            # NOTE(review): Python 2 input() evaluates what the user types.
            steps = input("How Many Steps? ")
            for ind in range(len(F_PI[len(F_PI)-StepsLeft])): #Reload pi*(x,t+dt) parameters
                sess.run(theta[ind].assign(F_PI[len(F_PI)-StepsLeft][ind]))
            for _step in range(steps):
                for _ in range(subSamples):
                    tmp = ConvCosSin(next_states)
                    hots = sess.run(Tt,{states:tmp})
                    opt_a = Hot_to_Cold(hots,true_ac_list)
                    # The test used to be `Noise == False`, which injected
                    # noise exactly when the caller asked for none.
                    if Noise:
                        next_states = next_states + np.random.normal(size=next_states.shape)*0.01

                    next_states = RK4(next_states,dt/float(subSamples),opt_a,None)
                    traj.append(next_states)
                    actions.append(hots.argmax(axis=1)[0])
        else:
            for params in F_PI[len(F_PI)-StepsLeft:]:
                for ind in range(len(params)): #Reload pi*(x,t+dt) parameters
                    sess.run(theta[ind].assign(params[ind]))

                for _ in range(subSamples):
                    tmp = ConvCosSin(next_states)
                    hots = sess.run(Tt,{states:tmp})
                    opt_a = Hot_to_Cold(hots,true_ac_list)
                    if Noise:
                        next_states = next_states + np.random.normal(size=next_states.shape)*0.01
                    next_states = RK4(next_states,dt/float(subSamples),opt_a,None)
                    traj.append(next_states)
                    actions.append(hots.argmax(axis=1)[0])

        for ind in range(len(current_params)): #Restore the parameters active on entry
            sess.run(theta[ind].assign(current_params[ind]))

        # NOTE(review): getPI scores columns [1,2,3] while this uses [0,2];
        # all visible callers discard this value - confirm the intent.
        return traj,actions,V_0(next_states[:,[0,2]])

    def sample_states():
        """Draw nrolls random states.

        All columns start uniform in [-5, 5); columns 1-2 are then scaled to
        [-pi, pi) and column 3 to [-10, 10).
        """
        xs = np.random.uniform(-5.0,5.0,(nrolls,layers[0]-2))
        xs[:,[1,2]] = xs[:,[1,2]]*np.pi/5.0
        xs[:,[3]] = xs[:,[3]]*2.0
        return xs

    def make_dataset(policies):
        """Sample states and return (raw states, network inputs, labels)."""
        xs = sample_states()
        labels,_,_ = getPI(xs,policies,subSamples=3)
        return xs,ConvCosSin(xs),labels

    # *****************************************************************************
    #
    # ============================= MAIN LOOP ====================================
    #
    # *****************************************************************************
    t1 = time.time()
    t = 0.0
    mse = np.inf
    k = 0
    kk = 0
    beta = 3.0
    batch_size = bts
    tau = 1000.0
    steps = teps
    ALL_PI = []
    nunu = lr_schedule.value(k)

    act_color = ['r','g','b','y']
    if(imp == 1.0):
        # Interactive replay of previously saved policies; never returns.
        # NOTE(review): pickle.load and Python 2 input() both execute
        # arbitrary content - only use with trusted files and operators.
        with open("policies6Dreach_h50.pkl", "rb") as pkl_file:
            ALL_PI = pickle.load(pkl_file)
        cc = 0
        while True:
            state_get = input('State: ')
            sub_smpl = input('SUBSAMPLING: ')
            pause_len = input('Pause: ')
            s_left = input("How many steps left to go (max. "+str(len(ALL_PI))+")? -> ")
            noise = input("Noise? (0/1): ")
            stat = input("Static? (0/1): ")
            traj,act,_ = getTraj(state_get,F_PI=ALL_PI,subSamples=sub_smpl,StepsLeft=s_left,Noise=noise,Static=stat)
            #act.append(act[-1]);
            all_to = np.concatenate(traj)
            plt.scatter(all_to[:,[1]],all_to[:,[2]])#,color=act_color[cc % len(act_color)])
            plt.pause(pause_len)
            cc = cc + 1
            #plt.colorbar()

    for i in xrange(iters):

        # Integers are compared by value; identity checks against literals
        # only work by accident of CPython's small-int cache.
        if(np.mod(i,renew) == 0 and i != 0):

            # Snapshot the policy just trained; the newest goes first.
            ALL_PI.insert(0,sess.run(theta))

            plt.figure(2) #TODO: Figure out why facing up vs facing down has same action... -> Solved: colors in a scatter plot only depend on the labels
            plt.clf()
            ALL_xx = np.array([[0.0,0.1,0.1,0.0,0.0,0.0],
                               [0.0,np.pi,np.pi,0.0,0.0,0.0],
                               [0.5,0.0,0.0,0.0,0.0,0.0],
                               [0.0,-np.pi/2,np.pi/2,0.0,0.0,0.0]])
            for tmmp in range(ALL_xx.shape[0]):
                traj,act,_ = getTraj(ALL_xx[[tmmp],:],F_PI=ALL_PI,subSamples=10)
                #act.append(act[-1]);
                all_to = np.concatenate(traj)
                plt.scatter(all_to[:,[1]],all_to[:,[2]])#c=[act_color[ii] for ii in act]);

            plt.pause(0.25)

            k = 0
            # Fresh training and test sets labelled against all snapshots.
            ALL_x,pre_ALL_x,PI = make_dataset(ALL_PI)
            ALL_x_,pre_ALL_x_,PI_ = make_dataset(ALL_PI)

            t = t - dt
            print('Again.')

        elif(np.mod(i,renew) == 0 and i == 0):

#            sess.run(set_to_zero);
            # Separate name for the wall-clock stamp so the `t` counter
            # above is not overwritten.
            t_start = time.time()
            ALL_x,pre_ALL_x,PI = make_dataset([])
            elapsed = time.time() - t_start
            print("Compute Data Time = "+str(elapsed))

            ALL_x_,pre_ALL_x_,PI_ = make_dataset([])
#            sess.run(set_to_not_zero);

        # |||||||||||| ----  PRINT ----- ||||||||||||

        if(np.mod(i,200) == 0):

            train_acc = sess.run(accuracy,{states:pre_ALL_x,y:PI})
            test_acc = sess.run(accuracy,{states:pre_ALL_x_,y:PI_})
            print(str(i) + ") | TR_ACC = " + str(train_acc) + " | TE_ACC = " + str(test_acc) + " | Lerning Rate = " + str(nunu))

        # Fixed learning rate; lr_schedule is only consulted once, before
        # the loop.
        nunu = 0.01
        tmp = np.random.randint(len(ALL_x), size=bts)
        sess.run(train_step, feed_dict={states:pre_ALL_x[tmp],y:PI[tmp],nu:nunu})

    with open("policies6Dreach_h50.pkl", "wb") as pkl_file:
        pickle.dump(ALL_PI,pkl_file)
def main(layers, t_hor, ind, nrolls, bts, ler_r, mom, teps, renew, imp, q):
    """Train a time-indexed sequence of bang-bang policies for a 4-state model.

    Every ``renew`` iterations the current network weights are snapshotted
    (newest first) into ``ALL_PI``, fresh states are sampled, and the best
    extreme action for each state is recomputed by ``getPI``: one RK4 step
    per candidate action followed by a roll-out through the stored
    snapshots.  Between refreshes the network is fitted to those targets
    with RMSProp on random minibatches.

    Args:
        layers: Layer sizes handed to ``TransDef``.  ``layers[0]`` is also
            the number of state columns sampled; the dynamics in ``F``
            produce four columns, so presumably ``layers[0] == 4`` and
            ``layers[-1] == 2`` -- TODO confirm against callers.
        t_hor: Unused in this function.
        ind: Worker index, only printed.
        nrolls: Number of training states sampled per refresh; the test
            set has ``nrolls // 100`` states.
        bts: Minibatch size.
        ler_r: Unused in this function.
        mom: Momentum passed to ``tf.train.RMSPropOptimizer``.
        teps: Unused in this function.
        renew: Number of iterations between snapshot/resample refreshes.
        imp: Unused in this function.
        q: Unused in this function.

    Returns:
        None.  Progress is printed every 200 iterations.
    """
    # Quad Params
    wMax = 3.0
    wMin = -1.0 * wMax
    aMax = 2 * np.pi / 10.0
    aMin = -1.0 * aMax
    max_list = [wMax, aMax]
    min_list = [wMin, aMin]

    print('Starting worker-' + str(ind))

    Nx = 101
    minn = [-5.0, -5.0, 0.0, 6.0]
    maxx = [5.0, 5.0, 2 * np.pi, 12.0]

    # Evaluation grids over the first two state columns at fixed values of
    # the last two.  They are not consumed anywhere in this function.
    X = np.linspace(minn[0], maxx[0], Nx)
    Y = np.linspace(minn[1], maxx[1], Nx)
    X, Y = np.meshgrid(X, Y)
    XX = np.reshape(X, [-1, 1])
    YY = np.reshape(Y, [-1, 1])
    grid_eval = np.concatenate(
        (XX, YY, 0.0 * np.ones(XX.shape), 6.0 * np.ones(XX.shape)), axis=1)
    grid_eval_ = np.concatenate(
        (XX, YY,
         (2.0 / 3.0) * np.pi * np.ones(XX.shape), 6.0 * np.ones(XX.shape)),
        axis=1)
    grid_eval__ = np.concatenate(
        (XX, YY,
         (4.0 / 3.0) * np.pi * np.ones(XX.shape), 6.0 * np.ones(XX.shape)),
        axis=1)
    grid_evall = np.concatenate(
        (XX, YY, 0.0 * np.ones(XX.shape), 12.0 * np.ones(XX.shape)), axis=1)
    grid_evall_ = np.concatenate(
        (XX, YY,
         (2.0 / 3.0) * np.pi * np.ones(XX.shape), 12.0 * np.ones(XX.shape)),
        axis=1)
    grid_evall__ = np.concatenate(
        (XX, YY,
         (4.0 / 3.0) * np.pi * np.ones(XX.shape), 12.0 * np.ones(XX.shape)),
        axis=1)

    # Reference data: columns 1 and 2 are swapped and column 2 is wrapped
    # into [0, 2*pi).  Loaded but not consumed anywhere in this function.
    reach100s = sio.loadmat('flat_1s.mat')
    reach100s = reach100s["M"]
    reach100s[:, [1, 2]] = reach100s[:, [2, 1]]
    reach100s[:, 2] = np.mod(reach100s[:, 2], 2.0 * np.pi)

    # Weights plus biases of a fully connected net with these layer sizes.
    nofparams = sum(n_in * n_out + n_out
                    for n_in, n_out in zip(layers[:-1], layers[1:]))
    print('Number of Params is: ' + str(nofparams))

    iters = 1000000
    center = np.array([[0.0, 0.0, 0.0, 0.0, 0.0, 0.0]])
    depth = 2.0
    incl = 1.0

    ##################### DEFINITIONS #####################
    dt = 0.05
    num_ac = 2
    ##################### INSTANTIATIONS #################
    states, y, Tt, L, l_r, lb, reg, cross_entropy = TransDef(
        "Critic", False, layers, depth, incl, center)
    # Fraction of rows where network output and target share their argmax.
    ola1 = tf.argmax(Tt, dimension=1)
    ola2 = tf.argmax(y, dimension=1)
    ola3 = tf.equal(ola1, ola2)
    accuracy = tf.reduce_mean(tf.cast(ola3, tf.float32))

    # Gradient of the output w.r.t. the input, all but the last input column.
    var_grad_ = tf.gradients(Tt, states)[0]
    grad_x = tf.slice(var_grad_, [0, 0], [-1, layers[0] - 1])

    # DEFINE LOSS
    # Replaces the loss returned by TransDef: root of the mean squared row
    # error, plus a gradient-magnitude penalty that beta = 0 switches off.
    beta = 0.00
    L = tf.sqrt(
        tf.reduce_mean(
            tf.reduce_sum(tf.square(tf.sub(y, Tt)), 1, keep_dims=True))
    ) + beta * tf.reduce_mean(
        tf.reduce_max(tf.abs(grad_x), reduction_indices=1, keep_dims=True))

    # DEFINE OPTIMIZER
    nu = tf.placeholder(tf.float32, shape=[])  # learning rate fed per step

    lr_schedule = PiecewiseSchedule([
        (0, 0.1),
        (10000, 0.01),
        (20000, 0.001),
        (30000, 0.0001),
    ],
                                    outside_value=0.0001)

    train_step = tf.train.RMSPropOptimizer(learning_rate=nu,
                                           momentum=mom).minimize(L)

    hot_input = tf.placeholder(tf.int64, shape=(None))
    make_hot = tf.one_hot(hot_input, 4, on_value=1, off_value=0)

    # INITIALIZE GRAPH
    theta = tf.trainable_variables()
    sess = tf.Session()
    init = tf.initialize_all_variables()
    sess.run(init)

    def V_0(x):
        """Return the infinity norm of each row of ``x`` minus 1, as (N, 1)."""
        return np.linalg.norm(x, ord=np.inf, axis=1, keepdims=True) - 1.0

    def p_corr(ALL_x):
        """Wrap angles into [0, 2*pi)."""
        ALL_x = np.mod(ALL_x, 2.0 * np.pi)
        return ALL_x

    def F(ALL_x, opt_a, opt_b):
        """Return the state derivative for each row; ``opt_b`` is unused.

        Column 3 is projected through cos/sin of column 2 to drive columns
        0 and 1; the two action columns drive columns 2 and 3 directly.
        """
        sin_phi = np.sin(ALL_x[:, 2, None])
        cos_phi = np.cos(ALL_x[:, 2, None])

        col1 = np.multiply(ALL_x[:, 3, None], cos_phi)
        col2 = np.multiply(ALL_x[:, 3, None], sin_phi)
        col3 = opt_a[:, 0, None]
        col4 = opt_a[:, 1, None]

        return np.concatenate((col1, col2, col3, col4), axis=1)

    ####################### RECURSIVE FUNC ####################

    def RK4(ALL_x, dtt, opt_a, opt_b):
        """Advance every row by ``dtt`` with classic RK4 under fixed actions.

        The angle column (2) is wrapped after every intermediate stage and
        on the result.
        """
        k1 = F(ALL_x, opt_a, opt_b)

        ALL_tmp = ALL_x + np.multiply(dtt / 2.0, k1)
        ALL_tmp[:, 2] = p_corr(ALL_tmp[:, 2])
        k2 = F(ALL_tmp, opt_a, opt_b)

        ALL_tmp = ALL_x + np.multiply(dtt / 2.0, k2)
        ALL_tmp[:, 2] = p_corr(ALL_tmp[:, 2])
        k3 = F(ALL_tmp, opt_a, opt_b)

        ALL_tmp = ALL_x + np.multiply(dtt, k3)
        ALL_tmp[:, 2] = p_corr(ALL_tmp[:, 2])
        k4 = F(ALL_tmp, opt_a, opt_b)

        Snx = ALL_x + np.multiply((dtt / 6.0), (k1 + 2.0 * k2 + 2.0 * k3 + k4))
        Snx[:, 2] = p_corr(Snx[:, 2])
        return Snx

    # Every combination of (min, max) over the num_ac action channels.
    perms = list(itertools.product([-1, 1], repeat=num_ac))

    def Hot_to_Cold(opt_a):
        """Snap each action column in place to its max (if > 0) or min bound.

        Returns the same array.  Not called anywhere in this function.
        """
        # Column-wise np.where: `not mask` is invalid on an array, and an
        # (N, 1) mask cannot index an (N, 2) array.
        for k in range(len(max_list)):
            opt_a[:, k] = np.where(opt_a[:, k] > 0.0, max_list[k],
                                   min_list[k])
        return opt_a

    def getPI(ALL_x, F_PI=None):
        """Pick, per state, the extreme action whose roll-out gets lowest V_0.

        Things to keep in MIND: the value associated with an action is the
        minimum of V_0 across the trajectory it starts.

        Args:
            ALL_x: (N, 4) array of states.
            F_PI: Stored parameter snapshots, applied in list order after
                the first step.  ``None`` or empty means a one-step
                look-ahead.

        Returns:
            ``(best_actions, final_values)``: an (N, num_ac) array of the
            chosen action per state, and an (N, 1) array holding the
            minimum of the chosen trajectory value and ``V_0`` at the
            start state.
        """
        if F_PI is None:
            F_PI = []

        # Remember the live weights; they are restored before returning.
        current_params = sess.run(theta)

        n_samples = ALL_x.shape[0]
        next_states = []
        true_ac_list = []
        for ac_tuple in perms:  # 2**num_ac candidate actions
            # ASSUMING: aMax = -aMin
            ac_list = [sign * bound for sign, bound in zip(ac_tuple, max_list)]
            true_ac_list.append(ac_list)
            opt_a = np.asarray(ac_list) * np.ones([n_samples, 1])
            next_states.append(RK4(ALL_x, dt, opt_a, None))
        # Action-major stacking: row a * n_samples + s is state s, action a.
        next_states = np.concatenate(next_states, axis=0)
        values = V_0(next_states[:, [0, 1]])

        for params in F_PI:
            # Reload pi*(x, t+dt) parameters.
            for var, value in zip(theta, params):
                sess.run(var.assign(value))

            # Continue each of the stacked trajectories under this policy.
            opt_a = sess.run(Tt, {states: next_states})
            next_states = RK4(next_states, dt, opt_a, None)
            values = np.minimum(values, V_0(next_states[:, [0, 1]]))

        # Undo the action-major stacking: one row per state, one column
        # per candidate action.
        compare_vals = values.reshape([len(perms), n_samples]).T
        values = np.min(compare_vals, axis=1, keepdims=True)
        index_best_a = compare_vals.argmin(axis=1)
        best_actions = np.asarray([true_ac_list[a] for a in index_best_a])
        final_values = np.minimum(values, V_0(ALL_x[:, [0, 1]]))

        # Put the live weights back.
        for var, value in zip(theta, current_params):
            sess.run(var.assign(value))

        return best_actions, final_values

    def sample_states(count):
        """Draw ``count`` uniform states.

        Columns 0 and 1 lie in [-5, 5], column 2 in [0, 2*pi] and column 3
        in [6, 12].
        """
        xs = np.random.uniform(-5.0, 5.0, (count, layers[0]))
        xs[:, 2] = xs[:, 2] * np.pi / 5.0 + np.pi
        xs[:, 3] = xs[:, 3] * 3.0 / 5.0 + 9.0
        return xs

    # *****************************************************************************
    #
    # ============================= MAIN LOOP ====================================
    #
    # *****************************************************************************
    ALL_PI = []
    # Only the first progress line reports this value; every training step
    # uses the constant rate assigned at the bottom of the loop.
    nunu = lr_schedule.value(0)
    for i in xrange(iters):

        if np.mod(i, renew) == 0:
            if i != 0:
                # Snapshot the policy just trained, newest first, so that
                # getPI rolls forward through the snapshots in time order.
                ALL_PI.insert(0, sess.run(theta))

            # ALL_PI is empty on the first pass: one-step look-ahead.
            ALL_x = sample_states(nrolls)
            PI, _ = getPI(ALL_x, ALL_PI)

            # Floor division: the count is used as an array dimension.
            ALL_x_ = sample_states(nrolls // 100)
            PI_, _ = getPI(ALL_x_, ALL_PI)

        # |||||||||||| ----  PRINT ----- ||||||||||||
        if np.mod(i, 200) == 0:
            train_acc = sess.run(accuracy, {states: ALL_x, y: PI})
            test_acc = sess.run(accuracy, {states: ALL_x_, y: PI_})
            print(str(i) + ") | TR_ACC = " + str(train_acc) +
                  " | TE_ACC = " + str(test_acc) +
                  " | Lerning Rate = " + str(nunu))

        nunu = 0.001
        tmp = np.random.randint(len(ALL_x), size=bts)
        sess.run(train_step,
                 feed_dict={
                     states: ALL_x[tmp],
                     y: PI[tmp],
                     nu: nunu
                 })
Example #13
0
def main(layers,t_hor,ind,nrolls,bts,ler_r,mom,teps,renew,imp,q):
    # Quad Params
    #Change to limit control in pitch or roll
    max_list = [0.1,0.1,11.81,1.0]; #w=1
    min_list = [-0.1,-0.1,7.81,-1.0]; 
    
    max_list_ = [0.5,0.5,0.5]
    min_list_ = [-0.5,-0.5,-0.5]
    
    g = 9.81;


    print 'Starting worker-' + str(ind)

    f = 1;
    Nx = 100*f + 1;
    minn = [-5.0,-10.0,-5.0,-10.0,0.0,-10.0];
    maxx = [ 5.0, 10.0, 5.0, 10.0,2*np.pi, 10.0];
    
    X = np.linspace(minn[0],maxx[0],Nx);
    Y = np.linspace(minn[2],maxx[2],Nx);
    Z = np.linspace(minn[4],maxx[4],Nx);
    X_,Y_,Z_ = np.meshgrid(X, Y, Z);    
    X,Y = np.meshgrid(X, Y);
    XX = np.reshape(X,[-1,1]);
    YY = np.reshape(Y,[-1,1]);
    XX_ = np.reshape(X_,[-1,1]);
    YY_ = np.reshape(Y_,[-1,1]);
    ZZ_ = np.reshape(Z_,[-1,1]); grid_check = np.concatenate((XX_,np.ones(XX_.shape),YY_,np.ones(XX_.shape),ZZ_,np.zeros(XX_.shape)),axis=1);
    grid_eval = np.concatenate((XX,YY,0.0*np.ones(XX.shape),6.0*np.ones(XX.shape)),axis=1);
    grid_eval_ = np.concatenate((XX,YY,(2.0/3.0)*np.pi*np.ones(XX.shape),6.0*np.ones(XX.shape)),axis=1);
    grid_eval__ = np.concatenate((XX,YY,(4.0/3.0)*np.pi*np.ones(XX.shape),6.0*np.ones(XX.shape)),axis=1);
    grid_evall = np.concatenate((XX,YY,0.0*np.ones(XX.shape),12.0*np.ones(XX.shape)),axis=1);
    grid_evall_ = np.concatenate((XX,YY,(2.0/3.0)*np.pi*np.ones(XX.shape),12.0*np.ones(XX.shape)),axis=1);
    grid_evall__ = np.concatenate((XX,YY,(4.0/3.0)*np.pi*np.ones(XX.shape),12.0*np.ones(XX.shape)),axis=1);    


    # Calculate number of parameters of the policy
    nofparams = 0;
    for i in xrange(len(layers)-1):
        nofparams += layers[i]*layers[i+1] + layers[i+1];
    print 'Number of Params is: ' + str(nofparams)
    
    H_length = t_hor;
    center = np.array([[0.0,0.0,0.0,0.0,0.0,0.0]])
    depth = 2.0;
    incl = 1.0;

    ##################### DEFINITIONS #####################
    #layers = [2 + 1,10,1];                                                    #VAR
    #ssize = layers[0] - 1;
    dt = 0.1;                                                                 #VAR
    num_ac = 4;
    dist_ac = 3;
    iters = int(np.abs(t_hor)/dt)*renew + 1; 
    ##################### INSTANTIATIONS #################
    states,y,Tt,L,l_r,lb,reg, cross_entropy = TransDef("Control",False,layers,depth,incl,center);
    layers_ = layers[:]
    layers_[-1] = 2**dist_ac
    states_,y_,Tt_,L_,l_r_,lb_,reg_, cross_entropy_ = TransDef("Disturbance",False,layers_,depth,incl,center);
    ola1 = tf.argmax(Tt,dimension=1)
    ola2 = tf.argmax(y,dimension=1)
    ola3 = tf.equal(ola1,ola2)
    accuracy = tf.reduce_mean(tf.cast(ola3, tf.float32));
    ola1_ = tf.argmax(Tt_,dimension=1)
    ola2_ = tf.argmax(y_,dimension=1)
    ola3_ = tf.equal(ola1_,ola2_)
    accuracy_ = tf.reduce_mean(tf.cast(ola3_, tf.float32));    
    #a_layers = layers;
    #a_layers[-1] = 2; #We have two actions
    #states_,y_,Tt_,l_r_,lb_,reg_ = TransDef("Actor",False,a_layers,depth,incl,center,outp=True);
    
    C_func_vars = tf.get_collection(tf.GraphKeys.VARIABLES, scope='Control');
    D_func_vars = tf.get_collection(tf.GraphKeys.VARIABLES, scope='Disturbance');
    
    #var_grad = tf.gradients(Tt_,states_)[0]
    var_grad_ = tf.gradients(Tt,states)[0]
    grad_x = tf.slice(var_grad_,[0,0],[-1,layers[0]-1]);
    #theta = tf.trainable_variables();

    set_to_zero = []
    for var  in sorted(C_func_vars,        key=lambda v: v.name):
        set_to_zero.append(var.assign(tf.zeros(tf.shape(var))))
    set_to_zero = tf.group(*set_to_zero)
    
    set_to_not_zero = []
    for var  in sorted(C_func_vars,        key=lambda v: v.name):
        set_to_not_zero.append(var.assign(tf.random_uniform(tf.shape(var),minval=-0.1,maxval=0.1)));
    set_to_not_zero = tf.group(*set_to_not_zero)    

    # DEFINE LOSS

    lmbda = 0.0;#1.0**(-3.5);#0.01;
    beta = 0.00;
    #L = tf.sqrt(tf.reduce_mean(tf.reduce_sum(tf.square(tf.sub(y,Tt)),1,keep_dims=True))) + beta*tf.reduce_mean(tf.reduce_max(tf.abs(grad_x),reduction_indices=1,keep_dims=True));
    #L = tf.reduce_mean(tf.mul(tf.exp(imp*t_vec),tf.abs(tf.sub(y,Tt)))) + lmbda*reg;
    #L = tf.reduce_mean(tf.abs(tf.sub(y,Tt))) + lmbda*reg;    

    # DEFINE OPTIMIZER

    #nu = 5.01;
    #nunu = ler_r;#0.00005;
    nu = tf.placeholder(tf.float32, shape=[])                                         #VAR

    #lr_multiplier = ler_r
    lr_schedule = PiecewiseSchedule([
                                         (0, 0.1),
                                         (10000, 0.01 ),
                                         (20000, 0.001 ),
                                         (30000, 0.0001 ),
                                    ],
                                    outside_value=0.0001)

    #optimizer = tf.train.GradientDescentOptimizer(nu)
    #optimizer
    #train_step = tf.train.MomentumOptimizer(learning_rate=nu,momentum=mom).minimize(L)
    #optimizer 
    #train_step = tf.train.AdamOptimizer(learning_rate=nu).minimize(L);
    train_step = tf.train.RMSPropOptimizer(learning_rate=nu,momentum=mom).minimize(L);
    train_step_ = tf.train.RMSPropOptimizer(learning_rate=nu,momentum=mom).minimize(L_);
    #optimizer = tf.train.RMSPropOptimizer(learning_rate=nu,momentum=mom);
    #gvs = optimizer.compute_gradients(L,theta);
    #capped_gvs = [(tf.clip_by_value(grad, -3., 3.), var) for grad, var in gvs];
    #train_step = optimizer.apply_gradients(gvs);
    #train_step = tf.train.AdagradOptimizer(learning_rate=nu,initial_accumulator_value=0.5).minimize(L);

    hot_input = tf.placeholder(tf.int64,shape=(None));   
    make_hot = tf.one_hot(hot_input, 2**num_ac, on_value=1, off_value=0)
    make_hot_ = tf.one_hot(hot_input, 2**dist_ac, on_value=1, off_value=0)

    # INITIALIZE GRAPH
    sess = tf.Session();
    init = tf.initialize_all_variables();
    sess.run(init);

    def V_0(x):
        #return np.linalg.norm(x,ord=np.inf,axis=1,keepdims=True) - 1.0
        return np.linalg.norm(x,axis=1,keepdims=True) - 1.0

    def p_corr(ALL_x):
        ALL_x = np.mod(ALL_x,2.0*np.pi);
        return ALL_x;

#    def F(ALL_x,opt_a,opt_b):#(grad,ALL_x):
#       col1 = ALL_x[:,3,None] - opt_b[:,0,None]
#       col2 = ALL_x[:,4,None] - opt_b[:,1,None]
#       col3 = ALL_x[:,5,None] - opt_b[:,2,None]
#       col4 = g*opt_a[:,0,None]
#       col5 = -g*opt_a[:,1,None]
#       col6 = opt_a[:,2,None] - g
#       
#       return np.concatenate((col1,col2,col3,col4,col5,col6),axis=1);
   
    def F(ALL_x,opt_a,opt_b):#(grad,ALL_x):
       col1 = ALL_x[:,3,None] - opt_b[:,0,None]
       col2 = ALL_x[:,4,None] - opt_b[:,1,None]
       col3 = ALL_x[:,5,None] - opt_b[:,2,None]
       col4 = np.multiply(opt_a[:,2,None],np.multiply(np.cos(ALL_x[:,-1,None]),opt_a[:,0,None]) + np.multiply(np.sin(ALL_x[:,-1,None]),opt_a[:,1,None]))
       col5 = np.multiply(opt_a[:,2,None],-np.multiply(np.cos(ALL_x[:,-1,None]),opt_a[:,1,None]) + np.multiply(np.sin(ALL_x[:,-1,None]),opt_a[:,0,None]))
       col6 = np.multiply(opt_a[:,2,None],np.multiply(np.cos(opt_a[:,0,None]),np.cos(opt_a[:,1,None]))) - g
       col7 = opt_a[:,3,None]
       
       return np.concatenate((col1,col2,col3,col4,col5,col6,col7),axis=1);    
   
    ####################### RECURSIVE FUNC ####################

    def RK4(ALL_x,dtt,opt_a,opt_b): #Try Euler

        k1 = F(ALL_x,opt_a,opt_b);  #### !!!
        # ~~~~ Compute optimal input (k2)
        ALL_tmp = ALL_x + np.multiply(dtt/2.0,k1);
        ALL_tmp[:,-1] = p_corr(ALL_tmp[:,-1]);

        k2 = F(ALL_tmp,opt_a,opt_b);  #### !!!
        # ~~~~ Compute optimal input (k3)
        ALL_tmp = ALL_x + np.multiply(dtt/2.0,k2);
        ALL_tmp[:,-1] = p_corr(ALL_tmp[:,-1]);

        k3 = F(ALL_tmp,opt_a,opt_b);  #### !!!
        # ~~~~ Compute optimal input (k4)
        ALL_tmp = ALL_x + np.multiply(dtt,k3);
        ALL_tmp[:,-1] = p_corr(ALL_tmp[:,-1]);

        k4 = F(ALL_tmp,opt_a,opt_b);  #### !!!

        Snx = ALL_x + np.multiply((dtt/6.0),(k1 + 2.0*k2 + 2.0*k3 + k4)); #np.multiply(dtt,k1)
        ALL_tmp[:,-1] = p_corr(ALL_tmp[:,-1]);
        return Snx;

    perms = list(itertools.product([-1,1], repeat=num_ac))
    true_ac_list = [];
    for i in range(len(perms)): #2**num_actions
        ac_tuple = perms[i];
        ac_list = [(tmp1==1)*tmp3 +  (tmp1==-1)*tmp2 for tmp1,tmp2,tmp3 in zip(ac_tuple,min_list,max_list)]; 
        true_ac_list.append(ac_list);
            
    perms_ = list(itertools.product([-1,1], repeat=dist_ac))
    true_ac_list_ = [];
    for i in range(len(perms_)): #2**num_actions
        ac_tuple_ = perms_[i];
        ac_list_ = [(tmp1==1)*tmp3 +  (tmp1==-1)*tmp2 for tmp1,tmp2,tmp3 in zip(ac_tuple_,min_list_,max_list_)]; #ASSUMING: aMax = -aMin
        true_ac_list_.append(ac_list_);       
    
    def Hot_to_Cold(hots,ac_list):
        a = hots.argmax(axis=1);
        a = np.asarray([ac_list[i] for i in a]);
        return a;
    
    def getPI(ALL_x,F_PI=[], F_PI_=[], subSamples=1): #Things to keep in MIND: You want the returned value to be the minimum accross a trajectory.

        current_params = sess.run(C_func_vars);
        current_params_ = sess.run(D_func_vars);

        #perms = list(itertools.product([-1,1], repeat=num_ac))
        next_states_ = [];
        for k in range((len(perms))):
            next_states = [];
            opt_a = np.asarray(true_ac_list[k])*np.ones([ALL_x.shape[0],1]);
            for i in range(len(perms_)):
                opt_b = np.asarray(true_ac_list_[i])*np.ones([ALL_x.shape[0],1]);
                Snx = ALL_x;
                for _ in range(subSamples): 
                    Snx = RK4(Snx,dt/float(subSamples),opt_a,opt_b);
                next_states.append(Snx);
            next_states_.append(np.concatenate(next_states,axis=0));
        next_states_ = np.concatenate(next_states_,axis=0);
        values = V_0(next_states_[:,[0,1,2]]);
        
        
        for params,params_ in zip(F_PI,F_PI_):
            for ind in range(len(params)): #Reload pi*(x,t+dt) parameters
                sess.run(C_func_vars[ind].assign(params[ind]));
            for ind in range(len(params_)): #Reload pi*(x,t+dt) parameters
                sess.run(D_func_vars[ind].assign(params_[ind]));

            tmp = ConvCosSin(next_states_);
            hots = sess.run(Tt,{states:tmp});
            opt_a = Hot_to_Cold(hots,true_ac_list)   
            hots = sess.run(Tt_,{states_:tmp});
            opt_b = Hot_to_Cold(hots,true_ac_list_)            
            for _ in range(subSamples):
                next_states_ = RK4(next_states_,dt/float(subSamples),opt_a,opt_b);
                values = np.max((values,V_0(next_states_[:,[0,1,2]])),axis=0);
        
        values_ = values;#V_0(next_states_[:,[0,1,2]]);
        pre_compare_vals_ = values_.reshape([-1,ALL_x.shape[0]]).T;         #Changed to values instead of values_
        final_v = [];
        final_v_ = [];
        per = len(perms_);
        for k in range(len(perms)):
            final_v.append(np.argmax(pre_compare_vals_[:,k*per:(k+1)*per,None],axis=1))
            final_v_.append(np.max(pre_compare_vals_[:,k*per:(k+1)*per,None],axis=1))
        finalF = np.concatenate(final_v_,axis=1);
        index_best_a_ = np.argmin(finalF,axis=1);
        finalF_ = np.concatenate(final_v,axis=1);
        index_best_b_ = np.array([finalF_[k,index_best_a_[k]] for k in range(len(index_best_a_))]);
        
        for ind in range(len(current_params)): #Reload pi*(x,t+dt) parameters
            sess.run(C_func_vars[ind].assign(current_params[ind]));
        for ind in range(len(current_params_)): #Reload pi*(x,t+dt) parameters
            sess.run(D_func_vars[ind].assign(current_params_[ind]));
            
        return sess.run(make_hot,{hot_input:index_best_a_}),sess.run(make_hot_,{hot_input:index_best_b_})

#    def getTraj(ALL_x,F_PI=[],F_PI_=[],subSamples=1,StepsLeft=None,Noise = False):
#
#        current_params = sess.run(C_func_vars);
#        current_params_ = sess.run(D_func_vars);
#        
#        if(StepsLeft == None): StepsLeft = len(F_PI);        
#        
#        next_states_ = ALL_x;
#        traj = [next_states_];
#        actions = [];
#
#        for params,params_ in zip(F_PI,F_PI_):
#            for ind in range(len(params)): #Reload pi*(x,t+dt) parameters
#                sess.run(C_func_vars[ind].assign(params[ind]));
#            for ind in range(len(params_)): #Reload pi*(x,t+dt) parameters
#                sess.run(D_func_vars[ind].assign(params_[ind]));            
#
#            tmp = ConvCosSin(next_states_);
#            hots = sess.run(Tt,{states:tmp});
#            opt_a = Hot_to_Cold(hots,true_ac_list)   
#            hots_ = sess.run(Tt_,{states_:tmp});
#            opt_b = Hot_to_Cold(hots_,true_ac_list_)            
#            for _ in range(subSamples):
#                next_states_ = RK4(next_states_,dt/float(subSamples),opt_a,opt_b);
#                traj.append(next_states_); 
#                actions.append(hots.argmax(axis=1)[0]);
#
#        for ind in range(len(current_params)): #Reload pi*(x,t+dt) parameters
#            sess.run(C_func_vars[ind].assign(current_params[ind]));
#        for ind in range(len(current_params_)): #Reload pi*(x,t+dt) parameters
#            sess.run(D_func_vars[ind].assign(current_params_[ind]));
#                        
#        return traj,actions#,V_0(next_states[:,[0,2]]),actions; 

    def getTraj(ALL_x,F_PI=[],F_PI_=[],subSamples=1,StepsLeft=None,Noise=False, Static=False, justV=False, disturb = -1, steps = -1):

        current_params = sess.run(C_func_vars);
        current_params_ = sess.run(D_func_vars);
        
        if(StepsLeft == None): StepsLeft = len(F_PI);        
        
        next_states_ = ALL_x;
        traj = [next_states_];
        actions = [];
        
        values = V_0(next_states_[:,[0,1,2]]);
        
        if Static:
            if(steps < 0):
                disturb = input("Disturbance Policy = ")
                steps = input("How Many Steps? ")
            for ind in range(len(F_PI[len(F_PI)-StepsLeft])): #Reload pi*(x,t+dt) parameters
                sess.run(C_func_vars[ind].assign(F_PI[len(F_PI)-StepsLeft][ind]));
            for ind in range(len(F_PI_[len(F_PI_)-disturb])): #Reload pi*(x,t+dt) parameters
                sess.run(D_func_vars[ind].assign(F_PI_[len(F_PI_)-disturb][ind]));  
            for i in range(steps):                            
                for _ in range(subSamples):
                    tmp = ConvCosSin(next_states_);
                    hots = sess.run(Tt,{states:tmp});
                    opt_a = Hot_to_Cold(hots,true_ac_list)   
                    if Noise == False:
                        hots_ = sess.run(Tt_,{states_:tmp});
                        opt_b = Hot_to_Cold(hots_,true_ac_list_)
                    else:
                        hots_ = np.zeros((1,2**dist_ac));
                        hots_[0][np.random.randint(2**dist_ac)] = 1
                        opt_b = Hot_to_Cold(hots_,true_ac_list_)
                    
                    next_states_ = RK4(next_states_,dt/float(subSamples),opt_a,opt_b);
                    if not justV: 
                        traj.append(next_states_); 
                        actions.append(hots.argmax(axis=1)[0]);  
                    values = np.max((values,V_0(next_states_[:,[0,1,2]])),axis=0);
                    if i % 20 == 0:
                        print(i)
        else:      
            for params,params_ in zip(F_PI[len(F_PI)-StepsLeft:],F_PI_[len(F_PI_)-StepsLeft:]):           
                for ind in range(len(params)): #Reload pi*(x,t+dt) parameters
                    sess.run(C_func_vars[ind].assign(params[ind]));
                for ind in range(len(params_)): #Reload pi*(x,t+dt) parameters
                    sess.run(D_func_vars[ind].assign(params_[ind]));  
                
                tmp = ConvCosSin(next_states_);
                hots = sess.run(Tt,{states:tmp});
                opt_a = Hot_to_Cold(hots,true_ac_list)   
                if Noise == False:
                    hots_ = sess.run(Tt_,{states_:tmp});
                    opt_b = Hot_to_Cold(hots_,true_ac_list_)
                else:
                    hots_ = np.zeros((1,2**dist_ac));
                    hots_[0][np.random.randint(2**dist_ac)] = 1
                    opt_b = Hot_to_Cold(hots_,true_ac_list_)
                            
                for _ in range(subSamples):
                    next_states_ = RK4(next_states_,dt/float(subSamples),opt_a,opt_b);
                    traj.append(next_states_); 
                    actions.append(hots.argmax(axis=1)[0]);   

        for ind in range(len(current_params)): #Reload pi*(x,t+dt) parameters
            sess.run(C_func_vars[ind].assign(current_params[ind]));
        for ind in range(len(current_params_)): #Reload pi*(x,t+dt) parameters
            sess.run(D_func_vars[ind].assign(current_params_[ind]));

        print(str(next_states_))        
                
        return traj,actions,values

    def ConvCosSin(ALL_x):
        sin_psi = np.sin(ALL_x[:,[6]])
        cos_psi = np.cos(ALL_x[:,[6]])
        pos = ALL_x[:,[0,1,2]]/5.0;
        vel = ALL_x[:,[3,4,5]]/10.0;
        ret_val = np.concatenate((pos,vel,sin_psi,cos_psi),axis=1)
        return ret_val
    # *****************************************************************************
    #
    # ============================= MAIN LOOP ====================================
    #
    # *****************************************************************************
    t1 = time.time();
    t = 0.0;
    mse = np.inf;
    k=0; kk = 0; beta=3.0; batch_size = bts; tau = 1000.0; steps = teps;
    ALL_PI = [];
    ALL_PI_= [];
    nunu = lr_schedule.value(k);
    
    act_color = ['r','g','b','y'];
    if(imp == 1.0):
        ALL_PI,ALL_PI_ = pickle.load( open( "policies7D_P&Tcoupled_h60_h60_h60.pkl", "rb" ) );
        cc = 0;
        while True:
            state_get = input('State: ');
            sub_smpl = input('SUBSAMPLING: ');
            pause_len = input('Pause: ')
            s_left = input("How many steps left to go (max. "+str(len(ALL_PI))+")? -> ")
            noise = input("Noise? (0/1): ")
            stat = input("Static? (0/1): ")
            traj,act,value = getTraj(state_get,F_PI=ALL_PI,F_PI_=ALL_PI_,subSamples=sub_smpl,StepsLeft=s_left,Noise=noise,Static=stat);
            print(value)
            act.append(act[-1]);
            all_to = np.concatenate(traj);
            plt.scatter(all_to[:,[0]],all_to[:,[1]],color=act_color[cc % len(act_color)])
            plt.pause(pause_len);
            cc = cc + 1;
            #plt.colorbar()       
    elif(imp == 2.0):
        ALL_PI,ALL_PI_ = pickle.load( open( "policies7D_P&Tcoupled_h60_h60_h60.pkl", "rb" ) );
        cc = 0;
        dist_bound = input("Distance: ")
        state_get = np.random.uniform(-5.0,5.0,(nrolls,layers[0]-1));
        state_get[:,:3] = dist_bound*state_get[:,:3]/np.linalg.norm(state_get[:,:3],axis=1,keepdims=True)
        sub_smpl = input('SUBSAMPLING: ');
        s_left = input("How many steps left to go (max. "+str(len(ALL_PI))+")? -> ")
        noise = input("Noise? (0/1): ");
        stat = input("Static? (0/1): ");
        traj,act,values = getTraj(state_get,F_PI=ALL_PI,F_PI_=ALL_PI_,subSamples=sub_smpl,StepsLeft=s_left,Noise=noise,Static=stat,justV=True);
        values = values + 1.0
        print(values.shape)
        filt = (values < dist_bound).T[0];
        print(filt.shape)
        subset = state_get[filt]
        print(len(subset))
        plt.hist(values,bins=100)   
        plt.pause(10)
        tracking_error_bound = np.max(abs(subset[:,:3]),axis=0)
        print(tracking_error_bound)
        print(subset)
        save_dict = {}
        save_dict["weights"]=(ALL_PI,ALL_PI_)
        save_dict["c_layers"]=layers1
        save_dict["d_layers"]=layers_
        save_dict["control_bounds_upper"]= max_list
        save_dict["control_bounds_lower"]= min_list
        save_dict["tracking_error_bound"]= tracking_error_bound
        save_dict["planner_params"]={"max_speed":[0.5,0.5,0.5],"max_vel_dist":[0.0,0.0,0.0],"max_acc_dist":[0.0,0.0,0.0]}
        save_dict["normalization_args"] = [5.0,5.0,5.0,10.0,10.0,10.0,-1]
        pickle.dump(save_dict,open( "TESTpolicies7Dubins_PT_h100_h100.pkl", "wb" ));
    elif(imp == 3.0):
        ALL_PI,ALL_PI_ = pickle.load( open( "policies7D_P&Tcoupled_h100_h100.pkl", "rb" ) );
        fig = plt.figure()
        tmp = 1
        vals = []
        for i in range(1,len(ALL_PI),2):
            for j in range(1,len(ALL_PI_),2):
                state_get = np.array([[0.0,0.0,0.0,0.0,0.0,0.0,0.0]])
                sub_smpl = 2
                pause_len = 10
                s_left = i
                noise = 0
                stat = 1
                traj,act,v = getTraj(state_get,F_PI=ALL_PI,F_PI_=ALL_PI_,subSamples=sub_smpl,StepsLeft=s_left,Noise=noise,Static=stat,disturb=j,steps=1000);
                vals.append(v[0][0])
                all_to = np.concatenate(traj);
                ax = fig.add_subplot(len(ALL_PI)/2,len(ALL_PI_)/2,tmp)
                tmp = tmp + 1
                ax.scatter(all_to[:,[0]],all_to[:,[2]])
                plt.pause(1.0);
        vals = np.array(vals).reshape((10,10))
        pickle.dump(vals,open( "avore.pkl", "wb" ));
        plt.pause(1000.0)
        cc = cc + 1;      
    else:

        for i in xrange(iters):
            
            if(np.mod(i,renew) == 0 and i is not 0):       
                
                ALL_PI.insert(0,sess.run(C_func_vars));
                ALL_PI_.insert(0,sess.run(D_func_vars));        
                            
                
                k = 0;
                ALL_x = np.random.uniform(-5.0,5.0,(nrolls,layers[0]-1));
                ALL_x[:,[3,4,5]] = ALL_x[:,[3,4,5]]*2.0
                ALL_x[:,[6]] = np.mod(ALL_x[:,[6]]*np.pi/5.0,2.0*np.pi);  
                PI_c,PI_d = getPI(ALL_x,ALL_PI,ALL_PI_,subSamples=1);
                pre_ALL_x = ConvCosSin(ALL_x);
                
                ALL_x_ = np.random.uniform(-5.0,5.0,(nrolls/100,layers[0]-1));
                ALL_x_[:,[3,4,5]] = ALL_x_[:,[3,4,5]]*2.0
                ALL_x_[:,[6]] = np.mod(ALL_x_[:,[6]]*np.pi/5.0,2.0*np.pi); 
                PI_c_,PI_d_ = getPI(ALL_x_,ALL_PI,ALL_PI_,subSamples=1);
                pre_ALL_x_ = ConvCosSin(ALL_x_);
    
                t = t - dt; 
                print('Again.')
                
            #elif(i is 0):
            elif(np.mod(i,renew) == 0 and i is 0):
    
    #            sess.run(set_to_zero);
                t = time.time()
                ALL_x = np.random.uniform(-5.0,5.0,(nrolls,layers[0]-1));
                ALL_x[:,[3,4,5]] = ALL_x[:,[3,4,5]]*2.0
                ALL_x[:,[6]] = np.mod(ALL_x[:,[6]]*np.pi/5.0,2.0*np.pi);                
                PI_c,PI_d = getPI(ALL_x,F_PI=[],F_PI_=[],subSamples=1);
                pre_ALL_x = ConvCosSin(ALL_x);
                elapsed = time.time() - t
                print("Compute Data Time = "+str(elapsed))
                
                ALL_x_ = np.random.uniform(-5.0,5.0,(nrolls/100,layers[0]-1));
                ALL_x_[:,[3,4,5]] = ALL_x_[:,[3,4,5]]*2.0
                ALL_x_[:,[6]] = np.mod(ALL_x_[:,[6]]*np.pi/5.0,2.0*np.pi); 
                PI_c_,PI_d_ = getPI(ALL_x_,F_PI=[],F_PI_=[],subSamples=1);
                pre_ALL_x_ = ConvCosSin(ALL_x_);           
    #            sess.run(set_to_not_zero);
    
                
    
            # |||||||||||| ----  PRINT ----- |||||||||||| 
    
            if(np.mod(i,200) == 0):
    
                #xel = sess.run(L,{states:ALL_x,y:PI});
                #test_e = sess.run(L,{states:ALL_x_,y:PI_});
                train_acc = sess.run(accuracy,{states:pre_ALL_x,y:PI_c});
                test_acc = sess.run(accuracy,{states:pre_ALL_x_,y:PI_c_});
                train_acc_ = sess.run(accuracy_,{states_:pre_ALL_x,y_:PI_d});
                test_acc_ = sess.run(accuracy_,{states_:pre_ALL_x_,y_:PI_d_});             
                #o = np.random.randint(len(ALL_x));
                print str(i) + ") control | TR_ACC = " + str(train_acc) + " | TE_ACC = " + str(test_acc) + " | Learning Rate = " + str(nunu)
                print str(i) + ") disturb | TR_ACC = " + str(train_acc_) + " | TE_ACC = " + str(test_acc_) + " | Learning Rate = " + str(nunu)
                #print str(i) + ") | XEL = " + str(xel) + " | Test_E = " + str(test_e) + " | Lerning Rate = " + str(nunu)
                #print str(PI[[o],:]) + " || " + str(sess.run(l_r[-1],{states:ALL_x[[o],:]})) #+ " || " + str(sess.run(gvs[-1],{states:ALL_x,y:PI}))
                
            nunu = 0.001#/(np.sqrt(np.mod(i,renew))+1.0)#lr_schedule.value(i);
            #nunu = ler_r/(np.mod(i,renew)+1.0);
            tmp = np.random.randint(len(ALL_x), size=bts);
            sess.run(train_step, feed_dict={states:pre_ALL_x[tmp],y:PI_c[tmp],nu:nunu});
            sess.run(train_step_, feed_dict={states_:pre_ALL_x[tmp],y_:PI_d[tmp],nu:nunu});
            #tmp = np.random.randint(len(reach100s), size=bts);
            #sess.run(train_step, feed_dict={states:reach100s[tmp,:-1],y:reach100s[tmp,-1,None],nu:nunu});
    
        pickle.dump([ALL_PI,ALL_PI_],open( "policies7D_P&Tcoupled_h100_h100.pkl", "wb" ));
def main(layers, t_hor, ind, nrolls, bts, ler_r, mom, teps, renew, imp, q):
    # Worker entry point for the 8-state quadrotor pursuit/disturbance problem.
    # This first part sets up constants, evaluation grids and the TensorFlow
    # graph (two classifiers: "Control" and "Disturbance"); the nested helper
    # functions and the mode-dependent main loop follow further down.
    #
    # Arguments, as they are used inside this function:
    #   layers : list of layer sizes passed to TransDef; layers[0] - 2 is the
    #            width of the randomly sampled state arrays in the main loop.
    #   t_hor  : time horizon; together with dt and renew it fixes the number
    #            of training iterations (see `iters` below).
    #   ind    : worker index, only printed.
    #   nrolls : number of sampled training states (nrolls / 100 test states).
    #   bts    : mini-batch size.
    #   ler_r  : only appears in commented-out code in this function.
    #   mom    : momentum handed to the RMSProp optimizers.
    #   teps   : copied into `steps` in the main loop and not used afterwards.
    #   renew  : number of iterations between policy snapshots / resampling.
    #   imp    : mode switch (1.0 = interactive roll-outs, 2.0 = tracking-bound
    #            export, anything else = training).
    #   q      : not referenced anywhere in this function.

    # Constants
    #   The choices of n0, d1, d0 actually results in a very large
    #   steady state error in the pitch/roll; this seems to be
    #   expected according to Pat's report
    # NOTE: n0, d1, d0 and m are not referenced by the active code below
    # (d1 only appears in a commented-out variant inside F).
    n0 = 10  # Angular dynamics parameters
    d1 = 8
    d0 = 10

    kT = 1.0  #0.91   # Thrust coefficient (vertical direction)
    grav = 9.81  # Acceleration due to gravity (for convenience)
    m = 1.3  # Mass

    # Quad Params
    # Upper / lower bounds of the three control inputs; the action tables
    # built later enumerate every min/max combination of these.
    max_list = [1, 1, 2.0 * grav]
    min_list = [-1, -1, 0.0]

    # Upper / lower bounds of the three disturbance inputs.
    max_list_ = [0.5, 0.5, 0.5]
    min_list_ = [-0.5, -0.5, -0.5]

    # Duplicate of `grav`; F uses `grav`, this name is not referenced again.
    g = 9.81

    print 'Starting worker-' + str(ind)

    # Evaluation grids. None of the grid_* arrays is referenced again inside
    # this function; they look like leftovers from a lower-dimensional
    # variant of the script (TODO confirm before removing).
    f = 1
    Nx = 100 * f + 1
    minn = [-5.0, -10.0, -5.0, -10.0, 0.0, -10.0]
    maxx = [5.0, 10.0, 5.0, 10.0, 2 * np.pi, 10.0]

    X = np.linspace(minn[0], maxx[0], Nx)
    Y = np.linspace(minn[2], maxx[2], Nx)
    Z = np.linspace(minn[4], maxx[4], Nx)
    X_, Y_, Z_ = np.meshgrid(X, Y, Z)
    X, Y = np.meshgrid(X, Y)
    # Flatten every mesh into a single column so they can be concatenated.
    XX = np.reshape(X, [-1, 1])
    YY = np.reshape(Y, [-1, 1])
    XX_ = np.reshape(X_, [-1, 1])
    YY_ = np.reshape(Y_, [-1, 1])
    ZZ_ = np.reshape(Z_, [-1, 1])
    grid_check = np.concatenate((XX_, np.ones(
        XX_.shape), YY_, np.ones(XX_.shape), ZZ_, np.zeros(XX_.shape)),
                                axis=1)
    # 2-D (X, Y) slices at three fixed angles (0, 2pi/3, 4pi/3) and a fixed
    # fourth coordinate of 6.0 ...
    grid_eval = np.concatenate(
        (XX, YY, 0.0 * np.ones(XX.shape), 6.0 * np.ones(XX.shape)), axis=1)
    grid_eval_ = np.concatenate(
        (XX, YY,
         (2.0 / 3.0) * np.pi * np.ones(XX.shape), 6.0 * np.ones(XX.shape)),
        axis=1)
    grid_eval__ = np.concatenate(
        (XX, YY,
         (4.0 / 3.0) * np.pi * np.ones(XX.shape), 6.0 * np.ones(XX.shape)),
        axis=1)
    # ... and the same three slices with the fourth coordinate at 12.0.
    grid_evall = np.concatenate(
        (XX, YY, 0.0 * np.ones(XX.shape), 12.0 * np.ones(XX.shape)), axis=1)
    grid_evall_ = np.concatenate(
        (XX, YY,
         (2.0 / 3.0) * np.pi * np.ones(XX.shape), 12.0 * np.ones(XX.shape)),
        axis=1)
    grid_evall__ = np.concatenate(
        (XX, YY,
         (4.0 / 3.0) * np.pi * np.ones(XX.shape), 12.0 * np.ones(XX.shape)),
        axis=1)

    # Calculate number of parameters of the policy
    # (weights + biases of every dense layer); the count is only printed.
    nofparams = 0
    for i in xrange(len(layers) - 1):
        nofparams += layers[i] * layers[i + 1] + layers[i + 1]
    print 'Number of Params is: ' + str(nofparams)

    # Not referenced again in this function.
    H_length = t_hor
    sub_sys = []

    ##################### DEFINITIONS #####################
    #layers = [2 + 1,10,1];                                                    #VAR
    #ssize = layers[0] - 1;
    dt = 0.1  # integration / policy time step used by getPI and getTraj
    #VAR
    num_ac = 3  # number of control inputs -> 2**num_ac bang-bang actions
    # `renew` training iterations per time step of the horizon, plus one.
    iters = int(np.abs(t_hor) / dt) * renew + 1
    ##################### INSTANTIATIONS #################
    # TransDef is defined elsewhere in the file. From the usage below:
    # `states`/`y` are fed with inputs/labels, `Tt` is the network output that
    # gets arg-maxed, and `L` is the loss that is minimized. The remaining
    # return values are not used in this function.
    states, y, Tt, L, l_r, lb, reg, cross_entropy = TransDef(
        "Control", False, layers)
    states_, y_, Tt_, L_, l_r_, lb_, reg_, cross_entropy_ = TransDef(
        "Disturbance", False, layers)
    # Classification accuracy of the control net: fraction of rows whose
    # arg-max output equals the arg-max of the one-hot label.
    ola1 = tf.argmax(Tt, dimension=1)
    ola2 = tf.argmax(y, dimension=1)
    ola3 = tf.equal(ola1, ola2)
    accuracy = tf.reduce_mean(tf.cast(ola3, tf.float32))
    # Same accuracy measure for the disturbance net.
    ola1_ = tf.argmax(Tt_, dimension=1)
    ola2_ = tf.argmax(y_, dimension=1)
    ola3_ = tf.equal(ola1_, ola2_)
    accuracy_ = tf.reduce_mean(tf.cast(ola3_, tf.float32))
    #a_layers = layers;
    #a_layers[-1] = 2; #We have two actions
    #states_,y_,Tt_,l_r_,lb_,reg_ = TransDef("Actor",False,a_layers,depth,incl,center,outp=True);

    # Variable lists of both networks; these are what gets snapshotted into
    # ALL_PI / ALL_PI_ and reloaded by getPI / getTraj.
    C_func_vars = tf.get_collection(tf.GraphKeys.VARIABLES, scope='Control')
    D_func_vars = tf.get_collection(tf.GraphKeys.VARIABLES,
                                    scope='Disturbance')

    #var_grad = tf.gradients(Tt_,states_)[0]
    # Gradient of the control output w.r.t. its input; only consumed by the
    # commented-out loss variants below.
    var_grad_ = tf.gradients(Tt, states)[0]
    grad_x = tf.slice(var_grad_, [0, 0], [-1, layers[0] - 1])
    #theta = tf.trainable_variables();

    # Op that zeroes every Control variable (only run from commented-out
    # lines in the main loop).
    set_to_zero = []
    for var in sorted(C_func_vars, key=lambda v: v.name):
        set_to_zero.append(var.assign(tf.zeros(tf.shape(var))))
    set_to_zero = tf.group(*set_to_zero)

    # Op that re-initialises every Control variable uniformly in [-0.1, 0.1)
    # (also only run from commented-out lines in the main loop).
    set_to_not_zero = []
    for var in sorted(C_func_vars, key=lambda v: v.name):
        set_to_not_zero.append(
            var.assign(
                tf.random_uniform(tf.shape(var), minval=-0.1, maxval=0.1)))
    set_to_not_zero = tf.group(*set_to_not_zero)

    # DEFINE LOSS
    # The loss actually trained on is the `L` / `L_` returned by TransDef;
    # lmbda and beta only feed the commented-out alternatives.

    lmbda = 0.0
    #1.0**(-3.5);#0.01;
    beta = 0.00
    #L = tf.sqrt(tf.reduce_mean(tf.reduce_sum(tf.square(tf.sub(y,Tt)),1,keep_dims=True))) + beta*tf.reduce_mean(tf.reduce_max(tf.abs(grad_x),reduction_indices=1,keep_dims=True));
    #L = tf.reduce_mean(tf.mul(tf.exp(imp*t_vec),tf.abs(tf.sub(y,Tt)))) + lmbda*reg;
    #L = tf.reduce_mean(tf.abs(tf.sub(y,Tt))) + lmbda*reg;

    # DEFINE OPTIMIZER

    #nu = 5.01;
    #nunu = ler_r;#0.00005;
    # Learning rate is a scalar placeholder so it can be changed per step.
    nu = tf.placeholder(tf.float32, shape=[])  #VAR

    #lr_multiplier = ler_r
    # Step-indexed learning-rate schedule. The main loop queries it once for
    # the initial value and then overrides the rate with a constant.
    lr_schedule = PiecewiseSchedule([
        (0, 0.1),
        (10000, 0.01),
        (20000, 0.001),
        (30000, 0.0001),
    ],
                                    outside_value=0.0001)

    #optimizer = tf.train.GradientDescentOptimizer(nu)
    #optimizer
    #train_step = tf.train.MomentumOptimizer(learning_rate=nu,momentum=mom).minimize(L)
    #optimizer
    #train_step = tf.train.AdamOptimizer(learning_rate=nu).minimize(L);
    # One RMSProp optimizer per network, both driven by the same `nu`.
    train_step = tf.train.RMSPropOptimizer(learning_rate=nu,
                                           momentum=mom).minimize(L)
    train_step_ = tf.train.RMSPropOptimizer(learning_rate=nu,
                                            momentum=mom).minimize(L_)
    #optimizer = tf.train.RMSPropOptimizer(learning_rate=nu,momentum=mom);
    #gvs = optimizer.compute_gradients(L,theta);
    #capped_gvs = [(tf.clip_by_value(grad, -3., 3.), var) for grad, var in gvs];
    #train_step = optimizer.apply_gradients(gvs);
    #train_step = tf.train.AdagradOptimizer(learning_rate=nu,initial_accumulator_value=0.5).minimize(L);

    # Helper op: integer action indices -> one-hot rows of width 2**num_ac.
    # getPI uses it for the control and for the disturbance labels.
    hot_input = tf.placeholder(tf.int64, shape=(None))
    make_hot = tf.one_hot(hot_input, 2**num_ac, on_value=1, off_value=0)

    # INITIALIZE GRAPH
    sess = tf.Session()
    init = tf.initialize_all_variables()
    sess.run(init)

    def V_0(x):
        """Terminal value: Euclidean row norm of *x* minus 1.0.

        Args:
            x: 2-D array, one point per row.

        Returns:
            Column vector (N, 1); negative strictly inside the unit ball,
            zero on its surface, positive outside.
        """
        radius = np.linalg.norm(x, axis=1, keepdims=True)
        return radius - 1.0

    def p_corr(ALL_x):
        """Wrap angles (radians) into the half-open interval [0, 2*pi)."""
        full_turn = 2.0 * np.pi
        return np.mod(ALL_x, full_turn)

    def F(ALL_x, opt_a, opt_b):
        """Time derivative of the 8-column state for given inputs.

        State columns: [x, vx, th_x, y, vy, th_y, z, vz].

        Args:
            ALL_x: 2-D state array, one state per row.
            opt_a: 2-D control array; columns 0 and 1 are used directly as the
                two angle rates, column 2 enters the velocity equations.
            opt_b: 2-D disturbance array; its three columns are subtracted
                from the x, y and z position rates.

        Returns:
            2-D array with 8 columns holding d/dt of every state column, in
            state order.
        """
        thrust = opt_a[:, [2]]

        # x channel: position, velocity, angle (angle rate is the raw input;
        # no damping term is applied).
        x_dot = ALL_x[:, [1]] - opt_b[:, [0]]
        vx_dot = thrust * np.tan(ALL_x[:, [2]])
        thx_dot = opt_a[:, [0]]

        # y channel: same structure, with the opposite sign on the velocity.
        y_dot = ALL_x[:, [4]] - opt_b[:, [1]]
        vy_dot = -(thrust * np.tan(ALL_x[:, [5]]))
        thy_dot = opt_a[:, [1]]

        # z channel: position and velocity only.
        z_dot = ALL_x[:, [7]] - opt_b[:, [2]]
        vz_dot = kT * thrust - grav

        derivative = (x_dot, vx_dot, thx_dot, y_dot, vy_dot, thy_dot, z_dot,
                      vz_dot)
        return np.concatenate(derivative, axis=1)

    ####################### RECURSIVE FUNC ####################

    def RK4(ALL_x, dtt, opt_a, opt_b):  #Try Euler
        """Advance every state row by one classical Runge-Kutta (RK4) step.

        The inputs are held constant over the step. The two angle columns
        (2 and 5) are wrapped into [0, 2*pi) with p_corr for every
        intermediate stage and for the returned state.

        Fix: the final wrap used to be written to the scratch array of the
        last stage, which is discarded, so the returned angles were never
        wrapped. It is now applied to the returned state itself.

        Args:
            ALL_x: 2-D state array, one state per row (not modified).
            dtt: step length.
            opt_a: control input array passed through to F.
            opt_b: disturbance input array passed through to F.

        Returns:
            New 2-D array with the states after one step of length dtt.
        """
        angle_cols = [2, 5]

        def _stage_slope(weight, slope):
            # Slope of the dynamics at the probe point ALL_x + weight * slope.
            probe = ALL_x + np.multiply(weight, slope)
            probe[:, angle_cols] = p_corr(probe[:, angle_cols])
            return F(probe, opt_a, opt_b)

        k1 = F(ALL_x, opt_a, opt_b)
        k2 = _stage_slope(dtt / 2.0, k1)
        k3 = _stage_slope(dtt / 2.0, k2)
        k4 = _stage_slope(dtt, k3)

        Snx = ALL_x + np.multiply((dtt / 6.0), (k1 + 2.0 * k2 + 2.0 * k3 + k4))
        # Snx is a freshly allocated array, so wrapping in place is safe.
        Snx[:, angle_cols] = p_corr(Snx[:, angle_cols])
        return Snx

    # Bang-bang action tables. Every sign pattern in {-1, +1}^n selects, per
    # input channel, the upper bound (+1) or the lower bound (-1).
    perms = list(itertools.product([-1, 1], repeat=num_ac))
    true_ac_list = [
        [(sign == 1) * upper + (sign == -1) * lower
         for sign, lower, upper in zip(combo, min_list, max_list)]
        for combo in perms
    ]  # one row per control pattern, 2**num_ac rows in total

    # Same construction for the disturbance, using its own bounds.
    dist_ac = 3
    perms_ = list(itertools.product([-1, 1], repeat=dist_ac))
    #ASSUMING: aMax = -aMin
    true_ac_list_ = [
        [(sign == 1) * upper + (sign == -1) * lower
         for sign, lower, upper in zip(combo, min_list_, max_list_)]
        for combo in perms_
    ]  # one row per disturbance pattern, 2**dist_ac rows in total

    def Hot_to_Cold(hots, ac_list):
        """Translate score / one-hot rows into the actual input vectors.

        Args:
            hots: 2-D array; the arg-max of each row is the action index.
            ac_list: sequence of input vectors indexed by action index.

        Returns:
            Array with one looked-up input vector per row of *hots*.
        """
        chosen = hots.argmax(axis=1)
        return np.asarray([ac_list[idx] for idx in chosen])

    def getPI(
        ALL_x,
        F_PI=[],
        F_PI_=[],
        subSamples=1
    ):  #Things to keep in MIND: You want the returned value to be the minimum across a trajectory.
        # Compute one-hot training labels (control, disturbance) for every
        # state row of ALL_x.
        #
        # For each (control, disturbance) pair of bang-bang inputs the state is
        # integrated one step of length dt; the roll-out is then continued with
        # the stored policies in F_PI / F_PI_ (iterated in list order) and the
        # running maximum of V_0 over the positions is recorded. The labels are
        # the min-over-control / max-over-disturbance choice of that value.
        #
        # Args:
        #   ALL_x      : 2-D state array, one state per row.
        #   F_PI/F_PI_ : lists of stored Control / Disturbance weight lists.
        #                The default lists are never mutated here.
        #   subSamples : number of RK4 sub-steps per dt.
        # Returns:
        #   (control one-hot labels, disturbance one-hot labels), both produced
        #   by the `make_hot` op (width 2**num_ac).
        # Side effect: temporarily overwrites the live network weights; they
        # are restored before returning (not if an exception is raised).

        # Remember the weights currently being trained.
        current_params = sess.run(C_func_vars)
        current_params_ = sess.run(D_func_vars)

        #perms = list(itertools.product([-1,1], repeat=num_ac))
        # First step: try every control k against every disturbance i.
        # Resulting row blocks are ordered control-major:
        # (k=0,i=0), (k=0,i=1), ..., (k=1,i=0), ... each block has N rows.
        next_states_ = []
        for k in range((len(perms))):
            next_states = []
            opt_a = np.asarray(true_ac_list[k]) * np.ones([ALL_x.shape[0], 1])
            for i in range(len(perms_)):
                opt_b = np.asarray(true_ac_list_[i]) * np.ones(
                    [ALL_x.shape[0], 1])
                Snx = ALL_x
                for _ in range(subSamples):
                    Snx = RK4(Snx, dt / float(subSamples), opt_a, opt_b)
                next_states.append(Snx)
            next_states_.append(np.concatenate(next_states, axis=0))
        next_states_ = np.concatenate(next_states_, axis=0)
        # Value after the first step, measured on the position columns.
        values = V_0(next_states_[:, [0, 3, 6]])

        # Continue every candidate with the previously learned policies and
        # keep the largest value seen along the way.
        for params, params_ in zip(F_PI, F_PI_):
            for ind in range(len(params)):  #Reload pi*(x,t+dt) parameters
                sess.run(C_func_vars[ind].assign(params[ind]))
            for ind in range(len(params_)):  #Reload pi*(x,t+dt) parameters
                sess.run(D_func_vars[ind].assign(params_[ind]))

            tmp = ConvCosSin(next_states_)
            hots = sess.run(Tt, {states: tmp})
            opt_a = Hot_to_Cold(hots, true_ac_list)
            hots = sess.run(Tt_, {states_: tmp})
            opt_b = Hot_to_Cold(hots, true_ac_list_)
            for _ in range(subSamples):
                next_states_ = RK4(next_states_, dt / float(subSamples), opt_a,
                                   opt_b)
                values = np.max((values, V_0(next_states_[:, [0, 3, 6]])),
                                axis=0)

        values_ = values
        #V_0(next_states_[:,[0,1,2]]);
        # Reshape to one row per input state; column index is
        # control_index * len(perms_) + disturbance_index.
        pre_compare_vals_ = values_.reshape([-1, ALL_x.shape[0]]).T
        #Changed to values instead of values_
        final_v = []
        final_v_ = []
        # NOTE(review): the group width should be len(perms_) and the loop
        # should run over len(perms); the code below has them swapped, which
        # is only equivalent because num_ac == dist_ac here - confirm before
        # changing either.
        per = len(perms)
        for k in range(len(perms_)):
            # Within the group of control k: index of the disturbance with the
            # largest value ...
            final_v.append(
                np.argmax(pre_compare_vals_[:, k * per:(k + 1) * per, None],
                          axis=1))
            # ... and that largest value itself.
            final_v_.append(
                np.max(pre_compare_vals_[:, k * per:(k + 1) * per, None],
                       axis=1))
        finalF = np.concatenate(final_v_, axis=1)
        # Control that minimises the worst-case value.
        index_best_a_ = np.argmin(finalF, axis=1)
        finalF_ = np.concatenate(final_v, axis=1)
        # Disturbance that attains the worst case against that control.
        index_best_b_ = np.array(
            [finalF_[k, index_best_a_[k]] for k in range(len(index_best_a_))])

        # Put the weights that were being trained back into the networks.
        for ind in range(len(current_params)):  #Reload pi*(x,t+dt) parameters
            sess.run(C_func_vars[ind].assign(current_params[ind]))
        for ind in range(len(current_params_)):  #Reload pi*(x,t+dt) parameters
            sess.run(D_func_vars[ind].assign(current_params_[ind]))

        return sess.run(make_hot, {hot_input: index_best_a_}), sess.run(
            make_hot, {hot_input: index_best_b_})

    def getTraj(ALL_x,
                F_PI=None,
                F_PI_=None,
                subSamples=1,
                StepsLeft=None,
                Noise=False,
                Static=False,
                justV=False):
        """Roll the closed-loop system forward from ALL_x.

        Two modes:

        * Static: the policy pair stored at index ``len(F_PI) - StepsLeft``
          is loaded once and applied for a number of steps typed at the
          prompt; the inputs are re-evaluated before every sub-step and
          ``values`` accumulates the running maximum of V_0 over the
          positions.
        * otherwise: the last ``StepsLeft`` stored policy pairs are applied
          in list order, each for one dt (split into ``subSamples``
          sub-steps with the inputs held constant). In this mode ``values``
          stays at its initial value and ``justV`` is ignored, exactly as
          before.

        Changes relative to the previous version:

        * V_0 is evaluated on the position columns [0, 3, 6] (state layout
          [x, vx, th_x, y, vy, th_y, z, vz], as used by F, getPI and
          ConvCosSin) instead of columns [0, 1, 2].
        * The live network weights are restored even if the roll-out raises.
        * ``StepsLeft`` is compared with ``is None``; the list defaults are
          no longer shared mutable objects; the step count read from the
          prompt is converted with ``int``.

        Args:
            ALL_x: 2-D state array, one state per row.
            F_PI, F_PI_: stored Control / Disturbance weight lists
                (default: empty).
            subSamples: RK4 sub-steps per dt.
            StepsLeft: how many stored policies (counted from the end of the
                lists) are still to be applied; defaults to all of them.
            Noise: if equal to False/0 the disturbance network chooses the
                disturbance, otherwise one random disturbance pattern is
                drawn and applied to all rows.
            Static: selects the mode described above.
            justV: in Static mode, skip recording trajectory and actions.

        Returns:
            (traj, actions, values): list of visited state arrays (starting
            with ALL_x), list of the control action index chosen for the
            first row at each recorded sub-step, and the (N, 1) value array.
        """
        F_PI = [] if F_PI is None else F_PI
        F_PI_ = [] if F_PI_ is None else F_PI_
        if StepsLeft is None:
            StepsLeft = len(F_PI)

        pos_cols = [0, 3, 6]

        def _load(variables, weights):
            # Copy a stored list of weight arrays into the live variables.
            for var, value in zip(variables, weights):
                sess.run(var.assign(value))

        def _choose_inputs(state):
            # Query both networks (or draw a random disturbance) for `state`.
            feats = ConvCosSin(state)
            hots = sess.run(Tt, {states: feats})
            opt_a = Hot_to_Cold(hots, true_ac_list)
            # Equality rather than truthiness keeps the original handling of
            # the 0/1 flags typed at the prompt.
            if Noise == False:
                hots_ = sess.run(Tt_, {states_: feats})
            else:
                hots_ = np.zeros((1, 2**dist_ac))
                hots_[0][np.random.randint(2**dist_ac)] = 1
            opt_b = Hot_to_Cold(hots_, true_ac_list_)
            return hots, opt_a, opt_b

        # Remember the weights currently held by the networks.
        current_params = sess.run(C_func_vars)
        current_params_ = sess.run(D_func_vars)

        next_states_ = ALL_x
        traj = [next_states_]
        actions = []
        values = V_0(next_states_[:, pos_cols])

        try:
            if Static:
                steps = int(input("How Many Steps? "))
                _load(C_func_vars, F_PI[len(F_PI) - StepsLeft])
                _load(D_func_vars, F_PI_[len(F_PI_) - StepsLeft])
                for i in range(steps):
                    for _ in range(subSamples):
                        hots, opt_a, opt_b = _choose_inputs(next_states_)
                        next_states_ = RK4(next_states_,
                                           dt / float(subSamples), opt_a,
                                           opt_b)
                        if not justV:
                            traj.append(next_states_)
                            actions.append(hots.argmax(axis=1)[0])
                        values = np.max(
                            (values, V_0(next_states_[:, pos_cols])), axis=0)
                        if i % 20 == 0:
                            print(i)
            else:
                remaining = zip(F_PI[len(F_PI) - StepsLeft:],
                                F_PI_[len(F_PI_) - StepsLeft:])
                for params, params_ in remaining:
                    _load(C_func_vars, params)
                    _load(D_func_vars, params_)
                    hots, opt_a, opt_b = _choose_inputs(next_states_)
                    for _ in range(subSamples):
                        next_states_ = RK4(next_states_,
                                           dt / float(subSamples), opt_a,
                                           opt_b)
                        traj.append(next_states_)
                        actions.append(hots.argmax(axis=1)[0])
        finally:
            # Always hand the networks back in the state we found them.
            _load(C_func_vars, current_params)
            _load(D_func_vars, current_params_)

        print(str(next_states_))

        return traj, actions, values

    def ConvCosSin(ALL_x):
        """Turn raw 8-column states into the 10-column network input.

        Output column order: columns 0, 3, 6 of the state, then columns
        1, 4, 7, then the sines of columns 2 and 5, then their cosines.

        Args:
            ALL_x: 2-D array with at least 8 columns, one state per row.

        Returns:
            2-D array with 10 columns.
        """
        angles = ALL_x[:, [2, 5]]
        features = (
            ALL_x[:, [0, 3, 6]],
            ALL_x[:, [1, 4, 7]],
            np.sin(angles),
            np.cos(angles),
        )
        return np.concatenate(features, axis=1)

    # *****************************************************************************
    #
    # ============================= MAIN LOOP ====================================
    #
    # *****************************************************************************
    # Mode-dependent main part of main():
    #   imp == 1.0 : interactive roll-out viewer for a stored policy file
    #   imp == 2.0 : estimate a tracking-error bound and export a policy bundle
    #   otherwise  : train both classifiers and store the policy snapshots
    #
    # Fixes relative to the previous version:
    #   * the paste-corrupted `pickle.loadsess.run(set_to_not_zero)` statement
    #     and the checkpoint file name with a log fragment pasted into it are
    #     repaired (both modes now read "policies6D_P&Tc_h40_h40.pkl");
    #   * the undefined name `layers1` is replaced by `layers`;
    #   * imp == 2.0 samples states of the width the dynamics expect
    #     (layers[0] - 2) and treats columns [0, 3, 6] as the positions;
    #   * int identity tests (`i is 0`) become value comparisons, pickle files
    #     are closed via `with`, and `nrolls // 100` keeps the test-set size
    #     an integer.
    # Locals that were assigned but never read were dropped.

    # Policy snapshots, newest first.
    ALL_PI = []
    ALL_PI_ = []
    nunu = lr_schedule.value(0)

    # State layout used by F: [x, vx, th_x, y, vy, th_y, z, vz].
    state_dim = layers[0] - 2
    pos_cols = [0, 3, 6]

    act_color = ['r', 'g', 'b', 'y']
    if imp == 1.0:
        # NOTE: the prompts rely on Python 2 `input()`, which evaluates the
        # typed expression (the state has to be entered as a 2-D array).
        with open("policies6D_P&Tc_h40_h40.pkl", "rb") as fh:
            ALL_PI, ALL_PI_ = pickle.load(fh)
        cc = 0
        while True:
            state_get = input('State: ')
            sub_smpl = input('SUBSAMPLING: ')
            pause_len = input('Pause: ')
            s_left = input("How many steps left to go (max. " +
                           str(len(ALL_PI)) + ")? -> ")
            noise = input("Noise? (0/1): ")
            stat = input("Static? (0/1): ")
            traj, act, _ = getTraj(state_get,
                                   F_PI=ALL_PI,
                                   F_PI_=ALL_PI_,
                                   subSamples=sub_smpl,
                                   StepsLeft=s_left,
                                   Noise=noise,
                                   Static=stat)
            act.append(act[-1])
            all_to = np.concatenate(traj)
            # NOTE(review): columns 0 and 2 are x and th_x in this state
            # layout; confirm whether x vs. y (columns 0 and 3) was intended.
            plt.scatter(all_to[:, [0]],
                        all_to[:, [2]],
                        color=act_color[cc % len(act_color)])
            plt.pause(pause_len)
            cc = cc + 1
    elif imp == 2.0:
        with open("policies6D_P&Tc_h40_h40.pkl", "rb") as fh:
            ALL_PI, ALL_PI_ = pickle.load(fh)
        dist_bound = input("Distance: ")
        # Random states whose positions are projected onto the sphere of
        # radius dist_bound.
        state_get = np.random.uniform(-5.0, 5.0, (nrolls, state_dim))
        start_pos = state_get[:, pos_cols]
        state_get[:, pos_cols] = dist_bound * start_pos / np.linalg.norm(
            start_pos, axis=1, keepdims=True)
        sub_smpl = input('SUBSAMPLING: ')
        s_left = input("How many steps left to go (max. " + str(len(ALL_PI)) +
                       ")? -> ")
        noise = input("Noise? (0/1): ")
        stat = input("Static? (0/1): ")
        traj, act, values = getTraj(state_get,
                                    F_PI=ALL_PI,
                                    F_PI_=ALL_PI_,
                                    subSamples=sub_smpl,
                                    StepsLeft=s_left,
                                    Noise=noise,
                                    Static=stat,
                                    justV=True)
        # V_0 is "norm - 1", so adding 1.0 turns the values back into norms.
        values = values + 1.0
        print(values.shape)
        # Keep the start states whose value stayed below the start radius.
        filt = (values < dist_bound).T[0]
        print(filt.shape)
        subset = state_get[filt]
        print(len(subset))
        plt.hist(values, bins=100)
        plt.pause(10)
        tracking_error_bound = np.max(abs(subset[:, pos_cols]), axis=0)
        print(tracking_error_bound)
        print(subset)
        save_dict = {}
        save_dict["weights"] = (ALL_PI, ALL_PI_)
        save_dict["layers"] = layers
        save_dict["control_bounds_upper"] = max_list
        save_dict["control_bounds_lower"] = min_list
        save_dict["tracking_error_bound"] = tracking_error_bound
        save_dict["planner_params"] = {
            "max_speed": [0.5, 0.5, 0.5],
            "max_vel_dist": [0.0, 0.0, 0.0],
            "max_acc_dist": [0.0, 0.0, 0.0]
        }
        with open("policies6D_PT_h40_h40.pkl", "wb") as fh:
            pickle.dump(save_dict, fh)
    else:

        def _sample_dataset(n_rows):
            # Draw n_rows random states, label them with getPI against the
            # snapshots collected so far, and build the network inputs.
            raw = np.random.uniform(-5.0, 5.0, (n_rows, state_dim))
            # Angles: shrink to (-pi/4, pi/4) and wrap into [0, 2*pi).
            raw[:, [2, 5]] = np.mod(raw[:, [2, 5]] * np.pi / 20.0,
                                    2.0 * np.pi)
            labels_c, labels_d = getPI(raw, ALL_PI, ALL_PI_, subSamples=1)
            return raw, ConvCosSin(raw), labels_c, labels_d

        for i in xrange(iters):

            if np.mod(i, renew) == 0:
                if i != 0:
                    # A time step is finished: store the trained policies in
                    # front of the older ones before relabelling.
                    ALL_PI.insert(0, sess.run(C_func_vars))
                    ALL_PI_.insert(0, sess.run(D_func_vars))

                tic = time.time()
                ALL_x, pre_ALL_x, PI_c, PI_d = _sample_dataset(nrolls)
                print("Compute Data Time = " + str(time.time() - tic))

                # Held-out set, 1% of the training-set size.
                ALL_x_, pre_ALL_x_, PI_c_, PI_d_ = _sample_dataset(
                    nrolls // 100)

                if i != 0:
                    print('Again.')

            # |||||||||||| ----  PRINT ----- ||||||||||||

            if np.mod(i, 200) == 0:
                train_acc = sess.run(accuracy, {states: pre_ALL_x, y: PI_c})
                test_acc = sess.run(accuracy, {states: pre_ALL_x_, y: PI_c_})
                train_acc_ = sess.run(accuracy_, {
                    states_: pre_ALL_x,
                    y_: PI_d
                })
                test_acc_ = sess.run(accuracy_, {
                    states_: pre_ALL_x_,
                    y_: PI_d_
                })
                print(str(i) + ") control | TR_ACC = " + str(train_acc) +
                      " | TE_ACC = " + str(test_acc) + " | Learning Rate = " +
                      str(nunu))
                print(str(i) + ") disturb | TR_ACC = " + str(train_acc_) +
                      " | TE_ACC = " + str(test_acc_) +
                      " | Learning Rate = " + str(nunu))

            # Constant learning rate; the schedule is only used for the very
            # first log line.
            nunu = 0.001
            batch = np.random.randint(len(ALL_x), size=bts)
            sess.run(train_step,
                     feed_dict={
                         states: pre_ALL_x[batch],
                         y: PI_c[batch],
                         nu: nunu
                     })
            sess.run(train_step_,
                     feed_dict={
                         states_: pre_ALL_x[batch],
                         y_: PI_d[batch],
                         nu: nunu
                     })

        with open("policies10D_P&Tc_h170_h170.pkl", "wb") as fh:
            pickle.dump([ALL_PI, ALL_PI_], fh)
Example #15
0
def main(layers, t_hor, ind, nrolls, bts, ler_r, mom, teps, renew, imp, q):
    """Train a sequence of bang-bang policies for a 12-state quadrotor model.

    Every `renew` iterations the current network weights are snapshotted and
    pushed to the front of ``ALL_PI``; a fresh batch of random states is then
    labelled (via ``getPI``) with the action whose roll-out through the stored
    policies ends with the smallest ``V_0`` value.  The classifier network is
    trained on those labels with RMSProp.  On completion the list of weight
    snapshots is pickled to ``policies6Dreach_h50.pkl``.

    Args:
        layers: Layer sizes passed to ``TransDef``.  ``layers[0] - 3`` is used
            as the raw state dimension (``ConvCosSin`` turns 12 raw states
            into 15 network inputs).
        t_hor: Time horizon; with ``dt`` and ``renew`` it fixes the number of
            training iterations.
        ind: Worker index (only printed).
        nrolls: Number of training states sampled per renewal; the test set
            uses ``nrolls // 100`` states.
        bts: Mini-batch size.
        ler_r: Unused in this function.
        mom: Momentum passed to the RMSProp optimizer.
        teps: Unused in this function.
        renew: Number of iterations between data/policy renewals.
        imp: Unused in this function.
        q: Unused in this function.
    """
    # Quad params: per-input lower/upper bounds (bang-bang control).
    u1_max = 0.17
    u1_min = 0
    u2_max = 0.017
    u2_min = 0
    u3_max = 0.017
    u3_min = 0
    u4_max = 0.017
    u4_min = 0
    max_list = [u1_max, u2_max, u3_max, u4_max]
    min_list = [u1_min, u2_min, u3_min, u4_min]

    # Moments of inertia, mass, arm length and gravity used by F().
    Ix, Iy, Iz = 0.0224, 0.0224, 0.0436
    m = 0.65
    L_ = 0.156
    g = 9.8

    print('Starting worker-' + str(ind))

    # Calculate number of parameters of the policy (weights + biases).
    nofparams = sum(n_in * n_out + n_out
                    for n_in, n_out in zip(layers[:-1], layers[1:]))
    print('Number of Params is: ' + str(nofparams))

    center = np.array([[0.0, 0.0, 0.0, 0.0, 0.0, 0.0]])
    depth = 2.0
    incl = 1.0

    ##################### DEFINITIONS #####################
    dt = 0.05
    num_ac = 4
    iters = int(np.abs(t_hor) / dt) * renew + 1

    ##################### INSTANTIATIONS #################
    states, y, Tt, L, l_r, lb, reg, cross_entropy = TransDef(
        "Critic", False, layers, depth, incl, center)
    predicted_class = tf.argmax(Tt, dimension=1)
    target_class = tf.argmax(y, dimension=1)
    accuracy = tf.reduce_mean(
        tf.cast(tf.equal(predicted_class, target_class), tf.float32))

    V_func_vars = tf.get_collection(tf.GraphKeys.VARIABLES, scope='Critic')

    # The ops below are not run anywhere in this function; they are retained
    # so that the constructed graph is the same as before.
    var_grad_ = tf.gradients(Tt, states)[0]
    grad_x = tf.slice(var_grad_, [0, 0], [-1, layers[0] - 1])

    set_to_zero = tf.group(*[
        var.assign(tf.zeros(tf.shape(var)))
        for var in sorted(V_func_vars, key=lambda v: v.name)
    ])
    set_to_not_zero = tf.group(*[
        var.assign(tf.random_uniform(tf.shape(var), minval=-0.1, maxval=0.1))
        for var in sorted(V_func_vars, key=lambda v: v.name)
    ])

    # DEFINE OPTIMIZER
    nu = tf.placeholder(tf.float32, shape=[])  # learning rate, fed per step

    lr_schedule = PiecewiseSchedule([
        (0, 0.1),
        (10000, 0.01),
        (20000, 0.001),
        (30000, 0.0001),
    ],
                                    outside_value=0.0001)

    train_step = tf.train.RMSPropOptimizer(learning_rate=nu,
                                           momentum=mom).minimize(L)

    hot_input = tf.placeholder(tf.int64, shape=(None))
    make_hot = tf.one_hot(hot_input, 2**num_ac, on_value=1, off_value=0)

    # INITIALIZE GRAPH
    theta = tf.trainable_variables()

    # Build the weight-loading ops ONCE.  Calling `var.assign(value)` inside
    # getPI would add new ops to the graph on every call, so the graph (and
    # memory use) would grow for the whole run.
    theta_inputs = [
        tf.placeholder(var.dtype.base_dtype, shape=var.get_shape())
        for var in theta
    ]
    theta_assign = tf.group(
        *[var.assign(ph) for var, ph in zip(theta, theta_inputs)])

    sess = tf.Session()
    init = tf.initialize_all_variables()
    sess.run(init)

    def load_params(params):
        """Overwrite the trainable variables with the arrays in `params`."""
        sess.run(theta_assign, feed_dict=dict(zip(theta_inputs, params)))

    def V_0(x):
        """Terminal value: infinity-norm of each row minus 1, as a column."""
        return np.linalg.norm(x, ord=np.inf, axis=1, keepdims=True) - 1.0

    def p_corr(ALL_x):
        """Wrap angles into [0, 2*pi)."""
        return np.mod(ALL_x, 2.0 * np.pi)

    def F(ALL_x, opt_a, opt_b):
        """Return the state derivative for every row of `ALL_x`.

        Column layout of `ALL_x` (0-based): 0-2 position, 3-5 velocity,
        6-8 Euler angles (phi, theta, psi), 9-11 angular rates.
        `opt_a` holds the four inputs u_1..u_4 per row; `opt_b` is unused.

        Implemented equations (x_i is 1-based, as in the model notes):
            d x_1  = x_4
            d x_2  = x_5
            d x_3  = x_6
            d x_4  = -(cos x_7 sin x_8 cos x_9 + sin x_7 sin x_9) u_1/m
            d x_5  = -(cos x_7 sin x_8 sin x_9 - sin x_7 cos x_9) u_1/m
            d x_6  = g - (cos x_7 cos x_8) u_1/m
            d x_7  = x_10 + sin x_7 tan(x_8) x_11 + cos x_7 tan(x_8) x_12
            d x_8  = cos x_7 x_11 - sin x_7 x_12
            d x_9  = (sin x_7/cos x_8) x_11 + (cos x_7/cos x_8) x_12
            d x_10 = x_11 x_12 (I_y - I_z)/I_x + L/I_x u_2
            d x_11 = x_10 x_12 (I_z - I_x)/I_y + L/I_y u_3
            d x_12 = x_10 x_11 (I_x - I_y)/I_z + L/I_z u_4
        """
        cos_phi = np.cos(ALL_x[:, 6, None])
        sin_phi = np.sin(ALL_x[:, 6, None])
        cos_the = np.cos(ALL_x[:, 7, None])
        sin_the = np.sin(ALL_x[:, 7, None])
        tan_the = np.tan(ALL_x[:, 7, None])
        cos_psi = np.cos(ALL_x[:, 8, None])
        sin_psi = np.sin(ALL_x[:, 8, None])

        w_phi = ALL_x[:, 9, None]
        w_the = ALL_x[:, 10, None]
        w_psi = ALL_x[:, 11, None]
        thrust = opt_a[:, 0, None]

        col1 = ALL_x[:, 3, None]
        col2 = ALL_x[:, 4, None]
        col3 = ALL_x[:, 5, None]

        col4 = -(cos_phi * sin_the * cos_psi +
                 sin_phi * sin_psi) * thrust / m
        # Fixed: the first term uses sin(psi); the old code used cos(psi)
        # twice, which contradicted the documented equation for x_5.
        col5 = -(cos_phi * sin_the * sin_psi -
                 sin_phi * cos_psi) * thrust / m
        col6 = g - (cos_phi * cos_the) * thrust / m

        col7 = w_phi + sin_phi * tan_the * w_the + cos_phi * tan_the * w_psi
        col8 = cos_phi * w_the - sin_phi * w_psi
        # Fixed: both terms are divided by cos(theta); the old code divided
        # the first term by cos(phi).
        col9 = (sin_phi / cos_the) * w_the + (cos_phi / cos_the) * w_psi

        col10 = w_the * w_psi * (
            (Iy - Iz) / Ix) + (L_ / Ix) * opt_a[:, 1, None]
        col11 = w_phi * w_psi * (
            (Iz - Ix) / Iy) + (L_ / Iy) * opt_a[:, 2, None]
        # NOTE(review): the original model notes give 1/I_z (no arm length)
        # for the u_4 term; the L_/Iz factor is kept as it was - confirm.
        col12 = w_phi * w_the * (
            (Ix - Iy) / Iz) + (L_ / Iz) * opt_a[:, 3, None]

        return np.concatenate((col1, col2, col3, col4, col5, col6, col7, col8,
                               col9, col10, col11, col12),
                              axis=1)

    ####################### RECURSIVE FUNC ####################

    def RK4(ALL_x, dtt, opt_a, opt_b):
        """Advance `ALL_x` by one RK4 step of size `dtt` under input `opt_a`.

        The angle columns (6, 7, 8) are wrapped with `p_corr` at every
        intermediate stage and on the result.
        """
        angle_cols = [6, 7, 8]

        def stage(slope, step):
            # Intermediate state for the next slope evaluation.
            tmp_x = ALL_x + np.multiply(step, slope)
            tmp_x[:, angle_cols] = p_corr(tmp_x[:, angle_cols])
            return tmp_x

        k1 = F(ALL_x, opt_a, opt_b)
        k2 = F(stage(k1, dtt / 2.0), opt_a, opt_b)
        k3 = F(stage(k2, dtt / 2.0), opt_a, opt_b)
        k4 = F(stage(k3, dtt), opt_a, opt_b)

        Snx = ALL_x + np.multiply((dtt / 6.0), (k1 + 2.0 * k2 + 2.0 * k3 + k4))
        Snx[:, angle_cols] = p_corr(Snx[:, angle_cols])
        return Snx

    # All 2**num_ac combinations of (min, max) per input channel; the index
    # into this list is the class label predicted by the network.
    perms = list(itertools.product([-1, 1], repeat=num_ac))
    true_ac_list = [[
        float(upper if sign == 1 else lower)
        for sign, lower, upper in zip(combo, min_list, max_list)
    ] for combo in perms]

    def Hot_to_Cold(hots, ac_list):
        """Map each row of class scores to the action of its argmax."""
        best = hots.argmax(axis=1)
        return np.asarray([ac_list[idx] for idx in best])

    def ConvCosSin(ALL_x):
        """Convert raw states to network inputs.

        Positions, velocities and angular rates are scaled by 1/5, 1/10 and
        1/30; each angle is replaced by its (cos, sin) pair.
        """
        cos_phi = np.cos(ALL_x[:, 6, None])
        sin_phi = np.sin(ALL_x[:, 6, None])
        cos_the = np.cos(ALL_x[:, 7, None])
        sin_the = np.sin(ALL_x[:, 7, None])
        cos_psi = np.cos(ALL_x[:, 8, None])
        sin_psi = np.sin(ALL_x[:, 8, None])
        pos = ALL_x[:, [0, 1, 2]] / 5.0
        vel = ALL_x[:, [3, 4, 5]] / 10.0
        arate = ALL_x[:, [9, 10, 11]] / 30.0
        return np.concatenate((pos, vel, arate, cos_phi, sin_phi, cos_the,
                               sin_the, cos_psi, sin_psi),
                              axis=1)

    def getPI(ALL_x, F_PI=None, subSamples=1):
        """Label each state with the first action that gives the best outcome.

        Every candidate action is applied for one `dt`; the resulting states
        are then rolled forward through the stored policies in `F_PI` (in
        list order).  For each start state the action whose roll-out ends
        with the smallest `V_0` of the final position is selected.

        Args:
            ALL_x: Array of raw start states, one per row.
            F_PI: List of weight snapshots to roll through (default: none).
            subSamples: Number of RK4 sub-steps per `dt`.

        Returns:
            (one-hot labels of the selected action, the selected terminal
            values as a column, 0).
        """
        if F_PI is None:
            F_PI = []

        current_params = sess.run(theta)
        n_states = ALL_x.shape[0]
        sub_dt = dt / float(subSamples)

        try:
            # One first step per candidate action, stacked action-major.
            next_states = []
            for action in true_ac_list:
                opt_a = np.asarray(action) * np.ones([n_states, 1])
                Snx = ALL_x
                for _ in range(subSamples):
                    Snx = RK4(Snx, sub_dt, opt_a, None)
                next_states.append(Snx)
            next_states = np.concatenate(next_states, axis=0)

            # NOTE(review): `values` tracks the minimum of V_0 along the
            # trajectory but is never returned; the labels below use only
            # the terminal value.  Confirm which one is intended.
            values = V_0(next_states[:, [0, 1, 2]])

            for params in F_PI:
                load_params(params)  # Reload pi*(x, t+dt) parameters
                hots = sess.run(Tt, {states: ConvCosSin(next_states)})
                opt_a = Hot_to_Cold(hots, true_ac_list)
                for _ in range(subSamples):
                    next_states = RK4(next_states, sub_dt, opt_a, None)
                    values = np.min((values, V_0(next_states[:, [0, 1, 2]])),
                                    axis=0)

            values_ = V_0(next_states[:, [0, 1, 2]])
            # Rows: start states; columns: candidate first actions.
            compare_vals_ = values_.reshape([-1, n_states]).T
            index_best_a_ = compare_vals_.argmin(axis=1)
            values_ = np.min(compare_vals_, axis=1, keepdims=True)
        finally:
            # Always put the weights being trained back in place.
            load_params(current_params)

        return sess.run(make_hot, {hot_input: index_best_a_}), values_, 0

    def sample_states(count, vel_scale=1.0):
        """Draw `count` random raw states.

        Every column starts uniform in [-5, 5); velocities are multiplied by
        `vel_scale`, phi/theta are mapped to +-pi/4 (wrapped to [0, 2*pi)),
        and psi is mapped to [0, 2*pi).
        """
        xs = np.random.uniform(-5.0, 5.0, (count, layers[0] - 3))
        if vel_scale != 1.0:
            xs[:, [3, 4, 5]] = xs[:, [3, 4, 5]] * vel_scale
        xs[:, [6, 7]] = np.mod(xs[:, [6, 7]] * np.pi / 20.0, 2.0 * np.pi)
        xs[:, [8]] = xs[:, [8]] * np.pi / 5.0 + np.pi
        return xs

    # *****************************************************************************
    #
    # ============================= MAIN LOOP ====================================
    #
    # *****************************************************************************
    ALL_PI = []
    nunu = lr_schedule.value(0)  # only reported by the very first print

    for i in xrange(iters):

        if np.mod(i, renew) == 0:
            first_round = (i == 0)

            if not first_round:
                # Snapshot the policy just trained; newest goes first.
                ALL_PI.insert(0, sess.run(theta))

            # Training set.
            t_start = time.time()
            ALL_x = sample_states(nrolls)
            PI, _, _ = getPI(ALL_x, ALL_PI, subSamples=1)
            pre_ALL_x = ConvCosSin(ALL_x)
            elapsed = time.time() - t_start
            print("Compute Data Time = " + str(elapsed))

            # Test set.
            # NOTE(review): test velocities are doubled on every renewal
            # except the first; kept as in the original - confirm intent.
            test_vel_scale = 1.0 if first_round else 2.0
            ALL_x_ = sample_states(nrolls // 100, test_vel_scale)
            PI_, _, _ = getPI(ALL_x_, ALL_PI, subSamples=1)
            pre_ALL_x_ = ConvCosSin(ALL_x_)

            if not first_round:
                print('Again.')

        # |||||||||||| ----  PRINT ----- ||||||||||||
        if np.mod(i, 200) == 0:
            train_acc = sess.run(accuracy, {states: pre_ALL_x, y: PI})
            test_acc = sess.run(accuracy, {states: pre_ALL_x_, y: PI_})
            print(str(i) + ") | TR_ACC = " + str(train_acc) +
                  " | TE_ACC = " + str(test_acc) +
                  " | Lerning Rate = " + str(nunu))

        # One RMSProp step on a random mini-batch (sampled with replacement).
        nunu = 0.01
        tmp = np.random.randint(len(ALL_x), size=bts)
        sess.run(train_step,
                 feed_dict={
                     states: pre_ALL_x[tmp],
                     y: PI[tmp],
                     nu: nunu
                 })

    with open("policies6Dreach_h50.pkl", "wb") as out_file:
        pickle.dump(ALL_PI, out_file)