Example #1
    def __init__(self, args, sess):
        self.args = args
        self.sess = sess
        [A, B] = get_linear_dynamics()
        self.prior = BasePrior(A, B)

        self.env = gym.make(self.args.env_name)
        self.args.max_path_length = self.env.spec.timestep_limit
        self.agent = TRPO(self.args, self.env, self.sess, self.prior)
Example #2
def linear_dynamics_constraints(prog, decision_vars, x_bar, u_bar, N, dt):
    '''
    Dynamics constraints in error-state form.
    x_bar is the nominal point; dx = (x - x_bar).
    A, B = get_linear_dynamics(derivs, x_bar[:, n], u_bar[:, n])
    '''
    x, u, _, _ = decision_vars
    derivs = dynamics.derivatives(dynamics.discrete_dynamics, n_x, n_u)
    for n in range(N-1):
        A, B = dynamics.get_linear_dynamics(derivs, x_bar[:, n], u_bar[:, n])
        dx = x[:, n] - x_bar[:, n]
        du = u[:, n] - u_bar[:, n]
        dxdt = A@dx + B@du
        fxu_bar = dynamics.car_continuous_dynamics(x_bar[:, n], u_bar[:, n])
        for i in range(n_x):
            prog.AddConstraint(x[i, n+1] == x[i, n] +
                               (fxu_bar[i] + dxdt[i])*dt)
    return prog
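
None of the examples show get_linear_dynamics itself. As a point of reference, here is a minimal sketch of such a helper, assuming a continuous dynamics function f(x, u) like car_continuous_dynamics above, that builds the Jacobians A and B by finite differences; the name finite_difference_jacobians and the step size eps are illustrative, not taken from the examples' codebase.

import numpy as np

def finite_difference_jacobians(f, x_bar, u_bar, eps=1e-6):
    # Linearize dx/dt = f(x, u) about (x_bar, u_bar):
    #   f(x, u) ~ f(x_bar, u_bar) + A (x - x_bar) + B (u - u_bar)
    f0 = np.asarray(f(x_bar, u_bar), dtype=float)
    n_x, n_u = x_bar.size, u_bar.size
    A = np.zeros((n_x, n_x))
    B = np.zeros((n_x, n_u))
    for i in range(n_x):
        dx = np.zeros(n_x)
        dx[i] = eps
        A[:, i] = (np.asarray(f(x_bar + dx, u_bar), dtype=float) - f0) / eps
    for j in range(n_u):
        du = np.zeros(n_u)
        du[j] = eps
        B[:, j] = (np.asarray(f(x_bar, u_bar + du), dtype=float) - f0) / eps
    return A, B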
Example #3
def train(sess, env, args, actor, critic, actor_noise, reward_result):

    # Set up summary Ops
    summary_ops, summary_vars = build_summaries()

    sess.run(tf.global_variables_initializer())

    # Get dynamics and initialize prior controller
    [A, B] = get_linear_dynamics()
    prior = BasePrior(A, B)
    # Initialize target network weights
    actor.update_target_network()
    critic.update_target_network()

    # Initialize replay memory
    replay_buffer = ReplayBuffer(int(args['buffer_size']),
                                 int(args['random_seed']))

    paths = list()

    for i in range(int(args['max_episodes'])):

        s = env.reset()

        ep_reward = 0.
        ep_ave_max_q = 0

        obs, action, rewards = [], [], []

        # Get baseline reward using the control prior (treated as the optimal controller)
        s0 = np.copy(s)
        ep_reward_opt = 0.
        for kk in range(int(args['max_episode_len'])):
            a_prior = prior.getControl_h(s0)
            a = a_prior
            s0, r, stop_c, _ = env.step(a)
            ep_reward_opt += r
            if (stop_c):
                break

        # Get reward using regRL algorithm
        env.reset()
        s = env.unwrapped.reset(s)

        for j in range(int(args['max_episode_len'])):

            # Set control prior regularization weight
            lambda_mix = 5.

            # Prior control
            a_prior = prior.getControl_h(s)

            # RL control with exploration noise
            a = actor.predict(np.reshape(s, (1, actor.s_dim))) + actor_noise()
            #a = actor.predict(np.reshape(s, (1, actor.s_dim))) + (1. / (1. + i))

            # Mix the actions (RL controller + control prior)
            act = a[0] / (1 + lambda_mix) + (lambda_mix /
                                             (1 + lambda_mix)) * a_prior

            # Take action and observe next state/reward
            s2, r, terminal, info = env.step(act)

            # Add info from time step to the replay buffer
            replay_buffer.add(
                np.reshape(s, (actor.s_dim, )), np.reshape(a, (actor.a_dim, )),
                r, terminal, np.reshape(s2, (actor.s_dim, )),
                np.reshape((lambda_mix / (1 + lambda_mix)) * a_prior,
                           (actor.a_dim, )))

            # Keep adding experience to the memory until
            # there are at least minibatch size samples
            if replay_buffer.size() > int(args['minibatch_size']):

                # Sample a batch from the replay buffer
                s_batch, a_batch_0, r_batch, t_batch, s2_batch, a_prior_batch = \
                    replay_buffer.sample_batch(int(args['minibatch_size']))

                a_batch = a_batch_0

                # Calculate targets
                target_q = critic.predict_target(
                    s2_batch, actor.predict_target(s2_batch))
                y_i = []
                for k in range(int(args['minibatch_size'])):
                    if t_batch[k]:
                        y_i.append(r_batch[k])
                    else:
                        y_i.append(r_batch[k] + critic.gamma * target_q[k])

                # Update the critic given the targets
                predicted_q_value, _ = critic.train(
                    s_batch, a_batch,
                    np.reshape(y_i, (int(args['minibatch_size']), 1)))
                ep_ave_max_q += np.amax(predicted_q_value)

                # Update the actor policy using the sampled gradient
                a_outs = actor.predict(s_batch)
                grads = critic.action_gradients(s_batch, a_outs)
                actor.train(s_batch, grads[0])

                # Update target networks
                actor.update_target_network()
                critic.update_target_network()

                # Calculate TD-Error for each state
                base_q = critic.predict_target(s_batch,
                                               actor.predict_target(s_batch))
                target_q = critic.predict_target(
                    s2_batch, actor.predict_target(s2_batch))

            s = s2
            ep_reward += r

            obs.append(s)
            rewards.append(r)
            action.append(a[0])

            # Collect results at end of episode
            if terminal:
                for ii in range(len(obs)):
                    obs[ii] = obs[ii].reshape((4, 1))
                print('| Reward: {:d} | Episode: {:d} | Qmax: {:.4f}'.format(
                    int(ep_reward - ep_reward_opt), i, ep_ave_max_q / float(j)))
                reward_result[0, i] = ep_reward
                reward_result[1, i] = ep_reward_opt
                path = {
                    "Observation": np.concatenate(obs).reshape((-1, 4)),
                    "Action": np.concatenate(action),
                    "Reward": np.asarray(rewards)
                }
                paths.append(path)
                print(ep_reward)
                break

    return [summary_ops, summary_vars, paths]
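
The mixing step in the loop above weights the RL action by 1/(1 + lambda_mix) and the control prior by lambda_mix/(1 + lambda_mix), so with lambda_mix = 5 the prior contributes 5/6 of the command. A standalone check of that arithmetic (the action values are made up for illustration):

import numpy as np

lambda_mix = 5.0
a_rl = np.array([0.6])      # illustrative RL action
a_prior = np.array([-0.3])  # illustrative prior action

# Convex combination used in the training loop above
act = a_rl / (1 + lambda_mix) + (lambda_mix / (1 + lambda_mix)) * a_prior
print(act)  # 0.6/6 - 0.3*5/6 = -0.15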
Example #4
def fn3():

    pdf_path = os.path.join(os.getcwd(), param.get('fn_plots'))

    # remove if exists
    if os.path.exists(pdf_path):
        os.remove(pdf_path)

    np.random.seed(88)

    util.get_x0()
    util.get_xd()

    # baseline trajectory
    bX = []
    bU = np.zeros((len(param.get('T')), param.get('m')))

    # true perturbed trajectory
    X = []
    Xdot = []
    V = []
    Vdot = []

    # linearized perturbed trajectory
    tX = []
    tXdot = []
    tV = []
    tVdot = []

    # collect baseline
    x_curr = param.get('x0')
    for k, t in enumerate(param.get('T')):
        u_curr = util.list_to_vec(bU[k, :])
        x_next = x_curr + param.get('dt') * dynamics.get_dxdt(
            x_curr, u_curr, t)

        bX.append(x_curr)
        x_curr = x_next
    bX = np.squeeze(np.asarray(bX))

    # now run two trajectories and look at divergence
    eps_x0 = 0.0 * np.random.uniform(size=(param.get('n'), 1))
    eps_u = 0.0 * np.random.uniform(size=(param.get('nt'), param.get('m')))
    x_curr = param.get('x0') + eps_x0
    tx_curr = param.get('x0') + eps_x0
    scpU = bU + eps_u
    for k, t in enumerate(param.get('T')):

        # current control
        u_curr = np.reshape(np.transpose(scpU[k, :]), (param.get('m'), 1))

        # base
        ub = util.list_to_vec(bU[k, :])
        xb = util.list_to_vec(bX[k, :])

        # true
        dxdt = dynamics.get_dxdt(x_curr, u_curr, t)
        x_next = x_curr + dxdt * param.get('dt')
        v = dynamics.get_V(x_curr, t)
        vdot = dynamics.get_LfV(x_curr, t) + np.matmul(
            dynamics.get_LgV(x_curr, t), u_curr)

        # approximate
        F_k, B_k, d_k = dynamics.get_linear_dynamics(xb, ub, t)
        R_k, w_k = dynamics.get_linear_lyapunov(xb, ub, t)

        tx_next = np.matmul(F_k, tx_curr) + np.matmul(B_k, u_curr) + d_k
        tv = np.matmul(R_k, tx_curr) + w_k

        X.append(x_curr)
        Xdot.append(dxdt)
        V.append(v)
        Vdot.append(vdot)

        tX.append(tx_curr)
        tV.append(tv)

        x_curr = x_next
        tx_curr = tx_next

        print('Timestep:' + str(k) + '/' + str(len(param.get('T'))))

    X = np.squeeze(np.asarray(X))
    V = np.reshape(np.asarray(V), (len(V), 1))
    Xdot = np.squeeze(np.asarray(Xdot))

    tX = np.squeeze(np.asarray(tX))
    tV = np.reshape(np.asarray(tV), (len(tV), 1))
    tXdot = np.gradient(tX, param.get('dt'), axis=0)
    tVdot = np.gradient(tV, param.get('dt'), axis=0)

    print(np.shape(X))
    print(np.shape(V))
    print(np.shape(Xdot))
    print(np.shape(tX))
    print(np.shape(tV))
    print(np.shape(tXdot))
    print(np.shape(tVdot))

    epa1 = np.linalg.norm(X[:, 0:2] - tX[:, 0:2], axis=1)
    eva1 = np.linalg.norm(X[:, 2:4] - tX[:, 2:4], axis=1)
    epa2 = np.linalg.norm(X[:, 4:6] - tX[:, 4:6], axis=1)
    eva2 = np.linalg.norm(X[:, 6:8] - tX[:, 6:8], axis=1)
    epb = np.linalg.norm(X[:, 8:10] - tX[:, 8:10], axis=1)
    evb = np.linalg.norm(X[:, 10:12] - tX[:, 10:12], axis=1)

    eXdot = np.linalg.norm(Xdot - tXdot, axis=1)
    eV = np.linalg.norm(V - tV, axis=1)
    eVdot = np.linalg.norm(Vdot - tVdot, axis=1)

    import matplotlib.pyplot as plt

    fig, ax = plt.subplots()
    plt.plot(epa1, eXdot, label='eXdot')
    plt.plot(epa1, eV, label='eV')
    ax.set_xscale('log')
    ax.set_yscale('log')
    ax.set_xlabel('epa')
    plt.legend()

    fig, ax = plt.subplots()
    plt.plot(eva1, eXdot, label='eXdot')
    plt.plot(eva1, eV, label='eV')
    ax.set_xscale('log')
    ax.set_yscale('log')
    ax.set_xlabel('eva')
    plt.legend()

    fig, ax = plt.subplots()
    plt.plot(epb, eXdot, label='eXdot')
    plt.plot(epb, eV, label='eV')
    # ax.set_xscale('log')
    ax.set_yscale('log')
    ax.set_xlabel('epb')
    plt.legend()

    fig, ax = plt.subplots()
    plt.plot(evb, eXdot, label='eXdot')
    plt.plot(evb, eV, label='eV')
    # ax.set_xscale('log')
    ax.set_yscale('log')
    ax.set_xlabel('evb')
    plt.legend()

    # plt.plot( eX, eVdot, label = 'eVdot')

    plotter.save_figs()
    subprocess.call(["xdg-open", pdf_path])
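
fn3 above compares a true nonlinear rollout against one propagated through the linearized dynamics returned by get_linear_dynamics. A self-contained toy version of the same kind of check, written in the error-state form of Example #2 for a scalar system dx/dt = -x^3 + u (dynamics, step size, and starting point are illustrative only):

import numpy as np

def f(x, u):
    # toy nonlinear dynamics, illustration only
    return -x**3 + u

dt, N = 0.01, 200
x_bar, u_bar = 0.5, 0.0      # nominal point to linearize about
A = -3.0 * x_bar**2          # df/dx at the nominal point
B = 1.0                      # df/du

x_true, x_lin = 0.6, 0.6     # start slightly off the nominal point
err = []
for _ in range(N):
    u = 0.0
    x_true = x_true + dt * f(x_true, u)
    x_lin = x_lin + dt * (f(x_bar, u_bar) + A * (x_lin - x_bar) + B * (u - u_bar))
    err.append(abs(x_true - x_lin))
print(max(err))              # growth of the linearization error over the rollout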
Example #5
def fn2():

    pdf_path = os.path.join(os.getcwd(), param.get('fn_plots'))

    # remove if exists
    if os.path.exists(pdf_path):
        os.remove(pdf_path)

    np.random.seed(88)

    util.get_x0()
    util.get_xd()

    # trajectory to linearize about
    fX = []
    fU = []
    fV = []
    # scp trajectory
    tX = []
    tU = []
    tV = []
    # integrated trajectory with scp control
    scpV = []
    # linearize about trajectory
    bV = []

    # feedback linearizing baseline
    x_curr = param.get('x0')
    for k, t in enumerate(param.get('T')):
        u_curr = controller.get_fdbk_controller(x_curr, t)
        x_next = x_curr + param.get('dt') * dynamics.get_dxdt(
            x_curr, u_curr, t)
        v = dynamics.get_V(x_curr, t)

        fX.append(x_curr)
        fU.append(u_curr)
        fV.append(v)
        x_curr = x_next
    fX = np.squeeze(np.asarray(fX))
    fU = np.asarray(fU)

    # scp
    scpX, scpU, bX, bU = controller.get_scp_clf_controller()

    # integrate
    tx_curr = param.get('x0')
    x_curr = param.get('x0')
    for k, t in enumerate(param.get('T')):

        ub = util.list_to_vec(bU[k, :])
        xb = util.list_to_vec(bX[k, :])

        u_curr = np.transpose(util.list_to_vec(scpU[k, :]))

        F_k, B_k, d_k = dynamics.get_linear_dynamics(xb, ub, t)

        R_k, w_k = dynamics.get_linear_lyapunov(xb, ub, t)

        tx_next = np.matmul(F_k, tx_curr) + np.matmul(B_k, u_curr) + d_k
        x_next = x_curr + param.get('dt') * dynamics.get_dxdt(
            x_curr, u_curr, t)

        tv = np.matmul(R_k, tx_curr) + w_k
        scpv = dynamics.get_V(x_curr, t)

        tX.append(tx_curr)
        tV.append(tv)
        scpV.append(scpv)
        bV.append(dynamics.get_V(xb, t))

        tx_curr = tx_next
        x_curr = x_next

    tX = np.squeeze(np.asarray(tX))
    bV = np.asarray(bV)

    plotter.plot_SS(fX, param.get('T'), title='Fdbk Linearize SS')
    plotter.plot_V(fV, param.get('T'), title='Fdbk Linearize V')
    plotter.plot_U(fU, param.get('T'), title='Fdbk Linearize U')

    plotter.plot_SS(bX,
                    param.get('T'),
                    title='What SCP is linearizing about: SS')
    plotter.plot_V(bV,
                   param.get('T'),
                   title='What SCP is linearizing about: V')
    # plotter.plot_U(bU, param.get('T'), title = 'What SCP is linearizing about: U')

    plotter.plot_SS(tX, param.get('T'), title="What SCP thinks it's doing: SS")
    plotter.plot_V(tV, param.get('T'), title="What SCP thinks it's doing: V")
    # plotter.plot_U(tU, param.get('T'), title = "What SCP thinks it's doing: U")

    plotter.plot_SS(scpX,
                    param.get('T'),
                    title='What SCP is actually doing: SS')
    plotter.plot_V(scpV, param.get('T'), title='What SCP is actually doing: V')
    # plotter.plot_U(scpU, param.get('T'), title = 'What SCP is actually doing: U')

    plotter.save_figs()
    subprocess.call(["xdg-open", pdf_path])
Example #6
def get_scp_clf_controller(x0, T):

	lambda_v = util.get_stabilization_rate()
	xbar, ubar = get_scp_initial_trajectory(x0, T)
	xbar = np.transpose( xbar, (1,0,2))
	ubar = np.transpose( ubar, (1,0,2))

	i_iter = 0
	cost_curr = np.inf
	cost_diff = np.inf
	tx_curr = xbar
	state_diff = np.inf
	C = []

	while i_iter < param.get('n_scp_iter') and \
		np.abs(cost_diff) > param.get('scp_cost_tol'):

		if not (i_iter == 0):
			if param.get('nl_correction_on'):
				xbar = []
				x_curr = x0
				for k,t in enumerate(T):
					u_curr = np.reshape( u_next[:,k], (param.get('m'),1))
					x_next = x_curr + dynamics.get_dxdt( x_curr, u_curr, t) * param.get('dt')
					xbar.append(x_curr)
					x_curr = x_next
				xbar = np.transpose( np.asarray(xbar), (1,0,2))
				ubar = u_next
			else:
				ubar = u_next
				xbar = tx_next


		print('Iteration: ' + str(i_iter) + '/' + str(param.get('n_scp_iter')))
	
		u  = cp.Variable( param.get('m'), len(T) )
		tx = cp.Variable( param.get('n'), len(T) )
		tV = cp.Variable( len(T), 1)
		delta_v = cp.Variable( len(T)-1, 1)

		constraints = []
		constraints.append( tx[:,0] == xbar[:,0])

		for k in range( len(T)):

			F_k, B_k, d_k = dynamics.get_linear_dynamics( xbar[:,k], ubar[:,k], T[k])
			R_k, w_k = dynamics.get_linear_lyapunov( xbar[:,k], ubar[:,k], T[k])

			constraints.append( 
				tV[k] == R_k * tx[:,k] + w_k)
			constraints.append( 
				cp.abs( tx[:,k] - xbar[:,k]) <= param.get('tau_x') * np.power( 0.95, i_iter))
			constraints.append( 
				cp.abs( u[:,k] - ubar[:,k]) <= param.get('tau_u') * np.power( 0.95, i_iter))
			constraints.append( 
				cp.abs( u[:,k]) <= param.get('control_max'))
		
			if k < len(T)-1:
				constraints.append( 
					tV[k+1] <= (1-lambda_v*param.get('dt')) * tV[k] + delta_v[k]*param.get('dt'))
				constraints.append(
					tx[:,k+1] == F_k * tx[:,k] + B_k * u[:,k] + d_k)
				constraints.append(
					delta_v[k] >= 0)


		# obj = cp.Minimize( cp.sum_squares(u) + param.get('p_v')*sum(delta_v)) 
		# obj = cp.Minimize( param.get('p_v')*sum(delta_v)) 
		obj = cp.Minimize( sum(tV) + cp.sum_squares(u)) 
		prob = cp.Problem( obj, constraints)

		prob.solve(verbose = False, solver = cp.GUROBI, \
			BarQCPConvTol = 1e-6, BarHomogeneous = True)
		# prob.solve(verbose = False)

		tx_next = np.asarray(tx.value).reshape(param.get('n'), len(T), -1)
		u_next = np.asarray(u.value).reshape(param.get('m'), len(T), -1)
		tV_next = np.asarray(tV.value).reshape(len(T), -1)

		cost_next = prob.value
		cost_diff = cost_next - cost_curr
		state_diff = sum( np.linalg.norm( tx_next - tx_curr, axis = 1))

		i_iter += 1
		C.append( cost_curr)
		cost_curr = cost_next
		tx_curr = tx_next
		print('Curr Cost - Prev Cost: ' + str(cost_diff))
		print('State Change: ' + str(state_diff))
		print('Timestep: ' + str(T[0]) + '/' + str(param.get('T')[-1]) + '\n')

	plotter.debug_scp_iteration_plot( tx_next, u_next, xbar, ubar, x0, T, i_iter)
	plotter.plot_cost_iterations(C)
	U = np.transpose( ubar, (1,0,2))
	return U
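
The CVXPY calls in this example follow the pre-1.0 API, where cp.Variable takes dimensions as separate arguments and * is used for matrix products. Under CVXPY 1.x the same pattern is declared with tuple shapes and the @ operator; the following is a minimal sketch of that port with placeholder dimensions and dynamics, not code from the original repository.

import cvxpy as cp
import numpy as np

# Placeholder dimensions and a fixed linearization, for illustration only
n, m, N = 4, 2, 10
F_k = np.eye(n)
B_k = 0.1 * np.ones((n, m))
d_k = np.zeros(n)
x0 = np.ones(n)

u = cp.Variable((m, N))    # CVXPY >= 1.0 takes the shape as a tuple
tx = cp.Variable((n, N))

constraints = [tx[:, 0] == x0]
for k in range(N - 1):
    # '@' replaces '*' for matrix-vector products in the modern API
    constraints.append(tx[:, k + 1] == F_k @ tx[:, k] + B_k @ u[:, k] + d_k)

prob = cp.Problem(cp.Minimize(cp.sum_squares(u)), constraints)
prob.solve()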
Example #7
    TIMESTAMP = datetime.now().strftime("%Y%m%d-%H%M%S")
    SUMMARY_DIR = os.path.join(OUTPUT_RESULTS_DIR, "PPO", ENVIRONMENT, TIMESTAMP)

    env = gym.make(ENVIRONMENT)
    ppo = PPO(env, SUMMARY_DIR, gpu=True)

    if MODEL_RESTORE_PATH is not None:
        ppo.restore_model(MODEL_RESTORE_PATH)

    t, terminal = 0, False
    buffer_s, buffer_a, buffer_r, buffer_v, buffer_terminal = [], [], [], [], []
    rolling_r = RunningStats()

    # Initialize control prior
    [A, B] = get_linear_dynamics()
    prior = BasePrior(A, B)
    # Set fixed regularization weight
    # lambda_mix = 4.

    reward_total, reward_diff, reward_lqr_prior, reward_h_prior = [], [], [], []

    for episode in range(EP_MAX + 1):

        # Baseline reward using only control prior
        s0 = env.reset()
        sp = np.copy(s0)
        reward_prior = 0.
        while True:
            a_prior = prior.getControl_h(sp)
            a_prior = np.squeeze(np.asarray(a_prior))