Example 1
def train(state_cb,
          pub_cmd,
          pub_act,
          rate,
          cost_fn,
          logdir=None,
          render=False,
          learning_rate=1e-3,
          onpol_iters=10,
          dynamics_iters=60,
          batch_size=512,
          num_paths_random=10,
          num_paths_onpol=10,
          num_simulated_paths=10000,
          env_horizon=1000,
          mpc_horizon=15,
          n_layers=2,
          size=500,
          activation=tf.nn.relu,
          output_activation=None):
    """

    Arguments:

    onpol_iters                 Number of iterations of onpolicy aggregation for the loop to run.

    dynamics_iters              Number of iterations of training for the dynamics model
    |_                          which happen per iteration of the aggregation loop.

    batch_size                  Batch size for dynamics training.

    num_paths_random            Number of paths/trajectories/rollouts generated
    |                           by a random agent. We use these to train our
    |_                          initial dynamics model.

    num_paths_onpol             Number of paths to collect at each iteration of
    |_                          aggregation, using the Model Predictive Control policy.

    num_simulated_paths         How many fictitious rollouts the MPC policy
    |                           should generate each time it is asked for an
    |_                          action.

    env_horizon                 Number of timesteps in each path.

    mpc_horizon                 The MPC policy generates actions by imagining
    |                           fictitious rollouts, and picking the first action
    |                           of the best fictitious rollout. This argument is
    |                           how many timesteps should be in each fictitious
    |_                          rollout.

    n_layers/size/activations   Neural network architecture arguments.

    """

    logz.configure_output_dir(logdir)

    #========================================================
    #
    # First, we need a lot of data generated by a random
    # agent, with which we'll begin to train our dynamics
    # model.

    rand_controller = RandomController()
    paths = sample(state_cb, pub_cmd, pub_act, rate, rand_controller,
                   num_paths_random, env_horizon, render)
    data = paths_to_array(paths)

    #========================================================
    #
    # The random data will be used to get statistics (mean
    # and std) for the observations, actions, and deltas
    # (where deltas are o_{t+1} - o_t). These will be used
    # for normalizing inputs and denormalizing outputs
    # from the dynamics network.
    #
    normalization = compute_normalization(data)

    #========================================================
    #
    # Build dynamics model and MPC controllers.
    #
    sess = tf.Session()

    dyn_model = NNDynamicsModel(n_layers=n_layers,
                                size=size,
                                activation=activation,
                                output_activation=output_activation,
                                normalization=normalization,
                                batch_size=batch_size,
                                iterations=dynamics_iters,
                                learning_rate=learning_rate,
                                sess=sess)

    mpc_controller = MPCcontroller(dyn_model=dyn_model,
                                   horizon=mpc_horizon,
                                   cost_fn=cost_fn,
                                   num_simulated_paths=num_simulated_paths)

    #========================================================
    #
    # Tensorflow session building.
    #
    sess.__enter__()
    tf.global_variables_initializer().run()

    #========================================================
    #
    # Take multiple iterations of on-policy aggregation: at each iteration,
    # refit the dynamics model to the current dataset, then collect on-policy
    # samples and aggregate them into the dataset.
    # Note: you don't need to use a mixing ratio between new and old data in
    # this assignment, as described in https://arxiv.org/abs/1708.02596
    #
    for itr in range(onpol_iters):
        # Fit dynamics model
        print('Training dynamics model...')
        dyn_model.fit(data)
        plot_comparison(dyn_model, state_cb, pub_act, pub_cmd, rate)
        mpc_controller.dyn_model = dyn_model
        costs = []
        returns = []
        # Do MPC
        for i in range(num_paths_onpol):
            print('On policy path: %i' % i)
            obs_t, obs_tp1, acs_t, rews_t = [], [], [], []
            s_t = state_cb.reset(pub_act, pub_cmd)
            total_return = 0

            for j in range(env_horizon):
                # print('Timestep: %i, Return: %g' % (j,total_return))
                a_t = mpc_controller.get_action(s_t)
                s_tp1, _ = state_cb.step(a_t, pub_act, pub_cmd)
                r_t = 0
                # Accumulate the change in the first entry of each of the nine
                # 12-dimensional state blocks; use k so the outer on-policy
                # path index i is not shadowed.
                for k in range(9):
                    r_t += s_tp1[k * 12] - s_t[k * 12]
                total_return += r_t

                if render:
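                    # NOTE: `env` is not defined in this function, which drives
                    # the robot through state_cb/pub_act/pub_cmd; calling
                    # env.render() here would raise a NameError unless a
                    # simulator handle is added.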
                    env.render()
                    time.sleep(0.05)

                obs_t.append(s_t)
                obs_tp1.append(s_tp1)
                acs_t.append(a_t)
                rews_t.append(r_t)

                s_t = s_tp1

            path = {
                "observations": np.array(obs_t),
                "next_observations": np.array(obs_tp1),
                "actions": np.array(acs_t),
                "rewards": np.array(rews_t)
            }
            total_cost = path_cost(cost_fn, path)

            paths.append(path)
            returns.append(total_return)
            costs.append(total_cost)
            print('Total cost: %g, Total reward: %g' %
                  (total_cost, total_return))

        data = paths_to_array(paths)
        normalization = compute_normalization(data)
        # Set new normalization statistics for dynamics model
        dyn_model.normalization = normalization

        # LOGGING
        # Statistics for performance of MPC policy using
        # our learned dynamics model
        logz.log_tabular('Iteration', itr)
        # In terms of cost function which your MPC controller uses to plan
        logz.log_tabular('AverageCost', np.mean(costs))
        logz.log_tabular('StdCost', np.std(costs))
        logz.log_tabular('MinimumCost', np.min(costs))
        logz.log_tabular('MaximumCost', np.max(costs))
        # In terms of true environment reward of your rolled out trajectory using the MPC controller
        logz.log_tabular('AverageReturn', np.mean(returns))
        logz.log_tabular('StdReturn', np.std(returns))
        logz.log_tabular('MinimumReturn', np.min(returns))
        logz.log_tabular('MaximumReturn', np.max(returns))

        logz.dump_tabular()
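The MPCcontroller class these examples instantiate is not shown. For context, here is a minimal sketch of the random-shooting planner the docstring describes: sample num_simulated_paths random action sequences of mpc_horizon steps, roll them through the learned dynamics model, and execute only the first action of the cheapest imagined rollout. The class name is hypothetical, and the batched dyn_model.predict and cost_fn signatures are assumptions; the real controller may differ.

import numpy as np

class RandomShootingMPC(object):
    """Hypothetical stand-in for the MPCcontroller used above."""

    def __init__(self, env, dyn_model, cost_fn, horizon=15, num_simulated_paths=10000):
        self.env = env
        self.dyn_model = dyn_model
        self.cost_fn = cost_fn
        self.horizon = horizon
        self.num_simulated_paths = num_simulated_paths

    def get_action(self, state):
        K = self.num_simulated_paths
        # K copies of the current state, rolled forward under K random action sequences.
        states = np.tile(state, (K, 1))
        costs = np.zeros(K)
        first_actions = None
        for t in range(self.horizon):
            actions = np.stack([self.env.action_space.sample() for _ in range(K)])
            next_states = self.dyn_model.predict(states, actions)  # assumed batched predict
            costs += self.cost_fn(states, actions, next_states)    # assumed batched cost_fn
            if t == 0:
                first_actions = actions
            states = next_states
        # Execute only the first action of the cheapest imagined rollout.
        return first_actions[np.argmin(costs)]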
Example 2
def train_PG(
             exp_name='',
             env_name='',
             n_iter=100, 
             gamma=1.0, 
             min_timesteps_per_batch=1000, 
             max_path_length=None,
             learning_rate=5e-3, 
             reward_to_go=False, 
             animate=True, 
             logdir=None, 
             normalize_advantages=False,
             nn_baseline=False, 
             seed=0,
             # network arguments
             n_layers=1,
             size=32,

             # mb mpc arguments
             model_learning_rate=1e-3,
             onpol_iters=10,
             dynamics_iters=260,
             batch_size=512,
             num_paths_random=10, 
             num_paths_onpol=10, 
             num_simulated_paths=1000,
             env_horizon=1000, 
             mpc_horizon=10,
             m_n_layers=2,
             m_size=500,
             ):

    start = time.time()

    # Configure output directory for logging
    logz.configure_output_dir(logdir)

    # Log experimental parameters
    args = inspect.getargspec(train_PG)[0]
    locals_ = locals()
    params = {k: locals_[k] if k in locals_ else None for k in args}
    logz.save_params(params)

    # Set random seeds
    tf.set_random_seed(seed)
    np.random.seed(seed)

    # Make the gym environment
    # env = gym.make(env_name)
    env = HalfCheetahEnvNew()
    cost_fn = cheetah_cost_fn
    activation=tf.nn.relu
    output_activation=None

    # Is this env continuous, or discrete?
    discrete = isinstance(env.action_space, gym.spaces.Discrete)

    # Maximum length for episodes
    # max_path_length = max_path_length or env.spec.max_episode_steps
    max_path_length = max_path_length

    # Observation and action sizes
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.n if discrete else env.action_space.shape[0]

    # Print environment information
    print("-------- env info --------")
    print("Environment name: ", env_name)
    print("Action space is discrete: ", discrete)
    print("Action space dim: ", ac_dim)
    print("Observation space dim: ", ob_dim)
    print("Max_path_length ", max_path_length)




    #========================================================================================#
    # Random data collection
    #========================================================================================#

    random_controller = RandomController(env)
    data_buffer_model = DataBuffer()
    data_buffer_ppo = DataBuffer_general(10000, 4)

    # sample path
    print("collecting random data .....  ")
    paths = sample(env, 
               random_controller, 
               num_paths=num_paths_random, 
               horizon=env_horizon, 
               render=False,
               verbose=False)

    # add into buffer
    for path in paths:
        for n in range(len(path['observations'])):
            data_buffer_model.add(path['observations'][n], path['actions'][n], path['next_observations'][n])

    print("data buffer size: ", data_buffer_model.size)

    normalization = compute_normalization(data_buffer_model)

    #========================================================================================#
    # Tensorflow Engineering: Config, Session, Variable initialization
    #========================================================================================#
    tf_config = tf.ConfigProto() 
    tf_config.allow_soft_placement = True
    tf_config.intra_op_parallelism_threads = 4
    tf_config.inter_op_parallelism_threads = 1
    sess = tf.Session(config=tf_config)

    dyn_model = NNDynamicsModel(env=env, 
                                n_layers=n_layers, 
                                size=size, 
                                activation=activation, 
                                output_activation=output_activation, 
                                normalization=normalization,
                                batch_size=batch_size,
                                iterations=dynamics_iters,
                                learning_rate=learning_rate,
                                sess=sess)

    mpc_controller = MPCcontroller(env=env, 
                                   dyn_model=dyn_model, 
                                   horizon=mpc_horizon, 
                                   cost_fn=cost_fn, 
                                   num_simulated_paths=num_simulated_paths)


    policy_nn = policy_network_ppo(sess, ob_dim, ac_dim, discrete, n_layers, size, learning_rate)

    if nn_baseline:
        value_nn = value_network(sess, ob_dim, n_layers, size, learning_rate)

    sess.__enter__() # equivalent to `with sess:`

    tf.global_variables_initializer().run()


    #========================================================================================#
    # Training Loop
    #========================================================================================#

    total_timesteps = 0

    for itr in range(n_iter):
        print("********** Iteration %i ************"%itr)

        if MPC:
            dyn_model.fit(data_buffer_model)
        returns = []
        costs = []

        # Collect paths until we have enough timesteps
        timesteps_this_batch = 0
        paths = []

        while True:
            # print("data buffer size: ", data_buffer_model.size)
            current_path = {'observations': [], 'actions': [], 'reward': [], 'next_observations':[]}

            ob = env.reset()
            obs, acs, mpc_acs, rewards = [], [], [], []
            animate_this_episode = (len(paths) == 0 and (itr % 10 == 0) and animate)
            steps = 0
            return_ = 0
 
            while True:
                # print("steps ", steps)
                if animate_this_episode:
                    env.render()
                    time.sleep(0.05)
                obs.append(ob)

                if MPC:
                    mpc_ac = mpc_controller.get_action(ob)
                else:
                    mpc_ac = random_controller.get_action(ob)

                ac = policy_nn.predict(ob, mpc_ac)

                ac = ac[0]

                if not PG:
                    ac = mpc_ac

                acs.append(ac)
                mpc_acs.append(mpc_ac)

                current_path['observations'].append(ob)

                ob, rew, done, _ = env.step(ac)

                current_path['reward'].append(rew)
                current_path['actions'].append(ac)
                current_path['next_observations'].append(ob)

                return_ += rew
                rewards.append(rew)

                steps += 1
                if done or steps > max_path_length:
                    break


            if MPC:
                # cost & return
                cost = path_cost(cost_fn, current_path)
                costs.append(cost)
                returns.append(return_)
                print("total return: ", return_)
                print("costs: ", cost)

                # add into buffers
                for n in range(len(current_path['observations'])):
                    data_buffer_model.add(current_path['observations'][n], current_path['actions'][n], current_path['next_observations'][n])

            for n in range(len(current_path['observations'])):
                data_buffer_ppo.add(current_path['observations'][n], current_path['actions'][n], current_path['reward'][n], current_path['next_observations'][n])
        
            path = {"observation" : np.array(obs), 
                    "reward" : np.array(rewards), 
                    "action" : np.array(acs),
                    "mpc_action" : np.array(mpc_acs)}



            paths.append(path)
            timesteps_this_batch += pathlength(path)
            # print("timesteps_this_batch", timesteps_this_batch)
            if timesteps_this_batch > min_timesteps_per_batch:
                break
        total_timesteps += timesteps_this_batch


        print("data_buffer_ppo.size:", data_buffer_ppo.size)


        # Build arrays for observation, action for the policy gradient update by concatenating 
        # across paths
        ob_no = np.concatenate([path["observation"] for path in paths])
        ac_na = np.concatenate([path["action"] for path in paths])
        mpc_ac_na = np.concatenate([path["mpc_action"] for path in paths])


        # Computing Q-values
     
        if reward_to_go:
            q_n = []
            for path in paths:
                for t in range(len(path["reward"])):
                    t_ = 0
                    q = 0
                    while t_ < len(path["reward"]):
                        if t_ >= t:
                            q += gamma**(t_-t) * path["reward"][t_]
                        t_ += 1
                    q_n.append(q)
            q_n = np.asarray(q_n)

        else:
            q_n = []
            for path in paths:
                for t in range(len(path["reward"])):
                    t_ = 0
                    q = 0
                    while t_ < len(path["reward"]):
                        q += gamma**t_ * path["reward"][t_]
                        t_ += 1
                    q_n.append(q)
            q_n = np.asarray(q_n)


        # Computing Baselines
        if nn_baseline:

            # b_n = sess.run(baseline_prediction, feed_dict={sy_ob_no :ob_no})
            b_n = value_nn.predict(ob_no)
            b_n = normalize(b_n)
            b_n = denormalize(b_n, np.std(q_n), np.mean(q_n))
            adv_n = q_n - b_n
        else:
            adv_n = q_n.copy()

        # Advantage Normalization
        if normalize_advantages:
            adv_n = normalize(adv_n)

        # Optimizing Neural Network Baseline
        if nn_baseline:
            b_n_target = normalize(q_n)
            value_nn.fit(ob_no, b_n_target)
                # sess.run(baseline_update_op, feed_dict={sy_ob_no :ob_no, sy_baseline_target_n:b_n_target})


        # Performing the Policy Update

        # policy_nn.fit(ob_no, ac_na, adv_n)
        policy_nn.fit(ob_no, ac_na, adv_n, mpc_ac_na)

        # sess.run(update_op, feed_dict={sy_ob_no :ob_no, sy_ac_na:ac_na, sy_adv_n:adv_n})

        # Log diagnostics
        returns = [path["reward"].sum() for path in paths]
        ep_lengths = [pathlength(path) for path in paths]
        logz.log_tabular("Time", time.time() - start)
        logz.log_tabular("Iteration", itr)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", total_timesteps)
        logz.dump_tabular()
        logz.pickle_tf_vars()
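The nested loops above that build q_n recompute the discounted sum from scratch at every timestep (O(T^2) per path). A single backward sweep produces the same reward-to-go values; the helper below is a hypothetical sketch of that equivalence, not part of the original code.

import numpy as np

def discounted_reward_to_go(rewards, gamma):
    """q[t] = sum over t' >= t of gamma**(t' - t) * rewards[t'], via a reverse pass."""
    q = np.zeros(len(rewards))
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        q[t] = running
    return q

# Equivalent to the reward_to_go branch above:
#   q_n = np.concatenate([discounted_reward_to_go(path["reward"], gamma) for path in paths])
# The full-trajectory branch instead assigns q[0] to every timestep of the path.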
Example 3
def train(env, 
         cost_fn,
         logdir=None,
         render=False,
         learning_rate=1e-3,
         onpol_iters=10,
         dynamics_iters=60,
         batch_size=512,
         num_paths_random=10, 
         num_paths_onpol=10, 
         num_simulated_paths=10000,
         env_horizon=1000, 
         mpc_horizon=15,
         n_layers=2,
         size=500,
         activation=tf.nn.relu,
         output_activation=None
         ):

    """

    Arguments:

    onpol_iters                 Number of iterations of onpolicy aggregation for the loop to run. 

    dynamics_iters              Number of iterations of training for the dynamics model
    |_                          which happen per iteration of the aggregation loop.

    batch_size                  Batch size for dynamics training.

    num_paths_random            Number of paths/trajectories/rollouts generated 
    |                           by a random agent. We use these to train our 
    |_                          initial dynamics model.
    
    num_paths_onpol             Number of paths to collect at each iteration of
    |_                          aggregation, using the Model Predictive Control policy.

    num_simulated_paths         How many fictitious rollouts the MPC policy
    |                           should generate each time it is asked for an
    |_                          action.

    env_horizon                 Number of timesteps in each path.

    mpc_horizon                 The MPC policy generates actions by imagining 
    |                           fictitious rollouts, and picking the first action
    |                           of the best fictitious rollout. This argument is
    |                           how many timesteps should be in each fictitious
    |_                          rollout.

    n_layers/size/activations   Neural network architecture arguments. 

    """

    logz.configure_output_dir(logdir)
    #========================================================
    # 
    # First, we need a lot of data generated by a random
    # agent, with which we'll begin to train our dynamics
    # model.

    random_controller = RandomController(env)

    """ YOUR CODE HERE """
    # sample random paths
    data = sample(env, random_controller, num_paths_random, env_horizon)

    #========================================================
    # 
    # The random data will be used to get statistics (mean
    # and std) for the observations, actions, and deltas
    # (where deltas are o_{t+1} - o_t). These will be used
    # for normalizing inputs and denormalizing outputs
    # from the dynamics network. 
    # 
    normalization = compute_normalization(data)


    #========================================================
    # 
    # Build dynamics model and MPC controllers.
    # 
    sess = tf.Session()

    dyn_model = NNDynamicsModel(env=env, 
                                n_layers=n_layers, 
                                size=size, 
                                activation=activation, 
                                output_activation=output_activation, 
                                normalization=normalization,
                                batch_size=batch_size,
                                iterations=dynamics_iters,
                                learning_rate=learning_rate,
                                sess=sess)

    mpc_controller = MPCcontroller(env=env, 
                                   dyn_model=dyn_model, 
                                   horizon=mpc_horizon, 
                                   cost_fn=cost_fn, 
                                   num_simulated_paths=num_simulated_paths)


    #========================================================
    # 
    # Tensorflow session building.
    # 
    sess.__enter__()
    tf.global_variables_initializer().run()

    #========================================================
    # 
    # Take multiple iterations of on-policy aggregation: at each iteration,
    # refit the dynamics model to the current dataset, then collect on-policy
    # samples and aggregate them into the dataset.
    # Note: you don't need to use a mixing ratio between new and old data in
    # this assignment, as described in https://arxiv.org/abs/1708.02596
    # 
    for itr in range(onpol_iters):
        """ YOUR CODE HERE """
        dyn_model.fit(data)
        paths = []
        # iteration over paths
        for n_path in range(num_paths_onpol):
            ob = env.reset()
            obs, n_obs, acs, rewards = [], [], [], []
            for n_horz in range(env_horizon):
                obs.append(ob)
                ac = mpc_controller.get_action(ob)
                acs.append(ac)
                ob, rew, done, _ = env.step(ac)
                n_obs.append(ob)
                rewards.append(rew)
        
            cost = trajectory_cost_fn(cost_fn, np.array(obs), np.array(acs), np.array(n_obs))
            path = {"observations" : np.array(obs), 
                    "next_observations" : np.array(n_obs),
                    "rewards" : np.array(rewards), 
                    "actions" : np.array(acs),
                    "costs" : cost}
            paths.append(path)
            data.append(path)

        returns = [path["rewards"].sum() for path in paths]
        costs = [path["costs"] for path in paths]
        # LOGGING
        # Statistics for performance of MPC policy using
        # our learned dynamics model
        logz.log_tabular('Iteration', itr)
        # In terms of cost function which your MPC controller uses to plan
        logz.log_tabular('AverageCost', np.mean(costs))
        logz.log_tabular('StdCost', np.std(costs))
        logz.log_tabular('MinimumCost', np.min(costs))
        logz.log_tabular('MaximumCost', np.max(costs))
        # In terms of true environment reward of your rolled out trajectory using the MPC controller
        logz.log_tabular('AverageReturn', np.mean(returns))
        logz.log_tabular('StdReturn', np.std(returns))
        logz.log_tabular('MinimumReturn', np.min(returns))
        logz.log_tabular('MaximumReturn', np.max(returns))

        logz.dump_tabular()
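compute_normalization is called in every example but never defined here. Below is a minimal sketch of what it typically computes for this pipeline (means and standard deviations of observations, deltas o_{t+1} - o_t, and actions), assuming data is a list of path dicts with observations, actions, and next_observations arrays; the exact return layout of the real helper may differ.

import numpy as np

def compute_normalization_sketch(paths, eps=1e-8):
    obs = np.concatenate([p["observations"] for p in paths])
    acs = np.concatenate([p["actions"] for p in paths])
    deltas = np.concatenate([p["next_observations"] - p["observations"] for p in paths])
    # Statistics used to normalize network inputs and denormalize its outputs.
    return (obs.mean(axis=0), obs.std(axis=0) + eps,
            deltas.mean(axis=0), deltas.std(axis=0) + eps,
            acs.mean(axis=0), acs.std(axis=0) + eps)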
Example 4
def train(env, 
         cost_fn,
         logdir=None,
         render=False,
         learning_rate=1e-3,
         onpol_iters=10,
         dynamics_iters=60,
         batch_size=512,
         num_paths_random=10, 
         num_paths_onpol=10, 
         num_simulated_paths=10000,
         env_horizon=1000, 
         mpc_horizon=15,
         n_layers=2,
         size=500,
         activation=tf.nn.relu,
         output_activation=None
         ):
    # tracker = SummaryTracker()

    """

    Arguments:

    onpol_iters                 Number of iterations of onpolicy aggregation for the loop to run. 

    dynamics_iters              Number of iterations of training for the dynamics model
    |_                          which happen per iteration of the aggregation loop.

    batch_size                  Batch size for dynamics training.

    num_paths_random            Number of paths/trajectories/rollouts generated 
    |                           by a random agent. We use these to train our 
    |_                          initial dynamics model.
    
    num_paths_onpol             Number of paths to collect at each iteration of
    |_                          aggregation, using the Model Predictive Control policy.

    num_simulated_paths         How many fictitious rollouts the MPC policy
    |                           should generate each time it is asked for an
    |_                          action.

    env_horizon                 Number of timesteps in each path.

    mpc_horizon                 The MPC policy generates actions by imagining 
    |                           fictitious rollouts, and picking the first action
    |                           of the best fictitious rollout. This argument is
    |                           how many timesteps should be in each fictitious
    |_                          rollout.

    n_layers/size/activations   Neural network architecture arguments. 

    """

    logz.configure_output_dir(logdir)

    #========================================================
    # 
    # First, we need a lot of data generated by a random
    # agent, with which we'll begin to train our dynamics
    # model.

    """ YOUR CODE HERE """

    # Print env info
    print("-------- env info --------")
    print("observation_space: ", env.observation_space.shape)
    print("action_space: ", env.action_space.shape)
    print(" ")


    random_controller = RandomController(env)
    data_buffer = DataBuffer()
    bc_data_buffer = DataBuffer_SA(BC_BUFFER_SIZE)

    # sample path
    print("collecting random data .....  ")
    paths = sample(env, 
               random_controller, 
               num_paths=num_paths_random, 
               horizon=env_horizon, 
               render=False,
               verbose=False)

    # add into buffer
    for path in paths:
        for n in range(len(path['observations'])):
            data_buffer.add(path['observations'][n], path['actions'][n], path['next_observations'][n])



    #========================================================
    # 
    # The random data will be used to get statistics (mean
    # and std) for the observations, actions, and deltas
    # (where deltas are o_{t+1} - o_t). These will be used
    # for normalizing inputs and denormalizing outputs
    # from the dynamics network. 
    # 
    print("data buffer size: ", data_buffer.size)

    normalization = compute_normalization(data_buffer)

    #========================================================
    # 
    # Build dynamics model and MPC controllers and Behavioral cloning network.
    # 
    sess = tf.Session()

    dyn_model = NNDynamicsModel(env=env, 
                                n_layers=n_layers, 
                                size=size, 
                                activation=activation, 
                                output_activation=output_activation, 
                                normalization=normalization,
                                batch_size=batch_size,
                                iterations=dynamics_iters,
                                learning_rate=learning_rate,
                                sess=sess)

    mpc_controller = MPCcontroller(env=env, 
                                   dyn_model=dyn_model, 
                                   horizon=mpc_horizon, 
                                   cost_fn=cost_fn, 
                                   num_simulated_paths=num_simulated_paths)

    bc_net = BCnetwork(sess, env, BATCH_SIZE_BC, learning_rate)

    mpc_controller_bc = MPCcontroller_BC(env=env, 
                                   dyn_model=dyn_model, 
                                   bc_network=bc_net,
                                   horizon=mpc_horizon, 
                                   cost_fn=cost_fn, 
                                   num_simulated_paths=num_simulated_paths)


    #========================================================
    # 
    # Tensorflow session building.
    # 
    sess.__enter__()
    tf.global_variables_initializer().run()

    # init or load checkpoint with saver
    saver = tf.train.Saver()

    checkpoint = tf.train.get_checkpoint_state(CHECKPOINT_DIR)

    if checkpoint and checkpoint.model_checkpoint_path and LOAD_MODEL:
        saver.restore(sess, checkpoint.model_checkpoint_path)
        print("checkpoint loaded:", checkpoint.model_checkpoint_path)
    else:
        print("Could not find old checkpoint")
        if not os.path.exists(CHECKPOINT_DIR):
            os.mkdir(CHECKPOINT_DIR)
    #========================================================
    # 
    # Take multiple iterations of on-policy aggregation: at each iteration,
    # refit the dynamics model to the current dataset, then collect on-policy
    # samples and aggregate them into the dataset.
    # Note: you don't need to use a mixing ratio between new and old data in
    # this assignment, as described in https://arxiv.org/abs/1708.02596
    # 

    for itr in range(onpol_iters):
        """ YOUR CODE HERE """
        print("onpol_iters: ", itr)

        dyn_model.fit(data_buffer)

        saver.save(sess, CHECKPOINT_DIR)

        returns = []
        costs = []

        for w in range(num_paths_onpol):
            print("paths_onpol: ", w, " running.....")
            print("data buffer size: ", data_buffer.size)

            st = env.reset_model()
            path = {'observations': [], 'actions': [], 'next_observations':[]}
            # tracker.print_diff()

            return_ = 0

            for i in range(env_horizon):
                if render:
                    env.render()
                # print("env_horizon: ", i)   

                if BEHAVIORAL_CLONING:
                    if bc_data_buffer.size > 2000:
                        at = mpc_controller_bc.get_action(st)
                    else:
                        at = mpc_controller.get_action(st)
                else:
                    at = mpc_controller.get_action(st)
                    # at = random_controller.get_action(st)

                st_next, env_reward, _, _ = env._step(at)
                path['observations'].append(st)
                path['actions'].append(at)
                path['next_observations'].append(st_next)
                st = st_next
                return_ += env_reward

            # cost & return
            cost = path_cost(cost_fn, path)
            costs.append(cost)
            returns.append(return_)
            print("total return: ", return_)
            print("costs: ", cost)

            # add into buffers
            for n in range(len(path['observations'])):
                data_buffer.add(path['observations'][n], path['actions'][n], path['next_observations'][n])
                bc_data_buffer.add(path['observations'][n], path['actions'][n])

        if BEHAVIORAL_CLONING:
            behavioral_cloning(sess, env, bc_net, mpc_controller, env_horizon, bc_data_buffer, Training_epoch=1000)




        # LOGGING
        # Statistics for performance of MPC policy using
        # our learned dynamics model
        logz.log_tabular('Iteration', itr)
        # logz.log_tabular('Average_BC_Return', np.mean(bc_returns))

        # In terms of cost function which your MPC controller uses to plan
        logz.log_tabular('AverageCost', np.mean(costs))
        logz.log_tabular('StdCost', np.std(costs))
        logz.log_tabular('MinimumCost', np.min(costs))
        logz.log_tabular('MaximumCost', np.max(costs))
        # In terms of true environment reward of your rolled out trajectory using the MPC controller
        logz.log_tabular('AverageReturn', np.mean(returns))
        logz.log_tabular('StdReturn', np.std(returns))
        logz.log_tabular('MinimumReturn', np.min(returns))
        logz.log_tabular('MaximumReturn', np.max(returns))

        logz.dump_tabular()
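path_cost and trajectory_cost_fn are used throughout these examples but not shown. The helpers below are a minimal sketch of the usual definitions, inferred from how the path dicts are built above; the _sketch names are hypothetical.

def trajectory_cost_fn_sketch(cost_fn, states, actions, next_states):
    # Total cost of one trajectory: sum the per-step cost over time.
    return sum(cost_fn(s, a, s_next)
               for s, a, s_next in zip(states, actions, next_states))

def path_cost_sketch(cost_fn, path):
    return trajectory_cost_fn_sketch(cost_fn,
                                     path["observations"],
                                     path["actions"],
                                     path["next_observations"])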
Example 5
def train(
    env,
    cost_fn,
    logdir=None,
    render=False,
    learning_rate=1e-3,
    onpol_iters=10,
    dynamics_iters=60,
    batch_size=512,
    num_paths_random=10,
    num_paths_onpol=10,
    num_simulated_paths=1000,
    env_horizon=1000,
    mpc_horizon=15,
    n_layers=2,
    size=500,
    activation=tf.nn.relu,
    output_activation=None,
):
    """

    Arguments:

    onpol_iters                 Number of iterations of onpolicy aggregation for the loop to run. 

    dynamics_iters              Number of iterations of training for the dynamics model
    |_                          which happen per iteration of the aggregation loop.

    batch_size                  Batch size for dynamics training.

    num_paths_random            Number of paths/trajectories/rollouts generated 
    |                           by a random agent. We use these to train our 
    |_                          initial dynamics model.
    
    num_paths_onpol             Number of paths to collect at each iteration of
    |_                          aggregation, using the Model Predictive Control policy.

    num_simulated_paths         How many fictitious rollouts the MPC policy
    |                           should generate each time it is asked for an
    |_                          action.

    env_horizon                 Number of timesteps in each path.

    mpc_horizon                 The MPC policy generates actions by imagining 
    |                           fictitious rollouts, and picking the first action
    |                           of the best fictitious rollout. This argument is
    |                           how many timesteps should be in each fictitious
    |_                          rollout.

    n_layers/size/activations   Neural network architecture arguments. 

    """

    logz.configure_output_dir(logdir)

    #========================================================
    #
    # First, we need a lot of data generated by a random
    # agent, with which we'll begin to train our dynamics
    # model.

    random_controller = RandomController(env)
    """ YOUR CODE HERE """
    paths_rand = sample(env,
                        random_controller,
                        num_paths=num_paths_random,
                        horizon=env_horizon,
                        render=render,
                        verbose=False)
    #========================================================
    #
    # The random data will be used to get statistics (mean
    # and std) for the observations, actions, and deltas
    # (where deltas are o_{t+1} - o_t). These will be used
    # for normalizing inputs and denormalizing outputs
    # from the dynamics network.
    #
    """ YOUR CODE HERE """
    normalization = compute_normalization(paths_rand)

    gamma = 0.99

    #========================================================
    #
    # Build dynamics model and MPC controllers.
    #
    sess = tf.Session()

    dyn_model = NNDynamicsModel(env=env,
                                n_layers=n_layers,
                                size=size,
                                activation=activation,
                                output_activation=output_activation,
                                normalization=normalization,
                                batch_size=batch_size,
                                iterations=dynamics_iters,
                                learning_rate=learning_rate,
                                sess=sess)

    mpc_controller = MPCcontroller(env=env,
                                   dyn_model=dyn_model,
                                   horizon=mpc_horizon,
                                   cost_fn=cost_fn,
                                   num_simulated_paths=num_simulated_paths)

    #========================================================
    #
    # Tensorflow session building.
    #
    sess.__enter__()
    tf.global_variables_initializer().run()

    #========================================================
    #
    # Take multiple iterations of on-policy aggregation: at each iteration,
    # refit the dynamics model to the current dataset, then collect on-policy
    # samples and aggregate them into the dataset.
    # Note: you don't need to use a mixing ratio between new and old data in
    # this assignment, as described in https://arxiv.org/abs/1708.02596
    #
    # Pre-fit the dynamics model on the random data before the on-policy
    # aggregation (DAgger-style) loop:
    print("****** Pretrain dynamics model *******")
    losses = []
    obs_rand = np.concatenate([path["observation"] for path in paths_rand])
    action_rand = np.concatenate([path["action"] for path in paths_rand])
    next_ob_rand = np.concatenate([path["obs_next"] for path in paths_rand])
    data_size_rand = obs_rand.shape[0]
    for i in range(1000):
        # obtain batch size from random policy
        batch_idx_rand = np.random.randint(data_size_rand, size=batch_size)
        batch_ob_rand = obs_rand[batch_idx_rand, :]
        batch_ac_rand = action_rand[batch_idx_rand, :]
        batch_nxt_rand = next_ob_rand[batch_idx_rand, :]
        # obtain batch size from on policy
        batch_ob = np.copy(batch_ob_rand)
        batch_ac = np.copy(batch_ac_rand)
        batch_nxt = np.copy(batch_nxt_rand)
        loss = dyn_model.fit(batch_ob, batch_ac, batch_nxt)
        losses.append(loss)
        if (i % 20 == 0):
            print('loss', loss)

    costs = []
    returns = []
    paths_rl = []
    for itr in range(onpol_iters):
        """ YOUR CODE HERE """
        # fit dynamic model
        if itr > 0:
            obs_rl = np.concatenate([path["observation"] for path in paths_rl])
            action_rl = np.concatenate([path["action"] for path in paths_rl])
            next_ob_rl = np.concatenate(
                [path["obs_next"] for path in paths_rl])
        obs_rand = np.concatenate([path["observation"] for path in paths_rand])
        action_rand = np.concatenate([path["action"] for path in paths_rand])
        next_ob_rand = np.concatenate(
            [path["obs_next"] for path in paths_rand])
        # print obs[128,:].shape
        data_size_rand = obs_rand.shape[0]
        if itr > 0:
            data_size_rl = obs_rl.shape[0]
        # batch_size=128
        losses = []
        # fit model function
        for i in range(dynamics_iters):
            # obtain batch size from random policy
            batch_idx_rand = np.random.randint(data_size_rand,
                                               size=batch_size // 20)
            batch_ob_rand = obs_rand[batch_idx_rand, :]
            batch_ac_rand = action_rand[batch_idx_rand, :]
            batch_nxt_rand = next_ob_rand[batch_idx_rand, :]
            # obtain batch size from on policy
            if itr > 0:
                batch_idx_rl = np.random.randint(data_size_rl,
                                                 size=batch_size * 19 // 20)
                batch_ob_rl = obs_rl[batch_idx_rl, :]
                batch_ac_rl = action_rl[batch_idx_rl, :]
                batch_nxt_rl = next_ob_rl[batch_idx_rl, :]
                # mix them
                batch_ob = np.concatenate((batch_ob_rand, batch_ob_rl))
                batch_ac = np.concatenate((batch_ac_rand, batch_ac_rl))
                batch_nxt = np.concatenate((batch_nxt_rand, batch_nxt_rl))
            else:
                batch_ob = np.copy(batch_ob_rand)
                batch_ac = np.copy(batch_ac_rand)
                batch_nxt = np.copy(batch_nxt_rand)
            loss = dyn_model.fit(batch_ob, batch_ac, batch_nxt)
            losses.append(loss)
            # if(i%20==0):
            #     print('loss', loss)
        print("on policy dagger ", itr)
        ob = env.reset()
        # disc_returns holds the per-step discounted returns of this batch;
        # keep it separate from the returns list used for logging above.
        observes, acs, rewards, obs_2, disc_returns = [], [], [], [], []
        steps = 0
        g = 0
        max_path_length = mpc_controller.horizon
        timesteps_this_batch = 0
        while True:
            while True:
                observes.append(ob)
                ac = mpc_controller.get_action(ob)
                # print ac
                acs.append(ac)
                # print ac
                ob, rew, done, _ = env.step(ac)
                g += rew * gamma**steps
                obs_2.append(ob)
                rewards.append(rew)
                disc_returns.append(g)
                steps += 1
                if done or steps > max_path_length:
                    terminated = done
                    break
            path = {
                "observation": np.array(observes),
                "reward": np.array(rewards),
                "action": np.array(acs),
                "obs_next": np.array(obs_2),
                "return": np.array(returns)
            }
            paths_rl.append(path)
            timesteps_this_batch += pathlength(path)
            print(g)
            if timesteps_this_batch > batch_size:
                break
        trajectory_cost = trajectory_cost_fn(cheetah_cost_fn,
                                             path["observation"],
                                             path["action"], path["obs_next"])
        costs.append(trajectory_cost)
        returns.append(path["return"][-1])

        # print batch_ob.shape

        # LOGGING
        # Statistics for performance of MPC policy using
        # our learned dynamics model
        logz.log_tabular('Iteration', itr)
        # In terms of cost function which your MPC controller uses to plan
        logz.log_tabular('AverageCost', np.mean(costs))
        logz.log_tabular('StdCost', np.std(costs))
        logz.log_tabular('MinimumCost', np.min(costs))
        logz.log_tabular('MaximumCost', np.max(costs))
        # In terms of true environment reward of your rolled out trajectory using the MPC controller
        logz.log_tabular('AverageReturn', np.mean(returns))
        logz.log_tabular('StdReturn', np.std(returns))
        logz.log_tabular('MinimumReturn', np.min(returns))
        logz.log_tabular('MaximumReturn', np.max(returns))

        logz.dump_tabular()
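Every NNDynamicsModel above follows the same recipe: train the network on normalized (state, action) inputs to predict the normalized state delta, then denormalize the prediction and add it back to the current state. Below is a numpy-level sketch of that predict step, assuming the six-element normalization layout from the compute_normalization sketch earlier; net_fn is a hypothetical stand-in for the trained network.

import numpy as np

def predict_next_state_sketch(states, actions, net_fn, normalization, eps=1e-8):
    """s_{t+1} ~= s_t + denormalize(net(normalize(s_t), normalize(a_t)))."""
    mean_obs, std_obs, mean_deltas, std_deltas, mean_acs, std_acs = normalization
    norm_obs = (states - mean_obs) / (std_obs + eps)
    norm_acs = (actions - mean_acs) / (std_acs + eps)
    norm_deltas = net_fn(np.concatenate([norm_obs, norm_acs], axis=-1))
    return states + norm_deltas * std_deltas + mean_deltas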
Example 6
def main():
    nb_total_steps = 1000
    nb_iterations = 40
    hidden_layers = [256, 256]
    writer = tensorboardX.SummaryWriter()

    args = parse_args(__doc__, ['env'])


    env = gym.make(args.env) 

    ctrl = rand_ctrl = RandomController(env)


    # ipdb.set_trace()
    print('#inputs : %d' % ctrl.nb_inputs())
    print('#actions: %d' % ctrl.nb_actions())

    # f_net = make_net(
    #     [ctrl.nb_inputs() + ctrl.nb_actions()] + hidden_layers + [ctrl.nb_inputs()],
    #     [nn.ReLU() for _ in hidden_layers],
    # )
    f_net = MOENetwork(
        nb_inputs=ctrl.nb_inputs() + ctrl.nb_actions(),
        nb_experts=4,
        gait_layers=[64],
        expert_layers=[64, ctrl.nb_inputs()],
    )

    data = collect_data(env, ctrl, nb_total_steps*10)


    # ipdb.set_trace()

    dynamics = DynamicsModel(env, f_net, data.get_all(), writer=writer)
    # cost_func = lambda s,a,sn: -sn[3].item()  # refers to vx
    cost_func = get_cost(args.env)  # refers to vx

    # data.calc_normalizations()
    # dynamics.fit(data)

    mpc_ctrl = MPCcontroller(env, dynamics.predict, cost_func, num_simulated_paths=100, horizon=10, num_mpc_steps=10)
    eval_args = EvaluationArgs(nb_burnin_steps=4, nb_episodes=10, horizons=[1, 2, 4, 8, 16, 32])

    for i in range(nb_iterations):
        print('Iteration', i)
        new_data = collect_data(env, ctrl, nb_total_steps)
        dynamics.fit(*new_data.get_all())
        data.extend(new_data)
        dynamics.fit(*data.sample(sample_size=4*nb_total_steps))
        evaluate_and_log_dynamics(
            dynamics.predict, env, rand_ctrl, writer=writer, i_step=i, args=eval_args
        )
        evaluate_and_log_dynamics(
            dynamics.predict, env, mpc_ctrl, writer=writer, i_step=i, args=eval_args
        )
        # dynamics.fit(*data.get_all())
        if random.random() > 0.5:
            ctrl = rand_ctrl
        else:
            ctrl = mpc_ctrl
    
    env = gym.make(args.env)

    ctrl = MPCcontroller(env, dynamics.predict, cost_func, num_simulated_paths=1000, num_mpc_steps=4)

    # TODO

    env.render(mode='human')
    obs = env.reset()

    for _ in range(100):
        # time.sleep(1. / 60.)
        obs, r, done, _ = env.step(ctrl.get_action(obs))
        # print('  ', cost_func(obs))
        if done:
            print("done:", r, obs)
            time.sleep(1)
            ctrl.reset()
            obs = env.reset()
    ipdb.set_trace()
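collect_data and the buffer it returns are not shown in this example. As a rough sketch of the interface the loop relies on, the hypothetical helper below gathers (obs, action, next_obs) transitions by rolling the given controller in the environment; the real class additionally exposes get_all, extend, and sample.

def collect_data_sketch(env, controller, nb_steps):
    transitions = []
    obs = env.reset()
    for _ in range(nb_steps):
        action = controller.get_action(obs)
        next_obs, reward, done, _ = env.step(action)
        transitions.append((obs, action, next_obs))
        # Start a fresh episode when the current one terminates.
        obs = env.reset() if done else next_obs
    return transitions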