Example No. 1
def plot_comparison(dyn_model, state_cb, pub_act, pub_cmd, rate):
    """
    Write a function to generate plots comparing the behavior of the model predictions for each element of the state to the actual ground truth, using randomly sampled actions.
    """
    print('Plotting nn dynamics results')
    rand_cont = RandomController()
    s = state_cb.reset(pub_act, pub_cmd)
    env_state_traj = s
    model_state_traj = s
    steps = 100
    for i in range(steps):
        a = rand_cont.get_action(None)
        # Step environment
        env_s, _ = state_cb.step(a, pub_act, pub_cmd)
        env_state_traj = np.vstack((env_state_traj, env_s))
        # Step model
        if i == 0:
            model_s = dyn_model.predict(model_state_traj, a)
        else:
            model_s = dyn_model.predict(model_state_traj[i, :], a)
        model_state_traj = np.vstack((model_state_traj, model_s))

    body = 10
    # for i in range(body*12,(body+1)*12):
    for i in [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 108, 109, 110]:
        plt.figure()
        env_state = plt.plot(np.arange(steps + 1),
                             env_state_traj[:, i].reshape((steps + 1)),
                             label='env state')
        model_state = plt.plot(np.arange(steps + 1),
                               model_state_traj[:, i].reshape((steps + 1)),
                               label='model state')
        state = i % 12
        if state == 0:
            plt.title('Body ' + str(body) + ', x position')
        elif state == 1:
            plt.title('Body ' + str(body) + ', y position')
        elif state == 2:
            plt.title('Body ' + str(body) + ', z position')
        elif state == 3:
            plt.title('Body ' + str(body) + ', x angle')
        elif state == 4:
            plt.title('Body ' + str(body) + ', y angle')
        elif state == 5:
            plt.title('Body ' + str(body) + ', z angle')
        elif state == 6:
            plt.title('Body ' + str(body) + ', x velocity')
        elif state == 7:
            plt.title('Body ' + str(body) + ', y velocity')
        elif state == 8:
            plt.title('Body ' + str(body) + ', z velocity')
        elif state == 9:
            plt.title('Body ' + str(body) + ', x angular velocity')
        elif state == 10:
            plt.title('Body ' + str(body) + ', y angular velocity')
        elif state == 11:
            plt.title('Body ' + str(body) + ', z angular velocity')
        plt.legend()
        plt.draw()
    plt.show()
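Every example on this page leans on a RandomController helper that is not listed. Below is a minimal sketch of the gym-style variant assumed by Examples 3 through 12; Examples 1 and 2 construct it without an env and pass None to get_action, so that ROS variant would instead have to sample an action vector of a known fixed size.

class RandomController:
    """Minimal random policy: ignores the observation and samples a random action."""

    def __init__(self, env):
        self.env = env

    def get_action(self, state):
        # Uniform sample from the environment's action space (gym Box/Discrete).
        return self.env.action_space.sample()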
Example No. 2
def plot_comparison(dyn_model, pub_act, pub_cmd, rate):
    """
    Write a function to generate plots comparing the behavior of the model predictions for each element of the state to the actual ground truth, using randomly sampled actions.
    """
    rand_cont = RandomController()
    s = reset(pub_cmd, rate)
    env_state_traj = s
    model_state_traj = s
    steps = 100
    for i in range(steps):
        a = rand_cont.get_action(None)
        # Step environment
        env_s, _ = step(a, pub_act, pub_cmd, rate)
        env_state_traj = np.vstack((env_state_traj, env_s))
        # Step model
        if i == 0:
            model_s = dyn_model.predict(model_state_traj, a)
        else:
            model_s = dyn_model.predict(model_state_traj[i, :], a)
        model_state_traj = np.vstack((model_state_traj, model_s))

    for i in range(len(s)):
        plt.figure()
        env_state = plt.plot(np.arange(steps + 1),
                             env_state_traj[:, i].reshape((steps + 1)),
                             label='env state')
        model_state = plt.plot(np.arange(steps + 1),
                               model_state_traj[:, i].reshape((steps + 1)),
                               label='model state')
        plt.title('State ' + str(i))
        plt.legend()
        plt.draw()
    plt.show()
Example No. 3
    def get_test_batch(self,
                       num_tasks,
                       resample=False,
                       task=None,
                       controller='Rand',
                       task_range=(0, 7),
                       task_fun=np.random.randint):

        if controller == 'Rand':
            self.controller = RandomController(self.env)
        elif controller == "MPC":
            self.controller = MPCcontroller(self.env)

        if resample:
            # random sample
            if task is None:
                learner_env_goals = sample_goals(num_tasks, task_range,
                                                 task_fun)
            else:
                learner_env_goals = task

            for i in range(num_tasks):
                task = learner_env_goals[i]
                paths = sample(self.env,
                               task,
                               self.controller,
                               num_paths=self.num_paths_random,
                               horizon=self.env_horizon,
                               ignore_done=True,
                               K=self.K,
                               M=self.M)  # 10
                data_x, data_y = self._data_process(paths)
                data_x = data_x[np.newaxis, :]
                data_y = data_y[np.newaxis, :]

                if i == 0:
                    x = data_x
                    y = data_y
                else:
                    x = np.concatenate([x, data_x], axis=0)
                    y = np.concatenate([y, data_y], axis=0)

        data_x, data_y = [], []
        for t in range(num_tasks):
            for h in range(self.env_horizon):
                data_x.append(x[t, h:(h + self.K + self.M), :])
                data_y.append(y[t, h:(h + self.K + self.M), :])
        data_x = np.array(data_x)
        data_y = np.array(data_y)

        # dataset = tf.data.Dataset.from_tensor_slices((data_x, data_y)).shuffle(
        # 	buffer_size=self.env_horizon * self.num_tasks).batch(
        # 	self.env_horizon).repeat()
        # # create the iterator
        # iter = dataset.make_one_shot_iterator()
        #
        # iterator = iter.get_next()

        return data_x, data_y
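The commented-out block above sketches a tf.data input pipeline that was never wired up. Under the TF1 API used throughout these examples, a working version of that idea might look like the following (buffer and batch sizes are taken from the commented lines and are otherwise assumptions):

import tensorflow as tf

def make_batch_iterator(data_x, data_y, env_horizon, num_tasks):
    # Shuffle all (K+M)-step windows, batch one horizon at a time, repeat indefinitely.
    dataset = (tf.data.Dataset.from_tensor_slices((data_x, data_y))
               .shuffle(buffer_size=env_horizon * num_tasks)
               .batch(env_horizon)
               .repeat())
    iterator = dataset.make_one_shot_iterator()
    return iterator.get_next()  # pair of tensors (x_batch, y_batch)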
Example No. 4
def plot_comparison(env, dyn_model):
    """
    Write a function to generate plots comparing the behavior of the model predictions for each element of the state to the actual ground truth, using randomly sampled actions. 
    """
    """ YOUR CODE HERE """
    random_controller = RandomController(env)
    data = sample(env, random_controller)
    next_pred_state = dyn_model.predict(data['observations'], data['actions'])

    # Compare ground-truth next observations with the model's one-step predictions.
    plt.plot(data['next_observations'])
    plt.plot(next_pred_state)
    plt.show()
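Most of these examples also call a sample(...) helper that is not reproduced here. A rough sketch under the usual conventions (gym-style env, one dictionary per path with the keys used above) is below; note that Example 4 indexes the return value as a single dictionary while Examples 6 and 8 treat it as a list of paths, and this sketch follows the list-of-paths convention.

import numpy as np

def sample(env, controller, num_paths=10, horizon=1000, render=False, verbose=False):
    """Roll out `controller` in `env` and return a list of path dictionaries."""
    paths = []
    for _ in range(num_paths):
        ob = env.reset()
        obs, acs, next_obs, rewards = [], [], [], []
        for _ in range(horizon):
            if render:
                env.render()
            ac = controller.get_action(ob)
            next_ob, rew, done, _ = env.step(ac)
            obs.append(ob)
            acs.append(ac)
            next_obs.append(next_ob)
            rewards.append(rew)
            ob = next_ob
            if done:
                break
        paths.append({
            'observations': np.array(obs),
            'actions': np.array(acs),
            'next_observations': np.array(next_obs),
            'rewards': np.array(rewards),
        })
        if verbose:
            print('path return:', np.sum(rewards))
    return paths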
Example No. 5
def plot_comparison(env, dyn_model):
    """
    Write a function to generate plots comparing the behavior of the model predictions for each element of the state to the actual ground truth, using randomly sampled actions. 
    """
    """ YOUR CODE HERE """
    horizon = 100
    ob = env.reset()
    pred = ob[np.newaxis, :]
    obs, next_obs, acs, rewards = [], [], [], []
    preds = []
    steps = 0
    RC = RandomController(env)
    for _ in range(100):
        obs.append(ob)
        preds.append(pred)
        ac = RC.get_action(ob)
        acs.append(ac)
        ob, rew, done, _ = env.step(ac)
        pred = dyn_model.predict(pred, ac[np.newaxis, :])
        next_obs.append(ob)
        rewards.append(rew)
        steps += 1
        if done or steps > horizon:
            break
    path = {
        "observations": np.array(obs),
        "next_observations": np.array(next_obs),
        "rewards": np.array(rewards),
        "actions": np.array(acs),
        "predictions": np.array(preds)
    }

    print(path['observations'].shape)
    print(path['predictions'].shape)
    plt.plot(path['observations'][:, 0])
    plt.plot(path['predictions'][:, 0, 0])
    plt.show()
    pass
Example No. 6
def plot_comparison(env, dyn_model):
    """
    Write a function to generate plots comparing the behavior of the model predictions for each element of the state to the actual ground truth, using randomly sampled actions. 
    """
    """ YOUR CODE HERE """
    data = sample(env, RandomController(env), num_paths=1)
    pred_states = dyn_model.predict(data[0]['observations'],
                                    data[0]['actions'])
    losses = np.sum((pred_states - data[0]['next_observations'])**2, axis=1)  # per-timestep squared error
    plt.plot(losses)
    plt.ylabel('predicted states squared error')
    plt.xlabel('timestep')
    plt.show()
    pass
Example No. 7
    def get_dataset(self,
                    resample=False,
                    task=None,
                    controller='Rand',
                    task_range=(0, 7),
                    task_fun=np.random.randint):

        if controller == 'Rand':
            self.controller = RandomController(self.env)
        elif controller == "MPC":
            self.controller = MPCcontroller(self.env)

        if resample:
            # random sample
            if task is None:
                learner_env_goals = sample_goals(self.num_tasks, task_range,
                                                 task_fun)
            else:
                learner_env_goals = task

            for i in range(self.num_tasks):
                task = learner_env_goals[i]
                paths = sample(self.env,
                               task,
                               self.controller,
                               num_paths=self.num_paths_random,
                               horizon=self.env_horizon,
                               ignore_done=True,
                               K=self.K,
                               M=self.M)  # 10
                data_x, data_y = self._data_process(paths)
                data_x = data_x[np.newaxis, :]
                data_y = data_y[np.newaxis, :]

                if i == 0:
                    self.x = data_x
                    self.y = data_y
                else:
                    self.x = np.concatenate([self.x, data_x], axis=0)
                    self.y = np.concatenate([self.y, data_y], axis=0)
        # end = time.time()
        # runtime1 = end - start
        # print('time ', runtime1)
        print('env_horizon:', self.env_horizon)
        print('len of x:', len(self.x))
        return len(self.x)
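Examples 3 and 7 sample tasks through sample_goals(num_tasks, task_range, task_fun), with task_fun defaulting to np.random.randint and task_range to (0, 7). Its implementation is not shown; a plausible one-liner consistent with those defaults, offered purely as an illustration:

import numpy as np

def sample_goals(num_tasks, task_range=(0, 7), task_fun=np.random.randint):
    # Draw one task/goal index per task from the given range using the supplied sampler.
    return task_fun(task_range[0], task_range[1], size=num_tasks)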
Example No. 8
def train(env,
          cost_fn,
          logdir=None,
          render=False,
          learning_rate=1e-3,
          onpol_iters=10,
          dynamics_iters=60,
          batch_size=512,
          num_paths_random=10,
          num_paths_onpol=10,
          num_simulated_paths=10000,
          env_horizon=1000,
          mpc_horizon=15,
          n_layers=2,
          size=500,
          activation=tf.nn.relu,
          output_activation=None):
    """

    Arguments:

    onpol_iters                 Number of iterations of onpolicy aggregation
    |_                          for the loop to run.

    dynamics_iters              Number of iterations of training for the
    |                           dynamics model which happen per iteration of
    |_                          the aggregation loop.

    batch_size                  Batch size for dynamics training.

    num_paths_random            Number of paths/trajectories/rollouts generated
    |                           by a random agent. We use these to train our
    |_                          initial dynamics model.

    num_paths_onpol             Number of paths to collect at each iteration of
    |                           aggregation, using the Model Predictive Control
    |_                          policy.

    num_simulated_paths         How many fictitious rollouts the MPC policy
    |                           should generate each time it is asked for an
    |_                          action.

    env_horizon                 Number of timesteps in each path.

    mpc_horizon                 The MPC policy generates actions by imagining
    |                           fictitious rollouts, and picking the first
    |                           action of the best fictitious rollout. This
    |                           argument is how many timesteps should be in
    |_                          each fictitious rollout.

    n_layers/size/activations   Neural network architecture arguments.

    """

    d("env                         = {}".format(env))
    d("env.observation_space       = {}".format(env.observation_space))
    d("env.action_space            = {}".format(env.action_space))
    d("env.observation_space.shape = {}".format(env.observation_space.shape))
    d("env.action_space.shape      = {}".format(env.action_space.shape))
    d("logdir                      = {}".format(logdir))
    d("render                      = {}".format(render))
    d("learning_rate               = {}".format(learning_rate))
    d("onpol_iters                 = {}".format(onpol_iters))
    d("dynamics_iters              = {}".format(dynamics_iters))
    d("batch_size                  = {}".format(batch_size))
    d("num_paths_random            = {}".format(num_paths_random))
    d("num_paths_onpol             = {}".format(num_paths_onpol))
    d("num_simulated_paths         = {}".format(num_simulated_paths))
    d("env_horizon                 = {}".format(env_horizon))
    d("mpc_horizon                 = {}".format(mpc_horizon))
    d("n_layers                    = {}".format(n_layers))
    d("size                        = {}".format(size))

    logz.configure_output_dir(logdir)

    #===========================================================================
    # First, we need a lot of data generated by a random agent, with which
    # we'll begin to train our dynamics model.
    d("Generating random rollouts.")
    random_controller = RandomController(env)
    random_paths = sample(env=env,
                          controller=random_controller,
                          num_paths=num_paths_random,
                          horizon=env_horizon,
                          render=render)
    d("Done generating random rollouts.")

    #===========================================================================
    # The random data will be used to get statistics (mean and std) for the
    # observations, actions, and deltas (where deltas are o_{t+1} - o_t). These
    # will be used for normalizing inputs and denormalizing outputs from the
    # dynamics network.
    d("Normalizing random rollouts.")
    data = paths_to_data(random_paths)
    normalization = compute_normalization(data)
    d("Done normalizing random rollouts.")

    mean_obs, std_obs, mean_deltas, std_deltas, mean_action, std_action = normalization
    d("mean_obs    = {}".format(mean_obs))
    d("std_obs     = {}".format(std_obs))
    d("mean_deltas = {}".format(mean_deltas))
    d("std_deltas  = {}".format(std_deltas))
    d("mean_action = {}".format(mean_action))
    d("std_action  = {}".format(std_action))

    #===========================================================================
    # Build dynamics model and MPC controllers.
    sess = tf.Session()

    dyn_model = NNDynamicsModel(env=env,
                                n_layers=n_layers,
                                size=size,
                                activation=activation,
                                output_activation=output_activation,
                                normalization=normalization,
                                batch_size=batch_size,
                                iterations=dynamics_iters,
                                learning_rate=learning_rate,
                                sess=sess)

    mpc_controller = MPCcontroller(env=env,
                                   dyn_model=dyn_model,
                                   horizon=mpc_horizon,
                                   cost_fn=cost_fn,
                                   num_simulated_paths=num_simulated_paths)

    #===========================================================================
    # Tensorflow session building.
    sess.__enter__()
    tf.global_variables_initializer().run()

    #===========================================================================
    # Take multiple iterations of onpolicy aggregation at each iteration
    # refitting the dynamics model to current dataset and then taking onpolicy
    # samples and aggregating to the dataset.
    #
    # Note: You don't need to use a mixing ratio in this assignment for new and
    # old data as described in https://arxiv.org/abs/1708.02596
    start_time = time.time()
    for itr in range(onpol_iters):
        d("Iteration {}".format(itr))

        # Shuffle data.
        d("Shuffling data.")
        shuffle_indexes = np.random.permutation(data["observations"].shape[0])
        data["observations"] = data["observations"][shuffle_indexes]
        data["actions"] = data["actions"][shuffle_indexes]
        data["next_observations"] = data["next_observations"][shuffle_indexes]
        data["rewards"] = data["rewards"][shuffle_indexes]
        d("Done shuffling data.")

        # Fit the dynamics.
        d("Fitting dynamics.")
        dyn_model.fit(data)
        d("Done fitting dynamics.")

        # Generate on-policy rollouts.
        d("Generating on-policy rollouts.")
        rl_paths = sample(env=env,
                          controller=mpc_controller,
                          num_paths=num_paths_onpol,
                          horizon=env_horizon,
                          render=render)
        d("Done generating on-policy rollouts.")

        # Compute metrics.
        costs = np.array([path_cost(cost_fn, path) for path in rl_paths])
        returns = np.array([sum(path["rewards"]) for path in rl_paths])

        # Update data.
        new_data = paths_to_data(rl_paths)
        data = {
            "observations":
            np.concatenate([data["observations"], new_data["observations"]]),
            "actions":
            np.concatenate([data["actions"], new_data["actions"]]),
            "next_observations":
            np.concatenate(
                [data["next_observations"], new_data["next_observations"]]),
            "rewards":
            np.concatenate([data["rewards"], new_data["rewards"]]),
        }
        # TODO(mwhittaker): Shuffle if we need to.

        # LOGGING
        # Statistics for performance of MPC policy using our learned dynamics
        # model
        logz.log_tabular('Iteration', itr)
        logz.log_tabular('Time', time.time() - start_time)
        # In terms of cost function which your MPC controller uses to plan
        logz.log_tabular('AverageCost', np.mean(costs))
        logz.log_tabular('StdCost', np.std(costs))
        logz.log_tabular('MinimumCost', np.min(costs))
        logz.log_tabular('MaximumCost', np.max(costs))
        # In terms of true environment reward of your rolled out trajectory
        # using the MPC controller
        logz.log_tabular('AverageReturn', np.mean(returns))
        logz.log_tabular('StdReturn', np.std(returns))
        logz.log_tabular('MinimumReturn', np.min(returns))
        logz.log_tabular('MaximumReturn', np.max(returns))

        logz.dump_tabular()
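The comment in Example 8 states exactly what compute_normalization has to produce: means and standard deviations of the observations, the deltas o_{t+1} - o_t, and the actions, in the order unpacked right after the call. A minimal sketch under the dictionary layout that paths_to_data evidently returns follows; the epsilon guard against zero variance is an assumption, and Examples 9-12 pass a buffer object instead of a dict, so the indexing there would differ.

import numpy as np

def compute_normalization(data):
    """Return (mean_obs, std_obs, mean_deltas, std_deltas, mean_action, std_action)."""
    obs = data['observations']
    deltas = data['next_observations'] - obs
    actions = data['actions']

    eps = 1e-8  # assumed guard so later divisions by the std are safe
    return (np.mean(obs, axis=0), np.std(obs, axis=0) + eps,
            np.mean(deltas, axis=0), np.std(deltas, axis=0) + eps,
            np.mean(actions, axis=0), np.std(actions, axis=0) + eps)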
Example No. 9
def train(env,
          cost_fn,
          logdir=None,
          render=False,
          learning_rate=1e-3,
          onpol_iters=10,
          dynamics_iters=60,
          batch_size=512,
          num_paths_random=10,
          num_paths_onpol=10,
          num_simulated_paths=10000,
          env_horizon=1000,
          mpc_horizon=15,
          n_layers=2,
          size=500,
          activation=tf.nn.relu,
          output_activation=None):
    # tracker = SummaryTracker()
    """

    Arguments:

    onpol_iters                 Number of iterations of onpolicy aggregation for the loop to run. 

    dynamics_iters              Number of iterations of training for the dynamics model
    |_                          which happen per iteration of the aggregation loop.

    batch_size                  Batch size for dynamics training.

    num_paths_random            Number of paths/trajectories/rollouts generated 
    |                           by a random agent. We use these to train our 
    |_                          initial dynamics model.
    
    num_paths_onpol             Number of paths to collect at each iteration of
    |_                          aggregation, using the Model Predictive Control policy.

    num_simulated_paths         How many fictitious rollouts the MPC policy
    |                           should generate each time it is asked for an
    |_                          action.

    env_horizon                 Number of timesteps in each path.

    mpc_horizon                 The MPC policy generates actions by imagining 
    |                           fictitious rollouts, and picking the first action
    |                           of the best fictitious rollout. This argument is
    |                           how many timesteps should be in each fictitious
    |_                          rollout.

    n_layers/size/activations   Neural network architecture arguments. 

    """

    logz.configure_output_dir(logdir)

    #========================================================
    #
    # First, we need a lot of data generated by a random
    # agent, with which we'll begin to train our dynamics
    # model.
    """ YOUR CODE HERE """

    # Print env info
    print("-------- env info --------")
    print("observation_space: ", env.observation_space.shape)
    print("action_space: ", env.action_space.shape)
    print(" ")

    random_controller = RandomController(env)
    data_buffer = DataBuffer_withreward()
    bc_data_buffer = DataBuffer_SA(BC_BUFFER_SIZE)

    # sample path
    print("collecting random data .....  ")
    paths = sample(env,
                   random_controller,
                   num_paths=num_paths_random,
                   horizon=env_horizon,
                   render=False,
                   verbose=False)

    # add into buffer
    for path in paths:
        for n in range(len(path['observations'])):
            data_buffer.add(path['observations'][n], path['actions'][n],
                            path['rewards'][n], path['next_observations'][n])

    #========================================================
    #
    # The random data will be used to get statistics (mean
    # and std) for the observations, actions, and deltas
    # (where deltas are o_{t+1} - o_t). These will be used
    # for normalizing inputs and denormalizing outputs
    # from the dynamics network.
    #
    print("data buffer size: ", data_buffer.size)

    normalization = compute_normalization(data_buffer)

    #========================================================
    #
    # Build dynamics model and MPC controllers and Behavioral cloning network.
    #
    sess = tf.Session()

    dyn_model = NNDynamicsModel(env=env,
                                n_layers=n_layers,
                                size=size,
                                activation=activation,
                                output_activation=output_activation,
                                normalization=normalization,
                                batch_size=batch_size,
                                iterations=dynamics_iters,
                                learning_rate=learning_rate,
                                sess=sess)

    mpc_controller = MPCcontroller_learned_reward(
        env=env,
        dyn_model=dyn_model,
        horizon=mpc_horizon,
        cost_fn=cost_fn,
        num_simulated_paths=num_simulated_paths)

    bc_net = BCnetwork(sess, env, BATCH_SIZE_BC, learning_rate)

    mpc_controller_bc = MPCcontroller_BC_learned_reward(
        env=env,
        dyn_model=dyn_model,
        bc_network=bc_net,
        horizon=mpc_horizon,
        cost_fn=cost_fn,
        num_simulated_paths=num_simulated_paths)

    #========================================================
    #
    # Tensorflow session building.
    #
    sess.__enter__()
    tf.global_variables_initializer().run()

    if LOAD_MODEL:
        # init or load checkpoint with saver
        saver = tf.train.Saver()

        checkpoint = tf.train.get_checkpoint_state(CHECKPOINT_DIR)

        if checkpoint and checkpoint.model_checkpoint_path and LOAD_MODEL:
            saver.restore(sess, checkpoint.model_checkpoint_path)
            print("checkpoint loaded:", checkpoint.model_checkpoint_path)
        else:
            print("Could not find old checkpoint")
            if not os.path.exists(CHECKPOINT_DIR):
                os.mkdir(CHECKPOINT_DIR)
    #========================================================
    #
    # Take multiple iterations of onpolicy aggregation at each iteration refitting the dynamics model to current dataset and then taking onpolicy samples and aggregating to the dataset.
    # Note: You don't need to use a mixing ratio in this assignment for new and old data as described in https://arxiv.org/abs/1708.02596
    #

    for itr in range(onpol_iters):
        """ YOUR CODE HERE """
        print("onpol_iters: ", itr)

        dyn_model.fit(data_buffer)

        if LOAD_MODEL:
            saver.save(sess, CHECKPOINT_DIR)

        returns = []
        costs = []

        for w in range(num_paths_onpol):
            print("paths_onpol: ", w, " running.....")
            print("data buffer size: ", data_buffer.size)

            st = env.reset_model()
            path = {
                'observations': [],
                'actions': [],
                'rewards': [],
                'next_observations': []
            }
            # tracker.print_diff()

            return_ = 0

            for i in range(env_horizon):
                if render:
                    env.render()
                # print("env_horizon: ", i)

                if BEHAVIORAL_CLONING:
                    if bc_data_buffer.size > 2000:
                        at = mpc_controller_bc.get_action(st)
                    else:
                        at = mpc_controller.get_action(st)
                else:
                    at = mpc_controller.get_action(st)
                    # at = random_controller.get_action(st)

                st_next, env_reward, _, _ = env._step(at)
                path['observations'].append(st)
                path['actions'].append(at)
                path['rewards'].append(env_reward)

                path['next_observations'].append(st_next)
                st = st_next
                return_ += env_reward

            # cost & return
            # cost = path_cost(cost_fn, path)
            # costs.append(cost)
            returns.append(return_)
            print("total return: ", return_)
            # print("costs: ", cost)

            # add into buffers
            for n in range(len(path['observations'])):
                data_buffer.add(path['observations'][n], path['actions'][n],
                                path['rewards'][n],
                                path['next_observations'][n])
                bc_data_buffer.add(path['observations'][n], path['actions'][n])

        if BEHAVIORAL_CLONING:
            bc_returns = behavioral_cloning(sess,
                                            env,
                                            bc_net,
                                            mpc_controller,
                                            env_horizon,
                                            bc_data_buffer,
                                            Training_epoch=1000)

        # LOGGING
        # Statistics for performance of MPC policy using
        # our learned dynamics model
        logz.log_tabular('Iteration', itr)

        # In terms of cost function which your MPC controller uses to plan
        if BEHAVIORAL_CLONING:
            logz.log_tabular('Average_BC_Return', np.mean(bc_returns))
            logz.log_tabular('Std_BC_Return', np.std(bc_returns))
            logz.log_tabular('Minimum_BC_Return', np.min(bc_returns))
            logz.log_tabular('Maximum_BC_Return', np.max(bc_returns))
        # In terms of true environment reward of your rolled out trajectory using the MPC controller
        logz.log_tabular('AverageReturn', np.mean(returns))
        logz.log_tabular('StdReturn', np.std(returns))
        logz.log_tabular('MinimumReturn', np.min(returns))
        logz.log_tabular('MaximumReturn', np.max(returns))

        logz.dump_tabular()
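Example 9 pushes transitions into a DataBuffer_withreward and a DataBuffer_SA whose definitions are not included. A bare-bones sketch of the reward-carrying buffer with the add(obs, action, reward, next_obs) / size interface used above (the field names and the uniform sample method are assumptions):

import numpy as np

class DataBuffer_withreward:
    """Grows without bound; stores (obs, action, reward, next_obs) transitions."""

    def __init__(self):
        self.observations, self.actions = [], []
        self.rewards, self.next_observations = [], []
        self.size = 0

    def add(self, obs, action, reward, next_obs):
        self.observations.append(obs)
        self.actions.append(action)
        self.rewards.append(reward)
        self.next_observations.append(next_obs)
        self.size += 1

    def sample(self, batch_size):
        # Uniformly sample a batch of stored transitions as stacked arrays.
        idx = np.random.randint(0, self.size, size=batch_size)
        return (np.array(self.observations)[idx], np.array(self.actions)[idx],
                np.array(self.rewards)[idx], np.array(self.next_observations)[idx])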
Example No. 10
def train(env, 
         cost_fn,
         logdir=None,
         render=False,
         learning_rate=1e-3,
         onpol_iters=10,
         dynamics_iters=60,
         batch_size=512,
         num_paths_random=10, 
         num_paths_onpol=10, 
         num_simulated_paths=10000,
         env_horizon=1000, 
         mpc_horizon=15,
         n_layers=2,
         size=500,
         activation=tf.nn.relu,
         output_activation=None,
         clip_param=0.2 , 
         entcoeff=0.0,
         gamma=0.99,
         lam=0.95,
         optim_epochs=10,
         optim_batchsize=64,
         schedule='linear',
         bc_lr=1e-3,
         ppo_lr=3e-4,
         timesteps_per_actorbatch=1000,
         MPC = True,
         BEHAVIORAL_CLONING = True,
         PPO = True,
         ):

    start = time.time()

    logz.configure_output_dir(logdir)


    print("-------- env info --------")
    print("observation_space: ", env.observation_space.shape)
    print("action_space: ", env.action_space.shape)
    print("BEHAVIORAL_CLONING: ", BEHAVIORAL_CLONING)
    print("PPO: ", PPO)
    print("MPC-AUG: ", MPC)
    print(" ")


    # initialize buffers
    model_data_buffer = DataBufferGeneral(1000000, 5)
    ppo_data_buffer = DataBufferGeneral(10000, 4)
    bc_data_buffer = DataBufferGeneral(BC_BUFFER_SIZE, 2)

    # random sample path
    print("collecting random data .....  ")
    random_controller = RandomController(env)
    paths = sample(env, 
               random_controller, 
               num_paths=num_paths_random, 
               horizon=env_horizon, 
               render=False,
               verbose=False)

    # add into buffer
    for path in paths:
        for n in range(len(path['observations'])):
            model_data_buffer.add([path['observations'][n],
                                 path['actions'][n], 
                                 path['rewards'][n], 
                                 path['next_observations'][n], 
                                 path['next_observations'][n] - path['observations'][n]])


    print("model data buffer size: ", model_data_buffer.size)

    normalization = compute_normalization(model_data_buffer)

    #========================================================
    # 
    # Build dynamics model and MPC controllers and Behavioral cloning network.
    # 
    # tf_config = tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1) 
    tf_config = tf.ConfigProto() 

    tf_config.gpu_options.allow_growth = True

    sess = tf.Session(config=tf_config)

    dyn_model = NNDynamicsRewardModel(env=env, 
                                    normalization=normalization,
                                    batch_size=batch_size,
                                    iterations=dynamics_iters,
                                    learning_rate=learning_rate,
                                    sess=sess)

    mpc_controller = MPCcontroller(env=env, 
                                   dyn_model=dyn_model, 
                                   horizon=mpc_horizon, 
                                   cost_fn=cost_fn, 
                                   num_simulated_paths=num_simulated_paths)

    policy_nn = MlpPolicy(sess=sess, env=env, hid_size=256, num_hid_layers=2, clip_param=clip_param , entcoeff=entcoeff)

    mpc_ppo_controller = MPCcontrollerPolicyNetReward(env=env, 
                                   dyn_model=dyn_model, 
                                   policy_net=policy_nn,
                                   self_exp=False,
                                   horizon=mpc_horizon, 
                                   num_simulated_paths=num_simulated_paths)



    #========================================================
    # 
    # Tensorflow session building.
    # 
    sess.__enter__()
    tf.global_variables_initializer().run()

    # init or load checkpoint with saver
    saver = tf.train.Saver()

    checkpoint = tf.train.get_checkpoint_state(CHECKPOINT_DIR)

    if checkpoint and checkpoint.model_checkpoint_path and LOAD_MODEL:
        saver.restore(sess, checkpoint.model_checkpoint_path)
        print("checkpoint loaded:", checkpoint.model_checkpoint_path)
    else:
        print("Could not find old checkpoint")
        if not os.path.exists(CHECKPOINT_DIR):
          os.mkdir(CHECKPOINT_DIR)  

    #========================================================
    # 
    # Prepare for rollouts
    # 

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=100) # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=100) # rolling buffer for episode rewards
    max_timesteps = num_paths_onpol * env_horizon
    bc = False
    ppo_mpc = False
    mpc_returns = 0

    for itr in range(onpol_iters):

        print(" ")

        print("onpol_iters: ", itr)

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult =  max(1.0 - float(timesteps_so_far) / max_timesteps, 0)
            

        print("bc learning_rate: ",  bc_lr)
        print("ppo learning_rate: ",  ppo_lr)


        ################## fit mpc model
        if MPC:
            dyn_model.fit(model_data_buffer)


        ################## ppo seg data
        if PPO:
            ppo_data_buffer.clear()

            # ppo_seg = traj_segment_generator_ppo(policy_nn, env, env_horizon)
            mpc = False
            ppo_seg = traj_segment_generator(policy_nn, mpc_controller, mpc_ppo_controller, bc_data_buffer, env, mpc, ppo_mpc, env_horizon)

            add_vtarg_and_adv(ppo_seg, gamma, lam)

            ob, ac, rew, nxt_ob, atarg, tdlamret = \
            ppo_seg["ob"], ppo_seg["ac"], ppo_seg["rew"], ppo_seg["nxt_ob"], ppo_seg["adv"], ppo_seg["tdlamret"]

            atarg = (atarg - atarg.mean()) / atarg.std() # standardized advantage function estimate

            # add into buffer
            for n in range(len(ob)):
                ppo_data_buffer.add([ob[n], ac[n], atarg[n], tdlamret[n]])

                if MPC:
                    model_data_buffer.add([ob[n], ac[n], rew[n], nxt_ob[n], nxt_ob[n]-ob[n]])


        ################## mpc augmented seg data

        if itr % MPC_AUG_GAP == 0 and MPC:
            print("MPC AUG PPO")

            ppo_mpc = True
            mpc = True
            mpc_seg = traj_segment_generator(policy_nn, mpc_controller, mpc_ppo_controller, bc_data_buffer, env, mpc, ppo_mpc, env_horizon)
            add_vtarg_and_adv(mpc_seg, gamma, lam)

            ob, ac, mpcac, rew, nxt_ob, atarg, tdlamret = mpc_seg["ob"], mpc_seg["ac"], mpc_seg["mpcac"], mpc_seg["rew"], mpc_seg["nxt_ob"], mpc_seg["adv"], mpc_seg["tdlamret"]
            atarg = (atarg - atarg.mean()) / atarg.std() # standardized advantage function estimate

            # add into buffer
            for n in range(len(ob)):
                # if PPO:
                #     ppo_data_buffer.add([ob[n], ac[n], atarg[n], tdlamret[n]])

                if BEHAVIORAL_CLONING and bc:
                    bc_data_buffer.add([ob[n], mpcac[n]])

                if MPC:
                    model_data_buffer.add([ob[n], mpcac[n], rew[n], nxt_ob[n], nxt_ob[n]-ob[n]])

            mpc_returns = mpc_seg["ep_rets"]

        seg = ppo_seg

        # check if seg is good
        ep_lengths = seg["ep_lens"]
        returns =  seg["ep_rets"]

        # saver.save(sess, CHECKPOINT_DIR)
        if BEHAVIORAL_CLONING:
            if np.mean(returns) > 100:
                bc = True
            else:
                bc = False

            print("BEHAVIORAL_CLONING: ", bc)


            bc_return = behavioral_cloning_eval(sess, env, policy_nn, env_horizon)

            if bc_return > 100:
                ppo_mpc = True
            else:
                ppo_mpc = False


        ################## optimization

        print("ppo_data_buffer size", ppo_data_buffer.size)
        print("bc_data_buffer size", bc_data_buffer.size)
        print("model data buffer size: ", model_data_buffer.size)

        # optim_batchsize = optim_batchsize or ob.shape[0]

        if hasattr(policy_nn, "ob_rms"): policy_nn.ob_rms.update(ob) # update running mean/std for policy
        policy_nn.assign_old_eq_new() # set old parameter values to new parameter values
        
        for op_ep in range(optim_epochs):
            # losses = [] # list of tuples, each of which gives the loss for a minibatch
            # for i in range(int(timesteps_per_actorbatch/optim_batchsize)):

            if PPO:
                sample_ob_no, sample_ac_na, sample_adv_n, sample_b_n_target = ppo_data_buffer.sample(optim_batchsize)
                newlosses = policy_nn.lossandupdate_ppo(sample_ob_no, sample_ac_na, sample_adv_n, sample_b_n_target, cur_lrmult, ppo_lr*cur_lrmult)
                # losses.append(newlosses)

            if BEHAVIORAL_CLONING and bc:
                sample_ob_no, sample_ac_na = bc_data_buffer.sample(optim_batchsize)
                # print("sample_ob_no", sample_ob_no.shape)
                # print("sample_ac_na", sample_ac_na.shape)

                policy_nn.update_bc(sample_ob_no, sample_ac_na, bc_lr*cur_lrmult)

            if op_ep % (100) == 0 and BEHAVIORAL_CLONING and bc:
                print('epoch: ', op_ep)
                behavioral_cloning_eval(sess, env, policy_nn, env_horizon)


        ################## print and save data

        lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values


        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1



        # if np.mean(returns) > 1000:
        #     filename = "seg_data.pkl"
        #     pickle.dump(seg, open(filename, 'wb'))
        #     print("saved", filename)


        logz.log_tabular("TimeSoFar", time.time() - start)
        logz.log_tabular("TimeEp", time.time() - tstart)
        logz.log_tabular("Iteration", iters_so_far)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("MpcReturn", np.mean(mpc_returns))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        # logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", timesteps_so_far)
        logz.dump_tabular()
        logz.pickle_tf_vars()
        tstart = time.time()
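add_vtarg_and_adv(seg, gamma, lam) fills in the "adv" and "tdlamret" fields that Examples 10-12 immediately read back out. The standard baselines-style computation of GAE(lambda) advantages and TD(lambda) return targets looks roughly like the following; it assumes the trajectory segment also carries "new" (episode-boundary flags), "vpred", and "nextvpred", which the segment generator would have to provide.

import numpy as np

def add_vtarg_and_adv(seg, gamma, lam):
    """Add GAE(lambda) advantages ("adv") and TD(lambda) targets ("tdlamret") to seg."""
    new = np.append(seg["new"], 0)                     # 1 where a new episode starts
    vpred = np.append(seg["vpred"], seg["nextvpred"])  # bootstrap with the value after the segment
    T = len(seg["rew"])
    seg["adv"] = gaelam = np.empty(T, dtype=np.float32)
    lastgaelam = 0.0
    for t in reversed(range(T)):
        nonterminal = 1 - new[t + 1]
        delta = seg["rew"][t] + gamma * vpred[t + 1] * nonterminal - vpred[t]
        gaelam[t] = lastgaelam = delta + gamma * lam * nonterminal * lastgaelam
    seg["tdlamret"] = seg["adv"] + seg["vpred"]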
Example No. 11
def train(
    env,
    cost_fn,
    logdir=None,
    render=False,
    learning_rate=1e-3,
    onpol_iters=10,
    dynamics_iters=60,
    batch_size=512,
    num_paths_random=10,
    num_paths_onpol=10,
    num_simulated_paths=10000,
    env_horizon=1000,
    mpc_horizon=15,
    n_layers=2,
    size=500,
    activation=tf.nn.relu,
    output_activation=None,
    clip_param=0.2,
    entcoeff=0.0,
    gamma=0.99,
    lam=0.95,
    optim_epochs=10,
    optim_batchsize=64,
    schedule='linear',
    bc_lr=1e-3,
    ppo_lr=3e-4,
    timesteps_per_actorbatch=1000,
    MPC=True,
    BEHAVIORAL_CLONING=True,
    PPO=True,
):

    start = time.time()

    logz.configure_output_dir(logdir)
    merged_summary, summary_writer, ppo_return_op, mpc_return_op, model_loss_op, reward_loss_op, ppo_std_op, mpc_std_op = build_summary_ops(
        logdir, env)

    print("-------- env info --------")
    print("Environment: ", FLAGS.env_name)
    print("observation_space: ", env.observation_space.shape)
    print("action_space: ", env.action_space.shape)
    print("action_space low: ", env.action_space.low)
    print("action_space high: ", env.action_space.high)

    print("BEHAVIORAL_CLONING: ", BEHAVIORAL_CLONING)
    print("PPO: ", PPO)
    print("MPC-AUG: ", MPC)

    print(" ")

    random_controller = RandomController(env)

    # Creat buffers
    model_data_buffer = DataBufferGeneral(FLAGS.MODELBUFFER_SIZE, 5)
    ppo_data_buffer = DataBufferGeneral(10000, 4)
    bc_data_buffer = DataBufferGeneral(2000, 2)

    # Random sample path

    print("collecting random data .....  ")
    paths = sample(env,
                   random_controller,
                   num_paths=num_paths_random,
                   horizon=env_horizon,
                   render=False,
                   verbose=False)

    # add into buffer
    for path in paths:
        for n in range(len(path['observations'])):
            model_data_buffer.add([
                path['observations'][n], path['actions'][n],
                path['rewards'][n], path['next_observations'][n],
                path['next_observations'][n] - path['observations'][n]
            ])

    print("model data buffer size: ", model_data_buffer.size)

    normalization = compute_normalization(model_data_buffer)

    #========================================================
    #
    # Build dynamics model and MPC controllers and Behavioral cloning network.
    #
    # tf_config = tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1)

    tf_config = tf.ConfigProto()

    tf_config.gpu_options.allow_growth = True

    sess = tf.Session(config=tf_config)

    policy_nn = MlpPolicy(sess=sess,
                          env=env,
                          hid_size=128,
                          num_hid_layers=2,
                          clip_param=clip_param,
                          entcoeff=entcoeff)

    if FLAGS.LEARN_REWARD:
        print("Learn reward function")
        dyn_model = NNDynamicsRewardModel(env=env,
                                          normalization=normalization,
                                          batch_size=batch_size,
                                          iterations=dynamics_iters,
                                          learning_rate=learning_rate,
                                          sess=sess)

        mpc_ppo_controller = MPCcontrollerPolicyNetReward(
            env=env,
            dyn_model=dyn_model,
            explore=FLAGS.MPC_EXP,
            policy_net=policy_nn,
            self_exp=FLAGS.SELFEXP,
            horizon=mpc_horizon,
            num_simulated_paths=num_simulated_paths)
    else:
        print("Use predefined cost function")
        dyn_model = NNDynamicsModel(env=env,
                                    n_layers=n_layers,
                                    size=size,
                                    activation=activation,
                                    output_activation=output_activation,
                                    normalization=normalization,
                                    batch_size=batch_size,
                                    iterations=dynamics_iters,
                                    learning_rate=learning_rate,
                                    sess=sess)

        mpc_ppo_controller = MPCcontrollerPolicyNet(
            env=env,
            dyn_model=dyn_model,
            explore=FLAGS.MPC_EXP,
            policy_net=policy_nn,
            self_exp=FLAGS.SELFEXP,
            horizon=mpc_horizon,
            cost_fn=cost_fn,
            num_simulated_paths=num_simulated_paths)

    mpc_controller = MPCcontroller(env=env,
                                   dyn_model=dyn_model,
                                   horizon=mpc_horizon,
                                   cost_fn=cost_fn,
                                   num_simulated_paths=num_simulated_paths)
    # if not PPO:
    #     mpc_ppo_controller = mpc_controller

    #========================================================
    #
    # Tensorflow session building.
    #
    sess.__enter__()
    tf.global_variables_initializer().run()

    # init or load checkpoint with saver
    saver = tf.train.Saver()

    checkpoint = tf.train.get_checkpoint_state(logdir)

    if checkpoint and checkpoint.model_checkpoint_path and FLAGS.LOAD_MODEL:
        saver.restore(sess, checkpoint.model_checkpoint_path)
        print("checkpoint loaded:", checkpoint.model_checkpoint_path)
    else:
        print("Could not find old checkpoint")
        if not os.path.exists(logdir):
            os.mkdir(logdir)

    #========================================================
    #
    # Prepare for rollouts
    #

    episodes_so_far = 0
    timesteps_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=100)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=100)  # rolling buffer for episode rewards
    max_timesteps = num_paths_onpol * env_horizon
    bc = False
    ppo_mpc = False
    mpc_returns = 0
    model_loss = 0
    for itr in range(onpol_iters):

        print(" ")

        print("onpol_iters: ", itr)

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0)

        print("bc learning_rate: ", bc_lr)
        print("ppo learning_rate: ", ppo_lr)

        ################## fit mpc model
        if MPC:
            model_loss, reward_loss = dyn_model.fit(model_data_buffer)

        ################## ppo seg data
        ppo_data_buffer.clear()

        # ppo_seg = traj_segment_generator_ppo(policy_nn, env, env_horizon)
        ppo_mpc = False
        mpc = False
        ppo_seg = traj_segment_generator(policy_nn, mpc_controller,
                                         mpc_ppo_controller, bc_data_buffer,
                                         env, mpc, ppo_mpc, env_horizon)

        add_vtarg_and_adv(ppo_seg, gamma, lam)

        ob, ac, rew, nxt_ob, atarg, tdlamret = \
        ppo_seg["ob"], ppo_seg["ac"], ppo_seg["rew"], ppo_seg["nxt_ob"], ppo_seg["adv"], ppo_seg["tdlamret"]

        atarg = (atarg - atarg.mean()
                 ) / atarg.std()  # standardized advantage function estimate

        # add into buffer
        for n in range(len(ob)):
            ppo_data_buffer.add([ob[n], ac[n], atarg[n], tdlamret[n]])
            model_data_buffer.add(
                [ob[n], ac[n], rew[n], nxt_ob[n], nxt_ob[n] - ob[n]])

        ppo_std = np.std(ac, axis=0)
        print("ppo_std: ", ppo_std)

        ################## mpc augmented seg data

        if MPC:
            print("MPC AUG PPO")

            ppo_mpc = True
            mpc = True
            mpc_seg = traj_segment_generator(policy_nn, mpc_controller,
                                             mpc_ppo_controller,
                                             bc_data_buffer, env, mpc, ppo_mpc,
                                             env_horizon)
            add_vtarg_and_adv(mpc_seg, gamma, lam)

            ob, ac, mpcac, rew, nxt_ob, atarg, tdlamret = mpc_seg[
                "ob"], mpc_seg["ac"], mpc_seg["mpcac"], mpc_seg[
                    "rew"], mpc_seg["nxt_ob"], mpc_seg["adv"], mpc_seg[
                        "tdlamret"]
            atarg = (atarg - atarg.mean()) / atarg.std(
            )  # standardized advantage function estimate

            mpc_returns = mpc_seg["ep_rets"]
            mpc_std = np.std(mpcac)

        if not MPC:
            mpc_std = 0

        ################## mpc random seg data

        if FLAGS.mpc_rand:
            print("MPC Random base policy")

            ppo_mpc = False
            mpc = True
            mpc_random_seg = traj_segment_generator(policy_nn, mpc_controller,
                                                    mpc_ppo_controller,
                                                    bc_data_buffer, env, mpc,
                                                    ppo_mpc, env_horizon)
            add_vtarg_and_adv(mpc_random_seg, gamma, lam)

            ob, ac, mpcac, rew, nxt_ob, atarg, tdlamret = mpc_random_seg[
                "ob"], mpc_random_seg["ac"], mpc_random_seg[
                    "mpcac"], mpc_random_seg["rew"], mpc_random_seg[
                        "nxt_ob"], mpc_random_seg["adv"], mpc_random_seg[
                            "tdlamret"]
            atarg = (atarg - atarg.mean()) / atarg.std(
            )  # standardized advantage function estimate

            mpc_rand_returns = mpc_random_seg["ep_rets"]

        ################# PPO deterministic evaluation
        ppo_deterministic_return = policy_net_eval(sess,
                                                   env,
                                                   policy_nn,
                                                   env_horizon,
                                                   stochastic=False)

        ################## optimization

        print("ppo_data_buffer size", ppo_data_buffer.size)
        print("bc_data_buffer size", bc_data_buffer.size)
        print("model data buffer size: ", model_data_buffer.size)

        # optim_batchsize = optim_batchsize or ob.shape[0]

        if hasattr(policy_nn, "ob_rms"):
            policy_nn.ob_rms.update(ob)  # update running mean/std for policy
        policy_nn.assign_old_eq_new(
        )  # set old parameter values to new parameter values

        for op_ep in range(optim_epochs):
            # losses = [] # list of tuples, each of which gives the loss for a minibatch
            # for i in range(int(timesteps_per_actorbatch/optim_batchsize)):

            if PPO:
                sample_ob_no, sample_ac_na, sample_adv_n, sample_b_n_target = ppo_data_buffer.sample(
                    optim_batchsize)
                newlosses = policy_nn.lossandupdate_ppo(
                    sample_ob_no, sample_ac_na, sample_adv_n,
                    sample_b_n_target, cur_lrmult, ppo_lr * cur_lrmult)
                # losses.append(newlosses)

            if BEHAVIORAL_CLONING and bc:
                sample_ob_no, sample_ac_na = bc_data_buffer.sample(
                    optim_batchsize)
                # print("sample_ob_no", sample_ob_no.shape)
                # print("sample_ac_na", sample_ac_na.shape)

                policy_nn.update_bc(sample_ob_no, sample_ac_na,
                                    bc_lr * cur_lrmult)

            if op_ep % (100) == 0 and BEHAVIORAL_CLONING and bc:
                print('epoch: ', op_ep)
                policy_net_eval(sess, env, policy_nn, env_horizon)

        ################## print and save data
        seg = ppo_seg

        ep_lengths = seg["ep_lens"]
        returns = seg["ep_rets"]

        lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values

        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)

        # log ppo
        logz.log_tabular("TimeSoFar", time.time() - start)
        logz.log_tabular("TimeEp", time.time() - tstart)
        logz.log_tabular("Iteration", itr)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        logz.log_tabular("TimestepsSoFar", timesteps_so_far)
        logz.log_tabular("Condition", "PPO")
        logz.dump_tabular()

        # log ppo deterministic
        logz.log_tabular("Iteration", itr)
        logz.log_tabular("AverageReturn", ppo_determinisitc_return)
        logz.log_tabular("Condition", "PPO_DETERMINISTIC")
        logz.dump_tabular()

        # log mpc
        if MPC:
            logz.log_tabular("TimeSoFar", time.time() - start)
            logz.log_tabular("TimeEp", time.time() - tstart)
            logz.log_tabular("Iteration", itr)
            logz.log_tabular("AverageReturn", np.mean(mpc_returns))
            logz.log_tabular("StdReturn", np.std(mpc_returns))
            logz.log_tabular("MaxReturn", np.max(mpc_returns))
            logz.log_tabular("MinReturn", np.min(mpc_returns))
            logz.log_tabular("EpLenMean", np.mean(ep_lengths))
            logz.log_tabular("EpLenStd", np.std(ep_lengths))
            logz.log_tabular("TimestepsSoFar", timesteps_so_far)
            logz.log_tabular("Condition", "MPC_PPO")
            logz.dump_tabular()

        if FLAGS.mpc_rand:
            logz.log_tabular("TimeSoFar", time.time() - start)
            logz.log_tabular("TimeEp", time.time() - tstart)
            logz.log_tabular("Iteration", itr)
            logz.log_tabular("AverageReturn", np.mean(mpc_rand_returns))
            logz.log_tabular("StdReturn", np.std(mpc_rand_returns))
            logz.log_tabular("MaxReturn", np.max(mpc_rand_returns))
            logz.log_tabular("MinReturn", np.min(mpc_rand_returns))
            logz.log_tabular("EpLenMean", np.mean(ep_lengths))
            logz.log_tabular("EpLenStd", np.std(ep_lengths))
            logz.log_tabular("TimestepsSoFar", timesteps_so_far)
            logz.log_tabular("Condition", "MPC_RAND")
            logz.dump_tabular()

        # logz.pickle_tf_vars()
        tstart = time.time()

        ################### TF Summaries
        summary_str = sess.run(merged_summary,
                               feed_dict={
                                   ppo_return_op: np.mean(returns),
                                   mpc_return_op: np.mean(mpc_returns),
                                   model_loss_op: model_loss,
                                   ppo_std_op: ppo_std,
                                   reward_loss_op: reward_loss,
                                   mpc_std_op: mpc_std,
                               })
        summary_writer.add_summary(summary_str, itr)
        summary_writer.flush()

        ################ TF SAVE
        if itr % FLAGS.SAVE_ITER == 0 and itr != 0:
            save_path = saver.save(sess, logdir + "/model.ckpt")
            print("Model saved in path: %s" % save_path)
Example No. 12
def train(
    env,
    cost_fn,
    logdir=None,
    render=False,
    learning_rate=1e-3,
    onpol_iters=10,
    dynamics_iters=60,
    batch_size=512,
    num_paths_random=10,
    num_paths_onpol=10,
    num_simulated_paths=1000,
    env_horizon=1000,
    mpc_horizon=15,
    n_layers=2,
    size=500,
    activation=tf.nn.relu,
    output_activation=None,
):
    """

    Arguments:

    onpol_iters                 Number of iterations of onpolicy aggregation for the loop to run. 

    dynamics_iters              Number of iterations of training for the dynamics model
    |_                          which happen per iteration of the aggregation loop.

    batch_size                  Batch size for dynamics training.

    num_paths_random            Number of paths/trajectories/rollouts generated 
    |                           by a random agent. We use these to train our 
    |_                          initial dynamics model.
    
    num_paths_onpol             Number of paths to collect at each iteration of
    |_                          aggregation, using the Model Predictive Control policy.

    num_simulated_paths         How many fictitious rollouts the MPC policy
    |                           should generate each time it is asked for an
    |_                          action.

    env_horizon                 Number of timesteps in each path.

    mpc_horizon                 The MPC policy generates actions by imagining 
    |                           fictitious rollouts, and picking the first action
    |                           of the best fictitious rollout. This argument is
    |                           how many timesteps should be in each fictitious
    |_                          rollout.

    n_layers/size/activations   Neural network architecture arguments. 

    """

    logz.configure_output_dir(logdir)

    #========================================================
    #
    # First, we need a lot of data generated by a random
    # agent, with which we'll begin to train our dynamics
    # model.

    random_controller = RandomController(env)
    """ YOUR CODE HERE """
    paths_rand = sample(env,
                        random_controller,
                        num_paths=num_paths_random,
                        horizon=env_horizon,
                        render=render,
                        verbose=False)
    #========================================================
    #
    # The random data will be used to get statistics (mean
    # and std) for the observations, actions, and deltas
    # (where deltas are o_{t+1} - o_t). These will be used
    # for normalizing inputs and denormalizing outputs
    # from the dynamics network.
    #
    """ YOUR CODE HERE """
    normalization = compute_normalization(paths_rand)

    gamma = 0.99

    #========================================================
    #
    # Build dynamics model and MPC controllers.
    #
    sess = tf.Session()

    dyn_model = NNDynamicsModel(env=env,
                                n_layers=n_layers,
                                size=size,
                                activation=activation,
                                output_activation=output_activation,
                                normalization=normalization,
                                batch_size=batch_size,
                                iterations=dynamics_iters,
                                learning_rate=learning_rate,
                                sess=sess)

    mpc_controller = MPCcontroller(env=env,
                                   dyn_model=dyn_model,
                                   horizon=mpc_horizon,
                                   cost_fn=cost_fn,
                                   num_simulated_paths=num_simulated_paths)

    #========================================================
    #
    # Tensorflow session building.
    #
    sess.__enter__()
    tf.global_variables_initializer().run()

    #========================================================
    #
    # Take multiple iterations of onpolicy aggregation, at each iteration
    # refitting the dynamics model to the current dataset and then taking
    # onpolicy samples and aggregating them into the dataset.
    # Note: You don't need to use a mixing ratio in this assignment for new
    # and old data as described in https://arxiv.org/abs/1708.02596
    #
    # Pre-fit the dynamics model before the on-policy DAgger loop:
    print("****** Pretraining dynamics model *******")
    losses = []
    obs_rand = np.concatenate([path["observation"] for path in paths_rand])
    action_rand = np.concatenate([path["action"] for path in paths_rand])
    next_ob_rand = np.concatenate([path["obs_next"] for path in paths_rand])
    data_size_rand = obs_rand.shape[0]
    for i in range(1000):
        # obtain batch size from random policy
        batch_idx_rand = np.random.randint(data_size_rand, size=batch_size)
        batch_ob_rand = obs_rand[batch_idx_rand, :]
        batch_ac_rand = action_rand[batch_idx_rand, :]
        batch_nxt_rand = next_ob_rand[batch_idx_rand, :]
        # obtain batch size from on policy
        batch_ob = np.copy(batch_ob_rand)
        batch_ac = np.copy(batch_ac_rand)
        batch_nxt = np.copy(batch_nxt_rand)
        loss = dyn_model.fit(batch_ob, batch_ac, batch_nxt)
        losses.append(loss)
        if (i % 20 == 0):
            print('loss', loss)

    costs = []
    returns = []
    paths_rl = []
    for itr in range(onpol_iters):
        """ YOUR CODE HERE """
        # fit dynamic model
        if itr > 0:
            obs_rl = np.concatenate([path["observation"] for path in paths_rl])
            action_rl = np.concatenate([path["action"] for path in paths_rl])
            next_ob_rl = np.concatenate(
                [path["obs_next"] for path in paths_rl])
        obs_rand = np.concatenate([path["observation"] for path in paths_rand])
        action_rand = np.concatenate([path["action"] for path in paths_rand])
        next_ob_rand = np.concatenate(
            [path["obs_next"] for path in paths_rand])
        # print obs[128,:].shape
        data_size_rand = obs_rand.shape[0]
        if itr > 0:
            data_size_rl = obs_rl.shape[0]
        # batch_size=128
        losses = []
        # fit model function
        for i in range(dynamics_iters):
            # obtain batch size from random policy
            batch_idx_rand = np.random.randint(data_size_rand,
                                               size=batch_size // 20)
            batch_ob_rand = obs_rand[batch_idx_rand, :]
            batch_ac_rand = action_rand[batch_idx_rand, :]
            batch_nxt_rand = next_ob_rand[batch_idx_rand, :]
            # obtain batch size from on policy
            if itr > 0:
                batch_idx_rl = np.random.randint(data_size_rl,
                                                 size=batch_size * 19 // 20)
                batch_ob_rl = obs_rl[batch_idx_rl, :]
                batch_ac_rl = action_rl[batch_idx_rl, :]
                batch_nxt_rl = next_ob_rl[batch_idx_rl, :]
                # mix them
                batch_ob = np.concatenate((batch_ob_rand, batch_ob_rl))
                batch_ac = np.concatenate((batch_ac_rand, batch_ac_rl))
                batch_nxt = np.concatenate((batch_nxt_rand, batch_nxt_rl))
            else:
                batch_ob = np.copy(batch_ob_rand)
                batch_ac = np.copy(batch_ac_rand)
                batch_nxt = np.copy(batch_nxt_rand)
            loss = dyn_model.fit(batch_ob, batch_ac, batch_nxt)
            losses.append(loss)
            # if(i%20==0):
            #     print('loss', loss)
        print("on policy dagger ", itr)
        ob = env.reset()
        observes, acs, rewards, obs_2, path_returns = [], [], [], [], []
        steps = 0
        g = 0
        max_path_length = mpc_controller.horizon
        timesteps_this_batch = 0
        while True:
            while True:
                observes.append(ob)
                ac = mpc_controller.get_action(ob)
                # print ac
                acs.append(ac)
                # print ac
                ob, rew, done, _ = env.step(ac)
                g += rew * gamma**steps
                obs_2.append(ob)
                rewards.append(rew)
                path_returns.append(g)
                steps += 1
                if done or steps > max_path_length:
                    terminated = done
                    break
            path = {
                "observation": np.array(observes),
                "reward": np.array(rewards),
                "action": np.array(acs),
                "obs_next": np.array(obs_2),
                "return": np.array(returns)
            }
            paths_rl.append(path)
            timesteps_this_batch += pathlength(path)
            print(g)
            if timesteps_this_batch > batch_size:
                break
        trajectory_cost = trajectory_cost_fn(cheetah_cost_fn,
                                             path["observation"],
                                             path["action"], path["obs_next"])
        costs.append(trajectory_cost)
        returns.append(path["return"][-1])

        # print batch_ob.shape

        # LOGGING
        # Statistics for performance of MPC policy using
        # our learned dynamics model
        logz.log_tabular('Iteration', itr)
        # In terms of cost function which your MPC controller uses to plan
        logz.log_tabular('AverageCost', np.mean(costs))
        logz.log_tabular('StdCost', np.std(costs))
        logz.log_tabular('MinimumCost', np.min(costs))
        logz.log_tabular('MaximumCost', np.max(costs))
        # In terms of true environment reward of your rolled out trajectory using the MPC controller
        logz.log_tabular('AverageReturn', np.mean(returns))
        logz.log_tabular('StdReturn', np.std(returns))
        logz.log_tabular('MinimumReturn', np.min(returns))
        logz.log_tabular('MaximumReturn', np.max(returns))

        logz.dump_tabular()
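Example No. 12 above hard-codes a 1/20 random, 19/20 on-policy split when it assembles each training batch. The same idea can be factored into a small helper; the sketch below only illustrates that batching scheme, and the name sample_mixed_batch as well as the dictionary layout are assumptions rather than code from the original.

import numpy as np

def sample_mixed_batch(rand_data, onpol_data, batch_size, rand_frac=0.05):
    """Draw a batch mixing random and on-policy transitions.

    rand_data / onpol_data are dicts with 'obs', 'acs' and 'next_obs' arrays;
    rand_frac is the fraction drawn from the random dataset (0.05 matches the
    1/20 split used in the loop above).
    """
    n_rand = int(batch_size * rand_frac)
    n_onpol = batch_size - n_rand
    idx_r = np.random.randint(rand_data['obs'].shape[0], size=n_rand)
    idx_o = np.random.randint(onpol_data['obs'].shape[0], size=n_onpol)
    batch = {}
    for key in ('obs', 'acs', 'next_obs'):
        batch[key] = np.concatenate(
            (rand_data[key][idx_r], onpol_data[key][idx_o]), axis=0)
    return batch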
Example No. 13
def train(env,
          cost_fn,
          exp_name='test',
          logdir=None,
          render=False,
          learning_rate=1e-3,
          onpol_iters=10,
          dynamics_iters=60,
          batch_size=512,
          num_paths_random=10,
          num_paths_onpol=10,
          num_simulated_paths=10000,
          env_horizon=1000,
          mpc_horizon=15,
          n_layers=2,
          size=500,
          activation=tf.nn.relu,
          output_activation=None):
    """
    Arg:
        onpol_iters: Number of iterations of onpolicy aggregation for the loop
                     to run. 

        dynamics_iters: Number of iterations of training for the dynamics model
        |_              which happen per iteration of the aggregation loop.

        batch_size: Batch size for dynamics training.

        num_paths_random: Number of paths/trajectories/rollouts generated 
        |                 by a random agent. We use these to train our 
        |_                initial dynamics model.
    
        num_paths_onpol: Number of paths to collect at each iteration of
        |_               aggregation, using the MPC policy.

        num_simulated_paths: How many fictitious rollouts the MPC policy
        |                    should generate each time it is asked for an
        |_                   action.

        env_horizon: Number of timesteps in each path.

        mpc_horizon: The MPC policy generates actions by imagining 
        |            fictitious rollouts, and picking the first action
        |            of the best fictitious rollout. This argument is
        |            how many timesteps should be in each fictitious
        |_           rollout.

        n_layers/size/activations: Neural network architecture arguments. 
    """
    # Configure output directory for logging
    logz.configure_output_dir(logdir)

    # Log experimental parameters
    args = inspect.getargspec(train)[0]
    locals_ = locals()
    locals_['cost_fn'] = 'cost_fn'
    locals_['activation'] = 'activation'
    locals_['env'] = 'env'
    params = {k: locals_[k] if k in locals_ else None for k in args}
    logz.save_params(params)

    #========================================================
    #
    # First, we need a lot of data generated by a random
    # agent, with which we'll begin to train our dynamics
    # model.

    random_controller = RandomController(env)

    paths = sample(env=env,
                   controller=random_controller,
                   num_paths=num_paths_random,
                   horizon=env_horizon,
                   verbose=False)

    #========================================================
    #
    # The random data will be used to get statistics (mean
    # and std) for the observations, actions, and deltas
    # (where deltas are o_{t+1} - o_t). These will be used
    # for normalizing inputs and denormalizing outputs
    # from the dynamics network.
    #
    normalization = {
        "observations": compute_normalization(paths["observations"]),
        "actions": compute_normalization(paths["actions"]),
        "deltas": compute_normalization(paths["next_observations"] -
                                        paths["observations"])
    }

    #========================================================
    #
    # Build dynamics model and MPC controllers.
    #
    sess = tf.Session()

    dyn_model = NNDynamicsModel(env=env,
                                n_layers=n_layers,
                                size=size,
                                activation=activation,
                                output_activation=output_activation,
                                normalization=normalization,
                                batch_size=batch_size,
                                iterations=dynamics_iters,
                                learning_rate=learning_rate,
                                sess=sess)

    mpc_controller = MPCcontroller(env=env,
                                   dyn_model=dyn_model,
                                   horizon=mpc_horizon,
                                   cost_fn=cost_fn,
                                   num_simulated_paths=num_simulated_paths)

    #========================================================
    #
    # Tensorflow session building.
    #
    sess.__enter__()
    tf.global_variables_initializer().run()

    #========================================================
    #
    # Take multiple iterations of onpolicy aggregation at each iteration
    # refitting the dynamics model to current dataset and then taking onpolicy
    # samples and aggregating to the dataset.
    # TODO: implement mixing ratio for new and old data as described in
    # https://arxiv.org/abs/1708.02596
    #
    for itr in range(onpol_iters):

        shuffle_indexes = np.random.permutation(paths["observations"].shape[0])
        for key in ['observations', 'actions', 'next_observations', 'rewards']:
            paths[key] = paths[key][shuffle_indexes]

        dyn_model.fit(paths)

        newpaths = sample(env=env,
                          controller=mpc_controller,
                          num_paths=num_paths_onpol,
                          horizon=env_horizon,
                          verbose=False)

        # LOGGING
        # Statistics for performance of MPC policy using
        # our learned dynamics model
        costs = path_cost(cost_fn, newpaths)
        returns = newpaths["acc_rewards"]

        logz.log_tabular('Iteration', itr)
        # In terms of cost function which your MPC controller uses to plan
        logz.log_tabular('AverageCost', np.mean(costs))
        logz.log_tabular('StdCost', np.std(costs))
        logz.log_tabular('MinimumCost', np.min(costs))
        logz.log_tabular('MaximumCost', np.max(costs))
        # In terms of true environment reward of your rolled out trajectory
        # using the MPC controller
        logz.log_tabular('AverageReturn', np.mean(returns))
        logz.log_tabular('StdReturn', np.std(returns))
        logz.log_tabular('MinimumReturn', np.min(returns))
        logz.log_tabular('MaximumReturn', np.max(returns))
        logz.dump_tabular()

        for key in ['observations', 'actions', 'next_observations', 'rewards']:
            paths[key] = np.concatenate([paths[key], newpaths[key]])
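compute_normalization is called above once per array (observations, actions, deltas) but never shown. A minimal sketch consistent with that usage, assuming it returns a per-dimension (mean, std) pair and guards against zero variance with a small epsilon, might be:

import numpy as np

def compute_normalization(data, eps=1e-8):
    """Per-dimension mean and std of a 2-D array of samples.

    The epsilon keeps later divisions by std well defined for dimensions
    that happen to be constant in the random data.
    """
    mean = np.mean(data, axis=0)
    std = np.std(data, axis=0) + eps
    return mean, std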
Example No. 14
def train(env,
          logdir=None,
          render=False,
          learning_rate=1e-3,
          dagger_iters=10,
          dynamics_iters=60,
          batch_size=512,
          num_random_rollouts=10,
          num_onpol_rollouts=10,
          num_simulated_paths=10000,
          env_horizon=1000,
          mpc_horizon=15,
          n_layers=2,
          n_hid_units=500,
          activation=tf.nn.relu,
          output_activation=None):
    """

    Arguments:

    dagger_iters                 Number of iterations of onpolicy aggregation for the loop to run.

    dynamics_iters              Number of iterations of training for the dynamics model
    |_                          which happen per iteration of the aggregation loop.

    batch_size                  Batch size for dynamics training.

    num_random_rollouts            Number of paths/trajectories/rollouts generated
    |                           by a random agent. We use these to train our
    |_                          initial dynamics model.

    num_onpol_rollouts          Number of paths to collect at each iteration of
    |_                          aggregation, using the Model Predictive Control policy.

    num_simulated_paths         How many fictitious rollouts the MPC policy
    |                           should generate each time it is asked for an
    |_                          action.

    env_horizon                 Number of timesteps in each path.

    mpc_horizon                 The MPC policy generates actions by imagining
    |                           fictitious rollouts, and picking the first action
    |                           of the best fictitious rollout. This argument is
    |                           how many timesteps should be in each fictitious
    |_                          rollout.

    n_layers/n_hid_units/activations   Neural network architecture arguments.

    """

    logz.configure_output_dir(logdir)

    #========================================================
    #
    # First, we need a lot of data generated by a random
    # agent, with which we'll begin to train our dynamics
    # model.

    random_controller = RandomController(env)
    paths = sample(env,
                   random_controller,
                   num_rollouts=num_random_rollouts,
                   horizon=env_horizon)

    #========================================================
    #
    # The random data will be used to get statistics (mean
    # and std) for the observations, actions, and deltas
    # (where deltas are o_{t+1} - o_t). These will be used
    # for normalizing inputs and denormalizing outputs
    # from the dynamics network.

    normalization_stats = compute_normalization_stats(paths)

    #========================================================
    #
    # Build dynamics model and MPC controllers.
    #
    sess = tf.Session()

    dyn_model = NNDynamicsModel(env=env,
                                n_layers=n_layers,
                                n_hid_units=n_hid_units,
                                activation=activation,
                                output_activation=output_activation,
                                normalization_stats=normalization_stats,
                                batch_size=batch_size,
                                num_iter=dynamics_iters,
                                learning_rate=learning_rate,
                                sess=sess)

    mpc_controller = MPCcontroller(env=env,
                                   dyn_model=dyn_model,
                                   horizon=mpc_horizon,
                                   num_simulated_paths=num_simulated_paths)

    #========================================================
    #
    # Tensorflow session building.
    #
    sess.__enter__()
    tf.global_variables_initializer().run()

    #========================================================
    #
    # Take multiple iterations of onpolicy aggregation
    # at each iteration refitting the dynamics model to current dataset
    # and then taking on-policy samples and aggregating to the dataset.
    #
    # Note: You don't need to use a mixing ratio in this assignment
    # for new and old data as described in https://arxiv.org/abs/1708.02596
    #
    for i in range(dagger_iters):
        print('********** ITERATION {}/{} ************'.format(
            i + 1, dagger_iters))

        # Fitting dynamics model
        dyn_model.fit(paths)

        # Sampling on-policy
        new_paths = sample(env,
                           mpc_controller,
                           num_rollouts=num_onpol_rollouts,
                           horizon=env_horizon)
        paths = new_paths + random.sample(
            paths,
            len(new_paths) // 9)  # Adding new paths and forgetting old ones
        # paths += new_paths

        returns = [sum(path['rewards']) for path in new_paths]
        costs = [path_cost(path) for path in new_paths]

        # LOGGING
        # Statistics for performance of MPC policy using our learned dynamics model
        # In terms of cost function which your MPC controller uses to plan
        logz.log_tabular('AverageCost', np.mean(costs))
        logz.log_tabular('StdCost', np.std(costs))
        logz.log_tabular('MinimumCost', np.min(costs))
        logz.log_tabular('MaximumCost', np.max(costs))

        # In terms of true environment reward of your rolled out trajectory using the MPC controller
        logz.log_tabular('AverageReturn', np.mean(returns))
        logz.log_tabular('StdReturn', np.std(returns))
        logz.log_tabular('MinimumReturn', np.min(returns))
        logz.log_tabular('MaximumReturn', np.max(returns))

        logz.dump_tabular()
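The sample(env, controller, num_rollouts, horizon) helper used above is only called, never defined. A minimal rollout collector compatible with that call could look like the sketch below; the path keys ('observations', 'actions', 'rewards', 'next_observations') mirror the other examples and are an assumption here.

import numpy as np

def sample(env, controller, num_rollouts=10, horizon=1000):
    """Roll out `controller` in `env` and return a list of path dicts."""
    paths = []
    for _ in range(num_rollouts):
        ob = env.reset()
        obs, acs, rews, next_obs = [], [], [], []
        for _ in range(horizon):
            ac = controller.get_action(ob)
            next_ob, rew, done, _ = env.step(ac)
            obs.append(ob)
            acs.append(ac)
            rews.append(rew)
            next_obs.append(next_ob)
            ob = next_ob
            if done:
                break
        paths.append({'observations': np.array(obs),
                      'actions': np.array(acs),
                      'rewards': np.array(rews),
                      'next_observations': np.array(next_obs)})
    return paths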
Example No. 15
def train(env,
          cost_fn,
          logdir=None,
          render=False,
          learning_rate=1e-3,
          onpol_iters=10,
          dynamics_iters=60,
          batch_size=512,
          num_paths_random=10,
          num_paths_onpol=10,
          num_simulated_paths=10000,
          env_horizon=1000,
          mpc_horizon=15,
          n_layers=2,
          size=500,
          activation=tf.nn.relu,
          output_activation=None):
    """

    Arguments:

    onpol_iters                 Number of iterations of onpolicy aggregation for the loop to run. 

    dynamics_iters              Number of iterations of training for the dynamics model
    |_                          which happen per iteration of the aggregation loop.

    batch_size                  Batch size for dynamics training.

    num_paths_random            Number of paths/trajectories/rollouts generated 
    |                           by a random agent. We use these to train our 
    |_                          initial dynamics model.
    
    num_paths_onpol             Number of paths to collect at each iteration of
    |_                          aggregation, using the Model Predictive Control policy.

    num_simulated_paths         How many fictitious rollouts the MPC policy
    |                           should generate each time it is asked for an
    |_                          action.

    env_horizon                 Number of timesteps in each path.

    mpc_horizon                 The MPC policy generates actions by imagining 
    |                           fictitious rollouts, and picking the first action
    |                           of the best fictitious rollout. This argument is
    |                           how many timesteps should be in each fictitious
    |_                          rollout.

    n_layers/size/activations   Neural network architecture arguments. 

    """

    logz.configure_output_dir(logdir)

    # Log experimental parameters
    args = inspect.getargspec(train)[0]
    locals_ = locals()
    params = {k: locals_[k] if k in locals_ else None for k in args}
    print(params)
    # the three lines below are to override the functions passed in, which aren't serializable
    params["activation"] = "relu"
    params["cost_fn"] = "cheetah_cost_fn"
    params["env"] = "HalfCheetahEnvNew"
    logz.save_params(params)

    returns_file = "returns.csv"
    returns_array = []

    #========================================================
    #
    # First, we need a lot of data generated by a random
    # agent, with which we'll begin to train our dynamics
    # model.

    random_controller = RandomController(env)
    """ YOUR CODE HERE """
    data = sample(env,
                  random_controller,
                  num_paths=num_paths_random,
                  horizon=env_horizon,
                  render=False,
                  verbose=False)

    #========================================================
    #
    # The random data will be used to get statistics (mean
    # and std) for the observations, actions, and deltas
    # (where deltas are o_{t+1} - o_t). These will be used
    # for normalizing inputs and denormalizing outputs
    # from the dynamics network.
    #
    """ YOUR CODE HERE """
    normalization = compute_normalization(data)

    #========================================================
    #
    # Build dynamics model and MPC controllers.
    #
    sess = tf.Session()

    dyn_model = NNDynamicsModel(env=env,
                                n_layers=n_layers,
                                size=size,
                                activation=activation,
                                output_activation=output_activation,
                                normalization=normalization,
                                batch_size=batch_size,
                                iterations=dynamics_iters,
                                learning_rate=learning_rate,
                                sess=sess)

    mpc_controller = MPCcontroller(env=env,
                                   dyn_model=dyn_model,
                                   horizon=mpc_horizon,
                                   cost_fn=cost_fn,
                                   num_simulated_paths=num_simulated_paths)

    #========================================================
    #
    # Tensorflow session building.
    #
    sess.__enter__()
    tf.global_variables_initializer().run()

    #========================================================
    #
    # Take multiple iterations of onpolicy aggregation at each iteration refitting the dynamics model to current dataset and then
    # taking onpolicy samples and aggregating to the dataset.
    # Note: You don't need to use a mixing ratio in this assignment for new and old data as described in
    # https://arxiv.org/abs/1708.02596
    #
    for itr in range(onpol_iters):
        """ YOUR CODE HERE """

        print(itr)
        # learn/fit dynamics model using the Adam optimization algorithm
        l = dyn_model.fit(data)
        print(l)

        # sample a set of on-policy trajectories from the environment
        new_data = sample(env,
                          mpc_controller,
                          num_paths=num_paths_onpol,
                          horizon=env_horizon,
                          render=render,
                          verbose=False)

        # append transition to dataset
        data += new_data

        # compute costs
        costs = np.array([path_cost(cost_fn, path) for path in new_data])
        print(costs)

        # compute returns
        returns = np.array(
            [new_data[i]["returns"] for i in range(len(new_data))])
        print(returns)

        returns_array.append(returns)
        np.array(returns_array).dump(returns_file)

        # LOGGING
        # Statistics for performance of MPC policy using
        # our learned dynamics model
        logz.log_tabular('Iteration', itr)
        # In terms of cost function which your MPC controller uses to plan
        logz.log_tabular('AverageCost', np.mean(costs))
        logz.log_tabular('StdCost', np.std(costs))
        logz.log_tabular('MinimumCost', np.min(costs))
        logz.log_tabular('MaximumCost', np.max(costs))
        # In terms of true environment reward of your rolled out trajectory using the MPC controller
        logz.log_tabular('AverageReturn', np.mean(returns))
        logz.log_tabular('StdReturn', np.std(returns))
        logz.log_tabular('MinimumReturn', np.min(returns))
        logz.log_tabular('MaximumReturn', np.max(returns))

        logz.dump_tabular()
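path_cost(cost_fn, path) above collapses a whole rollout into the single planning cost that gets logged. Assuming cost_fn(state, action, next_state) scores one transition at a time, a minimal sketch is:

def path_cost(cost_fn, path):
    """Total cost of one rollout: sum of per-step costs under cost_fn."""
    total = 0.0
    for ob, ac, next_ob in zip(path['observations'],
                               path['actions'],
                               path['next_observations']):
        total += cost_fn(ob, ac, next_ob)
    return total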
Example No. 16
def train(env, 
         cost_fn,
         logdir=None,
         render=False,
         learning_rate=1e-3,
         onpol_iters=10,
         dynamics_iters=60,
         batch_size=512,
         num_paths_random=10, 
         num_paths_onpol=10, 
         num_simulated_paths=10000,
         env_horizon=1000, 
         mpc_horizon=15,
         n_layers=2,
         size=500,
         activation=tf.nn.relu,
         output_activation=None,
         clip_param=0.2,
         entcoeff=0.0,
         gamma=0.99,
         lam=0.95,
         optim_epochs=10,
         optim_batchsize=64,
         schedule='linear',
         bc_lr=1e-3,
         ppo_lr=3e-4,
         timesteps_per_actorbatch=1000,
         MPC=True,
         BEHAVIORAL_CLONING=True,
         PPO=True,
         ):

    start = time.time()


    print("-------- env info --------")
    print("Environment: ", FLAGS.env_name)
    print("observation_space: ", env.observation_space.shape)
    print("action_space: ", env.action_space.shape)
    print("action_space low: ", env.action_space.low)
    print("action_space high: ", env.action_space.high)

    print("BEHAVIORAL_CLONING: ", BEHAVIORAL_CLONING)
    print("PPO: ", PPO)
    print("MPC-AUG: ", MPC)

    print(" ")


    random_controller = RandomController(env)

    # Create buffers
    model_data_buffer = DataBufferGeneral(FLAGS.MODELBUFFER_SIZE, 5)
    ppo_data_buffer = DataBufferGeneral(10000, 4)
    bc_data_buffer = DataBufferGeneral(2000, 2)

    # Random sample path

    print("collecting random data .....  ")
    paths = sample(env, 
               random_controller, 
               num_paths=num_paths_random, 
               horizon=env_horizon, 
               render=False,
               verbose=False)

    # add into buffer
    for path in paths:
        for n in range(len(path['observations'])):
            model_data_buffer.add([path['observations'][n],
                                 path['actions'][n], 
                                 path['rewards'][n], 
                                 path['next_observations'][n], 
                                 path['next_observations'][n] - path['observations'][n]])

    print("model data buffer size: ", model_data_buffer.size)

    normalization = compute_normalization(model_data_buffer)

    #========================================================
    # 
    # Build dynamics model and MPC controllers and Behavioral cloning network.
    # 
    # tf_config = tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1) 

    tf_config = tf.ConfigProto() 

    tf_config.gpu_options.allow_growth = True

    sess = tf.Session(config=tf_config)

    policy_nn = MlpPolicy(sess=sess, env=env, hid_size=128, num_hid_layers=2, clip_param=clip_param , entcoeff=entcoeff)

    if FLAGS.LEARN_REWARD:
        print("Learn reward function")
        dyn_model = NNDynamicsRewardModel(env=env, 
                                        normalization=normalization,
                                        batch_size=batch_size,
                                        iterations=dynamics_iters,
                                        learning_rate=learning_rate,
                                        sess=sess)

        mpc_ppo_controller = MPCcontrollerPolicyNetReward(env=env, 
                                       dyn_model=dyn_model, 
                                       explore=FLAGS.MPC_EXP,
                                       policy_net=policy_nn,
                                       self_exp=FLAGS.SELFEXP,
                                       horizon=mpc_horizon, 
                                       num_simulated_paths=num_simulated_paths)
    else:
        print("Use predefined cost function")
        dyn_model = NNDynamicsModel(env=env, 
                                    n_layers=n_layers, 
                                    size=size, 
                                    activation=activation, 
                                    output_activation=output_activation, 
                                    normalization=normalization,
                                    batch_size=batch_size,
                                    iterations=dynamics_iters,
                                    learning_rate=learning_rate,
                                    sess=sess)

        mpc_ppo_controller = MPCcontrollerPolicyNet(env=env, 
                                       dyn_model=dyn_model, 
                                       explore=FLAGS.MPC_EXP,
                                       policy_net=policy_nn,
                                       self_exp=FLAGS.SELFEXP,
                                       horizon=mpc_horizon, 
                                       cost_fn=cost_fn, 
                                       num_simulated_paths=num_simulated_paths)

    mpc_controller = MPCcontroller(env=env, 
                                   dyn_model=dyn_model, 
                                   horizon=mpc_horizon, 
                                   cost_fn=cost_fn, 
                                   num_simulated_paths=num_simulated_paths)
    # if not PPO:
    #     mpc_ppo_controller = mpc_controller

    #========================================================
    # 
    # Tensorflow session building.
    # 
    sess.__enter__()
    tf.global_variables_initializer().run()

    # init or load checkpoint with saver
    saver = tf.train.Saver()

    checkpoint = tf.train.get_checkpoint_state(FLAGS.model_path)

    print("checkpoint", checkpoint)

    if checkpoint and checkpoint.model_checkpoint_path and FLAGS.LOAD_MODEL:
        saver.restore(sess, checkpoint.model_checkpoint_path)
        print("checkpoint loaded:", checkpoint.model_checkpoint_path)
    else:
        print("Could not find old checkpoint")
        if not os.path.exists(FLAGS.model_path):
          os.mkdir(FLAGS.model_path)  

    #========================================================
    # 
    # Prepare for rollouts
    # 

    tstart = time.time()


    states_true = []
    states_predict = []
    rewards_true = []
    rewards_predict = []
    ob = env.reset()
    ob_pre = np.expand_dims(ob, axis=0)

    states_true.append(ob)
    states_predict.append(ob_pre)

    for step in range(100):
        # ac = env.action_space.sample() # not used, just so we have the datatype
        ac, _ = policy_nn.act(ob, stochastic=True)
        ob, rew, done, _ = env.step(ac)
        ob_pre, r_pre = dyn_model.predict(ob_pre, ac)
        states_true.append(ob)
        rewards_true.append(rew)
        states_predict.append(ob_pre)
        rewards_predict.append(r_pre[0][0])

    states_true = np.asarray(states_true)
    states_predict = np.asarray(states_predict)
    states_predict = np.squeeze(states_predict, axis=1)
    rewards_true = np.asarray(rewards_true)
    rewards_predict = np.asarray(rewards_predict)

    print("states_true", states_true.shape)
    print("states_predict", states_predict.shape)
    print("rewards_true", rewards_true.shape)
    print("rewards_predict", rewards_predict.shape)

    np.savetxt('./data/eval_model/states_true.out', states_true, delimiter=',') 
    np.savetxt('./data/eval_model/states_predict.out', states_predict, delimiter=',') 

    np.savetxt('./data/eval_model/rewards_true.out', rewards_true, delimiter=',') 
    np.savetxt('./data/eval_model/rewards_predict.out', rewards_predict, delimiter=',') 
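The dyn_model.predict calls evaluated above follow the usual pattern of normalizing the inputs, predicting the state delta, and denormalizing the result. A numpy-only sketch of that pattern, with the network's forward pass abstracted into a delta_fn callable and all names chosen purely for illustration, might look like this:

import numpy as np

def predict_next_state(state, action, delta_fn, normalization, eps=1e-8):
    """One-step prediction with a learned state-delta model.

    delta_fn maps a normalized (state, action) input to a normalized delta;
    normalization holds (mean, std) pairs for 'observations', 'actions' and
    'deltas', matching the statistics computed from the random rollouts.
    """
    mean_ob, std_ob = normalization['observations']
    mean_ac, std_ac = normalization['actions']
    mean_d, std_d = normalization['deltas']
    norm_in = np.concatenate([(state - mean_ob) / (std_ob + eps),
                              (action - mean_ac) / (std_ac + eps)], axis=-1)
    norm_delta = delta_fn(norm_in)
    return state + norm_delta * std_d + mean_d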
Example No. 17
def train(env,
         cost_fn,
         logdir=None,
         render=False,
         learning_rate=1e-3,
         onpol_iters=10,
         dynamics_iters=20,
         batch_size=64,
         num_paths_random=10,
         num_paths_onpol=10,
         num_simulated_paths=10000,
         env_horizon=500,
         mpc_horizon=15,
         n_layers=2,
         size=64,
         activation=tf.nn.relu,
         output_activation=None,
         controller_service=None,
         ):

    """

    Arguments:

    onpol_iters                 Number of iterations of onpolicy aggregation for the loop to run.

    dynamics_iters              Number of iterations of training for the dynamics model
    |_                          which happen per iteration of the aggregation loop.

    batch_size                  Batch size for dynamics training.

    num_paths_random            Number of paths/trajectories/rollouts generated
    |                           by a random agent. We use these to train our
    |_                          initial dynamics model.

    num_paths_onpol             Number of paths to collect at each iteration of
    |_                          aggregation, using the Model Predictive Control policy.

    num_simulated_paths         How many fictitious rollouts the MPC policy
    |                           should generate each time it is asked for an
    |_                          action.

    env_horizon                 Number of timesteps in each path.

    mpc_horizon                 The MPC policy generates actions by imagining
    |                           fictitious rollouts, and picking the first action
    |                           of the best fictitious rollout. This argument is
    |                           how many timesteps should be in each fictitious
    |_                          rollout.

    n_layers/size/activations   Neural network architecture arguments.

    """

    logz.configure_output_dir(logdir)

    #========================================================
    #
    # First, we need a lot of data generated by a random
    # agent, with which we'll begin to train our dynamics
    # model.

    random_controller = RandomController(env)
    ref_controller = RefMPCController(env, lambda state: call_mpc(env, controller_service))

    paths = sample(env,
           random_controller,
           num_paths=num_paths_random,
           horizon=env_horizon,
           render=False,
           verbose=False,
           )


    #========================================================
    #
    # The random data will be used to get statistics (mean
    # and std) for the observations, actions, and deltas
    # (where deltas are o_{t+1} - o_t). These will be used
    # for normalizing inputs and denormalizing outputs
    # from the dynamics network.
    #
    normalization = compute_normalization(paths)
    print(normalization)

    #========================================================
    #
    # Build dynamics model and MPC controllers.
    #
    sess = tf.Session()

    dyn_model = NNDynamicsModel(env=env,
                                n_layers=n_layers,
                                size=size,
                                activation=activation,
                                output_activation=output_activation,
                                normalization=normalization,
                                batch_size=batch_size,
                                iterations=dynamics_iters,
                                learning_rate=learning_rate,
                                sess=sess)

    mpc_controller = MPCcontroller(env=env,
                                   dyn_model=dyn_model,
                                   horizon=mpc_horizon,
                                   cost_fn=cost_fn,
                                   num_simulated_paths=num_simulated_paths)


    #========================================================
    #
    # Tensorflow session building.
    #
    sess.__enter__()
    tf.global_variables_initializer().run()

    #========================================================
    #
    # Take multiple iterations of onpolicy aggregation, at each iteration
    # refitting the dynamics model to the current dataset and then taking
    # onpolicy samples and aggregating them into the dataset.
    # Note: You don't need to use a mixing ratio in this assignment for new
    # and old data as described in https://arxiv.org/abs/1708.02596
    #
    for itr in range(onpol_iters):

        dyn_model.fit(paths)
        new_paths = sample(env,
                           mpc_controller,
                           num_paths=num_paths_onpol,
                           horizon=env_horizon,
                           render=False,
                           verbose=False)
        costs = []
        returns = []
        for new_path in new_paths:
            cost = path_cost(cost_fn, new_path)
            costs.append(cost)
            returns.append(new_path['return'])
        costs = np.array(costs)
        returns = np.array(returns)
        paths = paths + new_paths # Aggregation
        # LOGGING
        # Statistics for performance of MPC policy using
        # our learned dynamics model
        logz.log_tabular('Iteration', itr)
        # In terms of cost function which your MPC controller uses to plan
        logz.log_tabular('AverageCost', np.mean(costs))
        logz.log_tabular('StdCost', np.std(costs))
        logz.log_tabular('MinimumCost', np.min(costs))
        logz.log_tabular('MaximumCost', np.max(costs))
        # In terms of true environment reward of your rolled out trajectory using the MPC controller
        logz.log_tabular('AverageReturn', np.mean(returns))
        logz.log_tabular('StdReturn', np.std(returns))
        logz.log_tabular('MinimumReturn', np.min(returns))
        logz.log_tabular('MaximumReturn', np.max(returns))

        logz.dump_tabular()
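The MPCcontroller built above plans by random shooting: sample num_simulated_paths candidate action sequences, roll each of them through the learned dynamics for mpc_horizon steps, score them with the cost function, and execute only the first action of the cheapest sequence. A batched numpy sketch of that procedure, assuming the model's predict and the cost function both accept arrays of candidate states, is:

import numpy as np

def mpc_get_action(state, dyn_model, cost_fn, action_space,
                   horizon=15, num_simulated_paths=10000):
    """Random-shooting MPC: first action of the best imagined rollout."""
    act_dim = action_space.shape[0]
    low, high = action_space.low, action_space.high
    # Candidate action sequences: (horizon, num_paths, act_dim).
    actions = np.random.uniform(low, high,
                                size=(horizon, num_simulated_paths, act_dim))
    states = np.tile(state, (num_simulated_paths, 1))
    costs = np.zeros(num_simulated_paths)
    for t in range(horizon):
        next_states = dyn_model.predict(states, actions[t])
        costs += cost_fn(states, actions[t], next_states)
        states = next_states
    return actions[0, np.argmin(costs)]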
Example No. 18
def train(env,
          cost_fn,
          logdir=None,
          render=False,
          learning_rate=1e-3,
          onpol_iters=10,
          dynamics_iters=60,
          batch_size=512,
          num_paths_random=10,
          num_paths_onpol=10,
          num_simulated_paths=10000,
          env_horizon=1000,
          mpc_horizon=15,
          n_layers=2,
          size=500,
          activation=tf.nn.relu,
          output_activation=None):
    """

    Arguments:

    onpol_iters                 Number of iterations of onpolicy aggregation for the loop to run. 

    dynamics_iters              Number of iterations of training for the dynamics model
    |_                          which happen per iteration of the aggregation loop.

    batch_size                  Batch size for dynamics training.

    num_paths_random            Number of paths/trajectories/rollouts generated 
    |                           by a random agent. We use these to train our 
    |_                          initial dynamics model.
    
    num_paths_onpol             Number of paths to collect at each iteration of
    |_                          aggregation, using the Model Predictive Control policy.

    num_simulated_paths         How many fictitious rollouts the MPC policy
    |                           should generate each time it is asked for an
    |_                          action.

    env_horizon                 Number of timesteps in each path.

    mpc_horizon                 The MPC policy generates actions by imagining 
    |                           fictitious rollouts, and picking the first action
    |                           of the best fictitious rollout. This argument is
    |                           how many timesteps should be in each fictitious
    |_                          rollout.

    n_layers/size/activations   Neural network architecture arguments. 

    """

    logz.configure_output_dir(logdir)

    # Log experimental parameters
    args = inspect.getargspec(train)[0]
    locals_ = locals()
    params = {k: locals_[k] if k in locals_ else None for k in args}
    del params['cost_fn']
    del params['activation']
    del params['output_activation']
    del params['env']
    logz.save_params(params)

    #========================================================
    #
    # First, we need a lot of data generated by a random
    # agent, with which we'll begin to train our dynamics
    # model.

    random_controller = RandomController(env)
    """ YOUR CODE HERE """
    # Sample from random controller
    paths = sample(env, random_controller, num_paths_random, env_horizon,
                   render, True)
    # Build data set
    data = dict()
    data['observations'] = np.concatenate(
        [path['observations'] for path in paths])
    data['actions'] = np.concatenate([path['actions'] for path in paths])
    next_observations = np.concatenate(
        [path['next_observations'] for path in paths])
    data['deltas'] = next_observations - data['observations']

    #========================================================
    #
    # The random data will be used to get statistics (mean
    # and std) for the observations, actions, and deltas
    # (where deltas are o_{t+1} - o_t). These will be used
    # for normalizing inputs and denormalizing outputs
    # from the dynamics network.
    #
    """ YOUR CODE HERE """
    normalization = compute_normalization(data)

    #========================================================
    #
    # Build dynamics model and MPC controllers.
    #
    sess = tf.Session()

    dyn_model = NNDynamicsModel(env=env,
                                n_layers=n_layers,
                                size=size,
                                activation=activation,
                                output_activation=output_activation,
                                normalization=normalization,
                                batch_size=batch_size,
                                iterations=dynamics_iters,
                                learning_rate=learning_rate,
                                sess=sess)

    mpc_controller = MPCcontroller(env=env,
                                   dyn_model=dyn_model,
                                   horizon=mpc_horizon,
                                   cost_fn=cost_fn,
                                   num_simulated_paths=num_simulated_paths)

    #========================================================
    #
    # Tensorflow session building.
    #
    sess.__enter__()
    tf.global_variables_initializer().run()

    #========================================================
    #
    # Take multiple iterations of onpolicy aggregation, at each iteration
    # refitting the dynamics model to the current dataset and then taking
    # onpolicy samples and aggregating them into the dataset.
    # Note: You don't need to use a mixing ratio in this assignment for new
    # and old data as described in https://arxiv.org/abs/1708.02596
    #
    for itr in range(onpol_iters):
        """ YOUR CODE HERE """
        # Refit dynamic model
        dyn_model.fit(data)
        # Sample on-policy trajectories
        paths = sample(env, mpc_controller, num_paths_onpol, env_horizon,
                       render, True)
        # Summarize trajectories
        costs = [path_cost(cost_fn, path) for path in paths]
        returns = [np.sum(path['rewards']) for path in paths]
        # Aggregate data
        onpol_observations = np.concatenate(
            [path['observations'] for path in paths])
        onpol_actions = np.concatenate([path['actions'] for path in paths])
        onpol_next_observations = np.concatenate(
            [path['next_observations'] for path in paths])
        onpol_deltas = onpol_next_observations - onpol_observations
        data['observations'] = np.append(data['observations'],
                                         onpol_observations, 0)
        data['actions'] = np.append(data['actions'], onpol_actions, 0)
        data['deltas'] = np.append(data['deltas'], onpol_deltas, 0)

        # LOGGING
        # Statistics for performance of MPC policy using
        # our learned dynamics model
        logz.log_tabular('Iteration', itr)
        # In terms of cost function which your MPC controller uses to plan
        logz.log_tabular('AverageCost', np.mean(costs))
        logz.log_tabular('StdCost', np.std(costs))
        logz.log_tabular('MinimumCost', np.min(costs))
        logz.log_tabular('MaximumCost', np.max(costs))
        # In terms of true environment reward of your rolled out trajectory using the MPC controller
        logz.log_tabular('AverageReturn', np.mean(returns))
        logz.log_tabular('StdReturn', np.std(returns))
        logz.log_tabular('MinimumReturn', np.min(returns))
        logz.log_tabular('MaximumReturn', np.max(returns))

        logz.dump_tabular()
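Every example bootstraps its dataset with a RandomController. A minimal version simply ignores the state and samples uniformly from the environment's action space; the class below sketches that expected interface and is not the exact implementation used above.

class RandomController(object):
    """Controller that ignores the state and returns a random action."""

    def __init__(self, env):
        self.action_space = env.action_space

    def get_action(self, state):
        return self.action_space.sample()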
Example No. 19
def train(env,
          cost_fn,
          logdir=None,
          render=False,
          learning_rate=1e-3,
          onpol_iters=1,
          dynamics_iters=60,
          batch_size=512,
          num_paths_random=10,
          num_paths_onpol=1,
          num_simulated_paths=10000,
          env_horizon=1000,
          mpc_horizon=15,
          n_layers=2,
          size=100,
          activation=tf.nn.relu,
          output_activation=None):
    """

    Arguments:

    onpol_iters                 Number of iterations of onpolicy aggregation for the loop to run. 

    dynamics_iters              Number of iterations of training for the dynamics model
    |_                          which happen per iteration of the aggregation loop.

    batch_size                  Batch size for dynamics training.

    num_paths_random            Number of paths/trajectories/rollouts generated 
    |                           by a random agent. We use these to train our 
    |_                          initial dynamics model.
    
    num_paths_onpol             Number of paths to collect at each iteration of
    |_                          aggregation, using the Model Predictive Control policy.

    num_simulated_paths         How many fictitious rollouts the MPC policy
    |                           should generate each time it is asked for an
    |_                          action.

    env_horizon                 Number of timesteps in each path.

    mpc_horizon                 The MPC policy generates actions by imagining 
    |                           fictitious rollouts, and picking the first action
    |                           of the best fictitious rollout. This argument is
    |                           how many timesteps should be in each fictitious
    |_                          rollout.

    n_layers/size/activations   Neural network architecture arguments. 

    """

    logz.configure_output_dir(logdir)

    #========================================================
    #
    # First, we need a lot of data generated by a random
    # agent, with which we'll begin to train our dynamics
    # model.

    random_controller = RandomController(env)

    paths, rewards, costs = sample(env, random_controller, num_paths_random)
    obs = np.concatenate([path["observations"] for path in paths])
    acs = np.concatenate([path["actions"] for path in paths])
    n_obs = np.concatenate([path["next_observations"] for path in paths])
    delta = n_obs - obs
    data = {'observations': obs, 'actions': acs, 'delta': delta}

    #========================================================
    #
    # The random data will be used to get statistics (mean
    # and std) for the observations, actions, and deltas
    # (where deltas are o_{t+1} - o_t). These will be used
    # for normalizing inputs and denormalizing outputs
    # from the dynamics network.
    #
    mean_obs, std_obs, mean_deltas, std_deltas, mean_actions, std_actions = compute_normalization(
        data)
    normalization = dict()
    normalization['observations'] = [mean_obs, std_obs]
    normalization['actions'] = [mean_actions, std_actions]
    normalization['delta'] = [mean_deltas, std_deltas]
    #========================================================
    #
    # Build dynamics model and MPC controllers.
    #
    sess = tf.Session()

    dyn_model = NNDynamicsModel(env=env,
                                n_layers=n_layers,
                                size=size,
                                activation=activation,
                                output_activation=output_activation,
                                normalization=normalization,
                                batch_size=batch_size,
                                iterations=dynamics_iters,
                                learning_rate=learning_rate,
                                sess=sess)

    mpc_controller = MPCcontroller(env=env,
                                   dyn_model=dyn_model,
                                   horizon=mpc_horizon,
                                   cost_fn=cost_fn,
                                   num_simulated_paths=num_simulated_paths)

    #========================================================
    #
    # Tensorflow session building.
    #
    sess.__enter__()
    tf.global_variables_initializer().run()

    #========================================================
    #
    # Take multiple iterations of onpolicy aggregation, at each iteration
    # refitting the dynamics model to the current dataset and then taking
    # onpolicy samples and aggregating them into the dataset.
    # Note: You don't need to use a mixing ratio in this assignment for new
    # and old data as described in https://arxiv.org/abs/1708.02596
    #
    print("onpol_iter", onpol_iters)
    for itr in range(onpol_iters):
        """ YOUR CODE HERE """
        print(data['observations'].shape)
        dyn_model.fit(data)

        # Generate trajectories from MPC controllers

        pathsM, returns, costs = sample(env, mpc_controller, num_paths_onpol)
        obs = np.concatenate([path["observations"] for path in pathsM])
        acs = np.concatenate([path["actions"] for path in pathsM])
        n_obs = np.concatenate([path["next_observations"] for path in pathsM])
        delta = n_obs - obs
        data = {
            'observations': np.concatenate((data['observations'], obs)),
            'actions': np.concatenate((data['actions'], acs)),
            'delta': np.concatenate((data['delta'], delta))
        }

        # LOGGING
        # Statistics for performance of MPC policy using
        # our learned dynamics model
        logz.log_tabular('Iteration', itr)
        # In terms of cost function which your MPC controller uses to plan
        logz.log_tabular('AverageCost', np.mean(costs))
        logz.log_tabular('StdCost', np.std(costs))
        logz.log_tabular('MinimumCost', np.min(costs))
        logz.log_tabular('MaximumCost', np.max(costs))
        # In terms of true environment reward of your rolled out trajectory using the MPC controller
        logz.log_tabular('AverageReturn', np.mean(returns))
        logz.log_tabular('StdReturn', np.std(returns))
        logz.log_tabular('MinimumReturn', np.min(returns))
        logz.log_tabular('MaximumReturn', np.max(returns))

        logz.dump_tabular()
Example No. 20
def main():
    nb_total_steps = 1000
    nb_iterations = 40
    hidden_layers = [256, 256]
    writer = tensorboardX.SummaryWriter()

    args = parse_args(__doc__, ['env'])


    env = gym.make(args.env) 

    ctrl = rand_ctrl = RandomController(env)


    # ipdb.set_trace()
    print('#inputs : %d' % ctrl.nb_inputs())
    print('#actions: %d' % ctrl.nb_actions())

    # f_net = make_net(
    #     [ctrl.nb_inputs() + ctrl.nb_actions()] + hidden_layers + [ctrl.nb_inputs()],
    #     [nn.ReLU() for _ in hidden_layers],
    # )
    f_net = MOENetwork(
        nb_inputs=ctrl.nb_inputs() + ctrl.nb_actions(),
        nb_experts=4,
        gait_layers=[64],
        expert_layers=[64, ctrl.nb_inputs()],
    )

    data = collect_data(env, ctrl, nb_total_steps*10)


    # ipdb.set_trace()

    dynamics = DynamicsModel(env, f_net, data.get_all(), writer=writer)
    # cost_func = lambda s,a,sn: -sn[3].item()  # refers to vx
    cost_func = get_cost(args.env)

    # data.calc_normalizations()
    # dynamics.fit(data)

    mpc_ctrl = MPCcontroller(env, dynamics.predict, cost_func, num_simulated_paths=100, horizon=10, num_mpc_steps=10)
    eval_args = EvaluationArgs(nb_burnin_steps=4, nb_episodes=10, horizons=[1, 2, 4, 8, 16, 32])

    for i in range(nb_iterations):
        print('Iteration', i)
        new_data = collect_data(env, ctrl, nb_total_steps)
        dynamics.fit(*new_data.get_all())
        data.extend(new_data)
        dynamics.fit(*data.sample(sample_size=4*nb_total_steps))
        evaluate_and_log_dynamics(
            dynamics.predict, env, rand_ctrl, writer=writer, i_step=i, args=eval_args
        )
        evaluate_and_log_dynamics(
            dynamics.predict, env, mpc_ctrl, writer=writer, i_step=i, args=eval_args
        )
        # dynamics.fit(*data.get_all())
        if random.random() > 0.5:
            ctrl = rand_ctrl
        else:
            ctrl = mpc_ctrl
    
    env = gym.make(args.env)

    ctrl = MPCcontroller(env, dynamics.predict, cost_func, num_simulated_paths=1000, num_mpc_steps=4)

    # TODO

    env.render(mode='human')
    obs = env.reset()

    for _ in range(100):
        # time.sleep(1. / 60.)
        obs, r, done, _ = env.step(ctrl.get_action(obs))
        # print('  ', cost_func(obs))
        if done:
            print("done:", r, obs)
            time.sleep(1)
            ctrl.reset()
            obs = env.reset()
    ipdb.set_trace()
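Example No. 20 above relies on collect_data returning a dataset object with get_all, extend, and sample methods. A minimal sketch matching that interface is shown below; the class name TransitionData is an assumption, and the real buffer presumably also tracks rewards and normalization statistics.

import numpy as np

class TransitionData(object):
    """Minimal container for (state, action, next_state) transitions."""

    def __init__(self, states, actions, next_states):
        self.states = np.asarray(states)
        self.actions = np.asarray(actions)
        self.next_states = np.asarray(next_states)

    def get_all(self):
        return self.states, self.actions, self.next_states

    def extend(self, other):
        self.states = np.concatenate([self.states, other.states])
        self.actions = np.concatenate([self.actions, other.actions])
        self.next_states = np.concatenate([self.next_states,
                                           other.next_states])

    def sample(self, sample_size):
        idx = np.random.randint(len(self.states), size=sample_size)
        return self.states[idx], self.actions[idx], self.next_states[idx]

def collect_data(env, controller, nb_steps):
    """Roll `controller` in `env` for nb_steps transitions."""
    states, actions, next_states = [], [], []
    ob = env.reset()
    for _ in range(nb_steps):
        ac = controller.get_action(ob)
        next_ob, _, done, _ = env.step(ac)
        states.append(ob)
        actions.append(ac)
        next_states.append(next_ob)
        ob = env.reset() if done else next_ob
    return TransitionData(states, actions, next_states)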
Example No. 21
def train(
    env,
    cost_fn,
    logdir=None,
    render=False,
    learning_rate=1e-3,
    onpol_iters=10,
    dynamics_iters=60,
    batch_size=512,
    num_paths_random=10,
    num_paths_onpol=10,
    num_simulated_paths=10000,
    env_horizon=1000,
    mpc_horizon=15,
    n_layers=2,
    size=500,
    activation=tf.nn.relu,
    output_activation=None,
    clip_param=0.2,
    entcoeff=0.0,
    gamma=0.99,
    lam=0.95,
    optim_epochs=10,
    optim_batchsize=64,
    schedule='linear',
    optim_stepsize=3e-4,
    timesteps_per_actorbatch=1000,
    BEHAVIORAL_CLONING=True,
    PPO=True,
):

    start = time.time()

    logz.configure_output_dir(logdir)

    print("-------- env info --------")
    print("observation_space: ", env.observation_space.shape)
    print("action_space: ", env.action_space.shape)
    print("BEHAVIORAL_CLONING: ", BEHAVIORAL_CLONING)
    print("PPO: ", PPO)

    print(" ")

    random_controller = RandomController(env)
    model_data_buffer = DataBuffer()

    ppo_data_buffer = DataBuffer_general(BC_BUFFER_SIZE, 6)
    bc_data_buffer = DataBuffer_general(BC_BUFFER_SIZE, 2)

    # sample path
    print("collecting random data .....  ")
    paths = sample(env,
                   random_controller,
                   num_paths=num_paths_random,
                   horizon=env_horizon,
                   render=False,
                   verbose=False)

    # add into buffer
    for path in paths:
        for n in range(len(path['observations'])):
            model_data_buffer.add(path['observations'][n], path['actions'][n],
                                  path['next_observations'][n])

    print("model data buffer size: ", model_data_buffer.size)

    normalization = compute_normalization(model_data_buffer)

    #========================================================
    #
    # Build the dynamics model, MPC controllers, and behavioral cloning network.
    #
    sess = tf.Session()

    dyn_model = NNDynamicsModel(env=env,
                                n_layers=n_layers,
                                size=size,
                                activation=activation,
                                output_activation=output_activation,
                                normalization=normalization,
                                batch_size=batch_size,
                                iterations=dynamics_iters,
                                learning_rate=learning_rate,
                                sess=sess)

    mpc_controller = MPCcontroller(env=env,
                                   dyn_model=dyn_model,
                                   horizon=mpc_horizon,
                                   cost_fn=cost_fn,
                                   num_simulated_paths=num_simulated_paths)

    policy_nn = MlpPolicy_bc(sess=sess,
                             env=env,
                             hid_size=64,
                             num_hid_layers=2,
                             clip_param=clip_param,
                             entcoeff=entcoeff)

    bc_net = BCnetwork(sess, env, BATCH_SIZE_BC, learning_rate)

    mpc_controller_bc_ppo = MPCcontroller_BC_PPO(
        env=env,
        dyn_model=dyn_model,
        bc_ppo_network=policy_nn,
        horizon=mpc_horizon,
        cost_fn=cost_fn,
        num_simulated_paths=num_simulated_paths)

    #========================================================
    #
    # Tensorflow session building.
    #
    sess.__enter__()
    tf.global_variables_initializer().run()

    # init or load checkpoint with saver
    saver = tf.train.Saver()

    checkpoint = tf.train.get_checkpoint_state(CHECKPOINT_DIR)

    if checkpoint and checkpoint.model_checkpoint_path and LOAD_MODEL:
        saver.restore(sess, checkpoint.model_checkpoint_path)
        print("checkpoint loaded:", checkpoint.model_checkpoint_path)
    else:
        print("Could not find old checkpoint")
        if not os.path.exists(CHECKPOINT_DIR):
            os.mkdir(CHECKPOINT_DIR)

    #========================================================
    #
    # Prepare for rollouts
    #

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=100)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=100)  # rolling buffer for episode rewards
    max_timesteps = num_paths_onpol * env_horizon

    for itr in range(onpol_iters):

        print("onpol_iters: ", itr)
        dyn_model.fit(model_data_buffer)

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0)
        else:
            raise NotImplementedError("unknown schedule: %s" % schedule)

        # saver.save(sess, CHECKPOINT_DIR)
        behavioral_cloning_eval(sess, env, policy_nn, env_horizon)

        ppo_data_buffer.clear()
        seg = traj_segment_generator(policy_nn, mpc_controller,
                                     mpc_controller_bc_ppo, bc_data_buffer,
                                     env, env_horizon)
        add_vtarg_and_adv(seg, gamma, lam)

        ob, ac, rew, nxt_ob, atarg, tdlamret = (
            seg["ob"], seg["ac"], seg["rew"], seg["nxt_ob"], seg["adv"], seg["tdlamret"])
        vpredbefore = seg["vpred"]  # predicted value function before update
        atarg = (atarg - atarg.mean()) / (atarg.std() + 1e-8)  # standardized advantage function estimate

        for n in range(len(ob)):
            ppo_data_buffer.add(
                (ob[n], ac[n], rew[n], nxt_ob[n], atarg[n], tdlamret[n]))
            bc_data_buffer.add((ob[n], ac[n]))
            model_data_buffer.add(ob[n], ac[n], nxt_ob[n])

        print("ppo_data_buffer size", ppo_data_buffer.size)
        print("bc_data_buffer size", bc_data_buffer.size)
        print("model data buffer size: ", model_data_buffer.size)

        # optim_batchsize = optim_batchsize or ob.shape[0]

        # behavioral_cloning(sess, env, bc_net, mpc_controller, env_horizon, bc_data_buffer, Training_epoch=1000)

        if hasattr(policy_nn, "ob_rms"):
            policy_nn.ob_rms.update(ob)  # update running mean/std for policy
        policy_nn.assign_old_eq_new()  # set old parameter values to new parameter values

        for op_ep in range(optim_epochs):
            # losses = [] # list of tuples, each of which gives the loss for a minibatch
            # for i in range(int(timesteps_per_actorbatch/optim_batchsize)):

            if PPO:
                sample_ob_no, sample_ac_na, sample_rew, sample_nxt_ob_no, sample_adv_n, sample_b_n_target = ppo_data_buffer.sample(
                    optim_batchsize)
                newlosses = policy_nn.lossandupdate_ppo(
                    sample_ob_no, sample_ac_na, sample_adv_n,
                    sample_b_n_target, cur_lrmult, optim_stepsize * cur_lrmult)
                # losses.append(newlosses)

            if BEHAVIORAL_CLONING:
                sample_ob_no, sample_ac_na = bc_data_buffer.sample(
                    optim_batchsize)
                policy_nn.update_bc(sample_ob_no, sample_ac_na,
                                    optim_stepsize * cur_lrmult)

            if op_ep % 100 == 0:
                print('epoch: ', op_ep)
                behavioral_cloning_eval(sess, env, policy_nn, env_horizon)

        lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values

        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1

        ep_lengths = seg["ep_lens"]
        returns = seg["ep_rets"]

        logz.log_tabular("Time", time.time() - start)
        logz.log_tabular("Iteration", iters_so_far)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        # logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", timesteps_so_far)
        logz.dump_tabular()
        logz.pickle_tf_vars()
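add_vtarg_and_adv above presumably computes GAE(lambda) advantages and TD(lambda) value targets in the usual way before the PPO update. A self-contained sketch of that computation is given below; the exact layout of the seg dictionary (rewards, value predictions, termination flags, bootstrap value) is an assumption.

import numpy as np


def gae_advantages_sketch(rewards, values, dones, last_value, gamma=0.99, lam=0.95):
    # Sketch of GAE(lambda): returns per-step advantages and TD(lambda) value targets.
    T = len(rewards)
    adv = np.zeros(T)
    lastgaelam = 0.0
    for t in reversed(range(T)):
        nonterminal = 1.0 - float(dones[t])          # no bootstrapping across episode ends
        next_value = last_value if t == T - 1 else values[t + 1]
        delta = rewards[t] + gamma * next_value * nonterminal - values[t]
        lastgaelam = delta + gamma * lam * nonterminal * lastgaelam
        adv[t] = lastgaelam
    tdlamret = adv + np.asarray(values, dtype=np.float64)
    return adv, tdlamret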
Ejemplo n.º 22
0
def train(env,
          cost_fn,
          logdir=None,
          render=False,
          learning_rate=1e-3,
          onpol_iters=10,
          dynamics_iters=60,
          batch_size=512,
          num_paths_random=10,
          num_paths_onpol=10,
          num_simulated_paths=10000,
          env_horizon=1000,
          mpc_horizon=10,
          n_layers=2,
          size=500,
          activation=tf.nn.relu,
          output_activation=None):
    """

    Arguments:

    onpol_iters                 Number of iterations of onpolicy aggregation for the loop to run. 

    dynamics_iters              Number of iterations of training for the dynamics model
    |_                          which happen per iteration of the aggregation loop.

    batch_size                  Batch size for dynamics training.

    num_paths_random            Number of paths/trajectories/rollouts generated 
    |                           by a random agent. We use these to train our 
    |_                          initial dynamics model.
    
    num_paths_onpol             Number of paths to collect at each iteration of
    |_                          aggregation, using the Model Predictive Control policy.

    num_simulated_paths         How many fictitious rollouts the MPC policy
    |                           should generate each time it is asked for an
    |_                          action.

    env_horizon                 Number of timesteps in each path.

    mpc_horizon                 The MPC policy generates actions by imagining 
    |                           fictitious rollouts, and picking the first action
    |                           of the best fictitious rollout. This argument is
    |                           how many timesteps should be in each fictitious
    |_                          rollout.

    n_layers/size/activations   Neural network architecture arguments. 

    """

    logz.configure_output_dir(logdir)

    #========================================================
    #
    # First, we need a lot of data generated by a random
    # agent, with which we'll begin to train our dynamics
    # model.

    random_controller = RandomController(env)
    """ YOUR CODE HERE """
    paths = sample(env, random_controller, num_paths=50)
    first = True
    for path in paths:
        if first:
            data = {
                "observations": path['observations'],
                "next_observations": path['next_observations'],
                "rewards": path['rewards'],
                "actions": path['actions'],
                "returns": path['returns']
            }
            first = False
        else:
            data['observations'] = np.vstack(
                (data['observations'], path['observations']))
            data['next_observations'] = np.vstack(
                (data['next_observations'], path['next_observations']))
            data['rewards'] = np.vstack((data['rewards'], path['rewards']))
            data['actions'] = np.vstack((data['actions'], path['actions']))
            data['returns'] = np.vstack((data['returns'], path['returns']))

    #========================================================
    #
    # The random data will be used to get statistics (mean
    # and std) for the observations, actions, and deltas
    # (where deltas are o_{t+1} - o_t). These will be used
    # for normalizing inputs and denormalizing outputs
    # from the dynamics network.
    #
    """ YOUR CODE HERE """
    normalization = compute_normalization(data)

    #========================================================
    #
    # Build dynamics model and MPC controllers.
    #
    sess = tf.Session()

    dyn_model = NNDynamicsModel(env=env,
                                n_layers=n_layers,
                                size=size,
                                activation=activation,
                                output_activation=output_activation,
                                normalization=normalization,
                                batch_size=batch_size,
                                iterations=dynamics_iters,
                                learning_rate=learning_rate,
                                sess=sess)

    mpc_controller = MPCcontroller(env=env,
                                   dyn_model=dyn_model,
                                   horizon=mpc_horizon,
                                   cost_fn=cost_fn,
                                   num_simulated_paths=num_simulated_paths)

    #========================================================
    #
    # Tensorflow session building.
    #
    sess.__enter__()
    tf.global_variables_initializer().run()

    #========================================================
    #
    # Take multiple iterations of on-policy aggregation: at each iteration, refit the dynamics
    # model to the current dataset, then collect on-policy samples and aggregate them into the dataset.
    # Note: You don't need to use a mixing ratio in this assignment for new and old data as described in https://arxiv.org/abs/1708.02596
    #

    #open CSV
    csv_file = open('results.csv', 'w')
    writer = csv.writer(csv_file, delimiter=',')

    for itr in range(onpol_iters):
        print(itr)
        costs = []
        returns = []
        """ YOUR CODE HERE """
        dyn_model.fit(data)
        #plot_comparison(env,dyn_model)
        mpc_controller.dyn_model = dyn_model  # not strictly needed: mpc_controller already holds a reference to this dyn_model, which fit() updates in place
        new_paths = sample(env, mpc_controller)
        for path in new_paths:
            cost = path_cost(cost_fn, path)
            costs.append(cost)
            returns.append(path['returns'][-1])

            data['observations'] = np.vstack(
                (data['observations'], path['observations']))
            data['next_observations'] = np.vstack(
                (data['next_observations'], path['next_observations']))
            data['actions'] = np.vstack((data['actions'], path['actions']))

        dyn_model.normalization = compute_normalization(data)

        writer.writerow([itr, np.mean(returns)])

        # LOGGING
        # Statistics for performance of MPC policy using
        # our learned dynamics model
        logz.log_tabular('Iteration', itr)
        # In terms of cost function which your MPC controller uses to plan
        logz.log_tabular('AverageCost', np.mean(costs))
        logz.log_tabular('StdCost', np.std(costs))
        logz.log_tabular('MinimumCost', np.min(costs))
        logz.log_tabular('MaximumCost', np.max(costs))
        # In terms of true environment reward of your rolled out trajectory using the MPC controller
        logz.log_tabular('AverageReturn', np.mean(returns))
        logz.log_tabular('StdReturn', np.std(returns))
        logz.log_tabular('MinimumReturn', np.min(returns))
        logz.log_tabular('MaximumReturn', np.max(returns))

        logz.dump_tabular()
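compute_normalization is only called in these examples, never defined. A minimal sketch of the statistics it presumably collects (per the comment block above: per-dimension means and standard deviations of observations, deltas o_{t+1} - o_t, and actions) follows; the tuple layout is an assumption based on how normalization is indexed later in these examples (normalization[0], normalization[1]).

import numpy as np


def compute_normalization_sketch(data, eps=1e-8):
    # Sketch only: the real helper may return these statistics in a different container.
    obs = data['observations']
    deltas = data['next_observations'] - data['observations']
    acs = data['actions']
    return (obs.mean(axis=0), obs.std(axis=0) + eps,
            deltas.mean(axis=0), deltas.std(axis=0) + eps,
            acs.mean(axis=0), acs.std(axis=0) + eps)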
Ejemplo n.º 23
0
def train(state_cb,
          pub_cmd,
          pub_act,
          rate,
          cost_fn,
          logdir=None,
          render=False,
          learning_rate=1e-3,
          onpol_iters=10,
          dynamics_iters=60,
          batch_size=512,
          num_paths_random=10,
          num_paths_onpol=10,
          num_simulated_paths=10000,
          env_horizon=1000,
          mpc_horizon=15,
          n_layers=2,
          size=500,
          activation=tf.nn.relu,
          output_activation=None):
    """

    Arguments:

    onpol_iters                 Number of iterations of onpolicy aggregation for the loop to run.

    dynamics_iters              Number of iterations of training for the dynamics model
    |_                          which happen per iteration of the aggregation loop.

    batch_size                  Batch size for dynamics training.

    num_paths_random            Number of paths/trajectories/rollouts generated
    |                           by a random agent. We use these to train our
    |_                          initial dynamics model.

    num_paths_onpol             Number of paths to collect at each iteration of
    |_                          aggregation, using the Model Predictive Control policy.

    num_simulated_paths         How many fictitious rollouts the MPC policy
    |                           should generate each time it is asked for an
    |_                          action.

    env_horizon                 Number of timesteps in each path.

    mpc_horizon                 The MPC policy generates actions by imagining
    |                           fictitious rollouts, and picking the first action
    |                           of the best fictitious rollout. This argument is
    |                           how many timesteps should be in each fictitious
    |_                          rollout.

    n_layers/size/activations   Neural network architecture arguments.

    """

    logz.configure_output_dir(logdir)

    #========================================================
    #
    # First, we need a lot of data generated by a random
    # agent, with which we'll begin to train our dynamics
    # model.

    rand_controller = RandomController()
    paths = sample(state_cb, pub_cmd, pub_act, rate, rand_controller,
                   num_paths_random, env_horizon, render)
    data = paths_to_array(paths)

    #========================================================
    #
    # The random data will be used to get statistics (mean
    # and std) for the observations, actions, and deltas
    # (where deltas are o_{t+1} - o_t). These will be used
    # for normalizing inputs and denormalizing outputs
    # from the dynamics network.
    #
    normalization = compute_normalization(data)

    #========================================================
    #
    # Build dynamics model and MPC controllers.
    #
    sess = tf.Session()

    dyn_model = NNDynamicsModel(n_layers=n_layers,
                                size=size,
                                activation=activation,
                                output_activation=output_activation,
                                normalization=normalization,
                                batch_size=batch_size,
                                iterations=dynamics_iters,
                                learning_rate=learning_rate,
                                sess=sess)

    mpc_controller = MPCcontroller(dyn_model=dyn_model,
                                   horizon=mpc_horizon,
                                   cost_fn=cost_fn,
                                   num_simulated_paths=num_simulated_paths)

    #========================================================
    #
    # Tensorflow session building.
    #
    sess.__enter__()
    tf.global_variables_initializer().run()

    #========================================================
    #
    # Take multiple iterations of on-policy aggregation: at each iteration, refit the dynamics
    # model to the current dataset, then collect on-policy samples and aggregate them into the dataset.
    # Note: You don't need to use a mixing ratio in this assignment for new and old data as described in https://arxiv.org/abs/1708.02596
    #
    for itr in range(onpol_iters):
        # Fit dynamics model
        print('Training dynamics model...')
        dyn_model.fit(data)
        plot_comparison(dyn_model, state_cb, pub_act, pub_cmd, rate)
        mpc_controller.dyn_model = dyn_model
        costs = []
        returns = []
        # Do MPC
        for i in range(num_paths_onpol):
            print('On policy path: %i' % i)
            obs_t, obs_tp1, acs_t, rews_t = [], [], [], []
            s_t = state_cb.reset(pub_act, pub_cmd)
            total_return = 0

            for j in range(env_horizon):
                # print('Timestep: %i, Return: %g' % (j,total_return))
                a_t = mpc_controller.get_action(s_t)
                s_tp1, _ = state_cb.step(a_t, pub_act, pub_cmd)
                r_t = 0
                for b in range(9):  # separate index so the on-policy path counter i is not shadowed
                    r_t += s_tp1[b * 12] - s_t[b * 12]
                total_return += r_t

                if render:
                    env.render()
                    time.sleep(0.05)

                obs_t.append(s_t)
                obs_tp1.append(s_tp1)
                acs_t.append(a_t)
                rews_t.append(r_t)

                s_t = s_tp1

            path = {
                "observations": np.array(obs_t),
                "next_observations": np.array(obs_tp1),
                "actions": np.array(acs_t),
                "rewards": np.array(rews_t)
            }
            total_cost = path_cost(cost_fn, path)

            paths.append(path)
            returns.append(total_return)
            costs.append(total_cost)
            print('Total cost: %g, Total reward: %g' %
                  (total_cost, total_return))

        data = paths_to_array(paths)
        normalization = compute_normalization(data)
        # Set new normalization statistics for dynamics model
        dyn_model.normalization = normalization

        # LOGGING
        # Statistics for performance of MPC policy using
        # our learned dynamics model
        logz.log_tabular('Iteration', itr)
        # In terms of cost function which your MPC controller uses to plan
        logz.log_tabular('AverageCost', np.mean(costs))
        logz.log_tabular('StdCost', np.std(costs))
        logz.log_tabular('MinimumCost', np.min(costs))
        logz.log_tabular('MaximumCost', np.max(costs))
        # In terms of true environment reward of your rolled out trajectory using the MPC controller
        logz.log_tabular('AverageReturn', np.mean(returns))
        logz.log_tabular('StdReturn', np.std(returns))
        logz.log_tabular('MinimumReturn', np.min(returns))
        logz.log_tabular('MaximumReturn', np.max(returns))

        logz.dump_tabular()
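paths_to_array is likewise defined elsewhere; given how the path dictionaries are assembled above, it presumably just concatenates the per-path arrays into one flat dataset, roughly as sketched here.

import numpy as np


def paths_to_array_sketch(paths):
    # Sketch only: stack the per-path arrays into one dataset of transitions.
    return {
        'observations':      np.concatenate([p['observations'] for p in paths], axis=0),
        'next_observations': np.concatenate([p['next_observations'] for p in paths], axis=0),
        'actions':           np.concatenate([p['actions'] for p in paths], axis=0),
        'rewards':           np.concatenate([p['rewards'] for p in paths], axis=0),
    }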
Ejemplo n.º 24
0
def train_PG(
             exp_name='',
             env_name='',
             n_iter=100, 
             gamma=1.0, 
             min_timesteps_per_batch=1000, 
             max_path_length=None,
             learning_rate=5e-3, 
             reward_to_go=False, 
             animate=True, 
             logdir=None, 
             normalize_advantages=False,
             nn_baseline=False, 
             seed=0,
             # network arguments
             n_layers=1,
             size=32,

             # mb mpc arguments
             model_learning_rate=1e-3,
             onpol_iters=10,
             dynamics_iters=260,
             batch_size=512,
             num_paths_random=10, 
             num_paths_onpol=10, 
             num_simulated_paths=1000,
             env_horizon=1000, 
             mpc_horizon=10,
             m_n_layers=2,
             m_size=500,
             ):

    start = time.time()

    # Configure output directory for logging
    logz.configure_output_dir(logdir)

    # Log experimental parameters
    args = inspect.getargspec(train_PG)[0]
    locals_ = locals()
    params = {k: locals_[k] if k in locals_ else None for k in args}
    logz.save_params(params)

    # Set random seeds
    tf.set_random_seed(seed)
    np.random.seed(seed)

    # Make the gym environment
    # env = gym.make(env_name)
    env = HalfCheetahEnvNew()
    cost_fn = cheetah_cost_fn
    activation = tf.nn.relu
    output_activation = None

    # Is this env continuous, or discrete?
    discrete = isinstance(env.action_space, gym.spaces.Discrete)

    # Maximum length for episodes
    # max_path_length = max_path_length or env.spec.max_episode_steps
    max_path_length = max_path_length

    # Observation and action sizes
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.n if discrete else env.action_space.shape[0]

    # Print environment information
    print("-------- env info --------")
    print("Environment name: ", env_name)
    print("Action space is discrete: ", discrete)
    print("Action space dim: ", ac_dim)
    print("Observation space dim: ", ob_dim)
    print("Max_path_length ", max_path_length)




    #========================================================================================#
    # Random data collection
    #========================================================================================#

    random_controller = RandomController(env)
    data_buffer_model = DataBuffer()
    data_buffer_ppo = DataBuffer_general(10000, 4)

    # sample path
    print("collecting random data .....  ")
    paths = sample(env, 
               random_controller, 
               num_paths=num_paths_random, 
               horizon=env_horizon, 
               render=False,
               verbose=False)

    # add into buffer
    for path in paths:
        for n in range(len(path['observations'])):
            data_buffer_model.add(path['observations'][n], path['actions'][n], path['next_observations'][n])

    print("data buffer size: ", data_buffer_model.size)

    normalization = compute_normalization(data_buffer_model)

    #========================================================================================#
    # Tensorflow Engineering: Config, Session, Variable initialization
    #========================================================================================#
    tf_config = tf.ConfigProto()
    tf_config.allow_soft_placement = True
    tf_config.intra_op_parallelism_threads = 4
    tf_config.inter_op_parallelism_threads = 1
    sess = tf.Session(config=tf_config)

    dyn_model = NNDynamicsModel(env=env, 
                                n_layers=n_layers, 
                                size=size, 
                                activation=activation, 
                                output_activation=output_activation, 
                                normalization=normalization,
                                batch_size=batch_size,
                                iterations=dynamics_iters,
                                learning_rate=learning_rate,
                                sess=sess)

    mpc_controller = MPCcontroller(env=env, 
                                   dyn_model=dyn_model, 
                                   horizon=mpc_horizon, 
                                   cost_fn=cost_fn, 
                                   num_simulated_paths=num_simulated_paths)


    policy_nn = policy_network_ppo(sess, ob_dim, ac_dim, discrete, n_layers, size, learning_rate)

    if nn_baseline:
        value_nn = value_network(sess, ob_dim, n_layers, size, learning_rate)

    sess.__enter__() # equivalent to `with sess:`

    tf.global_variables_initializer().run()


    #========================================================================================#
    # Training Loop
    #========================================================================================#

    total_timesteps = 0

    for itr in range(n_iter):
        print("********** Iteration %i ************"%itr)

        if MPC:
            dyn_model.fit(data_buffer_model)
        returns = []
        costs = []

        # Collect paths until we have enough timesteps
        timesteps_this_batch = 0
        paths = []

        while True:
            # print("data buffer size: ", data_buffer_model.size)
            current_path = {'observations': [], 'actions': [], 'reward': [], 'next_observations':[]}

            ob = env.reset()
            obs, acs, mpc_acs, rewards = [], [], [], []
            animate_this_episode = (len(paths) == 0 and (itr % 10 == 0) and animate)
            steps = 0
            return_ = 0
 
            while True:
                # print("steps ", steps)
                if animate_this_episode:
                    env.render()
                    time.sleep(0.05)
                obs.append(ob)

                if MPC:
                    mpc_ac = mpc_controller.get_action(ob)
                else:
                    mpc_ac = random_controller.get_action(ob)

                ac = policy_nn.predict(ob, mpc_ac)

                ac = ac[0]

                if not PG:
                    ac = mpc_ac

                acs.append(ac)
                mpc_acs.append(mpc_ac)

                current_path['observations'].append(ob)

                ob, rew, done, _ = env.step(ac)

                current_path['reward'].append(rew)
                current_path['actions'].append(ac)
                current_path['next_observations'].append(ob)

                return_ += rew
                rewards.append(rew)

                steps += 1
                if done or steps > max_path_length:
                    break


            if MPC:
                # cost & return
                cost = path_cost(cost_fn, current_path)
                costs.append(cost)
                returns.append(return_)
                print("total return: ", return_)
                print("costs: ", cost)

                # add into buffers
                for n in range(len(current_path['observations'])):
                    data_buffer_model.add(current_path['observations'][n], current_path['actions'][n], current_path['next_observations'][n])

            for n in range(len(current_path['observations'])):
                data_buffer_ppo.add(current_path['observations'][n], current_path['actions'][n], current_path['reward'][n], current_path['next_observations'][n])
        
            path = {"observation" : np.array(obs), 
                    "reward" : np.array(rewards), 
                    "action" : np.array(acs),
                    "mpc_action" : np.array(mpc_acs)}



            paths.append(path)
            timesteps_this_batch += pathlength(path)
            # print("timesteps_this_batch", timesteps_this_batch)
            if timesteps_this_batch > min_timesteps_per_batch:
                break
        total_timesteps += timesteps_this_batch


        print("data_buffer_ppo.size:", data_buffer_ppo.size)


        # Build arrays for observation, action for the policy gradient update by concatenating 
        # across paths
        ob_no = np.concatenate([path["observation"] for path in paths])
        ac_na = np.concatenate([path["action"] for path in paths])
        mpc_ac_na = np.concatenate([path["mpc_action"] for path in paths])


        # Computing Q-values
     
        if reward_to_go:
            q_n = []
            for path in paths:
                for t in range(len(path["reward"])):
                    t_ = 0
                    q = 0
                    while t_ < len(path["reward"]):
                        if t_ >= t:
                            q += gamma**(t_-t) * path["reward"][t_]
                        t_ += 1
                    q_n.append(q)
            q_n = np.asarray(q_n)

        else:
            q_n = []
            for path in paths:
                for t in range(len(path["reward"])):
                    t_ = 0
                    q = 0
                    while t_ < len(path["reward"]):
                        q += gamma**t_ * path["reward"][t_]
                        t_ += 1
                    q_n.append(q)
            q_n = np.asarray(q_n)


        # Computing Baselines
        if nn_baseline:

            # b_n = sess.run(baseline_prediction, feed_dict={sy_ob_no :ob_no})
            b_n = value_nn.predict(ob_no)
            b_n = normalize(b_n)
            b_n = denormalize(b_n, np.std(q_n), np.mean(q_n))
            adv_n = q_n - b_n
        else:
            adv_n = q_n.copy()

        # Advantage Normalization
        if normalize_advantages:
            adv_n = normalize(adv_n)

        # Optimizing Neural Network Baseline
        if nn_baseline:
            b_n_target = normalize(q_n)
            value_nn.fit(ob_no, b_n_target)
                # sess.run(baseline_update_op, feed_dict={sy_ob_no :ob_no, sy_baseline_target_n:b_n_target})


        # Performing the Policy Update

        # policy_nn.fit(ob_no, ac_na, adv_n)
        policy_nn.fit(ob_no, ac_na, adv_n, mpc_ac_na)

        # sess.run(update_op, feed_dict={sy_ob_no :ob_no, sy_ac_na:ac_na, sy_adv_n:adv_n})

        # Log diagnostics
        returns = [path["reward"].sum() for path in paths]
        ep_lengths = [pathlength(path) for path in paths]
        logz.log_tabular("Time", time.time() - start)
        logz.log_tabular("Iteration", itr)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", total_timesteps)
        logz.dump_tabular()
        logz.pickle_tf_vars()
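The reward-to-go and full-return loops above are quadratic in the path length, since the inner while-loop rescans the whole reward sequence for every timestep. An equivalent linear-time version is sketched below for reference; it computes the same quantities with a single backward pass (illustrative only, not part of the original code).

import numpy as np


def reward_to_go_sketch(rewards, gamma):
    # q[t] = sum over t' >= t of gamma**(t' - t) * rewards[t'], via one backward pass.
    q = np.zeros(len(rewards))
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        q[t] = running
    return q


def full_return_sketch(rewards, gamma):
    # Every timestep is assigned the discounted return of the whole trajectory.
    total = sum(gamma ** t * r for t, r in enumerate(rewards))
    return np.full(len(rewards), total)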
Ejemplo n.º 25
0
Archivo: main.py Proyecto: Snowstu/MBMF
def train(env,
          cost_fn,
          logdir=None,
          render=False,
          learning_rate=1e-3,
          onpol_iters=10,
          dynamics_iters=60,
          batch_size=512,
          num_paths_random=10,
          num_paths_onpol=10,
          num_simulated_paths=10000,
          env_horizon=1000,
          mpc_horizon=15,
          n_layers=2,
          size=500,
          activation='relu',
          output_activation=None):
    """

	Arguments:

	onpol_iters                 Number of iterations of onpolicy aggregation for the loop to run.

	dynamics_iters              Number of iterations of training for the dynamics model
	|_                          which happen per iteration of the aggregation loop.

	batch_size                  Batch size for dynamics training.

	num_paths_random            Number of paths/trajectories/rollouts generated
	|                           by a random agent. We use these to train our
	|_                          initial dynamics model.

	num_paths_onpol             Number of paths to collect at each iteration of
	|_                          aggregation, using the Model Predictive Control policy.

	num_simulated_paths         How many fictitious rollouts the MPC policy
	|                           should generate each time it is asked for an
	|_                          action.

	env_horizon                 Number of timesteps in each path.

	mpc_horizon                 The MPC policy generates actions by imagining
	|                           fictitious rollouts, and picking the first action
	|                           of the best fictitious rollout. This argument is
	|                           how many timesteps should be in each fictitious
	|_                          rollout.

	n_layers/size/activations   Neural network architecture arguments.

	"""
    logz.configure_output_dir(logdir)
    # ========================================================
    #
    # First, we need a lot of data generated by a random
    # agent, with which we'll begin to train our dynamics
    # model.
    """ YOUR CODE HERE """
    random_controller = RandomController(env)

    paths = sample(env,
                   random_controller,
                   num_paths=num_paths_random,
                   horizon=env_horizon,
                   ignore_done=True)  # 10

    # ========================================================
    #
    # The random data will be used to get statistics (mean
    # and std) for the observations, actions, and deltas
    # (where deltas are o_{t+1} - o_t). These will be used
    # for normalizing inputs and denormalizing outputs
    # from the dynamics network.
    #
    """ YOUR CODE HERE """
    # concatenate observations & actions to numpy data_rand_x
    # concatenate (next_observations -observations) to numpy data_rand_y
    for i in range(num_paths_random):
        if i == 0:
            data_rand_x = np.concatenate(
                (paths[i]['observations'], paths[i]['actions']), axis=1)
            data_rand_y = paths[i]['next_observations'] - paths[i][
                'observations']
        else:
            x = np.concatenate((paths[i]['observations'], paths[i]['actions']),
                               axis=1)
            data_rand_x = np.concatenate((data_rand_x, x), axis=0)
            y = paths[i]['next_observations'] - paths[i]['observations']
            data_rand_y = np.concatenate((data_rand_y, y), axis=0)

    # Initialize data set D to Drand
    data_x = data_rand_x
    data_y = data_rand_y

    # ========================================================
    #
    # Build dynamics model and MPC controllers.
    #

    # sess = tf.Session()

    # dyn_model = NNDynamicsModel(env=env,
    # 							n_layers=n_layers,
    # 							size=size,
    # 							activation=activation,
    # 							output_activation=output_activation,
    # 							batch_size=batch_size,
    # 							iterations=dynamics_iters,
    # 							learning_rate=learning_rate,
    # 							normalization=normalization
    # 							)
    dyn_model = NNDynamicsModel(
        env=env,
        hidden_size=(500, 500),
        activation=activation,  #'tanh'
    ).cuda()

    mpc_controller = MPCcontroller(
        env=env,
        dyn_model=dyn_model,
        horizon=mpc_horizon,
        cost_fn=cost_fn,
        num_simulated_paths=num_simulated_paths,
    )

    # ========================================================
    #
    # Tensorflow session building.
    #
    # sess.__enter__()
    # tf.global_variables_initializer().run()

    # ========================================================
    #
    # Take multiple iterations of on-policy aggregation: at each iteration, refit the dynamics
    # model to the current dataset, then collect on-policy samples and aggregate them into the dataset.
    # Note: You don't need to use a mixing ratio in this assignment for new and old data as described in https://arxiv.org/abs/1708.02596
    #

    # make dirs output
    if not (os.path.exists(logdir)):
        os.makedirs(logdir)
    path = os.path.join(logdir, 'model')
    if not (os.path.exists(path)):
        os.makedirs(path)

    for itr in range(onpol_iters):
        """ YOUR CODE HERE """

        if itr != 0:
            dyn_model.load_state_dict(torch.load(path + '/net_params.pkl'))

        # store data
        # if (itr % 9) == 0 or itr == (onpol_iters-1):
        if itr >= 0:
            logger = Logger(logdir, csvname='log_orig' + str(itr))
            data = np.concatenate((data_x, data_y), axis=1)
            logger.log_table2csv(data)
        if itr == 0:
            data_x += np.random.normal(0, 0.001, size=data_x.shape)
            data_y += np.random.normal(0, 0.001, size=data_y.shape)
        else:
            data_x = best_x + np.random.normal(0, 0.001, size=best_x.shape)
            data_y = best_y + np.random.normal(0, 0.001, size=best_y.shape)

        dyn_model.fit(data_x,
                      data_y,
                      epoch_size=dynamics_iters,
                      batch_size=batch_size,
                      test=True)

        torch.save(dyn_model.state_dict(),
                   path + '/net_params.pkl')  # save only the parameters
        torch.save(dyn_model,
                   path + '/net' + str(itr) + '.pkl')  # save entire net

        print('-------------Itr %d-------------' % itr)
        print('Start time:\n')
        print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))

        start = time.time()  # start of runtime measurement
        # sample
        if Monitor is True:
            monitor_path = os.path.join(logdir, 'monitor' + str(itr))
            env = wrappers.Monitor(env, monitor_path, force=True)

        paths = sample(env,
                       mpc_controller,
                       num_paths=num_paths_onpol,
                       horizon=env_horizon,
                       render=False,
                       ignore_done=False,
                       MPC=True)

        end = time.time()
        runtime2 = end - start
        print('runtime = ', runtime2)

        print('End time:\n')
        print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))

        # concatenate observations & actions to numpy data_rand_x
        # concatenate (next_observations -observations) to numpy data_rand_y
        for i in range(num_paths_onpol):
            if i == 0:
                data_rl_x = np.concatenate(
                    (paths[i]['observations'], paths[i]['actions']), axis=1)
                data_rl_y = paths[i]['next_observations'] - paths[i][
                    'observations']
            else:
                x = np.concatenate(
                    (paths[i]['observations'], paths[i]['actions']), axis=1)
                data_rl_x = np.concatenate((data_rl_x, x), axis=0)
                y = paths[i]['next_observations'] - paths[i]['observations']
                data_rl_y = np.concatenate((data_rl_y, y), axis=0)

        # Aggregate data
        data_x = np.concatenate((data_x, data_rl_x), axis=0)
        data_y = np.concatenate((data_y, data_rl_y), axis=0)

        costs = np.zeros((num_paths_onpol, 1))
        returns = np.zeros((num_paths_onpol, 1))
        for i in range(num_paths_onpol):
            costs[i] = paths[i]['cost']
            returns[i] = paths[i]['returns'][0]

        if itr == 0:
            best_x = data_rl_x
            best_y = data_rl_y
        else:
            best_x = np.concatenate((best_x, data_rl_x), axis=0)
            best_y = np.concatenate((best_y, data_rl_y), axis=0)
        # store data
        #if (itr % 9) == 0 or itr == (onpol_iters-1):
        if itr >= 0:
            logger = Logger(logdir, csvname='best' + str(itr))
            data = np.concatenate((best_x, best_y), axis=1)
            logger.log_table2csv(data)

        # LOGGING
        # Statistics for performance of MPC policy using
        # our learned dynamics model
        logz.log_tabular('Iteration', itr)
        # In terms of cost function which your MPC controller uses to plan
        logz.log_tabular('AverageCost', np.mean(costs))
        logz.log_tabular('StdCost', np.std(costs))
        logz.log_tabular('MinimumCost', np.min(costs))
        logz.log_tabular('MaximumCost', np.max(costs))
        # In terms of true environment reward of your rolled out trajectory using the MPC controller
        logz.log_tabular('AverageReturn', np.mean(returns))
        logz.log_tabular('StdReturn', np.std(returns))
        logz.log_tabular('MinimumReturn', np.min(returns))
        logz.log_tabular('MaximumReturn', np.max(returns))

        logz.dump_tabular()
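In this example NNDynamicsModel is a PyTorch module (note the .cuda() call, state_dict saving, and the fit(data_x, data_y, epoch_size, batch_size) interface); its real definition lives elsewhere in the project. A minimal sketch of such an (s, a) -> delta_s regressor is shown below, with assumed hidden sizes, optimizer, and loss.

import torch
import torch.nn as nn


class DynamicsRegressorSketch(nn.Module):
    # Sketch only: the project's NNDynamicsModel may normalize inputs, use a different
    # optimizer, or expose a different fit() signature.
    def __init__(self, input_dim, output_dim, hidden_size=(500, 500)):
        super().__init__()
        layers, last = [], input_dim
        for h in hidden_size:
            layers += [nn.Linear(last, h), nn.ReLU()]
            last = h
        layers.append(nn.Linear(last, output_dim))
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        return self.net(x)

    def fit(self, data_x, data_y, epoch_size=60, batch_size=512, lr=1e-3):
        # Minimise MSE between predicted and observed state deltas over the aggregated data.
        opt = torch.optim.Adam(self.parameters(), lr=lr)
        x = torch.as_tensor(data_x, dtype=torch.float32)
        y = torch.as_tensor(data_y, dtype=torch.float32)
        for _ in range(epoch_size):
            perm = torch.randperm(len(x))
            for start in range(0, len(x), batch_size):
                idx = perm[start:start + batch_size]
                loss = nn.functional.mse_loss(self(x[idx]), y[idx])
                opt.zero_grad()
                loss.backward()
                opt.step()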
Ejemplo n.º 26
0
def train(env,
         cost_fn,
         load_model,
         model_path,
         logdir=None,
         render=False,
         learning_rate_dyn=1e-3,
         learning_rate_policy=1e-4,
         onpol_iters=10,
         dynamics_iters=60,
         policy_iters=100,
         batch_size=512,
         num_paths_random=10,
         num_paths_onpol=5,
         num_simulated_paths=10000,
         env_horizon=1000,
         mpc_horizon=15,
         n_layers=2,
         size=500,
         activation=tf.nn.relu,
         output_activation=None,
         ):

    """

    Arguments:

    onpol_iters                 Number of iterations of onpolicy aggregation for the loop to run.

    dynamics_iters              Number of iterations of training for the dynamics model
    |_                          which happen per iteration of the aggregation loop.

    batch_size                  Batch size for dynamics training.

    num_paths_random            Number of paths/trajectories/rollouts generated
    |                           by a random agent. We use these to train our
    |_                          initial dynamics model.

    num_paths_onpol             Number of paths to collect at each iteration of
    |_                          aggregation, using the Model Predictive Control policy.

    num_simulated_paths         How many fictitious rollouts the MPC policy
    |                           should generate each time it is asked for an
    |_                          action.

    env_horizon                 Number of timesteps in each path.

    mpc_horizon                 The MPC policy generates actions by imagining
    |                           fictitious rollouts, and picking the first action
    |                           of the best fictitious rollout. This argument is
    |                           how many timesteps should be in each fictitious
    |_                          rollout.

    n_layers/size/activations   Neural network architecture arguments.

    """

    #logz.configure_output_dir(logdir)

    #========================================================
    #
    # First, we need a lot of data generated by a random
    # agent, with which we'll begin to train our dynamics
    # model.

    random_controller = RandomController(env)

    """ YOUR CODE HERE """
    data = sample(env, random_controller, num_paths_random, env_horizon)


    #========================================================
    #
    # The random data will be used to get statistics (mean
    # and std) for the observations, actions, and deltas
    # (where deltas are o_{t+1} - o_t). These will be used
    # for normalizing inputs and denormalizing outputs
    # from the dynamics network.
    #
    """ YOUR CODE HERE """
    normalization = compute_normalization(data)


    #========================================================
    #
    # Build dynamics model and MPC controllers.
    #
    sess = tf.Session()

    dyn_model = NNDynamicsModel(env=env,
                                n_layers=n_layers,
                                size=size,
                                activation=activation,
                                output_activation=output_activation,
                                normalization=normalization,
                                batch_size=batch_size,
                                iterations=dynamics_iters,
                                learning_rate=learning_rate_dyn,
                                sess=sess)

    policy = NNPolicy(env=env,
                    normalization=normalization,
                    batch_size=batch_size,
                    iterations=policy_iters,
                    learning_rate=learning_rate_policy,
                    sess=sess,
                    model_path=model_path,
                    save_path="./policy/",
                    load_model=load_model)

    mpc_controller = MPCcontroller(env=env,
                                   dyn_model=dyn_model,
                                   horizon=mpc_horizon,
                                   cost_fn=cost_fn,
                                   num_simulated_paths=num_simulated_paths)

    lqr_controller = LQRcontroller(env=env,
                                   delta=0.005,
                                   T=50,
                                   dyn_model=dyn_model,
                                   cost_fn=cost_fn,
                                   iterations=1)

    comm = MPI.COMM_WORLD
    comm_size = comm.Get_size()  # renamed to avoid shadowing the network-width argument `size`
    rank = comm.Get_rank()

    #========================================================
    #
    # Tensorflow session building.
    #

    sess.__enter__()
    tf.global_variables_initializer().run()

    #========================================================
    #
    # Take multiple iterations of on-policy aggregation: at each iteration, refit the dynamics
    # model to the current dataset, then collect on-policy samples and aggregate them into the dataset.
    # Note: You don't need to use a mixing ratio in this assignment for new and old data as described in https://arxiv.org/abs/1708.02596
    #

    # training the MPC controller as well as dynamics
    for itr in range(onpol_iters):

        print("fitting dynamics for worker ", rank)
        dyn_model.fit(data)
        print("sampling new trajectories from worker ", rank)
        new_data = sample(env, lqr_controller, num_paths_onpol, env_horizon)

        data += new_data
        comm.send(new_data, 0)

        if rank == 0:
            costs, returns = [], []

            for path in data:

                costs.append(path_cost(cost_fn, path))
                returns.append(np.sum(path['rewards']))

            print("returns ",returns)

            for i in range(1, comm_size):
                data += comm.recv(source=i)

            print("fitting policy...")
            policy.fit(data)

            # LOGGING
            # Statistics for performance of MPC policy using
            # our learned dynamics model
            logz.log_tabular('Iteration', itr)
            # In terms of cost function which your MPC controller uses to plan
            logz.log_tabular('AverageCost', np.mean(costs))
            logz.log_tabular('StdCost', np.std(costs))
            logz.log_tabular('MinimumCost', np.min(costs))
            logz.log_tabular('MaximumCost', np.max(costs))
            # In terms of true environment reward of your rolled out trajectory using the MPC controller
            logz.log_tabular('AverageReturn', np.mean(returns))
            logz.log_tabular('StdReturn', np.std(returns))
            logz.log_tabular('MinimumReturn', np.min(returns))
            logz.log_tabular('MaximumReturn', np.max(returns))

            logz.dump_tabular()

    # applying the learned neural policy
    if rank == 0:
        ob = env.reset()

        while True:
            a = policy.get_action(ob.reshape((1, ob.shape[0])))

            # control clipping to be added

            next_ob, reward, done, info = env.step(a[0])
            print("action", a)
            print("predicted ob", dyn_model.predict(ob, a))
            print("actual ob", (next_ob - normalization[0]) / (normalization[1] + 1e-10))
            env.render()
            ob = next_ob
            if done:
                ob = env.reset()
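The MPCcontroller used throughout these examples is described in the docstrings as imagining num_simulated_paths fictitious rollouts of length mpc_horizon under the learned dynamics model and returning the first action of the cheapest one. A minimal random-shooting sketch of that idea follows; the batched predict function and per-transition cost function are assumptions about the interface, not the project's actual implementation.

import numpy as np


def random_shooting_mpc_sketch(state, action_space, predict_fn, cost_fn,
                               horizon=15, num_simulated_paths=1000):
    # Sample random action sequences, roll them out through the learned model,
    # and return the first action of the imagined trajectory with the lowest cost.
    states = np.repeat(state[None, :], num_simulated_paths, axis=0)
    actions = np.stack([
        np.stack([action_space.sample() for _ in range(num_simulated_paths)])
        for _ in range(horizon)
    ])                                    # shape: (horizon, num_paths, ac_dim)
    costs = np.zeros(num_simulated_paths)
    for t in range(horizon):
        next_states = predict_fn(states, actions[t])        # assumed batched one-step model
        costs += np.array([cost_fn(s, a, sn)
                           for s, a, sn in zip(states, actions[t], next_states)])
        states = next_states
    return actions[0, np.argmin(costs)]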
Ejemplo n.º 27
0
Archivo: main.py Proyecto: amoliu/MPC
    |                           how many timesteps should be in each fictitious
    |_                          rollout.

    n_layers/size/activations   Neural network architecture arguments.

    """

    logz.configure_output_dir(logdir)

    #========================================================
    #
    # First, we need a lot of data generated by a random
    # agent, with which we'll begin to train our dynamics
    # model.

    random_controller = RandomController(env)

    """ YOUR CODE HERE """
    data = sample(env, random_controller, num_paths_random, env_horizon)


    #========================================================
    #
    # The random data will be used to get statistics (mean
    # and std) for the observations, actions, and deltas
    # (where deltas are o_{t+1} - o_t). These will be used
    # for normalizing inputs and denormalizing outputs
    # from the dynamics network.
    #
    """ YOUR CODE HERE """
    normalization = compute_normalization(data)
Ejemplo n.º 28
0
def get_dynamics_test_loss(env, dyn_model):
    return dyn_model.get_loss_on_data(sample(env, RandomController(env)))
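get_loss_on_data is not shown in these examples; presumably it scores the model's one-step predictions against freshly sampled transitions, for instance with a mean squared error along these lines (the path dictionary keys are assumed to match the other examples).

import numpy as np


def one_step_prediction_loss_sketch(dyn_model, paths):
    # Mean squared error between one-step model predictions and observed next states.
    errors = []
    for path in paths:
        for s, a, s_next in zip(path['observations'], path['actions'],
                                path['next_observations']):
            errors.append(np.mean((dyn_model.predict(s, a) - s_next) ** 2))
    return float(np.mean(errors))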