Example 1
    def initialize(self):

        R = rospy.get_param('~train_pub_rate', 100)
        self.rate = rospy.Rate(R)
        self.buffer_duration = rospy.get_param('~buffer_duration', 0.1)
        self.buffer_size = rospy.get_param('~buffer_size', 500)

        # Robot-specific setup implemented by derived class
        self.setup_robot()

        # Jitter robot initially
        XI, YI = self.jitter_robot()
        num_joints = np.size(YI, 1)

        # Create and initialize SOLAR_GP model
        self.solar = LocalModels(self.num_inducing,
                                 wgen=self.wgen,
                                 xdim=3,
                                 ndim=num_joints * 2)
        self.solar.initializeF(XI, YI)

        # Serialize SOLAR_GP model into custom message and publish
        SolarMsg = self.constructMsg(self.solar)
        self.pub_solar.publish(SolarMsg)

        # Create Data buffer listening on training input-output topics
        self.TrainData = DataBuffer(self.x_topic, self.y_topic,
                                    self.joint_names, self.buffer_duration,
                                    self.buffer_size)
Example 2
# Imports assumed by this snippet (Keras with a TensorFlow 1.x backend);
# DataBuffer is provided by the surrounding project.
import queue

import tensorflow as tf
from keras import backend as K
from keras.models import model_from_json


def customLoss(y_true, y_pred):
    # Mean squared error averaged over the last axis
    return K.mean(K.square(y_pred - y_true), axis=-1)


with open('model.json', 'r') as jfile:
    model = model_from_json(jfile.read())

model.compile("adam", "mse")
weights_file = 'weights.0209-0.046.hdf5'
model.load_weights(weights_file)

# model = load_model("multiModel.h5", custom_objects={'customLoss':customLoss})
graph = tf.get_default_graph()
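# The default graph is captured so the loaded model can be used later from
# another thread/callback via `with graph.as_default():`, a common pattern
# with Keras on a TF1 backend (presumably the intent here).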

data_buffer = DataBuffer()
res_queue = queue.Queue(maxsize=1)
#
# idxs = [0, 1, 2]
# means = [-122.33790211, 39.53881540, 62.68238949]
# stds = [0.00099555, 0.00180817, 13.48539298]

# def normalize_vector(xVec):
#     for i, mean, std in zip(idxs, means, stds):
#         xVec[i] -= mean
#         xVec[i] /= std
#     return xVec


def copyImage(byte_array, imageSize):
    if imageSize > 8:
Example 3
# topic /sensors/odom
from nav_msgs.msg import Odometry

# topic /darknet_ros/bounding_boxes
from darknet_ros_msgs.msg import BoundingBoxes

# topic /scan
from sensor_msgs.msg import LaserScan
# scan.ranges[179] = distance between the robot and an object

object_searched = [
    "stop sign", "backpack", "refrigerator", "motorbike", "pottedplant",
    "suitcase", "teddy bear"
]

odom_bag = DataBuffer(maxlen=1000)
bounding_boxes_bag = DataBuffer()
scan_bag = DataBuffer(maxlen=1000)
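# The DataBuffer bags keep recent messages from each topic so that readings can
# be matched across topics by timestamp via get_closest_to() (see
# _process_bounding_box below).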

objects = []
previous_robot_position = None
previous_object = None


def _process_bounding_box(bounding_box):
    global previous_robot_position, previous_object
    current_robot_position = odom_bag.get_closest_to(
        bounding_box.image_header.stamp)
    distance_from_object = scan_bag.get_closest_to(
        bounding_box.image_header.stamp)
    # TODO: check whether it is the same object or a different one
Example 4
def train_PG(
             exp_name='',
             env_name='',
             n_iter=100, 
             gamma=1.0, 
             min_timesteps_per_batch=1000, 
             max_path_length=None,
             learning_rate=5e-3, 
             reward_to_go=False, 
             animate=True, 
             logdir=None, 
             normalize_advantages=False,
             nn_baseline=False, 
             seed=0,
             # network arguments
             n_layers=1,
             size=32,

             # mb mpc arguments
             model_learning_rate=1e-3,
             onpol_iters=10,
             dynamics_iters=260,
             batch_size=512,
             num_paths_random=10, 
             num_paths_onpol=10, 
             num_simulated_paths=1000,
             env_horizon=1000, 
             mpc_horizon=10,
             m_n_layers=2,
             m_size=500,
             ):

    start = time.time()

    # Configure output directory for logging
    logz.configure_output_dir(logdir)

    # Log experimental parameters
    args = inspect.getargspec(train_PG)[0]
    locals_ = locals()
    params = {k: locals_[k] if k in locals_ else None for k in args}
    logz.save_params(params)

    # Set random seeds
    tf.set_random_seed(seed)
    np.random.seed(seed)

    # Make the gym environment
    # env = gym.make(env_name)
    env = HalfCheetahEnvNew()
    cost_fn = cheetah_cost_fn
    activation = tf.nn.relu
    output_activation = None

    # Is this env continuous, or discrete?
    discrete = isinstance(env.action_space, gym.spaces.Discrete)

    # Maximum length for episodes
    # max_path_length = max_path_length or env.spec.max_episode_steps
    max_path_length = max_path_length

    # Observation and action sizes
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.n if discrete else env.action_space.shape[0]

    # Print environment information
    print("-------- env info --------")
    print("Environment name: ", env_name)
    print("Action space is discrete: ", discrete)
    print("Action space dim: ", ac_dim)
    print("Observation space dim: ", ob_dim)
    print("Max_path_length ", max_path_length)




    #========================================================================================#
    # Random data collection
    #========================================================================================#

    random_controller = RandomController(env)
    data_buffer_model = DataBuffer()
    data_buffer_ppo = DataBuffer_general(10000, 4)

    # sample path
    print("collecting random data .....  ")
    paths = sample(env, 
               random_controller, 
               num_paths=num_paths_random, 
               horizon=env_horizon, 
               render=False,
               verbose=False)

    # add into buffer
    for path in paths:
        for n in range(len(path['observations'])):
            data_buffer_model.add(path['observations'][n], path['actions'][n], path['next_observations'][n])

    print("data buffer size: ", data_buffer_model.size)

    normalization = compute_normalization(data_buffer_model)
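    # normalization holds mean/std statistics of the observations, actions and
    # state deltas collected so far; the dynamics model uses them to normalize
    # its inputs and denormalize its predicted deltas.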

    #========================================================================================#
    # Tensorflow Engineering: Config, Session, Variable initialization
    #========================================================================================#
    tf_config = tf.ConfigProto()
    tf_config.allow_soft_placement = True
    tf_config.intra_op_parallelism_threads = 4
    tf_config.inter_op_parallelism_threads = 1
    sess = tf.Session(config=tf_config)

    dyn_model = NNDynamicsModel(env=env, 
                                n_layers=n_layers, 
                                size=size, 
                                activation=activation, 
                                output_activation=output_activation, 
                                normalization=normalization,
                                batch_size=batch_size,
                                iterations=dynamics_iters,
                                learning_rate=learning_rate,
                                sess=sess)

    mpc_controller = MPCcontroller(env=env, 
                                   dyn_model=dyn_model, 
                                   horizon=mpc_horizon, 
                                   cost_fn=cost_fn, 
                                   num_simulated_paths=num_simulated_paths)


    policy_nn = policy_network_ppo(sess, ob_dim, ac_dim, discrete, n_layers, size, learning_rate)

    if nn_baseline:
        value_nn = value_network(sess, ob_dim, n_layers, size, learning_rate)

    sess.__enter__() # equivalent to `with sess:`

    tf.global_variables_initializer().run()


    #========================================================================================#
    # Training Loop
    #========================================================================================#

    total_timesteps = 0

    for itr in range(n_iter):
        print("********** Iteration %i ************"%itr)

        # MPC and PG appear to be module-level boolean flags: MPC enables the
        # dynamics-model refit and the MPC controller (otherwise a random
        # controller is used), and PG controls whether the policy's action is
        # actually executed in the environment.
        if MPC:
            dyn_model.fit(data_buffer_model)
        returns = []
        costs = []

        # Collect paths until we have enough timesteps
        timesteps_this_batch = 0
        paths = []

        while True:
            # print("data buffer size: ", data_buffer_model.size)
            current_path = {'observations': [], 'actions': [], 'reward': [], 'next_observations':[]}

            ob = env.reset()
            obs, acs, mpc_acs, rewards = [], [], [], []
            animate_this_episode = (len(paths) == 0 and (itr % 10 == 0) and animate)
            steps = 0
            return_ = 0
 
            while True:
                # print("steps ", steps)
                if animate_this_episode:
                    env.render()
                    time.sleep(0.05)
                obs.append(ob)

                if MPC:
                    mpc_ac = mpc_controller.get_action(ob)
                else:
                    mpc_ac = random_controller.get_action(ob)

                ac = policy_nn.predict(ob, mpc_ac)

                ac = ac[0]

                if not PG:
                    ac = mpc_ac

                acs.append(ac)
                mpc_acs.append(mpc_ac)

                current_path['observations'].append(ob)

                ob, rew, done, _ = env.step(ac)

                current_path['reward'].append(rew)
                current_path['actions'].append(ac)
                current_path['next_observations'].append(ob)

                return_ += rew
                rewards.append(rew)

                steps += 1
                if done or steps > max_path_length:
                    break


            if MPC:
                # cost & return
                cost = path_cost(cost_fn, current_path)
                costs.append(cost)
                returns.append(return_)
                print("total return: ", return_)
                print("costs: ", cost)

                # add into buffers
                for n in range(len(current_path['observations'])):
                    data_buffer_model.add(current_path['observations'][n], current_path['actions'][n], current_path['next_observations'][n])

            for n in range(len(current_path['observations'])):
                data_buffer_ppo.add(current_path['observations'][n], current_path['actions'][n], current_path['reward'][n], current_path['next_observations'][n])
        
            path = {"observation" : np.array(obs), 
                    "reward" : np.array(rewards), 
                    "action" : np.array(acs),
                    "mpc_action" : np.array(mpc_acs)}



            paths.append(path)
            timesteps_this_batch += pathlength(path)
            # print("timesteps_this_batch", timesteps_this_batch)
            if timesteps_this_batch > min_timesteps_per_batch:
                break
        total_timesteps += timesteps_this_batch


        print("data_buffer_ppo.size:", data_buffer_ppo.size)


        # Build arrays for observation, action for the policy gradient update by concatenating 
        # across paths
        ob_no = np.concatenate([path["observation"] for path in paths])
        ac_na = np.concatenate([path["action"] for path in paths])
        mpc_ac_na = np.concatenate([path["mpc_action"] for path in paths])


        # Computing Q-values
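        # reward_to_go=True : Q(s_t, a_t) = sum_{t' >= t} gamma^(t'-t) * r_{t'}
        # reward_to_go=False: every timestep of a path is credited with the full
        #                     discounted trajectory return sum_{t'} gamma^t' * r_{t'}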
     
        if reward_to_go:
            q_n = []
            for path in paths:
                for t in range(len(path["reward"])):
                    t_ = 0
                    q = 0
                    while t_ < len(path["reward"]):
                        if t_ >= t:
                            q += gamma**(t_-t) * path["reward"][t_]
                        t_ += 1
                    q_n.append(q)
            q_n = np.asarray(q_n)

        else:
            q_n = []
            for path in paths:
                for t in range(len(path["reward"])):
                    t_ = 0
                    q = 0
                    while t_ < len(path["reward"]):
                        q += gamma**t_ * path["reward"][t_]
                        t_ += 1
                    q_n.append(q)
            q_n = np.asarray(q_n)


        # Computing Baselines
        if nn_baseline:

            # b_n = sess.run(baseline_prediction, feed_dict={sy_ob_no :ob_no})
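            # The baseline prediction is normalized and then rescaled to the
            # empirical mean/std of q_n, so that b_n and q_n are on the same
            # scale before the advantages are computed.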
            b_n = value_nn.predict(ob_no)
            b_n = normalize(b_n)
            b_n = denormalize(b_n, np.std(q_n), np.mean(q_n))
            adv_n = q_n - b_n
        else:
            adv_n = q_n.copy()

        # Advantage Normalization
        if normalize_advantages:
            adv_n = normalize(adv_n)

        # Optimizing Neural Network Baseline
        if nn_baseline:
            b_n_target = normalize(q_n)
            value_nn.fit(ob_no, b_n_target)
                # sess.run(baseline_update_op, feed_dict={sy_ob_no :ob_no, sy_baseline_target_n:b_n_target})


        # Performing the Policy Update

        # policy_nn.fit(ob_no, ac_na, adv_n)
        policy_nn.fit(ob_no, ac_na, adv_n, mpc_ac_na)

        # sess.run(update_op, feed_dict={sy_ob_no :ob_no, sy_ac_na:ac_na, sy_adv_n:adv_n})

        # Log diagnostics
        returns = [path["reward"].sum() for path in paths]
        ep_lengths = [pathlength(path) for path in paths]
        logz.log_tabular("Time", time.time() - start)
        logz.log_tabular("Iteration", itr)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", total_timesteps)
        logz.dump_tabular()
        logz.pickle_tf_vars()
Example 5
class Solar_Trainer():
    """
    This class creates a base Trainer module that trains a SOLAR_GP model on
    training input-output pairs. SOLAR_GP models are serialized and sent over a
    custom topic message that is interpreted by the SolarPredictor.
    """
    def __init__(self, njit, degrees, num_inducing, wgen, use_old_Z=False):
        """
        njit: number of initial jittered ponts
        degrees: degree range of jittered joints
        num_inducing: number of inducing points for sparse GP models
        wgen: weigted threshold for generating new models
        """
        self.solar = []
        self.pub_solar = rospy.Publisher('solarGP',
                                         LocalGP,
                                         queue_size=10,
                                         latch=True)
        self.njit = njit
        self.degrees = degrees
        self.num_inducing = num_inducing
        self.wgen = wgen
        self.use_old_Z = use_old_Z
        self.joint_names = []
        self.TrainData = []
        self.rate = []
        self.stop = False
        self.pub_traintime = rospy.Publisher('traintime',
                                             Float64,
                                             queue_size=10)
        self.x_topic = ""
        self.y_topic = ""

        rospy.wait_for_service('set_neutral')
        self.set_neutral = rospy.ServiceProxy('set_neutral', SetNeutral)

        rospy.wait_for_service('jitter')
        self.jitter_init = rospy.ServiceProxy('jitter', Jitter)

    def initialize(self):

        R = rospy.get_param('~train_pub_rate', 100)
        self.rate = rospy.Rate(R)
        self.buffer_duration = rospy.get_param('~buffer_duration', 0.1)
        self.buffer_size = rospy.get_param('~buffer_size', 500)

        # Robot-specific setup implemented by derived class
        self.setup_robot()

        # Jitter robot initially
        XI, YI = self.jitter_robot()
        num_joints = np.size(YI, 1)

        # Create and initialize SOLAR_GP model
        self.solar = LocalModels(self.num_inducing,
                                 wgen=self.wgen,
                                 xdim=3,
                                 ndim=num_joints * 2)
        self.solar.initializeF(XI, YI)

        # Serialize SOLAR_GP model into custom message and publish
        SolarMsg = self.constructMsg(self.solar)
        self.pub_solar.publish(SolarMsg)

        # Create Data buffer listening on training input-output topics
        self.TrainData = DataBuffer(self.x_topic, self.y_topic,
                                    self.joint_names, self.buffer_duration,
                                    self.buffer_size)

    def setup_robot(self):
        print("Setup Robot not implemented")
        return False

    def jitter_robot(self):
        XI = []
        YI = []
        print("Jitter Robot not implemented")

        #        Service based implementation
        #        self.set_neutral()
        #        self.TrainData = DataBuffer(self.x_topic, self.y_topic, self.joint_names, self.buffer_duration, self.buffer_size)
        #        self.jitter_init(self.njit, self.degrees)
        #
        #        XI = np.asarray(self.TrainData.Xexp).reshape(len(self.TrainData.Xexp),3)
        #        YI = np.asarray(self.TrainData.Yexp).reshape(len(self.TrainData.Yexp),len(self.joint_names))
        #        rospy.loginfo("Number of initial points: %s", len(XI))
        #        self.TrainData.clear()

        return XI, YI

    def jitter(self, n, Y_init, deg=5):
        """
        Randomly sample joint states within specified degree range from initial joint position
        """
        max_rough = 0.0174533
        pert = deg * max_rough * np.random.uniform(-1., 1.,
                                                   (n, np.size(Y_init, 1)))
        Y_start = Y_init + pert
        return Y_start

    def constructMsg(self, local):
        """
        Serializes SOLAR_GP object into custom ROS topic msg
        """
        LocMsg = LocalGP()
        L = []
        for count, m in enumerate(local.Models):
            GP = OSGPR_GP()
            GP.kern_var = m.kern.variance[0]
            GP.kern_lengthscale = np.array(m.kern.lengthscale).tolist()
            GP.likelihood_var = m.likelihood.variance[0]
            GP.xmean = local.LocalData[count][2][0].tolist()
            GP.ymean = local.LocalData[count][3][0].tolist()
            GP.numloc = local.LocalData[count][0]
            Z = np.array(m.Z)
            Z_old = np.array(m.Z_old)
            mu_old = np.array(m.mu_old)
            Su_old = np.array(m.Su_old)
            Kaa_old = np.array(m.Kaa_old)

            X_arr = []
            Y_arr = []
            Z_arr = []
            Z_old_arr = []
            mu_old_arr = []
            Su_old_arr = []
            Kaa_old_arr = []

            for j in range(0, np.shape(m.X)[0]):
                X_row = Arrays()
                Y_row = Arrays()
                X_row.array = np.array(m.X[j, :]).tolist()
                Y_row.array = np.array(m.Y[j, :]).tolist()
                X_arr.append(X_row)
                Y_arr.append(Y_row)

            for j in range(0, np.shape(Z)[0]):
                Z_row = Arrays()
                Z_row.array = Z[j, :].tolist()
                Z_arr.append(Z_row)

            for j in range(0, np.shape(Z_old)[0]):

                Z_old_row = Arrays()
                mu_old_row = Arrays()
                Su_old_row = Arrays()
                Kaa_old_row = Arrays()

                Z_old_row.array = Z_old[j, :].tolist()
                mu_old_row.array = mu_old[j, :].tolist()
                Su_old_row.array = Su_old[j, :].tolist()
                Kaa_old_row.array = Kaa_old[j, :].tolist()

                Z_old_arr.append(Z_old_row)
                mu_old_arr.append(mu_old_row)
                Su_old_arr.append(Su_old_row)
                Kaa_old_arr.append(Kaa_old_row)

            GP.X = X_arr
            GP.Y = Y_arr
            GP.Z = Z_arr
            GP.Z_old = Z_old_arr
            GP.mu_old = mu_old_arr
            GP.Su_old = Su_old_arr
            GP.Kaa_old = Kaa_old_arr

            L.append(GP)

        LocMsg.localGPs = L
        LocMsg.W = local.W.diagonal().tolist()
        LocMsg.M = local.M
        LocMsg.xdim = local.xdim
        LocMsg.ndim = local.ndim

        return LocMsg

    def run(self):

        while not rospy.is_shutdown() and not self.stop:
            t1 = time.time()

            # Skip this cycle if the data buffer is empty (sleep so the loop
            # does not busy-wait for new training pairs)
            if not self.TrainData.Xexp:
                self.rate.sleep()
                continue
            else:
                try:
                    # Grab training pairs from buffer
                    Xexp = np.asarray(self.TrainData.Xexp).reshape(
                        len(self.TrainData.Xexp), 3)
                    Y = np.asarray(self.TrainData.Yexp).reshape(
                        len(self.TrainData.Yexp), len(self.joint_names))
                    Yexp = self.solar.encode_ang(Y)
                except Exception:
                    continue

            # Clear buffer
            self.TrainData.clear()
            try:
                # Train drifting model and save trained hyperparameters
                mdrift = self.solar.doOSGPR(Xexp,
                                            Yexp,
                                            self.solar.mdrift,
                                            100,
                                            use_old_Z=True,
                                            driftZ=False)
                mkl = []
                for j in range(0, self.solar.xdim):
                    mkl.append(1 / (mdrift.kern.lengthscale[j]**2))

                W = np.diag(mkl)
                self.solar.W = W
                self.solar.mdrift = mdrift
            except Exception:
                pass

            # Partition training pairs
            self.solar.partition(Xexp.reshape(len(Xexp), self.solar.xdim),
                                 Yexp.reshape(len(Yexp), self.solar.ndim))
            try:
                # Train SOLAR_GP model
                self.solar.train()
            except Exception:
                pass
            # Construct and publish custom SOLAR_GP ROS topic
            LocMsg = self.constructMsg(self.solar)
            self.pub_solar.publish(LocMsg)

            # Publish training time
            t2 = time.time()
            self.pub_traintime.publish(t2 - t1)
            self.rate.sleep()
Example 6
def train(env, 
         cost_fn,
         logdir=None,
         render=False,
         learning_rate=1e-3,
         onpol_iters=10,
         dynamics_iters=60,
         batch_size=512,
         num_paths_random=10, 
         num_paths_onpol=10, 
         num_simulated_paths=10000,
         env_horizon=1000, 
         mpc_horizon=15,
         n_layers=2,
         size=500,
         activation=tf.nn.relu,
         output_activation=None
         ):
    # tracker = SummaryTracker()

    """

    Arguments:

    onpol_iters                 Number of iterations of onpolicy aggregation for the loop to run. 

    dynamics_iters              Number of iterations of training for the dynamics model
    |_                          which happen per iteration of the aggregation loop.

    batch_size                  Batch size for dynamics training.

    num_paths_random            Number of paths/trajectories/rollouts generated 
    |                           by a random agent. We use these to train our 
    |_                          initial dynamics model.
    
    num_paths_onpol             Number of paths to collect at each iteration of
    |_                          aggregation, using the Model Predictive Control policy.

    num_simulated_paths         How many fictitious rollouts the MPC policy
    |                           should generate each time it is asked for an
    |_                          action.

    env_horizon                 Number of timesteps in each path.

    mpc_horizon                 The MPC policy generates actions by imagining 
    |                           fictitious rollouts, and picking the first action
    |                           of the best fictitious rollout. This argument is
    |                           how many timesteps should be in each fictitious
    |_                          rollout.

    n_layers/size/activations   Neural network architecture arguments. 

    """

    logz.configure_output_dir(logdir)

    #========================================================
    # 
    # First, we need a lot of data generated by a random
    # agent, with which we'll begin to train our dynamics
    # model.

    """ YOUR CODE HERE """

    # Print env info
    print("-------- env info --------")
    print("observation_space: ", env.observation_space.shape)
    print("action_space: ", env.action_space.shape)
    print(" ")


    random_controller = RandomController(env)
    data_buffer = DataBuffer()
    bc_data_buffer = DataBuffer_SA(BC_BUFFER_SIZE)
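    # BC_BUFFER_SIZE, BATCH_SIZE_BC, CHECKPOINT_DIR, LOAD_MODEL and
    # BEHAVIORAL_CLONING are assumed to be module-level constants/flags defined
    # elsewhere in this script.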

    # sample path
    print("collecting random data .....  ")
    paths = sample(env, 
               random_controller, 
               num_paths=num_paths_random, 
               horizon=env_horizon, 
               render=False,
               verbose=False)

    # add into buffer
    for path in paths:
        for n in range(len(path['observations'])):
            data_buffer.add(path['observations'][n], path['actions'][n], path['next_observations'][n])



    #========================================================
    # 
    # The random data will be used to get statistics (mean
    # and std) for the observations, actions, and deltas
    # (where deltas are o_{t+1} - o_t). These will be used
    # for normalizing inputs and denormalizing outputs
    # from the dynamics network. 
    # 
    print("data buffer size: ", data_buffer.size)

    normalization = compute_normalization(data_buffer)

    #========================================================
    # 
    # Build dynamics model and MPC controllers and Behavioral cloning network.
    # 
    sess = tf.Session()

    dyn_model = NNDynamicsModel(env=env, 
                                n_layers=n_layers, 
                                size=size, 
                                activation=activation, 
                                output_activation=output_activation, 
                                normalization=normalization,
                                batch_size=batch_size,
                                iterations=dynamics_iters,
                                learning_rate=learning_rate,
                                sess=sess)

    mpc_controller = MPCcontroller(env=env, 
                                   dyn_model=dyn_model, 
                                   horizon=mpc_horizon, 
                                   cost_fn=cost_fn, 
                                   num_simulated_paths=num_simulated_paths)

    bc_net = BCnetwork(sess, env, BATCH_SIZE_BC, learning_rate)

    mpc_controller_bc = MPCcontroller_BC(env=env, 
                                   dyn_model=dyn_model, 
                                   bc_network=bc_net,
                                   horizon=mpc_horizon, 
                                   cost_fn=cost_fn, 
                                   num_simulated_paths=num_simulated_paths)


    #========================================================
    # 
    # Tensorflow session building.
    # 
    sess.__enter__()
    tf.global_variables_initializer().run()

    # init or load checkpoint with saver
    saver = tf.train.Saver()

    checkpoint = tf.train.get_checkpoint_state(CHECKPOINT_DIR)

    if checkpoint and checkpoint.model_checkpoint_path and LOAD_MODEL:
        saver.restore(sess, checkpoint.model_checkpoint_path)
        print("checkpoint loaded:", checkpoint.model_checkpoint_path)
    else:
        print("Could not find old checkpoint")
        if not os.path.exists(CHECKPOINT_DIR):
            os.mkdir(CHECKPOINT_DIR)
    #========================================================
    # 
    # Take multiple iterations of on-policy aggregation: at each iteration,
    # refit the dynamics model to the current dataset, then collect on-policy
    # samples and aggregate them into the dataset.
    # Note: you don't need to use a mixing ratio of new and old data in this
    # assignment, as described in https://arxiv.org/abs/1708.02596
    # 

    for itr in range(onpol_iters):
        """ YOUR CODE HERE """
        print("onpol_iters: ", itr)

        dyn_model.fit(data_buffer)

        saver.save(sess, CHECKPOINT_DIR)

        returns = []
        costs = []

        for w in range(num_paths_onpol):
            print("paths_onpol: ", w, " running.....")
            print("data buffer size: ", data_buffer.size)

            st = env.reset_model()
            path = {'observations': [], 'actions': [], 'next_observations':[]}
            # tracker.print_diff()

            return_ = 0

            for i in range(env_horizon):
                if render:
                    env.render()
                # print("env_horizon: ", i)   

                if BEHAVIORAL_CLONING:
                    if bc_data_buffer.size > 2000:
                        at = mpc_controller_bc.get_action(st)
                    else:
                        at = mpc_controller.get_action(st)
                else:
                    at = mpc_controller.get_action(st)
                    # at = random_controller.get_action(st)

                st_next, env_reward, _, _ = env._step(at)
                path['observations'].append(st)
                path['actions'].append(at)
                path['next_observations'].append(st_next)
                st = st_next
                return_ += env_reward

            # cost & return
            cost = path_cost(cost_fn, path)
            costs.append(cost)
            returns.append(return_)
            print("total return: ", return_)
            print("costs: ", cost)

            # add into buffers
            for n in range(len(path['observations'])):
                data_buffer.add(path['observations'][n], path['actions'][n], path['next_observations'][n])
                bc_data_buffer.add(path['observations'][n], path['actions'][n])

        if BEHAVIORAL_CLONING:
            behavioral_cloning(sess, env, bc_net, mpc_controller, env_horizon, bc_data_buffer, Training_epoch=1000)

        # LOGGING
        # Statistics for performance of MPC policy using
        # our learned dynamics model
        logz.log_tabular('Iteration', itr)
        # logz.log_tabular('Average_BC_Return', np.mean(bc_returns))

        # In terms of cost function which your MPC controller uses to plan
        logz.log_tabular('AverageCost', np.mean(costs))
        logz.log_tabular('StdCost', np.std(costs))
        logz.log_tabular('MinimumCost', np.min(costs))
        logz.log_tabular('MaximumCost', np.max(costs))
        # In terms of true environment reward of your rolled out trajectory using the MPC controller
        logz.log_tabular('AverageReturn', np.mean(returns))
        logz.log_tabular('StdReturn', np.std(returns))
        logz.log_tabular('MinimumReturn', np.min(returns))
        logz.log_tabular('MaximumReturn', np.max(returns))

        logz.dump_tabular()


def train(
    env,
    cost_fn,
    logdir=None,
    render=False,
    learning_rate=1e-3,
    onpol_iters=10,
    dynamics_iters=60,
    batch_size=512,
    num_paths_random=10,
    num_paths_onpol=10,
    num_simulated_paths=10000,
    env_horizon=1000,
    mpc_horizon=15,
    n_layers=2,
    size=500,
    activation=tf.nn.relu,
    output_activation=None,
    clip_param=0.2,
    entcoeff=0.0,
    gamma=0.99,
    lam=0.95,
    optim_epochs=10,
    optim_batchsize=64,
    schedule='linear',
    optim_stepsize=3e-4,
    timesteps_per_actorbatch=1000,
    BEHAVIORAL_CLONING=True,
    PPO=True,
):

    start = time.time()

    logz.configure_output_dir(logdir)

    print("-------- env info --------")
    print("observation_space: ", env.observation_space.shape)
    print("action_space: ", env.action_space.shape)
    print("BEHAVIORAL_CLONING: ", BEHAVIORAL_CLONING)
    print("PPO: ", PPO)

    print(" ")

    random_controller = RandomController(env)
    model_data_buffer = DataBuffer()

    ppo_data_buffer = DataBuffer_general(BC_BUFFER_SIZE, 6)
    bc_data_buffer = DataBuffer_general(BC_BUFFER_SIZE, 2)

    # sample path
    print("collecting random data .....  ")
    paths = sample(env,
                   random_controller,
                   num_paths=num_paths_random,
                   horizon=env_horizon,
                   render=False,
                   verbose=False)

    # add into buffer
    for path in paths:
        for n in range(len(path['observations'])):
            model_data_buffer.add(path['observations'][n], path['actions'][n],
                                  path['next_observations'][n])

    print("model data buffer size: ", model_data_buffer.size)

    normalization = compute_normalization(model_data_buffer)

    #========================================================
    #
    # Build dynamics model and MPC controllers and Behavioral cloning network.
    #
    sess = tf.Session()

    dyn_model = NNDynamicsModel(env=env,
                                n_layers=n_layers,
                                size=size,
                                activation=activation,
                                output_activation=output_activation,
                                normalization=normalization,
                                batch_size=batch_size,
                                iterations=dynamics_iters,
                                learning_rate=learning_rate,
                                sess=sess)

    mpc_controller = MPCcontroller(env=env,
                                   dyn_model=dyn_model,
                                   horizon=mpc_horizon,
                                   cost_fn=cost_fn,
                                   num_simulated_paths=num_simulated_paths)

    policy_nn = MlpPolicy_bc(sess=sess,
                             env=env,
                             hid_size=64,
                             num_hid_layers=2,
                             clip_param=clip_param,
                             entcoeff=entcoeff)

    bc_net = BCnetwork(sess, env, BATCH_SIZE_BC, learning_rate)

    mpc_controller_bc_ppo = MPCcontroller_BC_PPO(
        env=env,
        dyn_model=dyn_model,
        bc_ppo_network=policy_nn,
        horizon=mpc_horizon,
        cost_fn=cost_fn,
        num_simulated_paths=num_simulated_paths)

    #========================================================
    #
    # Tensorflow session building.
    #
    sess.__enter__()
    tf.global_variables_initializer().run()

    # init or load checkpoint with saver
    saver = tf.train.Saver()

    checkpoint = tf.train.get_checkpoint_state(CHECKPOINT_DIR)

    if checkpoint and checkpoint.model_checkpoint_path and LOAD_MODEL:
        saver.restore(sess, checkpoint.model_checkpoint_path)
        print("checkpoint loaded:", checkpoint.model_checkpoint_path)
    else:
        print("Could not find old checkpoint")
        if not os.path.exists(CHECKPOINT_DIR):
            os.mkdir(CHECKPOINT_DIR)

    #========================================================
    #
    # Prepare for rollouts
    #

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=100)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=100)  # rolling buffer for episode rewards
    max_timesteps = num_paths_onpol * env_horizon

    for itr in range(onpol_iters):

        print("onpol_iters: ", itr)
        dyn_model.fit(model_data_buffer)

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0)

        # saver.save(sess, CHECKPOINT_DIR)
        behavioral_cloning_eval(sess, env, policy_nn, env_horizon)

        ppo_data_buffer.clear()
        seg = traj_segment_generator(policy_nn, mpc_controller,
                                     mpc_controller_bc_ppo, bc_data_buffer,
                                     env, env_horizon)
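        # add_vtarg_and_adv presumably computes GAE(gamma, lam) advantage
        # estimates ("adv") and TD(lambda) value targets ("tdlamret") and stores
        # them in seg.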
        add_vtarg_and_adv(seg, gamma, lam)

        ob, ac, rew, nxt_ob, atarg, tdlamret = (seg["ob"], seg["ac"],
                                                seg["rew"], seg["nxt_ob"],
                                                seg["adv"], seg["tdlamret"])
        vpredbefore = seg["vpred"]  # predicted value function before update
        atarg = (atarg - atarg.mean()) / atarg.std()  # standardized advantage estimate

        for n in range(len(ob)):
            ppo_data_buffer.add(
                (ob[n], ac[n], rew[n], nxt_ob[n], atarg[n], tdlamret[n]))
            bc_data_buffer.add((ob[n], ac[n]))
            model_data_buffer.add(ob[n], ac[n], nxt_ob[n])

        print("ppo_data_buffer size", ppo_data_buffer.size)
        print("bc_data_buffer size", bc_data_buffer.size)
        print("model data buffer size: ", model_data_buffer.size)

        # optim_batchsize = optim_batchsize or ob.shape[0]

        # behavioral_cloning(sess, env, bc_net, mpc_controller, env_horizon, bc_data_buffer, Training_epoch=1000)

        if hasattr(policy_nn, "ob_rms"):
            policy_nn.ob_rms.update(ob)  # update running mean/std for policy
        policy_nn.assign_old_eq_new()  # set old parameter values to new parameter values

        for op_ep in range(optim_epochs):
            # losses = [] # list of tuples, each of which gives the loss for a minibatch
            # for i in range(int(timesteps_per_actorbatch/optim_batchsize)):

            if PPO:
                sample_ob_no, sample_ac_na, sample_rew, sample_nxt_ob_no, sample_adv_n, sample_b_n_target = ppo_data_buffer.sample(
                    optim_batchsize)
                newlosses = policy_nn.lossandupdate_ppo(
                    sample_ob_no, sample_ac_na, sample_adv_n,
                    sample_b_n_target, cur_lrmult, optim_stepsize * cur_lrmult)
                # losses.append(newlosses)

            if BEHAVIORAL_CLONING:
                sample_ob_no, sample_ac_na = bc_data_buffer.sample(
                    optim_batchsize)
                policy_nn.update_bc(sample_ob_no, sample_ac_na,
                                    optim_stepsize * cur_lrmult)

            if op_ep % 100 == 0:
                print('epoch: ', op_ep)
                behavioral_cloning_eval(sess, env, policy_nn, env_horizon)

        lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values

        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1

        ep_lengths = seg["ep_lens"]
        returns = seg["ep_rets"]

        logz.log_tabular("Time", time.time() - start)
        logz.log_tabular("Iteration", iters_so_far)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        # logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", timesteps_so_far)
        logz.dump_tabular()
        logz.pickle_tf_vars()