# Imports needed by this snippet (DataBuffer is a project-specific class and is
# assumed to be defined/imported elsewhere in this module).
import queue

import tensorflow as tf
from keras import backend as K
from keras.models import load_model, model_from_json


def customLoss(y_true, y_pred):
    return K.mean(K.square(y_pred - y_true), axis=-1)


with open('model.json', 'r') as jfile:
    model = model_from_json(jfile.read())

model.compile("adam", "mse")
weights_file = 'weights.0209-0.046.hdf5'
model.load_weights(weights_file)
# model = load_model("multiModel.h5", custom_objects={'customLoss': customLoss})

graph = tf.get_default_graph()

data_buffer = DataBuffer()
res_queue = queue.Queue(maxsize=1)

#
# idxs = [0, 1, 2]
# means = [-122.33790211, 39.53881540, 62.68238949]
# stds = [0.00099555, 0.00180817, 13.48539298]
# def normalize_vector(xVec):
#     for i, mean, std in zip(idxs, means, stds):
#         xVec[i] -= mean
#         xVec[i] /= std
#     return xVec


def copyImage(byte_array, imageSize):
    if imageSize > 8:
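# The default graph is captured above so that model.predict can be called later
# from another thread (e.g. an image callback) where Keras would otherwise not
# find the graph. A minimal sketch of that usage pattern follows; the function
# name predict_steering and the single-output shape are illustrative
# assumptions, not part of the original code.
import numpy as np


def predict_steering(image):
    # image: an HxWxC array already preprocessed for the network
    with graph.as_default():
        prediction = model.predict(image[np.newaxis, ...])
    return float(prediction[0])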
# topic /sensors/odom
from nav_msgs.msg import Odometry
# topic /darknet_ros/bounding_boxes
from darknet_ros_msgs.msg import BoundingBoxes
# topic /scan
from sensor_msgs.msg import LaserScan
# scan.ranges[179] = distance between the robot and an object

object_searched = [
    "stop sign",
    "backpack",
    "refrigerator",
    "motorbike",
    "pottedplant",
    "suitcase",
    "teddy bear",
]

odom_bag = DataBuffer(maxlen=1000)
bounding_boxes_bag = DataBuffer()
scan_bag = DataBuffer(maxlen=1000)

objects = []
previous_robot_position = None
previous_object = None


def _process_bounding_box(bounding_box):
    global previous_robot_position, previous_object

    current_robot_position = odom_bag.get_closest_to(bounding_box.image_header.stamp)
    distance_from_object = scan_bag.get_closest_to(bounding_box.image_header.stamp)

    # TODO: check whether this is the same object or a different one
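# The DataBuffer.get_closest_to interface used above is project-specific and
# not shown here. A minimal sketch of such a timestamp-matched buffer, assuming
# each stored message carries a header.stamp (hypothetical helper, not the
# project's actual implementation):
from collections import deque


class TimestampedBuffer(object):
    def __init__(self, maxlen=1000):
        self._items = deque(maxlen=maxlen)

    def add(self, msg):
        # Assumes msg.header.stamp is a rospy.Time
        self._items.append(msg)

    def get_closest_to(self, stamp):
        # Return the buffered message whose stamp is nearest to `stamp`
        if not self._items:
            return None
        return min(self._items,
                   key=lambda m: abs((m.header.stamp - stamp).to_sec()))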
def train_PG(exp_name='',
             env_name='',
             n_iter=100,
             gamma=1.0,
             min_timesteps_per_batch=1000,
             max_path_length=None,
             learning_rate=5e-3,
             reward_to_go=False,
             animate=True,
             logdir=None,
             normalize_advantages=False,
             nn_baseline=False,
             seed=0,
             # network arguments
             n_layers=1,
             size=32,
             # mb mpc arguments
             model_learning_rate=1e-3,
             onpol_iters=10,
             dynamics_iters=260,
             batch_size=512,
             num_paths_random=10,
             num_paths_onpol=10,
             num_simulated_paths=1000,
             env_horizon=1000,
             mpc_horizon=10,
             m_n_layers=2,
             m_size=500,
             ):

    start = time.time()

    # Configure output directory for logging
    logz.configure_output_dir(logdir)

    # Log experimental parameters
    args = inspect.getargspec(train_PG)[0]
    locals_ = locals()
    params = {k: locals_[k] if k in locals_ else None for k in args}
    logz.save_params(params)

    # Set random seeds
    tf.set_random_seed(seed)
    np.random.seed(seed)

    # Make the gym environment
    # env = gym.make(env_name)
    env = HalfCheetahEnvNew()
    cost_fn = cheetah_cost_fn
    activation = tf.nn.relu
    output_activation = None

    # Is this env continuous, or discrete?
    discrete = isinstance(env.action_space, gym.spaces.Discrete)

    # Maximum length for episodes
    # max_path_length = max_path_length or env.spec.max_episode_steps
    max_path_length = max_path_length

    # Observation and action sizes
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.n if discrete else env.action_space.shape[0]

    # Print environment information
    print("-------- env info --------")
    print("Environment name: ", env_name)
    print("Action space is discrete: ", discrete)
    print("Action space dim: ", ac_dim)
    print("Observation space dim: ", ob_dim)
    print("Max_path_length ", max_path_length)

    #========================================================================================#
    # Random data collection
    #========================================================================================#

    random_controller = RandomController(env)
    data_buffer_model = DataBuffer()
    data_buffer_ppo = DataBuffer_general(10000, 4)

    # sample path
    print("collecting random data .....")
    paths = sample(env,
                   random_controller,
                   num_paths=num_paths_random,
                   horizon=env_horizon,
                   render=False,
                   verbose=False)

    # add into buffer
    for path in paths:
        for n in range(len(path['observations'])):
            data_buffer_model.add(path['observations'][n],
                                  path['actions'][n],
                                  path['next_observations'][n])

    print("data buffer size: ", data_buffer_model.size)

    normalization = compute_normalization(data_buffer_model)

    #========================================================================================#
    # Tensorflow Engineering: Config, Session, Variable initialization
    #========================================================================================#

    tf_config = tf.ConfigProto()
    tf_config.allow_soft_placement = True
    tf_config.intra_op_parallelism_threads = 4
    tf_config.inter_op_parallelism_threads = 1
    sess = tf.Session(config=tf_config)

    dyn_model = NNDynamicsModel(env=env,
                                n_layers=n_layers,
                                size=size,
                                activation=activation,
                                output_activation=output_activation,
                                normalization=normalization,
                                batch_size=batch_size,
                                iterations=dynamics_iters,
                                learning_rate=learning_rate,
                                sess=sess)

    mpc_controller = MPCcontroller(env=env,
                                   dyn_model=dyn_model,
                                   horizon=mpc_horizon,
                                   cost_fn=cost_fn,
                                   num_simulated_paths=num_simulated_paths)

    policy_nn = policy_network_ppo(sess, ob_dim, ac_dim, discrete, n_layers, size, learning_rate)

    if nn_baseline:
        value_nn = value_network(sess, ob_dim, n_layers, size, learning_rate)

    sess.__enter__()  # equivalent to `with sess:`
    tf.global_variables_initializer().run()

    #========================================================================================#
    # Training Loop
    #========================================================================================#

    total_timesteps = 0

    for itr in range(n_iter):
        print("********** Iteration %i ************" % itr)

        if MPC:
            dyn_model.fit(data_buffer_model)

        returns = []
        costs = []

        # Collect paths until we have enough timesteps
        timesteps_this_batch = 0
        paths = []

        while True:
            # print("data buffer size: ", data_buffer_model.size)
            current_path = {'observations': [],
                            'actions': [],
                            'reward': [],
                            'next_observations': []}

            ob = env.reset()
            obs, acs, mpc_acs, rewards = [], [], [], []
            animate_this_episode = (len(paths) == 0 and (itr % 10 == 0) and animate)
            steps = 0
            return_ = 0

            while True:
                # print("steps ", steps)
                if animate_this_episode:
                    env.render()
                    time.sleep(0.05)

                obs.append(ob)

                if MPC:
                    mpc_ac = mpc_controller.get_action(ob)
                else:
                    mpc_ac = random_controller.get_action(ob)

                ac = policy_nn.predict(ob, mpc_ac)
                ac = ac[0]

                if not PG:
                    ac = mpc_ac

                acs.append(ac)
                mpc_acs.append(mpc_ac)

                current_path['observations'].append(ob)

                ob, rew, done, _ = env.step(ac)

                current_path['reward'].append(rew)
                current_path['actions'].append(ac)
                current_path['next_observations'].append(ob)

                return_ += rew
                rewards.append(rew)
                steps += 1
                if done or steps > max_path_length:
                    break

            if MPC:
                # cost & return
                cost = path_cost(cost_fn, current_path)
                costs.append(cost)
                returns.append(return_)
                print("total return: ", return_)
                print("costs: ", cost)

                # add into buffers
                for n in range(len(current_path['observations'])):
                    data_buffer_model.add(current_path['observations'][n],
                                          current_path['actions'][n],
                                          current_path['next_observations'][n])

            for n in range(len(current_path['observations'])):
                data_buffer_ppo.add(current_path['observations'][n],
                                    current_path['actions'][n],
                                    current_path['reward'][n],
                                    current_path['next_observations'][n])

            path = {"observation": np.array(obs),
                    "reward": np.array(rewards),
                    "action": np.array(acs),
                    "mpc_action": np.array(mpc_acs)}

            paths.append(path)
            timesteps_this_batch += pathlength(path)
            # print("timesteps_this_batch", timesteps_this_batch)
            if timesteps_this_batch > min_timesteps_per_batch:
                break

        total_timesteps += timesteps_this_batch

        print("data_buffer_ppo.size:", data_buffer_ppo.size)

        # Build arrays for observation, action for the policy gradient update
        # by concatenating across paths
        ob_no = np.concatenate([path["observation"] for path in paths])
        ac_na = np.concatenate([path["action"] for path in paths])
        mpc_ac_na = np.concatenate([path["mpc_action"] for path in paths])

        # Computing Q-values
        if reward_to_go:
            q_n = []
            for path in paths:
                for t in range(len(path["reward"])):
                    t_ = 0
                    q = 0
                    while t_ < len(path["reward"]):
                        if t_ >= t:
                            q += gamma**(t_ - t) * path["reward"][t_]
                        t_ += 1
                    q_n.append(q)
            q_n = np.asarray(q_n)
        else:
            q_n = []
            for path in paths:
                for t in range(len(path["reward"])):
                    t_ = 0
                    q = 0
                    while t_ < len(path["reward"]):
                        q += gamma**t_ * path["reward"][t_]
                        t_ += 1
                    q_n.append(q)
            q_n = np.asarray(q_n)

        # Computing Baselines
        if nn_baseline:
            # b_n = sess.run(baseline_prediction, feed_dict={sy_ob_no: ob_no})
            b_n = value_nn.predict(ob_no)
            b_n = normalize(b_n)
            b_n = denormalize(b_n, np.std(q_n), np.mean(q_n))
            adv_n = q_n - b_n
        else:
            adv_n = q_n.copy()

        # Advantage Normalization
        if normalize_advantages:
            adv_n = normalize(adv_n)

        # Optimizing Neural Network Baseline
        if nn_baseline:
            b_n_target = normalize(q_n)
            value_nn.fit(ob_no, b_n_target)
            # sess.run(baseline_update_op, feed_dict={sy_ob_no: ob_no, sy_baseline_target_n: b_n_target})

        # Performing the Policy Update
        # policy_nn.fit(ob_no, ac_na, adv_n)
        policy_nn.fit(ob_no, ac_na, adv_n, mpc_ac_na)
        # sess.run(update_op, feed_dict={sy_ob_no: ob_no, sy_ac_na: ac_na, sy_adv_n: adv_n})

        # Log diagnostics
        returns = [path["reward"].sum() for path in paths]
        ep_lengths = [pathlength(path) for path in paths]
        logz.log_tabular("Time", time.time() - start)
        logz.log_tabular("Iteration", itr)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", total_timesteps)
        logz.dump_tabular()
        logz.pickle_tf_vars()
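# The nested loops in the Q-value computation above recompute the discounted
# sum from scratch for every timestep. An equivalent O(T) formulation of the
# reward-to-go estimator, written as a standalone sketch (not part of the
# original training code):
import numpy as np


def discounted_rewards_to_go(rewards, gamma):
    """Return q[t] = sum_{t' >= t} gamma**(t' - t) * r[t'] for one path."""
    q = np.zeros(len(rewards))
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        q[t] = running
    return q


# Example: discounted_rewards_to_go([1.0, 1.0, 1.0], gamma=0.5) -> [1.75, 1.5, 1.0]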
class Solar_Trainer():
    """
    This class creates a base Trainer module to train a SOLAR_GP model on
    training input-output pairs. SOLAR_GP models are serialized and sent
    across a custom topic message interpreted by the SolarPredictor.
    """

    def __init__(self, njit, degrees, num_inducing, wgen, use_old_Z=False):
        """
        njit: number of initial jittered points
        degrees: degree range of jittered joints
        num_inducing: number of inducing points for sparse GP models
        wgen: weighted threshold for generating new models
        """
        self.solar = []
        self.pub_solar = rospy.Publisher('solarGP', LocalGP, queue_size=10, latch=True)
        self.njit = njit
        self.degrees = degrees
        self.num_inducing = num_inducing
        self.wgen = wgen
        self.use_old_Z = use_old_Z
        self.joint_names = []
        self.TrainData = []
        self.rate = []
        self.stop = False
        self.pub_traintime = rospy.Publisher('traintime', Float64, queue_size=10)
        self.x_topic = ""
        self.y_topic = ""

        rospy.wait_for_service('set_neutral')
        self.set_neutral = rospy.ServiceProxy('set_neutral', SetNeutral)
        rospy.wait_for_service('jitter')
        self.jitter_init = rospy.ServiceProxy('jitter', Jitter)

    def initialize(self):
        R = rospy.get_param('~train_pub_rate', 100)
        self.rate = rospy.Rate(R)
        self.buffer_duration = rospy.get_param('~buffer_duration', 0.1)
        self.buffer_size = rospy.get_param('~buffer_size', 500)

        # Robot-specific setup implemented by derived class
        self.setup_robot()

        # Jitter robot initially
        XI, YI = self.jitter_robot()
        num_joints = np.size(YI, 1)

        # Create and initialize SOLAR_GP model
        self.solar = LocalModels(self.num_inducing, wgen=self.wgen, xdim=3, ndim=num_joints * 2)
        self.solar.initializeF(XI, YI)

        # Serialize SOLAR_GP model into custom message and publish
        SolarMsg = self.constructMsg(self.solar)
        self.pub_solar.publish(SolarMsg)

        # Create data buffer listening on training input-output topics
        self.TrainData = DataBuffer(self.x_topic, self.y_topic, self.joint_names,
                                    self.buffer_duration, self.buffer_size)

    def setup_robot(self):
        print("Setup Robot not implemented")
        return False

    def jitter_robot(self):
        XI = []
        YI = []
        print("Jitter Robot not implemented")

        # Service based implementation
        # self.set_neutral()
        # self.TrainData = DataBuffer(self.x_topic, self.y_topic, self.joint_names,
        #                             self.buffer_duration, self.buffer_size)
        # self.jitter_init(self.njit, self.degrees)
        #
        # XI = np.asarray(self.TrainData.Xexp).reshape(len(self.TrainData.Xexp), 3)
        # YI = np.asarray(self.TrainData.Yexp).reshape(len(self.TrainData.Yexp), len(self.joint_names))
        # rospy.loginfo("Number of initial points: %s", len(XI))
        # self.TrainData.clear()

        return XI, YI

    def jitter(self, n, Y_init, deg=5):
        """
        Randomly sample joint states within specified degree range from the
        initial joint position.
        """
        max_rough = 0.0174533
        pert = deg * max_rough * np.random.uniform(-1., 1., (n, np.size(Y_init, 1)))
        Y_start = Y_init + pert
        return Y_start

    def constructMsg(self, local):
        """
        Serializes a SOLAR_GP object into a custom ROS topic msg.
        """
        LocMsg = LocalGP()
        L = []
        for count, m in enumerate(local.Models):
            GP = OSGPR_GP()
            GP.kern_var = m.kern.variance[0]
            GP.kern_lengthscale = np.array(m.kern.lengthscale).tolist()
            GP.likelihood_var = m.likelihood.variance[0]
            GP.xmean = local.LocalData[count][2][0].tolist()
            GP.ymean = local.LocalData[count][3][0].tolist()
            GP.numloc = local.LocalData[count][0]
            Z = np.array(m.Z)
            Z_old = np.array(m.Z_old)
            mu_old = np.array(m.mu_old)
            Su_old = np.array(m.Su_old)
            Kaa_old = np.array(m.Kaa_old)

            X_arr = []
            Y_arr = []
            Z_arr = []
            Z_old_arr = []
            mu_old_arr = []
            Su_old_arr = []
            Kaa_old_arr = []

            for j in range(0, np.shape(m.X)[0]):
                X_row = Arrays()
                Y_row = Arrays()
                X_row.array = np.array(m.X[j, :]).tolist()
                Y_row.array = np.array(m.Y[j, :]).tolist()
                X_arr.append(X_row)
                Y_arr.append(Y_row)

            for j in range(0, np.shape(Z)[0]):
                Z_row = Arrays()
                Z_row.array = Z[j, :].tolist()
                Z_arr.append(Z_row)

            for j in range(0, np.shape(Z_old)[0]):
                Z_old_row = Arrays()
                mu_old_row = Arrays()
                Su_old_row = Arrays()
                Kaa_old_row = Arrays()
                Z_old_row.array = Z_old[j, :].tolist()
                mu_old_row.array = mu_old[j, :].tolist()
                Su_old_row.array = Su_old[j, :].tolist()
                Kaa_old_row.array = Kaa_old[j, :].tolist()
                Z_old_arr.append(Z_old_row)
                mu_old_arr.append(mu_old_row)
                Su_old_arr.append(Su_old_row)
                Kaa_old_arr.append(Kaa_old_row)

            GP.X = X_arr
            GP.Y = Y_arr
            GP.Z = Z_arr
            GP.Z_old = Z_old_arr
            GP.mu_old = mu_old_arr
            GP.Su_old = Su_old_arr
            GP.Kaa_old = Kaa_old_arr

            L.append(GP)

        LocMsg.localGPs = L
        LocMsg.W = local.W.diagonal().tolist()
        LocMsg.M = local.M
        LocMsg.xdim = local.xdim
        LocMsg.ndim = local.ndim

        return LocMsg

    def run(self):
        while not rospy.is_shutdown() and not self.stop:
            t1 = time.time()

            # Skip training if data buffer is empty
            if not self.TrainData.Xexp:
                # Sleep before retrying to avoid busy-waiting while no
                # training data has arrived
                self.rate.sleep()
                continue
            else:
                try:
                    # Grab training pairs from buffer
                    Xexp = np.asarray(self.TrainData.Xexp).reshape(len(self.TrainData.Xexp), 3)
                    Y = np.asarray(self.TrainData.Yexp).reshape(len(self.TrainData.Yexp),
                                                                len(self.joint_names))
                    Yexp = self.solar.encode_ang(Y)
                except:
                    continue

            # Clear buffer
            self.TrainData.clear()

            try:
                # Train drifting model and save trained hyperparameters
                mdrift = self.solar.doOSGPR(Xexp, Yexp, self.solar.mdrift, 100,
                                            use_old_Z=True, driftZ=False)
                mkl = []
                for j in range(0, self.solar.xdim):
                    mkl.append(1 / (mdrift.kern.lengthscale[j]**2))
                W = np.diag(mkl)
                self.solar.W = W
                self.solar.mdrift = mdrift
            except:
                pass

            # Partition training pairs
            self.solar.partition(Xexp.reshape(len(Xexp), self.solar.xdim),
                                 Yexp.reshape(len(Yexp), self.solar.ndim))
            try:
                # Train SOLAR_GP model
                self.solar.train()
            except:
                pass

            # Construct and publish custom SOLAR_GP ROS topic
            LocMsg = self.constructMsg(self.solar)
            self.pub_solar.publish(LocMsg)

            # Publish training time
            t2 = time.time()
            self.pub_traintime.publish(t2 - t1)
            self.rate.sleep()
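# Usage sketch: a derived trainer is expected to implement setup_robot() and
# jitter_robot() for a specific robot. The class name, topics, joint names, and
# constructor arguments below are hypothetical placeholders, not part of the
# original code.
class MyRobotTrainer(Solar_Trainer):
    def setup_robot(self):
        # Point the trainer at this robot's training topics and joints
        self.x_topic = 'robot/end_effector_position'  # hypothetical topic
        self.y_topic = 'robot/joint_states'           # hypothetical topic
        self.joint_names = ['joint_1', 'joint_2', 'joint_3']
        return True

    def jitter_robot(self):
        # Collect initial (XI, YI) training pairs by jittering around the start
        # pose, e.g. via the set_neutral/jitter services sketched (commented
        # out) in the base class above.
        XI, YI = [], []
        return XI, YI


# trainer = MyRobotTrainer(njit=25, degrees=5, num_inducing=25, wgen=0.9)
# trainer.initialize()
# trainer.run()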
def train(env,
          cost_fn,
          logdir=None,
          render=False,
          learning_rate=1e-3,
          onpol_iters=10,
          dynamics_iters=60,
          batch_size=512,
          num_paths_random=10,
          num_paths_onpol=10,
          num_simulated_paths=10000,
          env_horizon=1000,
          mpc_horizon=15,
          n_layers=2,
          size=500,
          activation=tf.nn.relu,
          output_activation=None
          ):
    # tracker = SummaryTracker()
    """
    Arguments:

    onpol_iters                 Number of iterations of onpolicy aggregation
                                for the loop to run.

    dynamics_iters              Number of iterations of training for the
                                dynamics model which happen per iteration of
                                the aggregation loop.

    batch_size                  Batch size for dynamics training.

    num_paths_random            Number of paths/trajectories/rollouts generated
                                by a random agent. We use these to train our
                                initial dynamics model.

    num_paths_onpol             Number of paths to collect at each iteration of
                                aggregation, using the Model Predictive Control
                                policy.

    num_simulated_paths         How many fictitious rollouts the MPC policy
                                should generate each time it is asked for an
                                action.

    env_horizon                 Number of timesteps in each path.

    mpc_horizon                 The MPC policy generates actions by imagining
                                fictitious rollouts, and picking the first
                                action of the best fictitious rollout. This
                                argument is how many timesteps should be in
                                each fictitious rollout.

    n_layers/size/activations   Neural network architecture arguments.
    """

    logz.configure_output_dir(logdir)

    #========================================================
    #
    # First, we need a lot of data generated by a random
    # agent, with which we'll begin to train our dynamics
    # model.

    """ YOUR CODE HERE """

    # Print env info
    print("-------- env info --------")
    print("observation_space: ", env.observation_space.shape)
    print("action_space: ", env.action_space.shape)
    print(" ")

    random_controller = RandomController(env)
    data_buffer = DataBuffer()
    bc_data_buffer = DataBuffer_SA(BC_BUFFER_SIZE)

    # sample path
    print("collecting random data .....")
    paths = sample(env,
                   random_controller,
                   num_paths=num_paths_random,
                   horizon=env_horizon,
                   render=False,
                   verbose=False)

    # add into buffer
    for path in paths:
        for n in range(len(path['observations'])):
            data_buffer.add(path['observations'][n],
                            path['actions'][n],
                            path['next_observations'][n])

    #========================================================
    #
    # The random data will be used to get statistics (mean
    # and std) for the observations, actions, and deltas
    # (where deltas are o_{t+1} - o_t). These will be used
    # for normalizing inputs and denormalizing outputs
    # from the dynamics network.
    #
    print("data buffer size: ", data_buffer.size)

    normalization = compute_normalization(data_buffer)

    #========================================================
    #
    # Build dynamics model, MPC controllers, and behavioral cloning network.
    #
    sess = tf.Session()

    dyn_model = NNDynamicsModel(env=env,
                                n_layers=n_layers,
                                size=size,
                                activation=activation,
                                output_activation=output_activation,
                                normalization=normalization,
                                batch_size=batch_size,
                                iterations=dynamics_iters,
                                learning_rate=learning_rate,
                                sess=sess)

    mpc_controller = MPCcontroller(env=env,
                                   dyn_model=dyn_model,
                                   horizon=mpc_horizon,
                                   cost_fn=cost_fn,
                                   num_simulated_paths=num_simulated_paths)

    bc_net = BCnetwork(sess, env, BATCH_SIZE_BC, learning_rate)

    mpc_controller_bc = MPCcontroller_BC(env=env,
                                         dyn_model=dyn_model,
                                         bc_network=bc_net,
                                         horizon=mpc_horizon,
                                         cost_fn=cost_fn,
                                         num_simulated_paths=num_simulated_paths)

    #========================================================
    #
    # Tensorflow session building.
    #
    sess.__enter__()
    tf.global_variables_initializer().run()

    # init or load checkpoint with saver
    saver = tf.train.Saver()
    checkpoint = tf.train.get_checkpoint_state(CHECKPOINT_DIR)

    if checkpoint and checkpoint.model_checkpoint_path and LOAD_MODEL:
        saver.restore(sess, checkpoint.model_checkpoint_path)
        print("checkpoint loaded:", checkpoint.model_checkpoint_path)
    else:
        print("Could not find old checkpoint")
        if not os.path.exists(CHECKPOINT_DIR):
            os.mkdir(CHECKPOINT_DIR)

    #========================================================
    #
    # Take multiple iterations of onpolicy aggregation: at each iteration,
    # refit the dynamics model to the current dataset, then take onpolicy
    # samples and aggregate them into the dataset.
    #
    # Note: You don't need to use a mixing ratio in this assignment for new
    # and old data as described in https://arxiv.org/abs/1708.02596
    #
    for itr in range(onpol_iters):
        """ YOUR CODE HERE """
        print("onpol_iters: ", itr)

        dyn_model.fit(data_buffer)

        saver.save(sess, CHECKPOINT_DIR)

        returns = []
        costs = []

        for w in range(num_paths_onpol):
            print("paths_onpol: ", w, " running.....")
            print("data buffer size: ", data_buffer.size)

            st = env.reset_model()
            path = {'observations': [], 'actions': [], 'next_observations': []}
            # tracker.print_diff()
            return_ = 0

            for i in range(env_horizon):
                if render:
                    env.render()
                # print("env_horizon: ", i)

                if BEHAVIORAL_CLONING:
                    if bc_data_buffer.size > 2000:
                        at = mpc_controller_bc.get_action(st)
                    else:
                        at = mpc_controller.get_action(st)
                else:
                    at = mpc_controller.get_action(st)
                    # at = random_controller.get_action(st)

                st_next, env_reward, _, _ = env._step(at)
                path['observations'].append(st)
                path['actions'].append(at)
                path['next_observations'].append(st_next)
                st = st_next
                return_ += env_reward

            # cost & return
            cost = path_cost(cost_fn, path)
            costs.append(cost)
            returns.append(return_)
            print("total return: ", return_)
            print("costs: ", cost)

            # add into buffers
            for n in range(len(path['observations'])):
                data_buffer.add(path['observations'][n],
                                path['actions'][n],
                                path['next_observations'][n])
                bc_data_buffer.add(path['observations'][n], path['actions'][n])

        if BEHAVIORAL_CLONING:
            behavioral_cloning(sess, env, bc_net, mpc_controller, env_horizon,
                               bc_data_buffer, Training_epoch=1000)

        # LOGGING
        # Statistics for performance of MPC policy using our learned dynamics model
        logz.log_tabular('Iteration', itr)
        # logz.log_tabular('Average_BC_Return', np.mean(bc_returns))

        # In terms of the cost function which your MPC controller uses to plan
        logz.log_tabular('AverageCost', np.mean(costs))
        logz.log_tabular('StdCost', np.std(costs))
        logz.log_tabular('MinimumCost', np.min(costs))
        logz.log_tabular('MaximumCost', np.max(costs))

        # In terms of true environment reward of your rolled out trajectory using the MPC controller
        logz.log_tabular('AverageReturn', np.mean(returns))
        logz.log_tabular('StdReturn', np.std(returns))
        logz.log_tabular('MinimumReturn', np.min(returns))
        logz.log_tabular('MaximumReturn', np.max(returns))

        logz.dump_tabular()
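# The comment block above describes what compute_normalization provides:
# per-dimension mean and std for observations, actions, and deltas
# (o_{t+1} - o_t), used to normalize dynamics-network inputs and denormalize
# its outputs. A minimal standalone sketch of that computation, assuming plain
# arrays of observations, actions, and next observations (the argument names
# are illustrative, not necessarily the project's DataBuffer attributes):
import numpy as np


def compute_normalization_sketch(observations, actions, next_observations, eps=1e-8):
    deltas = next_observations - observations
    stats = {}
    for name, arr in (('obs', observations), ('acs', actions), ('deltas', deltas)):
        stats[name] = (np.mean(arr, axis=0), np.std(arr, axis=0) + eps)
    return stats


# Normalizing an input batch would then look like:
#   obs_norm = (obs - stats['obs'][0]) / stats['obs'][1]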
def train(env,
          cost_fn,
          logdir=None,
          render=False,
          learning_rate=1e-3,
          onpol_iters=10,
          dynamics_iters=60,
          batch_size=512,
          num_paths_random=10,
          num_paths_onpol=10,
          num_simulated_paths=10000,
          env_horizon=1000,
          mpc_horizon=15,
          n_layers=2,
          size=500,
          activation=tf.nn.relu,
          output_activation=None,
          clip_param=0.2,
          entcoeff=0.0,
          gamma=0.99,
          lam=0.95,
          optim_epochs=10,
          optim_batchsize=64,
          schedule='linear',
          optim_stepsize=3e-4,
          timesteps_per_actorbatch=1000,
          BEHAVIORAL_CLONING=True,
          PPO=True,
          ):

    start = time.time()

    logz.configure_output_dir(logdir)

    print("-------- env info --------")
    print("observation_space: ", env.observation_space.shape)
    print("action_space: ", env.action_space.shape)
    print("BEHAVIORAL_CLONING: ", BEHAVIORAL_CLONING)
    print("PPO: ", PPO)
    print(" ")

    random_controller = RandomController(env)
    model_data_buffer = DataBuffer()
    ppo_data_buffer = DataBuffer_general(BC_BUFFER_SIZE, 6)
    bc_data_buffer = DataBuffer_general(BC_BUFFER_SIZE, 2)

    # sample path
    print("collecting random data .....")
    paths = sample(env,
                   random_controller,
                   num_paths=num_paths_random,
                   horizon=env_horizon,
                   render=False,
                   verbose=False)

    # add into buffer
    for path in paths:
        for n in range(len(path['observations'])):
            model_data_buffer.add(path['observations'][n],
                                  path['actions'][n],
                                  path['next_observations'][n])

    print("model data buffer size: ", model_data_buffer.size)

    normalization = compute_normalization(model_data_buffer)

    #========================================================
    #
    # Build dynamics model, MPC controllers, and behavioral cloning network.
    #
    sess = tf.Session()

    dyn_model = NNDynamicsModel(env=env,
                                n_layers=n_layers,
                                size=size,
                                activation=activation,
                                output_activation=output_activation,
                                normalization=normalization,
                                batch_size=batch_size,
                                iterations=dynamics_iters,
                                learning_rate=learning_rate,
                                sess=sess)

    mpc_controller = MPCcontroller(env=env,
                                   dyn_model=dyn_model,
                                   horizon=mpc_horizon,
                                   cost_fn=cost_fn,
                                   num_simulated_paths=num_simulated_paths)

    policy_nn = MlpPolicy_bc(sess=sess, env=env, hid_size=64, num_hid_layers=2,
                             clip_param=clip_param, entcoeff=entcoeff)

    bc_net = BCnetwork(sess, env, BATCH_SIZE_BC, learning_rate)

    mpc_controller_bc_ppo = MPCcontroller_BC_PPO(env=env,
                                                 dyn_model=dyn_model,
                                                 bc_ppo_network=policy_nn,
                                                 horizon=mpc_horizon,
                                                 cost_fn=cost_fn,
                                                 num_simulated_paths=num_simulated_paths)

    #========================================================
    #
    # Tensorflow session building.
    #
    sess.__enter__()
    tf.global_variables_initializer().run()

    # init or load checkpoint with saver
    saver = tf.train.Saver()
    checkpoint = tf.train.get_checkpoint_state(CHECKPOINT_DIR)

    if checkpoint and checkpoint.model_checkpoint_path and LOAD_MODEL:
        saver.restore(sess, checkpoint.model_checkpoint_path)
        print("checkpoint loaded:", checkpoint.model_checkpoint_path)
    else:
        print("Could not find old checkpoint")
        if not os.path.exists(CHECKPOINT_DIR):
            os.mkdir(CHECKPOINT_DIR)

    #========================================================
    #
    # Prepare for rollouts
    #
    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=100)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=100)  # rolling buffer for episode rewards

    max_timesteps = num_paths_onpol * env_horizon

    for itr in range(onpol_iters):
        print("onpol_iters: ", itr)

        dyn_model.fit(model_data_buffer)

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0)

        # saver.save(sess, CHECKPOINT_DIR)

        behavioral_cloning_eval(sess, env, policy_nn, env_horizon)

        ppo_data_buffer.clear()

        seg = traj_segment_generator(policy_nn, mpc_controller, mpc_controller_bc_ppo,
                                     bc_data_buffer, env, env_horizon)
        add_vtarg_and_adv(seg, gamma, lam)

        ob, ac, rew, nxt_ob, atarg, tdlamret = seg["ob"], seg["ac"], seg["rew"], \
            seg["nxt_ob"], seg["adv"], seg["tdlamret"]
        vpredbefore = seg["vpred"]  # predicted value function before update
        atarg = (atarg - atarg.mean()) / atarg.std()  # standardized advantage function estimate

        for n in range(len(ob)):
            ppo_data_buffer.add((ob[n], ac[n], rew[n], nxt_ob[n], atarg[n], tdlamret[n]))
            bc_data_buffer.add((ob[n], ac[n]))
            model_data_buffer.add(ob[n], ac[n], nxt_ob[n])

        print("ppo_data_buffer size", ppo_data_buffer.size)
        print("bc_data_buffer size", bc_data_buffer.size)
        print("model data buffer size: ", model_data_buffer.size)

        # optim_batchsize = optim_batchsize or ob.shape[0]
        # behavioral_cloning(sess, env, bc_net, mpc_controller, env_horizon, bc_data_buffer, Training_epoch=1000)

        if hasattr(policy_nn, "ob_rms"):
            policy_nn.ob_rms.update(ob)  # update running mean/std for policy

        policy_nn.assign_old_eq_new()  # set old parameter values to new parameter values

        for op_ep in range(optim_epochs):
            # losses = []  # list of tuples, each of which gives the loss for a minibatch
            # for i in range(int(timesteps_per_actorbatch / optim_batchsize)):

            if PPO:
                sample_ob_no, sample_ac_na, sample_rew, sample_nxt_ob_no, sample_adv_n, sample_b_n_target = \
                    ppo_data_buffer.sample(optim_batchsize)
                newlosses = policy_nn.lossandupdate_ppo(sample_ob_no, sample_ac_na, sample_adv_n,
                                                        sample_b_n_target, cur_lrmult,
                                                        optim_stepsize * cur_lrmult)
                # losses.append(newlosses)

            if BEHAVIORAL_CLONING:
                sample_ob_no, sample_ac_na = bc_data_buffer.sample(optim_batchsize)
                policy_nn.update_bc(sample_ob_no, sample_ac_na, optim_stepsize * cur_lrmult)

            if op_ep % 100 == 0:
                print('epoch: ', op_ep)
                behavioral_cloning_eval(sess, env, policy_nn, env_horizon)

        lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)

        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1

        ep_lengths = seg["ep_lens"]
        returns = seg["ep_rets"]

        logz.log_tabular("Time", time.time() - start)
        logz.log_tabular("Iteration", iters_so_far)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        # logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", timesteps_so_far)
        logz.dump_tabular()
        logz.pickle_tf_vars()
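# add_vtarg_and_adv above fills seg["adv"] and seg["tdlamret"] using
# Generalized Advantage Estimation (GAE-lambda). A minimal standalone sketch of
# that estimator for a single trajectory, assuming arrays of rewards and value
# predictions plus a bootstrap value for the final state (episode-boundary
# masking omitted for brevity; this is an illustration, not the project's
# exact implementation):
import numpy as np


def gae_advantages(rewards, vpreds, last_vpred, gamma, lam):
    T = len(rewards)
    vpreds = np.append(np.asarray(vpreds, dtype=float), last_vpred)
    adv = np.zeros(T)
    lastgaelam = 0.0
    for t in reversed(range(T)):
        delta = rewards[t] + gamma * vpreds[t + 1] - vpreds[t]
        lastgaelam = delta + gamma * lam * lastgaelam
        adv[t] = lastgaelam
    tdlamret = adv + vpreds[:-1]  # lambda-return targets for the value function
    return adv, tdlamret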