def plot_comparison(dyn_model, state_cb, pub_act, pub_cmd, rate):
    """Generate plots comparing the model predictions for each element of the
    state to the actual ground truth, using randomly sampled actions."""
    print('Plotting nn dynamics results')
    rand_cont = RandomController()
    s = state_cb.reset(pub_act, pub_cmd)
    env_state_traj = s
    model_state_traj = s
    steps = 100

    for i in range(steps):
        a = rand_cont.get_action(None)

        # Step environment
        env_s, _ = state_cb.step(a, pub_act, pub_cmd)
        env_state_traj = np.vstack((env_state_traj, env_s))

        # Step model (the model is fed its own previous prediction)
        if i == 0:
            model_s = dyn_model.predict(model_state_traj, a)
        else:
            model_s = dyn_model.predict(model_state_traj[i, :], a)
        model_state_traj = np.vstack((model_state_traj, model_s))

    body = 10
    state_names = ['x position', 'y position', 'z position',
                   'x angle', 'y angle', 'z angle',
                   'x velocity', 'y velocity', 'z velocity',
                   'x angular velocity', 'y angular velocity', 'z angular velocity']
    # for i in range(body * 12, (body + 1) * 12):
    for i in [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 108, 109, 110]:
        plt.figure()
        plt.plot(np.arange(steps + 1),
                 env_state_traj[:, i].reshape((steps + 1)),
                 label='env state')
        plt.plot(np.arange(steps + 1),
                 model_state_traj[:, i].reshape((steps + 1)),
                 label='model state')
        plt.title('Body ' + str(body) + ', ' + state_names[i % 12])
        plt.legend()
        plt.draw()
    plt.show()
def plot_comparison(dyn_model, pub_act, pub_cmd, rate):
    """Generate plots comparing the model predictions for each element of the
    state to the actual ground truth, using randomly sampled actions."""
    rand_cont = RandomController()
    s = reset(pub_cmd, rate)
    env_state_traj = s
    model_state_traj = s
    steps = 100

    for i in range(steps):
        a = rand_cont.get_action(None)

        # Step environment
        env_s, _ = step(a, pub_act, pub_cmd, rate)
        env_state_traj = np.vstack((env_state_traj, env_s))

        # Step model (the model is fed its own previous prediction)
        if i == 0:
            model_s = dyn_model.predict(model_state_traj, a)
        else:
            model_s = dyn_model.predict(model_state_traj[i, :], a)
        model_state_traj = np.vstack((model_state_traj, model_s))

    for i in range(len(s)):
        plt.figure()
        plt.plot(np.arange(steps + 1),
                 env_state_traj[:, i].reshape((steps + 1)),
                 label='env state')
        plt.plot(np.arange(steps + 1),
                 model_state_traj[:, i].reshape((steps + 1)),
                 label='model state')
        plt.title('State ' + str(i))
        plt.legend()
        plt.draw()
    plt.show()
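# Both plot_comparison variants above construct RandomController() without an
# environment handle and call get_action(None), so the controller must know the
# action dimension and bounds itself. A minimal sketch of such a controller follows;
# act_dim, act_low and act_high are assumptions, not values taken from the original
# code. The later train() snippets instead build RandomController(env), where the
# controller can simply return env.action_space.sample().
import numpy as np

class RandomController(object):
    def __init__(self, act_dim=12, act_low=-1.0, act_high=1.0):
        # Assumed action dimension and symmetric bounds; adjust to the robot at hand.
        self.act_dim = act_dim
        self.act_low = act_low
        self.act_high = act_high

    def get_action(self, state):
        # The state argument is ignored: actions are sampled uniformly at random.
        return np.random.uniform(self.act_low, self.act_high, self.act_dim)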
def get_test_batch(self, num_tasks, resample=False, task=None, controller='Rand',
                   task_range=(0, 7), task_fun=np.random.randint):
    if controller == 'Rand':
        self.controller = RandomController(self.env)
    elif controller == "MPC":
        self.controller = MPCcontroller(self.env)

    if resample:
        # random sample
        if task is None:
            learner_env_goals = sample_goals(num_tasks, task_range, task_fun)
        else:
            learner_env_goals = task

    for i in range(num_tasks):
        task = learner_env_goals[i]
        paths = sample(self.env, task, self.controller,
                       num_paths=self.num_paths_random,
                       horizon=self.env_horizon,
                       ignore_done=True,
                       K=self.K, M=self.M)  # 10
        data_x, data_y = self._data_process(paths)
        data_x = data_x[np.newaxis, :]
        data_y = data_y[np.newaxis, :]
        if i == 0:
            x = data_x
            y = data_y
        else:
            x = np.concatenate([x, data_x], axis=0)
            y = np.concatenate([y, data_y], axis=0)

    data_x, data_y = [], []
    for t in range(num_tasks):
        for h in range(self.env_horizon):
            data_x.append(x[t, h:(h + self.K + self.M), :])
            data_y.append(y[t, h:(h + self.K + self.M), :])
    data_x = np.array(data_x)
    data_y = np.array(data_y)

    # dataset = tf.data.Dataset.from_tensor_slices((data_x, data_y)).shuffle(
    #     buffer_size=self.env_horizon * self.num_tasks).batch(
    #         self.env_horizon).repeat()
    # # create the iterator
    # iter = dataset.make_one_shot_iterator()
    # iterator = iter.get_next()

    return data_x, data_y
def plot_comparison(env, dyn_model):
    """Generate plots comparing the model predictions for each element of the
    state to the actual ground truth, using randomly sampled actions."""
    """ YOUR CODE HERE """
    random_controller = RandomController(env)
    data = sample(env, random_controller)
    next_pred_state = dyn_model.predict(data['observations'], data['actions'])
    plt.plot(data['next_observations'])
    plt.plot(next_pred_state)
    plt.show()
def plot_comparison(env, dyn_model):
    """Generate plots comparing the model predictions for each element of the
    state to the actual ground truth, using randomly sampled actions."""
    """ YOUR CODE HERE """
    horizon = 100
    ob = env.reset()
    pred = ob[np.newaxis, :]
    obs, next_obs, acs, rewards = [], [], [], []
    preds = []
    steps = 0
    RC = RandomController(env)

    for _ in range(100):
        obs.append(ob)
        preds.append(pred)
        ac = RC.get_action(ob)
        acs.append(ac)
        ob, rew, done, _ = env.step(ac)
        pred = dyn_model.predict(pred, ac[np.newaxis, :])
        next_obs.append(ob)
        rewards.append(rew)
        steps += 1
        if done or steps > horizon:
            break

    path = {"observations": np.array(obs),
            "next_observations": np.array(next_obs),
            "rewards": np.array(rewards),
            "actions": np.array(acs),
            "predictions": np.array(preds)}

    print(path['observations'].shape)
    print(path['predictions'].shape)

    plt.plot(path['observations'][:, 0])
    plt.plot(path['predictions'][:, 0, 0])
    plt.show()
def plot_comparison(env, dyn_model):
    """Generate plots comparing the model predictions for each element of the
    state to the actual ground truth, using randomly sampled actions."""
    """ YOUR CODE HERE """
    data = sample(env, RandomController(env), num_paths=1)
    pred_states = dyn_model.predict(data[0]['observations'], data[0]['actions'])
    # Sum the squared error over state dimensions so there is one value per timestep.
    losses = np.sum((pred_states - data[0]['next_observations'])**2, axis=1)
    plt.plot(losses)
    plt.ylabel('predicted states squared error')
    plt.xlabel('timestep')
    plt.show()
def get_dataset(self, resample=False, task=None, controller='Rand',
                task_range=(0, 7), task_fun=np.random.randint):
    if controller == 'Rand':
        self.controller = RandomController(self.env)
    elif controller == "MPC":
        self.controller = MPCcontroller(self.env)

    if resample:
        # random sample
        if task is None:
            learner_env_goals = sample_goals(self.num_tasks, task_range, task_fun)
        else:
            learner_env_goals = task

    for i in range(self.num_tasks):
        task = learner_env_goals[i]
        paths = sample(self.env, task, self.controller,
                       num_paths=self.num_paths_random,
                       horizon=self.env_horizon,
                       ignore_done=True,
                       K=self.K, M=self.M)  # 10
        data_x, data_y = self._data_process(paths)
        data_x = data_x[np.newaxis, :]
        data_y = data_y[np.newaxis, :]
        if i == 0:
            self.x = data_x
            self.y = data_y
        else:
            self.x = np.concatenate([self.x, data_x], axis=0)
            self.y = np.concatenate([self.y, data_y], axis=0)

    # end = time.time()
    # runtime1 = end - start
    # print('time ', runtime1)

    print('env_horizon:', self.env_horizon)
    print('len of x:', len(self.x))
    return len(self.x)
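# get_test_batch and get_dataset above both defer task selection to a sample_goals
# helper that is not shown. A minimal sketch consistent with the call
# sample_goals(num_tasks, task_range, task_fun), with task_fun defaulting to
# np.random.randint, is below; the exact return format expected by the original code
# is an assumption.
import numpy as np

def sample_goals(num_tasks, task_range=(0, 7), task_fun=np.random.randint):
    # Draw one goal per task from [task_range[0], task_range[1]); both
    # np.random.randint and np.random.uniform accept (low, high, size).
    return task_fun(task_range[0], task_range[1], num_tasks)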
def train(env, cost_fn, logdir=None, render=False, learning_rate=1e-3, onpol_iters=10, dynamics_iters=60, batch_size=512, num_paths_random=10, num_paths_onpol=10, num_simulated_paths=10000, env_horizon=1000, mpc_horizon=15, n_layers=2, size=500, activation=tf.nn.relu, output_activation=None): """ Arguments: onpol_iters Number of iterations of onpolicy aggregation |_ for the loop to run. dynamics_iters Number of iterations of training for the | dynamics model which happen per iteration of |_ the aggregation loop. batch_size Batch size for dynamics training. num_paths_random Number of paths/trajectories/rollouts generated | by a random agent. We use these to train our |_ initial dynamics model. num_paths_onpol Number of paths to collect at each iteration of | aggregation, using the Model Predictive Control |_ policy. num_simulated_paths How many fictitious rollouts the MPC policy | should generate each time it is asked for an |_ action. env_horizon Number of timesteps in each path. mpc_horizon The MPC policy generates actions by imagining | fictitious rollouts, and picking the first | action of the best fictitious rollout. This | argument is how many timesteps should be in |_ each fictitious rollout. n_layers/size/activations Neural network architecture arguments. """ d("env = {}".format(env)) d("env.observation_space = {}".format(env.observation_space)) d("env.action_space = {}".format(env.action_space)) d("env.observation_space.shape = {}".format(env.observation_space.shape)) d("env.action_space.shape = {}".format(env.action_space.shape)) d("logdir = {}".format(logdir)) d("render = {}".format(render)) d("learning_rate = {}".format(learning_rate)) d("onpol_iters = {}".format(onpol_iters)) d("dynamics_iters = {}".format(dynamics_iters)) d("batch_size = {}".format(batch_size)) d("num_paths_random = {}".format(num_paths_random)) d("num_paths_onpol = {}".format(num_paths_onpol)) d("num_simulated_paths = {}".format(num_simulated_paths)) d("env_horizon = {}".format(env_horizon)) d("mpc_horizon = {}".format(mpc_horizon)) d("n_layers = {}".format(n_layers)) d("size = {}".format(size)) logz.configure_output_dir(logdir) #=========================================================================== # First, we need a lot of data generated by a random agent, with which # we'll begin to train our dynamics model. d("Generating random rollouts.") random_controller = RandomController(env) random_paths = sample(env=env, controller=random_controller, num_paths=num_paths_random, horizon=env_horizon, render=render) d("Done generating random rollouts.") #=========================================================================== # The random data will be used to get statistics (mean and std) for the # observations, actions, and deltas (where deltas are o_{t+1} - o_t). These # will be used for normalizing inputs and denormalizing outputs from the # dynamics network. d("Normalizing random rollouts.") data = paths_to_data(random_paths) normalization = compute_normalization(data) d("Done normalizing random rollouts.") mean_obs, std_obs, mean_deltas, std_deltas, mean_action, std_action = normalization d("mean_obs = {}".format(mean_obs)) d("std_obs = {}".format(std_obs)) d("mean_deltas = {}".format(mean_deltas)) d("std_deltas = {}".format(std_deltas)) d("mean_action = {}".format(mean_action)) d("std_action = {}".format(std_action)) #=========================================================================== # Build dynamics model and MPC controllers. 
sess = tf.Session() dyn_model = NNDynamicsModel(env=env, n_layers=n_layers, size=size, activation=activation, output_activation=output_activation, normalization=normalization, batch_size=batch_size, iterations=dynamics_iters, learning_rate=learning_rate, sess=sess) mpc_controller = MPCcontroller(env=env, dyn_model=dyn_model, horizon=mpc_horizon, cost_fn=cost_fn, num_simulated_paths=num_simulated_paths) #=========================================================================== # Tensorflow session building. sess.__enter__() tf.global_variables_initializer().run() #=========================================================================== # Take multiple iterations of onpolicy aggregation at each iteration # refitting the dynamics model to current dataset and then taking onpolicy # samples and aggregating to the dataset. # # Note: You don't need to use a mixing ratio in this assignment for new and # old data as described in https://arxiv.org/abs/1708.02596 start_time = time.time() for itr in range(onpol_iters): d("Iteration {}".format(itr)) # Shuffle data. d("Shuffling data.") shuffle_indexes = np.random.permutation(data["observations"].shape[0]) data["observations"] = data["observations"][shuffle_indexes] data["actions"] = data["actions"][shuffle_indexes] data["next_observations"] = data["next_observations"][shuffle_indexes] data["rewards"] = data["rewards"][shuffle_indexes] d("Done shuffling data.") # Fit the dynamics. d("Fitting dynamics.") dyn_model.fit(data) d("Done fitting dynamics.") # Generate on-policy rollouts. d("Generating on-policy rollouts.") rl_paths = sample(env=env, controller=mpc_controller, num_paths=num_paths_onpol, horizon=env_horizon, render=render) d("Done generating on-policy rollouts.") # Compute metrics. costs = np.array([path_cost(cost_fn, path) for path in rl_paths]) returns = np.array([sum(path["rewards"]) for path in rl_paths]) # Update data. new_data = paths_to_data(rl_paths) data = { "observations": np.concatenate([data["observations"], new_data["observations"]]), "actions": np.concatenate([data["actions"], new_data["actions"]]), "next_observations": np.concatenate( [data["next_observations"], new_data["next_observations"]]), "rewards": np.concatenate([data["rewards"], new_data["rewards"]]), } # TODO(mwhittaker): Shuffle if we need to. # LOGGING # Statistics for performance of MPC policy using our learned dynamics # model logz.log_tabular('Iteration', itr) logz.log_tabular('Time', time.time() - start_time) # In terms of cost function which your MPC controller uses to plan logz.log_tabular('AverageCost', np.mean(costs)) logz.log_tabular('StdCost', np.std(costs)) logz.log_tabular('MinimumCost', np.min(costs)) logz.log_tabular('MaximumCost', np.max(costs)) # In terms of true environment reward of your rolled out trajectory # using the MPC controller logz.log_tabular('AverageReturn', np.mean(returns)) logz.log_tabular('StdReturn', np.std(returns)) logz.log_tabular('MinimumReturn', np.min(returns)) logz.log_tabular('MaximumReturn', np.max(returns)) logz.dump_tabular()
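# The training loop above relies on two helpers that are not shown: paths_to_data,
# which flattens a list of rollouts into one dict of arrays, and path_cost, which
# scores a rollout with the supplied cost function. Minimal sketches are below,
# assuming each path is a dict of per-timestep "observations", "actions",
# "next_observations" and "rewards" arrays (the same keys the loop indexes into).
import numpy as np

def paths_to_data(paths):
    # Concatenate every rollout into a single flat dataset.
    return {
        "observations": np.concatenate([p["observations"] for p in paths]),
        "actions": np.concatenate([p["actions"] for p in paths]),
        "next_observations": np.concatenate([p["next_observations"] for p in paths]),
        "rewards": np.concatenate([p["rewards"] for p in paths]),
    }

def path_cost(cost_fn, path):
    # Total cost of one rollout under the planner's cost function.
    total = 0.0
    for ob, ac, next_ob in zip(path["observations"], path["actions"],
                               path["next_observations"]):
        total += cost_fn(ob, ac, next_ob)
    return total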
def train(env, cost_fn, logdir=None, render=False, learning_rate=1e-3, onpol_iters=10, dynamics_iters=60, batch_size=512, num_paths_random=10, num_paths_onpol=10, num_simulated_paths=10000, env_horizon=1000, mpc_horizon=15, n_layers=2, size=500, activation=tf.nn.relu, output_activation=None): # tracker = SummaryTracker() """ Arguments: onpol_iters Number of iterations of onpolicy aggregation for the loop to run. dynamics_iters Number of iterations of training for the dynamics model |_ which happen per iteration of the aggregation loop. batch_size Batch size for dynamics training. num_paths_random Number of paths/trajectories/rollouts generated | by a random agent. We use these to train our |_ initial dynamics model. num_paths_onpol Number of paths to collect at each iteration of |_ aggregation, using the Model Predictive Control policy. num_simulated_paths How many fictitious rollouts the MPC policy | should generate each time it is asked for an |_ action. env_horizon Number of timesteps in each path. mpc_horizon The MPC policy generates actions by imagining | fictitious rollouts, and picking the first action | of the best fictitious rollout. This argument is | how many timesteps should be in each fictitious |_ rollout. n_layers/size/activations Neural network architecture arguments. """ logz.configure_output_dir(logdir) #======================================================== # # First, we need a lot of data generated by a random # agent, with which we'll begin to train our dynamics # model. """ YOUR CODE HERE """ # Print env info print("-------- env info --------") print("observation_space: ", env.observation_space.shape) print("action_space: ", env.action_space.shape) print(" ") random_controller = RandomController(env) data_buffer = DataBuffer_withreward() bc_data_buffer = DataBuffer_SA(BC_BUFFER_SIZE) # sample path print("collecting random data ..... ") paths = sample(env, random_controller, num_paths=num_paths_random, horizon=env_horizon, render=False, verbose=False) # add into buffer for path in paths: for n in range(len(path['observations'])): data_buffer.add(path['observations'][n], path['actions'][n], path['rewards'][n], path['next_observations'][n]) #======================================================== # # The random data will be used to get statistics (mean # and std) for the observations, actions, and deltas # (where deltas are o_{t+1} - o_t). These will be used # for normalizing inputs and denormalizing outputs # from the dynamics network. # print("data buffer size: ", data_buffer.size) normalization = compute_normalization(data_buffer) #======================================================== # # Build dynamics model and MPC controllers and Behavioral cloning network. # sess = tf.Session() dyn_model = NNDynamicsModel(env=env, n_layers=n_layers, size=size, activation=activation, output_activation=output_activation, normalization=normalization, batch_size=batch_size, iterations=dynamics_iters, learning_rate=learning_rate, sess=sess) mpc_controller = MPCcontroller_learned_reward( env=env, dyn_model=dyn_model, horizon=mpc_horizon, cost_fn=cost_fn, num_simulated_paths=num_simulated_paths) bc_net = BCnetwork(sess, env, BATCH_SIZE_BC, learning_rate) mpc_controller_bc = MPCcontroller_BC_learned_reward( env=env, dyn_model=dyn_model, bc_network=bc_net, horizon=mpc_horizon, cost_fn=cost_fn, num_simulated_paths=num_simulated_paths) #======================================================== # # Tensorflow session building. 
# sess.__enter__() tf.global_variables_initializer().run() if LOAD_MODEL: # init or load checkpoint with saver saver = tf.train.Saver() checkpoint = tf.train.get_checkpoint_state(CHECKPOINT_DIR) if checkpoint and checkpoint.model_checkpoint_path and LOAD_MODEL: saver.restore(sess, checkpoint.model_checkpoint_path) print("checkpoint loaded:", checkpoint.model_checkpoint_path) else: print("Could not find old checkpoint") if not os.path.exists(CHECKPOINT_DIR): os.mkdir(CHECKPOINT_DIR) #======================================================== # # Take multiple iterations of onpolicy aggregation at each iteration refitting the dynamics model to current dataset and then taking onpolicy samples and aggregating to the dataset. # Note: You don't need to use a mixing ratio in this assignment for new and old data as described in https://arxiv.org/abs/1708.02596 # for itr in range(onpol_iters): """ YOUR CODE HERE """ print("onpol_iters: ", itr) dyn_model.fit(data_buffer) if LOAD_MODEL: saver.save(sess, CHECKPOINT_DIR) returns = [] costs = [] for w in range(num_paths_onpol): print("paths_onpol: ", w, " running.....") print("data buffer size: ", data_buffer.size) st = env.reset_model() path = { 'observations': [], 'actions': [], 'rewards': [], 'next_observations': [] } # tracker.print_diff() return_ = 0 for i in range(env_horizon): if render: env.render() # print("env_horizon: ", i) if BEHAVIORAL_CLONING: if bc_data_buffer.size > 2000: at = mpc_controller_bc.get_action(st) else: at = mpc_controller.get_action(st) else: at = mpc_controller.get_action(st) # at = random_controller.get_action(st) st_next, env_reward, _, _ = env._step(at) path['observations'].append(st) path['actions'].append(at) path['rewards'].append(env_reward) path['next_observations'].append(st_next) st = st_next return_ += env_reward # cost & return # cost = path_cost(cost_fn, path) # costs.append(cost) returns.append(return_) print("total return: ", return_) # print("costs: ", cost) # add into buffers for n in range(len(path['observations'])): data_buffer.add(path['observations'][n], path['actions'][n], path['rewards'][n], path['next_observations'][n]) bc_data_buffer.add(path['observations'][n], path['actions'][n]) if BEHAVIORAL_CLONING: bc_returns = behavioral_cloning(sess, env, bc_net, mpc_controller, env_horizon, bc_data_buffer, Training_epoch=1000) # LOGGING # Statistics for performance of MPC policy using # our learned dynamics model logz.log_tabular('Iteration', itr) # In terms of cost function which your MPC controller uses to plan if BEHAVIORAL_CLONING: logz.log_tabular('Average_BC_Return', np.mean(bc_returns)) logz.log_tabular('Std_BC_Return', np.std(bc_returns)) logz.log_tabular('Minimum_BC_Return', np.min(bc_returns)) logz.log_tabular('Maximum_BC_Return', np.max(bc_returns)) # In terms of true environment reward of your rolled out trajectory using the MPC controller logz.log_tabular('AverageReturn', np.mean(returns)) logz.log_tabular('StdReturn', np.std(returns)) logz.log_tabular('MinimumReturn', np.min(returns)) logz.log_tabular('MaximumReturn', np.max(returns)) logz.dump_tabular()
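# compute_normalization above is applied directly to the replay buffer. A sketch of
# the statistics it is expected to produce (mean/std of observations, deltas and
# actions, the same six-tuple the earlier train() variant unpacks) is below. The
# buffer accessor used here, get_all(), is hypothetical; the real
# DataBuffer_withreward API may expose the stored transitions differently.
import numpy as np

def compute_normalization(data_buffer):
    obs, acts, _, next_obs = data_buffer.get_all()  # hypothetical accessor
    deltas = next_obs - obs
    eps = 1e-8  # keep later divisions well-defined for constant dimensions
    return (np.mean(obs, axis=0), np.std(obs, axis=0) + eps,
            np.mean(deltas, axis=0), np.std(deltas, axis=0) + eps,
            np.mean(acts, axis=0), np.std(acts, axis=0) + eps)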
def train(env, cost_fn, logdir=None, render=False, learning_rate=1e-3, onpol_iters=10, dynamics_iters=60, batch_size=512, num_paths_random=10, num_paths_onpol=10, num_simulated_paths=10000, env_horizon=1000, mpc_horizon=15, n_layers=2, size=500, activation=tf.nn.relu, output_activation=None, clip_param=0.2 , entcoeff=0.0, gamma=0.99, lam=0.95, optim_epochs=10, optim_batchsize=64, schedule='linear', bc_lr=1e-3, ppo_lr=3e-4, timesteps_per_actorbatch=1000, MPC = True, BEHAVIORAL_CLONING = True, PPO = True, ): start = time.time() logz.configure_output_dir(logdir) print("-------- env info --------") print("observation_space: ", env.observation_space.shape) print("action_space: ", env.action_space.shape) print("BEHAVIORAL_CLONING: ", BEHAVIORAL_CLONING) print("PPO: ", PPO) print("MPC-AUG: ", MPC) print(" ") # initialize buffers model_data_buffer = DataBufferGeneral(1000000, 5) ppo_data_buffer = DataBufferGeneral(10000, 4) bc_data_buffer = DataBufferGeneral(BC_BUFFER_SIZE, 2) # random sample path print("collecting random data ..... ") random_controller = RandomController(env) paths = sample(env, random_controller, num_paths=num_paths_random, horizon=env_horizon, render=False, verbose=False) # add into buffer for path in paths: for n in range(len(path['observations'])): model_data_buffer.add([path['observations'][n], path['actions'][n], path['rewards'][n], path['next_observations'][n], path['next_observations'][n] - path['observations'][n]]) print("model data buffer size: ", model_data_buffer.size) normalization = compute_normalization(model_data_buffer) #======================================================== # # Build dynamics model and MPC controllers and Behavioral cloning network. # # tf_config = tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1) tf_config = tf.ConfigProto() tf_config.gpu_options.allow_growth = True sess = tf.Session(config=tf_config) dyn_model = NNDynamicsRewardModel(env=env, normalization=normalization, batch_size=batch_size, iterations=dynamics_iters, learning_rate=learning_rate, sess=sess) mpc_controller = MPCcontroller(env=env, dyn_model=dyn_model, horizon=mpc_horizon, cost_fn=cost_fn, num_simulated_paths=num_simulated_paths) policy_nn = MlpPolicy(sess=sess, env=env, hid_size=256, num_hid_layers=2, clip_param=clip_param , entcoeff=entcoeff) mpc_ppo_controller = MPCcontrollerPolicyNetReward(env=env, dyn_model=dyn_model, policy_net=policy_nn, self_exp=False, horizon=mpc_horizon, num_simulated_paths=num_simulated_paths) #======================================================== # # Tensorflow session building. 
# sess.__enter__() tf.global_variables_initializer().run() # init or load checkpoint with saver saver = tf.train.Saver() checkpoint = tf.train.get_checkpoint_state(CHECKPOINT_DIR) if checkpoint and checkpoint.model_checkpoint_path and LOAD_MODEL: saver.restore(sess, checkpoint.model_checkpoint_path) print("checkpoint loaded:", checkpoint.model_checkpoint_path) else: print("Could not find old checkpoint") if not os.path.exists(CHECKPOINT_DIR): os.mkdir(CHECKPOINT_DIR) #======================================================== # # Prepare for rollouts # episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=100) # rolling buffer for episode lengths rewbuffer = deque(maxlen=100) # rolling buffer for episode rewards max_timesteps = num_paths_onpol * env_horizon bc = False ppo_mpc = False mpc_returns = 0 for itr in range(onpol_iters): print(" ") print("onpol_iters: ", itr) if schedule == 'constant': cur_lrmult = 1.0 elif schedule == 'linear': cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0) print("bc learning_rate: ", bc_lr) print("ppo learning_rate: ", ppo_lr) ################## fit mpc model if MPC: dyn_model.fit(model_data_buffer) ################## ppo seg data if PPO: ppo_data_buffer.clear() # ppo_seg = traj_segment_generator_ppo(policy_nn, env, env_horizon) mpc = False ppo_seg = traj_segment_generator(policy_nn, mpc_controller, mpc_ppo_controller, bc_data_buffer, env, mpc, ppo_mpc, env_horizon) add_vtarg_and_adv(ppo_seg, gamma, lam) ob, ac, rew, nxt_ob, atarg, tdlamret = \ ppo_seg["ob"], ppo_seg["ac"], ppo_seg["rew"], ppo_seg["nxt_ob"], ppo_seg["adv"], ppo_seg["tdlamret"] atarg = (atarg - atarg.mean()) / atarg.std() # standardized advantage function estimate # add into buffer for n in range(len(ob)): ppo_data_buffer.add([ob[n], ac[n], atarg[n], tdlamret[n]]) if MPC: model_data_buffer.add([ob[n], ac[n], rew[n], nxt_ob[n], nxt_ob[n]-ob[n]]) ################## mpc augmented seg data if itr % MPC_AUG_GAP == 0 and MPC: print("MPC AUG PPO") ppo_mpc = True mpc = True mpc_seg = traj_segment_generator(policy_nn, mpc_controller, mpc_ppo_controller, bc_data_buffer, env, mpc, ppo_mpc, env_horizon) add_vtarg_and_adv(mpc_seg, gamma, lam) ob, ac, mpcac, rew, nxt_ob, atarg, tdlamret = mpc_seg["ob"], mpc_seg["ac"], mpc_seg["mpcac"], mpc_seg["rew"], mpc_seg["nxt_ob"], mpc_seg["adv"], mpc_seg["tdlamret"] atarg = (atarg - atarg.mean()) / atarg.std() # standardized advantage function estimate # add into buffer for n in range(len(ob)): # if PPO: # ppo_data_buffer.add([ob[n], ac[n], atarg[n], tdlamret[n]]) if BEHAVIORAL_CLONING and bc: bc_data_buffer.add([ob[n], mpcac[n]]) if MPC: model_data_buffer.add([ob[n], mpcac[n], rew[n], nxt_ob[n], nxt_ob[n]-ob[n]]) mpc_returns = mpc_seg["ep_rets"] seg = ppo_seg # check if seg is good ep_lengths = seg["ep_lens"] returns = seg["ep_rets"] # saver.save(sess, CHECKPOINT_DIR) if BEHAVIORAL_CLONING: if np.mean(returns) > 100: bc = True else: bc = False print("BEHAVIORAL_CLONING: ", bc) bc_return = behavioral_cloning_eval(sess, env, policy_nn, env_horizon) if bc_return > 100: ppo_mpc = True else: ppo_mpc = False ################## optimization print("ppo_data_buffer size", ppo_data_buffer.size) print("bc_data_buffer size", bc_data_buffer.size) print("model data buffer size: ", model_data_buffer.size) # optim_batchsize = optim_batchsize or ob.shape[0] if hasattr(policy_nn, "ob_rms"): policy_nn.ob_rms.update(ob) # update running mean/std for policy policy_nn.assign_old_eq_new() # set old parameter values to new 
parameter values for op_ep in range(optim_epochs): # losses = [] # list of tuples, each of which gives the loss for a minibatch # for i in range(int(timesteps_per_actorbatch/optim_batchsize)): if PPO: sample_ob_no, sample_ac_na, sample_adv_n, sample_b_n_target = ppo_data_buffer.sample(optim_batchsize) newlosses = policy_nn.lossandupdate_ppo(sample_ob_no, sample_ac_na, sample_adv_n, sample_b_n_target, cur_lrmult, ppo_lr*cur_lrmult) # losses.append(newlosses) if BEHAVIORAL_CLONING and bc: sample_ob_no, sample_ac_na = bc_data_buffer.sample(optim_batchsize) # print("sample_ob_no", sample_ob_no.shape) # print("sample_ac_na", sample_ac_na.shape) policy_nn.update_bc(sample_ob_no, sample_ac_na, bc_lr*cur_lrmult) if op_ep % (100) == 0 and BEHAVIORAL_CLONING and bc: print('epcho: ', op_ep) behavioral_cloning_eval(sess, env, policy_nn, env_horizon) ################## print and save data lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples lens, rews = map(flatten_lists, zip(*listoflrpairs)) lenbuffer.extend(lens) rewbuffer.extend(rews) episodes_so_far += len(lens) timesteps_so_far += sum(lens) iters_so_far += 1 # if np.mean(returns) > 1000: # filename = "seg_data.pkl" # pickle.dump(seg, open(filename, 'wb')) # print("saved", filename) logz.log_tabular("TimeSoFar", time.time() - start) logz.log_tabular("TimeEp", time.time() - tstart) logz.log_tabular("Iteration", iters_so_far) logz.log_tabular("AverageReturn", np.mean(returns)) logz.log_tabular("MpcReturn", np.mean(mpc_returns)) logz.log_tabular("StdReturn", np.std(returns)) logz.log_tabular("MaxReturn", np.max(returns)) logz.log_tabular("MinReturn", np.min(returns)) logz.log_tabular("EpLenMean", np.mean(ep_lengths)) logz.log_tabular("EpLenStd", np.std(ep_lengths)) # logz.log_tabular("TimestepsThisBatch", timesteps_this_batch) logz.log_tabular("TimestepsSoFar", timesteps_so_far) logz.dump_tabular() logz.pickle_tf_vars() tstart = time.time()
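# traj_segment_generator and add_vtarg_and_adv are used above but not defined in this
# section. The sketch below follows the GAE(lambda) helper from OpenAI Baselines' PPO
# code: "adv" and "tdlamret" match how the loop above reads the segment, while the
# "new", "vpred" and "nextvpred" fields are assumed to be filled in by the trajectory
# generator as in that reference implementation.
import numpy as np

def add_vtarg_and_adv(seg, gamma, lam):
    # Append bootstrap entries so indexing t+1 is valid at the final step.
    new = np.append(seg["new"], 0)
    vpred = np.append(seg["vpred"], seg["nextvpred"])
    T = len(seg["rew"])
    seg["adv"] = gaelam = np.empty(T, dtype="float32")
    lastgaelam = 0.0
    for t in reversed(range(T)):
        nonterminal = 1 - new[t + 1]
        delta = seg["rew"][t] + gamma * vpred[t + 1] * nonterminal - vpred[t]
        gaelam[t] = lastgaelam = delta + gamma * lam * nonterminal * lastgaelam
    # Lambda-return targets that the value function is regressed toward.
    seg["tdlamret"] = seg["adv"] + seg["vpred"]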
def train( env, cost_fn, logdir=None, render=False, learning_rate=1e-3, onpol_iters=10, dynamics_iters=60, batch_size=512, num_paths_random=10, num_paths_onpol=10, num_simulated_paths=10000, env_horizon=1000, mpc_horizon=15, n_layers=2, size=500, activation=tf.nn.relu, output_activation=None, clip_param=0.2, entcoeff=0.0, gamma=0.99, lam=0.95, optim_epochs=10, optim_batchsize=64, schedule='linear', bc_lr=1e-3, ppo_lr=3e-4, timesteps_per_actorbatch=1000, MPC=True, BEHAVIORAL_CLONING=True, PPO=True, ): start = time.time() logz.configure_output_dir(logdir) merged_summary, summary_writer, ppo_return_op, mpc_return_op, model_loss_op, reward_loss_op, ppo_std_op, mpc_std_op = build_summary_ops( logdir, env) print("-------- env info --------") print("Environment: ", FLAGS.env_name) print("observation_space: ", env.observation_space.shape) print("action_space: ", env.action_space.shape) print("action_space low: ", env.action_space.low) print("action_space high: ", env.action_space.high) print("BEHAVIORAL_CLONING: ", BEHAVIORAL_CLONING) print("PPO: ", PPO) print("MPC-AUG: ", MPC) print(" ") random_controller = RandomController(env) # Creat buffers model_data_buffer = DataBufferGeneral(FLAGS.MODELBUFFER_SIZE, 5) ppo_data_buffer = DataBufferGeneral(10000, 4) bc_data_buffer = DataBufferGeneral(2000, 2) # Random sample path print("collecting random data ..... ") paths = sample(env, random_controller, num_paths=num_paths_random, horizon=env_horizon, render=False, verbose=False) # add into buffer for path in paths: for n in range(len(path['observations'])): model_data_buffer.add([ path['observations'][n], path['actions'][n], path['rewards'][n], path['next_observations'][n], path['next_observations'][n] - path['observations'][n] ]) print("model data buffer size: ", model_data_buffer.size) normalization = compute_normalization(model_data_buffer) #======================================================== # # Build dynamics model and MPC controllers and Behavioral cloning network. # # tf_config = tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1) tf_config = tf.ConfigProto() tf_config.gpu_options.allow_growth = True sess = tf.Session(config=tf_config) policy_nn = MlpPolicy(sess=sess, env=env, hid_size=128, num_hid_layers=2, clip_param=clip_param, entcoeff=entcoeff) if FLAGS.LEARN_REWARD: print("Learn reward function") dyn_model = NNDynamicsRewardModel(env=env, normalization=normalization, batch_size=batch_size, iterations=dynamics_iters, learning_rate=learning_rate, sess=sess) mpc_ppo_controller = MPCcontrollerPolicyNetReward( env=env, dyn_model=dyn_model, explore=FLAGS.MPC_EXP, policy_net=policy_nn, self_exp=FLAGS.SELFEXP, horizon=mpc_horizon, num_simulated_paths=num_simulated_paths) else: print("Use predefined cost function") dyn_model = NNDynamicsModel(env=env, n_layers=n_layers, size=size, activation=activation, output_activation=output_activation, normalization=normalization, batch_size=batch_size, iterations=dynamics_iters, learning_rate=learning_rate, sess=sess) mpc_ppo_controller = MPCcontrollerPolicyNet( env=env, dyn_model=dyn_model, explore=FLAGS.MPC_EXP, policy_net=policy_nn, self_exp=FLAGS.SELFEXP, horizon=mpc_horizon, cost_fn=cost_fn, num_simulated_paths=num_simulated_paths) mpc_controller = MPCcontroller(env=env, dyn_model=dyn_model, horizon=mpc_horizon, cost_fn=cost_fn, num_simulated_paths=num_simulated_paths) # if not PPO: # mpc_ppo_controller = mpc_controller #======================================================== # # Tensorflow session building. 
# sess.__enter__() tf.global_variables_initializer().run() # init or load checkpoint with saver saver = tf.train.Saver() checkpoint = tf.train.get_checkpoint_state(logdir) if checkpoint and checkpoint.model_checkpoint_path and FLAGS.LOAD_MODEL: saver.restore(sess, checkpoint.model_checkpoint_path) print("checkpoint loaded:", checkpoint.model_checkpoint_path) else: print("Could not find old checkpoint") if not os.path.exists(logdir): os.mkdir(logdir) #======================================================== # # Prepare for rollouts # episodes_so_far = 0 timesteps_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=100) # rolling buffer for episode lengths rewbuffer = deque(maxlen=100) # rolling buffer for episode rewards max_timesteps = num_paths_onpol * env_horizon bc = False ppo_mpc = False mpc_returns = 0 model_loss = 0 for itr in range(onpol_iters): print(" ") print("onpol_iters: ", itr) if schedule == 'constant': cur_lrmult = 1.0 elif schedule == 'linear': cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0) print("bc learning_rate: ", bc_lr) print("ppo learning_rate: ", ppo_lr) ################## fit mpc model if MPC: model_loss, reward_loss = dyn_model.fit(model_data_buffer) ################## ppo seg data ppo_data_buffer.clear() # ppo_seg = traj_segment_generator_ppo(policy_nn, env, env_horizon) ppo_mpc = False mpc = False ppo_seg = traj_segment_generator(policy_nn, mpc_controller, mpc_ppo_controller, bc_data_buffer, env, mpc, ppo_mpc, env_horizon) add_vtarg_and_adv(ppo_seg, gamma, lam) ob, ac, rew, nxt_ob, atarg, tdlamret = \ ppo_seg["ob"], ppo_seg["ac"], ppo_seg["rew"], ppo_seg["nxt_ob"], ppo_seg["adv"], ppo_seg["tdlamret"] atarg = (atarg - atarg.mean() ) / atarg.std() # standardized advantage function estimate # add into buffer for n in range(len(ob)): ppo_data_buffer.add([ob[n], ac[n], atarg[n], tdlamret[n]]) model_data_buffer.add( [ob[n], ac[n], rew[n], nxt_ob[n], nxt_ob[n] - ob[n]]) ppo_std = np.std(ac, axis=0) print("ppo_std: ", ppo_std) ################## mpc augmented seg data if MPC: print("MPC AUG PPO") ppo_mpc = True mpc = True mpc_seg = traj_segment_generator(policy_nn, mpc_controller, mpc_ppo_controller, bc_data_buffer, env, mpc, ppo_mpc, env_horizon) add_vtarg_and_adv(mpc_seg, gamma, lam) ob, ac, mpcac, rew, nxt_ob, atarg, tdlamret = mpc_seg[ "ob"], mpc_seg["ac"], mpc_seg["mpcac"], mpc_seg[ "rew"], mpc_seg["nxt_ob"], mpc_seg["adv"], mpc_seg[ "tdlamret"] atarg = (atarg - atarg.mean()) / atarg.std( ) # standardized advantage function estimate mpc_returns = mpc_seg["ep_rets"] mpc_std = np.std(mpcac) if not MPC: mpc_std = 0 ################## mpc random seg data if FLAGS.mpc_rand: print("MPC Random base policy") ppo_mpc = False mpc = True mpc_random_seg = traj_segment_generator(policy_nn, mpc_controller, mpc_ppo_controller, bc_data_buffer, env, mpc, ppo_mpc, env_horizon) add_vtarg_and_adv(mpc_random_seg, gamma, lam) ob, ac, mpcac, rew, nxt_ob, atarg, tdlamret = mpc_random_seg[ "ob"], mpc_random_seg["ac"], mpc_random_seg[ "mpcac"], mpc_random_seg["rew"], mpc_random_seg[ "nxt_ob"], mpc_random_seg["adv"], mpc_random_seg[ "tdlamret"] atarg = (atarg - atarg.mean()) / atarg.std( ) # standardized advantage function estimate mpc_rand_returns = mpc_random_seg["ep_rets"] ################# PPO deterministic evaluation ppo_determinisitc_return = policy_net_eval(sess, env, policy_nn, env_horizon, stochastic=False) ################## optimization print("ppo_data_buffer size", ppo_data_buffer.size) print("bc_data_buffer size", bc_data_buffer.size) print("model data 
buffer size: ", model_data_buffer.size) # optim_batchsize = optim_batchsize or ob.shape[0] if hasattr(policy_nn, "ob_rms"): policy_nn.ob_rms.update(ob) # update running mean/std for policy policy_nn.assign_old_eq_new( ) # set old parameter values to new parameter values for op_ep in range(optim_epochs): # losses = [] # list of tuples, each of which gives the loss for a minibatch # for i in range(int(timesteps_per_actorbatch/optim_batchsize)): if PPO: sample_ob_no, sample_ac_na, sample_adv_n, sample_b_n_target = ppo_data_buffer.sample( optim_batchsize) newlosses = policy_nn.lossandupdate_ppo( sample_ob_no, sample_ac_na, sample_adv_n, sample_b_n_target, cur_lrmult, ppo_lr * cur_lrmult) # losses.append(newlosses) if BEHAVIORAL_CLONING and bc: sample_ob_no, sample_ac_na = bc_data_buffer.sample( optim_batchsize) # print("sample_ob_no", sample_ob_no.shape) # print("sample_ac_na", sample_ac_na.shape) policy_nn.update_bc(sample_ob_no, sample_ac_na, bc_lr * cur_lrmult) if op_ep % (100) == 0 and BEHAVIORAL_CLONING and bc: print('epcho: ', op_ep) policy_net_eval(sess, env, policy_nn, env_horizon) ################## print and save data seg = ppo_seg ep_lengths = seg["ep_lens"] returns = seg["ep_rets"] lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples lens, rews = map(flatten_lists, zip(*listoflrpairs)) lenbuffer.extend(lens) rewbuffer.extend(rews) episodes_so_far += len(lens) timesteps_so_far += sum(lens) # log ppo logz.log_tabular("TimeSoFar", time.time() - start) logz.log_tabular("TimeEp", time.time() - tstart) logz.log_tabular("Iteration", itr) logz.log_tabular("AverageReturn", np.mean(returns)) logz.log_tabular("StdReturn", np.std(returns)) logz.log_tabular("MaxReturn", np.max(returns)) logz.log_tabular("MinReturn", np.min(returns)) logz.log_tabular("EpLenMean", np.mean(ep_lengths)) logz.log_tabular("EpLenStd", np.std(ep_lengths)) logz.log_tabular("TimestepsSoFar", timesteps_so_far) logz.log_tabular("Condition", "PPO") logz.dump_tabular() # log ppo deterministic logz.log_tabular("Iteration", itr) logz.log_tabular("AverageReturn", ppo_determinisitc_return) logz.log_tabular("Condition", "PPO_DETERMINISTIC") logz.dump_tabular() # log mpc if MPC: logz.log_tabular("TimeSoFar", time.time() - start) logz.log_tabular("TimeEp", time.time() - tstart) logz.log_tabular("Iteration", itr) logz.log_tabular("AverageReturn", np.mean(mpc_returns)) logz.log_tabular("StdReturn", np.std(mpc_returns)) logz.log_tabular("MaxReturn", np.max(mpc_returns)) logz.log_tabular("MinReturn", np.min(mpc_returns)) logz.log_tabular("EpLenMean", np.mean(ep_lengths)) logz.log_tabular("EpLenStd", np.std(ep_lengths)) logz.log_tabular("TimestepsSoFar", timesteps_so_far) logz.log_tabular("Condition", "MPC_PPO") logz.dump_tabular() if FLAGS.mpc_rand: logz.log_tabular("TimeSoFar", time.time() - start) logz.log_tabular("TimeEp", time.time() - tstart) logz.log_tabular("Iteration", itr) logz.log_tabular("AverageReturn", np.mean(mpc_rand_returns)) logz.log_tabular("StdReturn", np.std(mpc_rand_returns)) logz.log_tabular("MaxReturn", np.max(mpc_rand_returns)) logz.log_tabular("MinReturn", np.min(mpc_rand_returns)) logz.log_tabular("EpLenMean", np.mean(ep_lengths)) logz.log_tabular("EpLenStd", np.std(ep_lengths)) logz.log_tabular("TimestepsSoFar", timesteps_so_far) logz.log_tabular("Condition", "MPC_RAND") logz.dump_tabular() # logz.pickle_tf_vars() tstart = time.time() ################### TF Summaries summary_str = sess.run(merged_summary, feed_dict={ ppo_return_op: 
np.mean(returns), mpc_return_op: np.mean(mpc_returns), model_loss_op: model_loss, ppo_std_op: ppo_std, reward_loss_op: reward_loss, mpc_std_op: mpc_std, }) summary_writer.add_summary(summary_str, itr) summary_writer.flush() ################ TF SAVE if itr % FLAGS.SAVE_ITER == 0 and itr != 0: save_path = saver.save(sess, logdir + "/model.ckpt") print("Model saved in path: %s" % save_path)
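# build_summary_ops is called at the top of this train() variant but not shown. The
# sketch below produces the eight values the loop unpacks (merged summary, writer,
# and six feedable placeholders) with plain tf.summary ops; the scalar-vs-histogram
# choices and the unused env argument are assumptions.
import tensorflow as tf

def build_summary_ops(logdir, env):
    ppo_return_op = tf.placeholder(tf.float32, shape=(), name="ppo_return")
    mpc_return_op = tf.placeholder(tf.float32, shape=(), name="mpc_return")
    model_loss_op = tf.placeholder(tf.float32, shape=(), name="model_loss")
    reward_loss_op = tf.placeholder(tf.float32, shape=(), name="reward_loss")
    # ppo_std is fed with a per-dimension std vector, mpc_std with a scalar.
    ppo_std_op = tf.placeholder(tf.float32, shape=(None,), name="ppo_action_std")
    mpc_std_op = tf.placeholder(tf.float32, shape=(), name="mpc_action_std")

    summaries = [
        tf.summary.scalar("return/ppo", ppo_return_op),
        tf.summary.scalar("return/mpc", mpc_return_op),
        tf.summary.scalar("loss/model", model_loss_op),
        tf.summary.scalar("loss/reward", reward_loss_op),
        tf.summary.histogram("policy/action_std", ppo_std_op),
        tf.summary.scalar("mpc/action_std", mpc_std_op),
    ]
    merged_summary = tf.summary.merge(summaries)
    summary_writer = tf.summary.FileWriter(logdir)
    return (merged_summary, summary_writer, ppo_return_op, mpc_return_op,
            model_loss_op, reward_loss_op, ppo_std_op, mpc_std_op)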
def train( env, cost_fn, logdir=None, render=False, learning_rate=1e-3, onpol_iters=10, dynamics_iters=60, batch_size=512, num_paths_random=10, num_paths_onpol=10, num_simulated_paths=1000, env_horizon=1000, mpc_horizon=15, n_layers=2, size=500, activation=tf.nn.relu, output_activation=None, ): """ Arguments: onpol_iters Number of iterations of onpolicy aggregation for the loop to run. dynamics_iters Number of iterations of training for the dynamics model |_ which happen per iteration of the aggregation loop. batch_size Batch size for dynamics training. num_paths_random Number of paths/trajectories/rollouts generated | by a random agent. We use these to train our |_ initial dynamics model. num_paths_onpol Number of paths to collect at each iteration of |_ aggregation, using the Model Predictive Control policy. num_simulated_paths How many fictitious rollouts the MPC policy | should generate each time it is asked for an |_ action. env_horizon Number of timesteps in each path. mpc_horizon The MPC policy generates actions by imagining | fictitious rollouts, and picking the first action | of the best fictitious rollout. This argument is | how many timesteps should be in each fictitious |_ rollout. n_layers/size/activations Neural network architecture arguments. """ logz.configure_output_dir(logdir) #======================================================== # # First, we need a lot of data generated by a random # agent, with which we'll begin to train our dynamics # model. random_controller = RandomController(env) """ YOUR CODE HERE """ paths_rand = sample(env, random_controller, num_paths=num_paths_random, horizon=env_horizon, render=render, verbose=False) #======================================================== # # The random data will be used to get statistics (mean # and std) for the observations, actions, and deltas # (where deltas are o_{t+1} - o_t). These will be used # for normalizing inputs and denormalizing outputs # from the dynamics network. # """ YOUR CODE HERE """ normalization = compute_normalization(paths_rand) gamma = 0.99 #======================================================== # # Build dynamics model and MPC controllers. # sess = tf.Session() dyn_model = NNDynamicsModel(env=env, n_layers=n_layers, size=size, activation=activation, output_activation=output_activation, normalization=normalization, batch_size=batch_size, iterations=dynamics_iters, learning_rate=learning_rate, sess=sess) mpc_controller = MPCcontroller(env=env, dyn_model=dyn_model, horizon=mpc_horizon, cost_fn=cost_fn, num_simulated_paths=num_simulated_paths) #======================================================== # # Tensorflow session building. # sess.__enter__() tf.global_variables_initializer().run() #======================================================== # # Take multiple iterations of onpolicy aggregation at each iteration refitting the dynamics model to current dataset and then taking onpolicy samples and aggregating to the dataset. 
# Note: You don't need to use a mixing ratio in this assignment for new and old data as described in https://arxiv.org/abs/1708.02596 # # prefit dynamic before on policy dagger: print("****** Pretrain dynamic Model *******") losses = [] obs_rand = np.concatenate([path["observation"] for path in paths_rand]) action_rand = np.concatenate([path["action"] for path in paths_rand]) next_ob_rand = np.concatenate([path["obs_next"] for path in paths_rand]) data_size_rand = obs_rand.shape[0] for i in range(1000): # obtain batch size from random policy batch_idx_rand = np.random.randint(data_size_rand, size=batch_size) batch_ob_rand = obs_rand[batch_idx_rand, :] batch_ac_rand = action_rand[batch_idx_rand, :] batch_nxt_rand = next_ob_rand[batch_idx_rand, :] # obtain batch size from on policy batch_ob = np.copy(batch_ob_rand) batch_ac = np.copy(batch_ac_rand) batch_nxt = np.copy(batch_nxt_rand) loss = dyn_model.fit(batch_ob, batch_ac, batch_nxt) losses.append(loss) if (i % 20 == 0): print('loss', loss) costs = [] returns = [] paths_rl = [] for itr in range(onpol_iters): """ YOUR CODE HERE """ # fit dynamic model if itr > 0: obs_rl = np.concatenate([path["observation"] for path in paths_rl]) action_rl = np.concatenate([path["action"] for path in paths_rl]) next_ob_rl = np.concatenate( [path["obs_next"] for path in paths_rl]) obs_rand = np.concatenate([path["observation"] for path in paths_rand]) action_rand = np.concatenate([path["action"] for path in paths_rand]) next_ob_rand = np.concatenate( [path["obs_next"] for path in paths_rand]) # print obs[128,:].shape data_size_rand = obs_rand.shape[0] if itr > 0: data_size_rl = obs_rl.shape[0] # batch_size=128 losses = [] # fit model function for i in range(dynamics_iters): # obtain batch size from random policy batch_idx_rand = np.random.randint(data_size_rand, size=batch_size / 20) batch_ob_rand = obs_rand[batch_idx_rand, :] batch_ac_rand = action_rand[batch_idx_rand, :] batch_nxt_rand = next_ob_rand[batch_idx_rand, :] # obtain batch size from on policy if itr > 0: batch_idx_rl = np.random.randint(data_size_rl, size=batch_size * 19 / 20) batch_ob_rl = obs_rl[batch_idx_rl, :] batch_ac_rl = action_rl[batch_idx_rl, :] batch_nxt_rl = next_ob_rl[batch_idx_rl, :] # mix them batch_ob = np.concatenate((batch_ob_rand, batch_ob_rl)) batch_ac = np.concatenate((batch_ac_rand, batch_ac_rl)) batch_nxt = np.concatenate((batch_nxt_rand, batch_nxt_rl)) else: batch_ob = np.copy(batch_ob_rand) batch_ac = np.copy(batch_ac_rand) batch_nxt = np.copy(batch_nxt_rand) loss = dyn_model.fit(batch_ob, batch_ac, batch_nxt) losses.append(loss) # if(i%20==0): # print('loss', loss) print("on policy dagger ", itr) ob = env.reset() observes, acs, rewards, obs_2, returns = [], [], [], [], [] steps = 0 g = 0 max_path_length = mpc_controller.horizon timesteps_this_batch = 0 while True: while True: observes.append(ob) ac = mpc_controller.get_action(ob) # print ac acs.append(ac) # print ac ob, rew, done, _ = env.step(ac) g += rew * gamma**steps obs_2.append(ob) rewards.append(rew) returns.append(g) steps += 1 if done or steps > max_path_length: terminated = done break path = { "observation": np.array(observes), "reward": np.array(rewards), "action": np.array(acs), "obs_next": np.array(obs_2), "return": np.array(returns) } paths_rl.append(path) timesteps_this_batch += pathlength(path) print g if timesteps_this_batch > batch_size: break trajectory_cost = trajectory_cost_fn(cheetah_cost_fn, path["observation"], path["action"], path["obs_next"]) costs.append(trajectory_cost) 
returns.append(path["return"][-1]) # print batch_ob.shape # LOGGING # Statistics for performance of MPC policy using # our learned dynamics model logz.log_tabular('Iteration', itr) # In terms of cost function which your MPC controller uses to plan logz.log_tabular('AverageCost', np.mean(costs)) logz.log_tabular('StdCost', np.std(costs)) logz.log_tabular('MinimumCost', np.min(costs)) logz.log_tabular('MaximumCost', np.max(costs)) # In terms of true environment reward of your rolled out trajectory using the MPC controller logz.log_tabular('AverageReturn', np.mean(returns)) logz.log_tabular('StdReturn', np.std(returns)) logz.log_tabular('MinimumReturn', np.min(returns)) logz.log_tabular('MaximumReturn', np.max(returns)) logz.dump_tabular()
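# The DAgger-style rollout loop above calls pathlength and trajectory_cost_fn, which
# are not defined in this section. Minimal sketches consistent with how they are
# called (path dicts keyed by "observation"/"action"/"obs_next"/"reward", with a
# per-transition cost summed over the trajectory) follow.
def pathlength(path):
    # Number of timesteps in one rollout.
    return len(path["reward"])

def trajectory_cost_fn(cost_fn, states, actions, next_states):
    # Sum a per-transition cost over an entire trajectory.
    total = 0.0
    for s, a, s_next in zip(states, actions, next_states):
        total += cost_fn(s, a, s_next)
    return total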
def train(env, cost_fn, exp_name='test', logdir=None, render=False, learning_rate=1e-3, onpol_iters=10, dynamics_iters=60, batch_size=512, num_paths_random=10, num_paths_onpol=10, num_simulated_paths=10000, env_horizon=1000, mpc_horizon=15, n_layers=2, size=500, activation=tf.nn.relu, output_activation=None): """ Arg: onpol_iters: Number of iterations of onpolicy aggregation for the loop to run. dynamics_iters: Number of iterations of training for the dynamics model |_ which happen per iteration of the aggregation loop. batch_size: Batch size for dynamics training. num_paths_random: Number of paths/trajectories/rollouts generated | by a random agent. We use these to train our |_ initial dynamics model. num_paths_onpol: Number of paths to collect at each iteration of |_ aggregation, using the MPC policy. num_simulated_paths: How many fictitious rollouts the MPC policy | should generate each time it is asked for an |_ action. env_horizon: Number of timesteps in each path. mpc_horizon: The MPC policy generates actions by imagining | fictitious rollouts, and picking the first action | of the best fictitious rollout. This argument is | how many timesteps should be in each fictitious |_ rollout. n_layers/size/activations: Neural network architecture arguments. """ # Configure output directory for logging logz.configure_output_dir(logdir) # Log experimental parameters args = inspect.getargspec(train)[0] locals_ = locals() locals_['cost_fn'] = 'cost_fn' locals_['activation'] = 'activation' locals_['env'] = 'env' params = {k: locals_[k] if k in locals_ else None for k in args} logz.save_params(params) #======================================================== # # First, we need a lot of data generated by a random # agent, with which we'll begin to train our dynamics # model. random_controller = RandomController(env) paths = sample(env=env, controller=random_controller, num_paths=num_paths_random, horizon=env_horizon, verbose=False) #======================================================== # # The random data will be used to get statistics (mean # and std) for the observations, actions, and deltas # (where deltas are o_{t+1} - o_t). These will be used # for normalizing inputs and denormalizing outputs # from the dynamics network. # normalization = { "observations": compute_normalization(paths["observations"]), "actions": compute_normalization(paths["actions"]), "deltas": compute_normalization(paths["next_observations"] - paths["observations"]) } #======================================================== # # Build dynamics model and MPC controllers. # sess = tf.Session() dyn_model = NNDynamicsModel(env=env, n_layers=n_layers, size=size, activation=activation, output_activation=output_activation, normalization=normalization, batch_size=batch_size, iterations=dynamics_iters, learning_rate=learning_rate, sess=sess) mpc_controller = MPCcontroller(env=env, dyn_model=dyn_model, horizon=mpc_horizon, cost_fn=cost_fn, num_simulated_paths=num_simulated_paths) #======================================================== # # Tensorflow session building. # sess.__enter__() tf.global_variables_initializer().run() #======================================================== # # Take multiple iterations of onpolicy aggregation at each iteration # refitting the dynamics model to current dataset and then taking onpolicy # samples and aggregating to the dataset. 
# TODO: implement mixing ratio for new and old data as described in # https://arxiv.org/abs/1708.02596 # for itr in range(onpol_iters): shuffle_indexes = np.random.permutation(paths["observations"].shape[0]) for key in ['observations', 'actions', 'next_observations', 'rewards']: paths[key] = paths[key][shuffle_indexes] dyn_model.fit(paths) newpaths = sample(env=env, controller=mpc_controller, num_paths=num_paths_onpol, horizon=env_horizon, verbose=False) # LOGGING # Statistics for performance of MPC policy using # our learned dynamics model costs = path_cost(cost_fn, newpaths) returns = newpaths["acc_rewards"] logz.log_tabular('Iteration', itr) # In terms of cost function which your MPC controller uses to plan logz.log_tabular('AverageCost', np.mean(costs)) logz.log_tabular('StdCost', np.std(costs)) logz.log_tabular('MinimumCost', np.min(costs)) logz.log_tabular('MaximumCost', np.max(costs)) # In terms of true environment reward of your rolled out trajectory # using the MPC controller logz.log_tabular('AverageReturn', np.mean(returns)) logz.log_tabular('StdReturn', np.std(returns)) logz.log_tabular('MinimumReturn', np.min(returns)) logz.log_tabular('MaximumReturn', np.max(returns)) logz.dump_tabular() for key in ['observations', 'actions', 'next_observations', 'rewards']: paths[key] = np.concatenate([paths[key], newpaths[key]])
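# In the variant above, compute_normalization is called once per array (observations,
# actions, deltas) rather than on a whole dataset, so it only needs the per-dimension
# statistics of a single matrix. A minimal sketch consistent with that usage follows;
# the exact contents of the returned tuple are an assumption.
import numpy as np

def compute_normalization(arr):
    # Per-dimension mean and std over the batch axis; the epsilon keeps later
    # normalization well-defined when a dimension is constant.
    return np.mean(arr, axis=0), np.std(arr, axis=0) + 1e-8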
def train(env,
          logdir=None,
          render=False,
          learning_rate=1e-3,
          dagger_iters=10,
          dynamics_iters=60,
          batch_size=512,
          num_random_rollouts=10,
          num_onpol_rollouts=10,
          num_simulated_paths=10000,
          env_horizon=1000,
          mpc_horizon=15,
          n_layers=2,
          n_hid_units=500,
          activation=tf.nn.relu,
          output_activation=None):
    """
    Arguments:

    dagger_iters          Number of iterations of onpolicy aggregation for the
                          loop to run.

    dynamics_iters        Number of iterations of training for the dynamics
                          model which happen per iteration of the aggregation
                          loop.

    batch_size            Batch size for dynamics training.

    num_random_rollouts   Number of paths/trajectories/rollouts generated by a
                          random agent. We use these to train our initial
                          dynamics model.

    num_onpol_rollouts    Number of paths to collect at each iteration of
                          aggregation, using the Model Predictive Control
                          policy.

    num_simulated_paths   How many fictitious rollouts the MPC policy should
                          generate each time it is asked for an action.

    env_horizon           Number of timesteps in each path.

    mpc_horizon           The MPC policy generates actions by imagining
                          fictitious rollouts, and picking the first action of
                          the best fictitious rollout. This argument is how
                          many timesteps should be in each fictitious rollout.

    n_layers/n_hid_units/activations   Neural network architecture arguments.
    """
    logz.configure_output_dir(logdir)

    #========================================================
    #
    # First, we need a lot of data generated by a random
    # agent, with which we'll begin to train our dynamics
    # model.
    random_controller = RandomController(env)
    paths = sample(env, random_controller,
                   num_rollouts=num_random_rollouts,
                   horizon=env_horizon)

    #========================================================
    #
    # The random data will be used to get statistics (mean
    # and std) for the observations, actions, and deltas
    # (where deltas are o_{t+1} - o_t). These will be used
    # for normalizing inputs and denormalizing outputs
    # from the dynamics network.
    normalization_stats = compute_normalization_stats(paths)

    #========================================================
    #
    # Build dynamics model and MPC controllers.
    #
    sess = tf.Session()

    dyn_model = NNDynamicsModel(env=env,
                                n_layers=n_layers,
                                n_hid_units=n_hid_units,
                                activation=activation,
                                output_activation=output_activation,
                                normalization_stats=normalization_stats,
                                batch_size=batch_size,
                                num_iter=dynamics_iters,
                                learning_rate=learning_rate,
                                sess=sess)

    mpc_controller = MPCcontroller(env=env,
                                   dyn_model=dyn_model,
                                   horizon=mpc_horizon,
                                   num_simulated_paths=num_simulated_paths)

    #========================================================
    #
    # Tensorflow session building.
    #
    sess.__enter__()
    tf.global_variables_initializer().run()

    #========================================================
    #
    # Take multiple iterations of onpolicy aggregation, at each iteration
    # refitting the dynamics model to the current dataset and then taking
    # on-policy samples and aggregating to the dataset.
    #
    # Note: You don't need to use a mixing ratio in this assignment
    # for new and old data as described in https://arxiv.org/abs/1708.02596
    #
    for i in range(dagger_iters):
        print('********** ITERATION {}/{} ************'.format(i + 1, dagger_iters))

        # Fitting dynamics model
        dyn_model.fit(paths)

        # Sampling on-policy
        new_paths = sample(env, mpc_controller,
                           num_rollouts=num_onpol_rollouts,
                           horizon=env_horizon)

        # Adding new paths and forgetting old ones
        paths = new_paths + random.sample(paths, len(new_paths) // 9)
        # paths += new_paths

        returns = [sum(path['rewards']) for path in new_paths]
        costs = [path_cost(path) for path in new_paths]

        # LOGGING
        # Statistics for performance of MPC policy using our learned dynamics model

        # In terms of cost function which your MPC controller uses to plan
        logz.log_tabular('AverageCost', np.mean(costs))
        logz.log_tabular('StdCost', np.std(costs))
        logz.log_tabular('MinimumCost', np.min(costs))
        logz.log_tabular('MaximumCost', np.max(costs))

        # In terms of true environment reward of your rolled out trajectory using the MPC controller
        logz.log_tabular('AverageReturn', np.mean(returns))
        logz.log_tabular('StdReturn', np.std(returns))
        logz.log_tabular('MinimumReturn', np.min(returns))
        logz.log_tabular('MaximumReturn', np.max(returns))

        logz.dump_tabular()
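# Every train() variant in this section calls a sample() helper that is never shown.
# A minimal sketch matching the signature used directly above
# (sample(env, controller, num_rollouts, horizon)) is below; the path dictionary keys
# other than 'rewards' (which the loop above reads) are assumptions.
import numpy as np

def sample(env, controller, num_rollouts=10, horizon=1000, render=False):
    # Roll out `controller` in `env` num_rollouts times, at most `horizon` steps each.
    paths = []
    for _ in range(num_rollouts):
        ob = env.reset()
        obs, acs, rews, next_obs = [], [], [], []
        for _ in range(horizon):
            if render:
                env.render()
            ac = controller.get_action(ob)
            next_ob, rew, done, _ = env.step(ac)
            obs.append(ob)
            acs.append(ac)
            rews.append(rew)
            next_obs.append(next_ob)
            ob = next_ob
            if done:
                break
        paths.append({'observations': np.array(obs),
                      'actions': np.array(acs),
                      'rewards': np.array(rews),
                      'next_observations': np.array(next_obs)})
    return paths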
def train(env, cost_fn, logdir=None, render=False, learning_rate=1e-3, onpol_iters=10, dynamics_iters=60, batch_size=512, num_paths_random=10, num_paths_onpol=10, num_simulated_paths=10000, env_horizon=1000, mpc_horizon=15, n_layers=2, size=500, activation=tf.nn.relu, output_activation=None): """ Arguments: onpol_iters Number of iterations of onpolicy aggregation for the loop to run. dynamics_iters Number of iterations of training for the dynamics model |_ which happen per iteration of the aggregation loop. batch_size Batch size for dynamics training. num_paths_random Number of paths/trajectories/rollouts generated | by a random agent. We use these to train our |_ initial dynamics model. num_paths_onpol Number of paths to collect at each iteration of |_ aggregation, using the Model Predictive Control policy. num_simulated_paths How many fictitious rollouts the MPC policy | should generate each time it is asked for an |_ action. env_horizon Number of timesteps in each path. mpc_horizon The MPC policy generates actions by imagining | fictitious rollouts, and picking the first action | of the best fictitious rollout. This argument is | how many timesteps should be in each fictitious |_ rollout. n_layers/size/activations Neural network architecture arguments. """ logz.configure_output_dir(logdir) # Log experimental parameters args = inspect.getargspec(train)[0] locals_ = locals() params = {k: locals_[k] if k in locals_ else None for k in args} print(params) # the three lines below are to override the functions passed in, which aren't serializable params["activation"] = "relu" params["cost_fn"] = "cheetah_cost_fn" params["env"] = "HalfCheetahEnvNew" logz.save_params(params) returns_file = "returns.csv" returns_array = [] #======================================================== # # First, we need a lot of data generated by a random # agent, with which we'll begin to train our dynamics # model. random_controller = RandomController(env) """ YOUR CODE HERE """ data = sample(env, random_controller, num_paths=num_paths_random, horizon=env_horizon, render=False, verbose=False) #======================================================== # # The random data will be used to get statistics (mean # and std) for the observations, actions, and deltas # (where deltas are o_{t+1} - o_t). These will be used # for normalizing inputs and denormalizing outputs # from the dynamics network. # """ YOUR CODE HERE """ normalization = compute_normalization(data) #======================================================== # # Build dynamics model and MPC controllers. # sess = tf.Session() dyn_model = NNDynamicsModel(env=env, n_layers=n_layers, size=size, activation=activation, output_activation=output_activation, normalization=normalization, batch_size=batch_size, iterations=dynamics_iters, learning_rate=learning_rate, sess=sess) mpc_controller = MPCcontroller(env=env, dyn_model=dyn_model, horizon=mpc_horizon, cost_fn=cost_fn, num_simulated_paths=num_simulated_paths) #======================================================== # # Tensorflow session building. # sess.__enter__() tf.global_variables_initializer().run() #======================================================== # # Take multiple iterations of onpolicy aggregation at each iteration refitting the dynamics model to current dataset and then # taking onpolicy samples and aggregating to the dataset. 
# Note: You don't need to use a mixing ratio in this assignment for new and old data as described in # https://arxiv.org/abs/1708.02596 # for itr in range(onpol_iters): """ YOUR CODE HERE """ print(itr) # learn/fit dynamics model using the Adam optimization algorithm l = dyn_model.fit(data) print(l) # sample a set of on-policy trajectories from the environment new_data = sample(env, mpc_controller, num_paths=num_paths_onpol, horizon=env_horizon, render=render, verbose=False) # append transition to dataset data += new_data # compute costs costs = np.array([path_cost(cost_fn, path) for path in new_data]) print(costs) # compute returns returns = np.array( [new_data[i]["returns"] for i in range(len(new_data))]) print(returns) returns_array.append(returns) np.array(returns_array).dump(returns_file) # LOGGING # Statistics for performance of MPC policy using # our learned dynamics model logz.log_tabular('Iteration', itr) # In terms of cost function which your MPC controller uses to plan logz.log_tabular('AverageCost', np.mean(costs)) logz.log_tabular('StdCost', np.std(costs)) logz.log_tabular('MinimumCost', np.min(costs)) logz.log_tabular('MaximumCost', np.max(costs)) # In terms of true environment reward of your rolled out trajectory using the MPC controller logz.log_tabular('AverageReturn', np.mean(returns)) logz.log_tabular('StdReturn', np.std(returns)) logz.log_tabular('MinimumReturn', np.min(returns)) logz.log_tabular('MaximumReturn', np.max(returns)) logz.dump_tabular()
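# MPCcontroller is constructed above but its implementation is not included in
# these snippets. A minimal random-shooting sketch of what such a controller's
# action selection typically does, under the assumption that dyn_model.predict
# accepts batched states/actions and cost_fn scores (state, action, next_state)
# triples; the real class may differ:
import numpy as np

def mpc_random_shooting(state, env, dyn_model, cost_fn,
                        horizon=15, num_simulated_paths=1000):
    # Sample K random action sequences, roll them out through the learned
    # dynamics model, score each imagined trajectory with the cost function,
    # and return the first action of the cheapest sequence.
    obs = np.tile(state, (num_simulated_paths, 1))
    total_cost = np.zeros(num_simulated_paths)
    first_actions = None
    for t in range(horizon):
        actions = np.array([env.action_space.sample()
                            for _ in range(num_simulated_paths)])
        if t == 0:
            first_actions = actions
        next_obs = dyn_model.predict(obs, actions)
        total_cost += np.array([cost_fn(s, a, s_next)
                                for s, a, s_next in zip(obs, actions, next_obs)])
        obs = next_obs
    return first_actions[np.argmin(total_cost)]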
def train(env, cost_fn, logdir=None, render=False, learning_rate=1e-3, onpol_iters=10, dynamics_iters=60, batch_size=512, num_paths_random=10, num_paths_onpol=10, num_simulated_paths=10000, env_horizon=1000, mpc_horizon=15, n_layers=2, size=500, activation=tf.nn.relu, output_activation=None, clip_param=0.2 , entcoeff=0.0, gamma=0.99, lam=0.95, optim_epochs=10, optim_batchsize=64, schedule='linear', bc_lr=1e-3, ppo_lr=3e-4, timesteps_per_actorbatch=1000, MPC = True, BEHAVIORAL_CLONING = True, PPO = True, ): start = time.time() print("-------- env info --------") print("Environment: ", FLAGS.env_name) print("observation_space: ", env.observation_space.shape) print("action_space: ", env.action_space.shape) print("action_space low: ", env.action_space.low) print("action_space high: ", env.action_space.high) print("BEHAVIORAL_CLONING: ", BEHAVIORAL_CLONING) print("PPO: ", PPO) print("MPC-AUG: ", MPC) print(" ") random_controller = RandomController(env) # Creat buffers model_data_buffer = DataBufferGeneral(FLAGS.MODELBUFFER_SIZE, 5) ppo_data_buffer = DataBufferGeneral(10000, 4) bc_data_buffer = DataBufferGeneral(2000, 2) # Random sample path print("collecting random data ..... ") paths = sample(env, random_controller, num_paths=num_paths_random, horizon=env_horizon, render=False, verbose=False) # add into buffer for path in paths: for n in range(len(path['observations'])): model_data_buffer.add([path['observations'][n], path['actions'][n], path['rewards'][n], path['next_observations'][n], path['next_observations'][n] - path['observations'][n]]) print("model data buffer size: ", model_data_buffer.size) normalization = compute_normalization(model_data_buffer) #======================================================== # # Build dynamics model and MPC controllers and Behavioral cloning network. # # tf_config = tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1) tf_config = tf.ConfigProto() tf_config.gpu_options.allow_growth = True sess = tf.Session(config=tf_config) policy_nn = MlpPolicy(sess=sess, env=env, hid_size=128, num_hid_layers=2, clip_param=clip_param , entcoeff=entcoeff) if FLAGS.LEARN_REWARD: print("Learn reward function") dyn_model = NNDynamicsRewardModel(env=env, normalization=normalization, batch_size=batch_size, iterations=dynamics_iters, learning_rate=learning_rate, sess=sess) mpc_ppo_controller = MPCcontrollerPolicyNetReward(env=env, dyn_model=dyn_model, explore=FLAGS.MPC_EXP, policy_net=policy_nn, self_exp=FLAGS.SELFEXP, horizon=mpc_horizon, num_simulated_paths=num_simulated_paths) else: print("Use predefined cost function") dyn_model = NNDynamicsModel(env=env, n_layers=n_layers, size=size, activation=activation, output_activation=output_activation, normalization=normalization, batch_size=batch_size, iterations=dynamics_iters, learning_rate=learning_rate, sess=sess) mpc_ppo_controller = MPCcontrollerPolicyNet(env=env, dyn_model=dyn_model, explore=FLAGS.MPC_EXP, policy_net=policy_nn, self_exp=FLAGS.SELFEXP, horizon=mpc_horizon, cost_fn=cost_fn, num_simulated_paths=num_simulated_paths) mpc_controller = MPCcontroller(env=env, dyn_model=dyn_model, horizon=mpc_horizon, cost_fn=cost_fn, num_simulated_paths=num_simulated_paths) # if not PPO: # mpc_ppo_controller = mpc_controller #======================================================== # # Tensorflow session building. 
# sess.__enter__() tf.global_variables_initializer().run() # init or load checkpoint with saver saver = tf.train.Saver() checkpoint = tf.train.get_checkpoint_state(FLAGS.model_path) print("checkpoint", checkpoint) if checkpoint and checkpoint.model_checkpoint_path and FLAGS.LOAD_MODEL: saver.restore(sess, checkpoint.model_checkpoint_path) print("checkpoint loaded:", checkpoint.model_checkpoint_path) else: print("Could not find old checkpoint") if not os.path.exists(FLAGS.model_path): os.mkdir(FLAGS.model_path) #======================================================== # # Prepare for rollouts # tstart = time.time() states_true = [] states_predict = [] rewards_true = [] rewards_predict = [] ob = env.reset() ob_pre = np.expand_dims(ob, axis=0) states_true.append(ob) states_predict.append(ob_pre) for step in range(100): # ac = env.action_space.sample() # not used, just so we have the datatype ac, _ = policy_nn.act(ob, stochastic=True) ob, rew, done, _ = env.step(ac) ob_pre, r_pre = dyn_model.predict(ob_pre, ac) states_true.append(ob) rewards_true.append(rew) states_predict.append(ob_pre) rewards_predict.append(r_pre[0][0]) states_true = np.asarray(states_true) states_predict = np.asarray(states_predict) states_predict = np.squeeze(states_predict, axis=1) rewards_true = np.asarray(rewards_true) rewards_predict = np.asarray(rewards_predict) print("states_true", states_true.shape) print("states_predict", states_predict.shape) print("rewards_true", rewards_true.shape) print("rewards_predict", rewards_predict.shape) np.savetxt('./data/eval_model/states_true.out', states_true, delimiter=',') np.savetxt('./data/eval_model/states_predict.out', states_predict, delimiter=',') np.savetxt('./data/eval_model/rewards_true.out', rewards_true, delimiter=',') np.savetxt('./data/eval_model/rewards_predict.out', rewards_predict, delimiter=',')
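# The evaluation block above writes the true and one-step-predicted rollouts to
# ./data/eval_model/*.out. A hedged helper for inspecting those dumps offline
# (assumes the files exist and matplotlib is available; plot_saved_model_eval is
# an illustrative name):
import numpy as np
import matplotlib.pyplot as plt

def plot_saved_model_eval(dim=0, path='./data/eval_model'):
    # Compare the true and predicted trajectory of one state dimension.
    states_true = np.loadtxt(path + '/states_true.out', delimiter=',')
    states_predict = np.loadtxt(path + '/states_predict.out', delimiter=',')
    plt.plot(states_true[:, dim], label='true')
    plt.plot(states_predict[:, dim], label='predicted')
    plt.title('State dimension %d' % dim)
    plt.legend()
    plt.show()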
def train(env, cost_fn, logdir=None, render=False, learning_rate=1e-3, onpol_iters=10, dynamics_iters=20, batch_size=64, num_paths_random=10, num_paths_onpol=10, num_simulated_paths=10000, env_horizon=500, mpc_horizon=15, n_layers=2, size=64, activation=tf.nn.relu, output_activation=None, controller_service=None, ): """ Arguments: onpol_iters Number of iterations of onpolicy aggregation for the loop to run. dynamics_iters Number of iterations of training for the dynamics model |_ which happen per iteration of the aggregation loop. batch_size Batch size for dynamics training. num_paths_random Number of paths/trajectories/rollouts generated | by a random agent. We use these to train our |_ initial dynamics model. num_paths_onpol Number of paths to collect at each iteration of |_ aggregation, using the Model Predictive Control policy. num_simulated_paths How many fictitious rollouts the MPC policy | should generate each time it is asked for an |_ action. env_horizon Number of timesteps in each path. mpc_horizon The MPC policy generates actions by imagining | fictitious rollouts, and picking the first action | of the best fictitious rollout. This argument is | how many timesteps should be in each fictitious |_ rollout. n_layers/size/activations Neural network architecture arguments. """ logz.configure_output_dir(logdir) #======================================================== # # First, we need a lot of data generated by a random # agent, with which we'll begin to train our dynamics # model. random_controller = RandomController(env) ref_controller = RefMPCController(env, lambda state: call_mpc(env, controller_service)) paths = sample(env, random_controller, num_paths=num_paths_random, horizon=env_horizon, render=False, verbose=False, ) #======================================================== # # The random data will be used to get statistics (mean # and std) for the observations, actions, and deltas # (where deltas are o_{t+1} - o_t). These will be used # for normalizing inputs and denormalizing outputs # from the dynamics network. # normalization = compute_normalization(paths) print(normalization) #======================================================== # # Build dynamics model and MPC controllers. # sess = tf.Session() dyn_model = NNDynamicsModel(env=env, n_layers=n_layers, size=size, activation=activation, output_activation=output_activation, normalization=normalization, batch_size=batch_size, iterations=dynamics_iters, learning_rate=learning_rate, sess=sess) mpc_controller = MPCcontroller(env=env, dyn_model=dyn_model, horizon=mpc_horizon, cost_fn=cost_fn, num_simulated_paths=num_simulated_paths) #======================================================== # # Tensorflow session building. # sess.__enter__() tf.global_variables_initializer().run() #======================================================== # # Take multiple iterations of onpolicy aggregation at each iteration refitting the dynamics model to current dataset and then taking onpolicy samples and aggregating to the dataset. 
    # Note: You don't need to use a mixing ratio in this assignment for new
    # and old data as described in https://arxiv.org/abs/1708.02596
    #
    for itr in range(onpol_iters):
        dyn_model.fit(paths)
        new_paths = sample(env, mpc_controller,
                           num_paths=num_paths_onpol,
                           horizon=env_horizon,
                           render=False,
                           verbose=False)

        costs = []
        returns = []
        for new_path in new_paths:
            cost = path_cost(cost_fn, new_path)
            costs.append(cost)
            returns.append(new_path['return'])
        costs = np.array(costs)
        returns = np.array(returns)

        paths = paths + new_paths  # Aggregation

        # LOGGING
        # Statistics for performance of MPC policy using
        # our learned dynamics model
        logz.log_tabular('Iteration', itr)
        # In terms of cost function which your MPC controller uses to plan
        logz.log_tabular('AverageCost', np.mean(costs))
        logz.log_tabular('StdCost', np.std(costs))
        logz.log_tabular('MinimumCost', np.min(costs))
        logz.log_tabular('MaximumCost', np.max(costs))
        # In terms of true environment reward of your rolled out trajectory using the MPC controller
        logz.log_tabular('AverageReturn', np.mean(returns))
        logz.log_tabular('StdReturn', np.std(returns))
        logz.log_tabular('MinimumReturn', np.min(returns))
        logz.log_tabular('MaximumReturn', np.max(returns))
        logz.dump_tabular()
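# path_cost(cost_fn, path) is called in the loop above but defined elsewhere. A
# minimal sketch consistent with the path dictionaries built in these snippets,
# assuming cost_fn accepts a single (state, action, next_state) triple:
def path_cost_sketch(cost_fn, path):
    # Sum the per-step cost c(s_t, a_t, s_{t+1}) over one trajectory.
    return sum(cost_fn(s, a, s_next)
               for s, a, s_next in zip(path['observations'],
                                       path['actions'],
                                       path['next_observations']))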
def train(env, cost_fn, logdir=None, render=False, learning_rate=1e-3, onpol_iters=10, dynamics_iters=60, batch_size=512, num_paths_random=10, num_paths_onpol=10, num_simulated_paths=10000, env_horizon=1000, mpc_horizon=15, n_layers=2, size=500, activation=tf.nn.relu, output_activation=None): """ Arguments: onpol_iters Number of iterations of onpolicy aggregation for the loop to run. dynamics_iters Number of iterations of training for the dynamics model |_ which happen per iteration of the aggregation loop. batch_size Batch size for dynamics training. num_paths_random Number of paths/trajectories/rollouts generated | by a random agent. We use these to train our |_ initial dynamics model. num_paths_onpol Number of paths to collect at each iteration of |_ aggregation, using the Model Predictive Control policy. num_simulated_paths How many fictitious rollouts the MPC policy | should generate each time it is asked for an |_ action. env_horizon Number of timesteps in each path. mpc_horizon The MPC policy generates actions by imagining | fictitious rollouts, and picking the first action | of the best fictitious rollout. This argument is | how many timesteps should be in each fictitious |_ rollout. n_layers/size/activations Neural network architecture arguments. """ logz.configure_output_dir(logdir) # Log experimental parameters args = inspect.getargspec(train)[0] locals_ = locals() params = {k: locals_[k] if k in locals_ else None for k in args} del params['cost_fn'] del params['activation'] del params['output_activation'] del params['env'] logz.save_params(params) #======================================================== # # First, we need a lot of data generated by a random # agent, with which we'll begin to train our dynamics # model. random_controller = RandomController(env) """ YOUR CODE HERE """ # Sample from random controller paths = sample(env, random_controller, num_paths_random, env_horizon, render, True) # Build data set data = dict() data['observations'] = np.concatenate( [path['observations'] for path in paths]) data['actions'] = np.concatenate([path['actions'] for path in paths]) next_observations = np.concatenate( [path['next_observations'] for path in paths]) data['deltas'] = next_observations - data['observations'] #======================================================== # # The random data will be used to get statistics (mean # and std) for the observations, actions, and deltas # (where deltas are o_{t+1} - o_t). These will be used # for normalizing inputs and denormalizing outputs # from the dynamics network. # """ YOUR CODE HERE """ normalization = compute_normalization(data) #======================================================== # # Build dynamics model and MPC controllers. # sess = tf.Session() dyn_model = NNDynamicsModel(env=env, n_layers=n_layers, size=size, activation=activation, output_activation=output_activation, normalization=normalization, batch_size=batch_size, iterations=dynamics_iters, learning_rate=learning_rate, sess=sess) mpc_controller = MPCcontroller(env=env, dyn_model=dyn_model, horizon=mpc_horizon, cost_fn=cost_fn, num_simulated_paths=num_simulated_paths) #======================================================== # # Tensorflow session building. # sess.__enter__() tf.global_variables_initializer().run() #======================================================== # # Take multiple iterations of onpolicy aggregation at each iteration refitting the dynamics model to current dataset and then taking onpolicy samples and aggregating to the dataset. 
# Note: You don't need to use a mixing ratio in this assignment for new and old data as described in https://arxiv.org/abs/1708.02596 # for itr in range(onpol_iters): """ YOUR CODE HERE """ # Refit dynamic model dyn_model.fit(data) # Sample on-policy trajectories paths = sample(env, mpc_controller, num_paths_onpol, env_horizon, render, True) # Summarize trajectories costs = [path_cost(cost_fn, path) for path in paths] returns = [np.sum(path['rewards']) for path in paths] # Aggregate data onpol_observations = np.concatenate( [path['observations'] for path in paths]) onpol_actions = np.concatenate([path['actions'] for path in paths]) onpol_next_observations = np.concatenate( [path['next_observations'] for path in paths]) onpol_deltas = onpol_next_observations - onpol_observations data['observations'] = np.append(data['observations'], onpol_observations, 0) data['actions'] = np.append(data['actions'], onpol_actions, 0) data['deltas'] = np.append(data['deltas'], onpol_deltas, 0) # LOGGING # Statistics for performance of MPC policy using # our learned dynamics model logz.log_tabular('Iteration', itr) # In terms of cost function which your MPC controller uses to plan logz.log_tabular('AverageCost', np.mean(costs)) logz.log_tabular('StdCost', np.std(costs)) logz.log_tabular('MinimumCost', np.min(costs)) logz.log_tabular('MaximumCost', np.max(costs)) # In terms of true environment reward of your rolled out trajectory using the MPC controller logz.log_tabular('AverageReturn', np.mean(returns)) logz.log_tabular('StdReturn', np.std(returns)) logz.log_tabular('MinimumReturn', np.min(returns)) logz.log_tabular('MaximumReturn', np.max(returns)) logz.dump_tabular()
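# compute_normalization is used above on a data dict with 'observations',
# 'actions' and 'deltas' arrays. A hedged sketch of the statistics it plausibly
# returns (per-dimension mean and std, with a small epsilon for later divisions);
# the actual helper may organize its output differently:
import numpy as np

def compute_normalization_sketch(data, eps=1e-10):
    # Per-dimension mean/std for observations, actions and deltas.
    stats = {}
    for key in ('observations', 'actions', 'deltas'):
        arr = np.asarray(data[key])
        stats[key] = (arr.mean(axis=0), arr.std(axis=0) + eps)
    return stats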
def train(env, cost_fn, logdir=None, render=False, learning_rate=1e-3, onpol_iters=1, dynamics_iters=60, batch_size=512, num_paths_random=10, num_paths_onpol=1, num_simulated_paths=10000, env_horizon=1000, mpc_horizon=15, n_layers=2, size=100, activation=tf.nn.relu, output_activation=None): """ Arguments: onpol_iters Number of iterations of onpolicy aggregation for the loop to run. dynamics_iters Number of iterations of training for the dynamics model |_ which happen per iteration of the aggregation loop. batch_size Batch size for dynamics training. num_paths_random Number of paths/trajectories/rollouts generated | by a random agent. We use these to train our |_ initial dynamics model. num_paths_onpol Number of paths to collect at each iteration of |_ aggregation, using the Model Predictive Control policy. num_simulated_paths How many fictitious rollouts the MPC policy | should generate each time it is asked for an |_ action. env_horizon Number of timesteps in each path. mpc_horizon The MPC policy generates actions by imagining | fictitious rollouts, and picking the first action | of the best fictitious rollout. This argument is | how many timesteps should be in each fictitious |_ rollout. n_layers/size/activations Neural network architecture arguments. """ logz.configure_output_dir(logdir) #======================================================== # # First, we need a lot of data generated by a random # agent, with which we'll begin to train our dynamics # model. random_controller = RandomController(env) paths, rewards, costs = sample(env, random_controller, num_paths_random) obs = np.concatenate([path["observations"] for path in paths]) acs = np.concatenate([path["actions"] for path in paths]) n_obs = np.concatenate([path["next_observations"] for path in paths]) delta = n_obs - obs data = {'observations': obs, 'actions': acs, 'delta': delta} #======================================================== # # The random data will be used to get statistics (mean # and std) for the observations, actions, and deltas # (where deltas are o_{t+1} - o_t). These will be used # for normalizing inputs and denormalizing outputs # from the dynamics network. # mean_obs, std_obs, mean_deltas, std_deltas, mean_actions, std_actions = compute_normalization( data) normalization = dict() normalization['observations'] = [mean_obs, std_obs] normalization['actions'] = [mean_actions, std_actions] normalization['delta'] = [mean_deltas, std_deltas] #======================================================== # # Build dynamics model and MPC controllers. # sess = tf.Session() dyn_model = NNDynamicsModel(env=env, n_layers=n_layers, size=size, activation=activation, output_activation=output_activation, normalization=normalization, batch_size=batch_size, iterations=dynamics_iters, learning_rate=learning_rate, sess=sess) mpc_controller = MPCcontroller(env=env, dyn_model=dyn_model, horizon=mpc_horizon, cost_fn=cost_fn, num_simulated_paths=num_simulated_paths) #======================================================== # # Tensorflow session building. # sess.__enter__() tf.global_variables_initializer().run() #======================================================== # # Take multiple iterations of onpolicy aggregation at each iteration refitting the dynamics model to current dataset and then taking onpolicy samples and aggregating to the dataset. 
# Note: You don't need to use a mixing ratio in this assignment for new and old data as described in https://arxiv.org/abs/1708.02596 # print("onpol_iter", onpol_iters) for itr in range(onpol_iters): """ YOUR CODE HERE """ print(data['observations'].shape) #print(data['observations'].shape) dyn_model.fit(data) # Generate trajectories from MPC controllers pathsM, returns, costs = sample(env, mpc_controller, num_paths_onpol) obs = np.concatenate([path["observations"] for path in pathsM]) acs = np.concatenate([path["actions"] for path in pathsM]) n_obs = np.concatenate([path["next_observations"] for path in pathsM]) delta = n_obs - obs data = { 'observations': np.concatenate((data['observations'], obs)), 'actions': np.concatenate((data['actions'], acs)), 'delta': np.concatenate((data['delta'], delta)) } # LOGGING # Statistics for performance of MPC policy using # our learned dynamics model logz.log_tabular('Iteration', itr) # In terms of cost function which your MPC controller uses to plan logz.log_tabular('AverageCost', np.mean(costs)) logz.log_tabular('StdCost', np.std(costs)) logz.log_tabular('MinimumCost', np.min(costs)) logz.log_tabular('MaximumCost', np.max(costs)) # In terms of true environment reward of your rolled out trajectory using the MPC controller logz.log_tabular('AverageReturn', np.mean(returns)) logz.log_tabular('StdReturn', np.std(returns)) logz.log_tabular('MinimumReturn', np.min(returns)) logz.log_tabular('MaximumReturn', np.max(returns)) logz.dump_tabular()
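# NNDynamicsModel is fit above but its prediction step is not shown. A hedged
# sketch of the usual delta-prediction scheme (normalize inputs, predict a
# normalized state delta, denormalize, add to the current state), written against
# the [mean, std] normalization layout built in this snippet; net_fn stands in
# for the trained network:
import numpy as np

def predict_next_state(net_fn, normalization, states, actions, eps=1e-10):
    # net_fn maps the normalized (state, action) input to a normalized delta.
    mean_obs, std_obs = normalization['observations']
    mean_act, std_act = normalization['actions']
    mean_delta, std_delta = normalization['delta']
    norm_obs = (states - mean_obs) / (std_obs + eps)
    norm_act = (actions - mean_act) / (std_act + eps)
    norm_delta = net_fn(np.concatenate([norm_obs, norm_act], axis=-1))
    return states + norm_delta * std_delta + mean_delta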
def main(): nb_total_steps = 1000 nb_iterations = 40 hidden_layers = [256, 256] writer = tensorboardX.SummaryWriter() args = parse_args(__doc__, ['env']) env = gym.make(args.env) ctrl = rand_ctrl = RandomController(env) # ipdb.set_trace() print('#inputs : %d' % ctrl.nb_inputs()) print('#actions: %d' % ctrl.nb_actions()) # f_net = make_net( # [ctrl.nb_inputs() + ctrl.nb_actions()] + hidden_layers + [ctrl.nb_inputs()], # [nn.ReLU() for _ in hidden_layers], # ) f_net = MOENetwork( nb_inputs=ctrl.nb_inputs() + ctrl.nb_actions(), nb_experts=4, gait_layers=[64], expert_layers=[64, ctrl.nb_inputs()], ) data = collect_data(env, ctrl, nb_total_steps*10) # ipdb.set_trace() dynamics = DynamicsModel(env, f_net, data.get_all(), writer=writer) # cost_func = lambda s,a,sn: -sn[3].item() # refers to vx cost_func = get_cost(args.env) # refers to vx # data.calc_normalizations() # dynamics.fit(data) mpc_ctrl = MPCcontroller(env, dynamics.predict, cost_func, num_simulated_paths=100, horizon=10, num_mpc_steps=10) eval_args = EvaluationArgs(nb_burnin_steps=4, nb_episodes=10, horizons=[1, 2, 4, 8, 16, 32]) for i in range(nb_iterations): print('Iteration', i) new_data = collect_data(env, ctrl, nb_total_steps) dynamics.fit(*new_data.get_all()) data.extend(new_data) dynamics.fit(*data.sample(sample_size=4*nb_total_steps)) evaluate_and_log_dynamics( dynamics.predict, env, rand_ctrl, writer=writer, i_step=i, args=eval_args ) evaluate_and_log_dynamics( dynamics.predict, env, mpc_ctrl, writer=writer, i_step=i, args=eval_args ) # dynamics.fit(*data.get_all()) if random.random() > 0.5: ctrl = rand_ctrl else: ctrl = mpc_ctrl env = gym.make(args.env) ctrl = MPCcontroller(env, dynamics.predict, cost_func, num_simulated_paths=1000, num_mpc_steps=4) # TODO env.render(mode='human') obs = env.reset() for _ in range(100): # time.sleep(1. / 60.) obs, r, done, _ = env.step(ctrl.get_action(obs)) # print(' ', cost_func(obs)) if done: print("done:", r, obs) time.sleep(1) ctrl.reset() obs = env.reset() ipdb.set_trace()
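# collect_data above is assumed to roll the given controller in the environment
# and return a transition container. A minimal hedged stand-in that returns plain
# arrays (the real helper and its get_all/extend API may differ):
import numpy as np

def collect_transitions(env, ctrl, nb_steps):
    # Roll the controller in the environment and record (s, a, s') tuples,
    # resetting whenever an episode terminates.
    obs_list, act_list, next_obs_list = [], [], []
    obs = env.reset()
    for _ in range(nb_steps):
        act = ctrl.get_action(obs)
        next_obs, reward, done, _ = env.step(act)
        obs_list.append(obs)
        act_list.append(act)
        next_obs_list.append(next_obs)
        obs = next_obs if not done else env.reset()
    return np.array(obs_list), np.array(act_list), np.array(next_obs_list)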
def train( env, cost_fn, logdir=None, render=False, learning_rate=1e-3, onpol_iters=10, dynamics_iters=60, batch_size=512, num_paths_random=10, num_paths_onpol=10, num_simulated_paths=10000, env_horizon=1000, mpc_horizon=15, n_layers=2, size=500, activation=tf.nn.relu, output_activation=None, clip_param=0.2, entcoeff=0.0, gamma=0.99, lam=0.95, optim_epochs=10, optim_batchsize=64, schedule='linear', optim_stepsize=3e-4, timesteps_per_actorbatch=1000, BEHAVIORAL_CLONING=True, PPO=True, ): start = time.time() logz.configure_output_dir(logdir) print("-------- env info --------") print("observation_space: ", env.observation_space.shape) print("action_space: ", env.action_space.shape) print("BEHAVIORAL_CLONING: ", BEHAVIORAL_CLONING) print("PPO: ", PPO) print(" ") random_controller = RandomController(env) model_data_buffer = DataBuffer() ppo_data_buffer = DataBuffer_general(BC_BUFFER_SIZE, 6) bc_data_buffer = DataBuffer_general(BC_BUFFER_SIZE, 2) # sample path print("collecting random data ..... ") paths = sample(env, random_controller, num_paths=num_paths_random, horizon=env_horizon, render=False, verbose=False) # add into buffer for path in paths: for n in range(len(path['observations'])): model_data_buffer.add(path['observations'][n], path['actions'][n], path['next_observations'][n]) print("model data buffer size: ", model_data_buffer.size) normalization = compute_normalization(model_data_buffer) #======================================================== # # Build dynamics model and MPC controllers and Behavioral cloning network. # sess = tf.Session() dyn_model = NNDynamicsModel(env=env, n_layers=n_layers, size=size, activation=activation, output_activation=output_activation, normalization=normalization, batch_size=batch_size, iterations=dynamics_iters, learning_rate=learning_rate, sess=sess) mpc_controller = MPCcontroller(env=env, dyn_model=dyn_model, horizon=mpc_horizon, cost_fn=cost_fn, num_simulated_paths=num_simulated_paths) policy_nn = MlpPolicy_bc(sess=sess, env=env, hid_size=64, num_hid_layers=2, clip_param=clip_param, entcoeff=entcoeff) bc_net = BCnetwork(sess, env, BATCH_SIZE_BC, learning_rate) mpc_controller_bc_ppo = MPCcontroller_BC_PPO( env=env, dyn_model=dyn_model, bc_ppo_network=policy_nn, horizon=mpc_horizon, cost_fn=cost_fn, num_simulated_paths=num_simulated_paths) #======================================================== # # Tensorflow session building. 
# sess.__enter__() tf.global_variables_initializer().run() # init or load checkpoint with saver saver = tf.train.Saver() checkpoint = tf.train.get_checkpoint_state(CHECKPOINT_DIR) if checkpoint and checkpoint.model_checkpoint_path and LOAD_MODEL: saver.restore(sess, checkpoint.model_checkpoint_path) print("checkpoint loaded:", checkpoint.model_checkpoint_path) else: print("Could not find old checkpoint") if not os.path.exists(CHECKPOINT_DIR): os.mkdir(CHECKPOINT_DIR) #======================================================== # # Prepare for rollouts # episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=100) # rolling buffer for episode lengths rewbuffer = deque(maxlen=100) # rolling buffer for episode rewards max_timesteps = num_paths_onpol * env_horizon for itr in range(onpol_iters): print("onpol_iters: ", itr) dyn_model.fit(model_data_buffer) if schedule == 'constant': cur_lrmult = 1.0 elif schedule == 'linear': cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0) # saver.save(sess, CHECKPOINT_DIR) behavioral_cloning_eval(sess, env, policy_nn, env_horizon) ppo_data_buffer.clear() seg = traj_segment_generator(policy_nn, mpc_controller, mpc_controller_bc_ppo, bc_data_buffer, env, env_horizon) add_vtarg_and_adv(seg, gamma, lam) ob, ac, rew, nxt_ob, atarg, tdlamret = seg["ob"], seg["ac"], seg[ "rew"], seg["nxt_ob"], seg["adv"], seg["tdlamret"] vpredbefore = seg["vpred"] # predicted value function before udpate atarg = (atarg - atarg.mean() ) / atarg.std() # standardized advantage function estimate for n in range(len(ob)): ppo_data_buffer.add( (ob[n], ac[n], rew[n], nxt_ob[n], atarg[n], tdlamret[n])) bc_data_buffer.add((ob[n], ac[n])) model_data_buffer.add(ob[n], ac[n], nxt_ob[n]) print("ppo_data_buffer size", ppo_data_buffer.size) print("bc_data_buffer size", bc_data_buffer.size) print("model data buffer size: ", model_data_buffer.size) # optim_batchsize = optim_batchsize or ob.shape[0] # behavioral_cloning(sess, env, bc_net, mpc_controller, env_horizon, bc_data_buffer, Training_epoch=1000) if hasattr(policy_nn, "ob_rms"): policy_nn.ob_rms.update(ob) # update running mean/std for policy policy_nn.assign_old_eq_new( ) # set old parameter values to new parameter values for op_ep in range(optim_epochs): # losses = [] # list of tuples, each of which gives the loss for a minibatch # for i in range(int(timesteps_per_actorbatch/optim_batchsize)): if PPO: sample_ob_no, sample_ac_na, sample_rew, sample_nxt_ob_no, sample_adv_n, sample_b_n_target = ppo_data_buffer.sample( optim_batchsize) newlosses = policy_nn.lossandupdate_ppo( sample_ob_no, sample_ac_na, sample_adv_n, sample_b_n_target, cur_lrmult, optim_stepsize * cur_lrmult) # losses.append(newlosses) if BEHAVIORAL_CLONING: sample_ob_no, sample_ac_na = bc_data_buffer.sample( optim_batchsize) policy_nn.update_bc(sample_ob_no, sample_ac_na, optim_stepsize * cur_lrmult) if op_ep % 100 == 0: print('epcho: ', op_ep) behavioral_cloning_eval(sess, env, policy_nn, env_horizon) lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples lens, rews = map(flatten_lists, zip(*listoflrpairs)) lenbuffer.extend(lens) rewbuffer.extend(rews) episodes_so_far += len(lens) timesteps_so_far += sum(lens) iters_so_far += 1 ep_lengths = seg["ep_lens"] returns = seg["ep_rets"] logz.log_tabular("Time", time.time() - start) logz.log_tabular("Iteration", iters_so_far) logz.log_tabular("AverageReturn", np.mean(returns)) logz.log_tabular("StdReturn", 
                        np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        # logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", timesteps_so_far)
        logz.dump_tabular()
        logz.pickle_tf_vars()
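# The policy above is also pulled toward the MPC actions through
# policy_nn.update_bc. A hedged TensorFlow 1.x sketch of such a
# behavioral-cloning update (make_bc_update and expert_ac_ph are illustrative
# names, separate from the MlpPolicy implementation actually used):
import tensorflow as tf

def make_bc_update(policy_action_op, ac_dim, lr=1e-3):
    # Regress the policy's action toward the expert (here: MPC) action with a
    # mean-squared-error loss; returns the target placeholder, loss and train op.
    expert_ac_ph = tf.placeholder(tf.float32, [None, ac_dim], name='expert_ac')
    bc_loss = tf.reduce_mean(
        tf.reduce_sum(tf.square(policy_action_op - expert_ac_ph), axis=1))
    train_op = tf.train.AdamOptimizer(lr).minimize(bc_loss)
    return expert_ac_ph, bc_loss, train_op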
def train(env, cost_fn, logdir=None, render=False, learning_rate=1e-3, onpol_iters=10, dynamics_iters=60, batch_size=512, num_paths_random=10, num_paths_onpol=10, num_simulated_paths=10000, env_horizon=1000, mpc_horizon=10, n_layers=2, size=500, activation=tf.nn.relu, output_activation=None): """ Arguments: onpol_iters Number of iterations of onpolicy aggregation for the loop to run. dynamics_iters Number of iterations of training for the dynamics model |_ which happen per iteration of the aggregation loop. batch_size Batch size for dynamics training. num_paths_random Number of paths/trajectories/rollouts generated | by a random agent. We use these to train our |_ initial dynamics model. num_paths_onpol Number of paths to collect at each iteration of |_ aggregation, using the Model Predictive Control policy. num_simulated_paths How many fictitious rollouts the MPC policy | should generate each time it is asked for an |_ action. env_horizon Number of timesteps in each path. mpc_horizon The MPC policy generates actions by imagining | fictitious rollouts, and picking the first action | of the best fictitious rollout. This argument is | how many timesteps should be in each fictitious |_ rollout. n_layers/size/activations Neural network architecture arguments. """ logz.configure_output_dir(logdir) #======================================================== # # First, we need a lot of data generated by a random # agent, with which we'll begin to train our dynamics # model. random_controller = RandomController(env) """ YOUR CODE HERE """ paths = sample(env, random_controller, num_paths=50) first = 1 for path in paths: if (first): data = { "observations": path['observations'], "next_observations": path['next_observations'], "rewards": path['rewards'], "actions": path['actions'], "returns": path['returns'] } first = 0 else: data['observations'] = np.vstack( (data['observations'], path['observations'])) data['next_observations'] = np.vstack( (data['next_observations'], path['next_observations'])) data['rewards'] = np.vstack((data['rewards'], path['rewards'])) data['actions'] = np.vstack((data['actions'], path['actions'])) data['returns'] = np.vstack((data['returns'], path['returns'])) #======================================================== # # The random data will be used to get statistics (mean # and std) for the observations, actions, and deltas # (where deltas are o_{t+1} - o_t). These will be used # for normalizing inputs and denormalizing outputs # from the dynamics network. # """ YOUR CODE HERE """ normalization = compute_normalization(data) #======================================================== # # Build dynamics model and MPC controllers. # sess = tf.Session() dyn_model = NNDynamicsModel(env=env, n_layers=n_layers, size=size, activation=activation, output_activation=output_activation, normalization=normalization, batch_size=batch_size, iterations=dynamics_iters, learning_rate=learning_rate, sess=sess) mpc_controller = MPCcontroller(env=env, dyn_model=dyn_model, horizon=mpc_horizon, cost_fn=cost_fn, num_simulated_paths=num_simulated_paths) #======================================================== # # Tensorflow session building. # sess.__enter__() tf.global_variables_initializer().run() #======================================================== # # Take multiple iterations of onpolicy aggregation at each iteration refitting the dynamics model to current dataset and then taking onpolicy samples and aggregating to the dataset. 
# Note: You don't need to use a mixing ratio in this assignment for new and old data as described in https://arxiv.org/abs/1708.02596 # #open CSV csv_file = open('results.csv', 'w') writer = csv.writer(csv_file, delimiter=',') for itr in range(onpol_iters): print(itr) costs = [] returns = [] """ YOUR CODE HERE """ dyn_model.fit(data) #plot_comparison(env,dyn_model) mpc_controller.dyn_model = dyn_model #need to update or not? new_paths = sample(env, mpc_controller) for path in new_paths: cost = path_cost(cost_fn, path) costs.append(cost) returns.append(path['returns'][-1]) data['observations'] = np.vstack( (data['observations'], path['observations'])) data['next_observations'] = np.vstack( (data['next_observations'], path['next_observations'])) data['actions'] = np.vstack((data['actions'], path['actions'])) dyn_model.normalization = compute_normalization(data) writer.writerow([itr, np.mean(returns)]) # LOGGING # Statistics for performance of MPC policy using # our learned dynamics model logz.log_tabular('Iteration', itr) # In terms of cost function which your MPC controller uses to plan logz.log_tabular('AverageCost', np.mean(costs)) logz.log_tabular('StdCost', np.std(costs)) logz.log_tabular('MinimumCost', np.min(costs)) logz.log_tabular('MaximumCost', np.max(costs)) # In terms of true environment reward of your rolled out trajectory using the MPC controller logz.log_tabular('AverageReturn', np.mean(returns)) logz.log_tabular('StdReturn', np.std(returns)) logz.log_tabular('MinimumReturn', np.min(returns)) logz.log_tabular('MaximumReturn', np.max(returns)) logz.dump_tabular()
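# sample appears throughout these snippets with slightly different signatures. A
# hedged version of the common pattern, rolling a controller for num_paths
# episodes of at most horizon steps and returning path dicts with the keys the
# training loops read (some snippets also store extra keys such as 'returns'):
import numpy as np

def sample_sketch(env, controller, num_paths=10, horizon=1000):
    # Collect rollouts as dicts of arrays.
    paths = []
    for _ in range(num_paths):
        obs, acts, next_obs, rewards = [], [], [], []
        ob = env.reset()
        for _ in range(horizon):
            ac = controller.get_action(ob)
            next_ob, rew, done, _ = env.step(ac)
            obs.append(ob)
            acts.append(ac)
            next_obs.append(next_ob)
            rewards.append(rew)
            ob = next_ob
            if done:
                break
        paths.append({'observations': np.array(obs),
                      'actions': np.array(acts),
                      'next_observations': np.array(next_obs),
                      'rewards': np.array(rewards)})
    return paths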
def train(state_cb, pub_cmd, pub_act, rate, cost_fn, logdir=None, render=False, learning_rate=1e-3, onpol_iters=10, dynamics_iters=60, batch_size=512, num_paths_random=10, num_paths_onpol=10, num_simulated_paths=10000, env_horizon=1000, mpc_horizon=15, n_layers=2, size=500, activation=tf.nn.relu, output_activation=None): """ Arguments: onpol_iters Number of iterations of onpolicy aggregation for the loop to run. dynamics_iters Number of iterations of training for the dynamics model |_ which happen per iteration of the aggregation loop. batch_size Batch size for dynamics training. num_paths_random Number of paths/trajectories/rollouts generated | by a random agent. We use these to train our |_ initial dynamics model. num_paths_onpol Number of paths to collect at each iteration of |_ aggregation, using the Model Predictive Control policy. num_simulated_paths How many fictitious rollouts the MPC policy | should generate each time it is asked for an |_ action. env_horizon Number of timesteps in each path. mpc_horizon The MPC policy generates actions by imagining | fictitious rollouts, and picking the first action | of the best fictitious rollout. This argument is | how many timesteps should be in each fictitious |_ rollout. n_layers/size/activations Neural network architecture arguments. """ logz.configure_output_dir(logdir) #======================================================== # # First, we need a lot of data generated by a random # agent, with which we'll begin to train our dynamics # model. rand_controller = RandomController() paths = sample(state_cb, pub_cmd, pub_act, rate, rand_controller, num_paths_random, env_horizon, render) data = paths_to_array(paths) #======================================================== # # The random data will be used to get statistics (mean # and std) for the observations, actions, and deltas # (where deltas are o_{t+1} - o_t). These will be used # for normalizing inputs and denormalizing outputs # from the dynamics network. # normalization = compute_normalization(data) #======================================================== # # Build dynamics model and MPC controllers. # sess = tf.Session() dyn_model = NNDynamicsModel(n_layers=n_layers, size=size, activation=activation, output_activation=output_activation, normalization=normalization, batch_size=batch_size, iterations=dynamics_iters, learning_rate=learning_rate, sess=sess) mpc_controller = MPCcontroller(dyn_model=dyn_model, horizon=mpc_horizon, cost_fn=cost_fn, num_simulated_paths=num_simulated_paths) #======================================================== # # Tensorflow session building. # sess.__enter__() tf.global_variables_initializer().run() #======================================================== # # Take multiple iterations of onpolicy aggregation at each iteration refitting the dynamics model to current dataset and then taking onpolicy samples and aggregating to the dataset. 
# Note: You don't need to use a mixing ratio in this assignment for new and old data as described in https://arxiv.org/abs/1708.02596 # for itr in range(onpol_iters): # Fit dynamics model print('Training dynamics model...') dyn_model.fit(data) plot_comparison(dyn_model, state_cb, pub_act, pub_cmd, rate) mpc_controller.dyn_model = dyn_model costs = [] returns = [] # Do MPC for i in range(num_paths_onpol): print('On policy path: %i' % i) obs_t, obs_tp1, acs_t, rews_t = [], [], [], [] s_t = state_cb.reset(pub_act, pub_cmd) total_return = 0 for j in range(env_horizon): # print('Timestep: %i, Return: %g' % (j,total_return)) a_t = mpc_controller.get_action(s_t) s_tp1, _ = state_cb.step(a_t, pub_act, pub_cmd) r_t = 0 for i in range(9): r_t += s_tp1[i * 12] - s_t[i * 12] total_return += r_t if render: env.render() time.sleep(0.05) obs_t.append(s_t) obs_tp1.append(s_tp1) acs_t.append(a_t) rews_t.append(r_t) s_t = s_tp1 path = { "observations": np.array(obs_t), "next_observations": np.array(obs_tp1), "actions": np.array(acs_t), "rewards": np.array(rews_t) } total_cost = path_cost(cost_fn, path) paths.append(path) returns.append(total_return) costs.append(total_cost) print('Total cost: %g, Total reward: %g' % (total_cost, total_return)) data = paths_to_array(paths) normalization = compute_normalization(data) # Set new normalization statistics for dynamics model dyn_model.normalization = normalization # LOGGING # Statistics for performance of MPC policy using # our learned dynamics model logz.log_tabular('Iteration', itr) # In terms of cost function which your MPC controller uses to plan logz.log_tabular('AverageCost', np.mean(costs)) logz.log_tabular('StdCost', np.std(costs)) logz.log_tabular('MinimumCost', np.min(costs)) logz.log_tabular('MaximumCost', np.max(costs)) # In terms of true environment reward of your rolled out trajectory using the MPC controller logz.log_tabular('AverageReturn', np.mean(returns)) logz.log_tabular('StdReturn', np.std(returns)) logz.log_tabular('MinimumReturn', np.min(returns)) logz.log_tabular('MaximumReturn', np.max(returns)) logz.dump_tabular()
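# The reward in the inner loop above sums the change in each body's x position,
# using the 12-values-per-body state layout from the plotting code earlier; note
# that the inner loop reuses the variable i from the enclosing path loop. A
# hedged helper with the same arithmetic and a non-shadowing name:
import numpy as np

def forward_progress_reward(s_t, s_tp1, num_bodies=9, state_per_body=12):
    # Reward = total change in x position across the tracked bodies, with each
    # body's state packed as 12 consecutive values (x position first).
    s_t = np.asarray(s_t)
    s_tp1 = np.asarray(s_tp1)
    idx = np.arange(num_bodies) * state_per_body
    return float(np.sum(s_tp1[idx] - s_t[idx]))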
def train_PG( exp_name='', env_name='', n_iter=100, gamma=1.0, min_timesteps_per_batch=1000, max_path_length=None, learning_rate=5e-3, reward_to_go=False, animate=True, logdir=None, normalize_advantages=False, nn_baseline=False, seed=0, # network arguments n_layers=1, size=32, # mb mpc arguments model_learning_rate=1e-3, onpol_iters=10, dynamics_iters=260, batch_size=512, num_paths_random=10, num_paths_onpol=10, num_simulated_paths=1000, env_horizon=1000, mpc_horizon=10, m_n_layers=2, m_size=500, ): start = time.time() # Configure output directory for logging logz.configure_output_dir(logdir) # Log experimental parameters args = inspect.getargspec(train_PG)[0] locals_ = locals() params = {k: locals_[k] if k in locals_ else None for k in args} logz.save_params(params) # Set random seeds tf.set_random_seed(seed) np.random.seed(seed) # Make the gym environment # env = gym.make(env_name) env = HalfCheetahEnvNew() cost_fn = cheetah_cost_fn activation=tf.nn.relu output_activation=None # Is this env continuous, or discrete? discrete = isinstance(env.action_space, gym.spaces.Discrete) # Maximum length for episodes # max_path_length = max_path_length or env.spec.max_episode_steps max_path_length = max_path_length # Observation and action sizes ob_dim = env.observation_space.shape[0] ac_dim = env.action_space.n if discrete else env.action_space.shape[0] # Print environment infomation print("-------- env info --------") print("Environment name: ", env_name) print("Action space is discrete: ", discrete) print("Action space dim: ", ac_dim) print("Observation space dim: ", ob_dim) print("Max_path_length ", max_path_length) #========================================================================================# # Random data collection #========================================================================================# random_controller = RandomController(env) data_buffer_model = DataBuffer() data_buffer_ppo = DataBuffer_general(10000, 4) # sample path print("collecting random data ..... 
") paths = sample(env, random_controller, num_paths=num_paths_random, horizon=env_horizon, render=False, verbose=False) # add into buffer for path in paths: for n in range(len(path['observations'])): data_buffer_model.add(path['observations'][n], path['actions'][n], path['next_observations'][n]) print("data buffer size: ", data_buffer_model.size) normalization = compute_normalization(data_buffer_model) #========================================================================================# # Tensorflow Engineering: Config, Session, Variable initialization #========================================================================================# tf_config = tf.ConfigProto() tf_config.allow_soft_placement = True tf_config.intra_op_parallelism_threads =4 tf_config.inter_op_parallelism_threads = 1 sess = tf.Session(config=tf_config) dyn_model = NNDynamicsModel(env=env, n_layers=n_layers, size=size, activation=activation, output_activation=output_activation, normalization=normalization, batch_size=batch_size, iterations=dynamics_iters, learning_rate=learning_rate, sess=sess) mpc_controller = MPCcontroller(env=env, dyn_model=dyn_model, horizon=mpc_horizon, cost_fn=cost_fn, num_simulated_paths=num_simulated_paths) policy_nn = policy_network_ppo(sess, ob_dim, ac_dim, discrete, n_layers, size, learning_rate) if nn_baseline: value_nn = value_network(sess, ob_dim, n_layers, size, learning_rate) sess.__enter__() # equivalent to `with sess:` tf.global_variables_initializer().run() #========================================================================================# # Training Loop #========================================================================================# total_timesteps = 0 for itr in range(n_iter): print("********** Iteration %i ************"%itr) if MPC: dyn_model.fit(data_buffer_model) returns = [] costs = [] # Collect paths until we have enough timesteps timesteps_this_batch = 0 paths = [] while True: # print("data buffer size: ", data_buffer_model.size) current_path = {'observations': [], 'actions': [], 'reward': [], 'next_observations':[]} ob = env.reset() obs, acs, mpc_acs, rewards = [], [], [], [] animate_this_episode=(len(paths)==0 and (itr % 10 == 0) and animate) steps = 0 return_ = 0 while True: # print("steps ", steps) if animate_this_episode: env.render() time.sleep(0.05) obs.append(ob) if MPC: mpc_ac = mpc_controller.get_action(ob) else: mpc_ac = random_controller.get_action(ob) ac = policy_nn.predict(ob, mpc_ac) ac = ac[0] if not PG: ac = mpc_ac acs.append(ac) mpc_acs.append(mpc_ac) current_path['observations'].append(ob) ob, rew, done, _ = env.step(ac) current_path['reward'].append(rew) current_path['actions'].append(ac) current_path['next_observations'].append(ob) return_ += rew rewards.append(rew) steps += 1 if done or steps > max_path_length: break if MPC: # cost & return cost = path_cost(cost_fn, current_path) costs.append(cost) returns.append(return_) print("total return: ", return_) print("costs: ", cost) # add into buffers for n in range(len(current_path['observations'])): data_buffer_model.add(current_path['observations'][n], current_path['actions'][n], current_path['next_observations'][n]) for n in range(len(current_path['observations'])): data_buffer_ppo.add(current_path['observations'][n], current_path['actions'][n], current_path['reward'][n], current_path['next_observations'][n]) path = {"observation" : np.array(obs), "reward" : np.array(rewards), "action" : np.array(acs), "mpc_action" : np.array(mpc_acs)} paths.append(path) timesteps_this_batch += 
pathlength(path) # print("timesteps_this_batch", timesteps_this_batch) if timesteps_this_batch > min_timesteps_per_batch: break total_timesteps += timesteps_this_batch print("data_buffer_ppo.size:", data_buffer_ppo.size) # Build arrays for observation, action for the policy gradient update by concatenating # across paths ob_no = np.concatenate([path["observation"] for path in paths]) ac_na = np.concatenate([path["action"] for path in paths]) mpc_ac_na = np.concatenate([path["mpc_action"] for path in paths]) # Computing Q-values if reward_to_go: q_n = [] for path in paths: for t in range(len(path["reward"])): t_ = 0 q = 0 while t_ < len(path["reward"]): if t_ >= t: q += gamma**(t_-t) * path["reward"][t_] t_ += 1 q_n.append(q) q_n = np.asarray(q_n) else: q_n = [] for path in paths: for t in range(len(path["reward"])): t_ = 0 q = 0 while t_ < len(path["reward"]): q += gamma**t_ * path["reward"][t_] t_ += 1 q_n.append(q) q_n = np.asarray(q_n) # Computing Baselines if nn_baseline: # b_n = sess.run(baseline_prediction, feed_dict={sy_ob_no :ob_no}) b_n = value_nn.predict(ob_no) b_n = normalize(b_n) b_n = denormalize(b_n, np.std(q_n), np.mean(q_n)) adv_n = q_n - b_n else: adv_n = q_n.copy() # Advantage Normalization if normalize_advantages: adv_n = normalize(adv_n) # Optimizing Neural Network Baseline if nn_baseline: b_n_target = normalize(q_n) value_nn.fit(ob_no, b_n_target) # sess.run(baseline_update_op, feed_dict={sy_ob_no :ob_no, sy_baseline_target_n:b_n_target}) # Performing the Policy Update # policy_nn.fit(ob_no, ac_na, adv_n) policy_nn.fit(ob_no, ac_na, adv_n, mpc_ac_na) # sess.run(update_op, feed_dict={sy_ob_no :ob_no, sy_ac_na:ac_na, sy_adv_n:adv_n}) # Log diagnostics returns = [path["reward"].sum() for path in paths] ep_lengths = [pathlength(path) for path in paths] logz.log_tabular("Time", time.time() - start) logz.log_tabular("Iteration", itr) logz.log_tabular("AverageReturn", np.mean(returns)) logz.log_tabular("StdReturn", np.std(returns)) logz.log_tabular("MaxReturn", np.max(returns)) logz.log_tabular("MinReturn", np.min(returns)) logz.log_tabular("EpLenMean", np.mean(ep_lengths)) logz.log_tabular("EpLenStd", np.std(ep_lengths)) logz.log_tabular("TimestepsThisBatch", timesteps_this_batch) logz.log_tabular("TimestepsSoFar", total_timesteps) logz.dump_tabular() logz.pickle_tf_vars()
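# The Q-value computation above uses nested while loops over each path. An
# equivalent, hedged reward-to-go helper with the same discounting, computed by a
# single reverse scan per path:
import numpy as np

def reward_to_go(rewards, gamma):
    # q[t] = sum_{t' >= t} gamma^(t' - t) * r[t'], accumulated back to front.
    q = np.zeros(len(rewards))
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        q[t] = running
    return q

# Usage for the reward-to-go case above:
# q_n = np.concatenate([reward_to_go(path["reward"], gamma) for path in paths])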
def train(env, cost_fn, logdir=None, render=False, learning_rate=1e-3, onpol_iters=10, dynamics_iters=60, batch_size=512, num_paths_random=10, num_paths_onpol=10, num_simulated_paths=10000, env_horizon=1000, mpc_horizon=15, n_layers=2, size=500, activation='relu', output_activation=None): """ Arguments: onpol_iters Number of iterations of onpolicy aggregation for the loop to run. dynamics_iters Number of iterations of training for the dynamics model |_ which happen per iteration of the aggregation loop. batch_size Batch size for dynamics training. num_paths_random Number of paths/trajectories/rollouts generated | by a random agent. We use these to train our |_ initial dynamics model. num_paths_onpol Number of paths to collect at each iteration of |_ aggregation, using the Model Predictive Control policy. num_simulated_paths How many fictitious rollouts the MPC policy | should generate each time it is asked for an |_ action. env_horizon Number of timesteps in each path. mpc_horizon The MPC policy generates actions by imagining | fictitious rollouts, and picking the first action | of the best fictitious rollout. This argument is | how many timesteps should be in each fictitious |_ rollout. n_layers/size/activations Neural network architecture arguments. """ logz.configure_output_dir(logdir) # ======================================================== # # First, we need a lot of data generated by a random # agent, with which we'll begin to train our dynamics # model. """ YOUR CODE HERE """ random_controller = RandomController(env) paths = sample(env, random_controller, num_paths=num_paths_random, horizon=env_horizon, ignore_done=True) # 10 # ======================================================== # # The random data will be used to get statistics (mean # and std) for the observations, actions, and deltas # (where deltas are o_{t+1} - o_t). These will be used # for normalizing inputs and denormalizing outputs # from the dynamics network. # """ YOUR CODE HERE """ # concatenate observations & actions to numpy data_rand_x # concatenate (next_observations -observations) to numpy data_rand_y for i in range(num_paths_random): if i == 0: data_rand_x = np.concatenate( (paths[i]['observations'], paths[i]['actions']), axis=1) data_rand_y = paths[i]['next_observations'] - paths[i][ 'observations'] else: x = np.concatenate((paths[i]['observations'], paths[i]['actions']), axis=1) data_rand_x = np.concatenate((data_rand_x, x), axis=0) y = paths[i]['next_observations'] - paths[i]['observations'] data_rand_y = np.concatenate((data_rand_y, y), axis=0) # Initialize data set D to Drand data_x = data_rand_x data_y = data_rand_y # ======================================================== # # Build dynamics model and MPC controllers. # # sess = tf.Session() # dyn_model = NNDynamicsModel(env=env, # n_layers=n_layers, # size=size, # activation=activation, # output_activation=output_activation, # batch_size=batch_size, # iterations=dynamics_iters, # learning_rate=learning_rate, # normalization=normalization # ) dyn_model = NNDynamicsModel( env=env, hidden_size=(500, 500), activation=activation, #'tanh' ).cuda() mpc_controller = MPCcontroller( env=env, dyn_model=dyn_model, horizon=mpc_horizon, cost_fn=cost_fn, num_simulated_paths=num_simulated_paths, ) # ======================================================== # # Tensorflow session building. 
# # sess.__enter__() # tf.global_variables_initializer().run() # ======================================================== # # Take multiple iterations of onpolicy aggregation at each iteration refitting the dynamics model to current dataset and then taking onpolicy samples and aggregating to the dataset. # Note: You don't need to use a mixing ratio in this assignment for new and old data as described in https://arxiv.org/abs/1708.02596 # # make dirs output if not (os.path.exists(logdir)): os.makedirs(logdir) path = os.path.join(logdir, 'model') if not (os.path.exists(path)): os.makedirs(path) for itr in range(onpol_iters): """ YOUR CODE HERE """ if itr != 0: dyn_model.load_state_dict(torch.load(path + '/net_params.pkl')) # store data # if (itr % 9) == 0 or itr == (onpol_iters-1): if itr >= 0: logger = Logger(logdir, csvname='log_orig' + str(itr)) data = np.concatenate((data_x, data_y), axis=1) logger.log_table2csv(data) if itr == 0: data_x += np.random.normal(0, 0.001, size=data_x.shape) data_y += np.random.normal(0, 0.001, size=data_y.shape) else: data_x = best_x + np.random.normal(0, 0.001, size=best_x.shape) data_y = best_y + np.random.normal(0, 0.001, size=best_y.shape) dyn_model.fit(data_x, data_y, epoch_size=dynamics_iters, batch_size=batch_size, test=True) torch.save(dyn_model.state_dict(), path + '/net_params.pkl') # save only the parameters torch.save(dyn_model, path + '/net' + str(itr) + '.pkl') # save entire net print('-------------Itr %d-------------' % itr) print('Start time:\n') print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))) start = time.time() # caculate run time --start time point # sample if Monitor is True: monitor_path = os.path.join(logdir, 'monitor' + str(itr)) env = wrappers.Monitor(env, monitor_path, force=True) paths = sample(env, mpc_controller, num_paths=num_paths_onpol, horizon=env_horizon, render=False, ignore_done=False, MPC=True) end = time.time() runtime2 = end - start print('runtime = ', runtime2) print('End time:\n') print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))) # concatenate observations & actions to numpy data_rand_x # concatenate (next_observations -observations) to numpy data_rand_y for i in range(num_paths_onpol): if i == 0: data_rl_x = np.concatenate( (paths[i]['observations'], paths[i]['actions']), axis=1) data_rl_y = paths[i]['next_observations'] - paths[i][ 'observations'] else: x = np.concatenate( (paths[i]['observations'], paths[i]['actions']), axis=1) data_rl_x = np.concatenate((data_rl_x, x), axis=0) y = paths[i]['next_observations'] - paths[i]['observations'] data_rl_y = np.concatenate((data_rl_y, y), axis=0) # Aggregate data data_x = np.concatenate((data_x, data_rl_x), axis=0) data_y = np.concatenate((data_y, data_rl_y), axis=0) costs = np.zeros((num_paths_onpol, 1)) returns = np.zeros((num_paths_onpol, 1)) for i in range(num_paths_onpol): costs[i] = paths[i]['cost'] returns[i] = paths[i]['returns'][0] if itr == 0: best_x = data_rl_x best_y = data_rl_y else: best_x = np.concatenate((best_x, data_rl_x), axis=0) best_y = np.concatenate((best_y, data_rl_y), axis=0) # store data #if (itr % 9) == 0 or itr == (onpol_iters-1): if itr >= 0: logger = Logger(logdir, csvname='best' + str(itr)) data = np.concatenate((best_x, best_y), axis=1) logger.log_table2csv(data) # LOGGING # Statistics for performance of MPC policy using # our learned dynamics model logz.log_tabular('Iteration', itr) # In terms of cost function which your MPC controller uses to plan logz.log_tabular('AverageCost', np.mean(costs)) 
        logz.log_tabular('StdCost', np.std(costs))
        logz.log_tabular('MinimumCost', np.min(costs))
        logz.log_tabular('MaximumCost', np.max(costs))
        # In terms of true environment reward of your rolled out trajectory using the MPC controller
        logz.log_tabular('AverageReturn', np.mean(returns))
        logz.log_tabular('StdReturn', np.std(returns))
        logz.log_tabular('MinimumReturn', np.min(returns))
        logz.log_tabular('MaximumReturn', np.max(returns))
        logz.dump_tabular()
def train(env, cost_fn, load_model, model_path, logdir=None, render=False, learning_rate_dyn=1e-3, learning_rate_policy=1e-4, onpol_iters=10, dynamics_iters=60, policy_iters=100, batch_size=512, num_paths_random=10, num_paths_onpol=5, num_simulated_paths=10000, env_horizon=1000, mpc_horizon=15, n_layers=2, size=500, activation=tf.nn.relu, output_activation=None, ): """ Arguments: onpol_iters Number of iterations of onpolicy aggregation for the loop to run. dynamics_iters Number of iterations of training for the dynamics model |_ which happen per iteration of the aggregation loop. batch_size Batch size for dynamics training. num_paths_random Number of paths/trajectories/rollouts generated | by a random agent. We use these to train our |_ initial dynamics model. num_paths_onpol Number of paths to collect at each iteration of |_ aggregation, using the Model Predictive Control policy. num_simulated_paths How many fictitious rollouts the MPC policy | should generate each time it is asked for an |_ action. env_horizon Number of timesteps in each path. mpc_horizon The MPC policy generates actions by imagining | fictitious rollouts, and picking the first action | of the best fictitious rollout. This argument is | how many timesteps should be in each fictitious |_ rollout. n_layers/size/activations Neural network architecture arguments. """ #logz.configure_output_dir(logdir) #======================================================== # # First, we need a lot of data generated by a random # agent, with which we'll begin to train our dynamics # model. random_controller = RandomController(env) """ YOUR CODE HERE """ data = sample(env, random_controller, num_paths_random, env_horizon) #======================================================== # # The random data will be used to get statistics (mean # and std) for the observations, actions, and deltas # (where deltas are o_{t+1} - o_t). These will be used # for normalizing inputs and denormalizing outputs # from the dynamics network. # """ YOUR CODE HERE """ normalization = compute_normalization(data) #======================================================== # # Build dynamics model and MPC controllers. # sess = tf.Session() dyn_model = NNDynamicsModel(env=env, n_layers=n_layers, size=size, activation=activation, output_activation=output_activation, normalization=normalization, batch_size=batch_size, iterations=dynamics_iters, learning_rate=learning_rate_dyn, sess=sess) policy = NNPolicy(env=env, normalization=normalization, batch_size=batch_size, iterations=policy_iters, learning_rate=learning_rate_policy, sess=sess, model_path=model_path, save_path="./policy/", load_model=load_model) mpc_controller = MPCcontroller(env=env, dyn_model=dyn_model, horizon=mpc_horizon, cost_fn=cost_fn, num_simulated_paths=num_simulated_paths) lqr_controller = LQRcontroller(env=env, delta=0.005, T=50, dyn_model=dyn_model, cost_fn=cost_fn, iterations=1) comm = MPI.COMM_WORLD size = comm.Get_size() rank = comm.Get_rank() #======================================================== # # Tensorflow session building. # sess.__enter__() tf.global_variables_initializer().run() #======================================================== # # Take multiple iterations of onpolicy aggregation at each iteration refitting the dynamics model to current dataset and then taking onpolicy samples and aggregating to the dataset. 
# Note: You don't need to use a mixing ratio in this assignment for new and old data as described in https://arxiv.org/abs/1708.02596 # # training the MPC controller as well as dynamics for itr in range(onpol_iters): print("fitting dynamics for worker ", rank) dyn_model.fit(data) print("sampling new trajectories from worker ", rank) new_data = sample(env, lqr_controller, num_paths_onpol, env_horizon) data += new_data comm.send(new_data, 0) if rank == 0: costs, returns = [], [] for path in data: costs.append(path_cost(cost_fn, path)) returns.append(np.sum(path['rewards'])) print("returns ",returns) for i in range(1, size): data += comm.recv(source=i) print("fitting policy...") policy.fit(data) # LOGGING # Statistics for performance of MPC policy using # our learned dynamics model logz.log_tabular('Iteration', itr) # In terms of cost function which your MPC controller uses to plan logz.log_tabular('AverageCost', np.mean(costs)) logz.log_tabular('StdCost', np.std(costs)) logz.log_tabular('MinimumCost', np.min(costs)) logz.log_tabular('MaximumCost', np.max(costs)) # In terms of true environment reward of your rolled out trajectory using the MPC controller logz.log_tabular('AverageReturn', np.mean(returns)) logz.log_tabular('StdReturn', np.std(returns)) logz.log_tabular('MinimumReturn', np.min(returns)) logz.log_tabular('MaximumReturn', np.max(returns)) logz.dump_tabular() # applying the learned neural policy if rank == 0: ob = env.reset() while True: a = policy.get_action(ob.reshape((1, ob.shape[0]))) # control clipping to be added next_ob, reward, done, info = env.step(a[0]) print("action", a) print("predicted ob", dyn_model.predict(ob, a)) print("actual ob", (next_ob - normalization[0]) / (normalization[1] + 1e-10)) env.render() ob = next_ob if done: ob = env.reset()
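# The loop above shares on-policy data with rank 0 through explicit
# comm.send/comm.recv. A hedged alternative using mpi4py's collective allgather,
# which gives every worker the merged dataset in one call (share_paths is an
# illustrative name):
from mpi4py import MPI

def share_paths(local_paths, comm=MPI.COMM_WORLD):
    # Every rank contributes its list of paths; allgather returns one list per
    # rank on every worker, which we flatten into a single dataset.
    gathered = comm.allgather(local_paths)
    merged = []
    for worker_paths in gathered:
        merged += worker_paths
    return merged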
def get_dynamics_test_loss(env, dyn_model):
    return dyn_model.get_loss_on_data(sample(env, RandomController(env)))
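# A hedged example of how get_dynamics_test_loss might be used to track held-out
# dynamics error across aggregation iterations, assuming dyn_model.fit and sample
# behave as in the snippets above:
def track_test_loss(env, dyn_model, data, onpol_iters):
    # Record the model's loss on freshly sampled random-policy data after each
    # fit, as a rough generalization check.
    test_losses = []
    for itr in range(onpol_iters):
        dyn_model.fit(data)
        test_losses.append(get_dynamics_test_loss(env, dyn_model))
        print('iter %d, held-out dynamics loss: %g' % (itr, test_losses[-1]))
    return test_losses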