def train(env,
          cost_fn,
          logdir=None,
          render=False,
          learning_rate=1e-3,
          onpol_iters=10,
          dynamics_iters=20,
          batch_size=64,
          num_paths_random=10,
          num_paths_onpol=10,
          num_simulated_paths=10000,
          env_horizon=500,
          mpc_horizon=15,
          n_layers=2,
          size=64,
          activation=tf.nn.relu,
          output_activation=None,
          controller_service=None,
          ):
    """Alternate between fitting a dynamics model and collecting MPC rollouts.

    Arguments:
        env: Environment handed to the controllers, the dynamics model and
            ``sample``.
        cost_fn: Cost function given to the MPC controller and to
            ``path_cost`` when scoring rollouts.
        logdir: Directory passed to ``logz.configure_output_dir``.
        render: Accepted but not consulted; every ``sample`` call in this
            function passes ``render=False``.
        learning_rate: Forwarded to ``NNDynamicsModel``.
        onpol_iters: Number of iterations of on-policy aggregation for the
            loop to run.
        dynamics_iters: Number of iterations of training for the dynamics
            model which happen per iteration of the aggregation loop.
        batch_size: Batch size for dynamics training.
        num_paths_random: Number of paths/trajectories/rollouts generated by
            a random agent. They are used to train the initial dynamics
            model.
        num_paths_onpol: Number of paths to collect at each iteration of
            aggregation, using the Model Predictive Control policy.
        num_simulated_paths: How many fictitious rollouts the MPC policy
            should generate each time it is asked for an action.
        env_horizon: Number of timesteps in each path.
        mpc_horizon: The MPC policy generates actions by imagining
            fictitious rollouts and picking the first action of the best
            one. This is how many timesteps each fictitious rollout has.
        n_layers, size, activation, output_activation: Neural network
            architecture arguments, forwarded to ``NNDynamicsModel``.
        controller_service: Captured by the callback given to
            ``RefMPCController``.
    """
    logz.configure_output_dir(logdir)

    # ------------------------------------------------------------------
    # Seed the dataset with rollouts from a random agent; the first
    # dynamics fit is trained on these.
    # ------------------------------------------------------------------
    random_controller = RandomController(env)
    # NOTE(review): ref_controller is constructed but never used below.
    # It is kept in case the constructor has side effects -- confirm.
    ref_controller = RefMPCController(
        env,
        lambda state: call_mpc(env, controller_service),
    )
    paths = sample(
        env,
        random_controller,
        num_paths=num_paths_random,
        horizon=env_horizon,
        render=False,
        verbose=False,
    )

    # ------------------------------------------------------------------
    # Statistics of the random data (mean and std of observations,
    # actions and deltas o_{t+1} - o_t) normalize the network inputs and
    # denormalize its outputs.
    # ------------------------------------------------------------------
    normalization = compute_normalization(paths)
    print(normalization)

    # ------------------------------------------------------------------
    # Dynamics model and MPC controller.
    # ------------------------------------------------------------------
    sess = tf.Session()
    dyn_model = NNDynamicsModel(
        env=env,
        n_layers=n_layers,
        size=size,
        activation=activation,
        output_activation=output_activation,
        normalization=normalization,
        batch_size=batch_size,
        iterations=dynamics_iters,
        learning_rate=learning_rate,
        sess=sess,
    )
    mpc_controller = MPCcontroller(
        env=env,
        dyn_model=dyn_model,
        horizon=mpc_horizon,
        cost_fn=cost_fn,
        num_simulated_paths=num_simulated_paths,
    )

    # Install the session as the default one; it is never exited here.
    sess.__enter__()
    tf.global_variables_initializer().run()

    # ------------------------------------------------------------------
    # On-policy aggregation: refit on everything collected so far, roll
    # out the MPC policy, then append the new rollouts to the dataset.
    # No mixing ratio between old and new data is applied
    # (cf. https://arxiv.org/abs/1708.02596).
    # ------------------------------------------------------------------
    for itr in range(onpol_iters):
        dyn_model.fit(paths)
        new_paths = sample(
            env,
            mpc_controller,
            num_paths=num_paths_onpol,
            horizon=env_horizon,
            render=False,
            verbose=False,
        )

        costs = np.array([path_cost(cost_fn, rollout) for rollout in new_paths])
        returns = np.array([rollout['return'] for rollout in new_paths])

        # Aggregation
        paths = paths + new_paths

        # LOGGING
        # Cost rows use the planner's cost function; return rows use the
        # true environment reward of the rolled-out trajectories.
        logz.log_tabular('Iteration', itr)
        summary_rows = (
            ('AverageCost', np.mean, costs),
            ('StdCost', np.std, costs),
            ('MinimumCost', np.min, costs),
            ('MaximumCost', np.max, costs),
            ('AverageReturn', np.mean, returns),
            ('StdReturn', np.std, returns),
            ('MinimumReturn', np.min, returns),
            ('MaximumReturn', np.max, returns),
        )
        for key, reducer, values in summary_rows:
            logz.log_tabular(key, reducer(values))

        logz.dump_tabular()
# Tensorflow session building. # sess.__enter__() tf.global_variables_initializer().run() #======================================================== # # Take multiple iterations of onpolicy aggregation at each iteration refitting the dynamics model to current dataset and then taking onpolicy samples and aggregating to the dataset. # Note: You don't need to use a mixing ratio in this assignment for new and old data as described in https://arxiv.org/abs/1708.02596 # for itr in range(onpol_iters): """ YOUR CODE HERE """ print("fitting dynamics...") dyn_model.fit(data) print("sampling new trajectories...") new_data = sample(env, mpc_controller, num_paths_onpol, env_horizon) costs, returns = [], [] for path in data: costs.append(path_cost(cost_fn, path)) returns.append(path['rewards']) print("returns ",returns) data += new_data # LOGGING # Statistics for performance of MPC policy using
def train(state_cb,
          pub_cmd,
          pub_act,
          rate,
          cost_fn,
          logdir=None,
          render=False,
          learning_rate=1e-3,
          onpol_iters=10,
          dynamics_iters=60,
          batch_size=512,
          num_paths_random=10,
          num_paths_onpol=10,
          num_simulated_paths=10000,
          env_horizon=1000,
          mpc_horizon=15,
          n_layers=2,
          size=500,
          activation=tf.nn.relu,
          output_activation=None):
    """Train a dynamics model and run MPC through a state-callback interface.

    Arguments:
        state_cb: Object driven via ``reset(pub_act, pub_cmd)`` and
            ``step(action, pub_act, pub_cmd)``; also forwarded to ``sample``
            and ``plot_comparison``.
        pub_cmd, pub_act, rate: Forwarded unchanged to ``sample``,
            ``plot_comparison`` and the ``state_cb`` calls.
        cost_fn: Cost function given to the MPC controller and to
            ``path_cost``.
        logdir: Directory passed to ``logz.configure_output_dir``.
        render: Forwarded to the random-rollout ``sample`` call and checked
            on every on-policy step.
        learning_rate: Forwarded to ``NNDynamicsModel``.
        onpol_iters: Number of iterations of on-policy aggregation for the
            loop to run.
        dynamics_iters: Number of iterations of training for the dynamics
            model which happen per iteration of the aggregation loop.
        batch_size: Batch size for dynamics training.
        num_paths_random: Number of paths/trajectories/rollouts generated by
            a random agent. They are used to train the initial dynamics
            model.
        num_paths_onpol: Number of paths to collect at each iteration of
            aggregation, using the Model Predictive Control policy.
        num_simulated_paths: How many fictitious rollouts the MPC policy
            should generate each time it is asked for an action.
        env_horizon: Number of timesteps in each path.
        mpc_horizon: The MPC policy generates actions by imagining
            fictitious rollouts and picking the first action of the best
            one. This is how many timesteps each fictitious rollout has.
        n_layers, size, activation, output_activation: Neural network
            architecture arguments, forwarded to ``NNDynamicsModel``.
    """
    logz.configure_output_dir(logdir)

    # ------------------------------------------------------------------
    # Random-agent rollouts provide the first training set.
    # ------------------------------------------------------------------
    rand_controller = RandomController()
    paths = sample(state_cb, pub_cmd, pub_act, rate, rand_controller,
                   num_paths_random, env_horizon, render)
    data = paths_to_array(paths)

    # Mean/std statistics used to normalize network inputs and denormalize
    # its outputs.
    normalization = compute_normalization(data)

    # ------------------------------------------------------------------
    # Dynamics model and MPC controller.
    # ------------------------------------------------------------------
    sess = tf.Session()
    dyn_model = NNDynamicsModel(
        n_layers=n_layers,
        size=size,
        activation=activation,
        output_activation=output_activation,
        normalization=normalization,
        batch_size=batch_size,
        iterations=dynamics_iters,
        learning_rate=learning_rate,
        sess=sess,
    )
    mpc_controller = MPCcontroller(
        dyn_model=dyn_model,
        horizon=mpc_horizon,
        cost_fn=cost_fn,
        num_simulated_paths=num_simulated_paths,
    )

    # Install the session as the default one; it is never exited here.
    sess.__enter__()
    tf.global_variables_initializer().run()

    # ------------------------------------------------------------------
    # On-policy aggregation loop. No mixing ratio between old and new data
    # is applied (cf. https://arxiv.org/abs/1708.02596).
    # ------------------------------------------------------------------
    for itr in range(onpol_iters):
        # Fit dynamics model
        print('Training dynamics model...')
        dyn_model.fit(data)
        plot_comparison(dyn_model, state_cb, pub_act, pub_cmd, rate)
        mpc_controller.dyn_model = dyn_model

        costs, returns = [], []

        # Do MPC
        for path_idx in range(num_paths_onpol):
            print('On policy path: %i' % path_idx)
            obs_t, obs_tp1, acs_t, rews_t = [], [], [], []
            s_t = state_cb.reset(pub_act, pub_cmd)
            total_return = 0

            for _step in range(env_horizon):
                a_t = mpc_controller.get_action(s_t)
                s_tp1, _ = state_cb.step(a_t, pub_act, pub_cmd)

                # Step reward: summed change of observation entries
                # 0, 12, ..., 96 between consecutive states.
                r_t = sum(s_tp1[k * 12] - s_t[k * 12] for k in range(9))
                total_return += r_t

                if render:
                    # NOTE(review): `env` is neither a parameter nor a local
                    # of this function; unless a module-level `env` exists
                    # this raises NameError when render=True -- confirm.
                    env.render()
                    time.sleep(0.05)

                obs_t.append(s_t)
                obs_tp1.append(s_tp1)
                acs_t.append(a_t)
                rews_t.append(r_t)
                s_t = s_tp1

            path = {
                "observations": np.array(obs_t),
                "next_observations": np.array(obs_tp1),
                "actions": np.array(acs_t),
                "rewards": np.array(rews_t),
            }
            total_cost = path_cost(cost_fn, path)

            paths.append(path)
            returns.append(total_return)
            costs.append(total_cost)
            print('Total cost: %g, Total reward: %g' % (total_cost, total_return))

        # Rebuild the flat dataset from all paths and refresh the
        # normalization statistics stored on the dynamics model.
        data = paths_to_array(paths)
        normalization = compute_normalization(data)
        dyn_model.normalization = normalization

        # LOGGING
        # Cost rows use the planner's cost function; return rows use the
        # reward accumulated along the rolled-out trajectories.
        logz.log_tabular('Iteration', itr)
        summary_rows = (
            ('AverageCost', np.mean, costs),
            ('StdCost', np.std, costs),
            ('MinimumCost', np.min, costs),
            ('MaximumCost', np.max, costs),
            ('AverageReturn', np.mean, returns),
            ('StdReturn', np.std, returns),
            ('MinimumReturn', np.min, returns),
            ('MaximumReturn', np.max, returns),
        )
        for key, reducer, values in summary_rows:
            logz.log_tabular(key, reducer(values))

        logz.dump_tabular()
def train(env,
          cost_fn,
          logdir=None,
          render=False,
          learning_rate=1e-3,
          onpol_iters=10,
          dynamics_iters=60,
          batch_size=512,
          num_paths_random=10,
          num_paths_onpol=10,
          num_simulated_paths=10000,
          env_horizon=1000,
          mpc_horizon=15,
          n_layers=2,
          size=500,
          activation=tf.nn.relu,
          output_activation=None):
    """Run model-based RL with MPC and record per-iteration returns to disk.

    Arguments:
        env: Environment handed to the controllers, the dynamics model and
            ``sample``.
        cost_fn: Cost function given to the MPC controller and to
            ``path_cost``.
        logdir: Directory passed to ``logz.configure_output_dir``.
        render: Forwarded to ``sample`` for the on-policy rollouts only; the
            random rollouts are always sampled with ``render=False``.
        learning_rate: Forwarded to ``NNDynamicsModel``.
        onpol_iters: Number of iterations of on-policy aggregation for the
            loop to run.
        dynamics_iters: Number of iterations of training for the dynamics
            model which happen per iteration of the aggregation loop.
        batch_size: Batch size for dynamics training.
        num_paths_random: Number of paths/trajectories/rollouts generated by
            a random agent. They are used to train the initial dynamics
            model.
        num_paths_onpol: Number of paths to collect at each iteration of
            aggregation, using the Model Predictive Control policy.
        num_simulated_paths: How many fictitious rollouts the MPC policy
            should generate each time it is asked for an action.
        env_horizon: Number of timesteps in each path.
        mpc_horizon: The MPC policy generates actions by imagining
            fictitious rollouts and picking the first action of the best
            one. This is how many timesteps each fictitious rollout has.
        n_layers, size, activation, output_activation: Neural network
            architecture arguments, forwarded to ``NNDynamicsModel``.

    Side effects:
        Besides the ``logz`` output, the returns of every iteration so far
        are dumped to ``returns.csv`` in the working directory after each
        iteration.
    """
    logz.configure_output_dir(logdir)

    # Log experimental parameters.
    # inspect.getargspec was removed in Python 3.11; prefer getfullargspec
    # and only fall back on interpreters that do not provide it.
    arg_spec = getattr(inspect, "getfullargspec", None) or inspect.getargspec
    args = arg_spec(train)[0]
    locals_ = locals()
    params = {k: locals_.get(k) for k in args}
    print(params)
    # The three entries below override the callables/objects passed in,
    # which aren't serializable.
    params["activation"] = "relu"
    params["cost_fn"] = "cheetah_cost_fn"
    params["env"] = "HalfCheetahEnvNew"
    logz.save_params(params)

    returns_file = "returns.csv"
    returns_array = []

    #========================================================
    #
    # First, we need a lot of data generated by a random
    # agent, with which we'll begin to train our dynamics
    # model.
    random_controller = RandomController(env)
    data = sample(env,
                  random_controller,
                  num_paths=num_paths_random,
                  horizon=env_horizon,
                  render=False,
                  verbose=False)

    #========================================================
    #
    # The random data will be used to get statistics (mean
    # and std) for the observations, actions, and deltas
    # (where deltas are o_{t+1} - o_t). These will be used
    # for normalizing inputs and denormalizing outputs
    # from the dynamics network.
    #
    normalization = compute_normalization(data)

    #========================================================
    #
    # Build dynamics model and MPC controllers.
    #
    sess = tf.Session()
    dyn_model = NNDynamicsModel(env=env,
                                n_layers=n_layers,
                                size=size,
                                activation=activation,
                                output_activation=output_activation,
                                normalization=normalization,
                                batch_size=batch_size,
                                iterations=dynamics_iters,
                                learning_rate=learning_rate,
                                sess=sess)
    mpc_controller = MPCcontroller(env=env,
                                   dyn_model=dyn_model,
                                   horizon=mpc_horizon,
                                   cost_fn=cost_fn,
                                   num_simulated_paths=num_simulated_paths)

    #========================================================
    #
    # Tensorflow session building.
    #
    sess.__enter__()
    tf.global_variables_initializer().run()

    #========================================================
    #
    # Take multiple iterations of onpolicy aggregation, at each iteration
    # refitting the dynamics model to the current dataset and then taking
    # onpolicy samples and aggregating them into the dataset.
    # Note: no mixing ratio for new and old data is used here, unlike
    # https://arxiv.org/abs/1708.02596
    #
    for itr in range(onpol_iters):
        print(itr)

        # Fit the dynamics model on everything collected so far and print
        # whatever fit() returns.
        l = dyn_model.fit(data)
        print(l)

        # Sample a set of on-policy trajectories from the environment.
        new_data = sample(env,
                          mpc_controller,
                          num_paths=num_paths_onpol,
                          horizon=env_horizon,
                          render=render,
                          verbose=False)

        # Append the new trajectories to the dataset.
        data += new_data

        # Score only the freshly collected trajectories.
        costs = np.array([path_cost(cost_fn, path) for path in new_data])
        print(costs)

        returns = np.array([path["returns"] for path in new_data])
        print(returns)

        # ndarray.dump writes a pickle, so despite its name the file is not
        # text CSV; it is rewritten in full on every iteration.
        returns_array.append(returns)
        np.array(returns_array).dump(returns_file)

        # LOGGING
        # Statistics for performance of MPC policy using
        # our learned dynamics model
        logz.log_tabular('Iteration', itr)
        # In terms of cost function which your MPC controller uses to plan
        logz.log_tabular('AverageCost', np.mean(costs))
        logz.log_tabular('StdCost', np.std(costs))
        logz.log_tabular('MinimumCost', np.min(costs))
        logz.log_tabular('MaximumCost', np.max(costs))
        # In terms of true environment reward of your rolled out trajectory
        # using the MPC controller
        logz.log_tabular('AverageReturn', np.mean(returns))
        logz.log_tabular('StdReturn', np.std(returns))
        logz.log_tabular('MinimumReturn', np.min(returns))
        logz.log_tabular('MaximumReturn', np.max(returns))

        logz.dump_tabular()
def train(env,
          cost_fn,
          exp_name='test',
          logdir=None,
          render=False,
          learning_rate=1e-3,
          onpol_iters=10,
          dynamics_iters=60,
          batch_size=512,
          num_paths_random=10,
          num_paths_onpol=10,
          num_simulated_paths=10000,
          env_horizon=1000,
          mpc_horizon=15,
          n_layers=2,
          size=500,
          activation=tf.nn.relu,
          output_activation=None):
    """Run model-based RL with MPC on a dataset held as a dict of arrays.

    Args:
        env: Environment handed to the controllers, the dynamics model and
            ``sample``.
        cost_fn: Cost function given to the MPC controller and to
            ``path_cost``.
        exp_name: Only recorded in the saved parameters.
        logdir: Directory passed to ``logz.configure_output_dir``.
        render: Only recorded in the saved parameters; it is not forwarded
            to ``sample``.
        learning_rate: Forwarded to ``NNDynamicsModel``.
        onpol_iters: Number of iterations of on-policy aggregation for the
            loop to run.
        dynamics_iters: Number of iterations of training for the dynamics
            model which happen per iteration of the aggregation loop.
        batch_size: Batch size for dynamics training.
        num_paths_random: Number of paths/trajectories/rollouts generated by
            a random agent. They are used to train the initial dynamics
            model.
        num_paths_onpol: Number of paths to collect at each iteration of
            aggregation, using the MPC policy.
        num_simulated_paths: How many fictitious rollouts the MPC policy
            should generate each time it is asked for an action.
        env_horizon: Number of timesteps in each path.
        mpc_horizon: The MPC policy generates actions by imagining
            fictitious rollouts and picking the first action of the best
            one. This is how many timesteps each fictitious rollout has.
        n_layers, size, activation, output_activation: Neural network
            architecture arguments, forwarded to ``NNDynamicsModel``.
    """
    # Configure output directory for logging
    logz.configure_output_dir(logdir)

    # Log experimental parameters.
    # inspect.getargspec was removed in Python 3.11; prefer getfullargspec
    # and only fall back on interpreters that do not provide it.
    arg_spec = getattr(inspect, 'getfullargspec', None) or inspect.getargspec
    args = arg_spec(train)[0]
    # Work on a copy: the placeholders below must not be written into the
    # mapping returned by locals().
    logged = dict(locals())
    logged['cost_fn'] = 'cost_fn'
    logged['activation'] = 'activation'
    logged['env'] = 'env'
    params = {k: logged.get(k) for k in args}
    logz.save_params(params)

    #========================================================
    #
    # First, we need a lot of data generated by a random
    # agent, with which we'll begin to train our dynamics
    # model.
    random_controller = RandomController(env)
    paths = sample(env=env,
                   controller=random_controller,
                   num_paths=num_paths_random,
                   horizon=env_horizon,
                   verbose=False)

    #========================================================
    #
    # The random data will be used to get statistics (mean
    # and std) for the observations, actions, and deltas
    # (where deltas are o_{t+1} - o_t). These will be used
    # for normalizing inputs and denormalizing outputs
    # from the dynamics network.
    #
    normalization = {
        "observations": compute_normalization(paths["observations"]),
        "actions": compute_normalization(paths["actions"]),
        "deltas": compute_normalization(
            paths["next_observations"] - paths["observations"]),
    }

    #========================================================
    #
    # Build dynamics model and MPC controllers.
    #
    sess = tf.Session()
    dyn_model = NNDynamicsModel(env=env,
                                n_layers=n_layers,
                                size=size,
                                activation=activation,
                                output_activation=output_activation,
                                normalization=normalization,
                                batch_size=batch_size,
                                iterations=dynamics_iters,
                                learning_rate=learning_rate,
                                sess=sess)
    mpc_controller = MPCcontroller(env=env,
                                   dyn_model=dyn_model,
                                   horizon=mpc_horizon,
                                   cost_fn=cost_fn,
                                   num_simulated_paths=num_simulated_paths)

    #========================================================
    #
    # Tensorflow session building.
    #
    sess.__enter__()
    tf.global_variables_initializer().run()

    #========================================================
    #
    # Take multiple iterations of onpolicy aggregation at each iteration
    # refitting the dynamics model to current dataset and then taking onpolicy
    # samples and aggregating to the dataset.
    # TODO: implement mixing ratio for new and old data as described in
    # https://arxiv.org/abs/1708.02596
    #
    transition_keys = ('observations', 'actions', 'next_observations',
                       'rewards')
    for itr in range(onpol_iters):
        # Apply one shared permutation to every field so that transitions
        # stay aligned across arrays.
        shuffle_indexes = np.random.permutation(paths["observations"].shape[0])
        for key in transition_keys:
            paths[key] = paths[key][shuffle_indexes]

        dyn_model.fit(paths)

        newpaths = sample(env=env,
                          controller=mpc_controller,
                          num_paths=num_paths_onpol,
                          horizon=env_horizon,
                          verbose=False)

        # LOGGING
        # Statistics for performance of MPC policy using
        # our learned dynamics model
        costs = path_cost(cost_fn, newpaths)
        returns = newpaths["acc_rewards"]
        logz.log_tabular('Iteration', itr)
        # In terms of cost function which your MPC controller uses to plan
        logz.log_tabular('AverageCost', np.mean(costs))
        logz.log_tabular('StdCost', np.std(costs))
        logz.log_tabular('MinimumCost', np.min(costs))
        logz.log_tabular('MaximumCost', np.max(costs))
        # In terms of true environment reward of your rolled out trajectory
        # using the MPC controller
        logz.log_tabular('AverageReturn', np.mean(returns))
        logz.log_tabular('StdReturn', np.std(returns))
        logz.log_tabular('MinimumReturn', np.min(returns))
        logz.log_tabular('MaximumReturn', np.max(returns))

        logz.dump_tabular()

        # Aggregate the on-policy transitions into the training set.
        for key in transition_keys:
            paths[key] = np.concatenate([paths[key], newpaths[key]])
def train(env,
          cost_fn,
          logdir=None,
          render=False,
          learning_rate=1e-3,
          onpol_iters=10,
          dynamics_iters=60,
          batch_size=512,
          num_paths_random=10,
          num_paths_onpol=10,
          num_simulated_paths=10000,
          env_horizon=1000,
          mpc_horizon=15,
          n_layers=2,
          size=500,
          activation=tf.nn.relu,
          output_activation=None):
    """Run model-based RL with MPC, tracing progress through ``d``.

    Arguments:
        env: Environment handed to the controllers, the dynamics model and
            ``sample``.
        cost_fn: Cost function given to the MPC controller and to
            ``path_cost``.
        logdir: Directory passed to ``logz.configure_output_dir``.
        render: Forwarded to ``sample`` for both the random and the
            on-policy rollouts.
        learning_rate: Forwarded to ``NNDynamicsModel``.
        onpol_iters: Number of iterations of on-policy aggregation for the
            loop to run.
        dynamics_iters: Number of iterations of training for the dynamics
            model which happen per iteration of the aggregation loop.
        batch_size: Batch size for dynamics training.
        num_paths_random: Number of paths/trajectories/rollouts generated by
            a random agent. They are used to train the initial dynamics
            model.
        num_paths_onpol: Number of paths to collect at each iteration of
            aggregation, using the Model Predictive Control policy.
        num_simulated_paths: How many fictitious rollouts the MPC policy
            should generate each time it is asked for an action.
        env_horizon: Number of timesteps in each path.
        mpc_horizon: The MPC policy generates actions by imagining
            fictitious rollouts and picking the first action of the best
            one. This is how many timesteps each fictitious rollout has.
        n_layers, size, activation, output_activation: Neural network
            architecture arguments, forwarded to ``NNDynamicsModel``.
    """
    # Trace the environment and every scalar hyperparameter.
    settings = (
        ("env", env),
        ("env.observation_space", env.observation_space),
        ("env.action_space", env.action_space),
        ("env.observation_space.shape", env.observation_space.shape),
        ("env.action_space.shape", env.action_space.shape),
        ("logdir", logdir),
        ("render", render),
        ("learning_rate", learning_rate),
        ("onpol_iters", onpol_iters),
        ("dynamics_iters", dynamics_iters),
        ("batch_size", batch_size),
        ("num_paths_random", num_paths_random),
        ("num_paths_onpol", num_paths_onpol),
        ("num_simulated_paths", num_simulated_paths),
        ("env_horizon", env_horizon),
        ("mpc_horizon", mpc_horizon),
        ("n_layers", n_layers),
        ("size", size),
    )
    for label, value in settings:
        d("{} = {}".format(label, value))

    logz.configure_output_dir(logdir)

    #===========================================================================
    # Random-agent rollouts provide the first training set for the dynamics
    # model.
    d("Generating random rollouts.")
    random_controller = RandomController(env)
    random_paths = sample(
        env=env,
        controller=random_controller,
        num_paths=num_paths_random,
        horizon=env_horizon,
        render=render,
    )
    d("Done generating random rollouts.")

    #===========================================================================
    # Statistics (mean and std) of observations, actions and deltas
    # (o_{t+1} - o_t) from the random data normalize the network inputs and
    # denormalize its outputs.
    d("Normalizing random rollouts.")
    data = paths_to_data(random_paths)
    normalization = compute_normalization(data)
    d("Done normalizing random rollouts.")
    (mean_obs, std_obs,
     mean_deltas, std_deltas,
     mean_action, std_action) = normalization
    statistics = (
        ("mean_obs", mean_obs),
        ("std_obs", std_obs),
        ("mean_deltas", mean_deltas),
        ("std_deltas", std_deltas),
        ("mean_action", mean_action),
        ("std_action", std_action),
    )
    for label, value in statistics:
        d("{} = {}".format(label, value))

    #===========================================================================
    # Dynamics model and MPC controller.
    sess = tf.Session()
    dyn_model = NNDynamicsModel(
        env=env,
        n_layers=n_layers,
        size=size,
        activation=activation,
        output_activation=output_activation,
        normalization=normalization,
        batch_size=batch_size,
        iterations=dynamics_iters,
        learning_rate=learning_rate,
        sess=sess,
    )
    mpc_controller = MPCcontroller(
        env=env,
        dyn_model=dyn_model,
        horizon=mpc_horizon,
        cost_fn=cost_fn,
        num_simulated_paths=num_simulated_paths,
    )

    #===========================================================================
    # Install the session as the default one; it is never exited here.
    sess.__enter__()
    tf.global_variables_initializer().run()

    #===========================================================================
    # On-policy aggregation: shuffle, refit, roll out the MPC policy, then
    # append the new transitions to the dataset.
    #
    # Note: no mixing ratio for new and old data is used, unlike
    # https://arxiv.org/abs/1708.02596
    fields = ("observations", "actions", "next_observations", "rewards")
    start_time = time.time()
    for itr in range(onpol_iters):
        d("Iteration {}".format(itr))

        # One shared permutation keeps the per-field arrays aligned.
        d("Shuffling data.")
        shuffle_indexes = np.random.permutation(data["observations"].shape[0])
        for field in fields:
            data[field] = data[field][shuffle_indexes]
        d("Done shuffling data.")

        # Fit the dynamics.
        d("Fitting dynamics.")
        dyn_model.fit(data)
        d("Done fitting dynamics.")

        # Generate on-policy rollouts.
        d("Generating on-policy rollouts.")
        rl_paths = sample(
            env=env,
            controller=mpc_controller,
            num_paths=num_paths_onpol,
            horizon=env_horizon,
            render=render,
        )
        d("Done generating on-policy rollouts.")

        # Compute metrics over the new rollouts only.
        costs = np.array([path_cost(cost_fn, rollout) for rollout in rl_paths])
        returns = np.array([sum(rollout["rewards"]) for rollout in rl_paths])

        # Update data: a fresh dict holding old and new transitions.
        new_data = paths_to_data(rl_paths)
        data = {
            field: np.concatenate([data[field], new_data[field]])
            for field in fields
        }
        # TODO(mwhittaker): Shuffle if we need to.

        # LOGGING
        # Cost rows use the planner's cost function; return rows use the
        # true environment reward of the rolled-out trajectories.
        logz.log_tabular('Iteration', itr)
        logz.log_tabular('Time', time.time() - start_time)
        summary_rows = (
            ('AverageCost', np.mean, costs),
            ('StdCost', np.std, costs),
            ('MinimumCost', np.min, costs),
            ('MaximumCost', np.max, costs),
            ('AverageReturn', np.mean, returns),
            ('StdReturn', np.std, returns),
            ('MinimumReturn', np.min, returns),
            ('MaximumReturn', np.max, returns),
        )
        for key, reducer, values in summary_rows:
            logz.log_tabular(key, reducer(values))

        logz.dump_tabular()
def _stack_transitions(paths):
    """Flatten a list of path dicts into (observations, actions, deltas).

    Each path's ``'observations'``, ``'actions'`` and ``'next_observations'``
    arrays are concatenated across paths; deltas are the element-wise
    difference ``next_observations - observations``.
    """
    observations = np.concatenate([path['observations'] for path in paths])
    actions = np.concatenate([path['actions'] for path in paths])
    next_observations = np.concatenate(
        [path['next_observations'] for path in paths])
    return observations, actions, next_observations - observations


def train(env,
          cost_fn,
          logdir=None,
          render=False,
          learning_rate=1e-3,
          onpol_iters=10,
          dynamics_iters=60,
          batch_size=512,
          num_paths_random=10,
          num_paths_onpol=10,
          num_simulated_paths=10000,
          env_horizon=1000,
          mpc_horizon=15,
          n_layers=2,
          size=500,
          activation=tf.nn.relu,
          output_activation=None):
    """Run model-based RL with MPC on an observations/actions/deltas dataset.

    Arguments:
        env: Environment handed to the controllers, the dynamics model and
            ``sample``.
        cost_fn: Cost function given to the MPC controller and to
            ``path_cost``.
        logdir: Directory passed to ``logz.configure_output_dir``.
        render: Forwarded positionally to ``sample`` for both the random and
            the on-policy rollouts.
        learning_rate: Forwarded to ``NNDynamicsModel``.
        onpol_iters: Number of iterations of on-policy aggregation for the
            loop to run.
        dynamics_iters: Number of iterations of training for the dynamics
            model which happen per iteration of the aggregation loop.
        batch_size: Batch size for dynamics training.
        num_paths_random: Number of paths/trajectories/rollouts generated by
            a random agent. They are used to train the initial dynamics
            model.
        num_paths_onpol: Number of paths to collect at each iteration of
            aggregation, using the Model Predictive Control policy.
        num_simulated_paths: How many fictitious rollouts the MPC policy
            should generate each time it is asked for an action.
        env_horizon: Number of timesteps in each path.
        mpc_horizon: The MPC policy generates actions by imagining
            fictitious rollouts and picking the first action of the best
            one. This is how many timesteps each fictitious rollout has.
        n_layers, size, activation, output_activation: Neural network
            architecture arguments, forwarded to ``NNDynamicsModel``.
    """
    logz.configure_output_dir(logdir)

    # Log experimental parameters.
    # inspect.getargspec was removed in Python 3.11; prefer getfullargspec
    # and only fall back on interpreters that do not provide it.
    arg_spec = getattr(inspect, 'getfullargspec', None) or inspect.getargspec
    args = arg_spec(train)[0]
    locals_ = locals()
    params = {k: locals_.get(k) for k in args}
    # These entries are dropped before the parameters are saved.
    for key in ('cost_fn', 'activation', 'output_activation', 'env'):
        del params[key]
    logz.save_params(params)

    #========================================================
    #
    # First, we need a lot of data generated by a random
    # agent, with which we'll begin to train our dynamics
    # model.
    random_controller = RandomController(env)

    # Sample from random controller
    paths = sample(env, random_controller, num_paths_random, env_horizon,
                   render, True)

    # Build data set
    observations, actions, deltas = _stack_transitions(paths)
    data = dict()
    data['observations'] = observations
    data['actions'] = actions
    data['deltas'] = deltas

    #========================================================
    #
    # The random data will be used to get statistics (mean
    # and std) for the observations, actions, and deltas
    # (where deltas are o_{t+1} - o_t). These will be used
    # for normalizing inputs and denormalizing outputs
    # from the dynamics network.
    #
    normalization = compute_normalization(data)

    #========================================================
    #
    # Build dynamics model and MPC controllers.
    #
    sess = tf.Session()
    dyn_model = NNDynamicsModel(env=env,
                                n_layers=n_layers,
                                size=size,
                                activation=activation,
                                output_activation=output_activation,
                                normalization=normalization,
                                batch_size=batch_size,
                                iterations=dynamics_iters,
                                learning_rate=learning_rate,
                                sess=sess)
    mpc_controller = MPCcontroller(env=env,
                                   dyn_model=dyn_model,
                                   horizon=mpc_horizon,
                                   cost_fn=cost_fn,
                                   num_simulated_paths=num_simulated_paths)

    #========================================================
    #
    # Tensorflow session building.
    #
    sess.__enter__()
    tf.global_variables_initializer().run()

    #========================================================
    #
    # Take multiple iterations of onpolicy aggregation, at each iteration
    # refitting the dynamics model to the current dataset and then taking
    # onpolicy samples and aggregating them into the dataset.
    # Note: no mixing ratio for new and old data is used here, unlike
    # https://arxiv.org/abs/1708.02596
    #
    for itr in range(onpol_iters):
        # Refit dynamic model
        dyn_model.fit(data)

        # Sample on-policy trajectories
        paths = sample(env, mpc_controller, num_paths_onpol, env_horizon,
                       render, True)

        # Summarize trajectories
        costs = [path_cost(cost_fn, path) for path in paths]
        returns = [np.sum(path['rewards']) for path in paths]

        # Aggregate data: append the on-policy transitions along axis 0.
        onpol_observations, onpol_actions, onpol_deltas = \
            _stack_transitions(paths)
        data['observations'] = np.concatenate(
            [data['observations'], onpol_observations])
        data['actions'] = np.concatenate([data['actions'], onpol_actions])
        data['deltas'] = np.concatenate([data['deltas'], onpol_deltas])

        # LOGGING
        # Statistics for performance of MPC policy using
        # our learned dynamics model
        logz.log_tabular('Iteration', itr)
        # In terms of cost function which your MPC controller uses to plan
        logz.log_tabular('AverageCost', np.mean(costs))
        logz.log_tabular('StdCost', np.std(costs))
        logz.log_tabular('MinimumCost', np.min(costs))
        logz.log_tabular('MaximumCost', np.max(costs))
        # In terms of true environment reward of your rolled out trajectory
        # using the MPC controller
        logz.log_tabular('AverageReturn', np.mean(returns))
        logz.log_tabular('StdReturn', np.std(returns))
        logz.log_tabular('MinimumReturn', np.min(returns))
        logz.log_tabular('MaximumReturn', np.max(returns))

        logz.dump_tabular()
def _discounted_q_values(rewards, gamma, reward_to_go):
    """Return one discounted Q estimate per timestep of a single path.

    With ``reward_to_go`` entry t is sum_{t' >= t} gamma**(t' - t) * r_{t'};
    otherwise every entry is the whole-path sum_{t'} gamma**t' * r_{t'}.
    """
    horizon = len(rewards)
    if not reward_to_go:
        total = 0
        for t, rew in enumerate(rewards):
            total += gamma**t * rew
        return [total] * horizon
    # Accumulate from the end of the path so each entry costs O(1).
    q_values = [0] * horizon
    running = 0
    for t in reversed(range(horizon)):
        running = rewards[t] + gamma * running
        q_values[t] = running
    return q_values


def train_PG(
        exp_name='',
        env_name='',
        n_iter=100,
        gamma=1.0,
        min_timesteps_per_batch=1000,
        max_path_length=None,
        learning_rate=5e-3,
        reward_to_go=False,
        animate=True,
        logdir=None,
        normalize_advantages=False,
        nn_baseline=False,
        seed=0,
        # network arguments
        n_layers=1,
        size=32,
        # mb mpc arguments
        model_learning_rate=1e-3,
        onpol_iters=10,
        dynamics_iters=260,
        batch_size=512,
        num_paths_random=10,
        num_paths_onpol=10,
        num_simulated_paths=1000,
        env_horizon=1000,
        mpc_horizon=10,
        m_n_layers=2,
        m_size=500,
        ):
    """Run the combined MPC / policy-gradient training loop on HalfCheetahEnvNew.

    Random rollouts seed a dynamics-model buffer; each iteration then
    (optionally) refits the dynamics model, collects rollouts in which the
    policy network is queried with the controller's suggested action, and
    performs one policy update.  The module-level flags ``MPC`` and ``PG``
    select whether the MPC controller (vs. the random controller) produces the
    suggested action and whether the policy's action (vs. the suggested one)
    is executed.

    Arguments:
        exp_name, env_name       Only logged/printed; the environment is
                                 always ``HalfCheetahEnvNew()``.
        n_iter                   Number of training iterations.
        gamma                    Discount factor for the Q-value estimates.
        min_timesteps_per_batch  Rollouts are collected until the batch holds
                                 more than this many timesteps.
        max_path_length          Step limit per rollout.  ``None`` falls back
                                 to ``env_horizon``.
        learning_rate            Learning rate for policy and value networks.
        reward_to_go             Use reward-to-go instead of full-path return.
        animate                  Render the first rollout of every 10th
                                 iteration.
        logdir                   Output directory handed to ``logz``.
        normalize_advantages     Pass advantages through ``normalize``.
        nn_baseline              Fit and subtract a value-network baseline.
        seed                     Seed for TensorFlow and NumPy.
        n_layers, size           Policy / value network architecture.
        model_learning_rate, m_n_layers, m_size
                                 Dynamics-model learning rate / architecture.
        dynamics_iters, batch_size
                                 Dynamics-model training settings.
        num_paths_random         Number of random rollouts collected up front.
        num_simulated_paths, mpc_horizon
                                 MPC controller settings.
        env_horizon              Horizon of the initial random rollouts.
        onpol_iters, num_paths_onpol
                                 Accepted and logged but not otherwise used.
    """
    start = time.time()

    # Configure output directory for logging
    logz.configure_output_dir(logdir)

    # Log experimental parameters.  getargspec was removed in Python 3.11, so
    # prefer getfullargspec whenever the interpreter provides it.
    getargspec = getattr(inspect, 'getfullargspec', None) or inspect.getargspec
    args = getargspec(train_PG)[0]
    locals_ = locals()
    params = {k: locals_[k] if k in locals_ else None for k in args}
    logz.save_params(params)

    # Set random seeds
    tf.set_random_seed(seed)
    np.random.seed(seed)

    # Make the environment (fixed; env_name is informational only)
    env = HalfCheetahEnvNew()
    cost_fn = cheetah_cost_fn
    activation = tf.nn.relu
    output_activation = None

    # Is this env continuous, or discrete?
    discrete = isinstance(env.action_space, gym.spaces.Discrete)

    # Maximum length for episodes.  The rollout loop compares an int step
    # counter against this value, so None must be resolved to a number first.
    if max_path_length is None:
        max_path_length = env_horizon

    # Observation and action sizes
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.n if discrete else env.action_space.shape[0]

    # Print environment information
    print("-------- env info --------")
    print("Environment name: ", env_name)
    print("Action space is discrete: ", discrete)
    print("Action space dim: ", ac_dim)
    print("Observation space dim: ", ob_dim)
    print("Max_path_length ", max_path_length)

    #========================================================================================#
    # Random data collection
    #========================================================================================#
    random_controller = RandomController(env)
    data_buffer_model = DataBuffer()
    data_buffer_ppo = DataBuffer_general(10000, 4)

    # sample path
    print("collecting random data ..... ")
    paths = sample(env,
                   random_controller,
                   num_paths=num_paths_random,
                   horizon=env_horizon,
                   render=False,
                   verbose=False)

    # add into buffer
    for path in paths:
        for ob, ac, next_ob in zip(path['observations'],
                                   path['actions'],
                                   path['next_observations']):
            data_buffer_model.add(ob, ac, next_ob)

    print("data buffer size: ", data_buffer_model.size)

    normalization = compute_normalization(data_buffer_model)

    #========================================================================================#
    # Tensorflow Engineering: Config, Session, Variable initialization
    #========================================================================================#
    tf_config = tf.ConfigProto()
    tf_config.allow_soft_placement = True
    tf_config.intra_op_parallelism_threads = 4
    tf_config.inter_op_parallelism_threads = 1
    sess = tf.Session(config=tf_config)

    # The dynamics model takes the dedicated model arguments; the policy and
    # value networks below take n_layers / size / learning_rate.
    dyn_model = NNDynamicsModel(env=env,
                                n_layers=m_n_layers,
                                size=m_size,
                                activation=activation,
                                output_activation=output_activation,
                                normalization=normalization,
                                batch_size=batch_size,
                                iterations=dynamics_iters,
                                learning_rate=model_learning_rate,
                                sess=sess)

    mpc_controller = MPCcontroller(env=env,
                                   dyn_model=dyn_model,
                                   horizon=mpc_horizon,
                                   cost_fn=cost_fn,
                                   num_simulated_paths=num_simulated_paths)

    policy_nn = policy_network_ppo(sess, ob_dim, ac_dim, discrete, n_layers, size, learning_rate)

    if nn_baseline:
        value_nn = value_network(sess, ob_dim, n_layers, size, learning_rate)

    sess.__enter__()  # equivalent to `with sess:`
    tf.global_variables_initializer().run()

    #========================================================================================#
    # Training Loop
    #========================================================================================#
    total_timesteps = 0
    for itr in range(n_iter):
        print("********** Iteration %i ************"%itr)

        if MPC:
            dyn_model.fit(data_buffer_model)
        returns = []
        costs = []

        # Collect paths until we have enough timesteps
        timesteps_this_batch = 0
        paths = []
        while True:
            current_path = {'observations': [],
                            'actions': [],
                            'reward': [],
                            'next_observations': []}

            ob = env.reset()
            obs, acs, mpc_acs, rewards = [], [], [], []
            animate_this_episode = (len(paths) == 0 and (itr % 10 == 0) and animate)
            steps = 0
            return_ = 0

            while True:
                if animate_this_episode:
                    env.render()
                    time.sleep(0.05)
                obs.append(ob)

                if MPC:
                    mpc_ac = mpc_controller.get_action(ob)
                else:
                    mpc_ac = random_controller.get_action(ob)

                # The policy is always queried; its action is only executed
                # when PG is enabled.
                ac = policy_nn.predict(ob, mpc_ac)
                ac = ac[0]

                if not PG:
                    ac = mpc_ac

                acs.append(ac)
                mpc_acs.append(mpc_ac)

                current_path['observations'].append(ob)

                ob, rew, done, _ = env.step(ac)

                current_path['reward'].append(rew)
                current_path['actions'].append(ac)
                current_path['next_observations'].append(ob)

                return_ += rew
                rewards.append(rew)

                steps += 1
                if done or steps > max_path_length:
                    break

            if MPC:
                # cost & return
                cost = path_cost(cost_fn, current_path)
                costs.append(cost)
                returns.append(return_)
                print("total return: ", return_)
                print("costs: ", cost)

                # add into buffers
                for n in range(len(current_path['observations'])):
                    data_buffer_model.add(current_path['observations'][n],
                                          current_path['actions'][n],
                                          current_path['next_observations'][n])

            for n in range(len(current_path['observations'])):
                data_buffer_ppo.add(current_path['observations'][n],
                                    current_path['actions'][n],
                                    current_path['reward'][n],
                                    current_path['next_observations'][n])

            path = {"observation": np.array(obs),
                    "reward": np.array(rewards),
                    "action": np.array(acs),
                    "mpc_action": np.array(mpc_acs)}
            paths.append(path)
            timesteps_this_batch += pathlength(path)
            if timesteps_this_batch > min_timesteps_per_batch:
                break
        total_timesteps += timesteps_this_batch

        print("data_buffer_ppo.size:", data_buffer_ppo.size)

        # Build arrays for observation, action for the policy gradient update
        # by concatenating across paths
        ob_no = np.concatenate([path["observation"] for path in paths])
        ac_na = np.concatenate([path["action"] for path in paths])
        mpc_ac_na = np.concatenate([path["mpc_action"] for path in paths])

        # Computing Q-values
        q_n = []
        for path in paths:
            q_n.extend(_discounted_q_values(path["reward"], gamma, reward_to_go))
        q_n = np.asarray(q_n)

        # Computing Baselines
        if nn_baseline:
            # Rescale the baseline prediction to the Q-value statistics.
            b_n = value_nn.predict(ob_no)
            b_n = normalize(b_n)
            b_n = denormalize(b_n, np.std(q_n), np.mean(q_n))
            adv_n = q_n - b_n
        else:
            adv_n = q_n.copy()

        # Advantage Normalization
        if normalize_advantages:
            adv_n = normalize(adv_n)

        # Optimizing Neural Network Baseline
        if nn_baseline:
            b_n_target = normalize(q_n)
            value_nn.fit(ob_no, b_n_target)

        # Performing the Policy Update
        policy_nn.fit(ob_no, ac_na, adv_n, mpc_ac_na)

        # Log diagnostics
        returns = [path["reward"].sum() for path in paths]
        ep_lengths = [pathlength(path) for path in paths]
        logz.log_tabular("Time", time.time() - start)
        logz.log_tabular("Iteration", itr)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", total_timesteps)
        logz.dump_tabular()
        logz.pickle_tf_vars()
def train(env,
          cost_fn,
          logdir=None,
          render=False,
          learning_rate=1e-3,
          onpol_iters=1,
          dynamics_iters=60,
          batch_size=512,
          num_paths_random=10,
          num_paths_onpol=1,
          num_simulated_paths=10000,
          env_horizon=1000,
          mpc_horizon=15,
          n_layers=2,
          size=100,
          activation=tf.nn.relu,
          output_activation=None):
    """Fit a dynamics model on random data, then alternate refit / MPC rollouts.

    Arguments:
        onpol_iters          Number of iterations of on-policy aggregation for
                             the loop to run.
        dynamics_iters       Number of dynamics-model training iterations per
                             aggregation iteration.
        batch_size           Batch size for dynamics training.
        num_paths_random     Number of rollouts generated by a random agent;
                             used to train the initial dynamics model.
        num_paths_onpol      Number of rollouts collected with the MPC policy
                             at each aggregation iteration.
        num_simulated_paths  How many fictitious rollouts the MPC policy
                             generates each time it is asked for an action.
        env_horizon          Number of timesteps in each path.
        mpc_horizon          Number of timesteps in each fictitious rollout.
        n_layers/size/activations
                             Neural network architecture arguments.
    """

    def _flatten(rollouts):
        # Stack per-path arrays into one transition set (delta = o_{t+1} - o_t).
        stacked_obs = np.concatenate([p["observations"] for p in rollouts])
        stacked_acs = np.concatenate([p["actions"] for p in rollouts])
        stacked_next = np.concatenate([p["next_observations"] for p in rollouts])
        return stacked_obs, stacked_acs, stacked_next - stacked_obs

    logz.configure_output_dir(logdir)

    #========================================================
    # Seed data from a random agent.
    #
    random_controller = RandomController(env)
    random_paths, _, _ = sample(env, random_controller, num_paths_random)
    seed_obs, seed_acs, seed_delta = _flatten(random_paths)
    data = {'observations': seed_obs, 'actions': seed_acs, 'delta': seed_delta}

    #========================================================
    # Mean/std statistics used to normalize the network inputs and
    # denormalize its outputs.
    #
    stats = compute_normalization(data)
    mean_obs, std_obs, mean_deltas, std_deltas, mean_actions, std_actions = stats
    normalization = {
        'observations': [mean_obs, std_obs],
        'actions': [mean_actions, std_actions],
        'delta': [mean_deltas, std_deltas],
    }

    #========================================================
    # Build dynamics model and MPC controller.
    #
    sess = tf.Session()

    dyn_model = NNDynamicsModel(env=env,
                                n_layers=n_layers,
                                size=size,
                                activation=activation,
                                output_activation=output_activation,
                                normalization=normalization,
                                batch_size=batch_size,
                                iterations=dynamics_iters,
                                learning_rate=learning_rate,
                                sess=sess)

    mpc_controller = MPCcontroller(env=env,
                                   dyn_model=dyn_model,
                                   horizon=mpc_horizon,
                                   cost_fn=cost_fn,
                                   num_simulated_paths=num_simulated_paths)

    #========================================================
    # Tensorflow session building.
    #
    sess.__enter__()
    tf.global_variables_initializer().run()

    #========================================================
    # On-policy aggregation: refit on everything gathered so far, roll out
    # the MPC policy, and append the new transitions (no mixing ratio).
    #
    print("onpol_iter", onpol_iters)
    for itr in range(onpol_iters):
        print(data['observations'].shape)
        dyn_model.fit(data)

        # Generate trajectories from the MPC controller
        mpc_paths, returns, costs = sample(env, mpc_controller, num_paths_onpol)
        new_obs, new_acs, new_delta = _flatten(mpc_paths)

        data = {
            'observations': np.concatenate((data['observations'], new_obs)),
            'actions': np.concatenate((data['actions'], new_acs)),
            'delta': np.concatenate((data['delta'], new_delta)),
        }

        # LOGGING
        # Cost rows use the cost function the MPC controller plans with;
        # return rows use the true environment reward of the rollouts.
        logz.log_tabular('Iteration', itr)
        report = (
            ('AverageCost', np.mean, costs),
            ('StdCost', np.std, costs),
            ('MinimumCost', np.min, costs),
            ('MaximumCost', np.max, costs),
            ('AverageReturn', np.mean, returns),
            ('StdReturn', np.std, returns),
            ('MinimumReturn', np.min, returns),
            ('MaximumReturn', np.max, returns),
        )
        for label, reducer, values in report:
            logz.log_tabular(label, reducer(values))

        logz.dump_tabular()
def train(env,
          cost_fn,
          logdir=None,
          render=False,
          learning_rate=1e-3,
          onpol_iters=10,
          dynamics_iters=60,
          batch_size=512,
          num_paths_random=10,
          num_paths_onpol=10,
          num_simulated_paths=10000,
          env_horizon=1000,
          mpc_horizon=15,
          n_layers=2,
          size=500,
          activation=tf.nn.relu,
          output_activation=None
          ):
    """Train a dynamics model with MPC rollouts, optionally with behavioral cloning.

    Arguments:
        onpol_iters          Number of iterations of on-policy aggregation for
                             the loop to run.
        dynamics_iters       Number of dynamics-model training iterations per
                             aggregation iteration.
        batch_size           Batch size for dynamics training.
        num_paths_random     Number of rollouts generated by a random agent;
                             used to train the initial dynamics model.
        num_paths_onpol      Number of rollouts collected with the MPC policy
                             at each aggregation iteration.
        num_simulated_paths  How many fictitious rollouts the MPC policy
                             generates each time it is asked for an action.
        env_horizon          Number of timesteps in each path.
        mpc_horizon          Number of timesteps in each fictitious rollout.
        n_layers/size/activations
                             Neural network architecture arguments.

    Module-level settings read here: BC_BUFFER_SIZE, BATCH_SIZE_BC,
    CHECKPOINT_DIR, LOAD_MODEL and BEHAVIORAL_CLONING.
    """
    logz.configure_output_dir(logdir)

    #========================================================
    # Seed data from a random agent.
    #
    print("-------- env info --------")
    print("observation_space: ", env.observation_space.shape)
    print("action_space: ", env.action_space.shape)
    print(" ")

    random_controller = RandomController(env)
    data_buffer = DataBuffer()
    bc_data_buffer = DataBuffer_SA(BC_BUFFER_SIZE)

    print("collecting random data ..... ")
    random_paths = sample(env,
                          random_controller,
                          num_paths=num_paths_random,
                          horizon=env_horizon,
                          render=False,
                          verbose=False)

    # Store every random transition in the dynamics buffer.
    for rollout in random_paths:
        transitions = zip(rollout['observations'],
                          rollout['actions'],
                          rollout['next_observations'])
        for state, action, next_state in transitions:
            data_buffer.add(state, action, next_state)

    #========================================================
    # Normalization statistics from the random data.
    #
    print("data buffer size: ", data_buffer.size)
    normalization = compute_normalization(data_buffer)

    #========================================================
    # Build dynamics model, MPC controllers and the behavioral cloning network.
    #
    sess = tf.Session()

    dyn_model = NNDynamicsModel(env=env,
                                n_layers=n_layers,
                                size=size,
                                activation=activation,
                                output_activation=output_activation,
                                normalization=normalization,
                                batch_size=batch_size,
                                iterations=dynamics_iters,
                                learning_rate=learning_rate,
                                sess=sess)

    mpc_controller = MPCcontroller(env=env,
                                   dyn_model=dyn_model,
                                   horizon=mpc_horizon,
                                   cost_fn=cost_fn,
                                   num_simulated_paths=num_simulated_paths)

    bc_net = BCnetwork(sess, env, BATCH_SIZE_BC, learning_rate)

    mpc_controller_bc = MPCcontroller_BC(env=env,
                                         dyn_model=dyn_model,
                                         bc_network=bc_net,
                                         horizon=mpc_horizon,
                                         cost_fn=cost_fn,
                                         num_simulated_paths=num_simulated_paths)

    #========================================================
    # Tensorflow session building.
    #
    sess.__enter__()
    tf.global_variables_initializer().run()

    # Restore a previous checkpoint when one exists and loading is enabled;
    # otherwise make sure the checkpoint directory is present.
    saver = tf.train.Saver()
    checkpoint = tf.train.get_checkpoint_state(CHECKPOINT_DIR)
    if checkpoint and checkpoint.model_checkpoint_path and LOAD_MODEL:
        saver.restore(sess, checkpoint.model_checkpoint_path)
        print("checkpoint loaded:", checkpoint.model_checkpoint_path)
    else:
        print("Could not find old checkpoint")
        if not os.path.exists(CHECKPOINT_DIR):
            os.mkdir(CHECKPOINT_DIR)

    #========================================================
    # On-policy aggregation: refit the dynamics model on the current buffer,
    # roll out the controller and add the new transitions (no mixing ratio).
    #
    for itr in range(onpol_iters):
        print("onpol_iters: ", itr)
        dyn_model.fit(data_buffer)

        saver.save(sess, CHECKPOINT_DIR)

        returns = []
        costs = []

        for rollout_idx in range(num_paths_onpol):
            print("paths_onpol: ", rollout_idx, " running.....")
            print("data buffer size: ", data_buffer.size)

            state = env.reset_model()
            path = {'observations': [], 'actions': [], 'next_observations': []}
            return_ = 0

            for _step in range(env_horizon):
                if render:
                    env.render()

                # The BC-assisted controller is used only when cloning is
                # enabled and its buffer holds more than 2000 samples.
                if BEHAVIORAL_CLONING and bc_data_buffer.size > 2000:
                    action = mpc_controller_bc.get_action(state)
                else:
                    action = mpc_controller.get_action(state)

                next_state, env_reward, _, _ = env._step(action)

                path['observations'].append(state)
                path['actions'].append(action)
                path['next_observations'].append(next_state)
                state = next_state
                return_ += env_reward

            # cost & return
            cost = path_cost(cost_fn, path)
            costs.append(cost)
            returns.append(return_)
            print("total return: ", return_)
            print("costs: ", cost)

            # add into buffers
            recorded = zip(path['observations'],
                           path['actions'],
                           path['next_observations'])
            for seen, taken, reached in recorded:
                data_buffer.add(seen, taken, reached)
                bc_data_buffer.add(seen, taken)

        if BEHAVIORAL_CLONING:
            behavioral_cloning(sess, env, bc_net, mpc_controller, env_horizon,
                               bc_data_buffer, Training_epoch=1000)

        # LOGGING
        # Cost rows use the cost function the MPC controller plans with;
        # return rows use the true environment reward of the rollouts.
        logz.log_tabular('Iteration', itr)
        report = (
            ('AverageCost', np.mean, costs),
            ('StdCost', np.std, costs),
            ('MinimumCost', np.min, costs),
            ('MaximumCost', np.max, costs),
            ('AverageReturn', np.mean, returns),
            ('StdReturn', np.std, returns),
            ('MinimumReturn', np.min, returns),
            ('MaximumReturn', np.max, returns),
        )
        for label, reducer, values in report:
            logz.log_tabular(label, reducer(values))

        logz.dump_tabular()
def train(
        env,
        cost_fn,
        logdir=None,
        render=False,
        learning_rate=1e-3,
        onpol_iters=10,
        dynamics_iters=60,
        batch_size=512,
        num_paths_random=10,
        num_paths_onpol=10,
        num_simulated_paths=1000,
        env_horizon=1000,
        mpc_horizon=15,
        n_layers=2,
        size=500,
        activation=tf.nn.relu,
        output_activation=None,
        ):
    """Pretrain a dynamics model on random data, then refit it on mixed batches.

    After a fixed 1000-step pretraining phase on random-policy transitions,
    each aggregation iteration trains the model for ``dynamics_iters`` steps
    (from the second iteration on, each batch is 1/20 random-policy data and
    19/20 MPC data) and then rolls out the MPC controller in the environment.

    Arguments:
        onpol_iters          Number of iterations of on-policy aggregation for
                             the loop to run.
        dynamics_iters       Number of dynamics-model training iterations per
                             aggregation iteration.
        batch_size           Batch size for dynamics training; also the number
                             of timesteps to exceed before a rollout phase
                             ends.
        num_paths_random     Number of rollouts generated by a random agent;
                             used to train the initial dynamics model.
        num_paths_onpol      Accepted for interface compatibility; not used.
        num_simulated_paths  How many fictitious rollouts the MPC policy
                             generates each time it is asked for an action.
        env_horizon          Number of timesteps in each random path.
        mpc_horizon          Number of timesteps in each fictitious rollout.
        n_layers/size/activations
                             Neural network architecture arguments.
    """

    def _draw_batch(obs, acs, nxt, count):
        # Sample `count` transitions uniformly with replacement.
        idx = np.random.randint(obs.shape[0], size=count)
        return obs[idx, :], acs[idx, :], nxt[idx, :]

    logz.configure_output_dir(logdir)

    #========================================================
    # Seed data from a random agent.
    #
    random_controller = RandomController(env)
    paths_rand = sample(env,
                        random_controller,
                        num_paths=num_paths_random,
                        horizon=env_horizon,
                        render=render,
                        verbose=False)

    #========================================================
    # Normalization statistics from the random data.
    #
    normalization = compute_normalization(paths_rand)

    gamma = 0.99

    #========================================================
    # Build dynamics model and MPC controller.
    #
    sess = tf.Session()

    dyn_model = NNDynamicsModel(env=env,
                                n_layers=n_layers,
                                size=size,
                                activation=activation,
                                output_activation=output_activation,
                                normalization=normalization,
                                batch_size=batch_size,
                                iterations=dynamics_iters,
                                learning_rate=learning_rate,
                                sess=sess)

    mpc_controller = MPCcontroller(env=env,
                                   dyn_model=dyn_model,
                                   horizon=mpc_horizon,
                                   cost_fn=cost_fn,
                                   num_simulated_paths=num_simulated_paths)

    #========================================================
    # Tensorflow session building.
    #
    sess.__enter__()
    tf.global_variables_initializer().run()

    #========================================================
    # Prefit the dynamics model on random data before on-policy aggregation.
    #
    print("****** Pretrain dynamic Model *******")
    # paths_rand never changes, so these arrays are built once and reused by
    # every aggregation iteration below.
    obs_rand = np.concatenate([path["observation"] for path in paths_rand])
    action_rand = np.concatenate([path["action"] for path in paths_rand])
    next_ob_rand = np.concatenate([path["obs_next"] for path in paths_rand])

    for i in range(1000):
        batch_ob, batch_ac, batch_nxt = _draw_batch(
            obs_rand, action_rand, next_ob_rand, batch_size)
        loss = dyn_model.fit(batch_ob, batch_ac, batch_nxt)
        if i % 20 == 0:
            print('loss', loss)

    # Floor division keeps the sample counts integral on Python 2 and 3;
    # np.random.randint rejects a float `size`.
    n_rand = batch_size // 20
    n_rl = batch_size * 19 // 20

    costs = []
    returns = []
    paths_rl = []
    for itr in range(onpol_iters):
        # fit dynamic model
        if itr > 0:
            obs_rl = np.concatenate([path["observation"] for path in paths_rl])
            action_rl = np.concatenate([path["action"] for path in paths_rl])
            next_ob_rl = np.concatenate([path["obs_next"] for path in paths_rl])

        for i in range(dynamics_iters):
            # random-policy share of the batch
            batch_ob, batch_ac, batch_nxt = _draw_batch(
                obs_rand, action_rand, next_ob_rand, n_rand)
            if itr > 0:
                # on-policy share of the batch, appended after the random part
                rl_ob, rl_ac, rl_nxt = _draw_batch(
                    obs_rl, action_rl, next_ob_rl, n_rl)
                batch_ob = np.concatenate((batch_ob, rl_ob))
                batch_ac = np.concatenate((batch_ac, rl_ac))
                batch_nxt = np.concatenate((batch_nxt, rl_nxt))
            dyn_model.fit(batch_ob, batch_ac, batch_nxt)

        print("on policy dagger ", itr)

        ob = env.reset()
        # `step_returns` is the running discounted return after each step.  It
        # is kept separate from `returns`, which holds one final return per
        # aggregation iteration for the log.
        observes, acs, rewards, obs_2, step_returns = [], [], [], [], []
        steps = 0
        g = 0
        # NOTE(review): the rollout is capped by the MPC planning horizon,
        # not env_horizon - confirm this is intended.
        max_path_length = mpc_controller.horizon
        timesteps_this_batch = 0

        while True:
            while True:
                observes.append(ob)
                ac = mpc_controller.get_action(ob)
                acs.append(ac)
                ob, rew, done, _ = env.step(ac)
                g += rew * gamma**steps
                obs_2.append(ob)
                rewards.append(rew)
                step_returns.append(g)
                steps += 1
                if done or steps > max_path_length:
                    break

            path = {
                "observation": np.array(observes),
                "reward": np.array(rewards),
                "action": np.array(acs),
                "obs_next": np.array(obs_2),
                "return": np.array(step_returns),
            }
            paths_rl.append(path)
            timesteps_this_batch += pathlength(path)
            print(g)
            if timesteps_this_batch > batch_size:
                break

        # Score the rollout with the same cost function the controller plans
        # with, as the cost rows in the log describe.
        trajectory_cost = trajectory_cost_fn(cost_fn,
                                             path["observation"],
                                             path["action"],
                                             path["obs_next"])
        costs.append(trajectory_cost)
        returns.append(path["return"][-1])

        # LOGGING
        # Statistics for performance of MPC policy using
        # our learned dynamics model
        logz.log_tabular('Iteration', itr)
        # In terms of cost function which your MPC controller uses to plan
        logz.log_tabular('AverageCost', np.mean(costs))
        logz.log_tabular('StdCost', np.std(costs))
        logz.log_tabular('MinimumCost', np.min(costs))
        logz.log_tabular('MaximumCost', np.max(costs))
        # In terms of true environment reward of your rolled out trajectory
        # using the MPC controller
        logz.log_tabular('AverageReturn', np.mean(returns))
        logz.log_tabular('StdReturn', np.std(returns))
        logz.log_tabular('MinimumReturn', np.min(returns))
        logz.log_tabular('MaximumReturn', np.max(returns))

        logz.dump_tabular()
def train(env,
          cost_fn,
          load_model,
          model_path,
          logdir=None,
          render=False,
          learning_rate_dyn=1e-3,
          learning_rate_policy=1e-4,
          onpol_iters=10,
          dynamics_iters=60,
          policy_iters=100,
          batch_size=512,
          num_paths_random=10,
          num_paths_onpol=5,
          num_simulated_paths=10000,
          env_horizon=1000,
          mpc_horizon=15,
          n_layers=2,
          size=500,
          activation=tf.nn.relu,
          output_activation=None,
          ):
    """Train a dynamics model per MPI worker and a shared policy on rank 0.

    Every worker fits its own dynamics model and samples rollouts with an LQR
    controller.  Workers other than rank 0 send their new rollouts to rank 0,
    which merges them, fits the neural policy and logs statistics.  After the
    aggregation loop rank 0 runs the learned policy in the environment in an
    endless rendering loop, so this function does not return on rank 0.

    Arguments:
        load_model, model_path
                             Forwarded to ``NNPolicy``.
        logdir               Currently unused: the ``logz`` output-directory
                             configuration is disabled in this variant.
        learning_rate_dyn    Learning rate of the dynamics model.
        learning_rate_policy Learning rate of the policy network.
        onpol_iters          Number of iterations of on-policy aggregation for
                             the loop to run.
        dynamics_iters       Number of dynamics-model training iterations per
                             aggregation iteration.
        policy_iters         Training iterations handed to ``NNPolicy``.
        batch_size           Batch size for dynamics and policy training.
        num_paths_random     Number of rollouts generated by a random agent;
                             used to train the initial dynamics model.
        num_paths_onpol      Number of rollouts collected at each aggregation
                             iteration.
        num_simulated_paths  How many fictitious rollouts the MPC policy
                             generates each time it is asked for an action.
        env_horizon          Number of timesteps in each path.
        mpc_horizon          Number of timesteps in each fictitious rollout.
        n_layers/size/activations
                             Neural network architecture arguments.
    """
    # NOTE(review): logz.configure_output_dir(logdir) is disabled here while
    # logz.log_tabular is still called below - confirm logz works unconfigured.

    #========================================================
    # Seed data from a random agent.
    #
    random_controller = RandomController(env)
    data = sample(env, random_controller, num_paths_random, env_horizon)

    #========================================================
    # Normalization statistics from the random data.
    #
    normalization = compute_normalization(data)

    #========================================================
    # Build dynamics model, policy and controllers.
    #
    sess = tf.Session()

    dyn_model = NNDynamicsModel(env=env,
                                n_layers=n_layers,
                                size=size,
                                activation=activation,
                                output_activation=output_activation,
                                normalization=normalization,
                                batch_size=batch_size,
                                iterations=dynamics_iters,
                                learning_rate=learning_rate_dyn,
                                sess=sess)

    policy = NNPolicy(env=env,
                      normalization=normalization,
                      batch_size=batch_size,
                      iterations=policy_iters,
                      learning_rate=learning_rate_policy,
                      sess=sess,
                      model_path=model_path,
                      save_path="./policy/",
                      load_model=load_model)

    # Constructed as in the original even though rollouts below use the LQR
    # controller.
    mpc_controller = MPCcontroller(env=env,
                                   dyn_model=dyn_model,
                                   horizon=mpc_horizon,
                                   cost_fn=cost_fn,
                                   num_simulated_paths=num_simulated_paths)

    lqr_controller = LQRcontroller(env=env,
                                   delta=0.005,
                                   T=50,
                                   dyn_model=dyn_model,
                                   cost_fn=cost_fn,
                                   iterations=1)

    comm = MPI.COMM_WORLD
    # Named world_size so it does not shadow the `size` network-width argument.
    world_size = comm.Get_size()
    rank = comm.Get_rank()

    #========================================================
    # Tensorflow session building.
    #
    sess.__enter__()
    tf.global_variables_initializer().run()

    #========================================================
    # On-policy aggregation: refit the dynamics model on the current dataset,
    # sample new rollouts and aggregate them (no mixing ratio).
    #
    for itr in range(onpol_iters):
        print("fitting dynamics for worker ", rank)
        dyn_model.fit(data)

        print("sampling new trajectories from worker ", rank)
        new_data = sample(env, lqr_controller, num_paths_onpol, env_horizon)

        data += new_data

        # Only non-root workers send.  Rank 0 already holds its own rollouts
        # and never posts a receive from itself, so a blocking self-send
        # would be unmatched and can deadlock.
        if rank != 0:
            comm.send(new_data, dest=0)

        if rank == 0:
            costs, returns = [], []
            for path in data:
                costs.append(path_cost(cost_fn, path))
                returns.append(np.sum(path['rewards']))
            print("returns ", returns)

            for i in range(1, world_size):
                data += comm.recv(source=i)

            print("fitting policy...")
            policy.fit(data)

            # LOGGING
            # Statistics for performance of MPC policy using
            # our learned dynamics model
            logz.log_tabular('Iteration', itr)
            # In terms of cost function which your MPC controller uses to plan
            logz.log_tabular('AverageCost', np.mean(costs))
            logz.log_tabular('StdCost', np.std(costs))
            logz.log_tabular('MinimumCost', np.min(costs))
            logz.log_tabular('MaximumCost', np.max(costs))
            # In terms of true environment reward of your rolled out
            # trajectory using the MPC controller
            logz.log_tabular('AverageReturn', np.mean(returns))
            logz.log_tabular('StdReturn', np.std(returns))
            logz.log_tabular('MinimumReturn', np.min(returns))
            logz.log_tabular('MaximumReturn', np.max(returns))

            logz.dump_tabular()

    # applying the learned neural policy
    if rank == 0:
        ob = env.reset()

        while True:
            a = policy.get_action(ob.reshape((1, ob.shape[0])))

            # control clipping to be added
            next_ob, reward, done, info = env.step(a[0])

            print("action", a)
            print("predicted ob", dyn_model.predict(ob, a))
            # NOTE(review): assumes normalization[0] / normalization[1] are the
            # observation mean / std - verify against compute_normalization.
            print("actual ob",
                  (next_ob - normalization[0]) / (normalization[1] + 1e-10))
            env.render()
            ob = next_ob
            if done:
                ob = env.reset()
def train(env,
          cost_fn,
          logdir=None,
          render=False,
          learning_rate=1e-3,
          onpol_iters=10,
          dynamics_iters=60,
          batch_size=512,
          num_paths_random=10,
          num_paths_onpol=10,
          num_simulated_paths=10000,
          env_horizon=1000,
          mpc_horizon=10,
          n_layers=2,
          size=500,
          activation=tf.nn.relu,
          output_activation=None):
    """Train a neural dynamics model with on-policy aggregation under MPC.

    Random rollouts seed the dataset; each aggregation iteration then refits
    the dynamics model, collects rollouts with the MPC controller, appends
    them to the dataset and recomputes the normalization statistics.  Per
    iteration, the mean return is written to ``results.csv`` (in the current
    working directory) and cost/return statistics are logged through
    ``logz``.

    Arguments:

    env                     Environment handed to the controllers, the
                            dynamics model and ``sample``.

    cost_fn                 Cost function used by the MPC controller and by
                            ``path_cost`` for logging.

    logdir                  Output directory passed to
                            ``logz.configure_output_dir``.

    onpol_iters             Number of iterations of onpolicy aggregation for
                            the loop to run.

    dynamics_iters          Number of iterations of training for the dynamics
                            model which happen per iteration of the
                            aggregation loop.

    batch_size              Batch size for dynamics training.

    learning_rate           Learning rate passed to the dynamics model.

    num_simulated_paths     How many fictitious rollouts the MPC policy
                            should generate each time it is asked for an
                            action.

    mpc_horizon             The MPC policy generates actions by imagining
                            fictitious rollouts, and picking the first action
                            of the best fictitious rollout.  This argument is
                            how many timesteps should be in each fictitious
                            rollout.

    n_layers/size/activation/output_activation
                            Neural network architecture arguments.

    render, num_paths_random, num_paths_onpol, env_horizon
                            Accepted for interface compatibility but NOT read
                            by this implementation: the random phase uses a
                            hard-coded 50 paths and on-policy sampling relies
                            on the defaults of ``sample``.

    Raises:
        ValueError: if the random controller produced no paths, so there is
            nothing to fit the dynamics model on.
    """

    logz.configure_output_dir(logdir)

    #========================================================
    #
    # First, we need a lot of data generated by a random
    # agent, with which we'll begin to train our dynamics
    # model.

    random_controller = RandomController(env)

    # NOTE(review): the path count is hard-coded to 50 and `num_paths_random`
    # / `env_horizon` are ignored here -- confirm whether that is intended.
    paths = sample(env, random_controller, num_paths=50)
    if len(paths) == 0:
        raise ValueError("random controller produced no paths to train on")

    # Merge the per-path arrays into one dataset.  A single path is kept
    # as-is (no stacking) so its arrays retain their original shape.
    data = {}
    for key in ("observations", "next_observations", "rewards", "actions",
                "returns"):
        per_path = [path[key] for path in paths]
        data[key] = per_path[0] if len(per_path) == 1 else np.vstack(per_path)

    #========================================================
    #
    # The random data will be used to get statistics (mean
    # and std) for the observations, actions, and deltas
    # (where deltas are o_{t+1} - o_t). These will be used
    # for normalizing inputs and denormalizing outputs
    # from the dynamics network.
    #
    normalization = compute_normalization(data)

    #========================================================
    #
    # Build dynamics model and MPC controllers.
    #
    sess = tf.Session()

    dyn_model = NNDynamicsModel(env=env,
                                n_layers=n_layers,
                                size=size,
                                activation=activation,
                                output_activation=output_activation,
                                normalization=normalization,
                                batch_size=batch_size,
                                iterations=dynamics_iters,
                                learning_rate=learning_rate,
                                sess=sess)

    mpc_controller = MPCcontroller(env=env,
                                   dyn_model=dyn_model,
                                   horizon=mpc_horizon,
                                   cost_fn=cost_fn,
                                   num_simulated_paths=num_simulated_paths)

    #========================================================
    #
    # Tensorflow session building.
    #
    sess.__enter__()
    tf.global_variables_initializer().run()

    #========================================================
    #
    # Take multiple iterations of onpolicy aggregation: at each iteration
    # refit the dynamics model to the current dataset, then take onpolicy
    # samples and aggregate them into the dataset.
    # Note: You don't need to use a mixing ratio in this assignment for new
    # and old data as described in https://arxiv.org/abs/1708.02596
    #
    # The context manager guarantees the CSV file is closed (and therefore
    # fully written) even if training raises part-way through.
    with open('results.csv', 'w') as csv_file:
        writer = csv.writer(csv_file, delimiter=',')

        for itr in range(onpol_iters):
            print(itr)

            dyn_model.fit(data)
            # plot_comparison(env, dyn_model)

            # Re-binding the same model object; kept from the original
            # ("need to update or not?").
            mpc_controller.dyn_model = dyn_model

            # NOTE(review): `num_paths_onpol` / `env_horizon` are not passed,
            # so the defaults of `sample` decide how much data is collected.
            new_paths = sample(env, mpc_controller)

            costs = [path_cost(cost_fn, path) for path in new_paths]
            # The last entry of each path's 'returns' array is what gets
            # logged as that path's return.
            returns = [path['returns'][-1] for path in new_paths]

            # Only the fields listed here are aggregated; 'rewards' and
            # 'returns' in `data` keep the random-rollout values.
            for key in ('observations', 'next_observations', 'actions'):
                data[key] = np.vstack([data[key]] +
                                      [path[key] for path in new_paths])

            dyn_model.normalization = compute_normalization(data)

            writer.writerow([itr, np.mean(returns)])
            # One row per (slow) training iteration: flush so partial results
            # are visible on disk while training is still running.
            csv_file.flush()

            # LOGGING
            # Statistics for performance of MPC policy using
            # our learned dynamics model
            logz.log_tabular('Iteration', itr)
            # In terms of cost function which your MPC controller uses to plan
            logz.log_tabular('AverageCost', np.mean(costs))
            logz.log_tabular('StdCost', np.std(costs))
            logz.log_tabular('MinimumCost', np.min(costs))
            logz.log_tabular('MaximumCost', np.max(costs))
            # In terms of true environment reward of your rolled out
            # trajectory using the MPC controller
            logz.log_tabular('AverageReturn', np.mean(returns))
            logz.log_tabular('StdReturn', np.std(returns))
            logz.log_tabular('MinimumReturn', np.min(returns))
            logz.log_tabular('MaximumReturn', np.max(returns))

            logz.dump_tabular()
def train(
        env,
        cost_fn,
        logdir=None,
        render=False,
        learning_rate=1e-3,
        onpol_iters=10,
        dynamics_iters=60,
        batch_size=512,
        num_paths_random=10,
        num_paths_onpol=10,
        num_simulated_paths=10000,
        env_horizon=1000,
        mpc_horizon=15,
        n_layers=2,
        size=500,
        activation=tf.nn.relu,
        output_activation=None,
        clip_param=0.2,
        entcoeff=0.0,
        gamma=0.99,
        lam=0.95,
        optim_epochs=10,
        optim_batchsize=64,
        schedule='linear',
        optim_stepsize=3e-4,
        timesteps_per_actorbatch=1000,
        BEHAVIORAL_CLONING=True,
        PPO=True,
):
    """Train a dynamics model plus a policy updated by PPO and/or cloning.

    Random rollouts fill the model data buffer and provide normalization
    statistics.  Each of the ``onpol_iters`` iterations then:

      1. refits the dynamics model on the model data buffer,
      2. evaluates the policy network via ``behavioral_cloning_eval``,
      3. collects a trajectory segment with ``traj_segment_generator`` and
         computes advantages with ``add_vtarg_and_adv``,
      4. pushes the segment into the PPO, behavioural-cloning and model
         buffers,
      5. runs ``optim_epochs`` minibatch updates (PPO and/or behavioural
         cloning, depending on the flags),
      6. logs episode statistics through ``logz``.

    Arguments:
        env: environment used by the controllers, networks and samplers.
        cost_fn: cost function handed to both MPC controllers.
        logdir: output directory for ``logz``.
        learning_rate: learning rate for the dynamics model and ``BCnetwork``.
        onpol_iters: number of outer training iterations.
        dynamics_iters, batch_size: dynamics-model training settings.
        num_paths_random: number of random rollouts collected up front.
        num_paths_onpol: only used to compute ``max_timesteps`` for the
            linear step-size schedule.
        num_simulated_paths, mpc_horizon: MPC planning settings.
        env_horizon: horizon for random sampling, evaluation and the
            trajectory segment generator.
        n_layers, size, activation, output_activation: dynamics-network
            architecture arguments.
        clip_param, entcoeff: passed to the policy network constructor.
        gamma, lam: passed to ``add_vtarg_and_adv``.
        optim_epochs: number of minibatch updates per outer iteration.
        optim_batchsize: minibatch size drawn from the buffers.
        schedule: ``'constant'`` or ``'linear'`` step-size multiplier.
        optim_stepsize: base step size, scaled by the schedule multiplier.
        BEHAVIORAL_CLONING: enable the behavioural-cloning update.
        PPO: enable the PPO update.
        render, timesteps_per_actorbatch: accepted but not read by this
            implementation.

    Raises:
        ValueError: if ``schedule`` is not ``'constant'`` or ``'linear'``.
    """
    # Fail fast: previously an unknown schedule left `cur_lrmult` unbound and
    # surfaced as a NameError deep inside the optimisation loop.
    if schedule not in ('constant', 'linear'):
        raise ValueError(
            "schedule must be 'constant' or 'linear', got {!r}".format(
                schedule))

    start = time.time()

    logz.configure_output_dir(logdir)

    print("-------- env info --------")
    print("observation_space: ", env.observation_space.shape)
    print("action_space: ", env.action_space.shape)
    print("BEHAVIORAL_CLONING: ", BEHAVIORAL_CLONING)
    print("PPO: ", PPO)
    print(" ")

    random_controller = RandomController(env)
    model_data_buffer = DataBuffer()
    ppo_data_buffer = DataBuffer_general(BC_BUFFER_SIZE, 6)
    bc_data_buffer = DataBuffer_general(BC_BUFFER_SIZE, 2)

    # sample path
    print("collecting random data .....  ")
    paths = sample(env,
                   random_controller,
                   num_paths=num_paths_random,
                   horizon=env_horizon,
                   render=False,
                   verbose=False)

    # add into buffer
    for path in paths:
        for n in range(len(path['observations'])):
            model_data_buffer.add(path['observations'][n],
                                  path['actions'][n],
                                  path['next_observations'][n])

    print("model data buffer size: ", model_data_buffer.size)

    normalization = compute_normalization(model_data_buffer)

    #========================================================
    #
    # Build dynamics model and MPC controllers and Behavioral cloning network.
    #
    sess = tf.Session()

    dyn_model = NNDynamicsModel(env=env,
                                n_layers=n_layers,
                                size=size,
                                activation=activation,
                                output_activation=output_activation,
                                normalization=normalization,
                                batch_size=batch_size,
                                iterations=dynamics_iters,
                                learning_rate=learning_rate,
                                sess=sess)

    mpc_controller = MPCcontroller(env=env,
                                   dyn_model=dyn_model,
                                   horizon=mpc_horizon,
                                   cost_fn=cost_fn,
                                   num_simulated_paths=num_simulated_paths)

    policy_nn = MlpPolicy_bc(sess=sess,
                             env=env,
                             hid_size=64,
                             num_hid_layers=2,
                             clip_param=clip_param,
                             entcoeff=entcoeff)

    # Only referenced by the commented-out `behavioral_cloning` call below,
    # but still constructed as in the original.
    bc_net = BCnetwork(sess, env, BATCH_SIZE_BC, learning_rate)

    mpc_controller_bc_ppo = MPCcontroller_BC_PPO(
        env=env,
        dyn_model=dyn_model,
        bc_ppo_network=policy_nn,
        horizon=mpc_horizon,
        cost_fn=cost_fn,
        num_simulated_paths=num_simulated_paths)

    #========================================================
    #
    # Tensorflow session building.
    #
    sess.__enter__()
    tf.global_variables_initializer().run()

    # init or load checkpoint with saver
    saver = tf.train.Saver()

    checkpoint = tf.train.get_checkpoint_state(CHECKPOINT_DIR)

    if checkpoint and checkpoint.model_checkpoint_path and LOAD_MODEL:
        saver.restore(sess, checkpoint.model_checkpoint_path)
        print("checkpoint loaded:", checkpoint.model_checkpoint_path)
    else:
        print("Could not find old checkpoint")
        if not os.path.exists(CHECKPOINT_DIR):
            os.mkdir(CHECKPOINT_DIR)

    #========================================================
    #
    # Prepare for rollouts
    #
    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=100)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=100)  # rolling buffer for episode rewards
    # NOTE(review): the linear schedule decays to 0 after this many
    # timesteps; it is derived from `num_paths_onpol`, not `onpol_iters` --
    # confirm this is the intended annealing horizon.
    max_timesteps = num_paths_onpol * env_horizon

    for itr in range(onpol_iters):

        print("onpol_iters: ", itr)
        dyn_model.fit(model_data_buffer)

        # Step-size multiplier; `schedule` was validated on entry.
        if schedule == 'constant':
            cur_lrmult = 1.0
        else:  # 'linear'
            cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0)

        # saver.save(sess, CHECKPOINT_DIR)
        behavioral_cloning_eval(sess, env, policy_nn, env_horizon)

        ppo_data_buffer.clear()
        seg = traj_segment_generator(policy_nn, mpc_controller,
                                     mpc_controller_bc_ppo, bc_data_buffer,
                                     env, env_horizon)
        add_vtarg_and_adv(seg, gamma, lam)

        ob, ac, rew, nxt_ob, atarg, tdlamret = seg["ob"], seg["ac"], seg[
            "rew"], seg["nxt_ob"], seg["adv"], seg["tdlamret"]
        vpredbefore = seg["vpred"]  # predicted value function before update
        # Standardized advantage function estimate.
        # NOTE(review): yields NaN/inf if all advantages are equal (std == 0).
        atarg = (atarg - atarg.mean()) / atarg.std()

        for n in range(len(ob)):
            ppo_data_buffer.add(
                (ob[n], ac[n], rew[n], nxt_ob[n], atarg[n], tdlamret[n]))
            bc_data_buffer.add((ob[n], ac[n]))
            model_data_buffer.add(ob[n], ac[n], nxt_ob[n])

        print("ppo_data_buffer size", ppo_data_buffer.size)
        print("bc_data_buffer size", bc_data_buffer.size)
        print("model data buffer size: ", model_data_buffer.size)

        # optim_batchsize = optim_batchsize or ob.shape[0]

        # behavioral_cloning(sess, env, bc_net, mpc_controller, env_horizon,
        #                    bc_data_buffer, Training_epoch=1000)

        if hasattr(policy_nn, "ob_rms"):
            policy_nn.ob_rms.update(ob)  # update running mean/std for policy
        # set old parameter values to new parameter values
        policy_nn.assign_old_eq_new()

        for op_ep in range(optim_epochs):
            # losses = [] # list of tuples, each of which gives the loss for
            # a minibatch
            # for i in range(int(timesteps_per_actorbatch/optim_batchsize)):

            if PPO:
                (sample_ob_no, sample_ac_na, sample_rew, sample_nxt_ob_no,
                 sample_adv_n, sample_b_n_target) = ppo_data_buffer.sample(
                     optim_batchsize)
                newlosses = policy_nn.lossandupdate_ppo(
                    sample_ob_no, sample_ac_na, sample_adv_n,
                    sample_b_n_target, cur_lrmult,
                    optim_stepsize * cur_lrmult)
                # losses.append(newlosses)

            if BEHAVIORAL_CLONING:
                sample_ob_no, sample_ac_na = bc_data_buffer.sample(
                    optim_batchsize)
                policy_nn.update_bc(sample_ob_no, sample_ac_na,
                                    optim_stepsize * cur_lrmult)

            # Periodic progress report and policy evaluation.
            if op_ep % 100 == 0:
                print('epcho: ', op_ep)
                behavioral_cloning_eval(sess, env, policy_nn, env_horizon)

        # Gather episode lengths/returns from every MPI worker so the running
        # counters reflect all workers, not just this one.
        lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1

        # The logged statistics below use this worker's segment only.
        ep_lengths = seg["ep_lens"]
        returns = seg["ep_rets"]

        logz.log_tabular("Time", time.time() - start)
        logz.log_tabular("Iteration", iters_so_far)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        # logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", timesteps_so_far)
        logz.dump_tabular()
        logz.pickle_tf_vars()
def train(env,
          logdir=None,
          render=False,
          learning_rate=1e-3,
          dagger_iters=10,
          dynamics_iters=60,
          batch_size=512,
          num_random_rollouts=10,
          num_onpol_rollouts=10,
          num_simulated_paths=10000,
          env_horizon=1000,
          mpc_horizon=15,
          n_layers=2,
          n_hid_units=500,
          activation=tf.nn.relu,
          output_activation=None):
    """Train a neural dynamics model with MPC-driven data aggregation.

    Random rollouts provide the initial dataset and the normalization
    statistics.  Each aggregation iteration refits the dynamics model on the
    current paths, collects on-policy rollouts with the MPC controller, and
    replaces the dataset with those new rollouts plus a small random subset
    of the previous paths.  Cost and return statistics of the new rollouts
    are logged through ``logz``.

    Arguments:

    env                         Environment handed to the controllers, the
                                dynamics model and ``sample``.

    logdir                      Output directory passed to
                                ``logz.configure_output_dir``.

    learning_rate               Learning rate passed to the dynamics model.

    dagger_iters                Number of iterations of onpolicy aggregation
                                for the loop to run.

    dynamics_iters              Number of iterations of training for the
                                dynamics model which happen per iteration of
                                the aggregation loop.

    batch_size                  Batch size for dynamics training.

    num_random_rollouts         Number of paths/trajectories/rollouts
                                generated by a random agent. We use these to
                                train our initial dynamics model.

    num_onpol_rollouts          Number of paths to collect at each iteration
                                of aggregation, using the Model Predictive
                                Control policy.

    num_simulated_paths         How many fictitious rollouts the MPC policy
                                should generate each time it is asked for an
                                action.

    env_horizon                 Number of timesteps in each path.

    mpc_horizon                 The MPC policy generates actions by imagining
                                fictitious rollouts, and picking the first
                                action of the best fictitious rollout. This
                                argument is how many timesteps should be in
                                each fictitious rollout.

    n_layers/n_hid_units/activation/output_activation
                                Neural network architecture arguments.

    render                      Accepted but not read by this implementation.
    """

    logz.configure_output_dir(logdir)

    #========================================================
    #
    # First, we need a lot of data generated by a random
    # agent, with which we'll begin to train our dynamics
    # model.

    random_controller = RandomController(env)
    paths = sample(env,
                   random_controller,
                   num_rollouts=num_random_rollouts,
                   horizon=env_horizon)

    #========================================================
    #
    # The random data will be used to get statistics (mean
    # and std) for the observations, actions, and deltas
    # (where deltas are o_{t+1} - o_t). These will be used
    # for normalizing inputs and denormalizing outputs
    # from the dynamics network.

    normalization_stats = compute_normalization_stats(paths)

    #========================================================
    #
    # Build dynamics model and MPC controllers.
    #
    sess = tf.Session()

    dyn_model = NNDynamicsModel(env=env,
                                n_layers=n_layers,
                                n_hid_units=n_hid_units,
                                activation=activation,
                                output_activation=output_activation,
                                normalization_stats=normalization_stats,
                                batch_size=batch_size,
                                num_iter=dynamics_iters,
                                learning_rate=learning_rate,
                                sess=sess)

    mpc_controller = MPCcontroller(env=env,
                                   dyn_model=dyn_model,
                                   horizon=mpc_horizon,
                                   num_simulated_paths=num_simulated_paths)

    #========================================================
    #
    # Tensorflow session building.
    #
    sess.__enter__()
    tf.global_variables_initializer().run()

    #========================================================
    #
    # Take multiple iterations of onpolicy aggregation
    # at each iteration refitting the dynamics model to current dataset
    # and then taking on-policy samples and aggregating to the dataset.
    #
    # Note: You don't need to use a mixing ratio in this assignment
    # for new and old data as described in https://arxiv.org/abs/1708.02596
    #
    for i in range(dagger_iters):
        print('********** ITERATION {}/{} ************'.format(
            i + 1, dagger_iters))

        # Fitting dynamics model
        dyn_model.fit(paths)

        # Sampling on-policy
        new_paths = sample(env,
                           mpc_controller,
                           num_rollouts=num_onpol_rollouts,
                           horizon=env_horizon)

        # Adding new paths and forgetting old ones: keep one old path for
        # every nine new ones.  The count is clamped because random.sample
        # raises ValueError when asked for more items than are available.
        num_old_kept = min(len(paths), len(new_paths) // 9)
        paths = new_paths + random.sample(paths, num_old_kept)
        # paths += new_paths

        returns = [sum(path['rewards']) for path in new_paths]
        costs = [path_cost(path) for path in new_paths]

        # LOGGING
        # Statistics for performance of MPC policy using our learned
        # dynamics model
        # In terms of cost function which your MPC controller uses to plan
        logz.log_tabular('AverageCost', np.mean(costs))
        logz.log_tabular('StdCost', np.std(costs))
        logz.log_tabular('MinimumCost', np.min(costs))
        logz.log_tabular('MaximumCost', np.max(costs))
        # In terms of true environment reward of your rolled out trajectory
        # using the MPC controller
        logz.log_tabular('AverageReturn', np.mean(returns))
        logz.log_tabular('StdReturn', np.std(returns))
        logz.log_tabular('MinimumReturn', np.min(returns))
        logz.log_tabular('MaximumReturn', np.max(returns))

        logz.dump_tabular()