import csv
import time

import numpy as np
import tensorflow as tf

# Project helpers used below (logz, RandomController, MPCcontroller,
# NNDynamicsModel, sample, paths_to_array, compute_normalization,
# path_cost, plot_comparison) are assumed to be provided by the
# accompanying modules of this repository.


def train(state_cb, pub_cmd, pub_act, rate, cost_fn,
          logdir=None,
          render=False,
          learning_rate=1e-3,
          onpol_iters=10,
          dynamics_iters=60,
          batch_size=512,
          num_paths_random=10,
          num_paths_onpol=10,
          num_simulated_paths=10000,
          env_horizon=1000,
          mpc_horizon=15,
          n_layers=2,
          size=500,
          activation=tf.nn.relu,
          output_activation=None):
    """
    Arguments:

    onpol_iters          Number of iterations of on-policy aggregation
                         for the loop to run.

    dynamics_iters       Number of iterations of training for the dynamics
                         model which happen per iteration of the
                         aggregation loop.

    batch_size           Batch size for dynamics training.

    num_paths_random     Number of paths/trajectories/rollouts generated
                         by a random agent. We use these to train our
                         initial dynamics model.

    num_paths_onpol      Number of paths to collect at each iteration of
                         aggregation, using the Model Predictive Control
                         policy.

    num_simulated_paths  How many fictitious rollouts the MPC policy
                         should generate each time it is asked for an
                         action.

    env_horizon          Number of timesteps in each path.

    mpc_horizon          The MPC policy generates actions by imagining
                         fictitious rollouts, and picking the first action
                         of the best fictitious rollout. This argument is
                         how many timesteps should be in each fictitious
                         rollout.

    n_layers / size / activation / output_activation
                         Neural network architecture arguments.
    """
    logz.configure_output_dir(logdir)

    #========================================================
    #
    # First, we need a lot of data generated by a random
    # agent, with which we'll begin to train our dynamics
    # model.

    rand_controller = RandomController()
    paths = sample(state_cb, pub_cmd, pub_act, rate, rand_controller,
                   num_paths_random, env_horizon, render)
    data = paths_to_array(paths)

    #========================================================
    #
    # The random data will be used to get statistics (mean
    # and std) for the observations, actions, and deltas
    # (where deltas are o_{t+1} - o_t). These will be used
    # for normalizing inputs and denormalizing outputs
    # from the dynamics network.
    #
    normalization = compute_normalization(data)

    #========================================================
    #
    # Build dynamics model and MPC controllers.
    #
    sess = tf.Session()

    dyn_model = NNDynamicsModel(n_layers=n_layers,
                                size=size,
                                activation=activation,
                                output_activation=output_activation,
                                normalization=normalization,
                                batch_size=batch_size,
                                iterations=dynamics_iters,
                                learning_rate=learning_rate,
                                sess=sess)

    mpc_controller = MPCcontroller(dyn_model=dyn_model,
                                   horizon=mpc_horizon,
                                   cost_fn=cost_fn,
                                   num_simulated_paths=num_simulated_paths)

    #========================================================
    #
    # Tensorflow session building.
    #
    sess.__enter__()
    tf.global_variables_initializer().run()

    #========================================================
    #
    # Take multiple iterations of on-policy aggregation: at each
    # iteration, refit the dynamics model to the current dataset,
    # then take on-policy samples and aggregate them into the dataset.
    # Note: You don't need to use a mixing ratio in this assignment for new
    # and old data as described in https://arxiv.org/abs/1708.02596
    #
    for itr in range(onpol_iters):
        # Fit dynamics model
        print('Training dynamics model...')
        dyn_model.fit(data)
        plot_comparison(dyn_model, state_cb, pub_act, pub_cmd, rate)
        mpc_controller.dyn_model = dyn_model

        costs = []
        returns = []

        # Do MPC
        for i in range(num_paths_onpol):
            print('On policy path: %i' % i)
            obs_t, obs_tp1, acs_t, rews_t = [], [], [], []
            s_t = state_cb.reset(pub_act, pub_cmd)
            total_return = 0

            for j in range(env_horizon):
                # print('Timestep: %i, Return: %g' % (j, total_return))
                a_t = mpc_controller.get_action(s_t)
                s_tp1, _ = state_cb.step(a_t, pub_act, pub_cmd)

                # Reward: change in every 12th state entry (indices 0, 12,
                # ..., 96) between steps. The loop variable is `k` so that
                # the on-policy path index `i` above is not shadowed.
                r_t = 0
                for k in range(9):
                    r_t += s_tp1[k * 12] - s_t[k * 12]
                total_return += r_t

                if render:
                    # There is no gym `env` object in this ROS-based
                    # variant, so only pause between steps here.
                    # env.render()
                    time.sleep(0.05)

                obs_t.append(s_t)
                obs_tp1.append(s_tp1)
                acs_t.append(a_t)
                rews_t.append(r_t)
                s_t = s_tp1

            path = {"observations": np.array(obs_t),
                    "next_observations": np.array(obs_tp1),
                    "actions": np.array(acs_t),
                    "rewards": np.array(rews_t)}
            total_cost = path_cost(cost_fn, path)

            paths.append(path)
            returns.append(total_return)
            costs.append(total_cost)
            print('Total cost: %g, Total reward: %g' % (total_cost, total_return))

        data = paths_to_array(paths)
        normalization = compute_normalization(data)
        # Set new normalization statistics for the dynamics model
        dyn_model.normalization = normalization

        # LOGGING
        # Statistics for performance of MPC policy using
        # our learned dynamics model
        logz.log_tabular('Iteration', itr)
        # In terms of cost function which your MPC controller uses to plan
        logz.log_tabular('AverageCost', np.mean(costs))
        logz.log_tabular('StdCost', np.std(costs))
        logz.log_tabular('MinimumCost', np.min(costs))
        logz.log_tabular('MaximumCost', np.max(costs))
        # In terms of true environment reward of your rolled-out trajectory
        # using the MPC controller
        logz.log_tabular('AverageReturn', np.mean(returns))
        logz.log_tabular('StdReturn', np.std(returns))
        logz.log_tabular('MinimumReturn', np.min(returns))
        logz.log_tabular('MaximumReturn', np.max(returns))
        logz.dump_tabular()
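
#========================================================
#
# Illustrative sketch only (an assumption, not this project's actual
# compute_normalization helper): following the comments above, the
# normalization statistics are taken to be the means and standard
# deviations of the observations, the actions, and the deltas
# o_{t+1} - o_t, which the dynamics network uses to normalize its
# inputs and denormalize its outputs. The real helper may return a
# different structure (e.g. a tuple instead of a dict).
#
def compute_normalization_sketch(data, eps=1e-8):
    # Deltas are the one-step state changes the dynamics model predicts.
    deltas = data['next_observations'] - data['observations']
    return {
        'mean_obs': np.mean(data['observations'], axis=0),
        'std_obs': np.std(data['observations'], axis=0) + eps,  # eps guards against divide-by-zero
        'mean_deltas': np.mean(deltas, axis=0),
        'std_deltas': np.std(deltas, axis=0) + eps,
        'mean_actions': np.mean(data['actions'], axis=0),
        'std_actions': np.std(data['actions'], axis=0) + eps,
    }
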
def train(env, cost_fn,
          logdir=None,
          render=False,
          learning_rate=1e-3,
          onpol_iters=10,
          dynamics_iters=60,
          batch_size=512,
          num_paths_random=10,
          num_paths_onpol=10,
          num_simulated_paths=10000,
          env_horizon=1000,
          mpc_horizon=10,
          n_layers=2,
          size=500,
          activation=tf.nn.relu,
          output_activation=None):
    """
    Arguments:

    onpol_iters          Number of iterations of on-policy aggregation
                         for the loop to run.

    dynamics_iters       Number of iterations of training for the dynamics
                         model which happen per iteration of the
                         aggregation loop.

    batch_size           Batch size for dynamics training.

    num_paths_random     Number of paths/trajectories/rollouts generated
                         by a random agent. We use these to train our
                         initial dynamics model.

    num_paths_onpol      Number of paths to collect at each iteration of
                         aggregation, using the Model Predictive Control
                         policy.

    num_simulated_paths  How many fictitious rollouts the MPC policy
                         should generate each time it is asked for an
                         action.

    env_horizon          Number of timesteps in each path.

    mpc_horizon          The MPC policy generates actions by imagining
                         fictitious rollouts, and picking the first action
                         of the best fictitious rollout. This argument is
                         how many timesteps should be in each fictitious
                         rollout.

    n_layers / size / activation / output_activation
                         Neural network architecture arguments.
    """
    logz.configure_output_dir(logdir)

    #========================================================
    #
    # First, we need a lot of data generated by a random
    # agent, with which we'll begin to train our dynamics
    # model.

    random_controller = RandomController(env)

    """ YOUR CODE HERE """
    paths = sample(env, random_controller, num_paths=50)

    # Stack the sampled paths into a single dataset.
    first = True
    for path in paths:
        if first:
            data = {"observations": path['observations'],
                    "next_observations": path['next_observations'],
                    "rewards": path['rewards'],
                    "actions": path['actions'],
                    "returns": path['returns']}
            first = False
        else:
            data['observations'] = np.vstack((data['observations'],
                                              path['observations']))
            data['next_observations'] = np.vstack((data['next_observations'],
                                                   path['next_observations']))
            data['rewards'] = np.vstack((data['rewards'], path['rewards']))
            data['actions'] = np.vstack((data['actions'], path['actions']))
            data['returns'] = np.vstack((data['returns'], path['returns']))

    #========================================================
    #
    # The random data will be used to get statistics (mean
    # and std) for the observations, actions, and deltas
    # (where deltas are o_{t+1} - o_t). These will be used
    # for normalizing inputs and denormalizing outputs
    # from the dynamics network.
    #
    """ YOUR CODE HERE """
    normalization = compute_normalization(data)

    #========================================================
    #
    # Build dynamics model and MPC controllers.
    #
    sess = tf.Session()

    dyn_model = NNDynamicsModel(env=env,
                                n_layers=n_layers,
                                size=size,
                                activation=activation,
                                output_activation=output_activation,
                                normalization=normalization,
                                batch_size=batch_size,
                                iterations=dynamics_iters,
                                learning_rate=learning_rate,
                                sess=sess)

    mpc_controller = MPCcontroller(env=env,
                                   dyn_model=dyn_model,
                                   horizon=mpc_horizon,
                                   cost_fn=cost_fn,
                                   num_simulated_paths=num_simulated_paths)

    #========================================================
    #
    # Tensorflow session building.
    #
    sess.__enter__()
    tf.global_variables_initializer().run()

    #========================================================
    #
    # Take multiple iterations of on-policy aggregation: at each
    # iteration, refit the dynamics model to the current dataset,
    # then take on-policy samples and aggregate them into the dataset.
    # Note: You don't need to use a mixing ratio in this assignment for new
    # and old data as described in https://arxiv.org/abs/1708.02596
    #

    # Open a CSV file for per-iteration average returns.
    csv_file = open('results.csv', 'w')
    writer = csv.writer(csv_file, delimiter=',')

    for itr in range(onpol_iters):
        print(itr)
        costs = []
        returns = []

        """ YOUR CODE HERE """
        # Refit the dynamics model on the aggregated dataset.
        dyn_model.fit(data)
        # plot_comparison(env, dyn_model)
        mpc_controller.dyn_model = dyn_model  # need to update or not?

        # Collect on-policy samples with the MPC controller and
        # aggregate them into the dataset.
        new_paths = sample(env, mpc_controller)
        for path in new_paths:
            cost = path_cost(cost_fn, path)
            costs.append(cost)
            returns.append(path['returns'][-1])
            data['observations'] = np.vstack((data['observations'],
                                              path['observations']))
            data['next_observations'] = np.vstack((data['next_observations'],
                                                   path['next_observations']))
            data['actions'] = np.vstack((data['actions'], path['actions']))

        # Set new normalization statistics for the dynamics model.
        dyn_model.normalization = compute_normalization(data)

        writer.writerow([itr, np.mean(returns)])

        # LOGGING
        # Statistics for performance of MPC policy using
        # our learned dynamics model
        logz.log_tabular('Iteration', itr)
        # In terms of cost function which your MPC controller uses to plan
        logz.log_tabular('AverageCost', np.mean(costs))
        logz.log_tabular('StdCost', np.std(costs))
        logz.log_tabular('MinimumCost', np.min(costs))
        logz.log_tabular('MaximumCost', np.max(costs))
        # In terms of true environment reward of your rolled-out trajectory
        # using the MPC controller
        logz.log_tabular('AverageReturn', np.mean(returns))
        logz.log_tabular('StdReturn', np.std(returns))
        logz.log_tabular('MinimumReturn', np.min(returns))
        logz.log_tabular('MaximumReturn', np.max(returns))
        logz.dump_tabular()

    # Close the results file once all aggregation iterations are done.
    csv_file.close()
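
#========================================================
#
# Illustrative sketch only (an assumption, not this project's actual
# MPCcontroller): one random-shooting MPC step as described in the
# docstrings above. `num_simulated_paths` random action sequences of
# length `horizon` are rolled out through the learned dynamics model,
# each fictitious rollout is scored with the cost function, and the
# first action of the cheapest rollout is returned. The interfaces
# `dyn_model.predict(states, actions)`, `action_space.sample()`, and a
# batched `cost_fn(states, actions, next_states)` returning one cost
# per rollout are assumptions about the surrounding code.
#
def mpc_get_action_sketch(state, dyn_model, action_space, cost_fn,
                          horizon=10, num_simulated_paths=10000):
    # One copy of the current state per fictitious rollout.
    states = np.tile(state, (num_simulated_paths, 1))
    total_costs = np.zeros(num_simulated_paths)
    first_actions = None
    for t in range(horizon):
        # Sample one random action per rollout.
        actions = np.stack([action_space.sample()
                            for _ in range(num_simulated_paths)])
        if t == 0:
            first_actions = actions
        # Batched one-step prediction with the learned dynamics model.
        next_states = dyn_model.predict(states, actions)
        # Accumulate the per-step cost of each rollout.
        total_costs += cost_fn(states, actions, next_states)
        states = next_states
    # Return the first action of the cheapest fictitious rollout.
    return first_actions[np.argmin(total_costs)]
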