class PusherEnvModified(HelperEnv):
    """Wraps PusherEnv in the HelperEnv interface so it can be driven by the BC/PPO models."""

    def __init__(self):
        self.simulator = PusherEnv(render=False)

        def transition_function(state, action):
            # Apply the action in the underlying simulator and return the new observation
            self.simulator.apply_action(action)
            return self.simulator.get_obs()

        def reward_function(state, action):
            return self.simulator.compute_reward_push(state)

        HelperEnv.__init__(
            self,
            initial_state=self.simulator.get_obs(),
            transition_function=transition_function,
            reward_function=reward_function,
            state_space_dimension=9,
            action_space_dimension=2)

    def reset(self):
        self.simulator.reset()
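
# Minimal usage sketch (not part of the original script): a quick smoke test of the wrapper
# above through its underlying simulator, since HelperEnv's public stepping API is not shown
# in this excerpt. The function name and the zero action are illustrative assumptions.
def _smoke_test_pusher_wrapper():
    import numpy as np  # assumes numpy is available, as elsewhere in these scripts

    env = PusherEnvModified()
    sample_action = np.zeros(2)      # 2-D push action, per action_space_dimension above
    env.simulator.apply_action(sample_action)
    obs = env.simulator.get_obs()    # 9-D observation, per state_space_dimension above
    print("observation after zero push:", obs)
    env.reset()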
def main():
    logger.info("Instantiating model and importing weights")

    # Instantiate model and import pretrained weights
    bc_model = BehaviorCloningModel(
        num_envstate_dims=NUM_STATE_DIMS,
        num_action_dims=NUM_ACTION_DIMS,
        hidden_layer_sizes=HIDDEN_LAYER_SIZES,
        criterion=CRITERION,
        lr=LEARNING_RATE,
        activation=ACTIVATION,
        seed=SEED)
    bc_model.load_state_dict(torch.load("bcmodel_learned_params.pt"))

    # Load in data
    logger.info("Importing test data")
    dataset = np.load('./expert.npz')
    tensor_dataset = data.TensorDataset(torch.Tensor(dataset['obs']),
                                        torch.Tensor(dataset['action']))
    train_size = int(0.7 * len(tensor_dataset))
    test_size = len(tensor_dataset) - train_size
    train_dataset, test_dataset = data.random_split(tensor_dataset, [train_size, test_size])

    # Only want one push each time, so set batch_size to 1
    test_loader = data.DataLoader(test_dataset, batch_size=1, shuffle=True)

    env = PusherEnv()
    errors = []
    true_pushes = []
    pred_pushes = []

    logger.info("Running loop")
    for i, (state, action) in enumerate(test_loader):
        logger.info(f'Iteration #{i}')

        # Convert inputs to floats
        state = state.float()
        action = action.float()

        # Use model to predict action given state
        pred_action = bc_model(state)

        # Switch outputs from tensors to numpy arrays for easy use later
        state = state.data.numpy()[0]
        action = action.data.numpy()[0]
        pred_action = pred_action.data.numpy()[0]

        end_state, _, _, _ = env.step(action=pred_action)
        end_state = np.array(end_state)

        # Calculate errors
        action_error = np.linalg.norm(action - pred_action)
        state_error = np.linalg.norm(state - end_state)

        # Keep the results
        errors.append(dict(action_error=action_error, state_error=state_error))
        true_pushes.append(dict(d_x=action[0], d_y=action[1], state=state))
        pred_pushes.append(dict(d_x=pred_action[0], d_y=pred_action[1], state=end_state))

        if i > NUM_PUSHES - 1:
            break

    pd.DataFrame(errors).to_csv("results/P1/bc_model_errors.csv")
    pd.DataFrame(true_pushes).to_csv("results/P1/true_pushes.csv")
    pd.DataFrame(pred_pushes).to_csv("results/P1/pred_pushes.csv")
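
# Follow-up sketch (assumption, not in the original script): summarizing the error CSV
# written by main() above. The column names match the dicts appended to `errors`;
# reporting mean/std/standard-error is an assumed convention.
def summarize_bc_errors(path="results/P1/bc_model_errors.csv"):
    import numpy as np
    import pandas as pd

    df = pd.read_csv(path)
    summary = df[["action_error", "state_error"]].agg(["mean", "std"])
    summary.loc["sem"] = summary.loc["std"] / np.sqrt(len(df))  # standard error of the mean
    print(summary)
    return summary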
plt.figure()
plt.plot(range(num_epochs + 1), train_losses)
plt.plot(range(num_epochs + 1), valid_losses)
plt.ylabel("Loss (MSE)")
plt.xlabel("Epoch")
plt.title("Behavioral Cloning Training")
plt.legend(["Training Loss", "Validation Loss"])
plt.ylim(0, train_losses[1] * 2.0)
plt.savefig("behavioral_cloning_training.png")  # save before show(); show() clears the figure
plt.show()

# Evaluate model on 100 episodes
env = PusherEnv()
num_episodes = 100
avg_L2_dist = 0
avg_reward = 0
frame = 0
for i in range(num_episodes):
    done = False
    obs = env.reset()
    total_reward = 0
    while not done:
        action = model.infer(obs)
        obs, reward, done, info = env.step(action)
        total_reward += reward
        if i < 10:
def main():
    # Load data
    expert_data = np.load("./expert.npz")
    expert_data = TensorDataset(torch.tensor(expert_data["obs"]),
                                torch.tensor(expert_data["action"]))

    # Instantiate the environment (modified slightly from the form given to make recording easier later)
    environment = PusherEnvModified()

    # Instantiate the three models according to the problem statement
    policy = ActorCritic(
        state_space_dimension=environment.state_space_dimension,
        action_space_dimension=environment.action_space_dimension,
        actor_hidden_layer_units=(act_layer_one, act_layer_two),
        critic_hidden_layer_units=(crit_layer_one, crit_layer_two),
        actor_std=actor_std,
        activation=nn.Tanh)
    fromscratch_model = PPO_Model(
        environment=environment,
        policy=deepcopy(policy),
        bc_coefficient=0,
        n_steps_per_trajectory=n_steps_per_trajectory,
        n_trajectories_per_batch=n_trajectories_per_batch,
        n_epochs=n_epochs,
        n_iterations=n_iterations,
        learning_rate=learning_rate,
        clipping_param=clipping_param,
        entropy_coefficient=entropy_coefficient,
        seed=seed)

    # The two fine-tuned models start from the pretrained behavioral-cloning policy
    policy.load(path="./results/p1/bc_model_params.pt")
    jointlossfinetune_model = PPO_Model(
        environment=environment,
        policy=deepcopy(policy),
        bc_coefficient=0.1,
        n_steps_per_trajectory=n_steps_per_trajectory,
        n_trajectories_per_batch=n_trajectories_per_batch,
        n_epochs=n_epochs,
        n_iterations=n_iterations,
        learning_rate=learning_rate,
        clipping_param=clipping_param,
        entropy_coefficient=entropy_coefficient,
        seed=seed)
    vanillafinetune_model = PPO_Model(
        environment=environment,
        policy=deepcopy(policy),
        bc_coefficient=0,
        n_steps_per_trajectory=n_steps_per_trajectory,
        n_trajectories_per_batch=n_trajectories_per_batch,
        n_epochs=n_epochs,
        n_iterations=n_iterations,
        learning_rate=learning_rate,
        clipping_param=clipping_param,
        entropy_coefficient=entropy_coefficient,
        seed=seed)

    # Train each model
    vanillafinetune_model.train(train_critic_only_on_init=train_critic_only_on_init)
    jointlossfinetune_model.train(expert_data=expert_data,
                                  train_critic_only_on_init=train_critic_only_on_init)
    fromscratch_model.train()

    # First, generate results and video for the model trained from scratch
    fromscratch_model.save_training_rewards("./results/p2/rewards_fromscratchmodel")
    fromscratch_ltwo_dist_list = []
    fromscratch_trajectories_list = []
    for i in range(num_episodes_to_evaluate_on):
        _, actions, _, _ = fromscratch_model.generate_trajectory(
            use_argmax=True, perform_reset=False)
        fromscratch_trajectories_list.append(actions)
        state = fromscratch_model.environment.simulator.get_obs()
        fromscratch_ltwo_dist_list.append(np.linalg.norm(state[3:6] - state[6:9]))
        fromscratch_model.environment.reset()
    pd.DataFrame(
        {
            "mean_L2_distance": np.mean(fromscratch_ltwo_dist_list),
            "standard_error_L2_distance": np.std(fromscratch_ltwo_dist_list) / np.sqrt(len(fromscratch_ltwo_dist_list))
        },
        index=["from_scratch"]).to_csv("./results/p2/l2distances_fromscratchmodel.csv")

    # Using the trajectories generated above,
    # make a video showing evaluation of the policy on 10 episodes
    env_for_vid = PusherEnv(render=True)
    env_for_vid.render()
    vid_output = cv2.VideoWriter("./results/p2/p2_video_fromscratchmodel.mp4",
                                 cv2.VideoWriter_fourcc(*'mp4v'), 30, (640, 480))
    for given_trajectory in fromscratch_trajectories_list[:num_pushes_in_vid]:
        for action in given_trajectory:
            # Apply action and record into video
            env_for_vid.apply_action(action)
            scene_image = env_for_vid.robot.cam.get_images(get_rgb=True, get_depth=False)[0]
            vid_output.write(np.array(scene_image))
        # Reset video environment after a given push
        env_for_vid.reset()
    vid_output.release()  # finalize the video file
    # Second, generate results and video for the joint-loss fine-tuned model
    jointlossfinetune_model.save_training_rewards("./results/p2/rewards_jointlossfinetuned")
    jointlossfinetuned_ltwo_dist_list = []
    jointlossfinetuned_trajectories_list = []
    for i in range(num_episodes_to_evaluate_on):
        _, actions, _, _ = jointlossfinetune_model.generate_trajectory(
            use_argmax=True, perform_reset=False)
        jointlossfinetuned_trajectories_list.append(actions)
        state = jointlossfinetune_model.environment.simulator.get_obs()
        jointlossfinetuned_ltwo_dist_list.append(np.linalg.norm(state[3:6] - state[6:9]))
        jointlossfinetune_model.environment.reset()
    pd.DataFrame(
        {
            "mean_L2_distance": np.mean(jointlossfinetuned_ltwo_dist_list),
            "standard_error_L2_distance": np.std(jointlossfinetuned_ltwo_dist_list) / np.sqrt(len(jointlossfinetuned_ltwo_dist_list))
        },
        index=["jointloss_finetuned"]).to_csv("./results/p2/l2distances_jointlossfinetunedmodel.csv")

    # Using the trajectories generated above,
    # make a video showing evaluation of the policy on 10 episodes
    vid_output = cv2.VideoWriter("./results/p2/p2_video_jointlossfinetunedmodel.mp4",
                                 cv2.VideoWriter_fourcc(*'mp4v'), 30, (640, 480))
    for given_trajectory in jointlossfinetuned_trajectories_list[:num_pushes_in_vid]:
        for action in given_trajectory:
            # Apply action and record into video
            env_for_vid.apply_action(action)
            scene_image = env_for_vid.robot.cam.get_images(get_rgb=True, get_depth=False)[0]
            vid_output.write(np.array(scene_image))
        # Reset video environment after a given push
        env_for_vid.reset()
    vid_output.release()

    # Third, generate results and video for the vanilla fine-tuned model
    vanillafinetune_model.save_training_rewards("./results/p2/rewards_vanillafinetunedmodel")
    vanillafinetuned_ltwo_dist_list = []
    vanillafinetuned_trajectories_list = []
    for i in range(num_episodes_to_evaluate_on):
        _, actions, _, _ = vanillafinetune_model.generate_trajectory(
            use_argmax=True, perform_reset=False)
        vanillafinetuned_trajectories_list.append(actions)
        state = vanillafinetune_model.environment.simulator.get_obs()
        vanillafinetuned_ltwo_dist_list.append(np.linalg.norm(state[3:6] - state[6:9]))
        vanillafinetune_model.environment.reset()
    pd.DataFrame(
        {
            "mean_L2_distance": np.mean(vanillafinetuned_ltwo_dist_list),
            "standard_error_L2_distance": np.std(vanillafinetuned_ltwo_dist_list) / np.sqrt(len(vanillafinetuned_ltwo_dist_list))
        },
        index=["vanilla_finetuned"]).to_csv("./results/p2/l2distances_vanillafinetunedmodel.csv")

    # Using the trajectories generated above,
    # make a video showing evaluation of the policy on 10 episodes
    vid_output = cv2.VideoWriter("./results/p2/p2_video_vanillafinetunedmodel.mp4",
                                 cv2.VideoWriter_fourcc(*'mp4v'), 30, (640, 480))
    for given_trajectory in vanillafinetuned_trajectories_list[:num_pushes_in_vid]:
        for action in given_trajectory:
            # Apply action and record into video
            env_for_vid.apply_action(action)
            scene_image = env_for_vid.robot.cam.get_images(get_rgb=True, get_depth=False)[0]
            vid_output.write(np.array(scene_image))
        # Reset video environment after a given push
        env_for_vid.reset()
    vid_output.release()

    # Plot the learning curves for each policy
    plt.plot(fromscratch_model.mean_rewards, label="From-scratch policy")
    plt.plot(jointlossfinetune_model.mean_rewards, label="Joint-loss fine-tuned policy")
    plt.plot(vanillafinetune_model.mean_rewards, label="Vanilla fine-tuned policy")
    plt.title("Learning Curves for the Three Policies")
    plt.ylabel("Mean Rewards")
    plt.legend()
    plt.savefig("./results/p2/learningcurves_chart.png")
    plt.close()
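
# Refactoring sketch (not part of the original script): the three evaluation blocks in main()
# above repeat the same pattern, so a helper like this could replace them. It uses only calls
# already appearing above (generate_trajectory, the simulator's get_obs, environment.reset);
# the function name and signature are assumptions, and the slice interpretation is the same
# object-vs-goal split of the 9-D observation used above.
def evaluate_l2_distances(model, num_episodes):
    """Return per-episode object-to-goal L2 distances and the generated action trajectories."""
    import numpy as np

    distances, trajectories = [], []
    for _ in range(num_episodes):
        _, actions, _, _ = model.generate_trajectory(use_argmax=True, perform_reset=False)
        trajectories.append(actions)
        state = model.environment.simulator.get_obs()
        distances.append(np.linalg.norm(state[3:6] - state[6:9]))  # assumed object vs. goal coordinates
        model.environment.reset()
    return distances, trajectories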
def main():
    # Load data
    expert_data = np.load("./expert.npz")
    expert_data = TensorDataset(torch.tensor(expert_data["obs"]),
                                torch.tensor(expert_data["action"]))

    # Instantiate the environment (modified slightly from the form given to make recording easier later)
    environment = PusherEnvModified()

    policy = ActorCritic(
        state_space_dimension=environment.state_space_dimension,
        action_space_dimension=environment.action_space_dimension,
        actor_hidden_layer_units=(act_layer_one, act_layer_two),
        critic_hidden_layer_units=(crit_layer_one, crit_layer_two),
        actor_std=4e-2,
        activation=nn.Tanh)

    # Use the policy from above to instantiate our behavioral cloning model
    bc_model = BC_Model(policy=deepcopy(policy),
                        batch_size=batch_size,
                        num_epochs=num_epochs,
                        learning_rate=learning_rate)

    # Train the model and save the resulting policy parameters
    bc_model.train(expert_data=expert_data)
    bc_model.policy.save(path="./results/p1/bc_model_params.pt")
    pd.DataFrame(bc_model.training_loss_list,
                 columns=["train_loss"]).to_csv("./results/p1/bc_train_loss.csv")
    pd.DataFrame(bc_model.avg_loss_list,
                 columns=["avg_train_loss"]).to_csv("./results/p1/bc_avg_train_loss.csv")

    # Plot training loss
    plt.plot(bc_model.training_loss_list, label="Training loss")
    plt.title("Loss as a Function of Time")
    plt.xlabel("# of batches")
    plt.legend()
    plt.savefig("./results/p1/bc_train_loss_chart.png")
    plt.close()

    # Plot average training loss
    plt.plot(bc_model.avg_loss_list, label="Average training loss per epoch")
    plt.title("Avg. Loss as a Function of Time")
    plt.xlabel("# of epochs")
    plt.legend()
    plt.savefig("./results/p1/bc_avg_train_loss_chart.png")
    plt.close()

    # Now use the policy from the trained behavioral cloning model, and compare the results
    produced_model = PPO_Model(environment=environment,
                               policy=deepcopy(bc_model.policy),
                               n_steps_per_trajectory=64)

    # For comparison, we evaluate the learned policy on 100 episodes
    ltwo_dist_list = []
    trajectories_list = []
    for i in range(num_episodes_to_evaluate_on):
        _, actions, _, _ = produced_model.generate_trajectory(use_argmax=True,
                                                              perform_reset=False)
        trajectories_list.append(actions)
        state = produced_model.environment.simulator.get_obs()
        ltwo_dist_list.append(np.linalg.norm(state[3:6] - state[6:9]))
        produced_model.environment.reset()
    pd.DataFrame(
        {
            "mean_L2_distance": np.mean(ltwo_dist_list),
            "standard_error_L2_distance": np.std(ltwo_dist_list) / np.sqrt(len(ltwo_dist_list))
        },
        index=["BC"]).to_csv("./results/p1/bc_l2distance.csv")

    # Using the trajectories generated above,
    # make a video showing evaluation of the policy on 10 episodes
    env_for_vid = PusherEnv(render=True)
    env_for_vid.render()
    vid_output = cv2.VideoWriter(vid_path, cv2.VideoWriter_fourcc(*'mp4v'), 30, (640, 480))
    for given_trajectory in trajectories_list[:num_pushes_in_vid]:
        for action in given_trajectory:
            # Apply action and record into video
            env_for_vid.apply_action(action)
            scene_image = env_for_vid.robot.cam.get_images(get_rgb=True, get_depth=False)[0]
            vid_output.write(np.array(scene_image))
        # Reset video environment after a given push
        env_for_vid.reset()
    vid_output.release()  # finalize the video file
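
# Entry-point sketch (assumption): the hyperparameters referenced in main() are defined
# elsewhere in the original script. The placeholder values below are illustrative only,
# except where a comment in main() pins them down (100 evaluation episodes, 10 pushes in
# the video); the vid_path value is likewise an assumed output location.
act_layer_one, act_layer_two = 64, 64                  # assumed actor hidden-layer sizes
crit_layer_one, crit_layer_two = 64, 64                # assumed critic hidden-layer sizes
batch_size, num_epochs, learning_rate = 64, 10, 3e-4   # assumed BC training hyperparameters
num_episodes_to_evaluate_on = 100                      # matches the "100 episodes" comment above
num_pushes_in_vid = 10                                 # matches the "10 episodes" comment above
vid_path = "./results/p1/bc_policy_video.mp4"          # assumed path for the evaluation video

if __name__ == "__main__":
    main()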