Example #1
def main():

    env = PusherEnv(render=True)

    # Ground truth push videos
    logger.info("Recording ground truth videos")
    ground_truth_data_path = "results/P1/true_pushes.csv"

    for i, push in pd.read_csv(ground_truth_data_path, index_col=0).iterrows():
        logger.info(f'Video {i}')

        ################
        # state = push["state"]
        # actions = push["action"]

        state = np.array(push["state"])
        action = np.array([push["d_x"], push["d_y"]])

        # state = np.array([push["obj_x"], push["obj_y"]])
        # actions = [np.array([push["start_push_x"], push["start_push_y"], push["end_push_x"], push["end_push_y"]])]
        ######################

        # Record video
        log_id = pybullet.startStateLogging(
            pybullet.STATE_LOGGING_VIDEO_MP4,
            f"results/P1/vids/true_pushes{i}.mp4")
        env.reset()
        # for action in actions:
        #     state, _, _, _ = env.step(action=action)

        state, _, _, _ = env.step(action=action)

        pybullet.stopStateLogging(log_id)

    # Predicted push videos
    predicted_data_path = "results/P1/pred_pushes.csv"
    logger.info("Recording prediction videos")
    for i, push in pd.read_csv(predicted_data_path, index_col=0).iterrows():
        logger.info(f'Video {i}')

        #######################
        # state = push["state"]
        # actions = push["action"]

        state = np.array(push["state"])
        action = np.array([push["d_x"], push["d_y"]])

        # state = np.array([push["obj_x"], push["obj_y"]])
        # actions = [np.array([push["start_push_x"], push["start_push_y"], push["end_push_x"], push["end_push_y"]])]
        ########################

        # Record video
        log_id = pybullet.startStateLogging(
            pybullet.STATE_LOGGING_VIDEO_MP4,
            f"results/P1/vids/pred_pushes{i}.mp4")
        env.reset()
        # for action in actions:
        #     state, _, _, _ = env.step(action=action)
        state, _, _, _ = env.step(action=action)
        pybullet.stopStateLogging(log_id)
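
Both loops above repeat the same start/stop video-logging pattern. As a small, optional refactoring sketch that relies only on the documented pybullet state-logging calls (the name mp4_recording is invented here), the pattern could be wrapped in a context manager:

import contextlib

import pybullet


@contextlib.contextmanager
def mp4_recording(path):
    """Record everything executed inside the `with` block to an MP4 file."""
    log_id = pybullet.startStateLogging(pybullet.STATE_LOGGING_VIDEO_MP4, path)
    try:
        yield log_id
    finally:
        # Stop this specific log even if the body raises.
        pybullet.stopStateLogging(log_id)


# Hypothetical usage mirroring the loops above:
# with mp4_recording(f"results/P1/vids/true_pushes{i}.mp4"):
#     env.reset()
#     state, _, _, _ = env.step(action=action)
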
class PusherEnvModified(HelperEnv):
    def __init__(self):
        self.simulator = PusherEnv(render=False)

        def transition_function(state, action):
            self.simulator.apply_action(action)
            return self.simulator.get_obs()

        def reward_function(state, action):
            return self.simulator.compute_reward_push(state)

        HelperEnv.__init__(self,
                           initial_state=self.simulator.get_obs(),
                           transition_function=transition_function,
                           reward_function=reward_function,
                           state_space_dimension=9,
                           action_space_dimension=2)

    def reset(self):
        self.simulator.reset()
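
PusherEnvModified builds on a HelperEnv base class that is not included in this listing. Purely for orientation, a minimal sketch of the interface it appears to expect could look like the following; everything beyond the constructor keywords visible above (in particular the step() semantics) is an assumption, not the project's actual implementation.

import numpy as np


class HelperEnv:
    """Sketch of the base-class interface that PusherEnvModified assumes.

    Only the constructor keywords are taken from the example; the behaviour
    of step() and reset() is an educated guess.
    """

    def __init__(self, initial_state, transition_function, reward_function,
                 state_space_dimension, action_space_dimension):
        self.state = np.asarray(initial_state)
        self.transition_function = transition_function
        self.reward_function = reward_function
        self.state_space_dimension = state_space_dimension
        self.action_space_dimension = action_space_dimension

    def step(self, action):
        # Guess: advance the injected dynamics, then score the new state.
        self.state = np.asarray(self.transition_function(self.state, action))
        reward = self.reward_function(self.state, action)
        return self.state, reward

    def reset(self):
        # Subclasses such as PusherEnvModified override this.
        raise NotImplementedError
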
Example #3
def evaluate_policy(args):
    with torch.no_grad():
        # load model
        save_path = os.path.join(args.save_dir, args.algo)
        save_file = os.path.join(save_path, args.env_name + ".pt")

        actor_critic = torch.load(save_file)[0]
        
        model = PusherPolicyModel()

        model.net.fc1.weight.data.copy_(actor_critic.base.actor[0].weight.data)
        model.net.fc1.bias.data.copy_(actor_critic.base.actor[0].bias.data)
        model.net.fc2.weight.data.copy_(actor_critic.base.actor[2].weight.data)
        model.net.fc2.bias.data.copy_(actor_critic.base.actor[2].bias.data)
        model.net.fc3.weight.data.copy_(actor_critic.dist.fc_mean.weight.data)
        model.net.fc3.bias.data.copy_(actor_critic.dist.fc_mean.bias.data)

        #device = torch.device("cuda:0" if args.cuda else "cpu")
        # make env
        env = PusherEnv()
        #envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
        #                 args.gamma, args.log_dir, device, True)

        # do episodes
        num_episodes = 100
        avg_L2_dist = 0

        frame = 0        

        for i in range(num_episodes):
            done = False
            obs = env.reset()

            while not done:
                #value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(torch.tensor(obs).float(), None, None, deterministic=True)
                action = model.infer(obs)
                #print(action.numpy()[0,1,0])
                obs, reward, done, info = env.step(action)
                if i < 10:
                    rgb = env.render()
                    im = Image.fromarray(rgb)
                    im.save('imgs/{}{:04d}.png'.format(args.algo, frame))
                    frame += 1

            print(obs)
            dist = np.linalg.norm(obs[3:6] - obs[6:9])
            print(dist)
            avg_L2_dist += dist / num_episodes

        print("Average L2 distance, 100 trials:", avg_L2_dist)\
Example #4
def main():
    # Load data
    expert_data = np.load("./expert.npz")
    expert_data = TensorDataset(torch.tensor(expert_data["obs"]),
                                torch.tensor(expert_data["action"]))

    # Instantiate the environment (modified slightly from the version provided to make recording easier later)
    environment = PusherEnvModified()

    # Instantiate the three models according to the problem statement
    policy = ActorCritic(
        state_space_dimension=environment.state_space_dimension,
        action_space_dimension=environment.action_space_dimension,
        actor_hidden_layer_units=(act_layer_one, act_layer_two),
        critic_hidden_layer_units=(crit_layer_one, crit_layer_two),
        actor_std=actor_std,
        activation=nn.Tanh)

    fromscratch_model = PPO_Model(
        environment=environment,
        policy=deepcopy(policy),
        bc_coefficient=0,
        n_steps_per_trajectory=n_steps_per_trajectory,
        n_trajectories_per_batch=n_trajectories_per_batch,
        n_epochs=n_epochs,
        n_iterations=n_iterations,
        learning_rate=learning_rate,
        clipping_param=clipping_param,
        entropy_coefficient=entropy_coefficient,
        seed=seed)

    policy.load(path="./results/p1/bc_model_params.pt")
    jointlossfinetune_model = PPO_Model(
        environment=environment,
        policy=deepcopy(policy),
        bc_coefficient=0.1,
        n_steps_per_trajectory=n_steps_per_trajectory,
        n_trajectories_per_batch=n_trajectories_per_batch,
        n_epochs=n_epochs,
        n_iterations=n_iterations,
        learning_rate=learning_rate,
        clipping_param=clipping_param,
        entropy_coefficient=entropy_coefficient,
        seed=seed)

    vanillafinetune_model = PPO_Model(
        environment=environment,
        policy=deepcopy(policy),
        bc_coefficient=0,
        n_steps_per_trajectory=n_steps_per_trajectory,
        n_trajectories_per_batch=n_trajectories_per_batch,
        n_epochs=n_epochs,
        n_iterations=n_iterations,
        learning_rate=learning_rate,
        clipping_param=clipping_param,
        entropy_coefficient=entropy_coefficient,
        seed=seed)

    # Train each
    vanillafinetune_model.train(
        train_critic_only_on_init=train_critic_only_on_init)
    jointlossfinetune_model.train(
        expert_data=expert_data,
        train_critic_only_on_init=train_critic_only_on_init)
    fromscratch_model.train()

    # First, generate results and video for model trained from scratch
    fromscratch_model.save_training_rewards(
        "./results/p2/rewards_fromscratchmodel")

    fromscratch_ltwo_dist_list = []
    fromscratch_trajectories_list = []
    for i in range(num_episodes_to_evaluate_on):
        _, actions, _, _ = fromscratch_model.generate_trajectory(
            use_argmax=True, perform_reset=False)

        fromscratch_trajectories_list.append(actions)
        state = fromscratch_model.environment.simulator.get_obs()
        fromscratch_ltwo_dist_list.append(
            np.linalg.norm(state[3:6] - state[6:9]))
        fromscratch_model.environment.reset()

    pd.DataFrame(
        {
            "mean_L2_distance": np.mean(fromscratch_ltwo_dist_list),
            "standard_L2dist": (np.std(fromscratch_ltwo_dist_list) /
                                np.sqrt(len(fromscratch_ltwo_dist_list))),
        },
        index=["from_scratch"],
    ).to_csv("./results/p2/l2distances_fromscratchmodel.csv")

    # Using the trajectories generated above,
    # make video showing evaluation of policy on 10 episodes
    env_for_vid = PusherEnv(render=True)
    env_for_vid.render()
    vid_output = cv2.VideoWriter("./results/p2/p2_video_fromscratchmodel.mp4",
                                 cv2.VideoWriter_fourcc(*'mp4v'), 30,
                                 (640, 480))
    for given_trajectory in fromscratch_trajectories_list[:num_pushes_in_vid]:
        for action in given_trajectory:

            # apply action and record into video
            env_for_vid.apply_action(action)
            scene_image = env_for_vid.robot.cam.get_images(get_rgb=True,
                                                           get_depth=False)[0]
            vid_output.write(np.array(scene_image))

        # Reset video environment after a given push
        env_for_vid.reset()

    # Release the writer so the MP4 file is finalized
    vid_output.release()

    # Second, generate results and video for joint-loss fine-tuned model
    jointlossfinetune_model.save_training_rewards(
        "./results/p2/rewards_jointlossfinetuned")

    jointlossfinetuned_ltwo_dist_list = []
    jointlossfinetuned_trajectories_list = []
    for i in range(num_episodes_to_evaluate_on):
        _, actions, _, _ = jointlossfinetune_model.generate_trajectory(
            use_argmax=True, perform_reset=False)

        jointlossfinetuned_trajectories_list.append(actions)
        state = jointlossfinetune_model.environment.simulator.get_obs()
        jointlossfinetuned_ltwo_dist_list.append(
            np.linalg.norm(state[3:6] - state[6:9]))
        jointlossfinetune_model.environment.reset()

    pd.DataFrame(
        {
            "mean_L2_distance": np.mean(jointlossfinetuned_ltwo_dist_list),
            "standard_L2dist": (np.std(jointlossfinetuned_ltwo_dist_list) /
                                np.sqrt(len(jointlossfinetuned_ltwo_dist_list))),
        },
        index=["jointloss_finetuned"],
    ).to_csv("./results/p2/l2distances_jointlossfinetunedmodel.csv")

    # Using the trajectories generated above,
    # make video showing evaluation of policy on 10 episodes

    vid_output = cv2.VideoWriter(
        "./results/p2/p2_video_jointlossfinetunedmodel.mp4",
        cv2.VideoWriter_fourcc(*'mp4v'), 30, (640, 480))
    for given_trajectory in jointlossfinetuned_trajectories_list[:num_pushes_in_vid]:
        for action in given_trajectory:

            # apply action and record into video
            env_for_vid.apply_action(action)
            scene_image = env_for_vid.robot.cam.get_images(get_rgb=True,
                                                           get_depth=False)[0]
            vid_output.write(np.array(scene_image))

        # Reset video environment after a given push
        env_for_vid.reset()

    # Release the writer so the MP4 file is finalized
    vid_output.release()

    # Third, generate results and video for vanilla fine-tuned model
    vanillafinetune_model.save_training_rewards(
        "./results/p2/rewards_vanillafinetunedmodel")

    vanillafinetuned_ltwo_dist_list = []
    vanillafinetuned_trajectories_list = []
    for i in range(num_episodes_to_evaluate_on):
        _, actions, _, _ = vanillafinetune_model.generate_trajectory(
            use_argmax=True, perform_reset=False)

        vanillafinetuned_trajectories_list.append(actions)
        state = vanillafinetune_model.environment.simulator.get_obs()
        vanillafinetuned_ltwo_dist_list.append(
            np.linalg.norm(state[3:6] - state[6:9]))
        vanillafinetune_model.environment.reset()

    pd.DataFrame(
        {
            "mean_L2_distance": np.mean(vanillafinetuned_ltwo_dist_list),
            "standard_L2dist": (np.std(vanillafinetuned_ltwo_dist_list) /
                                np.sqrt(len(vanillafinetuned_ltwo_dist_list))),
        },
        index=["vanilla_finetuned"],
    ).to_csv("./results/p2/l2distances_vanillafinetunedmodel.csv")

    # Using the trajectories generated above,
    # make video showing evaluation of policy on 10 episodes

    vid_output = cv2.VideoWriter(
        "./results/p2/p2_video_vanillafinetunedmodel.mp4",
        cv2.VideoWriter_fourcc(*'mp4v'), 30, (640, 480))
    for given_trajectory in vanillafinetuned_trajectories_list[:num_pushes_in_vid]:
        for action in given_trajectory:

            # apply action and record into video
            env_for_vid.apply_action(action)
            scene_image = env_for_vid.robot.cam.get_images(get_rgb=True,
                                                           get_depth=False)[0]
            vid_output.write(np.array(scene_image))

        # Reset video environment after a given push
        env_for_vid.reset()

    # Release the writer so the MP4 file is finalized
    vid_output.release()

    # Plot the learning curves for each policy
    plt.plot(fromscratch_model.mean_rewards, label="From-scratch policy")
    plt.plot(jointlossfinetune_model.mean_rewards,
             label="Joint-loss fine-tuned policy")
    plt.plot(vanillafinetune_model.mean_rewards,
             label='Vanilla fine-tuned policy')
    plt.title("Learning Curves for the Three Policies")
    plt.ylabel("Mean Rewards")
    plt.legend()
    plt.savefig("./results/p2/learningcurves_chart.png")
    plt.close()
    plt.show()
    plt.savefig("behavioral_cloning_training.png")
    
    

    ## evaluate model on 100 episodes
    env = PusherEnv()
    num_episodes = 100
    avg_L2_dist = 0
    avg_reward = 0

    frame = 0

    for i in range(num_episodes):
        done = False
        obs = env.reset()
        total_reward = 0
        while not done:
            action = model.infer(obs)
            obs, reward, done, info = env.step(action)
            total_reward += reward
            if i < 10:
                rgb = env.render()
                im = Image.fromarray(rgb)
                im.save('imgs/{}{:04d}.png'.format("bc", frame))
                frame += 1

        dist = np.linalg.norm(obs[3:6] - obs[6:9])
        avg_L2_dist += dist / num_episodes
        avg_reward += total_reward / num_episodes
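
Every evaluation loop in these examples scores an episode with the same expression, np.linalg.norm(obs[3:6] - obs[6:9]). Assuming, based only on how the examples use it, that obs[3:6] holds the object position and obs[6:9] the goal position, the metric can be factored into a tiny helper:

import numpy as np


def object_to_goal_distance(obs):
    """L2 distance between the (assumed) object position obs[3:6]
    and the (assumed) goal position obs[6:9]."""
    obs = np.asarray(obs)
    return float(np.linalg.norm(obs[3:6] - obs[6:9]))
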
Example #6
def main():

    # Load data
    expert_data = np.load("./expert.npz")
    expert_data = TensorDataset(torch.tensor(expert_data["obs"]),
                                torch.tensor(expert_data["action"]))

    # Instantiate the environment (modified slightly from the version provided to make recording easier later)
    environment = PusherEnvModified()

    policy = ActorCritic(
        state_space_dimension=environment.state_space_dimension,
        action_space_dimension=environment.action_space_dimension,
        actor_hidden_layer_units=(act_layer_one, act_layer_two),
        critic_hidden_layer_units=(crit_layer_one, crit_layer_two),
        actor_std=4e-2,
        activation=nn.Tanh)

    # Use the policy from above to instantiate our behavioral cloning model
    bc_model = BC_Model(policy=deepcopy(policy),
                        batch_size=batch_size,
                        num_epochs=num_epochs,
                        learning_rate=learning_rate)

    # Train model and save resulting policy parameters
    bc_model.train(expert_data=expert_data)
    bc_model.policy.save(path="./results/p1/bc_model_params.pt")
    pd.DataFrame(bc_model.training_loss_list,
                 columns=["train_loss"
                          ]).to_csv("./results/p1/bc_train_loss.csv")
    pd.DataFrame(bc_model.avg_loss_list,
                 columns=["avg_train_loss"
                          ]).to_csv("./results/p1/bc_avg_train_loss.csv")

    # Plot training loss
    plt.plot(bc_model.training_loss_list, label="Training loss")
    plt.title("Loss as a Function of Time")
    plt.xlabel("# of batches")
    plt.legend()
    plt.savefig("./results/p1/bc_train_loss_chart.png")
    plt.close()

    # Plot avg. training loss
    plt.plot(bc_model.avg_loss_list, label="Average training loss per epoch")
    plt.title("Avg. Loss as a Function of Time")
    plt.xlabel("# of epochs")
    plt.legend()
    plt.savefig("./results/p1/bc_avg_train_loss_chart.png")
    plt.close()

    # Now use the policy from the trained behavioral-cloning model and compare the results
    produced_model = PPO_Model(environment=environment,
                               policy=deepcopy(bc_model.policy),
                               n_steps_per_trajectory=64)

    # For comparison, we evaluate the learned policy on 100 episodes
    ltwo_dist_list = []
    trajectories_list = []
    for i in range(num_episodes_to_evaluate_on):
        _, actions, _, _ = produced_model.generate_trajectory(
            use_argmax=True, perform_reset=False)

        trajectories_list.append(actions)
        state = produced_model.environment.simulator.get_obs()
        ltwo_dist_list.append(np.linalg.norm(state[3:6] - state[6:9]))
        produced_model.environment.reset()

    pd.DataFrame(
        {
            "mean_L2_distance": np.mean(ltwo_dist_list),
            "standard_L2dist": (np.std(ltwo_dist_list) /
                                np.sqrt(len(ltwo_dist_list))),
        },
        index=["BC"],
    ).to_csv("./results/p1/bc_l2distance.csv")

    # Using the trajectories generated above,
    # make video showing evaluation of policy on 10 episodes
    env_for_vid = PusherEnv(render=True)
    env_for_vid.render()
    vid_output = cv2.VideoWriter(vid_path, cv2.VideoWriter_fourcc(*'mp4v'), 30,
                                 (640, 480))
    for given_trajectory in trajectories_list[:num_pushes_in_vid]:
        for action in given_trajectory:

            # apply action and record into video
            env_for_vid.apply_action(action)
            scene_image = env_for_vid.robot.cam.get_images(get_rgb=True,
                                                           get_depth=False)[0]
            vid_output.write(np.array(scene_image))

        # Reset video environment after a given push
        env_for_vid.reset()

    # Release the writer so the MP4 file is finalized
    vid_output.release()
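
The BC_Model used in this example is not shown in this listing. As a rough orientation only, a behavioural-cloning trainer exposing the same interface (train(expert_data=...), training_loss_list, avg_loss_list) might look like the sketch below; the MSE loss, the Adam optimizer, and the assumption that the policy is an nn.Module whose forward pass returns a mean action are all guesses, not the project's actual code.

import torch
import torch.nn as nn
from torch.utils.data import DataLoader


class BC_Model:
    """Hypothetical behavioural-cloning trainer matching the calls above."""

    def __init__(self, policy, batch_size, num_epochs, learning_rate):
        self.policy = policy                       # assumed to be an nn.Module
        self.batch_size = batch_size
        self.num_epochs = num_epochs
        self.optimizer = torch.optim.Adam(policy.parameters(), lr=learning_rate)
        self.training_loss_list = []               # one entry per batch
        self.avg_loss_list = []                    # one entry per epoch

    def train(self, expert_data):
        loss_fn = nn.MSELoss()
        loader = DataLoader(expert_data, batch_size=self.batch_size, shuffle=True)
        for _ in range(self.num_epochs):
            epoch_losses = []
            for obs, action in loader:
                # Regress the policy's mean action onto the expert action.
                predicted = self.policy(obs.float())   # assumed: returns mean action
                loss = loss_fn(predicted, action.float())
                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()
                self.training_loss_list.append(loss.item())
                epoch_losses.append(loss.item())
            self.avg_loss_list.append(sum(epoch_losses) / len(epoch_losses))
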