Code Example #1
class PusherEnvModified(HelperEnv):
    def __init__(self):
        self.simulator = PusherEnv(render=False)

        def transition_function(state, action):
            self.simulator.apply_action(action)
            return self.simulator.get_obs()

        def reward_function(state, action):
            return self.simulator.compute_reward_push(state)

        HelperEnv.__init__(self,
                           initial_state=self.simulator.get_obs(),
                           transition_function=transition_function,
                           reward_function=reward_function,
                           state_space_dimension=9,
                           action_space_dimension=2)

    def reset(self):
        self.simulator.reset()
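
A minimal usage sketch (not part of the original file) showing how this wrapper is consumed in the later examples; it assumes HelperEnv stores the constructor arguments as the state_space_dimension and action_space_dimension attributes referenced there.

env = PusherEnvModified()
print(env.state_space_dimension, env.action_space_dimension)  # expected: 9 2
obs = env.simulator.get_obs()  # current 9-dimensional observation
env.reset()                    # resets the underlying PusherEnv simulator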
Code Example #2
def main():
    logger.info("Instantiating model and importing weights")
    # instantiate model and import pretrained weights
    bc_model = BehaviorCloningModel(num_envstate_dims=NUM_STATE_DIMS,
                                    num_action_dims=NUM_ACTION_DIMS,
                                    hidden_layer_sizes=HIDDEN_LAYER_SIZES,
                                    criterion=CRITERION,
                                    lr=LEARNING_RATE,
                                    activation=ACTIVATION,
                                    seed=SEED)

    bc_model.load_state_dict(torch.load("bcmodel_learned_params.pt"))

    # Load in data
    logger.info("Importing test data")
    dataset = np.load('./expert.npz')
    tensor_dataset = data.TensorDataset(torch.Tensor(dataset['obs']),
                                        torch.Tensor(dataset['action']))
    train_size = int(0.7 * len(tensor_dataset))
    test_size = len(tensor_dataset) - train_size
    train_dataset, test_dataset = data.random_split(tensor_dataset,
                                                    [train_size, test_size])

    # only want 1 push each time, so set batch_size to 1
    test_loader = data.DataLoader(test_dataset, batch_size=1, shuffle=True)

    env = PusherEnv()

    errors = []
    true_pushes = []
    pred_pushes = []

    logger.info("Running loop")
    for i, (state, action) in enumerate(test_loader):
        logger.info(f'Iteration #{i}')
        # Convert inputs to floats
        state = state.float()
        action = action.float()

        # Use model to predict action given state
        pred_action = bc_model(state)

        # Switch output from tensors to numpy for easy use later
        state = state.data.numpy()[0]
        action = action.data.numpy()[0]
        pred_action = pred_action.data.numpy()[0]

        end_state, _, _, _ = env.step(action=pred_action)
        end_state = np.array(end_state)

        # Calculate errors
        action_error = np.linalg.norm(action - pred_action)
        state_error = np.linalg.norm(state - end_state)

        # Keep the results
        errors.append(dict(action_error=action_error, state_error=state_error))
        true_pushes.append(dict(d_x=action[0], d_y=action[1], state=state))
        pred_pushes.append(dict(d_x=pred_action[0], d_y=pred_action[1], state=end_state))

        # Stop after NUM_PUSHES pushes
        if i >= NUM_PUSHES - 1:
            break

    # Write out the results once, after the evaluation loop
    pd.DataFrame(errors).to_csv("results/P1/bc_model_errors.csv")
    pd.DataFrame(true_pushes).to_csv("results/P1/true_pushes.csv")
    pd.DataFrame(pred_pushes).to_csv("results/P1/pred_pushes.csv")
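
A small follow-up sketch (hypothetical, not part of the original script) that aggregates the error CSV written above; the column names match the dicts appended in the loop.

import pandas as pd

errors_df = pd.read_csv("results/P1/bc_model_errors.csv", index_col=0)
print(errors_df[["action_error", "state_error"]].agg(["mean", "std"]))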
Code Example #3
    
    plt.figure()
    plt.plot(range(num_epochs+1), train_losses)
    plt.plot(range(num_epochs+1), valid_losses)
    plt.ylabel("Loss (MSE)")
    plt.xlabel("Epoch")
    plt.title("Behavioral Cloning Training")
    plt.legend(["Training Loss", "Validation Loss"])
    plt.ylim(0, train_losses[1] * 2.0)
    # Save before show(); otherwise the figure may already be cleared when saving
    plt.savefig("behavioral_cloning_training.png")
    plt.show()

    ## evaluate model on 100 episodes
    env = PusherEnv()
    num_episodes = 100
    avg_L2_dist = 0
    avg_reward = 0

    frame = 0

    for i in range(num_episodes):
        done = False
        obs = env.reset()
        total_reward = 0
        while not done:
            action = model.infer(obs)
            obs, reward, done, info = env.step(action)
            total_reward += reward
            if i < 10:
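
The snippet above is cut off by the source page. Purely as a sketch (not the original continuation), the per-episode bookkeeping for the averages declared earlier might use the object/goal observation slices that appear in the later examples:

# Hypothetical per-episode bookkeeping; the obs[3:6] / obs[6:9] slicing is an assumption
final_obs = np.array(obs)
avg_reward += total_reward / num_episodes
avg_L2_dist += np.linalg.norm(final_obs[3:6] - final_obs[6:9]) / num_episodes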
Code Example #4
File: p2.py  Project: chickert/reinforcement_learning
def main():
    # Load data
    expert_data = np.load("./expert.npz")
    expert_data = TensorDataset(torch.tensor(expert_data["obs"]),
                                torch.tensor(expert_data["action"]))

    # Instantiate the environment (modified slightly from the given form to make recording easier later)
    environment = PusherEnvModified()

    # Instantiate the three models according to the problem statement
    policy = ActorCritic(
        state_space_dimension=environment.state_space_dimension,
        action_space_dimension=environment.action_space_dimension,
        actor_hidden_layer_units=(act_layer_one, act_layer_two),
        critic_hidden_layer_units=(crit_layer_one, crit_layer_two),
        actor_std=actor_std,
        activation=nn.Tanh)

    fromscratch_model = PPO_Model(
        environment=environment,
        policy=deepcopy(policy),
        bc_coefficient=0,
        n_steps_per_trajectory=n_steps_per_trajectory,
        n_trajectories_per_batch=n_trajectories_per_batch,
        n_epochs=n_epochs,
        n_iterations=n_iterations,
        learning_rate=learning_rate,
        clipping_param=clipping_param,
        entropy_coefficient=entropy_coefficient,
        seed=seed)

    policy.load(path="./results/p1/bc_model_params.pt")
    jointlossfinetune_model = PPO_Model(
        environment=environment,
        policy=deepcopy(policy),
        bc_coefficient=0.1,
        n_steps_per_trajectory=n_steps_per_trajectory,
        n_trajectories_per_batch=n_trajectories_per_batch,
        n_epochs=n_epochs,
        n_iterations=n_iterations,
        learning_rate=learning_rate,
        clipping_param=clipping_param,
        entropy_coefficient=entropy_coefficient,
        seed=seed)

    vanillafinetune_model = PPO_Model(
        environment=environment,
        policy=deepcopy(policy),
        bc_coefficient=0,
        n_steps_per_trajectory=n_steps_per_trajectory,
        n_trajectories_per_batch=n_trajectories_per_batch,
        n_epochs=n_epochs,
        n_iterations=n_iterations,
        learning_rate=learning_rate,
        clipping_param=clipping_param,
        entropy_coefficient=entropy_coefficient,
        seed=seed)

    # Train each
    vanillafinetune_model.train(
        train_critic_only_on_init=train_critic_only_on_init)
    jointlossfinetune_model.train(
        expert_data=expert_data,
        train_critic_only_on_init=train_critic_only_on_init)
    fromscratch_model.train()

    # First, generate results and video for model trained from scratch
    fromscratch_model.save_training_rewards(
        "./results/p2/rewards_fromscratchmodel")

    fromscratch_ltwo_dist_list = []
    fromscratch_trajectories_list = []
    for i in range(num_episodes_to_evaluate_on):
        _, actions, _, _ = fromscratch_model.generate_trajectory(
            use_argmax=True, perform_reset=False)

        fromscratch_trajectories_list.append(actions)
        state = fromscratch_model.environment.simulator.get_obs()
        fromscratch_ltwo_dist_list.append(
            np.linalg.norm(state[3:6] - state[6:9]))
        fromscratch_model.environment.reset()

    pd.DataFrame(
        {
            "mean_L2_distance": np.mean(fromscratch_ltwo_dist_list),
            "standard_L2dist": (np.std(fromscratch_ltwo_dist_list) /
                                np.sqrt(len(fromscratch_ltwo_dist_list))),
        },
        index=["from_scratch"],
    ).to_csv("./results/p2/l2distances_fromscratchmodel.csv")

    # Using the trajectories generated above,
    # make video showing evaluation of policy on 10 episodes
    env_for_vid = PusherEnv(render=True)
    env_for_vid.render()
    vid_output = cv2.VideoWriter("./results/p2/p2_video_fromscratchmodel.mp4",
                                 cv2.VideoWriter_fourcc(*'mp4v'), 30,
                                 (640, 480))
    for given_trajectory in fromscratch_trajectories_list[:num_pushes_in_vid]:
        for action in given_trajectory:

            # apply action and record into video
            env_for_vid.apply_action(action)
            scene_image = env_for_vid.robot.cam.get_images(get_rgb=True,
                                                           get_depth=False)[0]
            vid_output.write(np.array(scene_image))

        # Reset video environment after a given push
        env_for_vid.reset()

    # Finalize the first video file before reusing the writer
    vid_output.release()

    # Second, generate results and video for joint-loss fine-tuned model
    jointlossfinetune_model.save_training_rewards(
        "./results/p2/rewards_jointlossfinetuned")

    jointlossfinetuned_ltwo_dist_list = []
    jointlossfinetuned_trajectories_list = []
    for i in range(num_episodes_to_evaluate_on):
        _, actions, _, _ = jointlossfinetune_model.generate_trajectory(
            use_argmax=True, perform_reset=False)

        jointlossfinetuned_trajectories_list.append(actions)
        state = jointlossfinetune_model.environment.simulator.get_obs()
        jointlossfinetuned_ltwo_dist_list.append(
            np.linalg.norm(state[3:6] - state[6:9]))
        jointlossfinetune_model.environment.reset()

    pd.DataFrame(
        {
            "mean_L2_distance": np.mean(jointlossfinetuned_ltwo_dist_list),
            "standard_L2dist": (np.std(jointlossfinetuned_ltwo_dist_list) /
                                np.sqrt(len(jointlossfinetuned_ltwo_dist_list))),
        },
        index=["jointloss_finetuned"],
    ).to_csv("./results/p2/l2distances_jointlossfinetunedmodel.csv")

    # Using the trajectories generated above,
    # make video showing evaluation of policy on 10 episodes

    vid_output = cv2.VideoWriter(
        "./results/p2/p2_video_jointlossfinetunedmodel.mp4",
        cv2.VideoWriter_fourcc(*'mp4v'), 30, (640, 480))
    for given_trajectory in jointlossfinetuned_trajectories_list[:num_pushes_in_vid]:
        for action in given_trajectory:

            # apply action and record into video
            env_for_vid.apply_action(action)
            scene_image = env_for_vid.robot.cam.get_images(get_rgb=True,
                                                           get_depth=False)[0]
            vid_output.write(np.array(scene_image))

        # Reset video environment after a given push
        env_for_vid.reset()

    # Finalize the second video file before reusing the writer
    vid_output.release()

    # Third, generate results and video for vanilla fine-tuned model
    vanillafinetune_model.save_training_rewards(
        "./results/p2/rewards_vanillafinetunedmodel")

    vanillafinetuned_ltwo_dist_list = []
    vanillafinetuned_trajectories_list = []
    for i in range(num_episodes_to_evaluate_on):
        _, actions, _, _ = vanillafinetune_model.generate_trajectory(
            use_argmax=True, perform_reset=False)

        vanillafinetuned_trajectories_list.append(actions)
        state = vanillafinetune_model.environment.simulator.get_obs()
        vanillafinetuned_ltwo_dist_list.append(
            np.linalg.norm(state[3:6] - state[6:9]))
        vanillafinetune_model.environment.reset()

    pd.DataFrame(
        {
            "mean_L2_distance": np.mean(vanillafinetuned_ltwo_dist_list),
            "standard_L2dist": (np.std(vanillafinetuned_ltwo_dist_list) /
                                np.sqrt(len(vanillafinetuned_ltwo_dist_list))),
        },
        index=["vanilla_finetuned"],
    ).to_csv("./results/p2/l2distances_vanillafinetunedmodel.csv")

    # Using the trajectories generated above,
    # make video showing evaluation of policy on 10 episodes

    vid_output = cv2.VideoWriter(
        "./results/p2/p2_video_vanillafinetunedmodel.mp4",
        cv2.VideoWriter_fourcc(*'mp4v'), 30, (640, 480))
    for given_trajectory in vanillafinetuned_trajectories_list[:num_pushes_in_vid]:
        for action in given_trajectory:

            # apply action and record into video
            env_for_vid.apply_action(action)
            scene_image = env_for_vid.robot.cam.get_images(get_rgb=True,
                                                           get_depth=False)[0]
            vid_output.write(np.array(scene_image))

        # Reset video environment after a given push
        env_for_vid.reset()

    # Finalize the third video file
    vid_output.release()

    # Plot the learning curves for each policy
    plt.plot(fromscratch_model.mean_rewards, label="From-scratch policy")
    plt.plot(jointlossfinetune_model.mean_rewards,
             label="Joint-loss fine-tuned policy")
    plt.plot(vanillafinetune_model.mean_rewards,
             label='Vanilla fine-tuned policy')
    plt.title("Learning Curves for the Three Policies")
    plt.ylabel("Mean Rewards")
    plt.legend()
    plt.savefig("./results/p2/learningcurves_chart.png")
    plt.close()
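
The three evaluate-and-record blocks above differ only in the model being evaluated and the output tag, so they could be collapsed into one helper along the following lines. This is an optional refactor sketch, not part of the original file; evaluate_and_record is a hypothetical name, and PPO_Model, PusherEnv, and the results directory layout are assumed to behave exactly as shown above.

def evaluate_and_record(model, tag, env_for_vid, num_episodes, num_pushes_in_vid):
    # Save per-iteration training rewards for this model
    model.save_training_rewards(f"./results/p2/rewards_{tag}")

    # Evaluate: roll out trajectories and record final object-to-goal distances
    dists, trajectories = [], []
    for _ in range(num_episodes):
        _, actions, _, _ = model.generate_trajectory(use_argmax=True,
                                                     perform_reset=False)
        trajectories.append(actions)
        state = model.environment.simulator.get_obs()
        dists.append(np.linalg.norm(state[3:6] - state[6:9]))
        model.environment.reset()

    pd.DataFrame(
        {
            "mean_L2_distance": np.mean(dists),
            "standard_L2dist": np.std(dists) / np.sqrt(len(dists)),
        },
        index=[tag],
    ).to_csv(f"./results/p2/l2distances_{tag}.csv")

    # Record the first few evaluation pushes to video
    vid = cv2.VideoWriter(f"./results/p2/p2_video_{tag}.mp4",
                          cv2.VideoWriter_fourcc(*"mp4v"), 30, (640, 480))
    for trajectory in trajectories[:num_pushes_in_vid]:
        for action in trajectory:
            env_for_vid.apply_action(action)
            frame = env_for_vid.robot.cam.get_images(get_rgb=True,
                                                     get_depth=False)[0]
            vid.write(np.array(frame))
        env_for_vid.reset()
    vid.release()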
Code Example #5
File: p1.py  Project: chickert/reinforcement_learning
def main():

    # Load data
    expert_data = np.load("./expert.npz")
    expert_data = TensorDataset(torch.tensor(expert_data["obs"]),
                                torch.tensor(expert_data["action"]))

    # Instantiate the environment (modified slightly from the given form to make recording easier later)
    environment = PusherEnvModified()

    policy = ActorCritic(
        state_space_dimension=environment.state_space_dimension,
        action_space_dimension=environment.action_space_dimension,
        actor_hidden_layer_units=(act_layer_one, act_layer_two),
        critic_hidden_layer_units=(crit_layer_one, crit_layer_two),
        actor_std=4e-2,
        activation=nn.Tanh)

    # Use the policy from above to instantiate our behavioral cloning model
    bc_model = BC_Model(policy=deepcopy(policy),
                        batch_size=batch_size,
                        num_epochs=num_epochs,
                        learning_rate=learning_rate)

    # Train model and save resulting policy parameters
    bc_model.train(expert_data=expert_data)
    bc_model.policy.save(path="./results/p1/bc_model_params.pt")
    pd.DataFrame(bc_model.training_loss_list,
                 columns=["train_loss"]).to_csv("./results/p1/bc_train_loss.csv")
    pd.DataFrame(bc_model.avg_loss_list,
                 columns=["avg_train_loss"]).to_csv("./results/p1/bc_avg_train_loss.csv")

    # Plot training loss
    plt.plot(bc_model.training_loss_list, label="Training loss")
    plt.title("Loss as a Function of Time")
    plt.xlabel("# of batches")
    plt.legend()
    plt.savefig("./results/p1/bc_train_loss_chart.png")
    plt.close()

    # Plot avg. training loss
    plt.plot(bc_model.avg_loss_list, label="Average training loss per epoch")
    plt.title("Avg. Loss as a Function of Time")
    plt.xlabel("# of epochs")
    plt.legend()
    plt.savefig("./results/p1/bc_avg_train_loss_chart.png")
    plt.close()

    # Now use the policy from the post-training behavioral cloning model, and compare the results
    produced_model = PPO_Model(environment=environment,
                               policy=deepcopy(bc_model.policy),
                               n_steps_per_trajectory=64)

    # For comparison, we evaluate the learned policy on 100 episodes
    ltwo_dist_list = []
    trajectories_list = []
    for i in range(num_episodes_to_evaluate_on):
        _, actions, _, _ = produced_model.generate_trajectory(
            use_argmax=True, perform_reset=False)

        trajectories_list.append(actions)
        state = produced_model.environment.simulator.get_obs()
        ltwo_dist_list.append(np.linalg.norm(state[3:6] - state[6:9]))
        produced_model.environment.reset()

    pd.DataFrame(
        {
            "mean_L2_distance": np.mean(ltwo_dist_list),
            "standard_L2dist": np.std(ltwo_dist_list) / np.sqrt(len(ltwo_dist_list)),
        },
        index=["BC"],
    ).to_csv("./results/p1/bc_l2distance.csv")

    # Using the trajectories generated above,
    # make video showing evaluation of policy on 10 episodes
    env_for_vid = PusherEnv(render=True)
    env_for_vid.render()
    vid_output = cv2.VideoWriter(vid_path, cv2.VideoWriter_fourcc(*'mp4v'), 30,
                                 (640, 480))
    for given_trajectory in trajectories_list[:num_pushes_in_vid]:
        for action in given_trajectory:

            # apply action and record into video
            env_for_vid.apply_action(action)
            scene_image = env_for_vid.robot.cam.get_images(get_rgb=True,
                                                           get_depth=False)[0]
            vid_output.write(np.array(scene_image))

        # Reset video environment after a given push
        env_for_vid.reset()

    # Finalize the video file
    vid_output.release()