def _init():
    env = ExamplePushingTrainingEnv(frameskip=3, visualization=False)
    env.seed(seed=rank)
    env.action_space.seed(seed=rank)
    env = FlatObservationWrapper(env)
    return env
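# _init above presumably lives inside a factory closure that binds `rank`.
# a minimal sketch of that pattern, assuming stable-baselines' SubprocVecEnv is
# used for parallel training (make_env and the env count are illustrative, not
# taken from the original file):
from stable_baselines.common.vec_env import SubprocVecEnv

def make_env(rank):
    def _init():
        env = ExamplePushingTrainingEnv(frameskip=3, visualization=False)
        env.seed(seed=rank)
        env.action_space.seed(seed=rank)
        return FlatObservationWrapper(env)
    return _init

vec_env = SubprocVecEnv([make_env(rank) for rank in range(4)])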
Example #2
    args = vars(parser.parse_args())
    time_steps = int(args["time_steps"])
    model_path = str(args["model_path"])

    policy_path = os.path.join(model_path,
                               "model_" + str(time_steps) + "_steps")

    model = PPO2.load(policy_path)

    # define a method for the policy fn of your trained model
    def policy_fn(obs):
        return model.predict(obs, deterministic=True)[0]

    # we create the same env as we used for training in train_pushing_ppo.py,
    # such that action and observation space remain coherent with the policy.
    # however, unlike during training, we set the initialization to the same
    # as in the standard CubeEnv, since this is what the policy will be
    # evaluated on eventually.
    initializer = cube_env.RandomInitializer(
        difficulty=1)  # difficulty one means pushing
    env = ExamplePushingTrainingEnv(initializer=initializer,
                                    frameskip=3,
                                    visualization=True)
    env = FlatObservationWrapper(env)

    for _ in range(10):
        obs = env.reset()
        done = False
        while not done:
            obs, rew, done, info = env.step(policy_fn(obs))
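# the snippet above starts after its argument parser has been built; a
# hypothetical setup that would produce the "time_steps" and "model_path"
# entries it reads could look like this (flag names are assumptions, not taken
# from the original file):
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--time_steps", required=True,
                    help="step count of the checkpoint to load")
parser.add_argument("--model_path", required=True,
                    help="directory containing the saved checkpoints")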
def main():
    try:
        difficulty = int(sys.argv[1])
        initial_pose_json = sys.argv[2]
        goal_pose_json = sys.argv[3]
        output_file = sys.argv[4]
    except IndexError:
        print("Incorrect number of arguments.")
        print("Usage:\n"
              "\tevaluate_policy.py <difficulty_level> <initial_pose>"
              " <goal_pose> <output_file>")
        sys.exit(1)

    # the poses are passed as JSON strings, so they need to be converted first
    initial_pose = move_cube.Pose.from_json(initial_pose_json)
    goal_pose = move_cube.Pose.from_json(goal_pose_json)
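    # for reference, each JSON string is expected to encode a position and an
    # orientation quaternion, roughly of the form below (field names assumed
    # from move_cube.Pose; the exact serialization may differ):
    #   '{"position": [0.0, 0.0, 0.0325], "orientation": [0.0, 0.0, 0.0, 1.0]}'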

    # create a FixedInitializer with the given values
    initializer = cube_env.FixedInitializer(difficulty, initial_pose,
                                            goal_pose)

    # if difficulty == 1 (i.e. pushing), we load the policy we trained for that
    # task. otherwise, we just use the RandomPolicy as a placeholder. Naturally,
    # when you submit you would have a policy for each difficulty level.
    if difficulty == 1:

        # we create the same env as we used for training in
        # train_pushing_ppo.py, such that action and observation space remain
        # coherent with the policy. however, unlike during training, we set the
        # initialization using the initializer, since this is what's expected
        # during evaluation. if you do not use the initializer, or modify the
        # standard CubeEnv in any way that affects the simulation (i.e. the
        # state-action trajectories), the action trajectories you compute will
        # not make sense.
        env = ExamplePushingTrainingEnv(initializer=initializer,
                                        frameskip=3,
                                        visualization=False)
        env = FlatObservationWrapper(env)

        # we load the trained policy
        policy_path = os.path.join("./training_checkpoints",
                                   "model_78000000_steps")
        policy = DQNPolicy(policy_path)

    else:
        env = gym.make(
            "rrc_simulation.gym_wrapper:real_robot_challenge_phase_1-v1",
            initializer=initializer,
            action_type=cube_env.ActionType.POSITION,
            visualization=False,
        )
        policy = RandomPolicy(env.action_space)

    # Execute one episode.  Make sure that the number of simulation steps
    # matches the episode length of the task.  When using the default Gym
    # environment, this is the case when looping until is_done == True.  Make
    # sure to adjust this in case your custom environment behaves differently!
    is_done = False
    observation = env.reset()
    accumulated_reward = 0
    while not is_done:
        action = policy.predict(observation)
        observation, reward, is_done, info = env.step(action)
        accumulated_reward += reward

    print("Accumulated reward: {}".format(accumulated_reward))

    # store the log for evaluation
    env.platform.store_action_log(output_file)
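# RandomPolicy and DQNPolicy are presumably defined elsewhere in the file; the
# evaluation loop above only relies on the policy objects exposing a
# predict(observation) method. a minimal sketch of what such classes might look
# like, assuming the trained policy wraps the PPO2 checkpoint from
# train_pushing_ppo.py (class names here are illustrative):
from stable_baselines import PPO2


class RandomPolicy:
    """Dummy policy that samples a random action at every step."""

    def __init__(self, action_space):
        self.action_space = action_space

    def predict(self, observation):
        return self.action_space.sample()


class TrainedPolicy:
    """Thin predict() wrapper around a loaded PPO2 checkpoint."""

    def __init__(self, path):
        self.model = PPO2.load(path)

    def predict(self, observation):
        return self.model.predict(observation, deterministic=True)[0]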
    parser = argparse.ArgumentParser()
    parser.add_argument("--output_path", required=True, help="output path")
    args = vars(parser.parse_args())
    output_path = str(args["output_path"])

    total_time_steps = 80000000
    validate_every_timesteps = 2000000
    model_path = os.path.join(output_path, "training_checkpoints")

    os.makedirs(model_path, exist_ok=True)

    set_global_seeds(0)
    num_of_active_envs = 1
    policy_kwargs = dict(layers=[256, 256])
    # env = gym.make("real_robot_challenge_phase_1-v1")
    env = FlatObservationWrapper(
        ExamplePushingTrainingEnv(frameskip=20, visualization=False))

    train_configs = {
        "gamma": 0.99,
        "n_steps": int(120000 / 20),
        "ent_coef": 0.01,
        "learning_rate": 0.00025,
        "vf_coef": 0.5,
        "max_grad_norm": 0.5,
        "nminibatches": 40,
        "noptepochs": 4,
    }

    model = PPO2(MlpPolicy,
                 env,
                 policy_kwargs=policy_kwargs,
                 verbose=1,
                 tensorboard_log=model_path,
                 **train_configs)

    ckpt_frequency = int(validate_every_timesteps / num_of_active_envs)
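    # a minimal sketch of how these values are typically wired up, assuming the
    # script saves periodic checkpoints named "model_<steps>_steps" (matching
    # the paths loaded in the evaluation examples above); the callback usage is
    # illustrative and requires
    # `from stable_baselines.common.callbacks import CheckpointCallback`:
    checkpoint_callback = CheckpointCallback(save_freq=ckpt_frequency,
                                             save_path=model_path,
                                             name_prefix="model")
    model.learn(total_timesteps=total_time_steps,
                callback=checkpoint_callback)
    model.save(os.path.join(model_path, "model_final"))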