Example #1
def load(self, path, env):
    # Dispatch to the matching stable-baselines loader for the algorithm this
    # wrapper was configured with (trpo()/ppo() are boolean helpers on the
    # surrounding class, not shown here).
    if self.trpo():
        return TRPO.load(path, env=env)
    elif self.ppo():
        return PPO2.load(path, env=env)
    else:
        return SAC.load(path, env=env)
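
A hypothetical call site for the dispatcher above; the surrounding wrapper class and its trpo()/ppo() flags are not shown in the source, so the names below are assumptions:

    # assumed usage: `agent` is an instance of the class defining load()/trpo()/ppo()
    model = agent.load("/tmp/gym/trpo_mpi/trpo.pkl", env=gym.make("Reacher-v2"))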
Example #2

import gym
from stable_baselines import TRPO
from stable_baselines.bench import Monitor
from stable_baselines.common.policies import MlpPolicy

def main():
    """
    Runs the test
    """
    """
    Create an argparse.ArgumentParser for run_mujoco.py.

    :return:  (ArgumentParser) parser {'--env': 'Reacher-v2', '--seed': 0, '--num-timesteps': int(1e6), '--play': False}

    parser = arg_parser()
    parser.add_argument('--env', help='environment ID', type=str, default='Reacher-v2')
    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
    parser.add_argument('--num-timesteps', type=int, default=int(1e6))
    parser.add_argument('--play', default=False, action='store_true')
    return parse
    """
    env_id = 'UR5Gripper-v0'
    model_path = '/tmp/gym/trpo_mpi/'
    # args = mujoco_arg_parser().parse_args()
    # train(args.env, num_timesteps=args.num_timesteps, seed=args.seed)
    # train(env_id=env_id, num_timesteps=int(1e7), seed=0, model_path=model_path)
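    # NOTE: `callback` is used below but never defined in this snippet; a
    # minimal stable-baselines-style callback (assumed, not from the source):
    def callback(locals_, globals_):
        # return True to keep training, False to stop early
        return True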
    env = gym.make(env_id)
    env = Monitor(env, model_path, allow_early_resets=True)
    # `load` is a classmethod returning a new model, so capture its result;
    # env and tensorboard_log are passed again since they are not pickled
    model = TRPO(MlpPolicy, env, verbose=1, tensorboard_log=model_path)
    model = TRPO.load(model_path + "trpo.pkl", env=env, tensorboard_log=model_path)
    model.learn(total_timesteps=int(1e5), callback=callback)
    model.save(model_path + "trpo.pkl")
    # tf_util.save_state(model_path)

    # Enjoy trained agent
    obs = env.reset()
    for episode in range(100):
        obs = env.reset()
        env.render()
        for step in range(200):
            action, _states = model.predict(obs)
            obs, rewards, dones, info = env.step(action)
            env.render()
Example #3

import os

import gym
import numpy as np
from stable_baselines import TRPO
from stable_baselines.common.vec_env import DummyVecEnv
# `plot_results` is assumed to be a project-local plotting helper.

def main(env, load_path, fig_path):

    # arguments
    print("env %s; load_path %s; fig_path %s;" % (env, load_path, fig_path))
    log_path = os.getcwd() + "/log/" + load_path
    os.makedirs(os.getcwd() + "/figs/", exist_ok=True)
    fig_path = os.getcwd() + "/figs/" + fig_path
    load_path = os.getcwd() + "/models/" + load_path

    # make environment, flattened environment, vectorized environment
    env = gym.make(env)
    env = gym.wrappers.FlattenDictWrapper(
        env, ['observation', 'achieved_goal', 'desired_goal'])
    env = DummyVecEnv([lambda: env])

    # load model
    model = TRPO.load(load_path, env=env)
    obs_initial = env.reset()
    obs = obs_initial

    # plot results
    plot_results(fig_path, log_path)

    # initializations
    niter = 10
    counter = 0
    timestep = 0
    # accumulators for [xyz positions, quaternions], one slot per timestep
    results = [[[0, 0, 0] for i in range(100)],
               [[0, 0, 0, 0] for i in range(100)]]
    current = [[[0, 0, 0] for i in range(100)],
               [[0, 0, 0, 0] for i in range(100)]]
    print("==============================")

    # check initial positions and quaternions
    print("grip", env.envs[0].env.env.sim.data.get_site_xpos('grip'))
    print("box", env.envs[0].env.env.sim.data.get_site_xpos('box'))
    print("tool", env.envs[0].env.env.sim.data.get_site_xpos('tool'))
    print("mocap", env.envs[0].env.env.sim.data.mocap_pos)
    print("quat", env.envs[0].env.env.sim.data.mocap_quat)
    print("==============================")

    # mocap quaternion check
    for i in range(5):
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        quat = env.envs[0].env.env.sim.data.mocap_quat
        print("obs", obs)
        print("quat", quat)
    print("==============================")

    # start rendering
    dists = []
    box_goal_pos = np.array([0.6, 0.05, -0.17])
    while counter < niter:
        action, _states = model.predict(obs)
        obs_old = obs
        obs, rewards, dones, info = env.step(action)
        quaternion = env.envs[0].env.env.sim.data.mocap_quat
        # the observation returning to its initial value marks an episode reset
        if (obs == obs_initial).all():
            if counter % 10 == 0:
                xyzs = current[0]
                quats = current[1]
                print(xyzs)
                print(quats)
                filename = log_path + "/" + "results_" + str(counter) + ".txt"
                os.makedirs(log_path + "/", exist_ok=True)
                file = open(filename, 'w+')
                for xyz, quat in zip(xyzs, quats):
                    for coord in xyz:
                        file.write(str(coord) + " ")
                    for quat_coord in quat:
                        file.write(str(quat_coord) + " ")
                    file.write("\n")
                file.close()

            box_end_pos = np.array(obs_old[0][3:6].tolist())
            print(box_end_pos)
            print(np.shape(box_end_pos))
            print(box_goal_pos)
            print(np.shape(box_goal_pos))
            dists.append(np.linalg.norm(box_goal_pos - box_end_pos))
            current = [[[0, 0, 0] for i in range(100)],
                       [[0, 0, 0, 0] for i in range(100)]]
            timestep = 0
            counter += 1
        print(timestep)
        print("obs", obs)
        print("quat", quaternion)

        # for average trajectory, smoothed
        for i in range(3):
            results[0][timestep][i] += obs[0][:3].tolist()[i]
        for j in range(4):
            results[1][timestep][j] += quaternion[0].tolist()[j]

        # for current trajectory
        for i in range(3):
            current[0][timestep][i] += obs[0][:3].tolist()[i]
        for j in range(4):
            current[1][timestep][j] += quaternion[0].tolist()[j]

        timestep += 1
        env.render()

    # smooth paths by taking average, and calculate mean distance to goal state
    for timestep in range(100):
        for i in range(3):
            results[0][timestep][i] /= niter
        for j in range(4):
            results[1][timestep][j] /= niter
    dist = np.mean(dists)

    # print and write to file
    xyzs = results[0]
    quats = results[1]
    filename = log_path + "/" + "results_avg.txt"
    os.makedirs(log_path + "/", exist_ok=True)
    file = open(filename, 'w+')
    for xyz, quat in zip(xyzs, quats):
        for coord in xyz:
            file.write(str(coord) + " ")
        for quat_coord in quat:
            file.write(str(quat_coord) + " ")
        file.write("\n")
    file.close()

    # print average distances
    print("average distance of box from end goal: %f" % dist)
Example #4
import numpy as np
import robosuite as suite
from robosuite.wrappers import GymWrapper
from stable_baselines import TRPO, bench, logger
from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common.vec_env import DummyVecEnv, VecNormalize
# `train` and `mujoco_arg_parser` are assumed to be project-local helpers.

def main():
    """
    Runs the test
    """
    parser = mujoco_arg_parser()
    parser.add_argument(
        '--model-path',
        default="/cvgl2/u/surajn/workspace/saved_models/sawyerlift_ppo2/model")
    parser.add_argument('--images', default=False)
    args = parser.parse_args()

    logger.configure()
    if not args.play:
        model, env = train(args.env,
                           num_timesteps=args.num_timesteps,
                           seed=args.seed,
                           model_path=args.model_path,
                           images=args.images)

    if args.play:

        def make_env():
            env_out = GymWrapper(
                suite.make(
                    "SawyerLift",
                    use_camera_obs=False,  # do not use pixel observations
                    has_offscreen_renderer=
                    False,  # not needed since not using pixel obs
                    has_renderer=True,  # make sure we can render to the screen
                    reward_shaping=True,  # use dense rewards
                    control_freq=
                    10,  # control should happen fast enough so that simulation looks smooth
                ))
            env_out.reward_range = None
            env_out.metadata = None
            env_out.spec = None
            env_out = bench.Monitor(env_out,
                                    logger.get_dir(),
                                    allow_early_resets=True)
            return env_out

        #env = make_env()
        env = DummyVecEnv([make_env])
        env = VecNormalize(env)
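        # NOTE (assumption, not in the original): the VecNormalize running
        # averages are not restored here; stable-baselines 2.x can reload them
        # if they were saved during training, e.g.:
        # env.load_running_average(args.model_path)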

        policy = MlpPolicy
        #model = PPO1(MlpPolicy, env, timesteps_per_actorbatch=2048, clip_param=0.2, entcoeff=0.0, optim_epochs=10,
        #         optim_stepsize=3e-4, optim_batchsize=64, gamma=0.99, lam=0.95, schedule='linear', verbose=1)
        model = TRPO(MlpPolicy,
                     env,
                     timesteps_per_batch=1024,
                     max_kl=0.01,
                     cg_iters=10,
                     cg_damping=0.1,
                     entcoeff=0.0,
                     gamma=0.99,
                     lam=0.98,
                     vf_iters=5,
                     vf_stepsize=1e-3)
        # `load` is a classmethod returning a new model, so capture the result
        model = TRPO.load(args.model_path, env=env)
        logger.log("Running trained model")
        obs = np.zeros((env.num_envs, ) + env.observation_space.shape)
        obs[:] = env.reset()
        while True:
            env.render()
            actions = model.predict(obs)[0]  # predict returns (actions, states)
            obs[:] = env.step(actions)[0]