def load(self, path, env):
    if self.trpo():
        return TRPO.load(path, env=env)
    elif self.ppo():
        return PPO2.load(path, env=env)
    else:
        return SAC.load(path, env=env)
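# A minimal, self-contained sketch of how the dispatcher above is meant to be
# used. The wrapper class name (`AgentLoader`) and its `algo` flag are
# assumptions made for illustration only; TRPO.load / PPO2.load / SAC.load are
# the actual stable-baselines calls.
from stable_baselines import PPO2, SAC, TRPO


class AgentLoader:
    def __init__(self, algo):
        self.algo = algo  # assumed convention: "trpo", "ppo" or "sac"

    def trpo(self):
        return self.algo == "trpo"

    def ppo(self):
        return self.algo == "ppo"

    def load(self, path, env):
        # dispatch to the loader of the matching algorithm
        if self.trpo():
            return TRPO.load(path, env=env)
        elif self.ppo():
            return PPO2.load(path, env=env)
        else:
            return SAC.load(path, env=env)


# e.g. model = AgentLoader("trpo").load("/tmp/gym/trpo_mpi/trpo.pkl", env=env)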
def main(): """ Runs the test """ """ Create an argparse.ArgumentParser for run_mujoco.py. :return: (ArgumentParser) parser {'--env': 'Reacher-v2', '--seed': 0, '--num-timesteps': int(1e6), '--play': False} parser = arg_parser() parser.add_argument('--env', help='environment ID', type=str, default='Reacher-v2') parser.add_argument('--seed', help='RNG seed', type=int, default=0) parser.add_argument('--num-timesteps', type=int, default=int(1e6)) parser.add_argument('--play', default=False, action='store_true') return parse """ env_id = 'UR5Gripper-v0' model_path = '/tmp/gym/trpo_mpi/' # args = mujoco_arg_parser().parse_args() # train(args.env, num_timesteps=args.num_timesteps, seed=args.seed) # train(env_id=env_id, num_timesteps=int(1e7), seed=0, model_path=model_path) env = gym.make(env_id) env = Monitor(env, model_path, allow_early_resets=True) model = TRPO(MlpPolicy, env, verbose=1, tensorboard_log=model_path) model = model.load(model_path + "trpo.pkl") model.learn(total_timesteps=int(1e5), callback=callback) model.save(model_path + "trpo.pkl") # tf_util.save_state(model_path) # Enjoy trained agent obs = env.reset() for i in range(100): obs = env.reset() env.render() for i in range(200): action, _states = model.predict(obs) obs, rewards, dones, info = env.step(action) env.render()
def main(env, load_path, fig_path):
    # arguments
    print("env %s; load_path %s; fig_path %s;" % (env, load_path, fig_path))
    log_path = os.getcwd() + "/log/" + load_path
    os.makedirs(os.getcwd() + "/figs/", exist_ok=True)
    fig_path = os.getcwd() + "/figs/" + fig_path
    load_path = os.getcwd() + "/models/" + load_path

    # make environment, flattened environment, vectorized environment
    env = gym.make(env)
    env = gym.wrappers.FlattenDictWrapper(
        env, ['observation', 'achieved_goal', 'desired_goal'])
    env = DummyVecEnv([lambda: env])

    # load model
    model = TRPO.load(load_path, env=env)
    obs_initial = env.reset()
    obs = obs_initial

    # plot results
    plot_results(fig_path, log_path)

    # initializations
    niter = 10
    counter = 0
    timestep = 0
    results = [[[0, 0, 0] for i in range(100)],
               [[0, 0, 0, 0] for i in range(100)]]
    current = [[[0, 0, 0] for i in range(100)],
               [[0, 0, 0, 0] for i in range(100)]]
    print("==============================")

    # check initial positions and quaternions
    print("grip", env.envs[0].env.env.sim.data.get_site_xpos('grip'))
    print("box", env.envs[0].env.env.sim.data.get_site_xpos('box'))
    print("tool", env.envs[0].env.env.sim.data.get_site_xpos('tool'))
    print("mocap", env.envs[0].env.env.sim.data.mocap_pos)
    print("quat", env.envs[0].env.env.sim.data.mocap_quat)
    print("==============================")

    # mocap quaternion check
    for i in range(5):
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        quat = env.envs[0].env.env.sim.data.mocap_quat
        print("obs", obs)
        print("quat", quat)
    print("==============================")

    # start rendering
    dists = []
    box_goal_pos = np.array([0.6, 0.05, -0.17])
    while True:
        if counter == niter:
            break
        action, _states = model.predict(obs)
        obs_old = obs
        obs, rewards, dones, info = env.step(action)
        quaternion = env.envs[0].env.env.sim.data.mocap_quat
        # episode finished: the environment has reset to the initial observation
        if (obs == obs_initial).all():
            if counter % 10 == 0:
                xyzs = current[0]
                quats = current[1]
                print(xyzs)
                print(quats)
                filename = log_path + "/" + "results_" + str(counter) + ".txt"
                os.makedirs(log_path + "/", exist_ok=True)
                with open(filename, 'w+') as file:
                    for xyz, quat in zip(xyzs, quats):
                        for coord in xyz:
                            file.write(str(coord) + " ")
                        for quat_coord in quat:
                            file.write(str(quat_coord) + " ")
                        file.write("\n")
            # distance between the box's final position and the goal position
            box_end_pos = np.array(obs_old[0][3:6].tolist())
            print(box_end_pos)
            print(np.shape(box_end_pos))
            print(box_goal_pos)
            print(np.shape(box_goal_pos))
            dists.append(np.linalg.norm(box_goal_pos - box_end_pos))
            current = [[[0, 0, 0] for i in range(100)],
                       [[0, 0, 0, 0] for i in range(100)]]
            timestep = 0
            counter += 1
        print(timestep)
        print("obs", obs)
        print("quat", quaternion)

        # for average trajectory, smoothed
        for i in range(3):
            results[0][timestep][i] += obs[0][:3].tolist()[i]
        for j in range(4):
            results[1][timestep][j] += quaternion[0].tolist()[j]
        # for current trajectory
        for i in range(3):
            current[0][timestep][i] += obs[0][:3].tolist()[i]
        for j in range(4):
            current[1][timestep][j] += quaternion[0].tolist()[j]

        timestep += 1
        env.render()

    # smooth paths by taking the average, and calculate the mean distance to the goal state
    for timestep in range(100):
        for i in range(3):
            results[0][timestep][i] /= niter
        for j in range(4):
            results[1][timestep][j] /= niter
    dist = np.mean(dists)

    # print and write to file
    xyzs = results[0]
    quats = results[1]
    filename = log_path + "/" + "results_avg.txt"
    os.makedirs(log_path + "/", exist_ok=True)
    with open(filename, 'w+') as file:
        for xyz, quat in zip(xyzs, quats):
            for coord in xyz:
                file.write(str(coord) + " ")
            for quat_coord in quat:
                file.write(str(quat_coord) + " ")
            file.write("\n")

    # print average distance
    print("average distance of box from end goal: %f" % dist)
def main(): """ Runs the test """ parser = mujoco_arg_parser() parser.add_argument( '--model-path', default="/cvgl2/u/surajn/workspace/saved_models/sawyerlift_ppo2/model") parser.add_argument('--images', default=False) args = parser.parse_args() logger.configure() if not args.play: model, env = train(args.env, num_timesteps=args.num_timesteps, seed=args.seed, model_path=args.model_path, images=args.images) if args.play: def make_env(): env_out = GymWrapper( suite.make( "SawyerLift", use_camera_obs=False, # do not use pixel observations has_offscreen_renderer= False, # not needed since not using pixel obs has_renderer=True, # make sure we can render to the screen reward_shaping=True, # use dense rewards control_freq= 10, # control should happen fast enough so that simulation looks smooth )) env_out.reward_range = None env_out.metadata = None env_out.spec = None env_out = bench.Monitor(env_out, logger.get_dir(), allow_early_resets=True) return env_out #env = make_env() env = DummyVecEnv([make_env]) env = VecNormalize(env) policy = MlpPolicy #model = PPO1(MlpPolicy, env, timesteps_per_actorbatch=2048, clip_param=0.2, entcoeff=0.0, optim_epochs=10, # optim_stepsize=3e-4, optim_batchsize=64, gamma=0.99, lam=0.95, schedule='linear', verbose=1) model = TRPO(MlpPolicy, env, timesteps_per_batch=1024, max_kl=0.01, cg_iters=10, cg_damping=0.1, entcoeff=0.0, gamma=0.99, lam=0.98, vf_iters=5, vf_stepsize=1e-3) model.load(args.model_path) logger.log("Running trained model") obs = np.zeros((env.num_envs, ) + env.observation_space.shape) obs[:] = env.reset() while True: env.render() actions = model.step(obs)[0] obs[:] = env.step(actions)[0]