def run_grid_test(x_points, y_points, z_points, num_test, policy_path):
    # use fixed random state
    rand_state = np.random.RandomState(1).get_state()
    np.random.set_state(rand_state)
    tf_set_seeds(np.random.randint(1, 2**31 - 1))

    # Coordinate the number of episodes per iteration with num_test
    episode_length_time = 4.0
    dt = 0.04
    timesteps_per_ep = episode_length_time / dt
    timesteps_per_batch = int(timesteps_per_ep * num_test)
    total_timesteps = timesteps_per_batch * x_points * y_points * z_points

    # Create GridTest environment
    env = GridTestEnv(
        setup="UR10_6dof",
        host=None,
        dof=6,
        control_type="velocity",
        reset_type="zero",
        reward_type="precision",
        derivative_type="none",
        deriv_action_max=5,
        first_deriv_max=2,
        accel_max=1.4,
        speed_max=0.3,
        speedj_a=1.4,
        episode_length_time=episode_length_time,
        episode_length_step=None,
        actuation_sync_period=1,
        dt=dt,
        run_mode="multiprocess",
        rllab_box=False,
        movej_t=2.0,
        delay=0.0,
        random_state=rand_state,
        x_points=x_points,
        y_points=y_points,
        z_points=z_points,
        num_test=num_test
    )
    env = NormalizedEnv(env)

    # Start environment processes
    env.start()

    # Create a single-threaded TF session for the baselines policy
    sess = U.single_threaded_session()
    sess.__enter__()

    # Create and start plotting process
    plot_running = Value('i', 1)
    shared_returns = Manager().dict({
        "write_lock": False,
        "episodic_returns": [],
        "episodic_lengths": [],
    })
    # Expose shared_returns globally so the grid-test callback can reach it
    builtins.shared_returns = shared_returns

    # Spawn plotting process
    pp = Process(target=plot_ur5_reacher,
                 args=(env, timesteps_per_batch, shared_returns, plot_running))
    pp.start()

    # Run TRPO policy
    run_policy(network='mlp',
               num_layers=2,  # network_kwargs for the MLP network
               num_hidden=64,
               env=env,
               total_timesteps=total_timesteps,  # originally 200,000
               timesteps_per_batch=timesteps_per_batch,
               callback=grid_test_callback,
               load_path=policy_path)

    # Safely terminate plotter process
    plot_running.value = 0  # shut down plotting process
    time.sleep(2)
    pp.join()

    env.close()
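# Hypothetical entry point for run_grid_test above: the argparse flags, the
# default 5x5x5 grid, and the default episode count are illustrative
# assumptions, not values taken from the original script.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(description='Run a grid test with a saved TRPO policy.')
    parser.add_argument('policy_path', help='path to a saved baselines policy checkpoint')
    parser.add_argument('--points', type=int, default=5, help='grid points per axis (x, y, z)')
    parser.add_argument('--num-test', type=int, default=4, help='test episodes per grid point')
    args = parser.parse_args()

    run_grid_test(x_points=args.points, y_points=args.points, z_points=args.points,
                  num_test=args.num_test, policy_path=args.policy_path)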
def main():
    # use fixed random state
    rand_state = np.random.RandomState(1).get_state()
    np.random.set_state(rand_state)
    tf_set_seeds(np.random.randint(1, 2**31 - 1))

    # Create the Create2 mover environment
    env = Create2MoverEnv(90, port='/dev/ttyUSB0', obs_history=1, dt=0.15,
                          random_state=rand_state)
    env = NormalizedEnv(env)

    # Start environment processes
    env.start()

    # Create baselines TRPO policy function
    sess = U.single_threaded_session()
    sess.__enter__()

    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                         hid_size=32, num_hid_layers=2)

    # Create and start plotting process
    plot_running = Value('i', 1)
    shared_returns = Manager().dict({
        "write_lock": False,
        "episodic_returns": [],
        "episodic_lengths": [],
    })

    # Spawn plotting process
    pp = Process(target=plot_create2_mover, args=(env, 2048, shared_returns, plot_running))
    pp.start()

    # Create callback function for logging data from baselines TRPO learn
    kindred_callback = create_callback(shared_returns)

    # Train baselines TRPO
    learn(env, policy_fn,
          max_timesteps=40000,
          timesteps_per_batch=2048,
          max_kl=0.05,
          cg_iters=10,
          cg_damping=0.1,
          vf_iters=5,
          vf_stepsize=0.001,
          gamma=0.995,
          lam=0.995,
          callback=kindred_callback)

    # Safely terminate plotter process
    plot_running.value = 0  # shut down plotting process
    time.sleep(2)
    pp.join()

    env.close()
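# The scripts in this section assume imports along the following lines. This is
# a sketch inferred from the names used above (np, U, Value/Manager/Process,
# time, builtins, learn, MlpPolicy, tf_set_seeds, NormalizedEnv); exact module
# paths may differ across versions of OpenAI baselines and SenseAct.
import time
import builtins

import numpy as np
from multiprocessing import Process, Value, Manager

import baselines.common.tf_util as U
from baselines.ppo1.mlp_policy import MlpPolicy
from baselines.trpo_mpi.trpo_mpi import learn

from senseact.utils import tf_set_seeds, NormalizedEnv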
def main():
    # use fixed random state
    rand_state = np.random.RandomState(1).get_state()
    np.random.set_state(rand_state)
    tf_set_seeds(np.random.randint(1, 2**31 - 1))

    # Create DXL Reacher1D environment
    env = DxlReacher1DEnv(setup='dxl_gripper_default',
                          idn=1,
                          baudrate=1000000,
                          obs_history=1,
                          dt=0.04,
                          gripper_dt=0.01,
                          rllab_box=False,
                          episode_length_step=None,
                          episode_length_time=2,
                          max_torque_mag=100,
                          control_type='torque',
                          target_type='position',
                          reset_type='zero',
                          reward_type='linear',
                          use_ctypes_driver=True,
                          random_state=rand_state)

    # The outputs of the policy function are sampled from a Gaussian, while the
    # actions, as torque commands, must lie in [-max_torque_mag, max_torque_mag].
    # The NormalizedEnv wrapper scales actions accordingly. By default, it does
    # not normalize observations or rewards.
    env = NormalizedEnv(env)

    # Start environment processes
    env.start()

    # Create baselines TRPO policy function
    sess = U.single_threaded_session()
    sess.__enter__()

    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                         hid_size=32, num_hid_layers=2)

    # Create and start plotting process
    plot_running = Value('i', 1)
    shared_returns = Manager().dict({
        "write_lock": False,
        "episodic_returns": [],
        "episodic_lengths": [],
    })

    # Spawn plotting process
    pp = Process(target=plot_dxl_reacher, args=(env, 2048, shared_returns, plot_running))
    pp.start()

    # Create callback function for logging data from baselines TRPO learn
    kindred_callback = create_callback(shared_returns)

    # Train baselines TRPO
    learn(env, policy_fn,
          max_timesteps=50000,
          timesteps_per_batch=2048,
          max_kl=0.05,
          cg_iters=10,
          cg_damping=0.1,
          vf_iters=5,
          vf_stepsize=0.001,
          gamma=0.995,
          lam=0.995,
          callback=kindred_callback)

    # Safely terminate plotter process
    plot_running.value = 0  # shut down plotting process
    time.sleep(2)
    pp.join()

    # Shutdown the environment
    env.close()
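# Illustration of the action scaling described in the comment above: a policy
# action in [-1, 1] is mapped affinely into [low, high] and clipped. This is a
# minimal sketch of the idea only, not SenseAct's actual NormalizedEnv
# implementation.
def scale_action(action, low, high):
    """Affinely map an action from [-1, 1] into [low, high], then clip."""
    scaled = low + (np.asarray(action) + 1.0) * 0.5 * (high - low)
    return np.clip(scaled, low, high)

# e.g. with max_torque_mag=100: scale_action(0.5, -100.0, 100.0) -> 50.0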
def main():
    # use fixed random state
    rand_state = np.random.RandomState(1).get_state()
    np.random.set_state(rand_state)
    tf_set_seeds(np.random.randint(1, 2**31 - 1))

    # Create UR5 Reacher2D environment
    env = ReacherEnv(setup="UR5_6dof",
                     host=None,
                     dof=6,
                     control_type="velocity",
                     target_type="position",
                     reset_type="zero",
                     reward_type="precision",
                     derivative_type="none",
                     deriv_action_max=5,
                     first_deriv_max=2,
                     accel_max=1.4,
                     speed_max=0.3,
                     speedj_a=1.4,
                     episode_length_time=4.0,
                     episode_length_step=None,
                     actuation_sync_period=1,
                     dt=0.04,
                     run_mode="multiprocess",
                     rllab_box=False,
                     movej_t=2.0,
                     delay=0.0,
                     random_state=rand_state)
    env = NormalizedEnv(env)

    # Start environment processes
    env.start()

    # Create baselines TRPO policy function
    sess = U.single_threaded_session()
    sess.__enter__()

    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                         hid_size=64, num_hid_layers=2)

    # Create and start plotting process
    plot_running = Value('i', 1)
    shared_returns = Manager().dict({
        "write_lock": False,
        "episodic_returns": [],
        "episodic_lengths": [],
    })

    # Spawn plotting process
    pp = Process(target=plot_ur5_reacher, args=(env, 2048, shared_returns, plot_running))
    pp.start()

    # Create callback function for logging data from baselines TRPO learn
    kindred_callback = create_callback(shared_returns)

    # Train baselines TRPO
    learn(env, policy_fn,
          max_timesteps=200000,
          timesteps_per_batch=2048,
          max_kl=0.05,
          cg_iters=10,
          cg_damping=0.1,
          vf_iters=5,
          vf_stepsize=0.001,
          gamma=0.995,
          lam=0.995,
          callback=kindred_callback)

    # Safely terminate plotter process
    plot_running.value = 0  # shut down plotting process
    time.sleep(2)
    pp.join()

    env.close()
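# Sketch of what a logging callback like create_callback might return, inferred
# from the shared_returns layout above (write_lock / episodic_returns /
# episodic_lengths) and from the per-batch episode statistics ('lens', 'rews')
# that baselines' learn exposes via locals(). An illustrative assumption, not
# the actual helper shipped with these examples.
def make_logging_callback(shared_returns):
    def callback(locals_, globals_):
        ep_lens = locals_.get('lens')
        ep_rets = locals_.get('rews')
        if ep_lens:
            # copy-modify-write so the Manager dict proxy picks up the change
            shared_returns['write_lock'] = True
            shared_returns['episodic_lengths'] = shared_returns['episodic_lengths'] + list(ep_lens)
            shared_returns['episodic_returns'] = shared_returns['episodic_returns'] + list(ep_rets)
            shared_returns['write_lock'] = False
    return callback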
def main():
    # use fixed random state
    rand_state = np.random.RandomState(1).get_state()
    np.random.set_state(rand_state)
    tf_set_seeds(np.random.randint(1, 2**31 - 1))

    # Create an asynchronous simulation of the InvertedDoublePendulum-v2 MuJoCo environment
    env = DoubleInvertedPendulumEnv(agent_dt=0.005,
                                    sensor_dt=[0.01, 0.0033333],
                                    is_render=False,
                                    random_state=rand_state)

    # Start environment processes
    env.start()

    # Create baselines PPO policy function
    sess = U.single_threaded_session()
    sess.__enter__()

    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                         hid_size=64, num_hid_layers=2)

    # Create and start plotting process
    plot_running = Value('i', 1)
    shared_returns = Manager().dict({
        "write_lock": False,
        "episodic_returns": [],
        "episodic_lengths": [],
    })

    # Spawn plotting process
    pp = Process(target=plot_returns, args=(env, 2048, shared_returns, plot_running))
    pp.start()

    # Create callback function for logging data from baselines PPO learn
    kindred_callback = create_callback(shared_returns)

    # Train baselines PPO
    learn(env, policy_fn,
          max_timesteps=1e6,
          timesteps_per_actorbatch=2048,
          clip_param=0.2,
          entcoeff=0.0,
          optim_epochs=10,
          optim_stepsize=0.0001,
          optim_batchsize=64,
          gamma=0.995,
          lam=0.995,
          schedule="linear",
          callback=kindred_callback)

    # Safely terminate plotter process
    plot_running.value = 0  # shut down plotting process
    time.sleep(2)
    pp.join()

    # Shutdown the environment
    env.close()
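# Minimal sketch of the polling loop a plotting target such as plot_returns
# might run (illustrative only, not the actual helper): it redraws the shared
# episodic returns until the parent process clears plot_running. The signature
# mirrors the args tuple passed to Process above.
def plot_returns_sketch(env, batch_size, shared_returns, plot_running):
    import matplotlib.pyplot as plt
    plt.ion()
    while plot_running.value:
        if not shared_returns['write_lock']:
            returns = list(shared_returns['episodic_returns'])
            if returns:
                plt.cla()
                plt.plot(returns)
                plt.xlabel('episode')
                plt.ylabel('return')
                plt.pause(0.001)
        time.sleep(1.0)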
def main():
    # use fixed random state
    rand_state = np.random.RandomState(1).get_state()
    np.random.set_state(rand_state)
    tf_set_seeds(np.random.randint(1, 2**31 - 1))

    # Create Sawyer Reach XYZ environment. Any extra environment options go in
    # kwargs, which the original snippet left undefined; default to none.
    kwargs = {}
    env = SawyerReachXYZEnv(target_goal=(0, 0, 0),
                            indicator_threshold=.05,
                            reward_type='hand_distance',
                            action_mode='torque',
                            use_safety_box=True,
                            torque_action_scale=1,
                            **kwargs)
    env = NormalizedEnv(env)

    # Start environment processes
    env.start()

    # Create baselines TRPO policy function
    sess = U.single_threaded_session()
    sess.__enter__()

    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                         hid_size=64, num_hid_layers=2)

    # Create and start plotting process
    plot_running = Value('i', 1)
    shared_returns = Manager().dict({
        "write_lock": False,
        "episodic_returns": [],
        "episodic_lengths": [],
    })

    # Spawn plotting process (reuses the UR5 reacher plotting helper)
    pp = Process(target=plot_ur5_reacher, args=(env, 2048, shared_returns, plot_running))
    pp.start()

    # Create callback function for logging data from baselines TRPO learn
    kindred_callback = create_callback(shared_returns)

    # Train baselines TRPO
    learn(env, policy_fn,
          max_timesteps=200000,
          timesteps_per_batch=2048,
          max_kl=0.05,
          cg_iters=10,
          cg_damping=0.1,
          vf_iters=5,
          vf_stepsize=0.001,
          gamma=0.995,
          lam=0.995,
          callback=kindred_callback)

    # Safely terminate plotter process
    plot_running.value = 0  # shut down plotting process
    time.sleep(2)
    pp.join()

    env.close()
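# Each of the main() functions above is a standalone script entry point,
# launched via the usual guard (shown once here for completeness):
if __name__ == '__main__':
    main()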
def main():
    # optionally use a pretrained model
    save_model_path = None
    load_model_path = None
    load_trained_model = False
    hidden_sizes = (64, 64, 64)

    if len(sys.argv) > 2:  # a second argument means a saved model should be loaded
        load_trained_model = True

    # Set up the save directory, e.g. saved/uniform/X/Y/Z/
    save_model_path = sys.argv[1]
    os.makedirs(save_model_path, exist_ok=True)
    run_dirs = os.listdir(save_model_path)
    run_dir = save_model_path + 'run_' + str(len(run_dirs) + 1)
    os.makedirs(run_dir, exist_ok=True)
    os.makedirs(run_dir + '/models', exist_ok=True)
    os.makedirs(run_dir + '/data', exist_ok=True)
    save_model_basepath = run_dir + '/'

    if load_trained_model:
        load_model_path = sys.argv[2]  # e.g. saved/uniform/X/Y/Z/run_1/model*

    # use fixed random state
    # rand_state = np.random.RandomState(1).get_state()
    # np.random.set_state(rand_state)
    tf_set_seeds(np.random.randint(1, 2**31 - 1))

    # Create the Create2 docker environment.
    # Dock-pose distributions tried in earlier runs:
    # distro = np.array([0, 1, 0, 0, 0, 0, 0, 0, 0])
    # distro = np.array([0.575, 0.425, 0, 0, 0, 0, 0, 0, 0])
    # distro = np.array([0.25, 0.2, 0.55, 0, 0, 0, 0, 0, 0])
    # distro = np.array([0.1, 0.1, 0.25, 0.55, 0, 0, 0, 0, 0])
    # FAILED: distro = np.array([0.05, 0.05, 0.15, 0.275, 0, 0, 0, 0.475, 0])
    # FAILED: distro = np.array([0.05, 0.05, 0.15, 0.275, 0, 0.475, 0, 0, 0])
    # FAILED: distro = np.array([0.10, 0.05, 0.10, 0.35, 0, 0.4, 0, 0, 0])
    # distro = np.array([0.10, 0.05, 0.10, 0.375, 0.375, 0, 0, 0, 0])  # run 1
    # distro = np.array([0.06, 0.03, 0.06, 0.425, 0.425, 0, 0, 0, 0])  # run 2
    # distro = np.array([0.025, 0.05, 0.05, 0.25, 0.25, 0.375, 0, 0, 0])  # part 1, first 100 episodes
    # OK: distro = np.array([0.05, 0.025, 0.05, 0.225, 0.225, 0.425, 0, 0, 0])
    # distro = np.array([0.025, 0.02, 0.025, 0.1375, 0.1375, 0.3275, 0.3275, 0, 0])
    # FAILED: distro = np.array([0.015, 0.015, 0.02, 0.06, 0.09, 0.35, 0.45, 0, 0])
    distro = np.array([0, 1, 0, 0, 0, 0, 0, 0, 0])

    env = Create2DockerEnv(30, distro, port='/dev/ttyUSB0', ir_window=20,
                           ir_history=1, obs_history=1, dt=0.045)  # random_state=rand_state
    env = NormalizedEnv(env)

    # Start environment processes
    env.start()

    # Create baselines policy function
    sess = U.single_threaded_session()
    sess.__enter__()

    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                         hid_size=hidden_sizes[0], num_hid_layers=len(hidden_sizes))

    # Create and start plotting process (disabled for these runs)
    # plot_running = Value('i', 1)
    shared_returns = Manager().dict({
        "write_lock": False,
        "episodic_returns": [],
        "episodic_lengths": [],
        "episodic_ss": [],
    })

    # Spawn plotting process
    # pp = Process(target=plot_create2_docker, args=(env, 2048, shared_returns, plot_running))
    # pp.start()

    # Create callback function for logging data from baselines learn
    kindred_callback = create_callback(shared_returns, save_model_basepath, load_model_path)

    # Train baselines PPO
    model = learn(
        env, policy_fn,
        max_timesteps=100000,
        timesteps_per_actorbatch=675,  # was 512
        clip_param=0.2,
        entcoeff=0.0,
        optim_epochs=10,
        optim_stepsize=0.00005,
        optim_batchsize=16,
        gamma=0.96836,
        lam=0.99944,
        schedule="linear",
        callback=kindred_callback,
    )

    # Safely terminate plotter process (disabled along with the plotter above)
    # plot_running.value = 0  # shut down plotting process
    # time.sleep(2)
    # pp.join()

    env.close()
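# A slightly more robust variant of the run-directory setup in main() above,
# using os.path.join instead of string concatenation; the behavior is otherwise
# the same (a sketch, assuming the same saved/uniform/X/Y/Z/ layout and that
# save_model_path contains only run_* entries).
def make_run_dirs(save_model_path):
    run_index = len(os.listdir(save_model_path)) + 1 if os.path.isdir(save_model_path) else 1
    run_dir = os.path.join(save_model_path, 'run_{}'.format(run_index))
    os.makedirs(os.path.join(run_dir, 'models'), exist_ok=True)
    os.makedirs(os.path.join(run_dir, 'data'), exist_ok=True)
    return run_dir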
def get_env(cfg):
    # Set seed to optionally fix the random state
    seed_low = cfg['global']['seed']['low']
    seed_high = cfg['global']['seed']['high']
    if seed_low is not None:
        logging.debug('Using seed [{},{}) "half-open" interval'.format(seed_low, seed_high))
        rand_state = np.random.RandomState(seed_low).get_state()
        np.random.set_state(rand_state)
        tf_set_seeds(np.random.randint(seed_low, seed_high))
    else:
        logging.debug('Not using any seeds!')

    # Load the RL environment class named in the config
    env_module = importlib.import_module(cfg['environment']['codebase']['module'])
    env_class = getattr(env_module, cfg['environment']['codebase']['class'])
    logging.debug("Environment function: {}".format(env_class))
    logging.debug("Host IP: {}".format(cfg['environment']['setup']['host']))

    # Create the environment from the config parameters
    env = env_class(
        setup=cfg['environment']['setup'],
        host=cfg['environment']['setup']['host'],
        dof=cfg['environment']['parameters']['dof'],
        control_type=cfg['environment']['parameters']['control_type'],
        target_type=cfg['environment']['parameters']['target_type'],
        reset_type=cfg['environment']['parameters']['reset_type'],
        reward_type=cfg['environment']['parameters']['reward_type'],
        derivative_type=cfg['environment']['parameters']['derivative_type'],
        deriv_action_max=cfg['environment']['parameters']['deriv_action_max'],
        first_deriv_max=cfg['environment']['parameters']['first_deriv_max'],
        accel_max=cfg['environment']['parameters']['accel_max'],
        speed_max=cfg['environment']['parameters']['speed_max'],
        speedj_a=cfg['environment']['parameters']['speedj_a'],
        episode_length_time=cfg['environment']['parameters']['episode_length_time'],
        episode_length_step=cfg['environment']['parameters']['episode_length_step'],
        actuation_sync_period=cfg['environment']['parameters']['actuation_sync_period'],
        dt=cfg['environment']['parameters']['dt'],
        run_mode=cfg['environment']['parameters']['run_mode'],
        rllab_box=cfg['environment']['parameters']['rllab_box'],
        movej_t=cfg['environment']['parameters']['movej_t'],
        delay=cfg['environment']['parameters']['delay'],
        # check 'is not None' (not truthiness) so that seed 0 still counts as seeded
        random_state=rand_state if seed_low is not None else None
    )
    env = NormalizedEnv(env)

    # Start environment processes
    env.start()

    # Create baselines TRPO policy function
    sess = U.single_threaded_session()
    sess.__enter__()

    policy_fn_module = importlib.import_module(cfg['model']['module'])
    policy_fn_class = getattr(policy_fn_module, cfg['model']['class'])
    logging.debug("Policy function: {}".format(policy_fn_class))

    def policy_fn(name, ob_space, ac_space):
        return policy_fn_class(name=name,
                               ob_space=ob_space,
                               ac_space=ac_space,
                               hid_size=cfg['algorithm']['hyperparameters']['hid_size'],
                               num_hid_layers=cfg['algorithm']['hyperparameters']['num_hid_layers'])

    return env, policy_fn
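# Sketch of the configuration structure get_env expects, inferred from the keys
# it reads above. All values below are illustrative placeholders (taken from
# the UR5 reacher script earlier in this section where available); the module
# paths, host IP, and seed bounds are assumptions.
EXAMPLE_CFG = {
    'global': {'seed': {'low': 1, 'high': 2**31 - 1}},
    'environment': {
        'codebase': {'module': 'senseact.envs.ur.reacher_env', 'class': 'ReacherEnv'},
        'setup': {'host': '192.168.1.100'},
        'parameters': {
            'dof': 6, 'control_type': 'velocity', 'target_type': 'position',
            'reset_type': 'zero', 'reward_type': 'precision',
            'derivative_type': 'none', 'deriv_action_max': 5,
            'first_deriv_max': 2, 'accel_max': 1.4, 'speed_max': 0.3,
            'speedj_a': 1.4, 'episode_length_time': 4.0,
            'episode_length_step': None, 'actuation_sync_period': 1,
            'dt': 0.04, 'run_mode': 'multiprocess', 'rllab_box': False,
            'movej_t': 2.0, 'delay': 0.0,
        },
    },
    'model': {'module': 'baselines.ppo1.mlp_policy', 'class': 'MlpPolicy'},
    'algorithm': {'hyperparameters': {'hid_size': 64, 'num_hid_layers': 2}},
}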