def main():
    # Create asynchronous simulation of the InvertedDoublePendulum-v2 MuJoCo environment
    env = DoubleInvertedPendulumEnv(
        agent_dt=0.005,
        sensor_dt=[0.01, 0.0033333],
    )

    # Start environment processes
    env.start()

    # Create baselines PPO policy function
    sess = U.single_threaded_session()
    sess.__enter__()

    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                         hid_size=64, num_hid_layers=2)

    # Create and start plotting process
    plot_running = Value('i', 1)
    shared_returns = Manager().dict({
        "write_lock": False,
        "episodic_returns": [],
        "episodic_lengths": [],
    })
    # Spawn plotting process
    pp = Process(target=plot_returns, args=(env, 2048, shared_returns, plot_running))
    pp.start()

    # Create callback function for logging data from baselines PPO learn
    kindred_callback = create_callback(shared_returns)

    # Train baselines PPO
    learn(
        env,
        policy_fn,
        max_timesteps=1e6,
        timesteps_per_actorbatch=2048,
        clip_param=0.2,
        entcoeff=0.0,
        optim_epochs=10,
        optim_stepsize=0.0001,
        optim_batchsize=64,
        gamma=0.995,
        lam=0.995,
        schedule="linear",
        callback=kindred_callback,
    )

    # Safely terminate plotting process
    plot_running.value = 0  # shut down plotting process
    time.sleep(2)
    pp.join()

    # Shutdown the environment
    env.close()
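# The scripts in this file share the create_callback/shared_returns pattern:
# baselines' learn() invokes the callback with (locals(), globals()) each
# iteration, and the callback copies finished episode returns into the Manager
# dict that the plotting process polls. The sketch below illustrates that idea;
# it assumes locals_['seg'] carries 'ep_rets'/'ep_lens' as in baselines'
# rollout segments, and the real create_callback may also handle model
# saving/loading.
def create_callback_sketch(shared_returns):
    def kindred_callback(locals_, globals_):
        seg = locals_.get('seg')
        if not seg or not seg.get('ep_rets'):
            return  # first iteration: no rollout collected yet
        # Cooperative lock: the plotter skips reading while write_lock is True
        shared_returns['write_lock'] = True
        shared_returns['episodic_returns'] = shared_returns['episodic_returns'] + seg['ep_rets']
        shared_returns['episodic_lengths'] = shared_returns['episodic_lengths'] + seg['ep_lens']
        shared_returns['write_lock'] = False
    return kindred_callback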
def train(cfg):
    """ Function to start training and logging processes

    Args:
        cfg (dict): Configuration parameters loaded into dict from yaml file
    """
    artifact_path = cfg['train']['artifact_path']

    # Get environment
    m = cfg['environment']['module']
    t = imp.load_module(m, *imp.find_module(m))
    get_env = getattr(t, cfg['environment']['class'])
    env, policy_fn = get_env(cfg)

    # Create and start logging process
    log_running = Value('i', 1)
    # Manager to share data between log process and main process
    shared_returns = Manager().dict({
        'write_lock': False,
        'episodic_returns': [],
        'episodic_lengths': [],
    })
    # Spawn logging process
    hyperparams = cfg['algorithm']['hyperparameters']
    pp = Process(
        target=log_function,
        args=(env, hyperparams['timesteps_per_batch'],
              shared_returns, log_running, artifact_path))
    pp.start()

    # Create callback function for logging data from learn
    kindred_callback = create_callback(shared_returns)

    # Train
    m = importlib.import_module(cfg['algorithm']['codebase']['module'])
    learn = getattr(m, cfg['algorithm']['codebase']['class'])
    logging.debug("Learn function: {}".format(learn))
    learn(env,
          policy_fn,
          max_timesteps=hyperparams['max_timesteps'],
          timesteps_per_batch=hyperparams['timesteps_per_batch'],
          max_kl=hyperparams['max_kl'],
          cg_iters=hyperparams['cg_iters'],
          cg_damping=hyperparams['cg_damping'],
          vf_iters=hyperparams['vf_iters'],
          vf_stepsize=hyperparams['vf_stepsize'],
          gamma=hyperparams['gamma'],
          lam=hyperparams['lam'],
          callback=kindred_callback)

    # Safely terminate logging process
    log_running.value = 0  # shut down logging process
    time.sleep(2)
    pp.join()

    env.close()
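# A hypothetical example of the YAML layout train(cfg) expects, shown as the
# equivalent Python dict. Module and class names are placeholders; the
# hyperparameter keys mirror exactly what train() reads above.
EXAMPLE_CFG = {
    'train': {'artifact_path': 'artifacts/'},
    'environment': {'module': 'my_env_module', 'class': 'get_env'},
    'algorithm': {
        'codebase': {'module': 'baselines.trpo_mpi.trpo_mpi', 'class': 'learn'},
        'hyperparameters': {
            'max_timesteps': 200000,
            'timesteps_per_batch': 2048,
            'max_kl': 0.05,
            'cg_iters': 10,
            'cg_damping': 0.1,
            'vf_iters': 5,
            'vf_stepsize': 0.001,
            'gamma': 0.995,
            'lam': 0.995,
        },
    },
}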
def main():
    # Create the Create2 mover environment
    env = Create2MoverEnv(90, port='/dev/ttyUSB0', obs_history=1, dt=0.15)
    env = NormalizedEnv(env)

    # Start environment processes
    env.start()

    # Create baselines TRPO policy function
    sess = U.single_threaded_session()
    sess.__enter__()

    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                         hid_size=32, num_hid_layers=2)

    # Create and start plotting process
    plot_running = Value('i', 1)
    shared_returns = Manager().dict({
        "write_lock": False,
        "episodic_returns": [],
        "episodic_lengths": [],
    })
    # Spawn plotting process
    pp = Process(target=plot_create2_mover, args=(env, 2048, shared_returns, plot_running))
    pp.start()

    # Create callback function for logging data from baselines TRPO learn
    kindred_callback = create_callback(shared_returns)

    # Train baselines TRPO
    learn(env,
          policy_fn,
          max_timesteps=40000,
          timesteps_per_batch=2048,
          max_kl=0.05,
          cg_iters=10,
          cg_damping=0.1,
          vf_iters=5,
          vf_stepsize=0.001,
          gamma=0.995,
          lam=0.995,
          callback=kindred_callback)

    # Safely terminate plotting process
    plot_running.value = 0  # shut down plotting process
    time.sleep(2)
    pp.join()

    env.close()
def main():
    # Use fixed random state
    rand_state = np.random.RandomState(1).get_state()
    np.random.set_state(rand_state)
    tf_set_seeds(np.random.randint(1, 2**31 - 1))

    # Create UR5 Reacher environment (6 DoF)
    env = ReacherEnv(
        setup="UR5_6dof",
        host=None,
        dof=6,
        control_type="velocity",
        target_type="position",
        reset_type="zero",
        reward_type="precision",
        derivative_type="none",
        deriv_action_max=5,
        first_deriv_max=2,
        accel_max=1.4,
        speed_max=0.3,
        speedj_a=1.4,
        episode_length_time=4.0,
        episode_length_step=None,
        actuation_sync_period=1,
        dt=0.04,
        run_mode="multiprocess",
        rllab_box=False,
        movej_t=2.0,
        delay=0.0,
        random_state=rand_state
    )
    env = NormalizedEnv(env)

    # Start environment processes
    env.start()

    # Create baselines TRPO policy function
    sess = U.single_threaded_session()
    sess.__enter__()

    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                         hid_size=64, num_hid_layers=2)

    # Create and start plotting process
    plot_running = Value('i', 1)
    shared_returns = Manager().dict({
        "write_lock": False,
        "episodic_returns": [],
        "episodic_lengths": [],
    })
    # Spawn plotting process
    pp = Process(target=plot_ur5_reacher, args=(env, 2048, shared_returns, plot_running))
    pp.start()

    # Create callback function for logging data from baselines TRPO learn
    kindred_callback = create_callback(shared_returns)

    # Train baselines TRPO
    learn(env,
          policy_fn,
          max_timesteps=200000,
          timesteps_per_batch=2048,
          max_kl=0.05,
          cg_iters=10,
          cg_damping=0.1,
          vf_iters=5,
          vf_stepsize=0.001,
          gamma=0.995,
          lam=0.995,
          callback=kindred_callback)

    # Safely terminate plotting process
    plot_running.value = 0  # shut down plotting process
    time.sleep(2)
    pp.join()

    env.close()
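# plot_ur5_reacher and the other plot_* targets are defined elsewhere; the
# sketch below shows the assumed shape of such a process: poll the shared dict
# until the main process clears plot_running, redrawing the learning curve
# whenever the callback is not mid-write. Function and axis names here are
# illustrative only.
def plot_returns_sketch(env, batch_size, shared_returns, plot_running):
    import copy
    import matplotlib.pyplot as plt
    plt.ion()
    fig, ax = plt.subplots()
    while plot_running.value:
        if not shared_returns['write_lock']:
            returns = copy.deepcopy(shared_returns['episodic_returns'])
            ax.clear()
            ax.plot(returns)
            ax.set_xlabel('episode')
            ax.set_ylabel('return')
            plt.pause(0.001)
        time.sleep(1.0)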
def main():
    # Use fixed random state
    rand_state = np.random.RandomState(1).get_state()
    np.random.set_state(rand_state)
    tf_set_seeds(np.random.randint(1, 2**31 - 1))

    # Create DXL Reacher1D environment
    env = DxlReacher1DEnv(
        setup='dxl_gripper_default',
        idn=1,
        baudrate=1000000,
        obs_history=1,
        dt=0.04,
        gripper_dt=0.01,
        rllab_box=False,
        episode_length_step=None,
        episode_length_time=2,
        max_torque_mag=100,
        control_type='torque',
        target_type='position',
        reset_type='zero',
        reward_type='linear',
        use_ctypes_driver=True,
        random_state=rand_state
    )

    # The outputs of the policy function are sampled from a Gaussian. However,
    # the actions in terms of torque commands are in the range
    # [-max_torque_mag, max_torque_mag]. The NormalizedEnv wrapper scales actions
    # accordingly. By default, it does not normalize observations or rewards.
    env = NormalizedEnv(env)

    # Start environment processes
    env.start()

    # Create baselines TRPO policy function
    sess = U.single_threaded_session()
    sess.__enter__()

    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                         hid_size=32, num_hid_layers=2)

    # Create and start plotting process
    plot_running = Value('i', 1)
    shared_returns = Manager().dict({
        "write_lock": False,
        "episodic_returns": [],
        "episodic_lengths": [],
    })
    # Spawn plotting process
    pp = Process(target=plot_dxl_reacher, args=(env, 2048, shared_returns, plot_running))
    pp.start()

    # Create callback function for logging data from baselines TRPO learn
    kindred_callback = create_callback(shared_returns)

    # Train baselines TRPO
    learn(
        env,
        policy_fn,
        max_timesteps=50000,
        timesteps_per_batch=2048,
        max_kl=0.05,
        cg_iters=10,
        cg_damping=0.1,
        vf_iters=5,
        vf_stepsize=0.001,
        gamma=0.995,
        lam=0.995,
        callback=kindred_callback,
    )

    # Safely terminate plotting process
    plot_running.value = 0  # shut down plotting process
    time.sleep(2)
    pp.join()

    # Shutdown the environment
    env.close()
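# A minimal sketch of the action rescaling the NormalizedEnv comment above
# describes, assuming the policy emits actions in [-1, 1]; for this script the
# bounds would be [-max_torque_mag, max_torque_mag]. The wrapper's actual
# implementation may differ in details.
def rescale_action(action, low, high):
    action = np.clip(action, -1.0, 1.0)
    return low + (action + 1.0) * 0.5 * (high - low)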
def main():
    # Optionally use a pretrained model
    load_model_data = None
    hidden_sizes = (32, 32)
    if len(sys.argv) > 1:
        load_model_path = sys.argv[1]
        load_model_data = pkl.load(open(load_model_path, 'rb'))
        hidden_sizes = load_model_data['hidden_sizes']

    # Use fixed random state
    rand_state = np.random.RandomState(1).get_state()
    np.random.set_state(rand_state)
    tf_set_seeds(np.random.randint(1, 2**31 - 1))

    # Create the Create2 mover environment
    env = Create2MoverEnv(90, port='/dev/ttyUSB0', obs_history=1, dt=0.15,
                          random_state=rand_state)
    env = NormalizedEnv(env)

    # Start environment processes
    env.start()

    # Create baselines TRPO policy function
    sess = U.single_threaded_session()
    sess.__enter__()

    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                         hid_size=hidden_sizes[0], num_hid_layers=len(hidden_sizes))

    # Create and start plotting process
    plot_running = Value('i', 1)
    shared_returns = Manager().dict({
        "write_lock": False,
        "episodic_returns": [],
        "episodic_lengths": [],
    })
    # Spawn plotting process
    pp = Process(target=plot_create2_mover, args=(env, 2048, shared_returns, plot_running))
    pp.start()

    # Create callback function for logging data from baselines TRPO learn
    kindred_callback = create_callback(shared_returns, load_model_data)

    # Train baselines TRPO
    learn(env,
          policy_fn,
          max_timesteps=40000,
          timesteps_per_batch=2048,
          max_kl=0.05,
          cg_iters=10,
          cg_damping=0.1,
          vf_iters=5,
          vf_stepsize=0.001,
          gamma=0.995,
          lam=0.995,
          callback=kindred_callback)

    # Safely terminate plotting process
    plot_running.value = 0  # shut down plotting process
    time.sleep(2)
    pp.join()

    env.close()
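# The pretrained-model file accepted as sys.argv[1] above is a pickle; only its
# 'hidden_sizes' key is read directly in main(), and the rest is assumed to be
# consumed by create_callback when restoring policy weights. A hypothetical
# writer for such a file:
def save_model_data_sketch(path, hidden_sizes=(32, 32)):
    model_data = {
        'hidden_sizes': hidden_sizes,
        # ... plus trained variables, as saved by the training callback
    }
    with open(path, 'wb') as f:
        pkl.dump(model_data, f)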
def main():
    # Use fixed random state
    rand_state = np.random.RandomState(1).get_state()
    np.random.set_state(rand_state)

    # Create UR10 Reacher2D environment with RealSense camera observations
    env = ReacherEnvWithRealSense(
        setup="UR10_default",
        camera_hosts=('localhost',),
        camera_ports=(5000,),
        camera_res=(3, 480, 640),
        host=None,
        dof=2,
        control_type="velocity",
        target_type="position",
        reset_type="zero",
        reward_type="precision",
        derivative_type="none",
        deriv_action_max=5,
        first_deriv_max=2,
        accel_max=1.4,
        speed_max=1.0,
        speedj_a=1.4,
        episode_length_time=4.0,
        episode_length_step=None,
        actuation_sync_period=1,
        dt=0.5,
        run_mode="multiprocess",
        rllab_box=False,
        movej_t=2.0,
        delay=0.0,
        random_state=rand_state
    )
    env = NormalizedEnv(env)

    # Start environment processes
    env.start()

    # Create and start plotting process
    plot_running = Value('i', 1)
    shared_returns = Manager().dict({
        "write_lock": False,
        "episodic_returns": [],
        "episodic_lengths": [],
    })
    # Spawn plotting process
    pp = Process(target=plot_ur10_reacher, args=(env, 2048, shared_returns, plot_running))
    pp.start()

    # Create callback function for logging data (unused in this random-action test)
    kindred_callback = create_callback(shared_returns)

    # Run short episodes with random actions instead of training
    for episode in range(10):
        print(f"Episode: {episode + 1}")
        done = False
        timestep = 0
        curr_obs = env.reset()
        while not done:
            # Resample a random action every third step
            if timestep % 3 == 0:
                action = np.random.normal(scale=0.1, size=(2,))
                print(action)
            next_obs, reward, done, _ = env.step(action)
            timestep += 1
            curr_obs = next_obs
            # Truncate the episode after 15 steps
            if timestep == 15:
                done = True

    # Safely terminate plotting process
    plot_running.value = 0  # shut down plotting process
    time.sleep(2)
    pp.join()

    env.close()
def main():
    # Use fixed random state
    rand_state = np.random.RandomState(1).get_state()
    np.random.set_state(rand_state)
    tf_set_seeds(np.random.randint(1, 2**31 - 1))

    # Create Sawyer reach environment
    kwargs = {}  # additional environment keyword arguments, none by default
    env = SawyerReachXYZEnv(
        target_goal=(0, 0, 0),
        indicator_threshold=.05,
        reward_type='hand_distance',
        action_mode='torque',
        use_safety_box=True,
        torque_action_scale=1,
        **kwargs
    )
    env = NormalizedEnv(env)

    # Start environment processes
    env.start()

    # Create baselines TRPO policy function
    sess = U.single_threaded_session()
    sess.__enter__()

    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                         hid_size=64, num_hid_layers=2)

    # Create and start plotting process
    plot_running = Value('i', 1)
    shared_returns = Manager().dict({
        "write_lock": False,
        "episodic_returns": [],
        "episodic_lengths": [],
    })
    # Spawn plotting process
    pp = Process(target=plot_ur5_reacher, args=(env, 2048, shared_returns, plot_running))
    pp.start()

    # Create callback function for logging data from baselines TRPO learn
    kindred_callback = create_callback(shared_returns)

    # Train baselines TRPO
    learn(env,
          policy_fn,
          max_timesteps=200000,
          timesteps_per_batch=2048,
          max_kl=0.05,
          cg_iters=10,
          cg_damping=0.1,
          vf_iters=5,
          vf_stepsize=0.001,
          gamma=0.995,
          lam=0.995,
          callback=kindred_callback)

    # Safely terminate plotting process
    plot_running.value = 0  # shut down plotting process
    time.sleep(2)
    pp.join()

    env.close()
def main():
    # Optionally use a pretrained model
    save_model_path = None
    load_model_path = None
    load_trained_model = False
    save_model_basepath = None  # initialized so create_callback below never sees an undefined name
    hidden_sizes = (64, 64, 64)
    if len(sys.argv) > 2:  # load model
        load_trained_model = True
        save_model_path = sys.argv[1]  # saved/uniform/X/Y/Z/
        os.makedirs(save_model_path, exist_ok=True)
        run_dirs = os.listdir(save_model_path)
        run_dir = save_model_path + 'run_' + str(len(run_dirs) + 1)
        os.makedirs(run_dir, exist_ok=True)
        os.makedirs(run_dir + '/models', exist_ok=True)
        os.makedirs(run_dir + '/data', exist_ok=True)
        save_model_basepath = run_dir + '/'
        if load_trained_model:  # loading true
            load_model_path = sys.argv[2]  # saved/uniform/X/Y/Z/run_1/model*

    # use fixed random state
    # rand_state = np.random.RandomState(1).get_state()
    # np.random.set_state(rand_state)
    tf_set_seeds(np.random.randint(1, 2**31 - 1))

    # Create the Create2 docker environment.
    # Restart-state distributions tried in earlier runs:
    # distro = np.array([0, 1, 0, 0, 0, 0, 0, 0, 0])
    # distro = np.array([0.575, 0.425, 0, 0, 0, 0, 0, 0, 0])
    # distro = np.array([0.25, 0.2, 0.55, 0, 0, 0, 0, 0, 0])
    # distro = np.array([0.1, 0.1, 0.25, 0.55, 0, 0, 0, 0, 0])
    # FAILED: distro = np.array([0.05, 0.05, 0.15, 0.275, 0, 0, 0, 0.475, 0])
    # FAILED: distro = np.array([0.05, 0.05, 0.15, 0.275, 0, 0.475, 0, 0, 0])
    # FAILED: distro = np.array([0.10, 0.05, 0.10, 0.35, 0, 0.4, 0, 0, 0])
    # distro = np.array([0.10, 0.05, 0.10, 0.375, 0.375, 0, 0, 0, 0])  # run 1
    # distro = np.array([0.06, 0.03, 0.06, 0.425, 0.425, 0, 0, 0, 0])  # run 2
    # distro = np.array([0.025, 0.05, 0.05, 0.25, 0.25, 0.375, 0, 0, 0])  # part 1, first 100 episodes
    # OK: distro = np.array([0.05, 0.025, 0.05, 0.225, 0.225, 0.425, 0, 0, 0])
    # distro = np.array([0.025, 0.02, 0.025, 0.1375, 0.1375, 0.3275, 0.3275, 0, 0])
    # FAILED: distro = np.array([0.015, 0.015, 0.02, 0.06, 0.09, 0.35, 0.45, 0, 0])
    distro = np.array([0, 1, 0, 0, 0, 0, 0, 0, 0])
    env = Create2DockerEnv(30, distro, port='/dev/ttyUSB0', ir_window=20,
                           ir_history=1, obs_history=1, dt=0.045)  # random_state=rand_state
    env = NormalizedEnv(env)

    # Start environment processes
    env.start()

    # Create baselines PPO policy function
    sess = U.single_threaded_session()
    sess.__enter__()

    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                         hid_size=hidden_sizes[0], num_hid_layers=len(hidden_sizes))

    # Plotting process disabled for this experiment
    # plot_running = Value('i', 1)
    shared_returns = Manager().dict({
        "write_lock": False,
        "episodic_returns": [],
        "episodic_lengths": [],
        "episodic_ss": [],
    })
    # pp = Process(target=plot_create2_docker, args=(env, 2048, shared_returns, plot_running))
    # pp.start()

    # Create callback function for logging data from baselines PPO learn
    kindred_callback = create_callback(shared_returns, save_model_basepath, load_model_path)

    # Train baselines PPO
    model = learn(
        env,
        policy_fn,
        max_timesteps=100000,
        timesteps_per_actorbatch=675,  # 512
        clip_param=0.2,
        entcoeff=0.0,
        optim_epochs=10,
        optim_stepsize=0.00005,
        optim_batchsize=16,
        gamma=0.96836,
        lam=0.99944,
        schedule="linear",
        callback=kindred_callback,
    )

    # Plotter shutdown disabled along with the plotting process
    # plot_running.value = 0
    # time.sleep(2)
    # pp.join()
    env.close()