# Imports assumed for this listing (module paths follow the public SenseAct
# and OpenAI baselines repositories; adjust to your local installation).
# plot_create2_mover and create_callback are helper functions provided with
# the example; a hedged sketch of create_callback follows the listing.
import time
import baselines.common.tf_util as U
from multiprocessing import Process, Value, Manager
from baselines.trpo_mpi.trpo_mpi import learn
from baselines.ppo1.mlp_policy import MlpPolicy
from senseact.envs.create2.create2_mover_env import Create2MoverEnv
from senseact.utils import NormalizedEnv


def main():
    # Create the Create2 mover environment
    env = Create2MoverEnv(90, port='/dev/ttyUSB0', obs_history=1, dt=0.15)
    env = NormalizedEnv(env)

    # Start environment processes
    env.start()

    # Create baselines TRPO policy function
    sess = U.single_threaded_session()
    sess.__enter__()

    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                         hid_size=32, num_hid_layers=2)

    # Create shared variables for the plotting process
    plot_running = Value('i', 1)
    shared_returns = Manager().dict({
        "write_lock": False,
        "episodic_returns": [],
        "episodic_lengths": [],
    })

    # Spawn plotting process
    pp = Process(target=plot_create2_mover,
                 args=(env, 2048, shared_returns, plot_running))
    pp.start()

    # Create callback function for logging data from baselines TRPO learn
    kindred_callback = create_callback(shared_returns)

    # Train baselines TRPO
    learn(env, policy_fn,
          max_timesteps=40000,
          timesteps_per_batch=2048,
          max_kl=0.05,
          cg_iters=10,
          cg_damping=0.1,
          vf_iters=5,
          vf_stepsize=0.001,
          gamma=0.995,
          lam=0.995,
          callback=kindred_callback)

    # Safely terminate plotter process
    plot_running.value = 0  # shut down plotting process
    time.sleep(2)
    pp.join()

    env.close()
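# Below is a minimal, hypothetical reconstruction of the create_callback
# helper used above (not the exact SenseAct implementation). It relies only
# on documented baselines behavior: learn() invokes callback(locals(),
# globals()) once per iteration, and the rollout segment "seg" in those
# locals carries the finished episodes' returns ("ep_rets") and lengths
# ("ep_lens").
def create_callback(shared_returns):
    def kindred_callback(locals_, globals_):
        seg = locals_.get('seg')
        if seg is None:
            return  # first iteration: no rollout segment yet
        # Copy, extend, and write back under a simple write_lock flag so the
        # plotting process never reads a half-updated dict.
        shared_returns['write_lock'] = True
        copied = dict(shared_returns)
        copied['episodic_returns'] += list(seg['ep_rets'])
        copied['episodic_lengths'] += list(seg['ep_lens'])
        copied['write_lock'] = False
        shared_returns.update(copied)
    return kindred_callback


# Standard script entry point (assumed; each listing here is a separate script).
if __name__ == '__main__':
    main()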
# Imports assumed for this listing (module paths follow the public SenseAct
# and OpenAI baselines repositories). plot_dxl_reacher and create_callback
# are helper functions provided with the example; a hedged sketch of the
# plotter follows the listing.
import time
import numpy as np
import baselines.common.tf_util as U
from multiprocessing import Process, Value, Manager
from baselines.trpo_mpi.trpo_mpi import learn
from baselines.ppo1.mlp_policy import MlpPolicy
from senseact.envs.dxl.dxl_reacher_env import DxlReacher1DEnv
from senseact.utils import tf_set_seeds, NormalizedEnv


def main():
    # Use a fixed random state for reproducibility
    rand_state = np.random.RandomState(1).get_state()
    np.random.set_state(rand_state)
    tf_set_seeds(np.random.randint(1, 2**31 - 1))

    # Create the DXL Reacher1D environment
    env = DxlReacher1DEnv(setup='dxl_gripper_default',
                          idn=1,
                          baudrate=1000000,
                          obs_history=1,
                          dt=0.04,
                          gripper_dt=0.01,
                          rllab_box=False,
                          episode_length_step=None,
                          episode_length_time=2,
                          max_torque_mag=100,
                          control_type='torque',
                          target_type='position',
                          reset_type='zero',
                          reward_type='linear',
                          use_ctypes_driver=True,
                          random_state=rand_state)

    # The outputs of the policy function are sampled from a Gaussian, whereas
    # the actions in terms of torque commands lie in the range
    # [-max_torque_mag, max_torque_mag]. The NormalizedEnv wrapper scales the
    # actions accordingly. By default, it does not normalize observations or
    # rewards.
    env = NormalizedEnv(env)

    # Start environment processes
    env.start()

    # Create baselines TRPO policy function
    sess = U.single_threaded_session()
    sess.__enter__()

    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                         hid_size=32, num_hid_layers=2)

    # Create and start plotting process
    plot_running = Value('i', 1)
    shared_returns = Manager().dict({
        "write_lock": False,
        "episodic_returns": [],
        "episodic_lengths": [],
    })

    # Spawn plotting process
    pp = Process(target=plot_dxl_reacher,
                 args=(env, 2048, shared_returns, plot_running))
    pp.start()

    # Create callback function for logging data from baselines TRPO learn
    kindred_callback = create_callback(shared_returns)

    # Train baselines TRPO
    learn(env, policy_fn,
          max_timesteps=50000,
          timesteps_per_batch=2048,
          max_kl=0.05,
          cg_iters=10,
          cg_damping=0.1,
          vf_iters=5,
          vf_stepsize=0.001,
          gamma=0.995,
          lam=0.995,
          callback=kindred_callback)

    # Safely terminate plotter process
    plot_running.value = 0  # shut down plotting process
    time.sleep(2)
    pp.join()

    # Shut down the environment
    env.close()
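# A minimal, hypothetical sketch of a plotting-process target in the style of
# plot_dxl_reacher above (the real helper ships with the SenseAct examples;
# the function name here is illustrative). It polls the shared dict and
# redraws the learning curve until the main process clears plot_running.
import matplotlib.pyplot as plt


def plot_returns(env, batch_size, shared_returns, plot_running):
    # env and batch_size mirror the real helper's signature; this sketch
    # does not use them.
    fig, ax = plt.subplots()
    plt.ion()
    while plot_running.value:
        if not shared_returns['write_lock'] and shared_returns['episodic_returns']:
            ax.clear()
            ax.plot(shared_returns['episodic_returns'])
            ax.set_xlabel('Episode')
            ax.set_ylabel('Return')
            plt.pause(0.5)  # render and yield to the GUI event loop
        else:
            time.sleep(0.1)
    plt.close(fig)


# Standard script entry point (assumed).
if __name__ == '__main__':
    main()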
# Imports assumed for this listing (module paths follow the public SenseAct
# and OpenAI baselines repositories). plot_create2_mover and create_callback
# are helper functions provided with the example; definitions not shown here.
import sys
import time
import pickle as pkl
import numpy as np
import baselines.common.tf_util as U
from multiprocessing import Process, Value, Manager
from baselines.trpo_mpi.trpo_mpi import learn
from baselines.ppo1.mlp_policy import MlpPolicy
from senseact.envs.create2.create2_mover_env import Create2MoverEnv
from senseact.utils import tf_set_seeds, NormalizedEnv


def main():
    # Optionally use a pretrained model
    load_model_data = None
    hidden_sizes = (32, 32)
    if len(sys.argv) > 1:
        load_model_path = sys.argv[1]
        with open(load_model_path, 'rb') as f:
            load_model_data = pkl.load(f)
        hidden_sizes = load_model_data['hidden_sizes']

    # Use a fixed random state for reproducibility
    rand_state = np.random.RandomState(1).get_state()
    np.random.set_state(rand_state)
    tf_set_seeds(np.random.randint(1, 2**31 - 1))

    # Create the Create2 mover environment
    env = Create2MoverEnv(90, port='/dev/ttyUSB0', obs_history=1, dt=0.15,
                          random_state=rand_state)
    env = NormalizedEnv(env)

    # Start environment processes
    env.start()

    # Create baselines TRPO policy function
    sess = U.single_threaded_session()
    sess.__enter__()

    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                         hid_size=hidden_sizes[0],
                         num_hid_layers=len(hidden_sizes))

    # Create and start plotting process
    plot_running = Value('i', 1)
    shared_returns = Manager().dict({
        "write_lock": False,
        "episodic_returns": [],
        "episodic_lengths": [],
    })

    # Spawn plotting process
    pp = Process(target=plot_create2_mover,
                 args=(env, 2048, shared_returns, plot_running))
    pp.start()

    # Create callback function for logging data from baselines TRPO learn
    kindred_callback = create_callback(shared_returns, load_model_data)

    # Train baselines TRPO
    learn(env, policy_fn,
          max_timesteps=40000,
          timesteps_per_batch=2048,
          max_kl=0.05,
          cg_iters=10,
          cg_damping=0.1,
          vf_iters=5,
          vf_stepsize=0.001,
          gamma=0.995,
          lam=0.995,
          callback=kindred_callback)

    # Safely terminate plotter process
    plot_running.value = 0  # shut down plotting process
    time.sleep(2)
    pp.join()

    env.close()
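# Standard script entry point (assumed). The script takes an optional path to
# a pickled model dict containing at least a 'hidden_sizes' entry, e.g.:
#   python this_script.py trained_model.pkl   (file names are illustrative)
if __name__ == '__main__':
    main()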
# Imports assumed for this listing (module paths follow the public SenseAct
# and OpenAI baselines repositories; note this script trains with baselines
# PPO rather than TRPO). create_callback is a helper provided with the
# example; definition not shown here.
import os
import sys
import time
import numpy as np
import baselines.common.tf_util as U
from multiprocessing import Process, Value, Manager
from baselines.ppo1.pposgd_simple import learn
from baselines.ppo1.mlp_policy import MlpPolicy
from senseact.envs.create2.create2_docker_env import Create2DockerEnv
from senseact.utils import tf_set_seeds, NormalizedEnv


def main():
    # Optionally save to, and resume from, a pretrained model
    save_model_path = None
    save_model_basepath = None  # stays None when no run directory is set up
    load_model_path = None
    load_trained_model = False
    hidden_sizes = (64, 64, 64)
    if len(sys.argv) > 2:  # set up a run directory and load a trained model
        load_trained_model = True
        save_model_path = sys.argv[1]  # saved/uniform/X/Y/Z/
        os.makedirs(save_model_path, exist_ok=True)
        run_dirs = os.listdir(save_model_path)
        save_model_basepath = save_model_path + 'run_' + str(len(run_dirs) + 1) + '/'
        os.makedirs(save_model_basepath, exist_ok=True)
        os.makedirs(save_model_basepath + 'models', exist_ok=True)
        os.makedirs(save_model_basepath + 'data', exist_ok=True)
        if load_trained_model:
            load_model_path = sys.argv[2]  # saved/uniform/X/Y/Z/run_1/model*

    # Use a fixed random state (disabled for this script)
    # rand_state = np.random.RandomState(1).get_state()
    # np.random.set_state(rand_state)
    tf_set_seeds(np.random.randint(1, 2**31 - 1))

    # Create the Create2 docker environment. The distro values tried in
    # earlier runs are kept below for reference:
    # distro = np.array([0, 1, 0, 0, 0, 0, 0, 0, 0])
    # distro = np.array([0.575, 0.425, 0, 0, 0, 0, 0, 0, 0])
    # distro = np.array([0.25, 0.2, 0.55, 0, 0, 0, 0, 0, 0])
    # distro = np.array([0.1, 0.1, 0.25, 0.55, 0, 0, 0, 0, 0])
    # FAILED: distro = np.array([0.05, 0.05, 0.15, 0.275, 0, 0, 0, 0.475, 0])
    # FAILED: distro = np.array([0.05, 0.05, 0.15, 0.275, 0, 0.475, 0, 0, 0])
    # FAILED: distro = np.array([0.10, 0.05, 0.10, 0.35, 0, 0.4, 0, 0, 0])
    # run 1: distro = np.array([0.10, 0.05, 0.10, 0.375, 0.375, 0, 0, 0, 0])
    # run 2: distro = np.array([0.06, 0.03, 0.06, 0.425, 0.425, 0, 0, 0, 0])
    # part 1, first 100 episodes:
    #   distro = np.array([0.025, 0.05, 0.05, 0.25, 0.25, 0.375, 0, 0, 0])
    # OK: distro = np.array([0.05, 0.025, 0.05, 0.225, 0.225, 0.425, 0, 0, 0])
    # distro = np.array([0.025, 0.02, 0.025, 0.1375, 0.1375, 0.3275, 0.3275, 0, 0])
    # FAILED: distro = np.array([0.015, 0.015, 0.02, 0.06, 0.09, 0.35, 0.45, 0, 0])
    distro = np.array([0, 1, 0, 0, 0, 0, 0, 0, 0])
    env = Create2DockerEnv(30, distro, port='/dev/ttyUSB0', ir_window=20,
                           ir_history=1, obs_history=1, dt=0.045)
    # random_state=rand_state)  # disabled along with the fixed random state
    env = NormalizedEnv(env)

    # Start environment processes
    env.start()

    # Create baselines PPO policy function
    sess = U.single_threaded_session()
    sess.__enter__()

    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                         hid_size=hidden_sizes[0],
                         num_hid_layers=len(hidden_sizes))

    # Shared variables for logging (the plotting process is disabled here)
    # plot_running = Value('i', 1)
    shared_returns = Manager().dict({
        "write_lock": False,
        "episodic_returns": [],
        "episodic_lengths": [],
        "episodic_ss": [],
    })

    # Spawn plotting process (disabled)
    # pp = Process(target=plot_create2_docker,
    #              args=(env, 2048, shared_returns, plot_running))
    # pp.start()

    # Create callback function for logging data from baselines PPO learn
    kindred_callback = create_callback(shared_returns, save_model_basepath,
                                       load_model_path)

    # Train baselines PPO
    model = learn(env, policy_fn,
                  max_timesteps=100000,
                  timesteps_per_actorbatch=675,  # previously 512
                  clip_param=0.2,
                  entcoeff=0.0,
                  optim_epochs=10,
                  optim_stepsize=0.00005,
                  optim_batchsize=16,
                  gamma=0.96836,
                  lam=0.99944,
                  schedule="linear",
                  callback=kindred_callback)

    # Safely terminate plotter process (disabled along with the plotter)
    # plot_running.value = 0  # shut down plotting process
    # time.sleep(2)
    # pp.join()

    env.close()
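# Standard script entry point (assumed). When resuming from a trained model,
# the script expects a save directory followed by a model path, e.g.:
#   python this_script.py saved/uniform/X/Y/Z/ saved/uniform/X/Y/Z/run_1/model*
if __name__ == '__main__':
    main()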