def robotic_env():
    rand_state = np.random.RandomState(1).get_state()
    np.random.set_state(rand_state)
    tf_set_seeds(np.random.randint(1, 2**31 - 1))
    env = ReacherEnv(
        setup="UR5_default",
        host='169.254.39.68',
        dof=2,
        control_type="velocity",
        target_type="position",
        reset_type="zero",
        reward_type="precision",
        derivative_type="none",
        deriv_action_max=5,
        first_deriv_max=2,
        accel_max=1.4,
        speed_max=0.3,
        speedj_a=1.4,
        episode_length_time=4.0,
        episode_length_step=None,
        actuation_sync_period=1,
        dt=0.04,
        # run_mode="multiprocess",
        run_mode='singlethread',
        rllab_box=False,
        movej_t=2.0,
        delay=0.0,
        random_state=rand_state
    )
    env = NormalizedEnv(env)
    env.start()
    return env
def main():
    # Create the Create2 mover environment
    env = Create2MoverEnv(90, port='/dev/ttyUSB0', obs_history=1, dt=0.15)
    env = NormalizedEnv(env)

    # Start environment processes
    env.start()

    # Create baselines TRPO policy function
    sess = U.single_threaded_session()
    sess.__enter__()

    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                         hid_size=32, num_hid_layers=2)

    # Create and start plotting process
    plot_running = Value('i', 1)
    shared_returns = Manager().dict({
        "write_lock": False,
        "episodic_returns": [],
        "episodic_lengths": [],
    })

    # Spawn plotting process
    pp = Process(target=plot_create2_mover,
                 args=(env, 2048, shared_returns, plot_running))
    pp.start()

    # Create callback function for logging data from baselines TRPO learn
    kindred_callback = create_callback(shared_returns)

    # Train baselines TRPO
    learn(env, policy_fn,
          max_timesteps=40000,
          timesteps_per_batch=2048,
          max_kl=0.05,
          cg_iters=10,
          cg_damping=0.1,
          vf_iters=5,
          vf_stepsize=0.001,
          gamma=0.995,
          lam=0.995,
          callback=kindred_callback)

    # Safely terminate plotter process
    plot_running.value = 0  # shut down plotting process
    time.sleep(2)
    pp.join()
    env.close()
def main(): # use fixed random state rand_state = np.random.RandomState(1).get_state() np.random.set_state(rand_state) tf_set_seeds(np.random.randint(1, 2**31 - 1)) # Create UR5 Reacher2D environment env = ReacherEnv(setup="UR5_6dof", host=None, dof=6, control_type="velocity", target_type="position", reset_type="zero", reward_type="precision", derivative_type="none", deriv_action_max=5, first_deriv_max=2, accel_max=1.4, speed_max=0.3, speedj_a=1.4, episode_length_time=4.0, episode_length_step=None, actuation_sync_period=1, dt=0.04, run_mode="multiprocess", rllab_box=False, movej_t=2.0, delay=0.0, random_state=rand_state) env = NormalizedEnv(env) # Start environment processes env.start() # Create baselines TRPO policy function sess = U.single_threaded_session() sess.__enter__() def policy_fn(name, ob_space, ac_space): return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space, hid_size=64, num_hid_layers=2) # Create and start plotting process plot_running = Value('i', 1) shared_returns = Manager().dict({ "write_lock": False, "episodic_returns": [], "episodic_lengths": [], }) # Spawn plotting process pp = Process(target=plot_ur5_reacher, args=(env, 2048, shared_returns, plot_running)) pp.start() # Create callback function for logging data from baselines TRPO learn kindred_callback = create_callback(shared_returns) # Train baselines TRPO learn(env, policy_fn, max_timesteps=200000, timesteps_per_batch=2048, max_kl=0.05, cg_iters=10, cg_damping=0.1, vf_iters=5, vf_stepsize=0.001, gamma=0.995, lam=0.995, callback=kindred_callback) # Safely terminate plotter process plot_running.value = 0 # shutdown ploting process time.sleep(2) pp.join() env.close()
def normal_test(): # use fixed random state rand_state = np.random.RandomState(1).get_state() np.random.set_state(rand_state) tf_set_seeds(np.random.randint(1, 2**31 - 1)) # Create UR5 Reacher2D environment env = ReacherEnv( setup="UR10_6dof", host=None, dof=6, control_type="velocity", target_type="position", reset_type="none", reward_type="precision", derivative_type="none", deriv_action_max=5, first_deriv_max=2, accel_max=1.4, # was 1.4 speed_max=0.3, # was 0.3 speedj_a=1.4, episode_length_time=4.0, episode_length_step=None, actuation_sync_period=1, dt=0.04, run_mode="multiprocess", rllab_box=False, movej_t=2.0, delay=0.0, random_state=rand_state ) env = NormalizedEnv(env) # Start environment processes env.start() # Create baselines TRPO policy function sess = U.single_threaded_session() sess.__enter__() # Load previously trained model if it exists # No longer needed """def policy_fn(name, ob_space, ac_space): return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space, hid_size=64, num_hid_layers=2)""" # Create and start plotting process plot_running = Value('i', 1) shared_returns = Manager().dict({"write_lock": False, "episodic_returns": [], "episodic_lengths": [], }) # Spawn plotting process pp = Process(target=plot_ur5_reacher, args=(env, 2048, shared_returns, plot_running)) pp.start() # Create callback function for logging data from baselines TRPO learn kindred_callback = create_callback(shared_returns) # Train baselines TRPO run_policy(network='mlp', num_layers=2, # these are network_kwargs for the MLP network num_hidden=64, env=env, total_timesteps=10000, #Originally 200,000 timesteps_per_batch=2048, callback=kindred_callback, load_path='saved_policies/trpo03/trpo03', ) # Safely terminate plotter process plot_running.value = 0 # shutdown ploting process time.sleep(2) pp.join() env.close()
def simple_circle_test(num_eps, num_iters, policy_path, csv_path, move_vel=0.5, radius=0.15, plane='xy'): # use fixed random state rand_state = np.random.RandomState(1).get_state() np.random.set_state(rand_state) tf_set_seeds(np.random.randint(1, 2**31 - 1)) # set up coordination between eps per iteration and num_test episode_length_time = 2*np.pi / move_vel #each ep is one full rotation of the circle dt = 0.04 timesteps_per_ep = int(episode_length_time / dt) timesteps_per_iter = int(timesteps_per_ep * num_eps) timesteps_total = int(timesteps_per_iter * num_iters) # Create GridTest environment env = MovingPointEnv( setup="UR10_6dof", host=None, dof=6, control_type="velocity", reset_type="zero", reward_type="precision", derivative_type="none", deriv_action_max=5, first_deriv_max=2, accel_max=1.4, # was 1.4 speed_max=0.3, # was 0.3 speedj_a=1.4, episode_length_time=episode_length_time, episode_length_step=None, actuation_sync_period=1, dt=dt, run_mode="multiprocess", rllab_box=False, movej_t=2.0, delay=0.0, random_state=rand_state, move_shape='circle', # circle or line move_vel=move_vel, # velocity of moving point in m/s or rad/s circle_radius=radius, circle_plane=plane, # plane which circle is on (xy, yz, xz) ) env = NormalizedEnv(env) # Start environment processes env.start() # Create baselines TRPO policy function sess = U.single_threaded_session() sess.__enter__() # Create and start plotting process plot_running = Value('i', 1) shared_returns = Manager().dict({"write_lock": False, "episodic_returns": [], "episodic_lengths": [], }) builtins.shared_returns = shared_returns callback = create_moving_point_callback(shared_returns, csv_path) # Spawn plotting process pp = Process(target=plot_ur5_reacher, args=(env, timesteps_per_iter, shared_returns, plot_running)) pp.start() # Run TRPO policy run_policy(network='mlp', num_layers=2, # these are network_kwargs for the MLP network num_hidden=64, env=env, total_timesteps=timesteps_total, #Originally 200,000 timesteps_per_batch=timesteps_per_iter, callback=callback, load_path=policy_path ) # Safely terminate plotter process plot_running.value = 0 # shutdown plotting process time.sleep(2) pp.join() env.close()
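# --- Batch sizing check (illustrative sketch, not part of the original script) ---
# simple_circle_test() above ties episode length to one full rotation of the
# circle: episode_length_time = 2*pi / move_vel, then converts that into
# timesteps per episode, per iteration, and in total. The num_eps/num_iters
# values below are assumptions for illustration only.
import numpy as np

def _circle_test_sizing(move_vel=0.5, dt=0.04, num_eps=5, num_iters=4):
    episode_length_time = 2 * np.pi / move_vel        # ~12.57 s for move_vel=0.5
    timesteps_per_ep = int(episode_length_time / dt)  # 314 steps at dt=0.04
    timesteps_per_iter = timesteps_per_ep * num_eps   # 1570
    timesteps_total = timesteps_per_iter * num_iters  # 6280
    return timesteps_per_ep, timesteps_per_iter, timesteps_total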
def run_grid_test(x_points, y_points, z_points, num_test, policy_path): # use fixed random state rand_state = np.random.RandomState(1).get_state() np.random.set_state(rand_state) tf_set_seeds(np.random.randint(1, 2**31 - 1)) # set up coordination between eps per iteration and num_test episode_length_time = 4.0 dt = 0.04 timesteps_per_ep = episode_length_time / dt timesteps_per_batch = int(timesteps_per_ep * num_test) total_timesteps = timesteps_per_batch * x_points * y_points * z_points # Create GridTest environment env = GridTestEnv( setup="UR10_6dof", host=None, dof=6, control_type="velocity", reset_type="zero", reward_type="precision", derivative_type="none", deriv_action_max=5, first_deriv_max=2, accel_max=1.4, # was 1.4 speed_max=0.3, # was 0.3 speedj_a=1.4, episode_length_time=episode_length_time, episode_length_step=None, actuation_sync_period=1, dt=dt, run_mode="multiprocess", rllab_box=False, movej_t=2.0, delay=0.0, random_state=rand_state, x_points=x_points, y_points=y_points, z_points=z_points, num_test=num_test ) env = NormalizedEnv(env) # Start environment processes env.start() # Create baselines TRPO policy function sess = U.single_threaded_session() sess.__enter__() # Create and start plotting process plot_running = Value('i', 1) shared_returns = Manager().dict({"write_lock": False, "episodic_returns": [], "episodic_lengths": [], }) builtins.shared_returns = shared_returns # Spawn plotting process pp = Process(target=plot_ur5_reacher, args=(env, timesteps_per_batch, shared_returns, plot_running)) pp.start() # Run TRPO policy run_policy(network='mlp', num_layers=2, # these are network_kwargs for the MLP network num_hidden=64, env=env, total_timesteps=total_timesteps, #Originally 200,000 timesteps_per_batch=timesteps_per_batch, callback=grid_test_callback, load_path=policy_path ) # Safely terminate plotter process plot_running.value = 0 # shutdown plotting process time.sleep(2) pp.join() env.close()
def main():
    # use fixed random state
    rand_state = np.random.RandomState(1).get_state()
    np.random.set_state(rand_state)
    tf_set_seeds(np.random.randint(1, 2**31 - 1))

    # Create DXL Reacher1D environment
    env = DxlReacher1DEnv(setup='dxl_gripper_default',
                          idn=1,
                          baudrate=1000000,
                          obs_history=1,
                          dt=0.04,
                          gripper_dt=0.01,
                          rllab_box=False,
                          episode_length_step=None,
                          episode_length_time=2,
                          max_torque_mag=100,
                          control_type='torque',
                          target_type='position',
                          reset_type='zero',
                          reward_type='linear',
                          use_ctypes_driver=True,
                          random_state=rand_state)

    # The outputs of the policy function are sampled from a Gaussian. However,
    # the actions in terms of torque commands are in the range
    # [-max_torque_mag, max_torque_mag]. The NormalizedEnv wrapper scales the
    # actions accordingly (a standalone sketch of this scaling follows this
    # function). By default, it does not normalize observations or rewards.
    env = NormalizedEnv(env)

    # Start environment processes
    env.start()

    # Create baselines TRPO policy function
    sess = U.single_threaded_session()
    sess.__enter__()

    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                         hid_size=32, num_hid_layers=2)

    # Create and start plotting process
    plot_running = Value('i', 1)
    shared_returns = Manager().dict({
        "write_lock": False,
        "episodic_returns": [],
        "episodic_lengths": [],
    })

    # Spawn plotting process
    pp = Process(target=plot_dxl_reacher,
                 args=(env, 2048, shared_returns, plot_running))
    pp.start()

    # Create callback function for logging data from baselines TRPO learn
    kindred_callback = create_callback(shared_returns)

    # Train baselines TRPO
    learn(
        env,
        policy_fn,
        max_timesteps=50000,
        timesteps_per_batch=2048,
        max_kl=0.05,
        cg_iters=10,
        cg_damping=0.1,
        vf_iters=5,
        vf_stepsize=0.001,
        gamma=0.995,
        lam=0.995,
        callback=kindred_callback,
    )

    # Safely terminate plotter process
    plot_running.value = 0  # shut down plotting process
    time.sleep(2)
    pp.join()

    # Shutdown the environment
    env.close()
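# --- Action scaling sketch ----------------------------------------------------
# Illustrative only: this mirrors the behaviour described in the comment inside
# main() above (roughly unit-scale policy actions get mapped onto
# [-max_torque_mag, max_torque_mag]); it is not SenseAct's actual NormalizedEnv code.
import numpy as np

def scale_action(a_normalized, low, high):
    """Affinely map an action from [-1, 1] into [low, high]."""
    a_clipped = np.clip(a_normalized, -1.0, 1.0)
    return low + (a_clipped + 1.0) * 0.5 * (high - low)

# e.g. with max_torque_mag=100: scale_action(0.5, -100, 100) -> 50.0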
def main(): # optionally use a pretrained model load_model_data = None hidden_sizes = (32, 32) if len(sys.argv) > 1: load_model_path = sys.argv[1] load_model_data = pkl.load(open(load_model_path, 'rb')) hidden_sizes = load_model_data['hidden_sizes'] # use fixed random state rand_state = np.random.RandomState(1).get_state() np.random.set_state(rand_state) tf_set_seeds(np.random.randint(1, 2**31 - 1)) # Create the Create2 mover environment env = Create2MoverEnv(90, port='/dev/ttyUSB0', obs_history=1, dt=0.15, random_state=rand_state) env = NormalizedEnv(env) # Start environment processes env.start() # Create baselines TRPO policy function sess = U.single_threaded_session() sess.__enter__() def policy_fn(name, ob_space, ac_space): return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space, hid_size=hidden_sizes[0], num_hid_layers=len(hidden_sizes)) # Create and start plotting process plot_running = Value('i', 1) shared_returns = Manager().dict({ "write_lock": False, "episodic_returns": [], "episodic_lengths": [], }) # Spawn plotting process pp = Process(target=plot_create2_mover, args=(env, 2048, shared_returns, plot_running)) pp.start() # Create callback function for logging data from baselines TRPO learn kindred_callback = create_callback(shared_returns, load_model_data) # Train baselines TRPO learn(env, policy_fn, max_timesteps=40000, timesteps_per_batch=2048, max_kl=0.05, cg_iters=10, cg_damping=0.1, vf_iters=5, vf_stepsize=0.001, gamma=0.995, lam=0.995, callback=kindred_callback) # Safely terminate plotter process plot_running.value = 0 # shutdown ploting process time.sleep(2) pp.join() env.close()
class Environment(object): def __init__(self, run_dir, env_name, alg='mairlImit', train_mode=False, obs_mode='pixel'): """ :param run_dir: :param env_name: :param alg: 'mairlImit', 'mairlImit4Transfer', 'mairlTransfer', 'mgail' :param obs_mode: 'pixel', 'state' """ self.run_dir = run_dir self.name = env_name self.alg = alg self.obs_mode = obs_mode assert self.alg in [ 'mairlImit', 'mairlImit4Transfer', 'mairlTransfer', 'mgail' ], '{} is not Implemented!'.format(self.alg) self.train_mode = train_mode if env_name in ['UR5_Reacher']: rand_state = np.random.RandomState(1).get_state() env = ReacherEnv(setup="UR5_6dof", host="192.168.1.102", dof=6, control_type="velocity", target_type="position", reset_type="zero", reward_type="precision", derivative_type="none", deriv_action_max=5, first_deriv_max=2, accel_max=1.4, speed_max=0.3, speedj_a=1.4, episode_length_time=4.0, episode_length_step=None, actuation_sync_period=1, dt=0.04, run_mode="multiprocess", rllab_box=False, movej_t=2.0, delay=0.0, random_state=rand_state) self.gym = NormalizedEnv(env) self.gym.start() else: self.gym = gym.make(self.name) self.random_initialization = True self._connect() self._train_params() self.set_seed() def _step(self, action): action = np.squeeze(action) if action.shape == (): action = np.expand_dims(action, axis=0) # or use: action = 【action] self.t += 1 if isinstance(self.gym.action_space, spaces.Discrete): action = int(action) result = self.gym.step(action) self.state, self.reward, self.done, self.info = result[:4] if self.obs_mode == 'pixel': self.state = cv2.resize(self.gym.render('rgb_array'), dsize=(64, 64), interpolation=cv2.INTER_AREA) if self.random_initialization: if hasattr(self.gym, 'env') and hasattr(self.gym.env, 'data'): self.qpos, self.qvel = self.gym.env.data.qpos.flatten( ), self.gym.env.data.qvel.flatten() else: self.qpos, self.qvel = [], [] return np.float32(self.state), np.float32( self.reward), self.done, np.float32(self.qpos), np.float32( self.qvel) else: return np.float32(self.state), np.float32(self.reward), self.done def step(self, action, mode): qvel, qpos = [], [] if mode == 'tensorflow': if self.random_initialization: state, reward, done, qval, qpos = tf.py_func( self._step, inp=[action], Tout=[ tf.float32, tf.float32, tf.bool, tf.float32, tf.float32 ], name='env_step_func') else: state, reward, done = tf.py_func( self._step, inp=[action], Tout=[tf.float32, tf.float32, tf.bool], name='env_step_func') state = tf.reshape(state, shape=self.state_size) done.set_shape(()) else: if self.random_initialization: state, reward, done, qvel, qpos = self._step(action) else: state, reward, done = self._step(action) return state, reward, done, 0., qvel, qpos def reset(self, qpos=None, qvel=None): self.t = 0 self.state = self.gym.reset() if self.obs_mode == 'pixel': self.state = cv2.resize(self.gym.render('rgb_array'), dsize=(64, 64), interpolation=cv2.INTER_CUBIC) if self.random_initialization and qpos is not None and qvel is not None and hasattr( self.gym, 'env') and hasattr(self.gym.env, 'set_state'): self.gym.env.set_state(qpos, qvel) return np.float32(self.state) def get_status(self): return self.done def get_state(self): return self.state def render(self, mode='human'): img = self.gym.render(mode=mode) return img def _connect(self): if self.obs_mode == 'pixel': self.state_size = (64, 64, 3) else: if isinstance(self.gym.observation_space, spaces.Box): self.state_size = self.gym.observation_space.shape else: self.state_size = (self.gym.observation_space.n, ) if isinstance(self.gym.action_space, 
spaces.Box): self.action_size = self.gym.action_space.shape[0] else: self.action_size = self.gym.action_space.n self.action_space = np.asarray([None] * self.action_size) if hasattr(self.gym, 'env') and hasattr(self.gym.env, 'data'): self.qpos_size = self.gym.env.data.qpos.shape[0] self.qvel_size = self.gym.env.data.qvel.shape[0] else: self.qpos_size = 0 self.qvel_size = 0 def set_seed(self): tf.set_random_seed(self.seed) random.seed(self.seed) self.gym.seed(self.seed) np.random.seed(self.seed) def _train_params(self): self.seed = 0 if self.name == 'Hopper-v2': self.expert_data = 'expert_trajectories/hopper_er.bin' elif self.name in [ 'Ant-v2', 'CartPole-v0', 'GridWorldGym-v0', 'HalfCheetah-v2', 'Swimmer-v2', 'Pendulum-v0' ]: self.expert_data = 'expert_data/{}_expert_{}.bin'.format( self.obs_mode, self.name) elif self.name == 'PointMazeRight-v0': self.expert_data = 'expert_data/{}_expert_{}.bin'.format( self.obs_mode, 'PointMazeLeft-v0') elif self.name == 'DisabledAnt-v0': self.expert_data = 'expert_data/{}_expert_{}.bin'.format( self.obs_mode, 'CustomAnt-v0') elif self.name in ['PointMazeLeft-v0', 'CustomAnt-v0']: self.expert_data = 'packages/gail_expert/{}_expert_{}.bin'.format( self.obs_mode, self.name) elif self.name in ['UR5_Reacher']: self.expert_data = 'packages/gail_expert/{}_expert_{}.bin'.format( self.obs_mode, self.name) else: raise NotImplementedError('Env {} is not implemented.'.format( self.name)) if not self.train_mode: self.trained_model = 'snapshots/20200705225434_Ant-v2_train_mairlImit_s_100/2020-07-06-07-20-175000.sn' # Test episode number: self.n_train_iters / self.test_interval * self.n_episodes_test self.n_train_iters = 1 self.test_interval = 1 self.n_episodes_test = 10 else: if self.alg == 'mairlTransfer': self.trained_model = 'snapshots/20200804190406_PointMazeLeft-v0_train_mairlImit4Transfer_s_10_False_False_False/2020-08-05-11-01-720000.sn' else: self.trained_model = None self.n_train_iters = 1000000 self.test_interval = 1000 self.n_episodes_test = 1 if self.name in ['GridWorldGym-v0']: self.n_steps_test = self.gym.spec.max_episode_steps # 20 else: self.n_steps_test = 1000 self.vis_flag = False self.save_models = True if self.name in ['GridWorldGym-v0', 'MountainCar-v0', 'CartPole-v0']: self.continuous_actions = False else: self.continuous_actions = True self.airl_entropy_weight = 1.0 if self.alg in ['mairlImit4Transfer', 'mairlTransfer']: self.use_airl = True self.disc_out_dim = 1 self.phi_size = None # [200, 100] self.forward_model_type = 'gru' self.state_only = True # False elif self.alg in ['mairlImit']: self.use_airl = True self.disc_out_dim = 1 self.phi_size = None # [200, 100] self.forward_model_type = 'transformer' # 'transformer' # 'gru' self.state_only = False else: self.use_airl = False self.disc_out_dim = 2 self.phi_size = None # [200, 100] self.forward_model_type = 'gru' self.state_only = False # Main parameters to play with: self.er_agent_size = 50000 self.collect_experience_interval = 15 self.n_steps_train = 10 if self.state_only: if self.name in ['PointMazeLeft-v0', 'CustomAnt-v0']: self.discr_policy_itrvl = 10 else: self.discr_policy_itrvl = 100 self.prep_time = 0 self.save_best_ckpt = False else: self.discr_policy_itrvl = 100 self.prep_time = 1000 self.save_best_ckpt = True if self.forward_model_type == 'transformer': self.use_scale_dot_product = True self.use_skip_connection = True self.use_dropout = False else: self.use_scale_dot_product = False self.use_skip_connection = False self.use_dropout = False self.gamma = 0.99 self.batch_size = 512 # 
70 self.weight_decay = 1e-7 self.policy_al_w = 1e-2 self.policy_tr_w = 1e-4 self.policy_accum_steps = 7 self.total_trans_err_allowed = 1000 self.temp = 1. self.cost_sensitive_weight = 0.8 self.noise_intensity = 6. self.do_keep_prob = 0.75 self.forward_model_lambda = 0. # 0.1 # Hidden layers size self.fm_size = 100 self.d_size = [200, 100] self.p_size = [100, 50] self.encoder_feat_size = 1024 # (30,) # Learning rates self.fm_lr = 1e-4 self.d_lr = 1e-3 self.p_lr = 1e-4 # Log self.exp_name = '{}_{}_{}_{}_{}_{}_{}_{}_{}'.format( time.strftime("%Y%m%d%H%M%S", time.localtime()), self.name, 'train' if self.train_mode else 'eval', self.alg, 's' if self.state_only else 'sa', self.discr_policy_itrvl, self.use_scale_dot_product, self.use_skip_connection, self.use_dropout) self.config_dir = os.path.join(self.run_dir, 'snapshots', self.exp_name) self.log_intervel = 100 self.save_video = True if not os.path.isdir(self.config_dir): os.makedirs(self.config_dir) with open(os.path.join(self.config_dir, 'log.txt'), 'a') as f: f.write("{0}: {1}\n".format('seed', self.seed)) f.write("{0}: {1}\n".format('name', self.name)) f.write("{0}: {1}\n".format('expert_data', self.expert_data)) f.write("{0}: {1}\n".format('train_mode', self.train_mode)) f.write("{0}: {1}\n".format('trained_model', self.trained_model)) f.write("{0}: {1}\n".format('n_train_iters', self.n_train_iters)) f.write("{0}: {1}\n".format('test_interval', self.test_interval)) f.write("{0}: {1}\n".format('n_episodes_test', self.n_episodes_test)) f.write("{0}: {1}\n".format('alg', self.alg)) f.write("{0}: {1}\n".format('n_steps_test', self.n_steps_test)) f.write("{0}: {1}\n".format('vis_flag', self.vis_flag)) f.write("{0}: {1}\n".format('save_models', self.save_models)) f.write("{0}: {1}\n".format('continuous_actions', self.continuous_actions)) f.write("{0}: {1}\n".format('airl_entropy_weight', self.airl_entropy_weight)) f.write("{0}: {1}\n".format('use_airl', self.use_airl)) f.write("{0}: {1}\n".format('disc_out_dim', self.disc_out_dim)) f.write("{0}: {1}\n".format('phi_size', self.phi_size)) f.write("{0}: {1}\n".format('forward_model_type', self.forward_model_type)) f.write("{0}: {1}\n".format('state_only', self.state_only)) f.write("{0}: {1}\n".format('er_agent_size', self.er_agent_size)) f.write("{0}: {1}\n".format('collect_experience_interval', self.collect_experience_interval)) f.write("{0}: {1}\n".format('n_steps_train', self.n_steps_train)) f.write("{0}: {1}\n".format('discr_policy_itrvl', self.discr_policy_itrvl)) f.write("{0}: {1}\n".format('prep_time', self.prep_time)) f.write("{0}: {1}\n".format('gamma', self.gamma)) f.write("{0}: {1}\n".format('batch_size', self.batch_size)) f.write("{0}: {1}\n".format('weight_decay', self.weight_decay)) f.write("{0}: {1}\n".format('policy_al_w', self.policy_al_w)) f.write("{0}: {1}\n".format('policy_tr_w', self.policy_tr_w)) f.write("{0}: {1}\n".format('policy_accum_steps', self.policy_accum_steps)) f.write("{0}: {1}\n".format('total_trans_err_allowed', self.total_trans_err_allowed)) f.write("{0}: {1}\n".format('temp', self.temp)) f.write("{0}: {1}\n".format('cost_sensitive_weight', self.cost_sensitive_weight)) f.write("{0}: {1}\n".format('noise_intensity', self.noise_intensity)) f.write("{0}: {1}\n".format('do_keep_prob', self.do_keep_prob)) f.write("{0}: {1}\n".format('forward_model_lambda', self.forward_model_lambda)) f.write("{0}: {1}\n".format('fm_size', self.fm_size)) f.write("{0}: {1}\n".format('d_size', self.d_size)) f.write("{0}: {1}\n".format('p_size', self.p_size)) f.write("{0}: 
{1}\n".format('fm_lr', self.fm_lr)) f.write("{0}: {1}\n".format('d_lr', self.d_lr)) f.write("{0}: {1}\n".format('p_lr', self.p_lr)) f.write("{0}: {1}\n".format('exp_name', self.exp_name)) f.write("{0}: {1}\n".format('config_dir', self.config_dir)) f.write("{0}: {1}\n".format('log_intervel', self.log_intervel)) f.write("{0}: {1}\n".format('save_video', self.save_video)) f.write("{0}: {1}\n".format('save_best_ckpt', self.save_best_ckpt)) f.write("{0}: {1}\n".format('obs_mode', self.obs_mode)) f.write("{0}: {1}\n".format('use_scale_dot_product', self.use_scale_dot_product)) f.write("{0}: {1}\n".format('use_skip_connection', self.use_skip_connection)) f.write("{0}: {1}\n".format('use_dropout', self.use_dropout))
def main(): # use fixed random state rand_state = np.random.RandomState(1).get_state() np.random.set_state(rand_state) # Create UR10 Reacher2D environment env = ReacherEnvWithRealSense(setup="UR10_default", camera_hosts=('localhost', ), camera_ports=(5000, ), camera_res=(3, 480, 640), host=None, dof=2, control_type="velocity", target_type="position", reset_type="zero", reward_type="precision", derivative_type="none", deriv_action_max=5, first_deriv_max=2, accel_max=1.4, speed_max=1.0, speedj_a=1.4, episode_length_time=4.0, episode_length_step=None, actuation_sync_period=1, dt=0.5, run_mode="multiprocess", rllab_box=False, movej_t=2.0, delay=0.0, random_state=rand_state) env = NormalizedEnv(env) # Start environment processes env.start() # Create and start plotting process plot_running = Value('i', 1) shared_returns = Manager().dict({ "write_lock": False, "episodic_returns": [], "episodic_lengths": [], }) # Spawn plotting process pp = Process(target=plot_ur10_reacher, args=(env, 2048, shared_returns, plot_running)) pp.start() # Create callback function for logging data from baselines TRPO learn kindred_callback = create_callback(shared_returns) # env.action_space.shape # Train baselines TRPO for episode in range(10): print(f"Episode: {episode + 1}") done = False timestep = 0 curr_obs = env.reset() while not done: if timestep % 3 == 0: action = np.random.normal(scale=0.1, size=(2, )) print(action) next_obs, reward, done, _ = env.step(action) timestep += 1 curr_obs = next_obs if timestep == 15: done = True # Safely terminate plotter process plot_running.value = 0 # shutdown ploting process time.sleep(2) pp.join() env.close()
def main():
    # use fixed random state
    rand_state = np.random.RandomState(1).get_state()
    np.random.set_state(rand_state)
    tf_set_seeds(np.random.randint(1, 2**31 - 1))

    # Create Sawyer reacher environment
    kwargs = {}  # extra SawyerReachXYZEnv keyword arguments, if any
    env = SawyerReachXYZEnv(target_goal=(0, 0, 0),
                            indicator_threshold=.05,
                            reward_type='hand_distance',
                            action_mode='torque',
                            use_safety_box=True,
                            torque_action_scale=1,
                            **kwargs)
    env = NormalizedEnv(env)

    # Start environment processes
    env.start()

    # Create baselines TRPO policy function
    sess = U.single_threaded_session()
    sess.__enter__()

    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                         hid_size=64, num_hid_layers=2)

    # Create and start plotting process
    plot_running = Value('i', 1)
    shared_returns = Manager().dict({
        "write_lock": False,
        "episodic_returns": [],
        "episodic_lengths": [],
    })

    # Spawn plotting process
    pp = Process(target=plot_ur5_reacher,
                 args=(env, 2048, shared_returns, plot_running))
    pp.start()

    # Create callback function for logging data from baselines TRPO learn
    kindred_callback = create_callback(shared_returns)

    # Train baselines TRPO
    learn(env, policy_fn,
          max_timesteps=200000,
          timesteps_per_batch=2048,
          max_kl=0.05,
          cg_iters=10,
          cg_damping=0.1,
          vf_iters=5,
          vf_stepsize=0.001,
          gamma=0.995,
          lam=0.995,
          callback=kindred_callback)

    # Safely terminate plotter process
    plot_running.value = 0  # shut down plotting process
    time.sleep(2)
    pp.join()
    env.close()
def main(): # optionally use a pretrained model save_model_path = None load_model_path = None load_trained_model = False hidden_sizes = (64, 64, 64) if len(sys.argv) > 2:# load model load_trained_model = True save_model_path = sys.argv[1] # saved/uniform/X/Y/Z/ os.makedirs(save_model_path, exist_ok=True) run_dirs = os.listdir(save_model_path) os.makedirs(save_model_path+'run_'+str(len(run_dirs)+1), exist_ok=True) os.makedirs(save_model_path+'run_'+str(len(run_dirs)+1)+'/models', exist_ok=True) os.makedirs(save_model_path+'run_'+str(len(run_dirs)+1)+'/data', exist_ok=True) save_model_basepath = save_model_path+'run_'+str(len(run_dirs)+1)+'/' if load_trained_model:# loading true load_model_path = sys.argv[2] # saved/uniform/X/Y/Z/run_1/model* # use fixed random state #rand_state = np.random.RandomState(1).get_state() #np.random.set_state(rand_state) tf_set_seeds(np.random.randint(1, 2**31 - 1)) # Create the Create2 docker environment # distro = np.array([0, 1, 0, 0, 0, 0, 0, 0, 0]) # distro = np.array([0.575, 0.425, 0, 0, 0, 0, 0, 0, 0]) # distro = np.array([0.25, 0.2, 0.55, 0, 0, 0, 0, 0, 0]) # distro = np.array([0.1, 0.1, 0.25, 0.55, 0, 0, 0, 0, 0]) # FAILED: distro = np.array([0.05, 0.05, 0.15, 0.275, 0, 0, 0, 0.475, 0]) # FAILED: distro = np.array([0.05, 0.05, 0.15, 0.275, 0, 0.475, 0, 0, 0]) # FAILED: distro = np.array([0.10, 0.05, 0.10, 0.35, 0, 0.4, 0, 0, 0]) #distro = np.array([0.10, 0.05, 0.10, 0.375, 0.375, 0, 0, 0, 0]) #run 1 # distro = np.array([0.06, 0.03, 0.06, 0.425, 0.425, 0, 0, 0, 0]) # run2 # distro = np.array([0.025, 0.05, 0.05, 0.25, 0.25, 0.375, 0, 0, 0]) # part 1, first 100 episodes # OK: distro = np.array([0.05, 0.025, 0.05, 0.225, 0.225, 0.425, 0, 0, 0]) #distro = np.array([0.025, 0.02, 0.025, 0.1375, 0.1375, 0.3275, 0.3275, 0, 0]) # FAILED: distro = np.array([0.015, 0.015, 0.02, 0.06, 0.09, 0.35, 0.45, 0, 0]) distro = np.array([0, 1, 0, 0, 0, 0, 0, 0, 0]) env = Create2DockerEnv(30, distro, port='/dev/ttyUSB0', ir_window=20, ir_history=1, obs_history=1, dt=0.045) #random_state=rand_state) env = NormalizedEnv(env) # Start environment processes env.start() # Create baselines TRPO policy function sess = U.single_threaded_session() sess.__enter__() def policy_fn(name, ob_space, ac_space): return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space, hid_size=hidden_sizes[0], num_hid_layers=len(hidden_sizes)) # Create and start plotting process #plot_running = Value('i', 1) shared_returns = Manager().dict({ "write_lock": False, "episodic_returns": [], "episodic_lengths": [], "episodic_ss": [], }) # Spawn plotting process #pp = Process(target=plot_create2_docker, args=(env, 2048, shared_returns, plot_running)) #pp.start() # Create callback function for logging data from baselines TRPO learn kindred_callback = create_callback(shared_returns, save_model_basepath, load_model_path) # Train baselines PPO model = learn( env, policy_fn, max_timesteps=100000, timesteps_per_actorbatch=675,#512 clip_param=0.2, entcoeff=0.0, optim_epochs=10, optim_stepsize=0.00005, optim_batchsize=16, gamma=0.96836, lam=0.99944, schedule="linear", callback=kindred_callback, ) # Safely terminate plotter process #plot_running.value = 0 # shutdown ploting process #time.sleep(2) #pp.join() env.close()
def main(cycle_time, idn, baud, port_str, batch_size, mini_batch_div,
         epoch_count, gamma, l, max_action, outdir, ep_time, index):
    """
    :param cycle_time: sense-act cycle time
    :param idn: dynamixel motor id
    :param baud: dynamixel baud
    :param batch_size: how many samples to record for each learning update
    :param mini_batch_div: how many samples to sample from each batch
    :param epoch_count: number of epochs to train each batch on. Is this the number of mini-batches?
    :param gamma: usual discount value
    :param l: lambda value for lambda returns.

    In the original paper PPO runs N agents, each collecting T samples. I need
    to think about how environment resets are going to work. To calculate
    things correctly we'd technically need to run out the episodes to
    termination. How should we handle termination? We might want to have a max
    number of steps. In our setting we're going to be following a sine wave, so
    I don't see any need to terminate; we don't need to run this in an episodic
    fashion, we'll do a continuing task. We'll collect a total of T samples and
    then do an update.

    I think I will implement the environment as a gym environment just to
    permit some interoperability. If there were an env that had a terminal,
    then we would just track that terminal, reset the env, and carry on
    collecting. Hmm, actually I'm not sure how to think about this as a gym
    env. SenseAct uses this RTRLBaseEnv, but I'm not sure I want to do that.

    The changes listed from REINFORCE:
    1. Drop γ^t from the update, but not from G_t
    2. Batch updates
    3. Multiple epochs over the same batch
    4. Mini-batch updates
    5. Surrogate objective: - π_θ/π_θ_{old} * G_t
    6. Add baseline
    7. Use λ-return: either use the real lambda returns or use generalized
       advantage estimation like they do in the paper.
    8. Normalize the advantage estimates: H = G^λ - v
    9. Proximity constraint: ρ = π_θ/π_θ_{old},
       objective: -min[ρH, clip(ρ, 1-ε, 1+ε)H]
       (see the clipped-loss sketch after this function)

    Also, there is the value function loss and there is an entropy bonus given.
""" #set low latency for usb-serial communications # bashCommand = "setserial /dev/ttyUSB0 low_latency" # process = subprocess.Popen(bashCommand.split(), stdout=subprocess.PIPE) # output, error = process.communicate() #bashCommand = "cat /sys/bus/usb-serial/devices/ttyUSB0/latency_timer" #process = subprocess.Popen(bashCommand.split(), stdout=subprocess.PIPE) #output, error = process.communicate() # print(output) tag = f"{time.time()}" summaries_dir = f"./summaries/{tag}" returns_dir = "./returns" networks_dir = "./networks" if outdir: summaries_dir = os.path.join(outdir, f"summaries/{tag}") returns_dir = os.path.join(outdir, "returns") networks_dir = os.path.join(outdir, "networks") os.makedirs(summaries_dir, exist_ok=True) os.makedirs(returns_dir, exist_ok=True) os.makedirs(networks_dir, exist_ok=True) summary_writer = SummaryWriter(log_dir=summaries_dir) #env = ReacherEnv(cycle_time, ep_time, dxl.get_driver(False), idn, port_str, baud, max_action,'tourq') # env = ReacherEnv(setup='UR5_default', # host='129.128.159.210', # dof=2, # control_type='position', # derivative_type='none', # target_type='position', # reset_type='random', # reward_type='linear', # deriv_action_max=10, # first_deriv_max=10, # vel_penalty=0, # obs_history=1, # actuation_sync_period=1, # episode_length_time=4.0, # episode_length_step=None, # rllab_box = False, # servoj_t=ur_utils.COMMANDS['SERVOJ']['default']['t'], # servoj_gain=ur_utils.COMMANDS['SERVOJ']['default']['gain'], # speedj_a=ur_utils.COMMANDS['SPEEDJ']['default']['a'], # speedj_t_min=ur_utils.COMMANDS['SPEEDJ']['default']['t_min'], # movej_t=2, # accel_max=None, # speed_max=None, # dt=0.008, # delay=0.0) #DO WE NEED RANDOM STATE Variable?? rand_state = np.random.RandomState(1).get_state() host_ip = '169.254.39.68' env = ReacherEnv(setup="UR5_default", host=host_ip, dof=2, control_type="velocity", target_type="position", reset_type="zero", reward_type="precision", derivative_type="none", deriv_action_max=5, first_deriv_max=2, accel_max=1.4, speed_max=0.3, speedj_a=1.4, episode_length_time=4.0, episode_length_step=None, actuation_sync_period=1, dt=0.04, run_mode="singlethread", rllab_box=False, movej_t=2.0, delay=0.0, random_state=rand_state) #print('done') env = NormalizedEnv(env) env.start() #print("starting") # obs = env.reset() # print('resetted', obs) # env.step(action=np.array([0,0]) # print('a') # time.sleep(10) #env = gym.make('MountainCarContinuous-v0') obs_len = env.observation_space.shape[0] print(env.action_space.shape) device = torch.device("cuda" if torch.cuda.is_available() else "cpu") ppo_network = PPONetwork( action_space=env.action_space, in_size=env.observation_space.shape[0]) # TODO: create your network ppo_network.to(device) # instanciate value_network value_network = nn.Sequential(nn.Linear(obs_len, 50), nn.Sigmoid(), nn.Linear(50, 1)) value_network.to(device) # instanciate the agent agent = PPO(device=device, network=ppo_network, state_size=obs_len, batch_size=batch_size, mini_batch_div=mini_batch_div, epoch_count=epoch_count, gamma=gamma, l=l, eps=0.2, summary_writer=summary_writer, value_network=value_network) # TODO: implement your main loop here. 
    # You will want to collect batches of transitions.

    # total number of timesteps
    t = 0
    # total_steps = 1000
    # timestep_per_episode = 200
    n_batch = 36
    undiscounted_return = np.zeros((n_batch, batch_size))

    # do learning for a number of total timesteps
    for b in range(n_batch):  # total_steps // timestep_per_episode
        print(b)
        # gather a batch of episodes
        for ep in range(batch_size):
            # reset the env before each episode
            observation = env.reset()
            reward = 0
            n = 0
            # gather one episode
            while True:
                # if b > 90:
                #     env.render()
                action = agent.step(state=observation, r=reward, t=n)
                action = action * max_action
                observation, reward, done, info = env.step(action)  # take the action
                # print(observation)
                undiscounted_return[b, ep] = undiscounted_return[b, ep] + reward
                # print(action, reward)
                t = t + 1
                n = n + 1
                if done:
                    break
            # end of one episode

        # learning using the batch of data
        summary_writer.add_scalar('return', np.mean(undiscounted_return[b, :]), 2048 * b)
        # env.stop()
        agent.learn(t=t)
        agent.reset_buffers()
        t = 0

    env.close()

    # plotting results
    undiscounted_return_avg = np.mean(undiscounted_return, axis=1)
    np.save('ep_returns_{}'.format(index), undiscounted_return_avg)
    plt.plot(undiscounted_return_avg)
    plt.show()
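# --- Clipped surrogate loss sketch --------------------------------------------
# Illustrative only: this mirrors items 8 and 9 of the docstring above
# (ρ = π_θ/π_θ_old, objective -min[ρH, clip(ρ, 1-ε, 1+ε)H] with normalized
# advantages H), not necessarily the exact loss implemented inside the PPO class.
import torch

def clipped_surrogate_loss(log_prob, log_prob_old, advantage, eps=0.2):
    """PPO clipped surrogate objective for a batch of transitions."""
    # Normalize the advantage estimates (item 8 in the docstring above).
    advantage = (advantage - advantage.mean()) / (advantage.std() + 1e-8)
    # rho = pi_theta / pi_theta_old, computed from log-probabilities.
    ratio = torch.exp(log_prob - log_prob_old.detach())
    unclipped = ratio * advantage
    clipped = torch.clamp(ratio, 1.0 - eps, 1.0 + eps) * advantage
    # Negative sign because we minimize the loss to maximize the objective.
    return -torch.min(unclipped, clipped).mean()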
def get_env(cfg):
    # Set seed to potentially fix random state
    seed_low = cfg['global']['seed']['low']
    seed_high = cfg['global']['seed']['high']
    if seed_low is not None:
        logging.debug('Using seed [{},{}) "half-open" interval'.format(seed_low, seed_high))
        rand_state = np.random.RandomState(seed_low).get_state()
        np.random.set_state(rand_state)
        tf_set_seeds(np.random.randint(seed_low, seed_high))
    else:
        logging.debug('Not using any seeds!')

    # Load the RL environment
    env_module = importlib.import_module(cfg['environment']['codebase']['module'])
    env_class = getattr(env_module, cfg['environment']['codebase']['class'])
    logging.debug("Environment function: {}".format(env_class))
    logging.debug("Host IP: {}".format(cfg['environment']['setup']['host']))

    # Create the environment specified by the config
    env = env_class(
        setup=cfg['environment']['setup'],
        host=cfg['environment']['setup']['host'],
        dof=cfg['environment']['parameters']['dof'],
        control_type=cfg['environment']['parameters']['control_type'],
        target_type=cfg['environment']['parameters']['target_type'],
        reset_type=cfg['environment']['parameters']['reset_type'],
        reward_type=cfg['environment']['parameters']['reward_type'],
        derivative_type=cfg['environment']['parameters']['derivative_type'],
        deriv_action_max=cfg['environment']['parameters']['deriv_action_max'],
        first_deriv_max=cfg['environment']['parameters']['first_deriv_max'],
        accel_max=cfg['environment']['parameters']['accel_max'],
        speed_max=cfg['environment']['parameters']['speed_max'],
        speedj_a=cfg['environment']['parameters']['speedj_a'],
        episode_length_time=cfg['environment']['parameters']['episode_length_time'],
        episode_length_step=cfg['environment']['parameters']['episode_length_step'],
        actuation_sync_period=cfg['environment']['parameters']['actuation_sync_period'],
        dt=cfg['environment']['parameters']['dt'],
        run_mode=cfg['environment']['parameters']['run_mode'],
        rllab_box=cfg['environment']['parameters']['rllab_box'],
        movej_t=cfg['environment']['parameters']['movej_t'],
        delay=cfg['environment']['parameters']['delay'],
        random_state=rand_state if seed_low is not None else None
    )
    env = NormalizedEnv(env)

    # Start environment processes
    env.start()

    # Create baselines TRPO policy function
    sess = U.single_threaded_session()
    sess.__enter__()

    policy_fn_module = importlib.import_module(cfg['model']['module'])
    policy_fn_class = getattr(policy_fn_module, cfg['model']['class'])
    logging.debug("Policy function: {}".format(policy_fn_class))

    def policy_fn(name, ob_space, ac_space):
        return policy_fn_class(name=name,
                               ob_space=ob_space,
                               ac_space=ac_space,
                               hid_size=cfg['algorithm']['hyperparameters']['hid_size'],
                               num_hid_layers=cfg['algorithm']['hyperparameters']['num_hid_layers'])

    return env, policy_fn
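# --- Example config (illustrative) ---------------------------------------------
# A minimal sketch of the cfg structure that get_env() reads above. The key
# names come directly from the lookups in get_env(); the values shown are
# assumptions (typical UR reacher settings from the scripts above), not
# project defaults.
example_cfg = {
    'global': {'seed': {'low': 1, 'high': 2**31 - 1}},
    'environment': {
        'codebase': {'module': 'senseact.envs.ur.reacher_env', 'class': 'ReacherEnv'},
        'setup': {'host': '192.168.1.102'},
        'parameters': {
            'dof': 6, 'control_type': 'velocity', 'target_type': 'position',
            'reset_type': 'zero', 'reward_type': 'precision',
            'derivative_type': 'none', 'deriv_action_max': 5,
            'first_deriv_max': 2, 'accel_max': 1.4, 'speed_max': 0.3,
            'speedj_a': 1.4, 'episode_length_time': 4.0,
            'episode_length_step': None, 'actuation_sync_period': 1,
            'dt': 0.04, 'run_mode': 'multiprocess', 'rllab_box': False,
            'movej_t': 2.0, 'delay': 0.0,
        },
    },
    'model': {'module': 'baselines.ppo1.mlp_policy', 'class': 'MlpPolicy'},
    'algorithm': {'hyperparameters': {'hid_size': 64, 'num_hid_layers': 2}},
}
# env, policy_fn = get_env(example_cfg)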