def main():
    # Create the Create2 mover environment
    env = Create2MoverEnv(90, port='/dev/ttyUSB0', obs_history=1, dt=0.15)
    env = NormalizedEnv(env)

    # Start environment processes
    env.start()

    # Create baselines TRPO policy function
    sess = U.single_threaded_session()
    sess.__enter__()

    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                         hid_size=32, num_hid_layers=2)

    # Create and start plotting process
    plot_running = Value('i', 1)
    shared_returns = Manager().dict({
        "write_lock": False,
        "episodic_returns": [],
        "episodic_lengths": [],
    })

    # Spawn plotting process
    pp = Process(target=plot_create2_mover, args=(env, 2048, shared_returns, plot_running))
    pp.start()

    # Create callback function for logging data from baselines TRPO learn
    kindred_callback = create_callback(shared_returns)

    # Train baselines TRPO
    learn(env, policy_fn,
          max_timesteps=40000,
          timesteps_per_batch=2048,
          max_kl=0.05,
          cg_iters=10,
          cg_damping=0.1,
          vf_iters=5,
          vf_stepsize=0.001,
          gamma=0.995,
          lam=0.995,
          callback=kindred_callback)

    # Safely terminate plotter process
    plot_running.value = 0  # shut down plotting process
    time.sleep(2)
    pp.join()

    env.close()
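# A hedged sketch of the imports and entry point this script (and the similar ones below)
# appears to rely on. The module paths follow the usual OpenAI Baselines and SenseAct layouts
# and are assumptions, as are the project-local helpers; they may differ in this repository.

import sys
import time
import pickle as pkl
import numpy as np
import baselines.common.tf_util as U

from multiprocessing import Process, Value, Manager
from baselines.ppo1.mlp_policy import MlpPolicy
from baselines.trpo_mpi.trpo_mpi import learn
from senseact.envs.create2.create2_mover_env import Create2MoverEnv
from senseact.utils import tf_set_seeds, NormalizedEnv
from helper import create_callback, plot_create2_mover  # project-local helpers (assumed)

if __name__ == '__main__':
    main()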
def main():
    # Use fixed random state
    rand_state = np.random.RandomState(1).get_state()
    np.random.set_state(rand_state)
    tf_set_seeds(np.random.randint(1, 2**31 - 1))

    # Create DXL Reacher1D environment
    env = DxlReacher1DEnv(setup='dxl_gripper_default',
                          idn=1,
                          baudrate=1000000,
                          obs_history=1,
                          dt=0.04,
                          gripper_dt=0.01,
                          rllab_box=False,
                          episode_length_step=None,
                          episode_length_time=2,
                          max_torque_mag=100,
                          control_type='torque',
                          target_type='position',
                          reset_type='zero',
                          reward_type='linear',
                          use_ctypes_driver=True,
                          random_state=rand_state)

    # The outputs of the policy function are sampled from a Gaussian. However, the actions in terms
    # of torque commands are in the range [-max_torque_mag, max_torque_mag]. The NormalizedEnv
    # wrapper scales actions accordingly. By default, it does not normalize observations or rewards.
    env = NormalizedEnv(env)

    # Start environment processes
    env.start()

    # Create baselines TRPO policy function
    sess = U.single_threaded_session()
    sess.__enter__()

    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                         hid_size=32, num_hid_layers=2)

    # Create and start plotting process
    plot_running = Value('i', 1)
    shared_returns = Manager().dict({
        "write_lock": False,
        "episodic_returns": [],
        "episodic_lengths": [],
    })

    # Plotting process
    pp = Process(target=plot_dxl_reacher, args=(env, 2048, shared_returns, plot_running))
    pp.start()

    # Create callback function for logging data from baselines TRPO learn
    kindred_callback = create_callback(shared_returns)

    # Train baselines TRPO
    learn(env, policy_fn,
          max_timesteps=50000,
          timesteps_per_batch=2048,
          max_kl=0.05,
          cg_iters=10,
          cg_damping=0.1,
          vf_iters=5,
          vf_stepsize=0.001,
          gamma=0.995,
          lam=0.995,
          callback=kindred_callback)

    # Safely terminate plotter process
    plot_running.value = 0  # shut down plotting process
    time.sleep(2)
    pp.join()

    # Shut down the environment
    env.close()
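# As the comment above notes, NormalizedEnv rescales the policy's roughly [-1, 1] Gaussian outputs
# into the environment's torque range. A minimal sketch of that kind of affine rescaling, assuming
# a symmetric [-1, 1] policy range; the wrapper's actual implementation lives in senseact.utils and
# may differ in detail.

import numpy as np

def scale_action(action, low, high):
    # Map an action from [-1, 1] into the environment's [low, high] bounds.
    action = np.clip(action, -1.0, 1.0)
    return low + 0.5 * (action + 1.0) * (high - low)

# Example: with max_torque_mag=100, a policy output of 0.5 becomes a torque command of 50,
# i.e. scale_action(0.5, -100, 100) == 50.0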
def main():
    # Optionally use a pretrained model
    load_model_data = None
    hidden_sizes = (32, 32)
    if len(sys.argv) > 1:
        load_model_path = sys.argv[1]
        load_model_data = pkl.load(open(load_model_path, 'rb'))
        hidden_sizes = load_model_data['hidden_sizes']

    # Use fixed random state
    rand_state = np.random.RandomState(1).get_state()
    np.random.set_state(rand_state)
    tf_set_seeds(np.random.randint(1, 2**31 - 1))

    # Create the Create2 mover environment
    env = Create2MoverEnv(90, port='/dev/ttyUSB0', obs_history=1, dt=0.15, random_state=rand_state)
    env = NormalizedEnv(env)

    # Start environment processes
    env.start()

    # Create baselines TRPO policy function
    sess = U.single_threaded_session()
    sess.__enter__()

    def policy_fn(name, ob_space, ac_space):
        # MlpPolicy takes a single hid_size, so all hidden layers use the width of hidden_sizes[0].
        return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                         hid_size=hidden_sizes[0], num_hid_layers=len(hidden_sizes))

    # Create and start plotting process
    plot_running = Value('i', 1)
    shared_returns = Manager().dict({
        "write_lock": False,
        "episodic_returns": [],
        "episodic_lengths": [],
    })

    # Spawn plotting process
    pp = Process(target=plot_create2_mover, args=(env, 2048, shared_returns, plot_running))
    pp.start()

    # Create callback function for logging data from baselines TRPO learn
    kindred_callback = create_callback(shared_returns, load_model_data)

    # Train baselines TRPO
    learn(env, policy_fn,
          max_timesteps=40000,
          timesteps_per_batch=2048,
          max_kl=0.05,
          cg_iters=10,
          cg_damping=0.1,
          vf_iters=5,
          vf_stepsize=0.001,
          gamma=0.995,
          lam=0.995,
          callback=kindred_callback)

    # Safely terminate plotter process
    plot_running.value = 0  # shut down plotting process
    time.sleep(2)
    pp.join()

    env.close()
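# The create_callback helper is project-specific and not shown here. As a heavily hedged sketch of
# the general shape such a callback has with baselines' learn() (which calls callback(locals(),
# globals()) once per iteration): the handling of load_model_data for restoring pretrained weights
# is omitted, and the 'seg', 'ep_rets', 'ep_lens' key names are assumptions about baselines' locals.

def create_callback(shared_returns, load_model_data=None):
    def kindred_callback(locals_, globals_):
        # 'seg' holds the most recently collected rollout segment, including
        # per-episode returns ('ep_rets') and lengths ('ep_lens').
        if 'seg' in locals_ and shared_returns is not None:
            shared_returns['write_lock'] = True
            shared_returns['episodic_returns'] = (shared_returns['episodic_returns']
                                                  + list(locals_['seg']['ep_rets']))
            shared_returns['episodic_lengths'] = (shared_returns['episodic_lengths']
                                                  + list(locals_['seg']['ep_lens']))
            shared_returns['write_lock'] = False
    return kindred_callback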
class Environment(object):
    def __init__(self, run_dir, env_name, alg='mairlImit', train_mode=False, obs_mode='pixel'):
        """
        :param run_dir:
        :param env_name:
        :param alg: 'mairlImit', 'mairlImit4Transfer', 'mairlTransfer', 'mgail'
        :param obs_mode: 'pixel', 'state'
        """
        self.run_dir = run_dir
        self.name = env_name
        self.alg = alg
        self.obs_mode = obs_mode
        assert self.alg in [
            'mairlImit', 'mairlImit4Transfer', 'mairlTransfer', 'mgail'
        ], '{} is not implemented!'.format(self.alg)
        self.train_mode = train_mode
        if env_name in ['UR5_Reacher']:
            rand_state = np.random.RandomState(1).get_state()
            env = ReacherEnv(setup="UR5_6dof",
                             host="192.168.1.102",
                             dof=6,
                             control_type="velocity",
                             target_type="position",
                             reset_type="zero",
                             reward_type="precision",
                             derivative_type="none",
                             deriv_action_max=5,
                             first_deriv_max=2,
                             accel_max=1.4,
                             speed_max=0.3,
                             speedj_a=1.4,
                             episode_length_time=4.0,
                             episode_length_step=None,
                             actuation_sync_period=1,
                             dt=0.04,
                             run_mode="multiprocess",
                             rllab_box=False,
                             movej_t=2.0,
                             delay=0.0,
                             random_state=rand_state)
            self.gym = NormalizedEnv(env)
            self.gym.start()
        else:
            self.gym = gym.make(self.name)
        self.random_initialization = True
        self._connect()
        self._train_params()
        self.set_seed()

    def _step(self, action):
        action = np.squeeze(action)
        if action.shape == ():
            action = np.expand_dims(action, axis=0)  # or use: action = [action]
        self.t += 1
        if isinstance(self.gym.action_space, spaces.Discrete):
            action = int(action)
        result = self.gym.step(action)
        self.state, self.reward, self.done, self.info = result[:4]
        if self.obs_mode == 'pixel':
            self.state = cv2.resize(self.gym.render('rgb_array'),
                                    dsize=(64, 64),
                                    interpolation=cv2.INTER_AREA)
        if self.random_initialization:
            if hasattr(self.gym, 'env') and hasattr(self.gym.env, 'data'):
                self.qpos = self.gym.env.data.qpos.flatten()
                self.qvel = self.gym.env.data.qvel.flatten()
            else:
                self.qpos, self.qvel = [], []
            return (np.float32(self.state), np.float32(self.reward), self.done,
                    np.float32(self.qpos), np.float32(self.qvel))
        else:
            return np.float32(self.state), np.float32(self.reward), self.done

    def step(self, action, mode):
        qvel, qpos = [], []
        if mode == 'tensorflow':
            if self.random_initialization:
                state, reward, done, qvel, qpos = tf.py_func(
                    self._step,
                    inp=[action],
                    Tout=[tf.float32, tf.float32, tf.bool, tf.float32, tf.float32],
                    name='env_step_func')
            else:
                state, reward, done = tf.py_func(
                    self._step,
                    inp=[action],
                    Tout=[tf.float32, tf.float32, tf.bool],
                    name='env_step_func')
            state = tf.reshape(state, shape=self.state_size)
            done.set_shape(())
        else:
            if self.random_initialization:
                state, reward, done, qvel, qpos = self._step(action)
            else:
                state, reward, done = self._step(action)
        return state, reward, done, 0., qvel, qpos

    def reset(self, qpos=None, qvel=None):
        self.t = 0
        self.state = self.gym.reset()
        if self.obs_mode == 'pixel':
            self.state = cv2.resize(self.gym.render('rgb_array'),
                                    dsize=(64, 64),
                                    interpolation=cv2.INTER_CUBIC)
        if (self.random_initialization and qpos is not None and qvel is not None
                and hasattr(self.gym, 'env') and hasattr(self.gym.env, 'set_state')):
            self.gym.env.set_state(qpos, qvel)
        return np.float32(self.state)

    def get_status(self):
        return self.done

    def get_state(self):
        return self.state

    def render(self, mode='human'):
        img = self.gym.render(mode=mode)
        return img

    def _connect(self):
        if self.obs_mode == 'pixel':
            self.state_size = (64, 64, 3)
        else:
            if isinstance(self.gym.observation_space, spaces.Box):
                self.state_size = self.gym.observation_space.shape
            else:
                self.state_size = (self.gym.observation_space.n, )
        if isinstance(self.gym.action_space, spaces.Box):
            self.action_size = self.gym.action_space.shape[0]
        else:
            self.action_size = self.gym.action_space.n
        self.action_space = np.asarray([None] * self.action_size)
        if hasattr(self.gym, 'env') and hasattr(self.gym.env, 'data'):
            self.qpos_size = self.gym.env.data.qpos.shape[0]
            self.qvel_size = self.gym.env.data.qvel.shape[0]
        else:
            self.qpos_size = 0
            self.qvel_size = 0

    def set_seed(self):
        tf.set_random_seed(self.seed)
        random.seed(self.seed)
        self.gym.seed(self.seed)
        np.random.seed(self.seed)

    def _train_params(self):
        self.seed = 0
        if self.name == 'Hopper-v2':
            self.expert_data = 'expert_trajectories/hopper_er.bin'
        elif self.name in [
                'Ant-v2', 'CartPole-v0', 'GridWorldGym-v0', 'HalfCheetah-v2',
                'Swimmer-v2', 'Pendulum-v0'
        ]:
            self.expert_data = 'expert_data/{}_expert_{}.bin'.format(self.obs_mode, self.name)
        elif self.name == 'PointMazeRight-v0':
            self.expert_data = 'expert_data/{}_expert_{}.bin'.format(self.obs_mode, 'PointMazeLeft-v0')
        elif self.name == 'DisabledAnt-v0':
            self.expert_data = 'expert_data/{}_expert_{}.bin'.format(self.obs_mode, 'CustomAnt-v0')
        elif self.name in ['PointMazeLeft-v0', 'CustomAnt-v0']:
            self.expert_data = 'packages/gail_expert/{}_expert_{}.bin'.format(self.obs_mode, self.name)
        elif self.name in ['UR5_Reacher']:
            self.expert_data = 'packages/gail_expert/{}_expert_{}.bin'.format(self.obs_mode, self.name)
        else:
            raise NotImplementedError('Env {} is not implemented.'.format(self.name))

        if not self.train_mode:
            self.trained_model = 'snapshots/20200705225434_Ant-v2_train_mairlImit_s_100/2020-07-06-07-20-175000.sn'
            # Test episode number: self.n_train_iters / self.test_interval * self.n_episodes_test
            self.n_train_iters = 1
            self.test_interval = 1
            self.n_episodes_test = 10
        else:
            if self.alg == 'mairlTransfer':
                self.trained_model = 'snapshots/20200804190406_PointMazeLeft-v0_train_mairlImit4Transfer_s_10_False_False_False/2020-08-05-11-01-720000.sn'
            else:
                self.trained_model = None
            self.n_train_iters = 1000000
            self.test_interval = 1000
            self.n_episodes_test = 1

        if self.name in ['GridWorldGym-v0']:
            self.n_steps_test = self.gym.spec.max_episode_steps  # 20
        else:
            self.n_steps_test = 1000
        self.vis_flag = False
        self.save_models = True
        if self.name in ['GridWorldGym-v0', 'MountainCar-v0', 'CartPole-v0']:
            self.continuous_actions = False
        else:
            self.continuous_actions = True
        self.airl_entropy_weight = 1.0

        if self.alg in ['mairlImit4Transfer', 'mairlTransfer']:
            self.use_airl = True
            self.disc_out_dim = 1
            self.phi_size = None  # [200, 100]
            self.forward_model_type = 'gru'
            self.state_only = True  # False
        elif self.alg in ['mairlImit']:
            self.use_airl = True
            self.disc_out_dim = 1
            self.phi_size = None  # [200, 100]
            self.forward_model_type = 'transformer'  # 'transformer' # 'gru'
            self.state_only = False
        else:
            self.use_airl = False
            self.disc_out_dim = 2
            self.phi_size = None  # [200, 100]
            self.forward_model_type = 'gru'
            self.state_only = False

        # Main parameters to play with:
        self.er_agent_size = 50000
        self.collect_experience_interval = 15
        self.n_steps_train = 10
        if self.state_only:
            if self.name in ['PointMazeLeft-v0', 'CustomAnt-v0']:
                self.discr_policy_itrvl = 10
            else:
                self.discr_policy_itrvl = 100
            self.prep_time = 0
            self.save_best_ckpt = False
        else:
            self.discr_policy_itrvl = 100
            self.prep_time = 1000
            self.save_best_ckpt = True

        if self.forward_model_type == 'transformer':
            self.use_scale_dot_product = True
            self.use_skip_connection = True
            self.use_dropout = False
        else:
            self.use_scale_dot_product = False
            self.use_skip_connection = False
            self.use_dropout = False

        self.gamma = 0.99
        self.batch_size = 512  # 70
        self.weight_decay = 1e-7
        self.policy_al_w = 1e-2
        self.policy_tr_w = 1e-4
        self.policy_accum_steps = 7
        self.total_trans_err_allowed = 1000
        self.temp = 1.
        self.cost_sensitive_weight = 0.8
        self.noise_intensity = 6.
        self.do_keep_prob = 0.75
        self.forward_model_lambda = 0.  # 0.1

        # Hidden layers size
        self.fm_size = 100
        self.d_size = [200, 100]
        self.p_size = [100, 50]
        self.encoder_feat_size = 1024  # (30,)

        # Learning rates
        self.fm_lr = 1e-4
        self.d_lr = 1e-3
        self.p_lr = 1e-4

        # Log
        self.exp_name = '{}_{}_{}_{}_{}_{}_{}_{}_{}'.format(
            time.strftime("%Y%m%d%H%M%S", time.localtime()), self.name,
            'train' if self.train_mode else 'eval', self.alg,
            's' if self.state_only else 'sa', self.discr_policy_itrvl,
            self.use_scale_dot_product, self.use_skip_connection,
            self.use_dropout)
        self.config_dir = os.path.join(self.run_dir, 'snapshots', self.exp_name)
        self.log_intervel = 100
        self.save_video = True
        if not os.path.isdir(self.config_dir):
            os.makedirs(self.config_dir)

        with open(os.path.join(self.config_dir, 'log.txt'), 'a') as f:
            f.write("{0}: {1}\n".format('seed', self.seed))
            f.write("{0}: {1}\n".format('name', self.name))
            f.write("{0}: {1}\n".format('expert_data', self.expert_data))
            f.write("{0}: {1}\n".format('train_mode', self.train_mode))
            f.write("{0}: {1}\n".format('trained_model', self.trained_model))
            f.write("{0}: {1}\n".format('n_train_iters', self.n_train_iters))
            f.write("{0}: {1}\n".format('test_interval', self.test_interval))
            f.write("{0}: {1}\n".format('n_episodes_test', self.n_episodes_test))
            f.write("{0}: {1}\n".format('alg', self.alg))
            f.write("{0}: {1}\n".format('n_steps_test', self.n_steps_test))
            f.write("{0}: {1}\n".format('vis_flag', self.vis_flag))
            f.write("{0}: {1}\n".format('save_models', self.save_models))
            f.write("{0}: {1}\n".format('continuous_actions', self.continuous_actions))
            f.write("{0}: {1}\n".format('airl_entropy_weight', self.airl_entropy_weight))
            f.write("{0}: {1}\n".format('use_airl', self.use_airl))
            f.write("{0}: {1}\n".format('disc_out_dim', self.disc_out_dim))
            f.write("{0}: {1}\n".format('phi_size', self.phi_size))
            f.write("{0}: {1}\n".format('forward_model_type', self.forward_model_type))
            f.write("{0}: {1}\n".format('state_only', self.state_only))
            f.write("{0}: {1}\n".format('er_agent_size', self.er_agent_size))
            f.write("{0}: {1}\n".format('collect_experience_interval', self.collect_experience_interval))
            f.write("{0}: {1}\n".format('n_steps_train', self.n_steps_train))
            f.write("{0}: {1}\n".format('discr_policy_itrvl', self.discr_policy_itrvl))
            f.write("{0}: {1}\n".format('prep_time', self.prep_time))
            f.write("{0}: {1}\n".format('gamma', self.gamma))
            f.write("{0}: {1}\n".format('batch_size', self.batch_size))
            f.write("{0}: {1}\n".format('weight_decay', self.weight_decay))
            f.write("{0}: {1}\n".format('policy_al_w', self.policy_al_w))
            f.write("{0}: {1}\n".format('policy_tr_w', self.policy_tr_w))
            f.write("{0}: {1}\n".format('policy_accum_steps', self.policy_accum_steps))
            f.write("{0}: {1}\n".format('total_trans_err_allowed', self.total_trans_err_allowed))
            f.write("{0}: {1}\n".format('temp', self.temp))
            f.write("{0}: {1}\n".format('cost_sensitive_weight', self.cost_sensitive_weight))
            f.write("{0}: {1}\n".format('noise_intensity', self.noise_intensity))
            f.write("{0}: {1}\n".format('do_keep_prob', self.do_keep_prob))
            f.write("{0}: {1}\n".format('forward_model_lambda', self.forward_model_lambda))
            f.write("{0}: {1}\n".format('fm_size', self.fm_size))
            f.write("{0}: {1}\n".format('d_size', self.d_size))
            f.write("{0}: {1}\n".format('p_size', self.p_size))
            f.write("{0}: {1}\n".format('fm_lr', self.fm_lr))
            f.write("{0}: {1}\n".format('d_lr', self.d_lr))
            f.write("{0}: {1}\n".format('p_lr', self.p_lr))
            f.write("{0}: {1}\n".format('exp_name', self.exp_name))
            f.write("{0}: {1}\n".format('config_dir', self.config_dir))
            f.write("{0}: {1}\n".format('log_intervel', self.log_intervel))
            f.write("{0}: {1}\n".format('save_video', self.save_video))
            f.write("{0}: {1}\n".format('save_best_ckpt', self.save_best_ckpt))
            f.write("{0}: {1}\n".format('obs_mode', self.obs_mode))
            f.write("{0}: {1}\n".format('use_scale_dot_product', self.use_scale_dot_product))
            f.write("{0}: {1}\n".format('use_skip_connection', self.use_skip_connection))
            f.write("{0}: {1}\n".format('use_dropout', self.use_dropout))
def main():
    # Optionally use a pretrained model
    save_model_path = None
    load_model_path = None
    load_trained_model = False
    hidden_sizes = (64, 64, 64)
    if len(sys.argv) > 2:  # load model
        load_trained_model = True

    save_model_path = sys.argv[1]  # saved/uniform/X/Y/Z/
    os.makedirs(save_model_path, exist_ok=True)
    run_dirs = os.listdir(save_model_path)
    os.makedirs(save_model_path + 'run_' + str(len(run_dirs) + 1), exist_ok=True)
    os.makedirs(save_model_path + 'run_' + str(len(run_dirs) + 1) + '/models', exist_ok=True)
    os.makedirs(save_model_path + 'run_' + str(len(run_dirs) + 1) + '/data', exist_ok=True)
    save_model_basepath = save_model_path + 'run_' + str(len(run_dirs) + 1) + '/'

    if load_trained_model:  # loading true
        load_model_path = sys.argv[2]  # saved/uniform/X/Y/Z/run_1/model*

    # Use fixed random state
    # rand_state = np.random.RandomState(1).get_state()
    # np.random.set_state(rand_state)
    tf_set_seeds(np.random.randint(1, 2**31 - 1))

    # Create the Create2 docker environment
    # Earlier distro values tried (kept for reference):
    # distro = np.array([0, 1, 0, 0, 0, 0, 0, 0, 0])
    # distro = np.array([0.575, 0.425, 0, 0, 0, 0, 0, 0, 0])
    # distro = np.array([0.25, 0.2, 0.55, 0, 0, 0, 0, 0, 0])
    # distro = np.array([0.1, 0.1, 0.25, 0.55, 0, 0, 0, 0, 0])
    # FAILED: distro = np.array([0.05, 0.05, 0.15, 0.275, 0, 0, 0, 0.475, 0])
    # FAILED: distro = np.array([0.05, 0.05, 0.15, 0.275, 0, 0.475, 0, 0, 0])
    # FAILED: distro = np.array([0.10, 0.05, 0.10, 0.35, 0, 0.4, 0, 0, 0])
    # distro = np.array([0.10, 0.05, 0.10, 0.375, 0.375, 0, 0, 0, 0])  # run 1
    # distro = np.array([0.06, 0.03, 0.06, 0.425, 0.425, 0, 0, 0, 0])  # run 2
    # distro = np.array([0.025, 0.05, 0.05, 0.25, 0.25, 0.375, 0, 0, 0])  # part 1, first 100 episodes
    # OK: distro = np.array([0.05, 0.025, 0.05, 0.225, 0.225, 0.425, 0, 0, 0])
    # distro = np.array([0.025, 0.02, 0.025, 0.1375, 0.1375, 0.3275, 0.3275, 0, 0])
    # FAILED: distro = np.array([0.015, 0.015, 0.02, 0.06, 0.09, 0.35, 0.45, 0, 0])
    distro = np.array([0, 1, 0, 0, 0, 0, 0, 0, 0])
    env = Create2DockerEnv(30, distro, port='/dev/ttyUSB0', ir_window=20,
                           ir_history=1, obs_history=1, dt=0.045)  # random_state=rand_state
    env = NormalizedEnv(env)

    # Start environment processes
    env.start()

    # Create baselines PPO policy function
    sess = U.single_threaded_session()
    sess.__enter__()

    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                         hid_size=hidden_sizes[0], num_hid_layers=len(hidden_sizes))

    # Create and start plotting process
    # plot_running = Value('i', 1)
    shared_returns = Manager().dict({
        "write_lock": False,
        "episodic_returns": [],
        "episodic_lengths": [],
        "episodic_ss": [],
    })

    # Spawn plotting process
    # pp = Process(target=plot_create2_docker, args=(env, 2048, shared_returns, plot_running))
    # pp.start()

    # Create callback function for logging data from baselines PPO learn
    kindred_callback = create_callback(shared_returns, save_model_basepath, load_model_path)

    # Train baselines PPO
    model = learn(
        env,
        policy_fn,
        max_timesteps=100000,
        timesteps_per_actorbatch=675,  # 512
        clip_param=0.2,
        entcoeff=0.0,
        optim_epochs=10,
        optim_stepsize=0.00005,
        optim_batchsize=16,
        gamma=0.96836,
        lam=0.99944,
        schedule="linear",
        callback=kindred_callback,
    )

    # Safely terminate plotter process
    # plot_running.value = 0  # shut down plotting process
    # time.sleep(2)
    # pp.join()

    env.close()
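# A hedged, pathlib-based sketch equivalent to the run-directory setup above. It avoids relying on
# save_model_path ending with a path separator; make_run_dirs is a hypothetical helper name, not
# part of the project.

from pathlib import Path

def make_run_dirs(save_model_path):
    # Create saved/.../run_<N+1>/{models,data} and return the run base path with a trailing slash,
    # mirroring the layout built with string concatenation in main().
    base = Path(save_model_path)
    base.mkdir(parents=True, exist_ok=True)
    run_dir = base / 'run_{}'.format(len(list(base.iterdir())) + 1)
    (run_dir / 'models').mkdir(parents=True, exist_ok=True)
    (run_dir / 'data').mkdir(parents=True, exist_ok=True)
    return str(run_dir) + '/'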
def get_env(cfg):
    # Set seed to potentially fix random state
    seed_low = cfg['global']['seed']['low']
    seed_high = cfg['global']['seed']['high']
    if seed_low is not None:
        logging.debug('Using seed [{},{}) "half-open" interval'.format(seed_low, seed_high))
        rand_state = np.random.RandomState(seed_low).get_state()
        np.random.set_state(rand_state)
        tf_set_seeds(np.random.randint(seed_low, seed_high))
    else:
        logging.debug('Not using any seeds!')

    # Load the RL environment
    env_module = importlib.import_module(cfg['environment']['codebase']['module'])
    env_class = getattr(env_module, cfg['environment']['codebase']['class'])
    logging.debug("Environment function: {}".format(env_class))
    logging.debug("Host IP: {}".format(cfg['environment']['setup']['host']))

    # Create UR5 Reacher2D environment
    env = env_class(
        setup=cfg['environment']['setup'],
        host=cfg['environment']['setup']['host'],
        dof=cfg['environment']['parameters']['dof'],
        control_type=cfg['environment']['parameters']['control_type'],
        target_type=cfg['environment']['parameters']['target_type'],
        reset_type=cfg['environment']['parameters']['reset_type'],
        reward_type=cfg['environment']['parameters']['reward_type'],
        derivative_type=cfg['environment']['parameters']['derivative_type'],
        deriv_action_max=cfg['environment']['parameters']['deriv_action_max'],
        first_deriv_max=cfg['environment']['parameters']['first_deriv_max'],
        accel_max=cfg['environment']['parameters']['accel_max'],
        speed_max=cfg['environment']['parameters']['speed_max'],
        speedj_a=cfg['environment']['parameters']['speedj_a'],
        episode_length_time=cfg['environment']['parameters']['episode_length_time'],
        episode_length_step=cfg['environment']['parameters']['episode_length_step'],
        actuation_sync_period=cfg['environment']['parameters']['actuation_sync_period'],
        dt=cfg['environment']['parameters']['dt'],
        run_mode=cfg['environment']['parameters']['run_mode'],
        rllab_box=cfg['environment']['parameters']['rllab_box'],
        movej_t=cfg['environment']['parameters']['movej_t'],
        delay=cfg['environment']['parameters']['delay'],
        random_state=rand_state if seed_low is not None else None,
    )
    env = NormalizedEnv(env)

    # Start environment processes
    env.start()

    # Create baselines TRPO policy function
    sess = U.single_threaded_session()
    sess.__enter__()

    policy_fn_module = importlib.import_module(cfg['model']['module'])
    policy_fn_class = getattr(policy_fn_module, cfg['model']['class'])
    logging.debug("Policy function: {}".format(policy_fn_class))

    def policy_fn(name, ob_space, ac_space):
        return policy_fn_class(name=name,
                               ob_space=ob_space,
                               ac_space=ac_space,
                               hid_size=cfg['algorithm']['hyperparameters']['hid_size'],
                               num_hid_layers=cfg['algorithm']['hyperparameters']['num_hid_layers'])

    return env, policy_fn
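# For reference, a minimal sketch of the configuration structure get_env reads. The keys mirror the
# lookups in the function above; the concrete values and module paths are placeholders and
# assumptions, not settings from this project (the config itself is presumably loaded from a file).

cfg = {
    'global': {'seed': {'low': 1, 'high': 2**31 - 1}},
    'environment': {
        'codebase': {'module': 'senseact.envs.ur.reacher_env', 'class': 'ReacherEnv'},  # assumed path
        'setup': {'host': '192.168.1.102'},
        'parameters': {
            'dof': 2, 'control_type': 'velocity', 'target_type': 'position',
            'reset_type': 'zero', 'reward_type': 'precision', 'derivative_type': 'none',
            'deriv_action_max': 5, 'first_deriv_max': 2, 'accel_max': 1.4,
            'speed_max': 0.3, 'speedj_a': 1.4, 'episode_length_time': 4.0,
            'episode_length_step': None, 'actuation_sync_period': 1, 'dt': 0.04,
            'run_mode': 'multiprocess', 'rllab_box': False, 'movej_t': 2.0, 'delay': 0.0,
        },
    },
    'model': {'module': 'baselines.ppo1.mlp_policy', 'class': 'MlpPolicy'},
    'algorithm': {'hyperparameters': {'hid_size': 32, 'num_hid_layers': 2}},
}

env, policy_fn = get_env(cfg)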