Example #1
def main():
    # Create the Create2 mover environment
    env = Create2MoverEnv(90, port='/dev/ttyUSB0', obs_history=1, dt=0.15)
    env = NormalizedEnv(env)

    # Start environment processes
    env.start()

    # Create baselines TRPO policy function
    sess = U.single_threaded_session()
    sess.__enter__()

    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name,
                         ob_space=ob_space,
                         ac_space=ac_space,
                         hid_size=32,
                         num_hid_layers=2)

    # Create and start plotting process
    plot_running = Value('i', 1)
    shared_returns = Manager().dict({
        "write_lock": False,
        "episodic_returns": [],
        "episodic_lengths": [],
    })
    # Spawn plotting process
    pp = Process(target=plot_create2_mover,
                 args=(env, 2048, shared_returns, plot_running))
    pp.start()

    # Create callback function for logging data from baselines TRPO learn
    kindred_callback = create_callback(shared_returns)

    # Train baselines TRPO
    learn(env,
          policy_fn,
          max_timesteps=40000,
          timesteps_per_batch=2048,
          max_kl=0.05,
          cg_iters=10,
          cg_damping=0.1,
          vf_iters=5,
          vf_stepsize=0.001,
          gamma=0.995,
          lam=0.995,
          callback=kindred_callback)

    # Safely terminate plotter process
    plot_running.value = 0  # shut down plotting process
    time.sleep(2)
    pp.join()

    env.close()
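
The helpers `create_callback` and `plot_create2_mover` are imported from the surrounding example code and are not shown here. As a rough illustration of the pattern, the sketch below shows one way such a callback could forward episode statistics from the baselines `learn` loop into the shared dictionary; the `locals_['seg']` access and the guard are assumptions, not the actual helper implementation.

def create_callback(shared_returns):
    def kindred_callback(locals_, globals_):
        # baselines calls callback(locals(), globals()) once per iteration; after the first
        # iteration, 'seg' holds the most recent rollout segment with episode statistics.
        if locals_ is None or 'seg' not in locals_:
            return
        shared_returns['write_lock'] = True
        shared_returns['episodic_returns'] += locals_['seg'].get('ep_rets', [])
        shared_returns['episodic_lengths'] += locals_['seg'].get('ep_lens', [])
        shared_returns['write_lock'] = False
    return kindred_callback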
Example #2
def main():
    # use fixed random state
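    # (seeding NumPy first and drawing the TensorFlow seed from it keeps the run reproducible;
    # the same rand_state is also handed to the environment below)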
    rand_state = np.random.RandomState(1).get_state()
    np.random.set_state(rand_state)
    tf_set_seeds(np.random.randint(1, 2**31 - 1))

    # Create DXL Reacher1D environment
    env = DxlReacher1DEnv(setup='dxl_gripper_default',
                          idn=1,
                          baudrate=1000000,
                          obs_history=1,
                          dt=0.04,
                          gripper_dt=0.01,
                          rllab_box=False,
                          episode_length_step=None,
                          episode_length_time=2,
                          max_torque_mag=100,
                          control_type='torque',
                          target_type='position',
                          reset_type='zero',
                          reward_type='linear',
                          use_ctypes_driver=True,
                          random_state=rand_state)

    # The policy outputs are sampled from a Gaussian, whereas the torque commands expected by the
    # environment lie in [-max_torque_mag, max_torque_mag]. The NormalizedEnv wrapper rescales
    # actions accordingly; by default it does not normalize observations or rewards.
    env = NormalizedEnv(env)

    # Start environment processes
    env.start()

    # Create baselines TRPO policy function
    sess = U.single_threaded_session()
    sess.__enter__()

    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name,
                         ob_space=ob_space,
                         ac_space=ac_space,
                         hid_size=32,
                         num_hid_layers=2)

    # Create and start plotting process
    plot_running = Value('i', 1)
    shared_returns = Manager().dict({
        "write_lock": False,
        "episodic_returns": [],
        "episodic_lengths": [],
    })
    # Plotting process
    pp = Process(target=plot_dxl_reacher,
                 args=(env, 2048, shared_returns, plot_running))
    pp.start()

    # Create callback function for logging data from baselines TRPO learn
    kindred_callback = create_callback(shared_returns)

    # Train baselines TRPO
    learn(
        env,
        policy_fn,
        max_timesteps=50000,
        timesteps_per_batch=2048,
        max_kl=0.05,
        cg_iters=10,
        cg_damping=0.1,
        vf_iters=5,
        vf_stepsize=0.001,
        gamma=0.995,
        lam=0.995,
        callback=kindred_callback,
    )

    # Safely terminate plotter process
    plot_running.value = 0  # shut down plotting process
    time.sleep(2)
    pp.join()

    # Shutdown the environment
    env.close()
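
The plotting helper is likewise not shown; its signature suggests it polls `shared_returns` until `plot_running` is cleared. A hypothetical minimal version is sketched below (the real `plot_dxl_reacher` may differ substantially; `env` and `batch_size` are unused in this sketch).

import time
import matplotlib.pyplot as plt

def plot_dxl_reacher(env, batch_size, shared_returns, plot_running):
    # Hypothetical plotting-process target with the signature used above.
    plt.ion()
    fig, ax = plt.subplots()
    while plot_running.value:  # main() sets this to 0 to request shutdown
        if not shared_returns['write_lock']:
            returns = list(shared_returns['episodic_returns'])
            ax.clear()
            ax.plot(returns)
            ax.set_xlabel('episode')
            ax.set_ylabel('return')
            plt.pause(0.001)
        time.sleep(1.0)
    plt.close(fig)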
Example #3
def main():
    # optionally use a pretrained model
    load_model_data = None
    hidden_sizes = (32, 32)
    if len(sys.argv) > 1:
        load_model_path = sys.argv[1]
        load_model_data = pkl.load(open(load_model_path, 'rb'))
        hidden_sizes = load_model_data['hidden_sizes']

    # use fixed random state
    rand_state = np.random.RandomState(1).get_state()
    np.random.set_state(rand_state)
    tf_set_seeds(np.random.randint(1, 2**31 - 1))

    # Create the Create2 mover environment
    env = Create2MoverEnv(90,
                          port='/dev/ttyUSB0',
                          obs_history=1,
                          dt=0.15,
                          random_state=rand_state)
    env = NormalizedEnv(env)

    # Start environment processes
    env.start()

    # Create baselines TRPO policy function
    sess = U.single_threaded_session()
    sess.__enter__()

    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name,
                         ob_space=ob_space,
                         ac_space=ac_space,
                         hid_size=hidden_sizes[0],
                         num_hid_layers=len(hidden_sizes))

    # Create and start plotting process
    plot_running = Value('i', 1)
    shared_returns = Manager().dict({
        "write_lock": False,
        "episodic_returns": [],
        "episodic_lengths": [],
    })
    # Spawn plotting process
    pp = Process(target=plot_create2_mover,
                 args=(env, 2048, shared_returns, plot_running))
    pp.start()

    # Create callback function for logging data from baselines TRPO learn
    kindred_callback = create_callback(shared_returns, load_model_data)

    # Train baselines TRPO
    learn(env,
          policy_fn,
          max_timesteps=40000,
          timesteps_per_batch=2048,
          max_kl=0.05,
          cg_iters=10,
          cg_damping=0.1,
          vf_iters=5,
          vf_stepsize=0.001,
          gamma=0.995,
          lam=0.995,
          callback=kindred_callback)

    # Safely terminate plotter process
    plot_running.value = 0  # shut down plotting process
    time.sleep(2)
    pp.join()

    env.close()
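
This variant optionally resumes from a pickled model passed as the first command-line argument; the only key the snippet itself reads is 'hidden_sizes', and anything else `create_callback` might restore from `load_model_data` is not shown. A hypothetical way such a pickle could be produced (file name and extra keys are placeholders):

import pickle as pkl

model_data = {'hidden_sizes': (32, 32)}  # plus whatever weights create_callback restores
with open('saved_models/create2_mover.pkl', 'wb') as f:
    pkl.dump(model_data, f)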
Example #4
class Environment(object):
    def __init__(self,
                 run_dir,
                 env_name,
                 alg='mairlImit',
                 train_mode=False,
                 obs_mode='pixel'):
        """

        :param run_dir:
        :param env_name:
        :param alg: 'mairlImit', 'mairlImit4Transfer', 'mairlTransfer', 'mgail'
        :param obs_mode: 'pixel', 'state'
        """
        self.run_dir = run_dir
        self.name = env_name
        self.alg = alg
        self.obs_mode = obs_mode
        assert self.alg in [
            'mairlImit', 'mairlImit4Transfer', 'mairlTransfer', 'mgail'
        ], '{} is not implemented!'.format(self.alg)
        self.train_mode = train_mode
        if env_name in ['UR5_Reacher']:
            rand_state = np.random.RandomState(1).get_state()
            env = ReacherEnv(setup="UR5_6dof",
                             host="192.168.1.102",
                             dof=6,
                             control_type="velocity",
                             target_type="position",
                             reset_type="zero",
                             reward_type="precision",
                             derivative_type="none",
                             deriv_action_max=5,
                             first_deriv_max=2,
                             accel_max=1.4,
                             speed_max=0.3,
                             speedj_a=1.4,
                             episode_length_time=4.0,
                             episode_length_step=None,
                             actuation_sync_period=1,
                             dt=0.04,
                             run_mode="multiprocess",
                             rllab_box=False,
                             movej_t=2.0,
                             delay=0.0,
                             random_state=rand_state)
            self.gym = NormalizedEnv(env)
            self.gym.start()
        else:
            self.gym = gym.make(self.name)
        self.random_initialization = True
        self._connect()
        self._train_params()
        self.set_seed()

    def _step(self, action):
        action = np.squeeze(action)
        if action.shape == ():
            action = np.expand_dims(action, axis=0)
            # or use: action = [action]
        self.t += 1
        if isinstance(self.gym.action_space, spaces.Discrete):
            action = int(action)
        result = self.gym.step(action)
        self.state, self.reward, self.done, self.info = result[:4]
        if self.obs_mode == 'pixel':
            self.state = cv2.resize(self.gym.render('rgb_array'),
                                    dsize=(64, 64),
                                    interpolation=cv2.INTER_AREA)
        if self.random_initialization:
            if hasattr(self.gym, 'env') and hasattr(self.gym.env, 'data'):
                self.qpos, self.qvel = self.gym.env.data.qpos.flatten(
                ), self.gym.env.data.qvel.flatten()
            else:
                self.qpos, self.qvel = [], []
            return np.float32(self.state), np.float32(
                self.reward), self.done, np.float32(self.qpos), np.float32(
                    self.qvel)
        else:
            return np.float32(self.state), np.float32(self.reward), self.done

    def step(self, action, mode):
        qvel, qpos = [], []
        if mode == 'tensorflow':
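            # tf.py_func wraps the Python _step call as a TensorFlow op, so the environment
            # transition can be evaluated from inside the computation graph.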
            if self.random_initialization:
                state, reward, done, qvel, qpos = tf.py_func(
                    self._step,
                    inp=[action],
                    Tout=[
                        tf.float32, tf.float32, tf.bool, tf.float32, tf.float32
                    ],
                    name='env_step_func')
            else:
                state, reward, done = tf.py_func(
                    self._step,
                    inp=[action],
                    Tout=[tf.float32, tf.float32, tf.bool],
                    name='env_step_func')

            state = tf.reshape(state, shape=self.state_size)
            done.set_shape(())
        else:
            if self.random_initialization:
                state, reward, done, qvel, qpos = self._step(action)
            else:
                state, reward, done = self._step(action)

        return state, reward, done, 0., qvel, qpos

    def reset(self, qpos=None, qvel=None):
        self.t = 0
        self.state = self.gym.reset()
        if self.obs_mode == 'pixel':
            self.state = cv2.resize(self.gym.render('rgb_array'),
                                    dsize=(64, 64),
                                    interpolation=cv2.INTER_CUBIC)
        if self.random_initialization and qpos is not None and qvel is not None and hasattr(
                self.gym, 'env') and hasattr(self.gym.env, 'set_state'):
            self.gym.env.set_state(qpos, qvel)
        return np.float32(self.state)

    def get_status(self):
        return self.done

    def get_state(self):
        return self.state

    def render(self, mode='human'):
        img = self.gym.render(mode=mode)
        return img

    def _connect(self):
        if self.obs_mode == 'pixel':
            self.state_size = (64, 64, 3)
        else:
            if isinstance(self.gym.observation_space, spaces.Box):
                self.state_size = self.gym.observation_space.shape
            else:
                self.state_size = (self.gym.observation_space.n, )
        if isinstance(self.gym.action_space, spaces.Box):
            self.action_size = self.gym.action_space.shape[0]
        else:
            self.action_size = self.gym.action_space.n
        self.action_space = np.asarray([None] * self.action_size)
        if hasattr(self.gym, 'env') and hasattr(self.gym.env, 'data'):
            self.qpos_size = self.gym.env.data.qpos.shape[0]
            self.qvel_size = self.gym.env.data.qvel.shape[0]
        else:
            self.qpos_size = 0
            self.qvel_size = 0

    def set_seed(self):
        tf.set_random_seed(self.seed)
        random.seed(self.seed)
        self.gym.seed(self.seed)
        np.random.seed(self.seed)

    def _train_params(self):
        self.seed = 0
        if self.name == 'Hopper-v2':
            self.expert_data = 'expert_trajectories/hopper_er.bin'
        elif self.name in [
                'Ant-v2', 'CartPole-v0', 'GridWorldGym-v0', 'HalfCheetah-v2',
                'Swimmer-v2', 'Pendulum-v0'
        ]:
            self.expert_data = 'expert_data/{}_expert_{}.bin'.format(
                self.obs_mode, self.name)
        elif self.name == 'PointMazeRight-v0':
            self.expert_data = 'expert_data/{}_expert_{}.bin'.format(
                self.obs_mode, 'PointMazeLeft-v0')
        elif self.name == 'DisabledAnt-v0':
            self.expert_data = 'expert_data/{}_expert_{}.bin'.format(
                self.obs_mode, 'CustomAnt-v0')
        elif self.name in ['PointMazeLeft-v0', 'CustomAnt-v0']:
            self.expert_data = 'packages/gail_expert/{}_expert_{}.bin'.format(
                self.obs_mode, self.name)
        elif self.name in ['UR5_Reacher']:
            self.expert_data = 'packages/gail_expert/{}_expert_{}.bin'.format(
                self.obs_mode, self.name)
        else:
            raise NotImplementedError('Env {} is not implemented.'.format(
                self.name))

        if not self.train_mode:
            self.trained_model = 'snapshots/20200705225434_Ant-v2_train_mairlImit_s_100/2020-07-06-07-20-175000.sn'
            # Test episode number: self.n_train_iters / self.test_interval * self.n_episodes_test
            self.n_train_iters = 1
            self.test_interval = 1
            self.n_episodes_test = 10
        else:
            if self.alg == 'mairlTransfer':
                self.trained_model = 'snapshots/20200804190406_PointMazeLeft-v0_train_mairlImit4Transfer_s_10_False_False_False/2020-08-05-11-01-720000.sn'
            else:
                self.trained_model = None
            self.n_train_iters = 1000000
            self.test_interval = 1000
            self.n_episodes_test = 1

        if self.name in ['GridWorldGym-v0']:
            self.n_steps_test = self.gym.spec.max_episode_steps  # 20
        else:
            self.n_steps_test = 1000
        self.vis_flag = False
        self.save_models = True
        if self.name in ['GridWorldGym-v0', 'MountainCar-v0', 'CartPole-v0']:
            self.continuous_actions = False
        else:
            self.continuous_actions = True
        self.airl_entropy_weight = 1.0
        if self.alg in ['mairlImit4Transfer', 'mairlTransfer']:
            self.use_airl = True
            self.disc_out_dim = 1
            self.phi_size = None  # [200, 100]
            self.forward_model_type = 'gru'
            self.state_only = True  # False
        elif self.alg in ['mairlImit']:
            self.use_airl = True
            self.disc_out_dim = 1
            self.phi_size = None  # [200, 100]
            self.forward_model_type = 'transformer'  # 'transformer'  # 'gru'
            self.state_only = False
        else:
            self.use_airl = False
            self.disc_out_dim = 2
            self.phi_size = None  # [200, 100]
            self.forward_model_type = 'gru'
            self.state_only = False

        # Main parameters to play with:
        self.er_agent_size = 50000
        self.collect_experience_interval = 15
        self.n_steps_train = 10
        if self.state_only:
            if self.name in ['PointMazeLeft-v0', 'CustomAnt-v0']:
                self.discr_policy_itrvl = 10
            else:
                self.discr_policy_itrvl = 100
            self.prep_time = 0
            self.save_best_ckpt = False
        else:
            self.discr_policy_itrvl = 100
            self.prep_time = 1000
            self.save_best_ckpt = True
        if self.forward_model_type == 'transformer':
            self.use_scale_dot_product = True
            self.use_skip_connection = True
            self.use_dropout = False
        else:
            self.use_scale_dot_product = False
            self.use_skip_connection = False
            self.use_dropout = False
        self.gamma = 0.99
        self.batch_size = 512  # 70
        self.weight_decay = 1e-7
        self.policy_al_w = 1e-2
        self.policy_tr_w = 1e-4
        self.policy_accum_steps = 7
        self.total_trans_err_allowed = 1000
        self.temp = 1.
        self.cost_sensitive_weight = 0.8
        self.noise_intensity = 6.
        self.do_keep_prob = 0.75
        self.forward_model_lambda = 0.  # 0.1

        # Hidden layers size
        self.fm_size = 100
        self.d_size = [200, 100]
        self.p_size = [100, 50]
        self.encoder_feat_size = 1024  # (30,)

        # Learning rates
        self.fm_lr = 1e-4
        self.d_lr = 1e-3
        self.p_lr = 1e-4

        # Log
        self.exp_name = '{}_{}_{}_{}_{}_{}_{}_{}_{}'.format(
            time.strftime("%Y%m%d%H%M%S", time.localtime()), self.name,
            'train' if self.train_mode else 'eval', self.alg,
            's' if self.state_only else 'sa', self.discr_policy_itrvl,
            self.use_scale_dot_product, self.use_skip_connection,
            self.use_dropout)
        self.config_dir = os.path.join(self.run_dir, 'snapshots',
                                       self.exp_name)
        self.log_intervel = 100
        self.save_video = True

        if not os.path.isdir(self.config_dir):
            os.makedirs(self.config_dir)

        with open(os.path.join(self.config_dir, 'log.txt'), 'a') as f:
            f.write("{0}: {1}\n".format('seed', self.seed))
            f.write("{0}: {1}\n".format('name', self.name))
            f.write("{0}: {1}\n".format('expert_data', self.expert_data))
            f.write("{0}: {1}\n".format('train_mode', self.train_mode))
            f.write("{0}: {1}\n".format('trained_model', self.trained_model))
            f.write("{0}: {1}\n".format('n_train_iters', self.n_train_iters))
            f.write("{0}: {1}\n".format('test_interval', self.test_interval))
            f.write("{0}: {1}\n".format('n_episodes_test',
                                        self.n_episodes_test))
            f.write("{0}: {1}\n".format('alg', self.alg))
            f.write("{0}: {1}\n".format('n_steps_test', self.n_steps_test))
            f.write("{0}: {1}\n".format('vis_flag', self.vis_flag))
            f.write("{0}: {1}\n".format('save_models', self.save_models))

            f.write("{0}: {1}\n".format('continuous_actions',
                                        self.continuous_actions))
            f.write("{0}: {1}\n".format('airl_entropy_weight',
                                        self.airl_entropy_weight))
            f.write("{0}: {1}\n".format('use_airl', self.use_airl))
            f.write("{0}: {1}\n".format('disc_out_dim', self.disc_out_dim))
            f.write("{0}: {1}\n".format('phi_size', self.phi_size))
            f.write("{0}: {1}\n".format('forward_model_type',
                                        self.forward_model_type))
            f.write("{0}: {1}\n".format('state_only', self.state_only))
            f.write("{0}: {1}\n".format('er_agent_size', self.er_agent_size))
            f.write("{0}: {1}\n".format('collect_experience_interval',
                                        self.collect_experience_interval))
            f.write("{0}: {1}\n".format('n_steps_train', self.n_steps_train))
            f.write("{0}: {1}\n".format('discr_policy_itrvl',
                                        self.discr_policy_itrvl))
            f.write("{0}: {1}\n".format('prep_time', self.prep_time))

            f.write("{0}: {1}\n".format('gamma', self.gamma))
            f.write("{0}: {1}\n".format('batch_size', self.batch_size))
            f.write("{0}: {1}\n".format('weight_decay', self.weight_decay))
            f.write("{0}: {1}\n".format('policy_al_w', self.policy_al_w))
            f.write("{0}: {1}\n".format('policy_tr_w', self.policy_tr_w))
            f.write("{0}: {1}\n".format('policy_accum_steps',
                                        self.policy_accum_steps))
            f.write("{0}: {1}\n".format('total_trans_err_allowed',
                                        self.total_trans_err_allowed))
            f.write("{0}: {1}\n".format('temp', self.temp))
            f.write("{0}: {1}\n".format('cost_sensitive_weight',
                                        self.cost_sensitive_weight))
            f.write("{0}: {1}\n".format('noise_intensity',
                                        self.noise_intensity))
            f.write("{0}: {1}\n".format('do_keep_prob', self.do_keep_prob))
            f.write("{0}: {1}\n".format('forward_model_lambda',
                                        self.forward_model_lambda))

            f.write("{0}: {1}\n".format('fm_size', self.fm_size))
            f.write("{0}: {1}\n".format('d_size', self.d_size))
            f.write("{0}: {1}\n".format('p_size', self.p_size))
            f.write("{0}: {1}\n".format('fm_lr', self.fm_lr))
            f.write("{0}: {1}\n".format('d_lr', self.d_lr))
            f.write("{0}: {1}\n".format('p_lr', self.p_lr))
            f.write("{0}: {1}\n".format('exp_name', self.exp_name))
            f.write("{0}: {1}\n".format('config_dir', self.config_dir))
            f.write("{0}: {1}\n".format('log_intervel', self.log_intervel))
            f.write("{0}: {1}\n".format('save_video', self.save_video))
            f.write("{0}: {1}\n".format('save_best_ckpt', self.save_best_ckpt))
            f.write("{0}: {1}\n".format('obs_mode', self.obs_mode))
            f.write("{0}: {1}\n".format('use_scale_dot_product',
                                        self.use_scale_dot_product))
            f.write("{0}: {1}\n".format('use_skip_connection',
                                        self.use_skip_connection))
            f.write("{0}: {1}\n".format('use_dropout', self.use_dropout))
def main():
    # optionally use a pretrained model
    save_model_path = None
    load_model_path = None
    load_trained_model = False
    hidden_sizes = (64, 64, 64)

    if len(sys.argv) > 2:  # a pretrained model path was also given
        load_trained_model = True

    save_model_path = sys.argv[1]  # e.g. saved/uniform/X/Y/Z/
    os.makedirs(save_model_path, exist_ok=True)
    run_dirs = os.listdir(save_model_path)
    save_model_basepath = os.path.join(save_model_path, 'run_' + str(len(run_dirs) + 1)) + '/'
    os.makedirs(os.path.join(save_model_basepath, 'models'), exist_ok=True)
    os.makedirs(os.path.join(save_model_basepath, 'data'), exist_ok=True)

    if load_trained_model:
        load_model_path = sys.argv[2]  # e.g. saved/uniform/X/Y/Z/run_1/model*

    # use fixed random state
    #rand_state = np.random.RandomState(1).get_state()
    #np.random.set_state(rand_state)
    tf_set_seeds(np.random.randint(1, 2**31 - 1))

    # Create the Create2 docker environment
    # distro = np.array([0, 1, 0, 0, 0, 0, 0, 0, 0])
    # distro = np.array([0.575, 0.425, 0, 0, 0, 0, 0, 0, 0])
    # distro = np.array([0.25, 0.2, 0.55, 0, 0, 0, 0, 0, 0])
    # distro = np.array([0.1, 0.1, 0.25, 0.55, 0, 0, 0, 0, 0])
    # FAILED: distro = np.array([0.05, 0.05, 0.15, 0.275, 0, 0, 0, 0.475, 0])
    # FAILED: distro = np.array([0.05, 0.05, 0.15, 0.275, 0, 0.475, 0, 0, 0])
    # FAILED: distro = np.array([0.10, 0.05, 0.10, 0.35, 0, 0.4, 0, 0, 0])
    #distro = np.array([0.10, 0.05, 0.10, 0.375, 0.375, 0, 0, 0, 0]) #run 1
    # distro = np.array([0.06, 0.03, 0.06, 0.425, 0.425, 0, 0, 0, 0]) # run2
    # distro = np.array([0.025, 0.05, 0.05, 0.25, 0.25, 0.375, 0, 0, 0]) # part 1, first 100 episodes
    # OK: distro = np.array([0.05, 0.025, 0.05, 0.225, 0.225, 0.425, 0, 0, 0])
    #distro = np.array([0.025, 0.02, 0.025, 0.1375, 0.1375, 0.3275, 0.3275, 0, 0])
    # FAILED: distro = np.array([0.015, 0.015, 0.02, 0.06, 0.09, 0.35, 0.45, 0, 0])
    distro = np.array([0, 1, 0, 0, 0, 0, 0, 0, 0]) 

    env = Create2DockerEnv(30, distro,
                           port='/dev/ttyUSB0', ir_window=20,
                           ir_history=1,
                           obs_history=1, dt=0.045)
                           #random_state=rand_state)
    env = NormalizedEnv(env)

    # Start environment processes
    env.start()

    # Create baselines PPO policy function
    sess = U.single_threaded_session()
    sess.__enter__()

    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name,
                         ob_space=ob_space,
                         ac_space=ac_space,
                         hid_size=hidden_sizes[0],
                         num_hid_layers=len(hidden_sizes))

    # Create and start plotting process
    #plot_running = Value('i', 1)
    shared_returns = Manager().dict({
        "write_lock": False,
        "episodic_returns": [],
        "episodic_lengths": [],
        "episodic_ss": [],
    })
    # Spawn plotting process
    #pp = Process(target=plot_create2_docker, args=(env, 2048, shared_returns, plot_running))
    #pp.start()

    # Create callback function for logging data from baselines PPO learn
    kindred_callback = create_callback(shared_returns, save_model_basepath, load_model_path)

    # Train baselines PPO
    model = learn(
        env,
        policy_fn,
        max_timesteps=100000,
        timesteps_per_actorbatch=675,  # 512
        clip_param=0.2,
        entcoeff=0.0,
        optim_epochs=10,
        optim_stepsize=0.00005,
        optim_batchsize=16,
        gamma=0.96836,
        lam=0.99944,
        schedule="linear",
        callback=kindred_callback,
    )

    # Safely terminate plotter process (left commented out since plotting is disabled above)
    #plot_running.value = 0  # shut down plotting process
    #time.sleep(2)
    #pp.join()

    env.close()
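
This script reads its output directory, and optionally a pretrained model, from the command line, so a run might be launched roughly as follows; the script name is a placeholder and the paths echo the comments in the code.

# Hypothetical invocations:
#   python create2_docker_ppo.py saved/uniform/X/Y/Z/
#   python create2_docker_ppo.py saved/uniform/X/Y/Z/ saved/uniform/X/Y/Z/run_1/model*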
Example #6
def get_env(cfg):
    # Set seed to potentially fix random state
    seed_low  = cfg['global']['seed']['low'] 
    seed_high = cfg['global']['seed']['high']
    if seed_low is not None:
        logging.debug('Using seed [{},{}) "half-open" interval'.format(seed_low, seed_high))
        rand_state = np.random.RandomState(seed_low).get_state()
        np.random.set_state(rand_state)
        tf_set_seeds(np.random.randint(seed_low, seed_high))
    else:
        logging.debug('Not using any seeds!')

    # Load the RL Environment
    env_module = importlib.import_module(cfg['environment']['codebase']['module'])
    env_class = getattr(env_module, cfg['environment']['codebase']['class'])
    logging.debug("Environment function: {}".format(env_class))

    logging.debug("Host IP: {}".format(cfg['environment']['setup']['host']))

    # Create UR5 Reacher2D environment
    env = env_class(
            setup                 = cfg['environment']['setup'],
            host                  = cfg['environment']['setup']['host'],
            dof                   = cfg['environment']['parameters']['dof'],
            control_type          = cfg['environment']['parameters']['control_type'],
            target_type           = cfg['environment']['parameters']['target_type'],
            reset_type            = cfg['environment']['parameters']['reset_type'],
            reward_type           = cfg['environment']['parameters']['reward_type'],
            derivative_type       = cfg['environment']['parameters']['derivative_type'],
            deriv_action_max      = cfg['environment']['parameters']['deriv_action_max'],
            first_deriv_max       = cfg['environment']['parameters']['first_deriv_max'],
            accel_max             = cfg['environment']['parameters']['accel_max'],
            speed_max             = cfg['environment']['parameters']['speed_max'],
            speedj_a              = cfg['environment']['parameters']['speedj_a'],
            episode_length_time   = cfg['environment']['parameters']['episode_length_time'],
            episode_length_step   = cfg['environment']['parameters']['episode_length_step'],
            actuation_sync_period = cfg['environment']['parameters']['actuation_sync_period'],
            dt                    = cfg['environment']['parameters']['dt'],
            run_mode              = cfg['environment']['parameters']['run_mode'],
            rllab_box             = cfg['environment']['parameters']['rllab_box'],
            movej_t               = cfg['environment']['parameters']['movej_t'],
            delay                 = cfg['environment']['parameters']['delay'],
            random_state          = rand_state if seed_low is not None else None
        )
    env = NormalizedEnv(env)

    # Start environment processes
    env.start()

    # Create baselines TRPO policy function
    sess = U.single_threaded_session()
    sess.__enter__()
    policy_fn_module = importlib.import_module(cfg['model']['module'])
    policy_fn_class = getattr(policy_fn_module, cfg['model']['class'])
    logging.debug("Policy function: {}".format(policy_fn_class))

    def policy_fn(name, ob_space, ac_space):
        return policy_fn_class(name           = name,
                               ob_space       = ob_space,
                               ac_space       = ac_space,
                               hid_size       = cfg['algorithm']['hyperparameters']['hid_size'],
                               num_hid_layers = cfg['algorithm']['hyperparameters']['num_hid_layers'])
    
    return env, policy_fn
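
Since get_env only reads the configuration keys visible above, a minimal cfg might look roughly like the Python dict below. The module paths, the structure of 'setup' beyond its 'host' key, and the hid_size/num_hid_layers values are illustrative assumptions; the environment parameter values are copied from the UR5 ReacherEnv call in Example #4.

# Hypothetical minimal config covering the keys read by get_env (values are placeholders).
cfg = {
    'global': {'seed': {'low': 1, 'high': 2**31 - 1}},
    'environment': {
        'codebase': {'module': 'senseact.envs.ur.reacher_env', 'class': 'ReacherEnv'},
        'setup': {'host': '192.168.1.102'},  # passed through as `setup` and queried for 'host'
        'parameters': {
            'dof': 6, 'control_type': 'velocity', 'target_type': 'position',
            'reset_type': 'zero', 'reward_type': 'precision', 'derivative_type': 'none',
            'deriv_action_max': 5, 'first_deriv_max': 2, 'accel_max': 1.4,
            'speed_max': 0.3, 'speedj_a': 1.4, 'episode_length_time': 4.0,
            'episode_length_step': None, 'actuation_sync_period': 1, 'dt': 0.04,
            'run_mode': 'multiprocess', 'rllab_box': False, 'movej_t': 2.0, 'delay': 0.0,
        },
    },
    'model': {'module': 'baselines.ppo1.mlp_policy', 'class': 'MlpPolicy'},
    'algorithm': {'hyperparameters': {'hid_size': 64, 'num_hid_layers': 2}},
}

env, policy_fn = get_env(cfg)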