Code Example #1
    def __init__(self,
                 run_dir,
                 env_name,
                 alg='mairlImit',
                 train_mode=False,
                 obs_mode='pixel'):
        """

        :param run_dir:
        :param env_name:
        :param alg: 'mairlImit', 'mairlImit4Transfer', 'mairlTransfer', 'mgail'
        :param obs_mode: 'pixel', 'state'
        """
        self.run_dir = run_dir
        self.name = env_name
        self.alg = alg
        self.obs_mode = obs_mode
        assert self.alg in [
            'mairlImit', 'mairlImit4Transfer', 'mairlTransfer', 'mgail'
        ], '{} is not Implemented!'.format(self.alg)
        self.train_mode = train_mode
        if env_name in ['UR5_Reacher']:
            rand_state = np.random.RandomState(1).get_state()
            env = ReacherEnv(setup="UR5_6dof",
                             host="192.168.1.102",
                             dof=6,
                             control_type="velocity",
                             target_type="position",
                             reset_type="zero",
                             reward_type="precision",
                             derivative_type="none",
                             deriv_action_max=5,
                             first_deriv_max=2,
                             accel_max=1.4,
                             speed_max=0.3,
                             speedj_a=1.4,
                             episode_length_time=4.0,
                             episode_length_step=None,
                             actuation_sync_period=1,
                             dt=0.04,
                             run_mode="multiprocess",
                             rllab_box=False,
                             movej_t=2.0,
                             delay=0.0,
                             random_state=rand_state)
            self.gym = NormalizedEnv(env)
            self.gym.start()
        else:
            self.gym = gym.make(self.name)
        self.random_initialization = True
        self._connect()
        self._train_params()
        self.set_seed()
Code Example #2
def robotic_env():
    rand_state = np.random.RandomState(1).get_state()
    np.random.set_state(rand_state)
    tf_set_seeds(np.random.randint(1, 2**31 - 1))
    env = ReacherEnv(
        setup="UR5_default",
        host='169.254.39.68',
        dof=2,
        control_type="velocity",
        target_type="position",
        reset_type="zero",
        reward_type="precision",
        derivative_type="none",
        deriv_action_max=5,
        first_deriv_max=2,
        accel_max=1.4,
        speed_max=0.3,
        speedj_a=1.4,
        episode_length_time=4.0,
        episode_length_step=None,
        actuation_sync_period=1,
        dt=0.04,
        #run_mode="multiprocess",
        run_mode='singlethread',
        rllab_box=False,
        movej_t=2.0,
        delay=0.0,
        random_state=rand_state
        )
    env = NormalizedEnv(env)
    env.start()
    return env
Code Example #3
def main():
    # Create the Create2 mover environment
    env = Create2MoverEnv(90, port='/dev/ttyUSB0', obs_history=1, dt=0.15)
    env = NormalizedEnv(env)

    # Start environment processes
    env.start()

    # Create baselines trpo policy function
    sess = U.single_threaded_session()
    sess.__enter__()

    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name,
                         ob_space=ob_space,
                         ac_space=ac_space,
                         hid_size=32,
                         num_hid_layers=2)

    # Create and start plotting process
    plot_running = Value('i', 1)
    shared_returns = Manager().dict({
        "write_lock": False,
        "episodic_returns": [],
        "episodic_lengths": [],
    })
    # Spawn plotting process
    pp = Process(target=plot_create2_mover,
                 args=(env, 2048, shared_returns, plot_running))
    pp.start()

    # Create callback function for logging data from baselines PPO learn
    kindred_callback = create_callback(shared_returns)

    # Train baselines TRPO
    learn(env,
          policy_fn,
          max_timesteps=40000,
          timesteps_per_batch=2048,
          max_kl=0.05,
          cg_iters=10,
          cg_damping=0.1,
          vf_iters=5,
          vf_stepsize=0.001,
          gamma=0.995,
          lam=0.995,
          callback=kindred_callback)

    # Safely terminate plotter process
    plot_running.value = 0  # shutdown plotting process
    time.sleep(2)
    pp.join()

    env.close()
Code Example #4
def main():
    # use fixed random state
    rand_state = np.random.RandomState(1).get_state()
    np.random.set_state(rand_state)
    tf_set_seeds(np.random.randint(1, 2**31 - 1))

    # Create UR5 Reacher2D environment
    env = ReacherEnv(setup="UR5_6dof",
                     host=None,
                     dof=6,
                     control_type="velocity",
                     target_type="position",
                     reset_type="zero",
                     reward_type="precision",
                     derivative_type="none",
                     deriv_action_max=5,
                     first_deriv_max=2,
                     accel_max=1.4,
                     speed_max=0.3,
                     speedj_a=1.4,
                     episode_length_time=4.0,
                     episode_length_step=None,
                     actuation_sync_period=1,
                     dt=0.04,
                     run_mode="multiprocess",
                     rllab_box=False,
                     movej_t=2.0,
                     delay=0.0,
                     random_state=rand_state)
    env = NormalizedEnv(env)
    # Start environment processes
    env.start()
    # Create baselines TRPO policy function
    sess = U.single_threaded_session()
    sess.__enter__()

    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name,
                         ob_space=ob_space,
                         ac_space=ac_space,
                         hid_size=64,
                         num_hid_layers=2)

    # Create and start plotting process
    plot_running = Value('i', 1)
    shared_returns = Manager().dict({
        "write_lock": False,
        "episodic_returns": [],
        "episodic_lengths": [],
    })
    # Spawn plotting process
    pp = Process(target=plot_ur5_reacher,
                 args=(env, 2048, shared_returns, plot_running))
    pp.start()

    # Create callback function for logging data from baselines TRPO learn
    kindred_callback = create_callback(shared_returns)

    # Train baselines TRPO
    learn(env,
          policy_fn,
          max_timesteps=200000,
          timesteps_per_batch=2048,
          max_kl=0.05,
          cg_iters=10,
          cg_damping=0.1,
          vf_iters=5,
          vf_stepsize=0.001,
          gamma=0.995,
          lam=0.995,
          callback=kindred_callback)

    # Safely terminate plotter process
    plot_running.value = 0  # shutdown plotting process
    time.sleep(2)
    pp.join()

    env.close()
Code Example #5
def normal_test():
    # use fixed random state
    rand_state = np.random.RandomState(1).get_state()
    np.random.set_state(rand_state)
    tf_set_seeds(np.random.randint(1, 2**31 - 1))

    # Create UR10 Reacher environment
    env = ReacherEnv(
            setup="UR10_6dof",
            host=None,
            dof=6,
            control_type="velocity",
            target_type="position",
            reset_type="none",
            reward_type="precision",
            derivative_type="none",
            deriv_action_max=5,
            first_deriv_max=2,
            accel_max=1.4, # was 1.4
            speed_max=0.3, # was 0.3
            speedj_a=1.4,
            episode_length_time=4.0,
            episode_length_step=None,
            actuation_sync_period=1,
            dt=0.04,
            run_mode="multiprocess",
            rllab_box=False,
            movej_t=2.0,
            delay=0.0,
            random_state=rand_state
        )
    env = NormalizedEnv(env)
    # Start environment processes
    env.start()
    # Create baselines TRPO policy function
    sess = U.single_threaded_session()
    sess.__enter__()

    # Load previously trained model if it exists


    # No longer needed
    """def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
            hid_size=64, num_hid_layers=2)"""

    # Create and start plotting process
    plot_running = Value('i', 1)
    shared_returns = Manager().dict({"write_lock": False,
                                     "episodic_returns": [],
                                     "episodic_lengths": [], })
    # Spawn plotting process
    pp = Process(target=plot_ur5_reacher, args=(env, 2048, shared_returns, plot_running))
    pp.start()

    # Create callback function for logging data from baselines TRPO learn
    kindred_callback = create_callback(shared_returns)

    # Train baselines TRPO
    run_policy(network='mlp', 
          num_layers=2, # these are network_kwargs for the MLP network
          num_hidden=64,
          env=env, 
          total_timesteps=10000, #Originally 200,000
          timesteps_per_batch=2048,
          callback=kindred_callback,
          load_path='saved_policies/trpo03/trpo03',
          )

    # Safely terminate plotter process
    plot_running.value = 0  # shutdown plotting process
    time.sleep(2)
    pp.join()

    env.close()
Code Example #6
def simple_circle_test(num_eps, num_iters, policy_path, csv_path, move_vel=0.5, radius=0.15, plane='xy'):
    # use fixed random state
    rand_state = np.random.RandomState(1).get_state()
    np.random.set_state(rand_state)
    tf_set_seeds(np.random.randint(1, 2**31 - 1))

    # set up coordination between eps per iteration and num_test
    episode_length_time = 2*np.pi / move_vel #each ep is one full rotation of the circle
    dt = 0.04
    timesteps_per_ep = int(episode_length_time / dt)
    timesteps_per_iter = int(timesteps_per_ep * num_eps)
    timesteps_total = int(timesteps_per_iter * num_iters)


    # Create MovingPoint environment (target moves along a circle or line)
    env = MovingPointEnv(
            setup="UR10_6dof",
            host=None,
            dof=6,
            control_type="velocity",
            reset_type="zero",
            reward_type="precision",
            derivative_type="none",
            deriv_action_max=5,
            first_deriv_max=2,
            accel_max=1.4, # was 1.4
            speed_max=0.3, # was 0.3
            speedj_a=1.4,
            episode_length_time=episode_length_time,
            episode_length_step=None,
            actuation_sync_period=1,
            dt=dt,
            run_mode="multiprocess",
            rllab_box=False,
            movej_t=2.0,
            delay=0.0,
            random_state=rand_state,
            move_shape='circle', # circle or line
            move_vel=move_vel, # velocity of moving point in m/s or rad/s
            circle_radius=radius,
            circle_plane=plane, # plane which circle is on (xy, yz, xz)
        )
    env = NormalizedEnv(env)
    # Start environment processes
    env.start()
    # Create baselines TRPO policy function
    sess = U.single_threaded_session()
    sess.__enter__()

    # Create and start plotting process
    plot_running = Value('i', 1)
    shared_returns = Manager().dict({"write_lock": False,
                                     "episodic_returns": [],
                                     "episodic_lengths": [], })
    builtins.shared_returns = shared_returns

    callback = create_moving_point_callback(shared_returns, csv_path)

    # Spawn plotting process
    pp = Process(target=plot_ur5_reacher, args=(env, timesteps_per_iter, shared_returns, plot_running))
    pp.start()

    # Run TRPO policy
    run_policy(network='mlp', 
          num_layers=2, # these are network_kwargs for the MLP network
          num_hidden=64,
          env=env, 
          total_timesteps=timesteps_total, #Originally 200,000
          timesteps_per_batch=timesteps_per_iter,
          callback=callback,
          load_path=policy_path
          )

    # Safely terminate plotter process
    plot_running.value = 0  # shutdown plotting process
    time.sleep(2)
    pp.join()

    env.close()
Code Example #7
def run_grid_test(x_points, y_points, z_points, num_test, policy_path):
    # use fixed random state
    rand_state = np.random.RandomState(1).get_state()
    np.random.set_state(rand_state)
    tf_set_seeds(np.random.randint(1, 2**31 - 1))

    # set up coordination between eps per iteration and num_test
    episode_length_time = 4.0
    dt = 0.04
    timesteps_per_ep = episode_length_time / dt
    timesteps_per_batch = int(timesteps_per_ep * num_test)
    total_timesteps = timesteps_per_batch * x_points * y_points * z_points


    # Create GridTest environment
    env = GridTestEnv(
            setup="UR10_6dof",
            host=None,
            dof=6,
            control_type="velocity",
            reset_type="zero",
            reward_type="precision",
            derivative_type="none",
            deriv_action_max=5,
            first_deriv_max=2,
            accel_max=1.4, # was 1.4
            speed_max=0.3, # was 0.3
            speedj_a=1.4,
            episode_length_time=episode_length_time,
            episode_length_step=None,
            actuation_sync_period=1,
            dt=dt,
            run_mode="multiprocess",
            rllab_box=False,
            movej_t=2.0,
            delay=0.0,
            random_state=rand_state,
            x_points=x_points,
            y_points=y_points,
            z_points=z_points,
            num_test=num_test
        )
    env = NormalizedEnv(env)
    # Start environment processes
    env.start()
    # Create baselines TRPO policy function
    sess = U.single_threaded_session()
    sess.__enter__()

    # Create and start plotting process
    plot_running = Value('i', 1)
    shared_returns = Manager().dict({"write_lock": False,
                                     "episodic_returns": [],
                                     "episodic_lengths": [], })
    builtins.shared_returns = shared_returns

    # Spawn plotting process
    pp = Process(target=plot_ur5_reacher, args=(env, timesteps_per_batch, shared_returns, plot_running))
    pp.start()

    # Run TRPO policy
    run_policy(network='mlp', 
          num_layers=2, # these are network_kwargs for the MLP network
          num_hidden=64,
          env=env, 
          total_timesteps=total_timesteps, #Originally 200,000
          timesteps_per_batch=timesteps_per_batch,
          callback=grid_test_callback,
          load_path=policy_path
          )

    # Safely terminate plotter process
    plot_running.value = 0  # shutdown plotting process
    time.sleep(2)
    pp.join()

    env.close()
Code Example #8
File: dxl_reacher.py  Project: williampma/SenseAct
def main():
    # use fixed random state
    rand_state = np.random.RandomState(1).get_state()
    np.random.set_state(rand_state)
    tf_set_seeds(np.random.randint(1, 2**31 - 1))

    # Create DXL Reacher1D environment
    env = DxlReacher1DEnv(setup='dxl_gripper_default',
                          idn=1,
                          baudrate=1000000,
                          obs_history=1,
                          dt=0.04,
                          gripper_dt=0.01,
                          rllab_box=False,
                          episode_length_step=None,
                          episode_length_time=2,
                          max_torque_mag=100,
                          control_type='torque',
                          target_type='position',
                          reset_type='zero',
                          reward_type='linear',
                          use_ctypes_driver=True,
                          random_state=rand_state)

    # The outputs of the policy function are sampled from a Gaussian. However, the actions in terms of torque
    # commands are in the range [-max_torque_mag, max_torque_mag]. The NormalizedEnv wrapper scales actions accordingly.
    # By default, it does not normalize observations or rewards.
    # (A small illustrative sketch of this rescaling follows this example.)
    env = NormalizedEnv(env)

    # Start environment processes
    env.start()

    # Create baselines TRPO policy function
    sess = U.single_threaded_session()
    sess.__enter__()

    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name,
                         ob_space=ob_space,
                         ac_space=ac_space,
                         hid_size=32,
                         num_hid_layers=2)

    # create and start plotting process
    plot_running = Value('i', 1)
    shared_returns = Manager().dict({
        "write_lock": False,
        "episodic_returns": [],
        "episodic_lengths": [],
    })
    # Plotting process
    pp = Process(target=plot_dxl_reacher,
                 args=(env, 2048, shared_returns, plot_running))
    pp.start()

    # Create callback function for logging data from baselines TRPO learn
    kindred_callback = create_callback(shared_returns)

    # Train baselines TRPO
    learn(
        env,
        policy_fn,
        max_timesteps=50000,
        timesteps_per_batch=2048,
        max_kl=0.05,
        cg_iters=10,
        cg_damping=0.1,
        vf_iters=5,
        vf_stepsize=0.001,
        gamma=0.995,
        lam=0.995,
        callback=kindred_callback,
    )

    # Safely terminate plotter process
    plot_running.value = 0  # shutdown plotting process
    time.sleep(2)
    pp.join()

    # Shutdown the environment
    env.close()
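
A quick aside on the comment above: the action rescaling performed by the NormalizedEnv wrapper can be illustrated with a tiny standalone sketch. This is only an illustration of the idea, not SenseAct's actual implementation; the [-1, 1] input range and the clipping are assumptions made for the sketch.

# Illustration only: affine rescaling of a policy action from [-1, 1] into the
# environment's own action bounds (e.g. [-max_torque_mag, max_torque_mag]).
import numpy as np

def rescale_action(action, low, high):
    """Map an action from [-1, 1] into [low, high], clipping to the bounds."""
    action = np.clip(action, -1.0, 1.0)
    return low + (action + 1.0) * 0.5 * (high - low)

# e.g. rescale_action(np.array([0.5]), -100.0, 100.0) -> array([50.])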
Code Example #9
def main():
    # optionally use a pretrained model
    load_model_data = None
    hidden_sizes = (32, 32)
    if len(sys.argv) > 1:
        load_model_path = sys.argv[1]
        load_model_data = pkl.load(open(load_model_path, 'rb'))
        hidden_sizes = load_model_data['hidden_sizes']

    # use fixed random state
    rand_state = np.random.RandomState(1).get_state()
    np.random.set_state(rand_state)
    tf_set_seeds(np.random.randint(1, 2**31 - 1))

    # Create the Create2 mover environment
    env = Create2MoverEnv(90,
                          port='/dev/ttyUSB0',
                          obs_history=1,
                          dt=0.15,
                          random_state=rand_state)
    env = NormalizedEnv(env)

    # Start environment processes
    env.start()

    # Create baselines TRPO policy function
    sess = U.single_threaded_session()
    sess.__enter__()

    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name,
                         ob_space=ob_space,
                         ac_space=ac_space,
                         hid_size=hidden_sizes[0],
                         num_hid_layers=len(hidden_sizes))

    # Create and start plotting process
    plot_running = Value('i', 1)
    shared_returns = Manager().dict({
        "write_lock": False,
        "episodic_returns": [],
        "episodic_lengths": [],
    })
    # Spawn plotting process
    pp = Process(target=plot_create2_mover,
                 args=(env, 2048, shared_returns, plot_running))
    pp.start()

    # Create callback function for logging data from baselines TRPO learn
    kindred_callback = create_callback(shared_returns, load_model_data)

    # Train baselines TRPO
    learn(env,
          policy_fn,
          max_timesteps=40000,
          timesteps_per_batch=2048,
          max_kl=0.05,
          cg_iters=10,
          cg_damping=0.1,
          vf_iters=5,
          vf_stepsize=0.001,
          gamma=0.995,
          lam=0.995,
          callback=kindred_callback)

    # Safely terminate plotter process
    plot_running.value = 0  # shutdown plotting process
    time.sleep(2)
    pp.join()

    env.close()
Code Example #10
class Environment(object):
    def __init__(self,
                 run_dir,
                 env_name,
                 alg='mairlImit',
                 train_mode=False,
                 obs_mode='pixel'):
        """

        :param run_dir:
        :param env_name:
        :param alg: 'mairlImit', 'mairlImit4Transfer', 'mairlTransfer', 'mgail'
        :param obs_mode: 'pixel', 'state'
        """
        self.run_dir = run_dir
        self.name = env_name
        self.alg = alg
        self.obs_mode = obs_mode
        assert self.alg in [
            'mairlImit', 'mairlImit4Transfer', 'mairlTransfer', 'mgail'
        ], '{} is not Implemented!'.format(self.alg)
        self.train_mode = train_mode
        if env_name in ['UR5_Reacher']:
            rand_state = np.random.RandomState(1).get_state()
            env = ReacherEnv(setup="UR5_6dof",
                             host="192.168.1.102",
                             dof=6,
                             control_type="velocity",
                             target_type="position",
                             reset_type="zero",
                             reward_type="precision",
                             derivative_type="none",
                             deriv_action_max=5,
                             first_deriv_max=2,
                             accel_max=1.4,
                             speed_max=0.3,
                             speedj_a=1.4,
                             episode_length_time=4.0,
                             episode_length_step=None,
                             actuation_sync_period=1,
                             dt=0.04,
                             run_mode="multiprocess",
                             rllab_box=False,
                             movej_t=2.0,
                             delay=0.0,
                             random_state=rand_state)
            self.gym = NormalizedEnv(env)
            self.gym.start()
        else:
            self.gym = gym.make(self.name)
        self.random_initialization = True
        self._connect()
        self._train_params()
        self.set_seed()

    def _step(self, action):
        action = np.squeeze(action)
        if action.shape == ():
            action = np.expand_dims(action, axis=0)
            # or use:  action = [action]
        self.t += 1
        if isinstance(self.gym.action_space, spaces.Discrete):
            action = int(action)
        result = self.gym.step(action)
        self.state, self.reward, self.done, self.info = result[:4]
        if self.obs_mode == 'pixel':
            self.state = cv2.resize(self.gym.render('rgb_array'),
                                    dsize=(64, 64),
                                    interpolation=cv2.INTER_AREA)
        if self.random_initialization:
            if hasattr(self.gym, 'env') and hasattr(self.gym.env, 'data'):
                self.qpos, self.qvel = self.gym.env.data.qpos.flatten(
                ), self.gym.env.data.qvel.flatten()
            else:
                self.qpos, self.qvel = [], []
            return np.float32(self.state), np.float32(
                self.reward), self.done, np.float32(self.qpos), np.float32(
                    self.qvel)
        else:
            return np.float32(self.state), np.float32(self.reward), self.done

    def step(self, action, mode):
        qvel, qpos = [], []
        if mode == 'tensorflow':
            if self.random_initialization:
                state, reward, done, qvel, qpos = tf.py_func(
                    self._step,
                    inp=[action],
                    Tout=[
                        tf.float32, tf.float32, tf.bool, tf.float32, tf.float32
                    ],
                    name='env_step_func')
            else:
                state, reward, done = tf.py_func(
                    self._step,
                    inp=[action],
                    Tout=[tf.float32, tf.float32, tf.bool],
                    name='env_step_func')

            state = tf.reshape(state, shape=self.state_size)
            done.set_shape(())
        else:
            if self.random_initialization:
                state, reward, done, qvel, qpos = self._step(action)
            else:
                state, reward, done = self._step(action)

        return state, reward, done, 0., qvel, qpos

    def reset(self, qpos=None, qvel=None):
        self.t = 0
        self.state = self.gym.reset()
        if self.obs_mode == 'pixel':
            self.state = cv2.resize(self.gym.render('rgb_array'),
                                    dsize=(64, 64),
                                    interpolation=cv2.INTER_CUBIC)
        if self.random_initialization and qpos is not None and qvel is not None and hasattr(
                self.gym, 'env') and hasattr(self.gym.env, 'set_state'):
            self.gym.env.set_state(qpos, qvel)
        return np.float32(self.state)

    def get_status(self):
        return self.done

    def get_state(self):
        return self.state

    def render(self, mode='human'):
        img = self.gym.render(mode=mode)
        return img

    def _connect(self):
        if self.obs_mode == 'pixel':
            self.state_size = (64, 64, 3)
        else:
            if isinstance(self.gym.observation_space, spaces.Box):
                self.state_size = self.gym.observation_space.shape
            else:
                self.state_size = (self.gym.observation_space.n, )
        if isinstance(self.gym.action_space, spaces.Box):
            self.action_size = self.gym.action_space.shape[0]
        else:
            self.action_size = self.gym.action_space.n
        self.action_space = np.asarray([None] * self.action_size)
        if hasattr(self.gym, 'env') and hasattr(self.gym.env, 'data'):
            self.qpos_size = self.gym.env.data.qpos.shape[0]
            self.qvel_size = self.gym.env.data.qvel.shape[0]
        else:
            self.qpos_size = 0
            self.qvel_size = 0

    def set_seed(self):
        tf.set_random_seed(self.seed)
        random.seed(self.seed)
        self.gym.seed(self.seed)
        np.random.seed(self.seed)

    def _train_params(self):
        self.seed = 0
        if self.name == 'Hopper-v2':
            self.expert_data = 'expert_trajectories/hopper_er.bin'
        elif self.name in [
                'Ant-v2', 'CartPole-v0', 'GridWorldGym-v0', 'HalfCheetah-v2',
                'Swimmer-v2', 'Pendulum-v0'
        ]:
            self.expert_data = 'expert_data/{}_expert_{}.bin'.format(
                self.obs_mode, self.name)
        elif self.name == 'PointMazeRight-v0':
            self.expert_data = 'expert_data/{}_expert_{}.bin'.format(
                self.obs_mode, 'PointMazeLeft-v0')
        elif self.name == 'DisabledAnt-v0':
            self.expert_data = 'expert_data/{}_expert_{}.bin'.format(
                self.obs_mode, 'CustomAnt-v0')
        elif self.name in ['PointMazeLeft-v0', 'CustomAnt-v0']:
            self.expert_data = 'packages/gail_expert/{}_expert_{}.bin'.format(
                self.obs_mode, self.name)
        elif self.name in ['UR5_Reacher']:
            self.expert_data = 'packages/gail_expert/{}_expert_{}.bin'.format(
                self.obs_mode, self.name)
        else:
            raise NotImplementedError('Env {} is not implemented.'.format(
                self.name))

        if not self.train_mode:
            self.trained_model = 'snapshots/20200705225434_Ant-v2_train_mairlImit_s_100/2020-07-06-07-20-175000.sn'
            # Test episode number: self.n_train_iters / self.test_interval * self.n_episodes_test
            self.n_train_iters = 1
            self.test_interval = 1
            self.n_episodes_test = 10
        else:
            if self.alg == 'mairlTransfer':
                self.trained_model = 'snapshots/20200804190406_PointMazeLeft-v0_train_mairlImit4Transfer_s_10_False_False_False/2020-08-05-11-01-720000.sn'
            else:
                self.trained_model = None
            self.n_train_iters = 1000000
            self.test_interval = 1000
            self.n_episodes_test = 1

        if self.name in ['GridWorldGym-v0']:
            self.n_steps_test = self.gym.spec.max_episode_steps  # 20
        else:
            self.n_steps_test = 1000
        self.vis_flag = False
        self.save_models = True
        if self.name in ['GridWorldGym-v0', 'MountainCar-v0', 'CartPole-v0']:
            self.continuous_actions = False
        else:
            self.continuous_actions = True
        self.airl_entropy_weight = 1.0
        if self.alg in ['mairlImit4Transfer', 'mairlTransfer']:
            self.use_airl = True
            self.disc_out_dim = 1
            self.phi_size = None  # [200, 100]
            self.forward_model_type = 'gru'
            self.state_only = True  # False
        elif self.alg in ['mairlImit']:
            self.use_airl = True
            self.disc_out_dim = 1
            self.phi_size = None  # [200, 100]
            self.forward_model_type = 'transformer'  # 'transformer'  # 'gru'
            self.state_only = False
        else:
            self.use_airl = False
            self.disc_out_dim = 2
            self.phi_size = None  # [200, 100]
            self.forward_model_type = 'gru'
            self.state_only = False

        # Main parameters to play with:
        self.er_agent_size = 50000
        self.collect_experience_interval = 15
        self.n_steps_train = 10
        if self.state_only:
            if self.name in ['PointMazeLeft-v0', 'CustomAnt-v0']:
                self.discr_policy_itrvl = 10
            else:
                self.discr_policy_itrvl = 100
            self.prep_time = 0
            self.save_best_ckpt = False
        else:
            self.discr_policy_itrvl = 100
            self.prep_time = 1000
            self.save_best_ckpt = True
        if self.forward_model_type == 'transformer':
            self.use_scale_dot_product = True
            self.use_skip_connection = True
            self.use_dropout = False
        else:
            self.use_scale_dot_product = False
            self.use_skip_connection = False
            self.use_dropout = False
        self.gamma = 0.99
        self.batch_size = 512  # 70
        self.weight_decay = 1e-7
        self.policy_al_w = 1e-2
        self.policy_tr_w = 1e-4
        self.policy_accum_steps = 7
        self.total_trans_err_allowed = 1000
        self.temp = 1.
        self.cost_sensitive_weight = 0.8
        self.noise_intensity = 6.
        self.do_keep_prob = 0.75
        self.forward_model_lambda = 0.  # 0.1

        # Hidden layers size
        self.fm_size = 100
        self.d_size = [200, 100]
        self.p_size = [100, 50]
        self.encoder_feat_size = 1024  # (30,)

        # Learning rates
        self.fm_lr = 1e-4
        self.d_lr = 1e-3
        self.p_lr = 1e-4

        # Log
        self.exp_name = '{}_{}_{}_{}_{}_{}_{}_{}_{}'.format(
            time.strftime("%Y%m%d%H%M%S", time.localtime()), self.name,
            'train' if self.train_mode else 'eval', self.alg,
            's' if self.state_only else 'sa', self.discr_policy_itrvl,
            self.use_scale_dot_product, self.use_skip_connection,
            self.use_dropout)
        self.config_dir = os.path.join(self.run_dir, 'snapshots',
                                       self.exp_name)
        self.log_intervel = 100
        self.save_video = True

        if not os.path.isdir(self.config_dir):
            os.makedirs(self.config_dir)

        with open(os.path.join(self.config_dir, 'log.txt'), 'a') as f:
            f.write("{0}: {1}\n".format('seed', self.seed))
            f.write("{0}: {1}\n".format('name', self.name))
            f.write("{0}: {1}\n".format('expert_data', self.expert_data))
            f.write("{0}: {1}\n".format('train_mode', self.train_mode))
            f.write("{0}: {1}\n".format('trained_model', self.trained_model))
            f.write("{0}: {1}\n".format('n_train_iters', self.n_train_iters))
            f.write("{0}: {1}\n".format('test_interval', self.test_interval))
            f.write("{0}: {1}\n".format('n_episodes_test',
                                        self.n_episodes_test))
            f.write("{0}: {1}\n".format('alg', self.alg))
            f.write("{0}: {1}\n".format('n_steps_test', self.n_steps_test))
            f.write("{0}: {1}\n".format('vis_flag', self.vis_flag))
            f.write("{0}: {1}\n".format('save_models', self.save_models))

            f.write("{0}: {1}\n".format('continuous_actions',
                                        self.continuous_actions))
            f.write("{0}: {1}\n".format('airl_entropy_weight',
                                        self.airl_entropy_weight))
            f.write("{0}: {1}\n".format('use_airl', self.use_airl))
            f.write("{0}: {1}\n".format('disc_out_dim', self.disc_out_dim))
            f.write("{0}: {1}\n".format('phi_size', self.phi_size))
            f.write("{0}: {1}\n".format('forward_model_type',
                                        self.forward_model_type))
            f.write("{0}: {1}\n".format('state_only', self.state_only))
            f.write("{0}: {1}\n".format('er_agent_size', self.er_agent_size))
            f.write("{0}: {1}\n".format('collect_experience_interval',
                                        self.collect_experience_interval))
            f.write("{0}: {1}\n".format('n_steps_train', self.n_steps_train))
            f.write("{0}: {1}\n".format('discr_policy_itrvl',
                                        self.discr_policy_itrvl))
            f.write("{0}: {1}\n".format('prep_time', self.prep_time))

            f.write("{0}: {1}\n".format('gamma', self.gamma))
            f.write("{0}: {1}\n".format('batch_size', self.batch_size))
            f.write("{0}: {1}\n".format('weight_decay', self.weight_decay))
            f.write("{0}: {1}\n".format('policy_al_w', self.policy_al_w))
            f.write("{0}: {1}\n".format('policy_tr_w', self.policy_tr_w))
            f.write("{0}: {1}\n".format('policy_accum_steps',
                                        self.policy_accum_steps))
            f.write("{0}: {1}\n".format('total_trans_err_allowed',
                                        self.total_trans_err_allowed))
            f.write("{0}: {1}\n".format('temp', self.temp))
            f.write("{0}: {1}\n".format('cost_sensitive_weight',
                                        self.cost_sensitive_weight))
            f.write("{0}: {1}\n".format('noise_intensity',
                                        self.noise_intensity))
            f.write("{0}: {1}\n".format('do_keep_prob', self.do_keep_prob))
            f.write("{0}: {1}\n".format('forward_model_lambda',
                                        self.forward_model_lambda))

            f.write("{0}: {1}\n".format('fm_size', self.fm_size))
            f.write("{0}: {1}\n".format('d_size', self.d_size))
            f.write("{0}: {1}\n".format('p_size', self.p_size))
            f.write("{0}: {1}\n".format('fm_lr', self.fm_lr))
            f.write("{0}: {1}\n".format('d_lr', self.d_lr))
            f.write("{0}: {1}\n".format('p_lr', self.p_lr))
            f.write("{0}: {1}\n".format('exp_name', self.exp_name))
            f.write("{0}: {1}\n".format('config_dir', self.config_dir))
            f.write("{0}: {1}\n".format('log_intervel', self.log_intervel))
            f.write("{0}: {1}\n".format('save_video', self.save_video))
            f.write("{0}: {1}\n".format('save_best_ckpt', self.save_best_ckpt))
            f.write("{0}: {1}\n".format('obs_mode', self.obs_mode))
            f.write("{0}: {1}\n".format('use_scale_dot_product',
                                        self.use_scale_dot_product))
            f.write("{0}: {1}\n".format('use_skip_connection',
                                        self.use_skip_connection))
            f.write("{0}: {1}\n".format('use_dropout', self.use_dropout))
Code Example #11
def main():
    # use fixed random state
    rand_state = np.random.RandomState(1).get_state()
    np.random.set_state(rand_state)

    # Create UR10 Reacher2D environment
    env = ReacherEnvWithRealSense(setup="UR10_default",
                                  camera_hosts=('localhost', ),
                                  camera_ports=(5000, ),
                                  camera_res=(3, 480, 640),
                                  host=None,
                                  dof=2,
                                  control_type="velocity",
                                  target_type="position",
                                  reset_type="zero",
                                  reward_type="precision",
                                  derivative_type="none",
                                  deriv_action_max=5,
                                  first_deriv_max=2,
                                  accel_max=1.4,
                                  speed_max=1.0,
                                  speedj_a=1.4,
                                  episode_length_time=4.0,
                                  episode_length_step=None,
                                  actuation_sync_period=1,
                                  dt=0.5,
                                  run_mode="multiprocess",
                                  rllab_box=False,
                                  movej_t=2.0,
                                  delay=0.0,
                                  random_state=rand_state)
    env = NormalizedEnv(env)
    # Start environment processes
    env.start()

    # Create and start plotting process
    plot_running = Value('i', 1)
    shared_returns = Manager().dict({
        "write_lock": False,
        "episodic_returns": [],
        "episodic_lengths": [],
    })
    # Spawn plotting process
    pp = Process(target=plot_ur10_reacher,
                 args=(env, 2048, shared_returns, plot_running))
    pp.start()

    # Create callback function for logging data from baselines TRPO learn
    kindred_callback = create_callback(shared_returns)

    # env.action_space.shape
    # Roll out a few short episodes with random actions (no training here)
    for episode in range(10):
        print(f"Episode: {episode + 1}")
        done = False
        timestep = 0
        curr_obs = env.reset()
        while not done:
            if timestep % 3 == 0:
                action = np.random.normal(scale=0.1, size=(2, ))
            print(action)
            next_obs, reward, done, _ = env.step(action)

            timestep += 1
            curr_obs = next_obs

            if timestep == 15:
                done = True

    # Safely terminate plotter process
    plot_running.value = 0  # shutdown plotting process
    time.sleep(2)
    pp.join()

    env.close()
Code Example #12
def main():
    # use fixed random state
    rand_state = np.random.RandomState(1).get_state()
    np.random.set_state(rand_state)
    tf_set_seeds(np.random.randint(1, 2**31 - 1))

    # Create Sawyer reach environment
    env = SawyerReachXYZEnv(target_goal=(0, 0, 0),
                            indicator_threshold=.05,
                            reward_type='hand_distance',
                            action_mode='torque',
                            use_safety_box=True,
                            torque_action_scale=1)

    env = NormalizedEnv(env)
    # Start environment processes
    env.start()
    # Create baselines TRPO policy function
    sess = U.single_threaded_session()
    sess.__enter__()

    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name,
                         ob_space=ob_space,
                         ac_space=ac_space,
                         hid_size=64,
                         num_hid_layers=2)

    # Create and start plotting process
    plot_running = Value('i', 1)
    shared_returns = Manager().dict({
        "write_lock": False,
        "episodic_returns": [],
        "episodic_lengths": [],
    })
    # Spawn plotting process
    pp = Process(target=plot_ur5_reacher,
                 args=(env, 2048, shared_returns, plot_running))
    pp.start()

    # Create callback function for logging data from baselines TRPO learn
    kindred_callback = create_callback(shared_returns)

    # Train baselines TRPO
    learn(env,
          policy_fn,
          max_timesteps=200000,
          timesteps_per_batch=2048,
          max_kl=0.05,
          cg_iters=10,
          cg_damping=0.1,
          vf_iters=5,
          vf_stepsize=0.001,
          gamma=0.995,
          lam=0.995,
          callback=kindred_callback)

    # Safely terminate plotter process
    plot_running.value = 0  # shutdown plotting process
    time.sleep(2)
    pp.join()

    env.close()
Code Example #13
def main():
    # optionally use a pretrained model
    save_model_path = None
    load_model_path = None
    load_trained_model = False
    hidden_sizes = (64, 64, 64)

    if len(sys.argv) > 2:# load model
        load_trained_model = True

    save_model_path = sys.argv[1] # saved/uniform/X/Y/Z/
    os.makedirs(save_model_path, exist_ok=True)
    run_dirs = os.listdir(save_model_path)
    os.makedirs(save_model_path+'run_'+str(len(run_dirs)+1), exist_ok=True)
    os.makedirs(save_model_path+'run_'+str(len(run_dirs)+1)+'/models', exist_ok=True)
    os.makedirs(save_model_path+'run_'+str(len(run_dirs)+1)+'/data', exist_ok=True)
    save_model_basepath = save_model_path+'run_'+str(len(run_dirs)+1)+'/'

    if load_trained_model:# loading true
        load_model_path = sys.argv[2] # saved/uniform/X/Y/Z/run_1/model*

    # use fixed random state
    #rand_state = np.random.RandomState(1).get_state()
    #np.random.set_state(rand_state)
    tf_set_seeds(np.random.randint(1, 2**31 - 1))

    # Create the Create2 docker environment
    # distro = np.array([0, 1, 0, 0, 0, 0, 0, 0, 0])
    # distro = np.array([0.575, 0.425, 0, 0, 0, 0, 0, 0, 0])
    # distro = np.array([0.25, 0.2, 0.55, 0, 0, 0, 0, 0, 0])
    # distro = np.array([0.1, 0.1, 0.25, 0.55, 0, 0, 0, 0, 0])
    # FAILED: distro = np.array([0.05, 0.05, 0.15, 0.275, 0, 0, 0, 0.475, 0])
    # FAILED: distro = np.array([0.05, 0.05, 0.15, 0.275, 0, 0.475, 0, 0, 0])
    # FAILED: distro = np.array([0.10, 0.05, 0.10, 0.35, 0, 0.4, 0, 0, 0])
    #distro = np.array([0.10, 0.05, 0.10, 0.375, 0.375, 0, 0, 0, 0]) #run 1
    # distro = np.array([0.06, 0.03, 0.06, 0.425, 0.425, 0, 0, 0, 0]) # run2
    # distro = np.array([0.025, 0.05, 0.05, 0.25, 0.25, 0.375, 0, 0, 0]) # part 1, first 100 episodes
    # OK: distro = np.array([0.05, 0.025, 0.05, 0.225, 0.225, 0.425, 0, 0, 0])
    #distro = np.array([0.025, 0.02, 0.025, 0.1375, 0.1375, 0.3275, 0.3275, 0, 0])
    # FAILED: distro = np.array([0.015, 0.015, 0.02, 0.06, 0.09, 0.35, 0.45, 0, 0])
    distro = np.array([0, 1, 0, 0, 0, 0, 0, 0, 0]) 

    env = Create2DockerEnv(30, distro,
                           port='/dev/ttyUSB0', ir_window=20,
                           ir_history=1,
                           obs_history=1, dt=0.045)
                           #random_state=rand_state)
    env = NormalizedEnv(env)

    # Start environment processes
    env.start()

    # Create baselines TRPO policy function
    sess = U.single_threaded_session()
    sess.__enter__()
    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
            hid_size=hidden_sizes[0], num_hid_layers=len(hidden_sizes))

    # Create and start plotting process
    #plot_running = Value('i', 1)
    shared_returns = Manager().dict({
        "write_lock": False,
        "episodic_returns": [],
        "episodic_lengths": [],
        "episodic_ss": [],
    })
    # Spawn plotting process
    #pp = Process(target=plot_create2_docker, args=(env, 2048, shared_returns, plot_running))
    #pp.start()

    # Create callback function for logging data from baselines TRPO learn
    kindred_callback = create_callback(shared_returns, save_model_basepath, load_model_path)

    # Train baselines PPO
    model = learn(
        env,
        policy_fn,
        max_timesteps=100000,
        timesteps_per_actorbatch=675,#512
        clip_param=0.2,
        entcoeff=0.0,
        optim_epochs=10,
        optim_stepsize=0.00005,
        optim_batchsize=16,
        gamma=0.96836,
        lam=0.99944,
        schedule="linear",
        callback=kindred_callback,
    )

    # Safely terminate plotter process
    #plot_running.value = 0  # shutdown plotting process
    #time.sleep(2)
    #pp.join()

    env.close()
Code Example #14
def main(cycle_time, idn, baud, port_str, batch_size, mini_batch_div,
         epoch_count, gamma, l, max_action, outdir, ep_time, index):
    """
    :param cycle_time: sense-act cycle time
    :param idn: dynamixel motor id
    :param baud: dynamixel baud
    :param batch_size: How many samples to record for each learning update
    :param mini_batch_div: How many mini-batches each batch is divided into
    :param epoch_count: Number of epochs to train each batch on. Is this the number of mini-batches?
    :param gamma: Usual discount value
    :param l: lambda value for lambda returns.


    In the original paper PPO runs N agents each collecting T samples.
    I need to think about how environment resets are going to work. To calculate things correctly we'd technically
    need to run out the episodes to termination. How should we handle termination? We might want to have a max number
    of steps. In our setting we're going to be following a sine wave - I don't see any need to terminate then. So we
    don't need to run this in an episodic fashion, we'll do a continuing task. We'll collect a total of T samples and
    then do an update. I think I will implement the environment as a gym environment just to permit some
    interoperability. If there was an env that had a terminal then we would just track that terminal and reset the env
    and carry on collecting. Hmmm, actually I'm not sure how to think about this as a gym env. So SenseAct uses this
    RTRLBaseEnv, but I'm not sure I want to do that.

    So the changes listed from REINFORCE:
    1. Drop γ^t from the update, but not from G_t
    2. Batch Updates
    3. Multiple Epochs over the same batch
    4. Mini-batch updates
    5. Surrogate objective: - π_θ/π_θ_{old} * G_t
    6. Add Baseline
    7. Use λ-return: can use the real lambda returns or use generalized advantage estimation like they do in the paper.
    8. Normalize the advantage estimates: H = G^λ - v
    9. Proximity constraint:
        ρ = π_θ/π_θ_{old}
        objective:
        -min[ρH, clip(ρ, 1-ε, 1+ε)H]
       (a compact sketch of this clipped objective is given after this example)

    Also, there is the value function loss and there is an entropy bonus given.

    """
    #set low latency for usb-serial communications
    #    bashCommand = "setserial /dev/ttyUSB0 low_latency"
    #    process = subprocess.Popen(bashCommand.split(), stdout=subprocess.PIPE)
    #    output, error = process.communicate()
    #bashCommand = "cat /sys/bus/usb-serial/devices/ttyUSB0/latency_timer"
    #process = subprocess.Popen(bashCommand.split(), stdout=subprocess.PIPE)
    #output, error = process.communicate()
    #    print(output)

    tag = f"{time.time()}"
    summaries_dir = f"./summaries/{tag}"
    returns_dir = "./returns"
    networks_dir = "./networks"
    if outdir:
        summaries_dir = os.path.join(outdir, f"summaries/{tag}")
        returns_dir = os.path.join(outdir, "returns")
        networks_dir = os.path.join(outdir, "networks")

    os.makedirs(summaries_dir, exist_ok=True)
    os.makedirs(returns_dir, exist_ok=True)
    os.makedirs(networks_dir, exist_ok=True)

    summary_writer = SummaryWriter(log_dir=summaries_dir)

    #env = ReacherEnv(cycle_time, ep_time, dxl.get_driver(False), idn, port_str, baud, max_action,'tourq')
    # env = ReacherEnv(setup='UR5_default',
    #                 host='129.128.159.210',
    #                 dof=2,
    #                 control_type='position',
    #                 derivative_type='none',
    #                 target_type='position',
    #                 reset_type='random',
    #                 reward_type='linear',
    #                 deriv_action_max=10,
    #                 first_deriv_max=10,
    #                 vel_penalty=0,
    #                 obs_history=1,
    #                 actuation_sync_period=1,
    #                 episode_length_time=4.0,
    #                 episode_length_step=None,
    #                 rllab_box = False,
    #                 servoj_t=ur_utils.COMMANDS['SERVOJ']['default']['t'],
    #                 servoj_gain=ur_utils.COMMANDS['SERVOJ']['default']['gain'],
    #                 speedj_a=ur_utils.COMMANDS['SPEEDJ']['default']['a'],
    #                 speedj_t_min=ur_utils.COMMANDS['SPEEDJ']['default']['t_min'],
    #                 movej_t=2,
    #                 accel_max=None,
    #                 speed_max=None,
    #                 dt=0.008,
    #                 delay=0.0)

    #DO WE NEED RANDOM STATE Variable??
    rand_state = np.random.RandomState(1).get_state()
    host_ip = '169.254.39.68'

    env = ReacherEnv(setup="UR5_default",
                     host=host_ip,
                     dof=2,
                     control_type="velocity",
                     target_type="position",
                     reset_type="zero",
                     reward_type="precision",
                     derivative_type="none",
                     deriv_action_max=5,
                     first_deriv_max=2,
                     accel_max=1.4,
                     speed_max=0.3,
                     speedj_a=1.4,
                     episode_length_time=4.0,
                     episode_length_step=None,
                     actuation_sync_period=1,
                     dt=0.04,
                     run_mode="singlethread",
                     rllab_box=False,
                     movej_t=2.0,
                     delay=0.0,
                     random_state=rand_state)
    #print('done')
    env = NormalizedEnv(env)
    env.start()
    #print("starting")
    #    obs = env.reset()
    #    print('resetted', obs)
    #    env.step(action=np.array([0,0])
    #    print('a')
    #    time.sleep(10)
    #env = gym.make('MountainCarContinuous-v0')
    obs_len = env.observation_space.shape[0]
    print(env.action_space.shape)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    ppo_network = PPONetwork(
        action_space=env.action_space,
        in_size=env.observation_space.shape[0])  # TODO: create your network
    ppo_network.to(device)

    # instantiate value_network
    value_network = nn.Sequential(nn.Linear(obs_len, 50), nn.Sigmoid(),
                                  nn.Linear(50, 1))
    value_network.to(device)

    # instantiate the agent
    agent = PPO(device=device,
                network=ppo_network,
                state_size=obs_len,
                batch_size=batch_size,
                mini_batch_div=mini_batch_div,
                epoch_count=epoch_count,
                gamma=gamma,
                l=l,
                eps=0.2,
                summary_writer=summary_writer,
                value_network=value_network)
    # TODO: implement your main loop here. You will want to collect batches of transitions
    #

    # total number of timesteps
    t = 0
    #total_steps = 1000
    #timestep_per_episode = 200
    n_batch = 36
    undiscounted_return = np.zeros((n_batch, batch_size))
    # do learning for a number of total timesteps
    for b in range(n_batch):  #total_steps // timestep_per_episode):
        print(b)

        # gather batch of episodes
        for ep in range(batch_size):
            # reset the env before each episode
            observation = env.reset()
            reward = 0
            n = 0

            # gather one episode
            while (True):
                #if b > 90 :
                #    env.render()
                action = agent.step(state=observation, r=reward, t=n)
                action = action * max_action
                observation, reward, done, info = env.step(
                    action)  # take the action
                #print(observation)
                undiscounted_return[b,
                                    ep] = undiscounted_return[b, ep] + reward
                #print(action,reward)
                t = t + 1
                n = n + 1
                if done:
                    break
            # end of one episode

        # learning using batch of data
        summary_writer.add_scalar('return', np.mean(undiscounted_return[b, :]),
                                  2048 * b)
        #        env.stop()
        agent.learn(t=t)
        agent.reset_buffers()
        t = 0

    env.close()
    # plotting results
    undiscounted_return_avg = np.mean(undiscounted_return, axis=1)
    np.save('ep_returns_{}'.format(index), undiscounted_return_avg)
    plt.plot(undiscounted_return_avg)
    plt.show()
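
Since the docstring in this example enumerates the PPO ingredients (probability ratio, clipping, λ-returns, normalized advantages), a compact sketch of the clipped surrogate objective may help. This is a generic PyTorch illustration of the textbook formula, not the loss implemented inside the PPO/PPONetwork classes used above.

# Generic illustration of the clipped PPO objective from the docstring:
#   rho  = pi_theta(a|s) / pi_theta_old(a|s)
#   loss = -E[ min(rho * H, clip(rho, 1 - eps, 1 + eps) * H) ]
# where H is a normalized advantage estimate such as G^lambda - v(s).
import torch

def clipped_surrogate_loss(log_probs, old_log_probs, advantages, eps=0.2):
    # Normalize the advantage estimates (step 8 in the docstring).
    advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)
    ratio = torch.exp(log_probs - old_log_probs)            # rho = pi / pi_old
    unclipped = ratio * advantages
    clipped = torch.clamp(ratio, 1.0 - eps, 1.0 + eps) * advantages
    return -torch.min(unclipped, clipped).mean()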
Code Example #15
File: ur.py  Project: ghx-0228/SenseActExperiments
def get_env(cfg):
    # Set seed to potentially fix random state
    seed_low  = cfg['global']['seed']['low'] 
    seed_high = cfg['global']['seed']['high']
    if seed_low is not None:
        logging.debug('Using seed [{},{}) "half-open" interval'.format(seed_low, seed_high))
        rand_state = np.random.RandomState(seed_low).get_state()
        np.random.set_state(rand_state)
        tf_set_seeds(np.random.randint(seed_low, seed_high))
    else:
        logging.debug('Not using any seeds!')

    # Load the RL Environment
    env_module = importlib.import_module(cfg['environment']['codebase']['module'])
    env_class = getattr(env_module, cfg['environment']['codebase']['class'])
    logging.debug("Environment function: {}".format(env_class))

    logging.debug("Host IP: {}".format(cfg['environment']['setup']['host']))

    # Create UR5 Reacher2D environment
    env = env_class(
            setup                 = cfg['environment']['setup'],
            host                  = cfg['environment']['setup']['host'],
            dof                   = cfg['environment']['parameters']['dof'],
            control_type          = cfg['environment']['parameters']['control_type'],
            target_type           = cfg['environment']['parameters']['target_type'],
            reset_type            = cfg['environment']['parameters']['reset_type'],
            reward_type           = cfg['environment']['parameters']['reward_type'],
            derivative_type       = cfg['environment']['parameters']['derivative_type'],
            deriv_action_max      = cfg['environment']['parameters']['deriv_action_max'],
            first_deriv_max       = cfg['environment']['parameters']['first_deriv_max'],
            accel_max             = cfg['environment']['parameters']['accel_max'],
            speed_max             = cfg['environment']['parameters']['speed_max'],
            speedj_a              = cfg['environment']['parameters']['speedj_a'],
            episode_length_time   = cfg['environment']['parameters']['episode_length_time'],
            episode_length_step   = cfg['environment']['parameters']['episode_length_step'],
            actuation_sync_period = cfg['environment']['parameters']['actuation_sync_period'],
            dt                    = cfg['environment']['parameters']['dt'],
            run_mode              = cfg['environment']['parameters']['run_mode'],
            rllab_box             = cfg['environment']['parameters']['rllab_box'],
            movej_t               = cfg['environment']['parameters']['movej_t'],
            delay                 = cfg['environment']['parameters']['delay'],
            random_state          = rand_state if seed_low is not None else None
        )
    env = NormalizedEnv(env)

    # Start environment processes
    env.start()

    # Create baselines TRPO policy function
    sess = U.single_threaded_session()
    sess.__enter__()
    policy_fn_module = importlib.import_module(cfg['model']['module'])
    policy_fn_class = getattr(policy_fn_module, cfg['model']['class'])
    logging.debug("Policy function: {}".format(policy_fn_class))

    def policy_fn(name, ob_space, ac_space):
        return policy_fn_class(name           = name,
                               ob_space       = ob_space,
                               ac_space       = ac_space,
                               hid_size       = cfg['algorithm']['hyperparameters']['hid_size'],
                               num_hid_layers = cfg['algorithm']['hyperparameters']['num_hid_layers'])
    
    return env, policy_fn
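
For readers wiring this up, a hypothetical minimal cfg dict that satisfies the lookups in get_env() might look like the sketch below. The concrete values (module paths, host address, hyperparameters) are placeholders rather than the project's actual YAML; note that get_env() passes cfg['environment']['setup'] through as the setup argument and also reads ['host'] from it.

# Hypothetical config skeleton matching the keys read by get_env() above.
cfg = {
    'global': {'seed': {'low': 1, 'high': 2**31 - 1}},
    'environment': {
        'codebase': {'module': 'senseact.envs.ur.reacher_env',  # placeholder module path
                     'class': 'ReacherEnv'},
        'setup': {'host': '192.168.1.100'},  # passed whole as `setup`; ['host'] read from it
        'parameters': {
            'dof': 2, 'control_type': 'velocity', 'target_type': 'position',
            'reset_type': 'zero', 'reward_type': 'precision', 'derivative_type': 'none',
            'deriv_action_max': 5, 'first_deriv_max': 2, 'accel_max': 1.4,
            'speed_max': 0.3, 'speedj_a': 1.4, 'episode_length_time': 4.0,
            'episode_length_step': None, 'actuation_sync_period': 1, 'dt': 0.04,
            'run_mode': 'multiprocess', 'rllab_box': False, 'movej_t': 2.0, 'delay': 0.0,
        },
    },
    'model': {'module': 'baselines.ppo1.mlp_policy', 'class': 'MlpPolicy'},
    'algorithm': {'hyperparameters': {'hid_size': 64, 'num_hid_layers': 2}},
}

env, policy_fn = get_env(cfg)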