Example #1
def train(env_id, num_timesteps, seed):
    from mpi4py import MPI
    import mujoco_py
    from baselines.ppo1 import mlp_policy, pposgd_simple
    from baselines.common import set_global_seeds
    from baselines.common.cmd_util import make_robotics_env
    import baselines.common.tf_util as U
    rank = MPI.COMM_WORLD.Get_rank()
    sess = U.single_threaded_session()
    sess.__enter__()
    mujoco_py.ignore_mujoco_warnings().__enter__()
    workerseed = seed + 10000 * rank
    set_global_seeds(workerseed)
    env = make_robotics_env(env_id, workerseed, rank=rank)

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name,
                                    ob_space=ob_space,
                                    ac_space=ac_space,
                                    hid_size=256,
                                    num_hid_layers=3)

    pposgd_simple.learn(
        env,
        policy_fn,
        max_timesteps=num_timesteps,
        timesteps_per_actorbatch=2048,
        clip_param=0.2,
        entcoeff=0.0,
        optim_epochs=5,
        optim_stepsize=3e-4,
        optim_batchsize=256,
        gamma=0.99,
        lam=0.95,
        schedule='linear',
    )
    env.close()
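Calling `__enter__()` by hand, as above, leaves the TensorFlow session and the MuJoCo warning filter active for the rest of the process. Below is a minimal sketch, assuming the same baselines helpers, of the equivalent explicitly scoped form (the `with mujoco_py.ignore_mujoco_warnings():` pattern used in later examples on this page); `train_scoped` is a hypothetical name.

def train_scoped(env_id, num_timesteps, seed):
    # Hypothetical variant of train() above: identical setup, but the session
    # and the warning filter are released when the `with` block exits.
    from mpi4py import MPI
    import mujoco_py
    import baselines.common.tf_util as U
    from baselines.common import set_global_seeds
    from baselines.common.cmd_util import make_robotics_env

    rank = MPI.COMM_WORLD.Get_rank()
    with U.single_threaded_session(), mujoco_py.ignore_mujoco_warnings():
        workerseed = seed + 10000 * rank
        set_global_seeds(workerseed)
        env = make_robotics_env(env_id, workerseed, rank=rank)
        # ... define policy_fn and call pposgd_simple.learn() exactly as above ...
        env.close()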
Example #2
import pytest
from mujoco_py import MjSim, load_model_from_xml, ignore_mujoco_warnings


def test_ignore_mujoco_warnings():
    # Two free bodies resting on a plane need more than one contact,
    # but nconmax is set to 1, which triggers a MuJoCo warning.
    xml = '''
    <mujoco>
      <size nconmax="1"/>
      <worldbody>
        <geom type="plane" size="1 1 0.1"/>
        <body pos="1 0 1"> <joint type="free"/> <geom size="1"/> </body>
        <body pos="0 1 1"> <joint type="free"/> <geom size="1"/> </body>
      </worldbody>
    </mujoco>
    '''
    model = load_model_from_xml(xml)
    sim = MjSim(model)

    sim.reset()
    with ignore_mujoco_warnings():
        # This should raise an exception due to the mujoco warning callback,
        # but it's suppressed by the context manager.
        sim.step()

    sim.reset()
    with pytest.raises(Exception):
        # test to make sure previous warning callback restored.
        sim.step()
Example #3
from mpi4py import MPI
import mujoco_py
from stable_baselines.common import set_global_seeds
from stable_baselines.common.cmd_util import make_robotics_env
from stable_baselines.ppo1 import mlp_policy, pposgd_simple


def train(env_id, num_timesteps, seed):
    """
    Train PPO1 model for Robotics environment, for testing purposes

    :param env_id: (str) Environment ID
    :param num_timesteps: (int) The total number of samples
    :param seed: (int) The initial seed for training
    """

    rank = MPI.COMM_WORLD.Get_rank()
    with mujoco_py.ignore_mujoco_warnings():
        workerseed = seed + 10000 * rank
        set_global_seeds(workerseed)
        env = make_robotics_env(env_id, workerseed, rank=rank)

        def policy_fn(name, ob_space, ac_space, sess=None, placeholders=None):
            return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space, hid_size=256, num_hid_layers=3,
                                        sess=sess, placeholders=placeholders)

        pposgd_simple.learn(env, policy_fn,
                            max_timesteps=num_timesteps,
                            timesteps_per_actorbatch=2048,
                            clip_param=0.2, entcoeff=0.0,
                            optim_epochs=5, optim_stepsize=3e-4, optim_batchsize=256,
                            gamma=0.99, lam=0.95, schedule='linear')
        env.close()
Example #4
from mpi4py import MPI
import mujoco_py
from stable_baselines import PPO1
from stable_baselines.common import set_global_seeds
from stable_baselines.common.cmd_util import make_robotics_env
from stable_baselines.common.policies import MlpPolicy


def train(env_id, num_timesteps, seed):
    """
    Train PPO1 model for Robotics environment, for testing purposes

    :param env_id: (str) Environment ID
    :param num_timesteps: (int) The total number of samples
    :param seed: (int) The initial seed for training
    """

    rank = MPI.COMM_WORLD.Get_rank()
    with mujoco_py.ignore_mujoco_warnings():
        workerseed = seed + 10000 * rank
        set_global_seeds(workerseed)
        env = make_robotics_env(env_id, workerseed, rank=rank)
        print(env.observation_space)

        tblog = "/cvgl2/u/surajn/workspace/tb_logs/ppo1_fetchreach/"
        model = PPO1(MlpPolicy,
                     env,
                     timesteps_per_actorbatch=2048,
                     clip_param=0.2,
                     entcoeff=0.0,
                     optim_epochs=5,
                     optim_stepsize=3e-4,
                     optim_batchsize=256,
                     gamma=0.99,
                     lam=0.95,
                     schedule='linear',
                     tensorboard_log=tblog,
                     verbose=1)
        model.learn(total_timesteps=num_timesteps)
        env.close()
Example #5
import numpy as np
from numpy.testing import assert_allclose
from mujoco_py import ignore_mujoco_warnings

# make_env() and on_palm() are project-local test helpers (not shown here).


def test_locked_cube():
    env = make_env(starting_seed=0)

    is_on_palm = []
    for idx in range(20):
        env.reset()

        expected_joints = (
            "cube:cube_tx",
            "cube:cube_ty",
            "cube:cube_tz",
            "cube:cube_rot",
            "target:cube_tx",
            "target:cube_ty",
            "target:cube_tz",
            "target:cube_rot",
            "robot0:WRJ1",
            "robot0:WRJ0",
            "robot0:FFJ3",
            "robot0:FFJ2",
            "robot0:FFJ1",
            "robot0:FFJ0",
            "robot0:MFJ3",
            "robot0:MFJ2",
            "robot0:MFJ1",
            "robot0:MFJ0",
            "robot0:RFJ3",
            "robot0:RFJ2",
            "robot0:RFJ1",
            "robot0:RFJ0",
            "robot0:LFJ4",
            "robot0:LFJ3",
            "robot0:LFJ2",
            "robot0:LFJ1",
            "robot0:LFJ0",
            "robot0:THJ4",
            "robot0:THJ3",
            "robot0:THJ2",
            "robot0:THJ1",
            "robot0:THJ0",
        )

        assert env.unwrapped.sim.model.joint_names == expected_joints
        with ignore_mujoco_warnings():
            for _ in range(20):
                obs, _, _, _ = env.step(env.action_space.nvec // 2)

        is_on_palm.append(on_palm(env.unwrapped.sim))

        # Make sure the mass is right.
        cube_body_idx = env.unwrapped.sim.model.body_name2id("cube:middle")
        assert_allclose(
            env.unwrapped.sim.model.body_subtreemass[cube_body_idx],
            0.078,
            atol=1e-3)

    assert (
        np.mean(is_on_palm) >= 0.8
    ), "Cube should stay in hand (most of the time) when zero action is sent."
Example #6
import os

import mujoco_py
import psutil


def _worker(remote, parent_remote, env_fn_wrapper, env_memory_usage=None):
    with mujoco_py.ignore_mujoco_warnings():
        process = psutil.Process(os.getpid())
        parent_remote.close()
        env = env_fn_wrapper.var()
        while True:
            try:
                cmd, data = remote.recv()
                #print("CMD", cmd, os.getpid())
                if cmd == 'step':
                    #print("step start", os.getpid())
                    observation, reward, done, info = env.step(data)
                    #print("step end", os.getpid())
                    if done:
                        #print("done", os.getpid())
                        # save final observation where user can get it, then reset
                        info['terminal_observation'] = observation
                        observation = env.reset()
                        #print("reset", os.getpid())
                    #print("send", os.getpid())
                    remote.send((observation, reward, done, info))
                    #print("sent", os.getpid())
                elif cmd == 'seed':
                    remote.send(env.seed(data))
                elif cmd == 'reset':
                    observation = env.reset()
                    remote.send(observation)
                    if env_memory_usage is not None:
                        env_memory_usage.value = process.memory_info().rss / 1e9
                    #print("Worker memory usage", process.memory_info().rss/1E9)
                elif cmd == 'render':
                    remote.send(env.render(data))
                elif cmd == 'close':
                    env.close()
                    remote.close()
                    break
                elif cmd == 'get_spaces':
                    remote.send((env.observation_space, env.action_space))
                elif cmd == 'env_method':
                    method = getattr(env, data[0])
                    remote.send(method(*data[1], **data[2]))
                elif cmd == 'get_attr':
                    remote.send(getattr(env, data))
                elif cmd == 'set_attr':
                    remote.send(setattr(env, data[0], data[1]))
                else:
                    raise NotImplementedError(
                        "`{}` is not implemented in the worker".format(cmd))
            except Exception:
                # A MuJoCo warning raised an exception; swallow it and keep the
                # worker loop alive instead of crashing the process.
                continue
Example #7
    def run(self, once=False):
        while True:
            self.process_events()
            self.update_sim(self.env.unwrapped.sim)
            self.add_extra_menu()

            with ignore_mujoco_warnings():
                obs, reward, done, info = self._run_step(self._get_action())
                self.last_step_result = deepcopy((obs, reward, done, info))

            self.add_overlay(const.GRID_BOTTOMRIGHT, "done", str(done))
            self.render()

            self.update_aux_display()

            if once:
                return
Example #8
    def run(self, once=False):
        while True:
            with ignore_mujoco_warnings():
                self.env.step(self.action)
            self.add_overlay(const.GRID_TOPRIGHT,
                             "Reset env; (current seed: {})".format(self.seed),
                             "N - next / P - previous ")
            self.add_overlay(const.GRID_TOPRIGHT, "Apply action",
                             "A (-0.05) / Z (+0.05)")
            self.add_overlay(
                const.GRID_TOPRIGHT, "on action index %d out of %d" %
                (self.action_mod_index, self.num_action), "J / K")
            self.add_overlay(
                const.GRID_BOTTOMRIGHT, "Reset took",
                "%.2f sec." % (sum(self.elapsed) / len(self.elapsed)))
            self.add_overlay(const.GRID_BOTTOMRIGHT, "Action",
                             str(self.action))
            self.render()
            if once:
                return
Example #9
def fit(environ, env_id, num_timesteps, seed, model_path=None):
    # atari
    if environ == 'atari':
        rank = MPI.COMM_WORLD.Get_rank()
        sess = Model().single_threaded_session()
        sess.__enter__()
        if rank == 0:
            logger.configure()
        else:
            logger.configure(format_strs=[])
        workerseed = (seed + 10000 * MPI.COMM_WORLD.Get_rank()
                      if seed is not None else None)
        set_global_seeds(workerseed)
        env = make_atari(env_id)

        def policy_fn(name, ob_space, ac_space):
            return PPO1Cnn(name=name, ob_space=ob_space, ac_space=ac_space)

        env = Monitor(
            env,
            logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
        env.seed(workerseed)

        env = wrap_deepmind(env)
        env.seed(workerseed)

        pi = PPOSGD(env,
                    policy_fn,
                    env.observation_space,
                    env.action_space,
                    timesteps_per_actorbatch=256,
                    clip_param=0.2,
                    entcoeff=0.01,
                    optim_epochs=4,
                    optim_stepsize=1e-3,
                    optim_batchsize=64,
                    gamma=0.99,
                    lam=0.95,
                    max_timesteps=int(num_timesteps * 1.1),
                    schedule='linear')

        env.close()
        sess.close()
        return pi

    # mujoco
    if environ == 'mujoco':
        from utils.cmd import make_mujoco_env

        sess = Model().init_session(num_cpu=1).__enter__()

        def policy_fn(name, ob_space, ac_space):
            return PPO1Mlp(name=name,
                           ob_space=ob_space,
                           ac_space=ac_space,
                           hid_size=64,
                           num_hid_layers=2)

        env = make_mujoco_env(env_id, seed)
        pi = PPOSGD(
            env,
            policy_fn,
            env.observation_space,
            env.action_space,
            max_timesteps=num_timesteps,
            timesteps_per_actorbatch=2048,
            clip_param=0.2,
            entcoeff=0.0,
            optim_epochs=10,
            optim_stepsize=3e-4,
            optim_batchsize=64,
            gamma=0.99,
            lam=0.95,
            schedule='linear',
        )
        env.close()
        sess.close()
        return pi

    if environ == 'humanoid':
        import gym
        from utils.cmd import make_mujoco_env

        env_id = 'Humanoid-v2'

        class RewScale(gym.RewardWrapper):
            def __init__(self, env, scale):
                gym.RewardWrapper.__init__(self, env)
                self.scale = scale

            def reward(self, r):
                return r * self.scale

        sess = Model().init_session(num_cpu=1).__enter__()

        def policy_fn(name, ob_space, ac_space):
            return PPO1Mlp(name=name,
                           ob_space=ob_space,
                           ac_space=ac_space,
                           hid_size=64,
                           num_hid_layers=2)

        env = make_mujoco_env(env_id, seed)

        # The parameters below were the best found in a simple random search;
        # they are good enough to make the humanoid walk, but they may not be
        # the absolute best.
        env = RewScale(env, 0.1)
        pi = PPOSGD(
            env,
            policy_fn,
            env.observation_space,
            env.action_space,
            max_timesteps=num_timesteps,
            timesteps_per_actorbatch=2048,
            clip_param=0.2,
            entcoeff=0.0,
            optim_epochs=10,
            optim_stepsize=3e-4,
            optim_batchsize=64,
            gamma=0.99,
            lam=0.95,
            schedule='linear',
        )
        env.close()
        if model_path:
            Model().save_state(model_path)

        sess.close()
        return pi

    if environ == 'robotics':
        import mujoco_py
        from utils.cmd import make_robotics_env
        rank = MPI.COMM_WORLD.Get_rank()
        sess = Model().single_threaded_session()
        sess.__enter__()
        mujoco_py.ignore_mujoco_warnings().__enter__()
        workerseed = seed + 10000 * rank
        set_global_seeds(workerseed)
        env = make_robotics_env(env_id, workerseed, rank=rank)

        def policy_fn(name, ob_space, ac_space):
            return PPO1Mlp(name=name,
                           ob_space=ob_space,
                           ac_space=ac_space,
                           hid_size=256,
                           num_hid_layers=3)

        pi = PPOSGD(
            env,
            policy_fn,
            env.observation_space,
            env.action_space,
            max_timesteps=num_timesteps,
            timesteps_per_actorbatch=2048,
            clip_param=0.2,
            entcoeff=0.0,
            optim_epochs=5,
            optim_stepsize=3e-4,
            optim_batchsize=256,
            gamma=0.99,
            lam=0.95,
            schedule='linear',
        )
        env.close()
        sess.close()
        return pi
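A minimal usage sketch for fit(); the environment id, timestep count, and seed below are placeholder values, not taken from the original code.

if __name__ == "__main__":
    # Hypothetical invocation: a short PPO1 run on the MuJoCo branch.
    policy = fit('mujoco', 'Hopper-v2', num_timesteps=10000, seed=0)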