Example #1
    def make_env(i, this_seed):
        # Previously, we directly called `gym.make(env_name)`, but running
        # `imitation.scripts.train_adversarial` within `imitation.scripts.parallel`
        # created a weird interaction between Gym and Ray -- `gym.make` would fail
        # inside this function for any of our custom environments unless those
        # environments were also `gym.register()`ed inside `make_env`. Even
        # registering the custom environment in the scope of `make_vec_env` didn't
        # work. For more discussion and hypotheses on this issue see PR #160:
        # https://github.com/HumanCompatibleAI/imitation/pull/160.
        env = spec.make()

        # Seed each environment with a different, non-sequential seed for diversity
        # (even if the caller passes sequentially-assigned base seeds). int() is
        # necessary to work around a gym bug where it chokes on numpy int64s.
        env.seed(int(this_seed))

        if max_episode_steps is not None:
            env = TimeLimit(env, max_episode_steps)
        elif spec.max_episode_steps is not None:
            env = TimeLimit(env, max_episode_steps=spec.max_episode_steps)

        # Use Monitor to record statistics needed for Baselines algorithms logging
        # Optionally, save to disk
        log_path = None
        if log_dir is not None:
            log_subdir = os.path.join(log_dir, "monitor")
            os.makedirs(log_subdir, exist_ok=True)
            log_path = os.path.join(log_subdir, f"mon{i:03d}")

        env = bench.Monitor(env, log_path)
        env = wrappers.RolloutInfoWrapper(env)
        return env
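For context, factories like `make_env` above are typically wrapped in `functools.partial` and passed to a vectorized environment. The following is a minimal, self-contained sketch of that pattern; the `CartPole-v1` environment, the seed values, and the stable-baselines 2.x import paths are illustrative assumptions, not part of the example above.

import functools

import gym
from stable_baselines import bench
from stable_baselines.common.vec_env import DummyVecEnv


def _make_env(i, this_seed):
    # Stand-in for a factory like `make_env` above: one monitored env per index.
    env = gym.make("CartPole-v1")
    env.seed(int(this_seed))  # int() works around the numpy int64 issue noted above
    return bench.Monitor(env, None, allow_early_resets=True)


venv = DummyVecEnv([functools.partial(_make_env, i, 1000 + i) for i in range(4)])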
Example #2
def train():
  """
  Train a PPO1 model for slime volleyball using MPI multiprocessing. Tested with 96 CPUs.
  """
  rank = MPI.COMM_WORLD.Get_rank()

  if rank == 0:
    logger.configure(folder=LOGDIR)

  else:
    logger.configure(format_strs=[])
  workerseed = SEED + 10000 * MPI.COMM_WORLD.Get_rank()
  set_global_seeds(workerseed)
  env = make_env(workerseed)

  env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
  env.seed(workerseed)

  model = PPO1(MlpPolicy, env, timesteps_per_actorbatch=4096, clip_param=0.2, entcoeff=0.0, optim_epochs=10,
               optim_stepsize=3e-4, optim_batchsize=64, gamma=0.99, lam=0.95, schedule='linear',
               verbose=1)

  eval_callback = EvalCallback(env, best_model_save_path=LOGDIR, log_path=LOGDIR, eval_freq=EVAL_FREQ, n_eval_episodes=EVAL_EPISODES)

  model.learn(total_timesteps=NUM_TIMESTEPS, callback=eval_callback)

  env.close()
  del env
  if rank == 0:
    model.save(os.path.join(LOGDIR, "final_model")) # probably never get to this point.
def test_deepq():
    """
    test DeepQ on atari
    """
    logger.configure()
    set_global_seeds(SEED)
    env = make_atari(ENV_ID)
    env = bench.Monitor(env, logger.get_dir())
    env = wrap_atari_dqn(env)

    model = DQN(env=env,
                policy=CnnPolicy,
                learning_rate=1e-4,
                buffer_size=10000,
                exploration_fraction=0.1,
                exploration_final_eps=0.01,
                train_freq=4,
                learning_starts=10000,
                target_network_update_freq=1000,
                gamma=0.99,
                prioritized_replay=True,
                prioritized_replay_alpha=0.6,
                checkpoint_freq=10000)
    model.learn(total_timesteps=NUM_TIMESTEPS)

    env.close()
    del model, env
Example #4
def make_env(datapaths):
    if len(datapaths) > 1:
        env = EnsembleEnv(datapaths)
    else:
        env = Env(datapaths[0])
    env = bench.Monitor(env, logger.get_dir())
    return env
Example #5
def test_deepq():
    """
    test DeepQ on atari
    """
    logger.configure()
    set_global_seeds(SEED)
    env = make_atari(ENV_ID)
    env = bench.Monitor(env, logger.get_dir())
    env = wrap_atari_dqn(env)
    q_func = deepq_models.cnn_to_mlp(convs=[(32, 8, 4), (64, 4, 2),
                                            (64, 3, 1)],
                                     hiddens=[256],
                                     dueling=True)

    model = DeepQ(env=env,
                  policy=q_func,
                  learning_rate=1e-4,
                  buffer_size=10000,
                  exploration_fraction=0.1,
                  exploration_final_eps=0.01,
                  train_freq=4,
                  learning_starts=10000,
                  target_network_update_freq=1000,
                  gamma=0.99,
                  prioritized_replay=True,
                  prioritized_replay_alpha=0.6,
                  checkpoint_freq=10000)
    model.learn(total_timesteps=NUM_TIMESTEPS)

    env.close()
    del model, env
Example #6
def train():
    """
    Train a PPO1 model for slime volleyball using MPI multiprocessing. Tested with 96 CPUs.
    """
    rank = MPI.COMM_WORLD.Get_rank()

    if rank == 0:
        logger.configure(folder=LOGDIR)

    else:
        logger.configure(format_strs=[])
    workerseed = SEED + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)
    env = make_env(workerseed)

    env = bench.Monitor(
        env,
        logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
    env.seed(workerseed)

    model = PPO1.load(BEST_MODEL_PATH, env=env)

    eval_callback = EvalCallback(env,
                                 best_model_save_path=LOGDIR,
                                 log_path=LOGDIR,
                                 eval_freq=EVAL_FREQ,
                                 n_eval_episodes=EVAL_EPISODES)

    model.learn(total_timesteps=NUM_TIMESTEPS, callback=eval_callback)

    env.close()
    del env
    if rank == 0:
        model.save(os.path.join(
            LOGDIR, "final_model"))  # probably never get to this point.
Example #7
 def make_env():
     env_out = gym.make(env_id)
     env_out = bench.Monitor(env_out,
                             logger.get_dir(),
                             allow_early_resets=True)
     env_out.seed(seed)
     return env_out
Example #8
def train(env_id, num_timesteps, seed):
    """
    Train PPO1 model for Atari environments, for testing purposes

    :param env_id: (str) Environment ID
    :param num_timesteps: (int) The total number of samples
    :param seed: (int) The initial seed for training
    """
    rank = MPI.COMM_WORLD.Get_rank()

    if rank == 0:
        logger.configure()
    else:
        logger.configure(format_strs=[])
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)
    env = make_atari(env_id)

    env = bench.Monitor(env, logger.get_dir() and
                        os.path.join(logger.get_dir(), str(rank)))
    env.seed(workerseed)

    env = wrap_deepmind(env)
    env.seed(workerseed)

    model = PPO1(CnnPolicy, env, timesteps_per_actorbatch=256, clip_param=0.2, entcoeff=0.01, optim_epochs=4,
                 optim_stepsize=1e-3, optim_batchsize=64, gamma=0.99, lam=0.95, schedule='linear', verbose=2)
    model.learn(total_timesteps=num_timesteps)
    env.close()
    del env
Example #9
    def make_env():
        env_out = gym.make(args.env)

        env_out = bench.Monitor(env_out,
                                logger.get_dir(),
                                allow_early_resets=True)
        return env_out
Example #10
 def make_env():
     env_out = gym.make(args.env)
     env_out.env.disableViewer = True
     env_out.env.visualize = False
     env_out = bench.Monitor(env_out,
                             logger.get_dir(),
                             allow_early_resets=True)
     return env_out
Example #11
 def _thunk():
     local_env_kwargs = dict(env_kwargs)  # copy this to avoid altering the others
     local_env_kwargs["env_rank"] = rank
     env = _make(env_id, env_kwargs=local_env_kwargs)
     env.seed(seed + rank)
     if log_dir is not None:
         env = bench.Monitor(env, os.path.join(log_dir, str(rank)), allow_early_resets=allow_early_resets)
     return env
Example #12
 def make_env():
     # env_out = gym.make(env_id, reset_noise_scale=1.0)
     env_out = gym.make(env_id)
     env_out = bench.Monitor(env_out,
                             logger.get_dir(),
                             allow_early_resets=True)
     env_out.seed(seed)
     env_out = wrap_mujoco(env_out, random_action_len=random_action_len)
     return env_out
Example #13
def create_env(env_id, delay_step, env_str=str(0)):
    # if env_type in ["mujoco", "Mujoco", "MuJoCo", "raw", "mujoco_raw", "raw_mujoco"]:
    env = gym.make(env_id)
    env = TimestepWrapper(env)
    env = DelayedRewardWrapper(env, delay_step)
    env = bench.Monitor(
        env,
        logger.get_dir() and os.path.join(logger.get_dir(), env_str))
    return env
def main(args):
    """
    start training the model

    :param args: (ArgumentParser) the training argument
    """
    with tf_util.make_session(num_cpu=1):
        set_global_seeds(args.seed)
        env = gym.make(args.env_id)

        def policy_fn(name, ob_space, ac_space, reuse=False, placeholders=None, sess=None):
            return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space, reuse=reuse, sess=sess,
                                        hid_size=args.policy_hidden_size, num_hid_layers=2, placeholders=placeholders)


        #========================================================================================================
        env = bench.Monitor(env, logger.get_dir() and
                            os.path.join(logger.get_dir(), "monitor.json"))
        env.seed(args.seed)
        gym.logger.setLevel(logging.WARN)

        # Build the (long) task name. =========================================================================
        task_name = get_task_name(args)
        args.checkpoint_dir = os.path.join(args.checkpoint_dir, task_name)
        args.log_dir = os.path.join(args.log_dir, task_name)
        # =======================================================================================================


        if args.task == 'train':
            dataset = MujocoDset(expert_path=args.expert_path, traj_limitation=args.traj_limitation)

            # Create the discriminator network
            reward_giver = TransitionClassifier(env, args.adversary_hidden_size, entcoeff=args.adversary_entcoeff)

            # Train the policy network
            # Note that the policy network is declared via policy_fn
            train(env, args.seed, policy_fn, reward_giver,
                  dataset, args.algo, args.g_step, args.d_step,
                  args.policy_entcoeff, args.num_timesteps, args.save_per_iter, args.checkpoint_dir, args.pretrained, args.bc_max_iter, task_name)



        # ======================================= To be understood later =============================================
        # Used when evaluating the trained model.
        elif args.task == 'evaluate':
            runner(env,
                   policy_fn,
                   args.load_model_path,
                   timesteps_per_batch=1024,
                   number_trajs=10,
                   stochastic_policy=args.stochastic_policy,
                   save=args.save_sample
                   )
        else:
            raise NotImplementedError

        env.close()
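The `TimestepWrapper` and `DelayedRewardWrapper` used by `create_env` at the top of this example are project-specific and not shown here. As a rough illustration only, the sketch below assumes a delayed-reward wrapper of this shape accumulates rewards and releases the running sum every `delay_step` steps (and at episode end); the actual wrapper may behave differently.

import gym


class DelayedRewardSketch(gym.Wrapper):
    """Hypothetical stand-in: emit the accumulated reward every `delay_step` steps."""

    def __init__(self, env, delay_step):
        super().__init__(env)
        self.delay_step = delay_step
        self._acc = 0.0
        self._t = 0

    def reset(self, **kwargs):
        self._acc = 0.0
        self._t = 0
        return self.env.reset(**kwargs)

    def step(self, action):
        obs, reward, done, info = self.env.step(action)
        self._acc += reward
        self._t += 1
        if done or self._t % self.delay_step == 0:
            reward, self._acc = self._acc, 0.0
        else:
            reward = 0.0
        return obs, reward, done, info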
Example #15
def make_envs(env_id,
              do_eval,
              seed,
              conf,
              normalize_observations=False,
              normalize_returns=False):
    # Create envs.
    env_params = conf.pop('env_params', {})
    env = base_env = gym.make(env_id)
    if hasattr(base_env, 'env'):
        base_env = base_env.env
    for attr in env_params:
        setattr(base_env, attr, env_params[attr])
    env = bench.Monitor(env, logger.get_dir(), allow_early_resets=True)

    # Seed everything to make things reproducible.
    logger.info('seed={}, logdir={}'.format(seed, logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    env.seed(seed)

    if normalize_observations or normalize_returns:
        env = DummyVecEnv([lambda: env])
        env = VecNormalize(env,
                           norm_obs=normalize_observations,
                           norm_reward=normalize_returns)

    if do_eval:
        eval_env = base_eval_env = gym.make(env_id)
        if hasattr(base_eval_env, 'env'):
            base_eval_env = base_eval_env.env
        for attr in env_params:
            setattr(base_eval_env, attr, env_params[attr])
        eval_env = bench.Monitor(eval_env,
                                 os.path.join(logger.get_dir(), 'gym_eval'),
                                 allow_early_resets=True)
        eval_env.seed(seed)
        eval_env.base_env = base_eval_env
    else:
        base_eval_env = None
        eval_env = None
    env.base_env = base_env

    return base_env, env, base_eval_env, eval_env
Example #16
    def make_env(i):
        env = gym.make(env_id)
        env.seed(seed + i)  # seed each environment separately for diversity

        # Use Monitor to record statistics needed for Baselines algorithms logging
        # Optionally, save to disk
        log_path = None
        if log_dir is not None:
            log_subdir = os.path.join(log_dir, 'monitor')
            os.makedirs(log_subdir, exist_ok=True)
            log_path = os.path.join(log_subdir, f'mon{i:03d}')
        return bench.Monitor(env, log_path, allow_early_resets=True)
def train(env_id, num_timesteps, seed, num_options, app, saves, wsaves, epoch,
          dc):

    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(seed)
    env = gym.make(env_id)

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name,
                                    ob_space=ob_space,
                                    ac_space=ac_space,
                                    hid_size=64,
                                    num_hid_layers=2,
                                    num_options=num_options,
                                    dc=dc)

    env = bench.Monitor(
        env,
        logger.get_dir() and osp.join(logger.get_dir(), "monitor.json"))
    env.seed(seed)
    gym.logger.setLevel(logging.WARN)

    if num_options == 1:
        optimsize = 64
    elif num_options == 2:
        optimsize = 32
    else:
        print("Only two options or primitive actions is currently supported.")
        sys.exit()

    pposgd_simple.learn(env,
                        policy_fn,
                        max_timesteps=num_timesteps,
                        timesteps_per_batch=2048,
                        clip_param=0.2,
                        entcoeff=0.0,
                        optim_epochs=10,
                        optim_stepsize=3e-4,
                        optim_batchsize=optimsize,
                        gamma=0.99,
                        lam=0.95,
                        schedule='constant',
                        num_options=num_options,
                        app=app,
                        saves=saves,
                        wsaves=wsaves,
                        epoch=epoch,
                        seed=seed,
                        dc=dc)
    env.close()
def main(args):
    """
    start training the model

    :param args: (ArgumentParser) the training argument
    """
    with tf_util.make_session(num_cpu=1):
        set_global_seeds(args.seed)
        env = gym.make(args.env_id)

        def policy_fn(name, ob_space, ac_space, reuse=False, sess=None):
            return mlp_policy.MlpPolicy(name=name,
                                        ob_space=ob_space,
                                        ac_space=ac_space,
                                        sess=sess,
                                        reuse=reuse,
                                        hid_size=args.policy_hidden_size,
                                        num_hid_layers=2)

        env = bench.Monitor(
            env,
            logger.get_dir()
            and os.path.join(logger.get_dir(), "monitor.json"))
        env.seed(args.seed)
        gym.logger.setLevel(logging.WARN)
        task_name = get_task_name(args)
        args.checkpoint_dir = os.path.join(args.checkpoint_dir, task_name)
        args.log_dir = os.path.join(args.log_dir, task_name)
        dataset = MujocoDset(expert_path=args.expert_path,
                             traj_limitation=args.traj_limitation)
        savedir_fname = learn(env,
                              policy_fn,
                              dataset,
                              max_iters=args.BC_max_iter,
                              ckpt_dir=args.checkpoint_dir,
                              task_name=task_name,
                              verbose=True)
        runner(env,
               policy_fn,
               savedir_fname,
               timesteps_per_batch=1024,
               number_trajs=10,
               stochastic_policy=args.stochastic_policy,
               save=args.save_sample,
               reuse=True)
Example #19
def main():
    """
    run the atari test
    """
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--env', help='environment ID', default='BreakoutNoFrameskip-v4')
    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
    parser.add_argument('--prioritized', type=int, default=1)
    parser.add_argument('--prioritized-replay-alpha', type=float, default=0.6)
    parser.add_argument('--dueling', type=int, default=1)
    parser.add_argument('--num-timesteps', type=int, default=int(10e6))
    parser.add_argument('--checkpoint-freq', type=int, default=10000)
    parser.add_argument('--checkpoint-path', type=str, default=None)

    args = parser.parse_args()
    logger.configure()
    set_global_seeds(args.seed)
    env = make_atari(args.env)
    env = bench.Monitor(env, logger.get_dir())
    env = wrap_atari_dqn(env)
    q_func = deepq_models.cnn_to_mlp(
        convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
        hiddens=[256],
        dueling=bool(args.dueling),
    )

    model = DeepQ(
        env=env,
        policy=q_func,
        learning_rate=1e-4,
        buffer_size=10000,
        exploration_fraction=0.1,
        exploration_final_eps=0.01,
        train_freq=4,
        learning_starts=10000,
        target_network_update_freq=1000,
        gamma=0.99,
        prioritized_replay=bool(args.prioritized),
        prioritized_replay_alpha=args.prioritized_replay_alpha,
        checkpoint_freq=args.checkpoint_freq,
        checkpoint_path=args.checkpoint_path,
    )
    model.learn(total_timesteps=args.num_timesteps)

    env.close()
Example #20
def test_rollout_stats():
    """Applying `ObsRewIncrementWrapper` halves the reward mean.

    `rollout_stats` should reflect this.
    """
    env = gym.make("CartPole-v1")
    env = bench.Monitor(env, None)
    env = ObsRewHalveWrapper(env)
    venv = vec_env.DummyVecEnv([lambda: env])

    with serialize.load_policy("zero", "UNUSED", venv) as policy:
        trajs = rollout.generate_trajectories(policy, venv, rollout.min_episodes(10))
    s = rollout.rollout_stats(trajs)

    np.testing.assert_allclose(s["return_mean"], s["monitor_return_mean"] / 2)
    np.testing.assert_allclose(s["return_std"], s["monitor_return_std"] / 2)
    np.testing.assert_allclose(s["return_min"], s["monitor_return_min"] / 2)
    np.testing.assert_allclose(s["return_max"], s["monitor_return_max"] / 2)
Example #21
def train(env_id, num_timesteps, seed):
    """
    Train TRPO model for the atari environment, for testing purposes

    :param env_id: (str) Environment ID
    :param num_timesteps: (int) The total number of samples
    :param seed: (int) The initial seed for training
    """
    rank = MPI.COMM_WORLD.Get_rank()

    if rank == 0:
        logger.configure()
    else:
        logger.configure(format_strs=[])

    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)
    env = make_atari(env_id)

    # def policy_fn(name, ob_space, ac_space, sess=None, placeholders=None):  # pylint: disable=W0613
    #     return CnnPolicy(name=name, ob_space=ob_space, ac_space=ac_space, sess=sess, placeholders=placeholders)

    env = bench.Monitor(
        env,
        logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
    env.seed(workerseed)

    env = wrap_deepmind(env)
    env.seed(workerseed)

    model = TRPO(CnnPolicy,
                 env,
                 timesteps_per_batch=512,
                 max_kl=0.001,
                 cg_iters=10,
                 cg_damping=1e-3,
                 entcoeff=0.0,
                 gamma=0.98,
                 lam=1,
                 vf_iters=3,
                 vf_stepsize=1e-4)
    model.learn(total_timesteps=int(num_timesteps * 1.1))
    env.close()
Example #22
 def make_env():
     env_out = GymWrapper(
         suite.make(
             "SawyerLift",
             use_camera_obs=False,  # do not use pixel observations
             has_offscreen_renderer=False,  # not needed since not using pixel obs
             has_renderer=True,  # make sure we can render to the screen
             reward_shaping=True,  # use dense rewards
             control_freq=10,  # control should happen fast enough so that simulation looks smooth
         ))
     env_out.reward_range = None
     env_out.metadata = None
     env_out.spec = None
     env_out = bench.Monitor(env_out,
                             logger.get_dir(),
                             allow_early_resets=True)
     return env_out
Example #23
def main():
    """
    Run the atari test
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--env',
                        help='environment ID',
                        default='BreakoutNoFrameskip-v4')
    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
    parser.add_argument('--prioritized', type=int, default=1)
    parser.add_argument('--dueling', type=int, default=1)
    parser.add_argument('--prioritized-replay-alpha', type=float, default=0.6)
    parser.add_argument('--num-timesteps', type=int, default=int(10e6))

    args = parser.parse_args()
    logger.configure()
    set_global_seeds(args.seed)
    env = make_atari(args.env)
    env = bench.Monitor(env, logger.get_dir())
    env = wrap_atari_dqn(env)
    policy = partial(CnnPolicy, dueling=args.dueling == 1)

    model = DQN(
        env=env,
        policy=policy,
        learning_rate=1e-4,
        buffer_size=10000,
        exploration_fraction=0.1,
        exploration_final_eps=0.01,
        train_freq=4,
        learning_starts=10000,
        target_network_update_freq=1000,
        gamma=0.99,
        prioritized_replay=bool(args.prioritized),
        prioritized_replay_alpha=args.prioritized_replay_alpha,
    )
    model.learn(total_timesteps=args.num_timesteps)

    env.close()
Example #24
def run(env_id, seed, noise_type, layer_norm, evaluation, **kwargs):
    """
    run the training of DDPG

    :param env_id: (str) the environment ID
    :param seed: (int) the initial random seed
    :param noise_type: (str) the desired noise type(s) ('adaptive-param', 'normal' or 'ou'); multiple noise types can
        be combined by separating them with commas
    :param layer_norm: (bool) use layer normalization
    :param evaluation: (bool) enable evaluation of DDPG training
    :param kwargs: (dict) extra keywords for the training.train function
    """

    # Configure things.
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)

    # Create envs.
    env = gym.make(env_id)
    env = bench.Monitor(
        env,
        logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))

    if evaluation and rank == 0:
        eval_env = gym.make(env_id)
        eval_env = bench.Monitor(eval_env,
                                 os.path.join(logger.get_dir(), 'gym_eval'))
        env = bench.Monitor(env, None)
    else:
        eval_env = None

    # Parse noise_type
    action_noise = None
    param_noise = None
    nb_actions = env.action_space.shape[-1]
    for current_noise_type in noise_type.split(','):
        current_noise_type = current_noise_type.strip()
        if current_noise_type == 'none':
            pass
        elif 'adaptive-param' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            param_noise = AdaptiveParamNoiseSpec(
                initial_stddev=float(stddev),
                desired_action_stddev=float(stddev))
        elif 'normal' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = NormalActionNoise(mean=np.zeros(nb_actions),
                                             sigma=float(stddev) *
                                             np.ones(nb_actions))
        elif 'ou' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(
                mean=np.zeros(nb_actions),
                sigma=float(stddev) * np.ones(nb_actions))
        else:
            raise RuntimeError(
                'unknown noise type "{}"'.format(current_noise_type))

    # Seed everything to make things reproducible.
    seed = seed + 1000000 * rank
    logger.info('rank {}: seed={}, logdir={}'.format(rank, seed,
                                                     logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    env.seed(seed)
    if eval_env is not None:
        eval_env.seed(seed)

    # Disable logging for rank != 0 to avoid noise.
    start_time = 0
    if rank == 0:
        start_time = time.time()
    model = DDPG(policy=MlpPolicy,
                 env=env,
                 memory_policy=Memory,
                 eval_env=eval_env,
                 param_noise=param_noise,
                 action_noise=action_noise,
                 memory_limit=int(1e6),
                 layer_norm=layer_norm,
                 verbose=2,
                 **kwargs)
    model.learn(total_timesteps=10000)
    env.close()
    if eval_env is not None:
        eval_env.close()
    if rank == 0:
        logger.info('total runtime: {}s'.format(time.time() - start_time))
 def make_env(i):
     env = CartPoleNoVelEnv()
     env = bench.Monitor(env, None, allow_early_resets=True)
     env.seed(i)
     return env
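`CartPoleNoVelEnv` above is a test helper: a CartPole variant whose observation omits the velocity components, which makes the task partially observable. A minimal sketch of such an environment, built here as an observation wrapper over the standard `CartPole-v1` (an assumption; the real class may be implemented differently):

import gym
import numpy as np


class CartPoleNoVelSketch(gym.ObservationWrapper):
    """Hypothetical stand-in: expose only cart position and pole angle (indices 0 and 2)."""

    def __init__(self):
        super().__init__(gym.make("CartPole-v1"))
        low = self.env.observation_space.low[[0, 2]]
        high = self.env.observation_space.high[[0, 2]]
        self.observation_space = gym.spaces.Box(low=low, high=high, dtype=np.float32)

    def observation(self, obs):
        return obs[[0, 2]].astype(np.float32)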
Example #26
 def _init():
     env = gym.make(env_id)
     env.seed(seed + rank)
     env = bench.Monitor(env, logger.get_dir(), allow_early_resets=True)
     return env
Example #27
 def make_env(i):
     env = CartPoleNoVelEnv()
     env = TimeLimit(env, max_episode_steps=500)
     env = bench.Monitor(env, None, allow_early_resets=True)
     env.seed(i)
     return env
Example #28
 def make_env(i):
     env = gym.make("Breakout-v0")
     env = bench.Monitor(env, None, allow_early_resets=True)
     env.seed(i)
     return env
Example #29
 def make_env():
     env_out = StudentEnv()
     env_out = bench.Monitor(env_out,
                             logger.get_dir(),
                             allow_early_resets=True)
     return env_out
Example #30
 def make_env():
     env_out = gym.make('CartPole-v0')
     env_out = bench.Monitor(env_out,
                             logger.get_dir(),
                             allow_early_resets=True)
     return env_out