def _thunk():
    env = make_atari(env_id)
    env = gym.wrappers.Monitor(env, '/tmp/video', force=True,
                               video_callable=lambda ep: True)
    env.seed(seed + rank)
    env = Monitor(env,
                  logger.get_dir() and os.path.join(logger.get_dir(), str(rank)),
                  allow_early_resets=allow_early_resets)
    return wrap_deepmind(env, **wrapper_kwargs)

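This `_thunk` pattern (a zero-argument factory closing over `env_id`, `seed`, `rank`, and the Monitor settings) is what the vectorized-environment helpers expect: each worker process calls its thunk to build one env. A minimal consumption sketch, assuming stable-baselines' `SubprocVecEnv` and the Atari helpers used above; `make_atari_vec_env` is a hypothetical wrapper name, not from the source:

# Hypothetical usage sketch: build one thunk per worker rank and hand the
# list to SubprocVecEnv, which runs each environment in its own process.
import os

from stable_baselines import logger
from stable_baselines.bench import Monitor
from stable_baselines.common import set_global_seeds
from stable_baselines.common.atari_wrappers import make_atari, wrap_deepmind
from stable_baselines.common.vec_env import SubprocVecEnv


def make_atari_vec_env(env_id, num_env, seed, allow_early_resets=True):
    def make_thunk(rank):
        def _thunk():
            env = make_atari(env_id)
            env.seed(seed + rank)
            env = Monitor(env,
                          logger.get_dir() and os.path.join(logger.get_dir(), str(rank)),
                          allow_early_resets=allow_early_resets)
            return wrap_deepmind(env)
        return _thunk

    set_global_seeds(seed)
    return SubprocVecEnv([make_thunk(rank) for rank in range(num_env)])
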
def _create_single_football_env(
        level,
        stacked,
        representation,
        reward_experiment,
        write_goal_dumps,
        write_full_episode_dumps,
        write_video,
        dump_frequency,
        render,
        process_number=0,
):
    """
    Creates a gfootball environment.

    The meaning of each argument can be found in gfootball/examples/run_ppo2.py.
    """
    env = create_environment(
        env_name=level,
        stacked=stacked,
        representation=representation,
        rewards=reward_experiment,
        logdir=logger.get_dir(),
        write_goal_dumps=write_goal_dumps and (process_number == 0),
        write_full_episode_dumps=write_full_episode_dumps and (process_number == 0),
        write_video=write_video,
        render=render and (process_number == 0),
        dump_frequency=dump_frequency if render and process_number == 0 else 0)
    env = monitor.Monitor(
        env,
        logger.get_dir() and os.path.join(logger.get_dir(), str(process_number)))
    return env

def make_env_all_params(rank, add_monitor, args):
    if args["env_kind"] == 'atari':
        env = gym.make(args['env'])
        assert 'NoFrameskip' in env.spec.id
        env = NoopResetEnv(env, noop_max=args['noop_max'])
        env = MaxAndSkipEnv(env, skip=4)
        env = ProcessFrame84(env, crop=False)
        env = FrameStack(env, 4)
        env = ExtraTimeLimit(env, args['max_episode_steps'])
        if 'Montezuma' in args['env']:
            env = MontezumaInfoWrapper(env)
        env = AddRandomStateToInfo(env)
    elif args["env_kind"] == 'mario':
        env = make_mario_env()
    elif args["env_kind"] == "retro_multi":
        env = make_multi_pong()
    elif args["env_kind"] == 'robopong':
        if args["env"] == "pong":
            env = make_robo_pong()
        elif args["env"] == "hockey":
            env = make_robo_hockey()
    elif args["env_kind"] == "virtualbox":
        print("Make virtualbox")
        env = make_virtualbox()

    if add_monitor:
        logger_dir = "log" if logger.get_dir() is None else logger.get_dir()
        print("Adding monitor at dir", logger_dir)
        env = Monitor(env, osp.join(logger_dir, '%.2i' % rank))
    return env

def train(env_id, num_timesteps, seed): """ Train PPO1 model for Atari environments, for testing purposes :param env_id: (str) Environment ID :param num_timesteps: (int) The total number of samples :param seed: (int) The initial seed for training """ rank = MPI.COMM_WORLD.Get_rank() if rank == 0: logger.configure() else: logger.configure(format_strs=[]) workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank() set_global_seeds(workerseed) env = make_atari(env_id) env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank))) env.seed(workerseed) env = wrap_deepmind(env) env.seed(workerseed) model = PPO1(CnnPolicy, env, timesteps_per_actorbatch=256, clip_param=0.2, entcoeff=0.01, optim_epochs=4, optim_stepsize=1e-3, optim_batchsize=64, gamma=0.99, lam=0.95, schedule='linear', verbose=2) model.learn(total_timesteps=num_timesteps) env.close() del env
def _thunk():
    env = gym.make(env_id)
    env.seed(seed + rank)
    env = Monitor(env,
                  logger.get_dir() and os.path.join(logger.get_dir(), str(rank)),
                  allow_early_resets=allow_early_resets,
                  reset_keywords=())
    # return wrap_deepmind(env, **wrapper_kwargs)
    return env

def train(): """ Train PPO1 model for slime volleyball, in MPI multiprocessing. Tested for 96 CPUs. """ rank = MPI.COMM_WORLD.Get_rank() if rank == 0: logger.configure(folder=LOGDIR) else: logger.configure(format_strs=[]) workerseed = SEED + 10000 * MPI.COMM_WORLD.Get_rank() set_global_seeds(workerseed) env = make_env(workerseed) env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank))) env.seed(workerseed) model = PPO1(MlpPolicy, env, timesteps_per_actorbatch=4096, clip_param=0.2, entcoeff=0.0, optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64, gamma=0.99, lam=0.95, schedule='linear', verbose=1) eval_callback = EvalCallback(env, best_model_save_path=LOGDIR, log_path=LOGDIR, eval_freq=EVAL_FREQ, n_eval_episodes=EVAL_EPISODES) model.learn(total_timesteps=NUM_TIMESTEPS, callback=eval_callback) env.close() del env if rank == 0: model.save(os.path.join(LOGDIR, "final_model")) # probably never get to this point.
def train(): """ Train PPO1 model for slime volleyball, in MPI multiprocessing. Tested for 96 CPUs. """ rank = MPI.COMM_WORLD.Get_rank() if rank == 0: logger.configure(folder=LOGDIR) else: logger.configure(format_strs=[]) workerseed = SEED + 10000 * MPI.COMM_WORLD.Get_rank() set_global_seeds(workerseed) env = make_env(workerseed) env = bench.Monitor( env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank))) env.seed(workerseed) model = PPO1.load(BEST_MODEL_PATH, env=env) eval_callback = EvalCallback(env, best_model_save_path=LOGDIR, log_path=LOGDIR, eval_freq=EVAL_FREQ, n_eval_episodes=EVAL_EPISODES) model.learn(total_timesteps=NUM_TIMESTEPS, callback=eval_callback) env.close() del env if rank == 0: model.save(os.path.join( LOGDIR, "final_model")) # probably never get to this point.
def make_robotics_env(env_id, seed, rank=0, allow_early_resets=True): """ Create a wrapped, monitored gym.Env for MuJoCo. :param env_id: (str) the environment ID :param seed: (int) the inital seed for RNG :param rank: (int) the rank of the environment (for logging) :param allow_early_resets: (bool) allows early reset of the environment :return: (Gym Environment) The robotic environment """ set_global_seeds(seed) env = gym.make(env_id) keys = ['observation', 'desired_goal'] # TODO: remove try-except once most users are running modern Gym try: # for modern Gym (>=0.15.4) from gym.wrappers import FilterObservation, FlattenObservation env = FlattenObservation(FilterObservation(env, keys)) except ImportError: # for older gym (<=0.15.3) from gym.wrappers import FlattenDictWrapper # pytype:disable=import-error env = FlattenDictWrapper(env, keys) env = Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)), info_keywords=('is_success', ), allow_early_resets=allow_early_resets) env.seed(seed) return env
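A short usage sketch of the factory above; the `FetchReach-v1` ID and the choice of `PPO2` are illustrative assumptions, not from the source:

# Hypothetical usage of make_robotics_env (env ID and algorithm are assumptions).
from stable_baselines import PPO2, logger

logger.configure()  # so the Monitor file lands inside logger.get_dir()
env = make_robotics_env('FetchReach-v1', seed=0, rank=0)
model = PPO2('MlpPolicy', env, verbose=1)
model.learn(total_timesteps=10000)
env.close()
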
def _thunk():
    env = make_atari(env_id)
    env.seed(seed + rank)
    env = Monitor(env,
                  logger.get_dir() and os.path.join(logger.get_dir(), str(rank)),
                  allow_early_resets=allow_early_resets)
    return wrap_deepmind(env, **wrapper_kwargs)

def main(args): """ start training the model :param args: (ArgumentParser) the training argument """ with tf_util.make_session(num_cpu=1): set_global_seeds(args.seed) env = gym.make(args.env_id) def policy_fn(name, ob_space, ac_space, reuse=False, placeholders=None, sess=None): return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space, reuse=reuse, sess=sess, hid_size=args.policy_hidden_size, num_hid_layers=2, placeholders=placeholders) #======================================================================================================== env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), "monitor.json")) env.seed(args.seed) gym.logger.setLevel(logging.WARN) # 길고 긴 task name을 받아옵니다. =========================================================================== task_name = get_task_name(args) args.checkpoint_dir = os.path.join(args.checkpoint_dir, task_name) args.log_dir = os.path.join(args.log_dir, task_name) # ======================================================================================================= if args.task == 'train': dataset = MujocoDset(expert_path=args.expert_path, traj_limitation=args.traj_limitation) #discriminator 네트워크 생성 reward_giver = TransitionClassifier(env, args.adversary_hidden_size, entcoeff=args.adversary_entcoeff) #policy 네트워크 학습 #policy network 가 policy_fn 으로 선언되어있다는 것에 주의 train(env, args.seed, policy_fn, reward_giver, dataset, args.algo, args.g_step, args.d_step, args.policy_entcoeff, args.num_timesteps, args.save_per_iter, args.checkpoint_dir, args.pretrained, args.bc_max_iter, task_name) # ======================================= 이것은 나중에 이해하보는 거시여============================================= # 학습된 모델을 evaluate 할 때 사용합니다. elif args.task == 'evaluate': runner(env, policy_fn, args.load_model_path, timesteps_per_batch=1024, number_trajs=10, stochastic_policy=args.stochastic_policy, save=args.save_sample ) else: raise NotImplementedError env.close()
def create_env(env_id, delay_step, env_str=str(0)):
    # if env_type in ["mujoco", "Mujoco", "MuJoCo", "raw", "mujoco_raw", "raw_mujoco"]:
    env = gym.make(env_id)
    env = TimestepWrapper(env)
    env = DelayedRewardWrapper(env, delay_step)
    env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), env_str))
    return env

def _thunk():
    no_graphics = not use_visual
    unity_env = UnityEnvironment(env_directory, no_graphics=no_graphics)
    env = UnityToGymWrapper(unity_env, uint8_visual=False)
    env = Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
    return env

def _thunk():
    env = make_atari(env_id, max_episode_steps=max_episode_steps,
                     action_space=action_space)
    env.seed(seed + rank)
    env = Monitor(env,
                  logger.get_dir() and os.path.join(logger.get_dir(), str(rank)),
                  allow_early_resets=True)
    return wrap_deepmind(env, **wrapper_kwargs)

def train(env_id, num_timesteps, seed, num_options, app, saves, wsaves, epoch, dc):
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(seed)
    env = gym.make(env_id)

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    hid_size=64, num_hid_layers=2,
                                    num_options=num_options, dc=dc)

    env = bench.Monitor(env, logger.get_dir() and osp.join(logger.get_dir(), "monitor.json"))
    env.seed(seed)
    gym.logger.setLevel(logging.WARN)

    if num_options == 1:
        optimsize = 64
    elif num_options == 2:
        optimsize = 32
    else:
        print("Only primitive actions or two options are currently supported.")
        sys.exit()

    pposgd_simple.learn(env, policy_fn, max_timesteps=num_timesteps,
                        timesteps_per_batch=2048, clip_param=0.2, entcoeff=0.0,
                        optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=optimsize,
                        gamma=0.99, lam=0.95, schedule='constant',
                        num_options=num_options, app=app, saves=saves, wsaves=wsaves,
                        epoch=epoch, seed=seed, dc=dc)
    env.close()

def _thunk():
    env = make_mario(env_id)
    env.seed(seed + rank)
    if cut_map:
        env = CutMarioMap(env)
    env = Monitor(env,
                  logger.get_dir() and os.path.join(logger.get_dir(), str(rank)),
                  allow_early_resets=allow_early_resets)
    # FIXME: only deepmind-style wrapping is supported; create other methods
    return wrap_deepmind_custom(env, **wrapper_kwargs)  # converts to 84x84 grayscale; keep for now

def create_single_football_env(seed):
    """Creates a gfootball environment."""
    # env = football_env.create_environment(
    #     env_name=FLAGS.level, stacked=('stacked' in FLAGS.state),
    #     rewards=FLAGS.reward_experiment,
    #     logdir=logger.get_dir() + str(seed),
    #     enable_goal_videos=FLAGS.dump_scores and (seed == 0),
    #     enable_full_episode_videos=FLAGS.dump_full_episodes and (seed == 0),
    #     render=True and (seed == 0),
    #     dump_frequency=50 if FLAGS.render and seed == 0 else 0)
    env = football_env.create_environment(env_name="academy_3_vs_1_with_keeper",
                                          stacked=True,
                                          representation='extracted',
                                          render=False and (seed == 0),
                                          channel_dimensions=(64, 64),
                                          rewards='scoring,checkpoints')
    env = monitor.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(seed)))
    return env

def test_deepq(): """ test DeepQ on atari """ logger.configure() set_global_seeds(SEED) env = make_atari(ENV_ID) env = bench.Monitor(env, logger.get_dir()) env = wrap_atari_dqn(env) q_func = deepq_models.cnn_to_mlp(convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)], hiddens=[256], dueling=True) model = DeepQ(env=env, policy=q_func, learning_rate=1e-4, buffer_size=10000, exploration_fraction=0.1, exploration_final_eps=0.01, train_freq=4, learning_starts=10000, target_network_update_freq=1000, gamma=0.99, prioritized_replay=True, prioritized_replay_alpha=0.6, checkpoint_freq=10000) model.learn(total_timesteps=NUM_TIMESTEPS) env.close() del model, env
def test_deepq(): """ test DeepQ on atari """ logger.configure() set_global_seeds(SEED) env = make_atari(ENV_ID) env = bench.Monitor(env, logger.get_dir()) env = wrap_atari_dqn(env) model = DQN(env=env, policy=CnnPolicy, learning_rate=1e-4, buffer_size=10000, exploration_fraction=0.1, exploration_final_eps=0.01, train_freq=4, learning_starts=10000, target_network_update_freq=1000, gamma=0.99, prioritized_replay=True, prioritized_replay_alpha=0.6, checkpoint_freq=10000) model.learn(total_timesteps=NUM_TIMESTEPS) env.close() del model, env
def make_env():
    env_out = gym.make(args.env)
    env_out = bench.Monitor(env_out, logger.get_dir(), allow_early_resets=True)
    return env_out

def main(): """ Runs the test """ logger.configure() parser = mujoco_arg_parser() parser.add_argument('--model-path', default=os.path.join(logger.get_dir(), 'humanoid_policy')) parser.set_defaults(num_timesteps=int(2e7)) args = parser.parse_args() if not args.play: # train the model train(num_timesteps=args.num_timesteps, seed=args.seed, model_path=args.model_path) else: # construct the model object, load pre-trained model and render model = train(num_timesteps=1, seed=args.seed) tf_util.load_state(args.model_path) env = make_mujoco_env('Humanoid-v2', seed=0) obs = env.reset() while True: action = model.policy.act(stochastic=False, obs=obs)[0] obs, _, done, _ = env.step(action) env.render() if done: obs = env.reset()
def make_env(datapaths):
    if len(datapaths) > 1:
        env = EnsembleEnv(datapaths)
    else:
        env = Env(datapaths[0])
    env = bench.Monitor(env, logger.get_dir())
    return env

def make_env():
    env_out = gym.make(env_id)
    env_out = bench.Monitor(env_out, logger.get_dir(), allow_early_resets=True)
    env_out.seed(seed)
    return env_out

def make_env():
    env_out = gym.make(args.env)
    env_out.env.disableViewer = True
    env_out.env.visualize = False
    env_out = bench.Monitor(env_out, logger.get_dir(), allow_early_resets=True)
    return env_out

def main(args): """ start training the model :param args: (ArgumentParser) the training argument """ with tf_util.make_session(num_cpu=1): set_global_seeds(args.seed) env = gym.make(args.env_id) def policy_fn(name, ob_space, ac_space, reuse=False, sess=None): return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space, sess=sess, reuse=reuse, hid_size=args.policy_hidden_size, num_hid_layers=2) env = bench.Monitor( env, logger.get_dir() and os.path.join(logger.get_dir(), "monitor.json")) env.seed(args.seed) gym.logger.setLevel(logging.WARN) task_name = get_task_name(args) args.checkpoint_dir = os.path.join(args.checkpoint_dir, task_name) args.log_dir = os.path.join(args.log_dir, task_name) dataset = MujocoDset(expert_path=args.expert_path, traj_limitation=args.traj_limitation) savedir_fname = learn(env, policy_fn, dataset, max_iters=args.BC_max_iter, ckpt_dir=args.checkpoint_dir, task_name=task_name, verbose=True) runner(env, policy_fn, savedir_fname, timesteps_per_batch=1024, number_trajs=10, stochastic_policy=args.stochastic_policy, save=args.save_sample, reuse=True)
def log_eval(num_update, mean_eval_reward, file_name='eval.csv'):
    # Write the header row on first use, then append one row per call.
    if not os.path.exists(os.path.join(logger.get_dir(), file_name)):
        with open(os.path.join(logger.get_dir(), file_name), 'a', newline='') as csvfile:
            csvwriter = csv.writer(csvfile, delimiter=',', quotechar='"',
                                   quoting=csv.QUOTE_MINIMAL)
            title = ['n_updates', 'mean_eval_reward']
            csvwriter.writerow(title)
    with open(os.path.join(logger.get_dir(), file_name), 'a', newline='') as csvfile:
        csvwriter = csv.writer(csvfile, delimiter=',', quotechar='"',
                               quoting=csv.QUOTE_MINIMAL)
        data = [num_update, mean_eval_reward]
        csvwriter.writerow(data)

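A hedged call-site sketch for `log_eval`: it is typically invoked once per evaluation round inside a training loop. The loop constants and the stand-in reward below are assumptions, not from the source:

# Hypothetical call site for log_eval; the evaluation step is a stand-in.
import random

from stable_baselines import logger

logger.configure()  # log_eval writes eval.csv inside logger.get_dir()

N_UPDATES, EVAL_INTERVAL = 100, 10
for num_update in range(N_UPDATES):
    # ... run one training update here ...
    if num_update % EVAL_INTERVAL == 0:
        mean_eval_reward = random.random()  # stand-in for a real evaluation rollout
        log_eval(num_update, mean_eval_reward)  # appends a row to <logdir>/eval.csv
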
def make_env():
    # env_out = gym.make(env_id, reset_noise_scale=1.0)
    env_out = gym.make(env_id)
    env_out = bench.Monitor(env_out, logger.get_dir(), allow_early_resets=True)
    env_out.seed(seed)
    env_out = wrap_mujoco(env_out, random_action_len=random_action_len)
    return env_out

def make_robotics_env(env_id, seed, rank=0, allow_early_resets=True): """ Create a wrapped, monitored gym.Env for MuJoCo. :param env_id: (str) the environment ID :param seed: (int) the inital seed for RNG :param rank: (int) the rank of the environment (for logging) :param allow_early_resets: (bool) allows early reset of the environment :return: (Gym Environment) The robotic environment """ set_global_seeds(seed) env = gym.make(env_id) env = FlattenDictWrapper(env, ['observation', 'desired_goal']) env = Monitor( env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)), info_keywords=('is_success',), allow_early_resets=allow_early_resets) env.seed(seed) return env
def make_envs(env_id, do_eval, seed, conf, normalize_observations=False, normalize_returns=False):
    # Create envs.
    env_params = conf.pop('env_params', {})
    env = base_env = gym.make(env_id)
    if hasattr(base_env, 'env'):
        base_env = base_env.env
    for attr in env_params:
        setattr(base_env, attr, env_params[attr])
    env = bench.Monitor(env, logger.get_dir(), allow_early_resets=True)

    # Seed everything to make things reproducible.
    logger.info('seed={}, logdir={}'.format(seed, logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    env.seed(seed)

    if normalize_observations or normalize_returns:
        env = DummyVecEnv([lambda: env])
        env = VecNormalize(env, norm_obs=normalize_observations,
                           norm_reward=normalize_returns)

    if do_eval:
        eval_env = base_eval_env = gym.make(env_id)
        if hasattr(base_eval_env, 'env'):
            base_eval_env = base_eval_env.env
        for attr in env_params:
            setattr(base_eval_env, attr, env_params[attr])
        eval_env = bench.Monitor(eval_env, os.path.join(logger.get_dir(), 'gym_eval'),
                                 allow_early_resets=True)
        eval_env.seed(seed)
        eval_env.base_env = base_eval_env
    else:
        base_eval_env = None
        eval_env = None

    env.base_env = base_env
    return base_env, env, base_eval_env, eval_env

def train(env_id, num_timesteps, seed): """ Train TRPO model for the atari environment, for testing purposes :param env_id: (str) Environment ID :param num_timesteps: (int) The total number of samples :param seed: (int) The initial seed for training """ rank = MPI.COMM_WORLD.Get_rank() if rank == 0: logger.configure() else: logger.configure(format_strs=[]) workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank() set_global_seeds(workerseed) env = make_atari(env_id) # def policy_fn(name, ob_space, ac_space, sess=None, placeholders=None): # pylint: disable=W0613 # return CnnPolicy(name=name, ob_space=ob_space, ac_space=ac_space, sess=sess, placeholders=placeholders) env = bench.Monitor( env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank))) env.seed(workerseed) env = wrap_deepmind(env) env.seed(workerseed) model = TRPO(CnnPolicy, env, timesteps_per_batch=512, max_kl=0.001, cg_iters=10, cg_damping=1e-3, entcoeff=0.0, gamma=0.98, lam=1, vf_iters=3, vf_stepsize=1e-4) model.learn(total_timesteps=int(num_timesteps * 1.1)) env.close()
def make_mujoco_env(env_id, seed, allow_early_resets=True):
    """
    Create a wrapped, monitored gym.Env for MuJoCo.

    :param env_id: (str) the environment ID
    :param seed: (int) the initial seed for RNG
    :param allow_early_resets: (bool) allows early reset of the environment
    :return: (Gym Environment) The mujoco environment
    """
    set_global_seeds(seed + 10000 * mpi_rank_or_zero())
    env = gym.make(env_id)
    env = Monitor(env, os.path.join(logger.get_dir(), '0'),
                  allow_early_resets=allow_early_resets)
    env.seed(seed)
    return env

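A minimal usage sketch of `make_mujoco_env`. Note that this variant passes `os.path.join(logger.get_dir(), '0')` to Monitor unconditionally, so the logger must be configured first; the `Hopper-v2` ID and the choice of `SAC` are illustrative assumptions:

# Hypothetical usage of make_mujoco_env (env ID and algorithm are assumptions).
from stable_baselines import SAC, logger

logger.configure()  # required: Monitor joins its path onto logger.get_dir()
env = make_mujoco_env('Hopper-v2', seed=42)
model = SAC('MlpPolicy', env, verbose=1)
model.learn(total_timesteps=5000)
env.close()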