def make_robotics_env(env_id, seed, rank=0, allow_early_resets=True):
    """
    Create a wrapped, monitored gym.Env for a Gym robotics task.

    :param env_id: (str) the environment ID
    :param seed: (int) the initial seed for RNG
    :param rank: (int) the rank of the environment (for logging)
    :param allow_early_resets: (bool) allows early reset of the environment
    :return: (Gym Environment) The robotic environment
    """
    set_global_seeds(seed)
    env = gym.make(env_id)
    # Keep only these keys of the dict observation, then flatten it to a vector.
    keys = ['observation', 'desired_goal']
    # TODO: remove try-except once most users are running modern Gym
    try:  # for modern Gym (>=0.15.4)
        from gym.wrappers import FilterObservation, FlattenObservation
        env = FlattenObservation(FilterObservation(env, keys))
    except ImportError:  # for older Gym (<=0.15.3)
        from gym.wrappers import FlattenDictWrapper  # pytype:disable=import-error
        env = FlattenDictWrapper(env, keys)
    env = Monitor(env,
                  logger.get_dir() and os.path.join(logger.get_dir(), str(rank)),
                  info_keywords=('is_success',),
                  allow_early_resets=allow_early_resets)
    env.seed(seed)
    return env
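# Minimal usage sketch for make_robotics_env (added for illustration; not part
# of the original source). It assumes the Gym robotics extras and MuJoCo are
# installed; 'FetchReach-v1' is a hypothetical choice of goal-based task.
def _demo_make_robotics_env():
    env = make_robotics_env('FetchReach-v1', seed=0, rank=0)
    obs = env.reset()  # flat vector: 'observation' and 'desired_goal' concatenated
    obs, reward, done, info = env.step(env.action_space.sample())
    env.close()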
def _thunk():
    # Closure: env_id, seed, rank, allow_early_resets and wrapper_kwargs are
    # captured from the enclosing environment-factory function.
    env = make_atari(env_id)
    env.seed(seed + rank)
    env = Monitor(env,
                  logger.get_dir() and os.path.join(logger.get_dir(), str(rank)),
                  allow_early_resets=allow_early_resets)
    return wrap_deepmind(env, **wrapper_kwargs)
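# Sketch of how such a thunk is usually consumed (an assumption about the
# enclosing code, which is not shown here): SubprocVecEnv takes a list of
# zero-argument callables, one per worker, so that each subprocess builds its
# own environment instance.
def _demo_vectorized_atari(env_id, seed, n_envs=4):
    from stable_baselines.common.vec_env import SubprocVecEnv

    def thunk_for(rank):
        def _init():
            env = make_atari(env_id)
            env.seed(seed + rank)
            return wrap_deepmind(env)
        return _init

    return SubprocVecEnv([thunk_for(rank) for rank in range(n_envs)])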
def train(env_id, num_timesteps, seed):
    """
    Train a TRPO model on an Atari environment, for testing purposes.

    :param env_id: (str) Environment ID
    :param num_timesteps: (int) The total number of samples
    :param seed: (int) The initial seed for training
    """
    rank = MPI.COMM_WORLD.Get_rank()

    if rank == 0:
        logger.configure()
    else:
        logger.configure(format_strs=[])

    # Give each MPI worker its own seed.
    workerseed = seed + 10000 * rank
    set_global_seeds(workerseed)

    env = make_atari(env_id)
    env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
    env.seed(workerseed)
    env = wrap_deepmind(env)

    model = TRPO(CnnPolicy, env, timesteps_per_batch=512, max_kl=0.001, cg_iters=10,
                 cg_damping=1e-3, entcoeff=0.0, gamma=0.98, lam=1, vf_iters=3,
                 vf_stepsize=1e-4)
    model.learn(total_timesteps=int(num_timesteps * 1.1))
    env.close()
    # Free memory
    del env
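# Launch note (an assumption about the surrounding script, not shown here): TRPO
# in this codebase averages gradients across MPI workers, so the script is
# usually started under mpirun, e.g.
#   mpirun -np 8 python run_atari.py --env BreakoutNoFrameskip-v4
# Running it without mpirun also works and simply trains a single worker.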
def make_mujoco_env(env_id, seed, allow_early_resets=True):
    """
    Create a wrapped, monitored gym.Env for MuJoCo.

    :param env_id: (str) the environment ID
    :param seed: (int) the initial seed for RNG
    :param allow_early_resets: (bool) allows early reset of the environment
    :return: (Gym Environment) The mujoco environment
    """
    set_global_seeds(seed + 10000 * mpi_rank_or_zero())
    env = gym.make(env_id)
    env = Monitor(env, os.path.join(logger.get_dir(), '0'),
                  allow_early_resets=allow_early_resets)
    env.seed(seed)
    return env
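# Minimal usage sketch (illustrative, not from the original source). It assumes
# a working MuJoCo install; 'HalfCheetah-v2' is a hypothetical choice of task.
# logger.configure() must run first, because the Monitor path is built from
# logger.get_dir(), which would otherwise be None.
def _demo_make_mujoco_env():
    logger.configure()
    env = make_mujoco_env('HalfCheetah-v2', seed=0)
    obs = env.reset()
    env.close()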
def main():
    """
    Run the Atari test
    """
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--env', help='environment ID', default='BreakoutNoFrameskip-v4')
    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
    parser.add_argument('--prioritized', type=int, default=1)
    parser.add_argument('--dueling', type=int, default=1)
    parser.add_argument('--prioritized-replay-alpha', type=float, default=0.6)
    parser.add_argument('--num-timesteps', type=int, default=int(10e6))
    args = parser.parse_args()

    logger.configure()
    set_global_seeds(args.seed)

    env = make_atari(args.env)
    env = bench.Monitor(env, logger.get_dir())
    env = wrap_atari_dqn(env)
    policy = partial(CnnPolicy, dueling=args.dueling == 1)

    model = DQN(
        env=env,
        policy=policy,
        learning_rate=1e-4,
        buffer_size=10000,
        exploration_fraction=0.1,
        exploration_final_eps=0.01,
        train_freq=4,
        learning_starts=10000,
        target_network_update_freq=1000,
        gamma=0.99,
        prioritized_replay=bool(args.prioritized),
        prioritized_replay_alpha=args.prioritized_replay_alpha,
    )
    model.learn(total_timesteps=args.num_timesteps)
    env.close()
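# Standard entry point for such a script (assumed boilerplate, not shown in the
# original excerpt):
#
#   if __name__ == '__main__':
#       main()
#
# invoked as e.g.: python run_atari.py --env BreakoutNoFrameskip-v4 --dueling 1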
def run(env_id, seed, noise_type, layer_norm, evaluation, **kwargs):
    """
    Run the training of DDPG.

    :param env_id: (str) the environment ID
    :param seed: (int) the initial random seed
    :param noise_type: (str) the desired noise type ('adaptive-param', 'normal' or 'ou');
        multiple noise types can be combined by separating them with commas
    :param layer_norm: (bool) use layer normalization
    :param evaluation: (bool) enable evaluation of DDPG training
    :param kwargs: (dict) extra keywords for the training.train function
    """
    # Configure things.
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        # Disable logging for rank != 0 to avoid noise.
        logger.set_level(logger.DISABLED)

    # Create envs.
    env = gym.make(env_id)
    env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))

    if evaluation and rank == 0:
        eval_env = gym.make(env_id)
        eval_env = bench.Monitor(eval_env, os.path.join(logger.get_dir(), 'gym_eval'))
    else:
        eval_env = None

    # Parse noise_type: each comma-separated token has the form 'name_stddev'.
    action_noise = None
    param_noise = None
    nb_actions = env.action_space.shape[-1]
    for current_noise_type in noise_type.split(','):
        current_noise_type = current_noise_type.strip()
        if current_noise_type == 'none':
            pass
        elif 'adaptive-param' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev),
                                                 desired_action_stddev=float(stddev))
        elif 'normal' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = NormalActionNoise(mean=np.zeros(nb_actions),
                                             sigma=float(stddev) * np.ones(nb_actions))
        elif 'ou' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(nb_actions),
                                                        sigma=float(stddev) * np.ones(nb_actions))
        else:
            raise RuntimeError('unknown noise type "{}"'.format(current_noise_type))

    # Seed everything to make things reproducible.
    seed = seed + 1000000 * rank
    logger.info('rank {}: seed={}, logdir={}'.format(rank, seed, logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    env.seed(seed)
    if eval_env is not None:
        eval_env.seed(seed)

    # Only measure runtime on the rank-0 worker.
    start_time = 0
    if rank == 0:
        start_time = time.time()

    policy = LnMlpPolicy if layer_norm else MlpPolicy

    num_timesteps = kwargs.pop('num_timesteps')

    model = DDPG(policy=policy, env=env, eval_env=eval_env, param_noise=param_noise,
                 action_noise=action_noise, buffer_size=int(1e6), verbose=2, **kwargs)
    model.learn(total_timesteps=num_timesteps)
    env.close()
    if eval_env is not None:
        eval_env.close()

    if rank == 0:
        logger.info('total runtime: {}s'.format(time.time() - start_time))
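# Hedged example call (all values illustrative, not from the original source):
# the noise_type grammar is 'name_stddev' tokens joined by commas, and
# num_timesteps travels via **kwargs before being popped inside run().
def _demo_run_ddpg():
    run('MountainCarContinuous-v0', seed=0, noise_type='ou_0.2,adaptive-param_0.2',
        layer_norm=True, evaluation=False, num_timesteps=10000)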
def make_env():
    env_out = gym.make(env_id)
    env_out = bench.Monitor(env_out, logger.get_dir(), allow_early_resets=True)
    return env_out
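# make_env is a zero-argument closure over env_id, so it can be handed directly
# to a vectorized wrapper (a sketch of the assumed surrounding usage):
def _demo_make_vec_env():
    from stable_baselines.common.vec_env import DummyVecEnv
    return DummyVecEnv([make_env])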