def make_robotics_env(env_id, seed, rank=0):
    """
    Create a wrapped, monitored gym.Env for MuJoCo.
    """
    set_global_seeds(seed)
    env = gym.make(env_id)
    env = FlattenDictWrapper(env, ['observation', 'desired_goal'])
    env = Monitor(
        env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)),
        info_keywords=('is_success',))
    env.seed(seed)
    return env
def make_robotics_env(env_id, seed, rank=0, allow_early_resets=True):
    """
    Create a wrapped, monitored gym.Env for MuJoCo.

    :param env_id: (str) the environment ID
    :param seed: (int) the initial seed for RNG
    :param rank: (int) the rank of the environment (for logging)
    :param allow_early_resets: (bool) allows early reset of the environment
    :return: (Gym Environment) The robotic environment
    """
    set_global_seeds(seed)
    env = gym.make(env_id)
    keys = ['observation', 'desired_goal']
    # TODO: remove try-except once most users are running modern Gym
    try:  # for modern Gym (>=0.15.4)
        from gym.wrappers import FilterObservation, FlattenObservation
        env = FlattenObservation(FilterObservation(env, keys))
    except ImportError:  # for older gym (<=0.15.3)
        from gym.wrappers import FlattenDictWrapper  # pytype:disable=import-error
        env = FlattenDictWrapper(env, keys)
    env = Monitor(env,
                  logger.get_dir() and os.path.join(logger.get_dir(), str(rank)),
                  info_keywords=('is_success',),
                  allow_early_resets=allow_early_resets)
    env.seed(seed)
    return env
def _make_robosuite_env():
    from gym.wrappers import FlattenDictWrapper
    from baselines.bench import Monitor
    env = suite.make(env_id)
    env = FlattenDictWrapper(env, ['robot-state', 'object-state'])
    env = Monitor(env, logger.get_dir(), allow_early_resets=True)
    return env
def test_flatten2dict():
    dict_env = gym.make('PendulumDictEnv-v0')
    dict_env = GymEnv(dict_env)
    dict_ob = dict_env.observation_space.sample()
    dict_observation_space = dict_env.observation_space
    env = FlattenDictWrapper(dict_env, dict_env.observation_space.spaces.keys())
    flatten_ob = env.observation(dict_ob)
    dict_keys = env.dict_keys
    recovered_dict_ob = flatten_to_dict(flatten_ob, dict_observation_space, dict_keys)
    tf = []
    for (a_key, a_val), (b_key, b_val) in zip(dict_ob.items(), recovered_dict_ob.items()):
        tf.append(a_key == b_key)
        tf.append(all(a_val == b_val))
    assert all(tf)
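The flatten_to_dict helper used above inverts the wrapper's concatenation. A minimal sketch of such a helper, assuming the flat vector preserves key order and each sub-space exposes its shape (the real helper in the test's codebase may differ):

import numpy as np

def flatten_to_dict(flat_ob, dict_space, dict_keys):
    # Split the flat vector back into per-key arrays, in key order.
    recovered, start = {}, 0
    for key in dict_keys:
        shape = dict_space.spaces[key].shape
        size = int(np.prod(shape))
        recovered[key] = np.asarray(flat_ob[start:start + size]).reshape(shape)
        start += size
    return recovered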
def make_robotics_env(env_id, seed, rank=0):
    """
    Create a wrapped, monitored gym.Env for MuJoCo.

    :param env_id: (str) the environment ID
    :param seed: (int) the initial seed for RNG
    :param rank: (int) the rank of the environment (for logging)
    :return: (Gym Environment) The robotic environment
    """
    set_global_seeds(seed)
    env = gym.make(env_id)
    env = FlattenDictWrapper(env, ['observation', 'desired_goal'])
    env = Monitor(env,
                  logger.get_dir() and os.path.join(logger.get_dir(), str(rank)),
                  info_keywords=('is_success',))
    env.seed(seed)
    return env
def _thunk():
    env = gym.make(env_id)
    if 'Fetch' not in env_id:
        env = FlattenObservation(env)
    else:
        env = FlattenDictWrapper(env, ['achieved_goal', 'desired_goal'])
    env = RandomizedEnvWrapper(env, seed + rank)
    env.seed(seed + rank)
    return env
def environment(spec, kwargs):
    env = FlattenDictWrapper(spec.make(**kwargs),
                             ['observation', 'desired_goal', 'achieved_goal'])
    ob_space = env.observation_space
    act_space = env.action_space
    ob = env.reset()
    assert ob_space.contains(ob), 'Reset observation: {!r} not in space'.format(ob)
    a = act_space.sample()
    observation, reward, done, _info = env.step(a)
    assert ob_space.contains(observation), 'Step observation: {!r} not in space'.format(observation)
    assert np.isscalar(reward), "{} is not a scalar for {}".format(reward, env)
    assert isinstance(done, bool), "Expected {} to be a boolean".format(done)

    for mode in env.metadata.get('render.modes', []):
        env.render(mode=mode)
    env.close()

    # Make sure we can render the environment after close.
    for mode in env.metadata.get('render.modes', []):
        env.render(mode=mode)
def run_games_for_agent(self, agent_number, agent_class):
    """Runs a set of games for a given agent, saving the results in self.results"""
    agent_results = []
    agent_name = agent_class.agent_name
    agent_group = self.agent_to_agent_group[agent_name]
    agent_round = 1
    agent_config = copy.deepcopy(self.config)
    if self.environment_has_changeable_goals(agent_config.environment) \
            and self.agent_cant_handle_changeable_goals_without_flattening(agent_name):
        print("Flattening changeable-goal environment for agent {}".format(agent_name))
        agent_config.environment = FlattenDictWrapper(
            agent_config.environment, dict_keys=["observation", "desired_goal"])
    if self.config.randomise_random_seed:
        agent_config.seed = random.randint(0, 2**32 - 2)
    agent_config.hyperparameters = agent_config.hyperparameters[agent_group]
    print("AGENT NAME: {}".format(agent_name))

    manager = mp.Manager()
    return_q = manager.Queue()
    agent = agent_class(agent_config)
    self.environment_name = agent.environment_title
    jobs = []
    for i in range(self.config.runs_per_agent):
        p = mp.Process(target=agent.run_n_episodes, args=(return_q,))
        jobs.append(p)
        p.start()
    for proc in jobs:
        proc.join()

    for game_scores, rolling_scores, time_taken in iter(return_q.get, None):
        agent_results.append([
            game_scores, rolling_scores, len(rolling_scores),
            -1 * max(rolling_scores), time_taken
        ])
        if return_q.empty():
            break
    self.results[agent_name] = agent_results
def _make_env(env_id, env_type, seed, reward_shaping, frame_stack, **kwargs):
    """Make single env"""
    check_name_in_list(env_id, env_type)  # check existence of env_id in env_type
    if env_type == 'atari':
        env = gym.make(env_id)
        env = NoopResetEnv(env, noop_max=30)
        if 'NoFrameskip' in env.spec.id:
            env = MaxAndSkipEnv(env, skip=4)
        env = Monitor(env)
        # deepmind wrap
        env = EpisodicLifeEnv(env)
        if 'FIRE' in env.unwrapped.get_action_meanings():
            env = FireResetEnv(env)
        env = WarpFrame(env)
        env = ClipRewardEnv(env)
        if frame_stack:
            env = FrameStack(env, 4)
    elif env_type in ['classic_control', 'box2d', 'mujoco']:
        env = gym.make(env_id).unwrapped
        max_episode_steps = kwargs.get('max_episode_steps')
        if max_episode_steps is not None:
            env = TimeLimit(env, max_episode_steps)
        env = Monitor(env)
    elif env_type == 'robotics':
        env = gym.make(env_id)
        env = FlattenDictWrapper(env, ['observation', 'desired_goal'])
        env = Monitor(env, info_keywords=('is_success',))
    elif env_type == 'dm_control':
        env = gym.make('dm2gym:' + env_id,
                       environment_kwargs={'flat_observation': True})
        env = DmObsTrans(env)
    elif env_type == 'rlbench':
        from rlzoo.common.build_rlbench_env import RLBenchEnv
        state_type = kwargs.get('state_type')
        env = RLBenchEnv(env_id) if state_type is None else RLBenchEnv(env_id, state_type)
    else:
        raise NotImplementedError

    if reward_shaping is not None:
        if callable(reward_shaping):
            env = RewardShaping(env, reward_shaping)
        else:
            raise ValueError('reward_shaping parameter must be callable')
    env.seed(seed)
    return env
def random_rollout(spec, kwargs):
    env = FlattenDictWrapper(spec.make(**kwargs),
                             ['observation', 'desired_goal', 'achieved_goal'])
    agent = lambda ob: env.action_space.sample()
    ob = env.reset()
    for _ in range(10):
        assert env.observation_space.contains(ob)
        a = agent(ob)
        assert env.action_space.contains(a)
        (ob, _reward, done, _info) = env.step(a)
        if done:
            break
    env.close()
def make_env(env_id, rank, log_dir=None, allow_early_resets=True, flatten_dict=False, kwargs=None):
    """
    Create a wrapped, monitored gym.Env for MuJoCo.

    :param env_id: (str) the environment ID
    :param rank: (int) the rank of the environment (for logging)
    :param log_dir: (str) the log directory for the Monitor wrapper
    :param allow_early_resets: (bool) allows early reset of the environment
    :param flatten_dict: (bool) whether to flatten the dict observation
    :param kwargs: (dict) extra keyword arguments passed to the environment
    :return: (Gym Environment) The mujoco environment
    """
    if env_id in ENTRY_POINT.keys():
        kwargs = kwargs.copy()
        max_episode_steps = None
        if 'max_episode_steps' in kwargs:
            max_episode_steps = kwargs['max_episode_steps']
            del kwargs['max_episode_steps']
        gym.register(env_id, entry_point=ENTRY_POINT[env_id],
                     max_episode_steps=max_episode_steps, kwargs=kwargs)
        env = gym.make(env_id)
    else:
        raise NotImplementedError
    if flatten_dict:
        env = FlattenDictWrapper(env, ['observation', 'achieved_goal', 'desired_goal'])
    if 'FetchStack' in env_id and ('Unlimit' not in env_id) and max_episode_steps is None:
        from utils.wrapper import FlexibleTimeLimitWrapper
        env = FlexibleTimeLimitWrapper(env, 100)
    if kwargs['reward_type'] != 'sparse':
        env = DoneOnSuccessWrapper(env, 0.0)
    else:
        env = DoneOnSuccessWrapper(env)
    if log_dir is not None:
        env = Monitor(env,
                      os.path.join(log_dir, str(rank) + ".monitor.csv"),
                      allow_early_resets=allow_early_resets,
                      info_keywords=('is_success',))
    return env
def make_robotics_env(env_id, seed, rank=0):
    """
    Create a wrapped, monitored gym.Env for MuJoCo.
    """
    set_global_seeds(seed)
    env = gym.make(env_id)
    # env = FlattenDictWrapper(env, ['observation', 'desired_goal'])
    keys = ['observation', 'desired_goal']
    # TODO: remove try-except once most users are running modern Gym
    try:  # for modern Gym (>=0.15.4)
        from gym.wrappers import FilterObservation, FlattenObservation
        env = FlattenObservation(FilterObservation(env, keys))
    except ImportError:  # for older gym (<=0.15.3)
        from gym.wrappers import FlattenDictWrapper  # pytype:disable=import-error
        env = FlattenDictWrapper(env, keys)
    env = Monitor(env,
                  logger.get_dir() and os.path.join(logger.get_dir(), str(rank)),
                  info_keywords=('is_success',))
    env.seed(seed)
    return env
def create_environment(name: str) -> gym.Env:
    print('Creating environment %s...' % name)
    ids = name.split('-')
    framework = ids[0].lower()
    env_id = '-'.join(ids[1:])
    if framework == 'dm':
        from envs.deepmind import DMSuiteEnv
        return DMSuiteEnv(env_id)
    elif framework == 'gym':
        env = gym.make(env_id).env
        from gym.envs.robotics.robot_env import RobotEnv
        if not isinstance(env, RobotEnv):
            env = BetterRgbRenderingEnv(env)
        if isinstance(env.observation_space, gym.spaces.Dict):
            from gym.wrappers import FlattenDictWrapper
            env = FlattenDictWrapper(env, env.observation_space.spaces.keys())
        return env
    elif framework == 'rllab':
        from envs.rllab import RllabEnv
        return RllabEnv(env_id)
    raise LookupError('Could not find environment "%s".' % name)
def _thunk():
    if env_id.startswith("dm"):
        _, domain, task = env_id.split('.')
        # env = suite.load(domain, task, environment_kwargs=dict(flat_observation=True))
        # env = DMControlEnv(env)
        p = "dm2gym:" + domain.capitalize() + task.capitalize() + "-v0"
        env = gym.make(p, environment_kwargs=dict(flat_observation=True))
        env = FlattenDictWrapper(env, ['observations'])
    else:
        env = gym.make(env_id)

    is_atari = hasattr(gym.envs, 'atari') and isinstance(
        env.unwrapped, gym.envs.atari.atari_env.AtariEnv)
    if is_atari:
        env = make_atari(env_id)

    env.seed(seed + rank)

    obs_shape = env.observation_space.shape

    if str(env.__class__.__name__).find('TimeLimit') >= 0:
        env = TimeLimitMask(env)

    if log_dir is not None:
        env = bench.Monitor(env,
                            os.path.join(log_dir, str(rank)),
                            allow_early_resets=allow_early_resets)

    if is_atari:
        if len(env.observation_space.shape) == 3:
            env = wrap_deepmind(env)
    elif len(env.observation_space.shape) == 3:
        raise NotImplementedError(
            "CNN models work only for atari,\n"
            "please use a custom wrapper for a custom pixel input env.\n"
            "See wrap_deepmind for an example.")

    # If the input has shape (W,H,3), wrap for PyTorch convolutions
    obs_shape = env.observation_space.shape
    if len(obs_shape) == 3 and obs_shape[2] in [1, 3]:
        env = TransposeImage(env, op=[2, 0, 1])

    return env
def main(args):
    log_dir = args.log_path if (args.log_path is not None) else \
        "/tmp/stable_baselines_" + time.strftime('%Y-%m-%d-%H-%M-%S')
    if MPI is None or MPI.COMM_WORLD.Get_rank() == 0:
        rank = 0
        configure_logger(log_dir)
    else:
        rank = MPI.COMM_WORLD.Get_rank()
        configure_logger(log_dir, format_strs=[])
    set_global_seeds(args.seed)

    model_class = SAC_parallel

    n_workers = args.num_workers if not args.play else 1
    env_kwargs = get_env_kwargs(args.env,
                                random_ratio=args.random_ratio,
                                sequential=args.sequential,
                                reward_type=args.reward_type,
                                n_object=args.n_object)

    def make_thunk(rank):
        return lambda: make_env(env_id=args.env, rank=rank, log_dir=log_dir, kwargs=env_kwargs)

    env = ParallelSubprocVecEnv([make_thunk(i) for i in range(n_workers)],
                                reset_when_done=True)

    if os.path.exists(os.path.join(logger.get_dir(), 'eval.csv')):
        os.remove(os.path.join(logger.get_dir(), 'eval.csv'))
        print('Remove existing eval.csv')
    eval_env_kwargs = env_kwargs.copy()
    eval_env_kwargs['random_ratio'] = 0.0
    eval_env = make_env(env_id=args.env, rank=0, kwargs=eval_env_kwargs)
    eval_env = FlattenDictWrapper(eval_env, ['observation', 'achieved_goal', 'desired_goal'])

    if not args.play:
        os.makedirs(log_dir, exist_ok=True)

    # Available strategies (cf paper): future, final, episode, random
    goal_selection_strategy = 'future'  # equivalent to GoalSelectionStrategy.FUTURE

    if not args.play:
        from stable_baselines.ddpg.noise import NormalActionNoise
        noise_type = args.action_noise.split('_')[0]
        if noise_type == 'none':
            parsed_action_noise = None
        elif noise_type == 'normal':
            sigma = float(args.action_noise.split('_')[1])
            parsed_action_noise = NormalActionNoise(
                mean=np.zeros(env.action_space.shape),
                sigma=sigma * np.ones(env.action_space.shape))
        else:
            raise NotImplementedError
        train_kwargs = get_train_kwargs("sac", args, parsed_action_noise, eval_env)

        def callback(_locals, _globals):
            if _locals['step'] % int(1e3) == 0:
                if 'FetchStack' in args.env:
                    mean_eval_reward = stack_eval_model(
                        eval_env, _locals["self"],
                        init_on_table=(args.env == 'FetchStack-v2'))
                elif 'MasspointPushDoubleObstacle-v2' in args.env:
                    mean_eval_reward = egonav_eval_model(
                        eval_env, _locals["self"], env_kwargs["random_ratio"],
                        fixed_goal=np.array([4., 4., 0.15, 0., 0., 0., 1.]))
                    mean_eval_reward2 = egonav_eval_model(
                        eval_env, _locals["self"], env_kwargs["random_ratio"],
                        goal_idx=0,
                        fixed_goal=np.array([4., 4., 0.15, 1., 0., 0., 0.]))
                    log_eval(_locals['self'].num_timesteps, mean_eval_reward2,
                             file_name="eval_box.csv")
                else:
                    mean_eval_reward = eval_model(eval_env, _locals["self"])
                log_eval(_locals['self'].num_timesteps, mean_eval_reward)
            if _locals['step'] % int(2e4) == 0:
                model_path = os.path.join(log_dir, 'model_' + str(_locals['step'] // int(2e4)))
                model.save(model_path)
                print('model saved to', model_path)
            return True

        class CustomSACPolicy(SACPolicy):
            def __init__(self, *model_args, **model_kwargs):
                super(CustomSACPolicy, self).__init__(
                    *model_args, **model_kwargs,
                    layers=[256, 256] if 'MasspointPushDoubleObstacle' in args.env
                    else [256, 256, 256, 256],
                    feature_extraction="mlp")

        register_policy('CustomSACPolicy', CustomSACPolicy)
        from utils.sac_attention_policy import AttentionPolicy
        register_policy('AttentionPolicy', AttentionPolicy)
        policy_kwargs = get_policy_kwargs("sac", args)

        if rank == 0:
            print('train_kwargs', train_kwargs)
            print('policy_kwargs', policy_kwargs)
        # Wrap the model
        model = HER2(args.policy, env, model_class,
                     n_sampled_goal=4,
                     goal_selection_strategy=goal_selection_strategy,
                     num_workers=args.num_workers,
                     policy_kwargs=policy_kwargs,
                     verbose=1,
                     **train_kwargs)
        print(model.get_parameter_list())

        # Train the model
        model.learn(int(args.num_timesteps),
                    seed=args.seed,
                    callback=callback,
                    log_interval=100 if not ('MasspointMaze-v3' in args.env) else 10)

        if rank == 0:
            model.save(os.path.join(log_dir, 'final'))

    # WARNING: you must pass an env
    # or wrap your environment with HERGoalEnvWrapper to use the predict method
    if args.play and rank == 0:
        assert args.load_path is not None
        model = HER2.load(args.load_path, env=env)
        fig, ax = plt.subplots(1, 1, figsize=(8, 8))
        obs = env.reset()
        if 'FetchStack' in args.env:
            env.env_method('set_task_array', [[(env.get_attr('n_object')[0], 0)]])
            obs = env.reset()
            while env.get_attr('current_nobject')[0] != env.get_attr('n_object')[0] \
                    or env.get_attr('task_mode')[0] != 1:
                obs = env.reset()
        elif 'FetchPushWallObstacle' in args.env:
            while not (0.7 < obs['observation'][0][4] < 0.8):
                obs = env.reset()
            env.env_method('set_goal', [np.array([1.18, 0.8, 0.425, 1, 0])])
            obs = env.env_method('get_obs')
            obs = {'observation': obs[0]['observation'][None],
                   'achieved_goal': obs[0]['achieved_goal'][None],
                   'desired_goal': obs[0]['desired_goal'][None]}
            # obs[0] = np.concatenate([obs[0][key] for key in ['observation', 'achieved_goal', 'desired_goal']])
        elif 'MasspointPushDoubleObstacle' in args.env or 'FetchPushWallObstacle' in args.env:
            while np.argmax(obs['desired_goal'][0][3:]) != 0:
                obs = env.reset()
        elif 'MasspointMaze-v2' in args.env:
            while obs['observation'][0][0] < 3 or obs['observation'][0][1] < 3:
                obs = env.reset()
            env.env_method('set_goal', [np.array([1., 1., 0.15])])
            obs = env.env_method('get_obs')
            obs = {'observation': obs[0]['observation'][None],
                   'achieved_goal': obs[0]['achieved_goal'][None],
                   'desired_goal': obs[0]['desired_goal'][None]}

        print('goal', obs['desired_goal'][0], 'obs', obs['observation'][0])
        episode_reward = 0.0
        images = []
        frame_idx = 0
        num_episode = 0
        for i in range(env_kwargs['max_episode_steps'] * 10):
            img = env.render(mode='rgb_array')
            ax.cla()
            ax.imshow(img)
            tasks = ['pick and place', 'stack']
            ax.set_title('episode ' + str(num_episode) + ', frame ' + str(frame_idx)
                         + ', task: ' + tasks[np.argmax(obs['observation'][0][-2:])])
            images.append(img)
            action, _ = model.predict(obs, deterministic=True)
            obs, reward, done, _ = env.step(action)
            episode_reward += reward
            frame_idx += 1
            if args.export_gif:
                plt.imsave(os.path.join(os.path.dirname(args.load_path), 'tempimg%d.png' % i), img)
            else:
                plt.pause(0.02)
            if done:
                print('episode_reward', episode_reward)
                obs = env.reset()
                if 'FetchStack' in args.env:
                    while env.get_attr('current_nobject')[0] != env.get_attr('n_object')[0] or \
                            env.get_attr('task_mode')[0] != 1:
                        obs = env.reset()
                elif 'MasspointPushDoubleObstacle' in args.env or 'FetchPushWallObstacle' in args.env:
                    while np.argmax(obs['desired_goal'][0][3:]) != 0:
                        obs = env.reset()
                print('goal', obs['desired_goal'][0])
                episode_reward = 0.0
                frame_idx = 0
                num_episode += 1
                if num_episode >= 1:
                    break

        if args.export_gif:
            os.system('ffmpeg -r 5 -start_number 0 -i ' + os.path.dirname(args.load_path)
                      + '/tempimg%d.png -c:v libx264 -pix_fmt yuv420p '
                      + os.path.join(os.path.dirname(args.load_path), args.env + '.mp4'))
            for i in range(env_kwargs['max_episode_steps'] * 10):
                # images.append(plt.imread('tempimg' + str(i) + '.png'))
                try:
                    os.remove(os.path.join(os.path.dirname(args.load_path),
                                           'tempimg' + str(i) + '.png'))
                except OSError:
                    pass
        exit()
def main(args):
    log_dir = args.log_path if (args.log_path is not None) else \
        "/tmp/stable_baselines_" + time.strftime('%Y-%m-%d-%H-%M-%S')
    if MPI is None or MPI.COMM_WORLD.Get_rank() == 0:
        rank = 0
        configure_logger(log_dir)
    else:
        rank = MPI.COMM_WORLD.Get_rank()
        configure_logger(log_dir, format_strs=[])
    set_global_seeds(args.seed)

    model_class = SAC_SIR  # works also with SAC, DDPG and TD3

    env_kwargs = get_env_kwargs(args.env,
                                random_ratio=args.random_ratio,
                                sequential=args.sequential,
                                reward_type=args.reward_type,
                                n_object=args.n_object)

    def make_thunk(rank):
        return lambda: make_env(env_id=args.env, rank=rank, log_dir=log_dir, kwargs=env_kwargs)

    env = ParallelSubprocVecEnv([make_thunk(i) for i in range(args.num_workers)],
                                reset_when_done=True)

    def make_thunk_aug(rank):
        return lambda: FlattenDictWrapper(
            make_env(env_id=aug_env_name, rank=rank, kwargs=aug_env_kwargs),
            ['observation', 'achieved_goal', 'desired_goal'])

    aug_env_kwargs = env_kwargs.copy()
    del aug_env_kwargs['max_episode_steps']
    aug_env_name = args.env.split('-')[0] + 'Unlimit-' + args.env.split('-')[1]
    aug_env = ParallelSubprocVecEnv([make_thunk_aug(i) for i in range(args.num_workers)],
                                    reset_when_done=False)

    if os.path.exists(os.path.join(logger.get_dir(), 'eval.csv')):
        os.remove(os.path.join(logger.get_dir(), 'eval.csv'))
        print('Remove existing eval.csv')
    eval_env_kwargs = env_kwargs.copy()
    eval_env_kwargs['random_ratio'] = 0.0
    eval_env = make_env(env_id=args.env, rank=0, kwargs=eval_env_kwargs)
    eval_env = FlattenDictWrapper(eval_env, ['observation', 'achieved_goal', 'desired_goal'])

    if not args.play:
        os.makedirs(log_dir, exist_ok=True)

    # Available strategies (cf paper): future, final, episode, random
    goal_selection_strategy = 'future'  # equivalent to GoalSelectionStrategy.FUTURE

    if not args.play:
        from stable_baselines.ddpg.noise import NormalActionNoise
        noise_type = args.action_noise.split('_')[0]
        if noise_type == 'none':
            parsed_action_noise = None
        elif noise_type == 'normal':
            sigma = float(args.action_noise.split('_')[1])
            parsed_action_noise = NormalActionNoise(
                mean=np.zeros(env.action_space.shape),
                sigma=sigma * np.ones(env.action_space.shape))
        else:
            raise NotImplementedError
        train_kwargs = get_train_kwargs("sac_sir", args, parsed_action_noise, eval_env, aug_env)

        def callback(_locals, _globals):
            if _locals['step'] % int(1e3) == 0:
                if 'FetchStack' in args.env:
                    mean_eval_reward = stack_eval_model(
                        eval_env, _locals["self"],
                        init_on_table=(args.env == 'FetchStack-v2'))
                elif 'MasspointPushDoubleObstacle-v2' in args.env:
                    mean_eval_reward = egonav_eval_model(
                        eval_env, _locals["self"], env_kwargs["random_ratio"],
                        fixed_goal=np.array([4., 4., 0.15, 0., 0., 0., 1.]))
                    mean_eval_reward2 = egonav_eval_model(
                        eval_env, _locals["self"], env_kwargs["random_ratio"],
                        goal_idx=0,
                        fixed_goal=np.array([4., 4., 0.15, 1., 0., 0., 0.]))
                    log_eval(_locals['self'].num_timesteps, mean_eval_reward2,
                             file_name="eval_box.csv")
                else:
                    mean_eval_reward = eval_model(eval_env, _locals["self"])
                log_eval(_locals['self'].num_timesteps, mean_eval_reward)
            if _locals['step'] % int(2e4) == 0:
                model_path = os.path.join(log_dir, 'model_' + str(_locals['step'] // int(2e4)))
                model.save(model_path)
                print('model saved to', model_path)
            return True

        class CustomSACPolicy(SACPolicy):
            def __init__(self, *model_args, **model_kwargs):
                super(CustomSACPolicy, self).__init__(
                    *model_args, **model_kwargs,
                    layers=[256, 256] if 'MasspointPushDoubleObstacle' in args.env
                    else [256, 256, 256, 256],
                    feature_extraction="mlp")

        register_policy('CustomSACPolicy', CustomSACPolicy)
        from utils.sac_attention_policy import AttentionPolicy
        register_policy('AttentionPolicy', AttentionPolicy)
        policy_kwargs = get_policy_kwargs("sac_sir", args)

        if rank == 0:
            print('train_kwargs', train_kwargs)
            print('policy_kwargs', policy_kwargs)
        # Wrap the model
        model = HER2(args.policy, env, model_class,
                     n_sampled_goal=4,
                     start_augment_time=args.start_augment,
                     goal_selection_strategy=goal_selection_strategy,
                     num_workers=args.num_workers,
                     policy_kwargs=policy_kwargs,
                     verbose=1,
                     **train_kwargs)
        print(model.get_parameter_list())

        # Train the model
        model.learn(int(args.num_timesteps),
                    seed=args.seed,
                    callback=callback,
                    log_interval=100 if not ('MasspointMaze-v3' in args.env) else 10)

        if rank == 0:
            model.save(os.path.join(log_dir, 'final'))
def make_thunk_aug(rank):
    return lambda: FlattenDictWrapper(
        make_env(env_id=aug_env_name, rank=rank, kwargs=aug_env_kwargs),
        ['observation', 'achieved_goal', 'desired_goal'])
def load(environment_name,
         env_id=None,
         concat_desired_goal=True,
         discount=1.0,
         max_episode_steps=None,
         sparse_reward=False,
         use_success_wrapper=True,
         gym_env_wrappers=(),
         alf_env_wrappers=(),
         wrap_with_process=False):
    """Loads the selected environment and wraps it with the specified wrappers.

    Note that by default a ``TimeLimit`` wrapper is used to limit episode
    lengths to the default benchmarks defined by the registered environments.

    Args:
        environment_name: Name for the environment to load.
        env_id: A scalar ``Tensor`` of the environment ID of the time step.
        concat_desired_goal: If True, concatenate the robot's observation and
            the goal location into one flat observation.
        discount: Discount to use for the environment.
        max_episode_steps: If None the ``max_episode_steps`` will be set to the
            default step limit defined in the environment's spec. No limit is
            applied if set to 0 or if there is no ``timestep_limit`` set in the
            environment's spec.
        sparse_reward (bool): If True, the game ends once the goal is achieved.
            Rewards will be shifted by 1, changed from -1/0 to 0/1.
        use_success_wrapper (bool): If True, wraps the environment with the
            SuccessWrapper which will record success info after a specified
            number of timesteps.
        gym_env_wrappers: Iterable with references to wrapper classes to use
            directly on the gym environment.
        alf_env_wrappers: Iterable with references to wrapper classes to use on
            the torch environment.
        wrap_with_process: If True, run the environment in a separate process.

    Returns:
        An AlfEnvironment instance.
    """
    assert (environment_name.startswith("Fetch")
            or environment_name.startswith("HandManipulate")), (
                "This suite only supports OpenAI's Fetch and ShadowHand envs!")

    _unwrapped_env_checker_.check_and_update(wrap_with_process)

    gym_spec = gym.spec(environment_name)
    env = gym_spec.make()

    if max_episode_steps is None:
        if gym_spec.max_episode_steps is not None:
            max_episode_steps = gym_spec.max_episode_steps
        else:
            max_episode_steps = 0

    def env_ctor(env_id=None):
        return suite_gym.wrap_env(
            env,
            env_id=env_id,
            discount=discount,
            max_episode_steps=max_episode_steps,
            gym_env_wrappers=gym_env_wrappers,
            alf_env_wrappers=alf_env_wrappers,
            image_channel_first=False)

    # concat robot's observation and the goal location
    if concat_desired_goal:
        keys = ["observation", "desired_goal"]
        try:  # for modern Gym (>=0.15.4)
            from gym.wrappers import FilterObservation, FlattenObservation
            env = FlattenObservation(FilterObservation(env, keys))
        except ImportError:  # for older gym (<=0.15.3)
            from gym.wrappers import FlattenDictWrapper  # pytype:disable=import-error
            env = FlattenDictWrapper(env, keys)

    if use_success_wrapper:
        env = SuccessWrapper(env, max_episode_steps)
    env = ObservationClipWrapper(env)
    if sparse_reward:
        env = SparseReward(env)

    if wrap_with_process:
        process_env = process_environment.ProcessEnvironment(
            functools.partial(env_ctor))
        process_env.start()
        torch_env = alf_wrappers.AlfEnvironmentBaseWrapper(process_env)
    else:
        torch_env = env_ctor(env_id=env_id)
    return torch_env
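The sparse_reward behavior described in the docstring (shift the Fetch-style -1/0 reward to 0/1 and terminate once the goal is achieved) can be pictured with a minimal wrapper sketch. This assumes an `is_success` flag in `info` and is not ALF's actual SparseReward implementation:

import gym

class SparseRewardSketch(gym.Wrapper):
    # Illustrative only: shift the -1/0 reward to 0/1 and end the
    # episode as soon as the goal is achieved.
    def step(self, action):
        obs, reward, done, info = self.env.step(action)
        reward += 1.0
        if info.get('is_success'):
            done = True
        return obs, reward, done, info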
def run_games_for_agent(self, agent_number, agent_class):
    """Runs a set of games for a given agent, saving the results in self.results"""
    agent_results = []
    agent_name = agent_class.agent_name
    agent_group = self.agent_to_agent_group[agent_name]
    agent_round = 1
    for run in range(self.config.runs_per_agent):
        agent_config = copy.deepcopy(self.config)
        if self.environment_has_changeable_goals(agent_config.environment) \
                and self.agent_cant_handle_changeable_goals_without_flattening(agent_name):
            print("Flattening changeable-goal environment for agent {}".format(agent_name))
            agent_config.environment = FlattenDictWrapper(
                agent_config.environment, dict_keys=["observation", "desired_goal"])
        if self.config.randomise_random_seed:
            agent_config.seed = random.randint(0, 2**32 - 2)
        agent_config.hyperparameters = agent_config.hyperparameters[agent_group]
        print("AGENT NAME: {}".format(agent_name))
        print("\033[1m" + "{}.{}: {}".format(agent_number, agent_round, agent_name)
              + "\033[0m", flush=True)
        agent = agent_class(agent_config)
        self.environment_name = agent.environment_title
        print(agent.hyperparameters)
        print("RANDOM SEED ", agent_config.seed)
        game_scores, rolling_scores, time_taken = agent.run_n_episodes()
        print("Time taken: {}".format(time_taken), flush=True)
        self.print_two_empty_lines()

        episode_succeeded = agent.achieved_required_score_at_index()
        if 0 <= episode_succeeded <= 1:
            # Runs that succeed this early are treated as anomalies and discarded.
            print("Since this run succeeded at episode: {}, it will be neglected"
                  .format(episode_succeeded))
        else:
            agent_results.append([
                game_scores, rolling_scores, len(rolling_scores),
                -1 * max(rolling_scores), time_taken
            ])

        if self.config.visualise_individual_results:
            self.visualise_overall_agent_results([rolling_scores], agent_name,
                                                 show_each_run=True)
        agent_round += 1
    self.results[agent_name] = agent_results
def make_env(env_id,
             env_type,
             mpi_rank=0,
             subrank=0,
             seed=None,
             reward_scale=1.0,
             gamestate=None,
             flatten_dict_observations=True,
             wrapper_kwargs=None,
             env_kwargs=None,
             logger_dir=None,
             initializer=None):
    if initializer is not None:
        initializer(mpi_rank=mpi_rank, subrank=subrank)

    wrapper_kwargs = wrapper_kwargs or {}
    env_kwargs = env_kwargs or {}
    if ':' in env_id:
        import re
        import importlib
        module_name = re.sub(':.*', '', env_id)
        env_id = re.sub('.*:', '', env_id)
        importlib.import_module(module_name)
    if env_type == 'atari':
        env = make_atari(env_id)
    elif env_type == 'retro':
        import retro
        gamestate = gamestate or retro.State.DEFAULT
        env = retro_wrappers.make_retro(
            game=env_id,
            max_episode_steps=10000,
            use_restricted_actions=retro.Actions.DISCRETE,
            state=gamestate)
    elif env_type == 'robotics':
        env = gym.make(env_id)
        env = FlattenDictWrapper(env, ['observation', 'achieved_goal', 'desired_goal'])
    else:
        if env_id == 'LunarLanderContinuousPOMDP-v0':
            env = new_lunar_lander_pomdp_env(hist_len=hist_len,
                                             block_high=block_high,
                                             not_guided=not_guided,
                                             give_state=give_state)
        else:
            env = gym.make(env_id, **env_kwargs)

    if flatten_dict_observations and isinstance(env.observation_space, gym.spaces.Dict):
        keys = env.observation_space.spaces.keys()
        env = gym.wrappers.FlattenDictWrapper(env, dict_keys=list(keys))

    env.seed(seed + subrank if seed is not None else None)
    env = Monitor(env,
                  logger_dir and os.path.join(logger_dir, str(mpi_rank) + '.' + str(subrank)),
                  allow_early_resets=True)

    if env_type == 'atari':
        env = wrap_deepmind(env, **wrapper_kwargs)
    elif env_type == 'retro':
        if 'frame_stack' not in wrapper_kwargs:
            wrapper_kwargs['frame_stack'] = 1
        env = retro_wrappers.wrap_deepmind_retro(env, **wrapper_kwargs)

    if isinstance(env.action_space, gym.spaces.Box):
        env = ClipActionsWrapper(env)

    if reward_scale != 1:
        env = retro_wrappers.RewardScaler(env, reward_scale)

    return env
def setUp(self):
    dict_env = gym.make('PendulumDictEnv-v0')
    self.dict_observation_space = dict_env.observation_space
    env = FlattenDictWrapper(dict_env, dict_env.observation_space.spaces.keys())
    self.env = GymEnv(env)
env_id = sys.argv[1]
model_path = sys.argv[2]
env_kwargs = get_env_kwargs(env_id, random_ratio=0.0)

def make_thunk(rank):
    return lambda: make_env(env_id=env_id, rank=rank, kwargs=env_kwargs)

env = ParallelSubprocVecEnv([make_thunk(i) for i in range(1)], reset_when_done=True)

aug_env_id = env_id.split('-')[0] + 'Unlimit-' + env_id.split('-')[1]
aug_env_kwargs = env_kwargs.copy()
aug_env_kwargs['max_episode_steps'] = None
aug_env = make_env(aug_env_id, rank=0, kwargs=aug_env_kwargs)
aug_env = FlattenDictWrapper(aug_env, ['observation', 'achieved_goal', 'desired_goal'])

goal_dim = aug_env.goal.shape[0]
obs_dim = aug_env.observation_space.shape[0] - 2 * goal_dim
noise_mag = aug_env.size_obstacle[1]
n_object = aug_env.n_object

model = HER2.load(model_path, env=env)
model.model.env_id = env_id
model.model.goal_dim = goal_dim
model.model.obs_dim = obs_dim
model.model.noise_mag = noise_mag
model.model.n_object = n_object

count1 = 0
count2 = 0
fail1 = [0, 0]
def _make_flat(*args, **kwargs):
    # FlattenDictWrapper exists only in older gym (<=0.15.3); fall back to
    # FilterObservation + FlattenObservation on newer versions. Note: a bare
    # dir() inside a function lists only local names, so the availability
    # check must look at the module's globals.
    if "FlattenDictWrapper" in globals():
        return FlattenDictWrapper(*args, **kwargs)
    return FlattenObservation(FilterObservation(*args, **kwargs))
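A hypothetical call site for the shim above (the env id and key list are only for illustration; any goal-based Dict-observation env works):

env = gym.make('FetchReach-v1')
env = _make_flat(env, ['observation', 'desired_goal'])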
def make_env(delta=False):
    env = FetchEnvBasic(delta)
    env = FlattenDictWrapper(env, ['observation'])
    return env
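All of these examples use FlattenDictWrapper as a black box. As a mental model, here is a minimal sketch of the old-gym behavior it provides, concatenating the selected keys of a Dict observation into one flat Box; this is an illustration, not the library implementation:

import numpy as np
import gym
from gym import spaces

class MinimalFlattenDict(gym.ObservationWrapper):
    def __init__(self, env, dict_keys):
        super().__init__(env)
        self.dict_keys = list(dict_keys)
        # Total size of the flattened observation.
        size = sum(int(np.prod(env.observation_space.spaces[key].shape))
                   for key in self.dict_keys)
        self.observation_space = spaces.Box(-np.inf, np.inf, shape=(size,),
                                            dtype=np.float32)

    def observation(self, observation):
        # Concatenate the selected entries, in key order.
        return np.concatenate([np.asarray(observation[key]).ravel()
                               for key in self.dict_keys])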