def __init__(self,
             config,
             policy_params,
             env_creator,
             noise,
             monitor_path,
             min_task_runtime=0.2):
    self.min_task_runtime = min_task_runtime
    self.config = config
    self.policy_params = policy_params
    self.noise = SharedNoiseTable(noise)
    self.monitor_path = monitor_path

    self.env = env_creator(config["env_config"])
    self.memory = []
    if is_atari(self.env):
        self.env = wrap_deepmind(self.env, dim=84, framestack=4)
    # With 50% probability keep (and optionally monitor) this env;
    # otherwise replace it with a fresh, unmonitored copy.
    if np.random.uniform() < 0.5:
        if self.monitor_path:
            self.env = _monitor(self.env, self.monitor_path)
    else:
        self.env = env_creator(config["env_config"])
        if is_atari(self.env):
            self.env = wrap_deepmind(self.env, dim=84, framestack=4)

    # Second env instance, passed alongside the first to the policy.
    self.env2 = env_creator(config["env_config"])
    if is_atari(self.env2):
        self.env2 = wrap_deepmind(self.env2, dim=84, framestack=4)

    from ray.rllib import models
    self.preprocessor = models.ModelCatalog.get_preprocessor(
        self.env, config["model"])

    self.sess = utils.make_session(single_threaded=True)
    self.policy = policies.GenericPolicy(
        self.sess, self.env, self.env2, self.env.action_space,
        self.env.observation_space, self.preprocessor,
        config["observation_filter"], config["model"], **policy_params)
    self.sess.run(tf.global_variables_initializer())
def wrap(env):
    env = wrap_deepmind(
        env,
        dim=model_config.get("dim"),
        framestack=model_config.get("framestack"))
    if monitor_path:
        env = _monitor(env, monitor_path)
    return env
def wrap(env):
    env = wrap_deepmind(
        env,
        dim=model_config.get("dim"),
        framestack=model_config.get("framestack"))
    if monitor_path:
        env = gym.wrappers.Monitor(env, monitor_path, resume=True)
    return env
def wrap(env):
    env = wrap_deepmind(
        env,
        dim=model_config.get("dim", 84),
        # Only frame-stack when the model is not an LSTM and stacking has
        # not been explicitly disabled.
        framestack=(not model_config.get("use_lstm")
                    and not model_config.get("no_framestack")))
    if monitor_path:
        env = _monitor(env, monitor_path)
    return env
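# Usage sketch (assumed, not from the original source): the `wrap`
# closures above capture `model_config` and `monitor_path` from an
# enclosing env-creator, roughly like this; `create_env` is hypothetical.
def create_env(env_name, model_config, monitor_path=None):
    def wrap(env):
        env = wrap_deepmind(
            env,
            dim=model_config.get("dim", 84),
            framestack=model_config.get("framestack", True))
        if monitor_path:
            env = gym.wrappers.Monitor(env, monitor_path, resume=True)
        return env

    return wrap(gym.make(env_name))

# env = create_env("PongNoFrameskip-v4", {"dim": 84, "framestack": True})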
def __init__(self):
    # One independent copy of the env per agent.
    self.agents = [gym.make(args.env) for _ in range(args.num_agents)]
    if args.is_atari:
        self.agents = [
            wrap_deepmind(env, dim=args.dim) for env in self.agents
        ]
    self.dones = set()
    # All agents share the same observation and action spaces.
    self.observation_space = self.agents[0].observation_space
    self.action_space = self.agents[0].action_space
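# A minimal sketch (assumed, not from the original source) of the matching
# reset()/step() for the env above, following RLlib's MultiAgentEnv
# convention of dicts keyed by agent id.
def reset(self):
    self.dones = set()
    return {i: agent.reset() for i, agent in enumerate(self.agents)}

def step(self, action_dict):
    obs, rew, done, info = {}, {}, {}, {}
    for i, action in action_dict.items():
        obs[i], rew[i], done[i], info[i] = self.agents[i].step(action)
        if done[i]:
            self.dones.add(i)
    # "__all__" signals episode termination for every agent at once.
    done["__all__"] = len(self.dones) == len(self.agents)
    return obs, rew, done, info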
def wrap_dqn(env, options):
    """Apply a common set of wrappers for DQN."""
    is_atari = hasattr(env.unwrapped, "ale")
    # Override atari default to use the deepmind wrappers.
    # TODO(ekl) this logic should be pushed to the catalog.
    if is_atari and not options.get("custom_preprocessor"):
        return wrap_deepmind(env, dim=options.get("dim", 84))
    return ModelCatalog.get_preprocessor_as_wrapper(env, options)
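# Usage sketch (assumed, not from the original source): wrap_dqn detects
# Atari envs via the underlying ALE handle and otherwise falls back to a
# generic preprocessor wrapper.
env = wrap_dqn(gym.make("PongNoFrameskip-v4"), {"dim": 84})  # DeepMind wrappers
env = wrap_dqn(gym.make("CartPole-v0"), {})  # generic preprocessor wrapper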
def gen_policy_graphs(args):
    single_env = gym.make(args.env)
    if args.is_atari:
        single_env = wrap_deepmind(single_env, dim=args.dim)
    obs_space = single_env.observation_space
    act_space = single_env.action_space
    # One identically-spaced policy graph per agent; None selects the
    # trainer's default policy class.
    policy_graphs = {
        f'agent_{i}': (None, obs_space, act_space, {})
        for i in range(args.num_agents)
    }
    return policy_graphs
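# Sketch of how these graphs are typically consumed (assumed usage, not
# from the original source; trainer choice and env name are hypothetical):
# each env agent id maps onto the policy graph of the same name.
# trainer = PPOTrainer(env="multienv", config={
#     "multiagent": {
#         "policy_graphs": gen_policy_graphs(args),
#         "policy_mapping_fn": lambda agent_id: agent_id,
#     },
# })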
def _init(self):
    policy_params = {"action_noise_std": 0.01}
    self.theta_dict = []
    self.policymax = []
    self.curr_parent = 0
    self.population = []
    self.returns_n2 = []
    self.ret = []

    env = self.env_creator(self.config["env_config"])
    self.monitor_path = self.logdir if self.config["monitor"] else None
    if is_atari(env):
        env = wrap_deepmind(env, dim=84, framestack=4)
    env2 = self.env_creator(self.config["env_config"])
    if is_atari(env2):
        env2 = wrap_deepmind(env2, dim=84, framestack=4)

    from ray.rllib import models
    preprocessor = models.ModelCatalog.get_preprocessor(env)

    # Build one policy per population member, each in its own TF graph,
    # and record its initial weights.
    for p in range(self.config["pop_size"]):
        with tf.Graph().as_default():
            self.sess = utils.make_session(single_threaded=False)
            self.policy = policies.GenericPolicy(
                self.sess, env, env2, env.action_space,
                env.observation_space, preprocessor,
                self.config["observation_filter"], self.config["model"],
                **policy_params)
            tf_util.initialize(self.sess)
            theta = self.policy.get_weights()
            self.theta_dict.append(theta)

    self.optimizer = optimizers.Adam(self.policy, self.config["stepsize"])
    self.report_length = self.config["report_length"]

    # Create the shared noise table.
    logger.info("Creating shared noise table.")
    noise_id = create_shared_noise.remote(self.config["noise_size"])
    self.noise = SharedNoiseTable(ray.get(noise_id))
    # for p in range(self.config["population_size"]):
    #     noise_index = self.noise.sample_index(self.policy.num_params)
    #     self.population.append(
    #         self.noise.get(noise_index, self.policy.num_params))

    # Create the actors.
    logger.info("Creating actors.")
    self.workers = [
        Worker.remote(self.config, policy_params, self.env_creator,
                      noise_id, self.monitor_path)
        for _ in range(self.config["num_workers"])
    ]

    self.episodes_so_far = 0
    self.reward_list1 = []
    self.reward_list2 = []
    self.reward_list3 = []
    self.tstart = time.time()
    self.noisy_rew_max1 = -1000
    self.noisy_rew_mean1 = -1000
    self.noisy_rew_max2 = -1000
    self.noisy_rew_mean2 = -1000
    self.noisy_rew_max3 = -1000
    self.noisy_rew_mean3 = -1000
    self.noisy_rew_max4 = -1000
    self.noisy_rew_mean4 = -1000
    self.reward_mean1 = -1000
    self.reward_mean2 = -1000
    self.reward_mean3 = -1000
    self.maxrew = []
    self.maxrew2 = []
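# Sketch (assumed, not from the original source) of how a SharedNoiseTable
# is typically used in evolution strategies: sample an index into the shared
# block of Gaussian noise and perturb the flat parameter vector. The
# "noise_stdev" config key is an assumption borrowed from RLlib's ES defaults.
noise_index = self.noise.sample_index(self.policy.num_params)
perturbation = self.config["noise_stdev"] * self.noise.get(
    noise_index, self.policy.num_params)
self.policy.set_weights(theta + perturbation)
# Workers only need to ship `noise_index` back to the trainer, not the
# full perturbation vector.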
def wrap(env):
    return wrap_deepmind(env, dim=model_config.get("dim", 80))
def __init__(self, env, **config):
    self.config = copy.deepcopy(config)
    # self.env = config["env"]
    self.env = env

    seed_int = None
    if "seed" in config:
        seed_int = config["seed"]
    self.seed(seed_int)  # seed
    # IMP Move below code from here to seed()? Because if seed is called
    # during the run of an env, the expectation is that all obs., act.
    # space, etc. seeds are set? Only Atari in Gym seems to do something
    # similar; the others I saw there don't seem to set seeds for obs. and
    # act. spaces.
    # IMP Apparently Atari also has a seed. :/ Without this, for
    # beam_rider(?), about 1 in 5 times I got a reward of 88.0 and 44.0 the
    # remaining times with the same action sequence!! With this seed set, I
    # got the same reward of 44.0 across about 20 runs.
    # TODO If this is really a wrapper, should it be modifying the seed of
    # the env?
    self.env.seed(seed_int)  # seed
    obs_space_seed = self.np_random.randint(sys.maxsize)  # random
    act_space_seed = self.np_random.randint(sys.maxsize)  # random
    self.env.observation_space.seed(obs_space_seed)  # seed
    self.env.action_space.seed(act_space_seed)  # seed

    # if "dummy_eval" in config:  # hack
    #     del config["dummy_eval"]
    if "delay" in config:
        self.delay = config["delay"]
        assert config["delay"] >= 0
        self.reward_buffer = [0.0] * self.delay
    else:
        self.delay = 0

    if "transition_noise" in config:
        self.transition_noise = config["transition_noise"]
        if config["state_space_type"] == "continuous":
            assert callable(self.transition_noise), (
                "transition_noise must be a function when env is "
                "continuous, it was of type:"
                + str(type(self.transition_noise)))
        else:
            assert self.transition_noise <= 1.0 and \
                self.transition_noise >= 0.0, (
                "transition_noise must be a value in [0.0, 1.0] when env "
                "is discrete, it was:" + str(self.transition_noise))
    else:
        if config["state_space_type"] == "discrete":
            self.transition_noise = 0.0
        else:
            self.transition_noise = lambda a: 0.0

    if "reward_noise" in config:
        if callable(config["reward_noise"]):
            self.reward_noise = config["reward_noise"]
        else:
            reward_noise_std = config["reward_noise"]
            self.reward_noise = lambda a: a.normal(0, reward_noise_std)
    else:
        self.reward_noise = None

    if "wrap_deepmind_ray" in config and config["wrap_deepmind_ray"]:
        # hack ##TODO remove?
        self.env = wrap_deepmind(self.env, dim=42, framestack=True)
    elif "atari_preprocessing" in config and config["atari_preprocessing"]:
        self.frame_skip = 4  # default for AtariPreprocessing
        if "frame_skip" in config:
            self.frame_skip = config["frame_skip"]
        self.grayscale_obs = False
        if "grayscale_obs" in config:
            self.grayscale_obs = config["grayscale_obs"]
        # Use AtariPreprocessing with frame_skip. noop_max is set to 1
        # because we want to keep the vanilla env as deterministic as
        # possible, and setting it to 0 was not allowed. ##TODO noop_max=0
        # is possible in the new Gym version, so update the Gym version.
        self.env = AtariPreprocessing(
            self.env,
            frame_skip=self.frame_skip,
            grayscale_obs=self.grayscale_obs,
            noop_max=1,
        )
        print("self.env.noop_max set to: ", self.env.noop_max)

    if "irrelevant_features" in config:
        # self.irrelevant_features = config["irrelevant_features"]
        irr_toy_env_conf = config["irrelevant_features"]
        if "seed" not in irr_toy_env_conf:
            irr_toy_env_conf["seed"] = self.np_random.randint(
                sys.maxsize)  # random
        self.irr_toy_env = RLToyEnv(**irr_toy_env_conf)
        if config["state_space_type"] == "discrete":
            self.action_space = Tuple(
                (self.env.action_space, self.irr_toy_env.action_space))
            self.observation_space = Tuple(
                (self.env.observation_space,
                 self.irr_toy_env.observation_space)
            )  # TODO for image observations, concatenate to 1 obs. space
            # here and in step() and reset()?
        else:
            # TODO Check the test case added for the cont. irr. features
            # case and the code for it in run_experiments.py.
            env_obs_low = self.env.observation_space.low
            env_obs_high = self.env.observation_space.high
            env_obs_dtype = env_obs_low.dtype
            env_obs_shape = env_obs_low.shape
            irr_env_obs_low = self.irr_toy_env.observation_space.low
            irr_env_obs_high = self.irr_toy_env.observation_space.high
            irr_env_obs_dtype = self.irr_toy_env.observation_space.low.dtype
            assert env_obs_dtype == irr_env_obs_dtype, (
                "Datatypes of base env and irrelevant toy env should "
                "match. Were: " + str(env_obs_dtype) + ", "
                + str(irr_env_obs_dtype))
            ext_low = np.concatenate((env_obs_low, irr_env_obs_low))
            ext_high = np.concatenate((env_obs_high, irr_env_obs_high))
            self.observation_space = Box(
                low=ext_low, high=ext_high, dtype=env_obs_dtype)

            env_act_low = self.env.action_space.low
            env_act_high = self.env.action_space.high
            env_act_dtype = env_act_low.dtype
            self.env_act_shape = env_act_low.shape
            assert len(self.env_act_shape) == 1, (
                "Length of shape of action space should be 1.")
            irr_env_act_low = self.irr_toy_env.action_space.low
            irr_env_act_high = self.irr_toy_env.action_space.high
            irr_env_act_dtype = irr_env_act_low.dtype
            # assert env_obs_dtype == env_act_dtype, (
            #     "Datatypes of obs. and act. of base env should match. "
            #     "Were: " + str(env_obs_dtype) + ", " + str(env_act_dtype))
            # TODO Apparently, observations are np.float64 and actions
            # np.float32 for Mujoco.
            ext_low = np.concatenate((env_act_low, irr_env_act_low))
            ext_high = np.concatenate((env_act_high, irr_env_act_high))
            self.action_space = Box(
                low=ext_low, high=ext_high, dtype=env_act_dtype
            )  # TODO Use BoxExtended here and above?

        self.observation_space.seed(obs_space_seed)  # seed
        self.action_space.seed(act_space_seed)  # seed
    else:
        self.action_space = self.env.action_space
        self.observation_space = self.env.observation_space

    self.total_episodes = 0

    # if "action_loss_weight" in config:  # hack
    #     del config["action_loss_weight"]
    # if "action_space_max" in config:  # hack
    #     action_space_max = config["action_space_max"]
    #     del config["action_space_max"]
    # if "time_unit" in config:  # hack
    #     time_unit = config["time_unit"]
    #     del config["time_unit"]
    # if "dummy_seed" in config:  # hack
    #     del config["dummy_seed"]

    super(GymEnvWrapper, self).__init__()
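# A minimal usage sketch, assuming the wrapper above is mdp_playground's
# GymEnvWrapper (the import path may differ in your checkout); the config
# keys mirror those checked in __init__ above.
import gym
from mdp_playground.envs.gym_env_wrapper import GymEnvWrapper

env = GymEnvWrapper(
    gym.make("BeamRiderNoFrameskip-v4"),
    seed=0,
    state_space_type="discrete",
    delay=1,                # rewards arrive one step late
    transition_noise=0.1,   # 10% chance of a random transition
    reward_noise=1.0,       # interpreted as the std dev of Gaussian noise
    atari_preprocessing=True,
    frame_skip=4,
    grayscale_obs=True,
)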
import os

import gym
import numpy as np

import ray.utils
from ray.rllib.env.atari_wrappers import wrap_deepmind
from ray.rllib.evaluation.sample_batch_builder import SampleBatchBuilder
from ray.rllib.models.preprocessors import get_preprocessor
from ray.rllib.offline.json_writer import JsonWriter

if __name__ == "__main__":
    batch_builder = SampleBatchBuilder()  # or MultiAgentSampleBatchBuilder
    writer = JsonWriter(
        os.path.join(ray.utils.get_user_temp_dir(), "out"))

    # You normally wouldn't want to manually create sample batches if a
    # simulator is available, but let's do it anyways for example purposes:
    env = gym.make("PongNoFrameskip-v4")
    env = wrap_deepmind(env)

    # RLlib uses preprocessors to implement transforms such as one-hot
    # encoding and flattening of tuple and dict observations. For CartPole a
    # no-op preprocessor is used, but this may be relevant for more complex
    # envs.
    prep = get_preprocessor(env.observation_space)(env.observation_space)
    print("The preprocessor is", prep)

    for eps_id in range(100):
        obs = env.reset()
        prev_action = np.zeros_like(env.action_space.sample())
        prev_reward = 0
        done = False
        t = 0
        while not done:
            action = env.action_space.sample()
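            # The loop body is cut off above; a completion sketch follows,
            # modeled on RLlib's standard offline-data example (assumed,
            # not part of the original snippet).
            new_obs, rew, done, info = env.step(action)
            batch_builder.add_values(
                t=t,
                eps_id=eps_id,
                agent_index=0,
                obs=prep.transform(obs),
                actions=action,
                action_prob=1.0,  # put the true action probability here
                rewards=rew,
                prev_actions=prev_action,
                prev_rewards=prev_reward,
                dones=done,
                infos=info,
                new_obs=prep.transform(new_obs))
            obs = new_obs
            prev_action = action
            prev_reward = rew
            t += 1
        # Flush each finished episode to the JSON writer.
        writer.write(batch_builder.build_and_reset())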