Example #1
    def __init__(self,
                 config,
                 policy_params,
                 env_creator,
                 noise,
                 monitor_path,
                 min_task_runtime=0.2):
        self.min_task_runtime = min_task_runtime
        self.config = config
        self.policy_params = policy_params
        self.noise = SharedNoiseTable(noise)

        self.monitor_path = monitor_path
        self.env = env_creator(config["env_config"])
        self.memory = []

        if is_atari(self.env):
            self.env = wrap_deepmind(self.env, dim=84, framestack=4)

        if np.random.uniform() < 0.5:
            if self.monitor_path:
                self.env = _monitor(self.env, self.monitor_path)
        else:
            self.env = env_creator(config["env_config"])
            if is_atari(self.env):
                self.env = wrap_deepmind(self.env, dim=84, framestack=4)

        self.env2 = env_creator(config["env_config"])
        if is_atari(self.env2):
            self.env2 = wrap_deepmind(self.env2, dim=84, framestack=4)

        from ray.rllib import models
        self.preprocessor = models.ModelCatalog.get_preprocessor(
            self.env, config["model"])

        self.sess = utils.make_session(single_threaded=True)

        self.policy = policies.GenericPolicy(self.sess, self.env, self.env2,
                                             self.env.action_space,
                                             self.env.observation_space,
                                             self.preprocessor,
                                             config["observation_filter"],
                                             config["model"], **policy_params)
        #self.sess.run(tf.global_variables_initializer())

        #self.sess1 = utils.make_session(single_threaded=True)

        # self.policymax = policies.GenericPolicy(
        #     self.sess, self.env,self.env2,self.env.action_space, self.env.observation_space,
        #     self.preprocessor, config["observation_filter"], config["model"],
        #     **policy_params)
        self.sess.run(tf.global_variables_initializer())
Example #2
def wrap(env):
    env = wrap_deepmind(env,
                        dim=model_config.get("dim"),
                        framestack=model_config.get("framestack"))
    if monitor_path:
        env = _monitor(env, monitor_path)
    return env
Example #3
def wrap(env):
    env = wrap_deepmind(env,
                        dim=model_config.get("dim"),
                        framestack=model_config.get("framestack"))
    if monitor_path:
        env = gym.wrappers.Monitor(env, monitor_path, resume=True)
    return env
Example #4
def wrap(env):
    env = wrap_deepmind(env,
                        dim=model_config.get("dim", 84),
                        framestack=not model_config.get("use_lstm")
                        and not model_config.get("no_framestack"))
    if monitor_path:
        env = _monitor(env, monitor_path)
    return env
Example #5
def wrap(env):
    env = wrap_deepmind(
        env,
        dim=model_config.get("dim"),
        framestack=model_config.get("framestack"))
    if monitor_path:
        env = _monitor(env, monitor_path)
    return env
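In Examples #2 through #5, wrap is a nested helper that closes over model_config and monitor_path from an enclosing env-creator function. Below is a minimal standalone sketch of that context; the function name make_env, the env id, and the config values are illustrative assumptions, not taken from the snippets.

import gym
from ray.rllib.env.atari_wrappers import wrap_deepmind

def make_env(model_config, monitor_path=None):
    # Same closure pattern as above: dim and framestack come from the model config.
    def wrap(env):
        env = wrap_deepmind(env,
                            dim=model_config.get("dim", 84),
                            framestack=model_config.get("framestack", True))
        # Monitoring is omitted here; the snippets above wrap with _monitor or
        # gym.wrappers.Monitor when monitor_path is set.
        return env

    return wrap(gym.make("PongNoFrameskip-v4"))

env = make_env({"dim": 84, "framestack": True})
print(env.observation_space.shape)  # expected (84, 84, 4) with 4-frame stacking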
Example #6
    def __init__(self):
        self.agents = [gym.make(args.env) for _ in range(args.num_agents)]
        if args.is_atari:
            self.agents = [
                wrap_deepmind(env, dim=args.dim) for env in self.agents
            ]
        self.dones = set()
        self.observation_space = self.agents[0].observation_space
        self.action_space = self.agents[0].action_space
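A hypothetical companion sketch (not part of the snippet above): the usual RLlib MultiAgentEnv-style reset() and step() for such a container of per-agent gym envs, returning dicts keyed by agent index plus the "__all__" done flag.

    def reset(self):
        # Start every agent's episode and clear the done-tracking set.
        self.dones = set()
        return {i: agent.reset() for i, agent in enumerate(self.agents)}

    def step(self, action_dict):
        obs, rew, done, info = {}, {}, {}, {}
        for i, action in action_dict.items():
            obs[i], rew[i], done[i], info[i] = self.agents[i].step(action)
            if done[i]:
                self.dones.add(i)
        # The episode for the whole container ends once every agent is done.
        done["__all__"] = len(self.dones) == len(self.agents)
        return obs, rew, done, info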
Example #7
def wrap_dqn(env, options):
    """Apply a common set of wrappers for DQN."""

    is_atari = hasattr(env.unwrapped, "ale")

    # Override atari default to use the deepmind wrappers.
    # TODO(ekl) this logic should be pushed to the catalog.
    if is_atari and not options.get("custom_preprocessor"):
        return wrap_deepmind(env, dim=options.get("dim", 84))

    return ModelCatalog.get_preprocessor_as_wrapper(env, options)
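A hypothetical call site for wrap_dqn above (the env id and option values are assumptions): an ALE-backed env with no custom preprocessor takes the Atari branch and comes back wrapped by wrap_deepmind.

import gym

env = gym.make("PongNoFrameskip-v4")
env = wrap_dqn(env, {"dim": 42})    # Atari branch: returns wrap_deepmind(env, dim=42)
print(env.observation_space.shape)  # expected (42, 42, 4) with the default 4-frame stack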
Example #8
def gen_policy_graphs(args):
    single_env = gym.make(args.env)
    if args.is_atari:
        single_env = wrap_deepmind(single_env, dim=args.dim)
    obs_space = single_env.observation_space
    act_space = single_env.action_space
    policy_graphs = {
        f'agent_{i}': (None, obs_space, act_space, {})
        for i in range(args.num_agents)
    }
    return policy_graphs
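The dict built above maps each policy id to a (policy_cls, obs_space, act_space, config) tuple in the format RLlib's multiagent API expects; passing None for policy_cls makes the trainer use its default policy class. A sketch of how it might be consumed, assuming the older tune.run API, the same args namespace, and an env whose agent ids match the agent_{i} keys:

from ray import tune

policies = gen_policy_graphs(args)
tune.run(
    "PPO",
    config={
        "env": args.env,
        "multiagent": {
            "policies": policies,
            # Each agent id is mapped to the policy of the same name.
            "policy_mapping_fn": lambda agent_id: agent_id,
        },
    },
)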
Example #9
    def _init(self):
        policy_params = {"action_noise_std": 0.01}
        self.theta_dict = []
        self.policymax = []
        self.curr_parent = 0
        self.population = []
        self.returns_n2 = []
        self.ret = []

        env = self.env_creator(self.config["env_config"])
        self.monitor_path = self.logdir if self.config["monitor"] else None
        if is_atari(env):
            env = wrap_deepmind(env, dim=84, framestack=4)

        env2 = self.env_creator(self.config["env_config"])
        if is_atari(env2):
            env2 = wrap_deepmind(env2, dim=84, framestack=4)

        from ray.rllib import models
        preprocessor = models.ModelCatalog.get_preprocessor(env)

        for p in range(self.config["pop_size"]):
            with tf.Graph().as_default():
                self.sess = utils.make_session(single_threaded=False)
                self.policy = policies.GenericPolicy(
                    self.sess, env, env2, env.action_space,
                    env.observation_space, preprocessor,
                    self.config["observation_filter"], self.config["model"],
                    **policy_params)
                tf_util.initialize(self.sess)
                theta = self.policy.get_weights()
                self.theta_dict.append(theta)

        self.optimizer = optimizers.Adam(self.policy, self.config["stepsize"])
        self.report_length = self.config["report_length"]
        # Create the shared noise table.
        logger.info("Creating shared noise table.")
        noise_id = create_shared_noise.remote(self.config["noise_size"])
        self.noise = SharedNoiseTable(ray.get(noise_id))

        # for p in range(self.config["population_size"]):
        #     noise_index = self.noise.sample_index(self.policy.num_params)
        #     self.population.append(self.noise.get(noise_index, self.policy.num_params))

        # Create the actors.
        logger.info("Creating actors.")
        self.workers = [
            Worker.remote(self.config, policy_params, self.env_creator,
                          noise_id, self.monitor_path)
            for _ in range(self.config["num_workers"])
        ]

        self.episodes_so_far = 0
        self.reward_list1 = []
        self.reward_list2 = []
        self.reward_list3 = []
        self.tstart = time.time()

        self.noisy_rew_max1 = -1000
        self.noisy_rew_mean1 = -1000
        self.noisy_rew_max2 = -1000
        self.noisy_rew_mean2 = -1000
        self.noisy_rew_max3 = -1000
        self.noisy_rew_mean3 = -1000
        self.noisy_rew_max4 = -1000
        self.noisy_rew_mean4 = -1000

        self.reward_mean1 = -1000
        self.reward_mean2 = -1000
        self.reward_mean3 = -1000

        self.maxrew = []
        self.maxrew2 = []
Example #10
def wrap(env):
    return wrap_deepmind(env, dim=model_config.get("dim", 80))
Example #11
    def __init__(self, env, **config):
        self.config = copy.deepcopy(config)
        # self.env = config["env"]
        self.env = env

        seed_int = None
        if "seed" in config:
            seed_int = config["seed"]

        self.seed(seed_int)  # seed
        # IMP Move the code below from here into seed()? If seed() is called during
        # the run of an env, the expectation is that all obs. space, act. space,
        # etc. seeds are set as well. Only Atari in Gym seems to do something
        # similar; the others I saw there don't seem to seed the obs. and act. spaces.
        # IMP Apparently Atari also has its own seed. Without it, for beam_rider(?),
        # about 1 in 5 runs gave a reward of 88.0 and the rest gave 44.0 for the same
        # action sequence; with this seed set, roughly 20 runs all gave 44.0.
        # TODO If this is really a wrapper, should it be modifying the seed of the env?
        self.env.seed(seed_int)  # seed
        obs_space_seed = self.np_random.randint(sys.maxsize)  # random
        act_space_seed = self.np_random.randint(sys.maxsize)  # random
        self.env.observation_space.seed(obs_space_seed)  # seed
        self.env.action_space.seed(act_space_seed)  # seed

        # if "dummy_eval" in config: #hack
        #     del config["dummy_eval"]
        if "delay" in config:
            self.delay = config["delay"]
            assert config["delay"] >= 0
            self.reward_buffer = [0.0] * (self.delay)
        else:
            self.delay = 0

        if "transition_noise" in config:
            self.transition_noise = config["transition_noise"]
            if config["state_space_type"] == "continuous":
                assert callable(self.transition_noise), (
                    "transition_noise must be a function when env is continuous, it was of type:"
                    + str(type(self.transition_noise)))
            else:
                assert self.transition_noise <= 1.0 and self.transition_noise >= 0.0, (
                    "transition_noise must be a value in [0.0, 1.0] when env is discrete, it was:"
                    + str(self.transition_noise))
        else:
            if config["state_space_type"] == "discrete":
                self.transition_noise = 0.0
            else:
                self.transition_noise = lambda a: 0.0

        if "reward_noise" in config:
            if callable(config["reward_noise"]):
                self.reward_noise = config["reward_noise"]
            else:
                reward_noise_std = config["reward_noise"]
                self.reward_noise = lambda a: a.normal(0, reward_noise_std)
        else:
            self.reward_noise = None

        if ("wrap_deepmind_ray" in config
                and config["wrap_deepmind_ray"]):  # hack ##TODO remove?
            self.env = wrap_deepmind(self.env, dim=42, framestack=True)
        elif "atari_preprocessing" in config and config["atari_preprocessing"]:
            self.frame_skip = 4  # default for AtariPreprocessing
            if "frame_skip" in config:
                self.frame_skip = config["frame_skip"]
            self.grayscale_obs = False
            if "grayscale_obs" in config:
                self.grayscale_obs = config["grayscale_obs"]

            # Use AtariPreprocessing with frame_skip.
            # noop_max is set to 1 because we want to keep the vanilla env as
            # deterministic as possible, and setting it to 0 was not allowed. ##TODO
            # noop_max=0 is possible in newer Gym versions, so update the Gym version.
            self.env = AtariPreprocessing(
                self.env,
                frame_skip=self.frame_skip,
                grayscale_obs=self.grayscale_obs,
                noop_max=1,
            )
            print("self.env.noop_max set to: ", self.env.noop_max)

        if "irrelevant_features" in config:
            # self.irrelevant_features =  config["irrelevant_features"]
            irr_toy_env_conf = config["irrelevant_features"]
            if "seed" not in irr_toy_env_conf:
                irr_toy_env_conf["seed"] = self.np_random.randint(
                    sys.maxsize)  # random

            self.irr_toy_env = RLToyEnv(**irr_toy_env_conf)

            if config["state_space_type"] == "discrete":
                self.action_space = Tuple(
                    (self.env.action_space, self.irr_toy_env.action_space))
                self.observation_space = Tuple(
                    (self.env.observation_space,
                     self.irr_toy_env.observation_space)
                )  # TODO for image observations, concatenate to 1 obs. space here and in step() and reset()?
            else:  # TODO Check the test case added for cont. irr features case and code for it in run_experiments.py.
                env_obs_low = self.env.observation_space.low
                env_obs_high = self.env.observation_space.high
                env_obs_dtype = env_obs_low.dtype
                env_obs_shape = env_obs_low.shape
                irr_env_obs_low = self.irr_toy_env.observation_space.low
                irr_env_obs_high = self.irr_toy_env.observation_space.high
                irr_env_obs_dtype = self.irr_toy_env.observation_space.low.dtype
                assert env_obs_dtype == irr_env_obs_dtype, (
                    "Datatypes of base env and irrelevant toy env should match. Were: "
                    + str(env_obs_dtype) + ", " + str(irr_env_obs_dtype))
                ext_low = np.concatenate((env_obs_low, irr_env_obs_low))
                ext_high = np.concatenate((env_obs_high, irr_env_obs_high))
                self.observation_space = Box(low=ext_low,
                                             high=ext_high,
                                             dtype=env_obs_dtype)

                env_act_low = self.env.action_space.low
                env_act_high = self.env.action_space.high
                env_act_dtype = env_act_low.dtype
                self.env_act_shape = env_act_low.shape
                assert (len(self.env_act_shape) == 1
                        ), "Length of shape of action space should be 1."
                irr_env_act_low = self.irr_toy_env.action_space.low
                irr_env_act_high = self.irr_toy_env.action_space.high
                irr_env_act_dtype = irr_env_act_low.dtype
                # assert env_obs_dtype == env_act_dtype, "Datatypes of obs. and act. of
                # base env should match. Were: " + str(env_obs_dtype) + ", " +
                # str(env_act_dtype) #TODO Apparently, observations are np.float64 and
                # actions np.float32 for Mujoco.
                ext_low = np.concatenate((env_act_low, irr_env_act_low))
                ext_high = np.concatenate((env_act_high, irr_env_act_high))
                self.action_space = Box(
                    low=ext_low, high=ext_high, dtype=env_act_dtype
                )  # TODO Use BoxExtended here and above?

            self.observation_space.seed(obs_space_seed)  # seed
            self.action_space.seed(act_space_seed)  # seed
        else:
            self.action_space = self.env.action_space
            self.observation_space = self.env.observation_space

        self.total_episodes = 0

        # if "action_loss_weight" in config: #hack
        #     del config["action_loss_weight"]
        # if "action_space_max" in config: #hack
        #     action_space_max = config["action_space_max"]
        #     del config["action_space_max"]
        # if "time_unit" in config: #hack
        #     time_unit = config["time_unit"]
        #     del config["time_unit"]
        # if "dummy_seed" in config: #hack
        #     del config["dummy_seed"]

        super(GymEnvWrapper, self).__init__()
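A minimal construction sketch for the wrapper above; the env id and every config value are assumptions chosen to exercise the seeding, delay, noise, and Atari-preprocessing branches of __init__, not values from the source.

import gym

base_env = gym.make("BeamRiderNoFrameskip-v4")
config = {
    "seed": 0,
    "state_space_type": "discrete",
    "delay": 1,                # rewards are delivered one step late
    "transition_noise": 0.1,   # probability of a noisy transition (discrete case)
    "reward_noise": 0.5,       # std-dev of additive Gaussian reward noise
    "atari_preprocessing": True,
    "frame_skip": 4,
    "grayscale_obs": True,
}
env = GymEnvWrapper(base_env, **config)
obs = env.reset()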
Example #12
import gym
import numpy as np
import os

import ray.utils
from ray.rllib.env.atari_wrappers import wrap_deepmind
from ray.rllib.evaluation.sample_batch_builder import SampleBatchBuilder
from ray.rllib.models.preprocessors import get_preprocessor
from ray.rllib.offline.json_writer import JsonWriter

if __name__ == "__main__":
    batch_builder = SampleBatchBuilder()  # or MultiAgentSampleBatchBuilder
    writer = JsonWriter(
        os.path.join(ray.utils.get_user_temp_dir(), "out"))

    # You normally wouldn't want to manually create sample batches if a
    # simulator is available, but let's do it anyways for example purposes:
    env = gym.make("PongNoFrameskip-v4")
    env = wrap_deepmind(env)

    # RLlib uses preprocessors to implement transforms such as one-hot encoding
    # and flattening of tuple and dict observations. For CartPole a no-op
    # preprocessor is used, but this may be relevant for more complex envs.
    prep = get_preprocessor(env.observation_space)(env.observation_space)
    print("The preprocessor is", prep)

    for eps_id in range(100):
        obs = env.reset()
        prev_action = np.zeros_like(env.action_space.sample())
        prev_reward = 0
        done = False
        t = 0
        while not done:
            action = env.action_space.sample()