Example #1
    def __init__(self, t_final: int = 1000):
        """
        Initialize the main RAN simulation environment.
        """
        sim_param = SimParam(t_final)

        self.initialize_spaces(sim_param)

        # other attributes of ran_environment
        self.state = None
        self.sim_param = sim_param
        #self.C_algo = 'RL'
        self.slice_scores = None  # slice scores for reward method 3
        self.user_scores = None

        # generate seed values
        new_seed = seeding.create_seed()
        self.sim_param.update_seeds(new_seed)

        # initialize SD_RAN_Controller
        self.SD_RAN_Controller = Controller(self.sim_param)

        # data
        self.user_score_arr = None
        self.slice_score_arr = None
        self.reward_hist = None
        self.cost_tp_hist = None
        self.cost_bp_hist = None
        self.cost_delay_hist = None
        self.reset_counter = 0

        columns = 'reward_hist slice_score_0 slice_score_1 slice_score_2'
        self.env_df = pd.DataFrame(columns=columns.split())
Example #2
    def __init__(self, env, active_handles, names, map_size, seed=None):
        self.map_size = map_size
        self.env = env
        self.handles = active_handles
        if seed is None:
            seed = seeding.create_seed(seed, max_bytes=4)
        env.set_seed(seed)
        env.reset()
        self.generate_map()

        self.team_sizes = team_sizes = [
            env.get_num(handle) for handle in self.handles
        ]
        self.agents = [
            f"{names[j]}_{i}" for j in range(len(team_sizes))
            for i in range(team_sizes[j])
        ]
        self.num_agents = sum(team_sizes)

        num_actions = [
            env.get_action_space(handle)[0] for handle in self.handles
        ]
        self.action_spaces = [
            Discrete(num_actions[j]) for j in range(len(team_sizes))
            for i in range(team_sizes[j])
        ]
        # may change depending on environment config? Not sure.
        team_obs_shapes = self._calc_obs_shapes()
        self.observation_spaces = [
            Box(low=0., high=2., shape=team_obs_shapes[j], dtype=np.float32)
            for j in range(len(team_sizes)) for i in range(team_sizes[j])
        ]

        self._renderer = None
Example #3
 def seed(self, seed=None):
     if seed is None:
         self.seed_ = create_seed()
         random.seed(self.seed_)
     else:
         self.seed_ = hash_seed(seed)
         random.seed(self.seed_)
Example #4
def generate_random_maze(width,
                         height,
                         algorithm='kruskal',
                         rng=None,
                         seed=None):
    if not rng:
        if not seed:
            seed = create_seed()
        (rng, _) = np_random(seed)
    return _generate_random_maze(width, height, algorithm, rng)
Example #5
def extend_meta_data(config):
    """
    Extends the meta-data dictionary of the file to save additional information at training time.
    :param config: (dict) The config dictionary.
    :return: (dict) The updated config dictionary.
    """
    extended_info = {
        "timestamp": util.get_timestamp(),
        "seed": config['meta'].get("seed", seeding.create_seed(max_bytes=4)),
    }
    config['meta'].update(extended_info)
    return config
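A minimal usage sketch for extend_meta_data, assuming only that the config carries a 'meta' dictionary as the function requires (the other keys shown are purely illustrative):

# Hypothetical config layout; only the 'meta' dict is required by extend_meta_data.
config = {"algorithm": "ppo", "meta": {}}
config = extend_meta_data(config)
# config['meta'] now holds a 'timestamp' and, since none was present,
# a freshly generated 4-byte 'seed' from seeding.create_seed(max_bytes=4).
print(config["meta"]["seed"])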
Example #6
    def __init__(self,
                 config=None,
                 n_eval_episodes=32,
                 deterministic=True,
                 render=False,
                 eval_method="normal",
                 env=None,
                 seed=None):
        self.eval_method = eval_method
        self.config = config
        self.n_eval_episodes = n_eval_episodes
        self.deterministic = deterministic
        self.render = render

        if eval_method in ["normal", "slow"]:
            assert config, "You must provide an environment configuration if eval_method is not 'fast'!"
            test_config = deepcopy(config)
            test_env_config = test_config['env']
            if eval_method == "slow":
                test_env_config['n_envs'] = 1

            if not test_env_config.get('n_envs',
                                       None) and not eval_method == "slow":
                test_env_config['n_envs'] = 8

            if not seed:
                seed = create_seed()
            if test_env_config['n_envs'] > 32:
                test_env_config['n_envs'] = 32
            test_env_config['curiosity'] = False  # TODO: Sync train and test curiosity wrappers and reenable

            # Disable dynamic episode length on evaluation to get comparable test results independent of rewards.
            if test_env_config.get("reward_kwargs", None):
                if test_env_config["reward_kwargs"].get("dynamic_episode_length", False):
                    test_env_config["reward_kwargs"]["dynamic_episode_length"] = False

            self.test_env = create_environment(test_config,
                                               seed,
                                               evaluation=True)
            self.eval_wrapper = unwrap_env(self.test_env, VecEvaluationWrapper,
                                           EvaluationWrapper)
        elif eval_method == "fast":
            assert env, "You must provide an environment with an EvaluationWrapper if the eval_method is fast!"
            self.test_env = None
            self.eval_wrapper = unwrap_env(env, VecEvaluationWrapper,
                                           EvaluationWrapper)
        else:
            raise AttributeError(
                "Unknown eval method '{}'".format(eval_method))
Example #7
    def __init__(self,
                 seed=0,
                 start_variance=0.1,
                 goal_state_pos='variable',
                 goal_state_access=True):
        '''
        Multi-task (population) version of Gym Reacher environment.

        Args:
            seed: for RNG determining goal position.
            start_variance: variance of the starting position for the arm.
            goal_state_pos: if 'fixed', goal position is static across reset().
            goal_state_access: is goal position included in the state?
        '''
        self._start_variance = start_variance
        self._goal_state_pos = goal_state_pos
        self._goal_state_access = goal_state_access
        utils.EzPickle.__init__(self)
        mujoco_env.MujocoEnv.__init__(self, 'reacher.xml', 2)
        self._goal_rng = np.random.RandomState(seeding.create_seed(seed))
        if self._goal_state_pos == 'fixed':
            self._reset_goal()
Example #8
    def __init__(self,
                 params,
                 num_balls=None,
                 particle_size=0.05,
                 ctrl=0.1,
                 seed=0):
        seed = seeding.create_seed(seed)
        rng = np.random.RandomState(seed)

        self.rewards = create_reward(params, rng)
        if num_balls is None:
            num_balls = len(self.rewards)
        self.num_targets = num_balls
        self.particle_size = particle_size
        self.ctrl = ctrl

        num_cats = len(self.rewards)
        assert num_cats >= num_balls
        model = billiards_model(num_cats, particle_size=particle_size)
        with model.asfile() as f:
            MujocoEnv.__init__(self, f.name, 5)
            utils.EzPickle.__init__(self, params, num_balls, particle_size,
                                    seed)
Example #9
    def __init__(self,
                 start_variance=0.1,
                 wall_seed=0,
                 wall_penalty=5,
                 wall_state_access=False):
        self._start_variance = start_variance
        self._wall_state_access = wall_state_access

        if wall_seed is not None:
            wall_rng = np.random.RandomState(seeding.create_seed(wall_seed))
            self._wall_angle = np.pi * (2 * wall_rng.rand() - 1)
            x = np.cos(self._wall_angle)
            y = np.sin(self._wall_angle)
            params = {
                'XS': x * 0.08,
                'YS': y * 0.08,
                'XE': x * 0.21,
                'YE': y * 0.21
            }
            self._wall_penalty = wall_penalty
        else:
            self._wall_angle = 0
            self._wall_penalty = 0
            params = {'XS': -1, 'YS': -1, 'XE': -0.99, 'YE': -0.99}

        model_path = os.path.join(os.path.dirname(__file__),
                                  'reacher_wall.xml')
        with open(model_path, 'r') as model:
            model_xml = model.read()
            for k, v in params.items():
                model_xml = model_xml.replace(k, str(v))

        utils.EzPickle.__init__(self)
        with tempfile.NamedTemporaryFile(mode='w', suffix='.xml') as f:
            f.write(model_xml)
            f.flush()
            mujoco_env.MujocoEnv.__init__(self, f.name, 2)
Example #10
 def seed(self, seed=None):
     if seed is None:
         seed = create_seed()
     (self.rng, self.seed) = np_random(seed)
Example #11
    def __init__(self,
                 layout=None,
                 start=(0, 0),
                 goal=None,
                 width=0,
                 height=0,
                 algorithm='kruskal',
                 rng=None,
                 seed=None,
                 max_steps=10000,
                 goal_reward=100.0,
                 step_reward=-1.0,
                 hit_wall_reward=-5.0,
                 agent_image=None):
        """Create a new FixedMazeEnvironment.
Arguments are:
    layout     numpy.ndarray;  A 2D array of integers indicating connectivity
               between maze cells.  Each cell in the array indicates its
               connectivity to the cell to the right (increasing x) and
               to the cell above (increasing y) by setting the appropriate bit.
               Connectivity to the cells to the left or below is computed by
               going to that cell and checking its connectivity to the
               cell to its right or above.  This removes redundancy in the
               representation.

               If bit 0 is set in the cell's value, it is connected to the
               cell to the right.  If bit 1 is set, it is connected to the
               cell above.  

               If the layout is None, the environment will generate a random
               maze with dimensions specified by the "width" and "height"
               arguments using the algorithm specified by the "algorithm"
               argument.

    start      tuple(int, int); (x, y) coordinates where the agent starts.  If
               the start and the goal are coincident, the starting location
               will be randomized

    goal       tuple(int, int); (x, y) coordinates of the goal; if None, the
               goal will be set randomly

    width      int; Width of randomly-generated mazes, in squares

    height     int; Height of randomly-generated mazes, in squares

    algorithm  str; Algorithm used to generate random mazes.  The currently
               supported algorithms are:
                   kruskal    Kruskal's minimal spanning tree algorithm

    rng        np.random.RandomState or equivalent; the environment's
               source of random numbers.  If None, the environment
               will create its own RNG using gym.utils.seeding.np_random()

    seed       int; If the "rng" argument is None, this seed will be
               used to create and seed the environment's RNG.  If the "rng"
               argument is not None, this argument should contain the
               value used to seed that RNG.  If the "seed" argument is
               None, the environment will create its own seed.

    max_steps  int; Maximum number of steps the agent can take before the
               episode terminates in failure

    goal_reward  float; Reward agent receives upon reaching the goal

    step_reward  float; Reward (usually negative) agent receives for each
                 step that does not contact a wall

    hit_wall_reward float; Reward (usually negative) agent receives when
                 it bumps into a wall

    agent_image  str, unicode or pygame.Surface;  Image to use for the agent
               when the environment is rendered in "human" mode.  If this
               argument is a str or unicode, it specifies the name of a file
               with the image to use for the agent.  If this argument is
               a pygame.Surface, it contains the image itself
"""
        gym.Env.__init__(self)

        if rng:
            if seed is None:
                raise ValueError('If rng is not None, seed cannot be None')
            self.rng = rng
            self.seed = seed
        else:
            if seed is None:
                seed = create_seed()
            (self.rng, self.seed) = np_random(seed)

        if layout is not None:
            if len(layout.shape) != 2:
                raise ValueError('Maze layout must be a 2D array')
            self._layout = layout
        elif width < 1:
            raise ValueError('width must be > 0')
        elif height < 1:
            raise ValueError('height must be > 0')
        else:
            self._layout = _generate_random_maze(width, height, algorithm,
                                                 self.rng)

        if not goal:
            goal = (self.rng.randint(0, self._layout.shape[1]),
                    self.rng.randint(0, self._layout.shape[0]))

        self.goal = goal
        self.max_steps = max_steps
        self.goal_reward = goal_reward
        self.step_reward = step_reward
        self.hit_wall_reward = hit_wall_reward

        while self.goal == start:
            start = (self.rng.randint(0, self._layout.shape[1]),
                     self.rng.randint(0, self._layout.shape[0]))

        self.start = start

        bounds = (self._layout.shape[1], self._layout.shape[0])
        self.observation_space = spaces.MultiDiscrete(bounds)
        self.reward_range = (-5.0 * max_steps, 100)

        self._delta_x = [0, 1, 0, -1]
        self._delta_y = [1, 0, -1, 0]
        self._can_move = [
            self._can_move_north, self._can_move_east, self._can_move_south,
            self._can_move_west
        ]

        self._walls1 = (' *', '  ', ' *', '  ', '#*', '# ', '#*', '# ', '$*',
                        '$ ', '$*', '$ ', '!*', '! ', '!*', '! ')
        self._walls2 = ('**', '**', ' *', ' *')

        self._renderer = None
        self.reset()

        self._agent_image = agent_image
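The connectivity encoding described in the docstring above (bit 0 = open to the cell on the right, bit 1 = open to the cell above) can be decoded with a few bit tests. A small illustrative helper, assuming the layout array is indexed as layout[y, x] like the rest of this constructor; the helper itself is not part of the class:

def cell_connectivity(layout, x, y):
    # Bit 0: passage to the cell on the right; bit 1: passage to the cell above.
    cell = int(layout[y, x])
    return {
        'right': bool(cell & 1),
        'up': bool(cell & 2),
        # Left/down are derived from the neighbouring cells, as the docstring notes.
        'left': x > 0 and bool(int(layout[y, x - 1]) & 1),
        'down': y > 0 and bool(int(layout[y - 1, x]) & 2),
    }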
Example #12
    def __init__(self,
                 colors: int = 5,
                 ranks: int = 5,
                 players: int = 2,
                 hand_size: int = 2,
                 max_information_tokens: int = 8,
                 max_life_tokens: int = 3,
                 observation_type: int = 1,
                 seed=None,
                 random_start_player: bool = False,
                 ):

        """
        Parameter descriptions:
              - colors: int, Number of colors in [2,5].
              - ranks: int, Number of ranks in [2,5].
              - players: int, Number of players in [2,5].
              - hand_size: int, Hand size in [2,5].
              - max_information_tokens: int, Number of information tokens (>=0).
              - max_life_tokens: int, Number of life tokens (>=1).
              - observation_type: int.
                    0: Minimal observation.
                    1: First-order common knowledge observation.
              - seed: int, Random seed or None.
              - random_start_player: bool, Random start player.

        Common game configurations:
            Hanabi-Full (default) :  {
                "colors": 5,
                "ranks": 5,
                "players": 2,
                "max_information_tokens": 8,
                "max_life_tokens": 3,
                "observation_type": 1,
                "hand_size": 2
                }

            Hanabi-Small : {
                "colors": 5,
                "ranks": 5,
                "players": 2,
                "max_information_tokens":
                "max_life_tokens":
                "observation_type": 1}

            Hanabi-Very-Small : {
                "colors": 2,
                "ranks": 5,
                "players": 2,
                "max_information_tokens":
                "max_life_tokens":
                "observation_type": 1}

        """
        EzPickle.__init__(
            self,
            colors,
            ranks,
            players,
            hand_size,
            max_information_tokens,
            max_life_tokens,
            observation_type,
            seed,
            random_start_player,
        )

        seed = seeding.create_seed(seed, max_bytes=3)

        # ToDo: Starts
        # Check if all possible dictionary values are within certain ranges.
        self._raise_error_if_config_values_out_of_range(colors,
                                                        ranks,
                                                        players,
                                                        hand_size,
                                                        max_information_tokens,
                                                        max_life_tokens,
                                                        observation_type,
                                                        random_start_player)

        self._config = {
            'colors': colors,
            'ranks': ranks,
            'players': players,
            'hand_size': hand_size,
            'max_information_tokens': max_information_tokens,
            'max_life_tokens': max_life_tokens,
            'observation_type': observation_type,
            'random_start_player': random_start_player,
        }
        self.hanabi_env: HanabiEnv = HanabiEnv(config=self._config)

        # List of agent names
        self.agents = ["player_{}".format(i) for i in range(self.hanabi_env.players)]

        self.agent_selection: str

        # Sets hanabi game to clean state and updates all internal dictionaries
        self.reset(observe=False)

        # Set action_spaces and observation_spaces based on params in hanabi_env
        self.action_spaces = {name: spaces.Discrete(self.hanabi_env.num_moves()) for name in self.agents}
        self.observation_spaces = {player_name: spaces.Box(low=0,
                                                           high=1,
                                                           shape=(self.hanabi_env.vectorized_observation_shape()[0],),
                                                           dtype=np.float32)
                                   for player_name in self.agents}
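For reference, the Hanabi-Full values in the docstring map one-to-one onto the constructor keywords. A usage sketch, with env_cls standing in for whatever name this wrapper class is exported under (the listing omits it):

# env_cls is a hypothetical handle for the class whose __init__ is shown above.
env = env_cls(colors=5, ranks=5, players=2, hand_size=2,
              max_information_tokens=8, max_life_tokens=3,
              observation_type=1, seed=42, random_start_player=False)
print(env.agents)  # ['player_0', 'player_1'] for players=2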
Example #13
 def seed(self, seed: Optional[int] = None) -> List[int]:
     actual_seed = seeding.create_seed(seed)
     self.outer_env.inner_env.set_seed(actual_seed)
     return [actual_seed]
Example #14
def main():
    parser = argparse.ArgumentParser(description=None)
    #parser.add_argument('load', type=str)
    parser.add_argument('--normalize', type=str)
    parser.add_argument('--env', type=str, default='SunblazeCartPole-v0')
    parser.add_argument('--seed', type=int, help='RNG seed, defaults to random')
    parser.add_argument('--outdir', type=str)
    parser.add_argument('--max-episode-len', type=int, default=10000)
    parser.add_argument('--eval-n-trials', type=int, default=100)
    parser.add_argument('--episodes-per-trial', type=int, default=5)
    parser.add_argument('--eval-n-parallel', type=int, default=1)
    parser.add_argument('--record', action='store_true')
    parser.add_argument('load', type=str, nargs='*')
    args = parser.parse_args()

    # Fixes problem of eval script being run with ".../checkpoints/*"
    if len(args.load) > 1:
        import natsort
        print("Detected multiple model file args, sorting and choosing last..")
        # Fixes issue of 'normalize' file inside checkpoint folder
        args.load = [f for f in args.load if 'normalize' not in f]
        args.load = natsort.natsorted(args.load, reverse=True)[0]
        print("Using {}".format(args.load))
    else:
        args.load = args.load[0]

    # Create output directory
    os.makedirs(args.outdir, exist_ok=True)

    # If seed is unspecified, generate a pseudorandom one
    if not args.seed:
        # "Seed must be between 0 and 2**32 - 1"
        seed = create_seed(args.seed, max_bytes=4)
    else:
        seed = args.seed

    # Log it for reference
    with open(os.path.join(args.outdir, 'seed.txt'), 'w') as fout:
        fout.write("%d\n" % seed)

    set_global_seeds(seed)

    output_lock = multiprocessing.Lock()

    def evaluator(process_idx):
        def make_env():
            env = base.make_env(args.env, process_idx)
            env.seed(seed + process_idx)
            if args.record:
                env = VideoMonitor(env, args.outdir, video_callable=lambda _: True)
            return env

        env = DummyVecEnv([make_env])
        obs_space = env.observation_space
        act_space = env.action_space
        if len(act_space.shape) == 0:
            discrete = True
        else:
            discrete = False

        # TODO(cpacker): this should really be in the top-level dir
        norm_path = args.normalize if args.normalize else os.path.join(os.path.dirname(args.load), 'normalize')
        with open(norm_path, 'rb') as f:
            obs_norms = pickle.load(f)
        clipob = obs_norms['clipob']
        mean = obs_norms['mean']
        var = obs_norms['var']

        # Load model
        with U.make_session(num_cpu=1) as sess:
            if 'SpaceInvaders' in args.env or 'Breakout' in args.env:
                raise NotImplementedError
            else:
                # '.../checkpoint/XXXX' -> '.../make_model.pkl'
                pkl_path = os.path.join(
                    os.path.dirname(os.path.dirname(args.load)),
                    'make_model.pkl')
                # from: https://github.com/openai/baselines/issues/115
                print("[pidx %d] Constructing model from %s" % (process_idx, pkl_path))
                with open(pkl_path, 'rb') as fh:
                    import cloudpickle
                    make_model = cloudpickle.load(fh)
                model = make_model()
                print("[pidx %d] Loading saved model from %s" % (process_idx, args.load))
                model.load(args.load)

            # Unwrap DummyVecEnv to access mujoco.py object
            env_base = env.envs[0].unwrapped

            # Record a binary success measure if the env supports it
            if hasattr(env_base, 'is_success') and callable(getattr(env_base, 'is_success')):
                success_support = True
            else:
                print("[pidx %d] Warning: env does not support binary success, ignoring." % process_idx)
                success_support = False

            start = time.time()
            for t in range(args.eval_n_trials):
                progress_pct = 10
                if t > 0 and ((args.eval_n_trials < progress_pct)
                              or (t % (args.eval_n_trials//10) == 0)):
                    # Indicate progress every 10%
                    elapsed = time.time() - start
                    hours, rem = divmod(elapsed, 3600)
                    minutes, seconds = divmod(rem, 60)
                    print("[pidx %d] Trial %d/%d, elapsed: %d:%d:%d" %
                          (process_idx, t, args.eval_n_trials,
                           hours, minutes, seconds))

                obs = env.reset([True])
                state = model.initial_state
                if discrete:
                    action = -1
                    shape = (1,)
                else:
                    shape = (act_space.shape[0],)
                    action = np.zeros(shape, dtype=np.float32)

                rew = 0.0
                done = False
                mask = np.asarray([False])
                success = False
                # Reward for the specific episode in the trial
                all_episodes_rew = np.zeros(args.episodes_per_trial)

                for i in range(args.episodes_per_trial):
                    for _ in range(args.max_episode_len):
                        obs = np.clip((obs-mean) / np.sqrt(var), -clipob, clipob)
                        action = np.reshape(np.asarray([action]), shape)
                        action, value, state, _ = model.step(obs, state, action, np.reshape(np.asarray([rew]), (1,)), np.reshape(np.asarray([done]), (1,)), mask)
                        obs, rew, done, _ = env.step(action, [i==(args.episodes_per_trial-1)])
                        # The reward we report is from the final episode in the trial
                        all_episodes_rew[i] += rew
                        if i == (args.episodes_per_trial-1):
                            if success_support and env_base.is_success():
                                success = True
                            if done:
                                mask = np.asarray([True])
                        if done:
                            break

                with output_lock:
                    with open(os.path.join(args.outdir, 'evaluation.json'), 'a') as results_file:
                        results_file.write(json.dumps({
                            # For logging-sake, track the reward for each episode in the trial
                            'episode_rewards': all_episodes_rew,
                            # The 'reward' counted is still the reward of the final episode
                            'reward': all_episodes_rew[args.episodes_per_trial-1],
                            'success': success if success_support else 'N/A',
                            'environment': env_base.parameters,
                            'model': args.load,
                        }, cls=NumpyEncoder))
                        results_file.write('\n')

    misc.async.run_async(args.eval_n_parallel, evaluator)
Example #15
def main():
    parser = argparse.ArgumentParser(description=None)
    #parser.add_argument('load', type=str)
    parser.add_argument('--normalize', type=str)
    parser.add_argument('--env', type=str, default='SunblazeCartPole-v0')
    parser.add_argument('--seed',
                        type=int,
                        help='RNG seed, defaults to random')
    parser.add_argument('--outdir', type=str)
    parser.add_argument('--max-episode-len', type=int, default=10000)
    parser.add_argument('--eval-n-trials', type=int, default=100)
    parser.add_argument('--episodes-per-trial', type=int, default=1)
    parser.add_argument('--eval-n-parallel', type=int, default=1)
    parser.add_argument('--record', action='store_true')
    parser.add_argument('load', type=str, nargs='*')
    args = parser.parse_args()

    # Fixes problem of eval script being run with ".../checkpoints/*"
    if len(args.load) > 1:
        import natsort
        print("Detected multiple model file args, sorting and choosing last..")
        # Fixes issue of 'normalize' file inside checkpoint folder
        args.load = [f for f in args.load if 'normalize' not in f]
        args.load = natsort.natsorted(args.load, reverse=True)[0]
        print("Using {}".format(args.load))
    else:
        args.load = args.load[0]

    # ppo2 is trained on "total episodes" only but is evaluated via "trials"
    total_episodes = args.eval_n_trials * args.episodes_per_trial

    # Create output directory
    os.makedirs(args.outdir, exist_ok=True)

    # If seed is unspecified, generate a pseudorandom one
    if not args.seed:
        # "Seed must be between 0 and 2**32 - 1"
        seed = create_seed(args.seed, max_bytes=4)
    else:
        seed = args.seed

    # Log it for reference
    with open(os.path.join(args.outdir, 'seed.txt'), 'w') as fout:
        fout.write("%d\n" % seed)

    set_global_seeds(seed)

    output_lock = multiprocessing.Lock()

    def evaluator(process_idx):
        def make_env():
            env = base.make_env(args.env, process_idx)
            env.seed(seed + process_idx)
            if args.record:
                env = gym.wrappers.Monitor(env,
                                           args.outdir,
                                           video_callable=lambda _: True)
            return env

        env = DummyVecEnv([make_env])
        obs_space = env.observation_space
        act_space = env.action_space

        # TODO(cpacker): this should really be in the top-level dir
        norm_path = args.normalize if args.normalize else os.path.join(
            os.path.dirname(args.load), 'normalize')
        with open(norm_path, 'rb') as f:
            obs_norms = pickle.load(f)
        clipob = obs_norms['clipob']
        mean = obs_norms['mean']
        var = obs_norms['var']

        # Load model
        with U.make_session(num_cpu=1) as sess:

            if 'SpaceInvaders' in args.env or 'Breakout' in args.env:
                raise NotImplementedError
            else:
                # '.../checkpoint/XXXX' -> '.../make_model.pkl'
                pkl_path = os.path.join(
                    os.path.dirname(os.path.dirname(args.load)),
                    'make_model.pkl')
                # from: https://github.com/openai/baselines/issues/115
                print("Constructing model from " + pkl_path)
                with open(pkl_path, 'rb') as fh:
                    import cloudpickle
                    make_model = cloudpickle.load(fh)
                model = make_model()
                print("Loading saved model from " + args.load)
                model.load(args.load)

            # Unwrap DummyVecEnv to access mujoco.py object
            env_base = env.envs[0].unwrapped

            # Record a binary success measure if the env supports it
            if hasattr(env_base, 'is_success') and callable(
                    getattr(env_base, 'is_success')):
                success_support = True
            else:
                print(
                    "Warning: env does not support binary success, ignoring.")
                success_support = False

            for _ in range(total_episodes):
                obs, state, done = env.reset(), model.initial_state, False
                episode_rew = 0
                success = False
                for _ in range(args.max_episode_len):
                    obs = np.clip((obs - mean) / np.sqrt(var), -clipob,
                                  clipob)  # normalize
                    action, value, state, _ = model.step(
                        obs, state, np.reshape(np.asarray([done]), (1, )))
                    obs, rew, done, _ = env.step(action)
                    episode_rew += rew
                    if success_support and env_base.is_success():
                        success = True
                    if done:
                        break

                with output_lock:
                    with open(os.path.join(args.outdir, 'evaluation.json'),
                              'a') as results_file:
                        results_file.write(
                            json.dumps(
                                {
                                    'reward': episode_rew,
                                    'success':
                                    success if success_support else 'N/A',
                                    'environment': env_base.parameters,
                                    'model': args.load,
                                },
                                cls=NumpyEncoder))
                        results_file.write('\n')

    misc.async.run_async(args.eval_n_parallel, evaluator)
Example #16
def main():

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--env',
                        help='environment ID',
                        default='SunblazeCartPole-v0')
    parser.add_argument('--seed',
                        type=int,
                        help='RNG seed, defaults to random')
    parser.add_argument('--output', type=str)

    # parser.add_argument('--episodes-per-trial', type=int, default=5)
    # parser.add_argument('--trials', type=int, default=10 ** 4)
    # The total number of episodes is now trials*episodes_per_trial
    parser.add_argument('--total-episodes', type=int, default=5e4)

    parser.add_argument('--policy',
                        help='Policy architecture',
                        choices=['mlp', 'lstm'],
                        default='mlp')
    parser.add_argument('--processes', default=1, help='int or "max" for all')
    parser.add_argument('--reward-scale', type=float, default=1.0)

    # Hyperparameters
    parser.add_argument('--lr', type=float, default=7e-4)
    parser.add_argument('--nsteps', type=int, default=5)
    parser.add_argument('--ent-coef', type=float, default=1e-2)

    args = parser.parse_args()
    #total_episodes = args.trials * args.episodes_per_trial

    # Configure logger
    if args.output:
        try:
            os.makedirs(args.output)
        except OSError:
            pass

        logger.reset()
        logger.configure(dir=args.output)

    # If seed is unspecified, generate a pseudorandom one
    if not args.seed:
        # "Seed must be between 0 and 2**32 - 1"
        seed = create_seed(args.seed, max_bytes=4)
    else:
        seed = args.seed

    # Log it for reference
    with open(os.path.join(args.output, 'seed.txt'), 'w') as fout:
        fout.write("%d\n" % seed)

    if args.processes == 'max':
        ncpu = multiprocessing.cpu_count()
        # from: https://github.com/openai/baselines/blob/1f8a03f3a62367526f20215188fb5ea4b9ec27e0/baselines/ppo2/run_atari.py#L15
        if sys.platform == 'darwin': ncpu //= 2
    else:
        try:
            ncpu = int(args.processes)
        except ValueError:
            raise argparse.ArgumentTypeError("Invalid number of processes")

    train(
        args.env,
        total_episodes=args.total_episodes,
        policy=args.policy,
        lr=args.lr,
        num_processes=ncpu,
        rew_scale=args.reward_scale,
        seed=seed,
        nsteps=args.nsteps,
        ent_coef=args.ent_coef,  # default 0.01 in baselines, 0.0001 in chainer A3C
    )
Example #17
def main():

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--env',
                        help='environment ID',
                        default='SunblazeBreakout-v0')
    parser.add_argument('--seed',
                        type=int,
                        help='RNG seed, defaults to random')
    parser.add_argument('--output', type=str)
    parser.add_argument('--processes', default=1, help='int or "max" for all')
    # parser.add_argument('--num-timesteps', type=int, default=int(10e6))
    parser.add_argument('--total-episodes', type=int, default=int(5e4))
    parser.add_argument('--policy',
                        help='Policy architecture',
                        choices=['mlp', 'lstm'],
                        default='mlp')

    # Hyperparameters
    parser.add_argument('--lr', type=float, default=3e-4)
    parser.add_argument('--nsteps', type=int, default=2048)
    parser.add_argument('--nminibatches', type=int, default=32)

    args = parser.parse_args()

    # Configure logger
    if args.output:
        try:
            os.makedirs(args.output)
        except OSError:
            pass
        logger.reset()
        logger.configure(dir=args.output)

    # If seed is unspecified, generate a pseudorandom one
    if not args.seed:
        # "Seed must be between 0 and 2**32 - 1"
        seed = create_seed(args.seed, max_bytes=4)
    else:
        seed = args.seed

    # Log it for reference
    with open(os.path.join(args.output, 'seed.txt'), 'w') as fout:
        fout.write("%d\n" % seed)

    if args.processes == 'max':
        ncpu = multiprocessing.cpu_count()
        # from: https://github.com/openai/baselines/blob/1f8a03f3a62367526f20215188fb5ea4b9ec27e0/baselines/ppo2/run_atari.py#L15
        if sys.platform == 'darwin': ncpu //= 2
    else:
        try:
            ncpu = int(args.processes)
        except ValueError:
            raise argparse.ArgumentTypeError("Invalid number of processes")

    train(
        args.env,
        total_episodes=args.total_episodes,
        seed=seed,
        ncpu=ncpu,
        policy=args.policy,
        lr=args.lr,
        nsteps=args.nsteps,
        nminibatches=args.nminibatches,
    )
Example #18
    def __init__(self,
                 game,
                 num_players,
                 mode_num=None,
                 seed=None,
                 obs_type='rgb_image',
                 full_action_space=True,
                 max_frames=100000):
        """Frameskip should be either a tuple (indicating a random range to
        choose from, with the top value excluded), or an int."""
        EzPickle.__init__(self, game, num_players, mode_num, seed, obs_type,
                          full_action_space, max_frames)

        assert obs_type in ('ram', 'rgb_image', 'grayscale_image'), \
            "obs_type must be either 'ram', 'rgb_image' or 'grayscale_image'"
        self.obs_type = obs_type
        self.full_action_space = full_action_space
        self.num_players = num_players
        self.max_frames = max_frames

        multi_agent_ale_py.ALEInterface.setLoggerMode("error")
        self.ale = multi_agent_ale_py.ALEInterface()

        if seed is None:
            seed = seeding.create_seed(seed, max_bytes=4)

        self.ale.setInt(b"random_seed", seed)
        self.ale.setFloat(b'repeat_action_probability', 0.)

        pathstart = os.path.dirname(multi_agent_ale_py.__file__)
        final_path = os.path.join(pathstart, "ROM", game, game + ".bin")
        if not os.path.exists(final_path):
            raise IOError(
                "rom {} is not installed. Please install roms using AutoROM tool (https://github.com/PettingZoo-Team/AutoROM)"
                .format(game))

        self.ale.loadROM(final_path)

        all_modes = self.ale.getAvailableModes(num_players)

        if mode_num is None:
            mode = all_modes[0]
        else:
            mode = mode_num
            assert mode in all_modes, "mode_num parameter is wrong. Mode {} selected, only {} modes are supported".format(
                mode_num, str(list(all_modes)))

        self.ale.setMode(mode)
        assert num_players == self.ale.numPlayersActive()

        if full_action_space:
            action_size = 18
            action_mapping = np.arange(action_size)
        else:
            action_mapping = self.ale.getMinimalActionSet()
            action_size = len(action_mapping)

        self.action_mapping = action_mapping

        if obs_type == 'ram':
            observation_space = gym.spaces.Box(low=0,
                                               high=255,
                                               dtype=np.uint8,
                                               shape=(128, ))
        else:
            (screen_width, screen_height) = self.ale.getScreenDims()
            if obs_type == 'rgb_image':
                num_channels = 3
            elif obs_type == 'grayscale_image':
                num_channels = 1
            observation_space = spaces.Box(low=0,
                                           high=255,
                                           shape=(screen_height, screen_width,
                                                  num_channels),
                                           dtype=np.uint8)

        self.num_agents = num_players
        player_names = ["first", "second", "third", "fourth"]
        self.agents = [f"{player_names[n]}_0" for n in range(self.num_agents)]

        self.action_spaces = [gym.spaces.Discrete(action_size)
                              ] * self.num_agents
        self.observation_spaces = [observation_space] * self.num_agents

        self._screen = None
Example #19
 def seed(self, seed=None):
     """Set the seed for the env's random number
   generator"""
     seed = seeding.create_seed(seed)
     self._random.seed(seed)
     return [seed] + self._rew.seed(self._random.randint(2**32))
Example #20
 def seed(self, seed=None):
     seed = seeding.create_seed(seed)
     np.random.seed(seed)
Example #21
def train_maml_like_ppo_(
    init_model,
    args,
    learning_rate,
    num_episodes=20,
    num_updates=1,
    vis=False,
    run_idx=0,
    use_linear_lr_decay=False,
):
    num_steps = num_episodes * 100

    torch.set_num_threads(1)
    device = torch.device("cpu")

    envs = make_vec_envs(ENV_NAME, seeding.create_seed(None), NUM_PROC,
                         args.gamma, None, device, allow_early_resets=True, normalize=args.norm_vectors)
    raw_env = navigation_2d.unpeele_navigation_env(envs, 0)

    # raw_env.set_arguments(args.rm_nogo, args.reduce_goals, True, args.large_nogos)
    new_task = raw_env.sample_tasks(run_idx)
    raw_env.reset_task(new_task[0])

    # actor_critic = Policy(
    #     envs.observation_space.shape,
    #     envs.action_space,
    #     base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic = copy.deepcopy(init_model)
    actor_critic.to(device)

    agent = algo.PPO(
        actor_critic,
        args.clip_param,
        args.ppo_epoch,
        args.num_mini_batch,
        args.value_loss_coef,
        args.entropy_coef,
        lr=learning_rate,
        eps=args.eps,
        max_grad_norm=args.max_grad_norm)

    rollouts = RolloutStorage(num_steps, NUM_PROC,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    fitnesses = []

    for j in range(num_updates):

        # if args.use_linear_lr_decay:
        #    # decrease learning rate linearly
        #    utils.update_linear_schedule(
        #        agent.optimizer, j, num_updates,
        #        agent.optimizer.lr if args.algo == "acktr" else args.lr)
        min_c_rew = float("inf")
        vis = []
        offending = []
        for step in range(num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)
            if done[0]:
                c_rew = infos[0]["cummulative_reward"]
                vis.append((infos[0]['path'], infos[0]['goal']))
                offending.extend(infos[0]['offending'])
                if c_rew < min_c_rew:
                    min_c_rew = c_rew
            # If done then clean the history of observations.
            masks = torch.FloatTensor(
                [[0.0] if done_ else [1.0] for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.gae_lambda, args.use_proper_time_limits)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        ob_rms = utils.get_vec_normalize(envs)
        if ob_rms is not None:
            ob_rms = ob_rms.ob_rms

        fits, info = evaluate(actor_critic, ob_rms, envs, NUM_PROC, device)
        print(f"fitness {fits} update {j+1}")
        if (j+1) % 1 == 0:
            vis_path(vis, eval_path_rec=info['path'], offending=offending)
        fitnesses.append(fits)

    return fitnesses[-1], info[0]['reached'], None
Example #22
 def seed(self, seed=None):
     if seed is None:
         seed = seeding.create_seed(seed, max_bytes=4)
     self.ale.setInt(b"random_seed", seed)
     self.ale.loadROM(self.rom_path)
     self.ale.setMode(self.mode)
Example #23
 def seed(self, seed=None):
     seed = seeding.create_seed(seed)
     self.rng_key = jax.random.PRNGKey(seed)
     return seed
Example #24
 def seed(self, seed=None):
     """Set the seed for the random number generator"""
     seed = seeding.create_seed(seed)
     self._random.seed(seed)
     return [seed]
Example #25
    def __init__(self,
                 colors: int = 5,
                 ranks: int = 5,
                 players: int = 2,
                 hand_size: int = 2,
                 max_information_tokens: int = 8,
                 max_life_tokens: int = 3,
                 observation_type: int = 1,
                 seed=None,
                 random_start_player: bool = False,
                 ):

        """
        Parameter descriptions:
              - colors: int, Number of colors in [2,5].
              - ranks: int, Number of ranks in [2,5].
              - players: int, Number of players in [2,5].
              - hand_size: int, Hand size in [2,5].
              - max_information_tokens: int, Number of information tokens (>=0).
              - max_life_tokens: int, Number of life tokens (>=1).
              - observation_type: int.
                    0: Minimal observation.
                    1: First-order common knowledge observation.
              - seed: int, Random seed or None.
              - random_start_player: bool, Random start player.

        Common game configurations:
            Hanabi-Full (default) :  {
                "colors": 5,
                "ranks": 5,
                "players": 2,
                "max_information_tokens": 8,
                "max_life_tokens": 3,
                "observation_type": 1,
                "hand_size": 2
                }

            Hanabi-Small : {
                "colors": 5,
                "ranks": 5,
                "players": 2,
                "max_information_tokens":
                "max_life_tokens":
                "observation_type": 1}

            Hanabi-Very-Small : {
                "colors": 2,
                "ranks": 5,
                "players": 2,
                "max_information_tokens":
                "max_life_tokens":
                "observation_type": 1}

        """
        EzPickle.__init__(
            self,
            colors,
            ranks,
            players,
            hand_size,
            max_information_tokens,
            max_life_tokens,
            observation_type,
            seed,
            random_start_player,
        )

        seed = seeding.create_seed(seed, max_bytes=3)

        # Import Hanabi and raise an error if the PyPI package is not installed correctly.
        try:
            from hanabi_learning_environment.rl_env import HanabiEnv, make

        except ModuleNotFoundError:
            raise ImportError(
                "Hanabi is not installed.\n"
                "Run ´pip3 install hanabi_learning_environment´ from within your project environment.\n"
                "Consult hanabi/README.md for detailed information."
            )

        else:

            # ToDo: Starts
            # Check if all possible dictionary values are within certain ranges.
            self._raise_error_if_config_values_out_of_range(colors,
                                                            ranks,
                                                            players,
                                                            hand_size,
                                                            max_information_tokens,
                                                            max_life_tokens,
                                                            observation_type,
                                                            random_start_player)

            self.hanabi_env: HanabiEnv = HanabiEnv(config={'colors': colors,
                                                           'ranks': ranks,
                                                           'players': players,
                                                           'hand_size': hand_size,
                                                           'max_information_tokens': max_information_tokens,
                                                           'max_life_tokens': max_life_tokens,
                                                           'observation_type': observation_type,
                                                           'random_start_player': random_start_player,
                                                           'seed': seed})

            # List of agent names
            self.agents = ["player_{}".format(i) for i in range(self.hanabi_env.players)]

            self.agent_selection: str

            # Sets hanabi game to clean state and updates all internal dictionaries
            self.reset(observe=False)

            # Set action_spaces and observation_spaces based on params in hanabi_env
            self.action_spaces = {name: spaces.Discrete(self.hanabi_env.num_moves()) for name in self.agents}
            self.observation_spaces = {player_name: spaces.Box(low=0,
                                                               high=1,
                                                               shape=(self.hanabi_env.vectorized_observation_shape()[0],),
                                                               dtype=np.float32)
                                       for player_name in self.agents}
Example #26
def create_seed(seed=None, max_bytes=8):
    return seeding.create_seed(seed, max_bytes=max_bytes)
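Most of the examples on this page follow the same pattern: derive an integer seed with create_seed (optionally capped via max_bytes so it fits RNGs that require values below 2**32) and hand it to the environment's random number generator. A condensed sketch of that pattern using gym's seeding helpers and NumPy, mirroring Examples #8 and #28:

import numpy as np
from gym.utils import seeding

def make_rng(seed=None):
    # Returns a seeded RandomState plus the seed actually used.
    seed = seeding.create_seed(seed, max_bytes=4)  # 4 bytes keeps it below 2**32
    return np.random.RandomState(seed), seed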
Example #27
 def seed(self, seed=None):
     if seed is None:
         self.env._seed = seeding.create_seed(seed, max_bytes=4)
     else:
         self.env._seed = seed
     self.env.full_restart()
Example #28
 def seed(self, seed=None):
     if seed is None:
         seed = seeding.create_seed(max_bytes=4)
     self.__nprandom = np.random.RandomState(seed)
     return [seed]
Example #29
 def seed(self, seed=None):
     if seed is None:
         seed = seeding.create_seed(seed, max_bytes=4)
     self.env.set_seed(seed)
Example #30
                        default=100,
                        type=int,
                        help='Specify batch size')
    parser.add_argument('--max_iters',
                        dest='max_iters',
                        default=2000,
                        type=int,
                        help='Maximum number of iterations')
    parser.add_argument('--filepath',
                        dest='filepath',
                        default='experiments',
                        type=str,
                        help='Where to save the data')
    parser.add_argument('--random_seed',
                        dest='random_seed',
                        default=seeding.create_seed(),
                        type=int,
                        help='Random seed')
    parser.add_argument('--experiment_class',
                        dest='experiment_class',
                        default=list(AVAILABLE_EXPERIMENTS.keys())[0],
                        type=str,
                        help='type of experiment: ' +
                        ', '.join(AVAILABLE_EXPERIMENTS.keys()))

    args = parser.parse_args()

    maybe_make_dir(args.filepath)

    run(**vars(args))