def __init__(self, t_final: int = 1000):
    """Main ran_simulation."""
    sim_param = SimParam(t_final)
    self.initialize_spaces(sim_param)

    # other attributes of ran_environment
    self.state = None
    self.sim_param = sim_param
    # self.C_algo = 'RL'
    self.slice_scores = None  # slice scores for reward method 3
    self.user_scores = None

    # generate seed values
    new_seed = seeding.create_seed()
    self.sim_param.update_seeds(new_seed)

    # initialize SD_RAN_Controller
    self.SD_RAN_Controller = Controller(self.sim_param)

    # data
    self.user_score_arr = None
    self.slice_score_arr = None
    self.reward_hist = None
    self.cost_tp_hist = None
    self.cost_bp_hist = None
    self.cost_delay_hist = None
    self.reset_counter = 0
    columns = 'reward_hist slice_score_0 slice_score_1 slice_score_2'
    self.env_df = pd.DataFrame(columns=columns.split())
def __init__(self, env, active_handles, names, map_size, seed=None):
    self.map_size = map_size
    self.env = env
    self.handles = active_handles
    if seed is None:
        seed = seeding.create_seed(seed, max_bytes=4)
    env.set_seed(seed)
    env.reset()
    self.generate_map()
    self.team_sizes = team_sizes = [env.get_num(handle) for handle in self.handles]
    self.agents = [
        f"{names[j]}_{i}"
        for j in range(len(team_sizes))
        for i in range(team_sizes[j])
    ]
    self.num_agents = sum(team_sizes)
    num_actions = [env.get_action_space(handle)[0] for handle in self.handles]
    self.action_spaces = [
        Discrete(num_actions[j])
        for j in range(len(team_sizes))
        for i in range(team_sizes[j])
    ]
    # may change depending on environment config? Not sure.
    team_obs_shapes = self._calc_obs_shapes()
    self.observation_spaces = [
        Box(low=0., high=2., shape=team_obs_shapes[j], dtype=np.float32)
        for j in range(len(team_sizes))
        for i in range(team_sizes[j])
    ]
    self._renderer = None
def seed(self, seed=None):
    if seed is None:
        self.seed_ = create_seed()
    else:
        self.seed_ = hash_seed(seed)
    random.seed(self.seed_)
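The `create_seed` / `hash_seed` pair above comes from pre-0.26 `gym.utils.seeding`; `hash_seed` pushes a user-supplied seed through SHA-512 so that adjacent choices such as 1 and 2 do not yield correlated RNG streams. A minimal, runnable sketch of the same pattern:

import random
from gym.utils.seeding import create_seed, hash_seed  # pre-0.26 gym

seed = None  # e.g. taken from a CLI flag
if seed is None:
    seed_ = create_seed()    # fresh entropy from the OS
else:
    seed_ = hash_seed(seed)  # decorrelate user-chosen seeds like 0, 1, 2
random.seed(seed_)
print(seed_)  # log the seed actually used, for reproducibility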
def generate_random_maze(width, height, algorithm='kruskal', rng=None, seed=None):
    # Use identity checks so that 0 is accepted as a valid seed.
    if rng is None:
        if seed is None:
            seed = create_seed()
        (rng, _) = np_random(seed)
    return _generate_random_maze(width, height, algorithm, rng)
def extend_meta_data(config):
    """
    Extends the meta-data dictionary of the file to save additional
    information at training time.

    :param config: (dict) The config dictionary.
    :return: (dict) The updated config dictionary.
    """
    extended_info = {
        "timestamp": util.get_timestamp(),
        "seed": config['meta'].get("seed", seeding.create_seed(max_bytes=4)),
    }
    config['meta'].update(extended_info)
    return config
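One subtlety in the snippet above (plain Python semantics, not specific to this codebase): the fallback argument of `dict.get` is evaluated eagerly, so `create_seed` draws entropy even when the config already contains a seed; the fresh value is simply discarded. A small demonstration with a stand-in `make_seed` (hypothetical name):

calls = []

def make_seed():
    """Stand-in for seeding.create_seed(max_bytes=4)."""
    calls.append(1)  # record that entropy was drawn
    return 1234

meta = {"seed": 42}
seed = meta.get("seed", make_seed())
print(seed)        # 42 -- the existing seed wins
print(len(calls))  # 1  -- but make_seed() was still called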
def __init__(self, config=None, n_eval_episodes=32, deterministic=True,
             render=False, eval_method="normal", env=None, seed=None):
    self.eval_method = eval_method
    self.config = config
    self.n_eval_episodes = n_eval_episodes
    self.deterministic = deterministic
    self.render = render

    if eval_method in ["normal", "slow"]:
        assert config, "You must provide an environment configuration if the eval_method is not fast!"
        test_config = deepcopy(config)
        test_env_config = test_config['env']
        if eval_method == "slow":
            test_env_config['n_envs'] = 1
        if not test_env_config.get('n_envs', None) and eval_method != "slow":
            test_env_config['n_envs'] = 8
        if not seed:
            seed = create_seed()
        if test_env_config['n_envs'] > 32:
            test_env_config['n_envs'] = 32
        test_env_config['curiosity'] = False  # TODO: Sync train and test curiosity wrappers and re-enable
        # Disable dynamic episode length on evaluation to get comparable
        # test results independent of rewards.
        if test_env_config.get("reward_kwargs", None):
            if test_env_config["reward_kwargs"].get("dynamic_episode_length", False):
                test_env_config["reward_kwargs"]["dynamic_episode_length"] = False
        self.test_env = create_environment(test_config, seed, evaluation=True)
        self.eval_wrapper = unwrap_env(self.test_env, VecEvaluationWrapper, EvaluationWrapper)
    elif eval_method == "fast":
        assert env, "You must provide an environment with an EvaluationWrapper if the eval_method is fast!"
        self.test_env = None
        self.eval_wrapper = unwrap_env(env, VecEvaluationWrapper, EvaluationWrapper)
    else:
        raise AttributeError("Unknown eval method '{}'".format(eval_method))
def __init__(self, seed=0, start_variance=0.1, goal_state_pos='variable',
             goal_state_access=True):
    '''
    Multi-task (population) version of the Gym Reacher environment.

    Args:
        seed: seed for the RNG that determines the goal position.
        start_variance: variance of the starting position for the arm.
        goal_state_pos: if 'fixed', the goal position is static across reset().
        goal_state_access: whether the goal position is included in the state.
    '''
    self._start_variance = start_variance
    self._goal_state_pos = goal_state_pos
    self._goal_state_access = goal_state_access
    utils.EzPickle.__init__(self)
    mujoco_env.MujocoEnv.__init__(self, 'reacher.xml', 2)
    self._goal_rng = np.random.RandomState(seeding.create_seed(seed))
    if self._goal_state_pos == 'fixed':
        self._reset_goal()
def __init__(self, params, num_balls=None, particle_size=0.05, ctrl=0.1, seed=0):
    seed = seeding.create_seed(seed)
    rng = np.random.RandomState(seed)
    self.rewards = create_reward(params, rng)
    if num_balls is None:
        num_balls = len(self.rewards)
    self.num_targets = num_balls
    self.particle_size = particle_size
    self.ctrl = ctrl
    num_cats = len(self.rewards)
    assert num_cats >= num_balls
    model = billiards_model(num_cats, particle_size=particle_size)
    with model.asfile() as f:
        MujocoEnv.__init__(self, f.name, 5)
    utils.EzPickle.__init__(self, params, num_balls, particle_size, seed)
def __init__(self, start_variance=0.1, wall_seed=0, wall_penalty=5,
             wall_state_access=False):
    self._start_variance = start_variance
    self._wall_state_access = wall_state_access
    if wall_seed is not None:
        wall_rng = np.random.RandomState(seeding.create_seed(wall_seed))
        self._wall_angle = np.pi * (2 * wall_rng.rand() - 1)
        x = np.cos(self._wall_angle)
        y = np.sin(self._wall_angle)
        params = {
            'XS': x * 0.08,
            'YS': y * 0.08,
            'XE': x * 0.21,
            'YE': y * 0.21
        }
        self._wall_penalty = wall_penalty
    else:
        self._wall_angle = 0
        self._wall_penalty = 0
        params = {'XS': -1, 'YS': -1, 'XE': -0.99, 'YE': -0.99}
    model_path = os.path.join(os.path.dirname(__file__), 'reacher_wall.xml')
    with open(model_path, 'r') as model:
        model_xml = model.read()
    for k, v in params.items():
        model_xml = model_xml.replace(k, str(v))
    utils.EzPickle.__init__(self)
    with tempfile.NamedTemporaryFile(mode='w', suffix='.xml') as f:
        f.write(model_xml)
        f.flush()
        mujoco_env.MujocoEnv.__init__(self, f.name, 2)
def seed(self, seed=None):
    if seed is None:
        seed = create_seed()
    (self.rng, self.seed) = np_random(seed)
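In pre-0.26 gym, `np_random` returns both the constructed RNG and the seed that was actually used, which is what the `(self.rng, self.seed)` unpacking above relies on:

from gym.utils.seeding import create_seed, np_random  # pre-0.26 gym

rng, seed_used = np_random(create_seed(max_bytes=4))
print(seed_used)           # persist this to reproduce the run later
print(rng.randint(0, 10))  # rng behaves like np.random.RandomState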
def __init__(self,
             layout=None,
             start=(0, 0),
             goal=None,
             width=0,
             height=0,
             algorithm='kruskal',
             rng=None,
             seed=None,
             max_steps=10000,
             goal_reward=100.0,
             step_reward=-1.0,
             hit_wall_reward=-5.0,
             agent_image=None):
    """Create a new FixedMazeEnvironment.

    Arguments are:

    layout          numpy.ndarray; A 2D array of integers indicating
                    connectivity between maze cells. Each cell in the array
                    indicates its connectivity to the cell to the right
                    (increasing x) and to the cell above (increasing y) by
                    setting the appropriate bit. Connectivity to the cells
                    to the left or below is computed by going to that cell
                    and checking its connectivity to the cell to its right
                    or above. This removes redundancy in the representation.
                    If bit 0 is set in the cell's value, it is connected to
                    the cell to the right. If bit 1 is set, it is connected
                    to the cell above. If the layout is None, the
                    environment will generate a random maze with dimensions
                    specified by the "width" and "height" arguments using
                    the algorithm specified by the "algorithm" argument.
    start           tuple(int, int); (x, y) coordinates where the agent
                    starts. If the start and the goal are coincident, the
                    starting location will be randomized.
    goal            tuple(int, int); (x, y) coordinates of the goal; if
                    None, the goal will be set randomly.
    width           int; Width of randomly-generated mazes, in squares.
    height          int; Height of randomly-generated mazes, in squares.
    algorithm       str; Algorithm used to generate random mazes. The
                    currently supported algorithm is:
                        kruskal   Kruskal's minimal spanning tree algorithm
    rng             np.random.RandomState or equivalent; the environment's
                    source of random numbers. If None, the environment will
                    create its own RNG using gym.utils.seeding.np_random().
    seed            int; If the "rng" argument is None, this seed will be
                    used to create and seed the environment's RNG. If the
                    "rng" argument is not None, this argument should contain
                    the value used to seed that RNG. If the "seed" argument
                    is None, the environment will create its own seed.
    max_steps       int; Maximum number of steps the agent can take before
                    the episode terminates in failure.
    goal_reward     float; Reward the agent receives upon reaching the goal.
    step_reward     float; Reward (usually negative) the agent receives for
                    each step that does not contact a wall.
    hit_wall_reward float; Reward (usually negative) the agent receives
                    when it bumps into a wall.
    agent_image     str, unicode or pygame.Surface; Image to use for the
                    agent when the environment is rendered in "human" mode.
                    If this argument is a str or unicode, it specifies the
                    name of a file with the image to use for the agent. If
                    this argument is a pygame.Surface, it contains the
                    image itself.
    """
    gym.Env.__init__(self)
    if rng:
        if seed is None:
            raise ValueError('If rng is not None, seed cannot be None')
        self.rng = rng
        self.seed = seed
    else:
        if seed is None:
            seed = create_seed()
        (self.rng, self.seed) = np_random(seed)
    if layout is not None:
        if len(layout.shape) != 2:
            raise ValueError('Maze layout must be a 2D array')
        self._layout = layout
    elif width < 1:
        raise ValueError('width must be > 0')
    elif height < 1:
        raise ValueError('height must be > 0')
    else:
        self._layout = _generate_random_maze(width, height, algorithm, self.rng)
    if not goal:
        goal = (self.rng.randint(0, self._layout.shape[1]),
                self.rng.randint(0, self._layout.shape[0]))
    self.goal = goal
    self.max_steps = max_steps
    self.goal_reward = goal_reward
    self.step_reward = step_reward
    self.hit_wall_reward = hit_wall_reward
    while self.goal == start:
        start = (self.rng.randint(0, self._layout.shape[1]),
                 self.rng.randint(0, self._layout.shape[0]))
    self.start = start
    bounds = (self._layout.shape[1], self._layout.shape[0])
    self.observation_space = spaces.MultiDiscrete(bounds)
    self.reward_range = (-5.0 * max_steps, 100)
    self._delta_x = [0, 1, 0, -1]
    self._delta_y = [1, 0, -1, 0]
    self._can_move = [
        self._can_move_north, self._can_move_east,
        self._can_move_south, self._can_move_west
    ]
    self._walls1 = (' *', '  ', ' *', '  ', '#*', '# ', '#*', '# ',
                    '$*', '$ ', '$*', '$ ', '!*', '! ', '!*', '! ')
    self._walls2 = ('**', '**', ' *', ' *')
    self._renderer = None
    self.reset()
    self._agent_image = agent_image
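The bit-encoded layout documented above is easiest to see on a toy maze. A sketch that decodes a hand-built 2x2 layout; the helper `describe_cell` is hypothetical, introduced only for illustration:

import numpy as np

# 2x2 maze, indexed layout[y][x].
# Bit 0 (value 1): passage to the cell on the right (increasing x).
# Bit 1 (value 2): passage to the cell above (increasing y).
layout = np.array([
    [1, 2],  # y=0: (0,0) opens right to (1,0); (1,0) opens up to (1,1)
    [1, 0],  # y=1: (0,1) opens right to (1,1); (1,1) stores no openings
])

def describe_cell(layout, x, y):
    """Hypothetical helper: list the openings encoded in one cell."""
    value = int(layout[y][x])
    openings = []
    if value & 1:
        openings.append('right')
    if value & 2:
        openings.append('above')
    return openings

for y in range(layout.shape[0]):
    for x in range(layout.shape[1]):
        print((x, y), describe_cell(layout, x, y))
# Passages to the left or below are recovered from the neighbouring cell's
# bits, so each wall is stored exactly once.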
def __init__(self,
             colors: int = 5,
             ranks: int = 5,
             players: int = 2,
             hand_size: int = 2,
             max_information_tokens: int = 8,
             max_life_tokens: int = 3,
             observation_type: int = 1,
             seed=None,
             random_start_player: bool = False,
             ):
    """
    Parameter descriptions:
     - colors: int, Number of colors in [2,5].
     - ranks: int, Number of ranks in [2,5].
     - players: int, Number of players in [2,5].
     - hand_size: int, Hand size in [2,5].
     - max_information_tokens: int, Number of information tokens (>=0).
     - max_life_tokens: int, Number of life tokens (>=1).
     - observation_type: int.
         0: Minimal observation.
         1: First-order common knowledge observation.
     - seed: int, Random seed or None.
     - random_start_player: bool, Random start player.

    Common game configurations:
        Hanabi-Full (default): {
            "colors": 5, "ranks": 5, "players": 2, "hand_size": 2,
            "max_information_tokens": 8, "max_life_tokens": 3,
            "observation_type": 1 }
        Hanabi-Small: {
            "colors": 5, "ranks": 5, "players": 2,
            "max_information_tokens": ..., "max_life_tokens": ...,
            "observation_type": 1 }
        Hanabi-Very-Small: {
            "colors": 2, "ranks": 5, "players": 2,
            "max_information_tokens": ..., "max_life_tokens": ...,
            "observation_type": 1 }
    """
    EzPickle.__init__(
        self,
        colors,
        ranks,
        players,
        hand_size,
        max_information_tokens,
        max_life_tokens,
        observation_type,
        seed,
        random_start_player,
    )

    seed = seeding.create_seed(seed, max_bytes=3)

    # ToDo: Starts
    # Check if all possible dictionary values are within certain ranges.
    self._raise_error_if_config_values_out_of_range(colors,
                                                    ranks,
                                                    players,
                                                    hand_size,
                                                    max_information_tokens,
                                                    max_life_tokens,
                                                    observation_type,
                                                    random_start_player)

    self._config = {
        'colors': colors,
        'ranks': ranks,
        'players': players,
        'hand_size': hand_size,
        'max_information_tokens': max_information_tokens,
        'max_life_tokens': max_life_tokens,
        'observation_type': observation_type,
        'random_start_player': random_start_player,
    }
    self.hanabi_env: HanabiEnv = HanabiEnv(config=self._config)

    # List of agent names
    self.agents = ["player_{}".format(i) for i in range(self.hanabi_env.players)]
    self.agent_selection: str

    # Sets hanabi game to clean state and updates all internal dictionaries
    self.reset(observe=False)

    # Set action_spaces and observation_spaces based on params in hanabi_env
    self.action_spaces = {name: spaces.Discrete(self.hanabi_env.num_moves())
                          for name in self.agents}
    self.observation_spaces = {
        player_name: spaces.Box(
            low=0,
            high=1,
            shape=(self.hanabi_env.vectorized_observation_shape()[0],),
            dtype=np.float32)
        for player_name in self.agents}
def seed(self, seed: Optional[int] = None) -> List[int]:
    actual_seed = seeding.create_seed(seed)
    self.outer_env.inner_env.set_seed(actual_seed)
    return [actual_seed]
def main():
    parser = argparse.ArgumentParser(description=None)
    # parser.add_argument('load', type=str)
    parser.add_argument('--normalize', type=str)
    parser.add_argument('--env', type=str, default='SunblazeCartPole-v0')
    parser.add_argument('--seed', type=int, help='RNG seed, defaults to random')
    parser.add_argument('--outdir', type=str)
    parser.add_argument('--max-episode-len', type=int, default=10000)
    parser.add_argument('--eval-n-trials', type=int, default=100)
    parser.add_argument('--episodes-per-trial', type=int, default=5)
    parser.add_argument('--eval-n-parallel', type=int, default=1)
    parser.add_argument('--record', action='store_true')
    parser.add_argument('load', type=str, nargs='*')
    args = parser.parse_args()

    # Fixes problem of eval script being run with ".../checkpoints/*"
    if len(args.load) > 1:
        import natsort
        print("Detected multiple model file args, sorting and choosing last..")
        # Fixes issue of 'normalize' file inside checkpoint folder
        args.load = [f for f in args.load if 'normalize' not in f]
        args.load = natsort.natsorted(args.load, reverse=True)[0]
        print("Using {}".format(args.load))
    else:
        args.load = args.load[0]

    # Create output directory
    os.makedirs(args.outdir, exist_ok=True)

    # If seed is unspecified, generate a pseudorandom one
    if not args.seed:
        # "Seed must be between 0 and 2**32 - 1"
        seed = create_seed(args.seed, max_bytes=4)
    else:
        seed = args.seed
    # Log it for reference
    with open(os.path.join(args.outdir, 'seed.txt'), 'w') as fout:
        fout.write("%d\n" % seed)
    set_global_seeds(seed)

    output_lock = multiprocessing.Lock()

    def evaluator(process_idx):
        def make_env():
            env = base.make_env(args.env, process_idx)
            env.seed(seed + process_idx)
            if args.record:
                env = VideoMonitor(env, args.outdir, video_callable=lambda _: True)
            return env

        env = DummyVecEnv([make_env])
        obs_space = env.observation_space
        act_space = env.action_space
        if len(act_space.shape) == 0:
            discrete = True
        else:
            discrete = False

        # TODO(cpacker): this should really be in the top-level dir
        norm_path = args.normalize if args.normalize else os.path.join(
            os.path.dirname(args.load), 'normalize')
        with open(norm_path, 'rb') as f:
            obs_norms = pickle.load(f)
        clipob = obs_norms['clipob']
        mean = obs_norms['mean']
        var = obs_norms['var']

        # Load model
        with U.make_session(num_cpu=1) as sess:
            if 'SpaceInvaders' in args.env or 'Breakout' in args.env:
                raise NotImplementedError
            else:
                # '.../checkpoint/XXXX' -> '.../make_model.pkl'
                pkl_path = os.path.join(
                    os.path.dirname(os.path.dirname(args.load)), 'make_model.pkl')
                # from: https://github.com/openai/baselines/issues/115
                print("[pidx %d] Constructing model from %s" % (process_idx, pkl_path))
                with open(pkl_path, 'rb') as fh:
                    import cloudpickle
                    make_model = cloudpickle.load(fh)
                model = make_model()
                print("[pidx %d] Loading saved model from %s" % (process_idx, args.load))
                model.load(args.load)

            # Unwrap DummyVecEnv to access mujoco.py object
            env_base = env.envs[0].unwrapped

            # Record a binary success measure if the env supports it
            if hasattr(env_base, 'is_success') and callable(getattr(env_base, 'is_success')):
                success_support = True
            else:
                print("[pidx %d] Warning: env does not support binary success, ignoring."
                      % process_idx)
                success_support = False

            start = time.time()
            for t in range(args.eval_n_trials):
                progress_pct = 10
                if t > 0 and ((args.eval_n_trials < progress_pct)
                              or (t % (args.eval_n_trials // 10) == 0)):
                    # Indicate progress every 10%
                    elapsed = time.time() - start
                    hours, rem = divmod(elapsed, 3600)
                    minutes, seconds = divmod(rem, 60)
                    print("[pidx %d] Trial %d/%d, elapsed: %d:%d:%d"
                          % (process_idx, t, args.eval_n_trials, hours, minutes, seconds))

                obs = env.reset([True])
                state = model.initial_state
                if discrete:
                    action = -1
                    shape = (1,)
                else:
                    shape = (act_space.shape[0],)
                    action = np.zeros(shape, dtype=np.float32)
                rew = 0.0
                done = False
                mask = np.asarray([False])
                success = False

                # Reward for each specific episode in the trial
                all_episodes_rew = np.zeros(args.episodes_per_trial)
                for i in range(args.episodes_per_trial):
                    for _ in range(args.max_episode_len):
                        obs = np.clip((obs - mean) / np.sqrt(var), -clipob, clipob)
                        action = np.reshape(np.asarray([action]), shape)
                        action, value, state, _ = model.step(
                            obs, state, action,
                            np.reshape(np.asarray([rew]), (1,)),
                            np.reshape(np.asarray([done]), (1,)),
                            mask)
                        obs, rew, done, _ = env.step(
                            action, [i == (args.episodes_per_trial - 1)])
                        # The reward we report is from the final episode in the trial
                        all_episodes_rew[i] += rew
                        if i == (args.episodes_per_trial - 1):
                            if success_support and env_base.is_success():
                                success = True
                        if done:
                            mask = np.asarray([True])
                            break

                with output_lock:
                    with open(os.path.join(args.outdir, 'evaluation.json'), 'a') as results_file:
                        results_file.write(json.dumps({
                            # For logging's sake, track the reward for each episode in the trial
                            'episode_rewards': all_episodes_rew,
                            # The 'reward' counted is still the reward of the final episode
                            'reward': all_episodes_rew[args.episodes_per_trial - 1],
                            'success': success if success_support else 'N/A',
                            'environment': env_base.parameters,
                            'model': args.load,
                        }, cls=NumpyEncoder))
                        results_file.write('\n')

    misc.async.run_async(args.eval_n_parallel, evaluator)
def main():
    parser = argparse.ArgumentParser(description=None)
    # parser.add_argument('load', type=str)
    parser.add_argument('--normalize', type=str)
    parser.add_argument('--env', type=str, default='SunblazeCartPole-v0')
    parser.add_argument('--seed', type=int, help='RNG seed, defaults to random')
    parser.add_argument('--outdir', type=str)
    parser.add_argument('--max-episode-len', type=int, default=10000)
    parser.add_argument('--eval-n-trials', type=int, default=100)
    parser.add_argument('--episodes-per-trial', type=int, default=1)
    parser.add_argument('--eval-n-parallel', type=int, default=1)
    parser.add_argument('--record', action='store_true')
    parser.add_argument('load', type=str, nargs='*')
    args = parser.parse_args()

    # Fixes problem of eval script being run with ".../checkpoints/*"
    if len(args.load) > 1:
        import natsort
        print("Detected multiple model file args, sorting and choosing last..")
        # Fixes issue of 'normalize' file inside checkpoint folder
        args.load = [f for f in args.load if 'normalize' not in f]
        args.load = natsort.natsorted(args.load, reverse=True)[0]
        print("Using {}".format(args.load))
    else:
        args.load = args.load[0]

    # ppo2 is trained on "total episodes" only but is evaluated via "trials"
    total_episodes = args.eval_n_trials * args.episodes_per_trial

    # Create output directory
    os.makedirs(args.outdir, exist_ok=True)

    # If seed is unspecified, generate a pseudorandom one
    if not args.seed:
        # "Seed must be between 0 and 2**32 - 1"
        seed = create_seed(args.seed, max_bytes=4)
    else:
        seed = args.seed
    # Log it for reference
    with open(os.path.join(args.outdir, 'seed.txt'), 'w') as fout:
        fout.write("%d\n" % seed)
    set_global_seeds(seed)

    output_lock = multiprocessing.Lock()

    def evaluator(process_idx):
        def make_env():
            env = base.make_env(args.env, process_idx)
            env.seed(seed + process_idx)
            if args.record:
                env = gym.wrappers.Monitor(env, args.outdir, video_callable=lambda _: True)
            return env

        env = DummyVecEnv([make_env])
        obs_space = env.observation_space
        act_space = env.action_space

        # TODO(cpacker): this should really be in the top-level dir
        norm_path = args.normalize if args.normalize else os.path.join(
            os.path.dirname(args.load), 'normalize')
        with open(norm_path, 'rb') as f:
            obs_norms = pickle.load(f)
        clipob = obs_norms['clipob']
        mean = obs_norms['mean']
        var = obs_norms['var']

        # Load model
        with U.make_session(num_cpu=1) as sess:
            if 'SpaceInvaders' in args.env or 'Breakout' in args.env:
                raise NotImplementedError
            else:
                # '.../checkpoint/XXXX' -> '.../make_model.pkl'
                pkl_path = os.path.join(
                    os.path.dirname(os.path.dirname(args.load)), 'make_model.pkl')
                # from: https://github.com/openai/baselines/issues/115
                print("Constructing model from " + pkl_path)
                with open(pkl_path, 'rb') as fh:
                    import cloudpickle
                    make_model = cloudpickle.load(fh)
                model = make_model()
                print("Loading saved model from " + args.load)
                model.load(args.load)

            # Unwrap DummyVecEnv to access mujoco.py object
            env_base = env.envs[0].unwrapped

            # Record a binary success measure if the env supports it
            if hasattr(env_base, 'is_success') and callable(getattr(env_base, 'is_success')):
                success_support = True
            else:
                print("Warning: env does not support binary success, ignoring.")
                success_support = False

            for _ in range(total_episodes):
                obs, state, done = env.reset(), model.initial_state, False
                episode_rew = 0
                success = False
                for _ in range(args.max_episode_len):
                    obs = np.clip((obs - mean) / np.sqrt(var), -clipob, clipob)  # normalize
                    action, value, state, _ = model.step(
                        obs, state, np.reshape(np.asarray([done]), (1,)))
                    obs, rew, done, _ = env.step(action)
                    episode_rew += rew
                    if success_support and env_base.is_success():
                        success = True
                    if done:
                        break
                with output_lock:
                    with open(os.path.join(args.outdir, 'evaluation.json'), 'a') as results_file:
                        results_file.write(json.dumps({
                            'reward': episode_rew,
                            'success': success if success_support else 'N/A',
                            'environment': env_base.parameters,
                            'model': args.load,
                        }, cls=NumpyEncoder))
                        results_file.write('\n')

    misc.async.run_async(args.eval_n_parallel, evaluator)
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--env', help='environment ID', default='SunblazeCartPole-v0')
    parser.add_argument('--seed', type=int, help='RNG seed, defaults to random')
    parser.add_argument('--output', type=str)
    # parser.add_argument('--episodes-per-trial', type=int, default=5)
    # parser.add_argument('--trials', type=int, default=10 ** 4)
    # The total number of episodes is now trials*episodes_per_trial
    parser.add_argument('--total-episodes', type=int, default=int(5e4))
    parser.add_argument('--policy', help='Policy architecture',
                        choices=['mlp', 'lstm'], default='mlp')
    parser.add_argument('--processes', default=1, help='int or "max" for all')
    parser.add_argument('--reward-scale', type=float, default=1.0)
    # Hyperparameters
    parser.add_argument('--lr', type=float, default=7e-4)
    parser.add_argument('--nsteps', type=int, default=5)
    parser.add_argument('--ent-coef', type=float, default=1e-2)
    args = parser.parse_args()
    # total_episodes = args.trials * args.episodes_per_trial

    # Configure logger
    if args.output:
        try:
            os.makedirs(args.output)
        except OSError:
            pass
        logger.reset()
        logger.configure(dir=args.output)

    # If seed is unspecified, generate a pseudorandom one
    if not args.seed:
        # "Seed must be between 0 and 2**32 - 1"
        seed = create_seed(args.seed, max_bytes=4)
    else:
        seed = args.seed
    # Log it for reference
    with open(os.path.join(args.output, 'seed.txt'), 'w') as fout:
        fout.write("%d\n" % seed)

    if args.processes == 'max':
        ncpu = multiprocessing.cpu_count()
        # from: https://github.com/openai/baselines/blob/1f8a03f3a62367526f20215188fb5ea4b9ec27e0/baselines/ppo2/run_atari.py#L15
        if sys.platform == 'darwin':
            ncpu //= 2
    else:
        try:
            ncpu = int(args.processes)
        except ValueError:
            raise argparse.ArgumentTypeError("Invalid number of processes")

    train(
        args.env,
        total_episodes=args.total_episodes,
        policy=args.policy,
        lr=args.lr,
        num_processes=ncpu,
        rew_scale=args.reward_scale,
        seed=seed,
        nsteps=args.nsteps,
        ent_coef=args.ent_coef,  # default 0.01 in baselines, 0.0001 in chainer A3C
    )
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--env', help='environment ID', default='SunblazeBreakout-v0')
    parser.add_argument('--seed', type=int, help='RNG seed, defaults to random')
    parser.add_argument('--output', type=str)
    parser.add_argument('--processes', default=1, help='int or "max" for all')
    # parser.add_argument('--num-timesteps', type=int, default=int(10e6))
    parser.add_argument('--total-episodes', type=int, default=int(5e4))
    parser.add_argument('--policy', help='Policy architecture',
                        choices=['mlp', 'lstm'], default='mlp')
    # Hyperparameters
    parser.add_argument('--lr', type=float, default=3e-4)
    parser.add_argument('--nsteps', type=int, default=2048)
    parser.add_argument('--nminibatches', type=int, default=32)
    args = parser.parse_args()

    # Configure logger
    if args.output:
        try:
            os.makedirs(args.output)
        except OSError:
            pass
        logger.reset()
        logger.configure(dir=args.output)

    # If seed is unspecified, generate a pseudorandom one
    if not args.seed:
        # "Seed must be between 0 and 2**32 - 1"
        seed = create_seed(args.seed, max_bytes=4)
    else:
        seed = args.seed
    # Log it for reference
    with open(os.path.join(args.output, 'seed.txt'), 'w') as fout:
        fout.write("%d\n" % seed)

    if args.processes == 'max':
        ncpu = multiprocessing.cpu_count()
        # from: https://github.com/openai/baselines/blob/1f8a03f3a62367526f20215188fb5ea4b9ec27e0/baselines/ppo2/run_atari.py#L15
        if sys.platform == 'darwin':
            ncpu //= 2
    else:
        try:
            ncpu = int(args.processes)
        except ValueError:
            raise argparse.ArgumentTypeError("Invalid number of processes")

    train(
        args.env,
        total_episodes=args.total_episodes,
        seed=seed,
        ncpu=ncpu,
        policy=args.policy,
        lr=args.lr,
        nsteps=args.nsteps,
        nminibatches=args.nminibatches,
    )
def __init__(self, game, num_players, mode_num=None, seed=None,
             obs_type='rgb_image', full_action_space=True, max_frames=100000):
    """Frameskip should be either a tuple (indicating a random range to
    choose from, with the top value excluded) or an int."""
    EzPickle.__init__(self, game, num_players, mode_num, seed, obs_type,
                      full_action_space, max_frames)
    assert obs_type in ('ram', 'rgb_image', 'grayscale_image'), \
        "obs_type must either be 'ram', 'rgb_image' or 'grayscale_image'"
    self.obs_type = obs_type
    self.full_action_space = full_action_space
    self.num_players = num_players
    self.max_frames = max_frames

    multi_agent_ale_py.ALEInterface.setLoggerMode("error")
    self.ale = multi_agent_ale_py.ALEInterface()

    if seed is None:
        seed = seeding.create_seed(seed, max_bytes=4)
    self.ale.setInt(b"random_seed", seed)
    self.ale.setFloat(b'repeat_action_probability', 0.)

    pathstart = os.path.dirname(multi_agent_ale_py.__file__)
    final_path = os.path.join(pathstart, "ROM", game, game + ".bin")
    if not os.path.exists(final_path):
        raise IOError(
            "rom {} is not installed. Please install roms using AutoROM tool "
            "(https://github.com/PettingZoo-Team/AutoROM)".format(game))
    self.ale.loadROM(final_path)

    all_modes = self.ale.getAvailableModes(num_players)
    if mode_num is None:
        mode = all_modes[0]
    else:
        mode = mode_num
        assert mode in all_modes, \
            "mode_num parameter is wrong. Mode {} selected, only {} modes are supported".format(
                mode_num, str(list(all_modes)))
    self.ale.setMode(mode)
    assert num_players == self.ale.numPlayersActive()

    if full_action_space:
        action_size = 18
        action_mapping = np.arange(action_size)
    else:
        action_mapping = self.ale.getMinimalActionSet()
        action_size = len(action_mapping)
    self.action_mapping = action_mapping

    if obs_type == 'ram':
        observation_space = gym.spaces.Box(low=0, high=255, dtype=np.uint8, shape=(128,))
    else:
        (screen_width, screen_height) = self.ale.getScreenDims()
        if obs_type == 'rgb_image':
            num_channels = 3
        elif obs_type == 'grayscale_image':
            num_channels = 1
        observation_space = spaces.Box(
            low=0, high=255,
            shape=(screen_height, screen_width, num_channels),
            dtype=np.uint8)

    self.num_agents = num_players
    player_names = ["first", "second", "third", "fourth"]
    self.agents = [f"{player_names[n]}_0" for n in range(self.num_agents)]

    self.action_spaces = [gym.spaces.Discrete(action_size)] * self.num_agents
    self.observation_spaces = [observation_space] * self.num_agents

    self._screen = None
def seed(self, seed=None): """Set the seed for the env's random number generator""" seed = seeding.create_seed(seed) self._random.seed(seed) return [seed] + self._rew.seed(self._random.randint(2**32))
def seed(self, seed=None):
    # max_bytes=4 because np.random.seed rejects values >= 2**32
    seed = seeding.create_seed(seed, max_bytes=4)
    np.random.seed(seed)
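The `max_bytes=4` added above matters because NumPy's legacy seeding rejects values outside [0, 2**32 - 1]; an 8-byte seed from `create_seed` almost always overflows it:

import numpy as np
from gym.utils import seeding  # pre-0.26 gym

small = seeding.create_seed(max_bytes=4)
np.random.seed(small)  # OK: guaranteed < 2**32

big = seeding.create_seed(max_bytes=8)  # up to 2**64 - 1
try:
    np.random.seed(big)
except ValueError as err:
    print("rejected:", err)  # raised whenever big >= 2**32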
def train_maml_like_ppo_(
    init_model,
    args,
    learning_rate,
    num_episodes=20,
    num_updates=1,
    vis=False,
    run_idx=0,
    use_linear_lr_decay=False,
):
    num_steps = num_episodes * 100

    torch.set_num_threads(1)
    device = torch.device("cpu")

    envs = make_vec_envs(ENV_NAME, seeding.create_seed(None), NUM_PROC,
                         args.gamma, None, device, allow_early_resets=True,
                         normalize=args.norm_vectors)
    raw_env = navigation_2d.unpeele_navigation_env(envs, 0)
    # raw_env.set_arguments(args.rm_nogo, args.reduce_goals, True, args.large_nogos)
    new_task = raw_env.sample_tasks(run_idx)
    raw_env.reset_task(new_task[0])

    # actor_critic = Policy(
    #     envs.observation_space.shape,
    #     envs.action_space,
    #     base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic = copy.deepcopy(init_model)
    actor_critic.to(device)

    agent = algo.PPO(
        actor_critic,
        args.clip_param,
        args.ppo_epoch,
        args.num_mini_batch,
        args.value_loss_coef,
        args.entropy_coef,
        lr=learning_rate,
        eps=args.eps,
        max_grad_norm=args.max_grad_norm)

    rollouts = RolloutStorage(num_steps, NUM_PROC,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    fitnesses = []
    for j in range(num_updates):
        # if args.use_linear_lr_decay:
        #     # decrease learning rate linearly
        #     utils.update_linear_schedule(
        #         agent.optimizer, j, num_updates,
        #         agent.optimizer.lr if args.algo == "acktr" else args.lr)

        min_c_rew = float("inf")
        vis = []
        offending = []
        for step in range(num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)
            if done[0]:
                c_rew = infos[0]["cummulative_reward"]
                vis.append((infos[0]['path'], infos[0]['goal']))
                offending.extend(infos[0]['offending'])
                if c_rew < min_c_rew:
                    min_c_rew = c_rew

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0] for info in infos])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.gae_lambda, args.use_proper_time_limits)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)
        rollouts.after_update()

        ob_rms = utils.get_vec_normalize(envs)
        if ob_rms is not None:
            ob_rms = ob_rms.ob_rms
        fits, info = evaluate(actor_critic, ob_rms, envs, NUM_PROC, device)
        print(f"fitness {fits} update {j+1}")
        if (j + 1) % 1 == 0:
            vis_path(vis, eval_path_rec=info['path'], offending=offending)
        fitnesses.append(fits)

    return fitnesses[-1], info[0]['reached'], None
def seed(self, seed=None):
    if seed is None:
        seed = seeding.create_seed(seed, max_bytes=4)
    self.ale.setInt(b"random_seed", seed)
    self.ale.loadROM(self.rom_path)
    self.ale.setMode(self.mode)
def seed(self, seed=None):
    seed = seeding.create_seed(seed)
    self.rng_key = jax.random.PRNGKey(seed)
    return seed
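JAX keys created this way are consumed functionally; the usual pattern is to `split` before every draw so no key is reused. A short sketch:

import jax

seed = 42  # in the method above, this comes from seeding.create_seed(seed)
key = jax.random.PRNGKey(seed)

key, subkey = jax.random.split(key)
x = jax.random.normal(subkey, shape=(3,))
key, subkey = jax.random.split(key)
y = jax.random.uniform(subkey, shape=(3,))
print(x, y)  # deterministic given the seed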
def seed(self, seed=None): """Set the seed for the random number generator""" seed = seeding.create_seed(seed) self._random.seed(seed) return [seed]
def __init__(self,
             colors: int = 5,
             ranks: int = 5,
             players: int = 2,
             hand_size: int = 2,
             max_information_tokens: int = 8,
             max_life_tokens: int = 3,
             observation_type: int = 1,
             seed=None,
             random_start_player: bool = False,
             ):
    """
    Parameter descriptions:
     - colors: int, Number of colors in [2,5].
     - ranks: int, Number of ranks in [2,5].
     - players: int, Number of players in [2,5].
     - hand_size: int, Hand size in [2,5].
     - max_information_tokens: int, Number of information tokens (>=0).
     - max_life_tokens: int, Number of life tokens (>=1).
     - observation_type: int.
         0: Minimal observation.
         1: First-order common knowledge observation.
     - seed: int, Random seed or None.
     - random_start_player: bool, Random start player.

    Common game configurations:
        Hanabi-Full (default): {
            "colors": 5, "ranks": 5, "players": 2, "hand_size": 2,
            "max_information_tokens": 8, "max_life_tokens": 3,
            "observation_type": 1 }
        Hanabi-Small: {
            "colors": 5, "ranks": 5, "players": 2,
            "max_information_tokens": ..., "max_life_tokens": ...,
            "observation_type": 1 }
        Hanabi-Very-Small: {
            "colors": 2, "ranks": 5, "players": 2,
            "max_information_tokens": ..., "max_life_tokens": ...,
            "observation_type": 1 }
    """
    EzPickle.__init__(
        self,
        colors,
        ranks,
        players,
        hand_size,
        max_information_tokens,
        max_life_tokens,
        observation_type,
        seed,
        random_start_player,
    )

    seed = seeding.create_seed(seed, max_bytes=3)

    # Import Hanabi and raise a helpful error if the PyPI package is not
    # installed correctly.
    try:
        from hanabi_learning_environment.rl_env import HanabiEnv, make
    except ModuleNotFoundError:
        raise ImportError(
            "Hanabi is not installed.\n"
            "Run `pip3 install hanabi_learning_environment` from within your project environment.\n"
            "Consult hanabi/README.md for detailed information."
        )
    else:
        # ToDo: Starts
        # Check if all possible dictionary values are within certain ranges.
        self._raise_error_if_config_values_out_of_range(colors,
                                                        ranks,
                                                        players,
                                                        hand_size,
                                                        max_information_tokens,
                                                        max_life_tokens,
                                                        observation_type,
                                                        random_start_player)

        self.hanabi_env: HanabiEnv = HanabiEnv(
            config={'colors': colors,
                    'ranks': ranks,
                    'players': players,
                    'hand_size': hand_size,
                    'max_information_tokens': max_information_tokens,
                    'max_life_tokens': max_life_tokens,
                    'observation_type': observation_type,
                    'random_start_player': random_start_player,
                    'seed': seed})

        # List of agent names
        self.agents = ["player_{}".format(i) for i in range(self.hanabi_env.players)]
        self.agent_selection: str

        # Sets hanabi game to clean state and updates all internal dictionaries
        self.reset(observe=False)

        # Set action_spaces and observation_spaces based on params in hanabi_env
        self.action_spaces = {name: spaces.Discrete(self.hanabi_env.num_moves())
                              for name in self.agents}
        self.observation_spaces = {
            player_name: spaces.Box(
                low=0,
                high=1,
                shape=(self.hanabi_env.vectorized_observation_shape()[0],),
                dtype=np.float32)
            for player_name in self.agents}
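The `make` helper imported above can also drive the underlying package directly. A sketch assuming `hanabi_learning_environment` is installed; the field names follow its documented observation dict:

from hanabi_learning_environment import rl_env

env = rl_env.make('Hanabi-Full', num_players=2)
obs = env.reset()
legal = obs['player_observations'][0]['legal_moves']
print(len(legal), "legal moves for the starting player")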
def create_seed(seed=None, max_bytes=8):
    return seeding.create_seed(seed, max_bytes=max_bytes)
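For reference, pre-0.26 gym's `create_seed` accepts None (OS entropy), an int (reduced mod 2**(8*max_bytes)), or a str (hashed deterministically), so the thin wrapper above can be exercised like this:

from gym.utils import seeding  # pre-0.26 gym

print(seeding.create_seed())        # fresh seed from OS entropy, up to 8 bytes
print(seeding.create_seed(42))      # small ints pass through unchanged: 42
print(seeding.create_seed("run-1", max_bytes=4))  # strings hash deterministically
assert seeding.create_seed(7, max_bytes=4) < 2**32  # max_bytes caps the range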
def seed(self, seed=None):
    if seed is None:
        self.env._seed = seeding.create_seed(seed, max_bytes=4)
    else:
        self.env._seed = seed
    self.env.full_restart()
def seed(self, seed=None):
    if seed is None:
        seed = seeding.create_seed(max_bytes=4)
    self.__nprandom = np.random.RandomState(seed)
    return [seed]
def seed(self, seed=None):
    if seed is None:
        seed = seeding.create_seed(seed, max_bytes=4)
    self.env.set_seed(seed)
                    default=100,
                    type=int,
                    help='Specify batch size')
parser.add_argument('--max_iters',
                    dest='max_iters',
                    default=2000,
                    type=int,
                    help='Maximum number of iterations')
parser.add_argument('--filepath',
                    dest='filepath',
                    default='experiments',
                    type=str,
                    help='Where to save the data')
parser.add_argument('--random_seed',
                    dest='random_seed',
                    default=seeding.create_seed(),
                    type=int,
                    help='Random seed')
parser.add_argument('--experiment_class',
                    dest='experiment_class',
                    default=list(AVAILABLE_EXPERIMENTS.keys())[0],
                    type=str,
                    help='type of experiment: ' + ', '.join(AVAILABLE_EXPERIMENTS.keys()))

args = parser.parse_args()
maybe_make_dir(args.filepath)
run(**vars(args))
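Note that `default=seeding.create_seed()` is evaluated once, when `add_argument` is declared, so every invocation of the script gets a fresh seed unless `--random_seed` is passed explicitly. A runnable demonstration with a stand-in `fresh_seed` (hypothetical):

import argparse
import random

def fresh_seed():
    """Stand-in for seeding.create_seed()."""
    return random.getrandbits(32)

parser = argparse.ArgumentParser()
# The default below is computed right here, at declaration time.
parser.add_argument('--random_seed', type=int, default=fresh_seed())

print(parser.parse_args([]).random_seed)                      # generated default
print(parser.parse_args(['--random_seed', '7']).random_seed)  # 7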