def init_policies(observation_space, action_space, base_kwargs, num_agents, base):
    def make_policy():
        # Single-agent tasks keep the raw action space; multi-agent tasks give
        # each agent one branch of the MultiDiscrete space.
        return Policy(observation_space.shape[1:],
                      action_space if num_agents == 1 else Discrete(action_space.nvec[0]),
                      base=get_base(base),
                      base_kwargs=base_kwargs)

    actor_critics = [make_policy() for _ in range(num_agents)]
    shared_cpu_actor_critics = [make_policy().share_memory() for _ in range(num_agents)]
    shared_cpu_actor_critics_env_actor = [make_policy().share_memory() for _ in range(num_agents)]

    pytorch_total_params = sum(p.numel() for p in actor_critics[0].parameters() if p.requires_grad)
    print('number of params ', pytorch_total_params)
    return actor_critics, shared_cpu_actor_critics, shared_cpu_actor_critics_env_actor
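# Hedged usage sketch (an assumption, not part of the source): the share_memory()
# copies let worker processes read weights that a learner process updates. Because
# load_state_dict() copies tensor data in place, syncing this way keeps the
# shared-memory storage intact:
def sync_shared_policies(actor_critics, shared_policies):
    # Push the learner weights into the shared-memory policies so that
    # environment-actor processes see the latest parameters.
    for learner, shared in zip(actor_critics, shared_policies):
        shared.load_state_dict(learner.state_dict())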
def __init__(self, args):
    self.num_envs = args.num_processes
    self.num_graphs = 500
    root = os.getcwd()
    if args.env_name == 'playground':
        args.game_config = Playground()
        args.render, args.game, args.fixed_map_mode = args.render, args.env_name, True
        self.envs = [Mazebase_high(args) for i in range(args.num_processes)]
        self.graph = Batch_SubtaskGraph(args)
        self.load_graph = False
    elif args.env_name == 'mining':
        args.game_config = Mining()
        args.render, args.game, args.fixed_map_mode = args.render, args.env_name, True
        self.envs = [Mazebase_high(args) for i in range(args.num_processes)]
        if args.mode == 'meta_train':
            self.graph = Batch_SubtaskGraph(args)
            self.load_graph = False
        else:
            # Evaluation graphs are loaded from disk; seeds below 1 fall back to 1.
            seed = max(args.seed, 1)
            args.graph_config = dict(
                folder=os.path.join(root, 'environment', 'data', 'task_graph_mining', 'new'),
                gamename='eval1_mining_' + str(seed))
            self.graph = SubtaskGraph(args)
            self.load_graph = True
            self.num_graphs = self.graph.num_graph
    # Spaces and flags shared by both environments.
    self.observation_space = Box(low=0, high=1, shape=self.envs[0].obs_shape, dtype=np.float32)
    self.feat_dim = args.game_config.feat_dim
    self.max_task = self.envs[0].n_actions
    self.state_space = Box(low=0, high=1, shape=self.envs[0].obs_shape, dtype=np.float32)
    self.action_space = Discrete(self.envs[0].n_actions)
    self.feed_time = True
    self.feed_prev_ard = True
def action_space(self):
    """See class definition."""
    # Flattened action index:
    #   Accelerate:      lane change left (0), lane change right (1), no lane change (2)
    #   Decelerate:      no lane change (3)
    #   Maintain speed:  lane change left (4), lane change right (5), no lane change (6)
    #   Emergency brake: no lane change (7)
    # Eight actions in total, so the space is Discrete(8).
    return Discrete(8)
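# Hedged companion helper (an assumption, not part of the source): decodes the
# flattened Discrete(8) index back into the (speed, lane_change) pair that the
# enumeration in action_space() describes.
FLAT_ACTION_TABLE = [
    ("accelerate", "left"), ("accelerate", "right"), ("accelerate", "none"),
    ("decelerate", "none"),
    ("maintain", "left"), ("maintain", "right"), ("maintain", "none"),
    ("emergency_brake", "none"),
]

def decode_flat_action(index):
    """Return (speed_action, lane_change) for a flattened action index."""
    return FLAT_ACTION_TABLE[index]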
def __init__(self):
    self.seed_num = None
    self.dealer = []
    self.player = []
    # Card values: ACE, 2-9, and 10 for each of 10, Jack, Queen, King.
    self.deck = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 10, 10])
    self.action_space = Discrete(N_ACTIONS)
    self.observation_space = Tuple((Discrete(11), Discrete(32), Discrete(2)))
    self.reward_range = (-1, 1)
    self.dealer_stop = DEALER_SICK_SUM
def __init__(self, env):
    super().__init__(env)
    conf = self.env.unwrapped.conf
    rows_num = conf["bricks_rows"]
    self.observation_space = Dict({
        "paddle_x": Discrete(81),
        "ball_x": Discrete(81),
        "ball_y": Discrete(106),
        "bricks_status_matrix": Box(low=0, high=1, shape=(rows_num, 18), dtype=np.uint8),
    })
def __init__(self, n, goal_length, num_distractor, distractor_length, max_steps=2**10):
    self.goal_length = goal_length
    self.num_distractor = num_distractor
    self.distractor_length = distractor_length
    self.n = n
    self.pairs = goal_length - 1 + distractor_length * num_distractor

    # Penalties and rewards
    self.step_cost = 1e-1
    self.reward_gem = 10
    self.reward_key = 0

    self.max_steps = max_steps
    self.action_space = Discrete(len(action_space))
    self.observation_space = Box(low=0, high=255, shape=(n, n, 3), dtype=np.uint8)

    self.owned_key = grid_color
    self.reset()
def make_atari(env_id, max_episode_steps=None):
    # An optional "|" suffix on the env id selects an action-space modification.
    splt = env_id.split("|")
    env_id = splt[0]
    env = gym.make(env_id)
    assert 'NoFrameskip' in env.spec.id
    env = NoopResetEnv(env, noop_max=30)
    env = MaxAndSkipEnv(env, skip=4)
    if max_episode_steps is not None:
        env = TimeLimit(env, max_episode_steps=max_episode_steps)
    if len(splt) > 1:
        if splt[1][:-1] == "extra_dangling":
            # Pad the action space with dangling actions that fold back onto
            # the 6 real ones, e.g. "extra_dangling2" yields Discrete(18).
            print("act: ", env.action_space)

            def new_step(self, action):
                action = action % 6  # map dangling actions onto real ones
                return self._step(action)

            env.action_space = Discrete(6 + 6 * int(splt[1][-1]))
            env._step = env.step
            env.step = new_step.__get__(env, type(env))
        elif splt[1] == "extra_duplicate":
            # The last action component duplicates the second-to-last one.
            def new_step(self, action):
                if action[-1] != 0.0:
                    action[-2] = 1.0  # duplicate, not dangling
                return self._step(action[:-1])  # drop the duplicate component

            env._step = env.step
            env.step = new_step.__get__(env, type(env))
    return env
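# Hedged usage sketch (the env id is illustrative, not from the source): the
# digit after "extra_dangling" sets how many extra blocks of 6 dangling
# actions are appended to the 6 real ones.
if __name__ == "__main__":
    demo_env = make_atari("PongNoFrameskip-v4|extra_dangling1")
    print(demo_env.action_space)  # Discrete(12): 6 real actions + 6 dangling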
def __init__(self):
    self.game = DoomGame()
    self.game.load_config("O:\\Doom\\a2c\\scenarios\\dodge\\dodge.cfg")
    self.game.set_doom_scenario_path("O:\\Doom\\a2c\\scenarios\\dodge\\dodge1.wad")
    # Alternative scenario (disabled): O:\Doom\scenarios\cig_flat.cfg with
    # cig_flat_small.wad on map03.
    self.game.add_game_args(
        "-host 1 -deathmatch +timelimit 1.0 "
        "+sv_forcerespawn 1 +sv_noautoaim 1 +sv_respawnprotect 1 +sv_spawnfarthest 1 +sv_nocrouch 1 "
        "+viz_respawn_delay 0")
    self.game.set_mode(Mode.PLAYER)
    self.game.set_labels_buffer_enabled(True)
    self.game.set_depth_buffer_enabled(True)
    self.game.set_screen_resolution(ScreenResolution.RES_320X240)
    self.action_space = Discrete(3)
    self.observation_space = Box(low=0, high=255, shape=(168, 168, 3), dtype=np.uint8)
    # Button combinations for the three discrete actions.
    self.available_actions = [[1, 0], [0, 1], [0, 0]]
    self.bots = 1
def __init__(self, max_slack=float(1e9), queue_size=int(1e6), max_wrongs=3,
             past_steps=10, seed=None, target_queue_type="LSTF"):
    # Fixed variables.
    self.max_slack = max_slack
    self.past_steps = past_steps
    self.max_wrongs = max_wrongs
    self.seed = seed
    self.queue_size = queue_size
    self.target_queue_type = target_queue_type
    self.init_queue = list()
    init_slack_rng, _ = seeding.np_random(self.seed)
    self.observation_space = Space([queue_size], np.dtype(int))
    self.action_space = Discrete(queue_size)
    # Fill the initial queue with random slack values in [0, max_slack).
    for i in range(self.queue_size):
        self.init_queue.append(init_slack_rng.randint(self.max_slack))

    # Variables that can be reset.
    self.slack_rng, _ = seeding.np_random(self.seed)
    self.wrong_deques = [0] * self.past_steps
    self.wrong_deque_idx = 0
    self.queue = list(self.init_queue)
def __init__(self, env_config):
    game = Catcher(width=screen_wh, height=screen_wh)
    fps = 30  # fps we want to run at
    frame_skip = 2
    num_steps = 2
    force_fps = False  # False for slower speed
    display_screen = True

    # Make a PLE instance.
    self.env = PLE(game, fps=fps, frame_skip=frame_skip, num_steps=num_steps,
                   force_fps=force_fps, display_screen=display_screen)
    self.env.init()
    # Maps Discrete actions to PLE key codes: no-op, 'a' (97), 'd' (100).
    self.action_dict = {0: None, 1: 97, 2: 100}
    # PLE env starts with a black screen; take one no-op to get a real frame.
    self.env.act(self.env.NOOP)
    self.action_space = Discrete(3)
    self.k = 4  # number of stacked frames
    self.observation_space = spaces.Box(low=0, high=255,
                                        shape=(screen_wh, screen_wh, 1 * self.k))
    self.frames = deque([], maxlen=self.k)
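# Hedged sketch (assumed, not from the source): with self.frames as a k-deep
# deque, the observation matching the (screen_wh, screen_wh, 1 * k) Box is
# typically built by concatenating the stacked frames along the channel axis:
def _stacked_obs(self):
    assert len(self.frames) == self.k, "stack must be full before observing"
    return np.concatenate(list(self.frames), axis=2)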
def __init__(self, env):
    super(OTWrapper, self).__init__(env)
    self.observation_space = Box(low=0, high=255, dtype=np.uint8, shape=(84, 84, 1))
    self.action_space = Discrete(len(HUMAN_ACTIONS))
def __init__(self, env, args):
    super().__init__(
        env,
        args,
        # Default values for this algorithm.
        default_learning_rate=7e-4,
        default_discount_factor=0.99,
        default_num_updates=10**7,
    )
    # Defaults currently unavailable from the command line; could become flags.
    self.processes = 16
    self.frames_per_process = 128
    self.lam = 0.95
    self.entropy_coef = 0.01
    self.value_loss_coef = 0.5
    self.max_grad_norm = 0.5
    self.optim_eps = 1e-5
    self.clip_eps = 0.2
    self.epochs = 4
    self.batch_size = 256
    self.seed = 666 if not self.seed else self.seed
    self.frames_per_update = self.frames_per_process * self.processes
    self.update_shape = (self.frames_per_process, self.processes)
    seed_all(self.seed)
    obs_space = {"image": self.env.observation_space.spaces["image"].shape}
    self.model = {"acmodel": ACModel(obs_space, Discrete(3)).to(device)}
def __init__(self, n, goal_length, num_distractor, distractor_length,
             viewport_size=5, max_steps=300, world=None, silence=False):
    self.goal_length = goal_length
    self.num_distractor = num_distractor
    self.distractor_length = distractor_length
    self.viewport_size = viewport_size
    self.n = n
    self.num_pairs = goal_length - 1 + distractor_length * num_distractor

    # Penalties and rewards
    self.step_cost = 0.1
    self.reward_gem = 10
    self.reward_key = 0

    # Other settings
    self.viewer = None
    self.max_steps = max_steps
    self.action_space = Discrete(len(ACTION_LOOKUP))
    self.observation_space = Box(low=0, high=255, shape=(n, n, 3), dtype=np.uint8)
    self.silence = silence

    # Game initialization
    self.owned_key = np.array(grid_color, dtype=np.float64)
    self.np_random_seed = None
    self.world = None
    self.reset(world)

    # Cache the axes background so later redraws can blit only the image.
    self.fig = plt.figure()
    self.ax = self.fig.add_subplot(1, 1, 1)
    self.img = self.ax.imshow(self.world, vmin=0, vmax=255, interpolation='none')
    self.fig.canvas.draw()
    self.axbackground = self.fig.canvas.copy_from_bbox(self.ax.bbox)
    plt.show(block=False)
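# Hedged rendering sketch (an assumption, not from the source): the cached
# axbackground above enables matplotlib blitting, where a redraw restores the
# static background and repaints only the image artist:
def render_fast(self):
    self.img.set_data(self.world)                      # update pixel data only
    self.fig.canvas.restore_region(self.axbackground)  # repaint cached background
    self.ax.draw_artist(self.img)                      # redraw just the image
    self.fig.canvas.blit(self.ax.bbox)                 # push the region to screen
    self.fig.canvas.flush_events()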
def __init__(self, sc_env, dim, id=np.random.randint(1000), verbose_freq=1,
             agg_n_episodes=100, reselect_army_freq=5):
    """
    :param sc_env: SC2Env instance
    :param dim: screen dimension of sc2_env
    :param id: "name" for this environment; note the random default is drawn
        once at import time, not per instance
    :param verbose_freq: print results every n episodes; 0 disables printing
    :param agg_n_episodes: aggregate results over this many recent episodes
    :param reselect_army_freq: reselect the army every n timesteps; needed when
        new units arrive (as in DefeatRoaches), but reselecting too often can
        hurt scores slightly because each reselect wastes a timestep
    """
    self.sc2_env = sc_env
    self.dim = dim
    self.verbose_freq = verbose_freq
    # self.action_space = Discrete(dim ** 2)
    self.action_space = Discrete(2)
    self.observation_space = Box(
        low=0,
        high=SCREEN_FEATURES.player_relative.scale,
        shape=[dim, dim, 1]
    )
    self.rolling_episode_score = np.zeros(agg_n_episodes, dtype=np.float32)
    self.agg_n_episodes = agg_n_episodes
    self.id = id
    self.attack_move_action_id = [
        k for k in actions.FUNCTIONS if k.name == 'Attack_screen'
    ][0].id
    self.reselect_army_freq = reselect_army_freq
    self.step_counter = 0
    self.episode_counter = 0
def __init__(self, game):
    self.action_space = Discrete(3)
    self.observation_space = Box(low=0, high=255, shape=(84, 84, 3), dtype=np.uint8)
    self._game = game
def __init__(self, _iter):
    self.reset()
    self.observation_space = np.array([0, 0])
    self.iter = _iter
    self.action_space = Discrete(4)
    # Grab a figure number for later plotting, then close the blank figure.
    self.fig_num = plt.figure().number
    plt.close()
def action_space(self):
    """Identify the dimensions and bounds of the action space.

    Actions are characterized by a dict that maps agent choices to traffic
    light actions. For a single light, 0 and 1 map to "don't switch" and
    "switch" respectively: {0: (0,), 1: (1,)}. For multiple lights, each
    value is a tuple with one entry per light; with 3 lights the mapping is
    {0: (0, 0, 0), 1: (0, 0, 1), 2: (0, 1, 0), ...}.

    Returns
    -------
    gym.spaces.Discrete
        Shape and bounds of the characterized action space.
    """
    # Get all combinations of actions: [(0, 0, 0), (0, 0, 1), ...].
    lst = list(itertools.product([0, 1], repeat=self.num_traffic_lights))
    # Create the dict mapping action indices to combinations.
    for i in np.arange(len(lst)):
        self.action_dict.update({i: lst[i]})
    return Discrete(len(lst))
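# Hedged illustration (standalone, not from the source): the Discrete index
# returned above doubles as a binary encoding of per-light switch decisions,
# because itertools.product([0, 1], repeat=n) enumerates tuples in
# lexicographic order.
import itertools

combos = list(itertools.product([0, 1], repeat=3))
assert combos[0] == (0, 0, 0)
assert combos[5] == (1, 0, 1)  # index 5 switches the first and third lights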
def __init__(self, env, mode="no_act"):
    super().__init__(env)
    self.env = env
    self.action_space = Discrete(NUM_ACTS)
    self.facing = 1  # 1 if facing right, 0 if facing left
    self.observation_space = Box(0, 1, shape=(NUM_OBS,))
    self.mode = mode
def __init__(self, dim_room=(10, 10), max_steps=120, num_boxes=4,
             num_gen_steps=None, reset=True):
    # General configuration
    self.dim_room = dim_room
    if num_gen_steps is None:
        self.num_gen_steps = int(1.7 * (dim_room[0] + dim_room[1]))
    else:
        self.num_gen_steps = num_gen_steps
    self.num_boxes = num_boxes
    self.boxes_on_target = 0

    # Penalties and rewards
    self.penalty_for_step = -0.1
    self.penalty_box_off_target = -1
    self.reward_box_on_target = 1
    self.reward_finished = 10
    self.reward_last = 0

    # Other settings
    self.viewer = None
    self.max_steps = max_steps
    self.action_space = Discrete(len(ACTION_LOOKUP))
    screen_height, screen_width = (dim_room[0] * 16, dim_room[1] * 16)
    self.observation_space = Box(low=0, high=255,
                                 shape=(screen_height, screen_width, 3),
                                 dtype=np.uint8)

    if reset:
        # Initialize the room.
        _ = self.reset()
def __init__(self, size, sleep=0, dict_state=False, ma_rew=0):
    self.size = size
    self.sleep = sleep
    self.dict_state = dict_state
    self.ma_rew = ma_rew
    self.action_space = Discrete(2)
    self.reset()
def __init__(self, n, goal_length, num_distractor, distractor_length,
             max_steps=300, world=None):
    self.goal_length = goal_length
    self.num_distractor = num_distractor
    self.distractor_length = distractor_length
    self.n = n
    self.num_pairs = goal_length - 1 + distractor_length * num_distractor

    # Penalties and rewards
    self.step_cost = 0.1
    self.reward_gem = 10
    self.reward_key = 0

    # Other settings
    self.viewer = None
    self.max_steps = max_steps
    self.action_space = Discrete(len(ACTION_LOOKUP))
    self.observation_space = Box(low=0, high=255, shape=(n, n, 3), dtype=np.uint8)

    # Game initialization
    self.owned_key = [220, 220, 220]
    self.np_random_seed = None
    self.reset(world)
def test_model_action(self):
    np.random.seed(1)
    model = MagicMock()
    target = MagicMock()
    model.predict = MagicMock(side_effect=[np.array([[0.23, 0.75, 0.11, 0.007]])])
    normalizer = Mock()
    normalizer.normalize_state.return_value = np.ones(shape=(84, 84))
    agent = DeepQAgent(action_space=Discrete(4), normalizer=normalizer,
                       experience_size=100, model_network=model,
                       target_network=target, epsilon=0)
    agent.episode_step = 3
    agent.step_counter = 3
    state = np.random.randint(256, size=(210, 16, 3))
    # Index 1 holds the highest predicted Q-value (0.75), so a greedy agent
    # (epsilon=0) must pick it.
    self.assertEqual(agent.act(state), 1, "Should make action according to the model")
    self.assertEqual(agent.episode_step, 4, "Step 4")
    self.assertEqual(agent.step_counter, 4, "Step 4")
    self.assertTrue(np.array_equal(agent.frame[:, :, 3], np.ones(shape=(84, 84))))
    normalizer.normalize_state.assert_called_once_with(state)
    frame = np.zeros((84, 84, 4))
    frame[:, :, 3] = np.ones(shape=(84, 84))
    self.assertTrue(np.array_equal(frame, model.predict.call_args_list[0][0][0][0]))
    self.assertTrue(np.array_equal(frame, agent.frame))
    self.assertEqual(agent.last_action, 1)
def __init__(self, n, goal_length, num_distractor, distractor_length,
             max_steps=10**6, collect_key=True, world=None):
    self.goal_length = goal_length
    self.num_distractor = num_distractor
    self.distractor_length = distractor_length
    self.n = n
    self.num_pairs = goal_length - 1 + distractor_length * num_distractor
    # If True, keys are collected immediately when available.
    self.collect_key = collect_key

    # Penalties and rewards
    self.step_cost = 0
    self.reward_gem = 10
    self.reward_key = 1
    self.reward_distractor = -1

    # Other settings
    self.viewer = None
    self.max_steps = max_steps
    self.action_space = Discrete(len(ACTION_LOOKUP))
    # n + 2 presumably accounts for a one-cell border around the n x n grid.
    self.observation_space = Box(low=0, high=255, shape=(n + 2, n + 2, 3), dtype=np.uint8)

    # Game initialization
    self.owned_key = [220, 220, 220]
    self.np_random_seed = None
    self.reset(world)
    self.num_env_steps = 0
    self.episode_reward = 0
    self.last_frames = deque(maxlen=3)
def __init__(self):
    self.game = DoomGame()
    self.game.load_config("O:\\Doom\\scenarios\\cig_flat.cfg")
    self.game.set_doom_scenario_path("O:\\Doom\\scenarios\\cig_flat_small.wad")
    self.game.add_game_args(
        "-host 1 -deathmatch +timelimit 1.0 "
        "+sv_forcerespawn 1 +sv_noautoaim 1 +sv_respawnprotect 1 +sv_spawnfarthest 1 +sv_nocrouch 1 "
        "+viz_respawn_delay 1")
    self.game.add_game_args("+name AI +colorset 0")
    self.game.set_doom_map("map02")
    self.game.add_available_game_variable(GameVariable.POSITION_X)
    self.game.add_available_game_variable(GameVariable.POSITION_Y)
    self.game.add_available_game_variable(GameVariable.SELECTED_WEAPON_AMMO)
    self.game.add_available_game_variable(GameVariable.HEALTH)
    self.game.add_available_game_variable(GameVariable.ARMOR)
    # self.game.set_labels_buffer_enabled(True)
    self.game.set_depth_buffer_enabled(True)
    self.game.set_mode(Mode.PLAYER)
    self.game.init()
    self.action_space = Discrete(3)
    self.observation_space = Box(low=0, high=255, shape=(84, 84, 3), dtype=np.uint8)
    self._reset_path_history()
def __init__(self):
    self.observation_space = Box(np.zeros(1), np.ones(1), dtype=np.float64)
    self.action_space = Discrete(2)
    self.num_envs = 1
    self.cnt = 0
    self.length = 50
def action_space(self):
    if self.discrete:
        return Discrete(2**self.num_traffic_lights)
    else:
        return Box(low=0, high=1, shape=(self.num_traffic_lights,), dtype=np.float32)
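# Hedged helper (an assumption, not in the source; the bit order in particular
# is a guess): each Discrete(2**n) index packs one on/off bit per traffic
# light, so per-light decisions can be recovered with shifts.
def decode_light_bits(index, num_traffic_lights):
    """Return a list of 0/1 switch decisions, one per traffic light."""
    return [(index >> i) & 1 for i in range(num_traffic_lights)]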
def action_space(self):
    """See class definition."""
    if self.env_params.additional_params['communicate']:
        accel = Box(low=-3.0, high=3.0, shape=(1,), dtype=np.float32)
        communicate = Discrete(2)
        return Tuple((accel, communicate))
    else:
        return Box(low=-3.0, high=3.0, shape=(1,), dtype=np.float32)
def __init__(self, dim_room=(10, 10), max_steps=120, num_boxes=4,
             num_gen_steps=None, seed=0, reset=True, fixed_env=False,
             randomized_init_position=True):
    self.seed(seed)

    # General configuration
    self.dim_room = dim_room
    if num_gen_steps is None:
        self.num_gen_steps = int(1.7 * (dim_room[0] + dim_room[1]))
    else:
        self.num_gen_steps = num_gen_steps
    self.num_boxes = num_boxes
    self.boxes_on_target = 0
    self.num_env_steps = 0

    # Penalties and rewards
    self.penalty_for_step = -0.1
    self.penalty_box_off_target = -1
    self.reward_box_on_target = 1
    self.reward_finished = 10
    self.reward_last = 0

    # Other settings
    self.viewer = None
    self.max_steps = max_steps
    self.action_space = Discrete(len(ACTION_LOOKUP))
    screen_height, screen_width = (dim_room[0] * 16, dim_room[1] * 16)
    self.observation_space = Box(low=0, high=255,
                                 shape=(screen_height, screen_width, 3),
                                 dtype=np.uint8)
    self.epsilon = 0.8  # 1 would make the environment deterministic
    self.fixed_env = fixed_env
    self.randomized_init_position = randomized_init_position

    if fixed_env:
        try:
            s = load_obj('state_sokoban')
            self.init_state(s)
        except Exception:
            print("no saved initial state, creating one...")
            s = self.get_state()
            save_obj(s, 'state_sokoban')

    if reset and not fixed_env:
        # Initialize the room.
        _ = self.reset()
def action_space(self):
    """See class definition."""
    if self.discrete:
        return Discrete(2**self.num_traffic_lights)
    else:
        return Box(low=-1, high=1, shape=(self.num_traffic_lights,), dtype=np.float32)
def __init__(self):
    # TODO: load the pickled model, define the observation space, and define
    # the reward function; only the action space is concrete so far.
    self.state = None
    self.action_space = Discrete(n=100)
    self.observation_space = None
    self.step_count = 0