def __init__(self, value_fn, env_kwargs, depth=4):
    self.render_env = SokobanEnv(**env_kwargs)
    self.env_n_actions = self.render_env.action_space.n
    self.value_function = value_fn
    self.env = SokobanEnv(**env_kwargs)
    self.env.reset()
    self.depth = depth
    self.nodes = dict()
def __init__(self, model, env_kwargs):
    self.render_env = SokobanEnv(**env_kwargs)
    self.env_n_actions = self.render_env.action_space.n
    if isinstance(model, str):
        self.model = load_model(model)
    else:
        self.model = model
    self.env = SokobanEnv(**env_kwargs)
    self.env.reset()
    assert len(self.model.outputs) == 1
def _load_shard_vf(shard, data_files_prefix, env_kwargs,
                   filter_values_fn=None, transform_values_fn=None):
    boards = _load_shard(shard, data_files_prefix)
    render_env = SokobanEnv(**env_kwargs)
    data_x = []
    data_y = []
    vf = ValueLoader()
    for vf_for_root in boards:
        root = vf.load_vf_for_root(vf_for_root, compressed=True)
        vf_data = vf.dump_vf_for_root(root)
        for env_state, v in vf_data:
            if filter_values_fn and filter_values_fn(v):
                continue
            if transform_values_fn:
                v = transform_values_fn(v)
            render_env.restore_full_state(env_state)
            ob = render_env.render(mode=render_env.mode)
            data_x.append(ob)
            data_y.append(v)
    data_y = np.asarray(data_y)
    if len(data_y.shape) == 1:
        data_y = data_y.reshape((len(data_y), 1))
    return np.asarray(data_x), data_y, {}
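# Hypothetical usage sketch for _load_shard_vf; the shard index, prefix path,
# env_kwargs and both lambdas below are illustrative assumptions, not values
# taken from this codebase. filter_values_fn drops a sample when it returns
# True; transform_values_fn is applied to the values that are kept.
def _example_load_shard_vf():
    return _load_shard_vf(
        shard=0,
        data_files_prefix="/path/to/shards",
        env_kwargs=dict(dim_room=(10, 10), num_boxes=4, mode="one_hot"),
        filter_values_fn=lambda v: v == 0,          # skip zero-valued states
        transform_values_fn=lambda v: max(v, -20),  # clip very negative values
    )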
def __init__(self, model, env_kwargs):
    if isinstance(model, str):
        self.model = load_model(model)
    else:
        self.model = model
    self.env = SokobanEnv(**env_kwargs)
    self.env.reset()
def test_serialization(dim=(8, 8), num_boxes=1, mode='rgb_array', seed=None,
                       curriculum=300):
    if not seed:
        _, seed = seeding.np_random(None)
    env = SokobanEnv(dim_room=dim, max_steps=100, num_boxes=num_boxes,
                     mode=mode, curriculum=curriculum)
    env.seed(seed)
    env.reset()
    state = env.clone_full_state()
    obs = env.render(mode='rgb_array')
    value = np.float32(5.0)
    shapes = (state.shape, obs.shape, (1,))
    dtypes = (state.dtype, obs.dtype, np.float32)
    buf_size = env.max_steps * np.array([np.prod(x) for x in shapes])
    game = [(state, obs, value), (state, obs, value)]
    serial = serialize_game(game, dtypes, buf_size)
    dgame = deserialize_game(serial, buf_size, shapes, dtypes)
    return [[(i == j).all() for i, j in zip(a, b)]
            for a, b in zip(game, dgame)]
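# Hypothetical check sketch: test_serialization returns a nested list of
# per-field booleans (one inner list per game step), so a full round trip can
# be verified as below; the seed value is arbitrary.
def _example_check_serialization_roundtrip():
    results = test_serialization(seed=123)
    assert all(all(fields) for fields in results), "round-trip mismatch"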
def create_env(seed, dim_room=(13, 13), num_boxes=5):
    env = SokobanEnv(dim_room=dim_room, max_steps=100, num_boxes=num_boxes,
                     mode='rgb_array', max_distinct_rooms=10)
    env.seed(seed)
    return env
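# Minimal usage sketch for create_env, assuming SokobanEnv follows the
# standard gym step API; the seed and the action index 0 are arbitrary
# examples.
def _example_create_env():
    env = create_env(seed=0)
    ob = env.reset()
    ob, reward, done, info = env.step(0)
    return ob, reward, done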
def test_playing():
    env = PlayWrapper(
        InfoDisplayWrapper(
            RewardPrinter(
                SokobanEnv(num_boxes=1, game_mode="Magnetic",
                           penalty_pull_action=-0.3)),
            augment_observations=True,
            min_text_area_width=500))
    env.play()
def __init__(self, value_function, env_kwargs, nan_for_zero_value=True,
             copy_negative=True):
    self.value_function = value_function
    self.env = SokobanEnv(**env_kwargs)
    self.env.reset()
    self.nan_for_zero_value = nan_for_zero_value
    self.copy_negative_values = copy_negative
def __init__(self, env, value_fun):
    """
    Args:
        value_fun: callable (obs, states) -> value; it is called with `states`
            passed as a keyword argument.
    """
    super().__init__(env)
    self.render_env = SokobanEnv(**env.init_kwargs)
    self.value_fun = value_fun
def test_img():
    env = SokobanEnv(dim_room=(10, 10), max_steps=100, num_boxes=4,
                     mode='rgb_array', max_distinct_rooms=10)
    from PIL import Image
    for i in range(10):
        env.reset()
        img = env.render()
        Image.fromarray(img, "RGB").save("{}.png".format(i))
def test_one_hot_mode():
    dim_room = (10, 10)
    env = SokobanEnv(dim_room=dim_room, max_steps=100, num_boxes=2,
                     mode='one_hot', max_distinct_rooms=10)
    obs = env.reset()
    assert obs.shape == dim_room + (7,)
    assert obs.dtype == np.uint8
    print(obs.shape)
def test_room_to_binary_map_and_back():
    env = SokobanEnv()
    for _ in range(100):
        env.reset()
        flat_state = env.clone_full_state()
        (state, structure) = render_utils.get_room_state_and_structure(
            flat_state, env.dim_room)
        room = render_utils.make_standalone_state(state, structure)
        binary_map = render_utils.room_to_binary_map(room)
        converted_room = render_utils.binary_map_to_room(binary_map)
        assert (converted_room == room).all()
def test_rendering():
    env = InfoDisplayWrapper(
        RewardPrinter(SokobanEnv()),
        augment_observations=True,
        min_text_area_width=500)
    env.reset()
    env.step(0)
    obs = env.render()
    assert obs.shape == (80, 580, 3)
    env.render(mode='human')
    from time import sleep
    sleep(2)
def test_recover(dim=(13, 13), num_boxes=5, mode='rgb_array', seed=None):
    if not seed:
        _, seed = seeding.np_random(None)
    env = SokobanEnv(dim_room=dim, max_steps=100, num_boxes=num_boxes,
                     mode=mode, max_distinct_rooms=10)
    env.seed(seed)
    env.reset()
    obs = env.render()
    state = env.clone_full_state()
    print(state == env.recover_state(obs))
def test_seed(dim=(13, 13), num_boxes=5, mode='rgb_array', seed=None):
    from ctypes import c_uint
    if not seed:
        _, seed = seeding.np_random(None)
    env = SokobanEnv(dim_room=dim, max_steps=100, num_boxes=num_boxes,
                     mode='rgb_array')
    env.seed(seed)
    print("Seed: {}".format(np.uint32(c_uint(seed).value)))
    from PIL import Image
    env.reset()
    img = env.render()
    Image.fromarray(img, "RGB").resize((200, 200)).show()
def generate_next_frame_and_done_data(env_kwargs, seed, n_trajectories=100,
                                      trajectory_len=40, clone_done=100):
    num_boxes_range = next_frame_and_done_data_params()["num_boxes_range"]
    if num_boxes_range is None:
        print("num_boxes_range", num_boxes_range)
        num_boxes_range = [env_kwargs["num_boxes"]]
    env_kwargs = deepcopy(env_kwargs)
    np.random.seed(seed)
    env_kwargs["num_boxes"] = num_boxes_range[np.random.randint(
        len(num_boxes_range))]
    render_env = SokobanEnv(**env_kwargs)
    render_env.seed(seed)
    trajectories = list()  # [(observations, actions, done), ...]
    for i in range(n_trajectories):
        render_env.reset()
        state = render_env.clone_full_state()
        # Generate a random trajectory starting from this state.
        trajectories.append(
            random_trajectory(state, render_env, trajectory_len))
    # Parse trajectories into arrays.
    data_x = list()
    data_y_next_frame = list()
    data_y_if_done = list()
    for obs, actions, done in trajectories:
        data_x.extend([
            image_with_embedded_action(ob, action, render_env.action_space.n)
            for ob, action in zip(obs[:-1], actions)
        ])
        data_y_next_frame.extend([ob for ob in obs[1:]])
        data_y_if_done.extend([False] * (len(actions) - 1) + [done])
        if done and (clone_done > 1):
            # Oversample the terminal transition to balance the "done" class.
            data_x.extend([data_x[-1].copy() for _ in range(clone_done)])
            data_y_next_frame.extend(
                [data_y_next_frame[-1].copy() for _ in range(clone_done)])
            data_y_if_done.extend(
                [data_y_if_done[-1] for _ in range(clone_done)])
    data_x = np.array(data_x)
    data_y = {
        Target.NEXT_FRAME.value: np.array(data_y_next_frame),
        "if_done": np.array(data_y_if_done).reshape((-1, 1)).astype(int),
    }
    return data_x, data_y, {}
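# Hypothetical call sketch for generate_next_frame_and_done_data. It assumes
# that next_frame_and_done_data_params() has already been configured elsewhere
# and that env_kwargs matches the SokobanEnv constructor; the concrete values
# are arbitrary. Each data_x entry is an observation with the taken action
# embedded as extra channels; the targets are the next frame and a done flag.
def _example_next_frame_data():
    env_kwargs = dict(dim_room=(10, 10), num_boxes=4, mode="one_hot")
    data_x, data_y, _ = generate_next_frame_and_done_data(
        env_kwargs, seed=0, n_trajectories=2, trajectory_len=10)
    print(data_x.shape, data_y["if_done"].shape)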
def test_type_counts(dim_room=(13, 13), num_boxes=4):
    env = SokobanEnv(dim_room=dim_room, max_steps=100, num_boxes=num_boxes,
                     mode='one_hot')
    ob = env.reset()
    type_counter = collections.Counter(
        np.reshape(np.argmax(ob, axis=2), newshape=(-1,)))

    def assert_type_count(type_set, number):
        assert sum(type_counter[type] for type in type_set) == number

    assert_type_count(OneHotTypeSets.player, 1)
    assert_type_count(OneHotTypeSets.box, num_boxes)
    assert_type_count(OneHotTypeSets.target, num_boxes)
def _load_shard_best_action_ignore_finall(shard, data_files_prefix, env_kwargs):
    """Choose the best action for every state.

    If all actions are equally good, give a special target value (equal to
    env.action_space.n). For Sokoban this separates dead ends, for which there
    is no good action.
    """
    boards = _load_shard(shard, data_files_prefix)
    render_env = SokobanEnv(**env_kwargs)
    data_x = []
    data_y = []
    data_value = []
    vf = ValueLoader()
    policy = PolicyFromValue(vf, env_kwargs)
    assert policy.env_n_actions == render_env.action_space.n
    for vf_for_root in boards:
        root = vf.load_vf_for_root(vf_for_root, compressed=True)
        data = vf.dump_vf_for_root(root)
        for node_state, v in data:
            if v in [0, -float("inf")]:
                # TODO(kc): ValuePerfect does not produce some states which can
                # be obtained after solving the game. How to clean this up?
                continue
            render_env.restore_full_state(node_state)
            ob = render_env.render(mode=render_env.mode)
            data_x.append(ob)
            best_actions = policy.act(node_state, return_single_action=False)
            y = np.min(best_actions)  # lowest-index action among the best ones
            one_hot_y = np.zeros(shape=render_env.action_space.n, dtype=int)
            one_hot_y[y] = 1
            data_y.append(one_hot_y)
            data_value.append(v)
    return np.asarray(data_x), np.asarray(data_y), \
        dict(value=np.asarray(data_value))
def __init__(self, *, data_files_prefix, env, net, epochs, batch_size, lr,
             lr_decay=0.0, shards_to_use=None, validation_shards=1,
             save_every=None, output_dir, histogram_freq=None,
             validate_every_batch=5000, neptune_first_batch=10000,
             target="vf", loss=None, n_cores=None, sample_data=False,
             max_samples_per_board=1000, eval_games_to_play=10, **kwargs):
    if shards_to_use is None:
        self.number_of_shards = infer_number_of_shards(data_files_prefix)
    else:
        self.number_of_shards = shards_to_use
    self.validation_shards = validation_shards
    assert self.validation_shards < self.number_of_shards
    if self.number_of_shards == 1:
        print("WARNING: there is only one shard, so it is used for both "
              "training and validation.")
        self.training_shards = [0]
        self.validation_shards = [0]
    else:
        self.training_shards = list(
            range(self.number_of_shards - self.validation_shards))
        self.validation_shards = list(
            range(self.number_of_shards - self.validation_shards,
                  self.number_of_shards))
    self.data_files_prefix = data_files_prefix
    self.save_every = save_every
    self.checkpoint_dir = os.path.join(output_dir, "checkpoints",
                                       "epoch.{epoch:04d}.hdf5")
    # Create the checkpoints directory, not the checkpoint file path itself.
    os.makedirs(os.path.dirname(self.checkpoint_dir), exist_ok=True)
    self.exp_dir_path = output_dir
    self.histogram_freq = histogram_freq
    self.epochs = epochs
    self.env_kwargs = env
    self.render_env = SokobanEnv(**self.env_kwargs)
    self.render_mode = self.render_env.mode
    self.target = Target(target)
    del target
    print("self.target", self.target)
    self.loss = loss_for_target(self.target, loss)
    final_activation = final_network_activation(self.target)
    net_output_size = net_output_size_for_target(
        self.target, self.render_env.action_space.n,
        n_channels_from_mode(env.get("mode", "one_hot")))
    input_channels = n_channels_from_mode(env.get("mode", "one_hot"))
    if self.target in [Target.NEXT_FRAME, Target.NEXT_FRAME_AND_DONE]:
        input_channels += SokobanEnv(**env).action_space.n
    if self.target in [Target.DELTA_VALUE, Target.BEST_ACTION_FRAMESTACK]:
        input_channels *= 2
    self.metrics = [self.loss]
    if isinstance(self.loss, dict):
        # [0] is a dirty change of metrics for vf_and_type
        self.metrics = self.metrics[0]
    self.network = get_network(
        input_shape=tuple(list(env["dim_room"]) + [input_channels]),
        output_size=net_output_size,
        final_activation=final_activation,
        **net)
    self.network.compile(optimizer="adam", loss=self.loss,
                         metrics=self.metrics)
    self.learning_rate_lambda = lambda epoch: lr / (1 + lr_decay * epoch)
    self.batch_size = batch_size
    self.validate_every_batch = validate_every_batch
    self.neptune_first_batch = neptune_first_batch
    if n_cores is None:
        n_cores = count_cpu()
    self.n_cores = n_cores
    self.sample_data = sample_data
    self.max_samples_per_board = max_samples_per_board
    self.random_state = np.random.RandomState(0)
    self.eval_games_to_play = eval_games_to_play
def process_board_data(compressed_data, target, env_kwargs, sample_data,
                       max_sample_size, random_state):
    """
    Args:
        compressed_data: dictionary whose keys include "full_env_state",
            "perfect_value" and "perfect_q", each mapping to a compressed
            array.
    """
    render_env = SokobanEnv(**env_kwargs)
    keys = compressed_data.keys()
    assert_v2_keys(compressed_data)
    data = {key: decompress_np_array(compressed_data[key]) for key in keys}
    assert_env_and_state_match(env_kwargs, data["full_env_state"][0])

    filter_values_fn = lambda v, q: False
    stratified_sample_fn = lambda values, q: stratified_sample(
        values, q, max_sample_size, random_state)
    simple_sample_fn = lambda values, q: simple_sample(
        values, q, max_sample_size, random_state)
    if target == Target.VF:
        sample_fn = stratified_sample_fn
    elif target == Target.VF_SOLVABLE_ONLY:
        filter_values_fn = lambda v, q: not is_solvable_state(v, q)
        sample_fn = simple_sample_fn
    elif target == Target.STATE_TYPE:
        sample_fn = stratified_sample_fn
    elif target == Target.BEST_ACTION:
        filter_values_fn = lambda v, q: not is_solvable_state(v, q)
        sample_fn = simple_sample_fn
    elif target == Target.VF_AND_TYPE:
        sample_fn = stratified_sample_fn
    elif target == Target.NEXT_FRAME:
        sample_fn = stratified_sample_fn
    elif target == Target.DELTA_VALUE:
        sample_fn = stratified_sample_fn
    elif target == Target.VF_DISCOUNTED:
        sample_fn = stratified_sample_fn
    elif target == Target.BEST_ACTION_FRAMESTACK:
        filter_values_fn = lambda v, q: not is_solvable_state(v, q)
        sample_fn = simple_sample_fn
    elif target == Target.NEXT_FRAME_AND_DONE:
        sample_fn = stratified_sample_fn
    else:
        raise ValueError("Unknown target {}".format(target))

    mask = ~np.array([
        filter_values_fn(v, q)
        for v, q in zip(data['perfect_value'], data['perfect_q'])
    ], dtype=bool)
    data = {key: data[key][mask] for key in keys}
    if sample_data:
        sample_ix = sample_fn(data["perfect_value"], data["perfect_q"])
    else:
        raise NotImplementedError()

    if target == Target.DELTA_VALUE:
        data_x, data_y = extract_delta_value(data, sample_ix, render_env,
                                             random_state)
    elif target == Target.VF_DISCOUNTED:
        data_x, data_y = extract_discounted_value(
            sample_ix,
            states=data["full_env_state"],
            perfect_v=data["perfect_value"],
            perfect_q=data["perfect_q"],
            render_env=render_env,
        )
    elif target == Target.BEST_ACTION_FRAMESTACK:
        data_x, data_y = extract_best_action_from_framestack(
            sample_ix,
            states=data["full_env_state"],
            perfect_v=data["perfect_value"],
            perfect_q=data["perfect_q"],
            render_env=render_env,
        )
    else:
        data = {key: data[key][sample_ix] for key in keys}
        if target == Target.NEXT_FRAME:
            data_x, data_y = extract_next_frame_input_and_target(
                data["full_env_state"], render_env)
        else:
            obs = list()
            for node_state in data['full_env_state']:
                render_env.restore_full_state(node_state)
                ob = render_env.render(mode=render_env.mode)
                obs.append(ob)
            data_x = np.array(obs)
            data_y = extract_target_from_value(
                perfect_v=data["perfect_value"],
                perfect_q=data["perfect_q"],
                target=target)
    if isinstance(data_y, np.ndarray):
        assert len(data_y.shape) > 1, \
            "data_y should be batched (if target is scalar it should have " \
            "shape (num_samples, 1))"
    return data_x, data_y, {}