def _load_shard_vf(shard,
                   data_files_prefix,
                   env_kwargs,
                   filter_values_fn=None,
                   transform_values_fn=None):
    boards = _load_shard(shard, data_files_prefix)
    render_env = SokobanEnv(**env_kwargs)
    data_x = []
    data_y = []
    vf = ValueLoader()
    for vf_for_root in boards:
        root = vf.load_vf_for_root(vf_for_root, compressed=True)
        for env_state, v in vf.dump_vf_for_root(root):
            if filter_values_fn and filter_values_fn(v):
                continue
            if transform_values_fn:
                v = transform_values_fn(v)
            render_env.restore_full_state(env_state)
            ob = render_env.render(mode=render_env.mode)
            data_x.append(ob)
            data_y.append(v)
    data_y = np.asarray(data_y)
    if len(data_y.shape) == 1:
        data_y = data_y.reshape((len(data_y), 1))
    return np.asarray(data_x), data_y, {}
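# Illustrative usage sketch for _load_shard_vf. The shard index, file prefix,
# environment kwargs and value transform below are hypothetical placeholders,
# not values taken from the repository.
def _example_load_value_shard():
    env_kwargs = dict(dim_room=(10, 10), max_steps=100, num_boxes=2,
                      mode='one_hot', max_distinct_rooms=10)
    data_x, data_y, info = _load_shard_vf(
        shard=0,
        data_files_prefix="/tmp/sokoban_shards/shard",  # hypothetical path
        env_kwargs=env_kwargs,
        filter_values_fn=lambda v: v == -float("inf"),  # skip dead-end values
        transform_values_fn=lambda v: np.clip(v, -20.0, 0.0))  # hypothetical transform
    assert data_x.shape[0] == data_y.shape[0]
    return data_x, data_y, info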
class ChildrenValuePrinter(HumanPrintWrapper):
    def __init__(self, env, value_fun):
        """

    Args:
      value_fun: callable: obs, states -> value, which would be call by key
        `states`
    """
        super().__init__(env)
        self.render_env = SokobanEnv(**env.init_kwargs)
        self.value_fun = value_fun

    def formatted_state_value(self, state):
        return "{:.2f}".format(self.value_fun(states=state)[0][0])

    def build_texts(self, obs, reward, done, info):
        child_values = list()
        state = self.env.clone_full_state()
        value_str = self.formatted_state_value(state)
        for action in range(self.render_env.action_space.n):
            self.render_env.restore_full_state(state)
            self.render_env.step(action)
            child_state = self.render_env.clone_full_state()
            child_value_str = self.formatted_state_value(child_state)
            child_values.append(child_value_str)
        print('Children values: {}'.format(" ".join(child_values)))
        return [
            'Value: {}'.format(value_str),
            'Children values: {}'.format(" ".join(child_values))
        ]
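# Minimal sketch of wiring ChildrenValuePrinter around a SokobanEnv. The constant
# value function is a stand-in for a real network-based one; it only has to accept
# the keyword argument `states` and return something indexable as [0][0]. Assumes
# the wrapped env exposes `init_kwargs`, as the wrapper's constructor expects.
def _example_children_value_printer():
    env = SokobanEnv(dim_room=(8, 8), num_boxes=1, mode='rgb_array')
    dummy_value_fun = lambda states: np.zeros((1, 1))  # hypothetical value function
    wrapped = ChildrenValuePrinter(env, dummy_value_fun)
    env.reset()
    state = env.clone_full_state()
    return wrapped.formatted_state_value(state)  # "0.00" with the dummy value function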
def create_env(seed, dim_room=(13, 13), num_boxes=5):
    env = SokobanEnv(dim_room=dim_room,
                     max_steps=100,
                     num_boxes=num_boxes,
                     mode='rgb_array',
                     max_distinct_rooms=10)
    env.seed(seed)
    return env
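# Minimal usage sketch for create_env; the seed value is arbitrary.
def _example_create_env():
    env = create_env(seed=123)
    env.reset()
    img = env.render()  # 'rgb_array' mode, so render returns an image array
    return img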
def test_img():
    env = SokobanEnv(dim_room=(10, 10),
                     max_steps=100,
                     num_boxes=4,
                     mode='rgb_array',
                     max_distinct_rooms=10)
    from PIL import Image
    for i in range(10):
        env.reset()
        img = env.render()
        Image.fromarray(img, "RGB").save("{}.png".format(i))
def test_one_hot_mode():
    dim_room = (10, 10)
    env = SokobanEnv(dim_room=dim_room,
                     max_steps=100,
                     num_boxes=2,
                     mode='one_hot',
                     max_distinct_rooms=10)
    obs = env.reset()
    assert obs.shape == dim_room + (7, )
    assert obs.dtype == np.uint8
    print(obs.shape)
def test_room_to_binary_map_and_back():
    env = SokobanEnv()
    for _ in range(100):
        env.reset()
        flat_state = env.clone_full_state()
        (state, structure) = render_utils.get_room_state_and_structure(
            flat_state, env.dim_room)
        room = render_utils.make_standalone_state(state, structure)
        binary_map = render_utils.room_to_binary_map(room)
        converted_room = render_utils.binary_map_to_room(binary_map)
        assert (converted_room == room).all()
def test_serialization(dim=(8, 8),
                       num_boxes=1,
                       mode='rgb_array',
                       seed=None,
                       curriculum=300):
    if not seed:
        _, seed = seeding.np_random(None)
    env = SokobanEnv(dim_room=dim,
                     max_steps=100,
                     num_boxes=num_boxes,
                     mode=mode,
                     curriculum=curriculum)
    env.seed(seed)
    env.reset()

    state = env.clone_full_state()
    obs = env.render(mode='rgb_array')
    value = np.float32(5.0)

    shapes = (state.shape, obs.shape, (1, ))
    dtypes = (state.dtype, obs.dtype, np.float32)
    buf_size = env.max_steps * np.array([np.prod(x) for x in shapes])

    game = [(state, obs, value), (state, obs, value)]
    serial = serialize_game(game, dtypes, buf_size)
    # Sanity check: the serialized game is a raw byte buffer.
    np.frombuffer(serial, dtype=np.uint8)

    dgame = deserialize_game(serial, buf_size, shapes, dtypes)

    return [[(i == j).all() for i, j in zip(a, b)]
            for a, b in zip(game, dgame)]
def test_type_counts(dim_room=(13, 13), num_boxes=4):
    env = SokobanEnv(dim_room=dim_room,
                     max_steps=100,
                     num_boxes=num_boxes,
                     mode='one_hot')
    ob = env.reset()
    type_counter = collections.Counter(
        np.reshape(np.argmax(ob, axis=2), newshape=(-1, )))

    def assert_type_count(type_set, number):
        assert sum(type_counter[type] for type in type_set) == number

    assert_type_count(OneHotTypeSets.player, 1)
    assert_type_count(OneHotTypeSets.box, num_boxes)
    assert_type_count(OneHotTypeSets.target, num_boxes)
class ValueFromKerasNet(Value, ABC):
    def __init__(self, model, env_kwargs):
        if isinstance(model, str):
            self.model = load_model(model)
        else:
            self.model = model
        self.env = SokobanEnv(**env_kwargs)
        self.env.reset()

    def _network_prediction(self, state):
        self.env.restore_full_state(state)
        obs = self.env.render()
        return self.model.predict(np.expand_dims(obs, axis=0))

    def __call__(self, state):
        raise NotImplementedError
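# Sketch of a concrete subclass of ValueFromKerasNet. The class name and the
# choice to return the raw scalar prediction are illustrative, not from the repo;
# it assumes the value network has a single scalar output per observation.
class _ExampleScalarValue(ValueFromKerasNet):
    def __call__(self, state):
        # _network_prediction returns a batch of size one; unpack the scalar.
        return float(self._network_prediction(state)[0][0])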
def test_recover(dim=(13, 13), num_boxes=5, mode='rgb_array', seed=None):
    if not seed:
        _, seed = seeding.np_random(None)
    env = SokobanEnv(dim_room=dim,
                     max_steps=100,
                     num_boxes=num_boxes,
                     mode=mode,
                     max_distinct_rooms=10)
    env.seed(seed)
    env.reset()
    obs = env.render()
    state = env.clone_full_state()
    print(state == env.recover_state(obs))
class PolicyFromNet(Policy):
    def __init__(self, model, env_kwargs):
        self.render_env = SokobanEnv(**env_kwargs)
        self.env_n_actions = self.render_env.action_space.n
        if isinstance(model, str):
            self.model = load_model(model)
        else:
            self.model = model
        self.env = SokobanEnv(**env_kwargs)
        self.env.reset()
        assert len(self.model.outputs) == 1

    def best_actions(self, state):
        self.env.restore_full_state(state)
        ob = self.env.render()
        policy = self.model.predict(np.expand_dims(ob, axis=0))[0]
        best_actions = [np.argmax(policy)]
        return best_actions
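# Hypothetical usage of PolicyFromNet: load a policy checkpoint and query the
# greedy action for the current state. The checkpoint path and env kwargs are
# placeholders.
def _example_policy_from_net():
    env_kwargs = dict(dim_room=(8, 8), num_boxes=1, mode='one_hot')
    policy = PolicyFromNet("policy_net.hdf5", env_kwargs)  # hypothetical checkpoint
    probe_env = SokobanEnv(**env_kwargs)
    probe_env.reset()
    return policy.best_actions(probe_env.clone_full_state())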
def test_playing():
    env = PlayWrapper(
        InfoDisplayWrapper(RewardPrinter(
            SokobanEnv(num_boxes=1,
                       game_mode="Magnetic",
                       penalty_pull_action=-0.3)),
                           augment_observations=True,
                           min_text_area_width=500))
    env.play()
def test_rendering():
    env = InfoDisplayWrapper(RewardPrinter(SokobanEnv()),
                             augment_observations=True,
                             min_text_area_width=500)
    env.reset()
    env.step(0)
    obs = env.render()
    assert obs.shape == (80, 580, 3)

    env.render(mode='human')
    from time import sleep
    sleep(2)
def render_state(state, tiny=False):
    # To avoid circular import.
    from gym_sokoban.envs import SokobanEnv

    # Cache the surfaces to avoid reloading.
    if SURFACES is None:
        globals()['SURFACES'] = SokobanEnv.load_surfaces()

    if tiny:
        render_fn = room_to_tiny_world_rgb
        surface_name = 'tiny_rgb_array'
    else:
        render_fn = room_to_rgb
        surface_name = 'rgb_array'
    return render_fn(state, surfaces=SURFACES[surface_name])
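# Sketch: turning a cloned environment state back into an RGB image with
# render_state, mirroring the state/structure handling used in
# test_room_to_binary_map_and_back above.
def _example_render_state():
    from gym_sokoban.envs import SokobanEnv
    env = SokobanEnv()
    env.reset()
    flat_state = env.clone_full_state()
    state, structure = render_utils.get_room_state_and_structure(
        flat_state, env.dim_room)
    room = render_utils.make_standalone_state(state, structure)
    return render_state(room, tiny=False)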
def _load_shard_best_action_ignore_finall(shard, data_files_prefix,
                                          env_kwargs):
    """ Choose best action

  If all actions are equally good, give special target value (equal to
  env.action_space.n). For Sokoban this will separate dead ends.
  (for which there is no good action).
  """
    boards = _load_shard(shard, data_files_prefix)
    render_env = SokobanEnv(**env_kwargs)
    data_x = []
    data_y = []
    data_value = []
    vf = ValueLoader()
    policy = PolicyFromValue(vf, env_kwargs)
    assert policy.env_n_actions == render_env.action_space.n
    for vf_for_root in boards:
        root = vf.load_vf_for_root(vf_for_root, compressed=True)
        data = vf.dump_vf_for_root(root)
        for node_state, v in data:
            if v in [0, -float("inf")]:
                # TODO(kc): ValuePerfect does not produce some states which can be
                # obtained after solving game. How to clean it up?
                continue

            render_env.restore_full_state(node_state)
            ob = render_env.render(mode=render_env.mode)
            data_x.append(ob)
            best_actions = policy.act(node_state, return_single_action=False)
            y = np.min(best_actions)
            one_hot_y = np.zeros(shape=render_env.action_space.n, dtype=int)
            one_hot_y[y] = 1
            data_y.append(one_hot_y)
            data_value.append(v)
    return np.asarray(data_x), np.asarray(data_y), \
           dict(value=np.asarray(data_value))
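# Tiny illustration of the target encoding used above: the lowest-indexed best
# action is one-hot encoded over the action space (the action count here is
# hypothetical).
def _example_best_action_target(best_actions=(1, 3), n_actions=8):
    y = np.min(best_actions)
    one_hot_y = np.zeros(shape=n_actions, dtype=int)
    one_hot_y[y] = 1
    return one_hot_y  # e.g. array([0, 1, 0, 0, 0, 0, 0, 0])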
def generate_next_frame_and_done_data(env_kwargs,
                                      seed,
                                      n_trajectories=100,
                                      trajectory_len=40,
                                      clone_done=100):
    num_boxes_range = next_frame_and_done_data_params()["num_boxes_range"]
    if num_boxes_range is None:
        print("num_boxes_range", num_boxes_range)
        num_boxes_range = [env_kwargs["num_boxes"]]
    env_kwargs = deepcopy(env_kwargs)
    np.random.seed(seed)
    env_kwargs["num_boxes"] = num_boxes_range[np.random.randint(
        len(num_boxes_range))]

    render_env = SokobanEnv(**env_kwargs)
    render_env.seed(seed)
    trajectories = list()  # [(observations, actions, done), ...]
    for i in range(n_trajectories):
        render_env.reset()
        state = render_env.clone_full_state()
        # generate random path
        trajectories.append(
            random_trajectory(state, render_env, trajectory_len))

    # parse trajectories into arrays
    data_x = list()
    data_y_next_frame = list()
    data_y_if_done = list()

    for obs, actions, done in trajectories:
        data_x.extend([
            image_with_embedded_action(ob, action, render_env.action_space.n)
            for ob, action in zip(obs[:-1], actions)
        ])
        data_y_next_frame.extend([ob for ob in obs[1:]])
        data_y_if_done.extend([False] * (len(actions) - 1) + [done])

        if done and (clone_done > 1):
            data_x.extend([data_x[-1].copy() for _ in range(clone_done)])
            data_y_next_frame.extend(
                [data_y_next_frame[-1].copy() for _ in range(clone_done)])
            data_y_if_done.extend(
                [data_y_if_done[-1] for _ in range(clone_done)])

    data_x = np.array(data_x)
    data_y = {
        Target.NEXT_FRAME.value: np.array(data_y_next_frame),
        "if_done": np.array(data_y_if_done).reshape((-1, 1)).astype(int),
    }
    return data_x, data_y, {}
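# Illustrative call to generate_next_frame_and_done_data. Relies on
# next_frame_and_done_data_params() being configured elsewhere in the repository;
# the env kwargs and sizes below are placeholders.
def _example_next_frame_data():
    env_kwargs = dict(dim_room=(8, 8), max_steps=100, num_boxes=1, mode='one_hot')
    data_x, data_y, _ = generate_next_frame_and_done_data(
        env_kwargs, seed=0, n_trajectories=2, trajectory_len=5)
    assert data_x.shape[0] == data_y[Target.NEXT_FRAME.value].shape[0]
    return data_x, data_y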
def test_seed(dim=(13, 13), num_boxes=5, mode='rgb_array', seed=None):
    from ctypes import c_uint
    if not seed:
        _, seed = seeding.np_random(None)
    env = SokobanEnv(dim_room=dim,
                     max_steps=100,
                     num_boxes=num_boxes,
                     mode=mode)
    env.seed(seed)
    print("Seed: {}".format(c_uint(seed).value))
    from PIL import Image
    env.reset()
    img = env.render()
    Image.fromarray(img, "RGB").resize((200, 200)).show()
class PolicyFromFullTree(Policy):
    def __init__(self, value_fn, env_kwargs, depth=4):
        self.render_env = SokobanEnv(**env_kwargs)
        self.env_n_actions = self.render_env.action_space.n
        self.value_function = value_fn
        self.env = SokobanEnv(**env_kwargs)
        self.env.reset()
        self.depth = depth
        self.nodes = dict()

    def best_actions(self, state):
        # Produce all action sequences
        seq_ = [range(self.env.action_space.n)] * self.depth
        action_seq = list(product(*seq_))
        # print("len(action_seq) {}".format(len(action_seq)))
        for actions in action_seq:
            root_action = actions[0]
            self.env.restore_full_state(state)
            branch_reward = 0
            current_depth = 0
            for action in actions:
                current_depth += 1
                ob, reward, done, _ = self.env.step(action)
                branch_reward += reward
                node = tuple(self.env.clone_full_state())
                if node not in self.nodes:
                    value = self.value_function(
                        states=np.array(node)
                    )  # self.model.predict(np.expand_dims(ob, axis=0))[0]
                    if done:
                        value += 1000
                    self.nodes[node] = (value, branch_reward, current_depth,
                                        root_action, actions[:current_depth])
                else:
                    value, previous_reward, previous_depth, _, _ = self.nodes[
                        node]
                    if previous_depth > current_depth:
                        # if previous_reward > branch_reward:
                        #   assert branch_reward > 10., "{} {}".format(previous_reward, branch_reward)
                        self.nodes[node] = (value, branch_reward,
                                            current_depth, root_action,
                                            actions[:current_depth])
                if done:
                    break
        # self.nodes.values()
        best_node = max(
            self.nodes.keys(),
            key=(lambda node: self.nodes[node][0] + self.nodes[node][1]))
        node_value, branch_reward, current_depth, root_action, actions = self.nodes[
            best_node]
        # print("Distinct leaves {}".format(len(self.nodes)))
        # print("Node value {}, reward {:.1f}, depth {}, action {}, actions {}".format(
        #     node_value, branch_reward, current_depth, root_action, actions))
        return [root_action]
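# Sketch of exhaustive lookahead with PolicyFromFullTree. The constant value
# function is a placeholder for a trained network wrapper; depth is kept small
# because the number of evaluated action sequences grows as action_space.n ** depth.
def _example_full_tree_policy():
    env_kwargs = dict(dim_room=(8, 8), num_boxes=1, mode='one_hot')
    value_fn = lambda states: 0.0  # hypothetical stand-in value function
    policy = PolicyFromFullTree(value_fn, env_kwargs, depth=2)
    probe_env = SokobanEnv(**env_kwargs)
    probe_env.reset()
    return policy.best_actions(probe_env.clone_full_state())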
class QFromV(object):
    def __init__(self,
                 value_function,
                 env_kwargs,
                 nan_for_zero_value=True,
                 copy_negative=True):
        self.value_function = value_function
        self.env = SokobanEnv(**env_kwargs)
        self.env.reset()
        self.nan_for_zero_value = nan_for_zero_value
        self.copy_negative_values = copy_negative

    @property
    def env_n_actions(self):
        return self.env.action_space.n

    def q_values(self, state):
        q_values = list()
        if self.nan_for_zero_value:
            # Value might not have children for Sokoban success states.
            if self.value_function(states=state) == 0:
                return [np.nan] * self.env_n_actions
        if self.copy_negative_values:
            # For speed-up
            val = self.value_function(states=state)[0]
            if val < 0:
                return [val] * self.env_n_actions

        for action in range(self.env_n_actions):
            self.env.restore_full_state(state)
            ob, reward, done, _ = self.env.step(action)
            value = reward
            child_state = self.env.clone_full_state()
            if not done:
                value += self.value_function(states=child_state)[0]
            q_values.append(float(value))
        return q_values
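# Sketch: deriving per-action Q-values from a state-value function with QFromV.
# The value function is a trivial placeholder returning a one-element array.
def _example_q_from_v():
    env_kwargs = dict(dim_room=(8, 8), num_boxes=1, mode='one_hot')
    value_fn = lambda states: np.array([-1.0])  # hypothetical stand-in
    q = QFromV(value_fn, env_kwargs,
               nan_for_zero_value=False, copy_negative=False)
    probe_env = SokobanEnv(**env_kwargs)
    probe_env.reset()
    return q.q_values(probe_env.clone_full_state())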
    def __init__(self,
                 *,
                 data_files_prefix,
                 env,
                 net,
                 epochs,
                 batch_size,
                 lr,
                 lr_decay=0.0,
                 shards_to_use=None,
                 validation_shards=1,
                 save_every=None,
                 output_dir,
                 histogram_freq=None,
                 validate_every_batch=5000,
                 neptune_first_batch=10000,
                 target="vf",
                 loss=None,
                 n_cores=None,
                 sample_data=False,
                 max_samples_per_board=1000,
                 eval_games_to_play=10,
                 **kwargs):
        if shards_to_use is None:
            self.number_of_shards = infer_number_of_shards(data_files_prefix)
        else:
            self.number_of_shards = shards_to_use
        self.validation_shards = validation_shards
        assert self.validation_shards < self.number_of_shards
        if self.number_of_shards == 1:
            print(
                "WARNING: there is only one shard, so it is used for both training "
                "and validation.")
            self.training_shards = [0]
            self.validation_shards = [0]
        else:
            self.training_shards = list(
                range(self.number_of_shards - self.validation_shards))
            self.validation_shards = list(
                range(self.number_of_shards - self.validation_shards,
                      self.number_of_shards))

        self.data_files_prefix = data_files_prefix
        self.save_every = save_every
        self.checkpoint_dir = os.path.join(output_dir, "checkpoints",
                                           "epoch.{epoch:04d}.hdf5")
        os.makedirs(os.path.dirname(self.checkpoint_dir), exist_ok=True)
        self.exp_dir_path = output_dir
        self.histogram_freq = histogram_freq
        self.epochs = epochs
        self.env_kwargs = env
        self.render_env = SokobanEnv(**self.env_kwargs)
        self.render_mode = self.render_env.mode
        self.target = Target(target)
        del target
        print("self.target", self.target)
        self.loss = loss_for_target(self.target, loss)
        final_activation = final_network_activation(self.target)
        net_output_size = net_output_size_for_target(
            self.target, self.render_env.action_space.n,
            n_channels_from_mode(env.get("mode", "one_hot")))
        input_channels = n_channels_from_mode(env.get("mode", "one_hot"))
        if self.target in [Target.NEXT_FRAME, Target.NEXT_FRAME_AND_DONE]:
            input_channels += SokobanEnv(**env).action_space.n
        if self.target in [Target.DELTA_VALUE, Target.BEST_ACTION_FRAMESTACK]:
            input_channels *= 2
        self.metrics = [self.loss]
        if isinstance(self.loss, dict):
            # Dirty workaround for the vf_and_type target: the loss is a dict of
            # per-output losses, so pass it through directly as the metrics spec.
            self.metrics = self.metrics[0]
        self.network = get_network(input_shape=tuple(
            list(env["dim_room"]) + list((input_channels, ))),
                                   output_size=net_output_size,
                                   final_activation=final_activation,
                                   **net)
        self.network.compile(optimizer="adam",
                             loss=self.loss,
                             metrics=self.metrics)
        self.learning_rate_lambda = lambda epoch: lr / (1 + lr_decay * epoch)
        self.batch_size = batch_size
        self.validate_every_batch = validate_every_batch
        self.neptune_first_batch = neptune_first_batch
        if n_cores is None:
            n_cores = count_cpu()
        self.n_cores = n_cores
        self.sample_data = sample_data
        self.max_samples_per_board = max_samples_per_board
        self.random_state = np.random.RandomState(0)
        self.eval_games_to_play = eval_games_to_play
def process_board_data(compressed_data, target, env_kwargs, sample_data,
                       max_sample_size, random_state):
    """

  Args:
    compressed_data: dictionary with keys containing ["full_env_state",
      "perfect_value",  "perfect_q"], mapping to compressed arrays.
  """
    render_env = SokobanEnv(**env_kwargs)
    keys = compressed_data.keys()
    assert_v2_keys(compressed_data)

    data = {key: decompress_np_array(compressed_data[key]) for key in keys}
    assert_env_and_state_match(env_kwargs, data["full_env_state"][0])

    filter_values_fn = lambda v, q: False

    stratified_sample_fn = lambda values, q: stratified_sample(
        values, q, max_sample_size, random_state)
    simple_sample_fn = lambda values, q: simple_sample(
        values, q, max_sample_size, random_state)

    if target == Target.VF:
        sample_fn = stratified_sample_fn
    elif target == Target.VF_SOLVABLE_ONLY:
        filter_values_fn = lambda v, q: not is_solvable_state(v, q)
        sample_fn = simple_sample_fn
    elif target == Target.STATE_TYPE:
        sample_fn = stratified_sample_fn
    elif target == Target.BEST_ACTION:
        filter_values_fn = lambda v, q: not is_solvable_state(v, q)
        sample_fn = simple_sample_fn
    elif target == Target.VF_AND_TYPE:
        sample_fn = stratified_sample_fn
    elif target == Target.NEXT_FRAME:
        sample_fn = stratified_sample_fn
    elif target == Target.DELTA_VALUE:
        sample_fn = stratified_sample_fn
    elif target == Target.VF_DISCOUNTED:
        sample_fn = stratified_sample_fn
    elif target == Target.BEST_ACTION_FRAMESTACK:
        filter_values_fn = lambda v, q: not is_solvable_state(v, q)
        sample_fn = simple_sample_fn
    elif target == Target.NEXT_FRAME_AND_DONE:
        sample_fn = stratified_sample_fn
    else:
        raise ValueError("Unknown target {}".format(target))

    mask = ~np.array([
        filter_values_fn(v, q)
        for v, q in zip(data['perfect_value'], data['perfect_q'])
    ],
                     dtype=bool)
    data = {key: data[key][mask] for key in keys}
    if sample_data:
        sample_ix = sample_fn(data["perfect_value"], data["perfect_q"])
    else:
        raise NotImplementedError("only sample_data=True is currently supported")

    if target == Target.DELTA_VALUE:
        data_x, data_y = extract_delta_value(data, sample_ix, render_env,
                                             random_state)
    elif target == Target.VF_DISCOUNTED:
        data_x, data_y = extract_discounted_value(
            sample_ix,
            states=data["full_env_state"],
            perfect_v=data["perfect_value"],
            perfect_q=data["perfect_q"],
            render_env=render_env,
        )
    elif target == Target.BEST_ACTION_FRAMESTACK:
        data_x, data_y = extract_best_action_from_framestack(
            sample_ix,
            states=data["full_env_state"],
            perfect_v=data["perfect_value"],
            perfect_q=data["perfect_q"],
            render_env=render_env,
        )
    else:
        data = {key: data[key][sample_ix] for key in keys}
        if target == Target.NEXT_FRAME:
            data_x, data_y = extract_next_frame_input_and_target(
                data["full_env_state"], render_env)
        else:
            obs = list()
            for node_state in data['full_env_state']:
                render_env.restore_full_state(node_state)
                ob = render_env.render(mode=render_env.mode)
                obs.append(ob)
            data_x = np.array(obs)
            data_y = extract_target_from_value(perfect_v=data["perfect_value"],
                                               perfect_q=data["perfect_q"],
                                               target=target)
    if isinstance(data_y, np.ndarray):
        assert len(data_y.shape) > 1, "data_y should be batched (if target is " \
                                      "scalar it should have shape (num_samples, 1))"
    return data_x, data_y, {}