Example #1
def select_action_in_rollout(available_after_states, policy_weights, num_features,
                             use_filters_during_rollout, feature_directors, use_dom, use_cumul_dom):
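    """Return the after-state with the highest linear utility.

    The dominance filter (if enabled) operates on the features multiplied by
    feature_directors; use_dom / use_cumul_dom select which filter is applied
    before scoring.
    """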
    num_after_states = len(available_after_states)
    action_features = np.zeros((num_after_states, num_features))
    for ix, after_state in enumerate(available_after_states):
        action_features[ix] = after_state.get_features_pure(False)  # , order_by=self.feature_order
    if use_filters_during_rollout:
        not_simply_dominated, not_cumu_dominated = dominance_filter(action_features * feature_directors,
                                                                    len_after_states=num_after_states)  # domtools.
        if use_dom:
            action_features = action_features[not_simply_dominated]
            map_back_vector = np.nonzero(not_simply_dominated)[0]
        elif use_cumul_dom:
            action_features = action_features[not_cumu_dominated]
            map_back_vector = np.nonzero(not_cumu_dominated)[0]
        else:
            raise ValueError("Either use_dom or use_cumul_dom has to be true.")

    utilities = action_features.dot(np.ascontiguousarray(policy_weights))
    move_index = np.argmax(utilities)
    if use_filters_during_rollout:
        state_tmp = available_after_states[map_back_vector[move_index]]
    else:
        state_tmp = available_after_states[move_index]
    return state_tmp
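
Note: all of these examples call a dominance_filter helper (apparently from a domtools module, per the inline comments) that is not shown here. Purely to illustrate the interface it is expected to provide, the following is a minimal, unoptimized sketch. It assumes that "simple" dominance means some other action is at least as good on every directed feature and strictly better on at least one, and that "cumulative" dominance applies the same test to prefix sums of the features in their given order; the actual domtools implementation may differ, and the name dominance_filter_sketch is hypothetical.

import numpy as np

def dominance_filter_sketch(directed_features, len_after_states):
    # Hypothetical stand-in for domtools' dominance_filter. Assumes the
    # features have already been multiplied by their directors, so larger
    # values are always better.
    not_simply_dominated = np.ones(len_after_states, dtype=np.bool_)
    not_cumu_dominated = np.ones(len_after_states, dtype=np.bool_)
    cumulative = np.cumsum(directed_features, axis=1)  # prefix sums per action
    for i in range(len_after_states):
        for j in range(len_after_states):
            if i == j:
                continue
            # Simple (Pareto) dominance: j is at least as good on every
            # feature and strictly better on at least one.
            if (np.all(directed_features[j] >= directed_features[i])
                    and np.any(directed_features[j] > directed_features[i])):
                not_simply_dominated[i] = False
            # Cumulative dominance: the same test applied to the prefix sums.
            if (np.all(cumulative[j] >= cumulative[i])
                    and np.any(cumulative[j] > cumulative[i])):
                not_cumu_dominated[i] = False
    return not_simply_dominated, not_cumu_dominated
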
Example #2
def choose_max_util_action_in_rollout(available_after_states, policy_weights,
                                      rollout_dom_filter,
                                      rollout_cumu_dom_filter,
                                      feature_directors, num_features):
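    """Return an after-state with maximal utility, breaking ties at random.

    Features are obtained already directed via get_features_and_direct;
    dominated after-states can be removed before scoring.
    """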
    num_states = len(available_after_states)
    action_features = np.zeros((num_states, num_features))
    for ix, after_state in enumerate(available_after_states):
        action_features[ix] = after_state.get_features_and_direct(
            feature_directors, False)  # , order_by=self.feature_order
    if rollout_dom_filter or rollout_cumu_dom_filter:
        not_simply_dominated, not_cumu_dominated = dominance_filter(
            action_features, len_after_states=num_states)  # domtools.
        if rollout_cumu_dom_filter:
            action_features = action_features[not_cumu_dominated]
            map_back_vector = np.nonzero(not_cumu_dominated)[0]
        elif rollout_dom_filter:
            action_features = action_features[not_simply_dominated]
            map_back_vector = np.nonzero(not_simply_dominated)[0]
    else:
        map_back_vector = np.arange(num_states)
    utilities = action_features.dot(policy_weights)
    move_index = np.random.choice(
        map_back_vector[utilities == np.max(utilities)])
    # move_index = np.argmax(utilities)
    move = available_after_states[move_index]
    return move
Example #3
def choose_greedy_if_reward_else_max_util_from_learned_directions_action_in_rollout(
        available_after_states, policy_weights, rollout_dom_filter,
        rollout_cumu_dom_filter, feature_directors, num_features,
        learned_directions):
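    """Reward-greedy rollout action selection.

    If any after-state clears lines, restrict the choice to those clearing the
    most lines; among the remaining candidates, pick an after-state with
    maximal utility under policy_weights * learned_directions, breaking ties
    at random.
    """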
    num_states = len(available_after_states)
    action_features = np.zeros((num_states, num_features))
    for ix, after_state in enumerate(available_after_states):
        action_features[ix] = after_state.get_features_and_direct(
            feature_directors, False)  # , order_by=self.feature_order

    if rollout_dom_filter or rollout_cumu_dom_filter:
        not_simply_dominated, not_cumu_dominated = dominance_filter(
            action_features, len_after_states=num_states)  # domtools.
        # Keep action_features aligned with the surviving after-states so the
        # reward-greedy restriction and the utilities below index consistently.
        if rollout_cumu_dom_filter:
            action_features = action_features[not_cumu_dominated]
            available_after_states = [
                s for (s, d) in zip(available_after_states, not_cumu_dominated)
                if d
            ]
        elif rollout_dom_filter:
            action_features = action_features[not_simply_dominated]
            available_after_states = [
                s
                for (s, d) in zip(available_after_states, not_simply_dominated)
                if d
            ]
        num_states = len(available_after_states)
    # else:
    #     map_back_vector = np.arange(num_states)

    rewards = np.zeros(num_states)
    max_reward = 0
    for ix, after_state in enumerate(available_after_states):
        reward_of_after_state = after_state.n_cleared_lines
        if reward_of_after_state > 0:
            rewards[ix] = after_state.n_cleared_lines
            if reward_of_after_state > max_reward:
                max_reward = reward_of_after_state
    if max_reward > 0:
        # max_reward_indeces = rewards == max_reward
        # available_after_states = [s for (s, d) in zip(available_after_states, not_simply_dominated) if d]
        max_reward_indeces = np.where(rewards == max_reward)[0]
        available_after_states = [
            available_after_states[i] for i in max_reward_indeces
        ]
        action_features = action_features[max_reward_indeces]
        num_states = len(available_after_states)
    utilities = action_features.dot(policy_weights * learned_directions)
    # utilities == np.max(utilities)
    move_index = np.random.choice(
        np.arange(num_states)[utilities == np.max(utilities)])
    # move_index = np.argmax(utilities)
    move = available_after_states[move_index]
    return move
Example #4
def choose_action_using_rollouts(
        start_state, start_tetromino, rollout_mechanism, rollout_length,
        generative_model, policy_weights, dom_filter, cumu_dom_filter,
        rollout_dom_filter, rollout_cumu_dom_filter, feature_directors,
        num_features, gamma, number_of_rollouts_per_child, learned_directions):
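    """Evaluate every child of start_state with Monte-Carlo rollouts.

    Children filtered out as dominated are skipped and assigned a value of
    -inf. Returns the best child (ties broken at random), its index, and the
    feature matrix of all children, or dummy values if there are no children.
    """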
    children_states = start_tetromino.get_after_states(start_state)
    num_children = len(children_states)
    if num_children == 0:
        # Game over!
        return (
            State(np.zeros((1, 1), dtype=np.bool_), np.zeros(1,
                                                             dtype=np.int64),
                  np.array([0], dtype=np.int64), np.array([0], dtype=np.int64),
                  0.0, 1, "bcts", True, False),  # dummy state
            0,  # dummy child_index
            np.zeros((2, 2)))  # dummy action_features

    action_features = np.zeros((num_children, num_features), dtype=np.float_)
    for ix in range(num_children):
        action_features[ix] = children_states[ix].get_features_and_direct(
            feature_directors, False)  # , order_by=self.feature_order
    if dom_filter or cumu_dom_filter:
        not_simply_dominated, not_cumu_dominated = dominance_filter(
            action_features, len_after_states=num_children)

    child_total_values = np.zeros(num_children)
    for child in range(num_children):
        do_rollout = False
        if cumu_dom_filter:
            if not_cumu_dominated[child]:
                do_rollout = True
        elif dom_filter:
            if not_simply_dominated[child]:
                do_rollout = True
        else:
            do_rollout = True

        if do_rollout:
            for rollout_ix in range(number_of_rollouts_per_child):
                child_total_values[child] += roll_out(
                    children_states[child], rollout_length, rollout_mechanism,
                    generative_model, policy_weights, rollout_dom_filter,
                    rollout_cumu_dom_filter, feature_directors, num_features,
                    gamma, learned_directions)
        else:
            child_total_values[child] = -np.inf

    max_value = np.max(child_total_values)
    max_value_indices = np.where(child_total_values == max_value)[0]
    child_index = np.random.choice(max_value_indices)
    return children_states[child_index], child_index, action_features
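
The roll_out function called above is not among these examples. Based on the inline rollout loop in Example #7, a plausible reconstruction returns the discounted sum of cleared lines over at most rollout_length steps while following one of the rollout policies above. The sketch below is an assumption: the name roll_out_sketch is hypothetical, and rollout_mechanism / learned_directions (which presumably switch between the policies of Examples #2 and #3) are ignored in favor of the Example #2 policy.

def roll_out_sketch(start_state, rollout_length, rollout_mechanism,
                    generative_model, policy_weights, rollout_dom_filter,
                    rollout_cumu_dom_filter, feature_directors, num_features,
                    gamma, learned_directions):
    # Hypothetical reconstruction: follow a fixed rollout policy for at most
    # rollout_length steps and return the discounted sum of cleared lines.
    state_tmp = start_state
    cumulative_reward = start_state.n_cleared_lines
    count = 0
    while count < rollout_length:
        generative_model.next_tetromino()
        available_after_states = generative_model.get_after_states(state_tmp)
        if len(available_after_states) == 0:
            break  # terminal state: the game is over
        state_tmp = choose_max_util_action_in_rollout(
            available_after_states, policy_weights, rollout_dom_filter,
            rollout_cumu_dom_filter, feature_directors, num_features)
        cumulative_reward += (gamma ** count) * state_tmp.n_cleared_lines
        count += 1
    return cumulative_reward
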
Example #5
def calculate_available_actions(rollout_state_population, generative_model, env):
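    """Print the mean number of available actions with and without
    cumulative-dominance filtering over a population of rollout states."""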
    print("Calculate available actions with and without filters")
    num_av_acts = np.zeros(len(rollout_state_population))
    num_fil_av_acts = np.zeros(len(rollout_state_population))
    feature_directors = np.array([-1, -1, -1, -1, -1, -1, 1, -1], dtype=np.float64)
    for ix in range(len(rollout_state_population)):
        # print(ix)
        generative_model.next_tetromino()
        child_states = generative_model.get_after_states(rollout_state_population[ix])
        num_child_states = len(child_states)
        num_av_acts[ix] = len(child_states)

        state_action_features = np.zeros((num_child_states, env.num_features), dtype=np.float_)
        for child_ix in range(num_child_states):
            state_action_features[child_ix] = child_states[child_ix].get_features_pure(False)  # , order_by=self.feature_order

        not_simply_dominated, not_cumu_dominated = dominance_filter(state_action_features * feature_directors,
                                                                    len_after_states=num_child_states)
        num_fil_av_acts[ix] = np.sum(not_cumu_dominated)

    print(f"The mean number of available actions was {np.mean(num_av_acts)}")
    print(f"The mean number of FILTERED available actions was {np.mean(num_fil_av_acts)}")
Example #6
    def choose_action(self, start_state, start_tetromino):
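        """Greedy evaluation policy: return the child state with the highest
        utility (ties broken at random), optionally removing dominated
        children first. Returns a dummy terminal State if there are no
        children."""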
        children_states = start_tetromino.get_after_states(start_state)  # , current_state=
        num_children = len(children_states)
        if num_children == 0:
            # Terminal state!!
            return State(np.zeros((1, 1), dtype=np.bool_),
                         np.zeros(1, dtype=np.int64),
                         np.array([0], dtype=np.int64),
                         np.array([0], dtype=np.int64),
                         0.0,
                         1,
                         "bcts",
                         True,
                         False)

        action_features = np.zeros((num_children, self.num_features))
        for ix, after_state in enumerate(children_states):
            action_features[ix] = after_state.get_features_pure(False)

        if self.use_filter_in_eval:
            not_simply_dominated, not_cumu_dominated = dominance_filter(action_features * self.feature_directors,
                                                                        len_after_states=num_children)  # domtools.
            if self.use_cumul_dom_filter:
                action_features = action_features[not_cumu_dominated]
                map_back_vector = np.nonzero(not_cumu_dominated)[0]
            else:
                action_features = action_features[not_simply_dominated]
                map_back_vector = np.nonzero(not_simply_dominated)[0]

        utilities = action_features.dot(np.ascontiguousarray(self.policy_weights))
        max_indices = np.where(utilities == np.max(utilities))[0]
        move_index = np.random.choice(max_indices)
        if self.use_filter_in_eval:
            move = children_states[map_back_vector[move_index]]
        else:
            move = children_states[move_index]
        return move
Example #7
def general_action_value_rollout(use_filters_during_rollout,
                                 use_filters_before_rollout,
                                 start_state,
                                 rollout_length,
                                 rollouts_per_action,
                                 gamma,
                                 generative_model,
                                 policy_weights,
                                 value_weights,
                                 num_features,
                                 use_state_values,
                                 reward_greedy,
                                 use_dom,
                                 use_cumul_dom,
                                 feature_directors):
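    """Estimate an action value for every child state of start_state.

    Each non-dominated child is evaluated with rollouts_per_action truncated
    rollouts of length rollout_length; if use_state_values is set, a learned
    state value is added at the truncation point. Children filtered out as
    dominated are dropped from the returned arrays.
    """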
    child_states = generative_model.get_after_states(start_state)
    num_child_states = len(child_states)
    action_value_estimates = np.zeros(num_child_states)
    state_action_features = np.zeros((num_child_states, num_features))

    if num_child_states == 0:
        # Rollout starting state is terminal state
        return action_value_estimates, state_action_features

    for ix in range(num_child_states):
        state_action_features[ix] = child_states[ix].get_features_pure(False)  # , order_by=self.feature_order

    if use_filters_before_rollout:
        not_simply_dominated, not_cumu_dominated = dominance_filter(state_action_features * feature_directors,
                                                                    len_after_states=num_child_states)

    is_not_filtered_out = np.ones(num_child_states, dtype=np.bool_)
    for child_ix in range(num_child_states):
        do_rollout = False
        if use_filters_before_rollout:
            if use_dom:
                if not_simply_dominated[child_ix]:
                    do_rollout = True
            elif use_cumul_dom:
                if not_cumu_dominated[child_ix]:
                    do_rollout = True
            else:
                raise ValueError
        else:
            do_rollout = True

        if do_rollout:
            start_reward = child_states[child_ix].n_cleared_lines

            for rollout_ix in range(rollouts_per_action):
                # Restart every rollout from the child state being evaluated.
                state_tmp = child_states[child_ix]
                cumulative_reward = start_reward
                game_ended = False
                count = 0
                while not game_ended and count < rollout_length:  # at most rollout_length steps per rollout
                    generative_model.next_tetromino()
                    available_after_states = generative_model.get_after_states(state_tmp)
                    num_after_states = len(available_after_states)
                    if num_after_states == 0:
                        # Terminal state
                        game_ended = True
                    else:
                        state_tmp = select_action_in_rollout(available_after_states, policy_weights,
                                                             num_features, use_filters_during_rollout, feature_directors,
                                                             use_dom, use_cumul_dom)
                        cumulative_reward += (gamma ** count) * state_tmp.n_cleared_lines
                    count += 1

                # One more (the (rollout_length+1)-th) for truncation value!
                if use_state_values and not game_ended:
                    generative_model.next_tetromino()
                    available_after_states = generative_model.get_after_states(state_tmp)
                    num_after_states = len(available_after_states)
                    if num_after_states > 0:
                        state_tmp = select_action_in_rollout(available_after_states, policy_weights,
                                                             num_features, use_filters_during_rollout, feature_directors,
                                                             use_dom, use_cumul_dom)

                        # Get state value of last state.
                        final_state_features = state_tmp.get_features_pure(True)
                        cumulative_reward += (gamma ** count) * final_state_features.dot(value_weights)

                action_value_estimates[child_ix] += cumulative_reward
        else:
            is_not_filtered_out[child_ix] = False
            action_value_estimates[child_ix] = -np.inf

    action_value_estimates = action_value_estimates[is_not_filtered_out]
    state_action_features = state_action_features[is_not_filtered_out]
    action_value_estimates /= rollouts_per_action
    return action_value_estimates, state_action_features