Example #1
def abstract_tf(intervals, new_state_bounds, sink):

    adder = 1 if sink else 0
    abs_tf = np.zeros(len(intervals) + adder)
    for ns in new_state_bounds:
        min_mcrst = helper.get_mcrst(ns[0], intervals, sink)
        max_mcrst = helper.get_mcrst(ns[1], intervals, sink)

        if min_mcrst == max_mcrst:
            abs_tf[min_mcrst] += 1

        else:
            den = ns[1] - ns[0]

            if min_mcrst == -1:
                abs_tf[min_mcrst] += (intervals[0][0] - ns[0]) / den
            else:
                abs_tf[min_mcrst] += (intervals[min_mcrst][1] - ns[0]) / den
            if max_mcrst == len(intervals):
                abs_tf[max_mcrst] += (ns[1] - intervals[-1][1]) / den
            else:
                abs_tf[max_mcrst] += (ns[1] - intervals[max_mcrst][0]) / den

            for i in range(min_mcrst + 1, max_mcrst):
                abs_tf[i] += (intervals[i][1] - intervals[i][0]) / den

    return helper.normalize_array(abs_tf)
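A minimal usage sketch with hypothetical inputs. Based on the branches above, helper.get_mcrst is assumed to return -1 below the first interval, len(intervals) above the last, and the index of the containing interval otherwise; helper.normalize_array is assumed to rescale the array to sum to 1:

import numpy as np

# Three macrostates covering [0, 3], plus one sink entry (sink=True).
intervals = [[0.0, 1.0], [1.0, 2.0], [2.0, 3.0]]

# One sample whose new state is only known to lie in [0.5, 1.5]:
# half of that range overlaps macrostate 0 and half macrostate 1.
new_state_bounds = [[0.5, 1.5]]

tf = abstract_tf(intervals, new_state_bounds, True)
# Expected result: [0.5, 0.5, 0.0, 0.0] (the last entry is the sink).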
Example #2
def abstract_tf(intervals, new_state_bounds, sink):

    adder = 1 if sink else 0
    abs_tf = []
    for i in range(0, len(intervals) + adder):
        abs_tf.append([0, 0])

    for ns in new_state_bounds:
        min_mcrst = helper.get_mcrst(ns[0], intervals, sink)
        max_mcrst = helper.get_mcrst(ns[1], intervals, sink)

        # Lower bound: the sample certainly lands here only when its whole
        # range falls inside a single macrostate.
        if min_mcrst == max_mcrst:
            abs_tf[min_mcrst][0] += 1

        # Upper bound: the sample might land in any macrostate it spans.
        for i in range(min_mcrst, max_mcrst + 1):
            abs_tf[i][1] += 1

        # Correction: with a sink, indices -1 and len(intervals) address the
        # same (last) entry, so its upper bound was incremented twice above.
        if min_mcrst == -1 and max_mcrst == len(intervals):
            abs_tf[min_mcrst][1] -= 1

    # normalization.
    den = len(new_state_bounds)
    return [[el[0] / den, el[1] / den] for el in abs_tf]
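Unlike Example #1, this variant returns a [lower, upper] probability interval per macrostate: the lower bound counts samples that certainly land there, the upper bound counts samples that merely might. A worked sketch with hypothetical inputs:

intervals = [[0.0, 1.0], [1.0, 2.0]]
new_state_bounds = [[0.2, 0.4],   # certainly lands in macrostate 0
                    [0.5, 1.5]]   # may land in macrostate 0 or 1

bounds = abstract_tf(intervals, new_state_bounds, False)
# Expected result: [[0.5, 1.0], [0.0, 0.5]]
# macrostate 0: between 1 and 2 of the 2 samples land here;
# macrostate 1: between 0 and 1 of the 2 samples land here.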
Example #3
    def create_arriving_mcrst_helper(self):

        for cont in self.container:
            for act in cont.keys():

                # Evaluate the effect of act on every sample in the macrostate.
                # --> We assume the Lipschitz-0 hypothesis holds for delta s, which lets us add fictitious samples! <--
                sample = cont[act]
                delta_s = sample['new_state'] - sample['state']
                self.arriving_mcrst_helper[act] = {}  # every action is a key.
                ns_index = helper.get_mcrst(sample['new_state'],
                                            self.intervals, self.sink)
                self.arriving_mcrst_helper[act][
                    ns_index] = 1  # every index of an arriving mcrst is a key.

                # Apply the delta s of the sample to every other state in the macrostate.
                for act2 in cont.keys():
                    if act != act2:  # evaluation of act in all the other samples in the mcrst.
                        new_state = cont[act2]['state'] + delta_s
                        index = helper.get_mcrst(new_state, self.intervals,
                                                 self.sink)

                        if index in self.arriving_mcrst_helper[act].keys():
                            self.arriving_mcrst_helper[act][index] += 1
                        else:
                            self.arriving_mcrst_helper[act][
                                index] = 1  # every index of an arriving mcrst is a key.
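A comment-only sketch of what the Lipschitz-0 hypothesis buys here: the displacement delta_s is assumed to depend only on the action, so the displacement observed for one sample can be transplanted onto every other sampled state of the macrostate (values below are illustrative):

# Suppose one macrostate holds two samples:
#   action a1: state 0.2 -> new_state 0.7   (delta_s = 0.5)
#   action a2: state 0.4 -> new_state 0.9
# Under the Lipschitz-0 hypothesis, a1 applied to 0.4 would also move the
# state by 0.5, yielding the fictitious arrival 0.4 + 0.5 = 0.9.
# arriving_mcrst_helper[a1] then counts the macrostates reached both by the
# real arrival (0.7) and by the fictitious one (0.9).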
Example #4
def abstract_tf(intervals, new_state_bounds, sink):

    adder = 1 if sink else 0
    abs_tf = np.zeros(len(intervals) + adder)
    # Min & max new state reachable by performing the action in the mcrst, according to the samples.
    new_st_min = min([ns[0] for ns in new_state_bounds])
    new_st_max = max([ns[1] for ns in new_state_bounds])

    min_mcrst = helper.get_mcrst(new_st_min, intervals, sink)
    max_mcrst = helper.get_mcrst(new_st_max, intervals, sink)

    if min_mcrst == max_mcrst:
        abs_tf[min_mcrst] += 1

    else:
        if min_mcrst == -1:
            abs_tf[min_mcrst] += (intervals[0][0] - new_st_min)
        else:
            abs_tf[min_mcrst] += (intervals[min_mcrst][1] - new_st_min)
        if max_mcrst == len(intervals):
            abs_tf[max_mcrst] += (new_st_max - intervals[-1][1])
        else:
            abs_tf[max_mcrst] += (new_st_max - intervals[max_mcrst][0])

        for i in range(min_mcrst + 1, max_mcrst):
            abs_tf[i] += (intervals[i][1] - intervals[i][0])

    return helper.normalize_array(abs_tf)
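Unlike Example #1, which splits each sample's own range, this version first merges all samples into a single envelope [new_st_min, new_st_max] and assigns each macrostate its overlap length with that envelope; helper.normalize_array then turns the lengths into probabilities. A worked example with hypothetical values:

# With intervals = [[0, 1], [1, 2]] and samples bounded by [0.5, 0.8]
# and [1.2, 1.6], the envelope is [0.5, 1.6]:
#   macrostate 0 gets length 1.0 - 0.5 = 0.5,
#   macrostate 1 gets length 1.6 - 1.0 = 0.6,
# and normalization yields roughly [0.45, 0.55].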
Example #5
    def create_arriving_mcrst_helper(self):

        for cont in self.container:
            for act in cont.keys():

                # Evaluate the effect of act on every sample in the macrostate.
                # --> We assume the Lipschitz-0 hypothesis holds for delta s, which lets us add fictitious samples! <--
                sample = cont[act]
                delta_s = sample['new_state'] - sample['state']
                self.arriving_mcrst_helper[act] = {}

                # Apply the delta s of the sample to every other state in the macrostate.
                for act2 in cont.keys():
                    if act != act2:
                        new_state = cont[act2]['state'] + delta_s
                        new_state_mcrst = helper.get_mcrst(
                            new_state, self.intervals, self.sink)

                        if new_state_mcrst in self.arriving_mcrst_helper[
                                act].keys():
                            self.arriving_mcrst_helper[act][
                                new_state_mcrst] += 1
                        else:
                            self.arriving_mcrst_helper[act][
                                new_state_mcrst] = 1
Example #6
def estimate_performance_abstract_policy(env, n_episodes, n_steps,
                                         abstract_policy, init_states, interv,
                                         INTERVALS):
    acc = 0
    for i in range(0, n_episodes):
        env.reset(init_states[i])
        g = 1
        for j in range(0, n_steps):
            state = env.get_state()
            if interv is not None:
                action = abstract_policy[helper.get_mcrst(state, interv,
                                                          SINK)][0]
            else:
                action = abstract_policy[helper.get_mcrst(
                    state, INTERVALS, SINK)][0]
            new_state, r, _, _ = env.step(action)
            acc += g * r
            g *= GAMMA
    return acc / n_episodes
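This is a plain Monte Carlo estimate of the discounted return of the abstract policy: over n_episodes rollouts of n_steps each, it accumulates gamma^j * r_j (with gamma = GAMMA, a module-level constant, like SINK) and averages over episodes, i.e. J ≈ (1/N) * Σ_i Σ_j gamma^j * r_{i,j}.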
Example #7
def sampling_abstract_optimal_pol(abs_opt_policy, det_samples, param, interv,
                                  INTERVALS):
    fictitious_samples = []
    for sam in det_samples:
        single_sample = []
        for s in sam:
            prev_action = deterministic_action(param, s[0])
            if interv is not None:
                mcrst = helper.get_mcrst(s[0], interv, SINK)
            else:
                mcrst = helper.get_mcrst(s[0], INTERVALS, SINK)
            if prev_action in abs_opt_policy[mcrst]:
                single_sample.append([s[0], prev_action])
            else:
                index = np.argmin(
                    [abs(act - prev_action) for act in abs_opt_policy[mcrst]])
                single_sample.append([s[0], abs_opt_policy[mcrst][index]])
        fictitious_samples.append(single_sample)
    return fictitious_samples
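The else branch is a projection step: when the deterministic action is not admissible in the macrostate, it is snapped to the nearest action kept by the abstract optimal policy. That step in isolation, with hypothetical values:

import numpy as np

abs_opt_actions = [-0.4, 0.1, 0.3]   # actions admitted in the macrostate
prev_action = 0.22                   # action the current policy proposes

index = np.argmin([abs(act - prev_action) for act in abs_opt_actions])
# index == 2, so the fictitious sample stores action 0.3.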
Example #8
def sampling_abstract_optimal_pol(abs_opt_policy, det_samples):
    fictitious_samples = []
    for sam in det_samples:
        single_sample = []
        for s in sam:
            prev_action = deterministic_action(np.reshape(s[0], (1, 1)))
            prev_action = prev_action[0]
            mcrst = helper.get_mcrst(s[0], INTERVALS, SINK)
            if prev_action in abs_opt_policy[mcrst]:
                single_sample.append([s[0], prev_action])
            else:
                index = np.argmin(
                    [abs(act - prev_action) for act in abs_opt_policy[mcrst]])
                single_sample.append([s[0], abs_opt_policy[mcrst][index]])
        fictitious_samples.append(single_sample)
    return fictitious_samples
Example #9
    def construct_problem(self):
        self.init_operation()  # Initialize some variables of support.
        theta = cp.Variable((self.n_actions, self.i), nonneg=True)
        objective = cp.Minimize(-cp.sum(cp.multiply(self.I, cp.log(theta))))

        constraints = []
        # Sum of rows must be equal to 1.
        for k in range(0, self.n_actions):
            constraints.append(cp.sum(theta[k]) == 1)

        # Lipschitz hypothesis between actions in the same macrostate.
        for k in range(0, self.i):

            actions_mcrst = sorted(list(self.container[k].keys()),
                                   reverse=True)
            new_mcrst_possible = []
            for act in actions_mcrst:
                new_mcrst = helper.get_mcrst(
                    self.container[k][act]['new_state'], self.intervals,
                    self.sink)

                if new_mcrst not in new_mcrst_possible:
                    new_mcrst_possible.append(new_mcrst)

                # The helper might contain new_mcrst that are not yet included in new_mcrst_possible.
                from_helper = self.arriving_mcrst_helper[act].keys()
                for mcrst in from_helper:
                    if mcrst not in new_mcrst_possible:
                        new_mcrst_possible.append(mcrst)

            for i in range(0, len(actions_mcrst) - 1):
                for k2 in new_mcrst_possible:
                    constraints.append(
                        theta[self.get_id_from_action(actions_mcrst[i])][k2] -
                        theta[self.get_id_from_action(actions_mcrst[
                            i + 1])][k2] <= self.L *
                        abs(actions_mcrst[i] - actions_mcrst[i + 1]))
                    constraints.append(
                        theta[self.get_id_from_action(actions_mcrst[i])][k2] -
                        theta[self.get_id_from_action(actions_mcrst[
                            i + 1])][k2] >= -self.L *
                        abs(actions_mcrst[i] - actions_mcrst[i + 1]))

        problem = cp.Problem(objective, constraints)
        problem.solve(solver=cp.ECOS, abstol=1e-4, max_iters=200)

        return theta.value
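The objective is the negative log-likelihood of the transition counts I under the abstract transition model theta, so the program computes a maximum-likelihood estimate subject to two families of constraints: each row of theta must be a probability distribution, and, for consecutive sampled actions a and a' in the same macrostate, |theta[a][k] - theta[a'][k]| <= L * |a - a'| (the Lipschitz hypothesis, written above as the two one-sided inequalities).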
Example #10
    def fill_I(self):

        matrix_i = np.zeros((self.n_actions, self.i))
        for cont in self.container:

            for act, single_sample in cont.items():
                new_mcrst = helper.get_mcrst(single_sample['new_state'],
                                             self.intervals, self.sink)
                # Assume that all the sampled actions are distinct.
                matrix_i[self.get_id_from_action(act)][new_mcrst] += 1

        # Contribution of the fictitious samples, added once outside the
        # container loop so that each fictitious sample is counted only once.
        for act in self.arriving_mcrst_helper.keys():
            for mcrst in self.arriving_mcrst_helper[act].keys():
                matrix_i[self.get_id_from_action(
                    act)][mcrst] += self.arriving_mcrst_helper[act][mcrst]

        self.I.value = matrix_i
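fill_I builds the count matrix consumed by the objective in Example #9: I[a][k] is the number of samples, real plus fictitious, for which action a led to macrostate k, so -Σ I[a][k] * log theta[a][k] is exactly the negative log-likelihood minimized there.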
Example #11
def sampling_abstract_optimal_pol(abs_opt_policy, det_samples, param):

    fictitious_samples = []
    for sam in det_samples:
        single_sample = []
        for s in sam:

            # Avoid including the sink state in the fictitious samples.
            if s[2] != 0:
                prev_action = deterministic_action(param, s[0])
                mcrst = helper.get_mcrst(s[0], INTERVALS, SINK)
                if prev_action in abs_opt_policy[mcrst]:
                    single_sample.append([s[0], prev_action])
                else:
                    index = np.argmin([
                        abs(act - prev_action) for act in abs_opt_policy[mcrst]
                    ])
                    single_sample.append([s[0], abs_opt_policy[mcrst][index]])

        fictitious_samples.append(single_sample)
    return fictitious_samples
Example #12
    def divide_samples(self, samples, problem, seed, intervals=None):

        if intervals is not None:
            self.intervals = intervals

        # container is an array of dictionaries.
        # Every dict has the actions as keys and another dict as value.
        # That inner dict has 'state' and 'new_state' as keys; 'abs_reward'
        # is filled below and 'abs_tf' later on.
        self.container = self.init_container()
        if self.sink:
            self.container.append({})

        for sam in samples:
            for s in sam:
                # every s is an array with this shape: ['state', 'action', 'reward', 'new_state']
                mcrst = helper.get_mcrst(s[0], self.intervals, self.sink)
                self.container[mcrst][s[1]] = {
                    'state': s[0],
                    'new_state': s[3]
                }

        # Subsample macrostates that contain too many samples, to keep the
        # computation fast.
        helper_obj = Helper(seed)
        self.container = [
            helper_obj.big_mcrst_correction(cont)
            if len(cont) > helper.MAX_SAMPLES_IN_MCRST else cont
            for cont in self.container
        ]

        # calculate the abstract reward for every sample.
        if problem == 'lqg1d':
            reward_func = helper.calc_abs_reward_lqg
        elif problem == 'cartpole1d':
            reward_func = helper.calc_abs_reward_cartpole
        elif problem == 'minigolf':
            reward_func = helper.calc_abs_reward_minigolf
        else:
            # Fail fast on an unsupported problem name instead of raising
            # a NameError at the first use of reward_func.
            raise ValueError("Unsupported problem: {}".format(problem))
        for cont in self.container:
            for act in cont.keys():
                cont[act]['abs_reward'] = reward_func(cont, act)
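A sketch of the container layout this method produces (values are illustrative):

# container[mcrst] maps each sampled action in that macrostate to its data:
# container = [
#     {0.3: {'state': 0.1, 'new_state': 0.5, 'abs_reward': -0.2},
#      0.7: {'state': 0.4, 'new_state': 1.1, 'abs_reward': -0.9}},
#     ...,           # one dict per macrostate
#     {}             # trailing dict for the sink, when sink=True
# ]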