Beispiel #1
0
def VITabular(T, r, V=None, policy=None, **kwargs):
    """Sweep Bellman-style backups over the states of ``T`` until they settle.

    Every sweep visits each ``s`` in ``T`` and overwrites ``V[s]`` in place,
    so later states in the same sweep already see the updated values.
    Without a (truthy) ``policy`` the new value is the maximum over the
    actions ``a`` in ``T[s]`` of ``(scale * r[s][a] + g * V).avg(T[s][a])``;
    with a policy the per-action values are multiplied by ``policy[s]`` and
    summed.

    Args:
        T: Iterated for states; ``T[s]`` is iterated for actions and
            ``T[s][a]`` is handed to ``.avg`` (presumably a distribution over
            successor states -- confirm against ``grl.Storage.avg``).
        r: Rewards, indexed as ``r[s][a]``.
        V: A ``grl.Storage`` to update in place. Anything else is replaced by
            a fresh one-dimensional storage with default 0 keyed by
            ``T.keys()``.
        policy: Optional; indexed as ``policy[s]``.
        **kwargs: ``steps`` (maximum number of sweeps, default ``math.inf``),
            ``normalize`` (scale rewards by ``1 - g``, default ``False``),
            ``g`` (default 0.999) and ``eps`` (stop once the largest change
            within a sweep drops below it, default 1e-6).

    Returns:
        The value storage ``V``.
    """
    remaining = kwargs.get('steps', math.inf)
    discount = kwargs.get('g', 0.999)
    tolerance = kwargs.get('eps', 1e-6)
    reward_scale = 1 - discount if kwargs.get('normalize', False) else 1

    if not isinstance(V, grl.Storage):
        V = grl.Storage(1, default=0, leaf_keys=T.keys())

    def backup(state, action):
        # One-step lookahead value of taking `action` in `state`.
        return (reward_scale * r[state][action] + discount * V).avg(
            T[state][action])

    while remaining:
        largest_change = 0
        for state in T:
            previous = V[state]
            if policy:
                V[state] = (policy[state] * {
                    action: backup(state, action)
                    for action in T[state]
                }).sum()
            else:
                V[state] = max([backup(state, action) for action in T[state]])
            largest_change = max(largest_change, abs(previous - V[state]))
        remaining -= 1
        if largest_change < tolerance:
            break
    return V
Beispiel #2
0
 def oracle(self, h, *args, **kwargs):
     """Return action values that favour the stored optimal action.

     Every action starts at the value found under ``h.stats[<class name>][':)']``
     (0.0 when missing). If ``h`` is at the same time step as the internal
     history, the optimal action for that history's last element gets +1.0;
     if ``h`` is exactly one step ahead, the optimal action for the current
     state of ``self.sm`` gets +1.0. Otherwise the baseline values are
     returned unchanged.
     """
     baseline = h.stats.get(type(self).__name__, dict()).get(':)', 0.0)
     Q = grl.Storage(1, default=baseline, leaf_keys=self.am.action_space)

     if self.sm.hm.h.t == h.t:
         key = self.sm.hm.h[-1]
     elif h.t - self.sm.hm.h.t == 1.0:
         key = self.sm.state
     else:
         # Neither alignment applies: nothing to boost.
         return Q

     Q[self.optimal_actions[key]] += 1.0
     return Q
Beispiel #3
0
 def test_nested_sum(self):
     """``sum()`` on a nested storage adds up all values stored beneath it."""
     store = grl.Storage(dimensions=3, persist=True, default=0)
     entries = (((1, 2, 3), 2), ((1, 1, 1), 5), ((1, 2, 2), 2))
     for (i, j, k), value in entries:
         store[i][j][k] = value

     # A branch that was never written to sums to the default.
     untouched_total = store[2].sum()
     self.assertEqual(untouched_total, 0)
     # 2 + 5 + 2 across both sub-branches of key 1.
     branch_total = store[1].sum()
     self.assertEqual(branch_total, 9)
     # 2 + 2 within the [1][2] branch only.
     leaf_total = store[1][2].sum()
     self.assertEqual(leaf_total, 4)
Beispiel #4
0
 def test_nested_max(self):
     """``max()`` on a nested storage returns the largest value beneath it."""
     store = grl.Storage(dimensions=3, persist=True, default=0)
     entries = (
         ((1, 2, 3), 2),
         ((1, 1, 1), 5),
         ((1, 2, 2), 1),
         ((2, 1, 3), 3),
     )
     for (i, j, k), value in entries:
         store[i][j][k] = value

     second_branch_max = store[2].max()
     self.assertEqual(second_branch_max, 3)
     first_branch_max = store[1].max()
     self.assertEqual(first_branch_max, 5)
     leaf_max = store[1][2].max()
     self.assertEqual(leaf_max, 2)
Beispiel #5
0
 def test_persist_min(self):
     """With persistence on, an explicit -1 is the minimum of its branch.

     NOTE(review): ``default=(0, 1)`` presumably asks for defaults drawn from
     the range [0, 1], which would make -1 the unique minimum -- confirm
     against ``grl.Storage``.
     """
     options = dict(dimensions=3,
                    persist=True,
                    leaf_keys=range(4),
                    default=(0, 1))
     store = grl.Storage(**options)

     # Plain reads around the explicit write; their values are not needed.
     _ = store[1][2][3]
     store[1][2][1] = -1
     _ = store[1][2][2]

     smallest = store[1][2].min()
     self.assertEqual(smallest, -1)
     smallest_key = store[1][2].argmin()
     self.assertEqual(smallest_key, 1)
Beispiel #6
0
 def test_non_persist_max(self):
     """With persistence off, an explicit 5 is the maximum of its branch.

     NOTE(review): ``default=(0, 1)`` presumably asks for defaults drawn from
     the range [0, 1], which would make 5 the unique maximum -- confirm
     against ``grl.Storage``.
     """
     options = dict(dimensions=3,
                    persist=False,
                    leaf_keys=range(4),
                    default=(0, 1))
     store = grl.Storage(**options)

     # Plain reads around the explicit write; their values are not needed.
     _ = store[1][2][3]
     store[1][2][1] = 5
     _ = store[1][2][2]

     largest = store[1][2].max()
     self.assertEqual(largest, 5)
     largest_key = store[1][2].argmax()
     self.assertEqual(largest_key, 1)
Beispiel #7
0
    def oracle(self, h, *args, **kwargs):
        """Return action values with ``1 / (1 - g)`` on the matching action.

        The state is obtained from ``h.extract(self.order)``. State
        ``'s-left'`` sets the value of ``'left'`` and ``'s-right'`` sets the
        value of ``'right'``; every other entry keeps the default 0. The
        discount ``g`` is read from ``kwargs`` (default 0.999).
        """
        discount = kwargs.get('g', 0.999)
        Q = grl.Storage(1, default=0, leaf_keys=self.am.action_space)
        state = h.extract(self.order)

        # Checked in order; at most one pairing can apply.
        for label, action in (('s-left', 'left'), ('s-right', 'right')):
            if state == label:
                Q[action] = 1 / (1 - discount)
                break

        return Q
Beispiel #8
0
 def oracle(self, h, *args, **kwargs):
     """Return action values derived from ``theta``, ``C0`` and the state.

     All actions start at ``theta / (1 - theta)``. The state is the last
     element of the internal history when ``h`` is at the same time step, or
     the current state of ``self.sm`` when ``h`` is exactly one step ahead.
     In state 0 every value is scaled by ``self.C0`` and the entry of
     ``self.optimal_actions[0]`` is divided by ``theta``; in any other state
     only the entry of ``self.optimal_actions[1]`` is divided by ``theta``.
     """
     baseline = self.theta / (1 - self.theta)
     Q = grl.Storage(1, default=baseline, leaf_keys=self.am.action_space)

     if self.sm.hm.h.t == h.t:
         state = self.sm.hm.h[-1]
     elif h.t - self.sm.hm.h.t == 1.0:
         state = self.sm.state
     # NOTE(review): when neither condition holds, `state` is never assigned
     # and the comparison below raises UnboundLocalError -- confirm whether
     # callers can reach this case.

     if state == 0:
         Q *= self.C0
         favoured = self.optimal_actions[0]
     else:
         favoured = self.optimal_actions[1]
     Q[favoured] /= self.theta
     return Q
Beispiel #9
0
 def restricted_action_space(self, b_vector, b):
     """Return a mask over actions compatible with the bit prefix.

     The prefix key is the concatenation of ``str(x)`` for every ``x`` in
     ``b_vector`` followed by ``str(b)``. The mask is a one-dimensional
     ``grl.Storage`` over ``self.domain.am.action_space`` with default
     ``-inf``; an action ``a`` from ``self.ext_actions`` is set to 0.0 when
     at least one bit sequence from ``self.binary_func(a)`` starts with the
     prefix key.

     Results are memoised in ``self.restrict_A_cache`` under the prefix key.

     Args:
         b_vector: Iterable of already fixed bits (anything ``str``-able).
         b: The next bit to append to the prefix.

     Returns:
         The cached or freshly computed mask for the prefix.
     """
     b_key = ''.join(str(x) for x in b_vector) + str(b)

     # Use a membership test rather than the truthiness of the cached value:
     # a cached mask that evaluates as falsy (e.g. one with no explicitly
     # set entries) would otherwise be recomputed on every call.
     if b_key in self.restrict_A_cache:
         return self.restrict_A_cache[b_key]

     # expensive computation!
     A = grl.Storage(1,
                     default=-math.inf,
                     leaf_keys=self.domain.am.action_space)
     for a in self.ext_actions:
         # One matching encoding is enough to make the action available.
         if any(''.join(str(x) for x in bits).startswith(b_key)
                for bits in self.binary_func(a)):
             A[a] = 0.0

     # cache the computation
     self.restrict_A_cache[b_key] = A
     return A
Beispiel #10
0
    def oracle(self, h, *args, **kwargs):
        """Return values for the binary actions 0 and 1.

        The wrapped domain's oracle is queried with the discount ``g**self.d``
        (``g`` comes from ``kwargs``, default 0.999; ``kwargs['g']`` is
        overwritten before the call). Before the query, the surplus
        ``self.d * self.hm_ae.h.t - h.t`` is dropped from ``self.hm_ae``;
        the dropped part is recorded back afterwards.

        For each bit, the domain's values are added to the mask from
        ``self.restricted_action_space(self.b, bit)`` and the maximum is
        taken. The result is scaled by ``g**(self.d - self.sm.state - 1)``.
        """
        g = kwargs.get('g', 0.999)
        kwargs['g'] = g**self.d

        # TODO: assert the binary history h is the transformation of h_ae
        surplus = self.d * self.hm_ae.h.t - h.t
        assert surplus >= 0.0

        # Temporarily rewind the wrapped history, query, then restore it.
        removed = self.hm_ae.drop(surplus)
        q = self.domain.oracle(self.hm_ae.h, *args, **kwargs)
        self.hm_ae.record(removed)

        q_bin = grl.Storage(1, default=0, leaf_keys=[0, 1])
        # "weird" masking of the unavailable actions:
        # adding the mask moves their action-values to -inf
        for bit in (0, 1):
            q_bin[bit] = (q + self.restricted_action_space(self.b, bit)).max()

        return g**(self.d - self.sm.state - 1) * q_bin
Beispiel #11
0
 def test_persist_len(self):
     """With persistence on, keys that were only read still count in ``len``."""
     store = grl.Storage(dimensions=3, persist=True, default=(0, 1))

     # Two plain reads and one write touch three distinct leaf keys.
     _ = store[1][2][3]
     store[1][2][1] = 5
     _ = store[1][2][2]

     touched = len(store[1][2])
     self.assertEqual(touched, 3)
Beispiel #12
0
 def test_non_persist_storage(self):
     """An explicitly written value can be read back without persistence."""
     store = grl.Storage(dimensions=3, persist=False)
     expected = 4
     store[1][2][3] = expected
     self.assertEqual(store[1][2][3], expected)
Beispiel #13
0
 def test_persist_access(self):
     """With persistence on, reading the same key twice gives equal values."""
     store = grl.Storage(dimensions=2, persist=True, default=(0, 1))
     first_read = store[1][2]
     second_read = store[1][2]
     self.assertEqual(first_read, second_read)
Beispiel #14
0
 def test_non_persist_len(self):
     """Without persistence, only explicitly written keys count in ``len``."""
     store = grl.Storage(dimensions=3, persist=False)

     # The two plain reads must not leave entries behind; the write does.
     _ = store[1][2][3]
     store[1][2][1] = 5
     _ = store[1][2][2]

     written = len(store[1][2])
     self.assertEqual(written, 1)
Beispiel #15
0
import grl
from examples import SlipperyHill

# Experiment set-up: sweeps over `eps_vect` and `p_h_vect` and fills entries
# of `p`. This excerpt appears truncated: `r`, `pmin`-derived `normalizer`,
# `v_org`, `v_base`, `r_f_0` and `r_f_1` are assigned but not used within
# the visible lines.

# Three-level storages with default 0.0 and leaf keys 0 and 1.
# NOTE(review): presumably indexed as [state][action][next_state] -- confirm.
p = grl.Storage(3, default=0.0, leaf_keys=[0, 1])
r = grl.Storage(3, default=0.0, leaf_keys=[0, 1])

pmin = 0.001
# Values iterated by the outer and inner loops below.
eps_vect = [0.0, -0.00003162277, -0.00001, -0.000003162277, -0.000001]
p_h_vect = [0.5, 0.001, 0.999]

C1 = 1
C0 = 0.001
theta = 0.999

# Constants derived from theta, C0, C1 and pmin.
odd_factor = theta * (C1 - C0) / (1 - theta)
normalizer = (2 * odd_factor - pmin) + 1 / pmin

# Hard-coded values keyed by 0 and 1; their origin is not shown here.
v_org = {0: 333.1106224801706, 1: 333.4440674605935}
for eps in eps_vect:
    v_base = None
    print('Evaluating eps={}'.format(eps))
    for p_h in p_h_vect:

        # Both terms shrink linearly with p_h; the trailing comments keep
        # an earlier variant of each formula.
        r_f_0 = odd_factor - p_h * odd_factor  #0 - p_h * odd_factor
        r_f_1 = 2 * odd_factor - p_h * odd_factor  #odd_factor - p_h * odd_factor

        # For key 0: 'up' reaches leaf 1 with weight p_h and leaf 0 with the
        # complement; 'down' and 'stay' put weight 1.0 on leaf 0.
        p[0]['up'][1] = p_h
        p[0]['up'][0] = 1.0 - p[0]['up'][1]
        p[0]['down'][0] = 1.0
        p[0]['stay'][0] = 1.0
Beispiel #16
0
import grl
from examples import SlipperyHill

# Experiment set-up: builds a reward table, reduces it to one value per
# first-level key and then sweeps over `eps_vect`. This excerpt appears
# truncated: `p`, `r`, `pmin`, `p_h_vect`, `theta` and `v_base` are assigned
# but not used within the visible lines.

# Three-level storages with default 0.0 and leaf keys 0 and 1.
# NOTE(review): presumably indexed as [state][action][next_state] -- confirm.
p = grl.Storage(3, default=0.0, leaf_keys=[0, 1])
r = grl.Storage(3, default=0.0, leaf_keys=[0, 1])

pmin = 0.001
eps_vect = [0.0, -0.00003162277, -0.00001, -0.000003162277, -0.000001]
p_h_vect = [0.5, 0.001, 0.999]

# Two-level reward table whose leaves are the three action names.
r_sa = grl.Storage(2, default=0.0, leaf_keys=['up', 'stay', 'down'])

theta = 0.999

r_sa[0]['up'] = 0.5
r_sa[0]['stay'] = 0.0
r_sa[0]['down'] = 0.0

r_sa[1]['up'] = 0.0
r_sa[1]['stay'] = 1.0
r_sa[1]['down'] = 0.0

# The keyword is `leaf_keys`, as in every other Storage construction here;
# the previous spelling `leaf_key` did not match it.
r_s = grl.Storage(1, default=0.0, leaf_keys=[0, 1])

# Largest entry of each row of the reward table.
r_s[0] = r_sa[0].max()
r_s[1] = r_sa[1].max()


for eps in eps_vect:
    v_base = None
    print('Evaluating eps={}'.format(eps))