def VITabular(T, r, V=None, policy=None, **kwargs):
    """Repeatedly sweep the states of ``T``, updating ``V`` until it settles.

    In every sweep each state ``s`` in ``T`` gets a new value built from the
    per-action terms ``(norm_factor * r[s][a] + g * V).avg(T[s][a])``:

    * without a policy, ``V[s]`` becomes the largest of those terms;
    * with a policy, ``V[s]`` becomes ``(policy[s] * {a: term}).sum()``.

    ``V`` is updated in place, so states visited later in a sweep already see
    the values refreshed earlier in the same sweep.

    Args:
        T: indexed as ``T[s][a]``; the inner value is what ``avg`` receives
            (presumably transition weights -- confirm against ``grl.Storage``).
        r: rewards indexed as ``r[s][a]``.
        V: starting values. Anything that is not a ``grl.Storage`` is replaced
            by a fresh zero-default storage keyed by ``T.keys()``.
        policy: optional per-state action weights; a falsy value selects the
            max-over-actions update.
        **kwargs: ``steps`` (maximum number of sweeps, default ``math.inf``),
            ``normalize`` (scale rewards by ``1 - g``, default ``False``),
            ``g`` (default ``0.999``) and ``eps`` (stop once the largest
            per-state change in a sweep drops below it, default ``1e-6``).

    Returns:
        The storage ``V`` after the last sweep.
    """
    sweeps_left = kwargs.get('steps', math.inf)
    discount = kwargs.get('g', 0.999)
    tolerance = kwargs.get('eps', 1e-6)
    reward_scale = (1 - discount) if kwargs.get('normalize', False) else 1

    if not isinstance(V, grl.Storage):
        V = grl.Storage(1, default=0, leaf_keys=T.keys())

    while sweeps_left:
        largest_change = 0
        for s in T:
            previous = V[s]
            # One backed-up value per action available in this state.
            backups = {
                a: (reward_scale * r[s][a] + discount * V).avg(T[s][a])
                for a in T[s]
            }
            if policy:
                V[s] = (policy[s] * backups).sum()
            else:
                V[s] = max(backups.values())
            largest_change = max(largest_change, abs(previous - V[s]))
        if largest_change < tolerance:
            break
        sweeps_left -= 1
    return V
def oracle(self, h, *args, **kwargs):
    """Return action values with a ``+1.0`` bonus on the optimal action.

    Every action in ``self.am.action_space`` starts at the value stored under
    ``h.stats[<class name>][':)']`` (``0.0`` when either key is missing).
    The entry selected through ``self.optimal_actions`` is then raised by
    ``1.0``:

    * if ``h`` has the same ``t`` as the tracked history ``self.sm.hm.h``,
      the lookup key is the last element of the tracked history;
    * if ``h`` is exactly one step ahead, the lookup key is ``self.sm.state``;
    * otherwise no entry is raised.
    """
    baseline = h.stats.get(type(self).__name__, {}).get(':)', 0.0)
    Q = grl.Storage(1, default=baseline, leaf_keys=self.am.action_space)

    tracked = self.sm.hm.h
    if tracked.t == h.t:
        reference = tracked[-1]
    elif h.t - tracked.t == 1.0:
        reference = self.sm.state
    else:
        # The two histories are not comparable: leave the baseline untouched.
        return Q

    Q[self.optimal_actions[reference]] += 1.0
    return Q
def test_nested_sum(self):
    """``sum`` on a sub-storage adds up the leaves written beneath it."""
    store = grl.Storage(dimensions=3, persist=True, default=0)
    writes = (
        ((1, 2, 3), 2),
        ((1, 1, 1), 5),
        ((1, 2, 2), 2),
    )
    for (i, j, k), value in writes:
        store[i][j][k] = value

    # Nothing was written under top-level key 2.
    self.assertEqual(store[2].sum(), 0)
    # 2 + 5 + 2 over everything under key 1.
    self.assertEqual(store[1].sum(), 9)
    # 2 + 2 for the two leaves under [1][2].
    self.assertEqual(store[1][2].sum(), 4)
def test_nested_max(self):
    """``max`` on a sub-storage returns the largest leaf written beneath it."""
    store = grl.Storage(dimensions=3, persist=True, default=0)
    writes = (
        ((1, 2, 3), 2),
        ((1, 1, 1), 5),
        ((1, 2, 2), 1),
        ((2, 1, 3), 3),
    )
    for (i, j, k), value in writes:
        store[i][j][k] = value

    # Only one leaf (value 3) lives under top-level key 2.
    self.assertEqual(store[2].max(), 3)
    # The largest of 2, 5 and 1 under key 1.
    self.assertEqual(store[1].max(), 5)
    # The larger of the two leaves under [1][2].
    self.assertEqual(store[1][2].max(), 2)
def test_persist_min(self):
    """``min``/``argmin`` find the explicitly written ``-1`` at leaf key 1.

    The storage is persistent with leaf keys ``range(4)`` and
    ``default=(0, 1)`` (presumably a range that defaults are drawn from --
    confirm against ``grl.Storage``).  Two leaves are only read; one is
    assigned ``-1``, which the test expects to be the minimum.
    """
    store = grl.Storage(
        dimensions=3,
        persist=True,
        leaf_keys=range(4),
        default=(0, 1),
    )
    _ = store[1][2][3]   # bare read, no assignment
    store[1][2][1] = -1
    _ = store[1][2][2]   # bare read, no assignment

    self.assertEqual(store[1][2].min(), -1)
    self.assertEqual(store[1][2].argmin(), 1)
def test_non_persist_max(self):
    """``max``/``argmax`` find the explicitly written ``5`` at leaf key 1.

    Same scenario as the persistent case, but with ``persist=False``: two
    leaves are only read, one is assigned ``5``, and the test expects that
    assignment to be the maximum.
    """
    store = grl.Storage(
        dimensions=3,
        persist=False,
        leaf_keys=range(4),
        default=(0, 1),
    )
    _ = store[1][2][3]   # bare read, no assignment
    store[1][2][1] = 5
    _ = store[1][2][2]   # bare read, no assignment

    self.assertEqual(store[1][2].max(), 5)
    self.assertEqual(store[1][2].argmax(), 1)
def oracle(self, h, *args, **kwargs):
    """Return action values that favour the action matching the state.

    The state is ``h.extract(self.order)``.  All actions start at ``0``; in
    state ``'s-left'`` the action ``'left'`` is set to ``1 / (1 - g)``, in
    state ``'s-right'`` the action ``'right'`` gets that value, and in any
    other state the all-zero storage is returned.

    ``g`` is read from ``kwargs`` (default ``0.999``).
    """
    g = kwargs.get('g', 0.999)
    Q = grl.Storage(1, default=0, leaf_keys=self.am.action_space)

    state = h.extract(self.order)
    if state == 's-left':
        favoured = 'left'
    elif state == 's-right':
        favoured = 'right'
    else:
        return Q

    # Only evaluated once a favoured action exists, exactly as before.
    Q[favoured] = 1 / (1 - g)
    return Q
def oracle(self, h, *args, **kwargs):
    """Return action values for the state that ``h`` corresponds to.

    The state is taken from the tracked history ``self.sm.hm.h``:

    * if ``h`` has the same ``t``, the state is the tracked history's last
      element;
    * if ``h`` is exactly one step ahead, the state is ``self.sm.state``.

    Every action starts at ``theta / (1 - theta)``.  In state ``0`` all
    values are scaled by ``self.C0`` and the entry of
    ``self.optimal_actions[0]`` is divided by ``theta``; in any other state
    only the entry of ``self.optimal_actions[1]`` is divided by ``theta``.

    Raises:
        ValueError: if ``h`` is neither aligned with nor one step ahead of
            the tracked history.  Previously this case fell through to an
            ``UnboundLocalError`` on the state variable.
    """
    tracked = self.sm.hm.h
    if tracked.t == h.t:
        s = tracked[-1]
    elif h.t - tracked.t == 1.0:
        s = self.sm.state
    else:
        # Fail loudly and early: there is no state to evaluate for this h.
        raise ValueError(
            'history at t={} is neither aligned with nor one step ahead of '
            'the tracked history at t={}'.format(h.t, tracked.t)
        )

    Q = grl.Storage(1, default=self.theta / (1 - self.theta),
                    leaf_keys=self.am.action_space)
    if s == 0:
        Q *= self.C0
        Q[self.optimal_actions[0]] /= self.theta
    else:
        Q[self.optimal_actions[1]] /= self.theta
    return Q
def restricted_action_space(self, b_vector, b):
    """Return a per-action mask for the binary prefix ``b_vector + [b]``.

    The prefix key is the concatenation of ``str(x)`` for every ``x`` in
    ``b_vector`` followed by ``str(b)``.  The result is a storage over
    ``self.domain.am.action_space`` whose default is ``-inf``; each action
    ``a`` in ``self.ext_actions`` for which at least one bit sequence from
    ``self.binary_func(a)`` starts with the prefix is set to ``0.0``.

    Results are memoised in ``self.restrict_A_cache`` under the prefix key.
    The lookup checks for ``None`` rather than truthiness, so a cached result
    that happens to be falsy (for example an empty storage) is still reused
    instead of being recomputed on every call.
    """
    b_key = ''.join(str(x) for x in b_vector) + str(b)

    cached = self.restrict_A_cache.get(b_key)
    if cached is not None:
        return cached

    # expensive computation!
    A = grl.Storage(1, default=-math.inf,
                    leaf_keys=self.domain.am.action_space)
    for a in self.ext_actions:
        # One matching encoding is enough; stop scanning at the first hit.
        if any(
            ''.join(str(x) for x in bits).startswith(b_key)
            for bits in self.binary_func(a)
        ):
            A[a] = 0.0

    # cache the computation
    self.restrict_A_cache[b_key] = A
    return A
def oracle(self, h, *args, **kwargs):
    """Return values for the two binary actions ``0`` and ``1``.

    Steps:

    1. ``g`` (default ``0.999``) is raised to the power ``self.d`` and
       passed on to the wrapped domain as its ``g``.
    2. ``self.hm_ae`` is temporarily shortened by
       ``self.d * self.hm_ae.h.t - h.t`` entries, the wrapped domain's
       oracle is queried on it, and the dropped part is recorded back.  The
       restore happens in a ``finally`` clause so the history is put back
       even if the wrapped oracle raises.
    3. For each bit, the wrapped values are masked with
       ``self.restricted_action_space(self.b, bit)`` and the maximum is
       taken.
    4. The result is scaled by ``g ** (self.d - self.sm.state - 1)``.
    """
    g = kwargs.get('g', 0.999)
    g_org = g ** self.d
    kwargs['g'] = g_org

    # TODO: assert the binary history h is the transformation of h_ae
    diff = self.d * self.hm_ae.h.t - h.t
    assert diff >= 0.0

    dropped_h = self.hm_ae.drop(diff)
    try:
        q = self.domain.oracle(self.hm_ae.h, *args, **kwargs)
    finally:
        # Always restore what was dropped, also on failure of the oracle.
        self.hm_ae.record(dropped_h)

    q_bin = grl.Storage(1, default=0, leaf_keys=[0, 1])
    # "weird" masking of the unavailable actions:
    # moves the action-values of the unavailable actions to -inf
    q_bin[0] = (q + self.restricted_action_space(self.b, 0)).max()
    q_bin[1] = (q + self.restricted_action_space(self.b, 1)).max()

    q_bin = g ** (self.d - self.sm.state - 1) * q_bin
    return q_bin
def test_persist_len(self):
    """With ``persist=True`` the test expects reads to count towards ``len``.

    Two leaves under ``[1][2]`` are only read and one is assigned; the
    expected length of ``[1][2]`` is 3.
    """
    store = grl.Storage(dimensions=3, persist=True, default=(0, 1))
    _ = store[1][2][3]   # bare read, no assignment
    store[1][2][1] = 5
    _ = store[1][2][2]   # bare read, no assignment

    size = len(store[1][2])
    self.assertEqual(size, 3)
def test_non_persist_storage(self):
    """A value assigned to a non-persistent storage can be read back."""
    store = grl.Storage(dimensions=3, persist=False)
    store[1][2][3] = 4

    read_back = store[1][2][3]
    self.assertEqual(read_back, 4)
def test_persist_access(self):
    """Two reads of the same never-assigned key compare equal.

    The storage is persistent with ``default=(0, 1)``; the test expects the
    second read of ``[1][2]`` to equal the first.
    """
    store = grl.Storage(dimensions=2, persist=True, default=(0, 1))

    first_read = store[1][2]
    second_read = store[1][2]
    self.assertEqual(first_read, second_read)
def test_non_persist_len(self):
    """With ``persist=False`` the test expects only writes to count.

    Two leaves under ``[1][2]`` are only read and one is assigned; the
    expected length of ``[1][2]`` is 1.
    """
    store = grl.Storage(dimensions=3, persist=False)
    _ = store[1][2][3]   # bare read, no assignment
    store[1][2][1] = 5
    _ = store[1][2][2]   # bare read, no assignment

    size = len(store[1][2])
    self.assertEqual(size, 1)
import grl
from examples import SlipperyHill

# Three-level storages indexed as [state][action][next_state], leaves 0 and 1.
p = grl.Storage(3, default=0.0, leaf_keys=[0, 1])
r = grl.Storage(3, default=0.0, leaf_keys=[0, 1])

# Sweep grids.
pmin = 0.001
eps_vect = [0.0, -3.162277e-05, -1e-05, -3.162277e-06, -1e-06]
p_h_vect = [0.5, 0.001, 0.999]

# Constants from which the reward terms below are derived.
C1 = 1
C0 = 0.001
theta = 0.999
odd_factor = theta * (C1 - C0) / (1 - theta)
normalizer = (2 * odd_factor - pmin) + 1 / pmin
v_org = {
    0: 333.1106224801706,
    1: 333.4440674605935,
}

for eps in eps_vect:
    v_base = None
    print('Evaluating eps={}'.format(eps))

    for p_h in p_h_vect:
        # Earlier variant of r_f_0: 0 - p_h * odd_factor
        r_f_0 = odd_factor - p_h * odd_factor
        # Earlier variant of r_f_1: odd_factor - p_h * odd_factor
        r_f_1 = 2 * odd_factor - p_h * odd_factor

        # From state 0, 'up' reaches 1 with weight p_h and stays in 0 with
        # the remaining weight; 'down' and 'stay' always lead to 0.
        p[0]['up'][1] = p_h
        p[0]['up'][0] = 1.0 - p[0]['up'][1]
        p[0]['down'][0] = 1.0
        p[0]['stay'][0] = 1.0
import grl
from examples import SlipperyHill

# Three-level storages indexed with leaves 0 and 1.
p = grl.Storage(3, default=0.0, leaf_keys=[0, 1])
r = grl.Storage(3, default=0.0, leaf_keys=[0, 1])

# Sweep grids.
pmin = 0.001
eps_vect = [0.0, -0.00003162277, -0.00001, -0.000003162277, -0.000001]
p_h_vect = [0.5, 0.001, 0.999]

# Reward table indexed as r_sa[state][action].
r_sa = grl.Storage(2, default=0.0, leaf_keys=['up', 'stay', 'down'])
theta = 0.999
r_sa[0]['up'] = 0.5
r_sa[0]['stay'] = 0.0
r_sa[0]['down'] = 0.0
r_sa[1]['up'] = 0.0
r_sa[1]['stay'] = 1.0
r_sa[1]['down'] = 0.0

# Best reward per state.  The keyword is ``leaf_keys`` (it was misspelled
# ``leaf_key`` here), matching every other ``grl.Storage`` construction.
r_s = grl.Storage(1, default=0.0, leaf_keys=[0, 1])
r_s[0] = r_sa[0].max()
r_s[1] = r_sa[1].max()

for eps in eps_vect:
    v_base = None
    print('Evaluating eps={}'.format(eps))