def __init__(self, pop_size=10):
    """Set up a task with a single state and two actions.

    The transition table is all zeros and only action 1 has a non-zero
    reward probability (0.1).

    Parameters
    ----------
    pop_size : int
        Forwarded unchanged to ``DPNetPop.__init__``.
    """
    n_states, n_actions = 1, 2

    # transition_cpt(s,a,s') = p(s'|a,s)
    # 0 = Left, 1 = Right
    transition_cpt = np.zeros((n_states, n_actions, n_states), dtype=float)

    # reward_cpt(s,a,r) = p(r|s,a)
    # 0 = False, 1 = True
    reward_cpt = np.zeros((n_states, n_actions))
    reward_cpt[0, 1] = .1

    DPNetPop.__init__(self, transition_cpt, reward_cpt, pop_size)
def __init__(self, pop_size=1, set_W=True):
    """Set up a three-state, two-action task.

    From state 0, action 0 leads to state 1 and action 1 leads to state 2.
    States 1 and 2 have no outgoing transitions.

    Parameters
    ----------
    pop_size : int
        Forwarded unchanged to ``DPNetPop.__init__``.
    set_W : bool
        Forwarded unchanged to ``DPNetPop.__init__`` as keyword ``set_W``.
    """
    # transition_cpt(s,a,s') = p(s'|a,s)
    # 0 = Left, 1 = Right
    transition_cpt = np.zeros((3, 2, 3), dtype=float)
    # (action, successor) pairs available in state 0
    for action, successor in ((0, 1), (1, 2)):
        transition_cpt[0, action, successor] = 1

    # reward_cpt(s,a,r) = p(r|s,a)
    # 0 = False, 1 = True
    reward_cpt = np.zeros((3, 2))
    reward_cpt[1] = [1, 0]
    reward_cpt[2] = [.5, .75]

    DPNetPop.__init__(self, transition_cpt, reward_cpt, pop_size,
                      set_W=set_W)
def __init__(self, pop_size=1, steps=3, set_W=True):
    """Load (or compute and cache) the probability tables, then init the net.

    The state space has ``num_p * num_v`` (16 * 33) states and three
    actions.  The transition table and the reward sizes are read from
    ``cpt.npz`` in the current working directory when that succeeds;
    otherwise they are computed from scratch and written to that file.

    Parameters
    ----------
    pop_size : int
        Forwarded unchanged to ``DPNetPop.__init__``.
    steps : int
        Stored as ``self.steps`` and passed to ``cf.trapz2d`` when the
        tables have to be computed.
    set_W : bool
        Forwarded unchanged to ``DPNetPop.__init__``.

    Notes
    -----
    NOTE(review): the cache file name does not depend on ``steps``, so a
    ``cpt.npz`` written with a different ``steps`` value is reused as is.
    NOTE(review): ``self.invKp`` / ``self.invKv`` are only assigned when the
    tables are rebuilt, not when they are loaded from the cache -- confirm
    that no other code relies on them after a cache hit.
    """
    # reward_cpt(s,a,r) = p(r|s,a)
    self.num_p = num_p = 16
    self.num_v = num_v = 33
    num_s = num_p * num_v
    reward_cpt = np.ones((num_s, 3))
    self.steps = steps
    try:
        tmp = np.load('cpt.npz')
        transition_cpt = tmp['transition_cpt']
        reward_sizes = tmp['reward_sizes']
        print('CPTs loaded')
        stdout.flush()
    except Exception:
        # Any failure to read the cache (missing file, missing key, corrupt
        # archive) triggers a rebuild.  ``Exception`` rather than a bare
        # ``except`` so that KeyboardInterrupt / SystemExit still propagate.
        print('Creating CPTs')
        stdout.flush()
        # transition_cpt(s,a,s') = p(s'|a,s)
        # 0 = reverse (-1), 1 = no throttle (0), 2 = forward (1)
        transition_cpt = np.ones((num_s, 3, num_s), dtype=float) / 3.
        reward_sizes = np.zeros((num_s, 3))

        def pos(s):
            return self.pos(s)

        def vel(s):
            return self.vel(s)

        # Overlap integral of two basis functions (the ``self.phi`` method)
        # whose centres are j apart; ``quad`` returns (value, abs. error).
        overlap = np.array([quad(lambda p: self.phi(p) * self.phi(j - p),
                                 - np.inf, np.inf, epsabs=1e-20)
                            for j in range(max(num_p, num_v))])
        # Values smaller than their own error estimate are set to zero.
        overlap[overlap[:, 0] < overlap[:, 1], 0] = 0
        overlap = overlap[:, 0]
        # The first kernel uses the wrap-around distance with period num_p;
        # the second one uses the plain distance.
        Kp = np.array([[overlap[min(abs(i - j), num_p - abs(i - j))]
                        for i in range(num_p)] for j in range(num_p)])
        Kv = np.array([[overlap[abs(i - j)]
                        for i in range(num_v)] for j in range(num_v)])
        self.invKp = invKp = np.linalg.inv(Kp)
        self.invKv = invKv = np.linalg.inv(Kv)

        def phi(i):
            # Gaussian density with sigma = 0.5, truncated to |i| <= 5.
            sigma = .5
            return 0 if abs(i) > 5 else \
                np.exp(-i**2 / (2 * sigma**2)) / (2 * np.pi)**(1. / 2) / sigma

        for i in xrange(num_s):
            for a in xrange(3):
                integral = [cf.trapz2d(i, n, a, steps=steps)
                            for n in xrange(num_s)]
                for k in xrange(num_s):
                    transition_cpt[i, a, k] = np.sum(
                        [invKp[pos(k), pos(n)] * invKv[vel(k), vel(n)] *
                         integral[n] for n in xrange(num_s)])
            # The reward size does not depend on the action, so it is
            # computed once per state and written to all three actions.
            reward_sizes[i] = dblquad(
                lambda v, p: phi(pos(i) - p) * phi(vel(i) - v) *
                cf.get_R(p, v),
                pos(i) - 5, pos(i) + 5,
                lambda _p: vel(i) - 5, lambda _p: vel(i) + 5,
                epsabs=1e-6)[0]
        np.savez_compressed('cpt.npz', reward_sizes=reward_sizes,
                            transition_cpt=transition_cpt)
    DPNetPop.__init__(self, transition_cpt, reward_cpt,
                      pop_size, reward_sizes, set_W)
def __init__(self, n_actions=2, pop_size=1, set_W=True):
    """Set up a 10-state chain in which only action 1 advances.

    Action 1 moves state ``s`` to ``s + 1`` (for ``s`` in 0..8); every
    other action leads to state 0.  Action 1 in state 9 has no successor
    and is the only rewarded state/action pair.

    Parameters
    ----------
    n_actions : int
        Number of actions; action index 1 must exist.
    pop_size : int
        Forwarded unchanged to ``DPNetPop.__init__``.
    set_W : bool
        Forwarded unchanged to ``DPNetPop.__init__`` as keyword ``set_W``.
    """
    n_states = 10

    # transition_cpt(s,a,s') = p(s'|a,s)
    transition_cpt = np.zeros((n_states, n_actions, n_states), dtype=float)
    # by default every action leads back to state 0 ...
    transition_cpt[:, :, 0] = 1
    # ... except action 1, which steps one state forward instead
    transition_cpt[:, 1, 0] = 0
    chain = np.arange(n_states - 1)
    transition_cpt[chain, 1, chain + 1] = 1

    # reward_cpt(s,a,r) = p(r|s,a)
    # 0 = False, 1 = True
    reward_cpt = np.zeros((n_states, n_actions))
    reward_cpt[n_states - 1, 1] = 1

    DPNetPop.__init__(self, transition_cpt, reward_cpt, pop_size,
                      set_W=set_W)
def __init__(self, pop_size=10, set_W=True):
    """Build the tables for the 6 x 7 grid task with three binary flags.

    A state combines a grid cell ``[y, x]`` with three flags.  The chosen
    direction is executed with probability 0.9 and each neighbouring
    direction (index -1 / +1 modulo 4) with probability 0.05.  A move that
    would leave the grid or hit a wall keeps the state unchanged.  Entering
    cell [0, 2], [5, 0] or [4, 6] switches on flag 0, 1 or 2 respectively.
    In cell [0, 6] the reward size equals the number of flags that are set
    and all outgoing transitions are zeroed.

    Parameters
    ----------
    pop_size : int
        Forwarded unchanged to ``DPNetPop.__init__``.
    set_W : bool
        Forwarded unchanged to ``DPNetPop.__init__``.
    """
    # transition_cpt(s,a,s') = p(s'|a,s)
    # 0 = North, 1 = East, 2=South, 3=West
    transition_cpt = np.zeros((264, 4, 264), dtype=float)
    slip = [.05, .9, .05]
    # entering flag_cells[k] switches on flag k
    flag_cells = [[0, 2], [5, 0], [4, 6]]
    flag_combos = [(f0, f1, f2)
                   for f0 in [0, 1] for f1 in [0, 1] for f2 in [0, 1]]

    for y in range(6):
        for x in range(7):
            for a in range(4):
                if self.get_state_nr([y, x], [0, 0, 0]) < 0:
                    continue  # wall
                for offset, prob in enumerate(slip):
                    b = np.mod(a + offset - 1, 4)
                    npos = self.get_next_pos([y, x], b)
                    blocked = (np.min(npos) < 0
                               or np.max(npos - [5, 6]) > 0
                               or self.get_state_nr(npos, [0, 0, 0]) < 0)
                    if blocked:
                        # walk against wall: stay put, whatever the flags
                        for s in self.get_state_nr([y, x]):
                            transition_cpt[s, a, s] += prob
                        continue
                    npos = list(npos)
                    for flags in flag_combos:
                        new_flags = list(flags)
                        if npos in flag_cells:
                            new_flags[flag_cells.index(npos)] = 1
                        src = self.get_state_nr([y, x], list(flags))
                        dst = self.get_state_nr(npos, new_flags)
                        transition_cpt[src, a, dst] += prob

    # reward_cpt(s,a,r) = p(r|s,a)
    # 0 = False, 1 = True
    reward_cpt = np.ones((264, 4))
    reward_sizes = np.zeros((264, 4))
    for flags in flag_combos:
        goal = self.get_state_nr([0, 6], list(flags))
        reward_sizes[goal] = sum(flags)
        transition_cpt[goal] *= 0

    DPNetPop.__init__(self, transition_cpt, reward_cpt,
                      pop_size, reward_sizes, set_W)
def __init__(self, n_actions=2, pop_size=1, set_W=True):
    """Create the tables for a 10-state chain and initialise the network.

    Taking action 1 in state ``s`` (0..8) leads to ``s + 1``; any other
    action leads to state 0.  State 9 has no successor under action 1, and
    that state/action pair is the only one with reward probability 1.

    Parameters
    ----------
    n_actions : int
        Number of actions; action index 1 must exist.
    pop_size : int
        Forwarded unchanged to ``DPNetPop.__init__``.
    set_W : bool
        Forwarded unchanged to ``DPNetPop.__init__`` as keyword ``set_W``.
    """
    # transition_cpt(s,a,s') = p(s'|a,s)
    transition_cpt = np.zeros((10, n_actions, 10), dtype=float)
    transition_cpt[..., 0] = 1   # all actions reset to state 0 ...
    transition_cpt[:, 1, 0] = 0  # ... apart from action 1
    for state, successor in zip(range(9), range(1, 10)):
        transition_cpt[state, 1, successor] = 1

    # reward_cpt(s,a,r) = p(r|s,a)
    # 0 = False, 1 = True
    reward_cpt = np.zeros((10, n_actions))
    reward_cpt[9, 1] = 1

    DPNetPop.__init__(self, transition_cpt, reward_cpt, pop_size,
                      set_W=set_W)
def __init__(self, pop_size=1, set_W=True):
    """Set up a 10-state, two-action chain with two reset points.

    Action 1 moves state ``s`` to ``s + 1`` (for ``s`` in 0..8).  Action 0
    sends states 0-5 to state 0 and states 6-9 to state 6.  Action 1 has
    reward probability 1 in states 5 and 9.

    Parameters
    ----------
    pop_size : int
        Forwarded unchanged to ``DPNetPop.__init__``.
    set_W : bool
        Forwarded unchanged to ``DPNetPop.__init__`` as keyword ``set_W``.
    """
    # transition_cpt(s,a,s') = p(s'|a,s)
    transition_cpt = np.zeros((10, 2, 10), dtype=float)
    forward = np.arange(9)
    transition_cpt[forward, 1, forward + 1] = 1
    transition_cpt[:6, 0, 0] = 1   # first segment falls back to state 0
    transition_cpt[6:, 0, 6] = 1   # second segment falls back to state 6

    # reward_cpt(s,a,r) = p(r|s,a)
    # 0 = False, 1 = True
    reward_cpt = np.zeros((10, 2))
    reward_cpt[[5, 9], 1] = 1

    DPNetPop.__init__(self, transition_cpt, reward_cpt, pop_size,
                      set_W=set_W)
def __init__(self, pop_size=1, set_W=True):
    """Create the tables for a two-segment chain and initialise the network.

    There are 10 states and two actions.  Action 1 advances ``s`` to
    ``s + 1`` for ``s`` in 0..8.  Action 0 returns to state 0 from states
    0-5 and to state 6 from states 6-9.  Reward probability is 1 for
    action 1 in states 5 and 9, and 0 elsewhere.

    Parameters
    ----------
    pop_size : int
        Forwarded unchanged to ``DPNetPop.__init__``.
    set_W : bool
        Forwarded unchanged to ``DPNetPop.__init__`` as keyword ``set_W``.
    """
    n_states = 10
    rewarded_states = (5, 9)

    # transition_cpt(s,a,s') = p(s'|a,s)
    transition_cpt = np.zeros((n_states, 2, n_states), dtype=float)
    for state in range(n_states):
        fallback = 0 if state < 6 else 6
        transition_cpt[state, 0, fallback] = 1
        if state < n_states - 1:
            transition_cpt[state, 1, state + 1] = 1

    # reward_cpt(s,a,r) = p(r|s,a)
    # 0 = False, 1 = True
    reward_cpt = np.zeros((n_states, 2))
    for state in rewarded_states:
        reward_cpt[state, 1] = 1

    DPNetPop.__init__(self, transition_cpt, reward_cpt, pop_size,
                      set_W=set_W)
def __init__(self, pop_size=10, set_W=True):
    """Build the tables of the Hit/Stick card task and initialise the net.

    Cards take the values 2..11; value 10 is drawn with probability 4/13
    and every other value with probability 1/13.  Value 11 is treated as an
    ace: it raises the ``ace`` counter and is later re-counted as 1 (total
    minus 10) when the total would exceed 21.

    Attributes set here
    -------------------
    pdealer : ndarray, shape (27, 12, 2)
        Indexed ``[hand, face, ace]``.  After the loop below, no mass is
        left on totals below 17.
    pxinit : ndarray, shape (22, 12, 2)
        Probability of each ``[hand, face, ace]`` triple after adding one
        card to every initial ``(hand, face)`` pair.
    pstart_state : ndarray, shape (380,)
        The entries of ``pxinit`` written at ``self.get_state_nr(...)``.

    Parameters
    ----------
    pop_size : int
        Forwarded unchanged to ``DPNetPop.__init__``.
    set_W : bool
        Forwarded unchanged to ``DPNetPop.__init__``.
    """
    self.pdealer = np.zeros((27, 12, 2))
    for i in range(2, 10):
        self.pdealer[i, i, 0] = 1. / 13
    self.pdealer[10, 10, 0] = 4. / 13
    self.pdealer[11, 11, 1] = 1. / 13

    def draw(x):
        # Spread the mass sitting on x = [hand, face, ace] over the entries
        # reached by drawing one more card, then clear x.  No card maps x
        # onto itself, so the mass can be read once up front.
        total, face, ace = x
        mass = self.pdealer[tuple(x)]
        for card in range(2, 12):
            if card == 11:
                # A new ace counts as 11 unless that exceeds 21, in which
                # case it counts as 1 and the ace counter is unchanged.
                if total + card > 21:
                    target = (total + 1, face, ace)
                else:
                    target = (total + 11, face, ace + 1)
                copies = 1.
            else:
                # Re-count an ace already held (total - 10) on overshoot.
                if total + card > 21 and ace > 0:
                    target = (total + card - 10, face, ace - 1)
                else:
                    target = (total + card, face, ace)
                copies = 4. if card == 10 else 1.
            self.pdealer[target] += mass * copies / 13
        self.pdealer[tuple(x)] = 0

    # Keep drawing until no probability mass is left on totals below 17.
    while np.sum(self.pdealer[:17]) > 0:
        for hand in range(2, 17):
            for face in range(2, 12):
                for ace in [0, 1]:
                    if self.pdealer[hand, face, ace] > 0:
                        draw([hand, face, ace])

    def next_x(x, card):
        # Successor of x = [hand, face, ace] after receiving `card`;
        # every total above 21 is collapsed onto the single value 22.
        xx = np.array(x) + [card, 0, card == 11]
        if xx[0] > 21 and xx[-1] > 0:
            xx[0] -= 10
            xx[-1] -= 1
        if xx[0] > 21:
            xx[0] = 22
        return xx

    # Joint probability of the first own card (`hand`) and `face`.
    pxinit = np.zeros((22, 12, 2))
    for hand in range(2, 12):
        ace = 1 if hand == 11 else 0
        for face in range(2, 12):
            if hand == 10 and face == 10:
                pxinit[hand, face, ace] = 4. / 13 * 4. / 13
            elif hand == 10 or face == 10:
                pxinit[hand, face, ace] = 4. / 13 / 13
            else:
                pxinit[hand, face, ace] = 1. / 13 / 13

    # Add one more card to every initial pair.
    self.pxinit = np.zeros((22, 12, 2))
    self.pstart_state = np.zeros(380)
    for hand in range(2, 12):
        for face in range(2, 12):
            ace = 1 if hand == 11 else 0
            for card in range(2, 12):
                nx = tuple(next_x([hand, face, ace], card))
                self.pxinit[nx] += pxinit[hand, face, ace] * \
                    (4. / 13 if card == 10 else 1. / 13)
                # Re-written after every increment, so the final value is
                # the fully accumulated probability.
                self.pstart_state[self.get_state_nr(nx)] = self.pxinit[nx]

    # transition_cpt(s,a,s') = p(s'|a,s)
    # 0 = Hit, 1 = Stick
    transition_cpt = np.zeros((380, 2, 380), dtype=float)
    for hand in range(4, 22):
        for face in range(2, 12):
            for ace in [0, 1]:
                if ace == 1 and hand < 11:
                    continue
                src = self.get_state_nr([hand, face, ace])
                for card in range(2, 12):
                    dst = self.get_state_nr(
                        tuple(next_x([hand, face, ace], card)))
                    # Accumulate instead of assigning: every card that
                    # overshoots 21 maps to the same successor (hand 22),
                    # and plain assignment kept only the last card's
                    # probability, so the row did not sum to 1.
                    transition_cpt[src, 0, dst] += \
                        4. / 13 if card == 10 else 1. / 13

    # reward_cpt(s,a,r) = p(r|s,a)
    reward_sizes = np.zeros((380, 2, 3))
    # reward can have 3 values (0,.5,1)
    reward_sizes[:, :, 1] = .5
    reward_sizes[:, :, 2] = 1
    reward_cpt = np.zeros((380, 2, 3))
    reward_cpt[:, :, 0] = 1  # default no reward
    for hand in range(4, 22):
        for face in range(2, 12):
            for ace in [0, 1]:
                reward_cpt[self.get_state_nr([hand, face, ace]), 1] = \
                    self.get_prew(hand, face)

    DPNetPop.__init__(self, transition_cpt, reward_cpt,
                      pop_size, reward_sizes, set_W)
def __init__(self, pop_size=10, set_W=True):
    """Create the grid-task tables (6 rows x 7 columns, three flags).

    Each of the 264 states pairs a cell ``[y, x]`` with three binary flags.
    An action moves in the intended direction with probability 0.9 and in
    either adjacent direction (action index -1 / +1 modulo 4) with
    probability 0.05 each.  Moves that leave the grid or run into a wall
    leave the state unchanged.  Arriving at [0, 2], [5, 0] or [4, 6] sets
    flag 0, 1 or 2.  For cell [0, 6] the reward size is the number of set
    flags and the outgoing transition probabilities are cleared.

    Parameters
    ----------
    pop_size : int
        Forwarded unchanged to ``DPNetPop.__init__``.
    set_W : bool
        Forwarded unchanged to ``DPNetPop.__init__``.
    """
    pickup_cells = [[0, 2], [5, 0], [4, 6]]
    move_probs = [.05, .9, .05]

    def flags_after_entering(cell, f0, f1, f2):
        # Flags carried over when stepping onto `cell`; a pickup cell
        # forces its own flag to 1.
        carried = [f0, f1, f2]
        for k, pickup in enumerate(pickup_cells):
            if cell == pickup:
                carried[k] = 1
        return carried

    # transition_cpt(s,a,s') = p(s'|a,s)
    # 0 = North, 1 = East, 2=South, 3=West
    transition_cpt = np.zeros((264, 4, 264), dtype=float)
    for y in range(6):
        for x in range(7):
            for a in range(4):
                if self.get_state_nr([y, x], [0, 0, 0]) < 0:
                    continue  # wall
                for aa in range(3):
                    prob = move_probs[aa]
                    b = np.mod(a + aa - 1, 4)
                    npos = self.get_next_pos([y, x], b)
                    if np.min(npos) < 0 or np.max(npos - [5, 6]) > 0 \
                            or self.get_state_nr(npos, [0, 0, 0]) < 0:
                        # walk against wall
                        for s in self.get_state_nr([y, x]):
                            transition_cpt[s, a, s] += prob
                    else:
                        npos = list(npos)
                        for f0 in [0, 1]:
                            for f1 in [0, 1]:
                                for f2 in [0, 1]:
                                    origin = self.get_state_nr(
                                        [y, x], [f0, f1, f2])
                                    target = self.get_state_nr(
                                        npos, flags_after_entering(
                                            npos, f0, f1, f2))
                                    transition_cpt[origin, a, target] += prob

    # reward_cpt(s,a,r) = p(r|s,a)
    # 0 = False, 1 = True
    reward_cpt = np.ones((264, 4))
    reward_sizes = np.zeros((264, 4))
    for f0 in [0, 1]:
        for f1 in [0, 1]:
            for f2 in [0, 1]:
                goal_state = self.get_state_nr([0, 6], [f0, f1, f2])
                reward_sizes[goal_state] = (f0 + f1 + f2)
                transition_cpt[goal_state] *= 0

    DPNetPop.__init__(self, transition_cpt, reward_cpt,
                      pop_size, reward_sizes, set_W)