def transition_model(self, s, a):
    """Return the distribution over successor states of ``s`` under ``a``.

    ``s`` is a 4-tuple ``(itmp, ttmp, actmp, acst)``.  Judging by the names
    these are presumably an indoor temperature, a target temperature, the
    AC set temperature and the AC on/off status -- TODO confirm against the
    class documentation.

    ``a`` is added to the third component, and ``a != 0`` is treated as a
    request for the AC status to be on.

    The second component is passed through untouched.  ``etmp``, ``minT``,
    ``maxT``, ``bound``, ``r_stvar`` and ``dist`` are resolved from the
    enclosing scope; they are not defined in this method.
    """
    indoor, target, setpoint, ac_on = s

    # Nominal next value of the first component: a weighted average of its
    # current value and either the set temperature (status on, weight 0.5)
    # or ``etmp`` (status off, weight 0.9 on the current value).
    if ac_on:
        keep, pull = 0.5, setpoint
    else:
        keep, pull = 0.9, etmp
    indoor = keep * indoor + (1 - keep) * pull

    setpoint = setpoint + a
    wants_on = (a != 0)

    # Clamp both values to [minT, maxT], then pass them through r_stvar.
    indoor = r_stvar(bound(indoor, minT, maxT))
    setpoint = r_stvar(bound(setpoint, minT, maxT))

    unchanged = (indoor, target, setpoint, ac_on)
    if wants_on == ac_on:
        # Requested status equals the current one: single outcome.
        return dist.delta_dist(unchanged)

    # Requested status differs: the status keeps its old value or takes the
    # requested one, each with probability 0.5.
    switched = (indoor, target, setpoint, wants_on)
    flip_prob = 0.5
    return dist.DDist({unchanged: 1 - flip_prob, switched: flip_prob})
def __init__(self, field_size, ball_speed=1, random_start=True):
    """Set up the state space, discount factor and start distribution.

    field_size   -- side length n of the n-by-n image space.
    ball_speed   -- stored as ``self.ball_speed``; also scales the horizon
                    ``h = n * ball_speed`` used for the discount factor.
    random_start -- if true, the start distribution is uniform over the
                    ball rows in column 0; otherwise it is a single state
                    with the ball at row ``int(n / 2)``, column 0.
    """
    self.q = None
    self.n = field_size

    horizon = self.n * ball_speed
    self.discount_factor = (horizon - 1.0) / horizon
    self.ball_speed = ball_speed

    # A state is ((ball_row, ball_col), (ball_row_vel, ball_col_vel),
    #             paddle_pos, paddle_vel):
    #   - ball position is n by n
    #   - ball velocity is one of (-1, -1), (-1, 1), (0, -1), (0, 1),
    #     (1, -1), (1, 1)
    #   - paddle position is n; this is the location of the bottom of the
    #     paddle, which can stick "up" out of the screen
    #   - paddle velocity is one of 1, 0, -1
    cells = range(self.n)
    row_velocities = (-1, 0, 1)
    col_velocities = (-1, 1)
    paddle_velocities = (-1, 0, 1)
    self.states = [
        ((ball_row, ball_col), (row_vel, col_vel), paddle_pos, paddle_vel)
        for ball_row in cells
        for ball_col in cells
        for row_vel in row_velocities
        for col_vel in col_velocities
        for paddle_pos in cells
        for paddle_vel in paddle_velocities
    ]
    # Extra non-tuple state appended after all regular states.
    self.states.append('over')

    # Every start state has the ball in column 0 with velocity (0, 1) and
    # the paddle at position 0 with velocity 0.
    if random_start:
        self.start = dist.uniform_dist(
            [((ball_row, 0), (0, 1), 0, 0) for ball_row in cells])
    else:
        self.start = dist.delta_dist(((int(self.n/2), 0), (0, 1), 0, 0))
def transition_model(self, s, a, p = 0.4):
    """Return the distribution over successor states of ``s`` under ``a``.

    s -- either 'over' or
         ((ball_row, ball_col), (ball_row_vel, ball_col_vel),
          paddle_pos, paddle_vel).
    a -- paddle action; per the original comment one of (-1, 0, 1).  It is
         added to the paddle position (clamped to [0, n-1]) and becomes the
         new paddle velocity.
    p -- probability given to the nominal post-bounce state; the remaining
         1 - p goes to states with a negated velocity component.

    The only randomness is in the ball velocity after a bounce.  'over' is
    absorbing, and is also the outcome when the ball passes column n - 1
    and ``self.paddle_hit`` reports no contact.
    """
    if s == 'over':
        return dist.delta_dist('over')

    (row, col), (row_vel, col_vel), paddle, _ = s
    last = self.n - 1

    # Nominal ball and paddle motion.
    next_row = row + self.ball_speed * row_vel
    next_col = col + self.ball_speed * col_vel
    next_paddle = max(0, min(last, paddle + a))

    # Row limits (index 0 and index n - 1): clamp and point the row velocity
    # back inside.
    bounced_row = True
    if next_row < 0:
        next_row, row_vel = 0, 1
    elif next_row >= self.n:
        next_row, row_vel = last, -1
    else:
        bounced_row = False

    # Column limits: column 0 always bounces; past column n - 1 only a
    # paddle hit keeps the ball in play.
    bounced_col = True
    if next_col < 0:
        next_col, col_vel = 0, 1
    elif next_col >= self.n:
        # paddle_hit sees the already-clamped row and the unclamped column.
        if not self.paddle_hit(paddle, next_paddle, row, col,
                               next_row, next_col):
            return dist.delta_dist('over')
        next_col, col_vel = last, -1
    else:
        bounced_col = False

    def outcome(rv, cv):
        # Successor state with the given ball velocity.
        return ((next_row, next_col), (rv, cv), next_paddle, a)

    nominal = outcome(row_vel, col_vel)

    if not (bounced_row or bounced_col):
        return dist.delta_dist(nominal)

    if bounced_col:
        # Also covers a simultaneous row and column bounce: the row
        # velocity is the component that gets randomised.
        if abs(row_vel) > 0:
            return dist.DDist({nominal: p,
                               outcome(-row_vel, col_vel): 1 - p})
        # Zero row velocity: split the remaining mass between -1 and +1.
        return dist.DDist({nominal: p,
                           outcome(-1, col_vel): 0.5 * (1 - p),
                           outcome(1, col_vel): 0.5 * (1 - p)})

    # Row bounce only: the column velocity is negated with probability 1-p.
    return dist.DDist({nominal: p,
                       outcome(row_vel, -col_vel): 1 - p})
def __init__(self, grid_size, stride_factor=1, random_start=True):
    """Set up actions, state space and start distribution for an n-by-n grid.

    grid_size     -- side length n of the grid.
    stride_factor -- stored as ``self.stride``.
    random_start  -- if true, the start distribution is uniform over the
                     second coordinate of the second position pair;
                     otherwise a single start state is used.
    """
    self.q = None
    self.n = grid_size
    self.actions = ['up', 'down', 'left', 'right']
    self.discount_factor = 1
    self.stride = stride_factor

    # A state is ((px, py), (rx, ry)): two grid positions, each coordinate
    # ranging over 0..n-1.  'over' is appended as an extra state.
    axis = range(self.n)
    grid_states = [
        ((px, py), (rx, ry))
        for px in axis
        for py in axis
        for rx in axis
        for ry in axis
    ]
    grid_states.append('over')
    self.states = grid_states

    # The first position always starts at (0, 0); the second starts with
    # first coordinate int(n / 2).
    origin = (0, 0)
    if not random_start:
        self.start = dist.delta_dist(
            (origin, (int(self.n / 2), int(self.n / 2))))
    else:
        self.start = dist.uniform_dist(
            [(origin, (int(self.n / 2), ry)) for ry in axis])
def __init__(self, start=(20*u, 25*u, 30*u, False)):
    """Set up the state space, action list and start distribution.

    start -- the single start state, a 4-tuple laid out like the entries of
             ``self.states``: (itmp, ttmp, actmp, acst).

    ``u``, ``minT`` and ``maxT`` are resolved from the enclosing scope.
    """
    self.q = None
    self.discount_factor = 0.99

    # +1 so that maxT itself is included in the range.
    temperatures = range(minT, maxT + 1)
    ac_statuses = (True, False)
    self.states = [
        (itmp, ttmp, actmp, acst)
        for itmp in temperatures
        for ttmp in temperatures
        for actmp in temperatures
        for acst in ac_statuses
    ]
    # No 'over' state is added for this model.

    # Non-zero action that keeps the AC on (!= 0) but does not move the
    # target: the next representable float after 0 towards 1.
    hold_on = np.nextafter(0, 1)
    self.actions = [+1, 0, -1, hold_on]

    self.start = dist.delta_dist(start)
def transition_model(self, s, a):
    """Return the distribution over successor states of ``s`` under ``a``.

    s -- either 'over' or ((px, py), (rx, ry)).  The first pair is the
         position moved by the action; the second pair is called the
         "reward" position in the original comments.
    a -- one of 'up', 'down', 'left', 'right'.  The first position moves
         2 cells in that direction, clamped to the grid [0, n-1].

    The second position moves ``self.stride`` cells (clamped to the grid)
    in each of the four directions; the distinct resulting states are
    returned with equal probability.  'over' is absorbing.

    Raises:
        ValueError: if ``a`` is not one of the four known actions.
            Previously this surfaced later as an UnboundLocalError.
    """
    if s == 'over':
        return dist.delta_dist('over')

    ((px, py), (rx, ry)) = s
    last = self.n - 1
    player_step = 2  # cells moved per action by the first position

    # Apply the action to the first position, clamped to the grid.
    if a == 'up':
        new_px, new_py = px, min(py + player_step, last)
    elif a == 'down':
        new_px, new_py = px, max(py - player_step, 0)
    elif a == 'left':
        new_px, new_py = max(px - player_step, 0), py
    elif a == 'right':
        new_px, new_py = min(px + player_step, last), py
    else:
        raise ValueError(
            "unknown action %r; expected 'up', 'down', 'left' or 'right'"
            % (a,))

    # NOTE(review): the terminal test compares only the x coordinates
    # (old rx against the new px); ry / new_py are ignored.  Kept exactly
    # as in the original -- confirm this is the intended end condition.
    if rx == new_px:
        return dist.delta_dist('over')

    # The second position moves one stride up, down, left or right, clamped
    # to the grid.  Clamping can make several moves coincide, so duplicates
    # are removed before building the uniform distribution.
    player = (new_px, new_py)
    candidates = [
        (player, (rx, min(ry + self.stride, last))),   # up
        (player, (rx, max(ry - self.stride, 0))),      # down
        (player, (max(rx - self.stride, 0), ry)),      # left
        (player, (min(rx + self.stride, last), ry)),   # right
    ]
    return dist.uniform_dist(list(set(candidates)))