def backup(self, cur_visit, visits, reward):
    """Apply one temporal-difference update with accumulating eligibility traces.

    The previous state-action pair is the last entry of ``visits``; the
    TD error is ``reward + Q(cur_visit) - Q(previous visit)``, where ``Q`` is
    the linear estimate ``sum(features * self.theta)``.

    Args:
        cur_visit: ``"dealer:player:action"`` string for the pair just reached.
        visits: Sequence of visit strings in the same format; only the last
            element is read. Must be non-empty.
        reward: Reward observed on the transition.

    Side effects:
        Updates ``self.E`` and ``self.theta`` in place.
    """
    state = State()

    # Previous state-action pair: the most recent entry in the history.
    prev_fields = list(map(int, visits[-1].split(':')))
    state.dealer = prev_fields[0]
    state.player = prev_fields[1]
    prev_features = self.get_coarse_feature(state, prev_fields[2])
    q_prev = np.sum(prev_features * self.theta)

    # Accumulating trace: bump every feature active for the previous pair.
    self.E[prev_features == 1] += 1.0

    # Current state-action pair (the same State object is reused).
    # NOTE(review): Q(cur_visit) is always added; presumably callers handle
    # terminal transitions themselves -- confirm.
    cur_fields = list(map(int, cur_visit.split(':')))
    state.dealer = cur_fields[0]
    state.player = cur_fields[1]
    cur_features = self.get_coarse_feature(state, cur_fields[2])
    q_cur = np.sum(cur_features * self.theta)

    delta = (reward - q_prev) + q_cur

    # Update the weights with the *undecayed* trace, then decay it for the
    # next step.  Decaying first scales every update by lambda, which stops
    # learning entirely when lambda == 0.
    self.theta += CONST.ALPHA * delta * self.E
    self.E *= CONST.LAMBDAS[self.l]
def record_mse(self, episode):
    """Record the squared error between the linear estimate and ``self.MCQ``.

    For every (dealer, player, action) cell in the scanned grid, the linear
    estimate ``sum(features * self.theta)`` is compared with
    ``self.MCQ["dealer:player:action"]``.  The accumulated squared error,
    divided by ``10 * 21 * 2``, is stored in ``self.mse[episode]``.

    Args:
        episode: Index (or key) into ``self.mse`` to write the result to.
    """
    # NOTE(review): the grid below covers dealer 1..9 and player
    # 1..CONST.EASY21 - 1, while the normaliser is 10 * 21 * 2 -- confirm
    # whether the ranges or the divisor are the intended ones.
    dealer_cards = range(1, 10)
    player_sums = range(1, CONST.EASY21)
    actions = (CONST.HIT, CONST.STICK)

    total = 0.0
    for dealer in dealer_cards:
        for player in player_sums:
            for action in actions:
                key = "%d:%d:%d" % (dealer, player, action)

                # Flush a vanishingly small, non-zero running total to zero.
                if total < 0.00000001 and total != 0.0:
                    total = 0.0

                # Once the running total exceeds the cap, stop accumulating.
                if total > 10000.0:
                    continue

                cell = State()
                cell.dealer = dealer
                cell.player = player
                features = self.get_coarse_feature(cell, action)
                estimate = np.sum(features * self.theta)
                total += (estimate - self.MCQ[key]) ** 2

    self.mse[episode] = total / 10.0 / 21.0 / 2.0
def play():
    """Play one episode with a fixed threshold policy, printing each state.

    The player hits while their total is below ``CONST.STAY_VALUE`` and they
    have not gone bust; if still not bust afterwards, they stick.  The final
    state and the reward reported by the environment are printed at the end.
    """
    hand = State()
    table = Environment()

    # Initial deal for both sides.
    hand.dealer = table.draw(initial=True)
    hand.player = table.draw(initial=True)
    print(hand)

    # Keep hitting until the threshold is reached or the player is bust.
    while True:
        if hand.player >= CONST.STAY_VALUE or table.check_burst(hand.player):
            break
        table.step(hand, CONST.HIT)
        print(hand)

    # Only a surviving player gets to stick.
    busted = table.check_burst(hand.player)
    if not busted:
        table.step(hand, CONST.STICK)

    # NOTE(review): assumed to run whether or not the player went bust -- confirm.
    print(hand, " Reward: ", table.check_reward(hand))
def checkStepDealer():
    """Sample ``Environment.step`` repeatedly and write outcome frequencies.

    Each key of ``CONST.STEPRECORDPLAYS`` has the form
    ``"dealer-player-action"``; an action field of ``0`` selects
    ``CONST.HIT``, anything else selects ``CONST.STICK``.  For every key the
    step is taken ``CONST.SAMPLES`` times from the same start state, and the
    relative frequency of each ``"dealer:player:reward"`` outcome (terminal
    outcomes are recorded as ``"0:0:reward"``) is written, formatted to three
    decimals, to ``CONST.OUTPUT_PATH + CONST.CHECKSTEPTEMPLATE.format(...)``.
    """
    state = State()
    env = Environment()

    # dict.fromkeys visits each distinct key once, as the original mapping did.
    for play_key in dict.fromkeys(CONST.STEPRECORDPLAYS):
        fields = play_key.split('-')
        dealer_start = int(fields[0])
        player_start = int(fields[1])
        # Compare by value: ``is 0`` tests identity and only works by accident.
        action = CONST.HIT if int(fields[2]) == 0 else CONST.STICK

        counts = {}
        # Draw exactly CONST.SAMPLES samples so the frequencies, which are
        # divided by CONST.SAMPLES below, sum to one.
        for _ in range(CONST.SAMPLES):
            # step() mutates the state, so reset it before every sample.
            state.dealer = dealer_start
            state.player = player_start
            reward = env.step(state, action)
            if env.is_terminal(state):
                outcome = "%d:%d:%d" % (0, 0, reward)
            else:
                outcome = "%d:%d:%d" % (state.dealer, state.player, reward)
            counts[outcome] = counts.get(outcome, 0) + 1

        frequency = {
            outcome: "%.3f" % (float(count) / CONST.SAMPLES)
            for outcome, count in counts.items()
        }

        filename = CONST.CHECKSTEPTEMPLATE.format(fields[0], fields[1], fields[2])
        write(CONST.OUTPUT_PATH + filename, frequency)