def expand_Q(w):
    """Expand the linear-FA weight vector into a full (dealer, player, action) Q table."""
    Q = np.zeros((10, 21, 2))
    for dealer in DEALER_RANGE:
        for player in PLAYER_RANGE:
            for action in ACTIONS:
                state = State()
                state.dealercard = dealer
                state.playersum = player
                feats = phi(state, action)
                Q[dealer - 1, player - 1, action] = np.sum(feats * w)
    return Q
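# The functions in this file rely on module-level imports, constants, and a feature
# map `phi` that are defined elsewhere in the project. The sketch below shows one
# plausible version, based on the Easy21 assignment; the exact values (GAMMA, ALPHA,
# the coarse-coding intervals, the action encoding) are assumptions, not taken from
# the original code.
import pickle
import random
import time

import numpy as np

GAMMA = 1.0                     # undiscounted episodic task (assumed)
ALPHA = 0.01                    # constant step size for the linear-FA learner (assumed)
DEALER_RANGE = range(1, 11)     # dealer's showing card: 1..10
PLAYER_RANGE = range(1, 22)     # player's sum: 1..21
ACTIONS = (0, 1)                # 0 = stick, 1 = hit (encoding assumed)

# Coarse-coded binary features: 3 dealer intervals x 6 player intervals x 2 actions.
DEALER_INTERVALS = ((1, 4), (4, 7), (7, 10))
PLAYER_INTERVALS = ((1, 6), (4, 9), (7, 12), (10, 15), (13, 18), (16, 21))
FEATS_SHAPE = (len(DEALER_INTERVALS), len(PLAYER_INTERVALS), len(ACTIONS))

def phi(state, action):
    """Binary feature vector phi(s, a) for the linear approximator (assumed form)."""
    feats = np.zeros(FEATS_SHAPE)
    for i, (d_lo, d_hi) in enumerate(DEALER_INTERVALS):
        for j, (p_lo, p_hi) in enumerate(PLAYER_INTERVALS):
            if d_lo <= state.dealercard <= d_hi and p_lo <= state.playersum <= p_hi:
                feats[i, j, action] = 1.0
    return feats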
def Lfa():
    """Run Sarsa(lambda) with linear function approximation for each lambda and plot the learning curves."""
    lambdas = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
    learning_curves = {}
    num_episodes = 2000
    # Reference Q table computed beforehand (e.g. Monte Carlo control over 1,000,000 episodes).
    with open("./Q_dump_episodes_1000000.pkl", "rb") as f:
        opt_value = pickle.load(f)
    for lmbd in lambdas:
        Q_value, error_history = lfa_learn(lmbd, opt_value, num_episodes)
        learning_curves[lmbd] = error_history
    plot_file = "./outcome/lfa_error_{}_episodes_time_{}.pdf".format(num_episodes, time.time())
    plot_learning_curve(learning_curves, save=plot_file)
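# `plot_learning_curve` is not shown in this file. A minimal matplotlib sketch of
# what it is assumed to do (one MSE-vs-episode curve per lambda, optionally saved
# to the given path) could look like this:
import matplotlib.pyplot as plt

def plot_learning_curve(learning_curves, save=None):
    """Plot the MSE against the reference Q for each lambda in `learning_curves`."""
    plt.figure()
    for lmbd, history in sorted(learning_curves.items()):
        episodes, errors = zip(*history)
        plt.plot(episodes, errors, label="lambda = {}".format(lmbd))
    plt.xlabel("episode")
    plt.ylabel("MSE vs. reference Q")
    plt.legend()
    if save:
        plt.savefig(save)
    else:
        plt.show()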
def Sarsa_lamda_Control(lmbd, opt_value, num_episodes):
    """Tabular Sarsa(lambda) control with accumulating eligibility traces."""
    value = np.zeros((10, 21, 2))      # Q(s, a) table
    counter = np.zeros((10, 21, 2))    # visit counts N(s, a)
    error_history = []
    for episode in range(1, num_episodes + 1):
        # Start each episode from a random initial state.
        state1 = State()
        state1.dealercard = random.randint(1, 10)
        state1.playersum = random.randint(1, 10)
        E = np.zeros((10, 21, 2))      # eligibility traces
        while state1 != "terminal":
            action1 = Epsilon_greedy_policy(value, counter, state1)
            state2, reward = Step(state1, action1)
            idx1 = (state1.dealercard - 1, state1.playersum - 1, action1)
            Q1 = value[idx1]
            if state2 == "terminal":
                Q2 = 0.0               # no bootstrap value from the terminal state
            else:
                action2 = Policy(value, counter, state2)
                idx2 = (state2.dealercard - 1, state2.playersum - 1, action2)
                Q2 = value[idx2]
            counter[idx1] += 1
            E[idx1] += 1
            alpha = 1.0 / counter[idx1]          # step size 1 / N(s, a)
            delta = reward + GAMMA * Q2 - Q1     # TD error
            value += alpha * delta * E
            E *= GAMMA * lmbd
            state1 = state2
        error_history.append((episode, mse(value, opt_value)))
    return value, error_history
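# `Epsilon_greedy_policy`, `Policy`, and `mse` are helpers defined elsewhere. A sketch
# under the Easy21 convention epsilon = N0 / (N0 + N(s)) with N0 = 100 (assumed value);
# `Policy` is assumed here to be the greedy counterpart used for the bootstrap target,
# which is how the control loop above calls it.
N0 = 100.0

def Epsilon_greedy_policy(value, counter, state):
    """Pick an action epsilon-greedily, with epsilon decaying as the state is visited."""
    idx = (state.dealercard - 1, state.playersum - 1)
    epsilon = N0 / (N0 + np.sum(counter[idx]))
    if random.random() < epsilon:
        return random.choice(ACTIONS)
    return int(np.argmax(value[idx]))

def Policy(value, counter, state):
    """Greedy action with respect to the current Q table (assumed behaviour)."""
    idx = (state.dealercard - 1, state.playersum - 1)
    return int(np.argmax(value[idx]))

def mse(Q, opt_value):
    """Mean squared error between the learned and the reference Q tables."""
    return np.mean((Q - opt_value) ** 2)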
def Sarsa():
    """Run tabular Sarsa(lambda) for each lambda and plot the learning curves."""
    lambdas = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
    learning_curves = {}
    num_episodes = 20000
    # Reference Q table computed beforehand (e.g. Monte Carlo control over 1,000,000 episodes).
    with open("./Q_dump_episodes_1000000.pkl", "rb") as f:
        opt_value = pickle.load(f)
    for lmbd in lambdas:
        Q_value, error_history = Sarsa_lamda_Control(lmbd, opt_value, num_episodes)
        learning_curves[lmbd] = error_history
    plot_file = "./outcome/Sarsa_error_{}_episodes_time_{}.pdf".format(num_episodes, time.time())
    plot_learning_curve(learning_curves, save=plot_file)
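# `State` and `Step` come from the Easy21 environment, which is not part of this
# file. A minimal sketch of the assumed interface: terminal states are represented
# by the string "terminal", and Step returns (next_state, reward). The card-drawing
# and dealer rules below follow the Easy21 assignment and are assumptions about the
# original environment, not a copy of it.
class State:
    def __init__(self):
        self.dealercard = 0
        self.playersum = 0

def draw():
    """Draw a card: value 1-10, black (+) with probability 2/3, red (-) with probability 1/3."""
    value = random.randint(1, 10)
    return value if random.random() < 2.0 / 3.0 else -value

def Step(state, action):
    """Apply `action` (1 = hit, 0 = stick; encoding assumed) and return (next_state, reward)."""
    if action == 1:
        # Hit: the player draws another card; bust if the sum leaves [1, 21].
        next_state = State()
        next_state.dealercard = state.dealercard
        next_state.playersum = state.playersum + draw()
        if next_state.playersum < 1 or next_state.playersum > 21:
            return "terminal", -1
        return next_state, 0
    # Stick: the dealer draws until reaching at least 17, then hands are compared.
    dealer = state.dealercard
    while 1 <= dealer < 17:
        dealer += draw()
    if dealer < 1 or dealer > 21 or dealer < state.playersum:
        return "terminal", 1
    if dealer == state.playersum:
        return "terminal", 0
    return "terminal", -1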
def lfa_learn(lmbd, opt_value, num_episodes):
    """Sarsa(lambda) control with a linear function approximator over phi(s, a)."""
    Q = np.zeros((10, 21, 2))
    error_history = []
    # Small random initial weights for the linear approximator.
    w = (np.random.rand(*FEATS_SHAPE) - 0.5) * 0.001
    for episode in range(1, num_episodes + 1):
        # Start each episode from a random initial state.
        state1 = State()
        state1.dealercard = random.randint(1, 10)
        state1.playersum = random.randint(1, 10)
        E = np.zeros_like(w)               # eligibility traces over the weights
        while state1 != "terminal":
            Qhat1, action1 = policy(state1, w)
            state2, reward = Step(state1, action1)
            if state2 == "terminal":
                Qhat2 = 0.0                # no bootstrap value from the terminal state
            else:
                Qhat2, action2 = policy(state2, w)
            grad_w_Qhat1 = phi(state1, action1)      # gradient of the linear Q w.r.t. w
            delta = reward + GAMMA * Qhat2 - Qhat1   # TD error
            E = GAMMA * lmbd * E + grad_w_Qhat1
            w += ALPHA * delta * E
            state1 = state2
        # Expand the weights into a full Q table once per episode to track the error.
        Q = expand_Q(w)
        error_history.append((episode, mse(Q, opt_value)))
    return Q, error_history
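# `policy(state, w)` is assumed to be epsilon-greedy over the linear Q-estimate with
# a fixed exploration rate (the Easy21 assignment suggests epsilon = 0.05; assumed here).
EPSILON_FA = 0.05

def policy(state, w):
    """Return (Q_hat(s, a), a) for an epsilon-greedy action under the linear model."""
    qs = [np.sum(phi(state, a) * w) for a in ACTIONS]
    if random.random() < EPSILON_FA:
        action = random.choice(ACTIONS)
    else:
        action = int(np.argmax(qs))
    return qs[action], action

# A typical entry point for this script (assumed; not part of the original code):
if __name__ == "__main__":
    Sarsa()   # tabular Sarsa(lambda), 20000 episodes per lambda
    Lfa()     # Sarsa(lambda) with linear function approximation, 2000 episodes per lambda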