def run():
    """Train the agent for 40 episodes using eligibility-trace Q-learning.

    Each episode runs until the World signals a restart. After every step the
    whole Q-table is updated via `inc_Q` weighted by the eligibility traces in
    `E` (Watkins-style: traces are zeroed whenever the chosen action is not
    greedy). Per-episode stats are appended to the global `log` and finally
    dumped to 'data/Q.csv'.

    Side effects: mutates globals `alpha`, `epsilon`, `log` (and, via helpers,
    `Q` and `E`); drives `World`; writes 'data/Q.csv'.
    """
    global discount
    global epsilon
    global alpha
    global log
    time.sleep(1)  # give the World/UI a moment to initialize before acting
    s1 = World.player
    a1, _ = policy(s1)
    for episode_num in range(40):
        steps = 0
        score = 0
        while not World.has_restarted():
            # Do the action
            (s1, a1, r1, s2) = do_action(a1)
            score += r1

            # Pick the next action; the TD target uses the greedy value
            # (max_Q), so this is Q-learning when the policy is greedy.
            a2, _ = policy(s2)
            a_best, q_best = max_Q(s2)
            delta = r1 + discount * q_best - Q[s1][a1]
            # Replacing trace for the visited state-action pair.
            E[s1][a1] = 1
            for state in states:
                for action in actions:
                    inc_Q(state, action, alpha, delta)
                    if a_best == a2:
                        # On-policy greedy step: decay all traces.
                        E[state][action] *= discount * e_decay
                    else:
                        # Exploratory step: cut traces (Watkins' Q(lambda)).
                        E[state][action] = 0

            s1 = s2
            a1 = a2
            steps += 1

            # MODIFY THIS SLEEP IF THE GAME IS GOING TOO FAST.
            # time.sleep(0.005)

        World.restart_game()
        reset_E()
        log.append({
            'episode': episode_num,
            'score': score,
            'steps': steps,
            'alpha': alpha,
            # Log the epsilon actually in effect this episode (was a
            # hardcoded 0, which made the column useless).
            'epsilon': epsilon
        })
        # time.sleep(0.01)

        # Decay schedules: alpha floors at 0.1, epsilon is capped at 0.3
        # and decays toward 0 (exploration vanishes over time).
        alpha = max(0.1, pow(episode_num + 1, -0.4))
        epsilon = min(0.3, pow(episode_num + 1, -1.2))

    with open('data/Q.csv', 'w', newline='') as csvfile:
        writer = csv.DictWriter(
            csvfile,
            fieldnames=['episode', 'score', 'steps', 'alpha', 'epsilon'])
        writer.writeheader()
        writer.writerows(log)
    print('Logged')