def play_game(grid: Grid, policy):
    state = (2, 0)
    grid.set_state(state)
    action = random_action(policy[state])
    states_actions_rewards = [(state, action, 0)]
    while True:
        reward = grid.move(action)
        state = grid.current_state()
        if grid.is_game_over():
            states_actions_rewards.append((state, None, reward))
            break
        else:
            action = random_action(policy[state])
            states_actions_rewards.append((state, action, reward))

    # Calculate returns, G, by working backwards from the terminal state
    G = 0
    states_actions_returns = []
    first = True
    for state, action, reward in reversed(states_actions_rewards):
        # Value of terminal state is 0 so ignore it. Can also ignore last G
        if first:
            first = False
        else:
            states_actions_returns.append((state, action, G))
        G = reward + gamma * G
    states_actions_returns.reverse()  # Restore the original order in which states were visited
    return states_actions_returns
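The play_game variants in this section call a random_action helper and reference a module-level discount gamma, neither of which is shown. A minimal sketch of an epsilon-soft random_action, assuming a global action set ALL_POSSIBLE_ACTIONS and an exploration rate of 0.1 (both values are assumptions, not taken from the original code):

import numpy as np

ALL_POSSIBLE_ACTIONS = ('U', 'D', 'L', 'R')  # assumed action set

def random_action(a, eps=0.1):
    # Epsilon-soft selection: keep the policy's action with probability 1 - eps,
    # otherwise pick uniformly at random from all actions.
    if np.random.random() < 1 - eps:
        return a
    return np.random.choice(ALL_POSSIBLE_ACTIONS)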
def play_game(grid: Grid, policy: dict):
    # Returns a list of states and corresponding rewards
    # Start at the designated start state
    s = (2, 0)
    grid.set_state(s)
    states_and_rewards = [(s, 0)]  # List of tuples (state, reward)
    while not grid.game_over():
        a = policy[s]
        a = random_action(a)
        r = grid.move(a)
        s = grid.current_state()
        states_and_rewards.append((s, r))
    return states_and_rewards
def main(argv):
    setpath()
    from q_learning import QLearning
    from grid_world import Grid

    # 5 rows, 4 cols, unreachables: [(1, 1), (1, 3)], pits: [(3, 1)], goal: (4, 3)
    g = Grid(5, 4, [(1, 1), (1, 3)], [(3, 1)], (4, 3))
    q = QLearning(g)
    q.learn()
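QLearning.learn() itself is not shown. As a rough, generic illustration of the tabular backup such a class usually performs per transition (the function name, alpha, and gamma below are illustrative choices, not the repo's actual API):

from collections import defaultdict

def q_learning_update(Q, s, a, r, s_next, actions, alpha=0.1, gamma=0.9):
    # One tabular backup: Q(s, a) += alpha * (r + gamma * max_a' Q(s', a') - Q(s, a))
    best_next = max(Q[(s_next, a2)] for a2 in actions)
    Q[(s, a)] += alpha * (r + gamma * best_next - Q[(s, a)])

# Usage sketch:
# Q = defaultdict(float)
# q_learning_update(Q, s=(0, 0), a='right', r=-1.0, s_next=(0, 1),
#                   actions=['up', 'down', 'left', 'right'])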
def run_simulation_441():
    global current_balance
    for i in tqdm(range(N)):
        grid = Grid(X_GRID, Y_GRID, LIVES, True)
        while not grid.done:
            state = grid.current_state()  # The current position in the env
            action = random.choice(grid.actions[state])  # Choose a random valid action
            grid.move(*action)
        round_reward = cost_per_game - rewards_table_441[grid.score]
        current_balance += round_reward
        # Append round profit and accumulated profit
        games_log.append([round_reward, current_balance])
        if show_live and not i % SHOW_EVERY:
            plot_graph(True, games_log)
def play_game(grid: Grid, policy):
    # Reset the game to start at a random position.
    # We need to do this because, with the current deterministic policy,
    # we would never end up at certain states.
    start_states = list(grid.actions.keys())
    start_idx = np.random.choice(len(start_states))
    grid.set_state(start_states[start_idx])

    state = grid.current_state()
    states_and_rewards = [(state, 0)]  # List of tuples (state, reward)
    while not grid.is_game_over():
        action = policy[state]
        action = random_action(action)
        reward = grid.move(action)
        state = grid.current_state()
        states_and_rewards.append((state, reward))

    # Calculate returns, G, by working backwards from the terminal state
    G = 0
    states_and_returns = []
    first = True
    for state, reward in reversed(states_and_rewards):
        # Value of terminal state is 0 so ignore it. Can also ignore last G
        if first:
            first = False
        else:
            states_and_returns.append((state, G))
        G = reward + gamma * G
    states_and_returns.reverse()  # Restore the original order in which states were visited
    return states_and_returns
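A sketch of how the (state, return) pairs produced by this play_game are typically folded into a first-visit Monte Carlo value estimate; V and returns are illustrative names, not part of the original snippet:

import numpy as np

def update_value_estimates(V, returns, states_and_returns):
    # First-visit Monte Carlo prediction: for each state, record the return from
    # its first visit in the episode and re-average across episodes.
    seen = set()
    for s, G in states_and_returns:
        if s not in seen:
            returns.setdefault(s, []).append(G)
            V[s] = np.mean(returns[s])
            seen.add(s)

# Usage sketch across episodes:
# V, returns = {}, {}
# for _ in range(n_episodes):
#     update_value_estimates(V, returns, play_game(grid, policy))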
def play_game(grid: Grid, policy):
    # Reset the game to start at a random position.
    # We need to do this because, with the current deterministic policy,
    # we would never end up at certain states.
    start_states = list(grid.actions.keys())
    start_idx = np.random.choice(len(start_states))
    grid.set_state(start_states[start_idx])

    state = grid.current_state()
    action = np.random.choice(ALL_POSSIBLE_ACTIONS)  # First action is uniformly random
    states_actions_rewards = [(state, action, 0)]
    seen_states = set()
    seen_states.add(grid.current_state())
    num_steps = 0
    while True:
        reward = grid.move(action)
        num_steps += 1
        state = grid.current_state()
        if state in seen_states:
            # Hack so we don't end up in an infinitely long episode bumping into a wall
            r = -10. / num_steps
            states_actions_rewards.append((state, None, r))
            break
        elif grid.is_game_over():
            states_actions_rewards.append((state, None, reward))
            break
        else:
            action = policy[state]
            states_actions_rewards.append((state, action, reward))
            seen_states.add(state)

    # Calculate returns, G, by working backwards from the terminal state
    G = 0
    states_actions_returns = []
    first = True
    for state, action, reward in reversed(states_actions_rewards):
        # Value of terminal state is 0 so ignore it. Can also ignore last G
        if first:
            first = False
        else:
            states_actions_returns.append((state, action, G))
        G = reward + gamma * G
    states_actions_returns.reverse()  # Restore the original order in which states were visited
    return states_actions_returns
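Similarly, the (state, action, return) triples from this variant feed a first-visit Monte Carlo control update: average returns per state-action pair, then make the policy greedy with respect to the updated Q. The names Q, returns, and policy below are illustrative, not taken from the original code:

import numpy as np

def update_q_and_policy(Q, returns, policy, states_actions_returns):
    # First-visit Monte Carlo control: average first-visit returns per (state, action)
    # pair, then improve the policy greedily with respect to Q.
    seen = set()
    for s, a, G in states_actions_returns:
        if (s, a) not in seen:
            returns.setdefault((s, a), []).append(G)
            Q.setdefault(s, {})[a] = np.mean(returns[(s, a)])
            seen.add((s, a))
    for s, action_values in Q.items():
        policy[s] = max(action_values, key=action_values.get)  # greedy improvement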
def initialize_round():
    global grid, screen, heart, pixels_per_tile, margin, old_lives
    pixels_per_tile = int(min(screen_h / Y_GRID, screen_w / X_GRID) * 3 / 4)
    margin = int(pixels_per_tile / 20)
    size = [(pixels_per_tile + margin) * X_GRID + margin,
            (pixels_per_tile + margin) * Y_GRID + margin]
    # print(f"Grid Size: {X_GRID}x{Y_GRID}: ({size[0]}, {size[1]})px")
    setattr(Tile, 'PIXELS_PER_TILE', pixels_per_tile)
    setattr(Tile, 'MARGIN', margin)
    # screen = pygame.display.set_mode(SIZE, pygame.HWSURFACE | pygame.DOUBLEBUF | pygame.RESIZABLE | pygame.FULLSCREEN)
    screen = pygame.display.set_mode(size)
    grid = Grid(X_GRID, Y_GRID, LIVES)
    old_lives = LIVES
    print("Right Path: ", grid.path)
    heart = pygame.image.load(Tile.ASSETS['heart'])
    heart = pygame.transform.scale(heart, (int(pixels_per_tile / 2), int(pixels_per_tile / 2)))
import numpy as np
import torch
from torch.optim import AdamW

from grid_world import Grid
from actor import Actor, Actor_Loss, choose_action
from critic import Critic, Critic_Loss

np.random.seed(1)

# Training config
MAX_EPISODE = 450
Actor_lr = 1e-3
Critic_lr = 1e-3

# Problem setting
grid = Grid()
grid.draw_board()
state_dim = 2
action_dim = 4

# Init models
actor = Actor(input_dim=state_dim, output_dim=action_dim)
critic = Critic(input_dim=state_dim)
actor_opt = AdamW(actor.parameters(), lr=Actor_lr)
critic_opt = AdamW(critic.parameters(), lr=Critic_lr)

# Init loss
a_loss = Actor_Loss()
c_loss = Critic_Loss()

for i_episode in range(MAX_EPISODE):
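The episode loop is cut off above. Below is a generic, self-contained sketch of the one-step advantage actor-critic update such a loop typically performs at every environment step; the networks, GAMMA, and the ac_step signature are stand-ins, not the Actor, Critic, or Grid interfaces imported above:

import torch
import torch.nn as nn
from torch.distributions import Categorical

GAMMA = 0.9  # assumed discount factor

policy_net = nn.Sequential(nn.Linear(2, 32), nn.ReLU(), nn.Linear(32, 4))  # logits over 4 actions
value_net = nn.Sequential(nn.Linear(2, 32), nn.ReLU(), nn.Linear(32, 1))
policy_opt = torch.optim.AdamW(policy_net.parameters(), lr=1e-3)
value_opt = torch.optim.AdamW(value_net.parameters(), lr=1e-3)

def ac_step(state, next_state, action, reward, done):
    # The one-step TD error serves both as the critic's regression error
    # and as the actor's advantage estimate.
    s = torch.as_tensor(state, dtype=torch.float32)
    s_next = torch.as_tensor(next_state, dtype=torch.float32)
    v = value_net(s)
    with torch.no_grad():
        v_next = torch.zeros(1) if done else value_net(s_next)
        target = reward + GAMMA * v_next
    td_error = target - v

    # Critic: regress V(s) toward the bootstrapped target.
    critic_loss = td_error.pow(2).mean()
    value_opt.zero_grad()
    critic_loss.backward()
    value_opt.step()

    # Actor: scale the log-probability of the taken action by the advantage.
    dist = Categorical(logits=policy_net(s))
    actor_loss = (-dist.log_prob(torch.tensor(action)) * td_error.detach()).mean()
    policy_opt.zero_grad()
    actor_loss.backward()
    policy_opt.step()

# Inside the episode loop, one such update would run after each grid step:
# ac_step(s, s_next, a, r, done)

Detaching the TD error before computing the actor loss keeps the critic's gradient from leaking into the policy update, which is the standard separation in one-step actor-critic.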