async def index(algo: str):
    env = GridWorldEnv(grid_size, mode="random")
    env.reset()
    player, win, pit, wall = GridWorldBase.get_item_positions(env.state)
    model = models[algo]
    predictions = model.predict(env)
    return {
        "state": {"player": player, "wall": wall, "win": win, "pit": pit},
        "grid_size": grid_size,
        "predictions": predictions,
    }
def rollout(state: np.ndarray, attempt: int = 0, max_steps: int = None, **env_kwargs) -> float:
    if attempt >= max_steps:
        return 0
    if GridWorldEnv.is_done(state):
        if GridWorldEnv.has_won(state):
            return 1
        return 0
    env_ = GridWorldEnv(**env_kwargs)
    env_.reset()
    env_.set_state(state)
    state, _, _, _ = env_.step(np.random.choice(range(4)))
    return rollout(state, attempt + 1, max_steps, **env_kwargs)
def play(model, cfg):
    env = GridWorldEnv(cfg.grid_size, cfg.env_mode)
    env.reset()
    env.render()
    step = 0
    while not env.done and step < cfg.max_steps:
        x = GWPgModel.convert_inputs([env])
        yh = model(x)
        yh = F.softmax(yh, 1)
        action = yh[0].argmax(0)
        _, reward, done, _ = env.step(action)
        env.render()
        step += 1
def rollout(state: np.ndarray, attempt: int = 0) -> float:
    if attempt >= MAX_STEPS:
        return 0
    if env.is_done(state):
        if env.has_won(state):
            return 1
        return 0
    env_ = GridWorldEnv(size, mode)
    env_.reset()
    env_.set_state(state)
    state, _, _, _ = env_.step(np.random.choice(range(4)))
    return rollout(state, attempt + 1)
def __init__(self, parent=None, action: int = None):
    self.parent = parent
    self.children = None
    self.action = action
    self.n = 0
    self.wins = 0
    self.is_terminal = False

    temp_env = GridWorldEnv(size, mode)
    temp_env.reset()
    self.state: np.ndarray = (copy.deepcopy(parent.state)
                              if parent is not None else temp_env.state)

    if action is not None:
        temp_env.set_state(self.state)
        self.state, _, self.is_terminal, _ = temp_env.step(action)
async def step(algo: str, data: StepData):
    env = GridWorldEnv(grid_size, mode="random")
    env.reset()
    env.state = dict(data.positions)
    state, reward, done, info = env.step(data.action)
    model = models[algo]
    predictions = model.predict(env)
    player, win, pit, wall = GridWorldBase.get_item_positions(state)
    return {
        "reward": reward,
        "done": done,
        "info": info,
        "predictions": predictions,
        "state": {"player": player, "wall": wall, "win": win, "pit": pit},
    }
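# A minimal sketch of the StepData request body the step endpoint above expects.
# This is an assumption inferred from usage: the handler only needs
# `data.positions` to be convertible into a dict of item positions and
# `data.action` to be an integer action index; the exact coordinate type and
# action ordering are guesses, not the project's actual schema.
from typing import Dict, Tuple

from pydantic import BaseModel


class StepData(BaseModel):
    # e.g. {"player": (0, 1), "win": (3, 3), "pit": (2, 2), "wall": (1, 1)}
    positions: Dict[str, Tuple[int, int]]
    # action index in range(4)
    action: int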
def __init__(self, **config):
    self.cfg = OmegaConf.create(config)
    self.model = (GWPgModel(
        self.cfg.grid_size,
        [self.cfg.units for _ in range(self.cfg.depth)]).double().to(device))
    self.optim = torch.optim.Adam(self.model.parameters(), lr=self.cfg.lr)
    self.writer = SummaryWriter(
        f"{CWD}/runs/gw_policy_grad_LR{str(self.cfg.lr)[:7]}_{self.cfg.depth}x{self.cfg.units}_{int(datetime.now().timestamp())}"
    )
    self.envs = [
        GridWorldEnv(size=self.cfg.grid_size, mode=self.cfg.env_mode)
        for _ in range(self.cfg.n_env)
    ]
    self.stats_e = []
    self.won = []
    self.current_episode = 1
    self.reset_episode()
    self.writer.add_graph(self.model, GWPgModel.convert_inputs(self.envs))
def __init__(self):
    super().__init__(GridWorldEnv(size=4, mode="random"))
def expand(node: MctsNode) -> MctsNode:
    if GridWorldEnv.is_done(node.state):
        return node
    if node.children is None:
        node.children = [MctsNode(node, action) for action in range(4)]
    return np.random.choice(node.children)
def main():
    # ----------------- Hyper params -------------------
    # Env params
    GRID_SIZE = 4
    ENV_MODE = "random"

    # TRAINING_PARAMS
    EPOCHS = 10000
    BATCH_SIZE = 1
    MAX_MONTE_CARLO_STEPS = 50
    N_TRAIN_STEP = 4
    ARCHITECTURE = [50, 50]
    GAMMA_RETURNS = 0.75
    GAMMA_CREDITS = 0.75
    LEARNING_RATE = 1e-3

    # -------------- Setup other variables ----------------
    model = GwAcModel(GRID_SIZE, ARCHITECTURE).double().to(device)
    optim = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
    envs = [
        GridWorldEnv(size=GRID_SIZE, mode=ENV_MODE) for _ in range(BATCH_SIZE)
    ]
    global_step = 0
    timestamp = int(datetime.now().timestamp())
    writer = SummaryWriter(
        f"{CWD}/runs/gw_ac_LR{str(LEARNING_RATE)[:7]}_{timestamp}")

    # Add model graph
    envs[0].reset()
    writer.add_graph(model, GwAcModel.convert_inputs(envs[:1]))

    # -------------- Training loop ----------------
    for epoch in range(EPOCHS):
        [env.reset() for env in envs]
        stats: List[List[Dict[str, Union[torch.Tensor, float]]]] = [[] for _ in envs]
        step = 0
        episode_rewards = []
        tree = MctsNode()
        expand(tree)

        # -------------- Monte Carlo Loop ---------------------
        while True:
            """
            1. predict for node when you create the node
            2. During expansion, filter the children
            3. During selection, only go deep enough
            """
            # ----------- Predict policy and value -------------
            states = GwAcModel.convert_inputs(envs)
            policy, value = model(states)  # Shapes: policy: (batch, 4); value: (batch, 1)

            # ------------ Sample actions -----------------
            tau = max((1 / (np.log(epoch) + 0.0001) * 5), 0.7)
            writer.add_scalar("tau", tau, global_step=global_step)
            policy = F.gumbel_softmax(policy, tau=tau, dim=1)
            # squeeze(1) keeps the batch dimension even when BATCH_SIZE == 1
            actions = torch.multinomial(policy, 1).squeeze(1)  # shape: (batch)

            # ------------- Rewards from step ----------------
            for i in range(BATCH_SIZE):
                if not envs[i].done:
                    _, reward, _, _ = envs[i].step(actions[i])
                    stats[i].append({
                        "reward": reward,
                        "value": value[i][0],
                        "policy": policy[i][actions[i]],
                    })
                    episode_rewards.append(reward)

            # -------------- Termination conditions ------------------
            all_done = all([env.done for env in envs])
            has_timed_out = step >= MAX_MONTE_CARLO_STEPS
            n_step_ended = step % N_TRAIN_STEP == 0

            if has_timed_out:
                # Set unfinished env's reward to -10
                for i in range(BATCH_SIZE):
                    if not envs[i].done:
                        stats[i][-1]["reward"] = -10
                all_done = True

            if all_done or n_step_ended:
                # ----------- Add last state's value -------------
                states = GwAcModel.convert_inputs(envs)
                model.eval()
                with torch.no_grad():
                    _, value = model(states)
                model.train()
                for i in range(BATCH_SIZE):
                    stats[i].append({"value": value[i][0]})

                # -------------- LEARN -----------------
                # loss = naive_ac_loss(stats, GAMMA_RETURNS, GAMMA_CREDITS)
                loss = advantage_ac_loss(stats, GAMMA_RETURNS)
                optim.zero_grad()
                loss.backward()
                optim.step()

                # ------------ Logging ----------------
                writer.add_scalar("Training loss", loss.item(), global_step=global_step)
                global_step += 1

                # Clean up
                stats = [[] for _ in envs]

            if all_done:
                break

            writer.add_scalar("Mean Rewards", np.mean(episode_rewards), global_step=global_step)
            step += 1

        print(".", end="")

    save_model(model, CWD, "grid_world_ac")
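# A hedged sketch of what advantage_ac_loss could look like, inferred from how
# `stats` is built in the loop above: per env, a list of
# {"reward", "value", "policy"} entries followed by one trailing {"value"}
# entry used to bootstrap the return. The actor term weights the log-probability
# of the taken action by the advantage, and the critic term regresses the value
# toward the bootstrapped return; the 1:1 weighting of the two terms is an
# assumption, not necessarily what the project uses.
import torch


def advantage_ac_loss(stats, gamma_returns: float) -> torch.Tensor:
    losses = []
    for episode in stats:
        if len(episode) < 2:
            continue
        # Bootstrap from the critic's estimate of the last observed state.
        ret = episode[-1]["value"].detach()
        for entry in reversed(episode[:-1]):
            ret = entry["reward"] + gamma_returns * ret
            advantage = ret - entry["value"]
            actor_loss = -torch.log(entry["policy"] + 1e-8) * advantage.detach()
            critic_loss = advantage ** 2
            losses.append(actor_loss + critic_loss)
    return torch.stack(losses).mean()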
import numpy as np
import copy
import torch

from gym_grid_world.envs import GridWorldEnv

size = 4
mode = "static"
MAX_STEPS = 50
env = GridWorldEnv(size=size, mode=mode)


class MctsNode:
    def __init__(self, parent=None, action: int = None):
        self.parent = parent
        self.children = None
        self.action = action
        self.n = 0
        self.wins = 0
        self.is_terminal = False

        temp_env = GridWorldEnv(size, mode)
        temp_env.reset()
        self.state: np.ndarray = (copy.deepcopy(parent.state)
                                  if parent is not None else temp_env.state)

        if action is not None:
            temp_env.set_state(self.state)
            self.state, _, self.is_terminal, _ = temp_env.step(action)
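# A hedged sketch of the two MCTS pieces not shown above (selection and
# backpropagation), tying together MctsNode, expand, and rollout. The UCT
# formula is the standard one; the exploration constant C and the search loop
# in the trailing comment are assumptions about how the tree is meant to be used.
import math

C = math.sqrt(2)


def uct(node: MctsNode) -> float:
    if node.n == 0:
        return float("inf")  # visit unexplored children first
    return node.wins / node.n + C * math.sqrt(math.log(node.parent.n) / node.n)


def select(node: MctsNode) -> MctsNode:
    # Walk down the tree, following the highest-UCT child until reaching a leaf
    # (no children yet) or a terminal state.
    while node.children is not None and not node.is_terminal:
        node = max(node.children, key=uct)
    return node


def backpropagate(node: MctsNode, win: float) -> None:
    # Credit the rollout result to every node on the path back to the root.
    while node is not None:
        node.n += 1
        node.wins += win
        node = node.parent


# One search iteration would then look roughly like:
#   leaf = select(root); child = expand(leaf)
#   backpropagate(child, rollout(child.state))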
def main_single_batch():
    # ============= INITIALIZE VARIABLES ===================
    grid_size = 4
    epsilon = 0.1
    gamma = 0.7
    n_episodes = 5000
    max_steps = 50
    replay_batch_size = 199
    max_buffer_size = 1000
    lr = 0.01
    mode = "random"
    architecture = [50]

    writer = SummaryWriter(
        f"{CWD}/runs/gw_PER_q_LR{str(lr)[:7]}_{mode}_{int(datetime.now().timestamp())}"
    )
    env = GridWorldEnv(size=grid_size, mode=mode)
    experiences = PrioritizedReplay(max_size=max_buffer_size)
    model = GWPgModel(size=grid_size, units=architecture).double().to(device)
    optim = torch.optim.Adam(model.parameters(), lr=lr)

    # ============== TRAINING LOOP ===========================
    for epoch in range(n_episodes):
        env.reset()
        step = 0
        losses = []
        all_rewards = []

        while not env.done and step < max_steps:
            # Store state for experience replay
            state = env.state

            # =============== Collect experiences =================
            envs = [env]
            exp_samples = experiences.sample(replay_batch_size)
            for exp in exp_samples:
                new_env = GridWorldEnv(size=grid_size, mode=mode)
                new_env.reset()
                new_env.state = state_to_dict(exp[1])
                envs.append(new_env)

            x = model.convert_inputs(envs)
            x = x + torch.rand_like(x) / 100
            # =======================================================
            y = model(x)

            rewards = []
            qhs = []
            for i in range(len(envs)):
                # =========== Epsilon Probability ==============
                use_rand = torch.rand(1)[0] < epsilon
                action = (
                    torch.randint(0, 4, (1,))[0] if use_rand else torch.argmax(y[i], 0)
                )
                qh = y[i][action]

                # ============ Observe the reward && predict value of next state ==============
                _, reward, _, _ = envs[i].step(int(action))
                rewards.append(reward)
                qhs.append(qh)

            rewards = torch.tensor(rewards).double().to(device)
            qhs = torch.stack(qhs)

            with torch.no_grad():
                model.eval()
                x_next = model.convert_inputs(envs)
                y_next = model(x_next)
                q_next, _ = torch.max(y_next, dim=1)
                model.train()

            q = rewards + gamma * q_next

            # =========== LEARN ===============
            loss = (qhs - q) ** 2
            experiences.put([(loss[0].item(), state)])
            losses.append(loss[0].item())
            all_rewards.append(rewards[0].item())

            loss = torch.mean(loss)
            optim.zero_grad()
            loss.backward()
            optim.step()
            step += 1

        writer.add_scalar("loss", np.mean(losses), global_step=epoch)
        writer.add_scalar("reward", np.mean(all_rewards), global_step=epoch)
        writer.add_scalar("episode_len", len(losses), global_step=epoch)
        print(".", end="")
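# A minimal sketch of the PrioritizedReplay interface this loop relies on:
# put() takes (priority, state) tuples and sample(n) draws up to n of them with
# probability proportional to priority. The real implementation may differ
# (e.g. a sum-tree, priority exponents, importance-sampling weights).
import numpy as np


class PrioritizedReplay:
    def __init__(self, max_size: int = 1000):
        self.max_size = max_size
        self.buffer = []  # list of (priority, state) tuples

    def put(self, items):
        self.buffer.extend(items)
        # Keep only the highest-priority experiences once the buffer is full.
        self.buffer.sort(key=lambda item: item[0], reverse=True)
        self.buffer = self.buffer[: self.max_size]

    def sample(self, n: int):
        if not self.buffer:
            return []
        n = min(n, len(self.buffer))
        priorities = np.array([p for p, _ in self.buffer], dtype=np.float64) + 1e-8
        probs = priorities / priorities.sum()
        idx = np.random.choice(len(self.buffer), size=n, replace=False, p=probs)
        return [self.buffer[i] for i in idx]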
def main_single_batch():
    # ============= INITIALIZE VARIABLES ===================
    grid_size = 4
    epsilon = 0.1
    gamma = 0.95
    n_episodes = 15000
    max_steps = 50
    replay_batch_size = 199
    max_buffer_size = 1000
    sync_freq = 250  # NEW
    lr = 0.001
    mode = "random"
    architecture = [50]

    writer = SummaryWriter(
        f"{CWD}/runs/gw_target_network_q_LR{str(lr)[:7]}_{mode}_{int(datetime.now().timestamp())}"
    )
    env = GridWorldEnv(size=grid_size, mode=mode)
    experiences = PrioritizedReplay(max_size=max_buffer_size)
    model = GWPgModel(size=grid_size, units=architecture).double().to(device)
    model2 = deepcopy(model)
    model2.load_state_dict(model.state_dict())
    model2.eval()
    optim = torch.optim.Adam(model.parameters(), lr=lr)
    global_step = 0

    # ============== TRAINING LOOP ===========================
    for epoch in range(n_episodes):
        env.reset()
        step = 0

        while not env.done and step <= max_steps:
            # Store state for experience replay
            state = env.state

            # =============== Collect experiences =================
            envs = [env]
            exp_samples = experiences.sample(replay_batch_size)
            for exp in exp_samples:
                new_env = GridWorldEnv(size=grid_size, mode=mode)
                new_env.reset()
                new_env.state = state_to_dict(exp[1])
                envs.append(new_env)

            x = model.convert_inputs(envs)
            x = x + torch.rand_like(x) / 100
            # =======================================================
            y = model(x)

            rewards = []
            qhs = []
            for i in range(len(envs)):
                # =========== Epsilon Probability ==============
                use_rand = torch.rand(1)[0] < epsilon
                action = (
                    torch.randint(0, 4, (1,))[0] if use_rand else torch.argmax(y[i], 0)
                )
                qh = y[i][action]

                # ============ Observe the reward && predict value of next state ==============
                _, reward, _, _ = envs[i].step(int(action))
                rewards.append(reward)
                qhs.append(qh)

            if step >= max_steps:
                rewards[0] = -10
                env.done = True

            rewards = torch.tensor(rewards).double().to(device)
            qhs = torch.stack(qhs)

            x_next = model2.convert_inputs(envs)
            with torch.no_grad():
                y_next = model2(x_next)
                q_next, _ = torch.max(y_next, dim=1)

            q = rewards + gamma * q_next

            # =========== LEARN ===============
            loss = (qhs - q) ** 2
            loss = torch.mean(loss)
            optim.zero_grad()
            loss.backward()
            optim.step()
            step += 1
            global_step += 1

            if global_step % sync_freq == 0:
                model2.load_state_dict(model.state_dict())

            writer.add_scalar("loss", loss.item(), global_step=global_step)
            writer.add_scalar("reward", rewards[0].item(), global_step=global_step)
            # writer.add_scalar("episode_len", step, global_step=global_step)

        print(".", end="")

    save_model(model, CWD + "/..", "grid_world_q")
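# A hedged guess at the state_to_dict helper used by both replay-based loops
# above: it turns a stored grid state back into the {item: position} dict that
# env.state accepts, reusing GridWorldBase.get_item_positions (the same helper
# the API endpoints use). The actual helper may decode the state differently.
def state_to_dict(state):
    player, win, pit, wall = GridWorldBase.get_item_positions(state)
    return {"player": player, "win": win, "pit": pit, "wall": wall}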
gamma_returns = 0.80
gamma_credits = 0.95
total_episodes = 1000
n_env = 50
max_steps = 100
grid_size = 4
env_mode = "random"
# -----------------------------------------------------
model = GwAcModel(grid_size, [units for _ in range(depth)]).double().to(device)
optim = torch.optim.Adam(model.parameters(), lr=lr)
envs = [GridWorldEnv(size=grid_size, mode=env_mode) for _ in range(n_env)]
current_episode = 1
stats_e = None
won = None

writer = SummaryWriter(
    f"{CWD}/runs/gw_policy_grad_LR{str(lr)[:7]}_{depth}x{units}_{int(datetime.now().timestamp())}"
)
envs[0].reset()
writer.add_graph(model, GwAcModel.convert_inputs(envs[:1]))
# -----------------------------------------------------
def main_single_batch():
    # ============= INITIALIZE VARIABLES ===================
    grid_size = 4
    epsilon = 0.1
    gamma = 0.9
    n_episodes = 10000
    max_steps = 150
    lr = 0.01
    mode = "player"

    writer = SummaryWriter(
        f"{CWD}/runs/gw_vanilla_q_LR{str(lr)[:7]}_{mode}_{int(datetime.now().timestamp())}"
    )
    env = GridWorldEnv(size=grid_size, mode=mode)
    model = GWPgModel(size=grid_size, units=[10]).double().to(device)
    optim = torch.optim.Adam(model.parameters(), lr=lr)

    # ============== TRAINING LOOP ===========================
    for epoch in range(n_episodes):
        env.reset()
        step = 0
        losses = []
        rewards = []

        # Monte Carlo loop
        while not env.done and step < max_steps:
            x = model.convert_inputs([env])
            y = model(x)

            # =========== Epsilon Probability ==============
            if torch.rand(1) < epsilon:
                action = torch.randint(0, 4, (1, ))
                qh = y[0][action]
            else:
                action = torch.argmax(y, 1)
                qh = y[0][action]

            # ============ Observe the reward && predict value of next state ==============
            _, reward, _, _ = env.step(int(action))
            with torch.no_grad():
                model.eval()
                q_next, _ = torch.max(model(model.convert_inputs([env]))[0], dim=0)
                model.train()
            q = reward + gamma * q_next

            # =========== LEARN ===============
            loss = (qh - q)**2
            losses.append(loss.item())
            rewards.append(reward)
            optim.zero_grad()
            loss.backward()
            optim.step()
            step += 1

        writer.add_scalar("loss", np.mean(losses), global_step=epoch)
        writer.add_scalar("reward", np.mean(rewards), global_step=epoch)
        writer.add_scalar("episode_len", len(losses), global_step=epoch)
        print(".", end="")