import time

import numpy as np
import matplotlib.pyplot as plt

import test_env
# prepare_dataset, train_NN, random_start, play_episode and play_optimal are
# assumed to be defined elsewhere in the project.


def supervised_training(net, lr, n_epochs, n_samples, game_params, get_probs=False):
    """Create a dataset of optimal (state, action) pairs and train `net` on it."""
    env = test_env.Sandbox(**game_params)

    print("\nCreating dataset...")
    state_set, action_set = create_action_state_set(game_params, size=n_samples, get_probs=get_probs)
    train_loader, val_loader, test_loader = prepare_dataset(state_set, action_set, 0.8, 0.2)
    dataloader_dict = dict(train_loader=train_loader, val_loader=val_loader, test_loader=test_loader)

    print("\nTraining network...")
    net, train_loss, val_loss = train_NN(net, lr, n_epochs, train_loader, val_loader,
                                         return_model=True, KL_loss=get_probs)

    return net, train_loss, val_loss, dataloader_dict, state_set, action_set, env
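
# A minimal usage sketch (illustrative, not part of the original module): the
# game_params keys mirror the Sandbox arguments used in render() below, and
# `net` is whatever model type train_NN expects; max_steps as a key is an assumption.
def example_supervised_run(net, lr=1e-3, n_epochs=20, n_samples=10000):
    game_params = dict(x=10, y=10, initial=[0, 0], goal=[9, 9], max_steps=50)
    return supervised_training(net, lr, n_epochs, n_samples, game_params, get_probs=False)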
def train_sandbox(agent, game_params, n_episodes=1000, max_steps=120, return_agent=False, random_init=True):
    performance = []
    steps_to_solve = []
    time_profile = []
    critic_losses = []
    actor_losses = []
    entropies = []

    for e in range(n_episodes):
        if random_init:
            # Sample a new initial position and goal for this episode
            initial, goal = random_start(game_params["x"], game_params["y"])
            game_params["initial"] = initial
            game_params["goal"] = goal

        t0 = time.time()
        env = test_env.Sandbox(**game_params)
        rewards, log_probs, distributions, states, done, bootstrap = play_episode(agent, env, max_steps)
        t1 = time.time()  # time spent playing the episode

        performance.append(np.sum(rewards))
        steps_to_solve.append(len(rewards))
        if (e + 1) % 10 == 0:
            print("Episode %d - reward: %.2f - steps to solve: %.2f"
                  % (e + 1, np.mean(performance[-10:]), np.mean(steps_to_solve[-10:])))

        critic_loss, actor_loss, entropy = agent.update(rewards, log_probs, distributions, states, done, bootstrap)
        critic_losses.append(critic_loss)
        actor_losses.append(actor_loss)
        entropies.append(entropy)
        t2 = time.time()  # time spent updating the agent

        time_profile.append([t1 - t0, t2 - t1])

    performance = np.array(performance)
    time_profile = np.array(time_profile)
    steps_to_solve = np.array(steps_to_solve)
    L = n_episodes // 6  # consider last sixth of episodes to compute agent's asymptotic performance
    losses = dict(critic_losses=critic_losses, actor_losses=actor_losses, entropies=entropies)

    if return_agent:
        return performance, performance[-L:].mean(), performance[-L:].std(), agent, time_profile, losses, steps_to_solve
    else:
        return performance, performance[-L:].mean(), performance[-L:].std(), losses, steps_to_solve
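
# A minimal sketch (not in the original module) of how the arrays returned by
# train_sandbox can be smoothed for inspection; the window size is arbitrary.
def smooth_performance(performance, steps_to_solve, window=10):
    """Return moving averages of episode rewards and episode lengths."""
    kernel = np.ones(window) / window
    smooth_reward = np.convolve(performance, kernel, mode="valid")
    smooth_steps = np.convolve(steps_to_solve, kernel, mode="valid")
    return smooth_reward, smooth_steps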
def create_action_state_set(game_params, size=10000, get_probs=False):
    """Collect (state, action) pairs from optimal play until `size` samples are gathered."""
    action_memory = []
    state_memory = []
    while len(action_memory) < size:
        # Change game params: sample a new initial position and goal
        initial, goal = random_start(game_params["x"], game_params["y"])
        game_params["initial"] = initial
        game_params["goal"] = goal

        env = test_env.Sandbox(**game_params)
        actions, states = play_optimal(env, get_probs)
        action_memory += actions
        state_memory += states

    # The last episode may overshoot `size`, so truncate both memories
    return np.array(state_memory[:size]), np.array(action_memory[:size])
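
# A quick sanity check (illustrative, not part of the original module): states
# and actions are aligned sample-by-sample, so both arrays share the same first
# dimension. game_params is copied because create_action_state_set mutates it.
def check_dataset(game_params, size=1000):
    states, actions = create_action_state_set(dict(game_params), size=size)
    assert len(states) == len(actions) == size
    print("states:", states.shape, "- actions:", actions.shape)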
def render(agent=None, env=None, save=False, x=10, y=10, goal=[9, 9], initial=[0, 0], greedy=True):
    fig = plt.figure(figsize=(8, 6))

    # Initialize environment if none is provided
    if env is None:
        env = test_env.Sandbox(x, y, initial, goal, max_steps=50)

    # Draw the initial map and mark the goal and agent cells
    rgb_map = np.full((env.boundary[0], env.boundary[1], 3), [199, 234, 70]) / 255.
    rgb_map[env.goal[0], env.goal[1], :] = np.array([255, 255, 255]) / 255.
    rgb_map[env.initial[0], env.initial[1], :] = np.array([225, 30, 100]) / 255.
    plt.imshow(rgb_map)  # show map
    plt.title("Sandbox Env - Turn: %d" % 0)
    plt.yticks([])
    plt.xticks([])
    fig.show()
    time.sleep(0.75)  # slow down for visualization purposes
    if save:
        plt.savefig('.raw_gif/turn%.3d.png' % 0)

    # Run one episode
    state = env.reset()
    for step in range(env.max_steps):
        if agent is None:
            action = env.get_optimal_action()
        else:
            action, log_prob, probs = agent.get_action(state, return_log=True)
            if greedy:
                probs = probs.squeeze().cpu().detach().numpy()
                action = np.argmax(probs)

        new_state, reward, terminal, info = env.step(action)  # gym standard step's output

        plt.cla()  # clear current axis from previous drawings -> prevents matplotlib from slowing down
        rgb_map = np.full((env.boundary[0], env.boundary[1], 3), [199, 234, 70]) / 255.
        rgb_map[env.goal[0], env.goal[1], :] = np.array([255, 255, 255]) / 255.
        rgb_map[env.state[0], env.state[1], :] = np.array([225, 30, 100]) / 255.
        plt.imshow(rgb_map)
        plt.title("Sandbox Env - Turn: %d" % (step + 1))
        plt.yticks([])  # remove y ticks
        plt.xticks([])  # remove x ticks
        fig.canvas.draw()  # update the figure
        time.sleep(0.5)  # slow down for visualization purposes
        if save:
            plt.savefig('.raw_gif/turn%.3d.png' % (step + 1))

        if terminal:
            break
        state = new_state

    return
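
# A minimal demo entry point (an assumption: the original file may not define
# one). Running the module directly renders one episode of the built-in optimal
# policy on the default 10x10 grid; it requires an interactive matplotlib backend.
if __name__ == "__main__":
    render()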