Code Example #1
import time
import numpy as np
import matplotlib.pyplot as plt
import test_env  # custom module providing the Sandbox environment (shared by all examples in this section)


def supervised_training(net, lr, n_epochs, n_samples, game_params, get_probs=False):
    """Pre-train the network on (state, optimal action) pairs generated by the environment."""
    env = test_env.Sandbox(**game_params)

    # Build a supervised dataset from optimal play and split it into train/val/test loaders
    print("\nCreating dataset...")
    state_set, action_set = create_action_state_set(game_params, size=n_samples, get_probs=get_probs)
    train_loader, val_loader, test_loader = prepare_dataset(state_set, action_set, 0.8, 0.2)
    dataloader_dict = dict(train_loader=train_loader,
                           val_loader=val_loader,
                           test_loader=test_loader)

    # Fit the network; KL_loss follows get_probs (distribution targets vs. hard action labels)
    print("\nTraining network...")
    net, train_loss, val_loss = train_NN(net, lr, n_epochs, train_loader, val_loader,
                                         return_model=True, KL_loss=get_probs)

    return net, train_loss, val_loss, dataloader_dict, state_set, action_set, env
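
A minimal usage sketch (the network class name PolicyNet, the learning rate, and the grid size below are illustrative assumptions; create_action_state_set is the helper shown in Code Example #3):

# Hypothetical usage of supervised_training (assumed network class and hyperparameters)
game_params = dict(x=10, y=10, initial=[0, 0], goal=[9, 9], max_steps=50)
net = PolicyNet()  # assumed policy network compatible with train_NN
net, train_loss, val_loss, loaders, states, actions, env = supervised_training(
    net, lr=1e-3, n_epochs=20, n_samples=10000, game_params=game_params, get_probs=False)
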
Code Example #2
def train_sandbox(agent, game_params, n_episodes=1000, max_steps=120, return_agent=False, random_init=True):
    """Train the agent on the Sandbox environment, optionally randomizing start/goal every episode."""
    performance = []
    steps_to_solve = []
    time_profile = []
    critic_losses = []
    actor_losses = []
    entropies = []

    for e in range(n_episodes):

        if random_init:
            # Resample the initial and goal positions for this episode
            initial, goal = random_start(game_params["x"], game_params["y"])
            game_params["initial"] = initial
            game_params["goal"] = goal

        t0 = time.time()
        env = test_env.Sandbox(**game_params)
        rewards, log_probs, distributions, states, done, bootstrap = play_episode(agent, env, max_steps)
        t1 = time.time()

        performance.append(np.sum(rewards))
        steps_to_solve.append(len(rewards))
        if (e + 1) % 10 == 0:
            print("Episode %d - reward: %.2f - steps to solve: %.2f"
                  % (e + 1, np.mean(performance[-10:]), np.mean(steps_to_solve[-10:])))

        # Update the agent on the trajectory collected in this episode
        critic_loss, actor_loss, entropy = agent.update(rewards, log_probs, distributions, states, done, bootstrap)
        critic_losses.append(critic_loss)
        actor_losses.append(actor_loss)
        entropies.append(entropy)

        t2 = time.time()
        time_profile.append([t1 - t0, t2 - t1])  # [time spent playing, time spent updating]

    performance = np.array(performance)
    time_profile = np.array(time_profile)
    steps_to_solve = np.array(steps_to_solve)
    L = n_episodes // 6  # use the last sixth of the episodes to estimate asymptotic performance
    losses = dict(critic_losses=critic_losses, actor_losses=actor_losses, entropies=entropies)
    if return_agent:
        return performance, performance[-L:].mean(), performance[-L:].std(), agent, time_profile, losses, steps_to_solve
    else:
        return performance, performance[-L:].mean(), performance[-L:].std(), losses, steps_to_solve
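
A call sketch, assuming an actor-critic style agent object exposing the get_action and update methods used above (the ActorCritic class name, its constructor, and the episode count are hypothetical):

# Hypothetical usage of train_sandbox
game_params = dict(x=10, y=10, initial=[0, 0], goal=[9, 9], max_steps=50)
agent = ActorCritic()  # assumed agent implementing get_action() and update()
results = train_sandbox(agent, game_params, n_episodes=2000, max_steps=120, return_agent=True)
performance, mean_perf, std_perf, agent, time_profile, losses, steps_to_solve = results
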
Code Example #3
def create_action_state_set(game_params, size=10000, get_probs=False):
    """Collect a dataset of (state, optimal action) pairs by repeatedly playing optimal episodes."""
    action_memory = []
    state_memory = []

    while len(action_memory) < size:

        # Resample the initial and goal positions for each episode
        initial, goal = random_start(game_params["x"], game_params["y"])
        game_params["initial"] = initial
        game_params["goal"] = goal

        env = test_env.Sandbox(**game_params)

        # play_optimal returns either hard actions or action distributions, depending on get_probs
        actions, states = play_optimal(env, get_probs)
        action_memory += actions
        state_memory += states

    # Truncate to exactly `size` samples
    return np.array(state_memory[:size]), np.array(action_memory[:size])
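
A quick inspection sketch (the dataset size is illustrative; the state shape depends on how Sandbox encodes states, which is not shown here):

# Hypothetical usage of create_action_state_set
game_params = dict(x=10, y=10, initial=[0, 0], goal=[9, 9], max_steps=50)
states, actions = create_action_state_set(game_params, size=5000, get_probs=False)
print(states.shape, actions.shape)  # one optimal action (or distribution, if get_probs=True) per stored state
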
Code Example #4
def render(agent=None,
           env=None,
           save=False,
           x=10,
           y=10,
           goal=[9, 9],
           initial=[0, 0],
           greedy=True):
    """Visualize one episode on the Sandbox grid, following either the scripted optimal policy or the agent."""
    fig = plt.figure(figsize=(8, 6))

    # Initialize the environment if none is provided
    if env is None:
        env = test_env.Sandbox(x, y, initial, goal, max_steps=50)

    # Draw the initial frame: background, goal cell (white), and starting position
    rgb_map = np.full((env.boundary[0], env.boundary[1], 3), [199, 234, 70]) / 255.
    rgb_map[env.goal[0], env.goal[1], :] = np.array([255, 255, 255]) / 255.
    rgb_map[env.initial[0], env.initial[1], :] = np.array([225, 30, 100]) / 255.
    plt.imshow(rgb_map)
    plt.title("Sandbox Env - Turn: %d" % 0)
    plt.yticks([])
    plt.xticks([])
    fig.show()
    time.sleep(0.75)  # slow down for visualization purposes
    if save:
        plt.savefig('.raw_gif/turn%.3d.png' % 0)

    # Run one episode
    state = env.reset()
    for step in range(env.max_steps):
        if agent is None:
            action = env.get_optimal_action()
        else:
            action, log_prob, probs = agent.get_action(state, return_log=True)
            if greedy:
                probs = probs.squeeze().cpu().detach().numpy()
                action = np.argmax(probs)

        new_state, reward, terminal, info = env.step(action)  # gym-style step output

        plt.cla()  # clear the previous drawing so matplotlib does not slow down
        rgb_map = np.full((env.boundary[0], env.boundary[1], 3), [199, 234, 70]) / 255.
        rgb_map[env.goal[0], env.goal[1], :] = np.array([255, 255, 255]) / 255.
        rgb_map[env.state[0], env.state[1], :] = np.array([225, 30, 100]) / 255.
        plt.imshow(rgb_map)
        plt.title("Sandbox Env - Turn: %d" % (step + 1))
        plt.yticks([])  # remove y ticks
        plt.xticks([])  # remove x ticks
        fig.canvas.draw()  # update the figure
        time.sleep(0.5)  # slow down for visualization purposes
        if save:
            plt.savefig('.raw_gif/turn%.3d.png' % (step + 1))

        if terminal:
            break
        state = new_state

    return
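
A rendering sketch; trained_agent is a placeholder for any agent returned by train_sandbox with return_agent=True:

# Hypothetical usage of render
render()                                             # follow the environment's scripted optimal policy on the default 10x10 grid
render(agent=trained_agent, save=True, greedy=True)  # greedy rollout of a trained agent, saving frames to .raw_gif/
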