# Standard-library and NumPy imports used below; Gym, plot and plot_episode
# are project-specific helpers assumed to be defined elsewhere in the project.
from collections import deque, namedtuple
import pickle
import time

import numpy as np


def verify_network(agent, utils, config):
    """
    For plotting - plot the math mean error versions along with the agent mean
    """
    experience = namedtuple('experience',
                            field_names=[
                                'state', 'value', 'log_prob', 'action',
                                'reward', 'next_state'
                            ])
    # Create two instances of gyms
    gym_network = Gym(config.total_routes, config.num_reset_routes, utils)
    gym_math = Gym(config.total_routes, config.num_reset_routes, utils)
    # tic = time.time()
    # Load weights of trained model
    agent.load_weights(config.checkpoint_path)

    # Collections
    math_means = []
    math_stds = []
    means = []
    stds = []
    scores_window = deque(maxlen=100)
    math_window = deque(maxlen=100)
    for e in range(1, 11):
        trajectory = []
        rewards = []
        math_rewards = []
        state = gym_network.reset()
        math_state = gym_math.reset()
        math_loss = [gym_math.loss]
        network_loss = [gym_network.loss]
        for t in range(config.tmax):
            suggestion, log_prob, value = agent.act(state)
            route = utils.route_from_suggestion(suggestion)
            next_state, reward = gym_network.step(route)
            # Compare with math
            math_route = utils.deterministic_route(math_state)
            math_next_state, math_reward = gym_math.step(math_route)
            math_rewards.append(math_reward)
            math_loss.append(gym_math.loss)
            math_state = math_next_state  # advance the math baseline to its next state
            # Record (s,a,r,s)
            exp = experience(state, value, log_prob, suggestion, reward,
                             next_state)
            trajectory.append(exp)
            rewards.append(reward)
            network_loss.append(gym_network.loss)
            state = next_state
        scores_window.append(np.sum(rewards))
        means.append(np.mean(scores_window))
        stds.append(np.std(scores_window))
        math_window.append(np.sum(math_rewards))
        math_means.append(np.mean(math_window))
        math_stds.append(np.std(math_window))
        # Compare network vs math
        if e == 1:
            plot_episode(math_loss, name="Math single episode")
            plot_episode(network_loss, name="Network single episode")
        if e % 5 == 0:
            plot(means, stds, name=config.name, game='RouteMuse')
            plot(math_means, math_stds, name='Math', game='RouteMuse')
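

# A hypothetical driver for verify_network (illustrative only): the Config
# namedtuple below simply mirrors the attributes the function reads
# (total_routes, num_reset_routes, tmax, checkpoint_path, name); the real
# project presumably supplies its own config object, so treat the names and
# values here as placeholders.
def run_verification(agent, utils, checkpoint_path='checkpoints/routemuse.ckpt'):
    Config = namedtuple('Config', [
        'total_routes', 'num_reset_routes', 'tmax', 'checkpoint_path', 'name'
    ])
    # Placeholder values; match them to whatever the checkpoint was trained with.
    config = Config(total_routes=50, num_reset_routes=5, tmax=100,
                    checkpoint_path=checkpoint_path, name='RouteMuse agent')
    verify_network(agent, utils, config)
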
def train_network(agent, utils, config):
    """
	For plotting - plot the math mean error versions along with the agent mean
	"""
    experience = namedtuple('experience',
                            field_names=[
                                'state', 'value', 'log_prob', 'action',
                                'reward', 'next_state'
                            ])
    # Compared training with math baseline
    gym = Gym(config.total_routes, config.num_reset_routes, utils)
    math_gym = Gym(config.total_routes, config.num_reset_routes, utils)
    tic = time.time()
    # Collections
    max_mean = 0
    means = []
    stds = []
    mins = []
    maxes = []
    steps = []
    math_means = []
    math_stds = []
    math_window = deque(maxlen=100)
    scores_window = deque(maxlen=100)
    for e in range(1, config.episodes + 1):
        trajectory = []
        rewards = []
        math_rewards = []
        state = gym.reset()
        math_state = math_gym.reset()
        for t in range(config.tmax):
            # Normalize state for network
            normalized_state = normalize(state)
            suggestion, log_prob, value = agent.act(normalized_state)
            route = utils.route_from_suggestion(suggestion)
            next_state, reward = gym.step(route)
            # math comparison
            math_route = utils.deterministic_route(math_state)
            math_next_state, math_reward = math_gym.step(math_route)
            math_rewards.append(math_reward)
            math_state = math_next_state
            # Record (s,a,r,s)
            exp = experience(state, value, log_prob, suggestion, reward,
                             next_state)
            trajectory.append(exp)
            rewards.append(reward)
            state = next_state
        agent.step(trajectory)
        steps.append(t)
        math_window.append(sum(math_rewards))
        math_means.append(np.mean(math_window))
        math_stds.append(np.std(math_window))
        scores_window.append(sum(rewards))
        means.append(np.mean(scores_window))
        mins.append(np.min(scores_window))
        maxes.append(np.max(scores_window))
        stds.append(np.std(scores_window))
        if e % 10 == 0:
            toc = time.time()
            r_mean = np.mean(scores_window)
            r_max = max(scores_window)
            r_min = min(scores_window)
            r_std = np.std(scores_window)
            plot(math_means, math_stds, name="Math", game="RouteMuse")
            plot(means, stds, name=config.name, game='RouteMuse')
            print(
                "\rEpisode: {} out of {}, Steps {}, Mean steps {:.2f}, Rewards: mean {:.2f}, min {:.2f}, max {:.2f}, std {:.2f}, Elapsed {:.2f}"
                .format(e, config.episodes, np.sum(steps), np.mean(steps),
                        r_mean, r_min, r_max, r_std, (toc - tic) / 60))
            # save scores
            if r_mean > max_mean:
                pickle.dump([means, maxes, mins],
                            open(str(config.name) + '_scores.p', 'wb'))
                # save policy
                agent.save_weights(config.checkpoint_path)
                max_mean = r_mean
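

# normalize() is called in train_network but not defined in this snippet. Below
# is a minimal sketch of one plausible implementation, assuming the state is a
# numeric array that should be zero-mean / unit-variance before being fed to
# the network; the actual project may scale its state differently.
def normalize(state, eps=1e-8):
    state = np.asarray(state, dtype=np.float32)
    # eps guards against a constant state vector (std == 0)
    return (state - state.mean()) / (state.std() + eps)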