def update_random_batch(batch):
    # Attach next-action probabilities to the sampled batch for the Q-update
    # (computing_probabilities is presumably defined earlier in the source file)
    next_action_probs = computing_probabilities(batch)
    batch["next_action_probs"] = next_action_probs


def update_q_parameters(batch):
    dqn_agent.update_parameters(batch)


def compute_return(batch):
    # Use the critic's Q-values as the return estimates fed to REINFORCE
    return dqn_agent.compute_q_values(batch["states"], batch["actions"])


reward = []
for _ in trange(10000):
    batch = sampler.collect_one_batch()
    actions = batch["actions"]
    update_batch(batch)  # presumably defined earlier in the source file
    replay_buffer.add_batch(batch)
    # Train only once the buffer holds enough transitions for a full sample
    if sample_size <= replay_buffer.num_items:
        random_batch = replay_buffer.sample_batch(sample_size)
        update_random_batch(random_batch)
        update_q_parameters(random_batch)  # off-policy critic (DQN) update
        returns = compute_return(batch)  # on-policy batch, Q-values as returns
        pg_reinforce.update_parameters(batch["states"], actions, returns)
        # 200 is presumably the episode step cap (e.g. CartPole-v0), so this
        # tracks the average reward per step
        reward.append(batch["rewards"].sum() / 200)

show_image(reward)
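The loop above leans on objects built earlier in its source file. Below is a minimal sketch of that setup, reusing only the constructor signatures visible in the other examples on this page; env, q_network, state_dim, num_actions and epsilon are assumed to exist, and the TensorFlow 1.x calls are inferred from the session/optimizer/writer arguments:

import tensorflow as tf
from tqdm import trange

session = tf.Session()
optimizer = tf.train.AdamOptimizer(1e-3)
writer = tf.summary.FileWriter("/tmp/logs")

dqn_agent = DQNAgent(session, optimizer, q_network, state_dim, num_actions,
                     summary_writer=writer)
exploration_policy = EpsilonGreedyPolicy(dqn_agent, num_actions, epsilon)
sampler = Sampler(exploration_policy, env)
replay_buffer = ReplayBuffer(100000)
sample_size = 32
# pg_reinforce, computing_probabilities and update_batch come from the rest
# of the source file and are not shown on this page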
Example #2
fixed_policy = FixedPolicy()
sampler = Sampler(fixed_policy, env, discount=1, use_doubly_robust=True)


def action_masker(array):
    # One-hot encode an array of action indices into shape (array.size, num_actions)
    masked_action = np.zeros((array.size, num_actions))
    masked_action[np.arange(array.size), array] = 1
    return masked_action
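
# For example, with num_actions = 2, action_masker(np.array([0, 1, 1]))
# returns the one-hot rows:
# [[1., 0.],
#  [0., 1.],
#  [0., 1.]]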


def probs_for_next_action(array):
    # Uniform behavior probabilities (0.5 each) over the two actions
    return np.column_stack(
        (0.5 * np.ones_like(array), 0.5 * np.ones_like(array)))
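
# e.g. probs_for_next_action(np.array([1., 1.])) ->
# array([[0.5, 0.5],
#        [0.5, 0.5]])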


dqn_agent = DQNAgent(session,
                     optimizer,
                     q_network,
                     state_dim,
                     num_actions,
                     summary_writer=writer)

# Doubly robust off-policy estimator; behavior and evaluation policies are
# both the same fixed_policy here
dr = DoublyRobust(dqn_agent, fixed_policy, fixed_policy)

# With use_doubly_robust=True the sampler also returns per-episode data
episodes, batch = sampler.collect_one_batch()

for episode in episodes:
    masked_action = action_masker(np.array(episode["actions"]))
    episode["masked_actions"] = masked_action
Example #3
exploration_policy = EpsilonGreedyPolicy(dueling_agent, num_actions, epsilon)
# With epsilon=1.0 this policy always takes the greedy action (in this
# implementation epsilon is the probability of acting greedily)
greedy_policy = EpsilonGreedyPolicy(dueling_agent, num_actions, 1.0)

# Samplers (collect trajectories using the current dueling agent)
num_episodes = 10
training_sampler = Sampler(exploration_policy, env, num_episodes=num_episodes)
testing_sampler = Sampler(greedy_policy, env, num_episodes=5)

# Initialize the replay buffer
buffer_size = 100000
sample_size = 32
replay_buffer = ReplayBuffer(buffer_size)


def update_q_parameters(batch):
    dueling_agent.update_parameters(batch)


reward = []
for _ in trange(1000):
    batch = training_sampler.collect_one_batch()
    replay_buffer.add_batch(batch)
    if sample_size <= replay_buffer.num_items:
        random_batch = replay_buffer.sample_batch(sample_size)  # experience replay
        update_q_parameters(random_batch)
        # Evaluate with the greedy policy; 200 is presumably the episode step cap
        testing_batch = testing_sampler.collect_one_batch()
        reward.append(testing_batch["rewards"].sum() / 200)

show_image(reward)
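
Note the epsilon convention this example implies: passing 1.0 yields a policy that always acts greedily, so epsilon here is the probability of the greedy action rather than of a random one. A minimal sketch of a policy with that convention (an assumption for illustration, not the repo's actual class):

import numpy as np

class GreedyWithProbEpsilon:
    # Hypothetical sketch: epsilon is the probability of acting greedily,
    # matching the "epsilon=1.0 => always greedy" usage above
    def __init__(self, agent, num_actions, epsilon):
        self.agent = agent
        self.num_actions = num_actions
        self.epsilon = epsilon

    def sample_action(self, state):
        if np.random.rand() < self.epsilon:
            return self.agent.best_action(state)  # hypothetical agent method
        return np.random.randint(self.num_actions)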
Example #4
def record_progress():
    # Roll out a small batch with the current REINFORCE policy and return the
    # mean reward per episode
    batch_size = 5
    sampler = Sampler(pg_reinforce, env, batch_size=batch_size)
    batch = sampler.collect_one_batch()
    return batch["rewards"].sum() / batch_size
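
A hypothetical way to use record_progress while training, polling the mean episode reward every 100 updates (train_pg_reinforce_once stands in for whatever update step the full source performs):

progress = []
for step in range(1000):
    train_pg_reinforce_once()  # hypothetical training step
    if step % 100 == 0:
        progress.append(record_progress())
show_image(progress)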