def update_random_batch(batch):
    # Attach the next-action probabilities needed for the Q-learning target.
    next_action_probs = computing_probabilities(batch)
    batch["next_action_probs"] = next_action_probs

def update_q_parameters(batch):
    dqn_agent.update_parameters(batch)

def compute_return(batch):
    # Use the DQN's Q-values for the visited (state, action) pairs as the
    # return signal for the policy gradient update.
    return dqn_agent.compute_q_values(batch["states"], batch["actions"])

reward = []
for _ in trange(10000):
    # Collect a fresh batch of trajectories and store it in the replay buffer.
    batch = sampler.collect_one_batch()
    actions = batch["actions"]
    update_batch(batch)
    replay_buffer.add_batch(batch)

    # Once the buffer holds enough transitions, sample a random batch
    # and take a Q-learning step on it.
    if sample_size <= replay_buffer.num_items:
        random_batch = replay_buffer.sample_batch(sample_size)
        update_random_batch(random_batch)
        update_q_parameters(random_batch)

    # REINFORCE update on the freshly collected batch, using the Q-values as returns.
    returns = compute_return(batch)
    pg_reinforce.update_parameters(batch["states"], actions, returns)
    reward.append(batch["rewards"].sum() / 200)

show_image(reward)
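The plotting helper show_image is not defined in this listing. A minimal sketch of a compatible helper, assuming it simply draws the recorded reward curve with matplotlib, might look like this:

import matplotlib.pyplot as plt

def show_image(reward, title="Average reward per episode"):
    # Plot the reward curve collected during training; the values are the
    # per-iteration episode rewards normalized by 200 in the loops above.
    plt.plot(reward)
    plt.xlabel("training iteration")
    plt.ylabel("normalized reward")
    plt.title(title)
    plt.show()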
fixed_policy = FixedPolicy()
sampler = Sampler(fixed_policy, env, discount=1, use_doubly_robust=True)

def action_masker(array):
    # One-hot encode an array of action indices into shape (len(array), num_actions).
    masked_action = np.zeros((array.size, num_actions))
    masked_action[np.arange(array.size), array] = 1
    return masked_action

def probs_for_next_action(array):
    # Uniform probabilities over the two actions for every next state.
    return np.column_stack(
        (0.5 * np.ones_like(array), 0.5 * np.ones_like(array)))

dqn_agent = DQNAgent(session, optimizer, q_network, state_dim, num_actions,
                     summary_writer=writer)

# Doubly robust estimator built from the Q-network agent and the fixed policy.
dr = DoublyRobust(dqn_agent, fixed_policy, fixed_policy)

episodes, batch = sampler.collect_one_batch()
for episode in episodes:
    # Store the one-hot encoded actions on each episode for the estimator.
    masked_action = action_masker(np.array(episode["actions"]))
    episode["masked_actions"] = masked_action
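For intuition, here is a standalone example of the one-hot masking that action_masker performs, assuming num_actions = 2 (consistent with the two uniform columns returned by probs_for_next_action above); the action values are illustrative only:

import numpy as np

num_actions = 2  # assumed two-action environment

def action_masker(array):
    masked_action = np.zeros((array.size, num_actions))
    masked_action[np.arange(array.size), array] = 1
    return masked_action

actions = np.array([0, 1, 1, 0])
print(action_masker(actions))
# [[1. 0.]
#  [0. 1.]
#  [0. 1.]
#  [1. 0.]]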
exploration_policy = EpsilonGreedyPolicy(dueling_agent, num_actions, epsilon)
# Always take greedy actions according to the greedy policy
greedy_policy = EpsilonGreedyPolicy(dueling_agent, num_actions, 1.0)

# Samplers (collect trajectories using the current dueling agent)
num_episodes = 10
training_sampler = Sampler(exploration_policy, env, num_episodes=num_episodes)
testing_sampler = Sampler(greedy_policy, env, num_episodes=5)

# Initialize the replay buffer
buffer_size = 100000
sample_size = 32
replay_buffer = ReplayBuffer(buffer_size)

def update_q_parameters(batch):
    dueling_agent.update_parameters(batch)

reward = []
for _ in trange(1000):
    # Collect trajectories with the exploratory policy and store them.
    batch = training_sampler.collect_one_batch()
    replay_buffer.add_batch(batch)

    # Update the dueling agent on a random batch drawn from the replay buffer.
    if sample_size <= replay_buffer.num_items:
        random_batch = replay_buffer.sample_batch(sample_size)
        update_q_parameters(random_batch)

    # Evaluate the greedy policy and record the normalized episode reward.
    testing_batch = testing_sampler.collect_one_batch()
    reward.append(testing_batch["rewards"].sum() / 200)

show_image(reward)
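The loop above only relies on three members of ReplayBuffer: add_batch, num_items, and sample_batch. A minimal in-memory sketch that satisfies just that interface (not the actual implementation used elsewhere in the text), assuming batches are dicts of equal-length arrays keyed by "states", "actions", "rewards", and so on, could look like the following:

import random
import numpy as np

class ReplayBuffer:
    """Fixed-size FIFO buffer over individual transitions."""

    def __init__(self, buffer_size):
        self.buffer_size = buffer_size
        self.items = []

    @property
    def num_items(self):
        return len(self.items)

    def add_batch(self, batch):
        # Split a batch (dict of equal-length arrays) into per-transition dicts.
        n = len(batch["states"])
        for i in range(n):
            self.items.append({key: value[i] for key, value in batch.items()})
        # Drop the oldest transitions once the buffer is over capacity.
        self.items = self.items[-self.buffer_size:]

    def sample_batch(self, sample_size):
        # Uniformly sample transitions and re-stack them into a batch dict.
        sampled = random.sample(self.items, sample_size)
        return {key: np.array([item[key] for item in sampled]) for key in sampled[0]}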
def record_progress():
    # Roll out a small batch of episodes with the current REINFORCE policy
    # and return the average reward per episode.
    batch_size = 5
    sampler = Sampler(pg_reinforce, env, batch_size=batch_size)
    batch = sampler.collect_one_batch()
    return batch["rewards"].sum() / batch_size
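As a hypothetical usage sketch (stitched together from the earlier combined DQN/REINFORCE loop, not taken from the original text), record_progress can be called periodically during training to track how the average per-episode reward evolves:

progress = []
for iteration in trange(1000):
    # One training step, reusing the pattern from the earlier loop.
    batch = sampler.collect_one_batch()
    returns = compute_return(batch)
    pg_reinforce.update_parameters(batch["states"], batch["actions"], returns)

    # Evaluate the current policy every 50 iterations.
    if iteration % 50 == 0:
        progress.append(record_progress())

show_image(progress)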