def q1_run(num_timesteps):
    # Get Atari games.
    benchmark = gym.benchmark_spec('Atari40M')

    # Change the index to select a different game.
    task = benchmark.tasks[3]

    # Run training
    seed = 0  # Use a seed of zero (you may want to randomize the seed!)
    env = get_env(task, seed, expt_dir='tmp/gym-results2')

    optimizer_spec = OptimizerSpec(
        constructor=optim.RMSprop,
        kwargs=dict(lr=LEARNING_RATE, alpha=ALPHA, eps=EPS),
    )

    exploration_schedule = LinearSchedule(1000000, 0.1)

    dqn_learning(
        env=env,
        q_func=DQN,
        runname="normal_run",
        optimizer_spec=optimizer_spec,
        exploration=exploration_schedule,
        stopping_criterion=stopping_criterion2(num_timesteps),
        replay_buffer_size=REPLAY_BUFFER_SIZE,
        batch_size=BATCH_SIZE,
        gamma=GAMMA,
        learning_starts=LEARNING_STARTS,
        learning_freq=LEARNING_FREQ,
        frame_history_len=FRAME_HISTORY_LEN,
        target_update_freq=TARGET_UPDATE_FREQ
    )
def q2_run(num_timesteps):
    schedulers = {
        "no_explore": ConstantSchedule(0.1),
        "delayed_decay": PiecewiseSchedule(
            [(0, 1.0), (0.25e6, 1.0), (1.25e6, 0.1)], outside_value=0.1),
        "slower_decay": LinearSchedule(1500000, 0.1),
    }

    for name, exploration_schedule in schedulers.items():
        # Get Atari games.
        benchmark = gym.benchmark_spec('Atari40M')

        # Change the index to select a different game.
        task = benchmark.tasks[3]

        # Run training
        seed = 0  # Use a seed of zero (you may want to randomize the seed!)
        env = get_env(task, seed)
        env.reset()

        optimizer_spec = OptimizerSpec(
            constructor=optim.RMSprop,
            kwargs=dict(lr=LEARNING_RATE, alpha=ALPHA, eps=EPS),
        )

        dqn_learning(
            env=env,
            q_func=DQN,
            runname=name,
            optimizer_spec=optimizer_spec,
            exploration=exploration_schedule,
            stopping_criterion=stopping_criterion2(num_timesteps),
            replay_buffer_size=REPLAY_BUFFER_SIZE,
            batch_size=BATCH_SIZE,
            gamma=GAMMA,
            learning_starts=LEARNING_STARTS,
            learning_freq=LEARNING_FREQ,
            frame_history_len=FRAME_HISTORY_LEN,
            target_update_freq=TARGET_UPDATE_FREQ
        )
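# Illustrative sketch (not part of the assignment code): prints how epsilon
# evolves under each schedule defined in q2_run. Assumes the schedule classes
# expose the baselines-style `value(t)` method on ConstantSchedule /
# PiecewiseSchedule / LinearSchedule.
def print_schedule_profiles():
    schedulers = {
        "no_explore": ConstantSchedule(0.1),
        "delayed_decay": PiecewiseSchedule(
            [(0, 1.0), (0.25e6, 1.0), (1.25e6, 0.1)], outside_value=0.1),
        "slower_decay": LinearSchedule(1500000, 0.1),
    }
    for name, schedule in schedulers.items():
        # Sample epsilon at a few timesteps to see each decay profile.
        samples = [round(schedule.value(t), 3)
                   for t in (0, 250000, 750000, 1250000, 2000000)]
        print(name, samples)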
def __init__(self, settings):
    self.check_settings(settings)

    # Constants
    self.batch_size = settings["batch_size"]
    self.checkpoint_frequency = settings["checkpoint_frequency"]
    self.device = settings["device"]
    self.dtype = (torch.cuda.FloatTensor
                  if self.device.type == "cuda" else torch.FloatTensor)
    self.env_name = settings["env"]
    self.env = get_env(settings["env"], 6)
    self.eps_cliff = settings["eps_cliff"]
    self.eps_start = settings["eps_start"]
    self.eps_end = settings["eps_end"]
    self.frame_history_len = settings["frame_history_len"]
    self.gamma = settings["gamma"]
    self.learning_freq = settings["learning_freq"]
    self.learning_start = settings["learning_start"]
    self.logs_dir = settings["logs_dir"]
    self.log_freq = settings["log_freq"]
    self.memory_size = settings["memory_size"]
    self.model_name = settings["model_name"]
    self.num_actions = self.env.action_space.n
    settings["num_actions"] = self.num_actions
    settings["num_channels"] = self.frame_history_len
    self.out_dir = settings["out_dir"]
    self.target_update_freq = settings["target_update_freq"]
    self.total_timesteps = settings["total_timesteps"]

    # Init models
    self.Q = DQN(settings).to(self.device)
    self.target_Q = DQN(settings).to(self.device)
    self.target_Q.load_state_dict(self.Q.state_dict())
    self.target_Q.eval()

    # Init model supporting objects
    self.memory = ReplayBuffer(self.memory_size, self.frame_history_len)
    self.optimizer = optim.RMSprop(
        self.Q.parameters(), lr=settings["lr"], alpha=0.95, eps=0.01)
    self.loss = F.smooth_l1_loss

    # Logging
    self.writer = SummaryWriter(self.logs_dir)
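# Example settings dict for the trainer above. The keys are exactly those read
# in __init__ (plus "lr", read by the optimizer); the values are illustrative
# placeholders, not the project's actual hyperparameters.
example_settings = {
    "batch_size": 32,
    "checkpoint_frequency": 250000,
    "device": torch.device("cuda" if torch.cuda.is_available() else "cpu"),
    "env": "StarGunnerDeterministic-v4",
    "eps_cliff": 1000000,
    "eps_start": 1.0,
    "eps_end": 0.1,
    "frame_history_len": 4,
    "gamma": 0.99,
    "learning_freq": 4,
    "learning_start": 50000,
    "logs_dir": "out/logs",
    "log_freq": 1000,
    "lr": 0.00025,
    "memory_size": 1000000,
    "model_name": "dqn",
    "out_dir": "out",
    "target_update_freq": 10000,
    "total_timesteps": 5000000,
}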
def eval_model(self, epoch, n=100):
    self.Q.eval()
    env = get_env(self.env_name, 6, monitor=False)
    rewards = []
    durations = []

    for _e in tqdm(range(n)):
        memory = ReplayBuffer(10000, self.frame_history_len)
        state = env.reset()[..., np.newaxis]
        reward_acc = 0.0
        for t in range(10000):
            if state is None:
                break
            memory.store_frame(state)
            recent_observations = memory.encode_recent_observation()
            # Evaluate with a small fixed epsilon (0.05) instead of the
            # training schedule.
            action = self.select_epsilon_greedy_action(
                recent_observations, None, 0.05).item()
            state, reward, done, _ = env.step(action)
            reward_acc += reward
            if done:
                # End the episode at terminal states rather than resetting
                # mid-episode, so durations reflect real episode lengths.
                break
            state = state[..., np.newaxis]
        # Record per-episode statistics; without this append, the means
        # below divide by zero.
        rewards.append(reward_acc)
        durations.append(t)
    self.Q.train()

    sum_rewards = sum(rewards)
    sum_durations = sum(durations)
    self.writer.add_scalar(
        f"Mean Reward ({n} episodes)",
        round(sum_rewards / len(rewards), 2),
        epoch,
    )
    self.writer.add_scalar(
        f"Mean Duration ({n} episodes)",
        round(sum_durations / len(durations), 2),
        epoch,
    )
    self.writer.add_scalar(
        f"Mean Reward per Timestep ({n} episodes)",
        round(sum_rewards / sum_durations, 2),
        epoch,
    )
def bonus_run(num_timesteps):
    def make_range_black(arr: np.ndarray, start, end):
        # Zero out a vertical strip of the frame in place; return the array
        # so the filter works whether or not the caller uses the return value.
        arr[:, start:end, :] = 0
        return arr

    frame_filters = {
        "no_left_side": lambda x: make_range_black(x, 0, x.shape[1] // 4),
        "no_middle_side": lambda x: make_range_black(
            x, x.shape[1] // 4, x.shape[1] // 2),
    }

    for name, frame_filter in frame_filters.items():
        # Get Atari games.
        benchmark = gym.benchmark_spec('Atari40M')

        # Change the index to select a different game.
        task = benchmark.tasks[3]

        # Run training
        seed = 0  # Use a seed of zero (you may want to randomize the seed!)
        env = get_env(task, seed)
        env.reset()

        optimizer_spec = OptimizerSpec(
            constructor=optim.RMSprop,
            kwargs=dict(lr=LEARNING_RATE, alpha=ALPHA, eps=EPS),
        )

        dqn_learning(
            env=env,
            q_func=DQN,
            runname=name,
            frame_filter=frame_filter,
            optimizer_spec=optimizer_spec,
            exploration=LinearSchedule(1000000, 0.1),
            stopping_criterion=stopping_criterion2(num_timesteps),
            replay_buffer_size=REPLAY_BUFFER_SIZE,
            batch_size=BATCH_SIZE,
            gamma=GAMMA,
            learning_starts=LEARNING_STARTS,
            learning_freq=LEARNING_FREQ,
            frame_history_len=FRAME_HISTORY_LEN,
            target_update_freq=TARGET_UPDATE_FREQ
        )
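# Standalone demo (illustrative only) of what the frame filters above do to an
# observation: on a dummy 84x84x1 frame, the "no_left_side" filter zeroes the
# left quarter of the columns (84 // 4 = 21 columns).
def demo_frame_filter():
    frame = np.ones((84, 84, 1), dtype=np.uint8)
    filtered = frame.copy()
    filtered[:, 0:filtered.shape[1] // 4, :] = 0  # same effect as "no_left_side"
    # Count fully zeroed columns; prints 21.
    print("zeroed columns:", int((filtered.sum(axis=(0, 2)) == 0).sum()))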
parser.add_argument('--epochs',  # flag name reconstructed from the help text
                    type=int,
                    default=30,
                    help='Number of epochs to train')
parser.add_argument('--k',
                    type=int,
                    default=10,
                    help='Number of Value Iterations')
parser.add_argument('--l_i',
                    type=int,
                    default=3,
                    help='Number of channels in input layer')
parser.add_argument('--l_h',
                    type=int,
                    default=150,
                    help='Number of channels in first hidden layer')
parser.add_argument(
    '--l_q',
    type=int,
    default=9,
    help='Number of channels in q layer (~actions) in VI-module')
parser.add_argument('--batch_size',
                    type=int,
                    default=128,
                    help='Batch size')
config = parser.parse_args()

seed = 0  # Use a seed of zero (you may want to randomize the seed!)
env = get_env(seed)
main(env, 1000000, config=config)
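# Hypothetical invocation of the script above (the script name is assumed;
# the flags and defaults come from the parser definitions):
#   python train_vin.py --epochs 30 --k 10 --l_i 3 --l_h 150 --l_q 9 --batch_size 128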
def main(env, num_timesteps):
    # Change the index to select a different game.
    task = benchmark.tasks[3]

    # Run training
    seed = random.randint(0, 100)  # Randomize the seed between runs.
    env = get_env(task, seed)

    def stopping_criterion(env):
        # Notice that here t is the number of steps of the wrapped env,
        # which is different from the number of steps in the underlying env.
        return get_wrapper_by_name(env, "Monitor").get_total_steps() >= num_timesteps

    optimizer_spec = OptimizerSpec(
        constructor=optim.RMSprop,
        kwargs=dict(lr=LEARNING_RATE, alpha=ALPHA, eps=EPS),
    )

    exploration_schedule = LinearSchedule(1000000, 0.1)

    # Empty dict to hold all results.
    Stats = {}

    new_lr = 0.001
    new_gamma = 0.999
    exploration_sches = [LinearSchedule(1000000, 0.1),
                         ConstantSchedule(0.05),
                         ConstantSchedule(0.15),
                         LinearSchedule(500000, 0.05)]

    optimizer_spec = OptimizerSpec(
        constructor=optim.RMSprop,
        kwargs=dict(lr=new_lr, alpha=ALPHA, eps=EPS),
    )
    env = get_env(task, seed)
    Stats["lr=0.001, gamma=0.999"] = dqn_learing(
        env=env,
        q_func=DQN,
        optimizer_spec=optimizer_spec,
        exploration=exploration_schedule,
        stopping_criterion=stopping_criterion,
        replay_buffer_size=REPLAY_BUFFER_SIZE,
        batch_size=BATCH_SIZE,
        gamma=new_gamma,
        learning_starts=LEARNING_STARTS,
        learning_freq=LEARNING_FREQ,
        frame_history_len=FRAME_HISTORY_LEN,
        target_update_freq=TARGER_UPDATE_FREQ,
        feature_tested="lr=0.001, gamma=0.999"
    )

    optimizer_spec = OptimizerSpec(
        constructor=optim.RMSprop,
        kwargs=dict(lr=LEARNING_RATE, alpha=ALPHA, eps=EPS),
    )
    env = get_env(task, seed)
    Stats["Default"] = dqn_learing(
        env=env,
        q_func=DQN,
        optimizer_spec=optimizer_spec,
        exploration=exploration_schedule,
        stopping_criterion=stopping_criterion,
        replay_buffer_size=REPLAY_BUFFER_SIZE,
        batch_size=BATCH_SIZE,
        gamma=GAMMA,
        learning_starts=LEARNING_STARTS,
        learning_freq=LEARNING_FREQ,
        frame_history_len=FRAME_HISTORY_LEN,
        target_update_freq=TARGER_UPDATE_FREQ,
        feature_tested=""
    )

    plt.clf()
    plt.xlabel('Timesteps')
    plt.ylabel('Mean Reward (past 100 episodes)')
    num_items = len(Stats["lr=0.001, gamma=0.999"]["mean_episode_rewards"])
    plt.plot(range(num_items),
             Stats["lr=0.001, gamma=0.999"]["mean_episode_rewards"],
             label="lr=0.001, gamma=0.999")
    num_items = len(Stats["Default"]["mean_episode_rewards"])
    plt.plot(range(num_items),
             Stats["Default"]["mean_episode_rewards"],
             label="Default")
    plt.legend()
    plt.title("Performance")
    plt.savefig('Final-Performance.png')
if __name__ == '__main__':
    # Get Atari games.
    benchmark = gym.benchmark_spec('Atari40M')

    # Change the index to select a different game.
    task = benchmark.tasks[3]

    # Run training
    seed = 0  # datetime.now()  # Use a seed of zero (you may want to randomize the seed!)
    env = get_env(task, seed)
    main(env, task.max_timesteps)
        state, reward, done, _ = env.step(action)
        state = state[..., np.newaxis]
        memory.store_effect(last_idx, action, reward, done)
        reward_acc += reward
        if done:
            break
        time.sleep(0.05)

    logging.info(f"Total Reward: {reward_acc}")
    logging.info(f"Average Reward per Timestep: {reward_acc / _step}")
    logging.info(f"Timesteps: {_step}")


if __name__ == "__main__":
    # Initialize environment
    env = get_env("StarGunnerDeterministic-v4", 6, monitor=False)

    # Initialize model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    num_actions = env.action_space.n
    num_channels = 4
    model = load_model_checkpoint("out/checkpoints/dqn_1250000",
                                  num_actions, num_channels).to(device)

    # Play using model
    play_using_model(env, model, device)
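# load_model_checkpoint is not shown in this excerpt. A minimal sketch of what
# it might look like, assuming each checkpoint file stores the network's raw
# state_dict and that DQN takes a settings dict as in the trainer and runner
# code (hypothetical, not the project's actual implementation):
def load_model_checkpoint(path, num_actions, num_channels):
    settings = {"num_actions": num_actions, "num_channels": num_channels}
    model = DQN(settings)
    model.load_state_dict(torch.load(path, map_location="cpu"))
    model.eval()  # evaluation mode: no dropout/batch-norm updates
    return model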
    reward_per_timestep = total_reward / total_timesteps
    timesteps_per_episode = total_timesteps / curr_episode
    all_rewards_per_episode.append(reward_per_episode)
    print_policy_statistics(reward_per_episode,
                            reward_per_timestep,
                            timesteps_per_episode,
                            wrapped_stdev(all_rewards_per_episode),
                            episode_num=curr_episode)

    return (total_reward / num_episodes,
            total_reward / total_timesteps,
            total_timesteps / num_episodes,
            wrapped_stdev(all_rewards_per_episode))


if __name__ == '__main__':
    env = get_env(GAME, 6, monitor=False)

    if len(sys.argv) != 2:
        print('Incorrect number of arguments: python3 runner.py <random|dqn>')
        exit(1)

    if sys.argv[1] == 'dqn':
        print('Simulating DQN...')
        device = torch.device("cpu")
        settings = {
            "num_actions": env.action_space.n,
            "num_channels": CHANNELS
        }
        model = DQN(settings)
        model.load_state_dict(
    optimizer_spec = OptimizerSpec(
        constructor=optim.RMSprop,
        kwargs=dict(lr=LEARNING_RATE, alpha=ALPHA, eps=EPS),
    )

    exploration_schedule = LinearSchedule(1000000, 0.1)

    dqn_learing(
        env=env,
        q_func=DQN,
        optimizer_spec=optimizer_spec,
        exploration=exploration_schedule,
        stopping_criterion=stopping_criterion,
        replay_buffer_size=REPLAY_BUFFER_SIZE,
        batch_size=BATCH_SIZE,
        gamma=GAMMA,
        learning_starts=LEARNING_STARTS,
        learning_freq=LEARNING_FREQ,
        frame_history_len=FRAME_HISTORY_LEN,
        target_update_freq=TARGER_UPDATE_FREQ,
    )


if __name__ == '__main__':
    env_id = 'Pong' + 'NoFrameskip-v4'
    env = get_env(env_id, seed=0)
    main(env, 10e6 * 40)
    optimizer_spec = OptimizerSpec(
        constructor=optim.RMSprop,
        kwargs=dict(lr=LEARNING_RATE, alpha=ALPHA, eps=EPS),
    )

    exploration_schedule = LinearSchedule(1000000, 0.1)

    dqn_learing(
        env=env,
        q_func=DQN,
        optimizer_spec=optimizer_spec,
        exploration=exploration_schedule,
        stopping_criterion=stopping_criterion,
        replay_buffer_size=REPLAY_BUFFER_SIZE,
        batch_size=BATCH_SIZE,
        gamma=GAMMA,
        learning_starts=LEARNING_STARTS,
        learning_freq=LEARNING_FREQ,
        frame_history_len=FRAME_HISTORY_LEN,
        target_update_freq=TARGER_UPDATE_FREQ,
    )


if __name__ == '__main__':
    # Run training
    seed = 0  # Use a seed of zero (you may want to randomize the seed!)
    env = get_env('Breakout-v0', seed)
    main(env, 1e6)