def main(to_train, save_path):
    torch.manual_seed(1234)
    parser = argparse.ArgumentParser()
    parser.add_argument("--cuda", default=False, action="store_true",
                        help="Enable cuda computation")
    parser.add_argument("--env", default=DEFAULT_ENV_NAME,
                        help="default env name")
    args = parser.parse_args()
    device = torch.device(
        "cuda" if args.cuda and torch.cuda.is_available() else "cpu")

    os.makedirs(save_path, exist_ok=True)
    env = wrappers.make_env(args.env)
    net = dqn_model.DQN(env.observation_space.shape,
                        env.action_space.n).to(device)
    target_net = dqn_model.DQN(env.observation_space.shape,
                               env.action_space.n).to(device)
    print(net)

    buffer = ExperienceBuffer(REPLAY_SIZE)
    agent = Agent(env, buffer)
    epsilon = EPSILON_START
    optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE)  # only need one optimizer
    if to_train:
        train(env, net, target_net, buffer, agent, optimizer, device, save_path)
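# ExperienceBuffer is used above (and in later snippets) but never defined.
# A minimal sketch of the conventional design, assumed here: a fixed-size
# deque of transitions with uniform sampling. The Experience field names are
# assumptions for illustration, not the original code.
import collections
import numpy as np

Experience = collections.namedtuple(
    'Experience', ['state', 'action', 'reward', 'done', 'new_state'])

class ExperienceBuffer:
    def __init__(self, capacity):
        self.buffer = collections.deque(maxlen=capacity)

    def __len__(self):
        return len(self.buffer)

    def append(self, experience):
        self.buffer.append(experience)

    def sample(self, batch_size):
        # sample indices without replacement, then unpack into batched arrays
        indices = np.random.choice(len(self.buffer), batch_size, replace=False)
        states, actions, rewards, dones, next_states = \
            zip(*[self.buffer[idx] for idx in indices])
        return (np.array(states), np.array(actions),
                np.array(rewards, dtype=np.float32),
                np.array(dones, dtype=np.uint8), np.array(next_states))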
if not os.path.exists('mimic_models'):
    os.makedirs('mimic_models')

envSI = gym.make('SpaceInvadersNoFrameskip-v4')
envSI = ptan.common.wrappers.wrap_dqn(envSI)
envDA = gym.make('DemonAttackNoFrameskip-v4')
envDA = ptan.common.wrappers.wrap_dqn(envDA)
assert envSI.action_space.n == envDA.action_space.n, \
    "Different Action Space Lengths"
assert envSI.observation_space.shape == envDA.observation_space.shape, \
    "Different Obs. Space Shapes"
print("Loaded Environments: {}, {}".format(envSI.unwrapped.spec.id,
                                           envDA.unwrapped.spec.id))

expertSI = dqn_model.DQN(envSI.observation_space.shape, envSI.action_space.n)
expertSI.load_state_dict(
    torch.load(args.si, map_location=device).state_dict())
expertSI_hidden = dqn_model.DQN_Hidden(envSI.observation_space.shape,
                                       envSI.action_space.n,
                                       expertSI).to(device)
expertSI = expertSI.to(device)
expertSI.eval()
expertSI_hidden.eval()

# the asserts above guarantee both games share shapes, so the SpaceInvaders
# shapes are reused; the DemonAttack expert mirrors the setup above (the
# original snippet was cut off mid-call here)
expertDA = dqn_model.DQN(envSI.observation_space.shape, envSI.action_space.n)
expertDA.load_state_dict(
    torch.load(args.da, map_location=device).state_dict())
expertDA_hidden = dqn_model.DQN_Hidden(envSI.observation_space.shape,
                                       envSI.action_space.n,
                                       expertDA).to(device)
expertDA = expertDA.to(device)
expertDA.eval()
expertDA_hidden.eval()
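# The snippet above only loads the frozen experts for the mimic setup; how
# the student is trained against them is not shown. A minimal sketch of one
# policy-distillation update, assuming a student DQN and a batch of states:
# the student net, temperature tau, and distill_step name are hypothetical.
import torch
import torch.nn.functional as F

def distill_step(student, expert, states_v, optimizer, tau=0.01):
    """Fit the student's Q-values to a frozen expert's softened outputs."""
    with torch.no_grad():
        expert_probs = F.softmax(expert(states_v) / tau, dim=1)
    student_log_probs = F.log_softmax(student(states_v) / tau, dim=1)
    # KL divergence between softened expert and student action distributions
    loss = F.kl_div(student_log_probs, expert_probs, reduction='batchmean')
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()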
if __name__ == "__main__": mkdir('.', 'checkpoints') parser = argparse.ArgumentParser() parser.add_argument("--cuda", default=False, action="store_true", help="Enable cuda") parser.add_argument("--env", default=DEFAULT_ENV_NAME, help="Name of the environment, default=" + DEFAULT_ENV_NAME) parser.add_argument("--reward", type=float, default=MEAN_REWARD_GOAL, help="Mean reward goal to stop training, default=%.2f" % MEAN_REWARD_GOAL) args = parser.parse_args() device = torch.device("cuda" if args.cuda else "cpu") env = wrappers.make_env(args.env) net = dqn_model.DQN(env.observation_space.shape, env.action_space.n).to(device) tgt_net = dqn_model.DQN(env.observation_space.shape, env.action_space.n).to(device) writer = SummaryWriter(comment="-" + args.env) print(net) buffer = ExperienceBuffer(REPLAY_BUFFER_SIZE) agent = Agent(env, buffer) epsilon = EPSILON_START optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE) total_rewards = [] frame_idx = 0 ts_frame = 0 ts = time.time() best_mean_reward = None
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("-m", "--model", required=True,
                        help="Model file to load")
    parser.add_argument("-e", "--env", default=DEFAULT_ENV_NAME,
                        help="Environment name to use, default=" + DEFAULT_ENV_NAME)
    parser.add_argument("-r", "--record",
                        help="Directory to store video recording")
    parser.add_argument("--no-visualize", default=True, action='store_false',
                        dest='visualize',
                        help="Disable visualization of the game play")
    args = parser.parse_args()

    env = wrappers.make_env(args.env)
    if args.record:
        mkdir('.', args.record)
        env = gym.wrappers.Monitor(env, args.record)
    net = dqn_model.DQN(env.observation_space.shape, env.action_space.n)
    net.load_state_dict(torch.load(args.model,
                                   map_location=lambda storage, loc: storage))

    state = env.reset()
    total_reward = 0.0
    c = collections.Counter()
    while True:
        start_ts = time.time()  # usable for throttling rendering to a fixed FPS
        if args.visualize:
            env.render()
        state_v = torch.tensor(np.array([state], copy=False))
        q_vals = net(state_v).data.numpy()[0]
        action = np.argmax(q_vals)
        c[action] += 1
        state, reward, done, _ = env.step(action)
        # the original snippet was cut off here; the loop still needs to
        # accumulate reward and terminate on episode end, roughly as follows
        total_reward += reward
        if done:
            break
    print("Total reward: %.2f" % total_reward)
    print("Action counts:", c)
def main(cuda: bool, env_name: str, reward_stop: float, render: bool,
         weights_fn: str, fps: float, epsilon_fixed: float, no_learn: bool):
    device = torch.device("cuda" if cuda else "cpu")

    # create environment
    env: gym.Env = wrappers.make_env(env_name)

    # create both neural networks
    net = dqn_model.DQN(env.observation_space.shape,
                        env.action_space.n).to(device)
    tgt_net = dqn_model.DQN(env.observation_space.shape,
                            env.action_space.n).to(device)
    if weights_fn:
        assert os.path.isfile(weights_fn), \
            "File {0} does not exist.".format(weights_fn)
        state_dict = torch.load(weights_fn, map_location=device)
        net.load_state_dict(state_dict)
        tgt_net.load_state_dict(state_dict)

    # create summary writer for tensorboard
    writer = SummaryWriter(comment="-" + env_name)

    # create buffer and agent and init epsilon
    buffer = ExperienceBuffer(REPLAY_SIZE)
    agent = Agent(env, buffer, render=render)
    optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE)

    total_rewards: List[float] = []
    frame_idx = 0
    ts_frame = 0
    ts = time.time()
    best_mean_reward: Optional[float] = None

    while True:
        frame_idx += 1

        # update epsilon
        if epsilon_fixed:
            epsilon = epsilon_fixed
        else:
            epsilon = max(EPSILON_FINAL,
                          EPSILON_START - frame_idx / EPSILON_DECAY_LAST_FRAME)

        # play one step
        t_step_0 = time.time()
        reward = agent.play_step(net, epsilon, device)
        if fps:
            while 1 / (time.time() - t_step_0) > fps:
                time.sleep(0.01)

        if reward is not None:
            # add reward to total and calculate mean
            total_rewards.append(reward)
            mean_reward = np.mean(total_rewards[-100:])

            # meter speed
            speed = (frame_idx - ts_frame) / (time.time() - ts)
            ts_frame = frame_idx
            ts = time.time()

            # print and write information
            print("%d: done %d games, mean reward %.3f, eps %.2f, speed %.2f f/s"
                  % (frame_idx, len(total_rewards), float(mean_reward),
                     epsilon, speed))
            writer.add_scalar("epsilon", epsilon, frame_idx)
            writer.add_scalar("speed", speed, frame_idx)
            writer.add_scalar("reward_100", mean_reward, frame_idx)
            writer.add_scalar("reward", reward, frame_idx)

            if best_mean_reward is None or best_mean_reward < mean_reward:
                torch.save(net.state_dict(), env_name + "-best.dat")
                if best_mean_reward is not None:
                    print("Best mean reward updated %.3f -> %.3f, model saved"
                          % (best_mean_reward, float(mean_reward)))
                best_mean_reward = float(mean_reward)
            if mean_reward > reward_stop:
                print("Solved in {0} frames!".format(frame_idx))
                break

        if len(buffer) < REPLAY_START_SIZE or no_learn:
            continue

        # sync target net with training net
        if frame_idx % SYNC_TARGET_FRAMES == 0:
            tgt_net.load_state_dict(net.state_dict())

        optimizer.zero_grad()
        batch = buffer.sample(BATCH_SIZE)
        loss_t = calc_loss(batch, net, tgt_net, device=device)
        loss_t.backward()
        optimizer.step()
    writer.close()
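# calc_loss is called above but not defined in any of these snippets. A
# minimal sketch of the standard one-step DQN loss it presumably computes,
# assuming the batch layout from the ExperienceBuffer sketch earlier
# (states, actions, rewards, dones, next_states) and a module-level GAMMA.
import torch
import torch.nn as nn
import numpy as np

def calc_loss(batch, net, tgt_net, device="cpu"):
    states, actions, rewards, dones, next_states = batch

    states_v = torch.tensor(np.array(states, copy=False)).to(device)
    next_states_v = torch.tensor(np.array(next_states, copy=False)).to(device)
    actions_v = torch.tensor(actions, dtype=torch.int64).to(device)
    rewards_v = torch.tensor(rewards).to(device)
    done_mask = torch.tensor(dones, dtype=torch.bool).to(device)

    # Q(s, a) for the actions actually taken
    state_action_values = net(states_v).gather(
        1, actions_v.unsqueeze(-1)).squeeze(-1)
    with torch.no_grad():
        # max_a' Q_target(s', a'), zeroed on terminal transitions
        next_state_values = tgt_net(next_states_v).max(1)[0]
        next_state_values[done_mask] = 0.0

    expected_values = rewards_v + GAMMA * next_state_values
    return nn.MSELoss()(state_action_values, expected_values)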
parser = argparse.ArgumentParser()
parser.add_argument("--cuda", default=False, action="store_true",
                    help="Enable cuda")
parser.add_argument("-n", type=int, default=STEP_COUNT,
                    help="Steps to do on Bellman unroll")
args = parser.parse_args()
device = torch.device("cuda" if args.cuda else "cpu")

env = gym.make(params.env_name)
env = drl.common.wrappers.wrap_dqn(env)
env.seed(common.SEED)
input_shape = env.observation_space.shape
n_actions = env.action_space.n

selector = dac.EpsilonGreedySelector()
eps_tracker = dac.EpsilonTracker(selector, params.epsilon_start,
                                 params.epsilon_final, params.epsilon_frames)
net = dqn_model.DQN(input_shape, n_actions).to(device)
agent = dag.DQNAgent(net, selector, device)
tgt_net = dag.TargetNet(net)
buffer = dexp.ReplayBuffer(params.replay_size)
exp_source = dexp.ExperienceSource(env, agent, buffer, args.n, params.gamma)

writer = SummaryWriter(comment="-" + params.env_name)
print(net)
optimizer = optim.Adam(net.parameters(), lr=params.learning_rate)

total_reward = []
frame_idx = 0
ts_frame = 0
ts = time.time()
best_m_reward = None
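# With the n-step ExperienceSource above, the only change to the usual DQN
# loss is the discount on the bootstrapped term: the source already folds the
# first n rewards into each transition's reward, so the tail value is
# discounted by gamma**n. A sketch of just that target computation; the
# function name and tensor arguments are assumptions for illustration.
import torch

def calc_nstep_target(rewards_v, next_states_v, done_mask,
                      tgt_net, gamma, n_steps):
    """Bellman target when each transition already spans n_steps rewards."""
    with torch.no_grad():
        next_q_v = tgt_net(next_states_v).max(1)[0]
        next_q_v[done_mask] = 0.0
        return rewards_v + (gamma ** n_steps) * next_q_v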
from lib import dqn_model, common

# Select hyperparameters
params = common.HYPERPARAMS['star_gunner']
total_frames = params['total_frames']
rep_init = params['replay_initial']
tgt_net_sync = params['target_net_sync']
batch_size = params['batch_size']
HIST_LENGTH = 4
UPDATE_FREQ = 4

# Initialise environment and use dqn wrappers
env = rl.common.wrappers.make_atari(params['env_name'])
env = rl.common.wrappers.wrap_deepmind(env=env, stack_frames=HIST_LENGTH)
makeDQN = dqn_model.DQN(env.action_space.n)

# Placeholders
state = tf.placeholder(tf.float32, shape=[None, HIST_LENGTH, 84, 84],
                       name='state')
action = tf.placeholder(tf.float32, shape=[None, env.action_space.n],
                        name='action')
reward = tf.placeholder(tf.float32, shape=[None], name='reward')
done = tf.placeholder(tf.float32, shape=[None], name='done')
state2 = tf.placeholder(tf.float32, shape=[None, HIST_LENGTH, 84, 84],
                        name='next_state')

# Loss function
net_q, net_vars = makeDQN.create_model(state, name='online')
tgt_q, tgt_vars = makeDQN.create_model(state2, name='target')
q = tf.reduce_sum(net_q * action, axis=1)
max_tgt_q = tf.reduce_max(tgt_q, axis=1)
tgt = reward + (1. - done) * params['gamma'] * max_tgt_q
delta = tf.stop_gradient(tgt) - q
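# The snippet above stops at the TD error `delta`. A hedged continuation in
# the same TF1 graph style: Huber loss on delta, an RMSProp train op, and an
# op that copies the online weights into the target network. The
# params['learning_rate'] key is an assumption beyond the keys used above.
huber = tf.where(tf.abs(delta) < 1.0,
                 0.5 * tf.square(delta),
                 tf.abs(delta) - 0.5)
loss = tf.reduce_mean(huber, name='loss')
train_op = tf.train.RMSPropOptimizer(
    learning_rate=params['learning_rate']).minimize(loss, var_list=net_vars)
# sync op: assign each online variable to its target counterpart
sync_op = tf.group(*[tf.assign(t, n) for t, n in zip(tgt_vars, net_vars)])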
def main():
    global params_save_file
    game = 'spaceinvaders'
    params_save_file += '-' + game
    params = config.HYPERPARAMS[game]
    parser = argparse.ArgumentParser()
    parser.add_argument("--cuda", default=False, action="store_true",
                        help="Enable cuda")
    args = parser.parse_args()

    env = gym.make(params['env_name'])
    env = ptan.common.wrappers.wrap_dqn(env, skip=params['skip-frames'])
    print("Parameters:")
    print(params)
    sys.stdout.flush()

    writer = SummaryWriter(comment="-" + params['run_name'] + "-prio-replay")
    net = dqn_model.DQN(env.observation_space.shape, env.action_space.n)
    if args.cuda:
        net.cuda()

    tgt_net = ptan.agent.TargetNet(net)
    selector = ptan.actions.EpsilonGreedyActionSelector(
        epsilon=params['epsilon_start'])
    epsilon_tracker = common.EpsilonTracker(selector, params)
    agent = ptan.agent.DQNAgent(net, selector, cuda=args.cuda)
    exp_source = ptan.experience.ExperienceSourceFirstLast(
        env, agent, gamma=params['gamma'], steps_count=1)
    buffer = ptan.experience.PrioritizedReplayBuffer(
        exp_source, params['replay_size'], PRIO_REPLAY_ALPHA)
    optimizer = optim.Adam(net.parameters(), lr=params['learning_rate'])

    frame_idx = 0
    beta = BETA_START

    with common.RewardTracker(writer, params['stop_reward']) as reward_tracker:
        while True:
            frame_idx += params['steps']
            buffer.populate(params['steps'])
            epsilon_tracker.frame(frame_idx)
            beta = min(1.0,
                       BETA_START + frame_idx * (1.0 - BETA_START) / BETA_FRAMES)

            new_rewards = exp_source.pop_total_rewards()
            if new_rewards:
                writer.add_scalar("beta", beta, frame_idx)
                if reward_tracker.reward(new_rewards[0], frame_idx,
                                         selector.epsilon):
                    break

            if len(buffer) < params['replay_initial']:
                continue

            optimizer.zero_grad()
            batch, batch_indices, batch_weights = buffer.sample(
                params['batch_size'] * params['steps'], beta)
            loss_v, sample_prios = calc_loss(batch, batch_weights, net,
                                             tgt_net.target_model,
                                             params["gamma"], cuda=args.cuda)
            loss_v.backward()
            optimizer.step()
            buffer.update_priorities(batch_indices, sample_prios)

            if frame_idx % params['target_net_sync'] == 0:
                tgt_net.sync()
            if frame_idx % params['save_params_every'] == 0:
                torch.save(net.state_dict(), params_save_file + str(frame_idx))

    # final save once training stops
    torch.save(net.state_dict(), params_save_file + str(frame_idx))
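# calc_loss above returns both the weighted loss and new per-sample
# priorities, but is not defined in this snippet. A minimal sketch under
# those assumptions: importance-sampling weights scale the squared TD errors,
# and the per-sample errors (plus a small epsilon) become the new priorities.
# The pre-unpacked batch layout is an assumption for illustration.
import torch
import numpy as np

def calc_loss(batch, batch_weights, net, tgt_net, gamma, cuda=False):
    states, actions, rewards, dones, next_states = batch  # assumed layout
    states_v = torch.tensor(np.array(states, copy=False))
    next_states_v = torch.tensor(np.array(next_states, copy=False))
    actions_v = torch.tensor(actions, dtype=torch.int64)
    rewards_v = torch.tensor(rewards, dtype=torch.float32)
    done_mask = torch.tensor(dones, dtype=torch.bool)
    weights_v = torch.tensor(batch_weights, dtype=torch.float32)
    if cuda:
        states_v, next_states_v = states_v.cuda(), next_states_v.cuda()
        actions_v, rewards_v = actions_v.cuda(), rewards_v.cuda()
        done_mask, weights_v = done_mask.cuda(), weights_v.cuda()

    q_v = net(states_v).gather(1, actions_v.unsqueeze(-1)).squeeze(-1)
    with torch.no_grad():
        next_q_v = tgt_net(next_states_v).max(1)[0]
        next_q_v[done_mask] = 0.0
        target_q_v = rewards_v + gamma * next_q_v

    # importance-sampling weights scale each sample's squared TD error
    losses_v = weights_v * (q_v - target_q_v) ** 2
    # new priorities: per-sample weighted error plus a small constant
    sample_prios = (losses_v + 1e-5).data.cpu().numpy()
    return losses_v.mean(), sample_prios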
if __name__ == "__main__": print('\n\n***********************************************************') print("* RELINE model's training on MsPacman game is starting... *") print('***********************************************************\n') # set the device -> cuda or cpu device = "cpu" # create the wrapped environment env = wrappers.make_env(DEFAULT_ENV_NAME) num_actions = 5 # exclude actions: 5 6 7 8 # 0 -> none # 1 -> up # 2 -> right # 3 -> left # 4 -> down net = dqn_model.DQN(env.observation_space.shape, num_actions).to(device) tgt_net = dqn_model.DQN(env.observation_space.shape, num_actions).to(device) print(net) buffer = ExperienceBuffer(REPLAY_SIZE) agent = Agent(env, buffer) epsilon = EPSILON_START optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE) total_rewards = [] frame_idx = 0 ts_frame = 0 ts = time.time() best_mean_reward = None
def main():
    global params_save_file
    game = 'spaceinvaders'
    params_save_file += '-' + game
    params = config.HYPERPARAMS[game]
    parser = argparse.ArgumentParser()
    parser.add_argument("--cuda", default=False, action="store_true",
                        help="Enable cuda")
    # default must be False: with store_true, a True default makes the flag
    # a no-op and double DQN could never be disabled
    parser.add_argument("--double", default=False, action="store_true",
                        help="Enable double DQN")
    args = parser.parse_args()

    env = gym.make(params['env_name'])
    env = ptan.common.wrappers.wrap_dqn(env, skip=params['skip-frames'])
    print("Parameters:")
    print(params)
    sys.stdout.flush()

    writer = SummaryWriter(comment="-" + params['run_name'] +
                           "-double=" + str(args.double))
    net = dqn_model.DQN(env.observation_space.shape, env.action_space.n)
    if args.cuda:
        net.cuda()

    tgt_net = ptan.agent.TargetNet(net)
    selector = ptan.actions.EpsilonGreedyActionSelector(
        epsilon=params['epsilon_start'])
    epsilon_tracker = common.EpsilonTracker(selector, params)
    agent = ptan.agent.DQNAgent(net, selector, cuda=args.cuda)
    exp_source = ptan.experience.ExperienceSourceFirstLast(
        env, agent, gamma=params['gamma'], steps_count=1)
    buffer = ptan.experience.ExperienceReplayBuffer(
        exp_source, buffer_size=params['replay_size'])
    optimizer = optim.Adam(net.parameters(), lr=params['learning_rate'])

    frame_idx = 0
    eval_states = None

    with common.RewardTracker(writer, params['stop_reward']) as reward_tracker:
        while True:
            frame_idx += params['steps']
            buffer.populate(params['steps'])
            epsilon_tracker.frame(frame_idx)

            new_rewards = exp_source.pop_total_rewards()
            if new_rewards:
                if reward_tracker.reward(new_rewards[0], frame_idx,
                                         selector.epsilon):
                    break

            if len(buffer) < params['replay_initial']:
                continue

            if eval_states is None:
                eval_states = buffer.sample(STATES_TO_EVALUATE)
                eval_states = [np.array(transition.state, copy=False)
                               for transition in eval_states]
                eval_states = np.array(eval_states, copy=False)

            optimizer.zero_grad()
            batch = buffer.sample(params['batch_size'] * params['steps'])
            loss_v = calc_loss(batch, net, tgt_net.target_model,
                               gamma=params['gamma'], cuda=args.cuda,
                               double=args.double)
            loss_v.backward()
            optimizer.step()

            if frame_idx % params['target_net_sync'] == 0:
                tgt_net.sync()
            if frame_idx % EVAL_EVERY_FRAME == 0:
                mean_val = calc_values_of_states(eval_states, net,
                                                 cuda=args.cuda)
                writer.add_scalar("values_mean", mean_val, frame_idx)
            if frame_idx % params['save_params_every'] == 0:
                torch.save(net.state_dict(), params_save_file + str(frame_idx))

    # final save once training stops
    torch.save(net.state_dict(), params_save_file + str(frame_idx))
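# calc_loss above takes a `double` flag but is not defined in this snippet.
# The only difference from the one-step loss sketched earlier is the target:
# double DQN selects the next action with the online net and evaluates it
# with the target net, which reduces the overestimation bias of plain
# max-based targets. A sketch of just that computation (names assumed):
import torch

def calc_double_dqn_target(net, tgt_net, next_states_v, rewards_v,
                           done_mask, gamma):
    with torch.no_grad():
        # action selection by the online network
        next_actions_v = net(next_states_v).max(1)[1]
        # action evaluation by the target network
        next_q_v = tgt_net(next_states_v).gather(
            1, next_actions_v.unsqueeze(-1)).squeeze(-1)
        next_q_v[done_mask] = 0.0
        return rewards_v + gamma * next_q_v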