import numpy as np

import dqn  # provides the dqn.DQN wrapper with predict()/update()


def replay_train(mainDQN: dqn.DQN, targetDQN: dqn.DQN, train_batch: list) -> float:
    # Unpack the minibatch of (state, action, reward, next_state, done) tuples.
    states = np.vstack([x[0] for x in train_batch])
    actions = np.array([x[1] for x in train_batch])
    rewards = np.array([x[2] for x in train_batch])
    next_states = np.vstack([x[3] for x in train_batch])
    done = np.array([x[4] for x in train_batch])

    X = states

    # Bellman target: r + gamma * max_a' Q_target(s', a'), zeroed on terminal
    # transitions. `done` must be a boolean array for the `~done` mask to work.
    Q_target = rewards + DISCOUNT_RATE * np.max(targetDQN.predict(next_states), axis=1) * ~done

    # Replace only the Q values of the actions that were actually taken.
    y = mainDQN.predict(states)
    y[np.arange(len(X)), actions] = Q_target

    # Train our network using target and predicted Q values on each episode
    return mainDQN.update(X, y)
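# --- Hedged usage sketch (not part of the original source) ---
# Illustrates how replay_train() is typically driven from an experience-replay
# loop. The gym-style env API, the REPLAY_MEMORY/BATCH_SIZE values and the
# epsilon-greedy policy below are assumptions for illustration; it also assumes
# mainDQN.predict() accepts a single state as well as a batch.
import random
from collections import deque

import numpy as np

REPLAY_MEMORY = 50000
BATCH_SIZE = 64
replay_buffer = deque(maxlen=REPLAY_MEMORY)


def run_episode(env, mainDQN, targetDQN, epsilon):
    """Collect one episode and train mainDQN on random minibatches."""
    state = env.reset()
    done = False
    while not done:
        # epsilon-greedy action selection
        if np.random.rand() < epsilon:
            action = env.action_space.sample()
        else:
            action = np.argmax(mainDQN.predict(state))

        next_state, reward, done, _ = env.step(action)
        # store transitions as (s, a, r, s', done) tuples, matching replay_train()
        replay_buffer.append((state, action, reward, next_state, done))

        if len(replay_buffer) > BATCH_SIZE:
            minibatch = random.sample(replay_buffer, BATCH_SIZE)
            replay_train(mainDQN, targetDQN, minibatch)

        state = next_state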
if args.random_seed:
    random.seed(args.random_seed)

# instantiate classes
if args.environment == 'ale':
    env = ALEEnvironment(args.game, args)
    logger.info("Using ALE Environment")
elif args.environment == 'gym':
    logger.handlers.pop()
    env = GymEnvironment(args.game, args)
    logger.info("Using Gym Environment")
else:
    assert False, "Unknown environment " + args.environment

mem = ReplayMemory(args.replay_size, args)
net = DQN(env.numActions(), args)
agent = Agent(env, mem, net, args)
stats = Statistics(agent, net, mem, env, args)

if args.load_weights:
    logger.info("Loading weights from %s" % args.load_weights)
    net.load_weights(args.load_weights)

if args.play_games:
    logger.info("Playing for %d game(s)" % args.play_games)
    stats.reset()
    agent.play(args.play_games)
    stats.write(0, "play")

    if args.visualization_file:
        from visualization import visualize
        # use states recorded during gameplay.
        # NB! Check buffer size, that it can accommodate one game!
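# --- Hedged sketch (not part of the original source) ---
# A minimal argparse setup covering only the flags referenced above; the
# defaults and help strings are assumptions, not the project's real parser.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("game", help="ROM file (ALE) or environment id (Gym)")
parser.add_argument("--environment", choices=["ale", "gym"], default="ale")
parser.add_argument("--random_seed", type=int, default=0)
parser.add_argument("--replay_size", type=int, default=1000000)
parser.add_argument("--load_weights", help="path to previously saved network weights")
parser.add_argument("--play_games", type=int, default=0,
                    help="number of evaluation games to play instead of training")
parser.add_argument("--visualization_file", help="output file for visualize()")
args = parser.parse_args()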
class Agent(object):

    def __init__(self, conf):
        self.env = Environment(name=conf.env, width=conf.width,
                               height=conf.height, history=conf.history)
        self.hist = History(self.env)
        self.mem = ReplayMemory(self.env, capacity=conf.mem_capacity,
                                batch_size=conf.batch_size)

        self._capa = conf.mem_capacity
        self._ep_en = conf.ep_end
        self._ep_st = conf.ep_start
        self._learn_st = conf.learn_start
        self._tr_freq = conf.train_freq
        self._update_freq = conf.update_freq

        self.q = DQN(self.hist._history, self.env.action_size).type(dtype)
        self.target_q = DQN(self.hist._history, self.env.action_size).type(dtype)
        self.optim = torch.optim.RMSprop(self.q.parameters(), lr=0.00025,
                                         alpha=0.95, eps=0.01)

    def train(self):
        screen, reward, action, terminal = self.env.new_random_game()
        for _ in range(self.env._history):
            self.hist.add(screen)

        num_game, self.update_count, ep_reward = 0, 0, 0.
        total_reward, self.total_loss, self.total_q = 0., 0., 0.
        ep_rewards, actions = [], []

        # for self.step in xrange(50000000):
        for self.step in tqdm(range(0, 50000000), ncols=70, initial=0):
            if self.step == self._learn_st:
                num_game, self.update_count, ep_reward = 0, 0, 0.
                total_reward, self.total_loss, self.total_q = 0., 0., 0.
                ep_rewards, actions = [], []

            action = self._select_action()
            screen, reward, terminal = self.env.act(action)
            self.observe(screen, reward, action, terminal)

            if terminal:
                screen, reward, action, terminal = self.env.new_random_game()
                num_game += 1
                ep_rewards.append(ep_reward)
                ep_reward = 0.
            else:
                ep_reward += reward

            actions.append(action)
            total_reward += reward

            if self.step >= self._learn_st:
                if self.step % 10000 == 10000 - 1:
                    avg_reward = total_reward / 10000.
                    avg_loss = self.total_loss / self.update_count
                    avg_q = self.total_q / self.update_count
                    print('# games: {}, reward: {}, loss: {}, q: {}'.format(
                        num_game, avg_reward, avg_loss, avg_q))

                    num_game = 0
                    total_reward = 0.
                    self.total_loss = 0.
                    self.total_q = 0.
                    self.update_count = 0
                    ep_reward = 0.
                    ep_rewards = []
                    actions = []

    def observe(self, screen, reward, action, terminal):
        # clip rewards to [-1, 1] as in the DQN paper
        reward = max(-1., min(1., reward))

        self.hist.add(screen)
        self.mem.add(screen, reward, action, terminal)

        if self.step > self._learn_st:
            if self.step % self._tr_freq == 0:
                self._q_learning()
                # print('{} q-learning'.format(self.step))
            if self.step % self._update_freq == self._update_freq - 1:
                self.target_q.load_state_dict(self.q.state_dict())
            if self.step % (self._update_freq * 10) == (self._update_freq * 10) - 1:
                torch.save(self.target_q, 'models1/model_{}'.format(self.step))
                # print('update')

    def play(self, model_path, num_ep=200):
        self.q = torch.load(model_path)
        best_reward = 0
        best_screen_hist = []

        for ep in range(num_ep):
            print('# episode: {}'.format(ep))
            screen, reward, action, terminal = self.env.new_random_game(force=True)
            current_reward = 0
            current_screen_hist = []
            act_hist = []
            current_screen_hist.append(self.env.screen)

            for _ in range(self.env._history):
                self.hist.add(screen)

            cnt = 0
            while not terminal:
                cnt += 1
                action = self._select_action(test_mode=True)
                act_hist.append(action)
                if cnt > 200:
                    # avoid getting stuck repeating the same action (local maximum)
                    if np.array(act_hist[-100:]).mean() == act_hist[-1]:
                        action = random.randrange(self.env.action_size)
                screen, reward, terminal = self.env.act(action, is_train=False)
                self.hist.add(screen)
                current_reward += reward
                # print(cnt, action, current_reward, terminal, self.env.lives)
                current_screen_hist.append(self.env.screen)

            print(current_reward)
            print('count: {}'.format(cnt))

            if current_reward > best_reward:
                best_reward = current_reward
                best_screen_hist = current_screen_hist

        import imageio
        print('best reward: {}'.format(best_reward))
        imageio.mimsave('movies_play/best_{}.gif'.format(best_reward),
                        best_screen_hist, 'GIF', duration=0.0001)

    def _q_learning(self):
        sc_t, actions, rewards, sc_t_1, terminals = self.mem.sample()

        batch_obs_t = self._to_tensor(sc_t)
        batch_obs_t_1 = self._to_tensor(sc_t_1, volatile=True)
        batch_rewards = self._to_tensor(rewards).unsqueeze(1)
        batch_actions = self._to_tensor(
            actions, data_type=torch.cuda.LongTensor).unsqueeze(1)
        batch_terminals = self._to_tensor(1. - terminals).unsqueeze(1)

        q_dash = self.q(batch_obs_t)  # only used for the shape debug below
        # print('shape_q: {}'.format(q_dash.shape))
        q_values = self.q(batch_obs_t).gather(1, batch_actions)
        next_max_q_values = self.target_q(batch_obs_t_1).max(1)[0].unsqueeze(1)
        next_q_values = batch_terminals * next_max_q_values
        target_q_values = batch_rewards + (0.99 * next_q_values)
        target_q_values.volatile = False

        cri = torch.nn.SmoothL1Loss()
        self.loss = cri(q_values, target_q_values)

        self.optim.zero_grad()
        self.loss.backward()
        self.optim.step()

        self.update_count += 1
        self.total_q += q_values.data.mean()
        self.total_loss += self.loss.data.mean()

    def _select_action(self, test_mode=False):
        # epsilon greedy policy
        if not test_mode:
            ep = self._ep_en + max(
                0., (self._ep_st - self._ep_en) *
                (self._capa - max(0., self.step - self._learn_st)) / self._capa)
        else:
            ep = -1.

        if random.random() < ep:
            action = random.randrange(self.env.action_size)
        else:
            inputs = self._to_tensor(self.hist.get)
            pred = self.q(inputs.unsqueeze(0))
            action = pred.data.max(1)[1][0]  # actual = pred.data.max(1)[1][0][0]
        return action

    def _to_tensor(self, ndarray, volatile=False, data_type=dtype):
        return Variable(torch.from_numpy(ndarray),
                        volatile=volatile).type(data_type)
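# --- Hedged usage sketch (not part of the original source) ---
# Shows how the Agent above might be constructed and run; the environment name
# and hyperparameter values are common DQN defaults, not taken from the
# original configuration.
from types import SimpleNamespace

conf = SimpleNamespace(
    env='BreakoutDeterministic-v4',     # assumed environment name
    width=84, height=84, history=4,     # 84x84 frames, 4-frame history stack
    mem_capacity=1000000, batch_size=32,
    ep_start=1.0, ep_end=0.1,           # epsilon annealed from 1.0 to 0.1
    learn_start=50000,                  # warm-up steps before learning starts
    train_freq=4, update_freq=10000,    # update every 4 steps, sync target every 10k
)

agent = Agent(conf)
agent.train()  # or agent.play('<path-to-saved-model>') for evaluation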
GAMMA = 0.999
EPS_START = 0.9
EPS_END = 0.05
EPS_DECAY = 200
TARGET_UPDATE = 10

# Get screen size so that we can initialize layers correctly based on shape
# returned from OpenAI Gym. Typical dimensions at this point are close to
# 3x40x90, which is the result of a clamped and down-scaled render buffer
# in get_screen()
init_screen = get_screen()
_, _, screen_height, screen_width = init_screen.shape

# Get number of actions from gym action space
n_actions = env.action_space.n

policy_net = DQN(screen_height, screen_width, n_actions).to(device)
target_net = DQN(screen_height, screen_width, n_actions).to(device)
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

optimizer = optim.RMSprop(policy_net.parameters())
memory = ReplayMemory(10000)

steps_done = 0


def select_action(state):
    global steps_done
    sample = random.random()
    eps_threshold = EPS_END + (EPS_START - EPS_END) * \
        math.exp(-1. * steps_done / EPS_DECAY)
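    # NOTE: the snippet above ends mid-function. What follows is a plausible
    # completion of the epsilon-greedy selection (an assumption, not the
    # original code); it assumes `state` is a batched 1xCxHxW tensor already
    # on `device`.
    steps_done += 1
    if sample > eps_threshold:
        with torch.no_grad():
            # exploit: pick the action with the largest predicted Q value
            return policy_net(state).max(1)[1].view(1, 1)
    else:
        # explore: sample a random action uniformly
        return torch.tensor([[random.randrange(n_actions)]],
                            device=device, dtype=torch.long)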