import collections
import shutil
from pathlib import Path

import gym
import numpy as np
import tensorflow as tf
from tensorflow.keras.optimizers import Adam

# QNetwork, ReplayBuffer and preprocess_frame are defined elsewhere in this module.


class DQNAgent:

    def __init__(self, env_name="BreakoutDeterministic-v4", gamma=0.99,
                 batch_size=32, lr=0.00025, update_period=4,
                 target_update_period=10000, n_frames=4):
        self.env_name = env_name
        self.gamma = gamma
        self.batch_size = batch_size
        #: Linearly anneal epsilon from 1.0 to 0.1 over the first 1M steps
        self.epsilon_scheduler = (
            lambda steps: max(1.0 - 0.9 * steps / 1000000, 0.1))
        self.update_period = update_period
        self.target_update_period = target_update_period

        env = gym.make(self.env_name)
        self.action_space = env.action_space.n

        self.qnet = QNetwork(self.action_space)
        self.target_qnet = QNetwork(self.action_space)
        self.optimizer = Adam(lr=lr, epsilon=0.01 / self.batch_size)

        self.n_frames = n_frames
        self.use_reward_clipping = True
        self.huber_loss = tf.keras.losses.Huber()

    def learn(self, n_episodes, buffer_size=1000000, logdir="log"):

        logdir = Path(__file__).parent / logdir
        if logdir.exists():
            shutil.rmtree(logdir)
        self.summary_writer = tf.summary.create_file_writer(str(logdir))

        self.replay_buffer = ReplayBuffer(max_len=buffer_size)

        steps = 0
        for episode in range(1, n_episodes + 1):
            env = gym.make(self.env_name)

            frame = preprocess_frame(env.reset())
            frames = collections.deque([frame] * self.n_frames,
                                       maxlen=self.n_frames)

            episode_rewards = 0
            episode_steps = 0
            done = False
            lives = 5

            while not done:
                steps, episode_steps = steps + 1, episode_steps + 1
                epsilon = self.epsilon_scheduler(steps)

                state = np.stack(frames, axis=2)[np.newaxis, ...]
                action = self.qnet.sample_action(state, epsilon=epsilon)
                next_frame, reward, done, info = env.step(action)
                episode_rewards += reward

                frames.append(preprocess_frame(next_frame))
                next_state = np.stack(frames, axis=2)[np.newaxis, ...]

                #: Store a lost life as a terminal transition
                if info["ale.lives"] != lives:
                    lives = info["ale.lives"]
                    transition = (state, action, reward, next_state, True)
                else:
                    transition = (state, action, reward, next_state, done)

                self.replay_buffer.push(transition)

                if len(self.replay_buffer) > 50000:
                    if steps % self.update_period == 0:
                        loss = self.update_network()

                        with self.summary_writer.as_default():
                            tf.summary.scalar("loss", loss, step=steps)
                            tf.summary.scalar("epsilon", epsilon, step=steps)
                            tf.summary.scalar("buffer_size",
                                              len(self.replay_buffer), step=steps)
                            tf.summary.scalar("train_score", episode_rewards,
                                              step=steps)
                            tf.summary.scalar("train_steps", episode_steps,
                                              step=steps)

                    if steps % self.target_update_period == 0:
                        self.target_qnet.set_weights(self.qnet.get_weights())

                if done:
                    break

            print(f"Episode: {episode}, score: {episode_rewards}, "
                  f"steps: {episode_steps}")

            if episode % 20 == 0:
                test_scores, test_steps = self.test_play(n_testplay=1)
                with self.summary_writer.as_default():
                    tf.summary.scalar("test_score", test_scores[0], step=steps)
                    tf.summary.scalar("test_step", test_steps[0], step=steps)

            if episode % 1000 == 0:
                self.qnet.save_weights("checkpoints/qnet")

    def update_network(self):

        #: Sample a minibatch from the replay buffer
        (states, actions, rewards,
         next_states, dones) = self.replay_buffer.get_minibatch(self.batch_size)

        if self.use_reward_clipping:
            rewards = np.clip(rewards, -1, 1)

        next_actions, next_qvalues = self.target_qnet.sample_actions(next_states)
        next_actions_onehot = tf.one_hot(next_actions, self.action_space)
        max_next_qvalues = tf.reduce_sum(next_qvalues * next_actions_onehot,
                                         axis=1, keepdims=True)

        target_q = rewards + self.gamma * (1 - dones) * max_next_qvalues

        with tf.GradientTape() as tape:
            qvalues = self.qnet(states)
            actions_onehot = tf.one_hot(actions.flatten().astype(np.int32),
                                        self.action_space)
            q = tf.reduce_sum(qvalues * actions_onehot, axis=1, keepdims=True)
            loss = self.huber_loss(target_q, q)

        grads = tape.gradient(loss, self.qnet.trainable_variables)
        self.optimizer.apply_gradients(
            zip(grads, self.qnet.trainable_variables))

        return loss

    def test_play(self, n_testplay=1, monitor_dir=None, checkpoint_path=None):

        if checkpoint_path:
            #: Build the network once so that the weights can be loaded
            env = gym.make(self.env_name)
            frame = preprocess_frame(env.reset())
            frames = collections.deque([frame] * self.n_frames,
                                       maxlen=self.n_frames)
            state = np.stack(frames, axis=2)[np.newaxis, ...]
            self.qnet(state)
            self.qnet.load_weights(checkpoint_path)

        if monitor_dir:
            monitor_dir = Path(monitor_dir)
            if monitor_dir.exists():
                shutil.rmtree(monitor_dir)
            monitor_dir.mkdir()
            env = gym.wrappers.Monitor(gym.make(self.env_name), monitor_dir,
                                       force=True,
                                       video_callable=(lambda ep: True))
        else:
            env = gym.make(self.env_name)

        scores = []
        steps = []
        for _ in range(n_testplay):

            frame = preprocess_frame(env.reset())
            frames = collections.deque([frame] * self.n_frames,
                                       maxlen=self.n_frames)

            done = False
            episode_steps = 0
            episode_rewards = 0

            while not done:
                state = np.stack(frames, axis=2)[np.newaxis, ...]
                action = self.qnet.sample_action(state, epsilon=0.05)
                next_frame, reward, done, _ = env.step(action)
                frames.append(preprocess_frame(next_frame))

                episode_rewards += reward
                episode_steps += 1
                if episode_steps > 500 and episode_rewards < 3:
                    #: Guard against episodes that stall because the
                    #: game-start action (action: 0) is never taken
                    break

            scores.append(episode_rewards)
            steps.append(episode_steps)

        return scores, steps
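
# --- Usage sketch (illustrative addition, not part of the original file) ---
# A minimal sketch of how this agent could be trained and then evaluated.
# It assumes QNetwork, ReplayBuffer and preprocess_frame are available in
# this module and that the Breakout ROM is installed; the episode counts
# below are arbitrary placeholders.
if __name__ == "__main__":
    agent = DQNAgent(env_name="BreakoutDeterministic-v4")
    agent.learn(n_episodes=5001)                 # TensorBoard logs go to ./log
    scores, steps = agent.test_play(
        n_testplay=3,
        monitor_dir="mp4",                       # record evaluation videos here
        checkpoint_path="checkpoints/qnet")      # weights saved by learn()
    print(scores, steps)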
import random

import numpy as np
import torch
import torch.nn.functional as F
import torch.optim as optim

# QNetwork and ReplayBuffer are defined elsewhere in this package.


class Agent(object):
    """DQN Agent that interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, device,
                 replay_buffer_size=int(1e5), batch_size=64,
                 discount_factor=0.99, soft_update=1e-3,
                 learning_rate=5e-4, update_every=4, **kwargs):
        """Initializes the DQN agent.

        Args:
            state_size (int): Dimension of each state
            action_size (int): Dimension of each action
            device (torch.device): Device to use for calculations
            replay_buffer_size (int): Size of replay buffer
            batch_size (int): Size of experience batches during training
            discount_factor (float): Discount factor (gamma)
            soft_update (float): Soft update coefficient (tau)
            learning_rate (float): Learning rate (alpha)
            update_every (int): Steps between updating the network
            **kwargs: Arguments describing the QNetwork
        """
        self.state_size = state_size
        """Dimension of each state"""

        self.action_size = action_size
        """Dimension of each action"""

        self.device = device
        """Device to use for calculations"""

        # Parameters
        self.batch_size = batch_size
        """Size of experience batches during training"""

        self.discount_factor = discount_factor
        """Discount factor (gamma)"""

        self.soft_update = soft_update
        """Soft update coefficient (tau)"""

        self.update_every = update_every
        """Steps between updating the network"""

        # Q-Networks
        self.target_network = QNetwork(state_size, action_size, **kwargs) \
            .to(device)
        """Target Q-Network"""

        self.local_network = QNetwork(state_size, action_size, **kwargs) \
            .to(device)
        """Local Q-Network"""

        self.optimizer = optim.Adam(self.local_network.parameters(),
                                    lr=learning_rate)
        """Optimizer used when training the Q-network."""

        # Memory
        self.memory = ReplayBuffer(replay_buffer_size, batch_size, device)
        """Experience replay buffer"""

        # Time step
        self.t_step = 0
        """Current time step"""

    def save_weights(self, path):
        """Saves local network weights.

        Args:
            path (string): File to save to
        """
        self.local_network.save_weights(path)

    def load_weights(self, path):
        """Loads local network weights.

        Args:
            path (string): File to load weights from
        """
        self.local_network.load_weights(path)

    def act(self, state, eps=0.):
        """Returns the action for the given state according to the current policy.

        Args:
            state (np.ndarray): Current state
            eps (float): Probability of selecting a random action (epsilon)

        Returns:
            int: Epsilon-greedily selected action
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(self.device)

        # Temporarily set evaluation mode (no dropout &c) & turn off autograd
        self.local_network.eval()
        with torch.no_grad():
            action_values = self.local_network(state)
        self.local_network.train()

        # Select action epsilon-greedily
        if random.random() > eps:
            return np.argmax(action_values.cpu().detach().numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def step(self, state, action, reward, next_state, done):
        """Saves the experience and learns if an update is due.

        Args:
            state (Tensor): Current state
            action (int): Chosen action
            reward (float): Resulting reward
            next_state (Tensor): State after action
            done (bool): True if terminal state
        """
        self.memory.add(state, action, reward, next_state, done)

        # Learn every update_every steps
        self.t_step = (self.t_step + 1) % self.update_every
        if self.t_step == 0:
            # Check that we have enough stored experiences
            if len(self.memory) > self.batch_size:
                experiences = self.memory.sample()
                self.learn(experiences)

    def learn(self, experiences):
        """Updates the Q-network using the given experiences.

        Args:
            experiences (Tuple[Tensor, Tensor, Tensor, Tensor, Tensor]):
                SARS'+done tuple
        """
        states, actions, rewards, next_states, dones = experiences

        # Predicted Q values from target model for next states
        # (NB. torch.max returns the tuple (max, argmax))
        q_target_next = self.target_network(next_states).max(dim=1,
                                                             keepdim=True)[0]

        # Computed target Q values for the current states
        q_target = rewards + self.discount_factor * q_target_next * (1 - dones)

        # Predicted Q values from local model for the current states
        q_local = self.local_network(states).gather(dim=1, index=actions)

        loss = F.mse_loss(q_local, q_target)

        # Update local network weights
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Update target network
        soft_update(self.local_network, self.target_network, self.soft_update)
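
# --- Sketches below are illustrative additions, not part of the original file ---

# learn() calls a module-level soft_update() helper that is not shown above.
# A minimal sketch of the Polyak averaging it is assumed to perform,
# i.e. theta_target <- tau * theta_local + (1 - tau) * theta_target:
def soft_update(local_network, target_network, tau):
    """Blends local network parameters into the target network parameters."""
    for target_param, local_param in zip(target_network.parameters(),
                                         local_network.parameters()):
        target_param.data.copy_(
            tau * local_param.data + (1.0 - tau) * target_param.data)


# A hypothetical interaction loop with a gym environment; the environment
# name, epsilon schedule and episode count are assumptions for illustration:
#
#   import gym
#   env = gym.make("LunarLander-v2")
#   agent = Agent(state_size=env.observation_space.shape[0],
#                 action_size=env.action_space.n,
#                 device=torch.device("cpu"))
#   for episode in range(500):
#       state, eps = env.reset(), max(0.995 ** episode, 0.01)
#       done = False
#       while not done:
#           action = agent.act(state, eps)
#           next_state, reward, done, _ = env.step(action)
#           agent.step(state, action, reward, next_state, done)
#           state = next_state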