import collections
import random
import shutil
from pathlib import Path

import gym
import numpy as np
import tensorflow as tf
from tensorflow.keras.optimizers import Adam


class Learner:
    """Central learner for distributed DQN with prioritized replay."""

    def __init__(self, gamma, env_name):
        self.env_name = env_name
        self.action_space = gym.make(self.env_name).action_space.n
        self.q_network = QNetwork(self.action_space)
        self.target_q_network = QNetwork(self.action_space)
        self.gamma = gamma
        self.optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)

    def define_network(self):
        """Build both networks with a dummy forward pass and sync the target network."""
        env = gym.make(self.env_name)
        state = env.reset()
        self.q_network(np.atleast_2d(state))
        self.target_q_network(np.atleast_2d(state))
        self.target_q_network.set_weights(self.q_network.get_weights())
        current_weights = self.q_network.get_weights()
        return current_weights

    def update_network(self, minibatchs):
        """Run one gradient step per minibatch and return the updated TD errors."""
        indices_all = []
        td_errors_all = []

        for (indices, weights, transitions) in minibatchs:
            states, actions, rewards, next_states, dones = zip(*transitions)

            states = np.vstack(states)
            actions = np.array(actions)
            rewards = np.vstack(rewards)
            next_states = np.vstack(next_states)
            dones = np.vstack(dones)

            next_qvalues = self.q_network(next_states)
            next_actions = tf.cast(tf.argmax(next_qvalues, axis=1), tf.int32)
            next_actions_onehot = tf.one_hot(next_actions, self.action_space)
            next_maxQ = tf.reduce_sum(
                next_qvalues * next_actions_onehot, axis=1, keepdims=True)

            TQ = rewards + self.gamma * (1 - dones) * next_maxQ

            with tf.GradientTape() as tape:
                qvalues = self.q_network(states)
                actions_onehot = tf.one_hot(actions, self.action_space)
                Q = tf.reduce_sum(qvalues * actions_onehot, axis=1, keepdims=True)
                td_errors = tf.square(TQ - Q)
                # Importance-sampling weights from the prioritized buffer
                loss = tf.reduce_mean(weights * td_errors)

            grads = tape.gradient(loss, self.q_network.trainable_variables)
            grads, _ = tf.clip_by_global_norm(grads, 40.0)
            self.optimizer.apply_gradients(
                zip(grads, self.q_network.trainable_variables))

            indices_all += indices
            td_errors_all += td_errors.numpy().flatten().tolist()

        current_weights = self.q_network.get_weights()
        return current_weights, indices_all, td_errors_all
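
# --- Usage sketch (illustrative only) ---------------------------------------
# A minimal driver for the Learner above, assuming a QNetwork class and a
# prioritized replay buffer exist elsewhere.  The env name, the buffer methods
# (sample_minibatch, update_priorities) and the loop bounds are placeholders,
# not part of the original code.
#
# learner = Learner(gamma=0.99, env_name="CartPole-v1")
# current_weights = learner.define_network()
# for _ in range(num_updates):                        # num_updates is hypothetical
#     minibatchs = [replay_buffer.sample_minibatch()  # -> (indices, weights, transitions)
#                   for _ in range(16)]
#     current_weights, indices, td_errors = learner.update_network(minibatchs)
#     replay_buffer.update_priorities(indices, td_errors)
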
class Agent:
    """Interacts with and learns from the environment."""

    def __init__(self, state_shape, action_size, buffer_size, batch_size,
                 gamma, tau, learning_rate, update_every, seed):
        """Initialize an Agent object.

        Params
        ======
            state_shape (tuple): shape of each state
            action_size (int): dimension of each action
            buffer_size (int): replay buffer size
            batch_size (int): minibatch size
            gamma (float): discount factor
            tau (float): used for soft update of target parameters
            learning_rate (float): learning rate
            update_every (int): how many steps between network updates
            seed (int): random seed
        """
        self.state_shape = state_shape
        self.action_size = action_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.update_every = update_every
        self.seed = seed
        random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(action_size)
        self.qnetwork_target = QNetwork(action_size)
        # self.optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
        self.optimizer = tf.keras.optimizers.RMSprop(learning_rate=0.00025,
                                                     momentum=0.95)
        self.loss_fn = tf.keras.losses.MeanSquaredError()

        # Replay memory
        self.memory = ReplayBuffer(action_size, buffer_size, batch_size, seed)
        # Initialize time step (for updating every self.update_every steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        """Add a new experience to the replay buffer and learn from a sampled subset.

        Params
        ======
            state (array_like): initial state
            action (int): chosen action
            reward (float): reward received
            next_state (array_like): next state
            done (bool): True if the episode is finished
        """
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every self.update_every time steps.
        self.t_step = (self.t_step + 1) % self.update_every
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > self.batch_size:
                experiences = self.memory.sample()
                self.learn(experiences)

    def act(self, state, epsilon=0.):
        """Returns an action for the given state as per the current policy.

        Params
        ======
            state (array_like): current state (already batched)
            epsilon (float): epsilon, for epsilon-greedy action selection
        """
        # state = tf.expand_dims(state, axis=0)
        action_values = self.qnetwork_local(state)

        # Epsilon-greedy action selection
        if random.random() > epsilon:
            return np.argmax(action_values.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences):
        """Update value parameters using the given batch of experience tuples.

        Params
        ======
            experiences (tuple): batch of (s, a, r, s', done) tensors
        """
        states, actions, rewards, next_states, dones = experiences

        # Get max predicted Q values (for next states) from target model
        Q_targets_next = self.qnetwork_target(next_states)
        Q_targets_next = tf.math.reduce_max(Q_targets_next, axis=1)
        Q_targets_next = tf.expand_dims(Q_targets_next, axis=1)

        # Compute Q targets for current states
        Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones))

        with tf.GradientTape() as tape:
            # Get expected Q values from local model
            Q_expected = self.qnetwork_local(states)
            Q_expected = tf.gather(Q_expected, indices=actions, axis=1, batch_dims=1)
            loss = self.loss_fn(y_true=Q_targets, y_pred=Q_expected)

        gradients = tape.gradient(loss, self.qnetwork_local.trainable_weights)
        self.optimizer.apply_gradients(
            zip(gradients, self.qnetwork_local.trainable_weights))

        # ------------------- update target network ------------------- #
        # self.soft_update()

    def hard_update(self):
        """Copy the local network weights into the target network."""
        self.qnetwork_target.set_weights(self.qnetwork_local.get_weights())

    def soft_update(self):
        """Soft update of target model parameters:
        θ_target = τ * θ_local + (1 - τ) * θ_target
        """
        for target_layer, local_layer in zip(self.qnetwork_target.layers,
                                             self.qnetwork_local.layers):
            if target_layer.trainable:
                for target_w, local_w in zip(target_layer.trainable_weights,
                                             local_layer.trainable_weights):
                    # assign() updates the variable in place; rebinding the list
                    # element would leave the layer's weights unchanged.
                    target_w.assign(self.tau * local_w +
                                    (1.0 - self.tau) * target_w)
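
# --- Usage sketch (illustrative only) ---------------------------------------
# How this Agent is typically driven, assuming QNetwork and ReplayBuffer are
# defined elsewhere; the env name, hyperparameters and step budget below are
# placeholders.  Note that act() expects a batched state, hence expand_dims.
#
# env = gym.make("CartPole-v1")
# agent = Agent(state_shape=env.observation_space.shape,
#               action_size=env.action_space.n,
#               buffer_size=100000, batch_size=64, gamma=0.99, tau=1e-3,
#               learning_rate=5e-4, update_every=4, seed=0)
# state = env.reset()
# for t in range(max_steps):                        # max_steps is hypothetical
#     action = agent.act(np.expand_dims(state, 0), epsilon=0.1)
#     next_state, reward, done, _ = env.step(action)
#     agent.step(state, action, reward, next_state, done)
#     state = next_state
#     if done:
#         break
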
class Agent:
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, buffer_size, batch_size,
                 gamma, tau, lr, update_every):
        """Initialize an Agent object.

        Args:
            state_size: Integer. Dimension of each state
            action_size: Integer. Dimension of each action
            buffer_size: Integer. Replay buffer size
            batch_size: Integer. Mini-batch size
            gamma: Float. Discount factor
            tau: Float. For soft update of target parameters
            lr: Float. Learning rate
            update_every: Integer. How often to update the network
        """
        # Environment parameters
        self.state_size = state_size
        self.action_size = action_size

        # Q-Learning
        self.gamma = gamma

        # Q-Network
        self.model_local = QNetwork(state_size, action_size)
        self.model_target = QNetwork(state_size, action_size)
        self.optimizer = tf.keras.optimizers.Adam(learning_rate=lr)
        self.loss_fn = tf.keras.losses.MeanSquaredError(name="mse")
        self.tau = tau
        self.update_every = update_every
        self.batch_size = batch_size

        # Replay memory
        self.memory = ReplayBuffer(action_size, buffer_size, batch_size)
        # Initialize time step (for updating every update_every steps)
        self.t_step = 0

    def __str__(self):
        return 'RL_Agent_Class'

    def __repr__(self):
        return 'RL_Agent_Class'

    def step(self, state, action, reward, next_state, done):
        """Save the experience in the buffer and trigger learning every update_every steps.

        Args:
            state: The previous state of the environment
            action: Integer. Previous action selected by the agent
            reward: Float. Reward value
            next_state: The current state of the environment
            done: Boolean. Whether the episode is complete
        """
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every update_every time steps.
        self.t_step = (self.t_step + 1) % self.update_every
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > self.batch_size:
                experiences = self.memory.sample()
                self.learn(experiences)

    def act(self, state, eps=0.):
        """Returns an action for the given state as per the current policy.

        Args:
            state: An array-like object or list with a single state
            eps: Float. Epsilon, for epsilon-greedy action selection

        Returns:
            An action selected greedily by the network or at random by the
            epsilon-greedy method
        """
        # Reshape state into a batch of one
        state = np.expand_dims(state, 0)

        # Predict action values
        action_values = self.model_local(state)

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values)
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences):
        """Update value parameters using the given batch of experience tuples.

        Args:
            experiences: Tuple. Content of tuple (s, a, r, s', done)
        """
        states, actions, rewards, next_states, dones = experiences

        # Create a one-hot mask for the taken actions
        mask = tf.one_hot(actions.reshape(-1), self.action_size)

        with tf.GradientTape() as tape:
            # Get expected Q values from local model
            q_expected = tf.reduce_sum(self.model_local(states) * mask,
                                       axis=1, keepdims=True)

            # Get max predicted Q values (for next states) from target model
            q_targets_next = tf.reduce_max(
                self.model_target(next_states, training=True),
                axis=1, keepdims=True)

            # Compute Q targets for current states
            q_targets = rewards + self.gamma * q_targets_next * (1.0 - dones)

            # Compute loss
            loss = self.loss_fn(q_targets, q_expected)

        # Minimize the loss
        gradients = tape.gradient(loss, self.model_local.trainable_variables)
        self.optimizer.apply_gradients(
            zip(gradients, self.model_local.trainable_variables))

        # Update target network
        self.soft_update()

    def soft_update(self):
        """Soft update model parameters.

        The model is updated using:
            θ_target = τ * θ_local + (1 - τ) * θ_target
        """
        # Instantiate weight list
        new_weights = []

        # Apply soft update, mixing local and target weights
        for local_w, target_w in zip(self.model_local.get_weights(),
                                     self.model_target.get_weights()):
            new_weights.append(self.tau * local_w + (1.0 - self.tau) * target_w)

        # Set new weights on the target network
        self.model_target.set_weights(new_weights)
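
# --- Soft-update sanity check (illustrative only) ---------------------------
# A quick way to verify the soft update above: after one call, every target
# weight should equal tau * local + (1 - tau) * target.  QNetwork and the
# constructor arguments are assumed from elsewhere; if QNetwork does not build
# its variables in __init__, call both models on a dummy batch first.
#
# agent = Agent(state_size=8, action_size=4, buffer_size=int(1e5),
#               batch_size=64, gamma=0.99, tau=0.1, lr=5e-4, update_every=4)
# before_local = agent.model_local.get_weights()
# before_target = agent.model_target.get_weights()
# agent.soft_update()
# after_target = agent.model_target.get_weights()
# for lw, tw, nw in zip(before_local, before_target, after_target):
#     np.testing.assert_allclose(nw, 0.1 * lw + 0.9 * tw, rtol=1e-5)
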
class DQNAgent:

    def __init__(self,
                 env_name="BreakoutDeterministic-v4",
                 gamma=0.99,
                 batch_size=32,
                 lr=0.00025,
                 update_period=4,
                 target_update_period=10000,
                 n_frames=4):
        self.env_name = env_name
        self.gamma = gamma
        self.batch_size = batch_size
        # Linear annealing from 1.0 to 0.1 over the first one million steps
        self.epsilon_scheduler = (
            lambda steps: max(1.0 - 0.9 * steps / 1000000, 0.1))
        self.update_period = update_period
        self.target_update_period = target_update_period

        env = gym.make(self.env_name)
        self.action_space = env.action_space.n
        self.qnet = QNetwork(self.action_space)
        self.target_qnet = QNetwork(self.action_space)
        self.optimizer = Adam(learning_rate=lr, epsilon=0.01 / self.batch_size)
        self.n_frames = n_frames
        self.use_reward_clipping = True
        self.huber_loss = tf.keras.losses.Huber()

    def learn(self, n_episodes, buffer_size=1000000, logdir="log"):
        logdir = Path(__file__).parent / logdir
        if logdir.exists():
            shutil.rmtree(logdir)
        self.summary_writer = tf.summary.create_file_writer(str(logdir))

        self.replay_buffer = ReplayBuffer(max_len=buffer_size)

        steps = 0
        for episode in range(1, n_episodes + 1):
            env = gym.make(self.env_name)
            frame = preprocess_frame(env.reset())
            frames = collections.deque([frame] * self.n_frames,
                                       maxlen=self.n_frames)

            episode_rewards = 0
            episode_steps = 0
            done = False
            lives = 5
            while not done:
                steps, episode_steps = steps + 1, episode_steps + 1
                epsilon = self.epsilon_scheduler(steps)

                state = np.stack(frames, axis=2)[np.newaxis, ...]
                action = self.qnet.sample_action(state, epsilon=epsilon)
                next_frame, reward, done, info = env.step(action)
                episode_rewards += reward
                frames.append(preprocess_frame(next_frame))
                next_state = np.stack(frames, axis=2)[np.newaxis, ...]

                #: Store a lost life as a terminal transition
                if info["ale.lives"] != lives:
                    lives = info["ale.lives"]
                    transition = (state, action, reward, next_state, True)
                else:
                    transition = (state, action, reward, next_state, done)
                self.replay_buffer.push(transition)

                if len(self.replay_buffer) > 50000:
                    if steps % self.update_period == 0:
                        loss = self.update_network()
                        with self.summary_writer.as_default():
                            tf.summary.scalar("loss", loss, step=steps)
                            tf.summary.scalar("epsilon", epsilon, step=steps)
                            tf.summary.scalar("buffer_size",
                                              len(self.replay_buffer),
                                              step=steps)
                            tf.summary.scalar("train_score", episode_rewards,
                                              step=steps)
                            tf.summary.scalar("train_steps", episode_steps,
                                              step=steps)

                    if steps % self.target_update_period == 0:
                        self.target_qnet.set_weights(self.qnet.get_weights())

                if done:
                    break

            print(
                f"Episode: {episode}, score: {episode_rewards}, steps: {episode_steps}"
            )

            if episode % 20 == 0:
                test_scores, test_steps = self.test_play(n_testplay=1)
                with self.summary_writer.as_default():
                    tf.summary.scalar("test_score", test_scores[0], step=steps)
                    tf.summary.scalar("test_step", test_steps[0], step=steps)

            if episode % 1000 == 0:
                self.qnet.save_weights("checkpoints/qnet")

    def update_network(self):
        #: Build a minibatch
        (states, actions, rewards,
         next_states, dones) = self.replay_buffer.get_minibatch(self.batch_size)

        if self.use_reward_clipping:
            rewards = np.clip(rewards, -1, 1)

        next_actions, next_qvalues = self.target_qnet.sample_actions(next_states)
        next_actions_onehot = tf.one_hot(next_actions, self.action_space)
        max_next_qvalues = tf.reduce_sum(next_qvalues * next_actions_onehot,
                                         axis=1, keepdims=True)

        target_q = rewards + self.gamma * (1 - dones) * max_next_qvalues

        with tf.GradientTape() as tape:
            qvalues = self.qnet(states)
            actions_onehot = tf.one_hot(actions.flatten().astype(np.int32),
                                        self.action_space)
            q = tf.reduce_sum(qvalues * actions_onehot, axis=1, keepdims=True)
            loss = self.huber_loss(target_q, q)

        grads = tape.gradient(loss, self.qnet.trainable_variables)
        self.optimizer.apply_gradients(
            zip(grads, self.qnet.trainable_variables))

        return loss

    def test_play(self, n_testplay=1, monitor_dir=None, checkpoint_path=None):

        if checkpoint_path:
            env = gym.make(self.env_name)
            frame = preprocess_frame(env.reset())
            frames = collections.deque([frame] * self.n_frames,
                                       maxlen=self.n_frames)
            state = np.stack(frames, axis=2)[np.newaxis, ...]
            # Build the network variables with a dummy forward pass before loading
            self.qnet(state)
            self.qnet.load_weights(checkpoint_path)

        if monitor_dir:
            monitor_dir = Path(monitor_dir)
            if monitor_dir.exists():
                shutil.rmtree(monitor_dir)
            monitor_dir.mkdir()
            env = gym.wrappers.Monitor(gym.make(self.env_name),
                                       monitor_dir,
                                       force=True,
                                       video_callable=(lambda ep: True))
        else:
            env = gym.make(self.env_name)

        scores = []
        steps = []
        for _ in range(n_testplay):
            frame = preprocess_frame(env.reset())
            frames = collections.deque([frame] * self.n_frames,
                                       maxlen=self.n_frames)

            done = False
            episode_steps = 0
            episode_rewards = 0

            while not done:
                state = np.stack(frames, axis=2)[np.newaxis, ...]
                action = self.qnet.sample_action(state, epsilon=0.05)
                next_frame, reward, done, _ = env.step(action)
                frames.append(preprocess_frame(next_frame))

                episode_rewards += reward
                episode_steps += 1
                if episode_steps > 500 and episode_rewards < 3:
                    #: Give up on episodes that stall because the game is never
                    #: started (the FIRE action, action 0, is never taken)
                    break

            scores.append(episode_rewards)
            steps.append(episode_steps)

        return scores, steps
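
# --- Usage sketch (illustrative only) ---------------------------------------
# Driving the DQNAgent above end to end.  QNetwork, ReplayBuffer and
# preprocess_frame are assumed to be defined elsewhere; the episode count and
# output directories below are placeholders.
#
# agent = DQNAgent(env_name="BreakoutDeterministic-v4", gamma=0.99,
#                  batch_size=32, n_frames=4)
# agent.learn(n_episodes=5000, buffer_size=1000000, logdir="log")
# scores, steps = agent.test_play(n_testplay=3,
#                                 monitor_dir="videos",
#                                 checkpoint_path="checkpoints/qnet")
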