class TrainDQN:
    def __init__(self,
                 env,
                 sess,
                 learning_rate=1e-3,
                 seed=1234,
                 gamma=0.99,
                 max_eps=1.0,
                 min_eps=0.1,
                 render=False,
                 print_freq=20,
                 load_path=None,
                 save_path=None,
                 batch_size=32,
                 log_dir='logs/train',
                 max_steps=100000,
                 buffer_capacity=None,
                 max_episode_len=2000,
                 eps_decay_rate=-0.0001,
                 target_update_freq=1000,
                 ):
        """Trains an agent on an OpenAI Gym-like environment with deep Q-learning.

        Args:
            env: gym.Env where our agent resides
            sess: tf.Session used to run the computation graph
            learning_rate: Learning rate for the Q network optimizer
            seed: Random seed for reproducibility
            gamma: Discount factor
            max_eps: Starting exploration factor
            min_eps: Exploration factor to decay towards
            render: True to render the environment, else False
            print_freq: Displays logging information every 'print_freq' episodes
            load_path: (str) Path to load an existing model from
            save_path: (str) Path to save the model to during training
            batch_size: Number of transitions sampled per learning step
            log_dir: Directory for TensorBoard summaries
            max_steps: Maximum number of times to sample the environment
            buffer_capacity: How many (state, action, reward, next state, done)
                tuples the replay buffer should store
            max_episode_len: Maximum number of timesteps in an episode
            eps_decay_rate: Lambda parameter in the exponential decay for epsilon
            target_update_freq: Number of updates between target network syncs
        """
        np.random.seed(seed)
        self.sess = sess
        self.env = env
        self.input_dim = env.observation_space.shape[0]
        self.output_dim = env.action_space.n
        self.max_steps = max_steps
        self.max_eps = max_eps
        self.min_eps = min_eps
        self.eps_decay_rate = eps_decay_rate
        self.max_episode_len = max_episode_len
        self.render = render
        self.print_freq = print_freq
        self.rewards = []
        self.metrics = []
        self.save_path = save_path
        self.load_path = load_path
        self.batch_size = batch_size
        self.num_updates = 0
        self.gamma = gamma
        self.buffer = ReplayBuffer(
            capacity=max_steps // 2 if buffer_capacity is None else buffer_capacity)
        self.target_update_freq = target_update_freq
        self.learning_rate = learning_rate

        with tf.variable_scope('q_network'):
            self.q_network = QNetworkBuilder(self.input_dim, self.output_dim, (64,))
        with tf.variable_scope('target_network'):
            self.target_network = QNetworkBuilder(self.input_dim, self.output_dim, (64,))

        # Ops that copy the online Q network's weights into the target network
        self.update_target_network = [
            old.assign(new) for (new, old) in zip(
                tf.trainable_variables('q_network'),
                tf.trainable_variables('target_network'))
        ]

        if self.load_path is not None:
            self.load()
        self.add_summaries(log_dir)

    def add_summaries(self, log_dir):
        tf.summary.scalar('Loss', self.q_network.loss)
        tf.summary.scalar('Mean Estimated Value', tf.reduce_mean(self.q_network.output_pred))
        # Merge all the summaries and write them out to log_dir
        self.merged = tf.summary.merge_all()
        self.train_writer = tf.summary.FileWriter(log_dir, self.sess.graph)

    def learn(self):
        """Learns via Deep Q-Networks (DQN)."""
        obs = self.env.reset()
        mean_reward = None
        total_reward = 0
        ep = 0
        ep_len = 0
        rand_actions = 0
        for t in range(self.max_steps):
            # Epsilon decay from https://jaromiru.com/2016/10/03/lets-make-a-dqn-implementation/
            eps = self.min_eps + (self.max_eps - self.min_eps) * np.exp(self.eps_decay_rate * t)

            if self.render:
                self.env.render()

            # Take exploratory action with probability epsilon
            if np.random.uniform() < eps:
                action = self.env.action_space.sample()
                rand_actions += 1
            else:
                action = self.act(obs)

            # Execute action in emulator and observe reward and next state
            new_obs, reward, done, info = self.env.step(action)
            total_reward += reward

            # Store transition (s_t, a_t, r_t, s_t+1) in the replay buffer
            self.buffer.add((obs, action, reward, new_obs, done))

            # Perform learning step
            self.update()

            obs = new_obs
            ep_len += 1
            if done or ep_len >= self.max_episode_len:
                ep += 1
                ep_len = 0
                rand_actions = 0
                self.rewards.append(total_reward)
                total_reward = 0
                obs = self.env.reset()

                if ep % self.print_freq == 0 and ep > 0:
                    new_mean_reward = np.mean(self.rewards[-self.print_freq - 1:])
                    print(f"-------------------------------------------------------")
                    print(f"Mean {self.print_freq} Episode Reward: {new_mean_reward}")
                    print(f"Exploration fraction: {eps}")
                    print(f"Total Episodes: {ep}")
                    print(f"Total timesteps: {t}")
                    print(f"-------------------------------------------------------")

                    # Add reward summary
                    summary = tf.Summary()
                    summary.value.add(tag=f'Mean {self.print_freq} Episode Reward',
                                      simple_value=new_mean_reward)
                    summary.value.add(tag='Epsilon', simple_value=eps)
                    self.train_writer.add_summary(summary, self.num_updates)

                    # Model saving inspired by the OpenAI Baselines implementation
                    if (mean_reward is None or new_mean_reward >= mean_reward) \
                            and self.save_path is not None:
                        print(f"Saving model due to mean reward increase: {mean_reward} -> {new_mean_reward}")
                        print(f'Location: {self.save_path}')
                        self.save()
                        mean_reward = new_mean_reward

    def act(self, observation):
        """Takes an action given the observation.

        Args:
            observation: observation from the environment

        Returns:
            Integer index of the selected action
        """
        pred = self.sess.run(
            [self.q_network.output_pred],
            feed_dict={self.q_network.input_ph: np.reshape(observation, (1, self.input_dim))})
        return np.argmax(pred)

    def update(self):
        """Applies gradients to the Q network computed from a minibatch of self.batch_size."""
        if self.batch_size <= self.buffer.size():
            self.num_updates += 1

            # Sync the target network with the Q network's parameters
            if self.num_updates % self.target_update_freq == 0:
                self.sess.run(self.update_target_network)
                print('Updated Target Network')

            # Sample random minibatch of transitions from the replay buffer
            states, action, reward, next_states, done = self.buffer.sample(self.batch_size)

            # Calculate discounted predictions for the subsequent states using the target network
            next_state_pred = self.gamma * self.sess.run(
                self.target_network.output_pred,
                feed_dict={self.target_network.input_ph: next_states})

            # Adjust the targets for non-terminal states
            reward = reward.reshape(len(reward), 1)
            targets = reward
            loc = np.argwhere(done != True).flatten()
            if len(loc) > 0:
                max_q = np.amax(next_state_pred, axis=1)
                targets[loc] = np.add(
                    targets[loc],
                    max_q[loc].reshape(max_q[loc].shape[0], 1),
                    casting='unsafe')

            # Train the Q network on the minibatch
            _, loss = self.sess.run(
                [self.q_network.opt, self.q_network.loss],
                feed_dict={self.q_network.input_ph: states,
                           self.q_network.target_ph: targets.flatten(),
                           self.q_network.action_indices_ph: action})

    def save(self):
        """Saves the Q network."""
        self.q_network.saver.save(self.sess, self.save_path)

    def load(self):
        """Loads the Q network."""
        self.q_network.saver.restore(self.sess, self.save_path)

    def plot_rewards(self, path=None):
        """Plots rewards per episode.

        Args:
            path: Location to save the rewards plot. If None, the image is shown
                with plt.show()
        """
        plt.plot(self.rewards)
        plt.xlabel('Episode')
        plt.ylabel('Reward')
        if path is None:
            plt.show()
        else:
            plt.savefig(path)
        plt.close('all')
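# --- Usage sketch (illustrative, not part of the original file) ---
# Assumes TensorFlow 1.x, a Gym environment ('CartPole-v1' is only an example),
# and that ReplayBuffer and QNetworkBuilder are the helper classes this repo defines.
import gym
import tensorflow as tf

if __name__ == '__main__':
    env = gym.make('CartPole-v1')
    with tf.Session() as sess:
        trainer = TrainDQN(env, sess,
                           save_path='checkpoints/cartpole',
                           log_dir='logs/train',
                           max_steps=50000)
        sess.run(tf.global_variables_initializer())
        trainer.learn()
        trainer.plot_rewards('rewards.png')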
plot_count_per_actions += episode_count_per_actions
plot_episode_requested_agents += episode_episode_requested_agents
plot_episode_count_requested_agent += episode_episode_count_requested_agent
plot_episode_rewards.append(episode_reward)
episodes.append(episode)

episode_batch = episodes[0]
episodes.pop(0)
for episode in episodes:
    for key in episode_batch.keys():
        episode_batch[key] = np.concatenate(
            (episode_batch[key], episode[key]), axis=0)
buffer.store_episode(episode_batch)

for train_step in range(args.train_steps):
    mini_batch = buffer.sample(min(buffer.current_size, args.batch_size))
    agents.train(mini_batch, train_steps)
    train_steps += 1

figure, axes = plt.subplots(nrows=2, ncols=2)
plt.rcParams['lines.linewidth'] = 4

index1 = ["Action 0", "Action 1", "Action 2"]
axes[0, 0].bar(x=index1, height=plot_count_per_actions)
axes[0, 0].set_title('Cumulative count over action space')

index2 = [f'{i + 1} Agents' for i in range(N_AGENTS)]
axes[0, 1].bar(x=index2, height=plot_episode_count_requested_agent)
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Get targets from the (frozen) target network
        self.qnetwork_target.eval()
        with torch.no_grad():
            Q_targets_next = torch.max(self.qnetwork_target(next_states), dim=1, keepdim=True)[0]
            Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Get current Q estimates for the taken actions
        self.qnetwork_local.train()
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        # Clear gradients
        self.optimizer.zero_grad()
        # Backpropagate through the local network
        loss.backward()
        # Take one optimizer step
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
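# --- Usage sketch (illustrative, not part of the original file) ---
# A minimal epsilon-greedy training loop around Agent. 'LunarLander-v2' and the
# epsilon schedule are example choices; BUFFER_SIZE, BATCH_SIZE, GAMMA, TAU, LR,
# UPDATE_EVERY and device are assumed to be module-level constants, as referenced above.
import gym

def run_training(n_episodes=1000, max_t=1000, eps_start=1.0, eps_end=0.01, eps_decay=0.995):
    env = gym.make('LunarLander-v2')
    agent = Agent(state_size=env.observation_space.shape[0],
                  action_size=env.action_space.n,
                  seed=0)
    eps = eps_start
    scores = []
    for i_episode in range(1, n_episodes + 1):
        state = env.reset()
        score = 0
        for t in range(max_t):
            action = agent.act(state, eps)
            next_state, reward, done, _ = env.step(action)
            agent.step(state, action, reward, next_state, done)
            state = next_state
            score += reward
            if done:
                break
        scores.append(score)
        eps = max(eps_end, eps_decay * eps)  # decay exploration over episodes
    return scores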
class NeuralNetworkAgent(Agent):
    def __init__(self, api, network_class, sess, save_path, history_size=15,
                 restore_path=None, verbose=False, train=False, test=False):
        super(NeuralNetworkAgent, self).__init__(api, verbose=verbose)

        # Network
        self.network = network_class(sess, save_path, restore_path=restore_path,
                                     hist_size=history_size)
        self.replay_buffer = ReplayBuffer(max_size=2500)
        self.train = train
        self.history_size = history_size

        # Internal state
        self.launched = False
        self.placed_move = False
        self.ctr = 0
        self.restart_game = 1
        self.game_restarted = True
        self.show_board = False
        self.last_move = -2
        self.start_state = np.zeros((20, 10, 1))
        self.possible_moves = [-1, 0, 6, 7]
        self.training_begun = False if not test else True
        self.epsilon = 1. if not test else 0
        self.decay = 0.999
        self.test = test
        self.prev_states = [self.start_state] * self.history_size

    def _controller_listener(self):
        piece_id = self.api.peekCPU(0x0042)
        game_state = self.api.peekCPU(0x0048)
        if piece_id != 19 and game_state == 1:
            # Train once the replay buffer has enough transitions
            if self.train and self.replay_buffer.size() > 250 and not self.test:
                batch = self.replay_buffer.sample(batch_sz=250)
                self.network.train(batch)
                self.training_begun = True
                self.epsilon *= self.decay
                if self.epsilon < 0.010:
                    self.epsilon = 0.010

            if not self.placed_move:
                print '--------------'
                is_random = False
                move = None
                # Epsilon-greedy move selection
                if np.random.random() < self.epsilon or not self.training_begun:
                    move = np.random.choice(self.possible_moves)
                    is_random = True
                else:
                    tensor = np.dstack([self.grid] + self.prev_states)
                    pred = self.network.predict(tensor)[0]
                    move = self.possible_moves[pred]

                if self.restart_game > 0:
                    self.api.writeGamepad(0, 3, True)
                    self.restart_game -= 1
                    move = -2
                elif move >= 0:
                    self.api.writeGamepad(0, move, True)

                self.placed_move = True
                self.show_board = True

                if self.last_move != -2 and piece_id != 19:
                    print 'Random:', is_random
                    S = self.grid.copy()
                    self._update_board(self.api.peekCPU(0x0042))
                    board = self._simulate_piece_drop(self.api.peekCPU(0x0042))
                    n_empty = self._count_empty(self.grid)
                    n_holes = self._count_holes(self.grid)
                    height = self._count_height(board)
                    levelness = self._determine_levelness(board)
                    A = self.last_move
                    # Reward shaping: keep the stack low, avoid holes, stay level
                    if height <= 2:
                        R = 1000
                    else:
                        R = -200 * height
                    R += -20 * n_holes + 10 * levelness
                    SP = self.grid.copy()
                    self.prev_states.insert(0, S)
                    print np.dstack(self.prev_states).shape
                    self.replay_buffer.add(
                        np.dstack(self.prev_states),
                        self.possible_moves.index(A),
                        R,
                        np.dstack([SP] + self.prev_states[:self.history_size]))
                    self.prev_states = self.prev_states[:self.history_size]
                    print self.epsilon
                    self._print_transition(S, A, board, R)

                self.last_move = move
        else:
            self.placed_move = False

    def _frame_render_finished(self):
        """Renders the board and the current piece.

        TODO: do this lazily, so we aren't calling read too often O_o
        """
        # To make things easier, we're going to modify the next piece drop.
        # Always drop a certain type of block (currently square).
        self.api.writeCPU(0x00bf, 0x0a)
        piece_id = self.api.peekCPU(0x0042)
        game_state = self.api.peekCPU(0x0048)

        # Restart the game
        if piece_id == 19 and (game_state == 10 or game_state == 0):
            self.prev_states = [self.start_state] * self.history_size
            self.game_restarted = True
            self.restart_game = 1
            return

        # Probably a line clear... skip
        if piece_id == 19 and game_state != 1:
            return

    def _piece_update(self, access_type, address, value):
        """Can be used to control the piece being dropped."""
        if self.api.readCPU(0x0048) == 1:
            return 0x0a
        return value

    def agent_name(self):
        return 'NeuralNetworkAgent'
class DQN:
    def __init__(self,
                 env,
                 learning_rate=1e-3,
                 seed=1234,
                 gamma=0.99,
                 max_eps=1.0,
                 min_eps=0.1,
                 render=False,
                 print_freq=1,
                 load_path=None,
                 save_path=None,
                 batch_size=32,
                 log_dir='logs/train',
                 max_steps=100000,
                 buffer_capacity=None,
                 max_episode_len=None,
                 eps_decay_rate=-1e-4,
                 target_update_freq=1000,
                 ):
        tf.random.set_seed(seed)
        np.random.seed(seed)
        self.gamma = gamma
        self.render = render
        self.batch_size = batch_size
        self.print_freq = print_freq
        self.q_lr = learning_rate
        self.max_eps = max_eps
        self.min_eps = min_eps
        self.eps_decay_rate = eps_decay_rate
        self.buffer = ReplayBuffer(buffer_capacity)
        self.max_steps = max_steps
        self.target_update = target_update_freq
        self.model = QNetwork(env.action_space.n, name='q_network')
        self.target = QNetwork(env.action_space.n, name='target_network')
        self.optimizer = tf.keras.optimizers.Adam(learning_rate=self.q_lr)
        self.summary_writer = tf.summary.create_file_writer(log_dir)
        self.env = env
        self.max_episode_len = max_episode_len if max_episode_len else self.env.spec.max_episode_steps
        self.rewards = []
        self.save_path = save_path
        if load_path is not None:
            self.model.load_weights(load_path)

    def act(self, state):
        return np.argmax(self.model(state))

    @tf.function
    def train_step(self, states, indices, targets):
        """Performs a single step of gradient descent on the Q network.

        Args:
            states: numpy array of states with shape (batch size, state dim)
            indices: indices of the selected actions
            targets: targets for computing the MSE loss
        """
        with tf.GradientTape() as tape:
            action_values = tf.gather_nd(self.model(states), indices)
            mse_loss = tf.keras.losses.MeanSquaredError()(action_values, targets)
        gradients = tape.gradient(mse_loss, self.model.trainable_variables)
        self.optimizer.apply_gradients(zip(gradients, self.model.trainable_variables))

        # Log training information
        with self.summary_writer.as_default():
            tf.summary.scalar('MSE Loss', mse_loss, step=self.optimizer.iterations)
            tf.summary.scalar('Estimated Q Value', tf.reduce_mean(action_values),
                              step=self.optimizer.iterations)

    def update(self):
        """Computes the targets for the MSE loss and calls the tf.function for gradient descent."""
        if len(self.buffer) >= self.batch_size:
            # Sample random minibatch of N transitions
            states, actions, rewards, next_states, dones = self.buffer.sample(self.batch_size)

            # Adjust the targets for non-terminal states
            next_state_pred = self.target(next_states)
            targets = rewards + self.gamma * next_state_pred.numpy().max(axis=1) * (1 - dones)

            batch_range = tf.range(start=0, limit=actions.shape[0])
            indices = tf.stack((batch_range, actions), axis=1)

            # Update the Q network by minimizing the MSE loss
            self.train_step(states, indices, targets)

    def learn(self):
        """Learns via Deep Q-Networks (DQN)."""
        obs = self.env.reset()
        total_reward = 0
        ep = 0
        ep_len = 0
        rand_actions = 0
        mean_reward = None
        for t in range(self.max_steps):
            if t % self.target_update == 0:
                copy_weights(self.model.variables, self.target.variables)

            # Epsilon decay from https://jaromiru.com/2016/10/03/lets-make-a-dqn-implementation/
            eps = self.min_eps + (self.max_eps - self.min_eps) * np.exp(self.eps_decay_rate * t)

            if self.render:
                self.env.render()

            # Take exploratory action with probability epsilon
            if np.random.uniform() < eps:
                action = self.env.action_space.sample()
                rand_actions += 1
            else:
                action = self.act(np.expand_dims(obs, axis=0))

            # Execute action in emulator and observe reward and next state
            new_obs, reward, done, info = self.env.step(action)
            total_reward += reward

            # Store transition (s_t, a_t, r_t, s_t+1) in the replay buffer
            self.buffer.add((obs, action, reward, new_obs, done))

            # Perform learning step
            self.update()

            obs = new_obs
            ep_len += 1
            if done or ep_len >= self.max_episode_len:
                ep += 1
                self.rewards.append(total_reward)
                total_reward = 0
                obs = self.env.reset()

                if ep % self.print_freq == 0 and ep > 0:
                    new_mean_reward = np.mean(self.rewards[-self.print_freq - 1:])
                    print(f"-------------------------------------------------------")
                    print(f"Mean {self.print_freq} Episode Reward: {new_mean_reward}")
                    print(f"Random action fraction: {rand_actions / ep_len}")
                    print(f"Total Episodes: {ep}")
                    print(f"Total timesteps: {t}")
                    print(f"-------------------------------------------------------")

                    with self.summary_writer.as_default():
                        tf.summary.scalar(f'Mean {self.print_freq} Episode Reward',
                                          new_mean_reward, step=t)
                        tf.summary.scalar('Epsilon', eps, step=t)

                    # Model saving inspired by the OpenAI Baselines implementation
                    if (mean_reward is None or new_mean_reward >= mean_reward) \
                            and self.save_path is not None:
                        print(f"Saving model due to mean reward increase: {mean_reward} -> {new_mean_reward}")
                        print(f'Location: {self.save_path}')
                        mean_reward = new_mean_reward
                        self.model.save_weights(self.save_path)

                ep_len = 0
                rand_actions = 0
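# --- Assumed helper and usage sketch (illustrative, not part of the original file) ---
# copy_weights is referenced in learn() above but not shown here; a minimal version
# is a variable-by-variable assignment from the online network to the target network.
def copy_weights(source_variables, target_variables):
    """Copies each source variable's value into the corresponding target variable."""
    for source, target in zip(source_variables, target_variables):
        target.assign(source)


# Example wiring (assumes QNetwork and ReplayBuffer are the classes defined in this
# repo, and that 'CartPole-v1' is just an illustrative environment).
import gym

if __name__ == '__main__':
    env = gym.make('CartPole-v1')
    agent = DQN(env, buffer_capacity=50000,
                save_path='checkpoints/dqn',
                log_dir='logs/train')
    agent.learn()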
def train(conf, env, model, num_episodes=500, batch_size=100, buffer_size=10000):
    conf.buffer_size = buffer_size
    conf.batch_size = batch_size
    replay_buffer = ReplayBuffer(size=buffer_size)
    discount_rate = conf.discount_rate
    eps = conf.initial_eps
    decay_factor = conf.decay_factor

    for episode in range(num_episodes):
        print("Episode {}".format(episode))
        observation = env.reset()
        eps *= decay_factor
        done = False
        total_food = 0
        step = 0
        while not done:
            model_input = np.array([observation])
            prediction = model.predict(model_input)

            # Epsilon-greedy action selection
            if np.random.random() < eps:
                action = np.random.randint(0, 4)
                was_random = True
            else:
                action = np.argmax(prediction)
                was_random = False

            debugger.print_step_before_move(step, observation, prediction, action, was_random)
            debugger.render_env_until_key_press(env)

            new_observation, reward, done, _ = env.step(action)
            replay_buffer.add(observation, action, reward, new_observation, float(done))

            # Fit the model on a minibatch sampled from the replay buffer
            obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size)
            labels = model.predict(obses_t)
            targets = discount_rate * np.max(model.predict(obses_tp1), axis=1)
            for i in range(len(dones)):
                if dones[i]:
                    targets[i] = 0
                targets[i] += rewards[i]
                labels[i][actions[i]] = targets[i]
            model.fit(obses_t, labels, epochs=1, verbose=0)

            if reward > 0:
                total_food += 1
            step += 1
            observation = new_observation

        wandb.log({
            'episode': episode,
            'total_food': total_food,
            'eps': eps,
            'lifetime': step
        })
        print('Score: {}'.format(total_food))
        print()
    env.close()
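# --- Model sketch (illustrative, not part of the original file) ---
# A Keras model compatible with train() above, assuming a flat observation vector
# and four discrete actions (as implied by np.random.randint(0, 4)); the layer
# sizes are example choices.
from tensorflow import keras

def build_model(observation_dim, num_actions=4, learning_rate=1e-3):
    model = keras.Sequential([
        keras.layers.Dense(64, activation='relu', input_shape=(observation_dim,)),
        keras.layers.Dense(64, activation='relu'),
        keras.layers.Dense(num_actions, activation='linear'),  # one Q-value per action
    ])
    model.compile(optimizer=keras.optimizers.Adam(learning_rate=learning_rate), loss='mse')
    return model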