import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
from tensorflow.keras.models import load_model, model_from_json

# ExperienceReplay and DoubleDQN are assumed to be defined elsewhere in this project.


class DQNAgent:
    def __init__(self, env, net_update_rate: int = 25,
                 exploration_rate: float = 1.0, exploration_decay: float = 0.00005):
        # set hyper-parameters
        self.exploration_rate = exploration_rate
        self.exploration_decay = exploration_decay
        self.net_updating_rate = net_update_rate

        # set environment
        self.env = env
        self.state_shape = env.get_state_shape()
        self.action_shape = env.get_action_shape()

        # Experience Replay memory for batch learning
        self.exp_rep = ExperienceReplay()

        # Deep Q Network
        self.net = None

    def set_model(self, model):
        """Sets the model the agent will train. Receives a compiled tf Model with
        input_shape = env.observation_space and output_shape = env.action_space."""
        self.net = DoubleDQN(model)

    def get_action(self, state: np.ndarray, eps=0) -> int:
        """Given a state, returns a random action with probability eps and
        argmax(q_net(state)) with probability 1 - eps.
        (Only legal actions are considered.)"""
        if self.net is None:
            raise NotImplementedError(
                "agent.get_action was called before the model was initiated.\n"
                "Please set the agent's model using the set_model method. You can access "
                "the state and action shapes using the agent's methods get_state_shape "
                "and get_action_shape."
            )
        legal_actions = self.env.get_legal_actions(state)
        if np.random.random() >= eps:  # Exploitation
            # Calculate the Q-value of each action
            q_values = self.net.predict(state[np.newaxis, ...], np.expand_dims(legal_actions, 0))
            # Make sure we only choose between available actions
            legal_actions = np.logical_and(legal_actions, q_values == np.max(q_values))
        # Under exploration, all legal actions remain candidates
        return np.random.choice(np.flatnonzero(legal_actions))

    def update_net(self, batch_size: int):
        """If there are at least batch_size experiences in memory, optimizes the
        network's weights on a batch of them using the Double-Q-learning algorithm;
        otherwise returns without updating."""
        if self.exp_rep.get_num() < batch_size:
            return
        batch = self.exp_rep.get_batch(batch_size)
        self.net.fit(*batch)

    def train(self, episodes: int, path: str, checkpoint_rate=100, batch_size: int = 64,
              # decay epsilon toward the 0.01 floor (the exponent must be negative to decay)
              exp_decay_func=lambda exp_rate, exp_decay, i:
                  0.01 + (exp_rate - 0.01) * np.exp(-exp_decay * (i + 1)),
              show_progress=False):
        """Runs a training session for the agent.

        :param episodes: number of episodes to train.
        :param path: a path to a directory where the trained weights will be saved.
        :param batch_size: number of experiences to learn from in each net update.
        """
        if self.net is None:
            raise NotImplementedError(
                "agent.train was called before the model was initiated.\n"
                "Please set the agent's model using the set_model method. "
                "You can access the state and action shapes using the agent's "
                "methods get_state_shape and get_action_shape."
            )
        # set hyper-parameters
        exploration_rate = self.exploration_rate
        total_rewards = []
        # start training
        for episode in tqdm(range(episodes)):
            state = self.env.reset()  # Reset the environment for a new episode
            step, episode_reward = 0, 0
            run = True
            # Run until the maximum number of actions is reached or the episode has ended
            while run:
                step += 1
                # choose an action using epsilon-greedy exploration
                action = self.get_action(state, exploration_rate)
                # apply the chosen action to the environment and observe the next state and reward
                obs = self.env.step(action)
                next_state, reward, is_terminal = obs[:3]
                episode_reward += reward
                # Add the experience to memory
                self.exp_rep.add(state, action, reward, next_state,
                                 self.env.get_legal_actions(state), is_terminal)
                # Optimize the Double-Q net
                self.update_net(batch_size)
                if is_terminal:
                    # The action taken led to a terminal state
                    run = False
                if (step % self.net_updating_rate) == 0 and step > 0:
                    # update the target network
                    self.net.align_target_model()
                state = next_state
            # Update total_rewards to keep track of progress
            total_rewards.append(episode_reward)
            # Update the target network at the end of the episode
            self.net.align_target_model()
            # Update the exploration rate
            exploration_rate = exp_decay_func(exploration_rate, self.exploration_decay, episode)
            if episode % checkpoint_rate == 0 and self.exp_rep.get_num() > batch_size:
                self.save_weights(os.path.join(path, f'episode_{episode}_weights'))
                if show_progress:
                    # Plot a moving average of the last 10 episodes
                    self.plot_progress(total_rewards)
        # update the agent's exploration rate in case more training is needed
        self.exploration_rate = exploration_rate
        # save total_rewards as a csv file to the specified path
        with open(os.path.join(path, 'rewards.csv'), 'w') as reward_file:
            rewards = pd.DataFrame(total_rewards)
            rewards.to_csv(reward_file)
        self.save_weights(os.path.join(path, 'final_weights'))

    def plot_progress(self, total_rewards):
        # Moving average of the last 10 episode rewards
        w = np.ones(10) / 10
        moving_average = np.convolve(total_rewards, w, mode='valid')
        plt.plot(np.arange(len(moving_average)), moving_average)
        plt.title('Moving average of rewards across episodes')
        plt.xlabel('episodes')
        plt.ylabel('average reward over last 10 episodes')
        plt.show()

    def get_state_shape(self):
        return self.state_shape

    def get_action_shape(self):
        return self.action_shape

    # Handles saving/loading the model as explained here:
    # https://www.tensorflow.org/guide/keras/save_and_serialize
    def load_weights(self, path):
        self.net.load_weights(path)

    def save_weights(self, path):
        self.net.save_weights(path)

    def save_model(self, path):
        if self.net is None:
            raise NotImplementedError(
                "agent.save_model was called before the model was initiated.\n"
                "Please set the agent's model using the set_model method. You can access "
                "the state and action shapes using the agent's methods get_state_shape "
                "and get_action_shape."
            )
        self.net.save_model(path)

    def load_model(self, path):
        model = load_model(path)
        self.set_model(model)

    def to_json(self, **kwargs):
        if self.net is None:
            raise NotImplementedError(
                "agent.to_json was called before the model was initiated.\n"
                "Please set the agent's model using the set_model method. You can access "
                "the state and action shapes using the agent's methods get_state_shape "
                "and get_action_shape."
            )
        return self.net.to_json(**kwargs)

    def from_json(self, json_config):
        model = model_from_json(json_config)
        self.set_model(model)
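# A minimal usage sketch for DQNAgent (an assumption, not part of the original
# code): the env object and its get_state_shape / get_action_shape /
# get_legal_actions / reset / step methods are whatever the surrounding project
# provides, and GridEnv below is a hypothetical placeholder for such an
# environment. Only the wiring between env, Keras model, and agent is shown.

from tensorflow.keras import layers, models

env = GridEnv()  # hypothetical environment implementing the API DQNAgent uses
agent = DQNAgent(env)
n_actions = int(np.prod(agent.get_action_shape()))  # assumes a flat action space
model = models.Sequential([
    layers.Flatten(input_shape=agent.get_state_shape()),  # assumes a tuple state shape
    layers.Dense(64, activation='relu'),
    layers.Dense(n_actions, activation='linear'),
])
model.compile(optimizer='adam', loss='mse')
agent.set_model(model)  # wraps the compiled model in DoubleDQN
agent.train(episodes=500, path='./checkpoints', batch_size=64, show_progress=True)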
import os
import random
import time

import numpy
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential

# Game, POSSIBLE_ACTIONS, ACTION_TO_INDEX, and ExperienceReplay are assumed to be
# defined elsewhere in this project.


class Player:
    """This class represents a player and its strategy for learning and playing the game."""

    def __init__(self):
        # gamma is a parameter of the Q-learning algorithm
        self.gamma = 0.9
        # We use an epsilon-greedy learning strategy
        self.epsilon = 1
        self.epsilon_decay = 0.99
        self.epsilon_min = 0.01
        # Number of epochs (fully played games) used to train the agent
        self.epochs = 500
        # Game to play
        self.game = Game()
        # Number of hidden-layer nodes
        self.hidden_layer_nodes = 20
        # Create the keras model:
        # _________________________________________________________________
        # Layer (type)                 Output Shape              Param #
        # =================================================================
        # dense_1 (Dense)              (None, 20)                120
        # _________________________________________________________________
        # dense_2 (Dense)              (None, 20)                420
        # _________________________________________________________________
        # dense_3 (Dense)              (None, 5)                 105
        # =================================================================
        # Total params: 645
        # Trainable params: 645
        # Non-trainable params: 0
        # _________________________________________________________________
        self.model = Sequential()
        self.model.add(Dense(self.hidden_layer_nodes, input_dim=self.game.state_size, activation='relu'))
        self.model.add(Dense(self.hidden_layer_nodes, activation='relu'))
        self.model.add(Dense(len(POSSIBLE_ACTIONS), activation='linear'))
        self.model.compile('Adam', loss='mse')
        # Initialize experience replay
        self.experience_replay = ExperienceReplay(size=2000)
        self.batch_size = 20
        self.max_turns = 100

    def train_model_on_batch(self):
        batch = self.experience_replay.get_batch(self.batch_size)
        # ---------------------------------- #
        # TODO: move this logic to get_batch
        states = []
        actions = []
        rewards = []
        next_states = []
        not_is_overs = []
        for state, action, reward, next_state, is_over in batch:
            states.append(state)
            actions.append(action)
            rewards.append(reward)
            next_states.append(next_state)
            not_is_overs.append(not is_over)
        states = numpy.array(states)
        next_states = numpy.array(next_states)
        not_is_overs = numpy.array(not_is_overs)
        rewards = numpy.array(rewards)
        # ---------------------------------- #
        # Bellman targets: bootstrap from the next state only for non-terminal samples
        targets = rewards + not_is_overs * self.gamma * numpy.amax(self.model.predict(next_states), axis=1)
        target_fs = self.model.predict(states)
        for i in range(len(batch)):
            target_fs[i, ACTION_TO_INDEX[actions[i]]] = targets[i]
        self.model.fit(states, target_fs, verbose=0)

    def train(self, interactive=False):
        for epoch in range(self.epochs):
            self.game.create_agent()
            turns = 0
            while turns < self.max_turns:
                turns += 1
                if interactive:
                    os.system('clear')
                    self.game.show()
                    time.sleep(0.1)
                state = numpy.array(self.game.encode())
                if random.uniform(0, 1) < self.epsilon:
                    action = random.choice(POSSIBLE_ACTIONS)
                else:
                    index = numpy.argmax(self.model.predict(state[numpy.newaxis])[0])
                    action = POSSIBLE_ACTIONS[index]
                reward = self.game.act(action)
                next_state = numpy.array(self.game.encode())
                is_over = self.game.is_over()
                if is_over:
                    reward -= 10
                    self.experience_replay.remember(state, action, reward, next_state, is_over)
                    break
                if turns == self.max_turns:
                    reward += 10
                self.experience_replay.remember(state, action, reward, next_state, is_over)
                self.train_model_on_batch()
            # Epsilon decay technique
            if self.epsilon > self.epsilon_min:
                self.epsilon *= self.epsilon_decay
            print('Epoch: %i Total turns: %i' % (epoch, turns))
        print("Training finished!\n")

    def play(self, interactive=False):
        for _ in range(self.epochs):
            self.game.create_agent()
            while not self.game.is_over():
                if interactive:
                    os.system('clear')
                    self.game.show()
                    time.sleep(0.1)
                state = numpy.array(self.game.encode())[numpy.newaxis]
                index = numpy.argmax(self.model.predict(state)[0])
                action = POSSIBLE_ACTIONS[index]
                self.game.act(action)
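# Both agents above depend on an ExperienceReplay class that is not shown in this
# listing. A minimal ring-buffer sketch matching the interface Player calls
# (remember / get_batch) might look like the following; the deque-based storage
# and uniform random sampling are assumptions, not the original implementation.

import random
from collections import deque


class ExperienceReplay:
    """Fixed-size replay memory; the oldest transitions are evicted first."""

    def __init__(self, size=2000):
        self.memory = deque(maxlen=size)

    def remember(self, state, action, reward, next_state, is_over):
        # Store one transition tuple in the order Player unpacks it
        self.memory.append((state, action, reward, next_state, is_over))

    def get_batch(self, batch_size):
        # Sample uniformly at random; if fewer transitions are stored than
        # batch_size, return everything currently in memory
        return random.sample(list(self.memory), min(batch_size, len(self.memory)))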
# Collect environment data
s2, r, terminal = env.step(np.argmax(a))

# Add the data to the ExperienceReplay memory. Non-zero-reward transitions are
# always stored; zero-reward transitions are stored with small probability
# (~0.18%) so they do not flood the replay buffer.
if UPDATE_REPLAY:
    if np.abs(r) > 0.0:
        er.add_experience(s, a, r, terminal, s2)
    elif np.random.random() < 0.0018:
        er.add_experience(s, a, r, terminal, s2)

# Keep adding experience to the memory until there are at least
# minibatch-size samples
if er.size() > MINIBATCH_SIZE:
    s_batch, a_batch, r_batch, t_batch, s2_batch = er.get_batch(MINIBATCH_SIZE)

    # Calculate Q targets for s2 based on the target Q-value model; each state
    # is a (hero, balls) pair, split into the network's two inputs
    s2_batch1 = np.reshape([elt[0].ravel() for elt in s2_batch], [-1, hero_state_dim])
    s2_batch2 = np.reshape([elt[1] for elt in s2_batch],
                           [-1, balls_state_shape[0], balls_state_shape[1]])
    target_q = qvalue_network.max_qvalues(s2_batch1, s2_batch2)
    new_q = np.zeros((MINIBATCH_SIZE, 1))
    for k in range(MINIBATCH_SIZE):
        if t_batch[k]:
            # terminal transition: no bootstrapping from s2
            new_q[k, 0] = r_batch[k]
        else:
            new_q[k, 0] = r_batch[k] + GAMMA * target_q[k]

    # Update the Q-values given the Q targets
    s_batch1 = np.reshape([elt[0].ravel() for elt in s_batch], [-1, hero_state_dim])
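# The loop above implements the standard Q-learning target
#     new_q[k] = r[k]                        if s2[k] is terminal,
#     new_q[k] = r[k] + GAMMA * target_q[k]  otherwise,
# i.e. bootstrapping is disabled on terminal transitions. Assuming r_batch and
# target_q are 1-D float arrays and t_batch is boolean, the same targets can be
# computed without the Python loop:

r = np.asarray(r_batch, dtype=np.float32)
done = np.asarray(t_batch, dtype=bool)
q_next = np.asarray(target_q, dtype=np.float32).reshape(-1)
new_q = (r + GAMMA * q_next * ~done).reshape(-1, 1)  # terminal rows keep only the reward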