def test(strategy=dqn, log_file='train_params.json'):
    """Run `strategy` on the fixed test start states and log every game."""
    with open('test_params.json', 'r') as file:
        read_params = json.load(file)
    game_params = read_params['params']
    test_start_states = read_params['states']

    total_history = []
    total_scores = []
    env = PacmanGame(**game_params)
    for start_state in test_start_states:
        preprocess(start_state)
        episode_history = []
        env.reset()
        # Overwrite the freshly reset environment with the fixed test state.
        env.player = start_state['player']
        env.monsters = start_state['monsters']
        env.diamonds = start_state['diamonds']
        env.walls = start_state['walls']
        assert (len(env.monsters) == env.nmonsters
                and len(env.diamonds) == env.ndiamonds
                and len(env.walls) == env.nwalls)

        obs = env.get_obs()
        episode_history.append(obs)
        while not obs['end_game']:
            action = strategy(obs)
            obs = env.make_action(action)
            episode_history.append(obs)
        total_history.append(episode_history)
        total_scores.append(obs['total_score'])

    mean_score = np.mean(total_scores)
    with open(log_file, 'w') as file:
        json.dump(total_history, file)
    print("Your average score is {}, saved log to '{}'. "
          "Do not forget to upload it for submission!".format(mean_score, log_file))
    return mean_score
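
# A minimal usage sketch (not part of the original code): evaluate a trivial
# baseline before plugging in the trained DQN. `random_strategy` is
# hypothetical and assumes the observation dict exposes 'possible_actions',
# as PacmanGame observations appear to elsewhere in this code.
import random

def random_strategy(obs):
    return random.choice(obs['possible_actions'])

# random_baseline_score = test(strategy=random_strategy,
#                              log_file='random_baseline.json')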
# The call below was truncated upstream: write_graph/write_images are
# keyword arguments of keras.callbacks.TensorBoard, so a TensorBoard
# callback is assumed here (the log_dir value is a placeholder, not
# recovered from the original).
from keras.callbacks import TensorBoard
tensorboard = TensorBoard(log_dir='logs', write_graph=False, write_images=False)

if __name__ == '__main__':
    # counters:
    step = 0       # training step counter (= epoch counter)
    iteration = 0  # frames counter
    episodes = 0   # game episodes counter
    done = True    # indicator that env needs to be reset

    nb_actions = 10
    episode_scores = []  # collect total scores in this list and log it later

    env = PacmanGame(**game_params)
    obs = env.reset()
    while step < n_steps:
        if obs['end_game']:  # game over, restart it
            obs = env.reset()
            score = 0  # reset score for current episode
        state = get_observation(obs)
        # Online network evaluates what to do
        iteration += 1
        q_values = online_network.predict(state)[0]  # calculate q-values using online network
        # select epsilon (which linearly decreases over training steps):
        epsilon = max(eps_min,
                      eps_max - (eps_max - eps_min) * step / eps_decay_steps)
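        # ---- hedged continuation: the original loop body is cut off after the
        # epsilon computation. A standard epsilon-greedy step over the
        # variables defined above would look like this (0-based action
        # indexing is an assumption, not recovered source):
        if np.random.rand() < epsilon:
            action = np.random.randint(nb_actions)  # explore: random action
        else:
            action = int(np.argmax(q_values))       # exploit: greedy action
        obs = env.make_action(action)  # same env API as used in test()
        # (appending the transition to a replay memory and periodically
        # training the online network / syncing the target network would
        # typically follow before incrementing `step`)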
    # (the method header was lost upstream; a signature consistent with the
    # variables used in the body is assumed below)
    def learn(self, old_stateaction, reward, new_state, new_possible_actions):
        # Tabular Q-learning update:
        # Q(s, a) <- (1 - alpha) * Q(s, a) + alpha * (r + gamma * max_a' Q(s', a'))
        max_q = max(self.qmap[tuple(new_state) + (a,)]
                    for a in new_possible_actions)
        self.qmap[old_stateaction] = ((1 - self.alpha) * self.qmap[old_stateaction]
                                      + self.alpha * (reward + self.gamma * max_q))
        return

    def best_action(self, state, possible_actions):
        # Get the action with the highest Q-value estimate for a specific state
        a, q = max(((a, self.qmap[tuple(state) + (a,)]) for a in possible_actions),
                   key=lambda x: x[1])
        return a


input_shape = (len(get_state(env.reset())),)
nb_actions = len(action_to_dxdy)

online_network = create_dqn_model(input_shape, nb_actions)
online_network.compile(optimizer=Adam(), loss='mse')
target_network = clone_model(online_network)
target_network.set_weights(online_network.get_weights())

from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot
SVG(model_to_dot(online_network).create(prog='dot', format='svg'))

from keras.utils import plot_model
plot_model(online_network, to_file='online_network.png',
           show_shapes=True, show_layer_names=True)
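
# For reference, a minimal sketch of what `create_dqn_model` might look like.
# Its real definition lives elsewhere in this code; the two-hidden-layer MLP
# below is an assumption, not the original architecture, so it is given a
# distinct name.
from keras.models import Sequential
from keras.layers import Dense

def create_dqn_model_sketch(input_shape, nb_actions):
    model = Sequential([
        Dense(64, activation='relu', input_shape=input_shape),
        Dense(64, activation='relu'),
        Dense(nb_actions, activation='linear'),  # one Q-value per action
    ])
    return model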