from ple import PLE
from ple.games.flappybird import FlappyBird


class WrappedFlappyBird():
    def __init__(self):
        self.score_counter = 0
        self.game = FlappyBird()
        self.env = PLE(self.game, fps=30, display_screen=True)
        self.env.init()  # PLE must be initialized before the first act()

    def frame_step(self, action_vector):
        # One-hot action vector: index 0 = flap, index 1 = do nothing
        if action_vector[0] == 1:
            self.env.act(119)   # 119 is the 'w' key, which makes the bird flap
        elif action_vector[1] == 1:
            self.env.act(None)  # no-op: let the bird fall for one frame
        frame = self.env.getScreenRGB()
        reward = self.get_action_reward()
        game_over = self.game.game_over()
        if game_over:
            self.game.reset()
        return frame, reward, game_over

    def get_action_reward(self):
        # -1 on death, +1 when the score increases (pipe passed),
        # +0.1 for surviving one more frame
        if self.game.game_over():
            self.score_counter = 0
            return -1
        elif self.score_counter < self.game.getScore():
            self.score_counter = self.game.getScore()
            return 1
        else:
            return 0.1
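# A minimal usage sketch (an illustration, not part of the original wrapper):
# drive WrappedFlappyBird with the one-hot action vectors frame_step() expects,
# here with a placeholder random policy.
import numpy as np

env = WrappedFlappyBird()
for _ in range(1000):
    action_vector = np.zeros(2, dtype=int)
    action_vector[np.random.randint(2)] = 1  # index 0 = flap, index 1 = no-op
    frame, reward, game_over = env.frame_step(action_vector)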
import random

import numpy as np
from ple import PLE
from ple.games.flappybird import FlappyBird

jeu = FlappyBird()
p = PLE(jeu, fps=30, frame_skip=1, num_steps=1, force_fps=True, display_screen=True)
p.init()

i = 0
while True:
    p.reset_game()
    state = jeu.getGameState()
    state = np.array(list(state.values()))
    while not jeu.game_over():
        # Q-learning: the model (a neural network) is initialized beforehand
        qval = model.predict(state.reshape(1, len(state)), batch_size=batchSize)
        if random.random() < epsilon:
            # Exploration/exploitation strategy: explore with probability epsilon
            action = np.random.randint(0, 2)
        else:
            # Choose the best action from the Q(s, a) values
            action = np.argmax(qval[0])
        # Take the action, observe the reward and the new state S'
        # (action 1 -> key 119 = flap; action 0 -> key 0, which the game
        # ignores, i.e. a no-op)
        reward = p.act(119 * action)
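# The Q-learning loop above assumes `model`, `batchSize` and `epsilon` are
# defined beforehand. A hedged sketch of a Keras model matching the shapes it
# uses: 8 inputs (the values of FlappyBird's getGameState() dict), 2 Q-value
# outputs. The layer sizes and hyperparameters are assumptions, not the
# original author's setup.
from keras.models import Sequential
from keras.layers import Dense

batchSize = 32   # assumed mini-batch size
epsilon = 0.1    # assumed (fixed) exploration rate

model = Sequential([
    Dense(64, activation='relu', input_shape=(8,)),
    Dense(64, activation='relu'),
    Dense(2, activation='linear'),  # one Q-value per action (no-op, flap)
])
model.compile(optimizer='adam', loss='mse')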
from datetime import datetime

import numpy as np
import tensorflow as tf  # TensorFlow 1.x API
from ple import PLE
from ple.games.flappybird import FlappyBird


def DAgger(hidden_sizes=[32, 32], dagger_iterations=20, p_lr=1e-3,
           step_iterations=1000, batch_size=128, train_epochs=20,
           obs_dim=8, act_dim=2):
    tf.reset_default_graph()

    ############################## EXPERT ###############################
    # Load the expert and return a function that predicts the expert action
    # given a state
    expert_policy = expert()
    print('Expert performance: ', np.mean(test_agent(expert_policy)))

    #################### LEARNER COMPUTATIONAL GRAPH ####################
    obs_ph = tf.placeholder(shape=(None, obs_dim), dtype=tf.float32, name='obs')
    act_ph = tf.placeholder(shape=(None,), dtype=tf.int32, name='act')

    # Multi-layer perceptron
    p_logits = mlp(obs_ph, hidden_sizes, act_dim, tf.nn.relu, last_activation=None)
    act_max = tf.math.argmax(p_logits, axis=1)
    act_onehot = tf.one_hot(act_ph, depth=act_dim)

    # Softmax cross-entropy loss
    p_loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits_v2(labels=act_onehot,
                                                   logits=p_logits))
    # Adam optimizer
    p_opt = tf.train.AdamOptimizer(p_lr).minimize(p_loss)

    now = datetime.now()
    clock_time = "{}_{}.{}.{}".format(now.day, now.hour, now.minute, now.second)
    file_writer = tf.summary.FileWriter(
        'log_dir/FlappyBird/DAgger_' + clock_time, tf.get_default_graph())

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    def learner_policy(state):
        action = sess.run(act_max, feed_dict={obs_ph: [state]})
        return np.squeeze(action)

    X = []
    y = []

    env = FlappyBird()
    # env = PLE(env, fps=30, display_screen=False)
    env = PLE(env, fps=30, display_screen=True, force_fps=False)
    env.init()

    #################### DAgger iterations ####################
    for it in range(dagger_iterations):
        # The learner is retrained from scratch on the aggregated dataset
        sess.run(tf.global_variables_initializer())
        env.reset_game()
        no_op(env)

        game_rew = 0
        rewards = []

        ##################### Populate the dataset #####################
        for _ in range(step_iterations):
            # Get the current state from the environment
            state = flappy_game_state(env)

            # As the iterations continue, use more and more actions sampled
            # from the learner
            if np.random.rand() < (1 - it / 5):
                action = expert_policy(state)
            else:
                action = learner_policy(state)

            action = 119 if action == 1 else None

            # Repeat the action on two consecutive frames
            rew = env.act(action)
            rew += env.act(action)

            # Add the state and the expert action to the dataset
            X.append(state)
            y.append(expert_policy(state))

            game_rew += rew

            # Whenever the game stops, reset the environment and initialize
            # the variables
            if env.game_over():
                env.reset_game()
                no_op(env)
                rewards.append(game_rew)
                game_rew = 0

        ##################### Training #####################
        # Calculate the number of minibatches
        n_batches = int(np.floor(len(X) / batch_size))

        # Shuffle the dataset
        shuffle = np.arange(len(X))
        np.random.shuffle(shuffle)
        shuffled_X = np.array(X)[shuffle]
        shuffled_y = np.array(y)[shuffle]

        for _ in range(train_epochs):
            ep_loss = []
            # Train the model on each minibatch in the dataset
            for b in range(n_batches):
                p_start = b * batch_size
                # Mini-batch training
                tr_loss, _ = sess.run(
                    [p_loss, p_opt],
                    feed_dict={
                        obs_ph: shuffled_X[p_start:p_start + batch_size],
                        act_ph: shuffled_y[p_start:p_start + batch_size]
                    })
                ep_loss.append(tr_loss)

        agent_tests = test_agent(learner_policy, file_writer, step=len(X))
        print('Ep:', it, np.mean(ep_loss), 'Test:', np.mean(agent_tests))
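# DAgger() relies on helpers defined elsewhere: expert(), test_agent(), mlp(),
# flappy_game_state() and no_op(). Hedged sketches of the last three,
# consistent with how they are called above; the exact originals may differ.
import numpy as np
import tensorflow as tf

def mlp(x, hidden_layers, output_size, activation=tf.nn.relu, last_activation=None):
    # Stack of fully connected layers ending in `output_size` logits
    for units in hidden_layers:
        x = tf.layers.dense(x, units=units, activation=activation)
    return tf.layers.dense(x, units=output_size, activation=last_activation)

def flappy_game_state(env):
    # Flatten PLE's game-state dict into a fixed-order feature vector
    state = env.getGameState()
    return np.array(list(state.values()), dtype=np.float32)

def no_op(env, n=1):
    # Let a few frames pass without flapping so the game state settles
    for _ in range(n):
        env.act(None)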
    return np.sum(scores)  # tail of the evaluation function defined above


## Training loop:
total_games = 15000        # number of games played for training
evaluation_period = 1000   # evaluate the network quality every `evaluation_period` games
gamma = 0.99               # discount factor used in the reward update
step_game = 0              # counter of the number of games played

while step_game < total_games:
    p.reset_game()  # reset the game
    state = game.getGameState()
    state = process_state(state)
    rand_sum = 0      # random (exploration) actions this game
    greedy_sum = 0    # greedy (exploitation) actions this game
    tuyau_passe = 0   # pipes passed this game
    while not game.game_over():
        if np.random.random() < epsilon(step_game, total_games):
            # Exploration
            rand_sum = rand_sum + 1
            # action = random_action(state)
            action = np.random.choice([0, 1])
        else:
            # Exploitation: follow the neural network's output
            greedy_sum = greedy_sum + 1
            action = greedy_action(dqn, state, batchSize)
        # Result of the action:
        reward = p.act(list_actions[action])
        reward = training_reward(reward)
        if reward > 0:
            tuyau_passe += 1  # a pipe was passed
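# The training loop relies on helpers defined elsewhere in the project:
# epsilon(), training_reward(), process_state() and list_actions. Hedged
# sketches consistent with the call sites above; the decay schedule, the
# reward shaping and the action ordering are assumptions.
import numpy as np

list_actions = p.getActionSet()  # for FlappyBird: [119 (flap), None (no-op)]

def epsilon(step_game, total_games, eps_start=1.0, eps_end=0.01):
    # Linear decay of the exploration rate over the whole training run
    frac = min(step_game / total_games, 1.0)
    return eps_start + frac * (eps_end - eps_start)

def training_reward(reward):
    # Reshape PLE's raw reward: +1 per pipe, -1 on death, small living bonus
    if reward > 0:
        return 1.0
    if reward < 0:
        return -1.0
    return 0.1

def process_state(state):
    # Flatten PLE's game-state dict into a fixed-order feature vector
    return np.array(list(state.values()), dtype=np.float32)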
"""-----------------""" """ Deep Q-Learning """ """-----------------""" for id_game in range(total_games): if id_game % evaluation_period == 0: epoch += 1 scoreMC[epoch] = MCeval(dqn, 50, gamma) dqn.save(filename + str(epoch) + ".dqf") print(">>> Eval n°%d | score = %f" % (epoch, scoreMC[epoch])) p.reset_game() # Nouvelle partie state_x = process_state(game.getGameState()) id_frame = 0 score = 0 alea = 0 while not game.game_over(): id_frame += 1 step += 1 ## Choisit l'action à effectuer : 0 ou 1 if np.random.rand() < epsilon(step): # Action au hasard alea += 1 action = np.random.choice([0, 1]) else: # Meilleure action possible action = greedy_action(dqn, state_x) ## Joue l'action et observe le gain et l'état suivant reward = p.act(actions[action]) reward = clip_reward(reward) state_y = process_state(game.getGameState()) ## Mise à jour de Q QX = dqn.predict(np.array(state_x).reshape(1, len(state_x)), batch_size=batchSize)