Example #1
from ple import PLE
from ple.games.flappybird import FlappyBird


class WrappedFlappyBird:
    def __init__(self):
        self.score_counter = 0
        self.game = FlappyBird()
        self.env = PLE(self.game, fps=30, display_screen=True)
        self.env.init()  # PLE needs init() before the first act()

    def frame_step(self, action_vector):
        # 119 is the pygame keycode for 'w', the flap action in PLE's FlappyBird
        if action_vector[0] == 1:
            self.env.act(119)
        # 1 is not in the action set, so this step is effectively "do nothing"
        elif action_vector[1] == 1:
            self.env.act(1)

        frame = self.env.getScreenRGB()
        reward = self.get_action_reward()
        game_over = self.game.game_over()

        if game_over:
            self.game.reset()

        return frame, reward, game_over

    def get_action_reward(self):
        # -1 on death, +1 each time the score increases (a pipe is passed), 0.1 survival bonus otherwise
        if self.game.game_over():
            self.score_counter = 0
            return -1
        elif self.score_counter < self.game.getScore():
            self.score_counter = self.game.getScore()
            return 1
        else:
            return 0.1
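
A minimal driver for this wrapper (assuming the imports above; the one-hot convention, index 0 = flap and index 1 = do nothing, follows frame_step, and the episode length is arbitrary):

import numpy as np

env = WrappedFlappyBird()
for _ in range(100):
    # build a random one-hot action vector: [1, 0] = flap, [0, 1] = do nothing
    action = np.zeros(2)
    action[np.random.randint(2)] = 1
    frame, reward, done = env.frame_step(action)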
Example #2
import random

import numpy as np
from ple import PLE
from ple.games.flappybird import FlappyBird

# model (a Keras network), batchSize and epsilon are assumed to be defined elsewhere
jeu = FlappyBird()
p = PLE(jeu,
        fps=30,
        frame_skip=1,
        num_steps=1,
        force_fps=True,
        display_screen=True)
p.init()

i = 0

while True:
    p.reset_game()
    state = jeu.getGameState()
    state = np.array(list(state.values()))
    while not jeu.game_over():

        # predict Q(s, a) for the current state; `model` is the neural network built beforehand
        qval = model.predict(
            state.reshape(1, len(state)), batch_size=batchSize
        )
        if random.random() < epsilon:  # epsilon-greedy exploration/exploitation
            action = np.random.randint(0, 2)
        else:  # choose the best action from the Q(s, a) values
            action = int(np.argmax(qval[0][:2]))
        # Take the action, observe the reward and the new state S'
        # action 1 maps to key 119 (flap); action 0 gives 0, which FlappyBird ignores (no-op)
        reward = p.act(119 * action)
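
The p.act(119 * action) call works because 119 is the flap key and FlappyBird ignores a keycode of 0; PLE also exposes the legal actions through p.getActionSet(), so an equivalent, more explicit mapping (a sketch using the same p and action as above) is:

flap_key = 119  # pygame keycode for 'w', the only key FlappyBird listens to
reward = p.act(flap_key if action == 1 else None)  # None is PLE's no-op action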
Example #3
from datetime import datetime

import numpy as np
import tensorflow as tf
from ple import PLE
from ple.games.flappybird import FlappyBird

# expert, test_agent, mlp, no_op and flappy_game_state are helper functions defined elsewhere


def DAgger(hidden_sizes=[32, 32],
           dagger_iterations=20,
           p_lr=1e-3,
           step_iterations=1000,
           batch_size=128,
           train_epochs=20,
           obs_dim=8,
           act_dim=2):

    tf.reset_default_graph()

    ############################## EXPERT ###############################
    # load the expert and return a function that predicts the expert action given a state
    expert_policy = expert()
    print('Expert performance: ', np.mean(test_agent(expert_policy)))

    #################### LEARNER COMPUTATIONAL GRAPH ####################
    obs_ph = tf.placeholder(shape=(None, obs_dim),
                            dtype=tf.float32,
                            name='obs')
    act_ph = tf.placeholder(shape=(None, ), dtype=tf.int32, name='act')

    # Multi-layer perceptron
    p_logits = mlp(obs_ph,
                   hidden_sizes,
                   act_dim,
                   tf.nn.relu,
                   last_activation=None)

    act_max = tf.math.argmax(p_logits, axis=1)
    act_onehot = tf.one_hot(act_ph, depth=act_dim)

    # softmax cross entropy loss
    p_loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits_v2(labels=act_onehot,
                                                   logits=p_logits))
    # Adam optimizer
    p_opt = tf.train.AdamOptimizer(p_lr).minimize(p_loss)

    now = datetime.now()
    clock_time = "{}_{}.{}.{}".format(now.day, now.hour, now.minute,
                                      now.second)
    file_writer = tf.summary.FileWriter(
        'log_dir/FlappyBird/DAgger_' + clock_time, tf.get_default_graph())

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    def learner_policy(state):
        action = sess.run(act_max, feed_dict={obs_ph: [state]})
        return np.squeeze(action)

    X = []
    y = []

    env = FlappyBird()

    # env = PLE(env, fps=30, display_screen=False)
    env = PLE(env, fps=30, display_screen=True, force_fps=False)
    env.init()

    #################### DAgger iterations ####################

    for it in range(dagger_iterations):
        sess.run(tf.global_variables_initializer())
        env.reset_game()
        no_op(env)

        game_rew = 0
        rewards = []

        ###################### Populate the dataset #####################

        for _ in range(step_iterations):
            # get the current state from the environment
            state = flappy_game_state(env)

            # As the iterations continue, use more and more actions sampled from the learner
            if np.random.rand() < (1 - it / 5):
                action = expert_policy(state)
            else:
                action = learner_policy(state)

            action = 119 if action == 1 else None

            rew = env.act(action)
            rew += env.act(action)

            # Add the state and the expert action to the dataset
            X.append(state)
            y.append(expert_policy(state))

            game_rew += rew

            # Whenever the game stops, reset the environment and reinitialize the reward variables
            if env.game_over():
                env.reset_game()
                no_op(env)

                rewards.append(game_rew)
                game_rew = 0

        ##################### Training #####################

        # Calculate the number of minibatches
        n_batches = int(np.floor(len(X) / batch_size))

        # shuffle the dataset
        shuffle = np.arange(len(X))
        np.random.shuffle(shuffle)

        shuffled_X = np.array(X)[shuffle]
        shuffled_y = np.array(y)[shuffle]

        for _ in range(train_epochs):
            ep_loss = []
            # Train the model on each minibatch in the dataset
            for b in range(n_batches):
                p_start = b * batch_size

                # mini-batch training
                tr_loss, _ = sess.run(
                    [p_loss, p_opt],
                    feed_dict={
                        obs_ph: shuffled_X[p_start:p_start + batch_size],
                        act_ph: shuffled_y[p_start:p_start + batch_size]
                    })

                ep_loss.append(tr_loss)

        agent_tests = test_agent(learner_policy, file_writer, step=len(X))

        print('Ep:', it, np.mean(ep_loss), 'Test:', np.mean(agent_tests))
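
Note that the learner is re-initialized and retrained from scratch on the whole aggregated dataset (X, y) at every iteration, which is the usual DAgger recipe; with the defaults above a run reduces to:

if __name__ == '__main__':
    DAgger(dagger_iterations=20, step_iterations=1000)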
Example #4
    return np.sum(scores)


## Training loop:
total_games = 15000      # number of games played during training
evaluation_period = 1000 # evaluate the quality of the network every `evaluation_period` games
gamma = 0.99             # discount factor used for the reward update
step_game = 0            # counter of games played so far
while step_game < total_games:
    p.reset_game()       # reset the game
    state = game.getGameState()
    state = process_state(state)
    rand_sum = 0
    greedy_sum = 0
    tuyau_passe = 0      # number of pipes passed
    while not game.game_over():

        if np.random.random() < epsilon(step_game, total_games):
            # exploration
            rand_sum = rand_sum + 1
            #action = random_action(state)
            action = np.random.choice([0, 1])
        else:
            # exploitation: follow the output of the neural network
            greedy_sum = greedy_sum + 1
            action = greedy_action(dqn, state, batchSize)

        # result of the action:
        reward = p.act(list_actions[action])
        reward = training_reward(reward)
        if reward > 0:
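
The loop above calls an epsilon(step_game, total_games) schedule that is not part of the snippet; a minimal linearly decaying sketch (the start and end values 1.0 and 0.05 are assumptions) could be:

def epsilon(step_game, total_games, eps_start=1.0, eps_end=0.05):
    # linearly anneal the exploration rate from eps_start down to eps_end
    fraction = min(step_game / total_games, 1.0)
    return eps_start + fraction * (eps_end - eps_start)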
Example #5
"""-----------------"""
""" Deep Q-Learning """
"""-----------------"""

for id_game in range(total_games):
    if id_game % evaluation_period == 0:
        epoch += 1
        scoreMC[epoch] = MCeval(dqn, 50, gamma)
        dqn.save(filename + str(epoch) + ".dqf")
        print(">>> Eval n°%d | score = %f" % (epoch, scoreMC[epoch]))
    p.reset_game()  # Nouvelle partie
    state_x = process_state(game.getGameState())
    id_frame = 0
    score = 0
    alea = 0
    while not game.game_over():
        id_frame += 1
        step += 1
        ## Choisit l'action à effectuer : 0 ou 1
        if np.random.rand() < epsilon(step):  # Action au hasard
            alea += 1
            action = np.random.choice([0, 1])
        else:  # Meilleure action possible
            action = greedy_action(dqn, state_x)
        ## Joue l'action et observe le gain et l'état suivant
        reward = p.act(actions[action])
        reward = clip_reward(reward)
        state_y = process_state(game.getGameState())
        ## Mise à jour de Q
        QX = dqn.predict(np.array(state_x).reshape(1, len(state_x)),
                         batch_size=batchSize)
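
The snippet stops in the middle of the Q update; the Bellman target that a DQN-style loop like this one typically builds next is sketched below (assuming dqn is the Keras network used above):

# Q-values of the next state, used to build the Bellman target for the chosen action
QY = dqn.predict(np.array(state_y).reshape(1, len(state_y)), batch_size=batchSize)
target = QX.copy()
if game.game_over():
    target[0][action] = reward                       # terminal state: no bootstrap
else:
    target[0][action] = reward + gamma * np.max(QY)  # r + gamma * max_a' Q(s', a')
dqn.fit(np.array(state_x).reshape(1, len(state_x)), target,
        batch_size=batchSize, epochs=1, verbose=0)
state_x = state_y  # move on to the next state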