Example No. 1
def evaluate(agent):
    env = PLE(game, fps=30, display_screen=True)
    actionset = env.getActionSet()
    eval_reward = []
    for i in range(5):
        env.init()
        env.reset_game()
        obs = list(env.getGameState().values())
        episode_reward = 0
        while True:
            action = agent.predict(obs)
            observation = env.getScreenRGB()
            score = env.score()
            #action = agent.pickAction(reward, observation)
            observation = cv2.transpose(observation)
            font = cv2.FONT_HERSHEY_SIMPLEX
            observation = cv2.putText(observation, str(int(score)), (0, 25),
                                      font, 1.2, (255, 255, 255), 2)
            cv2.imshow("ss", observation)
            cv2.waitKey(10)  # predict the action; only the optimal action is chosen
            reward = env.act(actionset[action])
            obs = list(env.getGameState().values())
            done = env.game_over()
            episode_reward += reward
            if done:
                break
        eval_reward.append(episode_reward)
        cv2.destroyAllWindows()
    return np.mean(eval_reward)
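The helper above assumes that game, agent, cv2, numpy and PLE are provided by the surrounding script; a minimal setup sketch under those assumptions (FlappyBird and the RandomAgent stand-in below are hypothetical, not part of the original code) might be:

import cv2
import numpy as np
from ple import PLE
from ple.games.flappybird import FlappyBird

game = FlappyBird()  # hypothetical game choice; any PLE game with a dict state works


class RandomAgent:
    """Stand-in for a trained policy: picks a random action index."""
    def __init__(self, n_actions):
        self.n_actions = n_actions

    def predict(self, obs):
        return np.random.randint(self.n_actions)


print(evaluate(RandomAgent(n_actions=2)))  # FlappyBird exposes two actions (flap / no-op)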
Example No. 2
    p.init()

    #get the current state values (state array)
    game_current_state = agent.get_current_state(game.getGameState())
    #initializing the episode counter to 0
    number_of_episodes = 0
    #initializing the maximum score variable to 0
    maximum_score = 0

    #creating a while loop to iterate through the episodes
    while True:
        #get the optimal action for the current state and store it in a variable
        maximum_action = agent.get_action(game_current_state)
        #get the score in the current episode
        current_score = p.score()
        #get the maximum score by comparing with the current score
        maximum_score = max(current_score, maximum_score)
        #get the reward value by performing the above action (reward is either 1 or -1000)
        reward = agent.perform_action(p, maximum_action)
        #get the next state values (state array)
        game_next_state = agent.get_current_state(game.getGameState())

        #update the Q values by calling the update Q function
        agent.update_Q_values(game_current_state, game_next_state, reward,
                              maximum_action)

        #set the next state as current state
        game_current_state = game_next_state

        time.sleep(0.01)
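The update_Q_values call is implemented elsewhere in this project; assuming a plain tabular agent that keeps a dict self.Q keyed by (state, action), a list self.actions, and states discretized into hashable tuples, the update would typically be the standard Q-learning rule (hypothetical sketch; alpha and gamma are assumed hyperparameters):

def update_Q_values(self, current_state, next_state, reward, action,
                    alpha=0.1, gamma=0.9):
    # Q(s, a) <- Q(s, a) + alpha * (r + gamma * max_a' Q(s', a') - Q(s, a))
    old_q = self.Q.get((current_state, action), 0.0)
    best_next = max(self.Q.get((next_state, a), 0.0) for a in self.actions)
    self.Q[(current_state, action)] = old_q + alpha * (reward + gamma * best_next - old_q)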
Example No. 3
def play(size_image):
    sess = tf.InteractiveSession()

    img_size = 80
    net = NetworkOld(img_size)

    # open up a game state to communicate with emulator
    game = flappybird.prepare_game()
    p = PLE(game, fps=30, display_screen=True)
    p.init()
    reward = 0.0

    # get the first state by doing nothing and preprocess the image to 80x80x4

    actions = p.getActionSet()
    p.act(actions[1])

    s_t = preprocessing.transform_image(p.getScreenRGB(), img_size)

    saver = tf.train.Saver()
    sess.run(tf.global_variables_initializer())
    checkpoint = tf.train.get_checkpoint_state("../saved_networks")
    if checkpoint and checkpoint.model_checkpoint_path:
        saver.restore(sess, checkpoint.model_checkpoint_path)
        print("Successfully loaded:", checkpoint.model_checkpoint_path)
    else:
        print("Could not find old network weights")

    # start the play loop
    t = 0
    while t < MAX_ITE:
        if p.game_over():
            p.reset_game()
            terminal = True
        else:
            terminal = False

        # choose the greedy action (argmax of the network's Q-values)
        readout_t = net.readout.eval(feed_dict={net.s: [s_t]})[0]
        a_t = np.zeros([ACTIONS])

        action_index = np.argmax(readout_t)
        a_t[action_index] = 1

        # run the selected action and observe next state and reward
        action = int(np.argmax(a_t))
        if action == 0:
            action = 1
        else:
            action = 0
        r_t = p.act(actions[action])

        s_t1 = preprocessing.transform_image_stacked(p.getScreenRGB(), s_t,
                                                     img_size)

        # update the old values
        s_t = s_t1
        t += 1

        print("TIMESTEP", t, "/ ACTION", action_index, "/ REWARD", r_t,
              "/ Q_MAX %e" % np.max(readout_t), " / SCORE", p.score())
Example No. 4
epochs = 10000000
game_duration = 1000

rewards = []
avg_rewards = []
epsilons = []
steps = []
step = 0
plt.ion()
for epoch in range(epochs):
    p.reset_game()

    for it in range(game_duration):
        if p.game_over():
            p.reset_game()
            print "Score:" + str(p.score())

        current_state = game.getGameState()
        processed_current_state = process_state(current_state)

        action = agent.act(processed_current_state)
        reward = p.act(actions[action])
        rewards.append(reward)

        next_state = game.getGameState()
        game_over = p.game_over()

        processed_next_state = process_state(next_state)

        agent.remember(processed_current_state, action, reward,
                       processed_next_state, game_over)
Example No. 5
import numpy as np
from ple import PLE
from ple.games.waterworld import WaterWorld


# let's adjust the rewards our agent receives
rewards = {
    "tick": -0.01,  # each time the game steps forward in time the agent gets -0.01
    "positive": 1.0,  # each time the agent collects a green circle
    "negative": -5.0,  # each time the agent bumps into a red circle
}

# make a PLE instance.
# use lower fps so we can see what's happening a little easier
game = WaterWorld(width=256, height=256, num_creeps=8)
p = PLE(game, fps=15, force_fps=False, display_screen=True,
        reward_values=rewards)
# we pass in the rewards and PLE will adjust the game for us

p.init()
actions = p.getActionSet()
for i in range(1000):
    if p.game_over():
        p.reset_game()

    action = actions[np.random.randint(0, len(actions))]  # random actions
    reward = p.act(action)

    print "Score: {:0.3f} | Reward: {:0.3f} ".format(p.score(), reward)
Example No. 6
def q_learning(file_name=None,
               plot=False,
               gap_division=3,
               gamma=0.75,
               epsilon=0.9,
               batch_size=128,
               reward_weight_decision=True,
               buffer_size=5000):
    os.putenv('SDL_VIDEODRIVER', 'fbcon')
    os.environ["SDL_VIDEODRIVER"] = "dummy"

    game = FlappyBird(width=game_width,
                      height=game_height,
                      pipe_gap=game_pipe_gap)

    p = PLE(game, frame_skip=6)
    p.init()

    last_state = None
    last_action = 0
    last_actions_q_values = [0, 0]
    last_score = 0

    buffer = []
    episode = 0

    network = Network(batch_size, gamma, epsilon, gap_division)
    if file_name is not None:
        network.load(file_name, rename=True)
    else:
        leaky_option_hidden_layers, leaky_option_last_layer = False, False
        activation_hidden_layers = input(
            "Enter the activation function for the hidden layers (leave empty for default activation (relu)) \n"
        )
        activation_hidden_layers = "relu" if activation_hidden_layers == "" else activation_hidden_layers
        if activation_hidden_layers == "leaky relu":
            alpha_relu = input(
                "Enter alpha value for relu activation (0.3 by default)\n")
            if alpha_relu == "0.3" or alpha_relu == "":
                activation_hidden_layers = LeakyReLU(alpha=0.3)
            else:
                activation_hidden_layers = LeakyReLU(alpha=float(alpha_relu))

            leaky_option_hidden_layers = True

        activation_last_layer = input(
            "Enter the activation function for the last layer (leave empty for default activation (linear)) \n"
        )
        activation_last_layer = "linear" if activation_last_layer == "" else activation_last_layer
        if activation_last_layer == "leaky relu":
            alpha_relu = input(
                "Enter alpha value for relu activation (0.3 by default)\n")
            if alpha_relu == "0.3" or alpha_relu == "":
                activation_last_layer = LeakyReLU(alpha=0.3)
            else:
                activation_last_layer = LeakyReLU(alpha=float(alpha_relu))
            leaky_option_last_layer = True

        weight_initializer = input(
            "Enter weight initializer (leave empty for default value (glorot_uniform)) \n"
        )
        weight_initializer = "glorot_uniform" if weight_initializer == "" else weight_initializer

        bias_initializer = input(
            "Enter bias initializer (leave empty for default value (glorot_uniform)) \n"
        )
        bias_initializer = "glorot_uniform" if bias_initializer == "" else bias_initializer

        loss_func = input(
            "Enter loss function (leave empty for default value (binary_crossentropy)) \n"
        )
        loss_func = "binary_crossentropy" if loss_func == "" else loss_func

        optimizer = input(
            "Enter the optimizer for neural network (leave empty for default value (Adadelta)) or (Adadelta/RMSprop/SGD/Nadam) \n"
        )
        optimizer = "Adadelta" if optimizer == "" else optimizer

        optimizer_parameters = set_optimizer_parameters(optimizer)

        network.create_layers(
            activation_hidden_layers=activation_hidden_layers,
            activation_last_layer=activation_last_layer,
            weight_initializer=weight_initializer,
            bias_initializer=bias_initializer,
            loss_function=loss_func,
            optimizer=optimizer,
            optimizer_parameters=optimizer_parameters,
            leaky_hidden_layers=leaky_option_hidden_layers,
            leaky_last_layer=leaky_option_last_layer)

    while 1:
        if p.game_over():
            # restart the game
            p.reset_game()
            # count episodes
            episode += 1
            if episode % 1000 == 0:
                network.save_file()

            # update plot
            print(
                f'\n episode={episode}, epsilon={epsilon}, buffer_size={len(buffer)}, score={last_score}'
            )
            if plot is True:
                plt.scatter(episode, last_score)
                plt.pause(0.001)
                print(f'\n episode={episode}, score={last_score}')

            # adding the last entry correctly
            label = last_actions_q_values
            label[last_action] = -1000
            if len(buffer) < buffer_size:
                buffer += [(last_state, label)]
            else:
                buffer = buffer[1:] + [(last_state, label)]

            # reset all
            last_state = None
            last_action = 0
            last_actions_q_values = [0, 0]
            last_score = 0

        # look at the current state
        current_state = p.getGameState()
        current_score = p.score()

        # compute the actions' Q values
        actions_q_values = network.Q(current_state).tolist()

        # Compute the label for the last_state
        reward = get_reward(state=current_state,
                            gap_division=gap_division,
                            reward_weight_decision=reward_weight_decision)
        max_q = max(actions_q_values)

        label = last_actions_q_values
        if current_score - last_score > 0:
            label[last_action] = (current_score - last_score) * 1000
        else:
            label[last_action] = reward + gamma * max_q

        # not taking the first state into consideration
        if last_state is not None:
            # Update buffers
            if len(buffer) < buffer_size:
                buffer += [(last_state, label)]
            else:
                buffer = buffer[1:] + [(last_state, label)]

        # train
        if len(buffer) >= batch_size:
            sample = random.sample(buffer, batch_size)
            network.train(sample)

        # choose the optimal action with a chance of 1 - epsilon
        actions_indexes = np.arange(len(actions_q_values))

        optimal_action_to_take = np.argmax(actions_q_values)
        random_action = np.random.choice(actions_indexes)

        if np.random.uniform() < epsilon:
            action = random_action
        else:
            action = optimal_action_to_take

        # act accordingly
        p.act(None if action == 0 else 119)

        # update epsilon
        if epsilon > 0.1:
            epsilon = epsilon - 0.00000075

        # remember everything needed from the current state
        last_action = action
        last_state = current_state
        last_actions_q_values = actions_q_values
        last_score = current_score

        # Log
        sys.stdout.write(
            f'\rBottom: {game_height - current_state["next_pipe_bottom_y"]}, Top: {game_height - current_state["next_pipe_top_y"]}, Bird: {game_height - current_state["player_y"]}, Reward: {reward}'
        )
        sys.stdout.flush()
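Network, get_reward, set_optimizer_parameters and the game_width/game_height/game_pipe_gap constants come from the rest of this module; assuming those are in place, a typical call to the trainer defined above might be:

if __name__ == "__main__":
    # hypothetical entry point: file_name=None triggers the interactive network setup above
    q_learning(file_name=None, plot=False, gamma=0.75, epsilon=0.9, batch_size=128)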
Example No. 7
if __name__ == '__main__':
    reward = 0
    steps = 1000
    epoch = 0
    limit = 100
    la = LearningAgent(list(game.getActions()))
    la.brain.load()
    scores = []

    i = 0
    while epoch <= limit:

        # We want to train
        i += 1
        state = list(p.getGameState().values())
        reward = p.score()
        #print(reward)
        action = la.brain.update(reward, state)
        la.pickAction(action)
        if i > steps:
            print(epoch)
            epoch += 1
            la.brain.save()
            scores.append(la.brain.score())
            plt.plot(scores)
            plt.savefig("RewardGraph.png")
            i = 0
    la.brain.save()
    plt.show()
Example No. 8
def evaluate(agent1, agent2, agent3):
    input("Start the match")
    fourcc = cv2.VideoWriter_fourcc('m', 'p', '4', 'v')

    frame_number = 0
    env = PLE(game, fps=30, display_screen=True)
    actionset = env.getActionSet()
    eval_reward = []

    for i in range(5):
        output_movie = cv2.VideoWriter(videoname + '_' + str(i) + '.mp4',
                                       fourcc, 20, (288, 512))
        env.init()
        env.reset_game()
        dstate = env.getGameState()
        # print(dstate)
        obs = list(dstate.values())

        last_obs = np.zeros_like(obs[0:8])
        episode_reward = 0
        while True:
            obs1 = obs[0:8]
            obs2 = obs[8:16]
            obs3 = obs[16:24]
            action1 = agent1.predict(obs1)
            action2 = agent2.predict(obs2)
            action3 = agent3.predict(last_obs, obs3)

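            # combine the three predictions into a bitmask: agent1 sets bit 0, agent2 bit 1, agent3 bit 2 (a bit is set when that agent predicts 0); 0 maps to None, i.e. no-op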
            finalaction = 0
            if action1 == 0:
                finalaction += 1
            if action2 == 0:
                finalaction += 2
            if action3 == 0:
                finalaction += 4
            # print("action1: ", action1)
            # print("action2: ", action2)
            # print("action3: ", action3)
            # print("action: ", finalaction)
            # print(obs)
            # print(obs1)
            # print(obs2)
            # print(obs3)
            if finalaction == 0:
                finalaction = None
            score = env.score()

            observation = env.getScreenRGB()
            observation = cv2.transpose(observation)
            font = cv2.FONT_HERSHEY_SIMPLEX
            observation = cv2.putText(observation, str(int(score)), (0, 25),
                                      font, 1.2, (255, 255, 255), 2)
            ss = observation.shape
            observation = cv2.resize(observation, (ss[1] * 2, ss[0] * 2))
            output_movie.write(observation)
            cv2.imshow("ss", observation)
            cv2.waitKey(30)  # predict the action; only the optimal action is chosen

            reward = env.act(finalaction)
            last_obs = obs3
            dstate = env.getGameState()
            # print(dstate)
            obs = list(dstate.values())
            done = env.game_over()
            episode_reward += reward
            if done:
                break
            # input()
        eval_reward.append(episode_reward)
        cv2.destroyAllWindows()
        output_movie.release()
        input()
    return np.mean(eval_reward)
Example No. 9
class Bot():
    """
            This is our Test agent. It's gonna pick some actions after training!
    """
    def __init__(self, lr):

        self.lr = lr
        self.game = Pixelcopter(width=480, height=480)
        self.p = PLE(self.game, fps=60, display_screen=True)
        self.actions = self.p.getActionSet()

    #def pickAction(self, reward, obs):
    #   return random.choice(self.actions)

    def frame_step(self, act_inp):
        terminal = False
        reward = self.p.act(act_inp)
        if self.p.game_over():
            self.p.reset_game()
            terminal = True
            reward = -1
        else:
            reward = 1

        self.score = self.p.score()
        img = self.p.getScreenGrayscale()
        img = transform.resize(img, (80, 80))
        img = exposure.rescale_intensity(img, out_range=(0, 255))
        img = img / 255.0

        return img, reward, terminal

    def build_model(self):
        print("Building the model..")
        model = Sequential()
        model.add(
            Convolution2D(32,
                          8,
                          8,
                          subsample=(4, 4),
                          border_mode='same',
                          input_shape=(img_rows, img_cols,
                                       img_channels)))  #80*80*4
        model.add(Activation('relu'))
        model.add(Convolution2D(64, 4, 4, subsample=(2, 2),
                                border_mode='same'))
        model.add(Activation('relu'))
        model.add(Convolution2D(64, 3, 3, subsample=(1, 1),
                                border_mode='same'))
        model.add(Activation('relu'))
        model.add(Flatten())
        model.add(Dense(512))
        model.add(Activation('relu'))
        model.add(Dense(2))

        adam = Adam(lr=self.lr)
        model.compile(loss='mse', optimizer=adam)
        self.model = model
        print("Finished building the model..")

    def trainNetwork(self, mode):
        D = deque()

        x_t, r_0, terminal = self.frame_step(self.actions[1])

        s_t = np.stack((x_t, x_t, x_t, x_t), axis=2)
        #print (s_t.shape)

        #need to reshape for keras
        s_t = s_t.reshape(1, s_t.shape[0], s_t.shape[1],
                          s_t.shape[2])  #1*80*80*4

        if mode == 'Run':
            OBSERVE = 999999999  # we keep observing, never train
            epsilon = FINAL_EPSILON
            print("Now we load weight")
            self.model.load_weights("model.h5")
            adam = Adam(lr=self.lr)
            self.model.compile(loss='mse', optimizer=adam)
            print("Weight load successfully")
        else:  #We go to training mode
            OBSERVE = OBSERVATION
            epsilon = INITIAL_EPSILON

        t = 0
        while (True):
            loss = 0
            Q_sa = 0
            action_index = 0
            r_t = 0
            #choose an action epsilon greedy
            if t % FRAME_PER_ACTION == 0:
                if random.random() <= epsilon:
                    print("----------Random Action----------")
                    action_index = random.randrange(num_actions)
                    chosen_act = self.actions[action_index]
                else:
                    q = self.model.predict(
                        s_t)  #input a stack of 4 images, get the prediction
                    max_Q = np.argmax(q)
                    action_index = max_Q
                    chosen_act = self.actions[action_index]

            # reduce epsilon gradually
            if epsilon > FINAL_EPSILON and t > OBSERVE:
                epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE

            # run the selected action and observe the next state and reward
            x_t1, r_t, terminal = self.frame_step(chosen_act)

            x_t1 = x_t1.reshape(1, x_t1.shape[0], x_t1.shape[1], 1)  #1x80x80x1
            s_t1 = np.append(x_t1, s_t[:, :, :, :3], axis=3)

            # store the transition in D
            D.append((s_t, action_index, r_t, s_t1, terminal))
            if len(D) > REPLAY_MEMORY:
                D.popleft()

            #only train if done observing
            if t > OBSERVE:
                #sample a minibatch to train on
                minibatch = random.sample(D, BATCH)

                #Now we do the experience replay
                state_t, action_t, reward_t, state_t1, terminal = zip(
                    *minibatch)
                state_t = np.concatenate(state_t)
                state_t1 = np.concatenate(state_t1)
                targets = self.model.predict(state_t)
                Q_sa = self.model.predict(state_t1)
                targets[range(BATCH), action_t] = reward_t + GAMMA * np.max(
                    Q_sa, axis=1) * np.invert(terminal)

                loss += self.model.train_on_batch(state_t, targets)

            s_t = s_t1
            t = t + 1

            # save progress every 1000 iterations
            if t % 1000 == 0:
                print("Now we save model")
                self.model.save_weights("model.h5", overwrite=True)
                with open("model.json", "w") as outfile:
                    json.dump(self.model.to_json(), outfile)

            # print info
            state = ""
            if t <= OBSERVE:
                state = "observe"
            elif t > OBSERVE and t <= OBSERVE + EXPLORE:
                state = "explore"
            else:
                state = "train"

            print("TIMESTEP", t, "/ STATE", state, \
                "/ EPSILON", epsilon, "/ ACTION", action_index, "/ REWARD", r_t, \
                "/ Q_MAX " , np.max(Q_sa), "/ Loss ", loss)

        print("Episode finished!")
        print("************************")

    def playGame(self, mode):
        self.build_model()
        self.trainNetwork(mode)

    def main(self):
        modes = ["Train", "Run"]
        mode = modes[int(input("Do you wanna Train(0) or Run(1): "))]
        self.playGame(mode)
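The module-level constants used by trainNetwork (OBSERVATION, EXPLORE, the epsilon bounds, replay settings, image shape, and so on) are defined elsewhere; typical values for this kind of DQN setup, given purely as illustrative assumptions, would be:

num_actions = 2             # flap / no-op
FRAME_PER_ACTION = 1
GAMMA = 0.99                # discount factor
OBSERVATION = 3200          # steps of pure observation before training starts
EXPLORE = 3000000           # steps over which epsilon is annealed
INITIAL_EPSILON = 0.1
FINAL_EPSILON = 0.0001
REPLAY_MEMORY = 50000       # maximum number of transitions kept in D
BATCH = 32                  # minibatch size
img_rows, img_cols, img_channels = 80, 80, 4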
Example No. 10
import numpy as np
from ple import PLE
from ple.games.waterworld import WaterWorld

# let's adjust the rewards our agent receives
rewards = {
    "tick": -0.01,  # each time the game steps forward in time the agent gets -0.01
    "positive": 1.0,  # each time the agent collects a green circle
    "negative": -5.0,  # each time the agent bumps into a red circle
}

# make a PLE instance.
# use lower fps so we can see what's happening a little easier
game = WaterWorld(width=256, height=256, num_creeps=8)
p = PLE(game,
        fps=15,
        force_fps=False,
        display_screen=True,
        reward_values=rewards)
# we pass in the rewards and PLE will adjust the game for us

p.init()
actions = p.getActionSet()
for i in range(1000):
    if p.game_over():
        p.reset_game()

    action = actions[np.random.randint(0, len(actions))]  # random actions
    reward = p.act(action)

    print "Score: {:0.3f} | Reward: {:0.3f} ".format(p.score(), reward)
Example No. 11
            return self.actions[1]
        elif fwd[1] < 0 and abs(fwd[1]) > abs(fwd[0]):
            return self.actions[2]
        elif fwd[0] < 0 and abs(fwd[0]) > abs(fwd[1]):
            return self.actions[3]
        else:
            return self.actions[4]

os.putenv('SDL_VIDEODRIVER', 'fbcon')
os.environ["SDL_VIDEODRIVER"] = "dummy"

# create our game
force_fps = True  # fixed timestep, game is not locked to real-time fps
display_screen = False
game = WaterWorld()

# make a PLE instance.
p = PLE(game, force_fps=force_fps)

# init agent and game.
p.init()
p.display_screen = True

reward = 0
agent = MyAgent(p.getActionSet())
while not p.game_over():
    state = p.getGameState()
    action = agent.pickAction(reward, state)
    reward = p.act(action)
print p.score()
Example No. 12
def train():
    game = Snake(600, 600)
    p = PLE(game,
            fps=60,
            state_preprocessor=process_state,
            force_fps=True,
            display_screen=False,
            frame_skip=2,
            reward_values={
                "positive": 100.0,
                "negative": -50.0,
                "tick": -0.1,
                "loss": -110.0,
                "win": 5.0
            })
    agent = Agent(alpha=float(sys.argv[1]),
                  gamma=float(sys.argv[2]),
                  n_actions=3,
                  epsilon=0.99,
                  batch_size=100,
                  input_shape=6,
                  epsilon_dec=0.99999,
                  epsilon_end=0.001,
                  memory_size=500000,
                  file_name=sys.argv[3],
                  activations=[str(sys.argv[4]),
                               str(sys.argv[5])])
    p.init()
    # agent.load_game()

    scores = []

    for _ in range(100000):
        if p.game_over():
            p.reset_game()
        score = 0
        initial_direction = "Right"

        while not p.game_over():
            old_state = np.array(
                vision(list(p.getGameState()[0]), initial_direction))

            action = agent.choose_action(old_state)

            possible_directions = prepare_corect_directions(initial_direction)
            possible_directions_tuples = list(
                zip(possible_directions.keys(), possible_directions.values()))
            direction = possible_directions_tuples[action]
            initial_direction = direction[1]

            reward = p.act(direction[0])

            new_state = np.array(
                vision(list(p.getGameState()[0]), initial_direction))
            agent.add_experience(old_state, action, reward, new_state)
            agent.learn()
            score = p.score()
        scores.append(score)
        print(
            f"Score for model iteration number {str(sys.argv[3])} with learning_rate {sys.argv[1]}, gamma {sys.argv[2]}, activations: {sys.argv[4], sys.argv[5]} is score {score}. Epsilon is {agent.epsilon}"
        )
        agent.save_game()
Example No. 13
class FlappyBirdEnvironment(Environment):
    def __init__(self):
        env = FlappyBird()
        self.p = PLE(env, add_noop_action=True)
        self.p.init()
        self.win_score = 10.
        action_space = len(self.p.getActionSet())
        state_space = len(self.p.getGameState())
        actions = ["up", "nothing"]
        state_names = list(self.p.getGameState().keys())

        Environment.__init__(self, env, action_space, state_space, actions,
                             state_names)

    def reset_environment(self):
        self.p.reset_game()

    def get_state(self) -> np.array:
        state = list(self.p.getGameState().values())
        state = np.array(state)
        return state

    def get_normalized_state(self) -> np.array:
        """Get the current state of the environment with each
        state attribute normalized in [0, 1], ready to be fed to a NN.

        Returns:
            The current normalized state (np.array)
        """

        state = self.get_state()

        states_mins = np.array([0., -10., 0., 0., 103., 103., 0., 103.])
        states_maxs = np.array([410., 10., 288., 205., 308., 410., 205., 308.])
        state = (state - states_mins) / (states_maxs - states_mins)
        return state

    def environment_step(self, action: int) -> (np.array, int, bool):
        """Do a move in the environment.

        Args:
            action: The action to take

        Returns:
            The next state, the reward obtained by doing the action, and if the environment is terminated
        """
        p_action = self.p.getActionSet()[action]
        reward = self.p.act(p_action)
        done = self.p.game_over()
        if self.p.score() >= self.win_score:
            done = True
        next_state = self.get_state()
        return next_state, reward, done

    def render_environment(self):
        self.p.display_screen = True
        self.p.force_fps = False

    def pass_test(self, rewards: List[float]):
        if np.mean(rewards) >= self.win_score:
            return True
        else:
            return False

    def close(self):
        pygame.quit()

    def win_condition(self, episode: Episode):
        if episode.total_reward >= self.win_score:
            return True
        else:
            return False
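A short driver for the wrapper above (hypothetical usage sketch with random actions; assumes numpy is imported as np in this module):

env = FlappyBirdEnvironment()
env.reset_environment()
done, total_reward = False, 0.0
while not done:
    action = np.random.randint(2)  # index into getActionSet(): 0 = "up", 1 = "nothing"
    _, reward, done = env.environment_step(action)
    total_reward += reward
env.close()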
Example No. 14
if __name__ == '__main__':
    #p.init() #do I even need this? Kaio didn't seem to be using it for naive agent
    print(game.getActions())
    thresh = False
    reward = 0
    steps = 1000
    la = LearningAgent(list(game.getActions()))
    #where is the documentation for extract_image? I imagine it comes from utils
    snapshot = extract_image(p.getScreenRGB(), (80,80), thresh=thresh)
    #stack four copies of the first frame to form the initial 4-frame state
    stack_snaps = np.stack((snapshot, snapshot, snapshot, snapshot), axis=0)

    while not p.game_over():
        snapshot = extract_image(p.getScreenRGB(), (80, 80), thresh=thresh)
        snapshot = np.reshape(snapshot, (1, 80, 80))
        st = np.append(stack_snaps[1:4, :, :], snapshot, axis=0)  # st = the updated 4-frame state stack

        if train:
            reward, action, _, _, _ = train_and_play(p_action, st, select_action, perform_action, possible_actions, optimize, None, {})
            push_to_memory(stack_snaps, action, st, reward)
        else:
            play(p_action, st, select_action, perform_action, possible_actions, None, {})

        stack_snaps = st

    score = p.score()
    p.reset_game()
    if train:
        save_model(save_path)

    #return score #how to do this when this isn't technically a function?
Example No. 15
# Game initialization
game = Snake(height=case_size * size, width=case_size * size)
p = PLE(game, fps=30, display_screen=True)

agent = Trainer(allowed_actions=p.getActionSet(),
                height=game.height,
                width=game.width)

p.init()
reward = 0.0
nb_frames = 10000000000000000
bestScore = 0

for i in range(nb_frames):

    if (p.score() > bestScore):
        bestScore = int(p.score())
        print('New Best Score : ' + str(bestScore) + ' at ' +
              str(datetime.datetime.now()))

    if p.game_over():
        p.reset_game()

    observation = p.getGameState()
    food_location = [
        int(observation.get('food_x') / 10),
        int(observation.get('food_y') / 10)
    ]
    snake_location = [
        int(observation.get('snake_head_x') / 10),
        int(observation.get('snake_head_y') / 10)
Example No. 16
class UMDAc():
    def __init__(self,
                 gen_size,
                 net_size,
                 activation,
                 env,
                 max_steps=None,
                 seed=0,
                 action_mode='argmax',
                 iterations=1,
                 display_info=False):

        ## Global variables
        self.gen_size = gen_size
        self.net_size = net_size
        self.activation = activation
        self.iterations = iterations

        self.seed = seed
        self.max_steps = max_steps

        ## Detect environment, OpenAI or PLE
        try:
            ## Environment is from OpenAI Gym
            self.state_size = env.observation_space.shape[0]
            self.openai = True
            self.ple = False

            self.env = env  ## Environment

            try:
                ## Size of action vector agent can take
                self.action_size = env.action_space.n
            except:
                ## Size of action vector agent can take
                self.action_size = env.action_space.shape[0]

        except:
            ## Environment is from PLE
            self.openai = False
            self.ple = True

            self.game = env
            ## Init environment
            self.env = PLE(self.game, fps=30, display_screen=True, rng=0)
            ## Allowed actions set
            self.allowed_actions = list(self.env.getActionSet())
            self.action_size = len(self.allowed_actions)
            #self.state_size = len(self.game.getGameState())
            self.state_size = self._ple_get_state().shape[1]

        if display_info:
            ## Print environment info
            print('\n' + '#' * 5, ' Environment data: ', '#' * 5)
            print('Type (Autodetected): ', 'Gym' if self.openai else 'PLE')
            print('State size: ', self.state_size)
            print('Action size: ', self.action_size)
            print('')
            print('Iterations: ', self.iterations)
            print('')
        '''
        ACTION MODE:
            Determines how output data from neural network
            will be treated. Three options:
                - raw
                - argmax
                - tanh
        '''
        self.action_mode = action_mode

        self.fitness = {}  # Init fitness log

        ## Create first generation randomly
        self.gen = {}  # Init generation 0

        ## Create random specimens
        for i in range(gen_size):
            ## Generate specimen weights and biases
            specimen = {}
            ## First layer
            specimen['h0'] = np.random.uniform(-1, 1,
                                               [self.state_size, net_size[0]])
            specimen['b0'] = np.random.uniform(-1, 1, [1, net_size[0]])

            ## Intermediate layers
            h_i = 1
            for layer in net_size[1:]:
                ## Generate hidden layers and biases
                specimen['h' + str(h_i)] = np.random.uniform(
                    -1, 1, [net_size[h_i - 1], net_size[h_i]])
                specimen['b' + str(h_i)] = np.random.uniform(
                    -1, 1, [1, net_size[h_i]])

                h_i += 1

            ## Last layer
            specimen['h' + str(h_i)] = np.random.uniform(
                -1, 1, [net_size[h_i - 1], self.action_size])
            specimen['b' + str(h_i)] = np.random.uniform(
                -1, 1, [1, self.action_size])

            ## Add specimen to generation
            self.gen['s' + str(i)] = specimen
            ## Add specimen to fitness log, init with fitness
            ## value of 0
            self.fitness['s' + str(i)] = 0.

            ## Create a dictionary to hold new specimens
            self.new = {}

            ## First new specimen (reference specimen)
            reference = {}

            reference['h0'] = np.empty([self.state_size, net_size[0]])
            reference['b0'] = np.empty([1, net_size[0]])
            ## Intermediate layers
            h_i = 1
            for layer in net_size[1:]:
                ## Generate hidden layers and biases
                reference['h' + str(h_i)] = np.empty(
                    [net_size[h_i - 1], net_size[h_i]])
                reference['b' + str(h_i)] = np.empty([1, net_size[h_i]])

                h_i += 1

            ## Last layer
            reference['h' + str(h_i)] = np.empty(
                [net_size[h_i - 1], self.action_size])
            reference['b' + str(h_i)] = np.empty([1, self.action_size])

            ## Add reference to dict
            self.new['n0'] = reference

    def show(self, name, show_weights=False):
        ## For every layer in specimen
        for l_i in range(int(len(self.gen[name]) / 2)):
            ## Print info about layer and bias
            print('-' * 5, " layer Nº", str(l_i), ' ', '-' * 5)
            print(' * Neurons: ', self.gen[name]['h' + str(l_i)].shape[1],
                  '\n', '* Weights of each neuron: ',
                  self.gen[name]['h' + str(l_i)].shape[0], '\n', '* Biases: ',
                  self.gen[name]['b' + str(l_i)].shape[1], '\n')

            if show_weights:
                ## Show weight values
                print("* Weights:")
                print(self.gen[name]['h' + str(l_i)])
                print("* Biases:")
                print(self.gen[name]['b' + str(l_i)])
                print('')

    def pass_forward(self, feature, specimen):

        in_data = feature  ## Load input data

        for l_i in range(int(len(specimen) / 2)):
            ## Pass through weights and sum
            h_z = np.dot(in_data,
                         specimen['h' + str(l_i)]) + specimen['b' + str(l_i)]
            ## Activation function
            h_a = self.activation(h_z)
            ## Pass data to next layer
            in_data = h_a
        ## Return last activation
        return h_a

    def gym_evaluate(self, specimen, render=False, time_sleep=.0):

        seed = self.seed  ## Initial random seed
        reward_log = []  ## For later use in total reward sum if iterations > 1
        for iters in range(self.iterations):

            ## Reset environment
            self.env.seed(seed)
            state = self.env.reset()

            t_reward = 0  ## Reset total reward

            if self.max_steps != None:
                ## Finite time steps
                for step in range(self.max_steps):
                    ## Render env
                    if render:
                        self.env.render()

                    ## Pass forward state data
                    output = self.pass_forward(state, specimen)

                    ## Format output to use it as next action
                    if self.action_mode == 'argmax':
                        action = np.argmax(output[0])

                    elif self.action_mode == 'raw':
                        action = output[0]

                    elif self.action_mode == 'tanh':
                        action = np.tanh(output[0])

                    ## Run new step
                    state, reward, done, _ = self.env.step(action)
                    time.sleep(time_sleep)  ## Wait time

                    ## Add current reward to total
                    t_reward += reward

                    if done:
                        break
                ## Used if iterations > 1
                reward_log.append(t_reward)
                ## Update seed to test agent in different scenarios
                seed += 1

            else:
                ## Test agent until game over
                done = False
                while not done:
                    ## Render env
                    if render:
                        self.env.render()

                    ## Pass forward state data
                    output = self.pass_forward(state, specimen)

                    ## Format output to use it as next action
                    if self.action_mode == 'argmax':
                        action = np.argmax(output[0])

                    elif self.action_mode == 'raw':
                        action = output[0]

                    elif self.action_mode == 'tanh':
                        action = np.tanh(output[0])

                    ## Run new step
                    state, reward, done, _ = self.env.step(action)
                    time.sleep(time_sleep)  ## Wait time

                    ## Add current reward to total
                    t_reward += reward
                    ## End game if game over
                    if done:
                        break
                ## Used if iterations > 1
                reward_log.append(t_reward)
                seed += 1  ## Update random seed

        ## Disable random seed
        ''' This prevents the algorithm from generating the
            same random numbers every time.   '''
        np.random.seed(None)
        ## Sum of total rewards in all iterations
        return sum(reward_log)

    def _ple_get_state(self):
        ## Adapt game observation to
        ## useful state vector
        observation = self.game.getGameState()
        state = []
        for item in observation:

            data = observation[item]

            if type(data) is dict:
                for d in data:
                    inf = np.array(data[d]).flatten()
                    for dt in inf:
                        state.append(dt)

            elif type(data) is list:
                data = np.array(data).flatten()
                for val in data:
                    state.append(val)
            else:
                state.append(data)

        return np.array([state])

    def ple_evaluate(self, specimen, time_sleep=.0):

        ## Set initial random seed
        np.random.seed(self.seed)

        class MyRandom():
            def __init__(self, seed):
                pass
                #np.random.seed(seed)
                #np.random.seed(0)
                #self.seed = seed
            def random_sample(self, size=None):
                return np.random.random_sample(size)

            def choice(self, a, size=None, replace=True, p=None):
                return np.random.choice(a, size, replace, p)

            def random_integers(self, rmin, rmax):
                return np.random.randint(rmin, rmax)

            def uniform(self, low=0.0, high=1.0, size=None):
                return np.random.uniform(low, high, size)

            def rand(self):
                return np.random.rand()

        reward_log = []  ## Log of all total rewards

        if self.max_steps != None:

            for i in range(self.iterations):

                ## Initialize game
                self.game.rng = MyRandom(self.seed)
                self.game.init()  ## Reset game
                t_reward = .0  ## Reset total reward

                for time_step in range(self.max_steps):
                    ## Get state
                    state = self._ple_get_state()
                    ## Output from specimen for given state
                    output = self.pass_forward(state, specimen)
                    ## Convert specimen output to action
                    act = self.allowed_actions[np.argmax(output[0])]
                    ## Take action
                    self.env.act(act)
                    ## Wait time useful if render is enabled
                    time.sleep(time_sleep)
                    ## Update total reward
                    t_reward = self.env.score()
                    ## End game if game over
                    if self.env.game_over():
                        break

                ## Log reward for later sum
                reward_log.append(t_reward)

        else:
            ## No step limit: run until game over
            for i in range(self.iterations):

                ## Initialize game
                self.game.rng = MyRandom(self.seed)
                self.game.init()
                t_reward = .0  ## Reset total reward

                while not self.env.game_over():
                    ## Get state
                    state = self._ple_get_state()
                    ## Take action
                    output = self.pass_forward(state, specimen)
                    act = self.allowed_actions[np.argmax(output[0])]
                    self.env.act(act)
                    ## Useful if render enabled
                    time.sleep(time_sleep)
                    ## Update total reward
                    t_reward = self.env.score()
                ## Log all total rewards
                reward_log.append(t_reward)

        ## Disable random seed
        ''' This prevents the algorithm from generating the
            same random numbers every time.   '''
        np.random.seed(None)
        ## Sum all total rewards
        return sum(reward_log)

    def train(self, n_surv, n_random_surv):

        ## Collect data about generation
        survivors = list(self.fitness.keys())  ## Survivors' names
        survivors_fitness = list(
            self.fitness.values())  ## Survivors' fitness values

        worsts = []  ## Worst specimens names
        worsts_fitness = []  ## Worst specimens fitness values

        ## Select best fitness survivors
        n_r = len(survivors) - n_surv  ## Number of not survivor specimens
        for n in range(n_r):

            ## Select worst specimen
            indx = survivors_fitness.index(min(survivors_fitness))
            ## Save worsts
            worsts.append(survivors[indx])
            worsts_fitness.append(survivors_fitness[indx])
            ## Delete worsts from survivors lists
            del survivors[indx]
            del survivors_fitness[indx]

        ## Randomly select bad specimens to survive
        for i in range(n_random_surv):
            ## Random index
            indx = np.random.randint(len(worsts))
            ## Add random specimen to survivors
            survivors.append(worsts[indx])
            survivors_fitness.append(worsts_fitness[indx])
            ## Update worst specimens' lists
            del worsts[indx]
            del worsts_fitness[indx]

        ## Generate new specimens (empty):
        for i in range(len(worsts)):
            self.new['n' + str(i)] = copy.deepcopy(self.gen['s0'])

        for param in self.gen['s0']:
            ## For each parameter
            for i in range(self.gen['s0'][param].shape[0]):
                for j in range(self.gen['s0'][param].shape[1]):
                    ## layer[i][j] weight of each survivor
                    w = []
                    ## For each survivor
                    for name in survivors:
                        w.append(self.gen[name][param][i][j])

                    ## NOTE: Experimental
                    #n_mut = int(len(w)*.3)
                    #muts = np.random.rand(n_mut)

                    #w = np.array(w)
                    #np.random.shuffle(w)
                    #
                    #w = np.delete(w, range(len(w)-n_mut, len(w)), 0)

                    #w = np.hstack((w, muts))
                    #np.random.shuffle(w)
                    ## END OF NOTE

                    ## Compute weights list's mean
                    mean = np.mean(w)
                    ## Standard deviation
                    std = np.std(w)

                    ## Get samples
                    samples = np.random.normal(mean, std, len(worsts))

                    i_sample = 0  ##  Iterator
                    ## Generate new specimens
                    for name in self.new:
                        ## Update weight
                        self.new[name][param][i][j] = samples[i_sample]
                        i_sample += 1

        ## After generating a set of new specimens
        new_names = []
        new_fitness = []

        for name in self.new:
            ## Load specimen
            specimen = self.new[name]
            ## Evaluate new specimens
            ## and store data for later comparison
            new_names.append(name)

            if self.openai:
                new_fitness.append(self.gym_evaluate(specimen))

            elif self.ple:
                new_fitness.append(self.ple_evaluate(specimen))
        '''
        Selection. Replace all specimens in the worsts list
        with best specimens of the to_select lists.
        '''
        to_select_names = new_names + worsts
        to_select_fitness = new_fitness + worsts_fitness

        for i in range(len(worsts)):
            indx = np.argmax(to_select_fitness)

            ## Add selected specimen to new generation
            if 'n' in to_select_names[indx]:
                ## Replace specimen
                self.gen[worsts[i]] = copy.deepcopy(
                    self.new[to_select_names[indx]])

            else:
                ## Replace specimen
                self.gen[worsts[i]] = copy.deepcopy(
                    self.gen[to_select_names[indx]])

            ## Update selection lists
            del to_select_names[indx]
            del to_select_fitness[indx]

    def add_neurons(self, layer_name, n_neurons=1):

        ## To all specimens in generation
        for name in self.gen:

            ## Load specimen
            specimen = self.gen[name]

            last_indx = int(len(specimen) / 2) - 1  ## Number of layers
            sel_indx = int(layer_name[1])  ## Selected layer's index

            ## Add neuron to layer
            new_neuron = np.random.rand(specimen[layer_name].shape[0],
                                        n_neurons)
            specimen[layer_name] = np.hstack(
                (specimen[layer_name], new_neuron))

            ## Add new bias
            new_bias = np.random.rand(1, n_neurons)
            specimen['b' + str(sel_indx)] = np.hstack(
                (specimen['b' + str(sel_indx)], new_bias))

            ## Check if the selected layer is
            ## the last (output layer) of the net
            if sel_indx != last_indx:
                next_layer = specimen['h' + str(sel_indx + 1)]
                ## Selected layer isn't the last
                ## Generate new weights
                new_w = np.random.rand(n_neurons, next_layer.shape[1])
                ## Add weights to next layer
                specimen['h' + str(sel_indx + 1)] = np.vstack(
                    (new_w, next_layer))

    def add_layer(self, n_neurons):
        ## Add one layer to all specimens
        ## The new layer is added before
        ## the output layer

        ## Define network's layers
        specimen = self.gen['s0']
        layers = []
        layers_shape = []
        biases = []
        biases_shape = []
        for l in specimen:
            if 'h' in l:
                layers.append(l)
                layers_shape.append(specimen[l].shape)
            elif 'b' in l:
                biases.append(l)
                biases_shape.append(specimen[l].shape)

        for name in self.gen:
            ## Load specimen
            specimen = self.gen[name]
            ## Reset output layer
            new_o = np.random.rand(n_neurons, self.action_size)
            ## Reset output layer bias
            new_o_b = np.random.rand(1, self.action_size)
            ## Create new layer
            new_l = np.random.rand(layers_shape[-2][1], n_neurons)
            new_l_b = np.random.rand(1, n_neurons)

            specimen[layers[-1]] = new_l
            specimen[biases[-1]] = new_l_b
            specimen['h' + str(len(layers))] = new_o
            specimen['b' + str(len(biases))] = new_o_b

    def save_specimen(self, specimen, filename='specimen0.txt'):
        ## Open file
        f = open(filename, 'w')
        ## Write layers
        for layer in specimen:
            f.write(layer + '\n')
            f.write(str(specimen[layer].tolist()) + '\n')

        f.close()  # Close file

    def load_specimen(self, filename):

        import ast

        ## Open file
        f = open(filename, 'r')
        ## Init specimen
        specimen = {}
        ## Read file
        array = False
        for line in f.readlines():
            line = line.split('\n')[0]
            if array:
                ## Convert string to np array
                layer = np.array(ast.literal_eval(line))
                specimen[layer_name] = layer  ## Add layer
                array = False

            else:
                layer_name = line
                array = True
        f.close()  ## Close
        return specimen
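A minimal sketch of how this class might be driven with a PLE game (FlappyBird, the layer sizes and the survivor counts below are illustrative assumptions, not part of the original code):

from ple.games.flappybird import FlappyBird

game = FlappyBird()
umda = UMDAc(gen_size=30, net_size=[16], activation=np.tanh, env=game,
             max_steps=None, iterations=1, display_info=True)

for generation in range(100):
    # evaluate every specimen and record its fitness
    for name in umda.gen:
        umda.fitness[name] = umda.ple_evaluate(umda.gen[name])
    # keep the 10 best plus 5 randomly chosen survivors, resample the rest
    umda.train(n_surv=10, n_random_surv=5)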
Example No. 17
# load the model
save_path = '.\model_dir\model_6700_2823.0.ckpt'  #episode_reward: 1785.0
agent.restore(save_path)

obs = list(env.getGameState().values())
# # preprocess obs
# obs = preprocess(obs)
episode_reward = 0
while True:
    # predict the action; choose only the optimal action
    action = agent.predict(obs)
    # sleep briefly because the frames render too fast
    # time.sleep(0.02)  # delay unit is seconds
    # # show the score in a new window
    observation = env.getScreenRGB()
    score = env.score()
    # convert the color format
    observation = cv2.cvtColor(observation, cv2.COLOR_RGB2BGR)
    # rotate 90 degrees (transpose)
    observation = cv2.transpose(observation)
    font = cv2.FONT_HERSHEY_SIMPLEX
    observation = cv2.putText(observation, "score:" + str(int(score)), (0, 30),
                              font, 0.6, (0, 0, 255), 2)
    cv2.imshow("flappybird", observation)
    cv2.waitKey(5)

    reward = env.act(actionset[action])
    obs = list(env.getGameState().values())
    # # preprocess obs
    # obs = preprocess(obs)
    done = env.game_over()
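env, actionset and agent are created earlier in the original script; a plausible setup for the environment side (hypothetical sketch; the agent itself is project-specific and exposes restore/predict) would be:

import cv2
from ple import PLE
from ple.games.flappybird import FlappyBird

game = FlappyBird()
env = PLE(game, fps=30, display_screen=True)
env.init()
actionset = env.getActionSet()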
Example No. 18
class Environment():
    def __init__(self, device, display=True):
        # Design reward
        reward_values = {
            "positive": 1,
            "tick": 0.1,
            "loss": -1,
        }
        self.env = PLE(FlappyBird(),
                       display_screen=display,
                       reward_values=reward_values)
        self.device = device
        self.action_set = self.env.getActionSet()

        self.frames = []

    def reset(self):
        self.env.reset_game()

    def start(self):
        self.env.act(0)
        obs = convert(self.env.getScreenGrayscale())
        self.state = np.stack([[obs for _ in range(4)]], axis=0)
        self.t_alive = 0
        self.total_reward = 0

        return self.state

    def game_over(self):
        return self.env.game_over()

    def getScore(self):
        return self.env.score()

    def step(self, action):

        reward = self.env.act(self.action_set[action])

        # make next state
        obs = convert(self.env.getScreenGrayscale())
        obs = np.reshape(obs, [1, 1, obs.shape[0], obs.shape[1]])
        next_state = np.append(self.state[:, 1:, ...], obs, axis=1)

        self.t_alive += 1
        self.total_reward += reward
        self.state = next_state

        return self.state, reward, self.env.game_over()

    def get_screen(self):
        return self.env.getScreenRGB()

    def record(self):
        self.frames.append(self.env.getScreenRGB())

    def saveVideo(self, episode, video_path):
        os.makedirs(video_path, exist_ok=True)
        clip = make_video(self.frames, fps=60).rotate(-90)
        clip.write_videofile(os.path.join(video_path,
                                          'env_{}.mp4'.format(episode)),
                             fps=60)
        print('Episode: {} t: {} Reward: {:.3f}'.format(
            episode, self.t_alive, self.total_reward))
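convert and make_video are helpers from the surrounding project; a plausible stand-in for convert (hypothetical, OpenCV-based) could be:

import cv2
import numpy as np

def convert(gray_screen, size=84):
    # hypothetical: downscale the grayscale screen and normalize pixel values to [0, 1]
    frame = cv2.resize(gray_screen, (size, size))
    return frame.astype(np.float32) / 255.0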
Example No. 19
	# print "Action = ", action
	reward = p.act(p.getActionSet()[action])
	# print "Reward = ", reward
	if p.game_over():
		episode_over = True
		# print ">>>DEAD!"
	observation = game.getGameState()
	observation = ((int(observation["player_y"]) - int(observation["next_pipe_bottom_y"])), int(observation["next_pipe_dist_to_player"]), int(observation["player_vel"]))
	# print "Next observation = ", observation
	agent.update(action, reward, observation, episode_over)

	if episode_over:
		batch_sum += frame_count		
		episode_count += 1		
		if episode_count % 100 == 0:
			output.write("Episode " + str(episode_count) + ", Score = " + str(p.score()) + ", Avg Frames survived = " + str(batch_sum / 100) + "Q Size = " + str(len(agent.q)) + "\n")
			print "Episode ", episode_count, ", Score = ", p.score(), ", Avg Frames survived = ", batch_sum / 100, "Q Size = ", len(agent.q)
			batch_sum = 0
			if p.score() > max_score:
				max_score = p.score()
				# q_table = copy.deepcopy(agent.q)
				q_table = dict(agent.q)
				pickle.dump(q_table, open("agent_q.p", "wb"))
		p.reset_game()
		observation = game.getGameState()
		observation = ((int(observation["player_y"]) - int(observation["next_pipe_bottom_y"])), int(observation["next_pipe_dist_to_player"]), int(observation["player_vel"]))
		agent.state = observation

		frame_count = 0
	# print "observation = ", observation
	# print "reward = ", reward
Example No. 20
    # number of training episodes
    episodes = 20000
    # instantiate the game object
    game = FlappyBird()
    # PLE wraps the game and provides a convenient interface
    p = PLE(game, fps=30, display_screen=True)
    # initialize
    p.init()
    # instantiate the Agent, passing in the action set
    agent = Agent(p.getActionSet())

    for episode in range(episodes):
        # reset the game
        p.reset_game()
        # get the current state
        state = agent.get_state(game.getGameState())

        while True:
            # get the best action
            action = agent.get_best_action(state)
            # perform the action and obtain the reward
            reward = agent.act(p, action)
            # get the state after taking the action
            next_state = agent.get_state(game.getGameState())
            state = next_state
            if p.game_over():
                print("Current score: {}".format(p.score()))
                break
            # slow the bird down a little
            time.sleep(0.02)
Example No. 21
        processed_state.append(creep[1])

    return np.array((processed_state, ))


p.init()
actions = p.getActionSet()[:-1]
agent = Agent(len(actions))

epochs = 10000000
game_duration = 1000
for epoch in range(epochs):
    p.reset_game()

    for it in range(game_duration):
        if p.game_over():
            p.reset_game()
            print "Finished with score:" + str(p.score())

        current_state = game.getGameState()
        processed_current_state = process_state(current_state)

        action = agent.act(processed_current_state)
        # action = actions[np.random.randint(0, len(actions))]
        reward = p.act(actions[action])
        next_state = game.getGameState()
        game_over = p.game_over()

        print "Current score: " + str(p.score())
    print "Finished with score:" + str(p.score())
Example No. 22
p.state_preprocessor = agent.process_state

#agent.load("model.h5")
#agent.epsilon = 0.05

fail, catch, j = 0, 0, 0
best_score = -np.inf
nb_games = 1

while 1:
    j += 1

    # reset from time to time
    if p.game_over() or j == 50000:
        fail, catch, j = 0, 0, 0
        best_score = max(best_score, p.score())
        nb_games += 1
        p.reset_game()

    observation = p.getGameState()
    action = agent.pickAction(observation)
    reward = p.act(action_set[action])

    if reward < -0.5:
        fail += 1
    if reward > 0.5:
        catch += 1

    agent.remember(observation, action, reward, p.getGameState(),
                   p.game_over())