Example #1
import os
import sys
import random

import gym
import numpy as np
import matplotlib.pyplot as plt

# Agent, Memory, convert_state, immediate_reward and get_int_input are
# provided by this project's own modules.


def main():
    # keep pyplot from freezing the current process
    plt.ion()

    # silence TensorFlow, see https://stackoverflow.com/questions/35911252/disable-tensorflow-debugging-information
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # or any of {'0', '1', '2'}

    print("Inicializando ambiente...")
    env = gym.make("FrozenLake-v0")

    print("Inicializando agente...")
    action_size = env.action_space.n
    state_size = env.observation_space.n
    gamma = 0.96
    learning_rate = 0.81
    epsilon = 1.0  # Exploration rate
    max_epsilon = 1.0  # Exploration probability at start
    min_epsilon = 0.01  # Minimum exploration probability
    decay_rate = 0.005

    x = Agent(learning_rate,
              gamma,
              action_size,
              state_size,
              decay_rate,
              epsilon=epsilon,
              max_epsilon=max_epsilon,
              min_epsilon=min_epsilon)

    memo_size = get_int_input(
        "Olá! Insira o tamanho da memória utilizada para treinar o agente:",
        50000)
    memo = Memory(memo_size)

    batch_size = get_int_input(
        "Insira o tamanho do batch (quantos slots de memória serão utilizados por "
        + "vez para treinar o agente:", 50)

    total_episodes = get_int_input(
        "Insira a quantidade de episodios que o agente irá treinar:", 10000)

    max_steps = get_int_input(
        "Insira a quantidade de acoes que o agente irá realizar " +
        "por episódio:", 100)

    # List of rewards
    rewards = []

    # list of reward averages, used to plot the chart
    reward_mean = []

    # lists tracking wins per episode and the running win percentage
    victories = []
    victory_percentage = []

    print("Iniciando treinamento do agente.")
    print("Pode apertar Ctrl-C durante o treinamento para interrompê-lo")
    print("Treinando...")

    try:
        # For each episode, until training finishes or is interrupted
        for episode in range(total_episodes):

            print("Episódio " + str(episode + 1) + " de " +
                  str(total_episodes),
                  end="\r")

            # Reset the environment
            state = convert_state(env.reset())
            step = 0
            done = False
            total_rewards = 0
            victory = False

            for step in range(max_steps):

                numero_aleatorio = random.uniform(0, 1)

                # if our random number is greater than epsilon,
                # we should exploit the knowledge acquired so far;
                # otherwise, explore
                if numero_aleatorio > x.epsilon:
                    action = x.act(state)
                else:
                    action = env.action_space.sample()

                # Take the action (a) and observe the outcome state(s') and reward (r)
                new_state, reward, done, info = env.step(action)
                new_state = convert_state(new_state)

                if reward == 1:
                    victory = True

                reward = immediate_reward(reward)

                # store the transition tuple so it can be added to memory
                sample = np.array([state, action, new_state, reward])

                # add the tuple to memory
                memo.add_sample(sample)

                # have the agent learn from a batch of samples drawn from memory
                x.learn_batch(memo.sample(batch_size))

                # The new state becomes the current state
                state = new_state

                total_rewards += reward

                # If done (the episode has ended), finish the episode
                if done:
                    break

            rewards.append(total_rewards)
            reward_mean.append(sum(rewards) / (episode + 1))

            if victory:
                victories.append(1)
            else:
                victories.append(0)
            victory_percentage.append(sum(victories) / (episode + 1))

            x.decay_epsilon(episode)

            plt.figure("Média de Vitórias")
            plt.plot(victory_percentage)

            plt.figure("Média de Recompensas")
            plt.plot(reward_mean)

            plt.show()
            plt.pause(0.0000000000001)
            plt.clf()

    except KeyboardInterrupt:
        print()

    print()
    print("Treino finalizado!")
    print("Recompensa acumulada: " + str(sum(rewards) / total_episodes))
    print()

    try:
        input("Aperte ENTER para ver o agente jogando. Ctrl-C para cancelar.")
    except KeyboardInterrupt:
        print("\nTchau!")
        sys.exit(0)

    ############### THE AGENT PLAYS FROM HERE ON ############

    # reset the environment
    env.reset()

    total_episodes = get_int_input(
        "Olá! Insira a quantidade de episodios que o agente irá jogar:", 5)

    max_steps = get_int_input(
        "Olá! Insira a quantidade de acoes que o agente irá realizar " +
        "por episódio:", 100)

    for episode in range(total_episodes):
        state = convert_state(env.reset())
        step = 0
        done = False
        print("****************************************************")
        print("EPISÓDIO ", episode)

        for step in range(max_steps):
            # Take the action (index) that has the maximum expected future reward given that state
            action = x.act(state)

            new_state, reward, done, info = env.step(action)
            new_state = convert_state(new_state)

            if done:
                # Here, we decide to only render the last state (to see whether our agent reached the goal or fell into a hole)
                env.render()

                # Print the number of steps it took.
                print("Number of steps: ", step)
                break

            state = new_state

    env.close()

    input("Concluído! Aperte ENTER para finalizar.")
Example #2
            state_stack = tf.Variable(np.repeat(state, NUM_FRAMES).reshape((POST_PROCESS_IMAGE_SIZE[0],
                                                                            POST_PROCESS_IMAGE_SIZE[1],
                                                                            NUM_FRAMES)))
            cnt = 1
            avg_loss = 0
            tot_reward = 0
            images = []
            while True:
                if RENDER:
                    env.render()
                action = choose_action(state=state_stack, online_network=online_network, eps=eps, delay_steps=delay_steps)
                next_state, reward, done, info = env.step(action=action)
                tot_reward += reward
                state_stack = process_state_stack(state_stack=state_stack, state=next_state)
                # save the new state in memory
                memory.add_sample(frame=next_state, action=action, reward=reward, done=done)
                # store the frame so a GIF can be created later
                images.append((next_state*255).round().astype(np.uint8))

                if delay_steps > DELAY_TRAINING:
                    loss = online_network.train_model(memory=memory, target_network=target_network)
                    online_network.update_network(target_network=target_network)
                    with train_writer.as_default():
                        tf.summary.scalar('loss', loss, step=total_steps)
                else:
                    loss = -1
                avg_loss += loss

                # decrease the value of eps linearly
                if delay_steps > DELAY_TRAINING:
                    eps = linear_eps_decay(steps=total_steps)
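The fragment above relies on a linear_eps_decay() helper that is not included in the excerpt. A plausible sketch is shown below; MAX_EPSILON, MIN_EPSILON and EPS_DECAY_STEPS are assumed names and values (only DELAY_TRAINING actually appears in the fragment), so treat this as an illustration of the linear schedule rather than the original implementation.

# Hypothetical constants for this sketch only; the original code defines its own.
MAX_EPSILON = 1.0
MIN_EPSILON = 0.1
EPS_DECAY_STEPS = 500000


def linear_eps_decay(steps):
    # interpolate eps linearly from MAX_EPSILON down to MIN_EPSILON over
    # EPS_DECAY_STEPS steps, then hold it at MIN_EPSILON
    fraction = min(steps / EPS_DECAY_STEPS, 1.0)
    return MAX_EPSILON - fraction * (MAX_EPSILON - MIN_EPSILON)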
Example #3
class Agent:
    """ Parent class of TopAgent and BottomAgent.
        This hierarchy is needed because these two agents share alot of similar functionality (namely, take_action)
        but differ when it comes to converting the board state into their local state (the stat from their point of view) (get_perspective_state())
        The other key difference is their action_to_global_and_back() function which takes the agent's action and converts it to what the action
        looks like to the global state. For instance, when TopAgent moves up, it's a down move from the board's perspective. but up from the agent's perspective
    """

    def __init__(self, sess, static_actions, model, name):
        self.sess = sess

        # size of the state vector that is fed into the NN
        self.state_size = constants.BOARD_SIZE*2 + 1

        self.memory = Memory(constants.MEMORY_SIZE)
        # model is passed here in order to ensure there is only one model object that trains and performs q-learning
        self.model = model

        # static actions allows us to list out all the actions so that greedy_action and random_action can map
        # action_indexes to actions fast
        self.static_actions = static_actions

        # probability of taking a random action, which decays as training goes on.
        self.exploration_probability = constants.STARTING_EXPLORATION_PROBABILITY
        self.steps = 1

        self.game_loss = 0
        self.recent_loss = 0
        self.recent_loss_counter = 0
        
        self.name = name




    def take_action(self, board_state, only_inference, valid_human_action=None):
        """ takes in the state of the game
            determines an action (random or greedy)
            updates the state and collects the reward
            records (S, A, R, S') as a memory
            trains the NN on a batch of recent memories
        """
        # child method is called here.
        state_vector = self.get_perspective_state(board_state)


        if valid_human_action is None:
            if only_inference or random.random() > self.exploration_probability:
                action_index = self.greedy_action(state_vector, board_state)
            else:
                action_index = self.random_action(board_state)
            # in small grids, agents can become stuck if they are next to a wall and the enemy (can't move)
            # and thus action_index will be None in this case
            if action_index is None:
                return None # return None to signify that this game should be abandoned
            else:
                action = self.static_actions.all_actions[action_index]
        else:
            # human actions are special: they are already verified and legal when they arrive here,
            # but they are board-based actions (not in the agent's own perspective), so we need to account for this
            action = self.action_to_global_and_back(valid_human_action)
            action_index = self.static_actions.get_index_of_action(action)

        

        # child method again - 
        #   state needs to be converted to what it looks like to
        #   the board state, so that we can update the state properly
        state_action = self.action_to_global_and_back(action)
        reward = board_state.apply_action(self.name, state_action)



        next_state_vector = self.get_perspective_state(board_state)

        # memory is our training examples
        self.memory.add_sample(MemoryInstance(state_vector, action_index, reward, next_state_vector))
        # learn off a batch of recent memories
        self.q_learn()

        self.steps += 1
        self.exploration_probability = constants.ENDING_EXPLORATION_PROBABILITY + (constants.STARTING_EXPLORATION_PROBABILITY - constants.ENDING_EXPLORATION_PROBABILITY) \
            * math.exp(-constants.EXPLORATION_PROBABILITY_DECAY * self.steps)

        return reward



    def random_action(self, board_state):
        """ Random action to help with exploration.
            The probability of selecting a random move action over a random wall action is high,
            this is because there are many more wall actions than move actions,
            but in the real game move actions are more frequent, so we want our exploration
            phase of training to reflect this and select move much more often than wall """

        if random.random() < constants.MOVE_ACTION_PROBABILITY:
            actions = self.static_actions.move_actions
        else:
            actions = self.static_actions.all_actions
        
        action_indexes = [i for i in range(len(actions))]
        random.shuffle(action_indexes)

        return self.first_legal_action(action_indexes, board_state)


    def greedy_action(self, state_vector, board_state):
        """ Returns a greedy action taken from the model:
            1. gets the Q values from the model given the state
            2. sorts them
            3. go through them until a valid action is found
            4. return the action or of none are found, return a random valid action
        """

        q_values = self.model.predict_one(state_vector)
        q_values = q_values.flatten()

        _, action_indexes = self.sess.run(tf.nn.top_k(q_values, len(q_values)))
        action_indexes = action_indexes.tolist()

        # return the legal action with the highest q-value
        return self.first_legal_action(action_indexes, board_state)



    def first_legal_action(self, action_indexes, board_state):
        """ takes the first legal action found and returns it 
            This is used to take the highest legal Q valued action
        """
        for action_index in action_indexes:
            if self.is_legal_action(action_index, board_state):
                return action_index


    def is_legal_action(self, action_index, board_state):
        """converts this action to the board action and asks the board state if it's legal"""
        action = self.static_actions.all_actions[action_index]
        state_action = self.action_to_global_and_back(action)
        return board_state.is_legal_action(state_action, self.name)



    
    def get_perspective_state(self, board_state):
        """ Gets the agent's perspective of the state (as a vector)
            TopAgent overrides this since it has a different perspective than BottomAgent
        """
        full_grid_size = board_state.full_grid_size
        grid = board_state.build_grid(BoardElement.AGENT_BOT, BoardElement.AGENT_TOP)

        vector = []
        for y in range(full_grid_size):
            for x in range(full_grid_size):
                vector.append(grid[x][y])

        # my walls, then enemy walls
        vector.append(board_state.wall_counts[BoardElement.AGENT_BOT])
        vector.append(board_state.wall_counts[BoardElement.AGENT_TOP])

        vector = np.array(vector)
        return vector



    def action_to_global_and_back(self, agent_action):
        return agent_action


    def q_learn(self):
        """ Deep Q learning algorithm with memory. Uses the bellman equation.
            Each training example is (state, action, next state, reward)
            Q is R + gamme * max(s', a')
        """
        batch = self.memory.sample(self.model.get_batch_size())

        states = np.array([val[0] for val in batch])
        # guard against samples with no next state (treated as terminal below) by substituting a zero vector
        next_states = np.array([(np.zeros(self.model.get_num_states()) if val[3] is None else val[3]) for val in batch])

        # predict Q(s,a) given the batch of states
        q_s_a = self.model.predict_batch(states)

        # predict Q(s',a') - so that we can do gamma * max(Q(s'a')) below
        q_s_a_d = self.model.predict_batch(next_states)

        # setup training arrays
        x = np.zeros((len(batch), self.model.get_num_states()))
        y = np.zeros((len(batch), self.model.get_num_actions()))

        # convert each memory to a trainable example via the q-learning update
        for i, b in enumerate(batch):
            state, action, reward, next_state = b[0], b[1], b[2], b[3]

            # get the current q values for all actions in state
            current_q = q_s_a[i]

            # update the q value for action
            if next_state is None:
                # in this case, the game completed after action, so there is no max Q(s',a') prediction possible
                current_q[action] = reward
            else:
                current_q[action] = reward + constants.GAMMA * np.amax(q_s_a_d[i])
            x[i] = state
            y[i] = current_q

        _, l = self.model.train_batch(x, y)

        self.game_loss = l
        self.recent_loss += l
        self.recent_loss_counter += 1



    def get_exploration_probability(self):
        return self.exploration_probability


    def get_game_loss(self):
        """ returns loss from previous game """
        return self.game_loss

    def get_recent_loss(self):
        """ returns the average recent loss since this function was last called """
        recentloss = self.recent_loss / self.recent_loss_counter
        self.recent_loss = 0
        self.recent_loss_counter = 0
        return recentloss
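The class docstring notes that TopAgent differs mainly in action_to_global_and_back(), which maps its own "up" to the board's "down". As a rough illustration of that idea only (the project's actual TopAgent and its action representation are not part of this excerpt), an override could mirror the vertical component of a move; the (row_delta, col_delta) tuple below is an invented representation used purely for the sketch.

class TopAgentSketch(Agent):
    """ Illustrative sketch, not the project's TopAgent: assumes a move action
        is a plain (row_delta, col_delta) tuple, which is an assumption made
        only for this example. """

    def action_to_global_and_back(self, agent_action):
        # The agent's "up" is the board's "down", so flip the vertical component
        # while leaving the horizontal component unchanged.
        row_delta, col_delta = agent_action
        return (-row_delta, col_delta)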