Example 1
# NOTE: the imports below are an assumption about this snippet's context; ExperienceReplay and
# DoubleDQN are expected to come from the project's own modules.
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
from tensorflow.keras.models import load_model, model_from_json


class DQNAgent:
    def __init__(self,
                 env,
                 net_update_rate: int = 25,
                 exploration_rate: float = 1.0,
                 exploration_decay: float = 0.00005):
        # set hyper parameters
        self.exploration_rate = exploration_rate
        self.exploration_decay = exploration_decay
        self.net_updating_rate = net_update_rate

        # set environment
        self.env = env
        self.state_shape = env.get_state_shape()
        self.action_shape = env.get_action_shape()

        # Experience replay memory used for batch learning
        self.exp_rep = ExperienceReplay()

        # Deep Q Network
        self.net = None

    def set_model(self, model):
        """ Sets the model the agent is used to train. Receives a compiled tf Model with
            input_shape = env.observation_space and output_shape = env.action_s pace"""
        self.net = DoubleDQN(model)

    def get_action(self, state: np.ndarray, eps=0) -> int:
        """Given a state returns a random action with probability eps, and argmax(q_net(state)) with probability 1-eps.
           (only legal actions are considered)"""
        if self.net is None:
            raise NotImplementedError(
                'agent.get_action was called before a model was set.\n Please set the agent\'s model'
                ' using the set_model method. You can access the state and action shapes using the'
                ' agent\'s \'get_state_shape\' and \'get_action_shape\' methods.'
            )
        legal_actions = self.env.get_legal_actions(state)

        if np.random.random() >= eps:  # Exploitation

            # Calculate the Q-value of each action
            q_values = self.net.predict(state[np.newaxis, ...],
                                        np.expand_dims(legal_actions, 0))

            # Make sure we only choose between available actions
            legal_actions = np.logical_and(legal_actions,
                                           q_values == np.max(q_values))

        return np.random.choice(np.flatnonzero(legal_actions))

    def update_net(self, batch_size: int):
        """ if there are more than batch_size experiences, Optimizes the network's weights using the Double-Q-learning
         algorithm with a batch of experiences, else returns"""
        if self.exp_rep.get_num() < batch_size:
            return
        batch = self.exp_rep.get_batch(batch_size)
        self.net.fit(*batch)

    def train(self,
              episodes: int,
              path: str,
              checkpoint_rate=100,
              batch_size: int = 64,
              exp_decay_func=lambda exp_rate, exp_decay, i: 0.01 +
              (exp_rate - 0.01) * np.exp(-exp_decay * (i + 1)),
              show_progress=False):
        """
        Runs a training session for the agent
        :param episodes: number of episodes to train.
        :param path: a path to a directory where the trained weights will be saved.
        :param batch_size: number of experiences to learn from in each net_update.
        """
        if self.net is None:
            raise NotImplementedError(
                'agent.train was called before a model was set.\n Please set the agent\'s model'
                ' using the set_model method. You can access the state and action shapes using the'
                ' agent\'s \'get_state_shape\' and \'get_action_shape\' methods.'
            )
        # set hyper parameters
        exploration_rate = self.exploration_rate
        total_rewards = []
        # start training
        for episode in tqdm(range(episodes)):
            state = self.env.reset()  # Reset the environment for a new episode
            step, episode_reward = 0, 0
            run = True
            # Run until the episode reaches a terminal state
            while run:

                step += 1
                # choose a current action using epsilon greedy exploration
                action = self.get_action(state, exploration_rate)

                # apply the chosen action to the environment and observe the next_state and reward
                obs = self.env.step(action)
                next_state, reward, is_terminal = obs[:3]
                episode_reward += reward

                # Add experience to memory
                self.exp_rep.add(state, action, reward, next_state,
                                 self.env.get_legal_actions(state),
                                 is_terminal)

                # Optimize the DoubleQ-net
                self.update_net(batch_size)

                if is_terminal:  # The action taken led to a terminal state
                    run = False

                if (step % self.net_updating_rate) == 0 and step > 0:
                    # update target network
                    self.net.align_target_model()
                state = next_state

            # Update total_rewards to keep track of progress
            total_rewards.append(episode_reward)
            # Update target network at the end of the episode
            self.net.align_target_model()
            # Update the exploration rate according to the decay schedule
            exploration_rate = exp_decay_func(exploration_rate,
                                              self.exploration_decay, episode)

            if episode % checkpoint_rate == 0 and self.exp_rep.get_num() > batch_size:
                self.save_weights(
                    os.path.join(path, f'episode_{episode}_weights'))

                if show_progress:  # Plot a 10-episode moving average of the rewards
                    self.plot_progress(total_rewards)

        # update the agent's exploration rate in case more training is needed.
        self.exploration_rate = exploration_rate

        # save total_rewards as a CSV file in the specified directory.
        with open(os.path.join(path, 'rewards.csv'), 'w') as reward_file:
            rewards = pd.DataFrame(total_rewards)
            rewards.to_csv(reward_file)
        self.save_weights(os.path.join(path, 'final_weights'))

    def plot_progress(self, total_rewards):
        w = np.ones(10) / 10
        moving_average = np.convolve(total_rewards, w, mode='valid')
        plt.plot(np.arange(len(moving_average)), moving_average)
        plt.title('Moving average of rewards across episodes')
        plt.xlabel('episodes')
        plt.ylabel('average reward over last 10 episodes')
        plt.show()

    def get_state_shape(self):
        return self.state_shape

    def get_action_shape(self):
        return self.action_shape

    # Handles saving/loading the model as explained here: https://www.tensorflow.org/guide/keras/save_and_serialize
    def load_weights(self, path):
        self.net.load_weights(path)

    def save_weights(self, path):
        self.net.save_weights(path)

    def save_model(self, path):
        if self.net is None:
            raise NotImplementedError(
                'agent.save_model was called before a model was set.\n Please set the '
                'agent\'s model using the set_model method. You can access the state and action '
                'shapes using the agent\'s \'get_state_shape\' and \'get_action_shape\' methods.'
            )
        self.net.save_model(path)

    def load_model(self, path):
        model = load_model(path)
        self.set_model(model)

    def to_json(self, **kwargs):
        if self.net is None:
            raise NotImplementedError(
                'agent.to_json was called before a model was set.\n Please set the '
                'agent\'s model using the set_model method. You can access the state and action '
                'shapes using the agent\'s \'get_state_shape\' and \'get_action_shape\' methods.'
            )
        return self.net.to_json(**kwargs)

    def from_json(self, json_config):
        model = model_from_json(json_config)
        self.set_model(model)
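
A minimal usage sketch for the agent above (not part of the original snippet). It assumes a hypothetical environment class that implements the interface DQNAgent relies on (get_state_shape, get_action_shape, get_legal_actions, reset, step); the network architecture and hyperparameters are illustrative only.

import os
import numpy as np
from tensorflow.keras import layers, models

env = MyGridEnv()  # hypothetical environment implementing the interface used above
agent = DQNAgent(env, net_update_rate=25, exploration_rate=1.0, exploration_decay=0.00005)

# Build a compiled Keras model whose input/output shapes match the environment.
n_actions = int(np.prod(agent.get_action_shape()))
model = models.Sequential([
    layers.Input(shape=agent.get_state_shape()),
    layers.Flatten(),
    layers.Dense(64, activation='relu'),
    layers.Dense(n_actions, activation='linear'),
])
model.compile(optimizer='adam', loss='mse')

agent.set_model(model)  # wrapped in a DoubleDQN internally
os.makedirs('checkpoints', exist_ok=True)
agent.train(episodes=1000, path='checkpoints', batch_size=64, show_progress=True)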
Example 2
# NOTE: the imports below are an assumption about this snippet's context; Game, ExperienceReplay,
# POSSIBLE_ACTIONS and ACTION_TO_INDEX are expected to come from the project's own modules.
import os
import random
import time

import numpy
from keras.models import Sequential
from keras.layers import Dense


class Player:
    """
    This class represents a player: its strategy for learning and for playing the game.
    """
    def __init__(self):
        # gamma is the discount factor of the Q-learning algorithm
        self.gamma = 0.9

        # We use an epsilon-greedy learning strategy
        self.epsilon = 1
        self.epsilon_decay = 0.99
        self.epsilon_min = 0.01
        
        # Number of epochs (fully played games) used to train the agent
        self.epochs = 500

        # Game to play
        self.game = Game()

        # Number of hidden layer nodes
        self.hidden_layer_nodes = 20

        # Create keras model
        # _________________________________________________________________
        # Layer (type)                 Output Shape              Param #   
        # =================================================================
        # dense_1 (Dense)              (None, 20)                120       
        # _________________________________________________________________
        # dense_2 (Dense)              (None, 20)                420       
        # _________________________________________________________________
        # dense_3 (Dense)              (None, 5)                 105       
        # =================================================================
        # Total params: 645
        # Trainable params: 645
        # Non-trainable params: 0
        # _________________________________________________________________
        self.model = Sequential()
        self.model.add(Dense(self.hidden_layer_nodes, input_dim=self.game.state_size, activation='relu'))
        self.model.add(Dense(self.hidden_layer_nodes, activation='relu'))
        self.model.add(Dense(len(POSSIBLE_ACTIONS), activation='linear'))
        self.model.compile('Adam', loss='mse')

        # Initialize experience replay
        self.experience_replay = ExperienceReplay(size=2000)
        self.batch_size = 20
        self.max_turns = 100
    
    def train_model_on_batch(self):
        batch = self.experience_replay.get_batch(self.batch_size)

        # ---------------------------------- #
        # TODO: move this logic to get_batch
        states = []
        actions = []
        rewards = []
        next_states = []
        not_is_overs = []

        for state, action, reward, next_state, is_over in batch:
            states.append(state)
            actions.append(action)
            rewards.append(reward)
            next_states.append(next_state)
            not_is_overs.append(not is_over)

        states = numpy.array(states)
        next_states = numpy.array(next_states)
        not_is_overs = numpy.array(not_is_overs)
        rewards = numpy.array(rewards)
        # ---------------------------------- #

        # Bellman targets: r + gamma * max_a' Q(next_state, a') for non-terminal
        # transitions, and just r when the game is over (not_is_overs == 0)
        targets = rewards + not_is_overs * self.gamma * numpy.amax(self.model.predict(next_states), axis=1)
        # Start from the current predictions so only the taken action's Q-value is updated
        target_fs = self.model.predict(states)

        for i in range(len(batch)):
            target_fs[i, ACTION_TO_INDEX[actions[i]]] = targets[i]
        self.model.fit(states, target_fs, verbose=0)

    def train(self, interactive=False):
        for epoch in range(self.epochs):
            self.game.create_agent()

            turns = 0
            while turns < self.max_turns:
                turns += 1

                if interactive:
                    os.system('clear')
                    self.game.show()
                    time.sleep(0.1)

                state = numpy.array(self.game.encode())
                if random.uniform(0, 1) < self.epsilon:
                    action = random.choice(POSSIBLE_ACTIONS)
                else:
                    index = numpy.argmax(self.model.predict(state[numpy.newaxis])[0])
                    action = POSSIBLE_ACTIONS[index]
                
                reward = self.game.act(action)
                next_state = numpy.array(self.game.encode())

                is_over = self.game.is_over()
                if is_over:
                    reward -= 10
                    self.experience_replay.remember(state, action, reward, next_state, is_over)
                    break

                if turns == self.max_turns:
                    reward += 10

                self.experience_replay.remember(state, action, reward, next_state, is_over)
                self.train_model_on_batch()
            
            # Epsilon decay technique
            if self.epsilon > self.epsilon_min:
                self.epsilon *= self.epsilon_decay
            
            print('Epoch: %i Total turns: %i' % (epoch, turns))

        print("Training finished!\n")
    
    def play(self, interactive=False):
        for _ in range(self.epochs):
            self.game.create_agent()

            while not self.game.is_over():
                if interactive:
                    os.system('clear')
                    self.game.show()
                    time.sleep(0.1)

                state = numpy.array(self.game.encode())[numpy.newaxis]
                index = numpy.argmax(self.model.predict(state)[0])
                action = POSSIBLE_ACTIONS[index]
                self.game.act(action)
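
A minimal usage sketch for the Player class above (not part of the original snippet); it only exercises the public train/play methods and assumes the project's Game, ExperienceReplay, POSSIBLE_ACTIONS and ACTION_TO_INDEX modules are importable.

player = Player()

# Train for player.epochs full games, decaying epsilon after each one.
player.train(interactive=False)

# Replay the learned greedy policy, rendering the board between moves.
player.play(interactive=True)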
Example 3
            # Collect environment data
            s2, r, terminal = env.step(np.argmax(a))

            # Add data to ExperienceReplay memory: store every non-zero-reward transition,
            # but only a small random fraction of zero-reward ones
            if UPDATE_REPLAY:
                if np.abs(r) > 0.0:
                    er.add_experience(s, a, r, terminal, s2)
                else:
                    if np.random.random() < 0.0018:
                        er.add_experience(s, a, r, terminal, s2)

            # Only start learning once the memory holds
            # more than MINIBATCH_SIZE samples
            if er.size() > MINIBATCH_SIZE:

                s_batch, a_batch, r_batch, t_batch, s2_batch = er.get_batch(MINIBATCH_SIZE)

                # Calculate Q targets for s2 based on QValue target model
                s2_batch1 = np.reshape([elt[0].ravel() for elt in s2_batch], [-1, hero_state_dim])
                s2_batch2 = np.reshape([elt[1] for elt in s2_batch],
                                       [-1, balls_state_shape[0], balls_state_shape[1]])
                target_q = qvalue_network.max_qvalues(s2_batch1, s2_batch2)
                
                new_q = np.zeros((MINIBATCH_SIZE, 1))
                for k in range(MINIBATCH_SIZE):
                    if t_batch[k]:
                        new_q[k,0] = r_batch[k]
                    else:
                        new_q[k,0] = r_batch[k] + GAMMA * target_q[k]

                # Update qvalues given the Q targets
                s_batch1 = np.reshape([elt[0].ravel() for elt in s_batch], [-1, hero_state_dim])