Code Example #1
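The listing below assumes a preprocessed Atari environment and a few helpers (Memory, AnnealingVariable, Stats, huber_loss) that are defined elsewhere. The following is a minimal sketch of that setup, inferred from how the names are used in the listing; the environment id, the linear annealing schedule, and the exact Huber loss formulation are assumptions, and Memory is only described by its interface.

import numpy as np
import gym
import tensorflow as tf
from tensorflow import keras

# assumed environment; in practice it is wrapped to emit stacks of four 84x84 grayscale frames
env = gym.make("PongNoFrameskip-v4")


def huber_loss(y_true, y_pred, delta=1.0):
    # assumed Huber loss: quadratic for small errors, linear beyond delta
    error = y_true - y_pred
    is_small = tf.abs(error) <= delta
    squared = 0.5 * tf.square(error)
    linear = delta * (tf.abs(error) - 0.5 * delta)
    return tf.reduce_mean(tf.where(is_small, squared, linear))


class AnnealingVariable:
    # assumed linear schedule: .value decays from start_value to final_value over n_steps calls to step()
    def __init__(self, start_value, final_value, n_steps):
        self.value = start_value
        self.final_value = final_value
        self.delta = (start_value - final_value) / float(n_steps)

    def step(self):
        self.value = max(self.final_value, self.value - self.delta)

# Memory is a prioritized replay buffer used through add(error, transition),
# sample(batch_size) -> list of (tree_index, transition) pairs, and update(index, error).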
class DQNAgent:
    def __init__(self):
        self.action_space = env.action_space.n  # number of actions
        self.discount_rate = 0.99
        self.stack_frames = 4  # number of frames stacked
        self.lr = 2.5e-5
        self.model = self.build_deep_q_model()  # build the main network
        self.target_model = self.build_deep_q_model()  # build the target network
        mask_shape = (1, env.action_space.n)
        self.batch_size = 32  # number of batches to learn from
        self.priority_replay_size = 1000000  # max replay memory size

        self.priority_replay = Memory(
            self.priority_replay_size)  # create the replay memory
        self.exploration_rate = AnnealingVariable(
            1., .01,
            400000)  # exploration rate (start_value, final_value, n_steps)
        self.mask = np.ones(
            mask_shape, dtype=np.float32
        )  # mask used in training to update only the q values of actions the agent actually performed
        self.one_hot = np.eye(env.action_space.n, dtype=np.float32)

        self.update_target_frequency = 10000  # update the target network every 10000 steps
        self.replay_frequency = 4  # learn from memories frequency

    # creates the network and returns it
    def build_deep_q_model(self, image_size: tuple = (84, 84)) -> keras.Model:
        # weights initializer - he init
        initializer = keras.initializers.VarianceScaling(scale=2.0)
        # input layer - takes (84, 84, 4) stacked-frame images
        cnn_input = keras.layers.Input((*image_size, self.stack_frames),
                                       name="cnninpt")
        # mask input - one-hot when we want only the q_value for a particular action
        mask_input = keras.layers.Input((self.action_space, ), name="mask")

        # first Conv2D layer
        cnn = keras.layers.Conv2D(32,
                                  8,
                                  strides=4,
                                  activation="relu",
                                  padding='valid',
                                  kernel_initializer=initializer)(cnn_input)
        # second Conv2D layer
        cnn = keras.layers.Conv2D(64,
                                  4,
                                  strides=2,
                                  activation="relu",
                                  padding='valid',
                                  kernel_initializer=initializer)(cnn)
        # third Conv2D layer
        cnn = keras.layers.Conv2D(64,
                                  3,
                                  strides=1,
                                  activation="relu",
                                  padding='valid',
                                  kernel_initializer=initializer)(cnn)

        # flatten the kernels from previous layers
        cnn = keras.layers.Flatten()(cnn)

        # fully connected layer
        cnn = keras.layers.Dense(512,
                                 activation="relu",
                                 kernel_initializer=initializer)(cnn)
        # output layer, q_values for every action in the environment
        cnn = keras.layers.Dense(self.action_space, name="output")(cnn)

        # multiply output by mask to give true output
        output = keras.layers.Multiply()([cnn, mask_input])

        # create the model
        model = keras.Model(inputs=[cnn_input, mask_input], outputs=output)

        # add loss function and method of optimization
        model.compile(loss=huber_loss,
                      optimizer=keras.optimizers.Adam(learning_rate=self.lr))
        model.summary()
        return model

    # samples a batch of #batch_size from the priority replay
    def sample(self):
        batch = self.priority_replay.sample(
            self.batch_size)  # sample batch according to priorities
        X_state = [None] * len(batch)  # pre-allocate lists of length batch_size
        X_action = [None] * len(batch)
        X_reward = [None] * len(batch)
        X_done = [None] * len(batch)
        X_next_state = [None] * len(batch)
        # for each sampled entry - retrieve the particular stored transition fields
        for i in range(len(batch)):
            o = batch[i][1]
            X_state[i] = np.array(o[0], copy=False)
            X_action[i] = o[1]
            X_reward[i] = o[2]
            X_done[i] = o[3]
            X_next_state[i] = np.array(o[4], copy=False)

        return np.array(X_state), np.array(X_action), np.array(
            X_reward), np.array(X_done), np.array(X_next_state), batch

    # train on the samples in memory
    def learn_from_memories(self):
        X_state, X_action, X_reward, X_done, X_next_state, batch = self.sample(
        )

        # repeat the base mask (all actions) for all the samples in the batch
        mask = np.repeat(self.mask, len(X_state), axis=0)

        # q for the next_state is 0 if episode has ended or else the max of q values according to the target network
        q_next = np.max(self.target_model.predict_on_batch(
            [X_next_state, mask]),
                        axis=1)
        q_next[X_done] = 0.0

        # the q predictions on batch
        pred = self.model.predict_on_batch([X_state, mask])

        # the q predictions for each action taken only
        pred_action = pred[range(pred.shape[0]), X_action]

        # calculate the target / true values
        target_q = X_reward + self.discount_rate * q_next

        # get the error - priority (absolute TD error)
        error = abs(target_q - pred_action)

        # update errors - priorities
        for i in range(len(batch)):
            idx = batch[i][0]
            self.priority_replay.update(idx, error[i])

        # assign target_q to the appropriate true_q columns for which the agent performed an action
        # all other true q values are 0 and the agent is not trained based on their value
        true_q = np.zeros((len(X_state), env.action_space.n), dtype=np.float32)
        # for every row, for the columns the agent picked that action assign target_q
        true_q[range(true_q.shape[0]), X_action] = target_q
        # train only on actions the agent chose - One hot mask from action batch
        return self.model.fit([X_state, self.one_hot[X_action]],
                              true_q,
                              verbose=0,
                              epochs=1)

    # helper function to initialize the priority replay with init_size samples
    def init_priority_replay(self, init_size=50000):
        while init_size > 0:
            done = False
            state = env.reset()  # reset the env
            while not done:
                # behave randomly
                action = env.action_space.sample()
                next_state, reward, done, _ = env.step(action)
                # save batch of experience to memory
                self.save_to_memory(state, action, reward, done, next_state)
                state = next_state
                init_size -= 1

    # takes the experience and stores it in replay memory
    def save_to_memory(self, state, action, reward, done, next_state):
        error = self.get_error(state, action, reward, done,
                               next_state)  # find error - priority
        self.priority_replay.add(
            error, (state, action, reward, done,
                    next_state))  # save to memory according to priority

    # behaves epsilon greedily
    def epsilon_greedy_policy(self, state, epsilon=0.01):
        if np.random.random() < epsilon:
            return env.action_space.sample()  # pick randomly

        q_values = self.predict_qvalues(state)
        return np.argmax(q_values)  # pick optimal action - max q(s,a)

    # calculate the error of prediction
    def get_error(self, state, action, reward, done, next_state):
        # q next is 0 for terminal states else predict it from target network
        if done:
            q_next = 0.0
        else:
            next_state = np.expand_dims(next_state, axis=0)
            q_next = self.target_model.predict([next_state, self.mask])

        predicted_q = self.predict_qvalues(state)
        # error = true - predicted for action taken
        error = abs(reward + self.discount_rate * np.max(q_next) -
                    predicted_q[0, action])
        return error

    # predicts and returns q values from state
    def predict_qvalues(self, state: np.ndarray):
        # keras needs first dimension to be batch size even if only one sample
        state = np.expand_dims(state, axis=0)
        return self.model.predict([state, self.mask])

    # trains the agent for max_steps
    def train_model(self, max_steps, stats):
        print("Start Training ")
        while max_steps > 0:
            r_per_episode = 0
            error = 0
            done = False

            state = env.reset()

            while not done:  # the episode has not ended
                # find action according to epsilon greedy behaviour with epsilon = exploration rate
                action = self.epsilon_greedy_policy(
                    state, self.exploration_rate.value)
                # decrease the exploration rate
                self.exploration_rate.step()

                max_steps -= 1
                # take the action and observe next observation, reward, done
                next_state, reward, done, _ = env.step(action)

                r_per_episode += reward

                # save this experience to memory
                self.save_to_memory(state, action, reward, done, next_state)

                state = next_state

                # time to train from priority replay
                if max_steps % self.replay_frequency == 0:
                    hist = self.learn_from_memories()
                    error += hist.history['loss'][0]

                # time to update the target network
                if max_steps % self.update_target_frequency == 0:
                    self.target_model.set_weights(self.model.get_weights())
            #  update stats
            stats(self, r_per_episode)
        # save stats at the end of training
        stats.save_stats()

    # test the agent for nepisodes
    def play(self, nepisodes, exploration_rate, stats):
        rewards_arr = np.zeros(nepisodes)
        for episode in range(nepisodes):
            episode_reward = 0
            done = False
            state = env.reset()
            while not done:
                env.render()
                # time.sleep(0.05)
                action = self.epsilon_greedy_policy(state, exploration_rate)
                next_state, reward, done, _ = env.step(action)
                episode_reward += reward

                state = next_state
            rewards_arr[episode] = episode_reward
            stats(self, episode_reward)
            print(episode_reward)
        stats.save_stats()
        return rewards_arr

    def save_weights(self):
        print("PRINT")
        self.model.save_weights("PongDQNWeights.h5")

    def save_model(self):
        self.model.save("DqnPongModel.h5")

    def restore_weights(self):
        print("Restoring model weights Pong")
        self.model.load_weights("PongDQNWeights.h5")
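A short driver showing how the agent above might be trained and then evaluated. Only the DQNAgent methods come from the listing; the Stats object and the step budget are assumptions.

if __name__ == "__main__":
    agent = DQNAgent()
    stats = Stats()  # assumed statistics helper with __call__(agent, reward) and save_stats()

    # fill the prioritized replay with random-policy experience before learning starts
    agent.init_priority_replay(init_size=50000)

    # train for a fixed budget of environment steps, then persist the network
    agent.train_model(max_steps=5000000, stats=stats)
    agent.save_weights()
    agent.save_model()

    # evaluate almost greedily for a few episodes
    agent.play(nepisodes=10, exploration_rate=0.01, stats=stats)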

Code Example #2

def train_thread(agent, max_eps, env, discount_rate, optimizer,
                 statistics: Stats, exploration_rate: AnnealingVariable,
                 number):

    # create local network and init its weights equal to the global
    local_network = Actor_Critic(env.action_space.n)
    # prepare it (must do this when eager execution is enabled)
    local_network(
        tf.convert_to_tensor(np.random.random((1, 84, 84, 4)),
                             dtype=tf.float32))
    local_network.set_weights(agent.global_network.get_weights())
    # lr_decay_anneal = AnnealingVariable(1e-4, 1e-24, 10e6)
    global episodes  # number of total episodes done for all threads

    # local lists for states, rewards and actions
    states, rewards, actions = [], [], []
    while episodes < max_eps:
        r_per_episode = 0.0
        done = False
        step = 0
        state = env.reset()
        # still training
        while not done and episodes < max_eps:
            exploration_rate.step()  # decay the exploration rate

            states.append(
                state)  # add the observation/state into the state list
            # find the action to pick according to the policy network's probabilities and the exploration rate
            action = pick_action(env, local_network, state,
                                 exploration_rate.value)
            # do the action and observe the next state, reward and if the episode is over
            next_state, reward, done, _ = env.step(action)
            # lr_decay_anneal.step()

            # append the reward experienced in the reward list
            rewards.append(reward)
            # append action taken
            actions.append(action)
            r_per_episode += reward

            step += 1

            # if gathered enough experience or the episode is over -> train on experience gathered
            if step % train_frequency == 0 or done:
                # GradientTape records the operations executed while evaluating the loss
                # -> eager execution MUST be enabled for this to work
                with tf.GradientTape() as tape:
                    # compute loss for each batch of experience
                    loss = compute_loss_from_batch(local_network, states,
                                                   rewards, actions, done,
                                                   next_state, discount_rate)
                # rewind the tape and get the gradients of the loss
                # for the weights of the local network (Actor-Critic)
                gradients = tape.gradient(loss,
                                          local_network.trainable_weights)
                # used because multiple threads
                lock.acquire()
                # agent.lr.assign(lr_decay_anneal.value)
                # apply the gradients found from the local network into the global network for the global weights
                optimizer.apply_gradients(
                    zip(gradients, agent.global_network.trainable_weights))
                # update local network with weights of global
                local_network.set_weights(agent.global_network.get_weights())
                lock.release()
                # empty state, reward, action list
                states, rewards, actions = [], [], []

            state = next_state
        with lock:
            # save stats
            if episodes < max_eps:
                episodes += 1
                statistics(agent, r_per_episode)
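The worker above relies on module-level globals (lock, episodes, train_frequency) and helpers (Actor_Critic, pick_action, compute_loss_from_batch) defined elsewhere. A hedged sketch of how such workers might be launched; the number of workers, the optimizer settings, and the per-worker environment creation are assumptions.

import threading

def launch_workers(agent, n_workers=8, max_eps=20000):
    # one shared optimizer applies every worker's gradients to the global network
    optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)
    discount_rate = 0.99
    statistics = Stats()  # assumed logging helper
    workers = []
    for number in range(n_workers):
        worker_env = gym.make("PongNoFrameskip-v4")  # assumed per-worker environment
        exploration_rate = AnnealingVariable(1., .01, 400000)
        t = threading.Thread(target=train_thread,
                             args=(agent, max_eps, worker_env, discount_rate,
                                   optimizer, statistics, exploration_rate,
                                   number))
        t.start()
        workers.append(t)
    for t in workers:
        t.join()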