Code example #1
def get_x_y(data_list):
    # Build (state, target-Q) training pairs from a list of recorded transitions.
    interpolator = Interpolator()
    interpolator.set_u(ACTIONS)
    x = []
    y = []
    for data_row in data_list:
        # Bellman target: reward plus the discounted best next-step Q for non-terminal steps.
        new_q = data_row["reward"]
        if not data_row["done"]:
            new_q += DISCOUNT * np.max(data_row["next_qualities"])
        # Apply the new target at the taken action via the interpolator, keep the rest.
        interpolator.set_q(data_row["qualities"])
        interpolator.update_function(data_row["action"], new_q)
        x.append(data_row["state"])
        y.append(interpolator.get_q())
    return x, y
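
This function, and the train method in the second example below, rely on an Interpolator class whose implementation is not shown here. Judging only from the calls that appear in the code (set_u, set_q, update_function, get_q), a minimal stand-in could look like the sketch below; the nearest-support-point update is an illustrative assumption, not the actual interpolation scheme.

    import numpy as np

    class Interpolator:
        # Minimal stand-in matching the interface used above (assumed, not the original class).

        def set_u(self, u):
            # Support points of the Q-function, e.g. the ACTIONS list.
            self.u = np.asarray(u, dtype=float)

        def set_q(self, q):
            # Q-values defined on those support points.
            self.q = np.array(q, dtype=float)

        def update_function(self, action, new_q):
            # Assumption: overwrite the Q-value at the support point closest to `action`.
            diffs = self.u - np.asarray(action, dtype=float)
            dists = np.abs(diffs) if diffs.ndim == 1 else np.linalg.norm(diffs, axis=1)
            self.q[int(np.argmin(dists))] = new_q

        def get_q(self):
            return self.q

Under that stand-in, get_x_y returns one Q-vector per transition in which only the entry belonging to the taken action has been moved toward the Bellman target reward + DISCOUNT * np.max(next_qualities).
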
Code example #2
    def train(self, terminal_state):
        # Start training only once a certain number of samples has been saved
        if len(self.replay_memory) < MIN_REPLAY_MEMORY_SIZE:
            return

        # Calculate Prioritized Experience Replay weights
        current_states = np.array([transition[0] for transition in self.replay_memory])
        future_states = np.array([transition[3] for transition in self.replay_memory])
        current_qs = self.model.predict(current_states)
        future_qs = self.target_model.predict(future_states)
        p = np.array([abs((reward + DISCOUNT * np.amax(future_qs[index]) if not done else reward)
                          - current_qs[index][ACTIONS.index(action)])
                      for index, (_, action, reward, _, done) in enumerate(self.replay_memory)])
        p = np.interp(p, (p.min(), p.max()), (0, +1))
        p /= np.sum(p)

        # Get a minibatch of random samples from the replay memory, weighted by priority
        minibatch = np.array(self.replay_memory)[np.random.choice(len(self.replay_memory),
                                                                  size=MINIBATCH_SIZE,
                                                                  replace=False,
                                                                  p=p)]  # random.sample(self.replay_memory, MINIBATCH_SIZE)

        # Get current states from minibatch, then query NN model for Q values
        current_states = np.array([transition[0] for transition in minibatch])  # / 255
        current_qs_list = self.model.predict(current_states)

        # Get future states from minibatch, then query NN model for Q values
        # When using a target network, query it; otherwise query the main network
        new_current_states = np.array([transition[3] for transition in minibatch])  # / 255
        future_target_qs_list = self.target_model.predict(new_current_states)
        future_model_qs_list = self.model.predict(new_current_states)

        x = []
        y = []
        interpolator = Interpolator()

        # Now we need to enumerate our batches
        for index, (current_state, action, reward, new_current_state, done) in enumerate(minibatch):

            # If not a terminal state, bootstrap the new Q from the future states
            # (action chosen by the main network, value taken from the target network);
            # otherwise use the raw reward
            future_model_qs_at_index = future_model_qs_list[index]
            future_target_qs_at_index = future_target_qs_list[index]
            # future_qs = np.reshape(future_model_qs_at_index, OUTPUT_2D_SHAPE)
            if not done:
                max_future_q = future_target_qs_at_index[np.argmax(future_model_qs_at_index)]
                new_q = reward + DISCOUNT * max_future_q
            else:
                new_q = reward

            # Update Q value for given state
            current_qs_list_at_index = current_qs_list[index]
            current_qs = np.reshape(current_qs_list_at_index, OUTPUT_2D_SHAPE)
            current_actions = ACTIONS
            current_qualities = current_qs

            interpolator.set_u(current_actions)
            interpolator.set_q(current_qualities)
            interpolator.update_function(action, new_q)
            # current_qs = np.zeros(OUTPUT_2D_SHAPE)
            # current_qs[:, :2] = interpolator.get_u()
            current_qs = interpolator.get_q()  # [current_actions.index(action)] = [new_q]  #

            # current_qs[action] = new_q

            # And append to our training data
            x.append(current_state)
            reshaped_current_qs = np.reshape(current_qs, OUTPUT_1D_SHAPE)
            y.append(reshaped_current_qs)

        # Fit on all samples as one batch, log only on terminal state
        self.model.fit(np.array(x), np.array(y), batch_size=MINIBATCH_SIZE, verbose=0, shuffle=False,
                       callbacks=[self.tensorboard] if terminal_state else None)
        # Update target network counter every episode
        if terminal_state:
            self.target_update_counter += 1

        # If counter reaches set value, update target network with weights of main network
        if self.target_update_counter > UPDATE_TARGET_EVERY:
            self.target_model.set_weights(self.model.get_weights())
            self.target_update_counter = 0
            self.save_replay_memory()
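
For reference, the prioritized sampling at the top of train() weights every stored transition by its absolute TD error, rescales those errors into [0, 1] with np.interp, and normalizes them into probabilities for np.random.choice. The self-contained toy snippet below reproduces only that weighting step on made-up numbers:

    import numpy as np

    # Made-up absolute TD errors for five stored transitions.
    td_errors = np.array([0.05, 1.20, 0.40, 0.90, 0.10])

    # Rescale to [0, 1] and normalize into a probability distribution, as in train().
    p = np.interp(td_errors, (td_errors.min(), td_errors.max()), (0, 1))
    p /= np.sum(p)

    # Transitions with larger TD error are sampled more often (here 3 of 5, without replacement).
    indices = np.random.choice(len(td_errors), size=3, replace=False, p=p)
    print(p, indices)

Note that this rescaling maps the smallest TD error to exactly 0, so the lowest-priority transition can never be drawn.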