import numpy as np

def get_x_y(data_list):
    # Build training inputs (states) and targets (updated Q values)
    # from a list of transition dicts
    interpolator = Interpolator()
    interpolator.set_u(ACTIONS)
    x = []
    y = []
    for data_row in data_list:
        # Bellman target: reward, plus discounted future value if not terminal
        new_q = data_row["reward"]
        if not data_row["done"]:
            new_q += DISCOUNT * np.max(data_row["next_qualities"])
        # Write the target into the interpolated Q function at the taken action
        interpolator.set_q(data_row["qualities"])
        interpolator.update_function(data_row["action"], new_q)
        x.append(data_row["state"])
        y.append(interpolator.get_q())
    return x, y
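Both functions here rely on an `Interpolator` class that is not shown. The stub below is a minimal sketch of the interface the code appears to assume, not the real implementation: scalar actions in `ACTIONS`, one Q value per support point, and a nearest-point overwrite in `update_function` are all assumptions; the actual class may interpolate between neighbouring support points instead.

import numpy as np

class Interpolator:
    """Sketch of the assumed interface; the real class is not shown."""

    def set_u(self, u):
        # Support points: the discrete actions the Q values live on
        self.u = np.asarray(u, dtype=float)

    def set_q(self, q):
        # Current Q values, one per support point
        self.q = np.array(q, dtype=float)

    def update_function(self, action, new_q):
        # Overwrite the Q value at the support point nearest to `action`
        # (assumption: the real class may spread the update over neighbours)
        nearest = int(np.argmin(np.abs(self.u - action)))
        self.q[nearest] = new_q

    def get_q(self):
        # Return a copy so later updates don't mutate already-appended targets
        return self.q.copy()

The copy in `get_q()` matters for `get_x_y()`: the returned array is appended on every loop iteration, so if the real class hands back its internal buffer, every appended target would alias the same array.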
def train(self, terminal_state):
    # Start training only if a certain number of samples is already saved
    if len(self.replay_memory) < MIN_REPLAY_MEMORY_SIZE:
        return

    # Calculate Prioritized Experience Replay weights: sampling probability
    # of each transition is its absolute TD error, rescaled to [0, 1]
    # and normalised to sum to 1
    current_states = np.array([transition[0] for transition in self.replay_memory])
    future_states = np.array([transition[3] for transition in self.replay_memory])
    current_qs = self.model.predict(current_states)
    future_qs = self.target_model.predict(future_states)
    p = np.array([abs((reward + DISCOUNT * np.amax(future_qs[index]) if not done else reward)
                      - current_qs[index][ACTIONS.index(action)])
                  for index, (_, action, reward, _, done) in enumerate(self.replay_memory)])
    p = np.interp(p, (p.min(), p.max()), (0, 1))
    p /= np.sum(p)

    # Draw a minibatch weighted by priority (replaces a plain uniform
    # random.sample(self.replay_memory, MINIBATCH_SIZE))
    minibatch = np.array(self.replay_memory)[
        np.random.choice(len(self.replay_memory), size=MINIBATCH_SIZE, replace=False, p=p)]

    # Get current states from minibatch, then query main model for Q values
    current_states = np.array([transition[0] for transition in minibatch])
    current_qs_list = self.model.predict(current_states)

    # Get future states from minibatch and query both networks:
    # the main network selects the greedy action (Double DQN),
    # the target network evaluates it
    new_current_states = np.array([transition[3] for transition in minibatch])
    future_target_qs_list = self.target_model.predict(new_current_states)
    future_model_qs_list = self.model.predict(new_current_states)

    x = []
    y = []
    interpolator = Interpolator()
    interpolator.set_u(ACTIONS)

    # Now we need to enumerate our batch
    for index, (current_state, action, reward, new_current_state, done) in enumerate(minibatch):
        # If not a terminal state, bootstrap the new Q from the future state
        # (Double DQN target); otherwise the target is just the reward
        if not done:
            max_future_q = future_target_qs_list[index][np.argmax(future_model_qs_list[index])]
            new_q = reward + DISCOUNT * max_future_q
        else:
            new_q = reward

        # Update the Q function for the taken action via the interpolator
        current_qs = np.reshape(current_qs_list[index], OUTPUT_2D_SHAPE)
        interpolator.set_q(current_qs)
        interpolator.update_function(action, new_q)
        current_qs = interpolator.get_q()

        # And append to our training data
        x.append(current_state)
        y.append(np.reshape(current_qs, OUTPUT_1D_SHAPE))

    # Fit on all samples as one batch, log only on terminal state
    self.model.fit(np.array(x), np.array(y), batch_size=MINIBATCH_SIZE, verbose=0,
                   shuffle=False, callbacks=[self.tensorboard] if terminal_state else None)

    # Update target network counter every episode
    if terminal_state:
        self.target_update_counter += 1

    # If counter reaches set value, update target network with weights of main network
    if self.target_update_counter > UPDATE_TARGET_EVERY:
        self.target_model.set_weights(self.model.get_weights())
        self.target_update_counter = 0
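Note that the target computed inside the loop is the Double DQN target, not the vanilla DQN one: the online network picks the greedy next action and the target network scores it. With $\theta$ the online weights and $\theta^{-}$ the target weights:

$$y = r + \gamma \, Q_{\theta^{-}}\!\left(s',\ \arg\max_{a'} Q_{\theta}(s', a')\right)$$

This decoupling of action selection from action evaluation is what reduces the overestimation bias of taking a single max over the target network's outputs.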
self.save_replay_memory()
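For reference, the priority computation at the top of train() can be exercised on its own. The sketch below reproduces that arithmetic on made-up TD errors: absolute errors are rescaled to [0, 1] with np.interp and then normalised into a probability distribution. One consequence, inherited from the code above, is that the transition with the smallest error gets probability 0 and can never be sampled, which differs from the proportional-prioritisation scheme in the original PER paper.

import numpy as np

# Toy absolute TD errors for five stored transitions (made-up values)
td_errors = np.array([0.05, 1.2, 0.3, 2.4, 0.9])

# Rescale to [0, 1] exactly as train() does...
p = np.interp(td_errors, (td_errors.min(), td_errors.max()), (0, 1))
# ...then normalise into sampling probabilities
p /= np.sum(p)

print(p)        # -> approx [0.    0.25  0.054 0.511 0.185]
print(p.sum())  # -> 1.0; the smallest-error transition has probability 0

# Sample a "minibatch" of indices without replacement, as in train()
idx = np.random.choice(len(td_errors), size=3, replace=False, p=p)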