import os
import random

import numpy as np
import tensorflow as tf

# RLModel, ExperienceReplay, ACTIONS, AGENT_INPUT_SIZE, ER_SIZE, TRAIN_RATE,
# BATCH_SIZE, SAVE_PATH, SAVE_PATH_A and SAVE_PATH_B are expected to be
# provided elsewhere in this project.


class DeepQlearner:
    def __init__(self,
                 random_action_method,
                 future_discount=0.75,
                 learning_rate=0.001,
                 load_path=None):
        # Rescale the learning rate relative to a reference discount of 0.8,
        # so changing future_discount does not also change the effective step size.
        learning_rate = learning_rate * (1 - future_discount) / (1 - 0.8)

        self.model = RLModel()
        self.model.build((None, AGENT_INPUT_SIZE))

        self.load_path = load_path
        if load_path is not None and os.path.isfile(load_path):
            print("Loading")
            self.model.load_weights(load_path)

        self.exp_rep = ExperienceReplay(ER_SIZE, AGENT_INPUT_SIZE)

        self.random_action_method = random_action_method
        self.learning_rate = learning_rate
        self.future_discount = future_discount

        self.loss_measure = tf.losses.MeanSquaredError()
        self.opt = tf.optimizers.Adam(learning_rate=self.learning_rate)

        self.n_since_last_train = 0
        self.latestLoss = tf.constant(0.0)  # placeholder until the first training step

    def getActions(self, agentInputs):
        rand_action = self.random_action_method.get_random_action()
        if rand_action is not None:
            return [rand_action] * agentInputs.shape[0]
        else:
            pred = self.model(agentInputs)
            return [ACTIONS[x] for x in np.argmax(pred, axis=1)]

    def update(self, oldAgentInputs, actions, newAgentInputs, rewards):
        # Add the transitions to the experience replay buffer.
        actions = np.array([ACTIONS.index(action) for action in actions])
        self.exp_rep.add_experinces(oldAgentInputs, actions, newAgentInputs,
                                    rewards)

        self.n_since_last_train += oldAgentInputs.shape[0]
        if self.n_since_last_train > TRAIN_RATE:
            self.train_on_random_minibatch()
            self.n_since_last_train = 0

    def train_on_random_minibatch(self):
        input, action, new_input, reward = self.exp_rep.get_random_minibatch(
            BATCH_SIZE)
        loss = self.train_on_batch(input, action, new_input, reward)
        return loss.numpy()

    def train_on_batch(self, agent_input_before, action, agent_input_after,
                       reward):
        # Standard Q-learning target: r + gamma * max_a' Q(s', a').
        q_after = self.model(agent_input_after)
        wanted_q = reward + self.future_discount * tf.reduce_max(q_after,
                                                                 axis=1)

        tvars = self.model.trainable_variables
        with tf.GradientTape() as tape:
            pred_q_for_all_actions = self.model(agent_input_before)
            # Index each row with the action that was actually taken.
            action_ind = tf.transpose(
                [tf.range(agent_input_before.shape[0]), action])
            pred_q_for_action = tf.gather_nd(pred_q_for_all_actions,
                                             action_ind)
            loss = self.loss_measure(wanted_q, pred_q_for_action)

        gradients = tape.gradient(loss, tvars)
        self.opt.apply_gradients(zip(gradients, tvars))

        self.latestLoss = loss
        return loss

    def save(self, path=SAVE_PATH):
        self.model.save_weights(path)
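
# --- Usage sketch (an assumption, not part of the original file) ----------
# A minimal smoke test that drives DeepQlearner end to end on random
# transitions. The _NoRandomAction helper is a hypothetical stand-in for the
# project's random_action_method objects, and the loop assumes the replay
# buffer can serve BATCH_SIZE samples once TRAIN_RATE transitions are stored.


class _NoRandomAction:
    """Hypothetical stand-in: never overrides the greedy action."""

    def get_random_action(self):
        return None


def _smoke_test_deep_qlearner(num_steps=100, n_agents=4):
    learner = DeepQlearner(_NoRandomAction())
    inputs = np.random.rand(n_agents, AGENT_INPUT_SIZE).astype("float32")
    for _ in range(num_steps):
        actions = learner.getActions(inputs)
        new_inputs = np.random.rand(n_agents,
                                    AGENT_INPUT_SIZE).astype("float32")
        rewards = np.random.rand(n_agents).astype("float32")
        learner.update(inputs, actions, new_inputs, rewards)
        inputs = new_inputs
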
class DoubleDeepQlearner:
    def __init__(self,
                 random_action_method,
                 future_discount=0.75,
                 learning_rate=0.001,
                 saveAndLoad=True):
        # Rescale the learning rate relative to a reference discount of 0.8.
        learning_rate = learning_rate * (1 - future_discount) / (1 - 0.8)

        self.model_a = RLModel()
        self.model_a.build((None, AGENT_INPUT_SIZE))
        self.model_b = RLModel()
        self.model_b.build((None, AGENT_INPUT_SIZE))

        self.saveAndLoad = saveAndLoad
        if os.path.isfile(SAVE_PATH_A) and os.path.isfile(
                SAVE_PATH_B) and saveAndLoad:
            print("Loading")
            self.model_a.load_weights(SAVE_PATH_A)
            self.model_b.load_weights(SAVE_PATH_B)

        self.exp_rep_a = ExperienceReplay(ER_SIZE, AGENT_INPUT_SIZE)
        self.exp_rep_b = ExperienceReplay(ER_SIZE, AGENT_INPUT_SIZE)

        self.random_action_method = random_action_method
        self.learning_rate = learning_rate
        self.future_discount = future_discount

        self.loss_measure = tf.losses.MeanSquaredError()
        self.opt = tf.optimizers.Adam(learning_rate=self.learning_rate)

        self.n_since_last_train = 0
        self.latestLoss = tf.constant(0.0)  # placeholder until the first training step

    def getAction(self, agentInput):
        rand_action = self.random_action_method.get_random_action()
        if rand_action is not None:
            return rand_action
        else:
            # Either network can act; choosing at random keeps them symmetric.
            model = random.choice([self.model_a, self.model_b])
            pred = model.call_fast(agentInput)
            return ACTIONS[np.argmax(pred)]

    def update(self, oldAgentInput, action, newAgentInput, reward):
        # Each transition goes to exactly one of the two replay buffers.
        if random.random() < 0.5:
            self.exp_rep_a.add_experince(oldAgentInput, ACTIONS.index(action),
                                         newAgentInput, reward)
        else:
            self.exp_rep_b.add_experince(oldAgentInput, ACTIONS.index(action),
                                         newAgentInput, reward)

        self.n_since_last_train += 1
        if self.n_since_last_train > TRAIN_RATE:
            self.train_on_random_minibatch()
            if self.saveAndLoad:
                self.model_a.save_weights(SAVE_PATH_A)
                self.model_b.save_weights(SAVE_PATH_B)
            self.n_since_last_train = 0

    def train_on_random_minibatch(self):
        train_a = random.random() < 0.5
        if train_a:
            input, action, new_input, reward = self.exp_rep_a.get_random_minibatch(
                BATCH_SIZE)
        else:
            input, action, new_input, reward = self.exp_rep_b.get_random_minibatch(
                BATCH_SIZE)
        loss = self.train_on_batch(input, action, new_input, reward, train_a)
        return loss.numpy()

    def train_on_batch(self, agent_input_before, action, agent_input_after,
                       reward, train_a):
        # model_t is trained this step; model_p provides the target Q-values.
        if train_a:
            model_t = self.model_a
            model_p = self.model_b
        else:
            model_t = self.model_b
            model_p = self.model_a

        # Double Q-learning: the trained network picks the best next action,
        # the partner network evaluates it.
        t_best_action = tf.math.argmax(model_t(agent_input_after), axis=1)
        tba_ind = tf.transpose([
            tf.range(agent_input_before.shape[0]),
            tf.cast(t_best_action, "int32")
        ])
        q_after = model_p(agent_input_after)
        q_after_max = tf.gather_nd(q_after, tba_ind)
        wanted_q = reward + self.future_discount * q_after_max

        tvars = model_t.trainable_variables
        with tf.GradientTape() as tape:
            pred_q_for_all_actions = model_t(agent_input_before)
            # Index each row with the action that was actually taken.
            action_ind = tf.transpose(
                [tf.range(agent_input_before.shape[0]), action])
            pred_q_for_action = tf.gather_nd(pred_q_for_all_actions,
                                             action_ind)
            loss = self.loss_measure(wanted_q, pred_q_for_action)

        gradients = tape.gradient(loss, tvars)
        self.opt.apply_gradients(zip(gradients, tvars))

        self.latestLoss = loss
        return loss
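
# --- Note on the double Q-learning target (added commentary) ---------------
# train_on_batch above implements the double Q-learning target
#
#     y = r + gamma * Q_p(s', argmax_a Q_t(s', a))
#
# where Q_t is the network being trained this step and Q_p is its partner:
# the trained network selects the best next action, the partner evaluates it.
# Decoupling selection from evaluation reduces the overestimation bias of the
# single-network max used in DeepQlearner.train_on_batch.
#
# A minimal usage sketch (hypothetical; env and EpsilonGreedy stand in for
# objects the original project provides elsewhere). Unlike DeepQlearner,
# this class handles one transition at a time:
#
#     learner = DoubleDeepQlearner(EpsilonGreedy(0.1), saveAndLoad=False)
#     agent_input = env.reset()
#     for _ in range(NUM_STEPS):
#         action = learner.getAction(agent_input)
#         new_input, reward = env.step(action)
#         learner.update(agent_input, action, new_input, reward)
#         agent_input = new_input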