def get_action(self, task, state, non_normalized_state, epsilon):
    """ method returns action to take """
    if not epsilon:
        # evaluation mode: act greedily on the online network
        q_value = self.model_net.predict(np.array([state]))
    else:
        if np.random.rand() <= self.current_epsilon:
            # explore: sample a random action
            if task.name == "2048-v0":
                # in 2048 only sample moves that actually change the board
                possible_actions = possible_moves(non_normalized_state)
                while True:
                    rand_action = np.random.randint(0, self.action_size, size=1)[0]
                    if possible_actions[rand_action] == 1:
                        return rand_action
            else:
                return np.random.randint(0, self.action_size, size=1)[0]
        else:
            # exploit: act greedily on the online network
            q_value = self.model_net.predict(np.array([state]))

    if task.name == "2048-v0":
        # mask out impossible moves by pushing their Q-values below any valid one
        possible_actions = possible_moves(non_normalized_state)
        while True:
            chosen_action = np.argmax(q_value)
            if possible_actions[chosen_action] == 1:
                return chosen_action
            q_value[0][chosen_action] = -100
    return np.argmax(q_value)
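# The possible_moves helper used above is not shown in this section. Below is a
# minimal sketch of it, under the assumptions that the non-normalized state can
# be reshaped into a 4x4 board of tile values (0 = empty) and that the action
# order is up, right, down, left; the real 2048-v0 action ordering may differ.
import numpy as np


def possible_moves(board):
    """ returns a 0/1 mask over the four directions, 1 = move changes the board """
    board = np.asarray(board).reshape(4, 4)

    def can_shift_left(line):
        # a line can shift left if some non-empty tile has an empty cell or an
        # equal tile directly to its left
        for i in range(1, 4):
            if line[i] != 0 and (line[i - 1] == 0 or line[i - 1] == line[i]):
                return True
        return False

    left = any(can_shift_left(row) for row in board)
    right = any(can_shift_left(row[::-1]) for row in board)
    up = any(can_shift_left(col) for col in board.T)
    down = any(can_shift_left(col[::-1]) for col in board.T)
    return np.array([up, right, down, left], dtype=int)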
def train_ddqn(self):
    """ method trains agent using DDQN """
    # sample a minibatch once enough transitions are stored
    if self.memory_type == "basic":
        if len(self.memory) >= self.minibatch_size:
            state, action, reward, next_state, done = self.get_minibatch()
        else:
            return
    else:
        if self.memory.length >= self.minibatch_size:
            state, action, reward, next_state, done = self.get_minibatch()
        else:
            return

    errors = np.zeros(self.minibatch_size)

    """
    if self.model_type == "experimental":
        state = split_2048(state)
        next_state = split_2048(next_state)
        q_value = self.model_net.predict(state)
        ns_model_pred = self.model_net.predict(next_state)
        ns_target_pred = self.target_net.predict(next_state)
    else:
    """

    # for 2048, remember which moves are legal and normalize the boards
    possible_actions_curr = []
    if self.args.environment == "2048-v0":
        for i, item in enumerate(state):
            possible_actions_curr.append(possible_moves(item))
        state = state / 16384.0 - 0.5
        next_state = next_state / 16384.0 - 0.5

    q_value = self.model_net.predict(state)
    ns_model_pred = self.model_net.predict(next_state)
    ns_target_pred = self.target_net.predict(next_state)

    for i in range(self.minibatch_size):
        errors[i] = q_value[i][action[i]]
        if done[i] == 1:
            q_value[i][action[i]] = reward[i]
        else:
            # double DQN target: online net selects the action, target net evaluates it
            q_value[i][action[i]] = reward[i] + self.gamma * ns_target_pred[i][
                np.argmax(ns_model_pred[i])]
        # TD error used to update priorities in prioritized replay
        errors[i] = abs(errors[i] - q_value[i][action[i]])

    # push the targets of impossible 2048 moves towards a negative value
    for i, item in enumerate(possible_actions_curr):
        for e, elem in enumerate(item):
            if elem == 0:
                q_value[i][e] = -1

    self.model_net.fit(state, q_value, epochs=1, verbose=0)

    if self.memory_type == "dueling":
        # NOTE: `minibatch` is assumed to be the batch handle exposed by the
        # prioritized-memory sampling step; as written it is undefined here
        self.memory.update_minibatch(minibatch, errors)
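# A minimal sketch of how get_action and train_ddqn are typically driven in a
# DDQN loop, assuming a gym-style env with reset()/step(), a deque-like
# self.memory for the "basic" case, and assumed attributes step_count,
# epsilon_min and epsilon_decay; the sync interval and decay schedule are
# illustrative, not this repository's actual hyperparameters.
def run_episode_sketch(self, task, env, epsilon=True, sync_every=1000):
    raw_state = env.reset()
    done = False
    while not done:
        # normalize the observation the same way train_ddqn does for 2048
        if task.name == "2048-v0":
            state = raw_state / 16384.0 - 0.5
        else:
            state = raw_state
        action = self.get_action(task, state, raw_state, epsilon)
        next_raw_state, reward, done, _ = env.step(action)
        # store the raw (non-normalized) transition; train_ddqn normalizes it
        self.memory.append((raw_state, action, reward, next_raw_state, done))
        self.train_ddqn()
        raw_state = next_raw_state
        self.step_count += 1
        if self.step_count % sync_every == 0:
            # standard DDQN sync: copy online weights into the target network
            self.target_net.set_weights(self.model_net.get_weights())
    # decay exploration after each episode
    if self.current_epsilon > self.epsilon_min:
        self.current_epsilon *= self.epsilon_decay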