# NOTE(review): this physical line is a whitespace-mangled paste and is not valid
# Python as written — multiple statements are fused onto one line, and the inline
# '#' comment mid-line would comment out everything after it. It appears to contain
# (1) the TAIL of a run()-style function whose 'def' header is missing from this
# view: it builds the global Q-table q_func of shape
# (NUM_ROOM_DESC, NUM_QUESTS, NUM_ACTIONS, NUM_OBJECTS), runs NUM_EPOCHS epochs
# behind a tqdm progress bar showing mean and EWMA test reward, and returns the
# per-epoch reward list; and (2) the __main__ script block: it builds the
# state-description index dictionaries, loads the game data, performs NUM_RUNS
# independent runs, stacks the rewards into a (NUM_RUNS, NUM_EPOCHS) array, and
# sets up a matplotlib figure. Left byte-identical because the enclosing function
# definition starts outside this view — restore the original line breaks and the
# missing 'def' header from the upstream file (MIT 6.86x tabular Q-learning
# skeleton, presumably) to repair it. TODO: confirm against the full file.
q_func = np.zeros((NUM_ROOM_DESC, NUM_QUESTS, NUM_ACTIONS, NUM_OBJECTS)) single_run_epoch_rewards_test = [] pbar = tqdm(range(NUM_EPOCHS), ncols=80) for _ in pbar: single_run_epoch_rewards_test.append(run_epoch()) pbar.set_description( "Avg reward: {:0.6f} | Ewma reward: {:0.6f}".format( np.mean(single_run_epoch_rewards_test), utils.ewma(single_run_epoch_rewards_test))) return single_run_epoch_rewards_test if __name__ == '__main__': # Data loading and build the dictionaries that use unique index for each state (dict_room_desc, dict_quest_desc) = framework.make_all_states_index() NUM_ROOM_DESC = len(dict_room_desc) NUM_QUESTS = len(dict_quest_desc) # set up the game framework.load_game_data() epoch_rewards_test = [] # shape NUM_RUNS * NUM_EPOCHS for _ in range(NUM_RUNS): epoch_rewards_test.append(run()) epoch_rewards_test = np.array(epoch_rewards_test) x = np.arange(NUM_EPOCHS) fig, axis = plt.subplots()
def run_episode(for_training):
    """Run one episode of the text-based game.

    If training, update the global Q-function ``q_func`` after every step
    via tabular Q-learning. If testing, accumulate the discounted reward
    over the episode instead.

    Args:
        for_training (bool): True to train (update ``q_func``), False to
            evaluate.

    Returns:
        float: cumulative discounted reward of the episode when
        ``for_training`` is False; ``None`` when training.
    """
    epsilon = TRAINING_EP if for_training else TESTING_EP

    # Map state descriptions to integer table indices.
    # NOTE(review): rebuilding this index on every episode is wasteful —
    # presumably the module-level dictionaries built in __main__ could be
    # reused; confirm before changing, since this keeps the function
    # self-contained.
    dict_room_desc, dict_quest_desc = framework.make_all_states_index()

    epi_reward = 0  # cumulative discounted reward (used only when testing)
    (current_room_desc, current_quest_desc, terminal) = framework.newGame()
    t = 0  # step counter; exponent of the discount factor

    while not terminal:
        # Translate the textual state into Q-table indices.
        current_room = dict_room_desc[current_room_desc]   # room index
        current_quest = dict_quest_desc[current_quest_desc]  # quest index

        # Epsilon-greedy choice over (action, object) pairs.
        action_index, object_index = epsilon_greedy(current_room,
                                                    current_quest,
                                                    q_func, epsilon)

        # Advance the game one step and index the successor state.
        next_room_desc, next_quest_desc, reward, terminal = framework.step_game(
            current_room_desc, current_quest_desc, action_index, object_index)
        next_room = dict_room_desc[next_room_desc]
        next_quest = dict_quest_desc[next_quest_desc]

        if for_training:
            # One-step Q-learning update toward the bootstrapped target.
            tabular_q_learning(q_func, current_room, current_quest,
                               action_index, object_index, reward,
                               next_room, next_quest, terminal)
        else:
            # Accumulate the reward discounted by GAMMA**t.
            epi_reward += (GAMMA ** t) * reward

        # Prepare the next step.
        t += 1
        current_room_desc = next_room_desc
        current_quest_desc = next_quest_desc

    if not for_training:
        return epi_reward