def run_episode(for_training):
    """ Runs one episode
    If for training, update Q function
    If for testing, computes and returns cumulative discounted reward

    Args:
        for_training (bool): True if for training

    Returns:
        None if for training, else the cumulative discounted reward
    """
    epsilon = TRAINING_EP if for_training else TESTING_EP

    # initialize for each episode
    epi_reward = 0
    t = 0
    (current_room_desc, current_quest_desc, terminal) = framework.newGame()

    while not terminal:
        # Choose next action and execute
        current_state = current_room_desc + current_quest_desc
        current_state_vector = utils.extract_bow_feature_vector(
            current_state, dictionary)

        action_index, object_index = epsilon_greedy(current_state_vector,
                                                    theta, epsilon)
        next_room_desc, next_quest_desc, reward, terminal = framework.step_game(
            current_room_desc, current_quest_desc, action_index, object_index)

        next_state = next_room_desc + next_quest_desc
        next_state_vector = utils.extract_bow_feature_vector(
            next_state, dictionary)

        if for_training:
            # update Q-function.
            linear_q_learning(theta, current_state_vector, action_index,
                              object_index, reward, next_state_vector,
                              terminal)

        if not for_training:
            # update reward
            epi_reward = epi_reward + reward * (GAMMA**t)
            t = t + 1

        # prepare next step: update current_room_desc and current_quest_desc
        current_room_desc = next_room_desc
        current_quest_desc = next_quest_desc

    if not for_training:
        return epi_reward

def run_episode(for_training):
    """ Runs one episode
    If for training, update Q function
    If for testing, computes and returns cumulative discounted reward

    Args:
        for_training (bool): True if for training

    Returns:
        None if for training, else the cumulative discounted reward
    """
    epsilon = TRAINING_EP if for_training else TESTING_EP

    # initialize for each episode
    (current_room_desc, current_quest_desc, terminal) = framework.newGame()

    # initial values
    count = 0
    epi_reward = 0

    while not terminal:
        # Choose next action and execute
        # look up the state indices by description
        current_state_1 = dict_room_desc[current_room_desc]
        current_state_2 = dict_quest_desc[current_quest_desc]

        (action_index, object_index) = epsilon_greedy(current_state_1,
                                                      current_state_2,
                                                      q_func, epsilon)
        (next_room_desc, next_quest_desc, reward, terminal) \
            = framework.step_game(current_room_desc, current_quest_desc,
                                  action_index, object_index)

        next_state_1 = dict_room_desc[next_room_desc]
        next_state_2 = dict_quest_desc[next_quest_desc]

        if for_training:
            # update Q-function.
            tabular_q_learning(q_func, current_state_1, current_state_2,
                               action_index, object_index, reward,
                               next_state_1, next_state_2, terminal)

        if not for_training:
            # update reward
            epi_reward += np.power(GAMMA, count) * reward

        # prepare next step
        count += 1
        current_room_desc = next_room_desc
        current_quest_desc = next_quest_desc

    if not for_training:
        return epi_reward

def run_episode(for_training):
    """ Runs one episode
    If for training, update Q function
    If for testing, computes and returns cumulative discounted reward

    Args:
        for_training (bool): True if for training

    Returns:
        None if for training, else the cumulative discounted reward
    """
    if for_training:
        epsilon = TRAINING_EP
    else:
        epsilon = TESTING_EP

    epi_reward = 0
    current_room_desc, current_quest_desc, terminal = framework.newGame()
    step = 0

    while not terminal:
        state_1, state_2 = dict_room_desc[current_room_desc], \
            dict_quest_desc[current_quest_desc]
        action_index, object_index = epsilon_greedy(state_1, state_2,
                                                    q_func, epsilon)
        next_room_desc, next_quest_desc, reward, terminal = framework.step_game(
            current_room_desc, current_quest_desc, action_index, object_index)
        next_state_1, next_state_2 = dict_room_desc[next_room_desc], \
            dict_quest_desc[next_quest_desc]

        if for_training:
            tabular_q_learning(q_func, state_1, state_2, action_index,
                               object_index, reward, next_state_1,
                               next_state_2, terminal)

        if not for_training:
            epi_reward += reward * GAMMA**step

        # prepare next step: advance to the next state
        step += 1
        current_room_desc, current_quest_desc = next_room_desc, next_quest_desc

    if not for_training:
        return epi_reward

def run_episode(for_training):
    """ Runs one episode
    If for training, update Q function
    If for testing, computes and return cumulative discounted reward
    """
    epsilon = TRAINING_EP if for_training else TESTING_EP

    # initialize for each episode
    epi_reward = 0
    gamma_t = 1
    (current_room_desc, current_quest_desc, terminal) = framework.newGame()

    while not terminal:
        # Choose next action and execute
        current_state = current_room_desc + current_quest_desc
        current_state_vector = torch.FloatTensor(
            utils.extract_bow_feature_vector(current_state, dictionary))

        # choose the next action based on epsilon greedy policy
        (action_index, object_index) = epsilon_greedy(current_state_vector,
                                                      epsilon)

        # go to the next state based on the action
        (next_room_desc, next_quest_desc, reward,
         terminal) = framework.step_game(current_room_desc,
                                         current_quest_desc, action_index,
                                         object_index)

        # extract the feature vector of the next state
        next_state = next_room_desc + next_quest_desc
        next_state_vector = torch.FloatTensor(
            utils.extract_bow_feature_vector(next_state, dictionary))

        if for_training:
            # update the parameters of deep Q network
            deep_q_learning(current_state_vector, action_index, object_index,
                            reward, next_state_vector, terminal)

        if not for_training:
            # update episodic reward with discount
            epi_reward += gamma_t * reward

        # prepare next step: update current state to the next state
        current_room_desc = next_room_desc
        current_quest_desc = next_quest_desc
        gamma_t = gamma_t * GAMMA

    if not for_training:
        return epi_reward

def run_episode(for_training):
    """ Runs one episode
    If for training, update Q function
    If for testing, computes and returns cumulative discounted reward

    Args:
        for_training (bool): True if for training

    Returns:
        None if for training, else the cumulative discounted reward
    """
    epsilon = TRAINING_EP if for_training else TESTING_EP

    global STEP_COUNT
    STEP_COUNT = 0

    # initialize for each episode
    epi_reward = 0.0
    (current_room_desc, current_quest_desc, terminal) = framework.newGame()

    while not terminal:
        STEP_COUNT += 1

        # Choose next action and execute
        # Get room and quest indices
        current_room_desc_index = dict_room_desc[current_room_desc]
        current_quest_desc_index = dict_quest_desc[current_quest_desc]

        # Get next action/object
        next_action_index, next_object_index = epsilon_greedy(
            current_room_desc_index, current_quest_desc_index, q_func, epsilon)

        # Take a step
        next_room_desc, next_quest_desc, reward, terminal = framework.step_game(
            current_room_desc, current_quest_desc, next_action_index,
            next_object_index)

        # Only need the room index; the quest remains the same during an episode
        next_room_desc_index = dict_room_desc[next_room_desc]

        if for_training:
            # update Q-function.
            tabular_q_learning(q_func, current_room_desc_index,
                               current_quest_desc_index, next_action_index,
                               next_object_index, reward, next_room_desc_index,
                               current_quest_desc_index, terminal)

        if not for_training:
            # update reward
            epi_reward += (GAMMA**(framework.STEP_COUNT - 1)) * reward

        # prepare next step
        current_room_desc, current_quest_desc = next_room_desc, next_quest_desc

    if not for_training:
        return epi_reward

def run_episode(for_training):
    """ Runs one episode
    If for training, update Q function
    If for testing, computes and return cumulative discounted reward
    """
    epsilon = TRAINING_EP if for_training else TESTING_EP

    # initialize for each episode
    (current_room_desc, current_quest_desc, terminal) = framework.newGame()
    count = 0
    epi_reward = 0

    while not terminal:
        # Choose next action and execute
        current_state = current_room_desc + current_quest_desc
        current_state_vector = torch.FloatTensor(
            utils.extract_bow_feature_vector(current_state, dictionary))

        (action_index, object_index) = epsilon_greedy(current_state_vector,
                                                      epsilon)
        (next_room_desc, next_quest_desc, reward, terminal) \
            = framework.step_game(current_room_desc, current_quest_desc,
                                  action_index, object_index)

        next_state = next_room_desc + next_quest_desc
        next_state_vector = torch.FloatTensor(
            utils.extract_bow_feature_vector(next_state, dictionary))

        if for_training:
            # update Q-function.
            deep_q_learning(current_state_vector, action_index, object_index,
                            reward, next_state_vector, terminal)

        if not for_training:
            # update reward
            epi_reward += np.power(GAMMA, count) * reward

        # prepare next step
        count += 1
        current_room_desc = next_room_desc
        current_quest_desc = next_quest_desc

    if not for_training:
        return epi_reward

def run_episode(for_training):
    """ Runs one episode
    If for training, update Q function
    If for testing, computes and returns cumulative discounted reward

    Args:
        for_training (bool): True if for training

    Returns:
        None if for training, else the cumulative discounted reward
    """
    epsilon = TRAINING_EP if for_training else TESTING_EP

    epi_reward = 0
    gamma = 1
    (current_room_desc, current_quest_desc, terminal) = framework.newGame()

    while not terminal:
        current_room_index = dict_room_desc[current_room_desc]
        current_quest_index = dict_quest_desc[current_quest_desc]

        # Choose next action and execute
        action_index, object_index = epsilon_greedy(current_room_index,
                                                    current_quest_index,
                                                    q_func, epsilon)
        next_room_desc, next_quest_desc, reward, terminal = framework.step_game(
            current_room_desc, current_quest_desc, action_index, object_index)

        next_room_index = dict_room_desc[next_room_desc]
        next_quest_index = dict_quest_desc[next_quest_desc]

        if for_training:
            # update Q-function.
            tabular_q_learning(q_func, current_room_index, current_quest_index,
                               action_index, object_index, reward,
                               next_room_index, next_quest_index, terminal)

        if not for_training:
            # update reward
            epi_reward += reward * gamma
            gamma *= GAMMA

        # prepare next step
        current_room_desc = next_room_desc
        current_quest_desc = next_quest_desc

    if not for_training:
        return epi_reward

def run_episode(for_training):
    """ Runs one episode
    If for training, update Q function
    If for testing, computes and returns cumulative discounted reward

    Args:
        for_training (bool): True if for training

    Returns:
        None if for training, else the cumulative discounted reward
    """
    epsilon = TRAINING_EP if for_training else TESTING_EP

    # initialize for each episode
    epi_reward = 0.00
    t = 0
    (current_room_desc, current_quest_desc, terminal) = framework.newGame()

    while not terminal:
        # Choose next action and execute
        state_1 = dict_room_desc[current_room_desc]
        state_2 = dict_quest_desc[current_quest_desc]

        (action_index, object_index) = epsilon_greedy(state_1, state_2,
                                                      q_func, epsilon)
        (next_room_desc, next_quest_desc, reward,
         terminal) = framework.step_game(current_room_desc,
                                         current_quest_desc, action_index,
                                         object_index)
        (next_state_1, next_state_2) = (dict_room_desc[next_room_desc],
                                        dict_quest_desc[next_quest_desc])

        if for_training:
            # update Q-function.
            tabular_q_learning(q_func, state_1, state_2, action_index,
                               object_index, reward, next_state_1,
                               next_state_2, terminal)

        if not for_training:
            # update reward
            epi_reward = epi_reward + (GAMMA**t) * reward

        # prepare next step
        t += 1
        current_room_desc, current_quest_desc = next_room_desc, next_quest_desc

    if not for_training:
        return epi_reward

def run_episode(for_training):
    """ Runs one episode
    If for training, update Q function
    If for testing, computes and returns cumulative discounted reward

    Args:
        for_training (bool): True if for training

    Returns:
        None if for training, else the cumulative discounted reward
    """
    epsilon = TRAINING_EP if for_training else TESTING_EP

    # initialize for each episode
    gamma_step = 1
    epi_reward = 0
    (current_room_desc, current_quest_desc, terminal) = framework.newGame()

    while not terminal:
        # Choose next action and execute
        cur_room_desc_id = dict_room_desc[current_room_desc]
        cur_quest_desc_id = dict_quest_desc[current_quest_desc]

        (action_index, object_index) = epsilon_greedy(cur_room_desc_id,
                                                      cur_quest_desc_id,
                                                      q_func, epsilon)
        (next_room_desc, next_quest_desc, reward,
         terminal) = framework.step_game(current_room_desc,
                                         current_quest_desc, action_index,
                                         object_index)

        if for_training:
            # update Q-function.
            next_room_desc_id = dict_room_desc[next_room_desc]
            next_quest_desc_id = dict_quest_desc[next_quest_desc]
            tabular_q_learning(q_func, cur_room_desc_id, cur_quest_desc_id,
                               action_index, object_index, reward,
                               next_room_desc_id, next_quest_desc_id, terminal)

        if not for_training:
            # update reward
            epi_reward = epi_reward + gamma_step * reward
            gamma_step = gamma_step * GAMMA

        # prepare next step
        current_room_desc = next_room_desc
        current_quest_desc = next_quest_desc

    if not for_training:
        return epi_reward

def run_episode(for_training):
    """ Runs one episode
    If for training, update Q function
    If for testing, computes and returns cumulative discounted reward

    Args:
        for_training (bool): True if for training

    Returns:
        None if for training, else the cumulative discounted reward
    """
    epsilon = TRAINING_EP if for_training else TESTING_EP

    # initialize for each episode
    epi_reward = 0.
    (current_room_desc, current_quest_desc, terminal) = framework.newGame()
    steps = 0

    while not terminal:
        steps += 1

        # look up the indices of the current state, refreshed each step so the
        # action is chosen from the state the agent is actually in
        current_state_1 = dict_room_desc.get(current_room_desc)
        current_state_2 = dict_quest_desc.get(current_quest_desc)

        # Choose next action and execute
        action_index, object_index = epsilon_greedy(current_state_1,
                                                    current_state_2,
                                                    q_func, epsilon)
        next_room_desc, next_quest_desc, reward, terminal = framework.step_game(
            current_room_desc, current_quest_desc, action_index, object_index)

        next_state_1 = dict_room_desc.get(next_room_desc)
        next_state_2 = dict_quest_desc.get(next_quest_desc)

        if for_training:
            # update Q-function.
            tabular_q_learning(q_func, current_state_1, current_state_2,
                               action_index, object_index, reward,
                               next_state_1, next_state_2, terminal)

        if not for_training:
            # update reward
            epi_reward += GAMMA**(steps - 1) * reward

        # prepare next step
        current_room_desc = next_room_desc
        current_quest_desc = next_quest_desc

    if not for_training:
        return epi_reward

def run_episode(for_training):
    """ Runs one episode
    If for training, update Q function
    If for testing, computes and returns cumulative discounted reward

    Args:
        for_training (bool): True if for training

    Returns:
        None if for training, else the cumulative discounted reward
    """
    epsilon = TRAINING_EP if for_training else TESTING_EP
    epi_reward = 0

    # initialize for each episode
    num_steps = 0
    # dict_room_desc, dict_quest_desc = framework.make_all_states_index()
    (current_room_desc, current_quest_desc, terminal) = framework.newGame()

    while not terminal:
        # Choose next action and execute
        state_r, state_q = dict_room_desc[current_room_desc], \
            dict_quest_desc[current_quest_desc]
        next_action, next_object = epsilon_greedy(state_r, state_q, q_func,
                                                  epsilon)
        next_room_desc, next_quest_desc, reward, terminal = (
            framework.step_game(current_room_desc, current_quest_desc,
                                next_action, next_object))

        if for_training:
            # update Q-function.
            next_state_r, next_state_q = dict_room_desc[next_room_desc], \
                dict_quest_desc[next_quest_desc]
            tabular_q_learning(q_func, state_r, state_q, next_action,
                               next_object, reward, next_state_r,
                               next_state_q, terminal)

        if not for_training:
            # update reward
            epi_reward = np.power(GAMMA, num_steps) * reward + epi_reward

        # prepare next step
        current_room_desc, current_quest_desc = next_room_desc, next_quest_desc
        num_steps += 1

    if not for_training:
        return epi_reward

def run_episode(for_training):
    """ Runs one episode
    If for training, update Q function
    If for testing, computes and returns cumulative discounted reward

    Args:
        for_training (bool): True if for training

    Returns:
        None if for training, else the cumulative discounted reward
    """
    epsilon = TRAINING_EP if for_training else TESTING_EP

    # initialize for each episode
    epi_reward = 0
    t = 0  # step counter
    (current_room_desc, current_quest_desc, terminal) = framework.newGame()

    while not terminal:
        # Choose next action and execute
        current_room_desc_index = dict_room_desc[current_room_desc]
        current_quest_desc_index = dict_quest_desc[current_quest_desc]

        next_action_index, next_object_index = epsilon_greedy(
            current_room_desc_index, current_quest_desc_index, q_func, epsilon)
        next_room_desc, next_quest_desc, reward, terminal = framework.step_game(
            current_room_desc, current_quest_desc, next_action_index,
            next_object_index)
        next_room_desc_index = dict_room_desc[next_room_desc]  # quest remains the same

        if for_training:
            # update Q-function.
            tabular_q_learning(q_func, current_room_desc_index,
                               current_quest_desc_index, next_action_index,
                               next_object_index, reward, next_room_desc_index,
                               current_quest_desc_index, terminal)

        if not for_training:
            # update reward
            epi_reward += (GAMMA**t) * reward

        # prepare next step
        current_room_desc, current_quest_desc = next_room_desc, next_quest_desc
        t += 1

    if not for_training:
        return epi_reward

def run_episode(for_training):
    """ Runs one episode
    If for training, update Q function
    If for testing, computes and returns cumulative discounted reward

    Note: the model, the optimiser, and the dictionary are globals.
    """
    epsilon = TRAINING_EP if for_training else TESTING_EP

    epi_reward = 0.
    t = 0

    # initialize for each episode
    (current_room_desc, current_quest_desc, terminal) = framework.newGame()

    while not terminal:
        # Choose next action and execute
        current_state = current_room_desc + current_quest_desc
        current_state_vector = torch.FloatTensor(
            utils.extract_bow_feature_vector(current_state, dictionary))

        action_index, object_index = epsilon_greedy(
            current_state_vector, epsilon
        )
        next_room_desc, next_quest_desc, reward, terminal = framework.step_game(
            current_room_desc, current_quest_desc, action_index, object_index
        )
        next_state_vector = torch.FloatTensor(
            utils.extract_bow_feature_vector(
                next_room_desc + next_quest_desc, dictionary
            )
        )

        if for_training:
            # update Q-function.
            deep_q_learning(
                current_state_vector, action_index, object_index,
                reward, next_state_vector, terminal
            )
        else:
            # update reward
            epi_reward += (GAMMA**t) * reward

        # prepare next step: advance to the next state
        t += 1
        current_room_desc = next_room_desc
        current_quest_desc = next_quest_desc

    if not for_training:
        return epi_reward

def run_episode(for_training):
    """ Runs one episode
    If for training, update Q function
    If for testing, computes and returns cumulative discounted reward

    Args:
        for_training (bool): True if for training

    Returns:
        None if for training, else the cumulative discounted reward
    """
    epsilon = TRAINING_EP if for_training else TESTING_EP
    epi_reward = 0

    # initialize for each episode
    count = 0
    (current_room_desc, current_quest_desc, terminal) = framework.newGame()
    curr_state1 = dict_room_desc[current_room_desc]
    curr_state2 = dict_quest_desc[current_quest_desc]

    while not terminal:
        # Choose next action and execute
        (action_index, object_index) = epsilon_greedy(curr_state1, curr_state2,
                                                      q_func, epsilon)
        (next_room_desc, next_quest_desc, reward, terminal) = \
            framework.step_game(current_room_desc, current_quest_desc,
                                action_index, object_index)
        next_state1 = dict_room_desc[next_room_desc]
        next_state2 = dict_quest_desc[next_quest_desc]

        if for_training:
            # update Q-function.
            tabular_q_learning(q_func, curr_state1, curr_state2, action_index,
                               object_index, reward, next_state1, next_state2,
                               terminal)

        if not for_training:
            # update reward
            epi_reward += (GAMMA**count) * reward

        count += 1

        # prepare next step: carry forward both the indices and the descriptions
        curr_state1, curr_state2 = next_state1, next_state2
        current_room_desc, current_quest_desc = next_room_desc, next_quest_desc

    if not for_training:
        return epi_reward

def run_episode(for_training):
    """ Runs one episode
    If for training, update Q function
    If for testing, computes and returns cumulative discounted reward

    Args:
        for_training (bool): True if for training

    Returns:
        None if for training, else the cumulative discounted reward
    """
    epsilon = TRAINING_EP if for_training else TESTING_EP

    # initialize for each episode
    gamma_step = 1
    epi_reward = 0
    (current_room_desc, current_quest_desc, terminal) = framework.newGame()

    while not terminal:
        # Choose next action and execute
        current_state = current_room_desc + current_quest_desc
        current_state_vector = utils.extract_bow_feature_vector(
            current_state, dictionary)

        (action_index, object_index) = epsilon_greedy(current_state_vector,
                                                      theta, epsilon)
        (next_room_desc, next_quest_desc, reward,
         terminal) = framework.step_game(current_room_desc,
                                         current_quest_desc, action_index,
                                         object_index)

        if for_training:
            # update Q-function.
            next_state = next_room_desc + next_quest_desc
            # bag-of-words vector representation of the next state
            next_state_vector = utils.extract_bow_feature_vector(
                next_state, dictionary)
            linear_q_learning(theta, current_state_vector, action_index,
                              object_index, reward, next_state_vector,
                              terminal)

        if not for_training:
            # update reward
            epi_reward = epi_reward + gamma_step * reward
            gamma_step = gamma_step * GAMMA

        # prepare next step
        current_room_desc = next_room_desc
        current_quest_desc = next_quest_desc

    if not for_training:
        return epi_reward

def run_episode(for_training):
    """ Runs one episode
    If for training, update Q function
    If for testing, computes and returns cumulative discounted reward

    Args:
        for_training (bool): True if for training

    Returns:
        None if for training, else the cumulative discounted reward
    """
    epsilon = TRAINING_EP if for_training else TESTING_EP

    # initialize for each episode
    epi_reward = 0
    t = 0
    (current_room_desc, current_quest_desc, terminal) = framework.newGame()

    while not terminal:
        # Choose next action and execute
        current_state_1, current_state_2 = dict_room_desc[current_room_desc], \
            dict_quest_desc[current_quest_desc]

        # get the next action according to the policy
        action_index, object_index = epsilon_greedy(current_state_1,
                                                    current_state_2,
                                                    q_func, epsilon)

        # take the action and get the next state
        next_room_desc, next_quest_desc, reward, terminal = framework.step_game(
            current_room_desc, current_quest_desc, action_index, object_index)
        next_state_1, next_state_2 = dict_room_desc[next_room_desc], \
            dict_quest_desc[next_quest_desc]

        if for_training:
            # update Q-function.
            tabular_q_learning(q_func, current_state_1, current_state_2,
                               action_index, object_index, reward,
                               next_state_1, next_state_2, terminal)

        if not for_training:
            # update reward
            epi_reward = epi_reward + GAMMA**t * reward
            t = t + 1

        # prepare next step
        current_room_desc, current_quest_desc = next_room_desc, next_quest_desc

    if not for_training:
        return epi_reward

def run_episode(for_training):
    """ Runs one episode
    If for training, update Q function
    If for testing, computes and returns cumulative discounted reward

    Args:
        for_training (bool): True if for training

    Returns:
        None if for training, else the cumulative discounted reward
    """
    epsilon = TRAINING_EP if for_training else TESTING_EP
    epi_reward = 0

    # initialize for each episode
    (current_room_desc, current_quest_desc, terminal) = framework.newGame()

    while not terminal:
        # Choose next action and execute
        (action_index, object_index) = epsilon_greedy(
            dict_room_desc[current_room_desc],
            dict_quest_desc[current_quest_desc], q_func, epsilon)
        (next_room_desc, next_quest_desc, reward,
         terminal) = framework.step_game(current_room_desc,
                                         current_quest_desc, action_index,
                                         object_index)

        if for_training:
            # update Q-function.
            tabular_q_learning(q_func, dict_room_desc[current_room_desc],
                               dict_quest_desc[current_quest_desc],
                               action_index, object_index, reward,
                               dict_room_desc[next_room_desc],
                               dict_quest_desc[next_quest_desc], terminal)

        if not for_training:
            # update reward
            epi_reward += reward * (GAMMA**(framework.STEP_COUNT - 1))

        # prepare next step
        current_room_desc = next_room_desc
        current_quest_desc = next_quest_desc

    if not for_training:
        return epi_reward

def run_episode(for_training):
    """ Runs one episode
    If for training, update Q function
    If for testing, computes and returns cumulative discounted reward

    Args:
        for_training (bool): True if for training

    Returns:
        None if for training, else the cumulative discounted reward
    """
    epsilon = TRAINING_EP if for_training else TESTING_EP
    epi_reward = 0.0

    # initialize for each episode
    (current_room_desc, current_quest_desc, terminal) = framework.newGame()

    while not terminal:
        # Choose next action and execute
        current_state = current_room_desc + current_quest_desc
        current_state_vector = utils.extract_bow_feature_vector(
            current_state, dictionary)

        # Get next action, object
        next_action_index, next_object_index = epsilon_greedy(
            current_state_vector, theta, epsilon)

        # Take a step
        next_room_desc, next_quest_desc, reward, terminal = framework.step_game(
            current_room_desc, current_quest_desc, next_action_index,
            next_object_index)

        # Build next state vector
        next_state = next_room_desc + next_quest_desc
        next_state_vector = utils.extract_bow_feature_vector(next_state,
                                                             dictionary)

        if for_training:
            # update Q-function (update theta).
            linear_q_learning(theta, current_state_vector, next_action_index,
                              next_object_index, reward, next_state_vector,
                              terminal)

        if not for_training:
            # update reward
            epi_reward += (GAMMA**(framework.STEP_COUNT - 1)) * reward

        # prepare next step
        current_room_desc, current_quest_desc = next_room_desc, next_quest_desc

    if not for_training:
        return epi_reward

def run_episode(for_training):
    """ Runs one episode
    If for training, update Q function
    If for testing, computes and return cumulative discounted reward
    """
    # My solution:
    epsilon = TRAINING_EP if for_training else TESTING_EP
    epi_reward = 0

    # initialize for each episode
    current_room_desc, current_quest_desc, terminal = framework.newGame()
    t = 0

    while not terminal:
        # Choose next action and execute
        current_state = current_room_desc + current_quest_desc
        current_state_vector = torch.FloatTensor(
            utils.extract_bow_feature_vector(current_state,
                                             dictionary)).to(device)

        action_index, object_index = epsilon_greedy(current_state_vector,
                                                    epsilon)
        next_room_desc, next_quest_desc, reward, terminal = \
            framework.step_game(current_room_desc, current_quest_desc,
                                action_index, object_index)

        next_state = next_room_desc + next_quest_desc
        next_state_vector = torch.FloatTensor(
            utils.extract_bow_feature_vector(next_state,
                                             dictionary)).to(device)

        if for_training:
            # update Q-function.
            deep_q_learning(current_state_vector, action_index, object_index,
                            reward, next_state_vector, terminal)

        if not for_training:
            # update reward
            epi_reward += GAMMA**t * reward
            t += 1

        # prepare next step
        current_room_desc, current_quest_desc = next_room_desc, next_quest_desc

    if not for_training:
        return epi_reward

def run_episode(for_training):
    """ Runs one episode
    If for training, update Q function
    If for testing, computes and return cumulative discounted reward
    """
    epsilon = TRAINING_EP if for_training else TESTING_EP

    # initialize for each episode
    epi_reward = 0.0
    step = 0
    (current_room_desc, current_quest_desc, terminal) = framework.newGame()

    while not terminal:
        # Choose next action and execute
        current_state = current_room_desc + current_quest_desc
        current_state_vector = torch.FloatTensor(
            utils.extract_bow_feature_vector(current_state, dictionary))

        # Get next action and object
        next_action_i, next_object_i = epsilon_greedy(current_state_vector,
                                                      epsilon)

        # Make a move
        step += 1
        next_room_desc, next_quest_desc, reward, terminal = framework.step_game(
            current_room_desc, current_quest_desc, next_action_i,
            next_object_i)

        # Next state vector
        next_state = next_room_desc + next_quest_desc
        next_state_vector = torch.FloatTensor(
            utils.extract_bow_feature_vector(next_state, dictionary))

        if for_training:
            # update Q-function.
            deep_q_learning(current_state_vector, next_action_i, next_object_i,
                            reward, next_state_vector, terminal)

        if not for_training:
            # update reward
            epi_reward += (GAMMA**(step - 1)) * reward

        # prepare next step
        current_room_desc, current_quest_desc = next_room_desc, next_quest_desc

    if not for_training:
        return epi_reward

def run_episode(for_training):
    """ Runs one episode
    If for training, update Q function
    If for testing, computes and returns cumulative discounted reward

    Args:
        for_training (bool): True if for training

    Returns:
        None if for training, else the cumulative discounted reward
    """
    epsilon = TRAINING_EP if for_training else TESTING_EP

    # initialize for each episode
    epi_reward = 0
    step = 0
    (current_room_desc, current_quest_desc, terminal) = framework.newGame()

    while not terminal:
        # Choose next action and execute
        current_room_desc_i = dict_room_desc[current_room_desc]
        current_quest_desc_i = dict_quest_desc[current_quest_desc]

        # Get next action and object
        next_action_i, next_object_i = epsilon_greedy(current_room_desc_i,
                                                      current_quest_desc_i,
                                                      q_func, epsilon)

        # Make a move
        step += 1
        next_room_desc, next_quest_desc, reward, terminal = framework.step_game(
            current_room_desc, current_quest_desc, next_action_i,
            next_object_i)
        next_room_desc_i = dict_room_desc[next_room_desc]

        if for_training:
            # update Q-function.
            tabular_q_learning(q_func, current_room_desc_i,
                               current_quest_desc_i, next_action_i,
                               next_object_i, reward, next_room_desc_i,
                               current_quest_desc_i, terminal)

        if not for_training:
            # update reward: sum of gamma^t * reward
            epi_reward += (GAMMA**(step - 1)) * reward

        # prepare next step
        current_room_desc, current_quest_desc = next_room_desc, next_quest_desc

    if not for_training:
        return epi_reward

def run_episode(for_training):
    """ Runs one episode
    If for training, update Q function
    If for testing, computes and returns cumulative discounted reward

    Note: theta and the dictionary are globals.

    Args:
        for_training (bool): True if for training

    Returns:
        None if for training, else the cumulative discounted reward
    """
    epsilon = TRAINING_EP if for_training else TESTING_EP

    epi_reward = 0.
    t = 0

    # initialize for each episode
    (current_room_desc, current_quest_desc, terminal) = framework.newGame()

    while not terminal:
        # Choose next action and execute
        current_state = current_room_desc + current_quest_desc
        current_state_vector = utils.extract_bow_feature_vector(
            current_state, dictionary)

        action_index, object_index = epsilon_greedy(current_state_vector,
                                                    theta, epsilon)
        next_room_desc, next_quest_desc, reward, terminal = framework.step_game(
            current_room_desc, current_quest_desc, action_index, object_index)
        next_state_vector = utils.extract_bow_feature_vector(
            next_room_desc + next_quest_desc, dictionary)

        if for_training:
            # update Q-function.
            linear_q_learning(theta, current_state_vector, action_index,
                              object_index, reward, next_state_vector,
                              terminal)

        if not for_training:
            epi_reward += (GAMMA**t) * reward

        # prepare next step: advance to the next state
        t += 1
        current_room_desc = next_room_desc
        current_quest_desc = next_quest_desc

    if not for_training:
        return epi_reward

def run_episode(for_training):
    """ Runs one episode
    If for training, update Q function
    If for testing, computes and return cumulative discounted reward
    """
    epsilon = TRAINING_EP if for_training else TESTING_EP

    gamma_step = 1
    epi_reward = 0
    (current_room_desc, current_quest_desc, terminal) = framework.newGame()

    while not terminal:
        # Choose next action and execute
        current_state = current_room_desc + current_quest_desc
        current_state_vector = torch.FloatTensor(
            utils.extract_bow_feature_vector(current_state, dictionary))

        (action_index, object_index) = epsilon_greedy(current_state_vector,
                                                      epsilon)
        (next_room_desc, next_quest_desc, reward,
         terminal) = framework.step_game(current_room_desc,
                                         current_quest_desc, action_index,
                                         object_index)

        if for_training:
            # update Q-function.
            next_state = next_room_desc + next_quest_desc
            # 32-bit floating point CPU tensor
            next_state_vector = torch.FloatTensor(
                utils.extract_bow_feature_vector(next_state, dictionary))
            deep_q_learning(current_state_vector, action_index, object_index,
                            reward, next_state_vector, terminal)

        if not for_training:
            # update reward
            epi_reward = epi_reward + gamma_step * reward
            gamma_step = gamma_step * GAMMA

        # prepare next step
        current_room_desc = next_room_desc
        current_quest_desc = next_quest_desc

    if not for_training:
        return epi_reward

def run_episode(for_training):
    """ Runs one episode
    If for training, update Q function
    If for testing, computes and return cumulative discounted reward
    """
    epsilon = TRAINING_EP if for_training else TESTING_EP
    epi_reward = 0.0

    (current_room_desc, current_quest_desc, terminal) = framework.newGame()

    while not terminal:
        # Choose next action and execute
        current_state = current_room_desc + current_quest_desc
        current_state_vector = torch.FloatTensor(
            utils.extract_bow_feature_vector(current_state, dictionary))

        next_action_index, next_object_index = epsilon_greedy(
            current_state_vector, epsilon)
        next_room_desc, next_quest_desc, reward, terminal = framework.step_game(
            current_room_desc, current_quest_desc, next_action_index,
            next_object_index)

        next_state = next_room_desc + next_quest_desc
        next_state_vector = torch.FloatTensor(
            utils.extract_bow_feature_vector(next_state, dictionary))

        if for_training:
            deep_q_learning(current_state_vector, next_action_index,
                            next_object_index, reward, next_state_vector,
                            terminal)

        if not for_training:
            epi_reward += (GAMMA**(framework.STEP_COUNT - 1)) * reward

        # prepare next step
        current_room_desc = next_room_desc
        current_quest_desc = next_quest_desc

    if not for_training:
        return epi_reward

def run_episode(for_training):
    """ Runs one episode
    If for training, update Q function
    If for testing, computes and returns cumulative discounted reward

    Args:
        for_training (bool): True if for training

    Returns:
        None if for training, else the cumulative discounted reward
    """
    epsilon = TRAINING_EP if for_training else TESTING_EP
    epi_reward = 0

    # initialize for each episode
    (current_room_desc, current_quest_desc, terminal) = framework.newGame()
    t = 0

    while not terminal:
        # Choose next action and execute
        current_state = current_room_desc + current_quest_desc
        current_state_vector = utils.extract_bow_feature_vector(
            current_state, dictionary)

        # choose the action with epsilon_greedy
        action_index, object_index = epsilon_greedy(current_state_vector,
                                                    theta, epsilon)

        # take one game step and build the next state vector
        next_room_desc, next_quest_desc, reward, terminal = framework.step_game(
            current_room_desc, current_quest_desc, action_index, object_index)
        next_state = next_room_desc + next_quest_desc
        next_state_vector = utils.extract_bow_feature_vector(
            next_state, dictionary)

        if for_training:
            # update Q-function.
            linear_q_learning(theta, current_state_vector, action_index,
                              object_index, reward, next_state_vector,
                              terminal)

        if not for_training:
            # update reward
            epi_reward += (GAMMA**t) * reward

        # prepare next step
        t += 1
        current_room_desc = next_room_desc
        current_quest_desc = next_quest_desc

    if not for_training:
        return epi_reward

def run_episode(for_training):
    """ Runs one episode
    If for training, update Q function
    If for testing, computes and returns cumulative discounted reward

    Args:
        for_training (bool): True if for training

    Returns:
        None if for training, else the cumulative discounted reward
    """
    """ My solution:

    epsilon = TRAINING_EP if for_training else TESTING_EP
    epi_reward = 0

    # initialize for each episode
    current_room_desc, current_quest_desc, terminal = framework.newGame()  # string descriptions
    t = 0

    while not terminal:
        # Choose next action and execute
        room_index = dict_room_desc[current_room_desc]
        quest_index = dict_quest_desc[current_quest_desc]
        action_index, object_index = epsilon_greedy(room_index, quest_index,
                                                    q_func, epsilon)
        next_room_desc, next_quest_desc, reward, terminal = \
            framework.step_game(current_room_desc, current_quest_desc,
                                action_index, object_index)
        next_room_index = dict_room_desc[next_room_desc]
        next_quest_index = dict_quest_desc[next_quest_desc]

        if for_training:
            # update Q-function.
            tabular_q_learning(q_func, room_index, quest_index, action_index,
                               object_index, reward, next_room_index,
                               next_quest_index, terminal)

        if not for_training:
            # update reward
            epi_reward += GAMMA ** t * reward
            t += 1

        # prepare next step
        current_room_desc, current_quest_desc = next_room_desc, next_quest_desc

    if not for_training:
        return epi_reward
    """

    # Instructor's solution:
    # The reward section differs: it uses epi_reward += gamma_step * reward,
    # then gamma_step *= GAMMA.
    epsilon = TRAINING_EP if for_training else TESTING_EP

    gamma_step = 1
    epi_reward = 0
    (current_room_desc, current_quest_desc, terminal) = framework.newGame()

    while not terminal:
        # Choose next action and execute
        cur_room_desc_id = dict_room_desc[current_room_desc]
        cur_quest_desc_id = dict_quest_desc[current_quest_desc]

        (action_index, object_index) = epsilon_greedy(cur_room_desc_id,
                                                      cur_quest_desc_id,
                                                      q_func, epsilon)
        (next_room_desc, next_quest_desc, reward,
         terminal) = framework.step_game(current_room_desc,
                                         current_quest_desc, action_index,
                                         object_index)

        if for_training:
            # update Q-function.
            next_room_desc_id = dict_room_desc[next_room_desc]
            next_quest_desc_id = dict_quest_desc[next_quest_desc]
            tabular_q_learning(q_func, cur_room_desc_id, cur_quest_desc_id,
                               action_index, object_index, reward,
                               next_room_desc_id, next_quest_desc_id, terminal)

        if not for_training:
            # update reward
            epi_reward = epi_reward + gamma_step * reward
            gamma_step = gamma_step * GAMMA

        # prepare next step
        current_room_desc = next_room_desc
        current_quest_desc = next_quest_desc

    if not for_training:
        return epi_reward

def run_episode(for_training, need_history=False):
    """ Runs one episode
    If for training, update Q function
    If for testing, computes and returns cumulative discounted reward

    Args:
        for_training (bool): True if for training

    Returns:
        None if for training, else the cumulative discounted reward
        (and the episode history when need_history is True)
    """
    epsilon = TRAINING_EP if for_training else TESTING_EP
    epi_reward = 0

    # initialize for each episode
    global q_func, dict_room_desc, dict_quest_desc
    (current_room_desc, current_quest_desc, terminal) = framework.newGame()
    history = list()

    while not terminal:
        # Choose next action and execute
        # get the greedy action wrt the current_room_desc and current_quest_desc
        action_index, object_index = epsilon_greedy(
            state_1=dict_room_desc[current_room_desc],
            state_2=dict_quest_desc[current_quest_desc],
            q_func=q_func,
            epsilon=epsilon)

        # take the action in the environment and get the reward
        next_room_desc, next_quest_desc, reward, terminal = framework.step_game(
            current_room_desc=current_room_desc,
            current_quest_desc=current_quest_desc,
            action_index=action_index,
            object_index=object_index)

        history.append([
            current_room_desc, current_quest_desc, action_index, object_index,
            next_room_desc, next_quest_desc, reward, terminal
        ])

        if for_training:
            # update Q-function.
            tabular_q_learning(
                q_func=q_func,
                current_state_1=dict_room_desc[current_room_desc],
                current_state_2=dict_quest_desc[current_quest_desc],
                action_index=action_index,
                object_index=object_index,
                reward=reward,
                next_state_1=dict_room_desc[next_room_desc],
                next_state_2=dict_quest_desc[next_quest_desc],
                terminal=terminal)

        if not for_training:
            # update reward; the first step is discounted by GAMMA**0
            epi_reward += np.power(GAMMA, framework.STEP_COUNT - 1) * reward

        # prepare next step
        current_room_desc = next_room_desc
        current_quest_desc = next_quest_desc

    if not for_training:
        if need_history:
            return epi_reward, history
        else:
            return epi_reward

def run_episode(for_training):
    """ Runs one episode
    If for training, update Q function
    If for testing, computes and returns cumulative discounted reward

    Args:
        for_training (bool): True if for training

    Returns:
        None if for training, else the cumulative discounted reward
    """
    epsilon = TRAINING_EP if for_training else TESTING_EP

    # state index dictionaries (rebuilt each episode; could be hoisted to globals)
    dict_room_desc, dict_quest_desc = framework.make_all_states_index()
    # q_func = np.zeros((NUM_ROOM_DESC, NUM_QUESTS, NUM_ACTIONS, NUM_OBJECTS))
    epi_reward = 0

    # initialize for each episode
    (current_room_desc, current_quest_desc, terminal) = framework.newGame()
    t = 0

    while not terminal:
        # Choose next action and execute
        current_room = dict_room_desc[current_room_desc]      # room index
        current_quest = dict_quest_desc[current_quest_desc]   # quest index

        # choose the action with epsilon greedy
        action_index, object_index = epsilon_greedy(current_room,
                                                    current_quest, q_func,
                                                    epsilon)

        # take one game step and translate the descriptions
        next_room_desc, next_quest_desc, reward, terminal = framework.step_game(
            current_room_desc, current_quest_desc, action_index, object_index)
        next_room = dict_room_desc[next_room_desc]
        next_quest = dict_quest_desc[next_quest_desc]

        if for_training:
            # update Q-function.
            tabular_q_learning(q_func, current_room, current_quest,
                               action_index, object_index, reward, next_room,
                               next_quest, terminal)

        if not for_training:
            # update reward
            epi_reward += (GAMMA**t) * reward

        # prepare next step
        t += 1
        current_room_desc = next_room_desc
        current_quest_desc = next_quest_desc

    if not for_training:
        return epi_reward

def run_episode(for_training):
    """ Runs one episode
    If for training, update Q function
    If for testing, computes and returns cumulative discounted reward

    Args:
        for_training (bool): True if for training

    Returns:
        None if for training, else the cumulative discounted reward
    """
    epsilon = TRAINING_EP if for_training else TESTING_EP

    # initialize for each episode
    # framework.newGame() returns a tuple: a description of the initial room,
    # a description of the quest for this episode, and a Boolean that is False
    # while the game is not over.
    (current_room_desc, current_quest_desc, terminal) = framework.newGame()

    # initial values
    count = 0
    epi_reward = 0

    while not terminal:
        # Choose next action and execute
        current_state = current_room_desc + current_quest_desc
        current_state_vector = utils.extract_bow_feature_vector(current_state,
                                                                dictionary)

        (action_index, object_index) = epsilon_greedy(current_state_vector,
                                                      theta, epsilon)

        # shorter aliases
        a_idx = action_index        # action index
        o_idx = object_index        # object index
        crd = current_room_desc     # current room description
        cqd = current_quest_desc    # current quest description

        # the next state of the game when the selected command is applied
        # at the current state
        (next_room_desc, next_quest_desc, reward,
         terminal) = framework.step_game(crd, cqd, a_idx, o_idx)
        next_state = next_room_desc + next_quest_desc
        # bag-of-words vector representation of the next state (see utils.py)
        next_state_vector = utils.extract_bow_feature_vector(next_state,
                                                             dictionary)

        if for_training:
            # update Q-function.
            linear_q_learning(theta, current_state_vector, a_idx, o_idx,
                              reward, next_state_vector, terminal)

        if not for_training:
            # update reward
            epi_reward += np.power(GAMMA, count) * reward

        # prepare next step
        count += 1
        current_room_desc = next_room_desc
        current_quest_desc = next_quest_desc

    if not for_training:
        return epi_reward

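
# All of the run_episode variants above are meant to be driven by an outer
# loop that alternates a batch of training episodes with a batch of testing
# episodes and averages the discounted returns. The function below is only a
# minimal sketch of such a driver, not taken from any of the solutions above;
# the constants NUM_EPIS_TRAIN and NUM_EPIS_TEST and the use of NumPy for the
# average are assumptions.
import numpy as np

NUM_EPIS_TRAIN = 25   # assumed: training episodes per epoch
NUM_EPIS_TEST = 50    # assumed: testing episodes per epoch


def run_epoch():
    """Run one epoch: train the Q-function, then measure the average discounted reward."""
    # training episodes update the (global) Q-function in place and return None
    for _ in range(NUM_EPIS_TRAIN):
        run_episode(for_training=True)

    # testing episodes each return their cumulative discounted reward
    rewards = [run_episode(for_training=False) for _ in range(NUM_EPIS_TEST)]
    return np.mean(rewards)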