def run_episode(for_training):
    """ Runs one episode.
        If for training, update Q function.
        If for testing, compute and return the cumulative discounted reward.
    """
    epsilon = TRAINING_EP if for_training else TESTING_EP

    # initialize for each episode
    epi_reward = 0
    gamma_t = 1

    (current_room_desc, current_quest_desc, terminal) = framework.newGame()

    while not terminal:
        # Choose next action and execute
        current_state = current_room_desc + current_quest_desc
        current_state_vector = torch.FloatTensor(
            utils.extract_bow_feature_vector(current_state, dictionary))

        # choose the next action based on the epsilon-greedy policy
        (action_index, object_index) = epsilon_greedy(current_state_vector,
                                                      epsilon)

        # go to the next state based on the action
        (next_room_desc, next_quest_desc, reward, terminal) = framework.step_game(
            current_room_desc, current_quest_desc, action_index, object_index)

        # extract the feature vector of the next state
        next_state = next_room_desc + next_quest_desc
        next_state_vector = torch.FloatTensor(
            utils.extract_bow_feature_vector(next_state, dictionary))

        if for_training:
            # update Q-function: update the parameters of the deep Q-network
            deep_q_learning(current_state_vector, action_index, object_index,
                            reward, next_state_vector, terminal)

        if not for_training:
            # update the episodic reward with the discount
            epi_reward += gamma_t * reward

        # prepare next step: advance to the next state
        current_room_desc = next_room_desc
        current_quest_desc = next_quest_desc
        gamma_t = gamma_t * GAMMA

    if not for_training:
        return epi_reward

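# Aside on the discount bookkeeping used above and in the variants below: a
# running factor gamma_t that starts at 1 and is multiplied by GAMMA after each
# step equals GAMMA**t at step t, so either style yields the same cumulative
# discounted reward. A minimal self-contained sketch; GAMMA and the reward
# sequence here are made-up values for illustration only, not assignment data.
GAMMA = 0.5                      # illustrative discount factor only
rewards = [0.0, -0.1, 1.0, 4.0]  # made-up per-step rewards

# Style 1: explicit power of GAMMA at step t
ret_power = sum((GAMMA ** t) * r for t, r in enumerate(rewards))

# Style 2: running discount factor updated once per step
ret_running, gamma_t = 0.0, 1.0
for r in rewards:
    ret_running += gamma_t * r
    gamma_t *= GAMMA

assert abs(ret_power - ret_running) < 1e-12
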
def run_episode(for_training):
    """ Runs one episode.
        If for training, update Q function.
        If for testing, compute and return the cumulative discounted reward.

        Args:
            for_training (bool): True if for training

        Returns:
            None if for training; the cumulative discounted reward if for testing
    """
    epsilon = TRAINING_EP if for_training else TESTING_EP

    # initialize for each episode
    (current_room_desc, current_quest_desc, terminal) = framework.newGame()
    count = 0
    epi_reward = 0

    while not terminal:
        # Choose next action and execute
        current_state = current_room_desc + current_quest_desc
        current_state_vector = utils.extract_bow_feature_vector(
            current_state, dictionary)

        (action_index, object_index) = epsilon_greedy(current_state_vector,
                                                      theta, epsilon)
        (next_room_desc, next_quest_desc, reward, terminal) \
            = framework.step_game(current_room_desc, current_quest_desc,
                                  action_index, object_index)

        next_state = next_room_desc + next_quest_desc
        next_state_vector = utils.extract_bow_feature_vector(
            next_state, dictionary)

        if for_training:
            # update Q-function
            linear_q_learning(theta, current_state_vector, action_index,
                              object_index, reward, next_state_vector, terminal)

        if not for_training:
            # update reward
            epi_reward += np.power(GAMMA, count) * reward

        # prepare next step
        count += 1
        current_room_desc = next_room_desc
        current_quest_desc = next_quest_desc

    if not for_training:
        return epi_reward

def run_episode(for_training):
    """ Runs one episode.
        If for training, update Q function.
        If for testing, compute and return the cumulative discounted reward.
    """
    epsilon = TRAINING_EP if for_training else TESTING_EP

    # initialize for each episode
    epi_reward = 0
    t = 0

    (current_room_desc, current_quest_desc, terminal) = framework.newGame()

    while not terminal:
        # Choose next action and execute
        current_state = current_room_desc + current_quest_desc
        current_state_vector = torch.FloatTensor(
            utils.extract_bow_feature_vector(current_state, dictionary))

        action_index, object_index = epsilon_greedy(
            current_state_vector, epsilon)
        next_room_desc, next_quest_desc, reward, terminal = framework.step_game(
            current_room_desc, current_quest_desc, action_index, object_index)

        if for_training:
            # update Q-function
            next_state = next_room_desc + next_quest_desc
            next_state_vector = torch.FloatTensor(
                utils.extract_bow_feature_vector(next_state, dictionary))
            deep_q_learning(current_state_vector, action_index, object_index,
                            reward, next_state_vector, terminal)

        if not for_training:
            # update reward
            epi_reward = epi_reward + reward * (GAMMA**t)
            t = t + 1

        # prepare next step: update current_room_desc and current_quest_desc
        current_room_desc = next_room_desc
        current_quest_desc = next_quest_desc

    if not for_training:
        return epi_reward

def run_episode(for_training):
    """ Runs one episode.
        Note: the model, the optimiser, and the dictionary are globals.
        If for training, update Q function.
        If for testing, compute and return the cumulative discounted reward.
    """
    epsilon = TRAINING_EP if for_training else TESTING_EP
    epi_reward = 0.
    t = 0

    # initialize for each episode
    (current_room_desc, current_quest_desc, terminal) = framework.newGame()

    while not terminal:
        # Choose next action and execute
        current_state = current_room_desc + current_quest_desc
        current_state_vector = torch.FloatTensor(
            utils.extract_bow_feature_vector(current_state, dictionary))

        action_index, object_index = epsilon_greedy(
            current_state_vector, epsilon
        )
        next_room_desc, next_quest_desc, reward, terminal = framework.step_game(
            current_room_desc, current_quest_desc, action_index, object_index
        )
        next_state_vector = torch.FloatTensor(
            utils.extract_bow_feature_vector(
                next_room_desc + next_quest_desc, dictionary
            )
        )

        if for_training:
            # update Q-function.
            deep_q_learning(
                current_state_vector, action_index, object_index,
                reward, next_state_vector, terminal
            )
        else:
            # update reward
            epi_reward += (GAMMA**t) * reward

        # prepare next step
        current_room_desc = next_room_desc
        current_quest_desc = next_quest_desc
        t += 1

    if not for_training:
        return epi_reward

def run_episode(for_training):
    """ Runs one episode.
        If for training, update Q function.
        If for testing, compute and return the cumulative discounted reward.

        Args:
            for_training (bool): True if for training

        Returns:
            None if for training; the cumulative discounted reward if for testing
    """
    # My solution:
    epsilon = TRAINING_EP if for_training else TESTING_EP
    epi_reward = 0

    # initialize for each episode
    current_room_desc, current_quest_desc, terminal = framework.newGame()
    t = 0

    while not terminal:
        # Choose next action and execute
        current_state = current_room_desc + current_quest_desc
        current_state_vector = utils.extract_bow_feature_vector(
            current_state, dictionary)

        action_index, object_index = epsilon_greedy(current_state_vector,
                                                    theta, epsilon)
        next_room_desc, next_quest_desc, reward, terminal = \
            framework.step_game(current_room_desc, current_quest_desc,
                                action_index, object_index)

        next_state = next_room_desc + next_quest_desc
        next_state_vector = utils.extract_bow_feature_vector(
            next_state, dictionary)

        if for_training:
            # update Q-function.
            linear_q_learning(theta, current_state_vector, action_index,
                              object_index, reward, next_state_vector, terminal)

        if not for_training:
            # update reward
            epi_reward += GAMMA**t * reward

        t += 1
        # prepare next step
        current_room_desc, current_quest_desc = next_room_desc, next_quest_desc

    if not for_training:
        return epi_reward

def run_episode(for_training):
    """ Runs one episode.
        If for training, update Q function.
        If for testing, compute and return the cumulative discounted reward.

        Args:
            for_training (bool): True if for training

        Returns:
            None if for training; the cumulative discounted reward if for testing
    """
    epsilon = TRAINING_EP if for_training else TESTING_EP

    # initialize for each episode
    gamma_step = 1
    epi_reward = 0

    (current_room_desc, current_quest_desc, terminal) = framework.newGame()

    while not terminal:
        # Choose next action and execute
        current_state = current_room_desc + current_quest_desc
        current_state_vector = utils.extract_bow_feature_vector(
            current_state, dictionary)

        (action_index, object_index) = epsilon_greedy(current_state_vector,
                                                      theta, epsilon)
        (next_room_desc, next_quest_desc, reward, terminal) = framework.step_game(
            current_room_desc, current_quest_desc, action_index, object_index)

        if for_training:
            # update Q-function.
            next_state = next_room_desc + next_quest_desc
            # bag-of-words vector representation of the next state
            next_state_vector = utils.extract_bow_feature_vector(
                next_state, dictionary)
            linear_q_learning(theta, current_state_vector, action_index,
                              object_index, reward, next_state_vector, terminal)

        if not for_training:
            # update reward
            epi_reward = epi_reward + gamma_step * reward
            gamma_step = gamma_step * GAMMA

        # prepare next step
        current_room_desc = next_room_desc
        current_quest_desc = next_quest_desc

    if not for_training:
        return epi_reward

def run_episode(for_training):
    """ Runs one episode.
        If for training, update Q function.
        If for testing, compute and return the cumulative discounted reward.

        Args:
            for_training (bool): True if for training

        Returns:
            None if for training; the cumulative discounted reward if for testing
    """
    epsilon = TRAINING_EP if for_training else TESTING_EP
    epi_reward = 0.0

    # initialize for each episode
    (current_room_desc, current_quest_desc, terminal) = framework.newGame()

    while not terminal:
        # Choose next action and execute
        current_state = current_room_desc + current_quest_desc
        current_state_vector = utils.extract_bow_feature_vector(
            current_state, dictionary)

        # Get next action, object
        next_action_index, next_object_index = epsilon_greedy(
            current_state_vector, theta, epsilon)

        # Take a step
        next_room_desc, next_quest_desc, reward, terminal = framework.step_game(
            current_room_desc, current_quest_desc,
            next_action_index, next_object_index)

        # Build next state vector
        next_state = next_room_desc + next_quest_desc
        next_state_vector = utils.extract_bow_feature_vector(next_state,
                                                             dictionary)

        if for_training:
            # update Q-function (update theta)
            linear_q_learning(theta, current_state_vector, next_action_index,
                              next_object_index, reward, next_state_vector,
                              terminal)

        if not for_training:
            # update reward
            epi_reward += (GAMMA**(framework.STEP_COUNT - 1)) * reward

        # prepare next step
        current_room_desc, current_quest_desc = next_room_desc, next_quest_desc

    if not for_training:
        return epi_reward

def run_episode(for_training):
    """ Runs one episode.
        If for training, update Q function.
        If for testing, compute and return the cumulative discounted reward.
    """
    epsilon = TRAINING_EP if for_training else TESTING_EP

    # initialize for each episode
    epi_reward = 0.0
    step = 0

    (current_room_desc, current_quest_desc, terminal) = framework.newGame()

    while not terminal:
        # Choose next action and execute
        current_state = current_room_desc + current_quest_desc
        current_state_vector = torch.FloatTensor(
            utils.extract_bow_feature_vector(current_state, dictionary))

        # Get the next action and object
        next_action_i, next_object_i = epsilon_greedy(current_state_vector,
                                                      epsilon)

        # Make a move
        step += 1
        next_room_desc, next_quest_desc, reward, terminal = framework.step_game(
            current_room_desc, current_quest_desc, next_action_i, next_object_i)

        # Next state vector
        next_state = next_room_desc + next_quest_desc
        next_state_vector = torch.FloatTensor(
            utils.extract_bow_feature_vector(next_state, dictionary))

        if for_training:
            # update Q-function.
            deep_q_learning(current_state_vector, next_action_i, next_object_i,
                            reward, next_state_vector, terminal)

        if not for_training:
            # update reward
            epi_reward += (GAMMA**(step - 1)) * reward

        # prepare next step
        current_room_desc, current_quest_desc = next_room_desc, next_quest_desc

    if not for_training:
        return epi_reward

def run_episode(for_training):
    """ Runs one episode.
        If for training, update Q function.
        If for testing, compute and return the cumulative discounted reward.
    """
    epsilon = TRAINING_EP if for_training else TESTING_EP
    epi_reward = 0
    count = 0

    # initialize for each episode
    (current_room_desc, current_quest_desc, terminal) = framework.newGame()

    while not terminal:
        # Choose next action and execute
        current_state = current_room_desc + current_quest_desc
        current_state_vector = torch.FloatTensor(
            utils.extract_bow_feature_vector(current_state,
                                             dictionary)).type(dtype)

        (action_index, object_index) = epsilon_greedy(current_state_vector,
                                                      epsilon)
        (next_room_desc, next_quest_desc, reward, terminal) = \
            framework.step_game(current_room_desc, current_quest_desc,
                                action_index, object_index)

        next_state = next_room_desc + next_quest_desc
        next_state_vector = torch.FloatTensor(
            utils.extract_bow_feature_vector(next_state,
                                             dictionary)).type(dtype)

        if for_training:
            deep_q_learning(current_state_vector, action_index, object_index,
                            reward, next_state_vector, terminal)

        if not for_training:
            # update reward
            epi_reward += (GAMMA**count) * reward
            count += 1

        # prepare next step
        current_room_desc = next_room_desc
        current_quest_desc = next_quest_desc

    if not for_training:
        return epi_reward

def run_episode(for_training):
    """ Runs one episode.
        If for training, update Q function.
        If for testing, compute and return the cumulative discounted reward.
    """
    epsilon = TRAINING_EP if for_training else TESTING_EP
    gamma_step = 1
    epi_reward = 0

    (current_room_desc, current_quest_desc, terminal) = framework.newGame()

    while not terminal:
        # Choose next action and execute
        current_state = current_room_desc + current_quest_desc
        current_state_vector = torch.FloatTensor(
            utils.extract_bow_feature_vector(current_state, dictionary))

        (action_index, object_index) = epsilon_greedy(current_state_vector,
                                                      epsilon)
        (next_room_desc, next_quest_desc, reward, terminal) = framework.step_game(
            current_room_desc, current_quest_desc, action_index, object_index)

        if for_training:
            # update Q-function.
            next_state = next_room_desc + next_quest_desc
            # 32-bit floating point CPU tensor
            next_state_vector = torch.FloatTensor(
                utils.extract_bow_feature_vector(next_state, dictionary))
            deep_q_learning(current_state_vector, action_index, object_index,
                            reward, next_state_vector, terminal)

        if not for_training:
            # update reward
            epi_reward = epi_reward + gamma_step * reward
            gamma_step = gamma_step * GAMMA

        # prepare next step
        current_room_desc = next_room_desc
        current_quest_desc = next_quest_desc

    if not for_training:
        return epi_reward

def run_episode(for_training):
    """ Runs one episode.
        If for training, update Q function.
        If for testing, compute and return the cumulative discounted reward.
    """
    epsilon = TRAINING_EP if for_training else TESTING_EP
    epi_reward = 0.0

    (current_room_desc, current_quest_desc, terminal) = framework.newGame()

    while not terminal:
        # Choose next action and execute
        current_state = current_room_desc + current_quest_desc
        current_state_vector = torch.FloatTensor(
            utils.extract_bow_feature_vector(current_state, dictionary))

        next_action_index, next_object_index = epsilon_greedy(
            current_state_vector, epsilon)
        next_room_desc, next_quest_desc, reward, terminal = framework.step_game(
            current_room_desc, current_quest_desc,
            next_action_index, next_object_index)

        next_state = next_room_desc + next_quest_desc
        next_state_vector = torch.FloatTensor(
            utils.extract_bow_feature_vector(next_state, dictionary))

        if for_training:
            deep_q_learning(current_state_vector, next_action_index,
                            next_object_index, reward, next_state_vector,
                            terminal)

        if not for_training:
            epi_reward += (GAMMA**(framework.STEP_COUNT - 1)) * reward

        # prepare next step
        current_room_desc = next_room_desc
        current_quest_desc = next_quest_desc

    if not for_training:
        return epi_reward

def run_episode(for_training):
    """ Runs one episode.
        If for training, update Q function.
        If for testing, compute and return the cumulative discounted reward.

        Args:
            for_training (bool): True if for training

        Returns:
            None if for training; the cumulative discounted reward if for testing
    """
    epsilon = TRAINING_EP if for_training else TESTING_EP

    # initialize for each episode
    epi_reward = 0
    t = 0

    (current_room_desc, current_quest_desc, terminal) = framework.newGame()

    while not terminal:
        # Choose next action and execute
        current_state = current_room_desc + current_quest_desc
        current_state_vector = utils.extract_bow_feature_vector(
            current_state, dictionary)

        (action_index, object_index) = epsilon_greedy(current_state_vector,
                                                      theta, epsilon)
        (next_room_desc, next_quest_desc, reward, terminal) = framework.step_game(
            current_room_desc, current_quest_desc, action_index, object_index)

        next_state = next_room_desc + next_quest_desc
        next_state_vector = utils.extract_bow_feature_vector(
            next_state, dictionary)

        if for_training:
            # update Q-function.
            linear_q_learning(theta, current_state_vector, action_index,
                              object_index, reward, next_state_vector, terminal)

        if not for_training:
            # update reward
            epi_reward = epi_reward + (GAMMA**t) * reward

        # prepare next step
        t = t + 1
        current_room_desc, current_quest_desc = next_room_desc, next_quest_desc

    if not for_training:
        return epi_reward

def run_episode(for_training):
    """ Runs one episode.
        If for training, update Q function.
        If for testing, compute and return the cumulative discounted reward.
    """
    epsilon = TRAINING_EP if for_training else TESTING_EP

    # initialize for each episode
    epi_reward = 0
    t = 0

    (current_room_desc, current_quest_desc, terminal) = framework.newGame()

    while not terminal:
        # Choose next action and execute
        current_state = current_room_desc + current_quest_desc
        current_state_vector = torch.FloatTensor(
            utils.extract_bow_feature_vector(current_state, dictionary))

        (action_index, object_index) = epsilon_greedy(current_state_vector,
                                                      epsilon)
        (next_room_desc, next_quest_desc, reward, terminal) = framework.step_game(
            current_room_desc, current_quest_desc, action_index, object_index)

        next_state = next_room_desc + next_quest_desc
        next_state_vector = torch.FloatTensor(
            utils.extract_bow_feature_vector(next_state, dictionary))

        if for_training:
            # update Q-function.
            deep_q_learning(current_state_vector, action_index, object_index,
                            reward, next_state_vector, terminal)

        if not for_training:
            # update reward
            epi_reward += (GAMMA**t) * reward

        # prepare next step
        t += 1
        current_room_desc = next_room_desc
        current_quest_desc = next_quest_desc

    if not for_training:
        return epi_reward

def run_episode(for_training):
    """ Runs one episode.
        If for training, update Q function.
        If for testing, compute and return the cumulative discounted reward.

        Args:
            for_training (bool): True if for training

        Returns:
            None if for training; the cumulative discounted reward if for testing
    """
    epsilon = TRAINING_EP if for_training else TESTING_EP
    epi_reward = 0

    # initialize for each episode
    (current_room_desc, current_quest_desc, terminal) = framework.newGame()
    t = 0

    while not terminal:
        # Choose next action and execute
        current_state = current_room_desc + current_quest_desc
        current_state_vector = utils.extract_bow_feature_vector(
            current_state, dictionary)

        # Choose the action with epsilon_greedy
        action_index, object_index = epsilon_greedy(current_state_vector,
                                                    theta, epsilon)

        # Step the game and get the next descriptions
        next_room_desc, next_quest_desc, reward, terminal = framework.step_game(
            current_room_desc, current_quest_desc, action_index, object_index)

        next_state = next_room_desc + next_quest_desc
        next_state_vector = utils.extract_bow_feature_vector(
            next_state, dictionary)

        if for_training:
            # update Q-function.
            linear_q_learning(theta, current_state_vector, action_index,
                              object_index, reward, next_state_vector, terminal)

        if not for_training:
            # update reward
            epi_reward += (GAMMA**t) * reward

        # prepare next step
        t += 1
        current_room_desc = next_room_desc
        current_quest_desc = next_quest_desc

    if not for_training:
        return epi_reward

def run_episode(for_training):
    """ Runs one episode.
        If for training, update Q function.
        If for testing, compute and return the cumulative discounted reward.

        Args:
            for_training (bool): True if for training

        Returns:
            None if for training; the cumulative discounted reward if for testing
    """
    epsilon = TRAINING_EP if for_training else TESTING_EP

    # initialize for each episode
    # See framework.py: newGame() returns a tuple whose first element is a
    # description of the initial room, whose second element is a description of
    # the quest for this new game episode, and whose last element is a Boolean
    # that is False while the game is not over.
    (current_room_desc, current_quest_desc, terminal) = framework.newGame()

    # initial values
    count = 0
    epi_reward = 0

    while not terminal:
        # Choose next action and execute
        current_state = current_room_desc + current_quest_desc
        current_state_vector = utils.extract_bow_feature_vector(current_state,
                                                                dictionary)

        (action_index, object_index) = epsilon_greedy(current_state_vector,
                                                      theta, epsilon)

        # Shorter names
        a_idx = action_index        # action index
        o_idx = object_index        # object index
        crd = current_room_desc     # current room description
        cqd = current_quest_desc    # current quest description

        # The next state reached when the selected command is applied at the
        # current state
        (next_room_desc, next_quest_desc, reward, terminal) = framework.step_game(
            crd, cqd, a_idx, o_idx)

        next_state = next_room_desc + next_quest_desc
        # See utils.py for the bag-of-words vector representation of the state
        next_state_vector = utils.extract_bow_feature_vector(next_state,
                                                             dictionary)

        if for_training:
            # update Q-function.
            linear_q_learning(theta, current_state_vector, a_idx, o_idx,
                              reward, next_state_vector, terminal)

        if not for_training:
            # update reward
            epi_reward += np.power(GAMMA, count) * reward

        # prepare next step
        count += 1
        current_room_desc = next_room_desc
        current_quest_desc = next_quest_desc

    if not for_training:
        return epi_reward
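
# For context, a function like run_episode is typically driven by an outer loop
# that trains for a number of episodes and then measures the average discounted
# reward over test episodes. The sketch below is illustrative only: the
# constants NUM_EPIS_TRAIN and NUM_EPIS_TEST and the name run_epoch are
# assumptions for demonstration, not part of the solutions above.
import numpy as np

NUM_EPIS_TRAIN = 25   # assumed number of training episodes per epoch
NUM_EPIS_TEST = 50    # assumed number of evaluation episodes per epoch


def run_epoch():
    """Run NUM_EPIS_TRAIN training episodes, then NUM_EPIS_TEST test episodes,
    and return the average cumulative discounted reward over the test episodes."""
    for _ in range(NUM_EPIS_TRAIN):
        run_episode(for_training=True)

    rewards = [run_episode(for_training=False) for _ in range(NUM_EPIS_TEST)]
    return np.mean(np.array(rewards))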