def get_mentioned_landmarks(thesaurus, str_instruction):
    """Collect the landmarks that the words of an instruction are grounded to.

    The instruction is cleaned and split into words, every word is mapped to
    its thesaurus term, and the landmark names attached to those terms are
    gathered without duplicates.

    Args:
        thesaurus: Mapping with a "word2term" entry (passed to
            ``words_to_terms``) and a "term_groundings" entry whose values
            each hold a "landmarks" collection of landmark names.
        str_instruction: The raw instruction text.

    Returns:
        A ``(names, indices)`` tuple of two lists aligned element by element:
        the mentioned landmark names and their indices according to
        ``get_landmark_name_to_index()``. The names come out of a set, so
        their order is arbitrary.
    """
    words = split_instruction(clean_instruction(str_instruction))
    word_to_term = thesaurus["word2term"]
    groundings = thesaurus["term_groundings"]
    name_to_index = get_landmark_name_to_index()

    # Replace every word of the instruction by its corresponding term.
    terms = words_to_terms(words, word_to_term)

    # Union of the landmarks grounded to any of the terms.
    unique_names = {
        landmark
        for term in terms
        for landmark in groundings[term]["landmarks"]
    }

    names = list(unique_names)
    indices = [name_to_index[landmark] for landmark in names]
    return names, indices
def ground_terms(word2id, clustered_corpus, landmark_names, train_instructions):
    """Ground every corpus term to the landmarks it co-occurs with.

    A *word* is any token that appears in an instruction. A *term* is a valid
    English word; several words (misspellings) can map to a single term.
    For every (term, landmark) pair the function counts how often the term is
    mentioned in an instruction segment while the landmark is close to that
    segment's path, scores the pair with its mutual-information contribution,
    and keeps the landmarks whose score exceeds ``MUTUAL_INFO_THRESHOLD``.
    Terms whose occurrence rate exceeds ``MAX_TERM_PROB`` are considered too
    common to refer to a landmark and get no groundings.

    Args:
        word2id: Not used by the computation; kept so existing callers keep
            working.
        clustered_corpus: Dict mapping each term to the list of corpus words
            assumed to be (mis)spellings of that term.
        landmark_names: Sequence of landmark names.
            NOTE(review): position ``i`` is assumed to match index ``i`` of
            ``get_landmark_name_to_index()`` - confirm against the caller.
        train_instructions: Dict mapping an environment id to a sequence whose
            first element has an "instructions" list; each entry provides
            "instruction", "start_idx" and "end_idx".

    Returns:
        A ``(term_meanings, word2term)`` tuple. ``term_meanings`` maps each
        term to a dict with "landmarks" (names sorted by decreasing score),
        "mutual_info" (the matching scores) and "term_prob" (occurrences of
        the term divided by the number of segments). ``word2term`` maps every
        corpus word to its term.
    """
    num_terms = len(clustered_corpus)
    num_landmarks = len(landmark_names)

    # Term ids follow the alphabetical order of the terms. This is going to be
    # the new word2id once the thesaurus is in use.
    id2term = sorted(clustered_corpus)
    term2id = {term: term_id for term_id, term in enumerate(id2term)}

    # Invert the clusters so that each word of an instruction can be mapped
    # to its cluster core efficiently.
    word2term = {
        misspelling: real_word
        for real_word, misspellings in clustered_corpus.items()
        for misspelling in misspellings
    }

    # Number of times each term appears in an instruction.
    term_counts = np.zeros(num_terms)
    # Number of times each landmark appears near a segment path.
    landmark_counts = np.zeros(num_landmarks)
    # Number of times each (term, landmark) combination appears in the
    # instruction and near the path. Axis 0: terms, axis 1: landmarks.
    cooccurrence_counts = np.zeros((num_terms, num_landmarks))
    # Total number of segments that were considered.
    total_segments = 0

    landmark_indices = get_landmark_name_to_index()

    # Count landmark and term occurrences and co-occurrences.
    for env_id, instruction_sets in train_instructions.items():
        path = load_path(env_id)
        env_config = load_env_config(env_id)

        for segment in instruction_sets[0]["instructions"]:
            present_landmarks = close_landmark_names(
                env_config, path, segment["start_idx"], segment["end_idx"])
            present_lm_indices = [landmark_indices[lm] for lm in present_landmarks]

            mentioned_words = split_instruction(
                clean_instruction(segment["instruction"]))
            mentioned_terms = words_to_terms(mentioned_words, word2term)
            # A term repeated within one instruction is counted every time.
            mentioned_term_ids = [term2id[term] for term in mentioned_terms]

            for term_id in mentioned_term_ids:
                term_counts[term_id] += 1

            for lm_idx in present_lm_indices:
                landmark_counts[lm_idx] += 1
                for term_id in mentioned_term_ids:
                    cooccurrence_counts[term_id, lm_idx] += 1

            total_segments += 1

    # With no segments every count is zero; dividing by 1 instead of 0 yields
    # zero probabilities rather than NaNs and a RuntimeWarning.
    normalizer = max(total_segments, 1)
    term_prob = term_counts / normalizer
    landmark_prob = landmark_counts / normalizer
    term_and_landmark_prob = cooccurrence_counts / normalizer

    # Broadcasting builds the (terms x landmarks) product of the marginals
    # without materialising repeated copies of them.
    independent_prob = landmark_prob[np.newaxis, :] * term_prob[:, np.newaxis]
    mutual_info_factor = term_and_landmark_prob / (independent_prob + 1e-27)

    # Each entry is the contribution p(t, l) * log(p(t, l) / (p(t) * p(l))) of
    # one (term, landmark) pair to the mutual information.
    # NOTE: this score is higher for common words than for uncommon ones; the
    # opposite effect might be preferable, in which case the bare
    # log(mutual_info_factor) could be used. On the other hand, uncommon words
    # are more likely to correlate spuriously with landmarks, which would
    # produce a less reliable thesaurus.
    mutual_info = term_and_landmark_prob * np.log(mutual_info_factor + 1e-27)

    # Ground each term and produce the thesaurus.
    term_meanings = {}
    common_words = []
    for term_id, term in enumerate(id2term):
        scores_for_term = mutual_info[term_id]
        candidate_indices = np.flatnonzero(scores_for_term > MUTUAL_INFO_THRESHOLD)
        candidate_scores = scores_for_term[candidate_indices]

        # Best grounding first: reverse the ascending argsort.
        ranking = np.argsort(candidate_scores)[::-1]
        grounded_lm_names = [
            landmark_names[idx] for idx in candidate_indices[ranking].tolist()
        ]
        mutual_infos = list(candidate_scores[ranking])

        # If the word is too common to be referring to a landmark, ignore it.
        this_term_prob = term_prob[term_id]
        if this_term_prob > MAX_TERM_PROB:
            common_words.append(term)
            grounded_lm_names = []
            mutual_infos = []

        term_meanings[term] = {
            "landmarks": grounded_lm_names,
            "mutual_info": mutual_infos,
            "term_prob": this_term_prob,
        }

    # Report every term that ended up with at least one grounding.
    for term, meaning in term_meanings.items():
        if meaning["landmarks"]:
            print(term, meaning)

    print("Ignored groundings for these common words: " + str(common_words))

    return term_meanings, word2term