def make_annotations(end_i):
    P.initialize_experiment()
    annotations = {"train": [], "test": [], "dev": []}

    train_range, dev_range, test_range = get_split_ranges(end_i)
    assert (train_range[1] - train_range[0]) % NEW_CONFIG_EVERY_N == 0, \
        "training set size must be a multiple of NEW_CONFIG_EVERY_N"

    for config_id in range(end_i):
        config_path = paths.get_env_config_path(config_id)
        path_path = paths.get_curve_path(config_id)
        instruction_path = paths.get_instructions_path(config_id)

        with open(config_path) as fp:
            config = json.load(fp)
        with open(path_path) as fp:
            curve = json.load(fp)
        with open(instruction_path) as fp:
            instruction = fp.readline()

        token_list = clean_instruction(instruction)
        curve_np = np.asarray(list(zip(curve["x_array"], curve["z_array"])))

        split = "train" if train_range[0] <= config_id < train_range[1] else \
                "dev" if dev_range[0] <= config_id < dev_range[1] else \
                "test" if test_range[0] <= config_id < test_range[1] else None

        #start_dir = np.asarray(config["startHeading"]) - np.asarray(config["startPos"])
        start_dir = curve_np[1] - curve_np[0]
        start_yaw = vec_to_yaw(start_dir)
        start_yaw_cfg = np.rad2deg(-start_yaw + np.pi / 2)

        dataset = {
            "id": str(config_id),
            "start_z": [curve["z_array"][0]],
            "start_x": [curve["x_array"][0]],
            "end_z": [curve["z_array"][-1]],
            "end_x": [curve["x_array"][-1]],
            "start_rot": [start_yaw_cfg],
            "config_file": "configs/random_config_%d.json" % config_id,
            "instructions_file": "instructions/instructions_%d.txt" % config_id,
            "path_file": "paths/random_curve_%d.json" % config_id,
            "moves": [],
            "valid": True,
            "num_tokens": [len(token_list)],
            "instructions": [instruction]
        }
        annotations[split].append(dataset)
        print("Added annotations for env: " + str(config_id))

    with open(paths.get_instruction_annotations_path(), "w") as fp:
        json.dump(annotations, fp)
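
# NOTE: get_split_ranges() is used above but not defined in this section. The sketch below is only an
# illustration of the contract make_annotations(end_i) appears to rely on: three contiguous,
# non-overlapping [start, end) index ranges that together cover 0..end_i. The 70/15/15 proportions are
# an assumption for illustration; the real helper presumably also guarantees that the training range
# length is a multiple of NEW_CONFIG_EVERY_N, as the assert above requires.
def example_get_split_ranges(end_i, train_frac=0.7, dev_frac=0.15):
    train_end = int(end_i * train_frac)
    dev_end = train_end + int(end_i * dev_frac)
    return (0, train_end), (train_end, dev_end), (dev_end, end_i)
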
def make_annotations(start_i, end_i):
    P.initialize_experiment()
    annotations = {"train": [], "test": [], "dev": []}

    for config_id in range(start_i, end_i):
        config_path = paths.get_env_config_path(config_id)
        path_path = paths.get_curve_path(config_id)
        instruction_path = paths.get_instructions_path(config_id)

        with open(config_path) as fp:
            config = json.load(fp)
        with open(path_path) as fp:
            curve = json.load(fp)
        with open(instruction_path) as fp:
            instruction = fp.readline()

        token_list = clean_instruction(instruction)

        split = get_split((config_id % 100) / 100.0)

        start_dir = np.asarray(config["startHeading"]) - np.asarray(config["startPos"])
        start_yaw = vec_to_yaw(start_dir)
        start_yaw_cfg = np.rad2deg(-start_yaw + np.pi / 2)

        dataset = {
            "id": str(config_id),
            "start_z": [curve["z_array"][0]],
            "start_x": [curve["x_array"][0]],
            "end_z": [curve["z_array"][-1]],
            "end_x": [curve["x_array"][-1]],
            "start_rot": [start_yaw_cfg],
            "config_file": "configs/random_config_%d.json" % config_id,
            "instructions_file": "instructions/instructions_%d.txt" % config_id,
            "path_file": "paths/random_curve_%d.json" % config_id,
            "moves": [],
            "valid": True,
            "num_tokens": [len(token_list)],
            "instructions": [instruction]
        }
        annotations[split].append(dataset)
        print("Added annotations for env: " + str(config_id))

    with open(paths.get_instruction_annotations_path(), "w") as fp:
        json.dump(annotations, fp)
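
# NOTE: get_split() is not defined in this section either. It is called with the fractional position of
# an environment id within each block of 100 ids, so it presumably maps a float in [0, 1) to a split
# name. A minimal sketch under an assumed 70/15/15 train/dev/test split (the actual thresholds may differ):
def example_get_split(fraction):
    if fraction < 0.7:
        return "train"
    elif fraction < 0.85:
        return "dev"
    else:
        return "test"
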
def get_mentioned_landmarks(thesaurus, str_instruction):
    split_instr = split_instruction(clean_instruction(str_instruction))
    word2term = thesaurus["word2term"]
    term_groundings = thesaurus["term_groundings"]
    lm_name2index = get_landmark_name_to_index()

    # Map each word in the instruction to its corresponding term:
    split_instr_terms = words_to_terms(split_instr, word2term)

    # For each term, find all the landmarks that have been mentioned
    mentioned_landmark_names = set()
    for term in split_instr_terms:
        for landmark_name in term_groundings[term]["landmarks"]:
            mentioned_landmark_names.add(landmark_name)

    mentioned_landmark_names = list(mentioned_landmark_names)
    mentioned_landmark_indices = [lm_name2index[name] for name in mentioned_landmark_names]

    return mentioned_landmark_names, mentioned_landmark_indices
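
# NOTE: words_to_terms() (used above and again in ground_terms below) is not defined in this section.
# It is assumed to map each instruction word to its thesaurus term via word2term, dropping words the
# thesaurus does not know about. A minimal sketch of that assumed behavior:
def example_words_to_terms(word_list, word2term):
    return [word2term[word] for word in word_list if word in word2term]
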
def cluster_corpus(corpus, train_instructions, max_edit_distance=3):
    english = load_english_vocabulary()
    terms = {}
    potential_misspellings = []

    # Count the number of times each word occurs in the corpus
    word_counts = {}
    for word in corpus:
        word_counts[word] = 0
    for env_id, instruction_sets in train_instructions.items():
        for instruction_set in instruction_sets[0]["instructions"]:
            instruction_str = instruction_set["instruction"]
            instr_split = split_instruction(clean_instruction(instruction_str))
            for word in instr_split:
                word_counts[word] += 1

    # Every corpus word that is a valid English word becomes its own term;
    # everything else is treated as a potential misspelling
    for word in corpus:
        if word in english:
            terms[word] = [word]
        else:
            potential_misspellings.append(word)

    terms[UNK_TERM] = []

    # Find the closest english word by edit distance
    edit_dists = np.zeros(len(terms))
    term_list = sorted(list(terms.keys()))
    unresolved_words = []
    for i, misspelled_word in enumerate(potential_misspellings):
        # Words that contain a number should be assumed not to be misspellings
        if any(char.isdigit() for char in misspelled_word):
            unresolved_words.append(misspelled_word)
            continue
        # For other words, see if they might be misspellings of one of the terms
        for j, term in enumerate(term_list):
            edit_dists[j] = levenshtein(misspelled_word, term)
        closest = int(np.argmin(edit_dists))
        min_dist = edit_dists[closest]
        # If the misspelled word is likely a misspelling of the closest term, add it in,
        # except to the "NA" term
        if min_dist <= max_edit_distance and term_list[closest] != UNK_TERM:
            terms[term_list[closest]].append(misspelled_word)
        # Otherwise add it to the list of unresolved words that are too different from every term
        else:
            unresolved_words.append(misspelled_word)

    # Handle words that are not misspellings
    rejected_words = []
    for unresolved_word in unresolved_words:
        # If the word is not a misspelling and is also very infrequent, reject it
        if word_counts[unresolved_word] < MIN_TERM_OCCURENCES:
            rejected_words.append(unresolved_word)
        # Otherwise create a term for this word
        else:
            terms[unresolved_word] = [unresolved_word]

    # For each rejected word, add it to the unknown term
    for rejected_word in rejected_words:
        terms[UNK_TERM].append(rejected_word)

    print("After clustering words, found " + str(len(rejected_words)) + " rare ones that have been rejected:")
    print(rejected_words)
    print("...")

    return terms, rejected_words
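
# NOTE: levenshtein() is used in cluster_corpus() but not defined in this section. For reference, a
# standard dynamic-programming edit distance is sketched below; the project's own implementation may
# differ (e.g. it could come from an external package).
def example_levenshtein(a, b):
    # dist[i][j] holds the edit distance between a[:i] and b[:j]
    dist = [[0] * (len(b) + 1) for _ in range(len(a) + 1)]
    for i in range(len(a) + 1):
        dist[i][0] = i
    for j in range(len(b) + 1):
        dist[0][j] = j
    for i in range(1, len(a) + 1):
        for j in range(1, len(b) + 1):
            cost = 0 if a[i - 1] == b[j - 1] else 1
            dist[i][j] = min(dist[i - 1][j] + 1,         # deletion
                             dist[i][j - 1] + 1,         # insertion
                             dist[i - 1][j - 1] + cost)  # substitution
    return dist[len(a)][len(b)]
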
def ground_terms(word2id, clustered_corpus, landmark_names, train_instructions):
    # The clustered corpus is a dictionary of lists, where the keys are valid english words and the values
    # are lists of words found in the corpus that are assumed to be misspellings of the key valid words.
    # We make the distinction that a word is any word in an instruction, while terms are words in the
    # english vocabulary. Multiple words (misspellings) can map to a single term.
    num_terms = len(clustered_corpus)
    vocab_size = len(word2id)
    num_landmarks = len(landmark_names)

    # This will become the new word2id, once we start using the thesaurus
    term2id = {}
    id2term = {}
    for i, term in enumerate(sorted(clustered_corpus.keys())):
        term2id[term] = i
        id2term[i] = term

    # Calculate the mutual information between each cluster and each landmark
    # Number of times each term appears in an instruction
    term_occurences = np.zeros(num_terms)
    # Number of times each landmark appears near a segment path
    landmark_occurences = np.zeros(num_landmarks)
    # The number of times each term and landmark combination appears in the instruction and near the path
    term_landmark_cooccurences = np.zeros((num_terms, num_landmarks))
    # The number of total segments that were considered
    total_occurences = 0

    landmark_indices = get_landmark_name_to_index()

    # Invert the clusters so that we can efficiently map each word in each instruction to its cluster core
    word2term = {}
    for real_word, misspellings in clustered_corpus.items():
        for misspelling in misspellings:
            word2term[misspelling] = real_word

    # Count landmark and term occurrences and co-occurrences
    for env_id, instruction_sets in train_instructions.items():
        path = load_path(env_id)
        env_config = load_env_config(env_id)
        for instruction_set in instruction_sets[0]["instructions"]:
            instruction_str = instruction_set["instruction"]
            start_idx = instruction_set["start_idx"]
            end_idx = instruction_set["end_idx"]

            present_landmarks = close_landmark_names(env_config, path, start_idx, end_idx)
            present_lm_indices = [landmark_indices[lm] for lm in present_landmarks]

            mentioned_words = split_instruction(clean_instruction(instruction_str))
            mentioned_terms = words_to_terms(mentioned_words, word2term)

            for term in mentioned_terms:
                term_id = term2id[term]
                term_occurences[term_id] += 1

            for lm_idx in present_lm_indices:
                landmark_occurences[lm_idx] += 1
                for term in mentioned_terms:
                    term_id = term2id[term]
                    term_landmark_cooccurences[term_id][lm_idx] += 1

            total_occurences += 1

    term_prob = np.expand_dims(term_occurences / total_occurences, 1).repeat(num_landmarks, 1)
    landmark_prob = np.expand_dims(landmark_occurences / total_occurences, 0).repeat(num_terms, 0)
    term_and_landmark_prob = term_landmark_cooccurences / total_occurences

    # term_and_landmark_prob has dimensions 0: terms, 1: landmarks
    mutual_info_factor = term_and_landmark_prob / (landmark_prob * term_prob + 1e-27)
    #mutual_info_factor = term_and_landmark_prob / ((1 / num_landmarks) * term_prob + 1e-9)
    mutual_info = term_and_landmark_prob * np.log(mutual_info_factor + 1e-27)

    # The above line is the correct formula for mutual information. For our case, the formula below might be
    # better: mutual information is higher for common words than uncommon ones, and we might prefer the
    # opposite effect. On the other hand, uncommon words are more likely to spuriously correlate with
    # landmarks, which would make the corpus less reliable.
    #mutual_info = np.log(mutual_info_factor + 1e-27)

    # Ground each term and produce the thesaurus
    term_meanings = {}
    common_words = []
    for i in range(num_terms):
        grounded_lm_indices = [idx for idx in range(num_landmarks)
                               if mutual_info[i][idx] > MUTUAL_INFO_THRESHOLD]

        grounded_lm_names = [landmark_names[idx] for idx in grounded_lm_indices]
        mutual_infos = np.asarray([mutual_info[i][idx] for idx in grounded_lm_indices])

        # Sort the grounded landmarks by decreasing mutual information
        args = list(np.argsort(mutual_infos))
        grounded_lm_names = list(reversed([grounded_lm_names[idx] for idx in args]))
        mutual_infos = list(reversed([mutual_infos[idx] for idx in args]))

        # If the word is too common to be referring to a landmark, ignore it
        this_term_prob = term_prob[i][0]
        if this_term_prob > MAX_TERM_PROB:
            common_words.append(id2term[i])
            grounded_lm_names = []
            mutual_infos = []

        term_meanings[id2term[i]] = {
            "landmarks": grounded_lm_names,
            "mutual_info": mutual_infos,
            "term_prob": this_term_prob
        }

    for k in term_meanings.keys():
        if len(term_meanings[k]["landmarks"]) > 0:
            print(k, term_meanings[k])

    print("Ignored groundings for these common words: " + str(common_words))

    return term_meanings, word2term
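
# NOTE: Illustrative only. This sketch shows one way the functions above could be chained to build the
# thesaurus consumed by get_mentioned_landmarks(); the function name, the "term2word" / "rejected_words"
# keys and the argument plumbing are assumptions, not names taken from this file. Only "word2term" and
# "term_groundings" are keys that get_mentioned_landmarks() actually reads.
def example_build_thesaurus(corpus, word2id, landmark_names, train_instructions):
    clusters, rejected_words = cluster_corpus(corpus, train_instructions)
    term_groundings, word2term = ground_terms(word2id, clusters, landmark_names, train_instructions)
    thesaurus = {
        "term2word": clusters,               # term -> list of surface words (incl. misspellings)
        "word2term": word2term,              # surface word -> term
        "term_groundings": term_groundings,  # term -> grounded landmark names + mutual information
        "rejected_words": rejected_words,    # rare words folded into the UNK term
    }
    return thesaurus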