def creat_word_rel_dict(r_file, *q_files):
    """Build a word Dictionary covering question tokens and relation words.

    Args:
        r_file: path to a pickled iterable of relation strings; each relation
            has a 3-char prefix that is stripped (``rel[3:]``) before the
            remainder is split on ``'.'`` and ``'_'`` into words.
            # NOTE(review): presumably a "fb:"-style prefix -- confirm.
        *q_files: paths to pickled QA datasets whose items expose a
            ``.question`` string attribute; questions are split on spaces.

    Returns:
        A Dictionary seeded with unk/pad/start tokens plus every question
        token and every relation word.
    """
    word_dict = Dictionary()
    word_dict.add_unk_token()
    word_dict.add_pad_token()
    word_dict.add_start_token()

    # Add every whitespace-delimited question token.
    for q_file in q_files:
        # 'with' closes the handle; the original leaked it via
        # pickle.load(open(...)).
        with open(q_file, 'rb') as f:
            qa_data = pickle.load(f)
        for data in qa_data:
            for token in data.question.split(' '):
                word_dict.add(token)
    print(len(word_dict))

    with open(r_file, 'rb') as f:
        rels = pickle.load(f)
    for rel in rels:
        # Strip the 3-char prefix, then split the dotted path into
        # underscore-separated words.
        rel_word = []
        for part in rel[3:].split('.'):
            rel_word.extend(part.split('_'))
        for word in rel_word:
            word_dict.add(word)
    print(len(word_dict))
    return word_dict
def load_type_dictionary(filename, word_dict=None):
    """Load pickled type names from *filename* into a Dictionary.

    Args:
        filename: path to a pickled iterable of type strings.
        word_dict: optional existing dictionary-like object with ``add``;
            when None a fresh Dictionary with unk/pad tokens is created.

    Returns:
        The populated dictionary (the one passed in, if any).
    """
    if word_dict is None:
        word_dict = Dictionary()
        word_dict.add_unk_token()
        word_dict.add_pad_token()
    # 'with' closes the handle; the original leaked it via pickle.load(open(...)).
    with open(filename, 'rb') as f:
        data = pickle.load(f)
    for ty in data:
        word_dict.add(ty)
    return word_dict
def load_word_dictionary(filename, word_dict=None):
    """Load one word per line from a text file into a Dictionary.

    Blank lines are skipped. (The original also had ``if not line: break``,
    which is dead code: iterating a file never yields an empty string --
    iteration simply stops at EOF -- so it has been removed.)

    Args:
        filename: path to a text file with one word per line.
        word_dict: optional existing dictionary-like object with ``add``;
            when None a fresh Dictionary with unk/pad tokens is created.

    Returns:
        The populated dictionary (the one passed in, if any).
    """
    if word_dict is None:
        word_dict = Dictionary()
        word_dict.add_unk_token()
        word_dict.add_pad_token()
    with open(filename) as f:
        for line in f:
            word = line.strip()
            if word:  # skip blank lines
                word_dict.add(word)
    return word_dict
def load_rel_separated_dictionary(filename):
    """Load relations (one per line) split into prefix and leaf dictionaries.

    Each non-blank line is split at its final ``'.'``: everything before it
    goes into the first dictionary, the last component into the second.
    (The original's ``if not line: break`` was dead code -- file iteration
    never yields an empty string -- and has been removed; ``line`` is also
    no longer rebound from str to list.)

    Args:
        filename: path to a text file with one dotted relation per line.

    Returns:
        Tuple ``(rel1_dict, rel2_dict)`` of Dictionaries, each seeded with
        unk/pad tokens.
    """
    rel1_dict = Dictionary()
    rel1_dict.add_unk_token()
    rel1_dict.add_pad_token()
    rel2_dict = Dictionary()
    rel2_dict.add_unk_token()
    rel2_dict.add_pad_token()
    with open(filename) as f:
        for raw in f:
            rel = raw.strip()
            if not rel:  # skip blank lines
                continue
            parts = rel.split('.')
            rel1_dict.add('.'.join(parts[:-1]))
            rel2_dict.add(parts[-1])
    return rel1_dict, rel2_dict
normalized_edit_distance = (max_length - edit_distance) / max_length print_data.append({ 'Predicted Subject Name': tagged_subject_name, 'Original Subject Name': row['subject_name'], 'Normalized Edit Distance': normalized_edit_distance, 'Question Tokens': row['question_tokens'], }) exact_match = [ d for d in print_data if d['Normalized Edit Distance'] == 1.0 ] # get the word dictionary word_vocab = Dictionary() word_vocab.add_unk_token() word_vocab.add_pad_token() word_vocab.add_start_token() word_vocab.add_end_token() word_vocab.add("<e>") add_word(df_dev) add_word(df_test) add_word(df_train) torch.save(word_vocab, "../../data/vocab/word_vocab.pt") # get the training data and test data get_formatted_examples(128, '../../data/subject_recognition/dev.pt', df_dev) get_formatted_examples(128, '../../data/subject_recognition/test.pt', df_test)