import pickle

# NOTE: Dictionary is the project's vocabulary class; its import is not shown
# in this snippet.


def creat_word_rel_dict(r_file, *q_files):
    """Build a word dictionary covering question tokens and relation-name words."""
    word_dict = Dictionary()
    word_dict.add_unk_token()
    word_dict.add_pad_token()
    word_dict.add_start_token()

    # Add every whitespace-separated token from the pickled QA questions.
    for q_file in q_files:
        with open(q_file, 'rb') as f:
            qa_data = pickle.load(f)
        for data in qa_data:
            for token in data.question.split(' '):
                word_dict.add(token)
    print(len(word_dict))  # vocabulary size after question tokens

    # Add the words making up each relation name: drop the 3-character
    # namespace prefix (e.g. 'fb:'), then split on '.' and '_'.
    with open(r_file, 'rb') as f:
        rels = pickle.load(f)
    for rel in rels:
        rel_word = []
        for part in rel[3:].split('.'):
            rel_word.extend(part.split('_'))
        for word in rel_word:
            word_dict.add(word)
    print(len(word_dict))  # vocabulary size after relation words
    return word_dict
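# Usage sketch (hedged): the pickle paths below are hypothetical placeholders,
# not files confirmed by this repository.
#
#   word_dict = creat_word_rel_dict('relations.pkl', 'train_qa.pkl', 'dev_qa.pkl')
#   torch.save(word_dict, '../../data/vocab/word_rel_vocab.pt')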
print_data.append({
    'Predicted Subject Name': tagged_subject_name,
    'Original Subject Name': row['subject_name'],
    'Normalized Edit Distance': normalized_edit_distance,
    'Question Tokens': row['question_tokens'],
})
# A normalized edit distance of 1.0 means the predicted subject matches
# the gold subject name exactly.
exact_match = [
    d for d in print_data if d['Normalized Edit Distance'] == 1.0
]

# get the word dictionary
word_vocab = Dictionary()
word_vocab.add_unk_token()
word_vocab.add_pad_token()
word_vocab.add_start_token()
word_vocab.add_end_token()
word_vocab.add("<e>")
add_word(df_dev)
add_word(df_test)
add_word(df_train)
torch.save(word_vocab, "../../data/vocab/word_vocab.pt")

# get the training data and test data
get_formatted_examples(128, '../../data/subject_recognition/dev.pt', df_dev)
get_formatted_examples(128, '../../data/subject_recognition/test.pt', df_test)
get_formatted_examples(128, '../../data/subject_recognition/train.pt', df_train)
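# The add_word helper is defined elsewhere; a minimal sketch of what it
# presumably does, assuming each dataframe row carries a whitespace-joined
# 'question_tokens' string as used in print_data above:
#
#   def add_word(df):
#       for tokens in df['question_tokens']:
#           for token in tokens.split():
#               word_vocab.add(token)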