import pickle

# `Dictionary` is a project-local vocabulary class; a sketch of the
# interface these functions rely on appears right after this example.


def create_word_rel_dict(r_file, *q_files):
    """Build a word dictionary from pickled question files and a relation file."""
    word_dict = Dictionary()
    word_dict.add_unk_token()
    word_dict.add_pad_token()
    word_dict.add_start_token()

    # Add every question token to the dictionary.
    for q_file in q_files:
        with open(q_file, 'rb') as f:
            qa_data = pickle.load(f)
        for data in qa_data:
            for token in data.question.split(' '):
                word_dict.add(token)
    print(len(word_dict))  # vocabulary size after question words

    # Add every word in a relation name. The first three characters are a
    # prefix (presumably something like "fb:") that is stripped; the rest is
    # split on '.' and '_', so "fb:people.person.place_of_birth" contributes
    # "people", "person", "place", "of", "birth".
    with open(r_file, 'rb') as f:
        rels = pickle.load(f)
    for rel in rels:
        rel_words = []
        for part in rel[3:].split('.'):
            rel_words.extend(part.split('_'))
        for word in rel_words:
            word_dict.add(word)
    print(len(word_dict))  # vocabulary size after relation words
    return word_dict
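# The functions in this file assume a project-local `Dictionary` class. A
# minimal sketch of the interface they rely on (method names are taken from
# the calls above; the special-token strings and index layout are assumptions,
# not the project's actual code):
class Dictionary:
    def __init__(self):
        self.token2id = {}
        self.id2token = []

    def add(self, token):
        # Register a token (idempotently) and return its integer id.
        if token not in self.token2id:
            self.token2id[token] = len(self.id2token)
            self.id2token.append(token)
        return self.token2id[token]

    def add_unk_token(self):
        self.unk = self.add('<unk>')

    def add_pad_token(self):
        self.pad = self.add('<pad>')

    def add_start_token(self):
        self.start = self.add('<s>')

    def add_end_token(self):
        self.end = self.add('</s>')

    def __len__(self):
        return len(self.id2token)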
def load_type_dictionary(filename, word_dict=None):
    """Load a pickled list of entity types into a dictionary."""
    if word_dict is None:
        word_dict = Dictionary()
        word_dict.add_unk_token()
        word_dict.add_pad_token()
    with open(filename, 'rb') as f:
        data = pickle.load(f)
    for ty in data:
        word_dict.add(ty)
    return word_dict
def load_word_dictionary(filename, word_dict=None):
    """Load a dictionary from a text file with one word per line."""
    if word_dict is None:
        word_dict = Dictionary()
        word_dict.add_unk_token()
        word_dict.add_pad_token()
    with open(filename) as f:
        for line in f:
            word = line.strip()
            if not word:  # skip blank lines
                continue
            word_dict.add(word)
    return word_dict
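# A minimal usage sketch of the two loaders above (file paths are
# hypothetical, not from the source): both splits share one dictionary.
#
#     word_dict = load_type_dictionary('data/types.pkl')
#     word_dict = load_word_dictionary('data/words.txt', word_dict=word_dict)
#     print(len(word_dict))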
def load_rel_separated_dictionary(filename):
    """Split each relation "a.b.c" into head "a.b" and tail "c", building a
    separate dictionary for each part."""
    rel1_dict = Dictionary()
    rel1_dict.add_unk_token()
    rel1_dict.add_pad_token()
    rel2_dict = Dictionary()
    rel2_dict.add_unk_token()
    rel2_dict.add_pad_token()
    with open(filename) as f:
        for line in f:
            rel = line.strip()
            if not rel:  # skip blank lines
                continue
            parts = rel.split('.')
            rel1_dict.add('.'.join(parts[:-1]))  # everything before the last dot
            rel2_dict.add(parts[-1])             # the final component
    return rel1_dict, rel2_dict
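# A minimal usage sketch (the file path is hypothetical): for a relations
# file containing the line "people.person.place_of_birth", rel1_dict receives
# "people.person" and rel2_dict receives "place_of_birth".
#
#     rel1_dict, rel2_dict = load_rel_separated_dictionary('data/relations.txt')
#     print(len(rel1_dict), len(rel2_dict))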
        # Normalize edit distance to [0, 1]: 1.0 means the predicted subject
        # name matches the original exactly (edit_distance == 0).
        normalized_edit_distance = (max_length - edit_distance) / max_length
        print_data.append({
            'Predicted Subject Name': tagged_subject_name,
            'Original Subject Name': row['subject_name'],
            'Normalized Edit Distance': normalized_edit_distance,
            'Question Tokens': row['question_tokens'],
        })

    # Keep only the rows where the predicted name is an exact match.
    exact_match = [
        d for d in print_data if d['Normalized Edit Distance'] == 1.0
    ]

    # Build the word dictionary, with an extra "<e>" token (presumably an
    # entity placeholder).
    word_vocab = Dictionary()
    word_vocab.add_unk_token()
    word_vocab.add_pad_token()
    word_vocab.add_start_token()
    word_vocab.add_end_token()
    word_vocab.add("<e>")

    # `add_word` (defined elsewhere in this script) adds every question
    # token in a dataframe to `word_vocab`.
    add_word(df_dev)
    add_word(df_test)
    add_word(df_train)

    torch.save(word_vocab, "../../data/vocab/word_vocab.pt")

    # Format and save examples for each split (batch size 128).
    get_formatted_examples(128, '../../data/subject_recognition/dev.pt',
                           df_dev)
    get_formatted_examples(128, '../../data/subject_recognition/test.pt',
                           df_test)
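# A minimal sketch of reloading the saved vocabulary later (the path mirrors
# the torch.save call above; `Dictionary` must be importable at load time):
#
#     import torch
#     word_vocab = torch.load("../../data/vocab/word_vocab.pt")
#     print(len(word_vocab))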