Example #1
def build_vocab():
    token_list = []
    char_list = []
    tri_type_list = []
    ent_type_list = []
    ent_ref_list = []
    arg_type_list = []
    actions_list = []
    pos_list = []

    tri_word_set = []  # trigger surface forms seen in train/dev (kept as a list despite the name)
    for inst in train_list:
        words = inst['nlp_words']
        tris = inst['Triggers'] # (idx, event_type)
        ents = inst['Entities'] # (start, end, coarse_type, ref_type)
        args = inst['Arguments'] # (ent_start, ent_end, trigger_idx, argument_type)
        pos_list.extend(inst['nlp_pos'])

        for word in words:
            word = normalize_tok(word, lower_case, normalize_digits)
            token_list.append(word)
            char_list.extend(list(word))

        for tri in tris:
            tri_type_list.append(tri[1].lower())
            tri_word_set.append(words[tri[0]])

        for ent in ents:
            ent_type_list.append(ent[2])
            ent_ref_list.append(ent[3])

        collapsed_args = []
        for arg in args:
            collapsed_type = collapse_role_type(arg[3]).lower()
            arg_type_list.append(collapsed_type)
            collapsed_args.append([arg[0], arg[1], arg[2], collapsed_type])

        actions = Actions.make_oracle(words, tris, ents, collapsed_args)
        actions_list.extend(actions)

    train_token_set = set(token_list)

    dev_oo_train_but_in_glove = 0
    for inst in dev_list:
        words = inst['nlp_words']
        tris = inst['Triggers']  # (idx, event_type)
        ents = inst['Entities']  # (start, end, coarse_type, ref_type)
        args = inst['Arguments']  # (ent_start, ent_end, trigger_idx, argument_type)
        pos_list.extend(inst['nlp_pos'])

        for word in words:
            word = normalize_tok(word, lower_case, normalize_digits)
            if embedd_dict is not None and (word in embedd_dict or word.lower() in embedd_dict):
                token_list.append(word)
                char_list.extend(list(word))
                if word not in train_token_set:
                    dev_oo_train_but_in_glove += 1

        for tri in tris:
            tri_type_list.append(tri[1].lower())
            tri_word_set.append(words[tri[0]])

        for ent in ents:
            ent_type_list.append(ent[2])
            ent_ref_list.append(ent[3])

        collapsed_args = []
        for arg in args:
            collapsed_type = collapse_role_type(arg[3]).lower()
            arg_type_list.append(collapsed_type)
            collapsed_args.append([arg[0], arg[1], arg[2], collapsed_type])

        actions = Actions.make_oracle(words, tris, ents, collapsed_args)
        actions_list.extend(actions)

    test_oo_train_but_in_glove = 0
    for inst in test_list:
        words = inst['nlp_words']
        tris = inst['Triggers']  # (idx, event_type)
        ents = inst['Entities']  # (start, end, coarse_type, ref_type)
        args = inst['Arguments']  # (ent_start, ent_end, trigger_idx, argument_type)
        pos_list.extend(inst['nlp_pos'])

        for word in words:
            word = normalize_tok(word, lower_case, normalize_digits)
            if embedd_dict is not None and (word in embedd_dict or word.lower() in embedd_dict):
                token_list.append(word)
                char_list.extend(list(word))
                if word not in train_token_set:
                    test_oo_train_but_in_glove += 1

        for tri in tris:
            tri_type_list.append(tri[1].lower())
            #tri_word_set.append((words[tri[0]]))

        for ent in ents:
            ent_type_list.append(ent[2])
            ent_ref_list.append(ent[3])

        collapsed_args = []
        for arg in args:
            collapsed_type = collapse_role_type(arg[3]).lower()
            arg_type_list.append(collapsed_type)
            collapsed_args.append([arg[0], arg[1], arg[2], collapsed_type])

        actions = Actions.make_oracle(words, tris, ents, collapsed_args)
        actions_list.extend(actions)

    print('dev_oo_train_but_in_glove : ', dev_oo_train_but_in_glove)
    print('test_oo_train_but_in_glove : ', test_oo_train_but_in_glove)

    print('--------token_vocab---------------')
    token_vocab = Vocab()
    token_vocab.add_spec_toks(unk_tok=True, pad_tok=False)
    token_vocab.add_counter(Counter(token_list))
    token_vocab.save(token_vocab_file)
    print(token_vocab)

    print('--------char_vocab---------------')
    char_vocab = Vocab()
    char_vocab.add_spec_toks(unk_tok=True, pad_tok=False)
    char_vocab.add_counter(Counter(char_list))
    char_vocab.save(char_vocab_file)
    print(char_vocab)

    print('--------ent_type_vocab---------------')
    ent_type_vocab = Vocab()
    ent_type_vocab.add_spec_toks(pad_tok=False, unk_tok=False)
    ent_type_vocab.add_counter(Counter(ent_type_list))
    ent_type_vocab.save(ent_type_vocab_file)
    print(ent_type_vocab)

    print('--------ent_ref_vocab---------------')
    ent_ref_vocab = Vocab()
    ent_ref_vocab.add_spec_toks(pad_tok=False, unk_tok=False)
    ent_ref_vocab.add_counter(Counter(ent_ref_list))
    ent_ref_vocab.save(ent_ref_vocab_file)
    print(ent_ref_vocab)

    print('--------tri_type_vocab---------------')
    tri_type_vocab = Vocab()
    tri_type_vocab.add_spec_toks(pad_tok=False, unk_tok=False, null_tok=True)
    tri_type_vocab.add_counter(Counter(tri_type_list))
    tri_type_vocab.save(tri_type_vocab_file)
    print(tri_type_vocab)

    print('--------arg_type_vocab---------------')
    arg_type_vocab = Vocab()
    arg_type_vocab.add_spec_toks(pad_tok=False, unk_tok=False, null_tok=True)
    arg_type_vocab.add_counter(Counter(arg_type_list))
    arg_type_vocab.save(arg_type_vocab_file)
    print(arg_type_vocab)

    print('--------action_vocab---------------')
    action_vocab = Vocab()
    action_vocab.add_spec_toks(pad_tok=False, unk_tok=False)
    action_vocab.add_counter(Counter(actions_list))
    action_vocab.save(action_vocab_file)
    print(action_vocab)

    print('--------pos_vocab---------------')
    pos_vocab = Vocab()
    pos_vocab.add_spec_toks(pad_tok=False, unk_tok=False)
    pos_vocab.add_counter(Counter(pos_list))
    pos_vocab.save(pos_vocab_file)
    print(pos_vocab)
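The Vocab class that all of these examples rely on is not shown here. As a rough, self-contained sketch of the interface the calls above imply (add_spec_toks, add_counter, get_index, save), something like the class below would behave compatibly; the special-token strings, index order, and JSON save format are assumptions for illustration only, not the original implementation.

import json
from collections import Counter

class SketchVocab:
    """Minimal stand-in for the Vocab class used in these examples (assumed interface)."""

    def __init__(self):
        self.tok2idx = {}
        self.idx2tok = []
        self.unk_idx = None

    def _add(self, tok):
        if tok not in self.tok2idx:
            self.tok2idx[tok] = len(self.idx2tok)
            self.idx2tok.append(tok)
        return self.tok2idx[tok]

    def add_spec_toks(self, unk_tok=False, pad_tok=False, null_tok=False):
        # Reserve special tokens at the front of the index space.
        if pad_tok:
            self._add('<pad>')
        if unk_tok:
            self.unk_idx = self._add('<unk>')
        if null_tok:
            self._add('<null>')

    def add_counter(self, counter):
        # Add items in frequency order so indices are deterministic.
        for tok, _ in counter.most_common():
            self._add(tok)

    def get_index(self, tok):
        # Fall back to the <unk> index for out-of-vocabulary items.
        return self.tok2idx.get(tok, self.unk_idx)

    def save(self, path):
        with open(path, 'w', encoding='utf-8') as f:
            json.dump(self.idx2tok, f, ensure_ascii=False)

    def __str__(self):
        return f'vocab size: {len(self.idx2tok)}'

# usage sketch
vocab = SketchVocab()
vocab.add_spec_toks(unk_tok=True, pad_tok=False)
vocab.add_counter(Counter(['the', 'the', 'cat']))
print(vocab.get_index('the'), vocab.get_index('dog'))  # 1 0  ('dog' falls back to <unk>)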
Example #2
def construct_instance(inst_list,
                       token_vocab,
                       char_vocab,
                       pos_vocab,
                       dep_vocab,
                       is_train=True):
    word_num = 0
    processed_inst_list = []
    for inst in tqdm(inst_list, total=len(inst_list)):

        words = inst['words']
        aspects = inst['aspects_idx']
        opinions = inst['opinions_idx']
        pair_idx = inst['pair_idx']

        if is_train and len(pair_idx) == 0:
            continue

        words_processed = []
        word_indices = []
        char_indices = []
        for word in words:
            word = normalize_tok(word, lower_case, normalize_digits)
            words_processed.append(word)
            word_idx = token_vocab.get_index(word)
            word_indices.append(word_idx)
            char_indices.append([char_vocab.get_index(c) for c in word])

        inst['words'] = words_processed
        inst['word_indices'] = word_indices
        inst['char_indices'] = char_indices

        temp_parser_res = depparser.parse(words_processed)
        parser_res = []
        for parse in temp_parser_res:
            rows = parse.to_conll(4).strip().split('\n')
            for row in rows:
                # each row holds the to_conll(4) columns: word, tag, head, relation
                parser_res.append(row.split('\t'))
        if len(parser_res) > len(inst['words']):
            # The parser re-tokenized the sentence: build s_to_t, a mapping from
            # each original token index to the new token indices it covers.
            old_words = inst['words']
            words = [a[0] for a in parser_res]
            # print("source text: ", old_words)
            # print("new text: ", words)
            s_to_t = {}
            i = j = 0
            while i < len(old_words):
                if old_words[i] == words[j]:
                    s_to_t[i] = [j]
                    i += 1
                    j += 1
                else:
                    s_to_t[i] = []
                    if i + 1 > len(old_words) - 1:
                        # Last original token: it absorbs all remaining new tokens.
                        s_to_t[i] = list(range(j, len(words)))
                    else:
                        next_token = old_words[i + 1]
                        # Consume new tokens until the next original token (or a
                        # closing bracket) is reached; check bounds before indexing.
                        while (j <= len(words) - 1 and words[j] != '-RRB-'
                               and words[j] != next_token
                               and words[j] not in next_token):
                            s_to_t[i].append(j)
                            j += 1
                    i += 1
            inst['words'] = words

            def get_new_term(old_term):
                new_term = []
                for i in old_term:
                    temp = []
                    for j in i:
                        temp.extend(s_to_t[j])
                    new_term.append(temp)
                return new_term

            new_aspects = get_new_term(aspects)
            new_opinions = get_new_term(opinions)
            inst['aspects_idx'] = new_aspects
            inst['opinions_idx'] = new_opinions

            new_pairs = []
            for p in pair_idx:
                new_p_a = []
                for a in p[0]:
                    new_p_a.extend(s_to_t[a])

                new_p_o = []
                for a in p[1]:
                    new_p_o.extend(s_to_t[a])
                new_pairs.append((new_p_a, new_p_o))
            inst['pair_idx'] = new_pairs

        inst['dep_label'] = [a[3] for a in parser_res]
        inst['dep_label_indices'] = [
            dep_vocab.get_index(a[3]) for a in parser_res
        ]
        inst['dep'] = [a[2] for a in parser_res]

        inst['tag_type'] = [a[1] for a in parser_res]
        inst['tag_type_indices'] = [
            pos_vocab.get_index(a[1]) for a in parser_res
        ]

        inst['sent_range'] = list(range(word_num, word_num + len(words)))
        word_num += len(words)
        processed_inst_list.append(inst)

    return processed_inst_list
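The inline index-remapping above (s_to_t) is dense, so here is a simplified, self-contained sketch of the same idea, assuming the only change the parser makes is splitting some source tokens into consecutive pieces whose concatenation equals the original token (the -RRB- special case in the original is omitted). The function name and example data are illustrative only.

def align_old_to_new(old_tokens, new_tokens):
    """Map each old token index to the new token indices it was split into.

    Simplified sketch: assumes new_tokens equals old_tokens except that some
    tokens were split into pieces that concatenate back to the original token.
    """
    mapping = {}
    j = 0
    for i, old in enumerate(old_tokens):
        mapping[i] = []
        piece = ''
        while j < len(new_tokens) and piece != old:
            mapping[i].append(j)
            piece += new_tokens[j]
            j += 1
    return mapping

# Example: the tokenizer splits "can't" into "ca" + "n't".
print(align_old_to_new(["I", "can't", "go"], ["I", "ca", "n't", "go"]))
# {0: [0], 1: [1, 2], 2: [3]}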
Example #3
def construct_instance(inst_list, token_vocab, char_vocab, ent_type_vocab,
                       ent_ref_vocab, tri_type_vocab, arg_type_vocab, action_vocab,
                       pos_vocab, is_train=True):
    word_num = 0
    processed_inst_list = []
    sample_sent_total = 2000
    sample_sent_num = 0
    for inst in inst_list:
        words = inst['nlp_words']
        tris = inst['Triggers'] # (idx, event_type)
        ents = inst['Entities'] # (start, end, coarse_type, ref_type)
        args = inst['Arguments'] # (ent_start, ent_end, trigger_idx, argument_type)
        pos = inst['nlp_pos']
        deps = inst['nlp_deps']

        # if is_train and len(tris) == 0:
        #     if len(ents) > 0 and sample_sent_num < sample_sent_total:
        #         sample_sent_num += 1
        #     else:
        #         continue

        # Empirically filter out training sentences that contain no events and fewer than 3 entities.
        if is_train and len(tris) == 0 and len(ents) < 3:
            continue

        words_processed = []
        word_indices = []
        char_indices = []
        for word in words:
            word = normalize_tok(word, lower_case, normalize_digits)
            words_processed.append(word)
            word_idx = token_vocab.get_index(word)
            word_indices.append(word_idx)
            char_indices.append([char_vocab.get_index(c) for c in word])

        del inst['Sent']
        inst['words'] = words_processed
        inst['word_indices'] = word_indices
        inst['char_indices'] = char_indices

        inst['pos_indices'] = [pos_vocab.get_index(p) for p in pos]

        inst['tri_indices'] = [[tri[0], tri_type_vocab.get_index(tri[1].lower())] for tri in tris]

        inst['ent_indices'] = [[ent[0], ent[1], ent_type_vocab.get_index(ent[2]),
                                ent_ref_vocab.get_index(ent[3])] for ent in ents]

        collapsed_args = []
        for arg in args:
            collapsed_type = collapse_role_type(arg[3]).lower()
            collapsed_args.append([arg[0], arg[1], arg[2], collapsed_type])
        inst['Arguments'] = collapsed_args

        inst['arg_indices'] = [[arg[0], arg[1], arg[2], arg_type_vocab.get_index(arg[3])]
                               for arg in collapsed_args]

        actions = Actions.make_oracle(words, tris, ents, collapsed_args)
        inst['actions'] = actions
        inst['action_indices'] = [action_vocab.get_index(act) for act in actions]

        inst['sent_range'] = list(range(word_num, word_num + len(words)))
        word_num += len(words)
        processed_inst_list.append(inst)

    return processed_inst_list
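Every example above runs tokens through normalize_tok(word, lower_case, normalize_digits) before vocabulary lookup, but that helper is not included. A plausible minimal version, assuming the usual preprocessing of optional lower-casing and collapsing digits to '0', is sketched below; the actual helper used by this code may differ.

import re

def normalize_tok_sketch(tok, lower_case=True, normalize_digits=True):
    # Hypothetical stand-in for the normalize_tok helper used above:
    # optionally lower-case the token and map every digit to '0'.
    if lower_case:
        tok = tok.lower()
    if normalize_digits:
        tok = re.sub(r'\d', '0', tok)
    return tok

print(normalize_tok_sketch('July-2023'))  # 'july-0000'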
Example #4
def build_vocab(train_list, dev_list, test_list, data_type='lap14'):
    token_list = []
    char_list = []

    aspects_list = []
    opinions_list = []
    pos_list = []
    dep_list = []
    for inst in tqdm(train_list, total=len(train_list)):
        words = inst['words']
        aspects = inst['aspects_idx']  # list of aspect-term token-index spans
        opinions = inst['opinions_idx']  # list of opinion-term token-index spans

        try:
            temp_parser_res = depparser.parse(
                [normalize_tok(w) for w in words])
        except Exception:
            # Dump the offending sentence before aborting so it can be inspected.
            print(words)
            print([normalize_tok(w) for w in words])
            exit(1)
        parser_res = []
        for i in temp_parser_res:
            temp = i.to_conll(4).strip().split('\n')
            for t in temp:
                parser_res.append(t.split('\t'))
        pos_list.extend([a[1] for a in parser_res])
        dep_list.extend([a[3] for a in parser_res])

        for word in words:
            word = normalize_tok(word, lower_case, normalize_digits)
            # if embedd_dict is not None and (word in embedd_dict or word.lower() in embedd_dict):
            token_list.append(word)
            char_list.extend(list(word))

        aspects_list.extend(aspects)
        opinions_list.extend(opinions)

    for inst in tqdm(dev_list, total=len(dev_list)):
        words = inst['words']
        aspects = inst['aspects_idx']  # list of aspect-term token-index spans
        opinions = inst['opinions_idx']  # list of opinion-term token-index spans

        temp_parser_res = depparser.parse([normalize_tok(w) for w in words])
        parser_res = []
        for i in temp_parser_res:
            temp = i.to_conll(4).strip().split('\n')
            for t in temp:
                parser_res.append(t.split('\t'))
        pos_list.extend([a[1] for a in parser_res])
        dep_list.extend([a[3] for a in parser_res])

        for word in words:
            word = normalize_tok(word, lower_case, normalize_digits)
            token_list.append(word)
            char_list.extend(list(word))

        aspects_list.extend(aspects)
        opinions_list.extend(opinions)

    for inst in tqdm(test_list, total=len(test_list)):
        words = inst['words']
        aspects = inst['aspects_idx']  # list of aspect-term token-index spans
        opinions = inst['opinions_idx']  # list of opinion-term token-index spans

        temp_parser_res = depparser.parse([normalize_tok(w) for w in words])

        parser_res = []
        for i in temp_parser_res:
            temp = i.to_conll(4).strip().split('\n')
            for t in temp:
                parser_res.append(t.split('\t'))
        pos_list.extend([a[1] for a in parser_res])
        dep_list.extend([a[3] for a in parser_res])

        for word in words:
            word = normalize_tok(word, lower_case, normalize_digits)
            # if embedd_dict is not None and (word in embedd_dict or word.lower() in embedd_dict):
            token_list.append(word)
            char_list.extend(list(word))

        aspects_list.extend(aspects)
        opinions_list.extend(opinions)

    token_vocab_file = os.path.join(vocab_dir, data_type,
                                    config['token_vocab_file'])
    char_vocab_file = os.path.join(vocab_dir, data_type,
                                   config['char_vocab_file'])
    pos_vocab_file = os.path.join(vocab_dir, data_type,
                                  config['pos_vocab_file'])
    dep_type_vocab_file = os.path.join(vocab_dir, data_type,
                                       config['dep_type_vocab_file'])

    print('--------token_vocab---------------')
    token_vocab = Vocab()
    token_vocab.add_spec_toks(unk_tok=True, pad_tok=False)
    token_vocab.add_counter(Counter(token_list))
    token_vocab.save(token_vocab_file)
    print(token_vocab)

    print('--------char_vocab---------------')
    char_vocab = Vocab()
    char_vocab.add_spec_toks(unk_tok=True, pad_tok=False)
    char_vocab.add_counter(Counter(char_list))
    char_vocab.save(char_vocab_file)
    print(char_vocab)

    print('--------pos_vocab---------------')
    pos_vocab = Vocab()
    pos_vocab.add_spec_toks(pad_tok=True, unk_tok=True)
    pos_vocab.add_counter(Counter(pos_list))
    pos_vocab.save(pos_vocab_file)
    print(pos_vocab)

    print('--------dep_vocab---------------')
    dep_vocab = Vocab()
    dep_vocab.add_spec_toks(pad_tok=True, unk_tok=True)
    dep_vocab.add_counter(Counter(dep_list))
    dep_vocab.save(dep_type_vocab_file)
    print(dep_vocab)
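Examples #2 and #4 both flatten the dependency parse with to_conll(4) and then index into fixed columns. Judging from how the code reads a[1] as the POS tag, a[2] as the head, and a[3] as the dependency relation, the four tab-separated columns are word, tag, head, relation (as in NLTK's DependencyGraph.to_conll(4)). The short sketch below replays that column handling on made-up parser output; the sentence and tags are illustrative only.

# Illustrative (made-up) to_conll(4)-style parser output:
# four tab-separated columns per token: word, POS tag, head index, relation.
conll4 = "dogs\tNNS\t2\tnsubj\nbark\tVBP\t0\troot\nloudly\tRB\t2\tadvmod"

parser_res = [line.split('\t') for line in conll4.strip().split('\n')]
pos_tags = [row[1] for row in parser_res]    # ['NNS', 'VBP', 'RB']
dep_heads = [row[2] for row in parser_res]   # ['2', '0', '2']
dep_labels = [row[3] for row in parser_res]  # ['nsubj', 'root', 'advmod']
print(pos_tags, dep_heads, dep_labels)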