def build_vocab():
    """Collect tokens/labels from the train/dev/test instance lists and build & save all vocabularies."""
    token_list = []
    char_list = []
    tri_type_list = []
    ent_type_list = []
    ent_ref_list = []
    arg_type_list = []
    actions_list = []
    pos_list = []
    tri_word_set = []

    for inst in train_list:
        words = inst['nlp_words']
        tris = inst['Triggers']    # (idx, event_type)
        ents = inst['Entities']    # (start, end, coarse_type, ref_type)
        args = inst['Arguments']   # (ent_start, ent_end, trigger_idx, argument_type)
        pos_list.extend(inst['nlp_pos'])
        for word in words:
            word = normalize_tok(word, lower_case, normalize_digits)
            token_list.append(word)
            char_list.extend(list(word))
        for tri in tris:
            tri_type_list.append(tri[1].lower())
            tri_word_set.append(words[tri[0]])
        for ent in ents:
            ent_type_list.append(ent[2])
            ent_ref_list.append(ent[3])
        collapsed_args = []
        for arg in args:
            collapsed_type = collapse_role_type(arg[3]).lower()
            arg_type_list.append(collapsed_type)
            collapsed_args.append([arg[0], arg[1], arg[2], collapsed_type])
        actions = Actions.make_oracle(words, tris, ents, collapsed_args)
        actions_list.extend(actions)

    train_token_set = set(token_list)

    dev_oo_train_but_in_glove = 0
    for inst in dev_list:
        words = inst['nlp_words']
        tris = inst['Triggers']    # (idx, event_type)
        ents = inst['Entities']    # (start, end, coarse_type, ref_type)
        args = inst['Arguments']   # (ent_start, ent_end, trigger_idx, argument_type)
        pos_list.extend(inst['nlp_pos'])
        for word in words:
            word = normalize_tok(word, lower_case, normalize_digits)
            # only keep dev words that are covered by the pretrained embeddings
            if embedd_dict is not None and (word in embedd_dict or word.lower() in embedd_dict):
                token_list.append(word)
                char_list.extend(list(word))
                if word not in train_token_set:
                    dev_oo_train_but_in_glove += 1
        for tri in tris:
            tri_type_list.append(tri[1].lower())
            tri_word_set.append(words[tri[0]])
        for ent in ents:
            ent_type_list.append(ent[2])
            ent_ref_list.append(ent[3])
        collapsed_args = []
        for arg in args:
            collapsed_type = collapse_role_type(arg[3]).lower()
            arg_type_list.append(collapsed_type)
            collapsed_args.append([arg[0], arg[1], arg[2], collapsed_type])
        actions = Actions.make_oracle(words, tris, ents, collapsed_args)
        actions_list.extend(actions)

    test_oo_train_but_in_glove = 0
    for inst in test_list:
        words = inst['nlp_words']
        tris = inst['Triggers']    # (idx, event_type)
        ents = inst['Entities']    # (start, end, coarse_type, ref_type)
        args = inst['Arguments']   # (ent_start, ent_end, trigger_idx, argument_type)
        pos_list.extend(inst['nlp_pos'])
        for word in words:
            word = normalize_tok(word, lower_case, normalize_digits)
            # only keep test words that are covered by the pretrained embeddings
            if embedd_dict is not None and (word in embedd_dict or word.lower() in embedd_dict):
                token_list.append(word)
                char_list.extend(list(word))
                if word not in train_token_set:
                    test_oo_train_but_in_glove += 1
        for tri in tris:
            tri_type_list.append(tri[1].lower())
            # tri_word_set.append(words[tri[0]])
        for ent in ents:
            ent_type_list.append(ent[2])
            ent_ref_list.append(ent[3])
        collapsed_args = []
        for arg in args:
            collapsed_type = collapse_role_type(arg[3]).lower()
            arg_type_list.append(collapsed_type)
            collapsed_args.append([arg[0], arg[1], arg[2], collapsed_type])
        actions = Actions.make_oracle(words, tris, ents, collapsed_args)
        actions_list.extend(actions)

    print('dev_oo_train_but_in_glove : ', dev_oo_train_but_in_glove)
    print('test_oo_train_but_in_glove : ', test_oo_train_but_in_glove)

    print('--------token_vocab---------------')
    token_vocab = Vocab()
    token_vocab.add_spec_toks(unk_tok=True, pad_tok=False)
    token_vocab.add_counter(Counter(token_list))
    token_vocab.save(token_vocab_file)
    print(token_vocab)

    print('--------char_vocab---------------')
    char_vocab = Vocab()
    char_vocab.add_spec_toks(unk_tok=True, pad_tok=False)
    char_vocab.add_counter(Counter(char_list))
    char_vocab.save(char_vocab_file)
    print(char_vocab)

    print('--------ent_type_vocab---------------')
    ent_type_vocab = Vocab()
    ent_type_vocab.add_spec_toks(pad_tok=False, unk_tok=False)
    ent_type_vocab.add_counter(Counter(ent_type_list))
    ent_type_vocab.save(ent_type_vocab_file)
    print(ent_type_vocab)

    print('--------ent_ref_vocab---------------')
    ent_ref_vocab = Vocab()
    ent_ref_vocab.add_spec_toks(pad_tok=False, unk_tok=False)
    ent_ref_vocab.add_counter(Counter(ent_ref_list))
    ent_ref_vocab.save(ent_ref_vocab_file)
    print(ent_ref_vocab)

    print('--------tri_type_vocab---------------')
    tri_type_vocab = Vocab()
    tri_type_vocab.add_spec_toks(pad_tok=False, unk_tok=False, null_tok=True)
    tri_type_vocab.add_counter(Counter(tri_type_list))
    tri_type_vocab.save(tri_type_vocab_file)
    print(tri_type_vocab)

    print('--------arg_type_vocab---------------')
    arg_type_vocab = Vocab()
    arg_type_vocab.add_spec_toks(pad_tok=False, unk_tok=False, null_tok=True)
    arg_type_vocab.add_counter(Counter(arg_type_list))
    arg_type_vocab.save(arg_type_vocab_file)
    print(arg_type_vocab)

    print('--------action_vocab---------------')
    action_vocab = Vocab()
    action_vocab.add_spec_toks(pad_tok=False, unk_tok=False)
    action_vocab.add_counter(Counter(actions_list))
    action_vocab.save(action_vocab_file)
    print(action_vocab)

    print('--------pos_vocab---------------')
    pos_vocab = Vocab()
    pos_vocab.add_spec_toks(pad_tok=False, unk_tok=False)
    pos_vocab.add_counter(Counter(pos_list))
    pos_vocab.save(pos_vocab_file)
    print(pos_vocab)
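
# ---------------------------------------------------------------------------
# Hypothetical driver sketch (not part of the original script). build_vocab()
# above reads module-level globals rather than arguments: train_list /
# dev_list / test_list, embedd_dict, lower_case, normalize_digits and the
# *_vocab_file paths all have to exist before it is called. The loader names
# used here (load_ace_insts, load_embedding_dict) are illustrative
# assumptions, not functions defined in this repo.
#
# if __name__ == '__main__':
#     train_list = load_ace_insts('data/train.json')           # hypothetical loader
#     dev_list = load_ace_insts('data/dev.json')               # hypothetical loader
#     test_list = load_ace_insts('data/test.json')             # hypothetical loader
#     embedd_dict = load_embedding_dict('glove.6B.100d.txt')   # hypothetical loader
#     build_vocab()
# ---------------------------------------------------------------------------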
def construct_instance(inst_list, token_vocab, char_vocab, pos_vocab, dep_vocab, is_train=True):
    """Index words/chars, run the dependency parser, and re-align aspect/opinion
    spans to the parser's tokenization."""
    word_num = 0
    processed_inst_list = []
    for inst in tqdm(inst_list, total=len(inst_list)):
        words = inst['words']
        aspects = inst['aspects_idx']
        opinions = inst['opinions_idx']
        pair_idx = inst['pair_idx']
        if is_train and len(pair_idx) == 0:
            continue

        words_processed = []
        word_indices = []
        char_indices = []
        for word in words:
            word = normalize_tok(word, lower_case, normalize_digits)
            words_processed.append(word)
            word_idx = token_vocab.get_index(word)
            word_indices.append(word_idx)
            char_indices.append([char_vocab.get_index(c) for c in word])
        inst['words'] = words_processed
        inst['word_indices'] = word_indices
        inst['char_indices'] = char_indices

        temp_parser_res = depparser.parse(words_processed)
        parser_res = []
        for i in temp_parser_res:
            temp = i.to_conll(4).strip().split('\n')
            for t in temp:
                parser_res.append(t.split('\t'))

        # The parser may re-tokenize the sentence (e.g. bracket splitting), so
        # keep its token sequence when it is longer than the source one.
        if len(parser_res) > len(inst['words']):
            words = [a[0] for a in parser_res]
            inst['words'] = words
            # print("source text: ", inst['words'])
            # print("new text: ", words)

        # Map every source token index to the parser token indices it covers.
        s_to_t = {}
        i = j = 0
        while i < len(inst['words']):
            if inst['words'][i] == words[j]:
                s_to_t[i] = [j]
                i += 1
                j += 1
            else:
                s_to_t[i] = []
                if i + 1 > len(inst['words']) - 1:
                    s_to_t[i] = [x for x in range(j, len(words))]
                else:
                    next_token = inst['words'][i + 1]
                    while (words[j] != '-RRB-' and words[j] != next_token
                           and words[j] not in next_token and j <= len(words) - 1):
                        s_to_t[i].append(j)
                        j += 1
                i += 1

        def get_new_term(old_term):
            new_term = []
            for i in old_term:
                temp = []
                for j in i:
                    temp.extend(s_to_t[j])
                new_term.append(temp)
            return new_term

        new_aspects = get_new_term(aspects)
        new_opinions = get_new_term(opinions)
        inst['aspects_idx'] = new_aspects
        inst['opinions_idx'] = new_opinions

        new_pairs = []
        for p in pair_idx:
            new_p_a = []
            for a in p[0]:
                new_p_a.extend(s_to_t[a])
            new_p_o = []
            for a in p[1]:
                new_p_o.extend(s_to_t[a])
            new_pairs.append((new_p_a, new_p_o))
        inst['pair_idx'] = new_pairs

        inst['dep_label'] = [a[3] for a in parser_res]
        inst['dep_label_indices'] = [dep_vocab.get_index(a[3]) for a in parser_res]
        inst['dep'] = [a[2] for a in parser_res]
        inst['tag_type'] = [a[1] for a in parser_res]
        inst['tag_type_indices'] = [pos_vocab.get_index(a[1]) for a in parser_res]
        inst['sent_range'] = list(range(word_num, word_num + len(words)))
        word_num += len(words)
        processed_inst_list.append(inst)
    return processed_inst_list
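
# The alignment logic above exists because the dependency parser may
# re-tokenize the input (e.g. splitting brackets into -LRB-/-RRB-), so
# aspect/opinion indices given over the source tokens must be mapped onto the
# parser's tokens. The helper below is a small illustrative sketch (not part
# of the original file): it hard-codes a plausible s_to_t map and shows how a
# source-side span is expanded, mirroring what get_new_term() and the pair
# remapping do.
def _demo_span_remap():
    # Toy example: source tokens ['(great)', 'food'] re-tokenized by the
    # parser into ['-LRB-', 'great', '-RRB-', 'food'].
    s_to_t = {0: [0, 1, 2], 1: [3]}
    opinion_span = [0]                      # span over the source tokens
    remapped = [t for s in opinion_span for t in s_to_t[s]]
    assert remapped == [0, 1, 2]            # same span over the parser tokens
    return remapped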
def construct_instance(inst_list, token_vocab, char_vocab, ent_type_vocab, ent_ref_vocab,
                       tri_type_vocab, arg_type_vocab, action_vocab, pos_vocab, is_train=True):
    """Index words, chars, POS tags, triggers, entities and arguments, and attach the oracle action sequence."""
    word_num = 0
    processed_inst_list = []
    sample_sent_total = 2000
    sample_sent_num = 0
    for inst in inst_list:
        words = inst['nlp_words']
        tris = inst['Triggers']    # (idx, event_type)
        ents = inst['Entities']    # (start, end, coarse_type, ref_type)
        args = inst['Arguments']   # (ent_start, ent_end, trigger_idx, argument_type)
        pos = inst['nlp_pos']
        deps = inst['nlp_deps']

        # if is_train and len(tris) == 0:
        #     if len(ents) > 0 and sample_sent_num < sample_sent_total:
        #         sample_sent_num += 1
        #     else:
        #         continue

        # Empirically filter out training sentences that have no events and
        # fewer than 3 entities.
        if is_train and len(tris) == 0 and len(ents) < 3:
            continue

        words_processed = []
        word_indices = []
        char_indices = []
        for word in words:
            word = normalize_tok(word, lower_case, normalize_digits)
            words_processed.append(word)
            word_idx = token_vocab.get_index(word)
            word_indices.append(word_idx)
            char_indices.append([char_vocab.get_index(c) for c in word])

        del inst['Sent']
        inst['words'] = words_processed
        inst['word_indices'] = word_indices
        inst['char_indices'] = char_indices
        inst['pos_indices'] = [pos_vocab.get_index(p) for p in pos]
        inst['tri_indices'] = [[tri[0], tri_type_vocab.get_index(tri[1].lower())] for tri in tris]
        inst['ent_indices'] = [[ent[0], ent[1], ent_type_vocab.get_index(ent[2]),
                                ent_ref_vocab.get_index(ent[3])] for ent in ents]

        collapsed_args = []
        for arg in args:
            collapsed_type = collapse_role_type(arg[3]).lower()
            collapsed_args.append([arg[0], arg[1], arg[2], collapsed_type])
        inst['Arguments'] = collapsed_args
        inst['arg_indices'] = [[arg[0], arg[1], arg[2], arg_type_vocab.get_index(arg[3])]
                               for arg in collapsed_args]

        actions = Actions.make_oracle(words, tris, ents, collapsed_args)
        inst['actions'] = actions
        inst['action_indices'] = [action_vocab.get_index(act) for act in actions]

        inst['sent_range'] = list(range(word_num, word_num + len(words)))
        word_num += len(words)
        processed_inst_list.append(inst)
    return processed_inst_list
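
# Sketch of the per-sentence schema consumed by the event-side
# construct_instance() above. The concrete values are toy assumptions in the
# style of ACE-2005 preprocessing (span-end convention assumed exclusive),
# not data taken from the repo.
_example_event_inst = {
    'Sent': 'He was fired by the company .',
    'nlp_words': ['He', 'was', 'fired', 'by', 'the', 'company', '.'],
    'nlp_pos': ['PRP', 'VBD', 'VBN', 'IN', 'DT', 'NN', '.'],
    'nlp_deps': [],                         # dependency info; read but unused above
    'Triggers': [(2, 'End-Position')],      # (idx, event_type)
    'Entities': [(0, 1, 'PER', 'PRO'),      # (start, end, coarse_type, ref_type)
                 (4, 6, 'ORG', 'NOM')],
    'Arguments': [(0, 1, 2, 'Person'),      # (ent_start, ent_end, trigger_idx, argument_type)
                  (4, 6, 2, 'Entity')],
}
# After construct_instance(), each kept instance additionally carries
# 'word_indices', 'char_indices', 'pos_indices', 'tri_indices', 'ent_indices',
# 'arg_indices', the oracle 'actions' / 'action_indices', and 'sent_range'.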
def build_vocab(train_list, dev_list, test_list, data_type='lap14'):
    token_list = []
    char_list = []
    aspects_list = []
    opinions_list = []
    pos_list = []
    dep_list = []

    for inst in tqdm(train_list, total=len(train_list)):
        words = inst['words']
        aspects = inst['aspects_idx']    # idx, prds_type
        opinions = inst['opinions_idx']  # arg_id, prd_id, role
        try:
            temp_parser_res = depparser.parse([normalize_tok(w) for w in words])
        except:
            print(words)
            print([normalize_tok(w) for w in words])
            exit(0)
        parser_res = []
        for i in temp_parser_res:
            temp = i.to_conll(4).strip().split('\n')
            for t in temp:
                parser_res.append(t.split('\t'))
        pos_list.extend([a[1] for a in parser_res])
        dep_list.extend([a[3] for a in parser_res])
        for word in words:
            word = normalize_tok(word, lower_case, normalize_digits)
            # if embedd_dict is not None and (word in embedd_dict or word.lower() in embedd_dict):
            token_list.append(word)
            char_list.extend(list(word))
        aspects_list.extend(aspects)
        opinions_list.extend(opinions)

    for inst in tqdm(dev_list, total=len(dev_list)):
        words = inst['words']
        aspects = inst['aspects_idx']    # idx, prds_type
        opinions = inst['opinions_idx']  # arg_id, prd_id, role
        temp_parser_res = depparser.parse([normalize_tok(w) for w in words])
        parser_res = []
        for i in temp_parser_res:
            temp = i.to_conll(4).strip().split('\n')
            for t in temp:
                parser_res.append(t.split('\t'))
        pos_list.extend([a[1] for a in parser_res])
        dep_list.extend([a[3] for a in parser_res])
        for word in words:
            word = normalize_tok(word, lower_case, normalize_digits)
            token_list.append(word)
            char_list.extend(list(word))
        aspects_list.extend(aspects)
        opinions_list.extend(opinions)

    for inst in tqdm(test_list, total=len(test_list)):
        words = inst['words']
        aspects = inst['aspects_idx']    # idx, prds_type
        opinions = inst['opinions_idx']  # arg_id, prd_id, role
        temp_parser_res = depparser.parse([normalize_tok(w) for w in words])
        parser_res = []
        for i in temp_parser_res:
            temp = i.to_conll(4).strip().split('\n')
            for t in temp:
                parser_res.append(t.split('\t'))
        pos_list.extend([a[1] for a in parser_res])
        dep_list.extend([a[3] for a in parser_res])
        for word in words:
            word = normalize_tok(word, lower_case, normalize_digits)
            # if embedd_dict is not None and (word in embedd_dict or word.lower() in embedd_dict):
            token_list.append(word)
            char_list.extend(list(word))
        aspects_list.extend(aspects)
        opinions_list.extend(opinions)

    token_vocab_file = os.path.join(vocab_dir, data_type, config['token_vocab_file'])
    char_vocab_file = os.path.join(vocab_dir, data_type, config['char_vocab_file'])
    pos_vocab_file = os.path.join(vocab_dir, data_type, config['pos_vocab_file'])
    dep_type_vocab_file = os.path.join(vocab_dir, data_type, config['dep_type_vocab_file'])

    print('--------token_vocab---------------')
    token_vocab = Vocab()
    token_vocab.add_spec_toks(unk_tok=True, pad_tok=False)
    token_vocab.add_counter(Counter(token_list))
    token_vocab.save(token_vocab_file)
    print(token_vocab)

    print('--------char_vocab---------------')
    char_vocab = Vocab()
    char_vocab.add_spec_toks(unk_tok=True, pad_tok=False)
    char_vocab.add_counter(Counter(char_list))
    char_vocab.save(char_vocab_file)
    print(char_vocab)

    print('--------pos_vocab---------------')
    pos_vocab = Vocab()
    pos_vocab.add_spec_toks(pad_tok=True, unk_tok=True)
    pos_vocab.add_counter(Counter(pos_list))
    pos_vocab.save(pos_vocab_file)
    print(pos_vocab)

    print('--------dep_vocab---------------')
    dep_vocab = Vocab()
    dep_vocab.add_spec_toks(pad_tok=True, unk_tok=True)
    dep_vocab.add_counter(Counter(dep_list))
    dep_vocab.save(dep_type_vocab_file)
    print(dep_vocab)
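
# ---------------------------------------------------------------------------
# Hypothetical end-to-end sketch for the ASTE side (not in the original file).
# The ASTE build_vocab() / construct_instance() above additionally rely on the
# module-level depparser, vocab_dir, config, lower_case and normalize_digits.
# The loader name load_aste_insts() and the file paths are illustrative
# assumptions; Vocab and the two functions above are the ones defined in this
# repo.
#
# if __name__ == '__main__':
#     train_list = load_aste_insts('data/lap14/train.json')   # hypothetical loader
#     dev_list = load_aste_insts('data/lap14/dev.json')        # hypothetical loader
#     test_list = load_aste_insts('data/lap14/test.json')      # hypothetical loader
#     build_vocab(train_list, dev_list, test_list, data_type='lap14')
#     # reload (or reuse) the saved vocabs, then index the instances, e.g.:
#     # train_insts = construct_instance(train_list, token_vocab, char_vocab,
#     #                                  pos_vocab, dep_vocab, is_train=True)
# ---------------------------------------------------------------------------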