def extract_childTrigger2parents_vec(valid_pairs):
    # Collect, for each child trigger, the content words of all its parent events.
    childTrigger2parents_words = {}
    for pair in valid_pairs:
        eventpair = EventPair(pair, -1)
        if eventpair.event2_trigger not in childTrigger2parents_words:
            childTrigger2parents_words[eventpair.event2_trigger] = []
        for w in eventpair.event1.split():
            if w in ["<", ">"] or w in invalid_words or w in B_light_verbs:
                continue
            childTrigger2parents_words[eventpair.event2_trigger].append(
                w.replace("[", "").replace("]", ""))

    # Average the 300-d embeddings of the collected parent words per child trigger.
    childTrigger2parents_vec = {}
    for childTrigger in childTrigger2parents_words:
        vec = np.zeros(300)
        count = 0
        for w in childTrigger2parents_words[childTrigger]:
            if w in model:
                vec += model[w]
                count += 1
        if count != 0:
            vec = vec / float(count)
        childTrigger2parents_vec[childTrigger] = vec
    return childTrigger2parents_vec
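# Usage sketch (hypothetical, not part of the original code): rank candidate
# parent words for a child trigger by cosine similarity against its averaged
# parent-word vector. Assumes `model` maps words to 300-d numpy vectors, as in
# extract_childTrigger2parents_vec above.
def rank_parents_by_similarity(childTrigger2parents_vec, child_trigger, candidate_words):
    child_vec = childTrigger2parents_vec[child_trigger]
    scored = []
    for w in candidate_words:
        if w not in model:
            continue
        denom = np.linalg.norm(child_vec) * np.linalg.norm(model[w])
        if denom == 0.0:
            continue
        scored.append((w, float(np.dot(child_vec, model[w]) / denom)))
    return sorted(scored, key=lambda x: x[1], reverse=True)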
def corrupt_tail_filter(pair, tailTotal, all_pairs):
    # Replace the tail event with a random one, resampling until the corrupted
    # pair is not an observed positive pair.
    eventpair = EventPair(pair, -1)
    while True:
        random_idx = random.randint(0, len(tailTotal) - 1)
        if eventpair.event1 + " -> " + tailTotal[random_idx] not in all_pairs:
            break
    return eventpair.event1 + " -> " + tailTotal[random_idx]
def pair_str2instance(args, pair_str, tokenizer):
    eventpair = EventPair(pair_str, -1)
    event_pair_sentence = "[CLS] " + remove_brackets(eventpair.event1) + " [SEP] " \
        + remove_brackets(eventpair.event2) + " [SEP]"
    input_ids = tokenizer.encode(event_pair_sentence)
    input_ids, input_mask = seq_padding(args, input_ids)

    event1_trigger = remove_brackets(eventpair.event1_trigger)
    event2_trigger = remove_brackets(eventpair.event2_trigger)
    event_pair_sentence_wordList = event_pair_sentence.split()
    E1_trigger_index = event_pair_sentence_wordList.index(event1_trigger)
    E2_trigger_index = event_pair_sentence_wordList.index(event2_trigger)

    trigger1_ids = [tokenizer._convert_token_to_id(token)
                    for token in tokenizer.tokenize(event1_trigger)]
    trigger2_ids = [tokenizer._convert_token_to_id(token)
                    for token in tokenizer.tokenize(event2_trigger)]
    # Shift trigger2's word-level index right by the extra wordpieces of trigger1.
    masked_idxList = [E1_trigger_index, E2_trigger_index + len(trigger1_ids) - 1]
    if masked_idxList[1] >= len(input_ids):
        print("Type2: One common-sense pair instance exceeds max_seq_length.")
        return None

    instance = {"event_pair": pair_str,
                "masked_sentence": event_pair_sentence,
                "input_ids": input_ids,
                "masked_idxList": masked_idxList}
    # Class encoding: 1 = parent -> child, 2 = reversed, 0 = no subevent relation.
    if " -> " in pair_str or " CONTAINS-SUBEVENT " in pair_str:
        instance["class"] = 1
    elif " <- " in pair_str or " R_CONTAINS-SUBEVENT " in pair_str:
        instance["class"] = 2
    else:
        instance["class"] = 0
    return instance
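# A quick check (hypothetical tokenization) of the index-shift arithmetic in
# pair_str2instance: if trigger1 splits into three wordpieces, every word after
# it sits len(trigger1_ids) - 1 = 2 token slots further right than its
# word-level index suggests.
trigger1_pieces = ["air", "##stri", "##ke"]  # assumed wordpiece output
E2_word_index = 5                            # word-level index of trigger2
assert E2_word_index + len(trigger1_pieces) - 1 == 7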
def corrupt_head_filter(pair, headTotal, all_pairs):
    # Replace the head event with a random one, resampling until the corrupted
    # pair is not an observed positive pair.
    eventpair = EventPair(pair, -1)
    while True:
        random_idx = random.randint(0, len(headTotal) - 1)
        if headTotal[random_idx] + " -> " + eventpair.event2 not in all_pairs:
            break
    return headTotal[random_idx] + " -> " + eventpair.event2
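# Minimal demo on toy data (hypothetical events; relies on the repo's EventPair
# class): corrupt a positive pair once on each side. The membership check
# against all_pairs guarantees the sampled negative is not an observed positive.
all_pairs_demo = {"< [war] > -> < [battle] >", "< [war] > -> < [airstrike] >"}
headTotal_demo = ["< [war] >", "< [storm] >"]
tailTotal_demo = ["< [battle] >", "< [flooding] >"]
pos_pair = "< [war] > -> < [battle] >"
print(corrupt_head_filter(pos_pair, headTotal_demo, all_pairs_demo))
# -> "< [storm] > -> < [battle] >"
print(corrupt_tail_filter(pos_pair, tailTotal_demo, all_pairs_demo))
# -> "< [war] > -> < [flooding] >"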
def make_instance(args, pair, headTotal, tailTotal, all_pairs, word_index):
    instance = {"pos_pair": pair}
    # Corrupt the head or the tail with equal probability.
    if random.random() < 0.5:
        instance["neg_pair"] = corrupt_head_filter(pair, headTotal, all_pairs)
    else:
        instance["neg_pair"] = corrupt_tail_filter(pair, tailTotal, all_pairs)
    pos_eventpair = EventPair(instance["pos_pair"], -1)
    neg_eventpair = EventPair(instance["neg_pair"], -1)
    instance["pos_head_ids"] = get_ids(args, word_index, pos_eventpair.event1)[0]
    instance["pos_tail_ids"] = get_ids(args, word_index, pos_eventpair.event2)[0]
    instance["pos_rel"] = 0  # single relation type (subevent), so always index 0
    instance["neg_head_ids"] = get_ids(args, word_index, neg_eventpair.event1)[0]
    instance["neg_tail_ids"] = get_ids(args, word_index, neg_eventpair.event2)[0]
    instance["neg_rel"] = 0
    return instance
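# Hedged sketch of the TransE-style objective these pos/neg triples appear to
# feed: score(h, r, t) = ||h + r - t||, and the positive triple should score
# lower than its corruption by a margin. The function name and exact loss are
# assumptions, not the repo's training code.
def margin_ranking_loss(pos_head, pos_tail, neg_head, neg_tail, rel_emb, margin=1.0):
    pos_score = np.linalg.norm(pos_head + rel_emb - pos_tail)
    neg_score = np.linalg.norm(neg_head + rel_emb - neg_tail)
    return max(0.0, margin + pos_score - neg_score)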
def read_trigger_pair2score(fileList):
    # Count how often each (trigger1, trigger2) combination occurs in the files.
    trigger_pair2score = {}
    for file in fileList:
        input_lines = open(file, "r")
        for line in input_lines:
            if not line.strip():
                continue
            eventpair = EventPair(line, -1)
            trigger_pair = eventpair.event1_trigger + " " + eventpair.event2_trigger
            if trigger_pair not in trigger_pair2score:
                trigger_pair2score[trigger_pair] = 0.0
            trigger_pair2score[trigger_pair] += 1.0
        input_lines.close()
    return trigger_pair2score
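# Hedged helper sketch: the counts above are typically min-max normalized before
# being used as knowledge features (the minList/maxList bookkeeping in
# get_filename2instanceList_new below follows this pattern; the helper name
# here is illustrative).
def normalize_score(score, lo, hi):
    if hi == lo:
        return 0.0
    return (score - lo) / (hi - lo)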
def test(self, epoch, data_flag):
    if data_flag == "dev":
        dataset = self.dev_set
        output_file = open("dev_emb_" + str(epoch) + ".txt", "w")
    elif data_flag == "test":
        dataset = self.test_set
        output_file = open("test_emb_" + str(epoch) + ".txt", "w")
    head_embList, tail_embList, rel_embList, acc = self.evaluate(dataset)

    # Dump one embedding per unique event (first occurrence wins).
    event2vec = {}
    for i, instance in enumerate(dataset):
        eventpair = EventPair(instance["pos_pair"], -1)
        if eventpair.event1 not in event2vec:
            event2vec[eventpair.event1] = head_embList[i]
        if eventpair.event2 not in event2vec:
            event2vec[eventpair.event2] = tail_embList[i]
    for event in event2vec:
        output_file.write(event + "\t" + str(event2vec[event]) + "\n")
    output_file.close()
    return acc
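# Hypothetical caller (names like `trainer` and `num_epochs` are assumptions):
# evaluate on dev every epoch and keep the test accuracy of the best dev epoch.
best_dev_acc, best_test_acc = 0.0, 0.0
for epoch in range(num_epochs):
    trainer.train_one_epoch()  # assumed training step defined elsewhere
    dev_acc = trainer.test(epoch, "dev")
    if dev_acc > best_dev_acc:
        best_dev_acc = dev_acc
        best_test_acc = trainer.test(epoch, "test")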
"of", "for", "to", "up", "on", "with", "not", "at", "from", "into", "over", "by", "against","poss", "about", "off", "before"]) invalid_words = invalid_words | light_verbs | pronouns | person_pronouns random.seed(11) seed_pairs = extract_valid_pairs("../run_extract_event_pair_nmod2/news/sorted_parent_child2num.txt", 2.0) #seed_pairs = set(list(seed_pairs)[:8000]) vocab = [] all_parents = set() all_children = set() for pair in seed_pairs: eventpair = EventPair(pair, -1) all_parents.add(eventpair.event1) all_children.add(eventpair.event2) for pair in seed_pairs: words = pair.split() for w in words: if w in ['<', '>']: continue vocab.append(w.replace("[", "").replace("]", "")) all_parent_triggers = set() for pair in seed_pairs: eventpair = EventPair(pair, -1)
def get_filename2instanceList_new(args, trigger_pair2score, parentTrigger2children_vec,
                                  childTrigger2parents_vec, tokenizer):
    event2vec = read_event_vec("../run_Trans_50x_50d_news/test_emb_20.txt")
    subevent_trigger_pair2score = read_trigger_pair2score(
        ["../subevent_pairs/all_subevent_pairs.txt"])

    # Record min/max of each knowledge source for later min-max normalization.
    # (The loop variable is renamed so it no longer shadows the
    # trigger_pair2score parameter.)
    minList = []
    maxList = []
    for pair2score in [subevent_trigger_pair2score]:
        scoreList = [pair2score[trigger_pair] for trigger_pair in pair2score]
        minList.append(min(scoreList))
        maxList.append(max(scoreList))
    print("minList:", minList)
    print("maxList:", maxList)

    MASK_id = tokenizer._convert_token_to_id("[MASK]")
    output = open("get_filename2instanceList.log", "w", 1)  # line-buffered log
    filename2instanceList = {}
    input_lines = open(args.test_file, "r")
    for line in input_lines:
        if not line.strip():
            continue
        words = line.split()
        # Accumulate the tagged fields of one test instance until <END>.
        if words[0] == "<filename>":
            filename = words[-1]
            continue
        if words[0] == "<relation>":
            relation = words[1]
            continue
        if words[0] == "<event1_trigger>":
            event1_trigger = words[-1]
            continue
        if words[0] == "<event2_trigger>":
            event2_trigger = words[-1]
            continue
        if words[0] == "<order_flag>":
            order_flag = words[1]
            continue
        if words[0] == "<event1>":
            event1 = " ".join(words[1:])
            continue
        if words[0] == "<event2>":
            event2 = " ".join(words[1:])
            continue
        if words[0] == "<sentence1>":
            sentence1 = " ".join(words[1:])
            continue
        if words[0] == "<sentence2>":
            sentence2 = " ".join(words[1:])
            continue
        if words[0] == "<masked_sentence1>":
            masked_sentence1 = " ".join(words[1:])
            continue
        if words[0] == "<masked_sentence2>":
            masked_sentence2 = " ".join(words[1:])
            continue
        if words[0] == "<END>":
            # Enforce the within-/across-sentence setting.
            if args.sentence_setting == "within" and masked_sentence1 != masked_sentence2:
                continue
            elif args.sentence_setting == "across" and masked_sentence1 == masked_sentence2:
                continue
            instance = {}
            if order_flag == "e1->e2":
                word_pair = "< " + event1 + " > " + relation + " < " + event2 + " >"
            else:
                word_pair = "< " + event2 + " > " + "R_" + relation + " < " + event1 + " >"
            eventpair = EventPair(word_pair, -1)

            # Knowledge-embedding (KE) features: difference of the two event vectors.
            E1_vec = event2vec[eventpair.event1.lower()]
            E2_vec = event2vec[eventpair.event2.lower()]
            #children_vector = np.concatenate((E1_vec, E2_vec), axis=0)
            #children_vector = np.concatenate((E1_vec, E2_vec, E2_vec - E1_vec), axis=0)
            children_vector = E2_vec - E1_vec
            knowledge_vector = get_knowledge_vec_new(
                eventpair, subevent_trigger_pair2score, minList, maxList)
            output.write("word_pair: " + str(word_pair) + "\n")
            output.write("knowledge_vector: " + str(knowledge_vector) + "\n\n")

            # Build the [CLS] ... [SEP] input in textual order.
            if masked_sentence1 == masked_sentence2:
                masked_sentence = "[CLS] " + masked_sentence1 + " [SEP]"
            elif order_flag == "e1->e2":
                masked_sentence = "[CLS] " + masked_sentence1 + " [SEP] " + masked_sentence2 + " [SEP]"
            elif order_flag == "e2->e1":
                masked_sentence = "[CLS] " + masked_sentence2 + " [SEP] " + masked_sentence1 + " [SEP]"
            input_ids = tokenizer.encode(masked_sentence)
            input_ids, input_mask = seq_padding(args, input_ids)
            masked_idxList = [i for i, input_id in enumerate(input_ids)
                              if input_id == MASK_id]
            if len(masked_idxList) != 2:
                print("Type1: One test instance exceeds max_seq_length.")
                continue
            if args.mask_trigger:
                instance = {"event_pair": word_pair,
                            "masked_sentence": masked_sentence,
                            "input_ids": input_ids,
                            "masked_idxList": masked_idxList,
                            "knowledge_vector": knowledge_vector,
                            "children_vector": children_vector}
            else:
                # Unmasked setting: rebuild the input from the raw sentences.
                if sentence1 == sentence2:
                    sentence = "[CLS] " + sentence1 + " [SEP]"
                elif order_flag == "e1->e2":
                    sentence = "[CLS] " + sentence1 + " [SEP] " + sentence2 + " [SEP]"
                elif order_flag == "e2->e1":
                    sentence = "[CLS] " + sentence2 + " [SEP] " + sentence1 + " [SEP]"
                input_ids = tokenizer.encode(sentence)
                input_ids, input_mask = seq_padding(args, input_ids)
                trigger1_ids = [tokenizer._convert_token_to_id(token)
                                for token in tokenizer.tokenize(event1_trigger)]
                trigger2_ids = [tokenizer._convert_token_to_id(token)
                                for token in tokenizer.tokenize(event2_trigger)]
                # Shift the second masked index right by the extra wordpieces of
                # whichever trigger comes first in the input.
                if order_flag == "e1->e2":
                    masked_idxList[1] = masked_idxList[1] + (len(trigger1_ids) - 1)
                else:
                    masked_idxList[1] = masked_idxList[1] + (len(trigger2_ids) - 1)
                if masked_idxList[1] >= len(input_ids):
                    print("Type2: One test instance exceeds max_seq_length.")
                    continue
                instance = {"event_pair": word_pair,
                            "masked_sentence": masked_sentence,
                            "input_ids": input_ids,
                            "masked_idxList": masked_idxList,
                            "knowledge_vector": knowledge_vector,
                            "children_vector": children_vector}

            class_idx = get_class_idx(args, relation, order_flag)
            if filename not in filename2instanceList:
                filename2instanceList[filename] = []
            # If an instance with the same masked sentence already exists, merge:
            # a positive relation label (1 or 2) overrides its pair and class.
            instance_found_idx = None
            for idx, previous_instance in enumerate(filename2instanceList[filename]):
                if previous_instance["masked_sentence"] == masked_sentence:
                    instance_found_idx = idx
            if instance_found_idx is not None:
                if class_idx in [1, 2]:  # relation and R_relation class_idx
                    filename2instanceList[filename][instance_found_idx]["event_pair"] = word_pair
                    filename2instanceList[filename][instance_found_idx]["class"] = class_idx
            else:
                instance["class"] = class_idx
                filename2instanceList[filename].append(instance)
    input_lines.close()
    output.close()
    return filename2instanceList
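# get_class_idx is defined elsewhere in the repo; a plausible sketch consistent
# with the class encoding in pair_str2instance (1 = parent -> child, 2 = the
# reversed R_ relation, 0 = no subevent relation) might look like:
def get_class_idx_sketch(args, relation, order_flag):
    if relation != "CONTAINS-SUBEVENT":
        return 0
    return 1 if order_flag == "e1->e2" else 2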
def LSTM_prepare_data(args, all_pairs, test_all_pairs):
    trainList = []
    devList = []
    testList = []
    headTotal = []
    tailTotal = []
    vocab = []

    # Add self-pairs for the test events, so every test event gets an embedding.
    input_lines = open("test_pairs.csv", "r")
    for line in input_lines:
        event = line.split()[0]
        test_all_pairs.append("< [" + event + "] > -> < [" + event + "] >")
    input_lines.close()

    all_pairs_list = list(all_pairs)
    for pair in all_pairs_list + test_all_pairs:
        eventpair = EventPair(pair, -1)
        for w in pair.split():
            if w in ["<", ">"]:
                continue
            vocab.append(w.replace("[", "").replace("]", ""))
        headTotal.append(eventpair.event1)
        tailTotal.append(eventpair.event2)
    vocab = list(set(vocab))
    headTotal = list(set(headTotal))
    tailTotal = list(set(tailTotal))
    print("{} unique words".format(len(vocab)))
    print("len(all_pairs):", len(all_pairs))

    # Reserve indices 0/1 for padding and unknown words.
    index_word = {index + 2: word for index, word in enumerate(vocab)}
    word_index = {word: index + 2 for index, word in enumerate(vocab)}
    index_word[0], index_word[1] = '<pad>', '<unk>'
    word_index['<pad>'], word_index['<unk>'] = 0, 1

    # The first 1/50 of the pairs become the dev split; the rest are training
    # pairs, each resampled 100x (dev 10x) with fresh negative corruptions.
    split_num = len(all_pairs_list) // 50
    for repeat in range(0, 100):
        for pair in all_pairs_list[split_num:]:
            trainList.append(make_instance(args, pair, headTotal, tailTotal,
                                           all_pairs, word_index))
    for repeat in range(0, 10):
        for pair in all_pairs_list[:split_num]:
            devList.append(make_instance(args, pair, headTotal, tailTotal,
                                         all_pairs, word_index))
    for pair in test_all_pairs:
        testList.append(make_instance(args, pair, headTotal, tailTotal,
                                      all_pairs, word_index))
    print("len(trainList):", len(trainList))
    print("len(devList):", len(devList))
    print("len(testList):", len(testList))

    glove = {}
    print("Read Glove embedding...")
    with open(args.w2v_file) as f:
        for l in f:
            vec = l.split(' ')
            word = vec[0].lower()
            glove[word] = np.array(vec[1:])

    dimensions = 300
    matrix = np.zeros((len(word_index), dimensions))
    oov = 0
    for i in tqdm(range(2, len(word_index))):
        word = index_word[i].lower()
        if word in glove:
            matrix[i] = glove[word]
        else:
            # Randomly initialize out-of-vocabulary words.
            matrix[i] = np.random.uniform(low=-0.1, high=0.1, size=(1, dimensions))
            oov += 1
    print("oov={}".format(oov))

    env = {"index_word": index_word, "word_index": word_index, "glove": matrix}
    random.shuffle(trainList)
    env["train"] = trainList
    env["dev"] = devList
    env["test"] = testList
    pickle.dump(env, open("env.pkl", "wb"))
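# Companion loader (a minimal sketch): reading back the pickle written by
# LSTM_prepare_data before building dataloaders.
import pickle
with open("env.pkl", "rb") as f:
    env = pickle.load(f)
embedding_matrix = env["glove"]   # (len(word_index), 300) numpy matrix
train_instances = env["train"]    # shuffled pos/neg instance dicts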
for file in glob.glob(args.folder + args.genre + "*.txt"):
    print(file)
    input_file = open(file, "r")
    for line in input_file:
        if not line.strip():
            continue
        words = line.split()
        if words[0] == "<doc_id>":
            doc_id = words[1]
            continue
        if words[0] == "<subevent>":
            eventpair = EventPair(" ".join(words[1:]), 1)
            # Normalize so that the parent event always comes first.
            if "including" in eventpair.relation:
                pair = eventpair.event1 + " -> " + eventpair.event2
            else:
                pair = eventpair.event2 + " -> " + eventpair.event1
            pair = clean_eventpair(pair)
            continue
        if words[0] == "<word>":
            # Skip sentences we have already seen.
            sentence_str = " ".join(words)
            if sentence_str in sentence_str_set:
                continue
            else:
                sentence_str_set.add(sentence_str)
    fig = plt.figure(figsize=(12, 12))
    plt.scatter(twodim[:, 0], twodim[:, 1], edgecolors='k', c='r')
    for word, (x, y) in zip(words, twodim):
        plt.text(x + 0.00, y + 0.00, word)
    fig.savefig(flag + "_" + str(eval_iteration) + '.png', dpi=fig.dpi)
    plt.show()
    fig.savefig(flag + "_" + str(eval_iteration) + '.pdf', bbox_inches='tight')


if __name__ == "__main__":
    words = ["conflict", "war", "attack", "protest", "clash", "fighting", "march",
             "game", "olympics", "match",
             "bankruptcy", "reform", "recession", "investigation",
             "hurricane", "storm", "earthquake", "flooding", "disaster",
             "meeting", "conference", "forum", "discussion",
             "festival", "ceremony", "celebration",
             "election", "explosion", "wedding", "birthday", "carnival"]
    input_lines = open("test_emb_20.txt", "r")
    trigger2vec = {}
    for line in input_lines:
        fields = line.split("\t")
        eventpair = EventPair(fields[0] + " -> " + fields[0], -1)
        # Only keep single-trigger events ("< [trigger] >" has three tokens).
        if len(fields[0].split()) == 3:
            trigger2vec[eventpair.event1_trigger.replace("[", "").replace("]", "")] = \
                ast.literal_eval(fields[1])
    input_lines.close()
    eval_iteration = 20
    display_pca_scatterplot(words, trigger2vec, eval_iteration, "child")