Example #1
def extract_childTrigger2parents_vec(valid_pairs):
    # Collect, for each child trigger, the content words of all its parent
    # events (dropping angle brackets, stopwords, and light verbs).
    childTrigger2parents_words = {}
    for pair in valid_pairs:
        eventpair = EventPair(pair, -1)
        if eventpair.event2_trigger not in childTrigger2parents_words:
            childTrigger2parents_words[eventpair.event2_trigger] = []
        for w in eventpair.event1.split():
            if w in ["<", ">"] or w in invalid_words or w in B_light_verbs:
                continue
            childTrigger2parents_words[eventpair.event2_trigger].append(
                w.replace("[", "").replace("]", ""))

    # Average the 300-d embeddings of those parent words: one vector per
    # child trigger; triggers with no in-vocabulary parent word are skipped.
    childTrigger2parents_vec = {}
    for childTrigger in childTrigger2parents_words:
        vec = np.zeros(300)
        count = 0
        for w in childTrigger2parents_words[childTrigger]:
            if w in model:
                vec += model[w]
                count += 1
        if count != 0:
            childTrigger2parents_vec[childTrigger] = vec / float(count)

    return childTrigger2parents_vec
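A minimal usage sketch (not from the repository): it assumes `EventPair` is importable, that `model` behaves like a word2vec lookup of 300-d numpy vectors, that `invalid_words` and `B_light_verbs` are sets, and that pairs follow the "< parent words [trigger] > -> < child words [trigger] >" format seen throughout these examples.

import numpy as np
rng = np.random.default_rng(0)
invalid_words, B_light_verbs = set(), set()
model = {"civil": rng.random(300), "war": rng.random(300)}
pairs = ["< civil [war] > -> < fierce [battle] >"]
vecs = extract_childTrigger2parents_vec(pairs)
print(list(vecs.keys()))  # expected: ['[battle]'], averaged over "civil" and "war"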
Example #2
def corrupt_tail_filter(pair, tailTotal, all_pairs):
    # Filtered negative sampling: redraw a random tail until the corrupted
    # pair is unseen. `all_pairs` should support fast membership tests (a
    # set); this loops forever if every tail already pairs with this head.
    eventpair = EventPair(pair, -1)
    while True:
        random_idx = random.randint(0, len(tailTotal) - 1)
        corrupted = eventpair.event1 + " -> " + tailTotal[random_idx]
        if corrupted not in all_pairs:
            return corrupted
Example #3
def pair_str2instance(args, pair_str, tokenizer):
    eventpair = EventPair(pair_str, -1)
    event_pair_sentence = "[CLS] " + remove_brackets(eventpair.event1) + " [SEP] " + remove_brackets(eventpair.event2) + " [SEP]"

    input_ids = tokenizer.encode(event_pair_sentence)
    input_ids, input_mask = seq_padding(args, input_ids)

    event1_trigger = remove_brackets(eventpair.event1_trigger)
    event2_trigger = remove_brackets(eventpair.event2_trigger)

    event_pair_sentence_wordList = event_pair_sentence.split()

    E1_trigger_index, E2_trigger_index = event_pair_sentence_wordList.index(event1_trigger), event_pair_sentence_wordList.index(event2_trigger)

    # Trigger 1 may split into several WordPiece tokens, which shifts every
    # later token position right by len(trigger1_ids) - 1 slots.
    trigger1_ids = [tokenizer._convert_token_to_id(token) for token in tokenizer.tokenize(event1_trigger)]

    masked_idxList = [E1_trigger_index, E2_trigger_index + len(trigger1_ids) - 1]

    if masked_idxList[1] >= len(input_ids):
        print("Type2: One common-sense pair instance exceeds max_seq_length.")
        return None

    instance = {"event_pair": pair_str, "masked_sentence": event_pair_sentence, "input_ids": input_ids, "masked_idxList": masked_idxList}

    # Class 1: forward subevent relation; class 2: reversed; class 0: none.
    if " -> " in pair_str or " CONTAINS-SUBEVENT " in pair_str:
        instance["class"] = 1
    elif " <- " in pair_str or " R_CONTAINS-SUBEVENT " in pair_str:
        instance["class"] = 2
    else:
        instance["class"] = 0

    return instance
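A hypothetical call, assuming a Hugging Face BERT tokenizer and an `args` namespace carrying the `max_seq_length` that `seq_padding` presumably reads (the pair string and the parameter name are illustrative, not from the repository):

from argparse import Namespace
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
args = Namespace(max_seq_length=128)
inst = pair_str2instance(args, "< [meeting] > -> < [discussion] >", tokenizer)
if inst is not None:
    print(inst["class"], inst["masked_idxList"])  # class 1 for a " -> " pair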
Example #4
def corrupt_head_filter(pair, headTotal, all_pairs):
    # Mirror of corrupt_tail_filter: redraw a random head until the
    # corrupted pair does not already occur in all_pairs.
    eventpair = EventPair(pair, -1)
    while True:
        random_idx = random.randint(0, len(headTotal) - 1)
        corrupted = headTotal[random_idx] + " -> " + eventpair.event2
        if corrupted not in all_pairs:
            return corrupted
Example #5
def make_instance(args, pair, headTotal, tailTotal, all_pairs, word_index):
    # Build one training instance: the given positive pair plus one negative,
    # corrupting the head or the tail with equal probability.
    instance = {"pos_pair": pair}
    if random.random() < 0.5:
        instance["neg_pair"] = corrupt_head_filter(pair, headTotal, all_pairs)
    else:
        instance["neg_pair"] = corrupt_tail_filter(pair, tailTotal, all_pairs)
    pos_eventpair = EventPair(instance["pos_pair"], -1)
    neg_eventpair = EventPair(instance["neg_pair"], -1)
    instance["pos_head_ids"] = get_ids(args, word_index,
                                       pos_eventpair.event1)[0]
    instance["pos_tail_ids"] = get_ids(args, word_index,
                                       pos_eventpair.event2)[0]
    instance["pos_rel"] = 0
    instance["neg_head_ids"] = get_ids(args, word_index,
                                       neg_eventpair.event1)[0]
    instance["neg_tail_ids"] = get_ids(args, word_index,
                                       neg_eventpair.event2)[0]
    instance["neg_rel"] = 0

    return instance
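A toy run combining both corruption helpers. The two-pair "graph" below is invented for illustration, and the `args` namespace and `word_index` are assumed to be built as in Example #10:

all_pairs = {"< [war] > -> < [battle] >", "< [storm] > -> < [flooding] >"}
headTotal = ["< [war] >", "< [storm] >"]
tailTotal = ["< [battle] >", "< [flooding] >"]
inst = make_instance(args, "< [war] > -> < [battle] >", headTotal, tailTotal,
                     all_pairs, word_index)
print(inst["pos_pair"], "|", inst["neg_pair"])  # neg_pair differs in head or tail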
Example #6
def read_trigger_pair2score(fileList):
    # Count how often each (event1_trigger, event2_trigger) pair occurs
    # across the given files; one event pair per non-empty line.
    trigger_pair2score = {}
    for file in fileList:
        with open(file, "r") as input_lines:
            for line in input_lines:
                if not line.strip():
                    continue
                eventpair = EventPair(line, -1)
                trigger_pair = eventpair.event1_trigger + " " + eventpair.event2_trigger
                if trigger_pair not in trigger_pair2score:
                    trigger_pair2score[trigger_pair] = 0.0
                trigger_pair2score[trigger_pair] += 1.0
    return trigger_pair2score
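A sketch of the expected input and output, with a hypothetical file holding one pair per line in the same format `EventPair` parses elsewhere in these examples:

# Hypothetical "subevent_pairs.txt":
#   < civil [war] > -> < fierce [battle] >
#   < civil [war] > -> < fierce [battle] >
scores = read_trigger_pair2score(["subevent_pairs.txt"])
print(scores)  # e.g. {'[war] [battle]': 2.0}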
Example #7
    def test(self, epoch, data_flag):
        if data_flag == "dev":
            dataset = self.dev_set
            output_file = open("dev_emb_" + str(epoch) + ".txt", "w")
        elif data_flag == "test":
            dataset = self.test_set
            output_file = open("test_emb_" + str(epoch) + ".txt", "w")
        else:
            # Without this guard, `dataset` and `output_file` would be unbound.
            raise ValueError("data_flag must be 'dev' or 'test'")

        head_embList, tail_embList, rel_embList, acc = self.evaluate(dataset)

        # Keep the first embedding seen for each distinct event string.
        event2vec = {}
        for i, instance in enumerate(dataset):
            eventpair = EventPair(instance["pos_pair"], -1)
            if eventpair.event1 not in event2vec:
                event2vec[eventpair.event1] = head_embList[i]
            if eventpair.event2 not in event2vec:
                event2vec[eventpair.event2] = tail_embList[i]
        for event in event2vec:
            output_file.write(event + "\t" + str(event2vec[event]) + "\n")
        output_file.close()

        return acc
Example #8
                         "of", "for", "to", "up", "on", "with", "not", "at", "from", "into", "over", "by", "against","poss",
                         "about", "off", "before"])
    invalid_words = invalid_words | light_verbs | pronouns | person_pronouns
    
    random.seed(11)
    seed_pairs = extract_valid_pairs("../run_extract_event_pair_nmod2/news/sorted_parent_child2num.txt", 2.0)

    #seed_pairs = set(list(seed_pairs)[:8000])

    vocab = []

    all_parents = set()
    all_children = set()
    for pair in seed_pairs:
        eventpair = EventPair(pair, -1)
        all_parents.add(eventpair.event1)
        all_children.add(eventpair.event2)

    
    for pair in seed_pairs:
        words = pair.split()
        for w in words:
            if w in ['<', '>']:
                continue
            vocab.append(w.replace("[", "").replace("]", ""))

    
    all_parent_triggers = set()
    for pair in seed_pairs:
        eventpair = EventPair(pair, -1)
Example #9
def get_filename2instanceList_new(args, trigger_pair2score,
                                  parentTrigger2children_vec,
                                  childTrigger2parents_vec, tokenizer):
    event2vec = read_event_vec("../run_Trans_50x_50d_news/test_emb_20.txt")

    subevent_trigger_pair2score = read_trigger_pair2score(
        ["../subevent_pairs/all_subevent_pairs.txt"])

    # Normalization bounds for the knowledge vector: the min and max of the
    # subevent trigger-pair scores.
    minList = []
    maxList = []
    for pair2score in [subevent_trigger_pair2score]:
        scoreList = list(pair2score.values())
        minList.append(min(scoreList))
        maxList.append(max(scoreList))
    print("minList:", minList)
    print("maxList:", maxList)

    MASK_id = tokenizer._convert_token_to_id("[MASK]")
    output = open("get_filename2instanceList.log", "w", 1)

    filename2instanceList = {}
    input_lines = open(args.test_file, "r")

    for line in input_lines:
        if not line.strip():
            continue
        words = line.split()
        if words[0] == "<filename>":
            filename = words[-1]
            continue
        if words[0] == "<relation>":
            relation = words[1]
            continue
        if words[0] == "<event1_trigger>":
            event1_trigger = words[-1]
            continue
        if words[0] == "<event2_trigger>":
            event2_trigger = words[-1]
            continue
        if words[0] == "<order_flag>":
            order_flag = words[1]
            continue
        if words[0] == "<event1>":
            event1 = " ".join(words[1:])
            continue
        if words[0] == "<event2>":
            event2 = " ".join(words[1:])
            continue
        if words[0] == "<sentence1>":
            sentence1 = " ".join(words[1:])
            continue
        if words[0] == "<sentence2>":
            sentence2 = " ".join(words[1:])
            continue
        if words[0] == "<masked_sentence1>":
            masked_sentence1 = " ".join(words[1:])
            continue
        if words[0] == "<masked_sentence2>":
            masked_sentence2 = " ".join(words[1:])
            continue
        if words[0] == "<END>":
            if args.sentence_setting == "within" and masked_sentence1 != masked_sentence2:
                continue
            elif args.sentence_setting == "across" and masked_sentence1 == masked_sentence2:
                continue
            instance = {}

            if order_flag == "e1->e2":
                word_pair = "< " + event1 + " > " + relation + " < " + event2 + " >"
            else:
                word_pair = "< " + event2 + " > " + "R_" + relation + " < " + event1 + " >"

            eventpair = EventPair(word_pair, -1)

            # KE: knowledge-embedding features from the pretrained event
            # vectors; the child-minus-parent difference is what gets used.
            E1_vec = event2vec[eventpair.event1.lower()]
            E2_vec = event2vec[eventpair.event2.lower()]
            #children_vector = np.concatenate((E1_vec, E2_vec), axis=0)
            #children_vector = np.concatenate((E1_vec, E2_vec, E2_vec-E1_vec), axis=0)
            children_vector = E2_vec - E1_vec

            #knowledge_vector = get_knowledge_vec(eventpair, trigger_pair2score, C_trigger_pair2score)
            knowledge_vector = get_knowledge_vec_new(
                eventpair, subevent_trigger_pair2score, minList, maxList)

            #print("word_pair:", word_pair)
            #print("closest_trigger_pair:", closest_trigger_pair)
            #print("knowledge_vector:", knowledge_vector)
            output.write("word_pair: " + str(word_pair) + "\n")
            #output.write("closest_trigger_pair: " + str(closest_trigger_pair) + "\n")
            output.write("knowledge_vector: " + str(knowledge_vector) + "\n\n")
            #print(relation, get_class_idx(args, relation, order_flag))
            #print("\n")

            if masked_sentence1 == masked_sentence2:
                masked_sentence = "[CLS] " + masked_sentence1 + " [SEP]"
            elif order_flag == "e1->e2":
                masked_sentence = "[CLS] " + masked_sentence1 + " [SEP] " + masked_sentence2 + " [SEP]"
            elif order_flag == "e2->e1":
                masked_sentence = "[CLS] " + masked_sentence2 + " [SEP] " + masked_sentence1 + " [SEP]"

            input_ids = tokenizer.encode(masked_sentence)
            input_ids, input_mask = seq_padding(args, input_ids)

            masked_idxList = []

            for i, input_id in enumerate(input_ids):
                if input_id == MASK_id:
                    masked_idxList.append(i)
            if len(masked_idxList) != 2:
                print("Type1: One test instance exceeds max_seq_length.")
                continue

            if args.mask_trigger:
                instance = {
                    "event_pair": word_pair,
                    "masked_sentence": masked_sentence,
                    "input_ids": input_ids,
                    "masked_idxList": masked_idxList,
                    "knowledge_vector": knowledge_vector,
                    "children_vector": children_vector
                }

            else:
                if sentence1 == sentence2:
                    sentence = "[CLS] " + sentence1 + " [SEP]"
                elif order_flag == "e1->e2":
                    sentence = "[CLS] " + sentence1 + " [SEP] " + sentence2 + " [SEP]"
                elif order_flag == "e2->e1":
                    sentence = "[CLS] " + sentence2 + " [SEP] " + sentence1 + " [SEP]"

                input_ids = tokenizer.encode(sentence)
                input_ids, input_mask = seq_padding(args, input_ids)

                trigger1_ids = [
                    tokenizer._convert_token_to_id(token)
                    for token in tokenizer.tokenize(event1_trigger)
                ]
                trigger2_ids = [
                    tokenizer._convert_token_to_id(token)
                    for token in tokenizer.tokenize(event2_trigger)
                ]

                if order_flag == "e1->e2":
                    masked_idxList[1] = masked_idxList[1] + (
                        len(trigger1_ids) - 1
                    )  # How many slots shift to the right
                else:
                    masked_idxList[1] = masked_idxList[1] + (
                        len(trigger2_ids) - 1
                    )  # How many slots shift to the right

                if masked_idxList[1] >= len(input_ids):
                    print("Type2: One test instance exceeds max_seq_length.")
                    continue

                instance = {
                    "event_pair": word_pair,
                    "masked_sentence": masked_sentence,
                    "input_ids": input_ids,
                    "masked_idxList": masked_idxList,
                    "knowledge_vector": knowledge_vector,
                    "children_vector": children_vector
                }

            # Map the relation string plus order flag to a class index.
            class_idx = get_class_idx(args, relation, order_flag)

            if filename not in filename2instanceList:
                filename2instanceList[filename] = []

            # Deduplicate on the masked sentence: if this sentence (pair) was
            # stored before, a positive relation (class 1 or 2) overwrites it.
            instance_found_idx = None
            for idx, previous_instance in enumerate(
                    filename2instanceList[filename]):
                if previous_instance["masked_sentence"] == masked_sentence:
                    instance_found_idx = idx
            if instance_found_idx is not None:
                if class_idx in [1, 2]:  # relation and R_relation class_idx
                    filename2instanceList[filename][instance_found_idx][
                        "event_pair"] = word_pair
                    filename2instanceList[filename][instance_found_idx][
                        "class"] = class_idx
            else:
                instance["class"] = class_idx
                filename2instanceList[filename].append(instance)

    input_lines.close()
    output.close()

    return filename2instanceList
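For reference, the parser above consumes blocks of tagged lines terminated by <END>; a hypothetical within-sentence block (every field value here is invented for illustration) would look like:

<filename> doc_001.tml
<relation> CONTAINS-SUBEVENT
<event1_trigger> war
<event2_trigger> battle
<order_flag> e1->e2
<event1> civil [war]
<event2> fierce [battle]
<sentence1> The civil war saw a fierce battle .
<sentence2> The civil war saw a fierce battle .
<masked_sentence1> The civil [MASK] saw a fierce [MASK] .
<masked_sentence2> The civil [MASK] saw a fierce [MASK] .
<END>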
Example #10
def LSTM_prepare_data(args, all_pairs, test_all_pairs):
    trainList = []
    devList = []
    testList = []

    headTotal = []
    tailTotal = []
    vocab = []

    input_lines = open("test_pairs.csv", "r")
    for line in input_lines:
        event = line.split()[0]
        test_all_pairs.append("< [" + event + "] > -> < [" + event + "] >")
    input_lines.close()

    all_pairs_list = list(all_pairs)

    for pair in all_pairs_list + test_all_pairs:
        eventpair = EventPair(pair, -1)
        for w in pair.split():
            if w in ["<", ">"]:
                continue
            vocab.append(w.replace("[", "").replace("]", ""))
        headTotal.append(eventpair.event1)
        tailTotal.append(eventpair.event2)

    vocab = list(set(vocab))
    headTotal = list(set(headTotal))
    tailTotal = list(set(tailTotal))

    print("{} unique words".format(len(vocab)))
    print("len(all_pairs):", len(all_pairs))

    index_word = {index + 2: word for index, word in enumerate(vocab)}
    word_index = {word: index + 2 for index, word in enumerate(vocab)}
    index_word[0], index_word[1] = '<pad>', '<unk>'
    word_index['<pad>'], word_index['<unk>'] = 0, 1

    # The first 1/50 of all pairs form the dev split; the rest are training.
    # Training pairs are oversampled 100 times, dev pairs 10 times.
    split_num = len(all_pairs_list) // 50

    for repeat in range(0, 100):
        for pair in all_pairs_list[split_num:]:
            instance = make_instance(args, pair, headTotal, tailTotal,
                                     all_pairs, word_index)
            trainList.append(instance)

    for repeat in range(0, 10):
        for pair in all_pairs_list[:split_num]:
            instance = make_instance(args, pair, headTotal, tailTotal,
                                     all_pairs, word_index)
            devList.append(instance)

    for pair in test_all_pairs:
        instance = make_instance(args, pair, headTotal, tailTotal, all_pairs,
                                 word_index)
        testList.append(instance)

    print("len(trainList):", len(trainList))
    print("len(devList):", len(devList))
    print("len(testList):", len(testList))

    glove = {}
    print("Read Glove embedding...")

    with open(args.w2v_file) as f:
        for l in f:
            fields = l.split(' ')
            word = fields[0].lower()
            # Store the embedding as floats (the file holds strings).
            glove[word] = np.array(fields[1:], dtype=float)
    dimensions = 300
    matrix = np.zeros((len(word_index), dimensions))
    oov = 0
    filtered_glove = {}
    for i in tqdm(range(2, len(word_index))):
        word = index_word[i].lower()
        if word in glove:
            vec = glove[word]
            filtered_glove[word] = glove[word]
            matrix[i] = vec
        else:
            random_init = np.random.uniform(low=-0.1,
                                            high=0.1,
                                            size=(1, dimensions))
            matrix[i] = random_init
            oov += 1
    print("oov={}".format(oov))
    env = {"index_word": index_word, "word_index": word_index, "glove": matrix}

    random.shuffle(trainList)

    env["train"] = trainList
    env["dev"] = devList
    env["test"] = testList

    pickle.dump(env, open("env.pkl", "wb"))
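A minimal sanity check of the pickled environment this function writes (the file name and keys come from the code above; the expected shapes follow from the construction):

import pickle

with open("env.pkl", "rb") as f:
    env = pickle.load(f)
print(env["glove"].shape)  # (len(word_index), 300)
print(len(env["train"]), len(env["dev"]), len(env["test"]))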
Example #11
    for file in glob.glob(args.folder + args.genre + "*.txt"):
        print(file)
        input_file = open(file, "r")

        for line in input_file:
            if not line.strip():
                continue

            words = line.split()
            if words[0] == "<doc_id>":
                doc_id = words[1]
                continue

            if words[0] == "<subevent>":
                eventpair = EventPair(" ".join(words[1:]), 1)

                if "including" in eventpair.relation:
                    pair = eventpair.event1 + " -> " + eventpair.event2
                else:
                    pair = eventpair.event2 + " -> " + eventpair.event1

                pair = clean_eventpair(pair)
                continue

            if words[0] == "<word>":
                sentence_str = " ".join(words)
                if sentence_str in sentence_str_set:
                    continue
                else:
                    sentence_str_set.add(sentence_str)
Example #12
    fig = plt.figure(figsize=(12, 12))
    plt.scatter(twodim[:, 0], twodim[:, 1], edgecolors='k', c='r')
    for word, (x, y) in zip(words, twodim):
        plt.text(x, y, word)
    # Save both files before plt.show(): some backends clear the figure on show.
    fig.savefig(flag + "_" + str(eval_iteration) + '.png', dpi=fig.dpi)
    fig.savefig(flag + "_" + str(eval_iteration) + '.pdf', bbox_inches='tight')
    plt.show()


if __name__ == "__main__":

    words = ["conflict", "war", "attack", "protest", "clash", "fighting", "march", "game", "olympics", "match", \
             "bankruptcy", "reform", "recession", "investigation",\
             "hurricane", "storm", "earthquake", "flooding", "disaster",\
             "meeting", "conference", "forum", "discussion", \
             "festival", "ceremony", "celebration", \
             "election", "explosion", "wedding", "birthday", "carnival"] # "entertainment",
    input_lines = open("test_emb_20.txt", "r")

    trigger2vec = {}
    for line in input_lines:
        fields = line.split("\t")
        # Build a dummy self-pair just to reuse EventPair's trigger parsing.
        eventpair = EventPair(fields[0] + " -> " + fields[0], -1)
        # Keep only bare single-trigger events, i.e. "< [word] >" (3 tokens).
        if len(fields[0].split()) == 3:
            trigger2vec[eventpair.event1_trigger.replace("[", "").replace(
                "]", "")] = ast.literal_eval(fields[1])

    input_lines.close()
    eval_iteration = 20
    display_pca_scatterplot(words, trigger2vec, eval_iteration, "child")
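The head of display_pca_scatterplot is not included in this example; a plausible sketch of how `twodim` could be produced, assuming scikit-learn PCA over the stored trigger vectors (an assumption, not necessarily the authors' implementation):

import numpy as np
from sklearn.decomposition import PCA

def pca_project(words, trigger2vec):
    # Keep only the words we actually have vectors for, then project to 2-D.
    kept = [w for w in words if w in trigger2vec]
    vectors = np.array([trigger2vec[w] for w in kept])
    twodim = PCA(n_components=2).fit_transform(vectors)
    return kept, twodim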