Example #1
0
def unit_test(labels):
    """Smoke-test check_relation on the ('Euro', 'Estonia') label sequence.

    Appends a 'test' marker and a currency 'end' marker to the pair's
    label list in place, re-sorts it, prints the resulting sequence, then
    queries two mention dates and prints the relation found for each.
    """
    key = ('Euro', 'Estonia')
    seq = labels[key]
    seq.append(time_signature('2012-01-01', relation='test'))
    seq.append(
        time_signature('2015-01-01', relation='currency', node_type='end'))
    seq.sort()

    print([(t.time, t.relation, t.type) for t in labels[key]])

    queries = [
        time_signature('_'.join(['2015', '01', '01']),
                       relation='NA',
                       node_type='mention'),
        time_signature('_'.join(['2015', '11', '01']),
                       relation='NA',
                       node_type='mention'),
    ]
    for query in queries:
        print(check_relation(labels[key], query))
Example #2
0
def check_relation(label, x):
    """Return the relation name that is active at the query time ``x``.

    label : list of time_signature markers, assumed sorted by time, in
        which each relation contributes a paired 'start' and 'end' node.
    x : a time_signature query (node_type 'mention'); only its ordering
        relative to the markers is used.

    Walks the markers chronologically, pushing 'start' nodes and popping
    on a matching 'end'; 'end' nodes that arrive before their 'start'
    reaches the stack top are parked in ``unpop``. The first marker not
    earlier than ``x`` triggers returning the stack-top relation.
    Returns None if every marker precedes ``x`` — presumably callers rely
    on the 9999-99-99 sentinel appended by create_labels; TODO confirm.
    """
    # todo : need to add negation to each label.
    # label : a list of time_signature
    # x : a query for its label
    stack = Stack()
    unpop = []
    # NOTE(review): `modified` is never reassigned in this function, so the
    # `if not modified` branch below is unreachable as written — confirm
    # whether a toggle was lost in an edit.
    modified = True
    for node in label:
        if node < x:
            if node.type == 'start':
                stack.push(node)
            if node.type == 'end':
                if node.relation == stack.peek().relation:
                    stack.pop()
                    # Retire any parked ends whose start is now on top.
                    while (unpop and unpop[-1] == stack.peek().relation):
                        stack.pop()
                        unpop = unpop[:-1]
                else:
                    # End arrived before its start reached the top; park it.
                    unpop.append(node.relation)
        else:
            if not modified:
                # Dead while `modified` is always True (see NOTE above):
                # would re-open a relation interrupted by a NOT_* sentinel.
                if node.type == 'end' and stack.peek(
                ).relation[:3] == 'NOT' and node.relation[:3] != 'NOT':
                    stack.push(
                        time_signature('0000-00-00', relation=node.relation))
                if stack.size() == 1:
                    rel = node.relation
                else:
                    rel = stack.peek().relation
                return rel
            else:
                rel = stack.peek().relation
                return rel
Example #3
0
def construct_dataset(file_path,
                      labels,
                      w_to_ix,
                      train_test='train',
                      en2id=None,
                      save_wiki_time_path=''):
    """Build the per-entity-pair mention dataset for one split and pickle it.

    Parameters
    ----------
    file_path : str
        Text file where each line is
        ``id,en1,en2,pos1,pos2,year,month,day,sentence`` (comma-separated,
        the sentence last; split at most 8 times so it may contain commas).
    labels : dict
        (en1, en2) -> time-sorted list of time_signature markers
        (as produced by create_labels).
    w_to_ix : mapping
        word -> index; words mapped to a falsy index become 'UNK'.
    train_test : str
        Split name used in the output pickle filename.
    en2id : dict or None
        Optional entity-name -> integer-id map, extended in place.
    save_wiki_time_path : str
        If non-empty, also dump per-mention wiki-time info there.

    Returns
    -------
    (mentions, rel_to_ix, natural, en2labels)
    """
    # Reverse synonym map (surface form -> entity label). Loaded for parity
    # with earlier versions; entity names are currently used verbatim below.
    with open('./origin_data/r_synonym.dat', 'rb') as f:
        r_synonym = pickle.load(f)
    print('Reading reverse synonym done!')

    # Read relation -> index table (two alternative vocabularies exist).
    rel_57 = True
    if rel_57:
        rel2ix_path = "./origin_data/rel2ix_temporal.txt"
    else:
        rel2ix_path = "./origin_data/rel2ix_temporal_v2.txt"
    rel_to_ix = defaultdict(set_default)
    with open(rel2ix_path, 'r') as f:
        lines = f.readlines()
    for line in lines:
        tmp = line.split()
        # Relation names can contain whitespace; the last token is the index.
        rel, ix = "_".join(tmp[:-1]), int(tmp[-1])
        rel_to_ix[rel] = ix
    print('Reading rel_to_ix done!')

    mentions = defaultdict(list)       # (en1, en2) -> encoded Mention list
    natural = defaultdict(list)        # (en1, en2) -> raw-word Mention list
    en2labels = defaultdict(list)      # (en1, en2) -> its label sequence
    mention_filter = defaultdict(set)  # dedup: sentence tuples already seen

    with open(file_path, 'r', encoding='utf8') as f:
        lines = f.readlines()

    debug = False
    outputs = dict()
    count = 0
    for line in lines:
        line = line.split(',', maxsplit=8)
        # line[0] is a record id and is ignored.
        en1, en2, pos1, pos2 = line[1:5]
        year, month, day = line[5:8]
        sent = line[8].split()

        # Synonym expansion is disabled: each mention maps only to itself.
        en1_list = [
            en1,
        ]
        en2_list = [
            en2,
        ]

        for en1, en2 in product(en1_list, en2_list):
            # Skip exact duplicate sentences for this pair.
            # NOTE(review): this lookup uses the pre-swap string pair, while
            # the .add() below may key on post-en2id integer ids — the two
            # can disagree; confirm the intended dedup semantics.
            if tuple(sent) in mention_filter[(en1, en2)]:
                continue

            # The label table may store the pair in the opposite order.
            if labels[(en2, en1)]:
                en1, en2 = en2, en1
            if not labels[(en1, en2)]:
                continue
            if en1 == '' or en2 == '':
                continue

            en2label = labels[(en1, en2)]
            outputs[(en1, en2)] = []
            tmp = time_signature("-".join([year, month, day]),
                                 node_type='mention')
            # Known-bad pair; skipped explicitly.
            if (en1, en2) == ('netherlands', 'dries_van_agt'):
                continue
            tag = check_relation(en2label, tmp)

            # Relations outside the vocabulary collapse to 'NA'.
            if tag not in rel_to_ix.keys():
                tag = 'NA'
            # Keep both the name and the integer id of the tag.
            tag_name = tag
            tag = rel_to_ix[tag]
            # Human-readable copy, used for inspecting test cases.
            natural[(en1, en2)].append(Mention(tag_name, tmp, sent))

            org_sent = sent
            if not debug:
                # Encode words; a falsy index means "unknown" -> UNK.
                sent = [
                    w_to_ix[word] if w_to_ix[word] else w_to_ix['UNK']
                    for word in sent
                ]
            en_pair_str = (en1, en2)
            if en2id:
                # Fix: register BOTH entities when missing. The original
                # if/elif registered only one, so en2id[en2] below raised
                # KeyError whenever neither entity had an id yet.
                if en1 not in en2id.keys():
                    en2id[en1] = len(en2id)
                if en2 not in en2id.keys():
                    en2id[en2] = len(en2id)
                en1, en2 = en2id[en1], en2id[en2]
            count += 1
            mention_filter[(en1, en2)].add(tuple(sent))
            en2labels[(en1, en2)] = en2label
            mentions[(en1, en2)].append(
                Mention(sent,
                        en_pair_str=en_pair_str,
                        org_sent=org_sent,
                        tag=tag,
                        tag_name=tag_name,
                        pos1=int(pos1),
                        pos2=int(pos2),
                        time=tmp))

    print('mention count : {}'.format(count))
    # Sort each pair's mentions by time and assign dense time ranks:
    # equal times share a rank; rank 0 is reserved for order-embed padding.
    for key, item in mentions.items():
        item.sort()
        rank = 1
        item[0].rank = rank
        for i in range(1, len(item)):
            if item[i].time == item[i - 1].time:
                item[i].rank = rank
            else:
                rank += 1
                item[i].rank = rank
    print('Finish create labels!')

    if debug:
        # Dump entity pairs + tags + sentences for manual inspection.
        output_lines = []
        used = set()
        count = 0
        for en_pair in outputs.keys():
            prev = None
            if (en_pair[1], en_pair[0]) in used:
                continue
            used.add(en_pair)
            en1, en2 = en2id[en_pair[0]], en2id[en_pair[1]]
            tmp = mentions[(en1, en2)] + mentions[(en2, en1)]
            output_lines.append(str(en_pair) + ":\n")
            for mention in tmp:
                output_lines.append(mention.tag_name + '\t' +
                                    str(mention.time.time) + " : \n")
                if prev and prev != mention.tag_name:
                    count += 1
                    # 'NOT_' prefix is 4 chars: only report genuine flips,
                    # not a relation alternating with its own negation.
                    if prev[4:] != mention.tag_name and mention.tag_name[
                            4:] != prev:
                        print(prev, mention.tag_name)
                prev = mention.tag_name
                try:
                    output_lines.append(" ".join(mention.sent) + "\n")
                except Exception:
                    # Fix: was a bare `except:` that also swallowed
                    # KeyboardInterrupt/SystemExit.
                    pdb.set_trace()
            output_lines.append('\n')
        print(count)

        with open('./origin_data/en+label+sent.txt', 'w') as f:
            f.writelines(output_lines)
        print('Writing to outputs!')

    with open("./origin_data/mentions_" + train_test + ".dat", 'wb') as fout:
        pickle.dump(mentions, fout)
    if save_wiki_time_path:
        save_wiki_time(mentions, save_wiki_time_path)
    print('Finish save intermediate results! ')
    return mentions, rel_to_ix, natural, en2labels
Example #4
0
def create_labels():
    """Assemble per-entity-pair temporal label sequences from entities.csv.

    Each (en1, en2) key maps to a time-sorted list of time_signature
    markers ('start'/'end' per relation), bracketed by NOT_* sentinels at
    0000-00-00 and 9999-99-99. The result is pickled to ./data/labels.dat
    and returned.
    """
    # wiki-data -> wiki-pedia alignment and the reverse synonym map are
    # loaded for completeness; entity labels are used directly below.
    with open(data_root + "alignment.dat", 'rb') as f:
        align = pickle.load(f)
    with open(data_root + "r_synonym.dat", 'rb') as f:
        r_synonym = pickle.load(f)

    entities_pair = pd.read_csv(data_root + "entities.csv")

    # Keep the identifying columns and the cleaned start/end time columns.
    base_cols = entities_pair[[
        'entity1', 'entity2', 'entity1Label', 'entity2Label', 'relation_name'
    ]]
    cleaned_start = entities_pair['start_time'].apply(clean)
    cleaned_end = entities_pair['end_time'].apply(clean)
    formal_entities_pair = pd.concat([base_cols, cleaned_start, cleaned_end],
                                     axis=1)

    labels = defaultdict(list)
    for _, row in formal_entities_pair.iterrows():
        # Use entity labels verbatim — no wiki-data alignment here.
        en1 = Normalization(row['entity1Label'])
        en2 = Normalization(row['entity2Label'])
        rel = "_".join(row['relation_name'].split())

        # Keep one canonical ordering per pair: reuse an existing key.
        if (en2, en1) in labels.keys():
            en1, en2 = en2, en1

        # A 'NaN' value (from clean) means the bound is unknown; skip it.
        for column, node_type in (('start_time', 'start'),
                                  ('end_time', 'end')):
            if row[column] != 'NaN':
                labels[(en1, en2)].append(
                    time_signature(row[column],
                                   relation=rel,
                                   node_type=node_type))

    for pair, seq in labels.items():
        seq.sort()
        # Bracket the sequence with negated sentinels so queries before the
        # first / after the last marker resolve to NOT_<relation>.
        first_rel = seq[0].relation
        last_rel = seq[-1].relation
        seq.insert(
            0,
            time_signature('0000-00-00',
                           relation='NOT_' + first_rel,
                           node_type='start'))
        seq.append(
            time_signature('9999-99-99',
                           relation='NOT_' + last_rel,
                           node_type='end'))

    with open('./data/labels.dat', 'wb') as f:
        pickle.dump(labels, f)
    print('Label making done!')
    return labels