def unit_test(labels):
    """Smoke-test check_relation() on the ('Euro', 'Estonia') timeline.

    Mutates the shared label list in place: appends one extra 'test'
    start node and one 'currency' end node, re-sorts the timeline,
    prints it, then queries two mention dates and prints the relation
    check_relation() resolves for each.

    Args:
        labels: mapping of (entity1, entity2) tuples to lists of
            time_signature nodes (as built by create_labels()).
    """
    label = labels[('Euro', 'Estonia')]
    label.append(time_signature('2012-01-01', relation='test'))
    label.append(
        time_signature('2015-01-01', relation='currency', node_type='end'))
    label.sort()
    print([(t.time, t.relation, t.type) for t in labels[('Euro', 'Estonia')]])
    # Fix: join the date parts with '-', not '_'.  Every other
    # time_signature call site uses the YYYY-MM-DD form ('2012-01-01'
    # above, and "-".join([year, month, day]) in construct_dataset);
    # '_'.join produced '2015_01_01', an inconsistent date format.
    tmp1 = time_signature('-'.join(['2015', '01', '01']),
                          relation='NA', node_type='mention')
    tmp2 = time_signature('-'.join(['2015', '11', '01']),
                          relation='NA', node_type='mention')
    label = labels[('Euro', 'Estonia')]
    print(check_relation(label, tmp1))
    print(check_relation(label, tmp2))
def check_relation(label, x):
    # todo : need to add negation to each label.
    # label : a list of time_signature
    # x : a query for its label
    #
    # Resolve which relation is active at the query time `x`.
    # `label` is assumed to be a sorted timeline of time_signature nodes
    # with type 'start'/'end' bracketing the span a relation holds
    # (see create_labels) -- TODO confirm sortedness is guaranteed by
    # all callers.
    # Walks the timeline left to right, maintaining a stack of open
    # relations; the first node at/after x decides the returned label.
    stack = Stack()
    # Relations whose 'end' node was seen while a *different* relation
    # sat on top of the stack; they are popped lazily once they surface.
    unpop = []
    # Debug/ablation toggle: with True, the simple "top of stack" answer
    # in the final else is used; the `not modified` branch below is
    # currently dead code.
    modified = True
    for node in label:
        if node < x:
            # Node is strictly before the query time: replay it.
            if node.type == 'start':
                stack.push(node)
                # print('push')
            if node.type == 'end':
                if node.relation == stack.peek().relation:
                    # Matching end for the open relation on top: close it,
                    # then drain any deferred ends that now match the top.
                    stack.pop()
                    # print('pop')
                    while (unpop and unpop[-1] == stack.peek().relation):
                        stack.pop()
                        # print('pop')
                        unpop = unpop[:-1]
                else:
                    # End arrived out of order w.r.t. the stack top;
                    # remember it and pop it later.
                    unpop.append(node.relation)
        else:
            # First node at/after x: whatever is open now is the answer.
            if not modified:
                # Dead branch while modified is hard-coded True above.
                # Intended refinement: if a 'NOT_*' negated span is on top
                # but the upcoming end node is a real relation, treat that
                # relation as having been open since the beginning.
                if node.type == 'end' and stack.peek(
                ).relation[:3] == 'NOT' and node.relation[:3] != 'NOT':
                    stack.push(
                        time_signature('0000-00-00', relation=node.relation))
                if stack.size() == 1:
                    rel = node.relation
                else:
                    rel = stack.peek().relation
                return rel
            else:
                # NOTE(review): assumes the stack is non-empty here (the
                # sentinel 'NOT_*' nodes added by create_labels should
                # guarantee at least one open span) -- confirm for labels
                # built elsewhere.
                rel = stack.peek().relation
                return rel
def construct_dataset(file_path, labels, w_to_ix, train_test='train',
                      en2id=None, save_wiki_time_path=''):
    """Build the mention dataset from a CSV-ish text file of sentences.

    For each sentence line, looks up the entity pair's label timeline in
    `labels`, resolves the relation active at the mention date via
    check_relation(), converts words to indices with `w_to_ix`, and
    collects Mention objects.  Results are pickled to
    ./origin_data/mentions_<train_test>.dat as a side effect.

    Args:
        file_path: input text file; each line is comma-separated with
            fields [_, en1, en2, pos1, pos2, year, month, day, sentence].
        labels: (en1, en2) -> sorted list of time_signature nodes.
        w_to_ix: word -> index mapping; falsy lookups map to 'UNK'
            (presumably a defaultdict -- TODO confirm).
        train_test: suffix for the pickled output filename.
        en2id: optional mutable entity-name -> int id map; extended in
            place when new entities are seen.
        save_wiki_time_path: if non-empty, also dump via save_wiki_time().

    Returns:
        (mentions, rel_to_ix, natural, en2labels)
    """
    # import reverse synonym
    # here we got doing the mapping in dataset construction phase
    with open('./origin_data/r_synonym.dat', 'rb') as f:
        r_synonym = pickle.load(f)
    print('Reading reverse synonym done!')
    # read-in rel_to_ix(modified version)
    # Hard-coded switch between the 57-relation vocabulary and v2.
    rel_57 = True
    if rel_57:
        rel2ix_path = "./origin_data/rel2ix_temporal.txt"
    else:
        rel2ix_path = "./origin_data/rel2ix_temporal_v2.txt"
    rel_to_ix = defaultdict(set_default)
    with open(rel2ix_path, 'r') as f:
        lines = f.readlines()
        for line in lines:
            tmp = line.split()
            # Relation names may contain spaces; everything but the last
            # whitespace token is the name, joined back with underscores.
            rel, ix = "_".join(tmp[:-1]), int(tmp[-1])
            rel_to_ix[rel] = ix
    # rel_to_ix['PAD'] = len(rel_to_ix)
    print('Reading rel_to_ix done!')
    mentions = defaultdict(list)      # (en1, en2) -> [Mention] (full objects)
    natural = defaultdict(list)       # (en1, en2) -> [Mention] (raw words)
    en2labels = defaultdict(list)     # (en1, en2) -> label timeline used
    mention_filter = defaultdict(set)  # dedup: seen sentences per pair
    with open(file_path, 'r', encoding='utf8') as f:
        lines = f.readlines()
        debug = False
        outputs = dict()
        count = 0
        for line in lines:
            # count += 1
            # if count > 5:
            #     break
            # maxsplit=8 keeps commas inside the sentence field intact.
            line = line.split(',', maxsplit=8)
            # print(line)
            # extract all infos from train.txt
            # rel, en1, en2, pos1, pos2 = line[1:6]
            # year, month, day = line[6:9]
            # sent = line[9].split()
            en1, en2, pos1, pos2 = line[1:5]
            year, month, day = line[5:8]
            sent = line[8].split()
            # from mentions synonym -> entity label
            # considering multi-mapping
            # en1_list = r_synonym[en1]
            # en2_list = r_synonym[en2]
            # Multi-mapping via r_synonym is disabled: each entity maps
            # only to itself.
            en1_list = [
                en1,
            ]
            en2_list = [
                en2,
            ]
            for en1, en2 in product(en1_list, en2_list):
                # Skip exact duplicate sentences for this pair.
                if tuple(sent) in mention_filter[(en1, en2)]:
                    continue
                # for evaluate manual tagging.
                # pass
                # swap in case en1 and en2 's order may differ
                if labels[(en2, en1)]:
                    en1, en2 = en2, en1
                if not labels[(en1, en2)]:
                    # pdb.set_trace()
                    continue
                    # pass
                if en1 == '' or en2 == '':
                    continue
                en2label = labels[(en1, en2)]
                outputs[(en1, en2)] = []
                tmp = time_signature("-".join([year, month, day]),
                                     node_type='mention')
                # pdb.set_trace()
                # Hard-coded exclusion of one problematic entity pair.
                if (en1, en2) == ('netherlands', 'dries_van_agt'):
                    # pdb.set_trace()
                    continue
                tag = check_relation(en2label, tmp)
                # Unknown relations collapse to 'NA'.
                if tag not in rel_to_ix.keys():
                    # rel_to_ix[tag] = len(rel_to_ix) - 1
                    tag = 'NA'
                # turn tag into int
                tag_name = tag
                tag = rel_to_ix[tag]
                # adding for understand test cases
                # here : sent is words, tagname is relation label
                natural[(en1, en2)].append(Mention(tag_name, tmp, sent))
                # mentions[(en1, en2)].append()
                org_sent = sent
                if not debug:
                    # Replace words by indices; falsy index -> 'UNK'.
                    sent = [
                        w_to_ix[word] if w_to_ix[word] else w_to_ix['UNK']
                        for word in sent
                    ]
                # mentions.append((pos1, pos2, sent, year, month, day, tag))
                en_pair_str = (en1, en2)
                if en2id:
                    if en1 not in en2id.keys():
                        en2id[en1] = len(en2id)
                    elif en2 not in en2id.keys():
                        en2id[en2] = len(en2id)
                    # else:
                    en1, en2 = en2id[en1], en2id[en2]
                count += 1
                # NOTE(review): this add keys the filter by (id, id) and
                # index-converted sent once en2id is set, while the
                # membership check above used raw names/words -- the
                # dedup may therefore never fire in that mode; confirm.
                mention_filter[(en1, en2)].add(tuple(sent))
                en2labels[(en1, en2)] = en2label
                mentions[(en1, en2)].append(
                    Mention(sent,
                            en_pair_str=en_pair_str,
                            org_sent=org_sent,
                            tag=tag,
                            tag_name=tag_name,
                            pos1=int(pos1),
                            pos2=int(pos2),
                            time=tmp))
    print('mention count : {}'.format(count))
    # keep mentions sorted
    for key, item in mentions.items():
        item.sort()
        # order embed padding is 0.
        # Assign 1-based dense ranks by time; equal times share a rank.
        rank = 1
        item[0].rank = rank
        for i in range(1, len(item)):
            if item[i].time == item[i - 1].time:
                item[i].rank = rank
            else:
                rank += 1
                item[i].rank = rank
    print('Finish create labels!')
    if debug:
        # Debug dump: human-readable pair/tag/sentence listing, plus a
        # count of tag transitions within each merged pair sequence.
        output_lines = []
        used = set()
        count = 0
        for en_pair in outputs.keys():
            prev = None
            if (en_pair[1], en_pair[0]) in used:
                continue
            used.add(en_pair)
            en1, en2 = en2id[en_pair[0]], en2id[en_pair[1]]
            tmp = mentions[(en1, en2)] + mentions[(en2, en1)]
            output_lines.append(str(en_pair) + ":\n")
            for mention in tmp:
                output_lines.append(mention.tag_name + '\t' +
                                    str(mention.time.time) + " : \n")
                if prev and prev != mention.tag_name:
                    count += 1
                    # print(prev, mention.tag_name)
                    # [4:] strips a 'NOT_' prefix: only report transitions
                    # that are not a relation flipping to/from its negation.
                    if prev[4:] != mention.tag_name and mention.tag_name[
                            4:] != prev:
                        print(prev, mention.tag_name)
                prev = mention.tag_name
                try:
                    output_lines.append(" ".join(mention.sent) + "\n")
                except:
                    pdb.set_trace()
                    pass
            output_lines.append('\n')
        print(count)
        with open('./origin_data/en+label+sent.txt', 'w') as f:
            f.writelines(output_lines)
        print('Writing to outputs!')
    with open("./origin_data/mentions_" + train_test + ".dat", 'wb') as fout:
        pickle.dump(mentions, fout)
    if save_wiki_time_path:
        save_wiki_time(mentions, save_wiki_time_path)
    print('Finish save intermediate results! ')
    return mentions, rel_to_ix, natural, en2labels
def create_labels():
    """Build per-entity-pair label timelines from entities.csv.

    Reads the entity-pair table, normalizes entity names, and emits for
    each (en1, en2) a sorted list of time_signature nodes: a 'start'
    node at each relation's start_time, an 'end' node at each end_time,
    plus sentinel 'NOT_*' nodes at 0000-00-00 and 9999-99-99 so
    check_relation() always has an open span.  Pickles the result to
    ./data/labels.dat and returns it.
    """
    with open(data_root + "alignment.dat", 'rb') as f:
        # align is the map from wiki-data to wiki-pedia
        align = pickle.load(f)
    with open(data_root + "r_synonym.dat", 'rb') as f:
        r_synonym = pickle.load(f)
    entities_pair = pd.read_csv(data_root + "entities.csv")
    # Normalize the raw start/end time columns via clean() and keep the
    # identifier/label/relation columns alongside them.
    formal_entities_pair = pd.concat([
        entities_pair[[
            'entity1', 'entity2', 'entity1Label', 'entity2Label',
            'relation_name'
        ]], entities_pair['start_time'].apply(clean),
        entities_pair['end_time'].apply(clean)
    ],
                                     axis=1)
    # This is for creating label sequences
    labels = defaultdict(list)
    # pdb.set_trace()
    for ix, row in formal_entities_pair.iterrows():
        # # Maybe there is no alignment for entity in wiki-data
        # try:
        #     en1 = align[row['entity1']]
        # except KeyError:
        #     en1 = row['entity1Label']
        # try:
        #     en2 = align[row['entity2']]
        # except KeyError:
        #     en2 = row['entity2Label']
        # should not use any alignment in this process.
        en1 = row['entity1Label']
        en2 = row['entity2Label']
        # bug is in r_synonym
        # en1_label =
        en1 = Normalization(en1)
        en2 = Normalization(en2)
        rel = "_".join(row['relation_name'].split())
        # Canonicalize pair order: if the reversed pair already has a
        # timeline, append to that one instead of creating a mirror key.
        if (en2, en1) in labels.keys():
            # exchange en1 & en2
            en1, en2 = en2, en1
        # initialization for labels
        # each time signature denotes the end of some relation
        # if not labels[(en1, en2)]:
        #     labels[(en1, en2)].append(time_signature('0000-00-00', relation='NA', node_type='start'))
        #     labels[(en1, en2)].append(time_signature('9999-99-99', relation='NA', node_type='end'))
        # clean() marks missing times as the string 'NaN'; skip those.
        if row['start_time'] != 'NaN':
            labels[(en1, en2)].append(
                time_signature(row['start_time'],
                               relation=rel,
                               node_type='start'))
        if row['end_time'] != 'NaN':
            labels[(en1, en2)].append(
                time_signature(row['end_time'],
                               relation=rel,
                               node_type='end'))
    for key, item in labels.items():
        item.sort()
        # add at last
        # Sentinel spans: negate the earliest relation before its start
        # and the latest relation after its end.
        start_rel = item[0].relation
        end_rel = item[-1].relation
        # todo: check its effect
        item.insert(
            0,
            time_signature('0000-00-00',
                           relation='NOT_' + start_rel,
                           node_type='start'))
        # item.insert(0, time_signature('0000-00-00', relation='NA', node_type='start'))
        item.append(
            time_signature('9999-99-99',
                           relation='NOT_' + end_rel,
                           node_type='end'))
    with open('./data/labels.dat', 'wb') as f:
        pickle.dump(labels, f)
    print('Label making done!')
    return labels