def get_entity_pairs(self):
    """Generate candidate (entity, entity) pairs for TLINK classification.

    Pairs produced:
      * every EVENT paired with each entity that follows it in the same
        sentence,
      * every EVENT paired with the document-creation TIMEX,
      * every TIMEX paired with each EVENT that follows it in the same
        sentence,
      * main-verb EVENTs of a sentence paired with main-verb EVENTs of the
        next sentence.
    Every pair is also emitted in the reverse direction (b -> a as well as
    a -> b).

    Returns:
        list of ((type, id), (type, id)) tuples, where type is "EVENT" or
        "TIMEX" and id is the entity's eid/tid string.
    """
    id_chunk_map, event_ids, timex_ids, sentence_chunks = self.get_id_chunk_map()

    doctime = get_doctime_timex(self.note_path)
    doctime_id = doctime.attrib["tid"]

    entity_pairs = []

    # TODO: make more efficient...
    for sentence_num in sentence_chunks:
        for i, entity in enumerate(sentence_chunks[sentence_num]):
            entity_type, entity_id = entity

            if entity_type == "EVENT":
                # pair the event with every entity that follows it in this
                # sentence, plus the document creation time.
                entity_pairs += [(entity, other)
                                 for other in sentence_chunks[sentence_num][i + 1:]]
                entity_pairs.append((entity, ("TIMEX", doctime_id)))
            else:
                # TIMEX: pair every following event in the sentence with it.
                entity_pairs += [(following, ("TIMEX", entity_id))
                                 for following in sentence_chunks[sentence_num][i + 1:]
                                 if following[0] == "EVENT"]

        # pair main-verb events of this sentence with main-verb events of
        # the adjacent (next) sentence.
        if sentence_num + 1 in sentence_chunks:

            def _main_events(sent_num):
                # events whose chunk contains at least one main-verb token
                return [ent for ent in sentence_chunks[sent_num]
                        if ent[0] == "EVENT"
                        and any(token["is_main_verb"]
                                for token in id_chunk_map[ent[1]])]

            # BUG FIX: the adjacent-sentence filter previously compared the
            # whole (type, id) tuple against the string "EVENT", so it never
            # matched and cross-sentence pairs were never generated.
            entity_pairs += list(itertools.product(_main_events(sentence_num),
                                                   _main_events(sentence_num + 1)))

    # add relations in the other direction (b -> a instead of a -> b)
    bidirectional_pairs = []
    for src, target in entity_pairs:
        bidirectional_pairs.append((src, target))
        bidirectional_pairs.append((target, src))

    return bidirectional_pairs
def write(self, timexEventLabels, tlinkLabels, idPairs, offsets, tokens, output_path): ''' Note::write() Purpose: add annotations this notes tml file and write new xml tree to a .tml file in the output folder. params: timexEventLabels: list of dictionaries of labels for timex and events. tlinkLabels: list labels for tlink relations idPairs: list of pairs of eid or tid that have a one to one correspondance with the tlinkLabels offsets: list of offsets tuples used to locate events and timexes specified by the label lists. Have one to one correspondance with both lists of labels. tokens: tokens in the note (used for tense) output_path: directory to write the file to ''' # TODO: create output directory if it does not exist root = get_stripped_root(self.note_path) length = len(offsets) doc_time = get_doctime_timex(self.note_path).attrib["value"] # hack so events are detected in next for loop. for label in timexEventLabels: if label["entity_label"][0:2] not in [ "B_", "I_", "O" ] or label["entity_label"] in ["I_STATE", "I_ACTION"]: label["entity_label"] = "B_" + label["entity_label"] # start at back of document to preserve offsets until they are used for i in range(1, length + 1): index = length - i if timexEventLabels[index]["entity_label"][0:2] == "B_": start = offsets[index][0] end = offsets[index][1] entity_tokens = tokens[index]["token"] #grab any IN tokens and add them to the tag text for j in range(1, i): if (timexEventLabels[index + j]["entity_label"][0:2] == "I_"): end = offsets[index + j][1] entity_tokens += ' ' + tokens[index + j]["token"] else: break if timexEventLabels[index]["entity_type"] == "TIMEX3": # get the time norm value of the time expression # timex_value = get_normalized_time_expressions(doc_time, [entity_tokens]) timex_value = '' # if no value was returned, set the expression to an empty string # TODO: check if TimeML has a specific default value we should use here if len(timex_value) != 0: timex_value = timex_value[0] else: timex_value = '' # if 
None in [start, end, timexEventLabels[index]["entity_id"], timexEventLabels[index]["entity_label"][2:], timex_value]: # print "FOUND NoNE" # print [start, end, timexEventLabels[index]["entity_id"], timexEventLabels[index]["entity_label"][2:], timex_value] # # exit() # else: # print "NONE NONE" # print [start, end, timexEventLabels[index]["entity_id"], timexEventLabels[index]["entity_label"][2:], timex_value] annotated_text = annotate_text_element( root, "TIMEX3", start, end, { "tid": timexEventLabels[index]["entity_id"], "type": timexEventLabels[index]["entity_label"][2:], "value": timex_value }) else: annotated_text = annotate_text_element( root, "EVENT", start, end, { "eid": timexEventLabels[index]["entity_id"], "class": timexEventLabels[index]["entity_label"][2:] }) #if None in [start, end, timexEventLabels[index]["entity_id"], timexEventLabels[index]["entity_label"][2:], timex_value]: # print "FOUND NoNE" # print [start, end, timexEventLabels[index]["entity_id"], timexEventLabels[index]["entity_label"][2:], timex_value] # exit() #else: # print "NONE NONE" # print [start, end, timexEventLabels[index]["entity_id"], timexEventLabels[index]["entity_label"][2:], timex_value] set_text_element(root, annotated_text) # make event instances eventDict = {} for i, timexEventLabel in enumerate(timexEventLabels): token = tokens[i] pos = None # pos # if token["pos_tag"] == "IN": # pos = "PREPOSITION" # elif token["pos_tag"] in ["VB", "VBD","VBG", "VBN", "VBP", "VBZ", "RB", "RBR", "RBS"]: # pos = "VERB" # elif token["pos_tag"] in ["NN", "NNS", "NNP", "NNPS", "PRP", "PRP$"]: # pos = "NOUN" # elif token["pos_tag"] in ["JJ", "JJR", "JJS"]: # pos = "ADJECTIVE" # else: # pos = "OTHER" if timexEventLabel["entity_type"] == "EVENT": root = annotate_root( root, "MAKEINSTANCE", { "eventID": timexEventLabel["entity_id"], "eiid": "ei" + str(i), "tense": "NONE", "pos": "NONE" }) eventDict[timexEventLabel["entity_id"]] = "ei" + str(i) # add tlinks for i, tlinkLabel in 
enumerate(tlinkLabels): if tlinkLabel == "None": continue annotations = {"lid": "l" + str(i), "relType": tlinkLabel} firstID = idPairs[i][0] secondID = idPairs[i][1] if firstID[0] == "e": annotations["eventInstanceID"] = eventDict[firstID] if firstID[0] == "t": annotations["timeID"] = firstID if secondID[0] == "e": annotations["relatedToEventInstance"] = eventDict[secondID] if secondID[0] == "t": annotations["relatedToTime"] = secondID root = annotate_root(root, "TLINK", annotations) note_path = os.path.join(output_path, self.note_path.split('/')[-1] + ".tml") print "root: ", root print "note_path: ", note_path write_root_to_file(root, note_path)
def write(self, timexEventLabels, tlinkLabels, idPairs, offsets, tokens, output_path): ''' Note::write() Purpose: add annotations this notes tml file and write new xml tree to a .tml file in the output folder. params: timexEventLabels: list of dictionaries of labels for timex and events. tlinkLabels: list labels for tlink relations idPairs: list of pairs of eid or tid that have a one to one correspondance with the tlinkLabels offsets: list of offsets tuples used to locate events and timexes specified by the label lists. Have one to one correspondance with both lists of labels. tokens: tokens in the note (used for tense) output_path: directory to write the file to ''' # TODO: create output directory if it does not exist root = get_stripped_root(self.note_path) length = len(offsets) doc_time = get_doctime_timex(self.note_path).attrib["value"] # hack so events are detected in next for loop. for label in timexEventLabels: if label["entity_label"][0:2] not in ["B_","I_","O"] or label["entity_label"] in ["I_STATE", "I_ACTION"]: label["entity_label"] = "B_" + label["entity_label"] # start at back of document to preserve offsets until they are used for i in range(1, length+1): index = length - i if timexEventLabels[index]["entity_label"][0:2] == "B_": start = offsets[index][0] end = offsets[index][1] entity_tokens = tokens[index]["token"] #grab any IN tokens and add them to the tag text for j in range (1, i): if(timexEventLabels[index + j]["entity_label"][0:2] == "I_"): end = offsets[index + j][1] entity_tokens += ' ' + tokens[index + j]["token"] else: break if timexEventLabels[index]["entity_type"] == "TIMEX3": # get the time norm value of the time expression # timex_value = get_normalized_time_expressions(doc_time, [entity_tokens]) timex_value = '' # if no value was returned, set the expression to an empty string # TODO: check if TimeML has a specific default value we should use here if len(timex_value) != 0: timex_value = timex_value[0] else: timex_value = '' # if None 
in [start, end, timexEventLabels[index]["entity_id"], timexEventLabels[index]["entity_label"][2:], timex_value]: # print "FOUND NoNE" # print [start, end, timexEventLabels[index]["entity_id"], timexEventLabels[index]["entity_label"][2:], timex_value] # # exit() # else: # print "NONE NONE" # print [start, end, timexEventLabels[index]["entity_id"], timexEventLabels[index]["entity_label"][2:], timex_value] annotated_text = annotate_text_element(root, "TIMEX3", start, end, {"tid": timexEventLabels[index]["entity_id"], "type":timexEventLabels[index]["entity_label"][2:], "value":timex_value}) else: annotated_text = annotate_text_element(root, "EVENT", start, end, {"eid": timexEventLabels[index]["entity_id"], "class":timexEventLabels[index]["entity_label"][2:]}) #if None in [start, end, timexEventLabels[index]["entity_id"], timexEventLabels[index]["entity_label"][2:], timex_value]: # print "FOUND NoNE" # print [start, end, timexEventLabels[index]["entity_id"], timexEventLabels[index]["entity_label"][2:], timex_value] # exit() #else: # print "NONE NONE" # print [start, end, timexEventLabels[index]["entity_id"], timexEventLabels[index]["entity_label"][2:], timex_value] set_text_element(root, annotated_text) # make event instances eventDict = {} for i, timexEventLabel in enumerate(timexEventLabels): token = tokens[i] pos = None # pos # if token["pos_tag"] == "IN": # pos = "PREPOSITION" # elif token["pos_tag"] in ["VB", "VBD","VBG", "VBN", "VBP", "VBZ", "RB", "RBR", "RBS"]: # pos = "VERB" # elif token["pos_tag"] in ["NN", "NNS", "NNP", "NNPS", "PRP", "PRP$"]: # pos = "NOUN" # elif token["pos_tag"] in ["JJ", "JJR", "JJS"]: # pos = "ADJECTIVE" # else: # pos = "OTHER" if timexEventLabel["entity_type"] == "EVENT": root = annotate_root(root, "MAKEINSTANCE", {"eventID": timexEventLabel["entity_id"], "eiid": "ei" + str(i), "tense":"NONE", "pos":"NONE"}) eventDict[timexEventLabel["entity_id"]] = "ei" + str(i) # add tlinks for i, tlinkLabel in enumerate(tlinkLabels): if tlinkLabel == 
"None": continue annotations = {"lid": "l" + str(i), "relType": tlinkLabel} firstID = idPairs[i][0] secondID = idPairs[i][1] if firstID[0] == "e": annotations["eventInstanceID"] = eventDict[firstID] if firstID[0] == "t": annotations["timeID"] = firstID if secondID[0] == "e": annotations["relatedToEventInstance"] = eventDict[secondID] if secondID[0] == "t": annotations["relatedToTime"] = secondID root = annotate_root(root, "TLINK", annotations) note_path = os.path.join(output_path, self.note_path.split('/')[-1] + ".tml") print "root: ", root print "note_path: ", note_path write_root_to_file(root, note_path)
def get_id_chunk_map(self):
    """Group labeled tokens into entity chunks, keyed by entity id.

    Walks every sentence's tokens alongside their predicted labels. Each
    EVENT token becomes its own single-token chunk; TIMEX tokens are
    accumulated across B_/I_ label prefixes into multi-token chunks. The
    document-creation timex is appended at the end as an extra TIMEX chunk.

    Returns:
        (id_chunk_map, event_ids, timex_ids, sentence_chunks) where
        id_chunk_map maps entity id -> list of token dicts,
        event_ids / timex_ids are sets of eids / tids, and
        sentence_chunks maps sentence number -> list of (type, id) tuples
        in token order.
    """
    event_ids = set()
    timex_ids = set()
    chunks = []
    current_chunk = []      # timex chunk being accumulated
    current_ids = []        # entity ids parallel to current_chunk
    id_chunks = []
    start_entity_id = None  # id that opened the current timex chunk
    id_chunk_map = {}
    B_seen = False          # whether any B_ label has been observed yet
    sentence_chunks = {}

    # get tagged entities and group into a list
    for sentence_num, labels in zip(self.pre_processed_text, self.get_labels()):
        sentence_chunks[sentence_num] = []

        for token, label in zip(self.pre_processed_text[sentence_num], labels):
            entity_id = label["entity_id"]

            if label["entity_type"] == "EVENT":
                # events are single-token chunks
                event_chunk = [token]
                chunks.append(event_chunk)
                event_ids.add(entity_id)
                id_chunks.append([entity_id])
                # TODO: gonna drop multi span events...
                assert entity_id not in id_chunk_map
                id_chunk_map[entity_id] = event_chunk
                sentence_chunks[sentence_num].append(("EVENT", entity_id))

            elif label["entity_label"].startswith("B_"):
                # start of a timex: flush any chunk in progress first
                timex_ids.add(entity_id)
                if current_chunk:
                    chunks.append(current_chunk)
                    id_chunks.append(current_ids)
                    assert start_entity_id not in id_chunk_map
                    id_chunk_map[start_entity_id] = current_chunk
                    sentence_chunks[sentence_num].append(("TIMEX", start_entity_id))
                    current_chunk = [token]
                    current_ids = [entity_id]
                else:
                    current_chunk.append(token)
                    current_ids.append(entity_id)
                start_entity_id = entity_id
                B_seen = True

            elif label["entity_label"].startswith("I_"):
                # continuation of the open timex chunk
                assert entity_id == start_entity_id, "{} != {}, B_seen is {}".format(entity_id, start_entity_id, B_seen)
                current_chunk.append(token)
                current_ids.append(entity_id)

            else:
                # O label: nothing to chunk
                pass

        # sentence boundary: flush any still-open timex chunk
        if current_chunk:
            chunks.append(current_chunk)
            assert len(current_ids) == len(current_chunk)
            id_chunks.append(current_ids)
            assert start_entity_id not in id_chunk_map
            id_chunk_map[start_entity_id] = current_chunk
            sentence_chunks[sentence_num].append(("TIMEX", start_entity_id))
            current_chunk = []
            current_ids = []

    all_ids = event_ids.union(timex_ids)
    assert len(all_ids) == len(id_chunks), "{} != {}".format(len(event_ids.union(timex_ids)), len(id_chunks))
    assert len(id_chunk_map.keys()) == len(all_ids), "{} != {}".format(len(id_chunk_map.keys()), len(event_ids.union(timex_ids)))

    # TODO: need to add features for doctime. there aren't any.
    # add doc time. this is a timex.
    doctime = get_doctime_timex(self.note_path)
    doctime_id = doctime.attrib["tid"]
    # dict representation of the doctime timex attributes
    doctime_dict = dict(doctime.attrib)
    id_chunk_map[doctime_id] = [doctime_dict]
    timex_ids.add(doctime_id)

    return id_chunk_map, event_ids, timex_ids, sentence_chunks