def get_entity_pairs(self):
        """
        Build candidate (source, target) entity pairs for relation labeling.

        Pairs generated per sentence:
          - every EVENT with every entity that follows it in the sentence,
            and with the document creation time ("doctime") timex
          - every later EVENT in the sentence with each TIMEX
          - every main-verb EVENT with every main-verb EVENT of the
            following sentence
        Finally every pair is also emitted in the reverse direction, so the
        returned list contains both (a, b) and (b, a).

        Returns:
            list of ((type, id), (type, id)) tuples, type is "EVENT" or "TIMEX".
        """
        id_chunk_map, event_ids, timex_ids, sentence_chunks = self.get_id_chunk_map()

        doctime = get_doctime_timex(self.note_path)
        doctime_id = doctime.attrib["tid"]

        entity_pairs = []

        # TODO: make more efficient...
        for sentence_num in sentence_chunks:

            entities = sentence_chunks[sentence_num]

            for i, entity in enumerate(entities):
                entity_type = entity[0]
                entity_id = entity[1]

                if entity_type == "EVENT":
                    # pair the event with everything after it in the sentence,
                    # plus the document creation time
                    entity_pairs += list(itertools.product([entity], entities[i + 1:]))
                    entity_pairs.append((entity, ("TIMEX", doctime_id)))
                else:
                    # pair every later event in the sentence with this timex
                    later_events = [e for e in entities[i + 1:] if e[0] == "EVENT"]
                    entity_pairs += list(itertools.product(later_events, [("TIMEX", entity_id)]))

            # cross-sentence pairs: main-verb events of adjacent sentences
            if sentence_num + 1 in sentence_chunks:

                # NOTE: named sent_events (not event_ids) so the set unpacked
                # from get_id_chunk_map() above is not clobbered.
                sent_events = [e for e in entities if e[0] == "EVENT"]
                main_events = [e for e in sent_events
                               if any(token["is_main_verb"] for token in id_chunk_map[e[1]])]

                # BUG FIX: the original compared the whole (type, id) tuple
                # against "EVENT" here, which never matched, so cross-sentence
                # pairs were silently never produced.
                adj_events = [e for e in sentence_chunks[sentence_num + 1] if e[0] == "EVENT"]
                adj_main_events = [e for e in adj_events
                                   if any(token["is_main_verb"] for token in id_chunk_map[e[1]])]

                entity_pairs += list(itertools.product(main_events, adj_main_events))

        # add relations in the other direction (b -> a instead of a -> b)
        directed_pairs = []
        for src, target in entity_pairs:
            directed_pairs.append((src, target))
            directed_pairs.append((target, src))

        return directed_pairs
# Beispiel #2
# 0
    def write(self, timexEventLabels, tlinkLabels, idPairs, offsets, tokens,
              output_path):
        '''
        Note::write()

        Purpose: add annotations this notes tml file and write new xml tree to a .tml file in the output folder.

        params:
            timexEventLabels: list of dictionaries of labels for timex and events.
            tlinkLabels: list labels for tlink relations
            idPairs: list of pairs of eid or tid that have a one to one correspondance with the tlinkLabels
            offsets: list of offsets tuples used to locate events and timexes specified by the label lists. Have one to one correspondance with both lists of labels.
            tokens: tokens in the note (used for tense)
            output_path: directory to write the file to
        '''
        # TODO: create output directory if it does not exist
        root = get_stripped_root(self.note_path)
        length = len(offsets)
        doc_time = get_doctime_timex(self.note_path).attrib["value"]

        # hack so events are detected in next for loop.
        for label in timexEventLabels:
            if label["entity_label"][0:2] not in [
                    "B_", "I_", "O"
            ] or label["entity_label"] in ["I_STATE", "I_ACTION"]:
                label["entity_label"] = "B_" + label["entity_label"]

        # start at back of document to preserve offsets until they are used
        for i in range(1, length + 1):
            index = length - i

            if timexEventLabels[index]["entity_label"][0:2] == "B_":
                start = offsets[index][0]
                end = offsets[index][1]
                entity_tokens = tokens[index]["token"]

                #grab any IN tokens and add them to the tag text
                for j in range(1, i):

                    if (timexEventLabels[index +
                                         j]["entity_label"][0:2] == "I_"):
                        end = offsets[index + j][1]
                        entity_tokens += ' ' + tokens[index + j]["token"]
                    else:
                        break

                if timexEventLabels[index]["entity_type"] == "TIMEX3":
                    # get the time norm value of the time expression
                    # timex_value = get_normalized_time_expressions(doc_time, [entity_tokens])
                    timex_value = ''
                    # if no value was returned, set the expression to an empty string
                    # TODO: check if TimeML has a specific default value we should use here
                    if len(timex_value) != 0:
                        timex_value = timex_value[0]
                    else:
                        timex_value = ''

                # if None in [start, end,  timexEventLabels[index]["entity_id"], timexEventLabels[index]["entity_label"][2:], timex_value]:
                #     print "FOUND NoNE"
                #     print [start, end,  timexEventLabels[index]["entity_id"], timexEventLabels[index]["entity_label"][2:], timex_value]
        #          #      exit()
        # else:
        #     print "NONE NONE"
        #     print [start, end,  timexEventLabels[index]["entity_id"], timexEventLabels[index]["entity_label"][2:], timex_value]

                    annotated_text = annotate_text_element(
                        root, "TIMEX3", start, end, {
                            "tid": timexEventLabels[index]["entity_id"],
                            "type":
                            timexEventLabels[index]["entity_label"][2:],
                            "value": timex_value
                        })
                else:
                    annotated_text = annotate_text_element(
                        root, "EVENT", start, end, {
                            "eid": timexEventLabels[index]["entity_id"],
                            "class":
                            timexEventLabels[index]["entity_label"][2:]
                        })
                    #if None in [start, end,  timexEventLabels[index]["entity_id"], timexEventLabels[index]["entity_label"][2:], timex_value]:
                    #    print "FOUND NoNE"
                    #    print [start, end,  timexEventLabels[index]["entity_id"], timexEventLabels[index]["entity_label"][2:], timex_value]
        #                exit()
        #else:
        #    print "NONE NONE"
        #    print [start, end,  timexEventLabels[index]["entity_id"], timexEventLabels[index]["entity_label"][2:], timex_value]

                set_text_element(root, annotated_text)

        # make event instances
        eventDict = {}
        for i, timexEventLabel in enumerate(timexEventLabels):

            token = tokens[i]

            pos = None

            # pos
            # if token["pos_tag"] == "IN":
            #     pos = "PREPOSITION"
            # elif token["pos_tag"] in ["VB", "VBD","VBG", "VBN", "VBP", "VBZ", "RB", "RBR", "RBS"]:
            #     pos = "VERB"
            # elif token["pos_tag"] in ["NN", "NNS", "NNP", "NNPS", "PRP", "PRP$"]:
            #     pos = "NOUN"
            # elif token["pos_tag"] in ["JJ", "JJR", "JJS"]:
            #     pos = "ADJECTIVE"
            # else:
            #     pos = "OTHER"

            if timexEventLabel["entity_type"] == "EVENT":
                root = annotate_root(
                    root, "MAKEINSTANCE", {
                        "eventID": timexEventLabel["entity_id"],
                        "eiid": "ei" + str(i),
                        "tense": "NONE",
                        "pos": "NONE"
                    })
                eventDict[timexEventLabel["entity_id"]] = "ei" + str(i)

        # add tlinks
        for i, tlinkLabel in enumerate(tlinkLabels):

            if tlinkLabel == "None":
                continue

            annotations = {"lid": "l" + str(i), "relType": tlinkLabel}

            firstID = idPairs[i][0]
            secondID = idPairs[i][1]

            if firstID[0] == "e":
                annotations["eventInstanceID"] = eventDict[firstID]

            if firstID[0] == "t":
                annotations["timeID"] = firstID

            if secondID[0] == "e":
                annotations["relatedToEventInstance"] = eventDict[secondID]

            if secondID[0] == "t":
                annotations["relatedToTime"] = secondID

            root = annotate_root(root, "TLINK", annotations)

        note_path = os.path.join(output_path,
                                 self.note_path.split('/')[-1] + ".tml")

        print "root: ", root
        print "note_path: ", note_path

        write_root_to_file(root, note_path)
    def write(self, timexEventLabels, tlinkLabels, idPairs, offsets, tokens, output_path):
        '''
        Note::write()

        Purpose: add annotations this notes tml file and write new xml tree to a .tml file in the output folder.

        NOTE(review): this is a second definition of write(); because it
        appears later in the class body it is the one that takes effect and
        silently shadows the earlier definition. The two bodies are logically
        identical and differ only in formatting -- one should be deleted.

        params:
            timexEventLabels: list of dictionaries of labels for timex and events.
            tlinkLabels: list labels for tlink relations
            idPairs: list of pairs of eid or tid that have a one to one correspondance with the tlinkLabels
            offsets: list of offsets tuples used to locate events and timexes specified by the label lists. Have one to one correspondance with both lists of labels.
            tokens: tokens in the note (used for tense)
            output_path: directory to write the file to
        '''
        # TODO: create output directory if it does not exist
        root = get_stripped_root(self.note_path)
        length = len(offsets)
        # doc_time is only needed by the disabled normalization call below
        doc_time = get_doctime_timex(self.note_path).attrib["value"]

        # hack so events are detected in next for loop.
        # force a "B_" prefix onto any label not already chunk-tagged
        for label in timexEventLabels:
            if label["entity_label"][0:2] not in ["B_","I_","O"] or label["entity_label"] in ["I_STATE", "I_ACTION"]:
                label["entity_label"] = "B_" + label["entity_label"]

        # start at back of document to preserve offsets until they are used
        for i in range(1, length+1):
            index = length - i

            if timexEventLabels[index]["entity_label"][0:2] == "B_":
                start = offsets[index][0]
                end = offsets[index][1]
                entity_tokens = tokens[index]["token"]

                #grab any IN tokens and add them to the tag text
                # (absorbs following "I_" continuation tokens into this tag)
                for j in range (1, i):

                    if(timexEventLabels[index + j]["entity_label"][0:2] == "I_"):
                        end = offsets[index + j][1]
                        entity_tokens += ' ' + tokens[index + j]["token"]
                    else:
                        break

                if timexEventLabels[index]["entity_type"] == "TIMEX3":
                    # get the time norm value of the time expression
                    # timex_value = get_normalized_time_expressions(doc_time, [entity_tokens])
                    timex_value = ''
                    # if no value was returned, set the expression to an empty string
                    # TODO: check if TimeML has a specific default value we should use here
                    # NOTE(review): with the call above disabled, timex_value
                    # is always '' so the first branch below is unreachable.
                    if len(timex_value) != 0:
                        timex_value = timex_value[0]
                    else:
                        timex_value = ''

                   # if None in [start, end,  timexEventLabels[index]["entity_id"], timexEventLabels[index]["entity_label"][2:], timex_value]:
                   #     print "FOUND NoNE"
                   #     print [start, end,  timexEventLabels[index]["entity_id"], timexEventLabels[index]["entity_label"][2:], timex_value]
        #          #      exit()
                   # else:
                   #     print "NONE NONE"
                   #     print [start, end,  timexEventLabels[index]["entity_id"], timexEventLabels[index]["entity_label"][2:], timex_value]


                    annotated_text = annotate_text_element(root, "TIMEX3", start, end, {"tid": timexEventLabels[index]["entity_id"], "type":timexEventLabels[index]["entity_label"][2:], "value":timex_value})
                else:
                    annotated_text = annotate_text_element(root, "EVENT", start, end, {"eid": timexEventLabels[index]["entity_id"], "class":timexEventLabels[index]["entity_label"][2:]})
                    #if None in [start, end,  timexEventLabels[index]["entity_id"], timexEventLabels[index]["entity_label"][2:], timex_value]:
                    #    print "FOUND NoNE"
                    #    print [start, end,  timexEventLabels[index]["entity_id"], timexEventLabels[index]["entity_label"][2:], timex_value]
        #                exit()
                    #else:
                    #    print "NONE NONE"
                    #    print [start, end,  timexEventLabels[index]["entity_id"], timexEventLabels[index]["entity_label"][2:], timex_value]

                set_text_element(root, annotated_text)

        # make event instances: one MAKEINSTANCE per EVENT label, remembering
        # the eid -> eiid mapping for the TLINK section below
        eventDict = {}
        for i, timexEventLabel in enumerate(timexEventLabels):

            # NOTE(review): token and pos are unused since the pos-tag
            # mapping below was commented out; tense/pos are hard-coded.
            token = tokens[i]

            pos = None

            # pos
           # if token["pos_tag"] == "IN":
           #     pos = "PREPOSITION"
           # elif token["pos_tag"] in ["VB", "VBD","VBG", "VBN", "VBP", "VBZ", "RB", "RBR", "RBS"]:
           #     pos = "VERB"
           # elif token["pos_tag"] in ["NN", "NNS", "NNP", "NNPS", "PRP", "PRP$"]:
           #     pos = "NOUN"
           # elif token["pos_tag"] in ["JJ", "JJR", "JJS"]:
           #     pos = "ADJECTIVE"
           # else:
           #     pos = "OTHER"

            if timexEventLabel["entity_type"] == "EVENT":
                root = annotate_root(root, "MAKEINSTANCE", {"eventID": timexEventLabel["entity_id"], "eiid": "ei" + str(i), "tense":"NONE", "pos":"NONE"})
                eventDict[timexEventLabel["entity_id"]] = "ei" + str(i)

        # add tlinks
        for i, tlinkLabel in enumerate(tlinkLabels):

            if tlinkLabel == "None":
                continue

            annotations = {"lid": "l" + str(i), "relType": tlinkLabel}

            firstID = idPairs[i][0]
            secondID = idPairs[i][1]

            # event ids start with "e" and map to MAKEINSTANCE instance ids;
            # time ids start with "t" and are referenced directly
            if firstID[0] == "e":
                annotations["eventInstanceID"] = eventDict[firstID]

            if firstID[0] == "t":
                annotations["timeID"] = firstID

            if secondID[0] == "e":
                annotations["relatedToEventInstance"] = eventDict[secondID]

            if secondID[0] == "t":
                annotations["relatedToTime"] = secondID

            root = annotate_root(root, "TLINK", annotations)

        note_path = os.path.join(output_path, self.note_path.split('/')[-1] + ".tml")

        print "root: ", root
        print "note_path: ", note_path

        write_root_to_file(root, note_path)
    def get_id_chunk_map(self):
        """
        Group the note's labeled tokens into entity chunks.

        Walks self.pre_processed_text in parallel with self.get_labels()
        (presumably keyed/aligned by sentence number -- the keys are later
        tested with `sentence_num + 1 in sentence_chunks`; verify against
        callers) and builds:

            id_chunk_map:    entity id -> list of token dicts in that chunk
            event_ids:       set of EVENT entity ids
            timex_ids:       set of timex entity ids (includes the doctime id)
            sentence_chunks: sentence number -> ordered list of
                             ("EVENT"|"TIMEX", entity_id) tuples

        EVENTs are always single-token chunks here (see the TODO below);
        only timexes are accumulated across B_/I_ labels.
        """
        event_ids = set()
        timex_ids = set()

        chunks = []
        chunk = []          # tokens of the timex chunk currently being built

        id_chunk = []       # entity ids parallel to `chunk`
        id_chunks = []

        # id of the most recent B_-labeled timex; the open `chunk` is
        # flushed into id_chunk_map under this id
        start_entity_id = None

        id_chunk_map = {}

        B_seen = False      # only used in the I_-label assertion message

        sentence_chunks = {}

        # get tagged entities and group into a list
        for sentence_num, labels in zip(self.pre_processed_text, self.get_labels()):

            sentence_chunks[sentence_num] = []

            for token, label in zip(self.pre_processed_text[sentence_num], labels):

                if label["entity_type"] == "EVENT":

                    # events are recorded immediately as one-token chunks and
                    # never go through the B_/I_ accumulation below
                    _chunk = [token]
                    chunks.append(_chunk)

                    event_ids.add(label["entity_id"])

                    id_chunks.append([label["entity_id"]])

                    # TODO: gonna drop multi span events...
                    assert label["entity_id"] not in id_chunk_map

                    id_chunk_map[label["entity_id"]] = _chunk

                    sentence_chunks[sentence_num].append(("EVENT", label["entity_id"]))

                # start of timex
                elif re.search('^B_', label["entity_label"]):

                    timex_ids.add(label["entity_id"])

                    # a B_ label while a chunk is open means the previous
                    # timex is complete: flush it, then start a new chunk
                    if len(chunk) != 0:
                        chunks.append(chunk)
                        id_chunks.append(id_chunk)

                        assert start_entity_id not in id_chunk_map

                        #print "TIMEX: adding to id_chunk _map"
                        #print "\t", label

                        id_chunk_map[start_entity_id] = chunk

                        #if start_entity_id is None:
                        #    print "start_entity_id is NONE"
                        #    print label

                        sentence_chunks[sentence_num].append(("TIMEX", start_entity_id))

                        chunk = [token]
                        id_chunk = [label["entity_id"]]


                    else:
                        # first timex token seen since the last flush
                        chunk.append(token)
                        id_chunk.append(label["entity_id"])

                    start_entity_id = label["entity_id"]

                    B_seen = True

                # in timex chunk: continuation tokens must carry the same
                # entity id as the B_ token that opened the chunk
                elif re.search('^I_', label["entity_label"]):

                    assert label["entity_id"] == start_entity_id, "{} != {}, B_seen is {}".format(label["entity_id"], start_entity_id, B_seen)

                    chunk.append(token)
                    id_chunk.append(label["entity_id"])

                else:
                    # O label (or anything untagged): not part of any chunk
                    pass

            # end of sentence: flush any timex chunk still open
            if len(chunk) != 0:
                chunks.append(chunk)
                assert len(id_chunk) == len(chunk)
                id_chunks.append(id_chunk)

                assert start_entity_id not in id_chunk_map
                id_chunk_map[start_entity_id] = chunk

                sentence_chunks[sentence_num].append(("TIMEX", start_entity_id))

            chunk = []
            id_chunk = []

        # sanity: every entity id produced exactly one chunk
        assert len(event_ids.union(timex_ids)) == len(id_chunks), "{} != {}".format(len(event_ids.union(timex_ids)), len(id_chunks))
        assert len(id_chunk_map.keys()) == len(event_ids.union(timex_ids)), "{} != {}".format(len(id_chunk_map.keys()), len(event_ids.union(timex_ids)))

        # TODO: need to add features for doctime. there aren't any.
        # add doc time. this is a timex.
        doctime = get_doctime_timex(self.note_path)
        doctime_id = doctime.attrib["tid"]
        doctime_dict = {}

        # create dict representation of doctime timex
        for attrib in doctime.attrib:
            doctime_dict[attrib] = doctime.attrib[attrib]

        id_chunk_map[doctime_id] = [doctime_dict]
        timex_ids.add(doctime_id)


        return id_chunk_map, event_ids, timex_ids, sentence_chunks