Example #1
def add_learnit_relations(filepath, docs):
    with codecs.open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            tokens = line.strip().split('\t')
            relation_name = tokens[0]
            docid = tokens[1]
            arg1_start = int(tokens[2])
            arg1_end = int(tokens[3])
            arg2_start = int(tokens[4])
            arg2_end = int(tokens[5])

            arg1_text = re.search(r'<SLOT0>(.*?)</SLOT0>', tokens[6]).group(1)
            arg2_text = re.search(r'<SLOT1>(.*?)</SLOT1>', tokens[6]).group(1)

            # map LearnIt relation names onto the canonical labels used downstream
            if relation_name == 'causes':
                relation_name = 'cause'
            elif relation_name == 'affects':
                relation_name = 'precondition_of'
            elif relation_name == 'occurs_before':
                relation_name = 'occurs_before'  # already canonical

            r = Relation(relation_name)
            #r.connective_text = eg['connective_text']
            r.docid = docid
            #r.relation_type = eg['relation_type']

            r.add_arg1_span(IntPair(arg1_start, arg1_end))
            r.add_arg2_span(IntPair(arg2_start, arg2_end))
            r.arg1_text = arg1_text
            r.arg2_text = arg2_text
            docs[docid].append(r)
Example #2
def read_pdtb_json(filename, causal_model, flip_args_enabled=False):
    doc_relations = defaultdict(list)

    with codecs.open(filename, 'r', encoding='utf-8') as f:
        json_data = json.load(f)

    for eg in json_data:
        if eg is not None:

            semantic_class = eg['semantic_class']
            r = Relation(semantic_class)
            flip_args = False
            if 'connective_text' in eg:
                r.connective_text = eg['connective_text']
                if flip_args_enabled and r.connective_text.lower() in (
                        "after", "as", "as long as", "because", "insofar as",
                        "now that", "once", "since", "when", "when and if"):
                    flip_args = True

            arg1_spans = eg['arg1_span_list']
            arg1_text = eg['arg1_text']

            arg2_spans = eg['arg2_span_list']
            arg2_text = eg['arg2_text']

            if flip_args:
                arg1_spans, arg2_spans = arg2_spans, arg1_spans
                arg1_text, arg2_text = arg2_text, arg1_text

            docid = eg['docid']
            if '.' in docid:
                docid = re.search(r'^(.*)\.(.*)$', docid).group(1)

            r.model = causal_model

            r.docid = docid
            r.relation_type = eg['relation_type']

            for span in arg1_spans:
                offset = IntPair(int(span[0]), int(span[1]))
                r.add_arg1_span(offset)
            for span in arg2_spans:
                offset = IntPair(int(span[0]), int(span[1]))
                r.add_arg2_span(offset)

            r.arg1_text = arg1_text
            r.arg2_text = arg2_text

            if 'cause_sentence' in eg:
                r.sentence = eg['cause_sentence']

            doc_relations[r.docid].append(r)

    return doc_relations
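For orientation, this is roughly the JSON shape that read_pdtb_json iterates over, inferred only from the keys the loop accesses above; the values below are invented placeholders, not real data:

[
    {
        "docid": "sample_doc.txt",
        "semantic_class": "Contingency.Cause",
        "relation_type": "Explicit",
        "connective_text": "because",
        "arg1_span_list": [[132, 158]],
        "arg1_text": "the dam was not maintained",
        "arg2_span_list": [[163, 182]],
        "arg2_text": "the village flooded",
        "cause_sentence": "The village flooded because the dam was not maintained."
    }
]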
Example #3
def to_sentence(text, start, end):
    """Converts a sentence raw text to a Sentence object."""
    charOffsets = IntPair(start, end)
    tokens = []

    offset = start
    for t in text.split():
        token = Token(IntPair(offset, offset + len(t)), t)
        tokens.append(token)
        offset += len(t) + 1  # +1 to account for white-space

    return Sentence(charOffsets, text, tokens)
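The whitespace-based offset arithmetic in to_sentence can be checked in isolation. The helper below is a hypothetical standalone sketch that reproduces the same start/end computation without the project's Token and Sentence classes:

def whitespace_token_offsets(text, start):
    """Return (start, end) character offsets of whitespace-delimited tokens,
    assuming tokens are separated by single spaces (as to_sentence assumes)."""
    offsets = []
    offset = start
    for t in text.split():
        offsets.append((offset, offset + len(t)))
        offset += len(t) + 1  # +1 to account for the single space separator
    return offsets

# e.g. whitespace_token_offsets('the cat sat', 10) == [(10, 13), (14, 17), (18, 21)]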
Example #4
    def sentence_segmention_and_tokenization_with_text(self, model):
        """Whatever model we pass in, must be able to perform sentence segmentation and tokenization
        by calling model(self.text). We typically use Spacy
        """
        doc = model(self.text)

        for sent_index, sent in enumerate(doc.sents):
            tokens = []
            for token_index, token in enumerate(sent):
                start = token.idx
                end = token.idx + len(token.text)
                tokens.append(Token(IntPair(start, end), token.text, token, token_index))
            sentence = Sentence(self.docid, IntPair(sent.start_char, sent.end_char), sent.text.strip(), tokens, sent_index)
            self.sentences.append(sentence)
Example #5
def line_to_predictions(ner_fea, dec, json_eg, attr, content_type,
                        word_embeddings, trigger_generator, trigger_model,
                        arg_generator):
    """
    :type word_embeddings: embeddings.word_embeddings.WordEmbedding
    :type trigger_generator: event.event_trigger.EventTriggerGenerator
    :type trigger_model: model.event_cnn.EventExtractionModel
    :type arg_generator: event.event_argument.EventArgumentGenerator
    """
    global spacy_en

    content = find(attr, json_eg)  # json_eg.get(attr)

    #print(content_type.encode('ascii', 'ignore'))
    #print(content.encode('ascii', 'ignore'))

    offset = 0
    all_predictions = []

    if content is not None:
        if type(content) is list:
            content = '\n'.join(content)
        for line in content.split('\n'):
            #print(offset)
            #print('[' + content_type.encode('ascii', 'ignore') + ']')
            #print('[' + line.encode('ascii', 'ignore') + ']')

            doc_ner_predictions = []
            sentences = get_sentences(line, content_type)
            if sentences is not None:
                for sent in sentences:
                    sent_predictions = decode_sentence(ner_fea, dec, content,
                                                       sent, offset,
                                                       content_type)
                    doc_ner_predictions.extend(sent_predictions)
                    all_predictions.extend(sent_predictions)

            if content_type == 'Post':
                doc = Document('dummy', line)
                for i, p in enumerate(doc_ner_predictions):
                    id = 'em-{}'.format(i)
                    doc.add_entity_mention(
                        EntityMention(id, IntPair(p['start'], p['end']),
                                      p['text'], p['label']))
                doc.annotate_sentences(spacy_en, word_embeddings)

                (trigger_examples, trigger_data, trigger_data_list,
                 trigger_label) = generate_trigger_data_feature(
                     trigger_generator, [doc])
                trigger_predictions = trigger_model.predict(trigger_data_list)

            offset += len(line) + 1  # +1 to account for newline

    # a list of dict, one for each predicted NE mention
    if len(all_predictions) > 0:
        if not "extractions" in json_eg:
            json_eg["extractions"] = {}
        json_eg['extractions'][attr] = all_predictions

    return json_eg
Example #6
    def process_events(cls, doc, document_node):
        """
        :type doc: text.text_theory.Document
        :type document_node: xml.etree.ElementTree.Element
        """
        for event_node in document_node.findall('event'):
            event_id = event_node.attrib['ID']
            event_type = event_node.attrib['TYPE']
            event_subtype = event_node.attrib['SUBTYPE']
            #for event_argument_node in event_node.findall('event_argument'):
            #    argument = Argument(event_argument_node.attrib['REFID'], event_argument_node.attrib['ROLE'])
            #    event.add_argument(argument)

            for mention_node in event_node.findall('event_mention'):
                mention_id = mention_node.attrib['ID']
                event = Event(mention_id, event_type+'.'+event_subtype)

                anchor = mention_node.find('anchor')
                (text, start, end) = cls.process_xml_charseq(anchor[0])
                event.add_anchor(Anchor(mention_id+'-trigger', IntPair(start, end), text, event_type+'.'+event_subtype))

                for argument_mention_node in mention_node.findall('event_mention_argument'):
                    arg_id = argument_mention_node.attrib['REFID']
                    arg_role = argument_mention_node.attrib['ROLE']
                    arg_em = doc.get_entity_mention_with_id(arg_id)
                    assert arg_em is not None

                    event_arg = EventArgument('{}-a{}'.format(mention_id, event.number_of_arguments()), arg_em, arg_role)
                    event.add_argument(event_arg)
                doc.add_event(event)
Example #7
def read_serif_json(filename):
    ret = defaultdict(list)

    with codecs.open(filename, 'r', encoding='utf-8') as f:
        json_data = json.load(f)

    # these are ACE, KBP, and GENERIC events
    for event in json_data['events'] + json_data['generic_events']:
        start = event['anchor_start']
        end = event['anchor_end']
        event_type = event['event_type']
        text = event['anchor_text']
        span = EventSpan('dummy', IntPair(start, end), text, event_type)
        e = Event(span)
        e.snippet = event['snippet']
        e.docid = event['docid']

        if event_type.startswith('Class-'):
            e.model = event_models.GENERIC
        else:
            e.model = event_models.KBP
        ret[e.docid].append(e)

    for event in json_data['accent_events']:
        event_name = event['event_name']

        args = event['participants']
        source_offset = None
        target_offset = None
        if 'Source' in args:
            source = args['Source']
            source_offset = IntPair(source['head_start_char'],
                                    source['head_end_char'])
        if 'Target' in args:
            target = args['Target']
            target_offset = IntPair(target['head_start_char'],
                                    target['head_end_char'])
        event_offset = offset_from_offsets(source_offset, target_offset)

        span = EventSpan('dummy', event_offset, 'dummy', event_name)
        e = Event(span)
        e.model = event_models.ACCENT
        e.snippet = event['snippet']
        e.docid = event['docid']
        ret[e.docid].append(e)

    return ret
Example #8
 def __init__(self, id, entity_mention, label):
     """:type entity_mention: text.text_span.EntityMention"""
     Span.__init__(
         self,
         IntPair(entity_mention.start_char_offset(),
                 entity_mention.end_char_offset()), entity_mention.text)
     self.id = id
     self.label = label
     self.entity_mention = entity_mention
Example #9
File: idt.py Project: BBN-E/Hume
def print_spacy_sentence_as_conll(sent, entity_mentions, offset):
    all_tokens = []

    for token in sent:
        start = token.idx + offset
        end = start + len(token.text)
        all_tokens.append(Token(IntPair(start, end), token.text, token.tag_))	# token.tag_ : POS-tag

    return tokens_to_conll(all_tokens, entity_mentions)
Example #10
 def offset_from_offsets(self, offset1, offset2):
     if offset1 is not None and offset2 is not None:
         c1 = min(offset1.first, offset2.first)
         c2 = max(offset1.second, offset2.second)
         return IntPair(c1, c2)
     elif offset1 is not None:
         return offset1
     elif offset2 is not None:
         return offset2
     else:
         return None
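All of the snippets in this listing build character spans with IntPair(start, end) and read them back through .first and .second (and, in Example #23, .to_string()). The class itself is defined elsewhere in BBN-E/Hume and is not shown here; the following is only a minimal sketch of such a pair-of-offsets helper under that assumption, not the project's actual implementation:

class IntPair(object):
    """Hypothetical minimal stand-in: a pair of character offsets (start, end)."""

    def __init__(self, first, second):
        self.first = first    # start character offset
        self.second = second  # end character offset

    def to_string(self):
        return '({},{})'.format(self.first, self.second)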
Example #11
 def sentence_segmention_and_tokenization_with_list(self, model):
     """Whatever model we pass in, must be able to perform sentence segmentation and tokenization
     by calling model(self.text). We typically use Spacy
     """
     offset = 0
     for ss in self.sentence_strings:
         if len(ss) == 0 or ss.isspace():
             pass
         else:
             for sent in model(ss).sents:  # for each Spacy sentence
                 tokens = []
                 for token_index, token in enumerate(sent):
                     start = offset + token.idx
                     end = start + len(token.text)
                     tokens.append(Token(IntPair(start, end), token.text, token, token_index))
                 sentence = Sentence(self.docid,
                                     IntPair(offset + sent.start_char, offset + sent.start_char + len(sent.text)),
                                     sent.text.strip(), tokens, len(self.sentences))
                 self.sentences.append(sentence)
         offset += len(ss)
Example #12
    def process_times(cls, doc, document_node):
        """
        :type doc: text.text_theory.Document
        :type document_node: xml.etree.ElementTree.Element
        """
        for time_node in document_node.findall('timex2'):
            time_id = time_node.attrib['ID']

            all_mentions = time_node.findall('timex2_mention')
            for mention_node in all_mentions:
                mention_id = mention_node.attrib['ID']
                (text, start, end) = cls.process_xml_charseq(mention_node[0][0])
                em = EntityMention(mention_id, IntPair(start, end), text, 'Time')
                doc.add_entity_mention(em)
Example #13
def read_pdtb_json(filename):
    doc_relations = defaultdict(list)

    with codecs.open(filename, 'r', encoding='utf-8') as f:
        json_data = json.load(f)

    for eg in json_data:
        if eg is not None:
            arg1_spans = eg['arg1_span_list']
            arg1_text = eg['arg1_text']

            arg2_spans = eg['arg2_span_list']
            arg2_text = eg['arg2_text']

            semantic_class = eg['semantic_class']

            docid = eg['docid']
            docid = re.search(r'^(.*)\.(.*)$', docid).group(1)

            r = Relation(semantic_class)
            r.connective_text = eg['connective_text']
            r.docid = docid
            r.relation_type = eg['relation_type']

            for span in arg1_spans:
                offset = IntPair(int(span[0]), int(span[1]))
                r.add_arg1_span(offset)
            for span in arg2_spans:
                offset = IntPair(int(span[0]), int(span[1]))
                r.add_arg2_span(offset)

            r.arg1_text = arg1_text
            r.arg2_text = arg2_text

            doc_relations[r.docid].append(r)

    return doc_relations
Example #14
    def process_values(cls, doc, document_node):
        """
        :type doc: text.text_theory.Document
        :type document_node: xml.etree.ElementTree.Element
        """
        for value_node in document_node.findall('value'):
            value_id = value_node.attrib['ID']
            value_type = value_node.attrib['TYPE']

            all_mentions = value_node.findall('value_mention')
            for mention_node in all_mentions:
                mention_id = mention_node.attrib['ID']
                (text, start, end) = cls.process_xml_charseq(mention_node[0][0])
                em = EntityMention(mention_id, IntPair(start, end), text, value_type)
                doc.add_entity_mention(em)
Example #15
def file_to_document(filepath):
    f = open(filepath, 'r')
    sentences = []

    offset = 0
    for line in f:
        sentence = to_sentence(line, offset, offset + len(line))
        sentences.append(sentence)
        offset += len(line)  # len(line) already includes the trailing newline
    f.close()

    s_strings = [s.label for s in sentences]
    doc_text = "\n".join(s_strings)

    return Document(IntPair(0, offset - 1), doc_text, sentences)
Example #16
    def process_entities(cls, doc, document_node):
        """
        :type doc: text.text_theory.Document
        :type document_node: xml.etree.ElementTree.Element
        """
        all_entities = document_node.findall('entity')
        for entity_node in all_entities:
            entity_id = entity_node.attrib['ID']
            entity_type = entity_node.attrib['TYPE']
            entity_subtype = entity_node.attrib['SUBTYPE']

            all_mentions = entity_node.findall('entity_mention')
            for mention_node in all_mentions:
                mention_id = mention_node.attrib['ID']
                head = mention_node.find('head')
                (text, start, end) = cls.process_xml_charseq(head[0])
                em = EntityMention(mention_id, IntPair(start, end), text, entity_type+'.'+entity_subtype)
                doc.add_entity_mention(em)
Example #17
File: idt.py Project: BBN-E/Hume
def extract_sentence_annotation(text, offset):
    """offset: char offset thus far (excluding xml tags) from prior sentences."""

    start_tag = 0
    end_tag = -1
    raw_text = ''
    entity_mentions = []

    # ignore everything starting from 'REMOVED_URL'
    url_index = text.find(' REMOVED_URL', 0)
    if url_index != -1:
        text = text[0:url_index]

    start_tag = text.find('<ENAMEX', 0)
    while(start_tag != -1):
        raw_text += text[end_tag+1 : start_tag]

        end_tag = text.find('>', start_tag)
        entity_type = re.search(r' TYPE="(.*)"', text[start_tag:end_tag]).group(1)

        start_tag = text.find('</ENAMEX>', end_tag)
        mention_text = text[end_tag+1 : start_tag]

        start = offset+len(raw_text)
        end = offset+len(raw_text)+len(mention_text)
        if '-' in mention_text and entity_type.endswith('DESC'):
            print(('Rejecting %s[%s], because Spacy will split the string into multiple tokens, and DESC should always be just a single word' % (entity_type, mention_text)).encode('utf-8'))
        else:
            (new_mention_text, prefix_length, suffix_length) = strip_mention_text(mention_text)
            if new_mention_text != mention_text:
                print(('Revising %s to %s' % (mention_text, new_mention_text)).encode('utf-8'))
            id = 'm-' + str(start+prefix_length) + '-' + str(end-suffix_length)
            entity_mentions.append(EntityMention(id, IntPair(start+prefix_length, end-suffix_length), new_mention_text, entity_type))

        raw_text += mention_text

        end_tag = text.find('>', start_tag)
        start_tag = text.find('<ENAMEX', end_tag)

    raw_text += text[end_tag+1:]

    return (raw_text, entity_mentions)
Example #18
File: idt.py Project: BBN-E/Hume
def file_to_document(filepath):
    f = codecs.open(filepath, 'r', encoding='utf8')
    sentences = []

    offset = 0
    for line in f:
        (raw_text, entity_mentions) = extract_sentence_annotation(line.strip(), offset)
        sentence = text_span.to_sentence(raw_text, offset, offset + len(raw_text))
        sentence.add_annotation('ENTITY_MENTIONS', entity_mentions)
        sentences.append(sentence)

        offset += len(raw_text) + 1  # +1 to account for newline

    f.close()

    s_strings = [s.label for s in sentences]
    doc_text = "\n".join(s_strings)

    #doc_id = os.path.basename(filepath)
    doc_id = filepath
    return Document(doc_id, IntPair(0, offset-1), doc_text, sentences)
Example #19
    def _read_spans_from_file(self, infile, event_type, text, events=None):
        """Get the positive and negative spans
        Returns:
            list[text.text_span.TextSpan]
        """
        ret = []
        with open(infile, 'r') as f:
            for line in f:
                tokens = line.strip().split()
                span_type = tokens[0]
                start = int(tokens[1])
                end = int(tokens[2]) + 1
                text_string = ' '.join(text[start:end].replace('\n', ' ').strip().split())
                end = start + len(text_string)

                if '<' in text_string or '>' in text_string:
                    print('Skipping annotation of type {}, as it has either "<" or ">"'.format(span_type))
                    continue

                span_offset = IntPair(start, end)

                if span_type == event_type:
                    # if this is a positive span, then we need to make sure we have an event for it
                    if events is not None:
                        found_span = False
                        for event in events:
                            if offset_same(event.event_spans[0].int_pair, span_offset):
                                found_span = True
                                break
                        if found_span:
                            ret.append(TextSpan(span_offset, text_string))
                            self.positive_span_count += 1
                        else:
                            self.discard_span_count += 1
                    else:
                        self.discard_span_count += 1
                elif span_type == 'negative':
                    ret.append(TextSpan(span_offset, text_string))
                    self.negative_span_count += 1
        return ret
Example #20
    def read(self, kb, serif_causal_relation, learnit_causal_relation,
             extra_causal_relation):
        print("CausalRelationReader START")
        count = 0
        #docid_to_relation_list = self.read_causal_relation_json(pdtb_json, self.causal_models.PDTB, flip_args_enabled=True)    # docs with pdtb relations
        #for key in docid_to_relation_list:
        #    count += len(docid_to_relation_list[key])
        #print('count = {}'.format(count))

        docid_to_relation_list = defaultdict(list)

        print("CausalRelationReader READ CAUSAL RELATIONS")
        self.add_serif_causal_relations(serif_causal_relation,
                                        docid_to_relation_list)
        self.add_learnit_causal_relations(learnit_causal_relation,
                                          docid_to_relation_list)
        if extra_causal_relation != "NA":
            self.add_learnit_causal_relations(extra_causal_relation,
                                              docid_to_relation_list)

        count = 0
        for key in docid_to_relation_list:
            count += len(docid_to_relation_list[key])

        # Build Document objects (Document object is a nested class above)
        docid_to_document = dict()
        print("CausalRelationReader READ EVENTS")
        for kb_event in kb.evid_to_kb_event.values():
            model = None

            if kb_event.event_mentions[0].model == "ACCENT":
                model = self.event_models.ACCENT
            elif kb_event.event_mentions[0].model == "KBP":
                model = self.event_models.KBP
            else:
                continue

            for kb_event_mention in kb_event.event_mentions:
                event_type = kb_event_mention.event_type
                event_offset = None
                if kb_event_mention.trigger_start is not None and kb_event_mention.trigger_end is not None:
                    start = kb_event_mention.trigger_start
                    end = kb_event_mention.trigger_end
                    event_offset = IntPair(start, end)
                else:
                    source_offset = None
                    target_offset = None
                    if 'Source' in kb_event_mention.arguments:
                        source = kb_event_mention.arguments['Source']
                        source_offset = IntPair(source[0].head_start_char,
                                                source[0].head_end_char)
                    if 'Target' in kb_event_mention.arguments:
                        target = kb_event_mention.arguments['Target']
                        target_offset = IntPair(target[0].head_start_char,
                                                target[0].head_end_char)
                    event_offset = self.offset_from_offsets(
                        source_offset, target_offset)
                text = kb_event_mention.trigger
                if text is None:
                    text = "dummy"
                snippet = kb_event_mention.snippet
                docid = kb_event_mention.document.id

                # Create local objects
                if docid not in docid_to_document:
                    docid_to_document[docid] = self.Document(docid)

                #print kb_event.id
                #print "Creating event span from " + str(event_offset.first) + " " + str(event_offset.second)

                span = EventSpan('dummy', event_offset, text, event_type)
                e = self.Event(span, kb_event, kb_event_mention)
                e.model = model
                e.snippet = snippet
                e.docid = docid
                docid_to_document[docid].add_event(e)

        print("CausalRelationReader ADD RELATIONS TO DOCUMENTS")
        """:type: Document"""
        for docid, doc in docid_to_document.items():
            if docid in docid_to_relation_list:
                relations = docid_to_relation_list[docid]
                doc.add_relations(relations)

        count_stats = defaultdict(int)
        for docid, doc in docid_to_document.items():
            kb_document = kb.docid_to_kb_document[docid]
            self.find_events_in_doc_relations(doc)
            for relation in doc.causal_relations:

                # e1 and e2 are Event objects above
                e1 = relation.left_factor
                e2 = relation.right_factor

                snippet = None

                # Map to standard type names
                relation_type = relation.label
                if relation.label == "cause" or relation.label == "Contingency.Cause":
                    relation_type = "Cause-Effect"
                elif relation.label == "Contingency.Condition" or relation.label == "precondition_of":
                    relation_type = "Precondition-Effect"
                elif relation.label == "Temporal.Asynchronous" or relation.label == "occurs_before":
                    relation_type = "Before-After"
                elif relation.label == "catalyst_effect":
                    relation_type = "Catalyst-Effect"
                elif relation.label == "cause_effect":
                    relation_type = "Cause-Effect"
                elif relation.label == "mitigating_factor_effect":
                    relation_type = "MitigatingFactor-Effect"
                elif relation.label == "precondition_effect":
                    relation_type = "Precondition-Effect"
                elif relation.label == "preventative_effect":
                    relation_type = "Preventative-Effect"

                left_id = e1.kb_event.id
                right_id = e2.kb_event.id
                relation_id = SharedIDManager.get_in_document_id(
                    "Relation", docid)
                relation_mention_id = SharedIDManager.get_in_document_id(
                    "RelationMention", docid)

                #print("reln: " + relation_type + ", " + left_id + ", " + right_id)

                kb_relation = KBRelation(relation_id, "event-event",
                                         relation_type, left_id, right_id)

                e1_start = int(e1.snippet[1])
                e1_end = int(e1.snippet[2])
                e2_start = int(e2.snippet[1])
                e2_end = int(e2.snippet[2])

                if e1_start == e2_start and e1_end == e2_end:
                    snippet = e1.snippet
                else:
                    combined_snippet = [None, None, None]

                    # Combine snippets into one, these should be adjacent sentences
                    if e1_start < e2_start:
                        combined_snippet[0] = e1.snippet[0] + " " + e2.snippet[0]
                        combined_snippet[1] = e1_start
                    else:
                        combined_snippet[0] = e2.snippet[0] + " " + e1.snippet[0]
                        combined_snippet[1] = e2_start
                    if e1_end > e2_end:
                        combined_snippet[2] = e1_end
                    else:
                        combined_snippet[2] = e2_end
                    snippet = combined_snippet

                kb_relation_mention = KBRelationMention(
                    relation_mention_id, e1.kb_event_mention,
                    e2.kb_event_mention, snippet, kb_document)
                kb_relation_mention.properties["model"] = relation.model
                if relation.pattern is not None:
                    kb_relation_mention.properties["pattern"] = relation.pattern
                if relation.confidence is not None:
                    kb_relation_mention.properties["extraction_confidence"] = relation.confidence

                kb_relation.add_relation_mention(kb_relation_mention)
                kb.add_relation(kb_relation)
Example #21
    def read_causal_relation_json(self,
                                  filename,
                                  causal_model,
                                  flip_args_enabled=False):
        doc_relations = defaultdict(list)

        with codecs.open(filename, 'r', encoding='utf-8') as f:
            try:
                json_data = json.load(f)
            except ValueError as ve:
                print("While loading: " + filename)
                print(str(ve))
                sys.exit(1)

        for eg in json_data:
            if eg is not None:

                semantic_class = eg['semantic_class']
                r = self.Relation(semantic_class)
                flip_args = False
                if 'connective_text' in eg:
                    r.connective_text = eg['connective_text']
                    if flip_args_enabled and r.connective_text.lower() in (
                            "after", "as", "as long as", "because",
                            "insofar as", "now that", "once", "since", "when",
                            "when and if"):
                        flip_args = True

                if 'prob' in eg:
                    r.confidence = float(eg['prob'])

                r.pattern = eg.get('learnit_pattern')

                arg1_spans = eg['arg1_span_list']
                arg1_text = eg['arg1_text']

                arg2_spans = eg['arg2_span_list']
                arg2_text = eg['arg2_text']

                if flip_args:
                    arg1_spans, arg2_spans = arg2_spans, arg1_spans
                    arg1_text, arg2_text = arg2_text, arg1_text

                docid = eg['docid']
                if '.' in docid:
                    docid = re.search(r'^(.*)\.(.*)$', docid).group(1)

                r.model = causal_model

                r.docid = docid
                r.relation_type = eg['relation_type']

                for span in arg1_spans:
                    offset = IntPair(int(span[0]), int(span[1]))
                    r.add_arg1_span(offset)
                for span in arg2_spans:
                    offset = IntPair(int(span[0]), int(span[1]))
                    r.add_arg2_span(offset)

                r.arg1_text = arg1_text
                r.arg2_text = arg2_text

                if 'cause_sentence' in eg:
                    r.sentence = eg['cause_sentence']

                doc_relations[r.docid].append(r)

        return doc_relations
Example #22
def process_enote_file(doc, xml_file, auto_adjust):
    """Parses ENote annotation file to annotated_document.DocumentAnnotation

    :param all_text: raw text corresponding to the xml_file
    :param xml_file: ENote annotation file
    :param docid: string representing docid
    :param auto_adjust: Adjust annotation (start, end) position to match text. Useful if annotation data is noisy.
    :return: document_annotation.DocumentAnnotation
    """
    tree = etree.parse(xml_file)
    root_node = tree.getroot()

    all_text = doc.text

    events_node = root_node.find('dc:Events', NAMESPACES)
    for event_index, event_node in enumerate(events_node):
        event_type = event_node.find('dc:Name', NAMESPACES).text.decode('UTF8')
        event_id = '{}-e{}'.format(doc.docid, event_index)
        event = Event(event_id, event_type)

        candidate_anchors = []
        candidate_arguments = []
        for argument_index, argument_node in enumerate(event_node.find('dc:Arguments', NAMESPACES)):
            argument = EnoteArgument.from_xml_node(argument_node)

            # Skip the argument if it is empty
            if argument is None:
                continue

            start = argument.start
            end = argument.end

            unicode_text = all_text[start:end]
            if all_text[start:end] != argument.text and auto_adjust:
                start, end = utils.find_best_location(all_text, argument.text, start, end)
                unicode_text = all_text[start:end]

            # TODO : we could also treat the following as anchors:
            # - event_type == 'Vulnerability' and argument.name == 'Name'
            # - event_type == 'Exploit' and argument.name == 'Name'
            if argument.name == 'Anchor':
                anchor_id = '{}-t{}'.format(event_id, len(candidate_anchors))
                anchor = Anchor(anchor_id, IntPair(start, end), unicode_text, event_type)
                candidate_anchors.append(anchor)
                #if event.overlaps_with_anchor(anchor):
                #    print('Dropping overlapping anchor, %s' % (anchor.to_string()))
                #else:
                #    event.add_anchor(anchor)
            else:
                arg_id = '{}-a{}'.format(event_id, len(candidate_arguments))

                # get the entity mention associated with the event argument
                em = doc.get_entity_mention_with_span(start, end)
                if em is None:
                    print(
                        'Dropping event argument, as I cannot find an entity mention with same offsets. %s (%d,%d) "%s" %s' % (
                        doc.docid, start, end, unicode_text.encode('ascii','ignore'), argument.name.decode('UTF8')))
                else:
                    arg = EventArgument(arg_id, em, argument.name.decode('UTF8'))
                    candidate_arguments.append(arg)
                    #event.add_argument(arg)

        for anchor in candidate_anchors:
            if event.overlaps_with_anchor(anchor):
                print('Dropping overlapping anchor, %s' % (anchor.to_string()))
            else:
                event.add_anchor(anchor)
        for arg in candidate_arguments:
            if event.overlaps_with_anchor(arg):
                print('Dropping argument that overlaps with anchor, %s' % (arg.to_string()))
            else:
                event.add_argument(arg)

        doc.add_event(event)

    return doc
Example #23
    def read_spans(self, annotated_events):
        """From the annotation files, we capture the positive and negative spans,
        Then return a dictionary from filename or docid to list[text.text_span.TextSpan]
        The list orders the TextSpan by their start_char_offset. The text within each TextSpan is also normalized,
        with newlines replaced by space and consecutive spaces replaced by a single space.

        :type annotated_events: dict[str, list[text.text_theory.Event]]
        """
        ret = defaultdict(list)
        """:type: dict[str, list[text.text_span.TextSpan]]"""

        # We first collect the positive and negative spans, from the annotation files.
        # Note that the same file can be annotated multiple times (via different event types).
        # Need to de-duplicate the spans later.
        file_spans = defaultdict(list)  # filename -> list[text.text_span.TextSpan]
        """:type: dict[str, list[text.text_span.TextSpan]]"""
        for event_type, annotation_dir in self.event_type_annotation_dir.items():
            for filename in os.listdir(annotation_dir):
                if filename not in self.target_filenames:
                    continue

                annotation_file = os.path.join(annotation_dir, filename)

                text_file = os.path.join(self.text_dir, filename)
                with codecs.open(text_file, 'r', encoding='utf-8') as f:
                    raw_text = f.read()

                spans = self._read_spans_from_file(annotation_file, event_type, raw_text, events=annotated_events[filename])
                file_spans[filename].extend(spans)

        # for each file, de-duplicate the spans and order them by their start-char-offset
        for filename in file_spans.keys():
            all_spans = file_spans[filename]
            """:type: list[text.text_span.TextSpan]"""

            current_spans = dict()  # start_char_offset -> TextSpan ; holds de-duplicated spans keyed by start-offset
            """:type: dict[int, text.text_span.TextSpan]"""
            for span in all_spans:
                # check whether 'span' is already in current_spans
                span_offset = IntPair(span.start_char_offset(), span.end_char_offset())
                to_add = True
                for start, s in current_spans.items():
                    s_offset = IntPair(s.start_char_offset(), s.end_char_offset())
                    if offset_same(span_offset, s_offset):
                        print('Found offset_same spans')
                        to_add = False
                        break
                    elif offset_overlap(span_offset, s_offset):
                        # we will remove both spans, just to reduce noise
                        print('Found offset_overlap spans in file {}, {}:{}'.format(filename, span_offset.to_string(), s_offset.to_string()))
                        print('[{}]\n==== vs ====\n[{}]\n'.format(span.text, s.text))
                        del current_spans[start]
                        to_add = False
                        break
                if to_add:
                    current_spans[span.start_char_offset()] = span

            if len(current_spans) > 0:
                for start_char_offset in sorted(current_spans):
                    span = current_spans[start_char_offset]
                    """:type: text.text_span.TextSpan"""
                    ret[filename].append(span)

        return ret
Example #24
    def adjust_and_write_annotation_offset(cls, file_spans, annotated_events, output_dir):
        """Since we keep only the positive and negative spans from the original text files,
        we need to adjust the annotation offsets accordingly.

        :type file_spans: dict[str, list[text.text_span.TextSpan]]
        :type annotated_events: dict[str, list[text.text_theory.Event]]

        The keys for both dictionaries are filenames. Note that the filename keys in annotated_events are a subset of
        the filename keys in file_spans, since some files might contain only negative spans.
        """

        for filename, events in annotated_events.items():
            outlines = []           # strings storing adjusted annotation offsets
            """:type: list[str]"""

            spans = file_spans[filename]
            """:type: list[text.text_span.TextSpan]"""

            # establish the new offsets for spans
            new_offsets = []
            """:type: list[common.utils.IntPair]"""
            offset = 0
            for span in spans:
                end = offset + len(span.text)
                new_offsets.append(IntPair(offset, end))
                offset = end + 1    # +1 for the newline

            for event in events:
                event_span = event.event_spans[0]
                # find the index of this event_span in 'spans'
                span_index = -1
                for i, span in enumerate(spans):
                    if offset_same(span.int_pair, event_span.int_pair):
                        span_index = i
                        break
                if span_index == -1:
                    raise ValueError('Could not find a corresponding span, should not happen')

                span_start = spans[span_index].start_char_offset()
                text = spans[span_index].text
                new_offset = new_offsets[span_index]

                outlines.append('<Event type="{}">'.format(event.label))
                outlines.append('{}\t{}\t{}'.format(event.label, new_offset.first, new_offset.second))

                if event.number_of_anchors() == 0:
                    raise ValueError('An event should have at least 1 anchor!')

                for anchor in event.anchors:
                    start = anchor.start_char_offset() - span_start
                    end = anchor.end_char_offset() - span_start
                    if text[start:end] != anchor.text:
                        new_start, new_end = utils.find_best_location(text, anchor.text, start, end)
                        print('Adjusting anchor offsets from ({},{}) to ({},{})'.format(start, end, new_start, new_end))
                        start = new_start
                        end = new_end
                    start += new_offset.first
                    end += new_offset.first
                    outlines.append('anchor\t{}\t{}'.format(start, end))

                for arg in event.arguments:
                    start = arg.start_char_offset() - span_start
                    end = arg.end_char_offset() - span_start
                    if text[start:end] != arg.text:
                        new_start, new_end = utils.find_best_location(text, arg.text, start, end)
                        print('Adjusting argument offsets from ({},{}) to ({},{})'.format(start, end, new_start, new_end))
                        start = new_start
                        end = new_end
                    start += new_offset.first
                    end += new_offset.first
                    outlines.append('{}/{}\t{}\t{}'.format(event.label, arg.label, start, end))

                outlines.append('</Event>')

            if len(outlines) > 0:
                with open(os.path.join(output_dir, filename+'.meta'), 'w') as f:
                    for line in outlines:
                        f.write(line + '\n')
Example #25
    def _read_annotation_file(cls, infile, event_type, text):
        """
        :type infile: str
        :type event_type: str
        :type text: str
        Returns:
            list[text.text_theory.Event]
        :param text: this is the raw text corresponding to the annotation
        """
        docid = os.path.basename(infile)

        events = []
        """:type: list[text.text_theory.Event]"""
        negative_spans = []
        """:type: list[text.text_span.TextSpan]"""
        anchors_not_in_eventspans = []      # these might be in negative spans
        """:type: list[text.text_span.Anchor]"""
        with open(infile, 'r') as f:
            for line in f:
                tokens = line.strip().split()
                span_type = tokens[0]
                start = int(tokens[1])
                end = int(tokens[2]) + 1
                text_string = ' '.join(text[start:end].replace('\n', ' ').strip().split())
                end = start + len(text_string)

                if '<' in text_string or '>' in text_string:
                    print('Skipping annotation of type {}, as it has either "<" or ">"'.format(span_type))
                    continue

                if span_type == event_type:
                    id = '{}-e{}'.format(docid, len(events))
                    event_span = EventSpan(id, IntPair(start, end), text_string, event_type)
                    e = Event(id, event_type)
                    e.add_event_span(event_span)
                    events.append(e)
                elif '/' in span_type:  # this is an event argument
                    em = EntityMention('dummy', IntPair(start, end), text_string, 'dummy')
                    event_role = span_type.split('/')[1]
                    e = cls._find_event_containing_span(events, start, end)
                    if e is None:
                        print('Cannot find an event span for {} {} (start,end)=({},{}) "{}". Skipping.'.format(event_type, docid, start, end, text_string))
                    else:
                        arg_id = '{}-a{}'.format(e.id, e.number_of_arguments())
                        e.add_argument(EventArgument(arg_id, em, event_role))
                elif span_type == 'anchor':
                    e = cls._find_event_containing_span(events, start, end)
                    anchor = Anchor('dummy', IntPair(start, end), text_string, event_type)
                    if e is None:
                        # it might be in a negative span
                        #print('Cannot find an event span for {} {} (start,end)=({},{}) "{}". Skipping.'.format(event_type, docid, start, end, text_string.replace(' ', '_')))
                        anchors_not_in_eventspans.append(anchor)
                    else:
                        e.add_anchor(anchor)
                elif span_type == 'negative':
                    negative_spans.append(TextSpan(IntPair(start, end), text_string))
                elif span_type == 'interesting':
                    pass                # we discard these for now

        for anchor in anchors_not_in_eventspans:
            found = False
            for span in negative_spans:
                if span.start_char_offset() <= anchor.start_char_offset() and anchor.end_char_offset() <= span.end_char_offset():
                    found = True
                    break
            if not found:
                print('Cannot find an event span or negative span for anchor {} {} (start,end)=({},{}) "{}". Skipping.'.format(
                    event_type, docid, anchor.start_char_offset(), anchor.end_char_offset(), anchor.text.replace(' ', '_')))

        # keep only events with anchor
        return [event for event in events if event.number_of_anchors() > 0]
Example #26
                            ner_decoder,
                            content,
                            sent,
                            offset=0,
                            content_type='Blog'))

    for p in ner_predictions:
        print(p)

    # create a document based on text content, add NER predictions as EntityMentions, then apply Spacy to
    # perform sentence segmentation and tokenization, and use Spacy tokens to back the EntityMentions
    doc = Document('dummy', content)
    for i, p in enumerate(ner_predictions):
        id = 'em-{}'.format(i)
        doc.add_entity_mention(
            EntityMention(id, IntPair(p['start'], p['end']), p['text'],
                          p['label']))
    doc.annotate_sentences(spacy_en, word_embeddings)

    event_domain = None
    if params.get_string('domain') == 'cyber':
        # initialize a particular event domain, which stores info on the event types and event roles
        event_domain = CyberDomain()

    arg_generator = EventArgumentGenerator(event_domain, params)
    trigger_generator = EventTriggerGenerator(event_domain, params)

    (trigger_examples, trigger_data, trigger_data_list,
     trigger_label) = generate_trigger_data_feature(trigger_generator, [doc])

    print('==== Loading Trigger model ====')