def to_sentence(text, start, end):
    """Converts a sentence raw text to a Sentence object."""
    charOffsets = IntPair(start, end)
    tokens = []

    offset = start
    for t in text.split():
        token = Token(IntPair(offset, offset + len(t)), t)
        tokens.append(token)
        offset += len(t) + 1  # +1 to account for white-space

    return Sentence(charOffsets, text, tokens)
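For readers without the nlplingo package, a minimal sketch of the containers this example assumes (attribute names follow the usage on this page; the real IntPair, Token and Sentence classes carry more state, such as token indices and POS tags):

class IntPair(object):
    # A (first, second) pair of character offsets.
    def __init__(self, first, second):
        self.first = first
        self.second = second


class Token(object):
    # A token's character span and surface text.
    def __init__(self, int_pair, text):
        self.int_pair = int_pair
        self.text = text


class Sentence(object):
    # A sentence's character span, raw text (stored as `label`, as later examples read it), and tokens.
    def __init__(self, int_pair, label, tokens):
        self.int_pair = int_pair
        self.label = label
        self.tokens = tokens


# Usage: whitespace tokenization with running character offsets.
s = to_sentence('The cat sat', 0, 11)
print([(t.text, t.int_pair.first, t.int_pair.second) for t in s.tokens])
# [('The', 0, 3), ('cat', 4, 7), ('sat', 8, 11)]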
Example #2
def remove_trailing_periods(text, offset):
    """
    :type text: str
    :type offset: IntPair
    """
    newtext = text
    newoffset = IntPair(offset.first, offset.second)
    chars = set(['.', ',', ':', ';', ')', '}', ']', '"', '\'', '?', '!'])
    if text[-1] in chars:
        i = 1
        while text[-(i+1)] == ' ':
            i += 1
        newtext = text[0:-i]
        newoffset.second = newoffset.second - i
    return newtext, newoffset
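A quick usage sketch (IntPair as sketched above): the function strips one trailing punctuation character plus any whitespace immediately before it, and shrinks the end offset by the same amount.

text, offset = remove_trailing_periods('Hello world .', IntPair(0, 13))
print(text, offset.first, offset.second)   # Hello world 0 11

text, offset = remove_trailing_periods('done.', IntPair(0, 5))
print(text, offset.first, offset.second)   # done 0 4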
Example #3
def record_unigram_info(docid, tokens, vocab_locations, sentence):
    """We only record nouns and verbs
    :type tokens: list[nlplingo.text.text_span.Token]
    :type sentence: serifxml.Sentence
    """
    for token in tokens:
        pos_suffix = None
        if token.pos_tag == 'NN' or token.pos_tag == 'NNS':
            pos_suffix = '.n'
        elif token.pos_tag.startswith('VB'):
            pos_suffix = '.v'
        if pos_suffix is not None:
            word_string = token.text.lower() + pos_suffix

            token_info = TokenInfo(token.int_pair, token.text)
            token_info.sentence_offset = IntPair(sentence.start_char, sentence.end_char)
            token_info.sentence_text = sentence.text

            if word_string not in vocab_locations:
                doc_offsets = defaultdict(list)
                doc_offsets[docid].append(token_info)
                vocab_locations[word_string] = doc_offsets
            else:
                vocab_locations[word_string][docid].append(token_info)

            # Also index the token without the POS-tag suffix
            word_string = token.text.lower()

            if word_string not in vocab_locations:
                doc_offsets = defaultdict(list)
                doc_offsets[docid].append(token_info)
                vocab_locations[word_string] = doc_offsets
            else:
                vocab_locations[word_string][docid].append(token_info)
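The resulting index is a two-level map: word string (lower-cased, with or without a '.n'/'.v' suffix) to docid to a list of TokenInfo occurrences. A hedged sketch of how such an index might be queried afterwards ('attack' is an illustrative word):

vocab_locations = {}
# ... record_unigram_info(docid, tokens, vocab_locations, sentence) is called per sentence ...

# All documents and character spans where the noun 'attack' occurs.
for docid, infos in vocab_locations.get('attack.n', {}).items():
    for info in infos:
        print(docid, info.int_pair.first, info.int_pair.second)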
Example #4
def to_lingo_doc(filepath):
    """Takes in a filepath to a SerifXML, and use its sentences, tokens, entity-mentions, value-mentions
    to construct a nlplingo.text.text_theory.Document
    Returns: nlplingo.text.text_theory.Document
    """
    serif_doc = serifxml.Document(filepath)
    """:type: serifxml.Document"""

    docid = serif_doc.docid
    lingo_doc = lingoDoc(docid)
    for st_index, sentence in enumerate(serif_doc.sentences):
        st = sentence.sentence_theories[0]
        """:type: serifxml.SentenceTheory"""
        if len(st.token_sequence) == 0:
            continue
        st_text, st_start, st_end = get_snippet(serif_doc, st)

        tokens = to_tokens(st)
        assert st_start == tokens[0].start_char_offset()
        assert (st_end+1) == tokens[-1].end_char_offset()

        s = Sentence(docid, IntPair(st_start, st_end+1), st_text, tokens, st_index)
        add_entity_mentions(st, s, lingo_doc)
        add_value_mentions(st, s, lingo_doc)
        add_names(st, lingo_doc)

        lingo_doc.add_sentence(s)
    return lingo_doc
Example #5
    def from_json(doc_d):
        docid = doc_d['docid']
        doc = Document(docid, doc_d['text'])

        for sentence_d in doc_d['sentences']:
            sentence_index = sentence_d['index']
            sentence_text = sentence_d['text']
            sentence_start = sentence_d['start']
            sentence_end = sentence_d['end']

            tokens = []
            """:type: list[nlplingo.text.text_span.Token]"""
            for token_d in sentence_d['tokens']:
                index = token_d['index']
                text = token_d['text']
                start = token_d['start']
                end = token_d['end']
                lemma = token_d['lemma']
                pos_tag = token_d['pos_tag']
                token = Token(IntPair(start, end), index, text, lemma, pos_tag)

                for dep_d in token_d['dep_relations']:
                    name = dep_d['dep_name']
                    direction = dep_d['dep_direction']
                    index = dep_d['dep_token_index']
                    token.dep_relations.append(
                        DependencyRelation(name, direction, index))

                srl_dict = token_d['srl']
                if 'predicate' in srl_dict:
                    srl = SRL('dummy')
                    srl.predicate_label = srl_dict['predicate']
                    if 'roles' in srl_dict:
                        for role_d in srl_dict['roles']:
                            role = role_d['srl_role']
                            token_span = role_d[
                                'srl_token_span']  # list of 2 int: start-token-index, end-token-index
                            srl.add_role(role, token_span[0], token_span[1])
                    token.srl = srl
                tokens.append(token)

            doc.add_sentence(
                Sentence(docid, IntPair(sentence_start, sentence_end),
                         sentence_text, tokens, sentence_index))
        return doc
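A hedged sketch of the JSON shape from_json expects, reconstructed from the keys it reads (all values here are illustrative):

doc_d = {
    'docid': 'doc-001',
    'text': 'John visited Paris.',
    'sentences': [{
        'index': 0,
        'text': 'John visited Paris.',
        'start': 0,
        'end': 19,
        'tokens': [{
            'index': 0,
            'text': 'John',
            'start': 0,
            'end': 4,
            'lemma': 'John',
            'pos_tag': 'NNP',
            'dep_relations': [{'dep_name': 'nsubj',
                               'dep_direction': 'up',
                               'dep_token_index': 1}],
            # 'srl' may also carry {'predicate': ..., 'roles': [{'srl_role': ..., 'srl_token_span': [i, j]}]}
            'srl': {},
        }],
    }],
}
# doc = Document.from_json(doc_d)   # assuming from_json is a staticmethod of Document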
Example #6
def print_spacy_sentence_as_conll(sent, entity_mentions, offset):
    all_tokens = []

    for token in sent:
        start = token.idx + offset
        end = start + len(token.text)
        all_tokens.append(Token(IntPair(start, end), token.text, token.tag_))  # token.tag_ is the POS tag

    return tokens_to_conll(all_tokens, entity_mentions)
Example #7
    def sentence_segmention_and_tokenization_with_text(self, model):
        """Whatever model we pass in, must be able to perform sentence segmentation and tokenization
        by calling model(self.text). We typically use Spacy
        """
        doc = model(self.text)

        for sent_index, sent in enumerate(doc.sents):
            tokens = []
            for token_index, token in enumerate(sent):
                start = token.idx
                end = token.idx + len(token.text)
                tokens.append(
                    Token(IntPair(start, end), token_index, token.text,
                          token.lemma_, token.tag_))
            sentence = Sentence(self.docid,
                                IntPair(sent.start_char, sent.end_char),
                                sent.text.strip(), tokens, sent_index)
            self.sentences.append(sentence)
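For reference, a standalone sketch of the spaCy calls this method relies on (assumes spaCy and a model such as en_core_web_sm are installed; the model name is an assumption):

import spacy

nlp = spacy.load('en_core_web_sm')   # any pipeline that provides sentences and POS tags
doc = nlp('John visited Paris. He liked it there.')
for sent_index, sent in enumerate(doc.sents):
    for token_index, token in enumerate(sent):
        start = token.idx                    # character offset into the full text
        end = token.idx + len(token.text)
        print(sent_index, token_index, token.text, start, end, token.lemma_, token.tag_)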
 def __init__(self, id, entity_mention, label):
     """:type entity_mention: nlplingo.text.text_span.EntityMention"""
     Span.__init__(
         self,
         IntPair(entity_mention.start_char_offset(),
                 entity_mention.end_char_offset()), entity_mention.text)
     self.id = id
     self.label = label
     self.entity_mention = entity_mention
Example #9
def add_value_mentions(st, s, doc):
    """
    :type st: serifxml.SentenceTheory
    :type s: nlplingo.text.text_span.Sentence
    :type doc: nlplingo.text.text_theory.Document
    """

    for m in st.value_mention_set:
        em = EntityMention(m.id, IntPair(m.start_char, m.end_char+1), m.text, m.value_type)
        doc.add_entity_mention(em)
Example #10
def record_bigram_info(docid, tokens, vocab_locations, sentence):
    """We only record nouns and verbs
    :type tokens: list[nlplingo.text.text_span.Token]
    :type sentence: serifxml.Sentence
    """
    for i in range(len(tokens) - 1):
        offset = IntPair(tokens[i].start_char_offset(), tokens[i+1].end_char_offset())
        word_string = '{} {}'.format(ascii(tokens[i].text.lower()), ascii(tokens[i+1].text.lower()))
        word_string_ori = '{} {}'.format(ascii(tokens[i].text), ascii(tokens[i+1].text))

        token_info = TokenInfo(offset, word_string_ori)
        token_info.sentence_offset = IntPair(sentence.start_char, sentence.end_char)
        token_info.sentence_text = sentence.text

        if word_string not in vocab_locations:
            doc_offsets = defaultdict(list)
            doc_offsets[docid].append(token_info)
            vocab_locations[word_string] = doc_offsets
        else:
            vocab_locations[word_string][docid].append(token_info)
 def _add_noun_phrases(self):
     """Now, let's just add all bigrams and trigrams
     """
     ret = []
     """:type: list[nlplingo.text.text_span.TextSpan]"""
     for i in range(len(self.tokens) - 1):  # bigrams
         toks = self.tokens[i:i + 2]
         span = TextSpan(
             IntPair(toks[0].start_char_offset(),
                     toks[-1].end_char_offset()),
             ' '.join(t.text for t in toks))
         span.with_tokens(toks)
         ret.append(span)
     for i in range(len(self.tokens) - 2):  # trigrams
         toks = self.tokens[i:i + 3]
         span = TextSpan(
             IntPair(toks[0].start_char_offset(),
                     toks[-1].end_char_offset()),
             ' '.join(t.text for t in toks))
         span.with_tokens(toks)
         ret.append(span)
     return ret
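The bigram/trigram loops above are a sliding window; the same pattern standalone, with plain strings in place of Token/TextSpan objects:

def ngrams(tokens, n):
    # Every contiguous window of n tokens.
    return [tokens[i:i + n] for i in range(len(tokens) - n + 1)]

words = ['the', 'stock', 'market', 'crashed']
print(ngrams(words, 2))   # [['the', 'stock'], ['stock', 'market'], ['market', 'crashed']]
print(ngrams(words, 3))   # [['the', 'stock', 'market'], ['stock', 'market', 'crashed']]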
def _read_sentence_tokens(sentence_json):
    """
    Returns: list[nlplingo.text.text_span.Token]
    """
    ret = []
    for i, token in enumerate(sentence_json['tokens']):
        word = token['originalText']
        lemma = token['lemma']
        start = token['characterOffsetBegin']
        end = token['characterOffsetEnd']
        pos_tag = token['pos']
        ner = token['ner']

        ret.append(Token(IntPair(start, end), i, word, lemma, pos_tag))
    return ret
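The per-token keys read here match Stanford CoreNLP's JSON output; a sketch of one such sentence entry (values are illustrative):

sentence_json = {
    'tokens': [
        {'originalText': 'Paris', 'lemma': 'Paris',
         'characterOffsetBegin': 13, 'characterOffsetEnd': 18,
         'pos': 'NNP', 'ner': 'LOCATION'},
    ]
}
# tokens = _read_sentence_tokens(sentence_json)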
Example #13
 def sentence_segmention_and_tokenization_with_list(self, model):
     """Whatever model we pass in, must be able to perform sentence segmentation and tokenization
     by calling model(self.text). We typically use Spacy
     """
     offset = 0
     for ss in self.sentence_strings:
         if len(ss) == 0 or ss.isspace():
             pass
         else:
             for sent in model(ss).sents:  # for each Spacy sentence
                 tokens = []
                 for token_index, token in enumerate(sent):
                     start = offset + token.idx
                     end = start + len(token.text)
                     tokens.append(
                         Token(IntPair(start, end), token_index, token.text,
                               token.lemma_, token.tag_))
                 sentence = Sentence(
                     self.docid,
                     IntPair(offset + sent.start_char,
                             offset + sent.start_char + len(sent.text)),
                     sent.text.strip(), tokens, len(self.sentences))
                 self.sentences.append(sentence)
         offset += len(ss)
Example #14
def add_names(st, doc):
    """
    :type st: serifxml.SentenceTheory
    :type doc: nlplingo.text.text_theory.Document
    """
    for m in st.name_theory:
        start = m.start_char
        end = m.end_char + 1
        m_exists = False
        for em in doc.entity_mentions:
            if em.start_char_offset() == start and em.end_char_offset() == end:
                m_exists = True
                break
        if not m_exists:
            em = EntityMention(m.id, IntPair(start, end), m.text, m.entity_type)
            doc.add_entity_mention(em)
def file_to_document(filepath):
    f = open(filepath, 'r')
    sentences = []

    offset = 0
    for line in f:
        sentence = to_sentence(line, offset, offset + len(line))
        sentences.append(sentence)
        offset += len(line)  # len(line) already includes the trailing newline
    f.close()

    s_strings = [s.label for s in sentences]
    doc_text = "\n".join(s_strings)

    return Document(IntPair(0, offset - 1), doc_text, sentences)
Example #16
    def _read_candidate_span_file(self, filepath):
        ret = defaultdict(set)

        filepaths = []
        with open(filepath, 'r') as f:
            for line in f:
                filepaths.append(line.strip())

        for fp in filepaths:
            with codecs.open(fp, 'r', encoding='utf-8') as f:
                for line in f:
                    tokens = line.strip().split()
                    docid = tokens[0]
                    offset = IntPair(int(tokens[1]), int(tokens[2]))
                    ret[docid].add(offset)
        return ret
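Note that ret[docid].add(offset) puts IntPair objects into a set, so deduplicating by offsets requires IntPair to define equality and hashing (otherwise each instance hashes by identity). A minimal sketch of what that presumably looks like; the real class may differ:

class IntPair(object):
    def __init__(self, first, second):
        self.first = first
        self.second = second

    def __eq__(self, other):
        return (self.first, self.second) == (other.first, other.second)

    def __hash__(self):
        return hash((self.first, self.second))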
Example #17
def to_tokens(st):
    """
    :type st: serifxml.SentenceTheory

    Returns: list[nlplingo.text.text_span.Token]
    """
    ret = []
    """:type: list[nlplingo.text.text_span.Token]"""

    root = st.parse.root
    """:type: serifxml.SynNode"""
    for i, t in enumerate(root.terminals):
        t_text = t.text
        t_start = t.start_char
        t_end = t.end_char
        t_pos_tag = t.parent.tag
        # we do a +1 because this has been the assumption in nlplingo
        ret.append(Token(IntPair(t_start, t_end+1), i, t_text, lemma=None, pos_tag=t_pos_tag))
    return ret
Example #18
def extract_sentence_annotation(text, offset):
    """offset: char offset thus far (excluding xml tags) from prior sentences."""

    start_tag = 0
    end_tag = -1
    raw_text = ''
    entity_mentions = []

    # ignore everything starting from 'REMOVED_URL'
    url_index = text.find(' REMOVED_URL', 0)
    if url_index != -1:
        text = text[0:url_index]

    start_tag = text.find('<ENAMEX', 0)
    while(start_tag != -1):
        raw_text += text[end_tag+1 : start_tag]

        end_tag = text.find('>', start_tag)
        entity_type = re.search(r' TYPE="(.*)"', text[start_tag:end_tag]).group(1)

        start_tag = text.find('</ENAMEX>', end_tag)
        mention_text = text[end_tag+1 : start_tag]

        start = offset+len(raw_text)
        end = offset+len(raw_text)+len(mention_text)
        if '-' in mention_text and entity_type.endswith('DESC'):
            print('Rejecting %s[%s], because spaCy will split the string into multiple tokens, and DESC should always be just a single word' % (entity_type, mention_text))
        else:
            (new_mention_text, prefix_length, suffix_length) = strip_mention_text(mention_text)
            if new_mention_text != mention_text:
                print('Revising %s to %s' % (mention_text, new_mention_text))
            id = 'm-' + str(start+prefix_length) + '-' + str(end-suffix_length)
            entity_mentions.append(EntityMention(id, IntPair(start+prefix_length, end-suffix_length), new_mention_text, entity_type))

        raw_text += mention_text

        end_tag = text.find('>', start_tag)
        start_tag = text.find('<ENAMEX', end_tag)

    raw_text += text[end_tag+1:]

    return (raw_text, entity_mentions)
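A sketch of the kind of input this parser handles: MUC/ACE-style ENAMEX markup. It returns the de-tagged text plus entity mentions whose offsets count characters in that de-tagged text (assuming strip_mention_text leaves a clean mention like 'Paris' unchanged):

line = 'Barack Obama visited <ENAMEX TYPE="GPE">Paris</ENAMEX> yesterday.'
# raw_text, entity_mentions = extract_sentence_annotation(line, 0)
# raw_text         -> 'Barack Obama visited Paris yesterday.'
# entity_mentions  -> one EntityMention of type GPE spanning characters 21-26 ('Paris')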
Example #19
def to_tokens(sentence):
    """
    :type sentence: serifxml.Sentence
    :rtype: list[nlplingo.text.text_span.Token]
    """
    ret = []
    """:type: list[nlplingo.text.text_span.Token]"""

    root = sentence.parse.root
    if root is None:
        return ret

    """:type: serifxml.SynNode"""
    for i, t in enumerate(root.terminals):
        t_text = t.text
        t_start = t.start_char
        t_end = t.end_char
        t_pos_tag = t.parent.tag
        ret.append(Token(IntPair(t_start, t_end), i, t_text, lemma=None, pos_tag=t_pos_tag))
    return ret
def add_corenlp_annotations(doc, filepath):
    """Reads Stanford corenlp annotations from filename, and add to doc

    :type filepath: str
    :type doc: nlplingo.text.text_theory.Document
    """
    with codecs.open(filepath, 'r', encoding='utf-8') as f:
        json_data = json.load(f)

    for sentence_json in json_data['sentences']:
        (index, tokens) = _read_sentence(sentence_json)
        sent_start = tokens[0].start_char_offset()
        sent_end = tokens[-1].end_char_offset()
        sent_text = doc.text[sent_start:sent_end]

        s = Sentence(doc.docid, IntPair(sent_start, sent_end), sent_text,
                     tokens, index)
        doc.add_sentence(s)

    return doc
Example #21
def file_to_document(filepath):
    f = codecs.open(filepath, 'r', encoding='utf8')
    sentences = []

    offset = 0
    for line in f:
        (raw_text, entity_mentions) = extract_sentence_annotation(line.strip(), offset)
        sentence = text_span.to_sentence(raw_text, offset, offset + len(raw_text))
        sentence.add_annotation('ENTITY_MENTIONS', entity_mentions)
        sentences.append(sentence)

        offset += len(raw_text) + 1  # +1 to account for newline

    f.close()

    s_strings = [s.label for s in sentences]
    doc_text = "\n".join(s_strings)

    #doc_id = os.path.basename(filepath)
    doc_id = filepath
    return Document(doc_id, IntPair(0, offset-1), doc_text, sentences)
Example #22
    def _read_spans_from_file(self, infile, event_type, text, events=None):
        """Get the positive and negative spans
        Returns:
            list[nlplingo.text.text_span.TextSpan]
        """
        ret = []
        with open(infile, 'r') as f:
            for line in f:
                tokens = line.strip().split()
                span_type = tokens[0]
                start = int(tokens[1])
                end = int(tokens[2]) + 1
                text_string = ' '.join(text[start:end].replace('\n', ' ').strip().split())
                end = start + len(text_string)

                if '<' in text_string or '>' in text_string:
                    print('Skipping annotation of type {}, as it has either "<" or ">"'.format(span_type))
                    continue

                span_offset = IntPair(start, end)

                if span_type == event_type:
                    # if this is a positive span, then we need to make sure we have an event for it
                    if events is not None:
                        found_span = False
                        for event in events:
                            if offset_same(event.event_spans[0].int_pair, span_offset):
                                found_span = True
                                break
                        if found_span:
                            ret.append(TextSpan(span_offset, text_string))
                            self.positive_span_count += 1
                        else:
                            self.discard_span_count += 1
                    else:
                        self.discard_span_count += 1
                elif span_type == 'negative':
                    ret.append(TextSpan(span_offset, text_string))
                    self.negative_span_count += 1
        return ret
Example #23
 def _generate_bigram_examples(self, sentence, params, extractor_params,
                               features, hyper_params):
     """
     :type sentence: nlplingo.text.text_span.Sentence
     :type params: dict
     :type extractor_params: dict
     :type features: nlplingo.event.trigger.feature.EventTriggerFeature
     :type hyper_params: nlplingo.nn.extractor.HyperParameters
     """
     ret = []
     if self.np_spans is not None:
         doc_nps = self.np_spans[sentence.docid]  # set[IntPair]
         print('doc {} , len(doc_nps)={}, len(sentence.noun_phrases)={}'.
               format(sentence.docid, len(doc_nps),
                      len(sentence.noun_phrases)))
         for np in sentence.noun_phrases:  # TextSpan
             for doc_np in doc_nps:
                 if np.start_char_offset(
                 ) == doc_np.first and np.end_char_offset(
                 ) == doc_np.second:
                     event_type = self.get_event_type_of_np(np, sentence)
                     self.statistics['number_candidate_trigger_np'] += 1
                     if event_type != 'None':
                         self.statistics['number_positive_trigger_np'] += 1
                     anchor_candidate = Anchor(
                         'dummy-id',
                         IntPair(np.start_char_offset(),
                                 np.end_char_offset()), np.text, event_type)
                     anchor_candidate.with_tokens(np.tokens)
                     example = EventTriggerExample(anchor_candidate,
                                                   sentence,
                                                   self.event_domain,
                                                   params, extractor_params,
                                                   features, hyper_params,
                                                   event_type)
                     EventTriggerFeatureGenerator.generate_example(
                         example, sentence.tokens, hyper_params)
                     ret.append(example)
     return ret
Example #24
def add_entity_mentions(st, s, doc):
    """
    :type st: serifxml.SentenceTheory
    :type s: nlplingo.text.text_span.Sentence
    :type doc: nlplingo.text.text_theory.Document
    """

    for m in st.mention_set:
        if m.entity_subtype != 'UNDET':
            m_type = '{}.{}'.format(m.entity_type, m.entity_subtype)
        else:
            m_type = m.entity_type

        em = EntityMention(m.id, IntPair(m.start_char, m.end_char+1), m.text, m_type)

        head = m.head
        for t in s.tokens:
            if t.start_char_offset() == head.start_char and t.end_char_offset() == (head.end_char+1):
                em.head_token = t
                break

        doc.add_entity_mention(em)
Example #25
    def _generate_unigram_examples(self, sentence, feature_generator, features,
                                   hyper_params):
        """
        :type sentence: nlplingo.text.text_span.Sentence
        :type feature_generator: nlplingo.event.trigger.feature.EventTriggerFeatureGenerator
        :type params: dict
        :type extractor_params: dict
        :type features: nlplingo.event.trigger.feature.EventTriggerFeature
        :type hyper_params: nlplingo.nn.extractor.HyperParameters
        """
        ret = []
        for token_index, token in enumerate(sentence.tokens):
            # TODO: if the current token is a trigger for multiple event types, only one event type is recorded here
            event_type = EventTriggerFeatureGenerator.get_event_type_of_token(
                token, sentence)

            if not self.accept_tokens_as_candidate(
                [token], event_type, sentence.entity_mentions, sentence.docid):
                continue

            self.statistics['number_candidate_trigger'] += 1
            if event_type != 'None':
                self.statistics[token.pos_category()] += 1
                self.statistics['number_positive_trigger'] += 1

            anchor_candidate = Anchor(
                'dummy-id',
                IntPair(token.start_char_offset(), token.end_char_offset()),
                token.text, event_type)
            anchor_candidate.with_tokens([token])
            example = EventTriggerExample(anchor_candidate, sentence,
                                          self.event_domain, features,
                                          hyper_params, event_type)
            feature_generator.generate_example(example, sentence.tokens,
                                               hyper_params)
            ret.append(example)

        return ret
def process_enote_file(doc, xml_file, auto_adjust):
    """Parses ENote annotation file to annotated_document.DocumentAnnotation

    :param all_text: raw text corresponding to the xml_file
    :param xml_file: ENote annotation file
    :param docid: string representing docid
    :param auto_adjust: Adjust annotation (start, end) position to match text. Useful if annotation data is noisy.
    :return: document_annotation.DocumentAnnotation
    """
    tree = etree.parse(xml_file)
    root_node = tree.getroot()

    all_text = doc.text

    events_node = root_node.find('dc:Events', NAMESPACES)
    for event_index, event_node in enumerate(events_node):
        event_type = event_node.find('dc:Name', NAMESPACES).text.decode('UTF8')
        event_id = '{}-e{}'.format(doc.docid, event_index)
        event = Event(event_id, event_type)

        candidate_anchors = []
        candidate_arguments = []
        for argument_index, argument_node in enumerate(
                event_node.find('dc:Arguments', NAMESPACES)):
            argument = EnoteArgument.from_xml_node(argument_node)

            # Skip if the argument is empty
            if argument is None:
                continue

            start = argument.start
            end = argument.end

            unicode_text = all_text[start:end]
            if all_text[start:end] != argument.text and auto_adjust:
                start, end = utils.find_best_location(all_text, argument.text,
                                                      start, end)
                unicode_text = all_text[start:end]

            # TODO : we could also treat the following as anchors:
            # - event_type == 'Vulnerability' and argument.name == 'Name'
            # - event_type == 'Exploit' and argument.name == 'Name'
            if argument.name == 'Anchor':
                anchor_id = '{}-t{}'.format(event_id, len(candidate_anchors))
                anchor = Anchor(anchor_id, IntPair(start, end), unicode_text,
                                event_type)
                candidate_anchors.append(anchor)
                #if event.overlaps_with_anchor(anchor):
                #    print('Dropping overlapping anchor, %s' % (anchor.to_string()))
                #else:
                #    event.add_anchor(anchor)
            else:
                arg_id = '{}-a{}'.format(event_id, len(candidate_arguments))

                # get the entity mention associated with the event argument
                em = doc.get_entity_mention_with_span(start, end)
                if em is None:
                    print(
                        'Dropping event argument, as I cannot find an entity mention with same offsets. %s (%d,%d) "%s" %s'
                        % (doc.docid, start, end,
                           unicode_text.encode('ascii', 'ignore'),
                           argument.name.decode('UTF8')))
                else:
                    arg = EventArgument(arg_id, em,
                                        argument.name.decode('UTF8'))
                    candidate_arguments.append(arg)
                    #event.add_argument(arg)

        for anchor in candidate_anchors:
            if event.overlaps_with_anchor(anchor):
                print('Dropping overlapping anchor, %s' % (anchor.to_string()))
            else:
                event.add_anchor(anchor)
        for arg in candidate_arguments:
            if event.overlaps_with_anchor(arg):
                print('Dropping argument that overlaps with anchor, %s' %
                      (arg.to_string()))
            else:
                event.add_argument(arg)

        doc.add_event(event)

    return doc
    def _generate_sentence(self,
                           sentence,
                           feature_generator,
                           trigger_egs=None):
        """
        We could optionally be given a list of anchors, e.g. predicted anchors
        :type sentence: nlplingo.text.text_span.Sentence
        :type feature_generator: nlplingo.event.argument.feature.EventArgumentFeatureGenerator
        :type trigger_egs: list[nlplingo.event.trigger.example.EventTriggerExample]
        """
        # skip multi-token triggers, args that do not have embeddings, args that overlap with trigger
        ret = []
        """:type: list[nlplingo.event.argument.example.EventArgumentExample]"""

        if sentence.number_of_tokens() < 1:
            return ret
        if sentence.number_of_tokens() > self.hyper_params.max_sentence_length:
            print('Skipping overly long sentence of {} tokens'.format(
                sentence.number_of_tokens()))
            return ret

        if trigger_egs is not None:
            for trigger_index, eg in enumerate(trigger_egs):
                anchor_id = '{}-s{}-t{}'.format(sentence.docid, sentence.index,
                                                trigger_index)
                anchor = Anchor(
                    anchor_id,
                    IntPair(eg.anchor.start_char_offset(),
                            eg.anchor.end_char_offset()), eg.anchor.text,
                    eg.event_type)
                anchor.with_tokens(eg.anchor.tokens)

                for em in sentence.entity_mentions:
                    role = 'None'
                    if em.coarse_label(
                    ) in self.event_domain.entity_types.keys():
                        example = EventArgumentExample(
                            anchor, em, sentence, self.event_domain,
                            self.params, self.extractor_params,
                            feature_generator.features, self.hyper_params,
                            role)
                        feature_generator.generate_example(
                            example, sentence.tokens, self.hyper_params)
                        ret.append(example)
        else:
            for event in sentence.events:
                for anchor in event.anchors:
                    if anchor.head().pos_category(
                    ) in EventTriggerExampleGenerator.trigger_pos_category:
                        for em in sentence.entity_mentions:
                            role = event.get_role_for_entity_mention(em)
                            self.statistics['#Event-Role {}'.format(role)] += 1
                            # if spans_overlap(anchor, em):
                            #     print('Refusing to consider overlapping anchor [%s] and entity_mention [%s] as EventArgumentExample' % (anchor.to_string(), em.to_string()))
                            # else:
                            #     if role != 'None':
                            #         self.statistics['number_positive_argument'] += 1
                            #     example = EventArgumentExample(anchor, em, sentence, self.event_domain, self.params, role)
                            #     self._generate_example(example, sentence.tokens, self.max_sent_length, self.neighbor_dist, self.do_dmcnn)
                            #     ret.append(example)
                            if role != 'None':
                                self.statistics[
                                    'number_positive_argument'] += 1
                            if em.coarse_label(
                            ) in self.event_domain.entity_types.keys():
                                example = EventArgumentExample(
                                    anchor, em, sentence, self.event_domain,
                                    self.params, self.extractor_params,
                                    feature_generator.features,
                                    self.hyper_params, role)
                                feature_generator.generate_example(
                                    example, sentence.tokens,
                                    self.hyper_params)
                                ret.append(example)

        return ret
def add_srl_annotations(doc, srl_filepath, offset_filepath):
    """
    :type doc: nlplingo.text.text_theory.Document
    """

    sentences = []
    """list[nlplingo.text.text_span.Sentence]"""
    with codecs.open(offset_filepath, 'r', encoding='utf-8') as f:
        for line in f:
            tokens = line.strip().split()
            offset = IntPair(int(tokens[0]), int(tokens[1]))
            # now, let's find the Sentence object with this offset
            sentence_match = None
            for sentence in doc.sentences:
                if offset.first == sentence.start_char_offset(
                ) and offset.second == sentence.end_char_offset():
                    sentence_match = sentence
                    break
            assert sentence_match is not None
            sentences.append(sentence_match)

    srl_sentences = []
    """:type: list[list[ColumnToken]]"""
    token_strings = []
    with codecs.open(srl_filepath, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if len(line) == 0:
                if len(token_strings) > 0:
                    srl_sentences.append(_convert_to_tokens(token_strings))
                    token_strings = []
            else:
                token_strings.append(line)

    assert len(sentences) == len(
        srl_sentences), 'len(sentences)={} len(srl_sentences)={}'.format(
            str(len(sentences)), str(len(srl_sentences)))
    #for i, sentence in enumerate(sentences):
    #    assert len(srl_sentences[i]) == len(sentence.tokens), 'i={} start={} end={}'.format(str(
    # i), str(sentence.start_char_offset()), str(sentence.end_char_offset()))

    for sentence_index, srl_sentence in enumerate(srl_sentences):
        sentence = sentences[sentence_index]
        """:type: nlplingo.text.text_span.Sentence"""

        if len(srl_sentence) != len(sentence.tokens):
            srl_tokens_string = ' '.join(t.text for t in srl_sentence)
            sentence_tokens_string = ' '.join(t.text for t in sentence.tokens)
            print(
                'add_srl_annotation: Skipping doc {} sentence {}: len(srl_sentence)={} '
                'len(sentence.tokens)={}'.format(doc.docid,
                                                 str(sentence_index),
                                                 str(len(srl_sentence)),
                                                 str(len(sentence.tokens))))
            print(' - sen_tokens: {}'.format(sentence_tokens_string))
            print(' - srl_tokens: {}'.format(srl_tokens_string))
            continue

        for column_token_index, column_token in enumerate(srl_sentence):
            if column_token.srl_predicate is not None:
                srl = SRL('dummy')
                srl.predicate_label = column_token.srl_predicate
                for token_index, srl_role in column_token.srl_roles.items():
                    if token_index != column_token.index:  # omit role-arguments that are also the predicate
                        srl.add_role(srl_role, token_index, token_index)

                # expand 'A0' srl-role to its compound and appos
                for (start_token_index, end_token_index) in srl.roles['A0']:
                    srl_argument_indices = set([
                        index
                        for index in range(start_token_index, end_token_index +
                                           1)
                    ])

                    for token_index in srl_argument_indices:
                        token = sentence.tokens[token_index]
                        expanded_indices = set(r.connecting_token_index
                                               for r in token.dep_relations
                                               if 'compound' in r.dep_name)
                        for i in expanded_indices:
                            if i != column_token.index and i != token_index:
                                srl.add_role('A0:compound', i, i)

                sentence.tokens[column_token_index].srl = srl
def add_srl_annotations(doc, srl_filepath, offset_filepath):
    """
    :type doc: nlplingo.text.text_theory.Document
    """

    sentences = []
    """list[nlplingo.text.text_span.Sentence]"""
    with codecs.open(offset_filepath, 'r', encoding='utf-8') as f:
        for line in f:
            tokens = line.strip().split()
            offset = IntPair(int(tokens[0]), int(tokens[1]))
            # now, let's find the Sentence object with this offset
            sentence_match = None
            for sentence in doc.sentences:
                if offset.first == sentence.start_char_offset(
                ) and offset.second == sentence.end_char_offset():
                    sentence_match = sentence
                    break
            assert sentence_match is not None
            sentences.append(sentence_match)

    srl_sentences = []
    """:type: list[list[ColumnToken]]"""
    token_strings = []

    with codecs.open(srl_filepath, 'r', encoding='utf-8') as f:

        srl_json = json.load(f)

        srl_token_strings = srl_json["sentences"]
        srl_tags = srl_json["predicted_srl"]

        srl_indices = []

        threshold_length = 0

        for line in srl_token_strings:
            srl_sentence_split = line
            srl_sentence_indices = len(line)
            srl_indices = []
            for item in srl_tags:

                # offsets are fixed

                if int(item[0]) >= threshold_length and int(
                        item[1]) >= threshold_length and int(
                            item[2]) >= threshold_length:
                    if int(item[0]) < threshold_length + len(line) and int(
                            item[1]) < threshold_length + len(line) and int(
                                item[2]) < threshold_length + len(line):
                        srl_indices.append([
                            int(item[0]) - threshold_length,
                            int(item[1]) - threshold_length,
                            int(item[2]) - threshold_length, item[3]
                        ])

            threshold_length = threshold_length + len(line)
            #print(line)
            #print(srl_indices)
            srl_sentences.append(_convert_to_tokens(line, srl_indices))
            """   
            line = line.strip()
            if len(line) == 0:
                if len(token_strings) > 0:
                    srl_sentences.append(_convert_to_tokens(token_strings))
                    token_strings = []
            else:
                token_strings.append(line)
            """
    assert len(sentences) == len(
        srl_sentences), 'len(sentences)={} len(srl_sentences)={}'.format(
            str(len(sentences)), str(len(srl_sentences)))
    #for i, sentence in enumerate(sentences):
    #    assert len(srl_sentences[i]) == len(sentence.tokens), 'i={} start={} end={}'.format(str(
    # i), str(sentence.start_char_offset()), str(sentence.end_char_offset()))

    for sentence_index, srl_sentence in enumerate(srl_sentences):
        sentence = sentences[sentence_index]
        """:type: nlplingo.text.text_span.Sentence"""

        if len(srl_sentence) != len(sentence.tokens):
            srl_tokens_string = ' '.join(t.text for t in srl_sentence)
            sentence_tokens_string = ' '.join(t.text for t in sentence.tokens)
            print(
                'add_srl_annotation: Skipping doc {} sentence {}: len(srl_sentence)={} '
                'len(sentence.tokens)={}'.format(doc.docid,
                                                 str(sentence_index),
                                                 str(len(srl_sentence)),
                                                 str(len(sentence.tokens))))
            print(' - sen_tokens: {}'.format(
                sentence_tokens_string.encode("utf-8")))
            print(' - srl_tokens: {}'.format(
                srl_tokens_string.encode("utf-8")))
            continue

        for column_token_index, column_token in enumerate(srl_sentence):

            if column_token.srl_predicate is not None:
                srl = SRL('dummy')
                srl.predicate_label = column_token.srl_predicate
                #print(column_token.srl_roles)
                for srl_role in column_token.srl_roles[column_token_index]:
                    #if token_index != column_token.srl_roles:   # omit role-arguments that are also the predicate
                    #print(column_token_index)
                    #print(srl_role)
                    srl.add_role(srl_role[2], srl_role[0], srl_role[1])

                #print(srl.roles['A0'])
                # expand 'A0' srl-role to its compound and appos
                #print(srl.roles['A0'])
                for token_index1, token_index2 in srl.roles['A0']:
                    token = sentence.tokens[token_index1]
                    expanded_indices = set(r.connecting_token_index
                                           for r in token.dep_relations
                                           if 'compound' in r.dep_name)
                    #print(expanded_indices)
                    for i in expanded_indices:
                        #if i != column_token.index and i != token_index:
                        srl.add_role('A0:compound', i, token_index2)
                sentence.tokens[column_token_index].srl = srl
 def __init__(self, int_pair, text):
     self.int_pair = IntPair(int_pair.first, int_pair.second)
     """:type: IntPair"""
     self.text = text
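A minimal usage sketch of this container (IntPair as sketched earlier; sentence_offset and sentence_text, as used by the unigram/bigram recorders above, are presumably attached after construction):

info = TokenInfo(IntPair(21, 26), 'Paris')
info.sentence_offset = IntPair(0, 37)
info.sentence_text = 'Barack Obama visited Paris yesterday.'
print(info.text, info.int_pair.first, info.int_pair.second)   # Paris 21 26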