def to_sentence(text, start, end):
    """Converts a sentence raw text to a Sentence object."""
    charOffsets = IntPair(start, end)
    tokens = []
    offset = start
    for t in text.split():
        token = Token(IntPair(offset, offset + len(t)), t)
        tokens.append(token)
        offset += len(t) + 1  # +1 to account for white-space
    return Sentence(charOffsets, text, tokens)
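# Hedged usage sketch for to_sentence (illustrative only; assumes the IntPair,
# Token and Sentence classes from nlplingo.text.text_span used above):
#
#   sentence = to_sentence('Prices rose sharply', 100, 119)
#
# Whitespace splitting yields three tokens; the first token covers document
# character offsets (100, 106) for 'Prices', the second (107, 111) for 'rose',
# and the third (112, 119) for 'sharply', since each offset advances by the
# token length plus one for the separating space.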
def remove_trailing_periods(text, offset):
    """
    :type text: str
    :type offset: IntPair
    """
    newtext = text
    newoffset = IntPair(offset.first, offset.second)
    chars = set(['.', ',', ':', ';', ')', '}', ']', '"', '\'', '?', '!'])
    if text[-1] in chars:
        i = 1
        while text[-(i + 1)] == ' ':  # use '==' for string comparison, not 'is'
            i += 1
        newtext = text[0:-i]
        newoffset.second = newoffset.second - i
    return newtext, newoffset
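# Hedged behavior sketch for remove_trailing_periods (illustrative only):
#
#   remove_trailing_periods('in 2014 .', IntPair(50, 59))
#
# The final character '.' is in the punctuation set and is preceded by one
# space, so i ends up as 2 and the call returns ('in 2014', IntPair(50, 57)).
# Only a single trailing punctuation mark (plus any spaces directly before it)
# is removed; 'in 2014..' would still keep one '.'.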
def record_unigram_info(docid, tokens, vocab_locations, sentence):
    """We only record nouns and verbs
    :type tokens: list[nlplingo.text.text_span.Token]
    :type sentence: serifxml.Sentence
    """
    for token in tokens:
        pos_suffix = None
        if token.pos_tag == 'NN' or token.pos_tag == 'NNS':
            pos_suffix = '.n'
        elif token.pos_tag.startswith('VB'):
            pos_suffix = '.v'

        if pos_suffix is not None:
            word_string = token.text.lower() + pos_suffix
            token_info = TokenInfo(token.int_pair, token.text)
            token_info.sentence_offset = IntPair(sentence.start_char, sentence.end_char)
            token_info.sentence_text = sentence.text

            if word_string not in vocab_locations:
                doc_offsets = defaultdict(list)
                doc_offsets[docid].append(token_info)
                vocab_locations[word_string] = doc_offsets
            else:
                vocab_locations[word_string][docid].append(token_info)

            # also index the lower-cased word without a POS suffix
            word_string = token.text.lower()
            if word_string not in vocab_locations:
                doc_offsets = defaultdict(list)
                doc_offsets[docid].append(token_info)
                vocab_locations[word_string] = doc_offsets
            else:
                vocab_locations[word_string][docid].append(token_info)
def to_lingo_doc(filepath):
    """Takes a filepath to a SerifXML file, and uses its sentences, tokens, entity-mentions
    and value-mentions to construct a nlplingo.text.text_theory.Document

    Returns:
        nlplingo.text.text_theory.Document
    """
    serif_doc = serifxml.Document(filepath)
    """:type: serifxml.Document"""
    docid = serif_doc.docid

    lingo_doc = lingoDoc(docid)
    for st_index, sentence in enumerate(serif_doc.sentences):
        st = sentence.sentence_theories[0]
        """:type: serifxml.SentenceTheory"""
        if len(st.token_sequence) == 0:
            continue
        st_text, st_start, st_end = get_snippet(serif_doc, st)

        tokens = to_tokens(st)
        assert st_start == tokens[0].start_char_offset()
        assert (st_end + 1) == tokens[-1].end_char_offset()

        s = Sentence(docid, IntPair(st_start, st_end + 1), st_text, tokens, st_index)
        add_entity_mentions(st, s, lingo_doc)
        add_value_mentions(st, s, lingo_doc)
        add_names(st, lingo_doc)

        lingo_doc.add_sentence(s)
    return lingo_doc
def from_json(doc_d):
    docid = doc_d['docid']
    doc = Document(docid, doc_d['text'])

    for sentence_d in doc_d['sentences']:
        sentence_index = sentence_d['index']
        sentence_text = sentence_d['text']
        sentence_start = sentence_d['start']
        sentence_end = sentence_d['end']

        tokens = []
        """:type: list[nlplingo.text.text_span.Token]"""
        for token_d in sentence_d['tokens']:
            index = token_d['index']
            text = token_d['text']
            start = token_d['start']
            end = token_d['end']
            lemma = token_d['lemma']
            pos_tag = token_d['pos_tag']
            token = Token(IntPair(start, end), index, text, lemma, pos_tag)

            for dep_d in token_d['dep_relations']:
                name = dep_d['dep_name']
                direction = dep_d['dep_direction']
                index = dep_d['dep_token_index']
                token.dep_relations.append(DependencyRelation(name, direction, index))

            srl_dict = token_d['srl']
            if 'predicate' in srl_dict:
                srl = SRL('dummy')
                srl.predicate_label = srl_dict['predicate']
                if 'roles' in srl_dict:
                    for role_d in srl_dict['roles']:
                        role = role_d['srl_role']
                        token_span = role_d['srl_token_span']  # list of 2 ints: start-token-index, end-token-index
                        srl.add_role(role, token_span[0], token_span[1])
                token.srl = srl

            tokens.append(token)

        doc.add_sentence(
            Sentence(docid, IntPair(sentence_start, sentence_end), sentence_text, tokens, sentence_index))
    return doc
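# Hedged sketch of the JSON shape from_json expects. Field names are taken from
# the dictionary lookups above; all values are invented for illustration:
#
#   {
#     "docid": "doc-001",
#     "text": "John visited Paris.",
#     "sentences": [
#       {"index": 0, "text": "John visited Paris.", "start": 0, "end": 19,
#        "tokens": [
#          {"index": 1, "text": "visited", "start": 5, "end": 12,
#           "lemma": "visit", "pos_tag": "VBD",
#           "dep_relations": [{"dep_name": "nsubj", "dep_direction": "up",
#                              "dep_token_index": 0}],
#           "srl": {"predicate": "visit",
#                   "roles": [{"srl_role": "A0", "srl_token_span": [0, 0]}]}}
#        ]}
#     ]
#   }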
def print_spacy_sentence_as_conll(sent, entity_mentions, offset):
    all_tokens = []
    for token in sent:
        start = token.idx + offset
        end = start + len(token.text)
        all_tokens.append(Token(IntPair(start, end), token.text, token.tag_))  # token.tag_ : POS-tag
    return tokens_to_conll(all_tokens, entity_mentions)
def sentence_segmention_and_tokenization_with_text(self, model):
    """Whatever model we pass in must be able to perform sentence segmentation and
    tokenization by calling model(self.text). We typically use spaCy.
    """
    doc = model(self.text)
    for sent_index, sent in enumerate(doc.sents):
        tokens = []
        for token_index, token in enumerate(sent):
            start = token.idx
            end = token.idx + len(token.text)
            tokens.append(Token(IntPair(start, end), token_index, token.text, token.lemma_, token.tag_))
        sentence = Sentence(self.docid, IntPair(sent.start_char, sent.end_char),
                            sent.text.strip(), tokens, sent_index)
        self.sentences.append(sentence)
def __init__(self, id, entity_mention, label):
    """:type entity_mention: nlplingo.text.text_span.EntityMention"""
    Span.__init__(
        self,
        IntPair(entity_mention.start_char_offset(), entity_mention.end_char_offset()),
        entity_mention.text)
    self.id = id
    self.label = label
    self.entity_mention = entity_mention
def add_value_mentions(st, s, doc):
    """
    :type st: serifxml.SentenceTheory
    :type s: nlplingo.text.text_span.Sentence
    :type doc: nlplingo.text.text_theory.Document
    """
    for m in st.value_mention_set:
        em = EntityMention(m.id, IntPair(m.start_char, m.end_char + 1), m.text, m.value_type)
        doc.add_entity_mention(em)
def record_bigram_info(docid, tokens, vocab_locations, sentence):
    """Record bigram occurrences over consecutive tokens
    :type tokens: list[nlplingo.text.text_span.Token]
    :type sentence: serifxml.Sentence
    """
    for i in range(len(tokens) - 1):
        offset = IntPair(tokens[i].start_char_offset(), tokens[i + 1].end_char_offset())
        word_string = '{} {}'.format(ascii(tokens[i].text.lower()), ascii(tokens[i + 1].text.lower()))
        word_string_ori = '{} {}'.format(ascii(tokens[i].text), ascii(tokens[i + 1].text))

        token_info = TokenInfo(offset, word_string_ori)
        token_info.sentence_offset = IntPair(sentence.start_char, sentence.end_char)
        token_info.sentence_text = sentence.text

        if word_string not in vocab_locations:
            doc_offsets = defaultdict(list)
            doc_offsets[docid].append(token_info)
            vocab_locations[word_string] = doc_offsets
        else:
            vocab_locations[word_string][docid].append(token_info)
def _add_noun_phrases(self):
    """For now, just add all bigrams and trigrams"""
    ret = []
    """:type: list[nlplingo.text.text_span.TextSpan]"""
    for i in range(len(self.tokens) - 1):  # bigrams
        toks = self.tokens[i:i + 2]
        span = TextSpan(
            IntPair(toks[0].start_char_offset(), toks[-1].end_char_offset()),
            ' '.join(t.text for t in toks))
        span.with_tokens(toks)
        ret.append(span)
    for i in range(len(self.tokens) - 2):  # trigrams
        toks = self.tokens[i:i + 3]
        span = TextSpan(
            IntPair(toks[0].start_char_offset(), toks[-1].end_char_offset()),
            ' '.join(t.text for t in toks))
        span.with_tokens(toks)
        ret.append(span)
    return ret
def _read_sentence_tokens(sentence_json):
    """
    Returns:
        list[nlplingo.text.text_span.Token]
    """
    ret = []
    for i, token in enumerate(sentence_json['tokens']):
        word = token['originalText']
        lemma = token['lemma']
        start = token['characterOffsetBegin']
        end = token['characterOffsetEnd']
        pos_tag = token['pos']
        ner = token['ner']
        ret.append(Token(IntPair(start, end), i, word, lemma, pos_tag))
    return ret
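# Hedged sketch of one entry in sentence_json['tokens'], matching the Stanford
# CoreNLP JSON output fields read above (values are illustrative):
#
#   {"originalText": "Paris", "lemma": "Paris",
#    "characterOffsetBegin": 13, "characterOffsetEnd": 18,
#    "pos": "NNP", "ner": "LOCATION"}
#
# Note that 'ner' is read but not currently passed into the Token constructor.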
def sentence_segmention_and_tokenization_with_list(self, model):
    """Whatever model we pass in must be able to perform sentence segmentation and
    tokenization by calling model(self.text). We typically use spaCy.
    """
    offset = 0
    for ss in self.sentence_strings:
        if len(ss) == 0 or ss.isspace():
            pass
        else:
            for sent in model(ss).sents:  # for each spaCy sentence
                tokens = []
                for token_index, token in enumerate(sent):
                    start = offset + token.idx
                    end = start + len(token.text)
                    tokens.append(Token(IntPair(start, end), token_index, token.text, token.lemma_, token.tag_))
                sentence = Sentence(
                    self.docid,
                    IntPair(offset + sent.start_char, offset + sent.start_char + len(sent.text)),
                    sent.text.strip(), tokens, len(self.sentences))
                self.sentences.append(sentence)
        offset += len(ss)
def add_names(st, doc):
    """
    :type st: serifxml.SentenceTheory
    :type doc: nlplingo.text.text_theory.Document
    """
    for m in st.name_theory:
        start = m.start_char
        end = m.end_char + 1

        m_exists = False
        for em in doc.entity_mentions:
            if em.start_char_offset() == start and em.end_char_offset() == end:
                m_exists = True
                break
        if not m_exists:
            em = EntityMention(m.id, IntPair(start, end), m.text, m.entity_type)
            doc.add_entity_mention(em)
def file_to_document(filepath):
    f = open(filepath, 'rU')
    sentences = []
    offset = 0
    for line in f:
        sentence = to_sentence(line, offset, offset + len(line))
        sentences.append(sentence)
        offset += len(line)  # len(line) includes the trailing newline
    f.close()

    s_strings = [s.label for s in sentences]
    doc_text = "\n".join(s_strings)
    return Document(IntPair(0, offset - 1), doc_text, sentences)
def _read_candidate_span_file(self, filepath):
    ret = defaultdict(set)

    filepaths = []
    with open(filepath, 'r') as f:
        for line in f:
            filepaths.append(line.strip())

    for fp in filepaths:
        with codecs.open(fp, 'r', encoding='utf-8') as f:
            for line in f:
                tokens = line.strip().split()
                docid = tokens[0]
                offset = IntPair(int(tokens[1]), int(tokens[2]))
                ret[docid].add(offset)
    return ret
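# Hedged sketch of the inputs _read_candidate_span_file expects (inferred from
# the parsing above; file names and contents are invented):
#
#   filepath             -> a list file, one annotation-file path per line
#   each annotation file -> whitespace-separated lines of: docid start end
#
#   doc-001 120 134
#   doc-001 200 212
#
# The returned dict maps docid -> set of IntPair(start, end) offsets, which
# assumes IntPair is usable as a set element (i.e. hashable).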
def to_tokens(st):
    """
    :type st: serifxml.SentenceTheory

    Returns:
        list[nlplingo.text.text_span.Token]
    """
    ret = []
    """:type: list[nlplingo.text.text_span.Token]"""
    root = st.parse.root
    """:type: serifxml.SynNode"""
    for i, t in enumerate(root.terminals):
        t_text = t.text
        t_start = t.start_char
        t_end = t.end_char
        t_pos_tag = t.parent.tag
        # we do a +1 because this has been the assumption in nlplingo
        ret.append(Token(IntPair(t_start, t_end + 1), i, t_text, lemma=None, pos_tag=t_pos_tag))
    return ret
def extract_sentence_annotation(text, offset):
    """offset: char offset thus far (excluding xml tags) from prior sentences."""
    start_tag = 0
    end_tag = -1
    raw_text = ''
    entity_mentions = []

    # ignore everything starting from 'REMOVED_URL'
    url_index = text.find(' REMOVED_URL', 0)
    if url_index != -1:
        text = text[0:url_index]

    start_tag = text.find('<ENAMEX', 0)
    while start_tag != -1:
        raw_text += text[end_tag + 1: start_tag]
        end_tag = text.find('>', start_tag)
        entity_type = re.search(r' TYPE="(.*)"', text[start_tag:end_tag]).group(1)
        start_tag = text.find('</ENAMEX>', end_tag)
        mention_text = text[end_tag + 1: start_tag]

        start = offset + len(raw_text)
        end = offset + len(raw_text) + len(mention_text)

        if '-' in mention_text and entity_type.endswith('DESC'):
            print(('Rejecting %s[%s], because spaCy will split the string into multiple tokens, '
                   'and DESC should always be just a single word' % (entity_type, mention_text)).encode('utf-8'))
        else:
            (new_mention_text, prefix_length, suffix_length) = strip_mention_text(mention_text)
            if new_mention_text != mention_text:
                print(('Revising %s to %s' % (mention_text, new_mention_text)).encode('utf-8'))
            id = 'm-' + str(start + prefix_length) + '-' + str(end - suffix_length)
            entity_mentions.append(EntityMention(
                id, IntPair(start + prefix_length, end - suffix_length), new_mention_text, entity_type))

        raw_text += mention_text
        end_tag = text.find('>', start_tag)
        start_tag = text.find('<ENAMEX', end_tag)

    raw_text += text[end_tag + 1:]
    return (raw_text, entity_mentions)
def to_tokens(sentence):
    """
    :type sentence: serifxml.Sentence
    :rtype: list[nlplingo.text.text_span.Token]
    """
    ret = []
    """:type: list[nlplingo.text.text_span.Token]"""
    root = sentence.parse.root
    """:type: serifxml.SynNode"""
    if root is None:
        return ret
    for i, t in enumerate(root.terminals):
        t_text = t.text
        t_start = t.start_char
        t_end = t.end_char
        t_pos_tag = t.parent.tag
        ret.append(Token(IntPair(t_start, t_end), i, t_text, lemma=None, pos_tag=t_pos_tag))
    return ret
def add_corenlp_annotations(doc, filepath):
    """Reads Stanford CoreNLP annotations from filepath, and adds them to doc
    :type filepath: str
    :type doc: nlplingo.text.text_theory.Document
    """
    with codecs.open(filepath, 'r', encoding='utf-8') as f:
        json_data = json.load(f)

    for sentence_json in json_data['sentences']:
        (index, tokens) = _read_sentence(sentence_json)
        sent_start = tokens[0].start_char_offset()
        sent_end = tokens[-1].end_char_offset()
        sent_text = doc.text[sent_start:sent_end]
        s = Sentence(doc.docid, IntPair(sent_start, sent_end), sent_text, tokens, index)
        doc.add_sentence(s)
    return doc
def file_to_document(filepath):
    f = codecs.open(filepath, 'r', encoding='utf8')
    sentences = []
    offset = 0
    for line in f:
        (raw_text, entity_mentions) = extract_sentence_annotation(line.strip(), offset)
        sentence = text_span.to_sentence(raw_text, offset, offset + len(raw_text))
        sentence.add_annotation('ENTITY_MENTIONS', entity_mentions)
        sentences.append(sentence)
        offset += len(raw_text) + 1  # +1 to account for newline
    f.close()

    s_strings = [s.label for s in sentences]
    doc_text = "\n".join(s_strings)
    #doc_id = os.path.basename(filepath)
    doc_id = filepath
    return Document(doc_id, IntPair(0, offset - 1), doc_text, sentences)
def _read_spans_from_file(self, infile, event_type, text, events=None):
    """Get the positive and negative spans

    Returns:
        list[nlplingo.text.text_span.TextSpan]
    """
    ret = []
    with open(infile, 'r') as f:
        for line in f:
            tokens = line.strip().split()
            span_type = tokens[0]
            start = int(tokens[1])
            end = int(tokens[2]) + 1

            text_string = ' '.join(text[start:end].replace('\n', ' ').strip().split())
            end = start + len(text_string)

            if '<' in text_string or '>' in text_string:
                print('Skipping annotation of type {}, as it has either "<" or ">"'.format(span_type))
                continue

            span_offset = IntPair(start, end)
            if span_type == event_type:
                # if this is a positive span, then we need to make sure we have an event for it
                if events is not None:
                    found_span = False
                    for event in events:
                        if offset_same(event.event_spans[0].int_pair, span_offset):
                            found_span = True
                            break
                    if found_span:
                        ret.append(TextSpan(span_offset, text_string))
                        self.positive_span_count += 1
                    else:
                        self.discard_span_count += 1
                else:
                    self.discard_span_count += 1
            elif span_type == 'negative':
                ret.append(TextSpan(span_offset, text_string))
                self.negative_span_count += 1
    return ret
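# Hedged sketch of the annotation lines read by _read_spans_from_file (format
# inferred from the parsing above; values are invented):
#
#   Cyberattack 4521 4538
#   negative 4600 4617
#
# tokens[0] is the span type (an event type or the literal 'negative'), and
# tokens[1]/tokens[2] are inclusive character offsets; the code adds 1 to the
# end offset and then re-trims it to the length of the whitespace-normalized
# text string.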
def _generate_bigram_examples(self, sentence, params, extractor_params, features, hyper_params):
    """
    :type sentence: nlplingo.text.text_span.Sentence
    :type params: dict
    :type extractor_params: dict
    :type features: nlplingo.event.trigger.feature.EventTriggerFeature
    :type hyper_params: nlplingo.nn.extractor.HyperParameters
    """
    ret = []
    if self.np_spans is not None:
        doc_nps = self.np_spans[sentence.docid]  # set[IntPair]
        print('doc {} , len(doc_nps)={}, len(sentence.noun_phrases)={}'.format(
            sentence.docid, len(doc_nps), len(sentence.noun_phrases)))
        for np in sentence.noun_phrases:  # TextSpan
            for doc_np in doc_nps:
                if np.start_char_offset() == doc_np.first and np.end_char_offset() == doc_np.second:
                    event_type = self.get_event_type_of_np(np, sentence)
                    self.statistics['number_candidate_trigger_np'] += 1
                    if event_type != 'None':
                        self.statistics['number_positive_trigger_np'] += 1

                    anchor_candidate = Anchor(
                        'dummy-id',
                        IntPair(np.start_char_offset(), np.end_char_offset()),
                        np.text, event_type)
                    anchor_candidate.with_tokens(np.tokens)
                    example = EventTriggerExample(
                        anchor_candidate, sentence, self.event_domain, params,
                        extractor_params, features, hyper_params, event_type)
                    EventTriggerFeatureGenerator.generate_example(example, sentence.tokens, hyper_params)
                    ret.append(example)
    return ret
def add_entity_mentions(st, s, doc):
    """
    :type st: serifxml.SentenceTheory
    :type s: nlplingo.text.text_span.Sentence
    :type doc: nlplingo.text.text_theory.Document
    """
    for m in st.mention_set:
        if m.entity_subtype != 'UNDET':
            m_type = '{}.{}'.format(m.entity_type, m.entity_subtype)
        else:
            m_type = m.entity_type
        em = EntityMention(m.id, IntPair(m.start_char, m.end_char + 1), m.text, m_type)

        head = m.head
        for t in s.tokens:
            if t.start_char_offset() == head.start_char and t.end_char_offset() == (head.end_char + 1):
                em.head_token = t
                break
        doc.add_entity_mention(em)
def _generate_unigram_examples(self, sentence, feature_generator, features, hyper_params):
    """
    :type sentence: nlplingo.text.text_span.Sentence
    :type feature_generator: nlplingo.event.trigger.feature.EventTriggerFeatureGenerator
    :type features: nlplingo.event.trigger.feature.EventTriggerFeature
    :type hyper_params: nlplingo.nn.extractor.HyperParameters
    """
    ret = []
    for token_index, token in enumerate(sentence.tokens):
        # TODO if current token is a trigger for multiple event types, event_type_index is only set to 1 event_type_index
        event_type = EventTriggerFeatureGenerator.get_event_type_of_token(token, sentence)

        if not self.accept_tokens_as_candidate([token], event_type, sentence.entity_mentions, sentence.docid):
            continue

        self.statistics['number_candidate_trigger'] += 1
        if event_type != 'None':
            self.statistics[token.pos_category()] += 1
            self.statistics['number_positive_trigger'] += 1

        anchor_candidate = Anchor(
            'dummy-id',
            IntPair(token.start_char_offset(), token.end_char_offset()),
            token.text, event_type)
        anchor_candidate.with_tokens([token])
        example = EventTriggerExample(anchor_candidate, sentence, self.event_domain,
                                      features, hyper_params, event_type)
        feature_generator.generate_example(example, sentence.tokens, hyper_params)
        ret.append(example)
    return ret
def process_enote_file(doc, xml_file, auto_adjust):
    """Parses an ENote annotation file and adds its events to the given document

    :param doc: nlplingo.text.text_theory.Document whose raw text corresponds to the xml_file
    :param xml_file: ENote annotation file
    :param auto_adjust: adjust annotation (start, end) positions to match the text.
                        Useful if annotation data is noisy.
    :return: the same document, with events added
    """
    tree = etree.parse(xml_file)
    root_node = tree.getroot()

    all_text = doc.text

    events_node = root_node.find('dc:Events', NAMESPACES)
    for event_index, event_node in enumerate(events_node):
        event_type = event_node.find('dc:Name', NAMESPACES).text.decode('UTF8')
        event_id = '{}-e{}'.format(doc.docid, event_index)
        event = Event(event_id, event_type)

        candidate_anchors = []
        candidate_arguments = []
        for argument_index, argument_node in enumerate(event_node.find('dc:Arguments', NAMESPACES)):
            argument = EnoteArgument.from_xml_node(argument_node)
            # skip if the argument is empty
            if argument is None:
                continue

            start = argument.start
            end = argument.end
            unicode_text = all_text[start:end]

            if all_text[start:end] != argument.text and auto_adjust:
                start, end = utils.find_best_location(all_text, argument.text, start, end)
                unicode_text = all_text[start:end]

            # TODO : we could also treat the following as anchors:
            # - event_type == 'Vulnerability' and argument.name == 'Name'
            # - event_type == 'Exploit' and argument.name == 'Name'
            if argument.name == 'Anchor':
                anchor_id = '{}-t{}'.format(event_id, len(candidate_anchors))
                anchor = Anchor(anchor_id, IntPair(start, end), unicode_text, event_type)
                candidate_anchors.append(anchor)
                #if event.overlaps_with_anchor(anchor):
                #    print('Dropping overlapping anchor, %s' % (anchor.to_string()))
                #else:
                #    event.add_anchor(anchor)
            else:
                arg_id = '{}-a{}'.format(event_id, len(candidate_arguments))
                # get the entity mention associated with the event argument
                em = doc.get_entity_mention_with_span(start, end)
                if em is None:
                    print('Dropping event argument, as I cannot find an entity mention with same offsets. '
                          '%s (%d,%d) "%s" %s' % (doc.docid, start, end,
                                                  unicode_text.encode('ascii', 'ignore'),
                                                  argument.name.decode('UTF8')))
                else:
                    arg = EventArgument(arg_id, em, argument.name.decode('UTF8'))
                    candidate_arguments.append(arg)
                    #event.add_argument(arg)

        for anchor in candidate_anchors:
            if event.overlaps_with_anchor(anchor):
                print('Dropping overlapping anchor, %s' % (anchor.to_string()))
            else:
                event.add_anchor(anchor)

        for arg in candidate_arguments:
            if event.overlaps_with_anchor(arg):
                print('Dropping argument that overlaps with anchor, %s' % (arg.to_string()))
            else:
                event.add_argument(arg)

        doc.add_event(event)
    return doc
def _generate_sentence(self, sentence, feature_generator, trigger_egs=None):
    """We could optionally be given a list of anchors, e.g. predicted anchors
    :type sentence: nlplingo.text.text_span.Sentence
    :type feature_generator: nlplingo.event.argument.feature.EventArgumentFeatureGenerator
    :type trigger_egs: list[nlplingo.event.trigger.example.EventTriggerExample]
    """
    # skip multi-token triggers, args that do not have embeddings, args that overlap with trigger
    ret = []
    """:type: list[nlplingo.event.argument.example.EventArgumentExample]"""

    if sentence.number_of_tokens() < 1:
        return ret
    if sentence.number_of_tokens() > self.hyper_params.max_sentence_length:
        print('Skipping overly long sentence of {} tokens'.format(sentence.number_of_tokens()))
        return ret

    if trigger_egs is not None:
        for trigger_index, eg in enumerate(trigger_egs):
            anchor_id = '{}-s{}-t{}'.format(sentence.docid, sentence.index, trigger_index)
            anchor = Anchor(
                anchor_id,
                IntPair(eg.anchor.start_char_offset(), eg.anchor.end_char_offset()),
                eg.anchor.text, eg.event_type)
            anchor.with_tokens(eg.anchor.tokens)

            for em in sentence.entity_mentions:
                role = 'None'
                if em.coarse_label() in self.event_domain.entity_types.keys():
                    example = EventArgumentExample(
                        anchor, em, sentence, self.event_domain, self.params,
                        self.extractor_params, feature_generator.features,
                        self.hyper_params, role)
                    feature_generator.generate_example(example, sentence.tokens, self.hyper_params)
                    ret.append(example)
    else:
        for event in sentence.events:
            for anchor in event.anchors:
                if anchor.head().pos_category() in EventTriggerExampleGenerator.trigger_pos_category:
                    for em in sentence.entity_mentions:
                        role = event.get_role_for_entity_mention(em)
                        self.statistics['#Event-Role {}'.format(role)] += 1
                        # if spans_overlap(anchor, em):
                        #     print('Refusing to consider overlapping anchor [%s] and entity_mention [%s] as EventArgumentExample' % (anchor.to_string(), em.to_string()))
                        # else:
                        #     if role != 'None':
                        #         self.statistics['number_positive_argument'] += 1
                        #     example = EventArgumentExample(anchor, em, sentence, self.event_domain, self.params, role)
                        #     self._generate_example(example, sentence.tokens, self.max_sent_length, self.neighbor_dist, self.do_dmcnn)
                        #     ret.append(example)
                        if role != 'None':
                            self.statistics['number_positive_argument'] += 1
                        if em.coarse_label() in self.event_domain.entity_types.keys():
                            example = EventArgumentExample(
                                anchor, em, sentence, self.event_domain, self.params,
                                self.extractor_params, feature_generator.features,
                                self.hyper_params, role)
                            feature_generator.generate_example(example, sentence.tokens, self.hyper_params)
                            ret.append(example)
    return ret
def add_srl_annotations(doc, srl_filepath, offset_filepath):
    """
    :type doc: nlplingo.text.text_theory.Document
    """
    sentences = []
    """list[nlplingo.text.text_span.Sentence]"""
    with codecs.open(offset_filepath, 'r', encoding='utf-8') as f:
        for line in f:
            tokens = line.strip().split()
            offset = IntPair(int(tokens[0]), int(tokens[1]))
            # now, let's find the Sentence object with this offset
            sentence_match = None
            for sentence in doc.sentences:
                if offset.first == sentence.start_char_offset() and offset.second == sentence.end_char_offset():
                    sentence_match = sentence
                    break
            assert sentence_match is not None
            sentences.append(sentence_match)

    srl_sentences = []
    """:type: list[list[ColumnToken]]"""
    token_strings = []
    with codecs.open(srl_filepath, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if len(line) == 0:
                if len(token_strings) > 0:
                    srl_sentences.append(_convert_to_tokens(token_strings))
                    token_strings = []
            else:
                token_strings.append(line)

    assert len(sentences) == len(srl_sentences), 'len(sentences)={} len(srl_sentences)={}'.format(
        str(len(sentences)), str(len(srl_sentences)))

    #for i, sentence in enumerate(sentences):
    #    assert len(srl_sentences[i]) == len(sentence.tokens), 'i={} start={} end={}'.format(
    #        str(i), str(sentence.start_char_offset()), str(sentence.end_char_offset()))

    for sentence_index, srl_sentence in enumerate(srl_sentences):
        sentence = sentences[sentence_index]
        """:type: nlplingo.text.text_span.Sentence"""

        if len(srl_sentence) != len(sentence.tokens):
            srl_tokens_string = ' '.join(t.text for t in srl_sentence)
            sentence_tokens_string = ' '.join(t.text for t in sentence.tokens)
            print('add_srl_annotation: Skipping doc {} sentence {}: len(srl_sentence)={} '
                  'len(sentence.tokens)={}'.format(doc.docid, str(sentence_index),
                                                   str(len(srl_sentence)), str(len(sentence.tokens))))
            print(' - sen_tokens: {}'.format(sentence_tokens_string))
            print(' - srl_tokens: {}'.format(srl_tokens_string))
            continue

        for column_token_index, column_token in enumerate(srl_sentence):
            if column_token.srl_predicate is not None:
                srl = SRL('dummy')
                srl.predicate_label = column_token.srl_predicate
                for token_index, srl_role in column_token.srl_roles.items():
                    if token_index != column_token.index:  # omit role-arguments that are also the predicate
                        srl.add_role(srl_role, token_index, token_index)

                # expand 'A0' srl-role to its compound and appos
                for (start_token_index, end_token_index) in srl.roles['A0']:
                    srl_argument_indices = set(
                        index for index in range(start_token_index, end_token_index + 1))
                    for token_index in srl_argument_indices:
                        token = sentence.tokens[token_index]
                        expanded_indices = set(r.connecting_token_index for r in token.dep_relations
                                               if 'compound' in r.dep_name)
                        for i in expanded_indices:
                            if i != column_token.index and i != token_index:
                                srl.add_role('A0:compound', i, i)

                sentence.tokens[column_token_index].srl = srl
def add_srl_annotations(doc, srl_filepath, offset_filepath):
    """
    :type doc: nlplingo.text.text_theory.Document
    """
    sentences = []
    """list[nlplingo.text.text_span.Sentence]"""
    with codecs.open(offset_filepath, 'r', encoding='utf-8') as f:
        for line in f:
            tokens = line.strip().split()
            offset = IntPair(int(tokens[0]), int(tokens[1]))
            # now, let's find the Sentence object with this offset
            sentence_match = None
            for sentence in doc.sentences:
                if offset.first == sentence.start_char_offset() and offset.second == sentence.end_char_offset():
                    sentence_match = sentence
                    break
            assert sentence_match is not None
            sentences.append(sentence_match)

    srl_sentences = []
    """:type: list[list[ColumnToken]]"""
    with codecs.open(srl_filepath, 'r', encoding='utf-8') as f:
        srl_json = json.load(f)

    srl_token_strings = srl_json["sentences"]
    srl_tags = srl_json["predicted_srl"]

    threshold_length = 0
    for line in srl_token_strings:
        srl_indices = []
        for item in srl_tags:
            # SRL indices are document-level token positions; keep only the items that
            # fall within this sentence and re-base them to sentence-level positions
            if (int(item[0]) >= threshold_length
                    and int(item[1]) >= threshold_length
                    and int(item[2]) >= threshold_length):
                if (int(item[0]) < threshold_length + len(line)
                        and int(item[1]) < threshold_length + len(line)
                        and int(item[2]) < threshold_length + len(line)):
                    srl_indices.append([
                        int(item[0]) - threshold_length,
                        int(item[1]) - threshold_length,
                        int(item[2]) - threshold_length,
                        item[3]
                    ])
        threshold_length = threshold_length + len(line)
        srl_sentences.append(_convert_to_tokens(line, srl_indices))

    assert len(sentences) == len(srl_sentences), 'len(sentences)={} len(srl_sentences)={}'.format(
        str(len(sentences)), str(len(srl_sentences)))

    for sentence_index, srl_sentence in enumerate(srl_sentences):
        sentence = sentences[sentence_index]
        """:type: nlplingo.text.text_span.Sentence"""

        if len(srl_sentence) != len(sentence.tokens):
            srl_tokens_string = ' '.join(t.text for t in srl_sentence)
            sentence_tokens_string = ' '.join(t.text for t in sentence.tokens)
            print('add_srl_annotation: Skipping doc {} sentence {}: len(srl_sentence)={} '
                  'len(sentence.tokens)={}'.format(doc.docid, str(sentence_index),
                                                   str(len(srl_sentence)), str(len(sentence.tokens))))
            print(' - sen_tokens: {}'.format(sentence_tokens_string.encode('utf-8')))
            print(' - srl_tokens: {}'.format(srl_tokens_string.encode('utf-8')))
            continue

        for column_token_index, column_token in enumerate(srl_sentence):
            if column_token.srl_predicate is not None:
                srl = SRL('dummy')
                srl.predicate_label = column_token.srl_predicate
                for srl_role in column_token.srl_roles[column_token_index]:
                    srl.add_role(srl_role[2], srl_role[0], srl_role[1])

                # expand 'A0' srl-role to its compound and appos
                for token_index1, token_index2 in srl.roles['A0']:
                    token = sentence.tokens[token_index1]
                    expanded_indices = set(r.connecting_token_index for r in token.dep_relations
                                           if 'compound' in r.dep_name)
                    for i in expanded_indices:
                        srl.add_role('A0:compound', i, token_index2)

                sentence.tokens[column_token_index].srl = srl
def __init__(self, int_pair, text):
    self.int_pair = IntPair(int_pair.first, int_pair.second)
    """:type: IntPair"""
    self.text = text