def add_learnit_relations(filepath, docs):
    with codecs.open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            tokens = line.strip().split('\t')
            relation_name = tokens[0]
            docid = tokens[1]
            arg1_start = int(tokens[2])
            arg1_end = int(tokens[3])
            arg2_start = int(tokens[4])
            arg2_end = int(tokens[5])
            arg1_text = re.search(r'<SLOT0>(.*?)</SLOT0>', tokens[6]).group(1)
            arg2_text = re.search(r'<SLOT1>(.*?)</SLOT1>', tokens[6]).group(1)

            # Normalize LearnIt relation names to the names used downstream
            if relation_name == 'causes':
                relation_name = 'cause'
            elif relation_name == 'affects':
                relation_name = 'precondition_of'
            # 'occurs_before' is already the name used downstream, so it is left unchanged

            r = Relation(relation_name)
            r.docid = docid
            r.add_arg1_span(IntPair(arg1_start, arg1_end))
            r.add_arg2_span(IntPair(arg2_start, arg2_end))
            r.arg1_text = arg1_text
            r.arg2_text = arg2_text
            docs[docid].append(r)
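# A sketch of the tab-separated LearnIt input line consumed by add_learnit_relations.
# The column layout follows the parsing above; the docid, offsets, and slot texts
# are made-up values for illustration:
#
#   causes<TAB>ENG_NW_0001<TAB>112<TAB>123<TAB>140<TAB>155<TAB>... <SLOT0>heavy rains</SLOT0> led to <SLOT1>severe flooding</SLOT1> ...
#
# Column 0 is the LearnIt relation name, column 1 the docid, columns 2-5 the
# character offsets of the two arguments, and column 6 a marked-up string from
# which the argument texts are recovered via the <SLOT0>/<SLOT1> tags.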
def read_pdtb_json(filename, causal_model, flip_args_enabled=False):
    doc_relations = defaultdict(list)
    with codecs.open(filename, 'r', encoding='utf-8') as f:
        json_data = json.load(f)

    for eg in json_data:
        if eg is not None:
            semantic_class = eg['semantic_class']
            r = Relation(semantic_class)

            flip_args = False
            if 'connective_text' in eg:
                r.connective_text = eg['connective_text']
                # For these connectives, swap the two arguments when requested
                if flip_args_enabled and r.connective_text.lower() in (
                        "after", "as", "as long as", "because", "insofar as",
                        "now that", "once", "since", "when", "when and if"):
                    flip_args = True

            arg1_spans = eg['arg1_span_list']
            arg1_text = eg['arg1_text']
            arg2_spans = eg['arg2_span_list']
            arg2_text = eg['arg2_text']
            if flip_args:
                arg1_spans, arg2_spans = arg2_spans, arg1_spans
                arg1_text, arg2_text = arg2_text, arg1_text

            docid = eg['docid']
            if '.' in docid:
                docid = re.search(r'^(.*)\.(.*)$', docid).group(1)

            r.model = causal_model
            r.docid = docid
            r.relation_type = eg['relation_type']
            for span in arg1_spans:
                offset = IntPair(int(span[0]), int(span[1]))
                r.add_arg1_span(offset)
            for span in arg2_spans:
                offset = IntPair(int(span[0]), int(span[1]))
                r.add_arg2_span(offset)
            r.arg1_text = arg1_text
            r.arg2_text = arg2_text
            if 'cause_sentence' in eg:
                r.sentence = eg['cause_sentence']

            doc_relations[r.docid].append(r)
    return doc_relations
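# Illustrative shape of a single record consumed by read_pdtb_json. The keys are
# the ones actually read above; the values are invented for illustration only.
_EXAMPLE_PDTB_RECORD = {
    "docid": "AFP_ENG_20190101.0001",   # the suffix after the last '.' is stripped to form r.docid
    "semantic_class": "Contingency.Cause",
    "relation_type": "Explicit",
    "connective_text": "because",        # may trigger argument flipping when flip_args_enabled
    "arg1_span_list": [[120, 152]],
    "arg1_text": "crops failed across the region",
    "arg2_span_list": [[160, 179]],
    "arg2_text": "a prolonged drought",
    "cause_sentence": "Crops failed across the region because of a prolonged drought."
}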
def to_sentence(text, start, end):
    """Converts a sentence's raw text to a Sentence object."""
    char_offsets = IntPair(start, end)
    tokens = []
    offset = start
    for t in text.split():
        token = Token(IntPair(offset, offset + len(t)), t)
        tokens.append(token)
        offset += len(t) + 1  # +1 to account for whitespace
    return Sentence(char_offsets, text, tokens)
def sentence_segmention_and_tokenization_with_text(self, model):
    """Whatever model we pass in must be able to perform sentence segmentation
    and tokenization by calling model(self.text). We typically use Spacy.
    """
    doc = model(self.text)
    for sent_index, sent in enumerate(doc.sents):
        tokens = []
        for token_index, token in enumerate(sent):
            start = token.idx
            end = token.idx + len(token.text)
            tokens.append(Token(IntPair(start, end), token.text, token, token_index))
        sentence = Sentence(self.docid, IntPair(sent.start_char, sent.end_char),
                            sent.text.strip(), tokens, sent_index)
        self.sentences.append(sentence)
def line_to_predictions(ner_fea, dec, json_eg, attr, content_type, word_embeddings,
                        trigger_generator, trigger_model, arg_generator):
    """
    :type word_embeddings: embeddings.word_embeddings.WordEmbedding
    :type trigger_generator: event.event_trigger.EventTriggerGenerator
    :type trigger_model: model.event_cnn.EventExtractionModel
    :type arg_generator: event.event_argument.EventArgumentGenerator
    """
    global spacy_en

    content = find(attr, json_eg)  # json_eg.get(attr)

    offset = 0
    all_predictions = []

    if content is not None:
        if type(content) is list:
            content = '\n'.join(content)

        for line in content.split('\n'):
            doc_ner_predictions = []
            sentences = get_sentences(line, content_type)
            if sentences is not None:
                for sent in sentences:
                    sent_predictions = decode_sentence(ner_fea, dec, content, sent, offset, content_type)
                    doc_ner_predictions.extend(sent_predictions)
                    all_predictions.extend(sent_predictions)

            if content_type == 'Post':
                doc = Document('dummy', line)
                for i, p in enumerate(doc_ner_predictions):
                    id = 'em-{}'.format(i)
                    doc.add_entity_mention(
                        EntityMention(id, IntPair(p['start'], p['end']), p['text'], p['label']))
                doc.annotate_sentences(spacy_en, word_embeddings)

                (trigger_examples, trigger_data, trigger_data_list, trigger_label) = \
                    generate_trigger_data_feature(trigger_generator, [doc])
                trigger_predictions = trigger_model.predict(trigger_data_list)

            offset += len(line) + 1  # +1 to account for newline

    # a list of dicts, one for each predicted NE mention
    if len(all_predictions) > 0:
        if 'extractions' not in json_eg:
            json_eg['extractions'] = {}
        json_eg['extractions'][attr] = all_predictions

    return json_eg
def process_events(cls, doc, document_node):
    """
    :type doc: text.text_theory.Document
    :type document_node: xml.etree.ElementTree.Element
    """
    for event_node in document_node.findall('event'):
        event_id = event_node.attrib['ID']
        event_type = event_node.attrib['TYPE']
        event_subtype = event_node.attrib['SUBTYPE']

        #for event_argument_node in event_node.findall('event_argument'):
        #    argument = Argument(event_argument_node.attrib['REFID'], event_argument_node.attrib['ROLE'])
        #    event.add_argument(argument)

        for mention_node in event_node.findall('event_mention'):
            mention_id = mention_node.attrib['ID']
            event = Event(mention_id, event_type + '.' + event_subtype)

            anchor = mention_node.find('anchor')
            (text, start, end) = cls.process_xml_charseq(anchor[0])
            event.add_anchor(Anchor(mention_id + '-trigger', IntPair(start, end), text,
                                    event_type + '.' + event_subtype))

            for argument_mention_node in mention_node.findall('event_mention_argument'):
                arg_id = argument_mention_node.attrib['REFID']
                arg_role = argument_mention_node.attrib['ROLE']
                arg_em = doc.get_entity_mention_with_id(arg_id)
                assert arg_em is not None
                event_arg = EventArgument('{}-a{}'.format(mention_id, event.number_of_arguments()),
                                          arg_em, arg_role)
                event.add_argument(event_arg)

            doc.add_event(event)
def read_serif_json(filename):
    ret = defaultdict(list)
    with codecs.open(filename, 'r', encoding='utf-8') as f:
        json_data = json.load(f)

    # these are ACE, KBP, and GENERIC events
    for event in json_data['events'] + json_data['generic_events']:
        start = event['anchor_start']
        end = event['anchor_end']
        event_type = event['event_type']
        text = event['anchor_text']
        span = EventSpan('dummy', IntPair(start, end), text, event_type)
        e = Event(span)
        e.snippet = event['snippet']
        e.docid = event['docid']
        if event_type.startswith('Class-'):
            e.model = event_models.GENERIC
        else:
            e.model = event_models.KBP
        ret[e.docid].append(e)

    for event in json_data['accent_events']:
        event_name = event['event_name']
        args = event['participants']
        source_offset = None
        target_offset = None
        if 'Source' in args:
            source = args['Source']
            source_offset = IntPair(source['head_start_char'], source['head_end_char'])
        if 'Target' in args:
            target = args['Target']
            target_offset = IntPair(target['head_start_char'], target['head_end_char'])
        event_offset = offset_from_offsets(source_offset, target_offset)
        span = EventSpan('dummy', event_offset, 'dummy', event_name)
        e = Event(span)
        e.model = event_models.ACCENT
        e.snippet = event['snippet']
        e.docid = event['docid']
        ret[e.docid].append(e)

    return ret
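# Illustrative shape of the JSON consumed by read_serif_json. Only the keys
# accessed above are shown; the docids, types, and offsets are invented.
_EXAMPLE_SERIF_JSON = {
    "events": [
        {"docid": "doc-001", "event_type": "Conflict.Attack", "anchor_text": "bombing",
         "anchor_start": 45, "anchor_end": 52, "snippet": "..."}
    ],
    "generic_events": [
        # a 'Class-' prefix on event_type marks the event as GENERIC above
        {"docid": "doc-001", "event_type": "Class-Displacement", "anchor_text": "fled",
         "anchor_start": 230, "anchor_end": 234, "snippet": "..."}
    ],
    "accent_events": [
        {"docid": "doc-001", "event_name": "Appeal", "snippet": "...",
         "participants": {"Source": {"head_start_char": 10, "head_end_char": 20},
                          "Target": {"head_start_char": 30, "head_end_char": 42}}}
    ]
}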
def __init__(self, id, entity_mention, label):
    """:type entity_mention: text.text_span.EntityMention"""
    Span.__init__(self,
                  IntPair(entity_mention.start_char_offset(), entity_mention.end_char_offset()),
                  entity_mention.text)
    self.id = id
    self.label = label
    self.entity_mention = entity_mention
def print_spacy_sentence_as_conll(sent, entity_mentions, offset):
    all_tokens = []
    for token in sent:
        start = token.idx + offset
        end = start + len(token.text)
        all_tokens.append(Token(IntPair(start, end), token.text, token.tag_))  # token.tag_ : POS-tag
    return tokens_to_conll(all_tokens, entity_mentions)
def offset_from_offsets(self, offset1, offset2):
    if offset1 is not None and offset2 is not None:
        c1 = min(offset1.first, offset2.first)
        c2 = max(offset1.second, offset2.second)
        return IntPair(c1, c2)
    elif offset1 is not None:
        return offset1
    elif offset2 is not None:
        return offset2
    else:
        return None
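# Minimal usage sketch for offset_from_offsets. The 'reader' instance and the
# numbers are illustrative; .first/.second follow IntPair usage elsewhere in this file:
#
#   merged = reader.offset_from_offsets(IntPair(10, 25), IntPair(20, 40))
#   # merged covers (10, 40); if either argument is None, the other is returned unchanged.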
def sentence_segmention_and_tokenization_with_list(self, model):
    """Whatever model we pass in must be able to perform sentence segmentation
    and tokenization by calling model(self.text). We typically use Spacy.
    """
    offset = 0
    for ss in self.sentence_strings:
        if len(ss) == 0 or ss.isspace():
            pass
        else:
            for sent in model(ss).sents:  # for each Spacy sentence
                tokens = []
                for token_index, token in enumerate(sent):
                    start = offset + token.idx
                    end = start + len(token.text)
                    tokens.append(Token(IntPair(start, end), token.text, token, token_index))
                sentence = Sentence(self.docid,
                                    IntPair(offset + sent.start_char, offset + sent.start_char + len(sent.text)),
                                    sent.text.strip(), tokens, len(self.sentences))
                self.sentences.append(sentence)
        # advance the document-level offset even for empty/whitespace strings
        offset += len(ss)
def process_times(cls, doc, document_node):
    """
    :type doc: text.text_theory.Document
    :type document_node: xml.etree.ElementTree.Element
    """
    for time_node in document_node.findall('timex2'):
        time_id = time_node.attrib['ID']
        all_mentions = time_node.findall('timex2_mention')
        for mention_node in all_mentions:
            mention_id = mention_node.attrib['ID']
            (text, start, end) = cls.process_xml_charseq(mention_node[0][0])
            em = EntityMention(mention_id, IntPair(start, end), text, 'Time')
            doc.add_entity_mention(em)
def read_pdtb_json(filename):
    doc_relations = defaultdict(list)
    with codecs.open(filename, 'r', encoding='utf-8') as f:
        json_data = json.load(f)

    for eg in json_data:
        if eg is not None:
            arg1_spans = eg['arg1_span_list']
            arg1_text = eg['arg1_text']
            arg2_spans = eg['arg2_span_list']
            arg2_text = eg['arg2_text']
            semantic_class = eg['semantic_class']

            docid = eg['docid']
            docid = re.search(r'^(.*)\.(.*)$', docid).group(1)

            r = Relation(semantic_class)
            r.connective_text = eg['connective_text']
            r.docid = docid
            r.relation_type = eg['relation_type']
            for span in arg1_spans:
                offset = IntPair(int(span[0]), int(span[1]))
                r.add_arg1_span(offset)
            for span in arg2_spans:
                offset = IntPair(int(span[0]), int(span[1]))
                r.add_arg2_span(offset)
            r.arg1_text = arg1_text
            r.arg2_text = arg2_text

            doc_relations[r.docid].append(r)
    return doc_relations
def process_values(cls, doc, document_node):
    """
    :type doc: text.text_theory.Document
    :type document_node: xml.etree.ElementTree.Element
    """
    for value_node in document_node.findall('value'):
        value_id = value_node.attrib['ID']
        value_type = value_node.attrib['TYPE']
        all_mentions = value_node.findall('value_mention')
        for mention_node in all_mentions:
            mention_id = mention_node.attrib['ID']
            (text, start, end) = cls.process_xml_charseq(mention_node[0][0])
            em = EntityMention(mention_id, IntPair(start, end), text, value_type)
            doc.add_entity_mention(em)
def file_to_document(filepath):
    f = open(filepath, 'rU')
    sentences = []
    offset = 0
    for line in f:
        sentence = to_sentence(line, offset, offset + len(line))
        sentences.append(sentence)
        offset += len(line)  # len(line) already includes the trailing newline
    f.close()

    s_strings = [s.label for s in sentences]
    doc_text = "\n".join(s_strings)
    return Document(IntPair(0, offset - 1), doc_text, sentences)
def process_entities(cls, doc, document_node):
    """
    :type doc: text.text_theory.Document
    :type document_node: xml.etree.ElementTree.Element
    """
    all_entities = document_node.findall('entity')
    for entity_node in all_entities:
        entity_id = entity_node.attrib['ID']
        entity_type = entity_node.attrib['TYPE']
        entity_subtype = entity_node.attrib['SUBTYPE']
        all_mentions = entity_node.findall('entity_mention')
        for mention_node in all_mentions:
            mention_id = mention_node.attrib['ID']
            head = mention_node.find('head')
            (text, start, end) = cls.process_xml_charseq(head[0])
            em = EntityMention(mention_id, IntPair(start, end), text,
                               entity_type + '.' + entity_subtype)
            doc.add_entity_mention(em)
def extract_sentence_annotation(text, offset):
    """offset: char offset thus far (excluding xml tags) from prior sentences."""
    start_tag = 0
    end_tag = -1
    raw_text = ''
    entity_mentions = []

    # ignore everything starting from 'REMOVED_URL'
    url_index = text.find(' REMOVED_URL', 0)
    if url_index != -1:
        text = text[0:url_index]

    start_tag = text.find('<ENAMEX', 0)
    while start_tag != -1:
        raw_text += text[end_tag + 1:start_tag]
        end_tag = text.find('>', start_tag)
        entity_type = re.search(r' TYPE="(.*)"', text[start_tag:end_tag]).group(1)
        start_tag = text.find('</ENAMEX>', end_tag)
        mention_text = text[end_tag + 1:start_tag]
        start = offset + len(raw_text)
        end = offset + len(raw_text) + len(mention_text)

        if '-' in mention_text and entity_type.endswith('DESC'):
            print(('Rejecting %s[%s], because Spacy will split the string into multiple tokens, '
                   'and DESC should always be just a single word' % (entity_type, mention_text)).encode('utf-8'))
        else:
            (new_mention_text, prefix_length, suffix_length) = strip_mention_text(mention_text)
            if new_mention_text != mention_text:
                print(('Revising %s to %s' % (mention_text, new_mention_text)).encode('utf-8'))
            id = 'm-' + str(start + prefix_length) + '-' + str(end - suffix_length)
            entity_mentions.append(EntityMention(id, IntPair(start + prefix_length, end - suffix_length),
                                                 new_mention_text, entity_type))

        raw_text += mention_text
        end_tag = text.find('>', start_tag)
        start_tag = text.find('<ENAMEX', end_tag)

    raw_text += text[end_tag + 1:]
    return (raw_text, entity_mentions)
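# Sketch of an input line handled by extract_sentence_annotation (the sentence
# is invented for illustration). Given
#
#   'Floods hit <ENAMEX TYPE="GPE">Mozambique</ENAMEX> last week REMOVED_URL http://...'
#
# the function returns the raw text with the ENAMEX tags removed
# ('Floods hit Mozambique last week'), plus an EntityMention of type GPE whose
# offsets are relative to the accumulated raw text; everything from
# ' REMOVED_URL' onward is discarded.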
def file_to_document(filepath):
    f = codecs.open(filepath, 'r', encoding='utf8')
    sentences = []
    offset = 0
    for line in f:
        (raw_text, entity_mentions) = extract_sentence_annotation(line.strip(), offset)
        sentence = text_span.to_sentence(raw_text, offset, offset + len(raw_text))
        sentence.add_annotation('ENTITY_MENTIONS', entity_mentions)
        sentences.append(sentence)
        offset += len(raw_text) + 1  # +1 to account for newline
    f.close()

    s_strings = [s.label for s in sentences]
    doc_text = "\n".join(s_strings)
    #doc_id = os.path.basename(filepath)
    doc_id = filepath
    return Document(doc_id, IntPair(0, offset - 1), doc_text, sentences)
def _read_spans_from_file(self, infile, event_type, text, events=None):
    """Get the positive and negative spans

    Returns: list[text.text_span.TextSpan]
    """
    ret = []
    with open(infile, 'r') as f:
        for line in f:
            tokens = line.strip().split()
            span_type = tokens[0]
            start = int(tokens[1])
            end = int(tokens[2]) + 1

            text_string = ' '.join(text[start:end].replace('\n', ' ').strip().split())
            end = start + len(text_string)

            if '<' in text_string or '>' in text_string:
                print('Skipping annotation of type {}, as it has either "<" or ">"'.format(span_type))
                continue

            span_offset = IntPair(start, end)
            if span_type == event_type:
                # if this is a positive span, then we need to make sure we have an event for it
                if events is not None:
                    found_span = False
                    for event in events:
                        if offset_same(event.event_spans[0].int_pair, span_offset):
                            found_span = True
                            break
                    if found_span:
                        ret.append(TextSpan(span_offset, text_string))
                        self.positive_span_count += 1
                    else:
                        self.discard_span_count += 1
                else:
                    self.discard_span_count += 1
            elif span_type == 'negative':
                ret.append(TextSpan(span_offset, text_string))
                self.negative_span_count += 1
    return ret
def read(self, kb, serif_causal_relation, learnit_causal_relation, extra_causal_relation):
    print("CausalRelationReader START")
    count = 0
    #docid_to_relation_list = self.read_causal_relation_json(pdtb_json, self.causal_models.PDTB, flip_args_enabled=True)  # docs with pdtb relations
    #for key in docid_to_relation_list:
    #    count += len(docid_to_relation_list[key])
    #print('count = {}'.format(count))

    docid_to_relation_list = defaultdict(list)
    print("CausalRelationReader READ CAUSAL RELATIONS")
    self.add_serif_causal_relations(serif_causal_relation, docid_to_relation_list)
    self.add_learnit_causal_relations(learnit_causal_relation, docid_to_relation_list)
    if extra_causal_relation != "NA":
        self.add_learnit_causal_relations(extra_causal_relation, docid_to_relation_list)

    count = 0
    for key in docid_to_relation_list:
        count += len(docid_to_relation_list[key])

    # Build Document objects (Document object is a nested class above)
    docid_to_document = dict()
    print("CausalRelationReader READ EVENTS")
    for kb_event in kb.evid_to_kb_event.values():
        model = None
        if kb_event.event_mentions[0].model == "ACCENT":
            model = self.event_models.ACCENT
        elif kb_event.event_mentions[0].model == "KBP":
            model = self.event_models.KBP
        else:
            continue

        for kb_event_mention in kb_event.event_mentions:
            event_type = kb_event_mention.event_type

            event_offset = None
            if kb_event_mention.trigger_start is not None and kb_event_mention.trigger_end is not None:
                start = kb_event_mention.trigger_start
                end = kb_event_mention.trigger_end
                event_offset = IntPair(start, end)
            else:
                source_offset = None
                target_offset = None
                if 'Source' in kb_event_mention.arguments:
                    source = kb_event_mention.arguments['Source']
                    source_offset = IntPair(source[0].head_start_char, source[0].head_end_char)
                if 'Target' in kb_event_mention.arguments:
                    target = kb_event_mention.arguments['Target']
                    target_offset = IntPair(target[0].head_start_char, target[0].head_end_char)
                event_offset = self.offset_from_offsets(source_offset, target_offset)

            text = kb_event_mention.trigger
            if text is None:
                text = "dummy"
            snippet = kb_event_mention.snippet
            docid = kb_event_mention.document.id

            # Create local objects
            if docid not in docid_to_document:
                docid_to_document[docid] = self.Document(docid)

            span = EventSpan('dummy', event_offset, text, event_type)
            e = self.Event(span, kb_event, kb_event_mention)
            e.model = model
            e.snippet = snippet
            e.docid = docid
            docid_to_document[docid].add_event(e)

    print("CausalRelationReader ADD RELATIONS TO DOCUMENTS")
    for docid, doc in docid_to_document.iteritems():
        """:type: Document"""
        if docid in docid_to_relation_list:
            relations = docid_to_relation_list[docid]
            doc.add_relations(relations)

    count_stats = defaultdict(int)
    for docid, doc in docid_to_document.iteritems():
        kb_document = kb.docid_to_kb_document[docid]
        self.find_events_in_doc_relations(doc)

        for relation in doc.causal_relations:
            # e1 and e2 are Event objects above
            e1 = relation.left_factor
            e2 = relation.right_factor
            snippet = None

            # Map to standard type names
            relation_type = relation.label
            if relation.label == "cause" or relation.label == "Contingency.Cause":
                relation_type = "Cause-Effect"
            elif relation.label == "Contingency.Condition" or relation.label == "precondition_of":
                relation_type = "Precondition-Effect"
            elif relation.label == "Temporal.Asynchronous" or relation.label == "occurs_before":
                relation_type = "Before-After"
            elif relation.label == "catalyst_effect":
                relation_type = "Catalyst-Effect"
            elif relation.label == "cause_effect":
                relation_type = "Cause-Effect"
            elif relation.label == "mitigating_factor_effect":
                relation_type = "MitigatingFactor-Effect"
            elif relation.label == "precondition_effect":
                relation_type = "Precondition-Effect"
            elif relation.label == "preventative_effect":
                relation_type = "Preventative-Effect"

            left_id = e1.kb_event.id
            right_id = e2.kb_event.id
            relation_id = SharedIDManager.get_in_document_id("Relation", docid)
            relation_mention_id = SharedIDManager.get_in_document_id("RelationMention", docid)
            #print("reln: " + relation_type + ", " + left_id + ", " + right_id)

            kb_relation = KBRelation(relation_id, "event-event", relation_type, left_id, right_id)

            e1_start = int(e1.snippet[1])
            e1_end = int(e1.snippet[2])
            e2_start = int(e2.snippet[1])
            e2_end = int(e2.snippet[2])
            if e1_start == e2_start and e1_end == e2_end:
                snippet = e1.snippet
            else:
                # Combine snippets into one, these should be adjacent sentences
                combined_snippet = [None, None, None]
                if e1_start < e2_start:
                    combined_snippet[0] = e1.snippet[0] + " " + e2.snippet[0]
                    combined_snippet[1] = e1_start
                else:
                    combined_snippet[0] = e2.snippet[0] + " " + e1.snippet[0]
                    combined_snippet[1] = e2_start
                if e1_end > e2_end:
                    combined_snippet[2] = e1_end
                else:
                    combined_snippet[2] = e2_end
                snippet = combined_snippet

            kb_relation_mention = KBRelationMention(relation_mention_id, e1.kb_event_mention,
                                                    e2.kb_event_mention, snippet, kb_document)
            kb_relation_mention.properties["model"] = relation.model
            if relation.pattern is not None:
                kb_relation_mention.properties["pattern"] = relation.pattern
            if relation.confidence is not None:
                kb_relation_mention.properties["extraction_confidence"] = relation.confidence
            kb_relation.add_relation_mention(kb_relation_mention)
            kb.add_relation(kb_relation)
def read_causal_relation_json(self, filename, causal_model, flip_args_enabled=False):
    doc_relations = defaultdict(list)
    with codecs.open(filename, 'r', encoding='utf-8') as f:
        try:
            json_data = json.load(f)
        except ValueError as ve:
            print("While loading: " + filename)
            print(str(ve))
            sys.exit(1)

    for eg in json_data:
        if eg is not None:
            semantic_class = eg['semantic_class']
            r = self.Relation(semantic_class)

            flip_args = False
            if 'connective_text' in eg:
                r.connective_text = eg['connective_text']
                if flip_args_enabled and r.connective_text.lower() in (
                        "after", "as", "as long as", "because", "insofar as",
                        "now that", "once", "since", "when", "when and if"):
                    flip_args = True

            if 'prob' in eg:
                r.confidence = float(eg['prob'])
            r.pattern = eg.get('learnit_pattern')

            arg1_spans = eg['arg1_span_list']
            arg1_text = eg['arg1_text']
            arg2_spans = eg['arg2_span_list']
            arg2_text = eg['arg2_text']
            if flip_args:
                arg1_spans, arg2_spans = arg2_spans, arg1_spans
                arg1_text, arg2_text = arg2_text, arg1_text

            docid = eg['docid']
            if '.' in docid:
                docid = re.search(r'^(.*)\.(.*)$', docid).group(1)

            r.model = causal_model
            r.docid = docid
            r.relation_type = eg['relation_type']
            for span in arg1_spans:
                offset = IntPair(int(span[0]), int(span[1]))
                r.add_arg1_span(offset)
            for span in arg2_spans:
                offset = IntPair(int(span[0]), int(span[1]))
                r.add_arg2_span(offset)
            r.arg1_text = arg1_text
            r.arg2_text = arg2_text
            if 'cause_sentence' in eg:
                r.sentence = eg['cause_sentence']

            doc_relations[r.docid].append(r)
    return doc_relations
def process_enote_file(doc, xml_file, auto_adjust):
    """Parses ENote annotation file to annotated_document.DocumentAnnotation

    :param doc: document whose raw text and docid correspond to the xml_file
    :param xml_file: ENote annotation file
    :param auto_adjust: Adjust annotation (start, end) position to match text.
                        Useful if annotation data is noisy.
    :return: document_annotation.DocumentAnnotation
    """
    tree = etree.parse(xml_file)
    root_node = tree.getroot()
    all_text = doc.text

    events_node = root_node.find('dc:Events', NAMESPACES)
    for event_index, event_node in enumerate(events_node):
        event_type = event_node.find('dc:Name', NAMESPACES).text.decode('UTF8')
        event_id = '{}-e{}'.format(doc.docid, event_index)
        event = Event(event_id, event_type)

        candidate_anchors = []
        candidate_arguments = []
        for argument_index, argument_node in enumerate(event_node.find('dc:Arguments', NAMESPACES)):
            argument = EnoteArgument.from_xml_node(argument_node)
            # Skip if the argument is empty
            if argument is None:
                continue

            start = argument.start
            end = argument.end
            unicode_text = all_text[start:end]
            if all_text[start:end] != argument.text and auto_adjust:
                start, end = utils.find_best_location(all_text, argument.text, start, end)
                unicode_text = all_text[start:end]

            # TODO : we could also treat the following as anchors:
            # - event_type == 'Vulnerability' and argument.name == 'Name'
            # - event_type == 'Exploit' and argument.name == 'Name'
            if argument.name == 'Anchor':
                anchor_id = '{}-t{}'.format(event_id, len(candidate_anchors))
                anchor = Anchor(anchor_id, IntPair(start, end), unicode_text, event_type)
                candidate_anchors.append(anchor)
            else:
                arg_id = '{}-a{}'.format(event_id, len(candidate_arguments))
                # get the entity mention associated with the event argument
                em = doc.get_entity_mention_with_span(start, end)
                if em is None:
                    print('Dropping event argument, as I cannot find an entity mention with same offsets. '
                          '%s (%d,%d) "%s" %s' % (doc.docid, start, end,
                                                  unicode_text.encode('ascii', 'ignore'),
                                                  argument.name.decode('UTF8')))
                else:
                    arg = EventArgument(arg_id, em, argument.name.decode('UTF8'))
                    candidate_arguments.append(arg)

        for anchor in candidate_anchors:
            if event.overlaps_with_anchor(anchor):
                print('Dropping overlapping anchor, %s' % (anchor.to_string()))
            else:
                event.add_anchor(anchor)

        for arg in candidate_arguments:
            if event.overlaps_with_anchor(arg):
                print('Dropping argument that overlaps with anchor, %s' % (arg.to_string()))
            else:
                event.add_argument(arg)

        doc.add_event(event)
    return doc
def read_spans(self, annotated_events):
    """From the annotation files, we capture the positive and negative spans,
    then return a dictionary from filename or docid to list[text.text_span.TextSpan].
    The list orders the TextSpan by their start_char_offset. The text within each
    TextSpan is also normalized, with newlines replaced by space and consecutive
    spaces replaced by a single space.

    :type annotated_events: dict[str, list[text.text_theory.Event]]
    """
    ret = defaultdict(list)
    """:type: dict[str, list[text.text_span.TextSpan]]"""

    # We first collect the positive and negative spans, from the annotation files.
    # Note that the same file can be annotated multiple times (via different event types).
    # Need to de-duplicate the spans later.
    file_spans = defaultdict(list)  # filename -> list[text.text_span.TextSpan]
    """:type: dict[str, list[text.text_span.TextSpan]]"""

    for event_type, annotation_dir in self.event_type_annotation_dir.items():
        for filename in os.listdir(annotation_dir):
            if filename not in self.target_filenames:
                continue
            annotation_file = os.path.join(annotation_dir, filename)
            text_file = os.path.join(self.text_dir, filename)
            with codecs.open(text_file, 'r', encoding='utf-8') as f:
                raw_text = f.read()
            spans = self._read_spans_from_file(annotation_file, event_type, raw_text,
                                               events=annotated_events[filename])
            file_spans[filename].extend(spans)

    # for each file, de-duplicate the spans and order them by their start-char-offset
    for filename in file_spans.keys():
        all_spans = file_spans[filename]
        """:type: list[text.text_span.TextSpan]"""

        current_spans = dict()  # start_char_offset -> TextSpan ; holds de-duplicated spans keyed by start-offset
        """:type: dict[int, text.text_span.TextSpan]"""
        for span in all_spans:
            # check whether 'span' is already in current_spans
            span_offset = IntPair(span.start_char_offset(), span.end_char_offset())
            to_add = True
            for start, s in current_spans.items():
                s_offset = IntPair(s.start_char_offset(), s.end_char_offset())
                if offset_same(span_offset, s_offset):
                    print('Found offset_same spans')
                    to_add = False
                    break
                elif offset_overlap(span_offset, s_offset):
                    # we will remove both spans, just to reduce noise
                    print('Found offset_overlap spans in file {}, {}:{}'.format(
                        filename, span_offset.to_string(), s_offset.to_string()))
                    print('[{}]\n==== vs ====\n[{}]\n'.format(span.text, s.text))
                    del current_spans[start]
                    to_add = False
                    break
            if to_add:
                current_spans[span.start_char_offset()] = span

        if len(current_spans) > 0:
            for start_char_offset in sorted(current_spans):
                span = current_spans[start_char_offset]
                """:type: text.text_span.TextSpan"""
                ret[filename].append(span)
    return ret
def adjust_and_write_annotation_offset(cls, file_spans, annotated_events, output_dir):
    """Since we keep only the positive and negative spans from the original text files,
    we need to adjust the annotation offsets accordingly.

    :type file_spans: dict[str, list[text.text_span.TextSpan]]
    :type annotated_events: dict[str, list[text.text_theory.Event]]

    The keys for both dictionaries are filenames. Note that the filename keys in
    annotated_events are a subset of the filename keys in file_spans, since some
    files might contain only negative spans.
    """
    for filename, events in annotated_events.items():
        outlines = []  # strings storing adjusted annotation offsets
        """:type: list[str]"""

        spans = file_spans[filename]
        """:type: list[text.text_span.TextSpan]"""

        # establish the new offsets for spans
        new_offsets = []
        """:type: list[common.utils.IntPair]"""
        offset = 0
        for span in spans:
            end = offset + len(span.text)
            new_offsets.append(IntPair(offset, end))
            offset = end + 1  # +1 for the newline

        for event in events:
            event_span = event.event_spans[0]

            # find the index of this event_span in 'spans'
            span_index = -1
            for i, span in enumerate(spans):
                if offset_same(span.int_pair, event_span.int_pair):
                    span_index = i
                    break
            if span_index == -1:
                raise ValueError('Could not find a corresponding span, should not happen')

            span_start = spans[span_index].start_char_offset()
            text = spans[span_index].text
            new_offset = new_offsets[span_index]

            outlines.append('<Event type="{}">'.format(event.label))
            outlines.append('{}\t{}\t{}'.format(event.label, new_offset.first, new_offset.second))

            if event.number_of_anchors() == 0:
                raise ValueError('An event should have at least 1 anchor!')
            for anchor in event.anchors:
                start = anchor.start_char_offset() - span_start
                end = anchor.end_char_offset() - span_start
                if text[start:end] != anchor.text:
                    new_start, new_end = utils.find_best_location(text, anchor.text, start, end)
                    print('Adjusting anchor offsets from ({},{}) to ({},{})'.format(start, end, new_start, new_end))
                    start = new_start
                    end = new_end
                start += new_offset.first
                end += new_offset.first
                outlines.append('anchor\t{}\t{}'.format(start, end))

            for arg in event.arguments:
                start = arg.start_char_offset() - span_start
                end = arg.end_char_offset() - span_start
                if text[start:end] != arg.text:
                    new_start, new_end = utils.find_best_location(text, arg.text, start, end)
                    print('Adjusting argument offsets from ({},{}) to ({},{})'.format(start, end, new_start, new_end))
                    start = new_start
                    end = new_end
                start += new_offset.first
                end += new_offset.first
                outlines.append('{}/{}\t{}\t{}'.format(event.label, arg.label, start, end))

            outlines.append('</Event>')

        if len(outlines) > 0:
            with open(os.path.join(output_dir, filename + '.meta'), 'w') as f:
                for line in outlines:
                    f.write(line + '\n')
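# Sketch of the .meta output written above for one event. The line layout mirrors
# the outlines.append(...) calls; the type, role, and offsets are invented, and
# fields are tab-separated:
#
#   <Event type="Attack">
#   Attack<TAB>0<TAB>96
#   anchor<TAB>12<TAB>19
#   Attack/Attacker<TAB>33<TAB>45
#   </Event>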
def _read_annotation_file(cls, infile, event_type, text):
    """
    :type infile: str
    :type event_type: str
    :type text: str

    :param text: this is the raw text corresponding to the annotation

    Returns: list[text.text_theory.Event]
    """
    docid = os.path.basename(infile)

    events = []
    """:type: list[text.text_theory.Event]"""
    negative_spans = []
    """:type: list[text.text_span.TextSpan]"""
    anchors_not_in_eventspans = []  # these might be in negative spans
    """:type: list[text.text_span.Anchor]"""

    with open(infile, 'r') as f:
        for line in f:
            tokens = line.strip().split()
            span_type = tokens[0]
            start = int(tokens[1])
            end = int(tokens[2]) + 1

            text_string = ' '.join(text[start:end].replace('\n', ' ').strip().split())
            end = start + len(text_string)

            if '<' in text_string or '>' in text_string:
                print('Skipping annotation of type {}, as it has either "<" or ">"'.format(span_type))
                continue

            if span_type == event_type:
                id = '{}-e{}'.format(docid, len(events))
                event_span = EventSpan(id, IntPair(start, end), text_string, event_type)
                e = Event(id, event_type)
                e.add_event_span(event_span)
                events.append(e)
            elif '/' in span_type:
                # this is an event argument
                em = EntityMention('dummy', IntPair(start, end), text_string, 'dummy')
                event_role = span_type.split('/')[1]
                e = cls._find_event_containing_span(events, start, end)
                if e is None:
                    print('Cannot find an event span for {} {} (start,end)=({},{}) "{}". Skipping.'.format(
                        event_type, docid, start, end, text_string))
                else:
                    arg_id = '{}-a{}'.format(e.id, e.number_of_arguments())
                    e.add_argument(EventArgument(arg_id, em, event_role))
            elif span_type == 'anchor':
                e = cls._find_event_containing_span(events, start, end)
                anchor = Anchor('dummy', IntPair(start, end), text_string, event_type)
                if e is None:
                    # it might be in a negative span
                    anchors_not_in_eventspans.append(anchor)
                else:
                    e.add_anchor(anchor)
            elif span_type == 'negative':
                negative_spans.append(TextSpan(IntPair(start, end), text_string))
            elif span_type == 'interesting':
                pass  # we discard these for now

    for anchor in anchors_not_in_eventspans:
        found = False
        for span in negative_spans:
            if span.start_char_offset() <= anchor.start_char_offset() and \
                    anchor.end_char_offset() <= span.end_char_offset():
                found = True
                break
        if not found:
            print('Cannot find an event nor negative span for anchor {} {} (start,end)=({},{}) "{}". Skipping.'.format(
                event_type, docid, anchor.start_char_offset(), anchor.end_char_offset(),
                anchor.text.replace(' ', '_')))

    # keep only events with anchor
    return [event for event in events if event.number_of_anchors() > 0]
    ner_decoder, content, sent, offset=0, content_type='Blog'))

for p in ner_predictions:
    print(p)

# create a document based on text content, add NER predictions as EntityMentions, then apply Spacy to
# perform sentence segmentation and tokenization, and use Spacy tokens to back the EntityMentions
doc = Document('dummy', content)
for i, p in enumerate(ner_predictions):
    id = 'em-{}'.format(i)
    doc.add_entity_mention(
        EntityMention(id, IntPair(p['start'], p['end']), p['text'], p['label']))
doc.annotate_sentences(spacy_en, word_embeddings)

event_domain = None
if params.get_string('domain') == 'cyber':
    # initialize a particular event domain, which stores info on the event types and event roles
    event_domain = CyberDomain()

arg_generator = EventArgumentGenerator(event_domain, params)
trigger_generator = EventTriggerGenerator(event_domain, params)

(trigger_examples, trigger_data, trigger_data_list, trigger_label) = \
    generate_trigger_data_feature(trigger_generator, [doc])

print('==== Loading Trigger model ====')