def bnb_json_to_concrete(documents: Iterable[Dict]) -> Iterable[CementDocument]:
    wnl = WordNetLemmatizer()
    for i, doc in tqdm(enumerate(documents)):
        cement_doc = CementDocument.from_tokens(tokens={'passage': doc['sentences']},
                                                doc_id=doc['doc_key'])
        arguments = []
        # lemmatize the first trigger token to get the event type;
        # special handling for the `tax-loss` predicate collapses it to `loss`
        event_type = wnl.lemmatize(word=doc['trigger']['text'][0].lower())
        event_type = event_type if 'loss' not in event_type else 'loss'
        for k, v in doc['arguments'].items():
            if len(v) > 1:
                logger.info(f'Arg has more than one span: {k} - {v}')
            for arg in v:
                arguments.append(
                    CementEntityMention(start=arg['span'][0],
                                        end=arg['span'][1],
                                        document=cement_doc,
                                        # alternative: role=f'{event_type}-{k}' if 'Arg' in k else k
                                        role=k))
        cement_doc.add_event_mention(
            trigger=CementSpan(start=doc['trigger']['span'][0],
                               end=doc['trigger']['span'][1],
                               document=cement_doc),
            arguments=arguments,
            event_type=event_type,
        )
        yield cement_doc

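# A minimal usage sketch (not part of the pipeline): drive `bnb_json_to_concrete`
# over a JSON-lines file and serialize each document's underlying Communication.
# The paths and the one-JSON-object-per-line assumption are illustrative;
# `CementDocument` exposes its Communication as `.comm` (as used throughout this
# module), and `CommunicationWriterTGZ` comes from concrete-python.
def _example_write_bnb_tarball(jsonl_path: str, out_path: str) -> None:
    import json
    from concrete.util import CommunicationWriterTGZ
    writer = CommunicationWriterTGZ(out_path)
    with open(jsonl_path) as f:
        for cement_doc in bnb_json_to_concrete(json.loads(line) for line in f):
            # one `.concrete` entry per document inside the output tarball
            writer.write(cement_doc.comm, f'{cement_doc.comm.id}.concrete')
    writer.close()
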
def rams_json_to_concrete(documents: Iterable[Dict],
                          has_additional_mention: bool = True) -> Iterable[CementDocument]:
    for i, doc in tqdm(enumerate(documents)):
        cement_doc = CementDocument.from_tokens(tokens={'passage': doc['sentences']},
                                                doc_id=doc['doc_key'])
        # `raw_args` is a list of (start, end, role) tuples; the last entry is the trigger
        raw_args, event_type = get_rams_event(doc)
        cement_doc.add_event_mention(
            trigger=CementSpan(start=raw_args[-1][0],
                               end=raw_args[-1][1],
                               document=cement_doc),
            arguments=[
                CementEntityMention(start=start, end=end, document=cement_doc, role=role)
                for start, end, role in raw_args
                if role != 'TRIGGER'
            ],
            event_type=ontology['mappings']['events'][event_type],
        )
        if has_additional_mention:
            additional_mentions = get_predicted_mentions(doc)
            for mention in additional_mentions:
                cement_doc.add_entity_mention(
                    mention=CementEntityMention(start=mention[0],
                                                end=mention[-1],
                                                document=cement_doc))
        yield cement_doc

def gvdb_json_to_concrete(documents: Iterable[Dict]) -> Iterable[CementDocument]:
    for i, doc in tqdm(enumerate(documents)):
        cement_doc = CementDocument.from_tokens(tokens={'passage': doc['full_text']},
                                                doc_id=doc['doc_key'])
        cement_doc.add_event_mention(
            arguments=[
                # GVDB spans appear to use exclusive end offsets, hence `end - 1`
                CementEntityMention(start=start, end=end - 1, document=cement_doc, role=role)
                for start, end, role, _, _ in doc['spans']
            ],
            event_type='Shooting',
        )
        yield cement_doc

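# Shape of one GVDB record consumed above, inferred from the indexing in
# `gvdb_json_to_concrete` (`full_text`, `doc_key`, and five-element `spans`
# rows); all values are illustrative only:
_EXAMPLE_GVDB_DOC = {
    'doc_key': 'gvdb-000',
    'full_text': [['A', 'shooting', 'occurred', 'downtown', '.']],
    # (start, exclusive_end, role, _, _) per span
    'spans': [[0, 1, 'victim', None, None]],
}
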
def _get_events(jdoc: Dict,
                cdoc: CementDocument) -> Iterable[Tuple[str, CementSpan, List[CementSpan]]]:
    events: Dict[Tuple[int, int], List] = {}
    for trigger in jdoc['evt_triggers']:
        trigger_start = trigger[0]
        trigger_end = trigger[1]
        # normalize `unspecified` sub-types to `n/a` before the ontology lookup
        etype = trigger[2][0][0].replace('unspecified', 'n/a')
        events[(trigger_start, trigger_end)] = [
            ontology['mappings']['events'][etype],
            CementSpan(start=trigger_start, end=trigger_end, document=cdoc)
        ]
    for link in jdoc['gold_evt_links']:
        # logger.info(f'Role: {link[2]} - Normalized Role: {ontology["mappings"]["args"][link[2]]}')
        events[(link[0][0], link[0][1])].append(
            CementEntityMention(
                start=link[1][0],
                end=link[1][1],
                role=ontology['mappings']['args'][link[2]],
                document=cdoc
            )
        )
    # each value is [event_type, trigger_span, *argument_mentions]
    for event in events.values():
        yield event[0], event[1], event[2:]

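# Shape of the RAMS-style `jdoc` consumed by `_get_events`, inferred from the
# indexing above (values illustrative): `evt_triggers` rows look like
# [start, end, [[event_type, ...]]], and `gold_evt_links` rows look like
# [[trigger_start, trigger_end], [arg_start, arg_end], role]:
_EXAMPLE_RAMS_JDOC = {
    'evt_triggers': [[3, 3, [['conflict.attack.unspecified', 1.0]]]],
    'gold_evt_links': [[[3, 3], [0, 1], 'evt089arg01attacker']],
}
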
def process_mentions(doc_stream: Iterable[CementDocument],
                     predictor: Predictor) -> Iterable[CementDocument]:
    for doc in doc_stream:
        # process `EntityMention`s: locate each mention's head token and
        # write it back onto the Communication
        for em in doc.iterate_entity_mentions():
            cem = CementEntityMention.from_entity_mention(mention=em, document=doc)
            head_token_offset = find_mention_head(mention=cem, predictor=predictor)
            cem.attrs.head = cem.start + head_token_offset
            cem.write_em_head_to_comm()
        # process triggers: store each trigger's head token in the document kv map
        for sm in doc.iterate_situation_mentions():
            if sm.tokens is not None:
                trigger_span = CementSpan.from_token_ref_sequence(token_ref_sequence=sm.tokens,
                                                                  document=doc)
                head_token_offset = find_mention_head(mention=trigger_span,
                                                      predictor=predictor)
                trigger_span.write_span_kv(value=str(trigger_span.start + head_token_offset),
                                           suffix='head',
                                           key=sm.uuid.uuidString,
                                           key_prefix='trigger')
        yield doc

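# A minimal sketch of driving `process_mentions` (not part of the pipeline):
# `find_mention_head` takes an AllenNLP `Predictor`, so one plausible setup is
# a pretrained dependency parser. The archive URL is an assumption for
# illustration; any parser that `find_mention_head` supports would do.
def _example_process_mentions(docs: Iterable[CementDocument]) -> List[CementDocument]:
    from allennlp.predictors.predictor import Predictor as _Predictor
    parser = _Predictor.from_path(
        'https://storage.googleapis.com/allennlp-public-models/'
        'biaffine-dependency-parser-ptb-2020.04.06.tar.gz')
    # heads are written back onto each document's Communication as a side effect
    return list(process_mentions(doc_stream=docs, predictor=parser))
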
def collect_raw_document_data(
        doc: CementDocument) -> Dict[str, Union[Counter, Set, List]]:
    # NOTE: the per-sentence iterator kv maps (the `ner`/`event`/`relation`
    # prefixes written by `to_cement_doc_stream`) are not consulted here;
    # mentions are iterated and counted document-wide, once per document.
    document_counter = Counter()
    document_desc = {
        'doc_key': doc.comm.id,
        'num_sentences': doc.num_sentences()
    }
    ner = set()
    event_types = set()
    event_roles: defaultdict = defaultdict(set)
    relation_types = set()
    document_counter['Tokens'] += len(doc)
    for i in range(doc.num_sentences()):
        document_counter[('Sentence', i)] += len(doc.get_sentence(sent_id=i))
        document_counter['Sentence'] += 1
    # count `EntityMention`s
    ems = list(doc.iterate_entity_mentions())
    document_counter['EntityMention'] += len(ems)
    for em in ems:
        cem = CementEntityMention.from_entity_mention(mention=em, document=doc)
        document_counter[('EntityMention', 'tokens')] += len(cem)
        document_counter[('NER', cem.attrs.entity_type)] += 1
        ner.add(cem.attrs.entity_type)
    # count `SituationMention`s - event mentions
    events = list(doc.iterate_event_mentions())
    document_counter['EventMention'] += len(events)
    for event in events:
        document_counter[('EventMention', 'args')] += len(event.argumentList)
        document_counter[('EventType', event.situationKind)] += 1
        event_types.add(event.situationKind)
        participated_roles = set()
        for arg in event.argumentList:
            participated_roles.add(arg.role)
            document_counter[('EventRole', arg.role)] += 1
        event_roles[event.situationKind].update(participated_roles)
    # count `SituationMention`s - relation mentions
    relations = list(doc.iterate_relation_mentions())
    document_counter['RelationMention'] += len(relations)
    for relation in relations:
        document_counter[('RelationType', relation.situationKind)] += 1
        relation_types.add(relation.situationKind)
    document_desc['counter'] = document_counter
    document_desc['ontology'] = {
        'ner': ner,
        'event_types': event_types,
        'event_roles': event_roles,
        'relation_types': relation_types
    }
    return document_desc

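# A small aggregation sketch over `collect_raw_document_data` (illustrative,
# not part of the pipeline): merge per-document counters into corpus totals
# and union the per-document ontologies.
def _example_corpus_stats(docs: Iterable[CementDocument]) -> Dict[str, Union[Counter, Set]]:
    corpus_counter: Counter = Counter()
    corpus_ner: Set[str] = set()
    corpus_event_types: Set[str] = set()
    for desc in map(collect_raw_document_data, docs):
        corpus_counter.update(desc['counter'])
        corpus_ner.update(desc['ontology']['ner'])
        corpus_event_types.update(desc['ontology']['event_types'])
    return {'counter': corpus_counter,
            'ner': corpus_ner,
            'event_types': corpus_event_types}
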
def extract_instances_from_doc(
        self,
        doc: CementDocument,
        cache_target: Optional[str] = None) -> Iterable[Instance]:

    def _iterate_mention_arguments_as_spans(
            sms: List[SituationMention]
    ) -> Iterable[Union[CementSpan, CementEntityMention]]:
        # resolve each argument to a span, preferring (in order) a referenced
        # `SituationMention`, an explicit token sequence, or an `EntityMention`
        for sm in sms:
            for arg in sm.argumentList:
                if arg.situationMentionId is not None:
                    ref_sm: SituationMention = doc.comm.situationMentionForUUID[
                        arg.situationMentionId.uuidString]
                    span = CementSpan.from_token_ref_sequence(ref_sm.tokens,
                                                              document=doc)
                elif arg.tokens is not None:
                    span = CementSpan.from_token_ref_sequence(arg.tokens,
                                                              document=doc)
                elif arg.entityMentionId is not None:
                    span = CementEntityMention.from_entity_mention(
                        mention=doc.comm.entityMentionForUUID[
                            arg.entityMentionId.uuidString],
                        document=doc)
                else:
                    logger.info(
                        f'MentionArgument={arg} - does not have any span information.'
                    )
                    continue
                span.attrs.add('role', arg.role)
                yield span

    entity_mentions: List[CementEntityMention] = [
        CementEntityMention.from_entity_mention(mention, doc)
        for mention in doc.iterate_entity_mentions()
    ]
    event_mentions: List[SituationMention] = list(
        doc.iterate_event_mentions())
    if self._sentence_mode:
        # group entity mentions by sentence, keyed by their local (start, end)
        # indices within that sentence
        sent_to_ems: Dict[int, Dict[Tuple[int, int], Union[
            CementSpan, CementEntityMention]]] = defaultdict(dict)
        for em in entity_mentions:
            sent_ids, indices = zip(*em.to_local_indices())
            if sent_ids[0] != sent_ids[-1]:
                logger.info(
                    f'Mention span crosses sentence boundary: {sent_ids}')
                continue
            sent_to_ems[sent_ids[0]][(indices[0], indices[-1])] = em
        for event in event_mentions:
            # sentence mode requires every event to have a trigger
            assert event.tokens is not None, 'This event does not have a trigger'
            trigger_span = CementSpan.from_token_ref_sequence(event.tokens,
                                                              document=doc)
            sent_id, (trigger_start, trigger_end) = trigger_span.to_local_indices()
            sequence_array: np.ndarray = self._access_cache_by_key(
                key=[doc.comm.id, str(sent_id)], target=cache_target)
            sequence = [doc.get_sentence(sent_id=sent_id)]
            event_type: str = event.situationKind
            argument_mentions: Dict[Tuple[int, int], Tuple[str, Union[
                CementSpan, CementEntityMention]]] = {}
            for arg in _iterate_mention_arguments_as_spans([event]):
                sid, arg_span = arg.to_local_indices()
                assert sid == sent_id, \
                    'Arguments cross sentences - cannot process with sentence mode.'
                argument_mentions[arg_span] = (arg.attrs.role, arg)
            if not self._gold_mentions_only:
                # add non-argument mentions as negative (`None` role) candidates
                for em_indices, em in sent_to_ems[sent_id].items():
                    if em_indices not in argument_mentions:
                        argument_mentions[em_indices] = ('None', em)
            yield self.text_to_instance(doc=doc,
                                        sequence=sequence,
                                        sequence_array=sequence_array,
                                        event_type=event_type,
                                        trigger=(trigger_start, trigger_end),
                                        mention_spans=argument_mentions)
    else:
        sequence_array: np.ndarray = self._read_context_array_from_cache(
            key=[doc.comm.id], target=cache_target)
        sequence = doc.iterate_sentences()
        for event in event_mentions:
            if event.tokens is not None:
                trigger_span = CementSpan.from_token_ref_sequence(
                    event.tokens, document=doc)
            else:
                # use the whole document as the trigger
                trigger_span = CementSpan(start=0, end=len(doc) - 1, document=doc)
            event_type: str = event.situationKind
            argument_mentions: Dict[Tuple[int, int], Tuple[str, Union[
                CementSpan, CementEntityMention]]] = {
                    arg.to_index_tuple(): (arg.attrs.role, arg)
                    for arg in _iterate_mention_arguments_as_spans([event])
                }
            if not self._gold_mentions_only:
                # add non-argument mentions as negative (`None` role) candidates
                for em in entity_mentions:
                    em_indices = em.to_index_tuple()
                    if em_indices not in argument_mentions:
                        argument_mentions[em_indices] = ('None', em)
            yield self.text_to_instance(
                doc=doc,
                sequence=sequence,
                sequence_array=sequence_array,
                event_type=event_type,
                trigger=trigger_span.to_index_tuple(),
                mention_spans=argument_mentions)

def to_cement_doc_stream(
        json_stream: Iterable[Dict]) -> Iterable[CementDocument]:
    for json_obj in json_stream:
        # create a `CementDocument`
        doc = CementDocument.from_tokens(
            tokens={'passage': json_obj['sentences']},
            doc_id=json_obj['doc_key'])
        # extract entity mentions (EMD or NER)
        doc.write_kv_map(prefix='meta',
                         key='ner-iterator',
                         suffix='sentence',
                         value='True')
        for line_id, ems in enumerate(json_obj['ner']):
            uuids = []
            for em in ems:
                cem = CementEntityMention(start=em[0],
                                          end=em[1],
                                          entity_type=em[2],
                                          document=doc)
                em_id = doc.add_entity_mention(mention=cem)
                uuids.append(em_id.uuidString)
            doc.write_kv_map(prefix='ner',
                             key=str(line_id),
                             suffix='sentence',
                             value=','.join(uuids))
        # extract event mentions
        if 'events' in json_obj:
            doc.write_kv_map(prefix='meta',
                             key='events-iterator',
                             suffix='sentence',
                             value='True')
            for line_id, events in enumerate(json_obj['events']):
                uuids = []
                for event in events:
                    # `event[0]` encodes a single-token trigger: (token_index, event_type)
                    trigger = CementSpan(start=event[0][0],
                                         end=event[0][0],
                                         document=doc)
                    arguments = [
                        CementEntityMention(start=start,
                                            end=end,
                                            role=role,
                                            document=doc)
                        for start, end, role in event[1:]
                    ]
                    sm_id = doc.add_event_mention(trigger=trigger,
                                                  arguments=arguments,
                                                  event_type=event[0][1])
                    uuids.append(sm_id.uuidString)
                doc.write_kv_map(prefix='event',
                                 key=str(line_id),
                                 suffix='sentence',
                                 value=','.join(uuids))
        else:
            logger.info(
                f'doc_key: {json_obj["doc_key"]} - does not have events.')
        # extract relation mentions
        if 'relations' in json_obj:
            doc.write_kv_map(prefix='meta',
                             key='relations-iterator',
                             suffix='sentence',
                             value='True')
            for line_id, relations in enumerate(json_obj['relations']):
                uuids = []
                for relation in relations:
                    sub = CementEntityMention(start=relation[0],
                                              end=relation[1],
                                              document=doc)
                    obj = CementEntityMention(start=relation[2],
                                              end=relation[3],
                                              document=doc)
                    sm_id = doc.add_relation_mention(arguments=[sub, obj],
                                                     relation_type=relation[4])
                    uuids.append(sm_id.uuidString)
                doc.write_kv_map(prefix='relation',
                                 key=str(line_id),
                                 suffix='sentence',
                                 value=','.join(uuids))
        else:
            logger.info(
                f'doc_key: {json_obj["doc_key"]} - does not have relations.')
        yield doc

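# Shape of one `json_obj` consumed by `to_cement_doc_stream`, inferred from the
# indexing above (values illustrative): `ner` rows are [start, end, type];
# `events` rows are [[trigger_token, event_type], [start, end, role], ...];
# `relations` rows are [sub_start, sub_end, obj_start, obj_end, type]; all
# grouped per sentence.
_EXAMPLE_JSON_OBJ = {
    'doc_key': 'doc-000',
    'sentences': [['John', 'visited', 'Paris', '.']],
    'ner': [[[0, 0, 'PER'], [2, 2, 'LOC']]],
    'events': [[[[1, 'Movement.Transport'], [0, 0, 'Agent'], [2, 2, 'Destination']]]],
    'relations': [[[0, 0, 2, 2, 'PHYS']]],
}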