def _enumerate_spans(self, sentences, max_span_width, text, labels_list):
    """
    Enumerate all spans up to ``max_span_width`` within each sentence.

    Parameters
    ----------
    sentences : tokenized sentences (list of token lists).
    max_span_width : maximum span width, in tokens, to enumerate.
    text : document-level ``TextField`` the span fields index into.
    labels_list : (start, end) spans carrying labels; each is mapped to its
        position in the enumerated span list.

    Returns
    -------
    Tuple of (span ``ListField``, label-index ``ListField``,
    per-span sentence-id ``ListField``).
    """
    spans: List[Field] = []
    sentence_offset = 0
    python_spans = []
    sentence_ids = []
    for sentence_id, sentence in enumerate(sentences):
        for start, end in enumerate_spans(sentence,
                                          offset=sentence_offset,
                                          max_span_width=max_span_width):
            spans.append(SpanField(start, end, text))
            python_spans.append((start, end))
            sentence_ids.append(
                LabelField(sentence_id,
                           label_namespace="sentence_id_tags",
                           skip_indexing=True))
        sentence_offset += len(sentence)
    python_span_dict = {span: i for i, span in enumerate(python_spans)}
    # A labeled span may be absent from the enumerated spans (wider than
    # max_span_width, or crossing a sentence boundary); skip such labels
    # instead of raising KeyError, matching the sibling _enumerate_spans
    # that takes `possible_args`.
    labels_idx = [
        python_span_dict[span] for span in labels_list
        if span in python_span_dict
    ]
    spans_list_field = ListField(spans)
    labels_index_field = ListField(
        [IndexField(idx, spans_list_field) for idx in labels_idx])
    sentence_id_field = ListField(sentence_ids)
    return spans_list_field, labels_index_field, sentence_id_field
def _get_unique_spans(self, spans, text, sentence_ids):
    """
    Deduplicate ``spans`` and re-index each original span against the
    unique set.

    Returns (unique-span ``ListField``, per-original-span index
    ``ListField`` into the unique list, per-unique-span sentence-id
    ``ListField``).
    """
    # np.unique over rows dedups/sorts the spans; the inverse array maps
    # every original span to its slot in the unique list — effectively
    # the label for that span.
    unique_rows, inverse = np.unique(np.array(spans),
                                     return_inverse=True,
                                     axis=0)
    span_fields = []
    sent_id_fields = []
    for row in unique_rows:
        span_fields.append(SpanField(int(row[0]), int(row[1]), text))
        sent_id_fields.append(
            LabelField(sentence_ids[(row[0], row[1])],
                       label_namespace="sentence_id_tags",
                       skip_indexing=True))
    spans_list_field = ListField(span_fields)
    labels_index_field = ListField(
        [IndexField(int(pos), spans_list_field) for pos in inverse])
    sentence_id_field = ListField(sent_id_fields)
    return spans_list_field, labels_index_field, sentence_id_field
def _enumerate_spans(self,
                     sentences,
                     max_span_width,
                     text,
                     labels_list,
                     possible_args=None):
    """
    Enumerate spans up to ``max_span_width`` within each sentence,
    optionally restricted to a given candidate set.

    Parameters
    ----------
    sentences : tokenized sentences (list of token lists).
    max_span_width : maximum span width, in tokens, to enumerate.
    text : document-level ``TextField`` the span fields index into.
    labels_list : (start, end) spans carrying labels; each is mapped to its
        position in the enumerated span list.
    possible_args : optional collection of (start, end) spans; when given,
        only these spans are kept among the enumerated candidates.

    Returns
    -------
    Tuple of (span ``ListField``, label-index ``ListField``,
    per-span sentence-id ``ListField``).
    """
    spans: List[Field] = []
    sentence_offset = 0
    python_spans = []
    sentence_ids = []
    # Hoist membership into a set once so the per-span check is O(1)
    # even when possible_args is a list.
    allowed = set(possible_args) if possible_args is not None else None
    for sentence_id, sentence in enumerate(sentences):
        for start, end in enumerate_spans(sentence,
                                          offset=sentence_offset,
                                          max_span_width=max_span_width):
            if allowed is None or (start, end) in allowed:
                spans.append(SpanField(start, end, text))
                python_spans.append((start, end))
                sentence_ids.append(
                    LabelField(sentence_id,
                               label_namespace="sentence_id_tags",
                               skip_indexing=True))
        sentence_offset += len(sentence)
    python_span_dict = {span: i for i, span in enumerate(python_spans)}
    # given spans might cross sentence boundaries (e.g., due to parser
    # errors), so remove them to avoid issues in the model
    labels_idx = [
        python_span_dict[span] for span in labels_list
        if span in python_span_dict
    ]
    spans_list_field = ListField(spans)
    labels_index_field = ListField(
        [IndexField(idx, spans_list_field) for idx in labels_idx])
    sentence_id_field = ListField(sentence_ids)
    return spans_list_field, labels_index_field, sentence_id_field
def text_to_instance(
        self,  # type: ignore
        sentences: List[List[str]],
        sentence_start_offsets: List[int],
        doc_link_info: Optional[List[Dict[Tuple[int, int], List[Tuple[
            str, Tuple[int, int],
            str]]]]] = None,  # [{(event_type_start, event_type_end): [(slot, (value_start, value_end))]}]
        genre: Optional[str] = None,
        document_id: Optional[str] = None) -> Instance:
    # pylint: disable=arguments-differ
    """
    Build a document-level ``Instance`` for trigger/argument linking.

    Sentences are padded to the longest sentence with "UNK" tokens so the
    text field is a flat, rectangular token sequence; ``text_lens_mask``
    records which positions are real tokens.  When ``doc_link_info`` is
    given, its (role, argument_span, gold_string) entries per trigger span
    become the supervision targets.
    """
    max_sent_len = max(len(s) for s in sentences)
    # Pad every sentence to max_sent_len with "UNK" placeholder tokens.
    padded_sentences = [[
        self._normalize_word(sentence[i]) if i < len(sentence) else "UNK"
        for i in range(max_sent_len)
    ] for sentence in sentences]
    flattened_sentences = [
        word for sentence in padded_sentences for word in sentence
    ]
    text_lens = [len(s) for s in sentences]
    text_lens_idx = [list(range(max_sent_len)) for _ in sentences]
    # 1 for real token positions, 0 for padding.
    text_lens_mask = [[int(idx < sent_len) for idx in idxs]
                      for sent_len, idxs in zip(text_lens, text_lens_idx)]
    # Cumulative token offset of each sentence within the flat document.
    sentence_offsets = [0]
    for tl in text_lens:
        sentence_offsets.append(sentence_offsets[-1] + tl)
    sentence_offsets = sentence_offsets[:-1]
    sentence_offsets.append(
        float('inf'))  # sentinel/padding for finding sentence id from span
    text_field = TextField([Token(word) for word in flattened_sentences],
                           self._token_indexers)
    if genre:
        genre_field = LabelField(self._genres[genre], skip_indexing=True)
    else:
        genre_field = LabelField(0, skip_indexing=True)
    # pack up data as: List[str], List[List[SpanField]]
    #                      |               |
    #                    labels   (trigger,arg) span pairs
    trigger_arg_pairs = [] if doc_link_info is not None else None
    triggers = [] if doc_link_info is not None else None
    args = [] if doc_link_info is not None else None
    roles = [] if doc_link_info is not None else None
    gold_strings = [] if doc_link_info is not None else None
    unique_roles = set() if doc_link_info is not None else None
    trigger_sent_ids = {} if doc_link_info is not None else None
    arg_sent_ids = {} if doc_link_info is not None else None
    skipped_args = 0
    if doc_link_info is not None:
        trigger_arg_pairs = []
        # NOTE(review): these two sets are written but never read below.
        seen_trigger_spans = set()
        seen_arg_spans = set()
        for frame_data in doc_link_info:
            for trigger_span, argument_data in frame_data.items():
                trigger_sentence_id = self._get_sentence_id(
                    trigger_span, sentence_offsets)
                # event_type is allowed to cross sentence boundaries, since
                # the event_type is the entire document
                trigger_sent_ids[trigger_span] = trigger_sentence_id
                for (role, argument_span, gold_string) in argument_data:
                    trigger_arg_pairs.append(
                        [trigger_span, argument_span, role, gold_string])
                    arg_sentence_id = self._get_sentence_id(
                        argument_span, sentence_offsets)
                    if arg_sentence_id == CROSSES_SENTENCE_BOUNDARY:
                        # Unlike triggers, arguments must stay within one
                        # sentence; count and drop the violators.
                        skipped_args += 1
                        continue
                    # If not using gold, need to make sure the labels
                    # we care about exist in the enumerated spans
                    span_size_condition = True
                    if (not self._use_gold_triggers):
                        span_size_condition &= (
                            trigger_span[1] - trigger_span[0] <
                            self._max_trigger_span_width)
                    if (not self._use_gold_arguments):
                        span_size_condition &= (
                            argument_span[1] - argument_span[0] <
                            self._max_arg_span_width)
                    if span_size_condition:
                        roles.append(LabelField(role))
                        unique_roles.add(role)
                        triggers.append(trigger_span)
                        args.append(argument_span)
                        arg_sent_ids[argument_span] = arg_sentence_id
    metadata: Dict[str, Any] = dict()
    metadata["annotation_kind"] = ""  # to play nicely with RAMS
    metadata["sentences"] = sentences
    metadata["sentence_start_offsets"] = sentence_start_offsets
    metadata["text_lens"] = np.array(text_lens_mask)
    metadata["doc_id"] = document_id
    metadata["has_gold_targets"] = doc_link_info is not None
    metadata["data_path"] = self._file_path
    # Create all triggers and all args
    # Since the event_type is the entire document (and therefore crosses
    # boundaries), we replace CROSSES_SENTENCE_BOUNDARY with sentence id of 0
    # to ensure we don't get out-of-bounds issues
    trigger_sent_ids = {
        k: (lambda v: v if v != CROSSES_SENTENCE_BOUNDARY else 0)(v)
        for k, v in trigger_sent_ids.items()
    }
    trigger_spans_field, trigger_idx_field, trigger_sentence_id_field = self._get_unique_spans(
        triggers, text_field, trigger_sent_ids)
    if self._use_gold_arguments:
        arg_spans_field, arg_idx_field, arg_sentence_id_field = self._get_unique_spans(
            args, text_field, arg_sent_ids)
    else:
        arg_spans_field, arg_idx_field, arg_sentence_id_field = self._enumerate_spans(
            sentences, self._max_arg_span_width, text_field, args)
    # NOTE(review): this unconditional unpack assumes doc_link_info was
    # provided and non-empty — with doc_link_info=None, trigger_arg_pairs
    # (and trigger_sent_ids above) are None and this would raise.  Confirm
    # callers always pass link info to this reader.
    (metadata['triggers'], metadata['arguments'], metadata['roles'],
     metadata['gold_strings']) = zip(*trigger_arg_pairs)
    metadata_field = MetadataField(metadata)
    fields: Dict[str, Field] = {
        "text": text_field,
        "genre": genre_field,
        "metadata": metadata_field,
        "all_triggers": trigger_spans_field,
        "all_args": arg_spans_field,
        "all_trigger_sentence_ids": trigger_sentence_id_field,
        "all_arg_sentence_ids": arg_sentence_id_field,
        "target_roles": ListField(roles),
        "target_trigger_idx": trigger_idx_field,
        "target_arg_idx": arg_idx_field,
    }
    return Instance(fields)
def text_to_instance(
        self,  # type: ignore
        sentences: List[List[str]],
        sentence_start_offsets: List[int],
        genre: str,
        document_id: str,
        possible_triggers,  # all given spans, regardless of whether they participate in a link
        possible_args,  # all given spans, regardless of whether they participate in a link
        doc_trigger_arg_info: Optional[Dict[Tuple[int, int],
                                            List[Tuple[str,
                                                       Tuple[int,
                                                             int]]]]] = None
) -> Instance:
    # pylint: disable=arguments-differ
    """
    Build a document-level ``Instance`` from pre-extracted candidate spans.

    ``possible_triggers``/``possible_args`` supply the candidate span pools;
    when ``doc_trigger_arg_info`` holds (argument_span, role) entries per
    trigger span, those links become the supervision targets, otherwise
    only the size-filtered candidates are emitted without role labels.
    """
    max_sent_len = max(len(s) for s in sentences)
    # Pad every sentence to max_sent_len with "UNK" placeholder tokens.
    padded_sentences = [[
        sentence[i] if i < len(sentence) else "UNK"
        for i in range(max_sent_len)
    ] for sentence in sentences]
    flattened_sentences = [
        word for sentence in padded_sentences for word in sentence
    ]
    text_lens = [len(s) for s in sentences]
    text_lens_idx = [list(range(max_sent_len)) for _ in sentences]
    # 1 for real token positions, 0 for padding.
    text_lens_mask = [[int(idx < sent_len) for idx in idxs]
                      for sent_len, idxs in zip(text_lens, text_lens_idx)]
    # Cumulative token offset of each sentence within the flat document;
    # must agree with the offsets the caller computed.
    sentence_offsets = [0]
    for tl in text_lens:
        sentence_offsets.append(sentence_offsets[-1] + tl)
    sentence_offsets = sentence_offsets[:-1]
    assert sentence_offsets == sentence_start_offsets
    sentence_offsets.append(
        float('inf'))  # sentinel/padding for finding sentence id from span
    text_field = TextField([Token(word) for word in flattened_sentences],
                           self._token_indexers)
    if genre:
        genre_field = LabelField(self._genres[genre],
                                 label_namespace="genre_labels",
                                 skip_indexing=True)
    else:
        genre_field = LabelField(0,
                                 label_namespace="genre_labels",
                                 skip_indexing=True)
    # pack up data as: List[str], List[List[SpanField]]
    #                      |               |
    #                    labels    (trig,arg) span pairs
    has_links = bool(doc_trigger_arg_info)
    trigger_arg_pairs = [] if has_links else None
    triggers = []
    args = []
    roles = [] if has_links else None
    trigger_sent_ids = {}
    arg_sent_ids = {}
    # Sentence ids are computed for every candidate span up front.
    for p in possible_triggers:
        trigger_sent_ids[p] = self._get_sentence_id(p, sentence_offsets)
    for a in possible_args:
        arg_sent_ids[a] = self._get_sentence_id(a, sentence_offsets)
    if has_links:
        trigger_arg_pairs = []
        # NOTE(review): these two sets are written but never read below.
        seen_trigger_spans = set()
        seen_arg_spans = set()
        for trigger_span, argument_data in doc_trigger_arg_info.items():
            for (argument_span, role) in argument_data:
                trigger_arg_pairs.append(
                    [trigger_span, argument_span, role])
                # If not using gold, need to make sure the labels
                # we care about exist in the enumerated spans
                span_size_condition = True
                if (not self._use_gold_triggers):
                    span_size_condition &= (
                        trigger_span[1] - trigger_span[0] <
                        self._max_trigger_span_width)
                if (not self._use_gold_arguments):
                    span_size_condition &= (
                        argument_span[1] - argument_span[0] <
                        self._max_arg_span_width)
                if span_size_condition:
                    roles.append(LabelField(role,
                                            label_namespace="labels"))
                    triggers.append(trigger_span)
                    args.append(argument_span)
    else:
        # Just get triggers and arguments
        for trigger_span in possible_triggers:
            span_size_condition = True
            if (not self._use_gold_triggers):
                span_size_condition &= (trigger_span[1] - trigger_span[0] <
                                        self._max_trigger_span_width)
            if span_size_condition:
                triggers.append(trigger_span)
        for argument_span in possible_args:
            span_size_condition = True
            if (not self._use_gold_arguments):
                span_size_condition &= (argument_span[1] - argument_span[0] <
                                        self._max_arg_span_width)
            if span_size_condition:
                args.append(argument_span)
    metadata: Dict[str, Any] = dict()
    metadata["sentences"] = sentences
    metadata["sentence_start_offsets"] = sentence_start_offsets
    metadata["text_lens"] = np.array(text_lens_mask)
    metadata["doc_id"] = document_id
    metadata["has_gold_targets"] = has_links
    metadata["data_path"] = self._file_path
    metadata["language"] = self._language
    metadata["annotation_kind"] = self._annotation_mode
    # Create all triggers and all args
    if self._use_gold_triggers:
        trigger_spans_field, trigger_idx_field, trigger_sentence_id_field = self._get_unique_spans(
            triggers, text_field, trigger_sent_ids)
    else:
        trigger_spans_field, trigger_idx_field, trigger_sentence_id_field = self._enumerate_spans(
            sentences, self._max_trigger_span_width, text_field, triggers)
    if self._use_gold_arguments:
        arg_spans_field, arg_idx_field, arg_sentence_id_field = self._get_unique_spans(
            args, text_field, arg_sent_ids)
    else:
        #arg_spans_field, arg_idx_field, arg_sentence_id_field = self._enumerate_spans(sentences, self._max_arg_span_width, text_field, args, possible_args=possible_args)  #### EXPERIMENT: USE SYNTACTIC SPANS
        arg_spans_field, arg_idx_field, arg_sentence_id_field = self._enumerate_spans(
            sentences, self._max_arg_span_width, text_field,
            args)  #### EXPERIMENT: USE ALL SPANS
    roles_field = ListField(roles) if roles else None
    metadata['triggers'] = triggers
    metadata['arguments'] = args
    if trigger_arg_pairs:
        metadata['roles'] = list(zip(*trigger_arg_pairs))[2]
    # Recover the surface text of every argument for downstream scoring.
    f_sentences = [word for sentence in sentences for word in sentence]
    gold_strings = [
        " ".join(f_sentences[arg[0]:arg[1] + 1])
        for arg in metadata['arguments']
    ]
    metadata['gold_strings'] = gold_strings
    metadata_field = MetadataField(metadata)
    fields: Dict[str, Field] = {
        "text": text_field,
        "genre": genre_field,
        "metadata": metadata_field,
        "all_triggers": trigger_spans_field,
        "all_args": arg_spans_field,
        "all_trigger_sentence_ids": trigger_sentence_id_field,
        "all_arg_sentence_ids": arg_sentence_id_field,
        "target_trigger_idx": trigger_idx_field,
        "target_arg_idx": arg_idx_field,
    }
    # target_roles only exists when gold links were supplied.
    if roles_field:
        fields["target_roles"] = roles_field
    return Instance(fields)
def text_to_instance(
        self,  # type: ignore
        sentences: List[List[str]],
        sentence_start_offsets: List[int],
        pred_arg_info: Optional[Dict[Tuple[int, int],
                                     List[Tuple[str, Tuple[int,
                                                           int]]]]] = None,
        genre: Optional[str] = None,
        document_id: Optional[str] = None) -> Instance:
    # pylint: disable=arguments-differ
    """
    Build a document-level ``Instance`` for SRL-style predicate/argument
    linking.  When ``pred_arg_info`` holds (role, argument_span) entries
    per predicate span, those links become the supervision targets.
    """
    max_sent_len = max(len(s) for s in sentences)
    # Pad every sentence to max_sent_len with "UNK" placeholder tokens.
    padded_sentences = [[
        self._normalize_word(sentence[i]) if i < len(sentence) else "UNK"
        for i in range(max_sent_len)
    ] for sentence in sentences]
    flattened_sentences = [
        word for sentence in padded_sentences for word in sentence
    ]
    text_lens = [len(s) for s in sentences]
    text_lens_idx = [list(range(max_sent_len)) for _ in sentences]
    # 1 for real token positions, 0 for padding.
    text_lens_mask = [[int(idx < sent_len) for idx in idxs]
                      for sent_len, idxs in zip(text_lens, text_lens_idx)]
    # Cumulative token offset of each sentence within the flat document.
    sentence_offsets = [0]
    for tl in text_lens:
        sentence_offsets.append(sentence_offsets[-1] + tl)
    sentence_offsets = sentence_offsets[:-1]
    sentence_offsets.append(
        float('inf'))  # sentinel/padding for finding sentence id from span
    text_field = TextField([Token(word) for word in flattened_sentences],
                           self._token_indexers)
    if genre:
        genre_field = LabelField(self._genres[genre], skip_indexing=True)
    else:
        genre_field = LabelField(0, skip_indexing=True)
    # pack up data as: List[str], List[List[SpanField]]
    #                      |               |
    #                    labels    (pred,arg) span pairs
    pred_arg_pairs = [] if pred_arg_info is not None else None
    preds = [] if pred_arg_info is not None else None
    args = [] if pred_arg_info is not None else None
    roles = [] if pred_arg_info is not None else None
    pred_sent_ids = {} if pred_arg_info is not None else None
    arg_sent_ids = {} if pred_arg_info is not None else None
    if pred_arg_info is not None:
        pred_arg_pairs = []
        # NOTE(review): these two sets are written but never read below.
        seen_pred_spans = set()
        seen_arg_spans = set()
        for pred_span, argument_data in pred_arg_info.items():
            pred_sentence_id = self._get_sentence_id(
                pred_span, sentence_offsets)
            pred_sent_ids[pred_span] = pred_sentence_id
            for (role, argument_span) in argument_data:
                pred_arg_pairs.append([pred_span, argument_span, role])
                arg_sentence_id = self._get_sentence_id(
                    argument_span, sentence_offsets)
                # If not using gold, need to make sure the labels
                # we care about exist in the enumerated spans
                span_size_condition = True
                if (not self._use_gold_triggers):
                    span_size_condition &= (pred_span[1] - pred_span[0] <
                                            self._max_trigger_span_width)
                if (not self._use_gold_arguments):
                    span_size_condition &= (
                        argument_span[1] - argument_span[0] <
                        self._max_arg_span_width)
                if span_size_condition:
                    roles.append(LabelField(role))
                    preds.append(pred_span)
                    args.append(argument_span)
                    arg_sent_ids[argument_span] = arg_sentence_id
    metadata: Dict[str, Any] = dict()
    metadata["sentences"] = sentences
    metadata["sentence_start_offsets"] = sentence_start_offsets
    metadata["text_lens"] = np.array(text_lens_mask)
    metadata["doc_id"] = document_id
    metadata["has_gold_targets"] = pred_arg_info is not None
    metadata["data_path"] = self._file_path
    metadata["annotation_kind"] = "SRL"
    # Create all preds and all args
    if self._use_gold_triggers:
        pred_spans_field, pred_idx_field, pred_sentence_id_field = self._get_unique_spans(
            preds, text_field, pred_sent_ids)
    else:
        pred_spans_field, pred_idx_field, pred_sentence_id_field = self._enumerate_spans(
            sentences, self._max_trigger_span_width, text_field, preds)
    if self._use_gold_arguments:
        arg_spans_field, arg_idx_field, arg_sentence_id_field = self._get_unique_spans(
            args, text_field, arg_sent_ids)
    else:
        arg_spans_field, arg_idx_field, arg_sentence_id_field = self._enumerate_spans(
            sentences, self._max_arg_span_width, text_field, args)
    # NOTE(review): this unconditional unpack assumes pred_arg_info was
    # provided and non-empty — with pred_arg_info=None, pred_arg_pairs is
    # None and zip(*...) would raise.  Confirm callers always pass it.
    (metadata['triggers'], metadata['arguments'],
     metadata['roles']) = zip(*pred_arg_pairs)
    metadata_field = MetadataField(metadata)
    # Field names reuse the "trigger" vocabulary so the same model consumes
    # SRL and event data.
    fields: Dict[str, Field] = {
        "text": text_field,
        "genre": genre_field,
        "metadata": metadata_field,
        "all_triggers": pred_spans_field,
        "all_args": arg_spans_field,
        "all_trigger_sentence_ids": pred_sentence_id_field,
        "all_arg_sentence_ids": arg_sentence_id_field,
        "target_roles": ListField(roles),
        "target_trigger_idx": pred_idx_field,
        "target_arg_idx": arg_idx_field,
    }
    return Instance(fields)