def text_to_instance(self,
                     source,
                     target=None,
                     shortlist=None,
                     reverse_shortlist=None,
                     raw_entity_ids=None,
                     entity_ids=None,
                     relations=None,
                     parent_ids=None,
                     shortlist_inds=None,
                     mention_type=None,
                     alias_copy_inds=None) -> Instance:  # pylint: disable=arguments-differ
    metadata = {
        'source_tokens': source,
        'alias_database': self._alias_database
    }
    fields = {
        'metadata': MetadataField(metadata),
        'source': TextField(_tokenize(source), self._token_indexers),
    }
    if target is not None:
        fields['target'] = TextField(_tokenize(target), self._token_indexers)
        metadata['target_tokens'] = target
    if shortlist is not None:
        fields['shortlist'] = TextField(_tokenize(shortlist), self._entity_indexers)
    if raw_entity_ids is not None:
        fields['raw_entity_ids'] = TextField(_tokenize(raw_entity_ids), self._raw_entity_indexers)
    if entity_ids is not None:
        fields['entity_ids'] = TextField(_tokenize(entity_ids), self._entity_indexers)
    if parent_ids is not None:
        fields['parent_ids'] = ListField([
            TextField(_tokenize(sublist), token_indexers=self._entity_indexers)
            for sublist in parent_ids
        ])
    if relations is not None:
        fields['relations'] = ListField([
            TextField(_tokenize(sublist), token_indexers=self._relation_indexers)
            for sublist in relations
        ])
    if mention_type is not None:
        fields['mention_type'] = SequentialArrayField(mention_type, dtype=np.int64)
    if shortlist_inds is not None:
        fields['shortlist_inds'] = SequentialArrayField(shortlist_inds, dtype=np.int64)
    if alias_copy_inds is not None:
        fields['alias_copy_inds'] = SequentialArrayField(alias_copy_inds, dtype=np.int64)
    return Instance(fields)
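# Illustrative sketch (not part of the original reader): the optional-field pattern above,
# shown with plain AllenNLP fields. The token strings and the 'tokens' indexer name are
# made up for the example; only fields whose inputs exist are added to the Instance.
import numpy as np
from allennlp.data import Instance, Token
from allennlp.data.fields import MetadataField, TextField
from allennlp.data.token_indexers import SingleIdTokenIndexer

token_indexers = {'tokens': SingleIdTokenIndexer()}
fields = {
    'metadata': MetadataField({'source_tokens': ['Barack', 'Obama']}),
    'source': TextField([Token('Barack'), Token('Obama')], token_indexers),
}
target = ['Obama', '@@END@@']  # would be None when no target annotation is available
if target is not None:
    fields['target'] = TextField([Token(x) for x in target], token_indexers)
instance = Instance(fields)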
def _split(self, instance: Instance) -> Tuple[List[Instance], int]:
    # Determine the size of the sequence inside the instance.
    true_length = len(instance['source'])
    padded_length = self._split_size * (true_length // self._split_size)

    # Determine the split indices.
    split_indices = list(range(0, true_length, self._split_size))
    if true_length > split_indices[-1]:
        split_indices.append(true_length)

    # Determine which fields are not going to be split.
    constant_fields = [x for x in instance.fields if x not in self._splitting_keys]

    # Create the list of chunks.
    chunks: List[Instance] = []
    for i, (start, end) in enumerate(zip(split_indices[:-1], split_indices[1:])):
        # Copy all of the constant fields from the instance to the chunk.
        chunk_fields = {key: instance[key] for key in constant_fields}

        # Determine whether or not to signal the model to reset; only the first chunk does.
        if i == 0:
            reset = SequentialArrayField(np.array(1), dtype=np.uint8)
        else:
            reset = SequentialArrayField(np.array(0), dtype=np.uint8)
        chunk_fields['reset'] = reset

        # Obtain splits derived from sequence fields.
        for key in self._splitting_keys:
            source_field = instance[key]
            # pylint: disable=protected-access
            if isinstance(source_field, TextField):
                split_field = TextField(source_field.tokens[start:end],
                                        source_field._token_indexers)
            elif isinstance(source_field, SequentialArrayField):
                # TODO: Figure out how to use sequence dim here...
                split_field = SequentialArrayField(source_field.array[start:end],
                                                   dtype=source_field._dtype)
            elif isinstance(source_field, ListField):
                split_field = ListField(source_field.field_list[start:end])
            else:
                raise NotImplementedError(
                    'FancyIterator currently only supports splitting '
                    '`TextField`s, `SequentialArrayField`s and `ListField`s.')
            chunk_fields[key] = split_field
        chunks.append(Instance(chunk_fields))

    return chunks, padded_length
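# Illustrative sketch (not part of the iterator): the split-index arithmetic used by
# ``_split`` above, for a hypothetical 23-token sequence and a split size of 10.
split_size = 10
true_length = 23
split_indices = list(range(0, true_length, split_size))   # [0, 10, 20]
if true_length > split_indices[-1]:
    split_indices.append(true_length)                      # [0, 10, 20, 23]
# Consecutive pairs give the chunk boundaries.
chunk_bounds = list(zip(split_indices[:-1], split_indices[1:]))
print(chunk_bounds)  # [(0, 10), (10, 20), (20, 23)]
# Only the first chunk carries reset=1, telling the model to reset its hidden state
# before it starts reading a new document.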
def text_to_instance(self, data: Dict[str, Any]) -> Instance:  # pylint: disable=arguments-differ
    # Flatten tokens.
    tokens = _flatten(data['tokens'])
    tokens = [Token(x) for x in tokens]
    fields = {'tokens': TextField(tokens, self._token_indexers)}

    # If annotations are provided, process them into arrays.
    if 'annotations' in data:
        # Initialize arrays and book-keeping data structures.
        seen_entities: Set[str] = set()
        entity_types = np.zeros(shape=(len(tokens),))
        entity_ids = np.zeros(shape=(len(tokens),))
        mention_lengths = np.ones(shape=(len(tokens),))

        # Process annotations.
        for annotation in data['annotations']:
            seen_entities.add(annotation['id'])
            start, end = annotation['span']
            length = end - start
            for i in range(start, end):
                # Entity ids start at 1; 0 is reserved for tokens outside mentions.
                entity_types[i] = 1
                entity_ids[i] = len(seen_entities)
                # Mention length counts down to 1 over the span.
                mention_lengths[i] = length
                length -= 1

        fields['entity_types'] = SequentialArrayField(entity_types, dtype=np.uint8)
        fields['entity_ids'] = SequentialArrayField(entity_ids, dtype=np.int64)
        fields['mention_lengths'] = SequentialArrayField(mention_lengths, dtype=np.int64)

    return Instance(fields)
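# Illustrative sketch (not part of the reader): how a single annotation span fills the
# entity_types / entity_ids / mention_lengths arrays above. The span and entity number
# are made up for the example.
import numpy as np

num_tokens = 8
entity_types = np.zeros(shape=(num_tokens,))
entity_ids = np.zeros(shape=(num_tokens,))
mention_lengths = np.ones(shape=(num_tokens,))

span = (2, 5)          # hypothetical mention covering tokens 2, 3, 4 (end exclusive)
entity_number = 1      # first entity seen so far
length = span[1] - span[0]
for i in range(*span):
    entity_types[i] = 1
    entity_ids[i] = entity_number
    mention_lengths[i] = length
    length -= 1

print(entity_types)     # [0. 0. 1. 1. 1. 0. 0. 0.]
print(entity_ids)       # [0. 0. 1. 1. 1. 0. 0. 0.]
print(mention_lengths)  # [1. 1. 3. 2. 1. 1. 1. 1.]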
def text_to_instance(self, data: Dict[str, Any]) -> Instance:  # pylint: disable=arguments-differ
    # Flatten tokens and add sentence-boundary markers.
    tokens = _flatten(data['tokens'])
    tokens = ['@@START@@', *tokens, '@@END@@']
    source = [Token(x) for x in tokens[:-1]]
    target = [Token(x) for x in tokens[1:]]
    fields = {
        'source': TextField(source, self._token_indexers),
        'target': TextField(target, self._token_indexers)
    }

    # Process annotations.
    if 'annotations' in data:
        # We maintain a "shortlist" of observed entities, that is used for baseline models
        # that only select entities from the set that appear in the document (as opposed to
        # the set of all possible entities).
        shortlist = [DEFAULT_PADDING_TOKEN]
        reverse_shortlist = {DEFAULT_PADDING_TOKEN: 0}
        entity_ids = [DEFAULT_PADDING_TOKEN] * len(target)
        shortlist_inds = np.zeros(shape=(len(target),))
        alias_copy_inds = np.zeros(shape=(len(target),))
        alias_tokens = [TextField([], self._token_indexers)] * len(target)
        alias_inds: List[List[int]] = [[]] * len(target)
        max_len = 0

        for annotation in data['annotations']:
            # Obtain the entity identifier and alias for the annotated span.
            entity_id = annotation['id']
            alias = annotation['alias']
            alias_map = {token: i + 1 for i, token in enumerate(set(alias))}

            # If necessary, update the shortlist. Obtain the index of the entity identifier
            # in the shortlist.
            if entity_id not in reverse_shortlist:
                reverse_shortlist[entity_id] = len(reverse_shortlist)
                shortlist.append(entity_id)
            shortlist_ind = reverse_shortlist[entity_id]

            # Update the outputs.
            for i in range(*annotation['span']):
                # Note: +1 offset to account for start token.
                if tokens[i + 1] not in alias_map:
                    continue
                entity_ids[i] = entity_id
                shortlist_inds[i] = shortlist_ind
                alias_copy_inds[i] = alias_map[tokens[i + 1]]
                alias_inds[i] = [alias_map[token] for token in alias]
                alias_tokens[i] = TextField([Token(x) for x in alias], self._token_indexers)
                max_len = max(max_len, len(alias))

        # Convert alias_inds into a padded numpy array.
        alias_ind_array = np.zeros((len(target), max_len))
        for i, arr in enumerate(alias_inds):
            for j, ind in enumerate(arr):
                alias_ind_array[i, j] = ind

        fields['entity_ids'] = TextField([Token(x) for x in entity_ids],
                                         token_indexers=self._entity_indexers)
        fields['alias_copy_inds'] = SequentialArrayField(alias_copy_inds, dtype=np.int64)
        fields['shortlist'] = TextField([Token(x) for x in shortlist],
                                        token_indexers=self._entity_indexers)
        fields['shortlist_inds'] = SequentialArrayField(shortlist_inds, dtype=np.int64)
        fields['alias_tokens'] = ListField(alias_tokens)
        fields['alias_inds'] = SequentialArrayField(alias_ind_array, dtype=np.int64)

    return Instance(fields)
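# Illustrative sketch (not part of the reader): how the alias map above produces copy indices.
# The alias and tokens are made up; index 0 is implicitly reserved for "no copy", and because
# the map is built from a set, the exact index assigned to each alias word may vary.
alias = ['New', 'York', 'City']
alias_map = {token: i + 1 for i, token in enumerate(set(alias))}
# e.g. {'New': 1, 'York': 2, 'City': 3}

tokens = ['@@START@@', 'He', 'visited', 'New', 'York', '.']
span = (2, 4)  # hypothetical span over the flattened tokens (before '@@START@@' is added)
for i in range(*span):
    if tokens[i + 1] in alias_map:           # +1 offset for the @@START@@ token
        copy_ind = alias_map[tokens[i + 1]]  # index of the target word within the alias
        print(i, tokens[i + 1], copy_ind)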
def _json_to_instance(self, json_dict: JsonDict) -> Instance:
    # pylint: disable=protected-access
    # Extract tokens and EOS offsets.
    tokens = [x + ['@@END@@'] for x in json_dict['tokens'][1:-1]]
    eos_offset = [[i] * len(x) for i, x in enumerate(tokens)]
    tokens = ['@@START@@'] + _flatten(tokens)
    eos_offset = [0] + _flatten(eos_offset)
    source = tokens[:-1]
    if self._dataset_reader._mode == 'generative':
        target = tokens[1:]
    else:
        target = None

    # Process annotations.
    if 'annotations' not in json_dict:
        shortlist = None
        reverse_shortlist = None
        raw_entity_ids = None
        entity_ids = None
        relations = None
        parent_ids = None
        shortlist_inds = None
        mention_type = None
        alias_copy_inds = None
    else:
        # We maintain a "shortlist" of observed entities, that is used for baseline models
        # that only select entities from the set that appear in the document (as opposed to
        # the set of all possible entities).
        shortlist = [DEFAULT_PADDING_TOKEN]
        reverse_shortlist = {DEFAULT_PADDING_TOKEN: 0}
        raw_entity_ids = [DEFAULT_PADDING_TOKEN] * len(source)
        entity_ids = [DEFAULT_PADDING_TOKEN] * len(source)
        relations = [[DEFAULT_PADDING_TOKEN]] * len(source)
        parent_ids = [[DEFAULT_PADDING_TOKEN]] * len(source)
        shortlist_inds = np.zeros(shape=(len(source),))
        mention_type = np.zeros(shape=(len(source),))
        if self._dataset_reader._mode == "generative":
            alias_copy_inds = np.zeros(shape=(len(source),))
        else:
            alias_copy_inds = None

        for annotation in json_dict['annotations']:
            # Obtain the entity identifier for the annotated span.
            raw_entity_id = annotation['id']
            raw_parent_id = annotation['parent_id']
            entity_id = normalize_entity_id(raw_entity_id)
            if entity_id is None:
                continue
            parent_id = [normalize_entity_id(x) for x in raw_parent_id]
            assert len(parent_id) == len(raw_parent_id)
            relation = annotation['relation']
            new_entity = relation == ['@@NEW@@']

            # If necessary, update the shortlist. Obtain the index of the entity identifier
            # in the shortlist.
            if entity_id not in reverse_shortlist:
                reverse_shortlist[entity_id] = len(reverse_shortlist)
                shortlist.append(entity_id)
            shortlist_ind = reverse_shortlist[entity_id]

            # Update the outputs.
            # Offset is -1 in the generative case, since each timestep predicts attributes of
            # the next token. In the discriminative case, each timestep predicts attributes of
            # the current token.
            mode_offset = -1 if self._dataset_reader._mode == "generative" else 0
            span = annotation['span']
            eos_offset_adjusted_span = tuple(i + eos_offset[i] for i in span)
            for i in range(*eos_offset_adjusted_span):
                raw_entity_ids[i + mode_offset] = raw_entity_id
                entity_ids[i + mode_offset] = entity_id
                mention_type[i + mode_offset] = 3
                if new_entity:
                    shortlist_inds[i + mode_offset] = shortlist_ind
                else:
                    relations[i + mode_offset] = relation[:MAX_PARENTS]
                    parent_ids[i + mode_offset] = parent_id[:MAX_PARENTS]
                if self._dataset_reader._mode == "generative":
                    alias_copy_inds[i + mode_offset] = self._dataset_reader._alias_database.token_to_uid(
                        raw_entity_id, tokens[i])

            # Now put in the proper mention type for the first token.
            start = annotation['span'][0]
            if new_entity:
                mention_type[start + mode_offset] = 1
            else:
                mention_type[start + mode_offset] = 2

    instance = self._dataset_reader.text_to_instance(
        source,
        target,
        shortlist,
        reverse_shortlist,
        raw_entity_ids,
        entity_ids,
        relations,
        parent_ids,
        shortlist_inds,
        mention_type,
        alias_copy_inds)
    reset = SequentialArrayField(np.array(1), dtype=np.uint8)
    instance.add_field('reset', reset)
    return instance
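# Illustrative sketch (not part of the predictor): the EOS-offset bookkeeping above, for two
# made-up sentences. ``flatten`` stands in for the reader's ``_flatten`` helper.
from itertools import chain

def flatten(lists):
    return list(chain.from_iterable(lists))

sentences = [['Alice', 'sings'], ['Bob', 'listens']]        # like json_dict['tokens'][1:-1]
with_eos = [x + ['@@END@@'] for x in sentences]
eos_offset = [[i] * len(x) for i, x in enumerate(with_eos)]
tokens = ['@@START@@'] + flatten(with_eos)
eos_offset = [0] + flatten(eos_offset)
# tokens:     ['@@START@@', 'Alice', 'sings', '@@END@@', 'Bob', 'listens', '@@END@@']
# eos_offset: [0, 0, 0, 0, 1, 1, 1]
# A span index i is shifted by the number of '@@END@@' markers inserted before it:
# adjusted_span = tuple(i + eos_offset[i] for i in span)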
def text_to_instance(self,  # type: ignore
                     sentences: List[List[str]],
                     gold_clusters: Optional[List[List[Tuple[int, int]]]] = None) -> Instance:
    # pylint: disable=arguments-differ
    """
    Parameters
    ----------
    sentences : ``List[List[str]]``, required.
        A list of lists representing the tokenised words and sentences in the document.
    gold_clusters : ``Optional[List[List[Tuple[int, int]]]]``, optional (default = None)
        A list of all clusters in the document, represented as word spans. Each cluster
        contains some number of spans, which can be nested or overlapping, but will never
        exactly match between clusters.

    Returns
    -------
    An ``Instance`` containing the following ``Fields``:
        tokens : ``TextField``
            The text of the full document.
        entity_types : ``SequentialArrayField``
            An array with 1's in positions corresponding to words in entities, and 0's in
            positions corresponding to words not in entities.
        entity_ids : ``SequentialArrayField``
            An array with an entity index in positions corresponding to words in entities,
            and 0's in positions corresponding to words not in entities. Words in
            coreferring entities share the same entity ID.
        mention_lengths : ``SequentialArrayField``
            An array containing the number of words remaining in each entity mention,
            counting down to 1. For words that aren't in an entity, the corresponding value
            is 1; a single-word mention is also assigned 1.
    """
    # Filter gold_clusters: for embedded mentions, only the enclosing (outer) entity
    # mention is kept.
    filtered_gold_clusters = []
    if gold_clusters is not None:
        all_entity_spans = [span for gold_cluster in gold_clusters for span in gold_cluster]
        for cluster in gold_clusters:
            filtered_cluster = []
            for span in cluster:
                is_embedded_span = False
                for other_span in all_entity_spans:
                    # Skip if span is equal to other_span.
                    if span == other_span:
                        continue
                    if span[0] >= other_span[0] and span[1] <= other_span[1]:
                        # span is embedded within other_span, so don't use it.
                        is_embedded_span = True
                        break
                if not is_embedded_span:
                    filtered_cluster.append(span)
            if filtered_cluster:
                filtered_gold_clusters.append(filtered_cluster)

    # Sort the gold clusters, so the earlier-occurring clusters are earlier in the list.
    filtered_gold_clusters = sorted(filtered_gold_clusters, key=lambda x: sorted(x)[0][0])

    flattened_sentences = [
        self._normalize_word(word, self._replace_numbers)
        for sentence in sentences
        for word in sentence
    ]
    tokens = ['@@START@@', *flattened_sentences, '@@END@@']
    text_field = TextField([Token(word) for word in tokens], self._token_indexers)
    fields: Dict[str, Field] = {"tokens": text_field}

    cluster_dict = {}
    for cluster_id, cluster in enumerate(filtered_gold_clusters, 1):
        for mention in cluster:
            cluster_dict[tuple(mention)] = cluster_id

    # Initialize arrays.
    entity_types = np.zeros(shape=(len(tokens),))
    entity_ids = np.zeros(shape=(len(tokens),))
    mention_lengths = np.ones(shape=(len(tokens),))

    for span, entity_id in cluster_dict.items():
        # Fill in "1" for positions corresponding to words in entities.
        # Note: spans are inclusive and offset by 1 to account for the @@START@@ token.
        entity_types[span[0] + 1:span[1] + 1 + 1] = 1
        # Fill in the entity ID.
        entity_ids[span[0] + 1:span[1] + 1 + 1] = entity_id
        entity_length = (span[1] + 1) - span[0]
        # Fill in the mention length as a countdown to 1.
        mention_lengths[span[0] + 1:span[1] + 1 + 1] = np.arange(entity_length, 0, step=-1)

    fields['entity_ids'] = SequentialArrayField(entity_ids, dtype=np.int64)
    fields['mention_lengths'] = SequentialArrayField(mention_lengths, dtype=np.int64)
    fields['entity_types'] = SequentialArrayField(entity_types, dtype=np.uint8)
    return Instance(fields)
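# Illustrative sketch (not part of the reader): how one coreference mention fills the arrays
# above. The span is made up; spans are inclusive and shifted by 1 for the @@START@@ token.
import numpy as np

num_tokens = 8
entity_types = np.zeros(shape=(num_tokens,))
entity_ids = np.zeros(shape=(num_tokens,))
mention_lengths = np.ones(shape=(num_tokens,))

span, entity_id = (2, 4), 1      # inclusive word span covering words 2, 3, 4
entity_types[span[0] + 1:span[1] + 1 + 1] = 1
entity_ids[span[0] + 1:span[1] + 1 + 1] = entity_id
entity_length = (span[1] + 1) - span[0]
mention_lengths[span[0] + 1:span[1] + 1 + 1] = np.arange(entity_length, 0, step=-1)

print(entity_types)     # [0. 0. 0. 1. 1. 1. 0. 0.]
print(entity_ids)       # [0. 0. 0. 1. 1. 1. 0. 0.]
print(mention_lengths)  # [1. 1. 1. 3. 2. 1. 1. 1.]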