def text_to_instance(
            self,
            source,
            target=None,
            shortlist=None,
            reverse_shortlist=None,
            raw_entity_ids=None,
            entity_ids=None,
            relations=None,
            parent_ids=None,
            shortlist_inds=None,
            mention_type=None,
            alias_copy_inds=None) -> Instance:  # pylint: disable=arguments-differ
        metadata = {
            'source_tokens': source,
            'alias_database': self._alias_database
        }
        fields = {
            'metadata': MetadataField(metadata),
            'source': TextField(_tokenize(source), self._token_indexers),
        }

        if target is not None:
            fields['target'] = TextField(_tokenize(target),
                                         self._token_indexers)
            metadata['target_tokens'] = target
        if shortlist is not None:
            fields['shortlist'] = TextField(_tokenize(shortlist),
                                            self._entity_indexers)
        if raw_entity_ids is not None:
            fields['raw_entity_ids'] = TextField(_tokenize(raw_entity_ids),
                                                 self._raw_entity_indexers)
        if entity_ids is not None:
            fields['entity_ids'] = TextField(_tokenize(entity_ids),
                                             self._entity_indexers)
        if parent_ids is not None:
            fields['parent_ids'] = ListField([
                TextField(_tokenize(sublist),
                          token_indexers=self._entity_indexers)
                for sublist in parent_ids
            ])
        if relations is not None:
            fields['relations'] = ListField([
                TextField(_tokenize(sublist),
                          token_indexers=self._relation_indexers)
                for sublist in relations
            ])
        if mention_type is not None:
            fields['mention_type'] = SequentialArrayField(mention_type,
                                                          dtype=np.int64)
        if shortlist_inds is not None:
            fields['shortlist_inds'] = SequentialArrayField(shortlist_inds,
                                                            dtype=np.int64)
        if alias_copy_inds is not None:
            fields['alias_copy_inds'] = SequentialArrayField(alias_copy_inds,
                                                             dtype=np.int64)

        return Instance(fields)
    def _split(self, instance: Instance) -> Tuple[List[Instance], int]:
        # Determine the size of the sequence inside the instance.
        true_length = len(instance['source'])
        # Round the true length up to the nearest multiple of the split size.
        padded_length = self._split_size * (
            (true_length + self._split_size - 1) // self._split_size)

        # Determine the split indices.
        split_indices = list(range(0, true_length, self._split_size))
        if true_length > split_indices[-1]:
            split_indices.append(true_length)

        # Determine which fields are not going to be split
        constant_fields = [
            x for x in instance.fields if x not in self._splitting_keys
        ]

        # Create the list of chunks
        chunks: List[Instance] = []

        for i, (start, end) in enumerate(
                zip(split_indices[:-1], split_indices[1:])):

            # Copy all of the constant fields from the instance to the chunk.
            chunk_fields = {key: instance[key] for key in constant_fields}

            # Determine whether or not to signal model to reset.
            if i == 0:
                reset = SequentialArrayField(np.array(1), dtype=np.uint8)
            else:
                reset = SequentialArrayField(np.array(0), dtype=np.uint8)
            chunk_fields['reset'] = reset

            # Obtain splits derived from sequence fields.
            for key in self._splitting_keys:
                source_field = instance[key]
                # pylint: disable=protected-access
                if isinstance(source_field, TextField):
                    split_field = TextField(source_field.tokens[start:end],
                                            source_field._token_indexers)
                elif isinstance(source_field, SequentialArrayField):
                    # TODO: Figure out how to use sequence dim here...
                    split_field = SequentialArrayField(
                        source_field.array[start:end],
                        dtype=source_field._dtype)
                elif isinstance(source_field, ListField):
                    split_field = ListField(source_field.field_list[start:end])
                else:
                    raise NotImplementedError(
                        'FancyIterator currently only supports splitting '
                        '`TextField`s, `SequentialArrayField`s, and `ListField`s.')
                chunk_fields[key] = split_field
            chunks.append(Instance(chunk_fields))

        return chunks, padded_length
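
# A minimal sketch (not part of the original iterator): the chunking arithmetic that
# `_split` uses, reproduced on plain integers so the indexing is easy to verify.
# `demo_split_indices` and its arguments are illustrative names, not project API.
def demo_split_indices(sequence_length, split_size):
    """Reproduce the (start, end) pairs that `_split` iterates over."""
    split_indices = list(range(0, sequence_length, split_size))
    if sequence_length > split_indices[-1]:
        split_indices.append(sequence_length)
    return list(zip(split_indices[:-1], split_indices[1:]))

# For a 10-token sequence and split_size=4 this yields [(0, 4), (4, 8), (8, 10)]:
# two full chunks plus a short final chunk, with `reset` set to 1 only on the first.
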
    def text_to_instance(self, data: Dict[str, Any]) -> Instance:  # pylint: disable=arguments-differ
        # Flatten tokens
        tokens = _flatten(data['tokens'])
        tokens = [Token(x) for x in tokens]
        fields = {'tokens': TextField(tokens, self._token_indexers)}

        # If annotations are provided, process them into arrays.
        if 'annotations' in data:

            # Initialize arrays and book keeping data structures.
            seen_entities: Set[str] = set()
            entity_types = np.zeros(shape=(len(tokens), ))
            entity_ids = np.zeros(shape=(len(tokens), ))
            mention_lengths = np.ones(shape=(len(tokens), ))

            # Process annotations
            for annotation in data['annotations']:

                seen_entities.add(annotation['id'])
                start, end = annotation['span']
                length = end - start

                for i in range(*annotation['span']):
                    entity_types[i] = 1
                    entity_ids[i] = len(seen_entities)
                    mention_lengths[i] = length
                    length -= 1

            fields['entity_types'] = SequentialArrayField(entity_types,
                                                          dtype=np.uint8)
            fields['entity_ids'] = SequentialArrayField(entity_ids,
                                                        dtype=np.int64)
            fields['mention_lengths'] = SequentialArrayField(mention_lengths,
                                                             dtype=np.int64)

        return Instance(fields)
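
# A minimal worked example (assumed toy data, not from the original dataset): how the
# annotation loop above turns a single span into the three arrays. Six tokens and one
# annotation covering positions 2-4 are illustrative values.
import numpy as np

num_tokens = 6
annotation = {'id': 'Q1', 'span': (2, 5)}

entity_types = np.zeros(shape=(num_tokens, ))
entity_ids = np.zeros(shape=(num_tokens, ))
mention_lengths = np.ones(shape=(num_tokens, ))

seen_entities = {annotation['id']}
start, end = annotation['span']
length = end - start
for i in range(start, end):
    entity_types[i] = 1
    entity_ids[i] = len(seen_entities)
    mention_lengths[i] = length
    length -= 1

# entity_types    -> [0, 0, 1, 1, 1, 0]
# entity_ids      -> [0, 0, 1, 1, 1, 0]
# mention_lengths -> [1, 1, 3, 2, 1, 1]
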
    def text_to_instance(self, data: Dict[str, Any]) -> Instance:  # pylint: disable=arguments-differ
        # Flatten tokens and add start/end sentinel tokens
        tokens = _flatten(data['tokens'])
        tokens = ['@@START@@', *tokens, '@@END@@']
        source = [Token(x) for x in tokens[:-1]]
        target = [Token(x) for x in tokens[1:]]
        fields = {
            'source': TextField(source, self._token_indexers),
            'target': TextField(target, self._token_indexers)
        }

        # Process annotations
        if 'annotations' in data:

            # We maintain a "shortlist" of observed entities, which is used by baseline models
            # that only select entities appearing in the document (as opposed to selecting from
            # the set of all possible entities).
            shortlist = [DEFAULT_PADDING_TOKEN]
            reverse_shortlist = {DEFAULT_PADDING_TOKEN: 0}

            entity_ids = [DEFAULT_PADDING_TOKEN] * len(target)
            shortlist_inds = np.zeros(shape=(len(target), ))
            alias_copy_inds = np.zeros(shape=(len(target), ))
            alias_tokens = [TextField([], self._token_indexers)] * len(target)
            alias_inds: List[List[int]] = [[]] * len(target)
            max_len = 0

            # Process annotations
            for annotation in data['annotations']:

                # Obtain the entity identifier for the annotated span
                entity_id = annotation['id']
                alias = annotation['alias']
                alias_map = {
                    token: i + 1
                    for i, token in enumerate(set(alias))
                }

                # If necessary, update the shortlist. Obtain the index of the entity identifier in
                # the shortlist.
                if entity_id not in reverse_shortlist:
                    reverse_shortlist[entity_id] = len(reverse_shortlist)
                    shortlist.append(entity_id)
                shortlist_ind = reverse_shortlist[entity_id]

                # Update the outputs
                for i in range(*annotation['span']):
                    # Note: +1 offset to account for the @@START@@ token.
                    if tokens[i + 1] not in alias_map:
                        continue
                    entity_ids[i] = entity_id
                    shortlist_inds[i] = shortlist_ind
                    alias_copy_inds[i] = alias_map[tokens[i + 1]]
                    alias_inds[i] = [alias_map[token] for token in alias]
                    alias_tokens[i] = TextField([Token(x) for x in alias],
                                                self._token_indexers)
                    max_len = max(max_len, len(alias))

            # Make alias_inds into a numpy array
            alias_ind_array = np.zeros((len(target), max_len))
            for i, arr in enumerate(alias_inds):
                for j, ind in enumerate(arr):
                    alias_ind_array[i, j] = ind

            fields['entity_ids'] = TextField(
                [Token(x) for x in entity_ids],
                token_indexers=self._entity_indexers)
            fields['alias_copy_inds'] = SequentialArrayField(alias_copy_inds,
                                                             dtype=np.int64)
            fields['shortlist'] = TextField(
                [Token(x) for x in shortlist],
                token_indexers=self._entity_indexers)
            fields['shortlist_inds'] = SequentialArrayField(shortlist_inds,
                                                            dtype=np.int64)
            fields['alias_tokens'] = ListField(alias_tokens)
            fields['alias_inds'] = SequentialArrayField(alias_ind_array,
                                                        dtype=np.int64)

        return Instance(fields)
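
# A minimal sketch (illustrative values only): the shortlist and alias_map bookkeeping
# used above. Index 0 of the shortlist is reserved for padding, so the first real entity
# gets shortlist index 1, and alias_map positions are 1-based so that 0 can mean "no copy".
# 'Q42', the alias tokens, and the padding string are made-up stand-ins.
PADDING = '@@PADDING@@'  # stand-in for DEFAULT_PADDING_TOKEN

shortlist = [PADDING]
reverse_shortlist = {PADDING: 0}

entity_id = 'Q42'
alias = ['Douglas', 'Adams']

if entity_id not in reverse_shortlist:
    reverse_shortlist[entity_id] = len(reverse_shortlist)
    shortlist.append(entity_id)
shortlist_ind = reverse_shortlist[entity_id]

alias_map = {token: i + 1 for i, token in enumerate(set(alias))}

# shortlist     -> ['@@PADDING@@', 'Q42'], shortlist_ind -> 1
# alias_map     -> e.g. {'Adams': 1, 'Douglas': 2} (ordering depends on the set)
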
    def _json_to_instance(self, json_dict: JsonDict) -> Instance:
        # pylint: disable=protected-access

        # Extract tokens and EOS offset
        tokens = [x + ['@@END@@'] for x in json_dict['tokens'][1:-1]]
        eos_offset = [[i] * len(x) for i, x in enumerate(tokens)]
        tokens = ['@@START@@'] + _flatten(tokens)
        eos_offset = [0] + _flatten(eos_offset)
        source = tokens[:-1]
        if self._dataset_reader._mode == 'generative':
            target = tokens[1:]
        else:
            target = None

        # Process annotations
        if 'annotations' not in json_dict:
            shortlist = None
            reverse_shortlist = None
            raw_entity_ids = None
            entity_ids = None
            relations = None
            parent_ids = None
            shortlist_inds = None
            mention_type = None
            alias_copy_inds = None
        else:
            # We maintain a "shortlist" of observed entities, which is used by baseline models
            # that only select entities appearing in the document (as opposed to selecting from
            # the set of all possible entities).
            shortlist = [DEFAULT_PADDING_TOKEN]
            reverse_shortlist = {DEFAULT_PADDING_TOKEN: 0}
            raw_entity_ids = [DEFAULT_PADDING_TOKEN] * len(source)
            entity_ids = [DEFAULT_PADDING_TOKEN] * len(source)
            relations = [[DEFAULT_PADDING_TOKEN]] * len(source)
            parent_ids = [[DEFAULT_PADDING_TOKEN]] * len(source)
            shortlist_inds = np.zeros(shape=(len(source), ))
            mention_type = np.zeros(shape=(len(source), ))

            if self._dataset_reader._mode == "generative":
                alias_copy_inds = np.zeros(shape=(len(source), ))
            else:
                alias_copy_inds = None

            # Process annotations
            for annotation in json_dict['annotations']:

                # Obtain the entity identifier for the annotated span
                raw_entity_id = annotation['id']
                raw_parent_id = annotation['parent_id']
                entity_id = normalize_entity_id(raw_entity_id)
                if entity_id is None:
                    continue
                parent_id = [normalize_entity_id(x) for x in raw_parent_id]
                assert len(parent_id) == len(raw_parent_id)
                relation = annotation['relation']
                new_entity = relation == ['@@NEW@@']

                # If necessary, update the shortlist. Obtain the index of the entity identifier in
                # the shortlist.
                if entity_id not in reverse_shortlist:
                    reverse_shortlist[entity_id] = len(reverse_shortlist)
                    shortlist.append(entity_id)
                shortlist_ind = reverse_shortlist[entity_id]

                # Update the outputs
                # Offset is -1 in the generative case, since each timestep predicts
                # attributes of the next token. In the discriminative case the offset
                # is 0, since each timestep predicts attributes of the current token.
                mode_offset = -1 if self._dataset_reader._mode == "generative" else 0
                span = annotation['span']
                eos_offset_adjusted_span = tuple(i + eos_offset[i]
                                                 for i in span)
                for i in range(*eos_offset_adjusted_span):
                    raw_entity_ids[i + mode_offset] = raw_entity_id
                    entity_ids[i + mode_offset] = entity_id
                    mention_type[i + mode_offset] = 3
                    if new_entity:
                        shortlist_inds[i + mode_offset] = shortlist_ind
                    else:
                        relations[i + mode_offset] = relation[:MAX_PARENTS]
                        parent_ids[i + mode_offset] = parent_id[:MAX_PARENTS]
                    if self._dataset_reader._mode == "generative":
                        alias_copy_inds[i + mode_offset] = \
                            self._dataset_reader._alias_database.token_to_uid(
                                raw_entity_id, tokens[i])
                # Now put in proper mention type for first token
                start = annotation['span'][0]
                if new_entity:
                    mention_type[start + mode_offset] = 1
                else:
                    mention_type[start + mode_offset] = 2

        instance = self._dataset_reader.text_to_instance(
            source, target, shortlist, reverse_shortlist, raw_entity_ids,
            entity_ids, relations, parent_ids, shortlist_inds, mention_type,
            alias_copy_inds)

        reset = SequentialArrayField(np.array(1), dtype=np.uint8)
        instance.add_field('reset', reset)
        return instance
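
# A minimal sketch (toy values, not from the original predictor): why mode_offset above
# is -1 in the generative case. Since target = tokens[1:], an annotation attached to
# tokens[i] lines up with position i - 1 of the target sequence; in the discriminative
# case the model labels the current token, so no shift is needed.
tokens = ['@@START@@', 'Douglas', 'Adams', 'wrote', '@@END@@']
source = tokens[:-1]    # ['@@START@@', 'Douglas', 'Adams', 'wrote']
target = tokens[1:]     # ['Douglas', 'Adams', 'wrote', '@@END@@']

mode_offset = -1        # generative mode; 0 in the discriminative case
i = 1                   # an annotation covering tokens[1] == 'Douglas'
assert target[i + mode_offset] == tokens[i]
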
    def text_to_instance(
            self,  # type: ignore
            sentences: List[List[str]],
            gold_clusters: Optional[List[List[Tuple[int, int]]]] = None) -> Instance:
        # pylint: disable=arguments-differ
        """
        Parameters
        ----------
        sentences : ``List[List[str]]``, required.
            A list of lists representing the tokenised words and sentences in the document.
        gold_clusters : ``Optional[List[List[Tuple[int, int]]]]``, optional (default = None)
            A list of all clusters in the document, represented as word spans. Each cluster
            contains some number of spans, which can be nested and overlap, but will never
            exactly match between clusters.

        Returns
        -------
        An ``Instance`` containing the following ``Fields``:
            tokens : ``TextField``
                The text of the full document.
            entity_types : ``SequentialArrayField``
                An array with 1's in positions corresponding to words in entities,
                and 0's in positions corresponding to words not in entities.
            entity_ids : ``SequentialArrayField``
                An array with an entity index in positions corresponding to words in
                entities, and 0's in positions corresponding to words not in entities.
                Words in coreferring entities share the same entity ID.
            mention_lengths : ``SequentialArrayField``
                An array with the number of words remaining in each entity mention. For
                words that aren't in an entity, the corresponding entry is 1. Otherwise,
                the entry counts the words remaining in the mention, including the
                current word, so a mention of length 1 is assigned 1.
        """
        # Filter gold_clusters: for embedded mentions, only the
        # enclosing (outer) entity mention is kept.
        filtered_gold_clusters = []
        gold_clusters = gold_clusters or []
        all_entity_spans = [
            span for gold_cluster in gold_clusters for span in gold_cluster
        ]
        for cluster in gold_clusters:
            filtered_cluster = []
            for span in cluster:
                is_embedded_span = False
                for other_span in all_entity_spans:
                    # Skip if span is equal to other_span
                    if span == other_span:
                        continue
                    if span[0] >= other_span[0] and span[1] <= other_span[1]:
                        # span is embedded within other_span, so don't use it
                        is_embedded_span = True
                        break
                if not is_embedded_span:
                    filtered_cluster.append(span)
            if filtered_cluster:
                filtered_gold_clusters.append(filtered_cluster)

        # Sort the gold clusters, so the earlier-occurring clusters are earlier in the list
        filtered_gold_clusters = sorted(filtered_gold_clusters,
                                        key=lambda x: sorted(x)[0][0])

        flattened_sentences = [
            self._normalize_word(word, self._replace_numbers)
            for sentence in sentences for word in sentence
        ]
        tokens = ['@@START@@', *flattened_sentences, '@@END@@']
        text_field = TextField([Token(word) for word in tokens],
                               self._token_indexers)
        fields: Dict[str, Field] = {"tokens": text_field}

        cluster_dict = {}
        for cluster_id, cluster in enumerate(filtered_gold_clusters, 1):
            for mention in cluster:
                cluster_dict[tuple(mention)] = cluster_id

        # Initialize fields.
        entity_types = np.zeros(shape=(len(tokens), ))
        entity_ids = np.zeros(shape=(len(tokens), ))
        mention_lengths = np.ones(shape=(len(tokens), ))

        if cluster_dict:
            for cluster, entity_id in cluster_dict.items():
                # Fill in "1" for positions corresponding to words in entities
                # Need offset by 1 to account for @@START@@ token.
                entity_types[cluster[0] + 1:cluster[1] + 1 + 1] = 1
                # Fill in entity ID
                entity_ids[cluster[0] + 1:cluster[1] + 1 + 1] = entity_id
                entity_length = (cluster[1] + 1) - cluster[0]
                # Fill in mention length
                mention_lengths[cluster[0] + 1:cluster[1] + 1 + 1] = np.arange(
                    entity_length, 0, step=-1)

        fields['entity_ids'] = SequentialArrayField(entity_ids, dtype=np.int64)
        fields['mention_lengths'] = SequentialArrayField(mention_lengths,
                                                         dtype=np.int64)
        fields['entity_types'] = SequentialArrayField(entity_types,
                                                      dtype=np.uint8)
        return Instance(fields)
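
# A minimal sketch (made-up clusters): the embedded-mention filter at the top of
# text_to_instance keeps only the outermost span when one gold mention is nested
# inside another, and drops clusters that end up empty. The span values are
# illustrative.
gold_clusters = [
    [(2, 6), (10, 11)],   # an outer mention plus a later coreferent mention
    [(3, 4)],             # nested inside (2, 6), so it is filtered out entirely
]

all_entity_spans = [span for cluster in gold_clusters for span in cluster]

filtered_gold_clusters = []
for cluster in gold_clusters:
    filtered_cluster = [
        span for span in cluster
        if not any(span != other and other[0] <= span[0] and span[1] <= other[1]
                   for other in all_entity_spans)
    ]
    if filtered_cluster:
        filtered_gold_clusters.append(filtered_cluster)

# filtered_gold_clusters -> [[(2, 6), (10, 11)]]
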