Beispiel #1
0
    def get_entries(
        self,
        entry_type: Type[EntryType],
        range_annotation: Optional[Annotation] = None,
        components: Optional[Union[str,
                                   List[str]]] = None) -> Iterable[EntryType]:
        """
        Get ``entry_type`` entries from the span of ``range_annotation`` in a
        DataPack.

        Candidate ids are narrowed by type, then by generating component,
        and then by the coverage index when one is available for
        ``(type(range_annotation), entry_type)``. Annotation entries are
        located with a bisect over ``self.annotations``; Link and Group
        entries are fetched one id at a time. When a range is given, every
        result is additionally checked with ``self.index.in_span``.

        Args:
            entry_type (type): The type of entries requested.
            range_annotation (Annotation, optional): The range of entries
                requested. If `None`, will return valid entries in the range of
                whole data_pack.
            components (str or list, optional): The component generating the
                entries requested. If `None`, will return valid entries
                generated by any component.

        Returns:
            An iterator over the matching entries. A pack without any
            annotation yields no Annotation entries instead of raising
            ``IndexError``.
        """
        # valid type
        valid_id = self.get_ids_by_type(entry_type)

        # valid component
        if components is not None:
            if isinstance(components, str):
                components = [components]
            valid_component_id: Set[int] = set()
            for component in components:
                valid_component_id |= self.get_ids_by_component(component)
            # Non in-place intersection: the set handed back by
            # ``get_ids_by_type`` is never mutated here.
            valid_id = valid_id & valid_component_id

        # valid span
        if range_annotation is not None:
            coverage_index = self.index.coverage_index(type(range_annotation),
                                                       entry_type)
            if coverage_index is not None:
                valid_id = valid_id & coverage_index[range_annotation.tid]

        if issubclass(entry_type, Annotation):
            if len(self.annotations) == 0:
                # Nothing to bisect over, and ``self.annotations[-1]`` below
                # would raise IndexError.
                return

            # The search range is only needed for annotations, so it is
            # computed here rather than up front.
            if range_annotation is not None:
                range_begin = range_annotation.span.begin
                range_end = range_annotation.span.end
            else:
                range_begin = 0
                range_end = self.annotations[-1].span.end

            # NOTE(review): these zero-length probe annotations are only used
            # as bisect keys. If constructing an Annotation registers it with
            # the pack, they should be discarded afterwards - confirm.
            begin_index = self.annotations.bisect(
                Annotation(self, range_begin, range_begin))
            end_index = self.annotations.bisect(
                Annotation(self, range_end, range_end))

            for annotation in self.annotations[begin_index:end_index]:
                if annotation.tid not in valid_id:
                    continue
                if (range_annotation is None or self.index.in_span(
                        annotation, range_annotation.span)):
                    yield annotation

        elif issubclass(entry_type, (Link, Group)):
            for entry_id in valid_id:
                entry: EntryType = self.get_entry(entry_id)  # type: ignore
                if (range_annotation is None
                        or self.index.in_span(entry, range_annotation.span)):
                    yield entry
Beispiel #2
0
    def iter_in_range(self, entry_type: Type[EntryType],
                      range_annotation: Annotation) -> Iterator[EntryType]:
        """
        Iterate the entries of the provided type that fulfill the
        constraint of the `range_annotation`. The constraint is True if
        an entry is `in_span` of the provided `range_annotation`.

        Internally, if the coverage index between the entry type and the
        type of the `range_annotation` is built, then this will create the
        iterator from the index. Otherwise, the function will iterate the
        entries from scratch (which is slower). If this function is used
        frequently, it is suggested to build the coverage index.

        On the slower path every candidate is checked both against
        ``entry_type`` (with ``isinstance``) and against the span of
        ``range_annotation`` (with ``in_span``), so the result no longer
        contains annotations of other types or annotations that merely start
        inside the range.

        Args:
            entry_type: The type of entry to iterate over.
            range_annotation: The range annotation that serves as the
                constraint.

        Returns:
            An iterator of the entries within the `range_annotation`.

        """
        coverage_index: Optional[Dict[int, Set[int]]] = None
        if self._index.coverage_index_is_valid:
            # May still be None when no index exists for this pair of types.
            coverage_index = self._index.coverage_index(
                type(range_annotation), entry_type)

        if coverage_index is not None:
            for tid in coverage_index[range_annotation.tid]:
                yield self.get_entry(tid)  # type: ignore
            return

        range_span = range_annotation.span

        if issubclass(entry_type, Annotation):
            # Zero-length probes used purely as bisect keys to narrow the
            # candidate window.
            temp_begin = Annotation(self, range_span.begin, range_span.begin)
            begin_index = self.annotations.bisect(temp_begin)

            temp_end = Annotation(self, range_span.end, range_span.end)
            end_index = self.annotations.bisect(temp_end)

            # Make sure these temporary annotations are not part of the
            # actual data.
            temp_begin.regret_creation()
            temp_end.regret_creation()

            # The bisect window is only a coarse pre-selection, so each
            # candidate is still checked for type and span.
            for annotation in self.annotations[begin_index:end_index]:
                if (isinstance(annotation, entry_type)
                        and self._index.in_span(annotation, range_span)):
                    yield annotation
        elif issubclass(entry_type, Link):
            for link in self.links:
                if (isinstance(link, entry_type)
                        and self._index.in_span(link, range_span)):
                    yield link
        elif issubclass(entry_type, Group):
            for group in self.groups:
                if (isinstance(group, entry_type)
                        and self._index.in_span(group, range_span)):
                    yield group
Beispiel #3
0
    def get_entries(
        self,
        entry_type: Type[EntryType],
        range_annotation: Optional[Annotation] = None,
        components: Optional[Union[str,
                                   List[str]]] = None) -> Iterable[EntryType]:
        r"""Get ``entry_type`` entries from the span of ``range_annotation`` in
        a DataPack.

        Example:

            .. code-block:: python

                for sentence in input_pack.get_entries(Sentence):
                    token_entries = input_pack.get_entries(
                            entry_type=Token, range_annotation=sentence,
                            components=token_component)
                    ...

            In the above code snippet, we get entries of type ``Token`` within
            each ``sentence`` which were generated by ``token_component``

        Args:
            entry_type (type): The type of entries requested.
            range_annotation (Annotation, optional): The range of entries
                requested. If `None`, will return valid entries in the range of
                whole data_pack.
            components (str or list, optional): The component generating the
                entries requested. If `None`, will return valid entries
                generated by any component.

        Returns:
            An iterator over the matching entries. ``Generics`` entries are
            returned without any range filtering. A pack without any
            annotation yields no Annotation entries instead of raising
            ``IndexError``.
        """
        if range_annotation is not None and len(self.annotations) == 0:
            # A range was requested but the pack holds no annotation at all:
            # don't do real queries.
            return

        # valid type
        valid_id = self.get_ids_by_type(entry_type)

        # valid component
        if components is not None:
            if isinstance(components, str):
                components = [components]
            # Non in-place intersection: the set handed back by
            # ``get_ids_by_type`` is never mutated here.
            valid_id = valid_id & self.get_ids_by_components(components)

        # Generics do not work with range_annotation.
        if issubclass(entry_type, Generics):
            for entry_id in valid_id:
                generic_entry: EntryType = self.get_entry(  # type: ignore
                    entry_id)
                yield generic_entry
            return

        # valid span
        if range_annotation is not None:
            coverage_index = self.index.coverage_index(type(range_annotation),
                                                       entry_type)
            if coverage_index is not None:
                valid_id = valid_id & coverage_index[range_annotation.tid]

        if issubclass(entry_type, Annotation):
            if len(self.annotations) == 0:
                # Only reachable when ``range_annotation`` is None; without
                # this guard ``self.annotations[-1]`` would raise IndexError.
                return

            # The search range is only needed for annotations, so it is
            # computed here rather than up front.
            if range_annotation is not None:
                range_begin = range_annotation.span.begin
                range_end = range_annotation.span.end
            else:
                range_begin = 0
                range_end = self.annotations[-1].span.end

            temp_begin = Annotation(self, range_begin, range_begin)
            begin_index = self.annotations.bisect(temp_begin)

            temp_end = Annotation(self, range_end, range_end)
            end_index = self.annotations.bisect(temp_end)

            # Make sure these temporary annotations are not part of the
            # actual data.
            temp_begin.regret_creation()
            temp_end.regret_creation()

            for annotation in self.annotations[begin_index:end_index]:
                if annotation.tid not in valid_id:
                    continue
                if (range_annotation is None or self.index.in_span(
                        annotation, range_annotation.span)):
                    yield annotation

        elif issubclass(entry_type, (Link, Group)):
            for entry_id in valid_id:
                entry: EntryType = self.get_entry(entry_id)  # type: ignore
                if (range_annotation is None
                        or self.index.in_span(entry, range_annotation.span)):
                    yield entry