Example #1
0
    def _process(self, data_pack: DataPack):
        """
        Scan one data pack and accumulate vocabulary statistics.

        Tallies characters, normalized words, POS tags, chunk tags and
        NER tags into the collector's respective counters.

        Args:
            data_pack: The NER data to build the vocabulary from.

        Returns:

        """
        token_request = {Token: ["chunk", "pos", "ner"]}
        for instance in data_pack.get_data(context_type=Sentence,
                                           request=token_request):
            token_fields = instance["Token"]

            for raw_token in token_fields["text"]:
                for ch in raw_token:
                    self.char_cnt[ch] += 1
                self.word_cnt[self.normalize_func(raw_token)] += 1

            # Tally each tag field into its dedicated counter.
            for field, counter in (("pos", self.pos_cnt),
                                   ("chunk", self.chunk_cnt),
                                   ("ner", self.ner_cnt)):
                for tag in token_fields[field]:
                    counter[tag] += 1
Example #2
0
    def _get_data_batch(self,
                        data_pack: DataPack,
                        context_type: Type[Annotation],
                        requests: Optional[Dict[Type[Entry],
                                                Union[Dict, List]]] = None,
                        offset: int = 0) -> Iterable[Tuple[Dict, int]]:
        """
        Yield batches of up to ``batch_size`` instances from a data pack.

        A final, possibly smaller batch is emitted once the data pack is
        exhausted.

        Returns:
            An iterator of ``(batch, cnt)`` tuples, where ``batch`` is a
            dict containing the required annotations and context, and
            ``cnt`` is the number of instances in the batch.
        """
        buffer: List[Dict] = []
        for instance in data_pack.get_data(context_type, requests, offset):
            buffer.append(instance)
            if len(buffer) < self.batch_size:
                continue
            full_batch = batch_instances(buffer)
            # Signal a complete batch for the duration of the yield.
            self.batch_is_full = True
            yield (full_batch, len(buffer))
            buffer = []
            self.batch_is_full = False

        # Emit whatever is left as a short final batch.
        if buffer:
            yield (batch_instances(buffer), len(buffer))