from typing import Dict, Iterable, List, Optional, Tuple, Type, Union

# Import paths below follow the current Forte layout; older releases may
# expose these classes from different modules.
from forte.data.data_pack import DataPack
from forte.data.ontology.core import Entry
from forte.data.ontology.top import Annotation
from ft.onto.base_ontology import Sentence, Token


def _process(self, data_pack: DataPack):
    """
    Process the data pack to collect vocabulary information.

    Args:
        data_pack: The NER data to build the vocabulary from.
    """
    for instance in data_pack.get_data(
            context_type=Sentence,
            request={Token: ["chunk", "pos", "ner"]}):
        for token in instance["Token"]["text"]:
            # Count every character of the raw token.
            for char in token:
                self.char_cnt[char] += 1
            # Count the normalized word form.
            word = self.normalize_func(token)
            self.word_cnt[word] += 1

        # Count the POS, chunk, and NER tags of the sentence's tokens.
        for pos in instance["Token"]["pos"]:
            self.pos_cnt[pos] += 1
        for chunk in instance["Token"]["chunk"]:
            self.chunk_cnt[chunk] += 1
        for ner in instance["Token"]["ner"]:
            self.ner_cnt[ner] += 1
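# A minimal sketch of the state ``_process`` mutates, assuming the enclosing
# class keeps plain ``collections.Counter`` frequency maps. ``normalize_func``
# is shown here as the common NER digit-folding heuristic; the real
# implementation may differ, so treat it as an assumption.
import re
from collections import Counter


class VocabStateSketch:
    """Illustrative only: the attributes ``_process`` expects to exist."""

    def __init__(self):
        self.char_cnt: Counter = Counter()   # character frequencies
        self.word_cnt: Counter = Counter()   # normalized-word frequencies
        self.pos_cnt: Counter = Counter()    # POS tag frequencies
        self.chunk_cnt: Counter = Counter()  # chunk tag frequencies
        self.ner_cnt: Counter = Counter()    # NER tag frequencies
        # Fold every digit to "0" so that "1984" and "2020" share one entry.
        self.normalize_func = lambda word: re.sub(r"\d", "0", word)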
def _get_data_batch(
        self, data_pack: DataPack, context_type: Type[Annotation],
        requests: Optional[Dict[Type[Entry], Union[Dict, List]]] = None,
        offset: int = 0) -> Iterable[Tuple[Dict, int]]:
    """
    Get batches of ``batch_size`` instances from a data pack, yielding a
    final incomplete batch if the data pack is exhausted before the last
    batch fills up.

    Returns:
        An iterator of tuples ``(batch, cnt)``, where ``batch`` is a dict
        containing the requested annotations and context, and ``cnt`` is
        the number of instances in the batch.
    """
    instances: List[Dict] = []
    for data in data_pack.get_data(context_type, requests, offset):
        instances.append(data)
        if len(instances) == self.batch_size:
            batch = batch_instances(instances)
            self.batch_is_full = True
            yield (batch, len(instances))
            instances = []
            self.batch_is_full = False

    # Flush the remaining instances as a final, possibly partial batch.
    if len(instances) > 0:
        batch = batch_instances(instances)
        yield (batch, len(instances))
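# ``batch_instances`` is provided elsewhere in the package. Below is a minimal
# sketch of the behavior this method relies on, assuming the helper transposes
# a list of per-instance dicts into one dict of lists, recursing into nested
# request results; this mirrors the usage above and is an assumption, not the
# helper's actual source.
def batch_instances_sketch(instances):
    """Transpose a list of instance dicts into a dict of batched lists."""
    batch = {}
    for field in instances[0]:
        values = [instance[field] for instance in instances]
        if isinstance(values[0], dict):
            # Nested results such as {"Token": {"text": [...], "pos": [...]}}
            # are transposed recursively.
            batch[field] = batch_instances_sketch(values)
        else:
            batch[field] = values
    return batch


# Example: two instances with a context string and token features become one
# batch with parallel lists.
# >>> batch_instances_sketch([
# ...     {"context": "He runs.", "Token": {"pos": ["PRP", "VBZ"]}},
# ...     {"context": "She reads.", "Token": {"pos": ["PRP", "VBZ"]}},
# ... ])
# {'context': ['He runs.', 'She reads.'],
#  'Token': {'pos': [['PRP', 'VBZ'], ['PRP', 'VBZ']]}}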