import warnings
from typing import Any, Dict, List, Union

# Import paths below assume a HanLP-style codebase; adjust them to wherever
# CountdownTimer, TransformDataset and Vocab live in your project.
from hanlp.common.dataset import TransformDataset
from hanlp.common.vocab import Vocab
from hanlp.utils.time_util import CountdownTimer


def compute_lens(self, data: Union[List[Dict[str, Any]], str], dataset: TransformDataset,
                 input_ids='token_input_ids', length_field='token'):
    """Measure the length of each sample.

    Args:
        data: Samples to be measured, or a path to the dataset during training.
        dataset: During training, use this dataset to measure the length of each sample inside it.
        input_ids: Field name that corresponds to the input ids.
        length_field: Fall back to this field during prediction, as input_ids may not be generated yet.

    Returns:
        A list of sample lengths.
    """
    if isinstance(data, str):
        if not dataset.cache:
            warnings.warn(f'Caching for the dataset is not enabled, '
                          f'try `dataset.purge_cache()` if possible. The dataset is {dataset}.')
        timer = CountdownTimer(len(dataset))
        for each in dataset:
            # Iterating the dataset triggers preprocessing and caches each sample.
            timer.log('Preprocessing and caching samples [blink][yellow]...[/yellow][/blink]')
        timer.erase()
        return [len(x[input_ids]) for x in dataset]
    return [len(x[length_field]) for x in data]
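# A minimal usage sketch of the two call paths, assuming `component` is an
# instance of the class these methods belong to; `component`, `trn_path` and
# `trn_dataset` are hypothetical placeholders, not names from the codebase:
#
#     # Training: `data` is a path, so lengths are measured on the (cached)
#     # dataset after the loop above has preprocessed it once.
#     lens = component.compute_lens(trn_path, trn_dataset)
#
#     # Prediction: `data` is a list of raw samples; 'token_input_ids' may not
#     # be generated yet, so the fallback 'token' field determines each length.
#     samples = [{'token': ['The', 'cat', 'sat']}, {'token': ['Hello']}]
#     lens = component.compute_lens(samples, trn_dataset)  # -> [3, 1]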
def build_vocabs(self, dataset, logger, **kwargs):
    self.vocabs.srl_label = Vocab(pad_token=None, unk_token=None)
    # Use null to indicate no relationship
    self.vocabs.srl_label.add('<null>')
    timer = CountdownTimer(len(dataset))
    max_seq_len = 0
    for each in dataset:
        # Iterating the dataset runs its transforms, which populate srl_label
        # as a side effect; meanwhile track the longest token sequence.
        max_seq_len = max(max_seq_len, len(each['token_input_ids']))
        timer.log(f'Building vocabs (max sequence length {max_seq_len}) [blink][yellow]...[/yellow][/blink]')
    timer.stop()
    timer.erase()
    # With no dedicated unk token, map labels unseen at training time to a safe
    # in-vocab token instead of raising.
    self.vocabs['srl_label'].set_unk_as_safe_unk()
    self.vocabs.lock()
    self.vocabs.summary(logger)
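# A sketch of what build_vocabs leaves behind, assuming HanLP-style Vocab
# semantics where a locked vocab resolves unseen labels via the safe unk set
# above; `component` and `trn_dataset` are the same hypothetical placeholders:
#
#     import logging
#
#     component.build_vocabs(trn_dataset, logging.getLogger('srl'))
#     vocab = component.vocabs['srl_label']
#     vocab['<null>']        # 0: added first, doubles as the "no relation" label
#     vocab['UNSEEN-LABEL']  # resolves to the safe unk instead of raising,
#                            # thanks to set_unk_as_safe_unk()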