Example #1
    def compute_lens(self,
                     data: Union[List[Dict[str, Any]], str],
                     dataset: TransformDataset,
                     input_ids='token_input_ids',
                     length_field='token'):
        """

        Args:
            data: Samples to be measured or path to dataset during training time.
            dataset: During training time, use this dataset to measure the length of each sample inside.
            input_ids: Field name corresponds to input ids.
            length_field: Fall back to this field during prediction as input_ids may not be generated yet.

        Returns:

            Length list of this samples

        """
        if isinstance(data, str):
            if not dataset.cache:
                warnings.warn(
                    f'Caching for the dataset is not enabled, '
                    f'try `dataset.purge_cache()` if possible. The dataset is {dataset}.'
                )
            timer = CountdownTimer(len(dataset))
            # Iterating the dataset triggers its transforms, so every sample
            # is preprocessed (and cached) before its length is measured.
            for each in dataset:
                timer.log(
                    'Preprocessing and caching samples [blink][yellow]...[/yellow][/blink]'
                )
            timer.erase()
            return [len(x[input_ids]) for x in dataset]
        # At prediction time input ids may not exist yet, so fall back to length_field.
        return [len(x[length_field]) for x in data]
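For context outside HanLP, here is a minimal, self-contained sketch of the same fallback pattern, using plain dicts in place of TransformDataset samples (compute_lens_sketch and the sample data are hypothetical):

from typing import Any, Dict, List

def compute_lens_sketch(samples: List[Dict[str, Any]],
                        input_ids: str = 'token_input_ids',
                        length_field: str = 'token') -> List[int]:
    """Measure each sample by input_ids when present, else fall back to length_field."""
    return [len(x[input_ids]) if input_ids in x else len(x[length_field])
            for x in samples]

samples = [
    {'token': ['I', 'love', 'NLP'], 'token_input_ids': [101, 146, 1567, 21239, 102]},
    {'token': ['Hello', 'world']},  # input ids not generated yet, e.g. at prediction time
]
print(compute_lens_sketch(samples))  # [5, 2]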
Example #2
    def build_vocabs(self, dataset, logger, **kwargs):
        self.vocabs.srl_label = Vocab(pad_token=None, unk_token=None)
        # Use null to indicate no relationship
        self.vocabs.srl_label.add('<null>')
        timer = CountdownTimer(len(dataset))
        max_seq_len = 0
        # One full pass over the dataset runs its transforms (which populate
        # the label vocab) while tracking the longest token sequence seen.
        for each in dataset:
            max_seq_len = max(max_seq_len, len(each['token_input_ids']))
            timer.log(
                f'Building vocabs (max sequence length {max_seq_len}) [blink][yellow]...[/yellow][/blink]'
            )
        timer.stop()
        timer.erase()
        # No unk_token was set, so redirect unseen labels to a safe token.
        self.vocabs.srl_label.set_unk_as_safe_unk()
        self.vocabs.lock()
        self.vocabs.summary(logger)
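For readers outside the HanLP codebase, here is a minimal, self-contained sketch of the same vocab-building pattern; MiniVocab and the sample data are hypothetical stand-ins for hanlp's Vocab and dataset:

from typing import Dict

class MiniVocab:
    """A tiny stand-in for hanlp's Vocab: maps labels to integer ids."""
    def __init__(self):
        self.token_to_idx: Dict[str, int] = {}
        self.mutable = True

    def add(self, token: str) -> int:
        if self.mutable and token not in self.token_to_idx:
            self.token_to_idx[token] = len(self.token_to_idx)
        return self.token_to_idx.get(token, 0)  # unseen labels fall back to id 0

    def lock(self):
        self.mutable = False

vocab = MiniVocab()
vocab.add('<null>')  # id 0 marks "no relationship", mirroring the snippet above
dataset = [{'srl_label': ['ARG0', 'V']}, {'srl_label': ['ARG1']}]
for each in dataset:  # one full pass fills the vocab
    for label in each['srl_label']:
        vocab.add(label)
vocab.lock()  # freeze before training so ids stay stable
print(vocab.token_to_idx)  # {'<null>': 0, 'ARG0': 1, 'V': 2, 'ARG1': 3}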