    def batchify(self, tokenizer_src, tokenizer_tgt):
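        # Convert the raw source and target datasets into parallel lists of
        # token ids, optionally caching / reusing cached ids.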
        src_ids = dataset_to_ids(
            self.dataset_src,
            tokenizer_src,
            cache_ids=self.cache_ids,
            cache_data_per_node=self.cache_data_per_node,
            use_cache=self.use_cache,
        )
        tgt_ids = dataset_to_ids(
            self.dataset_tgt,
            tokenizer_tgt,
            cache_ids=self.cache_ids,
            cache_data_per_node=self.cache_data_per_node,
            use_cache=self.use_cache,
        )
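        # Optionally drop sentence pairs that violate the length constraints
        # (too short, too long, or with mismatched source/target lengths).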
        if self.clean:
            src_ids, tgt_ids = self.clean_src_and_target(
                src_ids,
                tgt_ids,
                max_tokens=self.max_seq_length,
                min_tokens=self.min_seq_length,
                max_tokens_diff=self.max_seq_length_diff,
                max_tokens_ratio=self.max_seq_length_ratio,
            )
        self.src_pad_id = tokenizer_src.pad_id
        self.tgt_pad_id = tokenizer_tgt.pad_id

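        # Group sentence pairs into batches bounded by a token budget, then pad
        # every batch to the length of its longest sentence.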
        self.batch_indices = self.pack_data_into_batches(src_ids, tgt_ids)
        self.batches = self.pad_batches(src_ids, tgt_ids, self.batch_indices)
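The length-based cleaning step is the part most often tuned. Below is a minimal, self-contained sketch of that kind of filtering; the function name `filter_pairs` and its default thresholds are hypothetical and do not reproduce NeMo's `clean_src_and_target` exactly.

from typing import List, Tuple


def filter_pairs(
    src_ids: List[List[int]],
    tgt_ids: List[List[int]],
    max_tokens: int = 512,
    min_tokens: int = 1,
    max_tokens_diff: int = 80,
    max_tokens_ratio: int = 8,
) -> Tuple[List[List[int]], List[List[int]]]:
    """Keep only pairs whose lengths satisfy the min/max/diff/ratio limits."""
    kept_src, kept_tgt = [], []
    for s, t in zip(src_ids, tgt_ids):
        len_s, len_t = len(s), len(t)
        if not (min_tokens <= len_s <= max_tokens and min_tokens <= len_t <= max_tokens):
            continue  # too short or too long on either side
        if abs(len_s - len_t) > max_tokens_diff:
            continue  # absolute length difference too large
        if max(len_s, len_t) / max(min(len_s, len_t), 1) > max_tokens_ratio:
            continue  # relative length ratio too large
        kept_src.append(s)
        kept_tgt.append(t)
    return kept_src, kept_tgt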
Example #2
    def __init__(
        self,
        tokenizer: Any,
        dataset: Any,
        tokens_in_batch: int = 1024,
        clean: bool = False,
        cache_ids: bool = False,
        max_seq_length: int = 512,
        min_seq_length: int = 1,
    ):

        self.tokenizer = tokenizer
        self.tokens_in_batch = tokens_in_batch

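        # Tokenize the whole dataset once up front; if `clean` is set, the
        # self.clean() method (distinct from the `clean` flag) filters out
        # sentences outside the configured length range.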
        ids = dataset_to_ids(dataset, tokenizer, cache_ids=cache_ids)
        if clean:
            ids = self.clean(ids,
                             max_tokens=max_seq_length,
                             min_tokens=min_seq_length)
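        # Pack sentences into token-budgeted batches, then pad each batch.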
        self.batch_sent_ids, self.batch_elem_lengths = self.pack_data_into_batches(
            ids)
        self.batches = self.pad_batches(ids)
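Both examples delegate the heavy lifting to `pack_data_into_batches` and `pad_batches`. The following is a hypothetical, self-contained sketch of that pattern (sort by length, fill batches up to a token budget, right-pad each batch); the helper names `pack_by_token_budget` and `pad_batch` are illustrative and not part of the library.

from typing import List

import numpy as np


def pack_by_token_budget(ids: List[List[int]], tokens_in_batch: int = 1024) -> List[List[int]]:
    """Group sentence indices so each batch holds at most ~tokens_in_batch tokens."""
    order = sorted(range(len(ids)), key=lambda i: len(ids[i]))  # short sentences first
    batches, current, current_tokens = [], [], 0
    for i in order:
        sent_len = len(ids[i])
        if current and current_tokens + sent_len > tokens_in_batch:
            batches.append(current)
            current, current_tokens = [], 0
        current.append(i)
        current_tokens += sent_len
    if current:
        batches.append(current)
    return batches


def pad_batch(ids: List[List[int]], batch: List[int], pad_id: int = 0) -> np.ndarray:
    """Right-pad every sentence in a batch to the longest sentence in that batch."""
    max_len = max(len(ids[i]) for i in batch)
    out = np.full((len(batch), max_len), pad_id, dtype=np.int64)
    for row, i in enumerate(batch):
        out[row, : len(ids[i])] = ids[i]
    return out


# Usage: turn each token-budgeted batch into a dense int64 matrix.
# batches = [pad_batch(ids, b, pad_id=tokenizer.pad_id) for b in pack_by_token_budget(ids)]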