Example #1
    def __init__(
        self,
        src,
        src_sizes,
        tgt=None,
        tgt_sizes=None,
        dictionary=None,
        left_pad_source=False,
        left_pad_target=False,
        shuffle=True,
        input_feeding=True,
        num_buckets=0,
    ):
        self.src = src
        self.tgt = tgt
        self.src_sizes = np.array(src_sizes)
        self.tgt_sizes = np.array(tgt_sizes) if tgt_sizes is not None else None
        self.dictionary = dictionary
        self.left_pad_source = left_pad_source
        self.left_pad_target = left_pad_target
        self.shuffle = shuffle
        self.input_feeding = input_feeding
        if self.tgt is not None:
            # align src and tgt by utterance ID, dropping entries present in only one
            self._match_src_tgt()

        if num_buckets > 0:
            from espresso.data import FeatBucketPadLengthDataset, TextBucketPadLengthDataset
            self.src = FeatBucketPadLengthDataset(
                self.src,
                sizes=self.src_sizes,
                num_buckets=num_buckets,
                pad_idx=0.0,
                left_pad=False,
            )
            self.src_sizes = self.src.sizes
            logger.info('bucketing source lengths: {}'.format(
                list(self.src.buckets)))
            if self.tgt is not None:
                self.tgt = TextBucketPadLengthDataset(
                    self.tgt,
                    sizes=self.tgt_sizes,
                    num_buckets=num_buckets,
                    pad_idx=self.dictionary.pad(),
                    left_pad=False,
                )
                self.tgt_sizes = self.tgt.sizes
                logger.info('bucketing target lengths: {}'.format(
                    list(self.tgt.buckets)))

            # determine bucket sizes using self.num_tokens, which will return
            # the padded lengths (thanks to FeatBucketPadLengthDataset)
            num_tokens = np.vectorize(self.num_tokens, otypes=[np.int64])  # np.long was removed in newer NumPy
            self.bucketed_num_tokens = num_tokens(np.arange(len(self.src)))
            self.buckets = [
                (None, num_tokens)
                for num_tokens in np.unique(self.bucketed_num_tokens)
            ]
        else:
            self.buckets = None
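The bucket computation at the end of this example is easier to follow in isolation. Below is a minimal, self-contained sketch of the same steps in plain NumPy; padded_lengths is a made-up stand-in for what self.num_tokens would report once FeatBucketPadLengthDataset has padded each item up to its bucket length.

    import numpy as np

    # Hypothetical padded lengths, as self.num_tokens would report them
    # after bucketing (each item padded up to its bucket's length).
    padded_lengths = np.array([8, 8, 16, 16, 16, 32])
    num_tokens = np.vectorize(lambda i: padded_lengths[i], otypes=[np.int64])
    bucketed_num_tokens = num_tokens(np.arange(len(padded_lengths)))
    # One (None, length) entry per distinct padded length, sorted ascending:
    # here, buckets for padded lengths 8, 16, and 32.
    buckets = [(None, n) for n in np.unique(bucketed_num_tokens)]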
Example #2
    def __init__(
        self,
        src,
        src_sizes,
        tgt=None,
        tgt_sizes=None,
        text=None,
        shuffle=True,
        num_buckets=0,
        pad_to_multiple=1,
    ):
        self.src = src
        self.tgt = tgt
        self.src_sizes = np.array(src_sizes)
        self.tgt_sizes = np.array(tgt_sizes) if tgt_sizes is not None else None
        self.text = text
        self.shuffle = shuffle
        self.epoch = 1
        num_before_matching = len(self.src.utt_ids)
        if self.tgt is not None:
            self._match_src_tgt()
        if self.text is not None:
            # align src and text; if any utterances were dropped, re-align src and tgt
            changed = self._match_src_text()
            if self.tgt is not None and changed:
                self._match_src_tgt()
        num_after_matching = len(self.src.utt_ids)
        num_removed = num_before_matching - num_after_matching
        if num_removed > 0:
            logger.warning(
                "Removed {} examples due to empty numerator graphs or missing entries, "
                "{} remaining".format(num_removed, num_after_matching))
        self.sizes = (
            np.vstack((self.src_sizes, self.tgt_sizes)).T
            if self.tgt_sizes is not None
            else self.src_sizes
        )

        if num_buckets > 0:
            from espresso.data import FeatBucketPadLengthDataset
            self.src = FeatBucketPadLengthDataset(
                self.src,
                sizes=self.src_sizes,
                num_buckets=num_buckets,
                pad_idx=0.0,
                left_pad=False,
            )
            self.src_sizes = self.src.sizes
            logger.info("bucketing source lengths: {}".format(
                list(self.src.buckets)))

            # determine bucket sizes using self.num_tokens, which will return
            # the padded lengths (thanks to FeatBucketPadLengthDataset)
            num_tokens = np.vectorize(self.num_tokens, otypes=[np.int64])  # np.long was removed in newer NumPy
            self.bucketed_num_tokens = num_tokens(np.arange(len(self.src)))
            self.buckets = [
                (None, num_tokens)
                for num_tokens in np.unique(self.bucketed_num_tokens)
            ]
        else:
            self.buckets = None
        self.pad_to_multiple = pad_to_multiple
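This variant also stacks per-utterance source and target lengths into a single sizes array. A short sketch of that step with made-up lengths for three utterances:

    import numpy as np

    src_sizes = np.array([10, 12, 9])   # e.g. frames per utterance (made up)
    tgt_sizes = np.array([3, 4, 2])     # e.g. tokens per utterance (made up)
    # Row i holds (src_len, tgt_len) for utterance i; resulting shape is (3, 2).
    sizes = np.vstack((src_sizes, tgt_sizes)).T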
Example #3
    def __init__(
        self,
        src,
        src_sizes,
        tgt: Optional[AliScpCachedDataset] = None,
        tgt_sizes=None,
        text=None,
        shuffle=True,
        num_buckets=0,
        pad_to_multiple=1,
        seed=1,
        chunk_width=None,
        chunk_left_context=None,
        chunk_right_context=None,
        label_delay=0,
        random_chunking=True,
    ):
        self.src = src
        self.tgt = tgt
        self.src_sizes = np.array(src_sizes)
        self.tgt_sizes = np.array(tgt_sizes) if tgt_sizes is not None else None
        self.text = text
        self.shuffle = shuffle
        self.seed = seed
        self.epoch = 1
        assert chunk_width is None or chunk_width > 0
        self.chunk_width = chunk_width
        assert chunk_left_context >= 0 and chunk_right_context >= 0
        self.chunk_left_context = chunk_left_context
        self.chunk_right_context = chunk_right_context
        assert (label_delay < 0 and -label_delay <= chunk_right_context) or (
            label_delay >= 0 and (chunk_width is None or label_delay < chunk_width)
        )
        self.label_delay = label_delay
        self.random_chunking = random_chunking
        if self.tgt is not None:
            self._match_src_tgt()
        if self.text is not None:
            changed = self._match_src_text()
            if self.tgt is not None and changed:
                self._match_src_tgt()
        self.sizes = (
            np.vstack((self.src_sizes, self.tgt_sizes)).T
            if self.tgt_sizes is not None
            else self.src_sizes
        )

        if chunk_width is not None:
            # remove utterances whose lengths are shorter than chunk_width
            indices = np.flatnonzero(self.src.sizes >= chunk_width)
            if len(indices) < self.src.size:
                logger.warning(
                    "Removing {} examples whose lengths are shorter than chunk_width={}"
                    .format(self.src.size - len(indices), chunk_width))
                self.src.filter_and_reorder(indices)
                if self.tgt is not None:
                    self.tgt.filter_and_reorder(indices)
                if self.text is not None:
                    self.text.filter_and_reorder(indices)
                logger.warning("Done removal. {} examples remaining".format(
                    len(indices)))

        if num_buckets > 0:
            from fairseq.data import BucketPadLengthDataset
            from espresso.data import FeatBucketPadLengthDataset
            self.src = FeatBucketPadLengthDataset(
                self.src,
                sizes=self.src_sizes,
                num_buckets=num_buckets,
                pad_idx=0.0,
                left_pad=False,
            )
            self.src_sizes = self.src.sizes
            logger.info("bucketing source lengths: {}".format(
                list(self.src.buckets)))
            if self.tgt is not None:
                # NOTE: self.dictionary is not set in this __init__; the call
                # below assumes it is assigned elsewhere on the class.
                self.tgt = BucketPadLengthDataset(
                    self.tgt,
                    sizes=self.tgt_sizes,
                    num_buckets=num_buckets,
                    pad_idx=self.dictionary.pad(),
                    left_pad=False,
                )
                self.tgt_sizes = self.tgt.sizes
                logger.info("bucketing target lengths: {}".format(
                    list(self.tgt.buckets)))

            # determine bucket sizes using self.num_tokens, which will return
            # the padded lengths (thanks to FeatBucketPadLengthDataset)
            num_tokens = np.vectorize(self.num_tokens, otypes=[np.int64])  # np.long was removed in newer NumPy
            self.bucketed_num_tokens = num_tokens(np.arange(len(self.src)))
            self.buckets = [
                (None, num_tokens)
                for num_tokens in np.unique(self.bucketed_num_tokens)
            ]
        else:
            self.buckets = None
        self.pad_to_multiple = pad_to_multiple
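The chunk-length filter is the main addition in this variant. A small self-contained sketch of that step with hypothetical per-utterance sizes; in the example above, the kept indices are then applied via each dataset's filter_and_reorder:

    import numpy as np

    sizes = np.array([120, 45, 300, 80])  # hypothetical per-utterance frame counts
    chunk_width = 100
    # Indices of utterances at least chunk_width frames long; shorter ones
    # cannot yield a full chunk and are dropped (here keep == array([0, 2])).
    keep = np.flatnonzero(sizes >= chunk_width)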