Example #1
    def get_vocab(self, size_gb=None):
        """Build the vocabulary file from the BPE-ed training shards.

        Does nothing when ``self.vocab`` already exists on disk. When
        ``size_gb`` is given, each language contributes an equal share of
        roughly ``size_gb`` GB in total; otherwise all data is used.

        Args:
            size_gb: optional total data budget in gigabytes used to cap
                how many lines are selected per language.
        """
        # Skip the (expensive) recomputation if a vocab file already exists.
        if self.vocab.is_file():
            print("vocab already exists.")
            return

        print("get vocab ...")
        if size_gb is None:
            nlines = None
        else:
            # Split the byte budget evenly across languages, then convert
            # each language's share into a line count using its recorded
            # per-language statistics (index 0: lines, index 1: bytes —
            # presumably; verify against where self.sizes is filled).
            share_gb = size_gb / len(self.langs)
            nlines = []
            for lang in self.langs:
                stats = self.sizes[lang.l]
                nlines.append(int(stats[0] * share_gb * 1024**3 / stats[1]))
        # get vocab only from a subset of 40GB (20 each lang) of the bpe-ed train set
        data_get_vocab = self.folder.joinpath(
            f'train{self.suffix}.bpe.{size_gb}GB')
        print(f"regroup and select data in {data_get_vocab} to get vocab ...")
        # One glob (over the 8 BPE shards) per language, in language order.
        shard_globs = [
            self.folder.glob(f'{lang.l}.train{self.suffix}.[01234567].bpe')
            for lang in self.langs
        ]
        regroup_and_select_data(
            files=shard_globs,
            nlines=nlines,
            output=data_get_vocab,
        )
        print(f"computing vocab on {data_get_vocab}...")
        get_vocab_file(data_get_vocab, self.vocab)
Example #2
    def train_bpe(self, ncodes, size_gb=None):
        """Learn ``ncodes`` BPE codes from the tokenized training shards.

        Does nothing when ``self.codes`` already exists on disk. When
        ``size_gb`` is given, each language contributes an equal share of
        roughly ``size_gb`` GB in total; otherwise all data is used.

        Args:
            ncodes: number of BPE merge operations to learn.
            size_gb: optional total data budget in gigabytes used to cap
                how many lines are selected per language.
        """
        # Skip the (expensive) training if the codes file already exists.
        if self.codes.is_file():
            print("bpe codes already exists.")
            return

        print("train bpe ...")
        if size_gb is None:
            nlines = None
        else:
            # Split the byte budget evenly across languages, then convert
            # each language's share into a line count using its recorded
            # per-language statistics (index 0: lines, index 1: bytes —
            # presumably; verify against where self.sizes is filled).
            share_gb = size_gb / len(self.langs)
            nlines = []
            for lang in self.langs:
                stats = self.sizes[lang.l]
                nlines.append(int(stats[0] * share_gb * 1024**3 / stats[1]))
            # NOTE(review): this message indexes self.langs[0..2] — it
            # assumes exactly three languages are configured.
            print(
                f"we need to regroup {nlines} lines for {self.langs[0].l} {self.langs[1].l} and {self.langs[2].l} to gather {size_gb} Go"
            )
        # train bpe on only 50 GB (25 each lang) of the tokenized train set
        data_train_bpe = self.folder.joinpath(
            f'train{self.suffix}.tok.{size_gb}GB')
        print(
            f"regroup and select data for training bpe in {data_train_bpe} ..."
        )
        # One glob (over the 8 tokenized shards) per language, in language order.
        shard_globs = [
            lang.folder.glob(f'train{self.suffix}.[01234567].tok')
            for lang in self.langs
        ]
        regroup_and_select_data(
            files=shard_globs,
            nlines=nlines,
            output=data_train_bpe,
        )

        print(f"training bpe on {data_train_bpe}...")
        learn_bpe_file(data_train_bpe, ncodes, self.codes)