def get_vocab(self, size_gb=None):
    if self.vocab.is_file():
        print("vocab already exists.")
        return
    print("get vocab ...")
    if size_gb is None:
        nlines = None
    else:
        # split the size_gb budget evenly across languages, then convert each
        # language's byte budget into a line count using its (nlines, nbytes) stats
        size_gb_ = size_gb / len(self.langs)
        nlines = [
            int(self.sizes[l.l][0] * size_gb_ * 1024**3 / self.sizes[l.l][1])
            for l in self.langs
        ]
    # compute the vocab on a size_gb subset of the BPE-ed train set,
    # split evenly across languages
    data_get_vocab = self.folder.joinpath(f'train{self.suffix}.bpe.{size_gb}GB')
    print(f"regroup and select data in {data_get_vocab} to get vocab ...")
    regroup_and_select_data(
        files=[
            self.folder.glob(f'{l.l}.train{self.suffix}.[01234567].bpe')
            for l in self.langs
        ],
        nlines=nlines,
        output=data_get_vocab)
    print(f"computing vocab on {data_get_vocab}...")
    get_vocab_file(data_get_vocab, self.vocab)
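# A worked example of the nlines arithmetic above (the concrete numbers are
# illustrative assumptions, not values from this repo): self.sizes maps a
# language to (total_lines, total_bytes). With 2 languages and size_gb=40,
# each language gets size_gb_ = 20 GB. If
# self.sizes['python'] == (100_000_000, 200 * 1024**3), i.e. 100M lines
# occupying 200 GB, its share is
#     int(100_000_000 * 20 * 1024**3 / (200 * 1024**3)) = 10_000_000
# lines: each language's line count is scaled by the fraction of its bytes
# that its share of the budget represents.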
def train_bpe(self, ncodes, size_gb=None):
    if self.codes.is_file():
        print("bpe codes already exist.")
        return
    print("train bpe ...")
    if size_gb is None:
        nlines = None
    else:
        # same per-language budget computation as in get_vocab
        size_gb_ = size_gb / len(self.langs)
        nlines = [
            int(self.sizes[l.l][0] * size_gb_ * 1024**3 / self.sizes[l.l][1])
            for l in self.langs
        ]
        print(
            f"we need to regroup {nlines} lines for "
            f"{', '.join(l.l for l in self.langs)} to gather {size_gb} GB"
        )
    # train bpe on a size_gb subset of the tokenized train set,
    # split evenly across languages
    data_train_bpe = self.folder.joinpath(f'train{self.suffix}.tok.{size_gb}GB')
    print(f"regroup and select data for training bpe in {data_train_bpe} ...")
    regroup_and_select_data(
        files=[
            l.folder.glob(f'train{self.suffix}.[01234567].tok')
            for l in self.langs
        ],
        nlines=nlines,
        output=data_train_bpe)
    print(f"training bpe on {data_train_bpe}...")
    learn_bpe_file(data_train_bpe, ncodes, self.codes)
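# A minimal usage sketch, assuming a hypothetical `Dataset` wrapper exposing
# these methods (the constructor and the BPE-application step are assumptions,
# not shown in this file):
#
#     dataset = Dataset(...)  # provides folder, langs, suffix, sizes, ...
#     dataset.train_bpe(ncodes=50000, size_gb=50)  # learn codes from .tok shards
#     # ... apply self.codes to the .tok shards to produce the .bpe shards ...
#     dataset.get_vocab(size_gb=40)  # compute the vocab from .bpe shards
#
# Ordering matters: get_vocab reads the `{lang}.train{suffix}.[01234567].bpe`
# shards, so the BPE codes must be learned and applied before it runs.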