import os

from sentencepiece import SentencePieceTrainer


def train(args, inputs, lang, tgt=False):
    """Trains a SentencePiece model for `lang` from `inputs`, reading the
    source- or target-side hyperparameters from `args` depending on `tgt`."""
    spm_dir = args.spm_dir
    if not os.path.exists(spm_dir):
        os.makedirs(spm_dir)
    # Select the args.src_* or args.tgt_* hyperparameters,
    # e.g. args.src_vocab_size vs. args.tgt_vocab_size.
    train_config = {
        k: getattr(args, ("tgt" if tgt else "src") + "_" + k)
        for k in [
            "vocab_size",
            "character_coverage",
            "byte_fallback",
        ]
    }
    SentencePieceTrainer.train(
        input=inputs,
        model_prefix=os.path.join(spm_dir, lang),
        **train_config,
    )
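A minimal way to invoke train(), assuming an argparse-style namespace; the field values and the corpus path below are illustrative, not from the source:

from types import SimpleNamespace

args = SimpleNamespace(
    spm_dir="spm",
    src_vocab_size=8000,
    src_character_coverage=0.9995,
    src_byte_fallback=True,
)
train(args, inputs=["corpus.en.txt"], lang="en")
# writes spm/en.model and spm/en.vocab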
from sentencepiece import SentencePieceProcessor, SentencePieceTrainer


def spm(name, path, size=8192, bos=2, eos=1, unk=0, coverage=0.9995):
    """-> SentencePieceProcessor

    Trains a SentencePiece model of `size` from the text file at `path`,
    saves it under `name`, and returns the loaded processor.
    """
    SentencePieceTrainer.train(
        f"--model_prefix={name} "
        f"--input={path} "
        f"--vocab_size={size} "
        f"--bos_id={bos} "
        f"--eos_id={eos} "
        f"--unk_id={unk} "
        f"--unk_surface=☹ "
        f"--character_coverage={coverage}"
    )
    # Load the model that was just written to `{name}.model`,
    # so the function matches its documented return type.
    return SentencePieceProcessor(model_file=f"{name}.model")
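spm() returns the loaded processor, so the trained model can be used directly; the corpus file name here is a placeholder:

sp = spm("mymodel", "corpus.txt")  # writes mymodel.model / mymodel.vocab
print(sp.encode("Hello world", out_type=str))  # subword pieces
print(sp.encode("Hello world"))                # piece ids (default out_type)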
import os

from sentencepiece import SentencePieceTrainer


def train_sentencepiece_tokenizer(sentences: list,
                                  vocab_size: int,
                                  folder_name: str = "sentencepiece",
                                  model_name: str = "tokenizer_de") -> None:
    '''Trains a SentencePiece tokenizer on a given corpus.

    Args:
        sentences: contains all sentences of a corpus.
        vocab_size: maximum number of (sub-)words in the tokenizer's vocabulary.
        folder_name: name of the folder the trained tokenizer will be placed in.
        model_name: filename of the trained SentencePiece tokenizer.
    '''
    temp_file = "sentences.txt"  # deleted once training is done
    if folder_name != "":
        os.makedirs(folder_name, exist_ok=True)  # SentencePiece won't create it
        output_file = folder_name + "/" + model_name
    else:
        output_file = model_name

    # write all sentences to a temporary file
    with open(temp_file, 'w', encoding='utf-8') as f:
        for sentence in sentences:
            f.write(sentence + "\n")

    parameters = (
        f"--input={temp_file} "
        f"--model_prefix={output_file} "
        f"--vocab_size={vocab_size} "
        "--bos_id=2 --eos_id=3 --unk_id=1 --pad_id=0 "
        "--bos_piece=<s> --eos_piece=</s> "
        "--hard_vocab_limit=false"
    )

    # train tokenizer on our corpus
    SentencePieceTrainer.train(parameters)

    # delete temp_file
    os.remove(temp_file)
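A quick sanity check on a toy in-memory corpus; the sentences are made up, and --hard_vocab_limit=false lets training succeed even when the corpus cannot fill the requested vocabulary:

sentences = ["Das ist ein Test.", "Noch ein Satz für das Training."]
train_sentencepiece_tokenizer(sentences, vocab_size=100, folder_name="")
# writes tokenizer_de.model and tokenizer_de.vocab to the working directory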
from sentencepiece import SentencePieceTrainer


def train_sp_model(text_file):
    # SentencePieceTrainer.train returns None; the trained model is written
    # to m.model and m.vocab (per model_prefix), so there is nothing to assign.
    SentencePieceTrainer.train(input=text_file,
                               vocab_size=32000,
                               model_type='word',
                               hard_vocab_limit=False,
                               model_prefix='m')
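To actually tokenize with the result, load the written model file with SentencePieceProcessor (the corpus path is a placeholder):

from sentencepiece import SentencePieceProcessor

train_sp_model("corpus.txt")
sp = SentencePieceProcessor(model_file="m.model")
print(sp.encode("hello world", out_type=str))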
from pathlib import Path

from sentencepiece import SentencePieceTrainer

paths = [str(x) for x in Path("./data/").glob("**/image1_train.csv")]

# SentencePiece does not create the output directory itself.
Path("model/ispbpe").mkdir(parents=True, exist_ok=True)

# Customize training
SentencePieceTrainer.train(input=paths,
                           model_prefix='model/ispbpe/spiece',
                           vocab_size=21_128,
                           user_defined_symbols=[])
# (preceded by argparse flags such as --train, --src, --tgt,
#  --data_dir, --model_dir, --vocab_size, --character_coverage)
parser.add_argument('--encode', action='store_const',
                    const=True, default=False)
parser.add_argument('--decode', action='store_const',
                    const=True, default=False)
args = parser.parse_args()

if args.train:
    SentencePieceTrainer.train(
        input=[
            args.data_dir + 'train.' + args.src,
            args.data_dir + 'train.' + args.tgt,
        ],
        model_prefix=args.model_dir + 'sentencepiece.bpe',
        vocab_size=args.vocab_size,
        character_coverage=args.character_coverage,
        accept_language=[args.src, args.tgt],
        model_type='bpe')

if args.encode:
    model = SentencePieceProcessor(
        model_file=args.model_dir + 'sentencepiece.bpe.model')
    for split in ['train', 'dev', 'test']:
        for ext in [args.src, args.tgt]:
            try:
                # https://github.com/google/sentencepiece/issues/508
                # (the body below is a sketch completing a truncated block;
                #  the '.bpe.' output naming is an assumption)
                with open(args.data_dir + split + '.' + ext,
                          encoding='utf-8') as f:
                    pieces = model.encode(f.read().splitlines(),
                                          out_type=str)
                with open(args.data_dir + split + '.bpe.' + ext, 'w',
                          encoding='utf-8') as f:
                    f.writelines(' '.join(p) + '\n' for p in pieces)
            except FileNotFoundError:
                continue  # some splits may be absent
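The script also parses a --decode flag whose branch is not shown; a minimal sketch of what it could look like, reusing the hypothetical '.bpe.' file naming from the encode step:

if args.decode:
    model = SentencePieceProcessor(
        model_file=args.model_dir + 'sentencepiece.bpe.model')
    with open(args.data_dir + 'test.bpe.' + args.tgt, encoding='utf-8') as f:
        for line in f:
            # decode_pieces joins subword pieces back into detokenized text
            print(model.decode_pieces(line.split()))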
from pathlib import Path

from sentencepiece import SentencePieceTrainer

paths = [str(x) for x in Path("./data/").glob("**/*train.csv")]

# SentencePiece does not create the output directory itself.
Path("model/spbpe").mkdir(parents=True, exist_ok=True)

# Customize training
SentencePieceTrainer.train(input=paths,
                           model_prefix='model/spbpe/spiece',
                           vocab_size=21_128,
                           user_defined_symbols=[])
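Once training finishes, the model loads like any other SentencePiece model:

from sentencepiece import SentencePieceProcessor

sp = SentencePieceProcessor(model_file='model/spbpe/spiece.model')
print(sp.encode("an example sentence", out_type=str))
print(sp.vocab_size())  # 21128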