def _create_default_tokenizer(store: BaseConfig) -> BaseTokenizerTrainer:
    """Build the default tokenizer trainer for a config.

    A ``vocab_size`` of 0 selects the character-level tokenizer; any other
    value selects the SentencePiece tokenizer, forwarding the relevant
    training knobs from the config.
    """
    if store.vocab_size == 0:
        logging.info("Loading CharTokenizerTrainer")
        return CharTokenizerTrainer(config=store)

    logging.info("Loading SentencePieceTokenizerTrainer")
    return SentencePieceTokenizerTrainer(
        vocab_size=store.vocab_size,
        character_coverage=store.character_coverage,
        pretrain_sentence_count=store.pretrain_sentence_count,
        max_line_len=store.max_line_len,
        config=store,
    )
def test_train_batch_char_tok(train_df, tmp_path):
    """End-to-end batch training with an explicit CharTokenizerTrainer.

    Verifies that the persisted tokenizer settings record the char tokenizer
    type, and that generation produces exactly the requested number of lines.
    """
    config = TensorFlowConfig(
        epochs=5,
        field_delimiter=",",
        checkpoint_dir=tmp_path,
        input_data_path=PATH_HOLDER,
        learning_rate=0.01,
    )
    batcher = DataFrameBatch(
        df=train_df,
        config=config,
        tokenizer=CharTokenizerTrainer(config=config),
    )
    batcher.create_training_data()
    batcher.train_all_batches()

    # Use Path.read_text() so the file handle is closed deterministically;
    # the original `open(...).read()` leaked an open handle.
    settings_path = tmp_path / "batch_0" / BaseTokenizerTrainer.settings_fname
    tok_params = json.loads(settings_path.read_text())
    assert tok_params["tokenizer_type"] == CharTokenizerTrainer.__name__

    batcher.generate_all_batch_lines(num_lines=100, max_invalid=5000)
    syn_df = batcher.batches_to_df()
    assert syn_df.shape[0] == 100