Example #1
import logging

# Imports assume the gretel-synthetics package layout.
from gretel_synthetics.config import BaseConfig
from gretel_synthetics.tokenizers import (
    BaseTokenizerTrainer,
    CharTokenizerTrainer,
    SentencePieceTokenizerTrainer,
)


def _create_default_tokenizer(store: BaseConfig) -> BaseTokenizerTrainer:
    """
    Create a default tokenizer trainer. If ``store.vocab_size == 0``, use a
    CharTokenizerTrainer; otherwise use a SentencePieceTokenizerTrainer.
    """
    if store.vocab_size == 0:
        logging.info("Loading CharTokenizerTrainer")
        trainer = CharTokenizerTrainer(config=store)
    else:
        logging.info("Loading SentencePieceTokenizerTrainer")
        trainer = SentencePieceTokenizerTrainer(
            vocab_size=store.vocab_size,
            character_coverage=store.character_coverage,
            pretrain_sentence_count=store.pretrain_sentence_count,
            max_line_len=store.max_line_len,
            config=store,
        )
    return trainer
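
A minimal usage sketch of the branch above, assuming TensorFlowConfig (a BaseConfig subclass from the same package) and hypothetical file paths; only vocab_size drives the choice:

from gretel_synthetics.config import TensorFlowConfig

char_config = TensorFlowConfig(
    input_data_path="data.txt",    # hypothetical path
    checkpoint_dir="checkpoints",  # hypothetical path
    vocab_size=0,                  # 0 selects CharTokenizerTrainer
)
sp_config = TensorFlowConfig(
    input_data_path="data.txt",
    checkpoint_dir="checkpoints",
    vocab_size=20000,              # nonzero selects SentencePieceTokenizerTrainer
)

assert isinstance(_create_default_tokenizer(char_config), CharTokenizerTrainer)
assert isinstance(_create_default_tokenizer(sp_config), SentencePieceTokenizerTrainer)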
Example #2
import json

# Imports assume the gretel-synthetics package layout; PATH_HOLDER is a
# placeholder constant defined elsewhere in the test module.
from gretel_synthetics.batch import DataFrameBatch
from gretel_synthetics.config import TensorFlowConfig
from gretel_synthetics.tokenizers import BaseTokenizerTrainer, CharTokenizerTrainer


def test_train_batch_char_tok(train_df, tmp_path):
    # ``train_df`` is a DataFrame fixture from the suite's conftest;
    # ``tmp_path`` is pytest's built-in temporary-directory fixture.
    config = TensorFlowConfig(epochs=5,
                              field_delimiter=",",
                              checkpoint_dir=tmp_path,
                              input_data_path=PATH_HOLDER,
                              learning_rate=0.01)
    batcher = DataFrameBatch(df=train_df,
                             config=config,
                             tokenizer=CharTokenizerTrainer(config=config))
    batcher.create_training_data()
    batcher.train_all_batches()

    # Verify the tokenizer settings written for the first batch.
    settings_path = tmp_path / "batch_0" / BaseTokenizerTrainer.settings_fname
    tok_params = json.loads(settings_path.read_text())
    assert tok_params["tokenizer_type"] == CharTokenizerTrainer.__name__

    # Generate synthetic lines and confirm the requested row count comes back.
    batcher.generate_all_batch_lines(num_lines=100, max_invalid=5000)
    syn_df = batcher.batches_to_df()
    assert syn_df.shape[0] == 100
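
Taken together, the two examples hint at the end-to-end flow: when no tokenizer trainer is passed explicitly, a default one is presumably derived from the config, so a vocab_size of 0 should yield character-level tokenization. The sketch below is hypothetical glue code under that assumption; the toy DataFrame and paths are placeholders:

import pandas as pd

from gretel_synthetics.batch import DataFrameBatch
from gretel_synthetics.config import TensorFlowConfig

toy_df = pd.DataFrame({"name": ["alice", "bob"], "score": [1, 2]})  # toy data

config = TensorFlowConfig(
    epochs=1,
    field_delimiter=",",
    checkpoint_dir="checkpoints",   # hypothetical path
    input_data_path="placeholder",  # the batcher manages per-batch inputs
    vocab_size=0,                   # request character-level tokenization
)

# No tokenizer= argument: training is assumed to fall back to
# _create_default_tokenizer, which resolves to CharTokenizerTrainer here.
batcher = DataFrameBatch(df=toy_df, config=config)
batcher.create_training_data()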