Example 1
def test_local_config_missing_attrs():
    with pytest.raises(AttributeError):
        TensorFlowConfig()

    with pytest.raises(AttributeError):
        TensorFlowConfig(checkpoint_dir="foo")

    with pytest.raises(AttributeError):
        TensorFlowConfig(input_data_path="foo")
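These excerpts are lifted from the gretel-synthetics test suite and omit the module-level setup. Below is a minimal sketch of the imports, constants, and fixtures the later examples lean on; the module paths and fixture bodies are assumptions, not verbatim from the source. (Example 13 additionally needs the Settings/GenerationError generation helpers and a stub tokenizer, and the mkdir argument in Examples 6 and 7 is a mock patched over directory creation.)

import json
import os
import shutil
import uuid
from pathlib import Path

import pandas as pd
import pytest

from gretel_synthetics import const
from gretel_synthetics.batch import DataFrameBatch
from gretel_synthetics.config import EpochState, TensorFlowConfig
from gretel_synthetics.const import METRIC_LOSS, METRIC_VAL_LOSS
from gretel_synthetics.tokenizers import (
    BaseTokenizerTrainer,
    CharTokenizerTrainer,
    SentencePieceTokenizerTrainer,
)

PATH_HOLDER = "pretend/path"  # placeholder; DataFrameBatch supplies the real training data
_tok_gen_count = 50           # how many synthetic lines the generation tests expect


@pytest.fixture
def train_df():
    # Any moderately sized DataFrame works; the real suite loads a sample CSV.
    return pd.DataFrame({"name": ["alice", "bob"] * 100, "value": range(200)})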
Example 2
def test_local_config_save_model_params():
    test_data_dir = Path(__file__).parent
    target = test_data_dir / uuid.uuid4().hex
    test_data_file = test_data_dir / "data" / "smol.txt"
    lc = TensorFlowConfig(checkpoint_dir=target.as_posix(),
                          input_data_path=test_data_file.as_posix())
    check = lc.save_model_params()
    assert json.loads(open(check).read())
    shutil.rmtree(target)
Example 3
def test_train_batch_sp_tok(train_df, tmp_path):
    config = TensorFlowConfig(
        epochs=5,
        field_delimiter=",",
        checkpoint_dir=tmp_path,
        input_data_path=PATH_HOLDER,
        learning_rate=.01
    )
    tokenizer = SentencePieceTokenizerTrainer(
        vocab_size=10000,
        config=config
    )
    batcher = DataFrameBatch(
        df=train_df,
        config=config,
        tokenizer=tokenizer
    )
    batcher.create_training_data()
    batcher.train_all_batches()

    batcher.generate_all_batch_lines(num_lines=_tok_gen_count, max_invalid=5000)
    syn_df = batcher.batches_to_df()
    assert syn_df.shape[0] == _tok_gen_count

    # Generate with a RecordFactory
    factory = batcher.create_record_factory(num_lines=_tok_gen_count, max_invalid=5000)
    syn_df = factory.generate_all(output="df")
    assert syn_df.shape[0] == _tok_gen_count
    assert list(syn_df.columns) == list(train_df.columns)
    assert factory.summary["valid_count"] == _tok_gen_count
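Two generation paths are exercised there: the batch-level generate_all_batch_lines followed by batches_to_df, and the RecordFactory returned by create_record_factory. In both, max_invalid caps how many malformed lines generation will tolerate before erroring out, and the factory keeps its own counters. A trivial follow-on sketch using only the calls shown above:

factory = batcher.create_record_factory(num_lines=10, max_invalid=100)
df = factory.generate_all(output="df")
print(factory.summary)  # counters include "valid_count", asserted above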
Example 4
def test_epoch_callback(train_df, tmp_path):
    def epoch_callback(s: EpochState):
        with open(tmp_path / 'callback_dump.txt', 'a') as f:
            f.write(f'{s.epoch},{s.accuracy},{s.loss},{s.batch}\n')

    config = TensorFlowConfig(epochs=5,
                              field_delimiter=",",
                              checkpoint_dir=tmp_path,
                              input_data_path=PATH_HOLDER,
                              learning_rate=.01,
                              epoch_callback=epoch_callback)
    tokenizer = SentencePieceTokenizerTrainer(vocab_size=10000, config=config)
    batcher = DataFrameBatch(batch_size=4,
                             df=train_df,
                             config=config,
                             tokenizer=tokenizer)
    batcher.create_training_data()
    batcher.train_all_batches()
    with open(tmp_path / 'callback_dump.txt', 'r') as f:
        lines = f.readlines()
        assert len(lines) == 20
        for i, line in enumerate(lines):
            fields = line.strip().split(',')
            assert len(fields) == 4
            assert int(fields[0]) == i % 5
            assert int(fields[3]) == i // 5
            # accuracy and loss must still parse as floats (raises ValueError if not)
            float(fields[1])
            float(fields[2])
    os.remove(tmp_path / 'callback_dump.txt')
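The callback contract in that test is simply a callable taking an EpochState; here is a minimal sketch that collects metrics in memory instead of appending to a file (the attribute names come from the test above, everything else is illustrative):

history = []

def collect_metrics(state: EpochState):
    # EpochState carries per-epoch training metrics plus the batch index
    history.append((state.batch, state.epoch, state.loss, state.accuracy))

config = TensorFlowConfig(epochs=5,
                          field_delimiter=",",
                          checkpoint_dir="checkpoints",
                          input_data_path=PATH_HOLDER,
                          epoch_callback=collect_metrics)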
Example 5
def test_bad_epoch_callback(tmp_path):
    with pytest.raises(ValueError) as err:
        config = TensorFlowConfig(epochs=1,
                                  field_delimiter=",",
                                  checkpoint_dir=tmp_path,
                                  input_data_path=PATH_HOLDER,
                                  epoch_callback=1)
    assert "must be a callable" in str(err)
Example 6
def test_local_config(mkdir):
    target = uuid.uuid4().hex
    test_data_dir = Path(__file__).parent
    test_data_file = test_data_dir / "data" / "smol.txt"
    lc = TensorFlowConfig(checkpoint_dir=target, input_data_path=test_data_file.as_posix())

    mkdir.assert_called()
    assert lc.epochs == 100
    assert lc.input_data_path == test_data_file.as_posix()
    assert lc.training_data_path == Path(target, "training_data.txt").as_posix()
Example 7
def test_local_config_settings(mkdir):
    lc = TensorFlowConfig(checkpoint_dir="foo", input_data_path="bar")
    check = lc.as_dict()
    assert check == {
        "max_lines": 0,
        "epochs": 100,
        "epoch_callback": None,
        "early_stopping": True,
        "early_stopping_patience": 5,
        "validation_split": True,
        "best_model_metric": METRIC_VAL_LOSS,
        "batch_size": 64,
        "buffer_size": 10000,
        "seq_length": 100,
        "embedding_dim": 256,
        "rnn_units": 256,
        "dropout_rate": 0.2,
        "rnn_initializer": "glorot_uniform",
        "vocab_size": 20000,
        "character_coverage": 1.0,
        "pretrain_sentence_count": 1000000,
        "dp": False,
        "learning_rate": 0.01,
        "dp_noise_multiplier": 0.1,
        "dp_l2_norm_clip": 3.0,
        "dp_microbatches": 64,
        "gen_temp": 1.0,
        "gen_chars": 0,
        "gen_lines": 1000,
        "max_line_len": 2048,
        "save_all_checkpoints": False,
        "save_best_model": True,
        "checkpoint_dir": "foo",
        "field_delimiter": None,
        "field_delimiter_token": "<d>",
        "overwrite": False,
        "input_data_path": "bar",
        "predict_batch_size": 64,
        "reset_states": True,
        "training_data_path": "foo/training_data.txt",
        "model_type": "TensorFlowConfig"
    }
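Most of those keys map one-to-one onto constructor keywords (training_data_path and model_type are derived), so overriding a default is a one-liner. A hedged sketch with illustrative values:

lc = TensorFlowConfig(checkpoint_dir="foo",
                      input_data_path="bar",
                      rnn_units=512,            # wider network
                      dp=True,                  # turn on differential privacy
                      dp_noise_multiplier=0.5)
assert lc.as_dict()["rnn_units"] == 512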
Example 8
def test_train_small_df(train_df, tmp_path):
    small_df = train_df.sample(n=50)
    config = TensorFlowConfig(epochs=5,
                              field_delimiter=",",
                              checkpoint_dir=tmp_path,
                              input_data_path=PATH_HOLDER)
    batcher = DataFrameBatch(df=small_df, config=config)
    batcher.create_training_data()
    with pytest.raises(RuntimeError) as excinfo:
        batcher.train_all_batches()
    assert "Model training failed" in str(excinfo.value)
Example 9
@pytest.fixture
def tf_config():
    # test_data_dir is a module-level Path to the suite's data directory
    target = test_data_dir / "ckpoint"
    input_data = test_data_dir / "data" / "smol.txt"
    if not target.exists():
        target.mkdir()
    config = TensorFlowConfig(
        checkpoint_dir=target.as_posix(),
        input_data_path=input_data.as_posix(),
        field_delimiter=",",
        predict_batch_size=1,
        overwrite=True,
    )
    yield config
    shutil.rmtree(target)
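With the @pytest.fixture decorator restored above, any test consumes the config by naming the fixture as a parameter; each test receives the yielded config, and the checkpoint directory is removed afterwards. A minimal hypothetical consumer:

def test_uses_tf_config(tf_config):
    assert tf_config.field_delimiter == ","
    assert tf_config.overwrite is True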
Example 10
def test_train_batch_sp_tok(train_df, tmp_path):
    config = TensorFlowConfig(epochs=5,
                              field_delimiter=",",
                              checkpoint_dir=tmp_path,
                              input_data_path=PATH_HOLDER,
                              learning_rate=.01)
    tokenizer = SentencePieceTokenizerTrainer(vocab_size=10000, config=config)
    batcher = DataFrameBatch(df=train_df, config=config, tokenizer=tokenizer)
    batcher.create_training_data()
    batcher.train_all_batches()

    batcher.generate_all_batch_lines(num_lines=100, max_invalid=5000)
    syn_df = batcher.batches_to_df()
    assert syn_df.shape[0] == 100
Example 11
def test_train_batch_sp(train_df, tmp_path):
    config = TensorFlowConfig(epochs=1,
                              field_delimiter=",",
                              checkpoint_dir=tmp_path,
                              input_data_path=PATH_HOLDER)
    batcher = DataFrameBatch(df=train_df, config=config)
    batcher.create_training_data()
    batcher.train_all_batches()

    model_params = json.loads(
        open(tmp_path / "batch_0" / const.MODEL_PARAMS).read())
    assert model_params[const.MODEL_TYPE] == TensorFlowConfig.__name__

    tok_params = json.loads(
        open(tmp_path / "batch_0" /
             BaseTokenizerTrainer.settings_fname).read())
    assert tok_params[
        "tokenizer_type"] == SentencePieceTokenizerTrainer.__name__
Example 12
def test_train_batch_char_tok(train_df, tmp_path):
    config = TensorFlowConfig(epochs=5,
                              field_delimiter=",",
                              checkpoint_dir=tmp_path,
                              input_data_path=PATH_HOLDER,
                              learning_rate=.01)
    batcher = DataFrameBatch(df=train_df,
                             config=config,
                             tokenizer=CharTokenizerTrainer(config=config))
    batcher.create_training_data()
    batcher.train_all_batches()

    tok_params = json.loads(
        open(tmp_path / "batch_0" /
             BaseTokenizerTrainer.settings_fname).read())
    assert tok_params["tokenizer_type"] == CharTokenizerTrainer.__name__

    batcher.generate_all_batch_lines(num_lines=100, max_invalid=5000)
    syn_df = batcher.batches_to_df()
    assert syn_df.shape[0] == 100
Example 13
def test_no_delim_bad_start_string(tmpdir):
    config = TensorFlowConfig(checkpoint_dir=tmpdir, input_data_path=tmpdir)
    with pytest.raises(GenerationError):
        Settings(config=config, start_string=123, tokenizer=mock_tokenizer)
Example 14
def test_local_config_no_validation_split():
    lc = TensorFlowConfig(checkpoint_dir="foo",
                          input_data_path="bar",
                          validation_split=False)
    check = lc.as_dict()
    assert check['best_model_metric'] == METRIC_LOSS
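Taken together with Example 7: when validation_split is left at its default of True, the best model is selected on validation loss (METRIC_VAL_LOSS); disabling the split removes the validation set, so selection falls back to training loss (METRIC_LOSS).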