# Imports for these tests. The gretel-synthetics module paths below follow the
# package's layout at the time of writing; adjust them if your version differs.
import json
import os
import shutil
import uuid
from pathlib import Path
from unittest.mock import Mock, patch

import pytest

from gretel_synthetics import const
from gretel_synthetics.batch import DataFrameBatch
from gretel_synthetics.config import TensorFlowConfig
from gretel_synthetics.const import METRIC_LOSS, METRIC_VAL_LOSS
from gretel_synthetics.errors import GenerationError
from gretel_synthetics.generate import Settings
from gretel_synthetics.tokenizers import (
    BaseTokenizerTrainer,
    CharTokenizerTrainer,
    SentencePieceTokenizerTrainer,
)
from gretel_synthetics.train import EpochState


def test_local_config_missing_attrs():
    # Both checkpoint_dir and input_data_path are required.
    with pytest.raises(AttributeError):
        TensorFlowConfig()
    with pytest.raises(AttributeError):
        TensorFlowConfig(checkpoint_dir="foo")
    with pytest.raises(AttributeError):
        TensorFlowConfig(input_data_path="foo")
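
# The fixtures and module-level helpers below are illustrative placeholders for
# names this excerpt uses but does not define (train_df, mkdir, PATH_HOLDER,
# _tok_gen_count, mock_tokenizer). The real test suite supplies its own
# definitions; treat these as a minimal sketch, not the originals.
PATH_HOLDER = "not-used"  # input_data_path placeholder; batch mode builds its own training data
_tok_gen_count = 50       # how many synthetic lines the batch tests expect

mock_tokenizer = Mock()   # stand-in tokenizer for the Settings() validation test


@pytest.fixture
def train_df():
    # Hypothetical training frame: 16 columns, so batch_size=4 yields the
    # 4 DataFrame batches that test_epoch_callback counts on.
    import string

    import numpy as np
    import pandas as pd

    rng = np.random.default_rng(42)
    return pd.DataFrame(
        rng.integers(0, 100, size=(1000, 16)),
        columns=list(string.ascii_uppercase[:16]),
    )


@pytest.fixture
def mkdir():
    # Patch Path.mkdir so config-only tests never touch the filesystem.
    with patch("pathlib.Path.mkdir") as m:
        yield m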

def test_local_config_save_model_params():
    test_data_dir = Path(__file__).parent
    target = test_data_dir / uuid.uuid4().hex
    test_data_file = test_data_dir / "data" / "smol.txt"
    lc = TensorFlowConfig(
        checkpoint_dir=target.as_posix(),
        input_data_path=test_data_file.as_posix(),
    )
    check = lc.save_model_params()
    assert json.loads(open(check).read())
    shutil.rmtree(target)

def test_train_batch_sp_tok(train_df, tmp_path):
    config = TensorFlowConfig(
        epochs=5,
        field_delimiter=",",
        checkpoint_dir=tmp_path,
        input_data_path=PATH_HOLDER,
        learning_rate=.01,
    )
    tokenizer = SentencePieceTokenizerTrainer(
        vocab_size=10000,
        config=config,
    )
    batcher = DataFrameBatch(
        df=train_df,
        config=config,
        tokenizer=tokenizer,
    )
    batcher.create_training_data()
    batcher.train_all_batches()

    batcher.generate_all_batch_lines(num_lines=_tok_gen_count, max_invalid=5000)
    syn_df = batcher.batches_to_df()
    assert syn_df.shape[0] == _tok_gen_count

    # Generate with a RecordFactory
    factory = batcher.create_record_factory(num_lines=_tok_gen_count, max_invalid=5000)
    syn_df = factory.generate_all(output="df")
    assert syn_df.shape[0] == _tok_gen_count
    assert list(syn_df.columns) == list(train_df.columns)
    assert factory.summary["valid_count"] == _tok_gen_count

def test_epoch_callback(train_df, tmp_path):
    def epoch_callback(s: EpochState):
        with open(tmp_path / 'callback_dump.txt', 'a') as f:
            f.write(f'{s.epoch},{s.accuracy},{s.loss},{s.batch}\n')

    config = TensorFlowConfig(
        epochs=5,
        field_delimiter=",",
        checkpoint_dir=tmp_path,
        input_data_path=PATH_HOLDER,
        learning_rate=.01,
        epoch_callback=epoch_callback,
    )
    tokenizer = SentencePieceTokenizerTrainer(vocab_size=10000, config=config)
    batcher = DataFrameBatch(batch_size=4, df=train_df, config=config, tokenizer=tokenizer)
    batcher.create_training_data()
    batcher.train_all_batches()

    # The callback fires once per epoch per DataFrame batch:
    # 4 batches x 5 epochs = 20 lines in the dump file.
    with open(tmp_path / 'callback_dump.txt', 'r') as f:
        lines = f.readlines()
    assert len(lines) == 20
    for i, line in enumerate(lines):
        fields = line.strip().split(',')
        assert len(fields) == 4
        assert int(fields[0]) == i % 5   # epoch counter restarts for each batch
        assert int(fields[3]) == i // 5  # batch index advances every 5 epochs
        # accuracy and loss must at least parse as floats
        float(fields[1])
        float(fields[2])
    os.remove(tmp_path / 'callback_dump.txt')
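
# Sketch of the EpochState contract the callback above relies on; only the
# four fields actually read by epoch_callback are listed here, and the real
# dataclass in gretel-synthetics may carry more:
#
#   @dataclass
#   class EpochState:
#       epoch: int     # 0-based epoch number, restarts for every batch
#       accuracy: float
#       loss: float
#       batch: int     # index of the DataFrame batch being trained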

def test_bad_epoch_callback(tmp_path):
    # A non-callable epoch_callback is rejected at config construction time.
    with pytest.raises(ValueError) as err:
        TensorFlowConfig(
            epochs=1,
            field_delimiter=",",
            checkpoint_dir=tmp_path,
            input_data_path=PATH_HOLDER,
            epoch_callback=1,
        )
    assert "must be a callable" in str(err.value)

def test_local_config(mkdir):
    target = uuid.uuid4().hex
    test_data_dir = Path(__file__).parent
    test_data_file = test_data_dir / "data" / "smol.txt"
    lc = TensorFlowConfig(checkpoint_dir=target, input_data_path=test_data_file.as_posix())
    mkdir.assert_called()  # the checkpoint directory gets created on init
    assert lc.epochs == 100
    assert lc.input_data_path == test_data_file.as_posix()
    assert lc.training_data_path == Path(target, "training_data.txt").as_posix()

def test_local_config_settings(mkdir):
    lc = TensorFlowConfig(checkpoint_dir="foo", input_data_path="bar")
    check = lc.as_dict()
    assert check == {
        "max_lines": 0,
        "epochs": 100,
        "epoch_callback": None,
        "early_stopping": True,
        "early_stopping_patience": 5,
        "validation_split": True,
        "best_model_metric": METRIC_VAL_LOSS,
        "batch_size": 64,
        "buffer_size": 10000,
        "seq_length": 100,
        "embedding_dim": 256,
        "rnn_units": 256,
        "dropout_rate": 0.2,
        "rnn_initializer": "glorot_uniform",
        "vocab_size": 20000,
        "character_coverage": 1.0,
        "pretrain_sentence_count": 1000000,
        "dp": False,
        "learning_rate": 0.01,
        "dp_noise_multiplier": 0.1,
        "dp_l2_norm_clip": 3.0,
        "dp_microbatches": 64,
        "gen_temp": 1.0,
        "gen_chars": 0,
        "gen_lines": 1000,
        "max_line_len": 2048,
        "save_all_checkpoints": False,
        "save_best_model": True,
        "checkpoint_dir": "foo",
        "field_delimiter": None,
        "field_delimiter_token": "<d>",
        "overwrite": False,
        "input_data_path": "bar",
        "predict_batch_size": 64,
        "reset_states": True,
        "training_data_path": "foo/training_data.txt",
        "model_type": "TensorFlowConfig",
    }

def test_train_small_df(train_df, tmp_path):
    # Too few training rows should surface as a RuntimeError during training.
    small_df = train_df.sample(n=50)
    config = TensorFlowConfig(
        epochs=5,
        field_delimiter=",",
        checkpoint_dir=tmp_path,
        input_data_path=PATH_HOLDER,
    )
    batcher = DataFrameBatch(df=small_df, config=config)
    batcher.create_training_data()
    with pytest.raises(RuntimeError) as excinfo:
        batcher.train_all_batches()
    assert "Model training failed" in str(excinfo.value)

@pytest.fixture
def tf_config():
    test_data_dir = Path(__file__).parent
    target = test_data_dir / "ckpoint"
    input_data = test_data_dir / "data" / "smol.txt"
    if not target.exists():
        target.mkdir()
    config = TensorFlowConfig(
        checkpoint_dir=target.as_posix(),
        input_data_path=input_data.as_posix(),
        field_delimiter=",",
        predict_batch_size=1,
        overwrite=True,
    )
    yield config
    shutil.rmtree(target)
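
# Hypothetical consumer of the fixture above: pytest injects the yielded
# TensorFlowConfig and removes the checkpoint directory once the test finishes.
def test_tf_config_fixture(tf_config):
    assert tf_config.field_delimiter == ","
    assert tf_config.predict_batch_size == 1
    assert tf_config.checkpoint_dir.endswith("ckpoint")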

def test_train_batch_sp_tok_basic(train_df, tmp_path):
    # Simpler variant of test_train_batch_sp_tok: train with a SentencePiece
    # tokenizer, then generate 100 lines without a RecordFactory.
    config = TensorFlowConfig(
        epochs=5,
        field_delimiter=",",
        checkpoint_dir=tmp_path,
        input_data_path=PATH_HOLDER,
        learning_rate=.01,
    )
    tokenizer = SentencePieceTokenizerTrainer(vocab_size=10000, config=config)
    batcher = DataFrameBatch(df=train_df, config=config, tokenizer=tokenizer)
    batcher.create_training_data()
    batcher.train_all_batches()
    batcher.generate_all_batch_lines(num_lines=100, max_invalid=5000)
    syn_df = batcher.batches_to_df()
    assert syn_df.shape[0] == 100

def test_train_batch_sp(train_df, tmp_path):
    config = TensorFlowConfig(
        epochs=1,
        field_delimiter=",",
        checkpoint_dir=tmp_path,
        input_data_path=PATH_HOLDER,
    )
    batcher = DataFrameBatch(df=train_df, config=config)
    batcher.create_training_data()
    batcher.train_all_batches()

    # Each batch directory gets the model params and tokenizer settings
    # serialized to disk; SentencePiece is the default tokenizer.
    model_params = json.loads(open(tmp_path / "batch_0" / const.MODEL_PARAMS).read())
    assert model_params[const.MODEL_TYPE] == TensorFlowConfig.__name__
    tok_params = json.loads(open(tmp_path / "batch_0" / BaseTokenizerTrainer.settings_fname).read())
    assert tok_params["tokenizer_type"] == SentencePieceTokenizerTrainer.__name__

def test_train_batch_char_tok(train_df, tmp_path):
    config = TensorFlowConfig(
        epochs=5,
        field_delimiter=",",
        checkpoint_dir=tmp_path,
        input_data_path=PATH_HOLDER,
        learning_rate=.01,
    )
    batcher = DataFrameBatch(
        df=train_df,
        config=config,
        tokenizer=CharTokenizerTrainer(config=config),
    )
    batcher.create_training_data()
    batcher.train_all_batches()
    tok_params = json.loads(open(tmp_path / "batch_0" / BaseTokenizerTrainer.settings_fname).read())
    assert tok_params["tokenizer_type"] == CharTokenizerTrainer.__name__
    batcher.generate_all_batch_lines(num_lines=100, max_invalid=5000)
    syn_df = batcher.batches_to_df()
    assert syn_df.shape[0] == 100

def test_no_delim_bad_start_string(tmpdir):
    # A non-string start_string is invalid when no field delimiter is set.
    config = TensorFlowConfig(checkpoint_dir=tmpdir, input_data_path=tmpdir)
    with pytest.raises(GenerationError):
        Settings(config=config, start_string=123, tokenizer=mock_tokenizer)

def test_local_config_no_validation_split():
    # Without a validation split there is no val_loss, so the best-model
    # metric falls back to plain training loss.
    lc = TensorFlowConfig(
        checkpoint_dir="foo",
        input_data_path="bar",
        validation_split=False,
    )
    check = lc.as_dict()
    assert check['best_model_metric'] == METRIC_LOSS