def test_freeze_pipeline():
    """Run the pipeline on the bundled sample experiment with only a
    subset of model components marked trainable (layer-freezing path)."""
    exp = Experiment('experiments/sample-exp', read_only=True)
    exp.config['trainer'].update(dict(steps=50, check_point=25))
    # enable these components for optimization; everything else stays frozen
    exp.config['optim']['trainable'] = {
        'include': ['src_embed', 'tgt_embed', 'generator', 'encoder:0', 'decoder:0,1']
    }
    Pipeline(exp).run(run_tests=False)
def test_finetune_pipeline_transformer():
    """End-to-end prepare/train/finetune with the nlcodec codec in a temp dir;
    verifies the training and finetuning artifacts were created.

    Fix: cleanup now runs in a ``finally`` block (consistent with
    test_spark_prep) so the temp dir is removed even when the pipeline
    or an assertion fails.
    """
    codec_lib = 'nlcodec'
    tmp_dir = tempfile.mkdtemp()
    try:
        print(f"Testing finetune transformer: {tmp_dir}")
        config = load_conf('experiments/sample-exp/conf.yml')
        prep = config['prep']
        # reuse the training bitext as finetuning data for this smoke test
        prep.update(
            dict(codec_lib=codec_lib, char_coverage=0.9995,
                 finetune_src=prep['train_src'], finetune_tgt=prep['train_tgt']))
        exp = Experiment(tmp_dir, config=config, read_only=False)
        exp.config['trainer'].update(
            dict(steps=50, check_point=25, finetune_steps=100, batch_size=400,
                 split_ratio=0.1, dynamic_epoch=True))
        Pipeline(exp).run()
        # data may be stored either as a flat file or a DB depending on backend
        assert exp.train_file.exists() or exp.train_db.exists()
        assert exp.finetune_file.exists()
        # TODO: add more assertions
    finally:
        print(f"Cleaning up {tmp_dir}")
        shutil.rmtree(tmp_dir, ignore_errors=True)
def test_robertamt_2layer_init():
    """Initialize a 2-layer RoBERTa-MT model from a pretrained config and run
    a few training steps, then sanity-check the resulting experiment.

    Fix: cleanup moved into a ``finally`` block (consistent with
    test_spark_prep) so the temp dir is removed even on failure.
    """
    tmp_dir = tempfile.mkdtemp()
    try:
        config = load_conf('experiments/pretrained/robertamt-xlmr-2layer.yml')
        model_id = config['model_args']['model_id']
        print(f"Testing {model_id} --> {tmp_dir}")
        # pretrained init requires the vocabulary to match the parent model
        assert 'pretrainmatch' == config['prep'].get('codec_lib')
        exp = Experiment(tmp_dir, config=config, read_only=False)
        exp.config['trainer'].update(dict(steps=4, check_point=1))
        Pipeline(exp).run(run_tests=False)
        sanity_check_experiment(exp)
    finally:
        print(f"Cleaning up {tmp_dir}")
        shutil.rmtree(tmp_dir, ignore_errors=True)
def test_parent_child_pipeline():
    """Train a parent experiment, then a child experiment that inherits the
    parent's vocab and an ensemble of its checkpoints; verify the parent
    model state is present only in the child.

    Fixes: loop variable ``dir`` no longer shadows the builtin, and cleanup
    runs in a ``finally`` block so both temp dirs are removed even when the
    pipeline or an assertion fails.
    """
    parent_dir = tempfile.mkdtemp()
    child_dir = None
    try:
        # parent_dir = 'tmp-xyz-parent'
        print(f"Making parent at {parent_dir}")
        exp = Experiment(parent_dir, config='experiments/transformer.test.yml',
                         read_only=False)
        exp.config['trainer'].update(dict(steps=50, check_point=25))
        Pipeline(exp).run(run_tests=False)
        sanity_check_experiment(exp)
        # a parent has no parent of its own
        assert not exp.parent_model_state.exists()

        child_config = load_conf('experiments/transformer.test.yml')
        child_config.update({
            'parent': {
                'experiment': str(parent_dir),
                'vocab': {'shared': 'shared'},
                'model': {'ensemble': 2}
            }
        })
        child_dir = tempfile.mkdtemp()
        # child_dir = 'tmp-xyz-child'
        print(f"Making child at {child_dir}")
        exp = Experiment(child_dir, config=child_config, read_only=False)
        exp.config['trainer'].update(dict(steps=50, check_point=25))
        Pipeline(exp).run(run_tests=False)
        sanity_check_experiment(exp)
        # the child must have imported the parent's model state
        assert exp.parent_model_state.exists()
    finally:
        for exp_dir in [parent_dir, child_dir]:
            if exp_dir:
                print(f"Cleaning up {exp_dir}")
                shutil.rmtree(exp_dir, ignore_errors=True)
def test_pipeline_transformer():
    """Run the full pipeline once per codec library (sentpiece, nlcodec)
    in a fresh temp dir, then sanity-check each experiment.

    Fix: cleanup for each iteration runs in a ``finally`` block
    (consistent with test_spark_prep) so the temp dir is removed even
    when the pipeline or sanity check fails.
    """
    for codec_lib in ['sentpiece', 'nlcodec']:
        tmp_dir = tempfile.mkdtemp()
        try:
            config = load_conf('experiments/transformer.test.yml')
            print(f"Testing {codec_lib} --> {tmp_dir}")
            config['prep'].update({
                'codec_lib': codec_lib,
                'char_coverage': 0.9995
            })
            exp = Experiment(tmp_dir, config=config, read_only=False)
            exp.config['trainer'].update(dict(steps=50, check_point=25))
            # disable sampling so the full data set is used
            exp.config['prep']['num_samples'] = 0
            Pipeline(exp).run(run_tests=False)
            sanity_check_experiment(exp)
        finally:
            print(f"Cleaning up {tmp_dir}")
            shutil.rmtree(tmp_dir, ignore_errors=True)
def test_spark_prep():
    """Exercise data preparation on PySpark (big-data prep config) and
    confirm the prepared/trained flags and training artifacts exist."""
    workdir = tempfile.mkdtemp()
    try:
        print(f"Testing dataprep on pyspark: {workdir}")
        conf = load_conf('experiments/spark-bigdataprep.yml')
        exp = Experiment(workdir, config=conf, read_only=False)
        exp.config['trainer'].update(
            dict(steps=50, check_point=25, batch_size=400))
        Pipeline(exp).run()
        # both pipeline phases must have completed and left their flags
        assert exp._prepared_flag.exists()
        assert exp._trained_flag.exists()
        # training data may live in a flat file or a DB
        assert exp.train_file.exists() or exp.train_db.exists()
        sanity_check_experiment(exp)
    finally:
        print(f"Cleaning up {workdir}")
        shutil.rmtree(workdir, ignore_errors=True)
def test_prepared_pipeline():
    """Resume the pipeline on the already-prepared sample experiment
    (read-only) with a short training schedule."""
    experiment = Experiment('experiments/sample-exp', read_only=True)
    experiment.config['trainer'].update(dict(steps=50, check_point=25))
    Pipeline(experiment).run(run_tests=False)