import json

from tensor2tensor.utils.trainer_lib import (
    create_hparams, create_run_config, create_experiment)


def _init_network(args):
    # CONFIG and TranslateManyToMany are assumed to be defined at module level.
    with open(args.data_config) as fp:
        config = json.load(fp)
    CONFIG.update(config)

    MODEL = "transformer"
    HPARAMS = "transformer_big_tpu"
    train_steps = 1000000
    eval_steps = 10
    save_checkpoints_steps = 10000
    schedule = "continuous_train_and_eval"

    hparams = create_hparams(HPARAMS)
    print(json.loads(hparams.to_json()))

    # Fix the TPU zone: there is no public setter for these, so patch the
    # constructor defaults before create_run_config builds a resolver.
    from tensorflow.distribute.cluster_resolver import TPUClusterResolver
    TPUClusterResolver.__init__.__defaults__ = (
        args.tpu_name, args.tpu_zone, None, 'worker', None, None,
        'default', None, None)
    print(TPUClusterResolver.__init__.__defaults__)

    RUN_CONFIG = create_run_config(
        model_dir=args.model_dir,
        model_name=MODEL,
        save_checkpoints_steps=save_checkpoints_steps,
        use_tpu=True,
        cloud_tpu_name=args.tpu_name,
    )
    print(type(RUN_CONFIG))

    tensorflow_exp_fn = create_experiment(
        run_config=RUN_CONFIG,
        hparams=hparams,
        model_name=MODEL,
        problem_name=TranslateManyToMany.name,
        data_dir=args.data_dir,
        train_steps=train_steps,
        eval_steps=eval_steps,
        use_tpu=True,
        schedule=schedule,
        # use_xla=True  # For acceleration
    )
    return tensorflow_exp_fn
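# A minimal sketch (not from the original source) of how _init_network might
# be driven. The argument names mirror the `args.*` attributes used above;
# everything else here is an assumption.
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_config', required=True)
    parser.add_argument('--data_dir', required=True)
    parser.add_argument('--model_dir', required=True)
    parser.add_argument('--tpu_name', required=True)
    parser.add_argument('--tpu_zone', default=None)
    args = parser.parse_args()

    exp = _init_network(args)
    # Matches schedule="continuous_train_and_eval" passed to create_experiment.
    exp.continuous_train_and_eval()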
import json

import tensorflow as tf
from tensor2tensor.utils import registry, trainer_lib
from tensor2tensor.utils.trainer_lib import create_run_config, create_experiment


def main():
    # print(registry.list_hparams())
    data_dir = '../t2t_data/'
    tmp_dir = '../data/'
    TRAIN_DIR = '../logs_lm_new_t2t'
    MODEL = 'transformer'
    PROBLEM = 'pinyin2zh_problem'

    # Enable eager execution (TF 1.x style).
    tfe = tf.contrib.eager
    tfe.enable_eager_execution()

    pinyin2zh_problem = registry.problem(PROBLEM)
    pinyin2zh_problem.generate_data(data_dir=data_dir, tmp_dir=tmp_dir)

    hparams = trainer_lib.create_hparams("transformer_base")
    hparams.batch_size = 4
    hparams.learning_rate_warmup_steps = 45000
    hparams.learning_rate = 0.0003
    print(json.loads(hparams.to_json()))

    # Init RunConfig for model training.
    RUN_CONFIG = create_run_config(
        model_name=MODEL,
        model_dir=TRAIN_DIR  # Where model files are stored.
        # More parameters in this function control how often to save
        # checkpoints, and more.
    )

    # Create a TensorFlow Experiment object.
    tensorflow_exp_fn = create_experiment(
        run_config=RUN_CONFIG,
        hparams=hparams,
        model_name=MODEL,
        problem_name=PROBLEM,
        data_dir=data_dir,
        train_steps=400000,  # Total number of train steps for all epochs.
        eval_steps=100  # Number of steps to perform for each evaluation.
    )

    # Kick off training.
    tensorflow_exp_fn.train_and_evaluate()
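# The snippet above assumes a problem registered under the name
# 'pinyin2zh_problem'. A minimal sketch of how such a problem could be
# registered follows; the vocab size and the data-reading logic are
# assumptions, not the original implementation.
from tensor2tensor.data_generators import text_problems
from tensor2tensor.utils import registry


@registry.register_problem
class Pinyin2zhProblem(text_problems.Text2TextProblem):
    """Pinyin -> Chinese characters; registry name: pinyin2zh_problem."""

    @property
    def approx_vocab_size(self):
        return 2**15  # Assumed; size to fit the pinyin/character inventory.

    @property
    def is_generate_per_split(self):
        return False  # Let T2T shard train/dev from a single generator.

    def generate_samples(self, data_dir, tmp_dir, dataset_split):
        # Hypothetical parallel file with "pinyin<TAB>hanzi" lines.
        with open('%s/parallel.tsv' % tmp_dir) as f:
            for line in f:
                pinyin, hanzi = line.rstrip('\n').split('\t')
                yield {'inputs': pinyin, 'targets': hanzi}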
hparams.attention_dropout_broadcast_dims = '0,1'
hparams.relu_dropout_broadcast_dims = '1'
hparams.layer_prepostprocess_dropout_broadcast_dims = '1'
hparams.optimizer = 'Adafactor'
hparams.learning_rate_warmup_steps = 10000
hparams.learning_rate_schedule = 'rsqrt_decay'
hparams.warm_start_from_second = 'small-tatabahasa/model.ckpt'
print(hparams)

RUN_CONFIG = create_run_config(
    model_dir=TRAIN_DIR,
    model_name=MODEL,
    save_checkpoints_steps=save_checkpoints_steps,
    num_gpus=2,
)

tensorflow_exp_fn = create_experiment(
    run_config=RUN_CONFIG,
    hparams=hparams,
    model_name=MODEL,
    problem_name=PROBLEM,
    data_dir=DATA_DIR,
    train_steps=train_steps,
    eval_steps=eval_steps,
    # use_xla=True  # For acceleration
)
tensorflow_exp_fn.train()
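# For reference: a standalone sketch of the 'rsqrt_decay' schedule selected
# above, as the author understands T2T implements it -- the learning-rate
# factor is 1/sqrt(max(step, warmup_steps)), i.e. flat through warmup and
# decaying afterwards.
import math

def rsqrt_decay(step, warmup_steps=10000):
    return 1.0 / math.sqrt(max(step, warmup_steps))

for step in (1, 10000, 40000, 160000):
    print(step, rsqrt_decay(step))
# 1       0.01
# 10000   0.01
# 40000   0.005
# 160000  0.0025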
DATA_DIR = './translator/'

# Init RunConfig for model training.
RUN_CONFIG = create_run_config(
    model_dir=TRAIN_DIR,  # Where model files are stored.
    model_name=MODEL,
    # More parameters in this function control how often to save
    # checkpoints, and more.
)

# Init an HParams object from the T2T problem.
hparams = create_hparams(HPARAMS)
hparams.batch_size = 1024  # For text problems this counts tokens, not examples.

# Create a TensorFlow Experiment object.
tensorflow_exp_fn = create_experiment(
    run_config=RUN_CONFIG,
    hparams=hparams,
    model_name=MODEL,
    problem_name=PROBLEM,
    data_dir=DATA_DIR,
    train_steps=40,  # Total number of train steps for all epochs.
    eval_steps=100  # Number of steps to perform for each evaluation.
)

# Kick off training.
tensorflow_exp_fn.train_and_evaluate()
# This snippet begins mid-statement; judging by the warm_start_from argument
# below, the truncated opening most likely closed an assignment such as:
# hparams.warm_start_from_second = (
#     'gs://mesolitica-tpu-general/t2t-base/model.ckpt-475000')
print(hparams)

RUN_CONFIG = create_run_config(
    model_dir=TRAIN_DIR,
    model_name=MODEL,
    save_checkpoints_steps=save_checkpoints_steps,
    use_tpu=True,
    cloud_tpu_name='node-5',
    iterations_per_loop=100,
    schedule='train',
)

tensorflow_exp_fn = create_experiment(
    run_config=RUN_CONFIG,
    hparams=hparams,
    model_name=MODEL,
    problem_name=PROBLEM,
    data_dir=DATA_DIR,
    train_steps=train_steps,
    eval_steps=eval_steps,
    use_tpu=True,
    use_tpu_estimator=False,
    schedule='train',
    warm_start_from='gs://mesolitica-tpu-general/t2t-base/model.ckpt-475000',
    # use_xla=True  # For acceleration
)
tensorflow_exp_fn.train()
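# An optional sanity check (not in the original) before warm-starting:
# list the variables stored in the source checkpoint to confirm the GCS
# path resolves and the shapes match the model being trained.
import tensorflow as tf

ckpt = 'gs://mesolitica-tpu-general/t2t-base/model.ckpt-475000'
for name, shape in tf.train.list_variables(ckpt):
    print(name, shape)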
# This snippet begins mid-call; the opening line is restored from context.
RUN_CONFIG = create_run_config(
    model_dir=train_dir,  # Where model files are stored.
    model_name=MODEL,
    num_gpus=2,
    # keep_checkpoint_max=keep_checkpoint_max,
    save_checkpoints_steps=save_checkpoints_steps
    # More parameters in this function control how often to save
    # checkpoints, and more.
)

# Create a TensorFlow Experiment object.
tensorflow_exp_fn = create_experiment(
    run_config=RUN_CONFIG,
    hparams=hparams,
    model_name=MODEL,
    problem_name=PROBLEM,
    data_dir=data_dir,
    schedule="train_and_evaluate",
    # eval_early_stopping_steps=5000,
    min_eval_frequency=1000,
    train_steps=90000,  # Total number of train steps for all epochs.
    eval_steps=100  # Number of steps to perform for each evaluation.
)

# Kick off training.
print('Training started.....')
# Note: the original attempt below is broken -- print() returns None, so
# f.write(print(...)) would raise a TypeError; see the logging-based
# alternative after this snippet.
# file = open("Model_Training_Progress.txt", "w")
# file.close()
# with open("Model_Training_Progress.txt", "a") as f:
#     f.write(print(tensorflow_exp_fn.train_and_evaluate()))
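# A working alternative (an assumption, not the original code) to the broken
# commented-out file-writing attempt above. In TF 1.x, tf.logging routes
# through Python's standard `logging` under the "tensorflow" logger, so
# attaching a FileHandler captures step/loss lines to disk.
import logging
import tensorflow as tf

tf.logging.set_verbosity(tf.logging.INFO)
handler = logging.FileHandler('Model_Training_Progress.txt')
handler.setLevel(logging.INFO)
logging.getLogger('tensorflow').addHandler(handler)

tensorflow_exp_fn.train_and_evaluate()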
FLAGS.problems = problem_name
FLAGS.model = model_name
FLAGS.schedule = "train_and_evaluate"
FLAGS.save_checkpoints_secs = 0
FLAGS.local_eval_frequency = 2000
FLAGS.gpu_memory_fraction = .99
FLAGS.worker_gpu = 1
FLAGS.ps_gpu = 2
FLAGS.log_device_placement = True
FLAGS.worker_replicas = 2

RUN_CONFIG = trainer_lib.create_run_config(
    model_dir=train_dir,
    model_name="test",
    keep_checkpoint_max=3,
    save_checkpoints_secs=0,
    gpu_mem_fraction=FLAGS.gpu_memory_fraction
)

exp_fn = trainer_lib.create_experiment(
    run_config=RUN_CONFIG,
    hparams=hparams,
    model_name=model_name,
    problem_name=problem_name,
    data_dir=data_dir,
    train_steps=1000000,
    eval_steps=100
)
exp_fn.train_and_evaluate()
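# Context note (an assumption; the original file is not shown): attributes
# like FLAGS.worker_gpu only exist once T2T's flag definitions have been
# loaded, e.g. by importing the trainer binary, which registers the flag set.
import tensorflow as tf
from tensor2tensor.bin import t2t_trainer  # noqa: F401 -- defines the flags set above

FLAGS = tf.flags.FLAGS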