Example No. 1
0
def _init_network(args):
    """Build a tensor2tensor experiment object configured for TPU training.

    Args:
        args: parsed CLI namespace; this function reads ``args.data_config``
            (path to a JSON data config), ``args.tpu_name``, ``args.tpu_zone``,
            ``args.model_dir`` and ``args.data_dir``.

    Returns:
        The experiment object returned by ``create_experiment`` (training is
        not started here; the caller drives it).
    """
    # Merge the JSON data config into the module-level CONFIG dict.
    with open(args.data_config) as fp:
        config = json.load(fp)
    CONFIG.update(config)

    # Training setup: big transformer tuned for TPU, 1M train steps,
    # checkpoint every 10k steps.
    MODEL = "transformer"
    HPARAMS = "transformer_big_tpu"
    train_steps = 1000000
    eval_steps = 10
    save_checkpoints_steps = 10000
    schedule = "continuous_train_and_eval"

    hparams = create_hparams(HPARAMS)
    print(json.loads(hparams.to_json()))  # echo effective hparams for the log

    # Fix TPU name/zone.
    # HACK: overwrite the default-argument tuple of TPUClusterResolver.__init__
    # so that every resolver constructed downstream picks up our TPU name and
    # zone without us having to thread them through.  The tuple is purely
    # positional and must match the resolver's signature exactly —
    # NOTE(review): fragile across TensorFlow versions; confirm against the
    # installed TF release before upgrading.
    from tensorflow.distribute.cluster_resolver import TPUClusterResolver
    TPUClusterResolver.__init__.__defaults__ = (args.tpu_name, args.tpu_zone,
                                                None, 'worker', None, None,
                                                'default', None, None)
    print(TPUClusterResolver.__init__.__defaults__)

    # Run config: checkpoint cadence plus the Cloud TPU node to attach to.
    RUN_CONFIG = create_run_config(
        model_dir=args.model_dir,
        model_name=MODEL,
        save_checkpoints_steps=save_checkpoints_steps,
        use_tpu=True,
        cloud_tpu_name=args.tpu_name,
    )
    print(type(RUN_CONFIG))
    # Wire config, hparams and problem into a single experiment object.
    tensorflow_exp_fn = create_experiment(
        run_config=RUN_CONFIG,
        hparams=hparams,
        model_name=MODEL,
        problem_name=TranslateManyToMany.name,
        data_dir=args.data_dir,
        train_steps=train_steps,
        eval_steps=eval_steps,
        use_tpu=True,
        schedule=schedule,
        #use_xla=True # For acceleration
    )
    return tensorflow_exp_fn
Example No. 2
0
def main():
    """Generate pinyin->Chinese data, then train a transformer on it."""
    # print(registry.list_hparams())
    generated_dir = '../t2t_data/'
    raw_dir = '../data/'
    checkpoint_dir = '../logs_lm_new_t2t'
    model_name = 'transformer'
    problem_name = 'pinyin2zh_problem'

    # Switch on eager execution before any other TF work happens.
    eager = tf.contrib.eager
    eager.enable_eager_execution()

    # Materialise the training data for the registered problem.
    problem = registry.problem(problem_name)
    problem.generate_data(data_dir=generated_dir, tmp_dir=raw_dir)

    # Base transformer hyper-parameters, adjusted for this task.
    hparams = trainer_lib.create_hparams("transformer_base")
    hparams.batch_size = 4
    hparams.learning_rate_warmup_steps = 45000
    hparams.learning_rate = 0.0003
    print(json.loads(hparams.to_json()))  # echo effective hparams

    # Run configuration: controls checkpointing and model-file location.
    # (More knobs exist on create_run_config for checkpoint cadence etc.)
    run_config = create_run_config(
        model_name=model_name,
        model_dir=checkpoint_dir
    )

    # Bundle config, hparams and problem into a T2T experiment object.
    experiment = create_experiment(
        run_config=run_config,
        hparams=hparams,
        model_name=model_name,
        problem_name=problem_name,
        data_dir=generated_dir,
        train_steps=400000,  # total optimisation steps across all epochs
        eval_steps=100  # steps executed per evaluation pass
    )

    # Launch training with interleaved evaluation.
    experiment.train_and_evaluate()
Example No. 3
0
# Broadcast dropout masks across the listed dimensions (memory saving).
hparams.attention_dropout_broadcast_dims = '0,1'
hparams.relu_dropout_broadcast_dims = '1'
hparams.layer_prepostprocess_dropout_broadcast_dims = '1'

# Adafactor optimizer with rsqrt decay after 10k warmup steps.
hparams.optimizer = 'Adafactor'
hparams.learning_rate_warmup_steps = 10000
hparams.learning_rate_schedule = 'rsqrt_decay'
# NOTE(review): 'warm_start_from_second' looks like a custom hparam consumed
# elsewhere to warm-start from this checkpoint — confirm it is registered.
hparams.warm_start_from_second = 'small-tatabahasa/model.ckpt'

print(hparams)

# Run config: checkpoint directory/cadence, training on 2 GPUs.
RUN_CONFIG = create_run_config(
    model_dir=TRAIN_DIR,
    model_name=MODEL,
    save_checkpoints_steps=save_checkpoints_steps,
    num_gpus=2,
)

# Wire config, hparams and problem into a single experiment object.
tensorflow_exp_fn = create_experiment(
    run_config=RUN_CONFIG,
    hparams=hparams,
    model_name=MODEL,
    problem_name=PROBLEM,
    data_dir=DATA_DIR,
    train_steps=train_steps,
    eval_steps=eval_steps,
    # use_xla=True # For acceleration
)

# Train only — no interleaved evaluation.
tensorflow_exp_fn.train()
Example No. 4
0
# Directory holding the generated translation data.
DATA_DIR = './translator/'


# Init run config for model training.
RUN_CONFIG = create_run_config(
    model_dir=TRAIN_DIR, # Location of where the model file is stored
    model_name=MODEL,

    # More params are available in this function for controlling how often to save checkpoints and more.
)


# Init hparams object from the T2T registry.
hparams = create_hparams(HPARAMS)
hparams.batch_size = 1024


# Create the TensorFlow experiment object.
tensorflow_exp_fn = create_experiment(
    run_config=RUN_CONFIG,
    hparams=hparams,
    model_name=MODEL,
    problem_name=PROBLEM,
    data_dir=DATA_DIR,
    train_steps=40, # Total train steps for all epochs — presumably a smoke-test value; confirm before a real run
    eval_steps=100 # Number of steps to perform for each evaluation
)

# Kick off training with interleaved evaluation.
tensorflow_exp_fn.train_and_evaluate()
Example No. 5
0
    'gs://mesolitica-tpu-general/t2t-base/model.ckpt-475000')

print(hparams)

# TPU run config: checkpoint cadence, the Cloud TPU node to attach to,
# and how many steps run per TPU infeed loop.
RUN_CONFIG = create_run_config(
    model_dir=TRAIN_DIR,
    model_name=MODEL,
    save_checkpoints_steps=save_checkpoints_steps,
    use_tpu=True,
    cloud_tpu_name='node-5',
    iterations_per_loop=100,
    schedule='train',
)

# Experiment object; warm-starts weights from the GCS base checkpoint.
tensorflow_exp_fn = create_experiment(
    run_config=RUN_CONFIG,
    hparams=hparams,
    model_name=MODEL,
    problem_name=PROBLEM,
    data_dir=DATA_DIR,
    train_steps=train_steps,
    eval_steps=eval_steps,
    use_tpu=True,
    use_tpu_estimator=False,
    schedule='train',
    warm_start_from='gs://mesolitica-tpu-general/t2t-base/model.ckpt-475000'
    # use_xla=True # For acceleration
)

# Train only — no interleaved evaluation.
tensorflow_exp_fn.train()
Example No. 6
0
    model_dir=train_dir,
    model_name=MODEL,
    num_gpus=2,
    #keep_checkpoint_max=keep_checkpoint_max,
    save_checkpoints_steps=
    save_checkpoints_steps  # Location of where model file is store
    # More Params here in this fucntion for controling how noften to tave checkpoints and more.
)

# Create the TensorFlow experiment object (RUN_CONFIG is defined above).
tensorflow_exp_fn = create_experiment(
    run_config=RUN_CONFIG,
    hparams=hparams,
    model_name=MODEL,
    problem_name=PROBLEM,
    data_dir=data_dir,
    schedule="train_and_evaluate",
    #eval_early_stopping_steps=5000,
    min_eval_frequency=1000,  # run evaluation every 1000 training steps
    train_steps=90000,  # Total number of train steps for all epochs
    eval_steps=100  # Number of steps to perform for each evaluation
)

# Kick off training.
print('Training started.....')

#file = open("Model_Training_Progress.txt", "w")
#file.close()

#with open("Model_Training_Progress.txt", "a") as f:
#    f.write(print(tensorflow_exp_fn.train_and_evaluate()))
# Global T2T trainer flags; these mirror the usual command-line options.
FLAGS.problems = problem_name
FLAGS.model = model_name
FLAGS.schedule = "train_and_evaluate"
FLAGS.save_checkpoints_secs = 0  # 0 disables time-based checkpointing
FLAGS.local_eval_frequency = 2000
FLAGS.gpu_memory_fraction = .99
FLAGS.worker_gpu = 1
FLAGS.ps_gpu = 2
FLAGS.log_device_placement = True
FLAGS.worker_replicas = 2

# NOTE(review): this re-binds RUN_CONFIG, shadowing the config used by the
# experiment created above — confirm the shadowing is intentional.
RUN_CONFIG = trainer_lib.create_run_config(
      model_dir=train_dir,
      model_name="test",
      keep_checkpoint_max=3,
      save_checkpoints_secs=0,
      gpu_mem_fraction=FLAGS.gpu_memory_fraction
)


# Second experiment built from the re-bound RUN_CONFIG; 1M train steps.
exp_fn = trainer_lib.create_experiment(
        run_config=RUN_CONFIG,
        hparams=hparams,
        model_name=model_name,
        problem_name=problem_name,
        data_dir=(data_dir),
        train_steps=1000000,
        eval_steps=100
    )
exp_fn.train_and_evaluate()