def main(args):
    tf.logging.set_verbosity(tf.logging.INFO)
    model_cls = models.get_model(args.model)
    params = default_parameters()

    # Import and override parameters
    # Priorities (low -> high):
    # default -> saved -> command
    params = merge_parameters(params, model_cls.get_parameters())
    params = import_params(args.output, args.model, params)
    override_parameters(params, args)

    # Export all parameters and model specific parameters
    export_params(params.output, "params.json", params)
    export_params(params.output, "%s.json" % args.model,
                  collect_params(params, model_cls.get_parameters()))

    # Build Graph
    with tf.Graph().as_default():
        if not params.record:
            # Build input queue
            if params.use_bert and params.bert_emb_path:
                features = dataset.get_training_input_with_bert(
                    params.input + [params.bert_emb_path], params)
            else:
                features = dataset.get_training_input(params.input, params)
        else:
            # Read training examples from TFRecord files
            features = record.get_input_features(
                os.path.join(params.record, "*train*"), "train", params)

        # Build model
        initializer = get_initializer(params)
        model = model_cls(params)

        # Multi-GPU setting
        sharded_losses = parallel.parallel_model(
            model.get_training_func(initializer), features,
            params.device_list)
        loss = tf.add_n(sharded_losses) / len(sharded_losses)

        # Create global step
        global_step = tf.train.get_or_create_global_step()

        # Print parameters
        all_weights = {v.name: v for v in tf.trainable_variables()}
        total_size = 0

        for v_name in sorted(list(all_weights)):
            v = all_weights[v_name]
            tf.logging.info("%s\tshape %s", v.name[:-2].ljust(80),
                            str(v.shape).ljust(20))
            # Multiply all dimension sizes to get the parameter count
            v_size = np.prod(np.array(v.shape.as_list())).tolist()
            total_size += v_size

        tf.logging.info("Total trainable variables size: %d", total_size)

        learning_rate = get_learning_rate_decay(params.learning_rate,
                                                global_step, params)
        learning_rate = tf.convert_to_tensor(learning_rate, dtype=tf.float32)
        tf.summary.scalar("learning_rate", learning_rate)

        # Create optimizer
        opt = tf.train.AdamOptimizer(learning_rate,
                                     beta1=params.adam_beta1,
                                     beta2=params.adam_beta2,
                                     epsilon=params.adam_epsilon)

        if params.update_cycle == 1:
            train_op = tf.contrib.layers.optimize_loss(
                name="training",
                loss=loss,
                global_step=global_step,
                learning_rate=learning_rate,
                clip_gradients=params.clip_grad_norm or None,
                optimizer=opt,
                colocate_gradients_with_ops=True)
            zero_op = tf.no_op("zero_op")
            collect_op = tf.no_op("collect_op")
        else:
            grads_and_vars = opt.compute_gradients(
                loss, colocate_gradients_with_ops=True)
            gradients = [item[0] for item in grads_and_vars]
            variables = [item[1] for item in grads_and_vars]
            variables = utils.replicate_variables(variables)
            zero_op = utils.zero_variables(variables)
            collect_op = utils.collect_gradients(gradients, variables)

            scale = 1.0 / params.update_cycle
            gradients, variables = utils.scale_gradients(grads_and_vars,
                                                         scale)

            # Gradient clipping to avoid gradient explosion
            if isinstance(params.clip_grad_norm or None, float):
                gradients, _ = tf.clip_by_global_norm(
                    gradients, params.clip_grad_norm)

            # Update variables
            grads_and_vars = list(zip(gradients, variables))

            with tf.control_dependencies([collect_op]):
                train_op = opt.apply_gradients(grads_and_vars, global_step)

        # Validation
        '''
        if params.validation and params.references[0]:
            files = [params.validation] + list(params.references)
            eval_inputs = files
            eval_input_fn = dataset.get_evaluation_input
        else:
            print("Don't evaluate")
            eval_input_fn = None
        '''

        # Add hooks
        train_hooks = [
            tf.train.StopAtStepHook(last_step=params.train_steps),
            # Monitor the loss tensor and stop training if the loss is NaN
            tf.train.NanTensorHook(loss),
            tf.train.LoggingTensorHook(
                {
                    "step": global_step,
                    "loss": loss,
                    "chars": tf.shape(features["chars"]),
                    "source": tf.shape(features["source"]),
                    # "bert": tf.shape(features["bert"]),
                    "lr": learning_rate
                },
                every_n_iter=1),
            tf.train.CheckpointSaverHook(
                checkpoint_dir=params.output,
                save_secs=params.save_checkpoint_secs or None,
                save_steps=params.save_checkpoint_steps or None,
                saver=tf.train.Saver(max_to_keep=params.keep_checkpoint_max,
                                     sharded=False))
        ]

        config = session_config(params)

        '''
        if eval_input_fn is not None:
            train_hooks.append(
                hooks.EvaluationHook(
                    lambda f: search.create_inference_graph(
                        model.get_evaluation_func(), f, params),
                    lambda: eval_input_fn(eval_inputs, params),
                    lambda x: decode_target_ids(x, params),
                    params.output,
                    config,
                    params.keep_top_checkpoint_max,
                    eval_secs=params.eval_secs,
                    eval_steps=params.eval_steps))
        '''

        with tf.train.MonitoredTrainingSession(checkpoint_dir=params.output,
                                               hooks=train_hooks,
                                               save_checkpoint_secs=None,
                                               config=config) as sess:
            while not sess.should_stop():
                utils.session_run(sess, zero_op)
                for i in range(1, params.update_cycle):
                    utils.session_run(sess, collect_op)
                sess.run(train_op)
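

# ---------------------------------------------------------------------------
# NOTE: the gradient-accumulation helpers used above
# (utils.replicate_variables, utils.zero_variables, utils.collect_gradients,
# utils.scale_gradients) are defined elsewhere in the project. The functions
# below are only a minimal, hypothetical sketch of the pattern they implement:
# accumulate per-step gradients into non-trainable copies, then apply the
# scaled sum once per update_cycle. All names and signatures here are
# assumptions, not the project's actual API; dense gradients are assumed
# (IndexedSlices would need extra handling), and the code relies on the
# module-level TensorFlow 1.x import (tensorflow as tf) used by this file.
def _replicate_variables_sketch(variables):
    # One non-trainable accumulator per trainable variable.
    return [tf.Variable(tf.zeros_like(v), trainable=False,
                        name=v.op.name + "/accum") for v in variables]


def _zero_variables_sketch(accumulators):
    # Reset all accumulators at the start of an update cycle.
    return tf.group(*[a.assign(tf.zeros_like(a)) for a in accumulators])


def _collect_gradients_sketch(gradients, accumulators):
    # Add the current mini-batch gradients into the accumulators.
    return tf.group(*[a.assign_add(g)
                      for a, g in zip(accumulators, gradients)])


def _scale_gradients_sketch(accumulators, scale):
    # Average the accumulated gradients before they are applied.
    return [a * scale for a in accumulators]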
def main(args):
    tf.logging.set_verbosity(tf.logging.INFO)
    model_cls = models.get_model(args.model)
    params = default_parameters()

    # Import and override parameters
    # Priorities (low -> high):
    # default -> saved -> command
    params = merge_parameters(params, model_cls.get_parameters())
    params = import_params(args.output, args.model, params)
    override_parameters(params, args)

    # Export all parameters and model specific parameters
    export_params(params.output, "params.json", params)
    export_params(params.output, "%s.json" % args.model,
                  collect_params(params, model_cls.get_parameters()))

    # Build Graph
    with tf.Graph().as_default():
        if not params.record:
            # Build input queue
            # features = dataset.get_training_input(params.input, params)
            features = dataset.get_training_input_contextual(
                params.input, params.context, params)
        else:
            features = record.get_input_features(
                os.path.join(params.record, "*train*"), "train", params)

        features, init_op = cache.cache_features(features,
                                                 params.update_cycle)

        # Build model
        initializer = get_initializer(params)
        model = model_cls(params)

        # Multi-GPU setting
        sharded_losses = parallel.parallel_model(
            model.get_training_func(initializer), features,
            params.device_list)
        loss = tf.add_n(sharded_losses) / len(sharded_losses)

        # Create global step
        global_step = tf.train.get_or_create_global_step()

        # Print parameters
        all_weights = {v.name: v for v in tf.trainable_variables()}
        total_size = 0

        for v_name in sorted(list(all_weights)):
            v = all_weights[v_name]
            tf.logging.info("%s\tshape %s", v.name[:-2].ljust(80),
                            str(v.shape).ljust(20))
            v_size = np.prod(np.array(v.shape.as_list())).tolist()
            total_size += v_size

        tf.logging.info("Total trainable variables size: %d", total_size)

        learning_rate = get_learning_rate_decay(params.learning_rate,
                                                global_step, params)
        learning_rate = tf.convert_to_tensor(learning_rate, dtype=tf.float32)
        tf.summary.scalar("learning_rate", learning_rate)

        # Create optimizer
        if params.optimizer == "Adam":
            opt = tf.train.AdamOptimizer(learning_rate,
                                         beta1=params.adam_beta1,
                                         beta2=params.adam_beta2,
                                         epsilon=params.adam_epsilon)
        elif params.optimizer == "LazyAdam":
            opt = tf.contrib.opt.LazyAdamOptimizer(learning_rate,
                                                   beta1=params.adam_beta1,
                                                   beta2=params.adam_beta2,
                                                   epsilon=params.adam_epsilon)
        else:
            raise RuntimeError("Optimizer %s not supported" % params.optimizer)

        loss, ops = optimize.create_train_op(loss, opt, global_step, params)

        # Validation
        if params.validation and params.references[0]:
            files = [params.validation] + list(params.references)
            eval_inputs = dataset.sort_and_zip_files(files)
            eval_input_fn = dataset.get_evaluation_input
        else:
            eval_input_fn = None

        # Add hooks
        save_vars = tf.trainable_variables() + [global_step]
        saver = tf.train.Saver(
            var_list=save_vars if params.only_save_trainable else None,
            max_to_keep=params.keep_checkpoint_max,
            sharded=False)
        tf.add_to_collection(tf.GraphKeys.SAVERS, saver)

        train_hooks = [
            tf.train.StopAtStepHook(last_step=params.train_steps),
            tf.train.NanTensorHook(loss),
            tf.train.LoggingTensorHook(
                {
                    "step": global_step,
                    "loss": loss,
                },
                every_n_iter=1),
            tf.train.CheckpointSaverHook(
                checkpoint_dir=params.output,
                save_secs=params.save_checkpoint_secs or None,
                save_steps=params.save_checkpoint_steps or None,
                saver=saver)
        ]

        config = session_config(params)

        if eval_input_fn is not None:
            train_hooks.append(
                hooks.EvaluationHook(
                    lambda f: inference.create_inference_graph(
                        [model.get_inference_func()], f, params),
                    lambda: eval_input_fn(eval_inputs, params),
                    lambda x: decode_target_ids(x, params),
                    params.output,
                    config,
                    params.keep_top_checkpoint_max,
                    eval_secs=params.eval_secs,
                    eval_steps=params.eval_steps))

        # Create session, do not use default CheckpointSaverHook
        with tf.train.MonitoredTrainingSession(checkpoint_dir=params.output,
                                               hooks=train_hooks,
                                               save_checkpoint_secs=None,
                                               config=config) as sess:
            while not sess.should_stop():
                # Bypass hook calls
                utils.session_run(sess, [init_op, ops["zero_op"]])
                for i in range(params.update_cycle):
                    utils.session_run(sess, ops["collect_op"])
                utils.session_run(sess, ops["scale_op"])
                sess.run(ops["train_op"])
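

# ---------------------------------------------------------------------------
# NOTE: optimize.create_train_op above returns an ops dictionary that the
# training loop drives as zero -> collect (x update_cycle) -> scale -> train.
# Its implementation is not shown in this file; the function below is only a
# minimal, hypothetical sketch of such a factory under that assumption.
# Names, signatures, and the returned keys are guesses rather than the
# project's actual code, and dense gradients are assumed throughout.
def _create_train_op_sketch(loss, opt, global_step, params):
    grads_and_vars = opt.compute_gradients(loss,
                                           colocate_gradients_with_ops=True)
    gradients = [g for g, _ in grads_and_vars]
    variables = [v for _, v in grads_and_vars]

    # Non-trainable accumulators, one per trainable variable.
    accums = [tf.Variable(tf.zeros_like(v), trainable=False)
              for v in variables]

    zero_op = tf.group(*[a.assign(tf.zeros_like(a)) for a in accums])
    collect_op = tf.group(*[a.assign_add(g)
                            for a, g in zip(accums, gradients)])
    scale_op = tf.group(*[a.assign(a / params.update_cycle) for a in accums])

    # Optional gradient clipping on the accumulated gradients.
    final_grads = [tf.convert_to_tensor(a) for a in accums]
    if isinstance(params.clip_grad_norm or None, float):
        final_grads, _ = tf.clip_by_global_norm(final_grads,
                                                params.clip_grad_norm)

    train_op = opt.apply_gradients(list(zip(final_grads, variables)),
                                   global_step=global_step)

    return loss, {"zero_op": zero_op, "collect_op": collect_op,
                  "scale_op": scale_op, "train_op": train_op}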