def train(self):
    """Trains the model with Keras `fit` in rounds, optionally scoring BLEU.

    Training runs as `train_steps // steps_between_evals` rounds of
    `steps_between_evals` steps each; steps left over by the floor division
    are not run. When both the `bleu_source` and `bleu_ref` flags are set,
    `self.eval()` is called after every round.

    Side effect: `flags_obj.steps_between_evals` is lowered to
    `flags_obj.train_steps` when it is larger.

    Returns:
        The dict produced by `misc.build_stats` for the last round, extended
        with "bleu_uncased" and "bleu_cased" when BLEU scores were computed.

    Raises:
        ValueError: If `train_steps` or `steps_between_evals` is not positive.
    """
    params, flags_obj, is_train = self.params, self.flags_obj, True

    # Fail fast. Without this check a zero step count only surfaced after the
    # model was built, as a ZeroDivisionError or an unbound `history`.
    if flags_obj.train_steps <= 0 or flags_obj.steps_between_evals <= 0:
        raise ValueError(
            "train_steps and steps_between_evals must be positive, got "
            "train_steps={} and steps_between_evals={}.".format(
                flags_obj.train_steps, flags_obj.steps_between_evals))

    _ensure_dir(flags_obj.model_dir)
    if self.distribution_strategy:
        with self.distribution_strategy.scope():
            model = transformer.create_model(params, is_train)
            opt = self._create_optimizer()
            model.compile(opt)
    else:
        model = transformer.create_model(params, is_train)
        opt = self._create_optimizer()
        model.compile(opt)

    model.summary()

    # TODO(guptapriya): Figure out a way to structure input that works in both
    # distributed and non distributed cases.
    train_ds = data_pipeline.train_input_fn(params)
    if not self.distribution_strategy:
        map_data_fn = data_pipeline.map_data_for_transformer_fn
        train_ds = train_ds.map(
            map_data_fn, num_parallel_calls=params["num_parallel_calls"])

    callbacks = self._create_callbacks(flags_obj.model_dir, 0, params)

    if flags_obj.train_steps < flags_obj.steps_between_evals:
        flags_obj.steps_between_evals = flags_obj.train_steps
    iterations = flags_obj.train_steps // flags_obj.steps_between_evals

    cased_score, uncased_score = None, None
    for i in range(1, iterations + 1):
        print("Start train iteration:{}/{}".format(i, iterations))
        history = model.fit(
            train_ds,
            initial_epoch=i - 1,
            epochs=i,
            steps_per_epoch=flags_obj.steps_between_evals,
            callbacks=callbacks,
            # If TimeHistory is enabled, progress bar would be messy. Increase
            # the verbose level to get rid of it.
            verbose=(2 if flags_obj.enable_time_history else 1))
        print("End train iteration:{}/{} global step:{}".format(
            i, iterations, i * flags_obj.steps_between_evals))
        tf.compat.v1.logging.info(
            "Train history: {}".format(history.history))

        if flags_obj.bleu_source and flags_obj.bleu_ref:
            uncased_score, cased_score = self.eval()
            print("BLEU: uncased={}, cased={}".format(
                uncased_score, cased_score))

    # Only the final round's stats are returned, so build them once here
    # rather than on every round.
    stats = misc.build_stats(history, callbacks)
    # Compare against None so that a genuine score of 0.0 is still reported.
    if uncased_score is not None and cased_score is not None:
        stats["bleu_uncased"] = uncased_score
        stats["bleu_cased"] = cased_score
    return stats
def eval(self):
    """Evaluates the model.

    Builds a model with `is_train=False`, loads weights from
    `flags_obj.init_weight_path` through `_load_weights_if_possible`, prints
    the model summary and runs `evaluate_and_log_bleu` on the BLEU source and
    reference files named by the flags.

    Returns:
        Whatever `evaluate_and_log_bleu` returns. The value is passed through
        so that callers which unpack the result of `eval()` receive it
        instead of `None`.
    """
    params, flags_obj, is_train = self.params, self.flags_obj, False

    with tf.name_scope("model"):
        model = transformer.create_model(params, is_train)
        self._load_weights_if_possible(model, flags_obj.init_weight_path)
        model.summary()
    return evaluate_and_log_bleu(
        model, flags_obj.bleu_source, flags_obj.bleu_ref,
        flags_obj.vocab_file)
def test_create_model_not_train(self):
    """Checks the input/output signature of a model built with is_train=False."""
    model = transformer.create_model(self.params, False)
    model_inputs = model.inputs
    model_outputs = model.outputs

    self.assertEqual(len(model_inputs), 1)
    self.assertEqual(len(model_outputs), 2)

    # (tensor, expected static shape, expected dtype), verified in this order.
    expectations = (
        (model_inputs[0], [None, None], tf.int64),
        (model_outputs[0], [None, None], tf.int32),
        (model_outputs[1], [None], tf.float32),
    )
    for tensor, expected_shape, expected_dtype in expectations:
        self.assertEqual(tensor.shape.as_list(), expected_shape)
        self.assertEqual(tensor.dtype, expected_dtype)
def eval(self):
    """Runs BLEU evaluation with weights from the latest checkpoint.

    The prediction model is created on the first call and kept on
    `self.predict_model`; every call reloads weights from the path returned
    by `tf.train.latest_checkpoint(flags_obj.model_dir)`.

    Returns:
        The result of `evaluate_and_log_bleu`.
    """
    flags = self.flags_obj

    if not self.predict_model:
        self.predict_model = transformer.create_model(self.params, False)
    predict_model = self.predict_model

    checkpoint_path = tf.train.latest_checkpoint(flags.model_dir)
    self._load_weights_if_possible(predict_model, checkpoint_path)
    predict_model.summary()

    return evaluate_and_log_bleu(
        predict_model, flags.bleu_source, flags.bleu_ref, flags.vocab_file)
def eval(self):
    """Runs BLEU evaluation with weights from the latest checkpoint.

    Model creation, weight loading and the summary all happen inside the
    scope returned by `distribution_utils.get_strategy_scope`. The strategy
    itself is forwarded to `evaluate_and_log_bleu` only when `self.use_tpu`
    is set; otherwise `None` is passed.

    Returns:
        The result of `evaluate_and_log_bleu`.
    """
    strategy = self.distribution_strategy
    flags = self.flags_obj

    with distribution_utils.get_strategy_scope(strategy):
        if not self.predict_model:
            self.predict_model = transformer.create_model(self.params, False)
        checkpoint_path = tf.train.latest_checkpoint(flags.model_dir)
        self._load_weights_if_possible(self.predict_model, checkpoint_path)
        self.predict_model.summary()

    if self.use_tpu:
        bleu_strategy = strategy
    else:
        bleu_strategy = None
    return evaluate_and_log_bleu(
        self.predict_model,
        self.params,
        flags.bleu_source,
        flags.bleu_ref,
        flags.vocab_file,
        bleu_strategy)
def predict(self):
    """Runs the model on a few evaluation inputs and translates each output.

    A model built with `is_train=False` is loaded from
    `flags_obj.init_weight_path`, applied to `_SINGLE_SAMPLE` items taken from
    the evaluation dataset, and every predicted row is handed to
    `translate.translate_from_input` together with the subtokenizer.
    """
    hparams = self.params
    flags = self.flags_obj

    with tf.name_scope("model"):
        model = transformer.create_model(hparams, False)
        self._load_weights_if_possible(model, flags.init_weight_path)
        model.summary()
    subtokenizer = tokenizer.Subtokenizer(flags.vocab_file)

    # Keep only the first element of each two-element dataset item.
    sample_ds = data_pipeline.eval_input_fn(hparams)
    sample_ds = sample_ds.map(lambda x, y: x).take(_SINGLE_SAMPLE)

    # The second model output is not used here.
    val_outputs, _ = model.predict(sample_ds)
    for output in val_outputs:
        translate.translate_from_input(output, subtokenizer)
def eval(self):
    """Runs BLEU evaluation with weights from the latest checkpoint.

    The prediction model is created once and cached on `self.predict_model`;
    weights are reloaded from the latest checkpoint in `flags_obj.model_dir`
    on every call.

    Returns:
        The result of `evaluate_and_log_bleu`.
    """
    # The model is only created under the distribution strategy scope in the
    # TPU case. With `None`, the scope helper is expected to hand back a
    # no-op DummyContextManager.
    if self.use_tpu:
        strategy = self.distribution_strategy
    else:
        strategy = None

    with distribution_utils.get_strategy_scope(strategy):
        if not self.predict_model:
            self.predict_model = transformer.create_model(self.params, False)
        newest_checkpoint = tf.train.latest_checkpoint(
            self.flags_obj.model_dir)
        self._load_weights_if_possible(self.predict_model, newest_checkpoint)
        self.predict_model.summary()

    flags = self.flags_obj
    return evaluate_and_log_bleu(
        self.predict_model,
        self.params,
        flags.bleu_source,
        flags.bleu_ref,
        flags.vocab_file,
        strategy)
def train(self):
    """Trains the model with Keras `fit` in rounds, optionally scoring BLEU.

    Training runs as `train_steps // steps_between_evals` rounds of
    `steps_between_evals` steps each; steps left over by the floor division
    are not run. When both the `bleu_source` and `bleu_ref` flags are set,
    `self.eval()` is called after every round.

    Side effect: `flags_obj.steps_between_evals` is lowered to
    `flags_obj.train_steps` when it is larger.

    Returns:
        The dict produced by `misc.build_stats` for the last round, extended
        with "bleu_uncased" and "bleu_cased" when BLEU scores were computed.

    Raises:
        ValueError: If `train_steps` or `steps_between_evals` is not positive.
    """
    params, flags_obj, is_train = self.params, self.flags_obj, True

    # Fail fast. Without this check a zero step count only surfaced after the
    # model was built, as a ZeroDivisionError or an unbound `history`.
    if flags_obj.train_steps <= 0 or flags_obj.steps_between_evals <= 0:
        raise ValueError(
            "train_steps and steps_between_evals must be positive, got "
            "train_steps={} and steps_between_evals={}.".format(
                flags_obj.train_steps, flags_obj.steps_between_evals))

    _ensure_dir(flags_obj.model_dir)

    model = transformer.create_model(params, is_train)
    opt = self._create_optimizer()
    model.compile(opt, target_tensors=[])
    model.summary()

    map_data_fn = data_pipeline.map_data_for_transformer_fn
    train_ds = data_pipeline.train_input_fn(params)
    train_ds = train_ds.map(
        map_data_fn, num_parallel_calls=params["num_parallel_calls"])

    callbacks = self._create_callbacks(flags_obj.model_dir, 0, params)

    if flags_obj.train_steps < flags_obj.steps_between_evals:
        flags_obj.steps_between_evals = flags_obj.train_steps
    iterations = flags_obj.train_steps // flags_obj.steps_between_evals

    cased_score, uncased_score = None, None
    for i in range(1, iterations + 1):
        print("Start train iteration:{}/{}".format(i, iterations))
        history = model.fit(
            train_ds,
            initial_epoch=i - 1,
            epochs=i,
            steps_per_epoch=flags_obj.steps_between_evals,
            callbacks=callbacks,
            verbose=2)
        print("End train iteration:{}/{} global step:{}".format(
            i, iterations, i * flags_obj.steps_between_evals))
        tf.compat.v1.logging.info(
            "Train history: {}".format(history.history))

        if flags_obj.bleu_source and flags_obj.bleu_ref:
            uncased_score, cased_score = self.eval()

    # Only the final round's stats are returned, so build them once here
    # rather than on every round.
    stats = misc.build_stats(history, callbacks)
    # Compare against None so that a genuine score of 0.0 is still reported.
    if uncased_score is not None and cased_score is not None:
        stats["bleu_uncased"] = uncased_score
        stats["bleu_cased"] = cased_score
    return stats
def train(self):
    """Trains the model with one Keras `fit` call and saves the result.

    Weights are first loaded from `flags_obj.init_weight_path` via
    `_load_weights_if_possible`. Training starts at `flags_obj.init_epoch`
    (0 when unset) and runs up to `flags_obj.train_epochs`, validating on the
    evaluation dataset. Afterwards the weights and the full model are written
    as HDF5 files into the log directory.
    """
    hparams = self.params
    flags = self.flags_obj

    model = transformer.create_model(hparams, True)
    optimizer = self._create_optimizer()
    model.compile(optimizer, target_tensors=[])
    model.summary()
    self._load_weights_if_possible(model, flags.init_weight_path)

    log_dir = _get_log_dir_or_default(flags)
    _ensure_dir(log_dir)

    def _to_model_inputs(dataset):
        """Applies the transformer input mapping to `dataset`."""
        return dataset.map(
            data_pipeline.map_data_for_transformer_fn,
            num_parallel_calls=hparams["num_parallel_calls"])

    train_ds = _to_model_inputs(data_pipeline.train_input_fn(hparams))
    valid_ds = _to_model_inputs(data_pipeline.eval_input_fn(hparams))

    # The callbacks are told how many steps precede the starting epoch.
    start_epoch = flags.init_epoch or 0
    start_step = start_epoch * flags.steps_per_epoch
    callbacks = self._create_callbacks(log_dir, start_step, hparams)

    fit_options = {
        "initial_epoch": start_epoch,
        "epochs": flags.train_epochs,
        "steps_per_epoch": flags.steps_per_epoch,
        "validation_data": valid_ds,
        "validation_steps": flags.validation_steps,
        "callbacks": callbacks,
    }
    history = model.fit(train_ds, **fit_options)
    tf.compat.v1.logging.info(
        "\nTrain history: {}".format(history.history))

    model.save_weights(os.path.join(log_dir, "saves-model-weights.hdf5"))
    model.save(os.path.join(log_dir, "saves-model.hdf5"))
def train(self):
    """Trains the model, resuming from the latest checkpoint in `model_dir`.

    Training proceeds in rounds of at most `steps_between_evals` steps until
    `train_steps` is reached. With `params["use_ctl"]` the custom
    `train_steps` loop is used and a checkpoint is saved after every round;
    otherwise Keras `model.fit` is used. When both the `bleu_source` and
    `bleu_ref` flags are set, `self.eval()` runs after every round.

    Returns:
        A stats dict: `{"loss": <last round loss>}` for the custom loop, or
        the `misc.build_stats` result for Keras `fit`. "loss" is `None` when
        no round ran because the restored step already reached `train_steps`.
        BLEU scores and their per-round history are added when computed.

    Raises:
        ValueError: If `steps_between_evals` is not positive.
        NotImplementedError: For the custom loop without TPU, or Keras `fit`
            with TPU.
    """
    params = self.params
    flags_obj = self.flags_obj

    # A non-positive interval would divide by zero or never advance
    # `current_step` in the loop below.
    if flags_obj.steps_between_evals <= 0:
        raise ValueError(
            "steps_between_evals must be positive, got {}.".format(
                flags_obj.steps_between_evals))

    # Sets config options.
    keras_utils.set_session_config(enable_xla=flags_obj.enable_xla)

    _ensure_dir(flags_obj.model_dir)
    with distribution_utils.get_strategy_scope(self.distribution_strategy):
        model = transformer.create_model(params, is_train=True)
        opt = self._create_optimizer()

        # Restore from the latest checkpoint, if any, and resume the step
        # counter from the optimizer's iteration count.
        current_step = 0
        checkpoint = tf.train.Checkpoint(model=model, optimizer=opt)
        latest_checkpoint = tf.train.latest_checkpoint(flags_obj.model_dir)
        if latest_checkpoint:
            checkpoint.restore(latest_checkpoint)
            logging.info("Loaded checkpoint %s", latest_checkpoint)
            current_step = opt.iterations.numpy()

        if params["use_ctl"]:
            # The custom loop reports the mean of the losses it records.
            train_loss_metric = tf.keras.metrics.Mean(
                "training_loss", dtype=tf.float32)
        else:
            # Keras path: attach the optimizer to the model.
            model.compile(opt)

    # Print the model structure.
    model.summary()

    if self.use_tpu:
        # Different from experimental_distribute_dataset,
        # experimental_distribute_datasets_from_function requires
        # per-replica/local batch size.
        # NOTE(review): this rewrites `self.params` in place and true division
        # yields a float batch size; confirm that is what the input pipeline
        # and later `eval()` calls expect.
        params["batch_size"] /= self.distribution_strategy.num_replicas_in_sync
        train_ds = (
            self.distribution_strategy
            .experimental_distribute_datasets_from_function(
                lambda ctx: data_pipeline.train_input_fn(params, ctx)))
    else:
        # Parallel sentence pairs, mapped into the model's input format.
        train_ds = data_pipeline.train_input_fn(params)
        map_data_fn = data_pipeline.map_data_for_transformer_fn
        train_ds = train_ds.map(
            map_data_fn, num_parallel_calls=params["num_parallel_calls"])
    if params["use_ctl"]:
        train_ds_iterator = iter(train_ds)

    callbacks = self._create_callbacks(flags_obj.model_dir, 0, params)

    # TODO(b/139418525): Refactor the custom training loop logic.
    @tf.function
    def train_steps(iterator, steps):
        """Training steps function for TPU runs.

        Returns nothing; the loss is recorded in `train_loss_metric`, which
        is reset before every step.

        Args:
            iterator: The input iterator of the training dataset.
            steps: An integer, the number of training steps.
        """

        def _step_fn(inputs):
            """Per-replica step function."""
            inputs, targets = inputs
            with tf.GradientTape() as tape:
                logits = model([inputs, targets], training=True)
                loss = metrics.transformer_loss(
                    logits, targets, params["label_smoothing"],
                    params["vocab_size"])
                # Scales the loss, which results in using the average loss
                # across all of the replicas for backprop.
                scaled_loss = (
                    loss / self.distribution_strategy.num_replicas_in_sync)

            # De-dupes variables due to keras tracking issues.
            tvars = list(
                {id(v): v for v in model.trainable_variables}.values())
            grads = tape.gradient(scaled_loss, tvars)
            opt.apply_gradients(zip(grads, tvars))
            # For reporting, the metric takes the mean of losses.
            train_loss_metric.update_state(loss)

        for _ in tf.range(steps):
            train_loss_metric.reset_states()
            self.distribution_strategy.experimental_run_v2(
                _step_fn, args=(next(iterator),))

    cased_score, uncased_score = None, None
    cased_score_history, uncased_score_history = [], []
    # Bound up front so the stats below are well defined even when the
    # restored step already reaches `train_steps` and no round runs.
    history = None
    train_loss = None
    while current_step < flags_obj.train_steps:
        remaining_steps = flags_obj.train_steps - current_step
        train_steps_per_eval = (
            remaining_steps
            if remaining_steps < flags_obj.steps_between_evals
            else flags_obj.steps_between_evals)
        current_iteration = current_step // flags_obj.steps_between_evals

        logging.info(
            "Start train iteration at global step:{}".format(current_step))
        history = None
        # TPU runs use the train_steps function above; GPU runs can call
        # model.fit() directly.
        if params["use_ctl"]:
            if not self.use_tpu:
                raise NotImplementedError(
                    "Custom training loop on GPUs is not implemented.")

            # Runs training steps.
            train_steps(
                train_ds_iterator,
                tf.convert_to_tensor(train_steps_per_eval, dtype=tf.int32))
            current_step += train_steps_per_eval
            train_loss = train_loss_metric.result().numpy().astype(float)
            logging.info("Train Step: %d/%d / loss = %s",
                         current_step, flags_obj.train_steps, train_loss)

            checkpoint_name = checkpoint.save(
                os.path.join(
                    flags_obj.model_dir,
                    "ctl_step_{}.ckpt".format(current_step)))
            logging.info("Saved checkpoint to %s", checkpoint_name)
        else:
            if self.use_tpu:
                raise NotImplementedError(
                    "Keras model.fit on TPUs is not implemented.")
            history = model.fit(
                train_ds,
                initial_epoch=current_iteration,
                epochs=current_iteration + 1,
                steps_per_epoch=train_steps_per_eval,
                callbacks=callbacks,
                # If TimeHistory is enabled, progress bar would be messy.
                # Increase the verbose level to get rid of it.
                verbose=(2 if flags_obj.enable_time_history else 1))
            current_step += train_steps_per_eval
            logging.info("Train history: {}".format(history.history))

        logging.info(
            "End train iteration at global step:{}".format(current_step))

        if flags_obj.bleu_source and flags_obj.bleu_ref:
            # eval() yields the case-insensitive score first, then the
            # case-sensitive one.
            uncased_score, cased_score = self.eval()
            cased_score_history.append([current_iteration + 1, cased_score])
            uncased_score_history.append(
                [current_iteration + 1, uncased_score])

    if history is None:
        stats = {"loss": train_loss}
    else:
        stats = misc.build_stats(history, callbacks)
    # Compare against None so that a genuine score of 0.0 is still reported.
    if uncased_score is not None and cased_score is not None:
        stats["bleu_uncased"] = uncased_score
        stats["bleu_cased"] = cased_score
        stats["bleu_uncased_history"] = uncased_score_history
        stats["bleu_cased_history"] = cased_score_history
    return stats
def train(self):
    """Trains the model with Keras `fit` in rounds, optionally scoring BLEU.

    Training runs as `train_steps // steps_between_evals` rounds of
    `steps_between_evals` steps each; steps left over by the floor division
    are not run. When both the `bleu_source` and `bleu_ref` flags are set,
    `self.eval()` is called after every round and the scores are appended to
    a per-round history.

    Side effect: `flags_obj.steps_between_evals` is lowered to
    `flags_obj.train_steps` when it is larger.

    Returns:
        The dict produced by `misc.build_stats` for the last round, extended
        with the latest BLEU scores and their `[round, score]` histories when
        BLEU scores were computed.

    Raises:
        ValueError: If `train_steps` or `steps_between_evals` is not positive.
    """
    params, flags_obj, is_train = self.params, self.flags_obj, True

    # Fail fast. Without this check a zero step count only surfaced after the
    # model was built, as a ZeroDivisionError or an unbound `history`.
    if flags_obj.train_steps <= 0 or flags_obj.steps_between_evals <= 0:
        raise ValueError(
            "train_steps and steps_between_evals must be positive, got "
            "train_steps={} and steps_between_evals={}.".format(
                flags_obj.train_steps, flags_obj.steps_between_evals))

    # Sets config options.
    keras_utils.set_session_config(enable_xla=flags_obj.enable_xla)

    _ensure_dir(flags_obj.model_dir)
    if self.distribution_strategy:
        with self.distribution_strategy.scope():
            model = transformer.create_model(params, is_train)
            opt = self._create_optimizer()
            model.compile(opt)
    else:
        model = transformer.create_model(params, is_train)
        opt = self._create_optimizer()
        model.compile(opt)

    model.summary()

    train_ds = data_pipeline.train_input_fn(params)
    map_data_fn = data_pipeline.map_data_for_transformer_fn
    train_ds = train_ds.map(
        map_data_fn, num_parallel_calls=params["num_parallel_calls"])

    callbacks = self._create_callbacks(flags_obj.model_dir, 0, params)

    if flags_obj.train_steps < flags_obj.steps_between_evals:
        flags_obj.steps_between_evals = flags_obj.train_steps
    iterations = flags_obj.train_steps // flags_obj.steps_between_evals

    cased_score, uncased_score = None, None
    cased_score_history, uncased_score_history = [], []
    for i in range(1, iterations + 1):
        print("Start train iteration:{}/{}".format(i, iterations))
        history = model.fit(
            train_ds,
            initial_epoch=i - 1,
            epochs=i,
            steps_per_epoch=flags_obj.steps_between_evals,
            callbacks=callbacks,
            # If TimeHistory is enabled, progress bar would be messy. Increase
            # the verbose level to get rid of it.
            verbose=(2 if flags_obj.enable_time_history else 1))
        print("End train iteration:{}/{} global step:{}".format(
            i, iterations, i * flags_obj.steps_between_evals))
        tf.compat.v1.logging.info(
            "Train history: {}".format(history.history))

        if flags_obj.bleu_source and flags_obj.bleu_ref:
            uncased_score, cased_score = self.eval()
            cased_score_history.append([i, cased_score])
            uncased_score_history.append([i, uncased_score])

    # Only the final round's stats are returned, so build them once here
    # rather than on every round.
    stats = misc.build_stats(history, callbacks)
    # Compare against None so that a genuine score of 0.0 is still reported.
    if uncased_score is not None and cased_score is not None:
        stats["bleu_uncased"] = uncased_score
        stats["bleu_cased"] = cased_score
        stats["bleu_uncased_history"] = uncased_score_history
        stats["bleu_cased_history"] = cased_score_history
    return stats
def train(self):
    """Trains the model in rounds, optionally scoring BLEU after each one.

    Training runs as `train_steps // steps_between_evals` rounds of
    `steps_between_evals` steps each; steps left over by the floor division
    are not run. With `params["use_ctl"]` the custom `train_steps` loop is
    used and a checkpoint is saved after every round; otherwise Keras
    `model.fit` is used. On TPU the latest checkpoint in `model_dir` is
    restored before training.

    Side effect: `flags_obj.steps_between_evals` is lowered to
    `flags_obj.train_steps` when it is larger.

    Returns:
        A stats dict: `{"loss": <last round loss>}` for the custom loop, or
        the `misc.build_stats` result for Keras `fit`. BLEU scores and their
        `[round, score]` histories are added when computed.

    Raises:
        ValueError: If `train_steps` or `steps_between_evals` is not positive.
        NotImplementedError: For the custom loop without TPU, or Keras `fit`
            with TPU.
    """
    params = self.params
    flags_obj = self.flags_obj

    # Fail fast. Without this check a zero step count only surfaced after the
    # model was built, as a ZeroDivisionError or an unbound `history`.
    if flags_obj.train_steps <= 0 or flags_obj.steps_between_evals <= 0:
        raise ValueError(
            "train_steps and steps_between_evals must be positive, got "
            "train_steps={} and steps_between_evals={}.".format(
                flags_obj.train_steps, flags_obj.steps_between_evals))

    # Sets config options.
    keras_utils.set_session_config(enable_xla=flags_obj.enable_xla)

    _ensure_dir(flags_obj.model_dir)
    with distribution_utils.get_strategy_scope(self.distribution_strategy):
        model = transformer.create_model(params, is_train=True)
        opt = self._create_optimizer()

        if params["use_ctl"]:
            train_loss_metric = tf.keras.metrics.Mean(
                "training_loss", dtype=tf.float32)
        else:
            model.compile(opt)

    model.summary()

    train_ds = data_pipeline.train_input_fn(params)
    if self.use_tpu:
        if params["is_tpu_pod"]:
            # The distribution API calls the dataset function with a
            # tf.distribute.InputContext, so it must accept one argument even
            # though the context is not used here.
            train_ds = (
                self.distribution_strategy
                .experimental_distribute_datasets_from_function(
                    lambda _ctx: data_pipeline.train_input_fn(params)))
        else:
            train_ds = (
                self.distribution_strategy
                .experimental_distribute_dataset(train_ds))
    else:
        map_data_fn = data_pipeline.map_data_for_transformer_fn
        train_ds = train_ds.map(
            map_data_fn, num_parallel_calls=params["num_parallel_calls"])

    callbacks = self._create_callbacks(flags_obj.model_dir, 0, params)

    # TODO(b/139418525): Refactor the custom training loop logic.
    @tf.function
    def train_steps(iterator, steps):
        """Training steps function for TPU runs.

        Returns nothing; the loss is recorded in `train_loss_metric`, which
        is reset before every step.

        Args:
            iterator: The input iterator of the training dataset.
            steps: An integer, the number of training steps.
        """

        def _step_fn(inputs):
            """Per-replica step function."""
            inputs, targets = inputs
            with tf.GradientTape() as tape:
                logits = model([inputs, targets], training=True)
                loss = metrics.transformer_loss(
                    logits, targets, params["label_smoothing"],
                    params["vocab_size"])
                # Scales the loss, which results in using the average loss
                # across all of the replicas for backprop.
                scaled_loss = (
                    loss / self.distribution_strategy.num_replicas_in_sync)

            # De-dupes variables due to keras tracking issues.
            tvars = list(
                object_identity.ObjectIdentitySet(model.trainable_variables))
            grads = tape.gradient(scaled_loss, tvars)
            opt.apply_gradients(zip(grads, tvars))
            # For reporting, the metric takes the mean of losses.
            train_loss_metric.update_state(loss)

        for _ in tf.range(steps):
            train_loss_metric.reset_states()
            self.distribution_strategy.experimental_run_v2(
                _step_fn, args=(next(iterator),))

    if self.use_tpu:
        checkpoint = tf.train.Checkpoint(model=model, optimizer=opt)
        latest_checkpoint = tf.train.latest_checkpoint(flags_obj.model_dir)
        if latest_checkpoint:
            checkpoint.restore(latest_checkpoint)
            logging.info("Loaded checkpoint %s", latest_checkpoint)

    if flags_obj.train_steps < flags_obj.steps_between_evals:
        flags_obj.steps_between_evals = flags_obj.train_steps
    iterations = flags_obj.train_steps // flags_obj.steps_between_evals

    if params["use_ctl"]:
        # One iterator for the whole run, so each round continues where the
        # previous one stopped instead of restarting the input pipeline.
        train_ds_iterator = iter(train_ds)

    cased_score, uncased_score = None, None
    cased_score_history, uncased_score_history = [], []
    for i in range(1, iterations + 1):
        print("Start train iteration:{}/{}".format(i, iterations))
        history = None
        if params["use_ctl"]:
            if not self.use_tpu:
                raise NotImplementedError(
                    "Custom training loop on GPUs is not implemented.")
            train_steps_per_eval = tf.convert_to_tensor(
                flags_obj.steps_between_evals, dtype=tf.int32)

            # Runs training steps.
            train_steps(train_ds_iterator, train_steps_per_eval)
            train_loss = train_loss_metric.result().numpy().astype(float)
            logging.info("Train Step: %d/%d / loss = %s",
                         i * flags_obj.steps_between_evals,
                         flags_obj.train_steps, train_loss)

            checkpoint_name = checkpoint.save(
                os.path.join(
                    flags_obj.model_dir,
                    "ctl_step_{}.ckpt".format(
                        i * flags_obj.steps_between_evals)))
            logging.info("Saved checkpoint to %s", checkpoint_name)
        else:
            if self.use_tpu:
                raise NotImplementedError(
                    "Keras model.fit on TPUs is not implemented.")
            history = model.fit(
                train_ds,
                initial_epoch=i - 1,
                epochs=i,
                steps_per_epoch=flags_obj.steps_between_evals,
                callbacks=callbacks,
                # If TimeHistory is enabled, progress bar would be messy.
                # Increase the verbose level to get rid of it.
                verbose=(2 if flags_obj.enable_time_history else 1))
            logging.info("Train history: {}".format(history.history))

        print("End train iteration:{}/{} global step:{}".format(
            i, iterations, i * flags_obj.steps_between_evals))

        if flags_obj.bleu_source and flags_obj.bleu_ref:
            uncased_score, cased_score = self.eval()
            cased_score_history.append([i, cased_score])
            uncased_score_history.append([i, uncased_score])

    if history is None:
        stats = {"loss": train_loss}
    else:
        stats = misc.build_stats(history, callbacks)
    # Compare against None so that a genuine score of 0.0 is still reported.
    if uncased_score is not None and cased_score is not None:
        stats["bleu_uncased"] = uncased_score
        stats["bleu_cased"] = cased_score
        stats["bleu_uncased_history"] = uncased_score_history
        stats["bleu_cased_history"] = cased_score_history
    return stats