def _build_train_spec(self):
  train_hooks = [
      hooks.LogParametersCountHook(),
      hooks.CountersHook(
          every_n_steps=self._estimator.config.save_summary_steps,
          output_dir=self._estimator.model_dir)]

  train_spec = tf.estimator.TrainSpec(
      input_fn=self._model.input_fn(
          tf.estimator.ModeKeys.TRAIN,
          self._config["train"]["batch_size"],
          self._config["data"],
          self._config["data"]["train_features_file"],
          labels_file=self._config["data"]["train_labels_file"],
          batch_type=self._config["train"].get("batch_type", "examples"),
          batch_multiplier=self._num_devices,
          bucket_width=self._config["train"].get("bucket_width", 5),
          single_pass=self._config["train"].get("single_pass", False),
          num_threads=self._config["train"].get("num_threads"),
          sample_buffer_size=self._config["train"].get("sample_buffer_size", 500000),
          prefetch_buffer_size=self._config["train"].get("prefetch_buffer_size"),
          maximum_features_length=self._config["train"].get("maximum_features_length"),
          maximum_labels_length=self._config["train"].get("maximum_labels_length")),
      max_steps=self._config["train"].get("train_steps"),
      hooks=train_hooks)
  return train_spec

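# Example (sketch, not from the original source): the configuration keys
# read by _build_train_spec() above, with hypothetical file paths. Keys
# accessed with .get() are optional and shown here with their defaults;
# unlisted optional keys (num_threads, prefetch_buffer_size, the
# maximum_*_length limits) default to None.
_EXAMPLE_TRAIN_CONFIG = {
    "data": {
        "train_features_file": "data/train.src",  # placeholder path
        "train_labels_file": "data/train.tgt",    # placeholder path
    },
    "train": {
        "batch_size": 64,              # required
        "batch_type": "examples",      # default: "examples"
        "bucket_width": 5,             # default: 5
        "single_pass": False,          # default: False
        "sample_buffer_size": 500000,  # default: 500000
        "train_steps": 100000,         # optional
    },
}
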
def _model_fn(features, labels, params, mode, config):
  """model_fn implementation."""
  if mode == tf.estimator.ModeKeys.TRAIN:
    counters = self._register_word_counters(features, labels)
    counters_hook = hooks.CountersHook(
        every_n_steps=config.save_summary_steps,
        output_dir=config.model_dir,
        counters=counters)

    features_shards = dispatcher.shard(features)
    labels_shards = dispatcher.shard(labels)

    with tf.variable_scope(self.name, initializer=self._initializer(params)):
      losses_shards = dispatcher(
          _loss_op, features_shards, labels_shards, params, mode, config)

    loss = _extract_loss(losses_shards)
    train_op = optimize(loss, params, mixed_precision=(self.dtype == tf.float16))

    return tf.estimator.EstimatorSpec(
        mode,
        loss=loss,
        train_op=train_op,
        training_hooks=[counters_hook])
  elif mode == tf.estimator.ModeKeys.EVAL:
    with tf.variable_scope(self.name):
      logits, predictions = self._build(features, labels, params, mode, config=config)
      loss = self._compute_loss(features, labels, logits, params, mode)
      loss = _extract_loss(loss)
      eval_metric_ops = self._compute_metrics(features, labels, predictions)

    evaluation_hooks = []
    if predictions is not None and eval_prediction_hooks_fn is not None:
      evaluation_hooks.extend(eval_prediction_hooks_fn(predictions))

    return tf.estimator.EstimatorSpec(
        mode,
        loss=loss,
        eval_metric_ops=eval_metric_ops,
        evaluation_hooks=evaluation_hooks)
  elif mode == tf.estimator.ModeKeys.PREDICT:
    with tf.variable_scope(self.name):
      _, predictions = self._build(features, labels, params, mode, config=config)

    export_outputs = {}
    export_outputs[tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY] = (
        tf.estimator.export.PredictOutput(predictions))

    return tf.estimator.EstimatorSpec(
        mode,
        predictions=predictions,
        export_outputs=export_outputs)
  else:
    raise RuntimeError("Invalid mode")

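# Sketch (not from the original source): the PREDICT branch above exports
# the predictions under DEFAULT_SERVING_SIGNATURE_DEF_KEY, i.e. the
# "serving_default" signature that TensorFlow Serving looks up by default.
# A model exported this way can be restored with the standard TF1
# SavedModel loader; export_dir is a placeholder for the directory the
# exporter writes.
def _load_exported_model(export_dir):
  """Restores a SavedModel exported via the PREDICT branch above."""
  session = tf.Session(graph=tf.Graph())
  with session.graph.as_default():
    tf.saved_model.loader.load(
        session, [tf.saved_model.tag_constants.SERVING], export_dir)
  return session
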
def _build_train_spec(self):
  train_hooks = [
      hooks.LogParametersCountHook(),
      hooks.CountersHook(
          every_n_steps=self._estimator.config.save_summary_steps,
          output_dir=self._estimator.model_dir)]

  default_sample_buffer_size = 1000000
  if "sample_buffer_size" not in self._config["train"]:
    tf.logging.warn(
        "You did not set sample_buffer_size. By default, the training "
        "dataset is shuffled in chunks of %d examples. If your dataset is "
        "larger than this value and eval_delay is shorter than the "
        "training time of one epoch, a section of the dataset will be "
        "discarded. Consider setting sample_buffer_size to the size of "
        "your dataset." % default_sample_buffer_size)

  train_spec = tf.estimator.TrainSpec(
      input_fn=self._model.input_fn(
          tf.estimator.ModeKeys.TRAIN,
          self._config["train"]["batch_size"],
          self._config["data"],
          self._config["data"]["train_features_file"],
          labels_file=self._config["data"]["train_labels_file"],
          batch_type=self._config["train"].get("batch_type", "examples"),
          batch_multiplier=self._num_devices,
          bucket_width=self._config["train"].get("bucket_width", 5),
          single_pass=self._config["train"].get("single_pass", False),
          num_threads=self._config["train"].get("num_threads"),
          sample_buffer_size=self._config["train"].get(
              "sample_buffer_size", default_sample_buffer_size),
          maximum_features_length=self._config["train"].get("maximum_features_length"),
          maximum_labels_length=self._config["train"].get("maximum_labels_length")),
      max_steps=self._config["train"].get("train_steps"),
      hooks=train_hooks)
  return train_spec

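# Sketch of sizing sample_buffer_size to the dataset, as the warning in
# _build_train_spec() above suggests. Assumes one training example per
# line of the features file; this helper is illustrative and not part of
# the original source.
def _count_examples(path):
  """Counts the number of lines (examples) in a text file."""
  with open(path, "rb") as f:
    return sum(1 for _ in f)

# Usage (hypothetical):
#   config["train"]["sample_buffer_size"] = _count_examples(
#       config["data"]["train_features_file"])
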
def train(estimator, model, config):
  """Runs training.

  Args:
    estimator: A `tf.estimator.Estimator`.
    model: A `opennmt.models.Model`.
    config: The configuration.
  """
  batch_size = config["train"]["batch_size"]
  prefetch_buffer_size = config["train"].get("prefetch_buffer_size", batch_size * 1000)
  num_parallel_process_calls = config["train"].get(
      "num_parallel_process_calls", multiprocessing.cpu_count())

  train_hooks = [
      hooks.LogParametersCountHook(),
      hooks.CountersHook(
          every_n_steps=estimator.config.save_summary_steps,
          output_dir=estimator.model_dir)]

  eval_hooks = []
  if config["train"].get("save_eval_predictions", False):
    save_path = os.path.join(estimator.model_dir, "eval")
    if not os.path.isdir(save_path):
      os.makedirs(save_path)
    eval_hooks.append(
        hooks.SaveEvaluationPredictionHook(
            model,
            os.path.join(save_path, "predictions.txt"),
            post_evaluation_fn=external_evaluation_fn(
                config["train"].get("external_evaluators"),
                config["data"]["eval_labels_file"],
                output_dir=estimator.model_dir)))
  elif config["train"].get("external_evaluators") is not None:
    tf.logging.warning(
        "External evaluators only work when save_eval_predictions is enabled.")

  train_spec = tf.estimator.TrainSpec(
      input_fn=model.input_fn(
          tf.estimator.ModeKeys.TRAIN,
          batch_size,
          prefetch_buffer_size,
          num_parallel_process_calls,
          config["data"],
          config["data"]["train_features_file"],
          labels_file=config["data"]["train_labels_file"],
          num_buckets=config["train"].get("num_buckets", 5),
          sample_buffer_size=config["train"].get("sample_buffer_size", 1000000),
          maximum_features_length=config["train"].get("maximum_features_length"),
          maximum_labels_length=config["train"].get("maximum_labels_length")),
      max_steps=config["train"].get("train_steps"),
      hooks=train_hooks)

  eval_spec = tf.estimator.EvalSpec(
      input_fn=model.input_fn(
          tf.estimator.ModeKeys.EVAL,
          batch_size,
          prefetch_buffer_size,
          num_parallel_process_calls,
          config["data"],
          config["data"]["eval_features_file"],
          labels_file=config["data"]["eval_labels_file"]),
      steps=None,
      hooks=eval_hooks,
      exporters=tf.estimator.LatestExporter("latest", model.serving_input_fn(config["data"])),
      throttle_secs=config["train"].get("eval_delay", 18000))

  tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)

def train(estimator, model, config):
  """Runs training.

  Args:
    estimator: A `tf.estimator.Estimator`.
    model: A `opennmt.models.Model`.
    config: The configuration.
  """
  if "eval" not in config:
    config["eval"] = {}

  train_hooks = [
      hooks.LogParametersCountHook(),
      hooks.CountersHook(
          every_n_steps=estimator.config.save_summary_steps,
          output_dir=estimator.model_dir)]

  eval_hooks = []
  if (config["eval"].get("save_eval_predictions", False)
      or config["eval"].get("external_evaluators") is not None):
    save_path = os.path.join(estimator.model_dir, "eval")
    if not os.path.isdir(save_path):
      os.makedirs(save_path)
    eval_hooks.append(
        hooks.SaveEvaluationPredictionHook(
            model,
            os.path.join(save_path, "predictions.txt"),
            post_evaluation_fn=external_evaluation_fn(
                config["eval"].get("external_evaluators"),
                config["data"]["eval_labels_file"],
                output_dir=estimator.model_dir)))

  default_sample_buffer_size = 1000000
  if "sample_buffer_size" not in config["train"]:
    tf.logging.warn(
        "You did not set sample_buffer_size. By default, the training "
        "dataset is shuffled in chunks of %d examples. If your dataset is "
        "larger than this value and eval_delay is shorter than the "
        "training time of one epoch, a section of the dataset will be "
        "discarded. Consider setting sample_buffer_size to the size of "
        "your dataset." % default_sample_buffer_size)

  train_batch_size = config["train"]["batch_size"]
  train_batch_type = config["train"].get("batch_type", "examples")
  train_prefetch_buffer_size = config["train"].get(
      "prefetch_buffer_size",
      train_batch_size * (1000 if train_batch_type == "examples" else 50))
  train_num_parallel_process_calls = config["train"].get(
      "num_parallel_process_calls", multiprocessing.cpu_count())
  train_spec = tf.estimator.TrainSpec(
      input_fn=model.input_fn(
          tf.estimator.ModeKeys.TRAIN,
          train_batch_size,
          train_prefetch_buffer_size,
          train_num_parallel_process_calls,
          config["data"],
          config["data"]["train_features_file"],
          labels_file=config["data"]["train_labels_file"],
          batch_type=train_batch_type,
          bucket_width=config["train"].get("bucket_width", 5),
          sample_buffer_size=config["train"].get(
              "sample_buffer_size", default_sample_buffer_size),
          maximum_features_length=config["train"].get("maximum_features_length"),
          maximum_labels_length=config["train"].get("maximum_labels_length")),
      max_steps=config["train"].get("train_steps"),
      hooks=train_hooks)

  eval_batch_size = config["eval"].get(
      "batch_size", train_batch_size if train_batch_type == "examples" else 30)
  eval_prefetch_buffer_size = config["eval"].get(
      "prefetch_buffer_size", eval_batch_size * 10)
  eval_num_parallel_process_calls = config["eval"].get(
      "num_parallel_process_calls", train_num_parallel_process_calls)
  eval_spec = tf.estimator.EvalSpec(
      input_fn=model.input_fn(
          tf.estimator.ModeKeys.EVAL,
          eval_batch_size,
          eval_prefetch_buffer_size,
          eval_num_parallel_process_calls,
          config["data"],
          config["data"]["eval_features_file"],
          labels_file=config["data"]["eval_labels_file"]),
      steps=None,
      hooks=eval_hooks,
      exporters=tf.estimator.LatestExporter("latest", model.serving_input_fn(config["data"])),
      throttle_secs=config["eval"].get("eval_delay", 18000))

  tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)

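# Example (sketch, not from the original source): a configuration covering
# the "data", "train", and "eval" sections read by train() above. All
# paths and values are placeholders; optional keys fall back to the
# defaults visible in the .get() calls.
_EXAMPLE_CONFIG = {
    "data": {
        "train_features_file": "data/train.src",
        "train_labels_file": "data/train.tgt",
        "eval_features_file": "data/valid.src",
        "eval_labels_file": "data/valid.tgt",
    },
    "train": {
        "batch_size": 64,
        "batch_type": "examples",
        "sample_buffer_size": 1000000,
    },
    "eval": {
        "batch_size": 32,
        "eval_delay": 3600,               # seconds between evaluations
        "save_eval_predictions": True,
        "external_evaluators": ["BLEU"],  # hypothetical evaluator name
    },
}
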
def _model_fn(features, labels, params, mode, config):
  """model_fn implementation."""
  if mode == tf.estimator.ModeKeys.TRAIN:
    counters = self._register_word_counters(features, labels)
    training_hooks = []
    if config is not None:
      training_hooks.append(
          hooks.CountersHook(
              every_n_steps=config.save_summary_steps,
              output_dir=config.model_dir,
              counters=counters))

    features_shards = dispatcher.shard(features)
    labels_shards = dispatcher.shard(labels)

    with tf.variable_scope(self.name, initializer=self._initializer(params)):
      losses_shards = dispatcher(
          _loss_op, features_shards, labels_shards, params, mode, config)

    loss = _extract_loss(losses_shards)

    # Optionally restrict the optimization to a subset of variables when
    # some network components are frozen.
    freeze_params = params.get("freeze")
    if freeze_params is not None:
      var_list = self._get_variables(freeze_params)
      train_op, extra_variables = optimize_loss(
          loss,
          params,
          mixed_precision=(self.dtype == tf.float16),
          var_list=var_list)
    else:
      train_op, extra_variables = optimize_loss(
          loss, params, mixed_precision=(self.dtype == tf.float16))

    if extra_variables:
      training_hooks.append(hooks.VariablesInitializerHook(extra_variables))

    return tf.estimator.EstimatorSpec(
        mode,
        loss=loss,
        train_op=train_op,
        training_hooks=training_hooks)
  elif mode == tf.estimator.ModeKeys.EVAL:
    with tf.variable_scope(self.name):
      logits, predictions = self._build(features, labels, params, mode, config=config)
      loss = self._compute_loss(features, labels, logits, params, mode)
      loss = _extract_loss(loss)
      eval_metric_ops = self._compute_metrics(features, labels, predictions)  # pylint: disable=assignment-from-none

    evaluation_hooks = []
    if predictions is not None and eval_prediction_hooks_fn is not None:
      evaluation_hooks.extend(eval_prediction_hooks_fn(predictions))

    return tf.estimator.EstimatorSpec(
        mode,
        loss=loss,
        eval_metric_ops=eval_metric_ops,
        evaluation_hooks=evaluation_hooks)
  elif mode == tf.estimator.ModeKeys.PREDICT:
    with tf.variable_scope(self.name):
      _, predictions = self._build(features, labels, params, mode, config=config)

    # Forward the example index so predictions can be reordered.
    if "index" in features:
      predictions["index"] = features["index"]

    export_outputs = {}
    export_outputs[tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY] = (
        tf.estimator.export.PredictOutput(predictions))

    return tf.estimator.EstimatorSpec(
        mode,
        predictions=predictions,
        export_outputs=export_outputs)
  else:
    raise RuntimeError("Invalid mode")

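# Sketch of one way _get_variables() could select the variables to train,
# assuming freeze_params is a list of variable scope prefixes to exclude.
# The actual implementation and the freeze spec format are not shown in
# this snippet, so treat this as an illustration only. Relies on the
# module's existing `import tensorflow as tf`.
def _trainable_variables_excluding(freeze_scopes):
  """Returns trainable variables whose names match none of the prefixes."""
  return [
      variable for variable in tf.trainable_variables()
      if not any(variable.name.startswith(scope) for scope in freeze_scopes)]
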
def _model_fn(features, labels, params, mode, config):
  """model_fn implementation."""
  if mode == tf.estimator.ModeKeys.TRAIN:
    counters = self._register_word_counters(features, labels)
    training_hooks = []
    if config is not None:
      training_hooks.append(hooks.CountersHook(
          every_n_steps=config.save_summary_steps,
          output_dir=config.model_dir,
          counters=counters))

    features_shards = dispatcher.shard(features)
    labels_shards = dispatcher.shard(labels)

    with tf.variable_scope(self.name, initializer=self._initializer(params)):
      losses_shards = dispatcher(
          _loss_op, features_shards, labels_shards, params, mode, config)

    loss = _extract_loss(losses_shards)

    # TODO: the variable list to freeze differs between the LSTM and
    # Transformer architectures, so before selecting variables, check that
    # they exist in the checkpoint and make the selection
    # architecture-dependent.
    # TODO: simplify by always calling optimize_loss() with a var_list
    # argument, passing var_list=None when nothing is frozen.
    freeze_params = params.get("freeze")
    if freeze_params is not None:
      tf.logging.info(
          "Optimizing selected network components: %s", freeze_params)
      var_list = self._get_variables(freeze_params)
      train_op, extra_variables = optimize_loss(
          loss,
          params,
          mixed_precision=(self.dtype == tf.float16),
          var_list=var_list)
    else:
      train_op, extra_variables = optimize_loss(
          loss, params, mixed_precision=(self.dtype == tf.float16))

    if extra_variables:
      training_hooks.append(hooks.VariablesInitializerHook(extra_variables))

    return tf.estimator.EstimatorSpec(
        mode,
        loss=loss,
        train_op=train_op,
        training_hooks=training_hooks)
  elif mode == tf.estimator.ModeKeys.EVAL:
    with tf.variable_scope(self.name):
      logits, predictions = self._build(features, labels, params, mode, config=config)
      loss = self._compute_loss(features, labels, logits, params, mode)
      loss = _extract_loss(loss)
      eval_metric_ops = self._compute_metrics(features, labels, predictions)  # pylint: disable=assignment-from-none

    evaluation_hooks = []
    if predictions is not None and eval_prediction_hooks_fn is not None:
      evaluation_hooks.extend(eval_prediction_hooks_fn(predictions))

    return tf.estimator.EstimatorSpec(
        mode,
        loss=loss,
        eval_metric_ops=eval_metric_ops,
        evaluation_hooks=evaluation_hooks)
  elif mode == tf.estimator.ModeKeys.PREDICT:
    with tf.variable_scope(self.name):
      _, predictions = self._build(features, labels, params, mode, config=config)

    # Forward the example index so predictions can be reordered.
    if "index" in features:
      predictions["index"] = features["index"]

    export_outputs = {}
    export_outputs[tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY] = (
        tf.estimator.export.PredictOutput(predictions))

    return tf.estimator.EstimatorSpec(
        mode,
        predictions=predictions,
        export_outputs=export_outputs)
  else:
    raise RuntimeError("Invalid mode")

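# Sketch of the simplification described by the second TODO above: resolve
# the variable list once and always pass it, assuming optimize_loss()
# interprets var_list=None as "optimize every trainable variable". `model`
# stands for the enclosing model object; this helper is illustrative and
# not part of the original source.
def _resolve_var_list(model, params):
  """Returns the variables to optimize, or None to optimize all of them."""
  freeze_params = params.get("freeze")
  if freeze_params is None:
    return None
  return model._get_variables(freeze_params)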