def CreateOptimizerParameters(self, learning_rate):
  """Returns SGD optimization parameters for the TPU embedding layer.

  Args:
    learning_rate: Learning rate to use for the SGD optimizer.

  Returns:
    A `tpu_embedding_lib.StochasticGradientDescentParameters` configured from
    this layer's params (weight clipping and weight decay settings).
  """
  opts = self.params
  # Collect keyword arguments first so the constructor call stays readable.
  sgd_kwargs = dict(
      learning_rate=learning_rate,
      clip_weight_min=opts.clip_weight_min,
      clip_weight_max=opts.clip_weight_max,
      weight_decay_factor=opts.weight_decay_factor,
      multiply_weight_decay_factor_by_learning_rate=(
          opts.multiply_weight_decay_factor_by_learning_rate))
  return tpu_embedding_lib.StochasticGradientDescentParameters(**sgd_kwargs)
def __init__(self, params):
  """Initializes the layer and precomputes its TPU embedding SGD parameters."""
  super().__init__(params)
  p = self.params
  # Bind the constructor to a short local name to keep the call compact.
  sgd_params_cls = tpu_embedding_lib.StochasticGradientDescentParameters
  self._tpu_embedding_optimizer_parameters = sgd_params_cls(
      learning_rate=p.learning_rate,
      clip_weight_min=p.clip_weight_min,
      clip_weight_max=p.clip_weight_max,
      weight_decay_factor=p.weight_decay_factor,
      multiply_weight_decay_factor_by_learning_rate=(
          p.multiply_weight_decay_factor_by_learning_rate))
def _get_tpu_embedding_optimization_parameters(embedding_config_spec): """Get tpu_embedding._OptimizationParameters from EmbeddingConfigSpec.""" if embedding_config_spec.optimizer_type == 'adagrad': return tpu_embedding.AdagradParameters( embedding_config_spec.learning_rate, embedding_config_spec.adagrad_initial_accumulator, embedding_config_spec.use_gradient_accumulation) elif embedding_config_spec.optimizer_type == 'sgd': return tpu_embedding.StochasticGradientDescentParameters( embedding_config_spec.learning_rate, embedding_config_spec.use_gradient_accumulation) elif embedding_config_spec.optimizer_type == 'adam': return tpu_embedding.AdamParameters( embedding_config_spec.learning_rate, embedding_config_spec.adam_parameters.beta1, embedding_config_spec.adam_parameters.beta2, embedding_config_spec.adam_parameters.epsilon, use_gradient_accumulation=embedding_config_spec. use_gradient_accumulation) else: raise ValueError( 'optimizer_type must be adagrad or sgd or adam for now.')
def run_model(params, eval_init_fn=None, eval_finish_fn=None,
              run_finish_fn=None):
  """Run the DLRM model, using a pre-defined configuration.

  Args:
    params: HPTuner object that provides new params for the trial.
    eval_init_fn: Lambda to run at start of eval. None means use the default.
    eval_finish_fn: Lambda for end of eval. None means use the default.
    run_finish_fn: Lambda for end of execution. None means use the default.

  Returns:
    A list of tuples, each entry describing the eval metric for one eval.
    Each tuple entry is (global_step, metric_value).
  """
  # MLPerf compliance logging: record the run hyperparameters before
  # initialization work begins.
  mlp_log.mlperf_print(key="cache_clear", value=True)
  mlp_log.mlperf_print(key="init_start", value=None)
  mlp_log.mlperf_print("global_batch_size", params["batch_size"])
  mlp_log.mlperf_print("train_samples", _NUM_TRAIN_EXAMPLES)
  mlp_log.mlperf_print("eval_samples", _NUM_EVAL_EXAMPLES)
  # Linear learning-rate scaling relative to a reference batch size of 2048.
  adjusted_lr = params["learning_rate"] * (params["batch_size"] / 2048.0)
  mlp_log.mlperf_print("opt_base_learning_rate", adjusted_lr)
  mlp_log.mlperf_print("sgd_opt_base_learning_rate", adjusted_lr)
  mlp_log.mlperf_print("sgd_opt_learning_rate_decay_poly_power", 2)
  mlp_log.mlperf_print("sgd_opt_learning_rate_decay_steps",
                       params["decay_steps"])
  mlp_log.mlperf_print("lr_decay_start_steps", params["decay_start_step"])
  mlp_log.mlperf_print("opt_learning_rate_warmup_steps",
                       params["lr_warmup_steps"])

  # Used for vizier. List of tuples. Each entry is (global_step, auc_metric).
  eval_metrics = [(0, 0.0)]

  feature_config = fc.FeatureConfig(params)
  (feature_to_config_dict,
   table_to_config_dict) = feature_config.get_feature_tbl_config()
  # Both optimizer variants are constructed eagerly; the one selected by
  # params["optimizer"] ("sgd" or "adagrad") is passed to TPUEmbedding.
  opt_params = {
      "sgd":
          tpu_embedding.StochasticGradientDescentParameters(
              learning_rate=params["learning_rate"]),
      "adagrad":
          tpu_embedding.AdagradParameters(
              learning_rate=params["learning_rate"],
              initial_accumulator=params["adagrad_init_accum"])
  }
  embedding = tpu_embedding.TPUEmbedding(
      table_to_config_dict,
      feature_to_config_dict,
      params["batch_size"],
      mode=tpu_embedding.TRAINING,
      optimization_parameters=opt_params[params["optimizer"]],
      partition_strategy="mod",
      pipeline_execution_with_tensor_core=FLAGS.pipeline_execution,
      master=FLAGS.master)

  runner = dlrm_embedding_runner.DLRMEmbeddingRunner(
      iterations_per_loop=FLAGS.steps_between_evals,
      train_steps=FLAGS.train_steps,
      eval_steps=FLAGS.eval_steps,
      num_replicas=FLAGS.num_tpu_shards,
      sparse_features_key="cat-features",
      embedding=embedding)

  train_input_fn, eval_input_fn = get_input_fns(params, feature_config)

  runner.initialize(
      train_input_fn,
      eval_input_fn,
      functools.partial(dlrm.dlrm_llr_model_fn, params, feature_config),
      params["batch_size"],
      params["eval_batch_size"],
      train_has_labels=False,
      eval_has_labels=False)

  mlp_log.mlperf_print("init_stop", None)
  mlp_log.mlperf_print("run_start", None)

  def _default_eval_init_fn(cur_step):
    """Logging statements executed before every eval."""
    eval_num = 0
    if FLAGS.steps_between_evals:
      eval_num = cur_step // FLAGS.steps_between_evals
    tf.logging.info("== Block {}. Step {} of {}".format(
        eval_num + 1, cur_step, FLAGS.train_steps))
    mlp_log.mlperf_print(
        "block_start",
        None,
        metadata={
            "first_epoch_num": eval_num + 1,
            "epoch_count": 1
        })
    mlp_log.mlperf_print(
        "eval_start", None, metadata={"epoch_num": eval_num + 1})

  def _default_eval_finish_fn(cur_step, eval_output, summary_writer=None):
    # Computes the eval metric from eval_output, emits MLPerf eval logging,
    # and returns True iff the accuracy threshold has been reached.
    eval_num = 0
    if FLAGS.steps_between_evals:
      eval_num = cur_step // FLAGS.steps_between_evals
    mlp_log.mlperf_print(
        "eval_stop", None, metadata={"epoch_num": eval_num + 1})
    mlp_log.mlperf_print(
        "block_stop", None, metadata={"first_epoch_num": eval_num + 1})
    tf.logging.info(
        "== Eval finished (step {}). Computing metric..".format(cur_step))
    # eval_output["results"] is reshaped to (N, 2): column 0 holds
    # predictions (float32) and column 1 holds integer targets.
    results_np = np.array(eval_output["results"])
    results_np = np.reshape(results_np, (-1, 2))
    predictions_np = results_np[:, 0].astype(np.float32)
    targets_np = results_np[:, 1].astype(np.int32)
    # TODO: Fix roc clif in cloud.
    # roc_obj = roc_metrics.RocMetrics(predictions_np, targets_np)
    # roc_auc = roc_obj.ComputeRocAuc()
    # NOTE(review): with roc_auc hard-coded to 0.0, `success` below is False
    # unless _ACCURACY_THRESH <= 0 — eval results are logged but the run
    # cannot terminate early on accuracy until the ROC metric is restored.
    roc_auc = 0.0
    tf.logging.info("== Eval shape: {}. AUC = {:.4f}".format(
        predictions_np.shape, roc_auc))
    success = roc_auc >= _ACCURACY_THRESH
    mlp_log.mlperf_print(
        "eval_accuracy", roc_auc, metadata={"epoch_num": eval_num + 1})
    if success:
      mlp_log.mlperf_print("run_stop", None, metadata={"status": "success"})
    if summary_writer:
      summary_writer.add_summary(
          utils.create_scalar_summary("auc", roc_auc),
          global_step=cur_step + FLAGS.steps_between_evals)
    # Record the metric for vizier; closes over eval_metrics above.
    eval_metrics.append((cur_step + FLAGS.steps_between_evals, roc_auc))
    return success

  def _default_run_finish_fn(success_status):
    # Emits the failure status (success is logged in the eval callback) and
    # copies embedding variables off the TPU before the run ends.
    if not success_status:
      mlp_log.mlperf_print("run_stop", None, metadata={"status": "failure"})
    tf.logging.info("Retrieving embedding vars and writing stats.")
    runner.retrieve_embedding_vars()

  # Callers may override any of the three callbacks; None selects the
  # defaults defined above.
  runner.train_and_eval(
      eval_init_fn=eval_init_fn or _default_eval_init_fn,
      eval_finish_fn=eval_finish_fn or _default_eval_finish_fn,
      run_finish_fn=run_finish_fn or _default_run_finish_fn)

  return eval_metrics