def _create_tpu_embedding(self, mode):
  """Create tpu_embedding.TPUEmbedding based on mode."""
  if mode == model_fn_lib.ModeKeys.TRAIN:
    batch_size = self._train_batch_size
  else:
    batch_size = self._eval_batch_size

  if mode == model_fn_lib.ModeKeys.TRAIN:
    tpu_embedding_mode = tpu_embedding.TRAINING
  elif (mode == model_fn_lib.ModeKeys.EVAL or
        mode == model_fn_lib.ModeKeys.PREDICT):
    tpu_embedding_mode = tpu_embedding.INFERENCE
  else:
    raise ValueError('Mode {} is not supported.'.format(mode))

  master = (self._run_config.evaluation_master
            if mode == model_fn_lib.ModeKeys.EVAL
            else self._run_config.master)
  cluster_def = (self._run_config.session_config.cluster_def
                 if self._run_config.session_config else None)

  tpu_embedding_ = tpu_embedding.TPUEmbedding(
      self._table_to_config_dict,
      self._feature_to_table_dict,
      batch_size,
      tpu_embedding_mode,
      master,
      self._embedding_config_spec.optimization_parameters,
      cluster_def,
  )
  return tpu_embedding_
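# A minimal sketch (not from the original source) of the two dictionaries this
# constructor consumes. TableConfig comes from the same tpu_embedding module;
# the table name 'video' and feature names 'watched'/'favorited' are
# hypothetical placeholders.
_example_table_to_config_dict = {
    'video': tpu_embedding.TableConfig(vocabulary_size=1000, dimension=64),
}
# In this older API, features map directly to table names (strings) rather
# than to FeatureConfig objects; several features may share one table.
_example_feature_to_table_dict = {
    'watched': 'video',
    'favorited': 'video',
}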
def _CreateLayerVariables(self):
  super()._CreateLayerVariables()
  p = self.params

  load_op_list = []
  retrieve_op_list = []

  # At the feature level, track which are associated with "sequence
  # embeddings".
  self._sequence_features = {}

  if py_utils.use_tpu():
    num_cores = self.cluster.params.worker.tpus_per_replica
    global_batch_size = (
        self.params.batch_size * self.cluster.num_splits_per_client)
    table_to_config_dict = {}
    feature_to_config_dict = {}
    for table in self.tables:
      table_to_config_dict[table.table_name] = table.table_config
      load_op_list += table.load_op_list
      retrieve_op_list += table.retrieve_op_list
      for feature in table.input_keys:
        if table.max_sequence_length > 0:
          self._sequence_features[feature] = True
        feature_to_config_dict[feature] = tpu_embedding_lib.FeatureConfig(
            table.table_name, max_sequence_length=table.max_sequence_length)

    tpu_embedding = self._tpu_embedding_collection.tpu_embedding
    if tpu_embedding:
      self._CheckTPUEmbeddingConfig(tpu_embedding, table_to_config_dict,
                                    feature_to_config_dict, global_batch_size)
      tf.logging.info('TPUEmbedding API singleton already exists, reusing')
      self._tpu_embedding = tpu_embedding
    else:
      tf.logging.info('adding load and retrieve ops to collection.')
      self._tpu_embedding_collection.AddLoadOps(load_op_list)
      self._tpu_embedding_collection.AddRetrieveOps(retrieve_op_list)

      mode = tpu_embedding_lib.TRAINING
      device_config = tpu_embedding_lib.DeviceConfig(
          num_cores=num_cores,
          num_hosts=self.params.tables[0].num_tpu_hosts,
          job_name=self.cluster.params.worker.name)
      self._tpu_embedding = tpu_embedding_lib.TPUEmbedding(
          table_to_config_dict,
          feature_to_config_dict,
          global_batch_size,
          mode,
          master=None,
          pipeline_execution_with_tensor_core=(
              self.params.pipeline_execution_with_tensor_core),
          partition_strategy=p.partition_strategy,
          device_config=device_config)
      self._tpu_embedding_collection.tpu_embedding = self._tpu_embedding
def _CreateLayerVariables(self):
  super()._CreateLayerVariables()

  load_op_list = []
  retrieve_op_list = []

  # At the feature level, track which are associated with "sequence
  # embeddings".
  self._sequence_features = {}

  if py_utils.use_tpu():
    num_cores = self.cluster.params.worker.tpus_per_replica
    global_batch_size = (
        self.params.batch_size * self.cluster.num_splits_per_client)
    table_to_config_dict = {}
    feature_to_config_dict = {}
    for table in self.tables:
      table_to_config_dict[table.table_name] = table.table_config
      load_op_list += table.load_op_list
      retrieve_op_list += table.retrieve_op_list
      for feature in table.input_keys:
        if table.max_sequence_length > 0:
          self._sequence_features[feature] = True
        feature_to_config_dict[feature] = tpu_embedding_lib.FeatureConfig(
            table.table_name, max_sequence_length=table.max_sequence_length)

    tf.logging.info('adding load and retrieve ops to collection.')
    tf.add_to_collection(py_utils.TPU_EMBEDDING_LOAD_OPS, load_op_list)
    tf.add_to_collection(py_utils.TPU_EMBEDDING_RETRIEVE_OPS,
                         retrieve_op_list)

    tpu_embedding_collection = tf.get_collection(py_utils.TPU_EMBEDDING)
    assert len(tpu_embedding_collection) <= 1
    if len(tpu_embedding_collection) == 1:
      tf.logging.info('TPUEmbedding API singleton already exists, reusing')
      self._tpu_embedding = tpu_embedding_collection[0]
    else:
      mode = tpu_embedding_lib.TRAINING
      device_config = tpu_embedding_lib.DeviceConfig(
          num_cores=num_cores,
          num_hosts=self.params.tables[0].num_tpu_hosts,
          job_name=self.cluster.params.worker.name)
      self._tpu_embedding = tpu_embedding_lib.TPUEmbedding(
          table_to_config_dict,
          feature_to_config_dict,
          global_batch_size,
          mode,
          master=None,
          pipeline_execution_with_tensor_core=(
              self.params.pipeline_execution_with_tensor_core),
          device_config=device_config)
      tf.add_to_collection(py_utils.TPU_EMBEDDING, self._tpu_embedding)
def _BuildTpuEmbeddingApi(self):
  # Note: the original flattened snippet declared this without `self` and
  # referenced undefined names `self` and `p`; it is rewritten here as a
  # method with `p = self.params` so the body is self-consistent.
  p = self.params

  load_op_list = []
  retrieve_op_list = []

  num_cores = self.cluster.params.worker.tpus_per_replica
  global_batch_size = (
      self.params.batch_size * self.cluster.num_splits_per_client)
  table_to_config_dict = {}
  feature_to_config_dict = {}
  for table in self.tables:
    table_to_config_dict[table.table_name] = table.table_config
    load_op_list += table.load_op_list
    retrieve_op_list += table.retrieve_op_list
    for feature in table.input_keys:
      feature_to_config_dict[feature] = tpu_embedding_lib.FeatureConfig(
          table.table_name, max_sequence_length=table.max_sequence_length)

  mode = tpu_embedding_lib.TRAINING
  device_config = tpu_embedding_lib.DeviceConfig(
      num_cores=num_cores,
      num_hosts=self.params.tables[0].num_tpu_hosts,
      job_name=self.cluster.params.worker.name)
  tpu_embedding = tpu_embedding_lib.TPUEmbedding(
      table_to_config_dict,
      feature_to_config_dict,
      global_batch_size,
      mode,
      master=None,
      pipeline_execution_with_tensor_core=(
          self.params.pipeline_execution_with_tensor_core),
      partition_strategy=p.partition_strategy,
      device_config=device_config)

  with tf.init_scope():
    dummy_variables, dummy_variables_init = (
        tpu_embedding_gradient.create_dummy_table_variables(tpu_embedding))
  load_op_list += [dummy_variables_init]

  tf.add_to_collection(py_utils.TPU_EMBEDDING, tpu_embedding)
  tf.add_to_collection(py_utils.TPU_EMBEDDING_DUMMY_VARS, dummy_variables)
  tf.add_to_collection(py_utils.TPU_EMBEDDING_LOAD_OPS, load_op_list)
  tf.add_to_collection(py_utils.TPU_EMBEDDING_RETRIEVE_OPS, retrieve_op_list)
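# A minimal usage sketch (an assumption, not part of the original source):
# once _BuildTpuEmbeddingApi has populated the graph collections above, a
# training loop can look up the TPUEmbedding singleton and its load/retrieve
# ops with standard TF1 collection accessors, mirroring the lookup pattern in
# the _CreateLayerVariables variant earlier in this section.
def _example_fetch_embedding_state():
  tpu_embedding = tf.get_collection(py_utils.TPU_EMBEDDING)[0]
  load_ops = tf.get_collection(py_utils.TPU_EMBEDDING_LOAD_OPS)
  retrieve_ops = tf.get_collection(py_utils.TPU_EMBEDDING_RETRIEVE_OPS)
  return tpu_embedding, load_ops, retrieve_ops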
def _CreateLayerVariables(self):
  super()._CreateLayerVariables()
  p = self.params

  # At the feature level, track which are associated with "sequence
  # embeddings".
  self._sequence_features = {}

  if _ShouldUseTpu(p):
    num_cores = self.cluster.params.worker.tpus_per_replica
    global_batch_size = (
        self.params.batch_size * self.cluster.num_splits_per_client)
    table_to_config_dict = {}
    feature_to_config_dict = {}
    for table in self.tables:
      table_to_config_dict[table.table_name] = table.table_config
      for feature in table.input_keys:
        if table.max_sequence_length > 0:
          self._sequence_features[feature] = True
        feature_to_config_dict[feature] = tpu_embedding_lib.FeatureConfig(
            table.table_name, max_sequence_length=table.max_sequence_length)

    tpu_embedding = self._tpu_embedding_collection.tpu_embedding
    if tpu_embedding:
      self._CheckTPUEmbeddingConfig(tpu_embedding, table_to_config_dict,
                                    feature_to_config_dict, global_batch_size)
      tf.logging.info('TPUEmbedding API singleton already exists, reusing')
      self._tpu_embedding = tpu_embedding
    else:
      mode = tpu_embedding_lib.TRAINING
      device_config = tpu_embedding_lib.DeviceConfig(
          num_cores=num_cores,
          num_hosts=self.params.tables[0].num_tpu_hosts,
          job_name=self.cluster.params.worker.name)
      self._tpu_embedding = tpu_embedding_lib.TPUEmbedding(
          table_to_config_dict,
          feature_to_config_dict,
          global_batch_size,
          mode,
          master=None,
          pipeline_execution_with_tensor_core=(
              self.params.pipeline_execution_with_tensor_core),
          partition_strategy=p.partition_strategy,
          device_config=device_config)
      self._tpu_embedding_collection.tpu_embedding = self._tpu_embedding
      self._tpu_embedding_collection.SetGradientMultiplierSchedule(
          self.gradient_multiplier_schedule)
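# Illustrative sketch (hypothetical names, not from the original source) of
# how a sequence feature differs from a plain feature in the
# feature_to_config_dict built above. A feature whose table has
# max_sequence_length > 0 is recorded in self._sequence_features, and its
# lookups keep per-position embedding vectors instead of being combined into
# a single vector.
_example_feature_to_config_dict = {
    # Sequence feature: up to 16 embedding vectors per example.
    'query_tokens': tpu_embedding_lib.FeatureConfig(
        'token_table', max_sequence_length=16),
    # Plain feature: one combined embedding vector per example.
    'user_country': tpu_embedding_lib.FeatureConfig('country_table'),
}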
def _create_tpu_embedding(self, mode):
  """Create tpu_embedding.TPUEmbedding based on mode."""
  if mode == model_fn_lib.ModeKeys.TRAIN:
    batch_size = self._train_batch_size
  else:
    batch_size = self._eval_batch_size

  if mode == model_fn_lib.ModeKeys.TRAIN:
    tpu_embedding_mode = tpu_embedding.TRAINING
    optimization_parameters = (
        self._embedding_config_spec.optimization_parameters)
  elif (mode == model_fn_lib.ModeKeys.EVAL or
        mode == model_fn_lib.ModeKeys.PREDICT):
    tpu_embedding_mode = tpu_embedding.INFERENCE
    optimization_parameters = None
  else:
    raise ValueError('Mode {} is not supported.'.format(mode))

  if self._run_config.cluster:
    master = self._run_config.cluster.master()
    cluster_spec = self._run_config.cluster.cluster_spec()
    cluster_def = cluster_spec.as_cluster_def() if cluster_spec else None
  else:
    master = (self._run_config.evaluation_master
              if mode == model_fn_lib.ModeKeys.EVAL
              else self._run_config.master)
    cluster_def = None
  master_job_name = None
  if self._run_config.tpu_config.tpu_job_name is not None:
    master_job_name = self._run_config.tpu_config.tpu_job_name

  tpu_embedding_ = tpu_embedding.TPUEmbedding(
      self._table_to_config_dict,
      self._feature_to_config_dict,
      batch_size,
      tpu_embedding_mode,
      master,
      optimization_parameters,
      cluster_def,
      pipeline_execution_with_tensor_core=(
          self._embedding_config_spec.pipeline_execution_with_tensor_core),
      partition_strategy=self._partition_strategy,
      profile_data_directory=(
          self._embedding_config_spec.profile_data_directory),
      master_job_name=master_job_name)
  return tpu_embedding_
def _create_tpu_embedding(self, mode):
  """Create tpu_embedding.TPUEmbedding based on mode."""
  if mode == model_fn_lib.ModeKeys.TRAIN:
    batch_size = self._train_batch_size
  else:
    batch_size = self._eval_batch_size

  if mode == model_fn_lib.ModeKeys.TRAIN:
    tpu_embedding_mode = tpu_embedding.TRAINING
  elif (mode == model_fn_lib.ModeKeys.EVAL or
        mode == model_fn_lib.ModeKeys.PREDICT):
    tpu_embedding_mode = tpu_embedding.INFERENCE
  else:
    raise ValueError('Mode {} is not supported.'.format(mode))

  tpu_embedding_ = tpu_embedding.TPUEmbedding(
      self._table_to_config_dict,
      self._feature_to_table_dict,
      batch_size,
      tpu_embedding_mode,
      self._master,
      self._optimization_parameters,
  )
  return tpu_embedding_
def run_model(params, eval_init_fn=None, eval_finish_fn=None,
              run_finish_fn=None):
  """Run the DLRM model, using a pre-defined configuration.

  Args:
    params: HPTuner object that provides new params for the trial.
    eval_init_fn: Lambda to run at start of eval. None means use the default.
    eval_finish_fn: Lambda for end of eval. None means use the default.
    run_finish_fn: Lambda for end of execution. None means use the default.

  Returns:
    A list of tuples, each entry describing the eval metric for one eval. Each
    tuple entry is (global_step, metric_value).
  """
  mlp_log.mlperf_print(key="cache_clear", value=True)
  mlp_log.mlperf_print(key="init_start", value=None)
  mlp_log.mlperf_print("global_batch_size", params["batch_size"])
  mlp_log.mlperf_print("train_samples", _NUM_TRAIN_EXAMPLES)
  mlp_log.mlperf_print("eval_samples", _NUM_EVAL_EXAMPLES)
  adjusted_lr = params["learning_rate"] * (params["batch_size"] / 2048.0)
  mlp_log.mlperf_print("opt_base_learning_rate", adjusted_lr)
  mlp_log.mlperf_print("sgd_opt_base_learning_rate", adjusted_lr)
  mlp_log.mlperf_print("sgd_opt_learning_rate_decay_poly_power", 2)
  mlp_log.mlperf_print("sgd_opt_learning_rate_decay_steps",
                       params["decay_steps"])
  mlp_log.mlperf_print("lr_decay_start_steps", params["decay_start_step"])
  mlp_log.mlperf_print("opt_learning_rate_warmup_steps",
                       params["lr_warmup_steps"])

  # Used for vizier. List of tuples. Each entry is (global_step, auc_metric).
  eval_metrics = [(0, 0.0)]

  feature_config = fc.FeatureConfig(params)
  (feature_to_config_dict,
   table_to_config_dict) = feature_config.get_feature_tbl_config()
  opt_params = {
      "sgd":
          tpu_embedding.StochasticGradientDescentParameters(
              learning_rate=params["learning_rate"]),
      "adagrad":
          tpu_embedding.AdagradParameters(
              learning_rate=params["learning_rate"],
              initial_accumulator=params["adagrad_init_accum"])
  }
  embedding = tpu_embedding.TPUEmbedding(
      table_to_config_dict,
      feature_to_config_dict,
      params["batch_size"],
      mode=tpu_embedding.TRAINING,
      optimization_parameters=opt_params[params["optimizer"]],
      partition_strategy="mod",
      pipeline_execution_with_tensor_core=FLAGS.pipeline_execution,
      master=FLAGS.master)

  runner = dlrm_embedding_runner.DLRMEmbeddingRunner(
      iterations_per_loop=FLAGS.steps_between_evals,
      train_steps=FLAGS.train_steps,
      eval_steps=FLAGS.eval_steps,
      num_replicas=FLAGS.num_tpu_shards,
      sparse_features_key="cat-features",
      embedding=embedding)

  train_input_fn, eval_input_fn = get_input_fns(params, feature_config)

  runner.initialize(
      train_input_fn,
      eval_input_fn,
      functools.partial(dlrm.dlrm_llr_model_fn, params, feature_config),
      params["batch_size"],
      params["eval_batch_size"],
      train_has_labels=False,
      eval_has_labels=False)

  mlp_log.mlperf_print("init_stop", None)
  mlp_log.mlperf_print("run_start", None)

  def _default_eval_init_fn(cur_step):
    """Logging statements executed before every eval."""
    eval_num = 0
    if FLAGS.steps_between_evals:
      eval_num = cur_step // FLAGS.steps_between_evals
    tf.logging.info("== Block {}. Step {} of {}".format(
        eval_num + 1, cur_step, FLAGS.train_steps))
    mlp_log.mlperf_print(
        "block_start",
        None,
        metadata={
            "first_epoch_num": eval_num + 1,
            "epoch_count": 1
        })
    mlp_log.mlperf_print(
        "eval_start", None, metadata={"epoch_num": eval_num + 1})

  def _default_eval_finish_fn(cur_step, eval_output, summary_writer=None):
    eval_num = 0
    if FLAGS.steps_between_evals:
      eval_num = cur_step // FLAGS.steps_between_evals
    mlp_log.mlperf_print(
        "eval_stop", None, metadata={"epoch_num": eval_num + 1})
    mlp_log.mlperf_print(
        "block_stop", None, metadata={"first_epoch_num": eval_num + 1})
    tf.logging.info(
        "== Eval finished (step {}). Computing metric..".format(cur_step))
    results_np = np.array(eval_output["results"])
    results_np = np.reshape(results_np, (-1, 2))
    predictions_np = results_np[:, 0].astype(np.float32)
    targets_np = results_np[:, 1].astype(np.int32)
    # TODO: Fix roc clif in cloud.
    # roc_obj = roc_metrics.RocMetrics(predictions_np, targets_np)
    # roc_auc = roc_obj.ComputeRocAuc()
    roc_auc = 0.0
    tf.logging.info("== Eval shape: {}. AUC = {:.4f}".format(
        predictions_np.shape, roc_auc))
    success = roc_auc >= _ACCURACY_THRESH
    mlp_log.mlperf_print(
        "eval_accuracy", roc_auc, metadata={"epoch_num": eval_num + 1})
    if success:
      mlp_log.mlperf_print("run_stop", None, metadata={"status": "success"})
    if summary_writer:
      summary_writer.add_summary(
          utils.create_scalar_summary("auc", roc_auc),
          global_step=cur_step + FLAGS.steps_between_evals)
    eval_metrics.append((cur_step + FLAGS.steps_between_evals, roc_auc))
    return success

  def _default_run_finish_fn(success_status):
    if not success_status:
      mlp_log.mlperf_print("run_stop", None, metadata={"status": "failure"})
    tf.logging.info("Retrieving embedding vars and writing stats.")
    runner.retrieve_embedding_vars()

  runner.train_and_eval(
      eval_init_fn=eval_init_fn or _default_eval_init_fn,
      eval_finish_fn=eval_finish_fn or _default_eval_finish_fn,
      run_finish_fn=run_finish_fn or _default_run_finish_fn)

  return eval_metrics