def test_sok_dense_demo(args, init_tensors, *random_samples):
    """Train SOKDenseDemo with tf.distribute.MultiWorkerMirroredStrategy.

    Builds a multi-worker TF_CONFIG from ``args.ips``/``args.worker_num``,
    initializes SOK inside the strategy scope, optionally restores or loads
    embedding values, runs one pass over the dataset, and optionally dumps
    the embedding parameters to disk.

    Args:
        args: parsed CLI namespace (batch sizes, optimizer name, worker
            topology, save/restore flags, mixed-precision flag, ...).
        init_tensors: initial embedding values fed to ``sok.Saver``.
        *random_samples: raw sample arrays forwarded to ``utils.tf_dataset``.

    Returns:
        Tuple of (list of per-step embedding vectors, embedding variable name).
    """
    # Each worker listens on a distinct port derived from its index.
    port = 12345
    os.environ["TF_CONFIG"] = json.dumps({
        "cluster": {
            "worker": [args.ips[i] + ":" + str(port + i)
                       for i in range(args.worker_num)]
        },
        "task": {"type": "worker", "index": args.task_id}
    })
    strategy = tf.distribute.MultiWorkerMirroredStrategy()
    # NOTE(review): model/optimizer construction is placed inside the strategy
    # scope, the saver and training loop outside — matching the sibling
    # test_sok_demo; confirm against the original (pre-mangling) layout.
    with strategy.scope():
        sok.Init(global_batch_size=args.global_batch_size)

        sok_dense_demo = SOKDenseDemo(
            max_vocabulary_size_per_gpu=args.max_vocabulary_size_per_gpu,
            embedding_vec_size=args.embedding_vec_size,
            slot_num=args.slot_num,
            nnz_per_slot=args.nnz_per_slot,
            use_hashtable=args.use_hashtable)

        emb_opt = utils.get_embedding_optimizer(args.optimizer)(learning_rate=0.1)
        dense_opt = utils.get_dense_optimizer(args.optimizer)(learning_rate=0.1)
        if args.mixed_precision:
            # Only the embedding optimizer carries the loss scale; its
            # get_scaled_loss/get_unscaled_gradients are used for both
            # gradient groups below.
            emb_opt = tf.keras.mixed_precision.LossScaleOptimizer(
                emb_opt, initial_scale=1024)

    sok_saver = sok.Saver()
    if 1 == args.restore_params:
        filepath = r"./embedding_variables"
        sok_saver.restore_from_file(
            sok_dense_demo.embedding_layer.embedding_variable, filepath)
    else:
        sok_saver.load_embedding_values(
            sok_dense_demo.embedding_layer.embedding_variable, init_tensors)

    # Reduction.NONE: per-sample losses are averaged manually against the
    # *global* batch size in _replica_loss.
    loss_fn = tf.keras.losses.BinaryCrossentropy(
        from_logits=True, reduction=tf.keras.losses.Reduction.NONE)

    def _replica_loss(labels, logits):
        # Compute the average in float32 for numerical stability, then cast
        # back to the loss's original dtype (matters under mixed precision).
        loss = loss_fn(labels, logits)
        _dtype = loss.dtype
        loss = tf.cast(loss, tf.float32)
        loss = tf.nn.compute_average_loss(
            loss, global_batch_size=args.global_batch_size)
        return tf.cast(loss, _dtype)

    @tf.function
    def _train_step(inputs, labels):
        with tf.GradientTape() as tape:
            logit, embedding_vector = sok_dense_demo(inputs, training=True)
            loss = _replica_loss(labels, logit)
            if args.mixed_precision:
                _loss = emb_opt.get_scaled_loss(loss)
            else:
                _loss = loss
        embedding_variables, other_variable = \
            sok.split_embedding_variable_from_others(
                sok_dense_demo.trainable_variables)
        grads, emb_grads = tape.gradient(
            _loss, [other_variable, embedding_variables])
        if args.mixed_precision:
            grads = emb_opt.get_unscaled_gradients(grads)
            emb_grads = emb_opt.get_unscaled_gradients(emb_grads)
        # Consistency fix: serialize optimizer updates behind the embedding
        # gradients, as test_sok_demo does, in case NCCL runs concurrently
        # via SOK and TF.
        with tf.control_dependencies([*emb_grads]):
            if "plugin" not in args.optimizer:
                # Non-plugin (native TF) optimizers need SOK's OptimizerScope
                # to update embedding variables.
                with sok.OptimizerScope(embedding_variables):
                    emb_opt.apply_gradients(
                        zip(emb_grads, embedding_variables),
                        experimental_aggregate_gradients=False)
            else:
                emb_opt.apply_gradients(
                    zip(emb_grads, embedding_variables),
                    experimental_aggregate_gradients=False)
            dense_opt.apply_gradients(zip(grads, other_variable))
        return loss, embedding_vector

    sok_results = list()

    def _dataset_fn(input_context):
        replica_batch_size = input_context.get_per_replica_batch_size(
            args.global_batch_size)
        dataset = utils.tf_dataset(*random_samples,
                                   batchsize=replica_batch_size,
                                   to_sparse_tensor=False,
                                   repeat=1)
        return dataset

    dataset = strategy.distribute_datasets_from_function(_dataset_fn)

    for i, (input_tensors, replica_labels) in enumerate(dataset):
        print("-" * 30, "step ", str(i), "-" * 30)
        loss, embedding_vector = strategy.run(
            _train_step, args=(input_tensors, replica_labels))
        loss = strategy.reduce("sum", loss, axis=None)
        print("[INFO]: iteration {}, loss {}".format(i, loss))
        sok_results.append(embedding_vector)

    # save params to file.
    if 1 == args.save_params:
        filepath = r"./embedding_variables"
        # Only the chief (task_id == 0) creates the directory.
        utils.try_make_dirs(filepath,
                            chief=(True if args.task_id == 0 else False))
        sok_saver.dump_to_file(
            sok_dense_demo.embedding_layer.embedding_variable, filepath)

    return sok_results, \
        sok_dense_demo.embedding_layer.embedding_variable.values[0].m_var_name
def get_sok_results(args, init_tensors, *random_samples):
    """Build and run the SOK graph (TF1 graph mode) and collect results.

    Supports "onedevice" and "horovod" distributed tools via the project's
    ``strategy_wrapper``. Constructs restore/save ops chained with control
    dependencies, trains for ``args.iter_num`` steps inside a tf.Session,
    and returns the per-step embedding vectors plus variable names.

    Args:
        args: parsed CLI namespace (distributed tool, optimizer, shapes,
            save/restore flags, mixed-precision flag, ...).
        init_tensors: per-layer initial embedding values.
        *random_samples: raw sample arrays forwarded to ``utils.tf_dataset``.

    Returns:
        Tuple of (list of per-step embedding vectors,
        list of embedding variable names).

    Raises:
        ValueError: if ``args.distributed_tool`` is unsupported.
    """
    if args.distributed_tool == "onedevice":
        strategy = strategy_wrapper.OneDeviceStrategy()
    elif args.distributed_tool == "horovod":
        import horovod.tensorflow as hvd
        hvd.init()
        strategy = strategy_wrapper.HorovodStrategy()
    else:
        raise ValueError(f"{args.distributed_tool} is not supported.")

    with strategy.scope():
        # sok.Init returns an op in graph mode; it is run first in the session.
        sok_init_op = sok.Init(global_batch_size=args.global_batch_size)

        embedding_initializer = tf.keras.initializers.Ones(
        ) if args.use_tf_initializer else None

        sok_dense_demo = SOKDemo(
            max_vocabulary_size_per_gpu=args.max_vocabulary_size_per_gpu,
            embedding_vec_size=args.embedding_vec_size,
            slot_num=args.slot_num,
            nnz_per_slot=args.nnz_per_slot,
            use_hashtable=args.use_hashtable,
            dynamic_input=args.dynamic_input,
            num_of_dense_layers=0,
            key_dtype=args.key_dtype,
            embedding_initializer=embedding_initializer)

        emb_opt = utils.get_embedding_optimizer(
            args.optimizer)(learning_rate=0.1)
        dense_opt = utils.get_dense_optimizer(
            args.optimizer)(learning_rate=0.1)
        if args.mixed_precision:
            # SOK ships its own LossScaleOptimizer wrapper for graph mode.
            emb_opt = sok.tf.keras.mixed_precision.LossScaleOptimizer(
                emb_opt, 1024)

    sok_saver = sok.Saver()
    restore_op = list()
    # Chain restore ops so embedding layers are restored one after another.
    for i, embedding_layer in enumerate(sok_dense_demo.embedding_layers):
        control_inputs = [restore_op[-1]] if restore_op else None
        with tf.control_dependencies(control_inputs):
            if args.restore_params:
                filepath = r"./embedding_variables"
                op = sok_saver.restore_from_file(
                    embedding_layer.embedding_variable, filepath)
            else:
                if not args.use_tf_initializer:
                    op = sok_saver.load_embedding_values(
                        embedding_layer.embedding_variable, init_tensors[i])
                else:
                    # TF initializer already set the values; use a no-op
                    # placeholder so the dependency chain stays intact.
                    op = tf.constant(1.0)
        restore_op.append(op)

    # reduction='none': averaging is done manually against the global batch.
    loss_fn = tf.keras.losses.BinaryCrossentropy(from_logits=True,
                                                 reduction='none')

    def _replica_loss(labels, logits):
        # Average in float32 for stability, then restore the original dtype.
        loss = loss_fn(labels, logits)
        _dtype = loss.dtype
        loss = tf.cast(loss, tf.float32)
        loss = tf.nn.compute_average_loss(
            loss, global_batch_size=args.global_batch_size)
        return tf.cast(loss, _dtype)

    def _train_step(inputs, labels, training):
        def _step_fn(inputs, labels):
            logit, embedding_vector = sok_dense_demo(inputs,
                                                     training=training)
            loss = _replica_loss(labels, logit)
            if args.mixed_precision:
                _loss = emb_opt.get_scaled_loss(loss)
            else:
                _loss = loss
            emb_var, other_var = sok.split_embedding_variable_from_others(
                sok_dense_demo.trainable_variables)
            grads = tf.gradients(
                _loss,
                emb_var + other_var,
                colocate_gradients_with_ops=True,
                unconnected_gradients=tf.UnconnectedGradients.NONE)
            # First len(emb_var) gradients belong to embedding variables.
            emb_grads, other_grads = grads[:len(emb_var)], grads[len(emb_var):]
            if args.mixed_precision:
                other_grads = emb_opt.get_unscaled_gradients(other_grads)
                emb_grads = emb_opt.get_unscaled_gradients(emb_grads)

            if "plugin" in args.optimizer:
                emb_train_op = emb_opt.apply_gradients(zip(emb_grads, emb_var))
            else:
                # Native TF optimizers need SOK's OptimizerScope to update
                # embedding variables.
                with sok.OptimizerScope(emb_var):
                    emb_train_op = emb_opt.apply_gradients(
                        zip(emb_grads, emb_var))
            with tf.control_dependencies([*emb_grads]):
                # in case NCCL runs concurrently via SOK and horovod
                other_grads = strategy.reduce("sum", other_grads)
                other_train_op = dense_opt.apply_gradients(
                    zip(other_grads, other_var))
            with tf.control_dependencies([emb_train_op, other_train_op]):
                # tf.identity ties the returned loss to the train ops so a
                # single sess.run drives the whole update.
                total_loss = strategy.reduce("sum", loss)
                total_loss = tf.identity(total_loss)
                return total_loss, embedding_vector

        return strategy.run(_step_fn, inputs, labels)

    replica_batch_size = args.global_batch_size // args.gpu_num
    dataset = utils.tf_dataset(*random_samples,
                               batchsize=replica_batch_size,
                               to_sparse_tensor=False,
                               repeat=1,
                               args=args)
    train_iterator = dataset.make_initializable_iterator()
    iterator_init = train_iterator.initializer
    inputs, labels = train_iterator.get_next()

    graph_results = _train_step(inputs, labels, training=True)

    init_op = tf.group(tf.global_variables_initializer(),
                       tf.local_variables_initializer())
    if "plugin" in args.optimizer:
        # Plugin optimizers own extra state with a dedicated initializer.
        init_op = tf.group(init_op, emb_opt.initializer)

    save_op = list()
    # Chain dump ops the same way as the restore ops above.
    for i, embedding_layer in enumerate(sok_dense_demo.embedding_layers):
        control_inputs = [save_op[-1]] if save_op else None
        with tf.control_dependencies(control_inputs):
            if args.save_params:
                filepath = r"./embedding_variables/"
                utils.try_make_dirs(filepath)
                op = sok_saver.dump_to_file(embedding_layer.embedding_variable,
                                            filepath)
            else:
                op = tf.constant(1.0)
        save_op.append(op)

    sok_results = list()

    config = tf.ConfigProto()
    config.log_device_placement = False
    with tf.Session(config=config) as sess:
        sess.run(sok_init_op)
        sess.run([init_op, iterator_init])
        sess.run(restore_op)
        # Finalize to catch accidental graph mutation during training.
        sess.graph.finalize()

        for step in range(args.iter_num):
            loss_v, emb_vector_v = sess.run([*graph_results])
            print("*" * 80)
            print(f"Step: {step}, loss: {loss_v}"
                  )  #", embedding_vector:\n{emb_vector_v}")
            sok_results.append(emb_vector_v)

        sess.run(save_op)

    name = list()
    for embedding_layer in sok_dense_demo.embedding_layers:
        name.append(embedding_layer.embedding_variable.m_var_name)

    return sok_results, name
def test_sok_demo(args, init_tensors, *random_samples):
    """Train SOKDemo (sparse inputs) with tf.distribute.MirroredStrategy.

    Initializes SOK inside the strategy scope, optionally restores or loads
    embedding values, trains over a sharded sparse-tensor dataset, and
    optionally dumps the embedding parameters to disk.

    Args:
        args: parsed CLI namespace (combiner, shapes, optimizer name,
            save/restore flags, mixed-precision flag, ...).
        init_tensors: initial embedding values fed to ``sok.Saver``.
        *random_samples: raw sample arrays forwarded to ``utils.tf_dataset``.

    Returns:
        Tuple of (list of per-step embedding vectors, embedding variable name).
    """
    strategy = tf.distribute.MirroredStrategy()
    with strategy.scope():
        result = sok.Init(global_batch_size=args.global_batch_size)

        embedding_initializer = tf.keras.initializers.Ones(
        ) if args.use_tf_initializer else None

        plugin_demo = SOKDemo(
            combiner=args.combiner,
            max_vocabulary_size_per_gpu=args.max_vocabulary_size_per_gpu,
            slot_num=args.slot_num,
            max_nnz=args.max_nnz,
            embedding_vec_size=args.embedding_vec_size,
            use_hashtable=args.use_hashtable,
            key_dtype=args.key_dtype,
            embedding_initializer=embedding_initializer)

        emb_opt = utils.get_embedding_optimizer(
            args.optimizer)(learning_rate=0.1)
        dense_opt = utils.get_dense_optimizer(
            args.optimizer)(learning_rate=0.1)
        if args.mixed_precision:
            # Only the embedding optimizer carries the loss scale; it serves
            # both gradient groups in _train_step.
            emb_opt = tf.keras.mixed_precision.LossScaleOptimizer(
                emb_opt, initial_scale=1024)

    plugin_saver = sok.Saver()
    if (1 == args.restore_params):  # restore from trained parameters
        filepath = r"./embedding_variables"
        plugin_saver.restore_from_file(
            plugin_demo.embedding_layer.embedding_variable, filepath)
    else:
        # initialize using randomized initial value
        if not args.use_tf_initializer and init_tensors:
            status = plugin_saver.load_embedding_values(
                plugin_demo.embedding_layer.embedding_variable, init_tensors)

    # Reduction.NONE: averaging is done manually against the global batch.
    loss_fn = tf.keras.losses.BinaryCrossentropy(
        from_logits=True, reduction=tf.keras.losses.Reduction.NONE)

    def _replica_loss(labels, logits):
        # Average in float32 for stability, then restore the original dtype.
        loss = loss_fn(labels, logits)
        _dtype = loss.dtype
        loss = tf.cast(loss, tf.float32)
        loss = tf.nn.compute_average_loss(
            loss, global_batch_size=args.global_batch_size)
        return tf.cast(loss, _dtype)

    @tf.function
    def _train_step(inputs, labels):
        with tf.GradientTape() as tape:
            logit, embedding_vector = plugin_demo(inputs, training=True)
            loss = _replica_loss(labels, logit)
            if args.mixed_precision:
                _loss = emb_opt.get_scaled_loss(loss)
            else:
                _loss = loss
        embedding_variables, other_variable = \
            sok.split_embedding_variable_from_others(
                plugin_demo.trainable_variables)
        grads, emb_grads = tape.gradient(
            _loss, [other_variable, embedding_variables])
        if args.mixed_precision:
            grads = emb_opt.get_unscaled_gradients(grads)
            emb_grads = emb_opt.get_unscaled_gradients(emb_grads)

        with tf.control_dependencies([*emb_grads]):
            # in case NCCL runs concurrently via SOK and TF
            if 'plugin' not in args.optimizer:
                # Native TF optimizers need SOK's OptimizerScope to update
                # embedding variables.
                with sok.OptimizerScope(embedding_variables):
                    emb_opt.apply_gradients(
                        zip(emb_grads, embedding_variables),
                        experimental_aggregate_gradients=False)
            else:
                emb_opt.apply_gradients(
                    zip(emb_grads, embedding_variables),
                    experimental_aggregate_gradients=False)
            dense_opt.apply_gradients(zip(grads, other_variable))
        return loss, embedding_vector

    sok_results = list()

    def _dataset_fn(input_context):
        replica_batch_size = input_context.get_per_replica_batch_size(
            args.global_batch_size)
        dataset = utils.tf_dataset(*random_samples,
                                   batchsize=replica_batch_size,
                                   to_sparse_tensor=True,
                                   repeat=1,
                                   args=args)
        # Shard across input pipelines so each worker sees distinct data.
        dataset = dataset.shard(input_context.num_input_pipelines,
                                input_context.input_pipeline_id)
        return dataset

    dataset = strategy.distribute_datasets_from_function(_dataset_fn)

    for i, (sparse_tensors, replica_labels) in enumerate(dataset):
        print("-" * 30, "step ", str(i), "-" * 30)
        loss, embedding_vector = strategy.run(
            _train_step, args=(sparse_tensors, replica_labels))
        loss = strategy.reduce("sum", loss, axis=None)
        print("[INFO]: iteration {}, loss {}".format(i, loss))
        sok_results.append(embedding_vector)

    # save params to file.
    if 1 == args.save_params:
        filepath = r"./embedding_variables/"
        utils.try_make_dirs(filepath)
        plugin_saver.dump_to_file(
            plugin_demo.embedding_layer.embedding_variable, filepath)

    return sok_results, plugin_demo.embedding_layer.embedding_variable.values[
        0].m_var_name
def get_tf_results(args, init_tensors, *random_samples):
    """Run the pure-TensorFlow reference model (TF1 graph mode).

    Builds TFDemo in its own graph, trains for ``args.iter_num`` steps in a
    tf.Session, and optionally saves the final embedding values to files so
    they can be compared against the SOK run.

    Args:
        args: parsed CLI namespace (shapes, optimizer name, iter_num,
            save flag, mixed-precision flag, ...).
        init_tensors: per-layer initial embedding values assigned before
            training.
        *random_samples: raw sample arrays forwarded to ``utils.tf_dataset``.

    Returns:
        Tuple of (list of per-step embedding vectors,
        list of TF embedding variable names).
    """
    graph = tf.Graph()
    with graph.as_default():
        tf_dense_demo = TFDemo(
            # One global table covering all GPUs' vocabulary shards.
            vocabulary_size=args.max_vocabulary_size_per_gpu * args.gpu_num,
            slot_num=args.slot_num,
            nnz_per_slot=args.nnz_per_slot,
            embedding_vec_size=args.embedding_vec_size,
            num_of_dense_layers=0,
            use_hashtable=False,
            dynamic_input=False)

        optimizer = utils.get_dense_optimizer(
            args.optimizer)(learning_rate=0.1)
        if args.mixed_precision:
            optimizer = sok.tf.keras.mixed_precision.LossScaleOptimizer(
                optimizer, 1024)

        loss_fn = tf.keras.losses.BinaryCrossentropy(from_logits=True)

        def _train_step(inputs, labels, training):
            logit, embedding_vector = tf_dense_demo(inputs,
                                                    training=training)
            loss = loss_fn(labels, logit)
            if args.mixed_precision:
                _loss = optimizer.get_scaled_loss(loss)
            else:
                _loss = loss
            grads = tf.gradients(
                _loss,
                tf_dense_demo.trainable_variables,
                colocate_gradients_with_ops=True,
                unconnected_gradients=tf.UnconnectedGradients.NONE)
            if args.mixed_precision:
                grads = optimizer.get_unscaled_gradients(grads)
            train_op = optimizer.apply_gradients(
                zip(grads, tf_dense_demo.trainable_variables))
            # Tie the returned loss to the train op so a single sess.run
            # drives the update.
            with tf.control_dependencies([train_op]):
                loss = tf.identity(loss)
                return loss, embedding_vector

        dataset = utils.tf_dataset(*random_samples,
                                   batchsize=args.global_batch_size,
                                   to_sparse_tensor=False,
                                   repeat=1)
        train_iterator = dataset.make_initializable_iterator()
        iterator_init = train_iterator.initializer
        inputs, labels = train_iterator.get_next()

        graph_results = _train_step(inputs, labels, training=True)

        init_op = tf.group(tf.global_variables_initializer(),
                           tf.local_variables_initializer())

        # Assign the shared initial values so TF and SOK start identically.
        restore_op = list()
        for i, embedding_layer in enumerate(tf_dense_demo.embedding_layers):
            restore_op.append(
                embedding_layer.embeddings.assign(
                    tf.concat(init_tensors[i], axis=0)))

        emb_values = list()
        for embedding_layer in tf_dense_demo.embedding_layers:
            if args.save_params:
                filepath = r"./embedding_variables/"
                utils.try_make_dirs(filepath)
                emb_values.append(embedding_layer.embeddings.read_value())
            else:
                # Placeholder so sess.run(emb_values) below stays valid when
                # nothing is saved.
                emb_values = tf.constant(1.0)

    tf_results = list()
    with tf.Session(graph=graph) as sess:
        sess.run([init_op, iterator_init])
        sess.run(restore_op)
        # Finalize to catch accidental graph mutation during training.
        sess.graph.finalize()

        for step in range(args.iter_num):
            loss_v, embedding_vector_v = sess.run([*graph_results])
            print("*" * 80)
            print(f"step: {step}, loss: {loss_v}"
                  )  #", embedding_vector:\n{embedding_vector_v}")
            tf_results.append(embedding_vector_v)

        emb_values_v = sess.run(emb_values)
        if args.save_params:
            for i, value in enumerate(emb_values_v):
                utils.save_to_file(
                    os.path.join(filepath,
                                 r"tf_variable_" + str(i) + r".file"), value)

    name = list()
    for embedding_layer in tf_dense_demo.embedding_layers:
        name.append(embedding_layer.embeddings.name)

    return tf_results, name