def main(args):
    """Train the SOK dense-embedding demo on a single node with MirroredStrategy.

    Builds the input pipeline, initializes SOK inside the strategy scope,
    and runs the training loop, printing the globally-reduced loss each step.
    """
    strategy = tf.distribute.MirroredStrategy()

    # Build the input pipeline and distribute it across the local replicas.
    data = utility.TFDataset(filename=args.data_filename,
                             batchsize=args.global_batch_size,
                             as_sparse_tensor=False,
                             repeat=1)
    data = data.prefetch(tf.data.AUTOTUNE)
    data = strategy.experimental_distribute_dataset(data)

    with strategy.scope():
        # SOK must be initialized inside the strategy scope.
        sok.Init(global_batch_size=args.global_batch_size)

        model = SOKDenseDemo(
            max_vocabulary_size_per_gpu=args.max_vocabulary_size_per_gpu,
            embedding_vec_size=args.embedding_vec_size,
            slot_num=args.slot_num,
            nnz_per_slot=args.nnz_per_slot,
            num_dense_layers=args.num_dense_layers)

        embedding_optimizer = utility.get_embedding_optimizer(
            args.optimizer)(learning_rate=0.1)
        dense_optimizer = utility.get_dense_optimizer(
            args.optimizer)(learning_rate=0.1)

    loss_fn = tf.keras.losses.BinaryCrossentropy(
        from_logits=True, reduction=tf.keras.losses.Reduction.NONE)

    def _replica_loss(labels, logits):
        # Per-example losses averaged over the *global* batch size so the
        # cross-replica sum yields the true mean loss.
        per_example = loss_fn(labels, logits)
        return tf.nn.compute_average_loss(
            per_example, global_batch_size=args.global_batch_size)

    @tf.function
    def _train_step(inputs, labels):
        with tf.GradientTape() as tape:
            logit = model(inputs, training=True)
            replica_loss = _replica_loss(labels, logit)
        emb_variable, other_variable = sok.split_embedding_variable_from_others(
            model.trainable_variables)
        grads, emb_grads = tape.gradient(replica_loss,
                                         [other_variable, emb_variable])
        # Plugin optimizers update SOK state directly; other optimizers need
        # sok.OptimizerScope to reach the embedding variables.
        if "plugin" in args.optimizer:
            embedding_optimizer.apply_gradients(
                zip(emb_grads, emb_variable),
                experimental_aggregate_gradients=False)
        else:
            with sok.OptimizerScope(emb_variable):
                embedding_optimizer.apply_gradients(
                    zip(emb_grads, emb_variable),
                    experimental_aggregate_gradients=False)
        dense_optimizer.apply_gradients(zip(grads, other_variable))
        return replica_loss

    for step, (inputs, labels) in enumerate(data):
        if args.stop_at_iter > 0 and step >= args.stop_at_iter:
            break
        profile_range = nvtx.start_range(message="Iteration_" + str(step),
                                         color="blue")
        replica_loss = strategy.run(_train_step, args=(inputs, labels))
        loss = strategy.reduce(tf.distribute.ReduceOp.SUM,
                               replica_loss, axis=None)
        nvtx.end_range(profile_range)
        print("[INFO]: Iteration: {}, loss={}".format(step, loss))
def test_sok_multi_dense_emb(args):
    """Train a multi-table SOK dense-embedding model under a TF distribution strategy.

    Runs either MirroredStrategy (single worker) or MultiWorkerMirroredStrategy
    and returns a list of the embedding vectors produced at every iteration so
    the caller can compare them against a reference implementation.
    """
    comm_options = tf.distribute.experimental.CommunicationOptions(
        bytes_per_pack=0,
        timeout_seconds=None,
        implementation=tf.distribute.experimental.CommunicationImplementation.
        NCCL)

    if args.worker_num == 1:
        strategy = tf.distribute.MirroredStrategy()
    else:
        # Multi-process run: publish the cluster layout through TF_CONFIG so
        # MultiWorkerMirroredStrategy can rendezvous with its peers on localhost.
        port = 12345
        os.environ["TF_CONFIG"] = json.dumps({
            "cluster": {
                "worker": [
                    "localhost" + ":" + str(port + i)
                    for i in range(args.worker_num)
                ]
            },
            "task": {
                "type": "worker",
                "index": args.task_id
            }
        })
        strategy = tf.distribute.MultiWorkerMirroredStrategy(
            communication_options=comm_options)

    # Each worker reads its own pre-split file and handles an equal share of
    # the global batch.
    replica_batch_size = args.global_batch_size // (args.worker_num * 1)
    dataset = utility.TFDataset(filename=args.file_prefix +
                                str(args.task_id) + ".file",
                                batchsize=replica_batch_size,
                                as_sparse_tensor=False,
                                repeat=1)
    dataset = dataset.prefetch(tf.data.AUTOTUNE)

    dynamic_input = True if args.dynamic_input == 1 else False

    with strategy.scope():
        sok.Init(global_batch_size=args.global_batch_size)

        model = SOKDenseModel(
            max_vocabulary_size_per_gpu=args.max_vocabulary_size_per_gpu,
            embedding_vec_size_list=args.embedding_vec_size_list,
            slot_num_list=args.slot_num_list,
            nnz_per_slot_list=[
                args.nnz_per_slot for _ in range(len(args.slot_num_list))
            ],
            num_dense_layers=args.num_dense_layers,
            dynamic_input=dynamic_input)

        emb_opt = utils.get_embedding_optimizer(
            args.optimizer)(learning_rate=0.1)
        dense_opt = utils.get_dense_optimizer(
            args.optimizer)(learning_rate=0.1)

        if args.mixed_precision:
            # Only the embedding optimizer is wrapped; its scale/unscale
            # helpers are reused below for the dense gradients as well.
            emb_opt = tf.keras.mixed_precision.LossScaleOptimizer(
                emb_opt, initial_scale=1024)

    # set initial value to embedding variables.
    sok_saver = sok.Saver()
    for i, layer in enumerate(model.embedding_layers):
        init_tensors = utils.get_ones_tensor(
            max_vocab_size_per_gpu=args.max_vocabulary_size_per_gpu,
            embedding_vec_size=args.embedding_vec_size_list[i],
            num=args.worker_num)
        sok_saver.load_embedding_values(layer.embedding_variable, init_tensors)

    loss_fn = tf.keras.losses.BinaryCrossentropy(
        from_logits=True, reduction=tf.keras.losses.Reduction.NONE)

    def _replica_loss(labels, logits):
        # Compute the averaged loss in float32, then cast back to the incoming
        # dtype (may be float16 under mixed precision).
        loss = loss_fn(labels, logits)
        _dtype = loss.dtype
        loss = tf.cast(loss, tf.float32)
        loss = tf.nn.compute_average_loss(
            loss, global_batch_size=args.global_batch_size)
        return tf.cast(loss, _dtype)

    @tf.function
    def _train_step(inputs, labels):
        with tf.GradientTape() as tape:
            logit, all_vectors = model(inputs, training=True)
            loss = _replica_loss(labels, logit)
            # Scale the loss before differentiation when loss scaling is on.
            if args.mixed_precision:
                _loss = emb_opt.get_scaled_loss(loss)
            else:
                _loss = loss
        emb_variable, other_variable = sok.split_embedding_variable_from_others(
            model.trainable_variables)
        grads, emb_grads = tape.gradient(_loss, [other_variable, emb_variable])
        if args.mixed_precision:
            grads = emb_opt.get_unscaled_gradients(grads)
            emb_grads = emb_opt.get_unscaled_gradients(emb_grads)

        # Embedding updates are handled by SOK's own communication, hence
        # experimental_aggregate_gradients=False. Non-plugin optimizers need
        # sok.OptimizerScope to address the embedding variables.
        if "plugin" not in args.optimizer:
            with sok.OptimizerScope(emb_variable):
                emb_opt.apply_gradients(zip(emb_grads, emb_variable),
                                        experimental_aggregate_gradients=False)
        else:
            emb_opt.apply_gradients(zip(emb_grads, emb_variable),
                                    experimental_aggregate_gradients=False)

        # Order the dense update after the embedding gradients are produced.
        with tf.control_dependencies(emb_grads):
            # manually all-reduce dense gradients
            replica_context = tf.distribute.get_replica_context()
            grads = replica_context.all_reduce("sum", grads,
                                               options=comm_options)
            dense_opt.apply_gradients(zip(grads, other_variable),
                                      experimental_aggregate_gradients=False)

            # manually all-reduce loss, it is ok, because replica_loss has
            # already been used to update local variables.
            loss = replica_context.all_reduce(tf.distribute.ReduceOp.SUM,
                                              loss, options=comm_options)
        return loss, all_vectors, logit

    # save its results
    sok_results = list()

    for i, (inputs, labels) in enumerate(dataset):
        if args.stop_iter >= 0 and i >= args.stop_iter:
            break
        total_loss, all_vectors, logit = strategy.run(_train_step,
                                                      args=(inputs, labels))
        print("[INFO]: Iteration: {}, loss={}".format(i, total_loss))
        # Append under the CPU device scope to keep results off the GPU.
        with tf.device("CPU:0"):
            sok_results.append(all_vectors)
    return sok_results
def main(args, task_id):
    """Pure-TensorFlow (no SOK) multi-worker training baseline.

    Trains TfDenseDemo under MultiWorkerMirroredStrategy, timing each
    iteration and printing the mean step time excluding the first step
    (which includes tf.function tracing).
    """
    print("task id={}".format(task_id))

    comm_options = tf.distribute.experimental.CommunicationOptions(
        bytes_per_pack=0,
        timeout_seconds=None,
        implementation=tf.distribute.experimental.CommunicationImplementation.
        NCCL)

    # if MirroredStrategy is used here and _train_step is not decorated by @tf.function,
    # there will be a "Bad file descriptor" error related to multiprocessing at the end
    # of the program.
    #if args.total_gpu_num == 1:
    #    strategy = tf.distribute.MirroredStrategy()
    if True:
        port = 12345
        os.environ["TF_CONFIG"] = json.dumps({
            "cluster": {
                "worker": [
                    "localhost" + ":" + str(port + i)
                    for i in range(args.worker_num)
                ]
            },
            "task": {
                "type": "worker",
                "index": task_id
            }
        })
        strategy = tf.distribute.MultiWorkerMirroredStrategy(
            communication_options=comm_options)

    if args.data_splited:
        filename = args.data_filename + str(task_id) + ".file"
    else:
        filename = args.data_filename

    # Each worker processes an equal slice of the global batch.
    replica_batch_size = args.global_batch_size // (args.worker_num * 1)
    dataset = utility.TFDataset(filename=filename,
                                batchsize=replica_batch_size,
                                as_sparse_tensor=False,
                                repeat=1)
    dataset = dataset.prefetch(tf.data.AUTOTUNE)

    with strategy.scope():
        model = TfDenseDemo(global_batch_size=args.global_batch_size,
                            vocabulary_size=args.vocabulary_size,
                            slot_num=args.slot_num,
                            nnz_per_slot=args.nnz_per_slot,
                            num_dense_layers=args.num_dense_layers,
                            embedding_vec_size=args.embedding_vec_size)
        # NOTE(review): the embedding optimizer comes from get_dense_optimizer,
        # not get_embedding_optimizer — presumably intentional because this
        # baseline uses a plain TF embedding variable, but worth confirming.
        emb_optimizer = utility.get_dense_optimizer(
            args.optimizer)(learning_rate=0.1)
        dense_optimizer = utility.get_dense_optimizer(
            args.optimizer)(learning_rate=0.1)

    loss_fn = tf.keras.losses.BinaryCrossentropy(
        from_logits=True, reduction=tf.keras.losses.Reduction.NONE)

    def _replica_loss(labels, logits):
        # Per-example losses averaged over the global batch size.
        loss = loss_fn(labels, logits)
        return tf.nn.compute_average_loss(
            loss, global_batch_size=args.global_batch_size)

    # Note: all_reduce_indexed_slices in eager mode is not supported
    @tf.function
    def _train_step(inputs, labels):
        with tf.GradientTape() as tape:
            logit = model(inputs, training=True)
            loss = _replica_loss(labels, logit)
        emb_vars, dense_vars = split_emb_and_dense_variables(
            model.trainable_variables)
        # Debug code
        #print("number of embedding variables: {}".format(len(emb_vars)))
        #print("number of dense variables   : {}".format(len(dense_vars)))
        emb_grads, dense_grads = tape.gradient(loss, [emb_vars, dense_vars])

        # update variables of embedding layer
        # NOTE(review): embedding gradients are applied without cross-replica
        # aggregation (see the all_reduce_indexed_slices note above) — each
        # replica updates its embedding copy with local gradients only.
        emb_optimizer.apply_gradients(zip(emb_grads, emb_vars),
                                      experimental_aggregate_gradients=False)

        # Manually all-reduce dense gradients and update variables of dense layers
        replica_context = tf.distribute.get_replica_context()
        dense_grads = replica_context.all_reduce("sum", dense_grads,
                                                 options=comm_options)
        dense_optimizer.apply_gradients(zip(dense_grads, dense_vars),
                                        experimental_aggregate_gradients=False)

        # manually all-reduce loss, it is ok, because replica_loss has already
        # been used to update local variables.
        loss = replica_context.all_reduce(tf.distribute.ReduceOp.SUM, loss,
                                          options=comm_options)
        return loss

    time_arr = []
    for i, (inputs, labels) in enumerate(dataset):
        if args.stop_at_iter > 0 and i >= args.stop_at_iter:
            break
        rng = nvtx.start_range(message="Iteration_" + str(i), color="blue")
        start_time = time.time()
        loss = strategy.run(_train_step, args=(inputs, labels))
        time_arr.append(time.time() - start_time)
        nvtx.end_range(rng)
        print("[INFO]: Iteration: {}, loss={}".format(i, loss))

    # Skip the first iteration: it includes graph tracing/compilation.
    print("Average iteration time (except 1st iteration): ",
          np.mean(time_arr[1:]))
def main(args, task_id):
    """Multi-worker SOK training demo; reports per-iteration loss and mean step time."""
    print(task_id)

    comm_options = tf.distribute.experimental.CommunicationOptions(
        bytes_per_pack=0,
        timeout_seconds=None,
        implementation=tf.distribute.experimental.CommunicationImplementation.NCCL)

    if args.total_gpu_num == 1:
        strategy = tf.distribute.MirroredStrategy()
    else:
        # Describe the local cluster so MultiWorkerMirroredStrategy can
        # rendezvous; workers listen on consecutive ports on localhost.
        base_port = 12345
        workers = ["localhost" + ":" + str(base_port + i)
                   for i in range(args.worker_num)]
        os.environ["TF_CONFIG"] = json.dumps({
            "cluster": {"worker": workers},
            "task": {"type": "worker", "index": task_id}
        })
        strategy = tf.distribute.MultiWorkerMirroredStrategy(
            communication_options=comm_options)

    filename = (args.data_filename + str(task_id) + ".file"
                if args.data_splited else args.data_filename)

    # Each worker processes an equal slice of the global batch.
    per_replica_batch = args.global_batch_size // (args.worker_num * 1)
    data = utility.TFDataset(filename=filename,
                             batchsize=per_replica_batch,
                             as_sparse_tensor=False,
                             repeat=1).prefetch(tf.data.AUTOTUNE)

    with strategy.scope():
        sok.Init(global_batch_size=args.global_batch_size)

        model = SOKDenseDemo(
            max_vocabulary_size_per_gpu=args.max_vocabulary_size_per_gpu,
            embedding_vec_size=args.embedding_vec_size,
            slot_num=args.slot_num,
            nnz_per_slot=args.nnz_per_slot,
            num_dense_layers=args.num_dense_layers)

        embedding_optimizer = utility.get_embedding_optimizer(
            args.optimizer)(learning_rate=0.1)
        dense_optimizer = utility.get_dense_optimizer(
            args.optimizer)(learning_rate=0.1)

    loss_fn = tf.keras.losses.BinaryCrossentropy(
        from_logits=True, reduction=tf.keras.losses.Reduction.NONE)

    def _replica_loss(labels, logits):
        per_example = loss_fn(labels, logits)
        return tf.nn.compute_average_loss(
            per_example, global_batch_size=args.global_batch_size)

    @tf.function
    def _train_step(inputs, labels):
        with tf.GradientTape() as tape:
            logit = model(inputs, training=True)
            loss = _replica_loss(labels, logit)
        emb_variable, other_variable = sok.split_embedding_variable_from_others(
            model.trainable_variables)
        grads, emb_grads = tape.gradient(loss, [other_variable, emb_variable])

        # SOK plugin optimizers manage embedding state directly; other
        # optimizers go through sok.OptimizerScope.
        if "plugin" in args.optimizer:
            embedding_optimizer.apply_gradients(
                zip(emb_grads, emb_variable),
                experimental_aggregate_gradients=False)
        else:
            with sok.OptimizerScope(emb_variable):
                embedding_optimizer.apply_gradients(
                    zip(emb_grads, emb_variable),
                    experimental_aggregate_gradients=False)

        # Dense gradients are all-reduced by hand, so built-in aggregation
        # stays disabled.
        replica_context = tf.distribute.get_replica_context()
        grads = replica_context.all_reduce("sum", grads, options=comm_options)
        dense_optimizer.apply_gradients(zip(grads, other_variable),
                                        experimental_aggregate_gradients=False)

        # All-reducing the loss is for reporting only; local updates are done.
        loss = replica_context.all_reduce(tf.distribute.ReduceOp.SUM, loss,
                                          options=comm_options)
        return loss

    iter_times = []
    for step, (inputs, labels) in enumerate(data):
        if args.stop_at_iter > 0 and step >= args.stop_at_iter:
            break
        marker = nvtx.start_range(message="Iteration_" + str(step),
                                  color="blue")
        begin = time.time()
        total_loss = strategy.run(_train_step, args=(inputs, labels))
        iter_times.append(time.time() - begin)
        nvtx.end_range(marker)
        if task_id == '0':
            print("[INFO]: Iteration: {}, loss={}".format(step, total_loss))

    if task_id == '0':
        print("Average iteration time (except 1st iteration): ",
              np.mean(iter_times[1:]))
def test_sok_multi_dense_emb(args):
    """Train a multi-table SOK dense-embedding model using Horovod.

    Returns a list of the embedding vectors produced at every iteration so
    the caller can compare them against a reference implementation.
    """
    assert (args.global_batch_size % args.worker_num == 0)
    # Each Horovod worker handles an equal slice of the global batch.
    replica_batch_size = args.global_batch_size // (args.worker_num)

    dataset = utility.TFDataset(filename=args.file_prefix +
                                str(args.task_id) + ".file",
                                batchsize=replica_batch_size,
                                as_sparse_tensor=False,
                                repeat=1)
    dataset = dataset.prefetch(tf.data.AUTOTUNE)

    dynamic_input = True if args.dynamic_input == 1 else False

    # SOK initialize
    sok.Init(global_batch_size=args.global_batch_size)

    model = SOKDenseModel(
        max_vocabulary_size_per_gpu=args.max_vocabulary_size_per_gpu,
        embedding_vec_size_list=args.embedding_vec_size_list,
        slot_num_list=args.slot_num_list,
        nnz_per_slot_list=[
            args.nnz_per_slot for _ in range(len(args.slot_num_list))
        ],
        num_dense_layers=args.num_dense_layers,
        dynamic_input=dynamic_input,
        use_hashtable=args.use_hashtable)

    emb_opt = utils.get_embedding_optimizer(args.optimizer)(learning_rate=0.1)
    dense_opt = utils.get_dense_optimizer(args.optimizer)(learning_rate=0.1)
    if args.mixed_precision:
        # Only the embedding optimizer is wrapped; its scale/unscale helpers
        # are reused below for the dense gradients too.
        emb_opt = tf.keras.mixed_precision.LossScaleOptimizer(
            emb_opt, initial_scale=1024)

    # Load initial values into each embedding table.
    sok_saver = sok.Saver()
    for i, layer in enumerate(model.embedding_layers):
        init_tensors = utils.get_ones_tensor(
            max_vocab_size_per_gpu=args.max_vocabulary_size_per_gpu,
            embedding_vec_size=args.embedding_vec_size_list[i],
            num=args.worker_num)
        sok_saver.load_embedding_values(layer.embedding_variable, init_tensors)

    loss_fn = tf.keras.losses.BinaryCrossentropy(
        from_logits=True, reduction=tf.keras.losses.Reduction.NONE)

    def _replica_loss(labels, logits):
        # Compute the averaged loss in float32, then cast back to the incoming
        # dtype (may be float16 under mixed precision).
        loss = loss_fn(labels, logits)
        _dtype = loss.dtype
        loss = tf.cast(loss, tf.float32)
        loss = tf.nn.compute_average_loss(
            loss, global_batch_size=args.global_batch_size)
        return tf.cast(loss, _dtype)

    @tf.function
    def _train_step(inputs, labels, first_batch):
        with tf.GradientTape() as tape:
            logit, all_vectors = model(inputs, training=True)
            replica_loss = _replica_loss(labels, logit)
            if args.mixed_precision:
                _loss = emb_opt.get_scaled_loss(replica_loss)
            else:
                _loss = replica_loss

        emb_var, other_var = sok.split_embedding_variable_from_others(
            model.trainable_variables)
        emb_grads, grads = tape.gradient(_loss, [emb_var, other_var])
        if args.mixed_precision:
            emb_grads = emb_opt.get_unscaled_gradients(emb_grads)
            grads = emb_opt.get_unscaled_gradients(grads)

        # SOK performs its own cross-GPU communication for embeddings, so
        # aggregation stays disabled; non-plugin optimizers need OptimizerScope.
        if "plugin" not in args.optimizer:
            with sok.OptimizerScope(emb_var):
                emb_opt.apply_gradients(zip(emb_grads, emb_var),
                                        experimental_aggregate_gradients=False)
        else:
            emb_opt.apply_gradients(zip(emb_grads, emb_var),
                                    experimental_aggregate_gradients=False)

        # Order the dense update after embedding gradients are produced;
        # dense gradients are synchronized explicitly through Horovod.
        with tf.control_dependencies(emb_grads):
            grads = [hvd.allreduce(grad) for grad in grads]
            dense_opt.apply_gradients(zip(grads, other_var))

        # Broadcast dense weights and optimizer slots from rank 0 on the first
        # batch only, after the first apply_gradients has created them.
        # Embedding variables are model-parallel and are not broadcast.
        if first_batch:
            hvd.broadcast_variables(other_var, root_rank=0)
            hvd.broadcast_variables(dense_opt.variables(), root_rank=0)

        total_loss = hvd.allreduce(replica_loss)
        return total_loss, all_vectors

    sok_results = list()

    for i, (inputs, labels) in enumerate(dataset):
        if args.stop_iter >= 0 and i >= args.stop_iter:
            break
        total_loss, all_vectors = _train_step(inputs, labels, 0 == i)
        print("[INFO]: Iteration: {}, loss={}".format(i, total_loss))
        sok_results.append(all_vectors)
    return sok_results
def main(args):
    """Train the SOK dense-embedding demo using Horovod (no TF distribution strategy).

    Pins one GPU per process, reads the rank-local data shard, and runs the
    training loop, printing the replica loss at every iteration.
    """
    # Initialize horovod
    hvd.init()
    gpus = tf.config.list_physical_devices("GPU")
    tf.config.set_visible_devices(gpus[hvd.local_rank()], "GPU")

    # Generate local filename
    # Assume the dataset has been splited in advance
    local_file = args.data_filename_prefix + str(hvd.local_rank()) + ".file"

    # generate local batch size
    assert (args.global_batch_size % hvd.size() == 0)
    local_batch_size = args.global_batch_size // hvd.size()

    dataset = utility.TFDataset(filename=local_file,
                                batchsize=local_batch_size,
                                as_sparse_tensor=False,
                                repeat=1)
    dataset = dataset.prefetch(tf.data.AUTOTUNE)

    # Because there is no tensorflow distribute strategy, sok.Init() will call horovod to
    # broadcast nccl id and random seed, so it must be called after hvd.init()
    sok.Init(global_batch_size=args.global_batch_size)

    model = SOKDenseDemo(
        max_vocabulary_size_per_gpu=args.max_vocabulary_size_per_gpu,
        embedding_vec_size=args.embedding_vec_size,
        slot_num=args.slot_num,
        nnz_per_slot=args.nnz_per_slot,
        num_dense_layers=args.num_dense_layers)

    embedding_optimizer = utility.get_embedding_optimizer(
        args.optimizer)(learning_rate=0.1)
    dense_optimizer = utility.get_dense_optimizer(
        args.optimizer)(learning_rate=0.1)

    loss_fn = tf.keras.losses.BinaryCrossentropy(
        from_logits=True, reduction=tf.keras.losses.Reduction.NONE)

    def _replica_loss(labels, logits):
        # Per-example losses averaged over the global batch size.
        loss = loss_fn(labels, logits)
        return tf.nn.compute_average_loss(
            loss, global_batch_size=args.global_batch_size)

    @tf.function
    def _train_step(inputs, labels, first_batch):
        # Two tapes: one (wrapped by Horovod) for the dense variables, one
        # plain tape for the SOK embedding variables.
        with tf.GradientTape() as tape, tf.GradientTape() as emb_tape:
            logit = model(inputs, training=True)
            replica_loss = _replica_loss(labels, logit)

        # Horovod: wrap tf.GradientTape with Horovod DistributedGradientTape
        tape = hvd.DistributedGradientTape(tape)

        # There is no need to wrap the emb_tape because the communication is done by sok
        # emb_tape = hvd.DistributedGradientTape(emb_tape)

        emb_variable, other_variable = sok.split_embedding_variable_from_others(
            model.trainable_variables)

        # type(emb_tape) here is tf.GradientTape (its hvd wrap is commented out above)
        # type(tape) here is hvd.DistributedGradientTape
        emb_grads = emb_tape.gradient(replica_loss, emb_variable)
        grads = tape.gradient(replica_loss, other_variable)

        # Non-plugin optimizers need sok.OptimizerScope to reach the embedding
        # variables; aggregation is disabled because SOK communicates itself.
        if "plugin" not in args.optimizer:
            with sok.OptimizerScope(emb_variable):
                embedding_optimizer.apply_gradients(
                    zip(emb_grads, emb_variable),
                    experimental_aggregate_gradients=False)
        else:
            embedding_optimizer.apply_gradients(
                zip(emb_grads, emb_variable),
                experimental_aggregate_gradients=False)
        dense_optimizer.apply_gradients(zip(grads, other_variable))

        # Note: broadcast should be done after the first gradient step to ensure
        # optimizer has been initialized.
        # There is no need to broadcast emb_variable and embedding_optimizer,
        # because the parallel mode inside sok is model parallel and the
        # communication is down by sok itself.
        if first_batch:
            hvd.broadcast_variables(other_variable, root_rank=0)
            hvd.broadcast_variables(dense_optimizer.variables(), root_rank=0)

        return replica_loss

    for i, (inputs, labels) in enumerate(dataset):
        if args.stop_at_iter > 0 and i >= args.stop_at_iter:
            break
        rng = nvtx.start_range(message="Iteration_" + str(i), color="blue")
        total_loss = _train_step(inputs, labels, i == 0)
        nvtx.end_range(rng)
        print("[INFO]: Iteration: {}, loss={}".format(i, total_loss))