Example #1
def main(args):
    strategy = tf.distribute.MirroredStrategy()

    dataset = utility.TFDataset(filename=args.data_filename,
                                batchsize=args.global_batch_size,
                                as_sparse_tensor=False,
                                repeat=1)
    dataset = dataset.prefetch(tf.data.AUTOTUNE)

    dataset = strategy.experimental_distribute_dataset(dataset)

    with strategy.scope():
        sok.Init(global_batch_size=args.global_batch_size)

        model = SOKDenseDemo(
            max_vocabulary_size_per_gpu=args.max_vocabulary_size_per_gpu,
            embedding_vec_size=args.embedding_vec_size,
            slot_num=args.slot_num,
            nnz_per_slot=args.nnz_per_slot,
            num_dense_layers=args.num_dense_layers)

        embedding_optimizer = utility.get_embedding_optimizer(
            args.optimizer)(learning_rate=0.1)
        dense_optimizer = utility.get_dense_optimizer(
            args.optimizer)(learning_rate=0.1)

    loss_fn = tf.keras.losses.BinaryCrossentropy(
        from_logits=True, reduction=tf.keras.losses.Reduction.NONE)

    def _replica_loss(labels, logits):
        loss = loss_fn(labels, logits)
        return tf.nn.compute_average_loss(
            loss, global_batch_size=args.global_batch_size)

    @tf.function
    def _train_step(inputs, labels):
        with tf.GradientTape() as tape:
            logit = model(inputs, training=True)
            loss = _replica_loss(labels, logit)
        emb_variable, other_variable = sok.split_embedding_variable_from_others(
            model.trainable_variables)
        grads, emb_grads = tape.gradient(loss, [other_variable, emb_variable])
        if "plugin" not in args.optimizer:
            with sok.OptimizerScope(emb_variable):
                embedding_optimizer.apply_gradients(
                    zip(emb_grads, emb_variable),
                    experimental_aggregate_gradients=False)
        else:
            embedding_optimizer.apply_gradients(
                zip(emb_grads, emb_variable),
                experimental_aggregate_gradients=False)
        dense_optimizer.apply_gradients(zip(grads, other_variable))
        return loss

    for i, (inputs, labels) in enumerate(dataset):
        if args.stop_at_iter > 0 and i >= args.stop_at_iter:
            break

        rng = nvtx.start_range(message="Iteration_" + str(i), color="blue")

        replica_loss = strategy.run(_train_step, args=(inputs, labels))
        loss = strategy.reduce(tf.distribute.ReduceOp.SUM,
                               replica_loss,
                               axis=None)

        nvtx.end_range(rng)
        print("[INFO]: Iteration: {}, loss={}".format(i, loss))
Example #2
def test_sok_multi_dense_emb(args):
    comm_options = tf.distribute.experimental.CommunicationOptions(
        bytes_per_pack=0,
        timeout_seconds=None,
        implementation=tf.distribute.experimental.CommunicationImplementation.
        NCCL)

    if args.worker_num == 1:
        strategy = tf.distribute.MirroredStrategy()
    else:
        port = 12345
        os.environ["TF_CONFIG"] = json.dumps({
            "cluster": {
                "worker": [
                    "localhost" + ":" + str(port + i)
                    for i in range(args.worker_num)
                ]
            },
            "task": {
                "type": "worker",
                "index": args.task_id
            }
        })
        strategy = tf.distribute.MultiWorkerMirroredStrategy(
            communication_options=comm_options)

    replica_batch_size = args.global_batch_size // (args.worker_num * 1)

    dataset = utility.TFDataset(filename=args.file_prefix + str(args.task_id) +
                                ".file",
                                batchsize=replica_batch_size,
                                as_sparse_tensor=False,
                                repeat=1)
    dataset = dataset.prefetch(tf.data.AUTOTUNE)

    dynamic_input = (args.dynamic_input == 1)

    with strategy.scope():
        sok.Init(global_batch_size=args.global_batch_size)

        model = SOKDenseModel(
            max_vocabulary_size_per_gpu=args.max_vocabulary_size_per_gpu,
            embedding_vec_size_list=args.embedding_vec_size_list,
            slot_num_list=args.slot_num_list,
            nnz_per_slot_list=[
                args.nnz_per_slot for _ in range(len(args.slot_num_list))
            ],
            num_dense_layers=args.num_dense_layers,
            dynamic_input=dynamic_input)

        emb_opt = utils.get_embedding_optimizer(
            args.optimizer)(learning_rate=0.1)
        dense_opt = utils.get_dense_optimizer(
            args.optimizer)(learning_rate=0.1)
        if args.mixed_precision:
            emb_opt = tf.keras.mixed_precision.LossScaleOptimizer(
                emb_opt, initial_scale=1024)

    # Set initial values for the embedding variables.
    sok_saver = sok.Saver()
    for i, layer in enumerate(model.embedding_layers):
        init_tensors = utils.get_ones_tensor(
            max_vocab_size_per_gpu=args.max_vocabulary_size_per_gpu,
            embedding_vec_size=args.embedding_vec_size_list[i],
            num=args.worker_num)
        sok_saver.load_embedding_values(layer.embedding_variable, init_tensors)

    loss_fn = tf.keras.losses.BinaryCrossentropy(
        from_logits=True, reduction=tf.keras.losses.Reduction.NONE)

    def _replica_loss(labels, logits):
        loss = loss_fn(labels, logits)
        _dtype = loss.dtype
        loss = tf.cast(loss, tf.float32)
        loss = tf.nn.compute_average_loss(
            loss, global_batch_size=args.global_batch_size)
        return tf.cast(loss, _dtype)

    @tf.function
    def _train_step(inputs, labels):
        with tf.GradientTape() as tape:
            logit, all_vectors = model(inputs, training=True)
            loss = _replica_loss(labels, logit)
            if args.mixed_precision:
                _loss = emb_opt.get_scaled_loss(loss)
            else:
                _loss = loss
        emb_variable, other_variable = sok.split_embedding_variable_from_others(
            model.trainable_variables)
        grads, emb_grads = tape.gradient(_loss, [other_variable, emb_variable])
        if args.mixed_precision:
            grads = emb_opt.get_unscaled_gradients(grads)
            emb_grads = emb_opt.get_unscaled_gradients(emb_grads)

        if "plugin" not in args.optimizer:
            with sok.OptimizerScope(emb_variable):
                emb_opt.apply_gradients(zip(emb_grads, emb_variable),
                                        experimental_aggregate_gradients=False)
        else:
            emb_opt.apply_gradients(zip(emb_grads, emb_variable),
                                    experimental_aggregate_gradients=False)

        with tf.control_dependencies(emb_grads):
            # manually all-reduce dense gradients
            replica_context = tf.distribute.get_replica_context()
            grads = replica_context.all_reduce("sum",
                                               grads,
                                               options=comm_options)
            dense_opt.apply_gradients(zip(grads, other_variable),
                                      experimental_aggregate_gradients=False)

            # manually all-reduce the loss; this is fine because replica_loss has
            # already been used to update the local variables.
            loss = replica_context.all_reduce(tf.distribute.ReduceOp.SUM,
                                              loss,
                                              options=comm_options)
        return loss, all_vectors, logit

    # Collect the embedding vectors produced at each iteration.
    sok_results = list()
    for i, (inputs, labels) in enumerate(dataset):
        if args.stop_iter >= 0 and i >= args.stop_iter:
            break

        total_loss, all_vectors, logit = strategy.run(_train_step,
                                                      args=(inputs, labels))
        print("[INFO]: Iteration: {}, loss={}".format(i, total_loss))

        with tf.device("CPU:0"):
            sok_results.append(all_vectors)

    return sok_results
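
In the multi-worker branch above, each worker process needs its own `task_id` so that the TF_CONFIG it builds points at the right cluster slot. A minimal launcher sketch, assuming `args` is a picklable namespace with a writable `task_id` field (an assumption; the original harness that sets this up is not shown):

import copy
import multiprocessing as mp

def launch_workers(base_args):
    # Hypothetical launcher: one spawned process per worker, each with its own
    # task_id, so every process builds its own TF_CONFIG before touching CUDA.
    ctx = mp.get_context("spawn")
    procs = []
    for task_id in range(base_args.worker_num):
        worker_args = copy.deepcopy(base_args)
        worker_args.task_id = task_id
        p = ctx.Process(target=test_sok_multi_dense_emb, args=(worker_args,))
        p.start()
        procs.append(p)
    for p in procs:
        p.join()
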
Example #3
def main(args, task_id):
    print("task id={}".format(task_id))
    comm_options = tf.distribute.experimental.CommunicationOptions(
        bytes_per_pack=0,
        timeout_seconds=None,
        implementation=tf.distribute.experimental.CommunicationImplementation.
        NCCL)

    # if MirroredStrategy is used here and _train_step is not decorated by @tf.function,
    # there will be a "Bad file descriptor" error related to multiprocessing at the end
    # of the program.
    #if args.total_gpu_num == 1:
    #    strategy = tf.distribute.MirroredStrategy()
    if True:
        port = 12345
        os.environ["TF_CONFIG"] = json.dumps({
            "cluster": {
                "worker": [
                    "localhost" + ":" + str(port + i)
                    for i in range(args.worker_num)
                ]
            },
            "task": {
                "type": "worker",
                "index": task_id
            }
        })
        strategy = tf.distribute.MultiWorkerMirroredStrategy(
            communication_options=comm_options)

    if args.data_splited:
        filename = args.data_filename + str(task_id) + ".file"
    else:
        filename = args.data_filename

    replica_batch_size = args.global_batch_size // (args.worker_num * 1)

    dataset = utility.TFDataset(filename=filename,
                                batchsize=replica_batch_size,
                                as_sparse_tensor=False,
                                repeat=1)
    dataset = dataset.prefetch(tf.data.AUTOTUNE)

    with strategy.scope():
        model = TfDenseDemo(global_batch_size=args.global_batch_size,
                            vocabulary_size=args.vocabulary_size,
                            slot_num=args.slot_num,
                            nnz_per_slot=args.nnz_per_slot,
                            num_dense_layers=args.num_dense_layers,
                            embedding_vec_size=args.embedding_vec_size)
        emb_optimizer = utility.get_dense_optimizer(
            args.optimizer)(learning_rate=0.1)
        dense_optimizer = utility.get_dense_optimizer(
            args.optimizer)(learning_rate=0.1)

    loss_fn = tf.keras.losses.BinaryCrossentropy(
        from_logits=True, reduction=tf.keras.losses.Reduction.NONE)

    def _replica_loss(labels, logits):
        loss = loss_fn(labels, logits)
        return tf.nn.compute_average_loss(
            loss, global_batch_size=args.global_batch_size)

    # Note: all_reduce of IndexedSlices is not supported in eager mode, hence the @tf.function below
    @tf.function
    def _train_step(inputs, labels):
        with tf.GradientTape() as tape:
            logit = model(inputs, training=True)
            loss = _replica_loss(labels, logit)

        emb_vars, dense_vars = split_emb_and_dense_variables(
            model.trainable_variables)

        # Debug code
        #print("number of embedding variables: {}".format(len(emb_vars)))
        #print("number of dense variables    : {}".format(len(dense_vars)))

        emb_grads, dense_grads = tape.gradient(loss, [emb_vars, dense_vars])

        # update the variables of the embedding layer
        emb_optimizer.apply_gradients(zip(emb_grads, emb_vars),
                                      experimental_aggregate_gradients=False)

        # Manually all-reduce dense gradients and update the variables of the dense layers
        replica_context = tf.distribute.get_replica_context()
        dense_grads = replica_context.all_reduce("sum",
                                                 dense_grads,
                                                 options=comm_options)
        dense_optimizer.apply_gradients(zip(dense_grads, dense_vars),
                                        experimental_aggregate_gradients=False)

        # manually all-reduce the loss; this is fine because replica_loss has already
        # been used to update the local variables.
        loss = replica_context.all_reduce(tf.distribute.ReduceOp.SUM,
                                          loss,
                                          options=comm_options)
        return loss

    time_arr = []
    for i, (inputs, labels) in enumerate(dataset):
        if args.stop_at_iter > 0 and i >= args.stop_at_iter:
            break

        rng = nvtx.start_range(message="Iteration_" + str(i), color="blue")
        start_time = time.time()
        loss = strategy.run(_train_step, args=(inputs, labels))
        time_arr.append(time.time() - start_time)

        nvtx.end_range(rng)
        print("[INFO]: Iteration: {}, loss={}".format(i, loss))

    print("Average iteration time (except 1st iteration): ",
          np.mean(time_arr[1:]))
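
`split_emb_and_dense_variables` is called but not defined in this snippet. A plausible sketch, assuming the embedding variables of `TfDenseDemo` can be recognized by the substring "embedding" in their names (this depends on how the model names its layers):

def split_emb_and_dense_variables(trainable_variables):
    # Hypothetical helper: partition trainable variables into embedding vs. dense by name.
    emb_vars, dense_vars = [], []
    for var in trainable_variables:
        if "embedding" in var.name.lower():
            emb_vars.append(var)
        else:
            dense_vars.append(var)
    return emb_vars, dense_vars
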
Example #4
def main(args, task_id):
    print(task_id)
    comm_options = tf.distribute.experimental.CommunicationOptions(
        bytes_per_pack=0,
        timeout_seconds=None,
        implementation=tf.distribute.experimental.CommunicationImplementation.
        NCCL)

    if args.total_gpu_num == 1:
        strategy = tf.distribute.MirroredStrategy()
    else:
        port = 12345
        os.environ["TF_CONFIG"] = json.dumps({
            "cluster": {
                "worker": [
                    "localhost" + ":" + str(port + i)
                    for i in range(args.worker_num)
                ]
            },
            "task": {
                "type": "worker",
                "index": task_id
            }
        })
        strategy = tf.distribute.MultiWorkerMirroredStrategy(
            communication_options=comm_options)

    if args.data_splited:
        filename = args.data_filename + str(task_id) + ".file"
    else:
        filename = args.data_filename

    replica_batch_size = args.global_batch_size // (args.worker_num * 1)

    dataset = utility.TFDataset(filename=filename,
                                batchsize=replica_batch_size,
                                as_sparse_tensor=False,
                                repeat=1)
    dataset = dataset.prefetch(tf.data.AUTOTUNE)

    with strategy.scope():
        sok.Init(global_batch_size=args.global_batch_size)

        model = SOKDenseDemo(
            max_vocabulary_size_per_gpu=args.max_vocabulary_size_per_gpu,
            embedding_vec_size=args.embedding_vec_size,
            slot_num=args.slot_num,
            nnz_per_slot=args.nnz_per_slot,
            num_dense_layers=args.num_dense_layers)

        embedding_optimizer = utility.get_embedding_optimizer(
            args.optimizer)(learning_rate=0.1)
        dense_optimizer = utility.get_dense_optimizer(
            args.optimizer)(learning_rate=0.1)

    loss_fn = tf.keras.losses.BinaryCrossentropy(
        from_logits=True, reduction=tf.keras.losses.Reduction.NONE)

    def _replica_loss(labels, logits):
        loss = loss_fn(labels, logits)
        return tf.nn.compute_average_loss(
            loss, global_batch_size=args.global_batch_size)

    @tf.function
    def _train_step(inputs, labels):
        with tf.GradientTape() as tape:
            logit = model(inputs, training=True)
            loss = _replica_loss(labels, logit)
        emb_variable, other_variable = sok.split_embedding_variable_from_others(
            model.trainable_variables)
        grads, emb_grads = tape.gradient(loss, [other_variable, emb_variable])
        if "plugin" not in args.optimizer:
            with sok.OptimizerScope(emb_variable):
                embedding_optimizer.apply_gradients(
                    zip(emb_grads, emb_variable),
                    experimental_aggregate_gradients=False)
        else:
            embedding_optimizer.apply_gradients(
                zip(emb_grads, emb_variable),
                experimental_aggregate_gradients=False)

        # manually all-reduce dense gradients
        replica_context = tf.distribute.get_replica_context()
        grads = replica_context.all_reduce("sum", grads, options=comm_options)
        dense_optimizer.apply_gradients(zip(grads, other_variable),
                                        experimental_aggregate_gradients=False)

        # manually all-reduce the loss; this is fine because replica_loss has already
        # been used to update the local variables.
        loss = replica_context.all_reduce(tf.distribute.ReduceOp.SUM,
                                          loss,
                                          options=comm_options)
        return loss

    time_arr = []
    for i, (inputs, labels) in enumerate(dataset):
        if args.stop_at_iter > 0 and i >= args.stop_at_iter:
            break

        rng = nvtx.start_range(message="Iteration_" + str(i), color="blue")

        start_time = time.time()
        total_loss = strategy.run(_train_step, args=(inputs, labels))
        time_arr.append(time.time() - start_time)

        nvtx.end_range(rng)
        if task_id == '0':
            print("[INFO]: Iteration: {}, loss={}".format(i, total_loss))

    if task_id == '0':
        print("Average iteration time (except 1st iteration): ",
              np.mean(time_arr[1:]))
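
`utility.get_embedding_optimizer` and `utility.get_dense_optimizer` (imported as `utils` in some of the other examples) are only called, never defined, in these snippets. A rough sketch of the factory shape they imply is below; the name-to-class mapping is an assumption, and the branch for SOK's "plugin_*" optimizers is deliberately left unimplemented because the concrete SOK classes are not shown in these examples.

import tensorflow as tf

# Call shape used above: get_dense_optimizer(args.optimizer)(learning_rate=0.1)
_TF_OPTIMIZERS = {
    "adam": tf.keras.optimizers.Adam,
    "sgd": tf.keras.optimizers.SGD,
}

def get_dense_optimizer(name):
    # Dense layers always use a stock TF optimizer in these examples.
    return _TF_OPTIMIZERS[name.replace("plugin_", "")]

def get_embedding_optimizer(name):
    if "plugin" in name:
        # A "plugin_*" name would select one of SOK's custom embedding optimizers,
        # which is why the training steps above skip sok.OptimizerScope in that case.
        raise NotImplementedError("SOK plugin optimizer selection is not sketched here")
    return _TF_OPTIMIZERS[name]
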
Example #5
def test_sok_multi_dense_emb(args):
    assert (args.global_batch_size % args.worker_num == 0)
    replica_batch_size = args.global_batch_size // (args.worker_num)

    dataset = utility.TFDataset(filename=args.file_prefix + str(args.task_id) +
                                ".file",
                                batchsize=replica_batch_size,
                                as_sparse_tensor=False,
                                repeat=1)
    dataset = dataset.prefetch(tf.data.AUTOTUNE)

    dynamic_input = (args.dynamic_input == 1)

    # Initialize SOK
    sok.Init(global_batch_size=args.global_batch_size)

    model = SOKDenseModel(
        max_vocabulary_size_per_gpu=args.max_vocabulary_size_per_gpu,
        embedding_vec_size_list=args.embedding_vec_size_list,
        slot_num_list=args.slot_num_list,
        nnz_per_slot_list=[
            args.nnz_per_slot for _ in range(len(args.slot_num_list))
        ],
        num_dense_layers=args.num_dense_layers,
        dynamic_input=dynamic_input,
        use_hashtable=args.use_hashtable)

    emb_opt = utils.get_embedding_optimizer(args.optimizer)(learning_rate=0.1)
    dense_opt = utils.get_dense_optimizer(args.optimizer)(learning_rate=0.1)
    if args.mixed_precision:
        emb_opt = tf.keras.mixed_precision.LossScaleOptimizer(
            emb_opt, initial_scale=1024)

    sok_saver = sok.Saver()
    for i, layer in enumerate(model.embedding_layers):
        init_tensors = utils.get_ones_tensor(
            max_vocab_size_per_gpu=args.max_vocabulary_size_per_gpu,
            embedding_vec_size=args.embedding_vec_size_list[i],
            num=args.worker_num)

        sok_saver.load_embedding_values(layer.embedding_variable, init_tensors)

    loss_fn = tf.keras.losses.BinaryCrossentropy(
        from_logits=True, reduction=tf.keras.losses.Reduction.NONE)

    def _replica_loss(labels, logits):
        loss = loss_fn(labels, logits)
        _dtype = loss.dtype
        loss = tf.cast(loss, tf.float32)
        loss = tf.nn.compute_average_loss(
            loss, global_batch_size=args.global_batch_size)
        return tf.cast(loss, _dtype)

    @tf.function
    def _train_step(inputs, labels, first_batch):
        with tf.GradientTape() as tape:
            logit, all_vectors = model(inputs, training=True)
            replica_loss = _replica_loss(labels, logit)
            if args.mixed_precision:
                _loss = emb_opt.get_scaled_loss(replica_loss)
            else:
                _loss = replica_loss

        emb_var, other_var = sok.split_embedding_variable_from_others(
            model.trainable_variables)
        emb_grads, grads = tape.gradient(_loss, [emb_var, other_var])
        if args.mixed_precision:
            emb_grads = emb_opt.get_unscaled_gradients(emb_grads)
            grads = emb_opt.get_unscaled_gradients(grads)

        if "plugin" not in args.optimizer:
            with sok.OptimizerScope(emb_var):
                emb_opt.apply_gradients(zip(emb_grads, emb_var),
                                        experimental_aggregate_gradients=False)
        else:
            emb_opt.apply_gradients(zip(emb_grads, emb_var),
                                    experimental_aggregate_gradients=False)

        with tf.control_dependencies(emb_grads):

            grads = [hvd.allreduce(grad) for grad in grads]
            dense_opt.apply_gradients(zip(grads, other_var))

            if first_batch:
                hvd.broadcast_variables(other_var, root_rank=0)
                hvd.broadcast_variables(dense_opt.variables(), root_rank=0)

            total_loss = hvd.allreduce(replica_loss)
        return total_loss, all_vectors

    sok_results = list()
    for i, (inputs, labels) in enumerate(dataset):
        if args.stop_iter >= 0 and i >= args.stop_iter:
            break

        total_loss, all_vectors = _train_step(inputs, labels, 0 == i)
        print("[INFO]: Iteration: {}, loss={}".format(i, total_loss))

        sok_results.append(all_vectors)
    return sok_results
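
Example #5 calls Horovod collectives (`hvd.allreduce`, `hvd.broadcast_variables`) but does not show the Horovod setup, which presumably happens before `test_sok_multi_dense_emb` is invoked, along the lines of Example #6. A minimal driver sketch; the argument values are illustrative only, and the real harness would build `args` from its own CLI:

import argparse

import horovod.tensorflow as hvd
import tensorflow as tf

if __name__ == "__main__":
    hvd.init()
    gpus = tf.config.list_physical_devices("GPU")
    tf.config.set_visible_devices(gpus[hvd.local_rank()], "GPU")

    # Illustrative values only; the list arguments must keep consistent lengths.
    args = argparse.Namespace(
        global_batch_size=8192,
        worker_num=hvd.size(),
        task_id=hvd.local_rank(),   # each rank reads file_prefix + str(task_id) + ".file"
        file_prefix="./data_",
        max_vocabulary_size_per_gpu=1024,
        embedding_vec_size_list=[4, 8],
        slot_num_list=[5, 5],
        nnz_per_slot=4,
        num_dense_layers=5,
        dynamic_input=0,
        use_hashtable=True,
        optimizer="adam",
        mixed_precision=False,
        stop_iter=10,
    )
    results = test_sok_multi_dense_emb(args)
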
Example #6
def main(args):
    # Initialize horovod
    hvd.init()

    gpus = tf.config.list_physical_devices("GPU")
    tf.config.set_visible_devices(gpus[hvd.local_rank()], "GPU")

    # Generate the local filename.
    # Assume the dataset has been split in advance.
    local_file = args.data_filename_prefix + str(hvd.local_rank()) + ".file"

    # Generate the local batch size
    assert (args.global_batch_size % hvd.size() == 0)
    local_batch_size = args.global_batch_size // hvd.size()

    dataset = utility.TFDataset(filename=local_file,
                                batchsize=local_batch_size,
                                as_sparse_tensor=False,
                                repeat=1)
    dataset = dataset.prefetch(tf.data.AUTOTUNE)

    # Because no TensorFlow distribution strategy is used here, sok.Init() relies on Horovod
    # to broadcast the NCCL id and random seed, so it must be called after hvd.init()
    sok.Init(global_batch_size=args.global_batch_size)

    model = SOKDenseDemo(
        max_vocabulary_size_per_gpu=args.max_vocabulary_size_per_gpu,
        embedding_vec_size=args.embedding_vec_size,
        slot_num=args.slot_num,
        nnz_per_slot=args.nnz_per_slot,
        num_dense_layers=args.num_dense_layers)

    embedding_optimizer = utility.get_embedding_optimizer(
        args.optimizer)(learning_rate=0.1)
    dense_optimizer = utility.get_dense_optimizer(
        args.optimizer)(learning_rate=0.1)

    loss_fn = tf.keras.losses.BinaryCrossentropy(
        from_logits=True, reduction=tf.keras.losses.Reduction.NONE)

    def _replica_loss(labels, logits):
        loss = loss_fn(labels, logits)
        return tf.nn.compute_average_loss(
            loss, global_batch_size=args.global_batch_size)

    @tf.function
    def _train_step(inputs, labels, first_batch):
        with tf.GradientTape() as tape, tf.GradientTape() as emb_tape:
            logit = model(inputs, training=True)
            replica_loss = _replica_loss(labels, logit)

        # Horovod: wrap tf.GradientTape with Horovod DistributedGradientTape
        tape = hvd.DistributedGradientTape(tape)

        # There is no need to wrap the emb_tape because the communication is done by sok
        # emb_tape = hvd.DistributedGradientTape(emb_tape)

        emb_variable, other_variable = sok.split_embedding_variable_from_others(
            model.trainable_variables)

        # type(tape) here is hvd.DistributedGradientTape
        # type(emb_tape) here is tf.GradientTape
        emb_grads = emb_tape.gradient(replica_loss, emb_variable)
        grads = tape.gradient(replica_loss, other_variable)

        if "plugin" not in args.optimizer:
            with sok.OptimizerScope(emb_variable):
                embedding_optimizer.apply_gradients(
                    zip(emb_grads, emb_variable),
                    experimental_aggregate_gradients=False)
        else:
            embedding_optimizer.apply_gradients(
                zip(emb_grads, emb_variable),
                experimental_aggregate_gradients=False)
        dense_optimizer.apply_gradients(zip(grads, other_variable))

        # Note: broadcasting should be done after the first gradient step to ensure the optimizer has been initialized.
        # There is no need to broadcast emb_variable or embedding_optimizer, because the parallel mode inside
        # sok is model parallel and the communication is done by sok itself.
        if first_batch:
            hvd.broadcast_variables(other_variable, root_rank=0)
            hvd.broadcast_variables(dense_optimizer.variables(), root_rank=0)

        return replica_loss

    for i, (inputs, labels) in enumerate(dataset):
        if args.stop_at_iter > 0 and i >= args.stop_at_iter:
            break

        rng = nvtx.start_range(message="Iteration_" + str(i), color="blue")

        total_loss = _train_step(inputs, labels, i == 0)

        nvtx.end_range(rng)
        print("[INFO]: Iteration: {}, loss={}".format(i, total_loss))