Example 1
    def init(self, task_id):
        ip = "127.0.0.1"
        port = "12345"
        port1 = "12346"
        if 0 == task_id:
            os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
        elif 1 == task_id:
            os.environ["CUDA_VISIBLE_DEVICES"] = "4,5,6,7"
        else:
            raise RuntimeError("task_id can only be one of [0, 1].")

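        # TF_CONFIG describes a two-worker, single-host cluster; the
        # TFConfigClusterResolver below reads it for MultiWorkerMirroredStrategy.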
        os.environ['TF_CONFIG'] = json.dumps({
            "cluster": {
                "worker": [ip + ":" + port, ip + ":" + port1]
            },
            "task": {
                "type": "worker",
                "index": task_id
            }
        })

        resolver = tf.distribute.cluster_resolver.TFConfigClusterResolver()

        self.strategy = tf.distribute.MultiWorkerMirroredStrategy(resolver)
        with self.strategy.scope():
            init_re = sok.Init()
Example 2
def main():
    global_batch_size = 1024
    slot_num = 10
    nnz_per_slot = 5

    policy = tf.keras.mixed_precision.Policy("mixed_float16")
    tf.keras.mixed_precision.set_global_policy(policy)

    strategy = tf.distribute.MirroredStrategy()

    dataset = utility.get_dataset(global_batch_size,
                                  read_batchsize=global_batch_size)
    dataset = strategy.experimental_distribute_dataset(dataset)

    with strategy.scope():
        sok.Init(global_batch_size=global_batch_size)

        model = utility.SOKDenseDemo(max_vocabulary_size_per_gpu=1024,
                                     embedding_vec_size=8,
                                     slot_num=slot_num,
                                     nnz_per_slot=nnz_per_slot,
                                     num_dense_layers=0)

        optimizer = tf.keras.optimizers.Adam(learning_rate=0.1)
        optimizer = tf.keras.mixed_precision.LossScaleOptimizer(optimizer)

    loss_fn = tf.keras.losses.BinaryCrossentropy(
        from_logits=True, reduction=tf.keras.losses.Reduction.NONE)

    def _replica_loss(labels, logits):
        labels = tf.cast(labels, logits.dtype)
        loss = loss_fn(labels, logits)
        dtype = loss.dtype
        loss = tf.cast(loss, tf.float32)
        loss = tf.nn.compute_average_loss(loss,
                                          global_batch_size=global_batch_size)
        return tf.cast(loss, dtype)

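    # The train step splits the SOK embedding variables from the dense variables so
    # the embedding gradients can be applied without TF's cross-replica aggregation.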
    @tf.function
    def train_step(inputs, labels):
        with tf.GradientTape() as tape:
            logit = model(inputs, training=True)
            loss = _replica_loss(labels, logit)
            scaled_loss = optimizer.get_scaled_loss(loss)
        emb_vars, other_vars =\
            sok.split_embedding_variable_from_others(model.trainable_variables)
        scaled_emb_grads, scaled_other_grads = tape.gradient(
            scaled_loss, [emb_vars, other_vars])
        emb_grads = optimizer.get_unscaled_gradients(scaled_emb_grads)
        other_grads = optimizer.get_unscaled_gradients(scaled_other_grads)
        with sok.OptimizerScope(emb_vars):
            optimizer.apply_gradients(zip(emb_grads, emb_vars),
                                      experimental_aggregate_gradients=False)
        optimizer.apply_gradients(zip(other_grads, other_vars))
        return loss

    for step, (inputs, labels) in enumerate(dataset):
        replica_loss = strategy.run(train_step, args=(inputs, labels))
        total_loss = strategy.reduce("sum", replica_loss, axis=None)
        print("[INFO]: step {}, loss {}".format(step, total_loss))
Example 3
def main(args):
    comm_options = None

    if "mirrored" == args.distribute_strategy:
        available_cuda_devices = ",".join(
            [str(gpu_id) for gpu_id in range(args.gpu_num)])
        os.environ["CUDA_VISIBLE_DEVICES"] = available_cuda_devices

        strategy = tf.distribute.MirroredStrategy()
        args.task_id = 0
    elif "multiworker" == args.distribute_strategy:
        args.task_id = int(os.getenv("OMPI_COMM_WORLD_RANK"))
        os.environ["CUDA_VISIBLE_DEVICES"] = str(args.task_id)
        args.gpu_num = int(os.getenv("OMPI_COMM_WORLD_SIZE"))

        comm_options = tf.distribute.experimental.CommunicationOptions(
            bytes_per_pack=0,
            timeout_seconds=None,
            implementation=tf.distribute.experimental.
            CommunicationImplementation.NCCL)

        import json
        port = 12345
        os.environ["TF_CONFIG"] = json.dumps({
            "cluster": {
                "worker": [
                    "localhost" + ":" + str(port + i)
                    for i in range(args.gpu_num)
                ]
            },
            "task": {
                "type": "worker",
                "index": args.task_id
            }
        })
        strategy = tf.distribute.MultiWorkerMirroredStrategy(
            communication_options=comm_options)
    elif "horovod" == args.distribute_strategy:
        import horovod.tensorflow as hvd
        hvd.init()

        args.task_id = hvd.local_rank()
        args.gpu_num = hvd.size()
        os.environ["CUDA_VISIBLE_DEVICES"] = str(args.task_id)
        strategy = utils.NullStrategy()
    else:
        raise ValueError(
            "Unsupported distribute_strategy. "
            "Can only be one of ['mirrored', 'multiworker', 'horovod']"
            f", but got {args.distribute_strategy}")

    with strategy.scope():
        if args.embedding_layer == "SOK":
            sok.Init(global_batch_size=args.global_batch_size)

        model = DLRM(vocab_size=args.vocab_size_list,
                     num_dense_features=args.num_dense_features,
                     embedding_layer=args.embedding_layer,
                     embedding_vec_size=args.embedding_vec_size,
                     bottom_stack_units=args.bottom_stack,
                     top_stack_units=args.top_stack,
                     TF_MP=args.TF_MP,
                     comm_options=comm_options)

        lr_callable = utils.get_lr_callable(
            global_batch_size=args.global_batch_size,
            decay_exp=args.decay_exp,
            learning_rate=args.learning_rate,
            warmup_steps=args.warmup_steps,
            decay_steps=args.decay_steps,
            decay_start_steps=args.decay_start_steps)

        embedding_optimizer = utils.get_optimizer(args.embedding_optimizer)
        embedding_optimizer.learning_rate = lr_callable
        dense_optimizer = utils.get_optimizer("Adam")

    batch_size = (args.global_batch_size if args.distribute_strategy == "mirrored"
                  else args.global_batch_size // args.gpu_num)
    if args.distribute_strategy != "mirrored":
        args.train_file_pattern = utils.shard_filenames(
            args.train_file_pattern, args.gpu_num, args.task_id)
        args.test_file_pattern = utils.shard_filenames(args.test_file_pattern,
                                                       args.gpu_num,
                                                       args.task_id)

    train_dataset = CriteoTsvReader(file_pattern=args.train_file_pattern,
                                    num_dense_features=args.num_dense_features,
                                    vocab_sizes=args.vocab_size_list,
                                    batch_size=batch_size)
    val_dataset = CriteoTsvReader(file_pattern=args.test_file_pattern,
                                  num_dense_features=args.num_dense_features,
                                  vocab_sizes=args.vocab_size_list,
                                  batch_size=batch_size)

    distribute_dataset = (args.distribute_strategy == "mirrored"
                          and args.gpu_num > 1)
    train_dataset = utils.get_distribute_dataset(
        train_dataset, strategy, distribute_dataset=distribute_dataset)
    val_dataset = utils.get_distribute_dataset(
        val_dataset, strategy, distribute_dataset=distribute_dataset)
    val_dataset = iter(val_dataset)

    loss_fn = tf.keras.losses.BinaryCrossentropy(
        from_logits=True, reduction=tf.keras.losses.Reduction.NONE)

    def _replica_loss(labels, logits):
        loss = loss_fn(labels, logits)
        return tf.nn.compute_average_loss(
            loss, global_batch_size=args.global_batch_size)

    metrics = [
        tf.keras.metrics.AUC(name="auc"),
        tf.keras.metrics.BinaryAccuracy(name="accuracy"),
        tf.keras.metrics.Mean("prediction_mean"),
        tf.keras.metrics.Mean("label_mean")
    ]
    metrics_threshold = {"auc": 0.8025}

    @tf.function
    def _train_step(features, labels, first_batch=False):
        with tf.GradientTape() as tape:
            logits = model(features, training=True)
            loss = _replica_loss(labels, logits)

        emb_vars, other_vars = utils.split_embedding_variables_from_others(
            model)
        emb_grads, other_grads = tape.gradient(loss, [emb_vars, other_vars])

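        # The control dependency ensures the embedding gradients are ready before the
        # dense all-reduce, keeping the SOK and TF collectives from overlapping.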
        with tf.control_dependencies([logits] + emb_grads):
            utils.apply_gradients(embedding_optimizer,
                                  emb_vars,
                                  emb_grads,
                                  args.embedding_layer == "SOK",
                                  aggregate_gradients=(not args.TF_MP))

            other_grads = utils.all_reduce(other_grads,
                                           combiner="sum",
                                           comm_options=comm_options)
            utils.apply_gradients(dense_optimizer, other_vars, other_grads,
                                  False)

            if first_batch:
                utils.broadcast_variables(other_vars)
                utils.broadcast_variables(dense_optimizer.variables())

                if args.embedding_layer == "TF":
                    utils.broadcast_variables(emb_vars)
                    utils.broadcast_variables(embedding_optimizer.variables())

            total_loss = utils.all_reduce(loss,
                                          combiner="sum",
                                          comm_options=comm_options)
        return total_loss

    @tf.function
    def _val_step(features, labels, metrics):
        val_logits = model(features, training=False)
        val_loss = _replica_loss(labels, val_logits)
        val_loss = utils.all_reduce(val_loss,
                                    combiner="sum",
                                    comm_options=comm_options)

        labels = tf.identity(labels)
        val_logits = utils.all_gather(val_logits,
                                      axis=0,
                                      comm_options=comm_options)
        labels = utils.all_gather(labels, axis=0, comm_options=comm_options)

        return val_logits, labels, val_loss

    stopper = utils.EarlyStopper()

    begin_time = time.time()
    start_time = begin_time
    for i, (features, labels) in enumerate(train_dataset):
        if i >= args.train_steps:
            break
        if stopper.should_stop():
            print(stopper.stop_reason)
            break

        total_loss = strategy.run(_train_step, args=(features, labels, i == 0))

        if i % args.validation_interval == 0 and i != 0:
            val_features, val_labels = next(val_dataset)
            val_logits, val_labels, val_loss =\
                strategy.run(_val_step, args=(val_features, val_labels, metrics))

            if hasattr(val_labels, "values"):
                val_labels = val_labels.values[0]
                val_logits = val_logits.values[0]

            update_metrics_states(y_true=val_labels,
                                  y_pred=val_logits,
                                  metrics=metrics)
            val_logs = train_loop_end(metrics,
                                      total_loss,
                                      val_loss,
                                      embedding_optimizer,
                                      dense_optimizer,
                                      global_step=i)

            elapsed_time = time.time() - begin_time
            steps_sec = args.validation_interval / elapsed_time
            utils.show_logs(val_logs, strategy, elapsed_time, steps_sec,
                            metrics_threshold, stopper)
            begin_time = time.time()

    end_time = time.time()
    if args.task_id == 0:
        print(
            f"With {args.distribute_strategy} + {args.embedding_layer} embedding layer, "
            f"on {args.gpu_num} GPUs, and global_batch_size is {args.global_batch_size}, "
            f"it takes {end_time - start_time} seconds to "
            f"finish {args.train_steps} steps training for DLRM.")
Example 4
def test_sok_dense_demo(args, init_tensors, *random_samples):
    port = 12345
    os.environ["TF_CONFIG"] = json.dumps({
        "cluster": {"worker": [args.ips[i] + ":" + str(port + i) for i in range(args.worker_num)]},
        "task": {"type": "worker", "index": args.task_id}
    })
    strategy = tf.distribute.MultiWorkerMirroredStrategy()
    with strategy.scope():
        sok.Init(global_batch_size=args.global_batch_size)

        sok_dense_demo = SOKDenseDemo(max_vocabulary_size_per_gpu=args.max_vocabulary_size_per_gpu,
                                      embedding_vec_size=args.embedding_vec_size,
                                      slot_num=args.slot_num,
                                      nnz_per_slot=args.nnz_per_slot,
                                      use_hashtable=args.use_hashtable)

        emb_opt = utils.get_embedding_optimizer(args.optimizer)(learning_rate=0.1)
        dense_opt = utils.get_dense_optimizer(args.optimizer)(learning_rate=0.1)
        if args.mixed_precision:
            emb_opt = tf.keras.mixed_precision.LossScaleOptimizer(emb_opt, initial_scale=1024)

    sok_saver = sok.Saver()
    if 1 == args.restore_params:
        filepath = r"./embedding_variables"
        sok_saver.restore_from_file(sok_dense_demo.embedding_layer.embedding_variable, filepath)
    else:
        sok_saver.load_embedding_values(sok_dense_demo.embedding_layer.embedding_variable, init_tensors)

    loss_fn = tf.keras.losses.BinaryCrossentropy(from_logits=True, reduction=tf.keras.losses.Reduction.NONE)
    def _replica_loss(labels, logits):
        loss = loss_fn(labels, logits)
        _dtype = loss.dtype
        loss = tf.cast(loss, tf.float32)
        loss = tf.nn.compute_average_loss(loss, global_batch_size=args.global_batch_size)
        return tf.cast(loss, _dtype)

    @tf.function
    def _train_step(inputs, labels):
        with tf.GradientTape() as tape:
            logit, embedding_vector = sok_dense_demo(inputs, training=True)
            loss = _replica_loss(labels, logit)
            if args.mixed_precision:
                _loss = emb_opt.get_scaled_loss(loss)
            else:
                _loss = loss

        embedding_variables, other_variable = sok.split_embedding_variable_from_others(sok_dense_demo.trainable_variables)
        grads, emb_grads = tape.gradient(_loss, [other_variable, embedding_variables])
        if args.mixed_precision:
            grads = emb_opt.get_unscaled_gradients(grads)
            emb_grads = emb_opt.get_unscaled_gradients(emb_grads)

        if "plugin" not in args.optimizer:
            with sok.OptimizerScope(embedding_variables):
                emb_opt.apply_gradients(zip(emb_grads, embedding_variables),
                                        experimental_aggregate_gradients=False)
        else:
            emb_opt.apply_gradients(zip(emb_grads, embedding_variables),
                                    experimental_aggregate_gradients=False)
        dense_opt.apply_gradients(zip(grads, other_variable))
        return loss, embedding_vector

    sok_results = list()

    def _dataset_fn(input_context):
        replica_batch_size = input_context.get_per_replica_batch_size(args.global_batch_size)
        dataset = utils.tf_dataset(*random_samples, batchsize=replica_batch_size, 
                                   to_sparse_tensor=False, repeat=1)
        return dataset

    dataset = strategy.distribute_datasets_from_function(_dataset_fn)

    for i, (input_tensors, replica_labels) in enumerate(dataset):
        print("-"*30, "step ", str(i), "-"*30)
        loss, embedding_vector = strategy.run(_train_step, args=(input_tensors, replica_labels))
        loss = strategy.reduce("sum", loss, axis=None)
        print("[INFO]: iteration {}, loss {}".format(i, loss))
        sok_results.append(embedding_vector)


    # save params to file.
    if 1 == args.save_params:
        filepath = r"./embedding_variables"
        utils.try_make_dirs(filepath, chief=(args.task_id == 0))

        sok_saver.dump_to_file(sok_dense_demo.embedding_layer.embedding_variable, filepath)

    return sok_results, sok_dense_demo.embedding_layer.embedding_variable.values[0].m_var_name
Example 5
def run_sok_model(args, dense_variables, vocabulary_tensors, samples, labels):
    # split sample and labels
    assert (args.global_batch_size % hvd.size() == 0)
    local_batch_size = args.global_batch_size // hvd.size()
    local_id = hvd.local_rank()
    samples = samples[local_id * local_batch_size:(local_id + 1) *
                      local_batch_size]
    labels = labels[local_id * local_batch_size:(local_id + 1) *
                    local_batch_size]

    sok.Init(global_batch_size=args.global_batch_size)

    model = SOKDenseDemo(
        max_vocabulary_size_per_gpu=args.max_vocabulary_size_per_gpu,
        embedding_vec_size=args.embedding_vec_size,
        slot_num=args.slot_num,
        nnz_per_slot=args.nnz_per_slot,
        num_dense_layers=args.num_dense_layers,
        num_dense_units=args.num_dense_units)

    #model.build(input_shape=(local_batch_size, args.slot_num * args.nnz_per_slot * args.embedding_vec_size))
    model(samples, training=False)
    for i in range(args.num_dense_layers):
        model.dense_layers[i].trainable_variables[0].assign(
            dense_variables[0][i])
        model.dense_layers[i].trainable_variables[1].assign(
            dense_variables[1][i])

    sok_saver = sok.Saver()
    init_tensors = [tensor.numpy() for tensor in vocabulary_tensors]
    sok_saver.load_embedding_values(model.embedding_layer.embedding_variable,
                                    init_tensors)

    embedding_optimizer = utils.get_embedding_optimizer(
        args.optimizer)(learning_rate=0.1)
    dense_optimizer = utils.get_dense_optimizer(
        args.optimizer)(learning_rate=0.1)
    if args.mixed_precision:
        embedding_optimizer = tf.keras.mixed_precision.LossScaleOptimizer(
            embedding_optimizer, initial_scale=1024)

    loss_fn = tf.keras.losses.BinaryCrossentropy(
        from_logits=True, reduction=tf.keras.losses.Reduction.NONE)

    def _replica_loss(labels, logits):
        loss = loss_fn(labels, logits)
        _dtype = loss.dtype
        loss = tf.cast(loss, tf.float32)
        loss = tf.nn.compute_average_loss(
            loss, global_batch_size=args.global_batch_size)
        return tf.cast(loss, dtype=_dtype)

    @tf.function
    def _train_step(inputs, labels, first_batch):
        with tf.GradientTape() as tape, tf.GradientTape() as emb_tape:
            logit = model(inputs, training=True)
            replica_loss = _replica_loss(labels, logit)
            if args.mixed_precision:
                _loss = embedding_optimizer.get_scaled_loss(replica_loss)
            else:
                _loss = replica_loss

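        # Only the dense-variable tape is wrapped with hvd.DistributedGradientTape;
        # the embedding gradients from emb_tape are applied locally without aggregation.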
        tape = hvd.DistributedGradientTape(tape)

        emb_variable, other_variable = sok.split_embedding_variable_from_others(
            model.trainable_variables)
        emb_grads = emb_tape.gradient(_loss, emb_variable)
        grads = tape.gradient(_loss, other_variable)
        if args.mixed_precision:
            emb_grads = embedding_optimizer.get_unscaled_gradients(emb_grads)
            grads = embedding_optimizer.get_unscaled_gradients(grads)

        if 'plugin' not in args.optimizer:
            with sok.OptimizerScope(emb_variable):
                embedding_optimizer.apply_gradients(
                    zip(emb_grads, emb_variable),
                    experimental_aggregate_gradients=False)
        else:
            embedding_optimizer.apply_gradients(
                zip(emb_grads, emb_variable),
                experimental_aggregate_gradients=False)
        dense_optimizer.apply_gradients(zip(grads, other_variable))

        # Note: broadcast should be done after the first gradient step to ensure optimizer initialization.
        if first_batch:
            hvd.broadcast_variables(other_variable, root_rank=0)
            hvd.broadcast_variables(dense_optimizer.variables(), root_rank=0)

        return replica_loss

    loss_list = []
    for i in range(args.iter_num):
        loss = _train_step(samples, labels, i == 0)
        loss_list.append(loss)
        print("[INFO]: Iteration: {}, loss={}".format(i, loss))
    return loss_list
Example 6
def main():
    global_batch_size = 1024
    slot_num = 10
    nnz_per_slot = 5

    from tensorflow.python.keras.engine import base_layer_utils
    base_layer_utils.enable_v2_dtype_behavior()

    policy = tf.keras.mixed_precision.experimental.Policy("mixed_float16")
    tf.keras.mixed_precision.experimental.set_policy(policy)

    dataset = utility.get_dataset(global_batch_size // hvd.size(),
                                  read_batchsize=global_batch_size // hvd.size())

    sok_init_op = sok.Init(global_batch_size=global_batch_size)

    model = utility.SOKDenseDemo(max_vocabulary_size_per_gpu=1024,
                                embedding_vec_size=8,
                                slot_num=slot_num,
                                nnz_per_slot=nnz_per_slot,
                                num_dense_layers=0)

    optimizer = tf.keras.optimizers.Adam(learning_rate=0.1)
    optimizer = sok.tf.keras.mixed_precision.LossScaleOptimizer(optimizer, 1024)

    loss_fn = tf.keras.losses.BinaryCrossentropy(from_logits=True, reduction='none')
    def _replica_loss(labels, logits):
        loss = loss_fn(labels, logits)
        dtype = loss.dtype
        loss = tf.cast(loss, tf.float32)
        loss = tf.nn.compute_average_loss(loss, global_batch_size=global_batch_size)
        return tf.cast(loss, dtype)

    def train_step(inputs, labels):
        logit = model(inputs, training=True)
        loss = _replica_loss(labels, logit)
        scaled_loss = optimizer.get_scaled_loss(loss)
        scaled_gradients = tf.gradients(scaled_loss, model.trainable_variables)
        emb_vars, other_vars =\
            sok.split_embedding_variable_from_others(model.trainable_variables)
        scaled_emb_grads, scaled_other_grads =\
            scaled_gradients[:len(emb_vars)], scaled_gradients[len(emb_vars):]
        emb_grads = optimizer.get_unscaled_gradients(scaled_emb_grads)
        other_grads = optimizer.get_unscaled_gradients(scaled_other_grads)
        other_grads = [hvd.allreduce(grad) for grad in other_grads]
        with sok.OptimizerScope(emb_vars):
            emb_train_op = optimizer.apply_gradients(zip(emb_grads, emb_vars))
        other_train_op = optimizer.apply_gradients(zip(other_grads, other_vars))
        total_loss = hvd.allreduce(loss)
        with tf.control_dependencies([emb_train_op, other_train_op]):
            return tf.identity(total_loss)

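    # Graph-mode (TF1-style) execution: build the iterator and training op first, then
    # run sok_init_op and the variable/iterator initializers inside the Session below.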
    train_iterator = dataset.make_initializable_iterator()
    iterator_init = train_iterator.initializer
    inputs, labels = train_iterator.get_next()

    loss = train_step(inputs, labels)

    init_op = tf.group(tf.global_variables_initializer(), 
                       tf.local_variables_initializer())

    with tf.Session() as sess:
        sess.run(sok_init_op)
        sess.run([init_op, iterator_init])
        
        for step in range(10):
            loss_v = sess.run(loss)
            if hvd.local_rank() == 0:
                print("[INFO]: step {}, loss {}".format(step, loss_v))
Example 7
    my_affinity = affinity_map[rank]
    os.sched_setaffinity(0, my_affinity)


if __name__ == '__main__':

    if args.amp:
        print('[Info] use amp mode')
        policy = tf.keras.mixed_precision.Policy('mixed_float16')
        tf.keras.mixed_precision.set_global_policy(policy)

    hvd.init()
    # set_affinity(hvd.rank())

    global_batch_size = args.global_batch_size
    sok.Init(global_batch_size=global_batch_size)

    with open(os.path.join(args.data_dir, 'train/metadata.json'), 'r') as f:
        metadata = json.load(f)
    print(metadata)

    model = DLRM(
        metadata['vocab_sizes'],
        num_dense_features=13,
        embedding_vec_size=128,
        bottom_stack_units=[512, 256, 128],
        top_stack_units=[1024, 1024, 512, 256, 1],
        num_gpus=hvd.size(),
        use_cuda_interact=args.custom_interact,
        compress=args.compress,
    )
Example 8
def test_sok_multi_dense_emb(args):
    comm_options = tf.distribute.experimental.CommunicationOptions(
        bytes_per_pack=0,
        timeout_seconds=None,
        implementation=tf.distribute.experimental.CommunicationImplementation.
        NCCL)

    if args.worker_num == 1:
        strategy = tf.distribute.MirroredStrategy()
    else:
        port = 12345
        os.environ["TF_CONFIG"] = json.dumps({
            "cluster": {
                "worker": [
                    "localhost" + ":" + str(port + i)
                    for i in range(args.worker_num)
                ]
            },
            "task": {
                "type": "worker",
                "index": args.task_id
            }
        })
        strategy = tf.distribute.MultiWorkerMirroredStrategy(
            communication_options=comm_options)

    replica_batch_size = args.global_batch_size // (args.worker_num * 1)

    dataset = utility.TFDataset(filename=args.file_prefix + str(args.task_id) +
                                ".file",
                                batchsize=replica_batch_size,
                                as_sparse_tensor=False,
                                repeat=1)
    dataset = dataset.prefetch(tf.data.AUTOTUNE)

    dynamic_input = (args.dynamic_input == 1)

    with strategy.scope():
        sok.Init(global_batch_size=args.global_batch_size)

        model = SOKDenseModel(
            max_vocabulary_size_per_gpu=args.max_vocabulary_size_per_gpu,
            embedding_vec_size_list=args.embedding_vec_size_list,
            slot_num_list=args.slot_num_list,
            nnz_per_slot_list=[
                args.nnz_per_slot for _ in range(len(args.slot_num_list))
            ],
            num_dense_layers=args.num_dense_layers,
            dynamic_input=dynamic_input)

        emb_opt = utils.get_embedding_optimizer(
            args.optimizer)(learning_rate=0.1)
        dense_opt = utils.get_dense_optimizer(
            args.optimizer)(learning_rate=0.1)
        if args.mixed_precision:
            emb_opt = tf.keras.mixed_precision.LossScaleOptimizer(
                emb_opt, initial_scale=1024)

    # set initial value to embedding variables.
    sok_saver = sok.Saver()
    for i, layer in enumerate(model.embedding_layers):
        init_tensors = utils.get_ones_tensor(
            max_vocab_size_per_gpu=args.max_vocabulary_size_per_gpu,
            embedding_vec_size=args.embedding_vec_size_list[i],
            num=args.worker_num)
        sok_saver.load_embedding_values(layer.embedding_variable, init_tensors)

    loss_fn = tf.keras.losses.BinaryCrossentropy(
        from_logits=True, reduction=tf.keras.losses.Reduction.NONE)

    def _replica_loss(labels, logits):
        loss = loss_fn(labels, logits)
        _dtype = loss.dtype
        loss = tf.cast(loss, tf.float32)
        loss = tf.nn.compute_average_loss(
            loss, global_batch_size=args.global_batch_size)
        return tf.cast(loss, _dtype)

    @tf.function
    def _train_step(inputs, labels):
        with tf.GradientTape() as tape:
            logit, all_vectors = model(inputs, training=True)
            loss = _replica_loss(labels, logit)
            if args.mixed_precision:
                _loss = emb_opt.get_scaled_loss(loss)
            else:
                _loss = loss
        emb_variable, other_variable = sok.split_embedding_variable_from_others(
            model.trainable_variables)
        grads, emb_grads = tape.gradient(_loss, [other_variable, emb_variable])
        if args.mixed_precision:
            grads = emb_opt.get_unscaled_gradients(grads)
            emb_grads = emb_opt.get_unscaled_gradients(emb_grads)

        if "plugin" not in args.optimizer:
            with sok.OptimizerScope(emb_variable):
                emb_opt.apply_gradients(zip(emb_grads, emb_variable),
                                        experimental_aggregate_gradients=False)
        else:
            emb_opt.apply_gradients(zip(emb_grads, emb_variable),
                                    experimental_aggregate_gradients=False)

        with tf.control_dependencies(emb_grads):
            # manually all-reduce dense gradients
            replica_context = tf.distribute.get_replica_context()
            grads = replica_context.all_reduce("sum",
                                               grads,
                                               options=comm_options)
            dense_opt.apply_gradients(zip(grads, other_variable),
                                      experimental_aggregate_gradients=False)

            # manually all-reduce the loss; this is safe because the replica loss has
            # already been used to update the local variables.
            loss = replica_context.all_reduce(tf.distribute.ReduceOp.SUM,
                                              loss,
                                              options=comm_options)
        return loss, all_vectors, logit

    # save its results
    sok_results = list()
    for i, (inputs, labels) in enumerate(dataset):
        if args.stop_iter >= 0 and i >= args.stop_iter:
            break

        total_loss, all_vectors, logit = strategy.run(_train_step,
                                                      args=(inputs, labels))
        print("[INFO]: Iteration: {}, loss={}".format(i, total_loss))

        with tf.device("CPU:0"):
            sok_results.append(all_vectors)

    return sok_results
Example 9
def test_sok_demo(args, init_tensors, *random_samples):
    strategy = tf.distribute.MirroredStrategy()
    with strategy.scope():
        result = sok.Init(global_batch_size=args.global_batch_size)

        embedding_initializer = (tf.keras.initializers.Ones()
                                 if args.use_tf_initializer else None)

        plugin_demo = SOKDemo(
            combiner=args.combiner,
            max_vocabulary_size_per_gpu=args.max_vocabulary_size_per_gpu,
            slot_num=args.slot_num,
            max_nnz=args.max_nnz,
            embedding_vec_size=args.embedding_vec_size,
            use_hashtable=args.use_hashtable,
            key_dtype=args.key_dtype,
            embedding_initializer=embedding_initializer)

        emb_opt = utils.get_embedding_optimizer(
            args.optimizer)(learning_rate=0.1)
        dense_opt = utils.get_dense_optimizer(
            args.optimizer)(learning_rate=0.1)
        if args.mixed_precision:
            emb_opt = tf.keras.mixed_precision.LossScaleOptimizer(
                emb_opt, initial_scale=1024)

    plugin_saver = sok.Saver()

    if (1 == args.restore_params):  # restore from trained parameters
        filepath = r"./embedding_variables"
        plugin_saver.restore_from_file(
            plugin_demo.embedding_layer.embedding_variable, filepath)
    else:  # initialize using randomized initial value
        if not args.use_tf_initializer and init_tensors:
            status = plugin_saver.load_embedding_values(
                plugin_demo.embedding_layer.embedding_variable, init_tensors)

    loss_fn = tf.keras.losses.BinaryCrossentropy(
        from_logits=True, reduction=tf.keras.losses.Reduction.NONE)

    def _replica_loss(labels, logits):
        loss = loss_fn(labels, logits)
        _dtype = loss.dtype
        loss = tf.cast(loss, tf.float32)
        loss = tf.nn.compute_average_loss(
            loss, global_batch_size=args.global_batch_size)
        return tf.cast(loss, _dtype)

    @tf.function
    def _train_step(inputs, labels):
        with tf.GradientTape() as tape:
            logit, embedding_vector = plugin_demo(inputs, training=True)
            loss = _replica_loss(labels, logit)
            if args.mixed_precision:
                _loss = emb_opt.get_scaled_loss(loss)
            else:
                _loss = loss
        embedding_variables, other_variable = sok.split_embedding_variable_from_others(
            plugin_demo.trainable_variables)
        grads, emb_grads = tape.gradient(_loss,
                                         [other_variable, embedding_variables])
        if args.mixed_precision:
            grads = emb_opt.get_unscaled_gradients(grads)
            emb_grads = emb_opt.get_unscaled_gradients(emb_grads)

        with tf.control_dependencies([*emb_grads]):
            # in case NCCL runs concurrently via SOK and TF
            if 'plugin' not in args.optimizer:
                with sok.OptimizerScope(embedding_variables):
                    emb_opt.apply_gradients(
                        zip(emb_grads, embedding_variables),
                        experimental_aggregate_gradients=False)
            else:
                emb_opt.apply_gradients(zip(emb_grads, embedding_variables),
                                        experimental_aggregate_gradients=False)
            dense_opt.apply_gradients(zip(grads, other_variable))
            return loss, embedding_vector

    sok_results = list()

    def _dataset_fn(input_context):
        replica_batch_size = input_context.get_per_replica_batch_size(
            args.global_batch_size)
        dataset = utils.tf_dataset(*random_samples,
                                   batchsize=replica_batch_size,
                                   to_sparse_tensor=True,
                                   repeat=1,
                                   args=args)
        dataset = dataset.shard(input_context.num_input_pipelines,
                                input_context.input_pipeline_id)
        return dataset

    dataset = strategy.distribute_datasets_from_function(_dataset_fn)

    for i, (sparse_tensors, replica_labels) in enumerate(dataset):
        print("-" * 30, "step ", str(i), "-" * 30)
        loss, embedding_vector = strategy.run(_train_step,
                                              args=(sparse_tensors,
                                                    replica_labels))
        loss = strategy.reduce("sum", loss, axis=None)
        print("[INFO]: iteration {}, loss {}".format(i, loss))
        sok_results.append(embedding_vector)

    # save params to file.
    if 1 == args.save_params:
        filepath = r"./embedding_variables/"
        utils.try_make_dirs(filepath)

        plugin_saver.dump_to_file(
            plugin_demo.embedding_layer.embedding_variable, filepath)

    return (sok_results,
            plugin_demo.embedding_layer.embedding_variable.values[0].m_var_name)
Example 10
def get_sok_results(args, init_tensors, *random_samples):
    if args.distributed_tool == "onedevice":
        strategy = strategy_wrapper.OneDeviceStrategy()
    elif args.distributed_tool == "horovod":
        import horovod.tensorflow as hvd
        hvd.init()
        strategy = strategy_wrapper.HorovodStrategy()
    else:
        raise ValueError(f"{args.distributed_tool} is not supported.")

    with strategy.scope():
        sok_init_op = sok.Init(global_batch_size=args.global_batch_size)

        embedding_initializer = (tf.keras.initializers.Ones()
                                 if args.use_tf_initializer else None)

        sok_dense_demo = SOKDemo(
            max_vocabulary_size_per_gpu=args.max_vocabulary_size_per_gpu,
            embedding_vec_size=args.embedding_vec_size,
            slot_num=args.slot_num,
            nnz_per_slot=args.nnz_per_slot,
            use_hashtable=args.use_hashtable,
            dynamic_input=args.dynamic_input,
            num_of_dense_layers=0,
            key_dtype=args.key_dtype,
            embedding_initializer=embedding_initializer)

        emb_opt = utils.get_embedding_optimizer(
            args.optimizer)(learning_rate=0.1)
        dense_opt = utils.get_dense_optimizer(
            args.optimizer)(learning_rate=0.1)
        if args.mixed_precision:
            emb_opt = sok.tf.keras.mixed_precision.LossScaleOptimizer(
                emb_opt, 1024)

    sok_saver = sok.Saver()
    restore_op = list()
    for i, embedding_layer in enumerate(sok_dense_demo.embedding_layers):
        control_inputs = [restore_op[-1]] if restore_op else None
        with tf.control_dependencies(control_inputs):
            if args.restore_params:
                filepath = r"./embedding_variables"
                op = sok_saver.restore_from_file(
                    embedding_layer.embedding_variable, filepath)
            else:
                if not args.use_tf_initializer:
                    op = sok_saver.load_embedding_values(
                        embedding_layer.embedding_variable, init_tensors[i])
                else:
                    op = tf.constant(1.0)
            restore_op.append(op)

    loss_fn = tf.keras.losses.BinaryCrossentropy(from_logits=True,
                                                 reduction='none')

    def _replica_loss(labels, logits):
        loss = loss_fn(labels, logits)
        _dtype = loss.dtype
        loss = tf.cast(loss, tf.float32)
        loss = tf.nn.compute_average_loss(
            loss, global_batch_size=args.global_batch_size)
        return tf.cast(loss, _dtype)

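    # Builds the training graph with tf.gradients and the strategy wrapper's reduce;
    # the returned ops are evaluated later inside the tf.Session below.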
    def _train_step(inputs, labels, training):
        def _step_fn(inputs, labels):
            logit, embedding_vector = sok_dense_demo(inputs, training=training)
            loss = _replica_loss(labels, logit)
            if args.mixed_precision:
                _loss = emb_opt.get_scaled_loss(loss)
            else:
                _loss = loss
            emb_var, other_var = sok.split_embedding_variable_from_others(
                sok_dense_demo.trainable_variables)
            grads = tf.gradients(
                _loss,
                emb_var + other_var,
                colocate_gradients_with_ops=True,
                unconnected_gradients=tf.UnconnectedGradients.NONE)
            emb_grads, other_grads = grads[:len(emb_var)], grads[len(emb_var):]
            if args.mixed_precision:
                other_grads = emb_opt.get_unscaled_gradients(other_grads)
                emb_grads = emb_opt.get_unscaled_gradients(emb_grads)

            if "plugin" in args.optimizer:
                emb_train_op = emb_opt.apply_gradients(zip(emb_grads, emb_var))
            else:
                with sok.OptimizerScope(emb_var):
                    emb_train_op = emb_opt.apply_gradients(
                        zip(emb_grads, emb_var))
            with tf.control_dependencies([*emb_grads]):
                # in case NCCL runs concurrently via SOK and horovod
                other_grads = strategy.reduce("sum", other_grads)
            other_train_op = dense_opt.apply_gradients(
                zip(other_grads, other_var))

            with tf.control_dependencies([emb_train_op, other_train_op]):
                total_loss = strategy.reduce("sum", loss)
                total_loss = tf.identity(total_loss)
                return total_loss, embedding_vector

        return strategy.run(_step_fn, inputs, labels)

    replica_batch_size = args.global_batch_size // args.gpu_num
    dataset = utils.tf_dataset(*random_samples,
                               batchsize=replica_batch_size,
                               to_sparse_tensor=False,
                               repeat=1,
                               args=args)
    train_iterator = dataset.make_initializable_iterator()
    iterator_init = train_iterator.initializer

    inputs, labels = train_iterator.get_next()
    graph_results = _train_step(inputs, labels, training=True)

    init_op = tf.group(tf.global_variables_initializer(),
                       tf.local_variables_initializer())
    if "plugin" in args.optimizer:
        init_op = tf.group(init_op, emb_opt.initializer)

    save_op = list()
    for i, embedding_layer in enumerate(sok_dense_demo.embedding_layers):
        control_inputs = [save_op[-1]] if save_op else None
        with tf.control_dependencies(control_inputs):
            if args.save_params:
                filepath = r"./embedding_variables/"
                utils.try_make_dirs(filepath)
                op = sok_saver.dump_to_file(embedding_layer.embedding_variable,
                                            filepath)
            else:
                op = tf.constant(1.0)
        save_op.append(op)

    sok_results = list()

    config = tf.ConfigProto()
    config.log_device_placement = False
    with tf.Session(config=config) as sess:
        sess.run(sok_init_op)
        sess.run([init_op, iterator_init])
        sess.run(restore_op)
        sess.graph.finalize()

        for step in range(args.iter_num):
            loss_v, emb_vector_v = sess.run([*graph_results])
            print("*" * 80)
            print(f"Step: {step}, loss: {loss_v}"
                  )  #", embedding_vector:\n{emb_vector_v}")
            sok_results.append(emb_vector_v)

        sess.run(save_op)

    name = list()
    for embedding_layer in sok_dense_demo.embedding_layers:
        name.append(embedding_layer.embedding_variable.m_var_name)

    return sok_results, name
Example 11
    def __init__(self, **kwargs):
        print("[INFO]: single worker testing.")
        self.strategy = tf.distribute.MirroredStrategy()
        with self.strategy.scope():
            init_re = sok.Init(**kwargs)
Example 12
def test_sok_multi_dense_emb(args):
    assert (args.global_batch_size % args.worker_num == 0)
    replica_batch_size = args.global_batch_size // (args.worker_num)

    dataset = utility.TFDataset(filename=args.file_prefix + str(args.task_id) +
                                ".file",
                                batchsize=replica_batch_size,
                                as_sparse_tensor=False,
                                repeat=1)
    dataset = dataset.prefetch(tf.data.AUTOTUNE)

    dynamic_input = (args.dynamic_input == 1)

    # SOK initialize
    sok.Init(global_batch_size=args.global_batch_size)

    model = SOKDenseModel(
        max_vocabulary_size_per_gpu=args.max_vocabulary_size_per_gpu,
        embedding_vec_size_list=args.embedding_vec_size_list,
        slot_num_list=args.slot_num_list,
        nnz_per_slot_list=[
            args.nnz_per_slot for _ in range(len(args.slot_num_list))
        ],
        num_dense_layers=args.num_dense_layers,
        dynamic_input=dynamic_input,
        use_hashtable=args.use_hashtable)

    emb_opt = utils.get_embedding_optimizer(args.optimizer)(learning_rate=0.1)
    dense_opt = utils.get_dense_optimizer(args.optimizer)(learning_rate=0.1)
    if args.mixed_precision:
        emb_opt = tf.keras.mixed_precision.LossScaleOptimizer(
            emb_opt, initial_scale=1024)

    sok_saver = sok.Saver()
    for i, layer in enumerate(model.embedding_layers):
        init_tensors = utils.get_ones_tensor(
            max_vocab_size_per_gpu=args.max_vocabulary_size_per_gpu,
            embedding_vec_size=args.embedding_vec_size_list[i],
            num=args.worker_num)

        sok_saver.load_embedding_values(layer.embedding_variable, init_tensors)

    loss_fn = tf.keras.losses.BinaryCrossentropy(
        from_logits=True, reduction=tf.keras.losses.Reduction.NONE)

    def _replica_loss(labels, logits):
        loss = loss_fn(labels, logits)
        _dtype = loss.dtype
        loss = tf.cast(loss, tf.float32)
        loss = tf.nn.compute_average_loss(
            loss, global_batch_size=args.global_batch_size)
        return tf.cast(loss, _dtype)

    @tf.function
    def _train_step(inputs, labels, first_batch):
        with tf.GradientTape() as tape:
            logit, all_vectors = model(inputs, training=True)
            replica_loss = _replica_loss(labels, logit)
            if args.mixed_precision:
                _loss = emb_opt.get_scaled_loss(replica_loss)
            else:
                _loss = replica_loss

        emb_var, other_var = sok.split_embedding_variable_from_others(
            model.trainable_variables)
        emb_grads, grads = tape.gradient(_loss, [emb_var, other_var])
        if args.mixed_precision:
            emb_grads = emb_opt.get_unscaled_gradients(emb_grads)
            grads = emb_opt.get_unscaled_gradients(grads)

        if "plugin" not in args.optimizer:
            with sok.OptimizerScope(emb_var):
                emb_opt.apply_gradients(zip(emb_grads, emb_var),
                                        experimental_aggregate_gradients=False)
        else:
            emb_opt.apply_gradients(zip(emb_grads, emb_var),
                                    experimental_aggregate_gradients=False)

        with tf.control_dependencies(emb_grads):

            grads = [hvd.allreduce(grad) for grad in grads]
            dense_opt.apply_gradients(zip(grads, other_var))

            if first_batch:
                hvd.broadcast_variables(other_var, root_rank=0)
                hvd.broadcast_variables(dense_opt.variables(), root_rank=0)

            total_loss = hvd.allreduce(replica_loss)
        return total_loss, all_vectors

    sok_results = list()
    for i, (inputs, labels) in enumerate(dataset):
        if args.stop_iter >= 0 and i >= args.stop_iter:
            break

        total_loss, all_vectors = _train_step(inputs, labels, 0 == i)
        print("[INFO]: Iteration: {}, loss={}".format(i, total_loss))

        sok_results.append(all_vectors)
    return sok_results