def generate_data(args):
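    """Generate one batch of shared test data for every Horovod worker.

    Builds dense-layer weights/biases, per-GPU vocabulary tables, and random
    (samples, labels), then broadcasts all of them from rank 0 so that every
    worker starts from an identical state.
    """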
    dense_variables = generate_dense_variables(
        args.slot_num * args.nnz_per_slot * args.embedding_vec_size,
        [args.num_dense_units for _ in range(args.num_dense_layers)])
    vocabulary_tensors = generate_vocabulary_table(
        args.max_vocabulary_size_per_gpu, args.embedding_vec_size, hvd.size())
    samples, labels = utils.generate_random_samples(
        num_of_samples=args.global_batch_size,
        vocabulary_size=args.max_vocabulary_size_per_gpu * hvd.size(),
        slot_num=args.slot_num,
        max_nnz=args.nnz_per_slot,
        use_sparse_mask=False)
    samples, labels = tf.convert_to_tensor(samples), tf.convert_to_tensor(
        labels)

    for i in range(args.num_dense_layers):
        # dense_variables[0] holds the weights, dense_variables[1] holds the biases
        dense_variables[0][i] = hvd.broadcast(dense_variables[0][i],
                                              root_rank=0)
        dense_variables[1][i] = hvd.broadcast(dense_variables[1][i],
                                              root_rank=0)
    for i in range(hvd.size()):
        vocabulary_tensors[i] = hvd.broadcast(vocabulary_tensors[i],
                                              root_rank=0)
    samples = hvd.broadcast(samples, root_rank=0)
    labels = hvd.broadcast(labels, root_rank=0)

    return dense_variables, vocabulary_tensors, samples, labels
def compare_dense_emb_sok_with_tf(args):
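    """Multi-worker (MultiWorkerMirroredStrategy) comparison of the SOK dense
    embedding layer against a plain TensorFlow reference.

    Each worker generates (or restores) its own shard of random samples and
    runs the SOK model; worker 0 then aggregates all shards, runs the
    TensorFlow model on the full batch, and asserts the embedding vectors
    match within tolerance.
    """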
    if (args.global_batch_size % args.local_gpu_num != 0):
        raise ValueError("global_batch_size: %d is not divisible by local_gpu_num: %d"
                        %(args.global_batch_size, args.local_gpu_num))
    if (args.global_batch_size % args.worker_num != 0):
        raise ValueError("global_batch_size: %d is not divisible by worker_num: %d"
                        %(args.global_batch_size, args.worker_num))

    if args.mixed_precision:
        policy = tf.keras.mixed_precision.Policy("mixed_float16")
        tf.keras.mixed_precision.set_global_policy(policy)

    # each worker generates a different dataset
    if args.generate_new_datas:
        if args.use_hashtable:
            vocabulary_size = args.local_gpu_num * args.max_vocabulary_size_per_gpu * args.worker_num
        else:
            vocabulary_size = args.max_vocabulary_size_per_gpu

        worker_batch_size = args.global_batch_size // args.worker_num
        random_samples_local = utils.generate_random_samples(num_of_samples=worker_batch_size * args.iter_num,
                                                             vocabulary_size=vocabulary_size,
                                                             slot_num=args.slot_num,
                                                             max_nnz=args.nnz_per_slot,
                                                             use_sparse_mask=False)
        utils.save_to_file(r"./random_samples_" + str(args.task_id) + r".file", *random_samples_local)
    else:
        random_samples_local = utils.restore_from_file(r"./random_samples_" + str(args.task_id) + r".file")

    if 0 == args.restore_params:
        # each worker generates the same init tensors, because each worker does the filtering by itself
        init_tensors = utils.get_ones_tensor(max_vocab_size_per_gpu=args.max_vocabulary_size_per_gpu,
                                            embedding_vec_size=args.embedding_vec_size,
                                            num=args.local_gpu_num * args.worker_num)
    else:
        filepath = r"./embedding_variables"
        tf_values_filename = os.path.join(filepath, r"tf_variable.file")
        init_tensors = utils.restore_from_file(tf_values_filename)

    sok_results_local, embedding_variable_name = test_sok_dense_demo(args, init_tensors, *random_samples_local)
    # save the forward embedding vector from different worker to file
    utils.save_to_file(r"./sok_embedding_vectors_" + str(args.task_id) + r".file", *sok_results_local)

    # only one process needs to do the TF computation
    if args.task_id != 0:
        return

    # aggregate the datasets from all workers
    dataset_filenames = [r"./random_samples_" + str(task_id) + r".file"
                         for task_id in range(args.worker_num)]
    random_samples_total = [list() for _ in range(args.iter_num)]
    random_labels_total = [list() for _ in range(args.iter_num)]
    local_batch_size = args.global_batch_size // args.worker_num
    for worker_id in range(args.worker_num):
        samples, labels = utils.restore_from_file(dataset_filenames[worker_id])
        for i in range(args.iter_num):
            random_samples_total[i].extend(samples[i * local_batch_size : (i + 1) * local_batch_size])
            random_labels_total[i].extend(labels[i * local_batch_size : (i + 1) * local_batch_size])
    random_samples_total = np.concatenate(random_samples_total, axis=0)
    random_labels_total = np.concatenate(random_labels_total, axis=0)

    tf_results = test_tf_dense_model(args, init_tensors, random_samples_total, random_labels_total)

    # aggregate the forward embedding vectors from all workers
    sok_results_filenames = [r"./sok_embedding_vectors_" + str(task_id) + r".file"
                             for task_id in range(args.worker_num)]
    sok_results_total = list()
    for file_name in sok_results_filenames:
        sok_results_local = utils.restore_from_file(file_name)
        sok_results_total.append(sok_results_local)
    
    if (len(sok_results_total[0]) != len(tf_results)):
        raise ValueError("The length of results obtained from sok: %d is not equal to that obtained from TF: %d"
                         %(len(sok_results_total[0]), len(tf_results)))
    if (len(tf_results) != args.iter_num):
        raise ValueError("The length of embedding vectors: %d is not equal to iteration number: %d."
                         %(len(tf_results), args.iter_num))
    
    # loosen the tolerance when comparing restored parameters or running in fp16
    if 1 == args.restore_params or args.mixed_precision:
        tolerance = 1e-2
    else:
        tolerance = 1e-4

    for i in range(args.iter_num):
        if args.local_gpu_num != 1:
            sok_vector = tf.concat([tf.concat(sok_results_total[task_id][i].values, axis=0)
                                    for task_id in range(args.worker_num)], axis=0)
        else:
            sok_vector = tf.concat([sok_results_total[task_id][i]
                                    for task_id in range(args.worker_num)],
                                    axis=0)
        tf.debugging.assert_near(tf.reshape(sok_vector,
                                            shape=[-1, tf.shape(sok_vector)[-1]]),
                                tf_results[i],
                                atol=tolerance,
                                rtol=tolerance)

    print("\n[INFO]: For Dense Embedding Layer, with MultiWorkerMirroredStrategy, the embedding vectors "+\
          "obtained from sparse operation kit and TensorFlow are consistent for %d iterations"
          ", with mixed_precision = %s"
          %(args.iter_num, args.mixed_precision))

    if 1 == args.save_params:
        check_saved_embedding_variables(args, embedding_variable_name, 
                                        use_hashtable=args.use_hashtable, 
                                        gpu_num=args.worker_num * args.local_gpu_num,
                                        atol=tolerance, rtol=tolerance)
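# A minimal usage sketch (not part of the original test): the function above
# expects an argparse-style `args` namespace. The attribute names below mirror
# those referenced in compare_dense_emb_sok_with_tf; the values are purely
# illustrative assumptions, and the test_* helpers it calls may consume
# additional attributes.
#
#   from argparse import Namespace
#   args = Namespace(global_batch_size=16384, local_gpu_num=1, worker_num=2,
#                    task_id=0, iter_num=10, slot_num=10, nnz_per_slot=4,
#                    embedding_vec_size=4, max_vocabulary_size_per_gpu=8192,
#                    use_hashtable=True, generate_new_datas=1,
#                    restore_params=0, save_params=0, mixed_precision=False)
#   compare_dense_emb_sok_with_tf(args)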
def compare_sok_with_tf(args):
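    """Single-process (MirroredStrategy) comparison of SOK and TensorFlow embeddings.

    Generates (or restores) random samples, initializes both models from the
    same tensors, runs test_sok_demo and test_tf_demo, and checks that their
    embedding vectors agree within tolerance for every iteration.
    """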
    if (args.global_batch_size % args.gpu_num != 0):
        raise ValueError(
            "global_batch_size: %d is not divisible by gpu_num: %d" %
            (args.global_batch_size, args.gpu_num))

    if args.use_hashtable:
        vocabulary_size = args.max_vocabulary_size_per_gpu * args.gpu_num
    else:
        vocabulary_size = args.max_vocabulary_size_per_gpu

    if args.generate_new_datas:
        random_samples = utils.generate_random_samples(
            num_of_samples=args.global_batch_size * args.iter_num,
            vocabulary_size=vocabulary_size,
            slot_num=args.slot_num,
            max_nnz=args.max_nnz)
        utils.save_to_file(r"./random_samples.file", *random_samples)
    else:
        random_samples = utils.restore_from_file(r"./random_samples.file")

    if (1 == args.restore_params):  # initialize using trained params
        filepath = r"./embedding_variables"

        # Variable consistency was already checked when saving, so the
        # TensorFlow Variable file can be used directly here to initialize
        # TF's variable.
        # FIXME: what if not all TensorFlow embedding vectors are used??
        tf_values_filename = os.path.join(filepath, r"tf_variable.file")
        init_tensors = utils.restore_from_file(tf_values_filename)

    else:  # initialize with constant (all-ones) tensors
        init_tensors = utils.get_ones_tensor(
            max_vocab_size_per_gpu=args.max_vocabulary_size_per_gpu,
            embedding_vec_size=args.embedding_vec_size,
            num=args.gpu_num)

    sok_results, embedding_variable_name = test_sok_demo(
        args, init_tensors, *random_samples)
    tf_results = test_tf_demo(args, init_tensors, *random_samples)

    if (len(sok_results) != len(tf_results)):
        raise ValueError(
            "The length of plugin results is not equal to that of tensorflow.")
    if (len(tf_results) != args.iter_num):
        raise ValueError(
            "The length of embedding vectors: %d is not equal to iteration number: %d."
            % (len(tf_results), args.iter_num))

    # default tolerance; loosen it for fp16 runs
    tolerance = 1e-4
    if args.mixed_precision:
        tolerance = 1e-3

    for i, sok_vector in enumerate(sok_results):
        if args.gpu_num != 1:
            sok_vector = tf.stack(sok_vector.values, axis=0)
        tf.debugging.assert_near(tf.reshape(
            sok_vector, shape=[-1, tf.shape(sok_vector)[-1]]),
                                 tf_results[i],
                                 atol=tolerance,
                                 rtol=tolerance)
    print("\n[INFO]: With MirroredStrategy, the embedding vector obtained from " +\
          "sparse operation kit and tensorflow are consistent for %d iterations."
          " With mixed_precision = %s, and key_dtype = %s, and use_tf_initializer = %s"
          %(args.iter_num, args.mixed_precision, args.key_dtype, args.use_tf_initializer))

    if (1 == args.save_params):
        check_saved_embedding_variables(args,
                                        embedding_variable_name,
                                        use_hashtable=args.use_hashtable,
                                        gpu_num=args.gpu_num,
                                        atol=tolerance,
                                        rtol=tolerance)
def compare_dense_emb_sok_with_tf(args):
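    """Multi-rank comparison of the SOK dense embedding layers against TensorFlow.

    Each rank generates (or restores) its own shard of samples and saves its
    SOK forward results to file; rank 0 aggregates all shards, computes the
    TensorFlow reference, and verifies the results with np.allclose.
    """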
    if args.global_batch_size % args.gpu_num != 0:
        raise ValueError(
            f"global_batch_size: {args.global_batch_size} is not divisible by"
            f" gpu_num: {args.gpu_num}")

    if args.use_hashtable:
        vocabulary_size = args.max_vocabulary_size_per_gpu * args.gpu_num
    else:
        vocabulary_size = args.max_vocabulary_size_per_gpu

    if args.generate_new_datas:
        replica_batch_size = args.global_batch_size // args.gpu_num
        random_samples = utils.generate_random_samples(
            num_of_samples=replica_batch_size * args.iter_num,
            vocabulary_size=vocabulary_size,
            slot_num=sum(args.slot_num),
            max_nnz=args.nnz_per_slot,
            use_sparse_mask=False)
        utils.save_to_file(
            r"./random_samples_" + str(args.rank_idx) + r".file",
            *random_samples)
    else:
        random_samples = utils.restore_from_file(r"./random_samples_" +
                                                 str(args.rank_idx) + r".file")

    if args.restore_params:
        filepath = r"./embedding_variables"
        # Variable consistency was already checked when saving, so the
        # TensorFlow Variable files can be used directly here to initialize
        # TF's Variables.
        init_tensors = list()
        for i in range(len(args.slot_num)):
            tf_values_filename = os.path.join(
                filepath, r"tf_variable_" + str(i) + r".file")
            init_tensors.append(utils.restore_from_file(tf_values_filename))
    else:
        init_tensors = list()
        for i in range(len(args.slot_num)):
            init_tensors.append(
                utils.get_ones_tensor(
                    max_vocab_size_per_gpu=args.max_vocabulary_size_per_gpu,
                    embedding_vec_size=args.embedding_vec_size[i],
                    num=args.gpu_num))

    sok_results, embedding_variable_name = get_sok_results(
        args, init_tensors, *random_samples)
    utils.save_to_file(
        r"./sok_embedding_vectors_" + str(args.rank_idx) + r".file",
        *sok_results)

    if args.rank_idx != 0:
        return

    # aggregate the datasets from all ranks
    dataset_filenames = [
        r"./random_samples_" + str(rank_idx) + r".file"
        for rank_idx in range(args.rank_size)
    ]
    random_samples_total = [list() for _ in range(args.iter_num)]
    random_labels_total = [list() for _ in range(args.iter_num)]
    local_batch_size = args.global_batch_size // args.gpu_num
    for rank_idx in range(args.rank_size):
        samples, labels = utils.restore_from_file(dataset_filenames[rank_idx])
        for i in range(args.iter_num):
            random_samples_total[i].extend(
                samples[i * local_batch_size:(i + 1) * local_batch_size])
            random_labels_total[i].extend(labels[i * local_batch_size:(i + 1) *
                                                 local_batch_size])
    random_samples_total = np.concatenate(random_samples_total, axis=0)
    random_labels_total = np.concatenate(random_labels_total, axis=0)

    tf_results, _ = get_tf_results(args, init_tensors, random_samples_total,
                                   random_labels_total)

    # aggregate the SOK forward results from all ranks
    sok_results_filenames = [
        r"./sok_embedding_vectors_" + str(rank_idx) + r".file"
        for rank_idx in range(args.rank_size)
    ]
    sok_results_total = list()
    for filename in sok_results_filenames:
        sok_results = utils.restore_from_file(filename)
        sok_results_total.append(sok_results)

    if len(sok_results_total[0]) != len(tf_results):
        raise ValueError(
            "The length of sok results is not equal to that of tensorflow.")
    if len(sok_results) != args.iter_num:
        raise ValueError(
            "The length of embedding vectors: %d is not equal to iteration number: %d."
            % (len(sok_results), args.iter_num))

    # default tolerances; loosen them for restored parameters, Horovod runs, or fp16
    rtol = 1e-4
    atol = 1e-4
    if args.restore_params:
        rtol, atol = 1e-3, 1e-3
    elif args.distributed_tool == "horovod":
        rtol, atol = rtol * 10, atol * 10
    elif args.mixed_precision:
        rtol, atol = 1e-2, 1e-2

    for i in range(args.iter_num):
        sok_vector = np.concatenate([
            sok_results_total[rank_idx][i]
            for rank_idx in range(args.rank_size)
        ],
                                    axis=0)
        allclose = np.allclose(sok_vector, tf_results[i], rtol=rtol, atol=atol)
        if not allclose:
            raise ValueError(
                f"\n{sok_vector} \nis not near to \n{tf_results[i]} \nat rtol={rtol}, atol={atol}"
            )

        # TODO: add a verbose option; the debug print below is intentionally disabled
        if False:
            print("--------------- step: {}---------------------".format(i))
            print("sok_embedding_vector:\n{}".format(sok_vector))
            print("tf_embedding_vector:\n{}".format(tf_results[i]))

    print(
        f"\n[INFO]: For {len(args.slot_num)} Dense Embedding layer(s), using {args.gpu_num} GPUs + {args.optimizer} optimizer, "
        f"using hashtable? {args.use_hashtable}, dynamic_input? {args.dynamic_input}, "
        "the embedding vectors"
        f" obtained from sok and tf are consistent for {args.iter_num} iterations,"
        f" with mixed_precision = {args.mixed_precision}, key_dtype = {args.key_dtype},"
        f" use_tf_initializer = {args.use_tf_initializer}")

    if args.save_params:
        check_saved_embedding_variables(args,
                                        embedding_variable_name,
                                        use_hashtable=args.use_hashtable,
                                        gpu_num=args.gpu_num,
                                        atol=atol,
                                        rtol=rtol)