def generate_data(args):
    dense_variables = generate_dense_variables(
        args.slot_num * args.nnz_per_slot * args.embedding_vec_size,
        [args.num_dense_units for _ in range(args.num_dense_layers)])
    vocabulary_tensors = generate_vocabulary_table(
        args.max_vocabulary_size_per_gpu, args.embedding_vec_size, hvd.size())
    samples, labels = utils.generate_random_samples(
        num_of_samples=args.global_batch_size,
        vocabulary_size=args.max_vocabulary_size_per_gpu * hvd.size(),
        slot_num=args.slot_num,
        max_nnz=args.nnz_per_slot,
        use_sparse_mask=False)
    samples, labels = tf.convert_to_tensor(samples), tf.convert_to_tensor(labels)

    for i in range(args.num_dense_layers):
        # dense_variables[0] means weight, dense_variables[1] means bias
        dense_variables[0][i] = hvd.broadcast(dense_variables[0][i], root_rank=0)
        dense_variables[1][i] = hvd.broadcast(dense_variables[1][i], root_rank=0)
    for i in range(hvd.size()):
        vocabulary_tensors[i] = hvd.broadcast(vocabulary_tensors[i], root_rank=0)
    samples = hvd.broadcast(samples, root_rank=0)
    labels = hvd.broadcast(labels, root_rank=0)

    return dense_variables, vocabulary_tensors, samples, labels
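
# The helper below is a minimal, illustrative sketch (not part of the original
# tests): it shows the same hvd.broadcast() pattern generate_data() relies on,
# namely broadcasting rank 0's tensor so that every Horovod rank starts from
# identical data. The name `_demo_broadcast_sync` and the tensor shape are
# assumptions made purely for illustration.
def _demo_broadcast_sync():
    import horovod.tensorflow as hvd
    import tensorflow as tf

    hvd.init()
    # Each rank draws a different random tensor ...
    local_tensor = tf.random.uniform(shape=[4], seed=hvd.rank())
    # ... and after the broadcast every rank holds rank 0's values.
    return hvd.broadcast(local_tensor, root_rank=0)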
def compare_dense_emb_sok_with_tf(args):
    if args.global_batch_size % args.local_gpu_num != 0:
        raise ValueError("global_batch_size: %d is not divisible by local_gpu_num: %d"
                         % (args.global_batch_size, args.local_gpu_num))
    if args.global_batch_size % args.worker_num != 0:
        raise ValueError("global_batch_size: %d is not divisible by worker_num: %d"
                         % (args.global_batch_size, args.worker_num))

    if args.mixed_precision:
        policy = tf.keras.mixed_precision.Policy("mixed_float16")
        tf.keras.mixed_precision.set_global_policy(policy)

    # each worker generates a different dataset
    if args.generate_new_datas:
        if args.use_hashtable:
            vocabulary_size = args.local_gpu_num * args.max_vocabulary_size_per_gpu * args.worker_num
        else:
            vocabulary_size = args.max_vocabulary_size_per_gpu
        worker_batch_size = args.global_batch_size // args.worker_num
        random_samples_local = utils.generate_random_samples(
            num_of_samples=worker_batch_size * args.iter_num,
            vocabulary_size=vocabulary_size,
            slot_num=args.slot_num,
            max_nnz=args.nnz_per_slot,
            use_sparse_mask=False)
        utils.save_to_file(r"./random_samples_" + str(args.task_id) + r".file",
                           *random_samples_local)
    else:
        random_samples_local = utils.restore_from_file(
            r"./random_samples_" + str(args.task_id) + r".file")

    if 0 == args.restore_params:
        # each worker generates the same init tensors, because each worker will do the filtering by itself
        init_tensors = utils.get_ones_tensor(
            max_vocab_size_per_gpu=args.max_vocabulary_size_per_gpu,
            embedding_vec_size=args.embedding_vec_size,
            num=args.local_gpu_num * args.worker_num)
    else:
        filepath = r"./embedding_variables"
        tf_values_filename = os.path.join(filepath, r"tf_variable.file")
        init_tensors = utils.restore_from_file(tf_values_filename)

    sok_results_local, embedding_variable_name = test_sok_dense_demo(
        args, init_tensors, *random_samples_local)
    # save the forward embedding vectors from each worker to file
    utils.save_to_file(r"./sok_embedding_vectors_" + str(args.task_id) + r".file",
                       *sok_results_local)

    # only 1 process needs to do the TF computation
    if args.task_id != 0:
        return

    # aggregate the datasets from the different workers
    dataset_filenames = [r"./random_samples_" + str(task_id) + r".file"
                         for task_id in range(args.worker_num)]
    random_samples_total = [list() for _ in range(args.iter_num)]
    random_labels_total = [list() for _ in range(args.iter_num)]
    local_batch_size = args.global_batch_size // args.worker_num
    for worker_id in range(args.worker_num):
        samples, labels = utils.restore_from_file(dataset_filenames[worker_id])
        for i in range(args.iter_num):
            random_samples_total[i].extend(samples[i * local_batch_size: (i + 1) * local_batch_size])
            random_labels_total[i].extend(labels[i * local_batch_size: (i + 1) * local_batch_size])
    random_samples_total = np.concatenate(random_samples_total, axis=0)
    random_labels_total = np.concatenate(random_labels_total, axis=0)

    tf_results = test_tf_dense_model(args, init_tensors, random_samples_total, random_labels_total)

    # aggregate the forward embedding vectors from the different workers
    sok_results_filenames = [r"./sok_embedding_vectors_" + str(task_id) + r".file"
                             for task_id in range(args.worker_num)]
    sok_results_total = list()
    for file_name in sok_results_filenames:
        sok_results_local = utils.restore_from_file(file_name)
        sok_results_total.append(sok_results_local)

    if len(sok_results_total[0]) != len(tf_results):
        raise ValueError("The length of results obtained from sok: %d is not equal to that obtained from TF: %d"
                         % (len(sok_results_total[0]), len(tf_results)))
    if len(tf_results) != args.iter_num:
        raise ValueError("The length of embedding vectors: %d is not equal to iteration number: %d."
                         % (len(tf_results), args.iter_num))

    if 1 == args.restore_params or args.mixed_precision:
        tolerance = 1e-2
    else:
        tolerance = 1e-4

    for i in range(args.iter_num):
        if args.local_gpu_num != 1:
            sok_vector = tf.concat([tf.concat(sok_results_total[task_id][i].values, axis=0)
                                    for task_id in range(args.worker_num)], axis=0)
        else:
            sok_vector = tf.concat([sok_results_total[task_id][i]
                                    for task_id in range(args.worker_num)], axis=0)
        tf.debugging.assert_near(tf.reshape(sok_vector, shape=[-1, tf.shape(sok_vector)[-1]]),
                                 tf_results[i],
                                 atol=tolerance,
                                 rtol=tolerance)

    print("\n[INFO]: For Dense Embedding Layer, with MultiWorkerMirroredStrategy, the embedding vectors "
          "obtained from sparse operation kit and TensorFlow are consistent for %d iterations"
          ", with mixed_precision = %s"
          % (args.iter_num, args.mixed_precision))

    if 1 == args.save_params:
        check_saved_embedding_variables(args, embedding_variable_name,
                                        use_hashtable=args.use_hashtable,
                                        gpu_num=args.worker_num * args.local_gpu_num,
                                        atol=tolerance,
                                        rtol=tolerance)
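
# Hedged sketch (not part of the original tests): the multi-worker comparison
# above assumes one process per worker, each with its own args.task_id. With
# tf.distribute.MultiWorkerMirroredStrategy that per-process identity usually
# comes from the TF_CONFIG environment variable; the hostnames and ports below
# are illustrative assumptions only.
def _demo_tf_config(task_id):
    import json
    import os

    os.environ["TF_CONFIG"] = json.dumps({
        "cluster": {"worker": ["localhost:12345", "localhost:12346"]},
        "task": {"type": "worker", "index": task_id},
    })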
def compare_sok_with_tf(args):
    if args.global_batch_size % args.gpu_num != 0:
        raise ValueError("global_batch_size: %d is not divisible by gpu_num: %d"
                         % (args.global_batch_size, args.gpu_num))

    if args.use_hashtable:
        vocabulary_size = args.max_vocabulary_size_per_gpu * args.gpu_num
    else:
        vocabulary_size = args.max_vocabulary_size_per_gpu

    if args.generate_new_datas:
        random_samples = utils.generate_random_samples(
            num_of_samples=args.global_batch_size * args.iter_num,
            vocabulary_size=vocabulary_size,
            slot_num=args.slot_num,
            max_nnz=args.max_nnz)
        utils.save_to_file(r"./random_samples.file", *random_samples)
    else:
        random_samples = utils.restore_from_file(r"./random_samples.file")

    if 1 == args.restore_params:
        # initialize using trained params
        filepath = r"./embedding_variables"
        # because we already checked the Variable consistency when saving,
        # we can directly use the TensorFlow Variable file to initialize
        # TF's variable here.
        # FIXME: what if not all TensorFlow embedding vectors are used??
        tf_values_filename = os.path.join(filepath, r"tf_variable.file")
        init_tensors = utils.restore_from_file(tf_values_filename)
    else:
        # initialize using fresh initial values
        init_tensors = utils.get_ones_tensor(
            max_vocab_size_per_gpu=args.max_vocabulary_size_per_gpu,
            embedding_vec_size=args.embedding_vec_size,
            num=args.gpu_num)

    sok_results, embedding_variable_name = test_sok_demo(args, init_tensors, *random_samples)
    tf_results = test_tf_demo(args, init_tensors, *random_samples)

    if len(sok_results) != len(tf_results):
        raise ValueError("The length of plugin results is not equal to that of tensorflow.")
    if len(tf_results) != args.iter_num:
        raise ValueError("The length of embedding vectors: %d is not equal to iteration number: %d."
                         % (len(tf_results), args.iter_num))

    tolerance = 1e-4
    if args.mixed_precision:
        tolerance = 1e-3

    for i, sok_vector in enumerate(sok_results):
        if args.gpu_num != 1:
            sok_vector = tf.stack(sok_vector.values, axis=0)
        tf.debugging.assert_near(tf.reshape(sok_vector, shape=[-1, tf.shape(sok_vector)[-1]]),
                                 tf_results[i],
                                 atol=tolerance,
                                 rtol=tolerance)

    print("\n[INFO]: With MirroredStrategy, the embedding vectors obtained from "
          "sparse operation kit and tensorflow are consistent for %d iterations."
          " With mixed_precision = %s, and key_dtype = %s, and use_tf_initializer = %s"
          % (args.iter_num, args.mixed_precision, args.key_dtype, args.use_tf_initializer))

    if 1 == args.save_params:
        check_saved_embedding_variables(args, embedding_variable_name,
                                        use_hashtable=args.use_hashtable,
                                        gpu_num=args.gpu_num,
                                        atol=tolerance,
                                        rtol=tolerance)
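
# Minimal sketch (not part of the original tests) of why the gpu_num != 1
# branch above calls tf.stack(sok_vector.values): with more than one replica,
# MirroredStrategy returns a PerReplica object whose `.values` tuple holds one
# tensor per GPU, while a single replica already yields a plain tensor. The
# helper name and the toy step function are illustrative assumptions.
def _demo_per_replica_stack():
    import tensorflow as tf

    strategy = tf.distribute.MirroredStrategy()

    @tf.function
    def step():
        replica_id = tf.distribute.get_replica_context().replica_id_in_sync_group
        # each replica produces its own slice of "embedding" output
        return tf.fill([2, 4], tf.cast(replica_id, tf.float32))

    result = strategy.run(step)
    if strategy.num_replicas_in_sync != 1:
        result = tf.stack(result.values, axis=0)
    return tf.reshape(result, shape=[-1, tf.shape(result)[-1]])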
def compare_dense_emb_sok_with_tf(args):
    if args.global_batch_size % args.gpu_num != 0:
        raise ValueError(f"global_batch_size: {args.global_batch_size} is not divisible by"
                         f" gpu_num: {args.gpu_num}")

    if args.use_hashtable:
        vocabulary_size = args.max_vocabulary_size_per_gpu * args.gpu_num
    else:
        vocabulary_size = args.max_vocabulary_size_per_gpu

    if args.generate_new_datas:
        replica_batch_size = args.global_batch_size // args.gpu_num
        random_samples = utils.generate_random_samples(
            num_of_samples=replica_batch_size * args.iter_num,
            vocabulary_size=vocabulary_size,
            slot_num=sum(args.slot_num),
            max_nnz=args.nnz_per_slot,
            use_sparse_mask=False)
        utils.save_to_file(r"./random_samples_" + str(args.rank_idx) + r".file",
                           *random_samples)
    else:
        random_samples = utils.restore_from_file(
            r"./random_samples_" + str(args.rank_idx) + r".file")

    if args.restore_params:
        filepath = r"./embedding_variables"
        # because we already checked the Variable consistency when saving,
        # we can directly use the TensorFlow Variable files to initialize
        # TF's Variables here.
        init_tensors = list()
        for i in range(len(args.slot_num)):
            tf_values_filename = os.path.join(filepath, r"tf_variable_" + str(i) + r".file")
            init_tensors.append(utils.restore_from_file(tf_values_filename))
    else:
        init_tensors = list()
        for i in range(len(args.slot_num)):
            init_tensors.append(
                utils.get_ones_tensor(
                    max_vocab_size_per_gpu=args.max_vocabulary_size_per_gpu,
                    embedding_vec_size=args.embedding_vec_size[i],
                    num=args.gpu_num))

    sok_results, embedding_variable_name = get_sok_results(args, init_tensors, *random_samples)
    utils.save_to_file(r"./sok_embedding_vectors_" + str(args.rank_idx) + r".file",
                       *sok_results)

    if args.rank_idx != 0:
        return

    # aggregate the datasets from the different workers
    dataset_filenames = [r"./random_samples_" + str(rank_idx) + r".file"
                         for rank_idx in range(args.rank_size)]
    random_samples_total = [list() for _ in range(args.iter_num)]
    random_labels_total = [list() for _ in range(args.iter_num)]
    local_batch_size = args.global_batch_size // args.gpu_num
    for rank_idx in range(args.rank_size):
        samples, labels = utils.restore_from_file(dataset_filenames[rank_idx])
        for i in range(args.iter_num):
            random_samples_total[i].extend(samples[i * local_batch_size:(i + 1) * local_batch_size])
            random_labels_total[i].extend(labels[i * local_batch_size:(i + 1) * local_batch_size])
    random_samples_total = np.concatenate(random_samples_total, axis=0)
    random_labels_total = np.concatenate(random_labels_total, axis=0)

    tf_results, _ = get_tf_results(args, init_tensors, random_samples_total, random_labels_total)

    # aggregate the sok forward results from the different workers
    sok_results_filenames = [r"./sok_embedding_vectors_" + str(rank_idx) + r".file"
                             for rank_idx in range(args.rank_size)]
    sok_results_total = list()
    for filename in sok_results_filenames:
        sok_results = utils.restore_from_file(filename)
        sok_results_total.append(sok_results)

    if len(sok_results_total[0]) != len(tf_results):
        raise ValueError("The length of sok results is not equal to that of tensorflow.")
    if len(sok_results) != args.iter_num:
        raise ValueError("The length of embedding vectors: %d is not equal to iteration number: %d."
                         % (len(sok_results), args.iter_num))

    rtol = 1e-4
    atol = 1e-4
    if args.restore_params:
        rtol, atol = 1e-3, 1e-3
    elif args.distributed_tool == "horovod":
        rtol, atol = rtol * 10, atol * 10
    elif args.mixed_precision:
        rtol, atol = 1e-2, 1e-2

    for i in range(args.iter_num):
        sok_vector = np.concatenate([sok_results_total[rank_idx][i]
                                     for rank_idx in range(args.rank_size)], axis=0)
        allclose = np.allclose(sok_vector, tf_results[i], rtol=rtol, atol=atol)
        if not allclose:
            raise ValueError(f"\n{sok_vector} \nis not near to \n{tf_results[i]} \nat rtol={rtol}, atol={atol}")

        # TODO: add a verbose option
        if False:
            print("--------------- step: {}---------------------".format(i))
            print("sok_embedding_vector:\n{}".format(sok_vector))
            print("tf_embedding_vector:\n{}".format(tf_results[i]))

    print(f"\n[INFO]: For {len(args.slot_num)} Dense Embedding layer(s), using {args.gpu_num} GPUs + {args.optimizer} optimizer, "
          f"using hashtable? {args.use_hashtable}, dynamic_input? {args.dynamic_input}, "
          "the embedding vectors"
          f" obtained from sok and tf are consistent for {args.iter_num} iterations,"
          f" with mixed_precision = {args.mixed_precision}, key_dtype = {args.key_dtype},"
          f" use_tf_initializer = {args.use_tf_initializer}")

    if args.save_params:
        check_saved_embedding_variables(args, embedding_variable_name,
                                        use_hashtable=args.use_hashtable,
                                        gpu_num=args.gpu_num,
                                        atol=atol,
                                        rtol=rtol)
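
# Illustrative helper (not used by the tests above): re-assembles per-rank,
# per-iteration sample slices into global batches, mirroring the aggregation
# loops in the compare_* functions. The helper name and arguments are
# assumptions for demonstration; each entry of `rank_samples` is expected to
# have shape [iter_num * local_batch_size, ...].
def _demo_aggregate_per_rank_batches(rank_samples, iter_num, local_batch_size):
    import numpy as np

    total = [list() for _ in range(iter_num)]
    for samples in rank_samples:
        for i in range(iter_num):
            total[i].extend(samples[i * local_batch_size:(i + 1) * local_batch_size])
    # iteration i's global batch is the concatenation of every rank's slice i
    return np.concatenate(total, axis=0)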