def check_func_performance(
        tag, thread, func_module, reference_func, output_type, input_types,
        ranges=None, heavy_performance_load=False):
    """
    Sanity-check and benchmark a kernel built from ``func_module``.

    Two kernels are built with ``get_func_kernel``: a plain one, whose output is
    compared element-wise with ``reference_func``, and one built with
    ``repetitions=100000``, which is launched 10 times and timed.
    The measured intervals are divided by the number of repetitions and by the
    number of elements, scaled by 1e12 (seconds -> picoseconds), and their
    min/mean/std are printed.

    :param tag: label included in the printed report.
    :param thread: object providing ``to_device``, ``array``, ``synchronize`` and ``api``.
    :param func_module: passed through to ``get_func_kernel``.
    :param reference_func: called with the host input arrays; its result is
        compared with the kernel output.
    :param output_type: type tag of the output (converted with ``tp_dtype``).
    :param input_types: sequence of type tags, one per input array.
    :param ranges: optional sequence indexed like ``input_types``; each entry is
        passed to ``get_test_array`` as ``val_range``.
    :param heavy_performance_load: if true, 1024 * 1024 elements are used
        instead of 1024 * 32.
    """
    N = 1024 * (1024 if heavy_performance_load else 32)
    repetitions = 100000

    test = get_func_kernel(thread, func_module, output_type, input_types)
    perf_test = get_func_kernel(
        thread, func_module, output_type, input_types, repetitions=repetitions)

    arrays = [
        get_test_array(N, tp, val_range=ranges[i] if ranges is not None else None)
        for i, tp in enumerate(input_types)]
    arrays_dev = [thread.to_device(arr) for arr in arrays]
    dest_dev = thread.array(N, tp_dtype(output_type))

    # Sanity check
    test(dest_dev, *arrays_dev, global_size=N)
    assert (dest_dev.get() == reference_func(*arrays)).all()

    # Performance check
    times = []
    for _ in range(10):
        # Synchronize on both sides of the launch so that the interval
        # covers the whole kernel execution and nothing else.
        thread.synchronize()
        # perf_counter() is monotonic and has the highest available resolution,
        # unlike time.time(), which follows (adjustable) wall-clock time.
        t1 = time.perf_counter()
        perf_test(dest_dev, *arrays_dev, global_size=N)
        thread.synchronize()
        t2 = time.perf_counter()
        times.append(t2 - t1)

    # Normalize to time per element per repetition, in picoseconds
    times = numpy.array(times)
    times /= repetitions
    times /= N
    times *= 1e12

    print()
    print(
        "{backend}: {tag} --- min: {min:.4f}, mean: {mean:.4f}, std: {std:.4f}"
        .format(
            tag=tag, min=times.min(), mean=times.mean(), std=times.std(),
            backend='cuda' if thread.api.get_id() == cluda.cuda_id() else 'ocl '))
def test_make_lwe_keyswitch_key(thread):
    """
    Compare the compiled ``MakeLweKeyswitchKey`` computation with
    ``MakeLweKeyswitchKeyReference`` on inputs produced by ``get_test_array``.

    The ``a`` and ``b`` outputs must match exactly;
    the ``cv`` output is compared with ``numpy.allclose``.
    """
    nufhe_params = NuFHEParameters()
    in_size = nufhe_params.tgsw_params.tlwe_params.extracted_lweparams.size
    out_size = nufhe_params.in_out_params.size
    ks_decomp_length = nufhe_params.ks_decomp_length
    ks_log2_base = nufhe_params.ks_log2_base
    ks_base = 2**ks_log2_base
    min_noise = nufhe_params.in_out_params.min_noise

    key_shape = (in_size, ks_decomp_length, ks_base)
    # The noise arrays have one element fewer along the `base` axis
    noise_shape = (in_size, ks_decomp_length, ks_base - 1)

    # Host buffers to be filled by the reference implementation
    expected_a = numpy.empty(key_shape + (out_size,), dtype=Torus32)
    expected_b = numpy.empty(key_shape, dtype=Torus32)
    expected_cv = numpy.empty(key_shape, dtype=Float)

    in_key = get_test_array(in_size, Int32, (0, 2))
    out_key = get_test_array(out_size, Int32, (0, 2))
    noises_a = get_test_array(noise_shape + (out_size,), Torus32)
    noises_b = get_test_array(noise_shape, Float, (-min_noise, min_noise))

    comp_args = (in_size, out_size, ks_decomp_length, ks_log2_base, min_noise)
    compiled = MakeLweKeyswitchKey(*comp_args).compile(thread)
    reference = MakeLweKeyswitchKeyReference(*comp_args)

    a_dev = thread.empty_like(expected_a)
    b_dev = thread.empty_like(expected_b)
    cv_dev = thread.empty_like(expected_cv)
    inputs_dev = [
        thread.to_device(arr) for arr in (in_key, out_key, noises_a, noises_b)]

    compiled(a_dev, b_dev, cv_dev, *inputs_dev)
    reference(expected_a, expected_b, expected_cv, in_key, out_key, noises_a, noises_b)

    actual_a = a_dev.get()
    actual_b = b_dev.get()
    actual_cv = cv_dev.get()

    assert (actual_a == expected_a).all()
    assert (actual_b == expected_b).all()
    assert numpy.allclose(actual_cv, expected_cv)
def test_tlwe_transformed_add_mul_to_trf(thread):
    """
    Compare the transformation returned by ``get_tlwe_transformed_add_mul_to_trf``
    (wrapped into a ``PureParallel`` computation) with
    ``tlwe_transformed_add_mul_to_trf_reference``, using ``transform_type='NTT'``.

    Integer transformed dtypes are compared exactly, other dtypes with ``numpy.allclose``.
    """
    batch_shape = (2, 3)
    nufhe_params = NuFHEParameters(transform_type='NTT')
    perf_params = PerformanceParameters(nufhe_params).for_device(thread.device_params)

    tgsw_params = nufhe_params.tgsw_params
    tlwe_params = tgsw_params.tlwe_params
    decomp_length = tgsw_params.decomp_length
    mask_size = tlwe_params.mask_size
    polynomial_degree = tlwe_params.polynomial_degree

    transform = get_transform(tlwe_params.transform_type)
    tlength = transform.transformed_length(polynomial_degree)
    tdtype = transform.transformed_dtype()

    bk_len = 10
    bk_row_idx = 2
    expected_shape = batch_shape + (mask_size + 1, tlength)
    sample_shape = batch_shape + (mask_size + 1, decomp_length, tlength)
    bk_shape = (bk_len, mask_size + 1, decomp_length, mask_size + 1, tlength)

    expected = numpy.empty(expected_shape, tdtype)
    sample = get_test_array(sample_shape, 'ff_number')
    bootstrap_key = get_test_array(bk_shape, 'ff_number')

    actual_dev = thread.empty_like(expected)
    sample_dev = thread.to_device(sample)
    bootstrap_key_dev = thread.to_device(bootstrap_key)

    trf = get_tlwe_transformed_add_mul_to_trf(tgsw_params, batch_shape, bk_len, perf_params)
    compiled = PureParallel.from_trf(trf, guiding_array='result').compile(thread)
    reference = tlwe_transformed_add_mul_to_trf_reference(
        tgsw_params, batch_shape, bk_len, perf_params)

    compiled(actual_dev, sample_dev, bootstrap_key_dev, bk_row_idx)
    actual = actual_dev.get()

    reference(expected, sample, bootstrap_key, bk_row_idx)

    if numpy.issubdtype(tdtype, numpy.integer):
        assert (expected == actual).all()
    else:
        assert numpy.allclose(expected, actual)
def test_tgsw_transformed_external_mul(thread):
    """
    Compare the compiled ``TGswTransformedExternalMul`` computation with
    ``TGswTransformedExternalMulReference``.

    Both update the accumulator array passed as the first argument;
    the two accumulators are compared with ``numpy.allclose`` afterwards.
    """
    batch_shape = (2, 3)
    nufhe_params = NuFHEParameters()
    perf_params = performance_parameters()

    tgsw_params = nufhe_params.tgsw_params
    tlwe_params = tgsw_params.tlwe_params
    decomp_length = tgsw_params.decomp_length
    mask_size = tlwe_params.mask_size
    polynomial_degree = tlwe_params.polynomial_degree
    transform_type = tlwe_params.transform_type

    transform = get_transform(transform_type)
    tlength = transform.transformed_length(polynomial_degree)
    tdtype = transform.transformed_dtype()

    bk_len = 10
    bk_row_idx = 2
    accum_shape = batch_shape + (mask_size + 1, polynomial_degree)
    bk_shape = (bk_len, mask_size + 1, decomp_length, mask_size + 1, tlength)

    # For NTT the key elements are requested as 'ff_number',
    # otherwise the transformed dtype is used directly.
    if transform_type == 'NTT':
        bk_element_type = 'ff_number'
    else:
        bk_element_type = tdtype
    bootstrap_key = get_test_array(bk_shape, bk_element_type)
    accum = get_test_array(accum_shape, Torus32, (-1000, 1000))

    bootstrap_key_dev = thread.to_device(bootstrap_key)
    accum_dev = thread.to_device(accum)
    thread.synchronize()

    compiled = TGswTransformedExternalMul(
        tgsw_params, batch_shape, bk_len, perf_params).compile(thread)
    reference = TGswTransformedExternalMulReference(
        tgsw_params, batch_shape, bk_len, perf_params)

    compiled(accum_dev, bootstrap_key_dev, bk_row_idx)
    accum_from_device = accum_dev.get()

    reference(accum, bootstrap_key, bk_row_idx)

    assert numpy.allclose(accum, accum_from_device)
def test_lwe_encrypt(thread):
    """
    Compare the compiled ``LweEncrypt`` computation with ``LweEncryptReference``.

    The ``a`` and ``b`` outputs must match exactly;
    the ``cv`` output is compared with ``errors_allclose``.
    """
    nufhe_params = NuFHEParameters()
    lwe_size = nufhe_params.in_out_params.size
    min_noise = nufhe_params.in_out_params.min_noise

    batch_shape = (16, 20)
    mask_shape = batch_shape + (lwe_size,)

    # Host buffers to be filled by the reference implementation
    expected_a = numpy.empty(mask_shape, Torus32)
    expected_b = numpy.empty(batch_shape, Torus32)
    expected_cv = numpy.empty(batch_shape, ErrorFloat)

    key = get_test_array(lwe_size, Int32, (0, 2))
    messages = get_test_array(batch_shape, Torus32)
    noises_a = get_test_array(mask_shape, Torus32)
    noises_b = get_test_array(batch_shape, Torus32)

    compiled = LweEncrypt(batch_shape, lwe_size, min_noise).compile(thread)
    reference = LweEncryptReference(batch_shape, lwe_size, min_noise)

    a_dev = thread.empty_like(expected_a)
    b_dev = thread.empty_like(expected_b)
    cv_dev = thread.empty_like(expected_cv)
    key_dev = thread.to_device(key)
    messages_dev = thread.to_device(messages)
    noises_a_dev = thread.to_device(noises_a)
    noises_b_dev = thread.to_device(noises_b)

    compiled(a_dev, b_dev, cv_dev, messages_dev, key_dev, noises_a_dev, noises_b_dev)
    reference(expected_a, expected_b, expected_cv, messages, key, noises_a, noises_b)

    actual_a = a_dev.get()
    actual_b = b_dev.get()
    actual_cv = cv_dev.get()

    assert (actual_a == expected_a).all()
    assert (actual_b == expected_b).all()
    assert errors_allclose(actual_cv, expected_cv)
def test_tlwe_encrypt_zero(thread):
    """
    Compare the compiled ``TLweEncryptZero`` computation with ``TLweEncryptZeroReference``.

    The ``a`` output must match exactly;
    the ``cv`` output is compared with ``errors_allclose``.
    """
    nufhe_params = NuFHEParameters()
    perf_params = PerformanceParameters(nufhe_params).for_device(thread.device_params)

    tlwe_params = nufhe_params.tgsw_params.tlwe_params
    mask_size = tlwe_params.mask_size
    polynomial_degree = tlwe_params.polynomial_degree
    min_noise = tlwe_params.min_noise

    batch_shape = (3, 4, 5)

    # Host buffers to be filled by the reference implementation
    expected_a = numpy.empty(batch_shape + (mask_size + 1, polynomial_degree), Torus32)
    expected_cv = numpy.empty(batch_shape, ErrorFloat)

    noises1 = get_test_array(batch_shape + (mask_size, polynomial_degree), Torus32)
    noises2 = get_test_array(batch_shape + (polynomial_degree,), Torus32)
    key = get_test_array((mask_size, polynomial_degree), Int32, (0, 2))

    compiled = TLweEncryptZero(tlwe_params, batch_shape, min_noise, perf_params).compile(thread)
    reference = TLweEncryptZeroReference(tlwe_params, batch_shape, min_noise, perf_params)

    a_dev = thread.empty_like(expected_a)
    cv_dev = thread.empty_like(expected_cv)
    noises1_dev = thread.to_device(noises1)
    noises2_dev = thread.to_device(noises2)
    key_dev = thread.to_device(key)

    compiled(a_dev, cv_dev, key_dev, noises1_dev, noises2_dev)
    reference(expected_a, expected_cv, key, noises1, noises2)

    actual_a = a_dev.get()
    actual_cv = cv_dev.get()

    assert (actual_a == expected_a).all()
    assert errors_allclose(actual_cv, expected_cv)
def test_t32_to_phase(thread):
    """
    Compare the compiled ``Torus32ToPhase`` computation with ``Torus32ToPhaseReference``
    for a message space size of 2048, using ``numpy.allclose``.
    """
    mspace_size = 2048
    array_shape = (10, 20, 30)

    phase = get_test_array(array_shape, Torus32)
    expected = numpy.empty(array_shape, Int32)

    phase_dev = thread.to_device(phase)
    actual_dev = thread.empty_like(expected)

    compiled = Torus32ToPhase(array_shape, mspace_size).compile(thread)
    reference = Torus32ToPhaseReference(array_shape, mspace_size)

    compiled(actual_dev, phase_dev)
    actual = actual_dev.get()

    reference(expected, phase)

    assert numpy.allclose(actual, expected)
def test_fft_performance(thread, transforms_per_block, constant_memory, heavy_performance_load):
    """
    Check the ``fft512`` transform against ``tr_fft.fft_transform_ref`` and print its timings.

    The test is skipped if the device does not support 'FFT', or if
    ``transforms_per_block`` exceeds the supported maximum for the device.
    A single-shot computation is used for the correctness check, and one compiled
    with ``kernel_repetitions`` (100 under heavy load, 5 otherwise) for ``get_times``.
    """
    device_params = thread.device_params
    if (not transform_supported(device_params, 'FFT')
            or transforms_per_block > max_supported_transforms_per_block(device_params, 'FFT')):
        pytest.skip()

    is_cuda = thread.api.get_id() == cuda_id()

    batch_shape = (2**14,)
    source = get_test_array(batch_shape + (512,), numpy.complex128)

    if heavy_performance_load:
        kernel_repetitions = 100
    else:
        kernel_repetitions = 5

    source_dev = thread.to_device(source)
    output_dev = thread.empty_like(source_dev)

    expected = tr_fft.fft_transform_ref(source)

    transform = fft512(use_constant_memory=constant_memory)

    single_shot = Transform(
        transform, batch_shape,
        transforms_per_block=transforms_per_block,
        ).compile(thread)
    repeated = Transform(
        transform, batch_shape,
        transforms_per_block=transforms_per_block,
        kernel_repetitions=kernel_repetitions,
        ).compile(thread)

    # Quick check of correctness
    single_shot(output_dev, source_dev)
    actual = output_dev.get()
    assert numpy.allclose(actual, expected)

    # Test performance (only the formatted timings are reported)
    _, times_str = get_times(thread, repeated, output_dev, source_dev)
    print("\n{backend}, {trnum} per block, test --- {times}".format(
        times=times_str,
        backend='cuda' if is_cuda else 'ocl ',
        trnum=transforms_per_block))
def check_func(
        thread, func_module, reference_func, output_type, input_types,
        ranges=None, test_values=None):
    """
    Run a kernel built from ``func_module`` on 1024 elements
    and require exact agreement with ``reference_func``.

    :param thread: object providing ``to_device`` and ``array``.
    :param func_module: passed through to ``get_func_kernel``.
    :param reference_func: called with the host input arrays; its result is
        compared element-wise with the kernel output.
    :param output_type: type tag of the output (converted with ``tp_dtype``).
    :param input_types: sequence of type tags, one per input array.
    :param ranges: optional sequence indexed like ``input_types``; each entry is
        passed to ``get_test_array`` as ``val_range``.
    :param test_values: optional sequence of explicit test cases. Entry ``i``
        (if not ``None``) is a sequence with one value per input array;
        value ``j`` overwrites element ``i`` of input array ``j``.
    :raises AssertionError: if any output element differs from the reference;
        the message contains the inputs, the kernel output and the reference output.
    """
    N = 1024

    test = get_func_kernel(thread, func_module, output_type, input_types)

    arrays = [
        get_test_array(N, tp, val_range=ranges[i] if ranges is not None else None)
        for i, tp in enumerate(input_types)]

    # Plant the explicitly requested test cases at the start of the input arrays
    if test_values is not None:
        for i, tvs in enumerate(test_values):
            if tvs is None:
                continue
            for j, tv in enumerate(tvs):
                arrays[j][i] = tv

    arrays_dev = [thread.to_device(arr) for arr in arrays]
    dest_dev = thread.array(N, tp_dtype(output_type))

    test(dest_dev, *arrays_dev, global_size=N)

    result = dest_dev.get()
    expected = reference_func(*arrays)

    # The message is only built if the assertion fails
    assert (result == expected).all(), (
        "kernel output differs from the reference\n"
        "inputs: {}\nresult: {}\nreference: {}".format(arrays, result, expected))
def test_lwe_linear_broadcast(thread):
    """
    Compare the compiled ``LweLinear`` computation with ``LweLinearReference`` when the
    source arrays have the result's batch shape without its first dimension.

    Uses ``coeff = 1`` and ``add_result=True``. The ``a`` and ``b`` arrays must match
    exactly; the ``cv`` array is compared with ``errors_allclose``.
    """
    nufhe_params = NuFHEParameters()
    lwe_size = nufhe_params.in_out_params.size

    res_shape = (10, 20)
    # Source batch shape is the result batch shape with the leading axis dropped
    src_shape = res_shape[1:]

    res_a = get_test_array(res_shape + (lwe_size,), Torus32)
    res_b = get_test_array(res_shape, Torus32)
    res_cv = get_test_array(res_shape, ErrorFloat, (-1, 1))

    src_a = get_test_array(src_shape + (lwe_size,), Torus32)
    src_b = get_test_array(src_shape, Torus32)
    src_cv = get_test_array(src_shape, ErrorFloat, (-1, 1))

    coeff = 1
    add_result = True

    res_info = LweSampleArrayShapeInfo(res_a, res_b, res_cv)
    src_info = LweSampleArrayShapeInfo(src_a, src_b, src_cv)

    compiled = LweLinear(res_info, src_info, add_result=add_result).compile(thread)
    reference = LweLinearReference(res_info, src_info, add_result=add_result)

    host_arrays = (res_a, res_b, res_cv, src_a, src_b, src_cv)
    device_arrays = [thread.to_device(arr) for arr in host_arrays]
    thread.synchronize()

    compiled(*device_arrays, coeff)
    reference(*host_arrays, coeff)

    res_a_dev, res_b_dev, res_cv_dev = device_arrays[:3]
    assert (res_a_dev.get() == res_a).all()
    assert (res_b_dev.get() == res_b).all()
    assert errors_allclose(res_cv_dev.get(), res_cv)
def test_lwe_linear(thread, positive_coeff, add_result):
    """
    Compare the compiled ``LweLinear`` computation with ``LweLinearReference``
    for result and source arrays of the same shape.

    ``positive_coeff`` selects a coefficient of 1 or -1; ``add_result`` is passed
    through to both implementations. The ``a`` and ``b`` arrays must match exactly;
    the ``cv`` array is compared with ``numpy.allclose``.
    """
    nufhe_params = NuFHEParameters()
    lwe_size = nufhe_params.in_out_params.size

    batch_shape = (10, 20)
    mask_shape = batch_shape + (lwe_size,)

    res_a = get_test_array(mask_shape, Torus32)
    res_b = get_test_array(batch_shape, Torus32)
    res_cv = get_test_array(batch_shape, Float, (-1, 1))

    src_a = get_test_array(mask_shape, Torus32)
    src_b = get_test_array(batch_shape, Torus32)
    src_cv = get_test_array(batch_shape, Float, (-1, 1))

    if positive_coeff:
        coeff = 1
    else:
        coeff = -1

    # Result and source share the same shape information
    shape_info = LweSampleArrayShapeInfo(src_a, src_b, src_cv)

    compiled = LweLinear(shape_info, shape_info, add_result=add_result).compile(thread)
    reference = LweLinearReference(shape_info, shape_info, add_result=add_result)

    host_arrays = (res_a, res_b, res_cv, src_a, src_b, src_cv)
    device_arrays = [thread.to_device(arr) for arr in host_arrays]
    thread.synchronize()

    compiled(*device_arrays, coeff)
    reference(*host_arrays, coeff)

    res_a_dev, res_b_dev, res_cv_dev = device_arrays[:3]
    assert (res_a_dev.get() == res_a).all()
    assert (res_b_dev.get() == res_b).all()
    assert numpy.allclose(res_cv_dev.get(), res_cv)
def test_ntt_performance(thread, transforms_per_block, constant_memory, heavy_performance_load):
    """
    Print timings of the ``ntt1024`` transform for every combination of
    base/mul/lsh methods, followed by the combination with the smallest minimum time.

    The test is skipped if the device does not support 'NTT', or if
    ``transforms_per_block`` exceeds the supported maximum for the device.
    No correctness check is performed (see the TODO below).
    """
    device_params = thread.device_params
    if (not transform_supported(device_params, 'NTT')
            or transforms_per_block > max_supported_transforms_per_block(device_params, 'NTT')):
        pytest.skip()

    is_cuda = thread.api.get_id() == cuda_id()

    base_methods = ['cuda_asm', 'c']
    mul_methods = ['cuda_asm', 'c_from_asm', 'c']
    lsh_methods = ['cuda_asm', 'c_from_asm', 'c']
    method_combinations = [
        combination
        for combination in itertools.product(base_methods, mul_methods, lsh_methods)
        # filter out all usage of CUDA asm if we're on OpenCL
        if is_cuda or 'cuda_asm' not in combination]

    batch_shape = (2**14,)
    source = get_test_array(batch_shape + (1024,), "ff_number")

    if heavy_performance_load:
        kernel_repetitions = 100
    else:
        kernel_repetitions = 5

    source_dev = thread.to_device(source)
    output_dev = thread.empty_like(source_dev)

    # TODO: compute a reference NTT (`tr_ntt.ntt_transform_ref(source)`)
    # when it's fast enough on CPU

    print()

    candidates = []

    for base_method, mul_method, lsh_method in method_combinations:

        transform = ntt1024(
            base_method=base_method, mul_method=mul_method, lsh_method=lsh_method,
            use_constant_memory=constant_memory)

        # Compiled, but not executed until the reference is available (see the TODO below)
        single_shot = Transform(
            transform, batch_shape,
            transforms_per_block=transforms_per_block,
            ).compile(thread)

        repeated = Transform(
            transform, batch_shape,
            transforms_per_block=transforms_per_block,
            kernel_repetitions=kernel_repetitions,
            ).compile(thread)

        # TODO: once the reference NTT is available, run `single_shot(output_dev, source_dev)`
        # and require `output_dev.get()` to be exactly equal to the reference.

        # Test performance
        times, times_str = get_times(thread, repeated, output_dev, source_dev)
        print("  base: {bm}, mul: {mm}, lsh: {lm}".format(
            bm=base_method, mm=mul_method, lm=lsh_method))
        print("  {backend}, {trnum} per block, test --- {times}".format(
            times=times_str,
            backend='cuda' if is_cuda else 'ocl ',
            trnum=transforms_per_block))

        candidates.append((times.min(), base_method, mul_method, lsh_method))

    # Pick the combination with the smallest minimum time
    time_best, base_method, mul_method, lsh_method = min(
        candidates, key=lambda candidate: candidate[0])
    print("Best time: {tb:.4f} for [base: {bm}, mul: {mm}, lsh: {lm}]".format(
        tb=time_best, bm=base_method, mm=mul_method, lm=lsh_method
        ))
def test_prepare_for_mul_cpu():
    """
    Check that ``ntt.prepare_for_mul_cpu`` agrees exactly with ``ref_prepare_for_mul``
    on a 1024-element 'ff_number' array from ``get_test_array``.
    """
    source = get_test_array(1024, 'ff_number')

    actual = ntt.prepare_for_mul_cpu(source)
    expected = ref_prepare_for_mul(source)

    assert (actual == expected).all()