Ejemplo n.º 1
0
def check_func_performance(tag,
                           thread,
                           func_module,
                           reference_func,
                           output_type,
                           input_types,
                           ranges=None,
                           heavy_performance_load=False):
    """
    Sanity-check a device function against a reference and print its timing.

    Two kernels are built with ``get_func_kernel``: a plain one, used once to
    verify the output against ``reference_func``, and one built with
    ``repetitions=...``, which is timed.

    :param tag: label included in the printed report.
    :param thread: device thread used to allocate arrays and run kernels.
    :param func_module: module with the function under test
        (passed through to ``get_func_kernel``).
    :param reference_func: host-side callable taking the input arrays
        and returning the expected output.
    :param output_type: type identifier of the output, converted with ``tp_dtype``.
    :param input_types: iterable of type identifiers, one per input array.
    :param ranges: optional sequence of value ranges, indexed in parallel with
        ``input_types``; ``None`` means no range is passed to ``get_test_array``.
    :param heavy_performance_load: if true, use 1024 * 1024 elements
        instead of 1024 * 32.
    """

    N = 1024 * (1024 if heavy_performance_load else 32)
    repetitions = 100000
    attempts = 10

    test = get_func_kernel(thread, func_module, output_type, input_types)
    perf_test = get_func_kernel(thread,
                                func_module,
                                output_type,
                                input_types,
                                repetitions=repetitions)

    arrays = [
        get_test_array(N,
                       tp,
                       val_range=ranges[i] if ranges is not None else None)
        for i, tp in enumerate(input_types)
    ]

    arrays_dev = [thread.to_device(arr) for arr in arrays]
    dest_dev = thread.array(N, tp_dtype(output_type))

    # Sanity check
    test(dest_dev, *arrays_dev, global_size=N)
    assert (dest_dev.get() == reference_func(*arrays)).all()

    # Performance check.
    # ``time.perf_counter()`` is used instead of ``time.time()``: it is monotonic
    # and has the highest available resolution, so clock adjustments cannot
    # distort the measured intervals.
    times = []

    for _ in range(attempts):
        # Synchronize on both sides so that only the kernel run is measured.
        thread.synchronize()
        t1 = time.perf_counter()
        perf_test(dest_dev, *arrays_dev, global_size=N)
        thread.synchronize()
        t2 = time.perf_counter()
        times.append(t2 - t1)

    times = numpy.array(times)

    # Seconds per run -> picoseconds per element per repetition.
    times /= repetitions
    times /= N
    times *= 1e12

    print()
    print(
        "{backend}: {tag} --- min: {min:.4f}, mean: {mean:.4f}, std: {std:.4f}"
        .format(tag=tag,
                min=times.min(),
                mean=times.mean(),
                std=times.std(),
                backend='cuda'
                if thread.api.get_id() == cluda.cuda_id() else 'ocl '))
Ejemplo n.º 2
0
def test_make_lwe_keyswitch_key(thread):
    """
    Run ``MakeLweKeyswitchKey`` on the device and compare all three of its
    outputs with ``MakeLweKeyswitchKeyReference`` fed with the same inputs.
    """
    nufhe_params = NuFHEParameters()
    in_size = nufhe_params.tgsw_params.tlwe_params.extracted_lweparams.size
    out_size = nufhe_params.in_out_params.size
    ks_length = nufhe_params.ks_decomp_length
    ks_log2_base = nufhe_params.ks_log2_base
    ks_base = 2**ks_log2_base
    noise_bound = nufhe_params.in_out_params.min_noise

    # Host-side output buffers; they are passed to the reference below
    # and then compared with the device results.
    key_shape = (in_size, ks_length, ks_base)
    expected_a = numpy.empty(key_shape + (out_size,), dtype=Torus32)
    expected_b = numpy.empty(key_shape, dtype=Torus32)
    expected_cv = numpy.empty(key_shape, dtype=Float)

    # Inputs (the noise arrays have ``base - 1`` entries along the third axis).
    noise_shape = (in_size, ks_length, ks_base - 1)
    in_key = get_test_array(in_size, Int32, (0, 2))
    out_key = get_test_array(out_size, Int32, (0, 2))
    noises_a = get_test_array(noise_shape + (out_size,), Torus32)
    noises_b = get_test_array(noise_shape, Float, (-noise_bound, noise_bound))

    comp_args = (in_size, out_size, ks_length, ks_log2_base, noise_bound)
    device_comp = MakeLweKeyswitchKey(*comp_args).compile(thread)
    host_ref = MakeLweKeyswitchKeyReference(*comp_args)

    host_inputs = [in_key, out_key, noises_a, noises_b]
    outputs_dev = [
        thread.empty_like(arr) for arr in (expected_a, expected_b, expected_cv)]
    inputs_dev = [thread.to_device(arr) for arr in host_inputs]

    device_comp(*(outputs_dev + inputs_dev))
    host_ref(*([expected_a, expected_b, expected_cv] + host_inputs))

    actual_a, actual_b, actual_cv = [arr_dev.get() for arr_dev in outputs_dev]

    # Integer outputs must match exactly; the float output only approximately.
    assert (actual_a == expected_a).all()
    assert (actual_b == expected_b).all()
    assert numpy.allclose(actual_cv, expected_cv)
Ejemplo n.º 3
0
def test_tlwe_transformed_add_mul_to_trf(thread):
    """
    Compile the transformation returned by ``get_tlwe_transformed_add_mul_to_trf``
    as a ``PureParallel`` computation (with NTT parameters) and compare its
    output with ``tlwe_transformed_add_mul_to_trf_reference``.
    """
    batch_shape = (2, 3)
    nufhe_params = NuFHEParameters(transform_type='NTT')
    perf_params = PerformanceParameters(nufhe_params).for_device(thread.device_params)
    tgsw_params = nufhe_params.tgsw_params
    tlwe_params = tgsw_params.tlwe_params

    decomp_length = tgsw_params.decomp_length
    rows = tlwe_params.mask_size + 1
    polynomial_degree = tlwe_params.polynomial_degree

    transform = get_transform(tlwe_params.transform_type)
    tlength = transform.transformed_length(polynomial_degree)
    tdtype = transform.transformed_dtype()

    bk_len = 10
    bk_row_idx = 2

    # Host buffer for the reference output.
    expected = numpy.empty(batch_shape + (rows, tlength), tdtype)

    sample = get_test_array(
        batch_shape + (rows, decomp_length, tlength), 'ff_number')
    bootstrap_key = get_test_array(
        (bk_len, rows, decomp_length, rows, tlength), 'ff_number')

    result_dev = thread.empty_like(expected)
    sample_dev = thread.to_device(sample)
    bootstrap_key_dev = thread.to_device(bootstrap_key)

    factory_args = (tgsw_params, batch_shape, bk_len, perf_params)
    trf = get_tlwe_transformed_add_mul_to_trf(*factory_args)
    device_comp = PureParallel.from_trf(trf, guiding_array='result').compile(thread)
    host_ref = tlwe_transformed_add_mul_to_trf_reference(*factory_args)

    device_comp(result_dev, sample_dev, bootstrap_key_dev, bk_row_idx)
    actual = result_dev.get()

    host_ref(expected, sample, bootstrap_key, bk_row_idx)

    # Integer transforms are compared exactly, everything else approximately.
    if not numpy.issubdtype(tdtype, numpy.integer):
        assert numpy.allclose(expected, actual)
    else:
        assert (expected == actual).all()
Ejemplo n.º 4
0
def test_tgsw_transformed_external_mul(thread):
    """
    Run ``TGswTransformedExternalMul`` on the device and compare the resulting
    accumulator with the one produced by ``TGswTransformedExternalMulReference``.
    """
    batch_shape = (2, 3)
    nufhe_params = NuFHEParameters()
    perf_params = performance_parameters()
    tgsw_params = nufhe_params.tgsw_params
    tlwe_params = tgsw_params.tlwe_params

    decomp_length = tgsw_params.decomp_length
    rows = tlwe_params.mask_size + 1
    polynomial_degree = tlwe_params.polynomial_degree

    transform_type = tlwe_params.transform_type
    transform = get_transform(transform_type)
    tlength = transform.transformed_length(polynomial_degree)
    tdtype = transform.transformed_dtype()

    bk_len = 10
    bk_row_idx = 2

    # NTT keys are generated as 'ff_number'; other transforms use their own dtype.
    if transform_type == 'NTT':
        bk_element_type = 'ff_number'
    else:
        bk_element_type = tdtype

    bootstrap_key = get_test_array(
        (bk_len, rows, decomp_length, rows, tlength), bk_element_type)
    accum = get_test_array(
        batch_shape + (rows, polynomial_degree), Torus32, (-1000, 1000))

    bootstrap_key_dev = thread.to_device(bootstrap_key)
    accum_dev = thread.to_device(accum)
    thread.synchronize()

    factory_args = (tgsw_params, batch_shape, bk_len, perf_params)
    device_comp = TGswTransformedExternalMul(*factory_args).compile(thread)
    host_ref = TGswTransformedExternalMulReference(*factory_args)

    device_comp(accum_dev, bootstrap_key_dev, bk_row_idx)
    actual = accum_dev.get()

    # The comparison below relies on the reference updating ``accum`` in place.
    host_ref(accum, bootstrap_key, bk_row_idx)

    assert numpy.allclose(accum, actual)
Ejemplo n.º 5
0
def test_lwe_encrypt(thread):
    """
    Run ``LweEncrypt`` on the device and compare its three outputs
    with those of ``LweEncryptReference`` for the same inputs.
    """
    in_out_params = NuFHEParameters().in_out_params
    lwe_size = in_out_params.size
    noise = in_out_params.min_noise

    batch_shape = (16, 20)
    vector_shape = batch_shape + (lwe_size,)

    # Host buffers for the reference outputs.
    expected_a = numpy.empty(vector_shape, Torus32)
    expected_b = numpy.empty(batch_shape, Torus32)
    expected_cv = numpy.empty(batch_shape, ErrorFloat)

    key = get_test_array(lwe_size, Int32, (0, 2))
    messages = get_test_array(batch_shape, Torus32)
    noises_a = get_test_array(vector_shape, Torus32)
    noises_b = get_test_array(batch_shape, Torus32)

    device_comp = LweEncrypt(batch_shape, lwe_size, noise).compile(thread)
    host_ref = LweEncryptReference(batch_shape, lwe_size, noise)

    outputs_dev = [
        thread.empty_like(arr) for arr in (expected_a, expected_b, expected_cv)]
    key_dev, messages_dev, noises_a_dev, noises_b_dev = [
        thread.to_device(arr) for arr in (key, messages, noises_a, noises_b)]

    # Note the argument order: messages come before the key.
    device_comp(*(outputs_dev + [messages_dev, key_dev, noises_a_dev, noises_b_dev]))
    host_ref(expected_a, expected_b, expected_cv, messages, key, noises_a, noises_b)

    actual_a, actual_b, actual_cv = [arr_dev.get() for arr_dev in outputs_dev]

    assert (actual_a == expected_a).all()
    assert (actual_b == expected_b).all()
    assert errors_allclose(actual_cv, expected_cv)
Ejemplo n.º 6
0
def test_tlwe_encrypt_zero(thread):
    """
    Run ``TLweEncryptZero`` on the device and compare its outputs
    with those of ``TLweEncryptZeroReference`` for the same inputs.
    """
    nufhe_params = NuFHEParameters()
    perf_params = PerformanceParameters(nufhe_params).for_device(
        thread.device_params)
    tlwe_params = nufhe_params.tgsw_params.tlwe_params

    mask_size = tlwe_params.mask_size
    polynomial_degree = tlwe_params.polynomial_degree
    noise = tlwe_params.min_noise

    batch_shape = (3, 4, 5)

    # Host buffers for the reference outputs.
    expected_a = numpy.empty(
        batch_shape + (mask_size + 1, polynomial_degree), Torus32)
    expected_cv = numpy.empty(batch_shape, ErrorFloat)

    noises1 = get_test_array(batch_shape + (mask_size, polynomial_degree), Torus32)
    noises2 = get_test_array(batch_shape + (polynomial_degree,), Torus32)
    key = get_test_array((mask_size, polynomial_degree), Int32, (0, 2))

    comp_args = (tlwe_params, batch_shape, noise, perf_params)
    device_comp = TLweEncryptZero(*comp_args).compile(thread)
    host_ref = TLweEncryptZeroReference(*comp_args)

    result_a_dev = thread.empty_like(expected_a)
    result_cv_dev = thread.empty_like(expected_cv)
    noises1_dev, noises2_dev, key_dev = [
        thread.to_device(arr) for arr in (noises1, noises2, key)]

    device_comp(result_a_dev, result_cv_dev, key_dev, noises1_dev, noises2_dev)
    host_ref(expected_a, expected_cv, key, noises1, noises2)

    actual_a = result_a_dev.get()
    actual_cv = result_cv_dev.get()

    assert (actual_a == expected_a).all()
    assert errors_allclose(actual_cv, expected_cv)
Ejemplo n.º 7
0
def test_t32_to_phase(thread):
    """
    Run ``Torus32ToPhase`` on the device and compare its output
    with ``Torus32ToPhaseReference`` for the same input array.
    """
    mspace_size = 2048
    array_shape = (10, 20, 30)

    phase = get_test_array(array_shape, Torus32)
    expected = numpy.empty(array_shape, Int32)

    phase_dev = thread.to_device(phase)
    result_dev = thread.empty_like(expected)

    device_comp = Torus32ToPhase(array_shape, mspace_size).compile(thread)
    host_ref = Torus32ToPhaseReference(array_shape, mspace_size)

    device_comp(result_dev, phase_dev)
    actual = result_dev.get()

    host_ref(expected, phase)

    assert numpy.allclose(actual, expected)
Ejemplo n.º 8
0
def test_fft_performance(thread, transforms_per_block, constant_memory, heavy_performance_load):
    """
    Check the 512-point FFT ``Transform`` against ``tr_fft.fft_transform_ref``
    and print the timings reported by ``get_times`` for a repeated-kernel variant.

    The test is skipped when the device does not support 'FFT' or cannot handle
    the requested number of transforms per block.
    """
    device_params = thread.device_params
    if (not transform_supported(device_params, 'FFT')
            or transforms_per_block > max_supported_transforms_per_block(device_params, 'FFT')):
        pytest.skip()

    backend_name = 'cuda' if thread.api.get_id() == cuda_id() else 'ocl '

    batch_shape = (2**14,)
    a = get_test_array(batch_shape + (512,), numpy.complex128)

    if heavy_performance_load:
        kernel_repetitions = 100
    else:
        kernel_repetitions = 5

    a_dev = thread.to_device(a)
    res_dev = thread.empty_like(a_dev)

    res_ref = tr_fft.fft_transform_ref(a)

    transform = fft512(use_constant_memory=constant_memory)

    # Single-pass computation for the correctness check ...
    fft_comp = Transform(
        transform, batch_shape,
        transforms_per_block=transforms_per_block).compile(thread)
    # ... and a variant built with ``kernel_repetitions`` for the timing.
    fft_comp_repeated = Transform(
        transform, batch_shape,
        transforms_per_block=transforms_per_block,
        kernel_repetitions=kernel_repetitions).compile(thread)

    # Quick check of correctness
    fft_comp(res_dev, a_dev)
    assert numpy.allclose(res_dev.get(), res_ref)

    # Test performance
    _, times_str = get_times(thread, fft_comp_repeated, res_dev, a_dev)
    report = "\n{backend}, {trnum} per block, test --- {times}".format(
        times=times_str,
        backend=backend_name,
        trnum=transforms_per_block)
    print(report)
Ejemplo n.º 9
0
def check_func(thread,
               func_module,
               reference_func,
               output_type,
               input_types,
               ranges=None,
               test_values=None):
    """
    Run a device function on 1024-element test arrays and require exact
    agreement with ``reference_func`` applied to the same host arrays.

    :param thread: device thread used to allocate arrays and run the kernel.
    :param func_module: module with the function under test
        (passed through to ``get_func_kernel``).
    :param reference_func: host-side callable taking the input arrays
        and returning the expected output.
    :param output_type: type identifier of the output, converted with ``tp_dtype``.
    :param input_types: iterable of type identifiers, one per input array.
    :param ranges: optional sequence of value ranges, indexed in parallel with
        ``input_types``.
    :param test_values: optional sequence whose ``i``-th entry is either ``None``
        or a sequence with one value per input array; those values are written
        to position ``i`` of the corresponding arrays.
    """
    N = 1024

    kernel = get_func_kernel(thread, func_module, output_type, input_types)

    arrays = []
    for idx, tp in enumerate(input_types):
        val_range = None if ranges is None else ranges[idx]
        arrays.append(get_test_array(N, tp, val_range=val_range))

    # Plant the explicitly requested values at the start of the input arrays.
    if test_values is not None:
        for pos, values in enumerate(test_values):
            if values is None:
                continue
            for arr_idx, value in enumerate(values):
                arrays[arr_idx][pos] = value

    arrays_dev = [thread.to_device(arr) for arr in arrays]
    dest_dev = thread.array(N, tp_dtype(output_type))

    kernel(dest_dev, *arrays_dev, global_size=N)

    # Debugging aid: print `arrays`, `dest_dev.get()` and
    # `reference_func(*arrays)` here to inspect a mismatch.

    expected = reference_func(*arrays)
    assert (dest_dev.get() == expected).all()
Ejemplo n.º 10
0
def test_lwe_linear_broadcast(thread):
    """
    Run ``LweLinear`` with a source whose shape is the trailing part of the
    result shape, and compare the updated result arrays with those produced
    by ``LweLinearReference``.
    """
    lwe_size = NuFHEParameters().in_out_params.size

    res_shape = (10, 20)
    src_shape = res_shape[1:]

    def make_sample_arrays(shape):
        # The tuple is evaluated left to right, so arrays are created
        # in the order (a, b, cv).
        return (
            get_test_array(shape + (lwe_size,), Torus32),
            get_test_array(shape, Torus32),
            get_test_array(shape, ErrorFloat, (-1, 1)),
        )

    res_arrays = make_sample_arrays(res_shape)
    src_arrays = make_sample_arrays(src_shape)

    coeff = 1
    add_result = True

    res_shape_info = LweSampleArrayShapeInfo(*res_arrays)
    src_shape_info = LweSampleArrayShapeInfo(*src_arrays)

    device_comp = LweLinear(
        res_shape_info, src_shape_info, add_result=add_result).compile(thread)
    host_ref = LweLinearReference(
        res_shape_info, src_shape_info, add_result=add_result)

    host_arrays = res_arrays + src_arrays
    dev_arrays = [thread.to_device(arr) for arr in host_arrays]
    thread.synchronize()

    device_comp(*(dev_arrays + [coeff]))
    host_ref(*(host_arrays + (coeff,)))

    res_a, res_b, res_cv = res_arrays
    res_a_dev, res_b_dev, res_cv_dev = dev_arrays[:3]

    assert (res_a_dev.get() == res_a).all()
    assert (res_b_dev.get() == res_b).all()
    assert errors_allclose(res_cv_dev.get(), res_cv)
Ejemplo n.º 11
0
def test_lwe_linear(thread, positive_coeff, add_result):
    """
    Run ``LweLinear`` with same-shaped result and source arrays and compare
    the updated result arrays with those produced by ``LweLinearReference``.

    ``positive_coeff`` selects a coefficient of 1 or -1; ``add_result`` is
    forwarded to both the computation and the reference.
    """
    lwe_size = NuFHEParameters().in_out_params.size

    shape = (10, 20)

    def make_sample_arrays():
        # The tuple is evaluated left to right, so arrays are created
        # in the order (a, b, cv).
        return (
            get_test_array(shape + (lwe_size,), Torus32),
            get_test_array(shape, Torus32),
            get_test_array(shape, Float, (-1, 1)),
        )

    res_arrays = make_sample_arrays()
    src_arrays = make_sample_arrays()

    if positive_coeff:
        coeff = 1
    else:
        coeff = -1

    # Result and source share one shape description, built from the source arrays.
    shape_info = LweSampleArrayShapeInfo(*src_arrays)

    device_comp = LweLinear(
        shape_info, shape_info, add_result=add_result).compile(thread)
    host_ref = LweLinearReference(shape_info, shape_info, add_result=add_result)

    host_arrays = res_arrays + src_arrays
    dev_arrays = [thread.to_device(arr) for arr in host_arrays]
    thread.synchronize()

    device_comp(*(dev_arrays + [coeff]))
    host_ref(*(host_arrays + (coeff,)))

    res_a, res_b, res_cv = res_arrays
    res_a_dev, res_b_dev, res_cv_dev = dev_arrays[:3]

    assert (res_a_dev.get() == res_a).all()
    assert (res_b_dev.get() == res_b).all()
    assert numpy.allclose(res_cv_dev.get(), res_cv)
Ejemplo n.º 12
0
def test_ntt_performance(thread, transforms_per_block, constant_memory, heavy_performance_load):
    """
    Time the 1024-point NTT ``Transform`` for every combination of base, mul
    and lsh methods, print each result and finally the best one.

    The test is skipped when the device does not support 'NTT' or cannot handle
    the requested number of transforms per block. Correctness is not checked
    here (see the TODO notes below).
    """
    device_params = thread.device_params
    if (not transform_supported(device_params, 'NTT')
            or transforms_per_block > max_supported_transforms_per_block(device_params, 'NTT')):
        pytest.skip()

    is_cuda = thread.api.get_id() == cuda_id()
    backend_name = 'cuda' if is_cuda else 'ocl '

    base_methods = ['cuda_asm', 'c']
    mul_methods = ['cuda_asm', 'c_from_asm', 'c']
    lsh_methods = ['cuda_asm', 'c_from_asm', 'c']

    # On OpenCL, drop every combination that uses CUDA asm.
    methods = [
        combo
        for combo in itertools.product(base_methods, mul_methods, lsh_methods)
        if is_cuda or 'cuda_asm' not in combo
    ]

    batch_shape = (2**14,)
    a = get_test_array(batch_shape + (1024,), "ff_number")

    if heavy_performance_load:
        kernel_repetitions = 100
    else:
        kernel_repetitions = 5

    a_dev = thread.to_device(a)
    res_dev = thread.empty_like(a_dev)

    # TODO: compute a reference NTT when it's fast enough on CPU
    #res_ref = tr_ntt.ntt_transform_ref(a)

    print()
    min_times = []
    for base_method, mul_method, lsh_method in methods:

        transform = ntt1024(
            base_method=base_method,
            mul_method=mul_method,
            lsh_method=lsh_method,
            use_constant_memory=constant_memory)

        # Compiled as in the original even though the correctness check is disabled.
        ntt_comp = Transform(
            transform, batch_shape,
            transforms_per_block=transforms_per_block).compile(thread)
        ntt_comp_repeated = Transform(
            transform, batch_shape,
            transforms_per_block=transforms_per_block,
            kernel_repetitions=kernel_repetitions).compile(thread)

        # TODO: compute a reference NTT when it's fast enough on CPU
        # Quick check of correctness
        #ntt_comp(res_dev, a_dev)
        #res_test = res_dev.get()
        #assert (res_test == res_ref).all()

        # Test performance
        times, times_str = get_times(thread, ntt_comp_repeated, res_dev, a_dev)
        methods_line = "  base: {bm}, mul: {mm}, lsh: {lm}".format(
            bm=base_method, mm=mul_method, lm=lsh_method)
        times_line = "  {backend}, {trnum} per block, test --- {times}".format(
            times=times_str,
            backend=backend_name,
            trnum=transforms_per_block)
        print(methods_line)
        print(times_line)

        min_times.append((times.min(), base_method, mul_method, lsh_method))

    # Pick the entry with the smallest minimal time (first one wins on ties).
    time_best, base_method, mul_method, lsh_method = min(
        min_times, key=lambda entry: entry[0])
    print("Best time: {tb:.4f} for [base: {bm}, mul: {mm}, lsh: {lm}]".format(
        tb=time_best, bm=base_method, mm=mul_method, lm=lsh_method
        ))
Ejemplo n.º 13
0
def test_prepare_for_mul_cpu():
    """
    Check that ``ntt.prepare_for_mul_cpu`` agrees element-wise with
    ``ref_prepare_for_mul`` on a 1024-element 'ff_number' test array.
    """
    data = get_test_array(1024, 'ff_number')
    actual = ntt.prepare_for_mul_cpu(data)
    expected = ref_prepare_for_mul(data)
    mismatches = actual != expected
    assert not mismatches.any()