Example #1
0
def multi_gpu_benchmark(n: int,
                        batches: int,
                        compute_device_pool: ComputeDevicePool,
                        repetitions: int = 1,
                        verbose: bool = False) -> tp.Dict[int, float]:
    """
    Times the estimation of pi for every device count from one up to the
    number of devices in the pool.

    For a given device count k, `estimate_pi` is dispatched through
    `compute_device_pool.map_reduce` with `number_of_batches=k`; each call
    receives `n=ceil(n / k)` and the partial estimates are combined into their
    mean (each one is divided by k before being added to the running total).

    Args:
        n: overall problem size; split evenly (rounded up) across the devices
        batches: forwarded unchanged to `estimate_pi`
        compute_device_pool: pool whose `map_reduce` runs the estimation
        repetitions: how many times the estimation is repeated per device count
        verbose: if True, prints begin/end markers and every estimate of pi

    Returns:
        a dictionary mapping the number of devices used to the elapsed time
        reported by `Timer`, divided by `repetitions`
    """
    def report(message) -> None:
        # Progress output is only produced in verbose mode.
        if verbose:
            print(message)

    report('multi gpu benchmark - begin')

    runtime_by_device_count = {}

    for device_count in range(1, compute_device_pool.number_of_devices + 1):

        def estimate_share():
            # Work item executed once per batch: an equal share of n.
            return estimate_pi(n=math.ceil(n / device_count),
                               batches=batches,
                               gpu=True)

        def add_weighted(running_total, device_estimate):
            # Dividing each estimate by the device count turns the sum into a mean.
            return running_total + device_estimate / device_count

        with Timer() as timer:
            for _ in range(repetitions):
                pi = compute_device_pool.map_reduce(
                    estimate_share,
                    reduction=add_weighted,
                    initial_value=0.0,
                    number_of_batches=device_count)

                sync()
                report(pi)

        runtime_by_device_count[device_count] = timer.elapsed / repetitions

    report('multi gpu benchmark - end')

    return runtime_by_device_count
def multi_gpu_benchmark(n: int,
                        batches: int,
                        gpu_pool: ComputeDevicePool,
                        repetitions: int = 1) -> tp.Dict[int, float]:
    """
    Measures how long the estimation of pi takes when it is spread over
    1, 2, ..., `gpu_pool.number_of_devices` devices.

    Args:
        n: overall problem size; every batch is given `ceil(n / k)` as its
           `n` argument when k devices are used
        batches: forwarded unchanged to `estimate_pi`
        gpu_pool: pool whose `map_reduce` dispatches the work
        repetitions: how many times the estimation is repeated per device count

    Returns:
        a dictionary mapping the number of devices used to the elapsed time
        reported by `Timer`, divided by `repetitions`
    """
    def average_runtime(device_count: int) -> float:
        # Runs the estimation `repetitions` times on `device_count` devices
        # and returns the elapsed time per repetition.
        with Timer() as timer:
            for _ in range(repetitions):
                # The estimate itself is not reported; only the time matters.
                pi_estimate = gpu_pool.map_reduce(
                    lambda: estimate_pi(n=math.ceil(n / device_count),
                                        batches=batches,
                                        gpu=True),
                    # dividing by the device count averages the partial results
                    reduction=lambda total, part: total + part / device_count,
                    initial_value=0.0,
                    number_of_batches=device_count)

                sync()

        return timer.elapsed / repetitions

    return {
        device_count: average_runtime(device_count)
        for device_count in range(1, gpu_pool.number_of_devices + 1)
    }
def run_benchmarks(
        numerical_package_bundles:
        tp.Optional[tp.Tuple[tp.Type[NumericalPackageBundle], ...]] = None) \
        -> tp.Dict[tp.Union[tp.Type[NumericalPackageBundle],
                            'CocosMultiGPUBundle'],
                   HestonBenchmarkResults]:
    """
    Runs the Heston option pricing benchmark for a set of numerical package
    bundles and, if loky is installed, additionally on multiple CPU cores and
    (when the compute device pool reports more than one device) on multiple
    GPUs. The results of every run are printed as they become available.

    Args:
        numerical_package_bundles:
            the bundle classes to benchmark; if None, the bundles returned by
            `get_available_numerical_packages(list_installed_bundles=True)`
            are used

    Returns:
        a dictionary of benchmark results. Keys are the bundle classes that
        were benchmarked, `NumpyMulticoreBundle` for the multi core run and
        one `CocosMultiGPUBundle` instance per number of GPUs used.
    """
    if numerical_package_bundles is None:
        numerical_package_bundles = \
            get_available_numerical_packages(list_installed_bundles=True)

    # model parameters
    x0 = 0.0  # initial log stock price
    v0 = 0.101**2  # initial volatility
    r = math.log(1.0319)  # risk-free rate
    rho = -0.7  # instantaneous correlation between Brownian motions
    sigma_v = 0.61  # variance of volatility
    kappa = 6.21  # mean reversion speed
    v_bar = 0.019  # mean variance

    # option parameters
    T = 1.0  # time to expiration
    K = 0.95  # strike price

    # simulation parameters
    nT = int(math.ceil(500 * T))  # number of time-steps to simulate
    R = 2000000  # actual number of paths to simulate for pricing
    warm_up_R = 20000  # reduced number of paths for the multi gpu warm up

    kwargs = \
        dict(x0=x0,
             v0=v0,
             r=r,
             rho=rho,
             sigma_v=sigma_v,
             kappa=kappa,
             v_bar=v_bar,
             T=T,
             K=K,
             nT=nT,
             R=R)

    # one benchmark per requested bundle
    numerical_package_bundle_to_result_map = {}
    for numerical_package_bundle in numerical_package_bundles:
        kwargs['numerical_package_bundle'] = numerical_package_bundle
        heston_benchmark_results = run_benchmark(**kwargs)

        numerical_package_bundle_to_result_map[numerical_package_bundle] \
            = heston_benchmark_results

        heston_benchmark_results.print_results()

    # both the multi core and the multi gpu benchmarks are gated on loky
    if not loky_installed():
        print(
            'Please install loky to enable multi core and multi gpu benchmarks'
        )
        return numerical_package_bundle_to_result_map

    # multi core benchmarks
    kwargs['numerical_package_bundle'] = NumpyMulticoreBundle

    # initialize Python processes; this first run is timed and printed but
    # not recorded as a benchmark result
    with Timer() as timer:
        simulate_and_compute_option_price_multicore(**kwargs)
        print(f'time in first run={timer.elapsed}')

    with Timer() as timer:
        option_price \
            = simulate_and_compute_option_price_multicore(**kwargs)

    numpy_multicore_results = \
        HestonBenchmarkResults(
            numerical_package_bundle=NumpyMulticoreBundle,
            total_time=timer.elapsed,
            option_price=option_price)

    numpy_multicore_results.print_results()

    numerical_package_bundle_to_result_map[NumpyMulticoreBundle] \
        = numpy_multicore_results

    # multi gpu benchmarks
    # the gpu pricing function is called without a numerical package bundle
    kwargs.pop('numerical_package_bundle', None)

    compute_device_pool = ComputeDevicePool()
    if compute_device_pool.number_of_devices > 1:
        # warm up with a reduced number of paths; kwargs itself keeps the
        # full number of paths for the actual benchmark below
        simulate_and_compute_option_price_gpu(
            compute_device_pool=compute_device_pool,
            **{**kwargs, 'R': warm_up_R})

        # actual benchmark
        for number_of_gpus in range(
                1, compute_device_pool.number_of_devices + 1):
            cocos_multi_gpu_bundle = CocosMultiGPUBundle(
                number_of_gpus=number_of_gpus)

            with Timer() as timer:
                option_price = \
                    simulate_and_compute_option_price_gpu(
                        compute_device_pool=compute_device_pool,
                        number_of_batches=number_of_gpus,
                        **kwargs)
                # synchronize inside the timed region
                cocos.device.sync()

            cocos_multi_gpu_results = \
                HestonBenchmarkResults(
                    numerical_package_bundle=cocos_multi_gpu_bundle,
                    total_time=timer.elapsed,
                    option_price=option_price)

            cocos_multi_gpu_results.print_results()

            numerical_package_bundle_to_result_map[cocos_multi_gpu_bundle] \
                = cocos_multi_gpu_results

    return numerical_package_bundle_to_result_map
def main():
    """
    Runs all pi estimation benchmarks (single core NumPy, single core NumExpr,
    multi core NumPy, single GPU Cocos, multi GPU Cocos and single GPU CuPy),
    prints each runtime, then prints a result table and creates a bar plot.

    The multi core, single GPU, multi GPU and CuPy benchmarks are each
    preceded by a small call with n=100 whose result is discarded. Every
    measured run receives the same `repetitions` value.
    """
    n = 1000000000
    repetitions = 1
    batches = 20
    means_of_computation_to_runtime_map = {}

    # single core benchmark (NumPy)
    single_core_runtime = single_core_benchmark(n, repetitions=repetitions)
    means_of_computation_to_runtime_map[
        SINGLE_CORE_NUMPY] = single_core_runtime
    print(
        f'Estimation of pi using single core NumPy performed in {single_core_runtime} seconds'
    )

    # single core benchmark (NumExpr)
    single_core_runtime_numexpr = single_core_benchmark_numexpr(
        n, repetitions=repetitions)
    means_of_computation_to_runtime_map[
        'NumExpr Single Core'] = single_core_runtime_numexpr
    print(
        f'Estimation of pi using single core Numexpr performed in {single_core_runtime_numexpr} seconds'
    )

    # multi core benchmark
    core_config = range(1, multiprocessing.cpu_count() + 1)

    # small preliminary run; its result is discarded
    multi_core_benchmark(n=100,
                         core_config=core_config,
                         repetitions=repetitions)
    # measured run; repetitions is passed explicitly like in the other
    # measured benchmarks instead of relying on the callee's default
    number_of_cores_to_runtime_map = multi_core_benchmark(
        n=n, core_config=core_config, repetitions=repetitions)

    for number_of_cores_to_use, cpu_time in \
            number_of_cores_to_runtime_map.items():
        means_of_computation_to_runtime_map[
            f'NumPy with {number_of_cores_to_use} CPU core(s)'] = cpu_time
        print(
            f'Estimation of pi on {number_of_cores_to_use} core(s) using NumPy performed in {cpu_time} seconds'
        )

    # single gpu
    single_gpu_benchmark(n=100, batches=1)
    single_gpu_runtime = single_gpu_benchmark(n=n,
                                              batches=batches,
                                              repetitions=repetitions)
    means_of_computation_to_runtime_map[
        'Cocos Single GPU'] = single_gpu_runtime
    print(
        f'Estimation of pi using single GPU Cocos performed in {single_gpu_runtime} seconds'
    )

    # multi gpu benchmark
    gpu_pool = ComputeDevicePool()

    # small preliminary run; its result is discarded
    multi_gpu_benchmark(n=100,
                        batches=batches,
                        gpu_pool=gpu_pool,
                        repetitions=repetitions)
    # measured run; repetitions is passed explicitly (see multi core above)
    number_of_devices_to_runtime_map = multi_gpu_benchmark(
        n=n,
        batches=batches,
        gpu_pool=gpu_pool,
        repetitions=repetitions)

    for number_of_devices_to_use, gpu_time in \
            number_of_devices_to_runtime_map.items():
        means_of_computation_to_runtime_map[
            f'Cocos with {number_of_devices_to_use} GPU(s)'] = gpu_time
        print(
            f'Estimation of pi on {number_of_devices_to_use} GPUs in {gpu_time} seconds'
        )

    # speedup of every multi GPU configuration relative to a single GPU
    if gpu_pool.number_of_devices > 1:
        single_device_runtime = number_of_devices_to_runtime_map[1]
        for number_of_devices_to_use in range(2,
                                              gpu_pool.number_of_devices + 1):
            speedup = single_device_runtime / \
                number_of_devices_to_runtime_map[number_of_devices_to_use]
            print(
                f'Performance on {number_of_devices_to_use} GPUs increased by a factor of'
                f' {speedup} '
                f'over a single GPU.')

    # cupy single gpu
    # Best effort: any failure here is reported and the remaining output
    # (table and plot) is still produced without a CuPy entry.
    try:
        single_gpu_cupy_benchmark(n=100, batches=1)
        single_gpu_cupy_runtime = single_gpu_cupy_benchmark(
            n=n, batches=batches, repetitions=repetitions)
        means_of_computation_to_runtime_map[
            'CuPy Single GPU'] = single_gpu_cupy_runtime
        print(
            f'Estimation of pi using single GPU CuPy performed in {single_gpu_cupy_runtime} seconds'
        )
    except Exception as e:
        print(e)
        print('CuPy is not installed or not working correctly.')

    print(create_result_table(means_of_computation_to_runtime_map))
    create_bar_plot(means_of_computation_to_runtime_map)
Example #5
0
    plt.figure(1)
    plt.bar(y_pos, performance, align='center', alpha=0.5)
    plt.xticks(y_pos, objects)
    plt.ylabel('Speedup Factor')
    plt.title('Performance Relative to a Single GPU \n'
              'in Monte Carlo Simulation of Heston Model \n')

    plt.savefig(f'heston_pricing_benchmark_results_multi_gpu')

    plt.show()


if __name__ == '__main__':
    info()

    gpu_pool = ComputeDevicePool()

    # model parameters
    x0 = 0.0  # initial log stock price
    v0 = 0.101**2  # initial volatility
    r = math.log(1.0319)  # risk-free rate
    rho = -0.7  # instantaneous correlation between Brownian motions
    sigma_v = 0.61  # variance of volatility
    kappa = 6.21  # mean reversion speed
    v_bar = 0.019  # mean variance

    # option parameters
    T = 1.0  # time to expiration
    K = 0.95  # strike price

    # simulation parameters
Example #6
0
    def evaluate_in_batches_on_multiple_devices(self,
                                                points: NumericArray,
                                                maximum_number_of_elements_per_batch: int,
                                                compute_device_pool: ComputeDevicePool) \
            -> np.ndarray:
        """
        Evaluates a Gaussian KDE in batches on multiple gpus and stores the results in main memory.

        The evaluation points are first split column-wise into one slice per
        device in the pool. Each slice is then processed in batches of
        floor(maximum_number_of_elements_per_batch / (n * d)) points, and all
        partial results are joined with `np.hstack`.

        Args:
            points:
                numeric array with shape (d, m) containing the points at which to evaluate the kernel
                density estimate
            maximum_number_of_elements_per_batch:
                maximum number of data points times evaluation points to process in a single batch
            compute_device_pool:
                pool whose `number_of_devices` determines the number of slices
                and whose `map_combine` dispatches the per-slice evaluation

        Returns:
            a m-dimensional NumPy array of kernel density estimates

        Raises:
            ValueError: if `self.gpu` is set
        """
        if self.gpu:
            raise ValueError(
                'Multi GPU evaluation requires gaussian_kde.gpu = False.')

        points = self._check_and_adjust_dimensions_of_points(points)
        total_points = points.shape[1]
        device_count = compute_device_pool.number_of_devices

        # one positional-argument list (a single column slice) per device
        per_device_args = [
            [points[:, first:last]]
            for first, last in generate_slices_with_number_of_batches(
                total_points, device_count)
        ]

        # every device gets the same keyword arguments; the list repeats one
        # shared dictionary
        shared_kwargs = {
            'maximum_number_of_elements_per_batch': maximum_number_of_elements_per_batch,
            'n': self.n,
            'd': self.d,
        }
        per_device_kwargs = device_count * [shared_kwargs]

        def evaluate_on_device(device_points,
                               maximum_number_of_elements_per_batch: int,
                               n: int,
                               d: int):
            # Evaluates the estimate for one device's slice, batch by batch.

            def evaluate_batch(batch_points):
                return gaussian_kernel_estimate_vectorized_whitened(
                    whitening=self.whitening,
                    whitened_points=self.whitened_points,
                    values=self.weights[:, None],
                    xi=batch_points.T,
                    norm=self.normalization_constant,
                    dtype=np.float32,
                    gpu=True)

            batch_size = math.floor(
                maximum_number_of_elements_per_batch / (n * d))
            batch_args = _split_points_into_batches(device_points, batch_size)

            return map_combine_single_device(
                f=evaluate_batch,
                combination=lambda parts: np.hstack(parts),
                args_list=batch_args)

        return compute_device_pool.map_combine(
            f=evaluate_on_device,
            combination=lambda parts: np.hstack(parts),
            args_list=per_device_args,
            kwargs_list=per_device_kwargs)
    plt.figure(1)
    plt.bar(y_pos, performance, align='center', alpha=0.5)
    plt.xticks(y_pos, objects)
    plt.ylabel('Speedup Factor')
    plt.title('Performance Relative to a Single GPU \n'
              'in Monte Carlo Simulation of Heston Model \n')

    plt.savefig(f'heston_pricing_benchmark_results_multi_gpu')

    plt.show()


if __name__ == '__main__':
    info()

    compute_device_pool = ComputeDevicePool()

    # model parameters
    x0 = 0.0  # initial log stock price
    v0 = 0.101**2  # initial volatility
    r = math.log(1.0319)  # risk-free rate
    rho = -0.7  # instantaneous correlation between Brownian motions
    sigma_v = 0.61  # variance of volatility
    kappa = 6.21  # mean reversion speed
    v_bar = 0.019  # mean variance

    # option parameters
    T = 1.0  # time to expiration
    K = 0.95  # strike price

    # simulation parameters