def single_gpu_benchmark():
    """Register kernel-only scenarios sweeping domain size and CUDA block shape.

    Sweeps cubic domains (64^3 .. 384^3) plus one flat 512x512x128 domain
    against a list of CUDA thread-block shapes, adding one 'kernelOnly'
    scenario per combination to a new ScenarioManager.

    The time-step count is scaled inversely with the cell count so every
    run performs roughly the same total work as 1000 steps on a 128^3
    domain.
    """
    scenarios = wlb.ScenarioManager()
    block_sizes = [(i, i, i) for i in (64, 128, 256, 384)] + [(512, 512, 128)]
    cuda_blocks = [(32, 1, 1), (64, 1, 1), (128, 1, 1), (256, 1, 1), (512, 1, 1),
                   (32, 2, 1), (64, 2, 1), (128, 2, 1), (256, 2, 1),
                   (32, 4, 1), (64, 4, 1), (128, 4, 1),
                   (32, 8, 1), (64, 8, 1),
                   (32, 16, 1)]
    for block_size in block_sizes:
        for cuda_block_size in cuda_blocks:
            cells = block_size[0] * block_size[1] * block_size[2]
            # Normalize work across domain sizes: 1000 steps at 128^3,
            # proportionally fewer steps for larger domains.
            time_steps_for_128_cubed = 1000
            time_steps = (128 ** 3 / cells) * time_steps_for_128_cubed
            scenario = Scenario(cells_per_block=block_size,
                                gpuBlockSize=cuda_block_size,
                                timeStepStrategy='kernelOnly',
                                timesteps=int(time_steps))
            scenarios.add(scenario)
def overlap_benchmark():
    """Register scenarios comparing communication/computation overlap strategies.

    For each communication scheme, first adds a 'noOverlap' baseline
    scenario, then one scenario per combination of overlap strategy
    ('simpleOverlap' / 'complexOverlap') and inner/outer split.
    """
    scenarios = wlb.ScenarioManager()
    inner_outer_splits = [(1, 1, 1), (4, 1, 1), (8, 1, 1), (16, 1, 1), (32, 1, 1),
                          (4, 4, 1), (8, 8, 1), (16, 16, 1), (32, 32, 1),
                          (4, 4, 4), (8, 8, 8), (16, 16, 16), (32, 32, 32)]

    # 'GPUPackInfo_Baseline', 'GPUPackInfo_Streams'
    for comm_strategy in ['UniformGPUScheme_Baseline', 'UniformGPUScheme_Memcpy']:
        # no overlap: baseline with a trivial (1, 1, 1) split
        scenarios.add(
            Scenario(timeStepStrategy='noOverlap',
                     communicationScheme=comm_strategy,
                     innerOuterSplit=(1, 1, 1)))
        # overlap: every strategy x split combination
        for overlap_strategy in ['simpleOverlap', 'complexOverlap']:
            for inner_outer_split in inner_outer_splits:
                scenario = Scenario(timeStepStrategy=overlap_strategy,
                                    communicationScheme=comm_strategy,
                                    innerOuterSplit=inner_outer_split)
                scenarios.add(scenario)
# encoding: utf-8 import itertools import waLBerla as wlb from base import get_block_decomposition, communication_schemes, overlap_communication, \ cuda_enabled_mpi, num_processes, calculate_time_steps, side_length_to_fill_memory from benchmark import BenchmarkScenario, CommunicationSchemeType # Stores the scenarios for the current simulation scenarios = wlb.ScenarioManager() # Generates all block decompositions of xyz, 2 directions at a time #block_decompositions = itertools.combinations_with_replacement('xyz', r=2) block_decompositions = ['xyz', 'yzx', 'zyx', 'yxz'] # compute number of cells depending on GPU memory i.e. by specifying the percentage of GPU memory to fill gpu_memory_gb = 16 cells_per_block = [ side_length_to_fill_memory(pc, gpu_memory_gb) for pc in (0.8, 0.5, 0.05) ] expected_mlups = 200 # to compute how many time steps have to be done time_per_scenarios = 5 # benchmark time in seconds fully_periodic = [False, True] if num_processes == 1: scenario_generator = itertools.product(communication_schemes, [ False, ], [ False,