def timed_median_filter(self, runs, filter_size):
    """Benchmark the CuPy median filter over ``runs`` repetitions.

    Measures CPU->GPU transfer, kernel execution, and GPU->CPU retrieval,
    partitioning the data when it does not fit on the GPU in one piece.

    :param runs: number of times the filter kernel is timed.
    :param filter_size: 2-tuple; element [0] is treated as the filter
        height, element [1] as the filter width (see note below).
    :return: total transfer time plus mean kernel time, or 0 when the GPU
        runs out of memory.
    """
    # Synchronize and free memory before making an assessment about available space
    free_memory_pool()
    # Determine the number of partitions required (not taking the padding into account)
    n_partitions_needed = number_of_partitions_needed(
        self.cpu_arrays[:1], get_free_bytes())
    transfer_time = 0
    operation_time = 0
    # NOTE(review): pad_height is derived from filter_size[1] but applied to
    # axis 1, and pad_width from filter_size[0] applied to axis 2, while
    # filter_height/filter_width use the opposite elements. The naming looks
    # crossed — preserved as-is; confirm against the kernel's convention.
    pad_height = filter_size[1] // 2
    pad_width = filter_size[0] // 2
    filter_height = filter_size[0]
    filter_width = filter_size[1]
    if n_partitions_needed == 1:
        # Time the transfer from CPU to GPU
        start = get_synchronized_time()
        gpu_arrays = self._send_arrays_to_gpu([self.cpu_arrays[0]])
        # Return 0 when GPU is out of space (mirrors the partitioned branch).
        if not gpu_arrays:
            return 0
        # FIX: _send_arrays_to_gpu returns a list; the original passed the
        # list itself to cp.pad and cupy_median_filter instead of the array.
        gpu_data_array = gpu_arrays[0]
        padded_array = cp.pad(
            gpu_data_array,
            pad_width=((0, 0), (pad_width, pad_width),
                       (pad_height, pad_height)),
        )
        transfer_time = get_synchronized_time() - start
        # Repeat the operation
        for _ in range(runs):
            operation_time += time_function(lambda: cupy_median_filter(
                gpu_data_array, padded_array, filter_height, filter_width))
        # Time the transfer from GPU to CPU
        transfer_time += time_function(gpu_data_array[0].get)
        # Free the GPU arrays
        free_memory_pool([gpu_data_array, padded_array])
    else:
        # The full padded copy is only needed for partition sizing here, so
        # it is built in this branch only (the single-partition path pads on
        # the GPU instead).
        padded_cpu_array = np.pad(
            self.cpu_arrays[0],
            pad_width=((0, 0), (pad_width, pad_width),
                       (pad_height, pad_height)),
        )
        # Determine the number of partitions required again (to be on the safe side)
        n_partitions_needed = number_of_partitions_needed(
            [self.cpu_arrays[0], padded_cpu_array], get_free_bytes())
        indices = get_array_partition_indices(self.cpu_arrays[0].shape[0],
                                              n_partitions_needed)
        gpu_arrays = self._send_arrays_to_gpu([
            np.empty_like(
                self.cpu_arrays[0][indices[0][0]:indices[0][1]:, :]),
            np.empty_like(padded_cpu_array)[
                indices[0][0]:indices[0][1]:, :],
        ])
        # Return 0 when GPU is out of space
        if not gpu_arrays:
            return 0
        gpu_data_array, gpu_padded_array = gpu_arrays
        for i in range(n_partitions_needed):
            # Retrieve the segments used for this iteration of the operation
            split_cpu_array = self.cpu_arrays[0][
                indices[i][0]:indices[i][1]:, :]
            # Time transferring the segments to the GPU
            start = get_synchronized_time()
            if split_cpu_array.shape == gpu_data_array.shape:
                gpu_data_array.set(split_cpu_array,
                                   cp.cuda.Stream(non_blocking=True))
                gpu_padded_array.set(
                    np.pad(
                        split_cpu_array,
                        pad_width=(
                            (0, 0),
                            (pad_width, pad_width),
                            (pad_height, pad_height),
                        ),
                    ),
                    cp.cuda.Stream(non_blocking=True),
                )
            else:
                # Last partition may be shorter than the GPU buffers;
                # zero-pad it up to the buffer shape before the copy.
                diff = gpu_data_array.shape[0] - split_cpu_array.shape[0]
                expanded_cpu_array = np.pad(split_cpu_array,
                                            pad_width=((0, diff), (0, 0),
                                                       (0, 0)))
                gpu_data_array.set(expanded_cpu_array,
                                   cp.cuda.Stream(non_blocking=True))
                padded_cpu_array = np.pad(
                    expanded_cpu_array,
                    pad_width=(
                        (0, 0),
                        (pad_width, pad_width),
                        (pad_height, pad_height),
                    ),
                )
                gpu_padded_array.set(padded_cpu_array,
                                     cp.cuda.Stream(non_blocking=True))
            transfer_time += get_synchronized_time() - start
            try:
                # Carry out the operation on the slices
                for _ in range(runs):
                    operation_time += time_function(
                        lambda: cupy_median_filter(
                            gpu_data_array,
                            gpu_padded_array,
                            filter_height,
                            filter_width,
                        ))
            except cp.cuda.memory.OutOfMemoryError as e:
                print(
                    "Unable to make extra arrays during operation despite successful transfer."
                )
                print(e)
                free_memory_pool([gpu_data_array, gpu_padded_array])
                return 0
            # Store time taken to transfer result
            transfer_time += time_function(gpu_data_array[0].get)
        # Free GPU arrays
        free_memory_pool([gpu_padded_array, gpu_data_array])
    self.print_operation_times(
        total_time=operation_time,
        operation_name="Median Filter",
        runs=runs,
        transfer_time=transfer_time,
    )
    return transfer_time + operation_time / runs
def timed_imaging_operation(self, runs, alg, alg_name, n_arrs_needed):
    """Benchmark an imaging operation ``alg`` over ``runs`` repetitions.

    Times CPU->GPU transfer, the operation itself, and GPU->CPU retrieval,
    partitioning the input arrays when they do not all fit on the GPU.

    :param runs: number of timed repetitions of the operation.
    :param alg: callable applied to the first ``n_arrs_needed`` GPU arrays.
    :param alg_name: label used when printing the timing summary.
    :param n_arrs_needed: how many of ``self.cpu_arrays`` the operation uses.
    :return: total transfer time plus mean operation time, or 0 when the
        GPU runs out of memory.
    """
    # Synchronize and free memory before making an assessment about available space
    free_memory_pool()
    # Determine the number of partitions required
    n_partitions_needed = number_of_partitions_needed(
        self.cpu_arrays, get_free_bytes())
    transfer_time = 0
    operation_time = 0
    if n_partitions_needed == 1:
        # Time the transfer from CPU to GPU
        start = get_synchronized_time()
        gpu_arrays = self._send_arrays_to_gpu(
            self.cpu_arrays[:n_arrs_needed])
        transfer_time = get_synchronized_time() - start
        # Repeat the operation
        for _ in range(runs):
            operation_time += time_function(
                lambda: alg(*gpu_arrays[:n_arrs_needed]))
        # Time the transfer from GPU to CPU
        transfer_time += time_function(gpu_arrays[0].get)
        # Free the GPU arrays
        # FIX: removed leftover "Before:/After: mempool.used_bytes()" debug
        # prints that polluted the benchmark output.
        free_memory_pool(gpu_arrays)
    else:
        # Determine the number of partitions required again (to be on the safe side)
        n_partitions_needed = number_of_partitions_needed(
            self.cpu_arrays[:n_arrs_needed], get_free_bytes())
        indices = get_array_partition_indices(self.cpu_arrays[0].shape[0],
                                              n_partitions_needed)
        gpu_arrays = self._send_arrays_to_gpu([
            np.empty_like(arr[indices[0][0]:indices[0][1]:, :])
            for arr in self.cpu_arrays[:n_arrs_needed]
        ])
        # Return 0 when GPU is out of space
        if not gpu_arrays:
            return 0
        for i in range(n_partitions_needed):
            # Retrieve the segments used for this iteration of the operation
            # FIX: only slice the arrays the operation actually uses — the
            # original sliced (and, below, padded) every array in
            # self.cpu_arrays, and np.pad copies its input.
            split_cpu_arrays = [
                cpu_array[indices[i][0]:indices[i][1]:, :]
                for cpu_array in self.cpu_arrays[:n_arrs_needed]
            ]
            shape_diff = gpu_arrays[0].shape[0] - split_cpu_arrays[
                0].shape[0]
            # Time transferring the segments to the GPU
            start = get_synchronized_time()
            if shape_diff == 0:
                for j in range(n_arrs_needed):
                    replace_gpu_array_contents(gpu_arrays[j],
                                               split_cpu_arrays[j])
            else:
                # Last partition may be shorter than the GPU buffers;
                # zero-pad the segments up to the buffer shape.
                expanded_cpu_arrays = [
                    np.pad(arr, pad_width=((0, shape_diff), (0, 0), (0, 0)))
                    for arr in split_cpu_arrays
                ]
                for j in range(n_arrs_needed):
                    replace_gpu_array_contents(gpu_arrays[j],
                                               expanded_cpu_arrays[j])
            transfer_time += get_synchronized_time() - start
            try:
                # Carry out the operation on the slices
                for _ in range(runs):
                    operation_time += time_function(
                        lambda: alg(*gpu_arrays[:n_arrs_needed]))
            except cp.cuda.memory.OutOfMemoryError as e:
                print(
                    "Unable to make extra arrays during operation despite successful transfer."
                )
                print(e)
                free_memory_pool(gpu_arrays)
                return 0
            # Store time taken to transfer result
            transfer_time += time_function(gpu_arrays[0].get)
        # Free GPU arrays
        free_memory_pool(gpu_arrays)
    self.print_operation_times(
        total_time=operation_time,
        operation_name=alg_name,
        runs=runs,
        transfer_time=transfer_time,
    )
    return transfer_time + operation_time / runs
def timed_imaging_operation(self, runs, alg, alg_name, n_arrs_needed,
                            n_gpu_arrs_needed):
    """Benchmark a Numba CUDA imaging operation over ``runs`` repetitions.

    Times CPU->GPU transfer, the operation, and the copy of the output
    array back to the host, partitioning the inputs when they do not fit
    on the GPU in one piece.

    :param runs: number of timed repetitions of the operation.
    :param alg: callable taking ``n_arrs_needed`` GPU input arrays plus a
        GPU output array.
    :param alg_name: label used when printing the timing summary.
    :param n_arrs_needed: how many of ``self.cpu_arrays`` the operation uses.
    :param n_gpu_arrs_needed: array count passed through to the partition
        estimate and GPU transfer helpers.
    :return: total transfer time plus mean operation time, or 0 when the
        GPU runs out of memory.
    """
    # Synchronize and free memory before making an assessment about available space
    self.clear_cuda_memory()
    n_partitions_needed = num_partitions_needed(self.cpu_arrays[0],
                                                n_gpu_arrs_needed,
                                                get_free_bytes())
    transfer_time = 0
    operation_time = 0
    if n_partitions_needed == 1:
        cpu_result_array = np.empty_like(self.cpu_arrays[0])
        # Time transfer from CPU to GPU
        start = self.get_time()
        gpu_input_arrays = self._send_arrays_to_gpu(
            self.cpu_arrays[:n_arrs_needed], n_gpu_arrs_needed)
        gpu_output_array = self._send_arrays_to_gpu([cpu_result_array],
                                                    n_gpu_arrs_needed)[0]
        transfer_time += self.get_time() - start
        # Repeat the operation
        for _ in range(runs):
            operation_time += self.time_function(lambda: alg(
                *gpu_input_arrays[:n_arrs_needed], gpu_output_array))
        stream = cuda.stream()
        self.streams.append(stream)
        # Time the transfer from GPU to CPU
        transfer_time += self.time_function(
            lambda: gpu_output_array.copy_to_host(cpu_result_array, stream
                                                  ))
        # Free the GPU arrays
        self.clear_cuda_memory(gpu_input_arrays)
    else:
        # Determine the number of partitions required again (to be on the safe side)
        n_partitions_needed = num_partitions_needed(
            self.cpu_arrays[0], n_gpu_arrs_needed, get_free_bytes())
        indices = get_array_partition_indices(self.cpu_arrays[0].shape[0],
                                              n_partitions_needed)
        for i in range(n_partitions_needed):
            # Retrieve the segments used for this iteration of the operation
            split_cpu_arrays = [
                cpu_array[indices[i][0]:indices[i][1]:, :]
                for cpu_array in self.cpu_arrays
            ]
            # FIX: the template must be the first segment, not segment [i] —
            # split_cpu_arrays has one entry per CPU array, not per
            # partition, so indexing by the partition counter raised
            # IndexError once n_partitions_needed exceeded the array count.
            cpu_result_array = np.empty_like(split_cpu_arrays[0])
            # Time transferring the segments to the GPU
            start = self.get_time()
            gpu_input_arrays = self._send_arrays_to_gpu(
                split_cpu_arrays, n_gpu_arrs_needed)
            gpu_output_array_list = self._send_arrays_to_gpu(
                [cpu_result_array], n_gpu_arrs_needed)
            transfer_time += self.get_time() - start
            # Return 0 when GPU is out of space
            if not gpu_input_arrays:
                return 0
            if not gpu_output_array_list:
                return 0
            gpu_output_array = gpu_output_array_list[0]
            # Carry out the operation on the slices
            for _ in range(runs):
                operation_time += self.time_function(lambda: alg(
                    *gpu_input_arrays[:n_arrs_needed], gpu_output_array))
            stream = cuda.stream()
            self.streams.append(stream)
            transfer_time += self.time_function(
                lambda: gpu_output_array.copy_to_host(
                    cpu_result_array, stream))
            # Free GPU arrays and partition arrays
            self.clear_cuda_memory(gpu_input_arrays + [gpu_output_array])
    if transfer_time > 0 and operation_time > 0:
        self.print_operation_times(operation_time, alg_name, runs,
                                   transfer_time)
    self.synchronise()
    return transfer_time + operation_time / runs
def timed_imaging_operation(self, runs, alg, alg_name, n_arrs_needed):
    """Benchmark an imaging operation ``alg`` over ``runs`` repetitions.

    Times CPU->GPU transfer, the operation, and the asynchronous GPU->CPU
    retrieval, partitioning the inputs when they do not fit on the GPU.

    :param runs: number of timed repetitions of the operation.
    :param alg: callable applied to the first ``n_arrs_needed`` GPU arrays.
    :param alg_name: label used when printing the timing summary.
    :param n_arrs_needed: how many of ``self.cpu_arrays`` the operation uses.
    :return: total transfer time plus mean operation time, or 0 when the
        GPU runs out of memory.
    """
    n_partitions_needed = num_partitions_needed(self.cpu_arrays,
                                                get_free_bytes())
    transfer_time = 0
    operation_time = 0
    if n_partitions_needed == 1:
        # Time transfer from CPU to GPU
        start = get_time()
        gpu_input_arrays = self._send_arrays_to_gpu(
            self.cpu_arrays[:n_arrs_needed])
        transfer_time += get_time() - start
        # Repeat the operation
        for _ in range(runs):
            operation_time += time_function(
                lambda: alg(*gpu_input_arrays[:n_arrs_needed]))
        # Time the transfer from GPU to CPU
        # FIX: pass the bound method so time_function actually invokes the
        # transfer — the original wrapped it as "lambda: ...get_async",
        # which returned the method without calling it, so nothing was
        # transferred or timed.
        transfer_time += time_function(gpu_input_arrays[0].get_async)
        # Free the GPU arrays
        free_memory_pool(gpu_input_arrays)
    else:
        # Determine the number of partitions required again (to be on the safe side)
        n_partitions_needed = num_partitions_needed(
            self.cpu_arrays, get_free_bytes())
        indices = get_array_partition_indices(self.cpu_arrays[0].shape[0],
                                              n_partitions_needed)
        for i in range(n_partitions_needed):
            # Retrieve the segments used for this iteration of the operation
            split_cpu_arrays = [
                cpu_array[indices[i][0]:indices[i][1]:, :]
                for cpu_array in self.cpu_arrays
            ]
            # Time transferring the segments to the GPU
            start = get_time()
            gpu_input_arrays = self._send_arrays_to_gpu(split_cpu_arrays)
            transfer_time += get_time() - start
            # Return 0 when GPU is out of space
            if not gpu_input_arrays:
                return 0
            # Carry out the operation on the slices
            for _ in range(runs):
                operation_time += time_function(
                    lambda: alg(*gpu_input_arrays[:n_arrs_needed]))
            # FIX: same uncalled-method bug as above.
            transfer_time += time_function(gpu_input_arrays[0].get_async)
            # Free the GPU arrays
            free_memory_pool(gpu_input_arrays)
    if transfer_time > 0 and operation_time > 0:
        self.print_operation_times(operation_time, alg_name, runs,
                                   transfer_time)
    self.synchronise()
    return transfer_time + operation_time / runs
def timed_median_filter(self, runs, filter_size):
    """Benchmark the PyCUDA median filter over ``runs`` repetitions.

    Measures CPU->GPU transfer (including host-side padding creation),
    kernel execution, and asynchronous GPU->CPU retrieval, partitioning
    the data when it does not fit on the GPU in one piece.

    :param runs: number of times the filter kernel is timed.
    :param filter_size: 2-tuple; element [0] is treated as the filter
        height, element [1] as the filter width (see note below).
    :return: total transfer time plus mean kernel time.
    """
    n_partitions_needed = num_partitions_needed(self.cpu_arrays[:1],
                                                get_free_bytes())
    transfer_time = 0
    operation_time = 0
    # NOTE(review): pad_height comes from filter_size[1] while
    # filter_height comes from filter_size[0] — the naming looks crossed;
    # preserved as-is, confirm against create_padded_array/kernel.
    pad_height = filter_size[1] // 2
    pad_width = filter_size[0] // 2
    filter_height = filter_size[0]
    filter_width = filter_size[1]
    if n_partitions_needed == 1:
        # Time transfer from CPU to GPU (and padding creation)
        start = get_time()
        cpu_padded_array = create_padded_array(self.cpu_arrays[0],
                                               pad_height, pad_width)
        gpu_data_array, gpu_padded_array = self._send_arrays_to_gpu(
            [self.cpu_arrays[0], cpu_padded_array])
        transfer_time += get_time() - start
        # Sanity-check the GPU result against scipy (also warms the kernel
        # up before the timed loop).
        # FIX: scipy's median_filter returns a new array rather than
        # filtering in place, so its result must be captured; the original
        # discarded it, printed a boolean array from np.isclose, and
        # unconditionally claimed "Assertion passed.".
        expected = scipy_median_filter(self.cpu_arrays[0],
                                       size=filter_size)
        pycuda_median_filter(gpu_data_array, gpu_padded_array,
                             filter_height, filter_width)
        assert np.allclose(expected, gpu_data_array.get())
        print("Assertion passed.")
        # Repeat the operation
        for _ in range(runs):
            operation_time += time_function(lambda: pycuda_median_filter(
                gpu_data_array, gpu_padded_array, filter_height,
                filter_width))
        # Time the transfer from GPU to CPU
        # FIX: pass the bound method so time_function actually invokes the
        # transfer — "lambda: ...get_async" returned the method uncalled.
        transfer_time += time_function(gpu_data_array.get_async)
        # Free the GPU arrays
        free_memory_pool([gpu_data_array, gpu_padded_array])
    else:
        # Determine the number of partitions required again (to be on the safe side)
        n_partitions_needed = num_partitions_needed(
            self.cpu_arrays[:1], get_free_bytes())
        indices = get_array_partition_indices(self.cpu_arrays[0].shape[0],
                                              n_partitions_needed)
        for i in range(n_partitions_needed):
            # Retrieve the segments used for this iteration of the operation
            split_cpu_array = self.cpu_arrays[0][
                indices[i][0]:indices[i][1]:, :]
            # Time transferring the segments to the GPU
            cpu_padded_array = create_padded_array(split_cpu_array,
                                                   pad_height, pad_width)
            start = get_time()
            gpu_data_array, gpu_padded_array = self._send_arrays_to_gpu(
                [split_cpu_array, cpu_padded_array])
            transfer_time += get_time() - start
            # Carry out the operation on the slices
            for _ in range(runs):
                operation_time += time_function(
                    lambda: pycuda_median_filter(
                        gpu_data_array,
                        gpu_padded_array,
                        filter_height,
                        filter_width,
                    ))
            # FIX: same uncalled-method bug as above.
            transfer_time += time_function(gpu_data_array.get_async)
            # Free the GPU arrays
            free_memory_pool([gpu_data_array, gpu_padded_array])
    if transfer_time > 0 and operation_time > 0:
        self.print_operation_times(operation_time, "median filter", runs,
                                   transfer_time)
    self.synchronise()
    # FIX: divide by the runs parameter, not the global N_RUNS — the two
    # can differ, and every sibling timing method divides by runs.
    return transfer_time + operation_time / runs