    def timed_median_filter(self, runs, filter_size):

        # Synchronize and free memory before making an assessment about available space
        free_memory_pool()

        # Determine the number of partitions required (not taking the padding into account)
        n_partitions_needed = number_of_partitions_needed(
            self.cpu_arrays[:1], get_free_bytes())

        transfer_time = 0
        operation_time = 0

        # filter_size is (height, width), so pad each spatial axis by half of
        # the corresponding filter dimension
        pad_height = filter_size[0] // 2
        pad_width = filter_size[1] // 2

        filter_height = filter_size[0]
        filter_width = filter_size[1]

        padded_cpu_array = np.pad(
            self.cpu_arrays[0],
            pad_width=((0, 0), (pad_height, pad_height), (pad_width,
                                                          pad_width)),
        )

        if n_partitions_needed == 1:

            # Time the transfer from CPU to GPU
            start = get_synchronized_time()
            # _send_arrays_to_gpu returns a list, so unpack the single array
            gpu_data_array = self._send_arrays_to_gpu([self.cpu_arrays[0]])[0]
            padded_array = cp.pad(
                gpu_data_array,
                pad_width=((0, 0), (pad_height, pad_height), (pad_width,
                                                              pad_width)),
            )
            transfer_time = get_synchronized_time() - start

            # Repeat the operation
            for _ in range(runs):
                operation_time += time_function(lambda: cupy_median_filter(
                    gpu_data_array, padded_array, filter_height, filter_width))

            # Time the transfer from GPU to CPU
            transfer_time += time_function(gpu_data_array.get)

            # Free the GPU arrays
            free_memory_pool([gpu_data_array, padded_array])

        else:

            # Determine the number of partitions required again (to be on the safe side)
            n_partitions_needed = number_of_partitions_needed(
                [self.cpu_arrays[0], padded_cpu_array], get_free_bytes())

            indices = get_array_partition_indices(self.cpu_arrays[0].shape[0],
                                                  n_partitions_needed)

            gpu_arrays = self._send_arrays_to_gpu([
                np.empty_like(
                    self.cpu_arrays[0][indices[0][0]:indices[0][1], :, :]),
                np.empty_like(
                    padded_cpu_array[indices[0][0]:indices[0][1], :, :]),
            ])

            # Return 0 when GPU is out of space
            if not gpu_arrays:
                return 0

            gpu_data_array, gpu_padded_array = gpu_arrays

            for i in range(n_partitions_needed):

                # Retrieve the segments used for this iteration of the operation
                split_cpu_array = self.cpu_arrays[0][
                    indices[i][0]:indices[i][1], :, :]

                # Time transferring the segments to the GPU
                start = get_synchronized_time()

                if split_cpu_array.shape == gpu_data_array.shape:
                    gpu_data_array.set(split_cpu_array,
                                       cp.cuda.Stream(non_blocking=True))
                    gpu_padded_array.set(
                        np.pad(
                            split_cpu_array,
                            pad_width=(
                                (0, 0),
                                (pad_height, pad_height),
                                (pad_width, pad_width),
                            ),
                        ),
                        cp.cuda.Stream(non_blocking=True),
                    )
                else:

                    diff = gpu_data_array.shape[0] - split_cpu_array.shape[0]

                    expanded_cpu_array = np.pad(split_cpu_array,
                                                pad_width=((0, diff), (0, 0),
                                                           (0, 0)))
                    gpu_data_array.set(expanded_cpu_array,
                                       cp.cuda.Stream(non_blocking=True))

                    padded_split_cpu_array = np.pad(
                        expanded_cpu_array,
                        pad_width=(
                            (0, 0),
                            (pad_height, pad_height),
                            (pad_width, pad_width),
                        ),
                    )
                    gpu_padded_array.set(padded_split_cpu_array,
                                         cp.cuda.Stream(non_blocking=True))

                transfer_time += get_synchronized_time() - start

                try:
                    # Carry out the operation on the slices
                    for _ in range(runs):
                        operation_time += time_function(
                            lambda: cupy_median_filter(
                                gpu_data_array,
                                gpu_padded_array,
                                filter_height,
                                filter_width,
                            ))
                except cp.cuda.memory.OutOfMemoryError as e:
                    print(
                        "Unable to make extra arrays during operation despite successful transfer."
                    )
                    print(e)
                    free_memory_pool([gpu_data_array, gpu_padded_array])
                    return 0

                # Store time taken to transfer the result back to the CPU
                transfer_time += time_function(gpu_data_array.get)

            # Free the GPU arrays once every partition has been processed;
            # freeing inside the loop would invalidate the arrays that the
            # next iteration writes into with set()
            free_memory_pool([gpu_padded_array, gpu_data_array])

        self.print_operation_times(
            total_time=operation_time,
            operation_name="Median Filter",
            runs=runs,
            transfer_time=transfer_time,
        )

        return transfer_time + operation_time / runs
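
    # print_operation_times is used by both methods in this example but is
    # not shown on this page. This sketch is an assumption, not the original
    # helper: it reports the averaged operation time and the accumulated
    # transfer time, based on the keyword arguments it is called with.
    def print_operation_times(self, total_time, operation_name, runs,
                              transfer_time):
        print("%s: %.6fs per run over %d runs (%.6fs spent on transfers)" %
              (operation_name, total_time / runs, runs, transfer_time))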

    def timed_imaging_operation(self, runs, alg, alg_name, n_arrs_needed):

        # Synchronize and free memory before making an assessment about available space
        free_memory_pool()

        # Determine the number of partitions required
        n_partitions_needed = number_of_partitions_needed(
            self.cpu_arrays, get_free_bytes())

        transfer_time = 0
        operation_time = 0

        if n_partitions_needed == 1:

            # Time the transfer from CPU to GPU
            start = get_synchronized_time()
            gpu_arrays = self._send_arrays_to_gpu(
                self.cpu_arrays[:n_arrs_needed])
            transfer_time = get_synchronized_time() - start

            # Repeat the operation
            for _ in range(runs):
                operation_time += time_function(
                    lambda: alg(*gpu_arrays[:n_arrs_needed]))

            # Time the transfer from GPU to CPU
            transfer_time += time_function(gpu_arrays[0].get)

            # Free the GPU arrays
            free_memory_pool(gpu_arrays)

        else:

            # Determine the number of partitions required again (to be on the safe side)
            n_partitions_needed = number_of_partitions_needed(
                self.cpu_arrays[:n_arrs_needed], get_free_bytes())

            indices = get_array_partition_indices(self.cpu_arrays[0].shape[0],
                                                  n_partitions_needed)

            gpu_arrays = self._send_arrays_to_gpu([
                np.empty_like(arr[indices[0][0]:indices[0][1], :, :])
                for arr in self.cpu_arrays[:n_arrs_needed]
            ])

            # Return 0 when GPU is out of space
            if not gpu_arrays:
                return 0

            for i in range(n_partitions_needed):

                # Retrieve the segments used for this iteration of the operation
                split_cpu_arrays = [
                    cpu_array[indices[i][0]:indices[i][1], :, :]
                    for cpu_array in self.cpu_arrays[:n_arrs_needed]
                ]

                shape_diff = gpu_arrays[0].shape[0] - split_cpu_arrays[
                    0].shape[0]

                # Time transferring the segments to the GPU
                start = get_synchronized_time()
                if shape_diff == 0:
                    for j in range(n_arrs_needed):
                        replace_gpu_array_contents(gpu_arrays[j],
                                                   split_cpu_arrays[j])
                else:

                    expanded_cpu_arrays = [
                        np.pad(arr,
                               pad_width=((0, shape_diff), (0, 0), (0, 0)))
                        for arr in split_cpu_arrays
                    ]
                    for j in range(n_arrs_needed):
                        replace_gpu_array_contents(gpu_arrays[j],
                                                   expanded_cpu_arrays[j])

                transfer_time += get_synchronized_time() - start

                try:
                    # Carry out the operation on the slices
                    for _ in range(runs):
                        operation_time += time_function(
                            lambda: alg(*gpu_arrays[:n_arrs_needed]))
                except cp.cuda.memory.OutOfMemoryError as e:
                    print(
                        "Unable to make extra arrays during operation despite successful transfer."
                    )
                    print(e)
                    free_memory_pool(gpu_arrays)
                    return 0

                # Store time taken to transfer result
                transfer_time += time_function(gpu_arrays[0].get)

            # Free the GPU arrays
            free_memory_pool(gpu_arrays)

        self.print_operation_times(
            total_time=operation_time,
            operation_name=alg_name,
            runs=runs,
            transfer_time=transfer_time,
        )

        return transfer_time + operation_time / runs
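
The partitioning helpers these examples rely on are not included on this page. Below is a minimal sketch of what number_of_partitions_needed and get_array_partition_indices are assumed to compute, with names and signatures taken from the calls above; the real helpers may differ.

import math

def number_of_partitions_needed(cpu_arrays, free_bytes):
    # How many chunks the input arrays must be split into so that each chunk
    # fits into the currently free GPU memory.
    required_bytes = sum(arr.nbytes for arr in cpu_arrays)
    return int(math.ceil(required_bytes / free_bytes))

def get_array_partition_indices(n_images, n_partitions):
    # Split the image axis into contiguous (start, stop) index pairs; the
    # last partition may be shorter, which the callers above handle by
    # zero-padding the final segment up to the GPU array shape.
    step = int(math.ceil(n_images / n_partitions))
    return [(start, min(start + step, n_images))
            for start in range(0, n_images, step)]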
Example #3
    def timed_imaging_operation(self, runs, alg, alg_name, n_arrs_needed,
                                n_gpu_arrs_needed):

        # Synchronize and free memory before making an assessment about available space
        self.clear_cuda_memory()

        n_partitions_needed = num_partitions_needed(self.cpu_arrays[0],
                                                    n_gpu_arrs_needed,
                                                    get_free_bytes())

        transfer_time = 0
        operation_time = 0

        if n_partitions_needed == 1:

            cpu_result_array = np.empty_like(self.cpu_arrays[0])

            # Time transfer from CPU to GPU
            start = self.get_time()
            gpu_input_arrays = self._send_arrays_to_gpu(
                self.cpu_arrays[:n_arrs_needed], n_gpu_arrs_needed)
            gpu_output_array = self._send_arrays_to_gpu([cpu_result_array],
                                                        n_gpu_arrs_needed)[0]
            transfer_time += self.get_time() - start

            # Repeat the operation
            for _ in range(runs):
                operation_time += self.time_function(lambda: alg(
                    *gpu_input_arrays[:n_arrs_needed], gpu_output_array))

            stream = cuda.stream()
            self.streams.append(stream)

            # Time the transfer from GPU to CPU
            transfer_time += self.time_function(
                lambda: gpu_output_array.copy_to_host(cpu_result_array, stream
                                                      ))

            # Free the GPU arrays, including the output array
            self.clear_cuda_memory(gpu_input_arrays + [gpu_output_array])

        else:

            # Determine the number of partitions required again (to be on the safe side)
            n_partitions_needed = num_partitions_needed(
                self.cpu_arrays[0], n_gpu_arrs_needed, get_free_bytes())

            indices = get_array_partition_indices(self.cpu_arrays[0].shape[0],
                                                  n_partitions_needed)

            for i in range(n_partitions_needed):

                # Retrieve the segments used for this iteration of the operation
                # Retrieve the segments used for this iteration of the operation
                split_cpu_arrays = [
                    cpu_array[indices[i][0]:indices[i][1], :, :]
                    for cpu_array in self.cpu_arrays[:n_arrs_needed]
                ]

                cpu_result_array = np.empty_like(split_cpu_arrays[0])

                # Time transferring the segments to the GPU
                start = self.get_time()
                gpu_input_arrays = self._send_arrays_to_gpu(
                    split_cpu_arrays, n_gpu_arrs_needed)
                gpu_output_array_list = self._send_arrays_to_gpu(
                    [cpu_result_array], n_gpu_arrs_needed)
                transfer_time += self.get_time() - start

                # Return 0 when the GPU is out of space
                if not gpu_input_arrays or not gpu_output_array_list:
                    return 0

                gpu_output_array = gpu_output_array_list[0]

                # Carry out the operation on the slices
                for _ in range(runs):
                    operation_time += self.time_function(lambda: alg(
                        *gpu_input_arrays[:n_arrs_needed], gpu_output_array))

                stream = cuda.stream()
                self.streams.append(stream)

                transfer_time += self.time_function(
                    lambda: gpu_output_array.copy_to_host(
                        cpu_result_array, stream))

                # Free GPU arrays and partition arrays
                self.clear_cuda_memory(gpu_input_arrays + [gpu_output_array])

        if transfer_time > 0 and operation_time > 0:
            self.print_operation_times(operation_time, alg_name, runs,
                                       transfer_time)

        self.synchronise()

        return transfer_time + operation_time / runs
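
All of these examples treat an empty result from _send_arrays_to_gpu as an out-of-memory signal and return 0. The method itself is not shown on this page; the sketch below is an assumption of that behaviour for the Numba variant, simplified to a free function and ignoring the n_gpu_arrs_needed argument.

from numba import cuda

def send_arrays_to_gpu(cpu_arrays):
    # Copy each array to the device; on failure, drop whatever was already
    # transferred and return an empty list so the caller can bail out with 0.
    gpu_arrays = []
    try:
        for cpu_array in cpu_arrays:
            gpu_arrays.append(cuda.to_device(cpu_array))
    except cuda.cudadrv.driver.CudaAPIError as e:
        print("Unable to transfer arrays to the GPU:", e)
        return []
    return gpu_arrays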
Example #4
    def timed_imaging_operation(self, runs, alg, alg_name, n_arrs_needed):

        n_partitions_needed = num_partitions_needed(self.cpu_arrays,
                                                    get_free_bytes())

        transfer_time = 0
        operation_time = 0

        if n_partitions_needed == 1:

            # Time transfer from CPU to GPU
            start = get_time()
            gpu_input_arrays = self._send_arrays_to_gpu(
                self.cpu_arrays[:n_arrs_needed])
            transfer_time += get_time() - start

            # Repeat the operation
            for _ in range(runs):
                operation_time += time_function(
                    lambda: alg(*gpu_input_arrays[:n_arrs_needed]))

            # Time the transfer from GPU to CPU
            transfer_time += time_function(
                lambda: gpu_input_arrays[0].get_async())

            # Free the GPU arrays
            free_memory_pool(gpu_input_arrays)

        else:

            # Determine the number of partitions required again (to be on the safe side)
            n_partitions_needed = num_partitions_needed(
                self.cpu_arrays, get_free_bytes())

            indices = get_array_partition_indices(self.cpu_arrays[0].shape[0],
                                                  n_partitions_needed)

            for i in range(n_partitions_needed):

                # Retrieve the segments used for this iteration of the operation
                split_cpu_arrays = [
                    cpu_array[indices[i][0]:indices[i][1], :, :]
                    for cpu_array in self.cpu_arrays[:n_arrs_needed]
                ]

                # Time transferring the segments to the GPU
                start = get_time()
                gpu_input_arrays = self._send_arrays_to_gpu(split_cpu_arrays)
                transfer_time += get_time() - start

                if not gpu_input_arrays:
                    return 0

                # Carry out the operation on the slices
                for _ in range(runs):
                    operation_time += time_function(
                        lambda: alg(*gpu_input_arrays[:n_arrs_needed]))

                transfer_time += time_function(
                    lambda: gpu_input_arrays[0].get_async())

                # Free the GPU arrays
                free_memory_pool(gpu_input_arrays)

        if transfer_time > 0 and operation_time > 0:
            self.print_operation_times(operation_time, alg_name, runs,
                                       transfer_time)

        self.synchronise()

        return transfer_time + operation_time / runs
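
    # synchronise is called at the end of both timing methods but is not
    # defined on this page. This sketch is an assumption of its behaviour,
    # not the original method; the import is local to keep it self-contained.
    def synchronise(self):
        import pycuda.driver as drv

        # Block until every queued kernel and transfer has completed.
        drv.Context.synchronize()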

    def timed_median_filter(self, runs, filter_size):

        n_partitions_needed = num_partitions_needed(self.cpu_arrays[:1],
                                                    get_free_bytes())

        transfer_time = 0
        operation_time = 0

        # filter_size is (height, width), so pad each spatial axis by half of
        # the corresponding filter dimension
        pad_height = filter_size[0] // 2
        pad_width = filter_size[1] // 2

        filter_height = filter_size[0]
        filter_width = filter_size[1]

        if n_partitions_needed == 1:

            # Time transfer from CPU to GPU (and padding creation)
            start = get_time()
            cpu_padded_array = create_padded_array(self.cpu_arrays[0],
                                                   pad_height, pad_width)
            gpu_data_array, gpu_padded_array = self._send_arrays_to_gpu(
                [self.cpu_arrays[0], cpu_padded_array])
            transfer_time += get_time() - start

            # Sanity check: compare the GPU result against scipy's filter
            cpu_filtered_array = scipy_median_filter(self.cpu_arrays[0],
                                                     size=filter_size)
            pycuda_median_filter(gpu_data_array, gpu_padded_array,
                                 filter_height, filter_width)
            assert np.allclose(cpu_filtered_array, gpu_data_array.get())
            print("Assertion passed.")

            # Repeat the operation
            for _ in range(runs):
                operation_time += time_function(lambda: pycuda_median_filter(
                    gpu_data_array, gpu_padded_array, filter_height,
                    filter_width))

            # Time the transfer from GPU to CPU
            transfer_time += time_function(lambda: gpu_data_array.get_async())

            # Free the GPU arrays
            free_memory_pool([gpu_data_array, gpu_padded_array])

        else:

            n_partitions_needed = num_partitions_needed(
                self.cpu_arrays[:1], get_free_bytes())

            indices = get_array_partition_indices(self.cpu_arrays[0].shape[0],
                                                  n_partitions_needed)

            for i in range(n_partitions_needed):

                # Retrieve the segments used for this iteration of the operation
                split_cpu_array = self.cpu_arrays[0][
                    indices[i][0]:indices[i][1], :, :]

                # Time transferring the segments to the GPU
                cpu_padded_array = create_padded_array(split_cpu_array,
                                                       pad_height, pad_width)
                start = get_time()
                gpu_data_array, gpu_padded_array = self._send_arrays_to_gpu(
                    [split_cpu_array, cpu_padded_array])
                transfer_time += get_time() - start

                # Carry out the operation on the slices
                for _ in range(runs):
                    operation_time += time_function(
                        lambda: pycuda_median_filter(
                            gpu_data_array,
                            gpu_padded_array,
                            filter_height,
                            filter_width,
                        ))

                transfer_time += time_function(
                    lambda: gpu_data_array.get_async())

                # scipy_median_filter(split_cpu_array, size=filter_size)
                # assert np.allclose(split_cpu_array, gpu_data_array.get_async())
                # print("Assertion passed.")

                # Free the GPU arrays
                free_memory_pool([gpu_data_array, gpu_padded_array])

        if transfer_time > 0 and operation_time > 0:
            self.print_operation_times(operation_time, "median filter", runs,
                                       transfer_time)

        self.synchronise()

        return transfer_time + operation_time / runs
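
create_padded_array is used by the PyCUDA median filter above but is not shown on this page. The sketch below assumes it zero-pads the two spatial axes of an (images, height, width) stack, matching the np.pad calls in the CuPy example at the top of this section; the padding mode of the real helper may differ.

import numpy as np

def create_padded_array(cpu_array, pad_height, pad_width):
    # Zero-pad the height and width axes so the median filter has a full
    # neighbourhood at the image edges; the image axis is left untouched.
    return np.pad(
        cpu_array,
        pad_width=((0, 0), (pad_height, pad_height), (pad_width, pad_width)),
    )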