def test_mapped_contextmanager(self):
    # Check that temporarily mapped memory is unregistered immediately,
    # such that it can be re-mapped at any time
    class MappedException(Exception):
        pass

    arr = np.zeros(1)
    ctx = cuda.current_context()
    ctx.deallocations.clear()
    with self.check_ignored_exception(ctx):
        with cuda.mapped(arr) as marr:
            pass
        with cuda.mapped(arr) as marr:
            pass
        # Should also work inside a `defer_cleanup` block
        with cuda.defer_cleanup():
            with cuda.mapped(arr) as marr:
                pass
            with cuda.mapped(arr) as marr:
                pass
        # Should also work when breaking out of the block due to an exception
        try:
            with cuda.mapped(arr) as marr:
                raise MappedException
        except MappedException:
            with cuda.mapped(arr) as marr:
                pass
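For readers unfamiliar with `cuda.mapped`, here is a minimal standalone sketch of the pattern being tested, outside a test harness (variable names are illustrative, not from the test suite):

import numpy as np
from numba import cuda

arr = np.zeros(16)
# Host memory is mapped into the device address space only for the
# duration of the block; the mapping is released on exit, so the same
# array can be mapped again immediately afterwards.
with cuda.mapped(arr) as marr:
    pass  # launch kernels that read/write marr here
with cuda.mapped(arr) as marr:
    pass  # re-mapping works because the first mapping was unregistered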
def test_nested(self):
    harr = np.arange(5)
    darr1 = cuda.to_device(harr)
    deallocs = cuda.current_context().memory_manager.deallocations
    deallocs.clear()
    self.assertEqual(len(deallocs), 0)
    with cuda.defer_cleanup():
        with cuda.defer_cleanup():
            darr2 = cuda.to_device(harr)
            del darr1
            self.assertEqual(len(deallocs), 1)
            del darr2
            self.assertEqual(len(deallocs), 2)
            deallocs.clear()
            self.assertEqual(len(deallocs), 2)
        deallocs.clear()
        self.assertEqual(len(deallocs), 2)
    deallocs.clear()
    self.assertEqual(len(deallocs), 0)
def test_nested(self):
    harr = np.arange(5)
    darr1 = cuda.to_device(harr)
    deallocs = cuda.current_context().deallocations
    deallocs.clear()
    self.assertEqual(len(deallocs), 0)
    with cuda.defer_cleanup():
        with cuda.defer_cleanup():
            darr2 = cuda.to_device(harr)
            del darr1
            self.assertEqual(len(deallocs), 1)
            del darr2
            self.assertEqual(len(deallocs), 2)
            deallocs.clear()
            self.assertEqual(len(deallocs), 2)
        deallocs.clear()
        self.assertEqual(len(deallocs), 2)
    deallocs.clear()
    self.assertEqual(len(deallocs), 0)
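The two `test_nested` variants above differ only in where the deallocation queue is looked up: one uses `context.memory_manager.deallocations`, the other the older `context.deallocations` attribute. This appears to reflect an API change across Numba versions. A version-tolerant lookup, sketched under the assumption that only these two layouts exist, could be:

from numba import cuda

ctx = cuda.current_context()
# Prefer the memory-manager attribute when present; otherwise fall
# back to the legacy attribute directly on the context.
deallocs = getattr(ctx, 'memory_manager', ctx).deallocations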
def test_exception(self):
    harr = np.arange(5)
    darr1 = cuda.to_device(harr)
    deallocs = cuda.current_context().deallocations
    deallocs.clear()
    self.assertEqual(len(deallocs), 0)

    class CustomError(Exception):
        pass

    with self.assertRaises(CustomError):
        with cuda.defer_cleanup():
            darr2 = cuda.to_device(harr)
            del darr2
            self.assertEqual(len(deallocs), 1)
            deallocs.clear()
            self.assertEqual(len(deallocs), 1)
            raise CustomError
    deallocs.clear()
    self.assertEqual(len(deallocs), 0)
    del darr1
    self.assertEqual(len(deallocs), 1)
    deallocs.clear()
    self.assertEqual(len(deallocs), 0)
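Distilled from the tests above, a minimal sketch of the deferral semantics (assuming the `deallocations` queue is reachable directly on the context, as in these tests):

import numpy as np
from numba import cuda

deallocs = cuda.current_context().deallocations
deallocs.clear()

harr = np.arange(5)
with cuda.defer_cleanup():
    darr = cuda.to_device(harr)
    del darr                    # queued, not yet released
    assert len(deallocs) == 1
    deallocs.clear()            # clearing is a no-op while deferred
    assert len(deallocs) == 1
# outside the block, clearing flushes the pending deallocations
deallocs.clear()
assert len(deallocs) == 0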
def __update_costs(self):
    logger.info("Preparing for cost calculation")
    costs = np.zeros_like(self.edges, dtype=float)

    threadsperblock = (self.max_neighbors, self.max_neighbors)
    blockspergrid_x = ceil(costs.shape[0] / threadsperblock[0])
    blockspergrid_y = ceil(costs.shape[1] / threadsperblock[1])
    blockspergrid = (blockspergrid_x, blockspergrid_y)

    c_vertices = cuda.to_device(self.vertices)
    c_edges = cuda.to_device(self.edges)
    c_costs = cuda.to_device(costs)
    c_reference = cuda.to_device(np.ascontiguousarray(self.map_ref))
    c_wheels = cuda.to_device(self.wheels)
    c_params = cuda.to_device(self.coeffs)

    logger.info("Starting calculation on the GPU")
    with cuda.defer_cleanup():
        cost[blockspergrid, threadsperblock](
            c_vertices, c_edges, c_costs, c_reference, c_wheels, c_params
        )
        self.costs = c_costs.copy_to_host()
    logger.success("Cost calculation completed")
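The `cost` kernel itself is defined elsewhere in that project; a skeleton consistent with the 2-D launch above (the signature is inferred from the call site, and the body is only a placeholder) might look like:

from numba import cuda

@cuda.jit
def cost(vertices, edges, costs, reference, wheels, params):
    # One thread per element of the 2-D costs grid.
    i, j = cuda.grid(2)
    if i < costs.shape[0] and j < costs.shape[1]:
        costs[i, j] = 0.0  # placeholder for the real cost model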
def test_context_manager(self):
    # just make sure the API is available
    with cuda.defer_cleanup():
        pass
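The trivial test above only checks that the API exists; the intended use of `cuda.defer_cleanup()` is to keep deallocations from interrupting a performance-critical section. A sketch of that usage (array sizes and loop count are arbitrary):

import numpy as np
from numba import cuda

with cuda.defer_cleanup():
    for _ in range(10):
        tmp = cuda.to_device(np.zeros(1024, dtype=np.float32))
        # ... launch kernels against tmp here ...
        del tmp  # queued; the actual free happens after the block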
for curriculum_level in range(CURRICULUM_LEVELS):
    print(f"CURRICULUM_LEVEL = {curriculum_level + 1}/{CURRICULUM_LEVELS}")
    rows = int(CURRICULUM_SIZE * (curriculum_level + 1))
    if rows >= ROWS_MAX:
        rows = ROWS_MAX
        FLAG_tempering = True
    print(f"number of rows (recordings) = {rows}")

    curriculum_level_gpu = cuda.to_device(
        np.array([curriculum_level], dtype=np.int32))
    rows_gpu = cuda.to_device(np.array([rows], dtype=np.int32))

    blocks_per_grid = min([int(rows / 10), 50000])
    threads_per_block = int(rows / blocks_per_grid) + 1

    with cuda.defer_cleanup():
        # iterations of the Search and Update steps
        iterations_this_round = ITERATIONS_PER_CURRICULUM_LEVEL
        if FLAG_tempering:
            iterations_this_round = int(ITERATIONS_PER_CURRICULUM_LEVEL * 5)
        for iteration in range(iterations_this_round):
            iteration_gpu = cuda.to_device(
                np.array([iteration], dtype=np.int32))

            # neighbour search
            neighbour_search_step[blocks_per_grid, threads_per_block](
                A_gpu, search_matrix_index, archive_index, archive_dist,
                rows_gpu, curriculum_level_gpu, iteration_gpu)
            # Wait for GPU to complete
            cuda.synchronize()
def run(self, df: pd.DataFrame, features_column: str, verbose: bool = True):
    stats = []
    ROWS_MAX = df.shape[0]
    CURRICULUM_LEVELS = int(ROWS_MAX / CURRICULUM_SIZE)
    if ROWS_MAX % CURRICULUM_SIZE != 0:
        CURRICULUM_LEVELS += 1

    A = np.array(df[features_column].values.tolist(), dtype=np.int32)
    A_gpu = cuda.to_device(A)
    if verbose:
        print(f"A_gpu.alloc_size = {A_gpu.alloc_size}")

    archive_index = cuda.to_device(
        np.ones(shape=(ROWS_MAX, NEIGHBOURS_ARCHIVE_SIZE), dtype=np.uint32) * -1)
    if verbose:
        print(f"archive_index.alloc_size = {filesize.size(archive_index.alloc_size)}")

    archive_dist = cuda.to_device(
        np.ones(shape=(ROWS_MAX, NEIGHBOURS_ARCHIVE_SIZE), dtype=np.uint16) *
        np.iinfo(np.uint16).max)
    if verbose:
        print(f"archive_dist.alloc_size = {filesize.size(archive_dist.alloc_size)}")

    search_matrix_index = cuda.to_device(
        np.random.randint(low=0, high=np.iinfo(np.int32).max,
                          size=(ROWS_MAX, SEARCH_SIZE), dtype=np.uint32))
    if verbose:
        print(f"search_matrix_index.alloc_size = "
              f"{filesize.size(search_matrix_index.alloc_size)}")

    observed_means_arr = cuda.to_device(
        np.empty(shape=(ROWS_MAX), dtype=np.float32))
    if verbose:
        print(f"observed_means_arr.alloc_size = "
              f"{filesize.size(observed_means_arr.alloc_size)}")

    FLAG_tempering = False
    for curriculum_level in range(CURRICULUM_LEVELS):
        if verbose:
            print(f"CURRICULUM_LEVEL = {curriculum_level + 1}/{CURRICULUM_LEVELS}")
        rows = int(CURRICULUM_SIZE * (curriculum_level + 1))
        if rows >= ROWS_MAX:
            rows = ROWS_MAX
            FLAG_tempering = True
        if verbose:
            print(f"number of rows (recordings) = {rows}")

        curriculum_level_gpu = cuda.to_device(
            np.array([curriculum_level], dtype=np.int32))
        rows_gpu = cuda.to_device(np.array([rows], dtype=np.int32))

        blocks_per_grid = min([int(rows / 10), 50000])
        threads_per_block = int(rows / blocks_per_grid) + 1

        with cuda.defer_cleanup():
            # iterations of the Search and Update steps
            iterations_this_round = ITERATIONS_PER_CURRICULUM_LEVEL
            if FLAG_tempering:
                iterations_this_round = int(ITERATIONS_PER_CURRICULUM_LEVEL * 5)
            for iteration in range(iterations_this_round):
                iteration_gpu = cuda.to_device(
                    np.array([iteration], dtype=np.int32))

                # neighbour search
                neighbour_search_step[blocks_per_grid, threads_per_block](
                    A_gpu, search_matrix_index, archive_index, archive_dist,
                    observed_means_arr, rows_gpu, curriculum_level_gpu,
                    iteration_gpu)
                # Wait for GPU to complete
                cuda.synchronize()

                # update search direction based on archive:
                # sees archive_index and updates search_matrix_index
                update_step[blocks_per_grid, threads_per_block](
                    search_matrix_index, archive_index, rows_gpu)
                # print(search_matrix_index.copy_to_host()[200000, :])

                # Evaluate Solution Quality: sees archive_dist and updates search_i
                quality_mean, quality_std, obs_mean, obs_std = \
                    self.calc_quality_metrics(
                        archive_dist, observed_means_arr, rows, rows_gpu)
                if verbose:
                    print(f"quality indicator: mean={quality_mean}\tstd={quality_std}\t|\t "
                          f"observed: mean={obs_mean}\tstd={obs_std}")
                stats.append([
                    datetime.now(), curriculum_level, iteration, rows,
                    quality_mean, quality_std, obs_mean, obs_std
                ])

    # wait for all computations to complete
    cuda.synchronize()

    df['row_id'] = np.arange(df.shape[0]).astype(np.int32)
    df['neighbours'] = [
        x for x in archive_index.copy_to_host().astype(np.int32)
        [:, 0:int(NEIGHBOURS_ARCHIVE_SIZE / 2)]
    ]
    df['neighbour_dist'] = [
        x for x in archive_dist.copy_to_host().astype(np.int32)
        [:, 0:int(NEIGHBOURS_ARCHIVE_SIZE / 2)]
    ]

    stats_df = pd.DataFrame(data=stats, columns=[
        'timestamp', 'curriculum_level', 'iteration', 'rows',
        'quality_indicator_mean', 'quality_indicator_std',
        'observed_mean', 'observed_std'
    ])
    return df, stats_df
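A side note on the launch configuration used in both snippets: it fixes `blocks_per_grid` from the row count and derives `threads_per_block` from it. The more conventional scheme fixes the block size and derives the grid size, with a bounds guard in the kernel. A sketch under hypothetical numbers:

from math import ceil

rows = 250_000                      # example size, purely illustrative
threads_per_block = 256             # a common fixed 1-D block size
blocks_per_grid = ceil(rows / threads_per_block)
# inside the kernel, each thread then guards its index:
#   i = cuda.grid(1)
#   if i < rows: ...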