def test_mapped_contextmanager(self):
    # Check that temporarily mapped memory is unregistered immediately,
    # such that it can be re-mapped at any time
    class MappedException(Exception):
        pass

    arr = np.zeros(1)
    ctx = cuda.current_context()
    ctx.deallocations.clear()
    with self.check_ignored_exception(ctx):
        with cuda.mapped(arr) as marr:
            pass
        with cuda.mapped(arr) as marr:
            pass
        # Should also work inside a `defer_cleanup` block
        with cuda.defer_cleanup():
            with cuda.mapped(arr) as marr:
                pass
            with cuda.mapped(arr) as marr:
                pass
        # Should also work when breaking out of the block due to an exception
        try:
            with cuda.mapped(arr) as marr:
                raise MappedException
        except MappedException:
            with cuda.mapped(arr) as marr:
                pass
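For readers unfamiliar with `cuda.mapped`, here is a minimal standalone sketch of the pattern being tested, outside a test harness (variable names are illustrative, not from the test suite):

import numpy as np
from numba import cuda

arr = np.zeros(16)
# Host memory is mapped into the device address space only for the
# duration of the block; the mapping is released on exit, so the same
# array can be mapped again immediately afterwards.
with cuda.mapped(arr) as marr:
    pass  # launch kernels that read/write marr here
with cuda.mapped(arr) as marr:
    pass  # re-mapping works because the first mapping was unregistered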
def test_nested(self):
    harr = np.arange(5)
    darr1 = cuda.to_device(harr)
    deallocs = cuda.current_context().memory_manager.deallocations
    deallocs.clear()
    self.assertEqual(len(deallocs), 0)
    with cuda.defer_cleanup():
        with cuda.defer_cleanup():
            darr2 = cuda.to_device(harr)
            del darr1
            self.assertEqual(len(deallocs), 1)
            del darr2
            self.assertEqual(len(deallocs), 2)
            deallocs.clear()
            self.assertEqual(len(deallocs), 2)
        deallocs.clear()
        self.assertEqual(len(deallocs), 2)
    deallocs.clear()
    self.assertEqual(len(deallocs), 0)
def test_nested(self):
    harr = np.arange(5)
    darr1 = cuda.to_device(harr)
    deallocs = cuda.current_context().deallocations
    deallocs.clear()
    self.assertEqual(len(deallocs), 0)
    with cuda.defer_cleanup():
        with cuda.defer_cleanup():
            darr2 = cuda.to_device(harr)
            del darr1
            self.assertEqual(len(deallocs), 1)
            del darr2
            self.assertEqual(len(deallocs), 2)
            deallocs.clear()
            self.assertEqual(len(deallocs), 2)
        deallocs.clear()
        self.assertEqual(len(deallocs), 2)
    deallocs.clear()
    self.assertEqual(len(deallocs), 0)
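The two `test_nested` variants above differ only in where the deallocation queue is looked up: one uses `context.memory_manager.deallocations`, the other the older `context.deallocations` attribute. This appears to reflect an API change across Numba versions. A version-tolerant lookup, sketched under the assumption that only these two layouts exist, could be:

from numba import cuda

ctx = cuda.current_context()
# Prefer the memory-manager attribute when present; otherwise fall
# back to the legacy attribute directly on the context.
deallocs = getattr(ctx, 'memory_manager', ctx).deallocations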
def test_exception(self):
    harr = np.arange(5)
    darr1 = cuda.to_device(harr)
    deallocs = cuda.current_context().deallocations
    deallocs.clear()
    self.assertEqual(len(deallocs), 0)

    class CustomError(Exception):
        pass

    with self.assertRaises(CustomError):
        with cuda.defer_cleanup():
            darr2 = cuda.to_device(harr)
            del darr2
            self.assertEqual(len(deallocs), 1)
            deallocs.clear()
            self.assertEqual(len(deallocs), 1)
            raise CustomError
    deallocs.clear()
    self.assertEqual(len(deallocs), 0)
    del darr1
    self.assertEqual(len(deallocs), 1)
    deallocs.clear()
    self.assertEqual(len(deallocs), 0)
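Distilled from the tests above, a minimal sketch of the deferral semantics (assuming the `deallocations` queue is reachable directly on the context, as in these tests):

import numpy as np
from numba import cuda

deallocs = cuda.current_context().deallocations
deallocs.clear()

harr = np.arange(5)
with cuda.defer_cleanup():
    darr = cuda.to_device(harr)
    del darr                    # queued, not yet released
    assert len(deallocs) == 1
    deallocs.clear()            # clearing is a no-op while deferred
    assert len(deallocs) == 1
# outside the block, clearing flushes the pending deallocations
deallocs.clear()
assert len(deallocs) == 0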
def __update_costs(self):
    logger.info("Preparing for cost calculation")
    costs = np.zeros_like(self.edges, dtype=float)

    threadsperblock = (self.max_neighbors, self.max_neighbors)
    blockspergrid_x = ceil(costs.shape[0] / threadsperblock[0])
    blockspergrid_y = ceil(costs.shape[1] / threadsperblock[1])
    blockspergrid = (blockspergrid_x, blockspergrid_y)

    c_vertices = cuda.to_device(self.vertices)
    c_edges = cuda.to_device(self.edges)
    c_costs = cuda.to_device(costs)
    c_reference = cuda.to_device(np.ascontiguousarray(self.map_ref))
    c_wheels = cuda.to_device(self.wheels)
    c_params = cuda.to_device(self.coeffs)

    logger.info("Starting calculation on the GPU")
    with cuda.defer_cleanup():
        cost[blockspergrid, threadsperblock](
            c_vertices, c_edges, c_costs, c_reference, c_wheels, c_params
        )
        self.costs = c_costs.copy_to_host()
    logger.success("Cost calculation completed")
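The `cost` kernel itself is defined elsewhere in that project; a skeleton consistent with the 2-D launch above (the signature is inferred from the call site, and the body is only a placeholder) might look like:

from numba import cuda

@cuda.jit
def cost(vertices, edges, costs, reference, wheels, params):
    # One thread per element of the 2-D costs grid.
    i, j = cuda.grid(2)
    if i < costs.shape[0] and j < costs.shape[1]:
        costs[i, j] = 0.0  # placeholder for the real cost model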
def test_context_manager(self):
    # just make sure the API is available
    with cuda.defer_cleanup():
        pass
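The trivial test above only checks that the API exists; the intended use of `cuda.defer_cleanup()` is to keep deallocations from interrupting a performance-critical section. A sketch of that usage (array sizes and loop count are arbitrary):

import numpy as np
from numba import cuda

with cuda.defer_cleanup():
    for _ in range(10):
        tmp = cuda.to_device(np.zeros(1024, dtype=np.float32))
        # ... launch kernels against tmp here ...
        del tmp  # queued; the actual free happens after the block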
for curriculum_level in range(CURRICULUM_LEVELS):
    print(f"CURRICULUM_LEVEL = {curriculum_level + 1}/{CURRICULUM_LEVELS}")
    rows = int(CURRICULUM_SIZE * (curriculum_level + 1))
    if rows >= ROWS_MAX:
        rows = ROWS_MAX
        FLAG_tempering = True
    print(f"number of rows (recordings) = {rows}")

    curriculum_level_gpu = cuda.to_device(
        np.array([curriculum_level], dtype=np.int32))
    rows_gpu = cuda.to_device(np.array([rows], dtype=np.int32))

    blocks_per_grid = min([int(rows / 10), 50000])
    threads_per_block = int(rows / blocks_per_grid) + 1

    with cuda.defer_cleanup():
        # iterations of the Search and Update steps
        iterations_this_round = ITERATIONS_PER_CURRICULUM_LEVEL
        if FLAG_tempering:
            iterations_this_round = int(ITERATIONS_PER_CURRICULUM_LEVEL * 5)
        for iteration in range(iterations_this_round):
            iteration_gpu = cuda.to_device(
                np.array([iteration], dtype=np.int32))

            # neighbour search
            neighbour_search_step[blocks_per_grid, threads_per_block](
                A_gpu, search_matrix_index, archive_index, archive_dist,
                rows_gpu, curriculum_level_gpu, iteration_gpu)
            # Wait for GPU to complete
            cuda.synchronize()
def run(self, df: pd.DataFrame, features_column: str, verbose: bool = True):
    stats = []
    ROWS_MAX = df.shape[0]
    CURRICULUM_LEVELS = int(ROWS_MAX / CURRICULUM_SIZE)
    if ROWS_MAX % CURRICULUM_SIZE != 0:
        CURRICULUM_LEVELS += 1

    A = np.array(df[features_column].values.tolist(), dtype=np.int32)
    A_gpu = cuda.to_device(A)
    if verbose:
        print(f"A_gpu.alloc_size = {A_gpu.alloc_size}")

    archive_index = cuda.to_device(
        np.ones(shape=(ROWS_MAX, NEIGHBOURS_ARCHIVE_SIZE), dtype=np.uint32) * -1)
    if verbose:
        print(f"archive_index.alloc_size = {filesize.size(archive_index.alloc_size)}")

    archive_dist = cuda.to_device(
        np.ones(shape=(ROWS_MAX, NEIGHBOURS_ARCHIVE_SIZE), dtype=np.uint16) *
        np.iinfo(np.uint16).max)
    if verbose:
        print(f"archive_dist.alloc_size = {filesize.size(archive_dist.alloc_size)}")

    search_matrix_index = cuda.to_device(
        np.random.randint(low=0, high=np.iinfo(np.int32).max,
                          size=(ROWS_MAX, SEARCH_SIZE), dtype=np.uint32))
    if verbose:
        print(f"search_matrix_index.alloc_size = "
              f"{filesize.size(search_matrix_index.alloc_size)}")

    observed_means_arr = cuda.to_device(
        np.empty(shape=(ROWS_MAX), dtype=np.float32))
    if verbose:
        print(f"observed_means_arr.alloc_size = "
              f"{filesize.size(observed_means_arr.alloc_size)}")

    FLAG_tempering = False
    for curriculum_level in range(CURRICULUM_LEVELS):
        if verbose:
            print(f"CURRICULUM_LEVEL = {curriculum_level + 1}/{CURRICULUM_LEVELS}")
        rows = int(CURRICULUM_SIZE * (curriculum_level + 1))
        if rows >= ROWS_MAX:
            rows = ROWS_MAX
            FLAG_tempering = True
        if verbose:
            print(f"number of rows (recordings) = {rows}")

        curriculum_level_gpu = cuda.to_device(
            np.array([curriculum_level], dtype=np.int32))
        rows_gpu = cuda.to_device(np.array([rows], dtype=np.int32))

        blocks_per_grid = min([int(rows / 10), 50000])
        threads_per_block = int(rows / blocks_per_grid) + 1

        with cuda.defer_cleanup():
            # iterations of the Search and Update steps
            iterations_this_round = ITERATIONS_PER_CURRICULUM_LEVEL
            if FLAG_tempering:
                iterations_this_round = int(ITERATIONS_PER_CURRICULUM_LEVEL * 5)
            for iteration in range(iterations_this_round):
                iteration_gpu = cuda.to_device(
                    np.array([iteration], dtype=np.int32))

                # neighbour search
                neighbour_search_step[blocks_per_grid, threads_per_block](
                    A_gpu, search_matrix_index, archive_index, archive_dist,
                    observed_means_arr, rows_gpu, curriculum_level_gpu,
                    iteration_gpu)
                # Wait for GPU to complete
                cuda.synchronize()

                # update search direction based on archive:
                # sees archive_index and updates search_matrix_index
                update_step[blocks_per_grid, threads_per_block](
                    search_matrix_index, archive_index, rows_gpu)
                # print(search_matrix_index.copy_to_host()[200000, :])

                # Evaluate Solution Quality: sees archive_dist and updates search_i
                quality_mean, quality_std, obs_mean, obs_std = \
                    self.calc_quality_metrics(
                        archive_dist, observed_means_arr, rows, rows_gpu)
                if verbose:
                    print(f"quality indicator: mean={quality_mean}\tstd={quality_std}\t|\t "
                          f"observed: mean={obs_mean}\tstd={obs_std}")
                stats.append([
                    datetime.now(), curriculum_level, iteration, rows,
                    quality_mean, quality_std, obs_mean, obs_std
                ])

    # wait for all computations to complete
    cuda.synchronize()

    df['row_id'] = np.arange(df.shape[0]).astype(np.int32)
    df['neighbours'] = [
        x for x in archive_index.copy_to_host().astype(np.int32)
        [:, 0:int(NEIGHBOURS_ARCHIVE_SIZE / 2)]
    ]
    df['neighbour_dist'] = [
        x for x in archive_dist.copy_to_host().astype(np.int32)
        [:, 0:int(NEIGHBOURS_ARCHIVE_SIZE / 2)]
    ]

    stats_df = pd.DataFrame(data=stats, columns=[
        'timestamp', 'curriculum_level', 'iteration', 'rows',
        'quality_indicator_mean', 'quality_indicator_std',
        'observed_mean', 'observed_std'
    ])
    return df, stats_df
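A side note on the launch configuration used in both snippets: it fixes `blocks_per_grid` from the row count and derives `threads_per_block` from it. The more conventional scheme fixes the block size and derives the grid size, with a bounds guard in the kernel. A sketch under hypothetical numbers:

from math import ceil

rows = 250_000                      # example size, purely illustrative
threads_per_block = 256             # a common fixed 1-D block size
blocks_per_grid = ceil(rows / threads_per_block)
# inside the kernel, each thread then guards its index:
#   i = cuda.grid(1)
#   if i < rows: ...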