def test_fork(self):
    """
    Test fork detection.
    """
    cuda.current_context()  # force cuda initialize
    # fork in process that also uses CUDA
    ctx = mp.get_context('fork')
    q = ctx.Queue()
    proc = ctx.Process(target=fork_test, args=[q])
    proc.start()
    exc = q.get()
    proc.join()
    # there should be an exception raised in the child process
    self.assertIsNotNone(exc)
    self.assertIn('CUDA initialized before forking', str(exc))
def test_attached_non_primary(self):
    # Emulate non-primary context creation by 3rd party
    the_driver = driver.driver
    hctx = driver.drvapi.cu_context()
    the_driver.cuCtxCreate(byref(hctx), 0, 0)
    try:
        cuda.current_context()
    except RuntimeError as e:
        # Expecting an error about non-primary CUDA context
        self.assertIn("Numba cannot operate on non-primary CUDA context ",
                      str(e))
    else:
        self.fail("No RuntimeError raised")
    finally:
        the_driver.cuCtxDestroy(hctx)
def test_max_pending_bytes(self):
    # get deallocation manager and flush it
    ctx = cuda.current_context()
    deallocs = ctx.deallocations
    deallocs.clear()
    self.assertEqual(len(deallocs), 0)

    mi = ctx.get_memory_info()

    max_pending = 10**6  # 1MB
    old_ratio = config.CUDA_DEALLOCS_RATIO
    try:
        # change to a smaller ratio
        config.CUDA_DEALLOCS_RATIO = max_pending / mi.total
        # _max_pending_bytes is computed with a floor, so round-off
        # error can make it differ from the requested maximum by 1
        self.assertAlmostEqual(deallocs._max_pending_bytes, max_pending,
                               delta=1)
        # allocate half the max size
        # this will not trigger deallocation
        cuda.to_device(np.ones(max_pending // 2, dtype=np.int8))
        self.assertEqual(len(deallocs), 1)
        # allocate the remainder
        # this will not trigger deallocation
        cuda.to_device(np.ones(deallocs._max_pending_bytes - deallocs._size,
                               dtype=np.int8))
        self.assertEqual(len(deallocs), 2)
        # another byte to trigger .clear()
        cuda.to_device(np.ones(1, dtype=np.int8))
        self.assertEqual(len(deallocs), 0)
    finally:
        # restore old ratio
        config.CUDA_DEALLOCS_RATIO = old_ratio
def test_ipc_handle(self):
    # prepare data for IPC
    arr = np.arange(10, dtype=np.intp)
    devarr = cuda.to_device(arr)

    # create IPC handle
    ctx = cuda.current_context()
    ipch = ctx.get_ipc_handle(devarr.gpu_data)

    # manually prepare for serialization as bytes
    handle_bytes = bytes(ipch.handle)
    size = ipch.size

    # spawn new process for testing
    ctx = mp.get_context('spawn')
    result_queue = ctx.Queue()
    args = (handle_bytes, size, result_queue)
    proc = ctx.Process(target=base_ipc_handle_test, args=args)
    proc.start()
    succ, out = result_queue.get()
    if not succ:
        self.fail(out)
    else:
        np.testing.assert_equal(arr, out)
    proc.join(3)
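# A hedged sketch of what a child-process target like `base_ipc_handle_test`
# (referenced above but not shown in this section) might look like. The name,
# signature, and queue protocol are assumptions inferred from the test;
# `cuda.open_ipc_array` is a known public Numba helper that rebuilds an IPC
# handle from its raw bytes in the child's own context.
def base_ipc_handle_test(handle_bytes, size, result_queue):
    import traceback
    import numpy as np
    from numba import cuda
    try:
        dtype = np.dtype(np.intp)
        # View the shared allocation as a device array of the original
        # shape/dtype, then copy it back to the host for checking.
        with cuda.open_ipc_array(handle_bytes, shape=size // dtype.itemsize,
                                 dtype=dtype) as darr:
            out = darr.copy_to_host()
        result_queue.put((True, out))
    except Exception:
        result_queue.put((False, traceback.format_exc()))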
def test_max_pending_bytes(self):
    # get deallocation manager and flush it
    ctx = cuda.current_context()
    deallocs = ctx.deallocations
    deallocs.clear()
    self.assertEqual(len(deallocs), 0)

    mi = ctx.get_memory_info()

    max_pending = 10**6  # 1MB
    old_ratio = config.CUDA_DEALLOCS_RATIO
    try:
        # change to a smaller ratio
        config.CUDA_DEALLOCS_RATIO = max_pending / mi.total
        self.assertEqual(deallocs._max_pending_bytes, max_pending)
        # deallocate half the max size
        cuda.to_device(np.ones(max_pending // 2, dtype=np.int8))
        self.assertEqual(len(deallocs), 1)
        # deallocate the remainder
        cuda.to_device(np.ones(max_pending - deallocs._size, dtype=np.int8))
        self.assertEqual(len(deallocs), 2)
        # another byte to trigger .clear()
        cuda.to_device(np.ones(1, dtype=np.int8))
        self.assertEqual(len(deallocs), 0)
    finally:
        # restore old ratio
        config.CUDA_DEALLOCS_RATIO = old_ratio
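# A minimal sketch (separate from the tests above) of how the pending-
# deallocation limits they exercise can be tuned in user code. Both config
# attributes appear in the tests themselves; the chosen values here are
# illustrative only.
from numba import config, cuda
import numpy as np

config.CUDA_DEALLOCS_RATIO = 0.1  # flush once pending frees reach 10% of total VRAM
config.CUDA_DEALLOCS_COUNT = 10   # ...or once 10 frees are pending

d_tmp = cuda.to_device(np.zeros(1024, dtype=np.float32))
del d_tmp                                     # free is queued, not immediate
cuda.current_context().deallocations.clear()  # force the flush right now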
def test_ipc_handle_serialization(self):
    # prepare data for IPC
    arr = np.arange(10, dtype=np.intp)
    devarr = cuda.to_device(arr)

    # create IPC handle
    ctx = cuda.current_context()
    ipch = ctx.get_ipc_handle(devarr.gpu_data)

    # pickle
    buf = pickle.dumps(ipch)
    ipch_recon = pickle.loads(buf)
    self.assertIs(ipch_recon.base, None)
    self.assertEqual(tuple(ipch_recon.handle), tuple(ipch.handle))
    self.assertEqual(ipch_recon.size, ipch.size)

    # spawn new process for testing
    ctx = mp.get_context('spawn')
    result_queue = ctx.Queue()
    args = (ipch, result_queue)
    proc = ctx.Process(target=serialize_ipc_handle_test, args=args)
    proc.start()
    succ, out = result_queue.get()
    if not succ:
        self.fail(out)
    else:
        np.testing.assert_equal(arr, out)
    proc.join(3)
def test_mapped_contextmanager(self):
    # Check that temporarily mapped memory is unregistered immediately,
    # such that it can be re-mapped at any time
    class MappedException(Exception):
        pass

    arr = np.zeros(1)
    ctx = cuda.current_context()
    ctx.deallocations.clear()
    with self.check_ignored_exception(ctx):
        with cuda.mapped(arr) as marr:
            pass
        with cuda.mapped(arr) as marr:
            pass
        # Should also work inside a `defer_cleanup` block
        with cuda.defer_cleanup():
            with cuda.mapped(arr) as marr:
                pass
            with cuda.mapped(arr) as marr:
                pass
        # Should also work when breaking out of the block due to an exception
        try:
            with cuda.mapped(arr) as marr:
                raise MappedException
        except MappedException:
            with cuda.mapped(arr) as marr:
                pass
def _array_helper(self, addr, datasize, shape, strides, dtype, finalizer=None):
    ctx = cuda.current_context()
    ptr = ctypes.c_uint64(int(addr))
    mem = cuda.driver.MemoryPointer(ctx, ptr, datasize, finalizer=finalizer)
    return cuda.cudadrv.devicearray.DeviceNDArray(shape, strides, dtype,
                                                  gpu_data=mem)
def the_work():
    dtype = np.dtype(np.intp)
    darr = handle.open_array(cuda.current_context(),
                             shape=handle.size // dtype.itemsize,
                             dtype=dtype)
    # copy the data to host
    arr = darr.copy_to_host()
    handle.close()
    return arr
def _as_numba_devarray(intaddr, nelem, dtype):
    dtype = np.dtype(dtype)
    addr = ctypes.c_uint64(intaddr)
    elemsize = dtype.itemsize
    datasize = elemsize * nelem
    memptr = cuda.driver.MemoryPointer(context=cuda.current_context(),
                                       pointer=addr, size=datasize)
    return cuda.devicearray.DeviceNDArray(shape=(nelem,), strides=(elemsize,),
                                          dtype=dtype, gpu_data=memptr)
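# Usage sketch for the wrapper above: `raw_ptr` stands in for a device
# pointer (an integer address) obtained from some other CUDA library. The
# resulting DeviceNDArray merely views that memory; nothing is freed when the
# view dies because no finalizer is attached to the MemoryPointer.
d_view = _as_numba_devarray(intaddr=raw_ptr, nelem=1000, dtype=np.float32)
host_copy = d_view.copy_to_host()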
def test_context_memory(self):
    mem = cuda.current_context().get_memory_info()
    self.assertIsInstance(mem.free, numbers.Number)
    self.assertEqual(mem.free, mem[0])
    self.assertIsInstance(mem.total, numbers.Number)
    self.assertEqual(mem.total, mem[1])
    self.assertLessEqual(mem.free, mem.total)
def get_gpus_mem():
    gpus = cuda.gpus.lst
    mem_list = []
    for gpu in gpus:
        with gpu:
            meminfo = cuda.current_context().get_memory_info()
            mem_list.append(int(meminfo[0]))
    # Sort GPUs by free memory, largest first. Sorting once on the memory
    # value only avoids comparing GPU objects when two devices report the
    # same amount of free memory.
    pairs = sorted(zip(mem_list, gpus), key=lambda pair: pair[0], reverse=True)
    sort_gpus = [gpu for (mem, gpu) in pairs]
    sort_gmem = [mem for (mem, gpu) in pairs]
    # return [sort_gpus[0]], [sort_gmem[0]]
    return sort_gpus, sort_gmem
def test_max_pending_count(self):
    # get deallocation manager and flush it
    deallocs = cuda.current_context().deallocations
    deallocs.clear()
    self.assertEqual(len(deallocs), 0)
    # deallocate to maximum count
    for i in range(config.CUDA_DEALLOCS_COUNT):
        cuda.to_device(np.arange(1))
        self.assertEqual(len(deallocs), i + 1)
    # one more to trigger .clear()
    cuda.to_device(np.arange(1))
    self.assertEqual(len(deallocs), 0)
def wrap_fq(atoms, qbin=.1, sum_type='fq'):
    # get information for FQ transformation
    q = atoms.get_positions()
    q = q.astype(np.float32)
    n = len(q)
    if sum_type == 'fq':
        scatter_array = atoms.get_array('F(Q) scatter')
    else:
        scatter_array = atoms.get_array('PDF scatter')
    qmax_bin = scatter_array.shape[1]

    # get number of allocated nodes
    n_nodes = count_nodes()
    print('nodes', n_nodes)

    # get info on our gpu setup and available memory
    mem_list = gpu_avail(n_nodes)
    mem_list.append(cuda.current_context().get_memory_info()[0])

    # starting buffers
    n_cov = 0

    # create list of tasks
    m_list = []
    while n_cov < n:
        for mem in mem_list:
            m = gpu_fq_atoms_allocation(n, qmax_bin, mem)
            if m > n - n_cov:
                m = n - n_cov
            m_list.append(m)
            if n_cov >= n:
                break
            n_cov += m
            if n_cov >= n:
                break
    # Make certain that we have covered all the atoms
    assert sum(m_list) == n

    reports = mpi_fq(n_nodes, m_list, q, scatter_array, qbin)

    fq = np.zeros(qmax_bin)
    for ele in reports:
        fq[:] += ele
    na = np.average(scatter_array, axis=0) ** 2 * n
    old_settings = np.seterr(all='ignore')
    fq = np.nan_to_num(1 / na * fq)
    np.seterr(**old_settings)
    return fq
def test_attached_primary(self):
    # Emulate primary context creation by 3rd party
    the_driver = driver.driver
    hctx = driver.drvapi.cu_context()
    the_driver.cuDevicePrimaryCtxRetain(byref(hctx), 0)
    try:
        ctx = driver.Context(weakref.proxy(self), hctx)
        ctx.push()
        # Check that the context from numba matches the created primary
        # context.
        my_ctx = cuda.current_context()
        self.assertEqual(my_ctx.handle.value, ctx.handle.value)
    finally:
        ctx.pop()
        the_driver.cuDevicePrimaryCtxRelease(0)
def test_basic(self):
    harr = np.arange(5)
    darr1 = cuda.to_device(harr)
    deallocs = cuda.current_context().deallocations
    deallocs.clear()
    self.assertEqual(len(deallocs), 0)
    with cuda.defer_cleanup():
        darr2 = cuda.to_device(harr)
        del darr1
        self.assertEqual(len(deallocs), 1)
        del darr2
        self.assertEqual(len(deallocs), 2)
        deallocs.clear()
        self.assertEqual(len(deallocs), 2)
    deallocs.clear()
    self.assertEqual(len(deallocs), 0)
def test_host_alloc_driver(self):
    n = 32
    mem = cuda.current_context().memhostalloc(n, mapped=True)

    dtype = np.dtype(np.uint8)
    ary = np.ndarray(shape=n // dtype.itemsize, dtype=dtype, buffer=mem)

    magic = 0xAB
    driver.device_memset(mem, magic, n)

    self.assertTrue(np.all(ary == magic))

    ary.fill(n)

    recv = np.empty_like(ary)
    driver.device_to_host(recv, mem, ary.size)

    self.assertTrue(np.all(ary == recv))
    self.assertTrue(np.all(recv == n))
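# Public-API counterpart of the low-level `memhostalloc` call above, shown as
# a sketch: `cuda.mapped_array` allocates pinned host memory that is also
# mapped into the device address space, so kernels can access it directly.
m_ary = cuda.mapped_array(32, dtype=np.uint8)
m_ary[:] = 0xAB  # visible to both host and device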
def test_ownership(self):
    # Get the deallocation queue
    ctx = cuda.current_context()
    deallocs = ctx.deallocations
    # Flush all deallocations
    deallocs.clear()
    self.assertEqual(len(deallocs), 0)
    # Make a new device array
    d_arr = cuda.to_device(np.arange(100))
    # Convert it
    cvted = cuda.as_cuda_array(d_arr)
    # Drop the reference to the original object such that
    # only `cvted` has a reference to it.
    del d_arr
    # There shouldn't be any new deallocations
    self.assertEqual(len(deallocs), 0)
    # Try to access the memory and verify its content
    np.testing.assert_equal(cvted.copy_to_host(), np.arange(100))
    # Drop the last reference to the memory
    del cvted
    self.assertEqual(len(deallocs), 1)
    # Flush
    deallocs.clear()
def test_exception(self):
    harr = np.arange(5)
    darr1 = cuda.to_device(harr)
    deallocs = cuda.current_context().deallocations
    deallocs.clear()
    self.assertEqual(len(deallocs), 0)

    class CustomError(Exception):
        pass

    with self.assertRaises(CustomError):
        with cuda.defer_cleanup():
            darr2 = cuda.to_device(harr)
            del darr2
            self.assertEqual(len(deallocs), 1)
            deallocs.clear()
            self.assertEqual(len(deallocs), 1)
            raise CustomError
    deallocs.clear()
    self.assertEqual(len(deallocs), 0)
    del darr1
    self.assertEqual(len(deallocs), 1)
    deallocs.clear()
    self.assertEqual(len(deallocs), 0)
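# Sketch of the user-facing pattern that test_basic and test_exception above
# exercise: batch short-lived device allocations inside `cuda.defer_cleanup()`
# so the (potentially synchronizing) frees are postponed until the block
# exits, even if an exception escapes.
with cuda.defer_cleanup():
    for _ in range(8):
        tmp = cuda.to_device(np.empty(1 << 20, dtype=np.float32))
        # ... launch kernels against tmp here ...
        del tmp  # queued; actually released after the block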
def test_stream(self):
    ctx = cuda.current_context()
    stream = ctx.create_stream()
    with self.check_ignored_exception(ctx):
        del stream
def get_device_total_memory(index=0):
    """
    Return the total memory of the CUDA device with the given index.
    """
    with cuda.gpus[index]:
        return cuda.current_context().get_memory_info()[1]
def test_device_memory(self):
    ctx = cuda.current_context()
    mem = ctx.memalloc(32)
    with self.check_ignored_exception(ctx):
        del mem
    logger.info(fn.__doc__)
    logger.info("Cardinality: {}".format(n))
    logger.info("Dimensionality: {}".format(d))
    return fn(n, d)

# computes required device memory: data + labels + dists + centroids
# n = cardinality, d = dimensionality, c = number of clusters
req_mem = lambda n, d, c: (n * d * 4 + n * 2 * 4 + c * d * 4)

# HOST memory max
MAX_ALLOWED_HOST_MEM = hostmemory * 2 ** 30
MAX_ALLOWED_HOST_MEM = int(MAX_ALLOWED_HOST_MEM)

# compute device memory
c = cuda.current_context()
free_mem, total_mem = c.get_memory_info()
MAX_ALLOWED_DEVICE_MEM = thresholdgpu * total_mem  # threshold default is 0.97
MAX_ALLOWED_DEVICE_MEM = int(MAX_ALLOWED_DEVICE_MEM)

logger.info("Will occupy maximum of {} MB in"
            " device memory.".format(MAX_ALLOWED_DEVICE_MEM / (1024.0 ** 2)))

# cardinality = [100, 250, 500, 750,
#                1e3, 2.5e3, 5e3, 7.5e3,
#                1e4, 2.5e4, 5e4, 7.5e4,
#                1e5, 2.5e5, 5e5, 7.5e5,
#                1e6, 2.5e6, 5e6, 7.5e6,
#                1e7]
cardinality = [100, 1e3, 5e3, 1e4, 5e4, 1e5, 5e5, 1e6, 5e6, 1e7]
cardinality = map(int, cardinality)
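# Worked example of the `req_mem` arithmetic above: with float32 data and
# int32 labels/distances (4 bytes each), n = 1e6 points, d = 16 dimensions,
# and c = 10 clusters give
#   n*d*4 = 64 MB (data), n*2*4 = 8 MB (labels + dists), c*d*4 = 640 B,
# i.e. roughly 72 MB, which is then compared against MAX_ALLOWED_DEVICE_MEM.
assert req_mem(10**6, 16, 10) == 64_000_000 + 8_000_000 + 640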
def cuda_current_context():
    ctx = cuda.current_context()
    return ctx
def test_mapped_memory(self):
    ctx = cuda.current_context()
    mem = ctx.memhostalloc(32, mapped=True)
    with self.check_ignored_exception(ctx):
        del mem
def cc_X_or_above(major, minor):
    if not config.ENABLE_CUDASIM:
        return cuda.current_context().device.compute_capability >= (major, minor)
    else:
        return True
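# Typical way a helper like `cc_X_or_above` gets used in a test module
# (a sketch; the decorator name is illustrative, not from the source):
import unittest

skip_unless_cc_53 = unittest.skipUnless(
    cc_X_or_above(5, 3), "requires compute capability >= 5.3")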
def switch_gpu():
    with cuda.gpus[1]:
        return cuda.current_context().device.id
def test_event(self):
    ctx = cuda.current_context()
    event = ctx.create_event()
    with self.check_ignored_exception(ctx):
        del event
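# Sketch of the usual event workflow that test_event above only exercises for
# cleanup: recording a pair of timing events around device work.
start, end = cuda.event(timing=True), cuda.event(timing=True)
start.record()
cuda.to_device(np.zeros(1 << 20, dtype=np.float32))  # work being timed
end.record()
end.synchronize()
elapsed_ms = cuda.event_elapsed_time(start, end)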
def gpu_stump(
    T_A, m, T_B=None, ignore_trivial=True, device_id=0, normalize=True, p=2.0
):
    """
    Compute the z-normalized matrix profile with one or more GPU devices

    This is a convenience wrapper around the Numba `cuda.jit` `_gpu_stump`
    function which computes the matrix profile according to GPU-STOMP. The
    default number of threads-per-block is set to `512` and may be changed by
    setting the global parameter `config.STUMPY_THREADS_PER_BLOCK` to an
    appropriate number based on your GPU hardware.

    Parameters
    ----------
    T_A : numpy.ndarray
        The time series or sequence for which to compute the matrix profile

    m : int
        Window size

    T_B : numpy.ndarray, default None
        The time series or sequence that will be used to annotate T_A. For
        every subsequence in T_A, its nearest neighbor in T_B will be
        recorded. Default is `None` which corresponds to a self-join.

    ignore_trivial : bool, default True
        Set to `True` if this is a self-join. Otherwise, for AB-join, set this
        to `False`. Default is `True`.

    device_id : int or list, default 0
        The (GPU) device number to use. The default value is `0`. A list of
        valid device ids (int) may also be provided for parallel GPU-STUMP
        computation. A list of all valid device ids can be obtained by
        executing `[device.id for device in numba.cuda.list_devices()]`.

    normalize : bool, default True
        When set to `True`, this z-normalizes subsequences prior to computing
        distances. Otherwise, this function gets re-routed to its
        complementary non-normalized equivalent set in the
        `@core.non_normalized` function decorator.

    p : float, default 2.0
        The p-norm to apply for computing the Minkowski distance. This
        parameter is ignored when `normalize == True`.

    Returns
    -------
    out : numpy.ndarray
        The first column consists of the matrix profile, the second column
        consists of the matrix profile indices, the third column consists of
        the left matrix profile indices, and the fourth column consists of
        the right matrix profile indices.

    See Also
    --------
    stumpy.stump : Compute the z-normalized matrix profile
    stumpy.stumped : Compute the z-normalized matrix profile with a
        distributed dask cluster
    stumpy.scrump : Compute an approximate z-normalized matrix profile

    Notes
    -----
    `DOI: 10.1109/ICDM.2016.0085 \
    <https://www.cs.ucr.edu/~eamonn/STOMP_GPU_final_submission_camera_ready.pdf>`__

    See Table II, Figure 5, and Figure 6

    Timeseries, T_A, will be annotated with the distance location (or index)
    of all its subsequences in another time series, T_B.

    Return: For every subsequence, Q, in T_A, you will get a distance and
    index for the closest subsequence in T_B. Thus, the array returned will
    have length T_A.shape[0]-m+1. Additionally, the left and right matrix
    profiles are also returned.

    Note: Unlike in Table II where T_A.shape is expected to be equal to
    T_B.shape, this implementation is generalized so that the shapes of T_A
    and T_B can be different. In the case where T_A.shape == T_B.shape, then
    our algorithm reduces down to the same algorithm found in Table II.

    Additionally, unlike STAMP where the exclusion zone is m/2, the default
    exclusion zone for STOMP is m/4 (See Definition 3 and Figure 3).

    For self-joins, set `ignore_trivial = True` in order to avoid the trivial
    match.

    Note that left and right matrix profiles are only available for
    self-joins.

    Examples
    --------
    >>> from numba import cuda
    >>> if __name__ == "__main__":
    ...     all_gpu_devices = [device.id for device in cuda.list_devices()]
    ...     stumpy.gpu_stump(
    ...         np.array([584., -11., 23., 79., 1001., 0., -19.]),
    ...         m=3,
    ...         device_id=all_gpu_devices)
    array([[0.11633857113691416, 4, -1, 4],
           [2.694073918063438, 3, -1, 3],
           [3.0000926340485923, 0, 0, 4],
           [2.694073918063438, 1, 1, -1],
           [0.11633857113691416, 0, 0, -1]], dtype=object)
    """
    if T_B is None:  # Self join!
        T_B = T_A
        ignore_trivial = True

    T_A, M_T, Σ_T = core.preprocess(T_A, m)
    T_B, μ_Q, σ_Q = core.preprocess(T_B, m)

    if T_A.ndim != 1:  # pragma: no cover
        raise ValueError(
            f"T_A is {T_A.ndim}-dimensional and must be 1-dimensional. "
            "For multidimensional STUMP use `stumpy.mstump` or `stumpy.mstumped`"
        )

    if T_B.ndim != 1:  # pragma: no cover
        raise ValueError(
            f"T_B is {T_B.ndim}-dimensional and must be 1-dimensional. "
            "For multidimensional STUMP use `stumpy.mstump` or `stumpy.mstumped`"
        )

    core.check_window_size(m, max_size=min(T_A.shape[0], T_B.shape[0]))

    if ignore_trivial is False and core.are_arrays_equal(T_A, T_B):  # pragma: no cover
        logger.warning("Arrays T_A, T_B are equal, which implies a self-join.")
        logger.warning("Try setting `ignore_trivial = True`.")

    if ignore_trivial and core.are_arrays_equal(T_A, T_B) is False:  # pragma: no cover
        logger.warning("Arrays T_A, T_B are not equal, which implies an AB-join.")
        logger.warning("Try setting `ignore_trivial = False`.")

    n = T_B.shape[0]
    k = T_A.shape[0] - m + 1
    l = n - m + 1
    excl_zone = int(
        np.ceil(m / config.STUMPY_EXCL_ZONE_DENOM)
    )  # See Definition 3 and Figure 3

    T_A_fname = core.array_to_temp_file(T_A)
    T_B_fname = core.array_to_temp_file(T_B)
    M_T_fname = core.array_to_temp_file(M_T)
    Σ_T_fname = core.array_to_temp_file(Σ_T)
    μ_Q_fname = core.array_to_temp_file(μ_Q)
    σ_Q_fname = core.array_to_temp_file(σ_Q)

    out = np.empty((k, 4), dtype=object)

    if isinstance(device_id, int):
        device_ids = [device_id]
    else:
        device_ids = device_id

    profile = [None] * len(device_ids)
    indices = [None] * len(device_ids)

    for _id in device_ids:
        with cuda.gpus[_id]:
            if (
                cuda.current_context().__class__.__name__ != "FakeCUDAContext"
            ):  # pragma: no cover
                cuda.current_context().deallocations.clear()

    step = 1 + l // len(device_ids)

    # Start process pool for multi-GPU request
    if len(device_ids) > 1:  # pragma: no cover
        mp.set_start_method("spawn", force=True)
        pool = mp.Pool(processes=len(device_ids))
        results = [None] * len(device_ids)

    QT_fnames = []
    QT_first_fnames = []

    for idx, start in enumerate(range(0, l, step)):
        stop = min(l, start + step)

        QT, QT_first = core._get_QT(start, T_A, T_B, m)
        QT_fname = core.array_to_temp_file(QT)
        QT_first_fname = core.array_to_temp_file(QT_first)
        QT_fnames.append(QT_fname)
        QT_first_fnames.append(QT_first_fname)

        if len(device_ids) > 1 and idx < len(device_ids) - 1:  # pragma: no cover
            # Spawn and execute in child process for multi-GPU request
            results[idx] = pool.apply_async(
                _gpu_stump,
                (
                    T_A_fname,
                    T_B_fname,
                    m,
                    stop,
                    excl_zone,
                    M_T_fname,
                    Σ_T_fname,
                    QT_fname,
                    QT_first_fname,
                    μ_Q_fname,
                    σ_Q_fname,
                    k,
                    ignore_trivial,
                    start + 1,
                    device_ids[idx],
                ),
            )
        else:
            # Execute last chunk in parent process
            # Only parent process is executed when a single GPU is requested
            profile[idx], indices[idx] = _gpu_stump(
                T_A_fname,
                T_B_fname,
                m,
                stop,
                excl_zone,
                M_T_fname,
                Σ_T_fname,
                QT_fname,
                QT_first_fname,
                μ_Q_fname,
                σ_Q_fname,
                k,
                ignore_trivial,
                start + 1,
                device_ids[idx],
            )

    # Clean up process pool for multi-GPU request
    if len(device_ids) > 1:  # pragma: no cover
        pool.close()
        pool.join()

        # Collect results from spawned child processes if they exist
        for idx, result in enumerate(results):
            if result is not None:
                profile[idx], indices[idx] = result.get()

    os.remove(T_A_fname)
    os.remove(T_B_fname)
    os.remove(M_T_fname)
    os.remove(Σ_T_fname)
    os.remove(μ_Q_fname)
    os.remove(σ_Q_fname)
    for QT_fname in QT_fnames:
        os.remove(QT_fname)
    for QT_first_fname in QT_first_fnames:
        os.remove(QT_first_fname)

    for idx in range(len(device_ids)):
        profile_fname = profile[idx]
        indices_fname = indices[idx]
        profile[idx] = np.load(profile_fname, allow_pickle=False)
        indices[idx] = np.load(indices_fname, allow_pickle=False)
        os.remove(profile_fname)
        os.remove(indices_fname)

    for i in range(1, len(device_ids)):
        # Update all matrix profiles and matrix profile indices
        # (global, left, right) and store in profile[0] and indices[0]
        for col in range(profile[0].shape[1]):  # pragma: no cover
            cond = profile[0][:, col] < profile[i][:, col]
            profile[0][:, col] = np.where(cond, profile[0][:, col], profile[i][:, col])
            indices[0][:, col] = np.where(cond, indices[0][:, col], indices[i][:, col])

    out[:, 0] = profile[0][:, 0]
    out[:, 1:4] = indices[0][:, :]

    threshold = 10e-6
    if core.are_distances_too_small(out[:, 0], threshold=threshold):  # pragma: no cover
        logger.warning(f"A large number of values are smaller than {threshold}.")
        logger.warning("For a self-join, try setting `ignore_trivial = True`.")

    return out
def _get_context():
    pid = multiprocessing.current_process().pid
    ctxid = cuda.current_context().handle.value
    return pid, ctxid
def cc(self):
    return cuda.current_context().device.compute_capability
def get_device_memory_info():
    """
    Returns the total amount of global memory on the device in bytes.
    """
    meminfo = cuda.current_context().get_memory_info()
    return meminfo[1]
def destroy(self):
    context = cuda.current_context(self.gpu)
    context.reset()
    # delete variables in self
    gc.collect(1)
def iam_lots_gpu_compute(csv_filename="", patch_size=[1, 2, 4, 8],
                         blending_weights=[0.65, 0.2, 0.1, 0.05],
                         num_sample=[512], alpha=0.5, thrsh_patches=True,
                         bin_tresh=0.5, save_jpeg=True,
                         delete_intermediary=False, nawm_preprocessing=False):
    '''
    FUNCTION'S SUMMARY:

    Main function of the LOTS-IAM-GPU algorithm. This function produces (i.e.
    saves) age maps that indicate the level of irregularity of voxels in brain
    FLAIR MRI. It reads a list of FLAIR MR images (NIfTI), ICV masks (NIfTI),
    CSF masks (NIfTI), NAWM masks (NIfTI), and cortical masks (NIfTI) from a
    CSV file and produces the corresponding age maps. Please note that this
    version only accepts NIfTI (.nii/.nii.gz) files.

    NOTE: NAWM and cortical masks are optional. They will be used if they are
    included in the CSV file.

    Format of the CSV input file (NOTE: spaces are used to make the format
    clearer):

        path_to_mri_codebase_folder, mri_code_name, path_FLAIR, path_ICV,
        path_CSF, path_NAWM (optional), path_Cortical (optional)

    Example (NOTE: spaces are used to make the format clearer):

        /dir/MRI_DB/, MRI001, /dir/MRI_DB/MRI001/FLAIR.nii.gz,
        /dir/MRI_DB/MRI001/ICV.nii.gz, /dir/MRI_DB/MRI001/CSF.nii.gz,
        /dir/MRI_DB/MRI001/NAWM.nii.gz (optional),
        /dir/MRI_DB/MRI001/Cortex.nii.gz (optional)

    By default, the age maps are calculated by using four different sizes of
    source/target patches (i.e. 1x1, 2x2, 4x4, and 8x8) and 64 target samples.
    Furthermore, all intermediary files are saved as .mat (Matlab) and JPEG
    files.

    INPUT PARAMETERS:

    This function's behaviour can be set by using the input parameters below.

    1. output_filedir      : Path of directory for saving all results. Format
                             of the path: "output_path/name_of_experiment"
    2. csv_filename        : Name of a CSV input file which lists all files to
                             be processed by LOTS-IAM-GPU. Example: "input.csv"
    3. patch_size          : Size of source/target patches for IAM's
                             computation. Default: [1,2,4,8], which calculates
                             age maps from four different sizes of
                             source/target patches, i.e. 1x1, 2x2, 4x4, and
                             8x8. The sizes must be given as a Python list.
    4. blending_weights    : Weights used for blending age maps produced by
                             different sizes of source/target patches. The
                             weights must be given as a Python list, sum to 1,
                             and have the same length as `patch_size`.
    5. num_sample          : A list of numbers used for randomly sampling
                             target patches in the LOTS-IAM-GPU calculation.
                             Default: [512]. Available values:
                             [64, 128, 256, 512, 1024, 2048].
                             Some important notes:
                             a. A smaller number makes computation faster.
                             b. Pass several numbers as a list to
                                automatically produce age maps for each number
                                of target patches. The software automatically
                                creates a separate output folder for each
                                number of target samples.
                             c. In this version, only 64, 128, 256, 512, 1024,
                                and 2048 are accepted (an error is raised for
                                other numbers).
    6. alpha               : Weight of the distance function blending the
                             maximum difference and the average difference
                             between source and target patches. Default: 0.5.
                             The value should be between 0 and 1 (i.e. a
                             floating point). The current distance function is:

                                 d = (alpha . |max(s - t)|)
                                     + ((1 - alpha) . |mean(s - t)|)

                             where d is the distance value, s is a source
                             patch, and t is a target patch.
    7. bin_tresh           : Threshold value for cutting off probability
                             values of brain masks, if probability masks are
                             given instead of binary masks.
    8. save_jpeg           : True  --> Save all JPEG files for visualisation.
                             False --> Do not save the JPEG files.
    9. delete_intermediary : False --> Save all intermediary files (i.e.
                             JPEG/.mat files).
                             True  --> Delete all intermediary files, saving
                             some space on the hard disk drive.

    OUTPUT:

    The software automatically creates a new folder at the path given in the
    "output_filedir" variable. Please make sure that the directory is
    accessible and writable. Inside the experiment's folder, each patient/MRI
    mri_code has its own folder. By default, there are 6 sub-folders:

    1. 1: Contains age maps of each slice generated by using 1x1 patches.
    2. 2: Contains age maps of each slice generated by using 2x2 patches.
    3. 4: Contains age maps of each slice generated by using 4x4 patches.
    4. 8: Contains age maps of each slice generated by using 8x8 patches.
    5. IAM_combined_python: Contains two sub-folders:
       a. Patch: visualisation of the age maps of each slice in JPEG files, and
       b. Combined: visualisation of the final output of LOTS-IAM-GPU's
          computation.
    6. IAM_GPU_nifti_python: Contains one Matlab (.mat) file and three NIfTI
       files (.nii.gz):
       a. all_slice_dat.mat: processed mri_code of all slices in a Matlab file,
       b. IAM_GPU_COMBINED.nii.gz: the original age map values,
       c. IAM_GPU_GN.nii.gz: the final age map values (i.e. GN and penalty), and
       d. IAM_GPU_GN_postprocessed.nii.gz: the final age map values plus
          post-processing (only if a NAWM mask is provided).

    Note: If `delete_intermediary` is `True`, all folders listed above are
    deleted, except for the folder `IAM_GPU_nifti_python` and its contents.

    MORE HELP:

    Please read the README.md file provided at:
    https://github.com/febrianrachmadi/lots-iam-gpu

    VERSION (dd/mm/yyyy):
    - 31/05/2018b: NAWM and cortical brain masks are now optional inputs
                   (used if available).
    - 31/05/2018a: Fix header information of the LOTS-IAM-GPU's result.
    - 08/05/2018 : Add lines for cutting off the probability mask and for
                   deleting intermediary folders.
    - 07/05/2018 : Initial release code.
    '''

    ## Check availability of input files and output path
    if csv_filename == "":
        raise ValueError("Please set output folder's name and CSV mri_code "
                         "filename. See: help(iam_lots_gpu)")

    ## Check compatibility between 'patch_size' and 'blending_weights'
    if len(patch_size) != len(blending_weights):
        raise ValueError("Lengths of 'patch_size' and 'blending_weights' "
                         "variables are not the same. Length of 'patch_size' "
                         "is " + str(len(patch_size)) +
                         ", while 'blending_weights' is " +
                         str(len(blending_weights)) + ".")

    ## If intermediary files are to be deleted, don't even try to save JPEGs
    if delete_intermediary:
        save_jpeg = False

    ''' Set number of mean samples automatically '''
    ''' num_samples_all      = [64, 128, 256, 512, 1024, 2048] '''
    ''' num_mean_samples_all = [16,  32,  32,  64,  128,  128] '''
    num_samples_all = num_sample
    num_mean_samples_all = []
    for sample in num_samples_all:
        if sample == 64:
            num_mean_samples_all.append(16)
        elif sample == 128:
            num_mean_samples_all.append(32)
        elif sample == 256:
            num_mean_samples_all.append(32)
        elif sample == 512:
            num_mean_samples_all.append(64)
        elif sample == 1024:
            num_mean_samples_all.append(128)
        elif sample == 2048:
            num_mean_samples_all.append(128)
        else:
            raise ValueError("Number of samples must be either 64, 128, 256, "
                             "512, 1024 or 2048!")

    print("--- PARAMETERS - CHECKED ---")
    print('CSV mri_code filename: ' + csv_filename)
    print('Patch size(s): ' + str(patch_size))
    print('Number of samples (all): ' + str(num_samples_all))
    print('Number of mean samples (all): ' + str(num_mean_samples_all))
    print('Save JPEGs? ' + str(save_jpeg))
    print("--- PARAMETERS - CHECKED ---\n")

    for ii_s in range(0, len(num_samples_all)):
        num_samples = num_samples_all[ii_s]
        num_mean_samples = num_mean_samples_all[ii_s]
        print('Number of samples for IAM: ' + str(num_samples))
        print('Number of mean samples for IAM: ' + str(num_mean_samples))

        with open(csv_filename, newline='') as csv_file:
            num_subjects = len(csv_file.readlines())
            print('Number of subject(s): ' + str(num_subjects))

        with open(csv_filename, newline='', encoding="utf-8-sig") as csv_file:
            reader = csv.reader(csv_file)

            timer_idx = 0
            elapsed_times_all = np.zeros((num_subjects))
            elapsed_times_patch_all = np.zeros((num_subjects, len(patch_size)))
            for row in reader:
                mri_code = row[2]
                dirOutput = row[1]
                print('Output dir: ' + dirOutput + '\n--')
                try:
                    os.makedirs(dirOutput)
                except OSError as e:
                    if e.errno != errno.EEXIST:
                        raise

                print('--\nNow processing mri_code: ' + mri_code)
                inputSubjectDir = row[0]
                print('Input filename (full path): ' + inputSubjectDir)

                ''' Create output folder(s) '''
                dirOutData = dirOutput + '/' + mri_code
                dirOutDataCom = dirOutput + '/' + mri_code + '/IAM_combined_python/'
                dirOutDataPatch = dirOutput + '/' + mri_code + '/IAM_combined_python/Patch/'
                dirOutDataCombined = dirOutput + '/' + mri_code + '/IAM_combined_python/Combined/'
                try:
                    print(dirOutDataCom)
                    os.makedirs(dirOutDataCom)
                    os.makedirs(dirOutDataPatch)
                    os.makedirs(dirOutDataCombined)
                except OSError as e:
                    if e.errno != errno.EEXIST:
                        raise

                mri_data = sio.loadmat(row[0])  # Loading FLAIR
                mri_data = mri_data["flair"]
                [x_len, y_len, z_len] = mri_data.shape

                one_mri_data = timer()
                for xy in range(0, len(patch_size)):
                    print('>>> Processing patch-size: ' + str(patch_size[xy]) + ' <<<\n')
                    try:
                        os.makedirs(dirOutData + '/' + str(patch_size[xy]))
                    except OSError as e:
                        if e.errno != errno.EEXIST:
                            raise

                    one_patch = timer()
                    for zz in range(0, mri_data.shape[2]):
                        print('---> Slice number: ' + str(zz) + ' <---')

                        '''
                        KEY POINT: PRE-PROCESSING P.2 - START
                        -------------------------------------
                        This version still does per-slice operations for
                        extracting brain tissues. Two important variables used
                        in the next part of the code are:
                        1. mask_slice  ---> Combination of ICV & CSF masks. It
                           is used to find valid source patches for the
                           LOTS-IAM-GPU computation (i.e. source patches from
                           brain tissue).
                        2. brain_slice --> Brain tissue information from the
                           FLAIR slice.
                        '''
                        mask_slice = np.nan_to_num(mri_data[:, :, zz])
                        mask_slice[mask_slice > 0] = 1
                        brain_slice = np.nan_to_num(mri_data[:, :, zz])
                        '''
                        -----------------------------------
                        KEY POINT: PRE-PROCESSING P.2 - END
                        '''

                        ## Show brain slice to be used for computation
                        #fig, ax = plt.subplots()
                        #cax = ax.imshow(icv_slice, cmap="jet")
                        #cbar = fig.colorbar(cax)
                        #fig.show()
                        #plt.savefig("plot.jpg")

                        # Vol distance threshold
                        ## Proportion of brain slice compared to full image
                        vol_slice = np.count_nonzero(brain_slice) / (x_len * y_len)
                        ## x_len/y_len = 512 here
                        print('DEBUG-Patch: brain_slice - ' +
                              str(np.count_nonzero(brain_slice)) +
                              ', x_len * y_len - ' + str(x_len * y_len) +
                              ', vol: ' + str(vol_slice))

                        # Patch's sampling number threshold
                        TRSH = 0.50
                        if patch_size[xy] == 1:
                            if vol_slice < 0.010:
                                TRSH = 0
                            elif vol_slice < 0.035:
                                TRSH = 0.15
                            elif vol_slice < 0.070 and vol_slice >= 0.035:
                                TRSH = 0.60
                            elif vol_slice >= 0.070:
                                TRSH = 0.80
                        elif patch_size[xy] == 2:
                            if vol_slice < 0.010:
                                TRSH = 0
                            elif vol_slice < 0.035:
                                TRSH = 0.15
                            elif vol_slice < 0.070 and vol_slice >= 0.035:
                                TRSH = 0.60
                            elif vol_slice >= 0.070:
                                TRSH = 0.80
                        elif patch_size[xy] == 4 or patch_size[xy] == 8:
                            if vol_slice < 0.035:
                                TRSH = 0

                        print('DEBUG-Patch: Size - ' + str(patch_size[xy]) +
                              ', slice - ' + str(zz) + ', vol: ' +
                              str(vol_slice) + ', TRSH: ' + str(TRSH))

                        ## counter_y = 512 for 1x1 patches on a 512x512 image
                        counter_y = int(y_len / patch_size[xy])
                        counter_x = int(x_len / patch_size[xy])
                        ## Number of source patches needed (e.g. for size 1,
                        ## one per pixel)
                        source_patch_len = counter_x * counter_y
                        ## Age map that will be filled with the actual values
                        age_values_all = np.zeros(source_patch_len)

                        valid = 0
                        if (((vol_slice >= 0.008 and vol_slice < 0.035) and
                             (patch_size[xy] == 1 or patch_size[xy] == 2)) or
                            ((vol_slice >= 0.035 and vol_slice < 0.065) and
                             (patch_size[xy] == 1 or patch_size[xy] == 2 or
                              patch_size[xy] == 4)) or
                                (vol_slice > 0.065)):
                            valid = 1

                            ## Creating grid-patch 'xy-by-xy'
                            # -- Column
                            y_c = np.ceil(patch_size[xy] / 2)
                            y_c_sources = np.zeros(int(y_len / patch_size[xy]))
                            for iy in range(0, int(y_len / patch_size[xy])):
                                y_c_sources[iy] = (iy * patch_size[xy]) + y_c - 1

                            # -- Row
                            x_c = np.ceil(patch_size[xy] / 2)
                            x_c_sources = np.zeros(int(x_len / patch_size[xy]))
                            for ix in range(0, int(x_len / patch_size[xy])):
                                x_c_sources[ix] = (ix * patch_size[xy]) + x_c - 1

                            ''' Extracting Source Patches '''
                            area_source_patch = np.zeros([1, patch_size[xy], patch_size[xy]])
                            center_source_patch = np.zeros([1, 2])
                            icv_source_flag = np.zeros([source_patch_len])
                            icv_source_flag_valid = np.ones([source_patch_len])
                            index_mapping = np.ones([source_patch_len]) * -1

                            flag = 1
                            index = 0
                            index_source = 0
                            if patch_size[xy] == 1:
                                area_source_patch = brain_slice[mask_slice == 1]
                                area_source_patch = area_source_patch.reshape(
                                    [area_source_patch.shape[0], 1, 1])
                                index = source_patch_len
                                index_source = area_source_patch.shape[0]
                                icv_source_flag = mask_slice.flatten()
                                positive_indices = (np.where(brain_slice.flatten() > 0))[0]
                                index = 0
                                for i in positive_indices:
                                    index_mapping[i] = index
                                    index += 1
                            else:
                                area_source_patch = []
                                for isc in range(0, counter_x):
                                    for jsc in range(0, counter_y):
                                        icv_source_flag[index] = mask_slice[
                                            int(x_c_sources[isc]), int(y_c_sources[jsc])]
                                        if icv_source_flag[index] == 1:
                                            temp = get_area(x_c_sources[isc],
                                                            y_c_sources[jsc],
                                                            patch_size[xy],
                                                            patch_size[xy],
                                                            brain_slice)
                                            area_source_patch.append(temp.tolist())
                                            index_mapping[index] = index_source
                                            index_source += 1
                                        index += 1
                                area_source_patch = np.asarray(area_source_patch)

                            icv_source_flag_valid = icv_source_flag_valid[0:index_source]
                            age_values_valid = np.zeros(index_source)

                            """ TO DELETE, IT'S JUST FOR DISSERTATION
                            for i in range(area_source_patch.shape[2]):
                                plt.imshow(area_source_patch[i])  # Needs to be in row,col order
                                plt.savefig("test.jpg")
                            """

                            ''' Extracting Target Patches '''
                            target_patches = []
                            index_debug = 0
                            random_array = np.random.randint(10, size=(x_len, y_len))
                            index_possible = np.zeros(brain_slice.shape)
                            index_possible[(mask_slice != 0) & (random_array > TRSH * 10)] = 1
                            index_possible = np.argwhere(index_possible)

                            for index_chosen in index_possible:
                                x, y = index_chosen
                                area = get_area(x, y, patch_size[xy], patch_size[xy],
                                                brain_slice)
                                if area.size == patch_size[xy] * patch_size[xy]:
                                    if np.random.randint(low=1, high=10) / 10 < \
                                            (100 / (x * y)) * num_samples:
                                        pass
                                    target_patches.append(area)
                                    index_debug += 1

                            target_patches_np = get_shuffled_patches(target_patches,
                                                                     num_samples)
                            target_patches_np = target_patches_np[0:num_samples, :, :]
                            print('Sampling finished: ' + ' with: ' +
                                  str(index_debug) + ' samples from: ' +
                                  str(x_len * y_len))
                            area = []
                            ''''''

                            ''' Reshaping array mri_code '''
                            area_source_patch_cuda_all = np.reshape(
                                area_source_patch,
                                (area_source_patch.shape[0],
                                 area_source_patch.shape[1] * area_source_patch.shape[2]))
                            target_patches_np_cuda_all = np.reshape(
                                target_patches_np,
                                (target_patches_np.shape[0],
                                 target_patches_np.shape[1] * target_patches_np.shape[2]))

                            #if patch_size[xy] == 2:
                            #    code.interact(local=dict(globals(), **locals()))

                            melvin = timer()

                            source_len = icv_source_flag_valid.shape[0]
                            loop_len = 512  # def: 512
                            loop_num = int(np.ceil(source_len / loop_len))
                            print('\nLoop Information:')
                            print('Total number of source patches: ' + str(source_len))
                            print('Number of voxels processed in one loop: ' + str(loop_len))
                            print('Number of loops needed: ' + str(loop_num))
                            print('Check GPU memory: ' +
                                  str(cuda.current_context().get_memory_info()))

                            for il in range(0, loop_num):
                                ''' Debug-purposed printing '''
                                print('.', end='')
                                if np.remainder(il + 1, 32) == 0:
                                    # Print newline
                                    print(' ' + str(il + 1) + '/' + str(loop_num))

                                ''' Only process sub-array '''
                                source_patches_loop = area_source_patch_cuda_all[
                                    il * loop_len:(il * loop_len) + loop_len, :]

                                ''' SUBTRACTION '''
                                sub_result_gm = cuda.device_array((
                                    source_patches_loop.shape[0],
                                    target_patches_np_cuda_all.shape[0],
                                    target_patches_np_cuda_all.shape[1]))
                                TPB = (4, 256)
                                BPGx = int(math.ceil(source_patches_loop.shape[0] / TPB[0]))
                                BPGy = int(math.ceil(target_patches_np_cuda_all.shape[0] / TPB[1]))
                                BPGxy = (BPGx, BPGy)
                                cu_sub_st[BPGxy, TPB](source_patches_loop,
                                                      target_patches_np_cuda_all,
                                                      sub_result_gm)

                                ''' MAX-MEAN-ABS '''
                                sub_max_mean_result = cuda.device_array((
                                    source_patches_loop.shape[0],
                                    target_patches_np_cuda_all.shape[0], 2))
                                cu_max_mean_abs[BPGxy, TPB](sub_result_gm,
                                                            sub_max_mean_result)
                                sub_result_gm = 0  # Free memory

                                ''' DISTANCE '''
                                distances_result = cuda.device_array((
                                    source_patches_loop.shape[0],
                                    target_patches_np_cuda_all.shape[0]))
                                cu_distances[BPGxy, TPB](
                                    sub_max_mean_result,
                                    icv_source_flag_valid[il * loop_len:(il * loop_len) + loop_len],
                                    distances_result, alpha)
                                sub_max_mean_result = 0  # Free memory

                                ''' SORT '''
                                TPB = 256
                                BPG = int(math.ceil(distances_result.shape[0] / TPB))
                                cu_sort_distance[BPG, TPB](distances_result)

                                ''' MEAN (AGE-VALUE) '''
                                # Starting index of mean calculation (to avoid bad examples)
                                idx_start = 8
                                distances_result_for_age = distances_result[
                                    :, idx_start:idx_start + num_mean_samples]
                                distances_result = 0  # Free memory
                                cu_age_value[BPG, TPB](
                                    distances_result_for_age,
                                    age_values_valid[il * loop_len:(il * loop_len) + loop_len])
                                distances_result_for_age = 0  # Free memory
                                del source_patches_loop  # Free memory
                                #code.interact(local=dict(globals(), **locals()))

                            print(' - Finished!\n')
                            print(timer() - melvin)
                            # raise Exception()  # debug leftover, disabled

                        ''' Mapping from age_values_valid to age_values_all '''
                        if valid == 1:
                            index = 0
                            for idx_val in index_mapping:
                                if idx_val != -1:
                                    age_values_all[index] = age_values_valid[int(idx_val)]
                                index += 1

                        ''' Normalisation to probabilistic map (0...1) '''
                        if (np.max(age_values_all) - np.min(age_values_all)) == 0:
                            all_mean_distance_normed = age_values_all
                        else:
                            all_mean_distance_normed = np.divide(
                                (age_values_all - np.min(age_values_all)),
                                (np.max(age_values_all) - np.min(age_values_all)))

                        ''' SAVE Result (JPG) '''
                        slice_age_map = np.zeros([counter_x, counter_y])
                        index = 0
                        for ix in range(0, counter_x):
                            for iy in range(0, counter_y):
                                slice_age_map[ix, iy] = all_mean_distance_normed[index]
                                index += 1

                        ## Save mri_data
                        sio.savemat(dirOutData + '/' + str(patch_size[xy]) + '/' +
                                    str(zz) + '_dat.mat',
                                    {'slice_age_map': slice_age_map})

                    print('Check GPU memory: ' +
                          str(cuda.current_context().get_memory_info()))
                    print('GPU flushing..\n--\n')
                    numba.cuda.profile_stop()

                    elapsed_times_patch_all[timer_idx, xy] = timer() - one_patch
                    print('IAM for MRI ID: ' + mri_code + ' with patch size: ' +
                          str(patch_size[xy]) + ' elapsed for: ' +
                          str(elapsed_times_patch_all[timer_idx, xy]))

                elapsed_times_all[timer_idx] = timer() - one_mri_data
                print('IAM for MRI ID: ' + mri_code + ' elapsed for: ' +
                      str(elapsed_times_all[timer_idx]))
                timer_idx += 1

                ''' Save all elapsed times '''
                sio.savemat(dirOutput + '/elapsed_times_all_' + str(num_samples) +
                            's' + str(num_mean_samples) + 'm.mat',
                            {'elapsed_times_all': elapsed_times_all})
                sio.savemat(dirOutput + '/elapsed_times_patch_all_' +
                            str(num_samples) + 's' + str(num_mean_samples) + 'm.mat',
                            {'elapsed_times_patch_all': elapsed_times_patch_all})

                ''' IAM's (GPU Part) Computation ENDS here '''

                '''
                KEY POINT: IAM's Combination, Penalisation, and Post-processing - START
                -----------------------------------------------------------------------
                Part 0 - Saving output results in .mat and JPEG files.
                Part 1 - Combination of multiple age maps.
                Part 2 - Global normalisation and penalisation of age maps
                         based on brain tissues.
                Part 3 - Post-processing.
                Hint: You can search for the keys of Part 0/1/2/3.
                '''
                combined_age_map_mri = np.zeros((x_len, y_len, z_len))
                combined_age_map_mri_mult = np.zeros((x_len, y_len, z_len))
                combined_age_map_mri_mult_normed = np.zeros((x_len, y_len, z_len))
                for zz in range(0, mri_data.shape[2]):
                    mri_slice = mri_data[:, :, zz]
                    mask_slice = np.nan_to_num(mri_slice)
                    mask_slice[mask_slice > 0] = 1
                    penalty_slice = np.nan_to_num(mri_slice)  ### PENALTY

                    slice_age_map_all = np.zeros((len(patch_size), x_len, y_len))
                    dirOutData = dirOutput + '/' + mri_code
                    for xy in range(0, len(patch_size)):
                        mat_contents = sio.loadmat(dirOutData + '/' +
                                                   str(patch_size[xy]) + '/' +
                                                   str(zz) + '_dat.mat')
                        slice_age_map = mat_contents['slice_age_map']
                        slice_age_map_res = cv2.resize(slice_age_map, None,
                                                       fx=patch_size[xy],
                                                       fy=patch_size[xy],
                                                       interpolation=cv2.INTER_CUBIC)
                        slice_age_map_res = skifilters.gaussian(slice_age_map_res,
                                                                sigma=0.5,
                                                                truncate=2.0)
                        #if zz == 20:
                        #    code.interact(local=dict(globals(), **locals()))
                        slice_age_map_res = np.multiply(mask_slice, slice_age_map_res)
                        slice_age_map_all[xy, :, :] = slice_age_map_res
                    slice_age_map_all = np.nan_to_num(slice_age_map_all)

                    if save_jpeg:
                        ''' >>> Part 0 <<< '''
                        ''' Show all age maps by patch size and save the mri_data '''
                        fig, axes = plt.subplots(2, 2, sharex=True, sharey=True)
                        fig.set_size_inches(10, 10)
                        fig.suptitle('All Patches Gaussian Filtered', fontsize=16)

                        axes[0, 0].set_title('Patch 1 x 1')
                        im1 = axes[0, 0].imshow(np.rot90(slice_age_map_all[0, :, :]),
                                                cmap="jet", vmin=0, vmax=1)
                        divider1 = make_axes_locatable(axes[0, 0])
                        cax1 = divider1.append_axes("right", size="7%", pad=0.05)
                        cbar1 = plt.colorbar(im1, ticks=[0, 0.5, 1], cax=cax1)

                        if len(patch_size) > 1:
                            axes[0, 1].set_title('Patch 2 x 2')
                            im2 = axes[0, 1].imshow(np.rot90(slice_age_map_all[1, :, :]),
                                                    cmap="jet", vmin=0, vmax=1)
                            divider2 = make_axes_locatable(axes[0, 1])
                            cax2 = divider2.append_axes("right", size="7%", pad=0.05)
                            cbar2 = plt.colorbar(im2, ticks=[0, 0.5, 1], cax=cax2)

                        if len(patch_size) > 2:
                            axes[1, 0].set_title('Patch 4 x 4')
                            im3 = axes[1, 0].imshow(np.rot90(slice_age_map_all[2, :, :]),
                                                    cmap="jet", vmin=0, vmax=1)
                            divider3 = make_axes_locatable(axes[1, 0])
                            cax3 = divider3.append_axes("right", size="7%", pad=0.05)
                            cbar3 = plt.colorbar(im3, ticks=[0, 0.5, 1], cax=cax3)

                        if len(patch_size) > 3:
                            axes[1, 1].set_title('Patch 8 x 8')
                            im4 = axes[1, 1].imshow(np.rot90(slice_age_map_all[3, :, :]),
                                                    cmap="jet", vmin=0, vmax=1)
                            divider4 = make_axes_locatable(axes[1, 1])
                            cax4 = divider4.append_axes("right", size="7%", pad=0.05)
                            cbar4 = plt.colorbar(im4, ticks=[0, 0.5, 1], cax=cax4)

                        plt.tight_layout()
                        plt.subplots_adjust(top=0.95)

                        ''' >>> Part 0 <<< '''
                        ''' Save mri_data in *_all.jpg '''
                        dirOutData = dirOutput + '/' + mri_code + '/IAM_combined_python/Patch/'
                        fig.savefig(dirOutData + str(zz) + '_all.jpg', dpi=100)
                        print('Saving files: ' + dirOutData + str(zz) + '_all.jpg')
                        plt.close()

                    ''' >>> Part 1 <<< '''
                    ''' Combine age map information from all patch sizes '''
                    combined_age_map = 0
                    for bi in range(len(patch_size)):
                        combined_age_map += np.multiply(blending_weights[bi],
                                                        slice_age_map_all[bi, :, :])
                    combined_age_map_mri[:, :, zz] = combined_age_map

                    ''' Global Normalisation - saving needed mri_data '''
                    combined_age_map_mri_mult[:, :, zz] = np.multiply(
                        np.multiply(combined_age_map, penalty_slice),
                        mask_slice)  ### PENALTY
                    normed_only = np.divide(
                        (combined_age_map_mri[:, :, zz] -
                         np.min(combined_age_map_mri[:, :, zz])),
                        (np.max(combined_age_map_mri[:, :, zz]) -
                         np.min(combined_age_map_mri[:, :, zz])))
                    normed_mult = np.multiply(np.multiply(normed_only, penalty_slice),
                                              mask_slice)  ### PENALTY
                    normed_mult_normed = np.divide(
                        (normed_mult - np.min(normed_mult)),
                        (np.max(normed_mult) - np.min(normed_mult)))
                    combined_age_map_mri_mult_normed[:, :, zz] = normed_mult_normed

                    ''' Save mri_data in *.mat '''
                    dirOutData = dirOutput + '/' + mri_code + '/IAM_combined_python/Patch/'
                    print('Saving files: ' + dirOutData + 'c' + str(zz) + '_combined.mat\n')
                    sio.savemat(dirOutData + 'c' + str(zz) + '_combined.mat',
                                {'slice_age_map_all': slice_age_map_all,
                                 'combined_age_map': normed_only,
                                 'mri_slice_mul_normed': normed_mult_normed,
                                 'combined_mult': combined_age_map_mri_mult[:, :, zz]})

                ''' >>> Part 2 <<< '''
                ''' Penalty + Global Normalisation (GN) '''
                combined_age_map_mri_normed = np.divide(
                    (combined_age_map_mri - np.min(combined_age_map_mri)),
                    (np.max(combined_age_map_mri) - np.min(combined_age_map_mri)))
                combined_age_map_mri_mult_normed = np.divide(
                    (combined_age_map_mri_mult - np.min(combined_age_map_mri_mult)),
                    (np.max(combined_age_map_mri_mult) - np.min(combined_age_map_mri_mult)))

                if save_jpeg:
                    for zz in range(0, mri_data.shape[2]):
                        fig2, axes2 = plt.subplots(1, 3)
                        fig2.set_size_inches(16, 5)

                        axes2[0].set_title('Combined and normalised')
                        im1 = axes2[0].imshow(
                            np.rot90(np.nan_to_num(combined_age_map_mri_normed[:, :, zz])),
                            cmap="jet", vmin=0, vmax=1)
                        divider1 = make_axes_locatable(axes2[0])
                        cax1 = divider1.append_axes("right", size="7%", pad=0.05)
                        cbar1 = plt.colorbar(im1, ticks=[0, 0.5, 1], cax=cax1)

                        axes2[1].set_title('Combined, penalised and normalised')
                        im2 = axes2[1].imshow(
                            np.rot90(np.nan_to_num(combined_age_map_mri_mult_normed[:, :, zz])),
                            cmap="jet", vmin=0, vmax=1)
                        divider2 = make_axes_locatable(axes2[1])
                        cax2 = divider2.append_axes("right", size="7%", pad=0.05)
                        cbar2 = plt.colorbar(im2, ticks=[0, 0.5, 1], cax=cax2)

                        axes2[2].set_title('Original MRI slice')
                        im3 = axes2[2].imshow(np.rot90(np.nan_to_num(mri_data[:, :, zz])),
                                              cmap="gray")
                        divider3 = make_axes_locatable(axes2[2])
                        cax3 = divider3.append_axes("right", size="7%", pad=0.05)
                        cbar3 = plt.colorbar(im3, cax=cax3)

                        plt.tight_layout()
                        # Make space for title
                        plt.subplots_adjust(top=0.95)

                        ''' Save mri_data in *_combined.jpg '''
                        dirOutData = dirOutput + '/' + mri_code + '/IAM_combined_python/Combined/'
                        fig2.savefig(dirOutData + str(zz) + '_combined.jpg', dpi=100)
                        print('Saving files: ' + dirOutData + str(zz) + '_combined.jpg')
                        plt.close()

                ''' Save mri_data in *.mat '''
                sio.savemat(dirOutDataCom + '/all_slice_dat.mat',
                            {'combined_age_map_all_slice': combined_age_map_mri,
                             'mri_slice_mul_all_slice': combined_age_map_mri_mult,
                             'combined_age_map_mri_normed': combined_age_map_mri_normed,
                             'combined_age_map_mri_mult_normed': combined_age_map_mri_mult_normed})

                '''
                combined_age_map_mri_img = nib.Nifti1Image(combined_age_map_mri_normed,
                                                           mri_nii.affine)
                nib.save(combined_age_map_mri_img,
                         str(dirOutDataFin + '/IAM_GPU_COMBINED.nii.gz'))
                combined_age_map_mri_GN_img = nib.Nifti1Image(combined_age_map_mri_mult_normed,
                                                              mri_nii.affine)
                nib.save(combined_age_map_mri_GN_img,
                         str(dirOutDataFin + '/IAM_GPU_GN.nii.gz'))
                '''

                ''' >>> Part 3 <<< '''
                ''' Post-processing '''
                ''' COMMENTED OUT BECAUSE NOT AVAILABLE
                if nawm_available and ~nawm_preprocessing:
                    combined_age_map_mri_mult_normed = np.multiply(
                        combined_age_map_mri_mult_normed, nawm_mri_code)
                    combined_age_map_mri_GN_img = nib.Nifti1Image(
                        combined_age_map_mri_mult_normed, mri_nii.affine)
                    nib.save(combined_age_map_mri_GN_img,
                             str(dirOutDataFin + '/IAM_GPU_GN_postprocessed.nii.gz'))
                '''

                '''
                ---------------------------------------------------------------------
                KEY POINT: IAM's Combination, Penalisation, and Post-processing - END
                '''

                if delete_intermediary:
                    shutil.rmtree(dirOutDataCom, ignore_errors=True)
                    for xy in range(0, len(patch_size)):
                        shutil.rmtree(dirOutput + '/' + mri_code + '/' +
                                      str(patch_size[xy]), ignore_errors=True)

                del temp
                del center_source_patch, icv_source_flag
                del icv_source_flag_valid, index_mapping
                del area_source_patch, target_patches_np               # Free memory
                del area_source_patch_cuda_all, target_patches_np_cuda_all  # Free memory
                gc.collect()

    ## Print the elapsed time information
    print('\n--\nSpeed statistics of this run..')
    print('mean elapsed time   : ' + str(np.mean(elapsed_times_all)) + ' seconds')
    print('std elapsed time    : ' + str(np.std(elapsed_times_all)) + ' seconds')
    print('median elapsed time : ' + str(np.median(elapsed_times_all)) + ' seconds')
    print('min elapsed time    : ' + str(np.min(elapsed_times_all)) + ' seconds')
    print('max elapsed time    : ' + str(np.max(elapsed_times_all)) + ' seconds')
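# Reference (CPU) sketch of the distance function described in the docstring
# above, d = (alpha . |max(s - t)|) + ((1 - alpha) . |mean(s - t)|), which the
# cu_distances kernel evaluates on the GPU for every source/target patch pair.
# The function name is illustrative; it does not appear in the source.
def iam_distance(s, t, alpha=0.5):
    diff = s.astype(np.float64) - t.astype(np.float64)
    return alpha * np.abs(diff.max()) + (1.0 - alpha) * np.abs(diff.mean())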
def gpu_stump(T_A, m, T_B=None, ignore_trivial=True, device_id=0):
    """
    Compute the matrix profile with GPU-STOMP

    This is a convenience wrapper around the Numba `cuda.jit` `_gpu_stump`
    function which computes the matrix profile according to GPU-STOMP.

    Parameters
    ----------
    T_A : ndarray
        The time series or sequence for which to compute the matrix profile

    m : int
        Window size

    T_B : ndarray, optional
        The time series or sequence that contains your query subsequences of
        interest. Default is `None` which corresponds to a self-join.

    ignore_trivial : bool
        Set to `True` if this is a self-join. Otherwise, for AB-join, set this
        to `False`. Default is `True`.

    device_id : int or list
        The (GPU) device number to use. The default value is `0`. A list of
        valid device ids (int) may also be provided for parallel GPU-STUMP
        computation. A list of all valid device ids can be obtained by
        executing `[device.id for device in cuda.list_devices()]`.

    Returns
    -------
    out : ndarray
        The first column consists of the matrix profile, the second column
        consists of the matrix profile indices, the third column consists of
        the left matrix profile indices, and the fourth column consists of
        the right matrix profile indices.

    Notes
    -----
    `DOI: 10.1109/ICDM.2016.0085 \
    <https://www.cs.ucr.edu/~eamonn/STOMP_GPU_final_submission_camera_ready.pdf>`__

    See Table II, Figure 5, and Figure 6

    Timeseries, T_B, will be annotated with the distance location (or index)
    of all its subsequences in another time series, T_A.

    Return: For every subsequence, Q, in T_B, you will get a distance and
    index for the closest subsequence in T_A. Thus, the array returned will
    have length T_B.shape[0]-m+1. Additionally, the left and right matrix
    profiles are also returned.

    Note: Unlike in Table II where T_A.shape is expected to be equal to
    T_B.shape, this implementation is generalized so that the shapes of T_A
    and T_B can be different. In the case where T_A.shape == T_B.shape, then
    our algorithm reduces down to the same algorithm found in Table II.

    Additionally, unlike STAMP where the exclusion zone is m/2, the default
    exclusion zone for STOMP is m/4 (See Definition 3 and Figure 3).

    For self-joins, set `ignore_trivial = True` in order to avoid the trivial
    match.

    Note that left and right matrix profiles are only available for
    self-joins.
    """
    if T_B is None:  # Self join!
        T_B = T_A
        ignore_trivial = True

    # Swap T_A and T_B for GPU implementation
    # This keeps the API identical to and compatible with `stumpy.stump`
    tmp_T = T_A
    T_A = T_B
    T_B = tmp_T

    T_A, M_T, Σ_T = core.preprocess(T_A, m)
    T_B, μ_Q, σ_Q = core.preprocess(T_B, m)

    if T_A.ndim != 1:  # pragma: no cover
        raise ValueError(
            f"T_A is {T_A.ndim}-dimensional and must be 1-dimensional. "
            "For multidimensional STUMP use `stumpy.mstump` or `stumpy.mstumped`"
        )

    if T_B.ndim != 1:  # pragma: no cover
        raise ValueError(
            f"T_B is {T_B.ndim}-dimensional and must be 1-dimensional. "
            "For multidimensional STUMP use `stumpy.mstump` or `stumpy.mstumped`"
        )

    core.check_dtype(T_A)
    core.check_dtype(T_B)

    core.check_window_size(m)

    if ignore_trivial is False and core.are_arrays_equal(T_A, T_B):  # pragma: no cover
        logger.warning("Arrays T_A, T_B are equal, which implies a self-join.")
        logger.warning("Try setting `ignore_trivial = True`.")

    if ignore_trivial and core.are_arrays_equal(T_A, T_B) is False:  # pragma: no cover
        logger.warning("Arrays T_A, T_B are not equal, which implies an AB-join.")
        logger.warning("Try setting `ignore_trivial = False`.")

    n = T_B.shape[0]
    k = T_A.shape[0] - m + 1
    l = n - m + 1
    excl_zone = int(np.ceil(m / 4))  # See Definition 3 and Figure 3

    T_A_fname = core.array_to_temp_file(T_A)
    T_B_fname = core.array_to_temp_file(T_B)
    M_T_fname = core.array_to_temp_file(M_T)
    Σ_T_fname = core.array_to_temp_file(Σ_T)
    μ_Q_fname = core.array_to_temp_file(μ_Q)
    σ_Q_fname = core.array_to_temp_file(σ_Q)

    out = np.empty((k, 4), dtype=object)

    if isinstance(device_id, int):
        device_ids = [device_id]
    else:
        device_ids = device_id

    profile = [None] * len(device_ids)
    indices = [None] * len(device_ids)

    for _id in device_ids:
        with cuda.gpus[_id]:
            if (
                cuda.current_context().__class__.__name__ != "FakeCUDAContext"
            ):  # pragma: no cover
                cuda.current_context().deallocations.clear()

    step = 1 + l // len(device_ids)

    # Start process pool for multi-GPU request
    if len(device_ids) > 1:  # pragma: no cover
        mp.set_start_method("spawn", force=True)
        p = mp.Pool(processes=len(device_ids))
        results = [None] * len(device_ids)

    QT_fnames = []
    QT_first_fnames = []

    for idx, start in enumerate(range(0, l, step)):
        stop = min(l, start + step)

        QT, QT_first = _get_QT(start, T_A, T_B, m)
        QT_fname = core.array_to_temp_file(QT)
        QT_first_fname = core.array_to_temp_file(QT_first)
        QT_fnames.append(QT_fname)
        QT_first_fnames.append(QT_first_fname)

        if len(device_ids) > 1 and idx < len(device_ids) - 1:  # pragma: no cover
            # Spawn and execute in child process for multi-GPU request
            results[idx] = p.apply_async(
                _gpu_stump,
                (
                    T_A_fname,
                    T_B_fname,
                    m,
                    stop,
                    excl_zone,
                    M_T_fname,
                    Σ_T_fname,
                    QT_fname,
                    QT_first_fname,
                    μ_Q_fname,
                    σ_Q_fname,
                    k,
                    ignore_trivial,
                    start + 1,
                    device_ids[idx],
                ),
            )
        else:
            # Execute last chunk in parent process
            # Only parent process is executed when a single GPU is requested
            profile[idx], indices[idx] = _gpu_stump(
                T_A_fname,
                T_B_fname,
                m,
                stop,
                excl_zone,
                M_T_fname,
                Σ_T_fname,
                QT_fname,
                QT_first_fname,
                μ_Q_fname,
                σ_Q_fname,
                k,
                ignore_trivial,
                start + 1,
                device_ids[idx],
            )

    # Clean up process pool for multi-GPU request
    if len(device_ids) > 1:  # pragma: no cover
        p.close()
        p.join()

        # Collect results from spawned child processes if they exist
        for idx, result in enumerate(results):
            if result is not None:
                profile[idx], indices[idx] = result.get()

    os.remove(T_A_fname)
    os.remove(T_B_fname)
    os.remove(M_T_fname)
    os.remove(Σ_T_fname)
    os.remove(μ_Q_fname)
    os.remove(σ_Q_fname)
    for QT_fname in QT_fnames:
        os.remove(QT_fname)
    for QT_first_fname in QT_first_fnames:
        os.remove(QT_first_fname)

    for idx in range(len(device_ids)):
        profile_fname = profile[idx]
        indices_fname = indices[idx]
        profile[idx] = np.load(profile_fname, allow_pickle=False)
        indices[idx] = np.load(indices_fname, allow_pickle=False)
        os.remove(profile_fname)
        os.remove(indices_fname)

    for i in range(1, len(device_ids)):
        # Update all matrix profiles and matrix profile indices
        # (global, left, right) and store in profile[0] and indices[0]
        for col in range(profile[0].shape[1]):  # pragma: no cover
            cond = profile[0][:, col] < profile[i][:, col]
            profile[0][:, col] = np.where(cond, profile[0][:, col], profile[i][:, col])
            indices[0][:, col] = np.where(cond, indices[0][:, col], indices[i][:, col])

    out[:, 0] = profile[0][:, 0]
    out[:, 1:4] = indices[0][:, :]

    threshold = 10e-6
    if core.are_distances_too_small(out[:, 0], threshold=threshold):  # pragma: no cover
        logger.warning(f"A large number of values are smaller than {threshold}.")
        logger.warning("For a self-join, try setting `ignore_trivial = True`.")

    return out
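# Minimal usage sketch for the wrapper above (single GPU, self-join); `T`
# stands in for any 1-D float time series and is not defined in the source.
mp_out = gpu_stump(T, m=50)
motif_idx = np.argmin(mp_out[:, 0])  # index of the best-conserved motif pair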
def gpu_aamp(T_A, m, T_B=None, ignore_trivial=True, device_id=0):
    """
    Compute the non-normalized (i.e., without z-normalization) matrix profile
    with one or more GPU devices

    This is a convenience wrapper around the Numba `cuda.jit` `_gpu_aamp`
    function which computes the non-normalized matrix profile according to
    a modified version of GPU-STOMP.

    Parameters
    ----------
    T_A : ndarray
        The time series or sequence for which to compute the matrix profile

    m : int
        Window size

    T_B : ndarray, default None
        The time series or sequence that contains your query subsequences of
        interest. Default is `None` which corresponds to a self-join.

    ignore_trivial : bool, default True
        Set to `True` if this is a self-join. Otherwise, for AB-join, set this
        to `False`. Default is `True`.

    device_id : int or list, default 0
        The (GPU) device number to use. The default value is `0`. A list of
        valid device ids (int) may also be provided for parallel GPU-STUMP
        computation. A list of all valid device ids can be obtained by
        executing `[device.id for device in numba.cuda.list_devices()]`.

    Returns
    -------
    out : ndarray
        The first column consists of the matrix profile, the second column
        consists of the matrix profile indices, the third column consists of
        the left matrix profile indices, and the fourth column consists of
        the right matrix profile indices.

    Notes
    -----
    `arXiv:1901.05708 \
    <https://arxiv.org/pdf/1901.05708.pdf>`__

    See Algorithm 1

    Note that we have extended this algorithm for AB-joins as well.

    `DOI: 10.1109/ICDM.2016.0085 \
    <https://www.cs.ucr.edu/~eamonn/STOMP_GPU_final_submission_camera_ready.pdf>`__

    See Table II, Figure 5, and Figure 6
    """
    if T_B is None:  # Self join!
        T_B = T_A
        ignore_trivial = True

    T_A, T_A_subseq_isfinite = core.preprocess_non_normalized(T_A, m)
    T_B, T_B_subseq_isfinite = core.preprocess_non_normalized(T_B, m)
    T_A_subseq_squared = np.sum(core.rolling_window(T_A * T_A, m), axis=1)
    T_B_subseq_squared = np.sum(core.rolling_window(T_B * T_B, m), axis=1)

    if T_A.ndim != 1:  # pragma: no cover
        raise ValueError(
            f"T_A is {T_A.ndim}-dimensional and must be 1-dimensional. "
            "For multidimensional STUMP use `stumpy.mstump` or `stumpy.mstumped`"
        )

    if T_B.ndim != 1:  # pragma: no cover
        raise ValueError(
            f"T_B is {T_B.ndim}-dimensional and must be 1-dimensional. "
            "For multidimensional STUMP use `stumpy.mstump` or `stumpy.mstumped`"
        )

    core.check_window_size(m, max_size=min(T_A.shape[0], T_B.shape[0]))

    if ignore_trivial is False and core.are_arrays_equal(T_A, T_B):  # pragma: no cover
        logger.warning("Arrays T_A, T_B are equal, which implies a self-join.")
        logger.warning("Try setting `ignore_trivial = True`.")

    if ignore_trivial and core.are_arrays_equal(T_A, T_B) is False:  # pragma: no cover
        logger.warning("Arrays T_A, T_B are not equal, which implies an AB-join.")
        logger.warning("Try setting `ignore_trivial = False`.")

    n = T_B.shape[0]
    k = T_A.shape[0] - m + 1
    l = n - m + 1
    excl_zone = int(np.ceil(m / 4))  # See Definition 3 and Figure 3

    T_A_fname = core.array_to_temp_file(T_A)
    T_B_fname = core.array_to_temp_file(T_B)
    T_A_subseq_isfinite_fname = core.array_to_temp_file(T_A_subseq_isfinite)
    T_B_subseq_isfinite_fname = core.array_to_temp_file(T_B_subseq_isfinite)
    T_A_subseq_squared_fname = core.array_to_temp_file(T_A_subseq_squared)
    T_B_subseq_squared_fname = core.array_to_temp_file(T_B_subseq_squared)

    out = np.empty((k, 4), dtype=object)

    if isinstance(device_id, int):
        device_ids = [device_id]
    else:
        device_ids = device_id

    profile = [None] * len(device_ids)
    indices = [None] * len(device_ids)

    for _id in device_ids:
        with cuda.gpus[_id]:
            if (
                cuda.current_context().__class__.__name__ != "FakeCUDAContext"
            ):  # pragma: no cover
                cuda.current_context().deallocations.clear()

    step = 1 + l // len(device_ids)

    # Start process pool for multi-GPU request
    if len(device_ids) > 1:  # pragma: no cover
        mp.set_start_method("spawn", force=True)
        p = mp.Pool(processes=len(device_ids))
        results = [None] * len(device_ids)

    QT_fnames = []
    QT_first_fnames = []

    for idx, start in enumerate(range(0, l, step)):
        stop = min(l, start + step)

        QT, QT_first = core._get_QT(start, T_A, T_B, m)
        QT_fname = core.array_to_temp_file(QT)
        QT_first_fname = core.array_to_temp_file(QT_first)
        QT_fnames.append(QT_fname)
        QT_first_fnames.append(QT_first_fname)

        if len(device_ids) > 1 and idx < len(device_ids) - 1:  # pragma: no cover
            # Spawn and execute in child process for multi-GPU request
            results[idx] = p.apply_async(
                _gpu_aamp,
                (
                    T_A_fname,
                    T_B_fname,
                    m,
                    stop,
                    excl_zone,
                    T_A_subseq_isfinite_fname,
                    T_B_subseq_isfinite_fname,
                    T_A_subseq_squared_fname,
                    T_B_subseq_squared_fname,
                    QT_fname,
                    QT_first_fname,
                    k,
                    ignore_trivial,
                    start + 1,
                    device_ids[idx],
                ),
            )
        else:
            # Execute last chunk in parent process
            # Only parent process is executed when a single GPU is requested
            profile[idx], indices[idx] = _gpu_aamp(
                T_A_fname,
                T_B_fname,
                m,
                stop,
                excl_zone,
                T_A_subseq_isfinite_fname,
                T_B_subseq_isfinite_fname,
                T_A_subseq_squared_fname,
                T_B_subseq_squared_fname,
                QT_fname,
                QT_first_fname,
                k,
                ignore_trivial,
                start + 1,
                device_ids[idx],
            )

    # Clean up process pool for multi-GPU request
    if len(device_ids) > 1:  # pragma: no cover
        p.close()
        p.join()

        # Collect results from spawned child processes if they exist
        for idx, result in enumerate(results):
            if result is not None:
                profile[idx], indices[idx] = result.get()

    os.remove(T_A_fname)
    os.remove(T_B_fname)
    os.remove(T_A_subseq_isfinite_fname)
    os.remove(T_B_subseq_isfinite_fname)
    os.remove(T_A_subseq_squared_fname)
    os.remove(T_B_subseq_squared_fname)
    for QT_fname in QT_fnames:
        os.remove(QT_fname)
    for QT_first_fname in QT_first_fnames:
        os.remove(QT_first_fname)

    for idx in range(len(device_ids)):
        profile_fname = profile[idx]
        indices_fname = indices[idx]
        profile[idx] = np.load(profile_fname, allow_pickle=False)
        indices[idx] = np.load(indices_fname, allow_pickle=False)
        os.remove(profile_fname)
        os.remove(indices_fname)

    for i in range(1, len(device_ids)):
        # Update all matrix profiles and matrix profile indices
        # (global, left, right) and store in profile[0] and indices[0]
        for col in range(profile[0].shape[1]):  # pragma: no cover
            cond = profile[0][:, col] < profile[i][:, col]
            profile[0][:, col] = np.where(cond, profile[0][:, col], profile[i][:, col])
            indices[0][:, col] = np.where(cond, indices[0][:, col], indices[i][:, col])

    out[:, 0] = profile[0][:, 0]
    out[:, 1:4] = indices[0][:, :]

    threshold = 10e-6
    if core.are_distances_too_small(out[:, 0], threshold=threshold):  # pragma: no cover
        logger.warning(f"A large number of values are smaller than {threshold}.")
        logger.warning("For a self-join, try setting `ignore_trivial = True`.")

    return out
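# A minimal usage sketch for `gpu_aamp` above (assumes `stumpy` is installed
# and GPU 0 is available; the data is synthetic). The first output column holds
# the non-normalized matrix profile, the second the nearest-neighbor indices.
import numpy as np
import stumpy

if __name__ == "__main__":
    T = np.random.rand(5_000)
    out = stumpy.gpu_aamp(T, m=100)  # non-normalized self-join on GPU 0
    nn_dist, nn_idx = out[:, 0], out[:, 1]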
# Helper libraries
import numpy as np
import matplotlib.pyplot as plt
from time import time

import tensorflow as tf
from tensorflow import keras

print(tf.__version__)

#### USE CPU if you wish, etc.
# python -m pip install numba
from numba import cuda

device = cuda.get_current_device()
device.reset()
# Note: older Numba versions exposed this as `trashing.clear()`; current
# releases use the `deallocations` attribute instead.
cuda.current_context().deallocations.clear()

cpu = False  # set to True to force CPU execution
if cpu:
    # tf.config.experimental.set_visible_devices(device_type='CPU') does not
    # seem to work on TF 2.1, so hide the GPUs explicitly instead:
    print("Using CPU!")
    tf.config.set_visible_devices([], 'GPU')
    tf.debugging.set_log_device_placement(True)

fashion_mnist = keras.datasets.fashion_mnist
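# An alternative sketch to the CPU toggle above: hiding the GPU via
# CUDA_VISIBLE_DEVICES before TensorFlow is imported avoids having to reset
# the Numba device afterwards (assumes nothing else in the process needs the GPU).
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "-1"  # must be set before importing TF

import tensorflow as tf

print(tf.config.list_physical_devices("GPU"))  # expected: []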
def test_initialized_in_context(self): # If we have a CUDA context, it should already have initialized its # memory manager. self.assertTrue(cuda.current_context().memory_manager.initialized)
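# A quick way to confirm, outside the test suite, that obtaining the current
# context also brings up a working memory manager: querying free/total device
# memory should succeed immediately (a sketch; the values vary per device).
from numba import cuda

ctx = cuda.current_context()
free, total = ctx.get_memory_info()
print(f"{free} bytes free of {total}")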
def gpu_stump(
    T_A,
    m,
    T_B=None,
    ignore_trivial=True,
    threads_per_block=THREADS_PER_BLOCK,
    device_id=0,
):
    """
    Compute the matrix profile with GPU-STOMP

    This is a convenience wrapper around the Numba `cuda.jit` `_gpu_stump`
    function which computes the matrix profile according to GPU-STOMP.

    Parameters
    ----------
    T_A : ndarray
        The time series or sequence for which to compute the matrix profile

    m : int
        Window size

    T_B : ndarray
        The time series or sequence that contains your query subsequences of
        interest. Default is `None` which corresponds to a self-join.

    ignore_trivial : bool
        Set to `True` if this is a self-join. Otherwise, for AB-join, set this
        to `False`. Default is `True`.

    threads_per_block : int
        The number of GPU threads to use for all kernels. The default value is
        set by `THREADS_PER_BLOCK = 512`.

    device_id : int or list
        The (GPU) device number to use. The default value is `0`. A list of
        valid device ids (int) may also be provided for parallel GPU-STUMP
        computation. A list of all valid device ids can be obtained by
        executing `[device.id for device in cuda.list_devices()]`.

    Returns
    -------
    out : ndarray
        The first column consists of the matrix profile, the second column
        consists of the matrix profile indices, the third column consists of
        the left matrix profile indices, and the fourth column consists of the
        right matrix profile indices.

    Notes
    -----
    `DOI: 10.1109/ICDM.2016.0085 \
    <https://www.cs.ucr.edu/~eamonn/STOMP_GPU_final_submission_camera_ready.pdf>`__

    See Table II, Figure 5, and Figure 6

    The time series, T_B, will be annotated with the distance location (or
    index) of all its subsequences in another time series, T_A.

    Return: For every subsequence, Q, in T_B, you will get a distance and
    index for the closest subsequence in T_A. Thus, the array returned will
    have length T_B.shape[0]-m+1. Additionally, the left and right matrix
    profiles are also returned.

    Note: Unlike in the Table II where T_A.shape is expected to be equal to
    T_B.shape, this implementation is generalized so that the shapes of T_A
    and T_B can be different. In the case where T_A.shape == T_B.shape, then
    our algorithm reduces down to the same algorithm found in Table II.

    Additionally, unlike STAMP where the exclusion zone is m/2, the default
    exclusion zone for STOMP is m/4 (See Definition 3 and Figure 3).

    For self-joins, set `ignore_trivial = True` in order to avoid the trivial
    match.

    Note that left and right matrix profiles are only available for self-joins.
    """
    T_A = np.asarray(T_A)
    core.check_dtype(T_A)
    core.check_nan(T_A)

    if T_B is None:  # Self join!
        T_B = T_A
        ignore_trivial = True

    T_B = np.asarray(T_B)
    core.check_dtype(T_B)
    core.check_nan(T_B)
    core.check_window_size(m)

    if ignore_trivial is False and core.are_arrays_equal(T_A, T_B):  # pragma: no cover
        logger.warning("Arrays T_A, T_B are equal, which implies a self-join.")
        logger.warning("Try setting `ignore_trivial = True`.")

    if ignore_trivial and core.are_arrays_equal(T_A, T_B) is False:  # pragma: no cover
        logger.warning("Arrays T_A, T_B are not equal, which implies an AB-join.")
        logger.warning("Try setting `ignore_trivial = False`.")

    # Swap T_A and T_B for the GPU implementation
    # This keeps the API identical to and compatible with `stumpy.stump`
    T_A, T_B = T_B, T_A

    n = T_B.shape[0]
    k = T_A.shape[0] - m + 1
    l = n - m + 1
    excl_zone = int(np.ceil(m / 4))  # See Definition 3 and Figure 3

    M_T, Σ_T = core.compute_mean_std(T_A, m)
    μ_Q, σ_Q = core.compute_mean_std(T_B, m)

    out = np.empty((k, 4), dtype=object)

    if isinstance(device_id, int):
        device_ids = [device_id]
    else:
        device_ids = device_id

    profile = [None] * len(device_ids)
    indices = [None] * len(device_ids)

    for _id in device_ids:
        cuda.select_device(_id)
        if (
            cuda.current_context().__class__.__name__ != "FakeCUDAContext"
        ):  # pragma: no cover
            cuda.current_context().deallocations.clear()

    step = 1 + l // len(device_ids)

    for idx, start in enumerate(range(0, l, step)):
        stop = min(l, start + step)

        QT, QT_first = _get_QT(start, T_A, T_B, m)
        profile[idx], indices[idx] = _gpu_stump(
            T_A,
            T_B,
            m,
            stop,
            excl_zone,
            M_T,
            Σ_T,
            QT,
            QT_first,
            μ_Q,
            σ_Q,
            k,
            ignore_trivial,
            start + 1,
            threads_per_block,
            device_ids[idx],
        )

    for i in range(1, len(device_ids)):
        # Update all matrix profiles and matrix profile indices
        # (global, left, right) and store in profile[0] and indices[0]
        for col in range(profile[0].shape[1]):  # pragma: no cover
            cond = profile[0][:, col] < profile[i][:, col]
            profile[0][:, col] = np.where(cond, profile[0][:, col], profile[i][:, col])
            indices[0][:, col] = np.where(cond, indices[0][:, col], indices[i][:, col])

    out[:, 0] = profile[0][:, 0]
    out[:, 1:4] = indices[0][:, :]

    threshold = 10e-6
    if core.are_distances_too_small(out[:, 0], threshold=threshold):  # pragma: no cover
        logger.warning(f"A large number of values are smaller than {threshold}.")
        logger.warning("For a self-join, try setting `ignore_trivial = True`.")

    return out
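# A hedged AB-join sketch for the in-memory variant above; `gpu_stump` here is
# the function just defined (not necessarily the current `stumpy` API), and the
# `threads_per_block` value is only an illustrative tuning choice.
import numpy as np

T_A = np.random.rand(8_000)
T_B = np.random.rand(8_000)
out = gpu_stump(T_A, 64, T_B, ignore_trivial=False, threads_per_block=256)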
__author__ = 'christopher'

if __name__ == '__main__':
    from mpi4py import MPI
    from numba import cuda

    # This worker was spawned by a parent process; get the intercommunicator
    comm = MPI.Comm.Get_parent()
    rank = comm.Get_rank()

    # Report the free GPU memory (in bytes) for this rank's device
    meminfo = int(cuda.current_context().get_memory_info()[0])
    cuda.close()

    # Send the result back to the parent (rank 0 of the remote group)
    comm.gather(sendobj=meminfo, root=0)
    comm.Disconnect()
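# A hedged sketch of the parent side that would drive the worker above: it
# spawns N copies, collects one free-memory figure per rank via gather over
# the intercommunicator, and disconnects. The child script name
# "gpu_meminfo_child.py" is hypothetical.
from mpi4py import MPI

n_workers = 2
comm = MPI.COMM_SELF.Spawn("python", args=["gpu_meminfo_child.py"], maxprocs=n_workers)
meminfo = comm.gather(sendobj=None, root=MPI.ROOT)  # one entry per child rank
comm.Disconnect()
print(meminfo)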
import time as tim

import numpy as np
import scipy.io as sci
from numba import cuda
from numba.cuda.random import create_xoroshiro128p_states

# Copy the inputs to the device on the given stream
g_reg = cuda.to_device(regions, stream=stream)
g_ste = cuda.to_device(ste, stream=stream)
rng_states = create_xoroshiro128p_states(13456, seed=int(tim.time()))  # 13456 == 116 * 116
g_steps = cuda.to_device(steps, stream=stream)
g_del = cuda.to_device(tt, stream=stream)

# Host-side buffers for the per-iteration results
ranwalk = np.empty((10000, 116, 116), dtype=g_steps.dtype)
delay = np.empty((10000, 116, 116), dtype=g_steps.dtype)

for i in range(10000):
    random_walk[116, 116](g_prob, g_time, g_reg, g_steps, g_del, g_ste, rng_states)
    print(i)

    # Block until the kernel has finished before reading back the results
    cuda.current_context().synchronize()

    output = g_steps.copy_to_host(stream=stream)
    delay_o = g_del.copy_to_host(stream=stream)

    ranwalk[i, :, :] = output.reshape((116, 116))
    delay[i, :, :] = delay_o.reshape((116, 116))

    # Re-seed the RNG states for the next iteration
    del rng_states
    rng_states = create_xoroshiro128p_states(116 * 116, seed=np.uint64(tim.time()))

sci.savemat(name + '_randomwalk_steps.mat', {'ranwalk': ranwalk})
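# A minimal sketch of a stream-consistent version of the loop above: the kernel
# is launched on the same stream used for the copies, and only that stream is
# synchronized rather than the whole context. It reuses the snippet's names
# (`random_walk`, `g_prob`, `steps`, `stream`, etc.) under the assumption they
# are defined earlier; the launch configuration is unchanged from the original.
import numpy as np

n_iter = 10000
out_host = np.empty_like(steps)
for i in range(n_iter):
    random_walk[116, 116, stream](g_prob, g_time, g_reg, g_steps, g_del, g_ste, rng_states)
    g_steps.copy_to_host(out_host, stream=stream)  # async copy into a pinned-friendly buffer
    stream.synchronize()  # wait only for work queued on this stream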