def __init__(self, slvr_cfg):
    """
    RimeSolver Constructor

    Registers dimensions, arrays and properties on this composite
    solver, starts one enqueue and one sync executor thread per
    device context, selects a memory budget from the per-device
    budgets and then creates, registers and primes the sub-solvers
    on each enqueue thread.

    Parameters:
        slvr_cfg : SolverConfiguration
            Solver Configuration variables
    """
    super(CompositeRimeSolver, self).__init__(slvr_cfg=slvr_cfg)

    # Create thread local storage
    self.thread_local = threading.local()

    self.register_default_dimensions()

    # Configure the dimensions of the beam cube
    self.register_dimension('beam_lw',
        slvr_cfg[Options.E_BEAM_WIDTH],
        description='E cube l width')

    self.register_dimension('beam_mh',
        slvr_cfg[Options.E_BEAM_HEIGHT],
        description='E cube m height')

    self.register_dimension('beam_nud',
        slvr_cfg[Options.E_BEAM_DEPTH],
        description='E cube nu depth')

    # Monkey patch v4 antenna pair functions into the object
    from montblanc.impl.rime.v4.ant_pairs import monkey_patch_antenna_pairs
    monkey_patch_antenna_pairs(self)

    # Copy the v4 arrays and properties and
    # modify them for use on this Composite Solver
    A_main, P_main = self._cfg_comp_slvr_arys_and_props(v4Arrays, v4Props)

    self.register_properties(P_main)
    self.register_arrays(A_main)

    # Look for ignored and supplied arrays in the solver configuration
    array_cfg = slvr_cfg.get('array_cfg', {})
    ignore = array_cfg.get('ignore', None)
    supplied = array_cfg.get('supplied', None)

    # Create arrays on the solver, ignoring
    # and using supplied arrays as necessary
    self.create_arrays(ignore, supplied)

    # PyCUDA contexts for each GPU device
    self.dev_ctxs = slvr_cfg.get(Options.CONTEXT)
    # Number of GPU Solvers created for each device
    nsolvers = slvr_cfg.get(Options.NSOLVERS)
    # Maximum number of enqueued visibility chunks
    # before throttling is applied
    self.throttle_factor = slvr_cfg.get(
        Options.VISIBILITY_THROTTLE_FACTOR)

    # Massage the contexts for each device into a list
    if not isinstance(self.dev_ctxs, list):
        self.dev_ctxs = [self.dev_ctxs]

    montblanc.log.info('Using {d} solver(s) per device.'.format(
        d=nsolvers))

    # Shorten the type name
    C = CompositeRimeSolver

    # Create a one thread executor for each device context,
    # i.e. a thread per device
    enqueue_executors = [cf.ThreadPoolExecutor(1) for _ in self.dev_ctxs]
    sync_executors = [cf.ThreadPoolExecutor(1) for _ in self.dev_ctxs]

    self.enqueue_executors = enqueue_executors
    self.sync_executors = sync_executors
    self.initialised = False
    self._vis_write_mode = slvr_cfg.get(Options.VISIBILITY_WRITE_MODE)

    montblanc.log.info('Created {d} executor(s).'.format(
        d=len(enqueue_executors)))

    # Initialise executor threads. Each submission is waited on
    # so that any exception raised in the thread surfaces here.
    for ex, dev_ctx in zip(enqueue_executors, self.dev_ctxs):
        ex.submit(C._thread_init, self, dev_ctx).result()

    for ex, dev_ctx in zip(sync_executors, self.dev_ctxs):
        ex.submit(C._thread_init, self, dev_ctx).result()

    montblanc.log.info('Initialised {d} thread(s).'.format(
        d=len(enqueue_executors)))

    # Get a template dictionary
    T = self.template_dict()

    A_sub, P_sub = self._cfg_sub_slvr_arys_and_props(v4Arrays, v4Props)
    self._validate_arrays(A_sub)

    # Find the budget with the lowest memory usage
    # Work with the device with the lowest memory
    budgets = [ex.submit(C._thread_budget, self,
                    slvr_cfg, A_sub, T).result()
               for ex in enqueue_executors]

    # Each budget is a (properties, modified dimensions, required memory)
    # tuple, so the required memory lives at index 2. Index 1 is the
    # dictionary of modified dimensions, which is not a memory ordering.
    P, M, mem = min(budgets, key=lambda budget: budget[2])

    # Log some information about the memory budget
    # and dimension reduction
    montblanc.log.info(('Selected a solver memory budget of {b} '
        'for {d} solvers.').format(b=mbu.fmt_bytes(mem), d=nsolvers))

    montblanc.log.info(('The following dimension reductions '
        'have been applied:'))

    for k, v in M.items():
        montblanc.log.info('{p}{d}: {id} => {rd}'.format(
            p=' '*4, d=k, id=T[k], rd=v))

    # Create the sub solver configuration
    subslvr_cfg = slvr_cfg.copy()
    subslvr_cfg[Options.DATA_SOURCE] = Options.DATA_SOURCE_EMPTY
    # This is the last device context; it was previously picked up
    # implicitly from the loop variable left behind by the thread
    # initialisation loops above.
    # NOTE(review): presumably each thread substitutes its own context
    # when creating its solvers -- confirm in _thread_create_solvers.
    subslvr_cfg[Options.CONTEXT] = self.dev_ctxs[-1]

    subslvr_cfg = self._cfg_subslvr_dims(subslvr_cfg, P)

    # Extract the dimension differences
    self.src_diff = P[Options.NSRC]
    self.time_diff = P[Options.NTIME]
    self.ant_diff = P[Options.NA]
    self.bl_diff = P[Options.NBL]
    self.chan_diff = P[Options.NCHAN]

    montblanc.log.info('Creating {s} solver(s) on {d} device(s).'.format(
        s=nsolvers, d=len(enqueue_executors)))

    # Now create the solvers on each thread
    for ex in enqueue_executors:
        ex.submit(C._thread_create_solvers,
            self, subslvr_cfg, P, nsolvers).result()

    montblanc.log.info('Solvers Created')

    # Register arrays and properties on each thread's solvers
    for ex in enqueue_executors:
        ex.submit(C._thread_reg_sub_arys_and_props,
            self, A_sub, P_sub).result()

    montblanc.log.info('Priming Memory Pools')

    # Prime the memory pools on each sub-solver
    for ex in enqueue_executors:
        ex.submit(C._thread_prime_memory_pools, self).result()
def _budget(cube, slvr_cfg):
    """
    Reduce dimension extents on `cube` until its memory requirement
    fits within the configured memory budget.

    Candidate reductions are tried in order: the first time step
    candidate, then source batch sizes, then the remaining time step
    candidates, then baselines. Reductions stop being applied as soon
    as `cube.bytes_required()` no longer exceeds the budget.

    Parameters:
        cube :
            Object providing `bytes_required()`, `dim_global_size()`
            and `update_dimension()`. It is modified through
            `update_dimension()` for every reduction applied.
        slvr_cfg :
            Mapping holding 'source_batch_size' and, optionally,
            'mem_budget' (defaults to 2*ONE_GB).

    Returns:
        A tuple of (applied_reductions, bytes_required) where
        applied_reductions maps dimension name to its reduced size
        and bytes_required is the last value reported by the cube.
    """
    # Figure out a viable dimension configuration
    # given the total problem size
    mem_budget = slvr_cfg.get('mem_budget', 2*ONE_GB)
    bytes_required = cube.bytes_required()

    src_dims = mbu.source_nr_vars() + ['nsrc']
    dim_names = ['na', 'nbl', 'ntime'] + src_dims
    global_sizes = cube.dim_global_size(*dim_names)
    na, nbl, ntime = global_sizes[:3]

    # Keep track of original dimension sizes
    # and any reductions that are applied
    original_sizes = dict(zip(dim_names, global_sizes))
    applied_reductions = {}

    def _reduction():
        """ Yield lists of (dimension, size) pairs to try, in order """
        # Reduce over time first
        trange = _uniq_log2_range(1, ntime, 5)
        for t in trange[0:1]:
            yield [('ntime', t)]

        # Attempt reduction over source
        sbs = slvr_cfg['source_batch_size']
        # The fallback must be iterable: a bare integer here would
        # raise a TypeError in the loop below for batch sizes <= 10
        srange = _uniq_log2_range(10, sbs, 5) if sbs > 10 else [10]
        src_dim_gs = global_sizes[3:]

        for bs in srange:
            # Never request more than the global size of a dimension
            yield [(d, bs if bs < gs else gs)
                   for d, gs in zip(src_dims, src_dim_gs)]

        # Try the rest of the timesteps
        for t in trange[1:]:
            yield [('ntime', t)]

        # Reduce by baseline
        for bl in _uniq_log2_range(na, nbl, 5):
            yield [('nbl', bl)]

    for reduction in _reduction():
        # Stop as soon as the problem fits within the budget
        if bytes_required <= mem_budget:
            break

        for dim, size in reduction:
            applied_reductions[dim] = size
            cube.update_dimension(dim, lower_extent=0, upper_extent=size)

        bytes_required = cube.bytes_required()

    # Log some information about the memory_budget
    # and dimension reduction
    montblanc.log.info(("Selected a solver memory budget of {rb} "
        "given a hard limit of {mb}.").format(
        rb=mbu.fmt_bytes(bytes_required),
        mb=mbu.fmt_bytes(mem_budget)))

    if applied_reductions:
        montblanc.log.info("The following dimension reductions "
            "were applied:")

        for k, v in applied_reductions.items():
            montblanc.log.info('{p}{d}: {id} => {rd}'.format(
                p=' '*4, d=k, id=original_sizes[k], rd=v))
    else:
        montblanc.log.info("No dimension reductions were applied.")

    return applied_reductions, bytes_required
def _thread_prime_memory_pools(self):
    """
    We use memory pools to avoid allocating both CUDA
    pinned host and device memory. This function fakes
    allocations prior to running the solver so that
    the memory pools are 'primed' with memory allocations
    that can be re-used during actual execution of the solver

    Operates on the solvers and context held in this thread's
    `self.thread_local` storage and returns nothing.
    """
    montblanc.log.debug('Priming memory pools in thread %s',
        threading.current_thread())

    # Retain references to pool allocations
    pinned_pool_refs = defaultdict(list)
    device_pool_refs = defaultdict(list)
    pinned_allocated = 0

    # Class of arrays that are to be transferred
    classifiers = [Classifier.E_BEAM_INPUT,
        Classifier.B_SQRT_INPUT,
        Classifier.EKB_SQRT_INPUT,
        Classifier.COHERENCIES_INPUT]

    # Estimate number of kernels for constant data
    nkernels = len(classifiers)

    # Detect already transferred array chunks
    dirty = {}

    # Get the first chunk of the visibility space
    cpu_slice_map, gpu_slice_map = next(self._gen_vis_slices())

    for subslvr in self.thread_local.solvers:
        # For the maximum number of visibility chunks that can be enqueued
        for _ in range(self.throttle_factor):
            # For each source batch within the visibility chunk
            for cpu_src_slice_map, gpu_src_slice_map in (
                    self._gen_source_slices()):
                cpu_slice_map.update(cpu_src_slice_map)
                gpu_slice_map.update(gpu_src_slice_map)

                # Allocate pinned memory for transfer arrays
                # retaining references to them
                refs = self._enqueue_array(subslvr,
                    cpu_slice_map, gpu_slice_map,
                    direction=ASYNC_HTOD, dirty=dirty,
                    classifiers=classifiers)
                pinned_allocated += sum(
                    ref.nbytes
                    for ref_list in refs.values()
                    for ref in ref_list)
                _update_refs(pinned_pool_refs, refs)

                # Allocate pinned memory for constant memory transfers
                cdata = subslvr.const_data().ndary()

                for _ in range(nkernels):
                    cdata_ref = subslvr.pinned_mem_pool.allocate(
                        shape=cdata.shape, dtype=cdata.dtype)
                    pinned_allocated += cdata_ref.nbytes
                    pinned_pool_refs['cdata'].append(cdata_ref)

            # Allocate device memory for arrays that need to be
            # allocated from a pool by PyCUDA's reduction kernels
            # NOTE(review): assumed to happen once per visibility
            # chunk rather than once per source batch -- confirm
            # the nesting against the upstream file.
            dev_ary = subslvr.dev_mem_pool.allocate(self.X2.nbytes)
            device_pool_refs['X2_gpu'].append(dev_ary)

    device = self.thread_local.context.get_device()

    montblanc.log.info('Primed pinned memory pool '
        'of size {n} for device {d}.'.format(
            d=device.name(), n=mbu.fmt_bytes(pinned_allocated)))

    # Now force return of memory to the pools
    for array_list in pinned_pool_refs.values():
        for ary in array_list:
            # free() is called on the array's base object
            ary.base.free()

    for array_list in device_pool_refs.values():
        for ary in array_list:
            ary.free()
def _thread_budget(self, slvr_cfg, A_sub, props):
    """
    Get memory budget and dimension reduction
    information from the CUDA device associated
    with the current thread and context

    Parameters:
        slvr_cfg :
            Solver configuration, queried for 'mem_budget', the
            number of solvers, the number of antenna and the
            source batch size.
        A_sub :
            Array definitions whose memory requirement is budgeted.
        props :
            Property dictionary. It is copied, not modified.

    Returns:
        A tuple of (P, modded_dims, required_mem) where P is a copy
        of props updated with the modified dimensions.

    Raises:
        MemoryError when no viable dimension configuration is found.
    """
    montblanc.log.debug('Budgeting in thread %s',
        threading.current_thread())

    # Query free memory on this context
    (free_mem, total_mem) = cuda.mem_get_info()

    device = self.thread_local.context.get_device()

    montblanc.log.info('{d}: {t} total {f} free.'.format(
        d=device.name(),
        f=mbu.fmt_bytes(free_mem),
        t=mbu.fmt_bytes(total_mem)))

    # Work with a supplied memory budget, otherwise use
    # free memory less an amount equal to the upper size
    # of an NVIDIA context
    mem_budget = slvr_cfg.get('mem_budget', free_mem - 200*ONE_MB)

    nsolvers = slvr_cfg.get(Options.NSOLVERS)
    na = slvr_cfg.get(Options.NA)
    nsrc = slvr_cfg.get(Options.SOURCE_BATCH_SIZE)

    # Every source dimension is reduced to the source batch size
    src_str_list = [Options.NSRC] + mbu.source_nr_vars()
    src_terms = []
    for nr_var in src_str_list:
        src_terms.append('%s=%s' % (nr_var, nsrc))
    src_reduction_str = '&'.join(src_terms)

    ntime_split = np.int32(np.ceil(100.0 / nsolvers))
    ntime_split_str = 'ntime={n}'.format(n=ntime_split)

    # Candidate reductions, in the order they are handed over
    reductions = [
        ntime_split_str,
        src_reduction_str,
        'ntime',
        'nbl={na}&na={na}'.format(na=na),
        'nchan=50%',
    ]

    # Figure out a viable dimension configuration
    # given the total problem size
    viable, modded_dims = mbu.viable_dim_config(
        mem_budget, A_sub, props, reductions, nsolvers)

    # Create property dictionary with updated
    # dimensions.
    P = props.copy()
    P.update(modded_dims)

    required_mem = mbu.dict_array_bytes_required(A_sub, P)

    if viable:
        return P, modded_dims, required_mem

    # No configuration fits: describe what was tried and
    # list the arrays from largest to smallest
    dim_set_str = ', '.join(
        ['%s=%s' % (k, v) for k, v in modded_dims.iteritems()])

    by_size = sorted(A_sub, reverse=True,
        key=lambda a: mbu.dict_array_bytes(a, P))

    rows = []
    for ary in by_size:
        rows.append('%-*s %-*s %s' % (
            15, ary['name'],
            10, mbu.fmt_bytes(mbu.dict_array_bytes(ary, P)),
            mbu.shape_from_str_tuple(ary['shape'], P)))
    ary_list_str = '\n'.join(rows)

    raise MemoryError("Tried reducing the problem size "
        "by setting '%s' on all arrays, "
        "but the resultant required memory of %s "
        "for each of %d solvers is too big "
        "to fit within the memory budget of %s. "
        "List of biggests offenders:\n%s "
        "\nSplitting the problem along the "
        "channel dimension needs to be "
        "implemented." % (
            dim_set_str,
            mbu.fmt_bytes(required_mem),
            nsolvers,
            mbu.fmt_bytes(mem_budget),
            ary_list_str))
def _thread_prime_memory_pools(self):
    """
    We use memory pools to avoid allocating both CUDA
    pinned host and device memory. This function fakes
    allocations prior to running the solver so that
    the memory pools are 'primed' with memory allocations
    that can be re-used during actual execution of the solver

    Operates on the solvers and context held in this thread's
    `self.thread_local` storage and returns nothing.
    """
    montblanc.log.debug('Priming memory pools in thread %s',
        threading.current_thread())

    # Retain references to pool allocations
    pinned_pool_refs = defaultdict(list)
    device_pool_refs = defaultdict(list)
    pinned_allocated = 0

    # Class of arrays that are to be transferred
    classifiers = [Classifier.E_BEAM_INPUT,
        Classifier.B_SQRT_INPUT,
        Classifier.EKB_SQRT_INPUT,
        Classifier.COHERENCIES_INPUT]

    # Estimate number of kernels for constant data
    nkernels = len(classifiers)

    # Detect already transferred array chunks
    dirty = {}

    # Get the first chunk of the visibility space
    cpu_slice_map, gpu_slice_map = next(self._gen_vis_slices())

    for subslvr in self.thread_local.solvers:
        # For the maximum number of visibility chunks that can be enqueued
        for _ in range(self.throttle_factor):
            # For each source batch within the visibility chunk
            for cpu_src_slice_map, gpu_src_slice_map in (
                    self._gen_source_slices()):
                cpu_slice_map.update(cpu_src_slice_map)
                gpu_slice_map.update(gpu_src_slice_map)

                # Allocate pinned memory for transfer arrays
                # retaining references to them
                refs = self._enqueue_array(subslvr,
                    cpu_slice_map, gpu_slice_map,
                    direction=ASYNC_HTOD, dirty=dirty,
                    classifiers=classifiers)
                pinned_allocated += sum(
                    ref.nbytes
                    for ref_list in refs.values()
                    for ref in ref_list)
                _update_refs(pinned_pool_refs, refs)

                # Allocate pinned memory for constant memory transfers
                cdata = subslvr.const_data().ndary()

                for _ in range(nkernels):
                    cdata_ref = subslvr.pinned_mem_pool.allocate(
                        shape=cdata.shape, dtype=cdata.dtype)
                    pinned_allocated += cdata_ref.nbytes
                    pinned_pool_refs['cdata'].append(cdata_ref)

            # Allocate device memory for arrays that need to be
            # allocated from a pool by PyCUDA's reduction kernels
            # NOTE(review): assumed to happen once per visibility
            # chunk rather than once per source batch -- confirm
            # the nesting against the upstream file.
            dev_ary = subslvr.dev_mem_pool.allocate(self.X2.nbytes)
            device_pool_refs['X2_gpu'].append(dev_ary)

    device = self.thread_local.context.get_device()

    montblanc.log.info('Primed pinned memory pool '
        'of size {n} for device {d}.'.format(
            d=device.name(), n=mbu.fmt_bytes(pinned_allocated)))

    # Now force return of memory to the pools
    for array_list in pinned_pool_refs.values():
        for ary in array_list:
            # free() is called on the array's base object
            ary.base.free()

    for array_list in device_pool_refs.values():
        for ary in array_list:
            ary.free()
def __init__(self, slvr_cfg):
    """
    RimeSolver Constructor

    Registers dimensions, arrays and properties on this composite
    solver, starts one enqueue and one sync executor thread per
    device context, selects a memory budget from the per-device
    budgets and then creates, registers and primes the sub-solvers
    on each enqueue thread.

    Parameters:
        slvr_cfg : SolverConfiguration
            Solver Configuration variables
    """
    super(CompositeRimeSolver, self).__init__(slvr_cfg=slvr_cfg)

    # Create thread local storage
    self.thread_local = threading.local()

    self.register_default_dimensions()

    # Configure the dimensions of the beam cube
    self.register_dimension('beam_lw',
        slvr_cfg[Options.E_BEAM_WIDTH],
        description='E cube l width')

    self.register_dimension('beam_mh',
        slvr_cfg[Options.E_BEAM_HEIGHT],
        description='E cube m height')

    self.register_dimension('beam_nud',
        slvr_cfg[Options.E_BEAM_DEPTH],
        description='E cube nu depth')

    # Monkey patch v4 antenna pair functions into the object
    from montblanc.impl.rime.v4.ant_pairs import monkey_patch_antenna_pairs
    monkey_patch_antenna_pairs(self)

    # Copy the v4 arrays and properties and
    # modify them for use on this Composite Solver
    A_main, P_main = self._cfg_comp_slvr_arys_and_props(v4Arrays, v4Props)

    self.register_properties(P_main)
    self.register_arrays(A_main)

    # Look for ignored and supplied arrays in the solver configuration
    array_cfg = slvr_cfg.get('array_cfg', {})
    ignore = array_cfg.get('ignore', None)
    supplied = array_cfg.get('supplied', None)

    # Create arrays on the solver, ignoring
    # and using supplied arrays as necessary
    self.create_arrays(ignore, supplied)

    # PyCUDA contexts for each GPU device
    self.dev_ctxs = slvr_cfg.get(Options.CONTEXT)
    # Number of GPU Solvers created for each device
    nsolvers = slvr_cfg.get(Options.NSOLVERS)
    # Maximum number of enqueued visibility chunks
    # before throttling is applied
    self.throttle_factor = slvr_cfg.get(
        Options.VISIBILITY_THROTTLE_FACTOR)

    # Massage the contexts for each device into a list
    if not isinstance(self.dev_ctxs, list):
        self.dev_ctxs = [self.dev_ctxs]

    montblanc.log.info('Using {d} solver(s) per device.'.format(
        d=nsolvers))

    # Shorten the type name
    C = CompositeRimeSolver

    # Create a one thread executor for each device context,
    # i.e. a thread per device
    enqueue_executors = [cf.ThreadPoolExecutor(1) for _ in self.dev_ctxs]
    sync_executors = [cf.ThreadPoolExecutor(1) for _ in self.dev_ctxs]

    self.enqueue_executors = enqueue_executors
    self.sync_executors = sync_executors
    self.initialised = False
    self._vis_write_mode = slvr_cfg.get(Options.VISIBILITY_WRITE_MODE)

    montblanc.log.info('Created {d} executor(s).'.format(
        d=len(enqueue_executors)))

    # Initialise executor threads. Each submission is waited on
    # so that any exception raised in the thread surfaces here.
    for ex, dev_ctx in zip(enqueue_executors, self.dev_ctxs):
        ex.submit(C._thread_init, self, dev_ctx).result()

    for ex, dev_ctx in zip(sync_executors, self.dev_ctxs):
        ex.submit(C._thread_init, self, dev_ctx).result()

    montblanc.log.info('Initialised {d} thread(s).'.format(
        d=len(enqueue_executors)))

    # Get a template dictionary
    T = self.template_dict()

    A_sub, P_sub = self._cfg_sub_slvr_arys_and_props(v4Arrays, v4Props)
    self._validate_arrays(A_sub)

    # Find the budget with the lowest memory usage
    # Work with the device with the lowest memory
    budgets = [ex.submit(C._thread_budget, self,
                    slvr_cfg, A_sub, T).result()
               for ex in enqueue_executors]

    # Each budget is a (properties, modified dimensions, required memory)
    # tuple, so the required memory lives at index 2. Index 1 is the
    # dictionary of modified dimensions, which is not a memory ordering.
    P, M, mem = min(budgets, key=lambda budget: budget[2])

    # Log some information about the memory budget
    # and dimension reduction
    montblanc.log.info(('Selected a solver memory budget of {b} '
        'for {d} solvers.').format(b=mbu.fmt_bytes(mem), d=nsolvers))

    montblanc.log.info(('The following dimension reductions '
        'have been applied:'))

    for k, v in M.items():
        montblanc.log.info('{p}{d}: {id} => {rd}'.format(
            p=' '*4, d=k, id=T[k], rd=v))

    # Create the sub solver configuration
    subslvr_cfg = slvr_cfg.copy()
    subslvr_cfg[Options.DATA_SOURCE] = Options.DATA_SOURCE_EMPTY
    # This is the last device context; it was previously picked up
    # implicitly from the loop variable left behind by the thread
    # initialisation loops above.
    # NOTE(review): presumably each thread substitutes its own context
    # when creating its solvers -- confirm in _thread_create_solvers.
    subslvr_cfg[Options.CONTEXT] = self.dev_ctxs[-1]

    subslvr_cfg = self._cfg_subslvr_dims(subslvr_cfg, P)

    # Extract the dimension differences
    self.src_diff = P[Options.NSRC]
    self.time_diff = P[Options.NTIME]
    self.ant_diff = P[Options.NA]
    self.bl_diff = P[Options.NBL]
    self.chan_diff = P[Options.NCHAN]

    montblanc.log.info('Creating {s} solver(s) on {d} device(s).'.format(
        s=nsolvers, d=len(enqueue_executors)))

    # Now create the solvers on each thread
    for ex in enqueue_executors:
        ex.submit(C._thread_create_solvers,
            self, subslvr_cfg, P, nsolvers).result()

    montblanc.log.info('Solvers Created')

    # Register arrays and properties on each thread's solvers
    for ex in enqueue_executors:
        ex.submit(C._thread_reg_sub_arys_and_props,
            self, A_sub, P_sub).result()

    montblanc.log.info('Priming Memory Pools')

    # Prime the memory pools on each sub-solver
    for ex in enqueue_executors:
        ex.submit(C._thread_prime_memory_pools, self).result()
def _thread_budget(self, slvr_cfg, A_sub, props):
    """
    Get memory budget and dimension reduction
    information from the CUDA device associated
    with the current thread and context

    Parameters:
        slvr_cfg :
            Solver configuration, queried for 'mem_budget', the
            number of solvers, the number of antenna and the
            source batch size.
        A_sub :
            Array definitions whose memory requirement is budgeted.
        props :
            Property dictionary. It is copied, not modified.

    Returns:
        A tuple of (P, modded_dims, required_mem) where P is a copy
        of props updated with the modified dimensions.

    Raises:
        MemoryError when no viable dimension configuration is found.
    """
    montblanc.log.debug('Budgeting in thread %s',
        threading.current_thread())

    # Query free memory on this context
    (free_mem, total_mem) = cuda.mem_get_info()

    device = self.thread_local.context.get_device()

    montblanc.log.info('{d}: {t} total {f} free.'.format(
        d=device.name(),
        f=mbu.fmt_bytes(free_mem),
        t=mbu.fmt_bytes(total_mem)))

    # Work with a supplied memory budget, otherwise use
    # free memory less an amount equal to the upper size
    # of an NVIDIA context
    mem_budget = slvr_cfg.get('mem_budget', free_mem - 200*ONE_MB)

    nsolvers = slvr_cfg.get(Options.NSOLVERS)
    na = slvr_cfg.get(Options.NA)
    nsrc = slvr_cfg.get(Options.SOURCE_BATCH_SIZE)

    # Every source dimension is reduced to the source batch size
    src_str_list = [Options.NSRC] + mbu.source_nr_vars()
    src_terms = []
    for nr_var in src_str_list:
        src_terms.append('%s=%s' % (nr_var, nsrc))
    src_reduction_str = '&'.join(src_terms)

    ntime_split = np.int32(np.ceil(100.0 / nsolvers))
    ntime_split_str = 'ntime={n}'.format(n=ntime_split)

    # Candidate reductions, in the order they are handed over
    reductions = [
        ntime_split_str,
        src_reduction_str,
        'ntime',
        'nbl={na}&na={na}'.format(na=na),
        'nchan=50%',
    ]

    # Figure out a viable dimension configuration
    # given the total problem size
    viable, modded_dims = mbu.viable_dim_config(
        mem_budget, A_sub, props, reductions, nsolvers)

    # Create property dictionary with updated
    # dimensions.
    P = props.copy()
    P.update(modded_dims)

    required_mem = mbu.dict_array_bytes_required(A_sub, P)

    if viable:
        return P, modded_dims, required_mem

    # No configuration fits: describe what was tried and
    # list the arrays from largest to smallest
    dim_set_str = ', '.join(
        ['%s=%s' % (k, v) for k, v in modded_dims.iteritems()])

    by_size = sorted(A_sub, reverse=True,
        key=lambda a: mbu.dict_array_bytes(a, P))

    rows = []
    for ary in by_size:
        rows.append('%-*s %-*s %s' % (
            15, ary['name'],
            10, mbu.fmt_bytes(mbu.dict_array_bytes(ary, P)),
            mbu.shape_from_str_tuple(ary['shape'], P)))
    ary_list_str = '\n'.join(rows)

    raise MemoryError("Tried reducing the problem size "
        "by setting '%s' on all arrays, "
        "but the resultant required memory of %s "
        "for each of %d solvers is too big "
        "to fit within the memory budget of %s. "
        "List of biggests offenders:\n%s "
        "\nSplitting the problem along the "
        "channel dimension needs to be "
        "implemented." % (
            dim_set_str,
            mbu.fmt_bytes(required_mem),
            nsolvers,
            mbu.fmt_bytes(mem_budget),
            ary_list_str))