Esempio n. 1
0
    def __init__(self, slvr_cfg):
        """
        RimeSolver Constructor

        Registers the dimensions, properties and arrays of the composite
        solver, creates a single-threaded enqueue executor and sync
        executor for every device context (running ``_thread_init`` on
        each), selects the memory budget with the lowest required memory
        and finally creates, configures and primes the sub-solvers on
        each enqueue executor.

        Parameters:
            slvr_cfg : SolverConfiguration
                Solver Configuration variables
        """
        super(CompositeRimeSolver, self).__init__(slvr_cfg=slvr_cfg)

        # Create thread local storage
        self.thread_local = threading.local()

        self.register_default_dimensions()

        # Configure the dimensions of the beam cube
        self.register_dimension('beam_lw',
                                slvr_cfg[Options.E_BEAM_WIDTH],
                                description='E cube l width')

        self.register_dimension('beam_mh',
                                slvr_cfg[Options.E_BEAM_HEIGHT],
                                description='E cube m height')

        self.register_dimension('beam_nud',
                                slvr_cfg[Options.E_BEAM_DEPTH],
                                description='E cube nu depth')

        # Monkey patch v4 antenna pair functions into the object
        from montblanc.impl.rime.v4.ant_pairs import monkey_patch_antenna_pairs
        monkey_patch_antenna_pairs(self)

        # Copy the v4 arrays and properties and
        # modify them for use on this Composite Solver
        A_main, P_main = self._cfg_comp_slvr_arys_and_props(v4Arrays, v4Props)

        self.register_properties(P_main)
        self.register_arrays(A_main)

        # Look for ignored and supplied arrays in the solver configuration
        array_cfg = slvr_cfg.get('array_cfg', {})
        ignore = array_cfg.get('ignore', None)
        supplied = array_cfg.get('supplied', None)

        # Create arrays on the solver, ignoring
        # and using supplied arrays as necessary
        self.create_arrays(ignore, supplied)

        # PyCUDA contexts for each GPU device
        self.dev_ctxs = slvr_cfg.get(Options.CONTEXT)
        # Number of GPU Solvers created for each device
        nsolvers = slvr_cfg.get(Options.NSOLVERS)
        # Maximum number of enqueued visibility chunks
        # before throttling is applied
        self.throttle_factor = slvr_cfg.get(Options.VISIBILITY_THROTTLE_FACTOR)

        # Massage the contexts for each device into a list
        if not isinstance(self.dev_ctxs, list):
            self.dev_ctxs = [self.dev_ctxs]

        montblanc.log.info(
            'Using {d} solver(s) per device.'.format(d=nsolvers))

        # Shorten the type name
        C = CompositeRimeSolver

        # Create a one thread executor for each device context,
        # i.e. a thread per device
        enqueue_executors = [cf.ThreadPoolExecutor(1) for _ in self.dev_ctxs]
        sync_executors = [cf.ThreadPoolExecutor(1) for _ in self.dev_ctxs]

        self.enqueue_executors = enqueue_executors
        self.sync_executors = sync_executors
        self.initialised = False
        self._vis_write_mode = slvr_cfg.get(Options.VISIBILITY_WRITE_MODE)

        montblanc.log.info(
            'Created {d} executor(s).'.format(d=len(enqueue_executors)))

        # Initialise executor threads, waiting for each to complete
        for ex, ctx in zip(enqueue_executors, self.dev_ctxs):
            ex.submit(C._thread_init, self, ctx).result()

        for ex, ctx in zip(sync_executors, self.dev_ctxs):
            ex.submit(C._thread_init, self, ctx).result()

        montblanc.log.info(
            'Initialised {d} thread(s).'.format(d=len(enqueue_executors)))

        # Get a template dictionary
        T = self.template_dict()

        A_sub, P_sub = self._cfg_sub_slvr_arys_and_props(v4Arrays, v4Props)
        self._validate_arrays(A_sub)

        # Each budget is the (P, modded_dims, required_mem) tuple
        # returned by _thread_budget for one device
        budgets = [
            ex.submit(C._thread_budget, self, slvr_cfg, A_sub, T).result()
            for ex in enqueue_executors
        ]

        # Find the budget with the lowest memory usage, i.e. work
        # with the device with the lowest memory. The selection key
        # must be the required memory (index 2), not the dictionary
        # of dimension reductions (index 1).
        P, M, mem = min(budgets, key=lambda budget: budget[2])

        # Log some information about the memory budget
        # and dimension reduction
        montblanc.log.info(('Selected a solver memory budget of {b} '
                            'for {d} solvers.').format(b=mbu.fmt_bytes(mem),
                                                       d=nsolvers))

        montblanc.log.info(('The following dimension reductions '
                            'have been applied:'))

        for k, v in M.items():
            montblanc.log.info('{p}{d}: {id} => {rd}'.format(p=' ' * 4,
                                                             d=k,
                                                             id=T[k],
                                                             rd=v))

        # Create the sub solver configuration
        subslvr_cfg = slvr_cfg.copy()
        subslvr_cfg[Options.DATA_SOURCE] = Options.DATA_SOURCE_EMPTY
        # Use the last device context explicitly instead of relying on
        # a loop variable that leaked out of the initialisation loops.
        # NOTE(review): with several devices this is only one of the
        # contexts -- confirm that _thread_create_solvers substitutes
        # the context belonging to its own thread.
        subslvr_cfg[Options.CONTEXT] = self.dev_ctxs[-1]

        subslvr_cfg = self._cfg_subslvr_dims(subslvr_cfg, P)

        # Extract the dimension differences
        self.src_diff = P[Options.NSRC]
        self.time_diff = P[Options.NTIME]
        self.ant_diff = P[Options.NA]
        self.bl_diff = P[Options.NBL]
        self.chan_diff = P[Options.NCHAN]

        montblanc.log.info('Creating {s} solver(s) on {d} device(s).'.format(
            s=nsolvers, d=len(enqueue_executors)))

        # Now create the solvers on each thread
        for ex in enqueue_executors:
            ex.submit(C._thread_create_solvers, self, subslvr_cfg, P,
                      nsolvers).result()

        montblanc.log.info('Solvers Created')

        # Register arrays and properties on each thread's solvers
        for ex in enqueue_executors:
            ex.submit(C._thread_reg_sub_arys_and_props, self, A_sub,
                      P_sub).result()

        montblanc.log.info('Priming Memory Pools')

        # Prime the memory pools on each sub-solver
        for ex in enqueue_executors:
            ex.submit(C._thread_prime_memory_pools, self).result()
Esempio n. 2
0
def _budget(cube, slvr_cfg):
    """
    Shrink dimension extents on `cube` until it fits a memory budget.

    Candidate reductions are tried in a fixed order (one time reduction,
    then source batch sizes, then the remaining time reductions, then
    baselines). Each is applied to `cube` with
    ``update_dimension(dim, lower_extent=0, upper_extent=size)`` while
    ``cube.bytes_required()`` exceeds the budget. No error is raised if
    the budget is still exceeded once all candidates are exhausted.

    Parameters:
        cube :
            Object providing ``bytes_required``, ``dim_global_size``
            and ``update_dimension``. It is modified in place.
        slvr_cfg : dict-like
            Solver configuration. ``'mem_budget'`` is optional and
            defaults to ``2*ONE_GB``; ``'source_batch_size'`` is read
            when source reductions are attempted.

    Returns:
        tuple
            ``(applied_reductions, bytes_required)`` where
            `applied_reductions` maps dimension name to its reduced size
            and `bytes_required` is the final memory requirement.
    """
    # Figure out a viable dimension configuration
    # given the total problem size
    mem_budget = slvr_cfg.get('mem_budget', 2*ONE_GB)
    bytes_required = cube.bytes_required()

    src_dims = mbu.source_nr_vars() + ['nsrc']
    dim_names = ['na', 'nbl', 'ntime'] + src_dims
    global_sizes = cube.dim_global_size(*dim_names)
    na, nbl, ntime = global_sizes[:3]

    # Keep track of original dimension sizes and any reductions that are applied
    original_sizes = dict(zip(dim_names, global_sizes))
    applied_reductions = {}

    def _reduction():
        """ Lazily yield lists of (dimension, size) reductions to try """
        # Reduce over time first
        trange = _uniq_log2_range(1, ntime, 5)
        for t in trange[0:1]:
            yield [('ntime', t)]

        # Attempt reduction over source. The fallback must be a sequence:
        # a bare integer here made the loop below raise a TypeError
        # whenever the source batch size was 10 or less.
        sbs = slvr_cfg['source_batch_size']
        srange = _uniq_log2_range(10, sbs, 5) if sbs > 10 else [10]
        src_dim_gs = global_sizes[3:]

        for bs in srange:
            # Never request more than a dimension's global size
            yield [(d, bs if bs < gs else gs) for d, gs
                in zip(src_dims, src_dim_gs)]

        # Try the rest of the timesteps
        for t in trange[1:]:
            yield [('ntime', t)]

        # Reduce by baseline
        for bl in _uniq_log2_range(na, nbl, 5):
            yield [('nbl', bl)]

    for reduction in _reduction():
        # Stop as soon as the problem fits within the budget
        if bytes_required <= mem_budget:
            break

        for dim, size in reduction:
            applied_reductions[dim] = size
            cube.update_dimension(dim, lower_extent=0, upper_extent=size)

        bytes_required = cube.bytes_required()

    # Log some information about the memory_budget
    # and dimension reduction
    montblanc.log.info(("Selected a solver memory budget of {rb} "
        "given a hard limit of {mb}.").format(
        rb=mbu.fmt_bytes(bytes_required),
        mb=mbu.fmt_bytes(mem_budget)))

    if applied_reductions:
        montblanc.log.info("The following dimension reductions "
            "were applied:")

        for k, v in applied_reductions.items():
            montblanc.log.info('{p}{d}: {id} => {rd}'.format
                (p=' '*4, d=k, id=original_sizes[k], rd=v))
    else:
        montblanc.log.info("No dimension reductions were applied.")

    return applied_reductions, bytes_required
Esempio n. 3
0
    def _thread_prime_memory_pools(self):
        """
        We use memory pools to avoid allocating both CUDA
        pinned host and device memory. This function fakes
        allocations prior to running the solver so that
        the memory pools are 'primed' with memory allocations
        that can be re-used during actual execution of the solver.

        For every solver in ``self.thread_local.solvers``, every
        throttled visibility chunk and every source batch, the
        transfer arrays, the constant data and an ``X2``-sized device
        buffer are allocated from the pools. All allocations are
        held until the end and then freed.
        """

        montblanc.log.debug('Priming memory pools in thread %s',
                            threading.current_thread())

        # Retain references to pool allocations
        pinned_pool_refs = defaultdict(list)
        device_pool_refs = defaultdict(list)
        pinned_allocated = 0

        # Class of arrays that are to be transferred
        classifiers = [
            Classifier.E_BEAM_INPUT, Classifier.B_SQRT_INPUT,
            Classifier.EKB_SQRT_INPUT, Classifier.COHERENCIES_INPUT
        ]

        # Estimate number of kernels for constant data
        nkernels = len(classifiers)

        # Detect already transferred array chunks
        dirty = {}

        # Get the first chunk of the visibility space
        cpu_slice_map, gpu_slice_map = next(self._gen_vis_slices())

        for subslvr in self.thread_local.solvers:
            # For the maximum number of visibility chunks that can be enqueued
            for _ in range(self.throttle_factor):
                # For each source batch within the visibility chunk
                for cpu_src_slice_map, gpu_src_slice_map in (
                        self._gen_source_slices()):
                    cpu_slice_map.update(cpu_src_slice_map)
                    gpu_slice_map.update(gpu_src_slice_map)

                    # Allocate pinned memory for transfer arrays
                    # retaining references to them
                    refs = self._enqueue_array(subslvr,
                                               cpu_slice_map,
                                               gpu_slice_map,
                                               direction=ASYNC_HTOD,
                                               dirty=dirty,
                                               classifiers=classifiers)
                    pinned_allocated += sum(
                        ref.nbytes
                        for ref_list in refs.values()
                        for ref in ref_list)
                    _update_refs(pinned_pool_refs, refs)

                    # Allocate pinned memory for constant memory transfers
                    cdata = subslvr.const_data().ndary()

                    for _ in range(nkernels):
                        cdata_ref = subslvr.pinned_mem_pool.allocate(
                            shape=cdata.shape, dtype=cdata.dtype)
                        pinned_allocated += cdata_ref.nbytes
                        pinned_pool_refs['cdata'].append(cdata_ref)

                    # Allocate device memory for arrays that need to be
                    # allocated from a pool by PyCUDA's reduction kernels
                    dev_ary = subslvr.dev_mem_pool.allocate(self.X2.nbytes)
                    device_pool_refs['X2_gpu'].append(dev_ary)

        device = self.thread_local.context.get_device()

        montblanc.log.info('Primed pinned memory pool '
                           'of size {n} for device {d}.'.format(
                               d=device.name(),
                               n=mbu.fmt_bytes(pinned_allocated)))

        # Now force return of memory to the pools. Pinned arrays
        # are released through their `base` attribute, device
        # allocations are released directly.
        for array_list in pinned_pool_refs.values():
            for pinned_ary in array_list:
                pinned_ary.base.free()

        for array_list in device_pool_refs.values():
            for dev_alloc in array_list:
                dev_alloc.free()
Esempio n. 4
0
    def _thread_budget(self, slvr_cfg, A_sub, props):
        """
        Get memory budget and dimension reduction
        information from the CUDA device associated
        with the current thread and context

        Parameters:
            slvr_cfg : SolverConfiguration
                Solver configuration. An optional 'mem_budget' entry
                overrides the default budget of the free device
                memory less 200 * ONE_MB.
            A_sub : iterable of dict
                Sub-solver array definitions, each with at least
                'name' and 'shape' entries.
            props : dict
                Properties used to evaluate the array sizes.
                It is copied, not modified.

        Returns:
            tuple
                (P, modded_dims, required_mem) where P is a copy of
                `props` updated with the modified dimensions,
                modded_dims holds the modified dimensions and
                required_mem is the memory required for A_sub given P.

        Raises:
            MemoryError
                If no viable dimension configuration fits
                within the memory budget.
        """
        montblanc.log.debug('Budgeting in thread %s',
                            threading.current_thread())

        # Query free memory on this context
        (free_mem, total_mem) = cuda.mem_get_info()

        device = self.thread_local.context.get_device()

        montblanc.log.info('{d}: {t} total {f} free.'.format(
            d=device.name(),
            f=mbu.fmt_bytes(free_mem),
            t=mbu.fmt_bytes(total_mem)))

        # Work with a supplied memory budget, otherwise use
        # free memory less an amount equal to the upper size
        # of an NVIDIA context
        mem_budget = slvr_cfg.get('mem_budget', free_mem - 200 * ONE_MB)

        nsolvers = slvr_cfg.get(Options.NSOLVERS)
        na = slvr_cfg.get(Options.NA)
        nsrc = slvr_cfg.get(Options.SOURCE_BATCH_SIZE)
        src_str_list = [Options.NSRC] + mbu.source_nr_vars()
        src_reduction_str = '&'.join(
            ['%s=%s' % (nr_var, nsrc) for nr_var in src_str_list])

        ntime_split = np.int32(np.ceil(100.0 / nsolvers))
        ntime_split_str = 'ntime={n}'.format(n=ntime_split)

        # Figure out a viable dimension configuration
        # given the total problem size
        # NOTE(review): the reduction strings are presumably attempted
        # in list order by viable_dim_config -- confirm in mbu.
        viable, modded_dims = mbu.viable_dim_config(mem_budget, A_sub, props, [
            ntime_split_str, src_reduction_str, 'ntime',
            'nbl={na}&na={na}'.format(na=na), 'nchan=50%'
        ], nsolvers)

        # Create property dictionary with updated
        # dimensions.
        P = props.copy()
        P.update(modded_dims)

        required_mem = mbu.dict_array_bytes_required(A_sub, P)

        if not viable:
            dim_set_str = ', '.join(
                ['%s=%s' % (k, v) for k, v in modded_dims.items()])

            # Describe the arrays from largest to smallest,
            # computing the size of each array only once
            sized_arrays = sorted(
                [(mbu.dict_array_bytes(a, P), a) for a in A_sub],
                reverse=True,
                key=lambda pair: pair[0])

            ary_list_str = '\n'.join([
                '%-*s %-*s %s' %
                (15, a['name'], 10, mbu.fmt_bytes(nbytes),
                 mbu.shape_from_str_tuple(a['shape'], P))
                for nbytes, a in sized_arrays
            ])

            raise MemoryError(
                "Tried reducing the problem size "
                "by setting '%s' on all arrays, "
                "but the resultant required memory of %s "
                "for each of %d solvers is too big "
                "to fit within the memory budget of %s. "
                "List of biggests offenders:\n%s "
                "\nSplitting the problem along the "
                "channel dimension needs to be "
                "implemented." %
                (dim_set_str, mbu.fmt_bytes(required_mem), nsolvers,
                 mbu.fmt_bytes(mem_budget), ary_list_str))

        return P, modded_dims, required_mem
Esempio n. 5
0
    def _thread_prime_memory_pools(self):
        """
        We use memory pools to avoid allocating both CUDA
        pinned host and device memory. This function fakes
        allocations prior to running the solver so that
        the memory pools are 'primed' with memory allocations
        that can be re-used during actual execution of the solver.

        For every solver in ``self.thread_local.solvers``, every
        throttled visibility chunk and every source batch, the
        transfer arrays, the constant data and an ``X2``-sized device
        buffer are allocated from the pools. All allocations are
        held until the end and then freed.
        """

        montblanc.log.debug('Priming memory pools in thread %s',
            threading.current_thread())

        # Retain references to pool allocations
        pinned_pool_refs = defaultdict(list)
        device_pool_refs = defaultdict(list)
        pinned_allocated = 0

        # Class of arrays that are to be transferred
        classifiers = [Classifier.E_BEAM_INPUT,
            Classifier.B_SQRT_INPUT,
            Classifier.EKB_SQRT_INPUT,
            Classifier.COHERENCIES_INPUT]

        # Estimate number of kernels for constant data
        nkernels = len(classifiers)

        # Detect already transferred array chunks
        dirty = {}

        # Get the first chunk of the visibility space
        cpu_slice_map, gpu_slice_map = next(self._gen_vis_slices())

        for subslvr in self.thread_local.solvers:
            # For the maximum number of visibility chunks that can be enqueued
            for _ in range(self.throttle_factor):
                # For each source batch within the visibility chunk
                for cpu_src_slice_map, gpu_src_slice_map in (
                        self._gen_source_slices()):
                    cpu_slice_map.update(cpu_src_slice_map)
                    gpu_slice_map.update(gpu_src_slice_map)

                    # Allocate pinned memory for transfer arrays
                    # retaining references to them
                    refs = self._enqueue_array(subslvr,
                        cpu_slice_map, gpu_slice_map,
                        direction=ASYNC_HTOD, dirty=dirty,
                        classifiers=classifiers)
                    pinned_allocated += sum(ref.nbytes
                        for ref_list in refs.values()
                        for ref in ref_list)
                    _update_refs(pinned_pool_refs, refs)

                    # Allocate pinned memory for constant memory transfers
                    cdata = subslvr.const_data().ndary()

                    for _ in range(nkernels):
                        cdata_ref = subslvr.pinned_mem_pool.allocate(
                            shape=cdata.shape, dtype=cdata.dtype)
                        pinned_allocated += cdata_ref.nbytes
                        pinned_pool_refs['cdata'].append(cdata_ref)

                    # Allocate device memory for arrays that need to be
                    # allocated from a pool by PyCUDA's reduction kernels
                    dev_ary = subslvr.dev_mem_pool.allocate(self.X2.nbytes)
                    device_pool_refs['X2_gpu'].append(dev_ary)

        device = self.thread_local.context.get_device()

        montblanc.log.info('Primed pinned memory pool '
            'of size {n} for device {d}.'.format(
                d=device.name(), n=mbu.fmt_bytes(pinned_allocated)))

        # Now force return of memory to the pools. Pinned arrays
        # are released through their `base` attribute, device
        # allocations are released directly.
        for array_list in pinned_pool_refs.values():
            for pinned_ary in array_list:
                pinned_ary.base.free()

        for array_list in device_pool_refs.values():
            for dev_alloc in array_list:
                dev_alloc.free()
Esempio n. 6
0
    def __init__(self, slvr_cfg):
        """
        RimeSolver Constructor

        Registers the dimensions, properties and arrays of the composite
        solver, creates a single-threaded enqueue executor and sync
        executor for every device context (running ``_thread_init`` on
        each), selects the memory budget with the lowest required memory
        and finally creates, configures and primes the sub-solvers on
        each enqueue executor.

        Parameters:
            slvr_cfg : SolverConfiguration
                Solver Configuration variables
        """
        super(CompositeRimeSolver, self).__init__(slvr_cfg=slvr_cfg)

        # Create thread local storage
        self.thread_local = threading.local()

        self.register_default_dimensions()

        # Configure the dimensions of the beam cube
        self.register_dimension('beam_lw',
            slvr_cfg[Options.E_BEAM_WIDTH],
            description='E cube l width')

        self.register_dimension('beam_mh',
            slvr_cfg[Options.E_BEAM_HEIGHT],
            description='E cube m height')

        self.register_dimension('beam_nud',
            slvr_cfg[Options.E_BEAM_DEPTH],
            description='E cube nu depth')

        # Monkey patch v4 antenna pair functions into the object
        from montblanc.impl.rime.v4.ant_pairs import monkey_patch_antenna_pairs
        monkey_patch_antenna_pairs(self)

        # Copy the v4 arrays and properties and
        # modify them for use on this Composite Solver
        A_main, P_main = self._cfg_comp_slvr_arys_and_props(v4Arrays, v4Props)

        self.register_properties(P_main)
        self.register_arrays(A_main)

        # Look for ignored and supplied arrays in the solver configuration
        array_cfg = slvr_cfg.get('array_cfg', {})
        ignore = array_cfg.get('ignore', None)
        supplied = array_cfg.get('supplied', None)

        # Create arrays on the solver, ignoring
        # and using supplied arrays as necessary
        self.create_arrays(ignore, supplied)

        # PyCUDA contexts for each GPU device
        self.dev_ctxs = slvr_cfg.get(Options.CONTEXT)
        # Number of GPU Solvers created for each device
        nsolvers = slvr_cfg.get(Options.NSOLVERS)
        # Maximum number of enqueued visibility chunks
        # before throttling is applied
        self.throttle_factor = slvr_cfg.get(
            Options.VISIBILITY_THROTTLE_FACTOR)

        # Massage the contexts for each device into a list
        if not isinstance(self.dev_ctxs, list):
            self.dev_ctxs = [self.dev_ctxs]

        montblanc.log.info('Using {d} solver(s) per device.'.format(
            d=nsolvers))

        # Shorten the type name
        C = CompositeRimeSolver

        # Create a one thread executor for each device context,
        # i.e. a thread per device
        enqueue_executors = [cf.ThreadPoolExecutor(1) for _ in self.dev_ctxs]
        sync_executors = [cf.ThreadPoolExecutor(1) for _ in self.dev_ctxs]

        self.enqueue_executors = enqueue_executors
        self.sync_executors = sync_executors
        self.initialised = False
        self._vis_write_mode = slvr_cfg.get(Options.VISIBILITY_WRITE_MODE)

        montblanc.log.info('Created {d} executor(s).'.format(
            d=len(enqueue_executors)))

        # Initialise executor threads, waiting for each to complete
        for ex, ctx in zip(enqueue_executors, self.dev_ctxs):
            ex.submit(C._thread_init, self, ctx).result()

        for ex, ctx in zip(sync_executors, self.dev_ctxs):
            ex.submit(C._thread_init, self, ctx).result()

        montblanc.log.info('Initialised {d} thread(s).'.format(
            d=len(enqueue_executors)))

        # Get a template dictionary
        T = self.template_dict()

        A_sub, P_sub = self._cfg_sub_slvr_arys_and_props(v4Arrays, v4Props)
        self._validate_arrays(A_sub)

        # Each budget is the (P, modded_dims, required_mem) tuple
        # returned by _thread_budget for one device
        budgets = [ex.submit(C._thread_budget, self,
                        slvr_cfg, A_sub, T).result()
                    for ex in enqueue_executors]

        # Find the budget with the lowest memory usage, i.e. work
        # with the device with the lowest memory. The selection key
        # must be the required memory (index 2), not the dictionary
        # of dimension reductions (index 1).
        P, M, mem = min(budgets, key=lambda budget: budget[2])

        # Log some information about the memory budget
        # and dimension reduction
        montblanc.log.info(('Selected a solver memory budget of {b} '
            'for {d} solvers.').format(b=mbu.fmt_bytes(mem), d=nsolvers))

        montblanc.log.info(('The following dimension reductions '
            'have been applied:'))

        for k, v in M.items():
            montblanc.log.info('{p}{d}: {id} => {rd}'.format
                (p=' '*4, d=k, id=T[k], rd=v))

        # Create the sub solver configuration
        subslvr_cfg = slvr_cfg.copy()
        subslvr_cfg[Options.DATA_SOURCE] = Options.DATA_SOURCE_EMPTY
        # Use the last device context explicitly instead of relying on
        # a loop variable that leaked out of the initialisation loops.
        # NOTE(review): with several devices this is only one of the
        # contexts -- confirm that _thread_create_solvers substitutes
        # the context belonging to its own thread.
        subslvr_cfg[Options.CONTEXT] = self.dev_ctxs[-1]

        subslvr_cfg = self._cfg_subslvr_dims(subslvr_cfg, P)

        # Extract the dimension differences
        self.src_diff = P[Options.NSRC]
        self.time_diff = P[Options.NTIME]
        self.ant_diff = P[Options.NA]
        self.bl_diff = P[Options.NBL]
        self.chan_diff = P[Options.NCHAN]

        montblanc.log.info('Creating {s} solver(s) on {d} device(s).'
            .format(s=nsolvers, d=len(enqueue_executors)))

        # Now create the solvers on each thread
        for ex in enqueue_executors:
            ex.submit(C._thread_create_solvers,
                self, subslvr_cfg, P, nsolvers).result()

        montblanc.log.info('Solvers Created')

        # Register arrays and properties on each thread's solvers
        for ex in enqueue_executors:
            ex.submit(C._thread_reg_sub_arys_and_props,
                self, A_sub, P_sub).result()

        montblanc.log.info('Priming Memory Pools')

        # Prime the memory pools on each sub-solver
        for ex in enqueue_executors:
            ex.submit(C._thread_prime_memory_pools, self).result()
Esempio n. 7
0
    def _thread_budget(self, slvr_cfg, A_sub, props):
        """
        Get memory budget and dimension reduction
        information from the CUDA device associated
        with the current thread and context

        Parameters:
            slvr_cfg : SolverConfiguration
                Solver configuration. An optional 'mem_budget' entry
                overrides the default budget of the free device
                memory less 200*ONE_MB.
            A_sub : iterable of dict
                Sub-solver array definitions, each with at least
                'name' and 'shape' entries.
            props : dict
                Properties used to evaluate the array sizes.
                It is copied, not modified.

        Returns:
            tuple
                (P, modded_dims, required_mem) where P is a copy of
                `props` updated with the modified dimensions,
                modded_dims holds the modified dimensions and
                required_mem is the memory required for A_sub given P.

        Raises:
            MemoryError
                If no viable dimension configuration fits
                within the memory budget.
        """
        montblanc.log.debug('Budgeting in thread %s', threading.current_thread())

        # Query free memory on this context
        (free_mem, total_mem) = cuda.mem_get_info()

        device = self.thread_local.context.get_device()

        montblanc.log.info('{d}: {t} total {f} free.'.format(
           d=device.name(), f=mbu.fmt_bytes(free_mem), t=mbu.fmt_bytes(total_mem)))

        # Work with a supplied memory budget, otherwise use
        # free memory less an amount equal to the upper size
        # of an NVIDIA context
        mem_budget = slvr_cfg.get('mem_budget', free_mem - 200*ONE_MB)

        nsolvers = slvr_cfg.get(Options.NSOLVERS)
        na = slvr_cfg.get(Options.NA)
        nsrc = slvr_cfg.get(Options.SOURCE_BATCH_SIZE)
        src_str_list = [Options.NSRC] + mbu.source_nr_vars()
        src_reduction_str = '&'.join(['%s=%s' % (nr_var, nsrc)
            for nr_var in src_str_list])

        ntime_split = np.int32(np.ceil(100.0 / nsolvers))
        ntime_split_str = 'ntime={n}'.format(n=ntime_split)

        # Figure out a viable dimension configuration
        # given the total problem size
        # NOTE(review): the reduction strings are presumably attempted
        # in list order by viable_dim_config -- confirm in mbu.
        viable, modded_dims = mbu.viable_dim_config(
            mem_budget, A_sub, props,
                [ntime_split_str, src_reduction_str, 'ntime',
                'nbl={na}&na={na}'.format(na=na),
                'nchan=50%'],
            nsolvers)

        # Create property dictionary with updated
        # dimensions.
        P = props.copy()
        P.update(modded_dims)

        required_mem = mbu.dict_array_bytes_required(A_sub, P)

        if not viable:
            dim_set_str = ', '.join(['%s=%s' % (k, v)
                for k, v in modded_dims.items()])

            # Describe the arrays from largest to smallest,
            # computing the size of each array only once
            sized_arrays = sorted(
                [(mbu.dict_array_bytes(a, P), a) for a in A_sub],
                reverse=True,
                key=lambda pair: pair[0])

            ary_list_str = '\n'.join(['%-*s %-*s %s' % (
                15, a['name'],
                10, mbu.fmt_bytes(nbytes),
                mbu.shape_from_str_tuple(a['shape'], P))
                for nbytes, a in sized_arrays])

            raise MemoryError("Tried reducing the problem size "
                "by setting '%s' on all arrays, "
                "but the resultant required memory of %s "
                "for each of %d solvers is too big "
                "to fit within the memory budget of %s. "
                "List of biggests offenders:\n%s "
                "\nSplitting the problem along the "
                "channel dimension needs to be "
                "implemented." %
                    (dim_set_str,
                    mbu.fmt_bytes(required_mem),
                    nsolvers,
                    mbu.fmt_bytes(mem_budget),
                    ary_list_str))

        return P, modded_dims, required_mem