def make_vertices(self, cycles):
    """Split the cluster's neurons over several cores, one vertex per core.

    The neuron slice and the output dimensions are partitioned jointly
    against a DTCM budget and a CPU budget.  SDRAM is not used as a
    constraint because it is already known to be sufficient (and if it
    were not, the vertices could not fit on a single chip anyway).  The
    input dimensions are then divided into as many pieces as there are
    neuron slices.

    Parameters
    ----------
    cycles :
        Capacity given to the CPU constraint; 80% of it is targeted.

    Returns
    -------
    `self.vertices`, after one `EnsembleSlice` per partition has been
    appended to it.
    """
    # Budgets: 90% of the DTCM figure (64 * 1024) and 80% of the compute.
    dtcm_budget = partition.Constraint(64 * 1024, 0.9)
    cpu_budget = partition.Constraint(cycles, 0.8)

    # Values shared by both usage estimates
    size_in = self.size_in
    cluster_size = self.neuron_slice.stop - self.neuron_slice.start

    def _as_partition_cost(usage_fn):
        """Adapt a usage function to the partitioner's call signature."""
        def cost(neuron_slice, output_slice):
            n_in_slice = neuron_slice.stop - neuron_slice.start
            width_out = output_slice.stop - output_slice.start
            return usage_fn(size_in, width_out, n_in_slice,
                            n_neurons_in_cluster=cluster_size)
        return cost

    constraints = {
        dtcm_budget: _as_partition_cost(_lif_dtcm_usage),
        cpu_budget: _as_partition_cost(_lif_cpu_usage),
    }

    # Jointly partition the neurons and the output dimensions
    self.neuron_slices = []
    output_slices = []
    joint_space = (self.neuron_slice, slice(self.size_out))
    for neuron_part, output_part in partition.partition_multiple(
            joint_space, constraints):
        self.neuron_slices.append(neuron_part)
        output_slices.append(output_part)

    n_slices = len(self.neuron_slices)
    assert n_slices <= 16  # Too many cores in the cluster

    # The input space is cut into the same number of pieces
    input_slices = partition.divide_slice(slice(0, self.size_in), n_slices)

    # One vertex per (input slice, output slice) pair; every vertex is
    # handed the complete list of neuron slices together with its index.
    for index, (in_slice, out_slice) in enumerate(
            zip(input_slices, output_slices)):
        self.vertices.append(
            EnsembleSlice(index, self.neuron_slices, in_slice, out_slice,
                          self.regions)
        )

    return self.vertices
def make_vertices(self, model, n_steps):  # TODO remove n_steps
    """Build the filter regions and the `ValueSinkVertex` objects.

    The filter regions are derived from the signals arriving at this
    object on `InputPort.standard`.  The input space `[0, size_in)` is then
    divided into `ceil(size_in / max_width)` slices, the intent being that
    no vertex is wider than `max_width`, and one vertex is created per
    slice.

    Parameters
    ----------
    model :
        Provides the incoming signals, `dt`, `machine_timestep` and the
        filter routing tag.
    n_steps :
        Forwarded unchanged to every `ValueSinkVertex`.

    Returns
    -------
    The value of `netlistspec(...)` built from the new vertices,
    `self.load_to_machine` and `self.after_simulation`.
    """
    # Filters for everything arriving on the standard input port
    incoming = model.get_signals_to_object(self)[InputPort.standard]
    region_pair = make_filter_regions(
        incoming, model.dt, True, model.keyspaces.filter_routing_tag)
    filter_region, filter_routing_region = region_pair
    self._routing_region = filter_routing_region

    # Ceiling division: one extra vertex for any partial final slice
    whole, leftover = divmod(self.size_in, self.max_width)
    n_vertices = whole + 1 if leftover else whole

    sink_vertices = []
    for sl in divide_slice(slice(0, self.size_in), n_vertices):
        sink_vertices.append(
            ValueSinkVertex(model.machine_timestep, n_steps, sl,
                            filter_region, filter_routing_region)
        )
    self.vertices = tuple(sink_vertices)

    return netlistspec(self.vertices, self.load_to_machine,
                       after_simulation_function=self.after_simulation)
def make_vertices(self, output_signals, machine_timestep, filter_region,
                  filter_routing_region):
    """Split this group's transform by rows and create one core per split.

    When `output_signals` has no `OutputPort.standard` entry the group gets
    an empty list of cores.  Otherwise the transform, keys and output
    slices for `self.column_slice` are fetched and the rows of the
    transform are divided into `ceil(rows / self.max_rows)` slices, the
    intent being to keep each core at or below `max_rows` rows.  No
    chip-level grouping is performed here.

    Parameters
    ----------
    output_signals :
        Mapping from output port to the signals leaving the filter.
    machine_timestep :
    filter_region :
    filter_routing_region :
        Passed through unchanged to every `FilterCore`.

    Returns
    -------
    list
        `self.cores`, the (possibly empty) list of `FilterCore` objects.
    """
    # Nothing leaves this group: no cores are needed
    if OutputPort.standard not in output_signals:
        self.cores = []
        return self.cores

    # Transform, keys and slices restricted to this group's columns
    transform, keys, output_slices = get_transforms_and_keys(
        output_signals[OutputPort.standard], self.column_slice)
    size_out = transform.shape[0]

    # Ceiling division of the row count by the per-core maximum
    full_cores, spare_rows = divmod(size_out, self.max_rows)
    n_cores = full_cores + (1 if spare_rows else 0)

    # A single fixed-point region shared by all cores, sliced by rows
    transform_region = regions.MatrixRegion(
        np_to_fix(transform),
        sliced_dimension=regions.MatrixPartitioning.rows,
    )

    row_slices = divide_slice(slice(0, size_out), n_cores)
    self.cores = [
        FilterCore(self.column_slice, rows, transform_region, keys,
                   output_slices, machine_timestep, filter_region,
                   filter_routing_region)
        for rows in row_slices
    ]
    return self.cores
def make_vertices(self, output_signals, machine_timestep, filter_region,
                  filter_routing_region):
    """Create the labelled `FilterCore` objects for this column group.

    If `OutputPort.standard` is present in `output_signals`, the transform
    for `self.column_slice` is divided by rows into
    `ceil(rows / self.max_rows)` slices and one core is built per slice;
    each core receives `self._label` as its first argument.  If the port
    is absent, the group ends up with no cores.  No chip-level grouping is
    performed here.

    Parameters
    ----------
    output_signals :
        Mapping from output port to the signals leaving the filter.
    machine_timestep :
    filter_region :
    filter_routing_region :
        Forwarded as-is to each `FilterCore`.

    Returns
    -------
    list
        `self.cores`.
    """
    built = []

    if OutputPort.standard in output_signals:
        # Transform, keys and slices for this group's slice of the filter
        standard_signals = output_signals[OutputPort.standard]
        transform, keys, output_slices = get_transforms_and_keys(
            standard_signals, self.column_slice)
        size_out = transform.shape[0]

        # Enough cores that (by intent) none exceeds max_rows rows
        n_cores = size_out // self.max_rows
        if size_out % self.max_rows:
            n_cores += 1

        # The transform region is shared by the cores and sliced by rows
        fixed_point = np_to_fix(transform)
        transform_region = regions.MatrixRegion(
            fixed_point, sliced_dimension=regions.MatrixPartitioning.rows)

        for out_slice in divide_slice(slice(0, size_out), n_cores):
            built.append(
                FilterCore(self._label, self.column_slice, out_slice,
                           transform_region, keys, output_slices,
                           machine_timestep, filter_region,
                           filter_routing_region)
            )

    self.cores = built
    return self.cores
def __init__(self, size_in, max_cols=128, max_rows=64):
    """Create a new parallel Filter.

    The input space `[0, size_in)` is divided into
    `ceil(size_in / max_cols)` column slices and one `FilterGroup` is
    created per slice.

    Parameters
    ----------
    size_in : int
        Width of the filter (length of any incoming signals).
    max_cols : int
    max_rows : int
        Maximum number of columns and rows which may be handled by a
        single processing core.  The defaults (128 and 64 respectively)
        result in the overall connection matrix being decomposed such that
        (a) blocks are sufficiently small to be stored in DTCM, (b)
        network traffic is reduced.
    """
    # NB: max_rows and max_cols determined by experimentation by AM and
    # some modelling by SBF.
    self.size_in = size_in

    # Ceiling division: a partial final group still needs its own entry
    full_groups, remainder = divmod(size_in, max_cols)
    n_groups = full_groups + (1 if remainder else 0)

    column_slices = divide_slice(slice(0, size_in), n_groups)
    self.groups = tuple(
        FilterGroup(columns, max_rows) for columns in column_slices
    )
def make_vertices(self, model, n_steps):  # TODO remove n_steps
    """Create the vertices and filter regions for this object.

    Filter regions are built from the signals received on
    `InputPort.standard`; the routing region is also kept on
    `self._routing_region`.  The range `[0, size_in)` is split into
    `ceil(size_in / max_width)` slices (intended to keep each vertex no
    wider than `max_width`) and a `ValueSinkVertex` is made for each.

    Parameters
    ----------
    model :
        Supplies the incoming signals, `dt`, `machine_timestep` and the
        filter routing tag.
    n_steps :
        Passed on to each `ValueSinkVertex`.

    Returns
    -------
    The value returned by `netlistspec` for the new vertices, with
    `self.load_to_machine` and `self.after_simulation` attached.
    """
    signals_conns = model.get_signals_to_object(self)[InputPort.standard]
    filter_region, filter_routing_region = make_filter_regions(
        signals_conns,
        model.dt,
        True,
        model.keyspaces.filter_routing_tag,
    )
    self._routing_region = filter_routing_region

    # Round the vertex count up when size_in is not a multiple of max_width
    n_vertices = self.size_in // self.max_width
    if self.size_in % self.max_width:
        n_vertices += 1

    def _vertex_for(input_slice):
        """Build the sink vertex responsible for `input_slice`."""
        return ValueSinkVertex(model.machine_timestep, n_steps, input_slice,
                               filter_region, filter_routing_region)

    input_slices = divide_slice(slice(0, self.size_in), n_vertices)
    self.vertices = tuple(map(_vertex_for, input_slices))

    return netlistspec(
        self.vertices,
        self.load_to_machine,
        after_simulation_function=self.after_simulation,
    )
def __init__(self, size_in, label, max_cols=128, max_rows=64):
    """Create a new parallel Filter.

    The input space `[0, size_in)` is divided into
    `ceil(size_in / max_cols)` column slices; each slice becomes a
    `FilterGroup` created with the fixed label "interposer".

    Parameters
    ----------
    size_in : int
        Width of the filter (length of any incoming signals).
    label :
        Accepted but not read anywhere in this constructor.
    max_cols : int
    max_rows : int
        Maximum number of columns and rows which may be handled by a
        single processing core.  The defaults (128 and 64 respectively)
        result in the overall connection matrix being decomposed such that
        (a) blocks are sufficiently small to be stored in DTCM, (b)
        network traffic is reduced.
    """
    # NB: max_rows and max_cols determined by experimentation by AM and
    # some modelling by SBF.
    self.size_in = size_in

    # Number of groups, rounded up so a partial final group is included
    n_groups = size_in // max_cols
    if size_in % max_cols:
        n_groups += 1

    def _group_for(column_slice):
        """Build the group handling `column_slice`."""
        # NOTE(review): `label` is ignored and "interposer" is hard-coded;
        # confirm whether the groups were meant to receive `label`.
        return FilterGroup(column_slice, max_rows, "interposer")

    self.groups = tuple(
        map(_group_for, divide_slice(slice(0, size_in), n_groups))
    )
def test_divide_slice(start, stop, n_items):
    """Check the end points and the count of slices from `divide_slice`.

    The first slice must begin at `start`, the last must end at `stop`,
    and exactly `n_items` slices must be produced.
    """
    whole = slice(start, stop)
    pieces = list(pac.divide_slice(whole, n_items))

    head = pieces[0]
    assert head.start == start

    tail = pieces[-1]
    assert tail.stop == stop

    assert len(pieces) == n_items
def make_vertices(self, model, n_steps):
    """Create the vertices, regions and placement constraints of the filter.

    The steps are:

    1. Fetch the outgoing transform and keys (empty values are used when
       there is no `OutputPort.standard` entry).
    2. If either `self.n_cores_per_chip` or `self.n_chips` is `None`,
       choose both heuristically and store them on `self`.
    3. Slice the input space once (the same slices are reused on every
       chip) and slice the output space across all cores of all chips.
    4. Build the key, transform, system, filter and routing regions.
    5. Create one `ParallelFilterSlice` per core and one
       `SameChipConstraint` per chip.

    Parameters
    ----------
    model :
        Supplies the signals to and from this object, `dt`,
        `machine_timestep` and the filter routing tag.
    n_steps :
        Not used by this method.

    Returns
    -------
    The value of `netlistspec(self.vertices, self.load_to_machine,
    constraints=...)`.
    """
    # Outgoing transform and keys
    outgoing_signals = model.get_signals_from_object(self)
    if OutputPort.standard not in outgoing_signals:
        transform = np.array([[]])
        output_keys = []
        sigs_pars_slices = []
    else:
        transform, output_keys, sigs_pars_slices = get_transforms_and_keys(
            outgoing_signals[OutputPort.standard])

    size_out = len(output_keys)

    # Decide how many cores and chips to use.
    if self.n_cores_per_chip is None or self.n_chips is None:
        # Cores are mostly a function of the input size: aim for at most
        # 32 packets per core per timestep, capped at 16 cores.
        n_cores_per_chip = int(min(16, np.ceil(self.size_in / 32.0)))

        # Chips depend on the transform's columns (size in), its rows
        # (size out) and the cores per chip.
        n_chips = self.n_chips if self.n_chips else 1

        # NOTE(review): n_cores is seeded with n_chips * n_cores_per_chip
        # but is later stored as the per-chip count and multiplied by
        # n_chips again below; confirm this is intended when n_chips > 1.
        n_cores = n_chips * n_cores_per_chip

        while True:
            rows_per_core = int(
                np.ceil(float(size_out) / (n_cores * n_chips)))

            # Heuristic: stop once a core handles at most 8,000 matrix
            # elements, or when far too many chips are already in use.
            if rows_per_core * self.size_in <= 8000 or n_chips > 9:
                break

            # Prefer adding cores to a chip before adding another chip
            if n_cores >= 16:
                n_chips += 1
            else:
                n_cores += 1

        self.n_cores_per_chip = n_cores
        self.n_chips = n_chips

    # Input slices are repeated on every chip ...
    input_slices = list(
        divide_slice(slice(0, self.size_in), self.n_cores_per_chip))

    # ... whereas output slices are spread over all cores of all chips.
    output_slices = divide_slice(
        slice(0, size_out), self.n_cores_per_chip * self.n_chips)

    # Output keys are partitioned by atom; the transform is sliced by rows.
    cluster_field = regions.KeyField({'cluster': 'cluster'})
    self.output_keys_region = regions.KeyspacesRegion(
        output_keys, fields=[cluster_field], partitioned_by_atom=True)
    self.transform_region = regions.MatrixRegion(
        np_to_fix(transform),
        sliced_dimension=regions.MatrixPartitioning.rows)

    self.system_region = SystemRegion(self.size_in, model.machine_timestep)

    # Filters for the incoming signals
    incoming = model.get_signals_to_object(self)
    self.filters_region, self.routing_region = make_filter_regions(
        incoming[InputPort.standard], model.dt, True,
        model.keyspaces.filter_routing_tag, width=self.size_in)

    def _sdram_for(in_slice, out_slice):
        """Bytes of SDRAM needed by the core with these slices."""
        # 24 bytes for the application pointer table plus this core's
        # share (4 bytes per input) of a shared SDRAM vector.
        pointer_table = 24
        shared_vector = 4 * (in_slice.stop - in_slice.start)
        system = self.system_region.sizeof()
        filters = self.filters_region.sizeof_padded()
        routing = self.routing_region.sizeof_padded()
        keys = self.output_keys_region.sizeof_padded(out_slice)
        matrix = self.transform_region.sizeof_padded(out_slice)
        return (pointer_table + shared_vector + system + filters +
                routing + keys + matrix)

    # A single iterator over the output slices is shared by all chips so
    # that each chip continues where the previous one stopped.
    remaining_outputs = iter(output_slices)
    constraints = []

    for _ in range(self.n_chips):
        on_this_chip = []

        # input_slices must stay the first argument to zip: when it runs
        # out, no further output slice is taken from the shared iterator.
        for in_slice, out_slice in zip(input_slices, remaining_outputs):
            resources = {Cores: 1, SDRAM: _sdram_for(in_slice, out_slice)}
            vertex = ParallelFilterSlice(in_slice, out_slice, resources,
                                         sigs_pars_slices)
            on_this_chip.append(vertex)
            self.vertices.append(vertex)

        # Force all of this chip's vertices onto the same chip
        constraints.append(SameChipConstraint(on_this_chip))

    return netlistspec(self.vertices, self.load_to_machine,
                       constraints=constraints)
def test_divide_slice(start, stop, n_items):
    """Verify the boundaries and the number of slices from `divide_slice`.

    Dividing `slice(start, stop)` into `n_items` must give a sequence whose
    first slice starts at `start`, whose last slice stops at `stop`, and
    whose length is `n_items`.
    """
    divided = tuple(pac.divide_slice(slice(start, stop), n_items))

    # Outer boundaries are preserved
    first_slice = divided[0]
    assert first_slice.start == start
    last_slice = divided[-1]
    assert last_slice.stop == stop

    # The requested number of slices is produced
    n_produced = len(divided)
    assert n_produced == n_items
def make_vertices(self, cycles):
    """Partition the cluster's neurons onto cores and create the vertices.

    Neurons and output dimensions are partitioned together so that each
    piece satisfies a DTCM limit and a CPU limit.  SDRAM is not a
    constraint here: it is already known to be sufficient (otherwise the
    vertices could not share a single chip at all).  The input dimensions
    are afterwards divided into the same number of pieces.

    Parameters
    ----------
    cycles :
        Capacity used for the CPU constraint (80% of it is the target).

    Returns
    -------
    `self.vertices`, extended by one `EnsembleSlice` per partition.
    """
    # (constraint, usage estimator) pairs: 90% of DTCM and 80% of compute
    limits = (
        (partition.Constraint(64 * 2 ** 10, 0.9), _lif_dtcm_usage),
        (partition.Constraint(cycles, 0.8), _lif_cpu_usage),
    )

    # Size of the whole cluster; every usage estimate receives it
    total_neurons = self.neuron_slice.stop - self.neuron_slice.start

    def _adapt(estimator, size_in, **extra):
        """Give `estimator` the call signature the partitioner expects."""
        def _usage(neuron_slice, output_slice):
            neurons_here = neuron_slice.stop - neuron_slice.start
            outputs_here = output_slice.stop - output_slice.start
            return estimator(size_in, outputs_here, neurons_here, **extra)
        return _usage

    constraints = {
        limit: _adapt(estimator, self.size_in,
                      n_neurons_in_cluster=total_neurons)
        for limit, estimator in limits
    }

    # Partition the neurons and the output space together
    self.neuron_slices = []
    output_slices = []
    partitions = partition.partition_multiple(
        (self.neuron_slice, slice(self.size_out)), constraints)
    for neurons, outputs in partitions:
        self.neuron_slices.append(neurons)
        output_slices.append(outputs)

    n_slices = len(self.neuron_slices)
    assert n_slices <= 16  # Too many cores in the cluster

    # Divide the input space to match
    input_slices = partition.divide_slice(slice(0, self.size_in), n_slices)

    # Pair each input slice with an output slice; each vertex also gets
    # its position and the full list of neuron slices.
    for position, pair in enumerate(zip(input_slices, output_slices)):
        in_slice, out_slice = pair
        new_vertex = EnsembleSlice(position, self.neuron_slices, in_slice,
                                   out_slice, self.regions)
        self.vertices.append(new_vertex)

    return self.vertices
def make_vertices(self, model, n_steps):
    """Make the vertices (and their same-chip constraints) for the filter.

    The outgoing transform and keys are collected, the number of cores per
    chip and of chips is chosen when either is still `None`, the input and
    output spaces are sliced, the memory regions are constructed, and
    finally one `ParallelFilterSlice` is created per core together with a
    `SameChipConstraint` per chip.

    Parameters
    ----------
    model :
        Source of the signals to and from this object, `dt`,
        `machine_timestep` and the filter routing tag.
    n_steps :
        Not referenced in this method.

    Returns
    -------
    The result of calling `netlistspec` with `self.vertices`,
    `self.load_to_machine` and the list of constraints.
    """
    # Outgoing transform and keys, or empty placeholders without outputs
    sigs = model.get_signals_from_object(self)
    has_outputs = OutputPort.standard in sigs
    if has_outputs:
        transform, output_keys, sigs_pars_slices = get_transforms_and_keys(
            sigs[OutputPort.standard])
    else:
        transform, output_keys, sigs_pars_slices = np.array([[]]), [], []

    size_out = len(output_keys)

    def _rows_per_core(cores, chips):
        """Rows each core would handle for the given core/chip counts."""
        return int(np.ceil(float(size_out) / (cores * chips)))

    # Work out the core and chip counts unless both were already given.
    if self.n_cores_per_chip is None or self.n_chips is None:
        # Target at most 32 incoming packets per core per timestep, with
        # no more than 16 cores.
        n_cores_per_chip = int(min(16, np.ceil(self.size_in / 32.0)))
        n_chips = self.n_chips or 1

        # NOTE(review): n_cores begins as n_chips * n_cores_per_chip yet
        # is stored as the per-chip figure and multiplied by n_chips when
        # computing rows per core; verify for n_chips > 1.
        n_cores = n_chips * n_cores_per_chip

        # Heuristic: keep growing while a core would process more than
        # 8,000 matrix elements, unless more than 9 chips are in use.
        rows = _rows_per_core(n_cores, n_chips)
        while rows * self.size_in > 8000 and n_chips <= 9:
            if n_cores < 16:
                # Add a core to each chip while that is still possible
                n_cores += 1
            else:
                # Otherwise bring in another chip
                n_chips += 1
            rows = _rows_per_core(n_cores, n_chips)

        self.n_cores_per_chip = n_cores
        self.n_chips = n_chips

    # Input slices (identical on every chip) and output slices (spread
    # over every core of every chip).
    whole_input = slice(0, self.size_in)
    input_slices = list(divide_slice(whole_input, self.n_cores_per_chip))
    whole_output = slice(0, size_out)
    output_slices = divide_slice(
        whole_output, self.n_cores_per_chip * self.n_chips)

    # Regions: keys partitioned by atom, transform sliced by rows
    self.output_keys_region = regions.KeyspacesRegion(
        output_keys,
        fields=[regions.KeyField({'cluster': 'cluster'})],
        partitioned_by_atom=True,
    )
    fixed_transform = np_to_fix(transform)
    self.transform_region = regions.MatrixRegion(
        fixed_transform,
        sliced_dimension=regions.MatrixPartitioning.rows,
    )
    self.system_region = SystemRegion(self.size_in, model.machine_timestep)

    # Filter and routing regions for the incoming signals
    incoming = model.get_signals_to_object(self)
    filter_regions = make_filter_regions(
        incoming[InputPort.standard],
        model.dt,
        True,
        model.keyspaces.filter_routing_tag,
        width=self.size_in,
    )
    self.filters_region, self.routing_region = filter_regions

    # Vertices and constraints.  The output-slice iterator is shared
    # between chips so that every output slice is used exactly once.
    output_iterator = iter(output_slices)
    cons = []

    for _ in range(self.n_chips):
        chip_vertices = []

        # zip draws from input_slices first, so no output slice is lost
        # when the input slices of a chip are exhausted.
        for in_slice, out_slice in zip(input_slices, output_iterator):
            # SDRAM: 24 bytes of application pointer table, this core's
            # contribution to a shared SDRAM vector, then each region.
            sdram = 24
            sdram += 4 * (in_slice.stop - in_slice.start)
            sdram += self.system_region.sizeof()
            sdram += self.filters_region.sizeof_padded()
            sdram += self.routing_region.sizeof_padded()
            sdram += self.output_keys_region.sizeof_padded(out_slice)
            sdram += self.transform_region.sizeof_padded(out_slice)

            core_vertex = ParallelFilterSlice(
                in_slice, out_slice, {Cores: 1, SDRAM: sdram},
                sigs_pars_slices)
            chip_vertices.append(core_vertex)
            self.vertices.append(core_vertex)

        # All vertices gathered in this pass must share a chip
        cons.append(SameChipConstraint(chip_vertices))

    return netlistspec(self.vertices, self.load_to_machine, constraints=cons)