def compute_cov_cpu(lambda0, coords, n_procs):
    """ Massively parallelized version (on CPU) """
    inv_lambda_2 = -np.sqrt(3) / lambda0
    n_cells = coords.shape[0]
    n_dim_coords = coords.shape[1]

    cov_shape = (n_cells, n_cells)
    cov_shared_buffer = RawArray('d', n_cells * n_cells)
    # Wrap as a numpy array so we can easily manipulate its data.
    cov_np = np.frombuffer(cov_shared_buffer).reshape(cov_shape)
    # Copy data to our shared array.
    np.copyto(cov_np, np.zeros(cov_shape))

    coords_shape = (n_cells, n_dim_coords)
    coords_shared_buffer = RawArray('d', n_cells * n_dim_coords)
    # Wrap as a numpy array so we can easily manipulate its data.
    coords_np = np.frombuffer(coords_shared_buffer).reshape(coords_shape)
    # Copy data to our shared array.
    np.copyto(coords_np, coords)

    # Start the process pool and do the computation. Here we pass the
    # shared buffers and their shapes to the initializer of each worker.
    # (Because the shapes are not shared variables, they will be copied
    # to each child process.)
    with Pool(processes=n_procs, initializer=init_worker,
              initargs=(cov_shared_buffer, cov_shape,
                        coords_shared_buffer, coords_shape,
                        inv_lambda_2)) as pool:
        result = pool.map(_worker_func, range(coords_shape[0]))
    return cov_np
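# The snippet above relies on an `init_worker`/`_worker_func` pair that is
# not shown. A minimal sketch of what they might look like, assuming each
# worker rebuilds numpy views over the shared buffers and fills one row of
# the covariance matrix; the exponential row fill is an assumption
# (inv_lambda_2 is negative, so entries decay with distance).
_worker_globals = {}

def init_worker(cov_buf, cov_shape, coords_buf, coords_shape, inv_lambda_2):
    # Runs once per child process; stash numpy views over the shared memory.
    _worker_globals['cov'] = np.frombuffer(cov_buf).reshape(cov_shape)
    _worker_globals['coords'] = np.frombuffer(coords_buf).reshape(coords_shape)
    _worker_globals['inv_lambda_2'] = inv_lambda_2

def _worker_func(i):
    cov = _worker_globals['cov']
    coords = _worker_globals['coords']
    # Distances from cell i to every cell, then an exponential covariance.
    dists = np.linalg.norm(coords - coords[i], axis=1)
    cov[i, :] = np.exp(_worker_globals['inv_lambda_2'] * dists)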
def _set_gym_matrices(self):
    # set the action space
    num_actions = self.topo.get_num_hosts()
    min_bw = 10000.0 / float(self.topo.conf["max_capacity"])
    self.action_min = np.empty(num_actions)
    self.action_min.fill(min_bw)
    self.action_max = np.empty(num_actions)
    self.action_max.fill(1.0)
    # self.action_space = spaces.Box(
    #     low=action_min, high=action_max, dtype=np.float32)
    # Initialize the action arrays shared with the control manager.
    # Qdiscs do not go beyond the uint32 rate limit, which is about 4Gbps.
    tx_rate = RawArray(ctypes.c_uint32, num_actions)
    self.tx_rate = dc_utils.shmem_to_nparray(tx_rate, np.float32)
    active_rate = RawArray(ctypes.c_uint32, num_actions)
    self.active_rate = dc_utils.shmem_to_nparray(active_rate, np.float32)
    log.info("%s Setting action space", (self.short_id))
    log.info("from %s", self.action_min)
    log.info("to %s", self.action_max)
    # set the observation space
    num_ports = self.topo.get_num_sw_ports()
    num_features = len(self.conf["state_model"])
    if self.conf["collect_flows"]:
        num_features += num_actions * 2
    obs_min = np.empty(num_ports * num_features + num_actions)
    obs_min.fill(-np.inf)
    obs_max = np.empty(num_ports * num_features + num_actions)
    obs_max.fill(np.inf)
def __init__(self, init_dict=None):
    """Create a shared memory version of each element of the initial
    dictionary. Creates an empty array otherwise, which will extend
    automatically when keys are added.

    Each different type (all supported types listed in the `types` array
    above) has its own array. For each key we store an index into the
    appropriate array as well as the type of value stored for that key.
    """
    # idx is dict of {key: (array_idx, value_type)}
    self.idx = {}
    # arrays is dict of {value_type: array_of_ctype}
    self.arrays = {}
    if init_dict:
        sizes = {typ: 0 for typ in self.types.keys()}
        for v in init_dict.values():
            if type(v) not in sizes:
                raise TypeError('SharedTable does not support values of ' +
                                'type ' + str(type(v)))
            sizes[type(v)] += 1
        for typ, sz in sizes.items():
            self.arrays[typ] = RawArray(self.types[typ], sz)
        idxs = {typ: 0 for typ in self.types.keys()}
        for k, v in init_dict.items():
            val_type = type(v)
            self.idx[k] = (idxs[val_type], val_type)
            if val_type == str:
                v = sys.intern(v)
            self.arrays[val_type][idxs[val_type]] = v
            idxs[val_type] += 1
    # initialize any needed empty arrays
    for typ, ctyp in self.types.items():
        if typ not in self.arrays:
            self.arrays[typ] = RawArray(ctyp, 0)
    self.lock = Lock()
def start_process(self, max_pulses=None):
    if self.__process.is_alive():
        print('Process is already running')
        return
    if max_pulses is None:
        max_pulses = self.__config['DAQ']['BufferLength']
    print('Starting process...')
    if not self.is_valid:
        print('You have to start the process manually by calling '
              '<object>.start_process()!')
        return
    self._synchpulsetimes = RawArray('d', [-1] * max_pulses)
    self._buttonstates = RawArray('b', [0] * self.number_of_buttons)
    self._buttonpresstimes = [
        RawArray('d', [-1] * max_pulses)
        for n in range(0, self.number_of_buttons)
    ]
    # record only selected buttons
    self._select_buttons = RawArray('b', [1] * self.number_of_buttons)
    # record buttons only in this period
    self._button_record_period = RawArray('d', [0, inf])
    self.__readout_time = ([self.__readout_time[0]] +
                           [self.__readout_time[1]] * self.number_of_buttons)
    self.__process = Process(target=self._run)
    self.__process.start()
    while not self.is_alive:
        pass
    print('[{:.3f}s] - Process is running'.format(self.clock))
def create_multiproc_matrix(state_basis, state_occup, vals, vmat, N):
    global state_basis_sh
    global state_occup_sh
    global vals_sh
    global vmat_sh
    global matrix_sh
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        state_basis_ct = np.ctypeslib.as_ctypes(state_basis)
        state_occup_ct = np.ctypeslib.as_ctypes(state_occup)
        vals_ct = np.ctypeslib.as_ctypes(vals)
        vmat_ct = np.ctypeslib.as_ctypes(vmat.view(dtype='float64'))
        state_basis_sh = RawArray(state_basis_ct._type_, state_basis_ct)
        state_occup_sh = RawArray(state_occup_ct._type_, state_occup_ct)
        vals_sh = RawArray(vals_ct._type_, vals_ct)
        vmat_sh = RawArray(vmat_ct._type_, vmat_ct)
    dim = len(state_basis)
    # Integer block size (the original Python 2 `dim / 20` relied on
    # integer division); guard against a zero-sized block.
    block_size = max(1, dim // 20)
    matrix = np.zeros((dim, dim), dtype=complex)
    matrix_ct = np.ctypeslib.as_ctypes(matrix.view(dtype='float64'))
    matrix_sh = RawArray(matrix_ct._type_, matrix_ct)
    func = partial(fill_per_window_sh, N=N, block_size=block_size)
    idxs = [(i, min(i + block_size, dim)) for i in range(0, dim, block_size)]
    p = Pool(processes=10)
    res = p.map(func, idxs)
    np.copyto(matrix,
              np.ctypeslib.as_array(matrix_sh).view(dtype='complex128'))
    return matrix
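# `fill_per_window_sh` is referenced above but not shown. A minimal sketch
# under the same conventions: each worker re-wraps the module-level shared
# buffer as a complex matrix and fills one block of rows. The per-element
# function `compute_element` is hypothetical; `block_size` is accepted to
# match the partial above but is unused in this sketch.
def fill_per_window_sh(window, N, block_size):
    i_start, i_end = window
    matrix = np.ctypeslib.as_array(matrix_sh).view(dtype='complex128')
    for i in range(i_start, i_end):
        for j in range(matrix.shape[1]):
            matrix[i, j] = compute_element(i, j, N)  # hypothetical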
def begin_compute(cls):
    """Starts the Mandelbrot set computation."""
    no_val = "No {} value set!"
    essential_values = [
        "ITERATIONS", "PIXWIDTH", "PIXHEIGHT", "STEP",
        "RMIN", "RMAX", "IMIN", "IMAX", "PALLETE",
    ]
    for v in essential_values:
        if getattr(cls, v, None) is None:
            raise MandelProcException(no_val.format(v))
    x_coords = RawArray('I', np.arange(cls.PIXWIDTH, dtype='I'))
    pixel_data = RawArray('B', 3 * cls.PIXWIDTH * cls.PIXHEIGHT)
    row_flags = RawArray('B', cls.PIXHEIGHT)
    np_array = np.frombuffer(pixel_data, dtype='B')
    np_array = np_array.reshape(cls.PIXHEIGHT, 3 * cls.PIXWIDTH)
    procs = [cls(i, np_array, row_flags, x_coords) for i in range(cls.STEP)]
    for p in procs:
        p.start()
    return procs, np_array, row_flags
def init_rate_control(ctrl_iface, rate):
    # Initialize the action array shared with the control manager
    tx_rate = RawArray(ctypes.c_uint32, 1)
    tx_rate = dc_utils.shmem_to_nparray(tx_rate, np.float32)
    tx_rate.fill(rate)
    bw_proc = BandwidthController({"test": ctrl_iface}, tx_rate, tx_rate, rate)
    bw_proc.start()
    return tx_rate, bw_proc
def init_shared(self, obs_shape):
    shape = (self.batch_size, ) + obs_shape
    state = np.zeros(shape, dtype=np.float32)
    state = RawArray(c_float, state.reshape(-1))
    state = np.frombuffer(state, c_float).reshape(shape)
    return state
def getRawArrays(self):
    returnMatrix = RawArray(
        'd', self.returnMatrix.reshape(np.prod(self.shapes['return'])))
    excessReturnMatrix = RawArray(
        'd', self.excessReturn.reshape(np.prod(self.shapes['return'])))
    excessMarketReturn = RawArray(
        'd', self.excessMarketReturn.reshape(np.prod(self.shapes['market'])))
    return returnMatrix, excessReturnMatrix, excessMarketReturn
def final_init(self):
    self.buffer_size = (self.descriptor.num_points() *
                        self.descriptor.buffer_mult_factor)
    # logger.info(f"{self.start_connector.parent}:{self.start_connector} to {self.end_connector.parent}:{self.end_connector} buffer of size {self.buffer_size}")
    if self.buffer_size > 50e6:
        logger.debug(f"Limiting buffer size of {self} to 50 Million Points")
        self.buffer_size = 50e6
    self.buff_shared_re = RawArray(ctypes.c_double, int(self.buffer_size))
    self.buff_shared_im = RawArray(ctypes.c_double, int(self.buffer_size))
    self.re_np = np.frombuffer(self.buff_shared_re, dtype=np.float64)
    self.im_np = np.frombuffer(self.buff_shared_im, dtype=np.float64)
def raw_array_from_ndarray(arr):
    if arr.dtype == np.float64:
        raw = RawArray('d', arr.size)
        _tempnparray = np.frombuffer(raw, np.float64).reshape(arr.shape)
    elif arr.dtype == np.int64 or arr.dtype == np.int32:
        # Note: 'i' is a 32-bit int, so int64 inputs are downcast and can
        # overflow for values outside the int32 range.
        raw = RawArray('i', arr.size)
        _tempnparray = np.frombuffer(raw, np.int32).reshape(arr.shape)
    else:
        raise TypeError(f"Unknown numpy dtype: {arr.dtype}")
    np.copyto(_tempnparray, arr)
    return raw
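# Example round trip with the helper above: the RawArray does not carry
# dtype or shape, so both must travel alongside it when handed to a child
# process; the child then re-wraps the same buffer with np.frombuffer.
arr = np.arange(6, dtype=np.float64).reshape(2, 3)
raw = raw_array_from_ndarray(arr)
view = np.frombuffer(raw, np.float64).reshape(arr.shape)
assert np.array_equal(view, arr)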
def main(file):
    predicted_map = np.load(
        os.path.join(raw_predictions, "predicted_map_{}.npy".format(file)))
    points = np.loadtxt(os.path.join(in_path, "{}.xyz".format(file)))
    corrected_maps = np.zeros_like(predicted_map)
    n_points = len(predicted_map)
    predicted_neighborhood_indices = np.load(
        os.path.join(raw_predictions,
                     "predicted_neighborhood_indices_{}.npy".format(file)))
    full_errors = []
    BATCH_SIZE = 64
    shared_predicted_map = RawArray('d', 10000 * (n_nearest_neighbors + 1) * 3)
    shared_predicted_map = np.frombuffer(
        shared_predicted_map,
        dtype=np.float64).reshape(10000, (n_nearest_neighbors + 1), 3)
    np.copyto(shared_predicted_map, predicted_map)
    shared_predicted_neighborhood_indices = RawArray(
        'i', 10000 * (n_nearest_neighbors + 1))
    shared_predicted_neighborhood_indices = np.frombuffer(
        shared_predicted_neighborhood_indices,
        dtype=np.int32).reshape(10000, (n_nearest_neighbors + 1))
    np.copyto(shared_predicted_neighborhood_indices,
              predicted_neighborhood_indices)
    manager = multiprocessing.Manager()
    return_dict = manager.dict()

    global align_patch_func

    def align_patch_func(shared_predicted_map,
                         shared_predicted_neighborhood_indices, i):
        return align_patch(shared_predicted_map,
                           shared_predicted_neighborhood_indices, i)

    jobs = []
    start = time.time()
    print('start:', file)
    with multiprocessing.Pool(64) as pool:
        corrected_maps = pool.map(
            functools.partial(align_patch_func, shared_predicted_map,
                              shared_predicted_neighborhood_indices),
            range(n_points))
    corrected_maps = np.array(corrected_maps)
    np.save(os.path.join(res_path, 'corrected_maps_{}.npy'.format(file)),
            corrected_maps)
    end = time.time()
def __init__(self, limit, item_shape, n_cpu=1):
    """
    The replay buffer object. Stores everything in float32.

    :param limit: (int) the max number of transitions to store
    :param item_shape: a list of tuples of (str) item name and (tuple) the shape for item
        Ex: [("observations", env.observation_space.shape),
             ("actions", env.action_space.shape),
             ("rewards", (1,)),
             ("dones", (1,))]
    """
    self.limit = limit

    global BUFF
    BUFF = AttrDict()
    # a global object that has shared RawArray-based RingBuffers.
    self.BUFF = BUFF

    # item buffers
    BUFF.items = []
    for name, shape in item_shape:
        BUFF.items.append('buffer_' + name)
        BUFF['raw_' + name] = RawArray('f', int(np.prod((limit, ) + shape)))
        BUFF['np_' + name] = \
            np.frombuffer(BUFF['raw_' + name],
                          dtype=np.float32).reshape((limit, ) + shape)
        BUFF['buffer_' + name] = RingBuffer(limit, shape=shape,
                                            data=BUFF['np_' + name])

    # special buffers
    # Note: the 'd' (double) buffers below are reinterpreted as int64;
    # both are 8 bytes wide, so the sizes line up.
    BUFF.raw_tidx = RawArray('d', limit)
    BUFF.np_tidx = np.frombuffer(BUFF.raw_tidx, dtype=np.int64)
    BUFF.buffer_tidx = RingBuffer(limit, shape=(), dtype=np.int64,
                                  data=BUFF.np_tidx)

    BUFF.raw_tleft = RawArray('d', limit)
    BUFF.np_tleft = np.frombuffer(BUFF.raw_tleft, dtype=np.int64)
    BUFF.buffer_tleft = RingBuffer(limit, shape=(), dtype=np.int64,
                                   data=BUFF.np_tleft)

    if 'buffer_bg' in BUFF:  # is this a successful trajectory?
        BUFF.raw_success = RawArray('f', limit)
        BUFF.np_success = np.frombuffer(BUFF.raw_success, dtype=np.float32)
        BUFF.buffer_success = RingBuffer(limit, shape=(), dtype=np.float32,
                                         data=BUFF.np_success)

    # a centralized dict of trajectory_id --> trajectory_idxs
    self.trajectories = OrderedDict()
    self.total_trajectory_len = 0
    self.current_trajectory = 0

    self.pool = None
    self.n_cpu = n_cpu
    if n_cpu > 1:
        self.pool = mp.Pool(n_cpu, initializer=worker_init, initargs=(BUFF,))
def _init_stats_matrices(self, num_ports, num_hosts):
    # Set up the shared stats matrix
    stats_arr_len = num_ports * len(self.stats_dict)
    mp_stats = RawArray(c_ulong, stats_arr_len)
    np_stats = dc_utils.shmem_to_nparray(mp_stats, np.float64)
    self.stats = np_stats.reshape((len(self.stats_dict), num_ports))
    # Set up the shared flow matrix
    if self.collect_flows:
        flow_arr_len = num_ports * num_hosts * 2
        mp_flows = RawArray(c_ubyte, flow_arr_len)
        np_flows = dc_utils.shmem_to_nparray(mp_flows, np.uint8)
        self.flow_stats = np_flows.reshape((num_ports, 2, num_hosts))
    # Save the initialized stats matrix to compute deltas
    self.prev_stats = self.stats.copy()
    self.deltas = np.zeros(shape=(len(self.stats_dict), num_ports))
def __init__(self, clients, server_host, config):
    # setup connection info
    self.hostname = socket.gethostname()
    self.clients = clients
    self.num_clients = len(clients)
    self.server_settings = (server_host, int(config['port']))
    self.backlog_size = int(config['backlog'])
    self.max_retries = int(config['max_retries'])
    # setup logging info
    self.verbose = config['verbose'].upper()[0] == 'T'
    outfilename = os.path.join(
        os.getenv("OUTPUT_DIR"), self.hostname + "_server.out")
    self.outfile = open(outfilename, 'w')
    # setup table
    num_keys = int(config['table_size'])
    # synchronized by means of an indicator array
    self.table = RawArray(c_int, num_keys)
    # setup worker synchronization
    self.num_workers = int(config['server_threads'])
    self.request_queue = Queue()  # multi-producer/multi-consumer queue
    self.sock_locks = [Lock() for i in range(self.num_clients)]
    # a dumb globally-locked array keeps track of pending PUTs
    self.pending = Array(c_int, num_keys)
    # setup multiprocessing info
    super(Server, self).__init__(
        group=None, target=None, name="{} (server)".format(self.hostname))
def __init__(self, max_size):
    from multiprocessing import Lock, RawArray, RawValue
    self._max_size = max_size
    self._array = RawArray('c', max_size)
    self._pos = RawValue('L')
    self._size = RawValue('L')
    self._locks = Lock(), Lock(), Lock()
def __init__(self, host='localhost', port=8001, model='atari',
             observation_shape=None, n_stack_frames=4,
             wait_interval_msec=30, **kwargs):
    super(AsyncAgent, self).__init__(**kwargs)
    self._wait_interval_sec = wait_interval_msec / 1000.0
    self._observation_shape = observation_shape
    n_bytes = int(np.prod(observation_shape))
    self._n_stack_frames = n_stack_frames
    self._state_buffers = [RawArray(c_ubyte, range(n_bytes))
                           for _ in range(self._n_stack_frames)]
    self._n_frames = Value(c_long, 0)
    self._action_buffer = Value(c_long, 1)
    self._stop_signal = Value(c_bool, 0)
    self._state_lock = mpLock()
    self._trtis_client = TrtisClient(host=host, port=port, model_name=model)
def __setitem__(self, key, value):
    """If key is in table, update it. Otherwise, extend the array to make
    room. This uses additive resizing not multiplicative, since the number
    of keys is not likely to change frequently during a run, so do not
    abuse it.
    Raises an error if you try to change the type of the value stored for
    that key; if you need to do this, you must delete the key first.
    """
    val_type = type(value)
    if val_type not in self.types:
        raise TypeError('SharedTable does not support type ' +
                        str(type(value)))
    if val_type == str:
        value = sys.intern(value)
    if key in self.idx:
        idx, typ = self.idx[key]
        if typ != val_type:
            raise TypeError(('Cannot change stored type for {key} from ' +
                             '{v1} to {v2}. You need to del the key first' +
                             ' if you need to change value types.').format(
                                 key=key, v1=typ, v2=val_type))
        self.arrays[typ][idx] = value
    else:
        old_array = self.arrays[val_type]
        ctyp = self.types[val_type]
        new_array = RawArray(ctyp, len(old_array) + 1)
        for i in range(len(old_array)):
            new_array[i] = old_array[i]
        new_array[-1] = value
        self.arrays[val_type] = new_array
        self.idx[key] = (len(new_array) - 1, val_type)
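# A hypothetical companion accessor for the SharedTable above, assuming
# the layout used by __init__ and __setitem__: self.idx maps each key to
# (array_idx, value_type), so a lookup is one dict hit plus one indexed
# read from the per-type RawArray.
def __getitem__(self, key):
    if key not in self.idx:
        raise KeyError('Key {} not found in SharedTable'.format(key))
    idx, typ = self.idx[key]
    return self.arrays[typ][idx]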
def __init__(self): print("SimulationCommunicator object created") #wartosci-rezultaty dzialania symulacji self.ball_x = RawValue('f', 0.0) self.ball_y = RawValue('f', 0.0) self.servo_x = RawValue('i', 0) self.servo_y = RawValue('i', 0) self.corner_tl_x = RawValue('f', 0.0) self.corner_tl_y = RawValue('f', 0.0) self.corner_tr_x = RawValue('f', 0.0) self.corner_tr_y = RawValue('f', 0.0) self.corner_br_x = RawValue('f', 0.0) self.corner_br_y = RawValue('f', 0.0) self.corner_bl_x = RawValue('f', 0.0) self.corner_bl_y = RawValue('f', 0.0) self.cameraFrame = RawArray('i', 3 * 256**2) #zmienne wartosci self.servo_actual_pos = [0, 0] #aktualna pozycja serwa self.servo_target_pos = [0, 0] #docelowa pozycja serwa self.refreshDeltaTime = 1 / 60 self.frameReadTargetDelta = 1 / 40 self.frameReadLastTime = 0.0 self.capturedFrame = np.zeros((256, 256, 3))
def make_raw(array):
    """Create a multiprocessing-ready RawArray; creates a copy of ``array``.

    Parameters
    ----------
    array : np.array
        Array to use. Is copied into a RawArray.

    Returns
    -------
    RawArray
        Raw array; safe for sharing across processes. (The temporary numpy
        wrapper used for the copy is not returned; numpy views must not be
        shared with other processes.)

    References
    ----------
    https://research.wmz.ninja/articles/2018/03/on-sharing-large-arrays-when-using-pythons-multiprocessing.html
    """
    raw_array = RawArray('f', int(np.prod(array.shape)))
    array_np = np.frombuffer(raw_array, dtype=np.float32).reshape(array.shape)
    np.copyto(array_np, array)
    return raw_array
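# A sketch of the intended usage of make_raw, following the initializer
# pattern from the reference above: hand the RawArray itself (plus the
# shape, which it does not carry) to each worker and re-wrap it there,
# since the parent's numpy view must not be shared directly. The helper
# names here are assumptions.
from multiprocessing import Pool

_shared = {}

def _init_pool_worker(raw, shape):
    _shared['x'] = np.frombuffer(raw, dtype=np.float32).reshape(shape)

def _row_sum(i):
    return float(_shared['x'][i].sum())

def parallel_row_sums(array, n_procs=4):
    raw = make_raw(array)
    with Pool(processes=n_procs, initializer=_init_pool_worker,
              initargs=(raw, array.shape)) as pool:
        return pool.map(_row_sum, range(array.shape[0]))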
def __init__(self, entry):
    self.entry = entry
    self.queue = Queue()
    self.raw = RawArray('B', self.MAX_SHM_SIZE)
    self.proc = Process(target=self.p_main, args=[], daemon=True)
    self.proc.start()
    self.joint = Thread(target=self.proc.join, args=[], daemon=True)
    self.joint.start()
def _setup(self):
    self._shared_mem = RawArray('c', self._cap)
    self._base = ctypes.addressof(self._shared_mem)
    self._locker.acquire()
    try:
        self._allocator = PageAllocator(self._shared_mem, self._page_num,
                                        self._page_size)
    finally:
        self._locker.release()
def create_shared_array(ctype, array_shape):
    '''
    Returns a multiprocessing.RawArray and its Numpy wrapper.
    '''
    numel = int(reduce(lambda x, y: x * y, array_shape))
    shared_array = RawArray(ctype, numel)
    shared_array_wrapper = np.frombuffer(shared_array,
                                         dtype=ctype).reshape(array_shape)
    return shared_array, shared_array_wrapper
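# Hypothetical usage of create_shared_array: the raw handle is what gets
# inherited by (or passed to) child processes, while the numpy wrapper is
# for reads and writes in the current process.
import ctypes

shared, wrapped = create_shared_array(ctypes.c_double, (4, 4))
wrapped[:] = 1.0  # writes land in shared memory, visible through `shared`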
def genFakeData():
    # Share
    shape = (16, 1000)
    raw = RawArray('d', shape[0] * shape[1])
    # Don't share
    dat = np.zeros(shape, dtype=np.float64)
    datBuf = np.frombuffer(raw, dtype=np.float64).reshape(shape)
    np.copyto(datBuf, dat)
    return (datBuf, raw, shape)
def ndarray_to_shmem(array):
    """ Converts a numpy.ndarray to a multiprocessing.RawArray object.
    The memory is copied, and the array is flattened.
    """
    arr = array.reshape((-1, ))
    data = RawArray(_numpy_to_ctypes[array.dtype.type], arr.size)
    # Copy the raw bytes; nbytes is the full byte count of the array.
    ctypes.memmove(data, arr.ctypes.data, arr.nbytes)
    return data
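# A possible inverse of ndarray_to_shmem, assuming the caller still knows
# the original dtype and shape (the RawArray preserves neither); the name
# is an assumption.
def shmem_to_ndarray(data, dtype, shape):
    # Zero-copy view over the shared buffer.
    return np.frombuffer(data, dtype=dtype).reshape(shape)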
def run(self):
    if self.extractor_name == 'Neighborhood':
        self.extractor = Neighborhood(**self.extractor_kwargs)
    else:
        self.extractor = Region(**self.extractor_kwargs)
    self.extractor.preprocess(self.image)
    # Build a shared version of the extractor's stack
    extractor_def = {'name': self.extractor_name,
                     'shape': self.extractor.stack.shape,
                     'kwargs': self.extractor_kwargs}
    mem_size = 1
    for s in self.extractor.stack.shape:
        mem_size *= s
    self.shared_stack = RawArray(ctypes.c_float, mem_size)
    stack_shape = self.extractor.stack.shape
    tmp = np.frombuffer(self.shared_stack,
                        dtype=np.float32).reshape(stack_shape)
    tmp[:, :, :] = self.extractor.stack[:, :, :]
    tmp = None
    # Fetch dimensions of vector for a simple location
    vector = self.extractor.extract_at(0, 0)
    # Calculate number of columns, taking the stride into consideration
    cols = self.image.shape[1] // self.stride
    shape = ((cols, ) + vector.shape)
    # Calculate the size of the memory needed for one row
    mem_size = 1
    for s in shape:
        mem_size *= s
    # Determine the max number of processes we can spawn
    if tf.test.is_gpu_available(cuda_only=False):
        mem_available = psutil.virtual_memory().available / 1024 / 1024 / 2
        row_size = (mem_size * 4) / 1024 / 1024
        max_children = int(mem_available / row_size)
    else:
        max_children = cpu_count() - 2
    if max_children <= 0:
        max_children = 1
    elif max_children > 250:
        max_children = 250
    print(max_children)
    for i in range(max_children):
        state = Value('i', -1)
        mem = Array(ctypes.c_float, mem_size)
        array = np.frombuffer(mem.get_obj(), dtype=np.float32).reshape(shape)
        rows = [x for x in range(i * self.stride, self.image.shape[0],
                                 max_children * self.stride)]
        if len(rows) > 0:
            p = Process(target=extract,
                        args=(extractor_def, self.shared_stack, rows, mem,
                              state, shape))
            self.processes.append(p)
            self.mem.append(mem)
            self.states.append(state)
            self.arrays.append(array)
    self.extractor = None
    self.ready = True
    for p in self.processes:
        p.start()
    for p in self.processes:
        p.join()
def _setup(self):
    self._shared_mem = RawArray('c', self._cap)
    self._base = np.frombuffer(self._shared_mem, dtype='uint8',
                               count=self._cap)
    self._locker.acquire()
    try:
        self._allocator = PageAllocator(self._base, self._total_pages,
                                        self._page_size)
    finally:
        self._locker.release()
def __init__(self, comm_info):
    """Initialize shared memory."""
    super(ShareByRawArray, self).__init__()
    self.size_shared_mem = comm_info.get("size", 100000000)
    self.agent_num = comm_info.get("agent_num", 4)
    self.control_q = Queue()
    self.mem = RawArray(c_ubyte, self.size_shared_mem)
    self.size_mem_agent = int(self.size_shared_mem / self.agent_num)
def optimize(self, maxiter=1000, perdiff=0.1):
    """
    Optimizes the posterior distribution given the data. The algorithm
    terminates when either the maximum number of iterations is reached or
    the percent difference in the posterior is less than perdiff.
    """
    # if self.gpu:
    #     self.gdata = to_gpu(np.asarray(self.data, dtype=np.float32))
    #     self.g_ones = to_gpu(np.ones((self.ncomp, 1), dtype=np.float32))
    #     self.g_ones_long = to_gpu(np.ones((self.nobs, 1), dtype=np.float32))
    if self.parallel:
        from multiprocessing import RawArray
        self.shared_dens_mem = RawArray('d', self.nobs * self.ncomp)
        self.shared_dens = np.frombuffer(self.shared_dens_mem).reshape(
            self.nobs, self.ncomp)
        for w in self.workers:
            w.set_dens(self.shared_dens_mem)
            w.start()  # start threads
    if self.gpu:
        self.gpu_workers = init_GPUWorkers(self.data, self.dev_list)
    self.expected_labels()
    ll_2 = self.log_posterior()
    ll_1 = 1
    it = 0
    if self.verbose:
        if self.gpu:
            print("starting GPU enabled BEM")
        else:
            print("starting BEM")
    while np.abs(ll_1 - ll_2) > 0.01 * perdiff and it < maxiter:
        if isinstance(self.verbose, int) and self.verbose and \
                not isinstance(self.verbose, bool):
            if it % self.verbose == 0:
                print("%d:, %f" % (it, ll_2))
        it += 1
        self.maximize_mu()
        self.maximize_Sigma()
        self.maximize_weights()
        self.expected_alpha()
        self.expected_labels()
        ll_1 = ll_2
        ll_2 = self.log_posterior()
    if self.gpu:
        kill_GPUWorkers(self.gpu_workers)
    if self.parallel:
        for i in range(self.num_cores):
            self.work_queue[i].put(None)
def gen_time_results(mat_size, core_list, no_runs):
    if __name__ == '__main__':
        for _ in range(no_runs):
            mat_shape = (mat_size, mat_size)
            data_A = np.random.rand(*mat_shape).astype(np.float32)
            data_B = np.random.rand(*mat_shape).astype(np.float32)
            A = RawArray('f', mat_shape[0] * mat_shape[1])
            B = RawArray('f', mat_shape[0] * mat_shape[1])
            A_np = np.frombuffer(A, dtype=np.float32).reshape(mat_shape)
            B_np = np.frombuffer(B, dtype=np.float32).reshape(mat_shape)
            np.copyto(A_np, data_A)
            np.copyto(B_np, data_B)
            for no_cores in core_list:
                print(no_cores)
                # Assuming the matrix is of size 2^n for int n, we take
                # log2 to find the value of n
                power = np.log2(no_cores) / 2
                # Number of partitions that must be calculated in the
                # result matrix C
                pars_i = int(2**(np.ceil(power)))
                pars_j = int(2**(np.floor(power)))
                # Size of each partition along the i and j axes
                i_size = int(mat_size / pars_i)
                j_size = int(mat_size / pars_j)
                start = time.perf_counter()
                send_list = []
                for i in range(pars_i):
                    for j in range(pars_j):
                        send_list.append([
                            i * i_size, (i + 1) * i_size,
                            j * j_size, (j + 1) * j_size, mat_size
                        ])
                p = Pool(processes=no_cores, initializer=init_worker,
                         initargs=(A, B))
                res_list = p.starmap(matrix_mult, send_list)
                p.close()
                result = np.vstack(
                    np.split(np.concatenate(res_list, axis=1), pars_i,
                             axis=1))
                finish = time.perf_counter()
                time_taken = round(finish - start, 10)
                print(time_taken)
                print("")
    return None
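# The pool above references `init_worker` and `matrix_mult`, which are not
# shown. A minimal sketch consistent with the call sites: the initializer
# stashes the shared operands once per child, and each task multiplies one
# partition of the result. Names and the module-level dict are assumptions.
_shared_mats = {}

def init_worker(A, B):
    _shared_mats['A'] = A
    _shared_mats['B'] = B

def matrix_mult(i_start, i_end, j_start, j_end, mat_size):
    shape = (mat_size, mat_size)
    A_np = np.frombuffer(_shared_mats['A'], dtype=np.float32).reshape(shape)
    B_np = np.frombuffer(_shared_mats['B'], dtype=np.float32).reshape(shape)
    # One (i_size x j_size) block of C = A @ B.
    return A_np[i_start:i_end, :] @ B_np[:, j_start:j_end]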