def _mapfill_hprobs_atom(self, array_to_fill, dest_indices, dest_param_indices1, dest_param_indices2,
                         layout_atom, param_indices1, param_indices2, resource_alloc, eps):
    """
    Populate a block of Hessian values by finite-differencing first derivatives.

    First derivatives with respect to `param_indices2` are computed once at the
    model's current parameter vector, and once more after shifting each
    parameter listed in `param_indices1` by `eps`.  The forward difference of
    the two, divided by `eps`, is written into `array_to_fill`.

    Parameters
    ----------
    array_to_fill : numpy.ndarray
        Destination array; written through ``_fas`` at
        ``[dest_indices, <first-parameter destination>, dest_param_indices2]``.
    dest_indices : slice or index array
        Element indices of `array_to_fill` to write.
    dest_param_indices1 : slice, index array or None
        Destination indices (2nd axis) matching `param_indices1`.  `None`
        means ``0 .. len(param_indices1) - 1``.
    dest_param_indices2 : slice, index array or None
        Destination indices (3rd axis) matching `param_indices2`.  `None`
        means ``0 .. len(param_indices2) - 1``.
    layout_atom : object
        The atom being processed; its ``num_elements`` sizes the scratch arrays.
    param_indices1, param_indices2 : slice, index array or None
        Model parameters to differentiate with respect to.  `None` means all
        ``self.model.num_params`` parameters.
    resource_alloc : ResourceAllocation or None
        Forwarded to the array allocation and derivative routines.  When it is
        not `None`, only the process for which ``is_host_leader`` is true
        writes to `array_to_fill`.
    eps : float
        Finite-difference step.

    Returns
    -------
    None
    """
    if resource_alloc is None:
        is_leader = True
    else:
        is_leader = resource_alloc.is_host_leader

    if param_indices1 is None:
        param_indices1 = list(range(self.model.num_params))
    if param_indices2 is None:
        param_indices2 = list(range(self.model.num_params))
    if dest_param_indices1 is None:
        dest_param_indices1 = list(range(_slct.length(param_indices1)))
    if dest_param_indices2 is None:
        dest_param_indices2 = list(range(_slct.length(param_indices2)))

    # Only the first pair is converted to arrays (they are zipped below);
    # `dest_param_indices2` is intentionally left as-is since a slice is fine there.
    param_indices1 = _slct.to_array(param_indices1)
    dest_param_indices1 = _slct.to_array(dest_param_indices1)

    # Map: global parameter index -> final index within array_to_fill
    final_index_of = dict(zip(param_indices1, dest_param_indices1))

    num_els = layout_atom.num_elements
    if isinstance(param_indices2, slice):
        num_p2 = _slct.length(param_indices2)
    else:
        num_p2 = len(param_indices2)

    base_dprobs, base_shm = _smt.create_shared_ndarray(resource_alloc, (num_els, num_p2), 'd')
    shifted_dprobs, shifted_shm = _smt.create_shared_ndarray(resource_alloc, (num_els, num_p2), 'd')
    all_els = slice(0, num_els)

    # Derivatives at the unperturbed point.
    self.calclib.mapfill_dprobs_atom(self, base_dprobs, all_els, None, layout_atom,
                                     param_indices2, resource_alloc, eps)

    base_vec = self.model.to_vector().copy()
    for iparam in range(self.model.num_params):
        if iparam not in final_index_of:
            continue
        dest1 = final_index_of[iparam]

        shifted_vec = base_vec.copy()
        shifted_vec[iparam] += eps
        self.model.from_vector(shifted_vec, close=True)
        self.calclib.mapfill_dprobs_atom(self, shifted_dprobs, slice(0, num_els), None, layout_atom,
                                         param_indices2, resource_alloc, eps)
        if is_leader:
            _fas(array_to_fill, [dest_indices, dest1, dest_param_indices2],
                 (shifted_dprobs - base_dprobs) / eps)

    # Put the model back the way we found it, then release scratch memory.
    self.model.from_vector(base_vec)
    _smt.cleanup_shared_ndarray(base_shm)
    _smt.cleanup_shared_ndarray(shifted_shm)
def _bulk_fill_hprobs_atom(self, array_to_fill, dest_param_slice1, dest_param_slice2, layout_atom,
                           param_slice1, param_slice2, resource_alloc):
    """
    Fill the Hessian block of a single atom.

    Checks with `resource_alloc` that the required memory may be allocated and
    then delegates to :meth:`_mapfill_hprobs_atom`, using ``self.hessian_eps``
    as the finite-difference step and all rows of `array_to_fill` as the
    destination elements.

    Parameters
    ----------
    array_to_fill : numpy.ndarray
        Destination array for this atom (already restricted to the atom's
        elements by the caller).
    dest_param_slice1, dest_param_slice2 : slice or None
        Destination indices along the two parameter axes.
    layout_atom : object
        The atom being processed; its ``cache_size`` enters the memory estimate.
    param_slice1, param_slice2 : slice
        The model parameters differentiated with respect to.
    resource_alloc : ResourceAllocation
        Used for the memory check and forwarded to the fill routine.

    Returns
    -------
    None
    """
    # Note: *don't* set dest_indices arg = layout.element_slice, as this is already done by caller
    n_params1 = _slct.length(param_slice1)
    n_params2 = _slct.length(param_slice2)
    resource_alloc.check_can_allocate_memory(
        layout_atom.cache_size * self.model.dim * n_params1 * n_params2)

    every_row = slice(0, array_to_fill.shape[0])
    self._mapfill_hprobs_atom(array_to_fill, every_row, dest_param_slice1, dest_param_slice2,
                              layout_atom, param_slice1, param_slice2, resource_alloc,
                              self.hessian_eps)
def _success_dprob(self, circuit, param_slice, cache):
    """
    Derivative of a circuit's success probability with respect to all model parameters.

    Per-operation error rates are ``self._paramvec**2``.  The probability that
    every operation succeeds is the product of ``1 - rate`` over
    `all_inds_to_mult`, and the circuit success probability is
    ``const + (1 - 1/2**width) * successprob_all_ops`` (the circuit succeeds if
    all operations succeed and has a random outcome otherwise).

    Parameters
    ----------
    circuit : object
        Only used to build `cache` (via ``self._circuit_cache``) when `cache`
        is `None`.
    param_slice : slice or None
        Must be `None` or cover all parameters; derivatives with respect to a
        subset of parameters are not supported.
    cache : tuple or None
        ``(one_over_2_width, all_inds_to_mult, all_inds_to_mult_cnt)`` where
        `all_inds_to_mult` lists (with repetition) the parameter indices whose
        success probabilities are multiplied, and `all_inds_to_mult_cnt[i]`
        is the number of times index ``i`` occurs.

    Returns
    -------
    numpy.ndarray
        1D array with one derivative per model parameter.
    """
    assert(param_slice is None or _slct.length(param_slice) == len(self._paramvec)), \
        "No support for derivatives with respect to a subset of model parameters yet!"
    pvec = self._paramvec**2
    dpvec_dparams = 2 * self._paramvec

    if cache is None:
        cache = self._circuit_cache(circuit)
    one_over_2_width, all_inds_to_mult, all_inds_to_mult_cnt = cache

    sp = 1.0 - pvec
    successprob_all_ops = prod(sp[all_inds_to_mult])
    deriv = _np.zeros(len(pvec), 'd')
    for i, n in enumerate(all_inds_to_mult_cnt):
        if sp[i] != 0:
            # d/d(sp_i) of prod_j sp_j**n_j  ==  n_i * (full product) / sp_i
            deriv[i] = n * successprob_all_ops / sp[i] * -1.0
        elif n == 1:
            # sp[i] == 0 makes the quotient above 0/0; the derivative is really the
            # product of all the *other* factors, so compute that directly.
            inds = _np.asarray(all_inds_to_mult, dtype=int)
            deriv[i] = -1.0 * prod(sp[inds[inds != i]])
        # else: sp[i] == 0 and either the parameter is unused (n == 0) or at least
        # one factor of sp[i] survives differentiation (n >= 2): derivative is 0.

    # The circuit succeeds if all ops succeed, and has a random outcome otherwise.
    # successprob_circuit = successprob_all_ops + (1 - successprob_all_ops) / 2**width
    #                     = const + (1-1/2**width)*successprobs_all_ops
    deriv *= (1.0 - one_over_2_width)
    return deriv * dpvec_dparams
def _iter_hprobs_by_rectangle(self, layout, wrt_slices_list, return_dprobs_12):
    """
    Iterate over Hessian "rectangles", one per pair of parameter slices.

    Just needed for compatibility - so base `iter_hprobs_by_rectangle` knows to loop over atoms.
    Similar to `_iter_atom_hprobs_by_rectangle` but runs over all atoms before yielding, and
    the yielded array has leading dimension == number of local elements instead of just one
    atom's number of elements.

    Parameters
    ----------
    layout : object
        Supplies ``num_elements``, ``atoms`` and ``resource_alloc()``.
    wrt_slices_list : iterable
        Iterable of ``(wrtSlice1, wrtSlice2)`` parameter-slice pairs.
    return_dprobs_12 : bool
        Whether to also compute and yield the outer product of the two first
        derivative arrays.

    Yields
    ------
    tuple
        ``(wrtSlice1, wrtSlice2, hprobs)`` or, when `return_dprobs_12` is true,
        ``(wrtSlice1, wrtSlice2, hprobs, dprobs12)``.

    Notes
    -----
    The buffers backing the yielded arrays are released as soon as the generator
    is resumed *or closed*, so the arrays should be consumed before advancing.
    The release is done in a ``finally`` clause so that abandoning the iteration
    early (``break``, an exception in the consumer, ``close()``) does not leak
    the shared memory.
    """
    nElements = layout.num_elements
    resource_alloc = layout.resource_alloc()
    for wrtSlice1, wrtSlice2 in wrt_slices_list:
        dprobs1 = dprobs2 = None
        dprobs1_shm = dprobs2_shm = hprobs_shm = None
        try:
            if return_dprobs_12:
                dprobs1, dprobs1_shm = _smt.create_shared_ndarray(
                    resource_alloc, (nElements, _slct.length(wrtSlice1)), 'd', zero_out=True)
                dprobs2, dprobs2_shm = _smt.create_shared_ndarray(
                    resource_alloc, (nElements, _slct.length(wrtSlice2)), 'd', zero_out=True)

            hprobs, hprobs_shm = _smt.create_shared_ndarray(
                resource_alloc, (nElements, _slct.length(wrtSlice1), _slct.length(wrtSlice2)),
                'd', zero_out=True)

            for atom in layout.atoms:
                self._bulk_fill_hprobs_dprobs_atom(
                    hprobs[atom.element_slice, :, :],
                    dprobs1[atom.element_slice, :] if (dprobs1 is not None) else None,
                    dprobs2[atom.element_slice, :] if (dprobs2 is not None) else None,
                    atom, wrtSlice1, wrtSlice2, resource_alloc)
            #Note: we give resource_alloc as our local `resource_alloc` above because all the arrays
            # have been allocated based on just this subset of processors, unlike a call to
            # bulk_fill_hprobs(...) where the probs & dprobs are memory allocated and filled by a
            # larger group of processors.  (The main function of these args is to know which procs
            # work together to fill the *same* values and which of these are on the *same* host so
            # that only one per host actually writes to the assumed-shared memory.)

            if return_dprobs_12:
                dprobs12 = dprobs1[:, :, None] * dprobs2[:, None, :]  # (KM,N,1) * (KM,1,N') = (KM,N,N')
                yield wrtSlice1, wrtSlice2, hprobs, dprobs12
            else:
                yield wrtSlice1, wrtSlice2, hprobs
        finally:
            # Runs on normal resumption and also when the generator is closed at the `yield`.
            _smt.cleanup_shared_ndarray(dprobs1_shm)
            _smt.cleanup_shared_ndarray(dprobs2_shm)
            _smt.cleanup_shared_ndarray(hprobs_shm)
def mapfill_dprobs_atom(fwdsim, mx_to_fill, dest_indices, dest_param_indices, layout_atom,
                        param_indices, resource_alloc, eps):
    """
    Fill first-derivative values for one atom using forward finite differences.

    Probabilities are computed once at the model's current parameter vector and
    once more after shifting each requested parameter by `eps`; the difference
    divided by `eps` is written into `mx_to_fill`.

    Parameters
    ----------
    fwdsim : object
        Forward simulator; its ``model`` is perturbed and restored.
    mx_to_fill : numpy.ndarray
        Destination array, written via ``_fas`` at
        ``[dest_indices, <destination parameter index>]``.
    dest_indices : slice or index array
        Element indices of `mx_to_fill` to write.
    dest_param_indices : slice, index array or None
        Destination parameter indices matching `param_indices`.  `None` means
        ``0 .. len(param_indices) - 1``.
    layout_atom : object
        The atom being processed; ``num_elements`` sizes the scratch arrays.
    param_indices : slice, index array or None
        Parameters to differentiate with respect to; `None` means all
        ``fwdsim.model.num_params`` parameters.
    resource_alloc : ResourceAllocation or None
        Forwarded to array allocation and to ``mapfill_probs_atom``.
    eps : float
        Finite-difference step.

    Returns
    -------
    None
    """
    if param_indices is None:
        param_indices = list(range(fwdsim.model.num_params))
    if dest_param_indices is None:
        dest_param_indices = list(range(_slct.length(param_indices)))

    param_indices = _slct.to_array(param_indices)
    dest_param_indices = _slct.to_array(dest_param_indices)

    # Map: global parameter index -> final index within mx_to_fill
    final_index_of = dict(zip(param_indices, dest_param_indices))

    base_vec = fwdsim.model.to_vector().copy()
    fwdsim.model.from_vector(base_vec, close=False)  # ensure we call with close=False first

    #Note: no real need for using shared memory here except so that we can pass
    # `resource_alloc` to mapfill_probs_atom and have it potentially use multiple procs.
    num_els = layout_atom.num_elements
    all_els = slice(0, num_els)
    base_probs, base_shm = _smt.create_shared_ndarray(resource_alloc, (num_els,), 'd',
                                                      memory_tracker=None)
    shifted_probs, shifted_shm = _smt.create_shared_ndarray(resource_alloc, (num_els,), 'd',
                                                            memory_tracker=None)
    mapfill_probs_atom(fwdsim, base_probs, all_els, layout_atom, resource_alloc)

    for iparam in range(fwdsim.model.num_params):
        if iparam not in final_index_of:
            continue
        dest_col = final_index_of[iparam]

        shifted_vec = base_vec.copy()
        shifted_vec[iparam] += eps
        fwdsim.model.from_vector(shifted_vec, close=True)
        mapfill_probs_atom(fwdsim, shifted_probs, slice(0, num_els), layout_atom, resource_alloc)
        _fas(mx_to_fill, [dest_indices, dest_col], (shifted_probs - base_probs) / eps)

    # Restore the model's original parameters and release scratch memory.
    fwdsim.model.from_vector(base_vec, close=True)
    _smt.cleanup_shared_ndarray(base_shm)
    _smt.cleanup_shared_ndarray(shifted_shm)
def _iter_atom_hprobs_by_rectangle(self, atom, wrt_slices_list, return_dprobs_12, resource_alloc):
    """
    Iterate over the Hessian "rectangles" of a single atom, one per pair of parameter slices.

    Parameters
    ----------
    atom : object
        The atom to compute for; ``num_elements`` sizes the yielded arrays.
    wrt_slices_list : iterable
        Iterable of ``(wrtSlice1, wrtSlice2)`` parameter-slice pairs.
    return_dprobs_12 : bool
        Whether to also compute and yield the outer product of the two first
        derivative arrays.
    resource_alloc : ResourceAllocation
        Used to allocate the arrays and forwarded to the fill routine.

    Yields
    ------
    tuple
        ``(wrtSlice1, wrtSlice2, hprobs)`` or, when `return_dprobs_12` is true,
        ``(wrtSlice1, wrtSlice2, hprobs, dprobs12)``.

    Notes
    -----
    The buffers backing the yielded arrays are released as soon as the generator
    is resumed *or closed*, so the arrays should be consumed before advancing.
    The release is done in a ``finally`` clause so that abandoning the iteration
    early (``break``, an exception in the consumer, ``close()``) does not leak
    the shared memory.
    """
    #FUTURE could make a resource_alloc.check_can_allocate_memory call here for ('epp', 'epp')?
    nElements = atom.num_elements
    for wrtSlice1, wrtSlice2 in wrt_slices_list:
        dprobs1 = dprobs2 = None
        dprobs1_shm = dprobs2_shm = hprobs_shm = None
        try:
            if return_dprobs_12:
                dprobs1, dprobs1_shm = _smt.create_shared_ndarray(
                    resource_alloc, (nElements, _slct.length(wrtSlice1)), 'd', zero_out=True)
                dprobs2, dprobs2_shm = _smt.create_shared_ndarray(
                    resource_alloc, (nElements, _slct.length(wrtSlice2)), 'd', zero_out=True)

            hprobs, hprobs_shm = _smt.create_shared_ndarray(
                resource_alloc, (nElements, _slct.length(wrtSlice1), _slct.length(wrtSlice2)),
                'd', zero_out=True)

            # Note: no need to index w/ [atom.element_slice,...] (compare with _iter_hprobs_by_rectangle)
            # since these arrays are already sized to this particular atom (not to all the host's atoms)
            self._bulk_fill_hprobs_dprobs_atom(hprobs, dprobs1, dprobs2, atom,
                                               wrtSlice1, wrtSlice2, resource_alloc)
            #Note: we give resource_alloc as our local `resource_alloc` above because all the arrays
            # have been allocated based on just this subset of processors, unlike a call to
            # bulk_fill_hprobs(...) where the probs & dprobs are memory allocated and filled by a
            # larger group of processors.  (The main function of these args is to know which procs
            # work together to fill the *same* values and which of these are on the *same* host so
            # that only one per host actually writes to the assumed-shared memory.)

            if return_dprobs_12:
                dprobs12 = dprobs1[:, :, None] * dprobs2[:, None, :]  # (KM,N,1) * (KM,1,N') = (KM,N,N')
                yield wrtSlice1, wrtSlice2, hprobs, dprobs12
            else:
                yield wrtSlice1, wrtSlice2, hprobs
        finally:
            # Runs on normal resumption and also when the generator is closed at the `yield`.
            _smt.cleanup_shared_ndarray(dprobs1_shm)
            _smt.cleanup_shared_ndarray(dprobs2_shm)
            _smt.cleanup_shared_ndarray(hprobs_shm)
def _success_dprob(self, circuit, param_slice, cache):
    """
    Derivative of a circuit's success probability with respect to all model parameters.

    Per-operation error rates are ``self._paramvec**2`` and the circuit success
    probability is the product of ``1 - rate`` over `all_inds_to_mult`.

    Parameters
    ----------
    circuit : object
        Only used to build `cache` (via ``self._circuit_cache``) when `cache`
        is `None`.
    param_slice : slice or None
        Must be `None` or cover all parameters; derivatives with respect to a
        subset of parameters are not supported.
    cache : tuple or None
        ``(all_inds_to_mult, all_inds_to_mult_cnt)`` where `all_inds_to_mult`
        lists (with repetition) the parameter indices whose success
        probabilities are multiplied, and `all_inds_to_mult_cnt[i]` is the
        number of times index ``i`` occurs.

    Returns
    -------
    numpy.ndarray
        1D array with one derivative per model parameter.
    """
    assert(param_slice is None or _slct.length(param_slice) == len(self._paramvec)), \
        "No support for derivatives with respect to a subset of model parameters yet!"
    pvec = self._paramvec**2
    dpvec_dparams = 2 * self._paramvec

    if cache is None:
        cache = self._circuit_cache(circuit)
    all_inds_to_mult, all_inds_to_mult_cnt = cache

    sp = 1.0 - pvec
    successprob_circuit = prod(sp[all_inds_to_mult])
    deriv = _np.zeros(len(pvec), 'd')
    for i, n in enumerate(all_inds_to_mult_cnt):
        if sp[i] != 0:
            # d/d(sp_i) of prod_j sp_j**n_j  ==  n_i * (full product) / sp_i
            deriv[i] = n * successprob_circuit / sp[i] * -1.0
        elif n == 1:
            # sp[i] == 0 makes the quotient above 0/0; the derivative is really the
            # product of all the *other* factors, so compute that directly.
            inds = _np.asarray(all_inds_to_mult, dtype=int)
            deriv[i] = -1.0 * prod(sp[inds[inds != i]])
        # else: sp[i] == 0 and either the parameter is unused (n == 0) or at least
        # one factor of sp[i] survives differentiation (n >= 2): derivative is 0.

    return deriv * dpvec_dparams
def _success_dprob(self, circuit, param_slice, cache):
    """
    Derivative of the layered success-probability expression w.r.t. all model parameters.

    With ``sp = 1 - self._paramvec**2``, the differentiated quantity is::

        product_layers(1 - alpha * (1 - prod_[inds4layer](sp))) * (prod_[inds4LASTlayer](sp) - 1 / 2**width)

    where the last entry of `inds_to_mult_by_layer` is the readout "layer".
    Indices cannot be repeated within a layer (a given index appears one or
    zero times in each layer's index list).

    Parameters
    ----------
    circuit : object
        Only used to build `cache` (via ``self._circuit_cache``) when `cache`
        is `None`.
    param_slice : slice or None
        Must be `None` or cover all parameters.
    cache : tuple or None
        ``(width, depth, alpha, one_over_2_width, inds_to_mult_by_layer)``.

    Returns
    -------
    numpy.ndarray
        1D array with one derivative per model parameter.
    """
    assert(param_slice is None or _slct.length(param_slice) == len(self._paramvec)), \
        "No support for derivatives with respect to a subset of model parameters yet!"
    pvec = self._paramvec**2
    dpvec_dparams = 2 * self._paramvec

    if cache is None:
        cache = self._circuit_cache(circuit)
    _width, _depth, alpha, one_over_2_width, inds_to_mult_by_layer = cache

    sp = 1.0 - pvec
    deriv = _np.zeros(len(pvec), 'd')

    gate_layers = inds_to_mult_by_layer[:-1]
    readout_inds = inds_to_mult_by_layer[-1]
    num_layers = len(inds_to_mult_by_layer)

    # One "lambda" factor per layer; the readout factor goes in the last slot.
    layer_products = [prod(sp[layer_inds]) for layer_inds in gate_layers]
    lambdas = _np.empty(num_layers, 'd')
    for k, layer_product in enumerate(layer_products):
        lambdas[k] = 1 - alpha * (1 - layer_product)

    readout_success = prod(sp[readout_inds])
    lambdas[num_layers - 1] = readout_success - one_over_2_width
    lambda_total = prod(lambdas)  # includes readout factor as last layer

    # Gate layers: differentiate the current layer's factor, keep all the others.
    for k, layer_inds in enumerate(gate_layers):
        others = lambda_total / lambdas[k]
        layer_product = layer_products[k]
        for ind in layer_inds:
            # what if sp[ind] == 0?
            deriv[ind] += others * alpha * (layer_product / sp[ind]) * -1.0

    # Readout layer
    others = lambda_total / lambdas[-1]
    for ind in readout_inds:
        # what if sp[ind] == 0?
        deriv[ind] += others * (readout_success / sp[ind]) * -1.0

    return deriv * dpvec_dparams
def _bulk_fill_dprobs(self, array_to_fill, layout, pr_array_to_fill):
    """
    Fill derivative (and optionally probability) values for every atom of `layout`.

    Note: we expect that `array_to_fill` points to the memory specifically for this
    processor (a subset of the memory for the host when memory is shared).

    Parameters
    ----------
    array_to_fill : numpy.ndarray
        Destination for derivative values; rows are indexed by each atom's
        ``element_slice``.
    layout : object
        Supplies ``atoms``, ``global_param_slice``, ``param_dimension_blk_sizes``
        and the ``'atom-processing'`` / ``'param-processing'`` resource allocations.
    pr_array_to_fill : numpy.ndarray or None
        When not `None`, also filled with probabilities for each atom.

    Returns
    -------
    None
    """
    blk_size = layout.param_dimension_blk_sizes[0]
    atom_ralloc = layout.resource_alloc('atom-processing')
    param_ralloc = layout.resource_alloc('param-processing')

    # Ensure all procs have finished w/shared memory before we reinit.
    # Note: use *largest* host comm that we fill - so 'atom' comm, not 'param' comm
    atom_ralloc.host_comm_barrier()

    # No host-side parameter slice is needed: array_to_fill is already just this
    # processor's slice of the host memory.
    whole_host_slice = None
    all_params = layout.global_param_slice

    for atom in layout.atoms:
        if pr_array_to_fill is not None:
            self._bulk_fill_probs_atom(pr_array_to_fill[atom.element_slice], atom, atom_ralloc)

        if blk_size is None:
            # Compute all of our derivative columns at once (skips the block logic below).
            self._bulk_fill_dprobs_atom(array_to_fill[atom.element_slice, :], whole_host_slice,
                                        atom, all_params, param_ralloc)
            continue

        # Divide columns into blocks of at most blk_size
        num_params = _slct.length(all_params)  # total number of parameters we're computing
        num_blocks = int(_np.ceil(num_params / blk_size))  # blocks needed for average size == blk_size
        for block in _mpit.slice_up_range(num_params, num_blocks):
            # `block` indexes directly into array_to_fill's columns; shifting it by the
            # start of the global slice gives the actual parameter indices.
            shifted = _slct.shift(block, all_params.start)
            self._bulk_fill_dprobs_atom(array_to_fill[atom.element_slice, :], block, atom,
                                        shifted, param_ralloc)

    atom_ralloc.host_comm_barrier()  # don't exit until all procs' array_to_fill is ready
def _success_dprob(self, circuit, param_slice, cache):
    """
    Derivative of the twirled-gates success-probability expression w.r.t. all model parameters.

    With ``pvec = self._paramvec**2``, ``lambda_ops = 1 - alpha * pvec`` and
    ``sp = 1 - pvec``, the differentiated quantity is::

        prod(lambda_ops[all_inds_to_mult]) * (prod(sp[readout_inds_to_mult]) - one_over_2_width)

    (plus a constant), and the product rule is applied to its two factors.

    Parameters
    ----------
    circuit : object
        Only used to build `cache` (via ``self._circuit_cache``) when `cache`
        is `None`.
    param_slice : slice or None
        Must be `None` or cover all parameters.
    cache : tuple or None
        ``(width, depth, alpha, one_over_2_width, all_inds_to_mult,
        readout_inds_to_mult, all_inds_to_mult_cnt)``.

    Returns
    -------
    numpy.ndarray
        1D array with one derivative per model parameter.
    """
    assert(param_slice is None or _slct.length(param_slice) == len(self._paramvec)), \
        "No support for derivatives with respect to a subset of model parameters yet!"
    pvec = self._paramvec**2
    dpvec_dparams = 2 * self._paramvec

    if cache is None:
        cache = self._circuit_cache(circuit)
    (_width, _depth, alpha, one_over_2_width,
     all_inds_to_mult, readout_inds_to_mult, all_inds_to_mult_cnt) = cache

    num_params = len(pvec)
    sp = 1.0 - pvec
    lambda_ops = 1.0 - alpha * pvec

    # Factor 1: the depolarizing constant for the full sequence of twirled gates.
    lambda_total = prod(lambda_ops[all_inds_to_mult])
    gates_deriv = _np.zeros(num_params, 'd')
    for idx, count in enumerate(all_inds_to_mult_cnt):
        # -alpha = d(lambda_ops)/d(pvec)
        gates_deriv[idx] = count * lambda_total / lambda_ops[idx] * -alpha

    # Factor 2: the readout success probability.
    readout_success = prod(sp[readout_inds_to_mult])
    readout_deriv = _np.zeros(num_params, 'd')
    for idx in readout_inds_to_mult:
        # what if sp[idx] == 0?
        readout_deriv[idx] = (readout_success / sp[idx]) * -1.0

    # successprob_circuit = lambda_total * (readout_success - one_over_2_width) + one_over_2_width
    # => product rule:
    readout_factor = readout_success - one_over_2_width
    return (gates_deriv * readout_factor + lambda_total * readout_deriv) * dpvec_dparams
def _jacobian_fn(gauge_group_el):
    """
    Compute the Jacobian of the gauge-optimization objective terms.

    This function relies on names from its enclosing scope (e.g. `model`,
    `full_target_model`, `frobenius_transform_target`, `item_weights`,
    `opWeight`, `spamWeight`, `cptp_penalty_factor`, `spam_penalty_factor`,
    `mxBasis`, `comm`, `check_jac` and `_objective_fn`).

    Parameters
    ----------
    gauge_group_el : object
        The gauge-group element at which to evaluate the Jacobian.  It must
        provide ``num_params``, ``transform_matrix_inverse``,
        ``deriv_wrt_params``, ``inverse()`` and ``to_vector()``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(L, N)`` where ``N`` is the number of gauge-group
        parameters and ``L`` is the number of model elements plus any penalty
        rows.  Columns are computed in a distributed fashion and then gathered
        so the array is completely filled on all processors.
    """

    #Penalty terms below always act on the transformed non-target model.
    original_gauge_group_el = gauge_group_el

    if frobenius_transform_target:
        gauge_group_el = gauge_group_el.inverse()
        mdl_pre = full_target_model.copy()
        mdl_post = mdl_pre.copy()
    else:
        mdl_pre = model.copy()
        mdl_post = mdl_pre.copy()
    mdl_post.transform_inplace(gauge_group_el)

    # Indices: Jacobian output matrix has shape (L, N)
    start = 0
    d = mdl_pre.dim
    N = gauge_group_el.num_params
    L = mdl_pre.num_elements

    #Compute "extra" (i.e. beyond the model-element) rows of jacobian
    # NOTE(review): rows are reserved here when a penalty factor is != 0, but the
    # penalty rows are only filled below when the factor is > 0.
    if cptp_penalty_factor != 0:
        L += _cptp_penalty_size(mdl_pre)
    if spam_penalty_factor != 0:
        L += _spam_penalty_size(mdl_pre)

    #Set basis for penalty term calculation
    if cptp_penalty_factor != 0 or spam_penalty_factor != 0:
        mdl_pre.basis = mxBasis
        mdl_post.basis = mxBasis

    jacMx = _np.zeros((L, N))

    #Overview of terms:
    # objective: op_term = (S_inv * gate * S - target_op)
    # jac:       d(op_term) = (d (S_inv) * gate * S + S_inv * gate * dS )
    #            d(op_term) = (-(S_inv * dS * S_inv) * gate * S + S_inv * gate * dS )

    # objective: rho_term = (S_inv * rho - target_rho)
    # jac:       d(rho_term) = d (S_inv) * rho
    #            d(rho_term) = -(S_inv * dS * S_inv) * rho

    # objective: ET_term = (E.T * S - target_E.T)
    # jac:       d(ET_term) = E.T * dS

    #Overview of terms when frobenius_transform_target == True.  Note that the objective
    #expressions are identical to the above except for an additional overall minus sign and S <=> S_inv.

    # objective: op_term = (gate - S * target_op * S_inv)
    # jac:       d(op_term) = -(dS * target_op * S_inv + S * target_op * -(S_inv * dS * S_inv) )
    #            d(op_term) = (-dS * target_op * S_inv + S * target_op * (S_inv * dS * S_inv) )

    # objective: rho_term = (rho - S * target_rho)
    # jac:       d(rho_term) = - dS * target_rho

    # objective: ET_term = (E.T - target_E.T * S_inv)
    # jac:       d(ET_term) = - target_E.T * -(S_inv * dS * S_inv)
    #            d(ET_term) = target_E.T * (S_inv * dS * S_inv)

    #Distribute computation across processors
    allDerivColSlice = slice(0, N)
    derivSlices, myDerivColSlice, derivOwners, mySubComm = \
        _mpit.distribute_slice(allDerivColSlice, comm)
    if mySubComm is not None:
        _warnings.warn("Note: more CPUs(%d)" % comm.Get_size()
                       + " than gauge-opt derivative columns(%d)!" % N)  # pragma: no cover

    n = _slct.length(myDerivColSlice)
    wrtIndices = _slct.indices(myDerivColSlice) if (n < N) else None
    my_jacMx = jacMx[:, myDerivColSlice]  # just the columns I'm responsible for

    # S, and S_inv are shape (d,d)
    #S       = gauge_group_el.transform_matrix
    S_inv = gauge_group_el.transform_matrix_inverse
    dS = gauge_group_el.deriv_wrt_params(wrtIndices)  # shape (d*d),n
    dS.shape = (d, d, n)  # call it (d1,d2,n)
    dS = _np.rollaxis(dS, 2)  # shape (n, d1, d2)
    assert(dS.shape == (n, d, d))

    # --- NOTE: ordering here, with running `start` index MUST
    #           correspond to those in Model.residuals, which in turn
    #           must correspond to those in ForwardSimulator.residuals - which
    #           currently orders as: gates, simplified_ops, preps, effects.

    # -- LinearOperator terms
    # -------------------------
    for lbl, G in mdl_pre.operations.items():
        # d(op_term) = S_inv * (-dS * S_inv * G * S + G * dS) = S_inv * (-dS * G' + G * dS)
        #   Note: (S_inv * G * S) is G' (transformed G)
        wt = item_weights.get(lbl, opWeight)
        left = -1 * _np.dot(dS, mdl_post.operations[lbl].to_dense(on_space='minimal'))  # shape (n,d1,d2)
        right = _np.swapaxes(_np.dot(G.to_dense(on_space='minimal'), dS), 0, 1)  # shape (d1,n,d2) -> (n,d1,d2)
        result = _np.swapaxes(_np.dot(S_inv, left + right), 1, 2)  # shape (d1, d2, n)
        result = result.reshape((d**2, n))  # must copy b/c non-contiguous
        my_jacMx[start:start + d**2] = wt * result
        start += d**2

    # -- Instrument terms
    # -------------------------
    for ilbl, Inst in mdl_pre.instruments.items():
        wt = item_weights.get(ilbl, opWeight)
        for lbl, G in Inst.items():
            # same calculation as for operation terms
            left = -1 * _np.dot(dS, mdl_post.instruments[ilbl][lbl].to_dense(on_space='minimal'))  # (n,d1,d2)
            right = _np.swapaxes(_np.dot(G.to_dense(on_space='minimal'), dS), 0, 1)  # (d1,n,d2) -> (n,d1,d2)
            result = _np.swapaxes(_np.dot(S_inv, left + right), 1, 2)  # shape (d1, d2, n)
            result = result.reshape((d**2, n))  # must copy b/c non-contiguous
            my_jacMx[start:start + d**2] = wt * result
            start += d**2

    # -- prep terms
    # -------------------------
    for lbl, rho in mdl_post.preps.items():
        # d(rho_term) = -(S_inv * dS * S_inv) * rho
        #   Note: (S_inv * rho) is transformed rho
        wt = item_weights.get(lbl, spamWeight)
        Sinv_dS = _np.dot(S_inv, dS)  # shape (d1,n,d2)
        result = -1 * _np.dot(Sinv_dS, rho.to_dense(on_space='minimal'))  # shape (d,n)
        my_jacMx[start:start + d] = wt * result
        start += d

    # -- effect terms
    # -------------------------
    for povmlbl, povm in mdl_pre.povms.items():
        for lbl, E in povm.items():
            # d(ET_term) = E.T * dS
            wt = item_weights.get(povmlbl + "_" + lbl, spamWeight)
            result = _np.dot(E.to_dense(on_space='minimal')[None, :], dS).T  # shape (1,n,d2).T => (d2,n,1)
            my_jacMx[start:start + d] = wt * result.squeeze(2)  # (d2,n)
            start += d

    # -- penalty terms  -- Note: still use original gauge transform applied to `model`
    # -------------------------
    if cptp_penalty_factor > 0 or spam_penalty_factor > 0:
        if frobenius_transform_target:  # reset back to non-target-transform "mode"
            gauge_group_el = original_gauge_group_el
            mdl_pre = model.copy()
            mdl_post = mdl_pre.copy()
            mdl_post.transform_inplace(gauge_group_el)

        # Each fill routine returns the number of rows it wrote, advancing `start`.
        if cptp_penalty_factor > 0:
            start += _cptp_penalty_jac_fill(my_jacMx[start:], mdl_pre, mdl_post,
                                            gauge_group_el, cptp_penalty_factor, mdl_pre.basis, wrtIndices)

        if spam_penalty_factor > 0:
            start += _spam_penalty_jac_fill(my_jacMx[start:], mdl_pre, mdl_post,
                                            gauge_group_el, spam_penalty_factor, mdl_pre.basis, wrtIndices)

    #At this point, each proc has filled the portions (columns) of jacMx that
    # it's responsible for, and so now we gather them together.
    _mpit.gather_slices(derivSlices, derivOwners, jacMx, [], 1, comm)
    #Note jacMx is completely filled (on all procs)

    if check_jac and (comm is None or comm.Get_rank() == 0):
        # NOTE(review): the mock objective ignores its argument `v` and always
        # evaluates at `gauge_group_el` - confirm this is what check_jac expects.
        def _mock_objective_fn(v):
            return _objective_fn(gauge_group_el, False)
        vec = gauge_group_el.to_vector()
        _opt.check_jac(_mock_objective_fn, vec, jacMx, tol=1e-5, eps=1e-9, err_type='abs',
                       verbosity=1)

    return jacMx
def mpidot(a, b, loc_row_slice, loc_col_slice, slice_tuples_by_rank, comm, out=None, out_shm=None):
    """
    Performs a distributed dot product, dot(a,b).

    Parameters
    ----------
    a : numpy.ndarray
        First array to dot together.

    b : numpy.ndarray
        Second array to dot together.

    loc_row_slice, loc_col_slice : slice
        Specify the row or column indices, respectively, of the
        resulting dot product that are computed by this processor (the
        rows of `a` and columns of `b` that are used). Obtained from
        :func:`distribute_for_dot`.

    slice_tuples_by_rank : list
        A list of (row_slice, col_slice) tuples, one per processor within this
        processor's broadcast group, ordered by rank.  Provided by :func:`distribute_for_dot`.

    comm : mpi4py.MPI.Comm or ResourceAllocation or None
        The communicator used to parallelize the dot product.  If a
        :class:`ResourceAllocation` object is given, then a shared
        memory result will be returned when appropriate.

    out : numpy.ndarray, optional
        If not None, the array to use for the result.  This should be the
        same type of array (size, and whether it's shared or not) as this
        function would have created if `out` were `None`.  It is used for the
        result in both the serial and the parallel case.

    out_shm : multiprocessing.shared_memory.SharedMemory, optional
        The shared memory object corresponding to `out` when it uses shared memory.

    Returns
    -------
    result : numpy.ndarray
        The resulting array (this is `out` whenever `out` is given).
    shm : multiprocessing.shared_memory.SharedMemory
        A shared memory object needed to cleanup the shared memory.  If a normal array
        is created, this is `None`.  Provide this to :func:`cleanup_shared_ndarray` to
        ensure the result is deallocated properly.
    """
    # R_ij = sum_k A_ik * B_kj
    ralloc = None
    if comm is not None:
        # Function-scope import; not needed at all in the serial (comm is None) case.
        from ..baseobjs.resourceallocation import ResourceAllocation as _ResourceAllocation
        if isinstance(comm, _ResourceAllocation):
            ralloc = comm
            comm = ralloc.comm

    if comm is None or comm.Get_size() == 1:
        # Serial case: honor a caller-supplied output buffer, as documented.
        product = _np.dot(a, b)
        if out is None:
            return product, None
        out[...] = product
        return out, out_shm

    if out is None:
        if ralloc is None:
            result, result_shm = _np.zeros((a.shape[0], b.shape[1]), a.dtype), None
        else:
            result, result_shm = _smt.create_shared_ndarray(ralloc, (a.shape[0], b.shape[1]), a.dtype,
                                                            zero_out=True)
    else:
        result = out
        result_shm = out_shm

    # This processor's piece of the product, stored in a flat buffer (used for
    # broadcasting) that `loc_result` views with its 2D shape.
    rshape = (_slct.length(loc_row_slice), _slct.length(loc_col_slice))
    loc_result_flat = _np.empty(rshape[0] * rshape[1], a.dtype)
    loc_result = loc_result_flat.reshape(rshape)  # a view: the flat buffer is contiguous
    loc_result[:, :] = _np.dot(a[loc_row_slice, :], b[:, loc_col_slice])

    # broadcast_comm defines the group of processors this processor communicates with.
    # Without shared memory, this is *all* the other processors.  With shared memory, this
    # is one processor on each host.  This code is identical to that in distribute_for_dot.
    if ralloc is None:
        broadcast_comm = comm
    else:
        broadcast_comm = comm if (ralloc.interhost_comm is None) else ralloc.interhost_comm

    comm.barrier()  # wait for all ranks to do their work (get their loc_result)
    for r, (cur_row_slice, cur_col_slice) in enumerate(slice_tuples_by_rank):
        # for each member of the group that will communicate results
        cur_shape = (_slct.length(cur_row_slice), _slct.length(cur_col_slice))
        is_root = (broadcast_comm.rank == r)
        buf = loc_result_flat if is_root else _np.empty(cur_shape[0] * cur_shape[1], a.dtype)
        broadcast_comm.Bcast(buf, root=r)
        if is_root:
            buf = loc_result  # already of correct shape
        else:
            buf = buf.reshape(cur_shape)
        result[cur_row_slice, cur_col_slice] = buf
    comm.barrier()  # wait for all ranks to finish writing to result

    return result, result_shm
def gather_indices(indices, index_owners, ar_to_fill, ar_to_fill_inds,
                   axes, comm, max_buffer_size=None):
    """
    Gathers data within a numpy array, `ar_to_fill`, according to given indices.

    Upon entry it is assumed that the different processors within `comm` have
    computed different parts of `ar_to_fill`, namely different slices or
    index-arrays along the axes given by `axes`. At exit, data has been gathered
    such that all processors have the results for the entire `ar_to_fill` (or at
    least for all the indices given).

    Parameters
    ----------
    indices : list
        A list of all the integer-arrays or slices (computed by *any* of
        the processors, not just the current one).  Each element of `indices`
        may be either a single slice/index-array or a tuple of such
        elements (when gathering across multiple dimensions).

    index_owners : dict
        A dictionary mapping the index of an element within `indices` to an
        integer rank of the processor responsible for communicating that
        slice/index-array's data to the rest of the processors.

    ar_to_fill : numpy.ndarray
        The array which contains partial data upon entry and the gathered
        data upon exit.

    ar_to_fill_inds : list
        A list of slice or index-arrays specifying the (fixed) sub-array of
        `ar_to_fill` that should be gathered into.  The elements of
        `ar_to_fill_inds` are taken to be indices for the leading dimension
        first, and any unspecified dimensions or `None` elements are
        assumed to be unrestricted (as if `slice(None,None)`).  Note that
        the combination of `ar_to_fill` and `ar_to_fill_inds` is essentially like
        passing `ar_to_fill[ar_to_fill_inds]` to this function, except it will
        work with index arrays as well as slices.

    axes : int or tuple of ints
        The axis or axes of `ar_to_fill` on which the slices apply (which axis
        do the elements of `indices` refer to?).  Note that `len(axes)` must
        be equal to the number of sub-indices (i.e. the tuple length) of each
        element of `indices`.

    comm : mpi4py.MPI.Comm or None
        The communicator specifying the processors involved and used
        to perform the gather operation.

    max_buffer_size : int or None
        The maximum buffer size in bytes that is allowed to be used
        for gathering data.  If None, there is no limit.

    Returns
    -------
    None
    """
    if comm is None:
        return  # no gathering needed!

    def _as_slice_list(index_array_or_slice):
        """Breaks a slice or index array into a list of slices"""
        # NOTE(review): runs are closed with `last + 1` as the stop value, which looks
        # like it assumes ascending index arrays - confirm callers never pass descending ones.
        if isinstance(index_array_or_slice, slice):
            return [index_array_or_slice]  # easy!

        idx = index_array_or_slice
        if len(idx) == 0:
            return [slice(0, 0)]

        count = len(idx)
        pieces = []
        pos = 0
        while pos < count:
            # Start a run at `pos` and extend it while the stride stays constant.
            first = idx[pos]
            stride = idx[pos + 1] - idx[pos] if pos + 1 < count else None
            while pos + 1 < count and idx[pos + 1] - idx[pos] == stride:
                pos += 1
            pieces.append(slice(first, idx[pos] + 1, None if stride == 1 else stride))
            pos += 1
        return pieces

    #Perform broadcasts for each slice in order
    my_rank = comm.Get_rank()
    full_index = [slice(None, None)] * ar_to_fill.ndim
    full_index[0:len(ar_to_fill_inds)] = ar_to_fill_inds

    if _compat.isint(axes):
        axes = (axes,)

    max_indices = [None] * len(axes)
    if max_buffer_size is not None:  # a maximum buffer size was requested
        chunk_bytes = ar_to_fill.nbytes  # start with the entire array as the "chunk"
        limit_met = False
        for iaxis, axis in enumerate(axes):
            # Consider restricting the chunk size along the iaxis-th axis.
            #  If we can achieve the desired max_buffer_size by restricting
            #  just along this axis, great.  Otherwise, restrict to at most
            #  1 index along this axis and keep going.
            axis_len = ar_to_fill.shape[axis]
            bytes_per_index = chunk_bytes / axis_len
            allowed = int(max_buffer_size / bytes_per_index)
            if allowed != 0:
                max_indices[iaxis] = allowed
                limit_met = True
                break
            max_indices[iaxis] = 1
            chunk_bytes /= axis_len
        if not limit_met:
            _warnings.warn("gather_indices: Could not achieve max_buffer_size")

    for position, entry in enumerate(indices):
        owner = index_owners[position]  # owner's rank
        per_axis = entry if isinstance(entry, tuple) else (entry,)
        assert(len(per_axis) == len(axes))

        #Get a list of the (sub-)indices along each axis, whose product
        # (along the specified axes) gives the entire block given by `per_axis`
        axis_pieces = []
        for iaxis, _axis in enumerate(axes):
            limit = max_indices[iaxis]
            pieces = []
            # Break this axis' index, which may be either a single slice or an index
            # array, into a list of slices that are broadcast one at a time (sometimes
            # these slices themselves need to be broken up further to obey max_buffer_size).
            for piece in _as_slice_list(per_axis[iaxis]):
                if limit is None or limit >= _slct.length(piece):
                    pieces.append(piece)
                else:
                    pieces.extend(_slct.divide(piece, limit))
            axis_pieces.append(pieces)

        for combo in _itertools.product(*axis_pieces):
            #create the full index from per-axis (sub-)slices and broadcast
            for iaxis, axis in enumerate(axes):
                full_index[axis] = combo[iaxis]

            if my_rank == owner:
                buf = _findx(ar_to_fill, full_index, True)
            else:
                buf = _np.empty(_findx_shape(ar_to_fill, full_index), ar_to_fill.dtype)
            comm.Bcast(buf, root=owner)
            if my_rank != owner:
                _fas(ar_to_fill, full_index, buf)
            buf = None  # free buffer mem asap
def gather_slices_by_owner(current_slices, ar_to_fill, ar_to_fill_inds,
                           axes, comm, max_buffer_size=None):
    """
    Gathers data within a numpy array, `ar_to_fill`, according to given slices.

    Upon entry it is assumed that the different processors within `comm` have
    computed different parts of `ar_to_fill`, namely different slices of the
    axes indexed by `axes`. At exit, data has been gathered such that all processors
    have the results for the entire `ar_to_fill` (or at least for all the slices
    given).

    Parameters
    ----------
    current_slices : list
        A list of all the slices computed by the *current* processor.
        Each element of `current_slices` may be either a single slice or a
        tuple of slices (when gathering across multiple dimensions).

    ar_to_fill : numpy.ndarray
        The array which contains partial data upon entry and the gathered
        data upon exit.

    ar_to_fill_inds : list
        A list of slice or index-arrays specifying the (fixed) sub-array of
        `ar_to_fill` that should be gathered into.  The elements of
        `ar_to_fill_inds` are taken to be indices for the leading dimension
        first, and any unspecified dimensions or `None` elements are
        assumed to be unrestricted (as if `slice(None,None)`).  Note that
        the combination of `ar_to_fill` and `ar_to_fill_inds` is essentially like
        passing `ar_to_fill[ar_to_fill_inds]` to this function, except it will
        work with index arrays as well as slices.

    axes : int or tuple of ints
        The axis or axes of `ar_to_fill` on which the slices apply (which axis
        do the slices in `current_slices` refer to?).  Note that `len(axes)`
        must be equal to the number of slices (i.e. the tuple length) of each
        element of `current_slices`.

    comm : mpi4py.MPI.Comm or None
        The communicator specifying the processors involved and used
        to perform the gather operation.

    max_buffer_size : int or None
        The maximum buffer size in bytes that is allowed to be used
        for gathering data.  If None, there is no limit.

    Returns
    -------
    None
    """
    #Note: same beginning as gather_slices (TODO: consolidate?)
    if comm is None:
        return  # no gathering needed!

    #Perform broadcasts for each slice in order
    my_rank = comm.Get_rank()
    full_index = [slice(None, None)] * ar_to_fill.ndim
    full_index[0:len(ar_to_fill_inds)] = ar_to_fill_inds

    if _compat.isint(axes):
        axes = (axes,)

    max_indices = [None] * len(axes)
    if max_buffer_size is not None:  # a maximum buffer size was requested
        chunk_bytes = ar_to_fill.nbytes  # start with the entire array as the "chunk"
        limit_met = False
        for iaxis, axis in enumerate(axes):
            # Consider restricting the chunk size along the iaxis-th axis.
            #  If we can achieve the desired max_buffer_size by restricting
            #  just along this axis, great.  Otherwise, restrict to at most
            #  1 index along this axis and keep going.
            axis_len = ar_to_fill.shape[axis]
            bytes_per_index = chunk_bytes / axis_len
            allowed = int(max_buffer_size / bytes_per_index)
            if allowed != 0:
                max_indices[iaxis] = allowed
                limit_met = True
                break
            max_indices[iaxis] = 1
            chunk_bytes /= axis_len
        if not limit_met:
            _warnings.warn("gather_slices_by_owner: Could not achieve max_buffer_size")
    # -- end part that is the same as gather_slices

    #Get a list of the slices to broadcast, indexed by the rank of the owner proc
    slices_by_owner = comm.allgather(current_slices)
    for owner, owned_slices in enumerate(slices_by_owner):
        for entry in owned_slices:
            per_axis = (entry,) if isinstance(entry, slice) else entry
            assert(len(per_axis) == len(axes))

            #Get a list of the (sub-)slices along each axis, whose product
            # (along the specified axes) gives the entire block given by `per_axis`
            axis_pieces = []
            for iaxis, _axis in enumerate(axes):
                slc = per_axis[iaxis]
                limit = max_indices[iaxis]
                if limit is None or limit >= _slct.length(slc):
                    axis_pieces.append([slc])
                else:
                    axis_pieces.append(_slct.divide(slc, limit))

            for combo in _itertools.product(*axis_pieces):
                #create the full index from per-axis (sub-)slices and broadcast
                for iaxis, axis in enumerate(axes):
                    full_index[axis] = combo[iaxis]

                if my_rank == owner:
                    buf = _findx(ar_to_fill, full_index, True)
                else:
                    buf = _np.empty(_findx_shape(ar_to_fill, full_index), ar_to_fill.dtype)
                comm.Bcast(buf, root=owner)
                if my_rank != owner:
                    _fas(ar_to_fill, full_index, buf)
                buf = None  # free buffer mem asap
def gather_slices(slices, slice_owners, ar_to_fill, ar_to_fill_inds, axes, comm, max_buffer_size=None):
    """
    Broadcast owner-computed slices of `ar_to_fill` to the other processors.

    On entry, different processors of `comm` are assumed to hold different
    parts of `ar_to_fill` (different slices along `axes`).  Every entry of
    `slices` is sent, using one or more `Bcast` calls, from the rank listed
    for it in `slice_owners`, and each receiving rank writes the received
    data into its own `ar_to_fill`.

    Parameters
    ----------
    slices : list
        All the slices computed by *any* processor.  Each element is either
        a single `slice` or a tuple of slices (one per entry of `axes`).

    slice_owners : dict
        Maps the index of an element of `slices` to the integer rank (within
        `comm`) that holds, and therefore broadcasts, that element's data.

    ar_to_fill : numpy.ndarray
        The array holding partial data on entry and gathered data on exit.

    ar_to_fill_inds : list
        Indices (slices or index arrays) for the leading dimensions of
        `ar_to_fill`, fixing the sub-array that is gathered into.  Dimensions
        beyond `len(ar_to_fill_inds)` are left unrestricted
        (`slice(None, None)`).

    axes : int or tuple of ints
        The axis or axes of `ar_to_fill` that the elements of `slices` index.
        `len(axes)` must equal the tuple length of each element of `slices`.

    comm : mpi4py.MPI.Comm or ResourceAllocation or None
        The communicator used for the gather.  When a `ResourceAllocation`
        with non-None `interhost_ranks` is given, the broadcasts are done on
        its `interhost_comm` and slices owned by ranks outside
        `interhost_ranks` are skipped by this processor.  When the resulting
        communicator is None the function returns immediately.

    max_buffer_size : int or None
        Upper bound, in bytes, on the size of a single broadcast buffer.
        Slices are subdivided along `axes` to try to respect it; a warning is
        issued when restricting every axis to one index is still not enough.
        If None, slices are never subdivided.

    Returns
    -------
    None
    """
    from ..baseobjs.resourceallocation import ResourceAllocation as _ResourceAllocation

    # Unwrap a ResourceAllocation (if that is what we were given).
    ralloc = None
    if isinstance(comm, _ResourceAllocation):
        ralloc, comm = comm, comm.comm

    # For use with shared intra-host (intra-node) memory:
    #  my_interhost_ranks = ranks of comm, 1 per host, that this processor uses
    #                       to send/receive data between hosts
    #  bcast_comm         = the comm of those ranks, used to send/receive data
    #  bcast_root_of      = maps a rank of comm to its rank within bcast_comm
    if ralloc is not None and ralloc.interhost_ranks is not None:
        my_interhost_ranks = set(ralloc.interhost_ranks)
        bcast_root_of = {rank_in_comm: rank_in_bcast
                         for rank_in_bcast, rank_in_comm in enumerate(ralloc.interhost_ranks)}
        bcast_comm = ralloc.interhost_comm
    else:
        my_interhost_ranks = None
        bcast_root_of = {0: 0} if (comm is None) else {r: r for r in range(comm.Get_size())}  # trivial map
        bcast_comm = comm

    if comm is None:
        return  # no gathering needed!

    # To be safe: because the broadcasts below may run on bcast_comm rather than
    # comm, they don't force every processor to have finished its previous work,
    # which could involve updating a shared ar_to_fill that front-running
    # processors are about to read.
    comm.barrier()

    my_rank = comm.Get_rank()
    if _compat.isint(axes):
        axes = (axes,)

    # Generic index into ar_to_fill; the entries for `axes` are overwritten below.
    full_index = [slice(None, None)] * ar_to_fill.ndim
    full_index[0:len(ar_to_fill_inds)] = ar_to_fill_inds

    # max_indices[k] = most indices along axes[k] allowed in a single buffer
    # (None means "no restriction").
    max_indices = [None] * len(axes)
    if max_buffer_size is not None:  # a buffer-size limit was requested
        bytes_per_chunk = ar_to_fill.nbytes  # start with the entire array as the "chunk"
        for k, axis in enumerate(axes):
            # Consider restricting the chunk along this axis.  If that alone
            # achieves max_buffer_size, great; otherwise restrict to a single
            # index along this axis and move on to the next one.
            bytes_per_chunk = bytes_per_chunk / ar_to_fill.shape[axis]  # bytes per index of this axis
            n_fit = int(max_buffer_size / bytes_per_chunk)
            if n_fit != 0:
                max_indices[k] = n_fit
                break
            max_indices[k] = 1
        else:
            _warnings.warn("gather_slices: Could not achieve max_buffer_size")

    # NOTE: an Allgatherv-based fast path was tried for simple (non-tuple)
    # slices; repeated Bcast calls were no slower, and Allgatherv complicates
    # the code and ignores the memory limit, so only Bcast is used.

    # With a single inter-host rank there is nobody to broadcast to.
    use_bcast = (my_interhost_ranks is None) or (len(my_interhost_ranks) > 1)

    # Broadcast slices one-by-one (slower, but more general):
    for slice_id, slc_entry in enumerate(slices):
        owner = slice_owners[slice_id]  # owner's rank
        if not (my_interhost_ranks is None or owner in my_interhost_ranks):
            # The source (owner) of this data isn't part of my "circle" of
            # ranks, so other ranks on the same host send/receive it.
            continue

        slc_tuple = slc_entry if not isinstance(slc_entry, slice) else (slc_entry,)
        assert(len(slc_tuple) == len(axes))

        # For each axis, the list of (sub-)slices whose product (over the
        # specified axes) covers the entire block given by slc_tuple.
        pieces_per_axis = []
        for k, _axis in enumerate(axes):
            piece = slc_tuple[k]
            limit = max_indices[k]
            if limit is not None and limit < _slct.length(piece):
                pieces_per_axis.append(_slct.divide(piece, limit))
            else:
                pieces_per_axis.append([piece])

        is_owner = (my_rank == owner)
        for sub_slices in _itertools.product(*pieces_per_axis):
            # Point full_index at this sub-block ...
            for axis, sub_slc in zip(axes, sub_slices):
                full_index[axis] = sub_slc

            # ... and broadcast it from the owner.
            if is_owner:
                buf = _findx(ar_to_fill, full_index, True)
            else:
                buf = _np.empty(_findx_shape(ar_to_fill, full_index), ar_to_fill.dtype)

            if use_bcast:
                bcast_comm.Bcast(buf, root=bcast_root_of[owner])
                if not is_owner:
                    _fas(ar_to_fill, full_index, buf)
            buf = None  # free buffer mem asap

    # Important: wait for everything to finish before proceeding
    #  (when bcast_comm != comm some procs may run ahead - see comment above)
    comm.barrier()
def distribute_slice(s, comm, allow_split_comm=True):
    """
    Partition a contiguous slice evenly among the processors of `comm`.

    This is the analogue of :func:`distribute_indices` for the case where the
    indices being distributed are a consecutive run of integers given by a
    slice.

    Parameters
    ----------
    s : slice
        The slice to be partitioned.

    comm : mpi4py.MPI.Comm or ResourceAllocation or None
        Supplies the number of processors and this processor's rank, and may
        be split into sub-communicators.  None is treated as a single
        processor of rank 0.  When a :class:`ResourceAllocation` is given its
        `.comm` is used, and a :class:`ResourceAllocation` is returned in
        place of a bare communicator.

    allow_split_comm : bool
        Passed on to `distribute_indices_base`.  When True and there are more
        processors than indices in `s`, `comm` is split into sub-communicators,
        one per group of processors given the same local slice.

    Returns
    -------
    slices : list of slices
        The list of *unique* slices assigned to processors.  A single element
        may be assigned to several processors (when there are more processors
        than indices in `s`).
    loc_slice : slice
        The slice belonging to the current processor; `slice(0, 0)` when this
        processor was given nothing to do.
    owners : dict
        The owner information returned by `distribute_indices_base` for the
        indices of `slices`.
    loc_comm : mpi4py.MPI.Comm or ResourceAllocation or None
        The sub-communicator obtained by splitting `comm`, or None when no
        split was performed.  If `comm` was a :class:`ResourceAllocation`,
        this is always a new :class:`ResourceAllocation` wrapping that
        sub-communicator (or None).
    """
    from ..baseobjs.resourceallocation import ResourceAllocation as _ResourceAllocation

    # Unwrap a ResourceAllocation, remembering it so we can re-wrap the result.
    ralloc = None
    if isinstance(comm, _ResourceAllocation):
        ralloc, comm = comm, comm.comm

    nprocs, rank = (1, 0) if (comm is None) else (comm.Get_size(), comm.Get_rank())

    n_indices = _slct.length(s)
    slices = slice_up_slice(s, min(nprocs, n_indices))
    assert(len(slices) <= nprocs)

    my_slice_ids, slcOwners, _ = distribute_indices_base(
        list(range(len(slices))), nprocs, rank, allow_split_comm)
    # By design no processor gets more than one slice (there are at most
    # nprocs slices).
    assert(len(my_slice_ids) <= 1)

    # Defaults cover the "nothing for this proc to do" case.
    loc_slice = slice(0, 0)
    loc_comm = None
    if len(my_slice_ids) == 1:
        my_id = my_slice_ids[0]
        loc_slice = slices[my_id]

        # With more procs than indices every proc gets a single index and
        # several procs get the *same* one; group those procs in a sub-comm.
        if allow_split_comm and (comm is not None) and nprocs > n_indices:
            loc_comm = comm.Split(color=my_id, key=rank)

    if ralloc is not None:  # then return a ResourceAllocation instead of a comm
        loc_comm = _ResourceAllocation(loc_comm, ralloc.mem_limit, ralloc.profiler,
                                       ralloc.distribute_method, ralloc.allocated_memory)
        if ralloc.host_comm is not None:
            loc_comm.build_hostcomms()  # signals that we want to use shared intra-host memory

    return slices, loc_slice, slcOwners, loc_comm
def _bulk_fill_timedep_deriv(self, layout, dataset, ds_circuits, num_total_outcomes,
                             deriv_array_to_fill, deriv_fill_fn, array_to_fill=None, fill_fn=None):
    """
    Scaffolding for filling the derivative of a time-dependent quantity.

    Loops over the atoms of `layout` and, for each one, gathers the dataset
    rows and outcome counts the atom needs, optionally calls `fill_fn` to fill
    the quantity itself, and then calls `deriv_fill_fn` to fill its derivative,
    either for the whole parameter slice at once or block by block when
    `layout.param_dimension_blk_sizes[0]` is not None.

    Parameters
    ----------
    layout : CircuitOutcomeProbabilityArrayLayout
        The layout specifying the quantities to be computed.  Its atoms must
        provide `element_slice` and `orig_indices_by_expcircuit`.

    dataset : DataSet
        The data set; indexed by the elements of `ds_circuits` and passed on
        (row by row) to the fill functions.

    ds_circuits : list of Circuits
        The circuits as they should be queried from `dataset`, indexed by the
        values of each atom's `orig_indices_by_expcircuit`.

    num_total_outcomes : list or array
        The total number of *possible* outcomes for each circuit (so
        `len(num_total_outcomes) == len(ds_circuits)`).  Needed for sparse
        data, where `dataset` may lack counts for some outcomes.

    deriv_array_to_fill : numpy ndarray
        An already-allocated array handed to `deriv_fill_fn` for filling.

    deriv_fill_fn : function
        Called as `deriv_fill_fn(deriv_array_to_fill, element_slice,
        host_param_slice, num_outcomes, atom, dataset_rows,
        global_param_slice, param_resource_alloc)`.

    array_to_fill : numpy array, optional
        When not None, an already-allocated array handed to `fill_fn`.

    fill_fn : function, optional
        Called (only when `array_to_fill` is not None) as
        `fill_fn(array_to_fill, element_slice, num_outcomes, atom,
        dataset_rows, atom_resource_alloc)`.

    Returns
    -------
    None
    """
    #Note: this function is similar to _bulk_fill_dprobs, and may be able to consolidated in the FUTURE.
    block_size = layout.param_dimension_blk_sizes[0]
    atom_ralloc = layout.resource_alloc('atom-processing')
    param_ralloc = layout.resource_alloc('param-processing')

    assert(atom_ralloc.host_comm is None), \
        "Shared memory is not supported in time-dependent calculations (yet)"

    host_slice = layout.host_param_slice
    global_slice = layout.global_param_slice

    for atom in layout.atoms:
        element_slice = atom.element_slice

        #NOTE: this block uses atom.orig_indices_by_expcircuit, which is specific to _MapCOPALayoutAtom - TODO
        dataset_rows = {}
        num_outcomes = {}
        for i_expanded, i_orig in atom.orig_indices_by_expcircuit.items():
            dataset_rows[i_expanded] = dataset[ds_circuits[i_orig]]
            num_outcomes[i_expanded] = num_total_outcomes[i_orig]

        if array_to_fill is not None:
            fill_fn(array_to_fill, element_slice, num_outcomes, atom, dataset_rows, atom_ralloc)

        # Work out the (host-memory slice, actual-parameter slice) pairs to fill.
        if block_size is None:
            # No blocking: fill the entire computed parameter block in one go.
            slice_pairs = [(host_slice, global_slice)]
        else:
            # Divide the columns into blocks of at most block_size.
            n_params = _slct.length(host_slice)  # total number of parameters we're computing
            n_blocks = int(_np.ceil(n_params / block_size))  # so the average block size is ~block_size
            slice_pairs = [(_slct.shift(blk, host_slice.start),     # into host's memory
                            _slct.shift(blk, global_slice.start))   # actual parameter indices
                           for blk in _mpit.slice_up_range(n_params, n_blocks)]

        for host_part, global_part in slice_pairs:
            deriv_fill_fn(deriv_array_to_fill, element_slice, host_part, num_outcomes, atom,
                          dataset_rows, global_part, param_ralloc)
def _bulk_fill_hprobs(self, array_to_fill, layout,
                      pr_array_to_fill, deriv1_array_to_fill, deriv2_array_to_fill):
    """
    Fill `array_to_fill` with probability Hessians, one layout atom at a time.

    Note: we expect that array_to_fill points to the memory specifically for
    this processor (a subset of the memory for the host when memory is shared).

    For every atom of `layout` this optionally fills the probabilities and the
    first derivatives with respect to the first and second parameter slices,
    and then fills the Hessian.  When both entries of
    `layout.param_dimension_blk_sizes` are None everything is computed in one
    call per atom; otherwise the two parameter dimensions are processed in
    blocks.

    Parameters
    ----------
    array_to_fill : numpy.ndarray
        Destination for the Hessian; indexed here as
        `[atom.element_slice, :, :]`.

    layout : CircuitOutcomeProbabilityArrayLayout
        A distributed layout providing `atoms`, `param_dimension_blk_sizes`,
        `global_param_slice`, `global_param2_slice` and the
        'atom-processing', 'param-processing' and 'param2-processing'
        resource allocations.

    pr_array_to_fill : numpy.ndarray or None
        When not None, filled (per atom) via `_bulk_fill_probs_atom`.

    deriv1_array_to_fill : numpy.ndarray or None
        When not None, filled with derivatives over `global_param_slice`.

    deriv2_array_to_fill : numpy.ndarray or None
        When not None, filled with derivatives over `global_param2_slice`.
        If `deriv1_array_to_fill` was filled and both global slices are
        equal, the values are simply copied from it.

    Returns
    -------
    None

    Raises
    ------
    AssertionError
        If exactly one of the two Hessian block sizes is None.
    """
    blkSize1 = layout.param_dimension_blk_sizes[0]
    blkSize2 = layout.param_dimension_blk_sizes[1]

    #Assume we're being called with a resource_alloc that's been setup by a distributed layout:
    atom_resource_alloc = layout.resource_alloc('atom-processing')
    param_resource_alloc = layout.resource_alloc('param-processing')
    param2_resource_alloc = layout.resource_alloc('param2-processing')

    atom_resource_alloc.host_comm_barrier()  # ensure all procs have finished w/shared memory before we reinit
    # Note: use *largest* host comm that we fill - so 'atom' comm, not 'param' comm

    host_param_slice = None  # layout.host_param_slice  # array_to_fill is already just this slice of the host mem
    host_param2_slice = None  # layout.host_param2_slice  # array_to_fill is already just this slice of the host mem
    global_param_slice = layout.global_param_slice
    global_param2_slice = layout.global_param2_slice

    for atom in layout.atoms:

        if pr_array_to_fill is not None:
            self._bulk_fill_probs_atom(pr_array_to_fill[atom.element_slice], atom, atom_resource_alloc)

        if blkSize1 is None and blkSize2 is None:  # run 'else' block without unnecessary logic
            #Compute all our derivative columns at once
            if deriv1_array_to_fill is not None:
                self._bulk_fill_dprobs_atom(deriv1_array_to_fill[atom.element_slice, :], host_param_slice, atom,
                                            global_param_slice, param_resource_alloc)
            if deriv2_array_to_fill is not None:
                if deriv1_array_to_fill is not None and global_param_slice == global_param2_slice:
                    # Same parameters as deriv1: reuse what was just computed.
                    deriv2_array_to_fill[atom.element_slice, :] = deriv1_array_to_fill[atom.element_slice, :]
                else:
                    self._bulk_fill_dprobs_atom(deriv2_array_to_fill[atom.element_slice, :], host_param2_slice,
                                                atom, global_param2_slice, param2_resource_alloc)

            self._bulk_fill_hprobs_atom(array_to_fill[atom.element_slice, :, :], host_param_slice,
                                        host_param2_slice, atom, global_param_slice, global_param2_slice,
                                        param2_resource_alloc)

        else:  # Divide columns into blocks of at most shape (blkSize1, blkSize2)
            assert(blkSize1 is not None and blkSize2 is not None), \
                "Both (or neither) of the Hessian block sizes must be specified!"
            Np1 = _slct.length(global_param_slice)
            Np2 = _slct.length(global_param2_slice)
            nBlks1 = int(_np.ceil(Np1 / blkSize1))
            nBlks2 = int(_np.ceil(Np2 / blkSize2))
            # num blocks required to achieve desired average size == blkSize1 or blkSize2
            blocks1 = _mpit.slice_up_range(Np1, nBlks1)
            blocks2 = _mpit.slice_up_range(Np2, nBlks2)

            for block1 in blocks1:
                host_param_slice_part = block1  # _slct.shift(block1, host_param_slice.start)  # into host's memory
                global_param_slice_part = _slct.shift(block1, global_param_slice.start)  # actual parameter indices

                if deriv1_array_to_fill is not None:
                    self._bulk_fill_dprobs_atom(deriv1_array_to_fill[atom.element_slice, :],
                                                host_param_slice_part, atom,
                                                global_param_slice_part, param_resource_alloc)

                for block2 in blocks2:
                    host_param2_slice_part = block2  # into host's memory
                    global_param2_slice_part = _slct.shift(block2, global_param2_slice.start)  # parameter indices
                    self._bulk_fill_hprobs_atom(array_to_fill[atom.element_slice, :], host_param_slice_part,
                                                host_param2_slice_part, atom,
                                                global_param_slice_part, global_param2_slice_part,
                                                param2_resource_alloc)

            #Fill deriv2_array_to_fill if we need to.
            if deriv2_array_to_fill is not None:
                if deriv1_array_to_fill is not None and global_param_slice == global_param2_slice:
                    # Same parameters as deriv1: reuse what was just computed.
                    deriv2_array_to_fill[atom.element_slice, :] = deriv1_array_to_fill[atom.element_slice, :]
                else:
                    for block2 in blocks2:
                        host_param2_slice_part = block2  # into host's memory
                        global_param2_slice_part = _slct.shift(block2, global_param2_slice.start)  # param indices
                        # Use the 'param2-processing' allocation, matching the
                        # un-blocked branch above: these columns are indexed by
                        # the *second* parameter slice, which the members of the
                        # 'param-processing' group do not necessarily share.
                        self._bulk_fill_dprobs_atom(deriv2_array_to_fill[atom.element_slice, :],
                                                    host_param2_slice_part, atom,
                                                    global_param2_slice_part, param2_resource_alloc)

    atom_resource_alloc.host_comm_barrier()  # don't exit until all procs' array_to_fill is ready