def __init__(self, exps=None, cs=None, nomials=None, simplify=True):
    """Build the nomial data either from raw exps/cs or from other nomials.

    Arguments
    ---------
    exps : sequence (optional)
        Exponents of each monomial term; required together with `cs`
        when `nomials` is not given.
    cs : sequence (optional)
        Coefficients of each monomial term.
    nomials : sequence (optional)
        Objects exposing `.exps`, `.cs` and `.units`; their exps are
        concatenated and their coefficient magnitudes are stacked.
    simplify : bool
        If True, exps and cs are passed through `simplify_exps_and_cs`.
        Forced to False when building from `nomials`.

    Raises
    ------
    ValueError
        If both `nomials` and exps/cs are given, or if neither a
        complete exps/cs pair nor `nomials` is given.
    """
    if nomials and (exps or cs):
        raise ValueError("The NomialData initializor accepts either"
                         " exps and cs, or nomials, but not both.")
    elif nomials:
        self.nomials = nomials
        exps = functools_reduce(add, (tuple(s.exps) for s in nomials))
        # np.hstack requires a real sequence of arrays: generator input
        # was deprecated in NumPy 1.16 and is rejected by current releases.
        cs = np.hstack([mag(s.cs) for s in nomials])
        simplify = False  # nomials have already been simplified
    elif exps is None or cs is None:
        raise ValueError("creation of a NomialData requires exps and cs.")

    if simplify:
        exps, cs = simplify_exps_and_cs(exps, cs)
    self.exps, self.cs = exps, cs
    self.any_nonpositive_cs = any(mag(c) <= 0 for c in self.cs)
    self.varlocs, self.varstrs = locate_vars(self.exps)
    # Only variables whose description carries a "value" entry are kept.
    self.values = {vk: vk.descr["value"] for vk in self.varlocs
                   if "value" in vk.descr}
    if nomials:
        self.units = tuple(s.units for s in nomials)
    elif isinstance(self.cs, Quantity):
        self.units = Quantity(1, self.cs.units)
    else:
        self.units = None
    # Initialised empty here; nothing in this method fills it in.
    self._hashvalue = None
def init_from_nomials(self, nomials):
    """Way to initialize from nomials. Calls __init__.

    Used by subclass __init__ methods.

    Arguments
    ---------
    nomials : sequence
        Objects exposing `.exps`, `.cs` and `.units`. Their exps are
        concatenated, their coefficient magnitudes are stacked, and
        `self.units` becomes the tuple of their units.
    """
    exps = functools_reduce(add, (tuple(s.exps) for s in nomials))
    # np.hstack requires a real sequence of arrays: generator input
    # was deprecated in NumPy 1.16 and is rejected by current releases.
    cs = np.hstack([mag(s.cs) for s in nomials])
    # nomials are already simplified, so simplify=False
    NomialData.__init__(self, exps, cs, simplify=False)
    # Overrides whatever units __init__ derived from cs.
    self.units = tuple(s.units for s in nomials)
def _collect_explicit_graded(cls, block_structure):
    """
    Collect the 'explicit_graded' field for every block.

    Blocks are visited in topological order, so every parent has been
    processed before its children. For each block:

    * block types "course", "chapter" and "sequential" get None;
    * a block whose own "graded" field is set (not None) gets that value;
    * otherwise the block inherits from its non-ignored parents: the
      non-None parent values are OR-ed together, and the result is None
      when no parent carries a value.
    """
    def _set_field(block_key, field_value):
        """
        Sets the explicit graded field to the given value for the given block.
        """
        block_structure.set_transformer_block_field(
            block_key, cls, cls.EXPLICIT_GRADED_FIELD_NAME, field_value
        )

    def _get_field(block_key):
        """
        Gets the explicit graded field stored for the given block.
        """
        return block_structure.get_transformer_block_field(
            block_key, cls, cls.EXPLICIT_GRADED_FIELD_NAME
        )

    block_types_to_ignore = {"course", "chapter", "sequential"}

    for block_key in block_structure.topological_traversal():
        if block_key.block_type in block_types_to_ignore:
            _set_field(block_key, None)
            continue

        explicit_field_on_block = get_field_on_block(
            block_structure.get_xblock(block_key), "graded"
        )
        if explicit_field_on_block is not None:
            _set_field(block_key, explicit_field_on_block)
            continue

        values_from_parents = [
            _get_field(parent)
            for parent in block_structure.get_parents(block_key)
            if parent.block_type not in block_types_to_ignore
        ]
        # Drop parents with no explicit value. The previous filter
        # (`if not None`) was always true, so a trailing None could
        # overwrite an explicit False in the `x or y` fold below.
        non_null_values_from_parents = [
            value for value in values_from_parents if value is not None
        ]
        explicit_from_parents = functools_reduce(
            lambda x, y: x or y, non_null_values_from_parents, None
        )
        _set_field(block_key, explicit_from_parents)
def parse_result(result, constants, beforesubs, sweep=None, linkedsweep=None,
                 freevar_sensitivity_tolerance=1e-4,
                 localmodel_sensitivity_requirement=0.1):
    """Parses a GP-like result dict into a SolutionArray-like dict.

    Arguments
    ---------
    result : dict
        Must provide "cost", "variables" and "sensitivities" (the latter
        with a "monomials" entry).
    constants : dict
        Substituted variables and their values; entries whose key is in
        `sweep` or `linkedsweep` are reported as sweep variables instead.
    beforesubs : object
        Pre-substitution data; `varlocs` is always read, `smaps`, `cs`,
        `exps`, `signomials` and `varkeysubs` are read when present.
    sweep, linkedsweep : container (optional)
        Only used for membership tests; default to empty.
    freevar_sensitivity_tolerance : float
        Largest absolute sensitivity allowed for a free variable.
    localmodel_sensitivity_requirement : float
        Smallest absolute sensitivity for a variable to appear in the
        local monomial model.

    Returns
    -------
    dict with keys cost, constants, sweepvariables, freevariables,
    variables, sensitivities and localmodel.

    Raises
    ------
    ValueError
        If a free variable's sensitivity exceeds the tolerance.
    """
    # None instead of mutable `{}` defaults; the containers are only read.
    sweep = {} if sweep is None else sweep
    linkedsweep = {} if linkedsweep is None else linkedsweep

    cost = result["cost"]
    freevariables = dict(result["variables"])
    sweepvariables = {var: val for var, val in constants.items()
                      if var in sweep or var in linkedsweep}
    constants = {var: val for var, val in constants.items()
                 if var not in sweepvariables}
    variables = dict(freevariables)
    variables.update(constants)
    variables.update(sweepvariables)
    sensitivities = dict(result["sensitivities"])

    # Remap monomials after substitution and simplification.
    #  The monomial sensitivities from the GP/SP are in terms of this
    #  smaller post-substitution list of monomials, so we need to map that
    #  back to the pre-substitution list.
    #
    # Each "smap" is a list of HashVectors (mmaps),
    #  whose keys are monomial indexes pre-substitution,
    #  and whose values are the percentage of the simplified monomial's
    #  coefficient that came from that particular parent
    nu = result["sensitivities"]["monomials"]
    # HACK: simplified solves need a mutated beforesubs, as created in Model
    if hasattr(beforesubs, "smaps"):
        nu_ = np.zeros(len(beforesubs.cs))
        little_counter, big_counter = 0, 0
        for j, smap in enumerate(beforesubs.smaps):
            for i, mmap in enumerate(smap):
                for idx, percentage in mmap.items():
                    nu_[idx + big_counter] += percentage*nu[i + little_counter]
            little_counter += len(smap)
            big_counter += len(beforesubs.signomials[j].cs)
        sensitivities["monomials"] = nu_
        sensitivities["variables"] = {
            var: sum([beforesubs.exps[i][var]*nu_[i] for i in locs])
            for (var, locs) in beforesubs.varlocs.items()
        }
    # NOTE(review): without `smaps` the original hit a NameError on the
    # remapped sensitivities. Reading them back from `sensitivities` keeps
    # the smaps path unchanged and lets the other path use whatever
    # variable sensitivities the solver result already carries -- confirm
    # that this is the intended fallback.
    sens_vars = sensitivities["variables"]

    # free-variable sensitivities must be <= some epsilon
    for var, S in sens_vars.items():
        if var in freevariables and abs(S) > freevar_sensitivity_tolerance:
            raise ValueError("free variable too sensitive: S_{%s} = "
                             "%0.2e" % (var, S))

    # Local monomial model: cost times each sufficiently sensitive
    # variable's value raised to minus its sensitivity.
    localexp = {var: S for (var, S) in sens_vars.items()
                if abs(S) >= localmodel_sensitivity_requirement}
    localcs = (variables[var]**-S for (var, S) in localexp.items())
    localc = functools_reduce(mul, localcs, cost)
    localmodel = Monomial(localexp, localc)

    # vectorvar substitution: fold indexed elements back into one array per
    # vector key, with NaN marking entries that were never seen.
    for var in beforesubs.varlocs:
        if "idx" in var.descr and "shape" in var.descr:
            descr = dict(var.descr)
            idx = descr.pop("idx")
            if "value" in descr:
                descr.pop("value")
            if "units" in descr:
                units = descr.pop("units")
                veckey = VarKey(**descr)
                veckey.descr["units"] = units
            else:
                veckey = VarKey(**descr)

            for vardict in [variables, sensitivities["variables"],
                            constants, sweepvariables, freevariables]:
                if var in vardict:
                    if veckey not in vardict:
                        vardict[veckey] = np.full(var.descr["shape"], np.nan)
                    vardict[veckey][idx] = vardict[var]
                    del vardict[var]

    if hasattr(beforesubs, "varkeysubs"):
        for origvk, subvk in beforesubs.varkeysubs.items():
            for data in [constants, sweepvariables, freevariables,
                         variables, sensitivities["variables"]]:
                if subvk in data:
                    qty = isinstance(origvk.units, Quantity)
                    # Sensitivities are copied as-is; values are rescaled
                    # only when the original key carries Quantity units.
                    if data is sensitivities["variables"] or not qty:
                        data[origvk] = data[subvk]
                    else:
                        scale = (subvk.units/origvk.units).to("dimensionless")
                        data[origvk] = data[subvk] * scale

    return dict(cost=cost,
                constants=constants,
                sweepvariables=sweepvariables,
                freevariables=freevariables,
                variables=variables,
                sensitivities=sensitivities,
                localmodel=localmodel)
def get_query(self):
    """Return one Q(id=...) per entry of self.ids, OR-ed onto an empty Q()."""
    query = Q()
    for item_id in self.ids:
        query = query | Q(id=item_id)
    return query
def quantized_inner_product_check_rule(x, w, b, scale_q, offset_q,
                                       scale_deq_req, offset_req, y,
                                       quant_algo, scale_sqrt, num_output,
                                       transpose, bias_term, axis,
                                       kernel_name="quantized_inner_product"):
    """
    Check the legality of each entry

    Parameters
    ----------
    x, w, y: dict
        tensor descriptions; 'shape', 'dtype' and 'format' are read
    b: dict or None
        bias description; read only when bias_term is true and required
        to be None otherwise
    transpose: bool
        must be false
    bias_term: bool
        whether a bias tensor is expected
    scale_q, offset_q, scale_deq_req, offset_req, quant_algo, scale_sqrt,
    num_output, axis, kernel_name:
        accepted for interface compatibility; not inspected here

    Raises
    ------
    RuntimeError
        when a shape, format or flag violates one of the rules below
        (dtype checks are delegated to util.check_dtype_rule)
    """
    # x info
    shape_x = x.get('shape')
    dtype_x = x.get('dtype')
    format_x = x.get('format')
    m_shape = shape_x[0]
    km_shape = shape_x[1] * shape_x[2] * shape_x[3] * shape_x[4]

    if functools_reduce(lambda lhs, rhs: lhs * rhs,
                        shape_x) >= SHAPE_SIZE_LIMIT:
        raise RuntimeError("The shape_x exceed 32 bit limitations! ")
    if shape_x[-1] != 32:
        raise RuntimeError("For non_quant 'NC1HWC0' x, the C0 must be 32!")
    util.check_dtype_rule(dtype_x, ['uint8'])
    if format_x != 'NC1HWC0':
        raise RuntimeError("For IP situation, x format must be NC1HWC0!")

    # gevm: a single-row x puts an extra alignment rule on KM
    is_gevm = m_shape == 1
    if is_gevm:
        if km_shape % 512 != 0:
            raise RuntimeError("for quant_gevm, KM/KN must be multi of 512!")

    # w info
    shape_w = w.get('shape')
    dtype_w = w.get('dtype')
    format_w = w.get('format')
    if functools_reduce(lambda lhs, rhs: lhs * rhs,
                        shape_w) >= SHAPE_SIZE_LIMIT:
        raise RuntimeError("The shape_w exceed 32 bit limitations! ")
    util.check_dtype_rule(dtype_w, ['int8'])
    if format_w != 'FRACTAL_Z':
        raise RuntimeError(
            "For quant IP situation, w format must be FRACTAL_Z!")
    if shape_w[2] != 16 or shape_w[3] != 32:
        raise RuntimeError(
            "For quant IP situation, last two dim must be 16 and 32!")
    kn_shape = shape_w[0] * shape_w[3]
    n_shape = shape_w[1] * shape_w[2]

    # Check shape
    if km_shape != kn_shape:
        raise RuntimeError("KM of input_x must be equal to KN of input_w!")

    # y info
    shape_y = y.get('shape')
    dtype_y = y.get('dtype')
    format_y = y.get('format')
    if shape_y[-1] != 16:
        # The message used to say 32 although the enforced value is 16.
        raise RuntimeError("For Quant 'NC1HWC0' y, the C0 must be 16!")
    util.check_dtype_rule(dtype_y, ['float16'])
    if format_y != 'NC1HWC0':
        raise RuntimeError("For IP situation, y format must be NC1HWC0!")

    # b info
    if bias_term:
        shape_b = b.get('shape')
        dtype_b = b.get('dtype')
        format_b = b.get('format')
        b_size = shape_b[1] * shape_b[4]
        # Check info
        util.check_dtype_rule(dtype_b, ['int32'])
        if format_b != 'NC1HWC0':
            raise RuntimeError("For IP situation, b format must be NC1HWC0!")
        if b_size != n_shape:
            raise RuntimeError(
                "For bias, the C1*C0 must equal to aligned_Cout!")
    else:
        if b is not None:
            raise RuntimeError("for bias_term false, the b must be an None!")

    if transpose:
        raise RuntimeError("for quantized IP, only support transpose false")
def __init__(self, var, indices, updates, var_out, nd_flag, kernel_name,
             compute_type):
    """
    Set up the shared state of a scatter operator

    Parameters
    ----------
    var: dict
        data of input; "shape" and "dtype" are read
        datatype supports float32,float16,int32,int8,uint8
    indices: dict
        data of indices; "shape" and "dtype" are read
        datatype supports int32
    updates: dict
        data of updates; "shape" and "dtype" are read
        datatype supports float32,float16,int32,int8,uint8
    var_out: dict
        data of output, handed to check_param
    nd_flag: bool
        if this op is nd operator
    kernel_name: str
        the name of the operator
    compute_type: str
        the compute type of scatter

    Returns
    -------
    None
    """
    def _count(dims):
        # Product of every entry of a shape.
        return functools_reduce(lambda lhs, rhs: lhs * rhs, dims)

    self.tik_instance = tik.Tik(tik.Dprofile())
    self.nd_flag = nd_flag

    self.var_shape = var.get("shape")
    self.var_dtype = var.get("dtype").lower()
    self.indices_shape = indices.get("shape")
    self.indices_dtype = indices.get("dtype").lower()
    self.updates_shape = updates.get("shape")
    self.updates_dtype = updates.get("dtype").lower()

    self.var_ele_num = _count(self.var_shape)
    self.indices_num = _count(self.indices_shape)
    self.updates_num = _count(self.updates_shape)
    self.kernel_name = kernel_name

    # Non-nd case with a single index and updates one rank short of var:
    # give updates a leading axis of length 1.
    if (self.indices_shape == (1,)
            and len(self.var_shape) - len(self.updates_shape) == 1
            and not nd_flag):
        self.updates_shape = (1,) + self.updates_shape

    self.check_param(var_out)

    if nd_flag:
        index_depth = self.indices_shape[-1]
        if index_depth == len(self.var_shape):
            self.update_data_num = 1
        else:
            self.update_data_num = _count(self.var_shape[index_depth:])
        self.max_indice = _count(self.var_shape[0:index_depth])
        self.index_dims = index_depth
    else:
        self.update_data_num = (_count(self.var_shape[1:])
                                if len(self.var_shape) > 1 else 1)
        self.max_indice = self.var_shape[0]
        self.index_dims = 1

    self.compute_type = compute_type

    total_ub = tbe_platform.cce_conf.get_soc_spec(
        tbe_platform.cce_conf.UB_SIZE)
    self.ub_size_bytes = total_ub - 8192

    self.var_dtype_bytes_size = (
        tbe_platform.cce_intrin.get_bit_len(self.var_dtype) // 8)
    self.indices_dtype_bytes_size = (
        tbe_platform.cce_intrin.get_bit_len(self.indices_dtype) // 8)
    self.var_data_each_block = 32 // self.var_dtype_bytes_size
    self.indices_data_each_block = 32 // self.indices_dtype_bytes_size

    self.indices_ub_number = 0
    self.updates_ub_number = 0
    self.index_loop_num = 0

    self.max_num_one_repeat = (
        64 if self.var_dtype in ("float32", "int32") else 128)

    if self.update_data_num < self.var_data_each_block:
        self.block_num = 1
    else:
        ai_core_num = tbe_platform.cce_conf.get_soc_spec(
            tbe_platform.cce_conf.CORE_NUM)
        self.indice_step = math.ceil(self.max_indice / ai_core_num)
        self.block_num = math.ceil(self.max_indice / self.indice_step)

    make_tensor = self.tik_instance.Tensor
    self.var_gm = make_tensor(self.var_dtype, self.var_shape,
                              name="var_gm", scope=tik.scope_gm)
    self.indices_gm = make_tensor(self.indices_dtype, self.indices_shape,
                                  name="indices_gm", scope=tik.scope_gm)
    self.updates_gm = make_tensor(self.updates_dtype, self.updates_shape,
                                  name="updates_gm", scope=tik.scope_gm)
    self.out_gm = make_tensor(self.var_dtype, self.var_shape,
                              name="out_gm", scope=tik.scope_gm)

    self.vconv_dst_dtype = "float16"

    self.init_ub_tensor_para()

    # Placeholders that start out as None; nothing here fills them in.
    for attr_name in ("var_vconv_ub", "updates_vconv_ub",
                      "var_tile_vconv_ub", "updates_tile_vconv_ub",
                      "var_ub", "updates_ub", "indices_ub",
                      "var_tile_ub", "updates_tile_ub",
                      "var_read_index", "updates_read_index",
                      "indices_loop_index", "indices_tmp"):
        setattr(self, attr_name, None)
def localsolve(self, solver=None, verbosity=1, x0=None, rel_tol=1e-4,
               iteration_limit=50, *args, **kwargs):
    """Locally solves a SignomialProgram and returns the solution.

    Arguments
    ---------
    solver : str or function (optional)
        Passed through to each GP's solve method.
    verbosity : int (optional)
        If greater than 0, prints a start message plus the number of
        GP solves and the elapsed time. Each GP is created and solved
        with verbosity one less than this.
    x0 : dict (optional)
        Initial location to approximate signomials about; replaced by
        each solve's "variables" for the next step.
    rel_tol : float
        Iteration ends once the relative change between two consecutive
        objective values is no longer greater than this.
    iteration_limit : int
        RuntimeWarning is raised once more than this many GPs have been
        collected without converging.
    *args, **kwargs :
        Passed to the GP solve call.

    Returns
    -------
    result : dict
        The last GP result, with its monomial sensitivities replaced by
        values computed for the original signomials' monomials.

    Side effects: sets self.starttime, self.gps and self.result.
    """
    if verbosity > 0:
        print("Beginning signomial solve.")
    self.starttime = time()
    self.gps = []  # NOTE: SIDE EFFECTS

    prevcost, cost, rel_improvement = None, None, None
    while rel_improvement is None or rel_improvement > rel_tol:
        if len(self.gps) > iteration_limit:
            raise RuntimeWarning(
                ("problem unsolved after %s iterations. The last result is"
                 " available in Model.program.gps[-1].result. \n"
                 "If the gps appear to be converging, you may wish to"
                 " increase the iteration limit by calling"
                 " .localsolve(..., iteration_limit=NEWLIMIT).")
                % len(self.gps))

        gp = self.step(x0, verbosity=verbosity-1)
        self.gps.append(gp)  # NOTE: SIDE EFFECTS
        try:
            result = gp.solve(solver, verbosity-1, *args, **kwargs)
        except (RuntimeWarning, ValueError):
            # Fall back to a feasibility model; its cost is discarded so
            # this round can never count towards convergence.
            fallback_gp = feasibility_model(gp, "max")
            self.gps.append(fallback_gp)
            result = fallback_gp.solve(verbosity=verbosity-1)
            result["cost"] = None

        x0 = result["variables"]
        prevcost, cost = cost, result["cost"]
        rel_improvement = (abs(prevcost-cost)/(prevcost + cost)
                           if prevcost and cost else None)

    # solved successfully!
    if verbosity > 0:
        print("Solving took %i GP solves and %.3g seconds."
              % (len(self.gps), time() - self.starttime))

    # parse the result and return nu's of original monomials from
    #  variable sensitivities
    nu = result["sensitivities"]["monomials"]
    var_sens = {var: sum([gp.exps[i][var]*nu[i] for i in locs])
                for (var, locs) in gp.varlocs.items()}

    # One entry per monomial of every signomial: sign(c) times the
    # product of (variable sensitivity * exponent) over its variables.
    monomial_sens = [
        functools_reduce(
            mul,
            [var_sens[var]*power for var, power in exp.items()],
            np.sign(coeff))
        for signomial in self.signomials
        for coeff, exp in zip(signomial.cs, signomial.exps)
    ]
    result["sensitivities"]["monomials"] = np.array(monomial_sens)
    # TODO: SP sensitivities are weird, and potentially incorrect

    self.result = result  # NOTE: SIDE EFFECTS
    return result
def __init__(self, src, dst, src_format, dst_format, kernel_name):
    """
    Set up the zn_2_hwcn_lstm parameters

    Parameters
    ----------
    src : dict, shape and dtype of input.
    dst: dict, shape and dtype of output.
    src_format: str, source data format, can be fractal_zn.
    dst_format: str, target data format, can be hwcn.
    kernel_name: str, kernel name, default value is "zn_2_hwcn_lstm".

    Returns
    -------
    None
    """
    def _split_across_cores():
        # Per-core and last-core element counts derived from the current
        # c_each_core / core_num.
        self.each_core_data = self.c_each_core * self.n0_ni_c0
        self.c_last_core = (
            self.src_shape[0] - self.c_each_core * (self.core_num - 1))
        self.last_core_data = self.c_last_core * self.n0_ni_c0

    self.tik_instance = tik.Tik(tik.Dprofile())
    self.src_format = src_format
    self.dst_format = dst_format
    self.kernel_name = kernel_name

    self.src_shape = src.get("shape")
    self.src_dtype = src.get("dtype").lower()
    self.dst_shape = dst.get("shape")
    self.dst_dtype = dst.get("dtype").lower()

    # h is a quarter of dst axis 3; i is what remains of dst axis 2.
    self.h = self.dst_shape[3] // 4
    self.i = self.dst_shape[2] - self.h
    self.h_align = math.ceil(self.h / 16) * 16
    self.i_align = math.ceil(self.i / 16) * 16

    self.src_data_num = functools_reduce(lambda lhs, rhs: lhs * rhs,
                                         self.src_shape)
    self.ub_size_bytes = UB_SIZE - 9216
    self.core_num = MAX_CORE_NUM

    bit_len = tbe_platform.cce_intrin.get_bit_len(self.src_dtype)
    self.src_dtype_bytes_size = bit_len // 8
    self.src_data_each_block = 32 // self.src_dtype_bytes_size

    self.src_gm = self.tik_instance.Tensor(
        self.src_dtype, self.src_shape, name="src_gm", scope=tik.scope_gm)
    self.dst_gm = self.tik_instance.Tensor(
        self.dst_dtype, self.dst_shape, name="dst_gm", scope=tik.scope_gm)

    # Counters and flags that start at zero.
    for attr_name in ("src_burst_len", "burst_len", "dst_burst_len",
                      "src_ub_number", "temp_ub_number", "half_ub_number",
                      "each_data_num", "each_core_data", "last_core_data",
                      "c_num", "i_flag"):
        setattr(self, attr_name, 0)

    self.n0_ni_c0 = (
        self.src_shape[1] * self.src_shape[2] * self.src_shape[3])
    self.ni_c0 = self.src_shape[2] * self.src_shape[3]

    # Placeholders that start out as None.
    for attr_name in ("temp_c", "before_c", "core_loop_index", "src_ub",
                      "temp_ub", "tile_ub", "temp_burst_len",
                      "remain_data", "temp_data"):
        setattr(self, attr_name, None)

    ai_core_num = tbe_platform.cce_conf.get_soc_spec(
        tbe_platform.cce_conf.CORE_NUM)
    self.c_each_core = math.ceil(self.src_shape[0] / ai_core_num)
    self.core_num = math.ceil(self.src_shape[0] / self.c_each_core)
    _split_across_cores()

    data_num = (self.i % (self.c_each_core * 16)) * self.dst_shape[3]
    if 0 < data_num < self.src_data_each_block:
        # A non-empty remainder smaller than one block: pick a new core
        # count, then redo the split.
        self.core_num = self.change_core_num()
        _split_across_cores()

    self.check_param()
def fake_quant_per_layer(x, min_val, max_val, y, symmetric, narrow_range,
                         num_bits, kernel_name="fake_quant_per_layer"):
    """FakeQuantPerLayer

    Validates the inputs, flattens x to one dimension, derives the integer
    quantization range from symmetric / narrow_range / num_bits, and then
    builds and compiles the operator under kernel_name.
    """
    input_shape, input_dtype = x.get("shape"), x.get("dtype")
    min_shape, min_dtype = min_val.get("ori_shape"), min_val.get("dtype")
    max_shape, max_dtype = max_val.get("ori_shape"), max_val.get("dtype")

    min_shape = util.scalar2tensor_one(min_shape)
    max_shape = util.scalar2tensor_one(max_shape)

    util.check_kernel_name(kernel_name)
    util.check_shape_rule(input_shape)
    util.check_shape_rule(min_shape, 1, 1, 1)
    util.check_shape_rule(max_shape, 1, 1, 1)
    for shape in (input_shape, min_shape, max_shape):
        util.check_tensor_shape_size(shape)

    check_list = ["float32", "float16"]
    x_dtype = input_dtype.lower()
    min_dtype = min_dtype.lower()
    max_dtype = max_dtype.lower()
    for dtype in (x_dtype, min_dtype, max_dtype):
        util.check_dtype_rule(dtype, check_list)

    # Collapse x to a single axis holding every element.
    flat_size = functools_reduce(lambda lhs, rhs: lhs * rhs, input_shape)
    input_shape = (flat_size, )
    shape_min, _, _ = util.produce_shapes(min_shape, input_shape)

    if symmetric:
        half_range = 2**(num_bits - 1)
        quant_min, quant_max = 0 - half_range, half_range - 1
    else:
        quant_min, quant_max = 0, 2**num_bits - 1
    if narrow_range:
        quant_min += 1

    input_data = tvm.placeholder(input_shape, name="x", dtype=x_dtype)
    min_data = tvm.placeholder(shape_min, name="min_data", dtype=min_dtype)
    # max_data is given the same shape as min_data.
    max_data = tvm.placeholder(shape_min, name="max_data", dtype=max_dtype)
    res = fake_quant_per_layer_compute(input_data, min_data, max_data, y,
                                       quant_min, quant_max, kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    config = {
        "print_ir": False,
        "name": kernel_name,
        "tensor_list": [input_data, min_data, max_data, res],
    }
    te.lang.cce.cce_build_code(sch, config)