def laplacian_of_gaussian_ad(head, x):
    """2nd derivative of a Gaussian, which should be the same as the Laplacian-of-Gaussian filter."""
    y = gaussian(x)
    # 1st derivative
    dx = list(akg.differentiate(y, [x], head))
    head_fake = akg.tvm.compute(x.shape, lambda *ind: akg.tvm.const(1.0, dtype=y.dtype))
    # 2nd derivative
    dx2 = list(akg.differentiate(dx[0], [x], head_fake))
    return dx2[0]

def bias_add_ad_v2(head, input_shape, data_format, target=utils.CCE):
    """Compute gradient for bias_add operator using automatic differentiate."""
    check_list = ["NHWC", "NC1HWC0", "DefaultFormat"]
    if data_format not in check_list:
        raise RuntimeError("bias_add_grad only support %s while dataformat is %s"
                           % (",".join(check_list), data_format))
    head_plh = akg.tvm.placeholder(head.shape, head.dtype, "head_plh")
    if data_format == "NC1HWC0":
        bias_shape = (1, head.shape[1], 1, 1, head.shape[4])
        bias_plh = akg.tvm.placeholder(bias_shape, head.dtype, "bias_plh")
    elif data_format == "NHWC":
        bias_shape = (input_shape[-1],)
        bias_plh = akg.tvm.placeholder(bias_shape, head.dtype, "bias_plh")
    else:
        bias_shape = (input_shape[1],)
        bias_plh = akg.tvm.placeholder(bias_shape, head.dtype, "bias_plh")
    bias_add_res = bias_add(head_plh, bias_plh, data_format)

    shape1 = [x.value for x in head_plh.shape]
    shape2 = [x.value for x in bias_plh.shape]

    def custom_bias_add_diff(out, input_data, head, ad_attrs, new_pld_array):
        if len(shape2) != 1:
            raise RuntimeError("Default Format needs Bias is a 1D Tensor!")
        if data_format == "NHWC":
            return [akg.tvm.compute(shape2, lambda l: head[0, 0, 0, l])]
        if data_format == "DefaultFormat":
            if len(shape1) == 2:
                return [akg.tvm.compute(shape2, lambda l: head[0, l])]
            if len(shape1) == 4:
                return [akg.tvm.compute(shape2, lambda l: head[0, l, 0, 0])]
            raise RuntimeError("bias_add only support 2D and 4D shape while dataformat is DefaultFormat")
        return None

    if data_format == "NC1HWC0":
        jacs = list(akg.differentiate(bias_add_res, [bias_plh], head))
    else:
        variables = akg.get_variables("reshape_diff")
        jacs = list(akg.differentiate(
            bias_add_res, [bias_plh], head, None, None,
            override={variables[0]: (variables[1], custom_bias_add_diff)}))
    return jacs[0]

def lstmcell_c_ad(_input, hx, cx, w_ih, w_hh, b_ih, b_hh, Head, input_id, target="cce"):
    _, forward_c_op = lstmcell(_input, hx, cx, w_ih, w_hh, b_ih, b_hh)
    tensor_list = [_input, hx, cx, w_ih, w_hh, b_ih, b_hh]
    _jacs = list(akg.differentiate(forward_c_op, [tensor_list[input_id]], Head))

    ###################################################
    # Need to disable CSE due to stmt dense() + dense()
    attrs = dict()
    attrs['disable_cse'] = True
    attrs['to_three_address_reuse'] = True
    attrs['to_three_address_min_split'] = 10
    ###################################################

    return _jacs[0], attrs

def tanh_ad(head, in_data):
    """
    Compute gradient of tanh operator using automatic differentiate.

    Args:
        head (tvm.tensor.Tensor): Tensor of type float16, float32.
        in_data (tvm.tensor.Tensor): Tensor of type float16, float32.

    Returns:
        tvm.tensor.Tensor has the same shape as input.
    """
    in_dtype = in_data.dtype

    # On cloud environment, cast data type from 'float16' to 'float32',
    # then cast result back to 'float16', could achieve higher precision.
    if in_dtype == 'float16' and not utils.product_is_mini():
        in_data = akg.topi.cast(in_data, "float32")
        head = akg.topi.cast(head, "float32")

    out_data = tanh.tanh(in_data)
    jacs = list(akg.differentiate(out_data, [in_data], head))
    jacs_res = jacs[0]
    if in_dtype == 'float16' and not utils.product_is_mini():
        jacs_res = akg.topi.cast(jacs_res, 'float16')
    return jacs_res

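# A minimal, hypothetical usage sketch for tanh_ad (not from the original sources): the
# placeholder shape, names and dtype below are illustrative assumptions. On non-mini
# targets a float16 input is internally cast to float32 and the gradient is cast back,
# so the returned tensor keeps the caller's dtype.
#
#   x = akg.tvm.placeholder((16, 16), dtype="float16", name="x")
#   dy = akg.tvm.placeholder((16, 16), dtype="float16", name="dy")
#   dx = tanh_ad(dy, x)   # dx has dtype "float16" and shape (16, 16)
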
def bias_add_ad(head, input_shape, data_format):
    """
    Compute gradient for bias_add operator using automatic differentiate.

    Args:
        head (tvm.tensor.Tensor): Input tensor.
        input_shape (Union[list, tuple]): Input shape of head.
        data_format (str): Data format of input tensors.

    Returns:
        tvm.tensor.Tensor of same shape and type as head.
    """
    check_list = ["NHWC", "NC1HWC0", "DefaultFormat"]
    if data_format not in check_list:
        raise RuntimeError("bias_add_grad only support %s while dataformat is %s"
                           % (",".join(check_list), data_format))
    vc_util.check_shape(head.shape)
    shape1 = [x.value for x in head.shape]
    vc_util.davinci_format_check(shape1, data_format)
    a = akg.tvm.placeholder(head.shape, head.dtype, "A")
    if data_format == "NC1HWC0":
        bias_shape = (1, head.shape[1], 1, 1, head.shape[4])
        b = akg.tvm.placeholder(bias_shape, head.dtype, "B")
    elif data_format == "NHWC":
        bias_shape = (input_shape[-1],)
        b = akg.tvm.placeholder(bias_shape, head.dtype, "B")
    else:
        bias_shape = (input_shape[1],)
        b = akg.tvm.placeholder(bias_shape, head.dtype, "B")
    c = bias_add.bias_add(a, b, data_format)
    jacs = list(akg.differentiate(c, [b], head))
    attrs = {}
    return jacs[0], attrs

def logsoftmax_ad(shape, dtype, axis, kernel_name, attrs):
    """Compute the gradient of logsoftmax by autodiff."""
    check_list = ["float16"]
    if not dtype.lower() in check_list:
        raise RuntimeError("logsoftmax test only support %s while dtype is %s"
                           % (",".join(check_list), dtype))
    # check_shape(shape)
    if axis < 0:
        axis = len(shape) + axis
    if axis >= len(shape):
        raise RuntimeError("axis should be less than dimension")
    if axis != len(shape) - 1:
        raise RuntimeError("Only support the last axis currently")

    shape_new = [shape[-2], shape[-1]]
    if len(shape) > 2:
        for i in range(len(shape) - 2):
            shape_new[0] = shape_new[0] * shape[i]
    shape = shape_new

    a_up = akg.tvm.placeholder(shape, dtype=dtype, name="input")
    b_up = logsoftmax.logsoftmax_op(a_up, shape, axis)

    head = akg.tvm.placeholder(b_up.shape, name="head", dtype=dtype)
    _jacs = list(akg.differentiate(b_up, [a_up], head))
    sjac = akg.tvm.create_schedule([_jacs[0].op])
    sjac[_jacs[0].op.input_tensors[1]].compute_inline()
    op_vars = [head, a_up, _jacs[0]]

    with akg.build_config(add_lower_pass=cce.debug_mode(0), dump_pass_ir=True):
        mod = akg.build(sjac, op_vars, "cce", name="test2", attrs=attrs, polyhedral=True)
        return mod

def rnncell_relu_ad(inputs, hidden, w_ih, w_hh, b_ih, b_hh, Head, input_id):
    forward_op = rnn_relu_cell(inputs, hidden, w_ih, w_hh, b_ih, b_hh)
    tensor_list = [inputs, hidden, w_ih, w_hh, b_ih, b_hh]
    _jacs = list(akg.differentiate(forward_op, [tensor_list[input_id]], Head))
    return _jacs[0]

def abs_ad(head, in_data):
    """
    Compute gradient of abs operator with automatic differentiate.

    Args:
        head (tvm.tensor.Tensor): Tensor of type float16, float32, int8, uint8, int32.
        in_data (tvm.tensor.Tensor): Tensor of type float16, float32, int8, uint8, int32.

    Returns:
        tvm.tensor.Tensor has the same shape as input.
    """
    dtype = in_data.dtype
    # check the validity of head.
    vc_util.check_shape(head.shape)
    vc_util.ops_dtype_check(head.dtype, vc_util.DtypeForDavinci.ALL_TYPES)

    need_cast_dtype = ["int8", "int32", "uint8"]
    abs_data = abs.abs_value(in_data)
    if head.dtype in need_cast_dtype:
        head = akg.tvm.compute(head.shape,
                               lambda *indice: head(*indice).astype("float16"),
                               name='head_cast')
    if dtype in need_cast_dtype:
        abs_data = akg.tvm.compute(abs_data.shape,
                                   lambda *indice: abs_data(*indice).astype("float16"),
                                   name='abs_cast')
    jacs = list(akg.differentiate(abs_data, [in_data], head))
    if dtype in need_cast_dtype:
        jacs[0] = akg.tvm.compute(jacs[0].shape,
                                  lambda *indice: jacs[0](*indice).astype(dtype),
                                  name='res')
    return jacs[0]

def matmul_ad(data_shape, weight_shape, dtype, attrs=None):
    check_list = ["float16"]
    if not (dtype.lower() in check_list):
        raise RuntimeError("matmul test only support %s while dtype is %s"
                           % (",".join(check_list), dtype))
    # check_shape(shape)
    assert (len(data_shape) == 2)
    assert (len(weight_shape) == 2)
    assert (data_shape[1] == weight_shape[0])

    m, k = data_shape
    _, n = weight_shape

    a = akg.tvm.placeholder((m, k), name='a', dtype=dtype)
    b = akg.tvm.placeholder((k, n), name='b', dtype=dtype)
    kk = akg.tvm.reduce_axis((0, k), name='kk')
    c = akg.tvm.compute((m, n),
                        lambda i, j: akg.lang.ascend.mmad(a[i, kk] * b[kk, j], axis=kk),
                        name="c")

    head = akg.tvm.placeholder(c.shape, name="Head", dtype='float16')
    _jacs = list(akg.differentiate(c, [a], head))
    sjac = akg.tvm.create_schedule([_jacs[0].op])
    op_vars = [head, b, _jacs[0]]

    with akg.build_config(add_lower_pass=debug_mode(0), dump_pass_ir=True):
        mod = akg.build(sjac, op_vars, "cce", name="test2", attrs=attrs, polyhedral=True)
        return mod

def minimum_ad(head, data_x, data_y, grad_x=True, grad_y=True):
    """
    Calculate the reversed outputs of the operator minimum by using automatic differentiate.

    Args:
        head (tvm.tensor.Tensor): Input tensor of float32, float16 and int32.
        data_x (tvm.tensor.Tensor): Input tensor of float32, float16 and int32.
        data_y (tvm.tensor.Tensor): Input tensor of float32, float16 and int32.
        grad_x (bool): Default is True, whether to differentiate x.
        grad_y (bool): Default is True, whether to differentiate y.

    Returns:
        tvm.tensor.Tensor, has the same type and shape as head. If both grad_x and grad_y
        are True, a pair (jacs[0], jacs[1]) is returned.
    """
    utils.elemwise_shape_check(data_x.shape, data_y.shape)
    utils.elemwise_shape_check(head.shape, data_x.shape)
    utils.elemwise_dtype_check(data_x.dtype, head.dtype,
                               [utils.DtypeForDavinci.ALL_FLOAT, utils.DtypeForDavinci.INT32])
    utils.elemwise_dtype_check(data_x.dtype, data_y.dtype,
                               [utils.DtypeForDavinci.ALL_FLOAT, utils.DtypeForDavinci.INT32])
    if not grad_x and not grad_y:
        raise ValueError("At least one of grad_x and grad_y must be True.")

    op = minimum(data_x, data_y)
    jacs = list(akg.differentiate(op, [data_x, data_y], head))
    if grad_x and grad_y:
        return jacs[0], jacs[1]
    if grad_x:
        return jacs[0]
    return jacs[1]

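# Hypothetical usage sketch for minimum_ad (illustrative only; the shapes, names and
# dtype below are assumptions, not taken from the original code). The return value
# depends on the grad_x / grad_y flags: both True gives a pair, otherwise one tensor.
#
#   x = akg.tvm.placeholder((8, 128), dtype="float32", name="x")
#   y = akg.tvm.placeholder((8, 128), dtype="float32", name="y")
#   dy = akg.tvm.placeholder((8, 128), dtype="float32", name="dy")
#   dx_grad, dy_grad = minimum_ad(dy, x, y)                     # both gradients
#   dx_only = minimum_ad(dy, x, y, grad_x=True, grad_y=False)   # single tensor
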
def softmax_ad_optimized(head, data, axis=-1):
    """
    Computes the autodiff of softmax.

    Args:
        head (tvm.tensor.Tensor): Original differentiation values.
        data (tvm.tensor.Tensor): Input of softmax.
        axis (int): Along which axis softmax is performed.

    Returns:
        tvm.tensor.Tensor, the overall differentiation values.
    """
    def get_shape(pld):
        return [d.value for d in pld.shape]

    def temp_compute(shape, grad, sftmx_fwd, *indices):
        shp_len = len(shape)
        grad_index = indices[:(shp_len - 2)] + indices[-1:]
        sftmx_fwd_index = indices[:-1]
        temp = grad(*grad_index) * akg.tvm.expr.Select(
            indices[-1] == indices[-2],
            sftmx_fwd(*sftmx_fwd_index) * (1 - sftmx_fwd(*sftmx_fwd_index)),
            -sftmx_fwd(*sftmx_fwd_index) * sftmx_fwd(*grad_index))
        return temp

    def temp_sum_compute(shape, temp, *indices):
        kk = akg.tvm.reduce_axis((0, shape[-1]), name='kk')
        index = indices[:] + (kk,)
        temp_sum = akg.tvm.sum(temp(*index), axis=kk)
        return temp_sum

    def custom_softmax_fdiff(out, inputs, grad, ad_attrs, new_pld_array):
        data = inputs[0]
        shape = get_shape(data)
        sftmx_fwd = Softmax(data, -1)[0]
        shape.append(shape[-1])

        temp = akg.tvm.compute(shape,
                               lambda *indices: temp_compute(shape, grad, sftmx_fwd, *indices),
                               name="softmax_select2")
        temp_sum = akg.tvm.compute(shape[:-1],
                                   lambda *indices: temp_sum_compute(shape, temp, *indices),
                                   name="softmax_ad2")
        return [temp_sum]

    l_up = Softmax(data, axis)[0]
    # For the large expression tree's dl w.r.t. data (where softmax is embedded inside), use the default fdiff.
    # For softmax's dl w.r.t. data, use custom_softmax_fdiff.
    # In this example the override key (l_up) and its listed input (data) happen to be the same tensors
    # that are being differentiated, but in general they need not be.
    [dl_ddata] = akg.differentiate(l_up, [data], head, None, None,
                                   override={l_up: ([data], custom_softmax_fdiff)})
    attrs = {}
    return dl_ddata, attrs

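# General shape of the override mechanism used above, as a hedged sketch (my reading of
# the calls in this file, not an authoritative API description): akg.differentiate takes
# an override dict that maps a tensor in the forward graph to a pair
# (inputs_to_differentiate_against, custom_fdiff). The custom function receives
# (out, inputs, grad, ad_attrs, new_pld_array) and returns one adjoint tensor per entry
# of `inputs`, which is used in place of the automatically derived gradient for that
# sub-graph.
#
#   def my_fdiff(out, inputs, grad, ad_attrs, new_pld_array):
#       # identity pass-through: one adjoint per listed input (illustrative only)
#       return [akg.tvm.compute(inputs[0].shape, lambda *i: grad(*i))]
#
#   [dl_dx] = akg.differentiate(y, [x], head, None, None, override={y: ([x], my_fdiff)})
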
def gelu_ad_custom(head, in_data, target="cce"):
    """
    Automatic differentiation of gelu with a customized derivative function.

    To achieve higher precision, the derivative of the tanh part is defined by hand
    with a simplified calculation.
    """
    dtype = in_data.dtype
    const1 = akg.tvm.const(0.044715, dtype)
    const2 = akg.tvm.const(0.7978845, dtype)
    const3 = akg.tvm.const(0.1070322, dtype)  # 3 * 0.044715 * 0.7978845, see gelu_diff below
    tmp0 = akg.topi.multiply(in_data, in_data)
    pow0 = akg.topi.multiply(tmp0, in_data)
    mul0 = pow0 * const1
    add0 = in_data + mul0
    mul1 = add0 * const2
    tanh_res = Tanh(mul1)
    add1 = tanh_res + akg.tvm.const(1, dtype)
    mul2 = add1 * akg.tvm.const(0.5, dtype)
    mul3 = in_data * mul2
    res = mul3

    def gelu_diff(out, inp, head, ad_attrs, new_array_pld):
        temp = tanh_fdiff(head, mul1)
        # d(mul1)/d(in_data) = const2 * (1 + 3 * const1 * x^2) = const2 + const3 * x^2
        return [temp * (akg.tvm.const(0.7978845, dtype) + const3 * inp[0] * inp[0])]

    jacs = list(akg.differentiate(res, [in_data], head, None, None,
                                  override={tanh_res: ([in_data], gelu_diff)}))
    return jacs[0]

def blas_axby_ad(head, alpha, beta):
    """Compute gradient of blas_axby operator using automatic differentiate."""
    x = akg.tvm.placeholder(head.shape, head.dtype, "inputx")
    y = akg.tvm.placeholder(head.shape, head.dtype, "inputy")
    op = blas_axby.blas_axby(x, y, alpha, beta)
    jacs = list(akg.differentiate(op, [x, y], head))
    return jacs[0], jacs[1]

def erf_ad(head, x):
    """Compute gradient of erf operator using automatic differentiate."""
    if utils.product_is_mini():
        raise RuntimeError("erf_ad is not supported on mini device.")
    output = erf.erf(x)
    jacs = list(akg.differentiate(output, [x], head))
    return jacs[0]

def sparse_softmax_cross_entropy_with_logits_ad(labels, logits, reduction='mean', grad_scale=1.0):
    """Compute gradient for sparse_softmax_cross_entropy_with_logits operator using automatic differentiate."""
    attr_map = {}

    def custom_softmax_cross_entropy_with_logits_fdiff(out, inputs, grad, attrs, new_pld_array):
        strategy, _, backprop = loss.sparse_softmax_cross_entropy_with_logits_impl(
            inputs[1], inputs[0], reduction=reduction, scale=grad_scale)
        if strategy:
            attr_map["custom_tiling"] = strategy
        return [backprop]

    l_value, _ = loss.sparse_softmax_cross_entropy_with_logits(labels, logits, reduction)
    head = akg.tvm.compute(l_value.shape, lambda *i: akg.tvm.const(1.0, l_value.dtype), name='head')
    [dl_dlogits] = akg.differentiate(
        l_value, [logits], head, None, None,
        override={l_value: ([logits, labels], custom_softmax_cross_entropy_with_logits_fdiff)})
    return dl_dlogits, attr_map

def reduce_max_ad_optimized(head, data, axis, keepdims, target="cce"):
    def get_shape(pld):
        return [d.value for d in pld.shape]

    def custom_reduce_max_fdiff(out, inputs, grad, ad_attrs, new_pld_array):
        data = inputs[0]
        shape = get_shape(data)
        max_ = akg.lang.ascend.reduce_max(data, axis=axis, keepdims=keepdims)
        max_broadcast = akg.lang.ascend.broadcast(max_, shape)
        return [akg.tvm.compute(shape,
                                lambda *indices: akg.tvm.expr.Select(
                                    data(*indices) == max_broadcast(*indices),
                                    grad(*get_reduced_indices(*indices, axis=axis, keepdims=keepdims)),
                                    akg.tvm.const(0, dtype=data.dtype)),
                                name="reduce_max_ad2")]

    l = reduce_max(data, axis, keepdims, target=target)
    [dl_ddata] = akg.differentiate(l, [data], head, None, None,
                                   override={l: ([data], custom_reduce_max_fdiff)})
    return dl_ddata

def avgpool_ad(head, data, kernel, stride, pad):
    """Compute gradient of avgpool operator using automatic differentiate."""
    attrs = {"enable_post_poly_loop_partition": False, "enable_pre_poly_loop_partition": False}
    avgpool_fwd, _ = avgpool(data, kernel, stride, pad)
    [dl_ddata] = akg.differentiate(avgpool_fwd, [data], head)
    return dl_ddata, attrs

def conv_input_ad(input_ad_inputs, fmap_shape, filter_shape, pad_, stride_, dilation_, attrs=None):
    """
    Compute dx according to "conv forward".

    Args:
        input_ad_inputs (list[tvm.tensor.Tensor]): a list with length 2.
            input_ad_inputs[0] (considered as dy): Tensor of type float16,
                5D shape (out_n, out_c//C0, out_h, out_w, C0).
            input_ad_inputs[1] (considered as w): Tensor of type float16,
                4D shape (wC//C0*wH*wW, wN//C0, C0, C0).
        fmap_shape (list): [fN, fC, fH, fW].
        filter_shape (list): [wN, wC, wH, wW].
        pad_ (list): [pad_left, pad_right, pad_top, pad_bottom].
        stride_ (list): [stride_h, stride_w].
        dilation_ (list): [dilation_h, dilation_w].
        attrs (dict): a dict with keys such as conv_tile, bypass, etc.

    Returns:
        tvm.tensor.Tensor, configs.
    """
    backward_dy, forward_w = input_ad_inputs
    in_n, in_c, in_h, in_w = fmap_shape
    block_size = 16
    in_c = (in_c + block_size - 1) // block_size * block_size
    x_5d_shape = (in_n, in_c // block_size, in_h, in_w, block_size)

    forward_x = akg.tvm.placeholder(x_5d_shape, forward_w.dtype, "input_X")
    original_filter_shape = akg.tvm.placeholder(filter_shape, forward_w.dtype, "input_filter")

    forward_output, _ = conv_forward.conv([forward_x, forward_w], fmap_shape, filter_shape,
                                          pad_, stride_, dilation_, use_bias=False, attrs=attrs)

    ad_attrs = {"ad_conv_enable": 1, "ad_conv_reuse_conv": 0}
    jacs = list(akg.differentiate(forward_output, [forward_x], backward_dy, ad_attrs,
                                  [backward_dy, forward_w, original_filter_shape]))

    configs = conv_input_ad_config([backward_dy, forward_w], fmap_shape, filter_shape,
                                   pad_, stride_, dilation_, attrs=attrs)

    return jacs[0], configs

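# A hypothetical shape walk-through for conv_input_ad (illustrative numbers only, not
# from the original code), following the layouts described in the docstring with C0 = 16.
# For fmap_shape = [1, 16, 8, 8], filter_shape = [16, 16, 3, 3], pad_ = [1, 1, 1, 1],
# stride_ = [1, 1], dilation_ = [1, 1], the output is 8x8, so:
#   dy: (out_n, out_c//C0, out_h, out_w, C0) = (1, 1, 8, 8, 16)
#   w:  (wC//C0*wH*wW, wN//C0, C0, C0)       = (9, 1, 16, 16)
#
#   dy = akg.tvm.placeholder((1, 1, 8, 8, 16), dtype="float16", name="dy")
#   w = akg.tvm.placeholder((9, 1, 16, 16), dtype="float16", name="w")
#   dx, configs = conv_input_ad([dy, w], [1, 16, 8, 8], [16, 16, 3, 3],
#                               [1, 1, 1, 1], [1, 1], [1, 1])
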
def maxpool_ad_no_custom_diff_poly_all_max(head, data, kernel, stride, pad):
    """Automatic differentiation of maxpool with polyhedral scheduling."""
    attrs = {"enable_post_poly_loop_partition": False, "enable_pre_poly_loop_partition": False}
    maxpool_fwd = maxpool.old_maxpool(data, kernel, stride, pad)
    [dl_ddata] = akg.differentiate(maxpool_fwd, [data], head, None, None)
    return dl_ddata, attrs

def smooth_l1_loss_ad(head, prediction, target, anchor_samples, anchor_sample_correct=0, delta=1.0):
    b = smooth_l1_loss.smooth_l1_loss(prediction, target, anchor_samples, anchor_sample_correct, delta)
    _jacs = list(akg.differentiate(b[0], [prediction], head))
    return _jacs[0]

def roi_align_ad(head, data, rois, pooled_size, spatial_scale, sample_ratio, target="cce"):
    output = akg.topi.vision.rcnn.roi_align.roi_align_nchw(data, rois, pooled_size, spatial_scale, sample_ratio)
    _jacs = list(akg.differentiate(output, [data], head))
    return _jacs[0]

def triplet_loss_ad(head, anchor_output, positive_output, negative_output, margin=1.0, input_id=0):
    if not (0 <= input_id <= 2):
        raise RuntimeError("Error: input_id should be 0, 1 or 2 only!")
    fwd = triplet_loss_naive(anchor_output, positive_output, negative_output, margin)
    if input_id == 0:
        _jacs = list(akg.differentiate(fwd, [anchor_output, positive_output, negative_output], head))
    elif input_id == 1:
        _jacs = list(akg.differentiate(fwd, [positive_output], head))
    else:
        _jacs = list(akg.differentiate(fwd, [negative_output], head))
    return _jacs[0]

def mean_ad(head, input_shape, axis, keepdims):
    """mean autodiff."""
    tensor_a = tvm.placeholder(input_shape, head.dtype, "A")
    tensor_b = mean.mean(tensor_a, axis, keepdims)

    # remove useless mean_output
    if isinstance(tensor_b, tuple):
        tensor_b = tensor_b[0]
    if tensor_b.op.name == "mean_output":
        tensor_b = tensor_b.op.input_tensors[0]

    jacs = list(akg.differentiate(tensor_b, [tensor_a], head))
    return jacs[0]

def elu_ad(head, x, target="cce"):
    """
    Computes elu_grad.

    Args:
        head (tvm.tensor.Tensor): Tensor of type float16, float32.
        x (tvm.tensor.Tensor): Input of elu.

    Returns:
        akg.tvm.Tensor of same type and shape as inputs.
    """
    y = elu.elu(x)
    jacs = list(akg.differentiate(y, [x], head))
    return akg.lang.ascend.cast_to(jacs[0], head.dtype)

def bernoulli_logprob_ad(head, x, probs):
    """
    An example of differentiating bernoulli.logprob in all inputs and parameters.

    Args:
        head: The adjoint of the output, in other words, some tensors, by which the Jacobians will be multiplied.
        x: input, tensor of 0 or 1.
        probs: probabilities of random variables taking values 1.
    """
    mod = bernoulli.bernoulli(probs).log_prob(x)
    auto_diff_outs = list(akg.differentiate(mod, [x, probs], head))
    return auto_diff_outs

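# Sanity-check sketch (assuming log_prob is the standard Bernoulli log-likelihood
# x*log(p) + (1-x)*log(1-p); this is an illustration, not taken from the original code).
# The two adjoints returned above should then match, elementwise:
#   d logprob / dx = log(p) - log(1 - p)
#   d logprob / dp = x / p - (1 - x) / (1 - p)
# each multiplied by head, which can be used to verify auto_diff_outs numerically.
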
def cos_ad(head, a, target="cce"):
    """
    Computes cosine derivative value of a tensor.

    Args:
        head (tvm.tensor.Tensor): Tensor of type float16, float32.
        a (tvm.tensor.Tensor): Tensor of type float16, float32.

    Returns:
        akg.tvm.Tensor of same type and shape as inputs.
    """
    b, attr = cos.cos(a)
    jacs = list(akg.differentiate(b, [a], head))
    return jacs[0], attr

def avgpool_ad_no_custom_diff_manual_schedule(head, data, kernel, stride, pad):
    """Automatic differentiation of avgpool with manual schedule."""
    attrs = {"enable_post_poly_loop_partition": False, "enable_pre_poly_loop_partition": False}
    avgpool_fwd, _ = avgpool.avgpool(data, kernel, stride, pad)
    [dl_ddata] = akg.differentiate(avgpool_fwd, [data], head)

    # schedule for the differentiation operation
    s = akg.tvm.create_schedule([dl_ddata.op])

    kh, kw = kernel
    shape = get_shape(data)
    ib, ic1, ih, iw, ic0 = shape

    if kh == ih and kw == iw:
        pad2d_input_2_grad = dl_ddata
        res_value_res_grad = pad2d_input_2_grad.op.input_tensors[0]
        head = res_value_res_grad.op.input_tensors[0]

        def comp_func(s):
            head_ub = s.cache_read(head, "local.UB", [res_value_res_grad])
            result_ub = s.cache_write(pad2d_input_2_grad, "local.UB")

            s[res_value_res_grad].set_scope("local.UB")

            b, c1, h, w, c0 = pad2d_input_2_grad.op.axis
            s[head_ub].compute_at(s[pad2d_input_2_grad], b)
            s[res_value_res_grad].compute_at(s[pad2d_input_2_grad], b)
            s[result_ub].compute_at(s[pad2d_input_2_grad], b)
    else:
        pad2d_input_2_grad = dl_ddata
        Broadcast_jac = pad2d_input_2_grad.op.input_tensors[0]
        res_value_res_grad = Broadcast_jac.op.input_tensors[0]
        head = res_value_res_grad.op.input_tensors[0]

        def comp_func(s):
            head_ub = s.cache_read(head, "local.UB", [res_value_res_grad])
            result_ub = s.cache_write(pad2d_input_2_grad, "local.UB")

            s[Broadcast_jac].set_scope("local.UB")
            s[res_value_res_grad].set_scope("local.UB")

            b, c1, h, w, c0 = result_ub.op.axis
            s[result_ub].reorder(*result_ub.op.reduce_axis, b, c1, h, w, c0)
            s[Broadcast_jac].compute_at(s[result_ub], b)

    return dl_ddata, comp_func, attrs

def normal_diag_KLdiv_ad(head, mean, scale):
    """
    An example of differentiating normal_diag.KLdiv in all inputs and parameters.

    Args:
        head: The adjoint of the output, in other words, some tensors, by which the Jacobians will be multiplied.
        mean: vector of means of MVN.
        scale: vector of sigma of MVN with diagonal covariance.
    """
    mod = normal_diag.normal_diag(mean, scale).KL_divergence()
    auto_diff_outs = list(akg.differentiate(mod, [mean, scale], head))
    return auto_diff_outs

def reduce_min_ad_optimized(HEAD, data, axis, keepdims):
    def get_shape(pld):
        return [d.value for d in pld.shape]

    def grad_compute(grad, *indices):
        indices_list = list(indices)
        axis_list = [x + len(indices_list) if x < 0 else x for x in list(axis)]
        if keepdims:
            grad_indices_list = [indices_list[i] if i not in axis_list else 0
                                 for i in range(len(indices_list))]
        else:
            grad_indices_list = [indices_list[i] for i in range(len(indices_list))
                                 if i not in axis_list]
        grad_ind = tuple(grad_indices_list)
        return grad(*grad_ind)

    def custom_reduce_min_fdiff(out, inputs, grad, ad_attrs, new_pld_array):
        data = inputs[0]
        shape = get_shape(data)
        min_ = akg.lang.cce.reduce_min(data, axis=axis, keepdims=keepdims)
        min_broadcast = akg.lang.cce.broadcast(min_, shape)
        return [akg.tvm.compute(shape,
                                lambda *indices: akg.tvm.expr.Select(
                                    data(*indices) == min_broadcast(*indices),
                                    grad_compute(grad, *indices),
                                    akg.tvm.const(0, dtype=data.dtype)),
                                name="reduce_min_ad2")]

    L = reduce_min.reduce_min(data, axis, keepdims)
    [dL_ddata] = akg.differentiate(L, [data], HEAD, None, None,
                                   override={L: ([data], custom_reduce_min_fdiff)})
    return dL_ddata

def mean_ad(head, input_shape, axis, keepdims):
    """
    Compute gradient of mean operator using automatic differentiate.

    Args:
        head (tvm.tensor.Tensor): Input tensor.
        input_shape (Union[list, tuple]): Shape of input tensor of mean operator.
        axis (Union[list, tuple, int]): Specifies which axis to reduce.
        keepdims (bool): Keep the reduced axis with length 1 if keepdims is true.

    Returns:
        tvm.tensor.Tensor.
    """
    a = akg.tvm.placeholder(input_shape, head.dtype, "A")
    b, _ = mean.mean(a, axis, keepdims)
    jacs = list(akg.differentiate(b, [a], head))
    return jacs[0]