def _neg_loss(pred, gt):
    ''' Modified focal loss. Exactly the same as CornerNet.
        Runs faster and costs a little bit more memory
      Arguments:
        pred (batch x c x h x w)
        gt (batch x c x h x w)
    '''
    pos_inds = gt.__eq__(1).astype('float32')
    neg_inds = gt.__lt__(1).astype('float32')
    neg_weights = nd.power(1 - gt, 4)

    loss = 0

    pos_loss = nd.log(pred) * nd.power(1 - pred, 2) * pos_inds
    neg_loss = nd.log(1 - pred) * nd.power(pred, 2) * neg_weights * neg_inds

    num_pos = pos_inds.astype('float32').sum()
    pos_loss = pos_loss.sum()
    neg_loss = neg_loss.sum()

    if num_pos == 0:
        loss = loss - neg_loss
    else:
        loss = loss - (pos_loss + neg_loss) / num_pos
    return loss
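# Hedged usage sketch (illustrative only, not part of the original snippet):
# call the focal loss above on a tiny random heatmap. The ground truth uses 1
# at positive locations and values < 1 elsewhere, as the function assumes.
from mxnet import nd
pred = nd.clip(nd.random.uniform(shape=(2, 3, 4, 4)), 1e-4, 1 - 1e-4)  # predicted heatmap
gt = nd.zeros((2, 3, 4, 4))
gt[0, 0, 1, 1] = 1.0                  # one positive location
print(_neg_loss(pred, gt))            # scalar NDArray loss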
def implement_1(self, x, label):
    ''' following paper to implement '''
    # weight normalize
    with x.context:
        w = self.weight.data()
        w_norm = w / nd.sqrt(nd.sum(nd.power(w, 2), axis=1)).reshape((-1, 1))
        # cos_theta = x'w/|x|. note: |w| = 1
        x_norm = nd.power(x, 2)
        x_norm = nd.sum(x_norm, axis=1)
        x_norm = nd.sqrt(x_norm)
        cos_theta = nd.dot(x, w_norm, transpose_b=True)
        cos_theta = cos_theta / x_norm.reshape((-1, 1))
        cos_theta = nd.clip(cos_theta, -1, 1)
        # cos_m_theta = cos(m * theta)
        cos_m_theta = self.margin_cos[self.margin](cos_theta)
        # k
        with mx.autograd.pause():
            theta = nd.arccos(cos_theta)
            k = nd.sign((self.margin * theta / math.pi))
        # i=j is phi_theta and i!=j is cos_theta
        phi_theta = ((-1) ** k) * cos_m_theta - 2 * k
        x_norm_phi_theta = x_norm.reshape((-1, 1)) * phi_theta
        x_norm_cos_theta = x_norm.reshape((-1, 1)) * cos_theta
        # i=j index
        with mx.autograd.pause():
            index = nd.one_hot(label, x_norm_phi_theta.shape[1])
        # output
        with mx.autograd.pause():
            lamb = self.__get_lambda()
        output = x_norm_cos_theta * 1.0
        output = output - x_norm_cos_theta * index / (1 + lamb)
        output = output + x_norm_phi_theta * index / (1 + lamb)
        return output
def euclidean_dist(x, y):
    m, n = x.shape[0], y.shape[0]
    xx = nd.power(x, 2).sum(axis=1, keepdims=True).broadcast_to((m, n))
    yy = nd.power(y, 2).sum(axis=1, keepdims=True).broadcast_to((n, m)).T
    dist = xx + yy
    dist = dist - 2 * nd.dot(x, y.T)
    dist = dist.clip(a_min=1e-12, a_max=1e12).sqrt()
    return dist
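# Hedged usage sketch (illustrative only, not part of the original snippet):
# pairwise L2 distances between two small random embedding batches.
from mxnet import nd
x = nd.random.normal(shape=(4, 8))   # 4 embeddings of dimension 8
y = nd.random.normal(shape=(6, 8))   # 6 embeddings of dimension 8
dist = euclidean_dist(x, y)          # shape (4, 6); dist[i, j] = ||x_i - y_j||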
def forward(self, X):
    self.linear_item = nd.dot(X, self.w.data())
    self.interaction_item = nd.sum(
        nd.power(nd.dot(X, self.latent_vec.data()), 2) -
        nd.dot(nd.power(X, 2), nd.power(self.latent_vec.data(), 2)),
        axis=1, keepdims=True)
    self.y_hat = self.linear_item + self.interaction_item + self.b.data()
    return self.y_hat
def forward(self, adj, feat):
    r"""Compute (Dense) Graph Convolution layer.

    Parameters
    ----------
    adj : mxnet.NDArray
        The adjacency matrix of the graph to apply Graph Convolution on. When
        applied to a unidirectional bipartite graph, ``adj`` should be of shape
        :math:`(N_{out}, N_{in})`; when applied to a homograph, ``adj`` should
        be of shape :math:`(N, N)`. In both cases, a row represents a destination
        node while a column represents a source node.
    feat : mxnet.NDArray
        The input feature.

    Returns
    -------
    mxnet.NDArray
        The output feature of shape :math:`(N, D_{out})` where :math:`D_{out}`
        is the size of the output feature.
    """
    adj = adj.astype(feat.dtype).as_in_context(feat.context)
    src_degrees = nd.clip(adj.sum(axis=0), a_min=1, a_max=float('inf'))
    dst_degrees = nd.clip(adj.sum(axis=1), a_min=1, a_max=float('inf'))
    feat_src = feat

    if self._norm == 'both':
        norm_src = nd.power(src_degrees, -0.5)
        shp_src = norm_src.shape + (1,) * (feat.ndim - 1)
        norm_src = norm_src.reshape(shp_src).as_in_context(feat.context)
        feat_src = feat_src * norm_src

    if self._in_feats > self._out_feats:
        # mult W first to reduce the feature size for aggregation.
        feat_src = nd.dot(feat_src, self.weight.data(feat_src.context))
        rst = nd.dot(adj, feat_src)
    else:
        # aggregate first then mult W
        rst = nd.dot(adj, feat_src)
        rst = nd.dot(rst, self.weight.data(feat_src.context))

    if self._norm != 'none':
        if self._norm == 'both':
            norm_dst = nd.power(dst_degrees, -0.5)
        else:  # right
            norm_dst = 1.0 / dst_degrees
        shp_dst = norm_dst.shape + (1,) * (feat.ndim - 1)
        norm_dst = norm_dst.reshape(shp_dst).as_in_context(feat.context)
        rst = rst * norm_dst

    if self.bias is not None:
        rst = rst + self.bias.data(feat.context)

    if self._activation is not None:
        rst = self._activation(rst)

    return rst
def GoodFitting():
    # Just fitting, third-order polynomial
    n_train, n_test, true_w, true_b = 100, 100, [1.2, -3.4, 5.6], 5
    features = nd.random.normal(shape=(n_train + n_test, 1))
    poly_features = nd.concat(features, nd.power(features, 2),
                              nd.power(features, 3))
    labels = (true_w[0] * poly_features[:, 0] +
              true_w[1] * poly_features[:, 1] +
              true_w[2] * poly_features[:, 2] + true_b)
    labels += nd.random.normal(scale=0.1, shape=labels.shape)
    fit_and_plot(poly_features[:n_train, :], poly_features[n_train:, :],
                 labels[:n_train], labels[n_train:])
def box_ciou(b1, b2):
    """
    Inputs:
    ----------
    b1: NDArray, shape=(batch, feat_w, feat_h, anchor_num, 4), xywh
    b2: NDArray, shape=(batch, feat_w, feat_h, anchor_num, 4), xywh

    Returns:
    -------
    ciou: NDArray, shape=(batch, feat_w, feat_h, anchor_num, 1)
    """
    # Top-left and bottom-right corners of the predicted boxes
    b1_xy = b1[..., :2]
    b1_wh = b1[..., 2:4]
    b1_wh_half = b1_wh / 2.
    b1_mins = b1_xy - b1_wh_half
    b1_maxes = b1_xy + b1_wh_half
    # Top-left and bottom-right corners of the ground-truth boxes
    b2_xy = b2[..., :2]
    b2_wh = b2[..., 2:4]
    b2_wh_half = b2_wh / 2.
    b2_mins = b2_xy - b2_wh_half
    b2_maxes = b2_xy + b2_wh_half

    # IoU between ground-truth and predicted boxes
    intersect_mins = nd.maximum(b1_mins, b2_mins)
    intersect_maxes = nd.minimum(b1_maxes, b2_maxes)
    intersect_wh = nd.maximum(intersect_maxes - intersect_mins,
                              nd.zeros_like(intersect_maxes))
    intersect_area = intersect_wh[..., 0] * intersect_wh[..., 1]
    b1_area = b1_wh[..., 0] * b1_wh[..., 1]
    b2_area = b2_wh[..., 0] * b2_wh[..., 1]
    union_area = b1_area + b2_area - intersect_area
    iou = intersect_area / nd.clip(union_area, a_min=1e-6, a_max=float('inf'))

    # Squared distance between the box centers
    center_distance = nd.sum(nd.power((b1_xy - b2_xy), 2), axis=-1)

    # Top-left and bottom-right corners of the smallest box enclosing both boxes
    enclose_mins = nd.minimum(b1_mins, b2_mins)
    enclose_maxes = nd.maximum(b1_maxes, b2_maxes)
    enclose_wh = nd.maximum(enclose_maxes - enclose_mins,
                            nd.zeros_like(intersect_maxes))
    # Squared diagonal of the enclosing box
    enclose_diagonal = nd.sum(nd.power(enclose_wh, 2), axis=-1)
    ciou = iou - 1.0 * center_distance / nd.clip(enclose_diagonal,
                                                 a_min=1e-6, a_max=float('inf'))

    v = (4 / (math.pi ** 2)) * nd.power(
        (nd.arctan(b1_wh[..., 0] / nd.clip(b1_wh[..., 1], a_min=1e-6, a_max=float('inf'))) -
         nd.arctan(b2_wh[..., 0] / nd.clip(b2_wh[..., 1], a_min=1e-6, a_max=float('inf')))), 2)
    alpha = v / nd.clip((1.0 - iou + v), a_min=1e-6, a_max=float('inf'))
    ciou = ciou - alpha * v
    return ciou
def hybrid_forward(self, F, pred, label, sample_weight=None):
    label = _reshape_like(F, label, pred)
    if not self._from_sigmoid:
        max_val = F.relu(-pred)
        loss = pred - pred * label + max_val + \
            F.log(F.exp(-max_val) + F.exp(-pred - max_val))
    else:
        # note: ctx and batch_ratios are expected to be defined in the enclosing scope
        p = mx.nd.array(1 / (1 + nd.exp(-pred)), ctx=ctx)
        weights = nd.exp(label + (1 - label * 2) * batch_ratios)
        gamma = 2
        w_p, w_n = nd.power(1. - p, gamma), nd.power(p, gamma)
        loss = -(w_p * F.log(p + 1e-12) * label +
                 w_n * F.log(1. - p + 1e-12) * (1. - label))
        loss *= weights
    return F.mean(loss, axis=self._batch_axis, exclude=True)
def euclidean_dist(x, y):
    """
    Args:
      x: mxnet NDArray, with shape [m, d]
      y: mxnet NDArray, with shape [n, d]
    Returns:
      dist: mxnet NDArray, with shape [m, n]
    """
    m, n = x.shape[0], y.shape[0]
    xx = nd.power(x, 2).sum(axis=1, keepdims=True).broadcast_to((m, n))
    yy = nd.power(y, 2).sum(axis=1, keepdims=True).broadcast_to((n, m)).T
    dist = xx + yy
    dist = dist - 2 * nd.dot(x, y.T)
    dist = dist.clip(a_min=1e-12, a_max=1e12).sqrt()  # for numerical stability
    return dist
def _not_faster_neg_loss(pred, gt):
    pos_inds = gt.__eq__(1).astype('float32')
    neg_inds = gt.__lt__(1).astype('float32')
    num_pos = pos_inds.astype('float32').sum()
    neg_weights = nd.power(1 - gt, 4)
    loss = 0

    trans_pred = pred * neg_inds + (1 - pred) * pos_inds
    weight = neg_weights * neg_inds + pos_inds
    all_loss = nd.log(1 - trans_pred) * nd.power(trans_pred, 2) * weight
    all_loss = all_loss.sum()

    if num_pos > 0:
        all_loss /= num_pos
    loss -= all_loss
    return loss
def train_model(model, train_xs, train_ys):
    with device_ctx:
        # Convert to ndarray
        train_xs, train_ys = nd.array(train_xs), nd.array(train_ys)

        # Prepare the train dataset and model
        batch_size = 100
        train_dataset = data.ArrayDataset(train_xs, train_ys)
        train_data_iter = data.DataLoader(train_dataset, batch_size=batch_size,
                                          shuffle=True)
        # model = nn.Sequential()
        # model.add(nn.Dense(1, activation=None))
        # # model.add(nn.Dense(1))
        model.initialize(init.Normal(sigma=0.1))
        # model.initialize(init.Xavier())
        loss_f = loss.L2Loss()
        trainer = gluon.Trainer(model.collect_params(), 'sgd',
                                {'learning_rate': 0.05})

        # Train the model
        num_epochs = 25
        for epoch in range(1, num_epochs + 1):
            l = None
            for X, y in train_data_iter:
                with autograd.record():
                    # l = loss_f(model(X), y)
                    l = nd.power(model(X) - y, 2)
                l.backward()
                trainer.step(batch_size)
            # l = loss_f(model(train_xs), train_ys)
            l = nd.power(model(train_xs) - train_ys, 2)
            mse = np.sum(
                np.power(
                    train_ys.asnumpy().reshape(-1, 1) -
                    model(train_xs).asnumpy().reshape(-1, 1), 2)) / len(train_xs)
            if epoch % 5 == 0:
                print('epoch %d, loss: %.4f, mse: %.4f' %
                      (epoch, l.mean().asnumpy(), mse))
def position_encoding_init(max_length, dim):
    X = nd.arange(0, max_length).reshape((-1, 1)) / \
        nd.power(10000, nd.arange(0, dim, 2) / dim)
    position_weight = nd.zeros((max_length, dim))
    position_weight[:, 0::2] = nd.sin(X)
    position_weight[:, 1::2] = nd.cos(X)
    return position_weight
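# Hedged usage sketch (illustrative only, not part of the original snippet):
# build a sinusoidal position table for a toy sequence length and embedding size.
from mxnet import nd
pos_weight = position_encoding_init(max_length=50, dim=16)  # illustrative sizes
print(pos_weight.shape)   # (50, 16)
print(pos_weight[0])      # position 0: sin terms are 0, cos terms are 1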
def load_data_polynomial(true_w, true_b, num_train=5000, num_test=1000):
    """Generate features, polynomial features and noisy labels for polynomial regression."""
    features = nd.normal(shape=(num_train + num_test, 1))
    poly_features = [nd.power(features, i) for i in range(1, len(true_w) + 1)]
    poly_features = nd.concat(*poly_features)
    labels = nd.dot(poly_features, true_w) + true_b
    labels += nd.random.normal(scale=0.1, shape=labels.shape)
    return features, poly_features, labels
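# Hedged usage sketch (illustrative only, not part of the original snippet):
# generate a small cubic dataset y = 1.2*x - 3.4*x^2 + 5.6*x^3 + 5 + noise,
# matching the polynomial-fitting examples elsewhere in this collection.
from mxnet import nd
true_w = nd.array([1.2, -3.4, 5.6])
features, poly_features, labels = load_data_polynomial(true_w, true_b=5,
                                                       num_train=100, num_test=100)
print(features.shape, poly_features.shape, labels.shape)  # (200, 1) (200, 3) (200,)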
def __init__(self, units, dropout, max_len=1000):
    super(PositionalEncoding, self).__init__()
    self.dropout = nn.Dropout(dropout)
    # Create a long enough P; max_len plays the role of the max sequence length T
    # P: (1, max_len, D)
    self.P = nd.zeros((1, max_len, units))
    # X: (max_len, D/2)
    X = nd.arange(0, max_len).reshape((-1, 1)) / \
        nd.power(10000, nd.arange(0, units, 2) / units)
    self.P[:, :, 0::2] = nd.sin(X)  # even columns (0, 2, 4, ...) get sin
    self.P[:, :, 1::2] = nd.cos(X)  # odd columns (1, 3, 5, ...) get cos
def _sum_n_square(self):
    """Helper function for parameter regularisation"""
    sum_of_square = 0
    pdict = self.layer.params
    with autograd.record():
        for param in pdict:
            sum_of_square = sum_of_square + nd.sum(
                nd.power(pdict[param].data(), 2))
    return sum_of_square
def __init__(self, units, dropout, max_len=1000):
    super(PositionalEncoding, self).__init__()
    self.dropout = nn.Dropout(dropout)
    # Create a long enough P
    self.P = nd.zeros((1, max_len, units))
    X = nd.arange(0, max_len).reshape((-1, 1)) / \
        nd.power(10000, nd.arange(0, units, 2) / units)
    self.P[:, :, 0::2] = nd.sin(X)
    self.P[:, :, 1::2] = nd.cos(X)
def implement_0(self, x, label):
    ''' following the sphereface code of caffe '''
    # weight normalize
    with x.context:
        w = self.weight.data()
        with mx.autograd.pause():
            w_norm = w / nd.sqrt(nd.sum(nd.power(w, 2), axis=1)).reshape((-1, 1))
            w[:] = w_norm
        # x_norm = |x|
        x_norm = nd.power(x, 2)
        x_norm = nd.sum(x_norm, axis=1)
        x_norm = nd.sqrt(x_norm)
        # cos_theta = x'w/|x|. note: |w| = 1
        cos_theta = nd.dot(x, w, transpose_b=True)
        cos_theta = cos_theta / x_norm.reshape((-1, 1))
        # cos_theta_quadratic & cos_theta_quartic
        cos_theta_quadratic = cos_theta ** 2
        cos_theta_quartic = cos_theta ** 4
        with mx.autograd.pause():
            # sign_0 = sign(cos_theta)
            sign_0 = nd.sign(cos_theta)
            # sign_3 = sign_0 * sign(2 * cos_theta_quadratic - 1)
            sign_3 = sign_0 * nd.sign(2 * cos_theta_quadratic - 1)
            # sign_4 = 2 * sign_0 + sign_3 - 3
            sign_4 = 2 * sign_0 + sign_3 - 3
        # phi_theta = sign_3 * (8 * cos_theta_quartic - 8 * cos_theta_quadratic + 1) + sign_4
        phi_theta = sign_3 * (8 * cos_theta_quartic -
                              8 * cos_theta_quadratic + 1) + sign_4
        x_norm_phi_theta = x_norm.reshape((-1, 1)) * phi_theta
        # i=j index
        with mx.autograd.pause():
            index = nd.one_hot(label, x_norm_phi_theta.shape[1])
        # output
        with mx.autograd.pause():
            lamb = self.__get_lambda()  # 10
        output = nd.dot(x, w, transpose_b=True)
        output2 = output * (1.0 - index) + x_norm_phi_theta * index
        output3 = (output2 + lamb * nd.dot(x, w, transpose_b=True)) / (1 + lamb)
        return output3
def predict(yolo: Yolo, x, threshold=0.5):
    """
    return label, C, location
    :param yolo:
    :return:
    """
    assert len(x) == 1, "Only one image for now"
    ypre = yolo(x)
    label, preds, location = deal_output(ypre, yolo.s, b=yolo.b, c=yolo.class_num)
    indexs = []
    for i, c in enumerate(preds[0]):
        if c > threshold:
            indexs.append(i)
    class_names = []
    C_list = []
    bos_list = []
    for index in indexs:
        label_index = int(index / 2)
        location_offect = int(index % 2)
        class_index = nd.argmax(label[0][label_index], axis=0)
        C = preds[0][index]
        locat = location[0][label_index][location_offect]
        C_list.append(C.asscalar())
        # translate the class name
        label_name = yolo.class_names
        text = label_name[int(class_index.asscalar())]
        class_names.append(text)
        # translate the location
        x, y, w, h = locat
        w, h = nd.power(w, 2), nd.power(h, 2)
        ceil = 1 / 4
        row = int(label_index / 4)
        columns = label_index % 4
        x_center = columns * ceil + x
        y_center = row * ceil + y
        x_min, y_min, x_max, y_max = (x_center - 0.5 * w, y_center - 0.5 * h,
                                      x_center + 0.5 * w, y_center + 0.5 * h)
        box = nd.concatenate([x_min, y_min, x_max, y_max], axis=0) * 256
        bos_list.append(box.asnumpy())
    return class_names, C_list, bos_list
def normal():
    """
    Each element is sampled from a normal distribution with mean 0 and standard
    deviation 1. Note that a.norm() is equivalent to nd.sqrt(nd.power(a, 2).sum()).
    :return:
    """
    n = nd.normal(0, 1, shape=(2, 2))
    logger.info(n)
    a = nd.array([1, 2, 3, 4])
    print(a.norm())
    print(nd.sqrt(nd.power(a, 2).sum()))
def scale_and_bound(self, sample, log_prob, mean):
    action_bounded = sample.tanh()  # bound action
    action_scaled = action_bounded * self.action_scale + self.action_bias  # scale action
    mean_bounded = mean.tanh() * self.action_scale + self.action_bias  # bound and scale mean
    log_prob_bounded = log_prob - (self.action_scale *
                                   (1 - nd.power(action_bounded, 2)) + EPSILON).log()
    return action_scaled, log_prob_bounded, mean_bounded
def _update_params(self, accumulated_grads):
    # scale gradients by lot size, add noise, and update the parameters
    for param_name, param in self._params.items():
        # average the clipped gradients and then add noise to each averaged gradient
        param_grad_update = (accumulated_grads[param_name] / self._hyperparams['lot_size']) + \
            mx.random.normal(0, self._hyperparams['sigma'], param.shape,
                             ctx=self._model_ctx)
        # update biased first moment estimate
        self._m[param_name] = self._hyperparams['beta_1'] * self._m[param_name] + \
            (1 - self._hyperparams['beta_1']) * param_grad_update
        # update biased second raw moment estimate
        self._v[param_name] = self._hyperparams['beta_2'] * self._v[param_name] + \
            (1 - self._hyperparams['beta_2']) * nd.square(param_grad_update)
        # compute bias-corrected first moment estimate
        m_hat = self._m[param_name] / (1 - nd.power(self._hyperparams['beta_1'],
                                                    self._step + 1))
        # compute bias-corrected second raw moment estimate
        v_hat = self._v[param_name] / (1 - nd.power(self._hyperparams['beta_2'],
                                                    self._step + 1))
        # update params with ADAM
        param[:] = param - self._hyperparams['lr'] * m_hat / (nd.sqrt(v_hat) + 1e-8)
def positional(x):
    batch_size, length, model_dim = x.shape
    # (length, 1)
    pos = nd.arange(length).expand_dims(1)
    # (1, model_dim/2), 10000^(2i/model_dim)
    div = nd.power(10000, nd.arange(model_dim / 2) * 2 / model_dim)
    out = nd.zeros((length, model_dim))
    out[:, 0::2] = nd.sin(pos / div)
    out[:, 1::2] = nd.cos(pos / div)
    return nd.broadcast_axis(out.expand_dims(0), axis=0, size=batch_size)
def power(self, tensor_in_1, tensor_in_2):
    """
    Result of first array elements raised to powers from second array,
    element-wise with broadcasting.

    Args:
        tensor_in_1 (Tensor): Tensor object
        tensor_in_2 (Tensor): Tensor object

    Returns:
        MXNet NDArray: First array elements raised to powers from second array.
    """
    tensor_in_1 = self.astensor(tensor_in_1)
    tensor_in_2 = self.astensor(tensor_in_2)
    return nd.power(tensor_in_1, tensor_in_2)
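# Hedged sketch (illustrative only, not part of the original snippet): the
# underlying nd.power call broadcasts element-wise, e.g. a (2, 3) base against
# per-column exponents of shape (1, 3).
from mxnet import nd
base = nd.array([[1., 2., 3.], [4., 5., 6.]])
exponents = nd.array([[2., 0.5, 1.]])   # broadcast along the first axis
print(nd.power(base, exponents))        # [[1., 1.414, 3.], [16., 2.236, 6.]]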
def forward(self, adj, feat):
    r"""Compute (Dense) Graph Convolution layer.

    Parameters
    ----------
    adj : mxnet.NDArray
        The adjacency matrix of the graph to apply Graph Convolution on,
        should be of shape :math:`(N, N)`, where a row represents the destination
        and a column represents the source.
    feat : mxnet.NDArray
        The input feature of shape :math:`(N, D_{in})` where :math:`D_{in}`
        is size of input feature, :math:`N` is the number of nodes.

    Returns
    -------
    mxnet.NDArray
        The output feature of shape :math:`(N, D_{out})` where :math:`D_{out}`
        is size of output feature.
    """
    adj = adj.astype(feat.dtype).as_in_context(feat.context)
    if self._norm:
        in_degrees = adj.sum(axis=1)
        norm = nd.power(in_degrees, -0.5)
        shp = norm.shape + (1,) * (feat.ndim - 1)
        norm = norm.reshape(shp).as_in_context(feat.context)
        feat = feat * norm

    if self._in_feats > self._out_feats:
        # mult W first to reduce the feature size for aggregation.
        feat = nd.dot(feat, self.weight.data(feat.context))
        rst = nd.dot(adj, feat)
    else:
        # aggregate first then mult W
        rst = nd.dot(adj, feat)
        rst = nd.dot(rst, self.weight.data(feat.context))

    if self._norm:
        rst = rst * norm

    if self.bias is not None:
        rst = rst + self.bias.data(feat.context)

    if self._activation is not None:
        rst = self._activation(rst)

    return rst
def forward(self, graph, feat):
    r"""Compute Simplifying Graph Convolution layer.

    Parameters
    ----------
    graph : DGLGraph
        The graph.
    feat : mxnet.NDArray
        The input feature of shape :math:`(N, D_{in})` where :math:`D_{in}`
        is size of input feature, :math:`N` is the number of nodes.

    Returns
    -------
    mxnet.NDArray
        The output feature of shape :math:`(N, D_{out})` where :math:`D_{out}`
        is size of output feature.

    Notes
    -----
    If ``cache`` is set to True, ``feat`` and ``graph`` should not change during
    training, or you will get wrong results.
    """
    graph = graph.local_var()
    if self._cached_h is not None:
        feat = self._cached_h
    else:
        # compute normalization
        degs = nd.clip(graph.in_degrees().astype(feat.dtype), 1, float('inf'))
        norm = nd.power(degs, -0.5).expand_dims(1)
        norm = norm.as_in_context(feat.context)
        # compute (D^-1 A D)^k X
        for _ in range(self._k):
            feat = feat * norm
            graph.ndata['h'] = feat
            graph.update_all(fn.copy_u('h', 'm'), fn.sum('m', 'h'))
            feat = graph.ndata.pop('h')
            feat = feat * norm

        if self.norm is not None:
            feat = self.norm(feat)

        # cache feature
        if self._cached:
            self._cached_h = feat
    return self.fc(feat)
import sys
sys.path.append('..')
import gluonbook as gb
from mxnet import autograd, gluon, nd
from mxnet.gluon import data as gdata, loss as gloss, nn

# y = 1.2*x - 3.4*x^2 + 5.6*x^3 + 5 + epsilon
n_train = 100
n_test = 100
true_w = [1.2, -3.4, 5.6]
true_b = 5

features = nd.random.normal(shape=(n_train + n_test, 1))
poly_features = nd.concat(features, nd.power(features, 2),
                          nd.power(features, 3))
labels = (true_w[0] * poly_features[:, 0] +
          true_w[1] * poly_features[:, 1] +
          true_w[2] * poly_features[:, 2] + true_b)
labels += nd.random.normal(scale=0.01, shape=labels.shape)

from IPython.display import set_matplotlib_formats


def semilogy(x_vals, y_vals, x_label, y_label, x2_vals=None, y2_vals=None,
             legend=None,
def sum_squared_error(self, yhat, y):
    return nd.nansum(nd.power(y - yhat, 2), axis=0, exclude=True)
def forward(self, graph, feat):
    r"""
    Description
    -----------
    Compute Simplifying Graph Convolution layer.

    Parameters
    ----------
    graph : DGLGraph
        The graph.
    feat : mxnet.NDArray
        The input feature of shape :math:`(N, D_{in})` where :math:`D_{in}`
        is size of input feature, :math:`N` is the number of nodes.

    Returns
    -------
    mxnet.NDArray
        The output feature of shape :math:`(N, D_{out})` where :math:`D_{out}`
        is size of output feature.

    Raises
    ------
    DGLError
        If there are 0-in-degree nodes in the input graph, it will raise DGLError
        since no message will be passed to those nodes. This will cause invalid output.
        The error can be ignored by setting ``allow_zero_in_degree`` parameter to ``True``.

    Note
    ----
    If ``cache`` is set to True, ``feat`` and ``graph`` should not change during
    training, or you will get wrong results.
    """
    with graph.local_scope():
        if not self._allow_zero_in_degree:
            if graph.in_degrees().min() == 0:
                raise DGLError('There are 0-in-degree nodes in the graph, '
                               'output for those nodes will be invalid. '
                               'This is harmful for some applications, '
                               'causing silent performance regression. '
                               'Adding self-loop on the input graph by '
                               'calling `g = dgl.add_self_loop(g)` will resolve '
                               'the issue. Setting ``allow_zero_in_degree`` '
                               'to be `True` when constructing this module will '
                               'suppress the check and let the code run.')

        if self._cached_h is not None:
            feat = self._cached_h
        else:
            # compute normalization
            degs = nd.clip(graph.in_degrees().astype(feat.dtype), 1, float('inf'))
            norm = nd.power(degs, -0.5).expand_dims(1)
            norm = norm.as_in_context(feat.context)
            # compute (D^-1 A D)^k X
            for _ in range(self._k):
                feat = feat * norm
                graph.ndata['h'] = feat
                graph.update_all(fn.copy_u('h', 'm'), fn.sum('m', 'h'))
                feat = graph.ndata.pop('h')
                feat = feat * norm

            if self.norm is not None:
                feat = self.norm(feat)

            # cache feature
            if self._cached:
                self._cached_h = feat
        return self.fc(feat)
def total_variation_loss(x):
    """ regularize convolutional masks (not currently in use) """
    a = nd.square(x[:, :, :-1, :-1] - x[:, :, 1:, :-1])
    b = nd.square(x[:, :, :-1, :-1] - x[:, :, :-1, 1:])
    return nd.sum(nd.mean(nd.power(a + b, 1.25), axis=(2, 3)))
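# Hedged usage sketch (illustrative only, not part of the original snippet):
# total variation penalty on a batch of two single-channel 8x8 masks;
# smoother inputs give a smaller penalty.
from mxnet import nd
masks = nd.random.uniform(shape=(2, 1, 8, 8))
print(total_variation_loss(masks))   # scalar NDArray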
def hybrid_forward(self, F, x, a, b):
    mean = x.mean(axis=-1)  # batch * _in_seq_len
    _mean = nd.repeat(mean.expand_dims(axis=-1), repeats=x.shape[-1],
                      axis=-1)  # batch * _in_seq_len * embedding_dim
    std = nd.sqrt(nd.sum(nd.power((x - _mean), 2), axis=-1) /
                  x.shape[1])  # batch * _in_seq_len
    _std = nd.repeat(std.expand_dims(axis=-1), repeats=x.shape[-1],
                     axis=-1)  # batch * _in_seq_len * embedding_dim
    return F.elemwise_div(F.multiply((x - _mean), a), (_std + self.eps)) + b