def dot_attention(query, key, value, mask, dropout=0.0):
    # query: (batch_size, h, length_q, model_dim/h)
    # key:   (batch_size, h, length_k, model_dim/h)
    # value: (batch_size, h, length_k, model_dim/h)
    query_shape = query.shape
    query = query.reshape(-3, -2)
    key = key.reshape(-3, -2)
    value = value.reshape(-3, -2)
    # matmul, t: (batch_size*h, length_q, length_k)
    t = nd.batch_dot(query, key.swapaxes(1, 2)) / math.sqrt(query.shape[-1])
    # mask out PAD and future words
    m = nd.full(t.shape, LARGE_NEGATIVE_VALUE)
    mask = nd.ones(t.shape) * mask
    t = nd.where(mask, t, m)
    # softmax
    t = nd.softmax(t, axis=-1)
    if dropout > 0.0:
        t = nd.Dropout(t, p=dropout)
    # (batch_size, h, length_q, model_dim/h)
    return nd.batch_dot(t, value).reshape(query_shape)
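# Usage sketch for dot_attention (hypothetical shapes): a mask of ones keeps
# every position, so the call below is plain multi-head scaled dot-product
# attention. Assumes LARGE_NEGATIVE_VALUE is defined in the same module, e.g.
#     LARGE_NEGATIVE_VALUE = -1e18
from mxnet import nd

batch_size, h, length_q, length_k, d_k = 2, 4, 5, 5, 8
query = nd.random.uniform(shape=(batch_size, h, length_q, d_k))
key = nd.random.uniform(shape=(batch_size, h, length_k, d_k))
value = nd.random.uniform(shape=(batch_size, h, length_k, d_k))
mask = nd.ones((batch_size * h, length_q, length_k))  # 1 = attend, 0 = masked
out = dot_attention(query, key, value, mask)
print(out.shape)  # (2, 4, 5, 8)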
def _train_batch(self, batch, gibbs_sampling_steps, learning_rate):
    """Performs k-step Contrastive Divergence (CD-k) learning.

    Updates weights and biases. Keep in mind that most variables are
    "batch" tensors. The variable name suffix "_pr" stands for probability.
    """
    hidden_pr, hidden, dreamed_visible, dreamed_hidden_pr = self.gibbs_sampling_step(batch)
    positive_phase = nd.batch_dot(self._transpose_batch(batch), hidden)

    for _ in range(gibbs_sampling_steps - 1):
        _, _, dreamed_visible, dreamed_hidden_pr = self.gibbs_sampling_step(dreamed_visible)
    negative_phase = nd.batch_dot(self._transpose_batch(dreamed_visible),
                                  dreamed_hidden_pr)

    # make the learning rate independent of the batch size
    learning_rate = learning_rate / batch.shape[0]

    self.weights += learning_rate * nd.sum(positive_phase - negative_phase, axis=(0,))
    if self.hidden_bias is not None:
        self.hidden_bias += learning_rate * nd.sum(hidden_pr - dreamed_hidden_pr, axis=(0,))
    if self.visible_bias is not None:
        self.visible_bias += learning_rate * nd.sum(batch - dreamed_visible, axis=(0,))
def forward(self, query, key, value, mask=None):
    d = query.shape[-1]
    scores = nd.batch_dot(query, key, transpose_b=True) / math.sqrt(d)
    attention_weights = nlp.model.attention_cell._masked_softmax(
        nd, scores, mask, scores.dtype)
    attention_weights = self.dropout(attention_weights)
    return nd.batch_dot(attention_weights, value)
def forward(self, input_data):
    freq = input_data[:, 0:2].expand_dims(1)
    input_data = input_data[:, 2:]
    e1_vec_start = FIXED_WORD_LENGTH * DIMENSION
    x = input_data[:, :e1_vec_start].reshape(
        (input_data.shape[0], FIXED_WORD_LENGTH, DIMENSION))  # (m, 60, 110)

    e1neimask = input_data[:, e1_vec_start:e1_vec_start + MASK_LENGTH]  # (m, 51)
    e1edge = input_data[:, e1_vec_start + MASK_LENGTH:
                        e1_vec_start + MASK_LENGTH + ENTITY_EDGE_VEC_LENGTH].reshape(
        (input_data.shape[0], ENTITY_DEGREE, WORD_DIMENSION * 2))  # (m, 51, 200)
    e1neigh = e1edge[:, :, :WORD_DIMENSION]

    e2_vec_start = e1_vec_start + MASK_LENGTH + ENTITY_EDGE_VEC_LENGTH
    e2neimask = input_data[:, e2_vec_start:e2_vec_start + MASK_LENGTH]  # (m, 51)
    e2edge = input_data[:, e2_vec_start + MASK_LENGTH:
                        e2_vec_start + MASK_LENGTH + ENTITY_EDGE_VEC_LENGTH].reshape(
        (input_data.shape[0], ENTITY_DEGREE, WORD_DIMENSION * 2))  # (m, 51, 200)
    e2neigh = e2edge[:, :, :WORD_DIMENSION]

    gru = self.gru
    x = nd.transpose(x, axes=(1, 0, 2))
    h = gru(x)
    ht = nd.transpose(h, axes=(1, 0, 2))
    gru_out = self.gru_out
    y1 = gru_out(ht.expand_dims(1))  # (m, 200)

    att = self.center_att
    e1edge = nd.tanh(e1edge)
    e1g = att(e1edge) * freq[:, :, :1]          # (m, 51, 1)
    e1g = e1g * e1neimask.expand_dims(2)
    e1g = nd.softmax(e1g, axis=1)
    e1gt = nd.transpose(e1g, axes=(0, 2, 1))    # (m, 1, 51)
    e1n = nd.batch_dot(e1gt, e1neigh)           # (m, 1, 100)
    e1n = e1n.reshape((e1n.shape[0], 100))      # (m, 100)

    e2edge = nd.tanh(e2edge)
    e2g = att(e2edge) * freq[:, :, 1:]          # (m, 51, 1)
    e2g = e2g * e2neimask.expand_dims(2)
    e2g = nd.softmax(e2g, axis=1)
    e2gt = nd.transpose(e2g, axes=(0, 2, 1))    # (m, 1, 51)
    e2n = nd.batch_dot(e2gt, e2neigh)           # (m, 1, 100)
    e2n = e2n.reshape((e2n.shape[0], 100))      # (m, 100)

    center_y = nd.concat(e1n, e2n, dim=1)       # (m, 200)
    center_out = self.center_out
    center_y = center_out(center_y)

    out = self.output
    y4 = nd.concat(y1, center_y, dim=1)
    y5 = out(y4)
    return y5
def forward(self, x):
    """Forward of the Relation Module (multi-head self-attention over box features).

    Parameters
    ----------
    x : mxnet.nd.NDArray or mxnet.symbol
        Per-box feature tensor; the leading dimension is the number of
        bounding boxes N.

    Returns
    -------
    out : mxnet.nd.NDArray or mxnet.symbol
        Relation feature tensor with one row per bounding box.
    """
    e = self.dim_k        # e = 1024 (feature size)
    # k, v, q = N (number of bounding boxes)
    k, v, q = x.shape[0], x.shape[0], x.shape[0]
    h = self.num_group    # h = 16 (number of heads for multi-head attention)
    x = x.reshape(k, h, e)
    x = x.reshape(k * h, e)

    keys = self.to_keys(x).reshape(k, h, e).transpose(axes=(1, 0, 2))        # keys    : (h, k, e)
    values = self.to_values(x).reshape(k, h, e).transpose(axes=(1, 0, 2))    # values  : (h, v, e)
    queries = self.to_queries(x).reshape(k, h, e).transpose(axes=(1, 0, 2))  # queries : (h, q, e)

    keys = keys / (self.num_feat ** (1 / 4))
    queries = queries / (self.num_feat ** (1 / 4))

    dot = F.batch_dot(lhs=queries, rhs=keys,
                      transpose_a=False, transpose_b=True)   # dot : (h, q, k)
    attention = F.softmax(dot, axis=2)
    out = F.batch_dot(lhs=attention, rhs=values,
                      transpose_a=False, transpose_b=False)  # out : (h, q, e)
    out = out.transpose(axes=(1, 0, 2))  # out : (q, h, e)
    out = out.reshape(q, -1)             # out : (q, h*e)
    out = self.unify_heads(out)          # out : (q, e)
    return out
def _get_co_attention(as_, bs_, r, lamb=k_lambda):
    """
    as_, bs_: (batch_size, seq_len, embed_size)
    r: (batch_size, seq_len, seq_len, 5)
    """
    e = nd.batch_dot(as_, bs_, transpose_b=True) + lamb * F(r, ctx)  # (batch_size, seq_len, seq_len)
    alpha = nd.softmax(e, axis=2)  # alpha_ij = exp(e_ij) / SUM_k(exp(e_ik))
    beta = nd.softmax(e, axis=1)   # beta_ij = exp(e_ij) / SUM_k(exp(e_kj))
    beta = nd.transpose(beta, axes=[0, 2, 1])  # transpose because softmax was taken over axis=1
    ac = nd.batch_dot(alpha, bs_)
    bc = nd.batch_dot(beta, as_)
    return ac, bc, alpha, beta
def forward(self, query, key, value, valid_length):
    """Forward function"""
    query, key = self.W_k(query), self.W_q(key)
    features = query.expand_dims(axis=2) + key.expand_dims(axis=1)
    scores = self.v(features).squeeze(axis=-1)
    attention_weights = self.dropout(masked_softmax(scores, valid_length))
    return nd.batch_dot(attention_weights, value)
def forward(self, x):
    '''
    Parameters
    ----------
    x: mx.ndarray, shape is (batch_size, N, C_{r-1}, T_{r-1})

    Returns
    ----------
    mx.ndarray, shape is (batch_size, N, num_of_time_filters, T_{r-1})
    '''
    (batch_size, num_of_vertices,
     num_of_features, num_of_timesteps) = x.shape

    # shape is (batch_size, T, T)
    temporal_At = self.TAt(x)

    x_TAt = (nd.batch_dot(x.reshape(batch_size, -1, num_of_timesteps),
                          temporal_At)
             .reshape(batch_size, num_of_vertices,
                      num_of_features, num_of_timesteps))

    # cheb gcn with spatial attention
    spatial_At = self.SAt(x_TAt)
    spatial_gcn = self.cheb_conv_SAt(x, spatial_At)

    # convolution along the time axis
    time_conv_output = (self.time_conv(spatial_gcn.transpose((0, 2, 1, 3)))
                        .transpose((0, 2, 1, 3)))

    # residual shortcut
    x_residual = (self.residual_conv(x.transpose((0, 2, 1, 3)))
                  .transpose((0, 2, 1, 3)))

    return self.ln(nd.relu(x_residual + time_conv_output))
def forward(self, x1, x2):
    y1 = self.mlp(x1)
    y2 = self.mlp(x2)
    # reshape: add dummy dimensions
    y1 = y1.expand_dims(axis=1)  # y1: (N, 1, C)
    y2 = y2.expand_dims(axis=2)  # y2: (N, C, 1)
    return nd.batch_dot(y1, y2)
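# Minimal standalone sketch of the batch_dot pattern above (hypothetical
# shapes): with y1 shaped (N, 1, C) and y2 shaped (N, C, 1), nd.batch_dot
# returns the per-sample inner product as an (N, 1, 1) tensor.
from mxnet import nd

N, C = 4, 16
a = nd.random.uniform(shape=(N, C))
b = nd.random.uniform(shape=(N, C))
score = nd.batch_dot(a.expand_dims(axis=1), b.expand_dims(axis=2))  # (N, 1, 1)
same = nd.sum(a * b, axis=1)                                        # (N,) same values
print(score.reshape(-1).asnumpy(), same.asnumpy())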
def _calculate_trilinear_similarity(self, context, query, context_max_len,
                                    query_max_len, w4mlu, bias):
    """Implement the computation of trilinear similarity function.

    refer https://github.com/NLPLearn/QANet/blob/master/layers.py#L505

    The similarity function is:
            f(w, q) = W[w, q, w * q]
    where w and q represent a word in the context and in the query respectively,
    and the * operator is the Hadamard product.

    Parameters
    -----------
    context : NDArray
        input tensor with shape `(batch_size, context_sequence_length, hidden_size)`
    query : NDArray
        input tensor with shape `(batch_size, query_sequence_length, hidden_size)`
    context_max_len : int
    query_max_len : int

    Returns
    --------
    similarity_mat : NDArray
        output tensor with shape `(batch_size, context_sequence_length, query_sequence_length)`
    """
    subres0 = nd.tile(self.w4c(context), [1, 1, query_max_len])
    subres1 = nd.tile(nd.transpose(self.w4q(query), axes=(0, 2, 1)),
                      [1, context_max_len, 1])
    subres2 = nd.batch_dot(w4mlu * context,
                           nd.transpose(query, axes=(0, 2, 1)))
    similarity_mat = subres0 + subres1 + subres2 + bias
    return similarity_mat
def attention(query, key, value, mask=None, dropout=None):
    # scaled dot-product attention: softmax(Q * K^T / sqrt(d)) * V
    assert len(query.shape) == 3
    assert len(key.shape) == 3
    assert len(value.shape) == 3
    d_model = query.shape[-1]
    scores = nd.batch_dot(query, key, transpose_b=True) / math.sqrt(d_model)
    if mask is not None:
        val = nd.ones(scores.shape, ctx=cfg.ctx) * (-1e9)
        scores = nd.where(mask == 1, scores, val)
    p_attn = nd.softmax(scores, axis=-1)
    if dropout is not None:
        p_attn = dropout(p_attn)
    return nd.batch_dot(p_attn, value), p_attn
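# Usage sketch for attention (hypothetical shapes): with mask=None and
# dropout=None the function reduces to plain scaled dot-product attention,
# so the cfg.ctx global is never touched.
from mxnet import nd

query = nd.random.uniform(shape=(2, 7, 64))
key = nd.random.uniform(shape=(2, 9, 64))
value = nd.random.uniform(shape=(2, 9, 64))
output, weights = attention(query, key, value)
print(output.shape, weights.shape)  # (2, 7, 64) (2, 7, 9)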
def forward(self, cur_input, state, encoder_outputs):
    # When the RNN has multiple hidden layers, take the hidden state of the
    # single layer closest to the output layer.
    single_layer_state = [state[0][-1].expand_dims(0)]
    encoder_outputs = encoder_outputs.reshape((self.max_seq_len, -1,
                                               self.encoder_num_hiddens))
    hidden_broadcast = nd.broadcast_axis(single_layer_state[0], axis=0,
                                         size=self.max_seq_len)
    encoder_outputs_and_hiddens = nd.concat(encoder_outputs,
                                            hidden_broadcast, dim=2)
    energy = self.attention(encoder_outputs_and_hiddens)
    batch_attention = nd.softmax(energy, axis=0).transpose((1, 2, 0))
    batch_encoder_outputs = encoder_outputs.swapaxes(0, 1)
    decoder_context = nd.batch_dot(batch_attention, batch_encoder_outputs)  # changed here
    input_and_context = nd.concat(nd.expand_dims(self.embedding(cur_input), axis=1),
                                  decoder_context, dim=2)
    concat_input = self.rnn_concat_input(input_and_context).reshape((1, -1, 0))
    concat_input = self.dropout(concat_input)
    state = [nd.broadcast_axis(single_layer_state[0], axis=0,
                               size=self.num_layers)]
    output, state = self.rnn(concat_input, state)
    output = self.dropout(output)
    output = self.out(output).reshape((-3, -1))
    return output, state
def calculate_loss(x, y, model, loss, loss_name, class_weight, penalization_coeff):
    """Calculate the loss value.

    Args:
        x (NDArray): input of the model
        y (NDArray): target
        model (Block): model
        loss (gluon.loss): loss function
        loss_name (str): name of the loss function
        class_weight (NDArray): weight of the sample loss value for each category
        penalization_coeff (float): attention penalty coefficient

    Returns:
        NDArray: output of the model
        NDArray: loss value
    """
    pred, att = model(x)
    if loss_name == 'sce':
        l = loss(pred, y)
    elif loss_name == 'wsce':
        l = loss(pred, y, class_weight, class_weight.shape[0])

    # penalty term encouraging diverse attention heads: || A A^T - I ||_F
    diversity_penalty = nd.batch_dot(att, nd.transpose(att, axes=(0, 2, 1))
                                     ) - nd.eye(att.shape[1], ctx=att.context)
    l = l + penalization_coeff * diversity_penalty.norm(axis=(1, 2))
    return pred, l
def forward(self, x):
    if self.routing is not None:
        routing_weight = nd.softmax(nd.zeros(shape=(1, 1, self.num_points),
                                             ctx=x.context), axis=2)
    trans = self.stn(x)
    x = nd.transpose(x, (0, 2, 1))
    x = nd.batch_dot(x, trans)
    x = nd.transpose(x, (0, 2, 1))
    x = nd.relu(self.bn1(self.conv1(x)))
    pointfeat = x
    x = nd.relu(self.bn2(self.conv2(x)))
    x = self.bn3(self.conv3(x))
    if self.routing is not None:
        s = nd.sum(x * routing_weight, axis=2, keepdims=True)
        # v = Squash(s, axis=1)
        for _ in range(self.routing):
            routing_weight = routing_weight + nd.sum(x * s, axis=1, keepdims=True)
            c = nd.softmax(routing_weight, axis=2)
            s = nd.sum(x * c, axis=2, keepdims=True)
            # v = Squash(s, axis=1)
        x = s
    else:
        x = self.mp1(x)
    if self.global_feat:
        return x, trans
    else:
        x = x.repeat(self.num_points, axis=2)
        return nd.concat(x, pointfeat, dim=1), trans
def matmul(self, x, y, transpose_a=False, transpose_b=False):
    x = nd.split(x, self.embedding_size, 2)
    y = nd.split(y, self.embedding_size, 2)
    res = []
    for idx in range(self.embedding_size):
        array = nd.batch_dot(x[idx], y[idx],
                             transpose_a=transpose_a, transpose_b=transpose_b)
        res.append(array.asnumpy().tolist())
    return nd.array(res, ctx=self.ctx)
def augment(points, xforms, r=None):
    points_xformed = nd.batch_dot(points, xforms, name='points_xformed')
    if r is None:
        return points_xformed
    jitter_data = r * mx.random.normal(shape=points_xformed.shape)
    jitter_clipped = nd.clip(jitter_data, -5 * r, 5 * r, name='jitter_clipped')
    return points_xformed + jitter_clipped
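# Usage sketch for augment (hypothetical shapes): applies one 3x3 transform per
# point cloud via nd.batch_dot and adds clipped Gaussian jitter of scale r.
# Assumes the module already imports mxnet as mx and mxnet.ndarray as nd, as
# the function body itself requires.
from mxnet import nd

batch, num_points = 8, 1024
points = nd.random.uniform(shape=(batch, num_points, 3))
xforms = nd.broadcast_axis(nd.eye(3).reshape(1, 3, 3), axis=0, size=batch)
augmented = augment(points, xforms, r=0.01)
print(augmented.shape)  # (8, 1024, 3)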
def forward(self, decoder_output, encoder_output):
    """TODO: Docstring for forward.

    :decoder_output: TODO
    :encoder_output: TODO
    :returns: TODO
    """
    decoder_output = decoder_output.transpose([0, 2, 1])
    score = nd.batch_dot(encoder_output, decoder_output)
    weight = nd.softmax(score, axis=1)
    context = nd.batch_dot(nd.transpose(weight, [0, 2, 1]), encoder_output)
    return context, nd.squeeze(weight)
def bilinear(x, W, y, input_size, seq_len, batch_size, num_outputs=1,
             bias_x=False, bias_y=False):
    """Do xWy

    Parameters
    ----------
    x : NDArray
        (input_size x seq_len) x batch_size
    W : NDArray
        (num_outputs x ny) x nx
    y : NDArray
        (input_size x seq_len) x batch_size
    input_size : int
        input dimension
    seq_len : int
        sequence length
    batch_size : int
        batch size
    num_outputs : int
        number of outputs
    bias_x : bool
        whether to concat a bias vector to input x
    bias_y : bool
        whether to concat a bias vector to input y

    Returns
    -------
    output : NDArray
        [seq_len_y x seq_len_x if num_outputs == 1
         else seq_len_y x num_outputs x seq_len_x] x batch_size
    """
    if bias_x:
        x = nd.concat(x, nd.ones((1, seq_len, batch_size)), dim=0)
    if bias_y:
        y = nd.concat(y, nd.ones((1, seq_len, batch_size)), dim=0)
    ny = input_size + bias_y
    # W: (num_outputs x ny) x nx
    lin = nd.dot(W, x)
    if num_outputs > 1:
        lin = reshape_fortran(lin, (ny, num_outputs * seq_len, batch_size))
    y = y.transpose([2, 1, 0])      # may cause performance issues
    lin = lin.transpose([2, 1, 0])
    blin = nd.batch_dot(lin, y, transpose_b=True)
    blin = blin.transpose([2, 1, 0])
    if num_outputs > 1:
        blin = reshape_fortran(blin, (seq_len, num_outputs, seq_len, batch_size))
    return blin
def forward(self, x, spatial_attention, cheb_polynomials):
    '''
    Chebyshev graph convolution operation

    Parameters
    ----------
    x: mx.ndarray, graph signal matrix,
       shape is (batch_size, N, F, T_{r-1}), F is the num of features

    spatial_attention: mx.ndarray, shape is (batch_size, N, N),
                       spatial attention scores

    Returns
    ----------
    mx.ndarray, shape is (batch_size, N, self.num_of_filters, T_{r-1})
    '''
    (batch_size, num_of_vertices,
     num_of_features, num_of_timesteps) = x.shape

    self.Theta.shape = (self.K, num_of_features, self.num_of_filters)
    self.Theta._finish_deferred_init()

    cur_context = x.context
    outputs = []
    for time_step in range(num_of_timesteps):
        # shape is (batch_size, V, F)
        graph_signal = x[:, :, :, time_step]
        output = nd.zeros(shape=(batch_size, num_of_vertices,
                                 self.num_of_filters), ctx=x.context)
        for k in range(self.K):
            # shape of T_k is (V, V)
            T_k = cheb_polynomials[k].tostype('default').as_in_context(cur_context)

            # shape of T_k_with_at is (batch_size, V, V)
            T_k_with_at = T_k * spatial_attention

            # shape of theta_k is (F, num_of_filters)
            theta_k = self.Theta.data(cur_context)[k]

            # shape is (batch_size, V, F)
            rhs = nd.batch_dot(T_k_with_at, graph_signal)

            output = output + nd.dot(rhs, theta_k)
        outputs.append(output.expand_dims(-1))
    return nd.relu(nd.concat(*outputs, dim=-1))
def forward(self, x):
    '''
    Parameters
    ----------
    x: mx.ndarray, x^{(r - 1)}_h,
       shape is (batch_size, N, C_{r-1}, T_{r-1})

    Returns
    ----------
    E_normalized: mx.ndarray, E', temporal attention scores,
                  shape is (batch_size, T_{r-1}, T_{r-1})
    '''
    _, num_of_vertices, num_of_features, num_of_timesteps = x.shape

    # defer the shapes of the attention parameters
    self.U_1.shape = (num_of_vertices, )
    self.U_2.shape = (num_of_features, num_of_vertices)
    self.U_3.shape = (num_of_features, )
    self.b_e.shape = (1, num_of_timesteps, num_of_timesteps)
    self.V_e.shape = (num_of_timesteps, num_of_timesteps)
    for param in [self.U_1, self.U_2, self.U_3, self.b_e, self.V_e]:
        param._finish_deferred_init()

    # compute temporal attention scores
    cur_context = x.context

    # shape is (batch_size, T, V)
    lhs = nd.dot(nd.dot(x.transpose((0, 3, 2, 1)), self.U_1.data(cur_context)),
                 self.U_2.data(cur_context))

    # shape is (batch_size, V, T)
    rhs = nd.dot(self.U_3.data(cur_context), x.transpose((2, 0, 1, 3)))

    product = nd.batch_dot(lhs, rhs)

    E = nd.dot(self.V_e.data(cur_context),
               nd.sigmoid(product + self.b_e.data(cur_context))
               .transpose((1, 2, 0))).transpose((2, 0, 1))

    # normalization
    E = E - nd.max(E, axis=1, keepdims=True)
    exp = nd.exp(E)
    E_normalized = exp / nd.sum(exp, axis=1, keepdims=True)
    return E_normalized
def forward(self, cur_input, state, encoder_outputs):
    # When the RNN has multiple layers, take the hidden state of the single
    # layer closest to the output layer.
    # state.shape is [(1, batch_size, decoder_hidden_dim)]
    single_layer_state = [state[0][-1].expand_dims(0)]
    # encoder_outputs.shape is (max_seq_len, batch_size * encoder_hidden_dim)
    encoder_outputs = encoder_outputs.reshape(
        (self.max_seq_len, -1, self.encoder_hidden_dim))

    # single_layer_state shape: [(1, batch_size, decoder_hidden_dim)]
    # hidden_broadcast shape: (max_seq_len, batch_size, decoder_hidden_dim)
    hidden_broadcast = nd.broadcast_axis(single_layer_state[0], axis=0,
                                         size=self.max_seq_len)

    # encoder_outputs_and_hiddens shape:
    # (max_seq_len, batch_size, encoder_hidden_dim + decoder_hidden_dim)
    encoder_outputs_and_hiddens = nd.concat(encoder_outputs,
                                            hidden_broadcast, dim=2)

    # energy shape: (max_seq_len, batch_size, 1)
    energy = self.attention(encoder_outputs_and_hiddens)

    # batch_attention shape: (batch_size, 1, max_seq_len)
    batch_attention = nd.softmax(energy, axis=0).transpose((1, 2, 0))

    # batch_encoder_outputs shape: (batch_size, max_seq_len, encoder_hidden_dim)
    batch_encoder_outputs = encoder_outputs.swapaxes(0, 1)

    # decoder_context shape: (batch_size, 1, encoder_hidden_dim)
    decoder_context = nd.batch_dot(batch_attention, batch_encoder_outputs)

    # cur_input shape: (batch_size,)
    # input_and_context shape: (batch_size, 1, decoder_hidden_dim + encoder_hidden_dim)
    input_and_context = nd.concat(nd.expand_dims(self.embedding(cur_input), axis=1),
                                  decoder_context, dim=2)

    # concat_input shape: (1, batch_size, decoder_hidden_dim)
    concat_input = self.rnn_concat_input(input_and_context).reshape((1, -1, 0))
    concat_input = self.dropout(concat_input)

    # When the RNN has multiple layers, initialize every layer's hidden state
    # from the single-layer hidden state.
    state = [
        nd.broadcast_axis(single_layer_state[0], axis=0, size=self.num_layers)
    ]  # XXX note: state is [nd.NDArray]
    output, state = self.rnn(concat_input, state)
    output = self.dropout(output)
    output = self.out(output)
    output = nd.reshape(output, (-3, -1))
    # output shape: (batch_size * 1, output_dim)
    return output, state
def predict(self, x):
    h = self.e(x[:, 0])
    r = self.r(x[:, 1])
    t = self.e(x[:, 2])
    t = t.reshape(-1, self.dim, 1)
    r = r.reshape(-1, self.dim, self.dim)
    tr = nd.batch_dot(r, t)
    tr = tr.reshape(-1, self.dim)
    score = nd.sum(h * tr, -1)
    return -score
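# Standalone sketch (hypothetical tensors) of the scoring trick in predict:
# score(h, r, t) = sum(h * (R t)), with one (dim x dim) relation matrix R per
# triple applied through a single nd.batch_dot.
from mxnet import nd

batch, dim = 5, 8
h = nd.random.uniform(shape=(batch, dim))
R = nd.random.uniform(shape=(batch, dim, dim))
t = nd.random.uniform(shape=(batch, dim))
tr = nd.batch_dot(R, t.reshape(-1, dim, 1)).reshape(-1, dim)  # (batch, dim)
score = nd.sum(h * tr, axis=-1)                               # (batch,)
print(score.shape)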
def forward(self, input_data):
    x = nd.transpose(input_data, axes=(1, 0, 2))
    h = nd.transpose(self.gru(x), axes=(1, 0, 2))  # (m, 60, 100)
    h = nd.tanh(h)
    g = self.att(h)                          # (m, 60, 1)
    g = nd.softmax(g, axis=1)
    gt = nd.transpose(g, axes=(0, 2, 1))     # (m, 1, 60)
    n = nd.batch_dot(gt, h)
    y = self.att_out(n)
    return self.output(y)
def forward(self, x_left, x_right):
    x_left = self.embed_left(x_left)
    x_right = self.embed_right(x_right)
    embed_cross = nd.expand_dims(
        nd.batch_dot(x_left, x_right, transpose_b=True), 3)
    embed_cross = nd.transpose(embed_cross, (0, 3, 1, 2))
    embed_cross = self._conv_block(embed_cross)
    embed_pool = self.pool(embed_cross)
    out = self.output_layer(embed_pool)
    return out
def forward(self, emb_a, emb_b):
    # emb_a: batch_size * seq_len_a * emb_size
    # emb_b: batch_size * seq_len_b * emb_size
    # self.W: emb_size * emb_size
    # After the evaluation, the shape is batch_size * seq_len_a * seq_len_b
    dot_product = nd.batch_dot(nd.dot(emb_a, self.W.data()),
                               nd.transpose(emb_b, axes=(0, 2, 1)))
    # this softmax is subject to severe numerical instability,
    # so subtract the per-column max as a workaround
    G_ab = nd.softmax(dot_product - nd.max(dot_product, axis=1, keepdims=True),
                      axis=1)
    return G_ab
def compute_curvature(nn_pts):
    nn_pts_mean = nd.mean(nn_pts, axis=2, keepdims=True)  # (N, P, 1, 3)
    nn_pts_demean = nn_pts - nn_pts_mean                   # (N, P, K, 3)
    nn_pts_NPK31 = nd.expand_dims(nn_pts_demean, axis=-1)
    covariance_matrix = nd.batch_dot(nn_pts_NPK31, nn_pts_NPK31,
                                     transpose_b=True)     # (N, P, K, 3, 3)
    covariance_matrix_mean = nd.mean(covariance_matrix, axis=2,
                                     keepdims=False)       # (N, P, 3, 3)
    eigvals = compute_eigenvals(covariance_matrix_mean)    # (N, P, 3)
    curvature = nd.min(eigvals, axis=-1) / (nd.sum(eigvals, axis=-1) + 1e-8)
    return curvature
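# Standalone sketch (hypothetical shapes) of the covariance construction above:
# nd.batch_dot of (3, 1) column vectors with their own transposes gives
# per-point outer products, and averaging over the K neighbors yields the 3x3
# neighborhood covariance. The leading dims are flattened first so batch_dot
# only ever sees 3-D inputs.
from mxnet import nd

N, P, K = 2, 6, 16
nn_pts = nd.random.uniform(shape=(N, P, K, 3))
demeaned = nn_pts - nd.mean(nn_pts, axis=2, keepdims=True)    # (N, P, K, 3)
cols = demeaned.reshape(N * P * K, 3, 1)
outer = nd.batch_dot(cols, cols, transpose_b=True)            # (N*P*K, 3, 3)
cov = nd.mean(outer.reshape(N, P, K, 3, 3), axis=2)           # (N, P, 3, 3)
print(cov.shape)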
def bdd_message_func(self, edges):
    """Message function for the block-diagonal-decomposition regularizer"""
    ctx = edges.src['h'].context
    if edges.src['h'].dtype in (np.int32, np.int64) and len(edges.src['h'].shape) == 1:
        raise TypeError('Block decomposition does not allow integer ID feature.')
    weight = self.weight.data(ctx)[edges.data['type'], :].reshape(
        -1, self.submat_in, self.submat_out)
    node = edges.src['h'].reshape(-1, 1, self.submat_in)
    msg = nd.batch_dot(node, weight).reshape(-1, self.out_feat)
    if 'norm' in edges.data:
        msg = msg * edges.data['norm']
    return {'msg': msg}
def make_dynamic_dec(T, values_L):
    values_T = nd.array(np.linspace(1, T, num=T), ctx=values_L.context)
    values_T = nd.expand_dims(nd.expand_dims(values_T, axis=0), axis=2)
    values_T = nd.broadcast_axis(values_T, axis=0, size=values_L.shape[0])
    values_TL = nd.batch_dot(values_T, values_L, transpose_b=True)
    values_sin = nd.sin(values_TL)
    values_cos = nd.cos(values_TL)
    return nd.concat(values_sin, values_cos, dim=2)
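# Usage sketch for make_dynamic_dec (hypothetical shapes): values_L carries one
# frequency column per sequence, so the result is a sin/cos encoding over the
# T time steps, concatenated along the last axis.
import numpy as np
from mxnet import nd

T = 10
values_L = nd.random.uniform(shape=(3, 4, 1))  # (batch, num_frequencies, 1)
enc = make_dynamic_dec(T, values_L)
print(enc.shape)  # (3, 10, 8): sin and cos of the (T x 4) products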
def forward(self, feature, data):
    """Forward process of a HyperDense layer.

    Args:
        feature: an NDArray with shape [n, d]
        data: an NDArray with shape [n, b, pre_d]

    Returns:
        output: an NDArray with shape [n, b, d]
    """
    weight = self.w_mlp(feature)  # [n, pre_hidden_size * hidden_size]
    weight = nd.reshape(weight, (-1, self.pre_hidden_size, self.hidden_size))
    bias = nd.reshape(self.b_mlp(feature), shape=(-1, 1, 1))  # [n, 1, 1]
    return nd.batch_dot(data, weight) + bias
def forward(self, x, time, context):
    hid = []
    hid.append(x)

    # m_i = sum A_ij * x_ij + T_A_i
    Ain_c = self.A(context)
    Ain_t = self.T_A(time)
    Ain = Ain_c + Ain_t

    # c_i = sum B_ij * u + T_B_i
    Bin_c = self.B(context)
    Bin_t = self.T_B(time)
    Bin = Bin_c + Bin_t

    for h in range(self.nhop):
        hid3dim = hid[-1].expand_dims(1)
        Aout = nd.batch_dot(hid3dim, Ain.swapaxes(1, 2))
        Aout2dim = Aout.reshape((-1, self.mem_size))
        P = nd.softmax(Aout2dim, axis=1)

        Prob3dim = P.expand_dims(1)
        Bout = nd.batch_dot(Prob3dim, Bin)
        Bout2dim = Bout.reshape((-1, self.edim))

        Cout = self.C(hid[-1])
        Dout = Bout2dim + Cout
        if self.lindim == self.edim:
            hid.append(Dout)
        elif self.lindim == 0:
            hid.append(nd.relu(Dout))
        else:
            F = Dout[:, :self.lindim]
            G = Dout[:, self.lindim:]
            K = nd.relu(G)
            hid.append(nd.concat(F, K, dim=1))

    z = self.W(hid[-1])
    return z
def forward(self, query, key, value, valid_length=None):
    """Forward function"""
    d = query.shape[-1]
    scores = nd.batch_dot(query, key, transpose_b=True) / math.sqrt(d)
    attention_weights = self.dropout(masked_softmax(scores, valid_length))
    return nd.batch_dot(attention_weights, value)