def simple_rnn(inputs, state, temperature=1.0):
    outputs = []
    h = state
    for X in inputs:
        h_linear = nd.dot(X, Wxh) + nd.dot(h, Whh) + bh
        h = nd.tanh(h_linear)
        yhat_linear = nd.dot(h, Why) + by
        yhat = softmax(yhat_linear, temperature=temperature)
        outputs.append(yhat)
    return (outputs, h)
def lstm(_inputs, initial_state_h, initial_state_c, *parameters):
    # _inputs: a list with length num_steps,
    # corresponding element: batch_size * input_dim matrix
    H = initial_state_h  # hidden state
    C = initial_state_c  # memory cell
    [W_xi, W_hi, b_i,
     W_xf, W_hf, b_f,
     W_xo, W_ho, b_o,
     W_xc, W_hc, b_c,
     W_hy, b_y] = parameters
    _outputs = []

    for X in _inputs:
        # compute INPUT gate from input and last/initial hidden state
        input_gate = nd.sigmoid(nd.dot(X, W_xi) + nd.dot(H, W_hi) + b_i)
        # compute FORGET gate from input and last/initial hidden state
        forget_gate = nd.sigmoid(nd.dot(X, W_xf) + nd.dot(H, W_hf) + b_f)
        # compute OUTPUT gate from input and last/initial hidden state
        output_gate = nd.sigmoid(nd.dot(X, W_xo) + nd.dot(H, W_ho) + b_o)
        # compute memory cell candidate from input and last/initial hidden state
        memory_cell_candidate = nd.tanh(nd.dot(X, W_xc) + nd.dot(H, W_hc) + b_c)
        # compute memory cell from last memory cell and memory cell candidate
        C = forget_gate * C + input_gate * memory_cell_candidate
        # compute hidden state from output gate and memory cell
        H = output_gate * nd.tanh(C)
        # compute output from hidden state
        Y = nd.dot(H, W_hy) + b_y
        _outputs.append(Y)

    return _outputs, H, C
def perceptron(w, b, x, y):
    if (y * (nd.dot(w, x) + b)).asscalar() <= 0:
        w += y * x
        b += y
        return 1
    else:
        return 0
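# A minimal usage sketch for the perceptron update above (not part of the
# original source): the feature vector, label, and initial parameters below are
# made-up illustration values. The function returns 1 when the sample was
# misclassified and the parameters were updated in place, 0 otherwise.
from mxnet import nd

w = nd.zeros(2)            # weight vector, updated in place
b = nd.zeros(1)            # bias, updated in place
x = nd.array([1.0, -0.5])  # one feature vector
y = nd.array([1.0])        # its +1/-1 label

print(perceptron(w, b, x, y))  # 1: the first sample is always a "mistake" for w = 0
print(w, b)                    # w == y * x and b == y after the update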
def gru(_inputs, initial_state, *parameters):
    # _inputs: a list with length num_steps,
    # corresponding element: batch_size * input_dim matrix
    H = initial_state
    [W_xz, W_hz, b_z,
     W_xr, W_hr, b_r,
     W_xh, W_hh, b_h,
     W_hy, b_y] = parameters
    _outputs = []

    for X in _inputs:
        # compute update gate from input and last/initial hidden state
        update_gate = nd.sigmoid(nd.dot(X, W_xz) + nd.dot(H, W_hz) + b_z)
        # compute reset gate from input and last/initial hidden state
        reset_gate = nd.sigmoid(nd.dot(X, W_xr) + nd.dot(H, W_hr) + b_r)
        # compute candidate hidden state from input, reset gate and
        # last/initial hidden state
        H_candidate = nd.tanh(nd.dot(X, W_xh) +
                              reset_gate * nd.dot(H, W_hh) + b_h)
        # compute hidden state from candidate hidden state and last hidden state
        H = update_gate * H + (1 - update_gate) * H_candidate
        # compute output from hidden state
        Y = nd.dot(H, W_hy) + b_y
        _outputs.append(Y)

    return _outputs, H
def plotscore(w, d):
    # note: `b` is taken from the enclosing scope; the `d` argument is unused here
    xgrid = np.arange(-3, 3, 0.02)
    ygrid = np.arange(-3, 3, 0.02)
    xx, yy = np.meshgrid(xgrid, ygrid)
    zz = nd.zeros(shape=(xgrid.size, ygrid.size, 2))
    zz[:, :, 0] = nd.array(xx)
    zz[:, :, 1] = nd.array(yy)
    vv = nd.dot(zz, w) + b
    CS = plt.contour(xgrid, ygrid, vv.asnumpy())
    plt.clabel(CS, inline=1, fontsize=10)
def _spectral_norm(self):
    """ spectral normalization """
    w = self.params.get('weight').data(self.ctx)
    w_mat = nd.reshape(w, [w.shape[0], -1])

    _u = self.u.data(self.ctx)
    _v = None

    for _ in range(POWER_ITERATION):
        _v = nd.L2Normalization(nd.dot(_u, w_mat))
        _u = nd.L2Normalization(nd.dot(_v, w_mat.T))

    sigma = nd.sum(nd.dot(_u, w_mat) * _v)
    if sigma == 0.:
        sigma = EPSILON

    self.params.setattr('u', _u)

    return w / sigma
def batchwise_covariance(X, Y):
    meanx = meany = vary = n = C = 0
    for x, y in zip(X, Y):
        m = len(x)
        meanx_ = x.mean(axis=0, keepdims=True)
        meany_ = y.mean(axis=0, keepdims=True)
        dx = x - meanx_
        dy = y - meany_

        C_ = nd.dot(dx, dy, transpose_a=True)
        C += C_ + nd.dot((meanx - meanx_), (meany - meany_),
                         transpose_a=True) * n * m / (n + m)

        vary_ = nd.sum(dy ** 2, axis=0)
        vary += vary_ + ((meany - meany_) ** 2) * n * m / (n + m)

        meanx = (n * meanx + m * meanx_) / (n + m)
        meany = (n * meany + m * meany_) / (n + m)
        n += m
    return C / n, vary / n
def lstm_rnn(inputs, h, c, temperature=1.0):
    outputs = []
    for X in inputs:
        g = nd.tanh(nd.dot(X, Wxg) + nd.dot(h, Whg) + bg)
        i = nd.sigmoid(nd.dot(X, Wxi) + nd.dot(h, Whi) + bi)
        f = nd.sigmoid(nd.dot(X, Wxf) + nd.dot(h, Whf) + bf)
        o = nd.sigmoid(nd.dot(X, Wxo) + nd.dot(h, Who) + bo)
        #######################
        # update the memory cell and hidden state
        #######################
        c = f * c + i * g
        h = o * nd.tanh(c)
        #######################
        # map the hidden state to an output distribution
        #######################
        yhat_linear = nd.dot(h, Why) + by
        yhat = softmax(yhat_linear, temperature=temperature)
        outputs.append(yhat)
    return (outputs, h, c)
def gru_rnn(inputs, h, temperature=1.0):
    outputs = []
    for X in inputs:
        z = nd.sigmoid(nd.dot(X, Wxz) + nd.dot(h, Whz) + bz)
        r = nd.sigmoid(nd.dot(X, Wxr) + nd.dot(h, Whr) + br)
        g = nd.tanh(nd.dot(X, Wxh) + nd.dot(r * h, Whh) + bh)
        h = z * h + (1 - z) * g
        yhat_linear = nd.dot(h, Why) + by
        yhat = softmax(yhat_linear, temperature=temperature)
        outputs.append(yhat)
    return (outputs, h)
def cov_reg_it(self, x, y, mean_x, mean_xy, num_x, num_y, regime):
    cond_y = regime(y)
    # number of times each sample's x for w.t dot x was inside the regime
    num_n = cond_y.sum(axis=1, keepdims=True)
    # => weighted sum over x
    wsum_x = nd.dot(num_n, x, transpose_a=True)
    # y's in regime
    reg_y = y * cond_y
    # sum of xy's in regime
    # sum_xy = (reg_y.expand_dims(axis=2) * x.expand_dims(axis=1)).sum(axis=0)
    sum_xy = nd.dot(reg_y, x, transpose_a=True)

    num_x_cur = num_n.sum()
    mean_x = (num_x * mean_x + wsum_x) / (num_x + num_x_cur + 1e-12)

    num_y_cur = cond_y.sum(axis=0)
    mean_xy = (num_y.T * mean_xy + sum_xy) / (num_y + num_y_cur + 1e-12).T

    num_y += num_y_cur

    return mean_x, mean_xy, num_y
def hue(src, delta, p=0.5):
    """Hue distortion"""
    if np.random.uniform(0, 1) > p:
        alpha = np.random.uniform(-delta, delta)
        u = np.cos(alpha * np.pi)
        w = np.sin(alpha * np.pi)
        bt = np.array([[1.0, 0.0, 0.0],
                       [0.0, u, -w],
                       [0.0, w, u]])
        tyiq = np.array([[0.299, 0.587, 0.114],
                         [0.596, -0.274, -0.321],
                         [0.211, -0.523, 0.311]])
        ityiq = np.array([[1.0, 0.956, 0.621],
                          [1.0, -0.272, -0.647],
                          [1.0, -1.107, 1.705]])
        t = np.dot(np.dot(ityiq, bt), tyiq).T
        src = nd.dot(src, nd.array(t, ctx=src.context))
        return src
    return src
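# A minimal usage sketch for hue() above (not part of the original source):
# the image is a made-up HWC float tensor, and p=0.0 is chosen so that the
# distortion is (almost surely) applied.
import numpy as np
from mxnet import nd

img = nd.random.uniform(0, 255, shape=(32, 32, 3))
out = hue(img, delta=0.5, p=0.0)
print(out.shape)  # (32, 32, 3), channels mixed by the YIQ rotation matrix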
def forward(self, x):
    x = self.dense(x)
    # Use the constant parameters created, as well as the relu and dot
    # functions of NDArray
    x = nd.relu(nd.dot(x, self.rand_weight.data()) + 1)
    # Reuse the fully connected layer. This is equivalent to sharing
    # parameters with two fully connected layers
    x = self.dense(x)
    # Here in control flow, we need to call asscalar to return the scalar
    # for comparison
    while x.norm().asscalar() > 1:
        x /= 2
    if x.norm().asscalar() < 0.8:
        x *= 10
    return x.sum()
def evaluate_FITB_accuracy(data_loader: AsyncDataLoader, model):
    '''
    Measures the accuracy of the model in indicating the correct variable
    '''
    with data_loader as data_loader:
        correct = 0
        for split_batch, batch_length in tqdm(data_loader, total=data_loader.total_batches):
            batches_outputs = [(batch, model(batch.data)) for batch in split_batch]
            for batch, output in batches_outputs:
                predictions_labels = model.unbatchify(batch, output)
                for prediction, label in predictions_labels:
                    correct += int(nd.dot(prediction, label).asscalar())
        return correct / len(data_loader)
def net(X, Verbose=False):
    X = X.as_in_context(W1.context)  # keep X on the same device as W1
    # first convolutional layer
    h1_conv = nd.Convolution(data=X, weight=W1, bias=b1,
                             kernel=W1.shape[2:], num_filter=W1.shape[0])
    h1_activation = nd.relu(h1_conv)
    h1 = nd.Pooling(data=h1_activation, pool_type="max",
                    kernel=(2, 2), stride=(2, 2))
    # second convolutional layer
    h2_conv = nd.Convolution(data=h1, weight=W2, bias=b2,
                             kernel=W2.shape[2:], num_filter=W2.shape[0])
    h2_activation = nd.relu(h2_conv)
    h2 = nd.Pooling(data=h2_activation, pool_type="max",
                    kernel=(2, 2), stride=(2, 2))
    h2 = h2.flatten()
    # third layer: fully connected
    h3 = nd.relu(nd.dot(h2, W3) + b3)
    # fourth layer: fully connected
    h4 = nd.dot(h3, W4) + b4
    if Verbose:
        print('1st conv block:', h1.shape)
        print('2nd conv block:', h2.shape)
        print('1st dense:', h3.shape)
        print('2nd dense:', h4.shape)
        print('output:', h4)
    return h4
def forward(self, x, spatial_attention):
    '''
    Chebyshev graph convolution operation

    Parameters
    ----------
    x: mx.ndarray, graph signal matrix
       shape is (batch_size, N, F, T_{r-1}), F is the num of features

    spatial_attention: mx.ndarray, shape is (batch_size, N, N)
                       spatial attention scores

    Returns
    ----------
    mx.ndarray, shape is (batch_size, N, self.num_of_filters, T_{r-1})

    '''
    (batch_size, num_of_vertices,
     num_of_features, num_of_timesteps) = x.shape

    self.Theta.shape = (self.K, num_of_features, self.num_of_filters)
    self.Theta._finish_deferred_init()

    outputs = []
    for time_step in range(num_of_timesteps):
        # shape is (batch_size, V, F)
        graph_signal = x[:, :, :, time_step]
        output = nd.zeros(shape=(batch_size, num_of_vertices,
                                 self.num_of_filters), ctx=x.context)
        for k in range(self.K):
            # shape of T_k is (V, V)
            T_k = self.cheb_polynomials[k]

            # shape of T_k_with_at is (batch_size, V, V)
            T_k_with_at = T_k * spatial_attention

            # shape of theta_k is (F, num_of_filters)
            theta_k = self.Theta.data()[k]

            # shape is (batch_size, V, F)
            rhs = nd.batch_dot(T_k_with_at.transpose((0, 2, 1)), graph_signal)

            output = output + nd.dot(rhs, theta_k)
        outputs.append(output.expand_dims(-1))
    return nd.relu(nd.concat(*outputs, dim=-1))
def lstm_rnn(self, inputs, h, c, temperature=1.0):
    outputs = []
    for X in inputs:
        # if not X.shape[0] == 77:
        #     continue
        X = nd.one_hot(X, 60)
        # print("X.shape", X.shape, self.Wxg.shape, self.Whg.shape, h.shape)
        g = nd.tanh(nd.dot(X, self.Wxg) + nd.dot(h, self.Whg) + self.bg)
        i = nd.sigmoid(nd.dot(X, self.Wxi) + nd.dot(h, self.Whi) + self.bi)
        f = nd.sigmoid(nd.dot(X, self.Wxf) + nd.dot(h, self.Whf) + self.bf)
        o = nd.sigmoid(nd.dot(X, self.Wxo) + nd.dot(h, self.Who) + self.bo)
        c = f * c + i * g
        h = o * nd.tanh(c)  # tanh (not tan): standard LSTM hidden-state update
        yhat_linear = nd.dot(h, self.Why) + self.by
        yhat = self.softmax(yhat_linear, temperature=temperature)
        # yhat = mx.ndarray.softmax(yhat_linear, temperature=temperature)
        outputs.append(yhat)
    return (outputs, h, c)
def __getTwoCross(self, X):
    batch = 0
    t = None
    for x in tqdm(X):
        s = 0
        for j1 in range(len(x)):
            for j2 in range(j1 + 1, len(x)):
                s += (nd.dot(self.bw[j1], self.bw[j2]) * x[j1] * x[j2]).asscalar()
        s = nd.array([[s]], dtype='float64')
        if batch == 0:
            t = nd.array(s)
        else:
            t = nd.concat(t, s, dim=0)
        batch += 1
    return t
def forward(self, x):
    root = next(iter(self._structure.items()))[0]

    if (len(self._routerlayer) > 0):
        router_d, embedd_d = self._contextify(x)(root)

        embedd = nd.stack(*[embedd_d[key] for key in sorted(embedd_d)], axis=0)
        router = nd.stack(*[router_d[key] for key in sorted(router_d)], axis=-1)

        return nd.dot(router, embedd)
    else:
        head = nd.ones_like(nd.slice_axis(x, axis=1, begin=0, end=None))
        return self._contextify(x)(root) * head
def stats_batchwise(x_bat, y_bat, n, x_mean, y_mean, x_var=None, y_var=None,
                    xx_cov=None, yy_cov=None, xy_cov=None,
                    x_mean_skip=False, y_mean_skip=False):
    m = x_bat.shape[0]

    x_bat_mean = x_bat.mean(axis=0, keepdims=True)
    y_bat_mean = y_bat.mean(axis=0, keepdims=True)

    dx = x_bat - x_bat_mean
    dy = y_bat - y_bat_mean

    if x_var is not None:
        x_bat_var = nd.sum(dx ** 2, axis=0)
        x_var += x_bat_var + ((x_mean - x_bat_mean) ** 2) * n * m / (n + m)

    if y_var is not None:
        y_bat_var = nd.sum(dy ** 2, axis=0)
        y_var += y_bat_var + ((y_mean - y_bat_mean) ** 2) * n * m / (n + m)

    if xx_cov is not None:
        xx_bat_cov = nd.dot(dx, dx, transpose_a=True)
        xx_cov += xx_bat_cov + nd.dot((x_mean - x_bat_mean), (x_mean - x_bat_mean),
                                      transpose_a=True) * n * m / (n + m)

    if yy_cov is not None:
        yy_bat_cov = nd.dot(dy, dy, transpose_a=True)
        yy_cov += yy_bat_cov + nd.dot((y_mean - y_bat_mean), (y_mean - y_bat_mean),
                                      transpose_a=True) * n * m / (n + m)

    if xy_cov is not None:
        xy_bat_cov = nd.dot(dy, dx, transpose_a=True)
        xy_cov += xy_bat_cov + nd.dot((y_mean - y_bat_mean), (x_mean - x_bat_mean),
                                      transpose_a=True) * n * m / (n + m)

    if not x_mean_skip:
        x_mean = (n * x_mean + m * x_bat_mean) / (n + m)

    if not y_mean_skip:
        y_mean = (n * y_mean + m * y_bat_mean) / (n + m)

    n += m

    return n, x_mean, y_mean, x_var, y_var, xx_cov, yy_cov, xy_cov
def compute_vertex_layer(self, layer: int, vertex: int,
                         subgraph: Subgraph) -> NDArray:
    feature_sum = nd.zeros(shape=(self._feature_layers[layer - 1][vertex].size, 1),
                           ctx=data_ctx)
    for neighbor in self._graph.vertices[vertex].neighbors:
        prev = self._feature_layers[layer - 1][neighbor.id]
        prev_act = prev if layer == 1 else self._act(prev)
        feature_sum = feature_sum + prev_act / math.sqrt(
            subgraph.degree * neighbor.degree)
    res = self._b[layer - 1].data() + nd.dot(
        self._W[layer - 1].data(), feature_sum.as_in_context(model_ctx))
    res = res.as_in_context(data_ctx)
    return res \
        if not self._concatenate_features or layer == self._num_layers - 1 \
        else nd.concat(self._feature_layers[layer - 1][vertex], res, dim=0)
def gru_rnn(inputs, H, *params):
    # inputs: num_steps matrices, each of shape batch_size * vocab_size
    # H: a matrix of shape batch_size * hidden_dim
    # outputs: num_steps matrices, each of shape batch_size * vocab_size
    W_xz, W_hz, b_z, W_xr, W_hr, b_r, W_xh, W_hh, b_h, W_hy, b_y = params
    outputs = []
    for X in inputs:
        Z = nd.sigmoid(nd.dot(X, W_xz) + nd.dot(H, W_hz) + b_z)
        R = nd.sigmoid(nd.dot(X, W_xr) + nd.dot(H, W_hr) + b_r)
        H_tilda = nd.tanh(nd.dot(X, W_xh) + R * nd.dot(H, W_hh) + b_h)
        H = Z * H + (1 - Z) * H_tilda
        Y = nd.dot(H, W_hy) + b_y
        outputs.append(Y)
    return (outputs, H)
def basis_message_func(self, edges):
    """Message function for basis regularizer"""
    ctx = edges.src['h'].context
    if self.num_bases < self.num_rels:
        # generate all weights from bases
        weight = self.weight.data(ctx).reshape(
            self.num_bases, self.in_feat * self.out_feat)
        weight = nd.dot(self.w_comp.data(ctx), weight).reshape(
            self.num_rels, self.in_feat, self.out_feat)
    else:
        weight = self.weight.data(ctx)

    msg = utils.bmm_maybe_select(edges.src['h'], weight, edges.data['type'])
    if 'norm' in edges.data:
        msg = msg * edges.data['norm']
    return {'msg': msg}
def load_data_linear_regression(true_w, true_b, num_train=1000, num_test=0):
    """Generate a synthetic linear-regression dataset: y = Xw + b + noise."""
    assert isinstance(true_w, list)
    assert isinstance(true_b, float)
    num_features = len(true_w)
    true_w = nd.array(true_w)
    true_b = nd.array([true_b, ])
    x = nd.random.normal(scale=1, shape=(num_train + num_test, num_features))
    y = nd.dot(x, true_w) + true_b
    y += nd.random.normal(scale=0.01, shape=y.shape)
    return x, y
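# A minimal usage sketch (not part of the original source); the weight vector
# and bias below are made-up illustration values.
from mxnet import nd

features, labels = load_data_linear_regression([2.0, -3.4], 4.2, num_train=1000)
print(features.shape, labels.shape)  # (1000, 2) (1000,)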
def forward(self, user_id, text, topics):
    user_word = self.emb_uw(user_id)
    word_emb = self.emb_word(text)

    topics_emb = self.emb_word(topics)
    topics_emb = nd.transpose(topics_emb, axes=(1, 0))
    topics_emb = nd.reshape(topics_emb, (self.word_dim, self.topics_num, 1))
    topics_emb = nd.dot(word_emb, topics_emb)
    topics_emb = nd.reshape(topics_emb,
                            (self.batch_size, self.sentence_length, self.topics_num))
    topics_emb = nd.softmax(topics_emb, axis=2)
    topics_emb = self.mlp_topic(topics_emb)

    word_emb = self.mlp_word(word_emb)

    xw = nd.concat(user_word, word_emb, topics_emb, dim=1)
    xw_1 = self.mlp_w1(xw)
    xw_2 = self.mlp_w2(xw_1)
    res = self.mlp(xw_2)
    return res
def gru(self, inputs, state):
    W_xz, W_hz, b_z, W_xr, W_hr, b_r, W_xh, W_hh, b_h, W_hq, b_q = self.params
    H = state
    outputs = []
    for X in inputs:
        Z = nd.sigmoid(nd.dot(X, W_xz) + nd.dot(H, W_hz) + b_z)
        R = nd.sigmoid(nd.dot(X, W_xr) + nd.dot(H, W_hr) + b_r)
        H_tilda = nd.tanh(nd.dot(R * H, W_hh) + nd.dot(X, W_xh) + b_h)
        H = Z * H + (1 - Z) * H_tilda
        Y = nd.dot(H, W_hq) + b_q
        outputs.append(Y)
    return outputs, H
def bilinear(x, W, y, input_size, seq_len, batch_size,
             num_outputs=1, bias_x=False, bias_y=False):
    """Do xWy

    Parameters
    ----------
    x : NDArray
        (input_size x seq_len) x batch_size
    W : NDArray
        (num_outputs x ny) x nx
    y : NDArray
        (input_size x seq_len) x batch_size
    input_size : int
        input dimension
    seq_len : int
        sequence length
    batch_size : int
        batch size
    num_outputs : int
        number of outputs
    bias_x : bool
        whether concat bias vector to input x
    bias_y : bool
        whether concat bias vector to input y

    Returns
    -------
    output : NDArray
        [seq_len_y x seq_len_x if output_size == 1
        else seq_len_y x num_outputs x seq_len_x] x batch_size
    """
    if bias_x:
        x = nd.concat(x, nd.ones((1, seq_len, batch_size)), dim=0)
    if bias_y:
        y = nd.concat(y, nd.ones((1, seq_len, batch_size)), dim=0)

    ny = input_size + bias_y
    # W: (num_outputs x ny) x nx
    lin = nd.dot(W, x)
    if num_outputs > 1:
        lin = reshape_fortran(lin, (ny, num_outputs * seq_len, batch_size))
    y = y.transpose([2, 1, 0])  # May cause performance issues
    lin = lin.transpose([2, 1, 0])
    blin = nd.batch_dot(lin, y, transpose_b=True)
    blin = blin.transpose([2, 1, 0])
    if num_outputs > 1:
        blin = reshape_fortran(blin, (seq_len, num_outputs, seq_len, batch_size))
    return blin
def lstm_rnn(inputs, state_h, state_c, *params):
    # inputs: num_steps matrices, each of shape batch_size * vocab_size
    # H: a matrix of shape batch_size * hidden_dim
    # outputs: num_steps matrices, each of shape batch_size * vocab_size
    [W_xi, W_hi, b_i,
     W_xf, W_hf, b_f,
     W_xo, W_ho, b_o,
     W_xc, W_hc, b_c,
     W_hy, b_y] = params

    H = state_h
    C = state_c
    outputs = []
    for X in inputs:
        I = nd.sigmoid(nd.dot(X, W_xi) + nd.dot(H, W_hi) + b_i)
        F = nd.sigmoid(nd.dot(X, W_xf) + nd.dot(H, W_hf) + b_f)
        O = nd.sigmoid(nd.dot(X, W_xo) + nd.dot(H, W_ho) + b_o)
        C_tilda = nd.tanh(nd.dot(X, W_xc) + nd.dot(H, W_hc) + b_c)
        C = F * C + I * C_tilda
        H = O * nd.tanh(C)
        Y = nd.dot(H, W_hy) + b_y
        outputs.append(Y)
    return (outputs, H, C)
def forward(self, inputs, state):
    """ forward function """
    h, c = state
    outputs = []
    for x in inputs:
        i = nd.sigmoid(
            nd.dot(x, self.w_xi) + nd.dot(h, self.w_hi) + self.b_i)
        f = nd.sigmoid(
            nd.dot(x, self.w_xf) + nd.dot(h, self.w_hf) + self.b_f)
        o = nd.sigmoid(
            nd.dot(x, self.w_xo) + nd.dot(h, self.w_ho) + self.b_o)
        c_tilda = nd.tanh(
            nd.dot(x, self.w_xc) + nd.dot(h, self.w_hc) + self.b_c)
        c = f * c + i * c_tilda
        h = o * nd.tanh(c)  # standard LSTM: the output gate scales tanh(c)
        y = nd.dot(h, self.w_hq) + self.b_q
        outputs.append(y)
    y_hat = nd.concat(*outputs, dim=0)
    return y_hat, (h, c)
def name_face(self, person_face):
    ## Name the face of a person based on the dataset
    face = self.model.get_input(person_face)
    if face is None:
        return None
    face = nd.array(self.model.get_feature(face), ctx=self.ctx)

    # Calculate the similarity between the known features and the current face feature
    sim = nd.dot(self.dataset, face)
    scores = {}
    for known_id, index in self.names.items():
        scores[known_id] = max(sim[index]).asnumpy()

    if max(scores.values()) > self.args.threshold_face:
        return max(scores, key=scores.get)
    else:
        return None
def forward(self, adj, feat):
    r"""

    Description
    -----------
    Compute (Dense) Graph SAGE layer.

    Parameters
    ----------
    adj : mxnet.NDArray
        The adjacency matrix of the graph to apply SAGE Convolution on. When
        applied to a unidirectional bipartite graph, ``adj`` should be of shape
        :math:`(N_{out}, N_{in})`; when applied to a homogeneous graph, ``adj``
        should be of shape :math:`(N, N)`. In both cases, a row represents a
        destination node while a column represents a source node.
    feat : mxnet.NDArray or a pair of mxnet.NDArray
        If a mxnet.NDArray is given, the input feature of shape :math:`(N, D_{in})`
        where :math:`D_{in}` is size of input feature, :math:`N` is the number of nodes.
        If a pair of mxnet.NDArray is given, the pair must contain two tensors of shape
        :math:`(N_{in}, D_{in})` and :math:`(N_{out}, D_{in})`.

    Returns
    -------
    mxnet.NDArray
        The output feature of shape :math:`(N, D_{out})` where :math:`D_{out}`
        is size of output feature.
    """
    check_eq_shape(feat)
    if isinstance(feat, tuple):
        feat_src = self.feat_drop(feat[0])
        feat_dst = self.feat_drop(feat[1])
    else:
        feat_src = feat_dst = self.feat_drop(feat)
    adj = adj.astype(feat_src.dtype).as_in_context(feat_src.context)
    in_degrees = adj.sum(axis=1, keepdims=True)
    h_neigh = (nd.dot(adj, feat_src) + feat_dst) / (in_degrees + 1)
    rst = self.fc(h_neigh)
    # activation
    if self.activation is not None:
        rst = self.activation(rst)
    # normalization
    if self._norm is not None:
        rst = self._norm(rst)
    return rst
def backward(self, DY):
    '''
    Backward-passes an input error gradient DY towards the input neurons of this layer.

    Parameters
    ----------

    DY : mxnet.ndarray.ndarray.NDArray
        an error gradient shaped same as the output array of forward, i.e. (N,Hy,Wy,Dy) with
        N = number of samples in the batch
        Hy = height of the output
        Wy = width of the output
        Dy = output depth = input depth

    Returns
    -------

    DX : mxnet.ndarray.ndarray.NDArray
        the error gradient propagated towards the input
    '''

    self.DY = DY
    N, Hy, Wy, NF = DY.shape
    hf, wf, df, NF = self.W.shape
    hstride, wstride = self.stride

    DX = nd.zeros_like(self.X, ctx=self.ctx, dtype=self.dtype)

    if not (hf == wf and self.stride == (1, 1)):
        for i in range(Hy):
            for j in range(Wy):
                DX[:, i * hstride:i * hstride + hf, j * wstride:j * wstride + wf, :] += \
                    (nd.expand_dims(self.W, axis=0) *
                     nd.expand_dims(DY[:, i:i + 1, j:j + 1, :], axis=3)
                     ).sum(axis=4)  # sum over all the filters
    else:
        for i in range(hf):
            for j in range(wf):
                DX[:, i:i + Hy:hstride, j:j + Wy:wstride, :] += nd.dot(DY, self.W[i, j, :, :].T)

    return DX  # * (hf*wf*df)**.5 / (NF*Hy*Wy)**.5
def forward(self, x, *args):
    # Fourier series
    x = nd.array(x)
    n = self.n
    ns = nd.array(range(1, n))
    ns = ns.reshape((-1, 1))
    T = nd.dot(ns, x.reshape((1, -1)))
    pl = 2 * pi / self.l.data().abs()

    an = self.an.data()
    bn = self.bn.data()
    san = an[1:].reshape((-1, 1))
    sbn = bn[1:].reshape((-1, 1))
    f = san * nd.cos(T * pl) + sbn * nd.sin(T * pl)
    f = nd.sum(f, axis=0, keepdims=False)
    f = f + an[0]
    return f.reshape((-1, 1))
def getfake(samples, dimensions, epsilon):
    wfake = nd.random_normal(shape=(dimensions))  # fake weight vector for separation
    bfake = nd.random_normal(shape=(1))           # fake bias
    wfake = wfake / nd.norm(wfake)                # rescale to unit length

    # making some linearly separable data, simply by choosing the labels accordingly
    X = nd.zeros(shape=(samples, dimensions))
    Y = nd.zeros(shape=(samples))

    i = 0
    while (i < samples):
        tmp = nd.random_normal(shape=(1, dimensions))
        margin = nd.dot(tmp, wfake) + bfake
        if (nd.norm(tmp).asscalar() < 3) & (abs(margin.asscalar()) > epsilon):
            X[i, :] = tmp[0]
            Y[i] = 1 if margin.asscalar() > 0 else -1
            i += 1
    return X, Y
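# A minimal usage sketch for getfake() above (not part of the original source):
# draw 100 linearly separable 2-D samples whose margin to the hidden separating
# hyperplane is at least 0.3.
from mxnet import nd

X, Y = getfake(samples=100, dimensions=2, epsilon=0.3)
print(X.shape, Y.shape)           # (100, 2) (100,)
print(set(Y.asnumpy().tolist()))  # labels are +1.0 / -1.0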
def forward(self, inputs, state):
    """ forward function """
    h, = state
    outputs = []
    for x in inputs:
        z = nd.sigmoid(
            nd.dot(x, self.w_xz) + nd.dot(h, self.w_hz) + self.b_z)
        r = nd.sigmoid(
            nd.dot(x, self.w_xr) + nd.dot(h, self.w_hr) + self.b_r)
        # apply the reset gate to the previous state (standard GRU candidate)
        h_tilda = nd.tanh(
            nd.dot(x, self.w_xh) + nd.dot(r * h, self.w_hh) + self.b_h)
        h = z * h + (1 - z) * h_tilda
        y = nd.dot(h, self.w_hq) + self.b_q
        outputs.append(y)
    y_hat = nd.concat(*outputs, dim=0)
    return y_hat, (h, )
def function_set(self):
    self.__batch_y_hat = []
    for X in self.__batch_X:
        I = nd.sigmoid(
            nd.dot(X, self.__W_xi) + nd.dot(self.__state, self.__W_hi) + self.__b_i)
        F = nd.sigmoid(
            nd.dot(X, self.__W_xf) + nd.dot(self.__state, self.__W_hf) + self.__b_f)
        O = nd.sigmoid(
            nd.dot(X, self.__W_xo) + nd.dot(self.__state, self.__W_ho) + self.__b_o)
        C_tilda = nd.tanh(
            nd.dot(X, self.__W_xc) + nd.dot(self.__state, self.__W_hc) + self.__b_c)
        # note: C, like state, is carried across time steps
        self.__C = F * self.__C + I * C_tilda
        self.__state = O * nd.tanh(self.__C)
        self.__batch_y_hat.append(
            nd.dot(self.__state, self.__W_hy) + self.__b_y)
    self.__batch_y_hat = nd.concat(*self.__batch_y_hat, dim=0)
    return self.__batch_y_hat
def getfake(samples, dimensions, epsilon):
    wfake = nd.random_normal(shape=(dimensions))  # fake weight vector for separation
    bfake = nd.random_normal(shape=(1))           # fake bias
    wfake = wfake / nd.norm(wfake)                # rescale to unit length

    # making some linearly separable data, simply by choosing the labels accordingly
    X = nd.zeros(shape=(samples, dimensions))
    Y = nd.zeros(shape=(samples))

    i = 0
    while (i < samples):
        tmp = nd.random_normal(shape=(1, dimensions))
        margin = nd.dot(tmp, wfake) + bfake
        if (nd.norm(tmp).asscalar() < 3) & (abs(margin.asscalar()) > epsilon):
            X[i, :] = tmp
            Y[i] = 2 * (margin > 0) - 1
            i += 1
    return X, Y
def get_fake(samples, dimensions, epsilon):
    wfake = nd.random_normal(shape=(dimensions))
    bfake = nd.random_normal(shape=(1))
    wfake = wfake / nd.norm(wfake)

    X = nd.zeros(shape=(samples, dimensions))
    Y = nd.zeros(shape=(samples))

    i = 0
    while i < samples:
        tmp = nd.random_normal(shape=(1, dimensions))
        margin = nd.dot(tmp, wfake) + bfake
        if (nd.norm(tmp).asscalar() < 3) and (abs(margin.asscalar()) > epsilon):
            X[i, :] = tmp
            Y[i] = 1 if margin.asscalar() > 0 else -1
            i += 1
    return X, Y
def solve_discrete_lv(self, mat1, mat2=None, is_full_matrix=True):
    # need to store as a list since autograd won't let you append indices in the same matrix
    p = []
    p.append(self.p0)
    for n in range(self.num_time_steps - 1):
        # element-wise vector division and multiplication
        # Compute Ap to generate synthetic data for the full rank matrix A
        if is_full_matrix:
            mat_vec_prod = nd.dot(mat1, p[n])
        else:
            mat_vec_prod = compute_mat_vec_prod(mat1, mat2, p[n])
        p.append((1 + self.r * (1 - mat_vec_prod / self.k)) * p[n])
    # concat puts in nd array of size num_ts*N
    # need to take size (N, num_ts) and transpose, otherwise the default is row major
    # and we need column major storing of the linear list p
    return (nd.concat(*p, dim=0)
            .reshape(self.num_time_steps, self.num_time_series).T)
def compute_gradients(self,
                      elbo: nd.NDArray,
                      data_batch: mx.io.DataBatch = None,
                      log_q_sum: nd.NDArray = None,
                      mode: str = 'train') -> None:
    """Compute gradients and assign them to variational parameters.

    Args:
        elbo: evidence lower bound that we maximize
        data_batch: minibatch of data with data indices as labels
        log_q_sum: sum of log probs of samples from variational distributions q.
    """
    cfg = self.gradient_config

    if cfg['estimator'] == 'pathwise':
        for block in self.sequential._children:
            for child_block in block._children:
                if hasattr(child_block, 'is_reparam'):
                    assert child_block.is_reparam == True

    if len(self._point_mass_params) > 0 and mode == 'train':
        variables = [p.data() for p in self._point_mass_params]
        assert elbo.shape[-1] == cfg['batch_size']
        loss = nd.mean(-elbo, -1)
        point_mass_grads = autograd.grad(loss, variables, retain_graph=True)
        _assign_grads(self._point_mass_params, point_mass_grads)

    if cfg['estimator'] == 'pathwise':
        (-elbo).backward()
    elif cfg['estimator'] == 'score_function':
        variables = [param.repeated for param in self._score_params]
        score_functions = autograd.grad(log_q_sum, variables)
        mx.autograd.set_recording(False)
        score_grads = []
        for param, score_function in zip(self._score_params, score_functions):
            grad = _leave_one_out_gradient_estimator(score_function, -elbo)
            if 'emb' in param.name:
                # turns out the sparse implementation is not faster?!
                # data, label = data_batch
                # label = label.astype(np.int64)
                # grad = nd.sparse.row_sparse_array(
                #     grad, indices=label, shape=param.shape)
                # need to broadcast for embeddings
                one_hot = nd.one_hot(data_batch[1], depth=self.n_data)
                grad = nd.dot(one_hot, grad, transpose_a=True)
            score_grads.append(grad)
        _assign_grads(self._score_params, score_grads)
def gru(inputs, state, params):
    """
    @description: gated recurrent unit (GRU)
    @param {type}
    @return:
    """
    W_xz, W_hz, b_z, W_xr, W_hr, b_r, W_xh, W_hh, b_h, W_hq, b_q = params
    H, = state
    outputs = []
    for X in inputs:
        Z = nd.sigmoid(nd.dot(X, W_xz) + nd.dot(H, W_hz) + b_z)
        R = nd.sigmoid(nd.dot(X, W_xr) + nd.dot(H, W_hr) + b_r)
        H_tilda = nd.tanh(nd.dot(X, W_xh) + nd.dot(R * H, W_hh) + b_h)
        H = Z * H + (1 - Z) * H_tilda
        Y = nd.dot(H, W_hq) + b_q
        outputs.append(Y)
    return outputs, (H, )
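# A minimal sketch of how gru() above could be driven (not part of the original
# source). vocab_size, num_hiddens, and the random initialization are made-up
# illustration values; the parameter order follows the unpacking inside gru().
from mxnet import nd

vocab_size, num_hiddens, batch_size, num_steps = 28, 64, 2, 5

def _three():
    return (nd.random.normal(scale=0.01, shape=(vocab_size, num_hiddens)),
            nd.random.normal(scale=0.01, shape=(num_hiddens, num_hiddens)),
            nd.zeros(num_hiddens))

params = [*_three(),   # update gate:     W_xz, W_hz, b_z
          *_three(),   # reset gate:      W_xr, W_hr, b_r
          *_three(),   # candidate state: W_xh, W_hh, b_h
          nd.random.normal(scale=0.01, shape=(num_hiddens, vocab_size)),  # W_hq
          nd.zeros(vocab_size)]                                           # b_q

# num_steps one-hot input matrices of shape (batch_size, vocab_size)
inputs = [nd.one_hot(nd.array([1, 2]), vocab_size) for _ in range(num_steps)]
state = (nd.zeros((batch_size, num_hiddens)),)

outputs, (H,) = gru(inputs, state, params)
print(len(outputs), outputs[0].shape, H.shape)  # 5 (2, 28) (2, 64)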
def hue(src, delta, p=0.5):
    """Hue distortion"""
    if np.random.uniform(0, 1) > p:
        alpha = random.uniform(-delta, delta)
        u = np.cos(alpha * np.pi)
        w = np.sin(alpha * np.pi)
        bt = np.array([[1.0, 0.0, 0.0],
                       [0.0, u, -w],
                       [0.0, w, u]])
        tyiq = np.array([[0.299, 0.587, 0.114],
                         [0.596, -0.274, -0.321],
                         [0.211, -0.523, 0.311]])
        ityiq = np.array([[1.0, 0.956, 0.621],
                          [1.0, -0.272, -0.647],
                          [1.0, -1.107, 1.705]])
        t = np.dot(np.dot(ityiq, bt), tyiq).T
        src = nd.dot(src, nd.array(t, ctx=src.context))
        return src
    return src
def most_similar_to(self, word, k=5):
    '''
    Returns top k words similar to the argument.

    Eg.
        emb = Embedder(dimensions = 50)
        print(emb.most_similar_to('baby'))
    Returns...
        ['babies', 'boy', 'girl', 'newborn', 'pregnant']
    '''
    vec = self.__emb_mapper[word].reshape((-1, 1))
    emb_vecs = self.__norm_vecs_by_row(self.__emb_mapper.idx_to_vec)
    dot_product = nd.dot(emb_vecs, vec)
    indices = nd.topk(dot_product.reshape((len(self.__embedder), )),
                      k=k + 1, ret_typ='indices')
    indices = [int(i.asscalar()) for i in indices]
    # Remove unknown and input tokens.
    return self.__embedder.to_tokens(indices[1:])
def backward(self, DY):
    '''
    Backward-passes an input error gradient DY towards the input neurons of this layer.

    Parameters
    ----------

    DY : mxnet.ndarray.ndarray.NDArray
        an error gradient shaped same as the output array of forward, i.e. (N,Hy,Wy,Dy) with
        N = number of samples in the batch
        Hy = height of the output
        Wy = width of the output
        Dy = output depth = input depth

    Returns
    -------

    DX : mxnet.ndarray.ndarray.NDArray
        the error gradient propagated towards the input
    '''

    self.DY = DY
    N, Hy, Wy, NF = DY.shape
    hf, wf, df, NF = self.W.shape
    hstride, wstride = self.stride

    DX = nd.zeros_like(self.X, ctx=self.ctx, dtype=self.dtype)

    if not (hf == wf and self.stride == (1, 1)):
        for i in range(Hy):
            for j in range(Wy):
                DX[:, i*hstride:i*hstride+hf, j*wstride:j*wstride+wf, :] += \
                    (nd.expand_dims(self.W, axis=0) *
                     nd.expand_dims(DY[:, i:i+1, j:j+1, :], axis=3)
                     ).sum(axis=4)  # sum over all the filters
    else:
        for i in range(hf):
            for j in range(wf):
                DX[:, i:i+Hy:hstride, j:j+Wy:wstride, :] += nd.dot(DY, self.W[i, j, :, :].T)

    return DX  # * (hf*wf*df)**.5 / (NF*Hy*Wy)**.5
def stats_batchwise(x_bat, y_bat, n, x_mean, y_mean, x_var=None, y_var=None,
                    xx_cov=None, yy_cov=None, xy_cov=None,
                    x_mean_skip=False, y_mean_skip=False):
    m = x_bat.shape[0]

    x_bat_mean = x_bat.mean(axis=0, keepdims=True)
    y_bat_mean = y_bat.mean(axis=0, keepdims=True)

    dx = x_bat - x_bat_mean
    dy = y_bat - y_bat_mean

    if x_var is not None:
        x_bat_var = nd.sum(dx**2, axis=0)
        x_var += x_bat_var + ((x_mean - x_bat_mean)**2) * n * m / (n+m)

    if y_var is not None:
        y_bat_var = nd.sum(dy**2, axis=0)
        y_var += y_bat_var + ((y_mean - y_bat_mean)**2) * n * m / (n+m)

    if xx_cov is not None:
        xx_bat_cov = nd.dot(dx, dx, transpose_a=True)
        xx_cov += xx_bat_cov + nd.dot((x_mean - x_bat_mean), (x_mean - x_bat_mean),
                                      transpose_a=True) * n * m / (n+m)

    if yy_cov is not None:
        yy_bat_cov = nd.dot(dy, dy, transpose_a=True)
        yy_cov += yy_bat_cov + nd.dot((y_mean - y_bat_mean), (y_mean - y_bat_mean),
                                      transpose_a=True) * n * m / (n+m)

    if xy_cov is not None:
        xy_bat_cov = nd.dot(dy, dx, transpose_a=True)
        xy_cov += xy_bat_cov + nd.dot((y_mean - y_bat_mean), (x_mean - x_bat_mean),
                                      transpose_a=True) * n * m / (n+m)

    if not x_mean_skip:
        x_mean = (n * x_mean + m * x_bat_mean) / (n+m)

    if not y_mean_skip:
        y_mean = (n * y_mean + m * y_bat_mean) / (n+m)

    n += m

    return n, x_mean, y_mean, x_var, y_var, xx_cov, yy_cov, xy_cov
def forward(self, x):
    linear = nd.dot(x, self.weight.data()) + self.bias.data()
    return nd.relu(linear)
def linreg(X, w, b):
    """Linear regression model."""
    return nd.dot(X, w) + b
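# A minimal usage sketch (not part of the original source); the weights, bias,
# and batch below are made-up illustration values.
from mxnet import nd

w = nd.array([[2.0], [-3.4]])       # (num_features, 1)
b = nd.array([4.2])
X = nd.random.normal(shape=(3, 2))  # a batch of 3 examples
print(linreg(X, w, b))              # predictions of shape (3, 1)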
def get_distance_matrix(x):
    """Get distance matrix given a matrix. Used in testing."""
    square = nd.sum(x ** 2.0, axis=1, keepdims=True)
    distance_square = square + square.transpose() - (2.0 * nd.dot(x, x.transpose()))
    return nd.sqrt(distance_square)
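# A minimal usage sketch (not part of the original source): pairwise Euclidean
# distances between three made-up 2-D points; the diagonal is (numerically) zero.
from mxnet import nd

points = nd.array([[0.0, 0.0],
                   [3.0, 4.0],
                   [6.0, 8.0]])
print(get_distance_matrix(points))
# [[ 0.  5. 10.]
#  [ 5.  0.  5.]
#  [10.  5.  0.]]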
def linreg(X, w, b):
    """Linear regression."""
    return nd.dot(X, w) + b
# patterntest
import numpy as np
from mxnet import nd

from ecGAN.layer import Conv2D
from ecGAN.explain.pattern.estimator import estimators

lay = Conv2D(20, 2, strides=2, padding=0, regimes=estimators['linear']())
lay.initialize()

data = nd.random.normal(5, shape=[1000, 3, 8, 8])
out = lay(data)

lay.init_pattern()
lay.collect_pparams().initialize()

for mdat in [data[i::100] for i in range(100)]:
    lay.forward_logged(mdat)
    lay.learn_pattern()

lay.compute_pattern()

resdat = data.reshape([1000, 3, 4, 2, 4, 2]).transpose([0, 2, 4, 1, 3, 5]).reshape([1000*4*4, 3*4])
resout = out.transpose([0, 2, 3, 1]).reshape([1000*4*4, 20])

rescov = nd.dot((resout - resout.mean(0)).T, (resdat - resdat.mean(0))) / resout.shape[0]

# TODO check whether correlation is correct!
var_y = (lay.weight.data().flatten() * rescov).mean(1, keepdims=True)
std_y = (resout - resout.mean(0)).mean(0)
def test_dot():
    a = nd.ones(shape=(LARGE_X, SMALL_Y))
    b = nd.ones(shape=(SMALL_Y, SMALL_Y))
    res = nd.dot(a, b)
    assert np.sum(res[-1].asnumpy() == SMALL_Y) == b.shape[1]
def gram(x):
    c = x.shape[1]
    n = x.size / x.shape[1]
    y = x.reshape((c, int(n)))
    return nd.dot(y, y.T) / n
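# A minimal usage sketch for gram() above (not part of the original source):
# the Gram (channel correlation) matrix of a single made-up feature map of
# shape (1, channels=2, height=2, width=2).
from mxnet import nd

features = nd.arange(8).reshape((1, 2, 2, 2))
print(gram(features))  # a (2, 2) matrix of channel inner products, scaled by 1/n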
def forward(self, inputs, target, next_word_history, cache_history, begin_state=None):  # pylint: disable=arguments-differ
    """Defines the forward computation for cache cell.
    Arguments can be either :py:class:`NDArray` or :py:class:`Symbol`.

    Parameters
    ----------
    inputs: NDArray
        The input data
    target: NDArray
        The label
    next_word_history: NDArray
        The next word in memory
    cache_history: NDArray
        The hidden state in cache history

    Returns
    -------
    out: NDArray
        The linear interpolation of the cache language model
        with the regular word-level language model
    next_word_history: NDArray
        The next words to be kept in the memory for look up
        (size is equal to the window size)
    cache_history: NDArray
        The hidden states to be kept in the memory for look up
        (size is equal to the window size)
    """
    output, hidden, encoder_hs, _ = \
        super(self.lm_model.__class__, self.lm_model).\
            forward(inputs, begin_state)
    encoder_h = encoder_hs[-1].reshape(-3, -2)
    output = output.reshape(-1, self._vocab_size)

    start_idx = len(next_word_history) \
        if next_word_history is not None else 0
    next_word_history = nd.concat(*[nd.one_hot(t[0], self._vocab_size, on_value=1, off_value=0)
                                    for t in target], dim=0) \
        if next_word_history is None \
        else nd.concat(next_word_history,
                       nd.concat(*[nd.one_hot(t[0], self._vocab_size, on_value=1, off_value=0)
                                   for t in target], dim=0),
                       dim=0)
    cache_history = encoder_h if cache_history is None \
        else nd.concat(cache_history, encoder_h, dim=0)

    out = None
    softmax_output = nd.softmax(output)
    for idx, vocab_L in enumerate(softmax_output):
        joint_p = vocab_L
        if start_idx + idx > self._window:
            valid_next_word = next_word_history[start_idx + idx - self._window:start_idx + idx]
            valid_cache_history = cache_history[start_idx + idx - self._window:start_idx + idx]
            logits = nd.dot(valid_cache_history, encoder_h[idx])
            cache_attn = nd.softmax(self._theta * logits).reshape(-1, 1)
            cache_dist = (cache_attn.broadcast_to(valid_next_word.shape)
                          * valid_next_word).sum(axis=0)
            joint_p = self._lambdas * cache_dist + (1 - self._lambdas) * vocab_L
        out = joint_p[target[idx]] if out is None \
            else nd.concat(out, joint_p[target[idx]], dim=0)
    next_word_history = next_word_history[-self._window:]
    cache_history = cache_history[-self._window:]
    return out, next_word_history, cache_history, hidden
def forward(self, word_inputs, tag_inputs, arc_targets=None, rel_targets=None):
    """Run decoding

    Parameters
    ----------
    word_inputs : mxnet.ndarray.NDArray
        word indices of seq_len x batch_size
    tag_inputs : mxnet.ndarray.NDArray
        tag indices of seq_len x batch_size
    arc_targets : mxnet.ndarray.NDArray
        gold arc indices of seq_len x batch_size
    rel_targets : mxnet.ndarray.NDArray
        gold rel indices of seq_len x batch_size

    Returns
    -------
    tuple
        (arc_accuracy, rel_accuracy, overall_accuracy, loss) when training,
        else if given gold target then return arc_accuracy, rel_accuracy,
        overall_accuracy, outputs, otherwise return outputs, where outputs
        is a list of (arcs, rels).
    """
    is_train = autograd.is_training()

    def flatten_numpy(ndarray):
        """Flatten nd-array to 1-d column vector

        Parameters
        ----------
        ndarray : numpy.ndarray
            input tensor

        Returns
        -------
        numpy.ndarray
            A column vector
        """
        return np.reshape(ndarray, (-1,), 'F')

    batch_size = word_inputs.shape[1]
    seq_len = word_inputs.shape[0]
    mask = np.greater(word_inputs, self._vocab.ROOT).astype(np.float32)
    num_tokens = int(np.sum(mask))  # non padding, non root token number

    if is_train or arc_targets is not None:
        mask_1D = flatten_numpy(mask)
        mask_1D_tensor = nd.array(mask_1D)

    unked_words = np.where(word_inputs < self._vocab.words_in_train, word_inputs, self._vocab.UNK)
    word_embs = self.word_embs(nd.array(unked_words, dtype='int'))
    if self.pret_word_embs:
        word_embs = word_embs + self.pret_word_embs(nd.array(word_inputs))
    tag_embs = self.tag_embs(nd.array(tag_inputs))

    # Dropout
    emb_inputs = nd.concat(word_embs, tag_embs, dim=2)  # seq_len x batch_size

    top_recur = biLSTM(self.f_lstm, self.b_lstm, emb_inputs, batch_size,
                       dropout_x=self.dropout_lstm_input if is_train else 0)
    top_recur = nd.Dropout(data=top_recur, axes=[0], p=self.dropout_mlp)

    W_dep, b_dep = self.mlp_dep_W.data(), self.mlp_dep_b.data()
    W_head, b_head = self.mlp_head_W.data(), self.mlp_head_b.data()
    dep = leaky_relu(nd.dot(top_recur, W_dep.T) + b_dep)
    head = leaky_relu(nd.dot(top_recur, W_head.T) + b_head)
    dep = nd.Dropout(data=dep, axes=[0], p=self.dropout_mlp)
    head = nd.Dropout(data=head, axes=[0], p=self.dropout_mlp)
    dep, head = nd.transpose(dep, axes=[2, 0, 1]), nd.transpose(head, axes=[2, 0, 1])
    dep_arc, dep_rel = dep[:self.mlp_arc_size], dep[self.mlp_arc_size:]
    head_arc, head_rel = head[:self.mlp_arc_size], head[self.mlp_arc_size:]

    W_arc = self.arc_W.data()
    arc_logits = bilinear(dep_arc, W_arc, head_arc, self.mlp_arc_size, seq_len, batch_size,
                          num_outputs=1, bias_x=True, bias_y=False)
    # (#head x #dep) x batch_size

    flat_arc_logits = reshape_fortran(arc_logits, (seq_len, seq_len * batch_size))
    # (#head) x (#dep x batch_size)

    arc_preds = arc_logits.argmax(0)
    # seq_len x batch_size

    if is_train or arc_targets is not None:
        correct = np.equal(arc_preds.asnumpy(), arc_targets)
        arc_correct = correct.astype(np.float32) * mask
        arc_accuracy = np.sum(arc_correct) / num_tokens
        targets_1D = flatten_numpy(arc_targets)
        losses = self.softmax_loss(flat_arc_logits, nd.array(targets_1D))
        arc_loss = nd.sum(losses * mask_1D_tensor) / num_tokens

    if not is_train:
        arc_probs = np.transpose(
            np.reshape(nd.softmax(flat_arc_logits, axis=0).asnumpy(),
                       (seq_len, seq_len, batch_size), 'F'))
    # batch_size x #dep x #head

    W_rel = self.rel_W.data()
    rel_logits = bilinear(dep_rel, W_rel, head_rel, self.mlp_rel_size, seq_len, batch_size,
                          num_outputs=self._vocab.rel_size, bias_x=True, bias_y=True)
    # (#head x rel_size x #dep) x batch_size

    flat_rel_logits = reshape_fortran(rel_logits,
                                      (seq_len, self._vocab.rel_size, seq_len * batch_size))
    # (#head x rel_size) x (#dep x batch_size)

    _target_vec = nd.array(targets_1D if is_train
                           else flatten_numpy(arc_preds.asnumpy())).reshape(seq_len * batch_size, 1)
    _target_mat = _target_vec * nd.ones((1, self._vocab.rel_size))

    partial_rel_logits = nd.pick(flat_rel_logits, _target_mat.T, axis=0)
    # (rel_size) x (#dep x batch_size)

    if is_train or arc_targets is not None:
        rel_preds = partial_rel_logits.argmax(0)
        targets_1D = flatten_numpy(rel_targets)
        rel_correct = np.equal(rel_preds.asnumpy(), targets_1D).astype(np.float32) * mask_1D
        rel_accuracy = np.sum(rel_correct) / num_tokens
        losses = self.softmax_loss(partial_rel_logits, nd.array(targets_1D))
        rel_loss = nd.sum(losses * mask_1D_tensor) / num_tokens

    if not is_train:
        rel_probs = np.transpose(np.reshape(
            nd.softmax(flat_rel_logits.transpose([1, 0, 2]), axis=0).asnumpy(),
            (self._vocab.rel_size, seq_len, seq_len, batch_size), 'F'))
    # batch_size x #dep x #head x #nclasses

    if is_train or arc_targets is not None:
        loss = arc_loss + rel_loss
        correct = rel_correct * flatten_numpy(arc_correct)
        overall_accuracy = np.sum(correct) / num_tokens

    if is_train:
        return arc_accuracy, rel_accuracy, overall_accuracy, loss

    outputs = []

    for msk, arc_prob, rel_prob in zip(np.transpose(mask), arc_probs, rel_probs):
        # parse sentences one by one
        msk[0] = 1.
        sent_len = int(np.sum(msk))
        arc_pred = arc_argmax(arc_prob, sent_len, msk)
        rel_prob = rel_prob[np.arange(len(arc_pred)), arc_pred]
        rel_pred = rel_argmax(rel_prob, sent_len)
        outputs.append((arc_pred[1:sent_len], rel_pred[1:sent_len]))

    if arc_targets is not None:
        return arc_accuracy, rel_accuracy, overall_accuracy, outputs
    return outputs