# Shared imports for the snippets below.
import numpy as np

from mxnet import nd, autograd


def nodeforward(self, x, cs, hs, ctx):
    # Tree-LSTM node update: cs/hs are the children's cell and hidden states.
    x = nd.reshape(x, (self.dim_h, ))
    _Ui = nd.zeros((self.dim_h, ), ctx=ctx)
    _Uo = nd.zeros((self.dim_h, ), ctx=ctx)
    _Uu = nd.zeros((self.dim_h, ), ctx=ctx)
    _Uf = [nd.zeros((self.dim_h, ), ctx=ctx) for i in range(len(cs))]

    for idx in range(len(cs)):
        _Ui = nd.add(_Ui, nd.dot(self.Uis[idx].data(), hs[idx]))
        _Uo = nd.add(_Uo, nd.dot(self.Uos[idx].data(), hs[idx]))
        _Uu = nd.add(_Uu, nd.dot(self.Uus[idx].data(), hs[idx]))
        for j in range(len(cs)):
            _Uf[idx] = nd.add(_Uf[idx], nd.dot(self.Ufs[idx][j].data(), hs[j]))

    i = nd.sigmoid(
        nd.add(nd.add(nd.dot(self.Wi.data(), x), _Ui), self.bi.data()))
    o = nd.sigmoid(
        nd.add(nd.add(nd.dot(self.Wo.data(), x), _Uo), self.bo.data()))
    f = [
        nd.sigmoid(
            nd.add(nd.add(nd.dot(self.Wf.data(), x), _Uf[idx]),
                   self.bf.data())) for idx in range(len(cs))
    ]
    u = nd.tanh(
        nd.add(nd.add(nd.dot(self.Wu.data(), x), _Uu), self.bu.data()))

    c = nd.zeros((self.dim_h, ), ctx=ctx)
    for idx in range(len(cs)):
        c = nd.add(c, nd.multiply(f[idx], cs[idx]))
    c = nd.add(nd.multiply(i, u), c)
    h = nd.multiply(o, nd.tanh(c))
    return c, h

def where(self, mask, tensor_in_1, tensor_in_2):
    """
    Apply a boolean selection mask to the elements of the input tensors.

    Example::

        >>> where(
            astensor([1, 0, 1]),
            astensor([1, 1, 1]),
            astensor([2, 2, 2]))
        [1. 2. 1.]

    Args:
        mask (bool): Boolean mask (boolean or tensor object of booleans)
        tensor_in_1 (Tensor): Tensor object
        tensor_in_2 (Tensor): Tensor object

    Returns:
        MXNet NDArray: The result of the mask being applied to the tensors.
    """
    mask = self.astensor(mask)
    tensor_in_1 = self.astensor(tensor_in_1)
    tensor_in_2 = self.astensor(tensor_in_2)
    return nd.add(nd.multiply(mask, tensor_in_1),
                  nd.multiply(nd.subtract(1, mask), tensor_in_2))

def where(self, mask, tensor_in_1, tensor_in_2):
    """
    Apply a boolean selection mask to the elements of the input tensors.

    Example:

        >>> import pyhf
        >>> pyhf.set_backend(pyhf.tensor.mxnet_backend())
        >>> pyhf.tensorlib.where(
        ...     pyhf.tensorlib.astensor([1, 0, 1]),
        ...     pyhf.tensorlib.astensor([1, 1, 1]),
        ...     pyhf.tensorlib.astensor([2, 2, 2]))
        ...
        <BLANKLINE>
        [1. 2. 1.]
        <NDArray 3 @cpu(0)>

    Args:
        mask (bool): Boolean mask (boolean or tensor object of booleans)
        tensor_in_1 (Tensor): Tensor object
        tensor_in_2 (Tensor): Tensor object

    Returns:
        MXNet NDArray: The result of the mask being applied to the tensors.
    """
    mask = self.astensor(mask)
    tensor_in_1 = self.astensor(tensor_in_1)
    tensor_in_2 = self.astensor(tensor_in_2)
    return nd.add(
        nd.multiply(mask, tensor_in_1),
        nd.multiply(nd.subtract(1, mask), tensor_in_2),
    )

def feature_detect(self, tag_inputs, word_inputs, bert):
    is_train = autograd.is_training()
    batch_size = word_inputs.shape[1]
    seq_len = word_inputs.shape[0]

    # unked_words = np.where(word_inputs < self._vocab.words_in_train, word_inputs, self._vocab.UNK)
    if self.pret_word_embs is not None:
        word_embs = self.pret_word_embs(nd.array(word_inputs))
        if bert is not None:
            word_embs = nd.concat(word_embs, nd.array(bert), dim=2)
    else:
        word_embs = nd.array(bert)
    tag_embs = self.tag_embs(nd.array(tag_inputs)) if self.tag_embs is not None else None

    # Dropout
    if is_train:
        wm, tm = self.generate_emb_mask(seq_len, batch_size)
        if self.tag_embs is not None:
            emb_inputs = nd.concat(nd.multiply(wm, word_embs),
                                   nd.multiply(tm, tag_embs), dim=2)
        else:
            emb_inputs = nd.multiply(wm, word_embs)
    else:
        if self.tag_embs is not None:
            emb_inputs = nd.concat(word_embs, tag_embs, dim=2)  # seq_len x batch_size
        else:
            emb_inputs = word_embs

    top_recur = biLSTM(self.f_lstm, self.b_lstm, emb_inputs, batch_size,
                       dropout_x=self.dropout_lstm_input if is_train else 0)
    return top_recur

def forward(self, x):
    with x.context:
        c = nd.softmax(self.b.data(), axis=1)
        u = nd.dot(x, self.w.data())
        s = nd.multiply(c, u)
        s_nrm = nd.sum(s * s)
        fact = s_nrm / (1. + s_nrm)
        v = fact * s / nd.sqrt(s_nrm)
        self.u_v = nd.sum(nd.multiply(u, v))
        return u

def hybrid_forward(self, F, X, *args):
    """
    This method closely follows the formulas in RNN/lstm_formulas.png
    """
    h, c = args[0], args[1]
    f_t = (self.W_f(X) + self.U_f(h)).sigmoid()     # forget gate
    i_t = (self.W_i(X) + self.U_i(h)).sigmoid()     # input gate
    o_t = (self.W_o(X) + self.U_o(h)).sigmoid()     # output gate
    c_tilde_t = (self.W_c(X) + self.U_c(h)).tanh()  # candidate cell state
    # Plain operators keep the block hybridizable (no direct nd.* calls inside hybrid_forward).
    new_c = f_t * c + i_t * c_tilde_t
    new_h = o_t * new_c.tanh()
    return new_h, new_c

def get_xps(weight_denominator, weight_numerator, z):
    # Element-wise powers of z: [1, z, z^2, ...], one more than the longest weight list.
    xps = list()
    xps.append(z)
    for _ in range(max(len(weight_numerator), len(weight_denominator))):
        xps.append(nd.multiply(xps[-1], z))
    xps.insert(0, nd.ones_like(z))
    return xps

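# Usage sketch for get_xps (hypothetical inputs, not part of the original module): with
# two numerator and two denominator weights it returns the element-wise powers
# [1, z, z^2, z^3] that the rational activations below consume.
def _demo_get_xps():
    from mxnet import nd
    z = nd.array([1.0, 2.0, 3.0])
    xps = get_xps(weight_denominator=[0.5, 0.25], weight_numerator=[1.0, 0.1], z=z)
    # xps[0] == ones_like(z), xps[1] == z, xps[2] == z**2, xps[3] == z**3
    return [p.asnumpy() for p in xps]
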
def Rational_MXNET_A_F(x, weight_numerator, weight_denominator, training):
    # Version A:
    # P(X) / Q(X) = (a_0 + a_1 * X + ... + a_n * X^n) /
    #               (1 + |b_0 * X| + |b_1 * X^2| + ... + |b_m * X^{m+1}|)
    z = nd.reshape(x, shape=(-1, ))
    xps = get_xps(weight_denominator, weight_numerator, z)

    numerator = nd.array([0], dtype='float32')
    for i, w_n in enumerate(weight_numerator):
        numerator = numerator + nd.multiply(w_n, xps[i])

    denominator = nd.array([1.0], dtype='float32')
    for j, w_d in enumerate(weight_denominator):
        denominator = denominator + nd.abs(nd.multiply(w_d, xps[j + 1]))

    return nd.divide(numerator, denominator).reshape(x.shape)

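# Usage sketch for Rational_MXNET_A_F (hypothetical weights): evaluates the version-A
# rational activation P(x)/Q(x) on a small batch; passing the weights as NDArrays lets
# nd.multiply broadcast them over the flattened input.
def _demo_rational_a():
    from mxnet import nd
    x = nd.array([[-1.0, 0.0, 2.0]])
    weight_numerator = nd.array([0.0, 1.0, 0.1])  # a_0, a_1, a_2
    weight_denominator = nd.array([0.5, 0.05])    # b_0, b_1
    return Rational_MXNET_A_F(x, weight_numerator, weight_denominator, training=False)
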
def _clip_px_gradients(self, batch_grads, px_clipping_factors):
    # Workaround for multiplying a (b,) shaped array with a (b, x) or (b, x, y) shaped
    # array: expand the clipping factors until they broadcast over the gradient batch.
    expanded_batch_clipping_factors = nd.expand_dims(px_clipping_factors, 1)
    if len(batch_grads.shape) == 3:
        expanded_batch_clipping_factors = nd.expand_dims(
            expanded_batch_clipping_factors, 1)
    return nd.multiply(batch_grads, expanded_batch_clipping_factors)

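# Standalone sketch of the broadcasting trick used in _clip_px_gradients (hypothetical
# data): a (b,) vector of per-example clipping factors is expanded until it broadcasts
# over a (b, x) or (b, x, y) batch of per-example gradients.
def _demo_px_clipping():
    from mxnet import nd
    batch_grads = nd.ones((4, 3, 2))                    # shape (b, x, y)
    clipping_factors = nd.array([1.0, 0.5, 0.25, 1.0])  # shape (b,)
    expanded = nd.expand_dims(clipping_factors, 1)      # (b, 1)
    expanded = nd.expand_dims(expanded, 1)              # (b, 1, 1)
    return nd.multiply(batch_grads, expanded)           # broadcasts to (b, x, y)
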
def Rational_MXNET_C_F(x, weight_numerator, weight_denominator, training):
    # Version C:
    # P(X) / Q(X) = (a_0 + a_1 * X + ... + a_n * X^n) /
    #               (eps + |b_0 + b_1 * X + ... + b_m * X^m|),  with eps = 0.1
    z = nd.reshape(x, shape=(-1, ))
    xps = get_xps(weight_denominator, weight_numerator, z)

    numerator = nd.array([0], dtype='float32')
    for i, w_n in enumerate(weight_numerator):
        numerator = numerator + nd.multiply(w_n, xps[i])

    denominator = nd.array([0], dtype='float32')
    for j, w_d in enumerate(weight_denominator):
        denominator = denominator + nd.multiply(w_d, xps[j])

    return nd.divide(numerator, (0.1 + nd.abs(denominator))).reshape(x.shape)

def Rational_MXNET_D_F(x, weight_numerator, weight_denominator, training,
                       random_deviation=0.1):
    # Version D (noise is only added during training):
    # P(X) / Q(X) = (noised(a_0) + noised(a_1) * X + ... + noised(a_n) * X^n) /
    #               (1 + |noised(b_0) * X + noised(b_1) * X^2 + ... + noised(b_{n-1}) * X^n|)
    # Noised parameters carry uniform noise in
    # [(1 - random_deviation) * parameter, (1 + random_deviation) * parameter].
    if not training:
        # do not add noise
        return Rational_MXNET_B_F(x, weight_numerator, weight_denominator, training)

    z = nd.reshape(x, shape=(-1, ))
    lower_bound = nd.array([1 - random_deviation])
    upper_bound = nd.array([1 + random_deviation])
    xps = get_xps(weight_denominator, weight_numerator, z)

    numerator = nd.array([0], dtype='float32')
    for i, w_n in enumerate(weight_numerator):
        w_n_noised = nd.multiply(
            w_n,
            nd.sample_uniform(low=lower_bound,
                              high=upper_bound,
                              shape=z.shape,
                              dtype='float32'))
        numerator = numerator + nd.multiply(w_n_noised, xps[i])

    denominator = nd.array([0], dtype='float32')
    for j, w_d in enumerate(weight_denominator):
        w_d_noised = nd.multiply(
            w_d,
            nd.sample_uniform(low=lower_bound,
                              high=upper_bound,
                              shape=z.shape,
                              dtype='float32'))
        denominator = denominator + nd.multiply(w_d_noised, xps[j + 1])

    return nd.divide(numerator, (1 + nd.abs(denominator))).reshape(x.shape)

def make_std_mask(trg, pad, ctx):
    """
    Create a mask to hide padding and future words.
    Compare each element of trg_mask and sub_mask:
    (1, 1) -> 1
    o.w.   -> 0
    There is no bitwise AND operator in MXNet, so the two masks are combined by
    element-wise multiplication.
    """
    trg_mask = (trg != pad).expand_dims(axis=-2)
    trg_mask = nd.repeat(trg_mask, repeats=trg_mask.shape[-1], axis=-2)
    sub_mask = subsequent_mask(trg.shape[-1])
    sub_mask = nd.repeat(sub_mask, repeats=trg_mask.shape[0], axis=0)
    trg_mask = nd.multiply(trg_mask, sub_mask.as_in_context(ctx))
    return trg_mask

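# Usage sketch for make_std_mask (hypothetical data): builds the combined padding and
# look-ahead mask for one target sentence. Assumes the module's own subsequent_mask
# helper returns a (1, seq_len, seq_len) lower-triangular mask.
def _demo_make_std_mask():
    import mxnet as mx
    from mxnet import nd
    trg = nd.array([[1, 5, 7, 0, 0]])  # batch of one sentence, pad id 0
    mask = make_std_mask(trg, pad=0, ctx=mx.cpu())
    # Expected shape (1, 5, 5): lower-triangular with the padded columns zeroed.
    return mask
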
def dense_bw(self, input_layer, input_error):
    """Fully connected layer backward process"""
    self.d_act_z = self.d_act(self.z)
    self.delta_b = nd.multiply(input_error, self.d_act_z)
    x = nd.transpose(input_layer)
    self.delta_W = nd.dot(x, self.delta_b)
    output_bp = nd.dot(self.delta_b, nd.transpose(self.W))
    self.delta_b = nd.sum(self.delta_b, axis=0)
    assert self.batch_size == input_error.shape[0]
    self.W = nd.subtract(
        self.W, self.delta_W * (self.learning_rate / self.batch_size))
    self.b = nd.subtract(
        self.b, self.delta_b * (self.learning_rate / self.batch_size))
    return output_bp

def partial_trim(epoch, v, net, f):
    # apply partial-knowledge trimmed-mean attack
    vi_shape = v[0].shape

    # first compute the distribution parameters of the compromised workers' gradients
    all_grads = nd.concat(*v, dim=1)
    adv_grads = all_grads[:, :f]
    e_mu = nd.mean(adv_grads, axis=1)  # mean
    e_sigma = nd.sqrt(
        nd.sum(nd.square(nd.subtract(adv_grads, e_mu.reshape(-1, 1))), axis=1) /
        f)  # standard deviation

    for i in range(f):
        # apply attack to compromised worker devices with randomness
        v[i] = (
            e_mu - nd.multiply(e_sigma, nd.sign(e_mu)) *
            (3. + nd.random.uniform(shape=e_sigma.shape))).reshape(vi_shape)
    return v

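# Usage sketch for partial_trim (hypothetical data): v is a list of per-worker gradient
# columns of shape (d, 1); the first f entries are treated as compromised workers and
# are overwritten by the attack. epoch and net are unused by this implementation.
def _demo_partial_trim():
    from mxnet import nd
    d, n, f = 5, 8, 3
    v = [nd.random.normal(shape=(d, 1)) for _ in range(n)]
    v = partial_trim(epoch=0, v=v, net=None, f=f)
    return v[:f]  # the f attacked gradient columns
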
def _concrete_dropout(self, x):
    """Forward pass for the concrete dropout layer"""
    with autograd.record():
        eps = 1e-7
        temp = 0.1

        self.p = nd.sigmoid(self.p_logit.data())

        # TODO: check that the batch size matches unif_noise and handle a mismatch
        unif_noise = nd.array(np.random.uniform(size=tuple(x.shape)))

        drop_prob = (nd.log(self.p + eps) - nd.log(1 - self.p + eps) +
                     nd.log(unif_noise + eps) - nd.log(1 - unif_noise + eps))
        drop_prob = nd.sigmoid(drop_prob / temp)

        random_tensor = 1 - drop_prob
        retain_prob = 1 - self.p

        x = nd.multiply(x, random_tensor)
        x = x / retain_prob

        self.all_p.append(self.p)
        return x

def forward(self, x):
    embeds = self.embed(x)  # batch * time step * embedding

    x_i = embeds.expand_dims(1)
    x_i = nd.repeat(x_i, repeats=self.sentence_length,
                    axis=1)  # batch * time step * time step * embedding
    x_j = embeds.expand_dims(2)
    x_j = nd.repeat(x_j, repeats=self.sentence_length,
                    axis=2)  # batch * time step * time step * embedding
    x_full = nd.concat(x_i, x_j,
                       dim=3)  # batch * time step * time step * (2 * embedding)

    # New input data
    _x = x_full.reshape((-1, 2 * self.emb_dim))

    # Network for attention
    _attn = self.attn(_x)
    _att = _attn.reshape((-1, self.sentence_length, self.sentence_length))
    _att = nd.sigmoid(_att)
    att = nd.softmax(_att, axis=1)

    _x = self.g_fc1(_x)  # (batch * time step * time step) * hidden_dim
    _x = self.g_fc2(_x)  # (batch * time step * time step) * hidden_dim

    # Aggregate all sentence_length * sentence_length pairwise outputs into the
    # sentence representation.
    x_g = _x.reshape(
        (-1, self.sentence_length, self.sentence_length, self.hidden_dim))

    _inflated_att = _att.expand_dims(axis=-1)
    _inflated_att = nd.repeat(_inflated_att, repeats=self.hidden_dim, axis=3)

    x_q = nd.multiply(_inflated_att, x_g)

    sentence_rep = nd.mean(x_q.reshape(shape=(-1, self.sentence_length**2,
                                              self.hidden_dim)),
                           axis=1)
    return sentence_rep, att

def cross_entropy(self, o, y):
    """
    o is the output from the fully connected layer (num_examples x num_classes)
    y is labels (num_examples x 1)
    Note that y is not a one-hot encoded vector. It can be computed as
    y.argmax(axis=1) from one-hot encoded vectors of labels if required.
    """
    m = y.shape[0]
    p = self.softmax(o)
    k = nd.multiply(y, p)
    # We use multidimensional array indexing to extract
    # softmax probability of the correct label for each sample.
    # Refer to https://docs.scipy.org/doc/numpy/user/basics.indexing.html#indexing-multi-dimensional-arrays
    # for understanding multidimensional array indexing.
    log_likelihood = -nd.log(nd.max(k, axis=1))
    loss = nd.sum(log_likelihood) / m
    return loss

def run(self, word_inputs, tag_inputs, arc_targets=None, rel_targets=None,
        is_train=True):
    """
    Train or test
    :param word_inputs: seq_len x batch_size
    :param tag_inputs: seq_len x batch_size
    :param arc_targets: seq_len x batch_size
    :param rel_targets: seq_len x batch_size
    :param is_train: is training or test
    :return:
    """
    # return 0, 0, 0, nd.dot(self.junk.data(), nd.ones((3, 1))).sum()

    def flatten_numpy(ndarray):
        """
        Flatten nd-array to 1-d column vector
        :param ndarray:
        :return:
        """
        return np.reshape(ndarray, (-1, ), 'F')

    batch_size = word_inputs.shape[1]
    seq_len = word_inputs.shape[0]
    mask = np.greater(word_inputs, self._vocab.ROOT).astype(np.float32)
    num_tokens = int(np.sum(mask))  # non padding, non root token number

    if is_train or arc_targets is not None:
        mask_1D = flatten_numpy(mask)
        # mask_1D_tensor = nd.inputTensor(mask_1D, batched=True)
        # if batched=True, the last dimension is used as a batch dimension if arr is a list of numpy ndarrays
        mask_1D_tensor = nd.array(mask_1D)

    unked_words = np.where(word_inputs < self._vocab.words_in_train,
                           word_inputs, self._vocab.UNK)
    word_embs = self.word_embs(nd.array(unked_words, dtype='int'))
    if self.pret_word_embs:
        word_embs = word_embs + self.pret_word_embs(nd.array(word_inputs))
    tag_embs = self.tag_embs(nd.array(tag_inputs))

    # Dropout
    if is_train:
        wm, tm = self.generate_emb_mask(seq_len, batch_size)
        emb_inputs = nd.concat(nd.multiply(wm, word_embs),
                               nd.multiply(tm, tag_embs), dim=2)
    else:
        emb_inputs = nd.concat(word_embs, tag_embs, dim=2)  # seq_len x batch_size

    top_recur = biLSTM(self.bi_lstm, emb_inputs, batch_size)
    # if is_train:
    #     top_recur = nd.Dropout(data=top_recur, axes=[0], p=self.dropout_mlp)

    W_dep, b_dep = self.mlp_dep_W.data(), self.mlp_dep_b.data()
    W_head, b_head = self.mlp_head_W.data(), self.mlp_head_b.data()
    dep = leaky_relu(nd.dot(top_recur, W_dep.T) + b_dep)
    head = leaky_relu(nd.dot(top_recur, W_head.T) + b_head)
    # if is_train:
    #     dep, head = nd.Dropout(data=dep, axes=[0], p=self.dropout_mlp), \
    #                 nd.Dropout(data=head, axes=[0], p=self.dropout_mlp)
    dep, head = nd.transpose(dep, axes=[2, 0, 1]), nd.transpose(head, axes=[2, 0, 1])
    dep_arc, dep_rel = dep[:self.mlp_arc_size], dep[self.mlp_arc_size:]
    head_arc, head_rel = head[:self.mlp_arc_size], head[self.mlp_arc_size:]
    # return 0, 0, 0, dep_arc.sum() + head_arc.sum()

    W_arc = self.arc_W.data()
    arc_logits = bilinear(dep_arc, W_arc, head_arc, self.mlp_arc_size, seq_len,
                          batch_size, num_outputs=1, bias_x=True, bias_y=False)
    # return 0, 0, 0, arc_logits.sum()
    # (#head x #dep) x batch_size
    flat_arc_logits = reshape_fortran(arc_logits, (seq_len, seq_len * batch_size))
    # (#head) x (#dep x batch_size)

    arc_preds = arc_logits.argmax(0)
    if len(arc_preds.shape) == 1:  # dynet did unnecessary jobs
        arc_preds = np.expand_dims(arc_preds, axis=1)
    # seq_len x batch_size

    if is_train or arc_targets is not None:
        correct = np.equal(arc_preds.asnumpy(), arc_targets)
        arc_correct = correct.astype(np.float32) * mask
        arc_accuracy = np.sum(arc_correct) / num_tokens
        targets_1D = flatten_numpy(arc_targets)
        losses = self.softmax_loss(flat_arc_logits, nd.array(targets_1D))
        arc_loss = nd.sum(losses * mask_1D_tensor) / num_tokens

    if not is_train:
        arc_probs = np.transpose(
            np.reshape(nd.softmax(flat_arc_logits).asnumpy(),
                       (seq_len, seq_len, batch_size), 'F'))
        # batch_size x #dep x #head

    W_rel = self.rel_W.data()
    # dep_rel = nd.concat([dep_rel, nd.inputTensor(np.ones((1, seq_len), dtype=np.float32))])
    # head_rel = nd.concat([head_rel, nd.inputTensor(np.ones((1, seq_len), dtype=np.float32))])
    rel_logits = bilinear(dep_rel, W_rel, head_rel, self.mlp_rel_size, seq_len,
                          batch_size, num_outputs=self._vocab.rel_size,
                          bias_x=True, bias_y=True)
    # (#head x rel_size x #dep) x batch_size
    flat_rel_logits = reshape_fortran(
        rel_logits, (seq_len, self._vocab.rel_size, seq_len * batch_size))
    # (#head x rel_size) x (#dep x batch_size)

    _target_vec = nd.array(
        targets_1D if is_train else flatten_numpy(arc_preds)).reshape(
            seq_len * batch_size, 1)
    _target_mat = _target_vec * nd.ones((1, self._vocab.rel_size))

    partial_rel_logits = nd.pick(flat_rel_logits, _target_mat.T, axis=0)
    # (rel_size) x (#dep x batch_size)

    if is_train or arc_targets is not None:
        rel_preds = partial_rel_logits.argmax(0)
        targets_1D = flatten_numpy(rel_targets)
        rel_correct = np.equal(rel_preds.asnumpy(),
                               targets_1D).astype(np.float32) * mask_1D
        rel_accuracy = np.sum(rel_correct) / num_tokens
        losses = self.softmax_loss(partial_rel_logits, nd.array(targets_1D))
        rel_loss = nd.sum(losses * mask_1D_tensor) / num_tokens

    if not is_train:
        rel_probs = np.transpose(
            np.reshape(nd.softmax(nd.transpose(flat_rel_logits)).asnumpy(),
                       (self._vocab.rel_size, seq_len, seq_len, batch_size),
                       'F'))
        # batch_size x #dep x #head x #nclasses

    if is_train or arc_targets is not None:
        loss = arc_loss + rel_loss
        correct = rel_correct * flatten_numpy(arc_correct)
        overall_accuracy = np.sum(correct) / num_tokens

    if is_train:
        return arc_accuracy, rel_accuracy, overall_accuracy, loss

    outputs = []
    for msk, arc_prob, rel_prob in zip(np.transpose(mask), arc_probs, rel_probs):
        # parse sentences one by one
        msk[0] = 1.
        sent_len = int(np.sum(msk))
        arc_pred = arc_argmax(arc_prob, sent_len, msk)
        rel_prob = rel_prob[np.arange(len(arc_pred)), arc_pred]
        rel_pred = rel_argmax(rel_prob, sent_len)
        outputs.append((arc_pred[1:sent_len], rel_pred[1:sent_len]))

    if arc_targets is not None:
        return arc_accuracy, rel_accuracy, overall_accuracy, outputs
    return outputs

def forward(self, x):
    self.batch_size, self.input_dim_vector, self.input_num_capsule = x.shape
    assert (self.batch_size, self.input_dim_vector,
            self.input_num_capsule) == (self.batch_size, 8, 1152)

    x_exp = x.expand_dims(axis=1)
    x_exp = x_exp.expand_dims(axis=4)
    assert x_exp.shape == (self.batch_size, 1, 8, 1152, 1)

    x_tile = x_exp.tile(reps=[1, self.num_capsule, 1, 1, 1])
    assert x_tile.shape == (self.batch_size, 10, 8, 1152, 1)

    x_trans = x_tile.transpose(axes=(0, 3, 1, 2, 4))
    assert x_trans.shape == (self.batch_size, 1152, 10, 8, 1)

    # W = self.W_ij.data()
    print(self.W_ij.data()[0, 0, 0, 0])
    # W = self.routing_weight
    # print('W', W[0, 0, 0, 0])
    W = self.W_ij.data().tile(reps=[self.batch_size, 1, 1, 1, 1])
    assert W.shape == (self.batch_size, 1152, 10, 8, 16)

    # [8, 16].T x [8, 1] => [16, 1]
    x_dot = x_trans.reshape(shape=(-1, self.input_dim_vector, 1))             # (8, 1)
    W_dot = W.reshape(shape=(-1, self.input_dim_vector, self.dim_vector))     # (8, 16)
    u_hat = nd.batch_dot(W_dot, x_dot, transpose_a=True)
    u_hat = u_hat.reshape(shape=(self.batch_size, self.input_num_capsule,
                                 self.num_capsule, self.dim_vector, -1))
    assert u_hat.shape == (self.batch_size, 1152, 10, 16, 1)

    b_IJ = nd.zeros(
        (self.batch_size, self.input_num_capsule, self.num_capsule, 1, 1),
        ctx=self.context)
    assert b_IJ.shape == (self.batch_size, 1152, 10, 1, 1)

    u_hat_stopped = nd.stop_gradient(u_hat, name='stop_gradient')

    for r_iter in range(self.iter_routing):
        c_IJ = nd.softmax(b_IJ, axis=2)

        s_J = nd.multiply(c_IJ, u_hat)
        s_J = s_J.sum(axis=1, keepdims=True)
        # print('s_J', s_J[0, 0, 0])
        assert s_J.shape == (self.batch_size, 1, 10, 16, 1)

        v_J = self.squash(s_J, axis=3)
        assert v_J.shape == (self.batch_size, 1, 10, 16, 1)

        v_J_tiled = v_J.tile(reps=[1, 1152, 1, 1, 1])
        if self.iter_routing > 1:
            # u_hat_stopped (self.batch_size, 1152, 10, 16, 1)
            # v_J_tiled     (self.batch_size, 1152, 10, 16, 1)
            # u_hat_stopped = u_hat_stopped.reshape(shape=(-1, self.dim_vector, 1))
            # v_J_tiled = v_J_tiled.reshape(shape=(-1, self.dim_vector, 1))
            # Agreement between the prediction vectors and the output capsules:
            # element-wise product summed over the capsule (dim_vector) axis.
            u_produce_v = nd.stop_gradient(
                nd.sum(nd.multiply(u_hat_stopped, v_J_tiled),
                       axis=3, keepdims=True))
            assert u_produce_v.shape == (self.batch_size, 1152, 10, 1, 1)
            b_IJ = nd.stop_gradient(b_IJ + u_produce_v, name="update_b_IJ")

    # (batch_size, 1, 10, 16, 1)
    assert v_J.shape == (self.batch_size, 1, self.num_capsule, self.dim_vector, 1)
    # print('v_J', v_J[0, 0, 0, 0])
    return v_J