def evals(net, adata, alabel, batch_size):
    hidden = net.begin_state(func=mx.nd.zeros, batch_size=batch_size, ctx=mx.cpu())
    dataLoader = DataLoader(adata, alabel)
    tl = 0
    for data, label in dataLoader.dataIter(batch_size):
        label = nd.array(label)
        # label = nd.ones(shape=(5, batch_size)) * label
        # label = label.reshape((-1,))
        dd = nd.array(data.reshape((batch_size, 5, 11)).swapaxes(0, 1))
        # hidden = detach(hidden)
        output, hidden = net(dd, hidden)
        output = output.reshape((5, batch_size, 1))
        output = nd.sum(output, axis=0) / 5
        lv = loss(output, label)
        tl += nd.sum(lv).asscalar()
    return tl / len(adata)
def grad_clipping(params, theta, ctx):
    """Gradient clipping."""
    if theta is not None:
        norm = nd.array([0.0], ctx)
        for p in params:
            norm += nd.sum(p.grad ** 2)
        norm = nd.sqrt(norm).asscalar()
        if norm > theta:
            for p in params:
                p.grad[:] *= theta / norm
def grad_clipping(params, clipping_norm, ctx):
    """Gradient clipping."""
    if clipping_norm is not None:
        norm = nd.array([0.0], ctx)
        for p in params:
            norm += nd.sum(p.grad ** 2)
        norm = nd.sqrt(norm).asscalar()
        if norm > clipping_norm:
            for p in params:
                p.grad[:] *= clipping_norm / norm
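# A minimal usage sketch for grad_clipping above (hypothetical parameters and
# loss, not from the original source): after backward(), all gradients are
# rescaled in place so that their global L2 norm does not exceed the threshold.
import mxnet as mx
from mxnet import nd, autograd

w = nd.random.randn(4, 4)
b = nd.zeros(4)
params = [w, b]
for p in params:
    p.attach_grad()
with autograd.record():
    loss = nd.sum(nd.dot(nd.ones((1, 4)), w) + b)
loss.backward()
grad_clipping(params, 1.0, mx.cpu())  # global gradient norm is now <= 1.0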
def evaluate(ctx, net, data_loader):
    n, total_succ = 0, 0
    for X, y_expect in data_loader:
        X, y_expect = X.as_in_context(ctx), y_expect.as_in_context(ctx)
        y = net(X)
        total_succ += nd.sum(y.argmax(axis=1) == y_expect).asscalar()
        n += X.shape[0]
    return 100 * total_succ / n
def predict(net, data, label):
    data = nd.array(data)
    label = nd.array(label)
    hidden = net.begin_state(func=mx.nd.zeros, batch_size=data.shape[0], ctx=mx.cpu())
    dd = nd.array(data.reshape((data.shape[0], 5, 11)).swapaxes(0, 1))
    output, hidden = net(dd, hidden)
    output = output.reshape((5, data.shape[0], 1))
    output = nd.sum(output, axis=0) / 5
    l = nd.argmax(output, axis=1)
    res = nd.mean(l == label)
    return res.asscalar()
def forward(self, X, lrp_aware=False):
    '''
    Realizes the forward pass of an input through the convolution layer.

    Parameters
    ----------
    X : mxnet.ndarray.ndarray.NDArray
        a network input, shaped (N,H,W,D), with
        N = batch size
        H, W, D = input size in height, width, depth

    lrp_aware : bool
        controls whether the forward pass is to be computed with awareness for
        multiple following LRP calls. this will sacrifice speed in the forward
        pass but will save time if multiple LRP calls will follow for the
        current X, e.g. with different parameter settings or for multiple
        target classes.

    Returns
    -------
    Y : mxnet.ndarray.ndarray.NDArray
        the layer outputs.
    '''
    self.lrp_aware = lrp_aware
    self.X = X
    N, H, W, D = X.shape

    hf, wf, df, nf = self.W.shape
    hstride, wstride = self.stride
    numfilters = self.n

    # assume the given pooling and stride parameters are carefully chosen.
    Hout = (H - hf) // hstride + 1
    Wout = (W - wf) // wstride + 1

    # initialize pooled output
    self.Y = nd.zeros((N, Hout, Wout, numfilters), ctx=self.ctx, dtype=self.dtype)

    if self.lrp_aware:
        # initialize container for precomputed forward messages
        self.Z = nd.zeros((N, Hout, Wout, hf, wf, df, nf), ctx=self.ctx, dtype=self.dtype)
        for i in range(Hout):
            for j in range(Wout):
                # N, hf, wf, df, nf
                self.Z[:, i, j, ...] = nd.expand_dims(self.W, axis=0) * \
                    nd.expand_dims(self.X[:, i*hstride:i*hstride+hf, j*wstride:j*wstride+wf, :], axis=4)
                self.Y[:, i, j, :] = self.Z[:, i, j, ...].sum(axis=(1, 2, 3)) + self.B
    else:
        for i in range(Hout):
            for j in range(Wout):
                self.Y[:, i, j, :] = nd.sum(
                    nd.expand_dims(X[:, i*hstride:i*hstride+hf, j*wstride:j*wstride+wf, :].transpose((1, 2, 3, 0)), 4) *
                    nd.expand_dims(self.W, 3),
                    axis=(0, 1, 2)) + self.B

    return self.Y
def forward(self, cls_pred, box_pred, cls_target, box_target):
    """Compute loss in entire batch across devices."""
    # require results across different devices at this time
    cls_pred, box_pred, cls_target, box_target = [
        _as_list(x) for x in (cls_pred, box_pred, cls_target, box_target)]
    # cross device reduction to obtain positive samples in entire batch
    num_pos = []
    for cp, bp, ct, bt in zip(*[cls_pred, box_pred, cls_target, box_target]):
        pos_samples = (ct > 0)
        num_pos.append(pos_samples.sum())
    num_pos_all = sum([p.asscalar() for p in num_pos])
    if num_pos_all < 1:
        # no positive samples found, return dummy losses
        return nd.zeros((1,)), nd.zeros((1,)), nd.zeros((1,))

    # compute element-wise cross entropy loss and sort, then perform negative mining
    cls_losses = []
    box_losses = []
    sum_losses = []
    for cp, bp, ct, bt in zip(*[cls_pred, box_pred, cls_target, box_target]):
        pred = nd.log_softmax(cp, axis=-1)
        pos = ct > 0
        cls_loss = -nd.pick(pred, ct, axis=-1, keepdims=False)
        rank = (cls_loss * (pos - 1)).argsort(axis=1).argsort(axis=1)
        hard_negative = rank < (pos.sum(axis=1) * self._negative_mining_ratio).expand_dims(-1)
        # mask out if not positive or negative
        cls_loss = nd.where((pos + hard_negative) > 0, cls_loss, nd.zeros_like(cls_loss))
        cls_losses.append(nd.sum(cls_loss, axis=0, exclude=True) / num_pos_all)

        bp = _reshape_like(nd, bp, bt)
        box_loss = nd.abs(bp - bt)
        box_loss = nd.where(box_loss > self._rho, box_loss - 0.5 * self._rho,
                            (0.5 / self._rho) * nd.square(box_loss))
        # box loss only apply to positive samples
        box_loss = box_loss * pos.expand_dims(axis=-1)
        box_losses.append(nd.sum(box_loss, axis=0, exclude=True) / num_pos_all)
        sum_losses.append(cls_losses[-1] + self._lambd * box_losses[-1])

    return sum_losses, cls_losses, box_losses
def stats_batchwise(x_bat, y_bat, n, x_mean, y_mean, x_var=None, y_var=None,
                    xx_cov=None, yy_cov=None, xy_cov=None,
                    x_mean_skip=False, y_mean_skip=False):
    m = x_bat.shape[0]
    x_bat_mean = x_bat.mean(axis=0, keepdims=True)
    y_bat_mean = y_bat.mean(axis=0, keepdims=True)
    dx = x_bat - x_bat_mean
    dy = y_bat - y_bat_mean

    if x_var is not None:
        x_bat_var = nd.sum(dx**2, axis=0)
        x_var += x_bat_var + ((x_mean - x_bat_mean)**2) * n * m / (n+m)

    if y_var is not None:
        y_bat_var = nd.sum(dy**2, axis=0)
        y_var += y_bat_var + ((y_mean - y_bat_mean)**2) * n * m / (n+m)

    if xx_cov is not None:
        xx_bat_cov = nd.dot(dx, dx, transpose_a=True)
        xx_cov += xx_bat_cov + nd.dot((x_mean - x_bat_mean), (x_mean - x_bat_mean),
                                      transpose_a=True) * n * m / (n+m)

    if yy_cov is not None:
        yy_bat_cov = nd.dot(dy, dy, transpose_a=True)
        yy_cov += yy_bat_cov + nd.dot((y_mean - y_bat_mean), (y_mean - y_bat_mean),
                                      transpose_a=True) * n * m / (n+m)

    if xy_cov is not None:
        xy_bat_cov = nd.dot(dy, dx, transpose_a=True)
        xy_cov += xy_bat_cov + nd.dot((y_mean - y_bat_mean), (x_mean - x_bat_mean),
                                      transpose_a=True) * n * m / (n+m)

    if not x_mean_skip:
        x_mean = (n * x_mean + m * x_bat_mean) / (n+m)
    if not y_mean_skip:
        y_mean = (n * y_mean + m * y_bat_mean) / (n+m)
    n += m

    return n, x_mean, y_mean, x_var, y_var, xx_cov, yy_cov, xy_cov
def evaluate_accuracy(data_iterator, net, ctx=[mx.cpu()]):
    if isinstance(ctx, mx.Context):
        ctx = [ctx]
    acc = nd.array([0])
    n = 0.
    if isinstance(data_iterator, mx.io.MXDataIter):
        data_iterator.reset()
    for batch in data_iterator:
        data, label, batch_size = _get_batch(batch, ctx)
        for X, y in zip(data, label):
            acc += nd.sum(net(X).argmax(axis=1) == y).copyto(mx.cpu())
            n += y.size
        acc.wait_to_read()  # don't push too many operators into backend
    return acc.asscalar() / n
def train_one_epoch(ctx, net, floss, trainer, data_loader):
    n, total_loss, total_succ = 0, 0, 0
    for X, y_expect in data_loader:
        X, y_expect = X.as_in_context(ctx), y_expect.as_in_context(ctx)
        with autograd.record():
            y = net(X)
            loss = floss(y, y_expect)
        loss.backward()
        trainer.step(X.shape[0])
        total_loss += loss.sum().asscalar()
        total_succ += nd.sum(y.argmax(axis=1) == y_expect).asscalar()
        n += X.shape[0]
    return 100 * total_succ / n, total_loss / n
def _spectral_norm(self):
    """ spectral normalization """
    w = self.params.get('weight').data(self.ctx)
    w_mat = nd.reshape(w, [w.shape[0], -1])

    _u = self.u.data(self.ctx)
    _v = None

    for _ in range(POWER_ITERATION):
        _v = nd.L2Normalization(nd.dot(_u, w_mat))
        _u = nd.L2Normalization(nd.dot(_v, w_mat.T))

    sigma = nd.sum(nd.dot(_u, w_mat) * _v)
    if sigma == 0.:
        sigma = EPSILON

    self.params.setattr('u', _u)

    return w / sigma
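# A minimal standalone sketch of the power iteration used in _spectral_norm
# above (spectral_norm_estimate is a hypothetical helper, not part of the
# original source): estimate the largest singular value of a matrix and
# compare it against numpy's SVD.
import numpy as np
import mxnet as mx
from mxnet import nd

def spectral_norm_estimate(w_mat, n_iter=30):
    u = nd.random.randn(1, w_mat.shape[0])
    for _ in range(n_iter):
        v = nd.L2Normalization(nd.dot(u, w_mat))      # v -> top right singular vector
        u = nd.L2Normalization(nd.dot(v, w_mat.T))    # u -> top left singular vector
    return nd.sum(nd.dot(u, w_mat) * v).asscalar()    # sigma = u^T W v

w = nd.random.randn(8, 5)
print(spectral_norm_estimate(w))         # power-iteration estimate
print(np.linalg.svd(w.asnumpy())[1][0])  # largest singular value, should match closely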
def batchwise_covariance(X, Y):
    meanx = meany = vary = n = C = 0
    for x, y in zip(X, Y):
        m = len(x)
        meanx_ = x.mean(axis=0, keepdims=True)
        meany_ = y.mean(axis=0, keepdims=True)
        dx = x - meanx_
        dy = y - meany_

        C_ = nd.dot(dx, dy, transpose_a=True)
        C += C_ + nd.dot((meanx - meanx_), (meany - meany_), transpose_a=True) * n * m / (n+m)

        vary_ = nd.sum(dy**2, axis=0)
        vary += vary_ + ((meany - meany_)**2) * n * m / (n+m)

        meanx = (n * meanx + m * meanx_) / (n+m)
        meany = (n * meany + m * meany_) / (n+m)
        n += m
    return C / n, vary / n
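# A minimal check for batchwise_covariance above (hypothetical data, not from
# the original source): feeding the data in two chunks should reproduce the
# cross-covariance computed over the full batch in a single pass.
import mxnet as mx
from mxnet import nd

x = nd.random.randn(100, 3)
y = nd.random.randn(100, 2)

C, vary = batchwise_covariance([x[:60], x[60:]], [y[:60], y[60:]])

dx = x - x.mean(axis=0, keepdims=True)
dy = y - y.mean(axis=0, keepdims=True)
C_full = nd.dot(dx, dy, transpose_a=True) / 100
print(nd.max(nd.abs(C - C_full)))  # should be ~0 up to float error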
def update(self, lrate):
    N, Hx, Wx, Dx = self.X.shape
    N, Hy, Wy, NF = self.DY.shape

    hf, wf, df, NF = self.W.shape
    hstride, wstride = self.stride

    DW = nd.zeros_like(self.W, ctx=self.ctx, dtype=self.dtype)

    if not (hf == wf and self.stride == (1, 1)):
        for i in range(Hy):
            for j in range(Wy):
                DW += (nd.expand_dims(self.X[:, i*hstride:i*hstride+hf, j*wstride:j*wstride+wf, :], axis=4) *
                       nd.expand_dims(self.DY[:, i:i+1, j:j+1, :], axis=3)).sum(axis=0)
    else:
        for i in range(hf):
            for j in range(wf):
                DW[i, j, :, :] = nd.sum(
                    nd.expand_dims(self.X[:, i:i+Hy:hstride, j:j+Wy:wstride, :], axis=4) *
                    nd.expand_dims(self.DY, axis=3),
                    axis=(0, 1, 2))

    DB = self.DY.sum(axis=(0, 1, 2))
    self.W -= lrate * DW / (hf*wf*df*Hy*Wy)**.5
    self.B -= lrate * DB / (Hy*Wy)**.5
def forward(self, x):
    x = nd.sqrt(nd.sum(nd.square(x), 2))
    return x
def cross_entropy(yhat, y):
    return -nd.mean(nd.sum(y * nd.log(yhat), axis=0, exclude=True))
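# A small worked example for cross_entropy above (hypothetical values, not
# from the original source). With axis=0, exclude=True the sum runs over every
# axis except the batch axis, so each row of y is expected to be a one-hot (or
# soft) label distribution over classes.
from mxnet import nd

yhat = nd.array([[0.7, 0.2, 0.1],
                 [0.1, 0.8, 0.1]])  # predicted class probabilities
y = nd.array([[1, 0, 0],
              [0, 1, 0]])           # one-hot labels
print(cross_entropy(yhat, y))       # = -(log 0.7 + log 0.8) / 2 ~= 0.290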
smoothing_constant = .01
num_examples = 60000

for e in range(epochs):
    cumulative_loss = 0
    for i, (data, label) in enumerate(train_iter):
        # ==== note the difference in raw data input shape ====
        # use 4d tensor (batch_size, 1, 28, 28)
        data = data.as_in_context(model_ctx)
        label = label.as_in_context(model_ctx)
        with autograd.record():
            output = net(data)
            loss = softmax_cross_entropy(output, label)
        loss.backward()
        trainer.step(data.shape[0])
        cumulative_loss += nd.sum(loss).asscalar()

    test_accuracy = evaluate_accuracy(test_iter, net)
    train_accuracy = evaluate_accuracy(train_iter, net)
    print("Epoch %s. Loss: %s, Train_acc %s, Test_acc %s" %
          (e, cumulative_loss / num_examples, train_accuracy, test_accuracy))

# show test image
plt.imshow(test_img[0], cmap='Greys_r')
plt.axis('off')
plt.show()

# make prediction
output = net(test_img[0:1])
print(str(np.asscalar(nd.argmax(output, axis=1).asnumpy().astype(np.int8))))
def test_ndarray_ones():
    a = nd.ones(shape=(LARGE_X, SMALL_Y))
    assert a[-1][0] == 1
    assert nd.sum(a).asnumpy() == LARGE_SIZE
def __Htransfer(self, e, wr):
    norm_wr = wr / wr.norm(axis=1, keepdims=True)
    return e - nd.sum(e * norm_wr, axis=1, keepdims=True) * norm_wr
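# A minimal check of the hyperplane projection above (TransH-style): the
# projected embedding should be orthogonal to the unit normal vector of the
# relation hyperplane. Toy values, not from the original source.
from mxnet import nd

e = nd.array([[1.0, 2.0, 3.0]])
wr = nd.array([[0.0, 0.0, 2.0]])
norm_wr = wr / wr.norm(axis=1, keepdims=True)
proj = e - nd.sum(e * norm_wr, axis=1, keepdims=True) * norm_wr
print(proj)                            # [[1. 2. 0.]]
print(nd.sum(proj * norm_wr, axis=1))  # ~0: projection lies in the hyperplane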
def test_op(self, num_samples=None, num_epochs=None, reset=True, dataset='test'):
    '''
    Evaluates the model using num_samples.

    Args
    ----
    num_samples: integer, default None
        The number of samples to evaluate on. This is converted to
        evaluating on (num_samples // batch_size) minibatches.
    num_epochs: integer, default None
        The number of epochs to evaluate on. This is used if num_samples
        is not specified. If neither is specified, defaults to 1 epoch.
    reset: bool, default True
        Whether to reset the test data index to 0 before iterating
        through and evaluating on minibatches.
    dataset: string, default 'test'
        Which dataset to evaluate on: 'valid' or 'test'.

    Returns
    -------
    u_loss: float
        The reconstruction loss on the unlabeled data ('NA' if there is none).
    l_loss: float
        The loss on the labeled data.
    l_acc: float
        The accuracy on the labeled data.
    '''
    batch_size = self.args['batch_size']
    model_ctx = self.model_ctx

    if num_samples is None and num_epochs is None:
        # assume full dataset evaluation
        num_epochs = 1

    if reset:
        # Reset Data to Index Zero
        if self.data.data[dataset] is not None:
            self.data.force_reset_data(dataset)
        if self.data.data[dataset + '_with_labels'] is not None:
            self.data.force_reset_data(dataset + '_with_labels')

    # Unlabeled Data
    u_loss = 'NA'
    u_eval = []
    if self.data.data[dataset] is not None:
        u_loss = 0
        if num_samples is None:
            num_samps = self.data.data[dataset].shape[0] * num_epochs
        else:
            num_samps = num_samples
        batches = int(np.ceil(num_samps / self.args['batch_size']))
        batch_iter = range(batches)
        if batches > 1:
            batch_iter = tqdm(batch_iter, desc='unlabeled')
        for batch in batch_iter:
            # 1. Retrieve data
            docs = self.data.get_documents(key=dataset)
            if self.args['use_kd']:
                split_on = docs.shape[1] // 2
                docs, bert_logits = docs[:, :split_on], docs[:, split_on:]
                # TODO: below is not used, but also may not be necessary
                t = self.args['kd_softmax_temp']
                kd_docs = nd.softmax(bert_logits / t) * nd.sum(docs, axis=1, keepdims=True)

            # 2. Compute loss
            y_u = self.Enc(docs)
            y_onehot_u_softmax = nd.softmax(y_u)
            x_reconstruction_u = self.Dec(y_onehot_u_softmax)
            logits = nd.log_softmax(x_reconstruction_u)
            loss_recon_unlabel = nd.sum(-docs * logits, axis=1)

            # 3. Convert to numpy
            u_loss += nd.mean(loss_recon_unlabel).asscalar()
        u_loss /= batches

    # Labeled Data
    l_loss = 0.0
    l_acc = 0.0
    if self.data.data[dataset + '_with_labels'] is not None:
        l_loss = 0
        if num_samples is None:
            num_samps = self.data.data[dataset + '_with_labels'].shape[0] * num_epochs
        else:
            num_samps = num_samples
        batches = int(np.ceil(num_samps / self.args['batch_size']))
        batch_iter = range(batches)
        if batches > 1:
            batch_iter = tqdm(batch_iter, desc='labeled')
        softmaxCEL = gluon.loss.SoftmaxCrossEntropyLoss(sparse_label=False)
        for batch in batch_iter:
            # 1. Retrieve data
            labeled_docs, labels = self.data.get_documents(
                key=dataset + '_with_labels', split_on=self.data.data_dim)

            # 2. Compute loss (encode the labeled documents here, not the
            # unlabeled `docs` from the block above)
            y_u = self.Enc(labeled_docs)
            y_onehot_u_softmax = nd.softmax(y_u)
            class_pred = nd.argmax(y_onehot_u_softmax, axis=1)
            l_a = labels[list(range(labels.shape[0])), class_pred]
            l_acc += nd.mean(l_a).asscalar()
            labels = labels / nd.sum(labels, axis=1, keepdims=True)
            l_l = softmaxCEL(y_onehot_u_softmax, labels)

            # 3. Convert to numpy
            l_loss += nd.mean(l_l).asscalar()
        l_loss /= batches
        l_acc /= batches
    return u_loss, l_loss, l_acc
def train_and_predict_rnn(rnn, is_random_iter, epochs, num_steps, hidden_dim,
                          learning_rate, clipping_norm, batch_size,
                          pred_period, pred_len, seqs, get_params, get_inputs,
                          ctx, corpus_indices, idx_to_char, char_to_idx,
                          is_lstm=False):
    """Train an RNN model and predict the next item in the sequence."""
    if is_random_iter:
        data_iter = data_iter_random
    else:
        data_iter = data_iter_consecutive
    params = get_params()

    softmax_cross_entropy = gluon.loss.SoftmaxCrossEntropyLoss()

    for e in range(1, epochs + 1):
        # If consecutive sampling is used, in the same epoch, the hidden state
        # is initialized only at the beginning of the epoch.
        if not is_random_iter:
            state_h = nd.zeros(shape=(batch_size, hidden_dim), ctx=ctx)
            if is_lstm:
                state_c = nd.zeros(shape=(batch_size, hidden_dim), ctx=ctx)
        train_loss, num_examples = 0, 0
        for data, label in data_iter(corpus_indices, batch_size, num_steps, ctx):
            # If random sampling is used, the hidden state has to be
            # initialized for each mini-batch.
            if is_random_iter:
                state_h = nd.zeros(shape=(batch_size, hidden_dim), ctx=ctx)
                if is_lstm:
                    state_c = nd.zeros(shape=(batch_size, hidden_dim), ctx=ctx)
            with autograd.record():
                # outputs shape: (batch_size, vocab_size)
                if is_lstm:
                    outputs, state_h, state_c = rnn(get_inputs(data), state_h,
                                                    state_c, *params)
                else:
                    outputs, state_h = rnn(get_inputs(data), state_h, *params)
                # Let t_ib_j be the j-th element of the mini-batch at time i.
                # label shape: (batch_size * num_steps)
                # label = [t_0b_0, t_0b_1, ..., t_1b_0, t_1b_1, ..., ].
                label = label.T.reshape((-1,))
                # Concatenate outputs:
                # shape: (batch_size * num_steps, vocab_size).
                outputs = nd.concat(*outputs, dim=0)
                # Now outputs and label are aligned.
                loss = softmax_cross_entropy(outputs, label)
            loss.backward()

            grad_clipping(params, clipping_norm, ctx)
            SGD(params, learning_rate)

            train_loss += nd.sum(loss).asscalar()
            num_examples += loss.size

        if e % pred_period == 0:
            print("Epoch %d. Training perplexity %f" % (e, exp(train_loss / num_examples)))
            for seq in seqs:
                print(' - ', predict_rnn(rnn, seq, pred_len, params, hidden_dim,
                                         ctx, idx_to_char, char_to_idx,
                                         get_inputs, is_lstm))
            print()
def log_loss(output, y):
    yhat = logistic(output)
    return -nd.sum(y * nd.log(yhat) + (1 - y) * nd.log(1 - yhat))
def test_net_paras_copy():
    net1 = get_net(10)
    net2 = get_net(10)
    x = nd.arange(2880).reshape((3, 3, 20, 16))
    net1(x)
    net2(x)
    ps1 = net1.collect_params()
    ps2 = net2.collect_params()
    print(str(net1))
    print('----------------------------------')
    print(str(net2))
    print('++++++++++++++++++++++')
    print(ps1)
    print('----------------------------------')
    print(ps2)
    print('++++++++++++++++++++++')
    print(net1.prefix)
    print('----------------------------------')
    print(net2.prefix)
    print('++++++++++++++++++++++')
    prefix_length = len(net2.prefix)
    print(ps1.keys())
    print('----------------------------------')
    print(ps2.keys())
    print('++++++++++++++++++++++')
    copy_params(net1, net2)
    print('++++++++++++++++++++++')
    print(net1.collect_params().values())
    print('----------------------------------')
    print(net2.collect_params().values())
    print('++++++++++++++++++++++')
    # print(net1.collect_params().items())
    # print('----------------------------------')
    # print(net2.collect_params().items())
    # print('++++++++++++++++++++++')
    # net2.collect_params().update(net1.collect_params())
    # net2[0].collect_params().update(net1[0].collect_params())
    # print(net1[0].collect_params())
    # print('----------------------------------')
    # print(net2[0].collect_params())
    # print('++++++++++++++++++++++')
    # net2[0].collect_params().update(net1[0].collect_params())

    # after copying, every weight difference should sum to zero
    print(nd.sum(net1[0].weight.data() - net2[0].weight.data()).asnumpy())
    print(nd.sum(net1[1].weight.data() - net2[1].weight.data()).asnumpy())
    print(nd.sum(net1[2].weight.data() - net2[2].weight.data()).asnumpy())
    print(nd.sum(net1[4].weight.data() - net2[4].weight.data()).asnumpy())
    print(nd.sum(net1[5].weight.data() - net2[5].weight.data()).asnumpy())
    print('----------------------------------')
def all(tensor):
    # returns the number of non-zero entries, so it is truthy iff any element
    # is non-zero (note: shadows the Python builtin `all`)
    return nd.sum(tensor != 0).asscalar()
def softmax(y_linear):
    exp = nd.exp(y_linear - nd.max(y_linear))
    partition = nd.sum(exp, axis=0, exclude=True).reshape((-1, 1))
    return exp / partition
def test_reduce():
    a = nd.ones(shape=(LARGE_X, 1))
    assert nd.sum(a).asnumpy() == a.shape[0] * a.shape[1]
params = [W, W0]

# Track the gradients of the parameters
for parameter in params:
    parameter.attach_grad()

# Execute training loop using SGD
for E in range(epochs):
    total_loss = 0
    for i, (xtrain, ytrain) in enumerate(train_data):
        xtrain = xtrain.as_in_context(cntx).reshape((-1, 784))
        ytrain = ytrain.as_in_context(cntx)
        ylabel_flag = nd.one_hot(ytrain, 5)
        with autograd.record():
            y_out = aux.nnet(xtrain, W, W0)
            loss = aux.cross_ent(y_out, ylabel_flag)
        loss.backward()
        params = aux.SGD(params, learn_rate)
        total_loss += nd.sum(loss).asscalar()

    # Evaluate model on training data
    train_accuracy = aux.compute_accuracy(train_data, aux.nnet, params, cntx)
    # Evaluate model on testing data
    test_accuracy = aux.compute_accuracy(test_data, aux.nnet, params, cntx)
    print("Epoch %s. Loss: %s, Train Accuracy: %s, Test Accuracy: %s" %
          (E, total_loss / m_cases, train_accuracy, test_accuracy))

# Save trained parameters
aux.save_mnist(params)
def train(self, train_data, log_folder, params_folder, epochs, batch_size, ctx,
          init_lr, lr_step=5, lr_factor=0.1):
    """
    Train network.
    :param train_data: Data and Label for training. Instance of tuple(dict).
        - valid_keys: valid keys of current category of clothes.
        - images: Instance of tuple. All images info of current category:
            - orig_images_id: Instance of list. (image_count)
            - orig_images_shape: Instance of np.array. (image_count, orig_h, orig_w)
            - orig_keypoints: Instance of np.array. (image_count, keypoints_count, 3)
            - norm_images: Instance of np.array. (image_count, 3, h, w)
            - belief_maps: Instance of np.array. (image_count, keypoints_count, h, w)
            - norm_centermap: Instance of np.array. (h, w)
    :param log_folder: Folder holds log files.
    :param params_folder: Folder holds saved params.
    :param epochs:
    :param batch_size:
    :param ctx: Instance of list.
    :param init_lr: Initial learning rate.
    :param lr_step: Epoch interval between learning rate decays.
    :param lr_factor: Multiplicative learning rate decay factor.
    :return:
    """
    logging.basicConfig(level=logging.INFO,
                        handlers=[logging.StreamHandler(),
                                  logging.FileHandler(log_folder + 'train_' + self._name +
                                                      '_batch_' + str(epochs) + '_' + str(batch_size))])

    # 1. check params files and get last epoch and batch
    epoch_index, batch_index, file = self.utils_params_file(batch_size, 'check', params_folder)
    # (1) begin a new training
    if epoch_index == -1 and batch_index == -1:
        logging.info("No params files detected. Begin a new training.")
        self.initialize(mx.init.Xavier(magnitude=2.34), ctx=ctx)
        epoch_index = 0
        batch_index = 0
    # (2) resume training from params file
    else:
        logging.info("Params file '%s' detected. Last (epoch, batch): (%d, %d). Resuming training."
                     % (file, epoch_index, batch_index))
        self.collect_params().load(params_folder + file, ctx=ctx)
        batch_index += 1

    # 2. train
    # (1) trainer and loss function for total training mode
    model_trainer = trainer.Trainer(self.collect_params(), 'sgd',
                                    {'learning_rate': init_lr, 'momentum': 0.9, 'wd': 5e-4})
    loss_function = loss.SoftmaxCrossEntropyLoss(sparse_label=False)

    # (2) train each epoch and batch
    for e in range(epoch_index, epochs):
        if e != epoch_index:
            batch_index = 0
        # 1> set learning rate
        model_trainer.set_learning_rate(init_lr * pow(lr_factor, int(e / lr_step)))
        if e % lr_step == 0:
            logging.info('Learning rate now is set to be %.6f' % model_trainer.learning_rate)
        # 2> train batch
        while True:
            # (1) get data
            _, _, orig_images_shape_batch, orig_keypoints_batch, norm_images_batch, \
                norm_center_maps_batch, belief_maps_batch, _ = \
                train_data.get_batch_data(if_data_aug=True, loss_mode='softmax',
                                          batch_index=batch_index, batch_size=batch_size)
            if norm_images_batch is None and norm_center_maps_batch is None and belief_maps_batch is None:
                break
            # (2) split data into multiple GPU
            norm_images_batch_LIST = split_and_load(norm_images_batch, ctx_list=ctx)
            norm_center_maps_batch_LIST = split_and_load(norm_center_maps_batch, ctx_list=ctx)
            belief_maps_batch_LIST = split_and_load(belief_maps_batch, ctx_list=ctx)

            # (3) train total
            pred_beliefMaps_batch = []
            # 1> record auto grad
            with autograd.record():
                # 1. initiate gpu losses
                gpu_losses = []
                # 2. calculate losses on each gpu of each stage
                for norm_images_batch, norm_center_maps_batch, belief_maps_batch in \
                        zip(norm_images_batch_LIST, norm_center_maps_batch_LIST, belief_maps_batch_LIST):
                    # (1) initiate current gpu loss
                    current_gpu_loss = None
                    # (2) network forward
                    pred_beliefMaps = self.forward(input_images=norm_images_batch,
                                                   center_maps=norm_center_maps_batch)
                    for p_b in pred_beliefMaps[-1].asnumpy():
                        pred_beliefMaps_batch.append(p_b)
                    # (3) shape ground-truth belief maps to use softmax loss
                    shaped_gt_beliefMaps = nd.reshape(
                        belief_maps_batch,
                        shape=(belief_maps_batch.shape[0],
                               belief_maps_batch.shape[1],
                               belief_maps_batch.shape[2] * belief_maps_batch.shape[3]))
                    # (4) calculate each and every stage loss on current gpu
                    for stage in range(len(self._block_stage)):
                        # 1> shape predicted belief map of current stage
                        shaped_pred_beliefMap = nd.reshape(
                            pred_beliefMaps[stage],
                            shape=(pred_beliefMaps[stage].shape[0],
                                   pred_beliefMaps[stage].shape[1],
                                   pred_beliefMaps[stage].shape[2] * pred_beliefMaps[stage].shape[3]))
                        # 2> calculate current stage loss on current gpu
                        current_loss = loss_function(shaped_pred_beliefMap, shaped_gt_beliefMaps)
                        # 3> summary
                        current_gpu_loss = current_loss if current_gpu_loss is None \
                            else (current_gpu_loss + current_loss)
                    # (5) append & save
                    gpu_losses.append(current_gpu_loss)
            # 3> backward and update
            for gpu_loss in gpu_losses:
                gpu_loss.backward()
            model_trainer.step(batch_size)
            nd.waitall()
            # 4> calculate batch average loss
            batch_loss = sum([nd.sum(gpu_loss).asscalar() for gpu_loss in gpu_losses]) / \
                (batch_size * len(self._block_stage))
            NE = self.calculate_error(valid_keys=utils.keypoints_order[train_data.category],
                                      category=train_data.category,
                                      predicted_keypoints=self.transform_beliefMaps_into_origKeypoints(
                                          predicted_beliefMaps=np.array(pred_beliefMaps_batch),
                                          orig_images_shape=orig_images_shape_batch),
                                      orig_keypoints=np.array(orig_keypoints_batch))
            # 5> print
            logging.info("Epoch[%d]-Batch[%d] lr: %f. Average loss: %f. NE:%.2f%%"
                         % (e, batch_index, model_trainer.learning_rate, batch_loss, NE * 100))

            # (4) save params with batch info (batch_size, batch_index)
            params_file = self.utils_params_file(operation='generate', batch_size=batch_size,
                                                 epoch_index=e, batch_index=batch_index)
            params_old_file = self.utils_params_file(operation='generate', batch_size=batch_size,
                                                     epoch_index=e, batch_index=batch_index - 1,
                                                     batches=train_data.calc_batches_count(batch_size))
            self.collect_params().save(params_folder + params_file)
            if os.path.exists(params_folder + params_old_file):
                os.remove(params_folder + params_old_file)
            batch_index += 1

    # 3. finish
    logging.info("Training completed.")
def test_ndarray_ones():
    a = nd.ones(shape=LARGE_X)
    assert a[-1] == 1
    assert nd.sum(a).asnumpy() == LARGE_X
def train(train_data, net, loss, ctx, global_step, epoch_step, num_epochs, best_F1=0):
    print("Start training on ", ctx)
    if isinstance(ctx, mx.Context):
        ctx = [ctx]
    for epoch in range(num_epochs):
        # piecewise learning-rate schedule
        if epoch < 50:
            trainer = gluon.Trainer(net.collect_params(), 'adam',
                                    {'learning_rate': 0.001, 'wd': 1e-3})
        elif epoch < 90:
            trainer = gluon.Trainer(net.collect_params(), 'adam',
                                    {'learning_rate': 0.0001, 'wd': 1e-3})
        elif epoch < 120:
            trainer = gluon.Trainer(net.collect_params(), 'adam',
                                    {'learning_rate': 0.00001, 'wd': 1e-3})
        else:
            trainer = gluon.Trainer(net.collect_params(), 'sgd',
                                    {'learning_rate': 0.000001, 'momentum': 0.9, 'wd': 1e-3})
        train_loss, n = 0.0, 0.0
        TP, TN, FP, FN = 0, 0, 0, 0
        start = time()
        for i, batch in enumerate(train_data):
            data, label, batch_size = get_batch(batch, ctx)
            losses = []
            with autograd.record():
                outputs = [net(X) for X in data]
                losses = [loss(yhat, y) for yhat, y in zip(outputs, label)]
            for l in losses:
                l.backward()
                sw.add_scalar(tag='cross_entropy', value=l.mean().asscalar(),
                              global_step=global_step)
                global_step += 1
            train_loss += sum([l.sum().asscalar() for l in losses])
            n += batch_size
            trainer.step(batch_size)
        for data, label in test_data:
            data = data.as_in_context(ctx[0])
            label = label.as_in_context(ctx[0])
            pred = net(data)
            nd.waitall()
            pred = nd.sigmoid(pred)
            pred = (pred > 0.5).reshape((-1, 256, 256))
            TPt = nd.sum(pred * label).asscalar()
            FPt = nd.sum(pred - (pred * label)).asscalar()
            FNt = nd.sum(label - (pred * label)).asscalar()
            TNt = nd.sum((1 - pred) * (1 - label)).asscalar()
            TP += TPt
            FP += FPt
            FN += FNt
            TN += TNt
        ACC = (TP + TN) / (TP + TN + FP + FN + 1e-15)
        TPR = TP / (TP + FN + 1e-15)
        TNR = TN / (FP + TN + 1e-15)
        PPV = TP / (TP + FP + 1e-15)
        F1 = 2 * PPV * TPR / (PPV + TPR + 1e-15)
        sw.add_scalar(tag='test_acc', value=ACC, global_step=epoch_step)
        sw.add_scalar(tag='test_TPR', value=TPR, global_step=epoch_step)
        sw.add_scalar(tag='test_TNR', value=TNR, global_step=epoch_step)
        sw.add_scalar(tag='test_PPV', value=PPV, global_step=epoch_step)
        sw.add_scalar(tag='F1', value=F1, global_step=epoch_step)
        epoch_step += 1
        print('EPOCH', epoch)
        print('test_acc=', ACC)
        print('test_TPR=', TPR)
        print('test_TNR=', TNR)
        print('test_PPV=', PPV)
        print('F1=', F1)
        if F1 > best_F1:
            net.save_parameters('fold2_unet.params')
            best_F1 = F1
        if epoch == 0:
            sw.add_graph(net)
        print('train_loss=', train_loss / n)
        print('time:', time() - start)
    sw.close()
    net.export("mynet", epoch)
def CapLoss(y_pred, y_true):
    # capsule margin loss with m+ = 0.9, m- = 0.1, lambda = 0.5
    L = y_true * nd.square(nd.maximum(0., 0.9 - y_pred)) + \
        0.5 * (1 - y_true) * nd.square(nd.maximum(0., y_pred - 0.1))
    return nd.mean(nd.sum(L, 1))
trainer = gluon.Trainer(net.collect_params(), 'adam', {'learning_rate': .001})

training_loss_vector = []
validation_loss_vector = []
for e in range(epochs):
    cumulative_loss_train = 0
    cumulative_loss_valid = 0
    for i, (data, label) in enumerate(data_iter_loader_train):
        data = data.as_in_context(model_ctx).reshape((-1, 784))
        label = label.as_in_context(model_ctx)
        with autograd.record():
            output = net(data)
            loss_train = softmax_cross_entropy(output, label)
        loss_train.backward()
        trainer.step(data.shape[0])
        cumulative_loss_train += nd.sum(loss_train).asscalar()

    for j, (data, label) in enumerate(data_iter_loader_valid):
        data = data.as_in_context(model_ctx).reshape((-1, 784))
        label = label.as_in_context(model_ctx)
        with autograd.record():
            output = net(data)
            loss_valid = softmax_cross_entropy(output, label)
        cumulative_loss_valid += nd.sum(loss_valid).asscalar()

    print(cumulative_loss_train / 42000, ' *** ', '***', cumulative_loss_valid / 18000)
    validation_accuracy = evaluate_accuracy(data_iter_loader_valid, net)
    train_accuracy = evaluate_accuracy(data_iter_loader_train, net)
    print("Epoch %s , train_acc %s, validation_acc %s" % (e, train_accuracy, validation_accuracy))
    training_loss_vector.append(cumulative_loss_train)
    validation_loss_vector.append(cumulative_loss_valid)
def unlabeled_train_op_adv_combine_add(self, update_enc=True):
    '''
    Trains the GAN model
    '''
    batch_size = self.args['batch_size']
    model_ctx = self.model_ctx
    eps = 1e-10

    ##########################
    ### unsupervised phase ###
    ##########################

    # Retrieve data
    docs = self.data.get_documents(key='train')

    class_true = nd.zeros(batch_size, dtype='int32', ctx=model_ctx)
    class_fake = nd.ones(batch_size, dtype='int32', ctx=model_ctx)
    loss_reconstruction = nd.zeros((1,), ctx=model_ctx)

    ### adversarial phase ###
    discriminator_z_confidence_true = nd.zeros(shape=(1,), ctx=model_ctx)
    discriminator_z_confidence_fake = nd.zeros(shape=(1,), ctx=model_ctx)
    discriminator_y_confidence_true = nd.zeros(shape=(1,), ctx=model_ctx)
    discriminator_y_confidence_fake = nd.zeros(shape=(1,), ctx=model_ctx)
    loss_discriminator = nd.zeros(shape=(1,), ctx=model_ctx)
    dirich_entropy = nd.zeros(shape=(1,), ctx=model_ctx)

    ### generator phase ###
    loss_generator = nd.zeros(shape=(1,), ctx=model_ctx)

    ### reconstruction phase ###
    with autograd.record():
        y_u = self.Enc(docs)
        y_onehot_u_softmax = nd.softmax(y_u)
        x_reconstruction_u = self.Dec(y_onehot_u_softmax)

        logits = nd.log_softmax(x_reconstruction_u)
        loss_reconstruction = nd.sum(-docs * logits, axis=1)
        loss_total = loss_reconstruction * self.args['recon_alpha']

        if self.args['adverse']:  # and np.random.rand() < 0.8:
            y_true = np.random.dirichlet(
                np.ones(self.ndim_y) * self.args['dirich_alpha'], size=batch_size)
            y_true = nd.array(y_true, ctx=model_ctx)
            dy_true = self.Dis_y(y_true)
            dy_fake = self.Dis_y(y_onehot_u_softmax)
            discriminator_y_confidence_true = nd.mean(nd.softmax(dy_true)[:, 0])
            discriminator_y_confidence_fake = nd.mean(nd.softmax(dy_fake)[:, 1])
            softmaxCEL = gluon.loss.SoftmaxCrossEntropyLoss()
            loss_discriminator = softmaxCEL(dy_true, class_true) + \
                softmaxCEL(dy_fake, class_fake)
            loss_generator = softmaxCEL(dy_fake, class_true)
            loss_total = loss_total + loss_discriminator + loss_generator
            dirich_entropy = nd.mean(nd.sum(-y_true * nd.log(y_true + eps), axis=1))

    loss_total.backward()

    self.optimizer_enc.step(batch_size)
    self.optimizer_dec.step(batch_size)
    self.optimizer_dis_y.step(batch_size)

    latent_max = nd.zeros(self.args['ndim_y'], ctx=model_ctx)
    for max_ind in nd.argmax(y_onehot_u_softmax, axis=1):
        latent_max[max_ind] += 1.0
    latent_max /= batch_size
    latent_entropy = nd.mean(
        nd.sum(-y_onehot_u_softmax * nd.log(y_onehot_u_softmax + eps), axis=1))
    latent_v = nd.mean(y_onehot_u_softmax, axis=0)

    return nd.mean(loss_discriminator).asscalar(), nd.mean(loss_generator).asscalar(), \
        nd.mean(loss_reconstruction).asscalar(), \
        nd.mean(discriminator_z_confidence_true).asscalar(), \
        nd.mean(discriminator_z_confidence_fake).asscalar(), \
        nd.mean(discriminator_y_confidence_true).asscalar(), \
        nd.mean(discriminator_y_confidence_fake).asscalar(), \
        latent_max.asnumpy(), latent_entropy.asscalar(), latent_v.asnumpy(), \
        dirich_entropy.asscalar()
def softmax(X):
    X_max = nd.max(X, axis=1, keepdims=True)
    X = X - X_max
    exp = nd.exp(X)
    partition = nd.sum(exp, axis=1, keepdims=True)
    return exp / partition
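# A quick check of the max-subtraction trick in softmax above (toy values, not
# from the original source): without it, exp(1000) overflows to inf; with it,
# the result is a valid row-wise probability distribution.
from mxnet import nd

X = nd.array([[1000.0, 1001.0, 1002.0],
              [-1.0, 0.0, 1.0]])
p = softmax(X)
print(p)                  # finite values, no NaN/inf
print(nd.sum(p, axis=1))  # each row sums to 1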
def unlabeled_train_op_mmd_combine(self, update_enc=True):
    '''
    Trains the MMD model
    '''
    batch_size = self.args['batch_size']
    model_ctx = self.model_ctx
    eps = 1e-10

    # Retrieve data
    docs = self.data.get_documents(key='train')
    if self.args['use_kd']:
        split_on = docs.shape[1] // 2
        docs, bert_logits = docs[:, :split_on], docs[:, split_on:]
        t = self.args['kd_softmax_temp']
        kd_docs = nd.softmax(bert_logits / t) * nd.sum(docs, axis=1, keepdims=True)
        kd_docs = kd_docs * (kd_docs > self.args['kd_min_count'])

    y_true = np.random.dirichlet(
        np.ones(self.ndim_y) * self.args['dirich_alpha'], size=batch_size)
    y_true = nd.array(y_true, ctx=model_ctx)

    with autograd.record():
        ### reconstruction phase ###
        y_onehot_u = self.Enc(docs)
        y_onehot_u_softmax = nd.softmax(y_onehot_u)
        if self.args['latent_noise'] > 0:
            y_noise = np.random.dirichlet(
                np.ones(self.ndim_y) * self.args['dirich_alpha'], size=batch_size)
            y_noise = nd.array(y_noise, ctx=model_ctx)
            y_onehot_u_softmax = (1 - self.args['latent_noise']) * y_onehot_u_softmax \
                + self.args['latent_noise'] * y_noise
        x_reconstruction_u = self.Dec(y_onehot_u_softmax)

        if self.args['use_kd']:
            kd_logits = nd.log_softmax(x_reconstruction_u / t)
            logits = nd.log_softmax(x_reconstruction_u)
            kd_loss_reconstruction = nd.mean(nd.sum(-kd_docs * kd_logits, axis=1))
            loss_reconstruction = nd.mean(nd.sum(-docs * logits, axis=1))
            loss_total = self.args['recon_alpha'] * (
                self.args['kd_loss_alpha'] * t * t * kd_loss_reconstruction
                + (1 - self.args['kd_loss_alpha']) * loss_reconstruction)
        else:
            logits = nd.log_softmax(x_reconstruction_u)
            loss_reconstruction = nd.mean(nd.sum(-docs * logits, axis=1))
            loss_total = loss_reconstruction * self.args['recon_alpha']

        ### mmd phase ###
        if self.args['adverse']:
            y_fake = self.Enc(docs)
            y_fake = nd.softmax(y_fake)
            loss_mmd = mmd_loss(y_true, y_fake, ctx_model=model_ctx,
                                t=self.args['kernel_alpha'])
            loss_total = loss_total + loss_mmd
        if self.args['l2_alpha'] > 0:
            loss_total = loss_total + self.args['l2_alpha'] * nd.mean(
                nd.sum(nd.square(y_onehot_u), axis=1))

    loss_total.backward()

    self.optimizer_enc.step(1)
    self.optimizer_dec.step(1)  # self.m.args['batch_size']

    latent_max = nd.zeros(self.args['ndim_y'], ctx=model_ctx)
    for max_ind in nd.argmax(y_onehot_u, axis=1):
        latent_max[max_ind] += 1.0
    latent_max /= batch_size
    latent_entropy = nd.mean(
        nd.sum(-y_onehot_u_softmax * nd.log(y_onehot_u_softmax + eps), axis=1))
    latent_v = nd.mean(y_onehot_u_softmax, axis=0)
    dirich_entropy = nd.mean(nd.sum(-y_true * nd.log(y_true + eps), axis=1))

    if self.args['adverse']:
        loss_mmd_return = loss_mmd.asscalar()
    else:
        loss_mmd_return = 0.0

    return nd.mean(loss_reconstruction).asscalar(), loss_mmd_return, \
        latent_max.asnumpy(), latent_entropy.asscalar(), \
        latent_v.asnumpy(), dirich_entropy.asscalar()
        self.linear1 = nn.Dense(in_units=confidence_C, units=(confidence_C + K_way) // 2,
                                use_bias=True, activation='relu')
        self.linear2 = nn.Dense(units=K_way)

    def forward(self, x):
        x = self.linear1(x)
        x = self.linear2(x)
        # x shape is N*K_way, to pred top_k is the output_label.
        # loss is SoftmaxwithCrossentropy
        return x


if __name__ == '__main__':
    from mxnet.gluon.loss import SigmoidBinaryCrossEntropyLoss
    from mxnet import nd, autograd

    model = Decision_thresh(thresh_size=4)
    model.initialize(init=mx.init.Xavier())
    x = nd.array([[0.1, 0.7, 0.9, 0.4], [0.8, 0.5, 0.8, 0.1]])
    label = nd.array([[0, 1, 1, 0], [1, 0, 0, 0]])
    loss_criterion = SigmoidBinaryCrossEntropyLoss()
    with autograd.record():
        y_pred = model(x)
        loss = loss_criterion(y_pred, label)
    print("loss", nd.sum(loss).asscalar())
    loss.backward()
    print(model.thresh.grad())

    # to test the Decision_topk model to predict the top_k is ground truth
    model2 = Decision_topk(confidence_C=63, K_way=4)
    model2.initialize(init=mx.init.Xavier())
    #x = nd.
def squash(x, axis):
    s_squared_norm = nd.sum(nd.square(x), axis, keepdims=True)
    scale = s_squared_norm / (1 + s_squared_norm) / nd.sqrt(s_squared_norm + 1e-5)
    return scale * x
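# A small check of squash above (toy values, not from the original source):
# the capsule non-linearity keeps each vector's direction but maps its length
# into [0, 1), with long vectors approaching unit length.
from mxnet import nd

v = nd.array([[0.1, 0.0], [10.0, 0.0]])
print(squash(v, axis=1))
# short vector shrinks towards 0, long vector approaches length 1:
print(nd.norm(squash(v, axis=1), axis=1))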
tel = []
for epoch in range(500):
    total_L = 0.0
    hidden = rnn.begin_state(func=mx.nd.zeros, batch_size=batch_size, ctx=mx.cpu())
    for data, label in dataLoader.dataIter(batch_size):
        label = nd.array(label)
        # print("label shape", label.shape)
        # label = nd.ones(shape=(5, 32)) * label
        # label = label.reshape((-1,))
        dd = nd.array(data.reshape((batch_size, 5, 11)).swapaxes(0, 1))
        hidden = detach(hidden)
        with mx.autograd.record():
            output, hidden = rnn(dd, hidden)
            output = output.reshape((5, 256, 1))  # assumes batch_size == 256
            output = nd.sum(output, axis=0) / 5
            # print(output.shape)
            lv = loss(output, label)
        lv.backward()
        grads = [i.grad() for i in rnn.collect_params().values()]
        mx.gluon.utils.clip_global_norm(grads, clipping_norm * num_steps * batch_size)
        trainer.step(batch_size)
        total_L += mx.nd.sum(lv).asscalar()
    test_loss = evals(rnn, c, d, batch_size)
    trl.append(total_L / len(a))
    tel.append(test_loss)
    print("Epoch %d loss %.4f test loss %.4f train acc %.4f test acc %.4f"
          % (epoch, total_L / len(a), test_loss, predict(rnn, a, b), predict(rnn, c, d)))

with open("rnn.csv", 'w', newline='') as f:
    import csv
    writer = csv.writer(f)
    writer.writerows([trl, tel])
def test_reduce():
    a = nd.ones(shape=(LARGE_X, SMALL_Y))
    assert nd.sum(a).asnumpy() == a.shape[0] * a.shape[1]
def softmax(y_linear, temperature=1.0):
    lin = (y_linear - nd.max(y_linear)) / temperature
    exp = nd.exp(lin)
    partition = nd.sum(exp, axis=0, exclude=True).reshape((-1, 1))
    return exp / partition
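# A quick illustration of the temperature parameter above (toy logits, not
# from the original source): higher temperature flattens the distribution,
# lower temperature sharpens it towards argmax.
from mxnet import nd

logits = nd.array([[2.0, 1.0, 0.1]])
print(softmax(logits, temperature=1.0))   # ~[0.66, 0.24, 0.10]
print(softmax(logits, temperature=10.0))  # nearly uniform
print(softmax(logits, temperature=0.1))   # nearly one-hot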
def check_ndarray_ones():
    a = nd.ones(shape=LARGE_X)
    assert a[-1] == 1
    assert nd.sum(a) == LARGE_X
def forward(self, word_inputs, tag_inputs, arc_targets=None, rel_targets=None):
    """Run decoding

    Parameters
    ----------
    word_inputs : mxnet.ndarray.NDArray
        word indices of seq_len x batch_size
    tag_inputs : mxnet.ndarray.NDArray
        tag indices of seq_len x batch_size
    arc_targets : mxnet.ndarray.NDArray
        gold arc indices of seq_len x batch_size
    rel_targets : mxnet.ndarray.NDArray
        gold rel indices of seq_len x batch_size

    Returns
    -------
    tuple
        (arc_accuracy, rel_accuracy, overall_accuracy, loss) when training,
        else if given gold target then return arc_accuracy, rel_accuracy,
        overall_accuracy, outputs, otherwise return outputs, where outputs
        is a list of (arcs, rels).
    """
    is_train = autograd.is_training()

    def flatten_numpy(ndarray):
        """Flatten nd-array to 1-d column vector

        Parameters
        ----------
        ndarray : numpy.ndarray
            input tensor

        Returns
        -------
        numpy.ndarray
            A column vector
        """
        return np.reshape(ndarray, (-1,), 'F')

    batch_size = word_inputs.shape[1]
    seq_len = word_inputs.shape[0]
    mask = np.greater(word_inputs, self._vocab.ROOT).astype(np.float32)
    num_tokens = int(np.sum(mask))  # non padding, non root token number

    if is_train or arc_targets is not None:
        mask_1D = flatten_numpy(mask)
        mask_1D_tensor = nd.array(mask_1D)

    unked_words = np.where(word_inputs < self._vocab.words_in_train,
                           word_inputs, self._vocab.UNK)
    word_embs = self.word_embs(nd.array(unked_words, dtype='int'))
    if self.pret_word_embs:
        word_embs = word_embs + self.pret_word_embs(nd.array(word_inputs))
    tag_embs = self.tag_embs(nd.array(tag_inputs))

    # Dropout
    emb_inputs = nd.concat(word_embs, tag_embs, dim=2)  # seq_len x batch_size

    top_recur = biLSTM(self.f_lstm, self.b_lstm, emb_inputs, batch_size,
                       dropout_x=self.dropout_lstm_input if is_train else 0)
    top_recur = nd.Dropout(data=top_recur, axes=[0], p=self.dropout_mlp)

    W_dep, b_dep = self.mlp_dep_W.data(), self.mlp_dep_b.data()
    W_head, b_head = self.mlp_head_W.data(), self.mlp_head_b.data()
    dep = leaky_relu(nd.dot(top_recur, W_dep.T) + b_dep)
    head = leaky_relu(nd.dot(top_recur, W_head.T) + b_head)
    dep = nd.Dropout(data=dep, axes=[0], p=self.dropout_mlp)
    head = nd.Dropout(data=head, axes=[0], p=self.dropout_mlp)
    dep, head = nd.transpose(dep, axes=[2, 0, 1]), nd.transpose(head, axes=[2, 0, 1])
    dep_arc, dep_rel = dep[:self.mlp_arc_size], dep[self.mlp_arc_size:]
    head_arc, head_rel = head[:self.mlp_arc_size], head[self.mlp_arc_size:]

    W_arc = self.arc_W.data()
    arc_logits = bilinear(dep_arc, W_arc, head_arc, self.mlp_arc_size, seq_len,
                          batch_size, num_outputs=1, bias_x=True, bias_y=False)
    # (#head x #dep) x batch_size
    flat_arc_logits = reshape_fortran(arc_logits, (seq_len, seq_len * batch_size))
    # (#head) x (#dep x batch_size)

    arc_preds = arc_logits.argmax(0)  # seq_len x batch_size

    if is_train or arc_targets is not None:
        correct = np.equal(arc_preds.asnumpy(), arc_targets)
        arc_correct = correct.astype(np.float32) * mask
        arc_accuracy = np.sum(arc_correct) / num_tokens
        targets_1D = flatten_numpy(arc_targets)
        losses = self.softmax_loss(flat_arc_logits, nd.array(targets_1D))
        arc_loss = nd.sum(losses * mask_1D_tensor) / num_tokens

    if not is_train:
        arc_probs = np.transpose(
            np.reshape(nd.softmax(flat_arc_logits, axis=0).asnumpy(),
                       (seq_len, seq_len, batch_size), 'F'))
        # batch_size x #dep x #head

    W_rel = self.rel_W.data()
    rel_logits = bilinear(dep_rel, W_rel, head_rel, self.mlp_rel_size, seq_len,
                          batch_size, num_outputs=self._vocab.rel_size,
                          bias_x=True, bias_y=True)
    # (#head x rel_size x #dep) x batch_size
    flat_rel_logits = reshape_fortran(rel_logits,
                                      (seq_len, self._vocab.rel_size, seq_len * batch_size))
    # (#head x rel_size) x (#dep x batch_size)

    _target_vec = nd.array(targets_1D if is_train
                           else flatten_numpy(arc_preds.asnumpy())).reshape(seq_len * batch_size, 1)
    _target_mat = _target_vec * nd.ones((1, self._vocab.rel_size))

    partial_rel_logits = nd.pick(flat_rel_logits, _target_mat.T, axis=0)
    # (rel_size) x (#dep x batch_size)

    if is_train or arc_targets is not None:
        rel_preds = partial_rel_logits.argmax(0)
        targets_1D = flatten_numpy(rel_targets)
        rel_correct = np.equal(rel_preds.asnumpy(), targets_1D).astype(np.float32) * mask_1D
        rel_accuracy = np.sum(rel_correct) / num_tokens
        losses = self.softmax_loss(partial_rel_logits, nd.array(targets_1D))
        rel_loss = nd.sum(losses * mask_1D_tensor) / num_tokens

    if not is_train:
        rel_probs = np.transpose(
            np.reshape(nd.softmax(flat_rel_logits.transpose([1, 0, 2]), axis=0).asnumpy(),
                       (self._vocab.rel_size, seq_len, seq_len, batch_size), 'F'))
        # batch_size x #dep x #head x #nclasses

    if is_train or arc_targets is not None:
        loss = arc_loss + rel_loss
        correct = rel_correct * flatten_numpy(arc_correct)
        overall_accuracy = np.sum(correct) / num_tokens

    if is_train:
        return arc_accuracy, rel_accuracy, overall_accuracy, loss

    outputs = []
    # parse sentences one by one
    for msk, arc_prob, rel_prob in zip(np.transpose(mask), arc_probs, rel_probs):
        msk[0] = 1.
        sent_len = int(np.sum(msk))
        arc_pred = arc_argmax(arc_prob, sent_len, msk)
        rel_prob = rel_prob[np.arange(len(arc_pred)), arc_pred]
        rel_pred = rel_argmax(rel_prob, sent_len)
        outputs.append((arc_pred[1:sent_len], rel_pred[1:sent_len]))

    if arc_targets is not None:
        return arc_accuracy, rel_accuracy, overall_accuracy, outputs
    return outputs
def get_distance_matrix(x):
    """Get distance matrix given a matrix. Used in testing."""
    square = nd.sum(x ** 2.0, axis=1, keepdims=True)
    distance_square = square + square.transpose() - (2.0 * nd.dot(x, x.transpose()))
    return nd.sqrt(distance_square)
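# A minimal check of get_distance_matrix above (toy points, not from the
# original source): it uses the identity ||a - b||^2 = ||a||^2 + ||b||^2 - 2 a.b,
# so the result should match directly computed pairwise distances. Note that
# float error can make diagonal entries slightly negative before the sqrt.
from mxnet import nd

x = nd.array([[0.0, 0.0], [3.0, 4.0], [6.0, 8.0]])
print(get_distance_matrix(x))
# expected pairwise distances:
# [[ 0.  5. 10.]
#  [ 5.  0.  5.]
#  [10.  5.  0.]]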
def evaluate_accuracy(data_iterator, net, num_inputs, num_outputs):
    total_preds = nd.zeros(num_outputs)
    total_labels = nd.zeros(num_outputs)
    total_overlap = nd.zeros(num_outputs)
    numerator = 0.
    denominator = 0.
    for i, (data, label) in enumerate(data_iterator):
        if isMnist:
            data = data.as_in_context(ctx).reshape((-1, num_inputs))
            label = label.as_in_context(ctx)
            output = net(data)
            predictions = nd.argmax(output, axis=1)
            number_same = nd.sum(predictions == label)
        else:
            data = data.as_in_context(ctx).astype(numpy.float32)
            label = label.as_in_context(ctx).astype(numpy.float32)
            output = net(data)
            soft_output = softmax(output)
            single_prediction_index = nd.argmax(soft_output, axis=1)
            single_label_index = nd.argmax(label, axis=1)
            number_same = nd.sum(single_prediction_index == single_label_index)

        numerator += number_same
        denominator += data.shape[0]

        for l in nd.arange(0, num_outputs):
            if True:  # vectorized per-class counts
                pp = (single_prediction_index == l)
                preds = nd.sum(pp)
                ll = (single_label_index == l)
                labels = nd.sum(ll)
                ss = pp + ll
                overlap = nd.sum(ss > 1)
            else:  # equivalent per-sample loop, kept disabled
                preds = 0
                labels = 0
                overlap = 0
                for j in nd.arange(0, batch_size):
                    if single_prediction_index[j] == l:
                        preds += 1
                    if single_label_index[j] == l:
                        labels += 1
                    if (single_prediction_index[j] == l and single_label_index[j] == l
                            and single_prediction_index[j] == single_label_index[j]):
                        overlap += 1
            total_preds[l] += preds
            total_labels[l] += labels
            total_overlap[l] += overlap

    for l in nd.arange(0, num_outputs):
        recall = (total_overlap[l] / total_labels[l]).asscalar()
        precision = (total_overlap[l] / total_preds[l]).asscalar()
        f1 = 2 * (precision * recall) / (precision + recall)  # harmonic mean
        print('f1: %s, l %s, p %s, l: %s, o: %s .. precision: %s, recall: %s'
              % (f1, l.asscalar(), total_preds[l].asscalar(),
                 total_labels[l].asscalar(), total_overlap[l].asscalar(),
                 precision, recall))
    return (numerator / denominator).asscalar()
def test_sum():
    a = nd.ones(LARGE_X)
    b = nd.sum(a, axis=0)
    assert b[0] == LARGE_X
def accuracy(output, label, batch_size):
    out = nd.argmax(output, axis=1)
    res = nd.sum(nd.equal(out.reshape((-1, 1)), label)) / batch_size
    return res
def analy_model(mask=None, model=None, kernel_size=(1, 3), show=False):
    '''
    builds a curve of the number of parameters in each kernel and extracts the
    top-3 parameters; gets the related mask; obtains the right order of keys
    '''
    from layers.dy_conv import new_conv
    from units import init_sphere
    from layers.sphere_net import SphereNet20
    if mask is None:
        mask = "/home/ldc/PycharmProjects/Dy/log_4dy_Ns3/global.param"
    if model is None:
        model = 'log_4dy_Ns3/spherenet_ft_Ns.model'
    ctx = mxnet.cpu()
    mnet = SphereNet20(my_fun=new_conv, use_bn=False)
    # gammas = init_sphere(mnet, model, ctx)
    # paramers = nd.load(model)
    netMask = {}
    if os.path.exists(mask):
        with open(mask) as f:
            sv = pickle.load(f)
        for k, v in sv.netMask.items():
            netMask[k] = v.as_in_context(ctx)
    all = 0
    static = {}
    paramers = {}
    cal_mask = {}
    loaded = nd.load(model)
    k = loaded.keys()
    keyorder = mnet.collect_params().keys()
    loaded_key = rearrange(target_key=keyorder, needfix_key=k, show=show)
    for idx_key, key in enumerate(keyorder):
        t_k = loaded_key[idx_key]
        value = loaded[t_k]
        if not ('conv' in key and 'weight' in key):
            paramers[key] = value
            continue
        size = value.shape
        output, input = size[:2]
        # name = 'spherenet200_' + '_'.join(key.split('.')[2:])
        ISname = key in netMask.keys()
        masked = netMask[key]
        masked = masked.reshape(size[:2] + (-1,))
        masked = nd.sum(masked, axis=-1)
        static[key] = nd.zeros(output)
        static[key + '_minus'] = nd.zeros(output)
        if all < output:
            all = output
        for i in range(output):
            static[key][i] = nd.sum(masked[i] > 3) / input
            static[key + '_minus'][i] = nd.sum(masked[i] < 3) / input
        static[key] = static[key].sort()
        static[key + '_minus'] = static[key + '_minus'].sort()
        # -------------------- deal with parameters in net ----------------
        N, C, K1, K2 = value.shape
        value_trans = value.reshape(N, C, -1)
        tops_mask = nd.topk(nd.abs(value_trans), k=3, ret_typ='mask')
        tops_idx = nd.topk(tops_mask, k=3, ret_typ='indices')
        value_trans = value_trans.reshape(-1, K1 * K2)
        cal_mask[key] = tops_idx.asnumpy().astype(int)
        tops_idx = tops_idx.reshape(-1, 3)
        out = []
        for x in range(3):
            out.append(value_trans[range(N * C), tops_idx[:, x]])
        paramers[key] = nd.stack(*out).transpose().reshape((N, C) + kernel_size)
        print(key)
    print('analysis loop stop')
    if show:
        from mxboard import SummaryWriter
        sw = SummaryWriter(logdir='sphere_dynamic', flush_secs=20000)
        for j in range(all):
            for k, v in static.items():
                if j >= v.shape[0]:
                    continue
                sw.add_scalar(tag=k, value=v[j].asscalar(), global_step=j)
    return paramers, cal_mask, keyorder
def accuracy(output, label):
    return nd.sum(output.argmax(axis=1) == label).asscalar()
def predict(self, x):
    h = self.e(x[:, 0])
    r = self.r(x[:, 2])
    t = self.e(x[:, 1])
    score = h + r - t
    return nd.sum(score ** 2, axis=1, keepdims=True) ** 0.5
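# The method above scores triples TransE-style: the L2 norm of h + r - t,
# which should be small for a plausible (head, tail, relation) triple. A toy
# check with hand-built embeddings (hypothetical values, not from the
# original source):
from mxnet import nd

h = nd.array([[1.0, 0.0]])
r = nd.array([[0.0, 1.0]])
t = nd.array([[1.0, 1.0]])
score = h + r - t
print(nd.sum(score ** 2, axis=1, keepdims=True) ** 0.5)  # [[0.]] -> triple fits perfectly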
def neglogp(action, mean, logstd):
    assert (mean.shape[-1] == logstd.shape[-1])
    std = nd.exp(logstd) + 1e-8
    return 0.5 * nd.sum(nd.square((action - mean) / std), axis=-1) \
        + 0.5 * np.log(2.0 * np.pi) * action.shape[-1] \
        + nd.sum(logstd, axis=-1)
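# A quick sanity check of neglogp above (toy values, not from the original
# source): for a standard normal (mean 0, logstd 0) evaluated at 0, the
# negative log density per dimension is 0.5 * log(2*pi) ~= 0.9189.
import numpy as np
from mxnet import nd

action = nd.zeros((1, 2))
mean = nd.zeros((1, 2))
logstd = nd.zeros((1, 2))
print(neglogp(action, mean, logstd))  # ~[1.8379] = 2 * 0.5 * log(2*pi)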