def forward(self, src_seq, tgt_seq, src_valid_length=None, tgt_valid_length=None):  #pylint: disable=arguments-differ
    """Generate the prediction given the src_seq and tgt_seq.

    This is used in training an NMT model (teacher forcing).

    Parameters
    ----------
    src_seq : NDArray
    tgt_seq : NDArray
    src_valid_length : NDArray or None
    tgt_valid_length : NDArray or None

    Returns
    -------
    outputs : NDArray
        Shape (batch_size, tgt_length, tgt_word_num)
    additional_outputs : list of list
        Additional outputs of encoder and decoder, e.g, the attention weights
    """
    # Valid lengths must be float32 for the masking ops downstream.
    src_len = nd.cast(src_valid_length, dtype='float32')
    tgt_len = nd.cast(tgt_valid_length, dtype='float32')
    enc_outputs, enc_extra = self.encode(src_seq, valid_length=src_len)
    dec_states = self.decoder.init_state_from_encoder(
        enc_outputs, encoder_valid_length=src_len)
    outputs, _, dec_extra = self.decode_seq(tgt_seq, dec_states, tgt_len)
    return outputs, [enc_extra, dec_extra]
def train(ctx):
    """Train the pose-estimation net on the module-level ``train_data``.

    Parameters
    ----------
    ctx : mx.Context or list of mx.Context
        Device(s) to train on; a single context is wrapped into a list.

    Returns
    -------
    net : gluon.Block
        The trained (module-level) network.
    """
    if isinstance(ctx, mx.Context):
        ctx = [ctx]
    if opt.use_pretrained_base:
        # Backbone arrives pretrained; only the new head layers need init.
        net.deconv_layers.initialize(ctx=ctx)
        net.final_layer.initialize(ctx=ctx)
    else:
        net.initialize(mx.init.MSRAPrelu(), ctx=ctx)

    trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params)

    L = gluon.loss.L2Loss()
    metric = HeatmapAccuracy()

    if opt.mode == 'hybrid':
        net.hybridize(static_alloc=True, static_shape=True)

    for epoch in range(opt.num_epochs):
        loss_val = 0
        tic = time.time()
        btic = time.time()
        metric.reset()

        for i, batch in enumerate(train_data):
            data, label, weight, imgid = train_batch_fn(batch, ctx)

            with ag.record():
                outputs = [net(X.astype(opt.dtype, copy=False)) for X in data]
                # Compute the loss in fp32 for stability, then cast back.
                loss = [nd.cast(L(nd.cast(yhat, 'float32'), y, w), opt.dtype)
                        for yhat, y, w in zip(outputs, label, weight)]
            # One backward over all per-device losses (was a manual
            # `for l in loss: l.backward()` loop) — same gradients,
            # consistent with the sibling train() in this file.
            ag.backward(loss)
            trainer.step(batch_size)

            metric.update(label, outputs)
            loss_val += sum([l.mean().asscalar() for l in loss]) / num_gpus
            if opt.log_interval and not (i + 1) % opt.log_interval:
                metric_name, metric_score = metric.get()
                logger.info('Epoch[%d] Batch [%d]\tSpeed: %f samples/sec\tloss=%f\tlr=%f\t%s=%.3f' % (
                    epoch, i, batch_size * opt.log_interval / (time.time() - btic),
                    loss_val / (i + 1), trainer.learning_rate, metric_name, metric_score))
                btic = time.time()

        time_elapsed = time.time() - tic
        logger.info('Epoch[%d]\t\tSpeed: %d samples/sec over %d secs\tloss=%f\n' % (
            epoch, int(i * batch_size / time_elapsed), int(time_elapsed), loss_val / (i + 1)))
        if save_frequency and save_dir and (epoch + 1) % save_frequency == 0:
            net.save_parameters('%s/%s-%d.params' % (save_dir, model_name, epoch))
            trainer.save_states('%s/%s-%d.states' % (save_dir, model_name, epoch))

    # Always keep a final checkpoint for the last epoch.
    if save_frequency and save_dir:
        net.save_parameters('%s/%s-%d.params' % (save_dir, model_name, opt.num_epochs - 1))
        trainer.save_states('%s/%s-%d.states' % (save_dir, model_name, opt.num_epochs - 1))

    return net
def train(ctx):
    """Train the pose-estimation net on the module-level ``train_data``.

    Identical pipeline to the sibling ``train`` above but already using
    ``ag.backward(loss)``. The unused ``best_val_score`` local has been
    removed (it was never read or updated).

    Parameters
    ----------
    ctx : mx.Context or list of mx.Context

    Returns
    -------
    net : gluon.Block
        The trained (module-level) network.
    """
    if isinstance(ctx, mx.Context):
        ctx = [ctx]
    if opt.use_pretrained_base:
        # Backbone arrives pretrained; only the new head layers need init.
        net.deconv_layers.initialize(ctx=ctx)
        net.final_layer.initialize(ctx=ctx)
    else:
        net.initialize(mx.init.MSRAPrelu(), ctx=ctx)

    trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params)

    L = gluon.loss.L2Loss()
    metric = HeatmapAccuracy()

    if opt.mode == 'hybrid':
        net.hybridize(static_alloc=True, static_shape=True)

    for epoch in range(opt.num_epochs):
        loss_val = 0
        tic = time.time()
        btic = time.time()
        metric.reset()

        for i, batch in enumerate(train_data):
            data, label, weight, imgid = train_batch_fn(batch, ctx)

            with ag.record():
                outputs = [net(X.astype(opt.dtype, copy=False)) for X in data]
                # fp32 loss for numerical stability, cast back to opt.dtype.
                loss = [nd.cast(L(nd.cast(yhat, 'float32'), y, w), opt.dtype)
                        for yhat, y, w in zip(outputs, label, weight)]
            ag.backward(loss)
            trainer.step(batch_size)

            metric.update(label, outputs)
            loss_val += sum([l.mean().asscalar() for l in loss]) / num_gpus
            if opt.log_interval and not (i + 1) % opt.log_interval:
                metric_name, metric_score = metric.get()
                logger.info('Epoch[%d] Batch [%d]\tSpeed: %f samples/sec\tloss=%f\tlr=%f\t%s=%.3f' % (
                    epoch, i, batch_size * opt.log_interval / (time.time() - btic),
                    loss_val / (i + 1), trainer.learning_rate, metric_name, metric_score))
                btic = time.time()

        time_elapsed = time.time() - tic
        logger.info('Epoch[%d]\t\tSpeed: %d samples/sec over %d secs\tloss=%f\n' % (
            epoch, int(i * batch_size / time_elapsed), int(time_elapsed), loss_val / (i + 1)))
        if save_frequency and save_dir and (epoch + 1) % save_frequency == 0:
            net.save_parameters('%s/%s-%d.params' % (save_dir, model_name, epoch))
            trainer.save_states('%s/%s-%d.states' % (save_dir, model_name, epoch))

    # Always keep a final checkpoint for the last epoch.
    if save_frequency and save_dir:
        net.save_parameters('%s/%s-%d.params' % (save_dir, model_name, opt.num_epochs - 1))
        trainer.save_states('%s/%s-%d.states' % (save_dir, model_name, opt.num_epochs - 1))

    return net
def hybrid_forward(self, F, score_gt, kernel_gt, score_pred, training_masks, *args, **kwargs):
    """PSENet-style loss: dice loss on the text score map (with OHEM pixel
    selection) plus dice losses on the six shrunk text kernels.

    Parameters
    ----------
    score_gt : ground-truth text score map.
    kernel_gt : ground-truth kernel maps, split 6-way along axis 3.
    score_pred : predictions, split 7-way along axis 1 (6 kernels + text map).
    training_masks : per-pixel validity mask.

    Returns
    -------
    Scalar loss: ``lam * C_dice_loss + (1 - lam) * kernel_dice_loss``.
    Side effects: stores ``self.pixel_acc``, ``self.C_loss``, ``self.kernel_loss``.
    """
    # OHEM: build a per-image selection mask, then concat over the batch.
    selected_masks = []
    for i in range(score_gt.shape[0]):
        # cal for text region
        selected_mask = self._ohem_single(score_gt[i:i + 1],
                                          score_pred[i:i + 1],
                                          training_masks[i:i + 1])
        selected_masks.append(selected_mask)
    selected_masks = F.concat(*selected_masks, dim=0)

    s1, s2, s3, s4, s5, s6 = F.split(kernel_gt, num_outputs=6, axis=3, squeeze_axis=True)
    s1_pred, s2_pred, s3_pred, s4_pred, s5_pred, s6_pred, C_pred = F.split(
        score_pred, num_outputs=7, axis=1, squeeze_axis=True)
    self.pixel_acc = batch_pix_accuracy(C_pred, score_gt)

    # Dice loss for the full text map.
    eps = 1e-5
    intersection = F.sum(score_gt * C_pred * selected_masks, axis=1)
    # BUG FIX: the second union term previously used `selected_mask` — the
    # stale per-image mask left over from the OHEM loop — instead of the
    # batch-wide `selected_masks`.
    union = F.sum(score_gt * selected_masks, axis=1) + F.sum(
        C_pred * selected_masks, axis=1) + eps
    C_dice_loss = 1. - F.mean((2 * intersection / union))

    # Dice loss for each kernel, restricted to pixels the text head is
    # confident about (C_pred > 0.5) or that are ground-truth text.
    kernel_dices = []
    for s, s_pred in zip(
            [s1, s2, s3, s4, s5, s6],
            [s1_pred, s2_pred, s3_pred, s4_pred, s5_pred, s6_pred]):
        kernel_mask = F.where(C_pred > 0.5, F.ones_like(s_pred),
                              F.zeros_like(s_pred))
        kernel_mask = F.cast(kernel_mask, dtype='float32')
        kernel_mask = F.cast(F.logical_or(kernel_mask, score_gt), dtype='float32')
        s = F.cast(s, dtype='float32')
        kernel_intersection = F.sum(s * s_pred * training_masks * kernel_mask, axis=1)
        kernel_union = F.sum(
            training_masks * s * kernel_mask, axis=1) + F.sum(
                training_masks * s_pred * kernel_mask, axis=1) + eps
        # (A dead assignment of the raw dice ratio, immediately overwritten
        # by the 1 - mean(...) form, has been removed.)
        kernel_dice = 1. - F.mean((2. * kernel_intersection / kernel_union))
        kernel_dices.append(kernel_dice)
    kernel_dice_loss = F.mean(F.array(kernel_dices))

    self.kernel_loss = kernel_dice_loss
    self.C_loss = C_dice_loss

    loss = self.lam * C_dice_loss + (1. - self.lam) * kernel_dice_loss
    return loss
def forward(self, pred, label, valid_length):  # pylint: disable=arguments-differ
    """Masked softmax cross-entropy: positions past ``valid_length`` are
    zero-weighted before delegating to the parent loss.

    Parameters
    ----------
    pred : Symbol or NDArray
        Shape (batch_size, length, V)
    label : Symbol or NDArray
        Shape (batch_size, length)
    valid_length : Symbol or NDArray
        Shape (batch_size, )

    Returns
    -------
    loss : Symbol or NDArray
        Shape (batch_size,)
    """
    if self._sparse_label:
        # Sparse labels are (B, T); add the class axis and force fp32.
        weights = nd.expand_dims(nd.ones_like(label), axis=-1)
        weights = nd.cast(weights, dtype=np.float32)
    else:
        weights = nd.ones_like(label)
    # Zero out everything beyond each sequence's valid length.
    weights = nd.SequenceMask(weights,
                              sequence_length=valid_length,
                              use_sequence_length=True,
                              axis=1)
    return super(SoftmaxCEMaskedLoss, self).forward(pred, label, weights)
def padded_cross_entropy_loss(logits, labels, smoothing, vocab_size):
    """Calculate cross entropy loss while ignoring padding.

    :param logits: Tensor of size [batch_size, length_logits, vocab_size]
    :param labels: Tensor of size [batch_size, length_labels]
    :param smoothing: Label smoothing constant, used to determine the on and off values
    :param vocab_size: int size of the vocabulary
    :return: a float32 tensor with shape [batch_size, max(length_logits, length_labels)]
    """
    logits, labels = _pad_tensors_to_same_length(logits, labels)

    # Smoothed targets: `confidence` on the true class, the remaining mass
    # spread uniformly over the other vocab_size - 1 classes.
    confidence = 1.0 - smoothing
    low_confidence = (1.0 - confidence) / float(vocab_size - 1)
    soft_targets = nd.one_hot(indices=nd.cast(labels, dtype='int32'),
                              depth=vocab_size,
                              on_value=confidence,
                              off_value=low_confidence)

    ce = mx.gluon.loss.SoftmaxCrossEntropyLoss(axis=-1,
                                               sparse_label=False,
                                               from_logits=True)
    raw_xentropy = ce(logits, soft_targets)

    # Minimum achievable smoothed cross entropy; subtracting it makes a
    # perfect prediction score approximately zero.
    normalizing_constant = -(confidence * np.log(confidence) +
                             float(vocab_size - 1) * low_confidence *
                             np.log(low_confidence + 1e-20))
    return raw_xentropy - normalizing_constant
def translate_file(model, subtokenizer, input_file, output_file=None,
                   print_all_translations=True):
    """Translate lines in file, and save to output file if specified.

    Args:
      model: the translation model used to generate the translations.
      subtokenizer: Subtokenizer object for encoding and decoding source and
        translated lines.
      input_file: file containing lines to translate
      output_file: file that stores the generated translations.
      print_all_translations: If true, all translations are printed to stdout.

    Raises:
      ValueError: if output file is invalid.
    """
    print("Begin translate file from: %s" % input_file)
    batch_size = _DECODE_BATCH_SIZE

    # Read and sort inputs by length. Keep the mapping back to the original
    # order so output lines match input lines.
    sorted_inputs, sorted_keys = _get_sorted_inputs(input_file)
    num_decode_batches = (len(sorted_inputs) - 1) // batch_size + 1

    def get_batch(idx):
        """Encode, EOS-terminate and zero-pad batch `idx` to a common length."""
        # BUG FIX: the last batch previously sliced [idx*batch_size:-1],
        # which silently dropped the final input line. A plain open-ended
        # slice handles both the full and the final partial batch.
        ret = sorted_inputs[idx * batch_size:idx * batch_size + batch_size]
        leng = len(ret)
        max_length = 0
        for j in xrange(leng):
            ret[j] = _encode_and_add_eos(ret[j], subtokenizer)
            if max_length < len(ret[j]):
                max_length = len(ret[j])
        for k in xrange(leng):
            ret[k] = ret[k] + np.zeros(max_length - len(ret[k])).tolist()
        return nd.array(ret, ctx=ctx)

    translations = []
    for i in xrange(num_decode_batches):
        print("\t Translate batch %d of %d" % (i, num_decode_batches))
        output = model(get_batch(i))
        output = output['outputs']
        output = nd.cast(output, dtype='int32')
        for j in xrange(len(output)):
            translation = _trim_and_decode(output[j].asnumpy().tolist(),
                                           subtokenizer)
            translations.append(translation)

    # BUG FIX: the output file was opened in read mode; it must be opened
    # for writing. The context manager closes it, so the explicit close()
    # was removed.
    with open(output_file, 'w') as f:
        print("Finished translation and write the translated file.")
        for index in xrange(len(sorted_keys)):
            f.write("%s\n" % translations[sorted_keys[index]])
def _likelihood(self, init, append, connect, end, action_0, actions, iw_ids,
                log_p_sigma, batch_size, iw_size):
    """Importance-weighted log-likelihood of graph-generation action sequences.

    Combines the log-probabilities of the initial-atom choice (`init`), the
    per-step append/connect/end decisions, segment-sums them per sample, and
    applies the IWAE bound: logsumexp over `iw_size` samples minus
    log(iw_size).

    # NOTE(review): assumes actions columns are
    # [action_type, node_type, edge_type, append_pos, connect_pos] and that
    # action_type codes are 0=append, 1=connect, 2=end — confirm upstream.
    """
    # decompose action:
    action_type, node_type, edge_type, append_pos, connect_pos = \
        actions[:, 0], actions[:, 1], actions[:, 2], actions[:, 3], actions[:, 4]
    # log(x) where mask==1, exactly 0 where mask==0 (avoids 0*log(0) NaNs).
    _log_mask = lambda _x, _mask: _mask * nd.log(_x + 1e-10) + (
        1 - _mask) * nd.zeros_like(_x)

    # init: pick each row's probability for its sampled initial atom type.
    init = init.reshape([batch_size * iw_size, self.N_A])
    index = nd.stack(nd.arange(action_0.shape[0], ctx=action_0.context,
                               dtype='int32'),
                     action_0, axis=0)
    loss_init = nd.log(nd.gather_nd(init, index) + 1e-10)

    # end: only counted on steps whose action_type is 2 (terminate).
    loss_end = _log_mask(end, nd.cast(action_type == 2, 'float32'))

    # append: gather p(node_type, edge_type at append_pos); masked to type 0.
    index = nd.stack(append_pos, node_type, edge_type, axis=0)
    loss_append = _log_mask(nd.gather_nd(append, index),
                            nd.cast(action_type == 0, 'float32'))

    # connect: gather p(edge_type at connect_pos); masked to type 1.
    index = nd.stack(connect_pos, edge_type, axis=0)
    loss_connect = _log_mask(nd.gather_nd(connect, index),
                             nd.cast(action_type == 1, 'float32'))

    # sum up results: per-step terms, then segment-sum steps per sample.
    log_p_x = loss_end + loss_append + loss_connect
    log_p_x = fn.squeeze(
        fn.SegmentSumFn(iw_ids, batch_size * iw_size)(fn.unsqueeze(log_p_x, -1)),
        -1)
    log_p_x = log_p_x + loss_init

    # reshape to (batch, importance-sample) and apply the IWAE estimator:
    # l = logsumexp(log p(x) - log q(x)) - log(iw_size)
    log_p_x = log_p_x.reshape([batch_size, iw_size])
    log_p_sigma = log_p_sigma.reshape([batch_size, iw_size])
    l = log_p_x - log_p_sigma
    l = fn.logsumexp(l, axis=1) - math.log(float(iw_size))
    return l
def hybrid_forward(self, F, xcos_theta, xphi_theta, target):
    """Angular-margin (A-Softmax style) cross-entropy.

    Replaces the target-class logit cos(theta) with a blend of cos(theta)
    and the margin-modified phi(theta), annealed by ``lamb`` which decays
    from LambdaMax toward LambdaMin as iterations accumulate.

    # NOTE(review): xcos_theta/xphi_theta are assumed (B, num_classes) and
    # target (B,) integer class ids — confirm against the caller.
    """
    # Anneal lambda: each call counts as one iteration.
    self.it += 1
    batch_size = target.size
    # size = (B,classnum); one-hot selects the target column per row.
    oh_target = target.one_hot(xcos_theta.shape[1])
    self.lamb = max(self.LambdaMin, self.LambdaMax / (1 + 0.1 * self.it))
    # because indexing is not differentiable in mxnet, we must do this:
    # subtract the target logit scaled by 1/(1+lamb) and add back the
    # margin-adjusted phi value, leaving non-target logits untouched.
    output = xcos_theta - \
        oh_target * xcos_theta[range(0, batch_size), target].reshape(-1, 1) / (1 + self.lamb) + \
        oh_target * xphi_theta[range(0, batch_size), target].reshape(-1, 1) / (1 + self.lamb)
    # Standard softmax CE over the margin-adjusted logits.
    loss = nd.softmax_cross_entropy(output, nd.cast(target, 'float32'))  # (B,Classnum)
    return loss
def hybrid_forward(self, F, score_gt, kernel_gt, score_pred, training_masks, *args, **kwargs):
    """PSENet-style loss without OHEM: dice loss on the text score map plus
    dice losses on the six shrunk kernels, weighted by ``self.lam``.

    Side effects: stores ``self.pixel_acc``, ``self.C_loss``,
    ``self.kernel_loss`` for external logging.
    """
    # Split the 6 ground-truth kernels (axis 3) and the 7 prediction
    # channels (6 kernels + final text map C_pred, axis 1).
    s1, s2, s3, s4, s5, s6 = F.split(kernel_gt, num_outputs=6, axis=3,
                                     squeeze_axis=True)
    s1_pred, s2_pred, s3_pred, s4_pred, s5_pred, s6_pred, C_pred = F.split(
        score_pred, num_outputs=7, axis=1, squeeze_axis=True)
    self.pixel_acc = batch_pix_accuracy(C_pred, score_gt)

    # classification loss: dice loss over valid (training-mask) pixels.
    eps = 1e-5
    intersection = F.sum(score_gt * C_pred * training_masks, axis=1)
    union = F.sum(training_masks * score_gt, axis=1) + F.sum(
        training_masks * C_pred, axis=1) + eps
    C_dice_loss = 1. - F.mean((2 * intersection / union))

    # loss for kernel: restricted to pixels the text head is confident
    # about (masked C_pred > 0.5) or that are ground-truth text.
    kernel_dices = []
    for s, s_pred in zip(
            [s1, s2, s3, s4, s5, s6],
            [s1_pred, s2_pred, s3_pred, s4_pred, s5_pred, s6_pred]):
        kernel_mask = F.where((C_pred * training_masks > 0.5),
                              F.ones_like(C_pred),
                              F.zeros_like(C_pred))
        kernel_mask = F.cast(F.logical_or(kernel_mask, score_gt),
                             dtype='float32')
        s = F.cast(s, dtype='float32')
        kernel_intersection = F.sum(s * s_pred * kernel_mask, axis=1)
        kernel_union = F.sum(s * kernel_mask, axis=1) + F.sum(
            s_pred * kernel_mask, axis=1) + eps
        kernel_dice = 1. - F.mean(
            (2. * kernel_intersection / kernel_union))
        # NOTE(review): .asscalar() forces a device sync and will not work
        # under hybridize (symbolic F) — confirm this loss is only used in
        # imperative mode.
        kernel_dices.append(kernel_dice.asscalar())
    kernel_dice_loss = F.mean(F.array(kernel_dices))
    # print("kernel_loss:", kernel_dice_loss)

    self.C_loss = C_dice_loss
    self.kernel_loss = kernel_dice_loss

    # Weighted combination of text-map and kernel losses.
    loss = self.lam * C_dice_loss + (1. - self.lam) * kernel_dice_loss
    return loss
def _rnn_test(self, X, NX, NX_rep, NX_cum, h):
    """Single-step RNN update used at decode/test time.

    Builds a per-graph feature (segment mean of node features concatenated
    with the last node's features), runs one RNN step with hidden state
    ``h``, and returns the new features and hidden state.

    note: one partition for one molecule
    """
    # Segment mean over each graph's nodes.
    graph_avg = fn.SegmentSumFn(NX_rep, NX.shape[0])(X) / nd.cast(
        fn.unsqueeze(NX, 1), 'float32')
    # Feature of the most recently added node per graph.
    last_node = nd.take(X, indices=NX_cum - 1)
    feats = nd.concat(graph_avg, last_node, dim=1)  # size: [NX, F_in * 2]

    # rnn: add/remove the length-1 time axis around the single step.
    feats = fn.unsqueeze(feats, axis=1)
    feats, h = self.rnn(feats, h)
    return fn.squeeze(feats, axis=1), h
def _gather_beams(list, beam_indices, batch_size, new_beam_size, cache=None):
    # NOTE(review): the parameter name `list` shadows the builtin; it is kept
    # because renaming it would break keyword callers.
    """Gather beams from nested structure of tensors.

    Each tensor represents a batch of beams, where a beam is a single search
    state (beam search searches multiple states in parallel). This gathers
    the top beams, specified by beam_indices, from the tensors.

    Args:
      list: list of tensors with shape [batch_size, beam_size, ...].
      beam_indices: int32 tensor with shape [batch_size, new_beam_size]. Each
        value in beam_indices must be between [0, beam_size), and are not
        necessarily unique.
      batch_size: int size of batch
      new_beam_size: int number of beams to be pulled from the tensors.
      cache: optional nested structure to gather from instead of `list`.

    Returns:
      The gathered list (or cache) with tensors of shape
      [batch_size, new_beam_size, ...].
    """
    # batch_pos[i, j] == i: the batch coordinate for every beam slot.
    batch_pos = np.arange(0, batch_size * new_beam_size)
    batch_pos = nd.array(batch_pos, ctx=ctx, dtype='int32') / new_beam_size
    batch_pos = nd.reshape(batch_pos, (batch_size, new_beam_size))
    beam_indices = nd.cast(beam_indices, dtype='int32')
    # (batch, beam, 2) pairs of (batch index, beam index).
    coordinates = nd.stack(batch_pos, beam_indices, axis=2)
    # gather_nd expects the index axis first: (batch, beam, 2) -> (2, batch, beam).
    # This replaces two element-by-element Python copy loops that built the
    # same permutation one row at a time.
    coordinates_new = nd.transpose(coordinates, axes=(2, 0, 1))
    if cache is None:
        for i in xrange(len(list)):
            list[i] = nd.gather_nd(list[i], coordinates_new)
        return list
    return map_structure(lambda t: nd.gather_nd(t, coordinates_new), cache)
def _rnn_train(self, X, NX, NX_rep, graph_to_rnn, rnn_to_graph, NX_cum):
    """Run the RNN over whole generation trajectories at training time.

    Builds per-graph features (segment mean concatenated with the last
    node's features), scatters them into the RNN's (batch, iw, length)
    layout, runs the RNN, and gathers the outputs back to graph order.
    """
    # Segment mean over each graph's nodes + feature of the newest node.
    avg = fn.SegmentSumFn(NX_rep, NX.shape[0])(X) / nd.cast(
        fn.unsqueeze(NX, 1), 'float32')
    newest = nd.take(X, indices=NX_cum - 1)
    feats = nd.concat(avg, newest, dim=1)

    # rnn: lay out as (batch_size, iw_size, length, num_features), flatten
    # the first two axes for the RNN, then restore them.
    feats = nd.take(feats, indices=graph_to_rnn)
    b, iw, length, nf = feats.shape
    out = self.rnn(feats.reshape([b * iw, length, nf]))
    out = out.reshape([b, iw, length, -1])
    return nd.gather_nd(out, indices=rnn_to_graph)
def _initialize(self, force_reinit=True, ctx=mx.cpu(), dtype='float32'):
    """Initialize all parameters by substring-matching their names.

    Conv weights get Gaussian init (std depends on the layer's role);
    transpose-conv weights are overwritten with a fixed (non-trainable)
    bilinear-style kernel; batchnorm gamma/beta/running stats get constants.

    # NOTE(review): the default ctx=mx.cpu() is evaluated once at def time;
    # harmless for a context object, but callers on GPU must pass ctx.
    """
    for k, v in self.collect_params().items():
        if 'conv' in k:
            if 'weight' in k:
                # Head/squeeze-excitation convs: small fixed-std Gaussian.
                if 'first' in k or 'output' in k or 'fc' in k or 'squeeze' in k or 'excitation' in k:
                    v.initialize(mx.init.Normal(0.01), force_reinit=force_reinit, ctx=ctx)
                elif 'transpose' in k:
                    # Initialized, then immediately replaced with a fixed
                    # generated kernel and frozen (grad_req = 'null').
                    v.initialize(mx.init.Normal(0.01), force_reinit=force_reinit, ctx=ctx)
                    v.set_data(nd.cast(generate_transpose_conv_kernel(v.shape[0]), dtype=dtype))
                    v.grad_req = 'null'
                else:
                    # Std scaled by fan-in (1 / input channels).
                    v.initialize(mx.init.Normal(1.0 / v.shape[1]), force_reinit=force_reinit, ctx=ctx)
            if 'bias' in k:
                v.initialize(mx.init.Constant(0), force_reinit=force_reinit, ctx=ctx)
        elif 'batchnorm' in k:
            if 'gamma' in k:
                v.initialize(mx.init.Constant(1), force_reinit=force_reinit, ctx=ctx)
            if 'beta' in k:
                # Small positive beta rather than exactly zero.
                v.initialize(mx.init.Constant(0.0001), force_reinit=force_reinit, ctx=ctx)
            if 'running' in k:
                v.initialize(mx.init.Constant(0), force_reinit=force_reinit, ctx=ctx)
def forward(self, X, NX, NX_rep, X_end=None):
    """Output head: per-node append/connect probabilities plus a per-graph
    end probability, normalized jointly via a segment softmax, optionally
    as a k-component mixture (self.k > 1).

    # NOTE(review): X is assumed to be per-node features with NX nodes per
    # graph (NX_rep mapping node -> graph); confirm against callers.
    """
    # segment mean for X: per-graph summary feature unless provided.
    if X_end is None:
        X_end = fn.SegmentSumFn(NX_rep, NX.shape[0])(X) / nd.cast(
            fn.unsqueeze(NX, 1), 'float32')
    # Concatenate each node's feature with its graph's summary.
    X = nd.concat(X, X_end[NX_rep, :], dim=1)

    X_h = nd.relu(self.linear_h(X)).reshape([-1, self.F_h])
    X_h_end = nd.relu(self.linear_h_t(X_end)).reshape([-1, self.F_h])

    # Unnormalized (exp) scores: per node (connect + append options) and
    # per graph (end option), for each of the k mixture components.
    X_x = nd.exp(self.linear_x(X_h)).reshape(
        [-1, self.k, self.N_B + self.N_B * self.N_A])
    X_x_end = nd.exp(self.linear_x_t(X_h_end)).reshape([-1, self.k, 1])

    # Segment softmax denominator: sum of node scores per graph + end score.
    X_sum = nd.sum(fn.SegmentSumFn(NX_rep, NX.shape[0])(X_x), -1,
                   keepdims=True) + X_x_end
    X_sum_gathered = X_sum[NX_rep, :, :]

    X_softmax = X_x / X_sum_gathered
    X_softmax_end = X_x_end / X_sum

    if self.k > 1:
        # Mixture weights per graph; weighted sum over components.
        pi = fn.unsqueeze(nd.softmax(self.linear_pi(X_end), axis=1), -1)
        pi_gathered = pi[NX_rep, :, :]

        X_softmax = nd.sum(X_softmax * pi_gathered, axis=1)
        X_softmax_end = nd.sum(X_softmax_end * pi, axis=1)
    else:
        X_softmax = fn.squeeze(X_softmax, 1)
        X_softmax_end = fn.squeeze(X_softmax_end, 1)

    # generate output: first N_B columns are connect probs, the rest are
    # append probs reshaped to (node, atom type, bond type).
    connect, append = X_softmax[:, :self.N_B], X_softmax[:, self.N_B:]
    append = append.reshape([-1, self.N_A, self.N_B])
    end = fn.squeeze(X_softmax_end, -1)

    return append, connect, end
def train(self):
    """Train the SSD detector held in ``self.net`` on ``self.train_data``.

    Runs SGD with step LR decay, optional AMP mixed precision, logs
    CrossEntropy / SmoothL1 metrics every 50 batches, validates and saves
    parameters after each epoch.
    """
    self.net.collect_params().reset_ctx(self.ctx)
    trainer = gluon.Trainer(
        params=self.net.collect_params(),
        optimizer='sgd',
        optimizer_params={'learning_rate': self.lr,
                          'wd': self.wd,
                          'momentum': self.momentum},
        # AMP requires the trainer (not kvstore) to apply updates.
        update_on_kvstore=(False if self.use_amp else None))
    if self.use_amp:
        amp.init_trainer(trainer)

    lr_decay = self.lr_decay
    lr_steps = sorted(
        [float(ls) for ls in self.lr_decay_epoch.split(',') if ls.strip()])

    mbox_loss = SSDMultiBoxLoss()
    ce_metric = mx.metric.Loss('CrossEntropy')
    smoothl1_metric = mx.metric.Loss('SmoothL1')

    logging.info('Start training from scratch...')

    for epoch in range(self.epoch):
        # Apply any LR decay steps passed by this epoch.
        while lr_steps and epoch > lr_steps[0]:
            new_lr = trainer.learning_rate * lr_decay
            lr_steps.pop(0)
            trainer.set_learning_rate(new_lr)
            logging.info("Epoch {} Set learning rate to {}".format(
                epoch, new_lr))
        ce_metric.reset()
        smoothl1_metric.reset()
        tic = time.time()
        btic = time.time()
        # reset cause save params may change
        self.net.collect_params().reset_ctx(self.ctx)
        self.net.hybridize(static_alloc=True, static_shape=True)
        for i, batch in enumerate(self.train_data):
            data = [d.data[0] for d in batch]
            box_targets = [d.label[0] for d in batch]
            cls_targets = [
                nd.cast(d.label[1], dtype='float32') for d in batch
            ]
            with autograd.record():
                cls_preds = []
                box_preds = []
                for x in data:
                    cls_pred, box_pred, _ = self.net(x)
                    cls_preds.append(cls_pred)
                    box_preds.append(box_pred)
                sum_loss, cls_loss, box_loss = mbox_loss(
                    cls_preds, box_preds, cls_targets, box_targets)
                if self.use_amp:
                    with amp.scale_loss(sum_loss, trainer) as scaled_loss:
                        autograd.backward(scaled_loss)
                else:
                    autograd.backward(sum_loss)
            # since we have already normalized the loss, we don't want to normalize
            # by batch-size anymore
            trainer.step(1)
            ce_metric.update(0, [l * self.batch_size for l in cls_loss])
            smoothl1_metric.update(0, [l * self.batch_size for l in box_loss])
            if i > 0 and i % 50 == 0:
                name1, loss1 = ce_metric.get()
                name2, loss2 = smoothl1_metric.get()
                # BUG FIX: this format string was split in the middle of the
                # literal (a syntax error); reassembled into one line.
                logging.info(
                    'Epoch {} Batch {} Speed: {:.3f} samples/s, {}={:.3f}, {}={:.3f}'.format(
                        epoch, i, self.batch_size / (time.time() - btic),
                        name1, loss1, name2, loss2))
                btic = time.time()

        map_name, mean_ap = self.validation()
        val_msg = '\n'.join(
            ['{}={}'.format(k, v) for k, v in zip(map_name, mean_ap)])
        logging.info('[Epoch {}] Validation: \n{}'.format(epoch, val_msg))
        self.save_params(epoch)
def test_cast():
    """nd.cast on a large tiled vector keeps values and switches dtype."""
    base = create_vector(size=LARGE_X // 4)
    tiled = nd.tile(base, 4)
    casted = nd.cast(tiled, np.int32)
    assert casted.dtype == np.int32
    # Last element of each tile is the last index of the base vector.
    assert casted[-1] == LARGE_X // 4 - 1
def train(opt):
    """Build, initialize and train a pose-estimation model from CLI options.

    Constructs the model and data loader, sets up a warmup + decay LR
    schedule and an Adam trainer, runs the epoch loop with heatmap-accuracy
    tracking, and periodically checkpoints parameters/trainer states.

    Returns the trained network.
    """
    batch_size = opt.batch_size
    num_joints = opt.num_joints

    num_gpus = opt.num_gpus
    # Scale the effective batch size by the number of devices.
    batch_size *= max(1, num_gpus)
    ctx = [mx.gpu(i) for i in range(num_gpus)] if num_gpus > 0 else [mx.cpu()]
    num_workers = opt.num_workers

    model_name = opt.model

    kwargs = {'ctx': ctx,
              'num_joints': num_joints,
              'pretrained': opt.use_pretrained,
              'pretrained_base': opt.use_pretrained_base,
              'pretrained_ctx': ctx}

    net = get_model(model_name, **kwargs)
    net.cast(opt.dtype)

    input_size = [int(i) for i in opt.input_size.split(',')]
    train_dataset, train_data, train_batch_fn = get_data_loader(
        opt, batch_size, num_workers, input_size)

    num_training_samples = len(train_dataset)
    lr_decay = opt.lr_decay
    lr_decay_period = opt.lr_decay_period
    if opt.lr_decay_period > 0:
        # Fixed-period decay schedule.
        lr_decay_epoch = list(
            range(lr_decay_period, opt.num_epochs, lr_decay_period))
    else:
        lr_decay_epoch = [int(i) for i in opt.lr_decay_epoch.split(',')]
    # Shift decay epochs left by the warmup length (schedule starts after warmup).
    lr_decay_epoch = [e - opt.warmup_epochs for e in lr_decay_epoch]
    num_batches = num_training_samples // batch_size
    # Linear warmup followed by the selected decay mode.
    lr_scheduler = LRSequential([
        LRScheduler('linear', base_lr=0, target_lr=opt.lr,
                    nepochs=opt.warmup_epochs, iters_per_epoch=num_batches),
        LRScheduler(opt.lr_mode, base_lr=opt.lr, target_lr=0,
                    nepochs=opt.num_epochs - opt.warmup_epochs,
                    iters_per_epoch=num_batches,
                    step_epoch=lr_decay_epoch,
                    step_factor=lr_decay, power=2)
    ])

    # optimizer = 'sgd'
    # optimizer_params = {'wd': opt.wd, 'momentum': 0.9, 'lr_scheduler': lr_scheduler}
    optimizer = 'adam'
    optimizer_params = {'wd': opt.wd, 'lr_scheduler': lr_scheduler}
    if opt.dtype != 'float32':
        # Keep an fp32 master copy of the weights when training in fp16.
        optimizer_params['multi_precision'] = True

    save_frequency = opt.save_frequency
    if opt.save_dir and save_frequency:
        save_dir = opt.save_dir
        makedirs(save_dir)
    else:
        save_dir = ''
        save_frequency = 0

    if isinstance(ctx, mx.Context):
        ctx = [ctx]

    if opt.use_pretrained_base:
        # Backbone is pretrained; initialize only the model-specific head.
        if model_name.startswith('simple'):
            net.deconv_layers.initialize(ctx=ctx)
            net.final_layer.initialize(ctx=ctx)
        elif model_name.startswith('mobile'):
            net.upsampling.initialize(ctx=ctx)
    else:
        net.initialize(mx.init.MSRAPrelu(), ctx=ctx)

    trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params)

    L = gluon.loss.L2Loss()
    metric = HeatmapAccuracy()

    # NOTE(review): best_val_score is never read or updated below.
    best_val_score = 1

    if opt.mode == 'hybrid':
        net.hybridize(static_alloc=True, static_shape=True)

    for epoch in range(opt.num_epochs):
        loss_val = 0
        tic = time.time()
        btic = time.time()
        metric.reset()

        for i, batch in enumerate(train_data):
            data, label, weight, imgid = train_batch_fn(batch, ctx)

            with ag.record():
                outputs = [net(X.astype(opt.dtype, copy=False)) for X in data]
                # fp32 loss for stability, cast back to the training dtype.
                loss = [
                    nd.cast(L(nd.cast(yhat, 'float32'), y, w), opt.dtype)
                    for yhat, y, w in zip(outputs, label, weight)
                ]
            ag.backward(loss)
            trainer.step(batch_size)

            metric.update(label, outputs)
            loss_val += sum([l.mean().asscalar() for l in loss]) / num_gpus
            if opt.log_interval and not (i + 1) % opt.log_interval:
                metric_name, metric_score = metric.get()
                logger.info(
                    'Epoch[%d] Batch [%d]\tSpeed: %f samples/sec\tloss=%f\tlr=%f\t%s=%.3f'
                    % (epoch, i,
                       batch_size * opt.log_interval / (time.time() - btic),
                       loss_val / (i + 1), trainer.learning_rate,
                       metric_name, metric_score))
                btic = time.time()

        time_elapsed = time.time() - tic
        logger.info(
            'Epoch[%d]\t\tSpeed: %d samples/sec over %d secs\tloss=%f\n'
            % (epoch, int(i * batch_size / time_elapsed),
               int(time_elapsed), loss_val / (i + 1)))
        if save_frequency and save_dir and (epoch + 1) % save_frequency == 0:
            net.save_parameters('%s/%s-%d.params' %
                                (save_dir, model_name, epoch))
            trainer.save_states('%s/%s-%d.states' %
                                (save_dir, model_name, epoch))

    # Always keep a final checkpoint for the last epoch.
    if save_frequency and save_dir:
        net.save_parameters('%s/%s-%d.params' %
                            (save_dir, model_name, opt.num_epochs - 1))
        trainer.save_states('%s/%s-%d.states' %
                            (save_dir, model_name, opt.num_epochs - 1))

    return net
def train(net, train_data, val_data, eval_metric, ctx, args):
    """Training pipeline for an SSD detector.

    Supports Horovod distributed training, AMP mixed precision and a DALI
    data pipeline. Logs to console and ``<save_prefix>_train.log``,
    validates every ``args.val_interval`` epochs, and checkpoints via
    ``save_params`` keeping the best mAP.
    """
    net.collect_params().reset_ctx(ctx)
    if args.horovod:
        # Make sure every worker starts from identical parameters.
        hvd.broadcast_parameters(net.collect_params(), root_rank=0)
        trainer = hvd.DistributedTrainer(
            net.collect_params(), 'sgd',
            {'learning_rate': args.lr, 'wd': args.wd, 'momentum': args.momentum})
    else:
        trainer = gluon.Trainer(
            net.collect_params(), 'sgd',
            {'learning_rate': args.lr, 'wd': args.wd, 'momentum': args.momentum},
            # AMP requires the trainer (not kvstore) to apply updates.
            update_on_kvstore=(False if args.amp else None))

    if args.amp:
        amp.init_trainer(trainer)

    # lr decay policy
    lr_decay = float(args.lr_decay)
    lr_steps = sorted([float(ls) for ls in args.lr_decay_epoch.split(',') if ls.strip()])

    mbox_loss = gcv.loss.SSDMultiBoxLoss()
    ce_metric = mx.metric.Loss('CrossEntropy')
    smoothl1_metric = mx.metric.Loss('SmoothL1')

    # set up logger
    logging.basicConfig()
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    log_file_path = args.save_prefix + '_train.log'
    log_dir = os.path.dirname(log_file_path)
    if log_dir and not os.path.exists(log_dir):
        os.makedirs(log_dir)
    fh = logging.FileHandler(log_file_path)
    logger.addHandler(fh)
    logger.info(args)
    logger.info('Start training from [Epoch {}]'.format(args.start_epoch))
    best_map = [0]
    for epoch in range(args.start_epoch, args.epochs):
        # Apply any LR decay steps reached by this epoch.
        while lr_steps and epoch >= lr_steps[0]:
            new_lr = trainer.learning_rate * lr_decay
            lr_steps.pop(0)
            trainer.set_learning_rate(new_lr)
            logger.info("[Epoch {}] Set learning rate to {}".format(epoch, new_lr))
        ce_metric.reset()
        smoothl1_metric.reset()
        tic = time.time()
        btic = time.time()
        net.hybridize(static_alloc=True, static_shape=True)
        for i, batch in enumerate(train_data):
            if args.dali:
                # dali iterator returns a mxnet.io.DataBatch
                data = [d.data[0] for d in batch]
                box_targets = [d.label[0] for d in batch]
                cls_targets = [nd.cast(d.label[1], dtype='float32') for d in batch]
            else:
                data = gluon.utils.split_and_load(batch[0], ctx_list=ctx, batch_axis=0)
                cls_targets = gluon.utils.split_and_load(batch[1],
                                                         ctx_list=ctx, batch_axis=0)
                box_targets = gluon.utils.split_and_load(batch[2],
                                                         ctx_list=ctx, batch_axis=0)

            with autograd.record():
                cls_preds = []
                box_preds = []
                for x in data:
                    cls_pred, box_pred, _ = net(x)
                    cls_preds.append(cls_pred)
                    box_preds.append(box_pred)
                sum_loss, cls_loss, box_loss = mbox_loss(
                    cls_preds, box_preds, cls_targets, box_targets)
                if args.amp:
                    with amp.scale_loss(sum_loss, trainer) as scaled_loss:
                        autograd.backward(scaled_loss)
                else:
                    autograd.backward(sum_loss)
            # since we have already normalized the loss, we don't want to normalize
            # by batch-size anymore
            trainer.step(1)

            # Only rank 0 logs/updates metrics when running under Horovod.
            if (not args.horovod or hvd.rank() == 0):
                local_batch_size = int(args.batch_size // (hvd.size() if args.horovod else 1))
                ce_metric.update(0, [l * local_batch_size for l in cls_loss])
                smoothl1_metric.update(0, [l * local_batch_size for l in box_loss])
                if args.log_interval and not (i + 1) % args.log_interval:
                    name1, loss1 = ce_metric.get()
                    name2, loss2 = smoothl1_metric.get()
                    logger.info('[Epoch {}][Batch {}], Speed: {:.3f} samples/sec, {}={:.3f}, {}={:.3f}'.format(
                        epoch, i, args.batch_size/(time.time()-btic), name1, loss1, name2, loss2))
                btic = time.time()

        if (not args.horovod or hvd.rank() == 0):
            name1, loss1 = ce_metric.get()
            name2, loss2 = smoothl1_metric.get()
            logger.info('[Epoch {}] Training cost: {:.3f}, {}={:.3f}, {}={:.3f}'.format(
                epoch, (time.time()-tic), name1, loss1, name2, loss2))
            if (epoch % args.val_interval == 0) or (args.save_interval and epoch % args.save_interval == 0):
                # consider reduce the frequency of validation to save time
                map_name, mean_ap = validate(net, val_data, ctx, eval_metric)
                val_msg = '\n'.join(['{}={}'.format(k, v) for k, v in zip(map_name, mean_ap)])
                logger.info('[Epoch {}] Validation: \n{}'.format(epoch, val_msg))
                current_map = float(mean_ap[-1])
            else:
                current_map = 0.
            save_params(net, best_map, current_map, epoch, args.save_interval, args.save_prefix)
def train(ctx):
    """Train the pose net with tqdm progress, periodic validation every two
    epochs, and best-AP checkpointing with a ``final.params`` symlink.

    Parameters
    ----------
    ctx : mx.Context or list of mx.Context

    Returns
    -------
    net : gluon.Block
        The trained (module-level) network.
    """
    if isinstance(ctx, mx.Context):
        ctx = [ctx]

    trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params)

    L = gluon.loss.L2Loss(weight=2.0)
    metric = HeatmapAccuracy()
    best_ap = 0

    if opt.mode == 'hybrid':
        net.hybridize(static_alloc=True, static_shape=True)

    for epoch in range(opt.num_epochs):
        loss_val = 0
        tic = time.time()
        btic = time.time()
        metric.reset()

        train_data_desc = tqdm(train_data, dynamic_ncols=True)
        for i, batch in enumerate(train_data_desc):
            data, label, weight, imgid = train_batch_fn(batch, ctx)

            with ag.record():
                outputs = [net(X.astype(opt.dtype, copy=False)) for X in data]
                # fp32 loss for stability, cast back to the training dtype.
                loss = [
                    nd.cast(L(nd.cast(yhat, 'float32'), y, w), opt.dtype)
                    for yhat, y, w in zip(outputs, label, weight)
                ]
            ag.backward(loss)
            trainer.step(batch_size)

            metric.update(label, outputs)
            loss_val += sum([l.mean().asscalar() for l in loss]) / num_gpus
            if opt.log_interval and not (i + 1) % opt.log_interval:
                metric_name, metric_score = metric.get()
                logger.info(
                    'Epoch[%d] Batch [%d]\tSpeed: %f samples/sec\tloss=%f\tlr=%f\t%s=%.3f'
                    % (epoch, i,
                       batch_size * opt.log_interval / (time.time() - btic),
                       loss_val / (i + 1), trainer.learning_rate,
                       metric_name, metric_score))
                btic = time.time()

        time_elapsed = time.time() - tic
        logger.info(
            'Epoch[%d]\t\tSpeed: %d samples/sec over %d secs\tloss=%f\n'
            % (epoch, int(i * batch_size / time_elapsed),
               int(time_elapsed), loss_val / (i + 1)))
        if save_frequency and save_dir and (epoch + 1) % save_frequency == 0:
            net.save_parameters('%s/%s-%d.params' %
                                (save_dir, model_name, epoch))
            trainer.save_states('%s/%s-%d.states' %
                                (save_dir, model_name, epoch))

        # Validate every second epoch; checkpoint when AP improves.
        if (epoch + 1) % 2 == 0:
            res = validate(val_data, val_dataset, net, context, opt)[0]
            logger.info(res)
            if res['AP'] > best_ap:
                # BUG FIX: previously assigned to a new name `bestAP`, so
                # `best_ap` was never updated and every epoch with AP > 0
                # overwrote the "best" checkpoint.
                best_ap = res['AP']
                net.save_parameters(
                    f'{save_dir}/best-{round(best_ap, 3)}.params')
                if os.path.islink(f'{save_dir}/final.params'):
                    os.remove(f'{save_dir}/final.params')
                os.symlink(f'./best-{round(best_ap, 3)}.params',
                           f'{save_dir}/final.params')

    # Always keep a final checkpoint for the last epoch.
    if save_frequency and save_dir:
        net.save_parameters('%s/%s-%d.params' %
                            (save_dir, model_name, opt.num_epochs - 1))
        trainer.save_states('%s/%s-%d.states' %
                            (save_dir, model_name, opt.num_epochs - 1))

    return net
def calulation(self, input_str, ko_dict, en_dict, en_rev_dict, ctx):
    """Inference: translate one Korean sentence into an English sentence.

    Parameters
    ----------
    input_str : str
        Raw Korean input; tokenized here with the module-level `mecab`.
    ko_dict : dict
        Korean token -> index vocabulary used for encoding.
    en_dict : dict
        English token -> index vocabulary (provides the 'START' index).
    en_rev_dict : dict
        English index -> token reverse vocabulary for decoding outputs.
    ctx : mx.Context
        Device on which to run inference.

    Returns
    -------
    tuple
        (space-joined generated sentence, attention weights).

    NOTE(review): `F` is presumably a module-level alias for mxnet.ndarray
    — confirm. Also, `att_weights` is only bound when `self.attention` is
    true, and `ret_seq` only after at least one non-terminal token is
    produced; otherwise the final `return` raises NameError — confirm
    callers always use attention and non-degenerate inputs.
    """
    # Wrap the tokenized sentence with START/END markers.
    input_str = [
        [
            'START',
        ] + mecab.morphs(input_str.strip()) + [
            'END',
        ],
    ]
    X = encoding_and_padding(input_str, ko_dict, max_seq=self.max_seq_length)
    # string indices -> NDArray ready for embedding
    inputs = F.array(X, ctx=ctx)
    inputs = F.cast(inputs, dtype='float32')

    # Index of the END token = last valid position of the input sentence.
    in_sent_last_idx = F.argmax(F.where(inputs == self.end_idx,
                                        F.ones_like(inputs),
                                        F.zeros_like(inputs)),
                                axis=1)
    # encoder GRU, unrolled one timestep at a time
    embeddinged_in = F.cast(self.embedding(inputs), dtype='float32')
    # Initial hidden state is drawn from N(0, 1) at inference time.
    next_h = F.random.normal(0, 1, (1, self.n_hidden), ctx=ctx)
    for j in range(self.in_seq_len):
        p_outputs = F.slice_axis(embeddinged_in, axis=1, begin=j, end=j + 1)
        p_outputs = F.reshape(p_outputs, (-1, self.embed_dim))
        enout, (next_h, ) = self.encoder(p_outputs, [
            next_h,
        ])
        if j == 0:
            enouts = enout
            next_hs = next_h
        else:
            enouts = F.concat(enouts, enout, dim=1)
            next_hs = F.concat(next_hs, next_h, dim=1)
    # mask positions past the sentence end with 0 using the valid length
    enouts = F.reshape(enouts, (-1, self.in_seq_len, self.n_hidden))
    enouts = F.transpose(enouts, (1, 0, 2))
    enouts = F.SequenceMask(enouts,
                            sequence_length=in_sent_last_idx + 1,
                            use_sequence_length=True)
    enouts = F.transpose(enouts, (1, 0, 2))

    next_hs = F.reshape(next_hs, (-1, self.n_hidden))
    # because take() only supports dim 0:
    # N, 30, 300 -> N * 30, 300 , N = (0,1,2,3,4,5...)
    next_hs = next_hs.take(in_sent_last_idx)
    # Embed 'START' as the initial decoder input.
    Y_init = F.array([
        [
            en_dict['START'],
        ],
    ], ctx=ctx)
    Y_init = F.cast(self.embedding(Y_init), dtype='float32')
    deout = Y_init[:, 0, :]
    # Greedy decode, one token per step, up to out_seq_len tokens.
    for i in range(self.out_seq_len):
        if self.attention:
            #print(deout.shape)
            deout, att_weight = self.apply_attention(
                F=F, inputs=deout, hidden=next_hs, encoder_outputs=enouts)
            if i == 0:
                att_weights = att_weight
            else:
                att_weights = F.concat(att_weights, att_weight, dim=0)
        deout, (next_hs, ) = self.decoder(deout, [
            next_hs,
        ])
        # expand/restore dims so batchnorm can be applied
        deout = F.expand_dims(deout, axis=1)
        deout = self.batchnorm(deout)
        # reduce dim
        deout = deout[:, 0, :]
        # derive the next-token logits following the current step
        deout_sm = self.dense(deout)
        #print(deout_sm.shape)
        deout = F.one_hot(F.argmax(F.softmax(deout_sm, axis=1), axis=1),
                          depth=self.vocab_size)
        #print(deout.shape)
        # convert to a decoder-consumable form (embed and fix dimensions)
        deout = F.argmax(deout, axis=1)
        deout = F.expand_dims(deout, axis=0)
        deout = F.cast(self.embedding(deout)[:, 0, :], dtype='float32')
        gen_char = en_rev_dict[F.argmax(deout_sm,
                                        axis=1).asnumpy()[0].astype('int')]
        # Stop at padding or the explicit END token.
        if gen_char == '__PAD__' or gen_char == 'END':
            break
        else:
            if i == 0:
                ret_seq = [
                    gen_char,
                ]
            else:
                ret_seq += [
                    gen_char,
                ]
    return (" ".join(ret_seq), att_weights)
def hybrid_forward(self, F, inputs, outputs, initial_hidden_state,
                   batch_size_seq):
    """Encoder-decoder forward pass used for training (teacher forcing).

    Parameters
    ----------
    F : module
        mxnet.ndarray or mxnet.symbol, supplied by HybridBlock.
    inputs : NDArray/Symbol
        Source-token index matrix (batch, in_seq_len).
    outputs : NDArray/Symbol
        Target-token index matrix fed to the decoder (batch, out_seq_len).
    initial_hidden_state : NDArray/Symbol
        Initial encoder GRU hidden state.
    batch_size_seq : NDArray/Symbol
        Presumably a 0..batch-1 range used to index the flattened hidden
        states — TODO confirm against the caller.

    Returns
    -------
    deouts_fc : NDArray/Symbol
        Output of the final dense layer for every decoder timestep.
    """
    # sentence end marker: positions equal to self.end_idx flag the END tag
    inputs = F.cast(inputs, dtype='float32')
    in_sent_last_idx = F.argmax(F.where(inputs == self.end_idx,
                                        F.ones_like(inputs),
                                        F.zeros_like(inputs)),
                                axis=1)

    outputs = F.cast(outputs, dtype='float32')
    out_sent_last_idx = F.argmax(F.where(outputs == self.end_idx,
                                         F.ones_like(outputs),
                                         F.zeros_like(outputs)),
                                 axis=1)
    # encoder GRU, unrolled one timestep at a time
    embeddinged_in = F.cast(self.embedding(inputs), dtype='float32')
    next_h = initial_hidden_state
    for j in range(self.in_seq_len):
        p_outputs = F.slice_axis(embeddinged_in, axis=1, begin=j, end=j + 1)
        p_outputs = F.reshape(p_outputs, (-1, self.embed_dim))
        enout, (next_h, ) = self.encoder(p_outputs, [
            next_h,
        ])
        if j == 0:
            enouts = enout
            next_hs = next_h
        else:
            enouts = F.concat(enouts, enout, dim=1)
            next_hs = F.concat(next_hs, next_h, dim=1)
    # mask positions past the sentence end with 0 using the valid length
    enouts = F.reshape(enouts, (-1, self.in_seq_len, self.n_hidden))
    enouts = F.transpose(enouts, (1, 0, 2))
    enouts = F.SequenceMask(enouts,
                            sequence_length=in_sent_last_idx + 1,
                            use_sequence_length=True)
    enouts = F.transpose(enouts, (1, 0, 2))

    next_hs = F.reshape(next_hs, (-1, self.n_hidden))
    # because take() only supports dim 0:
    # N, 30, 300 -> N * 30, 300 , N = (0,1,2,3,4,5...)
    next_hs = next_hs.take(in_sent_last_idx +
                           (batch_size_seq * self.max_seq_length))

    embeddinged_out = F.cast(self.embedding(outputs), dtype='float32')

    # decoder GRU with attention
    for i in range(self.out_seq_len):
        # Unroll the GRUCell out_seq_len times, accumulating the outputs.
        p_outputs = F.slice_axis(embeddinged_out, axis=1, begin=i, end=i + 1)
        p_outputs = F.reshape(p_outputs, (-1, self.embed_dim))
        # p_outputs = outputs[:,i,:]
        # done via slice_axis/reshape (instead of the line above) so the
        # block can be hybridized
        if self.attention:
            p_outputs, _ = self.apply_attention(F=F,
                                                inputs=p_outputs,
                                                hidden=next_hs,
                                                encoder_outputs=enouts)
        deout, (next_hs, ) = self.decoder(p_outputs, [
            next_hs,
        ])
        if i == 0:
            deouts = deout
        else:
            deouts = F.concat(deouts, deout, dim=1)
    # reshape 2dim -> 3dim
    deouts = F.reshape(deouts, (-1, self.out_seq_len, self.n_hidden))
    # zero-pad timesteps past each target sentence's end
    deouts = F.transpose(deouts, (1, 0, 2))
    deouts = F.SequenceMask(deouts,
                            sequence_length=out_sent_last_idx + 1,
                            use_sequence_length=True)
    deouts = F.transpose(deouts, (1, 0, 2))
    deouts = self.batchnorm(deouts)
    deouts_fc = self.dense(deouts)
    return (deouts_fc)
def _train_loop(self, train_data, val_data, train_eval_data):
    """Run the SSD training loop over the configured number of epochs.

    Trains with SSDMultiBoxLoss, applies step learning-rate decay at the
    configured epochs, periodically validates, checkpoints the best mAP
    model, and finally evaluates on the training-eval split.

    Parameters
    ----------
    train_data : iterable
        Training batches (DALI DataBatch list or array triples).
    val_data : iterable
        Validation data passed to self._evaluate.
    train_eval_data : iterable
        Training data in evaluation form for the final train-mAP pass.

    Returns
    -------
    dict
        {'train_map', 'valid_map', 'time'} summary of the run.

    NOTE(review): block indentation was reconstructed; the `self._reporter`
    call is placed inside the validation branch because `current_map` is
    only defined there — confirm against the original layout.
    """
    # fix seed for mxnet, numpy and python builtin random generator.
    gutils.random.seed(self._cfg.train.seed)

    # loss and metric
    mbox_loss = SSDMultiBoxLoss()
    ce_metric = mx.metric.Loss('CrossEntropy')
    smoothl1_metric = mx.metric.Loss('SmoothL1')

    # lr decay policy
    lr_decay = float(self._cfg.train.lr_decay)
    lr_steps = sorted([float(ls) for ls in self._cfg.train.lr_decay_epoch])

    self._logger.info('Start training from [Epoch %d]',
                      max(self._cfg.train.start_epoch, self.epoch))

    self.net.collect_params().reset_ctx(self.ctx)

    for self.epoch in range(max(self._cfg.train.start_epoch, self.epoch),
                            self._cfg.train.epochs):
        epoch = self.epoch
        # Consume every decay step we have reached; each multiplies the lr.
        while lr_steps and epoch >= lr_steps[0]:
            new_lr = self.trainer.learning_rate * lr_decay
            lr_steps.pop(0)
            self.trainer.set_learning_rate(new_lr)
            self._logger.info("[Epoch {}] Set learning rate to {}".format(
                epoch, new_lr))
        ce_metric.reset()
        smoothl1_metric.reset()
        tic = time.time()
        btic = time.time()
        self.net.hybridize(static_alloc=True, static_shape=True)

        for i, batch in enumerate(train_data):
            if self._cfg.train.dali:
                # dali iterator returns a mxnet.io.DataBatch
                data = [d.data[0] for d in batch]
                box_targets = [d.label[0] for d in batch]
                cls_targets = [
                    nd.cast(d.label[1], dtype='float32') for d in batch
                ]
            else:
                data = gluon.utils.split_and_load(batch[0],
                                                  ctx_list=self.ctx,
                                                  batch_axis=0,
                                                  even_split=False)
                cls_targets = gluon.utils.split_and_load(batch[1],
                                                         ctx_list=self.ctx,
                                                         batch_axis=0,
                                                         even_split=False)
                box_targets = gluon.utils.split_and_load(batch[2],
                                                         ctx_list=self.ctx,
                                                         batch_axis=0,
                                                         even_split=False)

            with autograd.record():
                cls_preds = []
                box_preds = []
                for x in data:
                    cls_pred, box_pred, _ = self.net(x)
                    cls_preds.append(cls_pred)
                    box_preds.append(box_pred)
                sum_loss, cls_loss, box_loss = mbox_loss(
                    cls_preds, box_preds, cls_targets, box_targets)
                if self._cfg.ssd.amp:
                    with amp.scale_loss(sum_loss,
                                        self.trainer) as scaled_loss:
                        autograd.backward(scaled_loss)
                else:
                    autograd.backward(sum_loss)
            # since we have already normalized the loss, we don't want to normalize
            # by batch-size anymore
            self.trainer.step(1)

            # Only rank 0 logs/updates metrics when running under horovod.
            if not self._cfg.horovod or hvd.rank() == 0:
                local_batch_size = int(
                    self._cfg.train.batch_size //
                    (hvd.size() if self._cfg.horovod else 1))
                ce_metric.update(0, [l * local_batch_size for l in cls_loss])
                smoothl1_metric.update(
                    0, [l * local_batch_size for l in box_loss])
                if self._cfg.train.log_interval and not (
                        i + 1) % self._cfg.train.log_interval:
                    name1, loss1 = ce_metric.get()
                    name2, loss2 = smoothl1_metric.get()
                    self._logger.info(
                        '[Epoch %d][Batch %d], Speed: %f samples/sec, %s=%f, %s=%f',
                        epoch, i,
                        self._cfg.train.batch_size / (time.time() - btic),
                        name1, loss1, name2, loss2)
                btic = time.time()

        if not self._cfg.horovod or hvd.rank() == 0:
            name1, loss1 = ce_metric.get()
            name2, loss2 = smoothl1_metric.get()
            self._logger.info('[Epoch %d] Training cost: %f, %s=%f, %s=%f',
                              epoch, (time.time() - tic), name1, loss1,
                              name2, loss2)
            if (epoch % self._cfg.valid.val_interval == 0) or \
                    (self._cfg.save_interval and
                     epoch % self._cfg.save_interval == 0):
                # consider reduce the frequency of validation to save time
                map_name, mean_ap = self._evaluate(val_data)
                val_msg = '\n'.join([
                    '{}={}'.format(k, v) for k, v in zip(map_name, mean_ap)
                ])
                self._logger.info('[Epoch %d] Validation: \n%s', epoch,
                                  str(val_msg))
                current_map = float(mean_ap[-1])
                if current_map > self._best_map:
                    cp_name = os.path.join(self._logdir,
                                           'best_checkpoint.pkl')
                    self._logger.info(
                        '[Epoch %d] Current best map: %f vs previous %f, saved to %s',
                        self.epoch, current_map, self._best_map, cp_name)
                    self.save(cp_name)
                    self._best_map = current_map
                if self._reporter:
                    self._reporter(epoch=epoch, map_reward=current_map)
        self._time_elapsed += time.time() - btic

    # map on train data
    map_name, mean_ap = self._evaluate(train_eval_data)
    return {
        'train_map': float(mean_ap[-1]),
        'valid_map': self._best_map,
        'time': self._time_elapsed
    }
def train():
    """Training function.

    Trains the module-level NMT `model` with the inverse-sqrt warmup
    schedule, gradient accumulation (`args.num_accumulated`), label
    smoothing, and parameter averaging over the last `args.average_start`
    epochs. After training, either averages the saved epoch checkpoints
    (`args.average_checkpoint`), loads the running average, or reloads the
    best-BLEU checkpoint, then reports final valid/test loss and BLEU.

    Relies on module-level state: `model`, `args`, `ctx`, `data_train`,
    `data_val`, `data_test`, `data_*_lengths`, `val_tgt_sentences`,
    `test_tgt_sentences`, `label_smoothing`, `loss_function`, `evaluate`,
    `write_sentences`, and the gluonnlp sampler/batchify helpers.
    """
    trainer = gluon.Trainer(model.collect_params(), args.optimizer,
                            {'learning_rate': args.lr, 'beta2': 0.98,
                             'epsilon': 1e-9})

    # Batchify: pad the variable-length src/tgt sequences, stack lengths.
    train_batchify_fn = btf.Tuple(btf.Pad(), btf.Pad(),
                                  btf.Stack(), btf.Stack())
    test_batchify_fn = btf.Tuple(btf.Pad(), btf.Pad(),
                                 btf.Stack(), btf.Stack(), btf.Stack())
    target_val_lengths = list(map(lambda x: x[-1], data_val_lengths))
    target_test_lengths = list(map(lambda x: x[-1], data_test_lengths))
    if args.bucket_scheme == 'constant':
        bucket_scheme = ConstWidthBucket()
    elif args.bucket_scheme == 'linear':
        bucket_scheme = LinearWidthBucket()
    elif args.bucket_scheme == 'exp':
        bucket_scheme = ExpWidthBucket(bucket_len_step=1.2)
    else:
        raise NotImplementedError
    train_batch_sampler = FixedBucketSampler(lengths=data_train_lengths,
                                             batch_size=args.batch_size,
                                             num_buckets=args.num_buckets,
                                             ratio=args.bucket_ratio,
                                             shuffle=True,
                                             use_average_length=True,
                                             bucket_scheme=bucket_scheme)
    logging.info('Train Batch Sampler:\n{}'.format(
        train_batch_sampler.stats()))
    train_data_loader = DataLoader(data_train,
                                   batch_sampler=train_batch_sampler,
                                   batchify_fn=train_batchify_fn,
                                   num_workers=8)
    val_batch_sampler = FixedBucketSampler(lengths=target_val_lengths,
                                           batch_size=args.test_batch_size,
                                           num_buckets=args.num_buckets,
                                           ratio=args.bucket_ratio,
                                           shuffle=False,
                                           use_average_length=True,
                                           bucket_scheme=bucket_scheme)
    logging.info('Valid Batch Sampler:\n{}'.format(val_batch_sampler.stats()))
    val_data_loader = DataLoader(data_val,
                                 batch_sampler=val_batch_sampler,
                                 batchify_fn=test_batchify_fn,
                                 num_workers=8)
    test_batch_sampler = FixedBucketSampler(lengths=target_test_lengths,
                                            batch_size=args.test_batch_size,
                                            num_buckets=args.num_buckets,
                                            ratio=args.bucket_ratio,
                                            shuffle=False,
                                            use_average_length=True,
                                            bucket_scheme=bucket_scheme)
    logging.info('Test Batch Sampler:\n{}'.format(test_batch_sampler.stats()))
    test_data_loader = DataLoader(data_test,
                                  batch_sampler=test_batch_sampler,
                                  batchify_fn=test_batchify_fn,
                                  num_workers=8)

    # BLEU flavour selects tokenization / BPE handling for compute_bleu.
    if args.bleu == 'tweaked':
        bpe = True
        split_compound_word = True
        tokenized = True
    elif args.bleu == '13a' or args.bleu == 'intl':
        bpe = False
        split_compound_word = False
        tokenized = False
    else:
        raise NotImplementedError

    best_valid_bleu = 0.0
    step_num = 0
    warmup_steps = args.warmup_steps
    grad_interval = args.num_accumulated
    # Accumulate gradients across grad_interval mini-batches.
    model.collect_params().setattr('grad_req', 'add')
    # Optimizer step count at which parameter averaging starts.
    average_start = (len(train_data_loader) // grad_interval) \
        * (args.epochs - args.average_start)
    average_param_dict = None
    model.collect_params().zero_grad()
    for epoch_id in range(args.epochs):
        log_avg_loss = 0
        log_wc = 0
        loss_denom = 0
        step_loss = 0
        log_start_time = time.time()
        for batch_id, (src_seq, tgt_seq, src_valid_length, tgt_valid_length) \
                in enumerate(train_data_loader):
            src_valid_length = nd.cast(src_valid_length, dtype='float32')
            tgt_valid_length = nd.cast(tgt_valid_length, dtype='float32')
            if batch_id % grad_interval == 0:
                step_num += 1
                # Inverse-sqrt warmup schedule from "Attention Is All You Need".
                new_lr = args.lr / math.sqrt(args.num_units) \
                    * min(1. / math.sqrt(step_num),
                          step_num * warmup_steps ** (-1.5))
                trainer.set_learning_rate(new_lr)
            src_wc = src_valid_length.sum().asscalar()
            tgt_wc = tgt_valid_length.sum().asscalar()
            # Number of target tokens excluding the initial BOS per sentence.
            loss_denom += tgt_wc - tgt_valid_length.shape[0]
            if src_seq.shape[0] > len(ctx):
                src_seq_list, tgt_seq_list, src_valid_length_list, \
                    tgt_valid_length_list \
                    = [gluon.utils.split_and_load(seq, ctx, batch_axis=0,
                                                  even_split=False)
                       for seq in [src_seq, tgt_seq,
                                   src_valid_length, tgt_valid_length]]
            else:
                src_seq_list = [src_seq.as_in_context(ctx[0])]
                tgt_seq_list = [tgt_seq.as_in_context(ctx[0])]
                src_valid_length_list = [
                    src_valid_length.as_in_context(ctx[0])]
                tgt_valid_length_list = [
                    tgt_valid_length.as_in_context(ctx[0])]
            Ls = []
            with mx.autograd.record():
                for src_seq, tgt_seq, src_valid_length, tgt_valid_length \
                        in zip(src_seq_list, tgt_seq_list,
                               src_valid_length_list, tgt_valid_length_list):
                    # Teacher forcing: feed tgt[:-1], predict tgt[1:].
                    out, _ = model(src_seq, tgt_seq[:, :-1],
                                   src_valid_length, tgt_valid_length - 1)
                    smoothed_label = label_smoothing(tgt_seq[:, 1:])
                    ls = loss_function(out, smoothed_label,
                                       tgt_valid_length - 1).sum()
                    Ls.append((ls * (tgt_seq.shape[1] - 1)) / args.batch_size)
            for L in Ls:
                L.backward()
            # Apply the accumulated gradients at the end of each interval
            # (or at the final batch of the epoch).
            if batch_id % grad_interval == grad_interval - 1 or \
                    batch_id == len(train_data_loader) - 1:
                if average_param_dict is None:
                    average_param_dict = {
                        k: v.data(ctx[0]).copy()
                        for k, v in model.collect_params().items()}
                trainer.step(float(loss_denom) / args.batch_size)
                param_dict = model.collect_params()
                param_dict.zero_grad()
                if step_num > average_start:
                    # Running average of parameters for the final model.
                    alpha = 1. / max(1, step_num - average_start)
                    for name, average_param in average_param_dict.items():
                        average_param[:] += alpha * (
                            param_dict[name].data(ctx[0]) - average_param)
            step_loss += sum([L.asscalar() for L in Ls])
            if batch_id % grad_interval == grad_interval - 1 or \
                    batch_id == len(train_data_loader) - 1:
                log_avg_loss += step_loss / loss_denom * args.batch_size
                loss_denom = 0
                step_loss = 0
            log_wc += src_wc + tgt_wc
            if (batch_id + 1) % (args.log_interval * grad_interval) == 0:
                wps = log_wc / (time.time() - log_start_time)
                logging.info('[Epoch {} Batch {}/{}] loss={:.4f}, ppl={:.4f}, '
                             'throughput={:.2f}K wps, wc={:.2f}K'
                             .format(epoch_id, batch_id + 1,
                                     len(train_data_loader),
                                     log_avg_loss / args.log_interval,
                                     np.exp(log_avg_loss / args.log_interval),
                                     wps / 1000, log_wc / 1000))
                log_start_time = time.time()
                log_avg_loss = 0
                log_wc = 0
        mx.nd.waitall()
        # End-of-epoch evaluation on valid and test splits.
        valid_loss, valid_translation_out = evaluate(val_data_loader, ctx[0])
        valid_bleu_score, _, _, _, _ = compute_bleu(
            [val_tgt_sentences], valid_translation_out,
            tokenized=tokenized, tokenizer=args.bleu,
            split_compound_word=split_compound_word,
            bpe=bpe)
        logging.info('[Epoch {}] valid Loss={:.4f}, valid ppl={:.4f}, '
                     'valid bleu={:.2f}'
                     .format(epoch_id, valid_loss, np.exp(valid_loss),
                             valid_bleu_score * 100))
        test_loss, test_translation_out = evaluate(test_data_loader, ctx[0])
        test_bleu_score, _, _, _, _ = compute_bleu(
            [test_tgt_sentences], test_translation_out,
            tokenized=tokenized, tokenizer=args.bleu,
            split_compound_word=split_compound_word,
            bpe=bpe)
        logging.info('[Epoch {}] test Loss={:.4f}, test ppl={:.4f}, '
                     'test bleu={:.2f}'
                     .format(epoch_id, test_loss, np.exp(test_loss),
                             test_bleu_score * 100))
        write_sentences(valid_translation_out,
                        os.path.join(args.save_dir,
                                     'epoch{:d}_valid_out.txt')
                        .format(epoch_id))
        write_sentences(test_translation_out,
                        os.path.join(args.save_dir,
                                     'epoch{:d}_test_out.txt')
                        .format(epoch_id))
        if valid_bleu_score > best_valid_bleu:
            best_valid_bleu = valid_bleu_score
            save_path = os.path.join(args.save_dir, 'valid_best.params')
            logging.info('Save best parameters to {}'.format(save_path))
            model.save_params(save_path)
        save_path = os.path.join(args.save_dir,
                                 'epoch{:d}.params'.format(epoch_id))
        model.save_params(save_path)
    save_path = os.path.join(args.save_dir, 'average.params')
    mx.nd.save(save_path, average_param_dict)
    # Choose the final parameter set: checkpoint averaging, running
    # average, or the best-BLEU checkpoint.
    if args.average_checkpoint:
        for j in range(args.num_averages):
            params = mx.nd.load(
                os.path.join(args.save_dir,
                             'epoch{:d}.params'.format(args.epochs - j - 1)))
            alpha = 1. / (j + 1)
            for k, v in model._collect_params_with_prefix().items():
                for c in ctx:
                    v.data(c)[:] += alpha * (params[k].as_in_context(c)
                                             - v.data(c))
    elif args.average_start > 0:
        for k, v in model.collect_params().items():
            v.set_data(average_param_dict[k])
    else:
        model.load_params(os.path.join(args.save_dir, 'valid_best.params'),
                          ctx)
    # Final report with the selected parameters.
    valid_loss, valid_translation_out = evaluate(val_data_loader, ctx[0])
    valid_bleu_score, _, _, _, _ = compute_bleu(
        [val_tgt_sentences], valid_translation_out,
        tokenized=tokenized, tokenizer=args.bleu, bpe=bpe,
        split_compound_word=split_compound_word)
    logging.info('Best model valid Loss={:.4f}, valid ppl={:.4f}, '
                 'valid bleu={:.2f}'
                 .format(valid_loss, np.exp(valid_loss),
                         valid_bleu_score * 100))
    test_loss, test_translation_out = evaluate(test_data_loader, ctx[0])
    test_bleu_score, _, _, _, _ = compute_bleu(
        [test_tgt_sentences], test_translation_out,
        tokenized=tokenized, tokenizer=args.bleu, bpe=bpe,
        split_compound_word=split_compound_word)
    logging.info('Best model test Loss={:.4f}, test ppl={:.4f}, '
                 'test bleu={:.2f}'
                 .format(test_loss, np.exp(test_loss), test_bleu_score * 100))
    write_sentences(valid_translation_out,
                    os.path.join(args.save_dir, 'best_valid_out.txt'))
    write_sentences(test_translation_out,
                    os.path.join(args.save_dir, 'best_test_out.txt'))
def test_cast():
    """Cast a large 2-D tensor to int32 and verify dtype plus the last element."""
    source = create_2d_tensor(rows=SMALL_Y, columns=LARGE_X)
    casted = nd.cast(source, np.int32)
    assert casted.dtype == np.int32
    assert casted[-1][-1] == SMALL_Y - 1
def clip_pass_gradient(x, l=-1., u=1.):
    """Clamp `x` to [l, u] in the forward pass while gradients flow as if unclipped.

    The clamping correction is wrapped in stop_gradient, so backprop sees
    the identity function (a straight-through clip).
    """
    above = nd.cast(x > u, "float32")
    below = nd.cast(x < l, "float32")
    correction = (u - x) * above + (l - x) * below
    return x + nd.stop_gradient(correction)
def min_between(arr1, arr2):
    """Element-wise minimum of two NDArrays, cast to float64.

    Uses `nd.minimum` directly instead of the previous
    stack-via-`asnumpy`-then-`nd.min` round trip, which copied both arrays
    device->host->device and forced the result onto the default context.
    This version stays on the inputs' context and also generalizes to
    broadcastable (not just identical) shapes.

    Parameters
    ----------
    arr1, arr2 : mxnet.nd.NDArray
        Arrays of identical (or broadcastable) shapes.

    Returns
    -------
    mxnet.nd.NDArray
        Element-wise minimum with dtype float64.
    """
    return nd.cast(nd.minimum(arr1, arr2), dtype="float64")