# imports assumed by this snippet
import numpy as np
from mxnet import autograd
from mxnet.metric import Accuracy


def eval_acc(inference, val_loader, ctx, return_meta=False):
    mtc_acc = Accuracy()
    mtc_acc.reset()
    feature_nest, y_nest, y_hat_nest = [], [], []
    for X, y in val_loader:
        X = X.as_in_context(ctx[0])
        y = y.as_in_context(ctx[0])
        # forward pass in test mode; no gradients are needed for evaluation
        with autograd.record(train_mode=False):
            y_hat, features = inference(X)
        # update metric
        mtc_acc.update([y], [y_hat])
        if return_meta:
            y_nest.extend(y.asnumpy())
            feature_nest.extend(features.asnumpy())
            y_hat_nest.extend(y_hat.asnumpy())
    if return_meta:
        return (mtc_acc.get()[1], np.array(y_nest), np.array(y_hat_nest),
                np.array(feature_nest))
    return mtc_acc.get()[1]

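# A minimal, hypothetical driver for eval_acc above. The two-headed network,
# the FashionMNIST data, and every name below are illustrative assumptions,
# not part of the original snippet.
import mxnet as mx
from mxnet import gluon


class TwoHeadNet(gluon.nn.HybridBlock):
    """Toy net returning (logits, features), the pair eval_acc expects."""

    def __init__(self, **kwargs):
        super(TwoHeadNet, self).__init__(**kwargs)
        with self.name_scope():
            self.body = gluon.nn.Dense(64, activation='relu')
            self.head = gluon.nn.Dense(10)

    def hybrid_forward(self, F, x):
        feat = self.body(x)
        return self.head(feat), feat


ctx = [mx.gpu(0) if mx.context.num_gpus() else mx.cpu(0)]
net = TwoHeadNet()
net.initialize(ctx=ctx)
loader = gluon.data.DataLoader(
    gluon.data.vision.FashionMNIST(train=False).transform_first(
        gluon.data.vision.transforms.ToTensor()),
    batch_size=128)
print(eval_acc(net, loader, ctx))  # roughly 0.1 for this untrained net
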
def eval_epoch(self):
    self.is_train = False
    meter = Accuracy()
    meter.reset()
    for X, y in self.test_loader:
        X = X.as_in_context(self.ctx[0])
        y = y.as_in_context(self.ctx[0])
        y_hat, features = self.net(X)
        meter.update([y], [y_hat])
    acc = meter.get()[1]
    logging.info('Test - Epoch {}, Iter {}, Acc {:.2f} %'.format(
        self.cur_epoch, self.cur_iter, acc * 100))
    if acc > self.eval_tracker['Acc']:
        self.eval_tracker.update({
            'Epoch': self.cur_epoch,
            'Iter': self.cur_iter,
            'Acc': acc
        })
        self.net.save_parameters('{}_{}_{}_{:.2f}.params'.format(
            self.cfg.META.CKPT_PATH, self.cur_epoch, self.cur_iter, acc))

def train_block(self, data_iter: DataLoader, docs: Sequence[Document]) -> float:
    acc = Accuracy()
    for dids, sids, data, label in tqdm(data_iter, leave=False):
        # batch_size, sequence_length, input_size -> sequence_length, batch_size, input_size
        X = nd.transpose(data, axes=(1, 0, 2)).as_in_context(self.ctx)
        # batch_size, sequence_length -> sequence_length, batch_size
        Y = label.T.as_in_context(self.ctx)
        state = self.model.begin_state(batch_size=X.shape[1], ctx=self.ctx)
        for s in state:
            s.detach()
        with autograd.record():
            output, state = self.model(X, state)
            l = self.loss(output, Y)
        l.backward()
        grads = [param.grad(self.ctx)
                 for param in self.model.collect_params().values()]
        clip_global_norm(grads, self.model.rnn_layer.clip * X.shape[0] * X.shape[1])
        # sequence_length, batch_size -> batch_size, sequence_length
        for batch, (preds, labels) in enumerate(zip(nd.argmax(output, axis=2).T, label)):
            sen = docs[dids[batch].asscalar()].sentences[sids[batch].asscalar()]
            sequence_length = len(sen)
            # strip padding before scoring
            preds = preds[:sequence_length]
            labels = labels[:sequence_length]
            acc.update(labels=labels, preds=preds)
        self.trainer.step(data.shape[0])
    return float(acc.get()[1])

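# A minimal sketch of the truncated-BPTT pattern used in train_block above:
# detach the carried-over RNN state, then clip the global gradient norm. The
# toy LSTM, shapes, and hyperparameters are assumptions for illustration only.
from mxnet import autograd, gluon, nd
from mxnet.gluon.utils import clip_global_norm

rnn = gluon.rnn.LSTM(hidden_size=8)
rnn.initialize()
loss_fn = gluon.loss.L2Loss()
trainer = gluon.Trainer(rnn.collect_params(), 'sgd', {'learning_rate': 0.1})

state = rnn.begin_state(batch_size=4)
for _ in range(3):
    x = nd.random.normal(shape=(5, 4, 8))       # (seq_len, batch, input)
    target = nd.random.normal(shape=(5, 4, 8))
    # cut the autograd graph between batches so gradients stop here
    state = [s.detach() for s in state]
    with autograd.record():
        out, state = rnn(x, state)
        loss = loss_fn(out, target)
    loss.backward()
    grads = [p.grad() for p in rnn.collect_params().values()]
    clip_global_norm(grads, max_norm=1.0)
    trainer.step(4)
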
def evaluate_block(self, data_iter: DataLoader, docs: Sequence[Document]) -> float:
    """
    :param data_iter: batch iterator over the evaluation data.
    :param docs: documents whose sentences hold the gold and predicted labels.
    :return: chunk F1 if chunking is enabled, otherwise token-level accuracy.
    """
    self.decode_block(data_iter=data_iter, docs=docs)
    if self.chunking:
        acc = ChunkF1()
        for doc in docs:
            for sen in doc.sentences:
                acc.update(labels=sen[to_gold(self.key)], preds=sen[self.key])
    else:
        acc = Accuracy()
        for doc in docs:
            for sen in doc.sentences:
                labels = nd.array([
                    self.label_map.cid(label)
                    for label in sen[to_gold(self.key)]
                ])
                preds = nd.array(
                    [self.label_map.cid(pred) for pred in sen[self.key]])
                acc.update(labels=labels, preds=preds)
    return acc.get()[1]

def eval(self, inference, val_loader, log=True, target=True, epoch=True):
    """
    Evaluate the model.
    :param inference: network
    :param val_loader: data loader
    :param log: log flag
    :param target: target flag for updating the record and log
    :param epoch: epoch flag for updating the record and log
    :return: accuracy, labels, predictions, features
    """
    mtc_acc = Accuracy()
    mtc_acc.reset()
    # val_loader.reset()
    feature_nest, y_nest, y_hat_nest = [], [], []
    for X, Y in val_loader:
        # spread the batch over all available devices
        X_lst = split_and_load(X, self.args.ctx, even_split=False)
        Y_lst = split_and_load(Y, self.args.ctx, even_split=False)
        for x, y in zip(X_lst, Y_lst):
            y_hat, features = inference(x)
            # update metric
            mtc_acc.update([y], [y_hat])
            y_nest.extend(y.asnumpy())
            feature_nest.extend(features.asnumpy())
            y_hat_nest.extend(y_hat.asnumpy())
    feature_nest = np.array(feature_nest)
    y_nest = np.array(y_nest).astype(int)
    y_hat_nest = np.array(y_hat_nest)
    acc = mtc_acc.get()[1]
    if log:
        target_key = 'Tgt' if target else 'Src'
        epoch_key = 'Epoch' if epoch else 'Iter'
        record = self.cur_epoch if epoch else self.cur_iter
        # keep only the best result seen for this record type
        if acc > self.records[epoch_key]['%s-Acc' % target_key]:
            if target:
                self.records[epoch_key][epoch_key] = record
            self.records[epoch_key]['%s-Acc' % target_key] = acc
            self.records[epoch_key]['%s-label' % target_key] = y_nest
            self.records[epoch_key]['%s-preds' % target_key] = y_hat_nest
            self.records[epoch_key]['%s-features' % target_key] = feature_nest
            self.save_params(inference, 0, epoch_key)
        self.logger.update_scalar(
            '%s [%d]: Eval-Acc-%s' % (epoch_key, record, target_key), acc)
        if self.sw:
            # note: the original interpolated the boolean `epoch` flag here;
            # the string key is what the tag is meant to carry
            self.sw.add_scalar('Acc/Eval-%s-Acc-%s' % (epoch_key, target_key),
                               acc, global_step=record)
    return acc, y_nest, y_hat_nest, feature_nest

def evaluate(self, itr, ctx):
    metric = Accuracy()
    itr.reset()
    for batch in itr:
        data = batch.data[0].as_in_context(context=ctx)
        label = batch.label[0].as_in_context(context=ctx)
        output = self._net(data)
        metric.update(label, output)
    return metric

def run_training(net, trainer, train_dataloader, val_dataloader, epochs,
                 model_path, context):
    loss_fn = mx.gluon.loss.SoftmaxCrossEntropyLoss()
    for e in range(epochs):
        train_acc = Accuracy()
        val_acc = Accuracy()
        train_loss = 0.
        total_items = 0
        for i, (data, label) in enumerate(train_dataloader):
            items_per_iteration = data.shape[0]
            total_items += items_per_iteration
            data = data.as_in_context(context)
            label = label.as_in_context(context)
            with autograd.record():
                output = net(data)
                output = output.reshape((-1, 3))
                label = label.reshape((-1, 1))
                loss = loss_fn(output, label)
            loss.backward()
            trainer.step(items_per_iteration)
            # accumulate the summed loss so train_loss / total_items below
            # really is an average per item (the original summed batch means)
            train_loss += loss.sum().asscalar()
            train_acc.update(label.flatten(), output.argmax(axis=1).flatten())
        for i, (data, label) in enumerate(val_dataloader):
            data = data.as_in_context(context)
            label = label.as_in_context(context)
            output = net(data)
            output = output.reshape((-1, 3))
            val_acc.update(label.reshape(-1, 1).flatten(),
                           output.argmax(axis=1).flatten())
        print("Epoch {}. Current Loss: {:.5f}. "
              "Train accuracy: {:.3f}, Validation accuracy: {:.3f}.".format(
                  e, train_loss / total_items, train_acc.get()[1],
                  val_acc.get()[1]))
    net.save_parameters(model_path)
    return model_path

def validate(net, val_loader, gpu_id, train_index2words, val_index2words):
    metric = BleuMetric(pred_index2words=train_index2words,
                        label_index2words=val_index2words)
    metric_acc = Accuracy()
    metric_acc.reset()
    metric.reset()
    for batch in tqdm.tqdm(val_loader):
        batch = [x.as_in_context(mx.gpu(gpu_id)) for x in batch]
        image, label, label_len = batch
        predictions, alphas = net(image, None, None)
        for n, l in enumerate(label_len):
            l = int(l.asscalar())
            # drop the leading start token from the label before scoring
            la = label[n, 1:l]
            pred = predictions[n, :]
            metric.update(la, pred)
            metric_acc.update(la, predictions[n, :(l - 1)])
    return metric.get()[1], metric_acc.get()[1]

def train_block(self, data_iter: DataLoader, docs: Sequence[Document]) -> float:
    """
    :param data_iter: batch iterator over the training data.
    :param docs: documents backing the batches.
    :return: training accuracy for this block.
    """
    acc = Accuracy()
    for data, label in tqdm(data_iter, leave=False):
        data = data.as_in_context(self.ctx)
        label = label.as_in_context(self.ctx)
        with autograd.record():
            output = self.model(data)
            l = self.loss(output, label)
        l.backward()
        for preds, labels in zip(nd.argmax(output, axis=1), label):
            acc.update(labels=labels, preds=preds)
        self.trainer.step(data.shape[0])
    return float(acc.get()[1])

def validate(net, val_loader, gpu_id, train_index2words, val_index2words):
    metric = BleuMetric(pred_index2words=train_index2words,
                        label_index2words=val_index2words)
    metric_acc = Accuracy()
    metric_acc.reset()
    metric.reset()
    for batch in tqdm.tqdm(val_loader):
        # batches come from an MXNet loader; move them to torch tensors on
        # the GPU (the legacy Variable wrapper is a no-op in modern torch)
        batch = [torch.from_numpy(x.asnumpy()).cuda() for x in batch]
        image, label, label_len = batch
        label = label.long()
        label_len = label_len.long()
        predictions, alphas = net(image, None, None)
        for n, l in enumerate(label_len):
            l = int(l.data.cpu().numpy().squeeze().tolist())
            la = label[n, 1:l].data.cpu().numpy()
            pred = predictions[n, :].data.cpu().numpy()
            metric.update(la, pred)
            metric_acc.update(
                mx.nd.array(la),
                mx.nd.array(predictions[n, :(l - 1)].data.cpu().numpy()))
    return metric.get()[1], metric_acc.get()[1]

def evaluation(self, x, y_true):
    """
    Given a batch of data and one-hot labels, compute evaluation metrics
    between the prediction and y_true.
    :param x: data
    :param y_true: label (one-hot-like)
    :return: (accuracy, cross-entropy, recall, precision)
    """
    # recover class indices from the one-hot labels
    nor_label = nd.argmax(y_true, axis=1, keepdims=False)  # type: nd.NDArray
    # predict() does not necessarily return softmax-normalized values, so
    # renormalize each row to sum to 1; otherwise the cross-entropy can be NaN
    raw_pred = self.predict(x)  # type: nd.NDArray
    y_pred = raw_pred / raw_pred.sum(axis=1, keepdims=True)  # type: nd.NDArray
    y_pred_sparse = y_pred.argmax(axis=1, keepdims=False)
    # accuracy
    acc = Accuracy()
    acc.update(labels=[nor_label], preds=[y_pred])
    acc_val = acc.get()[1]
    # cross entropy
    cro = CrossEntropy()
    cro.update(labels=[nor_label], preds=[y_pred])
    cro_val = cro.get()[1]
    # choose the sklearn averaging mode: more than two classes means a
    # multi-class problem, so use 'macro'; otherwise use 'binary'
    average = "macro" if len(raw_pred[0]) > 2 else "binary"
    # recall
    recall = recall_score(nor_label.asnumpy(), y_pred_sparse.asnumpy(),
                          average=average, pos_label=self.pos_label)
    # precision
    precision = precision_score(nor_label.asnumpy(), y_pred_sparse.asnumpy(),
                                average=average, pos_label=self.pos_label)
    return acc_val, cro_val, recall, precision

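# A small self-contained check of the metric plumbing used in evaluation()
# above, with made-up one-hot labels and unnormalized scores.
import mxnet.ndarray as nd
from mxnet.metric import Accuracy, CrossEntropy

y_true = nd.array([[1, 0, 0], [0, 0, 1]])          # one-hot labels
raw_pred = nd.array([[2., 1., 1.], [1., 1., 6.]])  # unnormalized scores
y_pred = raw_pred / raw_pred.sum(axis=1, keepdims=True)

labels = nd.argmax(y_true, axis=1)
acc, ce = Accuracy(), CrossEntropy()
acc.update(labels=[labels], preds=[y_pred])
ce.update(labels=[labels], preds=[y_pred])
print(acc.get(), ce.get())  # both samples correct, modest cross-entropy
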
# assumed imports for this standalone evaluation script
import mxnet as mx
from mxnet import autograd, context, cpu, gluon, gpu
from mxnet.gluon.data.vision import transforms
from mxnet.metric import Accuracy


def load_net(param_file="net.params", ctx=cpu(0)):
    net = SimpleNet()
    net.load_parameters(param_file, ctx=ctx)
    return net


def get_val_data(transformer, batch_size=128):
    mnist_valid = gluon.data.vision.FashionMNIST(train=False)
    valid_data = gluon.data.DataLoader(
        mnist_valid.transform_first(transformer),
        batch_size=batch_size,
        num_workers=4)
    return valid_data


if __name__ == "__main__":
    ctx = gpu(0) if context.num_gpus() else cpu(0)
    net = load_net("net.params", ctx=ctx)
    # `transformer` was undefined in the original script; a plain ToTensor is
    # the minimal transform matching the loader above
    transformer = transforms.ToTensor()
    valid_data = get_val_data(transformer)
    val_acc = Accuracy()
    for data, label in valid_data:
        data = data.as_in_context(ctx)
        label = label.as_in_context(ctx)
        with autograd.predict_mode():
            out = net(data)
        val_acc.update(label, out)
    print("Accuracy: ", val_acc.get()[1])

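# The script above assumes a SimpleNet defined elsewhere. A minimal,
# hypothetical stand-in compatible with FashionMNIST could look like this;
# the layer sizes are illustrative, not the original architecture.
from mxnet import gluon


class SimpleNet(gluon.nn.HybridSequential):
    def __init__(self, **kwargs):
        super(SimpleNet, self).__init__(**kwargs)
        with self.name_scope():
            self.add(gluon.nn.Conv2D(16, kernel_size=3, activation='relu'),
                     gluon.nn.MaxPool2D(),
                     gluon.nn.Flatten(),
                     gluon.nn.Dense(10))
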
# fragment: per-batch body of a multi-device training loop, followed by
# epoch-level reporting and the start of evaluation
        output = []
        losses = []
        with ag.record():
            for x, y in zip(data, label):
                z = model(x)
                # computes softmax cross entropy loss
                l = loss_fn(z, y)
                output.append(z)
                losses.append(l)
        # backpropagate the error for one iteration.
        for l in losses:
            l.backward()
        # Update network weights
        trainer.step(BATCH_SIZE)
        # Update metric
        metric.update(label, output)
    str1 = 'Epoch [{}], Accuracy {:.4f}'.format(epoch, metric.get()[1])
    str2 = '~Samples/Sec {:.4f}'.format(BATCH_SIZE * (i + 1) /
                                        (time.time() - tick_0))
    print('%s %s' % (str1, str2))
    metric.reset()
elapsed = time.perf_counter() - start
print('elapsed: {:0.3f}'.format(elapsed))

# use Accuracy as the evaluation metric
metric = Accuracy()
for data, label in test_data:
    data = split_and_load(data, ctx_list=ctx, batch_axis=0)
    label = split_and_load(label, ctx_list=ctx, batch_axis=0)
    outputs = []

def fit(self, itr, ctx, epochs, batch_size, callbacks=None):
    # ADAM optimizer
    # opt_params = {'learning_rate': 0.001, 'beta1': 0.9, 'beta2': 0.999, 'epsilon': 1e-08}
    opt = mx.optimizer.create('adam')
    # SGD optimizer
    # opt = mx.optimizer.create('sgd')
    # AdaDelta optimizer
    # opt = mx.optimizer.create('adadelta')

    # initialize parameters
    # MXNet initializes the weight matrices uniformly by drawing from
    # [-0.07, 0.07]; bias parameters are all set to 0
    # 'Xavier' is designed to keep the scale of gradients roughly the same
    # in all layers
    self._net.initialize(mx.init.Xavier(magnitude=2.3), ctx=ctx,
                         force_reinit=True)
    # fetch and broadcast parameters
    params = self._net.collect_params()
    # trainer
    trainer = Trainer(params=params, optimizer=opt, kvstore='device')
    # loss function
    loss_fn = SoftmaxCrossEntropyLoss()
    # use accuracy as the evaluation metric
    metric = Accuracy()
    # train
    for e in range(epochs):
        if callbacks is not None:
            for cb in callbacks:
                cb.before_epoch(e)
        # reset evaluation result to initial state
        metric.reset()
        # reset the train data iterator.
        itr.reset()
        # loop over the train data iterator
        for i, batch in enumerate(itr):
            # splits train data into multiple slices along batch_axis
            # and copies each slice into a context
            data = split_and_load(batch.data[0], ctx_list=ctx, batch_axis=0,
                                  even_split=False)
            # splits train label into multiple slices along batch_axis
            # and copies each slice into a context
            label = split_and_load(batch.label[0], ctx_list=ctx, batch_axis=0,
                                   even_split=False)
            outputs = []
            losses = []
            # inside training scope
            with ag.record():
                for x, y in zip(data, label):
                    z = self._net(x)
                    # computes softmax cross entropy loss
                    l = loss_fn(z, y)
                    outputs.append(z)
                    losses.append(l)
            # backpropagate the error for one iteration
            for l in losses:
                l.backward()
            # make one step of parameter update; the trainer needs to know
            # the batch size to normalize the gradient by 1/batch_size
            trainer.step(batch_size)
            # updates internal evaluation
            metric.update(label, outputs)
            # invoke callbacks after batch
            if callbacks is not None:
                for cb in callbacks:
                    cb.after_batch(e, i, batch_size, metric)
        # invoke callbacks after epoch
        if callbacks is not None:
            for cb in callbacks:
                cb.after_epoch(e, i, batch_size, metric)
    return metric

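# A hypothetical driver for fit() above. The synthetic data, the NDArrayIter
# setup, and `clf` (some classifier object from this codebase exposing fit)
# are assumptions for illustration only.
import mxnet as mx
from mxnet.io import NDArrayIter

x = mx.nd.random.normal(shape=(1000, 20))
y = mx.nd.random.randint(0, 10, shape=(1000,)).astype('float32')
itr = NDArrayIter(data=x, label=y, batch_size=50, shuffle=True)
ctx = [mx.gpu(0), mx.gpu(1)] if mx.context.num_gpus() >= 2 else [mx.cpu(0)]
metric = clf.fit(itr, ctx=ctx, epochs=5, batch_size=50)  # clf defined elsewhere
print(metric.get())
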
def main():
    epochs = 32
    gpu_id = 7
    ctx_list = [mx.gpu(x) for x in [7, 8]]
    log_interval = 100
    batch_size = 32
    start_epoch = 0
    # trainer_resume = resume + ".states" if resume is not None else None
    trainer_resume = None
    resume = None

    from mxnet.gluon.data.vision import transforms
    transform_fn = transforms.Compose([
        LeftTopPad(dest_shape=(256, 256)),
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.485, 0.456, 0.406),
                             std=(0.229, 0.224, 0.225))
    ])
    dataset = CaptionDataSet(
        image_root="/data3/zyx/yks/coco2017/train2017",
        annotation_path="/data3/zyx/yks/coco2017/annotations/captions_train2017.json",
        transforms=transform_fn,
        feature_hdf5="output/train2017.h5")
    val_dataset = CaptionDataSet(
        image_root="/data3/zyx/yks/coco2017/val2017",
        annotation_path="/data3/zyx/yks/coco2017/annotations/captions_val2017.json",
        words2index=dataset.words2index,
        index2words=dataset.index2words,
        transforms=transform_fn,
        feature_hdf5="output/val2017.h5")
    dataloader = DataLoader(dataset=dataset,
                            batch_size=batch_size,
                            shuffle=True,
                            num_workers=1,
                            pin_memory=True,
                            last_batch="discard")
    val_loader = DataLoader(dataset=val_dataset,
                            batch_size=batch_size,
                            shuffle=True,
                            num_workers=1,
                            pin_memory=True)
    num_words = dataset.words_count

    # set up logger
    save_prefix = "output/res50_"
    logging.basicConfig()
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    log_file_path = save_prefix + '_train.log'
    log_dir = os.path.dirname(log_file_path)
    if log_dir and not os.path.exists(log_dir):
        os.makedirs(log_dir)
    fh = logging.FileHandler(log_file_path)
    logger.addHandler(fh)

    net = EncoderDecoder(num_words=num_words, test_max_len=val_dataset.max_len)
    if resume is not None:
        net.collect_params().load(resume, allow_missing=True, ignore_extra=True)
        logger.info("Resumed from checkpoint {}.".format(resume))
    # initialize any parameter that was not loaded from the checkpoint
    params = net.collect_params()
    for key in params.keys():
        if params[key]._data is not None:
            continue
        if "bias" in key or "mean" in key or "beta" in key:
            params[key].initialize(init=mx.init.Zero())
            logging.info("initialized {} using Zero.".format(key))
        elif "weight" in key:
            params[key].initialize(init=mx.init.Normal())
            logging.info("initialized {} using Normal.".format(key))
        elif "var" in key or "gamma" in key:
            params[key].initialize(init=mx.init.One())
            logging.info("initialized {} using One.".format(key))
        else:
            params[key].initialize(init=mx.init.Normal())
            logging.info("initialized {} using Normal.".format(key))
    net.collect_params().reset_ctx(ctx=ctx_list)

    trainer = mx.gluon.Trainer(
        net.collect_params(),
        'adam',
        {'learning_rate': 4e-4, 'clip_gradient': 5, 'multi_precision': True},
    )
    if trainer_resume is not None:
        trainer.load_states(trainer_resume)
        logger.info("Loaded trainer states from checkpoint {}.".format(trainer_resume))
    criterion = Criterion()
    accu_top3_metric = TopKAccuracy(top_k=3)
    accu_top1_metric = Accuracy(name="batch_accu")
    ctc_loss_metric = Loss(name="ctc_loss")
    alpha_metric = Loss(name="alpha_loss")
    batch_bleu = BleuMetric(name="batch_bleu",
                            pred_index2words=dataset.index2words,
                            label_index2words=dataset.index2words)
    epoch_bleu = BleuMetric(name="epoch_bleu",
                            pred_index2words=dataset.index2words,
                            label_index2words=dataset.index2words)
    btic = time.time()
    logger.info(batch_size)
    logger.info(num_words)
    logger.info(len(dataset.words2index))
    logger.info(len(dataset.index2words))
    logger.info(dataset.words2index["<PAD>"])
    logger.info(val_dataset.words2index["<PAD>"])
    logger.info(len(val_dataset.words2index))
    # net.hybridize(static_alloc=True, static_shape=True)
    net_parallel = DataParallelModel(net, ctx_list=ctx_list, sync=True)

    for nepoch in range(start_epoch, epochs):
        if nepoch > 15:
            trainer.set_learning_rate(4e-5)
        logger.info("Current lr: {}".format(trainer.learning_rate))
        accu_top1_metric.reset()
        accu_top3_metric.reset()
        ctc_loss_metric.reset()
        alpha_metric.reset()
        epoch_bleu.reset()
        batch_bleu.reset()
        for nbatch, batch in enumerate(tqdm.tqdm(dataloader)):
            batch = [mx.gluon.utils.split_and_load(x, ctx_list) for x in batch]
            inputs = [[x[n] for x in batch] for n, _ in enumerate(ctx_list)]
            losses = []
            with ag.record():
                net_parallel.sync = nbatch > 1
                outputs = net_parallel(*inputs)
                for s_batch, s_outputs in zip(inputs, outputs):
                    image, label, label_len = s_batch
                    predictions, alphas = s_outputs
                    ctc_loss = criterion(predictions, label, label_len)
                    # doubly stochastic attention regularization
                    loss2 = 1.0 * ((1. - alphas.sum(axis=1)) ** 2).mean()
                    losses.extend([ctc_loss, loss2])
                ag.backward(losses)
            trainer.step(batch_size=batch_size, ignore_stale_grad=True)
            # metrics are computed on the last device's shard
            for n, l in enumerate(label_len):
                l = int(l.asscalar())
                la = label[n, 1:l]
                pred = predictions[n, :(l - 1)]
                accu_top3_metric.update(la, pred)
                accu_top1_metric.update(la, pred)
                epoch_bleu.update(la, predictions[n, :])
                batch_bleu.update(la, predictions[n, :])
            ctc_loss_metric.update(None, preds=nd.sum(ctc_loss) / image.shape[0])
            alpha_metric.update(None, preds=loss2)
            if nbatch % log_interval == 0 and nbatch > 0:
                msg = ','.join([
                    '{}={:.3f}'.format(*metric.get()) for metric in [
                        epoch_bleu, batch_bleu, accu_top1_metric,
                        accu_top3_metric, ctc_loss_metric, alpha_metric
                    ]
                ])
                logger.info(
                    '[Epoch {}][Batch {}], Speed: {:.3f} samples/sec, {}'.format(
                        nepoch, nbatch,
                        log_interval * batch_size / (time.time() - btic), msg))
                btic = time.time()
                batch_bleu.reset()
                accu_top1_metric.reset()
                accu_top3_metric.reset()
                ctc_loss_metric.reset()
                alpha_metric.reset()
        bleu, acc_top1 = validate(net,
                                  gpu_id=gpu_id,
                                  val_loader=val_loader,
                                  train_index2words=dataset.index2words,
                                  val_index2words=val_dataset.index2words)
        save_path = save_prefix + "_weights-%d-bleu-%.4f-%.4f.params" % (
            nepoch, bleu, acc_top1)
        net.collect_params().save(save_path)
        trainer.save_states(fname=save_path + ".states")
        logger.info("Saved checkpoint to {}.".format(save_path))

def main(train_list, val_list, model, exp, saved_model, batch_size, optimizer,
         nb_epochs, augment, max_lr, min_lr, loss_function, train_all,
         nb_frames, eager, params=None, **kwargs):
    print("Unused arguments:", kwargs)
    setname = train_list.split(os.sep)[0]
    # Timestamp to name experiment folder
    xptime = strftime("%Y-%m-%d_%Hh%Mm%Ss", gmtime())
    xp_folder = "experiments/%s-%s-%s_%s" % (setname, model, exp, xptime)
    # Make folder
    mkdir_p(xp_folder)
    mkdir_p(os.path.join(xp_folder, 'checkpoints'))
    mkdir_p(os.path.join(xp_folder, 'tb'))
    print("\nSaving experiment data to:", xp_folder)

    # Save command (as well as possible)
    with open(os.path.join(xp_folder, 'command.sh'), "w") as f:
        command = " ".join(sys.argv[:]) + "\n"
        f.write(command)

    # Save employed parameters for future reference
    if params is not None:
        write_params(os.path.join(xp_folder, 'params.json'), params)

    #############
    # Callbacks #
    #############

    # Helper: Save the model.
    ckpt_fmt = os.path.join(
        xp_folder, 'checkpoints',
        model + '-' + exp + '.{epoch:03d}-loss{val_loss:.3f}-acc{val_acc:.3f}.hdf5')
    checkpointer = ModelCheckpoint(filepath=ckpt_fmt,
                                   verbose=1,
                                   save_best_only=True,
                                   monitor='val_acc')
    # Helper: TensorBoard
    tb = HistoryKeeper(logdir=os.path.join(xp_folder),
                       keys=['val_acc', 'val_loss', 'train_time', 'val_time'])
    # Helper: Stop when we stop learning.
    # early_stopper = EarlyStopper(patience=15)
    # Helper: Terminate when finding a NaN loss
    nan_term = TerminateOnNaN()
    callbacks = [tb, checkpointer, nan_term]

    #############
    # Loading   #
    #############
    if augment:
        augmenter = default_augmenter_vid(strip_size=4)
    else:
        augment = False
        augmenter = None

    # Dataset classes
    train_data = ArrayData(train_list,
                           nb_frames=nb_frames,
                           augmenter=augmenter,
                           eager=eager)
    val_data = ArrayData(val_list,
                         nb_frames=nb_frames,
                         augmenter=None,
                         eager=eager,
                         encoder=train_data.get_encoder())

    # Saving encoder
    with open(os.path.join(xp_folder, 'encoder.pkl'), 'wb') as f:
        pickle.dump(train_data.get_encoder(), f)

    # Train loader
    train_loader = DataLoader(train_data,
                              batch_size=batch_size,
                              shuffle=True,
                              last_batch='keep',
                              num_workers=10)
    nb_samples = len(train_data)  # loader should provide the number of samples
    # Validation loader
    val_loader = DataLoader(val_data,
                            batch_size=batch_size,
                            shuffle=False,
                            last_batch='keep',
                            num_workers=10)
    nb_validation = len(val_data)  # loader should provide the number of samples

    # Compute number of steps
    steps_per_epoch = math.ceil(nb_samples / batch_size)
    validation_steps = math.ceil(nb_validation / batch_size)

    # The model
    net = ResearchModels(train_data.nb_classes,
                         model,
                         saved_model,
                         input_shape=train_data.shape,
                         train_all=train_all).model

    # A little more verbosity
    print("************************************")
    if train_all:
        print("Train all layers.")
    print("Max lr:", max_lr, " Min lr:", min_lr)
    print("Batch size:", batch_size)
    print(nb_samples, "training samples,", steps_per_epoch, "steps per epoch")
    print(nb_validation, "validation samples,", validation_steps,
          "validation steps")
    print("Optimizer:", optimizer)
    if augment:
        print("Using data augmentation")
    else:
        print("WARNING: Not using data augmentation")
    print("************************************")

    ############################
    # Loss and Optimization    #
    ############################
    trainer = gluon.Trainer(net.collect_params(), optimizer,
                            {'learning_rate': max_lr})
    if loss_function == 'categorical_crossentropy':
        loss_fn = gluon.loss.SoftmaxCrossEntropyLoss()
        loss_fn.hybridize()

    ############
    # Training #
    ############
    progress_desc = "Super epoch %03d - acc %.3f - loss %.3f "
    acc = Accuracy()
    start_time = time()
    super_epoch_size = 250

    # Learning rate decay
    iteration = 1
    decay_alpha = 0.01 ** 0.25
    lr = max_lr
    for epoch in range(1, nb_epochs + 1):
        train_loss, val_loss = 0., 0.
        nb_batches = 0
        tic = time()
        acc.reset()
        start_training = time()
        t = tqdm(range(super_epoch_size), unit='epochs')
        for _ in t:
            for data, label in train_loader:
                # Learning rate decay
                if iteration % 10000 == 0:
                    lr *= decay_alpha
                    trainer.set_learning_rate(lr)
                    print("Learning rate updated to", lr)
                iteration += 1
                current_batch_size = data.shape[0]
                data = data.copyto(mx.gpu(0))
                label = label.copyto(mx.gpu(0))
                with autograd.record():
                    output = net(data)
                    loss = loss_fn(output, label)
                loss.backward()
                # print(mx.nd.log_softmax(output[0], axis=-1), label[0])
                # update parameters
                trainer.step(current_batch_size)
                # calculate training metrics
                train_loss += loss.mean().asscalar()
                # accuracy(output, label)
                acc.update(preds=output, labels=label)
                nb_batches += 1
            t.set_description(progress_desc %
                              (epoch, acc.get()[1], train_loss / nb_batches))
        train_time = time() - start_training
        train_loss /= steps_per_epoch * super_epoch_size
        train_acc = acc.get()[1]
        acc.reset()

        start_val = time()
        # calculate validation accuracy
        tval = tqdm(val_loader, leave=False, desc='Running validation',
                    unit='batch')
        for data, label in tval:
            data = data.copyto(mx.gpu(0))
            label = label.copyto(mx.gpu(0))
            # Compute outputs
            output = net(data)
            loss = loss_fn(output, label)
            # Compute metrics
            val_loss += loss.mean().asscalar()
            # val_acc += accuracy(output, label)
            acc.update(preds=output, labels=label)
        val_time = time() - start_val
        val_loss /= validation_steps
        val_acc = acc.get()[1]

        print("Epoch %d: loss %.3f, acc %.3f, val_loss %.3f, val_acc %.3f, in %.1f sec"
              % (epoch, train_loss, train_acc, val_loss, val_acc, time() - tic))
        print("--------------------------------------------------------------------------------")

        stop = False
        train_info = {
            'epoch': epoch,
            'loss': train_loss,
            'acc': train_acc,
            'val_loss': val_loss,
            'val_acc': val_acc,
            'train_time': train_time,
            'val_time': val_time
        }
        for cb in callbacks:
            if cb(net, train_info):
                stop = True
        if stop:
            break
        print()

    hours, rem = divmod(time() - start_time, 3600)
    days, hours = divmod(hours, 24)
    minutes, seconds = divmod(rem, 60)
    print("%d training epochs in %dd, %dh%dm%.2fs." %
          (nb_epochs, int(days), int(hours), int(minutes), seconds))

def run_training(net, trainer, train_dataloader, val_dataloader, intents_count,
                 epochs, model_path, context):
    intent_loss_fn = mx.gluon.loss.SoftmaxCrossEntropyLoss()
    max_val_accuracy = 0
    best_model_path = ''
    for e in range(epochs):
        intent_train_acc = Accuracy()
        slot_train_acc = Accuracy()
        intent_val_acc = Accuracy()
        slot_val_acc = Accuracy()
        train_loss = 0.
        total_items = 0

        for i, (data, valid_lengths, entities, intent) in enumerate(train_dataloader):
            length = data.shape[1]
            items_per_iteration = data.shape[0]
            total_items += items_per_iteration
            data = data.as_in_context(context)
            intent = intent.as_in_context(context)
            entities = entities.as_in_context(context)
            hidden_state = net.elmo_container[0].begin_state(
                mx.nd.zeros, batch_size=items_per_iteration, ctx=context)
            mask = get_data_mask(length, valid_lengths, items_per_iteration,
                                 context)
            with autograd.record():
                intents, slots = net(data, hidden_state, mask)
                intents = intents.reshape((-1, intents_count))
                intent = intent.reshape((-1, 1))
                loss_intent = intent_loss_fn(intents, intent)
                # the CRF accepts seq_len x batch_size x channels
                score, slots_seq = net.crf(slots.transpose(axes=(1, 0, 2)))
                neg_log_likelihood = net.crf.neg_log_likelihood(
                    slots.transpose(axes=(1, 0, 2)), entities)
                # weighted sum of the intent and slot losses
                loss = 0.1 * loss_intent.mean() + 0.9 * neg_log_likelihood.mean()
            loss.backward()
            trainer.step(1)
            train_loss += loss.mean().asscalar()
            intent_train_acc.update(intent.flatten(),
                                    intents.argmax(axis=1).flatten())
            slot_train_acc.update(entities, slots_seq)

        for i, (data, valid_lengths, entities, intent) in enumerate(val_dataloader):
            items_per_iteration = data.shape[0]
            length = data.shape[1]
            data = data.as_in_context(context)
            intent = intent.as_in_context(context)
            entities = entities.as_in_context(context)
            hidden_state = net.elmo_container[0].begin_state(
                mx.nd.zeros, batch_size=items_per_iteration, ctx=context)
            mask = get_data_mask(length, valid_lengths, items_per_iteration,
                                 context)
            intents, slots = net(data, hidden_state, mask)
            intents = intents.reshape((-1, intents_count))
            intent = intent.reshape((-1, 1))
            score, slots_seq = net.crf(slots.transpose(axes=(1, 0, 2)))
            intent_val_acc.update(intent.flatten(),
                                  intents.argmax(axis=1).flatten())
            slot_val_acc.update(entities, slots_seq)

        print("Epoch {}. Current Loss: {:.5f}. \n"
              "Intent train accuracy: {:.3f}, Slots train accuracy: {:.3f}, \n"
              "Intent valid accuracy: {:.3f}, Slot val accuracy: {:.3f}".format(
                  e, train_loss / total_items, intent_train_acc.get()[1],
                  slot_train_acc.get()[1], intent_val_acc.get()[1],
                  slot_val_acc.get()[1]))

        # keep the checkpoint with the best slot validation accuracy
        if max_val_accuracy < slot_val_acc.get()[1]:
            max_val_accuracy = slot_val_acc.get()[1]
            best_model_path = model_path + '_{:04d}.params'.format(e)
            net.save_parameters(best_model_path)
            print("Improvement observed")
        else:
            print("No improvement")
    return best_model_path

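# A plausible sketch of the get_data_mask helper called above. The real
# implementation lives elsewhere in this codebase, so treat this as an
# assumption: a (batch, length) 0/1 mask marking the valid token positions
# of each padded sequence.
import mxnet as mx


def get_data_mask(length, valid_lengths, batch_size, context):
    # batch_size is implied by valid_lengths; kept for signature compatibility
    positions = mx.nd.arange(length, ctx=context).expand_dims(0)   # (1, length)
    valid = valid_lengths.as_in_context(context).astype('float32').expand_dims(1)
    return positions < valid  # broadcast compare -> (batch_size, length)
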
def main():
    epochs = 32
    gpu_id = 7
    ctx_list = [mx.gpu(x) for x in [7, 8]]
    log_interval = 100
    batch_size = 32
    start_epoch = 0
    # trainer_resume = resume + ".states" if resume is not None else None
    trainer_resume = None
    resume = None

    from mxnet.gluon.data.vision import transforms
    transform_fn = transforms.Compose([
        LeftTopPad(dest_shape=(256, 256)),
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.485, 0.456, 0.406),
                             std=(0.229, 0.224, 0.225))
    ])
    dataset = CaptionDataSet(
        image_root="/data3/zyx/yks/coco2017/train2017",
        annotation_path="/data3/zyx/yks/coco2017/annotations/captions_train2017.json",
        transforms=transform_fn,
        feature_hdf5="output/train2017.h5")
    val_dataset = CaptionDataSet(
        image_root="/data3/zyx/yks/coco2017/val2017",
        annotation_path="/data3/zyx/yks/coco2017/annotations/captions_val2017.json",
        words2index=dataset.words2index,
        index2words=dataset.index2words,
        transforms=transform_fn,
        feature_hdf5="output/val2017.h5")
    dataloader = DataLoader(dataset=dataset,
                            batch_size=batch_size,
                            shuffle=True,
                            num_workers=1,
                            pin_memory=True,
                            last_batch="discard")
    val_loader = DataLoader(dataset=val_dataset,
                            batch_size=batch_size,
                            shuffle=True,
                            num_workers=1,
                            pin_memory=True)
    num_words = dataset.words_count

    # set up logger
    save_prefix = "output/res50_"
    logging.basicConfig()
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    log_file_path = save_prefix + '_train.log'
    log_dir = os.path.dirname(log_file_path)
    if log_dir and not os.path.exists(log_dir):
        os.makedirs(log_dir)
    fh = logging.FileHandler(log_file_path)
    logger.addHandler(fh)

    net = EncoderDecoder(num_words=num_words,
                         test_max_len=val_dataset.max_len).cuda()
    for name, p in net.named_parameters():
        if "bias" in name:
            p.data.zero_()
        else:
            p.data.normal_(0, 0.01)
        print(name)
    net = torch.nn.DataParallel(net)
    if resume is not None:
        # note: this branch is a leftover from the MXNet version of the
        # script; resuming here would need torch-style state_dict loading
        net.collect_params().load(resume, allow_missing=True, ignore_extra=True)
        logger.info("Resumed from checkpoint {}.".format(resume))
    trainer = torch.optim.Adam(params=filter(lambda p: p.requires_grad,
                                             net.parameters()),
                               lr=4e-4)
    criterion = Criterion()
    accu_top3_metric = TopKAccuracy(top_k=3)
    accu_top1_metric = Accuracy(name="batch_accu")
    ctc_loss_metric = Loss(name="ctc_loss")
    alpha_metric = Loss(name="alpha_loss")
    batch_bleu = BleuMetric(name="batch_bleu",
                            pred_index2words=dataset.index2words,
                            label_index2words=dataset.index2words)
    epoch_bleu = BleuMetric(name="epoch_bleu",
                            pred_index2words=dataset.index2words,
                            label_index2words=dataset.index2words)
    btic = time.time()
    logger.info(batch_size)
    logger.info(num_words)
    logger.info(len(dataset.words2index))
    logger.info(len(dataset.index2words))
    logger.info(dataset.words2index["<PAD>"])
    logger.info(val_dataset.words2index["<PAD>"])
    logger.info(len(val_dataset.words2index))

    for nepoch in range(start_epoch, epochs):
        if nepoch > 15:
            # torch optimizers have no set_learning_rate(); update the groups
            for group in trainer.param_groups:
                group["lr"] = 4e-5
        logger.info("Current lr: {}".format(trainer.param_groups[0]["lr"]))
        accu_top1_metric.reset()
        accu_top3_metric.reset()
        ctc_loss_metric.reset()
        alpha_metric.reset()
        epoch_bleu.reset()
        batch_bleu.reset()
        for nbatch, batch in enumerate(tqdm.tqdm(dataloader)):
            # batches come from an MXNet DataLoader; convert to torch tensors
            batch = [torch.from_numpy(x.asnumpy()).cuda() for x in batch]
            data, label, label_len = batch
            label = label.long()
            label_len = label_len.long()
            max_len = label_len.max().data.cpu().numpy()
            net.train()
            outputs = net(data, label, max_len)
            predictions, alphas = outputs
            ctc_loss = criterion(predictions, label, label_len)
            # doubly stochastic attention regularization
            loss2 = 1.0 * ((1. - alphas.sum(dim=1)) ** 2).mean()
            # zero stale gradients before backpropagating this batch
            trainer.zero_grad()
            ((ctc_loss + loss2) / batch_size).backward()
            for group in trainer.param_groups:
                for param in group['params']:
                    if param.grad is not None:
                        param.grad.data.clamp_(-5, 5)
            trainer.step()
            if nbatch % 10 == 0:
                for n, l in enumerate(label_len):
                    l = int(l.data.cpu().numpy())
                    la = label[n, 1:l].data.cpu().numpy()
                    pred = predictions[n, :(l - 1)].data.cpu().numpy()
                    accu_top3_metric.update(mx.nd.array(la), mx.nd.array(pred))
                    accu_top1_metric.update(mx.nd.array(la), mx.nd.array(pred))
                    epoch_bleu.update(la, predictions[n, :].data.cpu().numpy())
                    batch_bleu.update(la, predictions[n, :].data.cpu().numpy())
                ctc_loss_metric.update(
                    None,
                    preds=mx.nd.array([ctc_loss.data.cpu().numpy()]) / batch_size)
                alpha_metric.update(
                    None, preds=mx.nd.array([loss2.data.cpu().numpy()]))
            if nbatch % log_interval == 0 and nbatch > 0:
                msg = ','.join([
                    '{}={:.3f}'.format(*metric.get()) for metric in [
                        epoch_bleu, batch_bleu, accu_top1_metric,
                        accu_top3_metric, ctc_loss_metric, alpha_metric
                    ]
                ])
                logger.info(
                    '[Epoch {}][Batch {}], Speed: {:.3f} samples/sec, {}'.format(
                        nepoch, nbatch,
                        log_interval * batch_size / (time.time() - btic), msg))
                btic = time.time()
                batch_bleu.reset()
                accu_top1_metric.reset()
                accu_top3_metric.reset()
                ctc_loss_metric.reset()
                alpha_metric.reset()
        net.eval()
        bleu, acc_top1 = validate(net,
                                  gpu_id=gpu_id,
                                  val_loader=val_loader,
                                  train_index2words=dataset.index2words,
                                  val_index2words=val_dataset.index2words)
        save_path = save_prefix + "_weights-%d-bleu-%.4f-%.4f.params" % (
            nepoch, bleu, acc_top1)
        torch.save(net.module.state_dict(), save_path)
        torch.save(trainer.state_dict(), save_path + ".states")
        logger.info("Saved checkpoint to {}.".format(save_path))

# train
start = time.perf_counter()
for epoch in range(1, EPOCHS + 1):
    # Reset the train data iterator.
    train_data.reset()
    for i, batch in enumerate(train_data):
        if i == 0:
            tick_0 = time.time()
        data = batch.data[0].as_in_context(ctx)
        label = batch.label[0].as_in_context(ctx)
        with ag.record():
            output = model(data.astype('float32', copy=False))
            loss = loss_fn(output, label)
        loss.backward()
        trainer.step(BATCH_SIZE)
        metric.update([label], [output])
    str1 = 'Epoch [{}], Accuracy {:.4f}'.format(epoch, metric.get()[1])
    str2 = '~Samples/Sec {:.4f}'.format(BATCH_SIZE * (i + 1) /
                                        (time.time() - tick_0))
    print('%s %s' % (str1, str2))
    # Reset evaluation result to initial state.
    metric.reset()

if 0 == hvd.rank():
    elapsed = time.perf_counter() - start
    print('elapsed: {:0.3f}'.format(elapsed))

# use Accuracy as the evaluation metric
metric = Accuracy()
for batch in test_data:
    data = batch.data[0].as_in_context(ctx)
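
# The fragment above assumes Horovod was initialized earlier in the script.
# A minimal hypothetical setup is sketched below; `model` and the optimizer
# settings are placeholders, not the original script's values.
import horovod.mxnet as hvd
import mxnet as mx

hvd.init()
ctx = mx.gpu(hvd.local_rank()) if mx.context.num_gpus() else mx.cpu()
# broadcast the initial parameters from rank 0 so all workers start equal,
# then let Horovod average gradients across workers at each step
hvd.broadcast_parameters(model.collect_params(), root_rank=0)
trainer = hvd.DistributedTrainer(model.collect_params(), 'sgd',
                                 {'learning_rate': 0.01 * hvd.size()})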