# global variables
if opt.save_frequency is None:
    opt.save_frequency = get_default_save_frequency(opt.dataset)
logger.info('Starting new image-classification task:, %s', opt)
mx.random.seed(opt.seed)
batch_size, dataset, classes = opt.batch_size, opt.dataset, get_num_classes(
    opt.dataset)
# One context per requested GPU id; fall back to CPU when none given.
context = [mx.gpu(int(i)) for i in opt.gpus.split(',')
           ] if opt.gpus.strip() else [mx.cpu()]
if opt.dry_run:
    context = [mx.cpu()]
num_gpus = len(context)
# Global batch size scales with the number of devices.
batch_size *= max(1, num_gpus)
opt.batch_size = batch_size
metric = CompositeEvalMetric([Accuracy(), TopKAccuracy(5)])
net, arg_params, aux_params = get_model(opt, context)
print(net)
if opt.profile:
    # FIX: `hotshot` is Python-2-only (removed in Python 3). cProfile/pstats
    # provide the same runcall / strip_dirs / sort_stats workflow.
    import cProfile
    import pstats
    prof_path = 'image-classifier-%s-%s.prof' % (opt.model, opt.mode)
    prof = cProfile.Profile()
    prof.runcall(main)
    prof.dump_stats(prof_path)
    stats = pstats.Stats(prof_path)
    stats.strip_dirs()
    stats.sort_stats('cumtime', 'calls')
def get_metrics():
    """Build the metric keyword argument, handling the MXNet 1.6.0 rename.

    The `metrics` argument was split into `train_metrics` and `val_metrics`
    in MXNet 1.6.0 (apache/incubator-mxnet#17048), so the keyword name
    depends on the installed version.
    """
    if is_mxnet_older_than_1_6_0():
        key = "metrics"
    else:
        key = "train_metrics"
    return {key: Accuracy()}
def main(train_list, val_list, model, exp, saved_model, batch_size,
         optimizer, nb_epochs, augment, max_lr, min_lr, loss_function,
         train_all, nb_frames, eager, params=None, **kwargs):
    """Train `model` on the images listed in `train_list`, validating on
    `val_list`, saving checkpoints and history under a timestamped
    experiments/ folder.

    Notes:
    - `nb_frames` and `eager` are currently unused in this body.
    - `kwargs` is only echoed so stray CLI arguments are visible.
    - Data and model are pinned to mx.gpu(0) below — TODO: confirm/parametrize.
    """
    print("Unused arguments:", kwargs)
    setname = train_list.split(os.sep)[0]
    # Timestamp to name experiment folder
    xptime = strftime("%Y-%m-%d_%Hh%Mm%Ss", gmtime())
    xp_folder = "experiments/%s-%s-%s_%s" % (setname, model, exp, xptime)

    # Make folder
    mkdir_p(xp_folder)
    mkdir_p(os.path.join(xp_folder, 'checkpoints'))
    mkdir_p(os.path.join(xp_folder, 'tb'))
    print("\nSaving experiment data to:", xp_folder)

    # Save command (as well as possible)
    with open(os.path.join(xp_folder, 'command.sh'), "w") as f:
        command = " ".join(sys.argv[:]) + "\n"
        f.write(command)

    # Save employed parameters for future reference
    if params is not None:
        write_params(os.path.join(xp_folder, 'params.json'), params)

    #############
    # Callbacks #
    #############
    # Helper: Save the model.
    ckpt_fmt = os.path.join(
        xp_folder, 'checkpoints',
        model + '-' + exp + '.{epoch:03d}-loss{val_loss:.3f}-acc{val_acc:.3f}.hdf5')
    checkpointer = ModelCheckpoint(filepath=ckpt_fmt,
                                   verbose=1,
                                   save_best_only=True,
                                   monitor='val_acc')
    # Helper: TensorBoard
    tb = HistoryKeeper(logdir=os.path.join(xp_folder),
                       keys=['val_acc', 'val_loss', 'train_time', 'val_time'])
    # Helper: Stop when we stop learning.
    # early_stopper = EarlyStopper(patience=15)
    # Helper: Terminate when finding a NaN loss
    nan_term = TerminateOnNaN()
    callbacks = [tb, checkpointer, nan_term]

    #############
    #  Loading  #
    #############
    if augment:
        augmenter = default_augmenter(strip_size=4)
    else:
        augment = False
        augmenter = None

    # FIX: the original transform unconditionally called `augmenter(...)`,
    # which raises TypeError on every sample when augmentation is disabled
    # (augmenter is None). Only wrap with the augmenter when it exists.
    if augmenter is not None:
        transform = lambda data, label: (augmenter(preprocess(data)), label)
    else:
        transform = lambda data, label: (preprocess(data), label)

    # Dataset classes (validation set is intentionally left untransformed,
    # as in the original code — TODO confirm this is desired).
    train_data = ImageFolderDataset(train_list, transform=transform)
    val_data = ImageFolderDataset(val_list)
    img_shape = train_data[0][0].shape

    # Train loader
    train_loader = DataLoader(train_data,
                              batch_size=batch_size,
                              shuffle=True,
                              num_workers=10)
    nb_samples = len(train_data)  # loader should provide the number of samples
    # Validation loader
    val_loader = DataLoader(val_data,
                            batch_size=batch_size,
                            shuffle=False,
                            num_workers=10)
    nb_validation = len(val_data)  # loader should provide the number of samples

    # Compute number of steps
    steps_per_epoch = math.ceil(nb_samples / batch_size)
    validation_steps = math.ceil(nb_validation / batch_size)

    # The model
    net = ResearchModels(8, model, saved_model,
                         input_shape=img_shape,
                         train_all=train_all).model

    # A little more verbosity
    print("************************************")
    if train_all:
        print("Train all layers.")
    print("Max lr:", max_lr, " Min lr:", min_lr)
    print("Batch size:", batch_size)
    print(nb_samples, "training samples,", steps_per_epoch, "steps per epoch")
    print(nb_validation, "validation samples,", validation_steps, "validation steps")
    print("Optimizer:", optimizer)
    if augment:
        print("Using data augmentation")
    else:
        print("WARNING: Not using data augmentation")
    print("************************************")

    ############################
    #  Loss and Optimization   #
    ############################
    trainer = gluon.Trainer(net.collect_params(), optimizer,
                            {'learning_rate': max_lr})
    if loss_function == 'categorical_crossentropy':
        loss_fn = gluon.loss.SoftmaxCrossEntropyLoss()
    else:
        # FIX: the original silently fell through with `loss_fn` undefined,
        # producing a confusing NameError below. Fail fast instead.
        raise ValueError("Unsupported loss function: %s" % loss_function)
    loss_fn.hybridize()

    ############
    # Training #
    ############
    progress_desc = "Epoch %03d - acc %.3f - loss %.3f "
    acc = Accuracy()
    loss = Loss()
    start_time = time()
    for epoch in range(1, nb_epochs + 1):
        nb_batches = 0
        tic = time()
        acc.reset()
        loss.reset()
        train_time = 0
        t = tqdm(train_loader, unit='batch')
        for data, label in t:
            size = data.shape[0]
            # NOTE(review): hard-coded to GPU 0.
            data = data.copyto(mx.gpu(0))
            label = label.copyto(mx.gpu(0))
            start = time()
            with autograd.record():
                output = net(data)
                l = loss_fn(output, label)
            l.backward()
            end = time()
            train_time += end - start
            # update parameters (normalized by the actual batch size)
            trainer.step(size)
            acc.update(preds=output, labels=label)
            # mx.metric.Loss.update's first parameter is literally named `_`
            loss.update(preds=l, _=None)
            nb_batches += 1
            t.set_description(progress_desc %
                              (epoch, acc.get()[1], loss.get()[1]))
        train_loss = loss.get()[1]
        train_acc = acc.get()[1]
        acc.reset()
        # FIX: the original never reset the loss metric here, so `val_loss`
        # was averaged together with all training-batch losses.
        loss.reset()
        val_time = 0

        # calculate validation accuracy
        tval = tqdm(val_loader, leave=False,
                    desc='Running validation', unit='batch')
        for data, label in tval:
            data = data.copyto(mx.gpu(0))
            label = label.copyto(mx.gpu(0))
            # Compute outputs
            start = time()
            output = net(data)
            l = loss_fn(output, label)
            end = time()
            val_time += end - start
            # Compute metrics
            loss.update(preds=l, _=None)
            acc.update(preds=output, labels=label)
        val_loss = loss.get()[1]
        val_acc = acc.get()[1]

        print("Epoch %d: loss %.3f, acc %.3f, val_loss %.3f, val_acc %.3f, in %.1f sec"
              % (epoch, train_loss, train_acc, val_loss, val_acc, time() - tic))
        print("--------------------------------------------------------------------------------")

        # Callbacks may request an early stop (e.g. NaN loss).
        stop = False
        train_info = {
            'epoch': epoch,
            'loss': train_loss,
            'acc': train_acc,
            'val_loss': val_loss,
            'val_acc': val_acc,
            'train_time': train_time,
            'val_time': val_time
        }
        for cb in callbacks:
            if cb(net, train_info):
                stop = True
        if stop:
            break
        print()

    hours, rem = divmod(time() - start_time, 3600)
    days, hours = divmod(hours, 24)
    minutes, seconds = divmod(rem, 60)
    print("%d training epochs in %dd, %dh%dm%.2fs." %
          (epoch, int(days), int(hours), int(minutes), seconds))
def __init__(self):
    # WNLI: sentence-pair binary classification, scored with accuracy.
    super(WNLITask, self).__init__(['0', '1'], Accuracy(), True)
def __init__(self):
    # MNLI: sentence-pair task with three entailment labels, scored with accuracy.
    labels = ['neutral', 'entailment', 'contradiction']
    super(MNLITask, self).__init__(labels, Accuracy(), True)
def get_metric(cls):
    """Get metrics Accuracy and F1"""
    composite = CompositeEvalMetric()
    composite.add(Accuracy())
    composite.add(F1(average='micro'))
    return composite
def _suggest_metric_for_loss(loss):
    """Return a default eval metric matching *loss*, or None when no match."""
    return Accuracy() if isinstance(loss, SoftmaxCrossEntropyLoss) else None
model_name = opt.model
# Number of output classes per supported dataset.
dataset_classes = {
    'mnist': 10,
    'cifar10': 10,
    'imagenet': 1000,
    'dummy': 1000,
    'sampleimgnet': 200
}
batch_size, dataset, classes = opt.batch_size, opt.dataset, dataset_classes[
    opt.dataset]
# One context per requested GPU id; fall back to CPU when none given.
context = [mx.gpu(int(i))
           for i in opt.gpus.split(',')] if opt.gpus.strip() else [mx.cpu()]
num_gpus = len(context)
# Global batch size scales with the number of devices.
batch_size *= max(1, num_gpus)
lr_steps = [int(x) for x in opt.lr_steps.split(',') if x.strip()]
metric = CompositeEvalMetric([Accuracy(), TopKAccuracy(5), CrossEntropy()])


def get_model(model, ctx, opt):
    """Model initialization."""
    # Architecture-specific keyword arguments for the model zoo factory.
    kwargs = {'ctx': ctx, 'pretrained': opt.use_pretrained, 'classes': classes}
    if model.startswith('resnet'):
        kwargs['thumbnail'] = opt.use_thumbnail
    elif model.startswith('vgg'):
        kwargs['batch_norm'] = opt.batch_norm
    net = models.get_model(model, **kwargs)
    if opt.resume:
        # Resume from an explicit checkpoint file.
        net.load_params(opt.resume)
    elif not opt.use_pretrained:
        # NOTE(review): this definition is truncated at the chunk boundary;
        # the body of the branch below continues beyond the visible source.
        if model in ['alexnet']:
# Symbolic softmax head on top of `fc2` (defined earlier, outside this chunk).
out = mx.sym.SoftmaxOutput(data=fc2, name='softmax')
model = mx.mod.Module(out, context=ctx)
model.bind(data_shapes=train_data.provide_data,
           label_shapes=train_data.provide_label)
# initialize parameters
model.init_params(initializer=mx.init.Xavier(magnitude=2.))
opt_params = {
    'learning_rate': 0.001,
    'beta1': 0.9,
    'beta2': 0.999,
    'epsilon': 1e-08
}
opt = mx.optimizer.create('adam', **opt_params)
model.init_optimizer(kvstore='device', optimizer=opt)
metric = Accuracy()
# train
start = time.perf_counter()
for epoch in range(1, EPOCHS + 1):
    train_data.reset()
    for i, batch in enumerate(train_data):
        if i == 0:
            # Start of the throughput window for this epoch.
            tick_0 = time.time()
        model.forward(batch, is_train=True)
        model.backward()
        model.update()
        model.update_metric(metric, batch.label)
    # NOTE(review): `metric` is never reset between epochs, so the printed
    # accuracy is cumulative over all epochs so far — confirm this is intended.
    str1 = 'Epoch [{}], Accuracy {:.4f}'.format(epoch, metric.get()[1])
    str2 = '~Samples/Sec {:.4f}'.format(BATCH_SIZE * (i + 1) /
                                        (time.time() - tick_0))
    print('%s %s' % (str1, str2))
def get_metric():
    """Get metrics Accuracy"""
    metric = Accuracy()
    return metric
def get_metric():
    """Get metrics Accuracy and F1"""
    composite = CompositeEvalMetric()
    for m in (Accuracy(), F1()):
        composite.add(m)
    return composite
def main():
    """Train the EncoderDecoder captioning model on COCO 2017.

    Mixes PyTorch (model, optimizer) with MXNet metrics and data transforms.
    Paths, GPU ids and hyper-parameters are hard-coded below.
    """
    epoches = 32
    gpu_id = 7
    ctx_list = [mx.gpu(x) for x in [7, 8]]
    log_interval = 100
    batch_size = 32
    start_epoch = 0
    # trainer_resume = resume + ".states" if resume is not None else None
    trainer_resume = None
    resume = None
    from mxnet.gluon.data.vision import transforms
    transform_fn = transforms.Compose([
        LeftTopPad(dest_shape=(256, 256)),
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.485, 0.456, 0.406),
                             std=(0.229, 0.224, 0.225))
    ])
    dataset = CaptionDataSet(
        image_root="/data3/zyx/yks/coco2017/train2017",
        annotation_path=
        "/data3/zyx/yks/coco2017/annotations/captions_train2017.json",
        transforms=transform_fn,
        feature_hdf5="output/train2017.h5")
    # Validation set shares the training vocabulary.
    val_dataset = CaptionDataSet(
        image_root="/data3/zyx/yks/coco2017/val2017",
        annotation_path=
        "/data3/zyx/yks/coco2017/annotations/captions_val2017.json",
        words2index=dataset.words2index,
        index2words=dataset.index2words,
        transforms=transform_fn,
        feature_hdf5="output/val2017.h5")
    dataloader = DataLoader(dataset=dataset,
                            batch_size=batch_size,
                            shuffle=True,
                            num_workers=1,
                            pin_memory=True,
                            last_batch="discard")
    val_loader = DataLoader(dataset=val_dataset,
                            batch_size=batch_size,
                            shuffle=True,
                            num_workers=1,
                            pin_memory=True)
    num_words = dataset.words_count

    # set up logger: console plus a file next to the checkpoints
    save_prefix = "output/res50_"
    logging.basicConfig()
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    log_file_path = save_prefix + '_train.log'
    log_dir = os.path.dirname(log_file_path)
    if log_dir and not os.path.exists(log_dir):
        os.makedirs(log_dir)
    fh = logging.FileHandler(log_file_path)
    logger.addHandler(fh)

    net = EncoderDecoder(num_words=num_words,
                         test_max_len=val_dataset.max_len).cuda()
    # Manual init: zero biases, N(0, 0.01) weights.
    for name, p in net.named_parameters():
        if "bias" in name:
            p.data.zero_()
        else:
            p.data.normal_(0, 0.01)
        print(name)
    net = torch.nn.DataParallel(net)
    if resume is not None:
        # NOTE(review): collect_params() is Gluon API — this path would fail
        # on a torch module; it is dead code while `resume` is fixed to None.
        net.collect_params().load(resume,
                                  allow_missing=True,
                                  ignore_extra=True)
        logger.info("Resumed form checkpoint {}.".format(resume))
    trainer = torch.optim.Adam(params=filter(lambda p: p.requires_grad,
                                             net.parameters()),
                               lr=4e-4)
    criterion = Criterion()
    accu_top3_metric = TopKAccuracy(top_k=3)
    accu_top1_metric = Accuracy(name="batch_accu")
    ctc_loss_metric = Loss(name="ctc_loss")
    alpha_metric = Loss(name="alpha_loss")
    batch_bleu = BleuMetric(name="batch_bleu",
                            pred_index2words=dataset.index2words,
                            label_index2words=dataset.index2words)
    epoch_bleu = BleuMetric(name="epoch_bleu",
                            pred_index2words=dataset.index2words,
                            label_index2words=dataset.index2words)
    btic = time.time()
    logger.info(batch_size)
    logger.info(num_words)
    logger.info(len(dataset.words2index))
    logger.info(len(dataset.index2words))
    logger.info(dataset.words2index["<PAD>"])
    logger.info(val_dataset.words2index["<PAD>"])
    logger.info(len(val_dataset.words2index))
    for nepoch in range(start_epoch, epoches):
        if nepoch > 15:
            # FIX: torch.optim.Adam has no set_learning_rate() (that is Gluon
            # Trainer API) — the original crashed here at epoch 16. Update the
            # lr on each param group instead.
            for group in trainer.param_groups:
                group["lr"] = 4e-5
        logger.info("Current lr: {}".format(trainer.param_groups[0]["lr"]))
        accu_top1_metric.reset()
        accu_top3_metric.reset()
        ctc_loss_metric.reset()
        alpha_metric.reset()
        epoch_bleu.reset()
        batch_bleu.reset()
        for nbatch, batch in enumerate(tqdm.tqdm(dataloader)):
            # Batches come out of the MXNet loader; move them to torch/CUDA.
            batch = [
                Variable(torch.from_numpy(x.asnumpy()).cuda()) for x in batch
            ]
            data, label, label_len = batch
            label = label.long()
            label_len = label_len.long()
            max_len = label_len.max().data.cpu().numpy()
            net.train()
            outputs = net(data, label, max_len)
            predictions, alphas = outputs
            ctc_loss = criterion(predictions, label, label_len)
            # Doubly-stochastic attention regularizer on the alpha maps.
            loss2 = 1.0 * ((1. - alphas.sum(dim=1))**2).mean()
            # FIX: the original never zeroed gradients, so they accumulated
            # across batches for the whole run.
            trainer.zero_grad()
            ((ctc_loss + loss2) / batch_size).backward()
            # Clip element-wise gradients to [-5, 5].
            for group in trainer.param_groups:
                for param in group['params']:
                    if param.grad is not None:
                        param.grad.data.clamp_(-5, 5)
            trainer.step()
            # Metrics are sampled every 10th batch to keep overhead low.
            if nbatch % 10 == 0:
                for n, l in enumerate(label_len):
                    l = int(l.data.cpu().numpy())
                    la = label[n, 1:l].data.cpu().numpy()
                    pred = predictions[n, :(l - 1)].data.cpu().numpy()
                    accu_top3_metric.update(mx.nd.array(la), mx.nd.array(pred))
                    accu_top1_metric.update(mx.nd.array(la), mx.nd.array(pred))
                    epoch_bleu.update(la, predictions[n, :].data.cpu().numpy())
                    batch_bleu.update(la, predictions[n, :].data.cpu().numpy())
                ctc_loss_metric.update(
                    None,
                    preds=mx.nd.array([ctc_loss.data.cpu().numpy()]) /
                    batch_size)
                alpha_metric.update(None,
                                    preds=mx.nd.array(
                                        [loss2.data.cpu().numpy()]))
            if nbatch % log_interval == 0 and nbatch > 0:
                msg = ','.join([
                    '{}={:.3f}'.format(*metric.get()) for metric in [
                        epoch_bleu, batch_bleu, accu_top1_metric,
                        accu_top3_metric, ctc_loss_metric, alpha_metric
                    ]
                ])
                logger.info(
                    '[Epoch {}][Batch {}], Speed: {:.3f} samples/sec, {}'.
                    format(nepoch, nbatch,
                           log_interval * batch_size / (time.time() - btic),
                           msg))
                btic = time.time()
                # Per-window metrics restart after each log line.
                batch_bleu.reset()
                accu_top1_metric.reset()
                accu_top3_metric.reset()
                ctc_loss_metric.reset()
                alpha_metric.reset()
        net.eval()
        bleu, acc_top1 = validate(net,
                                  gpu_id=gpu_id,
                                  val_loader=val_loader,
                                  train_index2words=dataset.index2words,
                                  val_index2words=val_dataset.index2words)
        save_path = save_prefix + "_weights-%d-bleu-%.4f-%.4f.params" % (
            nepoch, bleu, acc_top1)
        torch.save(net.module.state_dict(), save_path)
        torch.save(trainer.state_dict(), save_path + ".states")
        logger.info("Saved checkpoint to {}.".format(save_path))
def fit(self, itr, ctx, epochs, batch_size, callbacks=None):
    """Train self._net on the iterator `itr` across the contexts in `ctx`.

    Re-initializes the network, then runs `epochs` passes over `itr` with
    Adam + softmax cross-entropy, tracking accuracy. Callbacks (if given)
    are invoked before each epoch, after each batch and after each epoch.
    Returns the Accuracy metric holding the final epoch's result.
    """
    # ADAM optimizer
    # opt_params={'learning_rate':0.001, 'beta1':0.9, 'beta2':0.999, 'epsilon':1e-08}
    optimizer = mx.optimizer.create('adam')
    # Alternatives: mx.optimizer.create('sgd') / mx.optimizer.create('adadelta')

    # Xavier initialization keeps gradient scale roughly the same in all
    # layers (default MXNet init is uniform [-0.07, 0.07], zero biases).
    self._net.initialize(mx.init.Xavier(magnitude=2.3),
                         ctx=ctx,
                         force_reinit=True)
    params = self._net.collect_params()
    trainer = Trainer(params=params, optimizer=optimizer, kvstore='device')
    loss_fn = SoftmaxCrossEntropyLoss()
    # Accuracy is the evaluation metric.
    metric = Accuracy()

    for epoch in range(epochs):
        if callbacks is not None:
            for cb in callbacks:
                cb.before_epoch(epoch)
        # Fresh metric state and data iterator for each epoch.
        metric.reset()
        itr.reset()
        for batch_idx, batch in enumerate(itr):
            # Slice data and labels along the batch axis, one slice per context.
            data = split_and_load(batch.data[0],
                                  ctx_list=ctx,
                                  batch_axis=0,
                                  even_split=False)
            label = split_and_load(batch.label[0],
                                   ctx_list=ctx,
                                   batch_axis=0,
                                   even_split=False)
            outputs = []
            losses = []
            # Record the forward pass for autograd.
            with ag.record():
                for x, y in zip(data, label):
                    z = self._net(x)
                    outputs.append(z)
                    losses.append(loss_fn(z, y))
            # Backpropagate each slice's loss.
            for l in losses:
                l.backward()
            # One optimizer step, normalizing gradients by 1/batch_size.
            trainer.step(batch_size)
            metric.update(label, outputs)
            if callbacks is not None:
                for cb in callbacks:
                    cb.after_batch(epoch, batch_idx, batch_size, metric)
        if callbacks is not None:
            for cb in callbacks:
                cb.after_epoch(epoch, batch_idx, batch_size, metric)
    return metric
# (fragment: the opening of the optimizer-params dict — presumably
#  `opt_params = {'learning_rate': ..., 'beta1': ...,` — lies before this
#  chunk; only the tail is visible here)
'beta2': 0.999,
'epsilon': 1e-08
}
opt = mx.optimizer.create('adam', **opt_params)
# initialize parameters
model.initialize(force_reinit=True, ctx=ctx)
# fetch and broadcast parameters from rank 0 to all Horovod workers
params = model.collect_params()
if params is not None:
    hvd.broadcast_parameters(params, root_rank=0)
# create DistributedTrainer, a subclass of gluon.Trainer
trainer = hvd.DistributedTrainer(params, opt)
# loss function
loss_fn = SoftmaxCrossEntropyLoss()
# use accuracy as the evaluation metric
metric = Accuracy()
# train
start = time.perf_counter()
for epoch in range(1, EPOCHS + 1):
    # Reset the train data iterator.
    train_data.reset()
    for i, batch in enumerate(train_data):
        if i == 0:
            # Start of the per-epoch throughput window; the code that
            # reads it is beyond this chunk.
            tick_0 = time.time()
        data = batch.data[0].as_in_context(ctx)
        label = batch.label[0].as_in_context(ctx)
        with ag.record():
            output = model(data.astype('float32', copy=False))
            loss = loss_fn(output, label)
        loss.backward()
        trainer.step(BATCH_SIZE)
# (training loop continues past this chunk: metric update / logging not visible)
def __init__(self):
    # QNLI: question/sentence pairs with a binary entailment label,
    # scored with accuracy.
    labels = ['not_entailment', 'entailment']
    super(QNLITask, self).__init__(labels, Accuracy(), True)
# Mini-batch size shared by the training and validation loaders.
batch_size = 256
train_data = gluon.data.DataLoader(mnist_train,
                                   batch_size=batch_size,
                                   shuffle=True,
                                   num_workers=4)
mnist_valid = gluon.data.vision.FashionMNIST(train=False)
valid_data = gluon.data.DataLoader(mnist_valid.transform_first(transformer),
                                   batch_size=batch_size,
                                   num_workers=4)

# Only hybrid based networks can be exported
net = HybridSequential()
# Small LeNet-style CNN: two conv/pool stages, then three dense layers
# ending in 10 logits (one per FashionMNIST class).
net.add(Conv2D(channels=6, kernel_size=5, activation="relu"),
        MaxPool2D(pool_size=2, strides=2),
        Conv2D(channels=16, kernel_size=3, activation="relu"),
        MaxPool2D(pool_size=2, strides=2), Flatten(),
        Dense(120, activation="relu"), Dense(84, activation="relu"),
        Dense(10))
net.initialize(init=init.Xavier())
# Only after hybridization a model can be exported with architecture included
net.hybridize()
trainer = Trainer(net.collect_params(), "sgd", {"learning_rate": 0.1})
# Estimator wires loss, metric and trainer into a ready-made training loop.
est = estimator.Estimator(net=net,
                          loss=SoftmaxCrossEntropyLoss(),
                          metrics=Accuracy(),
                          trainer=trainer)
est.fit(train_data=train_data, epochs=2, val_data=valid_data)
def __init__(self):
    # SST: single sentences (not pairs) with binary labels, scored
    # with accuracy.
    super(SSTTask, self).__init__(['0', '1'], Accuracy(), False)
def load_net(param_file="net.params", ctx=cpu(0)):
    """Build a SimpleNet and load trained weights from *param_file* onto *ctx*."""
    net = SimpleNet()
    net.load_parameters(param_file, ctx=ctx)
    return net


def get_val_data(transformer, batch_size=128):
    """Return a DataLoader over the FashionMNIST test split, preprocessed by
    *transformer* (applied to the image element only)."""
    mnist_valid = gluon.data.vision.FashionMNIST(train=False)
    valid_data = gluon.data.DataLoader(
        mnist_valid.transform_first(transformer),
        batch_size=batch_size,
        num_workers=4)
    return valid_data


if __name__ == "__main__":
    # Prefer a GPU when one is available.
    ctx = gpu(0) if context.num_gpus() else cpu(0)
    net = load_net("net.params", ctx=ctx)
    valid_data = get_val_data(transformer)
    val_acc = Accuracy()
    for data, label in valid_data:
        data = data.as_in_context(ctx)
        label = label.as_in_context(ctx)
        # predict_mode disables autograd recording for pure inference.
        with autograd.predict_mode():
            out = net(data)
        val_acc.update(label, out)
    # FIX: corrected typo "Accuray" in the printed label.
    print("Accuracy: ", val_acc.get()[1])