def init_comm(backend):
    """Initialize the distributed communication backend.

    Parameters
    ----------
    backend : str
        'horovod', 'byteps', or an MXNet kvstore type string
        (e.g. 'dist_sync', 'local').

    Returns
    -------
    tuple
        (store, num_workers, rank, local_rank, is_master_node, ctxs).
        `store` is None for horovod/byteps; `ctxs` is the list of device
        contexts this process should train on.
    """
    # backend specific implementation
    if backend == 'horovod':
        try:
            import horovod.mxnet as hvd
        except ImportError:
            # Fail loudly: log at error level and exit non-zero so job
            # schedulers can detect the failure (previously logged at info
            # and exited with status 0).
            logging.error('horovod must be installed.')
            exit(1)
        hvd.init()
        store = None
        num_workers = hvd.size()
        rank = hvd.rank()
        local_rank = hvd.local_rank()
        is_master_node = rank == local_rank
        ctxs = [mx.gpu(local_rank)]
    elif backend == 'byteps':
        try:
            import byteps.mxnet as bps
        except ImportError:
            logging.error('BytePS must be installed.')
            exit(1)
        bps.init()
        store = None
        num_workers = bps.size()
        rank = bps.rank()
        local_rank = bps.local_rank()
        is_master_node = rank == local_rank
        ctxs = [mx.gpu(local_rank)]
    else:  # kvstore
        store = mx.kv.create(backend)
        num_workers = store.num_workers
        rank = store.rank
        local_rank = 0
        is_master_node = rank == local_rank
        # NOTE(review): reads a module-level `args` (parsed CLI namespace,
        # presumably) — confirm it is populated before this is called.
        ctxs = [mx.cpu()] if args.gpus is None or args.gpus == '' else \
            [mx.gpu(int(x)) for x in args.gpus.split(',')]
    return store, num_workers, rank, local_rank, is_master_node, ctxs
def main():
    """CIFAR-100 distributed training entry point (BytePS).

    Parses CLI options, sets up per-worker logging, builds the model and
    LR schedule, then runs the nested `train` loop.
    """
    opt = parse_args()

    bps.init()

    # One log file per run, tagged with world size and the GPU model name
    # reported by nvidia-smi.
    gpu_name = subprocess.check_output(
        ['nvidia-smi', '--query-gpu=gpu_name', '--format=csv'])
    gpu_name = gpu_name.decode('utf8').split('\n')[-2]
    gpu_name = '-'.join(gpu_name.split())
    filename = "cifar100-%d-%s-%s.log" % (bps.size(), gpu_name,
                                          opt.logging_file)
    filehandler = logging.FileHandler(filename)
    streamhandler = logging.StreamHandler()
    logger = logging.getLogger('')
    logger.setLevel(logging.INFO)
    logger.addHandler(filehandler)
    logger.addHandler(streamhandler)

    logger.info(opt)

    batch_size = opt.batch_size
    classes = 100

    num_gpus = opt.num_gpus
    # batch_size *= max(1, num_gpus)
    context = mx.gpu(bps.local_rank()) if num_gpus > 0 else mx.cpu(
        bps.local_rank())
    num_workers = opt.num_workers
    nworker = bps.size()
    rank = bps.rank()

    lr_decay = opt.lr_decay
    lr_decay_epoch = [int(i) for i in opt.lr_decay_epoch.split(',')]

    # BUG FIX: this schedule was previously built twice with identical
    # arguments back to back; the duplicate has been removed.
    # 50000 = CIFAR-100 training-set size; each worker sees a shard.
    num_batches = 50000 // (opt.batch_size * nworker)
    lr_scheduler = LRSequential([
        # Linear warmup, then step decay. Base LR is scaled by the number
        # of workers divided by the local (per-host) size.
        LRScheduler('linear',
                    base_lr=opt.warmup_lr,
                    target_lr=opt.lr * nworker / bps.local_size(),
                    nepochs=opt.warmup_epochs,
                    iters_per_epoch=num_batches),
        LRScheduler('step',
                    base_lr=opt.lr * nworker / bps.local_size(),
                    target_lr=0,
                    nepochs=opt.num_epochs - opt.warmup_epochs,
                    iters_per_epoch=num_batches,
                    step_epoch=lr_decay_epoch,
                    step_factor=lr_decay,
                    power=2)
    ])

    model_name = opt.model
    if model_name.startswith('cifar_wideresnet'):
        kwargs = {'classes': classes, 'drop_rate': opt.drop_rate}
    else:
        kwargs = {'classes': classes}
    net = get_model(model_name, **kwargs)
    if opt.resume_from:
        net.load_parameters(opt.resume_from, ctx=context)

    # Gradient compression requires plain SGD; otherwise use Nesterov.
    if opt.compressor:
        optimizer = 'sgd'
    else:
        optimizer = 'nag'

    save_period = opt.save_period
    if opt.save_dir and save_period:
        save_dir = opt.save_dir
        makedirs(save_dir)
    else:
        save_dir = ''
        save_period = 0

    # from https://github.com/weiaicunzai/pytorch-cifar/blob/master/conf/global_settings.py
    CIFAR100_TRAIN_MEAN = [
        0.5070751592371323, 0.48654887331495095, 0.4409178433670343
    ]
    CIFAR100_TRAIN_STD = [
        0.2673342858792401, 0.2564384629170883, 0.27615047132568404
    ]

    transform_train = transforms.Compose([
        gcv_transforms.RandomCrop(32, pad=4),
        transforms.RandomFlipLeftRight(),
        transforms.ToTensor(),
        transforms.Normalize(CIFAR100_TRAIN_MEAN, CIFAR100_TRAIN_STD)
    ])

    transform_test = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(CIFAR100_TRAIN_MEAN, CIFAR100_TRAIN_STD)
    ])

    def test(ctx, val_data):
        """Evaluate `net` on the validation set; returns (name, accuracy)."""
        metric = mx.metric.Accuracy()
        for i, batch in enumerate(val_data):
            data = gluon.utils.split_and_load(batch[0],
                                              ctx_list=ctx,
                                              batch_axis=0)
            label = gluon.utils.split_and_load(batch[1],
                                               ctx_list=ctx,
                                               batch_axis=0)
            outputs = [net(X) for X in data]
            metric.update(label, outputs)
        return metric.get()

    def train(epochs, ctx):
        """Run the full training loop for `epochs` epochs on context(s) `ctx`."""
        if isinstance(ctx, mx.Context):
            ctx = [ctx]
        net.initialize(mx.init.Xavier(), ctx=ctx)
        # Each worker trains on its own shard of the dataset.
        train_data = gluon.data.DataLoader(gluon.data.vision.CIFAR100(
            train=True).shard(nworker, rank).transform_first(transform_train),
                                           batch_size=batch_size,
                                           shuffle=True,
                                           last_batch='discard',
                                           num_workers=num_workers)

        val_data = gluon.data.DataLoader(gluon.data.vision.CIFAR100(
            train=False).shard(nworker, rank).transform_first(transform_test),
                                         batch_size=batch_size,
                                         shuffle=False,
                                         num_workers=num_workers)

        params = net.collect_params()

        compression_params = {
            "compressor": opt.compressor,
            "ef": opt.ef,
            "momentum": opt.compress_momentum,
            "scaling": opt.onebit_scaling,
            "k": opt.k,
            "fp16": opt.fp16_pushpull
        }

        optimizer_params = {
            'lr_scheduler': lr_scheduler,
            'wd': opt.wd,
            'momentum': opt.momentum
        }

        trainer = bps.DistributedTrainer(params,
                                         optimizer,
                                         optimizer_params,
                                         compression_params=compression_params)
        metric = mx.metric.Accuracy()
        train_metric = mx.metric.Accuracy()
        loss_fn = gluon.loss.SoftmaxCrossEntropyLoss()

        iteration = 0
        best_val_score = 0
        # Pre-declare the tensor used to all-reduce accuracy across workers.
        bps.byteps_declare_tensor("acc")
        for epoch in range(epochs):
            tic = time.time()
            train_metric.reset()
            metric.reset()
            train_loss = 0
            num_batch = len(train_data)

            for i, batch in enumerate(train_data):
                data = gluon.utils.split_and_load(batch[0],
                                                  ctx_list=ctx,
                                                  batch_axis=0)
                label = gluon.utils.split_and_load(batch[1],
                                                   ctx_list=ctx,
                                                   batch_axis=0)

                with ag.record():
                    output = [net(X) for X in data]
                    loss = [
                        loss_fn(yhat, y) for yhat, y in zip(output, label)
                    ]
                for l in loss:
                    l.backward()
                trainer.step(batch_size)
                train_loss += sum([l.sum().asscalar() for l in loss])

                train_metric.update(label, output)
                name, train_acc = train_metric.get()
                iteration += 1

            train_loss /= batch_size * num_batch
            name, train_acc = train_metric.get()
            throughput = int(batch_size * nworker * i / (time.time() - tic))

            logger.info(
                '[Epoch %d] speed: %d samples/sec\ttime cost: %f lr=%f' %
                (epoch, throughput, time.time() - tic,
                 trainer.learning_rate))

            name, val_acc = test(ctx, val_data)
            # Average train/val accuracy across all workers via push-pull.
            acc = mx.nd.array([train_acc, val_acc], ctx=ctx[0])
            bps.byteps_push_pull(acc, name="acc", is_average=False)
            acc /= bps.size()
            train_acc, val_acc = acc[0].asscalar(), acc[1].asscalar()
            if bps.rank() == 0:
                logger.info('[Epoch %d] training: %s=%f' %
                            (epoch, name, train_acc))
                logger.info('[Epoch %d] validation: %s=%f' %
                            (epoch, name, val_acc))

            if val_acc > best_val_score:
                best_val_score = val_acc
                net.save_parameters(
                    '%s/%.4f-cifar-%s-%d-best.params' %
                    (save_dir, best_val_score, model_name, epoch))

            if save_period and save_dir and (epoch + 1) % save_period == 0:
                net.save_parameters('%s/cifar100-%s-%d.params' %
                                    (save_dir, model_name, epoch))

        if save_period and save_dir:
            net.save_parameters('%s/cifar100-%s-%d.params' %
                                (save_dir, model_name, epochs - 1))

    if opt.mode == 'hybrid':
        net.hybridize()
    train(opt.num_epochs, context)
# NOTE(review): this chunk starts mid-function — the enclosing definition
# (an evaluation helper iterating validation batches, by the look of the
# `batch`/`metric` locals) is outside this view, so the leading statements'
# indentation is reconstructed; confirm against the original file.
        # Move the batch to this worker's device and score it.
        data = batch[0].as_in_context(context)
        label = batch[1].as_in_context(context)
        # Cast to the requested dtype (e.g. float16) without copying when possible.
        output = model(data.astype(args.dtype, copy=False))
        metric.update([label], [output])
    # (name, value) accumulated over all batches.
    return metric.get()


# Load training and validation data
train_data, val_data, train_size = get_mnist_iterator()

# Initialize BytePS
bps.init()

# BytePS: pin context to local rank
context = mx.cpu(bps.local_rank()) if args.no_cuda else mx.gpu(
    bps.local_rank())
num_workers = bps.size()

# Build model
model = conv_nets()
model.cast(args.dtype)

# Initialize parameters
model.initialize(mx.init.MSRAPrelu(), ctx=context)
# if bps.rank() == 0:
# NOTE(review): summary() runs on every rank despite the commented guard
# above — presumably intentional, but worth confirming.
model.summary(nd.ones((1, 1, 28, 28), ctx=mx.gpu(bps.local_rank())))
model.hybridize()

# BytePS: fetch and broadcast parameters
params = model.collect_params()
def main():
    """ImageNet distributed training entry point (BytePS).

    Parses CLI options, sets up per-worker logging, builds the model,
    LR schedule and data pipeline, then runs the nested `train` loop
    (optionally with knowledge distillation and mixup).
    """
    opt = parse_args()

    bps.init()

    # One log file per run, tagged with world size and the GPU model name.
    gpu_name = subprocess.check_output(
        ['nvidia-smi', '--query-gpu=gpu_name', '--format=csv'])
    gpu_name = gpu_name.decode('utf8').split('\n')[-2]
    gpu_name = '-'.join(gpu_name.split())
    filename = "imagenet-%d-%s-%s.log" % (bps.size(), gpu_name,
                                          opt.logging_file)
    filehandler = logging.FileHandler(filename)
    streamhandler = logging.StreamHandler()
    logger = logging.getLogger('')
    logger.setLevel(logging.INFO)
    logger.addHandler(filehandler)
    logger.addHandler(streamhandler)

    logger.info(opt)

    batch_size = opt.batch_size
    classes = 1000
    num_training_samples = 1281167  # ImageNet-1k training-set size

    num_gpus = opt.num_gpus
    # batch_size *= max(1, num_gpus)
    context = mx.gpu(bps.local_rank()) if num_gpus > 0 else mx.cpu(
        bps.local_rank())
    num_workers = opt.num_workers
    nworker = bps.size()
    rank = bps.rank()

    lr_decay = opt.lr_decay
    lr_decay_period = opt.lr_decay_period
    if opt.lr_decay_period > 0:
        lr_decay_epoch = list(
            range(lr_decay_period, opt.num_epochs, lr_decay_period))
    else:
        lr_decay_epoch = [int(i) for i in opt.lr_decay_epoch.split(',')]
    # The decay schedule is expressed relative to the end of warmup.
    lr_decay_epoch = [e - opt.warmup_epochs for e in lr_decay_epoch]
    num_batches = num_training_samples // (batch_size * nworker)

    # Linear warmup followed by the configured decay mode; base LR is
    # scaled by the number of workers divided by the local (per-host) size.
    lr_scheduler = LRSequential([
        LRScheduler('linear',
                    base_lr=opt.warmup_lr,
                    target_lr=opt.lr * nworker / bps.local_size(),
                    nepochs=opt.warmup_epochs,
                    iters_per_epoch=num_batches),
        LRScheduler(opt.lr_mode,
                    base_lr=opt.lr * nworker / bps.local_size(),
                    target_lr=0,
                    nepochs=opt.num_epochs - opt.warmup_epochs,
                    iters_per_epoch=num_batches,
                    step_epoch=lr_decay_epoch,
                    step_factor=lr_decay,
                    power=2)
    ])

    model_name = opt.model

    kwargs = {
        'ctx': context,
        'pretrained': opt.use_pretrained,
        'classes': classes
    }
    if opt.use_gn:
        from gluoncv.nn import GroupNorm
        kwargs['norm_layer'] = GroupNorm
    if model_name.startswith('vgg'):
        kwargs['batch_norm'] = opt.batch_norm
    elif model_name.startswith('resnext'):
        kwargs['use_se'] = opt.use_se

    if opt.last_gamma:
        kwargs['last_gamma'] = True

    # Gradient compression requires plain SGD; otherwise use Nesterov.
    if opt.compressor:
        optimizer = 'sgd'
    else:
        optimizer = 'nag'
    optimizer_params = {
        'wd': opt.wd,
        'momentum': opt.momentum,
        'lr_scheduler': lr_scheduler
    }
    if opt.dtype != 'float32':
        optimizer_params['multi_precision'] = True

    net = get_model(model_name, **kwargs)
    net.cast(opt.dtype)
    # BUG FIX: was `opt.resume_params is not ''` — identity comparison
    # against a string literal is incorrect (and a SyntaxWarning on
    # CPython >= 3.8); use equality instead.
    if opt.resume_params != '':
        net.load_parameters(opt.resume_params, ctx=context)

    # teacher model for distillation training
    if opt.teacher is not None and opt.hard_weight < 1.0:
        teacher_name = opt.teacher
        teacher = get_model(teacher_name,
                            pretrained=True,
                            classes=classes,
                            ctx=context)
        teacher.cast(opt.dtype)
        distillation = True
    else:
        distillation = False

    # Two functions for reading data from record file or raw images
    def get_data_rec(rec_train, rec_train_idx, rec_val, rec_val_idx,
                     batch_size, num_workers):
        """Build sharded RecordIO train/val iterators and a batch splitter."""
        rec_train = os.path.expanduser(rec_train)
        rec_train_idx = os.path.expanduser(rec_train_idx)
        rec_val = os.path.expanduser(rec_val)
        rec_val_idx = os.path.expanduser(rec_val_idx)
        jitter_param = 0.4
        lighting_param = 0.1
        input_size = opt.input_size
        crop_ratio = opt.crop_ratio if opt.crop_ratio > 0 else 0.875
        resize = int(math.ceil(input_size / crop_ratio))
        mean_rgb = [123.68, 116.779, 103.939]
        std_rgb = [58.393, 57.12, 57.375]

        def batch_fn(batch, ctx):
            data = gluon.utils.split_and_load(batch.data[0],
                                              ctx_list=ctx,
                                              batch_axis=0)
            label = gluon.utils.split_and_load(batch.label[0],
                                               ctx_list=ctx,
                                               batch_axis=0)
            return data, label

        train_data = mx.io.ImageRecordIter(
            path_imgrec=rec_train,
            path_imgidx=rec_train_idx,
            preprocess_threads=num_workers,
            shuffle=True,
            batch_size=batch_size,
            data_shape=(3, input_size, input_size),
            mean_r=mean_rgb[0],
            mean_g=mean_rgb[1],
            mean_b=mean_rgb[2],
            std_r=std_rgb[0],
            std_g=std_rgb[1],
            std_b=std_rgb[2],
            rand_mirror=True,
            random_resized_crop=True,
            max_aspect_ratio=4. / 3.,
            min_aspect_ratio=3. / 4.,
            max_random_area=1,
            min_random_area=0.08,
            brightness=jitter_param,
            saturation=jitter_param,
            contrast=jitter_param,
            pca_noise=lighting_param,
            num_parts=nworker,
            part_index=rank)
        val_data = mx.io.ImageRecordIter(
            path_imgrec=rec_val,
            path_imgidx=rec_val_idx,
            preprocess_threads=num_workers,
            shuffle=False,
            batch_size=batch_size,
            resize=resize,
            data_shape=(3, input_size, input_size),
            mean_r=mean_rgb[0],
            mean_g=mean_rgb[1],
            mean_b=mean_rgb[2],
            std_r=std_rgb[0],
            std_g=std_rgb[1],
            std_b=std_rgb[2],
            num_parts=nworker,
            part_index=rank)
        return train_data, val_data, batch_fn

    def get_data_loader(data_dir, batch_size, num_workers):
        """Build Gluon DataLoaders over raw-image ImageNet folders."""
        normalize = transforms.Normalize([0.485, 0.456, 0.406],
                                         [0.229, 0.224, 0.225])
        jitter_param = 0.4
        lighting_param = 0.1
        input_size = opt.input_size
        crop_ratio = opt.crop_ratio if opt.crop_ratio > 0 else 0.875
        resize = int(math.ceil(input_size / crop_ratio))

        def batch_fn(batch, ctx):
            data = gluon.utils.split_and_load(batch[0],
                                              ctx_list=ctx,
                                              batch_axis=0)
            label = gluon.utils.split_and_load(batch[1],
                                               ctx_list=ctx,
                                               batch_axis=0)
            return data, label

        transform_train = transforms.Compose([
            transforms.RandomResizedCrop(input_size),
            transforms.RandomFlipLeftRight(),
            transforms.RandomColorJitter(brightness=jitter_param,
                                         contrast=jitter_param,
                                         saturation=jitter_param),
            transforms.RandomLighting(lighting_param),
            transforms.ToTensor(), normalize
        ])
        transform_test = transforms.Compose([
            transforms.Resize(resize, keep_ratio=True),
            transforms.CenterCrop(input_size),
            transforms.ToTensor(), normalize
        ])

        train_data = gluon.data.DataLoader(imagenet.classification.ImageNet(
            data_dir, train=True).transform_first(transform_train),
                                           batch_size=batch_size,
                                           shuffle=True,
                                           last_batch='discard',
                                           num_workers=num_workers)
        val_data = gluon.data.DataLoader(imagenet.classification.ImageNet(
            data_dir, train=False).transform_first(transform_test),
                                         batch_size=batch_size,
                                         shuffle=False,
                                         num_workers=num_workers)

        return train_data, val_data, batch_fn

    if opt.use_rec:
        train_data, val_data, batch_fn = get_data_rec(
            opt.rec_train, opt.rec_train_idx, opt.rec_val, opt.rec_val_idx,
            batch_size, num_workers)
    else:
        train_data, val_data, batch_fn = get_data_loader(
            opt.data_dir, batch_size, num_workers)

    # Mixup targets are soft labels, so RMSE is used instead of accuracy.
    if opt.mixup:
        train_metric = mx.metric.RMSE()
    else:
        train_metric = mx.metric.Accuracy()
    acc_top1 = mx.metric.Accuracy()
    acc_top5 = mx.metric.TopKAccuracy(5)

    save_frequency = opt.save_frequency
    if opt.save_dir and save_frequency:
        save_dir = opt.save_dir
        makedirs(save_dir)
    else:
        save_dir = ''
        save_frequency = 0

    def mixup_transform(label, classes, lam=1, eta=0.0):
        """Blend each one-hot label with its batch-reversed counterpart."""
        if isinstance(label, nd.NDArray):
            label = [label]
        res = []
        for l in label:
            y1 = l.one_hot(classes,
                           on_value=1 - eta + eta / classes,
                           off_value=eta / classes)
            y2 = l[::-1].one_hot(classes,
                                 on_value=1 - eta + eta / classes,
                                 off_value=eta / classes)
            res.append(lam * y1 + (1 - lam) * y2)
        return res

    def smooth(label, classes, eta=0.1):
        """Label smoothing: one-hot with eta mass spread over other classes."""
        if isinstance(label, nd.NDArray):
            label = [label]
        smoothed = []
        for l in label:
            res = l.one_hot(classes,
                            on_value=1 - eta + eta / classes,
                            off_value=eta / classes)
            smoothed.append(res)
        return smoothed

    def test(ctx, val_data):
        """Evaluate `net`; returns (top-1 error, top-5 error)."""
        if opt.use_rec:
            val_data.reset()
        acc_top1.reset()
        acc_top5.reset()
        for i, batch in enumerate(val_data):
            data, label = batch_fn(batch, ctx)
            outputs = [net(X.astype(opt.dtype, copy=False)) for X in data]
            acc_top1.update(label, outputs)
            acc_top5.update(label, outputs)

        _, top1 = acc_top1.get()
        _, top5 = acc_top5.get()
        return (1 - top1, 1 - top5)

    def train(ctx):
        """Run the full training loop on context(s) `ctx`."""
        if isinstance(ctx, mx.Context):
            ctx = [ctx]
        # BUG FIX: was `opt.resume_params is ''` (identity comparison).
        if opt.resume_params == '':
            net.initialize(mx.init.MSRAPrelu(), ctx=ctx)

        if opt.no_wd:
            # Exclude norm/bias parameters from weight decay.
            for k, v in net.collect_params('.*beta|.*gamma|.*bias').items():
                v.wd_mult = 0.0
        compression_params = {
            "compressor": opt.compressor,
            "ef": opt.ef,
            "momentum": opt.compress_momentum,
            "scaling": opt.onebit_scaling,
            "k": opt.k
        }
        trainer = bps.DistributedTrainer(
            net.collect_params(),
            optimizer,
            optimizer_params,
            compression_params=compression_params)
        # BUG FIX: was `opt.resume_states is not ''` (identity comparison).
        if opt.resume_states != '':
            trainer.load_states(opt.resume_states)

        # Soft labels (smoothing/mixup) require dense targets.
        if opt.label_smoothing or opt.mixup:
            sparse_label_loss = False
        else:
            sparse_label_loss = True
        if distillation:
            L = gcv.loss.DistillationSoftmaxCrossEntropyLoss(
                temperature=opt.temperature,
                hard_weight=opt.hard_weight,
                sparse_label=sparse_label_loss)
        else:
            L = gluon.loss.SoftmaxCrossEntropyLoss(
                sparse_label=sparse_label_loss)

        best_val_score = 1
        # bps.byteps_declare_tensor("acc")
        for epoch in range(opt.resume_epoch, opt.num_epochs):
            tic = time.time()
            if opt.use_rec:
                train_data.reset()
            train_metric.reset()
            btic = time.time()

            for i, batch in enumerate(train_data):
                data, label = batch_fn(batch, ctx)

                if opt.mixup:
                    lam = np.random.beta(opt.mixup_alpha, opt.mixup_alpha)
                    # Disable mixup for the final `mixup_off_epoch` epochs.
                    if epoch >= opt.num_epochs - opt.mixup_off_epoch:
                        lam = 1
                    data = [lam * X + (1 - lam) * X[::-1] for X in data]

                    if opt.label_smoothing:
                        eta = 0.1
                    else:
                        eta = 0.0
                    label = mixup_transform(label, classes, lam, eta)

                elif opt.label_smoothing:
                    hard_label = label
                    label = smooth(label, classes)

                if distillation:
                    teacher_prob = [
                        nd.softmax(
                            teacher(X.astype(opt.dtype, copy=False)) /
                            opt.temperature) for X in data
                    ]

                with ag.record():
                    outputs = [
                        net(X.astype(opt.dtype, copy=False)) for X in data
                    ]
                    if distillation:
                        loss = [
                            L(yhat.astype('float32', copy=False),
                              y.astype('float32', copy=False),
                              p.astype('float32', copy=False))
                            for yhat, y, p in zip(outputs, label,
                                                  teacher_prob)
                        ]
                    else:
                        loss = [
                            L(yhat, y.astype(opt.dtype, copy=False))
                            for yhat, y in zip(outputs, label)
                        ]
                for l in loss:
                    l.backward()
                trainer.step(batch_size)

                if opt.mixup:
                    output_softmax = [
                        nd.SoftmaxActivation(
                            out.astype('float32', copy=False))
                        for out in outputs
                    ]
                    train_metric.update(label, output_softmax)
                else:
                    if opt.label_smoothing:
                        train_metric.update(hard_label, outputs)
                    else:
                        train_metric.update(label, outputs)

                if opt.log_interval and not (i + 1) % opt.log_interval:
                    train_metric_name, train_metric_score = train_metric.get()
                    logger.info(
                        'Epoch[%d] Batch [%d]\tSpeed: %f samples/sec\t%s=%f\tlr=%f\ttime=%f'
                        % (epoch, i,
                           batch_size * nworker * opt.log_interval /
                           (time.time() - btic), train_metric_name,
                           train_metric_score, trainer.learning_rate,
                           time.time() - btic))
                    btic = time.time()

            train_metric_name, train_metric_score = train_metric.get()
            throughput = int(batch_size * nworker * i / (time.time() - tic))

            logger.info('[Epoch %d] speed: %d samples/sec\ttime cost: %f' %
                        (epoch, throughput, time.time() - tic))

            err_top1_val, err_top5_val = test(ctx, val_data)

            # Cross-worker metric averaging, deliberately disabled:
            # acc = mx.nd.array([train_metric_score, err_top1_val, err_top5_val],
            #                   ctx=ctx[0])
            # bps.byteps_push_pull(acc, name="acc", is_average=False)
            # acc /= bps.size()
            # train_metric_score, err_top1_val, err_top5_val = acc[0].asscalar(
            # ), acc[1].asscalar(), acc[2].asscalar()

            # if bps.rank() == 0:
            logger.info('[Epoch %d] training: %s=%f' %
                        (epoch, train_metric_name, train_metric_score))
            logger.info('[Epoch %d] validation: err-top1=%f err-top5=%f' %
                        (epoch, err_top1_val, err_top5_val))

            if err_top1_val < best_val_score:
                best_val_score = err_top1_val
                net.save_parameters(
                    '%s/%.4f-imagenet-%s-%d-best.params' %
                    (save_dir, best_val_score, model_name, epoch))
                trainer.save_states(
                    '%s/%.4f-imagenet-%s-%d-best.states' %
                    (save_dir, best_val_score, model_name, epoch))

            if save_frequency and save_dir and (
                    epoch + 1) % save_frequency == 0:
                net.save_parameters('%s/imagenet-%s-%d.params' %
                                    (save_dir, model_name, epoch))
                trainer.save_states('%s/imagenet-%s-%d.states' %
                                    (save_dir, model_name, epoch))

        if save_frequency and save_dir:
            net.save_parameters('%s/imagenet-%s-%d.params' %
                                (save_dir, model_name, opt.num_epochs - 1))
            trainer.save_states('%s/imagenet-%s-%d.states' %
                                (save_dir, model_name, opt.num_epochs - 1))

    if opt.mode == 'hybrid':
        net.hybridize(static_alloc=True, static_shape=True)
        if distillation:
            teacher.hybridize(static_alloc=True, static_shape=True)
    train(context)
def _current_context(self):
    """Return the device this worker should use: its local-rank GPU when a
    GPU is available, otherwise MXNet's current default context."""
    return mx.gpu(bps.local_rank()) if has_gpu else mx.current_context()
def get_data_rec(rec_train, rec_train_idx, rec_val, rec_val_idx, batch_size,
                 num_workers):
    """Build sharded RecordIO train/val iterators pinned to this worker's GPU.

    Returns (train_data, val_data, batch_fn) where batch_fn splits a
    DataBatch across a list of contexts.
    """
    rec_train = os.path.expanduser(rec_train)
    rec_train_idx = os.path.expanduser(rec_train_idx)
    rec_val = os.path.expanduser(rec_val)
    rec_val_idx = os.path.expanduser(rec_val_idx)
    jitter_param = 0.4
    lighting_param = 0.1
    input_size = opt.input_size
    crop_ratio = opt.crop_ratio if opt.crop_ratio > 0 else 0.875
    resize = int(math.ceil(input_size / crop_ratio))
    mean_rgb = [123.68, 116.779, 103.939]
    std_rgb = [58.393, 57.12, 57.375]

    def batch_fn(batch, ctx):
        data = gluon.utils.split_and_load(batch.data[0],
                                          ctx_list=ctx,
                                          batch_axis=0)
        label = gluon.utils.split_and_load(batch.label[0],
                                           ctx_list=ctx,
                                           batch_axis=0)
        return data, label

    # Keyword arguments shared by both iterators: normalization, sharding
    # across BytePS workers, and GPU-side decoding on the local rank.
    shared_kwargs = dict(preprocess_threads=num_workers,
                         batch_size=batch_size,
                         data_shape=(3, input_size, input_size),
                         mean_r=mean_rgb[0],
                         mean_g=mean_rgb[1],
                         mean_b=mean_rgb[2],
                         std_r=std_rgb[0],
                         std_g=std_rgb[1],
                         std_b=std_rgb[2],
                         num_parts=nworker,
                         part_index=rank,
                         device_id=bps.local_rank())

    # Training iterator: shuffled, with random-resized-crop, mirror,
    # color-jitter and PCA-lighting augmentation.
    train_data = mx.io.ImageRecordIter(path_imgrec=rec_train,
                                       path_imgidx=rec_train_idx,
                                       shuffle=True,
                                       rand_mirror=True,
                                       random_resized_crop=True,
                                       max_aspect_ratio=4. / 3.,
                                       min_aspect_ratio=3. / 4.,
                                       max_random_area=1,
                                       min_random_area=0.08,
                                       brightness=jitter_param,
                                       saturation=jitter_param,
                                       contrast=jitter_param,
                                       pca_noise=lighting_param,
                                       **shared_kwargs)
    # Validation iterator: deterministic resize, no augmentation.
    val_data = mx.io.ImageRecordIter(path_imgrec=rec_val,
                                     path_imgidx=rec_val_idx,
                                     shuffle=False,
                                     resize=resize,
                                     **shared_kwargs)
    return train_data, val_data, batch_fn
def fit(args, network, data_loader, **kwargs):
    """Train a symbolic model with BytePS.

    Parameters
    ----------
    args : argparse.Namespace
        Parsed command-line arguments.
    network : Symbol
        The symbol definition of the neural network.
    data_loader : callable
        Function that returns the train and val data iterators given
        (args, (rank, size, local_rank)).
    """
    # logging
    head = '%(asctime)-15s Node[' + str(bps.rank()) + '] %(message)s'
    logging.basicConfig(level=logging.DEBUG, format=head)
    logging.info('start with arguments %s', args)

    # data iterators
    (train, val) = data_loader(args,
                               (bps.rank(), bps.size(), bps.local_rank()))
    if args.test_io:
        # Pure I/O benchmark mode: drain the training iterator, report
        # throughput, and return without training.
        tic = time.time()
        for i, batch in enumerate(train):
            for j in batch.data:
                j.wait_to_read()
            if (i + 1) % args.disp_batches == 0:
                logging.info(
                    'Batch [%d]\tSpeed: %.2f samples/sec', i,
                    args.disp_batches * args.batch_size /
                    (time.time() - tic))
                tic = time.time()
        return

    # load model
    if 'arg_params' in kwargs and 'aux_params' in kwargs:
        arg_params = kwargs['arg_params']
        aux_params = kwargs['aux_params']
    else:
        sym, arg_params, aux_params = _load_model(args, bps.rank())
        if sym is not None:
            # The checkpointed symbol must match the requested network.
            assert sym.tojson() == network.tojson()

    # save model
    checkpoint = _save_model(args, bps.rank())

    # devices for training
    if args.cpu_train:
        devs = [mx.cpu(bps.local_rank())]
    else:
        logging.info('Launch BytePS process on GPU-%d', bps.local_rank())
        devs = [mx.gpu(bps.local_rank())]

    # learning rate
    lr, lr_scheduler = _get_lr_scheduler(args)

    # create model
    model = mx.mod.Module(context=devs, symbol=network)

    optimizer_params = {
        'learning_rate': lr,
        'wd': args.wd,
        'lr_scheduler': lr_scheduler,
        'multi_precision': True
    }

    # Only a limited number of optimizers have 'momentum' property
    has_momentum = {'sgd', 'dcasgd', 'nag'}
    if args.optimizer in has_momentum:
        optimizer_params['momentum'] = args.mom

    monitor = mx.mon.Monitor(args.monitor,
                             pattern=".*") if args.monitor > 0 else None

    # A limited number of optimizers have a warmup period
    has_warmup = {'lbsgd', 'lbnag'}
    if args.optimizer in has_warmup:
        if bps.size() > 1:
            nworkers = bps.size()
        else:
            nworkers = 1
        epoch_size = args.num_examples / args.batch_size / nworkers
        if epoch_size < 1:
            epoch_size = 1
        # Macrobatch must cover at least one global batch.
        macrobatch_size = args.macrobatch_size
        if macrobatch_size < args.batch_size * nworkers:
            macrobatch_size = args.batch_size * nworkers
        batch_scale = math.ceil(
            float(macrobatch_size) / args.batch_size / nworkers)
        optimizer_params['updates_per_epoch'] = epoch_size
        optimizer_params[
            'begin_epoch'] = args.load_epoch if args.load_epoch else 0
        optimizer_params['batch_scale'] = batch_scale
        optimizer_params['warmup_strategy'] = args.warmup_strategy
        optimizer_params['warmup_epochs'] = args.warmup_epochs
        optimizer_params['num_epochs'] = args.num_epochs

    if args.initializer == 'default':
        if args.network == 'alexnet':
            # AlexNet will not converge using Xavier
            initializer = mx.init.Normal()
        # VGG will not trend to converge using Xavier-Gaussian
        elif 'vgg' in args.network:
            initializer = mx.init.Xavier()
        else:
            initializer = mx.init.Xavier(rnd_type='gaussian',
                                         factor_type="in",
                                         magnitude=2)
    elif args.initializer == 'xavier':
        initializer = mx.init.Xavier()
    elif args.initializer == 'msra':
        initializer = mx.init.MSRAPrelu()
    elif args.initializer == 'orthogonal':
        initializer = mx.init.Orthogonal()
    elif args.initializer == 'normal':
        initializer = mx.init.Normal()
    elif args.initializer == 'uniform':
        initializer = mx.init.Uniform()
    elif args.initializer == 'one':
        initializer = mx.init.One()
    elif args.initializer == 'zero':
        initializer = mx.init.Zero()

    # evaluation metrices
    eval_metrics = ['accuracy']
    if args.top_k > 0:
        eval_metrics.append(
            mx.metric.create('top_k_accuracy', top_k=args.top_k))

    supported_loss = ['ce', 'nll_loss']
    if len(args.loss) > 0:
        # ce or nll loss is only applicable to softmax output
        loss_type_list = args.loss.split(',')
        if 'softmax_output' in network.list_outputs():
            for loss_type in loss_type_list:
                loss_type = loss_type.strip()
                if loss_type == 'nll':
                    loss_type = 'nll_loss'
                if loss_type not in supported_loss:
                    logging.warning(loss_type + ' is not an valid loss type, only cross-entropy or ' \
                                    'negative likelihood loss is supported!')
                else:
                    eval_metrics.append(mx.metric.create(loss_type))
        else:
            logging.warning(
                "The output is not softmax_output, loss argument will be skipped!"
            )

    # callbacks that run after each batch
    batch_end_callbacks = [
        mx.callback.Speedometer(args.batch_size, args.disp_batches)
    ]
    if 'batch_end_callback' in kwargs:
        cbs = kwargs['batch_end_callback']
        batch_end_callbacks += cbs if isinstance(cbs, list) else [cbs]

    # BytePS wrapper: push-pull gradients through the BytePS core instead
    # of a kvstore (note kvstore=None in model.fit below).
    opt = mx.optimizer.create(args.optimizer, sym=network, **optimizer_params)
    opt = bps.DistributedOptimizer(opt)

    # BytePS: better to explicitly init params and broadcast from rank 0 so
    # every worker starts from identical weights.
    model.bind(data_shapes=train.provide_data,
               label_shapes=train.provide_label)
    if arg_params is None and aux_params is None:
        model.init_params(initializer)
        (arg_params, aux_params) = model.get_params()
    if arg_params is not None:
        bps.broadcast_parameters(arg_params, root_rank=0)
    if aux_params is not None:
        bps.broadcast_parameters(aux_params, root_rank=0)
    model.set_params(arg_params=arg_params, aux_params=aux_params)

    # run
    model.fit(train,
              begin_epoch=args.load_epoch if args.load_epoch else 0,
              num_epoch=args.num_epochs,
              eval_data=val,
              eval_metric=eval_metrics,
              kvstore=None,
              optimizer=opt,
              optimizer_params=optimizer_params,
              batch_end_callback=batch_end_callbacks,
              epoch_end_callback=checkpoint,
              allow_missing=True,
              monitor=monitor)
# Standard library.
import time

# Third-party: MXNet + BytePS for data-parallel training; DALI for the
# GPU data pipeline; gluonfr for face-recognition losses and models.
import mxnet as mx
import byteps.mxnet as bps
from mxnet import autograd as ag
from gluoncv.utils import LRScheduler
from nvidia.dali.plugin.mxnet import DALIClassificationIterator
from gluonfr.loss import *
from gluonfr.model_zoo import *
from utils import Logger, FacePipe, ParallelValidation

# init BytePS
bps.init()
num_gpu = bps.size()
local_rank = bps.local_rank()
rank = bps.rank()

# hyper parameters
# NOTE(review): spelled "lamda" (sic) — presumably a loss trade-off
# weight; confirm against its use downstream before renaming.
lamda = 0.01
r_init = 20.0
embedding_size = 128
lr = 0.1
momentum = 0.9
wd = 4e-5
# Pin each worker process to its local-rank GPU.
ctx = mx.gpu(local_rank)
# Data-loading workers per process.
num_worker = 4
batch_size_per_gpu = 128
# Effective global batch size across all BytePS workers.
batch_size = batch_size_per_gpu * num_gpu
# NOTE(review): this chunk starts mid-method — the enclosing DALI pipeline
# method's header is outside this view, so the leading statements'
# indentation is reconstructed; confirm against the original file.
        images = self._decode(inputs)
        if self.train:
            # Pad then paste at a random (px, py) offset — the classic
            # CIFAR random-crop augmentation.
            images = self.padding(images,
                                  paste_x=self.px(),
                                  paste_y=self.py())
        # NOTE(review): whether the crop/mirror/normalize step belongs
        # inside the `if self.train` branch is ambiguous in this view;
        # placed outside here — verify against the original file.
        images = self._cmnp(images, mirror=self._coin())
        return images, labels.gpu()


args = parse_args()

# Feature switches derived from the CLI.
use_mix_up = True if args.mix_up else False
use_float16 = True if args.float16 else False
label_smoothing = args.epsilon > 0
epsilon = args.epsilon

bps.init()
num_gpu = bps.size()
local_rank = bps.local_rank()
rank = bps.rank()

epochs = args.epochs + 1
alpha = args.alpha
max_accuracy = 0.0
# Pin each worker process to its local-rank GPU.
ctx = mx.gpu(bps.local_rank())

# load_data
# Global batch size scales with the number of workers.
batch_size = args.batch_size * num_gpu
train_pipes = [
    CifarPipe(args.batch_size, args.num_workers, local_rank, num_gpu, rank,
              use_float16)
]