def main():
    args = parse_args()
    with open('config.yml', 'r', encoding='utf-8') as f:
        run = wandb.init(job_type='train',
                         dir=root_dir(),
                         project=args.wandb,
                         config=yaml.safe_load(f))
    if args.model != wandb.config.model_name:
        raise RuntimeError(
            f"Inconsistent model name: {args.model} vs. {wandb.config.model_name}")
    # if using a wandb sweep, the sweep's lr/wd override the config file
    if args.lr and args.wd:
        wandb.config.lr = args.lr
        wandb.config.wd = args.wd
    ctx = get_contexts(args.ctx)
    wandb.config.ctx = ctx  # add contexts to config
    logger = get_logger(name='train', level=10)
    fit(run=run, ctx=ctx, log_interval=args.log_interval, no_val=args.no_val, logger=logger)
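# NOTE: a minimal sketch of the argument parser that main() above relies on.
# parse_args() is not shown in this snippet; the flag names below are inferred
# from how `args` is used and may differ from the real helper.
def parse_args():
    import argparse
    parser = argparse.ArgumentParser(description='train a segmentation model')
    parser.add_argument('--model', type=str, required=True,
                        help='model name; must match model_name in config.yml')
    parser.add_argument('--wandb', type=str, required=True, help='wandb project name')
    parser.add_argument('--ctx', type=int, nargs='+', default=[0], help='GPU id(s) to use')
    parser.add_argument('--lr', type=float, default=None, help='override lr (e.g. from a wandb sweep)')
    parser.add_argument('--wd', type=float, default=None, help='override wd (e.g. from a wandb sweep)')
    parser.add_argument('--log-interval', type=int, default=5)
    parser.add_argument('--no-val', action='store_true', help='skip validation')
    return parser.parse_args()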
def train(cfg, ctx_lst, project_name, log_interval=5, no_val=False, lr=None, wd=None):
    wandb.init(job_type='train', dir=my_tools.root_dir(), config=cfg, project=project_name)
    # if using a wandb sweep, the sweep's lr/wd override the config file
    if lr and wd:
        wandb.config.lr = lr
        wandb.config.wd = wd

    ctx = my_tools.get_contexts(ctx_lst)
    wandb.config.ctx = ctx

    data_factory = DataFactory(wandb.config.data_name)
    model_factory = ModelFactory(wandb.config.model_name)

    # build the model
    norm_layer, norm_kwargs = my_tools.get_norm_layer(wandb.config.norm, len(ctx))
    model_kwargs = {
        'nclass': data_factory.num_class,
        'backbone': wandb.config.backbone,
        'pretrained_base': wandb.config.backbone_init.get('manner') == 'cls',
        'aux': wandb.config.aux,
        'crop_size': wandb.config.crop_size,
        'base_size': wandb.config.base_size,
        'dilate': wandb.config.dilate,
        'norm_layer': norm_layer,
        'norm_kwargs': norm_kwargs,
    }
    net = model_factory.get_model(model_kwargs,
                                  resume=wandb.config.resume,
                                  lr_mult=wandb.config.lr_mult,
                                  backbone_init_manner=wandb.config.backbone_init.get('manner'),
                                  backbone_ckpt=wandb.config.backbone_init.get('backbone_ckpt'),
                                  prior_classes=wandb.config.backbone_init.get('prior_classes'),
                                  ctx=ctx)
    if net.symbolize:
        net.hybridize()

    # data loaders
    num_worker = 0 if platform.system() == 'Windows' else 16
    train_set = data_factory.seg_dataset(split='train',  # sometimes would be 'trainval'
                                         mode='train',
                                         transform=my_tools.image_transform(),
                                         base_size=wandb.config.base_size,
                                         crop_size=wandb.config.crop_size)
    train_iter = DataLoader(train_set, wandb.config.bs_train, shuffle=True,
                            last_batch='discard', num_workers=num_worker)
    val_set = data_factory.seg_dataset(split='val',
                                       mode='val',
                                       transform=my_tools.image_transform(),
                                       base_size=wandb.config.base_size,
                                       crop_size=wandb.config.crop_size)
    val_iter = DataLoader(val_set, wandb.config.bs_val, shuffle=False,
                          last_batch='keep', num_workers=num_worker)
    wandb.config.num_train = len(train_set)
    wandb.config.num_valid = len(val_set)

    # loss
    criterion = _get_criterion(wandb.config.aux, wandb.config.aux_weight)
    criterion.initialize(ctx=ctx)
    wandb.config.criterion = type(criterion)

    # optimizer / trainer
    if wandb.config.optimizer == 'adam':
        trainer = Trainer(net.collect_params(), 'adam',
                          optimizer_params={'learning_rate': wandb.config.lr,
                                            'wd': wandb.config.wd,
                                            'beta1': wandb.config.adam.get('adam_beta1'),
                                            'beta2': wandb.config.adam.get('adam_beta2')})
    elif wandb.config.optimizer in ('sgd', 'nag'):
        scheduler = _lr_scheduler(mode=wandb.config.lr_scheduler,
                                  base_lr=wandb.config.lr,
                                  target_lr=wandb.config.target_lr,
                                  nepochs=wandb.config.epochs,
                                  iters_per_epoch=len(train_iter),
                                  step_epoch=wandb.config.step.get('step_epoch'),
                                  step_factor=wandb.config.step.get('step_factor'),
                                  power=wandb.config.poly.get('power'))
        trainer = Trainer(net.collect_params(), wandb.config.optimizer,
                          optimizer_params={'lr_scheduler': scheduler,
                                            'wd': wandb.config.wd,
                                            'momentum': wandb.config.momentum,
                                            'multi_precision': True})
    else:
        raise RuntimeError(f"Unknown optimizer: {wandb.config.optimizer}")

    metric = SegmentationMetric(data_factory.num_class)

    logger = get_logger(name='train', level=10)
    t_start = my_tools.get_strftime()
    logger.info(f'Training start: {t_start}')
    for k, v in wandb.config.items():
        logger.info(f'{k}: {v}')
    logger.info('-----> end hyper-parameters <-----')
    wandb.config.start_time = t_start

    best_score = .0
    best_epoch = 0
    for epoch in range(wandb.config.epochs):
        # training loop
        train_loss = .0
        tbar = tqdm(train_iter)
        for i, (data, target) in enumerate(tbar):
            gpu_datas = split_and_load(data, ctx_list=ctx)
            gpu_targets = split_and_load(target, ctx_list=ctx)
            with autograd.record():
                loss_gpus = [criterion(*net(gpu_data), gpu_target)
                             for gpu_data, gpu_target in zip(gpu_datas, gpu_targets)]
            for loss in loss_gpus:
                autograd.backward(loss)
            trainer.step(wandb.config.bs_train)
            nd.waitall()
            train_loss += sum([loss.mean().asscalar() for loss in loss_gpus]) / len(loss_gpus)
            tbar.set_description('Epoch-%d [training], loss %.5f, %s'
                                 % (epoch, train_loss / (i + 1),
                                    my_tools.get_strftime('%Y-%m-%d %H:%M:%S')))
            if (i % log_interval == 0) or (i + 1 == len(train_iter)):
                wandb.log({f'train_loss_batch, interval={log_interval}': train_loss / (i + 1)})
        wandb.log({'train_loss_epoch': train_loss / len(train_iter),
                   'custom_step': epoch})

        # validation loop
        if not no_val:
            val_loss = .0
            vbar = tqdm(val_iter)
            for i, (data, target) in enumerate(vbar):
                gpu_datas = split_and_load(data=data, ctx_list=ctx, even_split=False)
                gpu_targets = split_and_load(data=target, ctx_list=ctx, even_split=False)
                loss_gpus = []
                for gpu_data, gpu_target in zip(gpu_datas, gpu_targets):
                    gpu_output = net(gpu_data)
                    loss_gpus.append(criterion(*gpu_output, gpu_target))
                    metric.update(gpu_target, gpu_output[0])
                val_loss += sum([loss.mean().asscalar() for loss in loss_gpus]) / len(loss_gpus)
                vbar.set_description('Epoch-%d [validation], PA %.4f, mIoU %.4f'
                                     % (epoch, metric.get()[0], metric.get()[1]))
            nd.waitall()
            pix_acc, mean_iou = metric.get()
            wandb.log({'val_PA': pix_acc,
                       'val_mIoU': mean_iou,
                       'val_loss': val_loss / len(val_iter),
                       'custom_step': epoch})
            metric.reset()
            if mean_iou > best_score:
                my_tools.save_checkpoint(model=net,
                                         model_name=wandb.config.model_name.lower(),
                                         backbone=wandb.config.backbone.lower(),
                                         data_name=wandb.config.data_name.lower(),
                                         time_stamp=wandb.config.start_time,
                                         is_best=True)
                best_score = mean_iou
                best_epoch = epoch

    logger.info(f'Best val mIoU={round(best_score * 100, 2)} at epoch: {best_epoch}')
    wandb.config.best_epoch = best_epoch
    # always save the final (last-epoch) checkpoint as well
    my_tools.save_checkpoint(model=net,
                             model_name=wandb.config.model_name.lower(),
                             backbone=wandb.config.backbone.lower(),
                             data_name=wandb.config.data_name.lower(),
                             time_stamp=wandb.config.start_time,
                             is_best=False)
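# NOTE: a rough usage sketch of train() above, assuming the same config.yml layout
# that main() reads; the project name and GPU ids here are placeholders.
if __name__ == '__main__':
    import yaml
    with open('config.yml', 'r', encoding='utf-8') as f:
        cfg = yaml.safe_load(f)
    train(cfg, ctx_lst=(0, 1), project_name='semantic-seg', log_interval=5, no_val=False)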
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. import sys sys.path.insert(0, "../../python/") import mxnet as mx import numpy as np import numpy.random as rnd import time import argparse from mxnet.log import get_logger import logging from mxnet.kvstore import BytePS logger = get_logger("Byteps-Backend-Test", level=logging.DEBUG) # parser parser = argparse.ArgumentParser(description='kvstore test') parser.add_argument('--name', type=str, default='byteps') args = parser.parse_args() def check_diff_to_scalar(A, x, rank=None): """ assert A == x""" assert (np.sum(np.abs((A - x).asnumpy())) == 0), (rank, A.asnumpy(), x) # setup keys = ['3', '5', '7'] init_test_keys = [str(i) for i in range(200, 300)]
import os

import mxnet
import numpy as np
from mxnet import autograd, gluon, log, nd

from constants import BOS, EOS, PAD
from data import load_data, make_src_mask, make_trg_mask
from model import make_net
from train import ReduceLRScheduler, get_loss
from translate import translate

logger = log.get_logger(name='transformer', filename='working/log.log', level=log.INFO)
ctx = mxnet.cpu()

# hyper params
epoch = 200
limit = 30
data_dir = '../data/iwslt16/de-en'
src_lang = 'de'
trg_lang = 'en'
batch_size = 1

# for net
num_layer = 6    # 6
model_dim = 300  # 512
h = 6            # 8
ff_dim = 1024    # 2048
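# NOTE: a small sanity check, not part of the original script. Multi-head attention
# splits model_dim evenly across h heads (here 300 / 6 = 50 dims per head), so the
# two values must divide cleanly; the commented numbers (512, 8, 2048, 6 layers)
# are the base settings from the original Transformer paper.
assert model_dim % h == 0, 'model_dim must be divisible by the number of heads h'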
from mxnetseg.data import segmentation_dataset
from mxnetseg.tools import image_transform, root_dir, city_train2label, my_color_palette

color_palette = {
    # dataset name: dataset palette key
    'voc2012': 'pascal_voc',
    'ade20k': 'ade20k',
    'cityscapes': 'citys',
    'bdd': 'citys',  # BDD100k shares the same 19 semantic categories as Cityscapes
    'mhpv1': 'mhpv1',
    'camvid': 'camvid',
    'camvidfull': 'camvid',
    'mapillarry': 'mapillarry',
}

logger = get_logger(name='eval', level=20)


class EvalFactory:
    """Methods for model evaluation."""

    @staticmethod
    def _sample(shape, ctx) -> nd.NDArray:
        if isinstance(shape, (list, tuple)):
            h = shape[0]
            w = shape[1]
        else:
            h = shape
            w = shape