Example #1
def main():
    args = parse_args()

    with open('config.yml', 'r', encoding='utf-8') as f:
        run = wandb.init(job_type='train',
                         dir=root_dir(),
                         project=args.wandb,
                         config=yaml.safe_load(f))
    if args.model != wandb.config.model_name:
        raise RuntimeError(
            f"Inconsistent model name: {args.model} vs. {wandb.config.model_name}"
        )

    # if using wandb sweep
    if args.lr and args.wd:
        wandb.config.lr = args.lr
        wandb.config.wd = args.wd

    ctx = get_contexts(args.ctx)
    wandb.config.ctx = ctx  # add contexts to config

    logger = get_logger(name='train', level=10)

    fit(run=run,
        ctx=ctx,
        log_interval=args.log_interval,
        no_val=args.no_val,
        logger=logger)
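
A natural way to run this example (a minimal sketch, assuming the script above is executed directly) is the usual entry-point guard:

if __name__ == '__main__':
    main()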
Example #2
def train(cfg,
          ctx_lst,
          project_name,
          log_interval=5,
          no_val=False,
          lr=None,
          wd=None):
    wandb.init(job_type='train',
               dir=my_tools.root_dir(),
               config=cfg,
               project=project_name)
    if lr and wd:
        wandb.config.lr = lr
        wandb.config.wd = wd

    ctx = my_tools.get_contexts(ctx_lst)
    wandb.config.ctx = ctx

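    # resolve the dataset and model classes by the names stored in the W&B config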
    data_factory = DataFactory(wandb.config.data_name)
    model_factory = ModelFactory(wandb.config.model_name)

    norm_layer, norm_kwargs = my_tools.get_norm_layer(wandb.config.norm,
                                                      len(ctx))
    model_kwargs = {
        'nclass': data_factory.num_class,
        'backbone': wandb.config.backbone,
        'pretrained_base': wandb.config.backbone_init.get('manner') == 'cls',
        'aux': wandb.config.aux,
        'crop_size': wandb.config.crop_size,
        'base_size': wandb.config.base_size,
        'dilate': wandb.config.dilate,
        'norm_layer': norm_layer,
        'norm_kwargs': norm_kwargs,
    }
    net = model_factory.get_model(
        model_kwargs,
        resume=wandb.config.resume,
        lr_mult=wandb.config.lr_mult,
        backbone_init_manner=wandb.config.backbone_init.get('manner'),
        backbone_ckpt=wandb.config.backbone_init.get('backbone_ckpt'),
        prior_classes=wandb.config.backbone_init.get('prior_classes'),
        ctx=ctx)
    if net.symbolize:
        net.hybridize()

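    # data loaders: multi-process workers are disabled on Windows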
    num_worker = 0 if platform.system() == 'Windows' else 16
    train_set = data_factory.seg_dataset(
        split='train',  # sometimes would be 'trainval'
        mode='train',
        transform=my_tools.image_transform(),
        base_size=wandb.config.base_size,
        crop_size=wandb.config.crop_size)
    train_iter = DataLoader(train_set,
                            wandb.config.bs_train,
                            shuffle=True,
                            last_batch='discard',
                            num_workers=num_worker)
    val_set = data_factory.seg_dataset(split='val',
                                       mode='val',
                                       transform=my_tools.image_transform(),
                                       base_size=wandb.config.base_size,
                                       crop_size=wandb.config.crop_size)
    val_iter = DataLoader(val_set,
                          wandb.config.bs_val,
                          shuffle=False,
                          last_batch='keep',
                          num_workers=num_worker)
    wandb.config.num_train = len(train_set)
    wandb.config.num_valid = len(val_set)

    criterion = _get_criterion(wandb.config.aux, wandb.config.aux_weight)
    criterion.initialize(ctx=ctx)
    wandb.config.criterion = type(criterion)

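    # build the Gluon Trainer: Adam with a fixed learning rate, or SGD/NAG driven by an LR scheduler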
    if wandb.config.optimizer == 'adam':
        trainer = Trainer(net.collect_params(),
                          'adam',
                          optimizer_params={
                              'learning_rate': wandb.config.lr,
                              'wd': wandb.config.wd,
                              'beta1': wandb.config.adam.get('adam_beta1'),
                              'beta2': wandb.config.adam.get('adam_beta2')
                          })
    elif wandb.config.optimizer in ('sgd', 'nag'):
        scheduler = _lr_scheduler(
            mode=wandb.config.lr_scheduler,
            base_lr=wandb.config.lr,
            target_lr=wandb.config.target_lr,
            nepochs=wandb.config.epochs,
            iters_per_epoch=len(train_iter),
            step_epoch=wandb.config.step.get('step_epoch'),
            step_factor=wandb.config.step.get('step_factor'),
            power=wandb.config.poly.get('power'))
        trainer = Trainer(net.collect_params(),
                          wandb.config.optimizer,
                          optimizer_params={
                              'lr_scheduler': scheduler,
                              'wd': wandb.config.wd,
                              'momentum': wandb.config.momentum,
                              'multi_precision': True
                          })
    else:
        raise RuntimeError(f"Unknown optimizer: {wandb.config.optimizer}")

    metric = SegmentationMetric(data_factory.num_class)

    logger = get_logger(name='train', level=10)
    t_start = my_tools.get_strftime()
    logger.info(f'Training start: {t_start}')
    for k, v in wandb.config.items():
        logger.info(f'{k}: {v}')
    logger.info('-----> end hyper-parameters <-----')
    wandb.config.start_time = t_start

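    # training loop; best_score / best_epoch track the best validation mIoU seen so far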
    best_score = 0.0
    best_epoch = 0
    for epoch in range(wandb.config.epochs):
        train_loss = 0.0
        tbar = tqdm(train_iter)
        for i, (data, target) in enumerate(tbar):
            gpu_datas = split_and_load(data, ctx_list=ctx)
            gpu_targets = split_and_load(target, ctx_list=ctx)
            with autograd.record():
                loss_gpus = [
                    criterion(*net(gpu_data), gpu_target)
                    for gpu_data, gpu_target in zip(gpu_datas, gpu_targets)
                ]
            for loss in loss_gpus:
                autograd.backward(loss)
            trainer.step(wandb.config.bs_train)
            nd.waitall()
            train_loss += sum([loss.mean().asscalar()
                               for loss in loss_gpus]) / len(loss_gpus)
            tbar.set_description(
                'Epoch-%d [training], loss %.5f, %s' %
                (epoch, train_loss /
                 (i + 1), my_tools.get_strftime('%Y-%m-%d %H:%M:%S')))
            if (i % log_interval == 0) or (i + 1 == len(train_iter)):
                wandb.log({
                    f'train_loss_batch, interval={log_interval}':
                    train_loss / (i + 1)
                })

        wandb.log({
            'train_loss_epoch': train_loss / (len(train_iter)),
            'custom_step': epoch
        })

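        # optional validation pass: accumulate loss and update pixel-accuracy / mIoU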
        if not no_val:
            val_loss = 0.0
            vbar = tqdm(val_iter)
            for i, (data, target) in enumerate(vbar):
                gpu_datas = split_and_load(data=data,
                                           ctx_list=ctx,
                                           even_split=False)
                gpu_targets = split_and_load(data=target,
                                             ctx_list=ctx,
                                             even_split=False)
                loss_gpus = []
                for gpu_data, gpu_target in zip(gpu_datas, gpu_targets):
                    gpu_output = net(gpu_data)
                    loss_gpus.append(criterion(*gpu_output, gpu_target))
                    metric.update(gpu_target, gpu_output[0])
                val_loss += sum([loss.mean().asscalar()
                                 for loss in loss_gpus]) / len(loss_gpus)
                vbar.set_description(
                    'Epoch-%d [validation], PA %.4f, mIoU %.4f' %
                    (epoch, metric.get()[0], metric.get()[1]))
                nd.waitall()
            pix_acc, mean_iou = metric.get()
            wandb.log({
                'val_PA': pix_acc,
                'val_mIoU': mean_iou,
                'val_loss': val_loss / len(val_iter),
                'custom_step': epoch
            })
            metric.reset()
            if mean_iou > best_score:
                my_tools.save_checkpoint(
                    model=net,
                    model_name=wandb.config.model_name.lower(),
                    backbone=wandb.config.backbone.lower(),
                    data_name=wandb.config.data_name.lower(),
                    time_stamp=wandb.config.start_time,
                    is_best=True)
                best_score = mean_iou
                best_epoch = epoch

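    # save the final checkpoint as well, alongside the best one saved above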
    logger.info(
        f'Best val mIoU={round(best_score * 100, 2)} at epoch: {best_epoch}')
    wandb.config.best_epoch = best_epoch
    my_tools.save_checkpoint(model=net,
                             model_name=wandb.config.model_name.lower(),
                             backbone=wandb.config.backbone.lower(),
                             data_name=wandb.config.data_name.lower(),
                             time_stamp=wandb.config.start_time,
                             is_best=False)
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

import sys
sys.path.insert(0, "../../python/")
import mxnet as mx
import numpy as np
import numpy.random as rnd
import time
import argparse
from mxnet.log import get_logger
import logging
from mxnet.kvstore import BytePS
logger = get_logger("Byteps-Backend-Test", level=logging.DEBUG)

# parser
parser = argparse.ArgumentParser(description='kvstore test')
parser.add_argument('--name', type=str, default='byteps')
args = parser.parse_args()


def check_diff_to_scalar(A, x, rank=None):
    """ assert A == x"""
    assert (np.sum(np.abs((A - x).asnumpy())) == 0), (rank, A.asnumpy(), x)


# setup
keys = ['3', '5', '7']
init_test_keys = [str(i) for i in range(200, 300)]
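
A typical continuation for a kvstore correctness test like this one (a sketch, not the original code: it assumes 'byteps' is registered as an ordinary KVStore type and relies only on the standard mx.kv API) might look like:

kv = mx.kv.create(args.name)
shape = (2, 3)
kv.init(keys, [mx.nd.ones(shape)] * len(keys))
out = [mx.nd.zeros(shape) for _ in keys]
kv.push(keys, [mx.nd.ones(shape)] * len(keys))
kv.pull(keys, out=out)
for o in out:
    # with a sum-aggregating backend, every worker pushing ones yields num_workers
    check_diff_to_scalar(o, kv.num_workers)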
Example #4
import os

import mxnet
import numpy as np
from mxnet import autograd, gluon, log, nd

from constants import BOS, EOS, PAD
from data import load_data, make_src_mask, make_trg_mask
from model import make_net
from train import ReduceLRScheduler, get_loss
from translate import translate

logger = log.get_logger(name='transformer',
                        filename='working/log.log',
                        level=log.INFO)

ctx = mxnet.cpu()

# hyper params
epoch = 200
limit = 30
data_dir = '../data/iwslt16/de-en'
src_lang = 'de'
trg_lang = 'en'
batch_size = 1

# network hyper-parameters (the trailing comments appear to be the original Transformer-base values)
num_layer = 6  # 6
model_dim = 300  # 512
h = 6  # 8
ff_dim = 1024  # 2048
Example #5
from mxnetseg.data import segmentation_dataset
from mxnetseg.tools import image_transform, root_dir, city_train2label, my_color_palette

color_palette = {
    # dataset name: dataset palette key
    'voc2012': 'pascal_voc',
    'ade20k': 'ade20k',
    'cityscapes': 'citys',
    'bdd': 'citys',  # BDD100k shares the same 19 semantic categories as Cityscapes
    'mhpv1': 'mhpv1',
    'camvid': 'camvid',
    'camvidfull': 'camvid',
    'mapillarry': 'mapillarry',
}
logger = get_logger(name='eval', level=20)


class EvalFactory:
    """
    methods for model evaluation
    """

    @staticmethod
    def _sample(shape, ctx) -> nd.NDArray:
        if isinstance(shape, (list, tuple)):
            h = shape[0]
            w = shape[1]
        else:
            h = shape
            w = shape