Ejemplo n.º 1
0
def main():
    """Train the image classifier in the mode selected by ``opt.mode``.

    ``'symbolic'`` wraps ``net`` in a symbol graph and trains it through the
    Module API; every other mode calls ``train(opt, context)``, hybridizing
    ``net`` first when the mode is ``'hybrid'``.  When
    ``opt.builtin_profiler`` is positive the run is profiled and the
    aggregate statistics are printed once training returns.
    """
    if opt.builtin_profiler > 0:
        profiler.set_config(profile_all=True, aggregate_stats=True)
        profiler.set_state('run')

    if opt.mode != 'symbolic':
        if opt.mode == 'hybrid':
            net.hybridize()
        train(opt, context)
    else:
        use_fp16 = opt.dtype == 'float16'
        inputs = mx.sym.var('data')
        if use_fp16:
            inputs = mx.sym.Cast(data=inputs, dtype=np.float16)
        logits = net(inputs)
        if use_fp16:
            # Cast back so the softmax output layer works on float32.
            logits = mx.sym.Cast(data=logits, dtype=np.float32)
        module = mx.mod.Module(mx.sym.SoftmaxOutput(logits, name='softmax'),
                               context=context)
        train_iter, val_iter = get_data_iters(dataset, batch_size, opt)
        module.fit(
            train_iter,
            eval_data=val_iter,
            num_epoch=opt.epochs,
            kvstore=kv,
            batch_end_callback=mx.callback.Speedometer(batch_size, max(1, opt.log_interval)),
            epoch_end_callback=mx.callback.do_checkpoint('image-classifier-%s' % opt.model),
            optimizer='sgd',
            optimizer_params={
                'learning_rate': opt.lr,
                'wd': opt.wd,
                'momentum': opt.momentum,
                'multi_precision': True,
            },
            initializer=mx.init.Xavier(magnitude=2),
        )
        module.save_parameters('image-classifier-%s-%d-final.params' % (opt.model, opt.epochs))

    if not opt.builtin_profiler > 0:
        return
    profiler.set_state('stop')
    print(profiler.dumps())
Ejemplo n.º 2
0
def test_profile_create_domain_dept():
    """Smoke-test creating a profiler ``Domain`` while the profiler runs.

    Configures symbolic profiling with a JSON output file, starts the
    profiler, creates a domain named ``'PythonDomain'``, prints it, dumps the
    profile and finally stops the profiler.
    """
    profiler.set_config(profile_symbolic=True, filename='test_profile_create_domain_dept.json')
    profiler.set_state('run')
    domain = profiler.Domain(name='PythonDomain')
    print("Domain created: {}".format(str(domain)))
    # dump_profile() is deprecated in favour of dump(); call the supported
    # entry point directly so the test does not emit a deprecation warning.
    profiler.dump()
    profiler.set_state('stop')
Ejemplo n.º 3
0
    def __init__(self, opts, ctx):
        """Capture run options and prepare output paths, profiling and logging.

        Args:
            opts: Options object; ``epochs``, ``batch_size``,
                ``chkpt_interval``, ``log_interval``, ``weight_interval``,
                ``profile``, ``overwrite`` and ``outdir`` are read.
            ctx: Execution context, stored unchanged on the instance.
        """
        self._opts = opts
        self._epochs = opts.epochs
        self._batch_size = opts.batch_size
        self._ctx = ctx

        # Reporting / checkpoint cadence and the profiling switch.
        self._chkpt_interval = opts.chkpt_interval
        self._log_interval = opts.log_interval
        self._weight_interval = opts.weight_interval
        self._profile = opts.profile

        # Progress counters and registered networks start out empty.
        self._epoch_tick = 0
        self._batch_tick = 0
        self._networks = []

        self._overwrite = opts.overwrite

        # Fall back to "<cwd>/<model>-<epochs>e-<timestamp>" when no output
        # directory was supplied.
        target_dir = opts.outdir
        if not target_dir:
            cwd = os.getcwd()
            model = self.model_name()
            stamp = datetime.now().strftime('%y_%m_%d-%H_%M')
            target_dir = os.path.join(cwd, '{}-{}e-{}'.format(model, self._epochs, stamp))
        self._outdir = os.path.expanduser(target_dir)
        self._outlogs, self._outchkpts, self._outsounds = (
            os.path.join(self._outdir, sub)
            for sub in ('logs', 'checkpoints', 'sounds')
        )
        self._prepare_outdir()

        if self._profile:
            self._outprofile = os.path.join(self._outdir, 'profile.json')
            profiler.set_config(profile_all=True,
                                aggregate_stats=True,
                                filename=self._outprofile)

        # The root logger is used, raised to INFO.
        logging.basicConfig()
        root_logger = logging.getLogger()
        root_logger.setLevel(logging.INFO)
        self._logger = root_logger
Ejemplo n.º 4
0
def test_profile_create_domain_dept():
    """Create a profiler ``Domain`` while profiling is active, then dump.

    The profiler is configured for symbolic profiling, started, a domain
    named ``'PythonDomain'`` is created, the profile is dumped and the
    profiler is stopped.
    """
    output_file = 'test_profile_create_domain_dept.json'
    profiler.set_config(profile_symbolic=True, filename=output_file)
    profiler.set_state('run')
    # Keep a reference so the domain object lives until the function returns.
    python_domain = profiler.Domain(name='PythonDomain')
    profiler.dump()
    profiler.set_state('stop')
def main():
    """Train the image classifier, symbolically or through ``train``.

    In ``'symbolic'`` mode a softmax symbol is built on top of ``net`` and
    fitted with the Module API on all requested GPUs (or the CPU when
    ``num_gpus`` is zero).  Otherwise ``train(opt, context)`` is called,
    after hybridizing ``net`` in ``'hybrid'`` mode.  A positive
    ``opt.builtin_profiler`` wraps the run in the MXNet profiler and prints
    its aggregate statistics at the end.
    """
    if opt.builtin_profiler > 0:
        profiler.set_config(profile_all=True, aggregate_stats=True)
        profiler.set_state('run')

    if opt.mode != 'symbolic':
        if opt.mode == 'hybrid':
            net.hybridize()
        train(opt, context)
    else:
        data_sym = mx.sym.var('data')
        softmax_sym = mx.sym.SoftmaxOutput(net(data_sym), name='softmax')
        if num_gpus > 0:
            devices = [mx.gpu(i) for i in range(num_gpus)]
        else:
            devices = [mx.cpu()]
        module = mx.mod.Module(softmax_sym, context=devices)
        store = mx.kv.create(opt.kvstore)
        train_iter, val_iter = get_data_iters(dataset, batch_size, store.num_workers, store.rank)
        module.fit(
            train_iter,
            eval_data=val_iter,
            num_epoch=opt.epochs,
            kvstore=store,
            batch_end_callback=mx.callback.Speedometer(batch_size, max(1, opt.log_interval)),
            epoch_end_callback=mx.callback.do_checkpoint('image-classifier-%s' % opt.model),
            optimizer='sgd',
            optimizer_params={
                'learning_rate': opt.lr,
                'wd': opt.wd,
                'momentum': opt.momentum,
                'multi_precision': True,
            },
            initializer=mx.init.Xavier(magnitude=2),
        )
        module.save_params('image-classifier-%s-%d-final.params' % (opt.model, opt.epochs))

    if not opt.builtin_profiler > 0:
        return
    profiler.set_state('stop')
    print(profiler.dumps())
Ejemplo n.º 6
0
def test_profile_create_domain_dept():
    """Smoke-test creating a profiler ``Domain`` while the profiler runs.

    Configures symbolic profiling with a JSON output file, starts the
    profiler, creates a domain named ``'PythonDomain'``, prints it, dumps the
    profile and finally stops the profiler.
    """
    profiler.set_config(profile_symbolic=True,
                        filename='test_profile_create_domain_dept.json')
    profiler.set_state('run')
    domain = profiler.Domain(name='PythonDomain')
    print("Domain created: {}".format(str(domain)))
    # dump_profile() is deprecated in favour of dump(); call the supported
    # entry point directly so the test does not emit a deprecation warning.
    profiler.dump()
    profiler.set_state('stop')
Ejemplo n.º 7
0
def main():
    """Run ``train(opt, device)``, optionally under the built-in profiler.

    ``net`` is hybridized first when ``opt.mode`` is ``'hybrid'``.  With a
    positive ``opt.builtin_profiler`` the profiler is started before and
    stopped after training, and its aggregate statistics are printed.
    """
    if opt.builtin_profiler > 0:
        profiler.set_config(profile_all=True, aggregate_stats=True)
        profiler.set_state('run')

    if opt.mode == 'hybrid':
        net.hybridize()
    train(opt, device)

    # Nothing left to do unless profiling was requested.
    if not opt.builtin_profiler > 0:
        return
    profiler.set_state('stop')
    print(profiler.dumps())
Ejemplo n.º 8
0
def main():
    """Run ``train(opt, context)``, optionally under the built-in profiler.

    In ``'hybrid'`` mode ``net`` is hybridized with static allocation and
    static shapes enabled.  With a positive ``opt.builtin_profiler`` the
    profiler brackets the training call and its aggregate statistics are
    printed afterwards.
    """
    if opt.builtin_profiler > 0:
        profiler.set_config(profile_all=True, aggregate_stats=True)
        profiler.set_state('run')

    if opt.mode == 'hybrid':
        net.hybridize(static_alloc=True, static_shape=True)
    train(opt, context)

    # Nothing left to do unless profiling was requested.
    if not opt.builtin_profiler > 0:
        return
    profiler.set_state('stop')
    print(profiler.dumps())
def enable_profiler(profile_filename, run=True, continuous_dump=False, aggregate_stats=False):
    """Configure the MXNet profiler and optionally start it.

    Symbolic, imperative, memory and API profiling are all switched on.

    Args:
        profile_filename: Passed to the profiler as its output ``filename``.
        run: The profiler is started only when this is exactly ``True``.
        continuous_dump: Forwarded to ``profiler.set_config``.
        aggregate_stats: Forwarded to ``profiler.set_config``.
    """
    settings = dict(
        profile_symbolic=True,
        profile_imperative=True,
        profile_memory=True,
        profile_api=True,
        filename=profile_filename,
        continuous_dump=continuous_dump,
        aggregate_stats=aggregate_stats,
    )
    profiler.set_config(**settings)
    if run is not True:
        # Identity check on purpose: truthy values other than True do not start it.
        return
    profiler.set_state('run')
Ejemplo n.º 10
0
def main():
    """Dispatch to symbolic or regular training, optionally profiled.

    ``train_symbolic(opt, context)`` is used when ``opt.mode`` is
    ``'symbolic'`` and ``train(opt, context)`` otherwise.  A positive
    ``opt.builtin_profiler`` brackets the call with the MXNet profiler and
    prints its aggregate statistics afterwards.
    """
    if opt.builtin_profiler > 0:
        profiler.set_config(profile_all=True, aggregate_stats=True)
        profiler.set_state('run')

    # Only the selected trainer name is looked up, as in an if/else statement.
    run_training = train_symbolic if opt.mode == 'symbolic' else train
    run_training(opt, context)

    if not opt.builtin_profiler > 0:
        return
    profiler.set_state('stop')
    print(profiler.dumps())
Ejemplo n.º 11
0
def enable_profiler(profile_filename, run=True, continuous_dump=False, aggregate_stats=False):
    """Configure the MXNet profiler, announce its output file, maybe start it.

    Symbolic, imperative, memory and API profiling are all switched on.

    Args:
        profile_filename: Passed to the profiler as its output ``filename``
            and echoed on stdout.
        run: The profiler is started only when this is exactly ``True``.
        continuous_dump: Forwarded to ``profiler.set_config``.
        aggregate_stats: Forwarded to ``profiler.set_config``.
    """
    profiler.set_config(
        profile_symbolic=True,
        profile_imperative=True,
        profile_memory=True,
        profile_api=True,
        filename=profile_filename,
        continuous_dump=continuous_dump,
        aggregate_stats=aggregate_stats,
    )
    print('profile file save to {}'.format(profile_filename))
    # Identity check on purpose: truthy values other than True do not start it.
    if run is True:
        profiler.set_state('run')
Ejemplo n.º 12
0
def enable_profiler(run=True, continuous_dump=False, aggregate_stats=False,
                    profile_filename='test_profile.json'):
    """Configure the MXNet profiler, announce its output file, maybe start it.

    Symbolic, imperative, memory and API profiling are all switched on.

    Args:
        run: The profiler is started only when this is exactly ``True``.
        continuous_dump: Forwarded to ``profiler.set_config``.
        aggregate_stats: Forwarded to ``profiler.set_config``.
        profile_filename: Output file handed to the profiler.  Defaults to
            ``'test_profile.json'``, the value that used to be hard-coded, so
            existing callers are unaffected.
    """
    profiler.set_config(profile_symbolic=True,
                        profile_imperative=True,
                        profile_memory=True,
                        profile_api=True,
                        filename=profile_filename,
                        continuous_dump=continuous_dump,
                        aggregate_stats=aggregate_stats)
    print('profile file save to {}'.format(profile_filename))
    # Identity check kept as-is: truthy values other than True do not start it.
    if run is True:
        profiler.set_state('run')
Ejemplo n.º 13
0
def main():
    """Train the image classifier, symbolically or through ``train``.

    In ``'symbolic'`` mode a softmax symbol is built on top of ``net`` and
    fitted with the Module API while tracking accuracy, top-5 accuracy and
    cross-entropy.  Otherwise ``train(opt, context)`` is called, after
    hybridizing ``net`` in ``'hybrid'`` mode.  A positive
    ``opt.builtin_profiler`` wraps the run in the MXNet profiler and prints
    its aggregate statistics at the end.
    """
    if opt.builtin_profiler > 0:
        profiler.set_config(profile_all=True, aggregate_stats=True)
        profiler.set_state('run')

    if opt.mode != 'symbolic':
        if opt.mode == 'hybrid':
            net.hybridize()
        train(opt, context)
    else:
        data_sym = mx.sym.var('data')
        softmax_sym = mx.sym.SoftmaxOutput(net(data_sym), name='softmax')
        module = mx.mod.Module(softmax_sym, context=context)
        store = mx.kv.create(opt.kvstore)
        metrics = [
            mx.metric.create('acc'),
            mx.metric.create('top_k_accuracy', top_k=5),
            mx.metric.create('ce'),
        ]
        train_iter, val_iter = get_data_iters(dataset, batch_size,
                                              store.num_workers, store.rank)
        module.fit(
            train_iter,
            eval_data=val_iter,
            eval_metric=metrics,
            num_epoch=opt.epochs,
            kvstore=store,
            batch_end_callback=mx.callback.Speedometer(batch_size, max(1, opt.log_interval)),
            epoch_end_callback=mx.callback.do_checkpoint('image-classifier-%s' % opt.model),
            optimizer='sgd',
            optimizer_params={
                'learning_rate': opt.lr,
                'wd': opt.wd,
                'momentum': opt.momentum,
                'multi_precision': True,
            },
            initializer=mx.init.Xavier(magnitude=2),
        )
        module.save_params('image-classifier-%s-%d-final.params' % (opt.model, opt.epochs))

    if not opt.builtin_profiler > 0:
        return
    profiler.set_state('stop')
    print(profiler.dumps())
Ejemplo n.º 14
0
def main():
    """Train the image classifier in the mode selected by ``opt.mode``.

    ``"symbolic"`` wraps ``net`` in a symbol graph (casting to and from
    float16 when ``opt.dtype`` asks for it) and trains it through the Module
    API; every other mode calls ``train(opt, context)``, hybridizing ``net``
    first when the mode is ``"hybrid"``.  When ``opt.builtin_profiler`` is
    positive the run is profiled and the aggregate statistics are printed.
    """
    if opt.builtin_profiler > 0:
        profiler.set_config(profile_all=True, aggregate_stats=True)
        profiler.set_state("run")

    if opt.mode != "symbolic":
        if opt.mode == "hybrid":
            net.hybridize()
        train(opt, context)
    else:
        half_precision = opt.dtype == "float16"
        graph_input = mx.sym.var("data")
        if half_precision:
            graph_input = mx.sym.Cast(data=graph_input, dtype=np.float16)
        graph_output = net(graph_input)
        if half_precision:
            # Cast back so the softmax output layer works on float32.
            graph_output = mx.sym.Cast(data=graph_output, dtype=np.float32)
        softmax_sym = mx.sym.SoftmaxOutput(graph_output, name="softmax")
        module = mx.mod.Module(softmax_sym, context=context)
        train_iter, val_iter = get_data_iters(dataset, batch_size, opt)
        module.fit(
            train_iter,
            eval_data=val_iter,
            num_epoch=opt.epochs,
            kvstore=kv,
            batch_end_callback=mx.callback.Speedometer(batch_size, max(1, opt.log_interval)),
            epoch_end_callback=mx.callback.do_checkpoint("image-classifier-%s" % opt.model),
            optimizer="sgd",
            optimizer_params={
                "learning_rate": opt.lr,
                "wd": opt.wd,
                "momentum": opt.momentum,
                "multi_precision": True,
            },
            initializer=mx.init.Xavier(magnitude=2),
        )
        module.save_parameters("image-classifier-%s-%d-final.params" % (opt.model, opt.epochs))

    if not opt.builtin_profiler > 0:
        return
    profiler.set_state("stop")
    print(profiler.dumps())
Ejemplo n.º 15
0
    def cpp_profile_it(*args, **kwargs):
        """Run the wrapped ``func`` under the MXNet profiler and parse its dump.

        The operator name used to pick rows out of the profiler dump comes
        from ``args[0].__name__`` when positional arguments are present,
        otherwise from ``kwargs['block']._op_name``.

        Returns:
            A ``(result, profiler_output)`` tuple: the return value of
            ``func`` and whatever ``parse_profiler_dump`` extracts.

        Raises:
            ValueError: If neither a positional argument nor a ``block``
                keyword is available.  The wrapped call has already run by
                the time this is raised.
        """
        profiler.set_config(profile_all=True, aggregate_stats=True)
        profiler.set_state('run')
        result = func(*args, **kwargs)
        profiler.set_state('stop')

        # Collect the aggregate table and reset it.
        raw_dump = profiler.dumps(reset=True)

        # NOTE: The name lookup should go away once profiler output no longer
        # has to be parsed and the newer profiler APIs - get_summary(),
        # reset() - are used instead.
        if not args and 'block' not in kwargs:
            raise ValueError("Unable to identify operator name to extract profiler output!")
        op_name = args[0].__name__ if args else kwargs['block']._op_name

        return result, parse_profiler_dump(op_name, raw_dump)
Ejemplo n.º 16
0
# Horovod: fetch the model's parameters and, when any were returned,
# broadcast them with rank 0 as the root
params = model.collect_params()
if params is not None:
    hvd.broadcast_parameters(params, root_rank=0)

# Horovod: create DistributedTrainer, a subclass of gluon.Trainer
# NOTE(review): `opt` is defined outside this view -- presumably the
# optimizer instance; confirm.
trainer = hvd.DistributedTrainer(params, opt)

# Create loss function and train metric
loss_fn = gluon.loss.SoftmaxCrossEntropyLoss()
metric = mx.metric.Accuracy()

# Set profiler: profile everything, aggregate statistics, and write the
# result to profile_mx_mnist.json. The profiler is only configured here;
# it is started later from inside the training loop.
profiler.set_config(profile_all=True,
                    aggregate_stats=True,
                    filename="profile_mx_mnist.json")

# Train model
for epoch in range(args.epochs):
    tic = time.time()
    train_data.reset()
    metric.reset()
    for nbatch, batch in enumerate(train_data, start=1):
        # Start and pause profiling
        if nbatch == 100:
            if epoch == 0:
                profiler.set_state('run')
            else:
                profiler.resume()
        elif nbatch == 200:
Ejemplo n.º 17
0
import sys
import os
import math
import mxnet as mx

import time
import psutil
import gc

from mxnet import profiler
from util import *

# Configure the MXNet profiler at import time: profile everything, keep
# aggregate statistics, dump continuously, and write to profile_output.json.
# This only sets the configuration; nothing in this statement starts profiling.
profiler.set_config(profile_all=True,
                    aggregate_stats=True,
                    continuous_dump=True,
                    filename='profile_output.json')

def cpuStats():
    """Return the memory used by the current process, in GiB.

    The value is the first field of psutil's ``memory_info()`` result (the
    resident set size, in bytes) divided by 2**30.
    """
    current_process = psutil.Process(os.getpid())
    resident_bytes = current_process.memory_info().rss
    return resident_bytes / 2. ** 30


# Module-level constants; the code that consumes them is outside this view.
# NOTE(review): the names suggest image-augmentation strengths (colour jitter
# and lighting noise) -- confirm against the transforms that use them.
jitter_param = 0.4
lighting_param = 0.1
# NOTE(review): presumably per-channel RGB means for input normalisation on a
# 0-255 scale -- confirm against usage.
mean_rgb = [123.68, 116.779, 103.939]
Ejemplo n.º 18
0
import mxnet as mx
from mxnet import nd
from mxnet import profiler

# Profile everything, aggregate the statistics and write the trace to a JSON file.
profiler.set_config(profile_all=True,
                    aggregate_stats=True,
                    filename='cpu_gpu_data_copy_profiler_output.json')

# Create two large tensors on CPU and wait for them to be materialised so
# their creation is not part of the profiled region.
data1 = nd.random.uniform(shape=(10000, 10000), ctx=mx.cpu())
data2 = nd.random.uniform(shape=(10000, 10000), ctx=mx.cpu())
nd.waitall()

# Profile the copies and the operation only.
profiler.set_state('run')

# Copy data to GPU. as_in_context() returns the array on the target device
# instead of moving the original in place, so the results must be kept;
# discarding them (as before) left the addition below running on the CPU
# arrays.
data1_gpu = data1.as_in_context(context=mx.gpu(0))
data2_gpu = data2.as_in_context(context=mx.gpu(0))

# Do the operation on GPU.
res = data1_gpu + data2_gpu

# Copy the result back to CPU and wait for all queued work before stopping.
res_cpu = res.as_in_context(context=mx.cpu())
nd.waitall()
profiler.set_state('stop')

# Print the aggregate table, then write the trace file.
print(profiler.dumps())

profiler.dump()
Ejemplo n.º 19
0
def train_net(net, config, check_flag, logger, sig_state, sig_pgbar, sig_table):
    """Train ``net`` on CIFAR-10 according to ``config`` and report progress.

    Sets up the output directory tree, the CSV / log / TensorBoard writers
    and the data augmentation, runs the nested ``train`` loop, then closes
    the writers and resets ``logging``.

    Args:
        net: Gluon network to train; it is initialized or loaded, hybridized
            and saved by this function.
        config: Configuration object; its ``train_cfg``, ``lr_cfg``,
            ``data_cfg``, ``net_cfg``, ``save_cfg`` and ``dir_cfg`` sections
            are read.
        check_flag: Callable returning an indexable pair; element 0 is
            treated as a pause request and element 1 as a stop request.
        logger: Object added to the ``logging`` handler list next to the
            stream and file handlers.
        sig_state: Object with ``emit``; receives 1 when training starts, 2
            when a pause is requested and 0 when this function finishes.
        sig_pgbar: Object with ``emit``; receives -1 on entry, 0 before the
            first epoch and the running iteration count after every batch.
        sig_table: Object with ``emit``; receives one list of per-epoch
            statistics (the same fields as the CSV row).
    """
    print(config)
    # config = Configs()
    # matplotlib.use('Agg')
    # import matplotlib.pyplot as plt
    sig_pgbar.emit(-1)
    mx.random.seed(1)
    # Select the Agg backend before pyplot is imported.
    matplotlib.use('Agg')
    import matplotlib.pyplot as plt

    classes = 10
    num_epochs = config.train_cfg.epoch
    batch_size = config.train_cfg.batchsize
    optimizer = config.lr_cfg.optimizer
    lr = config.lr_cfg.lr
    num_gpus = config.train_cfg.gpu
    # The configured batch size is per device; scale it to the global size.
    batch_size *= max(1, num_gpus)
    context = [mx.gpu(i)
               for i in range(num_gpus)] if num_gpus > 0 else [mx.cpu()]
    num_workers = config.data_cfg.worker

    # Scheduler lengths are expressed in iterations.
    # NOTE(review): 50000 is hard-coded -- presumably the CIFAR-10 training
    # set size, making 50000//batch_size the iterations per epoch; confirm.
    warmup = config.lr_cfg.warmup
    if config.lr_cfg.decay == 'cosine':
        lr_sch = lr_scheduler.CosineScheduler((50000//batch_size)*num_epochs,
                                              base_lr=lr,
                                              warmup_steps=warmup *
                                              (50000//batch_size),
                                              final_lr=1e-5)
    else:
        lr_sch = lr_scheduler.FactorScheduler((50000//batch_size)*config.lr_cfg.factor_epoch,
                                              factor=config.lr_cfg.factor,
                                              base_lr=lr,
                                              warmup_steps=warmup*(50000//batch_size))

    # The model name (with feature suffixes) names every output artefact.
    model_name = config.net_cfg.name

    if config.data_cfg.mixup:
        model_name += '_mixup'
    if config.train_cfg.amp:
        model_name += '_amp'

    # Use a timestamp-suffixed directory if one for this model already exists.
    base_dir = './'+model_name
    if os.path.exists(base_dir):
        base_dir = base_dir + '-' + \
            time.strftime("%m-%d-%H.%M.%S", time.localtime())
    makedirs(base_dir)

    if config.save_cfg.tensorboard:
        logdir = base_dir+'/tb/'+model_name
        if os.path.exists(logdir):
            logdir = logdir + '-' + \
                time.strftime("%m-%d-%H.%M.%S", time.localtime())
        sw = SummaryWriter(logdir=logdir, flush_secs=5, verbose=False)
        # Drop a small batch file next to the logs that launches TensorBoard.
        cmd_file = open(base_dir+'/tb.bat', mode='w')
        cmd_file.write('tensorboard --logdir=./')
        cmd_file.close()

    # Periodic checkpoints are written every `save_period` epochs.
    save_period = 10
    save_dir = base_dir+'/'+'params'
    makedirs(save_dir)

    plot_name = base_dir+'/'+'plot'
    makedirs(plot_name)

    stat_name = base_dir+'/'+'stat.txt'

    # Per-epoch statistics are appended to a CSV file; the header goes first.
    csv_name = base_dir+'/'+'data.csv'
    if os.path.exists(csv_name):
        csv_name = base_dir+'/'+'data-' + \
            time.strftime("%m-%d-%H.%M.%S", time.localtime())+'.csv'
    csv_file = open(csv_name, mode='w', newline='')
    csv_writer = csv.writer(csv_file)
    csv_writer.writerow(['Epoch', 'train_loss', 'train_acc',
                         'valid_loss', 'valid_acc', 'lr', 'time'])

    # NOTE(review): the log file path is built from `model_name`, not from
    # `base_dir`, so when `base_dir` carries a timestamp suffix the log lands
    # in the older un-suffixed directory -- confirm this is intended.
    logging_handlers = [logging.StreamHandler(), logger]
    logging_handlers.append(logging.FileHandler(
        '%s/train_cifar10_%s.log' % (model_name, model_name)))

    logging.basicConfig(level=logging.INFO, handlers=logging_handlers)
    logging.info(config)

    if config.train_cfg.amp:
        amp.init()

    if config.save_cfg.profiler:
        profiler.set_config(profile_all=True,
                            aggregate_stats=True,
                            continuous_dump=True,
                            filename=base_dir+'/%s_profile.json' % model_name)
        is_profiler_run = False

    # Training-time augmentation pipeline, assembled from the enabled options.
    trans_list = []
    imgsize = config.data_cfg.size
    if config.data_cfg.crop:
        trans_list.append(gcv_transforms.RandomCrop(
            32, pad=config.data_cfg.crop_pad))
    if config.data_cfg.cutout:
        trans_list.append(CutOut(config.data_cfg.cutout_size))
    if config.data_cfg.flip:
        trans_list.append(transforms.RandomFlipLeftRight())
    if config.data_cfg.erase:
        trans_list.append(gcv_transforms.block.RandomErasing(s_max=0.25))
    trans_list.append(transforms.Resize(imgsize))
    trans_list.append(transforms.ToTensor())
    trans_list.append(transforms.Normalize([0.4914, 0.4822, 0.4465],
                                           [0.2023, 0.1994, 0.2010]))

    transform_train = transforms.Compose(trans_list)

    # Evaluation uses only the resize / tensor / normalize steps.
    transform_test = transforms.Compose([
        transforms.Resize(imgsize),
        transforms.ToTensor(),
        transforms.Normalize([0.4914, 0.4822, 0.4465],
                             [0.2023, 0.1994, 0.2010])
    ])

    def label_transform(label, classes):
        """Return a one-hot ``(len(label), classes)`` array on ``label``'s context."""
        ind = label.astype('int')
        res = nd.zeros((ind.shape[0], classes), ctx=label.context)
        res[nd.arange(ind.shape[0], ctx=label.context), ind] = 1
        return res

    def test(ctx, val_data):
        """Evaluate ``net`` on ``val_data``.

        Returns ``(metric_name, accuracy, loss)`` where the loss is the
        summed cross-entropy divided by ``batch_size * len(val_data)``.
        """
        metric = mx.metric.Accuracy()
        loss_fn = gluon.loss.SoftmaxCrossEntropyLoss()
        num_batch = len(val_data)
        test_loss = 0
        for i, batch in enumerate(val_data):
            # Split each batch across the devices in `ctx`.
            data = gluon.utils.split_and_load(
                batch[0], ctx_list=ctx, batch_axis=0)
            label = gluon.utils.split_and_load(
                batch[1], ctx_list=ctx, batch_axis=0)
            outputs = [net(X) for X in data]
            loss = [loss_fn(yhat, y) for yhat, y in zip(outputs, label)]
            metric.update(label, outputs)
            test_loss += sum([l.sum().asscalar() for l in loss])
        test_loss /= batch_size * num_batch
        name, val_acc = metric.get()
        return name, val_acc, test_loss

    def train(epochs, ctx):
        """Run the training loop for ``epochs`` epochs on the devices in ``ctx``.

        Initializes or loads the parameters, builds the CIFAR-10 loaders and
        the trainer, then trains, evaluates, logs, plots and checkpoints once
        per epoch.  Returns early when the stop flag is raised.
        """
        if isinstance(ctx, mx.Context):
            ctx = [ctx]

        # Fresh initialization with the configured initializer, or load a
        # parameter file.
        if config.train_cfg.param_init:
            init_func = getattr(mx.init, config.train_cfg.init)
            net.initialize(init_func(), ctx=ctx, force_reinit=True)
        else:
            net.load_parameters(config.train_cfg.param_file, ctx=ctx)

        summary(net, stat_name, nd.uniform(
            shape=(1, 3, imgsize, imgsize), ctx=ctx[0]))
        # net = nn.HybridBlock()
        net.hybridize()

        root = config.dir_cfg.dataset
        train_data = gluon.data.DataLoader(
            gluon.data.vision.CIFAR10(
                root=root, train=True).transform_first(transform_train),
            batch_size=batch_size, shuffle=True, last_batch='discard', num_workers=num_workers)

        val_data = gluon.data.DataLoader(
            gluon.data.vision.CIFAR10(
                root=root, train=False).transform_first(transform_test),
            batch_size=batch_size, shuffle=False, num_workers=num_workers)

        trainer_arg = {'learning_rate': config.lr_cfg.lr,
                       'wd': config.lr_cfg.wd, 'lr_scheduler': lr_sch}
        # NOTE(review): eval() executes whatever string the configuration
        # holds; only safe if `extra_arg` can never come from untrusted input.
        extra_arg = eval(config.lr_cfg.extra_arg)
        trainer_arg.update(extra_arg)
        trainer = gluon.Trainer(net.collect_params(), optimizer, trainer_arg)
        if config.train_cfg.amp:
            amp.init_trainer(trainer)
        # `metric` tracks accuracy against the original labels, `train_metric`
        # tracks RMSE against the (possibly mixed) labels.
        metric = mx.metric.Accuracy()
        train_metric = mx.metric.RMSE()
        # Mixup produces dense (one-hot blended) labels.
        loss_fn = gluon.loss.SoftmaxCrossEntropyLoss(
            sparse_label=False if config.data_cfg.mixup else True)
        train_history = TrainingHistory(['training-error', 'validation-error'])
        # acc_history = TrainingHistory(['training-acc', 'validation-acc'])
        loss_history = TrainingHistory(['training-loss', 'validation-loss'])

        # Global batch counter across all epochs.
        iteration = 0

        best_val_score = 0

        # print('start training')
        sig_state.emit(1)
        sig_pgbar.emit(0)
        # signal.emit('Training')
        for epoch in range(epochs):
            tic = time.time()
            train_metric.reset()
            metric.reset()
            train_loss = 0
            num_batch = len(train_data)
            alpha = 1
            for i, batch in enumerate(train_data):
                # Profile exactly one iteration (the second batch of the first
                # epoch); the matching stop is further down in this loop body.
                # NOTE(review): this assignment creates a local inside
                # `train`; it does not update the `is_profiler_run` set in
                # `train_net`, and nothing in view reads either one.
                if epoch == 0 and iteration == 1 and config.save_cfg.profiler:
                    profiler.set_state('run')
                    is_profiler_run = True
                if epoch == 0 and iteration == 1 and config.save_cfg.tensorboard:
                    sw.add_graph(net)
                # Mixup coefficient; forced to 1 (no mixing) when mixup is off
                # and during the last 20 epochs.
                lam = np.random.beta(alpha, alpha)
                if epoch >= epochs - 20 or not config.data_cfg.mixup:
                    lam = 1

                data_1 = gluon.utils.split_and_load(
                    batch[0], ctx_list=ctx, batch_axis=0)
                label_1 = gluon.utils.split_and_load(
                    batch[1], ctx_list=ctx, batch_axis=0)

                if not config.data_cfg.mixup:
                    data = data_1
                    label = label_1
                else:
                    # Blend every sample with the batch in reversed order, and
                    # blend the one-hot labels the same way.
                    data = [lam*X + (1-lam)*X[::-1] for X in data_1]
                    label = []
                    for Y in label_1:
                        y1 = label_transform(Y, classes)
                        y2 = label_transform(Y[::-1], classes)
                        label.append(lam*y1 + (1-lam)*y2)

                with ag.record():
                    output = [net(X) for X in data]
                    loss = [loss_fn(yhat, y) for yhat, y in zip(output, label)]
                if config.train_cfg.amp:
                    with ag.record():
                        with amp.scale_loss(loss, trainer) as scaled_loss:
                            ag.backward(scaled_loss)
                            # scaled_loss.backward()
                else:
                    for l in loss:
                        l.backward()
                trainer.step(batch_size)
                train_loss += sum([l.sum().asscalar() for l in loss])

                output_softmax = [nd.SoftmaxActivation(out) for out in output]
                train_metric.update(label, output_softmax)
                metric.update(label_1, output_softmax)
                name, acc = train_metric.get()
                if config.save_cfg.tensorboard:
                    sw.add_scalar(tag='lr', value=trainer.learning_rate,
                                  global_step=iteration)
                # End of the single profiled iteration: wait for queued work,
                # stop the profiler and write the trace.
                if epoch == 0 and iteration == 1 and config.save_cfg.profiler:
                    nd.waitall()
                    profiler.set_state('stop')
                    profiler.dump()
                iteration += 1
                sig_pgbar.emit(iteration)
                # Pause / stop handling: report the pause once, then poll the
                # flags every 5 seconds until resumed; a stop request returns
                # from `train` immediately.
                if check_flag()[0]:
                    sig_state.emit(2)
                while(check_flag()[0] or check_flag()[1]):
                    if check_flag()[1]:
                        print('stop')
                        return
                    else:
                        time.sleep(5)
                        print('pausing')

            epoch_time = time.time() - tic
            train_loss /= batch_size * num_batch
            name, acc = train_metric.get()
            _, train_acc = metric.get()
            name, val_acc, _ = test(ctx, val_data)
            # if config.data_cfg.mixup:
            #     train_history.update([acc, 1-val_acc])
            #     plt.cla()
            #     train_history.plot(save_path='%s/%s_history.png' %
            #                        (plot_name, model_name))
            # else:
            train_history.update([1-train_acc, 1-val_acc])
            plt.cla()
            train_history.plot(save_path='%s/%s_history.png' %
                               (plot_name, model_name))

            # Keep a checkpoint whenever validation accuracy improves.
            if val_acc > best_val_score:
                best_val_score = val_acc
                net.save_parameters('%s/%.4f-cifar-%s-%d-best.params' %
                                    (save_dir, best_val_score, model_name, epoch))

            current_lr = trainer.learning_rate
            # NOTE(review): this is a second full validation pass in the same
            # epoch; the first call above discarded the loss it computed.
            name, val_acc, val_loss = test(ctx, val_data)

            logging.info('[Epoch %d] loss=%f train_acc=%f train_RMSE=%f\n     val_acc=%f val_loss=%f lr=%f time: %f' %
                         (epoch, train_loss, train_acc, acc, val_acc, val_loss, current_lr, epoch_time))
            loss_history.update([train_loss, val_loss])
            plt.cla()
            loss_history.plot(save_path='%s/%s_loss.png' %
                              (plot_name, model_name), y_lim=(0, 2), legend_loc='best')
            if config.save_cfg.tensorboard:
                sw._add_scalars(tag='Acc',
                                scalar_dict={'train_acc': train_acc, 'test_acc': val_acc}, global_step=epoch)
                sw._add_scalars(tag='Loss',
                                scalar_dict={'train_loss': train_loss, 'test_loss': val_loss}, global_step=epoch)

            # Publish the epoch row and persist it to the CSV file right away.
            sig_table.emit([epoch, train_loss, train_acc,
                            val_loss, val_acc, current_lr, epoch_time])
            csv_writer.writerow([epoch, train_loss, train_acc,
                                 val_loss, val_acc, current_lr, epoch_time])
            csv_file.flush()

            if save_period and save_dir and (epoch + 1) % save_period == 0:
                net.save_parameters('%s/cifar10-%s-%d.params' %
                                    (save_dir, model_name, epoch))
        # Always save the parameters of the final epoch.
        if save_period and save_dir:
            net.save_parameters('%s/cifar10-%s-%d.params' %
                                (save_dir, model_name, epochs-1))

    train(num_epochs, context)
    if config.save_cfg.tensorboard:
        sw.close()

    for ctx in context:
        ctx.empty_cache()

    csv_file.close()
    # Shut logging down and reload the module so a later call can configure
    # its handlers again with basicConfig.
    logging.shutdown()
    reload(logging)
    sig_state.emit(0)
Ejemplo n.º 20
0
        model_prefix, load_epoch)

    # initialize the module
    mod = mx.module.Module(symbol=sym,
                           context=ctx,
                           data_names=['user', 'item'],
                           label_names=['score'])
    mod.bind(data_shapes=train_iter.provide_data,
             label_shapes=train_iter.provide_label)

    # get the sparse weight parameter
    mod.set_params(arg_params=arg_params, aux_params=aux_params)

    # profile
    profiler.set_config(profile_all=True,
                        aggregate_stats=True,
                        filename='profile_neumf.json')
    profiler.set_state('run')

    if benchmark:
        logging.info('Evaluating...')
        (hits, ndcgs) = evaluate_model(mod, testRatings, testNegatives, topK,
                                       evaluation_threads)
        hr, ndcg = np.array(hits).mean(), np.array(ndcgs).mean()
        print('HR = %.4f, NDCG = %.4f' % (hr, ndcg))
        logging.info('Evaluating completed')
        profiler.set_state('stop')
    else:
        logging.info('Inference started ...')
        nbatch = 0
        tic = time()
import mxnet as mx
from mxnet import autograd
from mxnet import profiler

# ---- Profiler configuration: profile everything, aggregate the stats ----
profiler.set_config(
    profile_all=True,
    aggregate_stats=True,
    filename='cpu_mnist_cnn_profile_output.json',
)

# ---- Network: two conv / max-pool stages, flatten, two dense layers ----
from mxnet import gluon
net = gluon.nn.HybridSequential()
with net.name_scope():
    # The layers are created and registered in the listed order.
    net.add(
        gluon.nn.Conv2D(channels=20, kernel_size=5, activation='relu'),
        gluon.nn.MaxPool2D(pool_size=2, strides=2),
        gluon.nn.Conv2D(channels=50, kernel_size=5, activation='relu'),
        gluon.nn.MaxPool2D(pool_size=2, strides=2),
        gluon.nn.Flatten(),
        gluon.nn.Dense(512, activation="relu"),
        gluon.nn.Dense(10),
    )

# ---- Training data: shuffled MNIST training split, batches of 64 ----
from mxnet.gluon.data.vision import transforms
train_data = gluon.data.DataLoader(
    gluon.data.vision.MNIST(train=True).transform_first(transforms.ToTensor()),
    batch_size=64,
    shuffle=True,
)

# ---- Context: run on CPU ----
ctx = mx.cpu()
Ejemplo n.º 22
0
def train(net, train_data, val_data, eval_metric, batch_size, ctx, args):
    """Train an RPN/RCNN detection network, optionally under the MXNet profiler.

    Only the parameters returned by ``net.collect_train_params()`` receive
    gradients; every other parameter has ``grad_req`` set to ``'null'``.
    Depending on ``args.horovod`` the optimizer is a Horovod
    ``DistributedTrainer`` or a ``gluon.Trainer`` bound to the module-level
    ``kv`` store.

    Parameters
    ----------
    net : block
        Network to train.  Must provide ``collect_params``,
        ``collect_train_params`` and ``hybridize``.
    train_data : iterable
        Yields training batches accepted by ``split_and_load``.  When
        ``args.mixup`` is set, ``train_data._dataset._data.set_mixup`` is
        called to switch mixup on and off.
    val_data, eval_metric
        Forwarded unchanged to ``validate``.
    batch_size : number
        Passed to ``trainer.step`` and used for the samples/sec log line.
    ctx : list
        Device contexts; one result is collected per context for each batch.
    args : namespace
        Run configuration (learning-rate schedule, profiler switch, logging
        and checkpoint options, distributed-training flags).

    Notes
    -----
    With ``args.profiler == "1"`` the profiler is started at the beginning of
    every epoch; once batch index 100 has been processed the profiler is
    stopped, ``profiler.dumps()`` is printed and the epoch's batch loop ends.

    A non-positive ``args.lr_warmup`` disables warm-up.  (Previously a value
    of exactly 0 raised ``ZeroDivisionError`` on the first batch.)
    """
    is_perseus = "perseus" in args.kv_store
    profiling = args.profiler == "1"
    # Rank used in log lines and in the profiler output file name; ``kv`` is
    # only consulted when the perseus kvstore is in use.
    cur_rank = kv.rank if is_perseus else 0

    if is_perseus:
        print("rank:{}, training...".format(kv.rank))

    if profiling:
        # profiler config
        profiler.set_config(profile_all=True,
                            aggregate_stats=True,
                            continuous_dump=True,
                            filename='profile_output_{}.json'.format(cur_rank))
    # Freeze everything, then re-enable gradients for the trainable subset.
    net.collect_params().setattr('grad_req', 'null')
    net.collect_train_params().setattr('grad_req', 'write')
    optimizer_params = {
        'learning_rate': args.lr,
        'wd': args.wd,
        'momentum': args.momentum
    }
    if args.amp:
        optimizer_params['multi_precision'] = True
    if args.horovod:
        hvd.broadcast_parameters(net.collect_params(), root_rank=0)
        trainer = hvd.DistributedTrainer(
            net.collect_train_params(
            ),  # fix batchnorm, fix first stage, etc...
            'sgd',
            optimizer_params)
    else:
        trainer = gluon.Trainer(
            net.collect_train_params(
            ),  # fix batchnorm, fix first stage, etc...
            'sgd',
            optimizer_params,
            update_on_kvstore=None,
            kvstore=kv)

    if args.amp:
        amp.init_trainer(trainer)

    # lr decay policy
    lr_decay = float(args.lr_decay)
    lr_steps = sorted(
        [float(ls) for ls in args.lr_decay_epoch.split(',') if ls.strip()])
    lr_warmup = float(args.lr_warmup)  # avoid int division

    # TODO(zhreshold) losses?
    rpn_cls_loss = mx.gluon.loss.SigmoidBinaryCrossEntropyLoss(
        from_sigmoid=False)
    rpn_box_loss = mx.gluon.loss.HuberLoss(
        rho=args.rpn_smoothl1_rho)  # == smoothl1
    rcnn_cls_loss = mx.gluon.loss.SoftmaxCrossEntropyLoss()
    rcnn_box_loss = mx.gluon.loss.HuberLoss(
        rho=args.rcnn_smoothl1_rho)  # == smoothl1
    # Loss metrics, fed from the first len(metrics) entries of each result.
    metrics = [
        mx.metric.Loss('RPN_Conf'),
        mx.metric.Loss('RPN_SmoothL1'),
        mx.metric.Loss('RCNN_CrossEntropy'),
        mx.metric.Loss('RCNN_SmoothL1'),
    ]

    # Additional metrics, fed from the remaining entries of each result.
    rpn_acc_metric = RPNAccMetric()
    rpn_bbox_metric = RPNL1LossMetric()
    rcnn_acc_metric = RCNNAccMetric()
    rcnn_bbox_metric = RCNNL1LossMetric()
    metrics2 = [
        rpn_acc_metric, rpn_bbox_metric, rcnn_acc_metric, rcnn_bbox_metric
    ]

    # set up logger (root logger at INFO, mirrored to <save_prefix>_train.log)
    logging.basicConfig()
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    log_file_path = args.save_prefix + '_train.log'
    log_dir = os.path.dirname(log_file_path)
    if log_dir and not os.path.exists(log_dir):
        os.makedirs(log_dir)
    fh = logging.FileHandler(log_file_path)
    logger.addHandler(fh)
    if args.custom_model:
        logger.info(
            'Custom model enabled. Expert Only!! Currently non-FPN model is not supported!!'
            ' Default setting is for MS-COCO.')
    logger.info(args)

    if args.verbose:
        logger.info('Trainable parameters:')
        logger.info(net.collect_train_params().keys())
    logger.info('Start training from [Epoch {}]'.format(args.start_epoch))
    best_map = [0]
    for epoch in range(args.start_epoch, args.epochs):
        mix_ratio = 1.0
        if not args.disable_hybridization:
            net.hybridize(static_alloc=args.static_alloc)
        rcnn_task = ForwardBackwardTask(net,
                                        trainer,
                                        rpn_cls_loss,
                                        rpn_box_loss,
                                        rcnn_cls_loss,
                                        rcnn_box_loss,
                                        mix_ratio=1.0)
        if is_perseus:
            args.executor_threads = 1
        # The parallel executor is only used for plain (non-horovod,
        # non-perseus) runs; otherwise forward/backward runs inline.
        executor = Parallel(args.executor_threads, rcnn_task) if (
            not args.horovod and not is_perseus) else None
        if args.mixup:
            # TODO(zhreshold) only support evenly mixup now, target generator needs to be modified otherwise
            train_data._dataset._data.set_mixup(np.random.uniform, 0.5, 0.5)
            mix_ratio = 0.5
            if epoch >= args.epochs - args.no_mixup_epochs:
                train_data._dataset._data.set_mixup(None)
                mix_ratio = 1.0
        # Apply every decay milestone that has been reached by this epoch.
        while lr_steps and epoch >= lr_steps[0]:
            new_lr = trainer.learning_rate * lr_decay
            lr_steps.pop(0)
            trainer.set_learning_rate(new_lr)
            logger.info("[Epoch {}] Set learning rate to {}".format(
                epoch, new_lr))
        for metric in metrics:
            metric.reset()
        tic = time.time()
        btic = time.time()
        base_lr = trainer.learning_rate
        rcnn_task.mix_ratio = mix_ratio

        if profiling:
            # profiler 1
            profiler.set_state('run')

        for i, batch in enumerate(train_data):
            # Warm-up only applies in epoch 0 and only for a positive
            # warm-up length; the guard also avoids dividing by zero.
            if epoch == 0 and lr_warmup > 0 and i <= lr_warmup:
                # adjust based on real percentage
                new_lr = base_lr * get_lr_at_iter(i / lr_warmup,
                                                  args.lr_warmup_factor)
                if new_lr != trainer.learning_rate:
                    # A falsy log interval disables this message, matching
                    # the metrics log further down.
                    if args.log_interval and i % args.log_interval == 0:
                        logger.info(
                            '[Epoch 0 Iteration {}] Set learning rate to {}'.
                            format(i, new_lr))
                    trainer.set_learning_rate(new_lr)
            batch = split_and_load(batch, ctx_list=ctx)
            metric_losses = [[] for _ in metrics]
            add_losses = [[] for _ in metrics2]
            if executor is not None:
                for data in zip(*batch):
                    executor.put(data)
            for _ in range(len(ctx)):
                if executor is not None:
                    result = executor.get()
                else:
                    result = rcnn_task.forward_backward(list(zip(*batch))[0])

                if (not args.horovod) or hvd.rank() == 0:
                    for k in range(len(metric_losses)):
                        metric_losses[k].append(result[k])
                    for k in range(len(add_losses)):
                        add_losses[k].append(result[len(metric_losses) + k])

            for metric, record in zip(metrics, metric_losses):
                metric.update(0, record)
            for metric, records in zip(metrics2, add_losses):
                for pred in records:
                    metric.update(pred[0], pred[1])

            trainer.step(batch_size)

            # update metrics
            if ((not args.horovod) or hvd.rank() == 0) and args.log_interval \
                    and not (i + 1) % args.log_interval:
                msg = ','.join([
                    '{}={:.3f}'.format(*metric.get())
                    for metric in metrics + metrics2
                ])
                logger.info(
                    'rank:{}, [Epoch {}][Batch {}], Speed: {:.3f} samples/sec, {}'
                    .format(
                        cur_rank, epoch, i,
                        args.log_interval * batch_size / (time.time() - btic),
                        msg))
                btic = time.time()
            # When profiling, stop after batch index 100 and dump the stats.
            if i >= 100 and profiling:
                profiler.set_state('stop')
                print(profiler.dumps())
                break

        # Epoch summary, validation and checkpointing happen on a single
        # worker: any worker of a plain run, or rank 0 under horovod/perseus.
        is_lead_worker = (
            ((not args.horovod) and not is_perseus)
            or (args.horovod and hvd.rank() == 0)
            or (is_perseus and kv.rank == 0))
        if is_lead_worker:
            msg = ','.join(
                ['{}={:.3f}'.format(*metric.get()) for metric in metrics])
            logger.info('[Epoch {}] Training cost: {:.3f}, {}'.format(
                epoch, (time.time() - tic), msg))
            if not (epoch + 1) % args.val_interval:
                # consider reduce the frequency of validation to save time
                map_name, mean_ap = validate(net, val_data, ctx, eval_metric,
                                             args)
                val_msg = '\n'.join(
                    ['{}={}'.format(k, v) for k, v in zip(map_name, mean_ap)])
                logger.info('[Epoch {}] Validation: \n{}'.format(
                    epoch, val_msg))
                current_map = float(mean_ap[-1])
            else:
                current_map = 0.
            save_params(net, logger, best_map, current_map, epoch,
                        args.save_interval, args.save_prefix)
        mx.nd.waitall()
Ejemplo n.º 23
0
def train(net, train_data, val_data, eval_metric, ctx, args):
    """Train an RPN/RCNN detection network driven by the module-level ``cfg``.

    Only the parameters returned by ``net.collect_train_params()`` receive
    gradients.  Each epoch applies the learning-rate schedule, runs the batch
    loop (forward, four losses, backward, optimizer step), logs progress
    through ``tqdm``/``logger`` and finally validates and checkpoints.

    Parameters
    ----------
    net : block
        Network to train.  Called as ``net(data, gt_box)`` and must provide
        ``collect_params``, ``collect_train_params``, ``hybridize`` and
        ``target_generator``.
    train_data : iterable
        Yields training batches accepted by ``split_and_load``.  ``len()`` is
        used when ``cfg.TRAIN.STEPS_PER_EPOCH`` is falsy, and
        ``train_data._dataset.set_mixup`` when ``cfg.TRAIN.MODE_MIXUP`` is set.
    val_data, eval_metric
        Forwarded unchanged to ``validate``.
    ctx : list
        Device contexts handed to ``split_and_load``.
    args : namespace
        Provides ``profile`` (enable profiling) and ``logdir`` (forwarded to
        ``save_params``).

    Notes
    -----
    * The decay milestones are copied from ``cfg.AUTO.LR_DECAY_EPOCH`` so the
      configuration object is not modified while milestones are consumed.
    * If several milestones have already been passed when an epoch starts,
      the decay factor is applied once per milestone.
    * With ``args.profile`` set, the profiler covers iteration 10 of every
      epoch.
    """
    # Freeze everything, then re-enable gradients for the trainable subset.
    net.collect_params().setattr('grad_req', 'null')
    net.collect_train_params().setattr('grad_req', 'write')
    # In FP16 mode the losses are scaled up by this factor and the gradients
    # are scaled back down through the optimizer's 'rescale_grad'.
    rescale_factor = float(
        cfg.GENERAL.FP16_RESCALE_FACTOR) if cfg.GENERAL.FP16 else None
    trainer = gluon.Trainer(
        net.collect_train_params(),  # fix batchnorm, fix first stage, etc...
        'sgd',
        {
            'learning_rate':
            cfg.TRAIN.BASE_LR,
            'wd':
            cfg.TRAIN.WEIGHT_DECAY,
            'momentum':
            cfg.TRAIN.MOMENTUM,
            'clip_gradient':
            5,
            'multi_precision':
            cfg.GENERAL.FP16,
            'rescale_grad':
            1.0 / cfg.GENERAL.FP16_RESCALE_FACTOR if cfg.GENERAL.FP16 else 1.0
        })

    # lr decay policy; work on a copy so popping consumed milestones does not
    # alter the shared configuration.
    lr_steps = list(cfg.AUTO.LR_DECAY_EPOCH)
    lr_warmup = float(cfg.TRAIN.LR_WARMUP)  # avoid int division

    # TODO(zhreshold) losses?
    rpn_cls_loss = mx.gluon.loss.SigmoidBinaryCrossEntropyLoss(
        from_sigmoid=False, weight=rescale_factor)
    rpn_box_loss = mx.gluon.loss.HuberLoss(
        rho=1 / 9., weight=rescale_factor)  # i.e. smoothl1
    rcnn_cls_loss = mx.gluon.loss.SoftmaxCrossEntropyLoss(
        weight=rescale_factor)
    rcnn_box_loss = mx.gluon.loss.HuberLoss(
        weight=rescale_factor)  # i.e. smoothl1
    metrics = [
        mx.metric.Loss('RPN_Conf'),
        mx.metric.Loss('RPN_SmoothL1'),
        mx.metric.Loss('RCNN_CrossEntropy'),
        mx.metric.Loss('RCNN_SmoothL1'),
    ]
    metrics2 = [
        RPNAccMetric(),
        RPNL1LossMetric(),
        RCNNAccMetric(),
        RCNNL1LossMetric()
    ]

    logger.info("Trainable parameters: ------------------------------------------\n" + \
            pprint.pformat(net.collect_train_params().keys(), indent=1, width=100, compact=True))
    logger.info('LR Schedule [Epochs {} - {}].'.format(
        cfg.AUTO.LR_DECAY_EPOCH, [
            cfg.TRAIN.BASE_LR * cfg.TRAIN.LR_DECAY_FACTOR**i
            for i in range(len(cfg.AUTO.LR_DECAY_EPOCH))
        ]))
    logger.info('Start training from [Epoch {}] to [Epoch {}].'.format(
        cfg.TRAIN.START_EPOCH, cfg.AUTO.END_EPOCH))

    best_map = [0]
    steps_per_epoch = cfg.TRAIN.STEPS_PER_EPOCH or len(train_data)
    for epoch in range(cfg.TRAIN.START_EPOCH, cfg.AUTO.END_EPOCH + 1):
        mix_ratio = 1.0
        if cfg.TRAIN.MODE_MIXUP:
            # TODO(zhreshold) only support evenly mixup now, target generator needs to be modified otherwise
            train_data._dataset.set_mixup(np.random.uniform, 0.5, 0.5)
            mix_ratio = 0.5
            if epoch >= (cfg.AUTO.END_EPOCH + 1) - cfg.AUTO.NO_MIXUP_EPOCH:
                train_data._dataset.set_mixup(None)
                mix_ratio = 1.0
        if lr_steps and epoch >= lr_steps[0]:
            # Accumulate one decay per milestone reached, then set the rate
            # once; this matters when more than one milestone has passed.
            new_lr = trainer.learning_rate
            while lr_steps and epoch >= lr_steps[0]:
                new_lr *= cfg.TRAIN.LR_DECAY_FACTOR
                lr_steps.pop(0)
            trainer.set_learning_rate(new_lr)
            logger.info("[Epoch {}] Set learning rate to {}".format(
                epoch, new_lr))
        for metric in metrics:
            metric.reset()

        tic = time.time()
        btic = time.time()
        # Hybridize on the first epoch and on each epoch that follows a
        # validation epoch.
        if epoch == cfg.TRAIN.START_EPOCH or (
                epoch - 1) % cfg.TRAIN.EVAL_INTERVAL == 0:
            net.hybridize(static_alloc=True)
        base_lr = trainer.learning_rate
        tbar = tqdm(train_data, total=steps_per_epoch)
        tbar.set_description_str("[ TRAIN ]")
        # i is the 1-based step index within the epoch.
        for i, batch in enumerate(tbar, 1):
            total_iter = (epoch - 1) * steps_per_epoch + i
            if total_iter <= lr_warmup:
                # adjust based on real percentage
                new_lr = base_lr * get_lr_at_iter(total_iter / lr_warmup)
                if new_lr != trainer.learning_rate:
                    # A falsy log interval disables this message, matching
                    # the metrics log further down.
                    if cfg.GENERAL.LOG_INTERVAL and \
                            total_iter % cfg.GENERAL.LOG_INTERVAL == 0:
                        tqdm.write(
                            '[Warm Up] Set learning rate to {}'.format(new_lr))
                    trainer.set_learning_rate(new_lr)
            batch = split_and_load(
                batch, ctx_list=ctx)  # Split data to 1 batch each device.
            batch_size = len(batch[0])

            losses = []
            metric_losses = [[] for _ in metrics]
            add_losses = [[] for _ in metrics2]
            # Profile exactly one iteration (the 10th) of each epoch.
            profile_this_iter = args.profile and i == 10
            if profile_this_iter:
                profiler.set_config(profile_all=True,
                                    aggregate_stats=True,
                                    filename='profile_output.json')
                profiler.set_state('run')
            with autograd.record():
                for data, label, rpn_cls_targets, rpn_box_targets, rpn_box_masks in zip(
                        *batch):
                    gt_label = label[:, :, 4:5]
                    gt_box = label[:, :, :4]
                    cls_pred, box_pred, roi, samples, matches, rpn_score, rpn_box, anchors = net(
                        data, gt_box)

                    # losses of rpn
                    if cfg.GENERAL.FP16:
                        rpn_score = rpn_score.astype('float32')
                        rpn_box = rpn_box.astype('float32')
                        rpn_cls_targets = rpn_cls_targets.astype('float32')
                        rpn_box_targets = rpn_box_targets.astype('float32')
                        rpn_box_masks = rpn_box_masks.astype('float32')
                    rpn_score = rpn_score.squeeze(axis=-1)
                    num_rpn_pos = (rpn_cls_targets >= 0).sum()
                    rpn_loss1 = rpn_cls_loss(
                        rpn_score, rpn_cls_targets, rpn_cls_targets >=
                        0) * rpn_cls_targets.size / num_rpn_pos
                    rpn_loss2 = rpn_box_loss(
                        rpn_box, rpn_box_targets,
                        rpn_box_masks) * rpn_box.size / num_rpn_pos
                    # rpn overall loss, use sum rather than average
                    rpn_loss = rpn_loss1 + rpn_loss2

                    # generate targets for rcnn
                    cls_targets, box_targets, box_masks = net.target_generator(
                        roi, samples, matches, gt_label, gt_box)
                    # losses of rcnn
                    if cfg.GENERAL.FP16:
                        cls_pred = cls_pred.astype('float32')
                        box_pred = box_pred.astype('float32')
                        cls_targets = cls_targets.astype('float32')
                        box_targets = box_targets.astype('float32')
                        box_masks = box_masks.astype('float32')
                    num_rcnn_pos = (cls_targets >= 0).sum()
                    rcnn_loss1 = rcnn_cls_loss(
                        cls_pred, cls_targets, cls_targets >= 0
                    ) * cls_targets.size / cls_targets.shape[0] / num_rcnn_pos
                    rcnn_loss2 = rcnn_box_loss(
                        box_pred, box_targets, box_masks
                    ) * box_pred.size / box_pred.shape[0] / num_rcnn_pos
                    rcnn_loss = rcnn_loss1 + rcnn_loss2
                    # overall losses
                    losses.append(rpn_loss.sum() * mix_ratio +
                                  rcnn_loss.sum() * mix_ratio)
                    metric_losses[0].append(rpn_loss1.sum() * mix_ratio)
                    metric_losses[1].append(rpn_loss2.sum() * mix_ratio)
                    metric_losses[2].append(rcnn_loss1.sum() * mix_ratio)
                    metric_losses[3].append(rcnn_loss2.sum() * mix_ratio)
                    add_losses[0].append(
                        [[rpn_cls_targets, rpn_cls_targets >= 0], [rpn_score]])
                    add_losses[1].append([[rpn_box_targets, rpn_box_masks],
                                          [rpn_box]])
                    add_losses[2].append([[cls_targets], [cls_pred]])
                    add_losses[3].append([[box_targets, box_masks],
                                          [box_pred]])
                autograd.backward(losses)

                for metric, record in zip(metrics, metric_losses):
                    metric.update(0, record)
                for metric, records in zip(metrics2, add_losses):
                    for pred in records:
                        metric.update(pred[0], pred[1])
            trainer.step(batch_size)
            if profile_this_iter:
                # Wait for pending work so the profiled step is complete
                # before the profiler is stopped.
                mx.nd.waitall()
                profiler.set_state('stop')

            # update metrics
            if cfg.GENERAL.LOG_INTERVAL and total_iter % cfg.GENERAL.LOG_INTERVAL == 0:
                msg = ','.join([
                    '{}={:.3f}'.format(*metric.get())
                    for metric in metrics + metrics2
                ])
                total_speed = cfg.GENERAL.LOG_INTERVAL * batch_size / (
                    time.time() - btic)
                speed = total_speed / batch_size  # batch size rely on the gpu num.
                epoch_time_left = (steps_per_epoch - i + 1) / speed
                total_time_left = (
                    (cfg.AUTO.END_EPOCH - epoch) * steps_per_epoch - i +
                    1) / speed
                epoch_tl_h, epoch_tl_m, epoch_tl_s = sec_to_time(
                    epoch_time_left)
                total_tl_h, total_tl_m, _ = sec_to_time(total_time_left)
                tqdm.write(
                    '[Epoch {}][Batch {}], {:.3f}/{:0>2}h{:0>2}m{:0>2}s/{:0>2}h{:0>2}m, {}'
                    .format(epoch, total_iter, total_speed, epoch_tl_h,
                            epoch_tl_m, epoch_tl_s, total_tl_h, total_tl_m,
                            msg))
                btic = time.time()

            # Optional cap on the number of steps taken per epoch.
            if cfg.TRAIN.STEPS_PER_EPOCH and i >= cfg.TRAIN.STEPS_PER_EPOCH:
                break
        tbar.close()
        msg = ','.join(
            ['{}={:.3f}'.format(*metric.get()) for metric in metrics])
        logger.info('[Epoch {}] Training cost: {:.3f}s, {}'.format(
            epoch, (time.time() - tic), msg))
        if epoch % cfg.TRAIN.EVAL_INTERVAL == 0:
            # consider reduce the frequency of validation to save time
            map_name, mean_ap = validate(net, val_data, ctx, eval_metric)
            val_msg = '\n'.join(
                ['{}={}'.format(k, v) for k, v in zip(map_name, mean_ap)])
            logger.info('[Epoch {}] Validation: \n{}'.format(epoch, val_msg))
            current_map = float(mean_ap[-1])
        else:
            current_map = 0.
        save_params(net, logger, best_map, current_map, epoch,
                    cfg.TRAIN.SAVE_INTERVAL, args.logdir)
Ejemplo n.º 24
0
import mxnet as mx
from mxnet import nd, gluon, init, autograd
from mxnet.gluon import nn
from mxnet.gluon.data.vision import datasets, transforms
import time
from mxnet import profiler

# Device context used by the rest of the script (first GPU).
ctx = mx.gpu(0)

# Profiler setup: record everything, keep aggregate statistics, and
# write the trace to a JSON file.
profiler.set_config(
    profile_all=True,
    aggregate_stats=True,
    filename='gpu_fashion_mnist_profile_output.json',
)

# Training split of Fashion-MNIST plus the label names used by the script.
mnist_train = datasets.FashionMNIST(train=True)
text_labels = [
    't-shirt',
    'trouser',
    'pullover',
    'dress',
    'coat',
    'sandal',
    'shirt',
    'sneaker',
    'bag',
    'ankle boot',
]


# Preprocessing pipeline applied to the first element of every sample:
# tensor conversion followed by normalization with (0.13, 0.31).
transformer = transforms.Compose(
    [transforms.ToTensor(), transforms.Normalize(0.13, 0.31)]
)
mnist_train = mnist_train.transform_first(transformer)

# Shuffled training loader served by four worker processes.
batch_size = 256
train_data = gluon.data.DataLoader(
    mnist_train,
    batch_size=batch_size,
    shuffle=True,
    num_workers=4,
)


mnist_valid = gluon.data.vision.FashionMNIST(train=False)
valid_data = gluon.data.DataLoader(
    mnist_valid.transform_first(transformer),
Ejemplo n.º 25
0
def main():
    """Train ``SKT_Lite`` on CIFAR-10 with logging, plots and checkpoints.

    Reads the run configuration from ``parse_args()``, creates a directory
    named after the model (with ``_mixup`` / ``_amp`` suffixes when those
    options are on) and changes into it, so every relative path used below
    is resolved inside that directory.  It then trains for
    ``opt.num_epochs`` epochs, validating once per epoch.

    Side effects: writes TensorBoard summaries under ``./tb/<model_name>``,
    history plots under ``opt.save_plot_dir``, optional log files under
    ``opt.logging_dir``, parameter checkpoints, and, when
    ``opt.profile_mode`` is set, an MXNet profile of training iteration 1.
    """
    import matplotlib
    matplotlib.use('Agg')
    import matplotlib.pyplot as plt

    opt = parse_args()
    batch_size = opt.batch_size
    classes = 10

    # The per-device batch size is scaled by the number of GPUs in use.
    num_gpus = opt.num_gpus
    batch_size *= max(1, num_gpus)
    context = [mx.gpu(i)
               for i in range(num_gpus)] if num_gpus > 0 else [mx.cpu()]
    num_workers = opt.num_workers

    # Cosine schedule over the whole run with a 5-epoch warm-up; 50000 is
    # used as the number of training samples per epoch.
    lr_sch = lr_scheduler.CosineScheduler((50000//batch_size)*opt.num_epochs,
                                          base_lr=opt.lr,
                                          warmup_steps=5*(50000//batch_size),
                                          final_lr=1e-5)

    model_name = opt.model
    net = SKT_Lite()
    if opt.mixup:
        model_name += '_mixup'
    if opt.amp:
        model_name += '_amp'

    makedirs('./'+model_name)
    os.chdir('./'+model_name)
    # Build the log directory with os.path.join so it is a real nested
    # directory on every platform, not only where '\\' is the separator.
    sw = SummaryWriter(
        logdir=os.path.join('.', 'tb', model_name), flush_secs=5,
        verbose=False)
    makedirs(opt.save_plot_dir)

    if opt.resume_from:
        net.load_parameters(opt.resume_from, ctx=context)
    optimizer = 'nag'

    # Periodic checkpointing needs both a directory and a period.
    save_period = opt.save_period
    if opt.save_dir and save_period:
        save_dir = opt.save_dir
        makedirs(save_dir)
    else:
        save_dir = ''
        save_period = 0

    plot_name = opt.save_plot_dir

    logging_handlers = [logging.StreamHandler()]
    if opt.logging_dir:
        logging_dir = opt.logging_dir
        makedirs(logging_dir)
        logging_handlers.append(logging.FileHandler(
            '%s/train_cifar10_%s.log' % (logging_dir, model_name)))

    logging.basicConfig(level=logging.INFO, handlers=logging_handlers)
    logging.info(opt)

    if opt.amp:
        amp.init()

    if opt.profile_mode:
        profiler.set_config(profile_all=True,
                            aggregate_stats=True,
                            continuous_dump=True,
                            filename='%s_profile.json' % model_name)

    transform_train = transforms.Compose([
        gcv_transforms.RandomCrop(32, pad=4),
        CutOut(8),
        transforms.RandomFlipLeftRight(),
        transforms.Resize(32),
        transforms.ToTensor(),
        transforms.Normalize([0.4914, 0.4822, 0.4465],
                             [0.2023, 0.1994, 0.2010])
    ])

    transform_test = transforms.Compose([
        transforms.Resize(32),
        transforms.ToTensor(),
        transforms.Normalize([0.4914, 0.4822, 0.4465],
                             [0.2023, 0.1994, 0.2010])
    ])

    def label_transform(label, classes):
        """Return a one-hot matrix (rows = samples) for integer ``label``."""
        ind = label.astype('int')
        res = nd.zeros((ind.shape[0], classes), ctx=label.context)
        res[nd.arange(ind.shape[0], ctx=label.context), ind] = 1
        return res

    def test(ctx, val_data):
        """Evaluate ``net`` on ``val_data``.

        Returns ``(metric_name, accuracy, loss)`` where the loss is the
        summed loss divided by ``batch_size * len(val_data)``.
        """
        metric = mx.metric.Accuracy()
        loss_fn = gluon.loss.SoftmaxCrossEntropyLoss()
        num_batch = len(val_data)
        test_loss = 0
        for batch in val_data:
            data = gluon.utils.split_and_load(
                batch[0], ctx_list=ctx, batch_axis=0)
            label = gluon.utils.split_and_load(
                batch[1], ctx_list=ctx, batch_axis=0)
            outputs = [net(X) for X in data]
            loss = [loss_fn(yhat, y) for yhat, y in zip(outputs, label)]
            metric.update(label, outputs)
            test_loss += sum([l.sum().asscalar() for l in loss])
        test_loss /= batch_size * num_batch
        name, val_acc = metric.get()
        return name, val_acc, test_loss

    def train(epochs, ctx):
        """Run ``epochs`` training epochs on the given context(s)."""
        if isinstance(ctx, mx.Context):
            ctx = [ctx]
        net.initialize(mx.init.MSRAPrelu(), ctx=ctx)

        # Relative to the model directory entered above.
        root = os.path.join('..', 'datasets', 'cifar-10')
        train_data = gluon.data.DataLoader(
            gluon.data.vision.CIFAR10(
                root=root, train=True).transform_first(transform_train),
            batch_size=batch_size, shuffle=True, last_batch='discard', num_workers=num_workers)

        val_data = gluon.data.DataLoader(
            gluon.data.vision.CIFAR10(
                root=root, train=False).transform_first(transform_test),
            batch_size=batch_size, shuffle=False, num_workers=num_workers)

        trainer = gluon.Trainer(net.collect_params(), optimizer,
                                {'learning_rate': opt.lr, 'wd': opt.wd,
                                 'momentum': opt.momentum, 'lr_scheduler': lr_sch})
        if opt.amp:
            amp.init_trainer(trainer)
        metric = mx.metric.Accuracy()
        train_metric = mx.metric.RMSE()
        # Mixup produces dense (soft) labels, so sparse labels are only
        # used without it.
        loss_fn = gluon.loss.SoftmaxCrossEntropyLoss(
            sparse_label=not opt.mixup)
        train_history = TrainingHistory(['training-error', 'validation-error'])
        loss_history = TrainingHistory(['training-loss', 'validation-loss'])

        iteration = 0

        best_val_score = 0

        for epoch in range(epochs):
            tic = time.time()
            train_metric.reset()
            metric.reset()
            train_loss = 0
            num_batch = len(train_data)
            alpha = 1

            for batch in train_data:
                # Only global iteration 1 of the first epoch is profiled.
                profile_this_iter = (epoch == 0 and iteration == 1
                                     and opt.profile_mode)
                if profile_this_iter:
                    profiler.set_state('run')
                # The beta sample is always drawn, then overridden when
                # mixup is off or during the last 20 epochs.
                lam = np.random.beta(alpha, alpha)
                if epoch >= epochs - 20 or not opt.mixup:
                    lam = 1

                data_1 = gluon.utils.split_and_load(
                    batch[0], ctx_list=ctx, batch_axis=0)
                label_1 = gluon.utils.split_and_load(
                    batch[1], ctx_list=ctx, batch_axis=0)

                if not opt.mixup:
                    data = data_1
                    label = label_1
                else:
                    # Mix every sample with its mirror in the reversed batch.
                    data = [lam*X + (1-lam)*X[::-1] for X in data_1]
                    label = []
                    for Y in label_1:
                        y1 = label_transform(Y, classes)
                        y2 = label_transform(Y[::-1], classes)
                        label.append(lam*y1 + (1-lam)*y2)

                with ag.record():
                    output = [net(X) for X in data]
                    loss = [loss_fn(yhat, y) for yhat, y in zip(output, label)]
                if opt.amp:
                    with ag.record():
                        with amp.scale_loss(loss, trainer) as scaled_loss:
                            ag.backward(scaled_loss)
                else:
                    for l in loss:
                        l.backward()
                trainer.step(batch_size)
                train_loss += sum([l.sum().asscalar() for l in loss])

                output_softmax = [nd.SoftmaxActivation(out) for out in output]
                train_metric.update(label, output_softmax)
                metric.update(label_1, output_softmax)
                sw.add_scalar(tag='lr', value=trainer.learning_rate,
                              global_step=iteration)
                if profile_this_iter:
                    nd.waitall()
                    profiler.set_state('stop')
                iteration += 1

            train_loss /= batch_size * num_batch
            name, acc = train_metric.get()
            _, train_acc = metric.get()
            # Validate once per epoch; nothing modifies the network between
            # here and the logging below, so the result is reused.
            name, val_acc, val_loss = test(ctx, val_data)

            # With mixup the training curve shows RMSE against the mixed
            # labels; otherwise it shows the classification error.
            train_error = acc if opt.mixup else 1-train_acc
            train_history.update([train_error, 1-val_acc])
            plt.cla()
            train_history.plot(save_path='%s/%s_history.png' %
                               (plot_name, model_name))

            if val_acc > best_val_score:
                best_val_score = val_acc
                # os.path.join keeps the file in the working directory when
                # save_dir is empty, instead of producing a '/...' path.
                best_file = '%.4f-cifar-%s-%d-best.params' % (
                    best_val_score, model_name, epoch)
                net.save_parameters(os.path.join(save_dir, best_file))

            current_lr = trainer.learning_rate
            loss_history.update([train_loss, val_loss])
            plt.cla()
            loss_history.plot(save_path='%s/%s_loss.png' %
                              (plot_name, model_name), y_lim=(0, 2), legend_loc='best')
            logging.info('[Epoch %d] loss=%f train_acc=%f train_RMSE=%f\n     val_acc=%f val_loss=%f lr=%f time: %f' %
                         (epoch, train_loss, train_acc, acc, val_acc, val_loss, current_lr, time.time()-tic))
            sw._add_scalars(tag='Acc',
                            scalar_dict={'train_acc': train_acc, 'test_acc': val_acc}, global_step=epoch)
            sw._add_scalars(tag='Loss',
                            scalar_dict={'train_loss': train_loss, 'test_loss': val_loss}, global_step=epoch)
            if save_period and save_dir and (epoch + 1) % save_period == 0:
                net.save_parameters('%s/cifar10-%s-%d.params' %
                                    (save_dir, model_name, epoch))
        # Final checkpoint after the last epoch.
        if save_period and save_dir:
            net.save_parameters('%s/cifar10-%s-%d.params' %
                                (save_dir, model_name, epochs-1))

    if opt.mode == 'hybrid':
        net.hybridize()
    train(opt.num_epochs, context)
    if opt.profile_mode:
        profiler.dump(finished=False)
    sw.close()
Ejemplo n.º 26
0
    def run(self):
        """Run the whole training sequence and report privacy cost and accuracy.

        The method loads the data, creates the network parameters, the loss
        function and the optimizer, and then performs
        ``round(self._epochs * rounds_per_epoch)`` optimizer steps, where
        ``rounds_per_epoch = round(num_training_examples / self._lot_size)``.
        Training is switched off (``self._run_training = False``) as soon as a
        non-finite lot loss or non-finite parameters are seen; the remaining
        rounds still call ``trainer.step`` with empty data and
        ``run_training=False``.

        When ``self._enable_mxnet_profiling`` is set, the MXNet profiler is
        started before the first round and its aggregate statistics are
        printed after the last one.

        Returns:
            tuple: ``(final_eps, test_accuracy)``. ``final_eps`` is
            ``accountant.get_eps(self._fixed_delta)`` when
            ``self._accumulate_privacy`` is set and ``-1`` otherwise;
            ``test_accuracy`` is ``self._evaluate_accuracy(test_data)``.
        """

        # Helper methods
        def get_random_lot(data_loader):
            # A fresh iterator is built on every call and only its first lot
            # is consumed.
            return next(iter(data_loader))

        # Data importing, pre-processing, and loading
        (num_training_examples, num_testing_examples, train_data_lot_iterator,
         train_data_eval_iterator, test_data) = self._load_data()
        # parameters calculated from loaded data
        self._num_training_examples = num_training_examples
        self._num_testing_examples = num_testing_examples
        self._hyperparams[
            'sample_fraction'] = self._lot_size / num_training_examples
        rounds_per_epoch = round(num_training_examples / self._lot_size)

        # Set up privacy accountant
        accountant = rdp_acct.anaRDPacct()  # dpacct.anaCGFAcct()
        eps_sequence = []

        # Network structure creation
        self._create_network_params()

        # Loss function
        loss_func = self._get_loss_func()

        # Optimization procedure
        trainer = self._optimizer(self._hyperparams, self._net, self._params,
                                  loss_func, self._model_ctx, accountant)

        # begin profiling if enabled
        if self._enable_mxnet_profiling:
            from mxnet import profiler
            profiler.set_config(profile_all=True,
                                aggregate_stats=True,
                                filename='profile_output.json')
            profiler.set_state('run')

        # Training sequence
        rounds = round(self._epochs * rounds_per_epoch)
        loss_sequence = []
        current_epoch_loss = mx.nd.zeros(1, ctx=self._model_ctx)
        # Keep ``t`` bound even when there are zero rounds: it is handed to
        # the plotting helper after the loop.
        t = 0
        for t in range(1, rounds + 1):
            if self._verbose and self._print_epoch_status:
                # show current epoch progress
                epoch_number = 1 + (t - 1) // rounds_per_epoch
                epoch_progress = 1 + (t - 1) % rounds_per_epoch
                printProgressBar(
                    epoch_progress,
                    rounds_per_epoch,
                    prefix='Epoch {} progress:'.format(epoch_number),
                    length=50)

            if self._run_training:
                # prepare random lot of data for DPSGD step
                data, labels = get_random_lot(train_data_lot_iterator)
                data = data.as_in_context(self._model_ctx).reshape(
                    (-1, 1, self._input_layer))
                labels = labels.as_in_context(self._model_ctx)
            else:
                # training has been disabled; the step below still runs with
                # empty inputs and run_training=False
                data, labels = [], []

            # perform DPSGD step
            lot_mean_loss = trainer.step(
                data,
                labels,
                accumulate_privacy=self._accumulate_privacy,
                run_training=self._run_training)

            loss_sequence.append(lot_mean_loss)
            current_epoch_loss += lot_mean_loss

            # no need to continue running training if NaNs are present
            if not np.isfinite(lot_mean_loss):
                self._run_training = False
                if self._verbose:
                    print("NaN loss on round {}.".format(t))
            if self._params_not_finite():
                self._run_training = False
                if self._verbose:
                    print("Non-finite parameters on round {}.".format(t))

            if self._accumulate_privacy and self._debugging:
                eps_sequence.append(accountant.get_eps(self._fixed_delta))

            # print some stats after an "epoch"
            if t % rounds_per_epoch == 0:
                if self._verbose:
                    # Floor division keeps the epoch number an integer, in
                    # line with the progress-bar prefix above.
                    print("Epoch {}  (round {})  complete.".format(
                        t // rounds_per_epoch, t))
                    if self._run_training:
                        print("mean epoch loss: {}".format(
                            current_epoch_loss.asscalar() * self._lot_size /
                            self._num_training_examples))
                        if self._compute_epoch_accuracy:
                            print("training accuracy: {}".format(
                                self._evaluate_accuracy(
                                    train_data_eval_iterator)))
                            print("testing accuracy: {}".format(
                                self._evaluate_accuracy(test_data)))
                    if self._accumulate_privacy and self._debugging:
                        print("eps used: {}\n".format(eps_sequence[-1]))
                    print()
                # start accumulating the loss of the next epoch from zero
                current_epoch_loss = mx.nd.zeros(1, ctx=self._model_ctx)

        # end profiling if enabled
        if self._enable_mxnet_profiling:
            # wait for all pending MXNet operations before stopping
            mx.nd.waitall()
            profiler.set_state('stop')
            print(profiler.dumps())

        # Make sure we don't report a bogus number
        if self._accumulate_privacy:
            final_eps = accountant.get_eps(self._fixed_delta)
        else:
            final_eps = -1

        test_accuracy = self._evaluate_accuracy(test_data)

        if self._save_plots or self._debugging:
            self._create_and_save_plots(t, eps_sequence, loss_sequence,
                                        final_eps, test_accuracy)

        return final_eps, test_accuracy
Ejemplo n.º 27
0
# %% [markdown]
# # Train

# %%
# One Adagrad trainer (gradient clipping at 1.25) per entry of net1, stored
# under the same key as the network it optimizes.
trainer1 = dict(
    (name, gluon.Trainer(subnet.collect_params(), 'adagrad',
                         {'clip_gradient': 1.25}))
    for name, subnet in net1.items())
# A single trainer with the same optimizer settings for net2.
trainer2 = gluon.Trainer(
    net2.collect_params(), 'adagrad', {'clip_gradient': 1.25})
loss = gluon.loss.L2Loss()

# %%
# Profiler configuration only; the profiler state is not changed in this cell.
profiler.set_config(
    filename='profile.json',
    profile_all=True,
    profile_imperative=True,
    aggregate_stats=True,
    continuous_dump=True,
)


# %%
def train_model(dataiter, epoch):
    train_loss = 0
    total_size = 0
    for i, batch in enumerate(dataiter):
        with mx.autograd.record():
            # iterate over the left and right question
            embs = []
            data_lists = []
            for k in range(2):
                embedding = [
Ejemplo n.º 28
0
def main():
    """Train the ``Model`` network on the SGF dataset and evaluate it.

    The function prepares the dataset under ``/storage/data/``, builds and
    hybridizes the model, and trains it for ``EPOCHS`` epochs across
    ``GPU_COUNT`` GPU contexts with an Adam optimizer. A checkpoint is saved
    and accuracy/speed scalars are written to ``./logs`` every ``PRINT_N``
    batches. The MXNet profiler runs during the first epoch only. After
    training, the accuracy on the test iterator is printed.

    Side effects: creates ``./checkpoints``, deletes any existing ``./logs``
    directory, and writes parameter files and profiler output.
    """
    data_p = Path('/storage/data/').resolve()
    checkpoint_p = Path('./checkpoints/').resolve()
    checkpoint_p.mkdir(parents=True, exist_ok=True)
    logs_p = Path('./logs/').resolve()
    # start from an empty log directory; a missing directory is not an error
    shutil.rmtree(logs_p, ignore_errors=True)
    encoder = SevenPlaneEncoder((19, 19))
    builder = SGFDatasetBuilder(data_p, encoder=encoder)
    builder.download_and_prepare()
    train_itr = builder.train_dataset(batch_size=BATCH_SIZE,
                                      max_worker=cpu_count(),
                                      factor=FACTOR)
    test_itr = builder.test_dataset(batch_size=BATCH_SIZE,
                                    max_worker=cpu_count(),
                                    factor=FACTOR)
    # build model
    betago = Model()
    # convert to half-precision floating point FP16
    # NOTE: all NVIDIA GPUs with compute capability 6.1 have a low-rate FP16 performance, so FP16 is not the fast path on these GPUs
    #       data passed to split_and_load() must be float16 too
    #betago.cast('float16')
    # hybridize for speed
    betago.hybridize(static_alloc=True, static_shape=True)
    # print graph
    shape = (1, ) + encoder.shape()
    mx.viz.print_summary(betago(mx.sym.var('data')), shape={'data': shape})
    # pin GPUs
    ctx = [mx.gpu(i) for i in range(GPU_COUNT)]
    # optimizer
    opt_params = {
        'learning_rate': 0.001,
        'beta1': 0.9,
        'beta2': 0.999,
        'epsilon': 1e-08
    }
    opt = mx.optimizer.create('adam', **opt_params)
    # initialize parameters
    # MXNet initializes the weight matrices uniformly by drawing from [−0.07,0.07], bias parameters are all set to 0
    # 'Xavier': initializer is designed to keep the scale of gradients roughly the same in all layers
    betago.initialize(mx.init.Xavier(magnitude=2.3),
                      ctx=ctx,
                      force_reinit=True)
    # fetch and broadcast parameters
    params = betago.collect_params()
    # trainer
    trainer = Trainer(params=params, optimizer=opt, kvstore='device')
    # loss function
    loss_fn = SoftmaxCrossEntropyLoss()
    # use accuracy as the evaluation metric
    metric = Accuracy()
    with mxb.SummaryWriter(logdir='./logs') as sw:
        # add graph to MXBoard
        #betago.forward(mx.nd.ones(shape, ctx=ctx[0]))
        #betago.forward(mx.nd.ones(shape, ctx=ctx[1]))
        #sw.add_graph(betago)
        profiler.set_config(profile_all=True,
                            aggregate_stats=True,
                            continuous_dump=True,
                            filename='profile_output.json')
        start = time.perf_counter()
        # train
        for e in range(EPOCHS):
            # only the first epoch is profiled
            if 0 == e:
                profiler.set_state('run')
            tick = time.time()
            # reset the train data iterator.
            train_itr.reset()
            # loop over the train data iterator
            # NOTE(review): `i` and `tick_0` are only bound if the iterator
            # yields at least one batch; the epoch summary below relies on both.
            for i, batch in enumerate(train_itr):
                if 0 == i:
                    # reference time for the per-epoch throughput figure
                    tick_0 = time.time()
                # splits train data into multiple slices along batch_axis
                # copy each slice into a context
                data = split_and_load(batch.data[0],
                                      ctx_list=ctx,
                                      batch_axis=0,
                                      even_split=False)
                # splits train label into multiple slices along batch_axis
                # copy each slice into a context
                label = split_and_load(batch.label[0],
                                       ctx_list=ctx,
                                       batch_axis=0,
                                       even_split=False)
                outputs = []
                losses = []
                # inside training scope
                with ag.record():
                    for x, y in zip(data, label):
                        z = betago(x)
                        # computes softmax cross entropy loss
                        l = loss_fn(z, y)
                        outputs.append(z)
                        losses.append(l)
                # backpropagate the error for one iteration
                for l in losses:
                    l.backward()
                # make one step of parameter update.
                # trainer needs to know the batch size of data
                # to normalize the gradient by 1/batch_size
                trainer.step(BATCH_SIZE)
                # updates internal evaluation
                metric.update(label, outputs)
                # Print batch metrics
                if 0 == i % PRINT_N and 0 < i:
                    # checkpointing (the file name depends on the epoch only,
                    # so later checkpoints of an epoch overwrite earlier ones)
                    betago.save_parameters(
                        str(checkpoint_p.joinpath(
                            'betago-{}.params'.format(e))))
                    # NOTE(review): `i` restarts at 0 every epoch, so the
                    # global_step values repeat across epochs. Confirm this
                    # is intended.
                    sw.add_scalar(tag='Accuracy',
                                  value={'naive': metric.get()[1]},
                                  global_step=i - PRINT_N)
                    sw.add_scalar(tag='Speed',
                                  value={
                                      'naive':
                                      BATCH_SIZE * (PRINT_N) /
                                      (time.time() - tick)
                                  },
                                  global_step=i - PRINT_N)
                    print(
                        'epoch[{}] batch [{}], accuracy {:.4f}, samples/sec: {:.4f}'
                        .format(e, i,
                                metric.get()[1],
                                BATCH_SIZE * (PRINT_N) / (time.time() - tick)))
                    # restart the timer for the next PRINT_N batches
                    tick = time.time()
            if 0 == e:
                profiler.set_state('stop')
                profiler.dump()
            # gets the evaluation result
            print('epoch [{}], accuracy {:.4f}, samples/sec: {:.4f}'.format(
                e,
                metric.get()[1],
                BATCH_SIZE * (i + 1) / (time.time() - tick_0)))
            # reset evaluation result to initial state
            metric.reset()

    # wall-clock time of the whole training phase
    elapsed = time.perf_counter() - start
    print('elapsed: {:0.3f}'.format(elapsed))
    # use Accuracy as the evaluation metric
    metric = Accuracy()
    # evaluate on the test iterator (forward passes only)
    for batch in test_itr:
        data = split_and_load(batch.data[0], ctx_list=ctx, batch_axis=0)
        label = split_and_load(batch.label[0], ctx_list=ctx, batch_axis=0)
        outputs = []
        for x in data:
            outputs.append(betago(x))
        metric.update(label, outputs)
    print('validation %s=%f' % metric.get())
Ejemplo n.º 29
0
def train(
    args,
    model,
    train_sampler,
    valid_samplers=None,
    rank=0,
    rel_parts=None,
    barrier=None,
):
    """Run the single-process training loop for ``args.max_step`` steps.

    Each step draws one positive/negative graph pair from ``train_sampler``,
    computes the loss under autograd, back-propagates and calls
    ``model.update``. Averaged log values and the elapsed time are printed
    every ``args.log_interval`` steps; validation through ``test`` runs every
    ``args.eval_interval`` steps (for ``step > 1``) when ``args.valid`` is set
    and ``valid_samplers`` is given. If the module-level ``mxprofiler`` flag
    is truthy, the MXNet profiler is started at step 1 and dumped at the end.

    ``rank`` selects the GPU in the mixed CPU/GPU multi-process case and is
    passed to ``model.writeback_relation`` together with ``rel_parts``.
    ``barrier`` is accepted but not used here.

    Raises:
        AssertionError: if ``args.num_proc > 1`` or ``args.rel_part`` is not
            equal to ``False``.
    """
    assert args.num_proc <= 1, "MXNet KGE does not support multi-process now"
    assert args.rel_part == False, \
        "No need for relation partition in single process for MXNet KGE"

    # Record every run-time argument in the log.
    for name in vars(args):
        logging.info("{:20}:{}".format(name, getattr(args, name)))

    # -1 stands for "no GPU"; otherwise pick by rank or take the first one.
    gpu_id = -1
    if len(args.gpu) > 0:
        if args.mix_cpu_gpu and args.num_proc > 1:
            gpu_id = args.gpu[rank % len(args.gpu)]
        else:
            gpu_id = args.gpu[0]

    if args.strict_rel_part:
        model.prepare_relation(mx.gpu(gpu_id))

    if mxprofiler:
        from mxnet import profiler

        profiler_options = dict(
            profile_all=True,
            aggregate_stats=True,
            continuous_dump=True,
            filename="profile_output.json",
        )
        profiler.set_config(**profiler_options)

    pending_logs = []
    timer = time.time()
    for step in range(args.max_step):
        pos_g, neg_g = next(train_sampler)
        args.step = step
        # Profiling starts at the second step, not the first.
        if step == 1 and mxprofiler:
            profiler.set_state("run")
        with mx.autograd.record():
            loss, log = model.forward(pos_g, neg_g, gpu_id)
        loss.backward()
        pending_logs.append(log)
        model.update(gpu_id)

        if step % args.log_interval == 0:
            # Average each logged quantity over the steps since the last report.
            for key in pending_logs[0].keys():
                mean_value = sum(
                    [entry[key] for entry in pending_logs]) / len(pending_logs)
                print("[Train]({}/{}) average {}: {}".format(
                    step, args.max_step, key, mean_value))
            pending_logs = []
            print(time.time() - timer)
            timer = time.time()

        run_validation = (args.valid and step % args.eval_interval == 0
                          and step > 1 and valid_samplers is not None)
        if run_validation:
            # The interval timer is reused here, so the next training report
            # is measured from the start of this validation pass.
            timer = time.time()
            test(args, model, valid_samplers, mode="Valid")
            print("test:", time.time() - timer)

    if args.strict_rel_part:
        model.writeback_relation(rank, rel_parts)
    if mxprofiler:
        # Wait for pending operations before stopping the profiler.
        nd.waitall()
        profiler.set_state("stop")
        profiler.dump()
        print(profiler.dumps())
    # clear cache
    pending_logs = []
Ejemplo n.º 30
0
from mxnet import profiler

import Graph
from learners.IterativeLearner import learn_iterative
from common import data_ctx, measure_time
from feature_transformations import FeatureTransformation
from feature_transformations.FeatureScalingTransformation import FeatureScalingTransformation
from feature_transformations.KernelTransformation import KernelTransformation
from feature_transformations.LinearConvolutionTransformation import LinearConvolutionTransformation
from feature_transformations.PcaTransformation import PcaTransformation

############## PARAMETERS ###################
from feature_transformations.RealFeatureScalingTransformation import RealFeatureScalingTransformation
from learners.IterativeLogisticLearner import learn_iterative_logistic

# Configure the MXNet profiler at import time: aggregate statistics are
# enabled and 'profile_output.json' is set as the profiler's file name.
profiler.set_config(aggregate_stats=True, filename='profile_output.json')

# Seed MXNet's and Python's random number generators with the same fixed value.
mx.random.seed(101)
random.seed(101)


def learn(graph, net_type, training_set, test_set, iterations_per_epoch,
          batch_size):
    def try_standard_approach(approach):
        def get_all_vertices(data_loader: DataLoader):
            res_X = []
            for X, y in data_loader:
                for x in X:
                    res_X.append(round(float(x.asscalar())))
            return res_X
Ejemplo n.º 31
0
import logging
import argparse
import os
import sys
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import mxnet as mx
from models.E3DNet import create_m3d
from lib.data import ClipBatchIter
from mxnet import profiler

# NOTE(review): presumably name fragments of the layers selected for training;
# the code that consumes this list is not visible here, so confirm.
train_list = ["fc", "comp_17", "comp_16", "comp_15", "comp_14", "softmax"]
# NOTE(review): purpose not visible in this part of the file; verify against its users.
tmp_pool_list = ["final_fc", "softmax_label"]
# Configure the MXNet profiler at import time; this call only sets options.
profiler.set_config(profile_all=True,
                    aggregate_stats=True,
                    filename='profile_output_m3d.json')


def plot_schedule(schedule_fn, iterations=1500):
    """Scatter-plot the learning rate returned by ``schedule_fn`` per iteration.

    ``schedule_fn`` is called once for each iteration number from 1 up to and
    including ``iterations``. The points are drawn with ``plt.scatter`` and
    the axes are labelled; the figure is neither saved nor shown here.
    """
    # The schedule is queried with 1-based iteration numbers.
    steps = list(range(1, iterations + 1))
    rates = list(map(schedule_fn, steps))
    plt.scatter(steps, rates)
    plt.xlabel("Iteration")
    plt.ylabel("Learning Rate")


def train(args):
    gpus = [int(i) for i in args.gpus.split(',')]