Beispiel #1
0
def test_imperative_i2_o1():
    """Imperative two-input / one-output execution: mul2 on NdArrays."""
    import nnabla.functions as F
    lhs = nn.NdArray([2, 3, 4])
    rhs = nn.NdArray([2, 1, 1])
    lhs.fill(3)
    rhs.fill(0.5)
    # Calling a function on NdArrays executes immediately (imperative mode).
    product = F.mul2(lhs, rhs)
    assert np.allclose(product.data, 1.5)
Beispiel #2
0
 def reset(self, epoch, pbar):
     """Reset per-epoch accumulators and rebind the progress bar.

     Parameters
     ----------
     epoch :
         Epoch index to record for subsequent reporting.
     pbar :
         Progress-bar object used during this epoch.
     """
     self.epoch = epoch
     self.epoch_loss = 0.0
     self.epoch_error = 0
     self.batch_counter = 0
     self.pbar = pbar
     # Two fresh accumulation buffers; reset_buff() initializes their contents.
     self.buff = [nn.NdArray(), nn.NdArray()]
     self.reset_buff()
     # Mark the buffers as flushed/clean until new batches accumulate.
     self.flush = True
Beispiel #3
0
def test_copy_from():
    """copy_from keeps the source dtype unless the current context overrides it."""
    from nnabla.ext_utils import get_extension_context

    shape = [2, 3, 4]
    source = nn.NdArray(shape)
    target = nn.NdArray(shape)
    source.data = 0
    source.cast(dtype=np.uint8)

    # Ignoring the current context: dtype follows the source array.
    target.copy_from(source, use_current_context=False)
    assert target.dtype == np.uint8

    # Honoring the current context: dtype follows the context ('float').
    with nn.context_scope(get_extension_context('cpu', dtype='float')):
        target.copy_from(source, use_current_context=True)
    assert target.dtype == np.float32
Beispiel #4
0
def test_wrong_case_ndarray_arithmetic_matmul_ops(seed, shape):
    """Matmul (@) between operands with unset/incompatible shapes must fail.

    Builds each operand either as a shapeless scalar (when the corresponding
    shape entry is falsy) or from a random numpy array, then exercises the
    `@` operator across NdArray/Variable/numpy combinations.

    NOTE(review): inside a single ``pytest.raises`` block only statements up
    to the first raising one execute — the later `@` cases are never reached.
    Splitting into one block per case would verify all of them, but some
    combinations (e.g. ``float @ NdArray``) may raise TypeError rather than
    AssertionError — confirm before changing.
    """
    rng = np.random.RandomState(seed)

    if not shape[0]:
        # Shapeless/scalar left operand built by hand.
        a1 = rng.randn()
        n1 = nn.NdArray()
        n1.cast(np.float32)[...] = a1
        v1 = nn.Variable()
        v1.data.cast(np.float32)[...] = a1
    else:
        a1 = rng.randn(*shape[0]).astype(np.float32)
        n1 = nn.NdArray.from_numpy_array(a1)
        v1 = nn.Variable.from_numpy_array(a1)

    if not shape[1]:
        # Shapeless/scalar right operand built by hand.
        a2 = rng.randn()
        n2 = nn.NdArray()
        n2.cast(np.float32)[...] = a2
        v2 = nn.Variable()
        v2.data.cast(np.float32)[...] = a2
    else:
        a2 = rng.randn(*shape[1]).astype(np.float32)
        n2 = nn.NdArray.from_numpy_array(a2)
        v2 = nn.Variable.from_numpy_array(a2)

    with pytest.raises(AssertionError) as excinfo:
        # NdArray @ NdArray
        ans1 = n1 @ n2

        # NdArray @ Variable
        ans2 = n1 @ v2

        # Variable @ NdArray
        ans3 = v1 @ n2

        # Variable @ Variable
        ans4 = v1 @ v2

        # numpy.ndarray or float @ NdArray
        ans5 = a1 @ n1

        # NdArray @ numpy.ndarray or float
        ans6 = n1 @ a2

        # numpy.ndarray or float @ Variable
        ans7 = a1 @ v2

        # Variable @ numpy.ndarray or float
        ans8 = v1 @ a2
def test_scalar_dot(seed, scalar, is_dynamic):
    """F.dot with a scalar left operand.

    Checks NdArray and Variable right operands, with and without a
    preallocated output buffer, in static and auto-forward (dynamic) modes.
    """
    rng = np.random.RandomState(seed)
    a1 = scalar
    a2 = rng.randn(3, 4, 5, 6).astype(np.float32)
    n = nn.NdArray.from_numpy_array(a2)
    v = nn.Variable.from_numpy_array(a2)

    # Reference result computed on raw numpy inputs.
    ref = F.dot(a1, a2)

    # scalar . NdArray executes immediately (imperative mode).
    ans1 = F.dot(a1, n)
    assert_allclose(ans1.data, ref)

    # Same, writing into a preallocated output NdArray.
    out1 = nn.NdArray((3, 4, 5, 6))
    F.dot(a1, n, out1)
    assert_allclose(out1.data, ref)

    with nn.auto_forward(is_dynamic):
        ans2 = F.dot(a1, v)
        if not is_dynamic:
            # Static graph: computation must be triggered explicitly.
            ans2.forward()
        assert_allclose(ans2.d, ref)

        # Preallocated Variable output.
        out2 = nn.Variable((3, 4, 5, 6))
        F.dot(a1, v, out2)
        if not is_dynamic:
            out2.forward()
        assert_allclose(out2.d, ref)
Beispiel #6
0
    def __init__(self,
                 comm,
                 losses,
                 save_path=None,
                 nimage_per_epoch=1,
                 show_interval=20,
                 show_keys=None):
        """Set up loss bookkeeping and (on rank 0) a monitor.

        Args:
            comm: communicator; must expose at least a ``rank`` attribute.
            losses: {"loss_name": loss_Variable, ...} or list of tuple(key, value)
            save_path: directory for monitor output; monitoring is disabled
                when None or on non-master ranks.
            nimage_per_epoch: number of images handled per epoch.
            show_interval: reporting interval.
            show_keys: subset of loss names to display; defaults to all keys.
        """
        self.batch_cnt = 0
        self.piter = None
        self.comm = comm
        self.save_path = save_path
        self.nimage_per_epoch = nimage_per_epoch
        self.show_interval = show_interval

        self.losses = OrderedDict(losses)  # fix loss order
        # BUG FIX: iterate over self.losses (always an OrderedDict) rather
        # than the raw `losses` argument, which has no .keys() when it is a
        # list of (key, value) tuples as documented above.
        self.epoch_losses = {k: 0. for k in self.losses}
        self.buff = {k: nn.NdArray() for k in self.losses}
        self.show_keys = list(self.losses) if show_keys is None else show_keys

        is_master = comm.rank == 0
        self.monitor = MonitorWrapper(save_path, self.epoch_losses) if (
            save_path is not None and is_master) else None

        self._reset_buffer()
Beispiel #7
0
def sum_grad_norm(params):
    """Return the square root of the summed squared gradients of `params`."""
    accumulator = nn.NdArray()
    accumulator.zero()

    for param in params:
        # Each entry must be a Variable whose gradient buffer is still valid.
        assert isinstance(param, nn.Variable) and not param.grad.clear_called
        accumulator += F.sum(param.grad ** 2)

    return np.sqrt(accumulator.data)
Beispiel #8
0
    def _reset_buffer(self):
        """Replace every tracked loss buffer with a zeroed NdArray."""
        for name, loss in self.losses.items():
            if loss is None:
                # Untracked entry: keep whatever buffer it currently has.
                continue
            fresh = nn.NdArray()
            fresh.zero()
            self.buff[name] = fresh

        self.flushed = True
Beispiel #9
0
def test_nd_array_data(value):
    """dtype resolution of NdArray's `.data` getter and setter.

    The getter on an uninitialized array materializes it with the default
    dtype (float32); the setter adopts the assigned value's dtype except for
    certain scalars, which stay float32.
    """
    shape = (2, 3)

    # Use default dtype (float32) in getter
    a = nn.NdArray(shape)
    with pytest.raises(Exception):
        # dtype is undefined before any data access or cast.
        _ = a.dtype
    # First access materializes the array with the default dtype.
    _ = a.data
    assert a.dtype == np.float32

    # Use value dtype in setter
    a = nn.NdArray(shape)
    a.data = value
    # Non-scalar values keep their own dtype; scalars do too only when they
    # are non-float and exceed 2**53.
    # NOTE(review): the (1 << 53) bound looks like "exactly representable as
    # a double" — confirm against the NdArray.data setter implementation.
    if not np.isscalar(value) or \
       (np.dtype(type(value)).kind != 'f' and value > (1 << 53)):
        assert a.dtype == np.asarray(value).dtype
        assert a.data.dtype == np.asarray(value).dtype
    else:
        assert a.data.dtype == np.float32
Beispiel #10
0
 def _sum_error(sum, error):
     """Return ``sum + error``, all-reducing `error` across workers first.

     NOTE(review): `comm` and `_all_reduce` are taken from the enclosing
     scope; the builtin name ``sum`` is shadowed here (pre-existing).
     """
     ret = None
     if comm:
         # logger.log(99, "Calc error with communicator")
         # Wrap the raw value in an NdArray so the communicator can reduce it.
         var = [nn.NdArray()]
         var[0].data = error
         _all_reduce(comm, var, division=False, inplace=True)
         ret = sum + var[0].data
     else:
         ret = sum + error
     return ret
Beispiel #11
0
 def _sum_cost():
     """Accumulate the per-iteration cost into the epoch totals.

     NOTE(review): `comm` and `cost` come from the enclosing scope. With a
     communicator, the iteration cost is summed over all workers and the
     iteration count advances by the world size; otherwise by one.
     """
     if comm:
         # logger.log(99, "Calc cost with communicator")
         # Wrap the raw value in an NdArray so the communicator can reduce it.
         var = [nn.NdArray()]
         var[0].data = cost.sum_iteration
         _all_reduce(comm, var, division=False, inplace=True)
         cost.sum_epoch += var[0].data
         cost.num_iteration += comm.size
     else:
         cost.sum_epoch += cost.sum_iteration
         cost.num_iteration += 1
Beispiel #12
0
def test_nd_array():
    """Basic NdArray creation, numpy round-trip, casting, filling, copying."""
    shape = [2, 3, 4]
    a = nn.NdArray(shape)
    npa = np.arange(a.size).reshape(a.shape).astype(np.int32)
    a.data = npa
    b = nn.NdArray.from_numpy_array(npa)
    # BUG FIX: the original `b.dtype == np.int32` discarded the comparison
    # result — it must be asserted for the check to have any effect.
    assert b.dtype == np.int32
    assert np.all(a.data == npa)
    assert np.all(a.data == b.data)
    assert a.shape == npa.shape
    assert b.size == np.prod(shape)
    a.cast(np.int32)
    assert a.data.dtype == np.int32
    b.zero()
    assert np.all(b.data == 0)
    a.fill(3)
    assert np.all(a.data == 3)
    b.copy_from(a)
    assert np.all(a.data == b.data)
Beispiel #13
0
def test_clear_called():
    """clear_called turns True on clear() and back to False on any write."""
    a = nn.NdArray(1)
    assert a.clear_called == False
    a.fill(3)
    assert a.clear_called == False
    a.clear()
    assert a.clear_called == True

    # Every mutating operation must reset the flag; clear() sets it again.
    for mutate in ((lambda: a.fill(3)), a.zero):
        mutate()
        assert a.clear_called == False
        a.clear()
        assert a.clear_called == True

    # Direct element assignment also counts as a write.
    a.data[0] = -1
    assert a.clear_called == False
Beispiel #14
0
def test_from_dlpack_given(ext_name, numpy_type, torch_type):
    """Import a PyTorch tensor into a pre-made NdArray via DLPack.

    The final in-place update on `a` must be visible from the original torch
    tensor, proving the DLPack import shares memory rather than copying.
    """
    ctx = get_extension_context(ext_name)
    device_name = ctx.backend[0].split(':')[0]
    if device_name == 'cudnn':
        device_name = 'cuda'  # for PyTorch
    nn.set_default_context(ctx)

    # Init PyTorch Tensor
    t = torch.ones((5, 5), dtype=torch_type, device=torch.device(device_name))

    # PyTorch to DLPack
    dlp = torch.utils.dlpack.to_dlpack(t)

    # DLPack to NNabla
    a = nn.NdArray()
    nn.utils.dlpack.from_dlpack(dlp, a)
    assert a.dtype == numpy_type

    # Check if the memory locations are still same,
    # which means DlpackArray is not copied to other arrays
    # in the same ArrayGroup.
    a += 1
    assert np.all(a.data == t.to('cpu').detach().numpy().copy())
Beispiel #15
0
class TestClearOutputGrad():
    """Tests that gradient buffers are cleared (or preserved) as expected
    when backward() is called with clear_buffer=True."""

    def check_grad_cleared_flags(self, answer):
        """Compare recorded clear-called flags against `answer`.

        answer: list (one entry per function, in recording order) of lists
        (one bool per output) of expected clear-called flags.
        """
        result = clear_called_flag_recorder.get_output_clear_called_flags()
        assert len(result) == len(answer)
        for i, flags in enumerate(answer):
            assert len(result[i]) == len(flags)
            for j, flag in enumerate(flags):
                # result[i][j] is a pair whose second element is the flag.
                assert flag == result[i][j][1]

    def setup_method(self):
        # Start recording clear-called flags for every test.
        clear_called_flag_recorder.activate_clear_called_flag_recorder()

    def teardown_method(self):
        clear_called_flag_recorder.deactivate_clear_called_flag_recorder()

    # Test for the type of grad given to backward.
    @pytest.mark.parametrize("grad", [1, None, np.ndarray([1]), nn.NdArray([1])])
    def test_clear_output_grad_argument(self, grad):
        """The output grad is not cleared when grad is None or an NdArray."""
        x1 = nn.Variable([1], need_grad=True)

        xx1 = F.identity(x1)
        y1 = F.add_scalar(xx1)

        answer_grad = []
        if grad is None or isinstance(grad, nn.NdArray):
            answer_grad.append([False])  # y1
        else:
            answer_grad.append([True])  # y1
        answer_grad.append([True])  # xx1

        y1.forward(clear_no_need_grad=True)
        # Restart the recorder so only backward()'s clears are captured.
        clear_called_flag_recorder.deactivate_clear_called_flag_recorder()
        clear_called_flag_recorder.activate_clear_called_flag_recorder()
        y1.backward(clear_buffer=True, grad=grad)

        self.check_grad_cleared_flags(answer_grad)
        assert y1.grad.clear_called == False

    # Test for an inplaced variable.
    def test_clear_output_grad_inplace(self):
        """All grads are cleared along an in-place chain."""
        x1 = nn.Variable([1], need_grad=True)

        xx1 = F.identity(x1)
        y1 = F.add_scalar(xx1, inplace=True)
        y2 = F.add_scalar(y1)

        answer_grad = []
        answer_grad.append([True])
        answer_grad.append([True])
        answer_grad.append([True])

        y2.forward(clear_no_need_grad=True)
        # Restart the recorder so only backward()'s clears are captured.
        clear_called_flag_recorder.deactivate_clear_called_flag_recorder()
        clear_called_flag_recorder.activate_clear_called_flag_recorder()
        y2.backward(clear_buffer=True)

        self.check_grad_cleared_flags(answer_grad)

    # Test for a variable shared with two layer functions.
    def test_clear_output_grad_shared_variable(self):
        """Grads are cleared even when one variable feeds two functions."""
        x1 = nn.Variable([1], need_grad=True)

        xx1 = F.identity(x1)
        y1 = F.add_scalar(xx1)
        y2 = F.add_scalar(xx1)
        y3 = F.add2(y1, y2)

        answer_grad = []
        answer_grad.append([True])
        answer_grad.append([True])
        answer_grad.append([True])
        answer_grad.append([True])

        y3.forward(clear_no_need_grad=True)
        # Restart the recorder so only backward()'s clears are captured.
        clear_called_flag_recorder.deactivate_clear_called_flag_recorder()
        clear_called_flag_recorder.activate_clear_called_flag_recorder()
        y3.backward(clear_buffer=True)

        self.check_grad_cleared_flags(answer_grad)

    # Test for a persistent variable.
    def test_clear_output_grad_persistent(self):
        """Persistent variables keep their grad buffers through backward."""
        x1 = nn.Variable([1], need_grad=True)

        xx1 = F.identity(x1)
        y1 = F.add_scalar(xx1)
        y2 = F.add_scalar(y1)

        xx1.persistent = True
        y2.persistent = True

        answer_grad = []
        answer_grad.append([False])  # y2
        answer_grad.append([True])  # y1
        answer_grad.append([False])  # xx1

        y2.forward(clear_no_need_grad=True)
        # Restart the recorder so only backward()'s clears are captured.
        clear_called_flag_recorder.deactivate_clear_called_flag_recorder()
        clear_called_flag_recorder.activate_clear_called_flag_recorder()
        y2.backward(clear_buffer=True)

        self.check_grad_cleared_flags(answer_grad)

    # Test for the input variables of sink.
    # In the case where Function::prohibit_clear_input_buffers returns true,
    # these inputs must not be cleared from any function.
    def test_clear_output_grad_prohibit_clear_input(self):
        """Inputs of sink() must never have their grads cleared."""
        x1 = nn.Variable([1], need_grad=True)

        xx1 = F.identity(x1)
        y1 = F.add_scalar(xx1)
        y2 = F.add_scalar(xx1)
        y3 = F.sink(y1, y2)

        answer_grad = []
        answer_grad.append([True])  # y3
        answer_grad.append([False])  # y2
        answer_grad.append([False])  # y1
        answer_grad.append([True])  # xx1

        y3.forward(clear_no_need_grad=True)
        # Restart the recorder so only backward()'s clears are captured.
        clear_called_flag_recorder.deactivate_clear_called_flag_recorder()
        clear_called_flag_recorder.activate_clear_called_flag_recorder()
        y3.backward(clear_buffer=True)

        self.check_grad_cleared_flags(answer_grad)
Beispiel #16
0
def test_imperative_i1_o1():
    """Imperative one-input / one-output execution: add_scalar on an NdArray."""
    import nnabla.functions as F
    arr = nn.NdArray([2, 3, 4])
    arr.fill(1)
    shifted = F.add_scalar(arr, 1)
    assert np.allclose(shifted.data, 2)
Beispiel #17
0
def test_imperative_pf():
    """Parametric functions accept NdArray inputs imperatively (smoke test)."""
    import nnabla.parametric_functions as PF
    inp = nn.NdArray([2, 3, 4, 5])
    _ = PF.batch_normalization(inp)
Beispiel #18
0
# # NNabla Python API Demonstration Tutorial
# # (https://nnabla.readthedocs.io/en/latest/python/tutorial/python_api.html)

import matplotlib.pyplot as plt
import nnabla as nn
import nnabla.functions as F
import nnabla.parametric_functions as PF
import nnabla.solvers as S
import numpy as np

from ivory.utils.path import cache_file

# ## NdArray
# Demonstrates creating, mutating and printing NdArray contents.

a = nn.NdArray((2, 3, 4))
print(a.data)  # contents are uninitialized at this point

# -
print("[Substituting random values]")
a.data = np.random.randn(*a.shape)
print(a.data)
print("[Slicing]")
a.data[0, :, ::2] = 0  # numpy-style slice assignment works on .data
print(a.data)
# -
a.fill(1)  # Filling all values with one.
print(a.data)
# -
# Construct an NdArray directly from an existing numpy array.
b = nn.NdArray.from_numpy_array(np.ones(a.shape))
print(b.data)
# ## Variable
def test_ndarray_dot(seed, shape, is_dynamic):
    """F.dot across NdArray/Variable operand combinations.

    Covers imperative (NdArray) execution, graph (Variable) execution in both
    static and auto-forward modes, preallocated output buffers, and rejection
    of an output buffer with a wrong dtype.
    """
    rng = np.random.RandomState(seed)

    if not shape[0]:
        # Shapeless/scalar left operand built by hand.
        a1 = rng.randn()
        n1 = nn.NdArray()
        n1.cast(np.float32)[...] = a1
        v1 = nn.Variable()
        v1.data.cast(np.float32)[...] = a1
    else:
        a1 = rng.randn(*shape[0]).astype(np.float32)
        n1 = nn.NdArray.from_numpy_array(a1)
        v1 = nn.Variable.from_numpy_array(a1)

    if not shape[1]:
        # Shapeless/scalar right operand built by hand.
        a2 = rng.randn()
        n2 = nn.NdArray()
        n2.cast(np.float32)[...] = a2
        v2 = nn.Variable()
        v2.data.cast(np.float32)[...] = a2
    else:
        a2 = rng.randn(*shape[1]).astype(np.float32)
        n2 = nn.NdArray.from_numpy_array(a2)
        v2 = nn.Variable.from_numpy_array(a2)

    # Reference result computed on raw numpy inputs.
    ref = F.dot(a1, a2)

    ans1_1 = F.dot(n1, n2)
    ans1_2 = F.dot(n1, v2)
    ans1_3 = F.dot(v1, n2)
    assert_allclose(ans1_1.data, ref, atol=1e-3)
    assert_allclose(ans1_2.data, ref, atol=1e-3)
    assert_allclose(ans1_3.data, ref, atol=1e-3)
    with nn.auto_forward(is_dynamic):
        ans1_4 = F.dot(v1, v2)
        # BUG FIX: forward() must be called in *static* mode (auto-forward
        # off); the original called it only when is_dynamic (where it is
        # redundant) and skipped the assertion entirely in static mode —
        # inconsistent with test_scalar_dot and the out1_4 check below.
        if not is_dynamic:
            ans1_4.forward()
        assert_allclose(ans1_4.d, ref, atol=1e-3)

    out = ref.copy()
    F.dot(a1, a2, out)
    assert_allclose(out, ref, atol=1e-3)

    out1_1 = nn.NdArray(ans1_1.shape)
    out1_1.cast(np.float32)
    F.dot(n1, n2, out1_1)
    assert_allclose(out1_1.data, ref, atol=1e-3)

    out1_2 = nn.NdArray(ans1_2.shape)
    out1_2.cast(np.float32)
    F.dot(n1, v2, out1_2)
    assert_allclose(out1_2.data, ref, atol=1e-3)

    out1_3 = nn.NdArray(ans1_3.shape)
    out1_3.cast(np.float32)
    F.dot(v1, n2, out1_3)
    assert_allclose(out1_3.data, ref, atol=1e-3)

    out1_4 = nn.Variable(ans1_4.shape)
    out1_4.data.cast(np.float32)
    with nn.auto_forward(is_dynamic):
        F.dot(v1, v2, out1_4)
        if not is_dynamic:
            out1_4.forward()
        assert_allclose(out1_4.d, ref, atol=1e-3)

    # Ndarray with a wrong dtype
    out2_1 = nn.NdArray(ref.shape)
    out2_1.cast(int)
    out2_2 = nn.Variable(ref.shape)
    out2_2.data.cast(int)
    # should not exec
    # NOTE(review): inside one pytest.raises block only statements up to the
    # first raising call execute — the remaining F.dot cases are unreached.
    with pytest.raises(ValueError) as excinfo:
        F.dot(n1, n2, out2_1)
        F.dot(n1, v2, out2_1)
        F.dot(v1, n2, out2_1)
        F.dot(v1, v2, out2_2)
Beispiel #20
0
def train():
    """Train an X-UMX/UMX source-separation model on MUSDB18.

    Sets up the (possibly multi-process) context, monitors, data iterators,
    solver, LR scheduler (ReduceLROnPlateau) and early stopping, then runs
    the epoch loop with a training phase and a chunked validation phase,
    saving the best model on rank 0.
    """
    # Check NNabla version
    if utils.get_nnabla_version_integer() < 11900:
        raise ValueError(
            'Please update the nnabla version to v1.19.0 or latest version since memory efficiency of core engine is improved in v1.19.0'
        )

    parser, args = get_train_args()

    # Get context.
    ctx = get_extension_context(args.context, device_id=args.device_id)
    comm = CommunicatorWrapper(ctx)
    nn.set_default_context(comm.ctx)
    ext = import_extension_module(args.context)

    # Monitors
    # setting up monitors for logging
    monitor_path = args.output
    monitor = Monitor(monitor_path)

    monitor_best_epoch = MonitorSeries('Best epoch', monitor, interval=1)
    monitor_traing_loss = MonitorSeries('Training loss', monitor, interval=1)
    monitor_validation_loss = MonitorSeries('Validation loss',
                                            monitor,
                                            interval=1)
    monitor_lr = MonitorSeries('learning rate', monitor, interval=1)
    monitor_time = MonitorTimeElapsed("training time per iteration",
                                      monitor,
                                      interval=1)

    if comm.rank == 0:
        if not os.path.isdir(args.output):
            os.makedirs(args.output)

    # Initialize DataIterator for MUSDB18.
    train_source, valid_source, args = load_datasources(parser, args)

    train_iter = data_iterator(
        train_source,
        args.batch_size,
        RandomState(args.seed),
        with_memory_cache=False,
    )

    valid_iter = data_iterator(
        valid_source,
        1,
        RandomState(args.seed),
        with_memory_cache=False,
    )

    if comm.n_procs > 1:
        # Shard both iterators so each process sees a distinct data slice.
        train_iter = train_iter.slice(rng=None,
                                      num_of_slices=comm.n_procs,
                                      slice_pos=comm.rank)

        valid_iter = valid_iter.slice(rng=None,
                                      num_of_slices=comm.n_procs,
                                      slice_pos=comm.rank)

    # Calculate maxiter per GPU device.
    # Change max_iter, learning_rate and weight_decay according no. of gpu devices for multi-gpu training.
    default_batch_size = 16
    train_scale_factor = (comm.n_procs * args.batch_size) / default_batch_size
    max_iter = int((train_source._size // args.batch_size) // comm.n_procs)
    weight_decay = args.weight_decay * train_scale_factor
    args.lr = args.lr * train_scale_factor

    # Calculate the statistics (mean and variance) of the dataset
    scaler_mean, scaler_std = utils.get_statistics(args, train_source)

    # clear cache memory
    ext.clear_memory_cache()

    max_bin = utils.bandwidth_to_max_bin(train_source.sample_rate, args.nfft,
                                         args.bandwidth)

    # Get X-UMX/UMX computation graph and variables as namedtuple
    model = get_model(args, scaler_mean, scaler_std, max_bin=max_bin)

    # Create Solver and set parameters.
    solver = S.Adam(args.lr)
    solver.set_parameters(nn.get_parameters())

    # Initialize Early Stopping
    es = utils.EarlyStopping(patience=args.patience)

    # Initialize LR Scheduler (ReduceLROnPlateau)
    lr_scheduler = ReduceLROnPlateau(lr=args.lr,
                                     factor=args.lr_decay_gamma,
                                     patience=args.lr_decay_patience)
    best_epoch = 0

    # AverageMeter for mean loss calculation over the epoch
    losses = utils.AverageMeter()

    # Training loop.
    for epoch in trange(args.epochs):
        # TRAINING
        losses.reset()
        for batch in range(max_iter):
            model.mixture_audio.d, model.target_audio.d = train_iter.next()
            solver.zero_grad()
            model.loss.forward(clear_no_need_grad=True)
            if comm.n_procs > 1:
                # Overlap gradient all-reduce with backward computation.
                all_reduce_callback = comm.get_all_reduce_callback()
                model.loss.backward(clear_buffer=True,
                                    communicator_callbacks=all_reduce_callback)
            else:
                model.loss.backward(clear_buffer=True)
            solver.weight_decay(weight_decay)
            solver.update()
            losses.update(model.loss.d.copy(), args.batch_size)
        training_loss = losses.get_avg()

        # clear cache memory
        ext.clear_memory_cache()

        # VALIDATION
        losses.reset()
        for batch in range(int(valid_source._size // comm.n_procs)):
            x, y = valid_iter.next()
            dur = int(valid_source.sample_rate * args.valid_dur)
            sp, cnt = 0, 0
            loss_tmp = nn.NdArray()
            loss_tmp.zero()
            # Evaluate the long validation track in chunks of `dur` samples
            # and average the per-chunk losses.
            while 1:
                model.vmixture_audio.d = x[Ellipsis, sp:sp + dur]
                model.vtarget_audio.d = y[Ellipsis, sp:sp + dur]
                model.vloss.forward(clear_no_need_grad=True)
                cnt += 1
                sp += dur
                loss_tmp += model.vloss.data
                if x[Ellipsis,
                     sp:sp + dur].shape[-1] < dur or x.shape[-1] == cnt * dur:
                    break
            loss_tmp = loss_tmp / cnt
            if comm.n_procs > 1:
                comm.all_reduce(loss_tmp, division=True, inplace=True)
            losses.update(loss_tmp.data.copy(), 1)
        validation_loss = losses.get_avg()

        # clear cache memory
        ext.clear_memory_cache()

        lr = lr_scheduler.update_lr(validation_loss, epoch=epoch)
        solver.set_learning_rate(lr)
        stop = es.step(validation_loss)

        if comm.rank == 0:
            monitor_best_epoch.add(epoch, best_epoch)
            monitor_traing_loss.add(epoch, training_loss)
            monitor_validation_loss.add(epoch, validation_loss)
            monitor_lr.add(epoch, lr)
            monitor_time.add(epoch)

            if validation_loss == es.best:
                best_epoch = epoch
                # save best model
                if args.umx_train:
                    nn.save_parameters(os.path.join(args.output,
                                                    'best_umx.h5'))
                else:
                    nn.save_parameters(
                        os.path.join(args.output, 'best_xumx.h5'))

        if args.umx_train:
            # Early stopping for UMX after `args.patience` (140) number of epochs
            if stop:
                print("Apply Early Stopping")
                break
Beispiel #21
0
 def __init__(self, decay):
     """Hold a decay rate and a buffer for a smoothed (shadow) value.

     Parameters
     ----------
     decay : float
         Decay rate; presumably in [0, 1] — TODO confirm with callers.
     """
     self.decay = decay
     # Buffer for the smoothed value; shape/contents are set elsewhere.
     self.shadow_variable = nn.NdArray()
Beispiel #22
0
def train():
    """
    Main script.

    Trains a DeepLabv3+ segmentation model from a pretrained checkpoint,
    optionally distributed over multiple devices, with periodic validation
    (loss, error and optionally mean IOU), LR warmup/decay, and
    checkpoint/parameter saving on device 0.
    """

    args = get_args()

    _ = nn.load_parameters(args.pretrained_model_path)
    if args.fine_tune:
        # Drop the classification head so it is re-initialized for this task.
        nnabla.parameter.pop_parameter('decoder/logits/affine/conv/W')
        nnabla.parameter.pop_parameter('decoder/logits/affine/conv/b')

    n_train_samples = args.train_samples
    n_val_samples = args.val_samples
    distributed = args.distributed
    compute_acc = args.compute_acc

    if distributed:
        # Communicator and Context
        from nnabla.ext_utils import get_extension_context
        extension_module = "cudnn"
        ctx = get_extension_context(
            extension_module, type_config=args.type_config)
        comm = C.MultiProcessDataParalellCommunicator(ctx)
        comm.init()
        n_devices = comm.size
        mpi_rank = comm.rank
        device_id = mpi_rank
        ctx.device_id = str(device_id)
        nn.set_default_context(ctx)
    else:
        # Get context.
        from nnabla.ext_utils import get_extension_context
        extension_module = args.context
        if args.context is None:
            extension_module = 'cpu'
        logger.info("Running in %s" % extension_module)
        ctx = get_extension_context(
            extension_module, device_id=args.device_id, type_config=args.type_config)
        nn.set_default_context(ctx)
        n_devices = 1
        device_id = 0

    # training data
    data = data_iterator_segmentation(
            args.train_samples, args.batch_size, args.train_dir, args.train_label_dir, target_width=args.image_width, target_height=args.image_height)
    # validation data
    vdata = data_iterator_segmentation(args.val_samples, args.batch_size, args.val_dir,
                                       args.val_label_dir, target_width=args.image_width, target_height=args.image_height)

    if distributed:
        data = data.slice(
            rng=None, num_of_slices=n_devices, slice_pos=device_id)
        vdata = vdata.slice(
            rng=None, num_of_slices=n_devices, slice_pos=device_id)
    num_classes = args.num_class

    # Workaround to start with the same initialized weights for all workers.
    np.random.seed(313)
    t_model = get_model(
        args, test=False)
    t_model.pred.persistent = True  # Not clearing buffer of pred in backward
    t_pred2 = t_model.pred.unlinked()
    t_e = F.sum(F.top_n_error(t_pred2, t_model.label, axis=1)
                * t_model.mask) / F.sum(t_model.mask)

    v_model = get_model(
        args, test=True)
    v_model.pred.persistent = True  # Not clearing buffer of pred in forward
    v_pred2 = v_model.pred.unlinked()
    # BUG FIX: normalize the validation error by the *validation* mask sum.
    # The original divided by F.sum(t_model.mask) (the training mask), an
    # apparent copy-paste from the t_e definition above.
    v_e = F.sum(F.top_n_error(v_pred2, v_model.label, axis=1)
                * v_model.mask) / F.sum(v_model.mask)

    # Create Solver
    solver = S.Momentum(args.learning_rate, 0.9)
    solver.set_parameters(nn.get_parameters())

    # Load checkpoint
    start_point = 0
    if args.checkpoint is not None:
        # load weights and solver state info from specified checkpoint file.
        start_point = load_checkpoint(args.checkpoint, solver)

    # Setting warmup.
    base_lr = args.learning_rate / n_devices
    warmup_iter = int(1. * n_train_samples /
                      args.batch_size / args.accum_grad / n_devices) * args.warmup_epoch
    warmup_slope = base_lr * (n_devices - 1) / warmup_iter
    solver.set_learning_rate(base_lr)

    # Create monitor
    import nnabla.monitor as M
    monitor = M.Monitor(args.monitor_path)
    monitor_loss = M.MonitorSeries("Training loss", monitor, interval=10)
    monitor_err = M.MonitorSeries("Training error", monitor, interval=10)
    monitor_vloss = M.MonitorSeries("Validation loss", monitor, interval=1)
    monitor_verr = M.MonitorSeries("Validation error", monitor, interval=1)
    monitor_time = M.MonitorTimeElapsed("Training time", monitor, interval=10)
    monitor_miou = M.MonitorSeries("mean IOU", monitor, interval=10)
    monitor_vtime = M.MonitorTimeElapsed(
        "Validation time", monitor, interval=1)

    # save_nnp
    contents = save_nnp({'x': v_model.image}, {
                        'y': v_model.pred}, args.batch_size)
    save.save(os.path.join(args.model_save_path,
                           'Deeplabv3plus_result_epoch0.nnp'), contents, variable_batch_size=False)

    # Training loop
    for i in range(start_point, int(args.max_iter / n_devices)):
        # Save parameters
        if i % (args.model_save_interval // n_devices) == 0 and device_id == 0:
            save_checkpoint(args.model_save_path, i, solver)
        # Validation
        if i % (args.val_interval // n_devices) == 0 and i != 0:
            vmiou_local = 0.
            val_iter_local = n_val_samples // args.batch_size
            vl_local = nn.NdArray()
            vl_local.zero()
            ve_local = nn.NdArray()
            ve_local.zero()
            for j in range(val_iter_local):
                images, labels, masks = vdata.next()
                v_model.image.d = images
                v_model.label.d = labels
                v_model.mask.d = masks
                v_model.image.data.cast(np.float32, ctx)
                v_model.label.data.cast(np.int32, ctx)
                v_model.loss.forward(clear_buffer=True)
                v_e.forward(clear_buffer=True)
                vl_local += v_model.loss.data
                ve_local += v_e.data
                # Mean IOU computation
                if compute_acc:
                    vmiou_local += compute_miou(num_classes, labels,
                                                np.argmax(v_model.pred.d, axis=1), masks)

            vl_local /= val_iter_local
            ve_local /= val_iter_local
            if compute_acc:
                vmiou_local /= val_iter_local
                vmiou_ndarray = nn.NdArray.from_numpy_array(
                    np.array(vmiou_local))
            if distributed:
                comm.all_reduce(vl_local, division=True, inplace=True)
                comm.all_reduce(ve_local, division=True, inplace=True)
                if compute_acc:
                    comm.all_reduce(vmiou_ndarray, division=True, inplace=True)

            if device_id == 0:
                monitor_vloss.add(i * n_devices, vl_local.data.copy())
                monitor_verr.add(i * n_devices, ve_local.data.copy())
                if compute_acc:
                    monitor_miou.add(i * n_devices, vmiou_local)
                monitor_vtime.add(i * n_devices)

        # Training
        l = 0.0
        e = 0.0
        solver.zero_grad()

        e_acc = nn.NdArray(t_e.shape)
        e_acc.zero()
        l_acc = nn.NdArray(t_model.loss.shape)
        l_acc.zero()
        # Gradient accumulation loop
        for j in range(args.accum_grad):
            images, labels, masks = data.next()
            t_model.image.d = images
            t_model.label.d = labels
            t_model.mask.d = masks
            t_model.image.data.cast(np.float32, ctx)
            t_model.label.data.cast(np.int32, ctx)
            t_model.loss.forward(clear_no_need_grad=True)
            t_model.loss.backward(clear_buffer=True)  # Accumulating gradients
            t_e.forward(clear_buffer=True)
            e_acc += t_e.data
            l_acc += t_model.loss.data

        # AllReduce
        if distributed:
            params = [x.grad for x in nn.get_parameters().values()]
            comm.all_reduce(params, division=False, inplace=False)
            comm.all_reduce(l_acc, division=True, inplace=True)
            comm.all_reduce(e_acc, division=True, inplace=True)
        solver.scale_grad(1./args.accum_grad)
        solver.weight_decay(args.weight_decay)
        solver.update()

        # Linear Warmup
        if i <= warmup_iter:
            lr = base_lr + warmup_slope * i
            solver.set_learning_rate(lr)

        if distributed:
            # Synchronize by averaging the weights over devices using allreduce
            if (i+1) % args.sync_weight_every_itr == 0:
                weights = [x.data for x in nn.get_parameters().values()]
                comm.all_reduce(weights, division=True, inplace=True)

        if device_id == 0:
            monitor_loss.add(
                i * n_devices, (l_acc / args.accum_grad).data.copy())
            monitor_err.add(
                i * n_devices, (e_acc / args.accum_grad).data.copy())
            monitor_time.add(i * n_devices)

        # Learning rate decay at scheduled iter --> changed to poly learning rate decay policy
        # if i in args.learning_rate_decay_at:
        solver.set_learning_rate(base_lr * ((1 - i / args.max_iter)**0.1))

    if device_id == 0:
        nn.save_parameters(os.path.join(args.model_save_path,
                                        'param_%06d.h5' % args.max_iter))

    contents = save_nnp({'x': v_model.image}, {
                        'y': v_model.pred}, args.batch_size)
    save.save(os.path.join(args.model_save_path,
                           'Deeplabv3plus_result.nnp'), contents, variable_batch_size=False)
Beispiel #23
0
def train() -> None:
    """Train the source-separation model (OpenUnmix_CrossNet) on MUSDB.

    Builds separate training and validation graphs, runs the epoch loop
    (optionally data-parallel across devices), reduces the learning rate
    on validation-loss plateaus, applies early stopping, and — on rank 0
    only — saves the best parameters to ``<args.output>/best_xumx.h5``.
    """
    # Check NNabla version
    if utils.get_nnabla_version_integer() < 11900:
        raise ValueError(
            'Please update the nnabla version to v1.19.0 or latest version since memory efficiency of core engine is improved in v1.19.0'
        )

    parser, args = get_train_args()

    # Get context.
    ctx = get_extension_context(args.context, device_id=args.device_id)
    comm = CommunicatorWrapper(ctx)
    nn.set_default_context(comm.ctx)
    ext = import_extension_module(args.context)

    # Monitors
    # setting up monitors for logging
    monitor_path = args.output
    monitor = Monitor(monitor_path)

    monitor_best_epoch = MonitorSeries('Best epoch', monitor, interval=1)
    monitor_traing_loss = MonitorSeries('Training loss', monitor, interval=1)
    monitor_validation_loss = MonitorSeries('Validation loss',
                                            monitor,
                                            interval=1)
    monitor_lr = MonitorSeries('learning rate', monitor, interval=1)
    monitor_time = MonitorTimeElapsed("training time per iteration",
                                      monitor,
                                      interval=1)

    # Only rank 0 prints and creates the output directory.
    if comm.rank == 0:
        print("Mixing coef. is {}, i.e., MDL = {}*TD-Loss + FD-Loss".format(
            args.mcoef, args.mcoef))
        if not os.path.isdir(args.output):
            os.makedirs(args.output)

    # Initialize DataIterator for MUSDB.
    train_source, valid_source, args = load_datasources(parser, args)

    train_iter = data_iterator(train_source,
                               args.batch_size,
                               RandomState(args.seed),
                               with_memory_cache=False,
                               with_file_cache=False)

    valid_iter = data_iterator(valid_source,
                               1,
                               RandomState(args.seed),
                               with_memory_cache=False,
                               with_file_cache=False)

    # Shard both iterators across processes for data-parallel training.
    if comm.n_procs > 1:
        train_iter = train_iter.slice(rng=None,
                                      num_of_slices=comm.n_procs,
                                      slice_pos=comm.rank)

        valid_iter = valid_iter.slice(rng=None,
                                      num_of_slices=comm.n_procs,
                                      slice_pos=comm.rank)

    # Calculate maxiter per GPU device.
    max_iter = int((train_source._size // args.batch_size) // comm.n_procs)
    # NOTE(review): decay is scaled by the number of processes, presumably to
    # compensate for gradient averaging across devices — confirm.
    weight_decay = args.weight_decay * comm.n_procs

    print("max_iter", max_iter)

    # Calculate the statistics (mean and variance) of the dataset
    scaler_mean, scaler_std = utils.get_statistics(args, train_source)

    max_bin = utils.bandwidth_to_max_bin(train_source.sample_rate, args.nfft,
                                         args.bandwidth)

    unmix = OpenUnmix_CrossNet(input_mean=scaler_mean,
                               input_scale=scaler_std,
                               nb_channels=args.nb_channels,
                               hidden_size=args.hidden_size,
                               n_fft=args.nfft,
                               n_hop=args.nhop,
                               max_bin=max_bin)

    # Create input variables.
    mixture_audio = nn.Variable([args.batch_size] +
                                list(train_source._get_data(0)[0].shape))
    target_audio = nn.Variable([args.batch_size] +
                               list(train_source._get_data(0)[1].shape))

    # Validation uses batch size 1 and fixed-duration audio chunks.
    vmixture_audio = nn.Variable(
        [1] + [2, valid_source.sample_rate * args.valid_dur])
    vtarget_audio = nn.Variable([1] +
                                [8, valid_source.sample_rate * args.valid_dur])

    # create training graph
    mix_spec, M_hat, pred = unmix(mixture_audio)
    Y = Spectrogram(*STFT(target_audio, n_fft=unmix.n_fft, n_hop=unmix.n_hop),
                    mono=(unmix.nb_channels == 1))
    # Combined objective: frequency-domain MSE plus mcoef-weighted
    # time-domain SDR loss.
    loss_f = mse_loss(mix_spec, M_hat, Y)
    loss_t = sdr_loss(mixture_audio, pred, target_audio)
    loss = args.mcoef * loss_t + loss_f
    loss.persistent = True  # keep the loss buffer alive across clears

    # Create Solver and set parameters.
    solver = S.Adam(args.lr)
    solver.set_parameters(nn.get_parameters())

    # create validation graph
    vmix_spec, vM_hat, vpred = unmix(vmixture_audio, test=True)
    vY = Spectrogram(*STFT(vtarget_audio, n_fft=unmix.n_fft,
                           n_hop=unmix.n_hop),
                     mono=(unmix.nb_channels == 1))
    vloss_f = mse_loss(vmix_spec, vM_hat, vY)
    vloss_t = sdr_loss(vmixture_audio, vpred, vtarget_audio)
    vloss = args.mcoef * vloss_t + vloss_f
    vloss.persistent = True

    # Initialize Early Stopping
    es = utils.EarlyStopping(patience=args.patience)

    # Initialize LR Scheduler (ReduceLROnPlateau)
    lr_scheduler = ReduceLROnPlateau(lr=args.lr,
                                     factor=args.lr_decay_gamma,
                                     patience=args.lr_decay_patience)
    best_epoch = 0

    # Training loop.
    for epoch in trange(args.epochs):
        # TRAINING
        losses = utils.AverageMeter()
        for batch in range(max_iter):
            mixture_audio.d, target_audio.d = train_iter.next()
            solver.zero_grad()
            loss.forward(clear_no_need_grad=True)
            # In the distributed case gradients are all-reduced during
            # backward via the communicator callback.
            if comm.n_procs > 1:
                all_reduce_callback = comm.get_all_reduce_callback()
                loss.backward(clear_buffer=True,
                              communicator_callbacks=all_reduce_callback)
            else:
                loss.backward(clear_buffer=True)
            solver.weight_decay(weight_decay)
            solver.update()
            losses.update(loss.d.copy(), args.batch_size)
        training_loss = losses.avg

        # clear cache memory
        ext.clear_memory_cache()

        # VALIDATION
        vlosses = utils.AverageMeter()
        for batch in range(int(valid_source._size // comm.n_procs)):
            x, y = valid_iter.next()
            dur = int(valid_source.sample_rate * args.valid_dur)
            sp, cnt = 0, 0
            loss_tmp = nn.NdArray()  # accumulator for per-chunk losses
            loss_tmp.zero()
            # Slide a window of `dur` samples over the track and average the
            # per-chunk validation losses; a trailing chunk shorter than
            # `dur` is not evaluated (the loop breaks before processing it).
            while 1:
                vmixture_audio.d = x[Ellipsis, sp:sp + dur]
                vtarget_audio.d = y[Ellipsis, sp:sp + dur]
                vloss.forward(clear_no_need_grad=True)
                cnt += 1
                sp += dur
                loss_tmp += vloss.data
                if x[Ellipsis,
                     sp:sp + dur].shape[-1] < dur or x.shape[-1] == cnt * dur:
                    break
            loss_tmp = loss_tmp / cnt
            if comm.n_procs > 1:
                comm.all_reduce(loss_tmp, division=True, inplace=True)
            vlosses.update(loss_tmp.data.copy(), 1)
        validation_loss = vlosses.avg

        # clear cache memory
        ext.clear_memory_cache()

        lr = lr_scheduler.update_lr(validation_loss, epoch=epoch)
        solver.set_learning_rate(lr)
        stop = es.step(validation_loss)

        if comm.rank == 0:
            # NOTE: best_epoch is logged before it may be updated below, so
            # this reports the best epoch as of the *previous* epoch.
            monitor_best_epoch.add(epoch, best_epoch)
            monitor_traing_loss.add(epoch, training_loss)
            monitor_validation_loss.add(epoch, validation_loss)
            monitor_lr.add(epoch, lr)
            monitor_time.add(epoch)

            if validation_loss == es.best:
                # save best model
                nn.save_parameters(os.path.join(args.output, 'best_xumx.h5'))
                best_epoch = epoch

        if stop:
            print("Apply Early Stopping")
            break
    def __next__(self):
        """Return the next batch of nnabla NdArrays from the DALI pipelines.

        Returns a list (one entry per GPU) of dicts mapping output-category
        names to ``nn.NdArray`` buffers, double-buffered so a returned batch
        stays valid while the next one is produced.  When
        ``self._stop_at_epoch`` is set, the final batch of an epoch is
        trimmed so that exactly ``self._size`` samples are yielded overall.
        """
        # Serve the prefetched first batch once, if present.
        if self._first_batch is not None:
            batch = self._first_batch
            self._first_batch = None
            return batch
        # End of epoch: optionally reset the pipelines.
        # NOTE(review): StopIteration is deliberately commented out, so
        # iteration continues past self._size even without auto-reset —
        # confirm this is intended.
        if self._counter >= self._size:
            if self._auto_reset:
                self.reset()
            # raise StopIteration
        # Gather outputs
        outputs = []
        for p in self._pipes:
            outputs.append(p._share_outputs())
        for i in range(self._num_gpus):
            device_id = self._pipes[i].device_id
            # initialize dict for all output categories
            category_outputs = dict()
            # segregate outputs into categories
            for j, out in enumerate(outputs[i]):
                category_outputs[self.output_map[j]] = out

            # Change DALI TensorLists into Tensors
            category_tensors = dict()
            category_shapes = dict()
            for category, out in category_outputs.items():
                category_tensors[category] = out.as_tensor()
                category_shapes[category] = category_tensors[category].shape()

            # If we did not yet allocate memory for that batch, do it now
            if self._data_batches[i][self._current_data_batch] is None:
                self._category_nnabla_type = dict()
                self._category_device = dict()
                nnabla_gpu_device = get_extension_context('cudnn',
                                                          device_id=device_id)
                nnabla_cpu_device = get_extension_context('cpu')
                # check category and device
                for category in self._output_categories:
                    self._category_nnabla_type[category] = np.dtype(
                        category_tensors[category].dtype())
                    # GPU-resident DALI tensors get a cudnn context,
                    # everything else falls back to the CPU context.
                    if type(category_tensors[category]) is TensorGPU:
                        self._category_device[category] = nnabla_gpu_device
                    else:
                        self._category_device[category] = nnabla_cpu_device

                nnabla_tensors = dict()
                for category in self._output_categories:
                    nnabla_tensors[category] = nn.NdArray(
                        category_shapes[category])

                self._data_batches[i][
                    self._current_data_batch] = nnabla_tensors
            else:
                # Reuse the previously allocated buffers for this slot.
                nnabla_tensors = self._data_batches[i][
                    self._current_data_batch]

            # Copy data from DALI Tensors to nnabla tensors
            for category, tensor in category_tensors.items():
                feed_ndarray(tensor,
                             nnabla_tensors[category],
                             dtype=self._category_nnabla_type[category],
                             ctx=self._category_device[category])

        # Release DALI's outputs and schedule the next pipeline run.
        for p in self._pipes:
            p._release_outputs()
            p._run()

        copy_db_index = self._current_data_batch
        # Change index for double buffering
        self._current_data_batch = (self._current_data_batch + 1) % 2
        self._counter += self._num_gpus * self.batch_size

        if (self._stop_at_epoch) and (self._counter > self._size):
            # First calculate how much data is required to return exactly self._size entries.
            diff = self._num_gpus * self.batch_size - \
                (self._counter - self._size)
            # Figure out how many GPUs to grab from.
            numGPUs_tograb = int(np.ceil(diff / self.batch_size))
            # Figure out how many results to grab from the last GPU (as a fractional GPU batch may be required to
            # bring us right up to self._size).
            mod_diff = diff % self.batch_size
            data_fromlastGPU = mod_diff if mod_diff else self.batch_size

            # Grab the relevant data.
            # 1) Grab everything from the relevant GPUs.
            # 2) Grab the right data from the last GPU.
            # 3) Append data together correctly and return.
            output = [
                db[copy_db_index]
                for db in self._data_batches[0:numGPUs_tograb]
            ]
            # Shallow-copy the last dict so slicing does not mutate the
            # shared double buffer.
            output[-1] = output[-1].copy()
            for category in self._output_categories:
                output[-1][category] = output[-1][category][0:data_fromlastGPU]

            return output

        return [db[copy_db_index] for db in self._data_batches]
Beispiel #25
0
def CNN_run(args, ops, alphas_dict):
    """
        Based on the given model architecture,
        construct CNN and execute architecture-search training on CIFAR-10.

        Model weights are trained on the first half of the training set;
        architecture parameters (the "alpha_*" variables) are trained on
        the second half, optionally using a second-order finite-difference
        approximation of the architecture gradient.

        input:
            args: arguments set by user.
            ops: operations used in the network.
            alphas_dict: a dictionary containing architecture parameters.
        output:
            alphas_dict: the architecture parameters after training.
    """

    # Split CIFAR-10 training data: first half for weights, second half
    # for architecture parameters.
    data_iterator = data_iterator_cifar10
    all_data = data_iterator(args.batch_size, True)
    tdata = all_data.slice(rng=None, slice_start=0, slice_end=25000)
    vdata = all_data.slice(rng=None, slice_start=25000, slice_end=50000)

    # CIFAR10 statistics, mean and variance
    CIFAR_MEAN = np.reshape([0.49139968, 0.48215827, 0.44653124], (1, 3, 1, 1))
    CIFAR_STD = np.reshape([0.24703233, 0.24348505, 0.26158768], (1, 3, 1, 1))

    channels, image_height, image_width = 3, 32, 32
    batch_size = args.batch_size
    initial_model_lr = args.model_lr

    one_epoch = tdata.size // batch_size
    max_iter = args.epoch * one_epoch

    # Create monitor.
    monitor = Monitor(args.monitor_path)
    monitor_loss = MonitorSeries("Training loss", monitor, interval=100)
    monitor_err = MonitorSeries("Training error", monitor, interval=100)
    monitor_vloss = MonitorSeries("Validation loss", monitor, interval=100)
    monitor_verr = MonitorSeries("Validation error", monitor, interval=100)

    # prepare variables and graph used for training
    image_train = nn.Variable(
        (batch_size, channels, image_height, image_width))
    label_train = nn.Variable((batch_size, 1))
    input_image_train = {"image": image_train, "label": label_train}
    pred_train = construct_networks(args, ops, image_train, test=False)
    loss_train = loss_function(pred_train, label_train)

    # prepare solvers for model parameters
    # (architecture parameters are identified by the "alpha_" name prefix)
    model_params_dict = \
        {k: v for k, v in nn.get_parameters().items() if "alpha_" not in k}
    solver_model = S.Momentum(initial_model_lr)
    solver_model.set_parameters(
        {
            k: v
            for k, v in nn.get_parameters().items()
            if k in model_params_dict.keys()
        },
        reset=False,
        retain_state=True)

    # prepare solvers for architecture parameters
    solver_archs = S.Adam(alpha=args.arch_lr, beta1=0.5, beta2=0.999)
    solver_archs.set_parameters(
        {
            k: v
            for k, v in nn.get_parameters().items() if k in alphas_dict.keys()
        },
        reset=False,
        retain_state=True)

    # Training-loop
    for i in range(max_iter):

        # Update Model Parameters.

        if args.second_order:
            # store the weights before update.
            original_weights = {
                k: nn.Variable(v.shape, need_grad=True).apply(
                    data=nn.NdArray(v.shape).copy_from(v.data))
                for k, v in nn.get_parameters().items() if "alpha_" not in k
            }

            # gradients refuge
            # (zero-initialized accumulator for the architecture gradient)
            accumulated_gradient = \
                {k: nn.Variable(v.shape).apply(d=0)
                 for k, v in alphas_dict.items()}

        # Normalize the training batch with dataset statistics.
        image, label = tdata.next()
        image = image / 255.0
        image = (image - CIFAR_MEAN) / CIFAR_STD
        input_image_train["image"].d = image
        input_image_train["label"].d = label
        loss_train.forward()

        e = categorical_error(pred_train.d, input_image_train["label"].d)
        monitor_loss.add(i, loss_train.d.copy())
        monitor_err.add(i, e)

        if args.lr_control_model:
            new_lr = learning_rate_scheduler(i, max_iter, initial_model_lr, 0)
            solver_model.set_learning_rate(new_lr)

        solver_model.zero_grad()
        loss_train.backward(clear_buffer=True)

        # Optional gradient clipping by norm on the model weights.
        if args.with_grad_clip_model:
            for k, v in model_params_dict.items():
                v.grad.copy_from(
                    F.clip_by_norm(v.grad, args.grad_clip_value_model))

        solver_model.weight_decay(args.weight_decay_model)
        solver_model.update()  # weights update ( w -> w')

        if args.second_order:
            # snapshot the updated weights w' so they can be restored after
            # the finite-difference perturbations below.
            updated_weights = {
                k: nn.Variable(v.shape, need_grad=True).apply(
                    data=nn.NdArray(v.shape).copy_from(v.data))
                for k, v in nn.get_parameters().items() if "alpha_" not in k
            }

        # Update Architecture Parameters.

        ve, vloss = 0., 0.
        v_image, v_label = vdata.next()
        v_image = v_image / 255.0
        v_image = (v_image - CIFAR_MEAN) / CIFAR_STD
        input_image_train["image"].d = v_image
        input_image_train["label"].d = v_label
        # compute Loss_on_valid(w', alpha)
        loss_train.forward(clear_no_need_grad=True)

        ve = categorical_error(pred_train.d, input_image_train["label"].d)
        monitor_vloss.add(i, loss_train.d.copy())
        monitor_verr.add(i, ve)

        solver_archs.zero_grad()
        solver_model.zero_grad()
        loss_train.backward(clear_buffer=True)  # its gradient is stored

        if args.second_order:
            # first-order term of the architecture gradient
            accumulated_gradient = store_gradient(accumulated_gradient,
                                                  alphas_dict,
                                                  coeff=1.)

            # grad_alpha_L_val(w', alpha).  Note that gradient stored into .data
            # (these hold grad_w L_val(w', alpha), used to perturb w below)
            delta_gradient_w = {
                k: nn.Variable(v.shape).apply(data=nn.NdArray(
                    v.shape).copy_from(v.grad),
                                              need_grad=True)
                for k, v in nn.get_parameters().items() if "alpha_" not in k
            }

            # finite-difference step size, scaled inversely with the
            # gradient norm.
            epsilon = 0.01 / np.sum(
                [np.linalg.norm(v.d) for v in delta_gradient_w.values()])

            coeff = 1.0 * epsilon
            # w -> w+ (= w + epsilon*grad_Loss_on_val(w', alpha))
            weight_modify(original_weights, delta_gradient_w,
                          model_params_dict, coeff)

            input_image_train["image"].d = image  # reuse the same data
            input_image_train["label"].d = label

            # compute Loss_on_train(w+, alpha)
            loss_train.forward()
            solver_archs.zero_grad()
            solver_model.zero_grad()
            loss_train.backward(clear_buffer=True)  # its gradient is stored

            # accumulate currently registered gradient
            # (subtract eta/(2*epsilon) * grad at w+)
            coeff = (-1.) * args.eta / 2. * epsilon
            accumulated_gradient = store_gradient(accumulated_gradient,
                                                  alphas_dict, coeff)

            coeff = -1.0 * epsilon
            # w -> w- (= w - epsilon*grad_Loss_on_val(w', alpha))
            weight_modify(original_weights, delta_gradient_w,
                          model_params_dict, coeff)

            # compute Loss_on_train(w-, alpha)
            loss_train.forward()
            solver_archs.zero_grad()
            solver_model.zero_grad()
            loss_train.backward(clear_buffer=True)  # its gradient is stored

            # accumulate currently registered gradient again
            # (add eta/(2*epsilon) * grad at w-, completing the central
            # difference)
            coeff = (+1.) * args.eta / 2. * epsilon
            accumulated_gradient = store_gradient(accumulated_gradient,
                                                  alphas_dict, coeff)

            # replace the weights
            # write the accumulated gradient into the alpha parameters and
            # restore the updated weights w'.
            for k, v in alphas_dict.items():
                nn.parameter.set_parameter(
                    k,
                    nn.Variable(v.shape).apply(data=v.data,
                                               grad=accumulated_gradient[k],
                                               need_grad=True))
            for k, v in model_params_dict.items():
                nn.parameter.set_parameter(
                    k,
                    nn.Variable(v.shape).apply(data=updated_weights[k].data,
                                               need_grad=True))

        solver_archs.weight_decay(args.weight_decay_archs)
        solver_archs.update()

        # Periodically print the current softmax-ed architecture weights.
        if i % 1000 == 0:
            for k, v in alphas_dict.items():
                keynames = k.split("_")
                print("\nParameters for {} cell, node {} to {};".format(
                    keynames[1], keynames[2], keynames[3]))
                show_ops_and_prob(v.d, ops)

    return alphas_dict