Example #1
    def test_transfer_learning(self):
        # forward
        x = tensor.Tensor(shape=(2, 3, 3, 3), device=gpu_dev)
        x.gaussian(0.0, 1.0)
        x1 = autograd.Conv2d(3, 1, 2)(x)
        y = autograd.Flatten()(x1)[0]
        y_t = tensor.Tensor(shape=(2, 4), device=gpu_dev)
        y_t.gaussian(0.0, 1.0)
        loss = autograd.MeanSquareError()(y, y_t)[0]
        # backward
        sgd = opt.SGD(lr=0.01)
        for p, gp in autograd.backward(loss):
            sgd.update(p, gp)
        sgd.step()

        # frontend
        model = sonnx.to_onnx([x], [y])
        # print('The model is:\n{}'.format(model))

        # backend
        sg_ir = sonnx.prepare(model, device=gpu_dev)
        # forward
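        # run the loaded graph without its final layer (last_layers=-1) so a new layer can be attached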
        x1 = sg_ir.run([x], last_layers=-1)[0]
        x2 = autograd.Conv2d(1, 1, 2)(x1)
        y_o = autograd.Flatten()(x2)[0]
        # backward
        y_ot = tensor.Tensor(shape=(2, 1), device=gpu_dev)
        y_ot.gaussian(0.0, 1.0)
        loss = autograd.MeanSquareError()(y_o, y_ot)[0]
        sgd = opt.SGD(lr=0.01)
        for p, gp in autograd.backward(loss):
            sgd.update(p, gp)
        sgd.step()
Example #2
    def test_retraining(self):
        # forward
        x = tensor.Tensor(shape=(2, 3, 3, 3), device=gpu_dev)
        x.gaussian(0.0, 1.0)
        x1 = autograd.Conv2d(3, 1, 2)(x)
        x2 = autograd.Conv2d(1, 1, 2)(x1)
        y = autograd.Flatten()(x2)[0]
        y_t = tensor.Tensor(shape=(2, 1), device=gpu_dev)
        y_t.gaussian(0.0, 1.0)
        loss = autograd.MeanSquareError()(y, y_t)[0]
        # backward
        sgd = opt.SGD(lr=0.01)
        for p, gp in autograd.backward(loss):
            sgd.update(p, gp)
        sgd.step()

        # frontend
        model = sonnx.to_onnx([x], [y])
        # print('The model is:\n{}'.format(model))

        # backend
        sg_ir = sonnx.prepare(model, device=gpu_dev)
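        # mark every tensor from the loaded graph as trainable so its weights are updated during retraining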
        for idx, tens in sg_ir.tensor_map.items():
            tens.requires_grad = True
            tens.stores_grad = True
            sg_ir.tensor_map[idx] = tens
        # forward
        y_o = sg_ir.run([x])[0]
        # backward
        loss = autograd.MeanSquareError()(y_o, y_t)[0]
        sgd = opt.SGD(lr=0.01)
        for p, gp in autograd.backward(loss):
            sgd.update(p, gp)
        sgd.step()
Example #3
def train(model,
          x,
          y,
          epochs=1,
          batch_size=64,
          dev=device.get_default_device()):
    batch_number = x.shape[0] // batch_size

    # create the optimizer once; re-creating it every batch would reset its internal state
    sgd = opt.SGD(lr=0.001)

    for i in range(epochs):
        for b in range(batch_number):
            l_idx = b * batch_size
            r_idx = (b + 1) * batch_size

            x_batch = tensor.Tensor(device=dev, data=x[l_idx:r_idx])
            target_batch = tensor.Tensor(device=dev, data=y[l_idx:r_idx])

            output_batch = model.forward(x_batch)
            # onnx_model = sonnx.to_onnx([x_batch], [y])
            # print('The model is:\n{}'.format(onnx_model))

            loss = autograd.softmax_cross_entropy(output_batch, target_batch)
            accuracy_rate = accuracy(tensor.to_numpy(output_batch),
                                     tensor.to_numpy(target_batch))

            for p, gp in autograd.backward(loss):
                sgd.update(p, gp)
            sgd.step()

            if b % 100 == 0:
                print("acc %6.2f, loss %6.2f" %
                      (accuracy_rate, tensor.to_numpy(loss)[0]))
    print("training completed")
    return x_batch, output_batch
Example #4
def onnx_to_singa(niter, use_cpu=False):
    if use_cpu:
        print("Using CPU")
        dev = device.get_default_device()
    else:
        print("Using GPU")
        dev = device.create_cuda_gpu()
    model = sonnx.load("mlp.onnx")
    backend = sonnx.prepare(model, device=dev)
    sgd = opt.SGD(0.1)
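    # `data` and `label` are assumed to be numpy arrays prepared earlier in the script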
    inputs = Tensor(
        data=data,
        device=dev,
        requires_grad=False,
        stores_grad=False,
        name="input",
    )
    target = Tensor(
        data=label,
        device=dev,
        requires_grad=False,
        stores_grad=False,
        name="target",
    )

    for i in range(niter):
        y = backend.run([inputs])[0]
        loss = autograd.softmax_cross_entropy(y, target)
        for p, gp in autograd.backward(loss):
            sgd.update(p, gp)
        loss_rate = tensor.to_numpy(loss)[0]
        accuracy_rate = accuracy(tensor.to_numpy(y), label)

        print("Iter {}, accurate={}, loss={}".format(i, accuracy_rate, loss_rate))
Example #5
def transfer_learning(sg_ir,
                      x,
                      y,
                      epochs=1,
                      batch_size=64,
                      dev=device.get_default_device()):
    batch_number = x.shape[0] // batch_size

    # create the optimizer once, outside the training loop
    sgd = opt.SGD(lr=0.07)

    trans_model = Trans(sg_ir, -1)
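    # Trans (defined elsewhere) is assumed to rebuild a model from sg_ir, reusing its layers up to index -1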

    for i in range(epochs):
        for b in range(batch_number):
            l_idx = b * batch_size
            r_idx = (b + 1) * batch_size

            x_batch = tensor.Tensor(device=dev, data=x[l_idx:r_idx])
            target_batch = tensor.Tensor(device=dev, data=y[l_idx:r_idx])
            output_batch = trans_model.forward(x_batch)

            loss = autograd.softmax_cross_entropy(output_batch, target_batch)
            accuracy_rate = accuracy(tensor.to_numpy(output_batch),
                                     tensor.to_numpy(target_batch))

            for p, gp in autograd.backward(loss):
                sgd.update(p, gp)
            sgd.step()

            if b % 100 == 0:
                print("acc %6.2f, loss %6.2f" %
                      (accuracy_rate, tensor.to_numpy(loss)[0]))
    print("transfer-learning completed")
    return trans_model
Example #6
    def test_exponential_decay_no_staircase_cpu(self):
        lr = opt.ExponentialDecay(0.1, 2, 0.5, False)
        sgd1 = opt.SGD(lr=lr)
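        # ExponentialDecay(0.1, 2, 0.5, staircase=False): lr = 0.1 * 0.5**(step / 2), decaying at every step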
        for i in range(5):
            np.testing.assert_array_almost_equal(
                tensor.to_numpy(sgd1.lr_value), [0.1 * 0.5**(i / 2)])
            sgd1.step()
Example #7
def train_resnet(DIST=True, graph=True, sequential=False, verbosity=0):

    # Define the hyperparameters for train_resnet
    niters = 100
    batch_size = 32
    sgd = opt.SGD(lr=0.1, momentum=0.9, weight_decay=1e-5)

    IMG_SIZE = 224

    # For distributed training, sequential has better throughput in the current version
    if DIST:
        sgd = opt.DistOpt(sgd)
        world_size = sgd.world_size
        local_rank = sgd.local_rank
        global_rank = sgd.global_rank
        sequential = True
    else:
        local_rank = 0
        world_size = 1
        global_rank = 0
        sequential = False

    dev = device.create_cuda_gpu_on(local_rank)

    tx = tensor.Tensor((batch_size, 3, IMG_SIZE, IMG_SIZE), dev)
    ty = tensor.Tensor((batch_size,), dev, tensor.int32)
    x = np.random.randn(batch_size, 3, IMG_SIZE, IMG_SIZE).astype(np.float32)
    y = np.random.randint(0, 1000, batch_size, dtype=np.int32)
    tx.copy_from_numpy(x)
    ty.copy_from_numpy(y)

    dev.SetVerbosity(verbosity)
    dev.SetSkipIteration(5)

    # construct the model
    from model import resnet
    model = resnet.resnet50(num_channels=3, num_classes=1000)

    model.train()
    model.set_optimizer(sgd)
    model.compile([tx], is_train=True, use_graph=graph, sequential=sequential)
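    # compile initializes the parameters and, when use_graph=True, builds the computational graph for training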

    # train model
    dev.Sync()
    start = time.time()
    with trange(niters) as t:
        for _ in t:
            model(tx, ty, dist_option='fp32', spars=None)

    dev.Sync()
    end = time.time()
    titer = (end - start) / float(niters)
    throughput = float(niters * batch_size * world_size) / (end - start)
    if global_rank == 0:
        print("Throughput = {} per second".format(throughput), flush=True)
        print("TotalTime={}".format(end - start), flush=True)
        print("Total={}".format(titer), flush=True)
        dev.PrintTimeProfiling()
Example #8
    def __init__(self, hidden_size):
        super(LSTMModel3, self).__init__()
        self.lstm = layer.CudnnRNN(
            hidden_size=hidden_size,
            batch_first=True,
            #    return_sequences=True,
            use_mask=True)
        self.l1 = layer.Linear(2)
        self.optimizer = opt.SGD(0.1)
Example #9
    def __init__(self, vocab_size, hidden_size=32):
        super(CharRNN, self).__init__()
        self.rnn = autograd.LSTM(vocab_size, hidden_size)
        self.dense = autograd.Linear(hidden_size, vocab_size)
        self.optimizer = opt.SGD(0.01)
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size
        self.hx = tensor.Tensor((1, self.hidden_size))
        self.cx = tensor.Tensor((1, self.hidden_size))
Example #10
    def __init__(self, hidden_size, bidirectional, num_layers):
        super(LSTMModel2, self).__init__()
        self.lstm = layer.CudnnRNN(hidden_size=hidden_size,
                                   num_layers=num_layers,
                                   bidirectional=bidirectional,
                                   return_sequences=False,
                                   rnn_mode='lstm',
                                   batch_first=True)
        self.optimizer = opt.SGD(0.1)
Example #11
    def setUp(self):
        self.sgd = opt.SGD(lr=0.05)

        self.generate_data(400)

        cpu_dev.ResetGraph()

        if singa_wrap.USE_CUDA:
            gpu_dev.ResetGraph()
Example #12
def run(args, local_rank, world_size, nccl_id):
    sgd = opt.SGD(lr=args.lr, momentum=0.9, weight_decay=1e-5)
    sgd = opt.DistOpt(sgd,
                      nccl_id=nccl_id,
                      local_rank=local_rank,
                      world_size=world_size)
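    # DistOpt wraps the local SGD with NCCL all-reduce so gradients are averaged across ranks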
    train.run(sgd.global_rank, sgd.world_size, sgd.local_rank, args.max_epoch,
              args.batch_size, args.model, args.data, sgd, args.graph,
              args.dist_option, args.spars)
Example #13
    def _build_model(self, num_classes, image_size):
        lr = self._knobs.get('learning_rate')

        # read and make onnx model
        download_model(self.model_url)
        onnx_model = onnx.load(os.path.join('/tmp', self.model_path))
        model = self.singa_model(onnx_model, num_classes, image_size)

        model.set_optimizer(opt.SGD(lr=lr, momentum=0.9, weight_decay=1e-5))
        return model
Example #14
    def __init__(self, vocab_size, hidden_size=32):
        super(CharRNN, self).__init__()
        self.rnn = layer.LSTM(vocab_size, hidden_size)
        self.cat = layer.Cat()
        self.reshape1 = layer.Reshape()
        self.dense = layer.Linear(hidden_size, vocab_size)
        self.reshape2 = layer.Reshape()
        self.softmax_cross_entropy = layer.SoftMaxCrossEntropy()
        self.optimizer = opt.SGD(0.01)
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size
Example #15
    def test_sgd_const_lr_momentum_weight_decay(self, dev=cpu_dev):
        sgd1 = opt.SGD(lr=0.1, momentum=0.9, weight_decay=0.2)
        w_shape = (2, 3)
        w = tensor.Tensor(w_shape, device=dev).set_value(0.1)
        g = tensor.Tensor(w_shape, device=dev).set_value(0.01)

        w_step1 = w - 0.1 * (g + 0.2 * w)

        sgd1.apply(w.name, w, g)

        assertTensorEqual(w, w_step1)
Example #16
    def test_sgd_const_lr(self, dev=cpu_dev):
        cpu_dev.EnableGraph(False)
        sgd1 = opt.SGD(lr=0.1)
        w_shape = (2, 3)
        w = tensor.Tensor(w_shape, device=dev).set_value(0.1)
        g = tensor.Tensor(w_shape, device=dev).set_value(0.1)

        w_step1 = w - 0.1 * g
        sgd1.apply(w.name, w, g)

        assertTensorEqual(w, w_step1)
Example #17
    def test_sgd_const_lr_momentum_nesterov(self, dev=cpu_dev):
        sgd1 = opt.SGD(lr=0.1, momentum=0.9, nesterov=True)
        w_shape = (2, 3)
        w = tensor.Tensor(w_shape, device=dev).set_value(0.1)
        g = tensor.Tensor(w_shape, device=dev).set_value(0.1)

        buf = g
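        # expected Nesterov first step: w <- w - lr * (g + momentum * buf), with buf = g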
        w_step1 = w - 0.1 * (g + 0.9 * buf)

        sgd1.apply(w.name, w, g)

        assertTensorEqual(w, w_step1)
Example #18
def run(args, local_rank, world_size, nccl_id):
    sgd = opt.SGD(lr=args.lr,
                  momentum=0.9,
                  weight_decay=1e-5,
                  dtype=singa_dtype[args.precision])
    sgd = opt.DistOpt(sgd,
                      nccl_id=nccl_id,
                      local_rank=local_rank,
                      world_size=world_size)
    train_cnn.run(sgd.global_rank, sgd.world_size, sgd.local_rank,
                  args.max_epoch, args.batch_size, args.model, args.data, sgd,
                  args.graph, args.verbosity, args.dist_option, args.spars,
                  args.precision)
Example #19
def singa_to_onnx(epochs, use_cpu=False, batchsize=32):
    sgd = opt.SGD(lr=0.1)

    # operations initialization
    conv1 = autograd.Conv2d(1, 8, 3, 2, padding=1) # 28 - 14
    conv2 = autograd.Conv2d(8, 4, 3, 2, padding=1) # 14 - 7
    pooling = autograd.MaxPool2d(3, 2, padding=1) # 7 - 4
    linear = autograd.Linear(64, 10)
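    # 28x28 input -> 14x14 -> 7x7 -> 4x4 after pooling; 4 channels * 4 * 4 = 64 flattened features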

    def forward(x, t):
        y = conv1(x)
        y = autograd.relu(y)
        y = conv2(y)
        y = autograd.relu(y)
        y = pooling(y)
        y = autograd.flatten(y)
        y = linear(y)
        loss = autograd.softmax_cross_entropy(y, t)
        return loss, y

    autograd.training = True
    (x_train, y_train), (x_test, y_test), dev = common(use_cpu)

    niter = 1 # x_train.shape[0] // batchsize
    for epoch in range(epochs):
        accuracy_rate = 0.0
        loss_rate = 0.0
        for i in range(niter):
            inputs = tensor.Tensor(
                device=dev,
                data=x_train[i * batchsize : (i + 1) * batchsize],
                stores_grad=False,
                name="input",
            )
            targets = tensor.Tensor(
                device=dev,
                data=y_train[i * batchsize : (i + 1) * batchsize],
                requires_grad=False,
                stores_grad=False,
                name="target",
            )
            loss, y = forward(inputs, targets)
            accuracy_rate += accuracy(
                tensor.to_numpy(y), y_train[i * batchsize : (i + 1) * batchsize]
            )
            loss_rate += tensor.to_numpy(loss)[0]
            for p, gp in autograd.backward(loss):
                sgd.update(p, gp)
        print( "accuracy is {}, loss is {}".format( accuracy_rate / niter, loss_rate / niter))
    model = sonnx.to_onnx_model([inputs], [y])
    sonnx.save(model, "cnn.onnx")
Example #20
    def __init__(self, hidden_size, seq_length, batch_size, bidirectional,
                 num_layers, return_sequences, rnn_mode, batch_first):
        super(LSTMModel, self).__init__()
        self.hidden_size = hidden_size
        self.seq_length = seq_length
        self.return_sequences = return_sequences

        self.lstm = layer.CudnnRNN(hidden_size=hidden_size,
                                   num_layers=num_layers,
                                   bidirectional=bidirectional,
                                   return_sequences=return_sequences,
                                   rnn_mode=rnn_mode,
                                   batch_first=batch_first)
        self.optimizer = opt.SGD(0.1)
Example #21
def singa_to_onnx(niter, use_cpu=False):
    if use_cpu:
        print("Using CPU")
        dev = device.get_default_device()
    else:
        print("Using GPU")
        dev = device.create_cuda_gpu()
    inputs = Tensor(
        data=data,
        device=dev,
        requires_grad=False,
        stores_grad=False,
        name="input",
    )
    target = Tensor(
        data=label,
        device=dev,
        requires_grad=False,
        stores_grad=False,
        name="target",
    )

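    # parameters of a two-layer MLP: 2 -> 3 -> 2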
    w0 = Tensor(shape=(2, 3), device=dev, requires_grad=True, stores_grad=True)
    w0.gaussian(0.0, 0.1)
    b0 = Tensor(shape=(3,), device=dev, requires_grad=True, stores_grad=True)
    b0.set_value(0.0)

    w1 = Tensor(shape=(3, 2), device=dev, requires_grad=True, stores_grad=True)
    w1.gaussian(0.0, 0.1)
    b1 = Tensor(shape=(2,), device=dev, requires_grad=True, stores_grad=True)
    b1.set_value(0.0)

    sgd = opt.SGD(0.1)
    # training process
    for i in range(niter):
        x = autograd.matmul(inputs, w0)
        x = autograd.add_bias(x, b0)
        x = autograd.relu(x)
        x = autograd.matmul(x, w1)
        x = autograd.add_bias(x, b1)
        loss = autograd.softmax_cross_entropy(x, target)
        for p, gp in autograd.backward(loss):
            sgd.update(p, gp)

        print("training loss = ", tensor.to_numpy(loss)[0])
    sonnx.export([inputs], [x], file_path="mlp.onnx")
Example #22
    def test_sgd_const_lr_momentum(self, dev=cpu_dev):
        sgd1 = opt.SGD(lr=0.1, momentum=0.9)
        w_shape = (2, 3)
        w = tensor.Tensor(w_shape, device=dev).set_value(0.1)
        g = tensor.Tensor(w_shape, device=dev).set_value(0.01)

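        # first step: the momentum buffer equals g, so the update matches plain SGD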
        w_step1 = w - 0.1 * g
        buf = g

        sgd1.apply(w.name, w, g)
        sgd1.step()

        assertTensorEqual(w, w_step1)

        buf = g + buf * 0.9
        w_step2 = w - 0.1 * buf

        sgd1.apply(w.name, w, g)

        assertTensorEqual(w, w_step2)
Example #23
def train():
    """Start the training procedure 
    """
    num_epochs = 1
    learning_rate = 0.05
    batch_size = 8

    data_loader = DataLoader(os.path.join("data", "fetal_health.csv"))
    data_loader.standardize_column("baseline value")
    x_train, y_train = data_loader.load_data(subset="train")
    x_valid, y_valid = data_loader.load_data(subset="valid")

    num_classes = len(np.unique(y_train))
    num_samples, num_features = x_train.shape

    assert x_train.shape[1] == x_valid.shape[1], \
        "Number of features should be equal!"
    assert x_train.shape[0] == y_train.shape[0], \
        "Number of training samples should be equal!"
    assert x_valid.shape[0] == y_valid.shape[0], \
        "Number of validation samples should be equal!"

    dev = get_default_device()
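    # placeholder tensors that fix the input and label shapes for model.compile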
    tx = tensor.Tensor((num_samples, num_features), dev, tensor.float32)
    ty = tensor.Tensor((num_samples, ), dev, tensor.int32)

    sgd = opt.SGD(learning_rate)
    model = create_MLP_model(perceptron_size=10, num_classes=num_classes)
    model.set_optimizer(sgd)
    model.compile([tx], is_train=True, use_graph=True, sequential=False)
    model.train()

    for i in range(num_epochs):
        tx.copy_from_numpy(x_train.astype(np.float32))
        ty.copy_from_numpy(y_train.astype(np.int32))
        out, loss = model(tx, ty, 'fp32', spars=None)

        # TODO: Add metric evaluation on validation data
        if i % 10 == 0:
            print("training loss = {:.3f}".format(tensor.to_numpy(loss)[0]))
Example #24
def onnx_to_singa(epochs, use_cpu=False, batchsize=32):
    (x_train, y_train), (x_test, y_test), dev = common(use_cpu)
    model = sonnx.load("cnn.onnx")
    backend = sonnx.prepare(model, dev)
    autograd.training = True
    sgd = opt.SGD(lr=0.01)
    niter = x_train.shape[0] // batchsize
    for epoch in range(epochs):
        accuracy_rate = 0.0
        loss_rate = 0.0
        for i in range(niter):
            inputs = tensor.Tensor(
                device=dev,
                data=x_train[i * batchsize : (i + 1) * batchsize],
                stores_grad=False,
                name="input",
            )
            targets = tensor.Tensor(
                device=dev,
                data=y_train[i * batchsize : (i + 1) * batchsize],
                requires_grad=False,
                stores_grad=False,
                name="target",
            )
            y = backend.run([inputs])[0]
            loss = autograd.softmax_cross_entropy(y, targets)

            accuracy_rate += accuracy(
                tensor.to_numpy(y), y_train[i * batchsize : (i + 1) * batchsize]
            )
            loss_rate += tensor.to_numpy(loss)[0]

            for p, gp in autograd.backward(loss):
                sgd.update(p, gp)

        print("accuracy is {}, loss is {}".format(accuracy_rate / niter, loss_rate / niter))
Example #25
# the code is modified from
# https://github.com/pytorch/vision/blob/master/torchvision/models/resnet.py

from singa import autograd
from singa import tensor
from singa import device
from singa import opt

import numpy as np
from tqdm import trange

if __name__ == "__main__":
    sgd = opt.SGD(lr=0.1, momentum=0.9, weight_decay=1e-5)
    sgd = opt.DistOpt(sgd)

    from resnet import resnet50
    model = resnet50()

    if sgd.rank_in_global == 0:
        print("Start initialization...........", flush=True)

    dev = device.create_cuda_gpu_on(sgd.rank_in_local)
    niters = 100
    batch_size = 32
    IMG_SIZE = 224

    tx = tensor.Tensor((batch_size, 3, IMG_SIZE, IMG_SIZE), dev)
    ty = tensor.Tensor((batch_size, ), dev, tensor.int32)
Example #26
def train_mnist_cnn(DIST=False,
                    local_rank=None,
                    world_size=None,
                    nccl_id=None,
                    spars=0,
                    topK=False,
                    corr=True):

    # Define the hyperparameters for the mnist_cnn
    max_epoch = 10
    batch_size = 128
    sgd = opt.SGD(lr=0.005, momentum=0.9, weight_decay=1e-5)

    # Prepare training and validation data
    train_x, train_y, test_x, test_y = load_dataset()
    IMG_SIZE = 28
    num_classes = 10
    train_y = to_categorical(train_y, num_classes)
    test_y = to_categorical(test_y, num_classes)

    # Normalization
    train_x = train_x / 255
    test_x = test_x / 255

    if DIST:
        # For Distributed GPU Training
        '''
        sgd = opt.DistOpt(sgd,
                          nccl_id=nccl_id,
                          local_rank=local_rank,
                          world_size=world_size)
        '''
        dev = device.get_default_device()
        # create kvstore
        kv_type = 'dist_sync'  # set synchronization mode
        lr = 0.005
        kv = singa_kvstore.create_kvstore(kv_type,
                                          'SingaSGD',
                                          lr=lr,
                                          momentum=0.9,
                                          weight_decay=1e-5)
        global_rank = kv.rank
        world_size = kv.num_workers
        # Dataset partition for distributed training
        train_x, train_y = data_partition(train_x, train_y, global_rank,
                                          world_size)
        test_x, test_y = data_partition(test_x, test_y, global_rank,
                                        world_size)

    # create model
    model = CNN()
    ''' 
    num_channels = train_x.shape[1]
    image_size = train_x.shape[2]
    data_size = np.prod(train_x.shape[1:train_x.ndim]).item()
    num_classes = (np.max(train_y) + 1).item()
    model = resnet.resnet18(num_channels=1, num_classes=num_classes)
    '''
    tx = tensor.Tensor((batch_size, 1, IMG_SIZE, IMG_SIZE), dev,
                       tensor.float32)
    ty = tensor.Tensor((batch_size, num_classes), dev, tensor.int32)
    num_train_batch = train_x.shape[0] // batch_size
    num_test_batch = test_x.shape[0] // batch_size
    idx = np.arange(train_x.shape[0], dtype=np.int32)
    '''
    if DIST:
        #Initial a batch to help obtain model parameters
        autograd.training = True
        x = np.random.randn(batch_size, 1, IMG_SIZE,
                            IMG_SIZE).astype(np.float32)
        y = np.zeros(shape=(batch_size, num_classes), dtype=np.int32)
        tx.copy_from_numpy(x)
        ty.copy_from_numpy(y)
        out = model.forward(tx)
        loss = autograd.softmax_cross_entropy(out, ty)
        #Initial kv store for workers of ps-architecture
        key = 0
        for p, g in autograd.backward(loss):
            kv.init(key, mx.nd.array(tensor.to_numpy(p)))
            key += 1
     '''

    # Training and Evaluation Loop
    for epoch in range(max_epoch):
        start_time = time.time()
        np.random.shuffle(idx)

        if DIST:
            print('^_^Starting Epoch %d:' % (epoch))

        # Training Phase
        autograd.training = True
        train_correct = np.zeros(shape=[1], dtype=np.float32)
        test_correct = np.zeros(shape=[1], dtype=np.float32)
        train_loss = np.zeros(shape=[1], dtype=np.float32)
        time_start = time.time()
        for b in range(num_train_batch):
            x = train_x[idx[b * batch_size:(b + 1) * batch_size]]
            x = augmentation(x, batch_size)
            y = train_y[idx[b * batch_size:(b + 1) * batch_size]]
            tx.copy_from_numpy(x)
            ty.copy_from_numpy(y)
            out = model.forward(tx)
            loss = autograd.softmax_cross_entropy(out, ty)
            train_correct += accuracy(tensor.to_numpy(out), y)
            train_loss += tensor.to_numpy(loss)[0]
            singa_kvstore.backward_and_update(kv, loss)
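            # the commented-out block below is the explicit push/pull equivalent of backward_and_update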
            '''
            if DIST:
                #push
                kv_pairs = []
                key = 0
                for p, g in autograd.backward(loss):
                    kv.push(key,mx.nd.array(tensor.to_numpy(g)))
                    kv_pairs.append((key,p,g))
                    key += 1
                #pull
                for key,p,g in kv_pairs:
                    out_buf = mx.nd.zeros(p.shape)
                    kv.pull(key,out=out_buf)
                    p.copy_from_numpy(out_buf.asnumpy())
             '''

            # Evaluation Phase
            if b % 20 != 0:
                continue
            autograd.training = False
            num_test_batch_inside = 20
            test_correct = 0
            for bt in range(num_test_batch_inside):
                x = test_x[bt * batch_size:(bt + 1) * batch_size]
                y = test_y[bt * batch_size:(bt + 1) * batch_size]
                tx.copy_from_numpy(x)
                ty.copy_from_numpy(y)
                out_test = model.forward(tx)
                test_correct += accuracy(tensor.to_numpy(out_test), y)
            print('Evaluation accuracy = %f' %
                  (test_correct / (batch_size * num_test_batch_inside)),
                  flush=True)
            autograd.training = True
        print('epoch time is %f' % (time.time() - time_start))
Example #27
                        '--learning-rate',
                        default=0.005,
                        type=float,
                        help='initial learning rate',
                        dest='lr')
    # determine which gpu to use
    parser.add_argument('-i',
                        '--device-id',
                        default=0,
                        type=int,
                        help='which GPU to use',
                        dest='device_id')
    parser.add_argument('-g',
                        '--disable-graph',
                        default=True,
                        action='store_false',
                        help='disable graph',
                        dest='graph')
    parser.add_argument('-v',
                        '--log-verbosity',
                        default=0,
                        type=int,
                        help='logging verbosity',
                        dest='verbosity')

    args = parser.parse_args()

    sgd = opt.SGD(lr=args.lr, momentum=0.9, weight_decay=1e-5)
    run(0, 1, args.device_id, args.max_epoch, args.batch_size, args.model,
        args.data, sgd, args.graph, args.verbosity)
Example #28
    label = to_categorical(label, 2).astype(np.float32)
    print("train_data_shape:", data.shape)
    print("train_label_shape:", label.shape)

    inputs = Tensor(data=data)
    target = Tensor(data=label)

    w0 = Tensor(shape=(2, 3), requires_grad=True, stores_grad=True)
    w0.gaussian(0.0, 0.1)
    b0 = Tensor(shape=(3, ), requires_grad=True, stores_grad=True)
    b0.set_value(0.0)

    w1 = Tensor(shape=(3, 2), requires_grad=True, stores_grad=True)
    w1.gaussian(0.0, 0.1)
    b1 = Tensor(shape=(2, ), requires_grad=True, stores_grad=True)
    b1.set_value(0.0)

    sgd = opt.SGD(0.05)
    # training process
    for i in range(1001):
        x = autograd.matmul(inputs, w0)
        x = autograd.add_bias(x, b0)
        x = autograd.relu(x)
        x = autograd.matmul(x, w1)
        x = autograd.add_bias(x, b1)
        loss = autograd.softmax_cross_entropy(x, target)
        sgd.backward_and_update(loss)

        if i % 100 == 0:
            print("training loss = ", tensor.to_numpy(loss)[0])
Example #29
def train_mnist_cnn(DIST=False,
                    local_rank=None,
                    world_size=None,
                    nccl_id=None,
                    spars=0,
                    topK=False,
                    corr=True):

    # Define the hyperparameters for the mnist_cnn
    max_epoch = 10
    batch_size = 64
    sgd = opt.SGD(lr=0.005, momentum=0.9, weight_decay=1e-5)

    # Prepare training and validation data
    train_x, train_y, test_x, test_y = load_dataset()
    IMG_SIZE = 28
    num_classes = 10
    train_y = to_categorical(train_y, num_classes)
    test_y = to_categorical(test_y, num_classes)

    # Normalization
    train_x = train_x / 255
    test_x = test_x / 255

    if DIST:
        # For Distributed GPU Training
        sgd = opt.DistOpt(sgd,
                          nccl_id=nccl_id,
                          local_rank=local_rank,
                          world_size=world_size)
        dev = device.create_cuda_gpu_on(sgd.local_rank)
        # Dataset partition for distributed training
        train_x, train_y = data_partition(train_x, train_y, sgd.global_rank,
                                          sgd.world_size)
        test_x, test_y = data_partition(test_x, test_y, sgd.global_rank,
                                        sgd.world_size)
        world_size = sgd.world_size
    else:
        # For Single GPU
        dev = device.create_cuda_gpu()
        world_size = 1

    # create model
    model = CNN()

    tx = tensor.Tensor((batch_size, 1, IMG_SIZE, IMG_SIZE), dev, tensor.float32)
    ty = tensor.Tensor((batch_size, num_classes), dev, tensor.int32)
    num_train_batch = train_x.shape[0] // batch_size
    num_test_batch = test_x.shape[0] // batch_size
    idx = np.arange(train_x.shape[0], dtype=np.int32)

    if DIST:
        # Synchronize the initial parameters
        autograd.training = True
        x = np.random.randn(batch_size, 1, IMG_SIZE,
                            IMG_SIZE).astype(np.float32)
        y = np.zeros(shape=(batch_size, num_classes), dtype=np.int32)
        tx.copy_from_numpy(x)
        ty.copy_from_numpy(y)
        out = model.forward(tx)
        loss = autograd.softmax_cross_entropy(out, ty)
        for p, g in autograd.backward(loss):
            synchronize(p, sgd)

    # Training and Evaluation Loop
    for epoch in range(max_epoch):
        start_time = time.time()
        np.random.shuffle(idx)

        if (not DIST) or sgd.global_rank == 0:
            print('Starting Epoch %d:' % (epoch))

        # Training Phase
        autograd.training = True
        train_correct = np.zeros(shape=[1], dtype=np.float32)
        test_correct = np.zeros(shape=[1], dtype=np.float32)
        train_loss = np.zeros(shape=[1], dtype=np.float32)

        for b in range(num_train_batch):
            x = train_x[idx[b * batch_size:(b + 1) * batch_size]]
            x = augmentation(x, batch_size)
            y = train_y[idx[b * batch_size:(b + 1) * batch_size]]
            tx.copy_from_numpy(x)
            ty.copy_from_numpy(y)
            out = model.forward(tx)
            loss = autograd.softmax_cross_entropy(out, ty)
            train_correct += accuracy(tensor.to_numpy(out), y)
            train_loss += tensor.to_numpy(loss)[0]
            if DIST:
                if (spars == 0):
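                    # threshold controls tensor fusion: small gradient tensors are batched into one all-reduce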
                    sgd.backward_and_update(loss, threshold=50000)
                else:
                    sgd.backward_and_sparse_update(loss,
                                                   spars=spars,
                                                   topK=topK,
                                                   corr=corr)
            else:
                sgd.backward_and_update(loss)

        if DIST:
            # Reduce the Training Accuracy and Loss from Multiple Devices
            reducer = tensor.Tensor((1,), dev, tensor.float32)
            train_correct = reduce_variable(train_correct, sgd, reducer)
            train_loss = reduce_variable(train_loss, sgd, reducer)

        # Output the Training Loss and Accuracy
        if (not DIST) or sgd.global_rank == 0:
            print('Training loss = %f, training accuracy = %f' %
                  (train_loss, train_correct /
                   (num_train_batch * batch_size * world_size)),
                  flush=True)

        # Evaluation Phase
        autograd.training = False
        for b in range(num_test_batch):
            x = test_x[b * batch_size:(b + 1) * batch_size]
            y = test_y[b * batch_size:(b + 1) * batch_size]
            tx.copy_from_numpy(x)
            ty.copy_from_numpy(y)
            out_test = model.forward(tx)
            test_correct += accuracy(tensor.to_numpy(out_test), y)

        if DIST:
            # Reduce the Evaluation Accuracy from Multiple Devices
            test_correct = reduce_variable(test_correct, sgd, reducer)

        # Output the Evaluation Accuracy
        if (not DIST) or sgd.global_rank == 0:
            print('Evaluation accuracy = %f, Elapsed Time = %fs' %
                  (test_correct / (num_test_batch * batch_size * world_size),
                   time.time() - start_time),
                  flush=True)
Example #30
    data_per_rank = dataset_x.shape[0] // world_size
    idx_start = rank_in_global * data_per_rank
    idx_end = (rank_in_global + 1) * data_per_rank
    return dataset_x[idx_start:idx_end], dataset_y[idx_start:idx_end]


if __name__ == '__main__':

    # Generate a NCCL ID to be used for collective communication
    nccl_id = singa.NcclIdHolder()

    gpu_per_node = 8
    max_epoch = 10
    batch_size = 64

    sgd = opt.SGD(lr=0.005 * gpu_per_node, momentum=0.9, weight_decay=1e-5)

    # Use sparsification with parameters
    topK = False  # when False, sparsification uses a constant absolute threshold
    corr = True  # when True, locally accumulated gradients are used for error correction
    sparsThreshold = 0.05  # the constant absolute threshold for sparsification

    process = []
    for gpu_num in range(0, gpu_per_node):
        process.append(
            multiprocessing.Process(target=train_mnist_cnn,
                                    args=(sgd, max_epoch, batch_size, True,
                                          data_partition, gpu_num,
                                          gpu_per_node, nccl_id,
                                          sparsThreshold, topK, corr)))