def test_transfer_learning(self):
    # forward
    x = tensor.Tensor(shape=(2, 3, 3, 3), device=gpu_dev)
    x.gaussian(0.0, 1.0)
    x1 = autograd.Conv2d(3, 1, 2)(x)
    y = autograd.Flatten()(x1)[0]
    y_t = tensor.Tensor(shape=(2, 4), device=gpu_dev)
    y_t.gaussian(0.0, 1.0)
    loss = autograd.MeanSquareError()(y, y_t)[0]
    # backward
    sgd = opt.SGD(lr=0.01)
    for p, gp in autograd.backward(loss):
        sgd.update(p, gp)
    sgd.step()
    # frontend
    model = sonnx.to_onnx([x], [y])
    # print('The model is:\n{}'.format(model))
    # backend
    sg_ir = sonnx.prepare(model, device=gpu_dev)
    # forward
    x1 = sg_ir.run([x], last_layers=-1)[0]
    x2 = autograd.Conv2d(1, 1, 2)(x1)
    y_o = autograd.Flatten()(x2)[0]
    # backward
    y_ot = tensor.Tensor(shape=(2, 1), device=gpu_dev)
    y_ot.gaussian(0.0, 1.0)
    loss = autograd.MeanSquareError()(y_o, y_ot)[0]
    sgd = opt.SGD(lr=0.01)
    for p, gp in autograd.backward(loss):
        sgd.update(p, gp)
    sgd.step()
def test_retraining(self):
    # forward
    x = tensor.Tensor(shape=(2, 3, 3, 3), device=gpu_dev)
    x.gaussian(0.0, 1.0)
    x1 = autograd.Conv2d(3, 1, 2)(x)
    x2 = autograd.Conv2d(1, 1, 2)(x1)
    y = autograd.Flatten()(x2)[0]
    y_t = tensor.Tensor(shape=(2, 1), device=gpu_dev)
    y_t.gaussian(0.0, 1.0)
    loss = autograd.MeanSquareError()(y, y_t)[0]
    # backward
    sgd = opt.SGD(lr=0.01)
    for p, gp in autograd.backward(loss):
        sgd.update(p, gp)
    sgd.step()
    # frontend
    model = sonnx.to_onnx([x], [y])
    # print('The model is:\n{}'.format(model))
    # backend
    sg_ir = sonnx.prepare(model, device=gpu_dev)
    for idx, tens in sg_ir.tensor_map.items():
        tens.requires_grad = True
        tens.stores_grad = True
        sg_ir.tensor_map[idx] = tens
    # forward
    y_o = sg_ir.run([x])[0]
    # backward
    loss = autograd.MeanSquareError()(y_o, y_t)[0]
    sgd = opt.SGD(lr=0.01)
    for p, gp in autograd.backward(loss):
        sgd.update(p, gp)
    sgd.step()
def train(model,
          x,
          y,
          epochs=1,
          batch_size=64,
          dev=device.get_default_device()):
    batch_number = x.shape[0] // batch_size
    for i in range(epochs):
        for b in range(batch_number):
            l_idx = b * batch_size
            r_idx = (b + 1) * batch_size
            x_batch = tensor.Tensor(device=dev, data=x[l_idx:r_idx])
            target_batch = tensor.Tensor(device=dev, data=y[l_idx:r_idx])

            output_batch = model.forward(x_batch)
            # onnx_model = sonnx.to_onnx([x_batch], [y])
            # print('The model is:\n{}'.format(onnx_model))

            loss = autograd.softmax_cross_entropy(output_batch, target_batch)
            accuracy_rate = accuracy(tensor.to_numpy(output_batch),
                                     tensor.to_numpy(target_batch))

            sgd = opt.SGD(lr=0.001)
            for p, gp in autograd.backward(loss):
                sgd.update(p, gp)
            sgd.step()

            if b % 100 == 0:
                print("acc %6.2f, loss %6.2f" %
                      (accuracy_rate, tensor.to_numpy(loss)[0]))
    print("training completed")
    return x_batch, output_batch
def onnx_to_singa(niter, use_cpu=False):
    if use_cpu:
        print("Using CPU")
        dev = device.get_default_device()
    else:
        print("Using GPU")
        dev = device.create_cuda_gpu()
    model = sonnx.load("mlp.onnx")
    backend = sonnx.prepare(model, device=dev)
    sgd = opt.SGD(0.1)
    inputs = Tensor(
        data=data,
        device=dev,
        requires_grad=False,
        stores_grad=False,
        name="input",
    )
    target = Tensor(
        data=label,
        device=dev,
        requires_grad=False,
        stores_grad=False,
        name="target",
    )

    for i in range(niter):
        y = backend.run([inputs])[0]
        loss = autograd.softmax_cross_entropy(y, target)
        for p, gp in autograd.backward(loss):
            sgd.update(p, gp)
        loss_rate = tensor.to_numpy(loss)[0]
        accuracy_rate = accuracy(tensor.to_numpy(y), label)
        print("Iter {}, accuracy={}, loss={}".format(i, accuracy_rate,
                                                     loss_rate))
def transfer_learning(sg_ir,
                      x,
                      y,
                      epochs=1,
                      batch_size=64,
                      dev=device.get_default_device()):
    batch_number = x.shape[0] // batch_size
    trans_model = Trans(sg_ir, -1)
    for i in range(epochs):
        for b in range(batch_number):
            l_idx = b * batch_size
            r_idx = (b + 1) * batch_size
            x_batch = tensor.Tensor(device=dev, data=x[l_idx:r_idx])
            target_batch = tensor.Tensor(device=dev, data=y[l_idx:r_idx])
            output_batch = trans_model.forward(x_batch)

            loss = autograd.softmax_cross_entropy(output_batch, target_batch)
            accuracy_rate = accuracy(tensor.to_numpy(output_batch),
                                     tensor.to_numpy(target_batch))

            sgd = opt.SGD(lr=0.07)
            for p, gp in autograd.backward(loss):
                sgd.update(p, gp)
            sgd.step()

            if b % 100 == 0:
                print("acc %6.2f, loss %6.2f" %
                      (accuracy_rate, tensor.to_numpy(loss)[0]))
    print("transfer-learning completed")
    return trans_model
def test_exponential_decay_no_staircase_cpu(self):
    lr = opt.ExponentialDecay(0.1, 2, 0.5, False)
    sgd1 = opt.SGD(lr=lr)
    for i in range(5):
        np.testing.assert_array_almost_equal(tensor.to_numpy(sgd1.lr_value),
                                             [0.1 * 0.5**(i / 2)])
        sgd1.step()
def train_resnet(DIST=True, graph=True, sequential=False, verbosity=0):

    # Define the hyperparameters for train_resnet
    niters = 100
    batch_size = 32
    sgd = opt.SGD(lr=0.1, momentum=0.9, weight_decay=1e-5)

    IMG_SIZE = 224

    # For distributed training, sequential has better throughput in the current version
    if DIST == True:
        sgd = opt.DistOpt(sgd)
        world_size = sgd.world_size
        local_rank = sgd.local_rank
        global_rank = sgd.global_rank
        sequential = True
    else:
        local_rank = 0
        world_size = 1
        global_rank = 0
        sequential = False

    dev = device.create_cuda_gpu_on(local_rank)

    tx = tensor.Tensor((batch_size, 3, IMG_SIZE, IMG_SIZE), dev)
    ty = tensor.Tensor((batch_size,), dev, tensor.int32)
    x = np.random.randn(batch_size, 3, IMG_SIZE, IMG_SIZE).astype(np.float32)
    y = np.random.randint(0, 1000, batch_size, dtype=np.int32)
    tx.copy_from_numpy(x)
    ty.copy_from_numpy(y)

    dev.SetVerbosity(verbosity)
    dev.SetSkipIteration(5)

    # construct the model
    from model import resnet
    model = resnet.resnet50(num_channels=3, num_classes=1000)

    model.train()
    model.set_optimizer(sgd)
    model.compile([tx], is_train=True, use_graph=graph, sequential=sequential)

    # train model
    dev.Sync()
    start = time.time()
    with trange(niters) as t:
        for _ in t:
            model(tx, ty, dist_option='fp32', spars=None)

    dev.Sync()
    end = time.time()
    titer = (end - start) / float(niters)
    throughput = float(niters * batch_size * world_size) / (end - start)
    if global_rank == 0:
        print("Throughput = {} per second".format(throughput), flush=True)
        print("TotalTime={}".format(end - start), flush=True)
        print("Total={}".format(titer), flush=True)
        dev.PrintTimeProfiling()
def __init__(self, hidden_size):
    super(LSTMModel3, self).__init__()
    self.lstm = layer.CudnnRNN(
        hidden_size=hidden_size,
        batch_first=True,
        # return_sequences=True,
        use_mask=True)
    self.l1 = layer.Linear(2)
    self.optimizer = opt.SGD(0.1)
def __init__(self, vocab_size, hidden_size=32):
    super(CharRNN, self).__init__()
    self.rnn = autograd.LSTM(vocab_size, hidden_size)
    self.dense = autograd.Linear(hidden_size, vocab_size)
    self.optimizer = opt.SGD(0.01)
    self.hidden_size = hidden_size
    self.vocab_size = vocab_size
    self.hx = tensor.Tensor((1, self.hidden_size))
    self.cx = tensor.Tensor((1, self.hidden_size))
def __init__(self, hidden_size, bidirectional, num_layers):
    super(LSTMModel2, self).__init__()
    self.lstm = layer.CudnnRNN(hidden_size=hidden_size,
                               num_layers=num_layers,
                               bidirectional=bidirectional,
                               return_sequences=False,
                               rnn_mode='lstm',
                               batch_first=True)
    self.optimizer = opt.SGD(0.1)
def setUp(self):
    self.sgd = opt.SGD(lr=0.05)

    self.generate_data(400)

    cpu_dev.ResetGraph()
    if singa_wrap.USE_CUDA:
        gpu_dev.ResetGraph()
def run(args, local_rank, world_size, nccl_id):
    sgd = opt.SGD(lr=args.lr, momentum=0.9, weight_decay=1e-5)
    sgd = opt.DistOpt(sgd,
                      nccl_id=nccl_id,
                      local_rank=local_rank,
                      world_size=world_size)
    train.run(sgd.global_rank, sgd.world_size, sgd.local_rank, args.max_epoch,
              args.batch_size, args.model, args.data, sgd, args.graph,
              args.dist_option, args.spars)
def _build_model(self, num_classes, image_size):
    lr = self._knobs.get('learning_rate')
    # read and make onnx model
    download_model(self.model_url)
    onnx_model = onnx.load(os.path.join('/tmp', self.model_path))
    model = self.singa_model(onnx_model, num_classes, image_size)
    model.set_optimizer(opt.SGD(lr=lr, momentum=0.9, weight_decay=1e-5))
    return model
def __init__(self, vocab_size, hidden_size=32):
    super(CharRNN, self).__init__()
    self.rnn = layer.LSTM(vocab_size, hidden_size)
    self.cat = layer.Cat()
    self.reshape1 = layer.Reshape()
    self.dense = layer.Linear(hidden_size, vocab_size)
    self.reshape2 = layer.Reshape()
    self.softmax_cross_entropy = layer.SoftMaxCrossEntropy()
    self.optimizer = opt.SGD(0.01)
    self.hidden_size = hidden_size
    self.vocab_size = vocab_size
def test_sgd_const_lr_momentum_weight_decay(self, dev=cpu_dev):
    sgd1 = opt.SGD(lr=0.1, weight_decay=0.2)
    w_shape = (2, 3)
    w = tensor.Tensor(w_shape, device=dev).set_value(0.1)
    g = tensor.Tensor(w_shape, device=dev).set_value(0.01)

    w_step1 = w - 0.1 * (g + 0.2 * w)

    sgd1.apply(w.name, w, g)
    assertTensorEqual(w, w_step1)
def test_sgd_const_lr(self, dev=cpu_dev):
    cpu_dev.EnableGraph(False)
    sgd1 = opt.SGD(lr=0.1)
    w_shape = (2, 3)
    w = tensor.Tensor(w_shape, device=dev).set_value(0.1)
    g = tensor.Tensor(w_shape, device=dev).set_value(0.1)

    w_step1 = w - 0.1 * g

    sgd1.apply(w.name, w, g)
    assertTensorEqual(w, w_step1)
def test_sgd_const_lr_momentum_nesterov(self, dev=cpu_dev):
    sgd1 = opt.SGD(lr=0.1, momentum=0.9, nesterov=True)
    w_shape = (2, 3)
    w = tensor.Tensor(w_shape, device=dev).set_value(0.1)
    g = tensor.Tensor(w_shape, device=dev).set_value(0.1)

    buf = g
    w_step1 = w - 0.1 * (g + 0.9 * buf)

    sgd1.apply(w.name, w, g)
    assertTensorEqual(w, w_step1)
def run(args, local_rank, world_size, nccl_id):
    sgd = opt.SGD(lr=args.lr,
                  momentum=0.9,
                  weight_decay=1e-5,
                  dtype=singa_dtype[args.precision])
    sgd = opt.DistOpt(sgd,
                      nccl_id=nccl_id,
                      local_rank=local_rank,
                      world_size=world_size)
    train_cnn.run(sgd.global_rank, sgd.world_size, sgd.local_rank,
                  args.max_epoch, args.batch_size, args.model, args.data, sgd,
                  args.graph, args.verbosity, args.dist_option, args.spars,
                  args.precision)
def singa_to_onnx(epochs, use_cpu=False, batchsize=32):
    sgd = opt.SGD(lr=0.1)

    # operations initialization
    conv1 = autograd.Conv2d(1, 8, 3, 2, padding=1)  # 28 - 14
    conv2 = autograd.Conv2d(8, 4, 3, 2, padding=1)  # 14 - 7
    pooling = autograd.MaxPool2d(3, 2, padding=1)  # 7 - 4
    linear = autograd.Linear(64, 10)

    def forward(x, t):
        y = conv1(x)
        y = autograd.relu(y)
        y = conv2(y)
        y = autograd.relu(y)
        y = pooling(y)
        y = autograd.flatten(y)
        y = linear(y)
        loss = autograd.softmax_cross_entropy(y, t)
        return loss, y

    autograd.training = True
    (x_train, y_train), (x_test, y_test), dev = common(use_cpu)

    niter = 1  # x_train.shape[0] // batchsize
    for epoch in range(epochs):
        accuracy_rate = 0.0
        loss_rate = 0.0
        for i in range(niter):
            inputs = tensor.Tensor(
                device=dev,
                data=x_train[i * batchsize:(i + 1) * batchsize],
                stores_grad=False,
                name="input",
            )
            targets = tensor.Tensor(
                device=dev,
                data=y_train[i * batchsize:(i + 1) * batchsize],
                requires_grad=False,
                stores_grad=False,
                name="target",
            )
            loss, y = forward(inputs, targets)
            accuracy_rate += accuracy(
                tensor.to_numpy(y), y_train[i * batchsize:(i + 1) * batchsize])
            loss_rate += tensor.to_numpy(loss)[0]
            for p, gp in autograd.backward(loss):
                sgd.update(p, gp)
        print("accuracy is {}, loss is {}".format(accuracy_rate / niter,
                                                  loss_rate / niter))
    model = sonnx.to_onnx_model([inputs], [y])
    sonnx.save(model, "cnn.onnx")
def __init__(self, hidden_size, seq_length, batch_size, bidirectional,
             num_layers, return_sequences, rnn_mode, batch_first):
    super(LSTMModel, self).__init__()
    self.hidden_size = hidden_size
    self.seq_length = seq_length
    self.return_sequences = return_sequences
    self.lstm = layer.CudnnRNN(hidden_size=hidden_size,
                               num_layers=num_layers,
                               bidirectional=bidirectional,
                               return_sequences=return_sequences,
                               rnn_mode=rnn_mode,
                               batch_first=batch_first)
    self.optimizer = opt.SGD(0.1)
def singa_to_onnx(niter, use_cpu=False):
    if use_cpu:
        print("Using CPU")
        dev = device.get_default_device()
    else:
        print("Using GPU")
        dev = device.create_cuda_gpu()

    inputs = Tensor(
        data=data,
        device=dev,
        requires_grad=False,
        stores_grad=False,
        name="input",
    )
    target = Tensor(
        data=label,
        device=dev,
        requires_grad=False,
        stores_grad=False,
        name="target",
    )

    w0 = Tensor(shape=(2, 3), device=dev, requires_grad=True, stores_grad=True)
    w0.gaussian(0.0, 0.1)
    b0 = Tensor(shape=(3,), device=dev, requires_grad=True, stores_grad=True)
    b0.set_value(0.0)

    w1 = Tensor(shape=(3, 2), device=dev, requires_grad=True, stores_grad=True)
    w1.gaussian(0.0, 0.1)
    b1 = Tensor(shape=(2,), device=dev, requires_grad=True, stores_grad=True)
    b1.set_value(0.0)

    sgd = opt.SGD(0.1)

    # training process
    for i in range(niter):
        x = autograd.matmul(inputs, w0)
        x = autograd.add_bias(x, b0)
        x = autograd.relu(x)
        x = autograd.matmul(x, w1)
        x = autograd.add_bias(x, b1)
        loss = autograd.softmax_cross_entropy(x, target)
        for p, gp in autograd.backward(loss):
            sgd.update(p, gp)
        print("training loss = ", tensor.to_numpy(loss)[0])
    sonnx.export([inputs], [x], file_path="mlp.onnx")
def test_sgd_const_lr_momentum(self, dev=cpu_dev):
    sgd1 = opt.SGD(lr=0.1, momentum=0.9)
    w_shape = (2, 3)
    w = tensor.Tensor(w_shape, device=dev).set_value(0.1)
    g = tensor.Tensor(w_shape, device=dev).set_value(0.01)

    w_step1 = w - 0.1 * g
    buf = g

    sgd1.apply(w.name, w, g)
    sgd1.step()
    assertTensorEqual(w, w_step1)

    buf = g + buf * 0.9
    w_step2 = w - 0.1 * buf

    sgd1.apply(w.name, w, g)
    assertTensorEqual(w, w_step2)
def train():
    """Start the training procedure
    """
    num_epochs = 1
    learning_rate = 0.05
    batch_size = 8

    data_loader = DataLoader(os.path.join("data", "fetal_health.csv"))
    data_loader.standardize_column("baseline value")
    x_train, y_train = data_loader.load_data(subset="train")
    x_valid, y_valid = data_loader.load_data(subset="valid")
    num_classes = len(np.unique(y_train))
    num_samples, num_features = x_train.shape

    assert x_train.shape[1] == x_valid.shape[1], \
        "Number of features should be equal!"
    assert x_train.shape[0] == y_train.shape[0], \
        "Number of training samples should be equal!"
    assert x_valid.shape[0] == y_valid.shape[0], \
        "Number of validation samples should be equal!"

    dev = get_default_device()
    tx = tensor.Tensor((num_samples, num_features), dev, tensor.float32)
    ty = tensor.Tensor((num_samples,), dev, tensor.int32)

    sgd = opt.SGD(learning_rate)
    model = create_MLP_model(perceptron_size=10, num_classes=num_classes)
    model.set_optimizer(sgd)
    model.compile([tx], is_train=True, use_graph=True, sequential=False)
    model.train()

    for i in range(num_epochs):
        tx.copy_from_numpy(x_train.astype(np.float32))
        ty.copy_from_numpy(y_train.astype(np.int32))
        out, loss = model(tx, ty, 'fp32', spars=None)

        # TODO: Add metric evaluation on validation data
        if i % 10 == 0:
            print("training loss = {:.3f}".format(tensor.to_numpy(loss)[0]))
def onnx_to_singa(epochs, use_cpu=False, batchsize=32):
    (x_train, y_train), (x_test, y_test), dev = common(use_cpu)
    model = sonnx.load("cnn.onnx")
    backend = sonnx.prepare(model, dev)
    autograd.training = True
    sgd = opt.SGD(lr=0.01)
    niter = x_train.shape[0] // batchsize
    for epoch in range(epochs):
        accuracy_rate = 0.0
        loss_rate = 0.0
        for i in range(niter):
            inputs = tensor.Tensor(
                device=dev,
                data=x_train[i * batchsize:(i + 1) * batchsize],
                stores_grad=False,
                name="input",
            )
            targets = tensor.Tensor(
                device=dev,
                data=y_train[i * batchsize:(i + 1) * batchsize],
                requires_grad=False,
                stores_grad=False,
                name="target",
            )
            y = backend.run([inputs])[0]
            loss = autograd.softmax_cross_entropy(y, targets)
            accuracy_rate += accuracy(
                tensor.to_numpy(y), y_train[i * batchsize:(i + 1) * batchsize])
            loss_rate += tensor.to_numpy(loss)[0]
            for p, gp in autograd.backward(loss):
                sgd.update(p, gp)
        print("accuracy is {}, loss is {}".format(accuracy_rate / niter,
                                                  loss_rate / niter))
# under the License.
#
# the code is modified from
# https://github.com/pytorch/vision/blob/master/torchvision/models/resnet.py

from singa import autograd
from singa import tensor
from singa import device
from singa import opt
import numpy as np
from tqdm import trange

if __name__ == "__main__":
    sgd = opt.SGD(lr=0.1, momentum=0.9, weight_decay=1e-5)
    sgd = opt.DistOpt(sgd)

    from resnet import resnet50
    model = resnet50()

    if (sgd.rank_in_global == 0):
        print("Start initialization...........", flush=True)

    dev = device.create_cuda_gpu_on(sgd.rank_in_local)

    niters = 100
    batch_size = 32
    IMG_SIZE = 224

    tx = tensor.Tensor((batch_size, 3, IMG_SIZE, IMG_SIZE), dev)
    ty = tensor.Tensor((batch_size,), dev, tensor.int32)
def train_mnist_cnn(DIST=False,
                    local_rank=None,
                    world_size=None,
                    nccl_id=None,
                    spars=0,
                    topK=False,
                    corr=True):

    # Define the hyperparameters for the mnist_cnn
    max_epoch = 10
    batch_size = 128
    sgd = opt.SGD(lr=0.005, momentum=0.9, weight_decay=1e-5)

    # Prepare training and validation data
    train_x, train_y, test_x, test_y = load_dataset()
    IMG_SIZE = 28
    num_classes = 10
    train_y = to_categorical(train_y, num_classes)
    test_y = to_categorical(test_y, num_classes)

    # Normalization
    train_x = train_x / 255
    test_x = test_x / 255

    if DIST:
        # For Distributed GPU Training
        '''
        sgd = opt.DistOpt(sgd,
                          nccl_id=nccl_id,
                          local_rank=local_rank,
                          world_size=world_size)
        '''
        dev = device.get_default_device()
        # create kvstore
        kv_type = 'dist_sync'  # set synchronization mode
        lr = 0.005
        kv = singa_kvstore.create_kvstore(kv_type,
                                          'SingaSGD',
                                          lr=0.005,
                                          momentum=0.9,
                                          weight_decay=1e-5)
        global_rank = kv.rank
        world_size = kv.num_workers

        # Dataset partition for distributed training
        train_x, train_y = data_partition(train_x, train_y, global_rank,
                                          world_size)
        test_x, test_y = data_partition(test_x, test_y, global_rank,
                                        world_size)

    # create model
    model = CNN()
    '''
    num_channels = train_x.shape[1]
    image_size = train_x.shape[2]
    data_size = np.prod(train_x.shape[1:train_x.ndim]).item()
    num_classes = (np.max(train_y) + 1).item()
    model = resnet.resnet18(num_channels=1, num_classes=num_classes)
    '''

    tx = tensor.Tensor((batch_size, 1, IMG_SIZE, IMG_SIZE), dev,
                       tensor.float32)
    ty = tensor.Tensor((batch_size, num_classes), dev, tensor.int32)
    num_train_batch = train_x.shape[0] // batch_size
    num_test_batch = test_x.shape[0] // batch_size
    idx = np.arange(train_x.shape[0], dtype=np.int32)
    '''
    if DIST:
        # Initial a batch to help obtain model parameters
        autograd.training = True
        x = np.random.randn(batch_size, 1, IMG_SIZE,
                            IMG_SIZE).astype(np.float32)
        y = np.zeros(shape=(batch_size, num_classes), dtype=np.int32)
        tx.copy_from_numpy(x)
        ty.copy_from_numpy(y)
        out = model.forward(tx)
        loss = autograd.softmax_cross_entropy(out, ty)
        # Initial kv store for workers of ps-architecture
        key = 0
        for p, g in autograd.backward(loss):
            kv.init(key, mx.nd.array(tensor.to_numpy(p)))
            key += 1
    '''

    # Training and Evaluation Loop
    for epoch in range(max_epoch):
        start_time = time.time()
        np.random.shuffle(idx)

        if (DIST == True):
            print('^_^Starting Epoch %d:' % (epoch))

        # Training Phase
        autograd.training = True
        train_correct = np.zeros(shape=[1], dtype=np.float32)
        test_correct = np.zeros(shape=[1], dtype=np.float32)
        train_loss = np.zeros(shape=[1], dtype=np.float32)
        time_start = time.time()
        for b in range(num_train_batch):
            x = train_x[idx[b * batch_size:(b + 1) * batch_size]]
            x = augmentation(x, batch_size)
            y = train_y[idx[b * batch_size:(b + 1) * batch_size]]
            tx.copy_from_numpy(x)
            ty.copy_from_numpy(y)
            out = model.forward(tx)
            loss = autograd.softmax_cross_entropy(out, ty)
            train_correct += accuracy(tensor.to_numpy(out), y)
            train_loss += tensor.to_numpy(loss)[0]
            singa_kvstore.backward_and_update(kv, loss)
            '''
            if DIST:
                # push
                kv_pairs = []
                key = 0
                for p, g in autograd.backward(loss):
                    kv.push(key, mx.nd.array(tensor.to_numpy(g)))
                    kv_pairs.append((key, p, g))
                    key += 1
                # pull
                for key, p, g in kv_pairs:
                    out_buf = mx.nd.zeros(p.shape)
                    kv.pull(key, out=out_buf)
                    p.copy_from_numpy(out_buf.asnumpy())
            '''

            # Evaluation Phase
            if b % 20 != 0:
                continue
            autograd.training = False
            num_test_batch_inside = 20
            test_correct = 0
            for b in range(num_test_batch_inside):
                x = test_x[b * batch_size:(b + 1) * batch_size]
                y = test_y[b * batch_size:(b + 1) * batch_size]
                tx.copy_from_numpy(x)
                ty.copy_from_numpy(y)
                out_test = model.forward(tx)
                test_correct += accuracy(tensor.to_numpy(out_test), y)
            print('Evaluation accuracy = %f' %
                  (test_correct / (batch_size * num_test_batch_inside)),
                  flush=True)
            autograd.training = True
        print('epoch time is %f' % (time.time() - time_start))
                        '--learning-rate',
                        default=0.005,
                        type=float,
                        help='initial learning rate',
                        dest='lr')
    # determine which gpu to use
    parser.add_argument('-i',
                        '--device-id',
                        default=0,
                        type=int,
                        help='which GPU to use',
                        dest='device_id')
    parser.add_argument('-g',
                        '--disable-graph',
                        default='True',
                        action='store_false',
                        help='disable graph',
                        dest='graph')
    parser.add_argument('-v',
                        '--log-verbosity',
                        default=0,
                        type=int,
                        help='logging verbosity',
                        dest='verbosity')
    args = parser.parse_args()

    sgd = opt.SGD(lr=args.lr, momentum=0.9, weight_decay=1e-5)

    run(0, 1, args.device_id, args.max_epoch, args.batch_size, args.model,
        args.data, sgd, args.graph, args.verbosity)
label = to_categorical(label, 2).astype(np.float32)
print("train_data_shape:", data.shape)
print("train_label_shape:", label.shape)

inputs = Tensor(data=data)
target = Tensor(data=label)

w0 = Tensor(shape=(2, 3), requires_grad=True, stores_grad=True)
w0.gaussian(0.0, 0.1)
b0 = Tensor(shape=(3,), requires_grad=True, stores_grad=True)
b0.set_value(0.0)

w1 = Tensor(shape=(3, 2), requires_grad=True, stores_grad=True)
w1.gaussian(0.0, 0.1)
b1 = Tensor(shape=(2,), requires_grad=True, stores_grad=True)
b1.set_value(0.0)

sgd = opt.SGD(0.05)

# training process
for i in range(1001):
    x = autograd.matmul(inputs, w0)
    x = autograd.add_bias(x, b0)
    x = autograd.relu(x)
    x = autograd.matmul(x, w1)
    x = autograd.add_bias(x, b1)
    loss = autograd.softmax_cross_entropy(x, target)
    sgd.backward_and_update(loss)
    if i % 100 == 0:
        print("training loss = ", tensor.to_numpy(loss)[0])
def train_mnist_cnn(DIST=False,
                    local_rank=None,
                    world_size=None,
                    nccl_id=None,
                    spars=0,
                    topK=False,
                    corr=True):

    # Define the hyperparameters for the mnist_cnn
    max_epoch = 10
    batch_size = 64
    sgd = opt.SGD(lr=0.005, momentum=0.9, weight_decay=1e-5)

    # Prepare training and validation data
    train_x, train_y, test_x, test_y = load_dataset()
    IMG_SIZE = 28
    num_classes = 10
    train_y = to_categorical(train_y, num_classes)
    test_y = to_categorical(test_y, num_classes)

    # Normalization
    train_x = train_x / 255
    test_x = test_x / 255

    if DIST:
        # For Distributed GPU Training
        sgd = opt.DistOpt(sgd,
                          nccl_id=nccl_id,
                          local_rank=local_rank,
                          world_size=world_size)
        dev = device.create_cuda_gpu_on(sgd.local_rank)
        # Dataset partition for distributed training
        train_x, train_y = data_partition(train_x, train_y, sgd.global_rank,
                                          sgd.world_size)
        test_x, test_y = data_partition(test_x, test_y, sgd.global_rank,
                                        sgd.world_size)
        world_size = sgd.world_size
    else:
        # For Single GPU
        dev = device.create_cuda_gpu()
        world_size = 1

    # create model
    model = CNN()

    tx = tensor.Tensor((batch_size, 1, IMG_SIZE, IMG_SIZE), dev,
                       tensor.float32)
    ty = tensor.Tensor((batch_size, num_classes), dev, tensor.int32)
    num_train_batch = train_x.shape[0] // batch_size
    num_test_batch = test_x.shape[0] // batch_size
    idx = np.arange(train_x.shape[0], dtype=np.int32)

    if DIST:
        # Synchronize the initial parameters
        autograd.training = True
        x = np.random.randn(batch_size, 1, IMG_SIZE,
                            IMG_SIZE).astype(np.float32)
        y = np.zeros(shape=(batch_size, num_classes), dtype=np.int32)
        tx.copy_from_numpy(x)
        ty.copy_from_numpy(y)
        out = model.forward(tx)
        loss = autograd.softmax_cross_entropy(out, ty)
        for p, g in autograd.backward(loss):
            synchronize(p, sgd)

    # Training and Evaluation Loop
    for epoch in range(max_epoch):
        start_time = time.time()
        np.random.shuffle(idx)

        if ((DIST == False) or (sgd.global_rank == 0)):
            print('Starting Epoch %d:' % (epoch))

        # Training Phase
        autograd.training = True
        train_correct = np.zeros(shape=[1], dtype=np.float32)
        test_correct = np.zeros(shape=[1], dtype=np.float32)
        train_loss = np.zeros(shape=[1], dtype=np.float32)

        for b in range(num_train_batch):
            x = train_x[idx[b * batch_size:(b + 1) * batch_size]]
            x = augmentation(x, batch_size)
            y = train_y[idx[b * batch_size:(b + 1) * batch_size]]
            tx.copy_from_numpy(x)
            ty.copy_from_numpy(y)
            out = model.forward(tx)
            loss = autograd.softmax_cross_entropy(out, ty)
            train_correct += accuracy(tensor.to_numpy(out), y)
            train_loss += tensor.to_numpy(loss)[0]
            if DIST:
                if (spars == 0):
                    sgd.backward_and_update(loss, threshold=50000)
                else:
                    sgd.backward_and_sparse_update(loss,
                                                   spars=spars,
                                                   topK=topK,
                                                   corr=corr)
            else:
                sgd.backward_and_update(loss)

        if DIST:
            # Reduce the Evaluation Accuracy and Loss from Multiple Devices
            reducer = tensor.Tensor((1,), dev, tensor.float32)
            train_correct = reduce_variable(train_correct, sgd, reducer)
            train_loss = reduce_variable(train_loss, sgd, reducer)

        # Output the Training Loss and Accuracy
        if ((DIST == False) or (sgd.global_rank == 0)):
            print('Training loss = %f, training accuracy = %f' %
                  (train_loss, train_correct /
                   (num_train_batch * batch_size * world_size)),
                  flush=True)

        # Evaluation Phase
        autograd.training = False
        for b in range(num_test_batch):
            x = test_x[b * batch_size:(b + 1) * batch_size]
            y = test_y[b * batch_size:(b + 1) * batch_size]
            tx.copy_from_numpy(x)
            ty.copy_from_numpy(y)
            out_test = model.forward(tx)
            test_correct += accuracy(tensor.to_numpy(out_test), y)

        if DIST:
            # Reduce the Evaluation Accuracy from Multiple Devices
            test_correct = reduce_variable(test_correct, sgd, reducer)

        # Output the Evaluation Accuracy
        if ((DIST == False) or (sgd.global_rank == 0)):
            print('Evaluation accuracy = %f, Elapsed Time = %fs' %
                  (test_correct / (num_test_batch * batch_size * world_size),
                   time.time() - start_time),
                  flush=True)
    data_per_rank = dataset_x.shape[0] // world_size
    idx_start = rank_in_global * data_per_rank
    idx_end = (rank_in_global + 1) * data_per_rank
    return dataset_x[idx_start:idx_end], dataset_y[idx_start:idx_end]


if __name__ == '__main__':
    # Generate a NCCL ID to be used for collective communication
    nccl_id = singa.NcclIdHolder()

    gpu_per_node = 8
    max_epoch = 10
    batch_size = 64

    sgd = opt.SGD(lr=0.005 * gpu_per_node, momentum=0.9, weight_decay=1e-5)

    # Use sparsification with parameters
    topK = False  # When topK = False, sparsification uses a constant absolute threshold
    corr = True  # If True, use the locally accumulated gradient for the correction
    sparsThreshold = 0.05  # The constant absolute threshold for sparsification

    process = []
    for gpu_num in range(0, gpu_per_node):
        process.append(
            multiprocessing.Process(target=train_mnist_cnn,
                                    args=(sgd, max_epoch, batch_size, True,
                                          data_partition, gpu_num,
                                          gpu_per_node, nccl_id,
                                          sparsThreshold, topK, corr)))