def onnx_to_singa(niter, use_cpu=False):
    if use_cpu:
        print("Using CPU")
        dev = device.get_default_device()
    else:
        print("Using GPU")
        dev = device.create_cuda_gpu()
    model = sonnx.load("mlp.onnx")
    backend = sonnx.prepare(model, device=dev)
    sgd = opt.SGD(0.1)
    inputs = Tensor(
        data=data,
        device=dev,
        requires_grad=False,
        stores_grad=False,
        name="input",
    )
    target = Tensor(
        data=label,
        device=dev,
        requires_grad=False,
        stores_grad=False,
        name="target",
    )

    for i in range(niter):
        y = backend.run([inputs])[0]
        loss = autograd.softmax_cross_entropy(y, target)
        for p, gp in autograd.backward(loss):
            sgd.update(p, gp)

        loss_rate = tensor.to_numpy(loss)[0]
        accuracy_rate = accuracy(tensor.to_numpy(y), label)
        print("Iter {}, accurate={}, loss={}".format(i, accuracy_rate,
                                                     loss_rate))
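# The accuracy() helper used above is not defined in this snippet. A minimal
# sketch of what it could compute, assuming the predictions are softmax
# outputs and the labels are one-hot encoded numpy arrays:
import numpy as np


def accuracy(pred, target):
    # fraction of samples where the predicted class matches the labelled class
    y = np.argmax(pred, axis=1)
    t = np.argmax(target, axis=1)
    return np.sum(y == t) / float(len(y))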
def singa_to_onnx(epochs, use_cpu=False, batchsize=32):
    sgd = opt.SGD(lr=0.1)

    # operations initialization
    conv1 = autograd.Conv2d(1, 8, 3, 2, padding=1)  # 28 - 14
    conv2 = autograd.Conv2d(8, 4, 3, 2, padding=1)  # 14 - 7
    pooling = autograd.MaxPool2d(3, 2, padding=1)  # 7 - 4
    linear = autograd.Linear(64, 10)

    def forward(x, t):
        y = conv1(x)
        y = autograd.relu(y)
        y = conv2(y)
        y = autograd.relu(y)
        y = pooling(y)
        y = autograd.flatten(y)
        y = linear(y)
        loss = autograd.softmax_cross_entropy(y, t)
        return loss, y

    autograd.training = True
    (x_train, y_train), (x_test, y_test), dev = common(use_cpu)

    niter = 1  # x_train.shape[0] // batchsize
    for epoch in range(epochs):
        accuracy_rate = 0.0
        loss_rate = 0.0
        for i in range(niter):
            inputs = tensor.Tensor(
                device=dev,
                data=x_train[i * batchsize:(i + 1) * batchsize],
                stores_grad=False,
                name="input",
            )
            targets = tensor.Tensor(
                device=dev,
                data=y_train[i * batchsize:(i + 1) * batchsize],
                requires_grad=False,
                stores_grad=False,
                name="target",
            )
            loss, y = forward(inputs, targets)
            accuracy_rate += accuracy(
                tensor.to_numpy(y),
                y_train[i * batchsize:(i + 1) * batchsize])
            loss_rate += tensor.to_numpy(loss)[0]
            for p, gp in autograd.backward(loss):
                sgd.update(p, gp)

        print("accuracy is {}, loss is {}".format(accuracy_rate / niter,
                                                  loss_rate / niter))

    model = sonnx.to_onnx_model([inputs], [y])
    sonnx.save(model, "cnn.onnx")
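# common() is referenced above but not defined in these snippets. A plausible
# sketch, assuming it loads the MNIST arrays via the load_dataset() and
# to_categorical() helpers used by the MNIST scripts below and selects the
# device; the exact data source and normalization here are assumptions:
def common(use_cpu):
    if use_cpu:
        print("Using CPU")
        dev = device.get_default_device()
    else:
        print("Using GPU")
        dev = device.create_cuda_gpu()
    train_x, train_y, test_x, test_y = load_dataset()  # assumed loader
    train_x, test_x = train_x / 255, test_x / 255
    train_y = to_categorical(train_y, 10)
    test_y = to_categorical(test_y, 10)
    return (train_x, train_y), (test_x, test_y), dev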
def test_vanillaRNN_gpu_tiny_ops_shape_check(self):
    # gradients shape check.
    inputs, target, h0 = prepare_inputs_targets_for_rnn_test()
    rnn = autograd.RNN(3, 2)

    hs, _ = rnn(inputs, h0)

    loss = autograd.softmax_cross_entropy(hs[0], target[0])
    for i in range(1, len(hs)):
        l = autograd.softmax_cross_entropy(hs[i], target[i])
        loss = autograd.add(loss, l)
    # d=autograd.infer_dependency(loss.creator)
    # print(d)
    for t, dt in autograd.backward(loss):
        self.check_shape(t.shape, dt.shape)
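# prepare_inputs_targets_for_rnn_test() is defined elsewhere in the test file.
# A rough sketch of its shape contract, inferred from RNN(3, 2) above: a short
# sequence of (batch=2, feature=3) inputs, matching (2, 2) targets, and a
# (2, 2) initial hidden state, all placed on gpu_dev. The concrete values and
# sequence length are assumptions.
def prepare_inputs_targets_for_rnn_test():
    xs = [np.random.random((2, 3)).astype(np.float32) for _ in range(3)]
    ts = [np.random.random((2, 2)).astype(np.float32) for _ in range(3)]
    h_0 = np.zeros((2, 2), dtype=np.float32)
    inputs = [tensor.Tensor(device=gpu_dev, data=x) for x in xs]
    target = [tensor.Tensor(device=gpu_dev, data=t) for t in ts]
    h0 = tensor.Tensor(device=gpu_dev, data=h_0)
    return inputs, target, h0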
def singa_to_onnx(niter, use_cpu=False):
    if use_cpu:
        print("Using CPU")
        dev = device.get_default_device()
    else:
        print("Using GPU")
        dev = device.create_cuda_gpu()

    inputs = Tensor(
        data=data,
        device=dev,
        requires_grad=False,
        stores_grad=False,
        name="input",
    )
    target = Tensor(
        data=label,
        device=dev,
        requires_grad=False,
        stores_grad=False,
        name="target",
    )

    w0 = Tensor(shape=(2, 3), device=dev, requires_grad=True, stores_grad=True)
    w0.gaussian(0.0, 0.1)
    b0 = Tensor(shape=(3,), device=dev, requires_grad=True, stores_grad=True)
    b0.set_value(0.0)

    w1 = Tensor(shape=(3, 2), device=dev, requires_grad=True, stores_grad=True)
    w1.gaussian(0.0, 0.1)
    b1 = Tensor(shape=(2,), device=dev, requires_grad=True, stores_grad=True)
    b1.set_value(0.0)

    sgd = opt.SGD(0.1)

    # training process
    for i in range(niter):
        x = autograd.matmul(inputs, w0)
        x = autograd.add_bias(x, b0)
        x = autograd.relu(x)
        x = autograd.matmul(x, w1)
        x = autograd.add_bias(x, b1)
        loss = autograd.softmax_cross_entropy(x, target)
        for p, gp in autograd.backward(loss):
            sgd.update(p, gp)
        print("training loss = ", tensor.to_numpy(loss)[0])

    sonnx.export([inputs], [x], file_path="mlp.onnx")
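# A hypothetical round trip for the MLP example: export the trained graph to
# mlp.onnx with singa_to_onnx(), then reload it and continue training with the
# onnx_to_singa() function shown earlier. The iteration counts are arbitrary.
if __name__ == "__main__":
    singa_to_onnx(100, use_cpu=False)  # writes mlp.onnx
    onnx_to_singa(100, use_cpu=False)  # reloads mlp.onnx and keeps training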
def test_LSTM_gpu_tiny_ops_shape_check(self):
    # gradients shape check.
    inputs, target, h0 = prepare_inputs_targets_for_rnn_test()
    c_0 = np.random.random((2, 1)).astype(np.float32)
    c0 = tensor.Tensor(device=gpu_dev, data=c_0)

    rnn = autograd.LSTM(3, 2)

    hs, _, _ = rnn(inputs, (h0, c0))
    loss = autograd.softmax_cross_entropy(hs[0], target[0])
    for i in range(1, len(hs)):
        l = autograd.softmax_cross_entropy(hs[i], target[i])
        loss = autograd.add(loss, l)
    # d=autograd.infer_dependency(loss.creator)
    # print(d)
    for t, dt in autograd.backward(loss):
        self.check_shape(t.shape, dt.shape)
def onnx_to_singa(epochs, use_cpu=False, batchsize=32):
    (x_train, y_train), (x_test, y_test), dev = common(use_cpu)
    model = sonnx.load("cnn.onnx")
    backend = sonnx.prepare(model, dev)
    autograd.training = True
    sgd = opt.SGD(lr=0.01)
    niter = x_train.shape[0] // batchsize
    for epoch in range(epochs):
        accuracy_rate = 0.0
        loss_rate = 0.0
        for i in range(niter):
            inputs = tensor.Tensor(
                device=dev,
                data=x_train[i * batchsize:(i + 1) * batchsize],
                stores_grad=False,
                name="input",
            )
            targets = tensor.Tensor(
                device=dev,
                data=y_train[i * batchsize:(i + 1) * batchsize],
                requires_grad=False,
                stores_grad=False,
                name="target",
            )
            y = backend.run([inputs])[0]
            loss = autograd.softmax_cross_entropy(y, targets)
            accuracy_rate += accuracy(
                tensor.to_numpy(y),
                y_train[i * batchsize:(i + 1) * batchsize])
            loss_rate += tensor.to_numpy(loss)[0]
            for p, gp in autograd.backward(loss):
                sgd.update(p, gp)

        print("accuracy is {}, loss is {}".format(accuracy_rate / niter,
                                                  loss_rate / niter))
    return x


if __name__ == '__main__':
    model = Xception(num_classes=1000)
    print('Start initialization............')
    dev = device.create_cuda_gpu_on(0)
    # dev = device.create_cuda_gpu()

    niters = 20
    batch_size = 16
    IMG_SIZE = 299
    sgd = opt.SGD(lr=0.1, momentum=0.9, weight_decay=1e-5)

    tx = tensor.Tensor((batch_size, 3, IMG_SIZE, IMG_SIZE), dev)
    ty = tensor.Tensor((batch_size,), dev, tensor.int32)
    autograd.training = True
    x = np.random.randn(batch_size, 3, IMG_SIZE, IMG_SIZE).astype(np.float32)
    y = np.random.randint(0, 1000, batch_size, dtype=np.int32)
    tx.copy_from_numpy(x)
    ty.copy_from_numpy(y)

    with trange(niters) as t:
        for b in t:
            x = model(tx)
            loss = autograd.softmax_cross_entropy(x, ty)
            for p, g in autograd.backward(loss):
                # print(p.shape, g.shape)
                sgd.update(p, g)
                # pass
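# Optional variant of the benchmark loop above: tqdm's trange supports
# set_postfix, so the running loss can be shown on the progress bar. A small
# sketch that reuses the same model/sgd/tx/ty objects; the function name and
# its packaging as a helper are made up for illustration.
def benchmark_with_progress(model, sgd, tx, ty, niters):
    with trange(niters) as t:
        for _ in t:
            out = model(tx)
            loss = autograd.softmax_cross_entropy(out, ty)
            for p, g in autograd.backward(loss):
                sgd.update(p, g)
            # show the current loss value next to the progress bar
            t.set_postfix(loss=float(tensor.to_numpy(loss)[0]))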
def train_cifar10(sgd, max_epoch, batch_size, DIST=False, data_partition=None,
                  gpu_num=None, gpu_per_node=None, nccl_id=None,
                  partial_update=False):
    train_x, train_y = load_train_data()
    test_x, test_y = load_test_data()
    train_x, test_x = normalize_for_resnet(train_x, test_x)
    IMG_SIZE = 224
    num_classes = 10

    if DIST:
        # For Distributed GPU Training
        sgd = opt.DistOpt(sgd, nccl_id=nccl_id, gpu_num=gpu_num,
                          gpu_per_node=gpu_per_node)
        dev = device.create_cuda_gpu_on(sgd.rank_in_local)
        # Dataset partition for distributed training
        train_x, train_y = data_partition(train_x, train_y,
                                          sgd.rank_in_global, sgd.world_size)
        test_x, test_y = data_partition(test_x, test_y,
                                        sgd.rank_in_global, sgd.world_size)
        world_size = sgd.world_size
    else:
        # For Single GPU
        dev = device.create_cuda_gpu()
        world_size = 1

    from resnet import resnet50
    model = resnet50(num_classes=num_classes)

    tx = tensor.Tensor((batch_size, 3, IMG_SIZE, IMG_SIZE), dev, tensor.float32)
    ty = tensor.Tensor((batch_size,), dev, tensor.int32)
    num_train_batch = train_x.shape[0] // batch_size
    num_test_batch = test_x.shape[0] // batch_size
    idx = np.arange(train_x.shape[0], dtype=np.int32)

    if DIST:
        # Synchronize the initial parameters
        autograd.training = True
        x = np.random.randn(batch_size, 3, IMG_SIZE,
                            IMG_SIZE).astype(np.float32)
        y = np.zeros(shape=(batch_size,), dtype=np.int32)
        tx.copy_from_numpy(x)
        ty.copy_from_numpy(y)
        out = model(tx)
        loss = autograd.softmax_cross_entropy(out, ty)
        param = []
        for p, _ in autograd.backward(loss):
            sychronize(p, sgd)
            param.append(p)

    for epoch in range(max_epoch):
        start_time = time.time()
        np.random.shuffle(idx)

        if ((DIST == False) or (sgd.rank_in_global == 0)):
            print('Starting Epoch %d:' % (epoch))

        # Training Phase
        autograd.training = True
        train_correct = np.zeros(shape=[1], dtype=np.float32)
        test_correct = np.zeros(shape=[1], dtype=np.float32)
        train_loss = np.zeros(shape=[1], dtype=np.float32)

        for b in range(num_train_batch):
            x = train_x[idx[b * batch_size:(b + 1) * batch_size]]
            x = augmentation(x, batch_size)
            x = resize_dataset(x, IMG_SIZE)
            y = train_y[idx[b * batch_size:(b + 1) * batch_size]]
            tx.copy_from_numpy(x)
            ty.copy_from_numpy(y)
            out = model(tx)
            loss = autograd.softmax_cross_entropy(out, ty)
            train_correct += accuracy(
                tensor.to_numpy(out),
                to_categorical(y, num_classes)).astype(np.float32)
            train_loss += tensor.to_numpy(loss)[0]
            if not partial_update:
                sgd.backward_and_update(loss)
            else:
                sgd.backward_and_partial_update(loss)

        if DIST:
            # Reduce the Evaluation Accuracy and Loss from Multiple Devices
            reducer = tensor.Tensor((1,), dev, tensor.float32)
            train_correct = reduce_variable(train_correct, sgd, reducer)
            train_loss = reduce_variable(train_loss, sgd, reducer)

        # Output the Training Loss and Accuracy
        if ((DIST == False) or (sgd.rank_in_global == 0)):
            print('Training loss = %f, training accuracy = %f' %
                  (train_loss, train_correct /
                   (num_train_batch * batch_size * world_size)), flush=True)

        if partial_update:
            # synchronize parameters before evaluation phase
            for p in param:
                sychronize(p, sgd)

        # Evaluation Phase
        autograd.training = False
        for b in range(num_test_batch):
            x = test_x[b * batch_size:(b + 1) * batch_size]
            x = resize_dataset(x, IMG_SIZE)
            y = test_y[b * batch_size:(b + 1) * batch_size]
            tx.copy_from_numpy(x)
            ty.copy_from_numpy(y)
            out_test = model(tx)
            test_correct += accuracy(tensor.to_numpy(out_test),
                                     to_categorical(y, num_classes))

        if DIST:
            # Reduce the Evaluation Accuracy from Multiple Devices
            test_correct = reduce_variable(test_correct, sgd, reducer)

        # Output the Evaluation Accuracy
        if ((DIST == False) or (sgd.rank_in_global == 0)):
            print('Evaluation accuracy = %f, Elapsed Time = %fs' %
                  (test_correct / (num_test_batch * batch_size * world_size),
                   time.time() - start_time), flush=True)
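# sychronize() and reduce_variable() are distributed-training helpers defined
# elsewhere in the example scripts (one of the MNIST scripts below spells the
# first one "synchronize"). Minimal sketches of what such helpers typically do
# with a DistOpt instance (all-reduce, then average); treat the exact bodies
# as assumptions rather than the library's definitions:
def sychronize(t, dist_opt):
    # average a parameter tensor across all workers
    dist_opt.all_reduce(t.data)
    dist_opt.wait()
    t /= dist_opt.world_size


def reduce_variable(variable, dist_opt, reducer):
    # sum a host-side numpy statistic (loss/accuracy) across all workers
    reducer.copy_from_numpy(variable)
    dist_opt.all_reduce(reducer.data)
    dist_opt.wait()
    return tensor.to_numpy(reducer)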
label = to_categorical(label, 2).astype(np.float32)
print('train_data_shape:', data.shape)
print('train_label_shape:', label.shape)

inputs = Tensor(data=data)
target = Tensor(data=label)

linear1 = autograd.Linear(3, 2)
linear2 = autograd.Linear(2, 2)
linear3 = autograd.Linear(2, 2)

sgd = optimizer.SGD(0.00)

# training process
for i in range(1):
    x = linear1(inputs)
    x = autograd.relu(x)
    x1 = linear2(x)
    x2 = linear3(x)
    x3 = autograd.add(x1, x2)
    y = autograd.softmax(x3)
    loss = autograd.cross_entropy(y, target)
    gradient = autograd.backward(loss)
    for p, gp in gradient:
        sgd.apply(0, gp, p, '')

    if (i % 100 == 0):
        print('training loss = ', tensor.to_numpy(loss)[0])

model = sonnx.to_onnx_model([inputs], [y])
onnx.save(model, 'linear.onnx')
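# to_categorical() converts integer class labels into one-hot vectors; it is
# used throughout these snippets but never defined here. A minimal sketch,
# assuming a 1-D array of integer labels:
def to_categorical(y, num_classes):
    y = np.asarray(y, dtype="int")
    categorical = np.zeros((len(y), num_classes), dtype=np.float32)
    categorical[np.arange(len(y)), y] = 1.0
    return categorical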
def backward_and_spars_update(self,
                              loss,
                              threshold=2097152,
                              spars=0.05,
                              topK=False,
                              corr=True):
    """Performs backward propagation from the loss and parameter update with
    sparsification.

    THIS IS AN EXPERIMENTAL FUNCTION FOR RESEARCH PURPOSES:
    From the loss, it performs backward propagation to get the gradients and
    does the parameter update. It fuses the tensors with size smaller than the
    threshold value to reduce network latency, and uses sparsification schemes
    to transfer only the gradient elements which are significant.

    Args:
        loss(Tensor): loss is the objective function of the deep learning model
            optimization, e.g. for a classification problem it can be the output
            of the softmax_cross_entropy function.
        threshold(int): threshold is a parameter to control performance in
            fusing the tensors. Tensors smaller than the threshold are
            accumulated and fused before the all-reduce operation, while tensors
            larger than the threshold are reduced directly without fusion.
        spars(float): a parameter to control sparsity, as defined below.
        topK(bool): When topK is False, it sparsifies the gradient elements with
            absolute value >= spars. When topK is True, it sparsifies a fraction
            of the total gradient elements equal to spars, e.g. when
            spars = 0.01, it sparsifies 1% of the total gradient elements.
        corr(bool): whether to use the locally accumulated gradient for
            correction.

    Attributes:
        self.sparsInit: A flag indicating whether the gradient accumulation
            tensors have been initialized.
        self.gradAccumulation: Local gradient accumulation.
    """
    if ((not hasattr(self, "sparsInit")) and corr):
        self.gradAccumulation = []
        self.sparsInit = False
    plist = []
    acc = 0
    k = -1
    glist = []
    for p, g in autograd.backward(loss):
        if g.size() > threshold:
            # larger than threshold -> reduced directly
            k += 1
            if (corr and (not self.sparsInit)):
                # create a tensor for the gradient accumulation
                self.gradAccumulation.append(
                    tensor.Tensor((g.size(),), p.device, p.dtype))
                self.gradAccumulation[k].set_value(0.0)
            if corr:
                self.sparsification(g.data, self.gradAccumulation[k].data,
                                    spars, topK)
            else:
                self.sparsification(g.data, None, spars, topK)
        else:
            # smaller than threshold -> accumulate
            glist.append(g.data)
            acc += g.size()
            if (acc > threshold):
                k += 1
                if (corr and (not self.sparsInit)):
                    # create a tensor for the gradient accumulation
                    self.gradAccumulation.append(
                        tensor.Tensor((acc,), p.device, p.dtype))
                    self.gradAccumulation[k].set_value(0.0)
                if corr:
                    self.fused_sparsification(glist,
                                              self.gradAccumulation[k].data,
                                              spars, topK)
                else:
                    self.fused_sparsification(glist, None, spars, topK)
                acc = 0
                glist = []
        plist.append((p, g))
    if glist:
        k += 1
        if (corr and (not self.sparsInit)):
            # create a tensor for the gradient accumulation
            self.gradAccumulation.append(
                tensor.Tensor((acc,), p.device, p.dtype))
            self.gradAccumulation[k].set_value(0.0)
        if corr:
            self.fused_sparsification(glist, self.gradAccumulation[k].data,
                                      spars, topK)
        else:
            self.fused_sparsification(glist, None, spars, topK)
    self.wait()
    for p, g in plist:
        self.update(p, g)
    self.sparsInit = True
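# A hedged usage sketch for the sparse update above: wrap a plain SGD optimizer
# in DistOpt and call the method once per mini-batch instead of the manual
# backward/update loop. Communicator setup (MPI/NCCL id, ranks), the model,
# and the data tensors are assumed to exist as in the training scripts here.
sgd = opt.DistOpt(opt.SGD(lr=0.005, momentum=0.9))
for b in range(num_train_batch):
    out = model.forward(tx)
    loss = autograd.softmax_cross_entropy(out, ty)
    # keep the top 1% of gradient elements, with local gradient correction
    sgd.backward_and_spars_update(loss, spars=0.01, topK=True, corr=True)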
def backward_and_update(self, loss):
    for p, g in autograd.backward(loss):
        self.update(p, g)
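# This is the fused equivalent of the manual pattern used elsewhere in these
# snippets, i.e. it replaces:
#
#     for p, g in autograd.backward(loss):
#         sgd.update(p, g)
#
# with a single call:
#
#     sgd.backward_and_update(loss)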
def forward(x, t):
    y = conv1(x)
    y = autograd.relu(y)
    y = conv2(y)
    y = autograd.relu(y)
    y = autograd.max_pool_2d(y)
    y = autograd.flatten(y)
    y = linear(y)
    y = autograd.soft_max(y)
    loss = autograd.cross_entropy(y, t)
    return loss, y


autograd.training = True
for epoch in range(epochs):
    for i in range(batch_number):
        inputs = tensor.Tensor(data=x_train[i * 100:(1 + i) * 100, :])
        targets = tensor.Tensor(data=y_train[i * 100:(1 + i) * 100, :])

        loss, y = forward(inputs, targets)

        accuracy_rate = accuracy(autograd.ctensor2numpy(y.data),
                                 autograd.ctensor2numpy(targets.data))
        if (i % 5 == 0):
            print('accuracy is:', accuracy_rate, 'loss is:',
                  autograd.ctensor2numpy(loss.data)[0])

        in_grads = autograd.backward(loss)

        for param in in_grads:
            sgd.apply(0, in_grads[param], param, '')
def train_mnist_cnn(DIST=False,
                    local_rank=None,
                    world_size=None,
                    nccl_id=None,
                    spars=0,
                    topK=False,
                    corr=True):

    # Define the hyperparameters suitable for mnist_cnn
    max_epoch = 10
    batch_size = 64
    sgd = opt.SGD(lr=0.005, momentum=0.9, weight_decay=1e-5)

    # Prepare training and validation data
    train_x, train_y, test_x, test_y = load_dataset()
    IMG_SIZE = 28
    num_classes = 10
    train_y = to_categorical(train_y, num_classes)
    test_y = to_categorical(test_y, num_classes)

    # Normalization
    train_x = train_x / 255
    test_x = test_x / 255

    if DIST:
        # For Distributed GPU Training
        sgd = opt.DistOpt(sgd,
                          nccl_id=nccl_id,
                          local_rank=local_rank,
                          world_size=world_size)
        dev = device.get_default_device(sgd.local_rank)
        # Dataset partition for distributed training
        train_x, train_y = data_partition(train_x, train_y, sgd.global_rank,
                                          sgd.world_size)
        test_x, test_y = data_partition(test_x, test_y, sgd.global_rank,
                                        sgd.world_size)
        world_size = sgd.world_size
    else:
        # For Single GPU
        dev = device.get_default_device()
        world_size = 1

    # create model
    model = CNN()

    tx = tensor.Tensor((batch_size, 1, IMG_SIZE, IMG_SIZE), dev, tensor.float32)
    ty = tensor.Tensor((batch_size, num_classes), dev, tensor.int32)
    num_train_batch = train_x.shape[0] // batch_size
    num_test_batch = test_x.shape[0] // batch_size
    idx = np.arange(train_x.shape[0], dtype=np.int32)

    if DIST:
        # Synchronize the initial parameters
        autograd.training = True
        x = np.random.randn(batch_size, 1, IMG_SIZE,
                            IMG_SIZE).astype(np.float32)
        y = np.zeros(shape=(batch_size, num_classes), dtype=np.int32)
        tx.copy_from_numpy(x)
        ty.copy_from_numpy(y)
        out = model.forward(tx)
        loss = autograd.softmax_cross_entropy(out, ty)
        for p, g in autograd.backward(loss):
            print('tensor.data.type is %s' % type(p.data).__name__)
            synchronize(p, sgd)

    # Training and Evaluation Loop
    for epoch in range(max_epoch):
        start_time = time.time()
        np.random.shuffle(idx)

        if ((DIST == False) or (sgd.global_rank == 0)):
            print('Starting Epoch %d:' % (epoch))

        # Training Phase
        autograd.training = True
        train_correct = np.zeros(shape=[1], dtype=np.float32)
        test_correct = np.zeros(shape=[1], dtype=np.float32)
        train_loss = np.zeros(shape=[1], dtype=np.float32)

        time_start = time.time()
        for b in range(num_train_batch):
            x = train_x[idx[b * batch_size:(b + 1) * batch_size]]
            x = augmentation(x, batch_size)
            y = train_y[idx[b * batch_size:(b + 1) * batch_size]]
            tx.copy_from_numpy(x)
            ty.copy_from_numpy(y)
            out = model.forward(tx)
            loss = autograd.softmax_cross_entropy(out, ty)
            train_correct += accuracy(tensor.to_numpy(out), y)
            train_loss += tensor.to_numpy(loss)[0]
            if DIST:
                if (spars == 0):
                    sgd.backward_and_update(loss, threshold=50000)
                else:
                    sgd.backward_and_sparse_update(loss,
                                                   spars=spars,
                                                   topK=topK,
                                                   corr=corr)
            else:
                sgd.backward_and_update(loss)

            # Evaluation Phase (run every 20 training batches)
            if b % 20 != 0:
                continue
            autograd.training = False
            num_test_batch_inside = 20
            test_correct = 0
            for b in range(num_test_batch_inside):
                x = test_x[b * batch_size:(b + 1) * batch_size]
                y = test_y[b * batch_size:(b + 1) * batch_size]
                tx.copy_from_numpy(x)
                ty.copy_from_numpy(y)
                out_test = model.forward(tx)
                test_correct += accuracy(tensor.to_numpy(out_test), y)
            print('Evaluation accuracy = %f' %
                  (test_correct / (batch_size * num_test_batch_inside)),
                  flush=True)
            autograd.training = True
        print('epoch time is %f' % (time.time() - time_start))

        if DIST:
            # Reduce the Evaluation Accuracy and Loss from Multiple Devices
            reducer = tensor.Tensor((1,), dev, tensor.float32)
            train_correct = reduce_variable(train_correct, sgd, reducer)
            train_loss = reduce_variable(train_loss, sgd, reducer)

        # Output the Training Loss and Accuracy
        if ((DIST == False) or (sgd.global_rank == 0)):
            print('Training loss = %f, training accuracy = %f' %
                  (train_loss, train_correct /
                   (num_train_batch * batch_size * world_size)),
                  flush=True)

        # Evaluation Phase
        autograd.training = False
        for b in range(num_test_batch):
            x = test_x[b * batch_size:(b + 1) * batch_size]
            y = test_y[b * batch_size:(b + 1) * batch_size]
            tx.copy_from_numpy(x)
            ty.copy_from_numpy(y)
            out_test = model.forward(tx)
            test_correct += accuracy(tensor.to_numpy(out_test), y)

        if DIST:
            # Reduce the Evaluation Accuracy from Multiple Devices
            test_correct = reduce_variable(test_correct, sgd, reducer)

        # Output the Evaluation Accuracy
        if ((DIST == False) or (sgd.global_rank == 0)):
            print('Evaluation accuracy = %f, Elapsed Time = %fs' %
                  (test_correct / (num_test_batch * batch_size * world_size),
                   time.time() - start_time),
                  flush=True)
def train_mnist_cnn(sgd, max_epoch, batch_size, DIST=False, data_partition=None,
                    gpu_num=None, gpu_per_node=None, nccl_id=None):

    # Prepare training and validation data
    train_x, train_y, test_x, test_y = load_dataset()
    IMG_SIZE = 28
    num_classes = 10
    train_y = to_categorical(train_y, num_classes)
    test_y = to_categorical(test_y, num_classes)

    # Normalization
    train_x = train_x / 255
    test_x = test_x / 255

    if DIST:
        # For Distributed GPU Training
        sgd = opt.DistOpt(sgd, nccl_id=nccl_id, gpu_num=gpu_num,
                          gpu_per_node=gpu_per_node)
        dev = device.create_cuda_gpu_on(sgd.rank_in_local)
        # Dataset partition for distributed training
        train_x, train_y = data_partition(train_x, train_y,
                                          sgd.rank_in_global, sgd.world_size)
        test_x, test_y = data_partition(test_x, test_y,
                                        sgd.rank_in_global, sgd.world_size)
        world_size = sgd.world_size
    else:
        # For Single GPU
        dev = device.create_cuda_gpu()
        world_size = 1

    # create model
    model = CNN()

    tx = tensor.Tensor((batch_size, 1, IMG_SIZE, IMG_SIZE), dev, tensor.float32)
    ty = tensor.Tensor((batch_size, num_classes), dev, tensor.int32)
    num_train_batch = train_x.shape[0] // batch_size
    num_test_batch = test_x.shape[0] // batch_size
    idx = np.arange(train_x.shape[0], dtype=np.int32)

    if DIST:
        # Synchronize the initial parameters
        autograd.training = True
        x = np.random.randn(batch_size, 1, IMG_SIZE,
                            IMG_SIZE).astype(np.float32)
        y = np.zeros(shape=(batch_size, num_classes), dtype=np.int32)
        tx.copy_from_numpy(x)
        ty.copy_from_numpy(y)
        out = model.forward(tx)
        loss = autograd.softmax_cross_entropy(out, ty)
        for p, g in autograd.backward(loss):
            sychronize(p, sgd)

    # Training and Evaluation Loop
    for epoch in range(max_epoch):
        start_time = time.time()
        np.random.shuffle(idx)

        if ((DIST == False) or (sgd.rank_in_global == 0)):
            print('Starting Epoch %d:' % (epoch))

        # Training Phase
        autograd.training = True
        train_correct = np.zeros(shape=[1], dtype=np.float32)
        test_correct = np.zeros(shape=[1], dtype=np.float32)
        train_loss = np.zeros(shape=[1], dtype=np.float32)

        for b in range(num_train_batch):
            x = train_x[idx[b * batch_size:(b + 1) * batch_size]]
            x = augmentation(x, batch_size)
            y = train_y[idx[b * batch_size:(b + 1) * batch_size]]
            tx.copy_from_numpy(x)
            ty.copy_from_numpy(y)
            out = model.forward(tx)
            loss = autograd.softmax_cross_entropy(out, ty)
            train_correct += accuracy(tensor.to_numpy(out), y)
            train_loss += tensor.to_numpy(loss)[0]
            for p, g in autograd.backward(loss):
                sgd.update(p, g)

        if DIST:
            # Reduce the Evaluation Accuracy and Loss from Multiple Devices
            reducer = tensor.Tensor((1,), dev, tensor.float32)
            train_correct = reduce_variable(train_correct, sgd, reducer)
            train_loss = reduce_variable(train_loss, sgd, reducer)

        # Output the Training Loss and Accuracy
        if ((DIST == False) or (sgd.rank_in_global == 0)):
            print('Training loss = %f, training accuracy = %f' %
                  (train_loss, train_correct /
                   (num_train_batch * batch_size * world_size)), flush=True)

        # Evaluation Phase
        autograd.training = False
        for b in range(num_test_batch):
            x = test_x[b * batch_size:(b + 1) * batch_size]
            y = test_y[b * batch_size:(b + 1) * batch_size]
            tx.copy_from_numpy(x)
            ty.copy_from_numpy(y)
            out_test = model.forward(tx)
            test_correct += accuracy(tensor.to_numpy(out_test), y)

        if DIST:
            # Reduce the Evaluation Accuracy from Multiple Devices
            test_correct = reduce_variable(test_correct, sgd, reducer)

        # Output the Evaluation Accuracy
        if ((DIST == False) or (sgd.rank_in_global == 0)):
            print('Evaluation accuracy = %f, Elapsed Time = %fs' %
                  (test_correct / (num_test_batch * batch_size * world_size),
                   time.time() - start_time), flush=True)
def backward_and_partial_update(self, loss, threshold=2097152):
    """Performs backward propagation from the loss and parameter update using
    asynchronous training.

    THIS IS AN EXPERIMENTAL FUNCTION FOR RESEARCH PURPOSES:
    From the loss, it performs backward propagation to get the gradients and
    does the parameter update. It fuses the tensors smaller than the threshold
    value to reduce network latency, and performs asynchronous training where
    one parameter partition is all-reduced per iteration. The size of the
    parameter partition depends on the threshold value.

    Args:
        loss(Tensor): loss is the objective function of the deep learning model
            optimization, e.g. for a classification problem it can be the output
            of the softmax_cross_entropy function.
        threshold(int): threshold is a parameter to control performance in
            fusing the tensors. Tensors smaller than the threshold are
            accumulated and fused before the all-reduce operation, while tensors
            larger than the threshold are reduced directly without fusion.

    Attributes:
        self.partial(int): A counter to determine which partition to perform
            all-reduce. This counter resets to zero automatically after an
            update cycle of the full parameter set.
    """
    if not hasattr(self, "partial"):
        self.partial = 0
    self.partial += 1
    k = 0
    plist = []
    acc = 0
    tenlist = []
    reduced = []
    for p, g in autograd.backward(loss):
        # every parameter is updated locally
        self.opt.update(p, g)
        # then do the partial parameter synchronization
        if p.size() > threshold:
            # larger than threshold -> reduced directly
            # k is the partition number of the full gradient set
            k += 1
            if (k == self.partial):
                self.all_reduce(p.data)
                reduced.append(p)
        else:
            # smaller than threshold -> accumulate
            plist.append(p.data)
            tenlist.append(p)
            acc += p.size()
            if (acc > threshold):
                k += 1
                if (k == self.partial):
                    self.fused_all_reduce(plist)
                    reduced = tenlist
                acc = 0
                plist = []
                tenlist = []
    if plist:
        k += 1
        if (k == self.partial):
            self.fused_all_reduce(plist)
            reduced = tenlist
    self.wait()
    # the all-reduced parameters need to be averaged
    for r in reduced:
        r /= self.world_size
    # the counter returns to zero after a cycle of partial update
    if (k == self.partial):
        self.partial = 0
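# A hedged usage sketch for the partial update above: each call all-reduces
# only one parameter partition, so the full parameter set is synchronized once
# every few iterations. Model, data tensors, and the DistOpt instance are
# assumed to exist as in the training loops in this file.
for b in range(num_train_batch):
    out = model(tx)
    loss = autograd.softmax_cross_entropy(out, ty)
    # local SGD step on every parameter, plus all-reduce of one partition
    sgd.backward_and_partial_update(loss, threshold=2097152)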
print('train_label_shape:', label.shape)

inputs = Tensor(data=data)
target = Tensor(data=label)

w0 = Tensor(shape=(2, 3), requires_grad=True, stores_grad=True)
w0.gaussian(0.0, 0.1)
b0 = Tensor(shape=(1, 3), requires_grad=True, stores_grad=True)
b0.set_value(0.0)

w1 = Tensor(shape=(3, 2), requires_grad=True, stores_grad=True)
w1.gaussian(0.0, 0.1)
b1 = Tensor(shape=(1, 2), requires_grad=True, stores_grad=True)
b1.set_value(0.0)

sgd = optimizer.SGD(0.05)

# training process
for i in range(1001):
    x = autograd.matmul(inputs, w0)
    x = autograd.add_bias(x, b0)
    x = autograd.relu(x)
    x = autograd.matmul(x, w1)
    x = autograd.add_bias(x, b1)
    x = autograd.softmax(x)
    loss = autograd.cross_entropy(x, target)
    for p, gp in autograd.backward(loss):
        sgd.apply(0, gp, p, '')

    if (i % 100 == 0):
        print('training loss = ', tensor.to_numpy(loss)[0])
def call(self, loss):
    for p, g in autograd.backward(loss):
        if p.name is None:
            p.name = id(p)
        self.apply(p.name, p, g)
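# Hedged usage sketch: this entry point replaces the explicit backward/update
# loop, so a training step becomes a single call on the optimizer instance
# (whether it is invoked as sgd.call(loss) or wired up through __call__ depends
# on how the optimizer class binds it, which is not shown here):
#
#     loss = autograd.softmax_cross_entropy(out, ty)
#     sgd.call(loss)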