def lstm_forward():
    hs, _, _ = rnn(inputs, (h0, c0))

    loss = autograd.softmax_cross_entropy(hs[0], target[0])
    for i in range(1, len(hs)):
        l = autograd.softmax_cross_entropy(hs[i], target[i])
        loss = autograd.add(loss, l)
    return loss
def vanilla_rnn_forward():
    hs, _ = rnn(inputs, h0)

    loss = autograd.softmax_cross_entropy(hs[0], target[0])
    for i in range(1, len(hs)):
        l = autograd.softmax_cross_entropy(hs[i], target[i])
        loss = autograd.add(loss, l)
    # grads = autograd.gradients(loss)
    return loss
def test_lstm_model(self, dev=gpu_dev):
    hidden_size = 3
    seq_length = 2
    batch_size = 4
    feature_size = 3
    bidirectional = False
    directions = 2 if bidirectional else 1
    num_layers = 2
    out_size = hidden_size
    return_sequences = False
    batch_first = True
    rnn_mode = "lstm"

    # manual test case
    x_data = np.array(
        [[[0, 0, 1], [0, 1, 0]], [[0, 1, 0], [1, 0, 0]],
         [[0, 0, 1], [0, 1, 0]], [[1, 0, 0], [0, 0, 1]]],
        dtype=np.float32).reshape(batch_size, seq_length,
                                  hidden_size)  # bs, seq, fea

    if return_sequences:
        y_data = np.array(
            [[[0, 1, 0], [1, 0, 0]], [[1, 0, 0], [0, 0, 1]],
             [[0, 1, 0], [1, 0, 0]], [[0, 0, 1], [0, 1, 0]]],
            dtype=np.float32).reshape(batch_size, seq_length,
                                      hidden_size)  # bs, seq, hidden
        y_data.reshape(batch_size, -1)
    else:
        y_data = np.array(
            [[1, 0, 0], [0, 0, 1], [1, 0, 0], [0, 1, 0]],
            dtype=np.float32).reshape(batch_size, hidden_size)  # bs, hidden

    x = tensor.Tensor(device=dev, data=x_data)
    y_t = tensor.Tensor(device=dev, data=y_data)

    m = LSTMModel(hidden_size, seq_length, batch_size, bidirectional,
                  num_layers, return_sequences, rnn_mode, batch_first)
    m.compile([x], is_train=True, use_graph=False, sequential=False)

    m.train()
    for i in range(1000):
        y = m.forward(x)
        assert y.shape == y_t.shape
        loss = autograd.softmax_cross_entropy(y, y_t)
        if i % 100 == 0:
            print("loss", loss)
        m.optimizer(loss)

    m.eval()
    y = m.forward(x)
    loss = autograd.softmax_cross_entropy(y, y_t)
    print("eval loss", loss)
def test_vanillaRNN_gpu_tiny_ops_shape_check(self):
    # gradients shape check.
    inputs, target, h0 = prepare_inputs_targets_for_rnn_test()
    rnn = autograd.RNN(3, 2)

    hs, _ = rnn(inputs, h0)

    loss = autograd.softmax_cross_entropy(hs[0], target[0])
    for i in range(1, len(hs)):
        l = autograd.softmax_cross_entropy(hs[i], target[i])
        loss = autograd.add(loss, l)
    # d = autograd.infer_dependency(loss.creator)
    # print(d)
    for t, dt in autograd.backward(loss):
        self.check_shape(t.shape, dt.shape)
def transfer_learning(sg_ir,
                      x,
                      y,
                      epochs=1,
                      batch_size=64,
                      dev=device.get_default_device()):
    batch_number = x.shape[0] // batch_size

    trans_model = Trans(sg_ir, -1)

    for i in range(epochs):
        for b in range(batch_number):
            l_idx = b * batch_size
            r_idx = (b + 1) * batch_size

            x_batch = tensor.Tensor(device=dev, data=x[l_idx:r_idx])
            target_batch = tensor.Tensor(device=dev, data=y[l_idx:r_idx])
            output_batch = trans_model.forward(x_batch)

            loss = autograd.softmax_cross_entropy(output_batch, target_batch)
            accuracy_rate = accuracy(tensor.to_numpy(output_batch),
                                     tensor.to_numpy(target_batch))

            sgd = opt.SGD(lr=0.07)
            for p, gp in autograd.backward(loss):
                sgd.update(p, gp)
            sgd.step()

            if b % 1e2 == 0:
                print("acc %6.2f, loss %6.2f" %
                      (accuracy_rate, tensor.to_numpy(loss)[0]))
    print("transfer-learning completed")
    return trans_model
def onnx_to_singa(niter, use_cpu=False):
    if use_cpu:
        print("Using CPU")
        dev = device.get_default_device()
    else:
        print("Using GPU")
        dev = device.create_cuda_gpu()

    model = sonnx.load("mlp.onnx")
    backend = sonnx.prepare(model, device=dev)
    sgd = opt.SGD(0.1)

    inputs = Tensor(
        data=data,
        device=dev,
        requires_grad=False,
        stores_grad=False,
        name="input",
    )
    target = Tensor(
        data=label,
        device=dev,
        requires_grad=False,
        stores_grad=False,
        name="target",
    )

    for i in range(100):
        y = backend.run([inputs])[0]
        loss = autograd.softmax_cross_entropy(y, target)
        for p, gp in autograd.backward(loss):
            sgd.update(p, gp)
        loss_rate = tensor.to_numpy(loss)[0]
        accuracy_rate = accuracy(tensor.to_numpy(y), label)

        print("Iter {}, accuracy={}, loss={}".format(i, accuracy_rate,
                                                     loss_rate))
def train(model,
          x,
          y,
          epochs=1,
          batch_size=64,
          dev=device.get_default_device()):
    batch_number = x.shape[0] // batch_size

    for i in range(epochs):
        for b in range(batch_number):
            l_idx = b * batch_size
            r_idx = (b + 1) * batch_size

            x_batch = tensor.Tensor(device=dev, data=x[l_idx:r_idx])
            target_batch = tensor.Tensor(device=dev, data=y[l_idx:r_idx])
            output_batch = model.forward(x_batch)
            # onnx_model = sonnx.to_onnx([x_batch], [y])
            # print('The model is:\n{}'.format(onnx_model))

            loss = autograd.softmax_cross_entropy(output_batch, target_batch)
            accuracy_rate = accuracy(tensor.to_numpy(output_batch),
                                     tensor.to_numpy(target_batch))

            sgd = opt.SGD(lr=0.001)
            for p, gp in autograd.backward(loss):
                sgd.update(p, gp)
            sgd.step()

            if b % 1e2 == 0:
                print("acc %6.2f, loss %6.2f" %
                      (accuracy_rate, tensor.to_numpy(loss)[0]))
    print("training completed")
    return x_batch, output_batch
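Several of the snippets above and below call an accuracy(pred, target) helper whose definition is not reproduced here. A minimal sketch of what such a helper likely does, assuming the targets are one-hot encoded (this is an inferred, hypothetical implementation, not the one from the original scripts):

import numpy as np

def accuracy(pred, target):
    # Fraction of rows whose argmax prediction matches the argmax of the
    # (one-hot) target; returns a NumPy scalar so callers may .astype() it.
    y = np.argmax(pred, axis=1)
    t = np.argmax(target, axis=1)
    return (y == t).sum() / float(len(t))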
def test_LSTM_gpu_tiny_ops_shape_check(self):
    # gradients shape check.
    inputs, target, h0 = prepare_inputs_targets_for_rnn_test()
    c_0 = np.random.random((2, 1)).astype(np.float32)
    c0 = tensor.Tensor(device=gpu_dev, data=c_0)

    rnn = autograd.LSTM(3, 2)

    hs, _, _ = rnn(inputs, (h0, c0))

    loss = autograd.softmax_cross_entropy(hs[0], target[0])
    for i in range(1, len(hs)):
        l = autograd.softmax_cross_entropy(hs[i], target[i])
        loss = autograd.add(loss, l)
    # d = autograd.infer_dependency(loss.creator)
    # print(d)
    for t, dt in autograd.backward(loss):
        self.check_shape(t.shape, dt.shape)
def forward(x, t):
    y = conv1(x)
    y = autograd.relu(y)
    y = conv2(y)
    y = autograd.relu(y)
    y = pooling(y)
    y = autograd.flatten(y)
    y = linear(y)
    loss = autograd.softmax_cross_entropy(y, t)
    return loss, y
def forward(x, t):
    y = conv1(x)
    y = autograd.relu(y)
    y1 = conv21(y)
    y2 = conv22(y)
    y = autograd.cat((y1, y2), 1)
    y = autograd.relu(y)
    y = autograd.flatten(y)
    y = linear(y)
    loss = autograd.softmax_cross_entropy(y, t)
    return loss, y
def evaluate(model, data, batch_size, seq_length, dev, inputs, labels):
    model.eval()
    val_loss = 0.0
    for b in range(data.num_test_batch):
        batch = data.val_dat[b * batch_size:(b + 1) * batch_size]
        inputs, labels = convert(batch, batch_size, seq_length,
                                 data.vocab_size, dev, inputs, labels)
        model.reset_states(dev)
        y = model(inputs)
        loss = autograd.softmax_cross_entropy(y, labels)[0]
        val_loss += tensor.to_numpy(loss)[0]
    print('    validation loss is %f' %
          (val_loss / data.num_test_batch / seq_length))
def forward(x, t):
    y = conv1(x)
    y = autograd.relu(y)
    y = bn1(y)
    y = pooling1(y)

    y1 = conv21(y)
    y2 = conv22(y)
    y = autograd.cat((y1, y2), 1)
    y = bn2(y)
    y = autograd.relu(y)
    y = bn2(y)
    y = pooling2(y)

    y = autograd.flatten(y)
    y = linear(y)
    loss = autograd.softmax_cross_entropy(y, t)
    return loss, y
def singa_to_onnx(niter, use_cpu=False):
    if use_cpu:
        print("Using CPU")
        dev = device.get_default_device()
    else:
        print("Using GPU")
        dev = device.create_cuda_gpu()

    inputs = Tensor(
        data=data,
        device=dev,
        requires_grad=False,
        stores_grad=False,
        name="input",
    )
    target = Tensor(
        data=label,
        device=dev,
        requires_grad=False,
        stores_grad=False,
        name="target",
    )

    w0 = Tensor(shape=(2, 3), device=dev, requires_grad=True, stores_grad=True)
    w0.gaussian(0.0, 0.1)
    b0 = Tensor(shape=(3,), device=dev, requires_grad=True, stores_grad=True)
    b0.set_value(0.0)

    w1 = Tensor(shape=(3, 2), device=dev, requires_grad=True, stores_grad=True)
    w1.gaussian(0.0, 0.1)
    b1 = Tensor(shape=(2,), device=dev, requires_grad=True, stores_grad=True)
    b1.set_value(0.0)

    sgd = opt.SGD(0.1)

    # training process
    for i in range(100):
        x = autograd.matmul(inputs, w0)
        x = autograd.add_bias(x, b0)
        x = autograd.relu(x)
        x = autograd.matmul(x, w1)
        x = autograd.add_bias(x, b1)
        loss = autograd.softmax_cross_entropy(x, target)
        for p, gp in autograd.backward(loss):
            sgd.update(p, gp)
        print("training loss = ", tensor.to_numpy(loss)[0])

    sonnx.export([inputs], [x], file_path="mlp.onnx")
def train_one_batch(self, x, y, dist_option, spars):
    out = self.forward(x)
    loss = autograd.softmax_cross_entropy(out, y)
    if dist_option == 'fp32':
        self.optimizer.backward_and_update(loss)
    elif dist_option == 'fp16':
        self.optimizer.backward_and_update_half(loss)
    elif dist_option == 'partialUpdate':
        self.optimizer.backward_and_partial_update(loss)
    elif dist_option == 'sparseTopK':
        self.optimizer.backward_and_sparse_update(loss,
                                                  topK=True,
                                                  spars=spars)
    elif dist_option == 'sparseThreshold':
        self.optimizer.backward_and_sparse_update(loss,
                                                  topK=False,
                                                  spars=spars)
    return out, loss
def forward(x, t):
    y = conv1(x)
    y = autograd.tanh(y)

    y1 = conv21(y)
    y2 = conv22(y)
    y = autograd.cat((y1, y2), 1)
    y = autograd.sigmoid(y)
    y = bn(y)
    y = autograd.relu(y)
    y = autograd.mul(y, y)
    y = pooling1(y)
    y = autograd.sigmoid(y)
    y = pooling2(y)
    print(tensor.to_numpy(y).shape)

    y = autograd.flatten(y)
    y = linear(y)
    print(tensor.to_numpy(y).shape)
    loss = autograd.softmax_cross_entropy(y, t)
    return loss, y
def onnx_to_singa(epochs, use_cpu=False, batchsize=32):
    (x_train, y_train), (x_test, y_test), dev = common(use_cpu)
    model = sonnx.load("cnn.onnx")
    backend = sonnx.prepare(model, dev)
    autograd.training = True
    sgd = opt.SGD(lr=0.01)
    niter = x_train.shape[0] // batchsize

    for epoch in range(epochs):
        accuracy_rate = 0.0
        loss_rate = 0.0
        for i in range(niter):
            inputs = tensor.Tensor(
                device=dev,
                data=x_train[i * batchsize:(i + 1) * batchsize],
                stores_grad=False,
                name="input",
            )
            targets = tensor.Tensor(
                device=dev,
                data=y_train[i * batchsize:(i + 1) * batchsize],
                requires_grad=False,
                stores_grad=False,
                name="target",
            )
            y = backend.run([inputs])[0]
            loss = autograd.softmax_cross_entropy(y, targets)

            accuracy_rate += accuracy(
                tensor.to_numpy(y),
                y_train[i * batchsize:(i + 1) * batchsize])
            loss_rate += tensor.to_numpy(loss)[0]

            for p, gp in autograd.backward(loss):
                sgd.update(p, gp)
        print("accuracy is {}, loss is {}".format(accuracy_rate / niter,
                                                  loss_rate / niter))
import time

dev.Sync()
start = time.time()
fd = 0
softmax = 0
update = 0
with trange(niters) as t:
    for _ in t:
        dev.Sync()
        tick = time.time()
        x = model(tx)
        dev.Sync()
        fd += time.time() - tick

        tick = time.time()
        loss = autograd.softmax_cross_entropy(x, ty)
        dev.Sync()
        softmax += time.time() - tick

        plist = []
        for p, g in autograd.backward(loss):
            # dev.Sync()  # this Sync affects the concurrency and hence is omitted
            tick = time.time()
            sgd.all_reduce(g)
            # dev.Sync()  # this Sync affects the concurrency and hence is omitted
            update += time.time() - tick
            plist.append((p, g))
        sgd.wait()
        for p, g in plist:
            sgd.update(p, g)
        dev.Sync()
def train_one_batch(self, x, y):
    out = self.forward(x)
    loss = autograd.softmax_cross_entropy(out, y)
    self.optimizer(loss)
    return out, loss
def train_mnist_cnn(DIST=False,
                    local_rank=None,
                    world_size=None,
                    nccl_id=None,
                    spars=0,
                    topK=False,
                    corr=True):

    # Define the hyperparameters suitable for mnist_cnn
    max_epoch = 10
    batch_size = 64
    sgd = opt.SGD(lr=0.005, momentum=0.9, weight_decay=1e-5)

    # Prepare training and validation data
    train_x, train_y, test_x, test_y = load_dataset()
    IMG_SIZE = 28
    num_classes = 10
    train_y = to_categorical(train_y, num_classes)
    test_y = to_categorical(test_y, num_classes)

    # Normalization
    train_x = train_x / 255
    test_x = test_x / 255

    if DIST:
        # For Distributed GPU Training
        sgd = opt.DistOpt(sgd,
                          nccl_id=nccl_id,
                          local_rank=local_rank,
                          world_size=world_size)
        dev = device.create_cuda_gpu_on(sgd.local_rank)
        # Dataset partition for distributed training
        train_x, train_y = data_partition(train_x, train_y, sgd.global_rank,
                                          sgd.world_size)
        test_x, test_y = data_partition(test_x, test_y, sgd.global_rank,
                                        sgd.world_size)
        world_size = sgd.world_size
    else:
        # For Single GPU
        dev = device.create_cuda_gpu()
        world_size = 1

    # create model
    model = CNN()

    tx = tensor.Tensor((batch_size, 1, IMG_SIZE, IMG_SIZE), dev, tensor.float32)
    ty = tensor.Tensor((batch_size, num_classes), dev, tensor.int32)
    num_train_batch = train_x.shape[0] // batch_size
    num_test_batch = test_x.shape[0] // batch_size
    idx = np.arange(train_x.shape[0], dtype=np.int32)

    if DIST:
        # Synchronize the initial parameters
        autograd.training = True
        x = np.random.randn(batch_size, 1, IMG_SIZE, IMG_SIZE).astype(np.float32)
        y = np.zeros(shape=(batch_size, num_classes), dtype=np.int32)
        tx.copy_from_numpy(x)
        ty.copy_from_numpy(y)
        out = model.forward(tx)
        loss = autograd.softmax_cross_entropy(out, ty)
        for p, g in autograd.backward(loss):
            synchronize(p, sgd)

    # Training and Evaluation Loop
    for epoch in range(max_epoch):
        start_time = time.time()
        np.random.shuffle(idx)

        if ((DIST == False) or (sgd.global_rank == 0)):
            print('Starting Epoch %d:' % (epoch))

        # Training Phase
        autograd.training = True
        train_correct = np.zeros(shape=[1], dtype=np.float32)
        test_correct = np.zeros(shape=[1], dtype=np.float32)
        train_loss = np.zeros(shape=[1], dtype=np.float32)

        for b in range(num_train_batch):
            x = train_x[idx[b * batch_size:(b + 1) * batch_size]]
            x = augmentation(x, batch_size)
            y = train_y[idx[b * batch_size:(b + 1) * batch_size]]
            tx.copy_from_numpy(x)
            ty.copy_from_numpy(y)
            out = model.forward(tx)
            loss = autograd.softmax_cross_entropy(out, ty)
            train_correct += accuracy(tensor.to_numpy(out), y)
            train_loss += tensor.to_numpy(loss)[0]
            if DIST:
                if (spars == 0):
                    sgd.backward_and_update(loss, threshold=50000)
                else:
                    sgd.backward_and_sparse_update(loss,
                                                   spars=spars,
                                                   topK=topK,
                                                   corr=corr)
            else:
                sgd.backward_and_update(loss)

        if DIST:
            # Reduce the Evaluation Accuracy and Loss from Multiple Devices
            reducer = tensor.Tensor((1,), dev, tensor.float32)
            train_correct = reduce_variable(train_correct, sgd, reducer)
            train_loss = reduce_variable(train_loss, sgd, reducer)

        # Output the Training Loss and Accuracy
        if ((DIST == False) or (sgd.global_rank == 0)):
            print('Training loss = %f, training accuracy = %f' %
                  (train_loss, train_correct /
                   (num_train_batch * batch_size * world_size)),
                  flush=True)

        # Evaluation Phase
        autograd.training = False
        for b in range(num_test_batch):
            x = test_x[b * batch_size:(b + 1) * batch_size]
            y = test_y[b * batch_size:(b + 1) * batch_size]
            tx.copy_from_numpy(x)
            ty.copy_from_numpy(y)
            out_test = model.forward(tx)
            test_correct += accuracy(tensor.to_numpy(out_test), y)

        if DIST:
            # Reduce the Evaluation Accuracy from Multiple Devices
            test_correct = reduce_variable(test_correct, sgd, reducer)

        # Output the Evaluation Accuracy
        if ((DIST == False) or (sgd.global_rank == 0)):
            print('Evaluation accuracy = %f, Elapsed Time = %fs' %
                  (test_correct / (num_test_batch * batch_size * world_size),
                   time.time() - start_time),
                  flush=True)
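The MNIST and CIFAR training scripts in this section call a to_categorical(labels, num_classes) helper that is not shown here. A minimal sketch of the usual one-hot encoding it performs, assuming integer class labels (hypothetical reconstruction, not the original definition):

import numpy as np

def to_categorical(y, num_classes):
    # Hypothetical sketch: map integer labels to one-hot float32 rows,
    # matching how train_y/test_y feed softmax_cross_entropy above.
    y = np.asarray(y, dtype="int")
    out = np.zeros((len(y), num_classes), dtype=np.float32)
    out[np.arange(len(y)), y] = 1.0
    return out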
def train_mnist_cnn(DIST=False,
                    local_rank=None,
                    world_size=None,
                    nccl_id=None,
                    spars=0,
                    topK=False,
                    corr=True):

    # Define the hyperparameters suitable for mnist_cnn
    max_epoch = 10
    batch_size = 128
    sgd = opt.SGD(lr=0.005, momentum=0.9, weight_decay=1e-5)

    # Prepare training and validation data
    train_x, train_y, test_x, test_y = load_dataset()
    IMG_SIZE = 28
    num_classes = 10
    train_y = to_categorical(train_y, num_classes)
    test_y = to_categorical(test_y, num_classes)

    # Normalization
    train_x = train_x / 255
    test_x = test_x / 255

    if DIST:
        # For Distributed GPU Training
        '''
        sgd = opt.DistOpt(sgd,
                          nccl_id=nccl_id,
                          local_rank=local_rank,
                          world_size=world_size)
        '''
        dev = device.get_default_device()
        # create kvstore
        kv_type = 'dist_sync'  # set synchronization mode
        lr = 0.005
        kv = singa_kvstore.create_kvstore(kv_type,
                                          'SingaSGD',
                                          lr=0.005,
                                          momentum=0.9,
                                          weight_decay=1e-5)
        global_rank = kv.rank
        world_size = kv.num_workers
        # Dataset partition for distributed training
        train_x, train_y = data_partition(train_x, train_y, global_rank,
                                          world_size)
        test_x, test_y = data_partition(test_x, test_y, global_rank,
                                        world_size)

    # create model
    model = CNN()
    '''
    num_channels = train_x.shape[1]
    image_size = train_x.shape[2]
    data_size = np.prod(train_x.shape[1:train_x.ndim]).item()
    num_classes = (np.max(train_y) + 1).item()
    model = resnet.resnet18(num_channels=1, num_classes=num_classes)
    '''

    tx = tensor.Tensor((batch_size, 1, IMG_SIZE, IMG_SIZE), dev, tensor.float32)
    ty = tensor.Tensor((batch_size, num_classes), dev, tensor.int32)
    num_train_batch = train_x.shape[0] // batch_size
    num_test_batch = test_x.shape[0] // batch_size
    idx = np.arange(train_x.shape[0], dtype=np.int32)

    '''
    if DIST:
        # Initial a batch to help obtain model parameters
        autograd.training = True
        x = np.random.randn(batch_size, 1, IMG_SIZE, IMG_SIZE).astype(np.float32)
        y = np.zeros(shape=(batch_size, num_classes), dtype=np.int32)
        tx.copy_from_numpy(x)
        ty.copy_from_numpy(y)
        out = model.forward(tx)
        loss = autograd.softmax_cross_entropy(out, ty)
        # Initial kv store for workers of ps-architecture
        key = 0
        for p, g in autograd.backward(loss):
            kv.init(key, mx.nd.array(tensor.to_numpy(p)))
            key += 1
    '''

    # Training and Evaluation Loop
    for epoch in range(max_epoch):
        start_time = time.time()
        np.random.shuffle(idx)

        if (DIST == True):
            print('^_^Starting Epoch %d:' % (epoch))

        # Training Phase
        autograd.training = True
        train_correct = np.zeros(shape=[1], dtype=np.float32)
        test_correct = np.zeros(shape=[1], dtype=np.float32)
        train_loss = np.zeros(shape=[1], dtype=np.float32)
        time_start = time.time()

        for b in range(num_train_batch):
            x = train_x[idx[b * batch_size:(b + 1) * batch_size]]
            x = augmentation(x, batch_size)
            y = train_y[idx[b * batch_size:(b + 1) * batch_size]]
            tx.copy_from_numpy(x)
            ty.copy_from_numpy(y)
            out = model.forward(tx)
            loss = autograd.softmax_cross_entropy(out, ty)
            train_correct += accuracy(tensor.to_numpy(out), y)
            train_loss += tensor.to_numpy(loss)[0]
            singa_kvstore.backward_and_update(kv, loss)
            '''
            if DIST:
                # push
                kv_pairs = []
                key = 0
                for p, g in autograd.backward(loss):
                    kv.push(key, mx.nd.array(tensor.to_numpy(g)))
                    kv_pairs.append((key, p, g))
                    key += 1
                # pull
                for key, p, g in kv_pairs:
                    out_buf = mx.nd.zeros(p.shape)
                    kv.pull(key, out=out_buf)
                    p.copy_from_numpy(out_buf.asnumpy())
            '''

            # Evaluation Phase
            if b % 20 != 0:
                continue
            autograd.training = False
            num_test_batch_inside = 20
            test_correct = 0
            for b in range(num_test_batch_inside):
                x = test_x[b * batch_size:(b + 1) * batch_size]
                y = test_y[b * batch_size:(b + 1) * batch_size]
                tx.copy_from_numpy(x)
                ty.copy_from_numpy(y)
                out_test = model.forward(tx)
                test_correct += accuracy(tensor.to_numpy(out_test), y)
            print('Evaluation accuracy = %f' %
                  (test_correct / (batch_size * num_test_batch_inside)),
                  flush=True)
            autograd.training = True
        print('epoch time is %f' % (time.time() - time_start))
def train_mnist_cnn(sgd,
                    max_epoch,
                    batch_size,
                    DIST=False,
                    data_partition=None,
                    gpu_num=None,
                    gpu_per_node=None,
                    nccl_id=None):

    # Prepare training and validation data
    train_x, train_y, test_x, test_y = load_dataset()
    IMG_SIZE = 28
    num_classes = 10
    train_y = to_categorical(train_y, num_classes)
    test_y = to_categorical(test_y, num_classes)

    # Normalization
    train_x = train_x / 255
    test_x = test_x / 255

    if DIST:
        # For Distributed GPU Training
        sgd = opt.DistOpt(sgd,
                          nccl_id=nccl_id,
                          gpu_num=gpu_num,
                          gpu_per_node=gpu_per_node)
        dev = device.create_cuda_gpu_on(sgd.rank_in_local)
        # Dataset partition for distributed training
        train_x, train_y = data_partition(train_x, train_y, sgd.rank_in_global,
                                          sgd.world_size)
        test_x, test_y = data_partition(test_x, test_y, sgd.rank_in_global,
                                        sgd.world_size)
        world_size = sgd.world_size
    else:
        # For Single GPU
        dev = device.create_cuda_gpu()
        world_size = 1

    # create model
    model = CNN()

    tx = tensor.Tensor((batch_size, 1, IMG_SIZE, IMG_SIZE), dev, tensor.float32)
    ty = tensor.Tensor((batch_size, num_classes), dev, tensor.int32)
    num_train_batch = train_x.shape[0] // batch_size
    num_test_batch = test_x.shape[0] // batch_size
    idx = np.arange(train_x.shape[0], dtype=np.int32)

    if DIST:
        # Synchronize the initial parameters
        autograd.training = True
        x = np.random.randn(batch_size, 1, IMG_SIZE, IMG_SIZE).astype(np.float32)
        y = np.zeros(shape=(batch_size, num_classes), dtype=np.int32)
        tx.copy_from_numpy(x)
        ty.copy_from_numpy(y)
        out = model.forward(tx)
        loss = autograd.softmax_cross_entropy(out, ty)
        for p, g in autograd.backward(loss):
            sychronize(p, sgd)

    # Training and Evaluation Loop
    for epoch in range(max_epoch):
        start_time = time.time()
        np.random.shuffle(idx)

        if ((DIST == False) or (sgd.rank_in_global == 0)):
            print('Starting Epoch %d:' % (epoch))

        # Training Phase
        autograd.training = True
        train_correct = np.zeros(shape=[1], dtype=np.float32)
        test_correct = np.zeros(shape=[1], dtype=np.float32)
        train_loss = np.zeros(shape=[1], dtype=np.float32)

        for b in range(num_train_batch):
            x = train_x[idx[b * batch_size:(b + 1) * batch_size]]
            x = augmentation(x, batch_size)
            y = train_y[idx[b * batch_size:(b + 1) * batch_size]]
            tx.copy_from_numpy(x)
            ty.copy_from_numpy(y)
            out = model.forward(tx)
            loss = autograd.softmax_cross_entropy(out, ty)
            train_correct += accuracy(tensor.to_numpy(out), y)
            train_loss += tensor.to_numpy(loss)[0]
            plist = []
            for p, g in autograd.backward(loss):
                if DIST:
                    sgd.all_reduce(g)
                plist.append((p, g))
            if DIST:
                sgd.wait()
            for p, g in plist:
                sgd.update(p, g)

        if DIST:
            # Reduce the Evaluation Accuracy and Loss from Multiple Devices
            reducer = tensor.Tensor((1,), dev, tensor.float32)
            train_correct = reduce_variable(train_correct, sgd, reducer)
            train_loss = reduce_variable(train_loss, sgd, reducer)

        # Output the Training Loss and Accuracy
        if ((DIST == False) or (sgd.rank_in_global == 0)):
            print('Training loss = %f, training accuracy = %f' %
                  (train_loss, train_correct /
                   (num_train_batch * batch_size * world_size)),
                  flush=True)

        # Evaluation Phase
        autograd.training = False
        for b in range(num_test_batch):
            x = test_x[b * batch_size:(b + 1) * batch_size]
            y = test_y[b * batch_size:(b + 1) * batch_size]
            tx.copy_from_numpy(x)
            ty.copy_from_numpy(y)
            out_test = model.forward(tx)
            test_correct += accuracy(tensor.to_numpy(out_test), y)

        if DIST:
            # Reduce the Evaluation Accuracy from Multiple Devices
            test_correct = reduce_variable(test_correct, sgd, reducer)

        # Output the Evaluation Accuracy
        if ((DIST == False) or (sgd.rank_in_global == 0)):
            print('Evaluation accuracy = %f, Elapsed Time = %fs' %
                  (test_correct / (num_test_batch * batch_size * world_size),
                   time.time() - start_time),
                  flush=True)
epochs = 1
sgd = opt.SGD(lr=0.00)

x_train = preprocess(train[0])
y_train = to_categorical(train[1], num_classes)

x_test = preprocess(test[0])
y_test = to_categorical(test[1], num_classes)
print('the shape of training data is', x_train.shape)
print('the shape of training label is', y_train.shape)
print('the shape of testing data is', x_test.shape)
print('the shape of testing label is', y_test.shape)

model = onnx.load('cnn.onnx')
rep = sonnx.prepare(model, dev)
print('finish init')

autograd.training = True
# training process
for epoch in range(1):
    inputs = tensor.Tensor(device=dev, data=x_train[0:100], stores_grad=False)
    targets = tensor.Tensor(device=dev,
                            data=y_train[0:100],
                            requires_grad=False,
                            stores_grad=False)
    y0 = rep.run([inputs])[0]
    loss = autograd.softmax_cross_entropy(y0, targets)
    print('outputs', tensor.to_numpy(loss)[0])
def loss(out, y):
    return autograd.softmax_cross_entropy(out, y)
    return x


if __name__ == '__main__':
    model = Xception(num_classes=1000)
    print('Start initialization............')
    dev = device.create_cuda_gpu_on(0)
    # dev = device.create_cuda_gpu()

    niters = 20
    batch_size = 16
    IMG_SIZE = 299

    sgd = opt.SGD(lr=0.1, momentum=0.9, weight_decay=1e-5)
    tx = tensor.Tensor((batch_size, 3, IMG_SIZE, IMG_SIZE), dev)
    ty = tensor.Tensor((batch_size,), dev, tensor.int32)
    autograd.training = True
    x = np.random.randn(batch_size, 3, IMG_SIZE, IMG_SIZE).astype(np.float32)
    y = np.random.randint(0, 1000, batch_size, dtype=np.int32)
    tx.copy_from_numpy(x)
    ty.copy_from_numpy(y)

    with trange(niters) as t:
        for b in t:
            x = model(tx)
            loss = autograd.softmax_cross_entropy(x, ty)
            for p, g in autograd.backward(loss):
                # print(p.shape, g.shape)
                sgd.update(p, g)
                # pass
def loss(self, out, ty):
    return autograd.softmax_cross_entropy(out, ty)
def loss(self, out, ty):
    ty = autograd.reshape(ty, (-1, 1))
    return autograd.softmax_cross_entropy(out, ty)
def train_cifar10(sgd,
                  max_epoch,
                  batch_size,
                  DIST=False,
                  data_partition=None,
                  gpu_num=None,
                  gpu_per_node=None,
                  nccl_id=None,
                  partial_update=False):
    train_x, train_y = load_train_data()
    test_x, test_y = load_test_data()
    train_x, test_x = normalize_for_resnet(train_x, test_x)
    IMG_SIZE = 224
    num_classes = 10

    if DIST:
        # For Distributed GPU Training
        sgd = opt.DistOpt(sgd,
                          nccl_id=nccl_id,
                          gpu_num=gpu_num,
                          gpu_per_node=gpu_per_node)
        dev = device.create_cuda_gpu_on(sgd.rank_in_local)
        # Dataset partition for distributed training
        train_x, train_y = data_partition(train_x, train_y, sgd.rank_in_global,
                                          sgd.world_size)
        test_x, test_y = data_partition(test_x, test_y, sgd.rank_in_global,
                                        sgd.world_size)
        world_size = sgd.world_size
    else:
        # For Single GPU
        dev = device.create_cuda_gpu()
        world_size = 1

    from resnet import resnet50
    model = resnet50(num_classes=num_classes)

    tx = tensor.Tensor((batch_size, 3, IMG_SIZE, IMG_SIZE), dev, tensor.float32)
    ty = tensor.Tensor((batch_size,), dev, tensor.int32)
    num_train_batch = train_x.shape[0] // batch_size
    num_test_batch = test_x.shape[0] // batch_size
    idx = np.arange(train_x.shape[0], dtype=np.int32)

    if DIST:
        # Synchronize the initial parameters
        autograd.training = True
        x = np.random.randn(batch_size, 3, IMG_SIZE, IMG_SIZE).astype(np.float32)
        y = np.zeros(shape=(batch_size,), dtype=np.int32)
        tx.copy_from_numpy(x)
        ty.copy_from_numpy(y)
        out = model(tx)
        loss = autograd.softmax_cross_entropy(out, ty)
        param = []
        for p, _ in autograd.backward(loss):
            sychronize(p, sgd)
            param.append(p)

    for epoch in range(max_epoch):
        start_time = time.time()
        np.random.shuffle(idx)

        if ((DIST == False) or (sgd.rank_in_global == 0)):
            print('Starting Epoch %d:' % (epoch))

        # Training Phase
        autograd.training = True
        train_correct = np.zeros(shape=[1], dtype=np.float32)
        test_correct = np.zeros(shape=[1], dtype=np.float32)
        train_loss = np.zeros(shape=[1], dtype=np.float32)

        for b in range(num_train_batch):
            x = train_x[idx[b * batch_size:(b + 1) * batch_size]]
            x = augmentation(x, batch_size)
            x = resize_dataset(x, IMG_SIZE)
            y = train_y[idx[b * batch_size:(b + 1) * batch_size]]
            tx.copy_from_numpy(x)
            ty.copy_from_numpy(y)
            out = model(tx)
            loss = autograd.softmax_cross_entropy(out, ty)
            train_correct += accuracy(
                tensor.to_numpy(out),
                to_categorical(y, num_classes)).astype(np.float32)
            train_loss += tensor.to_numpy(loss)[0]
            if not partial_update:
                sgd.backward_and_update(loss)
            else:
                sgd.backward_and_partial_update(loss)

        if DIST:
            # Reduce the Evaluation Accuracy and Loss from Multiple Devices
            reducer = tensor.Tensor((1,), dev, tensor.float32)
            train_correct = reduce_variable(train_correct, sgd, reducer)
            train_loss = reduce_variable(train_loss, sgd, reducer)

        # Output the Training Loss and Accuracy
        if ((DIST == False) or (sgd.rank_in_global == 0)):
            print('Training loss = %f, training accuracy = %f' %
                  (train_loss, train_correct /
                   (num_train_batch * batch_size * world_size)),
                  flush=True)

        if partial_update:
            # synchronize parameters before evaluation phase
            for p in param:
                sychronize(p, sgd)

        # Evaluation Phase
        autograd.training = False
        for b in range(num_test_batch):
            x = test_x[b * batch_size:(b + 1) * batch_size]
            x = resize_dataset(x, IMG_SIZE)
            y = test_y[b * batch_size:(b + 1) * batch_size]
            tx.copy_from_numpy(x)
            ty.copy_from_numpy(y)
            out_test = model(tx)
            test_correct += accuracy(tensor.to_numpy(out_test),
                                     to_categorical(y, num_classes))

        if DIST:
            # Reduce the Evaluation Accuracy from Multiple Devices
            test_correct = reduce_variable(test_correct, sgd, reducer)

        # Output the Evaluation Accuracy
        if ((DIST == False) or (sgd.rank_in_global == 0)):
            print('Evaluation accuracy = %f, Elapsed Time = %fs' %
                  (test_correct / (num_test_batch * batch_size * world_size),
                   time.time() - start_time),
                  flush=True)