def __init__(self, device_id=0, **knobs):
    """Record where the tiny_yolov2 ONNX model lives and create its GPU device.

    Args:
        device_id: index handed to ``device.create_cuda_gpu_on``.
        **knobs: forwarded unchanged to the parent constructor.
    """
    super().__init__(**knobs)

    # Path of the ONNX file inside the downloaded tar archive.
    self.model_path = 'tiny_yolov2/Model.onnx'
    # URL of the tar archive that contains the ONNX model.
    self.model_url = 'https://onnxzoo.blob.core.windows.net/models/opset_8/tiny_yolov2/tiny_yolov2.tar.gz'

    self.dev = device.create_cuda_gpu_on(device_id)
def train_resnet(DIST=True, graph=True, sequential=False, verbosity=0):
    """Time ResNet-50 training steps on a single random batch.

    One synthetic batch is copied to the device once and fed to the model
    for a fixed number of iterations; throughput and timing are printed by
    global rank 0.

    Args:
        DIST: when equal to ``True`` the SGD optimizer is wrapped in
            ``opt.DistOpt`` and ranks are read from it.
        graph: passed to ``model.compile`` as ``use_graph``.
        sequential: overwritten on both paths below (``True`` when
            distributed, ``False`` otherwise) before it is used.
        verbosity: passed to ``dev.SetVerbosity``.
    """
    # Benchmark hyper-parameters.
    IMG_SIZE = 224
    batch_size = 32
    niters = 100
    sgd = opt.SGD(lr=0.1, momentum=0.9, weight_decay=1e-5)

    # For distributed training, sequential has better throughput in the
    # current version.
    if DIST == True:
        sgd = opt.DistOpt(sgd)
        world_size, local_rank, global_rank = (sgd.world_size,
                                               sgd.local_rank,
                                               sgd.global_rank)
        sequential = True
    else:
        world_size, local_rank, global_rank = 1, 0, 0
        sequential = False

    dev = device.create_cuda_gpu_on(local_rank)

    # Device placeholders, filled once with random images and labels.
    image_shape = (batch_size, 3, IMG_SIZE, IMG_SIZE)
    tx = tensor.Tensor(image_shape, dev)
    ty = tensor.Tensor((batch_size,), dev, tensor.int32)
    images = np.random.randn(*image_shape).astype(np.float32)
    labels = np.random.randint(0, 1000, batch_size, dtype=np.int32)
    tx.copy_from_numpy(images)
    ty.copy_from_numpy(labels)

    dev.SetVerbosity(verbosity)
    dev.SetSkipIteration(5)

    # Build and compile the network.
    from model import resnet
    net = resnet.resnet50(num_channels=3, num_classes=1000)
    net.train()
    net.set_optimizer(sgd)
    net.compile([tx], is_train=True, use_graph=graph, sequential=sequential)

    # Timed training loop, bracketed by device syncs.
    dev.Sync()
    start = time.time()
    with trange(niters) as progress:
        for _ in progress:
            net(tx, ty, dist_option='fp32', spars=None)
    dev.Sync()
    end = time.time()

    titer = (end - start) / float(niters)
    throughput = float(niters * batch_size * world_size) / (end - start)

    if global_rank != 0:
        return
    print("Throughput = {} per second".format(throughput), flush=True)
    print("TotalTime={}".format(end - start), flush=True)
    print("Total={}".format(titer), flush=True)
    dev.PrintTimeProfiling()
def train(self):
    """Alternate discriminator and generator updates for ``self.iterations`` steps.

    Each step trains the discriminator on a random real batch and on a
    generated batch, then trains the generator. Every ``self.interval``
    iterations the model is switched to eval mode, an image is saved via
    ``self.save_image`` and both losses are logged.
    """
    train_data, _, _, _, _, _ = load_data(self.dataset_filepath)

    dev = device.create_cuda_gpu_on(0)
    dev.SetRandSeed(0)
    np.random.seed(0)

    optimizer_ = opt.Adam(lr=self.learning_rate)

    def _placeholder(width):
        # float32 device tensor with one row per sample in the batch.
        return tensor.Tensor((self.batch_size, width), dev, tensor.float32)

    noise = _placeholder(self.noise_size)
    real_images = _placeholder(self.feature_size)
    real_labels = _placeholder(1)
    fake_labels = _placeholder(1)

    # Attach the optimizer and compile the model (graph mode disabled).
    self.model.set_optimizer(optimizer_)
    self.model.compile([noise],
                       is_train=True,
                       use_graph=False,
                       sequential=True)

    real_labels.set_value(1.0)
    fake_labels.set_value(0.0)

    for iteration in range(self.iterations):
        batch_rows = np.random.randint(0, train_data.shape[0],
                                       self.batch_size)
        real_images.copy_from_numpy(train_data[batch_rows])

        self.model.train()

        # Discriminator step: real batch first, then a generated batch.
        _, loss_on_real = self.model.train_one_batch_dis(
            real_images, real_labels)
        noise.uniform(-1, 1)
        generated = self.model.forward_gen(noise)
        _, loss_on_fake = self.model.train_one_batch_dis(
            generated, fake_labels)
        d_loss = (tensor.to_numpy(loss_on_real)[0] +
                  tensor.to_numpy(loss_on_fake)[0])

        # Generator step: fresh noise paired with the "real" labels.
        noise.uniform(-1, 1)
        _, gen_loss = self.model.train_one_batch(noise, real_labels)
        g_loss = tensor.to_numpy(gen_loss)[0]

        if iteration % self.interval != 0:
            continue
        self.model.eval()
        self.save_image(iteration)
        print_log(' The {} iteration, G_LOSS: {}, D_LOSS: {}'.format(
            iteration, g_loss, d_loss))
def __init__(self, model_url, model_path, singa_model, device_id, **knobs):
    """Store the model description, create the GPU device and seed the RNGs.

    Args:
        model_url: stored as ``self.model_url``.
        model_path: stored as ``self.model_path``.
        singa_model: stored as ``self.singa_model``.
        device_id: index handed to ``device.create_cuda_gpu_on``.
        **knobs: forwarded to the parent constructor, kept in
            ``self._knobs`` and also exposed as instance attributes.
    """
    super().__init__(**knobs)

    # Keep the raw knobs and mirror each one as an attribute; the explicit
    # attributes below are assigned afterwards and therefore win on a clash.
    self._knobs = knobs
    self.__dict__.update(knobs)

    for attr_name, attr_value in (('model_url', model_url),
                                  ('model_path', model_path),
                                  ('singa_model', singa_model)):
        setattr(self, attr_name, attr_value)

    self.dev = device.create_cuda_gpu_on(device_id)

    # Fixed seeds for the device and for numpy.
    self.dev.SetRandSeed(0)
    np.random.seed(0)
def __init__(self, device_id=0, length=20, **knobs):
    """Configure the BERT-SQuAD ONNX model source, device and limits.

    Args:
        device_id: index handed to ``device.create_cuda_gpu_on``.
        length: accepted but not read anywhere in this constructor.
        **knobs: forwarded unchanged to the parent constructor.
    """
    super().__init__(**knobs)

    # Path of the ONNX file inside the downloaded tar archive.
    self.model_path = 'download_sample_10/bertsquad10.onnx'
    # URL of the tar archive that contains the ONNX model.
    self.model_url = 'https://media.githubusercontent.com/media/onnx/models/master/text/machine_comprehension/bert-squad/model/bertsquad-10.tar.gz'

    self.dev = device.create_cuda_gpu_on(device_id)

    # Fixed limits and sizes stored on the instance.
    self.max_seq_length = 256
    self.max_query_length = 64
    self.max_answer_length = 30
    self.doc_stride = 128
    self.n_best_size = 20
    self.batch_size = 3
x = autograd.relu(features) x = self.globalpooling(x) x = autograd.flatten(x) x = self.fc(x) return x def __call__(self, input): x = self.features(input) x = self.logits(x) return x if __name__ == '__main__': model = Xception(num_classes=1000) print('Start intialization............') dev = device.create_cuda_gpu_on(0) #dev = device.create_cuda_gpu() niters = 20 batch_size = 16 IMG_SIZE = 299 sgd = opt.SGD(lr=0.1, momentum=0.9, weight_decay=1e-5) tx = tensor.Tensor((batch_size, 3, IMG_SIZE, IMG_SIZE), dev) ty = tensor.Tensor((batch_size, ), dev, tensor.int32) autograd.training = True x = np.random.randn(batch_size, 3, IMG_SIZE, IMG_SIZE).astype(np.float32) y = np.random.randint(0, 1000, batch_size, dtype=np.int32) tx.copy_from_numpy(x) ty.copy_from_numpy(y)
def train_mnist_cnn(sgd,
                    max_epoch,
                    batch_size,
                    DIST=False,
                    data_partition=None,
                    gpu_num=None,
                    gpu_per_node=None,
                    nccl_id=None):
    """Train and evaluate the CNN on the loaded dataset for ``max_epoch`` epochs.

    Args:
        sgd: optimizer; wrapped in ``opt.DistOpt`` when ``DIST`` is truthy.
        max_epoch: number of epochs to run.
        batch_size: samples per batch; leftover samples are not used.
        DIST: enables the distributed path (per-rank device, data partition,
            gradient all-reduce and metric reduction).
        data_partition: callable used to shard train/test data when ``DIST``.
        gpu_num, gpu_per_node, nccl_id: forwarded to ``opt.DistOpt``.
    """
    IMG_SIZE = 28
    num_classes = 10

    # Load the data, one-hot encode the labels, scale pixels by 1/255.
    train_x, train_y, test_x, test_y = load_dataset()
    train_y = to_categorical(train_y, num_classes)
    test_y = to_categorical(test_y, num_classes)
    train_x = train_x / 255
    test_x = test_x / 255

    if DIST:
        # Distributed: one device per local rank, one data shard per rank.
        sgd = opt.DistOpt(sgd,
                          nccl_id=nccl_id,
                          gpu_num=gpu_num,
                          gpu_per_node=gpu_per_node)
        dev = device.create_cuda_gpu_on(sgd.rank_in_local)
        train_x, train_y = data_partition(train_x, train_y,
                                          sgd.rank_in_global, sgd.world_size)
        test_x, test_y = data_partition(test_x, test_y, sgd.rank_in_global,
                                        sgd.world_size)
        world_size = sgd.world_size
    else:
        # Single GPU.
        dev = device.create_cuda_gpu()
        world_size = 1

    def _is_reporting_rank():
        # Printing happens in single-GPU runs and on global rank 0.
        return (DIST == False) or (sgd.rank_in_global == 0)

    model = CNN()

    tx = tensor.Tensor((batch_size, 1, IMG_SIZE, IMG_SIZE), dev,
                       tensor.float32)
    ty = tensor.Tensor((batch_size, num_classes), dev, tensor.int32)
    num_train_batch = train_x.shape[0] // batch_size
    num_test_batch = test_x.shape[0] // batch_size
    idx = np.arange(train_x.shape[0], dtype=np.int32)

    if DIST:
        # Run one dummy forward/backward pass to enumerate the parameters
        # and pass each of them to sychronize().
        autograd.training = True
        dummy_x = np.random.randn(batch_size, 1, IMG_SIZE,
                                  IMG_SIZE).astype(np.float32)
        dummy_y = np.zeros(shape=(batch_size, num_classes), dtype=np.int32)
        tx.copy_from_numpy(dummy_x)
        ty.copy_from_numpy(dummy_y)
        dummy_loss = autograd.softmax_cross_entropy(model.forward(tx), ty)
        for param, _ in autograd.backward(dummy_loss):
            sychronize(param, sgd)

    for epoch in range(max_epoch):
        start_time = time.time()
        np.random.shuffle(idx)

        if _is_reporting_rank():
            print('Starting Epoch %d:' % (epoch))

        # ---- training phase ----
        autograd.training = True
        train_correct = np.zeros(shape=[1], dtype=np.float32)
        test_correct = np.zeros(shape=[1], dtype=np.float32)
        train_loss = np.zeros(shape=[1], dtype=np.float32)

        for step in range(num_train_batch):
            picked = idx[step * batch_size:(step + 1) * batch_size]
            batch_x = augmentation(train_x[picked], batch_size)
            batch_y = train_y[picked]
            tx.copy_from_numpy(batch_x)
            ty.copy_from_numpy(batch_y)

            logits = model.forward(tx)
            batch_loss = autograd.softmax_cross_entropy(logits, ty)
            train_correct += accuracy(tensor.to_numpy(logits), batch_y)
            train_loss += tensor.to_numpy(batch_loss)[0]

            # Collect every (param, grad) pair first; in distributed mode
            # each gradient is all-reduced and the reductions are awaited
            # before any parameter is updated.
            pending = []
            for param, grad in autograd.backward(batch_loss):
                if DIST:
                    sgd.all_reduce(grad)
                pending.append((param, grad))
            if DIST:
                sgd.wait()
            for param, grad in pending:
                sgd.update(param, grad)

        if DIST:
            # Combine accuracy and loss across devices.
            reducer = tensor.Tensor((1,), dev, tensor.float32)
            train_correct = reduce_variable(train_correct, sgd, reducer)
            train_loss = reduce_variable(train_loss, sgd, reducer)

        if _is_reporting_rank():
            print('Training loss = %f, training accuracy = %f' %
                  (train_loss, train_correct /
                   (num_train_batch * batch_size * world_size)),
                  flush=True)

        # ---- evaluation phase ----
        autograd.training = False
        for step in range(num_test_batch):
            lo, hi = step * batch_size, (step + 1) * batch_size
            batch_x = test_x[lo:hi]
            batch_y = test_y[lo:hi]
            tx.copy_from_numpy(batch_x)
            ty.copy_from_numpy(batch_y)
            out_test = model.forward(tx)
            test_correct += accuracy(tensor.to_numpy(out_test), batch_y)

        if DIST:
            # Combine evaluation accuracy across devices.
            test_correct = reduce_variable(test_correct, sgd, reducer)

        if _is_reporting_rank():
            print('Evaluation accuracy = %f, Elapsed Time = %fs' %
                  (test_correct / (num_test_batch * batch_size * world_size),
                   time.time() - start_time),
                  flush=True)
def train_mnist_cnn(DIST=False,
                    local_rank=None,
                    world_size=None,
                    nccl_id=None,
                    spars=0,
                    topK=False,
                    corr=True):
    """Train the MNIST CNN, optionally synchronising through a kvstore.

    With ``DIST`` truthy a ``dist_sync`` kvstore is created, the data is
    partitioned by kvstore rank and every step is applied through
    ``singa_kvstore.backward_and_update``. Otherwise training runs on a
    single GPU with the local SGD optimizer.

    Every 400 training batches up to 20 test batches are evaluated and the
    accuracy over the batches actually evaluated is printed.

    Args:
        DIST: enable the kvstore-based distributed path.
        local_rank: accepted for interface compatibility; not read here.
        world_size: overwritten with ``kv.num_workers`` when ``DIST``;
            not read otherwise.
        nccl_id, spars, topK, corr: accepted for interface compatibility;
            not read here.
    """
    # Hyper-parameters for the MNIST CNN.
    max_epoch = 10
    batch_size = 128
    sgd = opt.SGD(lr=0.005, momentum=0.9, weight_decay=1e-5)

    # Load the data, one-hot encode the labels, scale pixels by 1/255.
    train_x, train_y, test_x, test_y = load_dataset()
    IMG_SIZE = 28
    num_classes = 10
    train_y = to_categorical(train_y, num_classes)
    test_y = to_categorical(test_y, num_classes)
    train_x = train_x / 255
    test_x = test_x / 255

    # `kv` stays None on the single-GPU path so later code can branch on it
    # instead of hitting an unbound name.
    kv = None
    if DIST:
        kv_type = 'dist_sync'  # synchronization mode
        kv = singa_kvstore.create_kvstore(kv_type,
                                          'sgd',
                                          learning_rate=0.005)
        global_rank = kv.rank
        world_size = kv.num_workers
        # Each worker trains and evaluates on its own shard.
        train_x, train_y = data_partition(train_x, train_y, global_rank,
                                          world_size)
        test_x, test_y = data_partition(test_x, test_y, global_rank,
                                        world_size)

    model = CNN()

    if kv is not None:
        # NOTE(review): the kvstore rank is used as the GPU index, as in the
        # original code; confirm this is valid for multi-node runs.
        dev = device.create_cuda_gpu_on(kv.rank)
    else:
        dev = device.create_cuda_gpu()

    tx = tensor.Tensor((batch_size, 1, IMG_SIZE, IMG_SIZE), dev,
                       tensor.float32)
    ty = tensor.Tensor((batch_size, num_classes), dev, tensor.int32)
    num_train_batch = train_x.shape[0] // batch_size
    idx = np.arange(train_x.shape[0], dtype=np.int32)

    for epoch in range(max_epoch):
        np.random.shuffle(idx)

        if (DIST == True):
            print('^_^Starting Epoch %d:' % (epoch))

        # Training phase
        autograd.training = True
        train_correct = np.zeros(shape=[1], dtype=np.float32)
        train_loss = np.zeros(shape=[1], dtype=np.float32)
        time_start = time.time()

        for b in range(num_train_batch):
            picked = idx[b * batch_size:(b + 1) * batch_size]
            x = augmentation(train_x[picked], batch_size)
            y = train_y[picked]
            tx.copy_from_numpy(x)
            ty.copy_from_numpy(y)
            out = model.forward(tx)
            loss = autograd.softmax_cross_entropy(out, ty)
            train_correct += accuracy(tensor.to_numpy(out), y)
            train_loss += tensor.to_numpy(loss)[0]

            if kv is not None:
                singa_kvstore.backward_and_update(kv, loss)
            else:
                sgd.backward_and_update(loss)

            # Evaluate only on every 400th training batch.
            if b % 400 != 0:
                continue

            autograd.training = False
            num_test_batch_inside = 20
            test_correct = 0
            evaluated = 0
            # A separate index keeps the outer batch counter `b` intact.
            for tb in range(num_test_batch_inside):
                x = test_x[tb * batch_size:(tb + 1) * batch_size]
                y = test_y[tb * batch_size:(tb + 1) * batch_size]
                # Stop at the first short batch: it does not fit `tx`.
                if x.shape[0] != tx.shape[0]:
                    break
                tx.copy_from_numpy(x)
                ty.copy_from_numpy(y)
                out_test = model.forward(tx)
                test_correct += accuracy(tensor.to_numpy(out_test), y)
                evaluated += 1

            # Normalise by the batches really evaluated so an early break
            # does not deflate the accuracy; max() avoids dividing by zero.
            print('Evaluation accuracy = %f' %
                  (test_correct / (batch_size * max(evaluated, 1))),
                  flush=True)
            autograd.training = True

        print('epoch time is %f' % (time.time() - time_start))
for i in range(datalen): a=x[i] sen.append(dic[a]) return sen if __name__ == "__main__": model_file = open('71.bin', 'rb') param = pickle.load(model_file) model_file.close() decoderw=param['decoder_w'] densew,denseb=param['dense_w'],param['dense_b'] hiddensize=param['hidden_size'] numstacks=param['num_stacks'] drop_out=param['dropout'] vocab_size=7000 cuda = device.create_cuda_gpu_on(1) encoder = layer.LSTM(name='lstm1', hidden_size=hiddensize, num_stacks=numstacks, dropout=drop_out, input_sample_shape=(vocab_size,)) decoder = layer.LSTM(name='lstm2', hidden_size=hiddensize, num_stacks=numstacks, dropout=drop_out, input_sample_shape=(vocab_size,)) encoder.to_device(cuda) decoder.to_device(cuda) encoder_w = encoder.param_values()[0] encoder_w.uniform(-0.08, 0.08) decoder.param_values()[0].copy_from_numpy(decoderw, offset=0) dense = layer.Dense('dense', vocab_size, input_sample_shape=(hiddensize,)) dense.to_device(cuda) dense.param_values()[0].copy_from_numpy(densew,offset=0) dense.param_values()[1].copy_from_numpy(denseb,offset=0) metadata,idx_q,idx_a=load_data()
def train(data,
          net,
          max_epoch,
          get_lr,
          weight_decay,
          batch_size=100,
          use_cpu=False):
    """Run a short, timestamp-logged training/evaluation session.

    Runs 3 epochs of 20 training and 10 test batches each, sleeping one
    second between steps, and appends millisecond timestamps for every epoch
    and training iteration to ``epochTimeLog.text``. The model is saved to
    ``model`` afterwards.

    Args:
        data: tuple ``(train_x, train_y, test_x, test_y)``.
        net: network providing ``to_device``, ``param_names``,
            ``param_specs``, ``param_values``, ``train``, ``evaluate`` and
            ``save``.
        max_epoch: not read by this function; the epoch count is fixed at 3.
            NOTE(review): confirm whether this is intentional.
        get_lr: callable mapping the epoch index to a learning rate.
        weight_decay: weight decay given to the SGD optimizer.
        batch_size: number of samples per batch.
        use_cpu: use the default device instead of CUDA device 1.
    """
    # Fixed sizes of this timing run.
    timed_epochs = 3
    train_iters = 20
    test_iters = 10

    def _now_ms():
        # Current wall-clock time in whole milliseconds.
        return int(round(time.time() * 1000))

    print('Start intialization............')
    if use_cpu:
        print('Using CPU')
        dev = device.get_default_device()
    else:
        print('Using GPU')
        dev = device.create_cuda_gpu_on(1)

    net.to_device(dev)
    sgd = optimizer.SGD(momentum=0.9, weight_decay=weight_decay)
    for (p, specs) in zip(net.param_names(), net.param_specs()):
        sgd.register(p, specs)

    tx = tensor.Tensor((batch_size, 3, 32, 32), dev)
    ty = tensor.Tensor((batch_size,), dev, core_pb2.kInt)
    train_x, train_y, test_x, test_y = data
    idx = np.arange(train_x.shape[0], dtype=np.int32)

    # The context manager closes the log even if a step raises.
    with open("epochTimeLog.text", "a") as fileTimeLog:
        for epoch in range(timed_epochs):
            time.sleep(1)
            np.random.shuffle(idx)
            loss, acc = 0.0, 0.0
            print('Epoch %d' % epoch)
            print(datetime.now().timetz())
            # milliseconds
            print(_now_ms())
            fileTimeLog.write('Epoch %d: ' % epoch)
            fileTimeLog.write(str(_now_ms()))
            fileTimeLog.write('\n')

            for b in range(train_iters):
                time.sleep(1)
                print("train iteration %d" % b)
                fileTimeLog.write('iteration %d: ' % b)
                fileTimeLog.write(str(_now_ms()))
                fileTimeLog.write('\n')
                picked = idx[b * batch_size:(b + 1) * batch_size]
                x = train_x[picked]
                y = train_y[picked]
                tx.copy_from_numpy(x)
                ty.copy_from_numpy(y)
                grads, (l, a) = net.train(tx, ty)
                loss += l
                acc += a
                for (s, p, g) in zip(net.param_names(), net.param_values(),
                                     grads):
                    sgd.apply_with_lr(epoch, get_lr(epoch), g, p, str(s), b)
                # Progress is measured against the batches actually run.
                utils.update_progress(
                    b * 1.0 / train_iters,
                    'training loss = %f, accuracy = %f' % (l, a))

            # Average over the batches actually processed, not over the
            # size of the whole training set.
            info = '\ntraining loss = %f, training accuracy = %f, lr = %f' \
                % ((loss / train_iters), (acc / train_iters), get_lr(epoch))
            print(info)
            time.sleep(1)

            loss, acc = 0.0, 0.0
            for b in range(test_iters):
                time.sleep(1)
                print("test iteration %d" % b)
                x = test_x[b * batch_size:(b + 1) * batch_size]
                y = test_y[b * batch_size:(b + 1) * batch_size]
                tx.copy_from_numpy(x)
                ty.copy_from_numpy(y)
                l, a = net.evaluate(tx, ty)
                loss += l
                acc += a

            print('test loss = %f, test accuracy = %f' %
                  ((loss / test_iters), (acc / test_iters)))

    net.save('model', 20)  # save model params into checkpoint file
def train_cifar10(sgd,
                  max_epoch,
                  batch_size,
                  DIST=False,
                  data_partition=None,
                  gpu_num=None,
                  gpu_per_node=None,
                  nccl_id=None,
                  partial_update=False):
    """Train and evaluate ResNet-50 on the loaded data resized to 224x224.

    Args:
        sgd: optimizer; wrapped in ``opt.DistOpt`` when ``DIST`` is truthy.
        max_epoch: number of epochs to run.
        batch_size: samples per batch; leftover samples are not used.
        DIST: enables the distributed path (per-rank device, data partition,
            initial parameter synchronisation, metric reduction).
        data_partition: callable used to shard train/test data when ``DIST``.
        gpu_num, gpu_per_node, nccl_id: forwarded to ``opt.DistOpt``.
        partial_update: use ``sgd.backward_and_partial_update`` for each step
            and pass the parameters to ``sychronize`` before evaluation.
    """
    IMG_SIZE = 224
    num_classes = 10

    train_x, train_y = load_train_data()
    test_x, test_y = load_test_data()
    train_x, test_x = normalize_for_resnet(train_x, test_x)

    if DIST:
        # Distributed: one device per local rank, one data shard per rank.
        sgd = opt.DistOpt(sgd,
                          nccl_id=nccl_id,
                          gpu_num=gpu_num,
                          gpu_per_node=gpu_per_node)
        dev = device.create_cuda_gpu_on(sgd.rank_in_local)
        train_x, train_y = data_partition(train_x, train_y,
                                          sgd.rank_in_global, sgd.world_size)
        test_x, test_y = data_partition(test_x, test_y, sgd.rank_in_global,
                                        sgd.world_size)
        world_size = sgd.world_size
    else:
        # Single GPU.
        dev = device.create_cuda_gpu()
        world_size = 1

    def _is_reporting_rank():
        # Printing happens in single-GPU runs and on global rank 0.
        return (DIST == False) or (sgd.rank_in_global == 0)

    from resnet import resnet50
    model = resnet50(num_classes=num_classes)

    tx = tensor.Tensor((batch_size, 3, IMG_SIZE, IMG_SIZE), dev,
                       tensor.float32)
    ty = tensor.Tensor((batch_size,), dev, tensor.int32)
    num_train_batch = train_x.shape[0] // batch_size
    num_test_batch = test_x.shape[0] // batch_size
    idx = np.arange(train_x.shape[0], dtype=np.int32)

    if DIST:
        # One dummy forward/backward pass enumerates the parameters; each is
        # passed to sychronize() and remembered for later re-synchronisation.
        autograd.training = True
        dummy_x = np.random.randn(batch_size, 3, IMG_SIZE,
                                  IMG_SIZE).astype(np.float32)
        dummy_y = np.zeros(shape=(batch_size,), dtype=np.int32)
        tx.copy_from_numpy(dummy_x)
        ty.copy_from_numpy(dummy_y)
        dummy_loss = autograd.softmax_cross_entropy(model(tx), ty)
        param = []
        for p, _ in autograd.backward(dummy_loss):
            sychronize(p, sgd)
            param.append(p)

    for epoch in range(max_epoch):
        start_time = time.time()
        np.random.shuffle(idx)

        if _is_reporting_rank():
            print('Starting Epoch %d:' % (epoch))

        # ---- training phase ----
        autograd.training = True
        train_correct = np.zeros(shape=[1], dtype=np.float32)
        test_correct = np.zeros(shape=[1], dtype=np.float32)
        train_loss = np.zeros(shape=[1], dtype=np.float32)

        for step in range(num_train_batch):
            picked = idx[step * batch_size:(step + 1) * batch_size]
            batch_x = train_x[picked]
            batch_x = augmentation(batch_x, batch_size)
            batch_x = resize_dataset(batch_x, IMG_SIZE)
            batch_y = train_y[picked]
            tx.copy_from_numpy(batch_x)
            ty.copy_from_numpy(batch_y)

            logits = model(tx)
            batch_loss = autograd.softmax_cross_entropy(logits, ty)
            train_correct += accuracy(
                tensor.to_numpy(logits),
                to_categorical(batch_y, num_classes)).astype(np.float32)
            train_loss += tensor.to_numpy(batch_loss)[0]

            if partial_update:
                sgd.backward_and_partial_update(batch_loss)
            else:
                sgd.backward_and_update(batch_loss)

        if DIST:
            # Combine accuracy and loss across devices.
            reducer = tensor.Tensor((1,), dev, tensor.float32)
            train_correct = reduce_variable(train_correct, sgd, reducer)
            train_loss = reduce_variable(train_loss, sgd, reducer)

        if _is_reporting_rank():
            print('Training loss = %f, training accuracy = %f' %
                  (train_loss, train_correct /
                   (num_train_batch * batch_size * world_size)),
                  flush=True)

        if partial_update:
            # Synchronise parameters before the evaluation phase.
            for p in param:
                sychronize(p, sgd)

        # ---- evaluation phase ----
        autograd.training = False
        for step in range(num_test_batch):
            lo, hi = step * batch_size, (step + 1) * batch_size
            batch_x = resize_dataset(test_x[lo:hi], IMG_SIZE)
            batch_y = test_y[lo:hi]
            tx.copy_from_numpy(batch_x)
            ty.copy_from_numpy(batch_y)
            out_test = model(tx)
            test_correct += accuracy(tensor.to_numpy(out_test),
                                     to_categorical(batch_y, num_classes))

        if DIST:
            # Combine evaluation accuracy across devices.
            test_correct = reduce_variable(test_correct, sgd, reducer)

        if _is_reporting_rank():
            print('Evaluation accuracy = %f, Elapsed Time = %fs' %
                  (test_correct / (num_test_batch * batch_size * world_size),
                   time.time() - start_time),
                  flush=True)
parser.add_argument('-n', '--num-layers', default=2, type=int, help='num layers', dest='num_layers') args = parser.parse_args() # parameters seq_limit = 50 embed_size = 300 hid = 32 # gpu device dev = device.create_cuda_gpu_on(args.device_id) # create placeholder tx = tensor.Tensor((args.bs, seq_limit, embed_size), dev, tensor.float32) ty = tensor.Tensor((args.bs, 2), dev, tensor.float32) tx.gaussian(0, 1) ty.gaussian(0, 1) # create model m = IMDBModel(hid, mode=args.mode, return_sequences=args.return_sequences, bidirectional=args.bidirectional, num_layers=args.num_layers) m.set_opt(opt.SGD(args.lr, 0.9))
def train_resnet(DIST='singa', graph=True, sequential=False):
    """Benchmark ResNet-50 with either DistOpt or kvstore synchronisation.

    One synthetic batch is fed to the model for a fixed number of
    iterations. The time spent in the loss computation and in the
    update/synchronisation step is accumulated separately and printed,
    together with the overall throughput, by global rank 0.

    Args:
        DIST: ``'singa'`` selects ``opt.DistOpt``; any other value selects
            the ``dist_sync`` kvstore path.
        graph: first argument passed to ``model.graph``.
        sequential: overwritten with ``True`` on both paths before use.
    """
    # Benchmark hyper-parameters.
    niters = 100
    batch_size = 32
    sgd = opt.SGD(lr=0.1, momentum=0.9, weight_decay=1e-5)
    IMG_SIZE = 224

    use_singa = DIST == 'singa'
    kv = None

    # For distributed training, sequential has better throughput in the
    # current version.
    if use_singa:
        sgd = opt.DistOpt(sgd)
        world_size = sgd.world_size
        global_rank = sgd.global_rank
        # The GPU index comes from the DistOpt local rank; no kvstore exists
        # on this path.
        device_id = sgd.local_rank
        sequential = True
    else:
        kv_type = 'dist_sync'  # synchronization mode
        kv = singa_kvstore.create_kvstore(kv_type,
                                          'sgd',
                                          learning_rate=0.005)
        global_rank = kv.rank
        world_size = kv.num_workers
        device_id = kv.rank
        sequential = True

    dev = device.create_cuda_gpu_on(device_id)

    # Device placeholders, filled once with random images and labels.
    tx = tensor.Tensor((batch_size, 3, IMG_SIZE, IMG_SIZE), dev)
    ty = tensor.Tensor((batch_size,), dev, tensor.int32)
    x = np.random.randn(batch_size, 3, IMG_SIZE, IMG_SIZE).astype(np.float32)
    y = np.random.randint(0, 1000, batch_size, dtype=np.int32)
    tx.copy_from_numpy(x)
    ty.copy_from_numpy(y)

    # Construct the model.
    from model import resnet
    model = resnet.resnet50(num_channels=3, num_classes=1000)

    model.train()
    model.on_device(dev)
    model.set_optimizer(sgd)
    model.graph(graph, sequential)

    # Timed training loop.
    if use_singa:
        dev.Sync()
    compute_time = 0.0
    syn_time = 0.0
    start = time.time()
    with trange(niters) as t:
        for _ in t:
            out = model(tx)

            compute_start = time.time()
            loss = model.loss(out, ty)
            compute_time += time.time() - compute_start

            syn_start = time.time()
            if use_singa:
                model.optim(loss, dist_option='fp32', spars=None)
            else:
                singa_kvstore.backward_and_update(kv, loss)
            syn_time += time.time() - syn_start
    if use_singa:
        dev.Sync()
    end = time.time()

    # Per-iteration averages and overall throughput.
    compute_time = compute_time / float(niters)
    syn_time = syn_time / float(niters)
    titer = (end - start) / float(niters)
    throughput = float(niters * batch_size * world_size) / (end - start)
    if global_rank == 0:
        print("compute_time = {}".format(compute_time), flush=True)
        print("syn_time = {}".format(syn_time), flush=True)
        print("Throughput = {} per second".format(throughput), flush=True)
        print("TotalTime={}".format(end - start), flush=True)
        print("Total={}".format(titer), flush=True)
def train_mnist_cnn(DIST=False,
                    local_rank=None,
                    world_size=None,
                    nccl_id=None,
                    spars=0,
                    topK=False,
                    corr=True):
    """Train and evaluate the MNIST CNN on one GPU or with ``opt.DistOpt``.

    Args:
        DIST: enables the distributed path (per-rank device, data partition,
            initial parameter synchronisation, metric reduction).
        local_rank, world_size, nccl_id: forwarded to ``opt.DistOpt``;
            ``world_size`` is then replaced by the optimizer's value (or by
            1 on the single-GPU path).
        spars: 0 selects ``backward_and_update`` with ``threshold=50000``;
            any other value selects ``backward_and_sparse_update``.
        topK, corr: forwarded to ``backward_and_sparse_update``.
    """
    # Hyper-parameters for the MNIST CNN.
    max_epoch = 10
    batch_size = 64
    IMG_SIZE = 28
    num_classes = 10
    sgd = opt.SGD(lr=0.005, momentum=0.9, weight_decay=1e-5)

    # Load the data, one-hot encode the labels, scale pixels by 1/255.
    train_x, train_y, test_x, test_y = load_dataset()
    train_y = to_categorical(train_y, num_classes)
    test_y = to_categorical(test_y, num_classes)
    train_x = train_x / 255
    test_x = test_x / 255

    if DIST:
        # Distributed: one device per local rank, one data shard per rank.
        sgd = opt.DistOpt(sgd,
                          nccl_id=nccl_id,
                          local_rank=local_rank,
                          world_size=world_size)
        dev = device.create_cuda_gpu_on(sgd.local_rank)
        train_x, train_y = data_partition(train_x, train_y, sgd.global_rank,
                                          sgd.world_size)
        test_x, test_y = data_partition(test_x, test_y, sgd.global_rank,
                                        sgd.world_size)
        world_size = sgd.world_size
    else:
        # Single GPU.
        dev = device.create_cuda_gpu()
        world_size = 1

    def _is_reporting_rank():
        # Printing happens in single-GPU runs and on global rank 0.
        return (DIST == False) or (sgd.global_rank == 0)

    model = CNN()

    tx = tensor.Tensor((batch_size, 1, IMG_SIZE, IMG_SIZE), dev,
                       tensor.float32)
    ty = tensor.Tensor((batch_size, num_classes), dev, tensor.int32)
    num_train_batch = train_x.shape[0] // batch_size
    num_test_batch = test_x.shape[0] // batch_size
    idx = np.arange(train_x.shape[0], dtype=np.int32)

    if DIST:
        # Run one dummy forward/backward pass to enumerate the parameters
        # and pass each of them to synchronize().
        autograd.training = True
        dummy_x = np.random.randn(batch_size, 1, IMG_SIZE,
                                  IMG_SIZE).astype(np.float32)
        dummy_y = np.zeros(shape=(batch_size, num_classes), dtype=np.int32)
        tx.copy_from_numpy(dummy_x)
        ty.copy_from_numpy(dummy_y)
        dummy_loss = autograd.softmax_cross_entropy(model.forward(tx), ty)
        for param, _ in autograd.backward(dummy_loss):
            synchronize(param, sgd)

    for epoch in range(max_epoch):
        start_time = time.time()
        np.random.shuffle(idx)

        if _is_reporting_rank():
            print('Starting Epoch %d:' % (epoch))

        # ---- training phase ----
        autograd.training = True
        train_correct = np.zeros(shape=[1], dtype=np.float32)
        test_correct = np.zeros(shape=[1], dtype=np.float32)
        train_loss = np.zeros(shape=[1], dtype=np.float32)

        for step in range(num_train_batch):
            picked = idx[step * batch_size:(step + 1) * batch_size]
            batch_x = augmentation(train_x[picked], batch_size)
            batch_y = train_y[picked]
            tx.copy_from_numpy(batch_x)
            ty.copy_from_numpy(batch_y)

            logits = model.forward(tx)
            batch_loss = autograd.softmax_cross_entropy(logits, ty)
            train_correct += accuracy(tensor.to_numpy(logits), batch_y)
            train_loss += tensor.to_numpy(batch_loss)[0]

            # Pick the update routine for this configuration.
            if not DIST:
                sgd.backward_and_update(batch_loss)
            elif spars == 0:
                sgd.backward_and_update(batch_loss, threshold=50000)
            else:
                sgd.backward_and_sparse_update(batch_loss,
                                               spars=spars,
                                               topK=topK,
                                               corr=corr)

        if DIST:
            # Combine accuracy and loss across devices.
            reducer = tensor.Tensor((1,), dev, tensor.float32)
            train_correct = reduce_variable(train_correct, sgd, reducer)
            train_loss = reduce_variable(train_loss, sgd, reducer)

        if _is_reporting_rank():
            print('Training loss = %f, training accuracy = %f' %
                  (train_loss, train_correct /
                   (num_train_batch * batch_size * world_size)),
                  flush=True)

        # ---- evaluation phase ----
        autograd.training = False
        for step in range(num_test_batch):
            lo, hi = step * batch_size, (step + 1) * batch_size
            batch_x = test_x[lo:hi]
            batch_y = test_y[lo:hi]
            tx.copy_from_numpy(batch_x)
            ty.copy_from_numpy(batch_y)
            out_test = model.forward(tx)
            test_correct += accuracy(tensor.to_numpy(out_test), batch_y)

        if DIST:
            # Combine evaluation accuracy across devices.
            test_correct = reduce_variable(test_correct, sgd, reducer)

        if _is_reporting_rank():
            print('Evaluation accuracy = %f, Elapsed Time = %fs' %
                  (test_correct / (num_test_batch * batch_size * world_size),
                   time.time() - start_time),
                  flush=True)
x = autograd.relu(features) x = self.globalpooling(x) x = autograd.flatten(x) x = self.fc(x) return x def __call__(self, input): x = self.features(input) x = self.logits(x) return x if __name__ == '__main__': model = Xception(num_classes=1000) print('Start intialization............') dev = device.create_cuda_gpu_on(0) #dev = device.create_cuda_gpu() niters = 20 batch_size = 16 IMG_SIZE = 299 sgd = opt.SGD(lr=0.1, momentum=0.9, weight_decay=1e-5) tx = tensor.Tensor((batch_size, 3, IMG_SIZE, IMG_SIZE), dev) ty = tensor.Tensor((batch_size,), dev, tensor.int32) autograd.training = True x = np.random.randn(batch_size, 3, IMG_SIZE, IMG_SIZE).astype(np.float32) y = np.random.randint(0, 1000, batch_size, dtype=np.int32) tx.copy_from_numpy(x) ty.copy_from_numpy(y)
def run(global_rank,
        world_size,
        local_rank,
        max_epoch,
        batch_size,
        model,
        data,
        sgd,
        graph,
        verbosity,
        dist_option='fp32',
        spars=None):
    """Train and evaluate a named model on a named dataset.

    Args:
        global_rank: rank used to decide which process prints and, when
            distributed, which data partition is used.
        world_size: number of workers; used for partitioning and for
            normalising the reduced accuracy.
        local_rank: index handed to ``device.create_cuda_gpu_on``.
        max_epoch: number of epochs to run.
        batch_size: samples per batch; leftover samples are not used.
        model: one of ``'resnet'``, ``'xceptionnet'``, ``'cnn'``,
            ``'alexnet'``, ``'mlp'``; any other value is used as given.
        data: one of ``'cifar10'``, ``'cifar100'``, ``'mnist'``.
        sgd: optimizer; the distributed path is taken when it has a
            ``communicator`` attribute.
        graph: passed to ``model.compile`` as ``use_graph``.
        verbosity: passed to ``dev.SetVerbosity``.
        dist_option, spars: forwarded to the model's training call.

    Raises:
        ValueError: if ``data`` is not one of the supported dataset names.
    """
    dev = device.create_cuda_gpu_on(local_rank)
    dev.SetRandSeed(0)
    np.random.seed(0)

    if data == 'cifar10':
        from data import cifar10
        train_x, train_y, val_x, val_y = cifar10.load()
    elif data == 'cifar100':
        from data import cifar100
        train_x, train_y, val_x, val_y = cifar100.load()
    elif data == 'mnist':
        from data import mnist
        train_x, train_y, val_x, val_y = mnist.load()
    else:
        # Fail with a clear message instead of a NameError further down.
        raise ValueError('unsupported dataset: %r' % (data,))

    num_channels = train_x.shape[1]
    image_size = train_x.shape[2]
    data_size = np.prod(train_x.shape[1:train_x.ndim]).item()
    num_classes = (np.max(train_y) + 1).item()

    if model == 'resnet':
        from model import resnet
        model = resnet.resnet50(num_channels=num_channels,
                                num_classes=num_classes)
    elif model == 'xceptionnet':
        from model import xceptionnet
        model = xceptionnet.create_model(num_channels=num_channels,
                                         num_classes=num_classes)
    elif model == 'cnn':
        from model import cnn
        model = cnn.create_model(num_channels=num_channels,
                                 num_classes=num_classes)
    elif model == 'alexnet':
        from model import alexnet
        model = alexnet.create_model(num_channels=num_channels,
                                     num_classes=num_classes)
    elif model == 'mlp':
        # The mlp package is imported from the parent of this file's
        # directory, so that directory is put first on sys.path.
        import os, sys, inspect
        current = os.path.dirname(
            os.path.abspath(inspect.getfile(inspect.currentframe())))
        parent = os.path.dirname(current)
        sys.path.insert(0, parent)
        from mlp import module
        model = module.create_model(data_size=data_size,
                                    num_classes=num_classes)

    # For distributed training, sequential gives better performance.
    if hasattr(sgd, "communicator"):
        DIST = True
        sequential = True
    else:
        DIST = False
        sequential = False

    if DIST:
        train_x, train_y, val_x, val_y = partition(global_rank, world_size,
                                                   train_x, train_y, val_x,
                                                   val_y)

    if model.dimension == 4:
        tx = tensor.Tensor(
            (batch_size, num_channels, model.input_size, model.input_size),
            dev, tensor.float32)
    elif model.dimension == 2:
        tx = tensor.Tensor((batch_size, data_size), dev, tensor.float32)
        # np.reshape returns the reshaped array rather than modifying its
        # argument, so the result has to be assigned back.
        train_x = np.reshape(train_x, (train_x.shape[0], -1))
        val_x = np.reshape(val_x, (val_x.shape[0], -1))

    ty = tensor.Tensor((batch_size,), dev, tensor.int32)
    num_train_batch = train_x.shape[0] // batch_size
    num_val_batch = val_x.shape[0] // batch_size
    idx = np.arange(train_x.shape[0], dtype=np.int32)

    # Attach the optimizer and compile the model.
    model.set_optimizer(sgd)
    model.compile([tx], is_train=True, use_graph=graph, sequential=sequential)
    dev.SetVerbosity(verbosity)

    # Training and evaluation loop.
    for epoch in range(max_epoch):
        start_time = time.time()
        np.random.shuffle(idx)

        if global_rank == 0:
            print('Starting Epoch %d:' % (epoch))

        # Training phase
        train_correct = np.zeros(shape=[1], dtype=np.float32)
        test_correct = np.zeros(shape=[1], dtype=np.float32)
        train_loss = np.zeros(shape=[1], dtype=np.float32)

        model.train()
        for b in range(num_train_batch):
            picked = idx[b * batch_size:(b + 1) * batch_size]
            x = train_x[picked]
            if model.dimension == 4:
                x = augmentation(x, batch_size)
                if (image_size != model.input_size):
                    x = resize_dataset(x, model.input_size)
            y = train_y[picked]

            # Copy the batch into the input tensors and train on it.
            tx.copy_from_numpy(x)
            ty.copy_from_numpy(y)
            out, loss = model(tx, ty, dist_option, spars)
            train_correct += accuracy(tensor.to_numpy(out), y)
            train_loss += tensor.to_numpy(loss)[0]

        if DIST:
            # Combine accuracy and loss across devices.
            reducer = tensor.Tensor((1,), dev, tensor.float32)
            train_correct = reduce_variable(train_correct, sgd, reducer)
            train_loss = reduce_variable(train_loss, sgd, reducer)

        if global_rank == 0:
            print('Training loss = %f, training accuracy = %f' %
                  (train_loss, train_correct /
                   (num_train_batch * batch_size * world_size)),
                  flush=True)

        # Evaluation phase
        model.eval()
        for b in range(num_val_batch):
            x = val_x[b * batch_size:(b + 1) * batch_size]
            if model.dimension == 4:
                if (image_size != model.input_size):
                    x = resize_dataset(x, model.input_size)
            y = val_y[b * batch_size:(b + 1) * batch_size]
            tx.copy_from_numpy(x)
            ty.copy_from_numpy(y)
            out_test = model(tx)
            test_correct += accuracy(tensor.to_numpy(out_test), y)

        if DIST:
            # Combine evaluation accuracy across devices.
            test_correct = reduce_variable(test_correct, sgd, reducer)

        if global_rank == 0:
            print('Evaluation accuracy = %f, Elapsed Time = %fs' %
                  (test_correct / (num_val_batch * batch_size * world_size),
                   time.time() - start_time),
                  flush=True)

    dev.PrintTimeProfiling()
from singa import autograd from singa import tensor from singa import device from singa import opt import numpy as np from tqdm import trange if __name__ == "__main__": sgd = opt.SGD(lr=0.1, momentum=0.9, weight_decay=1e-5) sgd = opt.DistOpt(sgd) if (sgd.global_rank == 0): print("Start intialization...........", flush=True) dev = device.create_cuda_gpu_on(sgd.local_rank) from resnet import resnet50 model = resnet50() niters = 100 batch_size = 32 IMG_SIZE = 224 tx = tensor.Tensor((batch_size, 3, IMG_SIZE, IMG_SIZE), dev) ty = tensor.Tensor((batch_size, ), dev, tensor.int32) autograd.training = True x = np.random.randn(batch_size, 3, IMG_SIZE, IMG_SIZE).astype(np.float32) y = np.random.randint(0, 1000, batch_size, dtype=np.int32) tx.copy_from_numpy(x) ty.copy_from_numpy(y)
from singa import opt import numpy as np from tqdm import trange if __name__ == "__main__": sgd = opt.SGD(lr=0.1, momentum=0.9, weight_decay=1e-5) sgd = opt.DistOpt(sgd) from resnet import resnet50 model = resnet50() if (sgd.rank_in_global == 0): print("Start intialization...........", flush=True) dev = device.create_cuda_gpu_on(sgd.rank_in_local) niters = 100 batch_size = 32 IMG_SIZE = 224 tx = tensor.Tensor((batch_size, 3, IMG_SIZE, IMG_SIZE), dev) ty = tensor.Tensor((batch_size, ), dev, tensor.int32) autograd.training = True x = np.random.randn(batch_size, 3, IMG_SIZE, IMG_SIZE).astype(np.float32) y = np.random.randint(0, 1000, batch_size, dtype=np.int32) tx.copy_from_numpy(x) ty.copy_from_numpy(y) import time dev.Sync()
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================= import unittest import numpy as np from singa import tensor from singa import opt from singa import device from singa import singa_wrap if (singa_wrap.USE_DIST): sgd = opt.SGD(lr=0.1) sgd = opt.DistOpt(sgd) dev = device.create_cuda_gpu_on(sgd.local_rank, set_default=False) param = tensor.Tensor((10, 10), dev, tensor.float32) grad = tensor.Tensor((10, 10), dev, tensor.float32) expected = np.ones((10, 10), dtype=np.float32) * (10 - 0.1) @unittest.skipIf(not singa_wrap.USE_DIST, 'DIST is not enabled') class TestDistOptimizer(unittest.TestCase): def test_dist_opt_fp32(self): # Test the C++ all reduce operation in fp32 param.set_value(10) grad.set_value(1) sgd.all_reduce(grad.data) sgd.wait()