def __init__(self, solver_file, softmax_layer_name, accuracy_layer_name, snapshot, gpu_idx=0):
    self.solver_file = solver_file
    self.softmax_layer_name = softmax_layer_name
    self.accuracy_layer_name = accuracy_layer_name
    self.snapshot = snapshot
    self.gpu = owl.create_gpu_device(gpu_idx)
    owl.set_device(self.gpu)
def __init__(self, solver_file, snapshot, layer_name, result_path, gpu_idx=0):
    self.solver_file = solver_file
    self.snapshot = snapshot
    self.layer_name = layer_name
    self.result_path = result_path
    self.gpu = owl.create_gpu_device(gpu_idx)
    owl.set_device(self.gpu)
def train_network(model, num_epochs=100, minibatch_size=256, dropout_rate=0.5,
                  eps_w=0.01, eps_b=0.01, mom=0.9, wd=0.0005):
    gpu0 = owl.create_gpu_device(0)
    gpu1 = owl.create_gpu_device(1)
    num_layers = 20
    num_weights = 8
    count = 0
    last = time.time()
    # dp = ImageNetDataProvider(mean_file='/home/minjie/data/imagenet/imagenet_mean.binaryproto',
    #                           train_db='/home/minjie/data/imagenet/ilsvrc12_train_lmdb',
    #                           val_db='/home/minjie/data/imagenet/ilsvrc12_val_lmdb',
    #                           test_db='/home/minjie/data/imagenet/ilsvrc12_test_lmdb')
    for i in xrange(num_epochs):
        print "---------------------Epoch #", i
        for j in xrange(300):
            count = count + 1
            data = owl.randn([227, 227, 3, minibatch_size], 0, 1)
            label = owl.randn([1, minibatch_size], 0, 1)
            weightsgrad = [None] * num_weights
            biasgrad = [None] * num_weights
            num_samples = minibatch_size
            '''
            thisimg = samples[0, :]
            print thisimg
            imgdata = np.transpose(thisimg.reshape([3, 227*227])).reshape([227, 227, 3])
            print imgdata
            img = Image.fromarray(imgdata.astype(np.uint8))
            img.save('testimg.jpg', format='JPEG')
            exit(0)
            '''
            owl.set_device(gpu0)
            out = train_one_mb(model, data, label, weightsgrad, biasgrad, dropout_rate)
            for k in range(num_weights):
                model.weightsdelta[k] = mom * model.weightsdelta[k] - eps_w / num_samples * (weightsgrad[k] + wd * model.weights[k])
                model.biasdelta[k] = mom * model.biasdelta[k] - eps_b / num_samples * (biasgrad[k] + wd * model.bias[k])
                model.weights[k] += model.weightsdelta[k]
                model.weights[k].start_eval()
                model.bias[k] += model.biasdelta[k]
                model.bias[k].start_eval()
            if count % 3 == 0:
                print_training_accuracy(out, label, data.shape[-1])
                print "time: %s" % (time.time() - last)
                last = time.time()
def train_network(model, num_epochs=100, minibatch_size=256, lr=0.01, mom=0.9, wd=0.0000):
    np.set_printoptions(linewidth=200)
    owl.set_device(owl.create_gpu_device(0))
    count = 0
    # load data
    (train_data, test_data) = imageio.load_mb_from_mat("mnist_all.mat", minibatch_size)
    num_test_samples = test_data[0].shape[0]
    (test_samples, test_labels) = map(lambda npdata: owl.from_nparray(npdata), test_data)
    for i in xrange(num_epochs):
        print "---Epoch #", i
        for (mb_samples, mb_labels) in train_data:
            num_samples = mb_samples.shape[0]
            data = owl.from_nparray(mb_samples).reshape([28, 28, 1, num_samples])
            label = owl.from_nparray(mb_labels)
            out, weightgrad, biasgrad = train(model, data, label)
            for k in range(len(model.weights)):
                model.weightdelta[k] = mom * model.weightdelta[k] - lr / num_samples * weightgrad[k] - wd * model.weights[k]
                model.biasdelta[k] = mom * model.biasdelta[k] - lr / num_samples * biasgrad[k]
                model.weights[k] += model.weightdelta[k]
                model.bias[k] += model.biasdelta[k]
            count = count + 1
            if (count % 1) == 0:
                print_training_accuracy(out, label, num_samples)
            if count == 100:
                sys.exit()
def train_network(model, num_epochs=100, minibatch_size=10, dropout_rate=0.5,
                  eps_w=0.01, mom=0.9, wd=0.0005):
    gpu0 = owl.create_gpu_device(0)
    owl.set_device(gpu0)
    num_weights = 8
    count = 0
    last = time.time()
    cropped_size = 224
    dp = ImageNetDataProvider(mean_file='/home/minjie/data/imagenet/imagenet_mean.binaryproto',
                              train_db='/home/minjie/data/imagenet/ilsvrc12_train_lmdb',
                              val_db='/home/minjie/data/imagenet/ilsvrc12_val_lmdb',
                              test_db='/home/minjie/data/imagenet/ilsvrc12_test_lmdb')
    # mark the output layer
    output_layer = 'prob'
    for i in xrange(num_epochs):
        print "---------------------Epoch #", i
        for (samples, labels) in dp.get_train_mb(minibatch_size, cropped_size):
            count = count + 1
            num_samples = samples.shape[0]
            data = owl.from_numpy(samples).reshape([cropped_size, cropped_size, 3, num_samples])
            target = owl.from_numpy(labels)
            model.ff(data, target)
            print_training_accuracy(model.layers[output_layer].get_act(), target, minibatch_size)
            model.bp(data, target)
            exit(0)
def train_network(model, num_epochs=100, minibatch_size=256, lr=0.01, mom=0.75, wd=5e-4):
    # load data
    (train_data, test_data) = mnist_io.load_mb_from_mat('mnist_all.mat', minibatch_size / len(gpu))
    num_test_samples = test_data[0].shape[0]
    test_samples = owl.from_numpy(test_data[0]).reshape([28, 28, 1, num_test_samples])
    test_labels = owl.from_numpy(test_data[1])
    for i in xrange(num_epochs):
        print "---Epoch #", i
        last = time.time()
        count = 0
        weightgrads = [None] * len(gpu)
        biasgrads = [None] * len(gpu)
        for (mb_samples, mb_labels) in train_data:
            count += 1
            current_gpu = count % len(gpu)
            owl.set_device(gpu[current_gpu])
            num_samples = mb_samples.shape[0]
            data = owl.from_numpy(mb_samples).reshape([28, 28, 1, num_samples])
            label = owl.from_numpy(mb_labels)
            out, weightgrads[current_gpu], biasgrads[current_gpu] = bpprop(model, data, label)
            if current_gpu == 0:
                for k in range(len(model.weights)):
                    model.weightdelta[k] = mom * model.weightdelta[k] - lr / num_samples / len(gpu) * multi_gpu_merge(weightgrads, 0, k) - lr * wd * model.weights[k]
                    model.biasdelta[k] = mom * model.biasdelta[k] - lr / num_samples / len(gpu) * multi_gpu_merge(biasgrads, 0, k)
                    model.weights[k] += model.weightdelta[k]
                    model.bias[k] += model.biasdelta[k]
                if count % (len(gpu) * lazy_cycle) == 0:
                    print_training_accuracy(out, label, num_samples, 'Training')
        print '---End of Epoch #', i, 'time:', time.time() - last
        # do test
        out, _, _ = bpprop(model, test_samples, test_labels)
        print_training_accuracy(out, test_labels, num_test_samples, 'Testing')
def train_network(model, num_epochs=100, minibatch_size=256, dropout_rate=0.5,
                  eps_w=0.01, eps_b=0.01, mom=0.9, wd=0.0005):
    gpu0 = owl.create_gpu_device(0)
    owl.set_device(gpu0)
    num_weights = 8
    count = 0
    last = time.time()
    dp = ImageNetDataProvider(mean_file='/home/yutian/data/config_file/google_model/imagenet_mean.binaryproto',
                              train_db='/home/yutian/data/imagenet/ilsvrc12_train_lmdb',
                              val_db='/home/yutian/data/imagenet/ilsvrc12_val_lmdb',
                              test_db='/home/yutian/data/imagenet/ilsvrc12_test_lmdb')
    for i in xrange(num_epochs):
        print "---------------------Epoch #", i
        for (samples, labels) in dp.get_train_mb(minibatch_size):
            count = count + 1
            num_samples = samples.shape[0]
            data = owl.from_numpy(samples).reshape([227, 227, 3, num_samples])
            target = owl.from_numpy(labels)
            out, weightsgrad, biasgrad = model.train_one_mb(data, target, dropout_rate)
            model.update(weightsgrad, biasgrad, num_samples, mom, eps_w, wd)
            if count % 4 == 0:
                print_training_accuracy(out, target, data.shape[-1])
                print "time: %s" % (time.time() - last)
                last = time.time()
def multi_gpu_merge(l, base, layer):
    if len(l) == 1:
        return l[0][layer]
    left = multi_gpu_merge(l[:len(l) / 2], base, layer)
    right = multi_gpu_merge(l[len(l) / 2:], base + len(l) / 2, layer)
    owl.set_device(base)
    return left + right
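For reference, the recursion above is a pairwise tree reduction over per-device gradient lists. The following is a minimal, self-contained sketch of the same shape using plain Python floats in place of owl arrays and device placement; `tree_merge` and `grads` are hypothetical names used only for illustration and are not part of the owl API.

def tree_merge(parts):
    # reduce a list of per-device gradients by repeated halving, mirroring
    # multi_gpu_merge: the two halves are independent, so on real devices
    # they can be evaluated in parallel before the final addition
    if len(parts) == 1:
        return parts[0]
    mid = len(parts) // 2
    return tree_merge(parts[:mid]) + tree_merge(parts[mid:])

grads = [1.0, 2.0, 3.0, 4.0]  # per-device gradients for one layer
assert tree_merge(grads) == 10.0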
def train_network(model, num_epochs=100, minibatch_size=256, lr=0.01, mom=0.75, wd=5e-4):
    # load data
    (train_data, test_data) = mnist_io.load_mb_from_mat('mnist_all.mat', minibatch_size / len(gpu))
    num_test_samples = test_data[0].shape[0]
    test_samples = owl.from_numpy(test_data[0]).reshape([28, 28, 1, num_test_samples])
    test_labels = owl.from_numpy(test_data[1])
    for i in xrange(num_epochs):
        print "---Epoch #", i
        last = time.time()
        count = 0
        weightgrads = [None] * len(gpu)
        biasgrads = [None] * len(gpu)
        for (mb_samples, mb_labels) in train_data:
            count += 1
            current_gpu = count % len(gpu)
            owl.set_device(gpu[current_gpu])
            num_samples = mb_samples.shape[0]
            data = owl.from_numpy(mb_samples).reshape([28, 28, 1, num_samples])
            label = owl.from_numpy(mb_labels)
            out, weightgrads[current_gpu], biasgrads[current_gpu] = bpprop(model, data, label)
            out.start_eval()
            if current_gpu == 0:
                for k in range(len(model.weights)):
                    model.weightdelta[k] = mom * model.weightdelta[k] - lr / num_samples / len(gpu) * multi_gpu_merge(weightgrads, 0, k) - lr * wd * model.weights[k]
                    model.biasdelta[k] = mom * model.biasdelta[k] - lr / num_samples / len(gpu) * multi_gpu_merge(biasgrads, 0, k)
                    model.weights[k] += model.weightdelta[k]
                    model.bias[k] += model.biasdelta[k]
                if count % (len(gpu) * lazy_cycle) == 0:
                    print_training_accuracy(out, label, num_samples, 'Training')
        print '---End of Epoch #', i, 'time:', time.time() - last
        # do test
        out, _, _ = bpprop(model, test_samples, test_labels)
        print_training_accuracy(out, test_labels, num_test_samples, 'Testing')
def train_network(model, num_epochs=100, minibatch_size=256, dropout_rate=0.5,
                  eps_w=0.01, eps_b=0.01, mom=0.9, wd=0.0005):
    num_layers = model.num_layers
    num_weights = model.num_weights
    last = time.time()
    num_samples = minibatch_size
    minibatch_size = minibatch_size / num_gpu
    for i in xrange(num_epochs):
        print "---------------------Epoch #", i
        weightsgrad = [[None] * num_weights for z in range(num_gpu)]
        biasgrad = [[None] * num_weights for z in range(num_gpu)]
        for j in xrange(1, 1024):
            count = j % num_gpu
            owl.set_device(gpu_array[count])
            data = owl.randn([227, 227, 3, minibatch_size], 0, 1)
            label = owl.randn([1, minibatch_size], 0, 1)
            out = train_one_mb(model, data, label, weightsgrad[count], biasgrad[count], dropout_rate)
            out.start_eval()
            if count == 0:
                # Update
                for k in range(num_weights):
                    for l in range(1, num_gpu):
                        weightsgrad[0][k] = weightsgrad[0][k] + weightsgrad[l][k]
                        biasgrad[0][k] = biasgrad[0][k] + biasgrad[l][k]
                    model.weightsdelta[k] = mom * model.weightsdelta[k] - eps_w / num_samples * (weightsgrad[0][k] + wd * model.weights[k])
                    model.biasdelta[k] = mom * model.biasdelta[k] - eps_b / num_samples * (biasgrad[0][k] + wd * model.bias[k])
                    model.weights[k] += model.weightsdelta[k]
                    model.bias[k] += model.biasdelta[k]
            if j % (lazy * num_gpu) == 0:
                print_training_accuracy(out, label, minibatch_size)
                print "time: %s" % (time.time() - last)
                last = time.time()
def train_network(model, num_epochs=100, minibatch_size=256, dropout_rate=0.5,
                  eps_w=0.01, eps_b=0.01, mom=0.9, wd=0.0005):
    num_layers = model.num_layers
    num_weights = model.num_weights
    last = time.time()
    minibatch_size = minibatch_size  # / num_gpu
    for i in xrange(num_epochs):
        print "---------------------Epoch #", i
        weightsgrad = [[None] * num_weights for z in range(num_gpu)]
        biasgrad = [[None] * num_weights for z in range(num_gpu)]
        for j in xrange(1, 1024):
            count = j % num_gpu
            owl.set_device(gpu_array[count])
            data = owl.randn([227, 227, 3, minibatch_size], 0, 1)
            label = owl.randn([1, minibatch_size], 0, 1)
            out = train_one_mb(model, data, label, weightsgrad[count], biasgrad[count], dropout_rate)
            for k in weightsgrad[count]:
                k.start_eval()
            for k in biasgrad[count]:
                k.start_eval()
            if count == 0:
                for k in range(0, num_gpu):
                    for l in weightsgrad[k]:
                        l.wait_for_eval()
                    for l in biasgrad[k]:
                        l.wait_for_eval()
                print "time: %s" % (time.time() - last)
                last = time.time()
def test_ones(self):
    owl.set_device(owl.create_mpi_device(1, 1))
    test = 0
    for i in range(1000):
        owl.zeros([10000, 10000])
    owl.wait_for_all()
    owl.print_profiler_result()
def multi_dev_merge(l, base, layer):
    if len(l) == 1:
        return l[0][layer]
    # print "pre-multi"
    left = multi_dev_merge(l[:len(l) / 2], base, layer)
    # print "post-left"
    right = multi_dev_merge(l[len(l) / 2:], base + len(l) / 2, layer)
    # print "post-right"
    owl.set_device(base)
    return left + right
def run(self):
    (train_data, test_data) = mnist_io.load_mb_from_mat(self.data_file, self.mb_size)
    np.set_printoptions(linewidth=200)
    num_test_samples = test_data[0].shape[0]
    (test_samples, test_labels) = map(lambda npdata: owl.from_numpy(npdata), test_data)
    count = 1
    owl.set_device(self.gpu)
    for epoch in range(self.num_epochs):
        print '---Start epoch #%d' % epoch
        # train
        for (mb_samples, mb_labels) in train_data:
            num_samples = mb_samples.shape[0]
            a1 = owl.from_numpy(mb_samples)
            target = owl.from_numpy(mb_labels)
            # ff
            a2 = ele.relu(self.w1 * a1 + self.b1)
            a3 = self.w2 * a2 + self.b2
            # softmax & error
            out = co.softmax(a3)
            s3 = out - target
            # bp
            s2 = self.w2.trans() * s3
            s2 = ele.relu_back(s2, a2)
            # grad
            gw1 = s2 * a1.trans() / num_samples
            gb1 = s2.sum(1) / num_samples
            gw2 = s3 * a2.trans() / num_samples
            gb2 = s3.sum(1) / num_samples
            # update
            self.w1 -= self.eps_w * gw1
            self.w2 -= self.eps_w * gw2
            self.b1 -= self.eps_b * gb1
            self.b2 -= self.eps_b * gb2
            if (count % 40 == 0):
                correct = out.argmax(0) - target.argmax(0)
                val = correct.to_numpy()
                print 'Training error:', float(np.count_nonzero(val)) / num_samples
            count = count + 1
        # test
        a1 = test_samples
        a2 = ele.relu(self.w1 * a1 + self.b1)
        a3 = self.w2 * a2 + self.b2
        correct = a3.argmax(0) - test_labels.argmax(0)
        val = correct.to_numpy()
        #print val
        print 'Testing error:', float(np.count_nonzero(val)) / num_test_samples
        print '---Finish epoch #%d' % epoch
def train_network(model, num_epochs=100, minibatch_size=256, dropout_rate=0.5,
                  eps_w=0.01, eps_b=0.01, mom=0.9, wd=0.0005):
    gpu0 = owl.create_gpu_device(0)
    gpu1 = owl.create_gpu_device(1)
    num_layers = 20
    num_weights = 8
    count = 0
    last = time.time()
    dp = ImageNetDataProvider(mean_file='/home/minjie/data/imagenet/imagenet_mean.binaryproto',
                              train_db='/home/minjie/data/imagenet/ilsvrc12_train_lmdb',
                              val_db='/home/minjie/data/imagenet/ilsvrc12_val_lmdb',
                              test_db='/home/minjie/data/imagenet/ilsvrc12_test_lmdb')
    minibatch_size = minibatch_size / 2
    for i in xrange(num_epochs):
        print "---------------------Epoch #", i
        for j in xrange(300):
            count = count + 1
            if count % 2 == 1:
                data1 = owl.from_nparray(samples).reshape([227, 227, 3, samples.shape[0]])
                label1 = owl.from_nparray(labels)
                #data1 = owl.randn([227, 227, 3, minibatch_size], 0, 1)
                #label1 = owl.randn([1, minibatch_size], 0, 1)
                weightsgrad1 = [None] * num_weights
                biasgrad1 = [None] * num_weights
                owl.set_device(gpu0)
                out1 = train_one_mb(model, data1, label1, weightsgrad1, biasgrad1, dropout_rate)
                out1.start_eval()
                continue
            if count % 2 == 0:
                data2 = owl.from_nparray(samples).reshape([227, 227, 3, samples.shape[0]])
                label2 = owl.from_nparray(labels)
                #data2 = owl.randn([227, 227, 3, minibatch_size], 0, 1)
                #label2 = owl.randn([1, minibatch_size], 0, 1)
                weightsgrad2 = [None] * num_weights
                biasgrad2 = [None] * num_weights
                num_samples = data1.shape[-1] + data2.shape[-1]
                owl.set_device(gpu1)
                out2 = train_one_mb(model, data2, label2, weightsgrad2, biasgrad2, dropout_rate)
                out2.start_eval()
                for k in range(num_weights):
                    model.weightsdelta[k] = mom * model.weightsdelta[k] - eps_w / num_samples * (weightsgrad1[k] + weightsgrad2[k] + wd * model.weights[k])
                    model.biasdelta[k] = mom * model.biasdelta[k] - eps_b / num_samples * (biasgrad1[k] + biasgrad2[k])
                    model.weights[k] += model.weightsdelta[k]
                    model.bias[k] += model.biasdelta[k]
                if count % 8 == 0:
                    print_training_accuracy(out1, label1, data1.shape[-1])
                    print "time: %s" % (time.time() - last)
                    last = time.time()
def train_network(model, num_epochs=100, minibatch_size=256, dropout_rate=0.5,
                  eps_w=0.01, eps_b=0.01, mom=0.9, wd=0.0005):
    gpu = [None] * 2
    gpu[0] = owl.create_gpu_device(0)
    gpu[1] = owl.create_gpu_device(1)
    num_layers = 20
    num_weights = 8
    count = 0
    last = time.time()
    dp = ImageNetDataProvider(mean_file='/home/minjie/data/imagenet/imagenet_mean.binaryproto',
                              train_db='/home/minjie/data/imagenet/ilsvrc12_train_lmdb',
                              val_db='/home/minjie/data/imagenet/ilsvrc12_val_lmdb',
                              test_db='/home/minjie/data/imagenet/ilsvrc12_test_lmdb')
    minibatch_size = minibatch_size / 2
    wgrad = [None] * 2
    bgrad = [None] * 2
    num_samples = 0
    for i in xrange(num_epochs):
        print "---------------------Epoch #", i
        for (samples, labels) in dp.get_train_mb(minibatch_size):
        #for j in range(300):
            count = count + 1
            gpuid = count % 2
            owl.set_device(gpu[gpuid])
            data = owl.from_numpy(samples).reshape([227, 227, 3, samples.shape[0]])
            label = owl.from_numpy(labels)
            #data = owl.randn([227, 227, 3, 128], 0.0, 0.01)
            #label = owl.randn([1000, 128], 0.0, 0.01)
            num_samples += data.shape[-1]
            (out, wgrad[gpuid], bgrad[gpuid]) = model.train_one_mb(data, label, dropout_rate)
            out.start_eval()
            if count % 2 != 0:
                continue
            for k in range(num_weights):
                wgrad[0][k] += wgrad[1][k]
                bgrad[0][k] += bgrad[1][k]
            model.update(wgrad[0], bgrad[0], num_samples, mom, eps_w, wd)
            if count % 8 == 0:
                print_training_accuracy(out, label, data.shape[-1])
                print "time: %s" % (time.time() - last)
                last = time.time()
            num_samples = 0
            wgrad = [None] * 2
            bgrad = [None] * 2
def train_network(model, num_epochs=100, minibatch_size=256, dropout_rate=0.5,
                  eps_w=0.01, eps_b=0.01, mom=0.9, wd=0.0005):
    gpu = [None] * 2
    gpu[0] = owl.create_gpu_device(0)
    gpu[1] = owl.create_gpu_device(1)
    num_layers = 20
    num_weights = 8
    count = 0
    last = time.time()
    dp = ImageNetDataProvider(mean_file='/home/yutian/data/config_file/google_model/imagenet_mean.binaryproto',
                              train_db='/home/yutian/data/imagenet/ilsvrc12_train_lmdb',
                              val_db='/home/yutian/data/imagenet/ilsvrc12_val_lmdb',
                              test_db='/home/yutian/data/imagenet/ilsvrc12_test_lmdb')
    minibatch_size = minibatch_size / 2
    wgrad = [None] * 2
    bgrad = [None] * 2
    num_samples = 0
    for i in xrange(num_epochs):
        print "---------------------Epoch #", i
        for (samples, labels) in dp.get_train_mb(minibatch_size):
        #for j in range(300):
            count = count + 1
            gpuid = count % 2
            owl.set_device(gpu[gpuid])
            data = owl.from_numpy(samples).reshape([227, 227, 3, samples.shape[0]])
            label = owl.from_numpy(labels)
            #data = owl.randn([227, 227, 3, 128], 0.0, 0.01)
            #label = owl.randn([1000, 128], 0.0, 0.01)
            num_samples += data.shape[-1]
            (out, wgrad[gpuid], bgrad[gpuid]) = model.train_one_mb(data, label, dropout_rate)
            if count % 2 != 0:
                continue
            for k in range(num_weights):
                wgrad[0][k] += wgrad[1][k]
                bgrad[0][k] += bgrad[1][k]
            model.update(wgrad[0], bgrad[0], num_samples, mom, eps_w, wd)
            if count % 8 == 0:
                print_training_accuracy(out, label, data.shape[-1])
                print "time: %s" % (time.time() - last)
                last = time.time()
            num_samples = 0
            wgrad = [None] * 2
            bgrad = [None] * 2
def run(self):
    (train_data, test_data) = mnist_io.load_mb_from_mat(self.data_file, self.mb_size)
    np.set_printoptions(linewidth=200)
    num_test_samples = test_data[0].shape[0]
    (test_samples, test_labels) = map(lambda npdata: owl.from_numpy(npdata), test_data)
    count = 1
    owl.set_device(self.gpu)
    for epoch in range(self.num_epochs):
        print '---Start epoch #%d' % epoch
        # train
        for (mb_samples, mb_labels) in train_data:
            num_samples = mb_samples.shape[0]
            a1 = owl.from_numpy(mb_samples)
            target = owl.from_numpy(mb_labels)
            # ff
            a2 = ele.relu(self.w1 * a1 + self.b1)
            a3 = self.w2 * a2 + self.b2
            # softmax & error
            out = co.softmax(a3)
            s3 = out - target
            # bp
            s2 = self.w2.trans() * s3
            s2 = ele.relu_back(s2, a2)
            # grad
            gw1 = s2 * a1.trans() / num_samples
            gb1 = s2.sum(1) / num_samples
            gw2 = s3 * a2.trans() / num_samples
            gb2 = s3.sum(1) / num_samples
            # update
            self.w1 -= self.eps_w * gw1
            self.w2 -= self.eps_w * gw2
            self.b1 -= self.eps_b * gb1
            self.b2 -= self.eps_b * gb2
            if (count % 40 == 0):
                correct = out.max_index(0) - target.max_index(0)
                val = correct.to_numpy()
                print 'Training error:', float(np.count_nonzero(val)) / num_samples
            count = count + 1
        # test
        a1 = test_samples
        a2 = ele.relu(self.w1 * a1 + self.b1)
        a3 = self.w2 * a2 + self.b2
        correct = a3.max_index(0) - test_labels.max_index(0)
        val = correct.to_numpy()
        #print val
        print 'Testing error:', float(np.count_nonzero(val)) / num_test_samples
        print '---Finish epoch #%d' % epoch
def train_network(model, num_epochs=100, minibatch_size=256, dropout_rate=0.5,
                  eps_w=0.01, eps_b=0.01, mom=0.9, wd=0.0005):
    gpu0 = owl.create_gpu_device(0)
    gpu1 = owl.create_gpu_device(1)
    num_layers = 20
    num_weights = 8
    count = 0
    last = time.time()
    # dp = ImageNetDataProvider(mean_file='/home/minjie/data/imagenet/imagenet_mean.binaryproto',
    #                           train_db='/home/minjie/data/imagenet/ilsvrc12_train_lmdb',
    #                           val_db='/home/minjie/data/imagenet/ilsvrc12_val_lmdb',
    #                           test_db='/home/minjie/data/imagenet/ilsvrc12_test_lmdb')
    for i in xrange(num_epochs):
        print "---------------------Epoch #", i
        for j in xrange(300):
            count = count + 1
            data = owl.randn([227, 227, 3, minibatch_size], 0, 1)
            label = owl.randn([1, minibatch_size], 0, 1)
            weightsgrad = [None] * num_weights
            biasgrad = [None] * num_weights
            num_samples = minibatch_size
            '''
            thisimg = samples[0, :]
            print thisimg
            imgdata = np.transpose(thisimg.reshape([3, 227*227])).reshape([227, 227, 3])
            print imgdata
            img = Image.fromarray(imgdata.astype(np.uint8))
            img.save('testimg.jpg', format='JPEG')
            exit(0)
            '''
            owl.set_device(gpu0)
            out = train_one_mb(model, data, label, weightsgrad, biasgrad, dropout_rate)
            for k in range(num_weights):
                model.weightsdelta[k] = mom * model.weightsdelta[k] - eps_w / num_samples * (weightsgrad[k] + wd * model.weights[k])
                model.biasdelta[k] = mom * model.biasdelta[k] - eps_b / num_samples * (biasgrad[k] + wd * model.bias[k])
                model.weights[k] += model.weightsdelta[k]
                model.weights[k].start_eval()
                model.bias[k] += model.biasdelta[k]
                model.bias[k].start_eval()
            if count % 3 == 0:
                print_training_accuracy(out, label, data.shape[-1])
                print "time: %s" % (time.time() - last)
                last = time.time()
def test(self):
    narrays = []
    n = 32
    exp = 0
    for i in range(n):
        narrays.append(owl.ones([1000, 1000]))
    j = 1
    while j <= n / 2:
        for i in range(0, n, j * 2):
            owl.set_device(hash(i) % len(cpumpitestinit.devices))
            narrays[i] = narrays[i] * narrays[i + j]
        j *= 2
        exp = exp * 2 + 1
    test = narrays[0]
    expected = np.ones([1000, 1000]) * math.pow(1000, exp)
    print 'Expected\n', expected
    print "Actual\n", test.to_numpy()
    self.assertTrue(np.allclose(expected, test.to_numpy()))
def train_network(filename, model, num_epochs=5, minibatch_size=256, lr=0.1, lr_decay=0.95, mom=0.9, wd=5e-4):
    # load data
    (train_data, test_data) = mnist_io.load_mb_from_mat(filename, minibatch_size / len(devs))
    num_test_samples = test_data[0].shape[0]
    test_samples = owl.from_numpy(test_data[0]).reshape([28, 28, 1, num_test_samples])
    test_labels = owl.from_numpy(test_data[1])
    for i in xrange(num_epochs):
        print "---Epoch #", i
        last = time.time()
        count = 0
        weightgrads = [None] * len(devs)
        biasgrads = [None] * len(devs)
        for (mb_samples, mb_labels) in train_data:
            count += 1
            current_dev = count % len(devs)
            owl.set_device(devs[current_dev])
            num_samples = mb_samples.shape[0]
            data = owl.from_numpy(mb_samples).reshape([28, 28, 1, num_samples])
            label = owl.from_numpy(mb_labels)
            #print "\t[{}]Train Data imported to minerva format".format(count)
            out, weightgrads[current_dev], biasgrads[current_dev] = bpprop(model, data, label)
            #print "\t[{}]Backprop complete".format(count)
            #print "dev {}".format(current_dev)
            if current_dev == 0:
                #print "pre-merge"
                for k in range(len(model.weights)):
                    model.weightdelta[k] = mom * model.weightdelta[k] - lr / num_samples / len(devs) * multi_dev_merge(weightgrads, 0, k) - lr * wd * model.weights[k]
                    #print "\t weight merge"
                    model.biasdelta[k] = mom * model.biasdelta[k] - lr / num_samples / len(devs) * multi_dev_merge(biasgrads, 0, k)
                    #print "\t bias merge"
                    model.weights[k] += model.weightdelta[k]
                    model.bias[k] += model.biasdelta[k]
                #print "post-merge"
                if count % (len(devs) * lazy_cycle) == 0:
                    print_training_accuracy(out, label, num_samples, 'Training ' + str(count))
                    owl.print_profiler_result()
        print '---End of Epoch #', i, 'time:', time.time() - last
        lr = lr * lr_decay
        # do test
        out, _, _ = bpprop(model, test_samples, test_labels)
        print_training_accuracy(out, test_labels, num_test_samples, 'Testing')
def train_network(model, num_epochs=100, minibatch_size=256, dropout_rate=0.5,
                  eps_w=0.01, eps_b=0.01, mom=0.9, wd=0.0005):
    num_layers = model.num_layers
    num_weights = model.num_weights
    last = time.time()
    num_samples = minibatch_size
    minibatch_size = minibatch_size / num_gpu
    dp = ImageNetDataProvider(mean_file='/home/minjie/data/imagenet/imagenet_mean.binaryproto',
                              train_db='/home/minjie/data/imagenet/ilsvrc12_train_lmdb',
                              val_db='/home/minjie/data/imagenet/ilsvrc12_val_lmdb',
                              test_db='/home/minjie/data/imagenet/ilsvrc12_test_lmdb')
    for i in xrange(num_epochs):
        print "---------------------Epoch #", i
        weightsgrad = [[None] * num_weights for z in range(num_gpu)]
        biasgrad = [[None] * num_weights for z in range(num_gpu)]
        j = 0
        for (samples, labels) in dp.get_train_mb(minibatch_size):
            j += 1
            count = j % num_gpu
            owl.set_device(gpu_array[count])
            data = owl.from_nparray(samples).reshape([227, 227, 3, minibatch_size])
            label = owl.from_nparray(labels)
            out = train_one_mb(model, data, label, weightsgrad[count], biasgrad[count], dropout_rate)
            # out.start_eval()
            if count == 0:
                # Update
                for k in range(num_weights):
                    for l in range(1, num_gpu):
                        weightsgrad[0][k] = weightsgrad[0][k] + weightsgrad[l][k]
                        biasgrad[0][k] = biasgrad[0][k] + biasgrad[l][k]
                    model.weightsdelta[k] = mom * model.weightsdelta[k] - eps_w / num_samples * (weightsgrad[0][k] + wd * model.weights[k])
                    model.biasdelta[k] = mom * model.biasdelta[k] - eps_b / num_samples * (biasgrad[0][k] + wd * model.bias[k])
                    model.weights[k] += model.weightsdelta[k]
                    model.weights[k].start_eval()
                    model.bias[k] += model.biasdelta[k]
                    model.bias[k].start_eval()
            if j % (lazy * num_gpu) == 0:
                print_training_accuracy(out, label, minibatch_size)
                print "time: %s" % (time.time() - last)
                last = time.time()
def test(self):
    # Expected
    cpu = owl.create_cpu_device()
    owl.set_device(cpu)
    img = np.arange(0, 32, dtype=np.float32)  # /32
    img = np.reshape(img, [1, 2, 4, 4])
    expected = np.asarray([[[5, 7],
                            [13, 15]],
                           [[21, 23],
                            [29, 31]]])  # /32.0
    #expected = np.asarray([[[110.25, 124.25],
    #                        [166.25, 180.25]],
    #                       [[278.25, 324.25],
    #                        [462.25, 508.25]]])
    # test
    owlimg = owl.from_numpy(img)
    pooler = owl.conv.Pooler(2, 2, 2, 2)
    test = pooler.ff(owlimg)
    print 'Expected\n', expected
    print "Actual\n", test.to_numpy()
    print "This test must be run with a fractional bit width of 12"
    self.assertTrue(np.allclose(expected, test.to_numpy(), atol=1.0 / (1 << 12) * 4))
def test(self):
    owl.set_device(d[3 % len(d)])
    a = owl.ones([1000, 900])
    owl.set_device(d[2 % len(d)])
    b = owl.ones([900, 1000])
    owl.set_device(d[1 % len(d)])
    test = a * b
    expected = np.ones([1000, 1000]) * 900
    #print 'Expected\n', expected
    #print "Actual\n", test.to_numpy()
    self.assertTrue(np.array_equal(expected, test.to_numpy()))
def test(self):
    owl.set_device(cpumpitestinit.devices[-3])
    a = owl.ones([20, 900])
    owl.set_device(cpumpitestinit.devices[-2])
    b = owl.ones([900, 800])
    owl.set_device(cpumpitestinit.devices[-1])
    test = a * b
    expected = np.ones([800, 20]) * 900
    #print 'Expected\n', expected
    #print "Actual\n", test.to_numpy()
    self.assertTrue(np.array_equal(expected, test.to_numpy()))
            label = owl.from_numpy(labels)
            #data = owl.randn([227, 227, 3, 128], 0.0, 0.01)
            #label = owl.randn([1000, 128], 0.0, 0.01)
            num_samples += data.shape[-1]
            (out, wgrad[gpuid], bgrad[gpuid]) = model.train_one_mb(data, label, dropout_rate)
            if count % 2 != 0:
                continue
            for k in range(num_weights):
                wgrad[0][k] += wgrad[1][k]
                bgrad[0][k] += bgrad[1][k]
            model.update(wgrad[0], bgrad[0], num_samples, mom, eps_w, wd)
            if count % 8 == 0:
                print_training_accuracy(out, label, data.shape[-1])
                print "time: %s" % (time.time() - last)
                last = time.time()
            num_samples = 0
            wgrad = [None] * 2
            bgrad = [None] * 2


if __name__ == '__main__':
    cpu = owl.create_cpu_device()
    owl.set_device(cpu)
    model = AlexModel()
    model.init_random()
    train_network(model)
cpu = owl.create_cpu_device()
print "owl: local CPU creation in rank {} with id {}".format(owl.rank(), cpu)
sys.stdout.flush()
print '''
 __ __ _ __ _ _____ ____ _ _ ___
/ | / | | | | \\ | | | ___| | _ \\ | | / /
/ | / |/ | | | | \\| | | |__ | |_| | | | / /
/ /| | / /| /| | | | | | | __| | / | |/ /
/ /_| | / / | / | | | | | |\\ | | |___ | |\\ \\ | / / ___
| /_/ |_/ |_| |_| |_| \\__| |_____| |_| \\_\\ |__/ /_/ |_|
'''
if owl.has_cuda():
    print owl.get_gpu_device_count()
    gpu = [owl.create_gpu_device(i) for i in range(owl.get_gpu_device_count())]
    print '[INFO] You have %d GPU devices' % len(gpu)
    print '[INFO] Set device to gpu[0]'
    owl.set_device(gpu[0])
    if owl.has_mpi():
        n = owl.get_mpi_node_count()
        for i in range(1, n):
            id = owl.create_mpi_device(i, 0)
            print "owl: created mpi cpu device on rank {} with id {}".format(i, id)
else:
    print '[INFO] CUDA disabled'
    print '[INFO] Set device to cpu'
    owl.set_device(cpu)
print "\nREADY FOR INPUT\n"
#print z.to_numpy()
#import IPython; IPython.start_ipython(argv=[])
def train_network(model, num_epochs=100, minibatch_size=256, dropout_rate=0.5,
                  eps_w=0.01, eps_b=0.01, mom=0.9, wd=0.0005):
    gpu = owl.create_gpu_device(1)
    owl.set_device(gpu)
    num_layers = 20
    count = 0
    last = time.time()
    dp = ImageNetDataProvider(
        mean_file='/home/minjie/data/imagenet/imagenet_mean.binaryproto',
        train_db='/home/minjie/data/imagenet/ilsvrc12_train_lmdb',
        val_db='/home/minjie/data/imagenet/ilsvrc12_val_lmdb',
        test_db='/home/minjie/data/imagenet/ilsvrc12_test_lmdb')
    acts = [None] * num_layers
    sens = [None] * num_layers
    for i in xrange(num_epochs):
        print "---------------------Epoch #", i
        sys.stdout.flush()
        for (samples, labels) in dp.get_train_mb(minibatch_size):
            num_samples = samples.shape[0]
            acts = [None] * num_layers
            sens = [None] * num_layers
            # FF
            acts[0] = owl.from_nparray(samples).reshape([227, 227, 3, num_samples])
            target = owl.from_nparray(labels)
            acts1 = conv_forward(acts[0], model.weights[0], model.bias[0], model.conv_infos[0])
            acts[1] = ele.relu(acts1)  # conv1
            acts[2] = pooling_forward(acts[1], model.pooling_infos[0])  # pool1
            acts3 = conv_forward(acts[2], model.weights[1], model.bias[1], model.conv_infos[1])
            acts[3] = ele.relu(acts3)  # conv2
            acts[4] = pooling_forward(acts[3], model.pooling_infos[1])  # pool2
            acts5 = conv_forward(acts[4], model.weights[2], model.bias[2], model.conv_infos[2])
            acts[5] = ele.relu(acts5)  # conv3
            acts6 = conv_forward(acts[5], model.weights[3], model.bias[3], model.conv_infos[3])
            acts[6] = ele.relu(acts6)  # conv4
            acts7 = conv_forward(acts[6], model.weights[4], model.bias[4], model.conv_infos[4])
            acts[7] = ele.relu(acts7)  # conv5
            acts[8] = pooling_forward(acts[7], model.pooling_infos[2])  # pool5
            re_acts8 = acts[8].reshape([np.prod(acts[8].shape[0:3]), num_samples])
            acts9 = model.weights[5] * re_acts8 + model.bias[5]  # fc6
            acts[9] = ele.relu(acts9)
            mask6 = owl.randb(acts[9].shape, dropout_rate)
            acts[9] = ele.mult(acts[9], mask6)  # drop6
            acts10 = model.weights[6] * acts[9] + model.bias[6]  # fc7
            acts[10] = ele.relu(acts10)
            mask7 = owl.randb(acts[10].shape, dropout_rate)
            acts[10] = ele.mult(acts[10], mask7)  # drop7
            acts[11] = model.weights[7] * acts[10] + model.bias[7]  # fc8
            acts[12] = softmax_forward(acts[11].reshape([1000, 1, 1, num_samples]), soft_op.instance).reshape([1000, num_samples])  # prob
            # error
            sens[11] = acts[12] - target
            # BP
            sens[10] = model.weights[7].trans() * sens[11]  # fc8
            sens[10] = ele.mult(sens[10], mask7)  # drop7
            sens[10] = ele.relu_back(sens[10], acts[10], acts10)  # relu7
            sens[9] = model.weights[6].trans() * sens[10]
            sens[9] = ele.mult(sens[9], mask6)  # drop6
            sens[9] = ele.relu_back(sens[9], acts[9], acts9)  # relu6
            sens[8] = (model.weights[5].trans() * sens[9]).reshape(acts[8].shape)  # fc6
            sens[7] = pooling_backward(sens[8], acts[8], acts[7], model.pooling_infos[2])  # pool5
            sens[7] = ele.relu_back(sens[7], acts[7], acts7)  # relu5
            sens[6] = conv_backward_data(sens[7], model.weights[4], model.conv_infos[4])  # conv5
            sens[6] = ele.relu_back(sens[6], acts[6], acts6)  # relu4
            sens[5] = conv_backward_data(sens[6], model.weights[3], model.conv_infos[3])  # conv4
            sens[5] = ele.relu_back(sens[5], acts[5], acts5)  # relu3
            sens[4] = conv_backward_data(sens[5], model.weights[2], model.conv_infos[2])  # conv3
            sens[3] = pooling_backward(sens[4], acts[4], acts[3], model.pooling_infos[1])  # pool2
            sens[3] = ele.relu_back(sens[3], acts[3], acts3)  # relu2
            sens[2] = conv_backward_data(sens[3], model.weights[1], model.conv_infos[1])  # conv2
            sens[1] = pooling_backward(sens[2], acts[2], acts[1], model.pooling_infos[0])  # pool1
            sens[1] = ele.relu_back(sens[1], acts[1], acts1)  # relu1
            model.weightsdelta[7] = mom * model.weightsdelta[7] - eps_w / num_samples * (sens[11] * acts[10].trans() + wd * model.weights[7])
            model.biasdelta[7] = mom * model.biasdelta[7] - eps_b / num_samples * sens[11].sum(1)
            model.weightsdelta[6] = mom * model.weightsdelta[6] - eps_w / num_samples * (sens[10] * acts[9].trans() + wd * model.weights[6])
            model.biasdelta[6] = mom * model.biasdelta[6] - eps_b / num_samples * sens[10].sum(1)
            model.weightsdelta[5] = mom * model.weightsdelta[5] - eps_w / num_samples * (sens[9] * re_acts8.trans() + wd * model.weights[5])
            model.biasdelta[5] = mom * model.biasdelta[5] - eps_b / num_samples * sens[9].sum(1)
            model.weightsdelta[4] = mom * model.weightsdelta[4] - eps_w / num_samples * (conv_backward_filter(sens[7], acts[6], model.conv_infos[4]) + wd * model.weights[4])
            model.biasdelta[4] = mom * model.biasdelta[4] - eps_b / num_samples * conv_backward_bias(sens[7])
            model.weightsdelta[3] = mom * model.weightsdelta[3] - eps_w / num_samples * (conv_backward_filter(sens[6], acts[5], model.conv_infos[3]) + wd * model.weights[3])
            model.biasdelta[3] = mom * model.biasdelta[3] - eps_b / num_samples * conv_backward_bias(sens[6])
            model.weightsdelta[2] = mom * model.weightsdelta[2] - eps_w / num_samples * (conv_backward_filter(sens[5], acts[4], model.conv_infos[2]) + wd * model.weights[2])
            model.biasdelta[2] = mom * model.biasdelta[2] - eps_b / num_samples * conv_backward_bias(sens[5])
            model.weightsdelta[1] = mom * model.weightsdelta[1] - eps_w / num_samples * (conv_backward_filter(sens[3], acts[2], model.conv_infos[1]) + wd * model.weights[1])
            model.biasdelta[1] = mom * model.biasdelta[1] - eps_b / num_samples * conv_backward_bias(sens[3])
            model.weightsdelta[0] = mom * model.weightsdelta[0] - eps_w / num_samples * (conv_backward_filter(sens[1], acts[0], model.conv_infos[0]) + wd * model.weights[0])
            model.biasdelta[0] = mom * model.biasdelta[0] - eps_b / num_samples * conv_backward_bias(sens[1])
            for k in range(8):
                model.weights[k] += model.weightsdelta[k]
                model.bias[k] += model.biasdelta[k]
            count = count + 1
            if count % 10 == 0:
                print_training_accuracy(acts[12], target, num_samples)
                print "time: %s" % (time.time() - last)
                last = time.time()
from operator import mul
import matplotlib.pyplot as plt

# The file containing the data, in the format of one vector per line, space-separated floats
DATAFILE = "????"

if __name__ == "__main__":
    # Set up Minerva
    cpu = owl.create_cpu_device()
    if owl.get_gpu_device_count() > 0:
        dev = owl.create_gpu_device(0)
    else:
        dev = cpu
    owl.set_device(dev)
    # load data
    gzfile = gzip.GzipFile('/home/jlovitt/storage/mnist/mnist.dat', 'rb')
    # discard stored variable name
    pickle.load(gzfile)
    data = pickle.load(gzfile)
    #data = np.loadtxt(DATAFILE, dtype=np.float32, delimiter=" ")
    #data = data - np.mean(data, 0)
    #data = data / np.var(data, 0)
    data = data / 255.0
    # training parameters
    epsilon = 0.01
    momentum = 0.9
import owl

devices = []
devices.append(owl.create_cpu_device())
if owl.has_mpi():
    n = owl.get_mpi_node_count()
    for i in range(1, n):
        id = owl.create_mpi_device(i, 0)
        devices.append(id)
owl.set_device(devices[-1])
def run(self):
    (train_data, test_data) = imageio.load_mb_from_mat(self.data_file, self.mb_size)
    np.set_printoptions(linewidth=200)
    num_test_samples = test_data[0].shape[0]
    (test_samples, test_labels) = map(lambda npdata: owl.from_nparray(npdata), test_data)
    count = 1
    for epoch in range(self.num_epochs):
        print '---Start epoch #%d' % epoch
        # train
        for (mb_samples, mb_labels) in train_data:
            num_samples = mb_samples.shape[0]
            owl.set_device(self.cpu)
            a1 = owl.from_nparray(mb_samples)
            target = owl.from_nparray(mb_labels)
            owl.set_device(self.gpu)
            # ff
            a2 = owl.elewise.sigmoid((self.w1 * a1).norm_arithmetic(self.b1, owl.op.add))
            a3 = owl.elewise.sigmoid((self.w2 * a2).norm_arithmetic(self.b2, owl.op.add))
            # softmax & error
            out = owl.softmax(a3)
            s3 = out - target
            # bp
            s3 = owl.elewise.mult(s3, 1 - s3)
            s2 = self.w2.trans() * s3
            s2 = owl.elewise.mult(s2, 1 - s2)
            # grad
            gw1 = s2 * a1.trans() / num_samples
            gb1 = s2.sum(1) / num_samples
            gw2 = s3 * a2.trans() / num_samples
            gb2 = s3.sum(1) / num_samples
            # update
            self.w1 -= self.eps_w * gw1
            self.w2 -= self.eps_w * gw2
            self.b1 -= self.eps_b * gb1
            self.b2 -= self.eps_b * gb2
            if (count % 40 == 0):
                correct = out.max_index(0) - target.max_index(0)
                val = correct.tolist()
                print 'Training error:', (float(num_samples) - val.count(0.0)) / num_samples
                # test
                a1 = test_samples
                a2 = owl.elewise.sigmoid((self.w1 * a1).norm_arithmetic(self.b1, owl.op.add))
                a3 = owl.elewise.sigmoid((self.w2 * a2).norm_arithmetic(self.b2, owl.op.add))
                correct = a3.max_index(0) - test_labels.max_index(0)
                val = correct.tolist()
                #print val
                print 'Testing error:', (float(num_test_samples) - val.count(0.0)) / num_test_samples
            count = count + 1
        # test
        #a1 = test_samples
        #a2 = owl.elewise.sigmoid((self.w1 * a1).norm_arithmetic(self.b1, owl.op.add))
        #a3 = owl.elewise.sigmoid((self.w2 * a2).norm_arithmetic(self.b2, owl.op.add))
        #out = owl.softmax(a3)
        #correct = out.max_index(0) - test_labels.max_index(0)
        #val = correct.tolist()
        #print 'Testing error:', (float(num_test_samples) - val.count(0.0)) / num_test_samples
        print '---Finish epoch #%d' % epoch
def gradient_checker(s, checklayer_name):
    ''' Check backpropagation on multiple GPUs
    '''
    h = 1e-2
    threshold = 1e-4
    checklayer = s.owl_net.units[s.owl_net.name_to_uid[checklayer_name][0]]
    losslayer = []
    for i in xrange(len(s.owl_net.units)):
        if isinstance(s.owl_net.units[i], net.SoftmaxUnit):
            losslayer.append(i)
    last = None
    '''
    wunits = []
    for i in xrange(len(s.owl_net.units)):
        if isinstance(s.owl_net.units[i], net.WeightedComputeUnit):
            wunits.append(i)
    '''
    wunits = s.owl_net.get_weighted_unit_ids()
    accunits = s.owl_net.get_accuracy_units()
    owl.set_device(s.gpu[0])
    for iteridx in range(100):
        # disturb the weights
        oriweight = checklayer.weight
        npweight = checklayer.weight.to_numpy()
        weightshape = np.shape(npweight)
        npweight = npweight.reshape(np.prod(weightshape[0:len(weightshape)]))
        position = np.random.randint(0, np.shape(npweight)[0])
        disturb = np.zeros(np.shape(npweight), dtype=np.float32)
        disturb[position] = h
        oriposval = npweight[position]
        npweight += disturb
        newposval = npweight[position]
        npweight = npweight.reshape(weightshape)
        checklayer.weight = owl.from_numpy(npweight)
        # get disturbed loss
        all_loss = 0
        s.owl_net.forward_check()
        for i in range(len(losslayer)):
            if len(s.owl_net.units[losslayer[i]].loss_weight) == 1:
                all_loss += (s.owl_net.units[losslayer[i]].getloss() * s.owl_net.units[losslayer[i]].loss_weight[0])
            else:
                all_loss += s.owl_net.units[losslayer[i]].getloss()
        # get origin loss
        checklayer.weight = oriweight
        ori_all_loss = 0
        s.owl_net.forward_check()
        for i in range(len(losslayer)):
            if len(s.owl_net.units[losslayer[i]].loss_weight) == 1:
                ori_all_loss += (s.owl_net.units[losslayer[i]].getloss() * s.owl_net.units[losslayer[i]].loss_weight[0])
            else:
                ori_all_loss += s.owl_net.units[losslayer[i]].getloss()
        s.owl_net.backward('TEST')
        # get analytic gradient
        npgrad = checklayer.weightgrad.to_numpy()
        npgrad = npgrad.reshape(np.prod(weightshape[0:len(weightshape)]))
        analy_grad = npgrad[position] / s.owl_net.units[losslayer[i]].out.shape[1]
        num_grad = (all_loss - ori_all_loss) / h
        info = "Gradient check at position: %d analy: %f num: %f ratio: %f" % (position, analy_grad, num_grad, analy_grad / num_grad)
        print info
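The finite-difference rule this checker relies on can be exercised in isolation. Below is a minimal numpy sketch under assumed toy definitions: `loss` and `grad` are hypothetical stand-ins for a network's loss and analytic gradient (they are not part of owl_net), and the perturb-and-compare steps mirror `position`, `analy_grad`, and `num_grad` above.

import numpy as np

def loss(w, x, y):
    # toy squared-error loss: L = 0.5 * ||w.dot(x) - y||^2
    return 0.5 * np.sum((w.dot(x) - y) ** 2)

def grad(w, x, y):
    # analytic gradient of the toy loss w.r.t. w
    return np.outer(w.dot(x) - y, x)

h = 1e-4
w = np.random.randn(3, 5)
x = np.random.randn(5)
y = np.random.randn(3)
i, j = 1, 2                        # position to disturb, like `position` above
wp = w.copy()
wp[i, j] += h                      # disturb one weight by h
num_grad = (loss(wp, x, y) - loss(w, x, y)) / h
analy_grad = grad(w, x, y)[i, j]
print 'analy: %f num: %f ratio: %f' % (analy_grad, num_grad, analy_grad / num_grad)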
            label = owl.from_nparray(labels)
            out = train_one_mb(model, data, label, weightsgrad[count], biasgrad[count], dropout_rate)
            # out.start_eval()
            if count == 0:
                # Update
                for k in range(num_weights):
                    for l in range(1, num_gpu):
                        weightsgrad[0][k] = weightsgrad[0][k] + weightsgrad[l][k]
                        biasgrad[0][k] = biasgrad[0][k] + biasgrad[l][k]
                    model.weightsdelta[k] = mom * model.weightsdelta[k] - eps_w / num_samples * (weightsgrad[0][k] + wd * model.weights[k])
                    model.biasdelta[k] = mom * model.biasdelta[k] - eps_b / num_samples * (biasgrad[0][k] + wd * model.bias[k])
                    model.weights[k] += model.weightsdelta[k]
                    model.weights[k].start_eval()
                    model.bias[k] += model.biasdelta[k]
                    model.bias[k].start_eval()
            if j % (lazy * num_gpu) == 0:
                print_training_accuracy(out, label, minibatch_size)
                print "time: %s" % (time.time() - last)
                last = time.time()


if __name__ == '__main__':
    owl.initialize(sys.argv)
    owl.create_cpu_device()
    for i in range(num_gpu):
        gpu_array.append(owl.create_gpu_device(i))
    owl.set_device(gpu_array[0])
    model = AlexModel()
    model.init_random()
    train_network(model)
def run(s):
    ''' Run the training algorithm on multiple GPUs

    The basic logic is similar to the traditional single-GPU training code as
    follows (pseudo-code)::

        for epoch in range(MAX_EPOCH):
            for i in range(NUM_MINI_BATCHES):
                # load i^th minibatch
                minibatch = loader.load(i, MINI_BATCH_SIZE)
                net.ff(minibatch.data)
                net.bp(minibatch.label)
                grad = net.gradient()
                net.update(grad, MINI_BATCH_SIZE)

    With Minerva's lazy evaluation and dataflow engine, we are able to modify
    the above logic to perform data parallelism on multiple GPUs (pseudo-code)::

        for epoch in range(MAX_EPOCH):
            for i in range(0, NUM_MINI_BATCHES, NUM_GPU):
                gpu_grad = [None for i in range(NUM_GPU)]
                for gpuid in range(NUM_GPU):
                    # specify which gpu the following code runs on
                    owl.set_device(gpuid)
                    # each minibatch is split among the GPUs
                    minibatch = loader.load(i + gpuid, MINI_BATCH_SIZE / NUM_GPU)
                    net.ff(minibatch.data)
                    net.bp(minibatch.label)
                    gpu_grad[gpuid] = net.gradient()
                net.accumulate_and_update(gpu_grad, MINI_BATCH_SIZE)

    So each GPU takes charge of one *mini-mini batch* of training, and since
    their ``ff``, ``bp`` and ``gradient`` calculations are independent of each
    other, they can be parallelized naturally by Minerva's DAG engine.

    The only remaining problem is the ``accumulate_and_update`` of the
    gradients from all GPUs. If we did it on a single GPU, that GPU would
    become a bottleneck. The solution is to also partition this workload
    across the GPUs (pseudo-code)::

        def accumulate_and_update(gpu_grad, MINI_BATCH_SIZE):
            num_layers = len(gpu_grad[0])
            for layer in range(num_layers):
                # specify which gpu updates this layer
                upd_gpu = layer * NUM_GPU / num_layers
                owl.set_device(upd_gpu)
                for gid in range(NUM_GPU):
                    if gid != upd_gpu:
                        gpu_grad[upd_gpu][layer] += gpu_grad[gid][layer]
                net.update_layer(layer, gpu_grad[upd_gpu][layer], MINI_BATCH_SIZE)

    Since the update of each layer is independent of the others, the updates
    can be parallelized as well. Minerva's dataflow engine transparently
    handles the dependency resolution, scheduling and memory copying among
    different devices, so users don't need to care about that.
    '''
    wgrad = [[] for i in range(s.num_gpu)]
    bgrad = [[] for i in range(s.num_gpu)]
    last = time.time()
    wunits = s.owl_net.get_weighted_unit_ids()
    last_start = time.time()
    for iteridx in range(s.snapshot * s.owl_net.solver.snapshot, s.owl_net.solver.max_iter):
        # get the learning rate
        if s.owl_net.solver.lr_policy == "poly":
            s.owl_net.current_lr = s.owl_net.base_lr * pow(1 - float(iteridx) / s.owl_net.solver.max_iter, s.owl_net.solver.power)
        elif s.owl_net.solver.lr_policy == "step":
            s.owl_net.current_lr = s.owl_net.base_lr * pow(s.owl_net.solver.gamma, iteridx / s.owl_net.solver.stepsize)
        # train on multi-gpu
        for gpuid in range(s.num_gpu):
            owl.set_device(s.gpu[gpuid])
            s.owl_net.forward('TRAIN')
            s.owl_net.backward('TRAIN')
            for wid in wunits:
                wgrad[gpuid].append(s.owl_net.units[wid].weightgrad)
                bgrad[gpuid].append(s.owl_net.units[wid].biasgrad)
        # weight update
        for i in range(len(wunits)):
            wid = wunits[i]
            upd_gpu = i * s.num_gpu / len(wunits)
            owl.set_device(s.gpu[upd_gpu])
            for gid in range(s.num_gpu):
                if gid == upd_gpu:
                    continue
                wgrad[upd_gpu][i] += wgrad[gid][i]
                bgrad[upd_gpu][i] += bgrad[gid][i]
            s.owl_net.units[wid].weightgrad = wgrad[upd_gpu][i]
            s.owl_net.units[wid].biasgrad = bgrad[upd_gpu][i]
            s.owl_net.update(wid)
        if iteridx % 2 == 0:
            owl.wait_for_all()
            thistime = time.time() - last
            print "Finished training %d minibatch (time: %s)" % (iteridx, thistime)
            last = time.time()
        wgrad = [[] for i in range(s.num_gpu)]  # reset gradients
        bgrad = [[] for i in range(s.num_gpu)]
        # decide whether to display loss
        if (iteridx + 1) % (s.owl_net.solver.display) == 0:
            lossunits = s.owl_net.get_loss_units()
            for lu in lossunits:
                print "Training Loss %s: %f" % (lu.name, lu.getloss())
        # decide whether to test
        if (iteridx + 1) % (s.owl_net.solver.test_interval) == 0:
            acc_num = 0
            test_num = 0
            for testiteridx in range(s.owl_net.solver.test_iter[0]):
                s.owl_net.forward('TEST')
                all_accunits = s.owl_net.get_accuracy_units()
                accunit = all_accunits[len(all_accunits) - 1]
                #accunit = all_accunits[0]
                test_num += accunit.batch_size
                acc_num += (accunit.batch_size * accunit.acc)
                print "Accuracy the %d mb: %f" % (testiteridx, accunit.acc)
                sys.stdout.flush()
            print "Testing Accuracy: %f" % (float(acc_num) / test_num)
        # decide whether to save model
        if (iteridx + 1) % (s.owl_net.solver.snapshot) == 0:
            print "Save to snapshot %d, current lr %f" % ((iteridx + 1) / (s.owl_net.solver.snapshot), s.owl_net.current_lr)
            s.builder.save_net_to_file(s.owl_net, s.snapshot_dir, (iteridx + 1) / (s.owl_net.solver.snapshot))
        sys.stdout.flush()
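The `accumulate_and_update` partitioning described in the docstring above can also be checked without owl at all. This is a minimal numpy sketch under assumed toy shapes; `gpu_grad`, `NUM_GPU`, and `NUM_LAYERS` are illustrative names only, and the assertion verifies the property the partitioning relies on: pinning each layer's reduction to one device slot still yields the plain sum over devices.

import numpy as np

NUM_GPU, NUM_LAYERS = 4, 6
# hypothetical per-GPU, per-layer gradients
gpu_grad = [[np.random.randn(3, 3) for _ in range(NUM_LAYERS)]
            for _ in range(NUM_GPU)]

merged = [None] * NUM_LAYERS
for layer in range(NUM_LAYERS):
    # each layer's reduction is assigned to one "GPU", spreading the
    # accumulation work instead of funneling it all through device 0
    upd_gpu = layer * NUM_GPU / NUM_LAYERS  # integer division in Python 2
    acc = gpu_grad[upd_gpu][layer]
    for gid in range(NUM_GPU):
        if gid != upd_gpu:
            acc = acc + gpu_grad[gid][layer]
    merged[layer] = acc

# sanity check: the partitioned reduction equals a straight sum over GPUs
for layer in range(NUM_LAYERS):
    assert np.allclose(merged[layer], sum(g[layer] for g in gpu_grad))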
def __init__(self, solver_file, snapshot, gpu_idx=0):
    self.solver_file = solver_file
    self.snapshot = snapshot
    self.gpu = owl.create_gpu_device(gpu_idx)
    owl.set_device(self.gpu)
def setfpga():
    owl.set_device(devices[-1])
def __init__(self, solver_file, snapshot, gpu_idx=0):
    self.solver_file = solver_file
    self.snapshot = snapshot
    self.gpu = owl.create_gpu_device(gpu_idx)
    owl.set_device(self.gpu)
def train_network_n(n, model, num_epochs=100, minibatch_size=40, dropout_rate=0.5,
                    eps_w=0.0001, eps_b=0.0002, mom=0.9, wd=0.0005):
    gpus = []
    for i in range(0, n):
        gpus.append(owl.create_gpu_device(i))
    count = 0
    last = time.time()
    dp = ImageNetDataProvider(
        mean_file='./VGGmodel/vgg_mean.binaryproto',
        train_db='/home/minjie/data/imagenet/ilsvrc12_train_lmdb',
        val_db='/home/minjie/data/imagenet/ilsvrc12_val_lmdb',
        test_db='/home/minjie/data/imagenet/ilsvrc12_test_lmdb')
    minibatch_size = minibatch_size / n
    correct = 0
    rerun = False
    startepoch = 0
    curepoch = startepoch
    data = [None] * n
    label = [None] * n
    out = [None] * n
    biasgrad = [None] * n
    weightsgrad = [None] * n
    for i in range(startepoch, num_epochs):
        print "---------------------Epoch %d Index %d" % (curepoch, i)
        sys.stdout.flush()
        batchidx = 0
        count = 0
        loadmodel(i, model)
        for (samples, labels) in dp.get_train_mb(minibatch_size, 224):
            count = count + 1
            data[count - 1] = owl.from_numpy(samples).reshape([224, 224, 3, samples.shape[0]])
            label[count - 1] = owl.from_numpy(labels)
            biasgrad[count - 1] = [None] * (model.num_layers - 1)
            weightsgrad[count - 1] = [None] * (model.num_layers - 1)
            owl.set_device(gpus[count - 1])
            out[count - 1] = train_one_mb(model, data[count - 1], label[count - 1],
                                          weightsgrad[count - 1], biasgrad[count - 1])
            out[count - 1].start_eval()
            if count % n > 0:
                continue
            totalweightsgrad = [None] * (model.num_layers - 1)
            totalbiasgrad = [None] * (model.num_layers - 1)
            num_samples = 0
            for gpuidx in range(0, n):
                num_samples += data[gpuidx].shape[-1]
                for k in range(model.num_layers - 1):
                    if model.ff_infos[k]['ff_type'] == 'conv' or model.ff_infos[k]['ff_type'] == 'fully':
                        if gpuidx == 0:
                            totalweightsgrad[k] = weightsgrad[gpuidx][k]
                            totalbiasgrad[k] = biasgrad[gpuidx][k]
                        else:
                            totalweightsgrad[k] += weightsgrad[gpuidx][k]
                            totalbiasgrad[k] += biasgrad[gpuidx][k]
            for k in range(model.num_layers - 1):
                if model.ff_infos[k]['ff_type'] == 'conv' or model.ff_infos[k]['ff_type'] == 'fully':
                    model.weightsdelta[k] = mom * model.weightsdelta[k] - eps_w / num_samples * (totalweightsgrad[k] + wd * num_samples * model.weights[k])
                    model.biasdelta[k] = mom * model.biasdelta[k] - eps_b / num_samples * totalbiasgrad[k]
                    model.weights[k] += model.weightsdelta[k]
                    model.bias[k] += model.biasdelta[k]
            #print num_samples
            if count % n == 0:
                print 'batch %d' % (batchidx)
                batchidx = batchidx + 1
                '''
                #TODO hack
                if batchidx == 2000:
                    savemodel(i+1, model)
                    exit(0)
                '''
                thiscorrect = print_training_accuracy(out[0], label[0], data[0].shape[-1])
                print "time: %s" % (time.time() - last)
                last = time.time()
                count = 0
        savemodel(i + 1, model)
def check_weight_2gpu(owl_net, checklayer, gpu):
    h = 1e-2
    threshold = 1e-4
    wunits = get_weights_id(owl_net)
    wgrad = []
    bgrad = []
    for iteridx in range(10):
        # disturb the weights
        oriweight = checklayer.weight
        npweight = checklayer.weight.to_numpy()
        weightshape = np.shape(npweight)
        npweight = npweight.reshape(np.prod(weightshape[0:len(weightshape)]))
        print np.shape(npweight)
        position = np.random.randint(0, np.shape(npweight)[0])
        print position
        disturb = np.zeros(np.shape(npweight), dtype=np.float32)
        disturb[position] = h
        oriposval = npweight[position]
        npweight += disturb
        newposval = npweight[position]
        npweight = npweight.reshape(weightshape)
        checklayer.weight = owl.from_numpy(npweight)
        # get disturbed loss
        owl_net.forward('TRAIN')
        all_loss = 0
        for i in xrange(len(losslayer)):
            all_loss += owl_net.units[losslayer[i]].getloss()
        all_loss = all_loss / owl_net.batch_size  #+ 0.5 * owl_net.base_weight_decay * newposval * newposval
        # get origin loss
        checklayer.weight = oriweight
        owl_net.forward('TRAIN')
        ori_all_loss = 0
        for i in xrange(len(losslayer)):
            ori_all_loss += owl_net.units[losslayer[i]].getloss()
        ori_all_loss = ori_all_loss / owl_net.batch_size  #+ 0.5 * owl_net.base_weight_decay * oriposval * oriposval
        # analytic gradient, accumulated over both GPUs
        owl.set_device(gpu[0])
        owl_net.forward('TRAIN')
        owl_net.backward('TRAIN')
        for wid in wunits:
            wgrad.append(owl_net.units[wid].weightgrad)
            bgrad.append(owl_net.units[wid].biasgrad)
        owl.set_device(gpu[1])
        owl_net.forward('TRAIN')
        owl_net.backward('TRAIN')
        for i in range(len(wunits)):
            wid = wunits[i]
            owl_net.units[wid].weightgrad += wgrad[i]
            owl_net.units[wid].biasgrad += bgrad[i]
        wgrad = []
        bgrad = []
        # get analytic gradient
        npgrad = checklayer.weightgrad.to_numpy()
        npgrad = npgrad.reshape(np.prod(weightshape[0:len(weightshape)]))
        analy_grad = npgrad[position] / owl_net.batch_size / len(gpu)
        print all_loss
        print ori_all_loss
        num_grad = (all_loss - ori_all_loss) / h
        diff = np.abs(analy_grad - num_grad)
        info = "analy: %f num: %f ratio: %f" % (analy_grad, num_grad, analy_grad / num_grad)
        print info
def run(s):
    wgrad = [[] for i in range(s.num_gpu)]
    bgrad = [[] for i in range(s.num_gpu)]
    last = time.time()
    wunits = s.owl_net.get_weighted_unit_ids()
    last_start = time.time()
    for iteridx in range(s.snapshot * s.owl_net.solver.snapshot, s.owl_net.solver.max_iter):
        # get the learning rate
        if s.owl_net.solver.lr_policy == "poly":
            s.owl_net.current_lr = s.owl_net.base_lr * pow(1 - float(iteridx) / s.owl_net.solver.max_iter, s.owl_net.solver.power)
        elif s.owl_net.solver.lr_policy == "step":
            s.owl_net.current_lr = s.owl_net.base_lr * pow(s.owl_net.solver.gamma, iteridx / s.owl_net.solver.stepsize)
        # train on multi-gpu
        for gpuid in range(s.num_gpu):
            owl.set_device(s.gpu[gpuid])
            s.owl_net.forward('TRAIN')
            s.owl_net.backward('TRAIN')
            for wid in wunits:
                wgrad[gpuid].append(s.owl_net.units[wid].weightgrad)
                bgrad[gpuid].append(s.owl_net.units[wid].biasgrad)
        # weight update
        for i in range(len(wunits)):
            wid = wunits[i]
            upd_gpu = i * s.num_gpu / len(wunits)
            owl.set_device(s.gpu[upd_gpu])
            for gid in range(s.num_gpu):
                if gid == upd_gpu:
                    continue
                wgrad[upd_gpu][i] += wgrad[gid][i]
                bgrad[upd_gpu][i] += bgrad[gid][i]
            s.owl_net.units[wid].weightgrad = wgrad[upd_gpu][i]
            s.owl_net.units[wid].biasgrad = bgrad[upd_gpu][i]
            s.owl_net.update(wid)
        #s.owl_net.weight_update(num_gpu = s.num_gpu)
        if iteridx % 2 == 0:
            s.owl_net.wait_for_eval_loss()
            thistime = time.time() - last
            print "Finished training %d minibatch (time: %s)" % (iteridx, thistime)
            last = time.time()
        #s.owl_net.units[wunits[0]].weight.wait_for_eval()
        wgrad = [[] for i in range(s.num_gpu)]  # reset gradients
        bgrad = [[] for i in range(s.num_gpu)]
        # decide whether to display loss
        if (iteridx + 1) % (s.owl_net.solver.display) == 0:
            lossunits = s.owl_net.get_loss_units()
            for lu in lossunits:
                print "Training Loss %s: %f" % (lu.name, lu.getloss())
        # decide whether to test
        #if True:
        if (iteridx + 1) % (s.owl_net.solver.test_interval) == 0:
            acc_num = 0
            test_num = 0
            for testiteridx in range(s.owl_net.solver.test_iter[0]):
                s.owl_net.forward('TEST')
                all_accunits = s.owl_net.get_accuracy_units()
                accunit = all_accunits[len(all_accunits) - 1]
                #accunit = all_accunits[0]
                print accunit.name
                test_num += accunit.batch_size
                acc_num += (accunit.batch_size * accunit.acc)
                print "Accuracy the %d mb: %f" % (testiteridx, accunit.acc)
                sys.stdout.flush()
            print "Testing Accuracy: %f" % (float(acc_num) / test_num)
        # decide whether to save model
        if (iteridx + 1) % (s.owl_net.solver.snapshot) == 0:
            print "Save to snapshot %d, current lr %f" % ((iteridx + 1) / (s.owl_net.solver.snapshot), s.owl_net.current_lr)
            s.builder.save_net_to_file(s.owl_net, s.snapshot_dir, (iteridx + 1) / (s.owl_net.solver.snapshot))
        sys.stdout.flush()
            if tanhC_version:
                Hout[t] = ele.mult(act_og[t], ele.tanh(C[t]))
            else:
                Hout[t] = ele.mult(act_og[t], C[t])
            Y = softmax(model.decoder_weights * Hout[t] + model.decoder_bias)

            # evaluation
            output = Y.to_numpy()
            # Can directly get a single element from Y
            # print output[0, sent[t]]
            sent_ll += math.log(max(output[0, sent[t]], 1e-20), 2)
        test_ll += sent_ll

    test_ent = test_ll * (-1) / words
    test_ppl = 2 ** test_ent
    print "Test PPL =", test_ppl


if __name__ == '__main__':
    owl.initialize(sys.argv)
    gpu = owl.create_gpu_device(1)
    owl.set_device(gpu)
    model, train_sents, test_sents, train_words, test_words = LSTM_init()
    learning_rate = 0.1
    for i in range(5):
        model, learning_rate = LSTM_train(model, train_sents, train_words, learning_rate, 1)
        LSTM_test(model, test_sents, test_words)
        out, _, _ = bpprop(model, test_samples, test_labels)
        print_training_accuracy(out, test_labels, num_test_samples, 'Testing')


def multi_gpu_merge(l, base, layer):
    if len(l) == 1:
        return l[0][layer]
    left = multi_gpu_merge(l[:len(l) / 2], base, layer)
    right = multi_gpu_merge(l[len(l) / 2:], base + len(l) / 2, layer)
    owl.set_device(base)
    return left + right


if __name__ == '__main__':
    owl.initialize(sys.argv)
    parser = argparse.ArgumentParser(description='MNIST CNN')
    parser.add_argument('-n', '--num', help='number of GPUs to use', action='store', type=int, default=1)
    args = parser.parse_args()
    assert (1 <= args.num)
    print 'Using %d GPU(s)' % args.num
    gpu = [owl.create_gpu_device(i) for i in range(args.num)]
    owl.set_device(gpu[0])
    model = MNISTCNNModel()
    model.init_random()
    train_network(model)
import owl
import sys

owl.initialize(sys.argv)
owl.create_cpu_device()
gpu0 = owl.create_gpu_device(0)
gpu1 = owl.create_gpu_device(1)
owl.set_device(gpu0)
def train_network(model, num_epochs=100, minibatch_size=256, dropout_rate=0.5,
                  eps_w=0.01, eps_b=0.01, mom=0.9, wd=0.0005):
    gpu = owl.create_gpu_device(1)
    owl.set_device(gpu)
    num_layers = 20
    count = 0
    last = time.time()
    dp = ImageNetDataProvider(mean_file='/home/minjie/data/imagenet/imagenet_mean.binaryproto',
                              train_db='/home/minjie/data/imagenet/ilsvrc12_train_lmdb',
                              val_db='/home/minjie/data/imagenet/ilsvrc12_val_lmdb',
                              test_db='/home/minjie/data/imagenet/ilsvrc12_test_lmdb')
    acts = [None] * num_layers
    sens = [None] * num_layers
    for i in xrange(num_epochs):
        print "---------------------Epoch #", i
        sys.stdout.flush()
        for (samples, labels) in dp.get_train_mb(minibatch_size):
            num_samples = samples.shape[0]
            acts = [None] * num_layers
            sens = [None] * num_layers
            '''
            thisimg = samples[0, :]
            print thisimg
            imgdata = np.transpose(thisimg.reshape([3, 227*227])).reshape([227, 227, 3])
            print imgdata
            img = Image.fromarray(imgdata.astype(np.uint8))
            img.save('testimg.jpg', format='JPEG')
            exit(0)
            '''
            # FF
            acts[0] = owl.from_nparray(samples).reshape([227, 227, 3, num_samples])
            #print np.array(acts[0].tolist())[0:227*227*3]
            target = owl.from_nparray(labels)
            #np.set_printoptions(linewidth=200)
            #print acts[0].shape, model.weights[0].shape, model.bias[0].shape
            #im = np.array(acts[0].tolist()).reshape([num_samples, 227, 227, 3])
            #print im[0,:,:,0]
            #print im[0,:,:,1]
            #print im[0,:,:,2]
            #print target.max_index(0).tolist()[0:20]
            #sys.exit()
            acts1 = conv_forward(acts[0], model.weights[0], model.bias[0], model.conv_infos[0])
            acts[1] = ele.relu(acts1)  # conv1
            acts[2] = pooling_forward(acts[1], model.pooling_infos[0])  # pool1
            acts3 = conv_forward(acts[2], model.weights[1], model.bias[1], model.conv_infos[1])
            acts[3] = ele.relu(acts3)  # conv2
            acts[4] = pooling_forward(acts[3], model.pooling_infos[1])  # pool2
            acts5 = conv_forward(acts[4], model.weights[2], model.bias[2], model.conv_infos[2])
            acts[5] = ele.relu(acts5)  # conv3
            acts6 = conv_forward(acts[5], model.weights[3], model.bias[3], model.conv_infos[3])
            acts[6] = ele.relu(acts6)  # conv4
            acts7 = conv_forward(acts[6], model.weights[4], model.bias[4], model.conv_infos[4])
            acts[7] = ele.relu(acts7)  # conv5
            acts[8] = pooling_forward(acts[7], model.pooling_infos[2])  # pool5
            re_acts8 = acts[8].reshape([np.prod(acts[8].shape[0:3]), num_samples])
            acts9 = model.weights[5] * re_acts8 + model.bias[5]  # fc6
            acts[9] = ele.relu(acts9)
            mask6 = owl.randb(acts[9].shape, dropout_rate)
            acts[9] = ele.mult(acts[9], mask6)  # drop6
            acts10 = model.weights[6] * acts[9] + model.bias[6]  # fc7
            acts[10] = ele.relu(acts10)
            mask7 = owl.randb(acts[10].shape, dropout_rate)
            acts[10] = ele.mult(acts[10], mask7)  # drop7
            acts[11] = model.weights[7] * acts[10] + model.bias[7]  # fc8
            acts[12] = softmax_forward(acts[11].reshape([1000, 1, 1, num_samples]), soft_op.instance).reshape([1000, num_samples])  # prob
            # error
            sens[11] = acts[12] - target
            # BP
            sens[10] = model.weights[7].trans() * sens[11]  # fc8
            sens[10] = ele.mult(sens[10], mask7)  # drop7
            sens[10] = ele.relu_back(sens[10], acts[10], acts10)  # relu7
            sens[9] = model.weights[6].trans() * sens[10]
            sens[9] = ele.mult(sens[9], mask6)  # drop6
            sens[9] = ele.relu_back(sens[9], acts[9], acts9)  # relu6
            sens[8] = (model.weights[5].trans() * sens[9]).reshape(acts[8].shape)  # fc6
            sens[7] = pooling_backward(sens[8], acts[8], acts[7], model.pooling_infos[2])  # pool5
            sens[7] = ele.relu_back(sens[7], acts[7], acts7)  # relu5
            sens[6] = conv_backward_data(sens[7], model.weights[4], model.conv_infos[4])  # conv5
            sens[6] = ele.relu_back(sens[6], acts[6], acts6)  # relu4
            sens[5] = conv_backward_data(sens[6], model.weights[3], model.conv_infos[3])  # conv4
            sens[5] = ele.relu_back(sens[5], acts[5], acts5)  # relu3
            sens[4] = conv_backward_data(sens[5], model.weights[2], model.conv_infos[2])  # conv3
            sens[3] = pooling_backward(sens[4], acts[4], acts[3], model.pooling_infos[1])  # pool2
            sens[3] = ele.relu_back(sens[3], acts[3], acts3)  # relu2
            sens[2] = conv_backward_data(sens[3], model.weights[1], model.conv_infos[1])  # conv2
            sens[1] = pooling_backward(sens[2], acts[2], acts[1], model.pooling_infos[0])  # pool1
            sens[1] = ele.relu_back(sens[1], acts[1], acts1)  # relu1
            model.weightsdelta[7] = mom * model.weightsdelta[7] - eps_w / num_samples * (sens[11] * acts[10].trans() + wd * model.weights[7])
            model.biasdelta[7] = mom * model.biasdelta[7] - eps_b / num_samples * (sens[11].sum(1) + wd * model.bias[7])
            model.weightsdelta[6] = mom * model.weightsdelta[6] - eps_w / num_samples * (sens[10] * acts[9].trans() + wd * model.weights[6])
            model.biasdelta[6] = mom * model.biasdelta[6] - eps_b / num_samples * (sens[10].sum(1) + wd * model.bias[6])
            model.weightsdelta[5] = mom * model.weightsdelta[5] - eps_w / num_samples * (sens[9] * re_acts8.trans() + wd * model.weights[5])
            model.biasdelta[5] = mom * model.biasdelta[5] - eps_b / num_samples * (sens[9].sum(1) + wd * model.bias[5])
            model.weightsdelta[4] = mom * model.weightsdelta[4] - eps_w / num_samples * (conv_backward_filter(sens[7], acts[6], model.conv_infos[4]) + wd * model.weights[4])
            model.biasdelta[4] = mom * model.biasdelta[4] - eps_b / num_samples * (conv_backward_bias(sens[7]) + wd * model.bias[4])
            model.weightsdelta[3] = mom * model.weightsdelta[3] - eps_w / num_samples * (conv_backward_filter(sens[6], acts[5], model.conv_infos[3]) + wd * model.weights[3])
            model.biasdelta[3] = mom * model.biasdelta[3] - eps_b / num_samples * (conv_backward_bias(sens[6]) + wd * model.bias[3])
            model.weightsdelta[2] = mom * model.weightsdelta[2] - eps_w / num_samples * (conv_backward_filter(sens[5], acts[4], model.conv_infos[2]) + wd * model.weights[2])
            model.biasdelta[2] = mom * model.biasdelta[2] - eps_b / num_samples * (conv_backward_bias(sens[5]) + wd * model.bias[2])
            model.weightsdelta[1] = mom * model.weightsdelta[1] - eps_w / num_samples * (conv_backward_filter(sens[3], acts[2], model.conv_infos[1]) + wd * model.weights[1])
            model.biasdelta[1] = mom * model.biasdelta[1] - eps_b / num_samples * (conv_backward_bias(sens[3]) + wd * model.bias[1])
            model.weightsdelta[0] = mom * model.weightsdelta[0] - eps_w / num_samples * (conv_backward_filter(sens[1], acts[0], model.conv_infos[0]) + wd * model.weights[0])
            model.biasdelta[0] = mom * model.biasdelta[0] - eps_b / num_samples * (conv_backward_bias(sens[1]) + wd * model.bias[0])
            for k in range(8):
                model.weights[k] += model.weightsdelta[k]
                model.bias[k] += model.biasdelta[k]
            count = count + 1
            #if count % 2 == 0:
            #    acts[18].start_eval()
            if count % 10 == 0:
                print_training_accuracy(acts[12], target, num_samples)
                print "time: %s" % (time.time() - last)
                last = time.time()
# prepare the net and solver
builder = CaffeNetBuilder(sys.argv[1], sys.argv[2])
owl_net = net.Net()
builder.build_net(owl_net)
builder.init_net_from_file(owl_net, sys.argv[3])
accunitname = sys.argv[4]
last = time.time()
wunits = get_weights_id(owl_net)
print len(wunits)
wgrad = [[] for i in xrange(4)]
bgrad = [[] for i in xrange(4)]
for iteridx in range(owl_net.solver.max_iter):
    gpuidx = iteridx % 4
    owl.set_device(gpu[gpuidx])
    owl_net.forward('TRAIN')
    owl_net.backward('TRAIN')
    for wid in wunits:
        wgrad[gpuidx].append(owl_net.units[wid].weightgrad)
        bgrad[gpuidx].append(owl_net.units[wid].biasgrad)
    owl_net.get_units_by_name(accunitname)[0].ff_y.start_eval()
    if (iteridx + 1) % 2 == 0:
        for i in range(len(wunits)):
            wid = wunits[i]
            wgrad[gpuidx][i] += wgrad[gpuidx - 1][i]
            bgrad[gpuidx][i] += bgrad[gpuidx - 1][i]
    if (iteridx + 1) % 4 == 0:
bgrad = [[] for i in xrange(num_gpu)]
for iteridx in range(startsnapshot * owl_net.solver.snapshot, owl_net.solver.max_iter):
    # get the learning rate
    if owl_net.solver.lr_policy == "poly":
        owl_net.current_lr = owl_net.base_lr * pow(1 - float(iteridx) / owl_net.solver.max_iter, owl_net.solver.power)
    elif owl_net.solver.lr_policy == "step":
        owl_net.current_lr = owl_net.base_lr * pow(owl_net.solver.gamma, iteridx / owl_net.solver.step)
    # train on multi-gpu
    for gpuid in range(0, num_gpu):
        owl.set_device(gpuid)
        owl_net.forward('TRAIN')
        owl_net.backward('TRAIN')
        for wid in wunits:
            wgrad[gpuid].append(owl_net.units[wid].weightgrad)
            bgrad[gpuid].append(owl_net.units[wid].biasgrad)
        owl_net.get_units_by_name(evallayername)[0].ff_y.start_eval()
        if gpuid % 2 == 1:
            for i in range(len(wunits)):
                wid = wunits[i]
                wgrad[gpuid][i] += wgrad[gpuid - 1][i]
                bgrad[gpuid][i] += bgrad[gpuid - 1][i]
        if gpuid == 3: