def test(self):
    # Expected
    in_channels = 3
    in_dim = 11
    out_channels = 5
    out_dim = (in_dim/2 + 1)
    img = np.arange(0, in_dim*in_dim*in_channels*1, dtype=np.float32)
    img = np.reshape(img, [in_dim, in_dim, in_channels, 1])
    filter = np.arange(0, 3*3*in_channels*out_channels, dtype=np.float32)
    filter = np.reshape(filter, [3, 3, in_channels, out_channels])
    bias = np.zeros([5])
    expected = np.zeros([out_dim, out_dim, out_channels])
    for och in range(out_channels):
        tmp = np.zeros([out_dim, out_dim, 1])
        for ich in range(in_channels):
            imgslice = np.reshape(img[:,:,ich,0], [in_dim, in_dim])
            filterslice = np.reshape(filter[:,:,ich,och], [3,3])
            tmp += np.reshape(convolve(imgslice, filterslice, mode='constant', cval = 0.0)[::2,::2], [out_dim, out_dim, 1])
        expected[:,:,och] = np.squeeze(tmp) + bias[och]
    # test
    owlimg = owl.from_numpy(np.transpose(img))
    owlfilter = owl.from_numpy(np.transpose(filter))
    owlbias = owl.from_numpy(bias)
    convolver = owl.conv.Convolver(1,1,2,2)
    test = convolver.ff(owlimg, owlfilter, owlbias)
    print 'Expected\n', expected
    print "Actual\n", test.to_numpy()
    self.assertTrue(np.allclose(expected, test.to_numpy()))
def init_weights_with_filler(self):
    ''' Init weights & bias. The function will be called during weight initialization.

    Currently, four types of initializers are supported: ``"constant", "gaussian", "uniform", "xavier"``.
    '''
    #init weight
    npweights = None
    if self.weight_filler.type == "constant":
        npweights = np.ones(self.wshape, dtype = np.float32) * self.weight_filler.value
    elif self.weight_filler.type == "gaussian":
        npweights = np.random.normal(self.weight_filler.mean, self.weight_filler.std, self.wshape)
    elif self.weight_filler.type == "uniform":
        npweights = np.random.uniform(self.weight_filler.min, self.weight_filler.max, self.wshape)
    elif self.weight_filler.type == "xavier":
        fan_in = np.prod(self.in_shape[:])
        scale = np.sqrt(float(3)/fan_in)
        npweights = np.random.uniform(-scale, scale, self.wshape)
    self.weight = owl.from_numpy(npweights.astype(np.float32)).reshape(self.wshape)
    #init bias
    npbias = None
    if self.bias_filler.type == "constant":
        npbias = np.ones(self.bshape, dtype = np.float32) * self.bias_filler.value
    elif self.bias_filler.type == "gaussian":
        npbias = np.random.normal(self.bias_filler.mean, self.bias_filler.std, self.bshape)
    elif self.bias_filler.type == "uniform":
        npbias = np.random.uniform(self.bias_filler.min, self.bias_filler.max, self.bshape)
    elif self.bias_filler.type == "xavier":
        fan_in = np.prod(self.in_shape[:])
        scale = np.sqrt(float(3)/fan_in)
        npbias = np.random.uniform(-scale, scale, self.bshape)
    self.bias = owl.from_numpy(npbias.astype(np.float32)).reshape(self.bshape)
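# A minimal sketch (not part of the unit above) of the "xavier" branch of
# init_weights_with_filler: the scale is sqrt(3 / fan_in) where fan_in is the
# product of the input shape. The layer sizes below are illustrative
# assumptions only.
import numpy as np
fan_in = 3 * 3 * 64                          # assumed in_shape product
scale = np.sqrt(float(3) / fan_in)           # roughly 0.072 for this fan_in
example_w = np.random.uniform(-scale, scale, [3, 3, 64, 128]).astype(np.float32)
print 'xavier scale:', scale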
def train_network(model, num_epochs=100, minibatch_size=256, lr=0.01, mom=0.75, wd=5e-4):
    # load data
    (train_data, test_data) = mnist_io.load_mb_from_mat('mnist_all.mat', minibatch_size / len(gpu))
    num_test_samples = test_data[0].shape[0]
    test_samples = owl.from_numpy(test_data[0]).reshape([28, 28, 1, num_test_samples])
    test_labels = owl.from_numpy(test_data[1])
    for i in xrange(num_epochs):
        print "---Epoch #", i
        last = time.time()
        count = 0
        weightgrads = [None] * len(gpu)
        biasgrads = [None] * len(gpu)
        for (mb_samples, mb_labels) in train_data:
            count += 1
            current_gpu = count % len(gpu)
            owl.set_device(gpu[current_gpu])
            num_samples = mb_samples.shape[0]
            data = owl.from_numpy(mb_samples).reshape([28, 28, 1, num_samples])
            label = owl.from_numpy(mb_labels)
            out, weightgrads[current_gpu], biasgrads[current_gpu] = bpprop(model, data, label)
            if current_gpu == 0:
                for k in range(len(model.weights)):
                    model.weightdelta[k] = mom * model.weightdelta[k] - lr / num_samples / len(gpu) * multi_gpu_merge(weightgrads, 0, k) - lr * wd * model.weights[k]
                    model.biasdelta[k] = mom * model.biasdelta[k] - lr / num_samples / len(gpu) * multi_gpu_merge(biasgrads, 0, k)
                    model.weights[k] += model.weightdelta[k]
                    model.bias[k] += model.biasdelta[k]
                if count % (len(gpu) * lazy_cycle) == 0:
                    print_training_accuracy(out, label, num_samples, 'Training')
        print '---End of Epoch #', i, 'time:', time.time() - last
        # do test
        out, _, _ = bpprop(model, test_samples, test_labels)
        print_training_accuracy(out, test_labels, num_test_samples, 'Testing')
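# A plain-numpy sketch of the update rule used in train_network above:
# delta = mom * delta - (lr / batch) * merged_grad - lr * wd * w, then w += delta.
# The arrays and hyper-parameters here are illustrative assumptions only.
import numpy as np
lr, mom, wd, batch = 0.01, 0.75, 5e-4, 256
w = np.zeros([10], dtype=np.float32)
delta = np.zeros_like(w)
grad = np.ones_like(w)                       # stand-in for the merged gradient
delta = mom * delta - lr / batch * grad - lr * wd * w
w += delta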
def train_network(model, num_epochs = 100, minibatch_size=256,
        dropout_rate = 0.5, eps_w = 0.01, eps_b = 0.01, mom = 0.9, wd = 0.0005):
    gpu0 = owl.create_gpu_device(0)
    owl.set_device(gpu0)
    num_weights = 8
    count = 0
    last = time.time()
    dp = ImageNetDataProvider(mean_file='/home/yutian/data/config_file/google_model/imagenet_mean.binaryproto',
            train_db='/home/yutian/data/imagenet/ilsvrc12_train_lmdb',
            val_db='/home/yutian/data/imagenet/ilsvrc12_val_lmdb',
            test_db='/home/yutian/data/imagenet/ilsvrc12_test_lmdb')
    for i in xrange(num_epochs):
        print "---------------------Epoch #", i
        for (samples, labels) in dp.get_train_mb(minibatch_size):
            count = count + 1
            num_samples = samples.shape[0]
            data = owl.from_numpy(samples).reshape([227, 227, 3, num_samples])
            target = owl.from_numpy(labels)
            out, weightsgrad, biasgrad = model.train_one_mb(data, target, dropout_rate)
            model.update(weightsgrad, biasgrad, num_samples, mom, eps_w, wd)
            if count % 4 == 0:
                print_training_accuracy(out, target, data.shape[-1])
                print "time: %s" % (time.time() - last)
                last = time.time()
def init_weights_with_filler(self):
    #init weight
    npweights = None
    if self.weight_filler.type == "constant":
        npweights = np.ones(self.wshape, dtype = np.float32) * self.weight_filler.value
    elif self.weight_filler.type == "gaussian":
        npweights = np.random.normal(self.weight_filler.mean, self.weight_filler.std, self.wshape)
    elif self.weight_filler.type == "uniform":
        npweights = np.random.uniform(self.weight_filler.min, self.weight_filler.max, self.wshape)
    elif self.weight_filler.type == "xavier":
        fan_in = np.prod(self.in_shape[:])
        scale = np.sqrt(float(3)/fan_in)
        npweights = np.random.uniform(-scale, scale, self.wshape)
    self.weight = owl.from_numpy(npweights.astype(np.float32)).reshape(self.wshape)
    #init bias
    npbias = None
    if self.bias_filler.type == "constant":
        npbias = np.ones(self.bshape, dtype = np.float32) * self.bias_filler.value
    elif self.bias_filler.type == "gaussian":
        npbias = np.random.normal(self.bias_filler.mean, self.bias_filler.std, self.bshape)
    elif self.bias_filler.type == "uniform":
        npbias = np.random.uniform(self.bias_filler.min, self.bias_filler.max, self.bshape)
    elif self.bias_filler.type == "xavier":
        fan_in = np.prod(self.in_shape[:])
        scale = np.sqrt(float(3)/fan_in)
        npbias = np.random.uniform(-scale, scale, self.bshape)
    self.bias = owl.from_numpy(npbias.astype(np.float32)).reshape(self.bshape)
def train_network(model, num_epochs = 100, minibatch_size=10,
        dropout_rate = 0.5, eps_w = 0.01, mom = 0.9, wd = 0.0005):
    gpu0 = owl.create_gpu_device(0)
    owl.set_device(gpu0)
    num_weights = 8
    count = 0
    last = time.time()
    cropped_size = 224
    dp = ImageNetDataProvider(mean_file='/home/minjie/data/imagenet/imagenet_mean.binaryproto',
            train_db='/home/minjie/data/imagenet/ilsvrc12_train_lmdb',
            val_db='/home/minjie/data/imagenet/ilsvrc12_val_lmdb',
            test_db='/home/minjie/data/imagenet/ilsvrc12_test_lmdb')
    #mark the output layer
    output_layer = 'prob'
    for i in xrange(num_epochs):
        print "---------------------Epoch #", i
        for (samples, labels) in dp.get_train_mb(minibatch_size, cropped_size):
            count = count + 1
            num_samples = samples.shape[0]
            data = owl.from_numpy(samples).reshape([cropped_size, cropped_size, 3, num_samples])
            target = owl.from_numpy(labels)
            model.ff(data, target)
            print_training_accuracy(model.layers[output_layer].get_act(), target, minibatch_size)
            model.bp(data, target)
            exit(0)
def test(self):
    bottom = np.asarray([2,-1,0,1,2,3], np.float32)
    top = np.asarray([0,0,0,1,2,3], np.float32)
    top_diff = np.asarray([0.1,0.1,0.1,0.1,0.1,0.1], np.float32)
    print top_diff.shape
    expected = np.asarray([0,0,0,0.1,0.1,0.1], np.float32)
    owldiff = owl.from_numpy(top_diff)
    owltop = owl.from_numpy(top)
    test = elewise.relu_back(owldiff, owltop)
    #print 'Expected\n',expected
    #print "Actual\n",test.to_numpy()
    self.assertTrue(np.allclose(expected, test.to_numpy()))
def train_network(model, num_epochs = 100, minibatch_size=256, dropout_rate = 0.5, eps_w = 0.01, eps_b = 0.01, mom = 0.9, wd = 0.0005): gpu = [None] * 2 gpu[0] = owl.create_gpu_device(0) gpu[1] = owl.create_gpu_device(1) num_layers = 20 num_weights = 8 count = 0 last = time.time() dp = ImageNetDataProvider(mean_file='/home/yutian/data/config_file/google_model/imagenet_mean.binaryproto', train_db='/home/yutian/data/imagenet/ilsvrc12_train_lmdb', val_db='/home/yutian/data/imagenet/ilsvrc12_val_lmdb', test_db='/home/yutian/data/imagenet/ilsvrc12_test_lmdb') minibatch_size = minibatch_size / 2 wgrad = [None] * 2 bgrad = [None] * 2 num_samples = 0 for i in xrange(num_epochs): print "---------------------Epoch #", i for (samples, labels) in dp.get_train_mb(minibatch_size): #for j in range(300): count = count + 1 gpuid = count % 2 owl.set_device(gpu[gpuid]) data = owl.from_numpy(samples).reshape([227, 227, 3, samples.shape[0]]) label = owl.from_numpy(labels) #data = owl.randn([227, 227, 3, 128], 0.0, 0.01) #label = owl.randn([1000, 128], 0.0, 0.01) num_samples += data.shape[-1] (out, wgrad[gpuid], bgrad[gpuid]) = model.train_one_mb(data, label, dropout_rate) if count % 2 != 0: continue for k in range(num_weights): wgrad[0][k] += wgrad[1][k] bgrad[0][k] += bgrad[1][k] model.update(wgrad[0], bgrad[0], num_samples, mom, eps_w, wd) if count % 8 == 0: print_training_accuracy(out, label, data.shape[-1]) print "time: %s" % (time.time() - last) last = time.time() num_samples = 0 wgrad = [None] * 2 bgrad = [None] * 2
def run(self):
    (train_data, test_data) = mnist_io.load_mb_from_mat(self.data_file, self.mb_size)
    np.set_printoptions(linewidth=200)
    num_test_samples = test_data[0].shape[0]
    (test_samples, test_labels) = map(lambda npdata : owl.from_numpy(npdata), test_data)
    count = 1
    owl.set_device(self.gpu)
    for epoch in range(self.num_epochs):
        print '---Start epoch #%d' % epoch
        # train
        for (mb_samples, mb_labels) in train_data:
            num_samples = mb_samples.shape[0]
            a1 = owl.from_numpy(mb_samples)
            target = owl.from_numpy(mb_labels)
            # ff
            a2 = ele.relu(self.w1 * a1 + self.b1)
            a3 = self.w2 * a2 + self.b2
            # softmax & error
            out = co.softmax(a3)
            s3 = out - target
            # bp
            s2 = self.w2.trans() * s3
            s2 = ele.relu_back(s2, a2)
            # grad
            gw1 = s2 * a1.trans() / num_samples
            gb1 = s2.sum(1) / num_samples
            gw2 = s3 * a2.trans() / num_samples
            gb2 = s3.sum(1) / num_samples
            # update
            self.w1 -= self.eps_w * gw1
            self.w2 -= self.eps_w * gw2
            self.b1 -= self.eps_b * gb1
            self.b2 -= self.eps_b * gb2
            if (count % 40 == 0):
                correct = out.max_index(0) - target.max_index(0)
                val = correct.to_numpy()
                print 'Training error:', float(np.count_nonzero(val)) / num_samples
            count = count + 1
        # test
        a1 = test_samples
        a2 = ele.relu(self.w1 * a1 + self.b1)
        a3 = self.w2 * a2 + self.b2
        correct = a3.max_index(0) - test_labels.max_index(0)
        val = correct.to_numpy()
        #print val
        print 'Testing error:', float(np.count_nonzero(val)) / num_test_samples
        print '---Finish epoch #%d' % epoch
def forward(self, from_btm, to_top, phase):
    ''' Feed-forward of the data unit will get a batch of a fixed batch_size from the data provider.

    .. note::
        Phase indicates whether it's training or testing. Usually, the data augmentation
        operation for training involves some randomness, while testing doesn't.
    '''
    if self.generator == None:
        self.generator = self.dp.get_mb(phase)
    while True:
        try:
            (samples, labels) = next(self.generator)
            if len(labels) == 0:
                (samples, labels) = next(self.generator)
        except StopIteration:
            print 'Have scanned the whole dataset; start from the beginning again'
            self.generator = self.dp.get_mb(phase)
            continue
        break
    to_top[self.top_names[0]] = owl.from_numpy(samples).reshape(
        [self.crop_size, self.crop_size, 3, samples.shape[0]])
    #may have multiple labels
    for i in range(1, len(self.top_names)):
        to_top[self.top_names[i]] = labels[:,i - 1]
def forward(self, from_btm, to_top, phase):
    ''' Feed-forward operation may vary according to phase.

    .. note::
        The LMDB data provider now supports multi-view testing: if phase is "MULTI_VIEW",
        it will produce 10 consecutive batches of different views of the same original image.
    '''
    if self.generator == None:
        if phase == 'TRAIN' or phase == 'TEST':
            self.generator = self.dp.get_mb(phase)
        #multiview test
        else:
            self.generator = self.dp.get_multiview_mb()
    while True:
        try:
            (samples, labels) = next(self.generator)
            if len(labels) == 0:
                (samples, labels) = next(self.generator)
        except StopIteration:
            print 'Have scanned the whole dataset; start from the beginning again'
            self.generator = self.dp.get_mb(phase)
            continue
        break
    to_top[self.top_names[0]] = owl.from_numpy(samples).reshape(
        [self.crop_size, self.crop_size, 3, samples.shape[0]])
    for i in range(1, len(self.top_names)):
        to_top[self.top_names[i]] = labels[:,i - 1]
def forward(self, from_btm, to_top, phase):
    ''' Feed-forward operation may vary according to phase.

    .. note::
    '''
    if self.generator == None:
        self.generator = self.dp.get_mb(phase)
    while True:
        try:
            (samples, labels) = next(self.generator)
            if len(labels) == 0:
                (samples, labels) = next(self.generator)
        except StopIteration:
            print 'Have scanned the whole dataset; start from the beginning again'
            if self.multiview == False:
                self.generator = self.dp.get_mb(phase)
            #multiview test
            else:
                self.generator = self.dp.get_multiview_mb()
            continue
        break
    #TODO(Jesse Lovitt): Change this 256 to a division by 256/max-fixed-point-value
    to_top[self.top_names[0]] = owl.from_numpy(samples).reshape(
        [self.crop_size, self.crop_size, 3, samples.shape[0]])
    for i in range(1, len(self.top_names)):
        to_top[self.top_names[i]] = labels[:,i - 1]
    #to_top[self.top_names[0]] = owl.zeros([self.crop_size, self.crop_size, 3, 256])
    #for i in range (1, len(self.top_names)):
    #    to_top[self.top_names[i]] = np.ones(256)
    self.out = to_top[self.top_names[0]]
def train_network(model, num_epochs=100, minibatch_size=256, lr=0.01, mom=0.75, wd=5e-4):
    # load data
    (train_data, test_data) = mnist_io.load_mb_from_mat('mnist_all.mat', minibatch_size / len(gpu))
    num_test_samples = test_data[0].shape[0]
    test_samples = owl.from_numpy(test_data[0]).reshape([28, 28, 1, num_test_samples])
    test_labels = owl.from_numpy(test_data[1])
    for i in xrange(num_epochs):
        print "---Epoch #", i
        last = time.time()
        count = 0
        weightgrads = [None] * len(gpu)
        biasgrads = [None] * len(gpu)
        for (mb_samples, mb_labels) in train_data:
            count += 1
            current_gpu = count % len(gpu)
            owl.set_device(gpu[current_gpu])
            num_samples = mb_samples.shape[0]
            data = owl.from_numpy(mb_samples).reshape([28, 28, 1, num_samples])
            label = owl.from_numpy(mb_labels)
            out, weightgrads[current_gpu], biasgrads[current_gpu] = bpprop(model, data, label)
            if current_gpu == 0:
                for k in range(len(model.weights)):
                    model.weightdelta[k] = mom * model.weightdelta[k] - lr / num_samples / len(gpu) * multi_gpu_merge(weightgrads, 0, k) - lr * wd * model.weights[k]
                    model.biasdelta[k] = mom * model.biasdelta[k] - lr / num_samples / len(gpu) * multi_gpu_merge(biasgrads, 0, k)
                    model.weights[k] += model.weightdelta[k]
                    model.bias[k] += model.biasdelta[k]
                if count % (len(gpu) * lazy_cycle) == 0:
                    print_training_accuracy(out, label, num_samples, 'Training')
        print '---End of Epoch #', i, 'time:', time.time() - last
        # do test
        out, _, _ = bpprop(model, test_samples, test_labels)
        print_training_accuracy(out, test_labels, num_test_samples, 'Testing')
def test(self):
    base = np.arange(-10, 10, dtype=np.float32) / 2
    a = owl.from_numpy(base)
    test = owl.NArray.relu(a)
    expected = base.clip(0, 20)
    print 'Expected\n', expected
    print "Actual\n", test.to_numpy()
    self.assertTrue(np.array_equal(expected, test.to_numpy()))
def init_net_from_file(self, owl_net, weightpath, epochidx): weightpath = "%ssnapshot%d/" % (weightpath, epochidx) for i in range(len(owl_net.units)): if isinstance(owl_net.units[i], net.FullyConnection): #print owl_net.units[i].name layername = owl_net.units[i].name layername = layername.replace("/","_") weightname = '%s%s_weights.dat' % (weightpath, layername) npweight = np.fromfile(weightname, dtype = np.float32) length = np.shape(npweight)[0] wshape = [owl_net.units[i].inner_product_param.num_output, length / owl_net.units[i].inner_product_param.num_output] owl_net.units[i].weight = owl.from_numpy(npweight).reshape(wshape) weightname = '%s%s_weightdelta.dat' % (weightpath, layername) if os.path.isfile(weightname): npweightdelta = np.fromfile(weightname, dtype = np.float32) owl_net.units[i].weightdelta = owl.from_numpy(npweightdelta).reshape(wshape) biasname = '%s%s_bias.dat' % (weightpath, layername) npbias = np.fromfile(biasname, dtype = np.float32) bshape = [owl_net.units[i].inner_product_param.num_output, 1] owl_net.units[i].bias = owl.from_numpy(npbias).reshape(bshape) biasname = '%s%s_biasdelta.dat' % (weightpath, layername) if os.path.isfile(biasname): npbiasdetla = np.fromfile(biasname, dtype = np.float32) owl_net.units[i].biasdelta = owl.from_numpy(npbiasdetla).reshape(bshape) if isinstance(owl_net.units[i], net.ConvConnection): #print owl_net.units[i].name layername = owl_net.units[i].name layername = layername.replace("/","_") weightname = '%s%s_weights.dat' % (weightpath, layername) npweight = np.fromfile(weightname, dtype = np.float32) length = np.shape(npweight)[0] conv_params = owl_net.units[i].conv_params input_channel = length / conv_params.kernel_size / conv_params.kernel_size / conv_params.num_output wshape = [conv_params.kernel_size, conv_params.kernel_size, input_channel, conv_params.num_output] owl_net.units[i].weight = owl.from_numpy(npweight).reshape(wshape) weightname = '%s%s_weightdelta.dat' % (weightpath, layername) if os.path.isfile(weightname): npweightdelta = np.fromfile(weightname, dtype = np.float32) owl_net.units[i].weightdelta = owl.from_numpy(npweightdelta).reshape(wshape) biasname = '%s%s_bias.dat' % (weightpath, layername) npbias = np.fromfile(biasname, dtype = np.float32) bshape = [owl_net.units[i].conv_params.num_output] owl_net.units[i].bias = owl.from_numpy(npbias).reshape(bshape) biasname = '%s%s_biasdelta.dat' % (weightpath, layername) if os.path.isfile(biasname): npbiasdetla = np.fromfile(biasname, dtype = np.float32) owl_net.units[i].biasdelta = owl.from_numpy(npbiasdetla).reshape(bshape)
def test(self):
    base = np.arange(0, 10, dtype = np.float32)
    base = np.reshape(base, [2,5])
    expected = np.transpose(base)
    tmp = owl.from_numpy(base)
    test = tmp.trans()
    #print 'Expected\n',expected
    #print "Actual\n",test.to_numpy()
    self.assertTrue(np.allclose(expected, test.to_numpy()))
def forward(self, from_btm, to_top, phase):
    to_top[self.top_names[0]] = co.softmax(from_btm[self.btm_names[0]], co.soft_op.instance)
    self.ff_y = to_top[self.top_names[0]]
    #turn label into matrix form
    nplabel = np.zeros([self.ff_y.shape[1], self.ff_y.shape[0]], dtype=np.float32)
    self.strlabel = from_btm[self.btm_names[1]]
    for i in range(len(self.strlabel)):
        nplabel[i, self.strlabel[i]] = 1
    self.y = owl.from_numpy(nplabel)
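# A small numpy-only illustration of the one-hot conversion performed in the
# forward above: row i gets a 1 in the column named by strlabel[i]. The label
# values and class count here are assumptions for the sketch.
import numpy as np
strlabel = np.array([2, 0, 1])               # assumed integer class labels
num_classes = 4                              # assumed number of classes
nplabel = np.zeros([len(strlabel), num_classes], dtype=np.float32)
nplabel[np.arange(len(strlabel)), strlabel] = 1
print nplabel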
def test(self):
    base = np.arange(-10, 10)
    owlbase = owl.from_numpy(base)
    print fpgatestinit.devices
    fpgatestinit.setfpga()
    test = owl.NArray.relu(owlbase)
    expected = base.clip(0, 20)
    print 'Expected\n', expected
    print "Actual\n", test.to_numpy()
    self.assertTrue(np.array_equal(expected, test.to_numpy()))
def forward(self, from_btm, to_top, phase):
    if self.generator == None:
        self.generator = self.dp.get_train_mb(self.mirror, phase)
    while True:
        try:
            (samples, labels) = next(self.generator)
            if len(labels) == 0:
                (samples, labels) = next(self.generator)
        except StopIteration:
            print 'Have scanned the whole dataset; start from the beginning again'
            self.generator = self.dp.get_train_mb(self.mirror, phase)
            continue
        break
    to_top[self.top_names[0]] = owl.from_numpy(samples).reshape(
        [self.crop_size, self.crop_size, 3, samples.shape[0]])
    to_top[self.top_names[1]] = owl.from_numpy(labels)
def test(self):
    base = np.asarray([40.0, 20.0, 30.0, 10.0])
    max = np.max(base)
    base = np.reshape(base, [1,1,1,4])
    owlarray = owl.from_numpy(base)
    expected = np.exp(base - max)
    expected = expected / np.sum(expected)
    test = conv.softmax(owlarray)
    #print 'Expected\n',expected
    #print "Actual\n",test.to_numpy()
    self.assertTrue(np.allclose(expected, test.to_numpy()))
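# A numpy sketch of the numerically stable softmax the test above checks
# against: subtracting the maximum before exponentiating leaves the result
# unchanged but avoids overflow for large inputs. Values are illustrative.
import numpy as np
x = np.asarray([40.0, 20.0, 30.0, 10.0])
stable = np.exp(x - np.max(x))
stable = stable / np.sum(stable)
print stable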
def train_network(filename, model, num_epochs=5, minibatch_size=256, lr=0.1, lr_decay= 0.95, mom=0.9, wd=5e-4): # load data (train_data, test_data) = mnist_io.load_mb_from_mat(filename, minibatch_size / len(devs)) num_test_samples = test_data[0].shape[0] test_samples = owl.from_numpy(test_data[0]).reshape([28, 28, 1, num_test_samples]) test_labels = owl.from_numpy(test_data[1]) for i in xrange(num_epochs): print "---Epoch #", i last = time.time() count = 0 weightgrads = [None] * len(devs) biasgrads = [None] * len(devs) for (mb_samples, mb_labels) in train_data: count += 1 current_dev = count % len(devs) owl.set_device(devs[current_dev]) num_samples = mb_samples.shape[0] data = owl.from_numpy(mb_samples).reshape([28, 28, 1, num_samples]) label = owl.from_numpy(mb_labels) #print "\t[{}]Train Data imported to minerva format".format(count) out, weightgrads[current_dev], biasgrads[current_dev] = bpprop(model, data, label) #print "\t[{}]Backprop complete".format(count) # print "dev {}".format(current_dev) if current_dev == 0: # print "pre-merge" for k in range(len(model.weights)): model.weightdelta[k] = mom * model.weightdelta[k] - lr / num_samples / len(devs) * multi_dev_merge(weightgrads, 0, k) - lr * wd * model.weights[k] # print "\t weight merge" model.biasdelta[k] = mom * model.biasdelta[k] - lr / num_samples / len(devs) * multi_dev_merge(biasgrads, 0, k) # print "\t bias merge" model.weights[k] += model.weightdelta[k] model.bias[k] += model.biasdelta[k] # print "post-merge" if count % (len(devs) * lazy_cycle) == 0: print_training_accuracy(out, label, num_samples, 'Training ' + str(count)) owl.print_profiler_result() print '---End of Epoch #', i, 'time:', time.time() - last lr = lr*lr_decay # do test out, _, _ = bpprop(model, test_samples, test_labels) print_training_accuracy(out, test_labels, num_test_samples, 'Testing')
def init_weights_with_filler(self):
    ''' Init weights & bias. The function will be called during weight initialization.

    Currently, four types of initializers are supported: ``"constant", "gaussian", "uniform", "xavier"``.
    '''
    #init weight
    npweights = None
    if self.weight_filler.type == "constant":
        npweights = np.ones(self.wshape, dtype=np.float32) * self.weight_filler.value
    elif self.weight_filler.type == "gaussian":
        npweights = np.random.normal(self.weight_filler.mean, self.weight_filler.std, self.wshape)
    elif self.weight_filler.type == "uniform":
        npweights = np.random.uniform(self.weight_filler.min, self.weight_filler.max, self.wshape)
    elif self.weight_filler.type == "xavier":
        fan_in = np.prod(self.in_shape[:])
        scale = np.sqrt(float(3) / fan_in)
        npweights = np.random.uniform(-scale, scale, self.wshape)
    self.weight = owl.from_numpy(npweights.astype(np.float32)).reshape(self.wshape)
    #init bias
    npbias = None
    if self.bias_filler.type == "constant":
        npbias = np.ones(self.bshape, dtype=np.float32) * self.bias_filler.value
    elif self.bias_filler.type == "gaussian":
        npbias = np.random.normal(self.bias_filler.mean, self.bias_filler.std, self.bshape)
    elif self.bias_filler.type == "uniform":
        npbias = np.random.uniform(self.bias_filler.min, self.bias_filler.max, self.bshape)
    elif self.bias_filler.type == "xavier":
        fan_in = np.prod(self.in_shape[:])
        scale = np.sqrt(float(3) / fan_in)
        npbias = np.random.uniform(-scale, scale, self.bshape)
    self.bias = owl.from_numpy(npbias.astype(np.float32)).reshape(self.bshape)
def test(self):
    # Expected
    img = np.arange(0, 32, dtype=np.float32)
    img = np.reshape(img, [1,2,4,4])
    filter = np.arange(0, 2*2*2*2, dtype=np.float32)
    filter = np.reshape(filter, [2,2,2,2])
    bias = np.ones([2])
    expected = np.asarray([[[441,497],
                            [665,721]],
                           [[1113,1297],
                            [1849,2033]]])
    # test
    owlimg = owl.from_numpy(img)
    owlfilter = owl.from_numpy(filter)
    owlbias = owl.from_numpy(bias)
    convolver = owl.conv.Convolver(0,0,2,2)
    test = convolver.ff(owlimg, owlfilter, owlbias)
    #print 'Expected\n',expected
    #print "Actual\n",test.to_numpy()
    self.assertTrue(np.allclose(expected, test.to_numpy()))
def train_network(model, num_epochs=100, minibatch_size=10, dropout_rate=0.5, eps_w=0.01, mom=0.9, wd=0.0005):
    gpu0 = owl.create_gpu_device(0)
    owl.set_device(gpu0)
    num_weights = 8
    count = 0
    last = time.time()
    cropped_size = 224
    dp = ImageNetDataProvider(
        mean_file='/home/minjie/data/imagenet/imagenet_mean.binaryproto',
        train_db='/home/minjie/data/imagenet/ilsvrc12_train_lmdb',
        val_db='/home/minjie/data/imagenet/ilsvrc12_val_lmdb',
        test_db='/home/minjie/data/imagenet/ilsvrc12_test_lmdb')
    #mark the output layer
    output_layer = 'loss3/loss3'
    for i in xrange(num_epochs):
        print "---------------------Epoch #", i
        for (samples, labels) in dp.get_train_mb(minibatch_size, cropped_size):
            count = count + 1
            num_samples = samples.shape[0]
            data = owl.from_numpy(samples).reshape([cropped_size, cropped_size, 3, num_samples])
            target = owl.from_numpy(labels)
            model.ff(data, target)
            print_training_accuracy(model.layers[output_layer].get_act(), target, minibatch_size)
            model.bp(data, target)
            model.update(num_samples, eps_w, mom, wd)
            exit(0)
def check_weight(owl_net, checklayer): h = 1e-2 threshold = 1e-4 for iteridx in range(10): #disturb the weights oriweight = checklayer.weight npweight = checklayer.weight.to_numpy() weightshape = np.shape(npweight) npweight = npweight.reshape(np.prod(weightshape[0:len(weightshape)])) print np.shape(npweight) position = np.random.randint(0, np.shape(npweight)[0]) print position disturb = np.zeros(np.shape(npweight), dtype=np.float32) disturb[position] = h oriposval = npweight[position] npweight += disturb newposval = npweight[position] npweight = npweight.reshape(weightshape) checklayer.weight = owl.from_numpy(npweight) #get disturbed loss owl_net.forward('TRAIN') all_loss = 0 for i in xrange(len(losslayer)): all_loss += owl_net.units[losslayer[i]].getloss() all_loss = all_loss / owl_net.batch_size #+ 0.5 * owl_net.base_weight_decay * newposval * newposval #get origin loss checklayer.weight = oriweight owl_net.forward('TRAIN') ori_all_loss = 0 for i in xrange(len(losslayer)): ori_all_loss += owl_net.units[losslayer[i]].getloss() ori_all_loss = ori_all_loss / owl_net.batch_size #+ 0.5 * owl_net.base_weight_decay * oriposval * oriposval owl_net.backward('TRAIN') #get analytic gradient npgrad = checklayer.weightgrad.to_numpy() npgrad = npgrad.reshape(np.prod(weightshape[0:len(weightshape)])) analy_grad = npgrad[position] / owl_net.batch_size #get numerical gradient print all_loss print ori_all_loss num_grad = (all_loss - ori_all_loss) / h diff = np.abs(analy_grad - num_grad) info = "analy: %f num: %f ratio: %f" % (analy_grad, num_grad, analy_grad / num_grad) print info
def loadmodel(i, model):
    basedir = './newinitmodel/epoch%d/' % (i)
    print 'load from %s' % (basedir)
    for k in range(model.num_layers - 1):
        weightshape = model.weights[k].shape
        filename = '%sweights_%d.dat' % (basedir, k)
        weightarray = np.fromfile(filename, dtype=np.float32)
        model.weights[k] = owl.from_numpy(weightarray).reshape(weightshape)
        weightshape = model.weightsdelta[k].shape
        filename = '%sweightsdelta_%d.dat' % (basedir, k)
        weightarray = np.fromfile(filename, dtype=np.float32)
        model.weightsdelta[k] = owl.from_numpy(weightarray).reshape(weightshape)
        weightshape = model.bias[k].shape
        filename = '%sbias_%d.dat' % (basedir, k)
        weightarray = np.fromfile(filename, dtype=np.float32)
        model.bias[k] = owl.from_numpy(weightarray).reshape(weightshape)
        weightshape = model.biasdelta[k].shape
        filename = '%sbiasdelta_%d.dat' % (basedir, k)
        weightarray = np.fromfile(filename, dtype=np.float32)
        model.biasdelta[k] = owl.from_numpy(weightarray).reshape(weightshape)
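# loadmodel above reads raw float32 arrays with np.fromfile; a matching save
# step (a sketch only, assuming the same flat-file layout and a hypothetical
# helper name) would dump each owl array back to disk with numpy's tofile:
import numpy as np
def save_array_sketch(owl_array, filename):
    # convert to numpy and write raw float32 bytes, the format fromfile expects
    owl_array.to_numpy().astype(np.float32).tofile(filename)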
def train_network(model, num_epochs=100, minibatch_size=256, dropout_rate=0.5, eps_w=0.01, eps_b=0.01, mom=0.9, wd=0.0005):
    gpu0 = owl.create_gpu_device(0)
    owl.set_device(gpu0)
    num_weights = 8
    count = 0
    last = time.time()
    dp = ImageNetDataProvider(
        mean_file='/home/yutian/data/config_file/google_model/imagenet_mean.binaryproto',
        train_db='/home/yutian/data/imagenet/ilsvrc12_train_lmdb',
        val_db='/home/yutian/data/imagenet/ilsvrc12_val_lmdb',
        test_db='/home/yutian/data/imagenet/ilsvrc12_test_lmdb')
    for i in xrange(num_epochs):
        print "---------------------Epoch #", i
        for (samples, labels) in dp.get_train_mb(minibatch_size):
            count = count + 1
            num_samples = samples.shape[0]
            data = owl.from_numpy(samples).reshape([227, 227, 3, num_samples])
            target = owl.from_numpy(labels)
            out, weightsgrad, biasgrad = model.train_one_mb(data, target, dropout_rate)
            model.update(weightsgrad, biasgrad, num_samples, mom, eps_w, wd)
            if count % 4 == 0:
                print_training_accuracy(out, target, data.shape[-1])
                print "time: %s" % (time.time() - last)
                last = time.time()
def check_weight(owl_net, checklayer): h = 1e-2 threshold = 1e-4 for iteridx in range(10): #disturb the weights oriweight = checklayer.weight npweight = checklayer.weight.to_numpy() weightshape = np.shape(npweight) npweight = npweight.reshape(np.prod(weightshape[0:len(weightshape)])) print np.shape(npweight) position = np.random.randint(0, np.shape(npweight)[0]) print position disturb = np.zeros(np.shape(npweight), dtype = np.float32) disturb[position] = h oriposval = npweight[position] npweight += disturb newposval = npweight[position] npweight = npweight.reshape(weightshape) checklayer.weight = owl.from_numpy(npweight) #get disturbed loss owl_net.forward('TRAIN') all_loss = 0 for i in xrange(len(losslayer)): all_loss += owl_net.units[losslayer[i]].getloss() all_loss = all_loss / owl_net.batch_size #+ 0.5 * owl_net.base_weight_decay * newposval * newposval #get origin loss checklayer.weight = oriweight owl_net.forward('TRAIN') ori_all_loss = 0 for i in xrange(len(losslayer)): ori_all_loss += owl_net.units[losslayer[i]].getloss() ori_all_loss = ori_all_loss / owl_net.batch_size #+ 0.5 * owl_net.base_weight_decay * oriposval * oriposval owl_net.backward('TRAIN') #get analytic gradient npgrad = checklayer.weightgrad.to_numpy() npgrad = npgrad.reshape(np.prod(weightshape[0:len(weightshape)])) analy_grad = npgrad[position] / owl_net.batch_size #get numerical gradient print all_loss print ori_all_loss num_grad = (all_loss - ori_all_loss) / h diff = np.abs(analy_grad - num_grad) info = "analy: %f num: %f ratio: %f" % (analy_grad, num_grad, analy_grad / num_grad) print info
def forward(self, from_btm, to_top, phase):
    if self.generator == None:
        self.generator = self.dp.get_mb(phase)
    while True:
        try:
            (samples, labels) = next(self.generator)
            if len(labels) == 0:
                (samples, labels) = next(self.generator)
        except StopIteration:
            print 'Have scanned the whole dataset; start from the beginning again'
            self.generator = self.dp.get_mb(phase)
            continue
        break
    to_top[self.top_names[0]] = owl.from_numpy(samples).reshape(
        [self.crop_size, self.crop_size, 3, samples.shape[0]])
    #may have multiple labels
    for i in range(1, len(self.top_names)):
        to_top[self.top_names[i]] = labels[:,i - 1]
def forward(self, from_btm, to_top, phase):
    if self.top_k == 1:
        predict = from_btm[self.btm_names[0]].max_index(0)
        ground_truth = owl.from_numpy(from_btm[self.btm_names[1]]).reshape(predict.shape)
        self.batch_size = from_btm[self.btm_names[0]].shape[1]
        correct = (predict - ground_truth).count_zero()
        self.acc = correct * 1.0 / self.batch_size
    elif self.top_k == 5:
        predict = from_btm[self.btm_names[0]].to_numpy()
        top_5 = np.argsort(predict, axis=1)[:,::-1]
        ground_truth = from_btm[self.btm_names[1]]
        self.batch_size = np.shape(ground_truth)[0]
        correct = 0
        for i in range(self.batch_size):
            for t in range(5):
                if ground_truth[i] == top_5[i,t]:
                    correct += 1
                    break
        self.acc = correct * 1.0 / self.batch_size
    else:
        assert(False)
def forward(self, from_btm, to_top, phase):
    if self.top_k == 1:
        predict = from_btm[self.btm_names[0]].max_index(0)
        ground_truth = owl.from_numpy(from_btm[self.btm_names[1]]).reshape(predict.shape)
        self.batch_size = from_btm[self.btm_names[0]].shape[1]
        correct = (predict - ground_truth).count_zero()
        self.acc = correct * 1.0 / self.batch_size
    elif self.top_k == 5:
        predict = from_btm[self.btm_names[0]].to_numpy()
        top_5 = np.argsort(predict, axis=1)[:, ::-1]
        ground_truth = from_btm[self.btm_names[1]]
        self.batch_size = np.shape(ground_truth)[0]
        correct = 0
        for i in range(self.batch_size):
            for t in range(5):
                if ground_truth[i] == top_5[i, t]:
                    correct += 1
                    break
        self.acc = correct * 1.0 / self.batch_size
    else:
        assert(False)
def forward(self, from_btm, to_top, phase):
    """ Feed-forward operation may vary according to phase.

    .. note::
        The LMDB data provider now supports multi-view testing: if multiview == True,
        it will produce 10 consecutive batches of different views of the same original image.
    """
    if self.generator == None:
        if self.multiview == False:
            self.generator = self.dp.get_mb(phase)
        # multiview test
        else:
            self.generator = self.dp.get_multiview_mb()
    while True:
        try:
            (samples, labels) = next(self.generator)
            if len(labels) == 0:
                (samples, labels) = next(self.generator)
        except StopIteration:
            print "Have scanned the whole dataset; start from the beginning again"
            if self.multiview == False:
                self.generator = self.dp.get_mb(phase)
            # multiview test
            else:
                self.generator = self.dp.get_multiview_mb()
            continue
        break
    to_top[self.top_names[0]] = owl.from_numpy(samples).reshape(
        [self.crop_size, self.crop_size, 3, samples.shape[0]])
    for i in range(1, len(self.top_names)):
        to_top[self.top_names[i]] = labels[:, i - 1]
    # to_top[self.top_names[0]] = owl.zeros([self.crop_size, self.crop_size, 3, 256])
    # for i in range (1, len(self.top_names)):
    #     to_top[self.top_names[i]] = np.ones(256)
    self.out = to_top[self.top_names[0]]
def test(self):
    # Expected
    cpu = owl.create_cpu_device()
    owl.set_device(cpu)
    img = np.arange(0, 32, dtype=np.float32) #/32
    img = np.reshape(img, [1,2,4,4])
    expected = np.asarray([[[5,7],
                            [13,15]],
                           [[21,23],
                            [29,31]]]) #/32.0
    #expected = np.asarray([[[ 110.25, 124.25],
    #                        [ 166.25, 180.25]],
    #                       [[ 278.25, 324.25],
    #                        [ 462.25, 508.25]]])
    # test
    owlimg = owl.from_numpy(img)
    pooler = owl.conv.Pooler(2,2,2,2)
    test = pooler.ff(owlimg)
    print 'Expected\n', expected
    print "Actual\n", test.to_numpy()
    print "This test must be run with a fractional bit width of 12"
    self.assertTrue(np.allclose(expected, test.to_numpy(), atol= 1.0/(1<<12)*4))
def initw(n, d):
    magic_number = 0.3
    npa = (np.random.rand(n, d) * 2 - 1) * magic_number # U[-0.3, 0.3]
    return owl.from_numpy(npa).trans()
def train_network_n(n, model, num_epochs=100, minibatch_size=40, dropout_rate=0.5, eps_w=0.0001, eps_b=0.0002, mom=0.9, wd=0.0005): gpus = [] for i in range(0, n): gpus.append(owl.create_gpu_device(i)) count = 0 last = time.time() dp = ImageNetDataProvider( mean_file='./VGGmodel/vgg_mean.binaryproto', train_db='/home/minjie/data/imagenet/ilsvrc12_train_lmdb', val_db='/home/minjie/data/imagenet/ilsvrc12_val_lmdb', test_db='/home/minjie/data/imagenet/ilsvrc12_test_lmdb') minibatch_size = minibatch_size / n correct = 0 rerun = False startepoch = 0 curepoch = startepoch data = [None] * n label = [None] * n out = [None] * n biasgrad = [None] * n weightsgrad = [None] * n for i in range(startepoch, num_epochs): print "---------------------Epoch %d Index %d" % (curepoch, i) sys.stdout.flush() batchidx = 0 count = 0 loadmodel(i, model) for (samples, labels) in dp.get_train_mb(minibatch_size, 224): count = count + 1 data[count - 1] = owl.from_numpy(samples).reshape( [224, 224, 3, samples.shape[0]]) label[count - 1] = owl.from_numpy(labels) biasgrad[count - 1] = [None] * (model.num_layers - 1) weightsgrad[count - 1] = [None] * (model.num_layers - 1) owl.set_device(gpus[count - 1]) out[count - 1] = train_one_mb(model, data[count - 1], label[count - 1], weightsgrad[count - 1], biasgrad[count - 1]) out[count - 1].start_eval() if count % n > 0: continue totalweightsgrad = [None] * (model.num_layers - 1) totalbiasgrad = [None] * (model.num_layers - 1) num_samples = 0 for gpuidx in range(0, n): num_samples += data[gpuidx].shape[-1] for k in range(model.num_layers - 1): if model.ff_infos[k]['ff_type'] == 'conv' or model.ff_infos[ k]['ff_type'] == 'fully': if gpuidx == 0: totalweightsgrad[k] = weightsgrad[gpuidx][k] totalbiasgrad[k] = biasgrad[gpuidx][k] else: totalweightsgrad[k] += weightsgrad[gpuidx][k] totalbiasgrad[k] += biasgrad[gpuidx][k] for k in range(model.num_layers - 1): if model.ff_infos[k]['ff_type'] == 'conv' or model.ff_infos[k][ 'ff_type'] == 'fully': model.weightsdelta[k] = mom * model.weightsdelta[ k] - eps_w / num_samples * ( totalweightsgrad[k] + wd * num_samples * model.weights[k]) model.biasdelta[k] = mom * model.biasdelta[ k] - eps_b / num_samples * totalbiasgrad[k] model.weights[k] += model.weightsdelta[k] model.bias[k] += model.biasdelta[k] #print num_samples if count % n == 0: print 'batch %d' % (batchidx) batchidx = batchidx + 1 ''' #TODO hack if batchidx == 2000: savemodel(i+1, model) exit(0) ''' thiscorrect = print_training_accuracy(out[0], label[0], data[0].shape[-1]) print "time: %s" % (time.time() - last) last = time.time() count = 0 savemodel(i + 1, model)
def gradient_checker(s, checklayer_name): ''' Check backpropagation on multiple GPUs ''' h = 1e-2 threshold = 1e-4 checklayer = s.owl_net.units[s.owl_net.name_to_uid[checklayer_name][0]] losslayer = [] for i in xrange(len(s.owl_net.units)): if isinstance(s.owl_net.units[i], net.SoftmaxUnit): losslayer.append(i) last = None ''' wunits = [] for i in xrange(len(s.owl_net.units)): if isinstance(s.owl_net.units[i], net.WeightedComputeUnit): wunits.append(i) ''' wunits = s.owl_net.get_weighted_unit_ids() accunits = s.owl_net.get_accuracy_units() owl.set_device(s.gpu[0]) for iteridx in range(100): #disturb the weights oriweight = checklayer.weight npweight = checklayer.weight.to_numpy() weightshape = np.shape(npweight) npweight = npweight.reshape(np.prod(weightshape[0:len(weightshape)])) position = np.random.randint(0, np.shape(npweight)[0]) disturb = np.zeros(np.shape(npweight), dtype = np.float32) disturb[position] = h oriposval = npweight[position] npweight += disturb newposval = npweight[position] npweight = npweight.reshape(weightshape) checklayer.weight = owl.from_numpy(npweight) all_loss = 0 # train on multi-gpu s.owl_net.forward_check() for i in range(len(losslayer)): if len(s.owl_net.units[losslayer[i]].loss_weight) == 1: all_loss += (s.owl_net.units[losslayer[i]].getloss() * s.owl_net.units[losslayer[i]].loss_weight[0]) else: all_loss += s.owl_net.units[losslayer[i]].getloss() #get origin loss checklayer.weight = oriweight ori_all_loss = 0 # train on multi-gpu s.owl_net.forward_check() for i in range(len(losslayer)): if len(s.owl_net.units[losslayer[i]].loss_weight) == 1: ori_all_loss += (s.owl_net.units[losslayer[i]].getloss() * s.owl_net.units[losslayer[i]].loss_weight[0]) else: ori_all_loss += s.owl_net.units[losslayer[i]].getloss() s.owl_net.backward('TEST') #get analytic gradient npgrad = checklayer.weightgrad.to_numpy() npgrad = npgrad.reshape(np.prod(weightshape[0:len(weightshape)])) analy_grad = npgrad[position] / s.owl_net.units[losslayer[i]].out.shape[1] num_grad = (all_loss - ori_all_loss) / h info = "Gradient Check at positon: %d analy: %f num: %f ratio: %f" % (position, analy_grad, num_grad, analy_grad / num_grad) print info
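# The gradient checkers in this file all rely on the same one-sided
# finite-difference estimate: grad ~= (L(w + h) - L(w)) / h, compared against
# the analytic gradient at one randomly chosen weight position. A
# self-contained numpy sketch with an assumed toy quadratic loss:
import numpy as np
def loss(w):
    return 0.5 * np.sum(w * w)               # assumed toy loss; true gradient is w
w = np.array([1.0, -2.0, 3.0], dtype=np.float32)
h = 1e-2
position = 1
disturb = np.zeros_like(w)
disturb[position] = h
num_grad = (loss(w + disturb) - loss(w)) / h
analy_grad = w[position]
print "analy: %f num: %f ratio: %f" % (analy_grad, num_grad, analy_grad / num_grad)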
def LSTM_train(model, sents, words, learning_rate, EPOCH, tanhC_version=1): # Constants N = model.Layers[1] # Number of units K = model.Layers[2] # Vocabulary size last_time = time.time() # For each epoch for epoch_id in range(1, EPOCH + 1): epoch_ll = 0 # For each sentence for sent_id, sent in enumerate(sents): #print sent_id #print "sent", sent #print "sents", sents ##### Initialize activations ##### Tau = len(sent) sent_ll = 0 # Sentence log likelihood data = [None] * Tau Hout = [None] * Tau Hout[0] = owl.zeros([N, 1]) act_ig = [None] * Tau act_fg = [None] * Tau act_og = [None] * Tau act_ff = [None] * Tau C = [None] * Tau C[0] = owl.zeros([N, 1]) dY = [None] * Tau dBd = owl.zeros([model.Layers[2], 1]) #dY.sum(0) dWd = owl.zeros([model.Layers[2], model.Layers[1]]) dHout = [None] * Tau #dY.dot(model.decoder_weights.transpose()) dEmb = [None] * Tau ##### Forward pass ##### # For each time step for t in range(1, Tau): # predict the (t+1)'th word from the t'th word data[t] = model.emb_weight[sent[t - 1]] NVector = np.zeros((K, 1)) NVector[sent[t]] = 1 target = owl.from_numpy(NVector).trans() act_ig[t] = model.ig_weight_data * data[ t] + model.ig_weight_prev * Hout[ t - 1] + model.ig_weight_cell * C[ t - 1] + model.ig_weight_bias act_ig[t] = ele.sigm(act_ig[t]) act_fg[t] = model.fg_weight_data * data[ t] + model.fg_weight_prev * Hout[ t - 1] + model.fg_weight_cell * C[ t - 1] + model.fg_weight_bias act_fg[t] = ele.sigm(act_fg[t]) act_ff[t] = model.ff_weight_data * data[ t] + model.ff_weight_prev * Hout[t - 1] + model.ff_weight_bias act_ff[t] = ele.tanh(act_ff[t]) C[t] = ele.mult(act_ig[t], act_ff[t]) + ele.mult( act_fg[t], C[t - 1]) act_og[t] = model.og_weight_data * data[ t] + model.og_weight_prev * Hout[ t - 1] + model.og_weight_cell * C[t] + model.og_weight_bias act_og[t] = ele.sigm(act_og[t]) if tanhC_version: Hout[t] = ele.mult(act_og[t], ele.tanh(C[t])) else: Hout[t] = ele.mult(act_og[t], C[t]) Y = softmax(model.decoder_weights * Hout[t] + model.decoder_bias) # BP to Hout dY[t] = Y - target dBd += dY[t] dWd += dY[t] * Hout[t].trans() dHout[t] = model.decoder_weights.trans() * dY[t] # evaluation output = Y.to_numpy( ) # Can directly get a single element from Y # print output[0, sent[t]] sent_ll += math.log(max(output[0, sent[t]], 1e-20), 2) #print "Y_0[t]",Y_o[t] #print "Y_o[t][sent[t]]",Y_o[t][sent[t]] #print np.sum(output.to_numpy()) # output = Ym[t].trans() * data[t] # sent_ll += math.log10( max(np.sum(output.to_numpy()),1e-20) ) ##### Initialize gradient vectors ##### weight_update_ig_data = owl.zeros( [model.Layers[1], model.Layers[0]]) weight_update_ig_prev = owl.zeros( [model.Layers[1], model.Layers[1]]) weight_update_ig_cell = owl.zeros( [model.Layers[1], model.Layers[1]]) weight_update_ig_bias = owl.zeros([model.Layers[1], 1]) weight_update_fg_data = owl.zeros( [model.Layers[1], model.Layers[0]]) weight_update_fg_prev = owl.zeros( [model.Layers[1], model.Layers[1]]) weight_update_fg_cell = owl.zeros( [model.Layers[1], model.Layers[1]]) weight_update_fg_bias = owl.zeros([model.Layers[1], 1]) weight_update_og_data = owl.zeros( [model.Layers[1], model.Layers[0]]) weight_update_og_prev = owl.zeros( [model.Layers[1], model.Layers[1]]) weight_update_og_cell = owl.zeros( [model.Layers[1], model.Layers[1]]) weight_update_og_bias = owl.zeros([model.Layers[1], 1]) weight_update_ff_data = owl.zeros( [model.Layers[1], model.Layers[0]]) weight_update_ff_prev = owl.zeros( [model.Layers[1], model.Layers[1]]) weight_update_ff_bias = owl.zeros([model.Layers[1], 1]) dC = [None] * Tau for t in 
xrange(Tau): dC[t] = owl.zeros(C[t].shape) # Calculate the error and add it for t in reversed(range(1, Tau)): #print "sent",sent #print "t",t # BP from og controled gate and og if tanhC_version: tanhC = ele.tanh(C[t]) dTanhC = ele.mult(dHout[t], act_og[t]) sen_og = ele.mult(dHout[t], tanhC) dC[t] += ele.mult((1 - ele.mult(tanhC, tanhC)), dTanhC) else: sen_og = ele.mult(C[t], dHout[t]) dC[t] += ele.mult(act_og[t], dHout[t]) # BP from og sen_og = ele.mult(ele.mult(act_og[t], (1.0 - act_og[t])), sen_og) dHout[t - 1] = model.og_weight_prev.trans() * sen_og dC[t] += model.og_weight_cell.trans() * sen_og dEmb[t] = model.og_weight_data.trans() * sen_og # BP from fg controled gate sen_fg = ele.mult(C[t - 1], dC[t]) dC[t - 1] += ele.mult(act_fg[t], dC[t]) # BP from ig controled gate sen_ig = ele.mult(act_ff[t], dC[t]) sen_ff = ele.mult(act_ig[t], dC[t]) sen_ff = ele.mult((1 - ele.mult(act_ff[t], act_ff[t])), sen_ff) dEmb[t] += model.ff_weight_data.trans() * sen_ff # BP from fg sen_fg = ele.mult(ele.mult(act_fg[t], (1.0 - act_fg[t])), sen_fg) dHout[t - 1] += model.fg_weight_prev.trans() * sen_fg dC[t - 1] += model.fg_weight_cell.trans() * sen_fg dEmb[t] += model.fg_weight_data.trans() * sen_fg # BP from ig sen_ig = ele.mult(ele.mult(act_ig[t], (1.0 - act_ig[t])), sen_ig) dHout[t - 1] += model.ig_weight_prev.trans() * sen_ig dC[t - 1] += model.ig_weight_cell.trans() * sen_ig dEmb[t] += model.ig_weight_data.trans() * sen_ig # derivatives on weight matrix and bias weight_update_ig_data += sen_ig * data[t].trans() weight_update_ig_prev += sen_ig * Hout[t - 1].trans() weight_update_ig_cell += sen_ig * C[t - 1].trans() weight_update_ig_bias += sen_ig weight_update_fg_data += sen_fg * data[t].trans() weight_update_fg_prev += sen_fg * Hout[t - 1].trans() weight_update_fg_cell += sen_fg * C[t - 1].trans() weight_update_fg_bias += sen_fg weight_update_og_data += sen_og * data[t].trans() weight_update_og_prev += sen_og * Hout[t - 1].trans() weight_update_og_cell += sen_og * C[t].trans() weight_update_og_bias += sen_og weight_update_ff_data += sen_ff * data[t].trans() weight_update_ff_prev += sen_ff * Hout[t - 1].trans() weight_update_ff_bias += sen_ff # normalize the gradients rate = learning_rate / Tau # weight update model.ig_weight_prev -= rate * weight_update_ig_prev model.ig_weight_data -= rate * weight_update_ig_data model.ig_weight_cell -= rate * weight_update_ig_cell model.ig_weight_bias -= rate * weight_update_ig_bias model.fg_weight_prev -= rate * weight_update_fg_prev model.fg_weight_data -= rate * weight_update_fg_data model.fg_weight_cell -= rate * weight_update_fg_cell model.fg_weight_bias -= rate * weight_update_fg_bias model.og_weight_prev -= rate * weight_update_og_prev model.og_weight_data -= rate * weight_update_og_data model.og_weight_cell -= rate * weight_update_og_cell model.og_weight_bias -= rate * weight_update_og_bias model.ff_weight_prev -= rate * weight_update_ff_prev model.ff_weight_data -= rate * weight_update_ff_data model.ff_weight_bias -= rate * weight_update_ff_bias model.decoder_weights -= rate * dWd model.decoder_bias -= rate * dBd for t in range(1, Tau): model.emb_weight[sent[t - 1]] -= rate * dEmb[t] # Print results epoch_ll += sent_ll # print(" Sentence %d LL: %f" % (sent_id, sent_ll)) epoch_ent = epoch_ll * (-1) / words epoch_ppl = 2**epoch_ent cur_time = time.time() print("Epoch %d (alpha=%f) PPL=%f" % (epoch_id, learning_rate, epoch_ppl)) print " time consumed:", cur_time - last_time last_time = cur_time return model, learning_rate
def check_weight_2gpu(owl_net, checklayer, gpu): h = 1e-2 threshold = 1e-4 wunits = get_weights_id(owl_net) wgrad = [] bgrad = [] for iteridx in range(10): #disturb the weights oriweight = checklayer.weight npweight = checklayer.weight.to_numpy() weightshape = np.shape(npweight) npweight = npweight.reshape(np.prod(weightshape[0:len(weightshape)])) print np.shape(npweight) position = np.random.randint(0, np.shape(npweight)[0]) print position disturb = np.zeros(np.shape(npweight), dtype=np.float32) disturb[position] = h oriposval = npweight[position] npweight += disturb newposval = npweight[position] npweight = npweight.reshape(weightshape) checklayer.weight = owl.from_numpy(npweight) #get disturbed loss owl_net.forward('TRAIN') all_loss = 0 for i in xrange(len(losslayer)): all_loss += owl_net.units[losslayer[i]].getloss() all_loss = all_loss / owl_net.batch_size #+ 0.5 * owl_net.base_weight_decay * newposval * newposval #get origin loss checklayer.weight = oriweight owl_net.forward('TRAIN') ori_all_loss = 0 for i in xrange(len(losslayer)): ori_all_loss += owl_net.units[losslayer[i]].getloss() ori_all_loss = ori_all_loss / owl_net.batch_size #+ 0.5 * owl_net.base_weight_decay * oriposval * oriposval #analy_grad owl.set_device(gpu[0]) owl_net.forward('TRAIN') owl_net.backward('TRAIN') for wid in wunits: wgrad.append(owl_net.units[wid].weightgrad) bgrad.append(owl_net.units[wid].biasgrad) owl.set_device(gpu[1]) owl_net.forward('TRAIN') owl_net.backward('TRAIN') for i in range(len(wunits)): wid = wunits[i] owl_net.units[wid].weightgrad += wgrad[i] owl_net.units[wid].biasgrad += bgrad[i] wgrad = [] bgrad = [] #get analytic gradient npgrad = checklayer.weightgrad.to_numpy() npgrad = npgrad.reshape(np.prod(weightshape[0:len(weightshape)])) analy_grad = npgrad[position] / owl_net.batch_size / len(gpu) print all_loss print ori_all_loss num_grad = (all_loss - ori_all_loss) / h diff = np.abs(analy_grad - num_grad) info = "analy: %f num: %f ratio: %f" % (analy_grad, num_grad, analy_grad / num_grad) print info
def init_net_from_file(self, owl_net, weightpath, snapshotidx): '''Load network parameters from a saved snapshot. :ivar owl_net: the network to load parameters to :ivar str weightpath: the folder storing parameters :ivar int snapshotidx: the index of the snapshot ''' weightpath = "%s/snapshot%d/" % (weightpath, snapshotidx) for i in range(len(owl_net.units)): if isinstance(owl_net.units[i], net.FullyConnection): #print owl_net.units[i].name layername = owl_net.units[i].name layername = layername.replace("/","_") weightname = '%s%s_weights.dat' % (weightpath, layername) wshape = owl_net.units[i].wshape if os.path.isfile(weightname): npweight = np.fromfile(weightname, dtype = np.float32) length = np.shape(npweight)[0] if length == owl_net.units[i].in_shape[0] * owl_net.units[i].out_shape[0]: owl_net.units[i].weight = owl.from_numpy(npweight).reshape(wshape) weightname = '%s%s_weightdelta.dat' % (weightpath, layername) if os.path.isfile(weightname): npweightdelta = np.fromfile(weightname, dtype = np.float32) owl_net.units[i].weightdelta = owl.from_numpy(npweightdelta).reshape(wshape) else: print "Weight Need Reinit %s" % (owl_net.units[i].name) else: print "Weight Need Reinit %s" % (owl_net.units[i].name) biasname = '%s%s_bias.dat' % (weightpath, layername) bshape = owl_net.units[i].bshape if os.path.isfile(biasname): npbias = np.fromfile(biasname, dtype = np.float32) length = np.shape(npbias)[0] if length == owl_net.units[i].out_shape[0]: owl_net.units[i].bias = owl.from_numpy(npbias).reshape(bshape) biasname = '%s%s_biasdelta.dat' % (weightpath, layername) if os.path.isfile(biasname): npbiasdetla = np.fromfile(biasname, dtype = np.float32) owl_net.units[i].biasdelta = owl.from_numpy(npbiasdetla).reshape(bshape) else: print "Bias Need Reinit %s" % (owl_net.units[i].name) if isinstance(owl_net.units[i], net.ConvConnection): #print owl_net.units[i].name layername = owl_net.units[i].name layername = layername.replace("/","_") conv_params = owl_net.units[i].conv_params weightname = '%s%s_weights.dat' % (weightpath, layername) wshape = owl_net.units[i].wshape if os.path.isfile(weightname): npweight = np.fromfile(weightname, dtype = np.float32) length = np.shape(npweight)[0] if length == owl_net.units[i].in_shape[2] * owl_net.units[i].out_shape[2] * conv_params.kernel_size * conv_params.kernel_size: owl_net.units[i].weight = owl.from_numpy(npweight).reshape(wshape) weightname = '%s%s_weightdelta.dat' % (weightpath, layername) if os.path.isfile(weightname): npweightdelta = np.fromfile(weightname, dtype = np.float32) owl_net.units[i].weightdelta = owl.from_numpy(npweightdelta).reshape(wshape) else: print "Conv Weight Need Reinit %s" % (owl_net.units[i].name) else: print "Conv Weight Need Reinit %s" % (owl_net.units[i].name) biasname = '%s%s_bias.dat' % (weightpath, layername) bshape = owl_net.units[i].bshape if os.path.isfile(biasname): npbias = np.fromfile(biasname, dtype = np.float32) length = np.shape(npbias)[0] if length == owl_net.units[i].out_shape[2]: owl_net.units[i].bias = owl.from_numpy(npbias).reshape(bshape) biasname = '%s%s_biasdelta.dat' % (weightpath, layername) if os.path.isfile(biasname): npbiasdetla = np.fromfile(biasname, dtype = np.float32) owl_net.units[i].biasdelta = owl.from_numpy(npbiasdetla).reshape(bshape) else: print "Conv Bias Need Reinit %s" % (owl_net.units[i].name) else: print "Conv Bias Need Reinit %s" % (owl_net.units[i].name)
def init_net_from_file(self, owl_net, weightpath, snapshotidx): '''Load network parameters from a saved snapshot. :ivar owl_net: the network to load parameters to :ivar str weightpath: the folder storing parameters :ivar int snapshotidx: the index of the snapshot ''' weightpath = "%ssnapshot%d/" % (weightpath, snapshotidx) for i in range(len(owl_net.units)): if isinstance(owl_net.units[i], net.FullyConnection): #print owl_net.units[i].name layername = owl_net.units[i].name layername = layername.replace("/","_") weightname = '%s%s_weights.dat' % (weightpath, layername) wshape = owl_net.units[i].wshape if os.path.isfile(weightname): npweight = np.fromfile(weightname, dtype = np.float32) length = np.shape(npweight)[0] if length == owl_net.units[i].in_shape[0] * owl_net.units[i].out_shape[0]: owl_net.units[i].weight = owl.from_numpy(npweight).reshape(wshape) weightname = '%s%s_weightdelta.dat' % (weightpath, layername) if os.path.isfile(weightname): npweightdelta = np.fromfile(weightname, dtype = np.float32) owl_net.units[i].weightdelta = owl.from_numpy(npweightdelta).reshape(wshape) else: print "Weight Need Reinit %s" % (owl_net.units[i].name) else: print "Weight Need Reinit %s" % (owl_net.units[i].name) biasname = '%s%s_bias.dat' % (weightpath, layername) bshape = owl_net.units[i].bshape if os.path.isfile(biasname): npbias = np.fromfile(biasname, dtype = np.float32) length = np.shape(npbias)[0] if length == owl_net.units[i].out_shape[0]: owl_net.units[i].bias = owl.from_numpy(npbias).reshape(bshape) biasname = '%s%s_biasdelta.dat' % (weightpath, layername) if os.path.isfile(biasname): npbiasdetla = np.fromfile(biasname, dtype = np.float32) owl_net.units[i].biasdelta = owl.from_numpy(npbiasdetla).reshape(bshape) else: print "Bias Need Reinit %s" % (owl_net.units[i].name) if isinstance(owl_net.units[i], net.ConvConnection): #print owl_net.units[i].name layername = owl_net.units[i].name layername = layername.replace("/","_") conv_params = owl_net.units[i].conv_params weightname = '%s%s_weights.dat' % (weightpath, layername) wshape = owl_net.units[i].wshape if os.path.isfile(weightname): npweight = np.fromfile(weightname, dtype = np.float32) length = np.shape(npweight)[0] if length == owl_net.units[i].in_shape[2] * owl_net.units[i].out_shape[2] * conv_params.kernel_size * conv_params.kernel_size: owl_net.units[i].weight = owl.from_numpy(npweight).reshape(wshape) weightname = '%s%s_weightdelta.dat' % (weightpath, layername) if os.path.isfile(weightname): npweightdelta = np.fromfile(weightname, dtype = np.float32) owl_net.units[i].weightdelta = owl.from_numpy(npweightdelta).reshape(wshape) else: print "Conv Weight Need Reinit %s" % (owl_net.units[i].name) else: print "Conv Weight Need Reinit %s" % (owl_net.units[i].name) biasname = '%s%s_bias.dat' % (weightpath, layername) bshape = owl_net.units[i].bshape if os.path.isfile(biasname): npbias = np.fromfile(biasname, dtype = np.float32) length = np.shape(npbias)[0] if length == owl_net.units[i].out_shape[2]: owl_net.units[i].bias = owl.from_numpy(npbias).reshape(bshape) biasname = '%s%s_biasdelta.dat' % (weightpath, layername) if os.path.isfile(biasname): npbiasdetla = np.fromfile(biasname, dtype = np.float32) owl_net.units[i].biasdelta = owl.from_numpy(npbiasdetla).reshape(bshape) else: print "Conv Bias Need Reinit %s" % (owl_net.units[i].name) else: print "Conv Bias Need Reinit %s" % (owl_net.units[i].name)
def train_network(model, num_epochs=100, minibatch_size=256, dropout_rate=0.5, eps_w=0.01, eps_b=0.01, mom=0.9, wd=0.0005): gpu = [None] * 2 gpu[0] = owl.create_gpu_device(0) gpu[1] = owl.create_gpu_device(1) num_layers = 20 num_weights = 8 count = 0 last = time.time() dp = ImageNetDataProvider( mean_file= '/home/yutian/data/config_file/google_model/imagenet_mean.binaryproto', train_db='/home/yutian/data/imagenet/ilsvrc12_train_lmdb', val_db='/home/yutian/data/imagenet/ilsvrc12_val_lmdb', test_db='/home/yutian/data/imagenet/ilsvrc12_test_lmdb') minibatch_size = minibatch_size / 2 wgrad = [None] * 2 bgrad = [None] * 2 num_samples = 0 for i in xrange(num_epochs): print "---------------------Epoch #", i for (samples, labels) in dp.get_train_mb(minibatch_size): #for j in range(300): count = count + 1 gpuid = count % 2 owl.set_device(gpu[gpuid]) data = owl.from_numpy(samples).reshape( [227, 227, 3, samples.shape[0]]) label = owl.from_numpy(labels) #data = owl.randn([227, 227, 3, 128], 0.0, 0.01) #label = owl.randn([1000, 128], 0.0, 0.01) num_samples += data.shape[-1] (out, wgrad[gpuid], bgrad[gpuid]) = model.train_one_mb(data, label, dropout_rate) if count % 2 != 0: continue for k in range(num_weights): wgrad[0][k] += wgrad[1][k] bgrad[0][k] += bgrad[1][k] model.update(wgrad[0], bgrad[0], num_samples, mom, eps_w, wd) if count % 8 == 0: print_training_accuracy(out, label, data.shape[-1]) print "time: %s" % (time.time() - last) last = time.time() num_samples = 0 wgrad = [None] * 2 bgrad = [None] * 2
def LSTM_test(model, sents, vocab_size, words, tanhC_version=1): N = 10 K = vocab_size test_ll = 0 # For each sentence for sent_id, sent in enumerate(sents): #print "sent_id",sent_id #print "sent", sent #print "sents", sents ##### Initialize activations ##### Tau = len(sent) sent_ll = 0 # Sentence log likelihood batch_size = Tau data = [None] * Tau prev = [None] * Tau embed = np.zeros((K, 1)) embed[sent[0]] = 1 data[0] = owl.from_numpy(embed).trans() Hout = [None] * Tau Hout[0] = owl.zeros([N, 1]) act_ig = [None] * Tau act_fg = [None] * Tau act_og = [None] * Tau act_ff = [None] * Tau C = [None] * Tau C[0] = owl.zeros([N, 1]) Ym = [None] * Tau ##### Forward pass ##### # For each time step for t in range(1, Tau): prev[t] = Hout[t - 1] embed = np.zeros((K, 1)) embed[sent[t]] = 1 data[t] = owl.from_numpy(embed).trans() act_ig[t] = model.ig_weight_data.trans() * data[ t - 1] + model.ig_weight_prev.trans( ) * prev[t] + model.ig_weight_bias act_fg[t] = model.fg_weight_data.trans() * data[ t - 1] + model.fg_weight_prev.trans( ) * prev[t] + model.fg_weight_bias act_og[t] = model.og_weight_data.trans() * data[ t - 1] + model.og_weight_prev.trans( ) * prev[t] + model.og_weight_bias act_ff[t] = model.ff_weight_data.trans() * data[ t - 1] + model.ff_weight_prev.trans( ) * prev[t] + model.ff_weight_bias act_ig[t] = ele.sigm(act_ig[t]) act_fg[t] = ele.sigm(act_fg[t]) act_og[t] = ele.sigm(act_og[t]) act_ff[t] = ele.tanh(act_ff[t]) C[t] = ele.mult(act_ig[t], act_ff[t]) + ele.mult( act_fg[t], C[t - 1]) if tanhC_version: Hout[t] = ele.mult(act_og[t], ele.tanh(C[t])) else: Hout[t] = ele.mult(act_og[t], C[t]) Ym[t] = softmax(model.decoder_weights.trans() * Hout[t] + model.decoder_bias) #print "Y_0[t]",Y_o[t] #print "Y_o[t][sent[t]]",Y_o[t][sent[t]] output = Ym[t].trans() * data[t] test_ll += math.log10(max(np.sum(output.to_numpy()), 1e-20)) print test_ll test_ent = test_ll * (-1) / words test_ppl = 10**test_ent print("Test PPL = %f" % (test_ppl))
def LSTM_train(model, sents, vocab_size, words, NUM_EPOCHS=100, tanhC_version=1):
    # Constants
    ALPHA = 1       # Learning rate
    N = 10          # Number of units
    learning_rate = 1
    K = vocab_size  # Vocabulary size

    # For each epoch
    last_ll = 1e99
    last_time = time.time()
    for epoch_id in range(1, NUM_EPOCHS + 1):
        epoch_ll = 0
        # For each sentence
        for sent_id, sent in enumerate(sents):
            #print "sent_id",sent_id
            #print "sent", sent
            #print "sents", sents
            ##### Initialize activations #####
            Tau = len(sent)
            sent_ll = 0  # Sentence log likelihood
            batch_size = Tau

            data = [None] * Tau
            prev = [None] * Tau
            embed = np.zeros((K, 1))
            embed[sent[0]] = 1
            data[0] = owl.from_numpy(embed).trans()

            Hout = [None] * Tau
            Hout[0] = owl.zeros([N, 1])

            act_ig = [None] * Tau
            act_fg = [None] * Tau
            act_og = [None] * Tau
            act_ff = [None] * Tau

            C = [None] * Tau
            C[0] = owl.zeros([N, 1])
            Ym = [None] * Tau
            dY = [None] * Tau

            dBd = owl.zeros([model.Layers[2], 1])                 #dY.sum(0)
            dWd = owl.zeros([model.Layers[1], model.Layers[2]])   #Hout.transpose().dot(dY)
            dHout = [None] * Tau                                  #dY.dot(model.decoder_weights.transpose())

            ##### Forward pass #####
            # For each time step
            for t in range(1, Tau):
                prev[t] = Hout[t - 1]
                embed = np.zeros((K, 1))
                embed[sent[t]] = 1
                data[t] = owl.from_numpy(embed).trans()

                act_ig[t] = model.ig_weight_data.trans() * data[t - 1] + model.ig_weight_prev.trans() * prev[t] + model.ig_weight_bias
                act_fg[t] = model.fg_weight_data.trans() * data[t - 1] + model.fg_weight_prev.trans() * prev[t] + model.fg_weight_bias
                act_og[t] = model.og_weight_data.trans() * data[t - 1] + model.og_weight_prev.trans() * prev[t] + model.og_weight_bias
                act_ff[t] = model.ff_weight_data.trans() * data[t - 1] + model.ff_weight_prev.trans() * prev[t] + model.ff_weight_bias

                act_ig[t] = ele.sigm(act_ig[t])
                act_fg[t] = ele.sigm(act_fg[t])
                act_og[t] = ele.sigm(act_og[t])
                act_ff[t] = ele.tanh(act_ff[t])

                C[t] = ele.mult(act_ig[t], act_ff[t]) + ele.mult(act_fg[t], C[t - 1])

                if tanhC_version:
                    Hout[t] = ele.mult(act_og[t], ele.tanh(C[t]))
                else:
                    Hout[t] = ele.mult(act_og[t], C[t])
                Ym[t] = softmax(model.decoder_weights.trans() * Hout[t] + model.decoder_bias)

                dY[t] = data[t] - Ym[t]
                dBd += dY[t] / batch_size
                dWd += Hout[t] * dY[t].trans() / batch_size
                dHout[t] = model.decoder_weights * dY[t]

                #print "Y_0[t]",Y_o[t]
                #print "Y_o[t][sent[t]]",Y_o[t][sent[t]]
                #print np.sum(output.to_numpy())
                # output = Ym[t].trans() * data[t]
                # sent_ll += math.log10( max(np.sum(output.to_numpy()),1e-20) )

            ##### Initialize gradient vectors #####
            for t in range(1, Tau):
                output = Ym[t].trans() * data[t]
                sent_ll += math.log10(max(np.sum(output.to_numpy()), 1e-20))

            sen_ig = [None] * Tau
            sen_fg = [None] * Tau
            sen_og = [None] * Tau
            sen_ff = [None] * Tau

            weight_update_ig_data = owl.zeros([model.Layers[0], model.Layers[1]])
            weight_update_ig_prev = owl.zeros([model.Layers[1], model.Layers[1]])
            weight_update_ig_bias = owl.zeros([model.Layers[1], 1])

            weight_update_fg_data = owl.zeros([model.Layers[0], model.Layers[1]])
            weight_update_fg_prev = owl.zeros([model.Layers[1], model.Layers[1]])
            weight_update_fg_bias = owl.zeros([model.Layers[1], 1])

            weight_update_og_data = owl.zeros([model.Layers[0], model.Layers[1]])
            weight_update_og_prev = owl.zeros([model.Layers[1], model.Layers[1]])
            weight_update_og_bias = owl.zeros([model.Layers[1], 1])

            weight_update_ff_data = owl.zeros([model.Layers[0], model.Layers[1]])
            weight_update_ff_prev = owl.zeros([model.Layers[1], model.Layers[1]])
            weight_update_ff_bias = owl.zeros([model.Layers[1], 1])

            dHin = owl.zeros([model.Layers[1], model.Layers[1]])
            dC = [None] * Tau
            for t in xrange(Tau):
                dC[t] = owl.zeros(C[t].shape)

            # Calculate the error and add it
            for t in reversed(range(1, len(sent))):
                #print "sent",sent
                #print "t",t
                if tanhC_version:
                    tanhCt = ele.tanh(C[t])
                    sen_og[t] = ele.mult(tanhCt, dHout[t])
                    dC[t] += ele.mult((1 - ele.mult(tanhCt, tanhCt)), ele.mult(act_og[t], dHout[t]))
                else:
                    sen_og[t] = ele.mult(C[t], dHout[t])
                    dC[t] += ele.mult(act_og[t], dHout[t])

                sen_fg[t] = owl.zeros([model.Layers[1], 1])
                if t > 0:
                    sen_fg[t] = ele.mult(C[t - 1], dC[t])
                    dC[t - 1] += ele.mult(act_fg[t], dC[t])
                sen_ig[t] = ele.mult(act_ff[t], dC[t])
                sen_ff[t] = ele.mult(act_ig[t], dC[t])

                # backprop activation functions
                sen_ff[t] = ele.mult((1 - ele.mult(act_ff[t], act_ff[t])), sen_ff[t])
                sen_ig[t] = ele.mult(ele.mult(act_ig[t], (1.0 - act_ig[t])), sen_ig[t])
                sen_fg[t] = ele.mult(ele.mult(act_fg[t], (1.0 - act_fg[t])), sen_fg[t])
                sen_og[t] = ele.mult(ele.mult(act_og[t], (1.0 - act_og[t])), sen_og[t])

                # backprop matrix multiply
                weight_update_ig_data += data[t] * sen_ig[t].trans()
                weight_update_ig_prev += prev[t] * sen_ig[t].trans()
                weight_update_ig_bias += sen_ig[t]  # sen_ig[t].sum(0 or 1)

                weight_update_fg_data += data[t] * sen_fg[t].trans()
                weight_update_fg_prev += prev[t] * sen_fg[t].trans()
                weight_update_fg_bias += sen_fg[t]

                weight_update_og_data += data[t] * sen_og[t].trans()
                weight_update_og_prev += prev[t] * sen_og[t].trans()
                weight_update_og_bias += sen_og[t]

                weight_update_ff_data += data[t] * sen_ff[t].trans()
                weight_update_ff_prev += prev[t] * sen_ff[t].trans()
                weight_update_ff_bias += sen_ff[t]

                if t > 1:
                    dHout[t - 1] += model.ig_weight_prev.trans() * sen_ig[t]
                    dHout[t - 1] += model.fg_weight_prev.trans() * sen_fg[t]
                    dHout[t - 1] += model.og_weight_prev.trans() * sen_og[t]
                    dHout[t - 1] += model.ff_weight_prev.trans() * sen_ff[t]

            # normalize the gradients
            # dWLSTM /= batch_size
            weight_update_ig_prev /= batch_size
            weight_update_ig_data /= batch_size
            weight_update_ig_bias /= batch_size

            weight_update_fg_prev /= batch_size
            weight_update_fg_data /= batch_size
            weight_update_fg_bias /= batch_size

            weight_update_og_prev /= batch_size
            weight_update_og_data /= batch_size
            weight_update_og_bias /= batch_size

            weight_update_ff_prev /= batch_size
            weight_update_ff_data /= batch_size
            weight_update_ff_bias /= batch_size

            # weight update
            model.ig_weight_prev += learning_rate * weight_update_ig_prev
            model.ig_weight_data += learning_rate * weight_update_ig_data
            model.ig_weight_bias += learning_rate * weight_update_ig_bias

            model.fg_weight_prev += learning_rate * weight_update_fg_prev
            model.fg_weight_data += learning_rate * weight_update_fg_data
            model.fg_weight_bias += learning_rate * weight_update_fg_bias

            model.og_weight_prev += learning_rate * weight_update_og_prev
            model.og_weight_data += learning_rate * weight_update_og_data
            model.og_weight_bias += learning_rate * weight_update_og_bias

            model.ff_weight_prev += learning_rate * weight_update_ff_prev
            model.ff_weight_data += learning_rate * weight_update_ff_data
            model.ff_weight_bias += learning_rate * weight_update_ff_bias

            model.decoder_weights += learning_rate * dWd
            model.decoder_bias += learning_rate * dBd

            # Print results
            epoch_ll += sent_ll
            # print(" Sentence %d LL: %f" % (sent_id, sent_ll))

        epoch_ent = epoch_ll * (-1) / words
        epoch_ppl = 10 ** epoch_ent
        cur_time = time.time()
        print("Epoch %d (alpha=%f) PPL=%f" % (epoch_id, learning_rate, epoch_ppl))
        print " time consumed:", cur_time - last_time
        if last_ll > epoch_ll:
            learning_rate /= 2.0
        last_ll = epoch_ll
        last_time = cur_time
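# In the backward pass above, the cell-state error follows the standard LSTM
# recurrence: with C[t] = ig[t]*ff[t] + fg[t]*C[t-1] and (in the tanhC version)
# Hout[t] = og[t]*tanh(C[t]), the contribution of dC[t] to dC[t-1] is scaled by
# the forget gate, dC[t-1] += fg[t] * dC[t]. A small scalar finite-difference
# check of that relation, holding the gate values fixed (numbers are arbitrary
# and purely illustrative):
import numpy as np
np.random.seed(0)
ig, fg, og, ff = np.random.uniform(0.1, 0.9, 4)     # fixed gate values at step t
c_prev = 0.3
f = lambda c: og * np.tanh(ig * ff + fg * c)        # Hout[t] as a function of C[t-1]
eps = 1e-6
numeric = (f(c_prev + eps) - f(c_prev - eps)) / (2 * eps)
c_t = ig * ff + fg * c_prev
analytic = og * (1 - np.tanh(c_t) ** 2) * fg        # dHout[t]/dC[t] times fg[t]
assert abs(numeric - analytic) < 1e-8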
def LSTM_train(model, sents, words, learning_rate, EPOCH, tanhC_version=1):
    # Constants
    N = model.Layers[1]  # Number of units
    K = model.Layers[2]  # Vocabulary size

    last_time = time.time()
    # For each epoch
    for epoch_id in range(1, EPOCH + 1):
        epoch_ll = 0
        # For each sentence
        for sent_id, sent in enumerate(sents):
            #print sent_id
            #print "sent", sent
            #print "sents", sents
            ##### Initialize activations #####
            Tau = len(sent)
            sent_ll = 0  # Sentence log likelihood

            data = [None] * Tau
            Hout = [None] * Tau
            Hout[0] = owl.zeros([N, 1])

            act_ig = [None] * Tau
            act_fg = [None] * Tau
            act_og = [None] * Tau
            act_ff = [None] * Tau

            C = [None] * Tau
            C[0] = owl.zeros([N, 1])
            dY = [None] * Tau

            dBd = owl.zeros([model.Layers[2], 1])   #dY.sum(0)
            dWd = owl.zeros([model.Layers[2], model.Layers[1]])
            dHout = [None] * Tau                    #dY.dot(model.decoder_weights.transpose())
            dEmb = [None] * Tau

            ##### Forward pass #####
            # For each time step
            for t in range(1, Tau):
                # predict the (t+1)'th word from the t'th word
                data[t] = model.emb_weight[sent[t - 1]]
                NVector = np.zeros((K, 1))
                NVector[sent[t]] = 1
                target = owl.from_numpy(NVector).trans()

                act_ig[t] = model.ig_weight_data * data[t] + model.ig_weight_prev * Hout[t - 1] + model.ig_weight_cell * C[t - 1] + model.ig_weight_bias
                act_ig[t] = ele.sigm(act_ig[t])

                act_fg[t] = model.fg_weight_data * data[t] + model.fg_weight_prev * Hout[t - 1] + model.fg_weight_cell * C[t - 1] + model.fg_weight_bias
                act_fg[t] = ele.sigm(act_fg[t])

                act_ff[t] = model.ff_weight_data * data[t] + model.ff_weight_prev * Hout[t - 1] + model.ff_weight_bias
                act_ff[t] = ele.tanh(act_ff[t])

                C[t] = ele.mult(act_ig[t], act_ff[t]) + ele.mult(act_fg[t], C[t - 1])

                act_og[t] = model.og_weight_data * data[t] + model.og_weight_prev * Hout[t - 1] + model.og_weight_cell * C[t] + model.og_weight_bias
                act_og[t] = ele.sigm(act_og[t])

                if tanhC_version:
                    Hout[t] = ele.mult(act_og[t], ele.tanh(C[t]))
                else:
                    Hout[t] = ele.mult(act_og[t], C[t])

                Y = softmax(model.decoder_weights * Hout[t] + model.decoder_bias)

                # BP to Hout
                dY[t] = Y - target
                dBd += dY[t]
                dWd += dY[t] * Hout[t].trans()
                dHout[t] = model.decoder_weights.trans() * dY[t]

                # evaluation
                output = Y.to_numpy()
                # Can directly get a single element from Y
                # print output[0, sent[t]]
                sent_ll += math.log(max(output[0, sent[t]], 1e-20), 2)

                #print "Y_0[t]",Y_o[t]
                #print "Y_o[t][sent[t]]",Y_o[t][sent[t]]
                #print np.sum(output.to_numpy())
                # output = Ym[t].trans() * data[t]
                # sent_ll += math.log10( max(np.sum(output.to_numpy()),1e-20) )

            ##### Initialize gradient vectors #####
            weight_update_ig_data = owl.zeros([model.Layers[1], model.Layers[0]])
            weight_update_ig_prev = owl.zeros([model.Layers[1], model.Layers[1]])
            weight_update_ig_cell = owl.zeros([model.Layers[1], model.Layers[1]])
            weight_update_ig_bias = owl.zeros([model.Layers[1], 1])

            weight_update_fg_data = owl.zeros([model.Layers[1], model.Layers[0]])
            weight_update_fg_prev = owl.zeros([model.Layers[1], model.Layers[1]])
            weight_update_fg_cell = owl.zeros([model.Layers[1], model.Layers[1]])
            weight_update_fg_bias = owl.zeros([model.Layers[1], 1])

            weight_update_og_data = owl.zeros([model.Layers[1], model.Layers[0]])
            weight_update_og_prev = owl.zeros([model.Layers[1], model.Layers[1]])
            weight_update_og_cell = owl.zeros([model.Layers[1], model.Layers[1]])
            weight_update_og_bias = owl.zeros([model.Layers[1], 1])

            weight_update_ff_data = owl.zeros([model.Layers[1], model.Layers[0]])
            weight_update_ff_prev = owl.zeros([model.Layers[1], model.Layers[1]])
            weight_update_ff_bias = owl.zeros([model.Layers[1], 1])

            dC = [None] * Tau
            for t in xrange(Tau):
                dC[t] = owl.zeros(C[t].shape)

            # Calculate the error and add it
            for t in reversed(range(1, Tau)):
                #print "sent",sent
                #print "t",t

                # BP from og controled gate and og
                if tanhC_version:
                    tanhC = ele.tanh(C[t])
                    dTanhC = ele.mult(dHout[t], act_og[t])
                    sen_og = ele.mult(dHout[t], tanhC)
                    dC[t] += ele.mult((1 - ele.mult(tanhC, tanhC)), dTanhC)
                else:
                    sen_og = ele.mult(C[t], dHout[t])
                    dC[t] += ele.mult(act_og[t], dHout[t])

                # BP from og
                sen_og = ele.mult(ele.mult(act_og[t], (1.0 - act_og[t])), sen_og)
                dHout[t - 1] = model.og_weight_prev.trans() * sen_og
                dC[t] += model.og_weight_cell.trans() * sen_og
                dEmb[t] = model.og_weight_data.trans() * sen_og

                # BP from fg controled gate
                sen_fg = ele.mult(C[t - 1], dC[t])
                dC[t - 1] += ele.mult(act_fg[t], dC[t])

                # BP from ig controled gate
                sen_ig = ele.mult(act_ff[t], dC[t])
                sen_ff = ele.mult(act_ig[t], dC[t])
                sen_ff = ele.mult((1 - ele.mult(act_ff[t], act_ff[t])), sen_ff)
                dEmb[t] += model.ff_weight_data.trans() * sen_ff

                # BP from fg
                sen_fg = ele.mult(ele.mult(act_fg[t], (1.0 - act_fg[t])), sen_fg)
                dHout[t - 1] += model.fg_weight_prev.trans() * sen_fg
                dC[t - 1] += model.fg_weight_cell.trans() * sen_fg
                dEmb[t] += model.fg_weight_data.trans() * sen_fg

                # BP from ig
                sen_ig = ele.mult(ele.mult(act_ig[t], (1.0 - act_ig[t])), sen_ig)
                dHout[t - 1] += model.ig_weight_prev.trans() * sen_ig
                dC[t - 1] += model.ig_weight_cell.trans() * sen_ig
                dEmb[t] += model.ig_weight_data.trans() * sen_ig

                # derivatives on weight matrix and bias
                weight_update_ig_data += sen_ig * data[t].trans()
                weight_update_ig_prev += sen_ig * Hout[t - 1].trans()
                weight_update_ig_cell += sen_ig * C[t - 1].trans()
                weight_update_ig_bias += sen_ig

                weight_update_fg_data += sen_fg * data[t].trans()
                weight_update_fg_prev += sen_fg * Hout[t - 1].trans()
                weight_update_fg_cell += sen_fg * C[t - 1].trans()
                weight_update_fg_bias += sen_fg

                weight_update_og_data += sen_og * data[t].trans()
                weight_update_og_prev += sen_og * Hout[t - 1].trans()
                weight_update_og_cell += sen_og * C[t].trans()
                weight_update_og_bias += sen_og

                weight_update_ff_data += sen_ff * data[t].trans()
                weight_update_ff_prev += sen_ff * Hout[t - 1].trans()
                weight_update_ff_bias += sen_ff

            # normalize the gradients
            rate = learning_rate / Tau

            # weight update
            model.ig_weight_prev -= rate * weight_update_ig_prev
            model.ig_weight_data -= rate * weight_update_ig_data
            model.ig_weight_cell -= rate * weight_update_ig_cell
            model.ig_weight_bias -= rate * weight_update_ig_bias

            model.fg_weight_prev -= rate * weight_update_fg_prev
            model.fg_weight_data -= rate * weight_update_fg_data
            model.fg_weight_cell -= rate * weight_update_fg_cell
            model.fg_weight_bias -= rate * weight_update_fg_bias

            model.og_weight_prev -= rate * weight_update_og_prev
            model.og_weight_data -= rate * weight_update_og_data
            model.og_weight_cell -= rate * weight_update_og_cell
            model.og_weight_bias -= rate * weight_update_og_bias

            model.ff_weight_prev -= rate * weight_update_ff_prev
            model.ff_weight_data -= rate * weight_update_ff_data
            model.ff_weight_bias -= rate * weight_update_ff_bias

            model.decoder_weights -= rate * dWd
            model.decoder_bias -= rate * dBd

            for t in range(1, Tau):
                model.emb_weight[sent[t - 1]] -= rate * dEmb[t]

            # Print results
            epoch_ll += sent_ll
            # print(" Sentence %d LL: %f" % (sent_id, sent_ll))

        epoch_ent = epoch_ll * (-1) / words
        epoch_ppl = 2 ** epoch_ent
        cur_time = time.time()
        print("Epoch %d (alpha=%f) PPL=%f" % (epoch_id, learning_rate, epoch_ppl))
        print " time consumed:", cur_time - last_time
        last_time = cur_time

    return model, learning_rate
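# This variant backpropagates through a softmax decoder starting from
# dY[t] = Y - target, which is exactly the gradient of the negative log
# likelihood with respect to the pre-softmax logits. A quick numpy
# finite-difference check of that identity on random numbers (illustrative
# only; softmax_np/loss are local helpers, not part of the model code):
import numpy as np
np.random.seed(1)
logits = np.random.randn(5)
onehot = np.zeros(5)
onehot[2] = 1
softmax_np = lambda z: np.exp(z - z.max()) / np.exp(z - z.max()).sum()
loss = lambda z: -np.log(softmax_np(z)[2])          # NLL of the target class
eps = 1e-6
numeric = np.array([(loss(logits + eps * np.eye(5)[i]) - loss(logits - eps * np.eye(5)[i])) / (2 * eps)
                    for i in range(5)])
analytic = softmax_np(logits) - onehot              # the "Y - target" rule
assert np.allclose(numeric, analytic, atol=1e-6)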
def forward(self, from_btm, to_top, phase):
    predict = from_btm[self.btm_names[0]].argmax(0)
    ground_truth = owl.from_numpy(from_btm[self.btm_names[1]]).reshape(predict.shape)
    self.batch_size = from_btm[self.btm_names[0]].shape[1]
    correct = (predict - ground_truth).count_zero()
    self.acc = correct * 1.0 / self.batch_size
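# A minimal numpy sketch of the same accuracy computation, assuming the bottom
# blob is a (num_classes x batch_size) score matrix and the labels are a row of
# integer class ids (hypothetical toy shapes for illustration only):
import numpy as np
probs = np.array([[0.1, 0.7, 0.2],
                  [0.6, 0.2, 0.3],
                  [0.3, 0.1, 0.5]])   # 3 classes x 3 samples
labels = np.array([1, 0, 0])
pred = probs.argmax(0)                # highest-scoring class per column
acc = np.mean(pred == labels)         # same as count_zero(pred - labels) / batch_size
print "accuracy:", acc                # 2 of 3 correct -> 0.666...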
# training parameters
epsilon = 0.01
momentum = 0.9

num_epochs = 20
batch_size = 64
num_batches = data.shape[1] // batch_size

# model parameters
num_vis = data.shape[0]
num_hid = 128

# initialize weights
np.random.seed(1234)
weights = owl.from_numpy(0.1 * np.random.randn(num_vis, num_hid)).trans()
#weights = 0.1 * owl.randn([num_vis, num_hid],0,1)
bias_v = owl.zeros([1, num_vis])
bias_h = owl.zeros([1, num_hid])

# initialize weight updates
d_weights = owl.zeros((num_vis, num_hid))
d_bias_v = owl.zeros([1, num_vis])
d_bias_h = owl.zeros([1, num_hid])

start_time = time.time()
for epoch in range(num_epochs):
    print("Epoch %i" % (epoch + 1))
    err = []
    weights_old = weights
    for batch in range(num_batches):