def __init__(self, n_visible, n_hidden=None, vistype='sigmoid', hidtype='sigmoid', W=None, hbias=None, vbias=None, batch_size=128):
    """Initialize RBM parameters.

    n_visible -- number of visible units
    n_hidden -- number of hidden units; defaults to n_visible
    vistype, hidtype -- activation types of the visible/hidden units
    W, hbias, vbias -- optional pre-trained weights / biases (converted
                       to gnumpy arrays)
    batch_size -- minibatch size used during training
    """
    # initialize parameters
    self.SIZE_LIMIT = 80000000  # the size of the largest gpu array
    self.vistype = vistype
    self.hidtype = hidtype
    self.batch_size = batch_size
    self.n_visible = n_visible
    if n_hidden is None:
        n_hidden = self.n_visible
    self.n_hidden = n_hidden
    # uniform init bound: 2.38 / sqrt(total number of parameters)
    n = self.n_visible * self.n_hidden + self.n_hidden
    bound = 2.38 / np.sqrt(n)
    if W is None:
        # vectorized draw replaces the original element-by-element loops
        W = np.random.uniform(-bound, bound, size=(self.n_visible, self.n_hidden))
    # convert here so provided and generated weights are handled
    # consistently (mirrors the vbias handling below)
    self.W = gp.garray(W)
    if vbias is None:
        vbias = gp.zeros(self.n_visible)
    else:
        vbias = gp.garray(vbias)
    self.vbias = vbias
    if hbias is None:
        # vectorized draw replaces the original per-element loop
        hbias = np.random.uniform(-bound, bound, size=self.n_hidden)
    self.hbias = gp.garray(hbias)
    # initialize updates (momentum buffers)
    self.wu_vh = gp.zeros((self.n_visible, self.n_hidden))
    self.wu_v = gp.zeros(self.n_visible)
    self.wu_h = gp.zeros(self.n_hidden)
def computeNumericGradient(self, input, factor=1.0, eps=1e-4, sampleNum=500):
    """
    Compute gradients numerically for gradient checking.

    The gradient of J w.r.t. a parameter x is approximated by the central
    difference (J(x+eps) - J(x-eps)) / (2*eps); only `sampleNum` randomly
    chosen parameter positions are checked.
    J = 0.5*(a[0]-a[-1])**2 + WeightCost

    Returns (grad, sample): the numeric gradients and the sampled indices.
    """
    param = self.combineParam()
    plen = param.size
    if factor == 0:
        # only sample from the first half of the parameters in this case;
        # floor division keeps randint's upper bound an int (Py2/Py3 safe)
        plen //= 2
    sample = np.random.randint(0, plen, sampleNum)
    grad = gp.zeros(sampleNum)
    for (i, idx) in enumerate(sample):
        if i % 100 == 0:  # progress indicator
            sys.stdout.write('.')
            sys.stdout.flush()
        q = gp.zeros(param.shape)
        q[idx] = eps
        c1, _ = self.getCost(param + q, input, factor)
        c2, _ = self.getCost(param - q, input, factor)
        grad[i] = (c1 - c2) / (2.0 * eps)
    print("end")
    return grad, sample
def __init__(self, config, name):
    """Build an autoencoder from a config section.

    Reads hidden/visible dimensions, allocates zero biases, and draws the
    weights uniformly in [-r, r] with r = sqrt(6)/sqrt(v + h + 1).
    """
    super(AE, self).__init__(config, name)
    # dimension of hidden layer
    self.hDim = int(self.readField(config, name, "hidden_dimension"))
    # dimension of visible layer
    self.vDim = int(self.readField(config, name, "visible_dimension"))
    # bias for hidden layer
    if self.hDim > 0:
        self.b1 = gp.zeros(self.hDim)
    # bias for visible layer
    if self.vDim > 0:
        self.b2 = gp.zeros(self.vDim)
    # init weight: uniform between +-sqrt(6)/sqrt(v+h+1)
    if self.hDim * self.vDim > 0:
        gp.seed_rand()
        r = gp.sqrt(6) / gp.sqrt(self.hDim + self.vDim + 1)
        # bug fix: gp.rand draws from U[0,1) so rand*2r - r is uniform in
        # [-r, r]; the original used gp.randn (a normal draw), which does
        # not match the uniform init the comment and sqrt(6) bound call for
        self.W1 = gp.rand(self.vDim, self.hDim) * 2 * r - r
        self.W2 = gp.rand(self.hDim, self.vDim) * 2 * r - r
    self.initUpdate()
    self.initHyperParam(config, name)
def init_params(self, w_scale=0.01, b_scale=0.0):
    """Reset this layer's parameters: scaled Gaussian weights, zero biases."""
    w_shape = (self.dim_input, self.dim_output)
    b_shape = (1, self.dim_output)
    self.params['W'] = w_scale * gp.randn(w_shape)
    self.params['b'] = gp.zeros(b_shape)
    # gradient buffers start at zero, matching the parameter shapes
    self.grads['W'] = gp.zeros(w_shape)
    self.grads['b'] = gp.zeros(b_shape)
    return
def check_against_exact():
    """Sanity-check fang.Statistics against the exact Fisher information
    of a small random RBM: regression weights from the statistics object
    must match RegressionWeights.from_maximum_likelihood, and its unary
    covariance must match the corresponding exact Fisher block."""
    with misc.gnumpy_conversion_check('allow'):
        rbm = test_tractable.random_rbm(NVIS, NHID)
        G, s = tractable.exact_fisher_information(rbm, return_mean=True, batch_units=2)
        rw = fisher.RegressionWeights.from_maximum_likelihood(G, NVIS, NHID)
        G, s = gnp.garray(G), gnp.garray(s)
        # second-moment matrix: G plus the outer product of the means
        S = G + np.outer(s, s)
        # unary (single-unit) statistics: visible units first, then hidden
        m_unary = s[:NVIS + NHID]
        S_unary = S[:NVIS + NHID, :NVIS + NHID]
        # pairwise statistics for each (visible, hidden) pair: the mean
        # vector and 3x3 second-moment block of (vis, hid, vis*hid)
        m_pair = gnp.zeros((NVIS, NHID, 3))
        S_pair = gnp.zeros((NVIS, NHID, 3, 3))
        for i in range(NVIS):
            for j in range(NHID):
                vis_idx = i
                hid_idx = NVIS + j
                # pairwise entries are laid out row-major after the unary ones
                vishid_idx = NVIS + NHID + NHID * i + j
                idxs = np.array([vis_idx, hid_idx, vishid_idx])
                m_pair[i, j, :] = s[idxs]
                S_pair[i, j, :] = S[idxs[:, nax], idxs[nax, :]]
        stats = fang.Statistics(m_unary, S_unary, m_pair, S_pair)
        beta, sigma_sq = stats.compute_regression_weights()
        assert np.allclose(beta, rw.beta)
        assert np.allclose(sigma_sq, rw.sigma_sq)
        Sigma = stats.unary_covariance()
        assert np.max(np.abs(Sigma - G[:NVIS + NHID, :NVIS + NHID])) < 1e-6
def generation_on_a_line(net, n_points=100, imsz=[28, 28], nrows=10, h_seeds=None):
    """Generate samples along straight lines in hidden space and display them.

    With no h_seeds, interpolates between two freshly sampled hidden
    vectors; otherwise interpolates between each consecutive pair of
    seeds, wrapping from the last back to the first.
    """
    if h_seeds is None:
        endpoints = net.sample_hiddens(2)
        z = gnp.zeros((n_points, endpoints.shape[1]))
        delta = (endpoints[1] - endpoints[0]) / (n_points - 1)
        for k in range(n_points):
            z[k] = endpoints[0] + delta * k
    else:
        n_seeds = h_seeds.shape[0]
        z = gnp.zeros((n_points * n_seeds, h_seeds.shape[1]))
        for s in range(n_seeds):
            start = h_seeds[s]
            # step towards the next seed (wrapping around at the end)
            delta = (h_seeds[(s + 1) % n_seeds] - start) / (n_points - 1)
            for k in range(n_points):
                z[s * n_points + k] = start + delta * k
    x = net.generate_samples(z=z)
    vt.bwpatchview(x.asarray(), imsz, nrows, rowmajor=True, gridintensity=1)
def __init__(self, initialWeights, initialBiases, initialGenBiases, outputActFunct, realValuedVis = False, useReLU = False, max_norm=-1, noises = [], dropout_adv = 0.0):
    """Build a deep net from pre-initialized weights and biases.

    initialWeights/initialBiases -- per-layer weight matrices and biases
    initialGenBiases -- generative (downward) biases used in pre-training
    outputActFunct -- activation-function object for the output layer
    realValuedVis -- if True, use a smaller learning rate for layer 0
    useReLU -- choose ReLU vs. sigmoid hidden units (and matching RBM type)
    max_norm -- max-norm constraint value (-1 presumably disables it; confirm)
    noises -- per-layer noise settings (NOTE(review): mutable default
              argument is shared across calls; callers should pass their own)
    dropout_adv -- additional dropout parameter (semantics defined elsewhere)
    """
    self.realValuedVis = realValuedVis
    # per-layer training hyperparameters, one entry per weight matrix
    self.learnRates = [0.05 for i in range(len(initialWeights))]
    self.momentum = 0.9
    self.L2Costs = [0.0001 for i in range(len(initialWeights))]
    self.dropouts = [0 for i in range(len(initialWeights))]
    self.nesterov = False
    self.nestCompare = False
    self.rmsLims = [None for i in range(len(initialWeights))]
    if self.realValuedVis:
        # real-valued visible units get a smaller layer-0 learning rate
        self.learnRates[0] = 0.005
    self.weights = initialWeights
    self.biases = initialBiases
    self.genBiases = initialGenBiases
    if useReLU:
        self.RBMHidUnitType = RBMReLU()
        self.hidActFuncts = [ReLU() for i in range(len(self.weights) - 1)]
    else:
        self.RBMHidUnitType = RBMBinary()
        self.hidActFuncts = [Sigmoid() for i in range(len(self.weights) - 1)]
    self.outputActFunct = outputActFunct
    #state variables modified in bprop
    self.WGrads = [gnp.zeros(self.weights[i].shape) for i in range(len(self.weights))]
    self.biasGrads = [gnp.zeros(self.biases[i].shape) for i in range(len(self.biases))]
    self.max_norm = max_norm
    self.noises = noises
    self.dropout_adv = dropout_adv
def __init__(self, config, name):
    """Build an autoencoder from a config section.

    Reads hidden/visible dimensions, allocates zero biases, and draws the
    weights uniformly in [-r, r] with r = sqrt(6)/sqrt(v + h + 1).
    """
    super(AE, self).__init__(config, name)
    # dimension of hidden layer
    self.hDim = int(self.readField(config, name, "hidden_dimension"))
    # dimension of visible layer
    self.vDim = int(self.readField(config, name, "visible_dimension"))
    # bias for hidden layer
    if self.hDim > 0:
        self.b1 = gp.zeros(self.hDim)
    # bias for visible layer
    if self.vDim > 0:
        self.b2 = gp.zeros(self.vDim)
    # init weight: uniform between +-sqrt(6)/sqrt(v+h+1)
    if self.hDim * self.vDim > 0:
        gp.seed_rand()
        r = gp.sqrt(6) / gp.sqrt(self.hDim + self.vDim + 1)
        # bug fix: gp.rand draws from U[0,1) so rand*2r - r is uniform in
        # [-r, r]; the original used gp.randn (a normal draw), which does
        # not match the uniform init the comment and sqrt(6) bound call for
        self.W1 = gp.rand(self.vDim, self.hDim) * 2 * r - r
        self.W2 = gp.rand(self.hDim, self.vDim) * 2 * r - r
    self.initUpdate()
    self.initHyperParam(config, name)
def singlePathNumericGrad(self, saes, inputs, factor=1, sampleNum=500, eps=1e-4):
    """Gradient check for a single path via central finite differences.

    saes: (my_sae, other_sae) -- only the first path's params are perturbed
    inputs: (my_input, other_input)
    The other path's parameters are fixed, so its forward activations are
    computed once up front; each perturbed cost adds the difference loss
    against those fixed top-layer activations.
    Returns (grad, sample): numeric gradients and sampled parameter indices.
    """
    mysae, osae = saes
    myinput, oinput = inputs
    myparam = mysae.combineParam(down=False)  # aes[0] is None
    oas = osae.forward(oinput)
    sample = np.random.randint(0, myparam.size, sampleNum)
    grad = gp.zeros(sampleNum)
    last = self.depth - 1
    for i, idx in enumerate(sample):
        if i % 100 == 0:  # progress dots
            sys.stdout.write('.')
            sys.stdout.flush()
        offset = gp.zeros(myparam.shape)
        offset[idx] = eps
        c_plus, acts = mysae.getCost(myparam + offset, myinput, factor)
        c_plus += self.getDiffLoss(acts[last], oas[last])
        c_minus, acts = mysae.getCost(myparam - offset, myinput, factor)
        c_minus += self.getDiffLoss(acts[last], oas[last])
        grad[i] = (c_plus - c_minus) / (2.0 * eps)
    return grad, sample
def __init__(self, initialWeights, initialBiases, initialGenBiases, outputActFunct, realValuedVis = False, useReLU = False):
    """Build a deep net from pre-initialized weights and biases.

    initialWeights/initialBiases -- per-layer weight matrices and biases
    initialGenBiases -- generative (downward) biases used in pre-training
    outputActFunct -- activation-function object for the output layer
    realValuedVis -- if True, use a smaller learning rate for layer 0
    useReLU -- choose ReLU vs. sigmoid hidden units (and matching RBM type)
    """
    self.realValuedVis = realValuedVis
    # per-layer training hyperparameters, one entry per weight matrix
    self.learnRates = [0.05 for i in range(len(initialWeights))]
    self.momentum = 0.9
    self.L2Costs = [0.0001 for i in range(len(initialWeights))]
    self.dropouts = [0.0 for i in range(len(initialWeights))]
    self.nesterov = False
    self.nestCompare = False
    self.rmsLims = [None for i in range(len(initialWeights))]
    if self.realValuedVis:
        self.learnRates[0] = 0.005 # TODO - This should not be set here - should be an optional variable
    self.weights = initialWeights
    self.biases = initialBiases
    self.genBiases = initialGenBiases #### FIXME - generative biases - does this mean input biases or something like that?
    if useReLU:
        self.RBMHidUnitType = RBMReLU()
        self.hidActFuncts = [ReLU() for i in range(len(self.weights) - 1)]
    else:
        self.RBMHidUnitType = RBMBinary()
        self.hidActFuncts = [Sigmoid() for i in range(len(self.weights) - 1)]
    self.outputActFunct = outputActFunct
    # State variables modified in bprop
    self.WGrads = [gnp.zeros(self.weights[i].shape) for i in range(len(self.weights))]
    self.biasGrads = [gnp.zeros(self.biases[i].shape) for i in range(len(self.biases))]
def get_scores(rbm, batch_units=10, show_progress=False):
    """Enumerate every hidden configuration of `rbm` in batches and return
    their free energies as a (num_batches, batch_size) array.

    Configurations are split into a fixed prefix per batch plus all
    2**batch_units suffix combinations evaluated at once.
    """
    nhid = rbm.nhid
    assert nhid <= 30  # full enumeration is 2**nhid configurations
    prefix_len = nhid - batch_units
    batch_size = 2 ** batch_units
    prefixes = combinations_array(prefix_len)
    num_batches = prefixes.shape[0]
    hid = gnp.zeros((batch_size, nhid))
    # the suffix columns are the same for every batch; fill them once
    hid[:, prefix_len:] = combinations_array(batch_units)
    scores = gnp.zeros((num_batches, batch_size))
    pbar = misc.pbar(num_batches) if show_progress else None
    for i, prefix in enumerate(prefixes):
        hid[:, :prefix_len] = prefix
        scores[i, :] = rbm.free_energy_hid(hid)
        if pbar is not None:
            pbar.update(i)
    if pbar is not None:
        pbar.finish()
    return scores
def __init__(self, layer_sizes, scale=0.05, verbose=1, l2=0.0001, momentum=0.9, epochs=20, batch_size=256, dropouts=0.0, learning_rate=0.01, learning_rate_decays=0.9):
    """Set up a deep net's hyperparameters and parameter/gradient buffers.

    layer_sizes -- unit counts per layer, input layer first
    scale -- weight-init scale passed to init_weights_matrix
    verbose -- verbosity level (bug fix: previously hard-coded to 1,
               silently ignoring this argument)
    l2, momentum, epochs, batch_size, learning_rate, learning_rate_decays
        -- usual SGD hyperparameters
    dropouts -- single dropout rate replicated for every non-output layer
    """
    self.layer_sizes = layer_sizes
    self.scale = scale
    self.verbose = verbose  # was: self.verbose = 1 (argument ignored)
    self.l2 = l2
    self.momentum = momentum
    self.epochs = epochs
    self.batch_size = batch_size
    self.dropouts = [dropouts for l in range(len(layer_sizes) - 1)]
    self.learning_rate = learning_rate
    self.learning_rate_decays = learning_rate_decays
    shapes = [(layer_sizes[i - 1], layer_sizes[i]) for i in range(1, len(layer_sizes))]
    self.biases = init_biases_matrix(layer_sizes)
    self.weights = init_weights_matrix(shapes, scale)
    self.rms_limits = [None for i in range(len(self.weights))]
    self.hidden_functions = [self.hidden_function for i in range(len(self.weights) - 1)]
    # running L2 norms of the gradients (presumably RMSprop-style
    # accumulators -- confirm against the update rule)
    self.weight_grads_l2_norm = [gnp.ones(weight.shape) for weight in self.weights]
    # NOTE(review): 'bias_gradis_l2_norm' is a typo ('gradis'), kept as-is
    # because renaming the attribute would break external users
    self.bias_gradis_l2_norm = [gnp.ones(bias.shape) for bias in self.biases]
    self.weight_grads = [gnp.zeros(weight.shape) for weight in self.weights]
    self.bias_grads = [gnp.zeros(bias.shape) for bias in self.biases]
def __init__(self, initialWeights, initialBiases, initialGenBiases, outputActFunct, realValuedVis = False, useReLU = False):
    """Build a deep net from pre-initialized weights and biases.

    initialWeights/initialBiases -- per-layer weight matrices and biases
    initialGenBiases -- generative (downward) biases used in pre-training
    outputActFunct -- activation-function object for the output layer
    realValuedVis -- if True, use a smaller learning rate for layer 0
    useReLU -- choose ReLU vs. sigmoid hidden units (and matching RBM type)
    """
    self.realValuedVis = realValuedVis
    # per-layer training hyperparameters, one entry per weight matrix
    self.learnRates = [0.05 for i in range(len(initialWeights))]
    self.momentum = 0.9
    self.L2Costs = [0.0001 for i in range(len(initialWeights))]
    self.dropouts = [0 for i in range(len(initialWeights))]
    self.nesterov = False
    self.nestCompare = False
    self.rmsLims = [None for i in range(len(initialWeights))]
    if self.realValuedVis:
        # real-valued visible units get a smaller layer-0 learning rate
        self.learnRates[0] = 0.005
    self.weights = initialWeights
    self.biases = initialBiases
    self.genBiases = initialGenBiases
    if useReLU:
        self.RBMHidUnitType = RBMReLU()
        self.hidActFuncts = [ReLU() for i in range(len(self.weights) - 1)]
    else:
        self.RBMHidUnitType = RBMBinary()
        self.hidActFuncts = [Sigmoid() for i in range(len(self.weights) - 1)]
    self.outputActFunct = outputActFunct
    #state variables modified in bprop
    self.WGrads = [gnp.zeros(self.weights[i].shape) for i in range(len(self.weights))]
    self.biasGrads = [gnp.zeros(self.biases[i].shape) for i in range(len(self.biases))]
def costAndGradSFO(self,stack,datums):
    """
    Wrapper function used for SFO optimizer.
    Averages cost and gradient over the given utterance keys (datums),
    pushing `stack` to the GPU first and pulling the averaged gradient
    back to numpy arrays at the end.  Skipped utterances are dropped
    from the average.
    """
    N = len(datums)
    cost = 0.
    # per-layer [weight, bias] gradient accumulators, zero-initialized
    grad = [[gp.zeros(w.shape),gp.zeros(b.shape)] for w,b in self.stack]
    # Push stack to device
    self.stack = [[gp.garray(w),gp.garray(b)] for w,b in stack]
    for datum in datums:
        data = gp.garray(self.data_dict[datum])
        labels = np.array(self.alis[datum], dtype=np.int32)
        costSingle,gradSingle,skip = self.costAndGrad(data,labels)
        if skip:
            print "LOGGING SKIP" #TODO what to do here?
            N -= 1
            continue
        grad = [[gs[0]+g[0],gs[1]+g[1]] for gs,g in zip(gradSingle,grad)]
        cost += costSingle
        # Have to force GC the gpu... gnumpy lameness
        gp.free_reuse_cache()
    # Pull gradient from device
    grad = [[((1./N)*gw).as_numpy_array(), ((1./N)*gb).as_numpy_array()] for gw,gb in grad]
    cost *= 1./N
    return cost,grad
def _initialize(self, matrix):
    """Initialize the Gibbs-sampling count matrices.

    matrix -- (n_docs, vocab_size) document-term count matrix.
    Every word occurrence gets a random initial topic, and the
    document-topic / topic-word co-occurrence counters are updated
    accordingly.
    """
    n_docs, vocab_size = matrix.shape
    print "initializing state matrices"
    # number of times document m and topic z co-occur
    self.nmz = gpu.zeros((n_docs, self.n_topics))
    # number of times topic z and word w co-occur
    self.nzw = gpu.zeros((self.n_topics, vocab_size))
    self.nm = gpu.zeros(n_docs)  # total word count per document
    self.nz = gpu.zeros(self.n_topics)  # total word count per topic
    self.topics = {}  # maps (document index, word position) -> topic
    print "populating state matrices"
    time = datetime.now() # begin timer
    for m in xrange(n_docs):
        # i is a number between 0 and doc_length-1
        # w is a number between 0 and vocab_size-1
        for i, w in enumerate(word_indices(matrix[m, :])):
            # choose an arbitrary topic as first topic for word i
            z = np.random.randint(self.n_topics)
            self.nmz[m,z] += 1
            self.nm[m] += 1
            self.nzw[z,w] += 1
            self.nz[z] += 1
            self.topics[(m,i)] = z
    print datetime.now() - time # end timer
def singlePathNumericGrad(self, saes, inputs, factor=1, sampleNum=500, eps=1e-4):
    """Numerically estimate the gradient for one path of a paired model.

    saes   -- (my_sae, other_sae); only my_sae's parameters are perturbed
    inputs -- (my_input, other_input)
    The other path is held fixed, so its activations are computed once;
    only the difference loss against them enters each perturbed cost.
    Returns (grad, sample): central-difference gradients and the sampled
    parameter indices.
    """
    own_sae, other_sae = saes
    own_input, other_input = inputs
    theta = own_sae.combineParam(down=False)  # aes[0] is None
    # fixed activations of the other path, computed once
    fixed_acts = other_sae.forward(other_input)
    sample = np.random.randint(0, theta.size, sampleNum)
    grad = gp.zeros(sampleNum)
    top = self.depth - 1

    def perturbed_cost(params):
        # total cost = own path cost + cross-path difference loss
        c, acts = own_sae.getCost(params, own_input, factor)
        return c + self.getDiffLoss(acts[top], fixed_acts[top])

    for i, idx in enumerate(sample):
        if i % 100 == 0:  # progress dots
            sys.stdout.write('.')
            sys.stdout.flush()
        delta = gp.zeros(theta.shape)
        delta[idx] = eps
        grad[i] = (perturbed_cost(theta + delta) - perturbed_cost(theta - delta)) / (2.0 * eps)
    return grad, sample
def exact_fisher_information_biases(rbm, batch_units=10, show_progress=False):
    """Exact Fisher information matrix restricted to the bias parameters
    (visible then hidden) of an RBM, computed by summing over all hidden
    configurations in batches of 2**batch_units."""
    batch_size = 2 ** batch_units
    nvis, nhid = rbm.nvis, rbm.nhid
    num_params = nvis + nhid
    s = gnp.zeros(num_params)  # running mean of the sufficient statistics
    G = gnp.zeros((num_params, num_params))  # running second moment
    for hid, p in iter_configurations(rbm, batch_units=batch_units, show_progress=show_progress):
        g = gnp.zeros((batch_size, num_params))
        # visible units are marginalized analytically: use their
        # conditional means given the hidden configuration
        cond_vis = gnp.logistic(rbm.vis_inputs(hid))
        g[:, :nvis] = cond_vis
        g[:, nvis:] = hid
        s += gnp.dot(p, g)
        G += gnp.dot(g.T * p, g)
        # diagonal correction for the conditional variance of the visible
        # units (the hid entries are 0/1, so their g*(1-g) term vanishes)
        diag_term = gnp.dot(p, g * (1. - g))
        G += np.diag(diag_term.as_numpy_array())
    # subtract the outer product of the means to obtain the covariance
    G -= s[:, nax] * s[nax, :]
    return G
def ConvDown(hidActs, filters, moduleStride, paddingStart):
    """Backward (down) convolution pass.

    ConvNet.convDown requires the filter count (filters.shape[3], which
    must match the hidden-activation channel count hidActs.shape[0]) to be
    a multiple of 16, so smaller counts are zero-padded up to 16 before
    delegating.  The zero-padded extra filters contribute nothing to the
    result.  This generalizes the original duplicated special cases for
    exactly 1 and exactly 3 channels to any matching count below 16.

    Raises Exception when the counts neither match below 16 nor are both
    multiples of 16 (same contract as before).
    """
    n = filters.shape[3]
    if n == hidActs.shape[0] and 0 < n < 16:
        # zero-pad both operands up to 16 channels/filters
        padded_hid = gp.zeros(
            (16, hidActs.shape[1], hidActs.shape[2], hidActs.shape[3]))
        padded_hid[:n, :, :, :] = hidActs
        padded_filt = gp.zeros(
            (filters.shape[0], filters.shape[1], filters.shape[2], 16))
        padded_filt[:, :, :, :n] = filters
        return ConvNet.convDown(padded_hid, padded_filt,
                                moduleStride=moduleStride,
                                paddingStart=paddingStart)
    elif filters.shape[3] % 16 == 0 and hidActs.shape[0] % 16 == 0:
        return ConvNet.convDown(hidActs, filters, moduleStride, paddingStart)
    else:
        raise Exception("Hidden or Filters Mode 16")
def localUp(images, filters, count_unused=False):
    """Reference (slow, loop-based) forward pass for a locally-connected
    layer: every module position (y1, y2) has its own filter bank.

    images  -- (numChannels, imSizeX, imSizeX, numImages)
    filters -- (numModulesX, numModulesX, numFilterChannels, filterSizeX,
                filterSizeX, numFilters)
    Returns (numFilters, numModulesX, numModulesX, numImages) activations.
    If count_unused, prints how many filter weights only ever multiplied
    padding zeros.
    """
    #assert paddingStart <= 0
    numChannels, imSizeX, imSizeX, numImages = images.shape
    numModulesX, numModulesX, numFilterChannels, filterSizeX, filterSizeX, numFilters = filters.shape
    assert numModulesX <= imSizeX
    moduleStride = 1
    # padding inferred so the module grid lines up with the image
    paddingStart = -(numModulesX - imSizeX + filterSizeX - 1)
    #numModulesX = (abs(paddingStart) + imSizeX - filterSizeX + 1)
    numModules = numModulesX**2
    numGroups = 1
    targets = g.zeros((numFilters, numModulesX, numModulesX, numImages))
    # zero-padded copy of the input images
    images2 = g.zeros((numChannels, imSizeX+2*abs(paddingStart), imSizeX+2*abs(paddingStart), numImages))
    if paddingStart != 0:
        images2[:, abs(paddingStart):-abs(paddingStart), abs(paddingStart):-abs(paddingStart), :] = images
    else:
        images2 = images
    used=0
    for i in range(numImages):
        for f in range(numFilters):
            for c in range(numChannels):
                for y1 in range(numModulesX):
                    for y2 in range(numModulesX):
                        for u1 in range(filterSizeX):
                            for u2 in range(filterSizeX):
                                x1 = y1 + u1
                                x2 = y2 + u2
                                targets[f, y1, y2, i] += \
                                    filters[y1, y2, c ,u1,u2,f] * \
                                    images2[c,x1,x2,i]
                                # if images2 is exactly zero, it means we're the victims of padding.
                                used += (images2[c,x1,x2,i]!=0)
    if count_unused:
        unused = numImages*filters.size - used
        assert unused % numImages == 0
        print 'localUp: num unused filters: %s' % (unused / numImages)
    return targets
def localDown(hidActs, filters, paddingStart=0):
    """Reference (slow, loop-based) backward/down pass for a locally-
    connected layer: scatter hidden activations back to image space.

    hidActs -- (numFilters, numModulesX, numModulesX, numImages)
    filters -- (numModulesX, numModulesX, numFilterChannels, filterSizeX,
                filterSizeX, numFilters)
    paddingStart -- non-positive padding offset used in the forward pass
    Returns (numChannels, imSizeX, imSizeX, numImages) reconstructed images.
    """
    numGroups = 1
    moduleStride = 1
    assert paddingStart <= 0
    numFilters, numModulesX, numModulesX, numImages = hidActs.shape
    numModulesX, numModulesX, numFilterChannels, filterSizeX, filterSizeX, numFilters = filters.shape
    # what about the stride? I don't support stride. I don't like it.
    #paddingStart = -(numModulesX - imSizeX + filterSizeX + 1)
    #numModulesX = (abs(paddingStart) + imSizeX - filterSizeX + 1)
    # recover the unpadded image size from the module grid and filter size
    imSizeX = numModulesX - abs(paddingStart) + filterSizeX - 1
    numChannels = numFilterChannels * numGroups
    numModules = numModulesX**2
    targets = g.zeros((numChannels, imSizeX, imSizeX, numImages))
    # padded accumulator; the unpadded region is cropped out at the end
    targets2 = g.zeros((numChannels, imSizeX + 2 * abs(paddingStart), imSizeX + 2 * abs(paddingStart), numImages))
    numImgColors = numChannels
    #numFilters, numModulesX, numModulesX, numImages = hidActs.shape
    #numFilterChannels, filterSizeX, filterSizeX, numFilters = filters.shape
    moduleStride = 1
    numModulesX = (abs(paddingStart) + imSizeX - filterSizeX + 1)
    numModules = numModulesX**2
    numGroups = 1
    #targets = g.zeros((numFilters, numModulesX, numModulesX, numImages))
    for i in range(numImages):
        for f in range(numFilters):
            for c in range(numChannels):
                for y1 in range(numModulesX):
                    for y2 in range(numModulesX):
                        for u1 in range(filterSizeX):
                            for u2 in range(filterSizeX):
                                x1 = y1 + u1
                                x2 = y2 + u2
                                # targets[f, y1, y2, i] += \
                                #     filters[c ,u1,u2,f] * \
                                #     images2[c,x1,x2,i]
                                # transpose of localUp: each hidden unit
                                # adds its filter into the padded image
                                targets2[c,x1,x2,i] += \
                                    filters[y1, y2, c ,u1,u2,f] * \
                                    hidActs[f, y1, y2, i]
    if paddingStart != 0:
        # crop away the padding border
        targets[:] = targets2[:, abs(paddingStart):-abs(paddingStart), abs(paddingStart):-abs(paddingStart), :]
    else:
        targets = targets2
    return targets
def _load_from_stream(self, f):
    """Load this layer's parameters from a binary stream.

    Layout: two int32 values (param_id, layer_dim) followed by two
    float32 vectors of length layer_dim (gamma, then beta).  Gradient
    buffers are re-allocated as zeros.
    """
    self._param_id, layer_dim = struct.unpack('ii', f.read(4*2))
    # np.frombuffer replaces the deprecated np.fromstring (identical byte
    # layout; gnp.garray copies, so the read-only buffer view is fine)
    self.gamma = gnp.garray(np.frombuffer(f.read(layer_dim * 4), dtype=np.float32))
    self.beta = gnp.garray(np.frombuffer(f.read(layer_dim * 4), dtype=np.float32))
    self.param_size = self.gamma.size + self.beta.size
    self.gamma_grad = gnp.zeros(self.gamma.size)
    self.beta_grad = gnp.zeros(self.beta.size)
def grad_costfunc_gpu_ReLU(x, *args):
    """Gradient of the ReLU autoencoder cost w.r.t. the flattened weights.

    x    -- flattened (weights1, weights2), each with a leading bias column
    args -- (num_input, num_hidden, num_output, inputs, lambda_val,
             sparsityParam, beta); sparsityParam and beta are unpacked but
             unused here (no sparsity term in the ReLU variant)
    Returns the flattened gradient as a numpy array.
    """
    num_input, num_hidden, num_output, inputs, lambda_val, sparsityParam, beta = args
    num_weights1 = (num_input + 1) * num_hidden
    num_weights2 = (num_hidden + 1) * num_output
    x = gpu.garray(x)
    inputs = gpu.garray(inputs)
    # unpack the two weight matrices (bias column included)
    weights1 = x[0:num_weights1].reshape((num_hidden, num_input + 1))
    weights2 = x[num_weights1:shape(x)[0]].reshape(
        (num_output, num_hidden + 1))
    nData = shape(inputs)[1]
    # prepend a row of ones for the bias
    data = gpu.concatenate((gpu.ones((1, nData)), inputs), axis=0)
    hidden_sum = gpu.dot(weights1, data)
    #hidden_activation = gpu.log(1+hidden_sum.exp())
    # ReLU: the positive-preactivation mask doubles as the derivative
    relu_mask_hidden1 = gpu.ones(shape(hidden_sum)) * (hidden_sum > 0)
    hidden_activation = hidden_sum * relu_mask_hidden1
    #hidden_derivative = hidden_sum.logistic()
    hidden_derivative = relu_mask_hidden1
    hidden_activation = gpu.concatenate((gpu.ones(
        (1, nData)), hidden_activation), axis=0)
    hidden_derivative = gpu.concatenate((gpu.ones(
        (1, nData)), hidden_derivative), axis=0)
    outputs = gpu.dot(weights2, hidden_activation)
    weights1_grad = gpu.zeros(shape(weights1))
    weights2_grad = gpu.zeros(shape(weights2))
    # backprop: p is the output-layer error (linear output, squared loss)
    p = outputs - inputs
    weights2_grad += gpu.dot(
        p, gpu.garray(transpose(hidden_activation.as_numpy_array())))
    q_temp = gpu.dot(gpu.garray(transpose(weights2.as_numpy_array())), p)
    #q = multiply(multiply(q_temp,hidden_activation),(1-hidden_activation))
    q = q_temp * hidden_derivative
    delta2 = gpu.dot(q, gpu.garray(transpose(data.as_numpy_array())))
    # drop the bias row of the backpropagated error
    weights1_grad += delta2[1:shape(delta2)[0], :]
    weights1_grad = weights1_grad / nData
    weights2_grad = weights2_grad / nData
    # L2 weight decay on everything except the bias column
    weights1_grad[:, 1:shape(weights1_grad)[1]] = weights1_grad[:, 1:shape(
        weights1_grad)[1]] + weights1[:, 1:shape(weights1)[1]] * lambda_val
    weights2_grad[:, 1:shape(weights2_grad)[1]] = weights2_grad[:, 1:shape(
        weights2_grad)[1]] + weights2[:, 1:shape(weights2)[1]] * lambda_val
    #weights1_grad = reshape(weights1_grad, num_weights1)
    weights1_grad = weights1_grad.reshape(num_weights1)
    #weights2_grad = reshape(weights2_grad, num_weights2)
    weights2_grad = weights2_grad.reshape(num_weights2)
    # free GPU temporaries explicitly before returning
    del x
    del inputs
    del data
    del p
    del q_temp
    del q
    del delta2
    del hidden_sum
    del hidden_activation
    del weights1
    del weights2
    gpu.free_reuse_cache()
    return hstack(
        (weights1_grad.as_numpy_array(), weights2_grad.as_numpy_array()))
def initUpdate(self):
    """Allocate zero-filled increment buffers for biases and weights."""
    hidden, visible = self.hDim, self.vDim
    # increments for the hidden and visible biases
    self.incb1 = gp.zeros(hidden)
    self.incb2 = gp.zeros(visible)
    # increments for the encoder (W1) and decoder (W2) weight matrices
    self.incW1 = gp.zeros((visible, hidden))
    self.incW2 = gp.zeros((hidden, visible))
def dev_loss(A, dev_type=1, use_shepherd=0):
    """DEV regularizer, cool stuff.

    Penalizes the variance of an ensemble of activation matrices.
    A -- list of b_reps activation matrices, one per ensemble sample
    dev_type -- transform applied before comparing: 1=norm, 2=tanh, 3=line
    use_shepherd -- if 1, deviations are measured against A[0] (the
                    "shepherd") instead of the ensemble mean
    Returns {'L': scalar loss, 'dLdA': list of per-sample gradients}.
    """
    b_reps = len(A)
    b_obs = A[0].shape[0]
    # forward transform of every ensemble sample
    At = []
    for i in range(b_reps):
        if (dev_type == 1):
            At.append(norm_trans(A[i],'ff'))
        elif (dev_type == 2):
            At.append(tanh_trans(A[i],'ff'))
        elif (dev_type == 3):
            At.append(line_trans(A[i],'ff'))
        else:
            raise Exception('Unknown DEV types.')
    # Compute the mean activations for this ensemble sample
    N = float(A[0].shape[1])
    n = float(b_reps)
    m = float(b_obs * b_reps * N)  # normalizer: total activation count
    Am = gp.zeros(At[0].shape)
    if (use_shepherd != 1):
        for i in range(b_reps):
            Am = Am + At[i]
        Am = Am / float(b_reps)
    else:
        # shepherd mode: deviations are taken w.r.t. the first sample
        Am = At[0]
    # Compute difference from mean of each set of droppy activations
    Ad = [(At[i] - Am) for i in range(b_reps)]
    L = sum([gp.sum(ad**2.0) for ad in Ad]) / m
    dLdA = []
    if (use_shepherd != 1):
        Add = gp.zeros(At[0].shape)
        for i in range(b_reps):
            Add = Add + Ad[i]
        for i in range(b_reps):
            # gradient of the mean-squared deviation w.r.t. sample i,
            # accounting for sample i's own contribution to the mean
            dLdA.append(-(2.0/m) * ((((1.0/n) - 1.0) * Ad[i]) + \
                ((1.0/n) * (Add - Ad[i]))))
    else:
        for i in range(b_reps):
            if (i == 0):
                dLdA.append(gp.zeros(Ad[0].shape))
            else:
                dLdA.append((2.0 / m) * Ad[i])
        # the shepherd receives the negated sum of the other gradients
        for i in range(1,b_reps):
            dLdA[0] = dLdA[0] - dLdA[i]
    # Backpropagate gradient on variance through the desired transform
    for i in range(b_reps):
        BP = {'X': A[i], 'A': At[i], 'dLdA': dLdA[i]}
        if (dev_type == 1):
            dLdA[i] = norm_trans(BP, 'bp')
        elif (dev_type == 2):
            dLdA[i] = tanh_trans(BP, 'bp')
        elif (dev_type == 3):
            dLdA[i] = line_trans(BP, 'bp')
    return {'L': L, 'dLdA': dLdA}
def dev_loss(A, dev_type=1, use_shepherd=0):
    """DEV regularizer, cool stuff.

    Penalizes the variance of an ensemble of activation matrices.
    A -- list of b_reps activation matrices, one per ensemble sample
    dev_type -- transform applied before comparing: 1=norm, 2=tanh, 3=line
    use_shepherd -- if 1, deviations are measured against A[0] (the
                    "shepherd") instead of the ensemble mean
    Returns {'L': scalar loss, 'dLdA': list of per-sample gradients}.
    """
    b_reps = len(A)
    b_obs = A[0].shape[0]
    # forward transform of every ensemble sample
    At = []
    for i in range(b_reps):
        if (dev_type == 1):
            At.append(norm_trans(A[i], 'ff'))
        elif (dev_type == 2):
            At.append(tanh_trans(A[i], 'ff'))
        elif (dev_type == 3):
            At.append(line_trans(A[i], 'ff'))
        else:
            raise Exception('Unknown DEV types.')
    # Compute the mean activations for this ensemble sample
    N = float(A[0].shape[1])
    n = float(b_reps)
    m = float(b_obs * b_reps * N)  # normalizer: total activation count
    Am = gp.zeros(At[0].shape)
    if (use_shepherd != 1):
        for i in range(b_reps):
            Am = Am + At[i]
        Am = Am / float(b_reps)
    else:
        # shepherd mode: deviations are taken w.r.t. the first sample
        Am = At[0]
    # Compute difference from mean of each set of droppy activations
    Ad = [(At[i] - Am) for i in range(b_reps)]
    L = sum([gp.sum(ad**2.0) for ad in Ad]) / m
    dLdA = []
    if (use_shepherd != 1):
        Add = gp.zeros(At[0].shape)
        for i in range(b_reps):
            Add = Add + Ad[i]
        for i in range(b_reps):
            # gradient of the mean-squared deviation w.r.t. sample i,
            # accounting for sample i's own contribution to the mean
            dLdA.append(-(2.0/m) * ((((1.0/n) - 1.0) * Ad[i]) + \
                ((1.0/n) * (Add - Ad[i]))))
    else:
        for i in range(b_reps):
            if (i == 0):
                dLdA.append(gp.zeros(Ad[0].shape))
            else:
                dLdA.append((2.0 / m) * Ad[i])
        # the shepherd receives the negated sum of the other gradients
        for i in range(1, b_reps):
            dLdA[0] = dLdA[0] - dLdA[i]
    # Backpropagate gradient on variance through the desired transform
    for i in range(b_reps):
        BP = {'X': A[i], 'A': At[i], 'dLdA': dLdA[i]}
        if (dev_type == 1):
            dLdA[i] = norm_trans(BP, 'bp')
        elif (dev_type == 2):
            dLdA[i] = tanh_trans(BP, 'bp')
        elif (dev_type == 3):
            dLdA[i] = line_trans(BP, 'bp')
    return {'L': L, 'dLdA': dLdA}
def pre_wuw_wu(self, frame_number, static_dimension, var_base):
    """Assemble block-diagonal W'U^-1 W and W'U^-1 matrices, one block per
    static feature dimension (presumably for MLPG-style parameter
    generation -- confirm against pre_compute_wuw).

    frame_number -- number of frames T
    static_dimension -- number of static feature dimensions D
    var_base -- variances, 3 consecutive entries (static, delta,
                delta-delta) per dimension
    Returns (wuw_mat, wu_mat) of shapes (T*D, T*D) and (T*D, 3*T*D).
    """
    wuw_mat = gnp.zeros((frame_number*static_dimension, frame_number*static_dimension))
    wu_mat = gnp.zeros((frame_number*static_dimension, 3*frame_number*static_dimension))
    for i in xrange(static_dimension):
        # static, delta and delta-delta variances for dimension i
        temp_var_base = [var_base[i*3], var_base[i*3+1], var_base[i*3+2]]
        temp_wuw, temp_wu = self.pre_compute_wuw(frame_number, temp_var_base)
        # place the per-dimension blocks along the (block) diagonal
        wuw_mat[frame_number*i:frame_number*(i+1), frame_number*i:frame_number*(i+1)] = gnp.garray(temp_wuw[:])
        wu_mat[frame_number*i:frame_number*(i+1), frame_number*i:frame_number*(i+3)] = gnp.garray(temp_wu[:])
    return wuw_mat, wu_mat
def dbn_supervised_predict_exact(ws_vh, ws_v, ws_h, x):
    """
    Predict the class label of input x from supervised DBN
    Uses the exact method mentioned in section 6.2 of Hinton, Osindero, Teh 2006
    The free energy formula is taken from http://deeplearning.net/tutorial/rbm.html
    x: Input data. (NxD matrix)

    ws_vh, ws_v, ws_h: per-layer weight matrices, visible biases and
    hidden biases of the DBN; the last layer is the supervised top RBM.
    Returns an (N x K) matrix of class probabilities.
    """
    L = len(ws_vh)
    N = x.shape[0]
    # make a forward pass to get from input layer to visible layer of top level
    # RBM
    h_prev = x.T
    # forward (bottom-up) pass, (use deterministic (we pass the activations, not
    # the stochastically sampled steps) forward pass)
    for l in range(L - 1):
        ah = gnp.dot(ws_vh[l].T, h_prev) + ws_h[l]
        h_prev = gnp.logistic(ah)
    H = ws_vh[-1].shape[0]  # number of visible units top level RBM
    Hx = h_prev.shape[0]  # number of hidden units in the penultimate layer
    K = H - Hx  # (H - Hx) is the number of supervised inputs to top level RBM
    # for every class, assume it is the correct label and calculate its free energy
    y = gnp.zeros((K, N))
    free_energy = gnp.zeros((N, K))  # we actually calculate -free_energy
    for k in range(K):
        # set the current assumed class label
        y[k, :] = 1.0
        # visible unit vector
        v = gnp.concatenate((y, h_prev))
        e_v = gnp.dot(ws_v[-1].T, v)  # bias energy term
        ah = gnp.dot(ws_vh[-1].T, v) + ws_h[-1]
        # softplus term: sum over hidden units of log(1 + exp(input))
        e_h = gnp.sum(gnp.log(gnp.exp(ah) + 1.0), axis=0)
        free_energy[:, k] = e_v + e_h
        # zero the class labels for next iteration
        y[:, :] = 0.0
    # since these numbers may get pretty small, use the sum-exp trick for converting
    # these to probabilities
    pred_y = (
        gnp.exp(free_energy - gnp.max(free_energy, axis=1)[:, gnp.newaxis])
        / gnp.sum(gnp.exp(free_energy - gnp.max(free_energy, axis=1)[:, gnp.newaxis]), axis=1)[:, gnp.newaxis]
    )
    return pred_y
def __init__(self):
    """Create a Layer which contains no input, output, delta or gradient."""
    # Inputs, outputs and deltas default to None so that getters and
    # setters can simply be inherited by subclasses.
    self.input = self.output = self.delta = None
    # Gradient and parameter stores start out as empty GPU arrays.
    self.gradient = gpu.zeros((0, 0))
    self.parameters = gpu.zeros((0, 0))
def __init__(self):
    """Construct an empty Layer: no input, output, delta or gradient yet."""
    # None placeholders let inherited getters and setters work unchanged.
    for attr in ('input', 'output', 'delta'):
        setattr(self, attr, None)
    # Empty GPU arrays stand in for gradient/parameters until configured.
    self.gradient = gpu.zeros((0, 0))
    self.parameters = gpu.zeros((0, 0))
def exact_moments(rbm, batch_units=10, show_progress=False):
    """Exact model moments of an RBM: expectations of the visible units,
    the hidden units, and their outer product, accumulated over all
    hidden configurations (enumerated in batches of 2**batch_units)."""
    vis_moment = gnp.zeros(rbm.nvis)
    hid_moment = gnp.zeros(rbm.nhid)
    prod_moment = gnp.zeros((rbm.nvis, rbm.nhid))
    for hid, p in iter_configurations(rbm, batch_units=batch_units,
                                      show_progress=show_progress):
        # visible units are integrated out analytically via their
        # conditional means given the hidden configuration
        cond_vis = gnp.logistic(rbm.vis_inputs(hid))
        vis_moment += gnp.dot(p, cond_vis)
        hid_moment += gnp.dot(p, hid)
        prod_moment += gnp.dot(cond_vis.T * p, hid)
    return binary_rbms.Moments(vis_moment, hid_moment, prod_moment)
def __init__(self, layerSizes=None, outputActFunct=Linear(), useReLU = True, \
        initialWeights=None, initialBiases=None, targMean=None, targStd=None):
    """
    Construct a Neural Network object with Basic Structure:
    - layerSizes: [input size, hidden layer size list, output size]
    - outputActFunct: activation function for output layer, such as Linear() and LinearMasked()
      (NOTE(review): the default Linear() instance is shared across all
      constructions -- confirm Linear is stateless)
    - useReLU: True/False, use ReLU() or Sigmoid() as activation function
    - initialWeights/initialBiases: optional pre-trained parameters; when
      omitted, weights are drawn with per-layer scale sqrt(2/n) and biases
      start at zero
    - targMean/targStd: target normalization statistics stored with the model
    """
    self.layerSizes = layerSizes
    self.outputActFunct = outputActFunct
    self.useReLU = useReLU
    if useReLU:
        self.hidActFuncts = [ReLU() for i in range(len(layerSizes) - 2)]
    else:
        self.hidActFuncts = [Sigmoid() for i in range(len(layerSizes) - 2)]
    # initialize weights and biases
    if initialWeights is None:
        # set wscale for each layer according to 0.5*n*Var(w) = 1
        scale_list = [num.sqrt(2.0 / n) for n in layerSizes[:-1]]
        shapes = [(layerSizes[i - 1], layerSizes[i])
                  for i in range(1, len(layerSizes))]
        self.weights = [
            gnp.garray(
                initWeightMatrix(shapes[i], scale_list[i], None, False))
            for i in range(len(shapes))
        ]
    else:
        self.weights = initialWeights
    if initialBiases is None:
        # zero biases (0 * rand keeps the (1, layer_size) shape)
        self.biases = [
            gnp.garray(0 * num.random.rand(1, self.layerSizes[i]))
            for i in range(1, len(self.layerSizes))
        ]
    else:
        self.biases = initialBiases
    # initialize gradients of weights and biases
    self.WGrads = [
        gnp.zeros(self.weights[i].shape) for i in range(len(self.weights))
    ]
    self.biasGrads = [
        gnp.zeros(self.biases[i].shape) for i in range(len(self.biases))
    ]
    # specify targMean and targStd with model since they are important model parameters
    #assert(len(targMean) == layerSizes[-1])
    self.targMean = targMean
    #assert(len(targStd) == layerSizes[-1])
    self.targStd = targStd
def localUp(images, filters, count_unused=False):
    """Reference (slow, loop-based) forward pass for a locally-connected
    layer: every module position (y1, y2) has its own filter bank.

    images  -- (numChannels, imSizeX, imSizeX, numImages)
    filters -- (numModulesX, numModulesX, numFilterChannels, filterSizeX,
                filterSizeX, numFilters)
    Returns (numFilters, numModulesX, numModulesX, numImages) activations.
    If count_unused, prints how many filter weights only ever multiplied
    padding zeros.
    """
    #assert paddingStart <= 0
    numChannels, imSizeX, imSizeX, numImages = images.shape
    numModulesX, numModulesX, numFilterChannels, filterSizeX, filterSizeX, numFilters = filters.shape
    assert numModulesX <= imSizeX
    moduleStride = 1
    # padding inferred so the module grid lines up with the image
    paddingStart = -(numModulesX - imSizeX + filterSizeX - 1)
    #numModulesX = (abs(paddingStart) + imSizeX - filterSizeX + 1)
    numModules = numModulesX**2
    numGroups = 1
    targets = g.zeros((numFilters, numModulesX, numModulesX, numImages))
    # zero-padded copy of the input images
    images2 = g.zeros((numChannels, imSizeX + 2 * abs(paddingStart), imSizeX + 2 * abs(paddingStart), numImages))
    if paddingStart != 0:
        images2[:, abs(paddingStart):-abs(paddingStart), abs(paddingStart):-abs(paddingStart), :] = images
    else:
        images2 = images
    used = 0
    for i in range(numImages):
        for f in range(numFilters):
            for c in range(numChannels):
                for y1 in range(numModulesX):
                    for y2 in range(numModulesX):
                        for u1 in range(filterSizeX):
                            for u2 in range(filterSizeX):
                                x1 = y1 + u1
                                x2 = y2 + u2
                                targets[f, y1, y2, i] += \
                                    filters[y1, y2, c ,u1,u2,f] * \
                                    images2[c,x1,x2,i]
                                # if images2 is exactly zero, it means we're the victims of padding.
                                used += (images2[c, x1, x2, i] != 0)
    if count_unused:
        unused = numImages * filters.size - used
        assert unused % numImages == 0
        print 'localUp: num unused filters: %s' % (unused / numImages)
    return targets
def preTrainIth(self, i, minibatchStream, epochs, mbPerEpoch):
    """Greedy CD pre-training of layer i.

    i -- index of the layer to train
    minibatchStream -- iterator yielding input minibatches
    epochs, mbPerEpoch -- training schedule
    Yields the mean per-case reconstruction error once per epoch.
    """
    # fresh update buffers for this layer's weights and biases
    self.dW = gnp.zeros(self.weights[i].shape)
    self.dvb = gnp.zeros(self.genBiases[i].shape)
    self.dhb = gnp.zeros(self.biases[i].shape)
    for ep in range(epochs):
        recErr = 0
        totalCases = 0
        for j in range(mbPerEpoch):
            # next() builtin works in both Py2 (.next) and Py3 (__next__),
            # unlike the original minibatchStream.next() call
            inpMB = next(minibatchStream)
            curRecErr = self.CDStep(inpMB, i, self.learnRates[i],
                                    self.momentum, self.L2Costs[i])
            recErr += curRecErr
            totalCases += inpMB.shape[0]
        yield recErr / float(totalCases)
def grad_costfunc_gpu(x, *args):
    """GPU gradient of a sparse-autoencoder cost w.r.t. the flattened weights.

    x: flat parameter vector holding weights1 then weights2, where column 0
       of each weight matrix is the bias term.
    args: (num_input, num_hidden, num_output, inputs, lambda_val,
           sparsityParam, beta) — inputs is (num_input x nData).
    Returns a flat numpy gradient vector matching x's layout.
    """
    num_input,num_hidden,num_output,inputs,lambda_val,sparsityParam,beta = args
    num_weights1 = (num_input+1)*num_hidden
    num_weights2 = (num_hidden+1)*num_output
    x = gpu.garray(x)
    inputs = gpu.garray(inputs)
    # unpack the two weight matrices from the flat vector
    weights1 = x[0:num_weights1].reshape((num_hidden,num_input+1))
    weights2 = x[num_weights1:shape(x)[0]].reshape((num_output,num_hidden+1))
    nData = shape(inputs)[1]
    # prepend a row of ones for the bias
    data = gpu.concatenate((gpu.ones((1,nData)), inputs), axis = 0)
    hidden_sum = gpu.dot(weights1, data)
    hidden_activation = hidden_sum.logistic()
    # mean activation of each hidden unit, for the KL sparsity penalty
    p_avg = gpu.sum(hidden_activation,axis=1)/nData
    grad_sparse = -1*sparsityParam/p_avg.as_numpy_array() + (1-sparsityParam)/(1-p_avg.as_numpy_array())
    # the leading 0 aligns the penalty with the bias row added below
    grad_sparse = append(0,grad_sparse)
    grad_sparse = tile(grad_sparse, (nData, 1))
    grad_sparse = gpu.garray(transpose(grad_sparse))
    hidden_activation = gpu.concatenate((gpu.ones((1,nData)), hidden_activation), axis = 0)
    outputs = gpu.dot(weights2, hidden_activation)
    weights1_grad = gpu.zeros(shape(weights1))
    weights2_grad = gpu.zeros(shape(weights2))
    # output-layer error (linear output, squared-error cost)
    p = outputs-inputs
    weights2_grad += gpu.dot(p, gpu.garray(transpose(hidden_activation.as_numpy_array())))
    # backpropagate through the logistic hidden layer, adding sparsity term
    q_temp = gpu.dot(gpu.garray(transpose(weights2.as_numpy_array())),p) + beta*grad_sparse
    #q = multiply(multiply(q_temp,hidden_activation),(1-hidden_activation))
    q = (q_temp*hidden_activation)*(1-hidden_activation)
    delta2 = gpu.dot(q, gpu.garray(transpose(data.as_numpy_array())))
    # drop the row corresponding to the constant bias unit
    weights1_grad += delta2[1:shape(delta2)[0], :]
    weights1_grad = weights1_grad/nData
    weights2_grad = weights2_grad/nData
    # weight decay on everything except the bias column (column 0)
    weights1_grad[:,1:shape(weights1_grad)[1]] = weights1_grad[:,1:shape(weights1_grad)[1]] + weights1[:,1:shape(weights1)[1]] * lambda_val
    weights2_grad[:,1:shape(weights2_grad)[1]] = weights2_grad[:,1:shape(weights2_grad)[1]] + weights2[:,1:shape(weights2)[1]] * lambda_val
    #weights1_grad = reshape(weights1_grad, num_weights1)
    weights1_grad = weights1_grad.reshape(num_weights1)
    #weights2_grad = reshape(weights2_grad, num_weights2)
    weights2_grad = weights2_grad.reshape(num_weights2)
    # explicitly release GPU temporaries before returning
    del x
    del inputs
    del data
    del grad_sparse
    del p
    del q_temp
    del q
    del delta2
    del hidden_sum
    del hidden_activation
    del weights1
    del weights2
    gpu.free_reuse_cache()
    return hstack((weights1_grad.as_numpy_array(),weights2_grad.as_numpy_array()))
def train(self):
    """Greedily train a stack of RBMs with CD-1, one weight matrix at a time.

    For each (epochs, hidden size) pair in (self.epochs, self.weights_to_do),
    trains an RBM on minibatches of self.X, logs per-epoch reconstruction
    error, then feeds this RBM's hidden layer as the next RBM's visible
    layer. Returns self.trained_weights, a list of
    [weights, hidden biases] numpy pairs.
    """
    self.time_interval = 0
    t1 = time.time()
    cd = 1  # number of contrastive-divergence steps (CD-1)
    for current_epochs, weight_size in zip(self.epochs, self.weights_to_do):
        self.initialize_weights(weight_size)
        for epoch in xrange(current_epochs):
            error = 0
            for start_idx in range(0, self.X.shape[0], self.batch_size):
                # fresh update accumulators for this minibatch
                self.w_updt = gpu.zeros((self.input, weight_size))
                self.bias_h_updt = gpu.zeros((1, weight_size))
                self.bias_v_updt = gpu.zeros((1, self.input))
                self.allocate_batch(start_idx)
                self.input_original = self.get_visible_vector(self.batch)
                self.input_dropped = self.input_original
                self.positive_phase()
                self.gibbs_updates(weight_size)
                for j in range(cd):
                    self.negative_phase()
                # apply batch-averaged updates scaled by the learning rate
                self.w += self.alpha * self.w_updt / float(
                    self.current_batch_size)
                self.bias_h += self.alpha * self.bias_h_updt / float(
                    self.current_batch_size)
                self.bias_v += self.alpha * self.bias_v_updt / float(
                    self.current_batch_size)
                # time only the error computation itself
                t0 = time.time()
                error += gpu.mean(
                    (self.input_dropped - self.input_original)**2)
                self.time_interval += time.time() - t0
            s = 'EPOCH: ' + str(epoch + 1)
            self.log_message(s)
            s = 'Reconstruction error: ' + str(
                error / (self.X.shape[0] / float(self.batch_size)))
            self.log_message(s)
        self.trained_weights.append(
            [self.w.as_numpy_array(), self.bias_h.as_numpy_array()])
        # the next RBM's visible layer is this RBM's hidden layer
        self.input = self.w.shape[1]
    print 'Time interval: ' + str(self.time_interval)
    print 'Training time: ' + str(time.time() - t1)
    self.free_GPU_memory()
    return self.trained_weights
def __init__(self,model,alpha=1e-2,minibatch=256, optimizer='momentum',momentum=0.9): self.model = model assert self.model is not None, "Must define a function to optimize" self.it = 0 self.momentum = momentum # momentum self.alpha = alpha # learning rate self.minibatch = minibatch # minibatch self.optimizer = optimizer if self.optimizer == 'momentum' or self.optimizer == 'nesterov': print "Using %s.."%self.optimizer self.velocity = [[gp.zeros(w.shape),gp.zeros(b.shape)] for w,b in self.model.stack] elif self.optimizer == 'adagrad' or self.optimizer == 'adagrad3' or self.optimizer == 'adadelta': print "Using %s.."%self.optimizer self.gradt = [[gp.zeros(w.shape),gp.zeros(b.shape)] for w,b in self.model.stack] elif self.optimizer == 'adaccel2': print "Using adaccel2.." self.gradt = [[gp.zeros(w.shape),gp.zeros(b.shape)] for w,b in self.model.stack] self.velocity = [[gp.zeros(w.shape),gp.zeros(b.shape)] for w,b in self.model.stack] elif self.optimizer == 'sgd': print "Using sgd.." else: raise ValueError("Invalid optimizer") self.costt = [] self.expcost = []
def __init__(self, layer_dim=None):
    """Per-layer scale/shift parameters (gamma, beta) with gradient buffers.

    With layer_dim=None the object is left uninitialized (placeholder).
    Otherwise gamma starts at one, beta at zero, gradients at zero, and the
    instance receives a unique id from the class-wide counter.
    """
    if layer_dim is None:
        # bare placeholder — caller will populate fields later
        return
    self.gamma = gnp.ones(layer_dim)
    self.gamma_grad = gnp.zeros(layer_dim)
    self.beta = gnp.zeros(layer_dim)
    self.beta_grad = gnp.zeros(layer_dim)
    self.param_size = self.gamma.size + self.beta.size
    # hand out a unique parameter id, then bump the shared counter
    self._param_id = LayerParams._param_count
    LayerParams._param_count += 1
def convOutp(images, hidActs, paddingStart = 0):
    """Reference (slow, loop-based) weight-gradient pass of a conv layer.

    images:  (numChannels, imSizeX, imSizeX, numImages) inputs
    hidActs: (numFilters, numModulesX, numModulesX, numImages) output-side
             gradients/activations
    paddingStart: non-positive padding offset used in the forward pass
    Returns filter gradients (numFilterChannels, filterSizeX, filterSizeX,
    numFilters), summed over all images and module positions.
    """
    numGroups = 1
    moduleStride = 1
    assert paddingStart <= 0
    numFilters, numModulesX, numModulesX, numImages = hidActs.shape
    numChannels, imSizeX, imSizeX, numImages = images.shape
    numFilterChannels = numChannels / numGroups
    # recover the filter size implied by the image/module geometry
    filterSizeX = imSizeX - numModulesX + abs(paddingStart) + 1
    targets = g.zeros((numFilterChannels, filterSizeX, filterSizeX, numFilters))
    numImgColors = numChannels
    # zero-pad the images by |paddingStart| on each side
    images2 = g.zeros((numChannels, imSizeX+2*abs(paddingStart),
                       imSizeX+2*abs(paddingStart), numImages))
    if paddingStart != 0:
        images2[:, abs(paddingStart):-abs(paddingStart),
                abs(paddingStart):-abs(paddingStart), :] = images
    else:
        images2 = images
    # accumulate the outer product of hidden activations and image patches
    for i in range(numImages):
        for f in range(numFilters):
            for c in range(numChannels):
                for y1 in range(numModulesX):
                    for y2 in range(numModulesX):
                        for u1 in range(filterSizeX):
                            for u2 in range(filterSizeX):
                                x1 = y1 + u1
                                x2 = y2 + u2
                                # targets[f, y1, y2, i] += \
                                #     filters[c ,u1,u2,f] * \
                                #     images2[c,x1,x2,i]
                                targets[c ,u1,u2,f] += \
                                    hidActs[f, y1, y2, i] * \
                                    images2[c,x1,x2,i]
    return targets
def __init__(self, to_port):
    """Create a new bias connection feeding into `to_port`.

    A bias has no source port (from_port is None); its parameters, output
    and gradient are all (1 x to_port.size) zero arrays, and its input is
    an empty placeholder.
    """
    # Properly initialize the Bias via the base connection class
    AbstractConnection.__init__(self, None, to_port)
    size = to_port.size
    self.dimensions = (1, size)
    self.parameters = gpu.zeros((1, size))
    self.output = gpu.zeros((1, size))
    self.gradient = gpu.zeros(self.dimensions)
    # a bias takes no real input; keep an empty array as placeholder
    self.input = gpu.zeros((0, 0))
def preTrainIth(self, i, minibatchStream, epochs, mbPerEpoch):
    """Pre-train layer i with contrastive divergence; a generator.

    After each epoch (mbPerEpoch minibatches drawn from minibatchStream),
    yields the mean per-case reconstruction error for that epoch.
    """
    # zero the CD gradient accumulators for this layer
    self.dW = gnp.zeros(self.weights[i].shape)
    self.dvb = gnp.zeros(self.genBiases[i].shape)
    self.dhb = gnp.zeros(self.biases[i].shape)
    for _epoch in range(epochs):
        epochErr, nCases = 0, 0
        for _mb in range(mbPerEpoch):
            batch = minibatchStream.next()
            # one CD step returns the batch reconstruction error
            epochErr += self.CDStep(batch, i, self.learnRates[i],
                                    self.momentum, self.L2Costs[i])
            nCases += batch.shape[0]
        yield epochErr/float(nCases)
def test_gnumpy(dat, num_epochs):
    """Benchmark: train a logistic-logistic RBM on `dat` with CD-1.

    dat: 2-D data array (cases x dimensions), values assumed in [0, 1].
    Prints mean squared reconstruction error and wall time per epoch and
    returns the learned (w_vh, w_v, w_h) parameters.
    """
    import gnumpy as gpu
    import numpy
    import time
    # load data. <dat> is 2 dimensional: 60000 X 784
    #dat = gpu.garray(load('mnist_cudaTest').T/255.)
    # training parameters
    epsilon = 0.1
    momentum = 0.9
    batch_size = 128
    num_batches = dat.shape[0] / batch_size
    # model parameters
    num_vis = dat.shape[1]
    num_hid = 4096
    # initialize weights
    w_vh = 0.1 * gpu.randn(num_vis, num_hid)
    w_v = gpu.zeros(num_vis)
    w_h = -4. * gpu.ones(num_hid)  # negative hidden bias keeps units sparse
    # initialize weight updates
    wu_vh = gpu.zeros((num_vis, num_hid))
    wu_v = gpu.zeros(num_vis)
    wu_h = gpu.zeros(num_hid)
    for epoch in range(num_epochs):
        err = []
        tic = time.clock()
        for batch in range(num_batches):
            # positive phase
            v1 = dat[batch * batch_size:(batch + 1) * batch_size]
            h1 = (gpu.dot(v1, w_vh) + w_h).logistic()
            # sample hiddens
            hSampled = h1.rand() < h1
            # negative phase
            v2 = (gpu.dot(hSampled, w_vh.T) + w_v).logistic()
            h2 = (gpu.dot(v2, w_vh) + w_h).logistic()
            # update weights (momentum-smoothed CD-1 statistics)
            wu_vh = wu_vh * momentum + gpu.dot(v1.T, h1) - gpu.dot(v2.T, h2)
            wu_v = wu_v * momentum + v1.sum(0) - v2.sum(0)
            wu_h = wu_h * momentum + h1.sum(0) - h2.sum(0)
            w_vh += wu_vh * (epsilon / batch_size)
            w_v += wu_v * (epsilon / batch_size)
            w_h += wu_h * (epsilon / batch_size)
            # calculate reconstruction error
            err.append((v2 - v1).euclid_norm()**2 / (num_vis * batch_size))
        toc = time.clock()
        print "Mean squared error: %.4f, takes time: %d" % (numpy.mean(err), toc - tic)
    return w_vh, w_v, w_h
def dev_loss(self, X, Y, M, Ws=None):
    """Compute DEV-regularized loss for inputs X with target outputs Y.

    This loss function computes a combination of standard output loss
    (e.g. for classification/regression) and Dropout Ensemble Variance
    regularization loss. X should be a list of 'dev_reps' input arrays,
    where 'dev_reps' is the number of times each input will be pushed
    through a droppy network when computing the DEV regularizer. M should
    be a list of lists of per-layer dropout masks, matched to size of the
    input arrays in X. Y should contain the target outputs for X[0], for
    which inputs will be pushed through a drop-free network.

    Ws: optional layer weights; None or an empty list means "use
    self.layer_weights()", matching the old `Ws=[]` behavior.
    Returns {'L': [output loss, DEV loss, reg loss], 'dLdWs': gradients}.
    """
    # FIX: default was the mutable `Ws=[]`; None with a falsy check keeps
    # the exact old semantics without sharing a list across calls.
    if not Ws:
        Ws = self.layer_weights()
    dev_reps = len(X)
    # Compute activations for observations in X
    A = [self.feedforward(X[i], M[i], Ws) for i in range(dev_reps)]
    # Compute loss and gradient for output-layer activations, for the
    # (should be) drop free feedforward of X[0].
    O = self.out_loss(A[0][-1], Y)
    # Make list of activation gradients: all zeros except the output layer
    # of the drop-free rep, which gets the output-loss gradient.
    dLdA = [[gp.zeros(Aj.shape) for Aj in A[0]] \
            for i in range(dev_reps)]
    dLdA[0][-1] = O['dL']
    # Compute DEV regularizer loss and gradients
    Ld = 0.0
    for i in range(self.layer_count):
        dev_type = self.dev_types[i]
        dev_lam = self.dev_lams[i]
        if (dev_lam > 0.0000001):
            # gather layer-i activations across all dropout reps
            Ai = [A[j][i] for j in range(dev_reps)]
            Di = lnf.dev_loss(Ai, dev_type, 0)
            Ld = Ld + (dev_lam * Di['L'])
            for j in range(dev_reps):
                dLdA[j][i] = dLdA[j][i] + (dev_lam * Di['dLdA'][j])
    # Backpropagate gradients for each DEV rep, accumulating weight grads
    B = {'dLdWs': [gp.zeros(W.shape) for W in Ws]}
    for i in range(dev_reps):
        Bi = self.backprop(dLdA[i], A[i], X[i], M[i], Ws)
        for j in range(self.layer_count):
            B['dLdWs'][j] = B['dLdWs'][j] + Bi['dLdWs'][j]
    # Compute parameter regularization loss and gradients
    R = self.reg_loss(Ws)
    # Combine output loss, DEV loss, and regularization loss
    L = [O['L'], Ld, R['L']]
    # Combine output loss gradient and regularization gradient
    dLdWs = [(dWb + dWr) for (dWb, dWr) in zip(B['dLdWs'], R['dLdWs'])]
    return {'L': L, 'dLdWs': dLdWs}
def sde_loss(self, X, Y, M, Ws=None, do_print=0):
    """Compute dropout loss for inputs X with target outputs Y.

    This loss function computes the standard dropout loss for some inputs
    X with target outputs Y, when dropout is applied following the masks
    in M, given the layer weights in Ws (default self.layer_weights()).

    Ws: optional layer weights; None or an empty list means "use
    self.layer_weights()", matching the old `Ws=[]` behavior.
    do_print: unused here; kept for interface compatibility.
    Returns {'L': [output loss, 0.0, reg loss], 'dLdWs': gradients}.
    """
    # FIX: default was the mutable `Ws=[]`; None with a falsy check keeps
    # the exact old semantics without sharing a list across calls.
    if not Ws:
        Ws = self.layer_weights()
    # Compute droppy activations for observations in X
    A = self.feedforward(X, M, Ws)
    # Compute loss and gradient for output-layer activations
    O = self.out_loss(A[-1], Y)
    # Make list of activation gradients: zeros except the output layer
    dLdA = [gp.zeros(Ai.shape) for Ai in A]
    dLdA[-1] = O['dL']
    # Backprop the output loss gradient through network
    B = self.backprop(dLdA, A, X, M, Ws)
    # Compute parameter regularization loss and gradients
    R = self.reg_loss(Ws)
    # Combine output loss, DEV loss (none here), and regularization loss
    L = [O['L'], 0.0, R['L']]
    # Combine output loss gradient and regularization gradient
    dLdWs = [(dWb + dWr) for (dWb, dWr) in zip(B['dLdWs'], R['dLdWs'])]
    return {'L': L, 'dLdWs': dLdWs}
def zeroHistoryDeltaBatch(self, batchSize):
    """Reset the history delta to an all-zero (batchSize x layerSize) array."""
    self.setHistoryDelta(gpu.zeros((batchSize, self.layerSize)))
def __init__(self, **kwargs):
    """Build a flat parameter store partitioned into named views.

    kwargs maps parameter names to shapes (bare ints are promoted to
    1-tuples). Allocates one flat backing array (gnumpy on GPU, numpy on
    CPU) plus a matching symbolic theano vector, then exposes each named
    partition as an attribute on the instance.
    Raises ValueError if a kwarg name would shadow an existing attribute.
    """
    dictlist.replace(kwargs, lambda x: (x,) if isinstance(x, int) else x)
    self.n_pars = n_pars_by_partition(kwargs)
    # Create two representations of the parameters of the object. The first
    # is the symbolic theano variable (of which the type is GPU/CPU
    # specific), the second either a gnumpy or numpy array (depending on
    # GPU/CPU again). Also set a default size for testing.
    if GPU:
        self.data = gnumpy.zeros(self.n_pars)
        self.flat = theano.sandbox.cuda.fvector('parameters')
    else:
        self.data = np.empty(self.n_pars).astype(theano.config.floatX)
        self.flat = T.vector('parameters')
        self.flat.tag.test_value = self.data
    # Go through parameters and assign space and variable.
    self.views = array_partition_views(self.data, kwargs)
    # Make sure the keys are legit -- that they do not overwrite
    # anything.
    for key in kwargs:
        if hasattr(self, key):
            # FIX: the original never interpolated the offending name, so the
            # error message contained a literal "%s".
            raise ValueError("%s is an illegal name for a variable" % key)
    variables = array_partition_views(self.flat, kwargs)
    variables = dictlist.copy(variables, dct_maker=attrdict.AttrDict)
    self.__dict__.update(variables)
def MaxPool(images, subsX, startX, strideX, outputsX):
    """Max-pool `images` via the _ConvNet CUDA kernel.

    images: (numChannels, imSizeX, imSizeX, numImages) garray.
    subsX/startX/strideX/outputsX: pooling window size, start offset,
    stride, and output grid size, passed straight to the kernel.
    Returns a (numChannels, outputsX, outputsX, numImages) garray.
    """
    numChannels, imSizeX, imSizeX, numImages = images.shape
    numImgColors = numChannels
    targets = g.zeros((numChannels, outputsX, outputsX, numImages))
    # grab the raw cudamat descriptors backing the garrays
    imagesCu = images._base.p_mat
    targetsCu = targets._base.p_mat
    from pylab import prod
    # temporarily reshape the cudamat views to the (numImages, colors*X*X)
    # layout the kernel expects; remember the originals to restore later
    imagesCu_orig = tuple(imagesCu.contents.size)
    imagesTotSize = images.size
    imagesCu.contents.size[0] = numImages
    imagesCu.contents.size[1] = numImgColors * imSizeX**2
    #assert imagesTotSize == prod(imagesCu.contents.size)
    targetsCu_orig = tuple(targetsCu.contents.size)
    targetsTotSize = targets.size
    targetsCu.contents.size[0] = numImages
    targetsCu.contents.size[1] = numImgColors * outputsX**2
    #assert targetsTotSize == prod(targetsCu.contents.size)
    numFilters = numImgColors
    _ConvNet.MaxPool(imagesCu, targetsCu, numFilters, subsX, startX,
                     strideX, outputsX)
    # restore the original cudamat shapes so the garrays stay consistent
    for i in range(2):
        targetsCu.contents.size[i] = targetsCu_orig[i]
        imagesCu.contents.size[i] = imagesCu_orig[i]
    return targets
def show_chains(rbm, state, dataset, num_particles=20, num_samples=20, show_every=10, display=True, figname='Gibbs chains', figtitle='Gibbs chains'):
    """Visualize Gibbs chains of an RBM as an image grid.

    Collects `num_samples` visible expectations per particle, running
    `show_every` Gibbs steps between consecutive samples, and tiles them
    as one image row per particle (chains run left to right).
    Returns the assembled grid; also draws it with pylab when `display`.
    """
    samples = gnp.zeros((num_particles, num_samples, state.v.shape[1]))
    # keep only the first num_particles chains
    state = state[:num_particles, :, :]
    for i in range(num_samples):
        samples[:, i, :] = rbm.vis_expectations(state.h)
        # advance each chain show_every Gibbs steps between snapshots
        for j in range(show_every):
            state = rbm.step(state)
    npix = dataset.num_rows * dataset.num_cols
    # one row per particle, samples joined horizontally in time order
    rows = [vm.hjoin([samples[i, j, :npix].reshape((dataset.num_rows, dataset.num_cols)).as_numpy_array()
                      for j in range(num_samples)], normalize=False)
            for i in range(num_particles)]
    grid = vm.vjoin(rows, normalize=False)
    if display:
        pylab.figure(figname)
        pylab.matshow(grid, cmap='gray', fignum=False)
        pylab.title(figtitle)
        pylab.gcf().canvas.draw()
    return grid
def initParams(self):
    """Randomly initialize the weight/bias stack and work buffers.

    Weights are drawn uniformly in [-s, s] with the Glorot-style scale
    s = sqrt(6)/sqrt(fan_in + fan_out), then perturbed with small
    Gaussian noise after burning `seed` draws from the RNG.
    """
    # crude way of random initialization (random seed) for parameters
    import time
    self.seed = int(time.time()) % 100000
    # for tt in range(self.seed): gp.rand()
    sizes = [self.inputDim] + self.layerSizes + [self.outputDim]
    scales = [gp.sqrt(6) / gp.sqrt(n + m)
              for n, m in zip(sizes[:-1], sizes[1:])]
    # stack holds [weight (m x n), bias (m x 1)] per layer
    self.stack = [[gp.rand(m,n)*2*s-s,gp.zeros((m,1))] \
                  for n,m,s in zip(sizes[:-1],sizes[1:],scales)]
    # pre-allocated activation buffers, one per layer boundary
    self.hActs = [gp.empty((s, self.mbSize)) for s in sizes]
    if self.train:
        # backprop work buffers are only needed in training mode
        self.deltas = [gp.empty((s, self.mbSize)) for s in sizes[1:]]
        self.grad = [[gp.empty(w.shape), gp.empty(b.shape)]
                     for w, b in self.stack]
    # burn `seed` RNG draws (the crude seeding), then jitter the weights
    for tt in range(self.seed): gp.rand()
    self.stack = [[
        ws[0] + .01 * gp.randn(ws[0].shape),
        ws[1] + .01 * gp.randn(ws[1].shape)
    ] for ws in self.stack]
def compute_not_weighted_loss_and_grad(self, pred, compute_grad=False):
    """Squared hinge loss sum(max(0, 1 - pred*target)^2) and its gradient.

    When compute_grad is False the gradient slot is a zero array of
    pred's shape. Returns (loss, grad).
    """
    pred = gnp.as_garray(pred)
    margin = 1 - pred * self.target
    # zero out entries whose margin constraint is already satisfied
    active = (margin > 0) * margin
    loss = (active**2).sum()
    if compute_grad:
        grad = -2 * (self.target * active)
    else:
        grad = gnp.zeros(pred.shape)
    return loss, grad
def __init__(self, layer_shape, dropout_probability, n_epochs = 50, l2_max = 15.0, learning_rate = lambda x:1.0 * .998 ** x, doGradientCheck = False):
    """Configure a dropout network with ReLU hidden layers and softmax output.

    layer_shape: unit counts per layer, input first, output last.
    dropout_probability: per-layer dropout rates (same length as layer_shape).
    learning_rate: callable mapping epoch index to a step size.
    """
    assert(len(dropout_probability) == len(layer_shape))
    self.dropout_probability = dropout_probability
    # hidden layers use ReLU, output uses softmax with its own gradient
    self.activation_hidden = activation_relu
    self.gradient_hidden = gradient_relu
    self.activation_output = activation_softmax
    self.gradient_output = gradient_output_softmax
    self.n_epochs = n_epochs
    self.f_score = score_softmax
    self.learning_rate = learning_rate
    self.mini_batch_size = 100
    self.doGradientCheck = doGradientCheck
    self.l2_max = l2_max
    self.training_score = []
    self.training_validation_error = []
    self.weights = []
    self.activation = []
    self.gradient = []
    # one [W, b] pair per consecutive layer pair; small random weights
    for n_in, n_out in zip(layer_shape[:-1], layer_shape[1:]):
        self.weights.append([g.randn(n_in, n_out)*0.01, g.zeros(n_out)])
        self.activation.append(self.activation_hidden)
        self.gradient.append(self.gradient_hidden)
    # the last layer is the softmax output, not a hidden layer
    self.activation[-1] = self.activation_output
    self.gradient[-1] = self.gradient_output
def sample(self, X):
    """Sample every sub-model on its input slice and concatenate results.

    X: (batch x input_size) array; each registered fn reads its own input
    column range and writes its own output column range.
    Returns a (batch x output_size) garray.
    """
    assert X.shape[1] == self.input_size
    out = g.zeros((X.shape[0], self.output_size))
    for fn, in_range, out_range in zip(self.fns, self.input_ranges,
                                       self.output_ranges):
        i0, i1 = in_range
        o0, o1 = out_range
        out[:, o0:o1] = fn.sample(X[:, i0:i1])
    return out
def backward(self, Y, preds, acts, words, X):
    """ Backward pass through the network.

    Y: one-hot targets; preds/acts/words: forward-pass outputs from
    forward(); X: ('batchsize' x 'context') matrix of word indices.
    Stores gradients in self.dR (word representations), self.db (output
    bias) and self.dC (context matrices); returns nothing.
    """
    batchsize = preds.shape[0]
    # Compute part of df/dR (softmax error, minus the padding column)
    Ix = gpu.garray(preds[:,:-1] - Y) / batchsize
    delta = gpu.dot(acts.T, Ix)
    # last row of acts is the bias unit, so split delta accordingly
    dR = delta[:-1,:] + self.gamma_r * self.R
    db = delta[-1,:]
    dR = dR.as_numpy_array()
    # Compute df/dC and word inputs for df/dR
    Ix = gpu.dot(Ix, self.R.T)
    dC = gpu.zeros(np.shape(self.C))
    for i in range(self.context):
        # gradient of context matrix i, with weight decay
        delta = gpu.dot(words[:,:,i].T, Ix)
        dC[i,:,:] = delta + self.gamma_c * self.C[i,:,:]
        # scatter the per-example gradient back onto the word columns used
        delta = gpu.dot(Ix, self.C[i,:,:].T)
        delta = delta.as_numpy_array()
        for j in range(X.shape[0]):
            dR[:,X[j,i]] = dR[:,X[j,i]] + delta.T[:,j]
    self.dR = gpu.garray(dR)
    self.db = db
    self.dC = dC
def forward(self, X, test=False):
    """
    Feed-forward pass through the model
    X: ('batchsize' x 'context') matrix of word indices
    test: currently unused flag kept for interface compatibility
    Returns (words, acts, preds): word feature tensor, hidden activations
    (with bias column), and softmax predictions as a numpy array.
    """
    batchsize = X.shape[0]
    R = self.R
    C = self.C
    bw = self.bw
    # Obtain word features
    tmp = R.as_numpy_array()[:,X.flatten()].flatten(order='F') # flatten(), default in row-major order, order='F' means Fortran(column-major) order
    tmp = tmp.reshape((batchsize, self.K * self.context)) # reshape(), in row-major order
    words = np.zeros((batchsize, self.K, self.context))
    for i in range(batchsize):
        words[i,:,:] = tmp[i,:].reshape((self.K, self.context), order='F')
    words = gpu.garray(words)
    # Compute the hidden layer (predicted next word representation)
    acts = gpu.zeros((batchsize, self.K))
    for i in range(self.context):
        acts = acts + gpu.dot(words[:,:,i], C[i,:,:]) # the dot() of 2-D matrices is equivalent to matrix multiply
    # append a bias column of ones
    acts = gpu.concatenate((acts, gpu.ones((batchsize, 1))), 1)
    # Compute softmax (max-subtracted for numerical stability)
    preds = gpu.dot(acts, gpu.concatenate((R, bw)))
    preds = gpu.exp(preds - preds.max(1).reshape(batchsize, 1))
    denom = preds.sum(1).reshape(batchsize, 1)
    preds = gpu.concatenate((preds / denom, gpu.ones((batchsize, 1))), 1)
    return (words, acts, preds.as_numpy_array())