def _prob_expr(self, factors, as_logp):
    '''Shared implementation behind the probability and logp functions.

    :param factors: forwarded unchanged to ``self.calibrated_potentials``.
    :param as_logp: when truthy the logarithm of the value is returned,
        otherwise the value itself.
    :return: element 0 of the fully marginalized potential after it has been
        reshaped to length 1.
    '''
    potentials = self.calibrated_potentials(factors, 'sum_product')

    # Pick the potential with the smallest scope so the final marginalization
    # has as little as possible left to sum out.
    smallest = None
    smallest_size = 9999999999
    for candidate in potentials:
        size = len(candidate.scope)
        if size < smallest_size:
            smallest_size = size
            smallest = candidate
            if size == 1:
                # A scope of one is treated as good enough; stop searching.
                break

    # Marginalize out everything that is left in the chosen potential.
    if self.logspace:
        total = smallest.logsumexp_marginalize(smallest.scope)
        value = T.reshape(total.pt_tensor, [1], ndim=1)[0]
        return value if as_logp else T.exp(value)

    total = smallest.marginalize(smallest.scope)
    value = T.reshape(total.pt_tensor, [1], ndim=1)[0]
    return T.log(value) if as_logp else value
def define_train_test_funcs(self):
    """Compile ``self.train`` and ``self.predict`` as Theano functions.

    The cost is ``self.categorical_crossentropy`` between the last layer's
    activation and the symbolic target matrix ``self.Y`` (created here), both
    reshaped to ``(maskY.shape[0] * batch_size, out_size)``. Every gradient is
    clipped element-wise to [-10, 10] before it is handed to the optimizer.
    """
    activation = self.layers[-1].activation
    self.Y = T.matrix("Y")

    flat_shape = (self.maskY.shape[0] * self.batch_size, self.out_size)
    pYs = T.reshape(activation, flat_shape)
    tYs = T.reshape(self.Y, flat_shape)
    cost = self.categorical_crossentropy(pYs, tYs)

    gparams = [T.clip(T.grad(cost, param), -10, 10) for param in self.params]

    lr = T.scalar("lr")

    # self.optimizer holds the optimizer's name as a string; eval() resolves it
    # to the callable.
    # NOTE(review): eval() executes arbitrary code -- self.optimizer must never
    # come from untrusted input.
    optimizer = eval(self.optimizer)
    updates = optimizer(self.params, gparams, lr)

    # is_train is pinned to 1 for training and 0 for prediction.
    self.train = theano.function(
        inputs=[self.X, self.maskX, self.Y, self.maskY, lr, self.batch_size],
        givens={self.is_train: np.cast['int32'](1)},
        outputs=cost,
        updates=updates)
    self.predict = theano.function(
        inputs=[self.X, self.maskX, self.batch_size],
        givens={self.is_train: np.cast['int32'](0)},
        outputs=activation)
def test_shape():
    """Demonstrate ``T.flatten`` and ``T.reshape`` and print the results.

    Flattens the module-level ``tensor3_val`` to 2 and to 1 dimensions, then
    reshapes the 2-d result to shape (2, 2, 2) and to shape (8,), printing
    each value. ``print`` is called as a function with a single argument so
    the code runs on both Python 2 and Python 3.
    """
    x = T.tensor3()
    x_flat_2_mat = T.flatten(x, 2)
    x_flat_2_vec = T.flatten(x, 1)
    flat_f = theano.function([x], [x_flat_2_mat, x_flat_2_vec])
    flat_mat_val, flat_vec_val = flat_f(tensor3_val)
    print('flatten to 2-d array:')
    print(flat_mat_val)
    print('flatten to 1-d array:')
    print(flat_vec_val)

    x_mat = T.matrix()
    x_mat_2_t3 = T.reshape(x_mat, (2, 2, 2))
    x_mat_2_vec = T.reshape(x_mat, (8,))
    reshape_f = theano.function([x_mat], [x_mat_2_t3, x_mat_2_vec])
    # Alternative with symbolic target shapes passed in at call time:
    #   t3_shape = T.lvector()
    #   vec_shape = T.lvector()
    #   x_mat_2_t3 = T.reshape(x_mat, t3_shape, 3)
    #   x_mat_2_vec = T.reshape(x_mat, vec_shape, 1)
    #   reshape_f = theano.function([x_mat, t3_shape, vec_shape],
    #                               [x_mat_2_t3, x_mat_2_vec])
    mat_2_t3_val, mat_2_vec_val = reshape_f(flat_mat_val)
    print('reshape 2-d array to 3-d array:')
    print(mat_2_t3_val)
    print('reshape 2-d array to 1-d array:')
    print(mat_2_vec_val)
def get_unfolding_cost(self):
    ''' Compute the unfolding reconstruction cost (more than 2 inputs).

    ``self.x`` is viewed as rows of ``self.n_vector`` values. The rows are
    folded one by one into a single hidden code, which is stored (deep-copied)
    in ``self.vector``; the code is then unfolded again and the result is
    compared with ``self.x``.
    '''
    rows = T.reshape(self.x, (-1, self.n_vector))

    # Fold: combine the running code with the next row, one row at a time.
    code = rows[0]
    for step in range(1, self.num):
        code = self.get_hidden_values(T.concatenate((code, rows[step])))

    # Save the deepest hidden value as the output vector.
    self.vector = copy.deepcopy(code)

    # Unfold: each reconstruction yields two halves; the second half is kept
    # as output and the first half is unfolded further.
    pieces = []
    for _ in range(1, self.num):
        halves = T.reshape(self.get_reconstructed(code), (2, self.n_vector))
        pieces.append(halves[1])
        code = halves[0]
    pieces.append(code)
    pieces.reverse()

    x = self.x
    z = T.concatenate(pieces)

    # Cross-entropy after mapping x and z through 0.5 * v + 0.5.
    # (A squared-error alternative would be -T.sum((x - z) ** 2).)
    L = -T.sum((0.5 * x + 0.5) * T.log(0.5 * z + 0.5)
               + (-0.5 * x + 0.5) * T.log(-0.5 * z + 0.5))

    # Cost for a minibatch, plus a 0.01-weighted squared penalty on self.W.
    cost = T.mean(L) + 0.01 * (self.W ** 2).sum()
    return cost
def _transform_affine(theta, input, downsample_factor):
    """Resample ``input`` on a grid warped by per-sample 2x3 matrices.

    ``theta`` is reshaped to (-1, 2, 3) and multiplied with the grid returned
    by ``_meshgrid``; rows 0 and 1 of the product are passed to
    ``_interpolate`` as x and y coordinates. The output size is the input
    height/width divided by ``downsample_factor[0]``/``[1]``, cast to int64.

    Returns a 4-d tensor in (batch, channels, out_height, out_width) order.
    """
    num_batch, num_channels, height, width = input.shape
    affine = T.reshape(theta, (-1, 2, 3))

    # Grid of (x_t, y_t, 1), eq (1) in ref [1].
    out_height = T.cast(height / downsample_factor[0], 'int64')
    out_width = T.cast(width / downsample_factor[1], 'int64')
    grid = _meshgrid(out_height, out_width)

    # Transform A x (x_t, y_t, 1)^T -> (x_s, y_s).
    source_coords = T.dot(affine, grid)
    x_s_flat = source_coords[:, 0].flatten()
    y_s_flat = source_coords[:, 1].flatten()

    # _interpolate is fed the input as (bs, height, width, channels).
    channels_last = input.dimshuffle(0, 2, 3, 1)
    sampled = _interpolate(
        channels_last, x_s_flat, y_s_flat, out_height, out_width)

    restored = T.reshape(
        sampled, (num_batch, out_height, out_width, num_channels))
    # Back to conv format (bs, channels, height, width).
    return restored.dimshuffle(0, 3, 1, 2)
def tensor_softmax(inpt, n_classes=2):
    """Apply the 'softmax' transfer function over groups of ``n_classes``.

    The 5-d input is permuted to axis order (0, 3, 4, 1, 2), flattened to
    rows of ``n_classes`` values, passed through the function found via
    ``lookup('softmax', _transfer)`` and returned with shape
    (1, -1, n_classes).
    """
    rows = T.reshape(inpt.dimshuffle(0, 3, 4, 1, 2), (-1, n_classes))
    softmax = lookup('softmax', _transfer)
    return T.reshape(softmax(rows), (1, -1, n_classes))
def depool(X, factor=2):
    """
    luke perforated upsample
    http://www.brml.org/uploads/tx_sibibtex/281.pdf

    Upsample the two trailing axes of the 4-d tensor ``X`` by ``factor``:
    input pixel (i, j) is written to output pixel (i * factor, j * factor)
    and every other output pixel is zero.

    The scatter is expressed as a dot product with a 0/1 matrix of shape
    (H * W, H * W * factor**2). The column index is computed with floor
    division by the image *width*, so non-square inputs and factors other
    than 2 are handled correctly and the index stays integral on Python 3.

    :param X: 4-d tensor; axes 2 and 3 are upsampled.
    :param factor: integer upsampling factor applied to both axes.
    :return: tensor of shape (X.shape[0], X.shape[1], H * factor, W * factor).
    """
    channels = X.shape[1]
    height = X.shape[2]
    width = X.shape[3]
    in_dim = height * width
    out_dim = in_dim * factor * factor

    # Flat input index r = i * width + j maps to the flat output index
    # (i * factor) * (width * factor) + j * factor.
    rows = T.arange(in_dim)
    src_row = rows // width
    src_col = rows % width
    cols = src_row * (factor * factor * width) + src_col * factor

    upsamp_matrix = T.zeros((in_dim, out_dim))
    upsamp_matrix = T.set_subtensor(upsamp_matrix[rows, cols], 1.)

    flat = T.reshape(X, (X.shape[0], channels, in_dim))
    up_flat = T.dot(flat, upsamp_matrix)
    upsamp = T.reshape(
        up_flat, (X.shape[0], channels, height * factor, width * factor))
    return upsamp
def _meshgrid(height, width, depth):
    """Grid generator from eq. (1) in reference [1], extended with a depth axis.

    Returns a 4-row matrix. Row 0 holds ``_linspace(-1, 1, height)`` values
    varying along the first grid axis, row 1 ``_linspace(-1, 1, width)``
    values varying along the second, row 2 ``_linspace(-1, 1, depth)`` values
    varying along the last, and row 3 is all ones. Each row is the flattened
    grid.

    It is implemented in Theano rather than numpy to support symbolic grid
    sizes. Note: if the size is known at layer construction time, the grid
    could be computed offline in numpy instead; per the original authors this
    hardly affected performance.
    """
    ones_over_depth = T.ones((1, 1, depth))

    # Values that depend only on the first (height) index, replicated over
    # width and then over depth.
    x_plane = T.dot(
        _linspace(-1.0, 1.0, height).dimshuffle(0, 'x'),
        T.ones((1, width)))
    x_t = T.dot(T.reshape(x_plane, (height, width, 1)), ones_over_depth)

    # Values that depend only on the second (width) index.
    y_plane = T.dot(
        T.ones((height, 1)),
        _linspace(-1.0, 1.0, width).dimshuffle('x', 0))
    y_t = T.dot(T.reshape(y_plane, (height, width, 1)), ones_over_depth)

    # Values that depend only on the depth index.
    z_line = T.reshape(_linspace(-1.0, 1.0, depth), (1, 1, -1))
    z_t = T.dot(T.ones((height, width, 1)), z_line)

    rows = [coords.reshape((1, -1)) for coords in (x_t, y_t, z_t)]
    rows.append(T.ones_like(rows[0]))
    return T.concatenate(rows, axis=0)
def k_max_pool(self, x, k):
    """
    Perform k-max pooling over the two trailing axes of ``x``.

    The two trailing axes are merged into one, the positions of the ``k``
    largest entries along it are selected and put back into ascending
    position order, and the values at those positions are gathered.

    x: theano.tensor.tensor4
    k: theano.tensor.iscalar
        the k parameter

    Returns:
        4D tensor of shape (x.shape[0], x.shape[1], k, 1)
    """
    flat = T.reshape(x, (x.shape[0], x.shape[1], 1, x.shape[2] * x.shape[3]))

    # argsort is ascending, so the last k positions are the k largest values;
    # sorting those positions restores their original relative order.
    order = T.argsort(flat, axis=3)
    top_k = T.sort(order[:, :, :, -k:], axis=3)

    # Explicit index vectors for the three leading axes, one entry per
    # selected element, laid out to match top_k.flatten().
    n0, n1, n2, n3 = top_k.shape
    idx0 = T.arange(n0).repeat(n1 * n2 * n3)
    idx1 = (
        T.arange(n1)
        .repeat(n2 * n3)
        .reshape((n1 * n2 * n3, 1))
        .repeat(n0, axis=1)
        .T.flatten()
    )
    idx2 = (
        T.arange(n2)
        .repeat(n3)
        .reshape((n2 * n3, 1))
        .repeat(n0 * n1, axis=1)
        .T.flatten()
    )

    picked = flat[idx0, idx1, idx2, top_k.flatten()].reshape(top_k.shape)
    out_shape = (picked.shape[0], picked.shape[1],
                 picked.shape[2] * picked.shape[3], 1)
    return T.reshape(picked, out_shape)
def make_ro(r, raster_space, width, height):
    """Symbolically build ray directions and origins for a batch of rotations.

    ``raster_space`` is normalised by the resolution, mapped to [-1, 1],
    scaled by the aspect ratio and placed on a z-plane; the image-plane
    points and the origin (0, 0, 1.5) are both multiplied by ``r`` and then
    shifted by 0.5.

    Returns ``(rd, ro_t)``: the normalised ray directions, reshaped to
    (nmatrices, width, height, 3) with ``nmatrices = r.shape[0]``, and the
    rotated, shifted origins.
    """
    nmatrices = r.shape[0]
    resolution = np.array([width, height], dtype=config.floatX)
    # Multiplying by the aspect ratio makes the pixels square.
    aspect = np.array([resolution[0] / resolution[1], 1.0],
                      dtype=config.floatX)

    # Raster space -> [0, 1] -> NDC space [-1, 1] -> aspect-corrected.
    norm_raster_space = raster_space / resolution
    ndc_space = (-1.0 + 2.0 * norm_raster_space) * aspect

    # Position on the z-plane; the 0.5 factor changes the focal length.
    ndc_xyz = stack(ndc_space, width, height, 1.0) * 0.5

    # Put the origin farther along the z-axis.
    origin = np.array([0, 0, 1.5], dtype=config.floatX)

    # Rotate both by the same rotation matrices, then add 0.5 since voxels
    # are in [0, 1].
    ro_t = T.dot(T.reshape(origin, (1, 3)), r) + 0.5
    ndc_t = T.dot(T.reshape(ndc_xyz, (1, width, height, 3)), r)
    ndc_t = T.reshape(ndc_t, (width, height, nmatrices, 3))
    ndc_t = T.transpose(ndc_t, (2, 0, 1, 3)) + 0.5

    # Normalised ray directions from the origin to the image plane.
    unnorm_rd = ndc_t - T.reshape(ro_t, (nmatrices, 1, 1, 3))
    lengths = T.reshape(unnorm_rd.norm(2, axis=3),
                        (nmatrices, width, height, 1))
    rd = unnorm_rd / lengths
    return rd, ro_t
def error(self, outputs):
    '''Build a theano expression for computing the network error.

    The error is the negative log of the (clipped) output probability at the
    label index. When ``self.weighted`` is set, the weighted mean is returned
    with an extra term added: the mean over the largest entries of
    ``outputs['hid2:alpha']`` summed over axis 0. Otherwise the plain mean is
    returned.

    Parameters
    ----------
    outputs : dict mapping str to theano expression
        A dictionary of all outputs generated by the layers in this network.
        Must contain ``self.output_name()`` and ``'hid2:alpha'``.

    Returns
    -------
    error : theano expression
        A theano expression representing the network error.
    '''
    output = outputs[self.output_name()]

    # Sum alpha over axis 0, then take the maximum along the last axis.
    alpha = outputs['hid2:alpha']
    alpha_l_inf = alpha.sum(axis=0).max(axis=-1)

    # Flatten all but the last component of the output and the labels.
    n = output.shape[0] * output.shape[1]
    correct = TT.reshape(self.labels, (n, ))
    weights = TT.reshape(self.weights, (n, ))
    prob = TT.reshape(output, (n, output.shape[2]))

    # Clip to [1e-8, 1] so the logarithm stays finite.
    picked = TT.clip(prob[TT.arange(n), correct], 1e-8, 1)
    nlp = -TT.log(picked)

    if not self.weighted:
        return nlp.mean()
    return (weights * nlp).sum() / weights.sum() + alpha_l_inf.mean()
def build(self, output_type):
    """Register output parameters and wire up regularizers, prediction, loss.

    Adds ``W_hy``, ``b_hy``, ``W_hi`` and ``b_hi`` to ``self.params``, creates
    a zero-filled shared variable per parameter in ``self.updates``, extends
    ``self.L1`` / ``self.L2_sqr`` with the ``W_hy`` terms and defines
    ``self.y_pred``, ``self.p_y_given_x`` and ``self.loss``.

    ``output_type`` is accepted but not used by this method.
    """
    self.params += [self.W_hy, self.b_hy, self.W_hi, self.b_hi]

    # One zero buffer per parameter, with the parameter's own shape.
    for param in self.params:
        shape = param.get_value(borrow=True).shape
        self.updates[param] = theano.shared(
            value=np.zeros(shape, dtype=theano.config.floatX),
            name='updates')

    # Regularizers.
    self.L1 += T.sum(abs(self.W_hy))
    self.L2_sqr += T.sum(self.W_hy ** 2)

    # Final prediction: softmax over the last axis, computed on a 2-d view
    # (first two axes merged) and reshaped back afterwards.
    self.y_pred = T.dot(self.get_output(), self.W_hy) + self.b_hy
    scores = self.y_pred
    flat_scores = T.reshape(scores, (scores.shape[0] * scores.shape[1], -1))
    self.p_y_given_x = T.reshape(T.nnet.softmax(flat_scores), scores.shape)
    self.loss = lambda y: Loss.nll_multiclass(self.p_y_given_x, y)
def _active(m, pre_h, x):
    """Run one decoder step and mix the tag part of the output into the words.

    Relies on ``self`` and ``last_shape`` from the enclosing scope. ``x`` and
    ``pre_h`` are reshaped to (batch_size, last_shape[0]) and
    (batch_size, last_shape[1]) before the decoder step.

    The softmax output is masked with ``m`` and split at column
    ``self.dim_y``; the right part, multiplied by ``self.word_tag_matrix``,
    is added to the left part.

    The ``print`` calls are debugging output that runs while the graph is
    built; they are written as single-argument function calls so the code
    runs on both Python 2 and Python 3.

    :return: ``(h, y, new_y_dim_y, y_dim_pos)`` with ``h`` and ``y`` each
        flattened to a single row.
    """
    x = T.reshape(x, (self.batch_size, last_shape[0]))
    pre_h = T.reshape(pre_h, (self.batch_size, last_shape[1]))

    h = self.decoder._active(x, pre_h)
    y = T.nnet.softmax(T.dot(h, self.W_hy) + self.b_y)
    y = y * m[:, None]
    print(type(y))

    y_dim_y = y[:, 0:self.dim_y]
    y_dim_pos = y[:, self.dim_y:]
    print(type(y_dim_y))
    print(type(y_dim_pos))

    new_y_dim_y = y_dim_y + T.dot(y_dim_pos, self.word_tag_matrix)
    # Symbolic counterpart of np.column_stack((new_y_dim_y, y_dim_pos)).
    y = T.concatenate([new_y_dim_y, y_dim_pos], axis=1)
    print(type(y))

    h = T.reshape(h, (1, self.batch_size * last_shape[1]))
    y = T.reshape(y, (1, self.batch_size * last_shape[0]))
    return h, y, new_y_dim_y, y_dim_pos
def T_l2_cost_conv_dA(x, a, A, imshp, kshp, featshp, stride=(1, 1), mask=True):
    """Kernel gradient built from ``helper_T_l2_cost_conv``'s outputs.

    For unit stride the gradient is -1 times a ``conv2d`` between the
    image error and the features, both with their first two axes swapped.
    For any other stride it is delegated to ``MyConv_view``.

    :return: the gradient reshaped to (kshp[0], kshp[1]*kshp[2]*kshp[3]) and
        then transposed.
    """
    image_error, _kernel, features = helper_T_l2_cost_conv(
        x=x, a=a, A=A, imshp=imshp, kshp=kshp, featshp=featshp,
        stride=stride, mask=mask)

    if stride == (1, 1):
        # Swap the first two axes of both operands; the image error is also
        # flipped along its two trailing axes.
        image_error_rot = T.transpose(image_error, [1, 0, 2, 3])[:, :, ::-1, ::-1]
        imshp_rot = (imshp[1], imshp[0], imshp[2], imshp[3])
        featshp_rot = (featshp[1], featshp[0], featshp[2], featshp[3])
        features_rot = T.transpose(features, [1, 0, 2, 3])
        featshp_rot_logical = (featshp_rot[0],
                               featshp_rot[1],
                               imshp[2] - kshp[2] + 1,
                               imshp[3] - kshp[3] + 1)
        kernel_grad_rot = -1. * conv2d(image_error_rot, features_rot,
                                       image_shape=imshp_rot,
                                       filter_shape=featshp_rot,
                                       imshp_logical=imshp_rot[1:],
                                       kshp_logical=featshp_rot_logical[2:])
        kernel_grad = T.transpose(kernel_grad_rot, [1, 0, 2, 3])
    else:
        my_conv = MyConv_view(strides=stride, kshp=kshp)
        kernel_grad = my_conv(image_error, features)

    flat_grad = T.reshape(
        kernel_grad, (kshp[0], kshp[1] * kshp[2] * kshp[3]), ndim=2)
    return T.transpose(flat_grad)
def T_subspacel1_slow_shrinkage_conv(a, L, lam_sparse, lam_slow, imshp, kshp, featshp, stride=(1, 1), small_value=.001):
    """Compose a slow shrinkage with a subspace-l1 shrinkage on ``a``.

    Features are paired along axis 1 (even index with the following odd
    index); each pair's amplitude is sqrt(even**2 + odd**2 + small_value).
    Both shrinkage steps scale a pair by a factor clamped at zero from below.

    ``stride`` is accepted but not used. Only ``featshp[2]`` and
    ``featshp[3]`` are read; the first two entries come from ``imshp[0]`` and
    ``kshp[0]``.

    :return: the result reshaped to (featshp[0], featshp[1]*featshp[2]*
        featshp[3]) and transposed.
    """
    # num images, features, szy, szx
    featshp = (imshp[0], kshp[0], featshp[2], featshp[3])
    features = T.reshape(T.transpose(a), featshp, ndim=4)

    even = features[:, ::2, :, :]
    odd = features[:, 1::2, :, :]
    amp = T.sqrt(even ** 2 + odd ** 2 + small_value)

    # Slow shrinkage: differences of the amplitude along axis 0. Interior
    # entries get minus the second difference; the two boundary entries are
    # written afterwards from the first difference.
    first_diff = amp[1:, :, :, :] - amp[:-1, :, :, :]
    second_diff = first_diff[1:, :, :, :] - first_diff[:-1, :, :, :]
    div = T.zeros_like(amp)
    div = T.set_subtensor(div[1:-1, :, :, :], -second_diff)
    div = T.set_subtensor(div[0, :, :, :], -first_diff[0, :, :, :])
    div = T.set_subtensor(div[-1, :, :, :], first_diff[-1, :, :, :])

    slow_factor = 1 - (lam_slow / L) * (div / amp)
    slow_factor = T.switch(T.gt(slow_factor, 0), slow_factor, 0)
    slow_even = slow_factor * even
    slow_odd = slow_factor * odd

    # Subspace l1 shrinkage on the amplitudes of the slow-shrunk pairs.
    slow_amp = T.sqrt(slow_even ** 2 + slow_odd ** 2)
    sparse_factor = 1. - (lam_sparse / L) / slow_amp
    sparse_factor = T.switch(T.gt(sparse_factor, 0.), sparse_factor, 0.)

    prox = T.zeros_like(features)
    prox = T.set_subtensor(prox[:, ::2, :, :], sparse_factor * slow_even)
    prox = T.set_subtensor(prox[:, 1::2, :, :], sparse_factor * slow_odd)

    flat_prox = T.reshape(
        prox, (featshp[0], featshp[1] * featshp[2] * featshp[3]), ndim=2)
    return T.transpose(flat_prox)
def max_pool_2d(input, ds, ignore_border=False, st=None, padding=(0, 0), mode='max'):
    """
    Pool an N-D tensor (N >= 2) over its 2 last dimensions.

    The input is downscaled by the specified factor, by keeping only the
    maximum value of non-overlapping patches of size (ds[0], ds[1]).
    4-D input goes straight to ``DownsampleFactorMax``; any other rank is
    first reshaped to (batch, 1, height, width) and reshaped back afterwards.

    :type input: N-D theano tensor of input images.
    :param input: input images. Max pooling will be done over the 2 last
        dimensions.
    :type ds: tuple of length 2
    :param ds: factor by which to downscale (vertical ds, horizontal ds).
        (2,2) will halve the image in each dimension.
    :type ignore_border: bool
    :param ignore_border: When True, (5,5) input with ds=(2,2) will generate
        a (2,2) output. (3,3) otherwise.
    :type st: tuple of length 2
    :param st: stride size, which is the number of shifts over rows/cols to
        get the next pool region. If st is None, it is considered equal to ds
        (no overlap on pooling regions).
    :type padding: tuple of two ints
    :param padding: (pad_h, pad_w), pad zeros to extend beyond four borders
        of the images; pad_h is the size of the top and bottom margins, and
        pad_w is the size of the left and right margins.
    :type mode: string
    :param mode: 'max', 'average_inc_pad' or 'average_exc_pad'. Operation
        executed on each window. `max` always excludes the padding in the
        computation. `average` gives you the choice to include or exclude it.
    """
    if input.ndim < 2:
        raise NotImplementedError('max_pool_2d requires a dimension >= 2')

    pool = DownsampleFactorMax(ds, ignore_border, st=st, padding=padding,
                               mode=mode)
    if input.ndim == 4:
        return pool(input)

    # Collapse all leading dimensions into one batch dimension and insert a
    # singleton channel axis: (batch_size, 1, height, width).
    leading = input.shape[:-2]
    batch_size = tensor.shape_padright(tensor.prod(leading), 1)
    shape_4d = tensor.cast(
        tensor.join(0, batch_size, tensor.as_tensor([1]), input.shape[-2:]),
        'int64')
    pooled = pool(tensor.reshape(input, shape_4d, ndim=4))

    # Restore the original leading dimensions around the pooled image.
    out_shape = tensor.join(0, leading, pooled.shape[-2:])
    return tensor.reshape(pooled, out_shape, ndim=input.ndim)
def cost(self):
    """
    Build the training criterion selected by ``self.loss``.

    Handles the 'sprint', 'ctc', 'ce_ctc' and 'ctc2' losses. For any other
    value no branch matches and the function returns None implicitly.

    :return: ``(err, known_grads)`` for 'sprint', 'ce_ctc' and 'ctc2';
      ``(err, known_grads, priors)`` for 'ctc'. ``known_grads`` maps
      ``self.z`` to an explicitly supplied gradient for 'sprint' and 'ctc',
      and is None for 'ce_ctc' and 'ctc2'.
    """
    # NOTE(review): block nesting below was reconstructed from a
    # whitespace-collapsed source -- confirm against the upstream file,
    # in particular the extent of the `if self.ce_smoothing:` block.

    # NOTE(review): y_f is only referenced by commented-out code further down.
    y_f = T.cast(T.reshape(self.y_data_flat, (self.y_data_flat.shape[0] * self.y_data_flat.shape[1]), ndim = 1), 'int32')
    known_grads = None
    if self.loss == 'sprint':
        # sprint_opts may be given as a JSON string; decode it once and cache
        # the dict on the instance.
        if not isinstance(self.sprint_opts, dict):
            import json
            self.sprint_opts = json.loads(self.sprint_opts)
        assert isinstance(self.sprint_opts, dict), "you need to specify sprint_opts in the output layer"
        if self.exp_normalize:
            log_probs = T.log(self.p_y_given_x)
        else:
            log_probs = self.z
        # The op returns both the error and its gradient.
        sprint_error_op = SprintErrorSigOp(self.attrs.get("target", "classes"), self.sprint_opts)
        err, grad = sprint_error_op(log_probs, T.sum(self.index, axis=0))
        err = err.sum()
        if self.loss_like_ce:
            # Replace the error by a cross-entropy-like value against a
            # reference built from the posteriors minus the gradient,
            # clipped to [0, 1] and masked by self.index.
            y_ref = T.clip(self.p_y_given_x - grad, numpy.float32(0), numpy.float32(1))
            err = -T.sum(T.log(T.pow(self.p_y_given_x, y_ref)) * T.cast(self.index, "float32").dimshuffle(0, 1, 'x'))
        if self.ce_smoothing:
            # Interpolate: scale error and gradient by (1 - ce_smoothing) and
            # add ce_smoothing times a frame-wise cross-entropy term.
            err *= numpy.float32(1.0 - self.ce_smoothing)
            grad *= numpy.float32(1.0 - self.ce_smoothing)
            if not self.prior_scale:  # we kept the softmax bias as it was
                nll, pcx = T.nnet.crossentropy_softmax_1hot(x=self.y_m[self.i], y_idx=self.y_data_flat[self.i])
            else:  # assume that we have subtracted the bias by the log priors beforehand
                assert self.log_prior is not None
                # In this case, for the CE calculation, we need to add the log priors again.
                y_m_prior = T.reshape(self.z + numpy.float32(self.prior_scale) * self.log_prior, (self.z.shape[0] * self.z.shape[1], self.z.shape[2]), ndim=2)
                nll, pcx = T.nnet.crossentropy_softmax_1hot(x=y_m_prior[self.i], y_idx=self.y_data_flat[self.i])
            ce = numpy.float32(self.ce_smoothing) * T.sum(nll)
            err += ce
            grad += T.grad(ce, self.z)
        known_grads = {self.z: grad}
        return err, known_grads
    elif self.loss == 'ctc':
        from theano.tensor.extra_ops import cpu_contiguous
        # CTCOp yields error, gradient and priors; self.y is passed with its
        # two axes swapped.
        err, grad, priors = CTCOp()(self.p_y_given_x, cpu_contiguous(self.y.dimshuffle(1, 0)), self.index_for_ctc())
        known_grads = {self.z: grad}
        return err.sum(), known_grads, priors.sum(axis=0)
    elif self.loss == 'ce_ctc':
        # Softmax over self.z with its first two axes merged, then the summed
        # negative log of the entries picked by self.i and the flat targets.
        y_m = T.reshape(self.z, (self.z.shape[0] * self.z.shape[1], self.z.shape[2]), ndim=2)
        p_y_given_x = T.nnet.softmax(y_m)
        #pcx = p_y_given_x[(self.i > 0).nonzero(), y_f[(self.i > 0).nonzero()]]
        pcx = p_y_given_x[self.i, self.y_data_flat[self.i]]
        ce = -T.sum(T.log(pcx))
        return ce, known_grads
    elif self.loss == 'ctc2':
        from NetworkCtcLayer import ctc_cost, uniq_with_lengths, log_sum
        max_time = self.z.shape[0]
        num_batches = self.z.shape[1]
        time_mask = self.index.reshape((max_time, num_batches))
        y_batches = self.y_data_flat.reshape((max_time, num_batches))
        targets, seq_lens = uniq_with_lengths(y_batches, time_mask)
        # NOTE(review): the normalization runs over axis 0 of self.z --
        # confirm this is the intended axis.
        log_pcx = self.z - log_sum(self.z, axis=0, keepdims=True)
        err = ctc_cost(log_pcx, time_mask, targets, seq_lens)
        return err, known_grads
def forward_filter_step(self, xp):
    """Advance the particle filter by one observation ``xp``.

    Samples new ``s`` values from the proposal via ``theano.scan``, samples
    new ``h`` values from ``self.calc_h_probs``, reweights the particles and
    records the new state as float32 updates of the ``*_past`` / ``*_now``
    shared variables.

    :return: ``(h_samps, updates)``.
    """
    two_var = 2.0 * self.xvar ** 2

    # Need to sample from the proposal distribution first; these terms are
    # the same for every particle.
    xpred = T.dot(self.W.T, (xp - self.c)) / two_var
    sig = (1.0 / (self.b ** 2 + 1.0 / two_var)) / 2.0

    scan_outputs, updates = theano.scan(
        fn=self.sample_proposal_s,
        outputs_info=[None, None, None],
        sequences=[self.s_now, self.h_now],
        non_sequences=[xpred, sig],
        n_steps=self.npcl)
    s_samps, s_pred, prop_terms = scan_outputs

    # Relative h probabilities for each particle.
    h_probs = self.calc_h_probs(s_samps)
    h_samps = self.theano_rng.multinomial(pvals=h_probs.T)

    # Now that we have samples from the proposal, reweight them.
    recons = T.dot(self.W, s_samps.T) + T.reshape(self.c, (self.nx, 1))
    residual = recons - T.reshape(xp, (self.nx, 1))
    x_terms = -T.sum(residual ** 2, axis=0) / two_var
    s_terms = -T.sum(((s_samps - s_pred) * self.b) ** 2, axis=1)
    energies = x_terms + s_terms - prop_terms

    # To avoid exponentiating large or very small numbers, the energies are
    # re-centered by subtracting their maximum; this has no impact on the
    # normalized weights.
    alpha = T.exp(energies - T.max(energies))  # reweighting factors

    new_weights_unnorm = self.weights_now * alpha
    new_weights = new_weights_unnorm / T.sum(new_weights_unnorm)

    state_updates = (
        (self.h_past, self.h_now),
        (self.s_past, self.s_now),
        (self.h_now, h_samps),
        (self.s_now, s_samps),
        (self.weights_past, self.weights_now),
        (self.weights_now, new_weights),
    )
    for shared_var, new_value in state_updates:
        updates[shared_var] = T.cast(new_value, 'float32')

    return h_samps, updates
def _transform(theta, input, downsample_factor):
    """Resample ``input`` on a grid shifted by one offset per sample.

    ``theta`` is reshaped to a single column and padded with a zero column;
    the result is added (with broadcasting) to the grid from ``_meshgrid``.
    Entries 0 and 1 along axis 1 of the sum are passed to ``_interpolate``
    as x and y coordinates.

    Returns a 4-d tensor in (batch, channels, out_height, out_width) order.
    """
    num_batch, num_channels, height, width = input.shape
    shift = T.reshape(theta, (-1, 1))

    # Grid of (x_t, y_t, 1), eq (1) in ref [1].
    out_height = T.cast(height / downsample_factor[0], 'int64')
    out_width = T.cast(width / downsample_factor[1], 'int64')
    grid = _meshgrid(out_height, out_width)

    # The zero column leaves the second coordinate unshifted.
    padded_theta = T.concatenate([shift, T.zeros_like(shift)], axis=1)
    shifted = padded_theta.dimshuffle(0, 1, 'x') + grid.dimshuffle('x', 0, 1)
    x_s_flat = shifted[:, 0].flatten()
    y_s_flat = shifted[:, 1].flatten()

    # _interpolate is fed the input as (bs, height, width, channels).
    channels_last = input.dimshuffle(0, 2, 3, 1)
    sampled = _interpolate(
        channels_last, x_s_flat, y_s_flat, out_height, out_width)

    restored = T.reshape(
        sampled, (num_batch, out_height, out_width, num_channels))
    # Back to conv format (bs, channels, height, width).
    return restored.dimshuffle(0, 3, 1, 2)
def castray(ro, rd, shape_params, nprims, width, height):
    """March rays for a fixed number of steps and return a depth map.

    Each step evaluates ``mapedit`` at the current position along the ray and
    adds the returned distance to the running total, so the total per ray
    looks like d1, d1+d2, d1+d2+d3, ... and flattens out once the surface is
    reached.

    :return: accumulated distance divided by ``tmax`` (20.0) where it is
        below ``tmax``, zero elsewhere; shape (width, height).
    """
    tmax = 20.0
    precis = 0.002
    max_num_steps = 25

    def still_marching(step_dists):
        # 0 where the step distance is already below the precision, else 1.
        return T.switch(step_dists < precis,
                        T.zeros_like(step_dists),
                        T.ones_like(step_dists))

    # First step starts at the ray origin.
    # FIXME (from original): reshape instead of multiplying rd by 0.
    dists = mapedit(ro + rd * 0, shape_params, nprims, width, height)
    steps = still_marching(dists)
    accum_dists = T.reshape(dists, (width, height, 1))

    for _ in range(max_num_steps - 1):
        dists = mapedit(ro + rd * accum_dists, shape_params, nprims,
                        width, height)
        steps = steps + still_marching(dists)
        accum_dists = accum_dists + T.reshape(dists, (width, height, 1))

    last_depth = T.reshape(accum_dists, (width, height))
    depthmap = T.switch(last_depth < tmax,
                        last_depth / tmax,
                        T.zeros_like(last_depth))

    # Step-count based shade; built but not returned.
    color = 1.0 - steps / float(max_num_steps)
    return depthmap
def mapedit(pos, params, nprims, width, height):
    """Evaluate a blended distance for ``nprims`` primitives at ``pos``.

    Columns of ``params`` as read here: 0-2 translation, 3 sphere radius,
    4 round-box radius, 5-6 blend weights (normalized with a softmax).
    For every primitive a sphere distance and a round-box distance are mixed
    with the blend weights; the minimum over primitives is combined with
    ``sdPlane(pos)`` through ``opU``.
    """
    pos_repeat = T.reshape(T.tile(pos, nprims), (width, height, nprims, 3))
    translated_pos = pos_repeat + params[:, 0:3]

    # Sphere: distance to the translated origin minus the radius.
    spheredists = translated_pos.norm(2, axis=3) - params[:, 3]

    # Round box with fixed half-extents of 0.15.
    # FIXME? (from original) Share radii param?
    box_extents = np.array([.15, .15, .15])
    abspos = T.clip(T.abs_(translated_pos) - box_extents, 0.0, 1000.0)
    rounddists = abspos.norm(2, axis=3) - params[:, 4]

    # Blend: softmax over the two blend parameters of each primitive.
    expweights = T.exp(params[:, 5:7])
    softweights = expweights / T.reshape(T.sum(expweights, axis=1),
                                         (nprims, 1))

    # Mix the two distances, then take the union (minimum) over primitives.
    stacked = T.stack([spheredists, rounddists], axis=3)
    mixed = T.sum(stacked * softweights, axis=3)
    union = mixed.min(axis=2)

    # Add colour and plane.
    stacked_union = adddim(union)  # GET RID OF COLOUR FROM GEOM
    plane = sdPlane(pos)
    return opU(stacked_union, plane, width, height)
def compute_f_mu(x, t, params):
    """Normalized radial-basis features of ``x``, gated over time points.

    ``params`` unpacks to ``[centers, spreads, biases, M, b]``. The module
    level names ``ntgates``, ``nx`` and ``kT`` are read from the enclosing
    scope. The biases are multiplied by 0.0, so they do not change the
    value.

    :return: float32 tensor, the gated sum over the ``ntgates`` axis.
    """
    centers, spreads, biases, M, b = params
    nt, nb = t.shape[0], t.shape[1]

    # Squared distances to every center, scaled by exp(spreads).
    diffs = x.dimshuffle(0, 1, 2, 'x') - centers.dimshuffle('x', 'x', 0, 1)
    scaled_diffs = (diffs ** 2) * T.exp(spreads).dimshuffle('x', 'x', 0, 1)
    exp_terms = (T.sum(scaled_diffs, axis=2)
                 + biases.dimshuffle('x', 'x', 0) * 0.0)
    h = T.exp(-exp_terms)

    # Normalization over the basis functions.
    hnorm = h / T.sum(h, axis=2).dimshuffle(0, 1, 'x')

    # nt by nb by ntgates by nx
    z = (T.reshape(T.dot(hnorm, M), (nt, nb, ntgates, nx))
         + b.dimshuffle('x', 'x', 0, 1))

    # ntgates evenly spaced points in [0, 1].
    tpoints = (T.cast(T.arange(ntgates), 'float32')
               / T.cast(ntgates - 1, 'float32'))
    tpoints = T.reshape(tpoints, (1, 1, ntgates))

    # Gates: exp(-kT * squared distance of t to each point), normalized.
    tgating = T.exp(-kT * (tpoints - t) ** 2)
    tgating = tgating / T.reshape(T.sum(tgating, axis=2), (nt, nb, 1))
    tgating = T.reshape(tgating, (nt, nb, ntgates, 1))

    out = T.sum(z * tgating, axis=2)
    return T.cast(out, 'float32')
def maxpool_3D(input, ds, ignore_border=False):
    """Max-pool the three trailing axes of ``input`` with two 2-d poolings.

    The frames (last two axes) are pooled first with window (ds[1], ds[2]).
    The time axis (third from last) is then moved to the back, pooled with
    window (1, ds[0]) and moved back to its original position.

    :param input: tensor with at least 3 dimensions.
    :param ds: (time, height, width) pooling factors.
    :param ignore_border: forwarded to ``DownsampleFactorMax``.
    """
    if input.ndim < 3:
        raise NotImplementedError('max_pool_3d requires a dimension >= 3')

    vid_dim = input.ndim

    def pool_last_two(tensor_nd, window):
        # Pool the two trailing axes through a (batch, 1, rows, cols) view,
        # then restore the leading dimensions.
        leading = tensor_nd.shape[:-2]
        batch_size = T.shape_padright(T.prod(leading), 1)
        shape_4d = T.cast(
            T.join(0, batch_size, T.as_tensor([1, ]), tensor_nd.shape[-2:]),
            'int32')
        as_4d = T.reshape(tensor_nd, shape_4d, ndim=4)
        pooled = DownsampleFactorMax(window, ignore_border)(as_4d)
        restored_shape = T.join(0, leading, pooled.shape[-2:])
        return T.reshape(pooled, restored_shape, ndim=input.ndim)

    # Second and third entries of ds are for height and width.
    frames_pooled = pool_last_two(input, (ds[1], ds[2]))

    # (..., time, rows, cols) -> (..., rows, cols, time)
    time_to_back = (list(range(vid_dim - 3))
                    + [vid_dim - 2] + [vid_dim - 1] + [vid_dim - 3])
    time_pooled = pool_last_two(frames_pooled.dimshuffle(time_to_back),
                                (1, ds[0]))

    # (..., rows, cols, time) -> (..., time, rows, cols)
    time_to_front = (list(range(vid_dim - 3))
                     + [vid_dim - 1] + [vid_dim - 3] + [vid_dim - 2])
    return time_pooled.dimshuffle(time_to_front)
def do_fft(input, n_hidden):
    """Apply ``cufft`` to rows that hold two blocks of ``n_hidden`` values.

    Each row is viewed as (2, n_hidden), the two axes are swapped before the
    transform and swapped back afterwards. The transform output is multiplied
    by sqrt(n_hidden).

    :return: tensor of shape (input.shape[0], 2 * n_hidden).
    """
    batch = input.shape[0]
    paired = T.reshape(input, (batch, 2, n_hidden)).dimshuffle(0, 2, 1)
    transformed = cufft(paired) * T.sqrt(n_hidden)
    return T.reshape(transformed.dimshuffle(0, 2, 1), (batch, 2 * n_hidden))
def do_ifft(input, n_hidden):
    """Apply ``cuifft`` to rows that hold two blocks of ``n_hidden`` values.

    Each row is viewed as (2, n_hidden), the two axes are swapped before the
    transform and swapped back afterwards. The transform output is divided by
    sqrt(n_hidden).

    :return: tensor of shape (input.shape[0], 2 * n_hidden).
    """
    batch = input.shape[0]
    paired = T.reshape(input, (batch, 2, n_hidden)).dimshuffle(0, 2, 1)
    transformed = cuifft(paired) / T.sqrt(n_hidden)
    return T.reshape(transformed.dimshuffle(0, 2, 1), (batch, 2 * n_hidden))
def T_l2_cost_conv(x, a, A, imshp, kshp, mask=True):
    """
    Half the summed squared error between the image and its estimate.

    xsz*ysz*nchannels, nimages = x.shape
    xsz*ysz*nfeat, nimages = a.shape
    xsz*ysz*nchannels, nfeat = A.shape

    imshp   = num images, channels, szy, szx
    kshp    = features, channels, szy, szx
    featshp = num images, features, szy, szx (derived from imshp and kshp)

    When ``mask`` is truthy only the error inside the window
    [kshp - 1, imshp - kshp + 1) of the two trailing axes is counted; the
    rest is zero.
    """
    featshp = (imshp[0], kshp[0],
               imshp[2] - kshp[2] + 1,
               imshp[3] - kshp[3] + 1)

    image = T.reshape(T.transpose(x), imshp)
    kernel = T.reshape(T.transpose(A), kshp)
    features = T.reshape(T.transpose(a), featshp)

    # Swap the first two kernel axes and reverse the kernel image dims
    # (for correlation).
    kernel_rotated = T.transpose(kernel[:, :, ::-1, ::-1], axes=[1, 0, 2, 3])
    image_estimate = conv2d(features, kernel_rotated, border_mode='full')

    residual = image - image_estimate
    if mask:
        interior = (slice(None),
                    slice(None),
                    slice(kshp[2] - 1, imshp[2] - kshp[2] + 1),
                    slice(kshp[3] - 1, imshp[3] - kshp[3] + 1))
        image_error = T.zeros_like(residual)
        image_error = T.set_subtensor(image_error[interior],
                                      residual[interior])
    else:
        image_error = residual

    return .5 * T.sum(image_error ** 2)
def unitary_transform(input, n_hidden, U):
    """Multiply rows of ``input`` by ``U`` using complex arithmetic.

    Each row holds two blocks of ``n_hidden`` values (first the real part,
    then the imaginary part, judging by the product formula below); ``U[0]``
    and ``U[1]`` play the same two roles for the matrix.

    :return: tensor of shape (input.shape[0], 2 * n_hidden).
    """
    UR = U[0, :, :]
    UI = U[1, :, :]

    parts = T.reshape(input, (input.shape[0], 2, n_hidden))
    IR = parts[:, 0, :]
    II = parts[:, 1, :]

    # (IR + i*II) (UR + i*UI) = (IR.UR - II.UI) + i (IR.UI + II.UR)
    first_block = IR.dot(UR) - II.dot(UI)
    second_block = IR.dot(UI) + II.dot(UR)

    stacked = T.stack([first_block, second_block], axis=1)
    return T.reshape(stacked, (input.shape[0], 2 * n_hidden))
def Transform(X, w1, g1, b1, w2, g2, b2, downsample_factor=2):
    """Warp ``X`` with per-sample 2x3 matrices produced by ``GetTheta``.

    ``GetTheta(X, w1, g1, b1, w2, g2, b2)`` supplies the parameters, which
    are reshaped to (-1, 2, 3) and multiplied with the grid from
    ``Meshgrid``; rows 0 and 1 of the product are passed to ``Interpolate``
    as x and y coordinates.

    The tensor handed to ``Interpolate`` is ``X`` itself. The previous code
    called ``input.dimshuffle``, which refers to the builtin ``input``
    function because this function has no such parameter.

    :param X: 4-d tensor, unpacked as (batch, channels, height, width).
    :param downsample_factor: the output height and width are the input
        sizes floor-divided by this value.
    :return: 4-d tensor in (batch, channels, out_height, out_width) order.
    """
    theta = GetTheta(X, w1, g1, b1, w2, g2, b2)
    num_batch, num_channels, height, width = X.shape
    theta = T.reshape(theta, (-1, 2, 3))

    height_f = T.cast(height, 'float32')
    width_f = T.cast(width, 'float32')
    out_height = T.cast(height_f // downsample_factor, 'int64')
    out_width = T.cast(width_f // downsample_factor, 'int64')
    grid = Meshgrid(out_height, out_width)

    # Transform A x (x_t, y_t, 1)^T -> (x_s, y_s)
    T_g = T.dot(theta, grid)
    x_s, y_s = T_g[:, 0], T_g[:, 1]
    x_s_flat = x_s.flatten()
    y_s_flat = y_s.flatten()

    # Interpolate is fed the input as (bs, height, width, channels).
    input_dim = X.dimshuffle(0, 2, 3, 1)
    input_transformed = Interpolate(input_dim, x_s_flat, y_s_flat,
                                    downsample_factor)

    output = T.reshape(input_transformed,
                       (num_batch, out_height, out_width, num_channels))
    output = output.dimshuffle(0, 3, 1, 2)
    return output
def get_output(self, train=False):
    """Run the 1-d convolution as a 2-d convolution with a singleton axis.

    The 3-d input gets a trailing axis of size 1 and its axes 1 and 2 are
    swapped before convolving; the inverse is applied to the result. cuDNN is
    used when ``on_gpu()`` and ``dnn.dnn_available()`` are both true.

    For ``border_mode == 'same'`` the cuDNN path pads explicitly, while the
    fallback path convolves in 'full' mode and crops the result.
    """
    X = self.get_input(train)
    X = T.reshape(X, (X.shape[0], X.shape[1], X.shape[2], 1))
    X = X.dimshuffle(0, 2, 1, 3)

    want_same = self.border_mode == 'same'

    if on_gpu() and dnn.dnn_available():
        if want_same:
            assert(self.subsample_length == 1)
            pad_x = (self.filter_length - self.subsample_length) // 2
            conv_out = dnn.dnn_conv(img=X,
                                    kerns=self.W,
                                    border_mode=(pad_x, 0))
        else:
            conv_out = dnn.dnn_conv(img=X,
                                    kerns=self.W,
                                    border_mode=self.border_mode,
                                    subsample=self.subsample)
    else:
        if want_same:
            assert(self.subsample_length == 1)
            effective_mode = 'full'
        else:
            effective_mode = self.border_mode
        conv_out = T.nnet.conv.conv2d(X, self.W,
                                      border_mode=effective_mode,
                                      subsample=self.subsample)
        if want_same:
            # Crop the 'full' result back to the input length.
            shift_x = (self.filter_length - 1) // 2
            conv_out = conv_out[:, :, shift_x:X.shape[2] + shift_x, :]

    activated = self.activation(conv_out + self.b.dimshuffle('x', 0, 'x', 'x'))
    squeezed = T.reshape(
        activated,
        (activated.shape[0], activated.shape[1], activated.shape[2]))
    return squeezed.dimshuffle(0, 2, 1)
def max_pool_2d(input, ds, ignore_border=False):
    """
    Pool an N-D tensor (N >= 2) over its 2 last dimensions.

    The input is downscaled by the specified factor, by keeping only the
    maximum value of non-overlapping patches of size (ds[0], ds[1]). The
    input is reshaped to (batch, 1, height, width) for the pooling op and
    reshaped back afterwards.

    :type input: N-D theano tensor of input images.
    :param input: input images. Max pooling will be done over the 2 last
        dimensions.
    :type ds: tuple of length 2
    :param ds: factor by which to downscale. (2,2) will halve the image in
        each dimension.
    :param ignore_border: boolean value. When True, (5,5) input with
        ds=(2,2) will generate a (2,2) output. (3,3) otherwise.
    """
    if input.ndim < 2:
        raise NotImplementedError("max_pool_2d requires a dimension >= 2")

    # Collapse all leading dimensions into one batch dimension and insert a
    # singleton channel axis: (batch_size, 1, height, width).
    leading = input.shape[:-2]
    batch_size = tensor.shape_padright(tensor.prod(leading), 1)
    shape_4d = tensor.cast(
        tensor.join(0, batch_size, tensor.as_tensor([1]), input.shape[-2:]),
        "int64")
    as_4d = tensor.reshape(input, shape_4d, ndim=4)

    # Downsample the mini-batch of images.
    pooled = DownsampleFactorMax(ds, ignore_border)(as_4d)

    # Restore the original leading dimensions around the pooled image.
    out_shape = tensor.join(0, leading, pooled.shape[-2:])
    return tensor.reshape(pooled, out_shape, ndim=input.ndim)
def categorical_crossentropy(self, y_pred, y_true):
    """Return the mask-weighted mean categorical cross-entropy.

    Predictions are clipped to ``[epsilon, 1 - epsilon]`` before the loss is
    taken. Both the per-row loss and ``self.mask`` are reshaped to a column of
    ``self.mask.shape[0] * self.batch_size`` rows, multiplied, summed and
    divided by the sum of the mask.
    """
    column = (self.mask.shape[0] * self.batch_size, 1)

    clipped = T.clip(y_pred, self.epsilon, 1.0 - self.epsilon)
    weights = T.reshape(self.mask, column)
    losses = T.reshape(T.nnet.categorical_crossentropy(clipped, y_true), column)

    return T.sum(losses * weights) / T.sum(weights)
def lookup_all(sentences):
    """Apply ``lookup_sentence`` to each entry of ``sentences`` via scan.

    The stacked 3D scan output gets a singleton axis inserted at position 1,
    giving a 4D tensor of shape ``(d0, 1, d1, d2)``. Scan updates are
    discarded.
    """
    stacked, _ = theano.scan(lookup_sentence, sequences=[sentences])
    dims = stacked.shape
    target = (dims[0], 1, dims[1], dims[2])
    return T.reshape(stacked, target, ndim=4)
def warp_bilinear_interpolation(orig_img, x, y, out_height, out_width):
    """Sample ``orig_img`` at the coordinates ``(x, y)`` with bilinear weights.

    The image is moved to channels-last layout, flattened to
    ``(num_batch * height * width, num_channels)`` and the four corner pixels
    of every sampling point are gathered by flat index. The weighted sum is
    reshaped to ``(num_batch, out_height, out_width, num_channels)`` and
    returned in channels-second layout.

    :param orig_img: 4D tensor in bc01 layout (batch, channels, dim0, dim1).
    :param x: horizontal sampling coordinates; flattened on entry. The scaling
        below treats them as lying in [-1, 1].
    :param y: vertical sampling coordinates, handled like ``x``.
    :param out_height: output height used for the index base and final reshape.
    :param out_width: output width used for the index base and final reshape.
    :return: 4D tensor in bc01 layout.

    NOTE(review): presumably ``x`` and ``y`` each hold
    ``num_batch * out_height * out_width`` values (that is the length of
    ``base``) -- confirm against the caller.
    """
    # shuffle channel dim to last dimension, since we want to apply the same
    # transform to the whole dim
    img = orig_img.dimshuffle(0, 2, 3, 1)

    # flatten batch dims
    x = x.flatten()
    y = y.flatten()

    # *_f are floats
    num_batch, height, width, num_channels = img.shape
    height_f = T.cast(height, theano.config.floatX)
    width_f = T.cast(width, theano.config.floatX)

    # scale indices from [-1, 1] to [0, width/height].
    x = (x + 1) / 2 * width_f
    y = (y + 1) / 2 * height_f

    # Clip indices to ensure they are not out of bounds.
    max_x = width_f - 1
    max_y = height_f - 1
    # TODO add monitoring to out of bounds points
    x0 = T.clip(x, 0, max_x)
    x1 = T.clip(x + 1, 0, max_x)
    y0 = T.clip(y, 0, max_y)
    y1 = T.clip(y + 1, 0, max_y)

    # We need floatX for interpolation and int64 for indexing.
    x0_f = T.floor(x0)
    x1_f = T.floor(x1)
    y0_f = T.floor(y0)
    y1_f = T.floor(y1)
    x0 = T.cast(x0, 'int64')
    x1 = T.cast(x1, 'int64')
    y0 = T.cast(y0, 'int64')
    y1 = T.cast(y1, 'int64')

    # The input is [num_batch, height, width, channels]. We do the lookup in
    # the flattened input, i.e [num_batch*height*width, channels]. We need
    # to offset all indices to match the flat version
    dim2 = width
    dim1 = width * height
    # One flat offset per output pixel: the start of that sample's image.
    base = T.repeat(
        T.arange(num_batch, dtype='int64') * dim1, out_height * out_width)
    base_y0 = base + y0 * dim2
    base_y1 = base + y1 * dim2
    idx_a = base_y0 + x0
    idx_b = base_y1 + x0
    idx_c = base_y0 + x1
    idx_d = base_y1 + x1

    # use indices to lookup pixels for all samples
    img_flat = img.reshape((-1, num_channels))
    Ia = img_flat[idx_a]
    Ib = img_flat[idx_b]
    Ic = img_flat[idx_c]
    Id = img_flat[idx_d]

    # calculate interpolated values
    # NOTE(review): the weights use the scaled but *unclipped* x / y together
    # with the clipped corner coordinates, so for points at or beyond the
    # border the four weights need not sum to 1 -- confirm this is intended.
    wa = ((x1_f - x) * (y1_f - y)).dimshuffle(0, 'x')
    wb = ((x1_f - x) * (y - y0_f)).dimshuffle(0, 'x')
    wc = ((x - x0_f) * (y1_f - y)).dimshuffle(0, 'x')
    wd = ((x - x0_f) * (y - y0_f)).dimshuffle(0, 'x')
    output_2d = T.sum([wa * Ia, wb * Ib, wc * Ic, wd * Id], axis=0)
    output_4d = T.reshape(output_2d,
                          (num_batch, out_height, out_width, num_channels))

    # convert back from b01c (batch, dim0, dim1, channels)
    # to bc01 (batch, channels, dim0, dim1)
    output = output_4d.dimshuffle(0, 3, 1, 2)
    return output
def get_real_coefficients(self):
    """Return ``self.a`` and ``self.c``, each reshaped to a 1-D vector."""
    flat_a = tt.reshape(self.a, (self.a.size, ))
    flat_c = tt.reshape(self.c, (self.c.size, ))
    return flat_a, flat_c
def build(self):
    """
    Build the model variables.

    Variables are created through ``self.Det`` (deterministic) and
    ``self.LogNorm`` (log-normal) in dependency order: countermeasure
    reduction -> delayed growth reduction -> predicted growth -> real growth
    -> epidemic size -> observed confirmed cases. The bracketed prefixes in
    the comments give the axes of each variable.

    Every intermediate is taken from the value returned by ``self.Det`` /
    ``self.LogNorm``, so this method does not rely on those helpers also
    storing the variable as an attribute of ``self``.
    """
    CMReduction = self.build_reduction_var()

    # Window of active countermeasures extended into the past
    Earlier_ActiveCMs = self.d.get_ActiveCMs(
        self.d.Ds[0] - pd.DateOffset(self.CMDelayCut), self.d.Ds[-1])

    # [region, CM, day] Reduction factor for each CM,C,D
    ActiveCMReduction = (T.reshape(CMReduction,
                                   (1, self.nCMs, 1))**Earlier_ActiveCMs)

    # [region, day] Reduction factor from CMs for each C,D (noise added below)
    GrowthReduction = self.Det("GrowthReduction",
                               T.prod(ActiveCMReduction, axis=1),
                               plot_trace=False)

    # [region, day] Convolution of GrowthReduction by DelayProb along days
    DelayedGrowthReduction = self.Det(
        "DelayedGrowthReduction",
        geom_convolution(GrowthReduction, self.CMDelayProb,
                         axis=1)[:, self.CMDelayCut:],
        plot_trace=False,
    )

    # [] Baseline growth rate (wide prior OK, mean estimates ~10% daily growth)
    BaseGrowthRate = self.LogNorm("BaseGrowthRate", 1.2, 2.3)

    # [region] Region growth rate
    # TODO: Estimate growth rate variance
    RegionGrowthRate = self.LogNorm("RegionGrowthRate",
                                    BaseGrowthRate,
                                    0.3,
                                    shape=(self.nRs, ))

    # [region] Region unreliability as common scale multiplier of its:
    # * measurements (measurement unreliability)
    # * expected growth noise
    # TODO: Estimate good prior (but can be weak?)
    RegionScaleMult = self.LogNorm("RegionScaleMult",
                                   1.0,
                                   1.0,
                                   shape=(self.nRs, ))

    # [region, day] The ideal predicted daily growth
    PredictedGrowth = self.Det(
        "PredictedGrowth",
        T.reshape(RegionGrowthRate, (self.nRs, 1)) * DelayedGrowthReduction,
        plot_trace=False,
    )

    # [region, day] The actual (still hidden) growth rate each day
    # TODO: Estimate noise variance (should be small, measurement variance below)
    # Miscalibration: too low: time effects pushed into CMs, too high: explains away CMs
    RealGrowth = self.LogNorm(
        "RealGrowth",
        PredictedGrowth,
        RegionScaleMult.reshape((self.nRs, 1)) * 0.1,
        shape=(self.nRs, self.nDs),
        plot_trace=False,
    )

    # [region, day] Multiplicative noise applied to predicted growth rate
    RealGrowthNoise = self.Det("RealGrowthNoise",
                               RealGrowth / PredictedGrowth,
                               plot_trace=False)

    # [region] Initial size of epidemic (the day before the start, only those detected; wide prior OK)
    InitialSize = self.LogNorm("InitialSize", 1.0, 10, shape=(self.nRs, ))

    # [region, day] The number of cases that would be detected with noiseless testing
    # (Noise source includes both false-P/N rates and local variance in test volume and targeting)
    # (Since we only care about growth rates and assume consistent testing, it is fine to ignore real size)
    # Uses the local RealGrowth like every other step, not an attribute lookup.
    Size = self.Det(
        "Size",
        T.reshape(InitialSize, (self.nRs, 1)) * RealGrowth.cumprod(axis=1),
        plot_trace=False,
    )

    # [region, day] Cumulative tested positives
    Observed = self.LogNorm(
        "Observed",
        Size,
        0.4,  # alternative: RegionScaleMult.reshape((self.nRs, 1)) * 0.4
        shape=(self.nRs, self.nDs),
        observed=self.d.Confirmed,
        plot_trace=False,
    )

    # [region, day] Multiplicative noise applied to predicted growth rate
    # Note: computed backwards, since Observed needs to be a distribution
    ObservedNoise = self.Det("ObservedNoise",
                             Observed / Size,
                             plot_trace=False)
def flatten(array):
    """Return ``array`` reshaped to one dimension and evaluated.

    The argument is reshaped to ``(size(array),)`` with ``T.reshape`` and the
    resulting symbolic expression is evaluated with ``.eval()``.

    The previous body ignored ``array`` and operated on an unrelated name
    ``m``; it now flattens the argument it is given.

    NOTE(review): ``size`` is resolved from file scope, which is not visible
    here -- confirm it accepts the same kind of object as ``T.reshape``.
    """
    return T.reshape(array, (size(array), )).eval()
def max_pool_3d(input, ds, ignore_border=False):
    """
    Max-pool the three trailing dimensions (time, height, width) of an N-D
    tensor, N >= 3.

    The 3D pooling is done as two 2D pooling passes: first over
    (height, width) with patch ``(ds[1], ds[2])``, then, after moving time to
    the last axis, over (width, time) with patch ``(1, ds[0])``. The axes are
    put back in (time, height, width) order before returning.

    :type input: N-D theano tensor of input images.
    :param input: input images. Max pooling will be done over the 3 last
        dimensions.
    :type ds: tuple of length 3
    :param ds: factor by which to downscale. (2,2,2) will halve the video in
        each dimension.
    :param ignore_border: boolean value. When True, (5,5,5) input with
        ds=(2,2,2) will generate a (2,2,2) output. (3,3,3) otherwise.
    """
    if input.ndim < 3:
        raise NotImplementedError('max_pool_3d requires a dimension >= 3')

    def _pool_last_two(tensor_nd, patch):
        # Fold every leading axis into one batch axis, pool the two trailing
        # axes as a (batch, 1, a, b) tensor, then unfold the leading axes.
        lead = tensor_nd.shape[:-2]
        tail = tensor_nd.shape[-2:]
        batch = T.shape_padright(T.prod(lead), 1)
        shape_4d = T.cast(T.join(0, batch, T.as_tensor([
            1,
        ]), tail), 'int32')
        as_4d = T.reshape(tensor_nd, shape_4d, ndim=4)
        pooled = T.signal.pool.pool_2d(as_4d, patch, ignore_border)
        restored = T.join(0, lead, pooled.shape[-2:])
        return T.reshape(pooled, restored, ndim=tensor_nd.ndim)

    vid_dim = input.ndim
    prefix = list(range(vid_dim - 3))

    # Pass 1: pool rows and cols of every frame.
    frames_pooled = _pool_last_two(input, (ds[1], ds[2]))

    # Move time to the back: (..., time, rows, cols) -> (..., rows, cols, time)
    time_last = prefix + [vid_dim - 2] + [vid_dim - 1] + [vid_dim - 3]
    input_time = frames_pooled.dimshuffle(time_last)

    # Pass 2: pool along time only (patch of 1 on the cols axis).
    time_pooled = _pool_last_two(input_time, (1, ds[0]))

    # Back to (..., time, rows, cols).
    time_first = prefix + [vid_dim - 1] + [vid_dim - 3] + [vid_dim - 2]
    return time_pooled.dimshuffle(time_first)
def build_model(shared_params, options):
    """Build the symbolic graph of the two-layer attention model.

    A sentence is embedded, passed through unigram / bigram / trigram
    feed-forward "convolutions" that are max-pooled over positions, and the
    pooled features are concatenated into ``pool_feat``. Two rounds of
    attention over the projected image features are then applied, followed by
    a stack of ``options['combined_num_mlp']`` feed-forward layers and a
    softmax.

    ``pool_feat`` is the concatenation of all three n-gram features, so
    ``use_unigram_conv``, ``use_bigram_conv`` and ``use_trigram_conv`` must all
    be enabled; otherwise the concatenation fails with ``NameError``.

    :param shared_params: mapping of parameter name to shared variable; read
        here for ``'w_emb'`` and passed through to ``fflayer``.
    :param options: configuration mapping (dropout, n-gram and MLP settings).
    :return: tuple ``(image_feat, input_idx, input_mask, label, dropout, cost,
        accu, pred_label, prob_attention_1, prob_attention_2)``.
    """
    trng = RandomStreams(1234)
    drop_ratio = options['drop_ratio']
    w_emb = shared_params['w_emb']

    dropout = theano.shared(numpy.float32(0.))
    image_feat = T.ftensor3('image_feat')
    # batch_size x T
    input_idx = T.imatrix('input_idx')
    input_mask = T.matrix('input_mask')
    # label is the TRUE label
    label = T.ivector('label')

    # Row 0 of the extended table is an all-zero "empty word" embedding.
    empty_word = theano.shared(value=np.zeros((1, options['n_emb']),
                                              dtype='float32'),
                               name='empty_word')
    w_emb_extend = T.concatenate([empty_word, w_emb], axis=0)
    input_emb = w_emb_extend[input_idx]

    # turn those appending words into zeros
    # batch_size x T x n_emb
    input_emb = input_emb * input_mask[:, :, None]
    if options['sent_drop']:
        input_emb = dropout_layer(input_emb, dropout, trng, drop_ratio)

    sent_act = options.get('sent_conv_act', 'tanh')
    n_pos = input_emb.shape[1]

    if options['use_unigram_conv']:
        unigram_conv_feat = fflayer(shared_params, input_emb, options,
                                    prefix='conv_unigram',
                                    act_func=sent_act)
        unigram_pool_feat = unigram_conv_feat.max(axis=1)

    if options['use_bigram_conv']:
        # Interleaved indices (0, 1, 1, 2, 2, 3, ...) so that each pair of
        # neighbouring embeddings ends up side by side after the reshape.
        idx = T.concatenate([T.arange(n_pos)[:-1],
                             T.arange(n_pos)[1:]]).reshape(
                                 (2, n_pos - 1)).transpose().flatten()
        bigram_emb = T.reshape(input_emb[:, idx, :],
                               (input_emb.shape[0], n_pos - 1,
                                2 * input_emb.shape[2]))
        bigram_conv_feat = fflayer(shared_params, bigram_emb, options,
                                   prefix='conv_bigram',
                                   act_func=sent_act)
        bigram_pool_feat = bigram_conv_feat.max(axis=1)

    if options['use_trigram_conv']:
        idx = T.concatenate([T.arange(n_pos)[:-2],
                             T.arange(n_pos)[1:-1],
                             T.arange(n_pos)[2:]]).reshape(
                                 (3, n_pos - 2)).transpose().flatten()
        trigram_emb = T.reshape(input_emb[:, idx, :],
                                (input_emb.shape[0], n_pos - 2,
                                 3 * input_emb.shape[2]))
        trigram_conv_feat = fflayer(shared_params, trigram_emb, options,
                                    prefix='conv_trigram',
                                    act_func=sent_act)
        trigram_pool_feat = trigram_conv_feat.max(axis=1)

    # Sentence representation consumed by everything below.
    pool_feat = T.concatenate([unigram_pool_feat,
                               bigram_pool_feat,
                               trigram_pool_feat], axis=1)

    image_feat_down = fflayer(shared_params, image_feat, options,
                              prefix='image_mlp',
                              act_func=options.get('image_mlp_act', 'tanh'))

    if options.get('use_before_attention_drop', False):
        image_feat_down = dropout_layer(image_feat_down, dropout, trng,
                                        drop_ratio)
        pool_feat = dropout_layer(pool_feat, dropout, trng, drop_ratio)

    image_att_act = options.get('image_att_mlp_act', 'tanh')
    sent_att_act = options.get('sent_att_mlp_act', 'tanh')
    combined_att_act = options.get('combined_att_mlp_act', 'tanh')

    # attention model begins here
    # first layer attention model
    image_feat_attention_1 = fflayer(shared_params, image_feat_down, options,
                                     prefix='image_att_mlp_1',
                                     act_func=image_att_act)
    pool_feat_attention_1 = fflayer(shared_params, pool_feat, options,
                                    prefix='sent_att_mlp_1',
                                    act_func=sent_att_act)
    combined_feat_attention_1 = image_feat_attention_1 + \
        pool_feat_attention_1[:, None, :]
    if options['use_attention_drop']:
        combined_feat_attention_1 = dropout_layer(combined_feat_attention_1,
                                                  dropout, trng, drop_ratio)
    combined_feat_attention_1 = fflayer(shared_params,
                                        combined_feat_attention_1, options,
                                        prefix='combined_att_mlp_1',
                                        act_func=combined_att_act)
    prob_attention_1 = T.nnet.softmax(combined_feat_attention_1[:, :, 0])
    image_feat_ave_1 = (prob_attention_1[:, :, None]
                        * image_feat_down).sum(axis=1)
    combined_hidden_1 = image_feat_ave_1 + pool_feat

    # second layer attention model
    image_feat_attention_2 = fflayer(shared_params, image_feat_down, options,
                                     prefix='image_att_mlp_2',
                                     act_func=image_att_act)
    pool_feat_attention_2 = fflayer(shared_params, combined_hidden_1, options,
                                    prefix='sent_att_mlp_2',
                                    act_func=sent_att_act)
    combined_feat_attention_2 = image_feat_attention_2 + \
        pool_feat_attention_2[:, None, :]
    if options['use_attention_drop']:
        combined_feat_attention_2 = dropout_layer(combined_feat_attention_2,
                                                  dropout, trng, drop_ratio)
    combined_feat_attention_2 = fflayer(shared_params,
                                        combined_feat_attention_2, options,
                                        prefix='combined_att_mlp_2',
                                        act_func=combined_att_act)
    prob_attention_2 = T.nnet.softmax(combined_feat_attention_2[:, :, 0])
    image_feat_ave_2 = (prob_attention_2[:, :, None]
                        * image_feat_down).sum(axis=1)

    if options.get('use_final_image_feat_only', False):
        combined_hidden = image_feat_ave_2 + pool_feat
    else:
        combined_hidden = image_feat_ave_2 + combined_hidden_1

    for i in range(options['combined_num_mlp']):
        if options.get('combined_mlp_drop_%d'%(i), False):
            combined_hidden = dropout_layer(combined_hidden, dropout, trng,
                                            drop_ratio)
        if i == options['combined_num_mlp'] - 1:
            # The last layer stays linear; the softmax follows below.
            combined_hidden = fflayer(shared_params, combined_hidden, options,
                                      prefix='combined_mlp_%d'%(i),
                                      act_func='linear')
        else:
            combined_hidden = fflayer(shared_params, combined_hidden, options,
                                      prefix='combined_mlp_%d'%(i),
                                      act_func=options.get(
                                          'combined_mlp_act_%d'%(i), 'tanh'))

    # drop the image output
    prob = T.nnet.softmax(combined_hidden)
    prob_y = prob[T.arange(prob.shape[0]), label]
    pred_label = T.argmax(prob, axis=1)
    # sum or mean?
    cost = -T.mean(T.log(prob_y))
    accu = T.mean(T.eq(pred_label, label))

    return image_feat, input_idx, input_mask, \
        label, dropout, cost, accu, pred_label, \
        prob_attention_1, prob_attention_2
_SPATIAL_KERNELS = ("exponential", "gaussian", "matern52", "matern32")


def _spatial_covariance(kernel, r, amplitude_sq):
    """Return ``amplitude_sq`` times the ``kernel`` correlation of scaled distances ``r``."""
    if kernel == "exponential":
        return amplitude_sq * tt.exp(-r)
    if kernel == "gaussian":
        return amplitude_sq * tt.exp(-tt.sqr(r) * 0.5)
    if kernel == "matern52":
        return amplitude_sq * (
            (1.0 + tt.sqrt(5.0) * r + 5.0 / 3.0 * tt.sqr(r)) *
            tt.exp(-1.0 * tt.sqrt(5.0) * r))
    if kernel == "matern32":
        return amplitude_sq * (
            1.0 + tt.sqrt(3.0) * r) * tt.exp(-tt.sqrt(3.0) * r)
    raise ValueError("kernel must be one of %r, got %r" %
                     (_SPATIAL_KERNELS, kernel))


def run_model(index, in_dir, out_dir, data_filename, func_filename,
              struct_filename, dist_filename, kernel, n, sample_size,
              tune_size):
    """
    Fit the fused structural/functional connectivity model and save the trace.

    Side effects: changes the working directory to ``in_dir + str(index)`` and
    writes the sampled trace as a CSV file under ``out_dir``.

    index: data index; appended to in_dir and used in the output file name
    in_dir: prefix of the work directory
    out_dir: prefix of the output CSV path
    data_filename: filename for time series data
    func_filename: filename for functional connectivity
    struct_filename: filename for structural connectivity
    dist_filename: filename for distance matrices of the n ROIs
    kernel: "exponential" or "gaussian" or "matern52" or "matern32"
    n: number of ROIs
    sample_size: number of NUTS draws
    tune_size: number of tuning (burn-in) steps

    Raises ValueError when kernel is not one of the four supported names.
    Previously an unknown kernel was accepted silently and the model was
    fitted with an all-zero mean.
    """
    # Fail before touching the working directory or building the model.
    if kernel not in _SPATIAL_KERNELS:
        raise ValueError("kernel must be one of %r, got %r" %
                         (_SPATIAL_KERNELS, kernel))

    os.chdir(in_dir + str(index))
    Y = get_data(data_filename)
    mFunc = get_func(func_filename, n)
    Struct = get_struct(struct_filename, n)
    Dist = get_dist(dist_filename, n)

    m = Dist[0].shape[0]
    k = Y.shape[1]
    n_vec = n * (n + 1) // 2

    # Mean of the first column within each ROI's block of m rows.
    Y_mean = np.array(
        [np.mean(Y[i * m:(i + 1) * m, 0]) for i in range(n)])

    with pm.Model() as model_generator:
        # covariance matrix
        log_Sig = pm.Uniform("log_Sig", -8, 8, shape=(n, ))
        SQ = tt.diag(tt.sqrt(tt.exp(log_Sig)))
        Func_Covm = tt.dot(tt.dot(SQ, mFunc), SQ)
        Struct_Convm = tt.dot(tt.dot(SQ, Struct), SQ)

        # double fusion of structural and FC
        L_fc_vec = tt.reshape(
            tt.slinalg.cholesky(tt.squeeze(Func_Covm)).T[np.triu_indices(n)],
            (n_vec, ))
        L_st_vec = tt.reshape(
            tt.slinalg.cholesky(
                tt.squeeze(Struct_Convm)).T[np.triu_indices(n)], (n_vec, ))
        Struct_vec = tt.reshape(Struct[np.triu_indices(n)], (n_vec, ))
        lambdaw = pm.Beta("lambdaw", alpha=1, beta=1, shape=(n_vec, ))
        Kf = pm.Beta("Kf", alpha=1, beta=1, shape=(n_vec, ))
        rhonn = Kf*( (1-lambdaw)*L_fc_vec + lambdaw*L_st_vec ) + \
            (1-Kf)*( (1-Struct_vec*lambdaw)*L_fc_vec + Struct_vec*lambdaw*L_st_vec )

        # correlation
        Cov_temp = tt.triu(tt.ones((n, n)))
        Cov_temp = tt.set_subtensor(Cov_temp[np.triu_indices(n)], rhonn)
        Cov_mat_v = tt.dot(Cov_temp.T, Cov_temp)
        d = tt.sqrt(tt.diagonal(Cov_mat_v))
        rho = (Cov_mat_v.T / d).T / d
        rhoNew = pm.Deterministic("rhoNew", rho[np.triu_indices(n, 1)])

        # temporal correlation AR(1)
        phi_T = pm.Uniform("phi_T", 0, 1, shape=(n, ))
        sigW_T = pm.Uniform("sigW_T", 0, 100, shape=(n, ))
        B = pm.Normal("B", 0, 100, shape=(n, ))
        muW1 = Y_mean - B  # get the shifted mean
        mean_overall = muW1 / (1.0 - phi_T)  # AR(1) mean
        tau_overall = (1.0 - tt.sqr(phi_T)) / tt.sqr(sigW_T)  # AR(1) variance
        W_T = pm.MvNormal("W_T",
                          mu=mean_overall,
                          tau=tt.diag(tau_overall),
                          shape=(k, n))

        # add all parts together
        one_m_vec = tt.ones((m, 1))
        one_k_vec = tt.ones((1, k))
        D = pm.MvNormal("D", mu=tt.zeros(n), cov=Cov_mat_v, shape=(n, ))
        phi_s = pm.Uniform("phi_s", 0, 20, shape=(n, ))
        spat_prec = pm.Uniform("spat_prec", 0, 100, shape=(n, ))
        H_base = pm.Normal("H_base", 0, 1, shape=(m, n))

        # Fill the (m * n, k) mean one ROI block of m rows at a time.
        Mu_all = tt.zeros((m * n, k))
        for i in range(n):
            r = Dist[i] * phi_s[i]
            H_temp = _spatial_covariance(kernel, r, tt.sqr(spat_prec[i]))
            L_H_temp = tt.slinalg.cholesky(H_temp)
            Mu_all = tt.set_subtensor(
                Mu_all[m * i:m * (i + 1), :],
                B[i] + D[i] + one_m_vec * W_T[:, i] +
                tt.dot(L_H_temp, tt.reshape(H_base[:, i],
                                            (m, 1))) * one_k_vec)

        sigma_error_prec = pm.Uniform("sigma_error_prec", 0, 100)
        Y1 = pm.Normal("Y1", mu=Mu_all, sd=sigma_error_prec, observed=Y)

    with model_generator:
        step = pm.NUTS()
        trace = pm.sample(sample_size, step=step, tune=tune_size, chains=1)

    # save as pandas format and output the csv file
    save_trace = pm.trace_to_dataframe(trace)
    save_trace.to_csv(out_dir + date.today().strftime("%m_%d_%y") + \
        "_sample_size_" + str(sample_size) + "_index_" + str(index) + ".csv")
def set_output(self):
    """Store the previous layer's output, reshaped to ``self._output_shape``."""
    upstream = self._prev_layer.output
    self._output = tensor.reshape(upstream, self._output_shape)
def interpolate_bilinear(im, x, y, out_shape=None, border_mode='nearest'):
    """Bilinearly sample a 4D image tensor at the coordinates ``(x, y)``.

    :param im: 4D tensor unpacked as ``(n, c, h, w)``.
    :param x: horizontal coordinates, flattened on entry; scaled from
        [-1, 1] to [0, w - 1].
    :param y: vertical coordinates, flattened on entry; scaled from
        [-1, 1] to [0, h - 1].
    :param out_shape: pair ``(h_out, w_out)``; defaults to the spatial shape
        of ``im`` when falsy.
    :param border_mode: how corner indices outside the image are handled:
        ``'nearest'`` clips them, ``'mirror'`` reflects them, ``'wrap'`` takes
        them modulo the image size.
    :return: tensor of shape ``(n, c, h_out, w_out)``.
    :raises TypeError: if ``im`` is not 4D.
    :raises ValueError: if ``border_mode`` is not one of the three modes.
    """
    if im.ndim != 4:
        raise TypeError('im should be a 4D Tensor image, got %dD.' % im.ndim)

    out_shape = out_shape if out_shape else T.shape(im)[2:]
    x, y = x.flatten(), y.flatten()
    n, c, h, w = im.shape
    h_out, w_out = out_shape
    height_f = T.cast(h, theano.config.floatX)
    width_f = T.cast(w, theano.config.floatX)

    # scale coordinates from [-1, 1] to [0, width/height - 1]
    x = (x + 1) / 2 * (width_f - 1)
    y = (y + 1) / 2 * (height_f - 1)

    x0_f = T.floor(x)
    y0_f = T.floor(y)
    x1_f = x0_f + 1
    y1_f = y0_f + 1

    if border_mode == 'nearest':
        x0 = T.clip(x0_f, 0, width_f - 1)
        x1 = T.clip(x1_f, 0, width_f - 1)
        y0 = T.clip(y0_f, 0, height_f - 1)
        y1 = T.clip(y1_f, 0, height_f - 1)
    elif border_mode == 'mirror':
        # The reflection periods get their own names: ``w`` and ``h`` must
        # keep holding the integer image size for the flat-index arithmetic
        # further down.
        x_period = 2 * (width_f - 1)
        x0 = T.minimum(x0_f % x_period, -x0_f % x_period)
        x1 = T.minimum(x1_f % x_period, -x1_f % x_period)
        y_period = 2 * (height_f - 1)
        y0 = T.minimum(y0_f % y_period, -y0_f % y_period)
        y1 = T.minimum(y1_f % y_period, -y1_f % y_period)
    elif border_mode == 'wrap':
        x0 = T.mod(x0_f, width_f)
        x1 = T.mod(x1_f, width_f)
        y0 = T.mod(y0_f, height_f)
        y1 = T.mod(y1_f, height_f)
    else:
        raise ValueError("border_mode must be one of "
                         "'nearest', 'mirror', 'wrap'")
    x0, x1, y0, y1 = (T.cast(v, 'int64') for v in (x0, x1, y0, y1))

    # Flat offset of each sample's image, repeated once per output pixel.
    base = T.arange(n) * w * h
    base = T.reshape(base, (-1, 1))
    base = T.tile(base, (1, h_out * w_out))
    base = base.flatten()

    base_y0 = base + y0 * w
    base_y1 = base + y1 * w
    idx_a = base_y0 + x0
    idx_b = base_y1 + x0
    idx_c = base_y0 + x1
    idx_d = base_y1 + x1

    # Gather the four corner pixels from the channels-last, flattened image.
    im_flat = T.reshape(im.dimshuffle((0, 2, 3, 1)), (-1, c))
    pixel_a = im_flat[idx_a]
    pixel_b = im_flat[idx_b]
    pixel_c = im_flat[idx_c]
    pixel_d = im_flat[idx_d]

    # Weights come from the unclipped floor values, so they always sum to 1.
    wa = ((x1_f - x) * (y1_f - y)).dimshuffle((0, 'x'))
    wb = ((x1_f - x) * (1. - (y1_f - y))).dimshuffle((0, 'x'))
    wc = ((1. - (x1_f - x)) * (y1_f - y)).dimshuffle((0, 'x'))
    wd = ((1. - (x1_f - x)) * (1. - (y1_f - y))).dimshuffle((0, 'x'))

    output = T.sum((wa * pixel_a, wb * pixel_b,
                    wc * pixel_c, wd * pixel_d), axis=0)
    output = T.reshape(output, (n, h_out, w_out, c))
    return output.dimshuffle((0, 3, 1, 2))
def __init__(self, data_dir, word2vec, word_vector_size, dim, mode, answer_module, memory_hops, batch_size, l2, normalize_attention, batch_norm, dropout, **kwargs):
    """Load the data, build the symbolic graph and compile the Theano functions.

    The constructor stores the hyper-parameters, loads the vocabulary and the
    'train' / 'val' splits, builds a question module (a linear projection of
    ``q_var`` with optional batch norm and dropout), an answer module (a GRU
    scanned over the embedded answer inputs, seeded with the question
    representation) and a masked cross-entropy loss, and finally compiles
    ``train_fn`` (only when ``mode == 'train'``), ``test_fn`` and ``pred_fn``.

    Unrecognised keyword arguments are accepted and only printed.

    NOTE(review): ``self.cnn_dim`` is read below but is not assigned in this
    method -- presumably a class attribute or set by a subclass; confirm.
    """
    print "==> not used params in DMN class:", kwargs.keys()

    self.data_dir = data_dir
    self.word2vec = word2vec
    self.word_vector_size = word_vector_size
    self.dim = dim
    self.mode = mode
    self.answer_module = answer_module
    self.memory_hops = memory_hops
    self.batch_size = batch_size
    self.l2 = l2
    self.normalize_attention = normalize_attention
    self.batch_norm = batch_norm
    self.dropout = dropout

    self.vocab, self.ivocab = self._load_vocab(self.data_dir)

    self.train_story = None
    self.test_story = None
    self.train_dict_story, self.train_features, self.train_fns_dict, self.train_num_imgs = self._process_input_sind(self.data_dir, 'train')
    self.test_dict_story, self.test_features, self.test_fns_dict, self.test_num_imgs = self._process_input_sind(self.data_dir, 'val')

    # The story lists are the keys of the per-split story dictionaries.
    self.train_story = self.train_dict_story.keys()
    self.test_story = self.test_dict_story.keys()
    self.vocab_size = len(self.vocab)

    self.q_var = T.matrix('q_var') # Now, it's a batch * image_size.
    self.answer_var = T.imatrix('answer_var') # answer of example in minibatch
    self.answer_mask = T.matrix('answer_mask')
    self.answer_inp_var = T.tensor3('answer_inp_var') # answer of example in minibatch

    print "==> building question module"
    # Now, share the parameter with the input module.
    q_var_shuffled = self.q_var.dimshuffle(1,0)
    self.W_inp_emb_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.cnn_dim))
    self.b_inp_emb_in = nn_utils.constant_param(value=0.0, shape=(self.dim,))
    # Linear projection W * q + b, with the bias broadcast over columns.
    q_hist = T.dot(self.W_inp_emb_in, q_var_shuffled) + self.b_inp_emb_in.dimshuffle(0,'x')

    q_hist_shuffled = q_hist.dimshuffle(1,0)

    if self.batch_norm:
        logging.info("Using batch normalization.")
    q_net = layers.InputLayer(shape=(self.batch_size, self.dim), input_var=q_hist_shuffled)
    if self.batch_norm:
        q_net = layers.BatchNormLayer(incoming=q_net)
    # Dropout is only inserted when the model is built in training mode.
    if self.dropout > 0 and self.mode == 'train':
        q_net = layers.DropoutLayer(q_net, p=self.dropout)
    #last_mem = layers.get_output(q_net).dimshuffle((1, 0))
    self.q_q = layers.get_output(q_net).dimshuffle(1,0)

    print "==> building answer module"

    answer_inp_var_shuffled = self.answer_inp_var.dimshuffle(1,2,0)
    #self.W_mem_emb = nn_utils.normal_param(std = 0.1, shape = (self.dim, self.dim))
    self.W_inp_emb = nn_utils.normal_param(std = 0.1, shape = (self.dim, self.vocab_size + 1))

    def _dot2(x, W):
        # Scan step: left-multiply one time slice by the embedding matrix.
        return T.dot(W, x)

    answer_inp_var_shuffled_emb,_ = theano.scan(fn = _dot2, sequences = answer_inp_var_shuffled,
            non_sequences = self.W_inp_emb ) # seq x dim x batch

    # The question representation is prepended as the first step of the
    # sequence fed to the answer GRU.
    mem_ans = self.q_q
    mem_ans_dim = mem_ans.dimshuffle('x',0,1)
    answer_inp = T.concatenate([mem_ans_dim, answer_inp_var_shuffled_emb], axis = 0)

    # All-zero initial hidden state for the answer GRU.
    dummy = theano.shared(np.zeros((self.dim, self.batch_size), dtype=floatX))

    self.W_a = nn_utils.normal_param(std=0.1, shape=(self.vocab_size + 1, self.dim))

    # GRU parameters of the answer module: reset gate, update gate, candidate.
    self.W_ans_res_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim))
    self.W_ans_res_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim))
    self.b_ans_res = nn_utils.constant_param(value=0.0, shape=(self.dim,))

    self.W_ans_upd_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim))
    self.W_ans_upd_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim))
    self.b_ans_upd = nn_utils.constant_param(value=0.0, shape=(self.dim,))

    self.W_ans_hid_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim))
    self.W_ans_hid_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim))
    self.b_ans_hid = nn_utils.constant_param(value=0.0, shape=(self.dim,))

    logging.info('answer_inp size')

    #last_mem = printing.Print('prob_sm')(last_mem)
    results, _ = theano.scan(fn = self.answer_gru_step,
            sequences = answer_inp,
            outputs_info = [ dummy ])

    # Project every hidden state onto the vocabulary with W_a.
    prob,_ = theano.scan(fn = lambda x, w: T.dot(w, x), sequences = results, non_sequences = self.W_a )
    # preds drops only the first step; prob (training) also drops the last.
    preds = prob[1:,:,:]
    prob = prob[1:-1,:,:]

    prob_shuffled = prob.dimshuffle(2,0,1) # b * len * vocab
    preds_shuffled = preds.dimshuffle(2,0,1)

    logging.info("prob shape.")
    #print prob.shape.eval({self.input_var: np.random.rand(10,4,4096).astype('float32'),
    #    self.q_var: np.random.rand(10, 4096).astype('float32'),
    #    self.answer_inp_var: np.random.rand(10, 18, 8001).astype('float32')})

    # Collapse the first two axes so the softmax runs row-wise on a matrix.
    n = prob_shuffled.shape[0] * prob_shuffled.shape[1]
    n_preds = preds_shuffled.shape[0] * preds_shuffled.shape[1]

    prob_rhp = T.reshape(prob_shuffled, (n, prob_shuffled.shape[2]))
    preds_rhp = T.reshape(preds_shuffled, (n_preds, preds_shuffled.shape[2]))

    prob_sm = nn_utils.softmax_(prob_rhp)
    preds_sm = nn_utils.softmax_(preds_rhp)
    self.prediction = prob_sm # this one is for the training.

    # This one is for the beamsearch.
    self.pred = T.reshape(preds_sm, (preds_shuffled.shape[0], preds_shuffled.shape[1], preds_shuffled.shape[2]))

    mask = T.reshape(self.answer_mask, (n,))
    lbl = T.reshape(self.answer_var, (n,))

    self.params = [self.W_a,self.W_ans_res_in, self.W_ans_res_hid, self.b_ans_res,
                   self.W_ans_upd_in, self.W_ans_upd_hid, self.b_ans_upd,
                   self.W_ans_hid_in, self.W_ans_hid_hid, self.b_ans_hid,
                   self.W_inp_emb_in, self.b_inp_emb_in, self.W_inp_emb]

    print "==> building loss layer and computing updates"
    # Cross-entropy averaged over the positions selected by the mask.
    loss_vec = T.nnet.categorical_crossentropy(prob_sm, lbl)
    self.loss_ce = (mask * loss_vec ).sum() / mask.sum()

    #self.loss_ce = T.nnet.categorical_crossentropy(results_rhp, lbl)

    if self.l2 > 0:
        self.loss_l2 = self.l2 * nn_utils.l2_reg(self.params)
    else:
        self.loss_l2 = 0

    self.loss = self.loss_ce + self.loss_l2

    # NOTE(review): ``grad`` is not used below -- the updates are built by
    # lasagne directly from ``self.loss``.
    grad = T.grad(self.loss, self.params)
    #scaled_grad = lasagne.updates.norm_constraint(grad, max_norm = 1e4)

    updates = lasagne.updates.adadelta(self.loss, self.params, learning_rate = 0.01)
    #updates = lasagne.updates.momentum(self.loss, self.params, learning_rate=0.001)

    if self.mode == 'train':
        print "==> compiling train_fn"
        self.train_fn = theano.function(inputs=[self.q_var, self.answer_var, self.answer_mask, self.answer_inp_var],
                                        outputs=[self.prediction, self.loss],
                                        updates=updates)

    print "==> compiling test_fn"
    self.test_fn = theano.function(inputs=[self.q_var, self.answer_var, self.answer_mask, self.answer_inp_var],
                                   outputs=[self.prediction, self.loss])

    print "==> compiling pred_fn"
    self.pred_fn= theano.function(inputs=[self.q_var, self.answer_inp_var],
                                  outputs=[self.pred])
def conv2d(
    input,
    filters,
    image_shape=None,
    filter_shape=None,
    border_mode="valid",
    subsample=(1, 1),
    **kargs,
):
    """
    Convolve 2D image(s) with 2D filter(s).

    Both ``input`` and ``filters`` may be a single 2D array or a 3D stack.
    They are reshaped to 4D for ``ConvOp`` and the singleton axes introduced
    for 2D arguments are flattened away from the result.

    Parameters
    ----------
    input : Symbolic theano tensor for images to be filtered.
        Dimensions: ([num_images], image height, image width)
    filters : Symbolic theano tensor for convolution filter(s).
        Dimensions: ([num_filters], filter height, filter width)
    border_mode: {'valid', 'full'}
        See scipy.signal.convolve2d.
    subsample
        Factor by which to subsample output.
    image_shape : tuple of length 2 or 3
        ([num_images,] image height, image width).
    filter_shape : tuple of length 2 or 3
        ([num_filters,] filter height, filter width).
    kargs
        Forwarded to ``conv.ConvOp``. See theano.tensor.nnet.conv.conv2d.

    Returns
    -------
    symbolic 2D,3D or 4D tensor
        Tensor of filtered images, with shape
        ([number images,] [number filters,] image height, image width).

    """
    assert input.ndim in (2, 3)
    assert filters.ndim in (2, 3)

    input_is_stack = input.ndim == 3
    filters_are_stack = filters.ndim == 3

    # Static shape hints are only passed on when both are supplied.
    if filter_shape and image_shape:
        bsize = image_shape[0] if input_is_stack else 1
        imshp = (1, ) + tuple(image_shape[-2:])
        nkern = filter_shape[0] if filters_are_stack else 1
        kshp = filter_shape[-2:]
    else:
        bsize = imshp = nkern = kshp = None

    # Symbolic leading sizes for the 4D views expected by ConvOp.
    sym_bsize = input.shape[0] if input_is_stack else 1
    sym_nkern = filters.shape[0] if filters_are_stack else 1

    input4D = tensor.reshape(
        input,
        tensor.join(0, tensor.stack([sym_bsize, 1]), input.shape[-2:]),
        ndim=4,
    )
    filters4D = tensor.reshape(
        filters,
        tensor.join(0, tensor.stack([sym_nkern, 1]), filters.shape[-2:]),
        ndim=4,
    )

    convolve = conv.ConvOp(
        output_mode=border_mode,
        dx=subsample[0],
        dy=subsample[1],
        imshp=imshp,
        kshp=kshp,
        nkern=nkern,
        bsize=bsize,
        **kargs,
    )
    output = convolve(input4D, filters4D)

    # Drop the singleton axes that were added for 2D arguments.
    input_is_2d = input.ndim == 2
    filters_are_2d = filters.ndim == 2
    if input_is_2d and filters_are_2d:
        if theano.config.warn__signal_conv2d_interface:
            warnings.warn(
                "theano.tensor.signal.conv2d() now outputs a 2d tensor when both"
                " inputs are 2d. To disable this warning, set the Theano flag"
                " warn__signal_conv2d_interface to False",
                stacklevel=3,
            )
        output = tensor.flatten(output.T, ndim=2).T
    elif input_is_2d or filters_are_2d:
        output = tensor.flatten(output.T, ndim=3).T

    return output
import numpy as np
import matplotlib.pyplot as plt
import Updates
import qqplot
import time

theano.config.floatX = 'float32'

# 3 elements:
#   - Discriminator on true point
#   - Discriminator on generated point (these share the same parameters)
#   - Generator

poolSize = 5


def maxout(vector):
    """Maxout over groups of ``poolSize`` consecutive columns of a matrix."""
    # Floor division keeps the target shape integral on Python 2 and 3.
    pooled_shape = (vector.shape[0], vector.shape[1] // poolSize, poolSize)
    return T.max(T.reshape(vector, pooled_shape), axis=2)


def relu(vector):
    """Element-wise rectifier."""
    return T.maximum(0.0, vector)


activation = maxout

# True distribution: an equal mixture of a gamma and a normal component.
n = 20000
# ``size`` must be an integer; ``n / 2`` is a float under Python 3.
td1 = np.random.gamma(1.0, 2.0, n // 2)
td2 = np.random.normal(-3.0, 2.0, n // 2)
true_dist = td1.tolist() + td2.tolist()
random.shuffle(true_dist)
true_dist = np.asarray(true_dist)
#true_dist = np.random.binomial(1, 0.5, n)

mean = true_dist.mean()
stdv = np.sqrt(true_dist.var())
def inner_fn(t, stm1, postm1, vtm1,
             r_Wq_hst_ot, r_Wq_hst_oht, r_Wq_hst_oat, r_Wq_hst_stm1, r_bq_hst,
             r_Wq_hst2_hst, r_bq_hst2,
             r_Wq_stmu_hst2, r_bq_stmu,
             r_Wq_stsig_hst2, r_bq_stsig,
             r_Wl_stmu_stm1, r_bl_stmu,
             r_Wl_stsig_stm1, r_bl_stsig,
             r_Wl_ost_st, r_bl_ost,
             r_Wl_ost2_ost, r_bl_ost2,
             r_Wl_ost3_ost2, r_bl_ost3,
             r_Wl_otmu_st, r_bl_otmu,
             r_Wl_otsig_st, r_bl_otsig,
             r_Wl_ohtmu_st, r_bl_ohtmu,
             r_Wl_ohtsig_st, r_bl_ohtsig,
             r_Wl_oatmu_st, r_bl_oatmu,
             r_Wl_oatsig_st, r_bl_oatsig,
             r_Wa_aht_st, r_ba_aht,
             r_Wa_atmu_aht, r_ba_atmu,
             r_Wa_atsig_aht, r_ba_atsig):
    """Single time step: act, move the environment, observe, infer, score.

    Arguments are the step index ``t``, the previous hidden state ``stm1``,
    previous position ``postm1`` and velocity ``vtm1``, followed by the
    weight/bias tensors of the variational (``q``), likelihood/prior (``l``)
    and action (``a``) networks.  Sizes and noise floors (``n_perturbations``,
    ``n_proc``, ``n_s``, ``n_o``, ``n_oh``, ``n_oa``, ``sig_min_*``) and
    ``theano_rng`` come from the enclosing scope.

    Returns the tuple
    ``(st, post, vt, oat, ot, oht, FEt, KL_st, hst, hst2, stmu, stsig,
    force, p_ot, p_oht, p_oat)``.
    """

    def project(weights, signal, width):
        # Shared building block of every layer below: view `signal` as
        # (n_perturbations, width, n_proc) and contract it with `weights`.
        shaped = T.reshape(signal, (n_perturbations, width, n_proc))
        return T.batched_tensordot(weights, shaped, axes=[[2], [1]])

    # ---- Action network: previous hidden state -> action distribution ----
    aht = project(r_Wa_aht_st, stm1, n_s) + r_ba_aht
    at_mu = project(r_Wa_atmu_aht, aht, n_s) + r_ba_atmu
    at_sig = T.nnet.softplus(
        project(r_Wa_atsig_aht, aht, n_s) + r_ba_atsig) + sig_min_action

    # Draw the action (first random draw of the step).
    action_noise = theano_rng.normal((n_perturbations, n_oa, n_proc))
    at = at_mu + action_noise * at_sig

    # ---- Environment update ----
    action_force = T.tanh(at)
    force_left = -2*postm1 - 1
    force_right = (-T.pow(1+5*T.sqr(postm1), -0.5)
                   - T.sqr(postm1)*T.pow(1 + 5*T.sqr(postm1), -1.5)
                   - T.pow(postm1, 4)/16.0)
    force = T.switch(T.lt(postm1, 0.0), force_left, force_right) - 0.25*vtm1
    vt = vtm1 + 0.05*force + 0.03*action_force
    post = postm1 + vt

    # ---- Sensory channels ----
    # 1.) observation of the action just taken
    oat = at
    # 2.) noisy observation of the current position (second random draw)
    position_noise = theano_rng.normal((n_perturbations, n_o, n_proc))
    ot = post + position_noise*0.01
    # 3.) nonlinearly transformed channel
    oht = T.exp(-T.sqr(post-1.0)/2.0/0.3/0.3)

    # ---- Variational density over the hidden state ----
    hst_pre = (project(r_Wq_hst_stm1, stm1, n_s)
               + project(r_Wq_hst_ot, ot, n_o)
               + project(r_Wq_hst_oht, oht, n_oh)
               + project(r_Wq_hst_oat, oat, n_oa)
               + r_bq_hst)
    hst = T.nnet.relu(hst_pre)
    hst2 = T.nnet.relu(project(r_Wq_hst2_hst, hst, n_s) + r_bq_hst2)

    stmu = T.tanh(project(r_Wq_stmu_hst2, hst2, n_s) + r_bq_stmu)
    stsig = T.nnet.softplus(
        project(r_Wq_stsig_hst2, hst2, n_s) + r_bq_stsig) + sig_min_states

    # Component 0 of the state is pinned to the (rescaled) observed position
    # with a fixed small standard deviation.
    state_shape = (n_perturbations, n_s, n_proc)
    stmu = T.set_subtensor(stmu[:, 0, :], 0.1*ot[:, 0, :]).reshape(state_shape)
    stsig = T.set_subtensor(stsig[:, 0, :], 0.005).reshape(state_shape)

    # Sample the new hidden state (third random draw).
    state_noise = theano_rng.normal((n_perturbations, n_s, n_proc))
    st = stmu + state_noise*stsig

    # ---- Likelihood network: sampled state -> observation distributions ----
    ost = T.nnet.relu(project(r_Wl_ost_st, st, n_s) + r_bl_ost)
    ost2 = T.nnet.relu(project(r_Wl_ost2_ost, ost, n_s) + r_bl_ost2)
    ost3 = T.nnet.relu(project(r_Wl_ost3_ost2, ost2, n_s) + r_bl_ost3)

    otmu = project(r_Wl_otmu_st, ost3, n_s) + r_bl_otmu
    otsig = T.nnet.softplus(
        project(r_Wl_otsig_st, ost3, n_s) + r_bl_otsig) + sig_min_obs

    ohtmu = project(r_Wl_ohtmu_st, ost3, n_s) + r_bl_ohtmu
    ohtsig = T.nnet.softplus(
        project(r_Wl_ohtsig_st, ost3, n_s) + r_bl_ohtsig) + sig_min_obs

    oatmu = project(r_Wl_oatmu_st, ost3, n_s) + r_bl_oatmu
    oatsig = T.nnet.softplus(
        project(r_Wl_oatsig_st, ost3, n_s) + r_bl_oatsig) + sig_min_obs

    # Negative log-likelihood of each observation channel.
    p_ot = GaussianNLL(ot, otmu, otsig)
    p_oht = GaussianNLL(oht, ohtmu, ohtsig)
    p_oat = GaussianNLL(oat, oatmu, oatsig)

    # ---- Prior over the hidden state, predicted from the previous state ----
    prior_stmu = T.tanh(project(r_Wl_stmu_stm1, stm1, n_s) + r_bl_stmu)
    prior_stsig = T.nnet.softplus(
        project(r_Wl_stsig_stm1, stm1, n_s) + r_bl_stsig) + sig_min_states

    # From step 20 onwards the prior on component 0 is overwritten with a
    # fixed expectation (mean 0.1, standard deviation 0.005).
    prior_stmu = ifelse(T.lt(t, 20), prior_stmu,
                        T.set_subtensor(prior_stmu[:, 0, :], 0.1))
    prior_stsig = ifelse(T.lt(t, 20), prior_stsig,
                         T.set_subtensor(prior_stsig[:, 0, :], 0.005))

    # KL term between the variational density and the prior.
    KL_st = KLGaussianGaussian(stmu, stsig, prior_stmu, prior_stsig)

    # Free energy of this step: KL term plus the three observation NLLs.
    FEt = KL_st + p_ot + p_oht + p_oat

    return (st, post, vt, oat, ot, oht, FEt, KL_st, hst, hst2, stmu, stsig,
            force, p_ot, p_oht, p_oat)
def create_snn(self, layers='None'):
    """Build the network graph and compile ``self.train`` / ``self.test``.

    ``layers`` is the layer specification handed to ``self.create_net``; the
    string ``'None'`` (the default) means "use ``self.layers``".

    The network is unrolled with ``theano.scan`` over ``self.DoG_maps``.  Side
    effects: sets ``self.all_layers``, ``self.train`` and ``self.test`` and
    prints several debugging traces while the graph is being built.
    """
    print 'Building snn...'
    if (layers == 'None'):
        layers = self.layers
    # Input layer of the graph; it is fed one slice of the DoG maps.
    input_layer = InputLayer(
        shape=self.input_shape,
        input_var=T.reshape(self.DoG_maps[0], self.input_shape))
    all_layers, _ = self.create_net(layers, input_layer)
    self.all_layers = all_layers
    # Symbolic learning rate, passed to scan as a non-sequence.
    LR = T.scalar()

    def fn(*args):
        """Scan step: wire the recurrent inputs into the layers, run the net.

        Layout of ``args``:
          args[0]  - input slice of the DoG maps
          args[1]  - previous output spike train
          args[2:] - per layer: ``v_in``, followed by ``H_in`` only for
                     layers with ``snn_enabled`` set
        (the non-sequence ``LR`` is also passed by scan but is read from the
        enclosing scope instead).
        """
        args = list(args)
        print(args)
        print(len(args))
        print('args')
        print(args)
        # Walk the recurrent arguments: two per snn-enabled layer, else one.
        i = 2
        for layer in (all_layers[1:]):
            if (layer.snn_enabled):
                layer.v_in = args[i]
                layer.H_in = args[i + 1]
                i += 2
            else:
                layer.v_in = args[i]
                i += 1
        all_layers[0].input_var = args[0]
        # This call is what actually builds the graph for the step.
        output_spike_train = lasagne.layers.get_output(
            all_layers[-1])
        vH_out_list = []
        # (shared variable, new value) pairs returned to scan as updates.
        W_dict = []
        for layer in all_layers[1:]:
            vH_out_list.append(layer.v_out)
            if (layer.snn_enabled):
                # do_stdp() must run before layer.update is read below.
                layer.do_stdp()
                vH_out_list.append(layer.H_out)
                W_dict.append((layer.W, layer.W + LR * layer.update))
        print('fn returning : ')
        return [output_spike_train] + vH_out_list, W_dict

    def set_outputs_info():
        """Zero initial values for scan, in the same order ``fn`` returns."""
        output = []
        initial_spike_train = T.zeros(
            (self.batch_size, self.all_layers[-1].num_units))
        print(T.shape(initial_spike_train))
        vH_list = []
        for layer in all_layers[1:]:
            # get_output_shape()[0] is used for v, [1] for H (snn layers only).
            vH_list.append(T.zeros(layer.get_output_shape()[0]))
            if (layer.snn_enabled):
                vH_list.append(T.zeros(layer.get_output_shape()[1]))
        output = [initial_spike_train] + vH_list
        print(output)
        print('set output info :')
        print(output)
        return output

    components, updates = theano.scan(fn,
                                      sequences=[self.DoG_maps],
                                      non_sequences=LR,
                                      outputs_info=set_outputs_info())
    # NOTE(review): `shape` is never used below.
    shape = T.shape(components[0])
    # Collapse the scan (time) axis: a unit counts as active if it spiked at
    # least once; sums >= 1 are clipped to 1.
    output = T.sum(components[0], axis=0)
    output = T.switch(T.ge(output, 1.0), 1.0, output)
    output = T.cast(output,
                    dtype=theano.config.floatX)  # author's note: 128x1024
    time_peaked = T.sum(components[0], axis=2)  # author's note: 32x128
    # Step with the largest summed activity, mapped to (32 - step) / 32.
    # NOTE(review): 32 is hard-coded; presumably the number of scan steps.
    real_valued = T.argmax(time_peaked, axis=0)
    real_valued = (32 - real_valued) / 32.0
    # Total spike count: zero when nothing spiked, which zeroes the factor.
    factor = T.sum(time_peaked, axis=0)
    factor = factor * real_valued  # author's note: 128,
    # Make the factor a broadcastable column so it scales every unit.
    factor = T.reshape(factor, [T.shape(factor)[0], 1])
    factor = T.addbroadcast(factor, 1)
    output = output * factor
    # Mean absolute weight change, averaged over all scan updates.
    delta_weight = T.zeros((1))
    print('*********')
    print(delta_weight)
    for key, value in updates.iteritems():
        delta_weight += T.mean(abs(value - key))
    delta_weight /= len(updates.keys())
    # train applies the scan updates; test only evaluates the output.
    self.train = theano.function(inputs=[self.input, LR],
                                 outputs=[components[0], delta_weight],
                                 updates=updates,
                                 on_unused_input='ignore')
    self.test = theano.function(inputs=[self.input, LR], outputs=output)
    print('compiled')
def generate(self, state):
    """Run ``self.generator`` on ``state`` and return the result reshaped
    to ``(n_sam, batch_size, 28, 28)``."""
    flat_images = self.generator(state)
    image_shape = [self.n_sam, self.batch_size, 28, 28]
    return T.reshape(flat_images, image_shape)
# Restore the audio sampler stored next to the model.
sampler = sampling.AudioFileSampler.load(path + "/sampler.p")


def make_random(batch_size):
    """Return a (batch_size, generator.gen_dim) array of standard-normal
    noise to feed the generator."""
    return numpy.random.randn(
        batch_size, generator.gen_dim
    )


# ----------------------------- network -----------------------------
# z: noise input, one row per sample.
z = T.dmatrix('z')
batch_size = z.shape[0]
# The generator is fed z with a singleton middle axis inserted.
x_gen = generator(T.reshape(z, [-1, 1, generator.gen_dim]))
# x_in: batch of real examples.
x_in = T.dtensor3('x_in')
# Generator cost: negated sampling.energyDstTheano2 between generated and
# real batches, both flattened to (batch_size, 1, -1).
# NOTE(review): presumably an energy distance -- confirm in `sampling`.
cost_gen = -sampling.energyDstTheano2(T.reshape(x_gen, [batch_size, 1, -1]),
                                      T.reshape(x_in, [batch_size, 1, -1]))
# ----------------------------- descent -----------------------------
param_gen = generator.getParameters()
grad_gen = generator.getGradients(cost_gen, 1.0)
descent = SimpleDescent.Grad(param_gen, grad_gen)
# Training step taking (x_in, z) and yielding (x_gen, cost_gen).
# NOTE(review): 0.01 is presumably the step size -- confirm in SimpleDescent.
train = descent.step([x_in, z], [x_gen, cost_gen], 0.01)
# ----------------------------- training ----------------------------
def __init__(self,
             input,
             n_in,
             n_hidden,
             n_out,
             activation=T.tanh,
             output_type='real'):
    """Build a simple recurrent network whose parameters live in one vector.

    :param input: symbolic input sequence; the first axis is time and
        ``input.shape[1]`` is used as the number of sequences in the batch
    :param n_in: size of each input vector
    :param n_hidden: number of hidden units
    :param n_out: size of each output vector
    :param activation: hidden-layer nonlinearity (default ``T.tanh``)
    :param output_type: ``'real'``, ``'binary'`` or ``'softmax'``; selects
        the output transformation and the loss
    :raises NotImplementedError: for any other ``output_type``
    """
    self.input = input
    self.activation = activation
    self.output_type = output_type

    self.batch_size = T.iscalar()

    floatX = theano.config.floatX

    # theta is a vector of all trainable parameters; it holds, in order,
    # W, W_in, W_out, h0, bh, by.
    theta_shape = (n_hidden ** 2 + n_in * n_hidden + n_hidden * n_out +
                   n_hidden + n_hidden + n_out)
    self.theta = theano.shared(value=np.zeros(theta_shape, dtype=floatX))

    def small_uniform(shape):
        # Initial weights drawn uniformly from [-0.01, 0.01).
        return np.asarray(np.random.uniform(size=shape,
                                            low=-0.01,
                                            high=0.01),
                          dtype=floatX)

    # Parameters are reshaped views (slices) of theta.
    param_idx = 0  # pointer to somewhere along the parameter vector

    # recurrent weights
    self.W = self.theta[param_idx:(param_idx + n_hidden ** 2)].reshape(
        (n_hidden, n_hidden))
    self.W.name = 'W'
    W_init = small_uniform((n_hidden, n_hidden))
    param_idx += n_hidden ** 2

    # input to hidden layer weights
    self.W_in = self.theta[param_idx:(param_idx + n_in * n_hidden)].reshape(
        (n_in, n_hidden))
    self.W_in.name = 'W_in'
    W_in_init = small_uniform((n_in, n_hidden))
    param_idx += n_in * n_hidden

    # hidden to output layer weights
    self.W_out = self.theta[param_idx:(param_idx +
                                       n_hidden * n_out)].reshape(
                                           (n_hidden, n_out))
    self.W_out.name = 'W_out'
    W_out_init = small_uniform((n_hidden, n_out))
    param_idx += n_hidden * n_out

    # initial hidden state
    self.h0 = self.theta[param_idx:(param_idx + n_hidden)]
    self.h0.name = 'h0'
    h0_init = np.zeros((n_hidden, ), dtype=floatX)
    param_idx += n_hidden

    # hidden bias
    self.bh = self.theta[param_idx:(param_idx + n_hidden)]
    self.bh.name = 'bh'
    bh_init = np.zeros((n_hidden, ), dtype=floatX)
    param_idx += n_hidden

    # output bias
    self.by = self.theta[param_idx:(param_idx + n_out)]
    self.by.name = 'by'
    by_init = np.zeros((n_out, ), dtype=floatX)
    param_idx += n_out

    # Every entry of theta must have been handed out exactly once.
    assert (param_idx == theta_shape)

    # for convenience
    self.params = [
        self.W, self.W_in, self.W_out, self.h0, self.bh, self.by
    ]

    # shortcut to norms (for monitoring)
    self.l2_norms = {}
    for param in self.params:
        self.l2_norms[param] = T.sqrt(T.sum(param ** 2))

    # Initialize parameters.  DEBUG_MODE gives a division-by-zero error when
    # the parameters are left as zeros.
    self.theta.set_value(
        np.concatenate([
            x.ravel() for x in (W_init, W_in_init, W_out_init, h0_init,
                                bh_init, by_init)
        ]))

    self.theta_update = theano.shared(
        value=np.zeros(theta_shape, dtype=floatX))

    # recurrent function and linear output activation function
    def step(x_t, h_tm1):
        h_t = self.activation(
            T.dot(x_t, self.W_in) + T.dot(h_tm1, self.W) + self.bh)
        y_t = T.dot(h_t, self.W_out) + self.by
        return h_t, y_t

    # The hidden state `h` and the output `y` for the entire sequence (the
    # first dimension is always time).  h0 is shared across variable-size
    # batches by broadcasting it with T.alloc to (n_sequences, n_hidden).
    [self.h, self.y_pred], _ = theano.scan(
        step,
        sequences=self.input,
        outputs_info=[
            T.alloc(self.h0, self.input.shape[1], n_hidden), None
        ])

    # L1 norm; one regularization option is to enforce the L1 norm to be
    # small.  The absolute value is taken element-wise BEFORE summing:
    # abs(W.sum()) would let positive and negative weights cancel.
    self.L1 = (abs(self.W).sum() + abs(self.W_in).sum() +
               abs(self.W_out).sum())

    # square of L2 norm; one regularization option is to enforce the
    # square of the L2 norm to be small
    self.L2_sqr = ((self.W ** 2).sum() + (self.W_in ** 2).sum() +
                   (self.W_out ** 2).sum())

    if self.output_type == 'real':
        self.loss = lambda y: self.mse(y)
    elif self.output_type == 'binary':
        # push through sigmoid, then round to {0, 1}
        self.p_y_given_x = T.nnet.sigmoid(self.y_pred)
        self.y_out = T.round(self.p_y_given_x)
        self.loss = lambda y: self.nll_binary(y)
    elif self.output_type == 'softmax':
        # T.nnet.softmax will not operate on T.tensor3 types, only on
        # matrices.  Reshape the n_steps x n_seq x n_classes output into a
        # (n_steps * n_seq) x n_classes matrix, apply softmax, reshape back.
        y_p = self.y_pred
        y_p_m = T.reshape(y_p, (y_p.shape[0] * y_p.shape[1], -1))
        y_p_s = T.nnet.softmax(y_p_m)
        self.p_y_given_x = T.reshape(y_p_s, y_p.shape)

        # prediction is the class whose probability is maximal
        self.y_out = T.argmax(self.p_y_given_x, axis=-1)
        self.loss = lambda y: self.nll_multiclass(y)
    else:
        raise NotImplementedError
def build(
        self,
        initial_stepsize,
        n_steps,
        target_acceptance_rate=.65,
        stepsize_dec=0.98,
        stepsize_min=0.0001,
        stepsize_max=0.5,
        stepsize_inc=1.02,
        # used in geometric avg. 1.0 would be not moving at all
        avg_acceptance_slowness=0.9,
        seed=12345,
        init_state=None):
    """Set up the HMC sampler state and compile its Theano functions.

    Creates ``self.h`` (current states), ``self.generated``,
    ``self.eval_lld`` (evaluates the negated energy reshaped to
    ``(n_sam, batch_size)``) and ``self.step`` (one ``n_steps``-long HMC
    simulation that returns the acceptance indicator and applies the
    sampler updates).  ``init_state``, when given, is used as the starting
    states instead of a standard-normal draw.
    """
    n_chains = self.n_sam * self.batch_size

    if init_state is not None:
        start_states = init_state
        print('load init_state')
    else:
        start_states = np.random.normal(
            0, 1, size=[n_chains, self.hdim]).astype(np.float32)

    # Drawn but not used afterwards; kept so the numpy random stream
    # advances exactly as before.
    init_m = np.random.randn(n_chains, self.hdim).astype(np.float32)

    # h denotes the current states of the chains
    self.h = sharedX(start_states)

    t = T.scalar()

    def clamp_inputs():
        # Fresh substitution dict: observations and the scalar t.
        return {self.obs: self.obs_val, self.t: t}

    self.generated = self.generate(self.h)
    neg_energy = -self.energy_fn(self.h)
    lld = T.reshape(neg_energy, [self.n_sam, self.batch_size])
    self.eval_lld = theano.function([t], lld, givens=clamp_inputs())

    # shared variables adapted across simulate calls
    stepsize = sharedX(initial_stepsize)
    avg_acceptance_rate = sharedX(target_acceptance_rate)
    s_rng = TT.shared_randomstreams.RandomStreams(seed)

    # graph for one `n_steps` HMC simulation
    accept, final_pos = hmc_move(s_rng, self.h, self.energy_fn, stepsize,
                                 n_steps)

    # updates applied on every call of self.step
    adaptation = dict(stepsize_min=stepsize_min,
                      stepsize_max=stepsize_max,
                      stepsize_inc=stepsize_inc,
                      stepsize_dec=stepsize_dec,
                      target_acceptance_rate=target_acceptance_rate,
                      avg_acceptance_slowness=avg_acceptance_slowness)
    simulate_updates = hmc_updates(self.h,
                                   stepsize,
                                   avg_acceptance_rate,
                                   final_pos=final_pos,
                                   accept=accept,
                                   **adaptation)

    self.step = theano.function([t], [accept],
                                updates=simulate_updates,
                                givens=clamp_inputs())
print("Error: unrecognized content layer: {}".format( args.content_layer)) sys.exit(1) content_loss = T.sum(T.sqr(cl_X - cl_Xtr)) / T.cast(cl_X.size, floatX) # Build the style loss. style_loss = 0. X.set_value(style_image) for layer_name in args.style_layers: try: sl_X = perceptual_net_X.get_layer(layer_name).output sl_Xtr = perceptual_net_Xtr.get_layer(layer_name).output except AttributeError: print("Error: unrecognized style layer: {}".format(layer_name)) sys.exit(1) slf_X = T.reshape(sl_X, (sl_X.shape[0], sl_X.shape[1], -1)) gram_X = ( T.batched_tensordot(slf_X, slf_X.dimshuffle(0, 2, 1), axes=1) / T.cast(slf_X.size, floatX)) * T.cast(slf_X.shape[0], floatX) slf_Xtr = T.reshape(sl_Xtr, (sl_Xtr.shape[0], sl_Xtr.shape[1], -1)) gram_Xtr = ( T.batched_tensordot(slf_Xtr, slf_Xtr.dimshuffle(0, 2, 1), axes=1) / T.cast(slf_Xtr.size, floatX)) * T.cast(slf_Xtr.shape[0], floatX) get_gram_X = theano.function([], gram_X) style_gram = theano.shared(get_gram_X()[0, :, :]) style_loss = style_loss + T.sum( T.sqr(style_gram.dimshuffle("x", 0, 1) - gram_Xtr)) / T.cast( Xtr.shape[0], floatX) # Build the TV loss.
def get_output_for(self, input, **kwargs):
    """Insert a length-1 axis at position 1 and repeat it ``self._n`` times.

    An input of shape ``(d0, d1, ...)`` becomes ``(d0, self._n, d1, ...)``.
    """
    # Target shape: leading axis, the new singleton axis, then the rest.
    expanded_shape = [input.shape[0], 1]
    for axis in xrange(1, input.ndim):
        expanded_shape.append(input.shape[axis])
    # ndim is passed explicitly; see the T.reshape pydoc for details.
    expanded = T.reshape(input, expanded_shape, ndim=input.ndim + 1)
    return T.repeat(expanded, self._n, axis=1)
def fit(self,
        sentences,
        cc_matrix=None,
        learning_rate=10e-5,
        reg=0.1,
        xmax=100,
        alpha=0.75,
        epochs=10,
        gd=False,
        use_theano=True):
    """Fit word vectors by weighted factorization of log co-occurrences.

    Minimizes ``sum(fX * (W.dot(U.T) + b + c + mu - log(X + 1)) ** 2)`` and
    stores the resulting ``W`` and ``U`` on ``self``; the cost per epoch is
    plotted at the end.

    :param sentences: iterable of sentences, each a sequence of word indices
    :param cc_matrix: optional path of a cached co-occurrence matrix.  If the
        file exists it is loaded; otherwise the matrix is built and saved
        there.  With ``None`` the matrix is built and not cached.
    :param learning_rate: step size for gradient descent
    :param reg: regularization strength
    :param xmax: co-occurrence count at which the weighting saturates at 1
    :param alpha: exponent of the weighting function
    :param epochs: number of training epochs
    :param gd: use gradient descent instead of alternating least squares
    :param use_theano: with ``gd``, run the updates through Theano
    """
    # Build the co-occurrence matrix.  The paper calls it X, so we call it
    # X too, instead of calling the training data X.
    # TODO: would it be better to use a sparse matrix?
    t0 = datetime.now()
    V = self.V
    D = self.D

    # cc_matrix defaults to None, which os.path.exists / np.save cannot
    # handle, so only touch the cache when a path was actually given.
    # NOTE: np.save appends ".npy" when the name lacks it, so the cache is
    # only found again if cc_matrix already ends in ".npy".
    if cc_matrix is not None and os.path.exists(cc_matrix):
        X = np.load(cc_matrix)
    else:
        X = np.zeros((V, V))
        N = len(sentences)
        print("Number of sentences to process:", N)
        for it, sentence in enumerate(sentences, 1):
            if it % 10000 == 0:
                print("processed", it, "/", N)
            n = len(sentence)
            for i in xrange(n):
                wi = sentence[i]
                start = max(0, i - self.context_sz)

                # Make sure the "start" and "end" tokens (indices 0 and 1)
                # are part of some context, otherwise their f(X) will be 0
                # (denominator in the bias update).
                if i - self.context_sz < 0:
                    points = 1.0 / (i + 1)
                    X[wi, 0] += points
                    X[0, wi] += points
                if i + self.context_sz > n:
                    points = 1.0 / (n - i)
                    X[wi, 1] += points
                    X[1, wi] += points

                # Only the left context is walked; both sides are covered
                # because every pair is added symmetrically.
                for j in xrange(start, i):
                    wj = sentence[j]
                    points = 1.0 / (i - j)  # i > j, so this is positive
                    X[wi, wj] += points
                    X[wj, wi] += points

        # save the cc matrix because it takes forever to create
        if cc_matrix is not None:
            np.save(cc_matrix, X)

    print("max in X:", X.max())

    # weighting
    fX = np.zeros((V, V))
    fX[X < xmax] = (X[X < xmax] / float(xmax))**alpha
    fX[X >= xmax] = 1

    print("max in f(X):", fX.max())

    # target
    logX = np.log(X + 1)

    print("max in log(X):", logX.max())

    print("time to build co-occurrence matrix:", (datetime.now() - t0))

    # initialize weights
    W = np.random.randn(V, D) / np.sqrt(V + D)
    b = np.zeros(V)
    U = np.random.randn(V, D) / np.sqrt(V + D)
    c = np.zeros(V)
    mu = logX.mean()

    if gd and use_theano:
        thW = theano.shared(W)
        thb = theano.shared(b)
        thU = theano.shared(U)
        thc = theano.shared(c)
        thLogX = T.matrix('logX')
        thfX = T.matrix('fX')

        params = [thW, thb, thU, thc]

        thDelta = thW.dot(thU.T) + T.reshape(thb, (V, 1)) + T.reshape(
            thc, (1, V)) + mu - thLogX
        thCost = (thfX * thDelta * thDelta).sum()

        grads = T.grad(thCost, params)

        updates = [(p, p - learning_rate * g)
                   for p, g in zip(params, grads)]

        train_op = theano.function(
            inputs=[thfX, thLogX],
            updates=updates,
        )

    costs = []
    for epoch in xrange(epochs):
        delta = W.dot(U.T) + b.reshape(V, 1) + c.reshape(1, V) + mu - logX
        cost = (fX * delta * delta).sum()
        costs.append(cost)
        print("epoch:", epoch, "cost:", cost)

        if gd:
            # gradient descent method
            if use_theano:
                train_op(fX, logX)
                W = thW.get_value()
                b = thb.get_value()
                U = thU.get_value()
                c = thc.get_value()
            else:
                # update W
                oldW = W.copy()
                for i in xrange(V):
                    W[i] -= learning_rate * (fX[i, :] * delta[i, :]).dot(U)
                W -= learning_rate * reg * W

                # update b
                for i in xrange(V):
                    b[i] -= learning_rate * fX[i, :].dot(delta[i, :])
                b -= learning_rate * reg * b

                # update U (uses the W from before this epoch's update)
                for j in xrange(V):
                    U[j] -= learning_rate * (fX[:, j] *
                                             delta[:, j]).dot(oldW)
                U -= learning_rate * reg * U

                # update c
                for j in xrange(V):
                    c[j] -= learning_rate * fX[:, j].dot(delta[:, j])
                c -= learning_rate * reg * c
        else:
            # ALS method: solve each block in closed form in turn.

            # update W
            for i in xrange(V):
                # Equals reg*I + sum_j fX[i,j] * outer(U[j], U[j]).
                matrix = reg * np.eye(D) + (fX[i, :] * U.T).dot(U)
                vector = (fX[i, :] * (logX[i, :] - b[i] - c - mu)).dot(U)
                W[i] = np.linalg.solve(matrix, vector)

            # update b
            for i in xrange(V):
                denominator = fX[i, :].sum()
                numerator = fX[i, :].dot(logX[i, :] - W[i].dot(U.T) - c -
                                         mu)
                b[i] = numerator / denominator / (1 + reg)

            # update U
            for j in xrange(V):
                matrix = reg * np.eye(D) + (fX[:, j] * W.T).dot(W)
                vector = (fX[:, j] * (logX[:, j] - b - c[j] - mu)).dot(W)
                U[j] = np.linalg.solve(matrix, vector)

            # update c
            for j in xrange(V):
                denominator = fX[:, j].sum()
                numerator = fX[:, j].dot(logX[:, j] - W.dot(U[j]) - b - mu)
                c[j] = numerator / denominator / (1 + reg)

    self.W = W
    self.U = U

    plt.plot(costs)
    plt.show()
def build_model(tparams, options):
    """Assemble the full symbolic graph of the model.

    Returns ``(trng, use_noise, [x, mask, y], alphas, cost, opt_outs,
    preds)``; ``opt_outs`` carries the selector outputs when
    ``options['selector']`` is set.
    """
    trng = RandomStreams(1234)
    use_noise = theano.shared(numpy.float32(0.))

    last_n = options['last_n']

    # video blocks: (n_timesteps, n_samples, n_annotations, ctxdim)
    x = tensor.tensor4('x', dtype='float32')
    mask = tensor.matrix('mask', dtype='float32')
    n_timesteps = x.shape[0]
    n_samples = x.shape[1]

    # action labels
    y = tensor.tensor3('y', dtype='int64')

    ctx = x

    def feed_forward(inputs, prefix, activ):
        # One 'ff' layer with the given parameter prefix and activation.
        return get_layer('ff')[1](tparams,
                                  inputs,
                                  options,
                                  prefix=prefix,
                                  activ=activ)

    def maybe_dropout(values):
        # Dropout is applied only when enabled in the options.
        if options['use_dropout']:
            return dropout_layer(values, use_noise, trng)
        return values

    # Initial state/cell come from the context averaged over the first axis
    # (-> n_samples x n_annotations x ctxdim) and then over annotations
    # (-> n_samples x ctxdim).
    ctx_mean = ctx.mean(0).mean(1)

    for lidx in xrange(1, options['n_layers_init']):
        ctx_mean = feed_forward(ctx_mean, 'ff_init_%d' % lidx, 'rectifier')
        ctx_mean = maybe_dropout(ctx_mean)

    init_state = feed_forward(ctx_mean, 'ff_state', 'tanh')
    init_memory = feed_forward(ctx_mean, 'ff_memory', 'tanh')

    # decoder
    proj = get_layer('lstm_cond')[1](tparams,
                                     ctx,
                                     options,
                                     prefix='decoder',
                                     mask=mask,
                                     init_state=init_state,
                                     init_memory=init_memory,
                                     trng=trng,
                                     use_noise=use_noise)

    # pick the decoder outputs that are used below
    proj_h, alphas, ctxs = proj[0], proj[2], proj[3]
    if options['selector']:
        sels = proj[4]

    proj_h = maybe_dropout(proj_h)

    # output stack
    logit = feed_forward(proj_h, 'ff_logit_lstm', 'linear')
    if options['ctx2out']:
        logit = logit + feed_forward(ctxs, 'ff_logit_ctx', 'linear')
    logit = maybe_dropout(tanh(logit))

    # extra hidden output layers (none when n_layers_out <= 1)
    for lidx in xrange(1, options['n_layers_out']):
        logit = feed_forward(logit, 'ff_logit_h%d' % lidx, 'rectifier')
        logit = maybe_dropout(logit)

    logit = feed_forward(logit, 'ff_logit', 'sigmoid')  # (TS, BS, o/p)

    # flatten time and batch: (TSxBS, o/p)
    probs = logit.reshape([logit.shape[0] * logit.shape[1], logit.shape[2]])
    targets = tensor.reshape(y, [y.shape[0] * y.shape[1], y.shape[2]])

    # element-wise binary cross-entropy, summed over outputs and masked
    positive_term = -targets * tensor.log(probs + 1e-8)
    negative_term = (1 - targets) * tensor.log(1 - probs + 1e-8)
    cost = (positive_term - negative_term).sum(1)  # (TSxBS,)
    cost = cost.reshape([n_timesteps, n_samples])  # (TS, BS)
    cost = (cost * mask).sum(0).sum(0)

    # predictions: mean probability over the last `last_n` time steps
    probs = probs.reshape([n_timesteps, n_samples, probs.shape[1]])
    preds = tensor.mean(probs[-last_n:, :, :], axis=0)  # (BS, o/p)

    opt_outs = dict()
    if options['selector']:
        opt_outs['selector'] = sels

    return trng, use_noise, [x, mask, y], alphas, cost, opt_outs, preds
def convolve(kerns,
             kshp,
             nkern,
             images,
             imgshp,
             step=(1, 1),
             bias=None,
             mode='valid',
             flatten=True):
    """Convolution implementation by sparse matrix multiplication.

    :note: For best speed, put the matrix which you expect to be
           smaller as the 'kernel' argument

    "images" is assumed to be a matrix of shape batch_size x img_size,
    where the second dimension represents each image in raster order

    If flatten is "False", the output feature map will have shape:

    .. code-block:: python

        batch_size x number of kernels x output_size

    If flatten is "True", the output feature map will have shape:

    .. code-block:: python

        batch_size x number of kernels * output_size

    .. note::

        IMPORTANT: note that this means that each feature map (image
        generated by each kernel) is contiguous in memory. The memory
        layout will therefore be: [ <feature_map_0> <feature_map_1>
        ... <feature_map_n>], where <feature_map> represents a
        "feature map" in raster order

    kerns is a 2D tensor of shape nkern x N.prod(kshp)

    :param kerns: 2D tensor containing kernels which are applied at every
                  pixel
    :param kshp: tuple containing actual dimensions of kernel (not symbolic)
    :param nkern: number of kernels/filters to apply.
                  nkern=1 will apply one common filter to all input pixels
    :param images: tensor containing images on which to apply convolution
    :param imgshp: image dimensions, either (height, width) or
                   (nfeatures, height, width); any sequence is accepted
    :param step: determines number of pixels between adjacent receptive
                 fields (tuple containing dx,dy values)
    :param bias: optional bias added to the output, one value per kernel
    :param mode: 'full', 'valid' see CSM.evaluate function for details
    :param flatten: flatten the last 2 dimensions of the output. By default,
                    instead of generating a batchsize x nkern x outsize
                    tensor, will flatten to batchsize x nkern*outsize
    :return: out1, symbolic result
    :return: out2, logical shape of the output img (nkern,height,width)

    :TODO: test for 1D and think of how to do n-d convolutions
    """
    N = numpy
    # start by computing output dimensions, size, etc
    kern_size = N.int64(N.prod(kshp))

    # imgshp contains either 2 entries (height,width) or 3 (nfeatures,h,w);
    # in the first case, default nfeatures to 1.  tuple() makes this work
    # for lists (which cannot be added to a tuple) and for arrays (where
    # `+` would broadcast instead of concatenating).
    if N.size(imgshp) == 2:
        imgshp = (1,) + tuple(imgshp)

    # construct indices and index pointers for sparse matrix, which,
    # when multiplied with input images will generate a stack of image
    # patches
    indices, indptr, spmat_shape, sptype, outshp = \
        convolution_indices.conv_eval(imgshp, kshp, step, mode)

    # build sparse matrix, then generate stack of image patches
    csc = theano.sparse.CSM(sptype)(N.ones(indices.size), indices, indptr,
                                    spmat_shape)
    patches = (sparse.structured_dot(csc, images.T)).T

    # compute output of linear classifier
    pshape = tensor.stack(images.shape[0] * tensor.as_tensor(N.prod(outshp)),
                          tensor.as_tensor(imgshp[0] * kern_size))
    patch_stack = tensor.reshape(patches, pshape, ndim=2)

    # kern is of shape: nkern x ksize*number_of_input_features
    # output is thus of shape: bsize*outshp x nkern
    output = tensor.dot(patch_stack, kerns.T)

    # add bias across each feature map (more efficient to do it now)
    if bias is not None:
        output += bias

    # now to have feature maps in raster order ...
    # go from bsize*outshp x nkern to bsize x nkern*outshp
    newshp = tensor.stack(images.shape[0],
                          tensor.as_tensor(N.prod(outshp)),
                          tensor.as_tensor(nkern))
    tensout = tensor.reshape(output, newshp, ndim=3)
    output = tensor.DimShuffle((False,) * tensout.ndim, (0, 2, 1))(tensout)
    if flatten:
        output = tensor.flatten(output, 2)

    return output, N.hstack((nkern, outshp))
def define_layers(self):
    """Create the stacked recurrent layers and the decoding layer.

    Fills ``self.layers`` / ``self.params``, creates the output projection
    (``self.W_hy``, ``self.b_y``) and the ``self.decoder`` cell, runs the
    decoder with ``theano.scan`` over ``self.mask`` and stores the resulting
    outputs in ``self.activation`` (the last hidden sequence is kept in
    ``self.hhhh``).
    """
    self.layers = []
    self.params = []

    for depth in xrange(self.num_hds):
        if depth == 0:
            layer_input = self.X
            h_shape = (self.out_size, self.hidden_size_list[0])
        else:
            layer_input = self.layers[depth - 1].activation
            h_shape = (self.hidden_size_list[depth - 1],
                       self.hidden_size_list[depth])

        # Both cell types take the same constructor arguments.
        ctor_args = (self.rng, self.prefix + self.layer_id + str(depth),
                     h_shape, layer_input, self.mask, self.is_train,
                     self.batch_size, self.drop_rate)
        if self.cell == "gru":
            hidden_layer = GRULayer(*ctor_args)
        elif self.cell == "lstm":
            hidden_layer = LSTMLayer(*ctor_args)

        self.layers.append(hidden_layer)
        self.params += hidden_layer.params

    # the last decoder layer for decoding
    if self.num_hds == 0:
        output_layer_input = self.X
        last_shape = (self.in_size, self.out_size)
    else:
        output_layer_input = self.layers[-1].activation
        last_shape = (self.in_size, self.layers[-1].out_size)

    # y_dim: width of one emitted output row; h_dim: width of the state.
    y_dim, h_dim = last_shape
    flat_y = self.batch_size * y_dim
    flat_h = self.batch_size * h_dim

    self.W_hy = init_weights((h_dim, y_dim),
                             self.prefix + "W_hy" + self.layer_id)
    self.b_y = init_bias(y_dim, self.prefix + "b_y" + self.layer_id)

    if self.cell == "gru":
        self.decoder = GRULayer(self.rng, self.prefix + self.layer_id,
                                last_shape, output_layer_input, self.mask,
                                self.is_train, self.batch_size,
                                self.drop_rate)

        def _active(m, pre_h, x):
            # scan passes: mask slice, previous state, previous output.
            prev_y = T.reshape(x, (self.batch_size, y_dim))
            prev_h = T.reshape(pre_h, (self.batch_size, h_dim))

            new_h = self.decoder._active(prev_y, prev_h)
            new_y = T.nnet.softmax(T.dot(new_h, self.W_hy) + self.b_y)
            new_y = new_y * m[:, None]

            # scan carries both as single-row matrices
            new_h = T.reshape(new_h, (1, flat_h))
            new_y = T.reshape(new_y, (1, flat_y))
            return new_h, new_y

        h_start = {'initial': output_layer_input, 'taps': [-1]}
        y_start = T.alloc(floatX(0.), 1, flat_y)
        [h, y], updates = theano.scan(_active,
                                      sequences=[self.mask],
                                      outputs_info=[h_start, y_start])
    elif self.cell == "lstm":
        self.decoder = LSTMLayer(self.rng, self.prefix + self.layer_id,
                                 last_shape, output_layer_input, self.mask,
                                 self.is_train, self.batch_size,
                                 self.drop_rate)

        def _active(m, pre_h, pre_c, x):
            # scan passes: mask slice, previous state, previous cell,
            # previous output.
            prev_y = T.reshape(x, (self.batch_size, y_dim))
            prev_h = T.reshape(pre_h, (self.batch_size, h_dim))
            prev_c = T.reshape(pre_c, (self.batch_size, h_dim))

            new_h, new_c = self.decoder._active(prev_y, prev_h, prev_c)
            new_y = T.nnet.softmax(T.dot(new_h, self.W_hy) + self.b_y)
            new_y = new_y * m[:, None]

            new_h = T.reshape(new_h, (1, flat_h))
            new_c = T.reshape(new_c, (1, flat_h))
            new_y = T.reshape(new_y, (1, flat_y))
            return new_h, new_c, new_y

        h_start = {'initial': output_layer_input, 'taps': [-1]}
        c_start = {'initial': output_layer_input, 'taps': [-1]}
        y_start = T.alloc(floatX(0.), 1, flat_y)
        [h, c, y], updates = theano.scan(
            _active,
            sequences=[self.mask],
            outputs_info=[h_start, c_start, y_start])

    # one row of flattened outputs per step
    y = T.reshape(y, (self.words, flat_y))
    self.activation = y

    self.params += self.decoder.params
    self.params += [self.W_hy, self.b_y]
    self.hhhh = h
def __init__(self, rng, input, filter_shape, image_shape, poolsize=(2, 2), k=4):
    """
    Build a convolution layer followed by k-max pooling and a tanh.

    :type rng: numpy.random.RandomState
    :param rng: random number generator used to draw the initial weights

    :type input: theano.tensor.dtensor4
    :param input: symbolic image tensor, of shape image_shape

    :type filter_shape: tuple or list of length 4
    :param filter_shape: (number of filters, num input feature maps,
                          filter height, filter width)

    :type image_shape: tuple or list of length 4
    :param image_shape: (batch size, num input feature maps,
                         image height, image width)

    :type poolsize: tuple or list of length 2
    :param poolsize: only used to scale fan_out for the weight
                     initialisation bound; it does not set the pooled size

    :type k: int
    :param k: number of largest values kept from every row of each
              convolved feature map
    """
    assert image_shape[1] == filter_shape[1]
    self.input = input

    # Uniform initialisation bound derived from fan-in / fan-out:
    #   fan-in  = input feature maps * filter height * filter width
    #   fan-out = output feature maps * filter height * filter width
    #             / pooling size
    n_in = numpy.prod(filter_shape[1:])
    n_out = (filter_shape[0] * numpy.prod(filter_shape[2:]) /
             numpy.prod(poolsize))
    limit = numpy.sqrt(6. / (n_in + n_out))
    initial_W = numpy.asarray(
        rng.uniform(low=-limit, high=limit, size=filter_shape),
        dtype=theano.config.floatX)
    self.W = theano.shared(initial_W, borrow=True)

    # One bias per output feature map, initialised to zero.
    initial_b = numpy.zeros((filter_shape[0],), dtype=theano.config.floatX)
    self.b = theano.shared(value=initial_b, borrow=True)

    conv_out = conv.conv2d(input=input, filters=self.W,
                           filter_shape=filter_shape,
                           image_shape=image_shape)

    # Turn every full-width row of every feature map into one row of a
    # 2D matrix, so the pooling below works row by row.
    rows = TSN.images2neibs(ten4=conv_out,
                            neib_shape=(1, conv_out.shape[3]),
                            mode='ignore_borders')

    # argsort is ascending, so the last k columns index the k largest
    # entries; sorting those indices restores their original left-to-right
    # order within the row.
    ranking = T.argsort(rows, axis=1)
    keep_cols = T.sort(ranking[:, -k:], axis=1)

    row_index = T.repeat(T.arange(rows.shape[0]), k)
    col_index = keep_cols.flatten()
    kept_values = rows[row_index, col_index]

    # Back to 4D: leading two dims of conv_out, its height, and k columns.
    pooled_shape = T.cast(
        T.join(0,
               conv_out.shape[:-2],
               T.as_tensor([conv_out.shape[2]]),
               T.as_tensor([k])),
        'int64')
    pooled_out = T.reshape(kept_values, pooled_shape, ndim=4)

    # (The plain downsample.max_pool_2d(input=conv_out, ds=poolsize,
    # ignore_border=True) alternative is intentionally not used here.)

    # The bias vector is reshaped to (1, n_filters, 1, 1) so that it
    # broadcasts over mini-batch, height and width.
    self.output = T.tanh(pooled_out + self.b.dimshuffle('x', 0, 'x', 'x'))

    # Trainable parameters of this layer.
    self.params = [self.W, self.b]
def __init__(self, env_spec, hidden_dim=32, feature_network=None, state_include_action=True, hidden_nonlinearity=NL.tanh):
    """
    Build a GRU-based categorical policy over a discrete action space.

    :param env_spec: A spec for the env; its action space must be Discrete.
    :param hidden_dim: dimension of hidden layer
    :param feature_network: optional network whose output layer provides the
        per-step features; when None the raw input is fed to the GRU
    :param state_include_action: if True, the input width is the flat
        observation size plus the flat action size; otherwise only the
        observation size
    :param hidden_nonlinearity: nonlinearity used for each hidden layer
    :return:
    """
    assert isinstance(env_spec.action_space, Discrete)
    # Must run before any constructor argument is rebound: it snapshots the
    # current locals.
    Serializable.quick_init(self, locals())
    super(CategoricalGRUPolicy, self).__init__(env_spec)

    obs_dim = env_spec.observation_space.flat_dim
    action_dim = env_spec.action_space.flat_dim
    input_dim = obs_dim + action_dim if state_include_action else obs_dim

    l_input = L.InputLayer(shape=(None, None, input_dim), name="input")

    if feature_network is not None:
        feature_dim = feature_network.output_layer.output_shape[-1]
        l_flat_feature = feature_network.output_layer

        def _unflatten(flat_feature, input):
            # Restore the two leading dims of `input` on the flat features.
            return TT.reshape(
                flat_feature,
                [input.shape[0], input.shape[1], feature_dim])

        def _unflatten_shape(_, input_shape):
            return (input_shape[0], input_shape[1], feature_dim)

        l_feature = OpLayer(l_flat_feature,
                            extras=[l_input],
                            name="reshape_feature",
                            op=_unflatten,
                            shape_op=_unflatten_shape)
    else:
        feature_dim = input_dim
        l_flat_feature = None
        l_feature = l_input

    prob_network = GRUNetwork(input_shape=(feature_dim, ),
                              input_layer=l_feature,
                              output_dim=env_spec.action_space.n,
                              hidden_dim=hidden_dim,
                              hidden_nonlinearity=hidden_nonlinearity,
                              output_nonlinearity=TT.nnet.softmax,
                              name="prob_network")

    self.prob_network = prob_network
    self.feature_network = feature_network
    self.l_input = l_input
    self.state_include_action = state_include_action

    # Single-step function: (flat input, previous hidden) ->
    # (step output, step hidden).
    flat_input_var = TT.matrix("flat_input")
    if feature_network is not None:
        feature_var = L.get_output(
            l_flat_feature, {feature_network.input_layer: flat_input_var})
    else:
        feature_var = flat_input_var

    step_inputs = [flat_input_var,
                   prob_network.step_prev_hidden_layer.input_var]
    step_outputs = L.get_output(
        [prob_network.step_output_layer, prob_network.step_hidden_layer],
        {prob_network.step_input_layer: feature_var})
    self.f_step_prob = ext.compile_function(step_inputs, step_outputs)

    self.input_dim = input_dim
    self.action_dim = action_dim
    self.hidden_dim = hidden_dim

    # Recurrent state carried between steps; starts unset.
    self.prev_action = None
    self.prev_hidden = None
    self.dist = RecurrentCategorical(env_spec.action_space.n)

    out_layers = [prob_network.output_layer]
    if feature_network is not None:
        out_layers.append(feature_network.output_layer)

    LasagnePowered.__init__(self, out_layers)
def __init__(self, C, D, K, S, rng_state=None, epsilon=1e-2, use_precision=True, tradeoff_hybrid=0.5, tradeoff_ssl=0.9, gamma=1, eta=10, init_params=None):
    '''
    Constructs the Theano computation graph for the given parameters.

    The graph takes a feature matrix `self.x` and an int32 label vector
    `self.t`. Entries of `t` that are >= 0 are treated as labeled samples,
    entries < 0 as unlabeled samples.

    C: Number of classes
    D: Number of input features
    K: List of length C containing the number of components per class
    S: Number of dimensions of the low-rank approximation of the DPLR matrix
      structure. Note that this parameter is actually called 'R' in the
      paper.
    rng_state: Random number generator seed to use if the parameters should
      be initialized randomly. This parameter is ignored if 'init_params' is
      given.
    epsilon: Regularizer for the diagonal of the covariance matrices. It is
      added to the softplus of the diagonal parameters.
    use_precision: Determines if precisions or covariances should be used.
      The precision is the inverse of the covariance matrix.
    tradeoff_hybrid: The lambda parameter of the hybrid objective. Close to
      1 means very generative, close to 0 means very discriminative.
    tradeoff_ssl: The kappa parameter of the hybrid objective for semi-
      supervised learning. Close to 1 puts more weight onto the labeled
      samples, close to 0 puts more weights onto the unlabeled samples.
    gamma: The gamma parameter of the MM/LM objective.
    eta: The parameter for the softmax approximation.
    init_params: Use this to provide initial parameters. Usually parameters
      obtained with the EM algorithm are provided here. init_params must be
      a five-tuple containing
       - mu_vals (K_all x D): Mean values for each component
       - s_vals (K_all x D x S): Low-rank matrices for each component
       - d_rho_vals (D x K_all): Diagonal variances for each component
           (inverse softplus values)
       - prior_k_rho_vals (K_all): Logits of the component priors
       - prior_c_rho_vals (C): Logits of the class priors
      with K_all = sum(K). The component parameters are stored linearly for
      all classes. E.g. the first K[0] entries correspond to components of
      class 0. If precisions are used instead of covariances, the use of
      s_vals and d_rho_vals changes accordingly.

    Besides the shared parameters (collected in `self.params`), the
    constructor exposes the intermediate graph nodes and the cost
    expressions (`cost_nll*`, `cost_cll*`, `cost_mm*`, `cost_hybrid_cll`,
    `cost_hybrid_mm`), the predictions `y` and the classification error
    `ce` as attributes.
    '''
    # Symbolic inputs: features and labels
    self.x = T.matrix('x')
    self.t = T.ivector('t')

    self.tradeoff_hybrid = tradeoff_hybrid
    self.tradeoff_ssl = tradeoff_ssl
    self.gamma = gamma
    self.eta = eta
    self.epsilon = epsilon

    # Total number of components over all classes
    K_all = np.sum(K)

    if init_params is None:
        # Random initialization; priors start as all-zero logits
        rng = np.random.RandomState(rng_state)
        mu_vals = rng.normal(0., 1., size=(K_all, D))
        s_vals = rng.normal(0., 1., size=(K_all, D, S))
        d_rho_vals = rng.normal(0, 0.1, size=(D, K_all))
        prior_k_rho_vals = np.zeros((np.sum(K), ))
        prior_c_rho_vals = np.zeros((C, ))
    else:
        # User-supplied initialization; shapes are checked against C, D, K, S
        assert len(init_params) == 5
        mu_vals, s_vals, d_rho_vals, prior_k_rho_vals, prior_c_rho_vals = init_params
        assert mu_vals.shape == (K_all, D)
        assert s_vals.shape == (K_all, D, S)
        assert d_rho_vals.shape == (D, K_all)
        assert prior_k_rho_vals.shape == (K_all, )
        assert prior_c_rho_vals.shape == (C, )

    # Cast everything to Theano's configured float type
    mu_vals = np.asarray(mu_vals, dtype=theano.config.floatX)
    s_vals = np.asarray(s_vals, dtype=theano.config.floatX)
    d_rho_vals = np.asarray(d_rho_vals, dtype=theano.config.floatX)
    prior_k_rho_vals = np.asarray(prior_k_rho_vals,
                                  dtype=theano.config.floatX)
    prior_c_rho_vals = np.asarray(prior_c_rho_vals,
                                  dtype=theano.config.floatX)

    # Shared variables
    self.means = theano.shared(mu_vals, name='means', borrow=True)
    self.s = theano.shared(s_vals, name='s', borrow=True)
    self.d_rho = theano.shared(d_rho_vals, name='d_rho', borrow=True)
    self.prior_k_rho = theano.shared(prior_k_rho_vals,
                                     name='prior_k_rho',
                                     borrow=True)
    self.prior_c_rho = theano.shared(prior_c_rho_vals,
                                     name='prior_c_rho',
                                     borrow=True)
    self.params = [
        self.means, self.s, self.d_rho, self.prior_k_rho, self.prior_c_rho
    ]

    # Diagonal entries: softplus keeps them positive, epsilon bounds them
    # away from zero
    self.d = T.nnet.softplus(self.d_rho) + self.epsilon

    if use_precision == True:
        # s and d are used to represent precision matrices
        # The quadratic form is accumulated term by term and scaled by -0.5
        # at the end; the trailing tags name the term being added.
        self.exponent = T.dot(self.x**2, self.d)  #xDx
        self.exponent -= 2 * T.dot(self.x, self.d * self.means.T)  #-2xDm
        self.exponent += T.sum(self.means**2 * self.d.T, axis=1)  # mDm
        self.exponent += T.sum(T.dot(self.x, self.s)**2, axis=2)  # xSSx
        self.exponent -= 2 * T.sum(T.dot(self.x, self.s) * T.sum(
            self.s * self.means[:, :, None], axis=1)[None, :, :],
                                   axis=2)  # -2xSSm
        self.exponent += T.sum(T.sum(self.s * self.means[:, :, None],
                                     axis=1)**2,
                               axis=1)  # mSSm
        self.exponent *= -0.5

        # Per-component S x S matrix s^T inv(d) s + I; its log-determinant
        # plus sum(log d) gives the log-determinant used below.
        # NOTE(review): this looks like the matrix determinant lemma for
        # d + s s^T -- confirm against the paper.
        eye_S = T.eye(S, dtype=theano.config.floatX)
        self.aux_matrix = T.batched_tensordot(
            self.s / self.d.T[:, :, None], self.s, axes=(1, 1)) + eye_S
        self.aux_logdet, _ = theano.scan(fn=lambda aux: logdet_psd(aux),
                                         outputs_info=None,
                                         sequences=self.aux_matrix,
                                         non_sequences=None)
        self.logdet = T.sum(T.log(self.d), axis=0) + self.aux_logdet

        # logpK contains all log probabilities of all components in an (N x sum(K)) array
        # Note that the log component priors are not added yet
        # (logdet enters with a plus sign in this branch)
        self.logpK = -0.5 * D * T.log(
            2. * np.pi) + 0.5 * self.logdet + self.exponent
    else:
        # s and d are used to represent covariance matrices
        if S == 1:
            # Special case of the general expression below, written without
            # batched_tensordot
            self.aux_matrix = T.sum(
                self.s[:, :, 0] / self.d.T * self.s[:, :, 0],
                axis=1).reshape((K_all, 1, 1)) + 1.
        else:
            # Since the latest Cuda/Theano update, the following two lines
            # cause an error in the case of S=1.
            eye_S = T.eye(S, dtype=theano.config.floatX)
            self.aux_matrix = T.batched_tensordot(
                self.s / self.d.T[:, :, None], self.s, axes=(1, 1)) + eye_S

        # Inverse and log-determinant of every per-component aux matrix
        (self.aux_inv, self.aux_logdet), _ = theano.scan(
            fn=lambda aux: [T.nlinalg.matrix_inverse(aux), logdet_psd(aux)],
            outputs_info=None,
            sequences=[self.aux_matrix],
            non_sequences=None)
        self.logdet = T.sum(T.log(self.d), axis=0) + self.aux_logdet

        # Product inv(d) * s for all K --> K x D x S
        self.rs = self.s / self.d.T[:, :, None]
        # Product inv(d) * s * aux_inv for all K --> K x D x S
        self.ls = T.batched_dot(self.rs, self.aux_inv)

        # s and d are used to represent covariance matrices
        # NOTE(review): the ls/rs correction terms appear to implement the
        # inverse of d + s s^T via the Woodbury identity -- confirm.
        self.exponent = T.dot(self.x**2, 1. / self.d)  #xDx
        self.exponent -= 2 * T.dot(self.x,
                                   (1. / self.d) * self.means.T)  #-2xDm
        self.exponent += T.sum(self.means**2 * (1. / self.d.T),
                               axis=1)  # mDm
        self.exponent -= T.sum(T.dot(self.x, self.ls) *
                               T.dot(self.x, self.rs),
                               axis=2)  # -x ls rs x
        self.exponent += 2 * T.sum(T.dot(self.x, self.ls) * T.sum(
            self.rs * self.means[:, :, None], axis=1)[None, :, :],
                                   axis=2)  # 2x ls rs m
        self.exponent -= T.sum(
            T.sum(self.ls * self.means[:, :, None], axis=1) *
            T.sum(self.rs * self.means[:, :, None], axis=1),
            axis=1)  # -m ls rs m
        self.exponent *= -0.5

        # logpK contains all log probabilities of all components in an (N x sum(K)) array
        # Note that the log component priors are not added yet
        # (logdet enters with a minus sign in this branch)
        self.logpK = -0.5 * D * T.log(
            2. * np.pi) - 0.5 * self.logdet + self.exponent

    # logpC contains the log joint probabilities p(x,c) in an (N x C) array
    self.logpC = self.logpK
    self.logpC_list = []
    for c in range(C):
        # Columns k1:k2 hold the components that belong to class c
        k1 = int(np.sum(K[:c]))
        k2 = int(k1 + K[c])
        self.logpC_list.append(self.logpC[:, k1:k2])

        # Compute log-probabilities without division
        # (log-softmax of the component logits, stabilized by their max)
        aux_max = T.max(self.prior_k_rho[k1:k2])
        log_prior_k = self.prior_k_rho[k1:k2] - T.log(
            T.sum(T.exp(self.prior_k_rho[k1:k2] - aux_max))) - aux_max
        self.logpC_list[c] += log_prior_k

        # Log-sum-exp over the components of class c
        aux_max = T.max(self.logpC_list[c], axis=1, keepdims=True)
        self.logpC_list[c] = T.log(
            T.sum(T.exp(self.logpC_list[c] - aux_max),
                  axis=1)) + aux_max.flatten()
    self.logpC = T.stack(self.logpC_list, axis=1)

    # Compute log-probabilities without division
    # (log-softmax of the class logits, added to every row)
    aux_max = T.max(self.prior_c_rho)
    log_prior_c = self.prior_c_rho - T.log(
        T.sum(T.exp(self.prior_c_rho - aux_max))) - aux_max
    self.logpC += log_prior_c

    # mm and cll objective are only for labeled data
    # logl objective is slightly different for labeled and unlabeled data
    idx_sv = T.ge(self.t, 0).nonzero()
    idx_usv = T.lt(self.t, 0).nonzero()
    self.logpC_sv = self.logpC[idx_sv]
    self.logpC_usv = self.logpC[idx_usv]
    is_sv_empty = T.eq(self.logpC_sv.shape[0], 0)
    is_usv_empty = T.eq(self.logpC_usv.shape[0], 0)

    # If there are no supervised/unsupervised samples create a dummy entry
    # to avoid problems. The corresponding costs are set to 0 later. We set
    # the number of rows to 2 because 1 results in an error.
    # The problems appear to be CUDNN related if for instance a sum over an
    # empty tensor is computed.
    self.logpC_sv = theano.ifelse.ifelse(
        is_sv_empty, T.zeros((2, C), theano.config.floatX), self.logpC_sv)
    self.t_sv = theano.ifelse.ifelse(is_sv_empty, T.zeros((2, ), 'int32'),
                                     self.t[idx_sv])
    self.logpC_usv = theano.ifelse.ifelse(
        is_usv_empty, T.zeros((2, C), theano.config.floatX),
        self.logpC_usv)

    # Compute mean divisor since T.mean causes divisions by zero if there
    # are no labeled or unlabeled data in the minibatch. Therefore, we
    # compute T.mean with T.sum()/N
    self.aux_mean_divisor_sv = T.switch(is_sv_empty, 1.,
                                        self.logpC_sv.shape[0])
    self.aux_mean_divisor_usv = T.switch(is_usv_empty, 1.,
                                         self.logpC_usv.shape[0])

    # Create cost functions
    # Compute the log of the softmax of logpc which gives the log of the conditional likelihood
    self.cll_max_tmp = T.max(self.logpC_sv, axis=1, keepdims=True)
    self.cll_logsumexp = T.log(
        T.sum(T.exp(self.logpC_sv - self.cll_max_tmp),
              axis=1)) + T.reshape(self.cll_max_tmp,
                                   (self.cll_max_tmp.shape[0], ))
    self.cost_cll = theano.ifelse.ifelse(
        is_sv_empty, 0.,
        -T.sum(self.logpC_sv[T.arange(self.logpC_sv.shape[0]), self.t_sv] -
               self.cll_logsumexp))
    self.cost_cll_normalized = self.cost_cll / self.aux_mean_divisor_sv

    # Negative log-likelihood of labeled data
    # (sum of the joint log-probability at each sample's own label)
    self.cost_nll_sv = theano.ifelse.ifelse(
        is_sv_empty, 0.,
        -T.sum(self.logpC_sv[T.arange(self.t_sv.shape[0]), self.t_sv]))
    self.cost_nll_sv_normalized = self.cost_nll_sv / self.aux_mean_divisor_sv

    # Negative log-likelihood of unlabeled data
    # (log-sum-exp over all classes, i.e. the label is marginalized out)
    self.logpC_usv_max = T.max(self.logpC_usv, axis=1, keepdims=True)
    self.logpC_usv_logsumexp = T.log(
        T.sum(T.exp(self.logpC_usv - self.logpC_usv_max),
              axis=1)) + T.reshape(self.logpC_usv_max,
                                   (self.logpC_usv.shape[0], ))
    self.cost_nll_usv = theano.ifelse.ifelse(
        is_usv_empty, 0., -T.sum(self.logpC_usv_logsumexp))
    self.cost_nll_usv_normalized = self.cost_nll_usv / self.aux_mean_divisor_usv

    # Total negative log-likelihood
    self.cost_nll = self.cost_nll_sv + self.cost_nll_usv
    self.cost_nll_normalized = self.cost_nll / self.x.shape[0]

    # Margin of every class against the labeled class: gamma plus the
    # difference of the joint log-probabilities. margin_start keeps the
    # unscaled, unmasked value; margin is scaled by eta and masked below.
    self.margin_start = self.gamma + self.logpC_sv - T.reshape(
        self.logpC_sv[T.arange(self.t_sv.shape[0]), self.t_sv],
        (self.t_sv.shape[0], 1))
    self.margin = self.gamma + self.logpC_sv - T.reshape(
        self.logpC_sv[T.arange(self.t_sv.shape[0]), self.t_sv],
        (self.t_sv.shape[0], 1))
    self.margin *= self.eta
    # Exclude the labeled class itself from the soft maximum
    self.margin = T.set_subtensor(
        self.margin[T.arange(self.t_sv.shape[0]), self.t_sv], -np.inf)

    # Log-sum-exp trick
    # (log-sum-exp of the eta-scaled margins, divided by eta again)
    self.margin_max_tmp = T.max(self.margin, axis=1, keepdims=True)
    self.max_margin = T.log(
        T.sum(T.exp(self.margin - self.margin_max_tmp),
              axis=1)) + T.reshape(self.margin_max_tmp,
                                   (self.margin.shape[0], ))
    self.max_margin /= self.eta

    # The cast in the following statement resolves an error that says that
    # both paths of ifelse must be of equal type. Setting the dtype argument
    # of T.sum did not solve the problem.
    self.cost_mm = theano.ifelse.ifelse(
        is_sv_empty, 0.,
        T.cast(T.sum(T.nnet.relu(self.max_margin)), theano.config.floatX))
    self.cost_mm_normalized = self.cost_mm / self.aux_mean_divisor_sv

    # Note: The division by self.x.shape[0] in the following two expressions
    # ensures that gradients of minibatches are unbiased.
    # Cost with CLL criterion
    self.cost_hybrid_cll = (
        self.tradeoff_hybrid *
        (self.tradeoff_ssl * self.cost_nll_sv +
         (1. - self.tradeoff_ssl) * self.cost_nll_usv) +
        (1. - self.tradeoff_hybrid) * self.cost_cll) / (self.x.shape[0])
    # Cost with MM criterion
    self.cost_hybrid_mm = (self.tradeoff_hybrid *
                           (self.tradeoff_ssl * self.cost_nll_sv +
                            (1. - self.tradeoff_ssl) * self.cost_nll_usv) +
                           (1. - self.tradeoff_hybrid) * self.cost_mm) / (
                               self.x.shape[0])

    # Predictions and classification errors
    # (ce is the error rate on labeled samples, 0 if there are none)
    self.y = T.argmax(self.logpC, axis=1)
    self.y_sv = self.y[idx_sv]
    self.y_usv = self.y[idx_usv]
    self.ce = theano.ifelse.ifelse(
        is_sv_empty, 0.,
        T.mean(T.neq(self.y_sv, self.t_sv), dtype=theano.config.floatX))
def pool_2d(input, ds, ignore_border=None, st=None, padding=(0, 0), mode='max'):
    """Pool the two trailing dimensions of an N-D tensor (N >= 2).

    Every window of size (ds[0], ds[1]) over the last two dimensions is
    reduced by the operation selected with `mode`. A 4-D input is handed to
    the pooling op directly; any other rank is first reshaped to
    (prod(leading dims), 1, height, width), pooled, and then reshaped back
    so the leading dimensions are unchanged.

    Parameters
    ----------
    input : N-D theano tensor of input images
        Input images. Pooling is done over the 2 last dimensions.
    ds : tuple of length 2
        Factor by which to downscale (vertical ds, horizontal ds).
        (2,2) will halve the image in each dimension.
    ignore_border : bool (default None, will print a warning and set to False)
        When True, (5,5) input with ds=(2,2) will generate a (2,2) output.
        (3,3) otherwise.
    st : tuple of two ints
        Stride size, which is the number of shifts over rows/cols to get the
        next pool region. If st is None, it is considered equal to ds
        (no overlap on pooling regions).
    padding : tuple of two ints
        (pad_h, pad_w), pad zeros to extend beyond four borders of the
        images, pad_h is the size of the top and bottom margins, and
        pad_w is the size of the left and right margins.
    mode : {'max', 'sum', 'average_inc_pad', 'average_exc_pad'}
        Operation executed on each window. `max` and `sum` always exclude
        the padding in the computation. `average` gives you the choice to
        include or exclude it.

    Raises
    ------
    NotImplementedError
        If `input` has fewer than 2 dimensions.

    """
    if input.ndim < 2:
        raise NotImplementedError('pool_2d requires a dimension >= 2')

    if ignore_border is None:
        # Not passing ignore_border explicitly is deprecated: warn, then
        # fall back to the current default.
        warnings.warn(
            "pool_2d() will have the parameter ignore_border"
            " default value changed to True (currently"
            " False). To have consistent behavior with all Theano"
            " version, explicitly add the parameter ignore_border=True."
            " On the GPU, using ignore_border=True is needed to use cuDNN."
            " When using ignore_border=False and not using cuDNN, the only"
            " GPU combination supported is when"
            " `ds == st and padding == (0, 0) and mode == 'max'`."
            " Otherwise, the convolution will be executed on CPU.",
            stacklevel=2)
        ignore_border = False

    pool_op = Pool(ds, ignore_border, st=st, padding=padding, mode=mode)

    # A 4-D tensor is already in the layout the op works on.
    if input.ndim == 4:
        return pool_op(input)

    # Collapse all leading dimensions into one batch axis and insert a
    # singleton channel axis: (batch, 1, height, width).
    spatial_dims = input.shape[-2:]
    n_images = tensor.shape_padright(tensor.prod(input.shape[:-2]), 1)
    shape_4d = tensor.cast(
        tensor.join(0, n_images, tensor.as_tensor([1]), spatial_dims),
        'int64')
    pooled = pool_op(tensor.reshape(input, shape_4d, ndim=4))

    # Put the original leading dimensions back in front of the pooled
    # height/width.
    final_shape = tensor.join(0, input.shape[:-2], pooled.shape[-2:])
    return tensor.reshape(pooled, final_shape, ndim=input.ndim)